From f56602b92d781e63f152e398d73ec7dec92e17d9 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 2 Aug 2024 08:16:37 -0700 Subject: [PATCH 001/278] perf: Generic hotplug support for a PMU with a scope ANBZ: #13617 commit 4ba4f1afb6a9fed8ef896c2363076e36572f71da upstream. The perf subsystem assumes that the counters of a PMU are per-CPU. So the user space tool reads a counter from each CPU in the system wide mode. However, many PMUs don't have a per-CPU counter. The counter is effective for a scope, e.g., a die or a socket. To address this, a cpumask is exposed by the kernel driver to restrict to one CPU to stand for a specific scope. In case the given CPU is removed, the hotplug support has to be implemented for each such driver. The codes to support the cpumask and hotplug are very similar. - Expose a cpumask into sysfs - Pickup another CPU in the same scope if the given CPU is removed. - Invoke the perf_pmu_migrate_context() to migrate to a new CPU. - In event init, always set the CPU in the cpumask to event->cpu Similar duplicated codes are implemented for each such PMU driver. It would be good to introduce a generic infrastructure to avoid such duplication. 5 popular scopes are implemented here, core, die, cluster, pkg, and the system-wide. The scope can be set when a PMU is registered. If so, a "cpumask" is automatically exposed for the PMU. The "cpumask" is from the perf_online__mask, which is to track the active CPU for each scope. They are set when the first CPU of the scope is online via the generic perf hotplug support. When a corresponding CPU is removed, the perf_online__mask is updated accordingly and the PMU will be moved to a new CPU from the same scope if possible. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240802151643.1691631-2-kan.liang@linux.intel.com Signed-off-by: Jay Chen --- include/linux/perf_event.h | 18 ++++ kernel/events/core.c | 164 ++++++++++++++++++++++++++++++++++++- 2 files changed, 180 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 823eb3587d8f..1009ea4da946 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -294,6 +294,19 @@ struct perf_event_pmu_context; #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 +/** + * pmu::scope + */ +enum perf_pmu_scope { + PERF_PMU_SCOPE_NONE = 0, + PERF_PMU_SCOPE_CORE, + PERF_PMU_SCOPE_DIE, + PERF_PMU_SCOPE_CLUSTER, + PERF_PMU_SCOPE_PKG, + PERF_PMU_SCOPE_SYS_WIDE, + PERF_PMU_MAX_SCOPE, +}; + struct perf_output_handle; #define PMU_NULL_DEV ((void *)(~0UL)) @@ -317,6 +330,11 @@ struct pmu { */ int capabilities; + /* + * PMU scope + */ + unsigned int scope; + int __percpu *pmu_disable_count; struct perf_cpu_pmu_context __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 69b7d29d4fdf..4aa3f81fa02b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -424,6 +424,11 @@ static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; +static cpumask_var_t perf_online_core_mask; +static cpumask_var_t perf_online_die_mask; +static cpumask_var_t perf_online_cluster_mask; +static cpumask_var_t perf_online_pkg_mask; +static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; /* @@ -11559,10 +11564,60 @@ perf_event_mux_interval_ms_store(struct device *dev, } static 
DEVICE_ATTR_RW(perf_event_mux_interval_ms); +static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return topology_sibling_cpumask(cpu); + case PERF_PMU_SCOPE_DIE: + return topology_die_cpumask(cpu); + case PERF_PMU_SCOPE_CLUSTER: + return topology_cluster_cpumask(cpu); + case PERF_PMU_SCOPE_PKG: + return topology_core_cpumask(cpu); + case PERF_PMU_SCOPE_SYS_WIDE: + return cpu_online_mask; + } + + return NULL; +} + +static inline struct cpumask *perf_scope_cpumask(unsigned int scope) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return perf_online_core_mask; + case PERF_PMU_SCOPE_DIE: + return perf_online_die_mask; + case PERF_PMU_SCOPE_CLUSTER: + return perf_online_cluster_mask; + case PERF_PMU_SCOPE_PKG: + return perf_online_pkg_mask; + case PERF_PMU_SCOPE_SYS_WIDE: + return perf_online_sys_mask; + } + + return NULL; +} + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct cpumask *mask = perf_scope_cpumask(pmu->scope); + + if (mask) + return cpumap_print_to_pagebuf(true, buf, mask); + return 0; +} + +static DEVICE_ATTR_RO(cpumask); + static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, &dev_attr_nr_addr_filters.attr, + &dev_attr_cpumask.attr, NULL, }; @@ -11574,6 +11629,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int if (n == 2 && !pmu->nr_addr_filters) return 0; + /* cpumask */ + if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) + return 0; + return a->mode; } @@ -11673,6 +11732,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) goto free_pdc; } + if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { + ret = -EINVAL; + goto free_pdc; + } + pmu->name = name; if (type >= 0) @@ -11836,6 +11900,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) event_has_any_exclude_flag(event)) ret = -EINVAL; + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); + struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); + int cpu; + + if (pmu_cpumask && cpumask) { + cpu = cpumask_any_and(pmu_cpumask, cpumask); + if (cpu >= nr_cpu_ids) + ret = -ENODEV; + else + event->cpu = cpu; + } else { + ret = -ENODEV; + } + } + if (ret && event->destroy) event->destroy(event); } @@ -13791,6 +13871,12 @@ static void __init perf_event_init_all_cpus(void) int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); + for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); @@ -13840,6 +13926,40 @@ static void __perf_event_exit_context(void *__info) raw_spin_unlock(&ctx->lock); } +static void perf_event_clear_cpumask(unsigned int cpu) +{ + int target[PERF_PMU_MAX_SCOPE]; + unsigned int scope; + struct pmu *pmu; + + cpumask_clear_cpu(cpu, perf_online_mask); + + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + struct cpumask *pmu_cpumask 
= perf_scope_cpumask(scope); + + target[scope] = -1; + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + + if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) + continue; + target[scope] = cpumask_any_but(cpumask, cpu); + if (target[scope] < nr_cpu_ids) + cpumask_set_cpu(target[scope], pmu_cpumask); + } + + /* migrate */ + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { + if (pmu->scope == PERF_PMU_SCOPE_NONE || + WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) + continue; + + if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) + perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); + } +} + static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; @@ -13847,6 +13967,11 @@ static void perf_event_exit_cpu_context(int cpu) // XXX simplify cpuctx->online mutex_lock(&pmus_lock); + /* + * Clear the cpumasks, and migrate to other CPUs if possible. + * Must be invoked before the __perf_event_exit_context. + */ + perf_event_clear_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; @@ -13854,7 +13979,6 @@ static void perf_event_exit_cpu_context(int cpu) smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); - cpumask_clear_cpu(cpu, perf_online_mask); mutex_unlock(&pmus_lock); } #else @@ -13863,6 +13987,42 @@ static void perf_event_exit_cpu_context(int cpu) { } #endif +static void perf_event_setup_cpumask(unsigned int cpu) +{ + struct cpumask *pmu_cpumask; + unsigned int scope; + + cpumask_set_cpu(cpu, perf_online_mask); + + /* + * Early boot stage, the cpumask hasn't been set yet. + * The perf_online__masks includes the first CPU of each domain. + * Always uncondifionally set the boot CPU for the perf_online__masks. + */ + if (!topology_sibling_cpumask(cpu)) { + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + pmu_cpumask = perf_scope_cpumask(scope); + if (WARN_ON_ONCE(!pmu_cpumask)) + continue; + cpumask_set_cpu(cpu, pmu_cpumask); + } + return; + } + + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + + pmu_cpumask = perf_scope_cpumask(scope); + + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + + if (!cpumask_empty(cpumask) && + cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) + cpumask_set_cpu(cpu, pmu_cpumask); + } +} + int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; @@ -13871,7 +14031,7 @@ int perf_event_init_cpu(unsigned int cpu) perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); - cpumask_set_cpu(cpu, perf_online_mask); + perf_event_setup_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; -- Gitee From 743a6e4b8ab265fec1d580a75965c474450d61c6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 12 Sep 2024 07:50:25 -0700 Subject: [PATCH 002/278] perf: Fix topology_sibling_cpumask check warning on ARM ANBZ: #13617 commit 673a5009cf2f020dac440cd79e70c4c8b8e20d08 upstream. The below warning is triggered when building with arm multi_v7_defconfig. kernel/events/core.c: In function 'perf_event_setup_cpumask': kernel/events/core.c:14012:13: warning: the comparison will always evaluate as 'true' for the address of 'thread_sibling' will never be NULL [-Waddress] 14012 | if (!topology_sibling_cpumask(cpu)) { The perf_event_init_cpu() may be invoked at the early boot stage, while the topology_*_cpumask hasn't been initialized yet. 
The check is to specially handle the case, and initialize the perf_online__masks on the boot CPU. X86 uses a per-cpu cpumask pointer, which could be NULL at the early boot stage. However, ARM uses a global variable, which never be NULL. Use perf_online_mask as an indicator instead. Only initialize the perf_online__masks when perf_online_mask is empty. Fix a typo as well. Fixes: 4ba4f1afb6a9 ("perf: Generic hotplug support for a PMU with a scope") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20240911153854.240bbc1f@canb.auug.org.au/ Reported-by: Steven Price Closes: https://lore.kernel.org/lkml/1835eb6d-3e05-47f3-9eae-507ce165c3bf@arm.com/ Signed-off-by: Kan Liang Tested-by: Steven Price Signed-off-by: Linus Torvalds --- kernel/events/core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4aa3f81fa02b..e7076609fd93 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -13992,21 +13992,19 @@ static void perf_event_setup_cpumask(unsigned int cpu) struct cpumask *pmu_cpumask; unsigned int scope; - cpumask_set_cpu(cpu, perf_online_mask); - /* * Early boot stage, the cpumask hasn't been set yet. * The perf_online__masks includes the first CPU of each domain. - * Always uncondifionally set the boot CPU for the perf_online__masks. + * Always unconditionally set the boot CPU for the perf_online__masks. */ - if (!topology_sibling_cpumask(cpu)) { + if (cpumask_empty(perf_online_mask)) { for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { pmu_cpumask = perf_scope_cpumask(scope); if (WARN_ON_ONCE(!pmu_cpumask)) continue; cpumask_set_cpu(cpu, pmu_cpumask); } - return; + goto end; } for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { @@ -14021,6 +14019,8 @@ static void perf_event_setup_cpumask(unsigned int cpu) cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) cpumask_set_cpu(cpu, pmu_cpumask); } +end: + cpumask_set_cpu(cpu, perf_online_mask); } int perf_event_init_cpu(unsigned int cpu) -- Gitee From 2ce9b53d119c58533bda9007a02e9125d3cda13c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 5 Sep 2024 10:14:05 +0300 Subject: [PATCH 003/278] dma-mapping: use IOMMU DMA calls for common alloc/free page calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #13617 commit 19156263cb1f24128a9ba6ef7340be5cbacc3d22 upstream. Common alloca and free pages routines are called when IOMMU DMA is used, and internally it calls to DMA ops structure which is not available for default IOMMU. This patch adds necessary if checks to call IOMMU DMA. 
It fixes the following crash: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000040 Mem abort info: ESR = 0x0000000096000006 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x06: level 2 translation fault Data abort info: ISV = 0, ISS = 0x00000006, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=00000000d20bb000 [0000000000000040] pgd=08000000d20c1003 , p4d=08000000d20c1003 , pud=08000000d20c2003, pmd=0000000000000000 Internal error: Oops: 0000000096000006 [#1] PREEMPT SMP Modules linked in: ipv6 hci_uart venus_core btqca v4l2_mem2mem btrtl qcom_spmi_adc5 sbs_battery btbcm qcom_vadc_common cros_ec_typec videobuf2_v4l2 leds_cros_ec cros_kbd_led_backlight cros_ec_chardev videodev elan_i2c videobuf2_common qcom_stats mc bluetooth coresight_stm stm_core ecdh_generic ecc pwrseq_core panel_edp icc_bwmon ath10k_snoc ath10k_core ath mac80211 phy_qcom_qmp_combo aux_bridge libarc4 coresight_replicator coresight_etm4x coresight_tmc coresight_funnel cfg80211 rfkill coresight qcom_wdt cbmem ramoops reed_solomon pwm_bl coreboot_table backlight crct10dif_ce CPU: 7 UID: 0 PID: 70 Comm: kworker/u32:4 Not tainted 6.11.0-rc6-next-20240903-00003-gdfc6015d0711 #660 Hardware name: Google Lazor Limozeen without Touchscreen (rev5 - rev8) (DT) Workqueue: events_unbound deferred_probe_work_func hub 2-1:1.0: 4 ports detected pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : dma_common_alloc_pages+0x54/0x1b4 lr : dma_common_alloc_pages+0x4c/0x1b4 sp : ffff8000807d3730 x29: ffff8000807d3730 x28: ffff02a7d312f880 x27: 0000000000000001 x26: 000000000000c000 x25: 0000000000000000 x24: 0000000000000001 x23: ffff02a7d23b6898 x22: 0000000000006cc0 x21: 000000000000c000 x20: ffff02a7858bf410 x19: fffffe0a60006000 x18: 0000000000000001 x17: 00000000000000d5 x16: 1fffe054f0bcc261 x15: 0000000000000001 x14: ffff02a7844dc680 x13: 0000000000100180 x12: dead000000000100 x11: dead000000000122 x10: 00000000001001ff x9 : ffff02a87f7b7b00 x8 : ffff02a87f7b7b00 x7 : ffff405977d6b000 x6 : ffff8000807d3310 x5 : ffff02a87f6b6398 x4 : 0000000000000001 x3 : ffff405977d6b000 x2 : ffff02a7844dc600 x1 : 0000000100000000 x0 : fffffe0a60006000 Call trace: dma_common_alloc_pages+0x54/0x1b4 __dma_alloc_pages+0x68/0x90 dma_alloc_pages+0x10/0x1c snd_dma_noncoherent_alloc+0x28/0x8c __snd_dma_alloc_pages+0x30/0x50 snd_dma_alloc_dir_pages+0x40/0x80 do_alloc_pages+0xb8/0x13c preallocate_pcm_pages+0x6c/0xf8 preallocate_pages+0x160/0x1a4 snd_pcm_set_managed_buffer_all+0x64/0xb0 lpass_platform_pcm_new+0xc0/0xe8 snd_soc_pcm_component_new+0x3c/0xc8 soc_new_pcm+0x4fc/0x668 snd_soc_bind_card+0xabc/0xbac snd_soc_register_card+0xf0/0x108 devm_snd_soc_register_card+0x4c/0xa4 sc7180_snd_platform_probe+0x180/0x224 platform_probe+0x68/0xc0 really_probe+0xbc/0x298 __driver_probe_device+0x78/0x12c driver_probe_device+0x3c/0x15c __device_attach_driver+0xb8/0x134 bus_for_each_drv+0x84/0xe0 __device_attach+0x9c/0x188 device_initial_probe+0x14/0x20 bus_probe_device+0xac/0xb0 deferred_probe_work_func+0x88/0xc0 process_one_work+0x14c/0x28c worker_thread+0x2cc/0x3d4 kthread+0x114/0x118 ret_from_fork+0x10/0x20 Code: f9411c19 940000c9 aa0003f3 b4000460 (f9402326) ---[ end trace 0000000000000000 ]--- Fixes: b5c58b2fdc42 ("dma-mapping: direct calls for dma-iommu") Closes: https://lore.kernel.org/all/10431dfd-ce04-4e0f-973b-c78477303c18@notapiano Reported-by: Nícolas F. R. A. 
Prado #KernelCI Signed-off-by: Leon Romanovsky Tested-by: Nícolas F. R. A. Prado Signed-off-by: Christoph Hellwig Signed-off-by: Jay Chen --- include/linux/iommu-dma.h | 8 ++++++++ kernel/dma/mapping.c | 12 ------------ kernel/dma/ops_helpers.c | 14 +++++++++++--- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h index d30a58bf00fd..1bb55ca1ab79 100644 --- a/include/linux/iommu-dma.h +++ b/include/linux/iommu-dma.h @@ -10,6 +10,10 @@ #include #ifdef CONFIG_IOMMU_DMA +static inline bool use_dma_iommu(struct device *dev) +{ + return dev->dma_iommu; +} dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs); @@ -49,6 +53,10 @@ void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir); #else +static inline bool use_dma_iommu(struct device *dev) +{ + return false; +} static inline dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index ca43011b2412..65bb2dbb190e 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -114,18 +114,6 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dmam_alloc_attrs); -#ifdef CONFIG_IOMMU_DMA -static bool use_dma_iommu(struct device *dev) -{ - return dev->dma_iommu; -} -#else -static bool use_dma_iommu(struct device *dev) -{ - return false; -} -#endif - static bool dma_go_direct(struct device *dev, dma_addr_t mask, const struct dma_map_ops *ops) { diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index af4a6ef48ce0..9afd569eadb9 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -4,6 +4,7 @@ * the allocated memory contains normal pages in the direct kernel mapping. */ #include +#include static struct page *dma_common_vaddr_to_page(void *cpu_addr) { @@ -70,8 +71,12 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; - *dma_handle = ops->map_page(dev, page, 0, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (use_dma_iommu(dev)) + *dma_handle = iommu_dma_map_page(dev, page, 0, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + else + *dma_handle = ops->map_page(dev, page, 0, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); if (*dma_handle == DMA_MAPPING_ERROR) { dma_free_contiguous(dev, page, size); return NULL; @@ -86,7 +91,10 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (ops->unmap_page) + if (use_dma_iommu(dev)) + iommu_dma_unmap_page(dev, dma_handle, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + else if (ops->unmap_page) ops->unmap_page(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); dma_free_contiguous(dev, page, size); -- Gitee From 7173928ffaa7e1cafdf46971703b6a5ea5fff102 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 22 Sep 2024 09:21:10 +0200 Subject: [PATCH 004/278] iommu/dma: remove most stubs in iommu-dma.h ANBZ: #13617 commit 3d09ff45469eb2912ce2227c47efac297230d6d6 upstream. The direct calls from mapping.c all guarded by use_dma_iommu(), so don't bother to provide stubs, but instead just expose the prototypes unconditionally. 
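For illustration, a minimal sketch of the call-site shape this relies on (modelled on the ops_helpers.c hunk above; simplified, not the exact kernel code). Because use_dma_iommu() is a static inline that returns false when CONFIG_IOMMU_DMA is disabled, the iommu_dma_*() branch becomes dead code the compiler discards, so bare prototypes are sufficient and no stub definitions are needed:

    static dma_addr_t map_one_page(struct device *dev, struct page *page,
                                   size_t size, enum dma_data_direction dir)
    {
            const struct dma_map_ops *ops = get_dma_ops(dev);

            if (use_dma_iommu(dev))  /* constant false without CONFIG_IOMMU_DMA */
                    return iommu_dma_map_page(dev, page, 0, size, dir,
                                              DMA_ATTR_SKIP_CPU_SYNC);

            return ops->map_page(dev, page, 0, size, dir,
                                 DMA_ATTR_SKIP_CPU_SYNC);
    }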
Signed-off-by: Christoph Hellwig Reviewed-by: Leon Romanovsky [Jay Chen Conflicts: include/linux/iommu-dma.h ] Signed-off-by: Jay Chen --- include/linux/iommu-dma.h | 108 +++----------------------------------- 1 file changed, 8 insertions(+), 100 deletions(-) diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h index 1bb55ca1ab79..564698fcbbd5 100644 --- a/include/linux/iommu-dma.h +++ b/include/linux/iommu-dma.h @@ -14,6 +14,13 @@ static inline bool use_dma_iommu(struct device *dev) { return dev->dma_iommu; } +#else +static inline bool use_dma_iommu(struct device *dev) +{ + return false; +} +#endif /* CONFIG_IOMMU_DMA */ + dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs); @@ -52,104 +59,5 @@ void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir); void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir); -#else -static inline bool use_dma_iommu(struct device *dev) -{ - return false; -} -static inline dma_addr_t iommu_dma_map_page(struct device *dev, - struct page *page, unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - return DMA_MAPPING_ERROR; -} -static inline void iommu_dma_unmap_page(struct device *dev, - dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ -} -static inline int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, unsigned long attrs) -{ - return -EINVAL; -} -static inline void iommu_dma_unmap_sg(struct device *dev, - struct scatterlist *sg, int nents, enum dma_data_direction dir, - unsigned long attrs) -{ -} -static inline void *iommu_dma_alloc(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t gfp, unsigned long attrs) -{ - return NULL; -} -static inline int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ - return -EINVAL; -} -static inline int iommu_dma_get_sgtable(struct device *dev, - struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, - size_t size, unsigned long attrs) -{ - return -EINVAL; -} -static inline unsigned long iommu_dma_get_merge_boundary(struct device *dev) -{ - return 0; -} -static inline size_t iommu_dma_opt_mapping_size(void) -{ - return 0; -} -static inline size_t iommu_dma_max_mapping_size(struct device *dev) -{ - return 0; -} -static inline void iommu_dma_free(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t handle, unsigned long attrs) -{ -} -static inline dma_addr_t iommu_dma_map_resource(struct device *dev, - phys_addr_t phys, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - return DMA_MAPPING_ERROR; -} -static inline void iommu_dma_unmap_resource(struct device *dev, - dma_addr_t handle, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ -} -static inline struct sg_table * -iommu_dma_alloc_noncontiguous(struct device *dev, size_t size, - enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) -{ - return NULL; -} -static inline void iommu_dma_free_noncontiguous(struct device *dev, size_t size, - struct sg_table *sgt, enum dma_data_direction dir) -{ -} -static inline void iommu_dma_sync_single_for_cpu(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction dir) -{ -} -static inline 
void iommu_dma_sync_single_for_device(struct device *dev, - dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) -{ -} -static inline void iommu_dma_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ -} -static inline void iommu_dma_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ -} -#endif /* CONFIG_IOMMU_DMA */ + #endif /* _LINUX_IOMMU_DMA_H */ -- Gitee From 022b0100669672dbe753eab5646808d88218263f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 2 Aug 2024 08:16:40 -0700 Subject: [PATCH 005/278] iommu/vt-d: Clean up cpumask and hotplug for perfmon ANBZ: #13617 commit a8c73b82f7792400555143773d0152d5bc3ac068 upstream. The iommu PMU is system-wide scope, which is supported by the generic perf_event subsystem now. Set the scope for the iommu PMU and remove all the cpumask and hotplug codes. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20240802151643.1691631-5-kan.liang@linux.intel.com Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.h | 2 - drivers/iommu/intel/perfmon.c | 111 +--------------------------------- 2 files changed, 2 insertions(+), 111 deletions(-) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index fe9cbfbe38f7..9d77f3b7e263 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -700,8 +700,6 @@ struct iommu_pmu { DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX); struct perf_event *event_list[IOMMU_PMU_IDX_MAX]; unsigned char irq_name[16]; - struct hlist_node cpuhp_node; - int cpu; }; #define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED) diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c index 44083d01852d..75f493bcb353 100644 --- a/drivers/iommu/intel/perfmon.c +++ b/drivers/iommu/intel/perfmon.c @@ -34,28 +34,9 @@ static struct attribute_group iommu_pmu_events_attr_group = { .attrs = attrs_empty, }; -static cpumask_t iommu_pmu_cpu_mask; - -static ssize_t -cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask); -} -static DEVICE_ATTR_RO(cpumask); - -static struct attribute *iommu_pmu_cpumask_attrs[] = { - &dev_attr_cpumask.attr, - NULL -}; - -static struct attribute_group iommu_pmu_cpumask_attr_group = { - .attrs = iommu_pmu_cpumask_attrs, -}; - static const struct attribute_group *iommu_pmu_attr_groups[] = { &iommu_pmu_format_attr_group, &iommu_pmu_events_attr_group, - &iommu_pmu_cpumask_attr_group, NULL }; @@ -565,6 +546,7 @@ static int __iommu_pmu_register(struct intel_iommu *iommu) iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups; iommu_pmu->pmu.attr_update = iommu_pmu_attr_update; iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + iommu_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE; iommu_pmu->pmu.module = THIS_MODULE; return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1); @@ -773,89 +755,6 @@ static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu) iommu->perf_irq = 0; } -static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) -{ - struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); - - if (cpumask_empty(&iommu_pmu_cpu_mask)) - cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask); - - if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask)) - iommu_pmu->cpu = cpu; - - return 0; -} - -static int iommu_pmu_cpu_offline(unsigned int cpu, struct 
hlist_node *node) -{ - struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); - int target = cpumask_first(&iommu_pmu_cpu_mask); - - /* - * The iommu_pmu_cpu_mask has been updated when offline the CPU - * for the first iommu_pmu. Migrate the other iommu_pmu to the - * new target. - */ - if (target < nr_cpu_ids && target != iommu_pmu->cpu) { - perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); - iommu_pmu->cpu = target; - return 0; - } - - if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask)) - return 0; - - target = cpumask_any_but(cpu_online_mask, cpu); - - if (target < nr_cpu_ids) - cpumask_set_cpu(target, &iommu_pmu_cpu_mask); - else - return 0; - - perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); - iommu_pmu->cpu = target; - - return 0; -} - -static int nr_iommu_pmu; -static enum cpuhp_state iommu_cpuhp_slot; - -static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu) -{ - int ret; - - if (!nr_iommu_pmu) { - ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, - "driver/iommu/intel/perfmon:online", - iommu_pmu_cpu_online, - iommu_pmu_cpu_offline); - if (ret < 0) - return ret; - iommu_cpuhp_slot = ret; - } - - ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); - if (ret) { - if (!nr_iommu_pmu) - cpuhp_remove_multi_state(iommu_cpuhp_slot); - return ret; - } - nr_iommu_pmu++; - - return 0; -} - -static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu) -{ - cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); - - if (--nr_iommu_pmu) - return; - - cpuhp_remove_multi_state(iommu_cpuhp_slot); -} - void iommu_pmu_register(struct intel_iommu *iommu) { struct iommu_pmu *iommu_pmu = iommu->pmu; @@ -866,17 +765,12 @@ void iommu_pmu_register(struct intel_iommu *iommu) if (__iommu_pmu_register(iommu)) goto err; - if (iommu_pmu_cpuhp_setup(iommu_pmu)) - goto unregister; - /* Set interrupt for overflow */ if (iommu_pmu_set_interrupt(iommu)) - goto cpuhp_free; + goto unregister; return; -cpuhp_free: - iommu_pmu_cpuhp_free(iommu_pmu); unregister: perf_pmu_unregister(&iommu_pmu->pmu); err: @@ -892,6 +786,5 @@ void iommu_pmu_unregister(struct intel_iommu *iommu) return; iommu_pmu_unset_interrupt(iommu); - iommu_pmu_cpuhp_free(iommu_pmu); perf_pmu_unregister(&iommu_pmu->pmu); } -- Gitee From f002edfc3027d6e04ef989543eb1d04330b8ba7c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 5 Sep 2024 09:22:40 +0200 Subject: [PATCH 006/278] iommu/amd: Add kernel parameters to limit V1 page-sizes ANBZ: #13617 commit f0295913c4b4f377c454e06f50c1a04f2f80d9df upstream. Add two new kernel command line parameters to limit the page-sizes used for v1 page-tables: nohugepages - Limits page-sizes to 4KiB v2_pgsizes_only - Limits page-sizes to 4Kib/2Mib/1GiB; The same as the sizes used with v2 page-tables This is needed for multiple scenarios. When assigning devices to SEV-SNP guests the IOMMU page-sizes need to match the sizes in the RMP table, otherwise the device will not be able to access all shared memory. Also, some ATS devices do not work properly with arbitrary IO page-sizes as supported by AMD-Vi, so limiting the sizes used by the driver is a suitable workaround. All-in-all, these parameters are only workarounds until the IOMMU core and related APIs gather the ability to negotiate the page-sizes in a better way. 
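For reference, both new options are values of the existing amd_iommu= boot parameter and only affect v1 page-tables; illustrative command-line usage:

    amd_iommu=nohugepages        # map everything with 4 KiB pages
    amd_iommu=v2_pgsizes_only    # allow only 4 KiB, 2 MiB and 1 GiB mappings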
Signed-off-by: Joerg Roedel Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20240905072240.253313-1-joro@8bytes.org Signed-off-by: Shuai Xue --- Documentation/admin-guide/kernel-parameters.txt | 17 +++++++++++------ drivers/iommu/amd/amd_iommu.h | 1 + drivers/iommu/amd/amd_iommu_types.h | 4 ++++ drivers/iommu/amd/init.c | 8 ++++++++ drivers/iommu/amd/io_pgtable.c | 2 +- 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2f6ce5dbb77d..7657f088912b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -318,12 +318,17 @@ allowed anymore to lift isolation requirements as needed. This option does not override iommu=pt - force_enable - Force enable the IOMMU on platforms known - to be buggy with IOMMU enabled. Use this - option with care. - pgtbl_v1 - Use v1 page table for DMA-API (Default). - pgtbl_v2 - Use v2 page table for DMA-API. - irtcachedis - Disable Interrupt Remapping Table (IRT) caching. + force_enable - Force enable the IOMMU on platforms known + to be buggy with IOMMU enabled. Use this + option with care. + pgtbl_v1 - Use v1 page table for DMA-API (Default). + pgtbl_v2 - Use v2 page table for DMA-API. + irtcachedis - Disable Interrupt Remapping Table (IRT) caching. + nohugepages - Limit page-sizes used for v1 page-tables + to 4 KiB. + v2_pgsizes_only - Limit page-sizes used for v1 page-tables + to 4KiB/2Mib/1GiB. + amd_iommu_dump= [HW,X86-64] Enable AMD IOMMU driver option to dump the ACPI table diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index fa0af030d8b3..6b971105ebb7 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -43,6 +43,7 @@ int amd_iommu_enable_faulting(unsigned int cpu); extern int amd_iommu_guest_ir; extern enum io_pgtable_fmt amd_iommu_pgtable; extern int amd_iommu_gpt_level; +extern unsigned long amd_iommu_pgsize_bitmap; /* Protection domain ops */ struct protection_domain *protection_domain_alloc(unsigned int type, int nid); diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 35aa4ff020f5..601fb4ee6900 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -293,6 +293,10 @@ * Page sizes >= the 52 bit max physical address of the CPU are not supported. */ #define AMD_IOMMU_PGSIZES (GENMASK_ULL(51, 12) ^ SZ_512G) + +/* Special mode where page-sizes are limited to 4 KiB */ +#define AMD_IOMMU_PGSIZES_4K (PAGE_SIZE) + /* 4K, 2MB, 1G page sizes are supported */ #define AMD_IOMMU_PGSIZES_V2 (PAGE_SIZE | (1ULL << 21) | (1ULL << 30)) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index d161de2dd740..22af9ba0c980 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -191,6 +191,8 @@ bool amdr_ivrs_remap_support __read_mostly; bool amd_iommu_force_isolation __read_mostly; +unsigned long amd_iommu_pgsize_bitmap __ro_after_init = AMD_IOMMU_PGSIZES; + /* * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap * to know which ones are already in use. 
@@ -3465,6 +3467,12 @@ static int __init parse_amd_iommu_options(char *str) amd_iommu_pgtable = AMD_IOMMU_V2; } else if (strncmp(str, "irtcachedis", 11) == 0) { amd_iommu_irtcachedis = true; + } else if (strncmp(str, "nohugepages", 11) == 0) { + pr_info("Restricting V1 page-sizes to 4KiB"); + amd_iommu_pgsize_bitmap = AMD_IOMMU_PGSIZES_4K; + } else if (strncmp(str, "v2_pgsizes_only", 15) == 0) { + pr_info("Restricting V1 page-sizes to 4KiB/2MiB/1GiB"); + amd_iommu_pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; } else { pr_notice("Unknown option - '%s'\n", str); } diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index 14f62c420e4a..804b788f3f16 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -548,7 +548,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo return NULL; pgtable->mode = PAGE_MODE_3_LEVEL; - cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES; + cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap; cfg->ias = IOMMU_IN_ADDR_BIT_SIZE; cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; -- Gitee From 2a3a5d2f49000b19a2e38e9a9b4d5a0cb3a3c258 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Sun, 8 Sep 2024 04:42:55 -0700 Subject: [PATCH 007/278] iommufd: Avoid duplicated __iommu_group_set_core_domain() call ANBZ: #13617 commit d9dfb5e6225a0a99e08dc2a538b0c30a5a9a460c upstream. For the fault-capable hwpts, the iommufd_hwpt_detach_device() calls both iommufd_fault_domain_detach_dev() and iommu_detach_group(). This would have duplicated __iommu_group_set_core_domain() call since both functions call it in the end. This looks no harm as the __iommu_group_set_core_domain() returns if the new domain equals to the existing one. But it makes sense to avoid such duplicated calls in caller side. Link: https://patch.msgid.link/r/20240908114256.979518-2-yi.l.liu@intel.com Signed-off-by: Yi Liu Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/iommufd_private.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 5d3768d77099..f1d865e6fab6 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -510,8 +510,10 @@ static inline int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, static inline void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { - if (hwpt->fault) + if (hwpt->fault) { iommufd_fault_domain_detach_dev(hwpt, idev); + return; + } iommu_detach_group(hwpt->domain, idev->igroup->group); } -- Gitee From 9fd6c4c1764414cd3c654af5cea6295731300cbb Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Sun, 8 Sep 2024 04:42:56 -0700 Subject: [PATCH 008/278] iommu: Set iommu_attach_handle->domain in core ANBZ: #13617 commit 79805c1bbbf9846fe91c16933d64614cbbff1dee upstream. The IOMMU core sets the iommu_attach_handle->domain for the iommu_attach_group_handle() path, while the iommu_replace_group_handle() sets it on the caller side. Make the two paths aligned on it. 
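A minimal sketch of the caller-visible effect, assuming the embedding-struct pattern iommufd uses (the names below are illustrative, not the iommufd code): after this change the core fills in handle->domain on the replace path just as it already did on the attach path, so callers drop their own assignment.

    struct my_handle {
            struct iommu_attach_handle handle;   /* handle.domain set by the core */
            /* caller-private state ... */
    };

    static int switch_domain(struct iommu_group *group,
                             struct iommu_domain *new_domain,
                             struct my_handle *h)
    {
            /* On success, h->handle.domain == new_domain without any extra
             * assignment in the caller. */
            return iommu_replace_group_handle(group, new_domain, &h->handle);
    }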
Link: https://patch.msgid.link/r/20240908114256.979518-3-yi.l.liu@intel.com Signed-off-by: Yi Liu Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommu.c | 1 + drivers/iommu/iommufd/fault.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 6b0f42ea9218..a9e565349373 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3596,6 +3596,7 @@ int iommu_replace_group_handle(struct iommu_group *group, ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); if (ret) goto err_unlock; + handle->domain = new_domain; } ret = __iommu_group_set_domain(group, new_domain); diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index df03411c8728..8c8226f0dffd 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -161,7 +161,6 @@ static int __fault_domain_replace_dev(struct iommufd_device *idev, if (!handle) return -ENOMEM; - handle->handle.domain = hwpt->domain; handle->idev = idev; ret = iommu_replace_group_handle(idev->igroup->group, hwpt->domain, &handle->handle); -- Gitee From 737c672138e332bf4eb763e759b43afd0601915a Mon Sep 17 00:00:00 2001 From: Eliav Bar-ilan Date: Tue, 10 Sep 2024 16:44:16 -0300 Subject: [PATCH 009/278] iommu/amd: Fix argument order in amd_iommu_dev_flush_pasid_all() ANBZ: #13617 commit 8386207f37e98453e1de3f51e50eeeea089103f9 upstream. An incorrect argument order calling amd_iommu_dev_flush_pasid_pages() causes improper flushing of the IOMMU, leaving the old value of GCR3 from a previous process attached to the same PASID. The function has the signature: void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, ioasid_t pasid, u64 address, size_t size) Correct the argument order. Cc: stable@vger.kernel.org Fixes: 474bf01ed9f0 ("iommu/amd: Add support for device based TLB invalidation") Signed-off-by: Eliav Bar-ilan Signed-off-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/0-v1-fc6bc37d8208+250b-amd_pasid_flush_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d7a141f3dbdf..2a823a25288a 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1577,8 +1577,8 @@ void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, static void dev_flush_pasid_all(struct iommu_dev_data *dev_data, ioasid_t pasid) { - amd_iommu_dev_flush_pasid_pages(dev_data, 0, - CMD_INV_IOMMU_ALL_PAGES_ADDRESS, pasid); + amd_iommu_dev_flush_pasid_pages(dev_data, pasid, 0, + CMD_INV_IOMMU_ALL_PAGES_ADDRESS); } /* Flush the not present cache if it exists */ -- Gitee From 1ec4f8234053c182f9cecfe6acd27386ce3eb1b4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 10 Sep 2024 17:00:34 -0300 Subject: [PATCH 010/278] iommu/amd: Test for PAGING domains before freeing a domain ANBZ: #13617 commit 3ab9d8d1b50b516507655b10e19d8e624ad1d79c upstream. This domain free function can be called for IDENTITY and SVA domains too, and they don't have page tables. For now protect against this by checking the type. Eventually the different types should have their own free functions. 
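The check in the hunk below works because an iommu_domain's type is a bit-field of capabilities; paraphrasing include/linux/iommu.h (abridged, for illustration):

    #define __IOMMU_DOMAIN_PAGING  (1U << 0)  /* supports iommu_map()/unmap() */

    #define IOMMU_DOMAIN_UNMANAGED (__IOMMU_DOMAIN_PAGING)
    #define IOMMU_DOMAIN_DMA       (__IOMMU_DOMAIN_PAGING | __IOMMU_DOMAIN_DMA_API)
    #define IOMMU_DOMAIN_IDENTITY  (__IOMMU_DOMAIN_PT)    /* no page table */
    #define IOMMU_DOMAIN_SVA       (__IOMMU_DOMAIN_SVA)   /* no page table */

Only domain types carrying __IOMMU_DOMAIN_PAGING own an io-pgtable, so testing that bit before freeing it is sufficient.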
Fixes: 485534bfccb2 ("iommu/amd: Remove conditions from domain free paths") Reported-by: Vasant Hegde Signed-off-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/0-v1-ad9884ee5f5b+da-amd_iopgtbl_fix_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 2a823a25288a..735ed742489d 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2266,7 +2266,8 @@ static void cleanup_domain(struct protection_domain *domain) void protection_domain_free(struct protection_domain *domain) { WARN_ON(!list_empty(&domain->dev_list)); - free_io_pgtable_ops(&domain->iop.pgtbl.ops); + if (domain->domain.type & __IOMMU_DOMAIN_PAGING) + free_io_pgtable_ops(&domain->iop.pgtbl.ops); domain_id_free(domain->id); kfree(domain); } -- Gitee From 278ca9f33f57570b1135a872fd0086899c4fcfd7 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 19 Aug 2025 17:47:47 +0800 Subject: [PATCH 011/278] iommu: Reorganize kerneldoc parameter names ANBZ: #13617 commit 7fd9aeb07f83d871e861d152fbc62a6ced0f70d1 upstream. Reorganize kerneldoc parameter names to match the parameter order in the function header. Problems identified using Coccinelle. Signed-off-by: Julia Lawall Link: https://lore.kernel.org/r/20240930112121.95324-20-Julia.Lawall@inria.fr Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a9e565349373..680a6efb9907 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2227,8 +2227,8 @@ EXPORT_SYMBOL_GPL(iommu_attach_group); /** * iommu_group_replace_domain - replace the domain that a group is attached to - * @new_domain: new IOMMU domain to replace with * @group: IOMMU group that will be attached to the new domain + * @new_domain: new IOMMU domain to replace with * * This API allows the group to switch domains without being forced to go to * the blocking domain in-between. -- Gitee From 13bb319b22fae74ccec7c8b20795a492d7d698a1 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 19 Aug 2025 17:48:37 +0800 Subject: [PATCH 012/278] iommu/amd: Use atomic64_inc_return() in iommu.c ANBZ: #13617 commit 5ce73c524f5fb5abd7b1bfed0115474b4fb437b4 upstream. Use atomic64_inc_return(&ref) instead of atomic64_add_return(1, &ref) to use optimized implementation and ease register pressure around the primitive for targets that implement optimized variant. 
Signed-off-by: Uros Bizjak Cc: Joerg Roedel Cc: Suravee Suthikulpanit Cc: Will Deacon Cc: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241007084356.47799-1-ubizjak@gmail.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 735ed742489d..a0c974361643 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1238,7 +1238,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (!iommu->need_sync) return 0; - data = atomic64_add_return(1, &iommu->cmd_sem_val); + data = atomic64_inc_return(&iommu->cmd_sem_val); build_completion_wait(&cmd, iommu, data); raw_spin_lock_irqsave(&iommu->lock, flags); @@ -2912,7 +2912,7 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) return; build_inv_irt(&cmd, devid); - data = atomic64_add_return(1, &iommu->cmd_sem_val); + data = atomic64_inc_return(&iommu->cmd_sem_val); build_completion_wait(&cmd2, iommu, data); raw_spin_lock_irqsave(&iommu->lock, flags); -- Gitee From 733e821c756ac606da00a83ead6ca47f2c484e79 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 23 Nov 2023 13:41:12 +0000 Subject: [PATCH 013/278] drm/mediatek: Stop using iommu_present() ANBZ: #13617 commit c5a761e2fe58631e1cd0249a729ce1934a11bcbe upstream. Remove the pointless check. If an IOMMU is providing transparent DMA API ops for any device(s) we care about, the DT code will have enforced the appropriate probe ordering already. And if the IOMMU *is* entirely absent, then attempting to go ahead with CMA and either suceeding or failing decisively seems more useful than deferring forever. Signed-off-by: Robin Murphy Reviewed-by: AngeloGioacchino Del Regno Link: https://patchwork.kernel.org/project/dri-devel/patch/fd1b62aa006556f29f37535814abfe41be63f7ae.1700746094.git.robin.murphy@arm.com/ Signed-off-by: Chun-Kuang Hu Signed-off-by: Jay Chen --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index ef4fa70119de..6d7e3d5ad17e 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -5,7 +5,6 @@ */ #include -#include #include #include #include @@ -589,9 +588,6 @@ static int mtk_drm_bind(struct device *dev) struct drm_device *drm; int ret, i; - if (!iommu_present(&platform_bus_type)) - return -EPROBE_DEFER; - pdev = of_find_device_by_node(private->mutex_node); if (!pdev) { dev_err(dev, "Waiting for disp-mutex device %pOF\n", -- Gitee From cac8ccf8c26e899e31763f640bc7a0cd8206bd59 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 22 Sep 2024 09:10:17 +0200 Subject: [PATCH 014/278] dma-mapping: fix vmap and mmap of noncontiougs allocations ANBZ: #13617 commit bb0e391975f8da826305cbaa3e3d34b03c47e2a6 upstream. Commit b5c58b2fdc42 ("dma-mapping: direct calls for dma-iommu") switched to use direct calls to dma-iommu, but missed the dma_vmap_noncontiguous, dma_vunmap_noncontiguous and dma_mmap_noncontiguous behavior keyed off the presence of the alloc_noncontiguous method. Fix this by removing the now unused alloc_noncontiguous and free_noncontiguous methods and moving the vmapping and mmaping of the noncontiguous allocations into the iommu code, as it is the only provider of actually noncontiguous allocations. 
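To see which paths this affects, a hedged sketch of the driver-facing sequence (error handling trimmed); with direct calls in place, the vmap/mmap steps must dispatch to the iommu-dma helpers whenever use_dma_iommu() is true rather than keying off a now-absent alloc_noncontiguous method:

    struct sg_table *sgt;
    void *vaddr;

    sgt = dma_alloc_noncontiguous(dev, size, DMA_BIDIRECTIONAL, GFP_KERNEL, 0);
    if (!sgt)
            return -ENOMEM;

    vaddr = dma_vmap_noncontiguous(dev, size, sgt);  /* previously keyed off ops->alloc_noncontiguous */
    /* ... CPU access via vaddr ... */
    dma_vunmap_noncontiguous(dev, vaddr);
    dma_free_noncontiguous(dev, size, sgt, DMA_BIDIRECTIONAL);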
Fixes: b5c58b2fdc42 ("dma-mapping: direct calls for dma-iommu") Reported-by: Xi Ruoyao Signed-off-by: Christoph Hellwig Reviewed-by: Leon Romanovsky Tested-by: Xi Ruoyao Conflicts: kernel/dma/mapping.c --- drivers/iommu/dma-iommu.c | 33 +++++++++++++++++++++++++++++++++ include/linux/dma-map-ops.h | 19 ------------------- include/linux/iommu-dma.h | 6 ++++++ kernel/dma/mapping.c | 37 ++++++++++--------------------------- 4 files changed, 49 insertions(+), 46 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index b20b734b90f5..fc67f42450d8 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1058,6 +1058,21 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, return NULL; } +/* + * This is the actual return value from the iommu_dma_alloc_noncontiguous. + * + * The users of the DMA API should only care about the sg_table, but to make + * the DMA-API internal vmaping and freeing easier we stash away the page + * array as well (except for the fallback case). This can go away any time, + * e.g. when a vmap-variant that takes a scatterlist comes along. + */ +struct dma_sgt_handle { + struct sg_table sgt; + struct page **pages; +}; +#define sgt_handle(sgt) \ + container_of((sgt), struct dma_sgt_handle, sgt) + struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) { @@ -1086,6 +1101,24 @@ void iommu_dma_free_noncontiguous(struct device *dev, size_t size, kfree(sh); } +void *iommu_dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt) +{ + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL); +} + +int iommu_dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, + size_t size, struct sg_table *sgt) +{ + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff) + return -ENXIO; + return vm_map_pages(vma, sgt_handle(sgt)->pages, count); +} + void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) { diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 780d54a089e8..bf36d7519c16 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -24,11 +24,6 @@ struct dma_map_ops { gfp_t gfp); void (*free_pages)(struct device *dev, size_t size, struct page *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir); - struct sg_table *(*alloc_noncontiguous)(struct device *dev, size_t size, - enum dma_data_direction dir, gfp_t gfp, - unsigned long attrs); - void (*free_noncontiguous)(struct device *dev, size_t size, - struct sg_table *sgt, enum dma_data_direction dir); int (*mmap)(struct device *, struct vm_area_struct *, void *, dma_addr_t, size_t, unsigned long attrs); @@ -213,20 +208,6 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, } #endif /* CONFIG_DMA_GLOBAL_POOL */ -/* - * This is the actual return value from the ->alloc_noncontiguous method. - * The users of the DMA API should only care about the sg_table, but to make - * the DMA-API internal vmaping and freeing easier we stash away the page - * array as well (except for the fallback case). This can go away any time, - * e.g. when a vmap-variant that takes a scatterlist comes along. 
- */ -struct dma_sgt_handle { - struct sg_table sgt; - struct page **pages; -}; -#define sgt_handle(sgt) \ - container_of((sgt), struct dma_sgt_handle, sgt) - int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h index 564698fcbbd5..508beaa44c39 100644 --- a/include/linux/iommu-dma.h +++ b/include/linux/iommu-dma.h @@ -51,6 +51,12 @@ struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); void iommu_dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir); +void *iommu_dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt); +#define iommu_dma_vunmap_noncontiguous(dev, vaddr) \ + vunmap(vaddr); +int iommu_dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, + size_t size, struct sg_table *sgt); void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir); void iommu_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 65bb2dbb190e..8cec2d0b42ab 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -730,7 +730,6 @@ static struct sg_table *alloc_single_sgt(struct device *dev, size_t size, struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) { - const struct dma_map_ops *ops = get_dma_ops(dev); struct sg_table *sgt; if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES)) @@ -738,9 +737,7 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, if (WARN_ON_ONCE(gfp & __GFP_COMP)) return NULL; - if (ops && ops->alloc_noncontiguous) - sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs); - else if (use_dma_iommu(dev)) + if (use_dma_iommu(dev)) sgt = iommu_dma_alloc_noncontiguous(dev, size, dir, gfp, attrs); else sgt = alloc_single_sgt(dev, size, dir, gfp); @@ -765,12 +762,9 @@ static void free_single_sgt(struct device *dev, size_t size, void dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir) { - const struct dma_map_ops *ops = get_dma_ops(dev); - debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir); - if (ops && ops->free_noncontiguous) - ops->free_noncontiguous(dev, size, sgt, dir); - else if (use_dma_iommu(dev)) + + if (use_dma_iommu(dev)) iommu_dma_free_noncontiguous(dev, size, sgt, dir); else free_single_sgt(dev, size, sgt, dir); @@ -780,37 +774,26 @@ EXPORT_SYMBOL_GPL(dma_free_noncontiguous); void *dma_vmap_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt) { - const struct dma_map_ops *ops = get_dma_ops(dev); - unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; - if (ops && ops->alloc_noncontiguous) - return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL); + if (use_dma_iommu(dev)) + return iommu_dma_vmap_noncontiguous(dev, size, sgt); + return page_address(sg_page(sgt->sgl)); } EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous); void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - if (ops && ops->alloc_noncontiguous) - vunmap(vaddr); + if (use_dma_iommu(dev)) + iommu_dma_vunmap_noncontiguous(dev, vaddr); } EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous); int 
dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - if (ops && ops->alloc_noncontiguous) { - unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; - - if (vma->vm_pgoff >= count || - vma_pages(vma) > count - vma->vm_pgoff) - return -ENXIO; - return vm_map_pages(vma, sgt_handle(sgt)->pages, count); - } + if (use_dma_iommu(dev)) + return iommu_dma_mmap_noncontiguous(dev, vma, size, sgt); return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl)); } EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous); -- Gitee From ea4b15c151aaf66e2d220d14e244c70ac2f7ae3a Mon Sep 17 00:00:00 2001 From: Daniel Mentz Date: Tue, 1 Oct 2024 18:53:57 -0700 Subject: [PATCH 015/278] iommu/arm-smmu-v3: Fix last_sid_idx calculation for sid_bits==32 ANBZ: #13617 commit f63237f54cf18448728201a65e6c82018e807cd9 upstream. The function arm_smmu_init_strtab_2lvl uses the expression ((1 << smmu->sid_bits) - 1) to calculate the largest StreamID value. However, this fails for the maximum allowed value of SMMU_IDR1.SIDSIZE which is 32. The C standard states: "If the value of the right operand is negative or is greater than or equal to the width of the promoted left operand, the behavior is undefined." With smmu->sid_bits being 32, the prerequisites for undefined behavior are met. We observed that the value of (1 << 32) is 1 and not 0 as we initially expected. Similar bit shift operations in arm_smmu_init_strtab_linear seem to not be affected, because it appears to be unlikely for an SMMU to have SMMU_IDR1.SIDSIZE set to 32 but then not support 2-level Stream tables This issue was found by Ryan Huang on our team. Fixes: ce410410f1a7 ("iommu/arm-smmu-v3: Add arm_smmu_strtab_l1/2_idx()") Signed-off-by: Daniel Mentz Link: https://lore.kernel.org/r/20241002015357.1766934-1-danielmentz@google.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 58bd779f214a..c267e6e7a82a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3636,7 +3636,7 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) u32 l1size; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; unsigned int last_sid_idx = - arm_smmu_strtab_l1_idx((1 << smmu->sid_bits) - 1); + arm_smmu_strtab_l1_idx((1ULL << smmu->sid_bits) - 1); /* Calculate the L1 size, capped to the SIDSIZE. */ cfg->l2.num_l1_ents = min(last_sid_idx + 1, STRTAB_MAX_L1_ENTRIES); -- Gitee From 4d626806caea81138839ebd026ef29d75c1a193b Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Mon, 23 Sep 2024 10:15:57 +0800 Subject: [PATCH 016/278] iommu/arm-smmu-v3: Convert comma to semicolon ANBZ: #13617 commit 7de7d35429aa2e9667e51b88ff097be968feaf8f upstream. Replace comma between expressions with semicolons. Using a ',' in place of a ';' can have unintended side effects. Although that is not the case here, it is seems best to use ';' unless ',' is intended. Found by inspection. No functional change intended. Compile tested only. 
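As a reminder of why the comma operator is easy to misread even when, as here, it happens to behave identically (generic C, not driver code):

    x = f(), y = g();   /* one expression statement: both assignments still happen */
    return f(), g();    /* but only g()'s value is returned */
    if (f(), g())       /* and only g()'s result is tested */
            do_thing();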
Fixes: e3b1be2e73db ("iommu/arm-smmu-v3: Reorganize struct arm_smmu_ctx_desc_cfg") Signed-off-by: Chen Ni Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20240923021557.3432068-1-nichen@iscas.ac.cn Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index c267e6e7a82a..363e8a8638e3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1420,7 +1420,7 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) cd_table->s1fmt = STRTAB_STE_0_S1FMT_LINEAR; cd_table->linear.num_ents = max_contexts; - l1size = max_contexts * sizeof(struct arm_smmu_cd), + l1size = max_contexts * sizeof(struct arm_smmu_cd); cd_table->linear.table = dma_alloc_coherent(smmu->dev, l1size, &cd_table->cdtab_dma, GFP_KERNEL); -- Gitee From d428374a125d723c9cb112fb2c926f47563aecf7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 6 Mar 2024 18:13:26 +0800 Subject: [PATCH 017/278] mm: record the migration reason for struct migration_target_control ANBZ: #13617 commit e42dfe4e0a51b476dcc6f1461c51fdb1b76573aa upstream. Patch series "make the hugetlb migration strategy consistent", v2. As discussed in previous thread [1], there is an inconsistency when handling hugetlb migration. When handling the migration of freed hugetlb, it prevents fallback to other NUMA nodes in alloc_and_dissolve_hugetlb_folio(). However, when dealing with in-use hugetlb, it allows fallback to other NUMA nodes in alloc_hugetlb_folio_nodemask(), which can break the per-node hugetlb pool and might result in unexpected failures when node bound workloads doesn't get what is asssumed available. This patchset tries to make the hugetlb migration strategy more clear and consistent. Please find details in each patch. [1] https://lore.kernel.org/all/6f26ce22d2fcd523418a085f2c588fe0776d46e7.1706794035.git.baolin.wang@linux.alibaba.com/ This patch (of 2): To support different hugetlb allocation strategies during hugetlb migration based on various migration reasons, record the migration reason in the migration_target_control structure as a preparation. 
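For context, a simplified sketch of how such a control block reaches the allocator (mirroring the call sites touched below, not a new API): it travels through the opaque 'private' argument of migrate_pages() and is recovered inside alloc_migration_target(), which can then honour the recorded reason.

    struct migration_target_control mtc = {
            .nid      = target_nid,
            .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
            .reason   = MR_SYSCALL,         /* newly recorded */
    };

    err = migrate_pages(&pagelist, alloc_migration_target, NULL,
                        (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);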
Link: https://lkml.kernel.org/r/cover.1709719720.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/7b95d4981e07211f57139fc5b1f7ce91b920cee4.1709719720.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Oscar Salvador Cc: David Hildenbrand Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Shuai Xue --- mm/gup.c | 1 + mm/internal.h | 1 + mm/memory-failure.c | 1 + mm/memory_hotplug.c | 1 + mm/mempolicy.c | 1 + mm/migrate.c | 1 + mm/page_alloc.c | 1 + mm/vmscan.c | 3 ++- 8 files changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index c1cd784fd5cd..00714f6d1faf 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2095,6 +2095,7 @@ static int migrate_longterm_unpinnable_pages( struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, + .reason = MR_LONGTERM_PIN, }; if (migrate_pages(movable_page_list, alloc_migration_target, diff --git a/mm/internal.h b/mm/internal.h index b2ac97a81765..53d7aaa02064 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1215,6 +1215,7 @@ struct migration_target_control { int nid; /* preferred node id */ nodemask_t *nmask; gfp_t gfp_mask; + enum migrate_reason reason; }; /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 72c0e2fa7f83..54f8200e5d06 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2686,6 +2686,7 @@ static int soft_offline_in_use_page(struct page *page) struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .reason = MR_MEMORY_FAILURE, }; if (!huge && PageTransHuge(hpage)) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 252bc31ed630..67eee10ada4b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1777,6 +1777,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) struct migration_target_control mtc = { .nmask = &nmask, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .reason = MR_MEMORY_HOTPLUG, }; int ret; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 72c2fc211da3..b65f5f6b6942 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1105,6 +1105,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + .reason = MR_SYSCALL, }; nodes_clear(nmask); diff --git a/mm/migrate.c b/mm/migrate.c index 6eb79c69a88b..cb9e5702dee8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2167,6 +2167,7 @@ static int do_move_pages_to_node(struct mm_struct *mm, struct migration_target_control mtc = { .nid = node, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + .reason = MR_SYSCALL, }; err = migrate_pages(pagelist, alloc_migration_target, NULL, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index baf5bdd65ea7..2a9fa4572dcd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6493,6 +6493,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, struct migration_target_control mtc = { .nid = zone_to_nid(cc->zone), .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .reason = MR_CONTIG_RANGE, }; lru_cache_disable(); diff --git a/mm/vmscan.c b/mm/vmscan.c index b71f2552a561..e52ffe1446da 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1026,7 +1026,8 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT, 
.nid = target_nid, - .nmask = &allowed_mask + .nmask = &allowed_mask, + .reason = MR_DEMOTION, }; if (list_empty(demote_folios)) -- Gitee From 86386a5420569194335533cfc124b9434c6916fd Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 6 Mar 2024 18:13:27 +0800 Subject: [PATCH 018/278] mm: hugetlb: make the hugetlb migration strategy consistent ANBZ: #13617 commit 42d0c3fbb5811fbfb663d8ede1d7ffba02e7ae18 upstream. As discussed in the previous thread [1], there is an inconsistency when handling hugetlb migration. When handling the migration of freed hugetlb, it prevents fallback to other NUMA nodes in alloc_and_dissolve_hugetlb_folio(). However, when dealing with in-use hugetlb, it allows fallback to other NUMA nodes in alloc_hugetlb_folio_nodemask(), which can break the per-node hugetlb pool and might result in unexpected failures when node-bound workloads don't get what is assumed to be available. To make the hugetlb migration strategy clearer, we should list all the scenarios of hugetlb migration and analyze whether allocation fallback is permitted: 1) Memory offline: will call dissolve_free_huge_pages() to free the freed hugetlb, and call do_migrate_range() to migrate the in-use hugetlb. Both can break the per-node hugetlb pool, but as this is an explicit offlining operation, there is no better choice, so the hugetlb allocation fallback should be allowed. 2) Memory failure: same as memory offline. Allowing fallback to a different node might be the only option to handle it; otherwise, the impact of poisoned memory can be amplified. 3) Longterm pinning: will call migrate_longterm_unpinnable_pages() to migrate in-use and not-longterm-pinnable hugetlb, which can break the per-node pool. But we should fail the longterm pinning if we cannot allocate on the current node, to avoid breaking the per-node pool. 4) Syscalls (mbind, migrate_pages, move_pages): these are explicit user operations to move pages to other nodes, so fallback to other nodes should not be prohibited. 5) alloc_contig_range: used by CMA allocation and virtio-mem fake-offline to allocate a given range of pages. Since freed hugetlb migration is not allowed to fall back, the in-use hugetlb migration should, for consistency, not be allowed to fall back either. 6) alloc_contig_pages: used by kfence, pgtable_debug etc. The strategy should be consistent with that of alloc_contig_range(). Based on the analysis of the various scenarios above, introduce a new helper to determine whether fallback is permitted according to the migration reason.
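For illustration, a hedged sketch of how the helper is meant to be consumed (the caller below is hypothetical; the real call sites are in the mempolicy.c and migrate.c hunks further down): the reason recorded in the previous patch is turned into a bool, and when fallback is not allowed the nodemask allocation pins itself to the preferred node with __GFP_THISNODE.

    /* Hypothetical migration target callback for a hugetlb source folio. */
    static struct folio *example_alloc_hugetlb_target(struct folio *src,
                                                      struct migration_target_control *mtc)
    {
        struct hstate *h = folio_hstate(src);
        gfp_t gfp = htlb_modify_alloc_mask(h, mtc->gfp_mask);

        /*
         * MR_MEMORY_HOTPLUG, MR_MEMORY_FAILURE, MR_SYSCALL and
         * MR_MEMPOLICY_MBIND are allowed to fall back; every other
         * reason keeps the allocation on the preferred node.
         */
        return alloc_hugetlb_folio_nodemask(h, mtc->nid, mtc->nmask, gfp,
                                            htlb_allow_alloc_fallback(mtc->reason));
    }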
[1] https://lore.kernel.org/all/6f26ce22d2fcd523418a085f2c588fe0776d46e7.1706794035.git.baolin.wang@linux.alibaba.com/ Link: https://lkml.kernel.org/r/3519fcd41522817307a05b40fb551e2e17e68101.1709719720.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: David Hildenbrand Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Shuai Xue --- include/linux/hugetlb.h | 35 +++++++++++++++++++++++++++++++++-- mm/hugetlb.c | 14 ++++++++++++-- mm/mempolicy.c | 3 ++- mm/migrate.c | 3 ++- 4 files changed, 49 insertions(+), 6 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ab37179dc94c..89ff4d46a619 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -752,7 +752,8 @@ void wait_for_freed_hugetlb_folios(void); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, - nodemask_t *nmask, gfp_t gfp_mask); + nodemask_t *nmask, gfp_t gfp_mask, + bool allow_alloc_fallback); int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, @@ -975,6 +976,30 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) return modified_mask; } +static inline bool htlb_allow_alloc_fallback(int reason) +{ + bool allowed_fallback = false; + + /* + * Note: the memory offline, memory failure and migration syscalls will + * be allowed to fallback to other nodes due to lack of a better chioce, + * that might break the per-node hugetlb pool. While other cases will + * set the __GFP_THISNODE to avoid breaking the per-node hugetlb pool. + */ + switch (reason) { + case MR_MEMORY_HOTPLUG: + case MR_MEMORY_FAILURE: + case MR_SYSCALL: + case MR_MEMPOLICY_MBIND: + allowed_fallback = true; + break; + default: + break; + } + + return allowed_fallback; +} + static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { @@ -1082,7 +1107,8 @@ static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, static inline struct folio * alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, - nodemask_t *nmask, gfp_t gfp_mask) + nodemask_t *nmask, gfp_t gfp_mask, + bool allow_alloc_fallback) { return NULL; } @@ -1198,6 +1224,11 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) return 0; } +static inline bool htlb_allow_alloc_fallback(int reason) +{ + return false; +} + static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5286f85e9601..6a8e42c02108 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2466,7 +2466,7 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, /* folio migration callback function */ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, - nodemask_t *nmask, gfp_t gfp_mask) + nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { @@ -2481,6 +2481,10 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, } spin_unlock_irq(&hugetlb_lock); + /* We cannot fallback to other nodes, as we could break the per-node pool. 
*/ + if (!allow_alloc_fallback) + gfp_mask |= __GFP_THISNODE; + return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } @@ -6335,7 +6339,13 @@ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); + /* + * This is used to allocate a temporary hugetlb to hold the copied + * content, which will then be copied again to the final hugetlb + * consuming a reservation. Set the alloc_fallback to false to indicate + * that breaking the per-node hugetlb pool is not allowed in this case. + */ + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false); mpol_cond_put(mpol); return folio; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b65f5f6b6942..cedf955d0429 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1267,7 +1267,8 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src, h = folio_hstate(src); gfp = htlb_alloc_mask(h); nodemask = policy_nodemask(gfp, pol, ilx, &nid); - return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); + return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, + htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); } if (folio_test_large(src)) diff --git a/mm/migrate.c b/mm/migrate.c index cb9e5702dee8..2118fef4d924 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2128,7 +2128,8 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private) gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); return alloc_hugetlb_folio_nodemask(h, nid, - mtc->nmask, gfp_mask); + mtc->nmask, gfp_mask, + htlb_allow_alloc_fallback(mtc->reason)); } if (folio_test_large(src)) { -- Gitee From 85bf61519f39c8a1b57a28180dadd9ccfe55cb0e Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 6 Mar 2024 18:13:28 +0800 Subject: [PATCH 019/278] docs: hugetlbpage.rst: add hugetlb migration description ANBZ: #13617 commit 353dc187840100fabeb946bb9573bc5ca5e04fcb upstream. Add some description of the hugetlb migration strategy. Link: https://lkml.kernel.org/r/63fb16e7a4ebc5cb69ce655af86e29b2d8e9ba34.1709719720.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Oscar Salvador Cc: David Hildenbrand Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Shuai Xue --- Documentation/admin-guide/mm/hugetlbpage.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index e4d4b4a8dc97..f34a0d798d5b 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -376,6 +376,13 @@ Note that the number of overcommit and reserve pages remain global quantities, as we don't know until fault time, when the faulting task's mempolicy is applied, from which node the huge page allocation will be attempted. +The hugetlb may be migrated between the per-node hugepages pool in the following +scenarios: memory offline, memory failure, longterm pinning, syscalls(mbind, +migrate_pages and move_pages), alloc_contig_range() and alloc_contig_pages(). +Now only memory offline, memory failure and syscalls allow fallbacking to allocate +a new hugetlb on a different node if the current node is unable to allocate during +hugetlb migration, that means these 3 cases can break the per-node hugepages pool. + .. 
_using_huge_pages: Using Huge Pages -- Gitee From 1a5af69a9acf0afca70de2b72b61c2a048b5c71a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Aug 2025 11:40:54 +0800 Subject: [PATCH 020/278] mm/gup: revert "mm: gup: fix infinite loop within __get_longterm_locked" ANBZ: #13617 commit 517f496e1e61bd169d585dab4dd77e7147506322 upstream. After commit 1aaf8c122918 ("mm: gup: fix infinite loop within __get_longterm_locked") we are able to longterm pin folios that are not supposed to get longterm pinned, simply because they temporarily have the LRU flag cleared (esp. temporarily isolated). For example, two __get_longterm_locked() callers can race, or __get_longterm_locked() can race with anything else that temporarily isolates folios. The introducing commit mentions the use case of a driver that uses vm_ops->fault to insert pages allocated through cma_alloc() into the page tables, assuming they can later get longterm pinned. These pages/ folios would never have the LRU flag set and consequently cannot get isolated. There is no known in-tree user making use of that so far, fortunately. To handle that in the future -- and avoid retrying forever to isolate/migrate them -- we will need a different mechanism for the CMA area *owner* to indicate that it actually already allocated the page and is fine with longterm pinning it. The LRU flag is not suitable for that. Probably we can lookup the relevant CMA area and query the bitmap; we only have have to care about some races, probably. If already allocated, we could just allow longterm pinning) Anyhow, let's fix the "must not be longterm pinned" problem first by reverting the original commit. Link: https://lkml.kernel.org/r/20250611131314.594529-1-david@redhat.com Fixes: 1aaf8c122918 ("mm: gup: fix infinite loop within __get_longterm_locked") Signed-off-by: David Hildenbrand Closes: https://lore.kernel.org/all/20250522092755.GA3277597@tiffany/ Reported-by: Hyesoo Yu Reviewed-by: John Hubbard Cc: Jason Gunthorpe Cc: Peter Xu Cc: Zhaoyang Huang Cc: Aijun Sun Cc: Alistair Popple Cc: Signed-off-by: Andrew Morton Signed-off-by: Jay Chen --- mm/gup.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 00714f6d1faf..6359e2c3c31e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2005,14 +2005,14 @@ struct page *get_dump_page(unsigned long addr) /* * Returns the number of collected pages. Return value is always >= 0. 
*/ -static void collect_longterm_unpinnable_pages( +static unsigned long collect_longterm_unpinnable_pages( struct list_head *movable_page_list, unsigned long nr_pages, struct page **pages) { + unsigned long i, collected = 0; struct folio *prev_folio = NULL; bool drain_allow = true; - unsigned long i; for (i = 0; i < nr_pages; i++) { struct folio *folio = page_folio(pages[i]); @@ -2024,6 +2024,8 @@ static void collect_longterm_unpinnable_pages( if (folio_is_longterm_pinnable(folio)) continue; + collected++; + if (folio_is_device_coherent(folio)) continue; @@ -2045,6 +2047,8 @@ static void collect_longterm_unpinnable_pages( NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } + + return collected; } /* @@ -2138,10 +2142,12 @@ static int migrate_longterm_unpinnable_pages( static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) { + unsigned long collected; LIST_HEAD(movable_page_list); - collect_longterm_unpinnable_pages(&movable_page_list, nr_pages, pages); - if (list_empty(&movable_page_list)) + collected = collect_longterm_unpinnable_pages(&movable_page_list, + nr_pages, pages); + if (!collected) return 0; return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages, -- Gitee From 969076079a2a0ac79a7c14a0fd337a9ae3bd9f1f Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Sun, 23 Jun 2024 23:36:10 -0700 Subject: [PATCH 021/278] mm/gup: introduce check_and_migrate_movable_folios() ANBZ: #13617 commit 53ba78de064b6a45f5925947b3b45e9e833c2f8a upstream. This helper is the folio equivalent of check_and_migrate_movable_pages(). Therefore, all the rules that apply to check_and_migrate_movable_pages() also apply to this one as well. Currently, this helper is only used by memfd_pin_folios(). This patch also includes changes to rename and convert the internal functions collect_longterm_unpinnable_pages() and migrate_longterm_unpinnable_pages() to work on folios. As a result, check_and_migrate_movable_pages() is now a wrapper around check_and_migrate_movable_folios(). Link: https://lkml.kernel.org/r/20240624063952.1572359-3-vivek.kasireddy@intel.com Signed-off-by: Vivek Kasireddy Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Dave Airlie Acked-by: Gerd Hoffmann Cc: Matthew Wilcox Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Peter Xu Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dongwon Kim Cc: Hugh Dickins Cc: Junxiao Chang Cc: Mike Kravetz Cc: Oscar Salvador Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Jay Chen --- mm/gup.c | 124 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 47 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 6359e2c3c31e..f15eaf44034a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2003,19 +2003,19 @@ struct page *get_dump_page(unsigned long addr) #ifdef CONFIG_MIGRATION /* - * Returns the number of collected pages. Return value is always >= 0. + * Returns the number of collected folios. Return value is always >= 0. 
*/ -static unsigned long collect_longterm_unpinnable_pages( - struct list_head *movable_page_list, - unsigned long nr_pages, - struct page **pages) +static unsigned long collect_longterm_unpinnable_folios( + struct list_head *movable_folio_list, + unsigned long nr_folios, + struct folio **folios) { unsigned long i, collected = 0; struct folio *prev_folio = NULL; bool drain_allow = true; - for (i = 0; i < nr_pages; i++) { - struct folio *folio = page_folio(pages[i]); + for (i = 0; i < nr_folios; i++) { + struct folio *folio = folios[i]; if (folio == prev_folio) continue; @@ -2030,7 +2030,7 @@ static unsigned long collect_longterm_unpinnable_pages( continue; if (folio_test_hugetlb(folio)) { - isolate_hugetlb(folio, movable_page_list); + isolate_hugetlb(folio, movable_folio_list); continue; } @@ -2042,7 +2042,7 @@ static unsigned long collect_longterm_unpinnable_pages( if (!folio_isolate_lru(folio)) continue; - list_add_tail(&folio->lru, movable_page_list); + list_add_tail(&folio->lru, movable_folio_list); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); @@ -2052,27 +2052,28 @@ static unsigned long collect_longterm_unpinnable_pages( } /* - * Unpins all pages and migrates device coherent pages and movable_page_list. - * Returns -EAGAIN if all pages were successfully migrated or -errno for failure - * (or partial success). + * Unpins all folios and migrates device coherent folios and movable_folio_list. + * Returns -EAGAIN if all folios were successfully migrated or -errno for + * failure (or partial success). */ -static int migrate_longterm_unpinnable_pages( - struct list_head *movable_page_list, - unsigned long nr_pages, - struct page **pages) +static int migrate_longterm_unpinnable_folios( + struct list_head *movable_folio_list, + unsigned long nr_folios, + struct folio **folios) { int ret; unsigned long i; - for (i = 0; i < nr_pages; i++) { - struct folio *folio = page_folio(pages[i]); + for (i = 0; i < nr_folios; i++) { + struct folio *folio = folios[i]; if (folio_is_device_coherent(folio)) { /* - * Migration will fail if the page is pinned, so convert - * the pin on the source page to a normal reference. + * Migration will fail if the folio is pinned, so + * convert the pin on the source folio to a normal + * reference. */ - pages[i] = NULL; + folios[i] = NULL; folio_get(folio); gup_put_folio(folio, 1, FOLL_PIN); @@ -2085,24 +2086,24 @@ static int migrate_longterm_unpinnable_pages( } /* - * We can't migrate pages with unexpected references, so drop + * We can't migrate folios with unexpected references, so drop * the reference obtained by __get_user_pages_locked(). - * Migrating pages have been added to movable_page_list after + * Migrating folios have been added to movable_folio_list after * calling folio_isolate_lru() which takes a reference so the - * page won't be freed if it's migrating. + * folio won't be freed if it's migrating. 
*/ - unpin_user_page(pages[i]); - pages[i] = NULL; + unpin_folio(folios[i]); + folios[i] = NULL; } - if (!list_empty(movable_page_list)) { + if (!list_empty(movable_folio_list)) { struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, .reason = MR_LONGTERM_PIN, }; - if (migrate_pages(movable_page_list, alloc_migration_target, + if (migrate_pages(movable_folio_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_LONGTERM_PIN, NULL)) { ret = -ENOMEM; @@ -2110,48 +2111,71 @@ static int migrate_longterm_unpinnable_pages( } } - putback_movable_pages(movable_page_list); + putback_movable_pages(movable_folio_list); return -EAGAIN; err: - for (i = 0; i < nr_pages; i++) - if (pages[i]) - unpin_user_page(pages[i]); - putback_movable_pages(movable_page_list); + unpin_folios(folios, nr_folios); + putback_movable_pages(movable_folio_list); return ret; } /* - * Check whether all pages are *allowed* to be pinned. Rather confusingly, all - * pages in the range are required to be pinned via FOLL_PIN, before calling - * this routine. + * Check whether all folios are *allowed* to be pinned indefinitely (longterm). + * Rather confusingly, all folios in the range are required to be pinned via + * FOLL_PIN, before calling this routine. * - * If any pages in the range are not allowed to be pinned, then this routine - * will migrate those pages away, unpin all the pages in the range and return + * If any folios in the range are not allowed to be pinned, then this routine + * will migrate those folios away, unpin all the folios in the range and return * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then * call this routine again. * * If an error other than -EAGAIN occurs, this indicates a migration failure. * The caller should give up, and propagate the error back up the call stack. * - * If everything is OK and all pages in the range are allowed to be pinned, then - * this routine leaves all pages pinned and returns zero for success. + * If everything is OK and all folios in the range are allowed to be pinned, + * then this routine leaves all folios pinned and returns zero for success. */ -static long check_and_migrate_movable_pages(unsigned long nr_pages, - struct page **pages) +static long check_and_migrate_movable_folios(unsigned long nr_folios, + struct folio **folios) { unsigned long collected; - LIST_HEAD(movable_page_list); + LIST_HEAD(movable_folio_list); - collected = collect_longterm_unpinnable_pages(&movable_page_list, - nr_pages, pages); + collected = collect_longterm_unpinnable_folios(&movable_folio_list, + nr_folios, folios); if (!collected) return 0; - return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages, - pages); + return migrate_longterm_unpinnable_folios(&movable_folio_list, + nr_folios, folios); +} + +/* + * This routine just converts all the pages in the @pages array to folios and + * calls check_and_migrate_movable_folios() to do the heavy lifting. + * + * Please see the check_and_migrate_movable_folios() documentation for details. 
+ */ +static long check_and_migrate_movable_pages(unsigned long nr_pages, + struct page **pages) +{ + struct folio **folios; + long i, ret; + + folios = kmalloc_array(nr_pages, sizeof(*folios), GFP_KERNEL); + if (!folios) + return -ENOMEM; + + for (i = 0; i < nr_pages; i++) + folios[i] = page_folio(pages[i]); + + ret = check_and_migrate_movable_folios(nr_pages, folios); + + kfree(folios); + return ret; } #else static long check_and_migrate_movable_pages(unsigned long nr_pages, @@ -2159,6 +2183,12 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, { return 0; } + +static long check_and_migrate_movable_folios(unsigned long nr_folios, + struct folio **folios) +{ + return 0; +} #endif /* CONFIG_MIGRATION */ /* -- Gitee From cd98564549ea2454f936878a762572a00842696f Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Sun, 23 Jun 2024 23:36:11 -0700 Subject: [PATCH 022/278] mm/gup: introduce memfd_pin_folios() for pinning memfd folios ANBZ: #13617 commit 89c1905d9c140372b7f50ef48f42378cf85d9bc5 upstream. For drivers that would like to longterm-pin the folios associated with a memfd, the memfd_pin_folios() API provides an option to not only pin the folios via FOLL_PIN but also to check and migrate them if they reside in movable zone or CMA block. This API currently works with memfds but it should work with any files that belong to either shmemfs or hugetlbfs. Files belonging to other filesystems are rejected for now. The folios need to be located first before pinning them via FOLL_PIN. If they are found in the page cache, they can be immediately pinned. Otherwise, they need to be allocated using the filesystem specific APIs and then pinned. [akpm@linux-foundation.org: improve the CONFIG_MMU=n situation, per SeongJae] [vivek.kasireddy@intel.com: return -EINVAL if the end offset is greater than the size of memfd] Link: https://lkml.kernel.org/r/IA0PR11MB71850525CBC7D541CAB45DF1F8DB2@IA0PR11MB7185.namprd11.prod.outlook.com Link: https://lkml.kernel.org/r/20240624063952.1572359-4-vivek.kasireddy@intel.com Signed-off-by: Vivek Kasireddy Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe (v2) Reviewed-by: David Hildenbrand (v3) Reviewed-by: Christoph Hellwig (v6) Acked-by: Dave Airlie Acked-by: Gerd Hoffmann Cc: Matthew Wilcox (Oracle) Cc: Daniel Vetter Cc: Hugh Dickins Cc: Peter Xu Cc: Dongwon Kim Cc: Junxiao Chang Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Mike Kravetz Cc: Oscar Salvador Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Jay Chen --- include/linux/memfd.h | 5 ++ include/linux/mm.h | 3 + mm/gup.c | 139 ++++++++++++++++++++++++++++++++++++++++++ mm/memfd.c | 45 ++++++++++++++ 4 files changed, 192 insertions(+) diff --git a/include/linux/memfd.h b/include/linux/memfd.h index e7abf6fa4c52..3f2cf339ceaf 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -6,11 +6,16 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); +struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { return -EINVAL; } +static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) +{ + return ERR_PTR(-EINVAL); +} #endif #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index ea1333659fde..a4148eae3425 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2623,6 +2623,9 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page 
**pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); +long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, + struct folio **folios, unsigned int max_folios, + pgoff_t *offset); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); diff --git a/mm/gup.c b/mm/gup.c index f15eaf44034a..42995d543f64 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -17,6 +18,7 @@ #include #include #include +#include #include #include @@ -3522,3 +3524,140 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); + +/** + * memfd_pin_folios() - pin folios associated with a memfd + * @memfd: the memfd whose folios are to be pinned + * @start: the first memfd offset + * @end: the last memfd offset (inclusive) + * @folios: array that receives pointers to the folios pinned + * @max_folios: maximum number of entries in @folios + * @offset: the offset into the first folio + * + * Attempt to pin folios associated with a memfd in the contiguous range + * [start, end]. Given that a memfd is either backed by shmem or hugetlb, + * the folios can either be found in the page cache or need to be allocated + * if necessary. Once the folios are located, they are all pinned via + * FOLL_PIN and @offset is populatedwith the offset into the first folio. + * And, eventually, these pinned folios must be released either using + * unpin_folios() or unpin_folio(). + * + * It must be noted that the folios may be pinned for an indefinite amount + * of time. And, in most cases, the duration of time they may stay pinned + * would be controlled by the userspace. This behavior is effectively the + * same as using FOLL_LONGTERM with other GUP APIs. + * + * Returns number of folios pinned, which could be less than @max_folios + * as it depends on the folio sizes that cover the range [start, end]. + * If no folios were pinned, it returns -errno. + */ +long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, + struct folio **folios, unsigned int max_folios, + pgoff_t *offset) +{ + unsigned int flags, nr_folios, nr_found; + unsigned int i, pgshift = PAGE_SHIFT; + pgoff_t start_idx, end_idx, next_idx; + struct folio *folio = NULL; + struct folio_batch fbatch; + struct hstate *h; + long ret = -EINVAL; + + if (start < 0 || start > end || !max_folios) + return -EINVAL; + + if (!memfd) + return -EINVAL; + + if (!shmem_file(memfd) && !is_file_hugepages(memfd)) + return -EINVAL; + + if (end >= i_size_read(file_inode(memfd))) + return -EINVAL; + + if (is_file_hugepages(memfd)) { + h = hstate_file(memfd); + pgshift = huge_page_shift(h); + } + + flags = memalloc_pin_save(); + do { + nr_folios = 0; + start_idx = start >> pgshift; + end_idx = end >> pgshift; + if (is_file_hugepages(memfd)) { + start_idx <<= huge_page_order(h); + end_idx <<= huge_page_order(h); + } + + folio_batch_init(&fbatch); + while (start_idx <= end_idx && nr_folios < max_folios) { + /* + * In most cases, we should be able to find the folios + * in the page cache. If we cannot find them for some + * reason, we try to allocate them and add them to the + * page cache. 
+ */ + nr_found = filemap_get_folios_contig(memfd->f_mapping, + &start_idx, + end_idx, + &fbatch); + if (folio) { + folio_put(folio); + folio = NULL; + } + + next_idx = 0; + for (i = 0; i < nr_found; i++) { + /* + * As there can be multiple entries for a + * given folio in the batch returned by + * filemap_get_folios_contig(), the below + * check is to ensure that we pin and return a + * unique set of folios between start and end. + */ + if (next_idx && + next_idx != folio_index(fbatch.folios[i])) + continue; + + folio = page_folio(&fbatch.folios[i]->page); + + if (try_grab_folio(folio, 1, FOLL_PIN)) { + folio_batch_release(&fbatch); + ret = -EINVAL; + goto err; + } + + if (nr_folios == 0) + *offset = offset_in_folio(folio, start); + + folios[nr_folios] = folio; + next_idx = folio_next_index(folio); + if (++nr_folios == max_folios) + break; + } + + folio = NULL; + folio_batch_release(&fbatch); + if (!nr_found) { + folio = memfd_alloc_folio(memfd, start_idx); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + if (ret != -EEXIST) + goto err; + } + } + } + + ret = check_and_migrate_movable_folios(nr_folios, folios); + } while (ret == -EAGAIN); + + memalloc_pin_restore(flags); + return ret ? ret : nr_folios; +err: + memalloc_pin_restore(flags); + unpin_folios(folios, nr_folios); + + return ret; +} +EXPORT_SYMBOL_GPL(memfd_pin_folios); diff --git a/mm/memfd.c b/mm/memfd.c index 2dba2cb6f0d0..9cb06d85bcce 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -63,6 +63,51 @@ static void memfd_tag_pins(struct xa_state *xas) xas_unlock_irq(xas); } +/* + * This is a helper function used by memfd_pin_user_pages() in GUP (gup.c). + * It is mainly called to allocate a folio in a memfd when the caller + * (memfd_pin_folios()) cannot find a folio in the page cache at a given + * index in the mapping. + */ +struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) +{ +#ifdef CONFIG_HUGETLB_PAGE + struct folio *folio; + gfp_t gfp_mask; + int err; + + if (is_file_hugepages(memfd)) { + /* + * The folio would most likely be accessed by a DMA driver, + * therefore, we have zone memory constraints where we can + * alloc from. Also, the folio will be pinned for an indefinite + * amount of time, so it is not expected to be migrated away. + */ + gfp_mask = htlb_alloc_mask(hstate_file(memfd)); + gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE); + + folio = alloc_hugetlb_folio_nodemask(hstate_file(memfd), + numa_node_id(), + NULL, + gfp_mask, + false); + if (folio && folio_try_get(folio)) { + err = hugetlb_add_to_page_cache(folio, + memfd->f_mapping, + idx); + if (err) { + folio_put(folio); + free_huge_folio(folio); + return ERR_PTR(err); + } + return folio; + } + return ERR_PTR(-ENOMEM); + } +#endif + return shmem_read_folio(memfd->f_mapping, idx); +} + /* * Setting SEAL_WRITE requires us to verify there's no pending writer. However, * via get_user_pages(), drivers might have some pending I/O without any active -- Gitee From cab740bc8f8c4ae1618fee8f58dfa1c938acdb4d Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 25 Oct 2024 06:11:51 -0700 Subject: [PATCH 023/278] mm/gup: Add folio_add_pins() ANBZ: #13617 commit a2ad1b8101a36c927bcbd1317475b9576c352155 upstream. Export a function that adds pins to an already-pinned huge-page folio. This allows any range of small pages within the folio to be unpinned later. For example, pages pinned via memfd_pin_folios and modified by folio_add_pins could be unpinned via unpin_user_page(s). 
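A hedged usage sketch (hypothetical driver-side code, not part of this patch) of how folio_add_pins() composes with memfd_pin_folios(): pin the folios backing a memfd range, then take an extra pin on a folio so that a single page of it can later be released independently with unpin_user_page().

    /*
     * Hypothetical helper: pin the folios backing [start, end] of a memfd and
     * add one extra pin on the first folio; the caller may then drop that
     * extra pin page-wise via unpin_user_page() while the range stays pinned.
     */
    static long example_pin_memfd_range(struct file *memfd, loff_t start, loff_t end,
                                        struct folio **folios, unsigned int max_folios)
    {
        pgoff_t first_offset;
        long nr;
        int rc;

        nr = memfd_pin_folios(memfd, start, end, folios, max_folios, &first_offset);
        if (nr <= 0)
            return nr;  /* -errno if nothing could be pinned */

        rc = folio_add_pins(folios[0], 1);
        if (rc) {
            unpin_folios(folios, nr);
            return rc;
        }
        return nr;
    }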
Link: https://patch.msgid.link/r/1729861919-234514-2-git-send-email-steven.sistare@oracle.com Suggested-by: Jason Gunthorpe Suggested-by: David Hildenbrand Signed-off-by: Steve Sistare Reviewed-by: Jason Gunthorpe Acked-by: David Hildenbrand Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- include/linux/mm.h | 1 + mm/gup.c | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index a4148eae3425..c894df090873 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2626,6 +2626,7 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, struct folio **folios, unsigned int max_folios, pgoff_t *offset); +int folio_add_pins(struct folio *folio, unsigned int pins); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); diff --git a/mm/gup.c b/mm/gup.c index 42995d543f64..952a35700b12 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3661,3 +3661,27 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, return ret; } EXPORT_SYMBOL_GPL(memfd_pin_folios); + +/** + * folio_add_pins() - add pins to an already-pinned folio + * @folio: the folio to add more pins to + * @pins: number of pins to add + * + * Try to add more pins to an already-pinned folio. The semantics + * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot + * be changed. + * + * This function is helpful when having obtained a pin on a large folio + * using memfd_pin_folios(), but wanting to logically unpin parts + * (e.g., individual pages) of the folio later, for example, using + * unpin_user_page_range_dirty_lock(). + * + * This is not the right interface to initially pin a folio. + */ +int folio_add_pins(struct folio *folio, unsigned int pins) +{ + VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio)); + + return try_grab_folio(folio, pins, FOLL_PIN); +} +EXPORT_SYMBOL_GPL(folio_add_pins); -- Gitee From 01d4c2db15f1272e937a6ec85145c9b12fe8ca8b Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 25 Oct 2024 06:11:52 -0700 Subject: [PATCH 024/278] iommufd: Rename uptr in iopt_alloc_iova() ANBZ: #13617 commit 32383c085c1c7e2e4d9e1a3b31eb1ff17b0a5a85 upstream. iopt_alloc_iova() takes a uptr argument but only checks for its alignment. Generalize this to an unsigned address, which can be the offset from the start of a file in a subsequent patch. No functional change. Link: https://patch.msgid.link/r/1729861919-234514-3-git-send-email-steven.sistare@oracle.com Signed-off-by: Steve Sistare Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/io_pagetable.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 4bf7ccd39d46..68ad91d42d03 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -107,9 +107,9 @@ static bool __alloc_iova_check_used(struct interval_tree_span_iter *span, * Does not return a 0 IOVA even if it is valid. 
*/ static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, - unsigned long uptr, unsigned long length) + unsigned long addr, unsigned long length) { - unsigned long page_offset = uptr % PAGE_SIZE; + unsigned long page_offset = addr % PAGE_SIZE; struct interval_tree_double_span_iter used_span; struct interval_tree_span_iter allowed_span; unsigned long max_alignment = PAGE_SIZE; @@ -122,15 +122,15 @@ static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, return -EOVERFLOW; /* - * Keep alignment present in the uptr when building the IOVA, this + * Keep alignment present in addr when building the IOVA, which * increases the chance we can map a THP. */ - if (!uptr) + if (!addr) iova_alignment = roundup_pow_of_two(length); else iova_alignment = min_t(unsigned long, roundup_pow_of_two(length), - 1UL << __ffs64(uptr)); + 1UL << __ffs64(addr)); #ifdef CONFIG_TRANSPARENT_HUGEPAGE max_alignment = HPAGE_SIZE; @@ -248,6 +248,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, int iommu_prot, unsigned int flags) { struct iopt_pages_list *elm; + unsigned long start; unsigned long iova; int rc = 0; @@ -267,9 +268,8 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, /* Use the first entry to guess the ideal IOVA alignment */ elm = list_first_entry(pages_list, struct iopt_pages_list, next); - rc = iopt_alloc_iova( - iopt, dst_iova, - (uintptr_t)elm->pages->uptr + elm->start_byte, length); + start = elm->start_byte + (uintptr_t)elm->pages->uptr; + rc = iopt_alloc_iova(iopt, dst_iova, start, length); if (rc) goto out_unlock; if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && -- Gitee From 0baede73d96ee602f66bc6101492272f96bcbf98 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 25 Oct 2024 06:11:53 -0700 Subject: [PATCH 025/278] iommufd: Generalize iopt_pages address ANBZ: #13617 commit 99ff06dcca20b0d88b21278ebc563ce1bfbbd5b0 upstream. The starting address in iopt_pages is currently a __user *uptr. Generalize to allow other types of addresses. Refactor iopt_alloc_pages() and iopt_map_user_pages() into address-type specific and common functions. 
Link: https://patch.msgid.link/r/1729861919-234514-4-git-send-email-steven.sistare@oracle.com Suggested-by: Nicolin Chen Signed-off-by: Steve Sistare Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/io_pagetable.c | 55 +++++++++++++++++----------- drivers/iommu/iommufd/io_pagetable.h | 13 +++++-- drivers/iommu/iommufd/pages.c | 31 ++++++++++++---- 3 files changed, 67 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 68ad91d42d03..874ee9ea7efc 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -384,6 +384,34 @@ int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, return rc; } +static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + struct iopt_pages *pages, unsigned long *iova, + unsigned long length, unsigned long start_byte, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages_list elm = {}; + LIST_HEAD(pages_list); + int rc; + + elm.pages = pages; + elm.start_byte = start_byte; + if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && + elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) + elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; + elm.length = length; + list_add(&elm.next, &pages_list); + + rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); + if (rc) { + if (elm.area) + iopt_abort_area(elm.area); + if (elm.pages) + iopt_put_pages(elm.pages); + return rc; + } + return 0; +} + /** * iopt_map_user_pages() - Map a user VA to an iova in the io page table * @ictx: iommufd_ctx the iopt is part of @@ -408,29 +436,14 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long length, int iommu_prot, unsigned int flags) { - struct iopt_pages_list elm = {}; - LIST_HEAD(pages_list); - int rc; + struct iopt_pages *pages; - elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE); - if (IS_ERR(elm.pages)) - return PTR_ERR(elm.pages); - if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && - elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) - elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; - elm.start_byte = uptr - elm.pages->uptr; - elm.length = length; - list_add(&elm.next, &pages_list); + pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE); + if (IS_ERR(pages)) + return PTR_ERR(pages); - rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); - if (rc) { - if (elm.area) - iopt_abort_area(elm.area); - if (elm.pages) - iopt_put_pages(elm.pages); - return rc; - } - return 0; + return iopt_map_common(ictx, iopt, pages, iova, length, + uptr - pages->uptr, iommu_prot, flags); } struct iova_bitmap_fn_arg { diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index c61d74471684..8e482663c91a 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -175,6 +175,10 @@ enum { IOPT_PAGES_ACCOUNT_MM = 2, }; +enum iopt_address_type { + IOPT_ADDRESS_USER = 0, +}; + /* * This holds a pinned page list for multiple areas of IO address space. The * pages always originate from a linear chunk of userspace VA. 
Multiple @@ -195,7 +199,10 @@ struct iopt_pages { struct task_struct *source_task; struct mm_struct *source_mm; struct user_struct *source_user; - void __user *uptr; + enum iopt_address_type type; + union { + void __user *uptr; /* IOPT_ADDRESS_USER */ + }; bool writable:1; u8 account_mode; @@ -206,8 +213,8 @@ struct iopt_pages { struct rb_root_cached domains_itree; }; -struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, - bool writable); +struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, + unsigned long length, bool writable); void iopt_release_pages(struct kref *kref); static inline void iopt_put_pages(struct iopt_pages *pages) { diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 93d806c9c073..4206e90cc451 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1139,11 +1139,11 @@ static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, return 0; } -struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, - bool writable) +static struct iopt_pages *iopt_alloc_pages(unsigned long start_byte, + unsigned long length, + bool writable) { struct iopt_pages *pages; - unsigned long end; /* * The iommu API uses size_t as the length, and protect the DIV_ROUND_UP @@ -1152,9 +1152,6 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, if (length > SIZE_MAX - PAGE_SIZE || length == 0) return ERR_PTR(-EINVAL); - if (check_add_overflow((unsigned long)uptr, length, &end)) - return ERR_PTR(-EOVERFLOW); - pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT); if (!pages) return ERR_PTR(-ENOMEM); @@ -1164,8 +1161,7 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, mutex_init(&pages->mutex); pages->source_mm = current->mm; mmgrab(pages->source_mm); - pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); - pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE); + pages->npages = DIV_ROUND_UP(length + start_byte, PAGE_SIZE); pages->access_itree = RB_ROOT_CACHED; pages->domains_itree = RB_ROOT_CACHED; pages->writable = writable; @@ -1179,6 +1175,25 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, return pages; } +struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, + unsigned long length, bool writable) +{ + struct iopt_pages *pages; + unsigned long end; + void __user *uptr_down = + (void __user *) ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); + + if (check_add_overflow((unsigned long)uptr, length, &end)) + return ERR_PTR(-EOVERFLOW); + + pages = iopt_alloc_pages(uptr - uptr_down, length, writable); + if (IS_ERR(pages)) + return pages; + pages->uptr = uptr_down; + pages->type = IOPT_ADDRESS_USER; + return pages; +} + void iopt_release_pages(struct kref *kref) { struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); -- Gitee From 5f3a4106e25bbef91e623e8fc739b6e08102a881 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 25 Oct 2024 06:11:54 -0700 Subject: [PATCH 026/278] iommufd: pfn_reader local variables ANBZ: #13617 commit c27f0a606c2abcea8db00a6f962fc023b8e68562 upstream. Add local variables for common sub-expressions needed by a subsequent patch. No functional change. 
Link: https://patch.msgid.link/r/1729861919-234514-5-git-send-email-steven.sistare@oracle.com Signed-off-by: Steve Sistare Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/pages.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 4206e90cc451..9097b2575651 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -978,6 +978,8 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) { struct interval_tree_double_span_iter *span = &pfns->span; unsigned long start_index = pfns->batch_end_index; + struct pfn_reader_user *user = &pfns->user; + unsigned long npages; struct iopt_area *area; int rc; @@ -1015,10 +1017,9 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) return rc; } - batch_from_pages(&pfns->batch, - pfns->user.upages + - (start_index - pfns->user.upages_start), - pfns->user.upages_end - start_index); + npages = user->upages_end - start_index; + start_index -= user->upages_start; + batch_from_pages(&pfns->batch, user->upages + start_index, npages); return 0; } @@ -1092,16 +1093,18 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, static void pfn_reader_release_pins(struct pfn_reader *pfns) { struct iopt_pages *pages = pfns->pages; + struct pfn_reader_user *user = &pfns->user; - if (pfns->user.upages_end > pfns->batch_end_index) { - size_t npages = pfns->user.upages_end - pfns->batch_end_index; - + if (user->upages_end > pfns->batch_end_index) { /* Any pages not transferred to the batch are just unpinned */ - unpin_user_pages(pfns->user.upages + (pfns->batch_end_index - - pfns->user.upages_start), - npages); + + unsigned long npages = user->upages_end - pfns->batch_end_index; + unsigned long start_index = pfns->batch_end_index - + user->upages_start; + + unpin_user_pages(user->upages + start_index, npages); iopt_pages_sub_npinned(pages, npages); - pfns->user.upages_end = pfns->batch_end_index; + user->upages_end = pfns->batch_end_index; } if (pfns->batch_start_index != pfns->batch_end_index) { pfn_reader_unpin(pfns); -- Gitee From 5364788df6d6bff5ca518741de6c5c3d5dd41d21 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 25 Oct 2024 06:11:55 -0700 Subject: [PATCH 027/278] iommufd: Folio subroutines ANBZ: #13617 commit ed9178fbfd4eb0f6a8c87f56dfb8309c65a4b8a4 upstream. Add subroutines for copying folios to a batch. 
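The new subroutines are built around run-length coalescing of physically contiguous pfns; a small, self-contained sketch in plain C (illustrative only, simplified from the kernel's pfn_batch and omitting the per-run counter overflow check) of that merging step, mirroring the shape of batch_add_pfn_num() in the hunk below:

    #include <stdbool.h>
    #include <stddef.h>

    struct pfn_run {
        unsigned long pfn;   /* first pfn of the run */
        unsigned int  npfns; /* number of contiguous pfns in the run */
    };

    /*
     * Append nr pfns starting at pfn, merging with the previous run when they
     * are physically contiguous; returns false once the array is full.
     */
    static bool run_add(struct pfn_run *runs, size_t cap, size_t *end,
                        unsigned long pfn, unsigned int nr)
    {
        if (*end && pfn == runs[*end - 1].pfn + runs[*end - 1].npfns) {
            runs[*end - 1].npfns += nr;
            return true;
        }
        if (*end == cap)
            return false;
        runs[*end].pfn = pfn;
        runs[*end].npfns = nr;
        (*end)++;
        return true;
    }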
Link: https://patch.msgid.link/r/1729861919-234514-6-git-send-email-steven.sistare@oracle.com Signed-off-by: Steve Sistare Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/pages.c | 79 ++++++++++++++++++++++++++++------- 1 file changed, 64 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 9097b2575651..aa79504af758 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -346,27 +346,41 @@ static void batch_destroy(struct pfn_batch *batch, void *backup) kfree(batch->pfns); } -/* true if the pfn was added, false otherwise */ -static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) +static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn, + u32 nr) { const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns)); - - if (batch->end && - pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] && - batch->npfns[batch->end - 1] != MAX_NPFNS) { - batch->npfns[batch->end - 1]++; - batch->total_pfns++; - return true; - } - if (batch->end == batch->array_size) + unsigned int end = batch->end; + + if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] && + nr <= MAX_NPFNS - batch->npfns[end - 1]) { + batch->npfns[end - 1] += nr; + } else if (end < batch->array_size) { + batch->pfns[end] = pfn; + batch->npfns[end] = nr; + batch->end++; + } else { return false; - batch->total_pfns++; - batch->pfns[batch->end] = pfn; - batch->npfns[batch->end] = 1; - batch->end++; + } + + batch->total_pfns += nr; return true; } +static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr) +{ + batch->npfns[batch->end - 1] -= nr; + if (batch->npfns[batch->end - 1] == 0) + batch->end--; + batch->total_pfns -= nr; +} + +/* true if the pfn was added, false otherwise */ +static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) +{ + return batch_add_pfn_num(batch, pfn, 1); +} + /* * Fill the batch with pfns from the domain. When the batch is full, or it * reaches last_index, the function will return. The caller should use @@ -622,6 +636,41 @@ static void batch_from_pages(struct pfn_batch *batch, struct page **pages, break; } +static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p, + unsigned long *offset_p, unsigned long npages) +{ + int rc = 0; + struct folio **folios = *folios_p; + unsigned long offset = *offset_p; + + while (npages) { + struct folio *folio = *folios; + unsigned long nr = folio_nr_pages(folio) - offset; + unsigned long pfn = page_to_pfn(folio_page(folio, offset)); + + nr = min(nr, npages); + npages -= nr; + + if (!batch_add_pfn_num(batch, pfn, nr)) + break; + if (nr > 1) { + rc = folio_add_pins(folio, nr - 1); + if (rc) { + batch_remove_pfn_num(batch, nr); + goto out; + } + } + + folios++; + offset = 0; + } + +out: + *folios_p = folios; + *offset_p = offset; + return rc; +} + static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages, unsigned int first_page_off, size_t npages) { -- Gitee From 37291ded9d8e1cf4f6141aabf9715845008977eb Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 25 Oct 2024 06:11:56 -0700 Subject: [PATCH 028/278] iommufd: pfn_reader for file mappings ANBZ: #13617 commit 92687c793644c01aa4ca961a5f2cb93ac20a27f0 upstream. Extend pfn_reader_user() to pin file mappings, by calling memfd_pin_folios(). Repin at small page granularity, and fill the batch from folios. 
Expand folios to upages for the iopt_pages_fill() path. Link: https://patch.msgid.link/r/1729861919-234514-7-git-send-email-steven.sistare@oracle.com Signed-off-by: Steve Sistare Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/io_pagetable.h | 5 ++ drivers/iommu/iommufd/pages.c | 128 +++++++++++++++++++++++---- 2 files changed, 116 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 8e482663c91a..5ac4eedc0be3 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -177,6 +177,7 @@ enum { enum iopt_address_type { IOPT_ADDRESS_USER = 0, + IOPT_ADDRESS_FILE = 1, }; /* @@ -202,6 +203,10 @@ struct iopt_pages { enum iopt_address_type type; union { void __user *uptr; /* IOPT_ADDRESS_USER */ + struct { /* IOPT_ADDRESS_FILE */ + struct file *file; + unsigned long start; + }; }; bool writable:1; u8 account_mode; diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index aa79504af758..5f371fa88a4a 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -752,19 +752,32 @@ struct pfn_reader_user { * neither */ int locked; + + /* The following are only valid if file != NULL. */ + struct file *file; + struct folio **ufolios; + size_t ufolios_len; + unsigned long ufolios_offset; + struct folio **ufolios_next; }; static void pfn_reader_user_init(struct pfn_reader_user *user, struct iopt_pages *pages) { user->upages = NULL; + user->upages_len = 0; user->upages_start = 0; user->upages_end = 0; user->locked = -1; - user->gup_flags = FOLL_LONGTERM; if (pages->writable) user->gup_flags |= FOLL_WRITE; + + user->file = (pages->type == IOPT_ADDRESS_FILE) ? pages->file : NULL; + user->ufolios = NULL; + user->ufolios_len = 0; + user->ufolios_next = NULL; + user->ufolios_offset = 0; } static void pfn_reader_user_destroy(struct pfn_reader_user *user, @@ -773,13 +786,67 @@ static void pfn_reader_user_destroy(struct pfn_reader_user *user, if (user->locked != -1) { if (user->locked) mmap_read_unlock(pages->source_mm); - if (pages->source_mm != current->mm) + if (!user->file && pages->source_mm != current->mm) mmput(pages->source_mm); user->locked = -1; } kfree(user->upages); user->upages = NULL; + kfree(user->ufolios); + user->ufolios = NULL; +} + +static long pin_memfd_pages(struct pfn_reader_user *user, unsigned long start, + unsigned long npages) +{ + unsigned long i; + unsigned long offset; + unsigned long npages_out = 0; + struct page **upages = user->upages; + unsigned long end = start + (npages << PAGE_SHIFT) - 1; + long nfolios = user->ufolios_len / sizeof(*user->ufolios); + + /* + * todo: memfd_pin_folios should return the last pinned offset so + * we can compute npages pinned, and avoid looping over folios here + * if upages == NULL. 
+ */ + nfolios = memfd_pin_folios(user->file, start, end, user->ufolios, + nfolios, &offset); + if (nfolios <= 0) + return nfolios; + + offset >>= PAGE_SHIFT; + user->ufolios_next = user->ufolios; + user->ufolios_offset = offset; + + for (i = 0; i < nfolios; i++) { + struct folio *folio = user->ufolios[i]; + unsigned long nr = folio_nr_pages(folio); + unsigned long npin = min(nr - offset, npages); + + npages -= npin; + npages_out += npin; + + if (upages) { + if (npin == 1) { + *upages++ = folio_page(folio, offset); + } else { + int rc = folio_add_pins(folio, npin - 1); + + if (rc) + return rc; + + while (npin--) + *upages++ = folio_page(folio, offset++); + } + } + + offset = 0; + } + + return npages_out; } static int pfn_reader_user_pin(struct pfn_reader_user *user, @@ -788,7 +855,9 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, unsigned long last_index) { bool remote_mm = pages->source_mm != current->mm; - unsigned long npages; + unsigned long npages = last_index - start_index + 1; + unsigned long start; + unsigned long unum; uintptr_t uptr; long rc; @@ -796,40 +865,50 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, WARN_ON(last_index < start_index)) return -EINVAL; - if (!user->upages) { + if (!user->file && !user->upages) { /* All undone in pfn_reader_destroy() */ - user->upages_len = - (last_index - start_index + 1) * sizeof(*user->upages); + user->upages_len = npages * sizeof(*user->upages); user->upages = temp_kmalloc(&user->upages_len, NULL, 0); if (!user->upages) return -ENOMEM; } + if (user->file && !user->ufolios) { + user->ufolios_len = npages * sizeof(*user->ufolios); + user->ufolios = temp_kmalloc(&user->ufolios_len, NULL, 0); + if (!user->ufolios) + return -ENOMEM; + } + if (user->locked == -1) { /* * The majority of usages will run the map task within the mm * providing the pages, so we can optimize into * get_user_pages_fast() */ - if (remote_mm) { + if (!user->file && remote_mm) { if (!mmget_not_zero(pages->source_mm)) return -EFAULT; } user->locked = 0; } - npages = min_t(unsigned long, last_index - start_index + 1, - user->upages_len / sizeof(*user->upages)); - + unum = user->file ? 
user->ufolios_len / sizeof(*user->ufolios) : + user->upages_len / sizeof(*user->upages); + npages = min_t(unsigned long, npages, unum); if (iommufd_should_fail()) return -EFAULT; - uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); - if (!remote_mm) + if (user->file) { + start = pages->start + (start_index * PAGE_SIZE); + rc = pin_memfd_pages(user, start, npages); + } else if (!remote_mm) { + uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); rc = pin_user_pages_fast(uptr, npages, user->gup_flags, user->upages); - else { + } else { + uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); if (!user->locked) { mmap_read_lock(pages->source_mm); user->locked = 1; @@ -887,7 +966,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, mmap_read_unlock(pages->source_mm); user->locked = 0; /* If we had the lock then we also have a get */ - } else if ((!user || !user->upages) && + + } else if ((!user || (!user->upages && !user->ufolios)) && pages->source_mm != current->mm) { if (!mmget_not_zero(pages->source_mm)) return -EINVAL; @@ -1068,8 +1148,15 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) npages = user->upages_end - start_index; start_index -= user->upages_start; - batch_from_pages(&pfns->batch, user->upages + start_index, npages); - return 0; + rc = 0; + + if (!user->file) + batch_from_pages(&pfns->batch, user->upages + start_index, + npages); + else + rc = batch_from_folios(&pfns->batch, &user->ufolios_next, + &user->ufolios_offset, npages); + return rc; } static bool pfn_reader_done(struct pfn_reader *pfns) @@ -1151,7 +1238,14 @@ static void pfn_reader_release_pins(struct pfn_reader *pfns) unsigned long start_index = pfns->batch_end_index - user->upages_start; - unpin_user_pages(user->upages + start_index, npages); + if (!user->file) { + unpin_user_pages(user->upages + start_index, npages); + } else { + long n = user->ufolios_len / sizeof(*user->ufolios); + + unpin_folios(user->ufolios_next, + user->ufolios + n - user->ufolios_next); + } iopt_pages_sub_npinned(pages, npages); user->upages_end = pfns->batch_end_index; } -- Gitee From 6b878d243ab90ad7a0f511ec04b43c66fc41debf Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 25 Oct 2024 06:11:57 -0700 Subject: [PATCH 029/278] iommufd: Add IOMMU_IOAS_MAP_FILE ANBZ: #13617 commit f4986a72d6e4be78ec0e4ee0e03531474621183f upstream. Define the IOMMU_IOAS_MAP_FILE ioctl interface, which allows a user to register memory by passing a memfd plus offset and length. Implement it using the memfd_pin_folios() kAPI. 
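A minimal user-space sketch of the new ioctl (illustrative only: the open iommufd file descriptor, the ioas_id, the memfd and the length are assumed to already exist, and <sys/ioctl.h>, <linux/iommufd.h> and <err.h> are needed; the struct layout and flag names are the ones defined by this patch):

  struct iommu_ioas_map_file cmd = {
          .size = sizeof(cmd),
          .flags = IOMMU_IOAS_MAP_READABLE | IOMMU_IOAS_MAP_WRITEABLE,
          .ioas_id = ioas_id,
          .fd = memfd,          /* fd returned by memfd_create() */
          .start = 0,           /* byte offset into the memfd */
          .length = length,     /* number of bytes to map */
          /* .iova is ignored unless IOMMU_IOAS_MAP_FIXED_IOVA is set */
  };

  if (ioctl(iommufd, IOMMU_IOAS_MAP_FILE, &cmd))
          err(1, "IOMMU_IOAS_MAP_FILE");
  /* on success the kernel-chosen IOVA is returned in cmd.iova */
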
Link: https://patch.msgid.link/r/1729861919-234514-8-git-send-email-steven.sistare@oracle.com Suggested-by: Jason Gunthorpe Signed-off-by: Steve Sistare Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/io_pagetable.c | 36 ++++++++++++++++++- drivers/iommu/iommufd/io_pagetable.h | 2 ++ drivers/iommu/iommufd/ioas.c | 47 +++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 5 +++ drivers/iommu/iommufd/main.c | 2 ++ drivers/iommu/iommufd/pages.c | 23 ++++++++++++ include/uapi/linux/iommufd.h | 25 +++++++++++++ 7 files changed, 139 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 874ee9ea7efc..8a790e597e12 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -268,7 +268,14 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, /* Use the first entry to guess the ideal IOVA alignment */ elm = list_first_entry(pages_list, struct iopt_pages_list, next); - start = elm->start_byte + (uintptr_t)elm->pages->uptr; + switch (elm->pages->type) { + case IOPT_ADDRESS_USER: + start = elm->start_byte + (uintptr_t)elm->pages->uptr; + break; + case IOPT_ADDRESS_FILE: + start = elm->start_byte + elm->pages->start; + break; + } rc = iopt_alloc_iova(iopt, dst_iova, start, length); if (rc) goto out_unlock; @@ -446,6 +453,33 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, uptr - pages->uptr, iommu_prot, flags); } +/** + * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file. + * @ictx: iommufd_ctx the iopt is part of + * @iopt: io_pagetable to act on + * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains + * the chosen iova on output. 
Otherwise is the iova to map to on input + * @file: file to map + * @start: map file starting at this byte offset + * @length: Number of bytes to map + * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping + * @flags: IOPT_ALLOC_IOVA or zero + */ +int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, struct file *file, + unsigned long start, unsigned long length, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages *pages; + + pages = iopt_alloc_file_pages(file, start, length, + iommu_prot & IOMMU_WRITE); + if (IS_ERR(pages)) + return PTR_ERR(pages); + return iopt_map_common(ictx, iopt, pages, iova, length, + start - pages->start, iommu_prot, flags); +} + struct iova_bitmap_fn_arg { unsigned long flags; struct io_pagetable *iopt; diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 5ac4eedc0be3..9b40b2237932 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -220,6 +220,8 @@ struct iopt_pages { struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, unsigned long length, bool writable); +struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, + unsigned long length, bool writable); void iopt_release_pages(struct kref *kref); static inline void iopt_put_pages(struct iopt_pages *pages) { diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 2c4b2bb11e78..c05d33fc3a50 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ +#include #include #include #include @@ -197,6 +198,52 @@ static int conv_iommu_prot(u32 map_flags) return iommu_prot; } +int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_map_file *cmd = ucmd->cmd; + unsigned long iova = cmd->iova; + struct iommufd_ioas *ioas; + unsigned int flags = 0; + struct file *file; + int rc; + + if (cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -EOPNOTSUPP; + + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) + return -EOVERFLOW; + + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + + file = fget(cmd->fd); + if (!file) + return -EBADF; + + rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file, + cmd->start, cmd->length, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put; + + cmd->iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put: + iommufd_put_object(ucmd->ictx, &ioas->obj); + fput(file); + return rc; +} + int iommufd_ioas_map(struct iommufd_ucmd *ucmd) { struct iommu_ioas_map *cmd = ucmd->cmd; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index f1d865e6fab6..8f3c21a664bd 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -69,6 +69,10 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long *iova, void __user *uptr, unsigned long length, int iommu_prot, unsigned int flags); +int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, struct file *file, + unsigned long start, unsigned long length, + int 
iommu_prot, unsigned int flags); int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, unsigned long length, unsigned long *dst_iova, int iommu_prot, unsigned int flags); @@ -276,6 +280,7 @@ void iommufd_ioas_destroy(struct iommufd_object *obj); int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); int iommufd_ioas_map(struct iommufd_ucmd *ucmd); +int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd); int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); int iommufd_ioas_option(struct iommufd_ucmd *ucmd); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index b5f5d27ee963..826a2b2be52f 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -378,6 +378,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { struct iommu_ioas_iova_ranges, out_iova_alignment), IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, iova), + IOCTL_OP(IOMMU_IOAS_MAP_FILE, iommufd_ioas_map_file, + struct iommu_ioas_map_file, iova), IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, length), IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 5f371fa88a4a..2ee6fcd2b551 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -45,6 +45,7 @@ * last_iova + 1 can overflow. An iopt_pages index will always be much less than * ULONG_MAX so last_index + 1 cannot overflow. */ +#include #include #include #include @@ -1340,6 +1341,26 @@ struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, return pages; } +struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, + unsigned long length, bool writable) + +{ + struct iopt_pages *pages; + unsigned long start_down = ALIGN_DOWN(start, PAGE_SIZE); + unsigned long end; + + if (length && check_add_overflow(start, length - 1, &end)) + return ERR_PTR(-EOVERFLOW); + + pages = iopt_alloc_pages(start - start_down, length, writable); + if (IS_ERR(pages)) + return pages; + pages->file = get_file(file); + pages->start = start_down; + pages->type = IOPT_ADDRESS_FILE; + return pages; +} + void iopt_release_pages(struct kref *kref) { struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); @@ -1352,6 +1373,8 @@ void iopt_release_pages(struct kref *kref) mutex_destroy(&pages->mutex); put_task_struct(pages->source_task); free_uid(pages->source_user); + if (pages->type == IOPT_ADDRESS_FILE) + fput(pages->file); kfree(pages); } diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 72010f71c5e4..41b1a01e9293 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -51,6 +51,7 @@ enum { IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c, IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d, IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, + IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f, }; /** @@ -213,6 +214,30 @@ struct iommu_ioas_map { }; #define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP) +/** + * struct iommu_ioas_map_file - ioctl(IOMMU_IOAS_MAP_FILE) + * @size: sizeof(struct iommu_ioas_map_file) + * @flags: same as for iommu_ioas_map + * @ioas_id: same as for iommu_ioas_map + * @fd: the memfd to map + * @start: byte offset from start of file to map from + * @length: same as for iommu_ioas_map + * @iova: same as for iommu_ioas_map + * + * Set an IOVA mapping from a memfd file. 
All other arguments and semantics + * match those of IOMMU_IOAS_MAP. + */ +struct iommu_ioas_map_file { + __u32 size; + __u32 flags; + __u32 ioas_id; + __s32 fd; + __aligned_u64 start; + __aligned_u64 length; + __aligned_u64 iova; +}; +#define IOMMU_IOAS_MAP_FILE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP_FILE) + /** * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY) * @size: sizeof(struct iommu_ioas_copy) -- Gitee From efc9fa1dac5d20b5d9ad9e1b207f7c92b56014f4 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 25 Oct 2024 06:11:58 -0700 Subject: [PATCH 030/278] iommufd: File mappings for mdev ANBZ: #13617 commit 976a40c0756194001392e6624afd830fcc3873e6 upstream. Support file mappings for mediated devices, aka mdevs. Access is initiated by the vfio_pin_pages() and vfio_dma_rw() kernel interfaces. Link: https://patch.msgid.link/r/1729861919-234514-9-git-send-email-steven.sistare@oracle.com Signed-off-by: Steve Sistare Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/pages.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 2ee6fcd2b551..8f249169831a 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1814,11 +1814,11 @@ static int iopt_pages_fill_from_domain(struct iopt_pages *pages, return 0; } -static int iopt_pages_fill_from_mm(struct iopt_pages *pages, - struct pfn_reader_user *user, - unsigned long start_index, - unsigned long last_index, - struct page **out_pages) +static int iopt_pages_fill(struct iopt_pages *pages, + struct pfn_reader_user *user, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) { unsigned long cur_index = start_index; int rc; @@ -1892,8 +1892,8 @@ int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index, /* hole */ cur_pages = out_pages + (span.start_hole - start_index); - rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole, - span.last_hole, cur_pages); + rc = iopt_pages_fill(pages, &user, span.start_hole, + span.last_hole, cur_pages); if (rc) goto out_clean_xa; rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole, @@ -1973,6 +1973,10 @@ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, struct page *page = NULL; int rc; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(pages->type != IOPT_ADDRESS_USER)) + return -EINVAL; + if (!mmget_not_zero(pages->source_mm)) return iopt_pages_rw_slow(pages, index, index, offset, data, length, flags); @@ -2028,6 +2032,15 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) return -EPERM; + if (pages->type == IOPT_ADDRESS_FILE) + return iopt_pages_rw_slow(pages, start_index, last_index, + start_byte % PAGE_SIZE, data, length, + flags); + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(pages->type != IOPT_ADDRESS_USER)) + return -EINVAL; + if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) { if (start_index == last_index) return iopt_pages_rw_page(pages, start_index, -- Gitee From 1f91d5f08a30df24540ebeae6c15fb33fa32074c Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 25 Oct 2024 06:11:59 -0700 Subject: [PATCH 031/278] iommufd: Selftest coverage for IOMMU_IOAS_MAP_FILE ANBZ: #13617 commit 0bcceb1f51c77f6b98a7aab00847ed340bf36e35 upstream. Add test cases to exercise IOMMU_IOAS_MAP_FILE. 
Link: https://patch.msgid.link/r/1729861919-234514-10-git-send-email-steven.sistare@oracle.com Signed-off-by: Steve Sistare Reviewed-by: Nicolin Chen Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- tools/testing/selftests/iommu/iommufd.c | 124 +++++++++++++++--- .../selftests/iommu/iommufd_fail_nth.c | 39 ++++++ tools/testing/selftests/iommu/iommufd_utils.h | 57 ++++++++ 3 files changed, 205 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 4927b9add5ad..88b92bb69756 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ +#include #include #include #include @@ -49,6 +50,9 @@ static __attribute__((constructor)) void setup_sizes(void) vrc = mmap(buffer, BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0); assert(vrc == buffer); + + mfd_buffer = memfd_mmap(BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, + &mfd); } FIXTURE(iommufd) @@ -128,6 +132,7 @@ TEST_F(iommufd, cmd_length) TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP, length); TEST_LENGTH(iommu_option, IOMMU_OPTION, val64); TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS, __reserved); + TEST_LENGTH(iommu_ioas_map_file, IOMMU_IOAS_MAP_FILE, iova); #undef TEST_LENGTH } @@ -1372,6 +1377,7 @@ FIXTURE_VARIANT(iommufd_mock_domain) { unsigned int mock_domains; bool hugepages; + bool file; }; FIXTURE_SETUP(iommufd_mock_domain) @@ -1410,26 +1416,45 @@ FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain) { .mock_domains = 1, .hugepages = false, + .file = false, }; FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains) { .mock_domains = 2, .hugepages = false, + .file = false, }; FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_hugepage) { .mock_domains = 1, .hugepages = true, + .file = false, }; FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains_hugepage) { .mock_domains = 2, .hugepages = true, + .file = false, +}; + +FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_file) +{ + .mock_domains = 1, + .hugepages = false, + .file = true, +}; + +FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_file_hugepage) +{ + .mock_domains = 1, + .hugepages = true, + .file = true, }; + /* Have the kernel check that the user pages made it to the iommu_domain */ #define check_mock_iova(_ptr, _iova, _length) \ ({ \ @@ -1455,7 +1480,10 @@ FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains_hugepage) } \ }) -TEST_F(iommufd_mock_domain, basic) +static void +test_basic_mmap(struct __test_metadata *_metadata, + struct _test_data_iommufd_mock_domain *self, + const struct _fixture_variant_iommufd_mock_domain *variant) { size_t buf_size = self->mmap_buf_size; uint8_t *buf; @@ -1478,6 +1506,40 @@ TEST_F(iommufd_mock_domain, basic) test_err_ioctl_ioas_map(EFAULT, buf, buf_size, &iova); } +static void +test_basic_file(struct __test_metadata *_metadata, + struct _test_data_iommufd_mock_domain *self, + const struct _fixture_variant_iommufd_mock_domain *variant) +{ + size_t buf_size = self->mmap_buf_size; + uint8_t *buf; + __u64 iova; + int mfd_tmp; + int prot = PROT_READ | PROT_WRITE; + + /* Simple one page map */ + test_ioctl_ioas_map_file(mfd, 0, PAGE_SIZE, &iova); + check_mock_iova(mfd_buffer, iova, PAGE_SIZE); + + buf = memfd_mmap(buf_size, prot, MAP_SHARED, &mfd_tmp); + ASSERT_NE(MAP_FAILED, buf); + + test_err_ioctl_ioas_map_file(EINVAL, mfd_tmp, 0, buf_size + 
1, &iova); + + ASSERT_EQ(0, ftruncate(mfd_tmp, 0)); + test_err_ioctl_ioas_map_file(EINVAL, mfd_tmp, 0, buf_size, &iova); + + close(mfd_tmp); +} + +TEST_F(iommufd_mock_domain, basic) +{ + if (variant->file) + test_basic_file(_metadata, self, variant); + else + test_basic_mmap(_metadata, self, variant); +} + TEST_F(iommufd_mock_domain, ro_unshare) { uint8_t *buf; @@ -1513,9 +1575,13 @@ TEST_F(iommufd_mock_domain, all_aligns) unsigned int start; unsigned int end; uint8_t *buf; + int prot = PROT_READ | PROT_WRITE; + int mfd; - buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1, - 0); + if (variant->file) + buf = memfd_mmap(buf_size, prot, MAP_SHARED, &mfd); + else + buf = mmap(0, buf_size, prot, self->mmap_flags, -1, 0); ASSERT_NE(MAP_FAILED, buf); check_refs(buf, buf_size, 0); @@ -1532,7 +1598,12 @@ TEST_F(iommufd_mock_domain, all_aligns) size_t length = end - start; __u64 iova; - test_ioctl_ioas_map(buf + start, length, &iova); + if (variant->file) { + test_ioctl_ioas_map_file(mfd, start, length, + &iova); + } else { + test_ioctl_ioas_map(buf + start, length, &iova); + } check_mock_iova(buf + start, iova, length); check_refs(buf + start / PAGE_SIZE * PAGE_SIZE, end / PAGE_SIZE * PAGE_SIZE - @@ -1544,6 +1615,8 @@ TEST_F(iommufd_mock_domain, all_aligns) } check_refs(buf, buf_size, 0); ASSERT_EQ(0, munmap(buf, buf_size)); + if (variant->file) + close(mfd); } TEST_F(iommufd_mock_domain, all_aligns_copy) @@ -1554,9 +1627,13 @@ TEST_F(iommufd_mock_domain, all_aligns_copy) unsigned int start; unsigned int end; uint8_t *buf; + int prot = PROT_READ | PROT_WRITE; + int mfd; - buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1, - 0); + if (variant->file) + buf = memfd_mmap(buf_size, prot, MAP_SHARED, &mfd); + else + buf = mmap(0, buf_size, prot, self->mmap_flags, -1, 0); ASSERT_NE(MAP_FAILED, buf); check_refs(buf, buf_size, 0); @@ -1575,7 +1652,12 @@ TEST_F(iommufd_mock_domain, all_aligns_copy) uint32_t mock_stdev_id; __u64 iova; - test_ioctl_ioas_map(buf + start, length, &iova); + if (variant->file) { + test_ioctl_ioas_map_file(mfd, start, length, + &iova); + } else { + test_ioctl_ioas_map(buf + start, length, &iova); + } /* Add and destroy a domain while the area exists */ old_id = self->hwpt_ids[1]; @@ -1596,15 +1678,18 @@ TEST_F(iommufd_mock_domain, all_aligns_copy) } check_refs(buf, buf_size, 0); ASSERT_EQ(0, munmap(buf, buf_size)); + if (variant->file) + close(mfd); } TEST_F(iommufd_mock_domain, user_copy) { + void *buf = variant->file ? 
mfd_buffer : buffer; struct iommu_test_cmd access_cmd = { .size = sizeof(access_cmd), .op = IOMMU_TEST_OP_ACCESS_PAGES, .access_pages = { .length = BUFFER_SIZE, - .uptr = (uintptr_t)buffer }, + .uptr = (uintptr_t)buf }, }; struct iommu_ioas_copy copy_cmd = { .size = sizeof(copy_cmd), @@ -1623,9 +1708,13 @@ TEST_F(iommufd_mock_domain, user_copy) /* Pin the pages in an IOAS with no domains then copy to an IOAS with domains */ test_ioctl_ioas_alloc(&ioas_id); - test_ioctl_ioas_map_id(ioas_id, buffer, BUFFER_SIZE, - ©_cmd.src_iova); - + if (variant->file) { + test_ioctl_ioas_map_id_file(ioas_id, mfd, 0, BUFFER_SIZE, + ©_cmd.src_iova); + } else { + test_ioctl_ioas_map_id(ioas_id, buf, BUFFER_SIZE, + ©_cmd.src_iova); + } test_cmd_create_access(ioas_id, &access_cmd.id, MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES); @@ -1635,12 +1724,17 @@ TEST_F(iommufd_mock_domain, user_copy) &access_cmd)); copy_cmd.src_ioas_id = ioas_id; ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, ©_cmd)); - check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE); + check_mock_iova(buf, MOCK_APERTURE_START, BUFFER_SIZE); /* Now replace the ioas with a new one */ test_ioctl_ioas_alloc(&new_ioas_id); - test_ioctl_ioas_map_id(new_ioas_id, buffer, BUFFER_SIZE, - ©_cmd.src_iova); + if (variant->file) { + test_ioctl_ioas_map_id_file(new_ioas_id, mfd, 0, BUFFER_SIZE, + ©_cmd.src_iova); + } else { + test_ioctl_ioas_map_id(new_ioas_id, buf, BUFFER_SIZE, + ©_cmd.src_iova); + } test_cmd_access_replace_ioas(access_cmd.id, new_ioas_id); /* Destroy the old ioas and cleanup copied mapping */ @@ -1654,7 +1748,7 @@ TEST_F(iommufd_mock_domain, user_copy) &access_cmd)); copy_cmd.src_ioas_id = new_ioas_id; ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, ©_cmd)); - check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE); + check_mock_iova(buf, MOCK_APERTURE_START, BUFFER_SIZE); test_cmd_destroy_access_pages( access_cmd.id, access_cmd.access_pages.out_access_pages_id); diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index c5d5e69452b0..2d7d01638be8 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -47,6 +47,9 @@ static __attribute__((constructor)) void setup_buffer(void) buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + + mfd_buffer = memfd_mmap(BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, + &mfd); } /* @@ -331,6 +334,42 @@ TEST_FAIL_NTH(basic_fail_nth, map_domain) return 0; } +/* iopt_area_fill_domains() and iopt_area_fill_domain() */ +TEST_FAIL_NTH(basic_fail_nth, map_file_domain) +{ + uint32_t ioas_id; + __u32 stdev_id; + __u32 hwpt_id; + __u64 iova; + + self->fd = open("/dev/iommu", O_RDWR); + if (self->fd == -1) + return -1; + + if (_test_ioctl_ioas_alloc(self->fd, &ioas_id)) + return -1; + + if (_test_ioctl_set_temp_memory_limit(self->fd, 32)) + return -1; + + fail_nth_enable(); + + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL)) + return -1; + + if (_test_ioctl_ioas_map_file(self->fd, ioas_id, mfd, 0, 262144, &iova, + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -1; + + if (_test_ioctl_destroy(self->fd, stdev_id)) + return -1; + + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL)) + return -1; + return 0; +} + TEST_FAIL_NTH(basic_fail_nth, map_two_domains) { uint32_t ioas_id; diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 
40f6f14ce136..6a11c26370f3 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -40,12 +40,28 @@ static inline bool test_bit(unsigned int nr, unsigned long *addr) static void *buffer; static unsigned long BUFFER_SIZE; +static void *mfd_buffer; +static int mfd; + static unsigned long PAGE_SIZE; #define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) #define offsetofend(TYPE, MEMBER) \ (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) +static inline void *memfd_mmap(size_t length, int prot, int flags, int *mfd_p) +{ + int mfd_flags = (flags & MAP_HUGETLB) ? MFD_HUGETLB : 0; + int mfd = memfd_create("buffer", mfd_flags); + + if (mfd <= 0) + return MAP_FAILED; + if (ftruncate(mfd, length)) + return MAP_FAILED; + *mfd_p = mfd; + return mmap(0, length, prot, flags, mfd, 0); +} + /* * Have the kernel check the refcount on pages. I don't know why a freshly * mmap'd anon non-compound page starts out with a ref of 3 @@ -589,6 +605,47 @@ static int _test_ioctl_ioas_unmap(int fd, unsigned int ioas_id, uint64_t iova, EXPECT_ERRNO(_errno, _test_ioctl_ioas_unmap(self->fd, self->ioas_id, \ iova, length, NULL)) +static int _test_ioctl_ioas_map_file(int fd, unsigned int ioas_id, int mfd, + size_t start, size_t length, __u64 *iova, + unsigned int flags) +{ + struct iommu_ioas_map_file cmd = { + .size = sizeof(cmd), + .flags = flags, + .ioas_id = ioas_id, + .fd = mfd, + .start = start, + .length = length, + }; + int ret; + + if (flags & IOMMU_IOAS_MAP_FIXED_IOVA) + cmd.iova = *iova; + + ret = ioctl(fd, IOMMU_IOAS_MAP_FILE, &cmd); + *iova = cmd.iova; + return ret; +} + +#define test_ioctl_ioas_map_file(mfd, start, length, iova_p) \ + ASSERT_EQ(0, \ + _test_ioctl_ioas_map_file( \ + self->fd, self->ioas_id, mfd, start, length, iova_p, \ + IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)) + +#define test_err_ioctl_ioas_map_file(_errno, mfd, start, length, iova_p) \ + EXPECT_ERRNO( \ + _errno, \ + _test_ioctl_ioas_map_file( \ + self->fd, self->ioas_id, mfd, start, length, iova_p, \ + IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)) + +#define test_ioctl_ioas_map_id_file(ioas_id, mfd, start, length, iova_p) \ + ASSERT_EQ(0, \ + _test_ioctl_ioas_map_file( \ + self->fd, ioas_id, mfd, start, length, iova_p, \ + IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)) + static int _test_ioctl_set_temp_memory_limit(int fd, unsigned int limit) { struct iommu_test_cmd memlimit_cmd = { -- Gitee From 4b32b40871619a64490a138341984a0cec9b607a Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 16 Oct 2024 08:49:58 +0000 Subject: [PATCH 032/278] iommu/amd: Do not try copy old DTE resume path ANBZ: #13617 commit 3f6eeada6930056c38f20964120ac402cf0030b9 upstream. In suspend/resume path, no need to copy old DTE (early_enable_iommus()). Just need to reload IOMMU hardware. This is the side effect of commit 3ac3e5ee5ed5 ("iommu/amd: Copy old trans table from old kernel") which changed early_enable_iommus() but missed to fix enable_iommus(). Resume path continue to work as 'amd_iommu_pre_enabled' is set to false and copy_device_table() will fail. It will just re-loaded IOMMU. Hence I think we don't need to backport this to stable tree. 
Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20241016084958.99727-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/init.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 22af9ba0c980..c68ecbad2913 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -2881,11 +2881,6 @@ static void enable_iommus_vapic(void) #endif } -static void enable_iommus(void) -{ - early_enable_iommus(); -} - static void disable_iommus(void) { struct amd_iommu *iommu; @@ -2912,7 +2907,8 @@ static void amd_iommu_resume(void) iommu_apply_resume_quirks(iommu); /* re-load the hardware */ - enable_iommus(); + for_each_iommu(iommu) + early_enable_iommu(iommu); amd_iommu_enable_interrupts(); } -- Gitee From 18adee5e046d73ca2573421ea6b1677c99a246ed Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 18 Oct 2024 14:17:25 +0200 Subject: [PATCH 033/278] iommu/sysfs: constify the class struct ANBZ: #13617 commit a2f528e91458b3c3360f21a34381b91883ff28e1 upstream. All functions that take the class address as argument expect a const pointer so we can make the iommu class constant. Signed-off-by: Bartosz Golaszewski Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241018121725.61128-1-brgl@bgdev.pl Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu-sysfs.c b/drivers/iommu/iommu-sysfs.c index cbe378c34ba3..170022c09536 100644 --- a/drivers/iommu/iommu-sysfs.c +++ b/drivers/iommu/iommu-sysfs.c @@ -34,7 +34,7 @@ static void release_device(struct device *dev) kfree(dev); } -static struct class iommu_class = { +static const struct class iommu_class = { .name = "iommu", .dev_release = release_device, .dev_groups = dev_groups, -- Gitee From 1c9f94e734ace8838288736b33ff735cde023e3e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 21 Oct 2024 16:08:46 -0700 Subject: [PATCH 034/278] iommu/tegra241-cmdqv: Staticize cmdqv_debugfs_dir ANBZ: #13617 commit 89edbe88db2857880b08ce363a2695eec657f51b upstream. Fix a sparse warning. 
Fixes: 918eb5c856f6 ("iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410172003.bRQEReTc-lkp@intel.com/ Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021230847.811218-1-nicolinc@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index fcd13d301fff..a243c543598c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -800,7 +800,7 @@ static int tegra241_cmdqv_init_structures(struct arm_smmu_device *smmu) return 0; } -struct dentry *cmdqv_debugfs_dir; +static struct dentry *cmdqv_debugfs_dir; static struct arm_smmu_device * __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, -- Gitee From 7f0d768a7bc4db549f2a20556eee93181a9f0a99 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 18 Oct 2024 14:12:19 -0300 Subject: [PATCH 035/278] iommu/amd: Fix corruption when mapping large pages from 0 ANBZ: #13617 commit e3a682eaf2af51a83f5313145ef592ce50fa787f upstream. If a page is mapped starting at 0 that is equal to or larger than can fit in the current mode (number of table levels) it results in corrupting the mapping as the following logic assumes the mode is correct for the page size being requested. There are two issues here, the check if the address fits within the table uses the start address, it should use the last address to ensure that last byte of the mapping fits within the current table mode. The second is if the mapping is exactly the size of the full page table it has to add another level to instead hold a single IOPTE for the large size. Since both corner cases require a 0 IOVA to be hit and doesn't start until a page size of 2^48 it is unlikely to ever hit in a real system. 
Reported-by: Alejandro Jimenez Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-27ab08d646a1+29-amd_0map_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/io_pgtable.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index 804b788f3f16..f3399087859f 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -118,6 +118,7 @@ static void free_sub_pt(u64 *root, int mode, struct list_head *freelist) */ static bool increase_address_space(struct amd_io_pgtable *pgtable, unsigned long address, + unsigned int page_size_level, gfp_t gfp) { struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; @@ -133,7 +134,8 @@ static bool increase_address_space(struct amd_io_pgtable *pgtable, spin_lock_irqsave(&domain->lock, flags); - if (address <= PM_LEVEL_SIZE(pgtable->mode)) + if (address <= PM_LEVEL_SIZE(pgtable->mode) && + pgtable->mode - 1 >= page_size_level) goto out; ret = false; @@ -163,18 +165,21 @@ static u64 *alloc_pte(struct amd_io_pgtable *pgtable, gfp_t gfp, bool *updated) { + unsigned long last_addr = address + (page_size - 1); struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; int level, end_lvl; u64 *pte, *page; BUG_ON(!is_power_of_2(page_size)); - while (address > PM_LEVEL_SIZE(pgtable->mode)) { + while (last_addr > PM_LEVEL_SIZE(pgtable->mode) || + pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) { /* * Return an error if there is no memory to update the * page-table. */ - if (!increase_address_space(pgtable, address, gfp)) + if (!increase_address_space(pgtable, last_addr, + PAGE_SIZE_LEVEL(page_size), gfp)) return NULL; } -- Gitee From 3b72ca1eda43bd39ff48ffd3c74854f8f42da3a3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 9 Oct 2024 12:11:44 +0800 Subject: [PATCH 036/278] remoteproc: Use iommu_paging_domain_alloc() ANBZ: #13617 commit fae93d8da6b94ca1a6a497e8a00ad87324188cf7 upstream. An iommu domain is allocated in rproc_enable_iommu() and is attached to rproc->dev.parent in the same function. Use iommu_paging_domain_alloc() to make it explicit. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Mathieu Poirier Acked-by: Beleswar Padhi Link: https://lore.kernel.org/r/20241009041147.28391-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/remoteproc/remoteproc_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 2d4ae3b5af86..1f3f4c4a62c7 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -108,10 +108,10 @@ static int rproc_enable_iommu(struct rproc *rproc) return 0; } - domain = iommu_domain_alloc(dev->bus); - if (!domain) { + domain = iommu_paging_domain_alloc(dev); + if (IS_ERR(domain)) { dev_err(dev, "can't alloc iommu domain\n"); - return -ENOMEM; + return PTR_ERR(domain); } iommu_set_fault_handler(domain, rproc_iommu_fault, rproc); -- Gitee From 6aaa6395009bc4c898fb58fcc93424b3ebe871b9 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 9 Oct 2024 12:11:45 +0800 Subject: [PATCH 037/278] media: nvidia: tegra: Use iommu_paging_domain_alloc() ANBZ: #13617 commit 0c069019f33dd20257c6e6dbf5944b73f22de56a upstream. An iommu domain is allocated in tegra_vde_iommu_init() and is attached to vde->dev. Use iommu_paging_domain_alloc() to make it explicit. 
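For reference, the conversion applied here (and repeated in the following conversion patches) boils down to the pattern below; names are abbreviated, and note that the failure convention changes from a NULL return to an ERR_PTR:

  /* before */
  domain = iommu_domain_alloc(dev->bus);
  if (!domain)
          return -ENOMEM;

  /* after */
  domain = iommu_paging_domain_alloc(dev);
  if (IS_ERR(domain))
          return PTR_ERR(domain);
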
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Acked-by: Thierry Reding Link: https://lore.kernel.org/r/20241009041147.28391-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/media/platform/nvidia/tegra-vde/iommu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/media/platform/nvidia/tegra-vde/iommu.c b/drivers/media/platform/nvidia/tegra-vde/iommu.c index 5521ed3e465f..b1d9d841d944 100644 --- a/drivers/media/platform/nvidia/tegra-vde/iommu.c +++ b/drivers/media/platform/nvidia/tegra-vde/iommu.c @@ -78,9 +78,10 @@ int tegra_vde_iommu_init(struct tegra_vde *vde) arm_iommu_release_mapping(mapping); } #endif - vde->domain = iommu_domain_alloc(&platform_bus_type); - if (!vde->domain) { - err = -ENOMEM; + vde->domain = iommu_paging_domain_alloc(dev); + if (IS_ERR(vde->domain)) { + err = PTR_ERR(vde->domain); + vde->domain = NULL; goto put_group; } -- Gitee From 1b22bcd2b21b8e705f413f54a0c2e0afd701b835 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 9 Oct 2024 12:11:46 +0800 Subject: [PATCH 038/278] drm/nouveau/tegra: Use iommu_paging_domain_alloc() ANBZ: #13617 commit ba1057ab5d01091a6fce428c0f42a57a655aabb2 upstream. In nvkm_device_tegra_probe_iommu(), a paging domain is allocated for @dev and attached to it on success. Use iommu_paging_domain_alloc() to make it explicit. Signed-off-by: Lu Baolu Acked-by: Thierry Reding Reviewed-by: Lyude Paul Link: https://lore.kernel.org/r/20241009041147.28391-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c index 87caa4a72921..763c4c2925f9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c @@ -120,8 +120,8 @@ nvkm_device_tegra_probe_iommu(struct nvkm_device_tegra *tdev) mutex_init(&tdev->iommu.mutex); if (device_iommu_mapped(dev)) { - tdev->iommu.domain = iommu_domain_alloc(&platform_bus_type); - if (!tdev->iommu.domain) + tdev->iommu.domain = iommu_paging_domain_alloc(dev); + if (IS_ERR(tdev->iommu.domain)) goto error; /* -- Gitee From c97356a7a3095b51bf164d3455d4b55e61c6e872 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Jun 2024 16:55:48 +0800 Subject: [PATCH 039/278] RDMA/usnic: Use iommu_paging_domain_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #13617 commit 3b10f25704beefd0534f89db8323398c89b720e1 upstream. usnic_uiom_alloc_pd() allocates a paging domain for a given device. In this case, iommu_domain_alloc(dev->bus) is equivalent to  iommu_paging_domain_alloc(dev). Replace it as iommu_domain_alloc() has been deprecated. 
Signed-off-by: Lu Baolu Acked-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240610085555.88197-15-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/infiniband/hw/usnic/usnic_uiom.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 84e0f41e7dfa..f948b76f984d 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -443,11 +443,11 @@ struct usnic_uiom_pd *usnic_uiom_alloc_pd(struct device *dev) if (!pd) return ERR_PTR(-ENOMEM); - pd->domain = domain = iommu_domain_alloc(dev->bus); - if (!domain) { + pd->domain = domain = iommu_paging_domain_alloc(dev); + if (IS_ERR(domain)) { usnic_err("Failed to allocate IOMMU domain"); kfree(pd); - return ERR_PTR(-ENOMEM); + return ERR_CAST(domain); } iommu_set_fault_handler(pd->domain, usnic_uiom_dma_fault, NULL); -- Gitee From dba61278b3c0512dda70c9776d2f2e0acd956bf6 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Aug 2024 15:21:05 +0800 Subject: [PATCH 040/278] media: venus: firmware: Use iommu_paging_domain_alloc() ANBZ: #13617 commit 7ce555252c711f7520be42abba5c7401b3b68456 upstream. An iommu domain is allocated in venus_firmware_init() and is attached to core->fw.dev in the same function. Use iommu_paging_domain_alloc() to make it explicit. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240610085555.88197-10-baolu.lu@linux.intel.com Signed-off-by: Stanimir Varbanov Signed-off-by: Hans Verkuil Signed-off-by: Jay Chen --- drivers/media/platform/qcom/venus/firmware.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/platform/qcom/venus/firmware.c b/drivers/media/platform/qcom/venus/firmware.c index fe7da2b30482..66a18830e66d 100644 --- a/drivers/media/platform/qcom/venus/firmware.c +++ b/drivers/media/platform/qcom/venus/firmware.c @@ -316,10 +316,10 @@ int venus_firmware_init(struct venus_core *core) core->fw.dev = &pdev->dev; - iommu_dom = iommu_domain_alloc(&platform_bus_type); - if (!iommu_dom) { + iommu_dom = iommu_paging_domain_alloc(core->fw.dev); + if (IS_ERR(iommu_dom)) { dev_err(core->fw.dev, "Failed to allocate iommu domain\n"); - ret = -ENOMEM; + ret = PTR_ERR(iommu_dom); goto err_unregister; } -- Gitee From 9cf30cd6d6b1510bdea9df1278137aaaf11e0010 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Aug 2024 15:25:27 +0800 Subject: [PATCH 041/278] soc: fsl: qbman: Use iommu_paging_domain_alloc() ANBZ: #13617 commit 77a1a513083188f12361c45f08bdcfa508749b76 upstream. An iommu domain is allocated in portal_set_cpu() and is attached to pcfg->dev in the same function. Use iommu_paging_domain_alloc() to make it explicit. 
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240610085555.88197-14-baolu.lu@linux.intel.com Signed-off-by: Christophe Leroy Signed-off-by: Jay Chen --- drivers/soc/fsl/qbman/qman_portal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c index e23b60618c1a..456ef5d5c199 100644 --- a/drivers/soc/fsl/qbman/qman_portal.c +++ b/drivers/soc/fsl/qbman/qman_portal.c @@ -48,9 +48,10 @@ static void portal_set_cpu(struct qm_portal_config *pcfg, int cpu) struct device *dev = pcfg->dev; int ret; - pcfg->iommu_domain = iommu_domain_alloc(&platform_bus_type); - if (!pcfg->iommu_domain) { + pcfg->iommu_domain = iommu_paging_domain_alloc(dev); + if (IS_ERR(pcfg->iommu_domain)) { dev_err(dev, "%s(): iommu_domain_alloc() failed", __func__); + pcfg->iommu_domain = NULL; goto no_iommu; } ret = fsl_pamu_configure_l1_stash(pcfg->iommu_domain, cpu); -- Gitee From eb474ba2d53de3ab4a4637f72a734b02413f4db0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Jun 2024 16:55:37 +0800 Subject: [PATCH 042/278] vfio/type1: Use iommu_paging_domain_alloc() ANBZ: #13617 commit 60ffc45017229ee8288ba139ee12c5ebf07c6f6a upstream. Replace iommu_domain_alloc() with iommu_paging_domain_alloc(). Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240610085555.88197-4-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/vfio/vfio_iommu_type1.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 814862f18bf7..0bcc4dbf28c7 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2182,7 +2182,7 @@ static int vfio_iommu_domain_alloc(struct device *dev, void *data) { struct iommu_domain **domain = data; - *domain = iommu_domain_alloc(dev->bus); + *domain = iommu_paging_domain_alloc(dev); return 1; /* Don't iterate */ } @@ -2239,11 +2239,12 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, * us a representative device for the IOMMU API call. We don't actually * want to iterate beyond the first device (if any). */ - ret = -EIO; iommu_group_for_each_dev(iommu_group, &domain->domain, vfio_iommu_domain_alloc); - if (!domain->domain) + if (IS_ERR(domain->domain)) { + ret = PTR_ERR(domain->domain); goto out_free_domain; + } if (iommu->nesting) { ret = iommu_enable_nesting(domain->domain); -- Gitee From 120623c99f0df874433c7db19f74759baf3c9c1a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Jun 2024 16:55:38 +0800 Subject: [PATCH 043/278] vhost-vdpa: Use iommu_paging_domain_alloc() ANBZ: #13617 commit 9c159f6de1aedf200ac94eace47f6082399b561c upstream. Replace iommu_domain_alloc() with iommu_paging_domain_alloc(). Signed-off-by: Lu Baolu Acked-by: Michael S. 
Tsirkin Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240610085555.88197-5-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/vhost/vdpa.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index c29a195a0175..7a8cae664219 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1217,26 +1217,24 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v) struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; struct device *dma_dev = vdpa_get_dma_dev(vdpa); - const struct bus_type *bus; int ret; /* Device want to do DMA by itself */ if (ops->set_map || ops->dma_map) return 0; - bus = dma_dev->bus; - if (!bus) - return -EFAULT; - if (!device_iommu_capable(dma_dev, IOMMU_CAP_CACHE_COHERENCY)) { dev_warn_once(&v->dev, "Failed to allocate domain, device is not IOMMU cache coherent capable\n"); return -ENOTSUPP; } - v->domain = iommu_domain_alloc(bus); - if (!v->domain) - return -EIO; + v->domain = iommu_paging_domain_alloc(dma_dev); + if (IS_ERR(v->domain)) { + ret = PTR_ERR(v->domain); + v->domain = NULL; + return ret; + } ret = iommu_attach_device(v->domain, dma_dev); if (ret) -- Gitee From 875d7359a483972f156cda7e0fc8ce2ce636d994 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Jun 2024 16:55:45 +0800 Subject: [PATCH 044/278] wifi: ath11k: Use iommu_paging_domain_alloc() ANBZ: #13617 commit ef50d41fbf1c95b07f636f4e268c53488a39284b upstream. An iommu domain is allocated in ath11k_ahb_fw_resources_init() and is attached to ab_ahb->fw.dev in the same function. Use iommu_paging_domain_alloc() to make it explicit. Signed-off-by: Lu Baolu Acked-by: Jeff Johnson Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240610085555.88197-12-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/net/wireless/ath/ath11k/ahb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/ahb.c b/drivers/net/wireless/ath/ath11k/ahb.c index ef11c138bf30..2451d0d9ccb9 100644 --- a/drivers/net/wireless/ath/ath11k/ahb.c +++ b/drivers/net/wireless/ath/ath11k/ahb.c @@ -995,10 +995,10 @@ static int ath11k_ahb_fw_resources_init(struct ath11k_base *ab) ab_ahb->fw.dev = &pdev->dev; - iommu_dom = iommu_domain_alloc(&platform_bus_type); - if (!iommu_dom) { + iommu_dom = iommu_paging_domain_alloc(ab_ahb->fw.dev); + if (IS_ERR(iommu_dom)) { ath11k_err(ab, "failed to allocate iommu domain\n"); - ret = -ENOMEM; + ret = PTR_ERR(iommu_dom); goto err_unregister; } -- Gitee From 3ea135747ce5b4b42a77bbf1f80455f5c3ec99bf Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Jun 2024 16:55:44 +0800 Subject: [PATCH 045/278] wifi: ath10k: Use iommu_paging_domain_alloc() ANBZ: #13617 commit d5b7485588dffb39c5687e965623124ab7ebcd51 upstream. An iommu domain is allocated in ath10k_fw_init() and is attached to ar_snoc->fw.dev in the same function. Use iommu_paging_domain_alloc() to make it explicit. 
Signed-off-by: Lu Baolu Acked-by: Jeff Johnson Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240610085555.88197-11-baolu.lu@linux.intel.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/net/wireless/ath/ath10k/snoc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c index 1d06d4125992..63e7c15a736e 100644 --- a/drivers/net/wireless/ath/ath10k/snoc.c +++ b/drivers/net/wireless/ath/ath10k/snoc.c @@ -1634,10 +1634,10 @@ static int ath10k_fw_init(struct ath10k *ar) ar_snoc->fw.dev = &pdev->dev; - iommu_dom = iommu_domain_alloc(&platform_bus_type); - if (!iommu_dom) { + iommu_dom = iommu_paging_domain_alloc(ar_snoc->fw.dev); + if (IS_ERR(iommu_dom)) { ath10k_err(ar, "failed to allocate iommu domain\n"); - ret = -ENOMEM; + ret = PTR_ERR(iommu_dom); goto err_unregister; } -- Gitee From c0ef1dfd2c436b6b77626c381b76a04e42389aa1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 Sep 2024 13:40:10 +0100 Subject: [PATCH 046/278] ARM: 9418/1: dma-mapping: Use iommu_paging_domain_alloc() ANBZ: #13617 commit e02fcd73779cb7dfd4f31064c6145ca737811f6c upstream. Since arm_iommu_create_mapping() now accepts the device, let's replace iommu_domain_alloc() with iommu_paging_domain_alloc() to retire the former. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Acked-by: Michael S. Tsirkin Acked-by: Jeff Johnson Signed-off-by: Jason Gunthorpe Signed-off-by: Russell King (Oracle) Signed-off-by: Jay Chen --- arch/arm/mm/dma-mapping.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 42c035b356d6..0607f60da58e 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -1588,9 +1588,11 @@ arm_iommu_create_mapping(struct device *dev, dma_addr_t base, u64 size) spin_lock_init(&mapping->lock); - mapping->domain = iommu_domain_alloc(dev->bus); - if (!mapping->domain) + mapping->domain = iommu_paging_domain_alloc(dev); + if (IS_ERR(mapping->domain)) { + err = PTR_ERR(mapping->domain); goto err4; + } kref_init(&mapping->kref); return mapping; -- Gitee From 558aa7860ec89894173bbc9fa0948e5d2283d0c8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 2 Sep 2024 09:46:59 +0800 Subject: [PATCH 047/278] drm/rockchip: Use iommu_paging_domain_alloc() ANBZ: #13617 commit d8c07bee1e636db7ee6ab64b958f7bfdd9ff8c1e upstream. Commit <421be3ee36a4> ("drm/rockchip: Refactor IOMMU initialisation") has refactored rockchip_drm_init_iommu() to pass a device that the domain is allocated for. Replace iommu_domain_alloc() with iommu_paging_domain_alloc() to retire the former. 
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Acked-by: Andy Yan Signed-off-by: Lyude Paul Link: https://patchwork.freedesktop.org/patch/msgid/20240902014700.66095-3-baolu.lu@linux.intel.com Signed-off-by: Jay Chen --- drivers/gpu/drm/rockchip/rockchip_drm_drv.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_drv.c b/drivers/gpu/drm/rockchip/rockchip_drm_drv.c index ab55d7132550..52126ffb9280 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_drv.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_drv.c @@ -103,13 +103,17 @@ static int rockchip_drm_init_iommu(struct drm_device *drm_dev) struct rockchip_drm_private *private = drm_dev->dev_private; struct iommu_domain_geometry *geometry; u64 start, end; + int ret; if (IS_ERR_OR_NULL(private->iommu_dev)) return 0; - private->domain = iommu_domain_alloc(private->iommu_dev->bus); - if (!private->domain) - return -ENOMEM; + private->domain = iommu_paging_domain_alloc(private->iommu_dev); + if (IS_ERR(private->domain)) { + ret = PTR_ERR(private->domain); + private->domain = NULL; + return ret; + } geometry = &private->domain->geometry; start = geometry->aperture_start; -- Gitee From 9314db78563fd81fbe5a5e60a00a85a773ff30b8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 2 Sep 2024 09:47:00 +0800 Subject: [PATCH 048/278] drm/tegra: Use iommu_paging_domain_alloc() ANBZ: #13617 commit 45c690aea8ee5b7d012cd593bd288540a4bfdbf0 upstream. Commit <17de3f5fdd35> ("iommu: Retire bus ops") removes iommu ops from the bus structure. The iommu subsystem no longer relies on bus for operations. So iommu_domain_alloc() interface is no longer relevant. Replace iommu_domain_alloc() with iommu_paging_domain_alloc() which takes the physical device from which the host1x_device virtual device was instantiated. This physical device is a common parent to all physical devices that are part of the virtual device. 
Suggested-by: Thierry Reding Signed-off-by: Lu Baolu Signed-off-by: Lyude Paul Link: https://patchwork.freedesktop.org/patch/msgid/20240902014700.66095-4-baolu.lu@linux.intel.com Signed-off-by: Jay Chen --- drivers/gpu/drm/tegra/drm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c index 373bcd79257e..9dcf95640cf7 100644 --- a/drivers/gpu/drm/tegra/drm.c +++ b/drivers/gpu/drm/tegra/drm.c @@ -1134,6 +1134,7 @@ static bool host1x_drm_wants_iommu(struct host1x_device *dev) static int host1x_drm_probe(struct host1x_device *dev) { + struct device *dma_dev = dev->dev.parent; struct tegra_drm *tegra; struct drm_device *drm; int err; @@ -1148,8 +1149,8 @@ static int host1x_drm_probe(struct host1x_device *dev) goto put; } - if (host1x_drm_wants_iommu(dev) && iommu_present(&platform_bus_type)) { - tegra->domain = iommu_domain_alloc(&platform_bus_type); + if (host1x_drm_wants_iommu(dev) && device_iommu_mapped(dma_dev)) { + tegra->domain = iommu_paging_domain_alloc(dma_dev); if (!tegra->domain) { err = -ENOMEM; goto free; -- Gitee From 5c5cd07b51a72a60aa3ed773325b169b0a77cce6 Mon Sep 17 00:00:00 2001 From: Jay Chen Date: Tue, 26 Aug 2025 19:04:12 +0800 Subject: [PATCH 049/278] anolis: hygon: Use iommu_paging_domain_alloc ANBZ: #13617 Signed-off-by: Jay Chen --- drivers/crypto/ccp/hygon/hct.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/drivers/crypto/ccp/hygon/hct.c b/drivers/crypto/ccp/hygon/hct.c index 6f6893914a19..662f8c1e7426 100644 --- a/drivers/crypto/ccp/hygon/hct.c +++ b/drivers/crypto/ccp/hygon/hct.c @@ -414,6 +414,17 @@ static int hct_iommu_alloc(struct pci_dev *pdev) if (i == MCCP_DEV_MAX) return -EINVAL; + if (!hct_data.domain) { + hct_data.domain = iommu_paging_domain_alloc(&pdev->dev); + if (IS_ERR(hct_data.domain)) + return -ENOMEM; + hct_data.prot = IOMMU_READ | IOMMU_WRITE; + /* When the pasid value is 0 or 1, the address space overlaps with the host, + * so the pasid needs to start from 2. + */ + hct_data.pasids[0] |= MCCP_PASID_MASK_BIT; + } + ret = iommu_attach_device(hct_data.domain, &pdev->dev); if (ret) { mutex_lock(&hct_data.lock); @@ -2049,7 +2060,6 @@ static struct miscdevice hct_misc = { static int hct_share_init(void) { int i; - int ret; memset(&hct_data, 0x00, sizeof(hct_data)); mutex_init(&hct_data.lock); @@ -2057,22 +2067,7 @@ static int hct_share_init(void) for (i = 0; i < MCCP_DEV_MAX; i++) mutex_init(&hct_data.iommu[i].lock); - ret = misc_register(&hct_misc); - if (!ret) { - hct_data.domain = iommu_domain_alloc(&pci_bus_type); - if (!hct_data.domain) { - pr_err("iommu domain alloc failed\n"); - misc_deregister(&hct_misc); - return -ENOMEM; - } - hct_data.prot = IOMMU_READ | IOMMU_WRITE; - } - - /* When the pasid value is 0 or 1, the address space overlaps with the host, - * so the pasid needs to start from 2. - */ - hct_data.pasids[0] |= MCCP_PASID_MASK_BIT; - return ret; + return misc_register(&hct_misc); } static void hct_share_exit(void) -- Gitee From 96a15aa4bf7c8df42cc9a0f0dc720666efff5b2a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 9 Oct 2024 12:11:47 +0800 Subject: [PATCH 050/278] iommu: Remove iommu_domain_alloc() ANBZ: #13617 commit f6440fcc9c7b7aaad2e52830ab83fa71990f76ff upstream. The iommu_domain_alloc() interface is no longer used in the tree anymore. Remove it to avoid dead code. There is increasing demand for supporting multiple IOMMU drivers, and this is the last bus-based thing standing in the way of that. 
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241009041147.28391-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 36 ------------------------------------ include/linux/iommu.h | 6 ------ 2 files changed, 42 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 680a6efb9907..0886d6481982 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2005,42 +2005,6 @@ __iommu_group_domain_alloc(struct iommu_group *group, unsigned int type) return __iommu_domain_alloc(dev_iommu_ops(dev), dev, type); } -static int __iommu_domain_alloc_dev(struct device *dev, void *data) -{ - const struct iommu_ops **ops = data; - - if (!dev_has_iommu(dev)) - return 0; - - if (WARN_ONCE(*ops && *ops != dev_iommu_ops(dev), - "Multiple IOMMU drivers present for bus %s, which the public IOMMU API can't fully support yet. You will still need to disable one or more for this to work, sorry!\n", - dev_bus_name(dev))) - return -EBUSY; - - *ops = dev_iommu_ops(dev); - return 0; -} - -/* - * The iommu ops in bus has been retired. Do not use this interface in - * new drivers. - */ -struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus) -{ - const struct iommu_ops *ops = NULL; - int err = bus_for_each_dev(bus, NULL, &ops, __iommu_domain_alloc_dev); - struct iommu_domain *domain; - - if (err || !ops) - return NULL; - - domain = __iommu_domain_alloc(ops, NULL, IOMMU_DOMAIN_UNMANAGED); - if (IS_ERR(domain)) - return NULL; - return domain; -} -EXPORT_SYMBOL_GPL(iommu_domain_alloc); - /** * iommu_paging_domain_alloc() - Allocate a paging domain * @dev: device for which the domain is allocated diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1a7fa47dfab0..93275f759642 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -805,7 +805,6 @@ extern int bus_iommu_probe(const struct bus_type *bus); extern bool iommu_present(const struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); extern bool iommu_group_has_isolated_msi(struct iommu_group *group); -extern struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus); struct iommu_domain *iommu_paging_domain_alloc(struct device *dev); extern void iommu_domain_free(struct iommu_domain *domain); extern int iommu_attach_device(struct iommu_domain *domain, @@ -1123,11 +1122,6 @@ static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap) return false; } -static inline struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus) -{ - return NULL; -} - static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) { return ERR_PTR(-ENODEV); -- Gitee From a1a72d9ee06d8f09c207ca285abd94c146aa412a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 28 Oct 2024 09:37:59 +0000 Subject: [PATCH 051/278] iommu: Refactor __iommu_domain_alloc() ANBZ: #13617 commit 541b967f5a9163b5c36b68599f1141114fbb26fa upstream. Following patch will introduce iommu_paging_domain_alloc_flags() API. Hence move domain init code to separate function so that it can be reused. Also move iommu_get_dma_cookie() setup iommu_setup_default_domain() as it is required in DMA API mode only. 
Signed-off-by: Jason Gunthorpe [Split the patch and added description - Vasant] Signed-off-by: Vasant Hegde Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20241028093810.5901-2-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 46 +++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 0886d6481982..3c0446c2863f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1945,6 +1945,22 @@ void iommu_set_fault_handler(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_set_fault_handler); +static void iommu_domain_init(struct iommu_domain *domain, unsigned int type, + const struct iommu_ops *ops) +{ + domain->type = type; + domain->owner = ops; + if (!domain->ops) + domain->ops = ops->default_domain_ops; + + /* + * If not already set, assume all sizes by default; the driver + * may override this later + */ + if (!domain->pgsize_bitmap) + domain->pgsize_bitmap = ops->pgsize_bitmap; +} + static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, struct device *dev, unsigned int type) @@ -1973,27 +1989,7 @@ static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, if (!domain) return ERR_PTR(-ENOMEM); - domain->type = type; - domain->owner = ops; - /* - * If not already set, assume all sizes by default; the driver - * may override this later - */ - if (!domain->pgsize_bitmap) - domain->pgsize_bitmap = ops->pgsize_bitmap; - - if (!domain->ops) - domain->ops = ops->default_domain_ops; - - if (iommu_is_dma_domain(domain)) { - int rc; - - rc = iommu_get_dma_cookie(domain); - if (rc) { - iommu_domain_free(domain); - return ERR_PTR(rc); - } - } + iommu_domain_init(domain, type, ops); return domain; } @@ -2942,6 +2938,14 @@ static int iommu_setup_default_domain(struct iommu_group *group, if (group->default_domain == dom) return 0; + if (iommu_is_dma_domain(dom)) { + ret = iommu_get_dma_cookie(dom); + if (ret) { + iommu_domain_free(dom); + return ret; + } + } + /* * IOMMU_RESV_DIRECT and IOMMU_RESV_DIRECT_RELAXABLE regions must be * mapped before their device is attached, in order to guarantee -- Gitee From 70edae9d068a849fc860ec5a8804b34315e4e900 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 28 Oct 2024 09:38:00 +0000 Subject: [PATCH 052/278] iommu: Introduce iommu_paging_domain_alloc_flags() ANBZ: #13617 commit 20858d4ebb423826b7a978d54cde195f51d87e20 upstream. Currently drivers calls iommu_paging_domain_alloc(dev) to get an UNMANAGED domain. This is not sufficient to support PASID with UNMANAGED domain as some HW like AMD requires certain page table type to support PASIDs. Also the domain_alloc_paging op only passes device as param for domain allocation. This is not sufficient for AMD driver to decide the right page table. Instead of extending ops->domain_alloc_paging() it was decided to enhance ops->domain_alloc_user() so that caller can pass various additional flags. Hence add iommu_paging_domain_alloc_flags() API which takes flags as parameter. Caller can pass additional parameter to indicate type of domain required, etc. iommu_paging_domain_alloc_flags() internally calls appropriate callback function to allocate a domain. 
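As an illustration of the new calling convention (the 'dev' pointer and surrounding error handling are assumed, not taken from a specific caller): plain callers pass 0 for flags, while callers that need iommufd_hwpt_alloc_flags forward them; failures are reported as ERR_PTR() values.

        /* equivalent to iommu_paging_domain_alloc(dev) */
        domain = iommu_paging_domain_alloc_flags(dev, 0);
        if (IS_ERR(domain))
                return PTR_ERR(domain);

        /*
         * Non-zero flags are only honoured through ops->domain_alloc_user();
         * drivers without that op make the call fail with -EOPNOTSUPP.
         */
        domain = iommu_paging_domain_alloc_flags(dev, IOMMU_HWPT_ALLOC_DIRTY_TRACKING);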
Signed-off-by: Jason Gunthorpe [Added description - Vasant] Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241028093810.5901-3-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 32 +++++++++++++++++++++++++++----- include/linux/iommu.h | 14 +++++++++++--- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3c0446c2863f..8992cdf098fd 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2002,20 +2002,42 @@ __iommu_group_domain_alloc(struct iommu_group *group, unsigned int type) } /** - * iommu_paging_domain_alloc() - Allocate a paging domain + * iommu_paging_domain_alloc_flags() - Allocate a paging domain * @dev: device for which the domain is allocated + * @flags: Enum of iommufd_hwpt_alloc_flags * * Allocate a paging domain which will be managed by a kernel driver. Return - * allocated domain if successful, or a ERR pointer for failure. + * allocated domain if successful, or an ERR pointer for failure. */ -struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) +struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, + unsigned int flags) { + const struct iommu_ops *ops; + struct iommu_domain *domain; + if (!dev_has_iommu(dev)) return ERR_PTR(-ENODEV); - return __iommu_domain_alloc(dev_iommu_ops(dev), dev, IOMMU_DOMAIN_UNMANAGED); + ops = dev_iommu_ops(dev); + + if (ops->domain_alloc_paging && !flags) + domain = ops->domain_alloc_paging(dev); + else if (ops->domain_alloc_user) + domain = ops->domain_alloc_user(dev, flags, NULL, NULL); + else if (ops->domain_alloc && !flags) + domain = ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED); + else + return ERR_PTR(-EOPNOTSUPP); + + if (IS_ERR(domain)) + return domain; + if (!domain) + return ERR_PTR(-ENOMEM); + + iommu_domain_init(domain, IOMMU_DOMAIN_UNMANAGED, ops); + return domain; } -EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc); +EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc_flags); void iommu_domain_free(struct iommu_domain *domain) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 93275f759642..2ff4de149dca 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -519,8 +519,6 @@ static inline int __iommu_copy_struct_from_user_array( * the caller iommu_domain_alloc() returns. * @domain_alloc_user: Allocate an iommu domain corresponding to the input * parameters as defined in include/uapi/linux/iommufd.h. - * Unlike @domain_alloc, it is called only by IOMMUFD and - * must fully initialize the new domain before return. 
* Upon success, if the @user_data is valid and the @parent * points to a kernel-managed domain, the new domain must be * IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be @@ -805,7 +803,11 @@ extern int bus_iommu_probe(const struct bus_type *bus); extern bool iommu_present(const struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); extern bool iommu_group_has_isolated_msi(struct iommu_group *group); -struct iommu_domain *iommu_paging_domain_alloc(struct device *dev); +struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, unsigned int flags); +static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) +{ + return iommu_paging_domain_alloc_flags(dev, 0); +} extern void iommu_domain_free(struct iommu_domain *domain); extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); @@ -1122,6 +1124,12 @@ static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap) return false; } +struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, + unsigned int flags) +{ + return ERR_PTR(-ENODEV); +} + static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) { return ERR_PTR(-ENODEV); -- Gitee From eee5125a3aa9cf66dcb98d6cfa26fffbd0d4bf1c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 28 Oct 2024 09:38:01 +0000 Subject: [PATCH 053/278] iommu: Add new flag to explictly request PASID capable domain ANBZ: #13617 commit b7a0855eb95f6db8ac8e17596e76f7b94a790fe6 upstream. Introduce new flag (IOMMU_HWPT_ALLOC_PASID) to domain_alloc_users() ops. If IOMMU supports PASID it will allocate domain. Otherwise return error. In error path check for -EOPNOTSUPP and try to allocate non-PASID domain so that DMA-API mode work fine for drivers which does not support PASID as well. Also modify __iommu_group_alloc_default_domain() to call iommu_paging_domain_alloc_flags() with appropriate flag when allocating paging domain. Signed-off-by: Jason Gunthorpe Co-developed-by: Vasant Hegde Signed-off-by: Vasant Hegde Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241028093810.5901-4-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 56 +++++++++++++++++++++++++++++------- include/uapi/linux/iommufd.h | 8 ++++++ 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 8992cdf098fd..745002737c1b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "dma-iommu.h" #include "iommu-priv.h" @@ -99,6 +100,9 @@ static int __iommu_attach_device(struct iommu_domain *domain, struct device *dev); static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group); +static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev, + unsigned int type, + unsigned int flags); enum { IOMMU_SET_DOMAIN_MUST_SUCCEED = 1 << 0, @@ -1600,8 +1604,30 @@ EXPORT_SYMBOL_GPL(fsl_mc_device_group); static struct iommu_domain * __iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) { + struct device *dev = iommu_group_first_dev(group); + struct iommu_domain *dom; + if (group->default_domain && group->default_domain->type == req_type) return group->default_domain; + + /* + * When allocating the DMA API domain assume that the driver is going to + * use PASID and make sure the RID's domain is PASID compatible. 
+ */ + if (req_type & __IOMMU_DOMAIN_PAGING) { + dom = __iommu_paging_domain_alloc_flags(dev, req_type, + dev->iommu->max_pasids ? IOMMU_HWPT_ALLOC_PASID : 0); + + /* + * If driver does not support PASID feature then + * try to allocate non-PASID domain + */ + if (PTR_ERR(dom) == -EOPNOTSUPP) + dom = __iommu_paging_domain_alloc_flags(dev, req_type, 0); + + return dom; + } + return __iommu_group_domain_alloc(group, req_type); } @@ -2001,16 +2027,9 @@ __iommu_group_domain_alloc(struct iommu_group *group, unsigned int type) return __iommu_domain_alloc(dev_iommu_ops(dev), dev, type); } -/** - * iommu_paging_domain_alloc_flags() - Allocate a paging domain - * @dev: device for which the domain is allocated - * @flags: Enum of iommufd_hwpt_alloc_flags - * - * Allocate a paging domain which will be managed by a kernel driver. Return - * allocated domain if successful, or an ERR pointer for failure. - */ -struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, - unsigned int flags) +static struct iommu_domain * +__iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type, + unsigned int flags) { const struct iommu_ops *ops; struct iommu_domain *domain; @@ -2034,9 +2053,24 @@ struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, if (!domain) return ERR_PTR(-ENOMEM); - iommu_domain_init(domain, IOMMU_DOMAIN_UNMANAGED, ops); + iommu_domain_init(domain, type, ops); return domain; } + +/** + * iommu_paging_domain_alloc_flags() - Allocate a paging domain + * @dev: device for which the domain is allocated + * @flags: Bitmap of iommufd_hwpt_alloc_flags + * + * Allocate a paging domain which will be managed by a kernel driver. Return + * allocated domain if successful, or an ERR pointer for failure. + */ +struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, + unsigned int flags) +{ + return __iommu_paging_domain_alloc_flags(dev, + IOMMU_DOMAIN_UNMANAGED, flags); +} EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc_flags); void iommu_domain_free(struct iommu_domain *domain) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 41b1a01e9293..dee3e61d5271 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -384,11 +384,19 @@ struct iommu_vfio_ioas { * enforced on device attachment * @IOMMU_HWPT_FAULT_ID_VALID: The fault_id field of hwpt allocation data is * valid. + * @IOMMU_HWPT_ALLOC_PASID: Requests a domain that can be used with PASID. The + * domain can be attached to any PASID on the device. + * Any domain attached to the non-PASID part of the + * device must also be flaged, otherwise attaching a + * PASID will blocked. + * If IOMMU does not support PASID it will return + * error (-EOPNOTSUPP). */ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, IOMMU_HWPT_FAULT_ID_VALID = 1 << 2, + IOMMU_HWPT_ALLOC_PASID = 1 << 3, }; /** -- Gitee From f83f6d3823c2324a84bce54a57b7ed58d761ee81 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 28 Oct 2024 09:38:02 +0000 Subject: [PATCH 054/278] iommu/arm-smmu-v3: Enhance domain_alloc_user() to allocate PASID capable domain ANBZ: #13617 commit 60c30aa6afa2397cde39f15849d808536374b330 upstream. Core layer is modified to call domain_alloc_user() to allocate PASID capable domain. Enhance arm_smmu_domain_alloc_user() to allocate PASID capable domain based on the 'flags' parameter. 
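With the flag defined in the previous patch, a caller that wants a PASID-capable paging domain can use the same fall-back pattern the core applies in __iommu_group_alloc_default_domain(); a minimal sketch with an assumed 'dev' pointer:

        dom = iommu_paging_domain_alloc_flags(dev, IOMMU_HWPT_ALLOC_PASID);
        if (PTR_ERR(dom) == -EOPNOTSUPP)
                /* driver or hardware cannot do PASID; retry without the flag */
                dom = iommu_paging_domain_alloc_flags(dev, 0);
        if (IS_ERR(dom))
                return PTR_ERR(dom);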
Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241028093810.5901-5-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 363e8a8638e3..892d72d10d2c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3084,7 +3084,8 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, const struct iommu_user_data *user_data) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); - const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_PASID; struct arm_smmu_domain *smmu_domain; int ret; @@ -3093,6 +3094,9 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, if (parent || user_data) return ERR_PTR(-EOPNOTSUPP); + if (flags & IOMMU_HWPT_ALLOC_PASID) + return arm_smmu_domain_alloc_paging(dev); + smmu_domain = arm_smmu_domain_alloc(); if (IS_ERR(smmu_domain)) return ERR_CAST(smmu_domain); -- Gitee From faaeb8880bb9178cf9241558ab08c3b67a9635f5 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 28 Oct 2024 09:38:03 +0000 Subject: [PATCH 055/278] iommu/amd: Add helper function to check GIOSUP/GTSUP ANBZ: #13617 commit b0ffdb23e94fcb30185bc618edf8608a7b0c9dfc upstream. amd_iommu_gt_ppr_supported() only checks for GTSUP. To support PASID with V2 page table we need GIOSUP as well. Hence add new helper function to check GIOSUP/GTSUP. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241028093810.5901-6-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu.h | 7 ++++++- drivers/iommu/amd/init.c | 3 +-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 6b971105ebb7..d9a95869feb8 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -118,9 +118,14 @@ static inline bool check_feature2(u64 mask) return (amd_iommu_efr2 & mask); } +static inline bool amd_iommu_v2_pgtbl_supported(void) +{ + return (check_feature(FEATURE_GIOSUP) && check_feature(FEATURE_GT)); +} + static inline bool amd_iommu_gt_ppr_supported(void) { - return (check_feature(FEATURE_GT) && + return (amd_iommu_v2_pgtbl_supported() && check_feature(FEATURE_PPR) && check_feature(FEATURE_EPHSUP)); } diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index c68ecbad2913..e3d073f941e3 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -2070,8 +2070,7 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) init_iommu_perf_ctr(iommu); if (amd_iommu_pgtable == AMD_IOMMU_V2) { - if (!check_feature(FEATURE_GIOSUP) || - !check_feature(FEATURE_GT)) { + if (!amd_iommu_v2_pgtbl_supported()) { pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n"); amd_iommu_pgtable = AMD_IOMMU_V1; } -- Gitee From 1dbc63f069435b1f2a6abad398e26bfe78bfef96 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 28 Oct 2024 09:38:04 +0000 Subject: [PATCH 056/278] iommu/amd: Move V2 page table support check to early_amd_iommu_init() ANBZ: #13617 commit d15f55d645a8d8235593aa888d76a694c73e391c upstream. amd_iommu_pgtable validation has to be done before calling iommu_snp_enable(). 
It can be done immediately after reading IOMMU features. Hence move this check to early_amd_iommu_init(). Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241028093810.5901-7-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/init.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index e3d073f941e3..8846ee8e1224 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -2069,13 +2069,6 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) init_iommu_perf_ctr(iommu); - if (amd_iommu_pgtable == AMD_IOMMU_V2) { - if (!amd_iommu_v2_pgtbl_supported()) { - pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n"); - amd_iommu_pgtable = AMD_IOMMU_V1; - } - } - if (is_rd890_iommu(iommu->dev)) { int i, j; @@ -3093,6 +3086,13 @@ static int __init early_amd_iommu_init(void) FIELD_GET(FEATURE_GATS, amd_iommu_efr) == GUEST_PGTABLE_5_LEVEL) amd_iommu_gpt_level = PAGE_MODE_5_LEVEL; + if (amd_iommu_pgtable == AMD_IOMMU_V2) { + if (!amd_iommu_v2_pgtbl_supported()) { + pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n"); + amd_iommu_pgtable = AMD_IOMMU_V1; + } + } + /* Disable any previously enabled IOMMUs */ if (!is_kdump_kernel() || amd_iommu_disabled) disable_iommus(); -- Gitee From 7d2f820127c62a30962145ddcc42d3e902a07474 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 28 Oct 2024 09:38:05 +0000 Subject: [PATCH 057/278] iommu/amd: Separate page table setup from domain allocation ANBZ: #13617 commit b3c989083dabab472eb766d59b1d1fb9f11495d6 upstream. Currently protection_domain_alloc() allocates domain and also sets up page table. Page table setup is required for PAGING domain only. Domain type like SVA doesn't need page table. Hence move page table setup code to separate function. Also SVA domain allocation path does not call pdom_setup_pgtable(). Hence remove IOMMU_DOMAIN_SVA type check. 
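The resulting paging-domain allocation becomes two explicit steps, condensed here from the hunk below for readability (the SVA path never calls pdom_setup_pgtable(), and IOMMU_DOMAIN_IDENTITY returns early from it):

        domain = protection_domain_alloc(type, nid);
        if (!domain)
                return ERR_PTR(-ENOMEM);

        ret = pdom_setup_pgtable(domain, type);
        if (ret) {
                domain_id_free(domain->id);
                kfree(domain);
                return ERR_PTR(ret);
        }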
Signed-off-by: Vasant Hegde Reviewed-by: Jacob Pan Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241028093810.5901-8-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 42 ++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index a0c974361643..467fad2b2726 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2274,28 +2274,36 @@ void protection_domain_free(struct protection_domain *domain) struct protection_domain *protection_domain_alloc(unsigned int type, int nid) { - struct io_pgtable_ops *pgtbl_ops; struct protection_domain *domain; - int pgtable; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; domain->id = domain_id_alloc(); - if (!domain->id) - goto err_free; + if (!domain->id) { + kfree(domain); + return NULL; + } spin_lock_init(&domain->lock); INIT_LIST_HEAD(&domain->dev_list); INIT_LIST_HEAD(&domain->dev_data_list); domain->iop.pgtbl.cfg.amd.nid = nid; + return domain; +} + +static int pdom_setup_pgtable(struct protection_domain *domain, + unsigned int type) +{ + struct io_pgtable_ops *pgtbl_ops; + int pgtable; + switch (type) { /* No need to allocate io pgtable ops in passthrough mode */ case IOMMU_DOMAIN_IDENTITY: - case IOMMU_DOMAIN_SVA: - return domain; + return 0; case IOMMU_DOMAIN_DMA: pgtable = amd_iommu_pgtable; break; @@ -2307,7 +2315,7 @@ struct protection_domain *protection_domain_alloc(unsigned int type, int nid) pgtable = AMD_IOMMU_V1; break; default: - goto err_id; + return -EINVAL; } switch (pgtable) { @@ -2318,20 +2326,14 @@ struct protection_domain *protection_domain_alloc(unsigned int type, int nid) domain->pd_mode = PD_MODE_V2; break; default: - goto err_id; + return -EINVAL; } - pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl.cfg, domain); if (!pgtbl_ops) - goto err_id; + return -ENOMEM; - return domain; -err_id: - domain_id_free(domain->id); -err_free: - kfree(domain); - return NULL; + return 0; } static inline u64 dma_max_address(void) @@ -2367,6 +2369,7 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; struct protection_domain *domain; struct amd_iommu *iommu = NULL; + int ret; if (dev) iommu = get_amd_iommu_from_dev(dev); @@ -2386,6 +2389,13 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, if (!domain) return ERR_PTR(-ENOMEM); + ret = pdom_setup_pgtable(domain, type); + if (ret) { + domain_id_free(domain->id); + kfree(domain); + return ERR_PTR(ret); + } + domain->domain.geometry.aperture_start = 0; domain->domain.geometry.aperture_end = dma_max_address(); domain->domain.geometry.force_aperture = true; -- Gitee From 2ebe73d75914a378ee5a7836d56fa203b2b257cd Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 28 Oct 2024 09:38:06 +0000 Subject: [PATCH 058/278] iommu/amd: Pass page table type as param to pdom_setup_pgtable() ANBZ: #13617 commit a005ef62f9922ae023aec63d673217486656b6bc upstream. Current code forces v1 page table for UNMANAGED domain and global page table type (amd_iommu_pgtable) for rest of paging domain. Following patch series adds support for domain_alloc_paging() ops. Also enhances domain_alloc_user() to allocate page table based on 'flags. Hence pass page table type as parameter to pdomain_setup_pgtable(). So that caller can decide right page table type. 
Also update dma_max_address() to take pgtable as parameter. Signed-off-by: Vasant Hegde Reviewed-by: Jacob Pan Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241028093810.5901-9-vasant.hegde@amd.com Signed-off-by: Joerg Roedel [ Jay Chen: fix rebase conflicts in ../drivers/iommu/amd/iommu.c ] Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 43 +++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 467fad2b2726..588ccb02c3cd 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2295,28 +2295,13 @@ struct protection_domain *protection_domain_alloc(unsigned int type, int nid) } static int pdom_setup_pgtable(struct protection_domain *domain, - unsigned int type) + unsigned int type, int pgtable) { struct io_pgtable_ops *pgtbl_ops; - int pgtable; - switch (type) { /* No need to allocate io pgtable ops in passthrough mode */ - case IOMMU_DOMAIN_IDENTITY: + if (!(type & __IOMMU_DOMAIN_PAGING)) return 0; - case IOMMU_DOMAIN_DMA: - pgtable = amd_iommu_pgtable; - break; - /* - * Force IOMMU v1 page table when allocating - * domain for pass-through devices. - */ - case IOMMU_DOMAIN_UNMANAGED: - pgtable = AMD_IOMMU_V1; - break; - default: - return -EINVAL; - } switch (pgtable) { case AMD_IOMMU_V1: @@ -2328,6 +2313,7 @@ static int pdom_setup_pgtable(struct protection_domain *domain, default: return -EINVAL; } + pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl.cfg, domain); if (!pgtbl_ops) @@ -2336,9 +2322,9 @@ static int pdom_setup_pgtable(struct protection_domain *domain, return 0; } -static inline u64 dma_max_address(void) +static inline u64 dma_max_address(int pgtable) { - if (amd_iommu_pgtable == AMD_IOMMU_V1) + if (pgtable == AMD_IOMMU_V1) return ~0ULL; /* @@ -2364,7 +2350,8 @@ static bool amd_iommu_hd_support(struct amd_iommu *iommu) } static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, - struct device *dev, u32 flags) + struct device *dev, + u32 flags, int pgtable) { bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; struct protection_domain *domain; @@ -2389,7 +2376,7 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, if (!domain) return ERR_PTR(-ENOMEM); - ret = pdom_setup_pgtable(domain, type); + ret = pdom_setup_pgtable(domain, type, pgtable); if (ret) { domain_id_free(domain->id); kfree(domain); @@ -2397,7 +2384,7 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, } domain->domain.geometry.aperture_start = 0; - domain->domain.geometry.aperture_end = dma_max_address(); + domain->domain.geometry.aperture_end = dma_max_address(pgtable); domain->domain.geometry.force_aperture = true; domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap; @@ -2415,8 +2402,16 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type) { struct iommu_domain *domain; + int pgtable = amd_iommu_pgtable; + + /* + * Force IOMMU v1 page table when allocating + * domain for pass-through devices. 
+ */ + if (type == IOMMU_DOMAIN_UNMANAGED) + pgtable = AMD_IOMMU_V1; - domain = do_iommu_domain_alloc(type, NULL, 0); + domain = do_iommu_domain_alloc(type, NULL, 0, pgtable); if (IS_ERR(domain)) return NULL; @@ -2434,7 +2429,7 @@ amd_iommu_domain_alloc_user(struct device *dev, u32 flags, if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data) return ERR_PTR(-EOPNOTSUPP); - return do_iommu_domain_alloc(type, dev, flags); + return do_iommu_domain_alloc(type, dev, flags, AMD_IOMMU_V1); } void amd_iommu_domain_free(struct iommu_domain *dom) -- Gitee From d04f724e014e452185a3b785d7a31196cbbbbcbb Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 28 Oct 2024 09:38:07 +0000 Subject: [PATCH 059/278] iommu/amd: Enhance amd_iommu_domain_alloc_user() ANBZ: #13617 commit ce2cd175469f6cb56e8de9bf87805206c8d19764 upstream. Previous patch enhanced core layer to check device PASID capability and pass right flags to ops->domain_alloc_user(). Enhance amd_iommu_domain_alloc_user() to allocate domain with appropriate page table based on flags parameter. - If flags is empty then allocate domain with default page table type. This will eventually replace ops->domain_alloc(). For UNMANAGED domain, core will call this interface with flags=0. So AMD driver will continue to allocate V1 page table. - If IOMMU_HWPT_ALLOC_PASID flags is passed then allocate domain with v2 page table. Signed-off-by: Vasant Hegde Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241028093810.5901-10-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 588ccb02c3cd..b46aceb15e28 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2368,9 +2368,6 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY)) return ERR_PTR(-EINVAL); - if (dirty_tracking && !amd_iommu_hd_support(iommu)) - return ERR_PTR(-EOPNOTSUPP); - domain = protection_domain_alloc(type, dev ? dev_to_node(dev) : NUMA_NO_NODE); if (!domain) @@ -2425,11 +2422,36 @@ amd_iommu_domain_alloc_user(struct device *dev, u32 flags, { unsigned int type = IOMMU_DOMAIN_UNMANAGED; + struct amd_iommu *iommu = NULL; + const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_PASID; + + if (dev) + iommu = get_amd_iommu_from_dev(dev); + + if ((flags & ~supported_flags) || parent || user_data) + return ERR_PTR(-EOPNOTSUPP); + + /* Allocate domain with v2 page table if IOMMU supports PASID. 
*/ + if (flags & IOMMU_HWPT_ALLOC_PASID) { + if (!amd_iommu_pasid_supported()) + return ERR_PTR(-EOPNOTSUPP); + + return do_iommu_domain_alloc(type, dev, flags, AMD_IOMMU_V2); + } + + /* Allocate domain with v1 page table for dirty tracking */ + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) { + if (iommu && amd_iommu_hd_support(iommu)) { + return do_iommu_domain_alloc(type, dev, + flags, AMD_IOMMU_V1); + } - if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data) return ERR_PTR(-EOPNOTSUPP); + } - return do_iommu_domain_alloc(type, dev, flags, AMD_IOMMU_V1); + /* If nothing specific is required use the kernel commandline default */ + return do_iommu_domain_alloc(type, dev, 0, amd_iommu_pgtable); } void amd_iommu_domain_free(struct iommu_domain *dom) -- Gitee From 6011effcf1c2e1a48ca39bd94ebd783c3e5d69f2 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 28 Oct 2024 09:38:08 +0000 Subject: [PATCH 060/278] iommu/amd: Implement global identity domain ANBZ: #13617 commit 4402f2627d30c44f56180295e14561f1820eeacc upstream. Implement global identity domain. All device groups in identity domain will share this domain. In attach device path, based on device capability it will allocate per device domain ID and GCR3 table. So that it can support SVA. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241028093810.5901-11-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu.h | 1 + drivers/iommu/amd/init.c | 3 +++ drivers/iommu/amd/iommu.c | 36 +++++++++++++++++++++++++++++++---- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index d9a95869feb8..bc2515ea8261 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -46,6 +46,7 @@ extern int amd_iommu_gpt_level; extern unsigned long amd_iommu_pgsize_bitmap; /* Protection domain ops */ +void amd_iommu_init_identity_domain(void); struct protection_domain *protection_domain_alloc(unsigned int type, int nid); void protection_domain_free(struct protection_domain *domain); struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev, diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 8846ee8e1224..22f9e07f9b3f 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -2163,6 +2163,9 @@ static int __init amd_iommu_init_pci(void) struct amd_iommu_pci_seg *pci_seg; int ret; + /* Init global identity domain before registering IOMMU */ + amd_iommu_init_identity_domain(); + for_each_iommu(iommu) { ret = iommu_init_pci(iommu); if (ret) { diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b46aceb15e28..b111a54bcf84 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -74,6 +74,9 @@ struct kmem_cache *amd_iommu_irq_cache; static void detach_device(struct device *dev); +static int amd_iommu_attach_device(struct iommu_domain *dom, + struct device *dev); + static void set_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data); @@ -2272,6 +2275,14 @@ void protection_domain_free(struct protection_domain *domain) kfree(domain); } +static void protection_domain_init(struct protection_domain *domain, int nid) +{ + spin_lock_init(&domain->lock); + INIT_LIST_HEAD(&domain->dev_list); + INIT_LIST_HEAD(&domain->dev_data_list); + domain->iop.pgtbl.cfg.amd.nid = nid; +} + struct protection_domain *protection_domain_alloc(unsigned int type, int nid) { 
struct protection_domain *domain; @@ -2286,10 +2297,7 @@ struct protection_domain *protection_domain_alloc(unsigned int type, int nid) return NULL; } - spin_lock_init(&domain->lock); - INIT_LIST_HEAD(&domain->dev_list); - INIT_LIST_HEAD(&domain->dev_data_list); - domain->iop.pgtbl.cfg.amd.nid = nid; + protection_domain_init(domain, nid); return domain; } @@ -2493,6 +2501,25 @@ static struct iommu_domain blocked_domain = { } }; +static struct protection_domain identity_domain; + +static const struct iommu_domain_ops identity_domain_ops = { + .attach_dev = amd_iommu_attach_device, +}; + +void amd_iommu_init_identity_domain(void) +{ + struct iommu_domain *domain = &identity_domain.domain; + + domain->type = IOMMU_DOMAIN_IDENTITY; + domain->ops = &identity_domain_ops; + domain->owner = &amd_iommu_ops; + + identity_domain.id = domain_id_alloc(); + + protection_domain_init(&identity_domain, NUMA_NO_NODE); +} + static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { @@ -2891,6 +2918,7 @@ static int amd_iommu_dev_disable_feature(struct device *dev, const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .blocked_domain = &blocked_domain, + .identity_domain = &identity_domain.domain, .domain_alloc = amd_iommu_domain_alloc, .domain_alloc_user = amd_iommu_domain_alloc_user, .domain_alloc_sva = amd_iommu_domain_alloc_sva, -- Gitee From 2ae42b45dcdd394b59cb2e60fc70cdadb920b8c5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 28 Oct 2024 09:38:09 +0000 Subject: [PATCH 061/278] iommu: Put domain allocation in __iommu_group_alloc_blocking_domain() ANBZ: #13617 commit 4208849ec7a6c46d41e623c359c5619704717140 upstream. There is no longer a reason to call __iommu_domain_alloc() to allocate the blocking domain. All drivers that support a native blocking domain provide it via the ops, for other drivers we should call iommu_paging_domain_alloc(). __iommu_group_alloc_blocking_domain() is the only place that allocates an BLOCKED domain, so move the ops->blocked_domain logic there. Signed-off-by: Jason Gunthorpe Signed-off-by: Vasant Hegde Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241028093810.5901-12-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 745002737c1b..c26b46900838 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1996,8 +1996,6 @@ static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, if (alloc_type == IOMMU_DOMAIN_IDENTITY && ops->identity_domain) return ops->identity_domain; - else if (alloc_type == IOMMU_DOMAIN_BLOCKED && ops->blocked_domain) - return ops->blocked_domain; else if (type & __IOMMU_DOMAIN_PAGING && ops->domain_alloc_paging) domain = ops->domain_alloc_paging(dev); else if (ops->domain_alloc) @@ -3194,22 +3192,25 @@ void iommu_device_unuse_default_domain(struct device *dev) static int __iommu_group_alloc_blocking_domain(struct iommu_group *group) { + struct device *dev = iommu_group_first_dev(group); + const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; if (group->blocking_domain) return 0; - domain = __iommu_group_domain_alloc(group, IOMMU_DOMAIN_BLOCKED); - if (IS_ERR(domain)) { - /* - * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED - * create an empty domain instead. 
- */ - domain = __iommu_group_domain_alloc(group, - IOMMU_DOMAIN_UNMANAGED); - if (IS_ERR(domain)) - return PTR_ERR(domain); + if (ops->blocked_domain) { + group->blocking_domain = ops->blocked_domain; + return 0; } + + /* + * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED create an + * empty PAGING domain instead. + */ + domain = iommu_paging_domain_alloc(dev); + if (IS_ERR(domain)) + return PTR_ERR(domain); group->blocking_domain = domain; return 0; } -- Gitee From 4238e7a831afb3de22fa708d30ef784ff4c13fc7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 28 Oct 2024 09:38:10 +0000 Subject: [PATCH 062/278] iommu: Create __iommu_alloc_identity_domain() ANBZ: #13617 commit 4490ccc45fb77f77550bdd65e663af9e1cae6bcb upstream. Consolidate all the code to create an IDENTITY domain into one function. This removes the legacy __iommu_domain_alloc() path from all paths, and preps it for final removal. BLOCKED/IDENTITY/PAGING are now always allocated via a type specific function. [Joerg: Actually remove __iommu_domain_alloc()] Signed-off-by: Jason Gunthorpe Signed-off-by: Vasant Hegde Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241028093810.5901-13-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 69 ++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 41 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index c26b46900838..b321470a8bd6 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -94,8 +94,6 @@ static const char * const iommu_group_resv_type_string[] = { static int iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data); static void iommu_release_device(struct device *dev); -static struct iommu_domain * -__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type); static int __iommu_attach_device(struct iommu_domain *domain, struct device *dev); static int __iommu_attach_group(struct iommu_domain *domain, @@ -137,6 +135,8 @@ static struct group_device *iommu_group_alloc_device(struct iommu_group *group, struct device *dev); static void __iommu_group_free_device(struct iommu_group *group, struct group_device *grp_dev); +static void iommu_domain_init(struct iommu_domain *domain, unsigned int type, + const struct iommu_ops *ops); #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ @@ -1601,6 +1601,28 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) } EXPORT_SYMBOL_GPL(fsl_mc_device_group); +static struct iommu_domain *__iommu_alloc_identity_domain(struct device *dev) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_domain *domain; + + if (ops->identity_domain) + return ops->identity_domain; + + /* Older drivers create the identity domain via ops->domain_alloc() */ + if (!ops->domain_alloc) + return ERR_PTR(-EOPNOTSUPP); + + domain = ops->domain_alloc(IOMMU_DOMAIN_IDENTITY); + if (IS_ERR(domain)) + return domain; + if (!domain) + return ERR_PTR(-ENOMEM); + + iommu_domain_init(domain, IOMMU_DOMAIN_IDENTITY, ops); + return domain; +} + static struct iommu_domain * __iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) { @@ -1628,7 +1650,10 @@ __iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) return dom; } - return __iommu_group_domain_alloc(group, req_type); + if (req_type == IOMMU_DOMAIN_IDENTITY) + return __iommu_alloc_identity_domain(dev); + + return 
ERR_PTR(-EINVAL); } /* @@ -1987,44 +2012,6 @@ static void iommu_domain_init(struct iommu_domain *domain, unsigned int type, domain->pgsize_bitmap = ops->pgsize_bitmap; } -static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, - struct device *dev, - unsigned int type) -{ - struct iommu_domain *domain; - unsigned int alloc_type = type & IOMMU_DOMAIN_ALLOC_FLAGS; - - if (alloc_type == IOMMU_DOMAIN_IDENTITY && ops->identity_domain) - return ops->identity_domain; - else if (type & __IOMMU_DOMAIN_PAGING && ops->domain_alloc_paging) - domain = ops->domain_alloc_paging(dev); - else if (ops->domain_alloc) - domain = ops->domain_alloc(alloc_type); - else - return ERR_PTR(-EOPNOTSUPP); - - /* - * Many domain_alloc ops now return ERR_PTR, make things easier for the - * driver by accepting ERR_PTR from all domain_alloc ops instead of - * having two rules. - */ - if (IS_ERR(domain)) - return domain; - if (!domain) - return ERR_PTR(-ENOMEM); - - iommu_domain_init(domain, type, ops); - return domain; -} - -static struct iommu_domain * -__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type) -{ - struct device *dev = iommu_group_first_dev(group); - - return __iommu_domain_alloc(dev_iommu_ops(dev), dev, type); -} - static struct iommu_domain * __iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type, unsigned int flags) -- Gitee From 360caa1f91881c71da93bd206b4281ab46e5c123 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 29 Oct 2024 11:54:25 +0100 Subject: [PATCH 063/278] iommu: Fix prototype of iommu_paging_domain_alloc_flags() ANBZ: #13617 commit d14772c0d88c387f881a577aa136e1e9b1291d07 upstream. The iommu_paging_domain_alloc_flags() prototype for non-iommu kernel configurations lacks the 'static inline' prefixes. Cc: Jason Gunthorpe Cc: Vasant Hegde Fixes: 20858d4ebb42 ("iommu: Introduce iommu_paging_domain_alloc_flags()") Reviewed-by: Jason Gunthorpe Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2ff4de149dca..f3c9b1b616f7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1124,7 +1124,7 @@ static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap) return false; } -struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, +static inline struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, unsigned int flags) { return ERR_PTR(-ENODEV); -- Gitee From 3d4026115072b32109000e0031599a8a754e800f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 29 Oct 2024 15:58:24 +0000 Subject: [PATCH 064/278] iommu/tegra241-cmdqv: Fix unused variable warning ANBZ: #13617 commit 5492f0c4085a8fb8820ff974f17b83a7d6dab5a5 upstream. While testing some io-pgtable changes, I ran into a compiler warning from the Tegra CMDQ driver: drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c:803:23: warning: unused variable 'cmdqv_debugfs_dir' [-Wunused-variable] 803 | static struct dentry *cmdqv_debugfs_dir; | ^~~~~~~~~~~~~~~~~ 1 warning generated. Guard the variable declaration with CONFIG_IOMMU_DEBUGFS to silence the warning. 
Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index a243c543598c..6c7770e79af6 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -800,7 +800,9 @@ static int tegra241_cmdqv_init_structures(struct arm_smmu_device *smmu) return 0; } +#ifdef CONFIG_IOMMU_DEBUGFS static struct dentry *cmdqv_debugfs_dir; +#endif static struct arm_smmu_device * __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, -- Gitee From 3c11987422d7f0cfcc21ee62bee361e5d7cdd4e5 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:45 +0000 Subject: [PATCH 065/278] iommu/amd/pgtbl_v2: Take protection domain lock before invalidating TLB ANBZ: #13617 commit 016991606aa01c4d92e6941be636c0c897aa05c7 upstream. Commit c7fc12354be0 ("iommu/amd/pgtbl_v2: Invalidate updated page ranges only") missed to take domain lock before calling amd_iommu_domain_flush_pages(). Fix this by taking protection domain lock before calling TLB invalidation function. Fixes: c7fc12354be0 ("iommu/amd/pgtbl_v2: Invalidate updated page ranges only") Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-2-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/io_pgtable_v2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c index 0ee4f45ec14e..a56a27396305 100644 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ b/drivers/iommu/amd/io_pgtable_v2.c @@ -268,8 +268,11 @@ static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, out: if (updated) { struct protection_domain *pdom = io_pgtable_ops_to_domain(ops); + unsigned long flags; + spin_lock_irqsave(&pdom->lock, flags); amd_iommu_domain_flush_pages(pdom, o_iova, size); + spin_unlock_irqrestore(&pdom->lock, flags); } if (mapped) -- Gitee From 57accb58ba7436c35c7941c2371912f8e5b63ce9 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:46 +0000 Subject: [PATCH 066/278] iommu/amd: Use ida interface to manage protection domain ID ANBZ: #13617 commit 2fcab2deebc33640516d9d379b626557a38d5d4f upstream. Replace custom domain ID allocator with IDA interface. 
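The replacement is the plain IDA allocate/free pair; a condensed sketch of the pattern as it appears in the patch (GFP_ATOMIC kept as in the driver code):

        static DEFINE_IDA(pdom_ids);

        /* IDs 1..MAX_DOMAIN_ID-1; 0 stays reserved as the non-allocated/error value */
        id = ida_alloc_range(&pdom_ids, 1, MAX_DOMAIN_ID - 1, GFP_ATOMIC);
        if (id <= 0)
                return -ENOSPC;

        /* ... use the ID ... */

        ida_free(&pdom_ids, id);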
Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-3-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 6 ++-- drivers/iommu/amd/init.c | 31 ++++------------ drivers/iommu/amd/iommu.c | 56 +++++++++++++---------------- 3 files changed, 35 insertions(+), 58 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 601fb4ee6900..e70f6299f765 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -912,14 +912,14 @@ struct unity_map_entry { /* size of the dma_ops aperture as power of 2 */ extern unsigned amd_iommu_aperture_order; -/* allocation bitmap for domain ids */ -extern unsigned long *amd_iommu_pd_alloc_bitmap; - extern bool amd_iommu_force_isolation; /* Max levels of glxval supported */ extern int amd_iommu_max_glx_val; +/* IDA to track protection domain IDs */ +extern struct ida pdom_ids; + /* Global EFR and EFR2 registers */ extern u64 amd_iommu_efr; extern u64 amd_iommu_efr2; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 22f9e07f9b3f..36d47bd408e7 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -193,12 +193,6 @@ bool amd_iommu_force_isolation __read_mostly; unsigned long amd_iommu_pgsize_bitmap __ro_after_init = AMD_IOMMU_PGSIZES; -/* - * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap - * to know which ones are already in use. - */ -unsigned long *amd_iommu_pd_alloc_bitmap; - enum iommu_init_state { IOMMU_START_STATE, IOMMU_IVRS_DETECTED, @@ -1081,7 +1075,12 @@ static bool __copy_device_table(struct amd_iommu *iommu) if (dte_v && dom_id) { pci_seg->old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0]; pci_seg->old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1]; - __set_bit(dom_id, amd_iommu_pd_alloc_bitmap); + /* Reserve the Domain IDs used by previous kernel */ + if (ida_alloc_range(&pdom_ids, dom_id, dom_id, GFP_ATOMIC) != dom_id) { + pr_err("Failed to reserve domain ID 0x%x\n", dom_id); + memunmap(old_devtb); + return false; + } /* If gcr3 table existed, mask it out */ if (old_devtb[devid].data[0] & DTE_FLAG_GV) { tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B; @@ -2992,9 +2991,7 @@ static bool __init check_ioapic_information(void) static void __init free_dma_resources(void) { - iommu_free_pages(amd_iommu_pd_alloc_bitmap, - get_order(MAX_DOMAIN_ID / 8)); - amd_iommu_pd_alloc_bitmap = NULL; + ida_destroy(&pdom_ids); free_unity_maps(); } @@ -3062,20 +3059,6 @@ static int __init early_amd_iommu_init(void) amd_iommu_target_ivhd_type = get_highest_supported_ivhd_type(ivrs_base); DUMP_printk("Using IVHD type %#x\n", amd_iommu_target_ivhd_type); - /* Device table - directly used by all IOMMUs */ - ret = -ENOMEM; - - amd_iommu_pd_alloc_bitmap = iommu_alloc_pages(GFP_KERNEL, - get_order(MAX_DOMAIN_ID / 8)); - if (amd_iommu_pd_alloc_bitmap == NULL) - goto out; - - /* - * never allocate domain 0 because its used as the non-allocated and - * error value placeholder - */ - __set_bit(0, amd_iommu_pd_alloc_bitmap); - /* * now the data structures are allocated and basically initialized * start the real acpi table scan diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b111a54bcf84..7e2af11115d0 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -52,8 +53,6 @@ #define 
HT_RANGE_START (0xfd00000000ULL) #define HT_RANGE_END (0xffffffffffULL) -static DEFINE_SPINLOCK(pd_bitmap_lock); - LIST_HEAD(ioapic_map); LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); @@ -70,6 +69,12 @@ struct iommu_cmd { u32 data[4]; }; +/* + * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap + * to know which ones are already in use. + */ +DEFINE_IDA(pdom_ids); + struct kmem_cache *amd_iommu_irq_cache; static void detach_device(struct device *dev); @@ -1652,31 +1657,14 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag) * ****************************************************************************/ -static u16 domain_id_alloc(void) +static int pdom_id_alloc(void) { - unsigned long flags; - int id; - - spin_lock_irqsave(&pd_bitmap_lock, flags); - id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); - BUG_ON(id == 0); - if (id > 0 && id < MAX_DOMAIN_ID) - __set_bit(id, amd_iommu_pd_alloc_bitmap); - else - id = 0; - spin_unlock_irqrestore(&pd_bitmap_lock, flags); - - return id; + return ida_alloc_range(&pdom_ids, 1, MAX_DOMAIN_ID - 1, GFP_ATOMIC); } -static void domain_id_free(int id) +static void pdom_id_free(int id) { - unsigned long flags; - - spin_lock_irqsave(&pd_bitmap_lock, flags); - if (id > 0 && id < MAX_DOMAIN_ID) - __clear_bit(id, amd_iommu_pd_alloc_bitmap); - spin_unlock_irqrestore(&pd_bitmap_lock, flags); + ida_free(&pdom_ids, id); } static void free_gcr3_tbl_level1(u64 *tbl) @@ -1721,7 +1709,7 @@ static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info) gcr3_info->glx = 0; /* Free per device domain ID */ - domain_id_free(gcr3_info->domid); + pdom_id_free(gcr3_info->domid); iommu_free_page(gcr3_info->gcr3_tbl); gcr3_info->gcr3_tbl = NULL; @@ -1748,6 +1736,7 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, { int levels = get_gcr3_levels(pasids); int nid = iommu ? 
dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; + int domid; if (levels > amd_iommu_max_glx_val) return -EINVAL; @@ -1756,11 +1745,14 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, return -EBUSY; /* Allocate per device domain ID */ - gcr3_info->domid = domain_id_alloc(); + domid = pdom_id_alloc(); + if (domid <= 0) + return -ENOSPC; + gcr3_info->domid = domid; gcr3_info->gcr3_tbl = iommu_alloc_page_node(nid, GFP_ATOMIC); if (gcr3_info->gcr3_tbl == NULL) { - domain_id_free(gcr3_info->domid); + pdom_id_free(domid); return -ENOMEM; } @@ -2271,7 +2263,7 @@ void protection_domain_free(struct protection_domain *domain) WARN_ON(!list_empty(&domain->dev_list)); if (domain->domain.type & __IOMMU_DOMAIN_PAGING) free_io_pgtable_ops(&domain->iop.pgtbl.ops); - domain_id_free(domain->id); + pdom_id_free(domain->id); kfree(domain); } @@ -2286,16 +2278,18 @@ static void protection_domain_init(struct protection_domain *domain, int nid) struct protection_domain *protection_domain_alloc(unsigned int type, int nid) { struct protection_domain *domain; + int domid; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; - domain->id = domain_id_alloc(); - if (!domain->id) { + domid = pdom_id_alloc(); + if (domid <= 0) { kfree(domain); return NULL; } + domain->id = domid; protection_domain_init(domain, nid); @@ -2383,7 +2377,7 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, ret = pdom_setup_pgtable(domain, type, pgtable); if (ret) { - domain_id_free(domain->id); + pdom_id_free(domain->id); kfree(domain); return ERR_PTR(ret); } @@ -2515,7 +2509,7 @@ void amd_iommu_init_identity_domain(void) domain->ops = &identity_domain_ops; domain->owner = &amd_iommu_ops; - identity_domain.id = domain_id_alloc(); + identity_domain.id = pdom_id_alloc(); protection_domain_init(&identity_domain, NUMA_NO_NODE); } -- Gitee From 593ffa2db66808ca237b071a8f5c32293e888828 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:47 +0000 Subject: [PATCH 067/278] iommu/amd: Remove protection_domain.dev_cnt variable ANBZ: #13617 commit 743a4bae9fa1480e5f6837f6a55be918d6cd0e16 upstream. protection_domain->dev_list tracks list of attached devices to domain. We can use list_* functions on dev_list to get device count. Hence remove 'dev_cnt' variable. No functional change intended. 
Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Joerg Roedel Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-4-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 1 - drivers/iommu/amd/iommu.c | 7 +------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index e70f6299f765..90a752f57463 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -578,7 +578,6 @@ struct protection_domain { u16 id; /* the domain id written to the device table */ enum protection_domain_mode pd_mode; /* Track page table type */ bool dirty_tracking; /* dirty tracking is enabled in the domain */ - unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ struct mmu_notifier mn; /* mmu notifier for the SVA domain */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 7e2af11115d0..cb078a1a1ee9 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2040,7 +2040,6 @@ static int do_attach(struct iommu_dev_data *dev_data, /* Do reference counting */ domain->dev_iommu[iommu->index] += 1; - domain->dev_cnt += 1; /* Setup GCR3 table */ if (pdom_is_sva_capable(domain)) { @@ -2073,7 +2072,6 @@ static void do_detach(struct iommu_dev_data *dev_data) /* decrease reference counters - needs to happen after the flushes */ domain->dev_iommu[iommu->index] -= 1; - domain->dev_cnt -= 1; } /* @@ -2246,16 +2244,13 @@ static void cleanup_domain(struct protection_domain *domain) lockdep_assert_held(&domain->lock); - if (!domain->dev_cnt) - return; - while (!list_empty(&domain->dev_list)) { entry = list_first_entry(&domain->dev_list, struct iommu_dev_data, list); BUG_ON(!entry->domain); do_detach(entry); } - WARN_ON(domain->dev_cnt != 0); + WARN_ON(!list_empty(&domain->dev_list)); } void protection_domain_free(struct protection_domain *domain) -- Gitee From 662f590567efce5f846056bd4b112e575f4621a4 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:48 +0000 Subject: [PATCH 068/278] iommu/amd: xarray to track protection_domain->iommu list ANBZ: #13617 commit d16041124de1dea4389b5e6b330657f34f8c0492 upstream. Use xarray to track IOMMU attached to protection domain instead of static array of MAX_IOMMUS. Also add lockdep assertion. Signed-off-by: Vasant Hegde Reviewed-by: Joerg Roedel Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-5-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 8 ++- drivers/iommu/amd/iommu.c | 89 +++++++++++++++++++++++------ 2 files changed, 77 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 90a752f57463..f88f6e4e910f 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -565,6 +565,12 @@ struct pdom_dev_data { struct list_head list; }; +/* Keeps track of the IOMMUs attached to protection domain */ +struct pdom_iommu_info { + struct amd_iommu *iommu; /* IOMMUs attach to protection domain */ + u32 refcnt; /* Count of attached dev/pasid per domain/IOMMU */ +}; + /* * This structure contains generic data for IOMMU protection domains * independent of their use. 
@@ -578,7 +584,7 @@ struct protection_domain { u16 id; /* the domain id written to the device table */ enum protection_domain_mode pd_mode; /* Track page table type */ bool dirty_tracking; /* dirty tracking is enabled in the domain */ - unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ + struct xarray iommu_array; /* per-IOMMU reference count */ struct mmu_notifier mn; /* mmu notifier for the SVA domain */ struct list_head dev_data_list; /* List of pdom_dev_data */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index cb078a1a1ee9..464410620a6c 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1265,18 +1265,17 @@ static int iommu_completion_wait(struct amd_iommu *iommu) static void domain_flush_complete(struct protection_domain *domain) { - int i; + struct pdom_iommu_info *pdom_iommu_info; + unsigned long i; - for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { - if (domain && !domain->dev_iommu[i]) - continue; + lockdep_assert_held(&domain->lock); - /* - * Devices of this domain are behind this IOMMU - * We need to wait for completion of all commands. - */ - iommu_completion_wait(amd_iommus[i]); - } + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. + */ + xa_for_each(&domain->iommu_array, i, pdom_iommu_info) + iommu_completion_wait(pdom_iommu_info->iommu); } static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) @@ -1459,21 +1458,22 @@ static int domain_flush_pages_v2(struct protection_domain *pdom, static int domain_flush_pages_v1(struct protection_domain *pdom, u64 address, size_t size) { + struct pdom_iommu_info *pdom_iommu_info; struct iommu_cmd cmd; - int ret = 0, i; + int ret = 0; + unsigned long i; + + lockdep_assert_held(&pdom->lock); build_inv_iommu_pages(&cmd, address, size, pdom->id, IOMMU_NO_PASID, false); - for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { - if (!pdom->dev_iommu[i]) - continue; - + xa_for_each(&pdom->iommu_array, i, pdom_iommu_info) { /* * Devices of this domain are behind this IOMMU * We need a TLB flush */ - ret |= iommu_queue_command(amd_iommus[i], &cmd); + ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd); } return ret; @@ -1512,6 +1512,8 @@ static void __domain_flush_pages(struct protection_domain *domain, void amd_iommu_domain_flush_pages(struct protection_domain *domain, u64 address, size_t size) { + lockdep_assert_held(&domain->lock); + if (likely(!amd_iommu_np_cache)) { __domain_flush_pages(domain, address, size); @@ -2023,6 +2025,50 @@ static void destroy_gcr3_table(struct iommu_dev_data *dev_data, free_gcr3_table(gcr3_info); } +static int pdom_attach_iommu(struct amd_iommu *iommu, + struct protection_domain *pdom) +{ + struct pdom_iommu_info *pdom_iommu_info, *curr; + + pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); + if (pdom_iommu_info) { + pdom_iommu_info->refcnt++; + return 0; + } + + pdom_iommu_info = kzalloc(sizeof(*pdom_iommu_info), GFP_ATOMIC); + if (!pdom_iommu_info) + return -ENOMEM; + + pdom_iommu_info->iommu = iommu; + pdom_iommu_info->refcnt = 1; + + curr = xa_cmpxchg(&pdom->iommu_array, iommu->index, + NULL, pdom_iommu_info, GFP_ATOMIC); + if (curr) { + kfree(pdom_iommu_info); + return -ENOSPC; + } + + return 0; +} + +static void pdom_detach_iommu(struct amd_iommu *iommu, + struct protection_domain *pdom) +{ + struct pdom_iommu_info *pdom_iommu_info; + + pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); + if (!pdom_iommu_info) + return; + + pdom_iommu_info->refcnt--; + if 
(pdom_iommu_info->refcnt == 0) { + xa_erase(&pdom->iommu_array, iommu->index); + kfree(pdom_iommu_info); + } +} + static int do_attach(struct iommu_dev_data *dev_data, struct protection_domain *domain) { @@ -2039,13 +2085,17 @@ static int do_attach(struct iommu_dev_data *dev_data, cfg->amd.nid = dev_to_node(dev_data->dev); /* Do reference counting */ - domain->dev_iommu[iommu->index] += 1; + ret = pdom_attach_iommu(iommu, domain); + if (ret) + return ret; /* Setup GCR3 table */ if (pdom_is_sva_capable(domain)) { ret = init_gcr3_table(dev_data, domain); - if (ret) + if (ret) { + pdom_detach_iommu(iommu, domain); return ret; + } } return ret; @@ -2071,7 +2121,7 @@ static void do_detach(struct iommu_dev_data *dev_data) list_del(&dev_data->list); /* decrease reference counters - needs to happen after the flushes */ - domain->dev_iommu[iommu->index] -= 1; + pdom_detach_iommu(iommu, domain); } /* @@ -2267,6 +2317,7 @@ static void protection_domain_init(struct protection_domain *domain, int nid) spin_lock_init(&domain->lock); INIT_LIST_HEAD(&domain->dev_list); INIT_LIST_HEAD(&domain->dev_data_list); + xa_init(&domain->iommu_array); domain->iop.pgtbl.cfg.amd.nid = nid; } -- Gitee From cb695cfee21f0df4f2709b6650d5dce6f07f15ef Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:49 +0000 Subject: [PATCH 069/278] iommu/amd: Remove unused amd_iommus variable ANBZ: #13617 commit b73c698fd5b4a00888ad5a77ad09d3f6b48baa2d upstream. protection_domain structure is updated to use xarray to track the IOMMUs attached to the domain. Now domain flush code is not using amd_iommus. Hence remove this unused variable. Signed-off-by: Vasant Hegde Reviewed-by: Joerg Roedel Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-6-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 6 ------ drivers/iommu/amd/init.c | 6 ------ 2 files changed, 12 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index f88f6e4e910f..f44de5f31625 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -877,12 +877,6 @@ extern struct list_head amd_iommu_pci_seg_list; */ extern struct list_head amd_iommu_list; -/* - * Array with pointers to each IOMMU struct - * The indices are referenced in the protection domains - */ -extern struct amd_iommu *amd_iommus[MAX_IOMMUS]; - /* * Structure defining one entry in the device table */ diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 36d47bd408e7..c205ac00130c 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -176,9 +176,6 @@ LIST_HEAD(amd_iommu_pci_seg_list); /* list of all PCI segments */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ -/* Array to assign indices to IOMMUs*/ -struct amd_iommu *amd_iommus[MAX_IOMMUS]; - /* Number of IOMMUs present in the system */ static int amd_iommus_present; @@ -1742,9 +1739,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h, return -ENOSYS; } - /* Index is fine - add IOMMU to the array */ - amd_iommus[iommu->index] = iommu; - /* * Copy data from ACPI table entry to the iommu struct */ -- Gitee From 98f488783f223c998b6ce5baf43081f224b491f4 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:50 +0000 Subject: [PATCH 070/278] iommu/amd: Do not detach devices in domain free path ANBZ: #13617 commit 07bbd660dbd6ff03907d9ddbdfe9deabbd18ac4d upstream. 
All devices attached to a protection domain must be freed before calling domain free. Hence do not try to free devices in the domain free path. Continue to throw a warning if pdom->dev_list is not empty so that any potential issues can be fixed. Signed-off-by: Vasant Hegde Reviewed-by: Joerg Roedel Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-7-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 464410620a6c..beec31e24dcf 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2288,21 +2288,6 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev) * *****************************************************************************/ -static void cleanup_domain(struct protection_domain *domain) -{ - struct iommu_dev_data *entry; - - lockdep_assert_held(&domain->lock); - - while (!list_empty(&domain->dev_list)) { - entry = list_first_entry(&domain->dev_list, - struct iommu_dev_data, list); - BUG_ON(!entry->domain); - do_detach(entry); - } - WARN_ON(!list_empty(&domain->dev_list)); -} - void protection_domain_free(struct protection_domain *domain) { WARN_ON(!list_empty(&domain->dev_list)); @@ -2504,16 +2489,7 @@ amd_iommu_domain_alloc_user(struct device *dev, u32 flags, void amd_iommu_domain_free(struct iommu_domain *dom) { - struct protection_domain *domain; - unsigned long flags; - - domain = to_pdomain(dom); - - spin_lock_irqsave(&domain->lock, flags); - - cleanup_domain(domain); - - spin_unlock_irqrestore(&domain->lock, flags); + struct protection_domain *domain = to_pdomain(dom); protection_domain_free(domain); } -- Gitee From 60a4c66699468bf4fc58f57dd3a153f1056fef13 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:51 +0000 Subject: [PATCH 071/278] iommu/amd: Reduce domain lock scope in attach device path ANBZ: #13617 commit d6b47dec368400a62d2b9d44c8e136fc15eac72c upstream. Currently the attach device path takes the protection domain lock followed by the dev_data lock. Most of the operations in this function are specific to device data, except pdom_attach_iommu(), which updates the protection domain structure. Hence reduce the scope of the protection domain lock. Note that this changes the locking order. Now it takes the device lock before taking the domain lock (group->mutex -> dev_data->lock -> pdom->lock). dev_data->lock is used only in the device attachment path, so changing the order is fine and will not create any issue. Finally, move the NUMA node assignment to pdom_attach_iommu().
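For illustration only (this sketch is not part of the patch; it is simplified from the hunks below), the attach path now nests the locks as described above, with pdom->lock taken only inside the pdom_attach_iommu()/pdom_detach_iommu() helpers:

	spin_lock(&dev_data->lock);		/* device-local state */
	ret = do_attach(dev_data, domain);	/* calls pdom_attach_iommu(), which takes
						 * pdom->lock internally via spin_lock_irqsave() */
	spin_unlock(&dev_data->lock);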
Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-8-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 52 ++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index beec31e24dcf..245f02d2a181 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2029,16 +2029,23 @@ static int pdom_attach_iommu(struct amd_iommu *iommu, struct protection_domain *pdom) { struct pdom_iommu_info *pdom_iommu_info, *curr; + struct io_pgtable_cfg *cfg = &pdom->iop.pgtbl.cfg; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&pdom->lock, flags); pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); if (pdom_iommu_info) { pdom_iommu_info->refcnt++; - return 0; + goto out_unlock; } pdom_iommu_info = kzalloc(sizeof(*pdom_iommu_info), GFP_ATOMIC); - if (!pdom_iommu_info) - return -ENOMEM; + if (!pdom_iommu_info) { + ret = -ENOMEM; + goto out_unlock; + } pdom_iommu_info->iommu = iommu; pdom_iommu_info->refcnt = 1; @@ -2047,43 +2054,52 @@ static int pdom_attach_iommu(struct amd_iommu *iommu, NULL, pdom_iommu_info, GFP_ATOMIC); if (curr) { kfree(pdom_iommu_info); - return -ENOSPC; + ret = -ENOSPC; + goto out_unlock; } - return 0; + /* Update NUMA Node ID */ + if (cfg->amd.nid == NUMA_NO_NODE) + cfg->amd.nid = dev_to_node(&iommu->dev->dev); + +out_unlock: + spin_unlock_irqrestore(&pdom->lock, flags); + return ret; } static void pdom_detach_iommu(struct amd_iommu *iommu, struct protection_domain *pdom) { struct pdom_iommu_info *pdom_iommu_info; + unsigned long flags; + + spin_lock_irqsave(&pdom->lock, flags); pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); - if (!pdom_iommu_info) + if (!pdom_iommu_info) { + spin_unlock_irqrestore(&pdom->lock, flags); return; + } pdom_iommu_info->refcnt--; if (pdom_iommu_info->refcnt == 0) { xa_erase(&pdom->iommu_array, iommu->index); kfree(pdom_iommu_info); } + + spin_unlock_irqrestore(&pdom->lock, flags); } static int do_attach(struct iommu_dev_data *dev_data, struct protection_domain *domain) { struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - struct io_pgtable_cfg *cfg = &domain->iop.pgtbl.cfg; int ret = 0; /* Update data structures */ dev_data->domain = domain; list_add(&dev_data->list, &domain->dev_list); - /* Update NUMA Node ID */ - if (cfg->amd.nid == NUMA_NO_NODE) - cfg->amd.nid = dev_to_node(dev_data->dev); - /* Do reference counting */ ret = pdom_attach_iommu(iommu, domain); if (ret) @@ -2105,12 +2121,15 @@ static void do_detach(struct iommu_dev_data *dev_data) { struct protection_domain *domain = dev_data->domain; struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + unsigned long flags; /* Clear DTE and flush the entry */ dev_update_dte(dev_data, false); /* Flush IOTLB and wait for the flushes to finish */ + spin_lock_irqsave(&domain->lock, flags); amd_iommu_domain_flush_all(domain); + spin_unlock_irqrestore(&domain->lock, flags); /* Clear GCR3 table */ if (pdom_is_sva_capable(domain)) @@ -2132,11 +2151,8 @@ static int attach_device(struct device *dev, struct protection_domain *domain) { struct iommu_dev_data *dev_data; - unsigned long flags; int ret = 0; - spin_lock_irqsave(&domain->lock, flags); - dev_data = dev_iommu_priv_get(dev); spin_lock(&dev_data->lock); @@ -2151,8 +2167,6 @@ static int attach_device(struct device *dev, out: spin_unlock(&dev_data->lock); - 
spin_unlock_irqrestore(&domain->lock, flags); - return ret; } @@ -2162,13 +2176,9 @@ static int attach_device(struct device *dev, static void detach_device(struct device *dev) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); - struct protection_domain *domain = dev_data->domain; struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - unsigned long flags; bool ppr = dev_data->ppr; - spin_lock_irqsave(&domain->lock, flags); - spin_lock(&dev_data->lock); /* @@ -2192,8 +2202,6 @@ static void detach_device(struct device *dev) out: spin_unlock(&dev_data->lock); - spin_unlock_irqrestore(&domain->lock, flags); - /* Remove IOPF handler */ if (ppr) amd_iommu_iopf_remove_device(iommu, dev_data); -- Gitee From ff994aa3a7d62f35deca3762786447cf2fe9a408 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:52 +0000 Subject: [PATCH 072/278] iommu/amd: Rearrange attach device code ANBZ: #13617 commit 4b18ef8491b06e353e8801705092cc292582cb7a upstream. attach_device() just takes the lock and calls do_attach(). There is no need for a separate function. Just move the do_attach() code into attach_device(). Similarly, move the do_detach() code into detach_device(). Signed-off-by: Vasant Hegde Reviewed-by: Joerg Roedel Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-9-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 91 ++++++++++++++++----------------- 1 file changed, 36 insertions(+), 55 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 245f02d2a181..c094b775b813 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2090,12 +2090,24 @@ static void pdom_detach_iommu(struct amd_iommu *iommu, spin_unlock_irqrestore(&pdom->lock, flags); } -static int do_attach(struct iommu_dev_data *dev_data, - struct protection_domain *domain) +/* + * If a device is not yet associated with a domain, this function makes the + * device visible in the domain + */ +static int attach_device(struct device *dev, + struct protection_domain *domain) { + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); int ret = 0; + spin_lock(&dev_data->lock); + + if (dev_data->domain != NULL) { + ret = -EBUSY; + goto out; + } + /* Update data structures */ dev_data->domain = domain; list_add(&dev_data->list, &domain->dev_list); @@ -2103,67 +2115,17 @@ static int do_attach(struct iommu_dev_data *dev_data, /* Do reference counting */ ret = pdom_attach_iommu(iommu, domain); if (ret) - return ret; + goto out; /* Setup GCR3 table */ if (pdom_is_sva_capable(domain)) { ret = init_gcr3_table(dev_data, domain); if (ret) { pdom_detach_iommu(iommu, domain); - return ret; + goto out; } } - return ret; -} - -static void do_detach(struct iommu_dev_data *dev_data) -{ - struct protection_domain *domain = dev_data->domain; - struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - unsigned long flags; - - /* Clear DTE and flush the entry */ - dev_update_dte(dev_data, false); - - /* Flush IOTLB and wait for the flushes to finish */ - spin_lock_irqsave(&domain->lock, flags); - amd_iommu_domain_flush_all(domain); - spin_unlock_irqrestore(&domain->lock, flags); - - /* Clear GCR3 table */ - if (pdom_is_sva_capable(domain)) - destroy_gcr3_table(dev_data, domain); - - /* Update data structures */ - dev_data->domain = NULL; - list_del(&dev_data->list); - - /* decrease reference counters - needs to happen after the
flushes */ - pdom_detach_iommu(iommu, domain); } -/* - * If a device is not yet associated with a domain, this function makes the - * device visible in the domain - */ -static int attach_device(struct device *dev, - struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data; - int ret = 0; - - dev_data = dev_iommu_priv_get(dev); - - spin_lock(&dev_data->lock); - - if (dev_data->domain != NULL) { - ret = -EBUSY; - goto out; - } - - ret = do_attach(dev_data, domain); - out: spin_unlock(&dev_data->lock); @@ -2177,7 +2139,9 @@ static void detach_device(struct device *dev) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + struct protection_domain *domain = dev_data->domain; bool ppr = dev_data->ppr; + unsigned long flags; spin_lock(&dev_data->lock); @@ -2197,7 +2161,24 @@ static void detach_device(struct device *dev) dev_data->ppr = false; } - do_detach(dev_data); + /* Clear DTE and flush the entry */ + dev_update_dte(dev_data, false); + + /* Flush IOTLB and wait for the flushes to finish */ + spin_lock_irqsave(&domain->lock, flags); + amd_iommu_domain_flush_all(domain); + spin_unlock_irqrestore(&domain->lock, flags); + + /* Clear GCR3 table */ + if (pdom_is_sva_capable(domain)) + destroy_gcr3_table(dev_data, domain); + + /* Update data structures */ + dev_data->domain = NULL; + list_del(&dev_data->list); + + /* decrease reference counters - needs to happen after the flushes */ + pdom_detach_iommu(iommu, domain); out: spin_unlock(&dev_data->lock); -- Gitee From b6daa905d1f599ca1c114a3a329b763a5828cf0f Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:53 +0000 Subject: [PATCH 073/278] iommu/amd: Convert dev_data lock from spinlock to mutex ANBZ: #13617 commit e843aedbeb82b17a5fe6172449bff133fc8b68a1 upstream. Currently the attach device path takes dev_data->spinlock, but as per the design the attach device path can sleep. Also, if the device is PRI capable, then it adds the device to the IOMMU fault handler queue, which takes a mutex. Hence, currently, PRI enablement is done outside the dev_data lock. Convert the dev_data lock from a spinlock to a mutex so that it follows the design and PRI enablement can be done properly.
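As a rough illustration only (not part of this patch), the point of the conversion is that the per-device lock may now be held across sleeping operations, e.g.:

	mutex_lock(&dev_data->mutex);
	/* amd_iommu_iopf_add_device() itself takes a mutex and may sleep; this is
	 * only legal because dev_data is no longer protected by a spinlock */
	ret = amd_iommu_iopf_add_device(iommu, dev_data);
	mutex_unlock(&dev_data->mutex);

(The actual move of the IOPF enablement under this lock happens in the next patch.)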
Signed-off-by: Vasant Hegde Reviewed-by: Joerg Roedel Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-10-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 2 +- drivers/iommu/amd/iommu.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index f44de5f31625..fdb0357e0bb9 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -836,7 +836,7 @@ struct devid_map { */ struct iommu_dev_data { /*Protect against attach/detach races */ - spinlock_t lock; + struct mutex mutex; struct list_head list; /* For domain->dev_list */ struct llist_node dev_data_list; /* For global dev_data_list */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index c094b775b813..28645c7b79b1 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -210,7 +210,7 @@ static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid) if (!dev_data) return NULL; - spin_lock_init(&dev_data->lock); + mutex_init(&dev_data->mutex); dev_data->devid = devid; ratelimit_default_init(&dev_data->rs); @@ -2101,7 +2101,7 @@ static int attach_device(struct device *dev, struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); int ret = 0; - spin_lock(&dev_data->lock); + mutex_lock(&dev_data->mutex); if (dev_data->domain != NULL) { ret = -EBUSY; @@ -2127,7 +2127,7 @@ static int attach_device(struct device *dev, } out: - spin_unlock(&dev_data->lock); + mutex_unlock(&dev_data->mutex); return ret; } @@ -2143,7 +2143,7 @@ static void detach_device(struct device *dev) bool ppr = dev_data->ppr; unsigned long flags; - spin_lock(&dev_data->lock); + mutex_lock(&dev_data->mutex); /* * First check if the device is still attached. It might already @@ -2181,7 +2181,7 @@ static void detach_device(struct device *dev) pdom_detach_iommu(iommu, domain); out: - spin_unlock(&dev_data->lock); + mutex_unlock(&dev_data->mutex); /* Remove IOPF handler */ if (ppr) @@ -2492,9 +2492,9 @@ static int blocked_domain_attach_device(struct iommu_domain *domain, detach_device(dev); /* Clear DTE and flush the entry */ - spin_lock(&dev_data->lock); + mutex_lock(&dev_data->mutex); dev_update_dte(dev_data, false); - spin_unlock(&dev_data->lock); + mutex_unlock(&dev_data->mutex); return 0; } -- Gitee From 3f4408869a5a1bd8cb205eaf705e8b7409e640f5 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:54 +0000 Subject: [PATCH 074/278] iommu/amd: Reorder attach device code ANBZ: #13617 commit 0b136493d3ffa1358783dcf5b9f866ceef2ff122 upstream. Ideally in attach device path, it should take dev_data lock before making changes to device data including IOPF enablement. So far dev_data was using spinlock and it was hitting lock order issue when it tries to enable IOPF. Hence Commit 526606b0a199 ("iommu/amd: Fix Invalid wait context issue") moved IOPF enablement outside dev_data->lock. Previous patch converted dev_data lock to mutex. Now its safe to call amd_iommu_iopf_add_device() with dev_data->mutex. Hence move back PCI device capability enablement (ATS, PRI, PASID) and IOPF enablement code inside the lock. Also in attach_device(), update 'dev_data->domain' at the end so that error handling becomes simple. 
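As an illustrative outline only (simplified from the diff below, error handling omitted), the resulting attach_device() does roughly:

	mutex_lock(&dev_data->mutex);
	ret = pdom_attach_iommu(iommu, domain);		/* per-IOMMU refcount */
	init_gcr3_table(dev_data, domain);		/* only if SVA capable */
	pdev_enable_caps(pdev);				/* ATS/PRI/PASID */
	amd_iommu_iopf_add_device(iommu, dev_data);	/* IOPF handler, may sleep */
	dev_data->domain = domain;			/* updated last */
	list_add(&dev_data->list, &domain->dev_list);
	dev_update_dte(dev_data, true);			/* update device table */
	mutex_unlock(&dev_data->mutex);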
Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-11-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 65 +++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 28645c7b79b1..e6771c47fece 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2099,6 +2099,7 @@ static int attach_device(struct device *dev, { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + struct pci_dev *pdev; int ret = 0; mutex_lock(&dev_data->mutex); @@ -2108,10 +2109,6 @@ static int attach_device(struct device *dev, goto out; } - /* Update data structures */ - dev_data->domain = domain; - list_add(&dev_data->list, &domain->dev_list); - /* Do reference counting */ ret = pdom_attach_iommu(iommu, domain); if (ret) @@ -2126,6 +2123,28 @@ static int attach_device(struct device *dev, } } + pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL; + if (pdev && pdom_is_sva_capable(domain)) { + pdev_enable_caps(pdev); + + /* + * Device can continue to function even if IOPF + * enablement failed. Hence in error path just + * disable device PRI support. + */ + if (amd_iommu_iopf_add_device(iommu, dev_data)) + pdev_disable_cap_pri(pdev); + } else if (pdev) { + pdev_enable_cap_ats(pdev); + } + + /* Update data structures */ + dev_data->domain = domain; + list_add(&dev_data->list, &domain->dev_list); + + /* Update device table */ + dev_update_dte(dev_data, true); + out: mutex_unlock(&dev_data->mutex); @@ -2140,7 +2159,6 @@ static void detach_device(struct device *dev) struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); struct protection_domain *domain = dev_data->domain; - bool ppr = dev_data->ppr; unsigned long flags; mutex_lock(&dev_data->mutex); @@ -2154,13 +2172,15 @@ static void detach_device(struct device *dev) if (WARN_ON(!dev_data->domain)) goto out; - if (ppr) { + /* Remove IOPF handler */ + if (dev_data->ppr) { iopf_queue_flush_dev(dev); - - /* Updated here so that it gets reflected in DTE */ - dev_data->ppr = false; + amd_iommu_iopf_remove_device(iommu, dev_data); } + if (dev_is_pci(dev)) + pdev_disable_caps(to_pci_dev(dev)); + /* Clear DTE and flush the entry */ dev_update_dte(dev_data, false); @@ -2182,14 +2202,6 @@ static void detach_device(struct device *dev) out: mutex_unlock(&dev_data->mutex); - - /* Remove IOPF handler */ - if (ppr) - amd_iommu_iopf_remove_device(iommu, dev_data); - - if (dev_is_pci(dev)) - pdev_disable_caps(to_pci_dev(dev)); - } static struct iommu_device *amd_iommu_probe_device(struct device *dev) @@ -2531,7 +2543,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct protection_domain *domain = to_pdomain(dom); struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); - struct pci_dev *pdev; int ret; /* @@ -2564,24 +2575,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, } #endif - pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL; - if (pdev && pdom_is_sva_capable(domain)) { - pdev_enable_caps(pdev); - - /* - * Device can continue to function even if IOPF - * enablement failed. Hence in error path just - * disable device PRI support. 
- */ - if (amd_iommu_iopf_add_device(iommu, dev_data)) - pdev_disable_cap_pri(pdev); - } else if (pdev) { - pdev_enable_cap_ats(pdev); - } - - /* Update device table */ - dev_update_dte(dev_data, true); - return ret; } -- Gitee From 48e31217591e9da4ef5b36775791a5af45b98ea4 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:55 +0000 Subject: [PATCH 075/278] iommu/amd: Add ops->release_domain ANBZ: #13617 commit a0e086b16eca3fe58d30595b0b08d31f8ec7f74e upstream. In the release path, remove the device from its existing domain and attach it to the blocked domain, so that all DMA from the device is blocked. Note that soon blocked_domain will support other ops like set_dev_pasid(), but release_domain supports only the attach_dev op. Hence add a separate 'release_domain' variable. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-12-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index e6771c47fece..d2653751a71c 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2537,6 +2537,14 @@ void amd_iommu_init_identity_domain(void) protection_domain_init(&identity_domain, NUMA_NO_NODE); } +/* Same as blocked domain except it supports only ops->attach_dev() */ +static struct iommu_domain release_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = blocked_domain_attach_device, + } +}; + static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { @@ -2916,6 +2924,7 @@ static int amd_iommu_dev_disable_feature(struct device *dev, const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .blocked_domain = &blocked_domain, + .release_domain = &release_domain, .identity_domain = &identity_domain.domain, .domain_alloc = amd_iommu_domain_alloc, .domain_alloc_user = amd_iommu_domain_alloc_user, -- Gitee From b2299be775fd2970480003799febb79fa1a89523 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:56 +0000 Subject: [PATCH 076/278] iommu/amd: Improve amd_iommu_release_device() ANBZ: #13617 commit 18f5a6b34b0696cd90b182e6af819bbfc6901c2a upstream. The previous patch added ops->release_domain support. The core will attach devices to release_domain->attach_dev() before calling this function, so devices are already detached from their current domain and attached to the blocked domain. This is mostly a dummy function now. Just throw a warning if the device is still attached to a domain.
Suggested-by: Jason Gunthorpe Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-13-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d2653751a71c..3193a76e01f4 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -77,8 +77,6 @@ DEFINE_IDA(pdom_ids); struct kmem_cache *amd_iommu_irq_cache; -static void detach_device(struct device *dev); - static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev); @@ -563,22 +561,6 @@ static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev) setup_aliases(iommu, dev); } -static void amd_iommu_uninit_device(struct device *dev) -{ - struct iommu_dev_data *dev_data; - - dev_data = dev_iommu_priv_get(dev); - if (!dev_data) - return; - - if (dev_data->domain) - detach_device(dev); - - /* - * We keep dev_data around for unplugged devices and reuse it when the - * device is re-plugged - not doing so would introduce a ton of races. - */ -} /**************************************************************************** * @@ -2258,17 +2240,14 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev) static void amd_iommu_release_device(struct device *dev) { - struct amd_iommu *iommu; - - if (!check_device(dev)) - return; + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); - iommu = rlookup_amd_iommu(dev); - if (!iommu) - return; + WARN_ON(dev_data->domain); - amd_iommu_uninit_device(dev); - iommu_completion_wait(iommu); + /* + * We keep dev_data around for unplugged devices and reuse it when the + * device is re-plugged - not doing so would introduce a ton of races. + */ } static struct iommu_group *amd_iommu_device_group(struct device *dev) -- Gitee From c619f345b3948a480e4cc667264283a9e7e5a3e5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 30 Oct 2024 21:20:45 -0300 Subject: [PATCH 077/278] vfio: Remove VFIO_TYPE1_NESTING_IOMMU ANBZ: #13617 commit 35890f85573c2ebbbf3491dc66f7ee2ad63055af upstream. This control causes the ARM SMMU drivers to choose a stage 2 implementation for the IO pagetable (vs the stage 1 usual default), however this choice has no significant visible impact to the VFIO user. Further qemu never implemented this and no other userspace user is known. The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide SMMU translation services to the guest operating system" however the rest of the API to set the guest table pointer for the stage 1 and manage invalidation was never completed, or at least never upstreamed, rendering this part useless dead code. Upstream has now settled on iommufd as the uAPI for controlling nested translation. Choosing the stage 2 implementation should be done by through the IOMMU_HWPT_ALLOC_NEST_PARENT flag during domain allocation. Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the enable_nesting iommu_domain_op. Just in-case there is some userspace using this continue to treat requesting it as a NOP, but do not advertise support any more. 
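For illustration only (simplified; container/group setup and error handling omitted, and the file-descriptor name is made up), userspace that still passes the old value behaves like this after the change:

	/* value 6, formerly VFIO_TYPE1_NESTING_IOMMU, now a reserved number */
	ioctl(container_fd, VFIO_CHECK_EXTENSION, 6);	/* now returns 0: no longer advertised */
	ioctl(container_fd, VFIO_SET_IOMMU, 6);		/* still accepted, behaves like VFIO_TYPE1v2_IOMMU */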
Acked-by: Alex Williamson Reviewed-by: Mostafa Saleh Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Tested-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon [ Shuai Xue: Conflicts: drivers/vfio/vfio_iommu_type1.c Anolis 73058c23d002 introduces VFIO_DMA_MAP_MMIO_DONT_PIN ] Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 16 ---------------- drivers/iommu/arm/arm-smmu/arm-smmu.c | 16 ---------------- drivers/iommu/iommu.c | 10 ---------- drivers/iommu/iommufd/vfio_compat.c | 7 +------ drivers/vfio/vfio_iommu_type1.c | 12 +----------- include/linux/iommu.h | 3 --- include/uapi/linux/vfio.h | 2 +- 7 files changed, 3 insertions(+), 63 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 892d72d10d2c..8b07f7459f59 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3393,21 +3393,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) return group; } -static int arm_smmu_enable_nesting(struct iommu_domain *domain) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - int ret = 0; - - mutex_lock(&smmu_domain->init_mutex); - if (smmu_domain->smmu) - ret = -EPERM; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_S2; - mutex_unlock(&smmu_domain->init_mutex); - - return ret; -} - static int arm_smmu_of_xlate(struct device *dev, const struct of_phandle_args *args) { @@ -3529,7 +3514,6 @@ static struct iommu_ops arm_smmu_ops = { .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, - .enable_nesting = arm_smmu_enable_nesting, .free = arm_smmu_domain_free_paging, } }; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 17d10685282c..ed60224e5d88 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1592,21 +1592,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) return group; } -static int arm_smmu_enable_nesting(struct iommu_domain *domain) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - int ret = 0; - - mutex_lock(&smmu_domain->init_mutex); - if (smmu_domain->smmu) - ret = -EPERM; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; - mutex_unlock(&smmu_domain->init_mutex); - - return ret; -} - static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks) { @@ -1690,7 +1675,6 @@ static struct iommu_ops arm_smmu_ops = { .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, - .enable_nesting = arm_smmu_enable_nesting, .set_pgtable_quirks = arm_smmu_set_pgtable_quirks, .free = arm_smmu_domain_free, } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b321470a8bd6..35ac75fbeee1 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2737,16 +2737,6 @@ static int __init iommu_init(void) } core_initcall(iommu_init); -int iommu_enable_nesting(struct iommu_domain *domain) -{ - if (domain->type != IOMMU_DOMAIN_UNMANAGED) - return -EINVAL; - if (!domain->ops->enable_nesting) - return -EINVAL; - return domain->ops->enable_nesting(domain); -} -EXPORT_SYMBOL_GPL(iommu_enable_nesting); - int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned 
long quirk) { diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index a3ad5f0b6c59..514aacd64009 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -291,12 +291,7 @@ static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx, case VFIO_DMA_CC_IOMMU: return iommufd_vfio_cc_iommu(ictx); - /* - * This is obsolete, and to be removed from VFIO. It was an incomplete - * idea that got merged. - * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ - */ - case VFIO_TYPE1_NESTING_IOMMU: + case __VFIO_RESERVED_TYPE1_NESTING_IOMMU: return 0; /* diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0bcc4dbf28c7..a693df1c0e09 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -72,7 +72,6 @@ struct vfio_iommu { uint64_t pgsize_bitmap; uint64_t num_non_pinned_groups; bool v2; - bool nesting; bool dirty_page_tracking; struct list_head emulated_iommu_groups; }; @@ -2246,12 +2245,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, goto out_free_domain; } - if (iommu->nesting) { - ret = iommu_enable_nesting(domain->domain); - if (ret) - goto out_domain; - } - ret = iommu_attach_group(domain->domain, group->iommu_group); if (ret) goto out_domain; @@ -2592,9 +2585,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) switch (arg) { case VFIO_TYPE1_IOMMU: break; - case VFIO_TYPE1_NESTING_IOMMU: - iommu->nesting = true; - fallthrough; + case __VFIO_RESERVED_TYPE1_NESTING_IOMMU: case VFIO_TYPE1v2_IOMMU: iommu->v2 = true; break; @@ -2689,7 +2680,6 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, switch (arg) { case VFIO_TYPE1_IOMMU: case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_NESTING_IOMMU: case VFIO_DMA_MAP_MMIO_DONT_PIN: case VFIO_UNMAP_ALL: return 1; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f3c9b1b616f7..88706d2a5027 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -650,7 +650,6 @@ struct iommu_ops { * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE, * including no-snoop TLPs on PCIe or other platform * specific mechanisms. - * @enable_nesting: Enable nesting * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @free: Release the domain after use. 
*/ @@ -678,7 +677,6 @@ struct iommu_domain_ops { dma_addr_t iova); bool (*enforce_cache_coherency)(struct iommu_domain *domain); - int (*enable_nesting)(struct iommu_domain *domain); int (*set_pgtable_quirks)(struct iommu_domain *domain, unsigned long quirks); @@ -862,7 +860,6 @@ extern void iommu_group_put(struct iommu_group *group); extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); -int iommu_enable_nesting(struct iommu_domain *domain); int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index b6b7a5a32d21..d334e6cc64ea 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -35,7 +35,7 @@ #define VFIO_EEH 5 /* Two-stage IOMMU */ -#define VFIO_TYPE1_NESTING_IOMMU 6 /* Implies v2 */ +#define __VFIO_RESERVED_TYPE1_NESTING_IOMMU 6 /* Implies v2 */ #define VFIO_SPAPR_TCE_v2_IOMMU 7 -- Gitee From 8416ec2c54b70fa3c8a8892d1bf437e9e7e75a6e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 30 Oct 2024 21:20:46 -0300 Subject: [PATCH 078/278] ACPICA: IORT: Update for revision E.f ANBZ: #13617 commit 1b8655bb8d977ca110436c1cd0ca957c19670c1e upstream. ACPICA commit c4f5c083d24df9ddd71d5782c0988408cf0fc1ab The IORT spec, Issue E.f (April 2024), adds a new CANWBS bit to the Memory Access Flag field in the Memory Access Properties table, mainly for a PCI Root Complex. This CANWBS defines the coherency of memory accesses to be not marked IOWB cacheable/shareable. Its value further implies the coherency impact from a pair of mismatched memory attributes (e.g. in a nested translation case): 0x0: Use of mismatched memory attributes for accesses made by this device may lead to a loss of coherency. 0x1: Coherency of accesses made by this device to locations in Conventional memory are ensured as follows, even if the memory attributes for the accesses presented by the device or provided by the SMMU are different from Inner and Outer Write-back cacheable, Shareable. Link: https://github.com/acpica/acpica/commit/c4f5c083 Acked-by: Rafael J. Wysocki Signed-off-by: Nicolin Chen Acked-by: Hanjun Guo Tested-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- include/acpi/actbl2.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index ba0aef9d0646..4ee7cbb4c22d 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -376,7 +376,7 @@ struct acpi_table_ccel { * IORT - IO Remapping Table * * Conforms to "IO Remapping Table System Software on ARM Platforms", - * Document number: ARM DEN 0049E.e, Sep 2022 + * Document number: ARM DEN 0049E.f, Apr 2024 * ******************************************************************************/ @@ -447,6 +447,7 @@ struct acpi_iort_memory_access { #define ACPI_IORT_MF_COHERENCY (1) #define ACPI_IORT_MF_ATTRIBUTES (1<<1) +#define ACPI_IORT_MF_CANWBS (1<<2) /* * IORT node specific subtables -- Gitee From 66cad78207b80018bf2b5ab6775e65029dfca9ea Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 30 Oct 2024 21:20:47 -0300 Subject: [PATCH 079/278] ACPI/IORT: Support CANWBS memory access flag ANBZ: #13617 commit 807404d66fcf898d4bcc6a3e3edb07ffd5b88400 upstream. 
The IORT spec, Issue E.f (April 2024), adds a new CANWBS bit to the Memory Access Flag field in the Memory Access Properties table, mainly for a PCI Root Complex. This CANWBS defines the coherency of memory accesses to be not marked IOWB cacheable/shareable. Its value further implies the coherency impact from a pair of mismatched memory attributes (e.g. in a nested translation case): 0x0: Use of mismatched memory attributes for accesses made by this device may lead to a loss of coherency. 0x1: Coherency of accesses made by this device to locations in Conventional memory are ensured as follows, even if the memory attributes for the accesses presented by the device or provided by the SMMU are different from Inner and Outer Write-back cacheable, Shareable. Note that the loss of coherency on a CANWBS-unsupported HW typically could occur to an SMMU that doesn't implement the S2FWB feature where additional cache flush operations would be required to prevent that from happening. Add a new ACPI_IORT_MF_CANWBS flag and set IOMMU_FWSPEC_PCI_RC_CANWBS upon the presence of this new flag. CANWBS and S2FWB are similar features, in that they both guarantee the VM can not violate coherency, however S2FWB can be bypassed by PCI No Snoop TLPs, while CANWBS cannot. Thus CANWBS meets the requirements to set IOMMU_CAP_ENFORCE_CACHE_COHERENCY. Architecturally ARM has expected that VFIO would disable No Snoop through PCI Config space, if this is done then the two would have the same protections. Tested-by: Nicolin Chen Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Acked-by: Hanjun Guo Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/acpi/arm64/iort.c | 13 +++++++++++++ include/linux/iommu.h | 2 ++ 2 files changed, 15 insertions(+) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index ab91b4f591c8..0a3bae2b0e65 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1218,6 +1218,17 @@ static bool iort_pci_rc_supports_ats(struct acpi_iort_node *node) return pci_rc->ats_attribute & ACPI_IORT_ATS_SUPPORTED; } +static bool iort_pci_rc_supports_canwbs(struct acpi_iort_node *node) +{ + struct acpi_iort_memory_access *memory_access; + struct acpi_iort_root_complex *pci_rc; + + pci_rc = (struct acpi_iort_root_complex *)node->node_data; + memory_access = + (struct acpi_iort_memory_access *)&pci_rc->memory_properties; + return memory_access->memory_flags & ACPI_IORT_MF_CANWBS; +} + static int iort_iommu_xlate(struct device *dev, struct acpi_iort_node *node, u32 streamid) { @@ -1335,6 +1346,8 @@ int iort_iommu_configure_id(struct device *dev, const u32 *id_in) fwspec = dev_iommu_fwspec_get(dev); if (fwspec && iort_pci_rc_supports_ats(node)) fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS; + if (fwspec && iort_pci_rc_supports_canwbs(node)) + fwspec->flags |= IOMMU_FWSPEC_PCI_RC_CANWBS; } else { node = iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT, iort_match_node_callback, dev); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 88706d2a5027..53f129799f8c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1024,6 +1024,8 @@ struct iommu_fwspec { /* ATS is supported */ #define IOMMU_FWSPEC_PCI_RC_ATS (1 << 0) +/* CANWBS is supported */ +#define IOMMU_FWSPEC_PCI_RC_CANWBS (1 << 1) /* * An iommu attach handle represents a relationship between an iommu domain -- Gitee From 
c83c0a32d385debe22efc9c51c8699664b3ea3ef Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 30 Oct 2024 21:20:48 -0300 Subject: [PATCH 080/278] iommu/arm-smmu-v3: Report IOMMU_CAP_ENFORCE_CACHE_COHERENCY for CANWBS ANBZ: #13617 commit e89573cf4a13ca4e314d03d56ac84c0ba2af464b upstream. HW with CANWBS is always cache coherent and ignores PCI No Snoop requests as well. This meets the requirement for IOMMU_CAP_ENFORCE_CACHE_COHERENCY, so let's return it. Implement the enforce_cache_coherency() op to reject attaching devices that don't have CANWBS. Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 31 +++++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 +++++ 2 files changed, 38 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 8b07f7459f59..204e0e4e50c1 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2293,6 +2293,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) case IOMMU_CAP_CACHE_COHERENCY: /* Assume that a coherent TCU implies coherent TBUs */ return master->smmu->features & ARM_SMMU_FEAT_COHERENCY; + case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: + return arm_smmu_master_canwbs(master); case IOMMU_CAP_NOEXEC: case IOMMU_CAP_DEFERRED_FLUSH: return true; @@ -2303,6 +2305,26 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) } } +static bool arm_smmu_enforce_cache_coherency(struct iommu_domain *domain) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_master_domain *master_domain; + unsigned long flags; + bool ret = true; + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + if (!arm_smmu_master_canwbs(master_domain->master)) { + ret = false; + break; + } + } + smmu_domain->enforce_cache_coherency = ret; + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + return ret; +} + struct arm_smmu_domain *arm_smmu_domain_alloc(void) { struct arm_smmu_domain *smmu_domain; @@ -2731,6 +2753,14 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, * one of them. 
*/ spin_lock_irqsave(&smmu_domain->devices_lock, flags); + if (smmu_domain->enforce_cache_coherency && + !arm_smmu_master_canwbs(master)) { + spin_unlock_irqrestore(&smmu_domain->devices_lock, + flags); + kfree(master_domain); + return -EINVAL; + } + if (state->ats_enabled) atomic_inc(&smmu_domain->nr_ats_masters); list_add(&master_domain->devices_elm, &smmu_domain->devices); @@ -3508,6 +3538,7 @@ static struct iommu_ops arm_smmu_ops = { .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = arm_smmu_attach_dev, + .enforce_cache_coherency = arm_smmu_enforce_cache_coherency, .set_dev_pasid = arm_smmu_s1_set_dev_pasid, .map_pages = arm_smmu_map_pages, .unmap_pages = arm_smmu_unmap_pages, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index e41fb067d06d..81d015ee0d1b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -811,6 +811,7 @@ struct arm_smmu_domain { /* List of struct arm_smmu_master_domain */ struct list_head devices; spinlock_t devices_lock; + bool enforce_cache_coherency : 1; struct mmu_notifier mmu_notifier; }; @@ -893,6 +894,12 @@ int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, struct arm_smmu_cmdq *cmdq); +static inline bool arm_smmu_master_canwbs(struct arm_smmu_master *master) +{ + return dev_iommu_fwspec_get(master->dev)->flags & + IOMMU_FWSPEC_PCI_RC_CANWBS; +} + #ifdef CONFIG_ARM_SMMU_V3_SVA bool arm_smmu_sva_supported(struct arm_smmu_device *smmu); bool arm_smmu_master_sva_supported(struct arm_smmu_master *master); -- Gitee From fb58c18dac478004bfa721cb88dd52912bd8503d Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 30 Oct 2024 21:20:49 -0300 Subject: [PATCH 081/278] iommu/arm-smmu-v3: Support IOMMU_GET_HW_INFO via struct arm_smmu_hw_info ANBZ: #13617 commit 6912ec91828b8d7f21b393befad1c36dadbcd751 upstream. For virtualization cases the IDR/IIDR/AIDR values of the actual SMMU instance need to be available to the VMM so it can construct an appropriate vSMMUv3 that reflects the correct HW capabilities. For userspace page tables these values are required to constrain the valid values within the CD table and the IOPTEs. The kernel does not sanitize these values. If building a VMM then userspace is required to only forward bits into a VM that it knows it can implement. Some bits will also require a VMM to detect if appropriate kernel support is available such as for ATS and BTM. Start a new file and kconfig for the advanced iommufd support. This lets it be compiled out for kernels that are not intended to support virtualization, and allows distros to leave it disabled until they are shipping a matching qemu too. 
Tested-by: Nicolin Chen Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/Kconfig | 9 +++++ drivers/iommu/arm/arm-smmu-v3/Makefile | 1 + .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 31 ++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 9 +++++ include/uapi/linux/iommufd.h | 35 +++++++++++++++++++ 6 files changed, 86 insertions(+) create mode 100644 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index f7432745d9bb..efa11561298d 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -416,6 +416,15 @@ config ARM_SMMU_V3_SVA Say Y here if your system supports SVA extensions such as PCIe PASID and PRI. +config ARM_SMMU_V3_IOMMUFD + bool "Enable IOMMUFD features for ARM SMMUv3 (EXPERIMENTAL)" + depends on IOMMUFD + help + Support for IOMMUFD features intended to support virtual machines + with accelerated virtual IOMMUs. + + Say Y here if you are doing development and testing on this feature. + config ARM_SMMU_V3_KUNIT_TEST tristate "KUnit tests for arm-smmu-v3 driver" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile index dc98c88b48c8..493a659cc66b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/Makefile +++ b/drivers/iommu/arm/arm-smmu-v3/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o arm_smmu_v3-y := arm-smmu-v3.o +arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_IOMMUFD) += arm-smmu-v3-iommufd.o arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o arm_smmu_v3-$(CONFIG_TEGRA241_CMDQV) += tegra241-cmdqv.o diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c new file mode 100644 index 000000000000..3d2671031c9b --- /dev/null +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ + +#include + +#include "arm-smmu-v3.h" + +void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct iommu_hw_info_arm_smmuv3 *info; + u32 __iomem *base_idr; + unsigned int i; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + base_idr = master->smmu->base + ARM_SMMU_IDR0; + for (i = 0; i <= 5; i++) + info->idr[i] = readl_relaxed(base_idr + i); + info->iidr = readl_relaxed(master->smmu->base + ARM_SMMU_IIDR); + info->aidr = readl_relaxed(master->smmu->base + ARM_SMMU_AIDR); + + *length = sizeof(*info); + *type = IOMMU_HW_INFO_TYPE_ARM_SMMUV3; + + return info; +} diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 204e0e4e50c1..cdac972319a9 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3521,6 +3521,7 @@ static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, + .hw_info = arm_smmu_hw_info, .domain_alloc_paging = arm_smmu_domain_alloc_paging, .domain_alloc_sva = arm_smmu_sva_domain_alloc, 
.domain_alloc_user = arm_smmu_domain_alloc_user, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 81d015ee0d1b..112c4f86610b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -81,6 +81,8 @@ struct arm_smmu_device; #define IIDR_REVISION GENMASK(15, 12) #define IIDR_IMPLEMENTER GENMASK(11, 0) +#define ARM_SMMU_AIDR 0x1C + #define ARM_SMMU_CR0 0x20 #define CR0_ATSCHK (1 << 4) #define CR0_CMDQEN (1 << 3) @@ -956,4 +958,11 @@ tegra241_cmdqv_probe(struct arm_smmu_device *smmu) return ERR_PTR(-ENODEV); } #endif /* CONFIG_TEGRA241_CMDQV */ + +#if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD) +void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type); +#else +#define arm_smmu_hw_info NULL +#endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */ + #endif /* _ARM_SMMU_V3_H */ diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index dee3e61d5271..49ea668be905 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -517,15 +517,50 @@ struct iommu_hw_info_vtd { __aligned_u64 ecap_reg; }; +/** + * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information + * (IOMMU_HW_INFO_TYPE_ARM_SMMUV3) + * + * @flags: Must be set to 0 + * @__reserved: Must be 0 + * @idr: Implemented features for ARM SMMU Non-secure programming interface + * @iidr: Information about the implementation and implementer of ARM SMMU, + * and architecture version supported + * @aidr: ARM SMMU architecture version + * + * For the details of @idr, @iidr and @aidr, please refer to the chapters + * from 6.3.1 to 6.3.6 in the SMMUv3 Spec. + * + * User space should read the underlying ARM SMMUv3 hardware information for + * the list of supported features. + * + * Note that these values reflect the raw HW capability, without any insight if + * any required kernel driver support is present. Bits may be set indicating the + * HW has functionality that is lacking kernel software support, such as BTM. If + * a VMM is using this information to construct emulated copies of these + * registers it should only forward bits that it knows it can support. + * + * In future, presence of required kernel support will be indicated in flags. + */ +struct iommu_hw_info_arm_smmuv3 { + __u32 flags; + __u32 __reserved; + __u32 idr[6]; + __u32 iidr; + __u32 aidr; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware * info * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type + * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE = 0, IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, + IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, }; /** -- Gitee From d8728e1da8c05ddde69fea69c99e6f244bba78a8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 30 Oct 2024 21:20:50 -0300 Subject: [PATCH 082/278] iommu/arm-smmu-v3: Implement IOMMU_HWPT_ALLOC_NEST_PARENT ANBZ: #13617 commit 874b87c7539f9de366954917a2e35f95c0a4a4f9 upstream. For SMMUv3 the parent must be a S2 domain, which can be composed into a IOMMU_DOMAIN_NESTED. In future the S2 parent will also need a VMID linked to the VIOMMU and even to KVM. 
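For illustration only (userspace side, heavily simplified; the ioctl, struct and flag come from the existing iommufd uAPI in include/uapi/linux/iommufd.h rather than from this patch, and dev_id/ioas_id are placeholders), a nesting parent would be requested roughly like this:

	struct iommu_hwpt_alloc cmd = {
		.size = sizeof(cmd),
		.flags = IOMMU_HWPT_ALLOC_NEST_PARENT,
		.dev_id = dev_id,		/* device bound through iommufd */
		.pt_id = ioas_id,		/* IOAS backing the stage 2 */
	};

	ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd);

On SMMUv3 this now selects an S2 domain, and the allocation fails with -EOPNOTSUPP when the SMMU lacks ARM_SMMU_FEAT_NESTING (see the hunk below).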
Reviewed-by: Nicolin Chen Tested-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Mostafa Saleh Reviewed-by: Donald Dutile Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index cdac972319a9..6f59c1c93bf9 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3115,7 +3115,7 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, { struct arm_smmu_master *master = dev_iommu_priv_get(dev); const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | - IOMMU_HWPT_ALLOC_PASID; + IOMMU_HWPT_ALLOC_NEST_PARENT; struct arm_smmu_domain *smmu_domain; int ret; @@ -3131,6 +3131,14 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, if (IS_ERR(smmu_domain)) return ERR_CAST(smmu_domain); + if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) { + if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING)) { + ret = -EOPNOTSUPP; + goto err_free; + } + smmu_domain->stage = ARM_SMMU_DOMAIN_S2; + } + smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED; smmu_domain->domain.ops = arm_smmu_ops.default_domain_ops; ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, flags); -- Gitee From 73bd98b7563d516eb2aed4e64bc7ced394361848 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 30 Oct 2024 21:20:51 -0300 Subject: [PATCH 083/278] iommu/arm-smmu-v3: Expose the arm_smmu_attach interface ANBZ: #13617 commit f6681abd413919472d8a142420b639a3a8d84673 upstream. The arm-smmuv3-iommufd.c file will need to call these functions too. Remove statics and put them in the header file. Remove the kunit visibility protections from arm_smmu_make_abort_ste() and arm_smmu_make_s2_domain_ste(). 
Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Mostafa Saleh Reviewed-by: Donald Dutile Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 22 ++++------------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 27 +++++++++++++++++---- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6f59c1c93bf9..9aabf4327d1b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1549,7 +1549,6 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, } } -VISIBLE_IF_KUNIT void arm_smmu_make_abort_ste(struct arm_smmu_ste *target) { memset(target, 0, sizeof(*target)); @@ -1632,7 +1631,6 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, } EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_cdtable_ste); -VISIBLE_IF_KUNIT void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, @@ -2505,8 +2503,8 @@ arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) } } -static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, - const struct arm_smmu_ste *target) +void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, + const struct arm_smmu_ste *target) { int i, j; struct arm_smmu_device *smmu = master->smmu; @@ -2671,16 +2669,6 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master, spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); } -struct arm_smmu_attach_state { - /* Inputs */ - struct iommu_domain *old_domain; - struct arm_smmu_master *master; - bool cd_needs_ats; - ioasid_t ssid; - /* Resulting state */ - bool ats_enabled; -}; - /* * Start the sequence to attach a domain to a master. The sequence contains three * steps: @@ -2701,8 +2689,8 @@ struct arm_smmu_attach_state { * new_domain can be a non-paging domain. In this case ATS will not be enabled, * and invalidations won't be tracked. */ -static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, - struct iommu_domain *new_domain) +int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, + struct iommu_domain *new_domain) { struct arm_smmu_master *master = state->master; struct arm_smmu_master_domain *master_domain; @@ -2784,7 +2772,7 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, * completes synchronizing the PCI device's ATC and finishes manipulating the * smmu_domain->devices list. 
*/ -static void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) +void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) { struct arm_smmu_master *master = state->master; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 112c4f86610b..1f280dcc652f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -830,21 +830,22 @@ struct arm_smmu_entry_writer_ops { void (*sync)(struct arm_smmu_entry_writer *writer); }; +void arm_smmu_make_abort_ste(struct arm_smmu_ste *target); +void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain, + bool ats_enabled); + #if IS_ENABLED(CONFIG_KUNIT) void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits); void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *cur, const __le64 *target); void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits); -void arm_smmu_make_abort_ste(struct arm_smmu_ste *target); void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu, struct arm_smmu_ste *target); void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, struct arm_smmu_master *master, bool ats_enabled, unsigned int s1dss); -void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, - struct arm_smmu_master *master, - struct arm_smmu_domain *smmu_domain, - bool ats_enabled); void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, struct arm_smmu_master *master, struct mm_struct *mm, u16 asid); @@ -902,6 +903,22 @@ static inline bool arm_smmu_master_canwbs(struct arm_smmu_master *master) IOMMU_FWSPEC_PCI_RC_CANWBS; } +struct arm_smmu_attach_state { + /* Inputs */ + struct iommu_domain *old_domain; + struct arm_smmu_master *master; + bool cd_needs_ats; + ioasid_t ssid; + /* Resulting state */ + bool ats_enabled; +}; + +int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, + struct iommu_domain *new_domain); +void arm_smmu_attach_commit(struct arm_smmu_attach_state *state); +void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, + const struct arm_smmu_ste *target); + #ifdef CONFIG_ARM_SMMU_V3_SVA bool arm_smmu_sva_supported(struct arm_smmu_device *smmu); bool arm_smmu_master_sva_supported(struct arm_smmu_master *master); -- Gitee From 937fb7a1d0bc5198fd278c0ae347190dfbf8a827 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:21 +0800 Subject: [PATCH 084/278] iommu/vt-d: Add domain_alloc_paging support ANBZ: #13617 commit 7c204426b81822ba768b3a5e5393b1489917fb84 upstream. Add the domain_alloc_paging callback for domain allocation using the iommu_paging_domain_alloc() interface. 
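A hedged sketch of a consumer of the new path (the helper below is illustrative, not part of this patch, and assumes <linux/iommu.h>): once domain_alloc_paging is wired up, iommu_paging_domain_alloc() on a VT-d system ends up in intel_iommu_domain_alloc_paging().

static int example_setup_paging_domain(struct device *dev)
{
	struct iommu_domain *domain;
	int ret;

	/* Allocates a paging domain sized/configured for this device. */
	domain = iommu_paging_domain_alloc(dev);
	if (IS_ERR(domain))
		return PTR_ERR(domain);

	ret = iommu_attach_device(domain, dev);
	if (ret)
		iommu_domain_free(domain);

	return ret;
}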
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9c07050e8e26..2b305f19d9d0 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4663,6 +4663,19 @@ static struct iommu_domain identity_domain = { }, }; +static struct iommu_domain *intel_iommu_domain_alloc_paging(struct device *dev) +{ + struct dmar_domain *dmar_domain; + bool first_stage; + + first_stage = first_level_by_default(0); + dmar_domain = paging_domain_alloc(dev, first_stage); + if (IS_ERR(dmar_domain)) + return ERR_CAST(dmar_domain); + + return &dmar_domain->domain; +} + const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, .release_domain = &blocking_domain, @@ -4672,6 +4685,7 @@ const struct iommu_ops intel_iommu_ops = { .domain_alloc = intel_iommu_domain_alloc, .domain_alloc_user = intel_iommu_domain_alloc_user, .domain_alloc_sva = intel_svm_domain_alloc, + .domain_alloc_paging = intel_iommu_domain_alloc_paging, .probe_device = intel_iommu_probe_device, .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, -- Gitee From dce04a92b1e7bd1f438362bc51897de98464e060 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:22 +0800 Subject: [PATCH 085/278] iommu/vt-d: Remove unused domain_alloc callback ANBZ: #13617 commit 9ecfcac1fe15e097cfd74663bcb8fbeaf3cc2910 upstream. With domain_alloc_paging callback supported, the legacy domain_alloc callback will never be used anymore. Remove it to avoid dead code. 
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 90 ------------------------------------- 1 file changed, 90 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2b305f19d9d0..a6956e1fe6da 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1466,27 +1466,6 @@ static bool first_level_by_default(unsigned int type) return type != IOMMU_DOMAIN_UNMANAGED; } -static struct dmar_domain *alloc_domain(unsigned int type) -{ - struct dmar_domain *domain; - - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - - domain->nid = NUMA_NO_NODE; - if (first_level_by_default(type)) - domain->use_first_level = true; - INIT_LIST_HEAD(&domain->devices); - INIT_LIST_HEAD(&domain->dev_pasids); - INIT_LIST_HEAD(&domain->cache_tags); - spin_lock_init(&domain->lock); - spin_lock_init(&domain->cache_lock); - xa_init(&domain->iommu_array); - - return domain; -} - int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info, *curr; @@ -1558,20 +1537,6 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) spin_unlock(&iommu->lock); } -static int guestwidth_to_adjustwidth(int gaw) -{ - int agaw; - int r = (gaw - 12) % 9; - - if (r == 0) - agaw = gaw; - else - agaw = gaw + 9 - r; - if (agaw > 64) - agaw = 64; - return agaw; -} - static void domain_exit(struct dmar_domain *domain) { if (domain->pgd) { @@ -3452,27 +3417,6 @@ void device_block_translation(struct device *dev) info->domain = NULL; } -static int md_domain_init(struct dmar_domain *domain, int guest_width) -{ - int adjust_width; - - /* calculate AGAW */ - domain->gaw = guest_width; - adjust_width = guestwidth_to_adjustwidth(guest_width); - domain->agaw = width_to_agaw(adjust_width); - - domain->iommu_coherency = false; - domain->iommu_superpage = 0; - domain->max_addr = 0; - - /* always allocate the top pgd */ - domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC); - if (!domain->pgd) - return -ENOMEM; - domain_flush_cache(domain, domain->pgd, PAGE_SIZE); - return 0; -} - static int blocking_domain_attach_dev(struct iommu_domain *domain, struct device *dev) { @@ -3559,39 +3503,6 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st return domain; } -static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) -{ - struct dmar_domain *dmar_domain; - struct iommu_domain *domain; - - switch (type) { - case IOMMU_DOMAIN_DMA: - case IOMMU_DOMAIN_UNMANAGED: - dmar_domain = alloc_domain(type); - if (!dmar_domain) { - pr_err("Can't allocate dmar_domain\n"); - return NULL; - } - if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { - pr_err("Domain initialization failed\n"); - domain_exit(dmar_domain); - return NULL; - } - - domain = &dmar_domain->domain; - domain->geometry.aperture_start = 0; - domain->geometry.aperture_end = - __DOMAIN_MAX_ADDR(dmar_domain->gaw); - domain->geometry.force_aperture = true; - - return domain; - default: - return NULL; - } - - return NULL; -} - static struct iommu_domain * intel_iommu_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *parent, @@ -4682,7 +4593,6 @@ const struct iommu_ops intel_iommu_ops = { .identity_domain = &identity_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, - .domain_alloc = 
intel_iommu_domain_alloc, .domain_alloc_user = intel_iommu_domain_alloc_user, .domain_alloc_sva = intel_svm_domain_alloc, .domain_alloc_paging = intel_iommu_domain_alloc_paging, -- Gitee From b21ab4209889764d7c015628bffccf76ffde6472 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:23 +0800 Subject: [PATCH 086/278] iommu/vt-d: Enhance compatibility check for paging domain attach ANBZ: #13617 commit a98db518dde246e01ead53617dc0a30d6aaa3752 upstream. The driver now supports domain_alloc_paging, ensuring that a valid device pointer is provided whenever a paging domain is allocated. Additionally, the dmar_domain attributes are set up at the time of allocation. Consistent with the established semantics in the IOMMU core, if a domain is attached to a device and found to be incompatible with the IOMMU hardware capabilities, the operation will return an -EINVAL error. This implicitly advises the caller to allocate a new domain for the device and attempt the domain attachment again. Rename prepare_domain_attach_device() to a more meaningful name. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 70 ++++++++++++------------------------ drivers/iommu/intel/iommu.h | 3 +- drivers/iommu/intel/nested.c | 2 +- drivers/iommu/intel/pasid.c | 28 +-------------- 4 files changed, 26 insertions(+), 77 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a6956e1fe6da..6fc32ca71449 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1618,7 +1618,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int translation = CONTEXT_TT_MULTI_LEVEL; struct dma_pte *pgd = domain->pgd; struct context_entry *context; - int agaw, ret; + int ret; pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1635,27 +1635,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain, copied_context_tear_down(iommu, context, bus, devfn); context_clear_entry(context); - context_set_domain_id(context, did); - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. - */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - ret = -ENOMEM; - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - goto out_unlock; - } - if (info && info->ats_supported) translation = CONTEXT_TT_DEV_IOTLB; else translation = CONTEXT_TT_MULTI_LEVEL; context_set_address_root(context, virt_to_phys(pgd)); - context_set_address_width(context, agaw); + context_set_address_width(context, domain->agaw); context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); @@ -1888,20 +1876,9 @@ static int domain_setup_first_level(struct intel_iommu *iommu, u32 pasid) { struct dma_pte *pgd = domain->pgd; - int agaw, level; - int flags = 0; + int level, flags = 0; - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. 
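The -EINVAL convention described above implies roughly the following caller pattern; the helper below is an illustrative sketch only (release of the previous domain on fallback is elided), not code from this patch:

static int example_attach_with_fallback(struct device *dev,
					struct iommu_domain **domainp)
{
	int ret = iommu_attach_device(*domainp, dev);

	if (ret != -EINVAL)
		return ret;

	/* Incompatible with this device's IOMMU: allocate a fresh domain. */
	*domainp = iommu_paging_domain_alloc(dev);
	if (IS_ERR(*domainp))
		return PTR_ERR(*domainp);

	return iommu_attach_device(*domainp, dev);
}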
- */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - return -ENOMEM; - } - - level = agaw_to_level(agaw); + level = agaw_to_level(domain->agaw); if (level != 4 && level != 5) return -EINVAL; @@ -3565,42 +3542,41 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) domain_exit(dmar_domain); } -int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev) +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; int addr_width; + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) + return -EPERM; + if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) return -EINVAL; if (domain->dirty_ops && !ssads_supported(iommu)) return -EINVAL; + if (dmar_domain->iommu_coherency != + iommu_paging_structure_coherency(iommu)) + return -EINVAL; + + if (dmar_domain->iommu_superpage != + iommu_superpage_capability(iommu, dmar_domain->use_first_level)) + return -EINVAL; + + if (dmar_domain->use_first_level && + (!sm_supported(iommu) || !ecap_flts(iommu->ecap))) + return -EINVAL; + /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) addr_width = cap_mgaw(iommu->cap); - if (dmar_domain->max_addr > (1LL << addr_width)) + if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw) return -EINVAL; - dmar_domain->gaw = addr_width; - - /* - * Knock out extra levels of page tables if necessary - */ - while (iommu->agaw < dmar_domain->agaw) { - struct dma_pte *pte; - - pte = dmar_domain->pgd; - if (dma_pte_present(pte)) { - dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); - iommu_free_page(pte); - } - dmar_domain->agaw--; - } if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && context_copied(iommu, info->bus, info->devfn)) @@ -3616,7 +3592,7 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, device_block_translation(dev); - ret = prepare_domain_attach_device(domain, dev); + ret = paging_domain_compatible(domain, dev); if (ret) return ret; @@ -4287,7 +4263,7 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (context_copied(iommu, info->bus, info->devfn)) return -EBUSY; - ret = prepare_domain_attach_device(domain, dev); + ret = paging_domain_compatible(domain, dev); if (ret) return ret; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 9d77f3b7e263..f96d2c825ae0 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1230,8 +1230,7 @@ void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); -int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev); +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); void domain_update_iommu_cap(struct dmar_domain *domain); int dmar_ir_support(void); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 433c58944401..96016bc40f94 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -40,7 +40,7 @@ static int intel_nested_attach_dev(struct 
iommu_domain *domain, * The s2_domain will be used in nested translation, hence needs * to ensure the s2_domain is compatible with this IOMMU. */ - ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev); + ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev); if (ret) { dev_err_ratelimited(dev, "s2 domain is not compatible\n"); return ret; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 2e5fa0a23299..53157e1194f4 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -345,25 +345,6 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return 0; } -/* - * Skip top levels of page tables for iommu which has less agaw - * than default. Unnecessary for PT mode. - */ -static int iommu_skip_agaw(struct dmar_domain *domain, - struct intel_iommu *iommu, - struct dma_pte **pgd) -{ - int agaw; - - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - *pgd = phys_to_virt(dma_pte_addr(*pgd)); - if (!dma_pte_present(*pgd)) - return -EINVAL; - } - - return agaw; -} - /* * Set up the scalable mode pasid entry for second only translation type. */ @@ -374,7 +355,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct pasid_entry *pte; struct dma_pte *pgd; u64 pgd_val; - int agaw; u16 did; /* @@ -388,12 +368,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, } pgd = domain->pgd; - agaw = iommu_skip_agaw(domain, iommu, &pgd); - if (agaw < 0) { - dev_err(dev, "Invalid domain page table\n"); - return -EINVAL; - } - pgd_val = virt_to_phys(pgd); did = domain_id_iommu(domain, iommu); @@ -412,7 +386,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_slptr(pte, pgd_val); - pasid_set_address_width(pte, agaw); + pasid_set_address_width(pte, domain->agaw); pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); -- Gitee From 68c56a4cdb722e7e6c05eb9c08fb35f261b648c2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:24 +0800 Subject: [PATCH 087/278] iommu/vt-d: Remove domain_update_iommu_cap() ANBZ: #13617 commit c376a3456d8bef43ec556a98c0a04c35086c2737 upstream. The attributes of a paging domain are initialized during the allocation process, and any attempt to attach a domain that is not compatible will result in a failure. Therefore, there is no need to update the domain attributes at the time of domain attachment. 
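As a rough sketch (field and helper names are taken from the surrounding patches, but the allocation-time code below is paraphrased, not quoted), the values that domain_update_iommu_cap() used to recompute on every attach are now derived once from the allocating device's IOMMU:

/* Inside paging domain allocation (simplified, illustrative only): */
domain->nid = dev_to_node(dev);
domain->use_first_level = first_stage;
domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);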
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 83 ------------------------------------- drivers/iommu/intel/iommu.h | 1 - 2 files changed, 84 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6fc32ca71449..c5f6ce83cae8 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -352,36 +352,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); } -static void domain_update_iommu_coherency(struct dmar_domain *domain) -{ - struct iommu_domain_info *info; - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - bool found = false; - unsigned long i; - - domain->iommu_coherency = true; - xa_for_each(&domain->iommu_array, i, info) { - found = true; - if (!iommu_paging_structure_coherency(info->iommu)) { - domain->iommu_coherency = false; - break; - } - } - if (found) - return; - - /* No hardware attached; use lowest common denominator */ - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (!iommu_paging_structure_coherency(iommu)) { - domain->iommu_coherency = false; - break; - } - } - rcu_read_unlock(); -} - static int domain_update_iommu_superpage(struct dmar_domain *domain, struct intel_iommu *skip) { @@ -412,29 +382,6 @@ static int domain_update_iommu_superpage(struct dmar_domain *domain, return fls(mask); } -static int domain_update_device_node(struct dmar_domain *domain) -{ - struct device_domain_info *info; - int nid = NUMA_NO_NODE; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) { - /* - * There could possibly be multiple device numa nodes as devices - * within the same domain may sit behind different IOMMUs. There - * isn't perfect answer in such situation, so we select first - * come first served policy. - */ - nid = dev_to_node(info->dev); - if (nid != NUMA_NO_NODE) - break; - } - spin_unlock_irqrestore(&domain->lock, flags); - - return nid; -} - /* Return the super pagesize bitmap if supported. */ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) { @@ -452,34 +399,6 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) return bitmap; } -/* Some capabilities may be different across iommus */ -void domain_update_iommu_cap(struct dmar_domain *domain) -{ - domain_update_iommu_coherency(domain); - domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); - - /* - * If RHSA is missing, we should default to the device numa domain - * as fall back. - */ - if (domain->nid == NUMA_NO_NODE) - domain->nid = domain_update_device_node(domain); - - /* - * First-level translation restricts the input-address to a - * canonical address (i.e., address bits 63:N have the same - * value as address bit [N-1], where N is 48-bits with 4-level - * paging and 57-bits with 5-level paging). Hence, skip bit - * [N-1]. 
- */ - if (domain->use_first_level) - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); - else - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); - - domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); -} - struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, u8 devfn, int alloc) { @@ -1505,7 +1424,6 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) ret = xa_err(curr) ? : -EBUSY; goto err_clear; } - domain_update_iommu_cap(domain); spin_unlock(&iommu->lock); return 0; @@ -1531,7 +1449,6 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) clear_bit(info->did, iommu->domain_ids); xa_erase(&domain->iommu_array, iommu->seq_id); domain->nid = NUMA_NO_NODE; - domain_update_iommu_cap(domain); kfree(info); } spin_unlock(&iommu->lock); diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index f96d2c825ae0..adc0643b0d65 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1231,7 +1231,6 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); -void domain_update_iommu_cap(struct dmar_domain *domain); int dmar_ir_support(void); -- Gitee From c14576308b4c135576021cc574f87e83c3b207e0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:25 +0800 Subject: [PATCH 088/278] iommu/vt-d: Remove domain_update_iommu_superpage() ANBZ: #13617 commit 5bdd86ec5d19060f63c00fff3b081c887242a37a upstream. The requirement for consistent super page support across all the IOMMU hardware in the system has been removed. In the past, if a new IOMMU was hot-added and lacked consistent super page capability, the hot-add process would be aborted. However, with the updated attachment semantics, it is now permissible for the super page capability to vary among different IOMMU hardware units. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 39 +------------------------------------ 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index c5f6ce83cae8..7c0aa20f29ba 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -352,36 +352,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); } -static int domain_update_iommu_superpage(struct dmar_domain *domain, - struct intel_iommu *skip) -{ - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - int mask = 0x3; - - if (!intel_iommu_superpage) - return 0; - - /* set iommu_superpage to the smallest common denominator */ - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (iommu != skip) { - if (domain && domain->use_first_level) { - if (!cap_fl1gp_support(iommu->cap)) - mask = 0x1; - } else { - mask &= cap_super_page_val(iommu->cap); - } - - if (!mask) - break; - } - } - rcu_read_unlock(); - - return fls(mask); -} - /* Return the super pagesize bitmap if supported. 
*/ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) { @@ -2617,20 +2587,13 @@ int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) static int intel_iommu_add(struct dmar_drhd_unit *dmaru) { - int sp, ret; struct intel_iommu *iommu = dmaru->iommu; + int ret; ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); if (ret) goto out; - sp = domain_update_iommu_superpage(NULL, iommu) - 1; - if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { - pr_warn("%s: Doesn't support large page.\n", - iommu->name); - return -ENXIO; - } - /* * Disable translation if already enabled prior to OS handover. */ -- Gitee From e3d3b49d1bc8b294a8dfa653e9d815454546fe38 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:26 +0800 Subject: [PATCH 089/278] iommu/vt-d: Refactor first_level_by_default() ANBZ: #13617 commit ed56de8a9e90d9771c4517fb9f2daac8282269ba upstream. The first stage page table is compatible across host and guest kernels. Therefore, this driver uses the first stage page table as the default for paging domains. The helper first_level_by_default() determines the feasibility of using the first stage page table based on a global policy. This policy requires consistency in scalable mode and first stage translation capability among all iommu units. However, this is unnecessary as domain allocation, attachment, and removal operations are performed on a per-device basis. The domain type (IOMMU_DOMAIN_DMA vs. IOMMU_DOMAIN_UNMANAGED) should not be a factor in determining the first stage page table usage. Both types are for paging domains, and there's no fundamental difference between them. The driver should not be aware of this distinction unless the core specifies allocation flags that require special handling. Convert first_level_by_default() from global to per-iommu and remove the 'type' input. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7c0aa20f29ba..a5c6ce4b6a8c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1341,18 +1341,17 @@ static void free_dmar_iommu(struct intel_iommu *iommu) * Check and return whether first level is used by default for * DMA translation. */ -static bool first_level_by_default(unsigned int type) +static bool first_level_by_default(struct intel_iommu *iommu) { /* Only SL is available in legacy mode */ - if (!scalable_mode_support()) + if (!sm_supported(iommu)) return false; /* Only level (either FL or SL) is available, just use it */ - if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) - return intel_cap_flts_sanity(); + if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap)) + return ecap_flts(iommu->ecap); - /* Both levels are available, decide it based on domain type */ - return type != IOMMU_DOMAIN_UNMANAGED; + return true; } int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) @@ -3181,7 +3180,7 @@ int __init intel_iommu_init(void) * the virtual and physical IOMMU page-tables. 
*/ if (cap_caching_mode(iommu->cap) && - !first_level_by_default(IOMMU_DOMAIN_DMA)) { + !first_level_by_default(iommu)) { pr_info_once("IOMMU batching disallowed due to virtualization\n"); iommu_set_dma_strict(); } @@ -4432,10 +4431,12 @@ static struct iommu_domain identity_domain = { static struct iommu_domain *intel_iommu_domain_alloc_paging(struct device *dev) { + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; struct dmar_domain *dmar_domain; bool first_stage; - first_stage = first_level_by_default(0); + first_stage = first_level_by_default(iommu); dmar_domain = paging_domain_alloc(dev, first_stage); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); -- Gitee From 6a06c33d3eb4af965633db87836db836b3a7bb58 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:27 +0800 Subject: [PATCH 090/278] iommu/vt-d: Refine intel_iommu_domain_alloc_user() ANBZ: #13617 commit 621838c718a81ba3bfb8e0f941bc0133166bc534 upstream. The domain_alloc_user ops should always allocate a guest-compatible page table unless specific allocation flags are specified. Currently, IOMMU_HWPT_ALLOC_NEST_PARENT and IOMMU_HWPT_ALLOC_DIRTY_TRACKING require special handling, as both require hardware support for scalable mode and second-stage translation. In such cases, the driver should select a second-stage page table for the paging domain. Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a5c6ce4b6a8c..7d02125a6f12 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3370,6 +3370,7 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, struct intel_iommu *iommu = info->iommu; struct dmar_domain *dmar_domain; struct iommu_domain *domain; + bool first_stage; /* Must be NESTING domain */ if (parent) { @@ -3386,8 +3387,20 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, if (user_data || (dirty_tracking && !ssads_supported(iommu))) return ERR_PTR(-EOPNOTSUPP); - /* Do not use first stage for user domain translation. */ - dmar_domain = paging_domain_alloc(dev, false); + /* + * Always allocate the guest compatible page table unless + * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING + * is specified. + */ + if (nested_parent || dirty_tracking) { + if (!sm_supported(iommu) || !ecap_slts(iommu->ecap)) + return ERR_PTR(-EOPNOTSUPP); + first_stage = false; + } else { + first_stage = first_level_by_default(iommu); + } + + dmar_domain = paging_domain_alloc(dev, first_stage); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); domain = &dmar_domain->domain; -- Gitee From 43977b96ee405aa783f9e72a6e2e73c73a19212d Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 4 Nov 2024 09:40:28 +0800 Subject: [PATCH 091/278] iommu/vt-d: Use PCI_DEVID() macro ANBZ: #13617 commit 2a32309345ef2977ceb4fba81600066474ac8581 upstream. The macro PCI_DEVID() can be used instead of compose it manually. 
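For reference, the macro (as defined in include/linux/pci.h) and the open-coded expression it replaces produce the same value:

#define PCI_DEVID(bus, devfn)	((((u16)(bus)) << 8) | (devfn))

u16 sid_open_coded = (info->bus << 8) | info->devfn;		/* manual composition */
u16 sid_macro      = PCI_DEVID(info->bus, info->devfn);	/* same value, clearer intent */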
Signed-off-by: Jinjie Ruan Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240829021011.4135618-1-ruanjinjie@huawei.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 4 ++-- drivers/iommu/intel/irq_remapping.c | 4 ++-- drivers/iommu/intel/pasid.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7d02125a6f12..3d143570b4dd 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1464,7 +1464,7 @@ static void copied_context_tear_down(struct intel_iommu *iommu, if (did_old < cap_ndoms(iommu->cap)) { iommu->flush.flush_context(iommu, did_old, - (((u16)bus) << 8) | devfn, + PCI_DEVID(bus, devfn), DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); iommu->flush.flush_iotlb(iommu, did_old, 0, 0, @@ -1485,7 +1485,7 @@ static void context_present_cache_flush(struct intel_iommu *iommu, u16 did, { if (cap_caching_mode(iommu->cap)) { iommu->flush.flush_context(iommu, 0, - (((u16)bus) << 8) | devfn, + PCI_DEVID(bus, devfn), DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index fcf56e262b07..cd2ca9fbbdd2 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -312,7 +312,7 @@ static int set_ioapic_sid(struct irte *irte, int apic) for (i = 0; i < MAX_IO_APICS; i++) { if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) { - sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn; + sid = PCI_DEVID(ir_ioapic[i].bus, ir_ioapic[i].devfn); break; } } @@ -337,7 +337,7 @@ static int set_hpet_sid(struct irte *irte, u8 id) for (i = 0; i < MAX_HPET_TBS; i++) { if (ir_hpet[i].iommu && ir_hpet[i].id == id) { - sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn; + sid = PCI_DEVID(ir_hpet[i].bus, ir_hpet[i].devfn); break; } } diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 53157e1194f4..7ef157615e0f 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -220,7 +220,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, if (pci_dev_is_disconnected(to_pci_dev(dev))) return; - sid = info->bus << 8 | info->devfn; + sid = PCI_DEVID(info->bus, info->devfn); qdep = info->ats_qdep; pfsid = info->pfsid; -- Gitee From 4645ee5901f4485d0bb7bdc90fd10c53943ca8ca Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 4 Nov 2024 09:40:29 +0800 Subject: [PATCH 092/278] iommu/vt-d: Increase buffer size for device name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #13617 commit 6d8bac098e6e44b9a2768e38e9bf77626dc591b7 upstream. GCC is not happy with the current code, e.g.: .../iommu/intel/dmar.c:1063:9: note: ‘sprintf’ output between 6 and 15 bytes into a destination of size 13 1063 | sprintf(iommu->name, "dmar%d", iommu->seq_id); When `make W=1` is supplied, this prevents kernel building. Fix it by increasing the buffer size for device name and use sizeoF() instead of hard coded constants. 
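The arithmetic behind the warning: "dmar" is 4 bytes, a non-negative int can print as up to 10 digits, and the NUL terminator needs 1 more, giving the 15-byte worst case GCC reports against the old 13-byte buffer. A bounded write looks like:

unsigned char name[16];	/* new size; 13 was too small in the worst case */

/* snprintf() with sizeof() can never overrun name[], whatever seq_id is. */
snprintf(name, sizeof(name), "dmar%d", iommu->seq_id);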
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241014104529.4025937-1-andriy.shevchenko@linux.intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/dmar.c | 2 +- drivers/iommu/intel/iommu.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 26554a3a2fa1..2e1890710da0 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1117,7 +1117,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) err = iommu->seq_id; goto error; } - sprintf(iommu->name, "dmar%d", iommu->seq_id); + snprintf(iommu->name, sizeof(iommu->name), "dmar%d", iommu->seq_id); err = map_iommu(iommu, drhd); if (err) { diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index adc0643b0d65..e1c7edba096d 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -720,7 +720,7 @@ struct intel_iommu { int msagaw; /* max sagaw of this iommu */ unsigned int irq, pr_irq, perf_irq; u16 segment; /* PCI segment# */ - unsigned char name[13]; /* Device Name */ + unsigned char name[16]; /* Device Name */ #ifdef CONFIG_INTEL_IOMMU unsigned long *domain_ids; /* bitmap of domains */ -- Gitee From dfc5a5cf7f53b2013fb1224c930cfd8c11278b97 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 4 Nov 2024 09:40:30 +0800 Subject: [PATCH 093/278] iommu/vt-d: Remove unused dmar_msi_read ANBZ: #13617 commit 95e2eaf5b91aae6c3a433cd7882733bd806fa3c8 upstream. dmar_msi_read() has been unused since 2022 in commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture") Remove it. (dmar_msi_write still exists and is used once). Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20241022002702.302728-1-linux@treblig.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/dmar.c | 13 ------------- include/linux/dmar.h | 1 - 2 files changed, 14 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 2e1890710da0..b7625e2386c3 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1952,19 +1952,6 @@ void dmar_msi_write(int irq, struct msi_msg *msg) raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } -void dmar_msi_read(int irq, struct msi_msg *msg) -{ - struct intel_iommu *iommu = irq_get_handler_data(irq); - int reg = dmar_msi_reg(iommu, irq); - unsigned long flag; - - raw_spin_lock_irqsave(&iommu->register_lock, flag); - msg->data = readl(iommu->reg + reg + 4); - msg->address_lo = readl(iommu->reg + reg + 8); - msg->address_hi = readl(iommu->reg + reg + 12); - raw_spin_unlock_irqrestore(&iommu->register_lock, flag); -} - static int dmar_fault_do_one(struct intel_iommu *iommu, int type, u8 fault_reason, u32 pasid, u16 source_id, unsigned long long addr) diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 33c72667ece2..8d7455aa567b 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -301,7 +301,6 @@ static inline void dmar_copy_shared_irte(struct irte *dst, struct irte *src) struct irq_data; extern void dmar_msi_unmask(struct irq_data *data); extern void dmar_msi_mask(struct irq_data *data); -extern void dmar_msi_read(int irq, struct msi_msg *msg); extern void dmar_msi_write(int irq, struct msi_msg *msg); extern int dmar_set_interrupt(struct intel_iommu *iommu); extern irqreturn_t dmar_fault(int irq, void *dev_id); -- Gitee From f62e05ac3f5b36a8175d01d3597a5f26201777c6 Mon Sep 17 
00:00:00 2001 From: Yi Liu Date: Mon, 4 Nov 2024 09:40:31 +0800 Subject: [PATCH 094/278] iommu/vt-d: Drop s1_pgtbl from dmar_domain ANBZ: #13617 commit 4f178e07a2e62293311e2107786a843d6290d77a upstream. dmar_domian has stored the s1_cfg which includes the s1_pgtbl info, so no need to store s1_pgtbl, hence drop it. Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241025143339.2328991-1-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.h | 2 -- drivers/iommu/intel/nested.c | 1 - drivers/iommu/intel/pasid.c | 3 +-- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index e1c7edba096d..12907befc9cc 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -653,8 +653,6 @@ struct dmar_domain { struct { /* parent page table which the user domain is nested on */ struct dmar_domain *s2_domain; - /* user page table pointer (in GPA) */ - unsigned long s1_pgtbl; /* page table attributes */ struct iommu_hwpt_vtd_s1 s1_cfg; /* link to parent domain siblings */ diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 96016bc40f94..989ca5cc04eb 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -162,7 +162,6 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, domain->use_first_level = true; domain->s2_domain = s2_domain; - domain->s1_pgtbl = vtd.pgtbl_addr; domain->s1_cfg = vtd; domain->domain.ops = &intel_nested_domain_ops; domain->domain.type = IOMMU_DOMAIN_NESTED; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 7ef157615e0f..7e76062a7ad2 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -560,7 +560,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain) { struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; - pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl; struct dmar_domain *s2_domain = domain->s2_domain; u16 did = domain_id_iommu(domain, iommu); struct dma_pte *pgd = s2_domain->pgd; @@ -611,7 +610,7 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) pasid_set_flpm(pte, 1); - pasid_set_flptr(pte, (uintptr_t)s1_gpgd); + pasid_set_flptr(pte, s1_cfg->pgtbl_addr); if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { pasid_set_sre(pte); -- Gitee From e500f48dc77c00e26755bdd77c070d3f072439f6 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 4 Nov 2024 09:40:34 +0800 Subject: [PATCH 095/278] iommu/vt-d: Separate page request queue from SVM ANBZ: #13617 commit 4d5440957641fb5652cbef8df6183baf473cab6b upstream. IO page faults are no longer dependent on CONFIG_INTEL_IOMMU_SVM. Move all Page Request Queue (PRQ) functions that handle prq events to a new file in drivers/iommu/intel/prq.c. The page_req_des struct is now declared in drivers/iommu/intel/prq.c. No functional changes are intended. This is a preparation patch to enable the use of IO page faults outside the SVM/PASID use cases. 
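For quick reference, the pure renames this move introduces (visible in the diff below) are:

  intel_svm_enable_prq()    -> intel_iommu_enable_prq()
  intel_svm_finish_prq()    -> intel_iommu_finish_prq()
  intel_svm_page_response() -> intel_iommu_page_response()
  intel_drain_pasid_prq()   -> intel_iommu_drain_pasid_prq()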
Signed-off-by: Joel Granados Link: https://lore.kernel.org/r/20241015-jag-iopfv8-v4-1-b696ca89ba29@kernel.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel [Jay Chen conflict from drivers/iommu/intel/iommu.c because of upstream patch that contain check patch error: out_tear_down ] Signed-off-by: Jay Chen --- drivers/iommu/intel/Makefile | 2 +- drivers/iommu/intel/iommu.c | 20 +- drivers/iommu/intel/iommu.h | 14 +- drivers/iommu/intel/prq.c | 410 +++++++++++++++++++++++++++++++++++ drivers/iommu/intel/svm.c | 397 --------------------------------- 5 files changed, 424 insertions(+), 419 deletions(-) create mode 100644 drivers/iommu/intel/prq.c diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index c8beb0281559..d3bb0798092d 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DMAR_TABLE) += dmar.o -obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o cache.o +obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o cache.o prq.o obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3d143570b4dd..68fe6bda92f8 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1329,12 +1329,10 @@ static void free_dmar_iommu(struct intel_iommu *iommu) /* free context mapping */ free_context_table(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM if (pasid_supported(iommu)) { if (ecap_prs(iommu->ecap)) - intel_svm_finish_prq(iommu); + intel_iommu_finish_prq(iommu); } -#endif } /* @@ -2194,19 +2192,18 @@ static int __init init_dmars(void) iommu_flush_write_buffer(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { /* * Call dmar_alloc_hwirq() with dmar_global_lock held, * could cause possible lock race condition. 
*/ up_write(&dmar_global_lock); - ret = intel_svm_enable_prq(iommu); + ret = intel_iommu_enable_prq(iommu); down_write(&dmar_global_lock); if (ret) goto free_iommu; } -#endif + ret = dmar_set_interrupt(iommu); if (ret) goto free_iommu; @@ -2619,13 +2616,12 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) intel_iommu_init_qi(iommu); iommu_flush_write_buffer(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { - ret = intel_svm_enable_prq(iommu); + ret = intel_iommu_enable_prq(iommu); if (ret) goto disable_iommu; } -#endif + ret = dmar_set_interrupt(iommu); if (ret) goto disable_iommu; @@ -4133,7 +4129,7 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); intel_pasid_tear_down_entry(iommu, dev, pasid, false); - intel_drain_pasid_prq(dev, pasid); + intel_iommu_drain_pasid_prq(dev, pasid); } static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, @@ -4476,9 +4472,7 @@ const struct iommu_ops intel_iommu_ops = { .def_domain_type = device_def_domain_type, .remove_dev_pasid = intel_iommu_remove_dev_pasid, .pgsize_bitmap = SZ_4K, -#ifdef CONFIG_INTEL_IOMMU_SVM - .page_response = intel_svm_page_response, -#endif + .page_response = intel_iommu_page_response, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = intel_iommu_attach_device, .set_dev_pasid = intel_iommu_set_dev_pasid, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 12907befc9cc..da9bb3683ebb 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -728,12 +728,10 @@ struct intel_iommu { struct iommu_flush flush; #endif -#ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ unsigned long prq_seq_number; struct completion prq_complete; -#endif struct iopf_queue *iopf_queue; unsigned char iopfq_name[16]; /* Synchronization between fault report and iommu device release. 
*/ @@ -1274,18 +1272,18 @@ void intel_context_flush_present(struct device_domain_info *info, struct context_entry *context, u16 did, bool affect_domains); +int intel_iommu_enable_prq(struct intel_iommu *iommu); +int intel_iommu_finish_prq(struct intel_iommu *iommu); +void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg); +void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid); + #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); -int intel_svm_enable_prq(struct intel_iommu *iommu); -int intel_svm_finish_prq(struct intel_iommu *iommu); -void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, - struct iommu_page_response *msg); struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct mm_struct *mm); -void intel_drain_pasid_prq(struct device *dev, u32 pasid); #else static inline void intel_svm_check(struct intel_iommu *iommu) {} -static inline void intel_drain_pasid_prq(struct device *dev, u32 pasid) {} static inline struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct mm_struct *mm) { diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c new file mode 100644 index 000000000000..edda5da8ba15 --- /dev/null +++ b/drivers/iommu/intel/prq.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 Intel Corporation + * + * Originally split from drivers/iommu/intel/svm.c + */ + +#include +#include + +#include "iommu.h" +#include "pasid.h" +#include "../iommu-pages.h" +#include "trace.h" + +/* Page request queue descriptor */ +struct page_req_dsc { + union { + struct { + u64 type:8; + u64 pasid_present:1; + u64 rsvd:7; + u64 rid:16; + u64 pasid:20; + u64 exe_req:1; + u64 pm_req:1; + u64 rsvd2:10; + }; + u64 qw_0; + }; + union { + struct { + u64 rd_req:1; + u64 wr_req:1; + u64 lpig:1; + u64 prg_index:9; + u64 addr:52; + }; + u64 qw_1; + }; + u64 qw_2; + u64 qw_3; +}; + +/** + * intel_iommu_drain_pasid_prq - Drain page requests and responses for a pasid + * @dev: target device + * @pasid: pasid for draining + * + * Drain all pending page requests and responses related to @pasid in both + * software and hardware. This is supposed to be called after the device + * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB + * and DevTLB have been invalidated. + * + * It waits until all pending page requests for @pasid in the page fault + * queue are completed by the prq handling thread. Then follow the steps + * described in VT-d spec CH7.10 to drain all page requests and page + * responses pending in the hardware. + */ +void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) +{ + struct device_domain_info *info; + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct qi_desc desc[3]; + struct pci_dev *pdev; + int head, tail; + u16 sid, did; + int qdep; + + info = dev_iommu_priv_get(dev); + if (WARN_ON(!info || !dev_is_pci(dev))) + return; + + if (!info->pri_enabled) + return; + + iommu = info->iommu; + domain = info->domain; + pdev = to_pci_dev(dev); + sid = PCI_DEVID(info->bus, info->devfn); + did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID; + + qdep = pci_ats_queue_depth(pdev); + + /* + * Check and wait until all pending page requests in the queue are + * handled by the prq handling thread. 
+ */ +prq_retry: + reinit_completion(&iommu->prq_complete); + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct page_req_dsc *req; + + req = &iommu->prq[head / sizeof(*req)]; + if (!req->pasid_present || req->pasid != pasid) { + head = (head + sizeof(*req)) & PRQ_RING_MASK; + continue; + } + + wait_for_completion(&iommu->prq_complete); + goto prq_retry; + } + + iopf_queue_flush_dev(dev); + + /* + * Perform steps described in VT-d spec CH7.10 to drain page + * requests and responses in hardware. + */ + memset(desc, 0, sizeof(desc)); + desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | + QI_IWD_FENCE | + QI_IWD_TYPE; + desc[1].qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) | + QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | + QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(info->pfsid); +qi_retry: + reinit_completion(&iommu->prq_complete); + qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + wait_for_completion(&iommu->prq_complete); + goto qi_retry; + } +} + +static bool is_canonical_address(u64 addr) +{ + int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); + long saddr = (long)addr; + + return (((saddr << shift) >> shift) == saddr); +} + +static void handle_bad_prq_event(struct intel_iommu *iommu, + struct page_req_dsc *req, int result) +{ + struct qi_desc desc = { }; + + pr_err("%s: Invalid page request: %08llx %08llx\n", + iommu->name, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + + if (!req->lpig) + return; + + desc.qw0 = QI_PGRP_PASID(req->pasid) | + QI_PGRP_DID(req->rid) | + QI_PGRP_PASID_P(req->pasid_present) | + QI_PGRP_RESP_CODE(result) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(req->prg_index) | + QI_PGRP_LPIG(req->lpig); + + qi_submit_sync(iommu, &desc, 1, 0); +} + +static int prq_to_iommu_prot(struct page_req_dsc *req) +{ + int prot = 0; + + if (req->rd_req) + prot |= IOMMU_FAULT_PERM_READ; + if (req->wr_req) + prot |= IOMMU_FAULT_PERM_WRITE; + if (req->exe_req) + prot |= IOMMU_FAULT_PERM_EXEC; + if (req->pm_req) + prot |= IOMMU_FAULT_PERM_PRIV; + + return prot; +} + +static void intel_prq_report(struct intel_iommu *iommu, struct device *dev, + struct page_req_dsc *desc) +{ + struct iopf_fault event = { }; + + /* Fill in event data for device specific processing */ + event.fault.type = IOMMU_FAULT_PAGE_REQ; + event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; + event.fault.prm.pasid = desc->pasid; + event.fault.prm.grpid = desc->prg_index; + event.fault.prm.perm = prq_to_iommu_prot(desc); + + if (desc->lpig) + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + if (desc->pasid_present) { + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; + } + + iommu_report_device_fault(dev, &event); +} + +static irqreturn_t prq_event_thread(int irq, void *d) +{ + struct intel_iommu *iommu = d; + struct page_req_dsc *req; + int head, tail, handled; + struct device *dev; + u64 address; + + /* + * Clear PPR bit before reading head/tail registers, to ensure that + * we get a new interrupt if needed. 
+ */ + writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); + + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + handled = (head != tail); + while (head != tail) { + req = &iommu->prq[head / sizeof(*req)]; + address = (u64)req->addr << VTD_PAGE_SHIFT; + + if (unlikely(!req->pasid_present)) { + pr_err("IOMMU: %s: Page request without PASID\n", + iommu->name); +bad_req: + handle_bad_prq_event(iommu, req, QI_RESP_INVALID); + goto prq_advance; + } + + if (unlikely(!is_canonical_address(address))) { + pr_err("IOMMU: %s: Address is not canonical\n", + iommu->name); + goto bad_req; + } + + if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { + pr_err("IOMMU: %s: Page request in Privilege Mode\n", + iommu->name); + goto bad_req; + } + + if (unlikely(req->exe_req && req->rd_req)) { + pr_err("IOMMU: %s: Execution request not supported\n", + iommu->name); + goto bad_req; + } + + /* Drop Stop Marker message. No need for a response. */ + if (unlikely(req->lpig && !req->rd_req && !req->wr_req)) + goto prq_advance; + + /* + * If prq is to be handled outside iommu driver via receiver of + * the fault notifiers, we skip the page response here. + */ + mutex_lock(&iommu->iopf_lock); + dev = device_rbtree_find(iommu, req->rid); + if (!dev) { + mutex_unlock(&iommu->iopf_lock); + goto bad_req; + } + + intel_prq_report(iommu, dev, req); + trace_prq_report(iommu, dev, req->qw_0, req->qw_1, + req->qw_2, req->qw_3, + iommu->prq_seq_number++); + mutex_unlock(&iommu->iopf_lock); +prq_advance: + head = (head + sizeof(*req)) & PRQ_RING_MASK; + } + + dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + + /* + * Clear the page request overflow bit and wake up all threads that + * are waiting for the completion of this handling. 
+ */ + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", + iommu->name); + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + if (head == tail) { + iopf_queue_discard_partial(iommu->iopf_queue); + writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); + pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", + iommu->name); + } + } + + if (!completion_done(&iommu->prq_complete)) + complete(&iommu->prq_complete); + + return IRQ_RETVAL(handled); +} + +int intel_iommu_enable_prq(struct intel_iommu *iommu) +{ + struct iopf_queue *iopfq; + int irq, ret; + + iommu->prq = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, PRQ_ORDER); + if (!iommu->prq) { + pr_warn("IOMMU: %s: Failed to allocate page request queue\n", + iommu->name); + return -ENOMEM; + } + + irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); + if (irq <= 0) { + pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", + iommu->name); + ret = -EINVAL; + goto free_prq; + } + iommu->pr_irq = irq; + + snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), + "dmar%d-iopfq", iommu->seq_id); + iopfq = iopf_queue_alloc(iommu->iopfq_name); + if (!iopfq) { + pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); + ret = -ENOMEM; + goto free_hwirq; + } + iommu->iopf_queue = iopfq; + + snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); + + ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, + iommu->prq_name, iommu); + if (ret) { + pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", + iommu->name); + goto free_iopfq; + } + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + + init_completion(&iommu->prq_complete); + + return 0; + +free_iopfq: + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; +free_hwirq: + dmar_free_hwirq(irq); + iommu->pr_irq = 0; +free_prq: + iommu_free_pages(iommu->prq, PRQ_ORDER); + iommu->prq = NULL; + + return ret; +} + +int intel_iommu_finish_prq(struct intel_iommu *iommu) +{ + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); + + if (iommu->pr_irq) { + free_irq(iommu->pr_irq, iommu); + dmar_free_hwirq(iommu->pr_irq); + iommu->pr_irq = 0; + } + + if (iommu->iopf_queue) { + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; + } + + iommu_free_pages(iommu->prq, PRQ_ORDER); + iommu->prq = NULL; + + return 0; +} + +void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + u8 bus = info->bus, devfn = info->devfn; + struct iommu_fault_page_request *prm; + struct qi_desc desc; + bool pasid_present; + bool last_page; + u16 sid; + + prm = &evt->fault.prm; + sid = PCI_DEVID(bus, devfn); + pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + + desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | + QI_PGRP_PASID_P(pasid_present) | + QI_PGRP_RESP_CODE(msg->code) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); + desc.qw2 = 0; + desc.qw3 = 0; + + 
qi_submit_sync(iommu, &desc, 1, 0); +} diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 078d1e32a24e..3cc43a958b4d 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -25,92 +25,6 @@ #include "../iommu-pages.h" #include "trace.h" -static irqreturn_t prq_event_thread(int irq, void *d); - -int intel_svm_enable_prq(struct intel_iommu *iommu) -{ - struct iopf_queue *iopfq; - int irq, ret; - - iommu->prq = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, PRQ_ORDER); - if (!iommu->prq) { - pr_warn("IOMMU: %s: Failed to allocate page request queue\n", - iommu->name); - return -ENOMEM; - } - - irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); - if (irq <= 0) { - pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", - iommu->name); - ret = -EINVAL; - goto free_prq; - } - iommu->pr_irq = irq; - - snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), - "dmar%d-iopfq", iommu->seq_id); - iopfq = iopf_queue_alloc(iommu->iopfq_name); - if (!iopfq) { - pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); - ret = -ENOMEM; - goto free_hwirq; - } - iommu->iopf_queue = iopfq; - - snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); - - ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, - iommu->prq_name, iommu); - if (ret) { - pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", - iommu->name); - goto free_iopfq; - } - dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); - - init_completion(&iommu->prq_complete); - - return 0; - -free_iopfq: - iopf_queue_free(iommu->iopf_queue); - iommu->iopf_queue = NULL; -free_hwirq: - dmar_free_hwirq(irq); - iommu->pr_irq = 0; -free_prq: - iommu_free_pages(iommu->prq, PRQ_ORDER); - iommu->prq = NULL; - - return ret; -} - -int intel_svm_finish_prq(struct intel_iommu *iommu) -{ - dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); - - if (iommu->pr_irq) { - free_irq(iommu->pr_irq, iommu); - dmar_free_hwirq(iommu->pr_irq); - iommu->pr_irq = 0; - } - - if (iommu->iopf_queue) { - iopf_queue_free(iommu->iopf_queue); - iommu->iopf_queue = NULL; - } - - iommu_free_pages(iommu->prq, PRQ_ORDER); - iommu->prq = NULL; - - return 0; -} - void intel_svm_check(struct intel_iommu *iommu) { if (!pasid_supported(iommu)) @@ -240,317 +154,6 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, return ret; } -/* Page request queue descriptor */ -struct page_req_dsc { - union { - struct { - u64 type:8; - u64 pasid_present:1; - u64 rsvd:7; - u64 rid:16; - u64 pasid:20; - u64 exe_req:1; - u64 pm_req:1; - u64 rsvd2:10; - }; - u64 qw_0; - }; - union { - struct { - u64 rd_req:1; - u64 wr_req:1; - u64 lpig:1; - u64 prg_index:9; - u64 addr:52; - }; - u64 qw_1; - }; - u64 qw_2; - u64 qw_3; -}; - -static bool is_canonical_address(u64 addr) -{ - int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); - long saddr = (long) addr; - - return (((saddr << shift) >> shift) == saddr); -} - -/** - * intel_drain_pasid_prq - Drain page requests and responses for a pasid - * @dev: target device - * @pasid: pasid for draining - * - * Drain all pending page requests and responses related to @pasid in both - * software and hardware. 
This is supposed to be called after the device - * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB - * and DevTLB have been invalidated. - * - * It waits until all pending page requests for @pasid in the page fault - * queue are completed by the prq handling thread. Then follow the steps - * described in VT-d spec CH7.10 to drain all page requests and page - * responses pending in the hardware. - */ -void intel_drain_pasid_prq(struct device *dev, u32 pasid) -{ - struct device_domain_info *info; - struct dmar_domain *domain; - struct intel_iommu *iommu; - struct qi_desc desc[3]; - struct pci_dev *pdev; - int head, tail; - u16 sid, did; - int qdep; - - info = dev_iommu_priv_get(dev); - if (WARN_ON(!info || !dev_is_pci(dev))) - return; - - if (!info->pri_enabled) - return; - - iommu = info->iommu; - domain = info->domain; - pdev = to_pci_dev(dev); - sid = PCI_DEVID(info->bus, info->devfn); - did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID; - qdep = pci_ats_queue_depth(pdev); - - /* - * Check and wait until all pending page requests in the queue are - * handled by the prq handling thread. - */ -prq_retry: - reinit_completion(&iommu->prq_complete); - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - while (head != tail) { - struct page_req_dsc *req; - - req = &iommu->prq[head / sizeof(*req)]; - if (!req->pasid_present || req->pasid != pasid) { - head = (head + sizeof(*req)) & PRQ_RING_MASK; - continue; - } - - wait_for_completion(&iommu->prq_complete); - goto prq_retry; - } - - iopf_queue_flush_dev(dev); - - /* - * Perform steps described in VT-d spec CH7.10 to drain page - * requests and responses in hardware. - */ - memset(desc, 0, sizeof(desc)); - desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | - QI_IWD_FENCE | - QI_IWD_TYPE; - desc[1].qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | - QI_EIOTLB_TYPE; - desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) | - QI_DEV_EIOTLB_SID(sid) | - QI_DEV_EIOTLB_QDEP(qdep) | - QI_DEIOTLB_TYPE | - QI_DEV_IOTLB_PFSID(info->pfsid); -qi_retry: - reinit_completion(&iommu->prq_complete); - qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); - if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { - wait_for_completion(&iommu->prq_complete); - goto qi_retry; - } -} - -static int prq_to_iommu_prot(struct page_req_dsc *req) -{ - int prot = 0; - - if (req->rd_req) - prot |= IOMMU_FAULT_PERM_READ; - if (req->wr_req) - prot |= IOMMU_FAULT_PERM_WRITE; - if (req->exe_req) - prot |= IOMMU_FAULT_PERM_EXEC; - if (req->pm_req) - prot |= IOMMU_FAULT_PERM_PRIV; - - return prot; -} - -static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, - struct page_req_dsc *desc) -{ - struct iopf_fault event = { }; - - /* Fill in event data for device specific processing */ - event.fault.type = IOMMU_FAULT_PAGE_REQ; - event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; - event.fault.prm.pasid = desc->pasid; - event.fault.prm.grpid = desc->prg_index; - event.fault.prm.perm = prq_to_iommu_prot(desc); - - if (desc->lpig) - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - if (desc->pasid_present) { - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; - } - - iommu_report_device_fault(dev, &event); -} - -static void handle_bad_prq_event(struct intel_iommu *iommu, - struct page_req_dsc *req, int result) -{ - struct qi_desc desc = 
{ }; - - pr_err("%s: Invalid page request: %08llx %08llx\n", - iommu->name, ((unsigned long long *)req)[0], - ((unsigned long long *)req)[1]); - - if (!req->lpig) - return; - - desc.qw0 = QI_PGRP_PASID(req->pasid) | - QI_PGRP_DID(req->rid) | - QI_PGRP_PASID_P(req->pasid_present) | - QI_PGRP_RESP_CODE(result) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(req->prg_index) | - QI_PGRP_LPIG(req->lpig); - - qi_submit_sync(iommu, &desc, 1, 0); -} - -static irqreturn_t prq_event_thread(int irq, void *d) -{ - struct intel_iommu *iommu = d; - struct page_req_dsc *req; - int head, tail, handled; - struct device *dev; - u64 address; - - /* - * Clear PPR bit before reading head/tail registers, to ensure that - * we get a new interrupt if needed. - */ - writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); - - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - handled = (head != tail); - while (head != tail) { - req = &iommu->prq[head / sizeof(*req)]; - address = (u64)req->addr << VTD_PAGE_SHIFT; - - if (unlikely(!req->pasid_present)) { - pr_err("IOMMU: %s: Page request without PASID\n", - iommu->name); -bad_req: - handle_bad_prq_event(iommu, req, QI_RESP_INVALID); - goto prq_advance; - } - - if (unlikely(!is_canonical_address(address))) { - pr_err("IOMMU: %s: Address is not canonical\n", - iommu->name); - goto bad_req; - } - - if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { - pr_err("IOMMU: %s: Page request in Privilege Mode\n", - iommu->name); - goto bad_req; - } - - if (unlikely(req->exe_req && req->rd_req)) { - pr_err("IOMMU: %s: Execution request not supported\n", - iommu->name); - goto bad_req; - } - - /* Drop Stop Marker message. No need for a response. */ - if (unlikely(req->lpig && !req->rd_req && !req->wr_req)) - goto prq_advance; - - /* - * If prq is to be handled outside iommu driver via receiver of - * the fault notifiers, we skip the page response here. - */ - mutex_lock(&iommu->iopf_lock); - dev = device_rbtree_find(iommu, req->rid); - if (!dev) { - mutex_unlock(&iommu->iopf_lock); - goto bad_req; - } - - intel_svm_prq_report(iommu, dev, req); - trace_prq_report(iommu, dev, req->qw_0, req->qw_1, - req->qw_2, req->qw_3, - iommu->prq_seq_number++); - mutex_unlock(&iommu->iopf_lock); -prq_advance: - head = (head + sizeof(*req)) & PRQ_RING_MASK; - } - - dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); - - /* - * Clear the page request overflow bit and wake up all threads that - * are waiting for the completion of this handling. 
- */ - if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { - pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", - iommu->name); - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - if (head == tail) { - iopf_queue_discard_partial(iommu->iopf_queue); - writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); - pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", - iommu->name); - } - } - - if (!completion_done(&iommu->prq_complete)) - complete(&iommu->prq_complete); - - return IRQ_RETVAL(handled); -} - -void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, - struct iommu_page_response *msg) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; - u8 bus = info->bus, devfn = info->devfn; - struct iommu_fault_page_request *prm; - struct qi_desc desc; - bool pasid_present; - bool last_page; - u16 sid; - - prm = &evt->fault.prm; - sid = PCI_DEVID(bus, devfn); - pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - - desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | - QI_PGRP_PASID_P(pasid_present) | - QI_PGRP_RESP_CODE(msg->code) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); - desc.qw2 = 0; - desc.qw3 = 0; - - qi_submit_sync(iommu, &desc, 1, 0); -} - static void intel_svm_domain_free(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); -- Gitee From f53c5ad4c76454cba1682ad55aec60949ccd1226 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 4 Nov 2024 09:40:35 +0800 Subject: [PATCH 096/278] iommu/vt-d: Remove the pasid present check in prq_event_thread ANBZ: #13617 commit 9f831c16c69e4f2f042919c8409be300fd9f4248 upstream. PASID is not strictly needed when handling a PRQ event; remove the check for the pasid present bit in the request. This change was not included in the creation of prq.c to emphasize the change in capability checks when handing PRQ events. 
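For illustration, the report path already copes with requests that carry no PASID: the PASID-related flags are only set when the descriptor says one is present, so a request without a PASID is simply reported against the RID. A minimal sketch of that flag handling (following the report helper shown earlier in this series):

	struct iopf_fault event = { };

	event.fault.type = IOMMU_FAULT_PAGE_REQ;
	event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
	/* Only tag the fault with a PASID when the request has one. */
	if (desc->pasid_present) {
		event.fault.prm.pasid = desc->pasid;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID |
					 IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
	}
	iommu_report_device_fault(dev, &event);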
Signed-off-by: Klaus Jensen Reviewed-by: Kevin Tian Signed-off-by: Joel Granados Link: https://lore.kernel.org/r/20241015-jag-iopfv8-v4-2-b696ca89ba29@kernel.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/prq.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c index edda5da8ba15..621cd26504b3 100644 --- a/drivers/iommu/intel/prq.c +++ b/drivers/iommu/intel/prq.c @@ -222,20 +222,14 @@ static irqreturn_t prq_event_thread(int irq, void *d) req = &iommu->prq[head / sizeof(*req)]; address = (u64)req->addr << VTD_PAGE_SHIFT; - if (unlikely(!req->pasid_present)) { - pr_err("IOMMU: %s: Page request without PASID\n", + if (unlikely(!is_canonical_address(address))) { + pr_err("IOMMU: %s: Address is not canonical\n", iommu->name); bad_req: handle_bad_prq_event(iommu, req, QI_RESP_INVALID); goto prq_advance; } - if (unlikely(!is_canonical_address(address))) { - pr_err("IOMMU: %s: Address is not canonical\n", - iommu->name); - goto bad_req; - } - if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { pr_err("IOMMU: %s: Page request in Privilege Mode\n", iommu->name); -- Gitee From 5ae90935810fcb074ef85b7cab1be9d85e80e75e Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 4 Nov 2024 09:40:36 +0800 Subject: [PATCH 097/278] iommu/vt-d: Move IOMMU_IOPF into INTEL_IOMMU ANBZ: #13617 commit 140f5dedbb9ec8d74d24fab5714ffccac0d8b1d3 upstream. Move IOMMU_IOPF from under INTEL_IOMMU_SVM into INTEL_IOMMU. This certifies that the core intel iommu utilizes the IOPF library functions, independent of the INTEL_IOMMU_SVM config. Signed-off-by: Joel Granados Link: https://lore.kernel.org/r/20241015-jag-iopfv8-v4-3-b696ca89ba29@kernel.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index e329d264e56d..d84eea30fa38 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -14,6 +14,7 @@ config INTEL_IOMMU depends on PCI_MSI && ACPI && (X86 || IA64) select IOMMU_API select IOMMU_IOVA + select IOMMU_IOPF select IOMMUFD_DRIVER if IOMMUFD select NEED_DMA_MAP_STATE select DMAR_TABLE @@ -50,7 +51,6 @@ config INTEL_IOMMU_SVM depends on X86_64 select MMU_NOTIFIER select IOMMU_SVA - select IOMMU_IOPF help Shared Virtual Memory (SVM) provides a facility for devices to access DMA resources through process address space by -- Gitee From 01d82f9e1293b0d286b69dba86ee663fb097cece Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 4 Nov 2024 09:40:37 +0800 Subject: [PATCH 098/278] iommufd: Enable PRI when doing the iommufd_hwpt_alloc ANBZ: #13617 commit cbeb1b7eee2ffa63b4d809c0a3f068309df933fa upstream. Add IOMMU_HWPT_FAULT_ID_VALID as part of the valid flags when doing an iommufd_hwpt_alloc allowing the use of an iommu fault allocation (iommu_fault_alloc) with the IOMMU_HWPT_ALLOC ioctl. 
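As an illustration of the resulting uAPI flow, a user could now tie a fault queue to a newly allocated hw_pagetable roughly as below. This is a hedged sketch: it assumes the iommufd fault-queue uAPI (IOMMU_FAULT_QUEUE_ALLOC and the fault_id field of struct iommu_hwpt_alloc) from the same series and omits error handling.

	struct iommu_fault_alloc fault = { .size = sizeof(fault) };
	struct iommu_hwpt_alloc hwpt = { .size = sizeof(hwpt) };

	ioctl(iommufd, IOMMU_FAULT_QUEUE_ALLOC, &fault);

	hwpt.flags = IOMMU_HWPT_FAULT_ID_VALID;	/* accepted after this patch */
	hwpt.dev_id = dev_id;
	hwpt.pt_id = ioas_id;
	hwpt.fault_id = fault.out_fault_id;
	ioctl(iommufd, IOMMU_HWPT_ALLOC, &hwpt);

	/* Page faults for this hwpt are then read from fault.out_fault_fd. */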
Reviewed-by: Kevin Tian Signed-off-by: Joel Granados Link: https://lore.kernel.org/r/20241015-jag-iopfv8-v4-4-b696ca89ba29@kernel.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 3 ++- drivers/iommu/iommufd/hw_pagetable.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 68fe6bda92f8..34296e141807 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3376,7 +3376,8 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, } if (flags & - (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING + | IOMMU_HWPT_FAULT_ID_VALID))) return ERR_PTR(-EOPNOTSUPP); if (nested_parent && !nested_supported(iommu)) return ERR_PTR(-EOPNOTSUPP); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index d06bf6e6c19f..8f020bc0815f 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -107,7 +107,8 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, const struct iommu_user_data *user_data) { const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_FAULT_ID_VALID; const struct iommu_ops *ops = dev_iommu_ops(idev->dev); struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; -- Gitee From 665abcc1a046406e084bdeaa60cf69a97cbe1322 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 4 Nov 2024 09:40:38 +0800 Subject: [PATCH 099/278] iommu/vt-d: Drop pasid requirement for prq initialization ANBZ: #13617 commit 9baed1c28030ebac5d44b889b3adf155340fa751 upstream. PASID support within the IOMMU is not required to enable the Page Request Queue, only the PRS capability. Signed-off-by: Klaus Jensen Reviewed-by: Kevin Tian Signed-off-by: Joel Granados Link: https://lore.kernel.org/r/20241015-jag-iopfv8-v4-5-b696ca89ba29@kernel.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 34296e141807..be820f457c5e 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1329,10 +1329,8 @@ static void free_dmar_iommu(struct intel_iommu *iommu) /* free context mapping */ free_context_table(iommu); - if (pasid_supported(iommu)) { - if (ecap_prs(iommu->ecap)) - intel_iommu_finish_prq(iommu); - } + if (ecap_prs(iommu->ecap)) + intel_iommu_finish_prq(iommu); } /* @@ -2192,7 +2190,7 @@ static int __init init_dmars(void) iommu_flush_write_buffer(iommu); - if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { + if (ecap_prs(iommu->ecap)) { /* * Call dmar_alloc_hwirq() with dmar_global_lock held, * could cause possible lock race condition. 
@@ -2616,7 +2614,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) intel_iommu_init_qi(iommu); iommu_flush_write_buffer(iommu); - if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { + if (ecap_prs(iommu->ecap)) { ret = intel_iommu_enable_prq(iommu); if (ret) goto disable_iommu; -- Gitee From 58d789fdcdf597c3051425fe72f771f8ba77849f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:39 +0800 Subject: [PATCH 100/278] iommu/vt-d: Drain PRQs when domain removed from RID ANBZ: #13617 commit c43e1ccdebf2c950545fdf12c5796ad6f7bad7ee upstream. As this iommu driver now supports page faults for requests without PASID, page requests should be drained when a domain is removed from the RID2PASID entry. This results in the intel_iommu_drain_pasid_prq() call being moved to intel_pasid_tear_down_entry(). This indicates that when a translation is removed from any PASID entry and the PRI has been enabled on the device, page requests are drained in the domain detachment path. The intel_iommu_drain_pasid_prq() helper has been modified to support sending device TLB invalidation requests for both PASID and non-PASID cases. Signed-off-by: Lu Baolu Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20241101045543.70086-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel [Jay Chen conflict from drivers/iommu/intel/iommu.c because of upstream patch that contain check patch error: out_tear_down ] Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 1 - drivers/iommu/intel/pasid.c | 1 + drivers/iommu/intel/prq.c | 26 +++++++++----------------- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index be820f457c5e..d0ec5a6c41c2 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4128,7 +4128,6 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); intel_pasid_tear_down_entry(iommu, dev, pasid, false); - intel_iommu_drain_pasid_prq(dev, pasid); } static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 7e76062a7ad2..31665fb62e1c 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -265,6 +265,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); devtlb_invalidation_with_pasid(iommu, dev, pasid); + intel_iommu_drain_pasid_prq(dev, pasid); } /* diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c index 621cd26504b3..c2d792db52c3 100644 --- a/drivers/iommu/intel/prq.c +++ b/drivers/iommu/intel/prq.c @@ -63,26 +63,18 @@ void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) struct dmar_domain *domain; struct intel_iommu *iommu; struct qi_desc desc[3]; - struct pci_dev *pdev; int head, tail; u16 sid, did; - int qdep; info = dev_iommu_priv_get(dev); - if (WARN_ON(!info || !dev_is_pci(dev))) - return; - if (!info->pri_enabled) return; iommu = info->iommu; domain = info->domain; - pdev = to_pci_dev(dev); sid = PCI_DEVID(info->bus, info->devfn); did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID; - qdep = pci_ats_queue_depth(pdev); - /* * Check and wait until all pending page requests in the queue are * handled by the prq handling thread. 
@@ -114,15 +106,15 @@ void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | QI_IWD_FENCE | QI_IWD_TYPE; - desc[1].qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | - QI_EIOTLB_TYPE; - desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) | - QI_DEV_EIOTLB_SID(sid) | - QI_DEV_EIOTLB_QDEP(qdep) | - QI_DEIOTLB_TYPE | - QI_DEV_IOTLB_PFSID(info->pfsid); + if (pasid == IOMMU_NO_PASID) { + qi_desc_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH, &desc[1]); + qi_desc_dev_iotlb(sid, info->pfsid, info->ats_qdep, 0, + MAX_AGAW_PFN_WIDTH, &desc[2]); + } else { + qi_desc_piotlb(did, pasid, 0, -1, 0, &desc[1]); + qi_desc_dev_iotlb_pasid(sid, info->pfsid, pasid, info->ats_qdep, + 0, MAX_AGAW_PFN_WIDTH, &desc[2]); + } qi_retry: reinit_completion(&iommu->prq_complete); qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); -- Gitee From ef05b6b50194a06d727d29cd9521aad99f46b93c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 5 Nov 2024 14:14:24 -0400 Subject: [PATCH 101/278] iommu/io-pgtable-arm: Remove split on unmap behavior ANBZ: #13617 commit 33729a5fc0caf7a97d20507acbeee6b012e7e519 upstream. A minority of page table implementations (arm_lpae, armv7) are unique in how they handle partial unmap of large IOPTEs. Other implementations will unmap the large IOPTE and return its length. For example, if a 2M IOPTE is present and the first 4K is requested to be unmapped then unmap will remove the whole 2M and report 2M as the result. arm_lpae instead replaces the IOPTE with a table of smaller IOPTEs, unmaps the 4K and returns 4K. This is actually an illegal/non-hitless operation on at least SMMUv3 because of the BBM level 0 rules. Will says this was done to support VFIO, but upon deeper analysis this was never strictly necessary: https://lore.kernel.org/all/20241024134411.GA6956@nvidia.com/ In summary, historical VFIO supported the AMD behavior of unmapping the whole large IOPTE and returning the size, even if asked to unmap a portion. The driver would see this as a request to split a large IOPTE. Modern VFIO always unmaps entire large IOPTEs (except on AMD) and drivers don't see an IOPTE split. Given that it doesn't work fully correctly on SMMUv3, and that relying on ARM-unique behavior would create portability problems across IOMMU drivers, retire this functionality. Outside the iommu users, this will potentially affect io_pgtable users of ARM_32_LPAE_S1, ARM_32_LPAE_S2, ARM_64_LPAE_S1, ARM_64_LPAE_S2, and ARM_MALI_LPAE formats.
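To make the new contract concrete, an io-pgtable user must unmap on the boundaries it mapped; a partial unmap of a block entry is now refused. A minimal sketch (assuming a 2M block was previously installed with map_pages()):

	size_t unmapped;

	/* The whole 2M block can be removed in one call: */
	unmapped = ops->unmap_pages(ops, iova, SZ_2M, 1, NULL);	/* returns SZ_2M */

	/* Unmapping only the first 4K of that block is no longer supported: */
	unmapped = ops->unmap_pages(ops, iova, SZ_4K, 1, NULL);	/* WARNs, returns 0 */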
Cc: Boris Brezillon Cc: Steven Price Cc: Liviu Dudau Cc: dri-devel@lists.freedesktop.org Reviewed-by: Liviu Dudau Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v3-b3a5b5937f56+7bb-arm_no_split_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/io-pgtable-arm.c | 81 +--------------------------------- 1 file changed, 2 insertions(+), 79 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 8967b0e38851..d3d70ba42365 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -581,66 +581,6 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop) kfree(data); } -static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, - struct iommu_iotlb_gather *gather, - unsigned long iova, size_t size, - arm_lpae_iopte blk_pte, int lvl, - arm_lpae_iopte *ptep, size_t pgcount) -{ - struct io_pgtable_cfg *cfg = &data->iop.cfg; - arm_lpae_iopte pte, *tablep; - phys_addr_t blk_paddr; - size_t tablesz = ARM_LPAE_GRANULE(data); - size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data); - int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data); - int i, unmap_idx_start = -1, num_entries = 0, max_entries; - - if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) - return 0; - - tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg, data->iop.cookie); - if (!tablep) - return 0; /* Bytes unmapped */ - - if (size == split_sz) { - unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data); - max_entries = arm_lpae_max_entries(unmap_idx_start, data); - num_entries = min_t(int, pgcount, max_entries); - } - - blk_paddr = iopte_to_paddr(blk_pte, data); - pte = iopte_prot(blk_pte); - - for (i = 0; i < ptes_per_table; i++, blk_paddr += split_sz) { - /* Unmap! */ - if (i >= unmap_idx_start && i < (unmap_idx_start + num_entries)) - continue; - - __arm_lpae_init_pte(data, blk_paddr, pte, lvl, 1, &tablep[i]); - } - - pte = arm_lpae_install_table(tablep, ptep, blk_pte, data); - if (pte != blk_pte) { - __arm_lpae_free_pages(tablep, tablesz, cfg, data->iop.cookie); - /* - * We may race against someone unmapping another part of this - * block, but anything else is invalid. We can't misinterpret - * a page entry here since we're never at the last level. 
- */ - if (iopte_type(pte) != ARM_LPAE_PTE_TYPE_TABLE) - return 0; - - tablep = iopte_deref(pte, data); - } else if (unmap_idx_start >= 0) { - for (i = 0; i < num_entries; i++) - io_pgtable_tlb_add_page(&data->iop, gather, iova + i * size, size); - - return num_entries * size; - } - - return __arm_lpae_unmap(data, gather, iova, size, pgcount, lvl, tablep); -} - static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, size_t pgcount, @@ -690,12 +630,8 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, return i * size; } else if (iopte_leaf(pte, lvl, iop->fmt)) { - /* - * Insert a table at the next level to map the old region, - * minus the part we want to unmap - */ - return arm_lpae_split_blk_unmap(data, gather, iova, size, pte, - lvl + 1, ptep, pgcount); + WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed"); + return 0; } /* Keep on walkin' */ @@ -1359,19 +1295,6 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) iova += SZ_1G; } - /* Partial unmap */ - size = 1UL << __ffs(cfg->pgsize_bitmap); - if (ops->unmap_pages(ops, SZ_1G + size, size, 1, NULL) != size) - return __FAIL(ops, i); - - /* Remap of partial unmap */ - if (ops->map_pages(ops, SZ_1G + size, size, size, 1, - IOMMU_READ, GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, SZ_1G + size + 42) != (size + 42)) - return __FAIL(ops, i); - /* Full unmap */ iova = 0; for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { -- Gitee From 915a0af23c4f68d3800995eafedeb347552e9422 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 5 Nov 2024 14:14:25 -0400 Subject: [PATCH 102/278] iommu/io-pgtable-arm-v7s: Remove split on unmap behavior ANBZ: #13617 commit fd50651636fb719098c2e6c3d8c6d7aef8208ae7 upstream. A minority of page table implementations (arm_lpae, armv7) are unique in how they handle partial unmap of large IOPTEs. Other implementations will unmap the large IOPTE and return its length. For example, if a 2M IOPTE is present and the first 4K is requested to be unmapped then unmap will remove the whole 2M and report 2M as the result. armv7 instead will break up contiguous entries and replace an entry with a whole table so it can unmap the requested 4K. This seems copied from the arm_lpae implementation, which was analyzed here: https://lore.kernel.org/all/20241024134411.GA6956@nvidia.com/ Bring consistency to the implementations and remove this unused functionality. There are no uses outside iommu; this affects the ARM_V7S drivers msm_iommu, mtk_iommu, and arm-smmu.
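For context, "contiguous entries" in the short-descriptor format means that a 64K large page is not one PTE but ARM_V7S_CONT_PAGES (16) identical level-2 entries, so splitting one meant rewriting the whole group, roughly (a sketch of the update the removed arm_v7s_split_cont() path performed):

	for (i = 0; i < ARM_V7S_CONT_PAGES; i++)
		ptep[i] = pte + i * SZ_4K;	/* 16 small-page PTEs replacing one 64K mapping */

That in-place rewrite is exactly the non-hitless update this patch retires.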
Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v3-b3a5b5937f56+7bb-arm_no_split_jgg@nvidia.com [will: Remove unused 'loopnr' variable] Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/io-pgtable-arm-v7s.c | 149 ++--------------------------- 1 file changed, 6 insertions(+), 143 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 06ffc683b28f..523355e91a2c 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -166,7 +166,6 @@ struct arm_v7s_io_pgtable { arm_v7s_iopte *pgd; struct kmem_cache *l2_tables; - spinlock_t split_lock; }; static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl); @@ -363,25 +362,6 @@ static arm_v7s_iopte arm_v7s_prot_to_pte(int prot, int lvl, return pte; } -static int arm_v7s_pte_to_prot(arm_v7s_iopte pte, int lvl) -{ - int prot = IOMMU_READ; - arm_v7s_iopte attr = pte >> ARM_V7S_ATTR_SHIFT(lvl); - - if (!(attr & ARM_V7S_PTE_AP_RDONLY)) - prot |= IOMMU_WRITE; - if (!(attr & ARM_V7S_PTE_AP_UNPRIV)) - prot |= IOMMU_PRIV; - if ((attr & (ARM_V7S_TEX_MASK << ARM_V7S_TEX_SHIFT)) == 0) - prot |= IOMMU_MMIO; - else if (pte & ARM_V7S_ATTR_C) - prot |= IOMMU_CACHE; - if (pte & ARM_V7S_ATTR_XN(lvl)) - prot |= IOMMU_NOEXEC; - - return prot; -} - static arm_v7s_iopte arm_v7s_pte_to_cont(arm_v7s_iopte pte, int lvl) { if (lvl == 1) { @@ -398,23 +378,6 @@ static arm_v7s_iopte arm_v7s_pte_to_cont(arm_v7s_iopte pte, int lvl) return pte; } -static arm_v7s_iopte arm_v7s_cont_to_pte(arm_v7s_iopte pte, int lvl) -{ - if (lvl == 1) { - pte &= ~ARM_V7S_CONT_SECTION; - } else if (lvl == 2) { - arm_v7s_iopte xn = pte & BIT(ARM_V7S_CONT_PAGE_XN_SHIFT); - arm_v7s_iopte tex = pte & (ARM_V7S_CONT_PAGE_TEX_MASK << - ARM_V7S_CONT_PAGE_TEX_SHIFT); - - pte ^= xn | tex | ARM_V7S_PTE_TYPE_CONT_PAGE; - pte |= (xn >> ARM_V7S_CONT_PAGE_XN_SHIFT) | - (tex >> ARM_V7S_CONT_PAGE_TEX_SHIFT) | - ARM_V7S_PTE_TYPE_PAGE; - } - return pte; -} - static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl) { if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte, lvl)) @@ -591,77 +554,6 @@ static void arm_v7s_free_pgtable(struct io_pgtable *iop) kfree(data); } -static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data, - unsigned long iova, int idx, int lvl, - arm_v7s_iopte *ptep) -{ - struct io_pgtable *iop = &data->iop; - arm_v7s_iopte pte; - size_t size = ARM_V7S_BLOCK_SIZE(lvl); - int i; - - /* Check that we didn't lose a race to get the lock */ - pte = *ptep; - if (!arm_v7s_pte_is_cont(pte, lvl)) - return pte; - - ptep -= idx & (ARM_V7S_CONT_PAGES - 1); - pte = arm_v7s_cont_to_pte(pte, lvl); - for (i = 0; i < ARM_V7S_CONT_PAGES; i++) - ptep[i] = pte + i * size; - - __arm_v7s_pte_sync(ptep, ARM_V7S_CONT_PAGES, &iop->cfg); - - size *= ARM_V7S_CONT_PAGES; - io_pgtable_tlb_flush_walk(iop, iova, size, size); - return pte; -} - -static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, - struct iommu_iotlb_gather *gather, - unsigned long iova, size_t size, - arm_v7s_iopte blk_pte, - arm_v7s_iopte *ptep) -{ - struct io_pgtable_cfg *cfg = &data->iop.cfg; - arm_v7s_iopte pte, *tablep; - int i, unmap_idx, num_entries, num_ptes; - - tablep = __arm_v7s_alloc_table(2, GFP_ATOMIC, data); - if (!tablep) - return 0; /* Bytes unmapped */ - - num_ptes = ARM_V7S_PTES_PER_LVL(2, cfg); - num_entries = size >> ARM_V7S_LVL_SHIFT(2); - unmap_idx = ARM_V7S_LVL_IDX(iova, 2, cfg); - - pte = arm_v7s_prot_to_pte(arm_v7s_pte_to_prot(blk_pte, 1), 2, cfg); - if (num_entries > 1) - pte = 
arm_v7s_pte_to_cont(pte, 2); - - for (i = 0; i < num_ptes; i += num_entries, pte += size) { - /* Unmap! */ - if (i == unmap_idx) - continue; - - __arm_v7s_set_pte(&tablep[i], pte, num_entries, cfg); - } - - pte = arm_v7s_install_table(tablep, ptep, blk_pte, cfg); - if (pte != blk_pte) { - __arm_v7s_free_table(tablep, 2, data); - - if (!ARM_V7S_PTE_IS_TABLE(pte, 1)) - return 0; - - tablep = iopte_deref(pte, 1, data); - return __arm_v7s_unmap(data, gather, iova, size, 2, tablep); - } - - io_pgtable_tlb_add_page(&data->iop, gather, iova, size); - return size; -} - static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, int lvl, @@ -694,11 +586,8 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, * case in a lock for the sake of correctness and be done with it. */ if (num_entries <= 1 && arm_v7s_pte_is_cont(pte[0], lvl)) { - unsigned long flags; - - spin_lock_irqsave(&data->split_lock, flags); - pte[0] = arm_v7s_split_cont(data, iova, idx, lvl, ptep); - spin_unlock_irqrestore(&data->split_lock, flags); + WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed"); + return 0; } /* If the size matches this level, we're in the right place */ @@ -721,12 +610,8 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, } return size; } else if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte[0], lvl)) { - /* - * Insert a table at the next level to map the old region, - * minus the part we want to unmap - */ - return arm_v7s_split_blk_unmap(data, gather, iova, size, pte[0], - ptep); + WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed"); + return 0; } /* Keep on walkin' */ @@ -811,8 +696,6 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg, if (!data) return NULL; - spin_lock_init(&data->split_lock); - /* * ARM_MTK_TTBR_EXT extend the translation table base support larger * memory address. @@ -936,8 +819,8 @@ static int __init arm_v7s_do_selftests(void) .quirks = IO_PGTABLE_QUIRK_ARM_NS, .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M, }; - unsigned int iova, size, iova_start; - unsigned int i, loopnr = 0; + unsigned int iova, size; + unsigned int i; size_t mapped; selftest_running = true; @@ -985,26 +868,6 @@ static int __init arm_v7s_do_selftests(void) return __FAIL(ops); iova += SZ_16M; - loopnr++; - } - - /* Partial unmap */ - i = 1; - size = 1UL << __ffs(cfg.pgsize_bitmap); - while (i < loopnr) { - iova_start = i * SZ_16M; - if (ops->unmap_pages(ops, iova_start + size, size, 1, NULL) != size) - return __FAIL(ops); - - /* Remap of partial unmap */ - if (ops->map_pages(ops, iova_start + size, size, size, 1, - IOMMU_READ, GFP_KERNEL, &mapped)) - return __FAIL(ops); - - if (ops->iova_to_phys(ops, iova_start + size + 42) - != (size + 42)) - return __FAIL(ops); - i++; } /* Full unmap */ -- Gitee From 58e430a75ca75cea8eae8c7f62dc64f87e4c448a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 5 Nov 2024 14:14:26 -0400 Subject: [PATCH 103/278] iommu: Add a kdoc to iommu_unmap() ANBZ: #13617 commit 6ac7dffe7cca9b259f17b2f255b0d95ee15a76fd upstream. Describe the most conservative version of the driver implementations. All drivers should support this. Many drivers support extending the range if a large page is hit, but let's not make that officially approved API. The main point is to document explicitly that split is not supported. 
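A short usage example of the documented contract (a sketch; the addresses, sizes, and prot flags are arbitrary):

	/* Two adjacent mappings... */
	iommu_map(domain, iova, paddr, SZ_2M, IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
	iommu_map(domain, iova + SZ_2M, paddr + SZ_2M, SZ_4K, IOMMU_READ, GFP_KERNEL);

	/* ...may be removed individually or as one aggregate range: */
	iommu_unmap(domain, iova, SZ_2M + SZ_4K);

	/* But an unmap that would split the 2M mapping is not supported: */
	iommu_unmap(domain, iova, SZ_4K);	/* driver-dependent; may unmap more than asked, or nothing */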
Reviewed-by: Liviu Dudau Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v3-b3a5b5937f56+7bb-arm_no_split_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 35ac75fbeee1..4d133fd5c856 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2600,6 +2600,20 @@ static size_t __iommu_unmap(struct iommu_domain *domain, return unmapped; } +/** + * iommu_unmap() - Remove mappings from a range of IOVA + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @size: Length of the range starting from @iova + * + * iommu_unmap() will remove a translation created by iommu_map(). It cannot + * subdivide a mapping created by iommu_map(), so it should be called with IOVA + * ranges that match what was passed to iommu_map(). The range can aggregate + * contiguous iommu_map() calls so long as no individual range is split. + * + * Returns: Number of bytes of IOVA unmapped. iova + res will be the point + * unmapping stopped. + */ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { -- Gitee From fab8bc0372e79e30c24f9126d6a8941b2c39f395 Mon Sep 17 00:00:00 2001 From: Gan Jie Date: Fri, 1 Nov 2024 15:27:09 +0800 Subject: [PATCH 104/278] iommu/iova: Fix typo 'adderss' ANBZ: #13617 commit fcdb982e935b3884d7dea70ed59731b7a4d24e65 upstream. Fix typo 'adderss' to 'address'. Signed-off-by: Gan Jie Link: https://lore.kernel.org/r/20241101072709.702-1-ganjie182@gmail.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iova.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 16c6adff3eb7..a28197b88c92 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -506,7 +506,7 @@ __adjust_overlap_range(struct iova *iova, * reserve_iova - reserves an iova in the given range * @iovad: - iova domain pointer * @pfn_lo: - lower page frame address - * @pfn_hi:- higher pfn adderss + * @pfn_hi:- higher pfn address * This function allocates reserves the address range from pfn_lo to pfn_hi so * that this address is not dished out as part of alloc_iova. */ -- Gitee From f207c14ed1991c9f872ef38473b8d460f90013a2 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:52 +0800 Subject: [PATCH 105/278] iommu: Pass old domain to set_dev_pasid op ANBZ: #13617 commit b45a3777ceabbe08ab7a6e97f258191c07cbab8d upstream. To support domain replacement for pasid, the underlying iommu driver needs to know the old domain so that it can clean up the existing attachment. It is much more convenient for the iommu layer to pass down the old domain. Otherwise, iommu drivers would need to track the domain for each pasid by themselves, which would duplicate code among the iommu drivers, or rely on group->pasid_array to get the domain, which may not always be the correct one.
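A minimal sketch of how a driver-side callback can use the extra argument during a pasid domain replacement (the foo_* helpers are hypothetical, for illustration only):

static int foo_set_dev_pasid(struct iommu_domain *domain, struct device *dev,
			     ioasid_t pasid, struct iommu_domain *old)
{
	int ret;

	/* Program the new translation for @pasid first... */
	ret = foo_install_pasid_entry(dev, pasid, domain);
	if (ret)
		return ret;

	/* ...then release per-pasid state held by the previous domain, if any. */
	if (old)
		foo_remove_pasid_state(to_foo_domain(old), dev, pasid);

	return 0;
}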
Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Nicolin Chen Reviewed-by: Vasant Hegde Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-2-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu.h | 3 ++- drivers/iommu/amd/pasid.c | 3 ++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 3 ++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 ++- drivers/iommu/intel/iommu.c | 6 ++++-- drivers/iommu/intel/svm.c | 3 ++- drivers/iommu/iommu.c | 3 ++- include/linux/iommu.h | 2 +- 8 files changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index bc2515ea8261..d639a2514d18 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -53,7 +53,8 @@ struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev, struct mm_struct *mm); void amd_iommu_domain_free(struct iommu_domain *dom); int iommu_sva_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); + struct device *dev, ioasid_t pasid, + struct iommu_domain *old); void amd_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct iommu_domain *domain); diff --git a/drivers/iommu/amd/pasid.c b/drivers/iommu/amd/pasid.c index 0657b9373be5..d1dfc745f55e 100644 --- a/drivers/iommu/amd/pasid.c +++ b/drivers/iommu/amd/pasid.c @@ -100,7 +100,8 @@ static const struct mmu_notifier_ops sva_mn = { }; int iommu_sva_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct pdom_dev_data *pdom_dev_data; struct protection_domain *sva_pdom = to_pdomain(domain); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index a7c36654dee5..645da7b69bed 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -332,7 +332,8 @@ void arm_smmu_sva_notifier_synchronize(void) } static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id) + struct device *dev, ioasid_t id, + struct iommu_domain *old) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 9aabf4327d1b..365eb8a88434 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2874,7 +2874,8 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) } static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id) + struct device *dev, ioasid_t id, + struct iommu_domain *old) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index d0ec5a6c41c2..43d13d29b56f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4131,7 +4131,8 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, } static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct device_domain_info *info = 
dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); @@ -4417,7 +4418,8 @@ static int identity_domain_attach_dev(struct iommu_domain *domain, struct device } static int identity_domain_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 3cc43a958b4d..4a2bd65614ad 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -111,7 +111,8 @@ static const struct mmu_notifier_ops intel_mmuops = { }; static int intel_svm_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 4d133fd5c856..dd8a4b2ea185 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3365,7 +3365,8 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain, int ret; for_each_group_device(group, device) { - ret = domain->ops->set_dev_pasid(domain, device->dev, pasid); + ret = domain->ops->set_dev_pasid(domain, device->dev, + pasid, NULL); if (ret) goto err_revert; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 53f129799f8c..3ca5e0e51635 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -656,7 +656,7 @@ struct iommu_ops { struct iommu_domain_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev, - ioasid_t pasid); + ioasid_t pasid, struct iommu_domain *old); int (*map_pages)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, -- Gitee From 81e50b5ee7416b3b98ac19a7ec11a8b8e6898f8b Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:53 +0800 Subject: [PATCH 106/278] iommu/vt-d: Add a helper to flush cache for updating present pasid entry ANBZ: #13617 commit 9bd008f1a91530f6b58f524f0979521161513fdd upstream. Generalize the logic for flushing pasid-related cache upon changes to bits other than SSADE and P which requires a different flow according to VT-d spec. No functional change is intended. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-3-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/pasid.c | 52 ++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 31665fb62e1c..8d11701c2e76 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -287,6 +287,39 @@ static void pasid_flush_caches(struct intel_iommu *iommu, } } +/* + * This function is supposed to be used after caller updates the fields + * except for the SSADE and P bit of a pasid table entry. It does the + * below: + * - Flush cacheline if needed + * - Flush the caches per Table 28 ”Guidance to Software for Invalidations“ + * of VT-d spec 5.0. 
+ */ +static void intel_pasid_flush_present(struct intel_iommu *iommu, + struct device *dev, + u32 pasid, u16 did, + struct pasid_entry *pte) +{ + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + /* + * VT-d spec 5.0 table28 states guides for cache invalidation: + * + * - PASID-selective-within-Domain PASID-cache invalidation + * - PASID-selective PASID-based IOTLB invalidation + * - If (pasid is RID_PASID) + * - Global Device-TLB invalidation to affected functions + * Else + * - PASID-based Device-TLB invalidation (with S=1 and + * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions + */ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + + devtlb_invalidation_with_pasid(iommu, dev, pasid); +} + /* * Set up the scalable mode pasid table entry for first only * translation type. @@ -526,24 +559,7 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, did = pasid_get_domain_id(pte); spin_unlock(&iommu->lock); - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); - - /* - * VT-d spec 3.4 table23 states guides for cache invalidation: - * - * - PASID-selective-within-Domain PASID-cache invalidation - * - PASID-selective PASID-based IOTLB invalidation - * - If (pasid is RID_PASID) - * - Global Device-TLB invalidation to affected functions - * Else - * - PASID-based Device-TLB invalidation (with S=1 and - * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions - */ - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); - - devtlb_invalidation_with_pasid(iommu, dev, pasid); + intel_pasid_flush_present(iommu, dev, pasid, did, pte); } /** -- Gitee From 6ceb07923f01aaee941a16925834b0d23d14e8ee Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:54 +0800 Subject: [PATCH 107/278] iommu/vt-d: Refactor the pasid setup helpers ANBZ: #13617 commit 2cb5ff623d95f382b13ed975ac99be3211e9a284 upstream. It is clearer to have a new set of pasid replacement helpers other than extending the existing ones to cover both initial setup and replacement. Then abstract out the common code for manipulating the pasid entry as preparation. No functional change is intended. Suggested-by: Lu Baolu Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-4-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/pasid.c | 169 ++++++++++++++++++++++-------------- 1 file changed, 105 insertions(+), 64 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 8d11701c2e76..6841b9892d55 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -324,6 +324,32 @@ static void intel_pasid_flush_present(struct intel_iommu *iommu, * Set up the scalable mode pasid table entry for first only * translation type. 
*/ +static void pasid_pte_config_first_level(struct intel_iommu *iommu, + struct pasid_entry *pte, + pgd_t *pgd, u16 did, int flags) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + + /* Setup the first level page table pointer: */ + pasid_set_flptr(pte, (u64)__pa(pgd)); + + if (flags & PASID_FLAG_FL5LP) + pasid_set_flpm(pte, 1); + + if (flags & PASID_FLAG_PAGE_SNOOP) + pasid_set_pgsnp(pte); + + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + + /* Setup Present and PASID Granular Transfer Type: */ + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); + pasid_set_present(pte); +} + int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, pgd_t *pgd, u32 pasid, u16 did, int flags) @@ -354,24 +380,8 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return -EBUSY; } - pasid_clear_entry(pte); - - /* Setup the first level page table pointer: */ - pasid_set_flptr(pte, (u64)__pa(pgd)); - - if (flags & PASID_FLAG_FL5LP) - pasid_set_flpm(pte, 1); - - if (flags & PASID_FLAG_PAGE_SNOOP) - pasid_set_pgsnp(pte); - - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, iommu->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_pte_config_first_level(iommu, pte, pgd, did, flags); - /* Setup Present and PASID Granular Transfer Type: */ - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); - pasid_set_present(pte); spin_unlock(&iommu->lock); pasid_flush_caches(iommu, pte, pasid, did); @@ -382,6 +392,26 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, /* * Set up the scalable mode pasid entry for second only translation type. */ +static void pasid_pte_config_second_level(struct intel_iommu *iommu, + struct pasid_entry *pte, + u64 pgd_val, int agaw, u16 did, + bool dirty_tracking) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_slptr(pte, pgd_val); + pasid_set_address_width(pte, agaw); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (dirty_tracking) + pasid_set_ssade(pte); + + pasid_set_present(pte); +} + int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid) @@ -417,17 +447,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -EBUSY; } - pasid_clear_entry(pte); - pasid_set_domain_id(pte, did); - pasid_set_slptr(pte, pgd_val); - pasid_set_address_width(pte, domain->agaw); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); - pasid_set_fault_enable(pte); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - if (domain->dirty_tracking) - pasid_set_ssade(pte); - - pasid_set_present(pte); + pasid_pte_config_second_level(iommu, pte, pgd_val, domain->agaw, + did, domain->dirty_tracking); spin_unlock(&iommu->lock); pasid_flush_caches(iommu, pte, pasid, did); @@ -507,6 +528,20 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, /* * Set up the scalable mode pasid entry for passthrough translation type. 
*/ +static void pasid_pte_config_pass_through(struct intel_iommu *iommu, + struct pasid_entry *pte, u16 did) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_present(pte); +} + int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct device *dev, u32 pasid) { @@ -525,13 +560,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return -EBUSY; } - pasid_clear_entry(pte); - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, iommu->agaw); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); - pasid_set_fault_enable(pte); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - pasid_set_present(pte); + pasid_pte_config_pass_through(iommu, pte, did); spin_unlock(&iommu->lock); pasid_flush_caches(iommu, pte, pasid, did); @@ -562,6 +591,46 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, intel_pasid_flush_present(iommu, dev, pasid, did, pte); } +static void pasid_pte_config_nestd(struct intel_iommu *iommu, + struct pasid_entry *pte, + struct iommu_hwpt_vtd_s1 *s1_cfg, + struct dmar_domain *s2_domain, + u16 did) +{ + struct dma_pte *pgd = s2_domain->pgd; + + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + + if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) + pasid_set_flpm(pte, 1); + + pasid_set_flptr(pte, s1_cfg->pgtbl_addr); + + if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { + pasid_set_sre(pte); + if (s1_cfg->flags & IOMMU_VTD_S1_WPE) + pasid_set_wpe(pte); + } + + if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) + pasid_set_eafe(pte); + + if (s2_domain->force_snooping) + pasid_set_pgsnp(pte); + + pasid_set_slptr(pte, virt_to_phys(pgd)); + pasid_set_fault_enable(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, s2_domain->agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (s2_domain->dirty_tracking) + pasid_set_ssade(pte); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); +} + /** * intel_pasid_setup_nested() - Set up PASID entry for nested translation. 
* @iommu: IOMMU which the device belong to @@ -579,7 +648,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; struct dmar_domain *s2_domain = domain->s2_domain; u16 did = domain_id_iommu(domain, iommu); - struct dma_pte *pgd = s2_domain->pgd; struct pasid_entry *pte; /* Address width should match the address width supported by hardware */ @@ -622,34 +690,7 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, return -EBUSY; } - pasid_clear_entry(pte); - - if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) - pasid_set_flpm(pte, 1); - - pasid_set_flptr(pte, s1_cfg->pgtbl_addr); - - if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { - pasid_set_sre(pte); - if (s1_cfg->flags & IOMMU_VTD_S1_WPE) - pasid_set_wpe(pte); - } - - if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) - pasid_set_eafe(pte); - - if (s2_domain->force_snooping) - pasid_set_pgsnp(pte); - - pasid_set_slptr(pte, virt_to_phys(pgd)); - pasid_set_fault_enable(pte); - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, s2_domain->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - if (s2_domain->dirty_tracking) - pasid_set_ssade(pte); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); - pasid_set_present(pte); + pasid_pte_config_nestd(iommu, pte, s1_cfg, s2_domain, did); spin_unlock(&iommu->lock); pasid_flush_caches(iommu, pte, pasid, did); -- Gitee From fbbc698269e10d90dd08d0dd0e895b8304757cf6 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:55 +0800 Subject: [PATCH 108/278] iommu/vt-d: Add pasid replace helpers ANBZ: #13617 commit 7543ee63e8113aa34b07df3b16b3b9d2c5f73939 upstream. pasid replacement allows converting a present pasid entry to be FS, SS, PT or nested, hence add helpers for such operations. 
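For illustration, an attach path can then pick the replace variant whenever a present pasid entry is being converted, for example (a simplified sketch of the caller side):

	if (!old)
		ret = intel_pasid_setup_second_level(iommu, dmar_domain, dev, pasid);
	else
		ret = intel_pasid_replace_second_level(iommu, dmar_domain, dev,
						       old_did, pasid);

where old_did is the domain id that was installed for the pasid before the replacement.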
Suggested-by: Lu Baolu Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-5-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/pasid.c | 190 ++++++++++++++++++++++++++++++++++++ drivers/iommu/intel/pasid.h | 15 +++ 2 files changed, 205 insertions(+) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 6841b9892d55..0f2a926d3bd5 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -389,6 +389,50 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return 0; } +int intel_pasid_replace_first_level(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, + u32 pasid, u16 did, u16 old_did, + int flags) +{ + struct pasid_entry *pte, new_pte; + + if (!ecap_flts(iommu->ecap)) { + pr_err("No first level translation support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) { + pr_err("No 5-level paging support for first-level on %s\n", + iommu->name); + return -EINVAL; + } + + pasid_pte_config_first_level(iommu, &new_pte, pgd, did, flags); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + /* * Set up the scalable mode pasid entry for second only translation type. */ @@ -456,6 +500,57 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return 0; } +int intel_pasid_replace_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u16 old_did, + u32 pasid) +{ + struct pasid_entry *pte, new_pte; + struct dma_pte *pgd; + u64 pgd_val; + u16 did; + + /* + * If hardware advertises no support for second level + * translation, return directly. + */ + if (!ecap_slts(iommu->ecap)) { + pr_err("No second level translation support on %s\n", + iommu->name); + return -EINVAL; + } + + pgd = domain->pgd; + pgd_val = virt_to_phys(pgd); + did = domain_id_iommu(domain, iommu); + + pasid_pte_config_second_level(iommu, &new_pte, pgd_val, + domain->agaw, did, + domain->dirty_tracking); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + /* * Set up dirty tracking on a second only or nested translation type. 
*/ @@ -568,6 +663,38 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return 0; } +int intel_pasid_replace_pass_through(struct intel_iommu *iommu, + struct device *dev, u16 old_did, + u32 pasid) +{ + struct pasid_entry *pte, new_pte; + u16 did = FLPT_DEFAULT_DID; + + pasid_pte_config_pass_through(iommu, &new_pte, did); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + /* * Set the page snoop control for a pasid entry which has been set up. */ @@ -698,6 +825,69 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, return 0; } +int intel_pasid_replace_nested(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + u16 old_did, struct dmar_domain *domain) +{ + struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; + struct dmar_domain *s2_domain = domain->s2_domain; + u16 did = domain_id_iommu(domain, iommu); + struct pasid_entry *pte, new_pte; + + /* Address width should match the address width supported by hardware */ + switch (s1_cfg->addr_width) { + case ADDR_WIDTH_4LEVEL: + break; + case ADDR_WIDTH_5LEVEL: + if (!cap_fl5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + break; + default: + dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", + s1_cfg->addr_width); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + + pasid_pte_config_nestd(iommu, &new_pte, s1_cfg, s2_domain, did); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + /* * Interfaces to setup or teardown a pasid table to the scalable-mode * context table entry: diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index dde6d3ba5ae0..06d1f7006d01 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -303,6 +303,21 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct device *dev, u32 pasid); int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain); +int intel_pasid_replace_first_level(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, + u32 pasid, u16 did, u16 old_did, + int flags); +int intel_pasid_replace_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u16 old_did, + u32 pasid); +int intel_pasid_replace_pass_through(struct intel_iommu *iommu, + struct device *dev, u16 old_did, + u32 pasid); +int intel_pasid_replace_nested(struct intel_iommu 
*iommu, + struct device *dev, u32 pasid, + u16 old_did, struct dmar_domain *domain); + void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore); -- Gitee From 16ab218ddc70a7e993e4e93b21aacfb97d76c724 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:56 +0800 Subject: [PATCH 109/278] iommu/vt-d: Consolidate the struct dev_pasid_info add/remove ANBZ: #13617 commit d93cf86cc66a0b8ae80da8c1ffd6903897786124 upstream. The domain_add_dev_pasid() and domain_remove_dev_pasid() are added to consolidate the adding/removing of the struct dev_pasid_info. Besides, it includes the cache tag assign/unassign as well. This also prepares for adding domain replacement for pasid. The set_dev_pasid callbacks need to deal with the dev_pasid_info for both old and new domain. These two helpers make the life easier. intel_iommu_set_dev_pasid() and intel_svm_set_dev_pasid() are updated to use the helpers. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-6-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel [Jay Chen Conflicts: drivers/iommu/intel/iommu.c because of out_tear_down have checkpatch error ] Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 91 +++++++++++++++++++++++++------------ drivers/iommu/intel/iommu.h | 6 +++ drivers/iommu/intel/svm.c | 28 +++--------- 3 files changed, 74 insertions(+), 51 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 43d13d29b56f..5f4150a90c07 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4097,8 +4097,8 @@ static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, return 0; } -static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, - struct iommu_domain *domain) +void domain_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dev_pasid_info *curr, *dev_pasid = NULL; @@ -4106,10 +4106,12 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct dmar_domain *dmar_domain; unsigned long flags; - if (domain->type == IOMMU_DOMAIN_IDENTITY) { - intel_pasid_tear_down_entry(iommu, dev, pasid, false); + if (!domain) + return; + + /* Identity domain has no meta data for pasid. 
*/ + if (domain->type == IOMMU_DOMAIN_IDENTITY) return; - } dmar_domain = to_dmar_domain(domain); spin_lock_irqsave(&dmar_domain->lock, flags); @@ -4127,7 +4129,52 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, domain_detach_iommu(dmar_domain, iommu); intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); - intel_pasid_tear_down_entry(iommu, dev, pasid, false); +} + +static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, + struct iommu_domain *domain) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + intel_pasid_tear_down_entry(info->iommu, dev, pasid, false); + domain_remove_dev_pasid(domain, dev, pasid); +} + +struct dev_pasid_info * +domain_add_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + unsigned long flags; + int ret; + + dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); + if (!dev_pasid) + return ERR_PTR(-ENOMEM); + + ret = domain_attach_iommu(dmar_domain, iommu); + if (ret) + goto out_free; + + ret = cache_tag_assign_domain(dmar_domain, dev, pasid); + if (ret) + goto out_detach_iommu; + + dev_pasid->dev = dev; + dev_pasid->pasid = pasid; + spin_lock_irqsave(&dmar_domain->lock, flags); + list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); + spin_unlock_irqrestore(&dmar_domain->lock, flags); + + return dev_pasid; +out_detach_iommu: + domain_detach_iommu(dmar_domain, iommu); +out_free: + kfree(dev_pasid); + return ERR_PTR(ret); } static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, @@ -4138,7 +4185,6 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; struct dev_pasid_info *dev_pasid; - unsigned long flags; int ret; if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) @@ -4154,17 +4200,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (ret) return ret; - dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); - if (!dev_pasid) - return -ENOMEM; - - ret = domain_attach_iommu(dmar_domain, iommu); - if (ret) - goto out_free; - - ret = cache_tag_assign_domain(dmar_domain, dev, pasid); - if (ret) - goto out_detach_iommu; + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); if (dmar_domain->use_first_level) ret = domain_setup_first_level(iommu, dmar_domain, @@ -4173,24 +4211,17 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, ret = intel_pasid_setup_second_level(iommu, dmar_domain, dev, pasid); if (ret) - goto out_unassign_tag; + goto out_remove_dev_pasid; - dev_pasid->dev = dev; - dev_pasid->pasid = pasid; - spin_lock_irqsave(&dmar_domain->lock, flags); - list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); - spin_unlock_irqrestore(&dmar_domain->lock, flags); + domain_remove_dev_pasid(old, dev, pasid); if (domain->type & __IOMMU_DOMAIN_PAGING) intel_iommu_debugfs_create_dev_pasid(dev_pasid); return 0; -out_unassign_tag: - cache_tag_unassign_domain(dmar_domain, dev, pasid); -out_detach_iommu: - domain_detach_iommu(dmar_domain, iommu); -out_free: - kfree(dev_pasid); + +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); return ret; } diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h 
index da9bb3683ebb..e1be891e0d50 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1228,6 +1228,12 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); +struct dev_pasid_info * +domain_add_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); +void domain_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); + int dmar_ir_support(void); void iommu_flush_write_buffer(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 4a2bd65614ad..6c0685ea8466 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -115,43 +115,29 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; struct mm_struct *mm = domain->mm; struct dev_pasid_info *dev_pasid; unsigned long sflags; - unsigned long flags; int ret = 0; - dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); - if (!dev_pasid) - return -ENOMEM; - - dev_pasid->dev = dev; - dev_pasid->pasid = pasid; - - ret = cache_tag_assign_domain(to_dmar_domain(domain), dev, pasid); - if (ret) - goto free_dev_pasid; + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); /* Setup the pasid table: */ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, pasid, FLPT_DEFAULT_DID, sflags); if (ret) - goto unassign_tag; + goto out_remove_dev_pasid; - spin_lock_irqsave(&dmar_domain->lock, flags); - list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); - spin_unlock_irqrestore(&dmar_domain->lock, flags); + domain_remove_dev_pasid(old, dev, pasid); return 0; -unassign_tag: - cache_tag_unassign_domain(to_dmar_domain(domain), dev, pasid); -free_dev_pasid: - kfree(dev_pasid); - +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); return ret; } -- Gitee From 679bdd8f19def97d692a0b2085f748e3427e7520 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:57 +0800 Subject: [PATCH 110/278] iommu/vt-d: Add iommu_domain_did() to get did ANBZ: #13617 commit a1deee90a2cd91d1253c782bc66cd2719a7d0a38 upstream. domain_id_iommu() does not support SVA type and identity type domains. Add iommu_domain_did() to support all domain types. Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-7-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.h | 16 ++++++++++++++++ drivers/iommu/intel/pasid.h | 7 ------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index e1be891e0d50..11b47cdf7612 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -806,6 +806,13 @@ static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) return container_of(dom, struct dmar_domain, domain); } +/* + * Domain ID reserved for pasid entries programmed for first-level + * only and pass-through transfer modes. 
+ */ +#define FLPT_DEFAULT_DID 1 +#define NUM_RESERVED_DID 2 + /* Retrieve the domain ID which has allocated to the domain */ static inline u16 domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) @@ -816,6 +823,15 @@ domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) return info->did; } +static inline u16 +iommu_domain_did(struct iommu_domain *domain, struct intel_iommu *iommu) +{ + if (domain->type == IOMMU_DOMAIN_SVA || + domain->type == IOMMU_DOMAIN_IDENTITY) + return FLPT_DEFAULT_DID; + return domain_id_iommu(to_dmar_domain(domain), iommu); +} + /* * 0: readable * 1: writable diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 06d1f7006d01..082f4fe20216 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -22,13 +22,6 @@ #define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) #define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) -/* - * Domain ID reserved for pasid entries programmed for first-level - * only and pass-through transfer modes. - */ -#define FLPT_DEFAULT_DID 1 -#define NUM_RESERVED_DID 2 - #define PASID_FLAG_NESTED BIT(1) #define PASID_FLAG_PAGE_SNOOP BIT(2) -- Gitee From 1cdaa822d58e511c87964653f0596df5bee7fffa Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:58 +0800 Subject: [PATCH 111/278] iommu/vt-d: Make intel_iommu_set_dev_pasid() to handle domain replacement ANBZ: #13617 commit c8596d65b267d46e7f805f55676b62416504c2ec upstream. Let intel_iommu_set_dev_pasid() call the pasid replace helpers hence be able to do domain replacement. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-8-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 46 +++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5f4150a90c07..7704311c39d7 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1752,10 +1752,36 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 intel_context_flush_present(info, context, did, true); } +static int __domain_setup_first_level(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + u16 did, pgd_t *pgd, int flags, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_first_level(iommu, dev, pgd, + pasid, did, flags); + return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did, + iommu_domain_did(old, iommu), + flags); +} + +static int domain_setup_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_second_level(iommu, domain, + dev, pasid); + return intel_pasid_replace_second_level(iommu, domain, dev, + iommu_domain_did(old, iommu), + pasid); +} + static int domain_setup_first_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, - u32 pasid) + u32 pasid, struct iommu_domain *old) { struct dma_pte *pgd = domain->pgd; int level, flags = 0; @@ -1770,9 +1796,9 @@ static int domain_setup_first_level(struct intel_iommu *iommu, if (domain->force_snooping) flags |= PASID_FLAG_PAGE_SNOOP; - return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, - domain_id_iommu(domain, iommu), - flags); + return __domain_setup_first_level(iommu, dev, 
pasid, + domain_id_iommu(domain, iommu), + (pgd_t *)pgd, flags, old); } static bool dev_is_real_dma_subdevice(struct device *dev) @@ -1804,9 +1830,11 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (!sm_supported(iommu)) ret = domain_context_mapping(domain, dev); else if (domain->use_first_level) - ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID); + ret = domain_setup_first_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); else - ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID); + ret = domain_setup_second_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); if (ret) goto out_block_translation; @@ -4206,10 +4234,10 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (dmar_domain->use_first_level) ret = domain_setup_first_level(iommu, dmar_domain, - dev, pasid); + dev, pasid, old); else - ret = intel_pasid_setup_second_level(iommu, dmar_domain, - dev, pasid); + ret = domain_setup_second_level(iommu, dmar_domain, + dev, pasid, old); if (ret) goto out_remove_dev_pasid; -- Gitee From f28dffcccdd90904e388cc9065e2639a4fdaefe4 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:59 +0800 Subject: [PATCH 112/278] iommu/vt-d: Limit intel_iommu_set_dev_pasid() for paging domain ANBZ: #13617 commit c33e20869c59c1f78aab11188849dfb15ab412b2 upstream. intel_iommu_set_dev_pasid() is only supposed to be used by paging domain, so limit it. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-9-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7704311c39d7..3e6c8706c2fb 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4215,6 +4215,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, struct dev_pasid_info *dev_pasid; int ret; + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) + return -EINVAL; + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; @@ -4243,8 +4246,7 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, domain_remove_dev_pasid(old, dev, pasid); - if (domain->type & __IOMMU_DOMAIN_PAGING) - intel_iommu_debugfs_create_dev_pasid(dev_pasid); + intel_iommu_debugfs_create_dev_pasid(dev_pasid); return 0; -- Gitee From e7cc2c1a4d3e4b79dfe9292f0c21be8de1e86791 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:14:00 +0800 Subject: [PATCH 113/278] iommu/vt-d: Make intel_svm_set_dev_pasid() support domain replacement ANBZ: #13617 commit cfb31f194a1c4b54ea2109bc0fa03e4ee8d4209b upstream. Make intel_svm_set_dev_pasid() support replacement. 
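All of the replace-capable set_dev_pasid paths in this series share one shape: add the dev_pasid_info under the new domain, program the PASID table entry (a fresh setup when there is no old domain, an in-place replace when there is), and only then tear down the old domain's per-PASID state. A condensed sketch of that flow follows; program_pasid_entry() is a stand-in for the domain_setup_first_level()/domain_setup_second_level()/domain_setup_nested() helpers and is not a real function:

  static int example_set_dev_pasid(struct iommu_domain *domain,
                                   struct device *dev, ioasid_t pasid,
                                   struct iommu_domain *old)
  {
          struct dev_pasid_info *dev_pasid;
          int ret;

          /* Attach metadata (iommu refcount, cache tag, domain list) for the new domain. */
          dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
          if (IS_ERR(dev_pasid))
                  return PTR_ERR(dev_pasid);

          /* Stand-in: setup when old == NULL, replace the live entry otherwise. */
          ret = program_pasid_entry(domain, dev, pasid, old);
          if (ret) {
                  /* Failure leaves the old domain's PASID entry untouched. */
                  domain_remove_dev_pasid(domain, dev, pasid);
                  return ret;
          }

          /*
           * Success: only now drop the old domain's per-PASID metadata.
           * domain_remove_dev_pasid() tolerates a NULL domain, so the very
           * first attach needs no special casing.
           */
          domain_remove_dev_pasid(old, dev, pasid);
          return 0;
  }

The ordering is the point of the series: a failed replacement must leave the device running on the old configuration.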
Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-10-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 8 ++++---- drivers/iommu/intel/iommu.h | 5 +++++ drivers/iommu/intel/svm.c | 5 +++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3e6c8706c2fb..0fc880a12610 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1752,10 +1752,10 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 intel_context_flush_present(info, context, did, true); } -static int __domain_setup_first_level(struct intel_iommu *iommu, - struct device *dev, ioasid_t pasid, - u16 did, pgd_t *pgd, int flags, - struct iommu_domain *old) +int __domain_setup_first_level(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + u16 did, pgd_t *pgd, int flags, + struct iommu_domain *old) { if (!old) return intel_pasid_setup_first_level(iommu, dev, pgd, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 11b47cdf7612..49e8cb27d7f4 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1250,6 +1250,11 @@ domain_add_dev_pasid(struct iommu_domain *domain, void domain_remove_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); +int __domain_setup_first_level(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + u16 did, pgd_t *pgd, int flags, + struct iommu_domain *old); + int dmar_ir_support(void); void iommu_flush_write_buffer(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 6c0685ea8466..f5569347591f 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -127,8 +127,9 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, /* Setup the pasid table: */ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; - ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, pasid, - FLPT_DEFAULT_DID, sflags); + ret = __domain_setup_first_level(iommu, dev, pasid, + FLPT_DEFAULT_DID, mm->pgd, + sflags, old); if (ret) goto out_remove_dev_pasid; -- Gitee From 93b3e9d541e6a7d2469b2b0d8b7c0fbc663ae798 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:14:01 +0800 Subject: [PATCH 114/278] iommu/vt-d: Make identity_domain_set_dev_pasid() to handle domain replacement ANBZ: #13617 commit 9bc18d283d9a88ab0edc7cc537d24bf534b36110 upstream. Let identity_domain_set_dev_pasid() call the pasid replace helpers hence be able to do domain replacement. 
Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-11-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 0fc880a12610..017a59b632b6 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1778,6 +1778,17 @@ static int domain_setup_second_level(struct intel_iommu *iommu, pasid); } +static int domain_setup_passthrough(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_pass_through(iommu, dev, pasid); + return intel_pasid_replace_pass_through(iommu, dev, + iommu_domain_did(old, iommu), + pasid); +} + static int domain_setup_first_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, @@ -4484,11 +4495,17 @@ static int identity_domain_set_dev_pasid(struct iommu_domain *domain, { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; + int ret; if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; - return intel_pasid_setup_pass_through(iommu, dev, pasid); + ret = domain_setup_passthrough(iommu, dev, pasid, old); + if (ret) + return ret; + + domain_remove_dev_pasid(old, dev, pasid); + return 0; } static struct iommu_domain identity_domain = { -- Gitee From 6c4c3d50d2e75dd02d3608b06b2dff4cce827ede Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:14:02 +0800 Subject: [PATCH 115/278] iommu/vt-d: Add set_dev_pasid callback for nested domain ANBZ: #13617 commit 67f6f56b59126b3bf4fc6f4ea564e450fcfcf9f6 upstream. Add intel_nested_set_dev_pasid() to set a nested type domain to a PASID of a device. 
Co-developed-by: Lu Baolu Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-12-yi.l.liu@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 6 ----- drivers/iommu/intel/iommu.h | 7 +++++ drivers/iommu/intel/nested.c | 50 ++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 017a59b632b6..94b645966e95 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1812,12 +1812,6 @@ static int domain_setup_first_level(struct intel_iommu *iommu, (pgd_t *)pgd, flags, old); } -static bool dev_is_real_dma_subdevice(struct device *dev) -{ - return dev && dev_is_pci(dev) && - pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); -} - static int dmar_domain_attach_device(struct dmar_domain *domain, struct device *dev) { diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 49e8cb27d7f4..26accaeca410 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -832,6 +833,12 @@ iommu_domain_did(struct iommu_domain *domain, struct intel_iommu *iommu) return domain_id_iommu(to_dmar_domain(domain), iommu); } +static inline bool dev_is_real_dma_subdevice(struct device *dev) +{ + return dev && dev_is_pci(dev) && + pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); +} + /* * 0: readable * 1: writable diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 989ca5cc04eb..42c4533a6ea2 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -130,8 +130,58 @@ static int intel_nested_cache_invalidate_user(struct iommu_domain *domain, return ret; } +static int domain_setup_nested(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_nested(iommu, dev, pasid, domain); + return intel_pasid_replace_nested(iommu, dev, pasid, + iommu_domain_did(old, iommu), + domain); +} + +static int intel_nested_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + int ret; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + if (context_copied(iommu, info->bus, info->devfn)) + return -EBUSY; + + ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev); + if (ret) + return ret; + + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); + + ret = domain_setup_nested(iommu, dmar_domain, dev, pasid, old); + if (ret) + goto out_remove_dev_pasid; + + domain_remove_dev_pasid(old, dev, pasid); + + return 0; + +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); + return ret; +} + static const struct iommu_domain_ops intel_nested_domain_ops = { .attach_dev = intel_nested_attach_dev, + .set_dev_pasid = intel_nested_set_dev_pasid, .free = intel_nested_domain_free, .cache_invalidate_user = intel_nested_cache_invalidate_user, }; -- Gitee From 0b206a9c5927bd192af6835d186df782f6350095 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 8 Nov 2024 10:14:03 
+0800 Subject: [PATCH 116/278] iommu/arm-smmu-v3: Make set_dev_pasid() op support replace ANBZ: #13617 commit e9f1f727e63a512a69f6568f7ed4577c96eef910 upstream. set_dev_pasid() op is going to be enhanced to support domain replacement of a pasid. This prepares for this op definition. Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-13-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 2 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 +++------ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 645da7b69bed..1d3e71569775 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -349,7 +349,7 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, * get reassigned */ arm_smmu_make_sva_cd(&target, master, domain->mm, smmu_domain->cd.asid); - ret = arm_smmu_set_pasid(master, smmu_domain, id, &target); + ret = arm_smmu_set_pasid(master, smmu_domain, id, &target, old); mmput(domain->mm); return ret; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 365eb8a88434..e6009d746812 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2901,7 +2901,7 @@ static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain, */ arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); return arm_smmu_set_pasid(master, to_smmu_domain(domain), id, - &target_cd); + &target_cd, old); } static void arm_smmu_update_ste(struct arm_smmu_master *master, @@ -2931,16 +2931,13 @@ static void arm_smmu_update_ste(struct arm_smmu_master *master, int arm_smmu_set_pasid(struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, ioasid_t pasid, - struct arm_smmu_cd *cd) + struct arm_smmu_cd *cd, struct iommu_domain *old) { struct iommu_domain *sid_domain = iommu_get_domain_for_dev(master->dev); struct arm_smmu_attach_state state = { .master = master, - /* - * For now the core code prevents calling this when a domain is - * already attached, no need to set old_domain. - */ .ssid = pasid, + .old_domain = old, }; struct arm_smmu_cd *cdptr; int ret; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 1f280dcc652f..054bda0e834e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -879,7 +879,7 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, int arm_smmu_set_pasid(struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, ioasid_t pasid, - struct arm_smmu_cd *cd); + struct arm_smmu_cd *cd, struct iommu_domain *old); void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid); void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, -- Gitee From 40f9645d354591a5b148b78bb1d70414e81cfbf2 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:14:04 +0800 Subject: [PATCH 117/278] iommu: Make set_dev_pasid op support domain replacement ANBZ: #13617 commit 980e3016ebcc00021bf8190c64fd65ba23e90498 upstream. 
The iommu core is going to support domain replacement for pasid, it needs to make the set_dev_pasid op support replacing domain and keep the old domain config in the failure case. AMD iommu driver does not support domain replacement for pasid yet, so it would fail the set_dev_pasid op to keep the old config if the input @old is non-NULL. Till now, all the set_dev_pasid callbacks can handle the old parameter and can keep the old config when failed, so update the kdoc of set_dev_pasid op. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-14-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/pasid.c | 3 +++ include/linux/iommu.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/pasid.c b/drivers/iommu/amd/pasid.c index d1dfc745f55e..8c73a30c2800 100644 --- a/drivers/iommu/amd/pasid.c +++ b/drivers/iommu/amd/pasid.c @@ -109,6 +109,9 @@ int iommu_sva_set_dev_pasid(struct iommu_domain *domain, unsigned long flags; int ret = -EINVAL; + if (old) + return -EOPNOTSUPP; + /* PASID zero is used for requests from the I/O device without PASID */ if (!is_pasid_valid(dev_data, pasid)) return ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3ca5e0e51635..c22842eca6cf 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -631,7 +631,8 @@ struct iommu_ops { * * EBUSY - device is attached to a domain and cannot be changed * * ENODEV - device specific errors, not able to be attached * * - treated as ENODEV by the caller. Use is discouraged - * @set_dev_pasid: set an iommu domain to a pasid of device + * @set_dev_pasid: set or replace an iommu domain to a pasid of device. The pasid of + * the device should be left in the old config in error case. * @map_pages: map a physically contiguous set of pages of the same size to * an iommu domain. * @unmap_pages: unmap a number of pages of the same size from an iommu domain -- Gitee From 071ea7c292857240fe3f0b388d61a3c26711dda2 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Sun, 10 Nov 2024 19:02:26 -0800 Subject: [PATCH 118/278] iommu/tegra241-cmdqv: Fix alignment failure at max_n_shift ANBZ: #13617 commit a3799717b881aa0f4e722afb70e7b8ba84ae4f36 upstream. When configuring a kernel with PAGE_SIZE=4KB, depending on its setting of CONFIG_CMA_ALIGNMENT, VCMDQ_LOG2SIZE_MAX=19 could fail the alignment test and trigger a WARN_ON: WARNING: at drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c:3646 Call trace: arm_smmu_init_one_queue+0x15c/0x210 tegra241_cmdqv_init_structures+0x114/0x338 arm_smmu_device_probe+0xb48/0x1d90 Fix it by capping max_n_shift to CMDQ_MAX_SZ_SHIFT as SMMUv3 CMDQ does. 
Fixes: 918eb5c856f6 ("iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV") Signed-off-by: Nicolin Chen Link: https://lore.kernel.org/r/20241111030226.1940737-1-nicolinc@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index 6c7770e79af6..c8ec74f089f3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -509,7 +509,8 @@ static int tegra241_vcmdq_alloc_smmu_cmdq(struct tegra241_vcmdq *vcmdq) snprintf(name, 16, "vcmdq%u", vcmdq->idx); - q->llq.max_n_shift = VCMDQ_LOG2SIZE_MAX; + /* Queue size, capped to ensure natural alignment */ + q->llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, VCMDQ_LOG2SIZE_MAX); /* Use the common helper to init the VCMDQ, and then... */ ret = arm_smmu_init_one_queue(smmu, q, vcmdq->page0, -- Gitee From 7d3db096851385d3e5b5840d31187c1d9bad23ca Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:17 -0800 Subject: [PATCH 119/278] iommufd: Move struct iommufd_object to public iommufd header ANBZ: #13617 commit d1b3dad9de7962f7bea861f27e352bac17b68bed upstream. Prepare for an embedded structure design for driver-level iommufd_viommu objects: // include/linux/iommufd.h struct iommufd_viommu { struct iommufd_object obj; .... }; // Some IOMMU driver struct iommu_driver_viommu { struct iommufd_viommu core; .... }; It has to expose struct iommufd_object and enum iommufd_object_type from the core-level private header to the public iommufd header. Link: https://patch.msgid.link/r/54a43b0768089d690104530754f499ca05ce0074.1730836219.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/iommufd_private.h | 25 +------------------------ include/linux/iommufd.h | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 8f3c21a664bd..94cfcab7e9de 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -5,8 +5,8 @@ #define __IOMMUFD_PRIVATE_H #include +#include #include -#include #include #include #include @@ -126,29 +126,6 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, return 0; } -enum iommufd_object_type { - IOMMUFD_OBJ_NONE, - IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, - IOMMUFD_OBJ_DEVICE, - IOMMUFD_OBJ_HWPT_PAGING, - IOMMUFD_OBJ_HWPT_NESTED, - IOMMUFD_OBJ_IOAS, - IOMMUFD_OBJ_ACCESS, - IOMMUFD_OBJ_FAULT, -#ifdef CONFIG_IOMMUFD_TEST - IOMMUFD_OBJ_SELFTEST, -#endif - IOMMUFD_OBJ_MAX, -}; - -/* Base struct for all objects with a userspace ID handle. 
*/ -struct iommufd_object { - refcount_t shortterm_users; - refcount_t users; - enum iommufd_object_type type; - unsigned int id; -}; - static inline bool iommufd_lock_obj(struct iommufd_object *obj) { if (!refcount_inc_not_zero(&obj->users)) diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 30f832a60ccb..22948dd03d67 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -8,6 +8,7 @@ #include #include +#include #include struct device; @@ -18,6 +19,29 @@ struct iommufd_ctx; struct iommufd_device; struct page; +enum iommufd_object_type { + IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_DEVICE, + IOMMUFD_OBJ_HWPT_PAGING, + IOMMUFD_OBJ_HWPT_NESTED, + IOMMUFD_OBJ_IOAS, + IOMMUFD_OBJ_ACCESS, + IOMMUFD_OBJ_FAULT, +#ifdef CONFIG_IOMMUFD_TEST + IOMMUFD_OBJ_SELFTEST, +#endif + IOMMUFD_OBJ_MAX, +}; + +/* Base struct for all objects with a userspace ID handle. */ +struct iommufd_object { + refcount_t shortterm_users; + refcount_t users; + enum iommufd_object_type type; + unsigned int id; +}; + struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, struct device *dev, u32 *id); void iommufd_device_unbind(struct iommufd_device *idev); -- Gitee From 63759468c69a0944e9a898f159865c27539170fc Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:18 -0800 Subject: [PATCH 120/278] iommufd: Move _iommufd_object_alloc helper to a sharable file ANBZ: #13617 commit 7d4f46c2372d5a850e28f63001013de50843b6e6 upstream. The following patch will add a new vIOMMU allocator that will require this _iommufd_object_alloc to be sharable with IOMMU drivers (and iommufd too). Add a new driver.c file that will be built with CONFIG_IOMMUFD_DRIVER_CORE selected by CONFIG_IOMMUFD, and put the CONFIG_DRIVER under that remaining to be selectable for drivers to build the existing iova_bitmap.c file. 
Link: https://patch.msgid.link/r/2f4f6e116dc49ffb67ff6c5e8a7a8e789ab9e98e.1730836219.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/Kconfig | 4 +++ drivers/iommu/iommufd/Makefile | 3 ++ drivers/iommu/iommufd/driver.c | 40 +++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 4 --- drivers/iommu/iommufd/main.c | 32 -------------------- include/linux/iommufd.h | 13 ++++++++ 6 files changed, 60 insertions(+), 36 deletions(-) create mode 100644 drivers/iommu/iommufd/driver.c diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 76656fe0470d..0a07f9449fd9 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -1,4 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only +config IOMMUFD_DRIVER_CORE + tristate + default (IOMMUFD_DRIVER || IOMMUFD) if IOMMUFD!=n + config IOMMUFD tristate "IOMMU Userspace API" select INTERVAL_TREE diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index cf4605962bea..83df9077063e 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -13,3 +13,6 @@ iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o obj-$(CONFIG_IOMMUFD) += iommufd.o obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o + +iommufd_driver-y := driver.o +obj-$(CONFIG_IOMMUFD_DRIVER_CORE) += iommufd_driver.o diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c new file mode 100644 index 000000000000..2bc47d92a0ab --- /dev/null +++ b/drivers/iommu/iommufd/driver.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#include "iommufd_private.h" + +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type) +{ + struct iommufd_object *obj; + int rc; + + obj = kzalloc(size, GFP_KERNEL_ACCOUNT); + if (!obj) + return ERR_PTR(-ENOMEM); + obj->type = type; + /* Starts out bias'd by 1 until it is removed from the xarray */ + refcount_set(&obj->shortterm_users, 1); + refcount_set(&obj->users, 1); + + /* + * Reserve an ID in the xarray but do not publish the pointer yet since + * the caller hasn't initialized it yet. Once the pointer is published + * in the xarray and visible to other threads we can't reliably destroy + * it anymore, so the caller must complete all errorable operations + * before calling iommufd_object_finalize(). 
+ */ + rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b, + GFP_KERNEL_ACCOUNT); + if (rc) + goto out_free; + return obj; +out_free: + kfree(obj); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(_iommufd_object_alloc, IOMMUFD); + +MODULE_DESCRIPTION("iommufd code shared with builtin modules"); +MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 94cfcab7e9de..be347f726fda 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -206,10 +206,6 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx, iommufd_object_remove(ictx, obj, obj->id, 0); } -struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, - size_t size, - enum iommufd_object_type type); - #define __iommufd_object_alloc(ictx, ptr, type, obj) \ container_of(_iommufd_object_alloc( \ ictx, \ diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 826a2b2be52f..3c32b440471b 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -29,38 +29,6 @@ struct iommufd_object_ops { static const struct iommufd_object_ops iommufd_object_ops[]; static struct miscdevice vfio_misc_dev; -struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, - size_t size, - enum iommufd_object_type type) -{ - struct iommufd_object *obj; - int rc; - - obj = kzalloc(size, GFP_KERNEL_ACCOUNT); - if (!obj) - return ERR_PTR(-ENOMEM); - obj->type = type; - /* Starts out bias'd by 1 until it is removed from the xarray */ - refcount_set(&obj->shortterm_users, 1); - refcount_set(&obj->users, 1); - - /* - * Reserve an ID in the xarray but do not publish the pointer yet since - * the caller hasn't initialized it yet. Once the pointer is published - * in the xarray and visible to other threads we can't reliably destroy - * it anymore, so the caller must complete all errorable operations - * before calling iommufd_object_finalize(). - */ - rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, - xa_limit_31b, GFP_KERNEL_ACCOUNT); - if (rc) - goto out_free; - return obj; -out_free: - kfree(obj); - return ERR_PTR(rc); -} - /* * Allow concurrent access to the object. * diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 22948dd03d67..94522d4029ca 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -135,4 +135,17 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx) return -EOPNOTSUPP; } #endif /* CONFIG_IOMMUFD */ + +#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE) +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type); +#else /* !CONFIG_IOMMUFD_DRIVER_CORE */ +static inline struct iommufd_object * +_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, + enum iommufd_object_type type) +{ + return ERR_PTR(-EOPNOTSUPP); +} +#endif /* CONFIG_IOMMUFD_DRIVER_CORE */ #endif -- Gitee From 39d188b18afc615543acbf41fd37249621997c8b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:19 -0800 Subject: [PATCH 121/278] iommufd: Introduce IOMMUFD_OBJ_VIOMMU and its related struct ANBZ: #13617 commit 6b22d562fcd6e3d1cc1c265b0596840946d16a09 upstream. Add a new IOMMUFD_OBJ_VIOMMU with an iommufd_viommu structure to represent a slice of physical IOMMU device passed to or shared with a user space VM. 
This slice, now a vIOMMU object, is a group of virtualization resources of a physical IOMMU's, such as: - Security namespace for guest owned ID, e.g. guest-controlled cache tags - Non-device-affiliated event reporting, e.g. invalidation queue errors - Access to a sharable nesting parent pagetable across physical IOMMUs - Virtualization of various platforms IDs, e.g. RIDs and others - Delivery of paravirtualized invalidation - Direct assigned invalidation queues - Direct assigned interrupts Add a new viommu_alloc op in iommu_ops, for drivers to allocate their own vIOMMU structures. And this allocation also needs a free(), so add struct iommufd_viommu_ops. To simplify a vIOMMU allocation, provide a iommufd_viommu_alloc() helper. It's suggested that a driver should embed a core-level viommu structure in its driver-level viommu struct and call the iommufd_viommu_alloc() helper, meanwhile the driver can also implement a viommu ops: struct my_driver_viommu { struct iommufd_viommu core; /* driver-owned properties/features */ .... }; static const struct iommufd_viommu_ops my_driver_viommu_ops = { .free = my_driver_viommu_free, /* future ops for virtualization features */ .... }; static struct iommufd_viommu my_driver_viommu_alloc(...) { struct my_driver_viommu *my_viommu = iommufd_viommu_alloc(ictx, my_driver_viommu, core, my_driver_viommu_ops); /* Init my_viommu and related HW feature */ .... return &my_viommu->core; } static struct iommu_domain_ops my_driver_domain_ops = { .... .viommu_alloc = my_driver_viommu_alloc, }; Link: https://patch.msgid.link/r/64685e2b79dea0f1dc56f6ede04809b72d578935.1730836219.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- include/linux/iommu.h | 14 ++++++++++++++ include/linux/iommufd.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c22842eca6cf..82be5507d9dc 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -42,6 +42,8 @@ struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; struct iommu_fault_param; +struct iommufd_ctx; +struct iommufd_viommu; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -548,6 +550,14 @@ static inline int __iommu_copy_struct_from_user_array( * @remove_dev_pasid: Remove any translation configurations of a specific * pasid, so that any DMA transactions with this pasid * will be blocked by the hardware. + * @viommu_alloc: Allocate an iommufd_viommu on a physical IOMMU instance behind + * the @dev, as the set of virtualization resources shared/passed + * to user space IOMMU instance. And associate it with a nesting + * @parent_domain. The @viommu_type must be defined in the header + * include/uapi/linux/iommufd.h + * It is required to call iommufd_viommu_alloc() helper for + * a bundled allocation of the core and the driver structures, + * using the given @ictx pointer. 
* @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops * @identity_domain: An always available, always attachable identity @@ -597,6 +607,10 @@ struct iommu_ops { void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid, struct iommu_domain *domain); + struct iommufd_viommu *(*viommu_alloc)( + struct device *dev, struct iommu_domain *parent_domain, + struct iommufd_ctx *ictx, unsigned int viommu_type); + const struct iommu_domain_ops *default_domain_ops; unsigned long pgsize_bitmap; struct module *owner; diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 94522d4029ca..4fc2ce332f10 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -17,6 +17,7 @@ struct iommu_group; struct iommufd_access; struct iommufd_ctx; struct iommufd_device; +struct iommufd_viommu_ops; struct page; enum iommufd_object_type { @@ -28,6 +29,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, IOMMUFD_OBJ_FAULT, + IOMMUFD_OBJ_VIOMMU, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif @@ -78,6 +80,26 @@ void iommufd_access_detach(struct iommufd_access *access); void iommufd_ctx_get(struct iommufd_ctx *ictx); +struct iommufd_viommu { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommu_device *iommu_dev; + struct iommufd_hwpt_paging *hwpt; + + const struct iommufd_viommu_ops *ops; + + unsigned int type; +}; + +/** + * struct iommufd_viommu_ops - vIOMMU specific operations + * @destroy: Clean up all driver-specific parts of an iommufd_viommu. The memory + * of the vIOMMU will be free-ed by iommufd core after calling this op + */ +struct iommufd_viommu_ops { + void (*destroy)(struct iommufd_viommu *viommu); +}; + #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_ctx *iommufd_ctx_from_file(struct file *file); struct iommufd_ctx *iommufd_ctx_from_fd(int fd); @@ -148,4 +170,22 @@ _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ + +/* + * Helpers for IOMMU driver to allocate driver structures that will be freed by + * the iommufd core. The free op will be called prior to freeing the memory. + */ +#define iommufd_viommu_alloc(ictx, drv_struct, member, viommu_ops) \ + ({ \ + drv_struct *ret; \ + \ + static_assert(__same_type(struct iommufd_viommu, \ + ((drv_struct *)NULL)->member)); \ + static_assert(offsetof(drv_struct, member.obj) == 0); \ + ret = (drv_struct *)_iommufd_object_alloc( \ + ictx, sizeof(drv_struct), IOMMUFD_OBJ_VIOMMU); \ + if (!IS_ERR(ret)) \ + ret->member.ops = viommu_ops; \ + ret; \ + }) #endif -- Gitee From 47bc47785765875840e5b77e711b379c81058d71 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:20 -0800 Subject: [PATCH 122/278] iommufd: Verify object in iommufd_object_finalize/abort() ANBZ: #13617 commit d56d1e8405a9e154a30f4a055eb7ef55516b32b0 upstream. To support driver-allocated vIOMMU objects, it's required for IOMMU driver to call the provided iommufd_viommu_alloc helper to embed the core struct. However, there is no guarantee that every driver will call it and allocate objects properly. Make the iommufd_object_finalize/abort functions more robust to verify if the xarray slot indexed by the input obj->id is having an XA_ZERO_ENTRY, which is the reserved value stored by xa_alloc via iommufd_object_alloc. 
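The check relies on _iommufd_object_alloc() reserving the ID with XA_ZERO_ENTRY rather than publishing a pointer, so finalize/abort can tell a properly allocated object from anything else. A minimal, stand-alone illustration of that reserve-then-publish pattern (not the iommufd code itself):

  #include <linux/xarray.h>
  #include <linux/bug.h>

  /* 'xa' is assumed to have been initialized with DEFINE_XARRAY_ALLOC(). */
  static int reserve_id(struct xarray *xa, u32 *id)
  {
          /* Reserve an ID; the slot holds XA_ZERO_ENTRY until it is published. */
          return xa_alloc(xa, id, XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL);
  }

  static void publish(struct xarray *xa, u32 id, void *ptr)
  {
          XA_STATE(xas, xa, id);

          xa_lock(xa);
          /* Only a slot reserved above may be published (or aborted with NULL). */
          WARN_ON(xas_store(&xas, ptr) != XA_ZERO_ENTRY);
          xa_unlock(xa);
  }

An object that skipped the reservation leaves the slot empty or already populated, so the WARN_ON fires instead of the xarray being silently corrupted.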
Link: https://patch.msgid.link/r/334bd4dde8e0a88eb30fa67eeef61827cdb546f9.1730836219.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/main.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 3c32b440471b..30e6c2af3b45 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -41,20 +41,26 @@ static struct miscdevice vfio_misc_dev; void iommufd_object_finalize(struct iommufd_ctx *ictx, struct iommufd_object *obj) { + XA_STATE(xas, &ictx->objects, obj->id); void *old; - old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL); - /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */ - WARN_ON(old); + xa_lock(&ictx->objects); + old = xas_store(&xas, obj); + xa_unlock(&ictx->objects); + /* obj->id was returned from xa_alloc() so the xas_store() cannot fail */ + WARN_ON(old != XA_ZERO_ENTRY); } /* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) { + XA_STATE(xas, &ictx->objects, obj->id); void *old; - old = xa_erase(&ictx->objects, obj->id); - WARN_ON(old); + xa_lock(&ictx->objects); + old = xas_store(&xas, NULL); + xa_unlock(&ictx->objects); + WARN_ON(old != XA_ZERO_ENTRY); kfree(obj); } -- Gitee From 268b556c1fae6b0883b6a98477eb926cfafd48fd Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:21 -0800 Subject: [PATCH 123/278] iommufd/viommu: Add IOMMU_VIOMMU_ALLOC ioctl ANBZ: #13617 commit 4db97c21ed07a7d4081ed9820599fa36857083d6 upstream. Add a new ioctl for user space to do a vIOMMU allocation. It must be based on a nesting parent HWPT, so take its refcount. IOMMU driver wanting to support vIOMMUs must define its IOMMU_VIOMMU_TYPE_ in the uAPI header and implement a viommu_alloc op in its iommu_ops. 
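From user space, the new ioctl is driven against the iommufd file descriptor. A hedged sketch of the call (viommu_type must be a driver-defined IOMMU_VIOMMU_TYPE_* value, since IOMMU_VIOMMU_TYPE_DEFAULT is rejected with EOPNOTSUPP; dev_id and hwpt_id come from earlier device-bind and nesting-parent HWPT allocation steps):

  #include <sys/ioctl.h>
  #include <linux/iommufd.h>

  static int viommu_alloc(int iommufd, __u32 dev_id, __u32 nesting_parent_hwpt_id,
                          __u32 viommu_type, __u32 *out_viommu_id)
  {
          struct iommu_viommu_alloc cmd = {
                  .size = sizeof(cmd),
                  .flags = 0,                          /* must be zero */
                  .type = viommu_type,                 /* driver-defined type */
                  .dev_id = dev_id,                    /* device bound to this iommufd */
                  .hwpt_id = nesting_parent_hwpt_id,   /* HWPT allocated with NEST_PARENT */
          };

          if (ioctl(iommufd, IOMMU_VIOMMU_ALLOC, &cmd))
                  return -1;

          *out_viommu_id = cmd.out_viommu_id;
          return 0;
  }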
Link: https://patch.msgid.link/r/dc2b8ba9ac935007beff07c1761c31cd097ed780.1730836219.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/Makefile | 3 +- drivers/iommu/iommufd/iommufd_private.h | 3 + drivers/iommu/iommufd/main.c | 6 ++ drivers/iommu/iommufd/viommu.c | 81 +++++++++++++++++++++++++ include/uapi/linux/iommufd.h | 40 ++++++++++++ 5 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 drivers/iommu/iommufd/viommu.c diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 83df9077063e..cb784da6cddc 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -7,7 +7,8 @@ iommufd-y := \ ioas.o \ main.o \ pages.o \ - vfio_compat.o + vfio_compat.o \ + viommu.o iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index be347f726fda..a8104d9d4cef 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -506,6 +506,9 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev, return iommu_group_replace_domain(idev->igroup->group, hwpt->domain); } +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_viommu_destroy(struct iommufd_object *obj); + #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 30e6c2af3b45..cc514f9bc3e6 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -307,6 +307,7 @@ union ucmd_buffer { struct iommu_ioas_unmap unmap; struct iommu_option option; struct iommu_vfio_ioas vfio_ioas; + struct iommu_viommu_alloc viommu; #ifdef CONFIG_IOMMUFD_TEST struct iommu_test_cmd test; #endif @@ -360,6 +361,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { val64), IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, __reserved), + IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, + struct iommu_viommu_alloc, out_viommu_id), #ifdef CONFIG_IOMMUFD_TEST IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), #endif @@ -495,6 +498,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_FAULT] = { .destroy = iommufd_fault_destroy, }, + [IOMMUFD_OBJ_VIOMMU] = { + .destroy = iommufd_viommu_destroy, + }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { .destroy = iommufd_selftest_destroy, diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c new file mode 100644 index 000000000000..888239b78667 --- /dev/null +++ b/drivers/iommu/iommufd/viommu.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#include "iommufd_private.h" + +void iommufd_viommu_destroy(struct iommufd_object *obj) +{ + struct iommufd_viommu *viommu = + container_of(obj, struct iommufd_viommu, obj); + + if (viommu->ops && viommu->ops->destroy) + viommu->ops->destroy(viommu); + refcount_dec(&viommu->hwpt->common.obj.users); +} + +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_viommu_alloc *cmd = ucmd->cmd; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_viommu *viommu; + struct iommufd_device *idev; + const struct iommu_ops *ops; + int rc; + + if (cmd->flags || 
cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT) + return -EOPNOTSUPP; + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + ops = dev_iommu_ops(idev->dev); + if (!ops->viommu_alloc) { + rc = -EOPNOTSUPP; + goto out_put_idev; + } + + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt_paging)) { + rc = PTR_ERR(hwpt_paging); + goto out_put_idev; + } + + if (!hwpt_paging->nest_parent) { + rc = -EINVAL; + goto out_put_hwpt; + } + + viommu = ops->viommu_alloc(idev->dev, hwpt_paging->common.domain, + ucmd->ictx, cmd->type); + if (IS_ERR(viommu)) { + rc = PTR_ERR(viommu); + goto out_put_hwpt; + } + + viommu->type = cmd->type; + viommu->ictx = ucmd->ictx; + viommu->hwpt = hwpt_paging; + refcount_inc(&viommu->hwpt->common.obj.users); + /* + * It is the most likely case that a physical IOMMU is unpluggable. A + * pluggable IOMMU instance (if exists) is responsible for refcounting + * on its own. + */ + viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev); + + cmd->out_viommu_id = viommu->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_abort; + iommufd_object_finalize(ucmd->ictx, &viommu->obj); + goto out_put_hwpt; + +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj); +out_put_hwpt: + iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); +out_put_idev: + iommufd_put_object(ucmd->ictx, &idev->obj); + return rc; +} diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 49ea668be905..dc87c54afacd 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -52,6 +52,7 @@ enum { IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d, IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f, + IOMMUFD_CMD_VIOMMU_ALLOC = 0x90, }; /** @@ -865,4 +866,43 @@ struct iommu_fault_alloc { __u32 out_fault_fd; }; #define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC) + +/** + * enum iommu_viommu_type - Virtual IOMMU Type + * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use + */ +enum iommu_viommu_type { + IOMMU_VIOMMU_TYPE_DEFAULT = 0, +}; + +/** + * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC) + * @size: sizeof(struct iommu_viommu_alloc) + * @flags: Must be 0 + * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type + * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU + * @hwpt_id: ID of a nesting parent HWPT to associate to + * @out_viommu_id: Output virtual IOMMU ID for the allocated object + * + * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's + * virtualization support that is a security-isolated slice of the real IOMMU HW + * that is unique to a specific VM. Operations global to the IOMMU are connected + * to the vIOMMU, such as: + * - Security namespace for guest owned ID, e.g. guest-controlled cache tags + * - Non-device-affiliated event reporting, e.g. invalidation queue errors + * - Access to a sharable nesting parent pagetable across physical IOMMUs + * - Virtualization of various platforms IDs, e.g. 
RIDs and others + * - Delivery of paravirtualized invalidation + * - Direct assigned invalidation queues + * - Direct assigned interrupts + */ +struct iommu_viommu_alloc { + __u32 size; + __u32 flags; + __u32 type; + __u32 dev_id; + __u32 hwpt_id; + __u32 out_viommu_id; +}; +#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC) #endif -- Gitee From 85b3d69fc7c86140dc971ed96050238c42963d26 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:22 -0800 Subject: [PATCH 124/278] iommufd: Add alloc_domain_nested op to iommufd_viommu_ops ANBZ: #13617 commit 69d2689e57f5cb235c0609dd2f8fa10c1a832f87 upstream. Allow IOMMU driver to use a vIOMMU object that holds a nesting parent hwpt/domain to allocate a nested domain. Link: https://patch.msgid.link/r/2dcdb5e405dc0deb68230564530d989d285d959c.1730836219.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- include/linux/iommufd.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 4fc2ce332f10..de9b56265c9c 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -14,6 +14,7 @@ struct device; struct file; struct iommu_group; +struct iommu_user_data; struct iommufd_access; struct iommufd_ctx; struct iommufd_device; @@ -95,9 +96,17 @@ struct iommufd_viommu { * struct iommufd_viommu_ops - vIOMMU specific operations * @destroy: Clean up all driver-specific parts of an iommufd_viommu. The memory * of the vIOMMU will be free-ed by iommufd core after calling this op + * @alloc_domain_nested: Allocate a IOMMU_DOMAIN_NESTED on a vIOMMU that holds a + * nesting parent domain (IOMMU_DOMAIN_PAGING). @user_data + * must be defined in include/uapi/linux/iommufd.h. + * It must fully initialize the new iommu_domain before + * returning. Upon failure, ERR_PTR must be returned. */ struct iommufd_viommu_ops { void (*destroy)(struct iommufd_viommu *viommu); + struct iommu_domain *(*alloc_domain_nested)( + struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data); }; #if IS_ENABLED(CONFIG_IOMMUFD) -- Gitee From 5135ceff4aebcd54d7c3cec6fc593393ff377dcd Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:23 -0800 Subject: [PATCH 125/278] iommufd: Allow pt_id to carry viommu_id for IOMMU_HWPT_ALLOC ANBZ: #13617 commit 13a750180fc86d41695c8f64d8892412482a401d upstream. Now a vIOMMU holds a shareable nesting parent HWPT. So, it can act like that nesting parent HWPT to allocate a nested HWPT. Support that in the IOMMU_HWPT_ALLOC ioctl handler, and update its kdoc. Also, add an iommufd_viommu_alloc_hwpt_nested helper to allocate a nested HWPT for a vIOMMU object. Since a vIOMMU object holds the parent hwpt's refcount already, increase the refcount of the vIOMMU only. 
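With @pt_id now accepting a vIOMMU ID, a nested HWPT can be allocated directly on the vIOMMU from user space. A rough sketch (the stage-1 description passed through data/data_type is whatever the driver defines and is only a placeholder here; note the patch also requires the device and the vIOMMU to sit on the same physical IOMMU instance):

  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <linux/iommufd.h>

  static int hwpt_alloc_nested(int iommufd, __u32 dev_id, __u32 viommu_id,
                               __u32 data_type, void *data, __u32 data_len,
                               __u32 *out_hwpt_id)
  {
          struct iommu_hwpt_alloc cmd = {
                  .size = sizeof(cmd),
                  .dev_id = dev_id,
                  .pt_id = viommu_id,            /* vIOMMU instead of IOAS/parent HWPT */
                  .data_type = data_type,        /* driver-defined IOMMU_HWPT_DATA_* */
                  .data_len = data_len,
                  .data_uptr = (__u64)(uintptr_t)data,
          };

          if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd))
                  return -1;

          *out_hwpt_id = cmd.out_hwpt_id;
          return 0;
  }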
Link: https://patch.msgid.link/r/a0f24f32bfada8b448d17587adcaedeeb50a67ed.1730836219.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/hw_pagetable.c | 73 ++++++++++++++++++++++++- drivers/iommu/iommufd/iommufd_private.h | 1 + include/uapi/linux/iommufd.h | 14 +++-- 3 files changed, 81 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 8f020bc0815f..439aed9b58db 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -57,7 +57,10 @@ void iommufd_hwpt_nested_destroy(struct iommufd_object *obj) container_of(obj, struct iommufd_hwpt_nested, common.obj); __iommufd_hwpt_destroy(&hwpt_nested->common); - refcount_dec(&hwpt_nested->parent->common.obj.users); + if (hwpt_nested->viommu) + refcount_dec(&hwpt_nested->viommu->obj.users); + else + refcount_dec(&hwpt_nested->parent->common.obj.users); } void iommufd_hwpt_nested_abort(struct iommufd_object *obj) @@ -261,6 +264,58 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, return ERR_PTR(rc); } +/** + * iommufd_viommu_alloc_hwpt_nested() - Get a hwpt_nested for a vIOMMU + * @viommu: vIOMMU ojbect to associate the hwpt_nested/domain with + * @flags: Flags from userspace + * @user_data: user_data pointer. Must be valid + * + * Allocate a new IOMMU_DOMAIN_NESTED for a vIOMMU and return it as a NESTED + * hw_pagetable. + */ +static struct iommufd_hwpt_nested * +iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data) +{ + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_hw_pagetable *hwpt; + int rc; + + if (!user_data->len) + return ERR_PTR(-EOPNOTSUPP); + if (!viommu->ops || !viommu->ops->alloc_domain_nested) + return ERR_PTR(-EOPNOTSUPP); + + hwpt_nested = __iommufd_object_alloc( + viommu->ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj); + if (IS_ERR(hwpt_nested)) + return ERR_CAST(hwpt_nested); + hwpt = &hwpt_nested->common; + + hwpt_nested->viommu = viommu; + refcount_inc(&viommu->obj.users); + hwpt_nested->parent = viommu->hwpt; + + hwpt->domain = + viommu->ops->alloc_domain_nested(viommu, flags, user_data); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; + goto out_abort; + } + hwpt->domain->owner = viommu->iommu_dev->ops; + + if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { + rc = -EINVAL; + goto out_abort; + } + return hwpt_nested; + +out_abort: + iommufd_object_abort_and_destroy(viommu->ictx, &hwpt->obj); + return ERR_PTR(rc); +} + int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_alloc *cmd = ucmd->cmd; @@ -317,6 +372,22 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) goto out_unlock; } hwpt = &hwpt_nested->common; + } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) { + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_viommu *viommu; + + viommu = container_of(pt_obj, struct iommufd_viommu, obj); + if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) { + rc = -EINVAL; + goto out_unlock; + } + hwpt_nested = iommufd_viommu_alloc_hwpt_nested( + viommu, cmd->flags, &user_data); + if (IS_ERR(hwpt_nested)) { + rc = PTR_ERR(hwpt_nested); + goto out_unlock; + } + hwpt = &hwpt_nested->common; } else { rc = -EINVAL; goto out_put_pt; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index a8104d9d4cef..e8f5ef550cc9 100644 --- 
a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -290,6 +290,7 @@ struct iommufd_hwpt_paging { struct iommufd_hwpt_nested { struct iommufd_hw_pagetable common; struct iommufd_hwpt_paging *parent; + struct iommufd_viommu *viommu; }; static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index dc87c54afacd..293968d93fd6 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -443,7 +443,7 @@ enum iommu_hwpt_data_type { * @size: sizeof(struct iommu_hwpt_alloc) * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for - * @pt_id: The IOAS or HWPT to connect this HWPT to + * @pt_id: The IOAS or HWPT or vIOMMU to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT * @__reserved: Must be 0 * @data_type: One of enum iommu_hwpt_data_type @@ -462,11 +462,13 @@ enum iommu_hwpt_data_type { * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags. * - * A user-managed nested HWPT will be created from a given parent HWPT via - * @pt_id, in which the parent HWPT must be allocated previously via the - * same ioctl from a given IOAS (@pt_id). In this case, the @data_type - * must be set to a pre-defined type corresponding to an I/O page table - * type supported by the underlying IOMMU hardware. + * A user-managed nested HWPT will be created from a given vIOMMU (wrapping a + * parent HWPT) or a parent HWPT via @pt_id, in which the parent HWPT must be + * allocated previously via the same ioctl from a given IOAS (@pt_id). In this + * case, the @data_type must be set to a pre-defined type corresponding to an + * I/O page table type supported by the underlying IOMMU hardware. The device + * via @dev_id and the vIOMMU via @pt_id must be associated to the same IOMMU + * instance. * * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr -- Gitee From de084eac7b8c4048a77916d54873b4e1d39c3069 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:24 -0800 Subject: [PATCH 126/278] iommufd/selftest: Add container_of helpers ANBZ: #13617 commit fd6b853f50c868b03797e7ae87f1ed554aea01e4 upstream. Use these inline helpers to shorten those container_of lines. Note that one of them goes back and forth between iommu_domain and mock_iommu_domain, which isn't necessary. So drop its container_of. 
Link: https://patch.msgid.link/r/518ec64dae2e814eb29fd9f170f58a3aad56c81c.1730836219.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/selftest.c | 75 ++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 540437be168a..322e57ff3605 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -126,12 +126,24 @@ struct mock_iommu_domain { struct xarray pfns; }; +static inline struct mock_iommu_domain * +to_mock_domain(struct iommu_domain *domain) +{ + return container_of(domain, struct mock_iommu_domain, domain); +} + struct mock_iommu_domain_nested { struct iommu_domain domain; struct mock_iommu_domain *parent; u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM]; }; +static inline struct mock_iommu_domain_nested * +to_mock_nested(struct iommu_domain *domain) +{ + return container_of(domain, struct mock_iommu_domain_nested, domain); +} + enum selftest_obj_type { TYPE_IDEV, }; @@ -142,6 +154,11 @@ struct mock_dev { int id; }; +static inline struct mock_dev *to_mock_dev(struct device *dev) +{ + return container_of(dev, struct mock_dev, dev); +} + struct selftest_obj { struct iommufd_object obj; enum selftest_obj_type type; @@ -155,10 +172,15 @@ struct selftest_obj { }; }; +static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj) +{ + return container_of(obj, struct selftest_obj, obj); +} + static int mock_domain_nop_attach(struct iommu_domain *domain, struct device *dev) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = to_mock_dev(dev); if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) return -EINVAL; @@ -193,8 +215,7 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, bool enable) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); unsigned long flags = mock->flags; if (enable && !domain->dirty_ops) @@ -243,8 +264,7 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, unsigned long flags, struct iommu_dirty_bitmap *dirty) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); unsigned long end = iova + size; void *ent; @@ -281,7 +301,7 @@ static const struct iommu_dirty_ops dirty_ops = { static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = to_mock_dev(dev); struct mock_iommu_domain *mock; mock = kzalloc(sizeof(*mock), GFP_KERNEL); @@ -327,7 +347,7 @@ mock_domain_alloc_user(struct device *dev, u32 flags, /* must be mock_domain */ if (!parent) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = to_mock_dev(dev); bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; struct iommu_domain *domain; @@ -341,8 +361,7 @@ mock_domain_alloc_user(struct device *dev, u32 flags, if (!domain) return ERR_PTR(-ENOMEM); if (has_dirty_flag) - container_of(domain, struct mock_iommu_domain, domain) - ->domain.dirty_ops = 
&dirty_ops; + domain->dirty_ops = &dirty_ops; return domain; } @@ -352,7 +371,7 @@ mock_domain_alloc_user(struct device *dev, u32 flags, if (!parent || parent->ops != mock_ops.default_domain_ops) return ERR_PTR(-EINVAL); - mock_parent = container_of(parent, struct mock_iommu_domain, domain); + mock_parent = to_mock_domain(parent); if (!mock_parent) return ERR_PTR(-EINVAL); @@ -366,8 +385,7 @@ mock_domain_alloc_user(struct device *dev, u32 flags, static void mock_domain_free(struct iommu_domain *domain) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); WARN_ON(!xa_empty(&mock->pfns)); kfree(mock); @@ -378,8 +396,7 @@ static int mock_domain_map_pages(struct iommu_domain *domain, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); unsigned long flags = MOCK_PFN_START_IOVA; unsigned long start_iova = iova; @@ -430,8 +447,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain, size_t pgcount, struct iommu_iotlb_gather *iotlb_gather) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); bool first = true; size_t ret = 0; void *ent; @@ -479,8 +495,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain, static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); void *ent; WARN_ON(iova % MOCK_IO_PAGE_SIZE); @@ -491,7 +506,7 @@ static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = to_mock_dev(dev); switch (cap) { case IOMMU_CAP_CACHE_COHERENCY: @@ -571,18 +586,14 @@ static const struct iommu_ops mock_ops = { static void mock_domain_free_nested(struct iommu_domain *domain) { - struct mock_iommu_domain_nested *mock_nested = - container_of(domain, struct mock_iommu_domain_nested, domain); - - kfree(mock_nested); + kfree(to_mock_nested(domain)); } static int mock_domain_cache_invalidate_user(struct iommu_domain *domain, struct iommu_user_data_array *array) { - struct mock_iommu_domain_nested *mock_nested = - container_of(domain, struct mock_iommu_domain_nested, domain); + struct mock_iommu_domain_nested *mock_nested = to_mock_nested(domain); struct iommu_hwpt_invalidate_selftest inv; u32 processed = 0; int i = 0, j; @@ -657,7 +668,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } - *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain); + *mock = to_mock_domain(hwpt->domain); return hwpt; } @@ -675,14 +686,13 @@ get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id, iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } - *mock_nested = container_of(hwpt->domain, - struct mock_iommu_domain_nested, domain); + *mock_nested = to_mock_nested(hwpt->domain); return hwpt; } static void mock_dev_release(struct device *dev) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = 
to_mock_dev(dev); ida_free(&mock_dev_ida, mdev->id); kfree(mdev); @@ -813,7 +823,7 @@ static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, if (IS_ERR(dev_obj)) return PTR_ERR(dev_obj); - sobj = container_of(dev_obj, struct selftest_obj, obj); + sobj = to_selftest_obj(dev_obj); if (sobj->type != TYPE_IDEV) { rc = -EINVAL; goto out_dev_obj; @@ -951,8 +961,7 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - mock_nested = container_of(hwpt->domain, - struct mock_iommu_domain_nested, domain); + mock_nested = to_mock_nested(hwpt->domain); if (iotlb_id > MOCK_NESTED_DOMAIN_IOTLB_ID_MAX || mock_nested->iotlb[iotlb_id] != iotlb) @@ -1431,7 +1440,7 @@ static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd, void iommufd_selftest_destroy(struct iommufd_object *obj) { - struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj); + struct selftest_obj *sobj = to_selftest_obj(obj); switch (sobj->type) { case TYPE_IDEV: -- Gitee From e49600385bb7f9091c9ed679a5c485d4891f4458 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:25 -0800 Subject: [PATCH 127/278] iommufd/selftest: Prepare for mock_viommu_alloc_domain_nested() ANBZ: #13617 commit 18f819901d53b8d1101d3227c87b0d623e9d78f6 upstream. A nested domain now can be allocated for a parent domain or for a vIOMMU object. Rework the existing allocators to prepare for the latter case. Link: https://patch.msgid.link/r/f62894ad8ccae28a8a616845947fe4b76135d79b.1730836219.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/selftest.c | 89 ++++++++++++++++++-------------- 1 file changed, 50 insertions(+), 39 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 322e57ff3605..92d753985640 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -318,55 +318,39 @@ static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) return &mock->domain; } -static struct iommu_domain * -__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent, - const struct iommu_hwpt_selftest *user_cfg) +static struct mock_iommu_domain_nested * +__mock_domain_alloc_nested(const struct iommu_user_data *user_data) { struct mock_iommu_domain_nested *mock_nested; - int i; + struct iommu_hwpt_selftest user_cfg; + int rc, i; + + if (user_data->type != IOMMU_HWPT_DATA_SELFTEST) + return ERR_PTR(-EOPNOTSUPP); + + rc = iommu_copy_struct_from_user(&user_cfg, user_data, + IOMMU_HWPT_DATA_SELFTEST, iotlb); + if (rc) + return ERR_PTR(rc); mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL); if (!mock_nested) return ERR_PTR(-ENOMEM); - mock_nested->parent = mock_parent; mock_nested->domain.ops = &domain_nested_ops; mock_nested->domain.type = IOMMU_DOMAIN_NESTED; for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++) - mock_nested->iotlb[i] = user_cfg->iotlb; - return &mock_nested->domain; + mock_nested->iotlb[i] = user_cfg.iotlb; + return mock_nested; } static struct iommu_domain * -mock_domain_alloc_user(struct device *dev, u32 flags, - struct iommu_domain *parent, - const struct iommu_user_data *user_data) +mock_domain_alloc_nested(struct iommu_domain *parent, u32 flags, + const struct iommu_user_data *user_data) { + struct mock_iommu_domain_nested *mock_nested; struct mock_iommu_domain *mock_parent; - struct iommu_hwpt_selftest user_cfg; - int rc; - - /* 
must be mock_domain */ - if (!parent) { - struct mock_dev *mdev = to_mock_dev(dev); - bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; - struct iommu_domain *domain; - - if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) - return ERR_PTR(-EOPNOTSUPP); - if (user_data || (has_dirty_flag && no_dirty_ops)) - return ERR_PTR(-EOPNOTSUPP); - domain = mock_domain_alloc_paging(dev); - if (!domain) - return ERR_PTR(-ENOMEM); - if (has_dirty_flag) - domain->dirty_ops = &dirty_ops; - return domain; - } - /* must be mock_domain_nested */ - if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags) + if (flags) return ERR_PTR(-EOPNOTSUPP); if (!parent || parent->ops != mock_ops.default_domain_ops) return ERR_PTR(-EINVAL); @@ -375,12 +359,39 @@ mock_domain_alloc_user(struct device *dev, u32 flags, if (!mock_parent) return ERR_PTR(-EINVAL); - rc = iommu_copy_struct_from_user(&user_cfg, user_data, - IOMMU_HWPT_DATA_SELFTEST, iotlb); - if (rc) - return ERR_PTR(rc); + mock_nested = __mock_domain_alloc_nested(user_data); + if (IS_ERR(mock_nested)) + return ERR_CAST(mock_nested); + mock_nested->parent = mock_parent; + return &mock_nested->domain; +} + +static struct iommu_domain * +mock_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_NEST_PARENT; + bool no_dirty_ops = to_mock_dev(dev)->flags & + MOCK_FLAGS_DEVICE_NO_DIRTY; + struct iommu_domain *domain; + + if (parent) + return mock_domain_alloc_nested(parent, flags, user_data); - return __mock_domain_alloc_nested(mock_parent, &user_cfg); + if (user_data) + return ERR_PTR(-EOPNOTSUPP); + if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) + return ERR_PTR(-EOPNOTSUPP); + + domain = mock_domain_alloc_paging(dev); + if (!domain) + return ERR_PTR(-ENOMEM); + if (has_dirty_flag) + domain->dirty_ops = &dirty_ops; + return domain; } static void mock_domain_free(struct iommu_domain *domain) -- Gitee From 54f960dcd154453f9a478e83b841e490ac83cd4c Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:26 -0800 Subject: [PATCH 128/278] iommufd/selftest: Add refcount to mock_iommu_device ANBZ: #13617 commit 86070569450eec1a3b6a358be90a99cab87aa425 upstream. For an iommu_dev that can unplug (so far only this selftest does so), the viommu->iommu_dev pointer has no guarantee of its life cycle after it is copied from the idev->dev->iommu->iommu_dev. Track the user count of the iommu_dev. Postpone the exit routine using a completion, if refcount is unbalanced. The refcount inc/dec will be added in the following patch. 
Link: https://patch.msgid.link/r/33f28d64841b497eebef11b49a571e03103c5d24.1730836219.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/selftest.c | 39 +++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 92d753985640..4f67a83f667a 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -533,14 +533,17 @@ static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) static struct iopf_queue *mock_iommu_iopf_queue; -static struct iommu_device mock_iommu_device = { -}; +static struct mock_iommu_device { + struct iommu_device iommu_dev; + struct completion complete; + refcount_t users; +} mock_iommu; static struct iommu_device *mock_probe_device(struct device *dev) { if (dev->bus != &iommufd_mock_bus_type.bus) return ERR_PTR(-ENODEV); - return &mock_iommu_device; + return &mock_iommu.iommu_dev; } static void mock_domain_page_response(struct device *dev, struct iopf_fault *evt, @@ -1556,24 +1559,27 @@ int __init iommufd_test_init(void) if (rc) goto err_platform; - rc = iommu_device_sysfs_add(&mock_iommu_device, + rc = iommu_device_sysfs_add(&mock_iommu.iommu_dev, &selftest_iommu_dev->dev, NULL, "%s", dev_name(&selftest_iommu_dev->dev)); if (rc) goto err_bus; - rc = iommu_device_register_bus(&mock_iommu_device, &mock_ops, + rc = iommu_device_register_bus(&mock_iommu.iommu_dev, &mock_ops, &iommufd_mock_bus_type.bus, &iommufd_mock_bus_type.nb); if (rc) goto err_sysfs; + refcount_set(&mock_iommu.users, 1); + init_completion(&mock_iommu.complete); + mock_iommu_iopf_queue = iopf_queue_alloc("mock-iopfq"); return 0; err_sysfs: - iommu_device_sysfs_remove(&mock_iommu_device); + iommu_device_sysfs_remove(&mock_iommu.iommu_dev); err_bus: bus_unregister(&iommufd_mock_bus_type.bus); err_platform: @@ -1583,6 +1589,22 @@ int __init iommufd_test_init(void) return rc; } +static void iommufd_test_wait_for_users(void) +{ + if (refcount_dec_and_test(&mock_iommu.users)) + return; + /* + * Time out waiting for iommu device user count to become 0. + * + * Note that this is just making an example here, since the selftest is + * built into the iommufd module, i.e. it only unplugs the iommu device + * when unloading the module. So, it is expected that this WARN_ON will + * not trigger, as long as any iommufd FDs are open. + */ + WARN_ON(!wait_for_completion_timeout(&mock_iommu.complete, + msecs_to_jiffies(10000))); +} + void iommufd_test_exit(void) { if (mock_iommu_iopf_queue) { @@ -1590,8 +1612,9 @@ void iommufd_test_exit(void) mock_iommu_iopf_queue = NULL; } - iommu_device_sysfs_remove(&mock_iommu_device); - iommu_device_unregister_bus(&mock_iommu_device, + iommufd_test_wait_for_users(); + iommu_device_sysfs_remove(&mock_iommu.iommu_dev); + iommu_device_unregister_bus(&mock_iommu.iommu_dev, &iommufd_mock_bus_type.bus, &iommufd_mock_bus_type.nb); bus_unregister(&iommufd_mock_bus_type.bus); -- Gitee From d2b6c8a9215c260b8d1ecd43c789fbba625dabdf Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:27 -0800 Subject: [PATCH 129/278] iommufd/selftest: Add IOMMU_VIOMMU_TYPE_SELFTEST ANBZ: #13617 commit db70827a8827a1ac8d1063732d8c1182820e875c upstream. Implement the viommu alloc/free functions to increase/reduce refcount of its dependent mock iommu device. 
User space can verify this loop via the IOMMU_VIOMMU_TYPE_SELFTEST. Link: https://patch.msgid.link/r/9d755a215a3007d4d8d1c2513846830332db62aa.1730836219.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/iommufd_test.h | 2 + drivers/iommu/iommufd/selftest.c | 67 ++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index f4bc23a92f9a..edced4ac7cd3 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -180,4 +180,6 @@ struct iommu_hwpt_invalidate_selftest { __u32 iotlb_id; }; +#define IOMMU_VIOMMU_TYPE_SELFTEST 0xdeadbeef + #endif diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 4f67a83f667a..31c8f78a3a66 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -134,6 +134,7 @@ to_mock_domain(struct iommu_domain *domain) struct mock_iommu_domain_nested { struct iommu_domain domain; + struct mock_viommu *mock_viommu; struct mock_iommu_domain *parent; u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM]; }; @@ -144,6 +145,16 @@ to_mock_nested(struct iommu_domain *domain) return container_of(domain, struct mock_iommu_domain_nested, domain); } +struct mock_viommu { + struct iommufd_viommu core; + struct mock_iommu_domain *s2_parent; +}; + +static inline struct mock_viommu *to_mock_viommu(struct iommufd_viommu *viommu) +{ + return container_of(viommu, struct mock_viommu, core); +} + enum selftest_obj_type { TYPE_IDEV, }; @@ -569,6 +580,61 @@ static int mock_dev_disable_feat(struct device *dev, enum iommu_dev_features fea return 0; } +static void mock_viommu_destroy(struct iommufd_viommu *viommu) +{ + struct mock_iommu_device *mock_iommu = container_of( + viommu->iommu_dev, struct mock_iommu_device, iommu_dev); + + if (refcount_dec_and_test(&mock_iommu->users)) + complete(&mock_iommu->complete); + + /* iommufd core frees mock_viommu and viommu */ +} + +static struct iommu_domain * +mock_viommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data) +{ + struct mock_viommu *mock_viommu = to_mock_viommu(viommu); + struct mock_iommu_domain_nested *mock_nested; + + if (flags & ~IOMMU_HWPT_FAULT_ID_VALID) + return ERR_PTR(-EOPNOTSUPP); + + mock_nested = __mock_domain_alloc_nested(user_data); + if (IS_ERR(mock_nested)) + return ERR_CAST(mock_nested); + mock_nested->mock_viommu = mock_viommu; + mock_nested->parent = mock_viommu->s2_parent; + return &mock_nested->domain; +} + +static struct iommufd_viommu_ops mock_viommu_ops = { + .destroy = mock_viommu_destroy, + .alloc_domain_nested = mock_viommu_alloc_domain_nested, +}; + +static struct iommufd_viommu *mock_viommu_alloc(struct device *dev, + struct iommu_domain *domain, + struct iommufd_ctx *ictx, + unsigned int viommu_type) +{ + struct mock_iommu_device *mock_iommu = + iommu_get_iommu_dev(dev, struct mock_iommu_device, iommu_dev); + struct mock_viommu *mock_viommu; + + if (viommu_type != IOMMU_VIOMMU_TYPE_SELFTEST) + return ERR_PTR(-EOPNOTSUPP); + + mock_viommu = iommufd_viommu_alloc(ictx, struct mock_viommu, core, + &mock_viommu_ops); + if (IS_ERR(mock_viommu)) + return ERR_CAST(mock_viommu); + + refcount_inc(&mock_iommu->users); + return &mock_viommu->core; +} + static const struct iommu_ops mock_ops = { /* * IOMMU_DOMAIN_BLOCKED cannot be returned from def_domain_type() @@ 
-588,6 +654,7 @@ static const struct iommu_ops mock_ops = { .dev_enable_feat = mock_dev_enable_feat, .dev_disable_feat = mock_dev_disable_feat, .user_pasid_table = true, + .viommu_alloc = mock_viommu_alloc, .default_domain_ops = &(struct iommu_domain_ops){ .free = mock_domain_free, -- Gitee From 60c8b4deacf382e2aba2cfb6b917ca0e7c091b80 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:28 -0800 Subject: [PATCH 130/278] iommufd/selftest: Add IOMMU_VIOMMU_ALLOC test coverage ANBZ: #13617 commit 7156cd9ef24583c88bcc2f6d213f469aef38bfd9 upstream. Add a new iommufd_viommu FIXTURE and setup it up with a vIOMMU object. Any new vIOMMU feature will be added as a TEST_F under that. Link: https://patch.msgid.link/r/abe267c9d004b29cb1712ceba2f378209d4b7e01.1730836219.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- tools/testing/selftests/iommu/iommufd.c | 137 ++++++++++++++++++ .../selftests/iommu/iommufd_fail_nth.c | 11 ++ tools/testing/selftests/iommu/iommufd_utils.h | 28 ++++ 3 files changed, 176 insertions(+) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 88b92bb69756..37c7da283824 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -133,6 +133,7 @@ TEST_F(iommufd, cmd_length) TEST_LENGTH(iommu_option, IOMMU_OPTION, val64); TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS, __reserved); TEST_LENGTH(iommu_ioas_map_file, IOMMU_IOAS_MAP_FILE, iova); + TEST_LENGTH(iommu_viommu_alloc, IOMMU_VIOMMU_ALLOC, out_viommu_id); #undef TEST_LENGTH } @@ -2480,4 +2481,140 @@ TEST_F(vfio_compat_mock_domain, huge_map) } } +FIXTURE(iommufd_viommu) +{ + int fd; + uint32_t ioas_id; + uint32_t stdev_id; + uint32_t hwpt_id; + uint32_t nested_hwpt_id; + uint32_t device_id; + uint32_t viommu_id; +}; + +FIXTURE_VARIANT(iommufd_viommu) +{ + unsigned int viommu; +}; + +FIXTURE_SETUP(iommufd_viommu) +{ + self->fd = open("/dev/iommu", O_RDWR); + ASSERT_NE(-1, self->fd); + test_ioctl_ioas_alloc(&self->ioas_id); + test_ioctl_set_default_memory_limit(); + + if (variant->viommu) { + struct iommu_hwpt_selftest data = { + .iotlb = IOMMU_TEST_IOTLB_DEFAULT, + }; + + test_cmd_mock_domain(self->ioas_id, &self->stdev_id, NULL, + &self->device_id); + + /* Allocate a nesting parent hwpt */ + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, + &self->hwpt_id); + + /* Allocate a vIOMMU taking refcount of the parent hwpt */ + test_cmd_viommu_alloc(self->device_id, self->hwpt_id, + IOMMU_VIOMMU_TYPE_SELFTEST, + &self->viommu_id); + + /* Allocate a regular nested hwpt */ + test_cmd_hwpt_alloc_nested(self->device_id, self->viommu_id, 0, + &self->nested_hwpt_id, + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + } +} + +FIXTURE_TEARDOWN(iommufd_viommu) +{ + teardown_iommufd(self->fd, _metadata); +} + +FIXTURE_VARIANT_ADD(iommufd_viommu, no_viommu) +{ + .viommu = 0, +}; + +FIXTURE_VARIANT_ADD(iommufd_viommu, mock_viommu) +{ + .viommu = 1, +}; + +TEST_F(iommufd_viommu, viommu_auto_destroy) +{ +} + +TEST_F(iommufd_viommu, viommu_negative_tests) +{ + uint32_t device_id = self->device_id; + uint32_t ioas_id = self->ioas_id; + uint32_t hwpt_id; + + if (self->device_id) { + /* Negative test -- invalid hwpt (hwpt_id=0) */ + test_err_viommu_alloc(ENOENT, device_id, 0, + IOMMU_VIOMMU_TYPE_SELFTEST, NULL); + + /* Negative test -- not a nesting parent hwpt */ + test_cmd_hwpt_alloc(device_id, ioas_id, 0, &hwpt_id); + 
test_err_viommu_alloc(EINVAL, device_id, hwpt_id, + IOMMU_VIOMMU_TYPE_SELFTEST, NULL); + test_ioctl_destroy(hwpt_id); + + /* Negative test -- unsupported viommu type */ + test_err_viommu_alloc(EOPNOTSUPP, device_id, self->hwpt_id, + 0xdead, NULL); + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, self->hwpt_id)); + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, self->viommu_id)); + } else { + test_err_viommu_alloc(ENOENT, self->device_id, self->hwpt_id, + IOMMU_VIOMMU_TYPE_SELFTEST, NULL); + } +} + +TEST_F(iommufd_viommu, viommu_alloc_nested_iopf) +{ + struct iommu_hwpt_selftest data = { + .iotlb = IOMMU_TEST_IOTLB_DEFAULT, + }; + uint32_t viommu_id = self->viommu_id; + uint32_t dev_id = self->device_id; + uint32_t iopf_hwpt_id; + uint32_t fault_id; + uint32_t fault_fd; + + if (self->device_id) { + test_ioctl_fault_alloc(&fault_id, &fault_fd); + test_err_hwpt_alloc_iopf( + ENOENT, dev_id, viommu_id, UINT32_MAX, + IOMMU_HWPT_FAULT_ID_VALID, &iopf_hwpt_id, + IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data)); + test_err_hwpt_alloc_iopf( + EOPNOTSUPP, dev_id, viommu_id, fault_id, + IOMMU_HWPT_FAULT_ID_VALID | (1 << 31), &iopf_hwpt_id, + IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data)); + test_cmd_hwpt_alloc_iopf( + dev_id, viommu_id, fault_id, IOMMU_HWPT_FAULT_ID_VALID, + &iopf_hwpt_id, IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + + test_cmd_mock_domain_replace(self->stdev_id, iopf_hwpt_id); + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, iopf_hwpt_id)); + test_cmd_trigger_iopf(dev_id, fault_fd); + + test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); + test_ioctl_destroy(iopf_hwpt_id); + close(fault_fd); + test_ioctl_destroy(fault_id); + } +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 2d7d01638be8..fb618485d7ca 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -621,6 +621,7 @@ TEST_FAIL_NTH(basic_fail_nth, device) uint32_t stdev_id; uint32_t idev_id; uint32_t hwpt_id; + uint32_t viommu_id; __u64 iova; self->fd = open("/dev/iommu", O_RDWR); @@ -663,6 +664,16 @@ TEST_FAIL_NTH(basic_fail_nth, device) if (_test_cmd_mock_domain_replace(self->fd, stdev_id, hwpt_id, NULL)) return -1; + + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, + IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id, + IOMMU_HWPT_DATA_NONE, 0, 0)) + return -1; + + if (_test_cmd_viommu_alloc(self->fd, idev_id, hwpt_id, + IOMMU_VIOMMU_TYPE_SELFTEST, 0, &viommu_id)) + return -1; + return 0; } diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 6a11c26370f3..7dabc261fae2 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -819,3 +819,31 @@ static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 fault_fd) #define test_cmd_trigger_iopf(device_id, fault_fd) \ ASSERT_EQ(0, _test_cmd_trigger_iopf(self->fd, device_id, fault_fd)) + +static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id, + __u32 type, __u32 flags, __u32 *viommu_id) +{ + struct iommu_viommu_alloc cmd = { + .size = sizeof(cmd), + .flags = flags, + .type = type, + .dev_id = device_id, + .hwpt_id = hwpt_id, + }; + int ret; + + ret = ioctl(fd, IOMMU_VIOMMU_ALLOC, &cmd); + if (ret) + return ret; + if (viommu_id) + *viommu_id = cmd.out_viommu_id; + return 0; +} + +#define test_cmd_viommu_alloc(device_id, hwpt_id, type, viommu_id) \ + ASSERT_EQ(0, 
_test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, \ + type, 0, viommu_id)) +#define test_err_viommu_alloc(_errno, device_id, hwpt_id, type, viommu_id) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, \ + type, 0, viommu_id)) -- Gitee From 148d5d15cc700ee073ea23c84ff82b76e69bc859 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:05:09 -0800 Subject: [PATCH 131/278] iommufd/viommu: Add IOMMUFD_OBJ_VDEVICE and IOMMU_VDEVICE_ALLOC ioctl ANBZ: #13617 commit 0ce5c2477af2e2284b9c70474e4dae85db211680 upstream. Introduce a new IOMMUFD_OBJ_VDEVICE to represent a physical device (struct device) against a vIOMMU (struct iommufd_viommu) object in a VM. This vDEVICE object (and its structure) holds all the infos and attributes in the VM, regarding the device related to the vIOMMU. As an initial patch, add a per-vIOMMU virtual ID. This can be: - Virtual StreamID on a nested ARM SMMUv3, an index to a Stream Table - Virtual DeviceID on a nested AMD IOMMU, an index to a Device Table - Virtual RID on a nested Intel VT-D IOMMU, an index to a Context Table Potentially, this vDEVICE structure would hold some vData for Confidential Compute Architecture (CCA). Use this virtual ID to index an "vdevs" xarray that belongs to a vIOMMU object. Add a new ioctl for vDEVICE allocations. Since a vDEVICE is a connection of a device object and an iommufd_viommu object, take two refcounts in the ioctl handler. Link: https://patch.msgid.link/r/cda8fd2263166e61b8191a3b3207e0d2b08545bf.1730836308.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/iommufd_private.h | 18 ++++++ drivers/iommu/iommufd/main.c | 6 ++ drivers/iommu/iommufd/viommu.c | 76 +++++++++++++++++++++++++ include/linux/iommufd.h | 4 ++ include/uapi/linux/iommufd.h | 22 +++++++ 5 files changed, 126 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index e8f5ef550cc9..062656c19a07 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -507,8 +507,26 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev, return iommu_group_replace_domain(idev->igroup->group, hwpt->domain); } +static inline struct iommufd_viommu * +iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_VIOMMU), + struct iommufd_viommu, obj); +} + int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_viommu_destroy(struct iommufd_object *obj); +int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_vdevice_destroy(struct iommufd_object *obj); + +struct iommufd_vdevice { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_viommu *viommu; + struct device *dev; + u64 id; /* per-vIOMMU virtual ID */ +}; #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index cc514f9bc3e6..d735fe04197f 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -308,6 +308,7 @@ union ucmd_buffer { struct iommu_option option; struct iommu_vfio_ioas vfio_ioas; struct iommu_viommu_alloc viommu; + struct iommu_vdevice_alloc vdev; #ifdef CONFIG_IOMMUFD_TEST struct iommu_test_cmd test; #endif @@ -363,6 +364,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { __reserved), IOCTL_OP(IOMMU_VIOMMU_ALLOC, 
iommufd_viommu_alloc_ioctl, struct iommu_viommu_alloc, out_viommu_id), + IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl, + struct iommu_vdevice_alloc, virt_id), #ifdef CONFIG_IOMMUFD_TEST IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), #endif @@ -501,6 +504,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_VIOMMU] = { .destroy = iommufd_viommu_destroy, }, + [IOMMUFD_OBJ_VDEVICE] = { + .destroy = iommufd_vdevice_destroy, + }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { .destroy = iommufd_selftest_destroy, diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 888239b78667..69b88e8c7c26 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -11,6 +11,7 @@ void iommufd_viommu_destroy(struct iommufd_object *obj) if (viommu->ops && viommu->ops->destroy) viommu->ops->destroy(viommu); refcount_dec(&viommu->hwpt->common.obj.users); + xa_destroy(&viommu->vdevs); } int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) @@ -53,6 +54,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) goto out_put_hwpt; } + xa_init(&viommu->vdevs); viommu->type = cmd->type; viommu->ictx = ucmd->ictx; viommu->hwpt = hwpt_paging; @@ -79,3 +81,77 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) iommufd_put_object(ucmd->ictx, &idev->obj); return rc; } + +void iommufd_vdevice_destroy(struct iommufd_object *obj) +{ + struct iommufd_vdevice *vdev = + container_of(obj, struct iommufd_vdevice, obj); + struct iommufd_viommu *viommu = vdev->viommu; + + /* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */ + xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL); + refcount_dec(&viommu->obj.users); + put_device(vdev->dev); +} + +int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_vdevice_alloc *cmd = ucmd->cmd; + struct iommufd_vdevice *vdev, *curr; + struct iommufd_viommu *viommu; + struct iommufd_device *idev; + u64 virt_id = cmd->virt_id; + int rc = 0; + + /* virt_id indexes an xarray */ + if (virt_id > ULONG_MAX) + return -EINVAL; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) { + rc = PTR_ERR(idev); + goto out_put_viommu; + } + + if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) { + rc = -EINVAL; + goto out_put_idev; + } + + vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE); + if (IS_ERR(vdev)) { + rc = PTR_ERR(vdev); + goto out_put_idev; + } + + vdev->id = virt_id; + vdev->dev = idev->dev; + get_device(idev->dev); + vdev->viommu = viommu; + refcount_inc(&viommu->obj.users); + + curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL); + if (curr) { + rc = xa_err(curr) ?: -EEXIST; + goto out_abort; + } + + cmd->out_vdevice_id = vdev->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_abort; + iommufd_object_finalize(ucmd->ictx, &vdev->obj); + goto out_put_idev; + +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj); +out_put_idev: + iommufd_put_object(ucmd->ictx, &idev->obj); +out_put_viommu: + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index de9b56265c9c..71fa1e343023 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -10,6 +10,7 @@ #include #include #include +#include struct device; struct file; @@ -31,6 +32,7 @@ enum 
iommufd_object_type { IOMMUFD_OBJ_ACCESS, IOMMUFD_OBJ_FAULT, IOMMUFD_OBJ_VIOMMU, + IOMMUFD_OBJ_VDEVICE, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif @@ -89,6 +91,8 @@ struct iommufd_viommu { const struct iommufd_viommu_ops *ops; + struct xarray vdevs; + unsigned int type; }; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 293968d93fd6..964e941a0a17 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -53,6 +53,7 @@ enum { IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f, IOMMUFD_CMD_VIOMMU_ALLOC = 0x90, + IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, }; /** @@ -907,4 +908,25 @@ struct iommu_viommu_alloc { __u32 out_viommu_id; }; #define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC) + +/** + * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC) + * @size: sizeof(struct iommu_vdevice_alloc) + * @viommu_id: vIOMMU ID to associate with the virtual device + * @dev_id: The physical device to allocate a virtual instance on the vIOMMU + * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTROY + * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID + * of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table + * + * Allocate a virtual device instance (for a physical device) against a vIOMMU. + * This instance holds the device's information (related to its vIOMMU) in a VM. + */ +struct iommu_vdevice_alloc { + __u32 size; + __u32 viommu_id; + __u32 dev_id; + __u32 out_vdevice_id; + __aligned_u64 virt_id; +}; +#define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC) #endif -- Gitee From ae561634ed9cad99c72d6a3280cd70812a090960 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:05:10 -0800 Subject: [PATCH 132/278] iommufd/selftest: Add IOMMU_VDEVICE_ALLOC test coverage ANBZ: #13617 commit 5778c75703c6e01ffd70a429b9015bed8008a5fd upstream. Add a vdevice_alloc op to the viommu mock_viommu_ops for the coverage of IOMMU_VIOMMU_TYPE_SELFTEST allocations. Then, add a vdevice_alloc TEST_F to cover the IOMMU_VDEVICE_ALLOC ioctl. 
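Roughly, the allocation exercised by the new TEST_F looks like the sketch below (illustrative only, not part of this patch; viommu_id and dev_id are assumed to come from earlier IOMMU_VIOMMU_ALLOC and mock domain setup, and 0x99 is just an arbitrary guest-visible ID such as a vSID/vRID):

	struct iommu_vdevice_alloc cmd = {
		.size = sizeof(cmd),
		.viommu_id = viommu_id,
		.dev_id = dev_id,
		.virt_id = 0x99,	/* per-vIOMMU virtual ID chosen by the VMM */
	};

	if (ioctl(iommufd, IOMMU_VDEVICE_ALLOC, &cmd))
		return -errno;
	vdev_id = cmd.out_vdevice_id;	/* released later via IOMMU_DESTROY */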
Link: https://patch.msgid.link/r/4b9607e5b86726c8baa7b89bd48123fb44104a23.1730836308.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- tools/testing/selftests/iommu/iommufd.c | 20 ++++++++++++++ .../selftests/iommu/iommufd_fail_nth.c | 4 +++ tools/testing/selftests/iommu/iommufd_utils.h | 27 +++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 37c7da283824..f3cb628753c9 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -134,6 +134,7 @@ TEST_F(iommufd, cmd_length) TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS, __reserved); TEST_LENGTH(iommu_ioas_map_file, IOMMU_IOAS_MAP_FILE, iova); TEST_LENGTH(iommu_viommu_alloc, IOMMU_VIOMMU_ALLOC, out_viommu_id); + TEST_LENGTH(iommu_vdevice_alloc, IOMMU_VDEVICE_ALLOC, virt_id); #undef TEST_LENGTH } @@ -2617,4 +2618,23 @@ TEST_F(iommufd_viommu, viommu_alloc_nested_iopf) } } +TEST_F(iommufd_viommu, vdevice_alloc) +{ + uint32_t viommu_id = self->viommu_id; + uint32_t dev_id = self->device_id; + uint32_t vdev_id = 0; + + if (dev_id) { + /* Set vdev_id to 0x99, unset it, and set to 0x88 */ + test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id); + test_err_vdevice_alloc(EEXIST, viommu_id, dev_id, 0x99, + &vdev_id); + test_ioctl_destroy(vdev_id); + test_cmd_vdevice_alloc(viommu_id, dev_id, 0x88, &vdev_id); + test_ioctl_destroy(vdev_id); + } else { + test_err_vdevice_alloc(ENOENT, viommu_id, dev_id, 0x99, NULL); + } +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index fb618485d7ca..22f6fd5f0f74 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -622,6 +622,7 @@ TEST_FAIL_NTH(basic_fail_nth, device) uint32_t idev_id; uint32_t hwpt_id; uint32_t viommu_id; + uint32_t vdev_id; __u64 iova; self->fd = open("/dev/iommu", O_RDWR); @@ -674,6 +675,9 @@ TEST_FAIL_NTH(basic_fail_nth, device) IOMMU_VIOMMU_TYPE_SELFTEST, 0, &viommu_id)) return -1; + if (_test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, 0, &vdev_id)) + return -1; + return 0; } diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 7dabc261fae2..7fe905924d72 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -847,3 +847,30 @@ static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id, EXPECT_ERRNO(_errno, \ _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, \ type, 0, viommu_id)) + +static int _test_cmd_vdevice_alloc(int fd, __u32 viommu_id, __u32 idev_id, + __u64 virt_id, __u32 *vdev_id) +{ + struct iommu_vdevice_alloc cmd = { + .size = sizeof(cmd), + .dev_id = idev_id, + .viommu_id = viommu_id, + .virt_id = virt_id, + }; + int ret; + + ret = ioctl(fd, IOMMU_VDEVICE_ALLOC, &cmd); + if (ret) + return ret; + if (vdev_id) + *vdev_id = cmd.out_vdevice_id; + return 0; +} + +#define test_cmd_vdevice_alloc(viommu_id, idev_id, virt_id, vdev_id) \ + ASSERT_EQ(0, _test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, \ + virt_id, vdev_id)) +#define test_err_vdevice_alloc(_errno, viommu_id, idev_id, virt_id, vdev_id) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, \ + virt_id, vdev_id)) -- Gitee From 
fe9556ecd20580eb706b6a9e4236e297d20786a1 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:05:11 -0800 Subject: [PATCH 133/278] iommu/viommu: Add cache_invalidate to iommufd_viommu_ops ANBZ: #13617 commit 67db79dc1a411335f8a7e53a5e206d96b237d9a8 upstream. This per-vIOMMU cache_invalidate op is like the cache_invalidate_user op in struct iommu_domain_ops, but wider, supporting device cache (e.g. PCI ATC invalidations). Link: https://patch.msgid.link/r/90138505850fa6b165135e78a87b4cc7022869a4.1730836308.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- include/linux/iommufd.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 71fa1e343023..2bc735ff9511 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -16,6 +16,7 @@ struct device; struct file; struct iommu_group; struct iommu_user_data; +struct iommu_user_data_array; struct iommufd_access; struct iommufd_ctx; struct iommufd_device; @@ -105,12 +106,21 @@ struct iommufd_viommu { * must be defined in include/uapi/linux/iommufd.h. * It must fully initialize the new iommu_domain before * returning. Upon failure, ERR_PTR must be returned. + * @cache_invalidate: Flush hardware cache used by a vIOMMU. It can be used for + * any IOMMU hardware specific cache: TLB and device cache. + * The @array passes in the cache invalidation requests, in + * form of a driver data structure. A driver must update the + * array->entry_num to report the number of handled requests. + * The data structure of the array entry must be defined in + * include/uapi/linux/iommufd.h */ struct iommufd_viommu_ops { void (*destroy)(struct iommufd_viommu *viommu); struct iommu_domain *(*alloc_domain_nested)( struct iommufd_viommu *viommu, u32 flags, const struct iommu_user_data *user_data); + int (*cache_invalidate)(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array); }; #if IS_ENABLED(CONFIG_IOMMUFD) -- Gitee From b573c054fa8a14c58533669a6e99f8b2a1544200 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:05:12 -0800 Subject: [PATCH 134/278] iommufd: Allow hwpt_id to carry viommu_id for IOMMU_HWPT_INVALIDATE ANBZ: #13617 commit 54ce69e36c71c88f258b1a322c54343d90954858 upstream. With a vIOMMU object, user space can flush any IOMMU-related cache that can be directed via a vIOMMU object. It is similar to the IOMMU_HWPT_INVALIDATE uAPI, but can cover a wider range than IOTLB, e.g. device/descriptor cache. Allow hwpt_id of the iommu_hwpt_invalidate structure to carry a viommu_id, and reuse the IOMMU_HWPT_INVALIDATE uAPI for vIOMMU invalidations. Drivers can define different structures for vIOMMU invalidations vs. HWPT ones. Since both the HWPT-based and vIOMMU-based invalidation pathways check their own cache invalidation op, remove the WARN_ON_ONCE in the allocator. Update the uAPI, kdoc, and selftest case accordingly. 
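For example, a VMM holding a vIOMMU ID could flush a batch of driver-defined cache entries roughly as sketched below (illustrative only, not part of this patch; struct my_inv_entry and driver_inv_data_type stand in for the driver's invalidation entry layout and its enum iommu_hwpt_invalidate_data_type value):

	struct my_inv_entry entries[N];		/* filled with driver-specific requests */
	struct iommu_hwpt_invalidate cmd = {
		.size = sizeof(cmd),
		.hwpt_id = viommu_id,		/* a vIOMMU ID is now accepted here */
		.data_type = driver_inv_data_type,
		.data_uptr = (uintptr_t)entries,
		.entry_len = sizeof(entries[0]),
		.entry_num = N,
	};

	if (ioctl(iommufd, IOMMU_HWPT_INVALIDATE, &cmd))
		return -errno;
	/* on return, cmd.entry_num reports how many requests were handled */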
Link: https://patch.msgid.link/r/b411e2245e303b8a964f39f49453a5dff280968f.1730836308.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/hw_pagetable.c | 40 +++++++++++++++++++------ include/uapi/linux/iommufd.h | 9 ++++-- tools/testing/selftests/iommu/iommufd.c | 4 +-- 3 files changed, 39 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 439aed9b58db..9236e8ca9aa8 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -252,8 +252,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, } hwpt->domain->owner = ops; - if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED || - !hwpt->domain->ops->cache_invalidate_user)) { + if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EINVAL; goto out_abort; } @@ -484,7 +483,7 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd) .entry_len = cmd->entry_len, .entry_num = cmd->entry_num, }; - struct iommufd_hw_pagetable *hwpt; + struct iommufd_object *pt_obj; u32 done_num = 0; int rc; @@ -498,17 +497,40 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd) goto out; } - hwpt = iommufd_get_hwpt_nested(ucmd, cmd->hwpt_id); - if (IS_ERR(hwpt)) { - rc = PTR_ERR(hwpt); + pt_obj = iommufd_get_object(ucmd->ictx, cmd->hwpt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) { + rc = PTR_ERR(pt_obj); goto out; } + if (pt_obj->type == IOMMUFD_OBJ_HWPT_NESTED) { + struct iommufd_hw_pagetable *hwpt = + container_of(pt_obj, struct iommufd_hw_pagetable, obj); + + if (!hwpt->domain->ops || + !hwpt->domain->ops->cache_invalidate_user) { + rc = -EOPNOTSUPP; + goto out_put_pt; + } + rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, + &data_array); + } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) { + struct iommufd_viommu *viommu = + container_of(pt_obj, struct iommufd_viommu, obj); + + if (!viommu->ops || !viommu->ops->cache_invalidate) { + rc = -EOPNOTSUPP; + goto out_put_pt; + } + rc = viommu->ops->cache_invalidate(viommu, &data_array); + } else { + rc = -EINVAL; + goto out_put_pt; + } - rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, - &data_array); done_num = data_array.entry_num; - iommufd_put_object(ucmd->ictx, &hwpt->obj); +out_put_pt: + iommufd_put_object(ucmd->ictx, pt_obj); out: cmd->entry_num = done_num; if (iommufd_ucmd_respond(ucmd, sizeof(*cmd))) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 964e941a0a17..c0929b7cd7ab 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -743,7 +743,7 @@ struct iommu_hwpt_vtd_s1_invalidate { /** * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE) * @size: sizeof(struct iommu_hwpt_invalidate) - * @hwpt_id: ID of a nested HWPT for cache invalidation + * @hwpt_id: ID of a nested HWPT or a vIOMMU, for cache invalidation * @data_uptr: User pointer to an array of driver-specific cache invalidation * data. * @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data @@ -754,8 +754,11 @@ struct iommu_hwpt_vtd_s1_invalidate { * Output the number of requests successfully handled by kernel. * @__reserved: Must be 0. * - * Invalidate the iommu cache for user-managed page table. Modifications on a - * user-managed page table should be followed by this operation to sync cache. + * Invalidate iommu cache for user-managed page table or vIOMMU. 
Modifications + * on a user-managed page table should be followed by this operation, if a HWPT + * is passed in via @hwpt_id. Other caches, such as device cache or descriptor + * cache can be flushed if a vIOMMU is passed in via the @hwpt_id field. + * * Each ioctl can support one or more cache invalidation requests in the array * that has a total size of @entry_len * @entry_num. * diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index f3cb628753c9..8cb3e835ca97 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -367,9 +367,9 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, parent_hwpt_id)); - /* hwpt_invalidate only supports a user-managed hwpt (nested) */ + /* hwpt_invalidate does not support a parent hwpt */ num_inv = 1; - test_err_hwpt_invalidate(ENOENT, parent_hwpt_id, inv_reqs, + test_err_hwpt_invalidate(EINVAL, parent_hwpt_id, inv_reqs, IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, sizeof(*inv_reqs), &num_inv); assert(!num_inv); -- Gitee From e6ee13fb81f39528fd0398b5124cfa7c07028cd8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 5 Nov 2024 12:05:13 -0800 Subject: [PATCH 135/278] iommu: Add iommu_copy_struct_from_full_user_array helper ANBZ: #13617 commit 4f2e59ccb69866c5165525bd7aee47cd5cd00acc upstream. The iommu_copy_struct_from_user_array helper can be used to copy a single entry from a user array which might not be efficient if the array is big. Add a new iommu_copy_struct_from_full_user_array to copy the entire user array at once. Update the existing iommu_copy_struct_from_user_array kdoc accordingly. Link: https://patch.msgid.link/r/5cd773d9c26920c5807d232b21d415ea79172e49.1730836308.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- include/linux/iommu.h | 48 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 82be5507d9dc..7188d60ecb28 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -501,7 +501,9 @@ static inline int __iommu_copy_struct_from_user_array( * @index: Index to the location in the array to copy user data from * @min_last: The last member of the data structure @kdst points in the * initial version. - * Return 0 for success, otherwise -error. + * + * Copy a single entry from a user array. Return 0 for success, otherwise + * -error. */ #define iommu_copy_struct_from_user_array(kdst, user_array, data_type, index, \ min_last) \ @@ -509,6 +511,50 @@ static inline int __iommu_copy_struct_from_user_array( kdst, user_array, data_type, index, sizeof(*(kdst)), \ offsetofend(typeof(*(kdst)), min_last)) +/** + * iommu_copy_struct_from_full_user_array - Copy iommu driver specific user + * space data from an iommu_user_data_array + * @kdst: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @kdst_entry_size: sizeof(*kdst) + * @user_array: Pointer to a struct iommu_user_data_array for a user space + * array + * @data_type: The data type of the @kdst. Must match with @user_array->type + * + * Copy the entire user array. kdst must have room for kdst_entry_size * + * user_array->entry_num bytes. Return 0 for success, otherwise -error. 
+ */ +static inline int +iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, + struct iommu_user_data_array *user_array, + unsigned int data_type) +{ + unsigned int i; + int ret; + + if (user_array->type != data_type) + return -EINVAL; + if (!user_array->entry_num) + return -EINVAL; + if (likely(user_array->entry_len == kdst_entry_size)) { + if (copy_from_user(kdst, user_array->uptr, + user_array->entry_num * + user_array->entry_len)) + return -EFAULT; + } + + /* Copy item by item */ + for (i = 0; i != user_array->entry_num; i++) { + ret = copy_struct_from_user( + kdst + kdst_entry_size * i, kdst_entry_size, + user_array->uptr + user_array->entry_len * i, + user_array->entry_len); + if (ret) + return ret; + } + return 0; +} + /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability -- Gitee From 4f2600b9461ba53bb75a8e116112d9dda7c60018 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:05:14 -0800 Subject: [PATCH 136/278] iommufd/viommu: Add iommufd_viommu_find_dev helper ANBZ: #13617 commit c747e67978ff473e6830afe0b1f3986dd1f444b5 upstream. This avoids a bigger trouble of exposing struct iommufd_device and struct iommufd_vdevice in the public header. Link: https://patch.msgid.link/r/84fa7c624db4d4508067ccfdf42059533950180a.1730836308.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/driver.c | 13 +++++++++++++ include/linux/iommufd.h | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c index 2bc47d92a0ab..7b67fdf44134 100644 --- a/drivers/iommu/iommufd/driver.c +++ b/drivers/iommu/iommufd/driver.c @@ -36,5 +36,18 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, } EXPORT_SYMBOL_NS_GPL(_iommufd_object_alloc, IOMMUFD); +/* Caller should xa_lock(&viommu->vdevs) to protect the return value */ +struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, + unsigned long vdev_id) +{ + struct iommufd_vdevice *vdev; + + lockdep_assert_held(&viommu->vdevs.xa_lock); + + vdev = xa_load(&viommu->vdevs, vdev_id); + return vdev ? 
vdev->dev : NULL; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_dev, IOMMUFD); + MODULE_DESCRIPTION("iommufd code shared with builtin modules"); MODULE_LICENSE("GPL"); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 2bc735ff9511..11110c749200 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -185,6 +185,8 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx) struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type); +struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, + unsigned long vdev_id); #else /* !CONFIG_IOMMUFD_DRIVER_CORE */ static inline struct iommufd_object * _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, @@ -192,6 +194,12 @@ _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, { return ERR_PTR(-EOPNOTSUPP); } + +static inline struct device * +iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id) +{ + return NULL; +} #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ /* -- Gitee From b3ccd892c07b32728a32918783cca289aac2a0fb Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:05:15 -0800 Subject: [PATCH 137/278] iommufd/selftest: Add mock_viommu_cache_invalidate ANBZ: #13617 commit d6563aa2a8309270e042abd22a3e5fe44a6daca2 upstream. Similar to the coverage of cache_invalidate_user for iotlb invalidation, add a device cache and a viommu_cache_invalidate function to test it out. Link: https://patch.msgid.link/r/a29c7c23d7cd143fb26ab68b3618e0957f485fdb.1730836308.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/iommufd_test.h | 25 +++++++++ drivers/iommu/iommufd/selftest.c | 76 +++++++++++++++++++++++++++- 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index edced4ac7cd3..46558f83e734 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -54,6 +54,11 @@ enum { MOCK_NESTED_DOMAIN_IOTLB_NUM = 4, }; +enum { + MOCK_DEV_CACHE_ID_MAX = 3, + MOCK_DEV_CACHE_NUM = 4, +}; + struct iommu_test_cmd { __u32 size; __u32 op; @@ -152,6 +157,7 @@ struct iommu_test_hw_info { /* Should not be equal to any defined value in enum iommu_hwpt_data_type */ #define IOMMU_HWPT_DATA_SELFTEST 0xdead #define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef +#define IOMMU_TEST_DEV_CACHE_DEFAULT 0xbaddad /** * struct iommu_hwpt_selftest @@ -182,4 +188,23 @@ struct iommu_hwpt_invalidate_selftest { #define IOMMU_VIOMMU_TYPE_SELFTEST 0xdeadbeef +/* Should not be equal to any defined value in enum iommu_viommu_invalidate_data_type */ +#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST 0xdeadbeef +#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID 0xdadbeef + +/** + * struct iommu_viommu_invalidate_selftest - Invalidation data for Mock VIOMMU + * (IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) + * @flags: Invalidate flags + * @cache_id: Invalidate cache entry index + * + * If IOMMU_TEST_INVALIDATE_ALL is set in @flags, @cache_id will be ignored + */ +struct iommu_viommu_invalidate_selftest { +#define IOMMU_TEST_INVALIDATE_FLAG_ALL (1 << 0) + __u32 flags; + __u32 vdev_id; + __u32 cache_id; +}; + #endif diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 31c8f78a3a66..e20498667a2c 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -163,6 +163,7 @@ 
struct mock_dev { struct device dev; unsigned long flags; int id; + u32 cache[MOCK_DEV_CACHE_NUM]; }; static inline struct mock_dev *to_mock_dev(struct device *dev) @@ -609,9 +610,80 @@ mock_viommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, return &mock_nested->domain; } +static int mock_viommu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array) +{ + struct iommu_viommu_invalidate_selftest *cmds; + struct iommu_viommu_invalidate_selftest *cur; + struct iommu_viommu_invalidate_selftest *end; + int rc; + + /* A zero-length array is allowed to validate the array type */ + if (array->entry_num == 0 && + array->type == IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) { + array->entry_num = 0; + return 0; + } + + cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL); + if (!cmds) + return -ENOMEM; + cur = cmds; + end = cmds + array->entry_num; + + static_assert(sizeof(*cmds) == 3 * sizeof(u32)); + rc = iommu_copy_struct_from_full_user_array( + cmds, sizeof(*cmds), array, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST); + if (rc) + goto out; + + while (cur != end) { + struct mock_dev *mdev; + struct device *dev; + int i; + + if (cur->flags & ~IOMMU_TEST_INVALIDATE_FLAG_ALL) { + rc = -EOPNOTSUPP; + goto out; + } + + if (cur->cache_id > MOCK_DEV_CACHE_ID_MAX) { + rc = -EINVAL; + goto out; + } + + xa_lock(&viommu->vdevs); + dev = iommufd_viommu_find_dev(viommu, + (unsigned long)cur->vdev_id); + if (!dev) { + xa_unlock(&viommu->vdevs); + rc = -EINVAL; + goto out; + } + mdev = container_of(dev, struct mock_dev, dev); + + if (cur->flags & IOMMU_TEST_INVALIDATE_FLAG_ALL) { + /* Invalidate all cache entries and ignore cache_id */ + for (i = 0; i < MOCK_DEV_CACHE_NUM; i++) + mdev->cache[i] = 0; + } else { + mdev->cache[cur->cache_id] = 0; + } + xa_unlock(&viommu->vdevs); + + cur++; + } +out: + array->entry_num = cur - cmds; + kfree(cmds); + return rc; +} + static struct iommufd_viommu_ops mock_viommu_ops = { .destroy = mock_viommu_destroy, .alloc_domain_nested = mock_viommu_alloc_domain_nested, + .cache_invalidate = mock_viommu_cache_invalidate, }; static struct iommufd_viommu *mock_viommu_alloc(struct device *dev, @@ -782,7 +854,7 @@ static void mock_dev_release(struct device *dev) static struct mock_dev *mock_dev_create(unsigned long dev_flags) { struct mock_dev *mdev; - int rc; + int rc, i; if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA)) @@ -796,6 +868,8 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) mdev->flags = dev_flags; mdev->dev.release = mock_dev_release; mdev->dev.bus = &iommufd_mock_bus_type.bus; + for (i = 0; i < MOCK_DEV_CACHE_NUM; i++) + mdev->cache[i] = IOMMU_TEST_DEV_CACHE_DEFAULT; rc = ida_alloc(&mock_dev_ida, GFP_KERNEL); if (rc < 0) -- Gitee From c6d30e4335d21a20cad9aaeb55d11e313e48735e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:05:16 -0800 Subject: [PATCH 138/278] iommufd/selftest: Add IOMMU_TEST_OP_DEV_CHECK_CACHE test command ANBZ: #13617 commit 576ad6eb45d6458c1a5e646dc35e3ec23c73fd1b upstream. Similar to IOMMU_TEST_OP_MD_CHECK_IOTLB verifying a mock_domain's iotlb, IOMMU_TEST_OP_DEV_CHECK_CACHE will be used to verify a mock_dev's cache. 
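For illustration, the new op is driven from the selftests roughly as in the
sketch below. This open-coded form is only a sketch, not part of the patch:
the patch wraps it in the test_cmd_dev_check_cache() helper, and "device_id"
stands for whatever iommufd device object ID the test fixture created.

	struct iommu_test_cmd cmd = {
		.size = sizeof(cmd),
		.op = IOMMU_TEST_OP_DEV_CHECK_CACHE,
		.id = device_id,
		.check_dev_cache = {
			.id = 0,				/* cache slot to verify */
			.cache = IOMMU_TEST_DEV_CACHE_DEFAULT,	/* expected value */
		},
	};

	/* returns 0 only if mdev->cache[0] still holds the expected value */
	ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_DEV_CHECK_CACHE), &cmd);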
Link: https://patch.msgid.link/r/cd4082079d75427bd67ed90c3c825e15b5720a5f.1730836308.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/iommufd_test.h | 5 ++++ drivers/iommu/iommufd/selftest.c | 22 +++++++++++++++++ tools/testing/selftests/iommu/iommufd.c | 7 +++++- tools/testing/selftests/iommu/iommufd_utils.h | 24 +++++++++++++++++++ 4 files changed, 57 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 46558f83e734..a6b7a163f636 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -23,6 +23,7 @@ enum { IOMMU_TEST_OP_DIRTY, IOMMU_TEST_OP_MD_CHECK_IOTLB, IOMMU_TEST_OP_TRIGGER_IOPF, + IOMMU_TEST_OP_DEV_CHECK_CACHE, }; enum { @@ -140,6 +141,10 @@ struct iommu_test_cmd { __u32 perm; __u64 addr; } trigger_iopf; + struct { + __u32 id; + __u32 cache; + } check_dev_cache; }; __u32 last; }; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index e20498667a2c..2f9de177dffc 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -1125,6 +1125,24 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, return rc; } +static int iommufd_test_dev_check_cache(struct iommufd_ucmd *ucmd, u32 idev_id, + unsigned int cache_id, u32 cache) +{ + struct iommufd_device *idev; + struct mock_dev *mdev; + int rc = 0; + + idev = iommufd_get_device(ucmd, idev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + mdev = container_of(idev->dev, struct mock_dev, dev); + + if (cache_id > MOCK_DEV_CACHE_ID_MAX || mdev->cache[cache_id] != cache) + rc = -EINVAL; + iommufd_put_object(ucmd->ictx, &idev->obj); + return rc; +} + struct selftest_access { struct iommufd_access *access; struct file *file; @@ -1634,6 +1652,10 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return iommufd_test_md_check_iotlb(ucmd, cmd->id, cmd->check_iotlb.id, cmd->check_iotlb.iotlb); + case IOMMU_TEST_OP_DEV_CHECK_CACHE: + return iommufd_test_dev_check_cache(ucmd, cmd->id, + cmd->check_dev_cache.id, + cmd->check_dev_cache.cache); case IOMMU_TEST_OP_CREATE_ACCESS: return iommufd_test_create_access(ucmd, cmd->id, cmd->create_access.flags); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 8cb3e835ca97..4bc9dd2e620a 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -227,6 +227,8 @@ FIXTURE_SETUP(iommufd_ioas) for (i = 0; i != variant->mock_domains; i++) { test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id, &self->device_id); + test_cmd_dev_check_cache_all(self->device_id, + IOMMU_TEST_DEV_CACHE_DEFAULT); self->base_iova = MOCK_APERTURE_START; } } @@ -1392,9 +1394,12 @@ FIXTURE_SETUP(iommufd_mock_domain) ASSERT_GE(ARRAY_SIZE(self->hwpt_ids), variant->mock_domains); - for (i = 0; i != variant->mock_domains; i++) + for (i = 0; i != variant->mock_domains; i++) { test_cmd_mock_domain(self->ioas_id, &self->stdev_ids[i], &self->hwpt_ids[i], &self->idev_ids[i]); + test_cmd_dev_check_cache_all(self->idev_ids[0], + IOMMU_TEST_DEV_CACHE_DEFAULT); + } self->hwpt_id = self->hwpt_ids[0]; self->mmap_flags = MAP_SHARED | MAP_ANONYMOUS; diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 7fe905924d72..619ffdb1e5e8 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h 
+++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -250,6 +250,30 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, __u32 ft_i test_cmd_hwpt_check_iotlb(hwpt_id, i, expected); \ }) +#define test_cmd_dev_check_cache(device_id, cache_id, expected) \ + ({ \ + struct iommu_test_cmd test_cmd = { \ + .size = sizeof(test_cmd), \ + .op = IOMMU_TEST_OP_DEV_CHECK_CACHE, \ + .id = device_id, \ + .check_dev_cache = { \ + .id = cache_id, \ + .cache = expected, \ + }, \ + }; \ + ASSERT_EQ(0, ioctl(self->fd, \ + _IOMMU_TEST_CMD( \ + IOMMU_TEST_OP_DEV_CHECK_CACHE), \ + &test_cmd)); \ + }) + +#define test_cmd_dev_check_cache_all(device_id, expected) \ + ({ \ + int c; \ + for (c = 0; c < MOCK_DEV_CACHE_NUM; c++) \ + test_cmd_dev_check_cache(device_id, c, expected); \ + }) + static int _test_cmd_hwpt_invalidate(int fd, __u32 hwpt_id, void *reqs, uint32_t data_type, uint32_t lreq, uint32_t *nreqs) -- Gitee From c8d2e9854b5be14e6f44d00921ff1ec77f5d325d Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:05:17 -0800 Subject: [PATCH 139/278] iommufd/selftest: Add vIOMMU coverage for IOMMU_HWPT_INVALIDATE ioctl ANBZ: #13617 commit 49ad127719243420b355fd95b0d51ac46ae586e5 upstream. Add a viommu_cache test function to cover vIOMMU invalidations using the updated IOMMU_HWPT_INVALIDATE ioctl, which now allows passing in a vIOMMU via its hwpt_id field. Link: https://patch.msgid.link/r/f317f902041f3d05deaee4ca3fdd8ef4b8297361.1730836308.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- tools/testing/selftests/iommu/iommufd.c | 173 ++++++++++++++++++ tools/testing/selftests/iommu/iommufd_utils.h | 32 ++++ 2 files changed, 205 insertions(+) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 4bc9dd2e620a..94fe038d2eee 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -2642,4 +2642,177 @@ TEST_F(iommufd_viommu, vdevice_alloc) } } +TEST_F(iommufd_viommu, vdevice_cache) +{ + struct iommu_viommu_invalidate_selftest inv_reqs[2] = {}; + uint32_t viommu_id = self->viommu_id; + uint32_t dev_id = self->device_id; + uint32_t vdev_id = 0; + uint32_t num_inv; + + if (dev_id) { + test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id); + + test_cmd_dev_check_cache_all(dev_id, + IOMMU_TEST_DEV_CACHE_DEFAULT); + + /* Check data_type by passing zero-length array */ + num_inv = 0; + test_cmd_viommu_invalidate(viommu_id, inv_reqs, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: Invalid data_type */ + num_inv = 1; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: structure size sanity */ + num_inv = 1; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs) + 1, &num_inv); + assert(!num_inv); + + num_inv = 1; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + 1, &num_inv); + assert(!num_inv); + + /* Negative test: invalid flag is passed */ + num_inv = 1; + inv_reqs[0].flags = 0xffffffff; + inv_reqs[0].vdev_id = 0x99; + test_err_viommu_invalidate(EOPNOTSUPP, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: invalid data_uptr when 
array is not empty */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + test_err_viommu_invalidate(EINVAL, viommu_id, NULL, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: invalid entry_len when array is not empty */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + 0, &num_inv); + assert(!num_inv); + + /* Negative test: invalid cache_id */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + inv_reqs[0].cache_id = MOCK_DEV_CACHE_ID_MAX + 1; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: invalid vdev_id */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x9; + inv_reqs[0].cache_id = 0; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* + * Invalidate the 1st cache entry but fail the 2nd request + * due to invalid flags configuration in the 2nd request. + */ + num_inv = 2; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + inv_reqs[0].cache_id = 0; + inv_reqs[1].flags = 0xffffffff; + inv_reqs[1].vdev_id = 0x99; + inv_reqs[1].cache_id = 1; + test_err_viommu_invalidate(EOPNOTSUPP, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_dev_check_cache(dev_id, 0, 0); + test_cmd_dev_check_cache(dev_id, 1, + IOMMU_TEST_DEV_CACHE_DEFAULT); + test_cmd_dev_check_cache(dev_id, 2, + IOMMU_TEST_DEV_CACHE_DEFAULT); + test_cmd_dev_check_cache(dev_id, 3, + IOMMU_TEST_DEV_CACHE_DEFAULT); + + /* + * Invalidate the 1st cache entry but fail the 2nd request + * due to invalid cache_id configuration in the 2nd request. 
+ */ + num_inv = 2; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + inv_reqs[0].cache_id = 0; + inv_reqs[1].flags = 0; + inv_reqs[1].vdev_id = 0x99; + inv_reqs[1].cache_id = MOCK_DEV_CACHE_ID_MAX + 1; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_dev_check_cache(dev_id, 0, 0); + test_cmd_dev_check_cache(dev_id, 1, + IOMMU_TEST_DEV_CACHE_DEFAULT); + test_cmd_dev_check_cache(dev_id, 2, + IOMMU_TEST_DEV_CACHE_DEFAULT); + test_cmd_dev_check_cache(dev_id, 3, + IOMMU_TEST_DEV_CACHE_DEFAULT); + + /* Invalidate the 2nd cache entry and verify */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + inv_reqs[0].cache_id = 1; + test_cmd_viommu_invalidate(viommu_id, inv_reqs, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_dev_check_cache(dev_id, 0, 0); + test_cmd_dev_check_cache(dev_id, 1, 0); + test_cmd_dev_check_cache(dev_id, 2, + IOMMU_TEST_DEV_CACHE_DEFAULT); + test_cmd_dev_check_cache(dev_id, 3, + IOMMU_TEST_DEV_CACHE_DEFAULT); + + /* Invalidate the 3rd and 4th cache entries and verify */ + num_inv = 2; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + inv_reqs[0].cache_id = 2; + inv_reqs[1].flags = 0; + inv_reqs[1].vdev_id = 0x99; + inv_reqs[1].cache_id = 3; + test_cmd_viommu_invalidate(viommu_id, inv_reqs, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 2); + test_cmd_dev_check_cache_all(dev_id, 0); + + /* Invalidate all cache entries for nested_dev_id[1] and verify */ + num_inv = 1; + inv_reqs[0].vdev_id = 0x99; + inv_reqs[0].flags = IOMMU_TEST_INVALIDATE_FLAG_ALL; + test_cmd_viommu_invalidate(viommu_id, inv_reqs, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_dev_check_cache_all(dev_id, 0); + test_ioctl_destroy(vdev_id); + } +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 619ffdb1e5e8..c0239f86f2f8 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -305,6 +305,38 @@ static int _test_cmd_hwpt_invalidate(int fd, __u32 hwpt_id, void *reqs, data_type, lreq, nreqs)); \ }) +static int _test_cmd_viommu_invalidate(int fd, __u32 viommu_id, void *reqs, + uint32_t data_type, uint32_t lreq, + uint32_t *nreqs) +{ + struct iommu_hwpt_invalidate cmd = { + .size = sizeof(cmd), + .hwpt_id = viommu_id, + .data_type = data_type, + .data_uptr = (uint64_t)reqs, + .entry_len = lreq, + .entry_num = *nreqs, + }; + int rc = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cmd); + *nreqs = cmd.entry_num; + return rc; +} + +#define test_cmd_viommu_invalidate(viommu, reqs, lreq, nreqs) \ + ({ \ + ASSERT_EQ(0, \ + _test_cmd_viommu_invalidate(self->fd, viommu, reqs, \ + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, \ + lreq, nreqs)); \ + }) +#define test_err_viommu_invalidate(_errno, viommu_id, reqs, data_type, lreq, \ + nreqs) \ + ({ \ + EXPECT_ERRNO(_errno, _test_cmd_viommu_invalidate( \ + self->fd, viommu_id, reqs, \ + data_type, lreq, nreqs)); \ + }) + static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, unsigned int ioas_id) { -- Gitee From ddc6ba0b7a0f65a1fe7c5e20150d3e7eaf8534cb Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 30 Oct 2024 21:20:52 -0300 Subject: [PATCH 140/278] iommu/arm-smmu-v3: Support IOMMU_VIOMMU_ALLOC ANBZ: #13617 commit 69d9b312f38aa19f8c801e90bd23d70685be49f0 upstream. Add a new driver-type for ARM SMMUv3 to enum iommu_viommu_type. 
Implement an arm_vsmmu_alloc(). As an initial step, copy the VMID from s2_parent. A followup series is required to give the VIOMMU object it's own VMID that will be used in all nesting configurations. Link: https://patch.msgid.link/r/8-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Nicolin Chen Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 45 +++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 13 ++++++ include/uapi/linux/iommufd.h | 4 ++ 4 files changed, 63 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 3d2671031c9b..60dd9e907595 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -29,3 +29,48 @@ void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type) return info; } + +static const struct iommufd_viommu_ops arm_vsmmu_ops = { +}; + +struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, + struct iommu_domain *parent, + struct iommufd_ctx *ictx, + unsigned int viommu_type) +{ + struct arm_smmu_device *smmu = + iommu_get_iommu_dev(dev, struct arm_smmu_device, iommu); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_domain *s2_parent = to_smmu_domain(parent); + struct arm_vsmmu *vsmmu; + + if (viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3) + return ERR_PTR(-EOPNOTSUPP); + + if (!(smmu->features & ARM_SMMU_FEAT_NESTING)) + return ERR_PTR(-EOPNOTSUPP); + + if (s2_parent->smmu != master->smmu) + return ERR_PTR(-EINVAL); + + /* + * Must support some way to prevent the VM from bypassing the cache + * because VFIO currently does not do any cache maintenance. canwbs + * indicates the device is fully coherent and no cache maintenance is + * ever required, even for PCI No-Snoop. 
+ */ + if (!arm_smmu_master_canwbs(master)) + return ERR_PTR(-EOPNOTSUPP); + + vsmmu = iommufd_viommu_alloc(ictx, struct arm_vsmmu, core, + &arm_vsmmu_ops); + if (IS_ERR(vsmmu)) + return ERR_CAST(vsmmu); + + vsmmu->smmu = smmu; + vsmmu->s2_parent = s2_parent; + /* FIXME Move VMID allocation from the S2 domain allocation to here */ + vsmmu->vmid = s2_parent->s2_cfg.vmid; + + return &vsmmu->core; +} diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index e6009d746812..d9948546c5e2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3529,6 +3529,7 @@ static struct iommu_ops arm_smmu_ops = { .dev_disable_feat = arm_smmu_dev_disable_feature, .page_response = arm_smmu_page_response, .def_domain_type = arm_smmu_def_domain_type, + .viommu_alloc = arm_vsmmu_alloc, .pgsize_bitmap = -1UL, /* Restricted during device attach */ .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 054bda0e834e..2d1efe4312a9 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -976,10 +977,22 @@ tegra241_cmdqv_probe(struct arm_smmu_device *smmu) } #endif /* CONFIG_TEGRA241_CMDQV */ +struct arm_vsmmu { + struct iommufd_viommu core; + struct arm_smmu_device *smmu; + struct arm_smmu_domain *s2_parent; + u16 vmid; +}; + #if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD) void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type); +struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, + struct iommu_domain *parent, + struct iommufd_ctx *ictx, + unsigned int viommu_type); #else #define arm_smmu_hw_info NULL +#define arm_vsmmu_alloc NULL #endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */ #endif /* _ARM_SMMU_V3_H */ diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index c0929b7cd7ab..15b0898a6c86 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -433,10 +433,12 @@ struct iommu_hwpt_vtd_s1 { * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table + * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table */ enum iommu_hwpt_data_type { IOMMU_HWPT_DATA_NONE = 0, IOMMU_HWPT_DATA_VTD_S1 = 1, + IOMMU_HWPT_DATA_ARM_SMMUV3 = 2, }; /** @@ -876,9 +878,11 @@ struct iommu_fault_alloc { /** * enum iommu_viommu_type - Virtual IOMMU Type * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use + * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type */ enum iommu_viommu_type { IOMMU_VIOMMU_TYPE_DEFAULT = 0, + IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1, }; /** -- Gitee From ae81f82ac52a7492cf38650a7c8bf87a101c97b1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 30 Oct 2024 21:20:53 -0300 Subject: [PATCH 141/278] iommu/arm-smmu-v3: Support IOMMU_DOMAIN_NESTED ANBZ: #13617 commit 1e8be08d1c91d52a9b51d424db78ddbf88660bbb upstream. For SMMUv3 a IOMMU_DOMAIN_NESTED is composed of a S2 iommu_domain acting as the parent and a user provided STE fragment that defines the CD table and related data with addresses translated by the S2 iommu_domain. The kernel only permits userspace to control certain allowed bits of the STE that are safe for user/guest control. 
IOTLB maintenance is a bit subtle here, the S1 implicitly includes the S2 translation, but there is no way of knowing which S1 entries refer to a range of S2. For the IOTLB we follow ARM's guidance and issue a CMDQ_OP_TLBI_NH_ALL to flush all ASIDs from the VMID after flushing the S2 on any change to the S2. The IOMMU_DOMAIN_NESTED can only be created from inside a VIOMMU as the invalidation path relies on the VIOMMU to translate virtual stream ID used in the invalidation commands for the CD table and ATS. Link: https://patch.msgid.link/r/9-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Signed-off-by: Nicolin Chen Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 163 ++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 17 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 26 +++ include/uapi/linux/iommufd.h | 20 +++ 4 files changed, 225 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 60dd9e907595..91247a2a2d2c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -30,7 +30,170 @@ void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type) return info; } +static void arm_smmu_make_nested_cd_table_ste( + struct arm_smmu_ste *target, struct arm_smmu_master *master, + struct arm_smmu_nested_domain *nested_domain, bool ats_enabled) +{ + arm_smmu_make_s2_domain_ste( + target, master, nested_domain->vsmmu->s2_parent, ats_enabled); + + target->data[0] = cpu_to_le64(STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, + STRTAB_STE_0_CFG_NESTED)); + target->data[0] |= nested_domain->ste[0] & + ~cpu_to_le64(STRTAB_STE_0_CFG); + target->data[1] |= nested_domain->ste[1]; +} + +/* + * Create a physical STE from the virtual STE that userspace provided when it + * created the nested domain. Using the vSTE userspace can request: + * - Non-valid STE + * - Abort STE + * - Bypass STE (install the S2, no CD table) + * - CD table STE (install the S2 and the userspace CD table) + */ +static void arm_smmu_make_nested_domain_ste( + struct arm_smmu_ste *target, struct arm_smmu_master *master, + struct arm_smmu_nested_domain *nested_domain, bool ats_enabled) +{ + unsigned int cfg = + FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0])); + + /* + * Userspace can request a non-valid STE through the nesting interface. + * We relay that into an abort physical STE with the intention that + * C_BAD_STE for this SID can be generated to userspace. 
+ */ + if (!(nested_domain->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) + cfg = STRTAB_STE_0_CFG_ABORT; + + switch (cfg) { + case STRTAB_STE_0_CFG_S1_TRANS: + arm_smmu_make_nested_cd_table_ste(target, master, nested_domain, + ats_enabled); + break; + case STRTAB_STE_0_CFG_BYPASS: + arm_smmu_make_s2_domain_ste(target, master, + nested_domain->vsmmu->s2_parent, + ats_enabled); + break; + case STRTAB_STE_0_CFG_ABORT: + default: + arm_smmu_make_abort_ste(target); + break; + } +} + +static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, + struct device *dev) +{ + struct arm_smmu_nested_domain *nested_domain = + to_smmu_nested_domain(domain); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_attach_state state = { + .master = master, + .old_domain = iommu_get_domain_for_dev(dev), + .ssid = IOMMU_NO_PASID, + /* Currently invalidation of ATC is not supported */ + .disable_ats = true, + }; + struct arm_smmu_ste ste; + int ret; + + if (nested_domain->vsmmu->smmu != master->smmu) + return -EINVAL; + if (arm_smmu_ssids_in_use(&master->cd_table)) + return -EBUSY; + + mutex_lock(&arm_smmu_asid_lock); + ret = arm_smmu_attach_prepare(&state, domain); + if (ret) { + mutex_unlock(&arm_smmu_asid_lock); + return ret; + } + + arm_smmu_make_nested_domain_ste(&ste, master, nested_domain, + state.ats_enabled); + arm_smmu_install_ste_for_dev(master, &ste); + arm_smmu_attach_commit(&state); + mutex_unlock(&arm_smmu_asid_lock); + return 0; +} + +static void arm_smmu_domain_nested_free(struct iommu_domain *domain) +{ + kfree(to_smmu_nested_domain(domain)); +} + +static const struct iommu_domain_ops arm_smmu_nested_ops = { + .attach_dev = arm_smmu_attach_dev_nested, + .free = arm_smmu_domain_nested_free, +}; + +static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg) +{ + unsigned int cfg; + + if (!(arg->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) { + memset(arg->ste, 0, sizeof(arg->ste)); + return 0; + } + + /* EIO is reserved for invalid STE data. */ + if ((arg->ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) || + (arg->ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED)) + return -EIO; + + cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(arg->ste[0])); + if (cfg != STRTAB_STE_0_CFG_ABORT && cfg != STRTAB_STE_0_CFG_BYPASS && + cfg != STRTAB_STE_0_CFG_S1_TRANS) + return -EIO; + return 0; +} + +static struct iommu_domain * +arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data) +{ + struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); + const u32 SUPPORTED_FLAGS = IOMMU_HWPT_FAULT_ID_VALID; + struct arm_smmu_nested_domain *nested_domain; + struct iommu_hwpt_arm_smmuv3 arg; + int ret; + + /* + * Faults delivered to the nested domain are faults that originated by + * the S1 in the domain. 
The core code will match all PASIDs when + * delivering the fault due to user_pasid_table + */ + if (flags & ~SUPPORTED_FLAGS) + return ERR_PTR(-EOPNOTSUPP); + + ret = iommu_copy_struct_from_user(&arg, user_data, + IOMMU_HWPT_DATA_ARM_SMMUV3, ste); + if (ret) + return ERR_PTR(ret); + + ret = arm_smmu_validate_vste(&arg); + if (ret) + return ERR_PTR(ret); + + nested_domain = kzalloc(sizeof(*nested_domain), GFP_KERNEL_ACCOUNT); + if (!nested_domain) + return ERR_PTR(-ENOMEM); + + nested_domain->domain.type = IOMMU_DOMAIN_NESTED; + nested_domain->domain.ops = &arm_smmu_nested_ops; + nested_domain->vsmmu = vsmmu; + nested_domain->ste[0] = arg.ste[0]; + nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS); + + return &nested_domain->domain; +} + static const struct iommufd_viommu_ops arm_vsmmu_ops = { + .alloc_domain_nested = arm_vsmmu_alloc_domain_nested, }; struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index d9948546c5e2..d12711a83a1f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -295,6 +295,7 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) case CMDQ_OP_TLBI_NH_ASID: cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid); fallthrough; + case CMDQ_OP_TLBI_NH_ALL: case CMDQ_OP_TLBI_S12_VMALL: cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid); break; @@ -2230,6 +2231,15 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size, } __arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain); + if (smmu_domain->nest_parent) { + /* + * When the S2 domain changes all the nested S1 ASIDs have to be + * flushed too. + */ + cmd.opcode = CMDQ_OP_TLBI_NH_ALL; + arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd); + } + /* * Unfortunately, this can't be leaf-only since we may have * zapped an entire table. @@ -2644,6 +2654,8 @@ to_smmu_domain_devices(struct iommu_domain *domain) if ((domain->type & __IOMMU_DOMAIN_PAGING) || domain->type == IOMMU_DOMAIN_SVA) return to_smmu_domain(domain); + if (domain->type == IOMMU_DOMAIN_NESTED) + return to_smmu_nested_domain(domain)->vsmmu->s2_parent; return NULL; } @@ -2716,7 +2728,8 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, * enabled if we have arm_smmu_domain, those always have page * tables. 
*/ - state->ats_enabled = arm_smmu_ats_supported(master); + state->ats_enabled = !state->disable_ats && + arm_smmu_ats_supported(master); } if (smmu_domain) { @@ -3123,6 +3136,7 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, goto err_free; } smmu_domain->stage = ARM_SMMU_DOMAIN_S2; + smmu_domain->nest_parent = true; } smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED; @@ -3530,6 +3544,7 @@ static struct iommu_ops arm_smmu_ops = { .page_response = arm_smmu_page_response, .def_domain_type = arm_smmu_def_domain_type, .viommu_alloc = arm_vsmmu_alloc, + .user_pasid_table = 1, .pgsize_bitmap = -1UL, /* Restricted during device attach */ .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 2d1efe4312a9..49922dfb74a8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -244,6 +244,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_0_CFG_BYPASS 4 #define STRTAB_STE_0_CFG_S1_TRANS 5 #define STRTAB_STE_0_CFG_S2_TRANS 6 +#define STRTAB_STE_0_CFG_NESTED 7 #define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4) #define STRTAB_STE_0_S1FMT_LINEAR 0 @@ -295,6 +296,15 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4) +/* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */ +#define STRTAB_STE_0_NESTING_ALLOWED \ + cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \ + STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX) +#define STRTAB_STE_1_NESTING_ALLOWED \ + cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | \ + STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | \ + STRTAB_STE_1_S1STALLD) + /* * Context descriptors. * @@ -514,6 +524,7 @@ struct arm_smmu_cmdq_ent { }; } cfgi; + #define CMDQ_OP_TLBI_NH_ALL 0x10 #define CMDQ_OP_TLBI_NH_ASID 0x11 #define CMDQ_OP_TLBI_NH_VA 0x12 #define CMDQ_OP_TLBI_EL2_ALL 0x20 @@ -815,10 +826,18 @@ struct arm_smmu_domain { struct list_head devices; spinlock_t devices_lock; bool enforce_cache_coherency : 1; + bool nest_parent : 1; struct mmu_notifier mmu_notifier; }; +struct arm_smmu_nested_domain { + struct iommu_domain domain; + struct arm_vsmmu *vsmmu; + + __le64 ste[2]; +}; + /* The following are exposed for testing purposes. */ struct arm_smmu_entry_writer_ops; struct arm_smmu_entry_writer { @@ -863,6 +882,12 @@ static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) return container_of(dom, struct arm_smmu_domain, domain); } +static inline struct arm_smmu_nested_domain * +to_smmu_nested_domain(struct iommu_domain *dom) +{ + return container_of(dom, struct arm_smmu_nested_domain, domain); +} + extern struct xarray arm_smmu_asid_xa; extern struct mutex arm_smmu_asid_lock; @@ -909,6 +934,7 @@ struct arm_smmu_attach_state { struct iommu_domain *old_domain; struct arm_smmu_master *master; bool cd_needs_ats; + bool disable_ats; ioasid_t ssid; /* Resulting state */ bool ats_enabled; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 15b0898a6c86..9012ed22c2a2 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -429,6 +429,26 @@ struct iommu_hwpt_vtd_s1 { __u32 __reserved; }; +/** + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 nested STE + * (IOMMU_HWPT_DATA_ARM_SMMUV3) + * + * @ste: The first two double words of the user space Stream Table Entry for + * the translation. Must be little-endian. 
+ * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) + * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax + * - word-1: S1DSS, S1CIR, S1COR, S1CSH, S1STALLD + * + * -EIO will be returned if @ste is not legal or contains any non-allowed field. + * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass + * nested domain will translate the same as the nesting parent. The S1 will + * install a Context Descriptor Table pointing at userspace memory translated + * by the nesting parent. + */ +struct iommu_hwpt_arm_smmuv3 { + __aligned_le64 ste[2]; +}; + /** * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data -- Gitee From b43e32ba39845ddaa23e314d279d2d61d56ddbe5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 30 Oct 2024 21:20:54 -0300 Subject: [PATCH 142/278] iommu/arm-smmu-v3: Use S2FWB for NESTED domains ANBZ: #13617 commit 67e4fe3985138325c9b21193be52266750616182 upstream. Force Write Back (FWB) changes how the S2 IOPTE's MemAttr field works. When S2FWB is supported and enabled the IOPTE will force cachable access to IOMMU_CACHE memory when nesting with a S1 and deny cachable access when !IOMMU_CACHE. When using a single stage of translation, a simple S2 domain, it doesn't change things for PCI devices as it is just a different encoding for the existing mapping of the IOMMU protection flags to cachability attributes. For non-PCI it also changes the combining rules when incoming transactions have inconsistent attributes. However, when used with a nested S1, FWB has the effect of preventing the guest from choosing a MemAttr in it's S1 that would cause ordinary DMA to bypass the cache. Consistent with KVM we wish to deny the guest the ability to become incoherent with cached memory the hypervisor believes is cachable so we don't have to flush it. Allow NESTED domains to be created if the SMMU has S2FWB support and use S2FWB for NESTING_PARENTS. This is an additional option to CANWBS. Link: https://patch.msgid.link/r/10-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 7 +++-- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 8 +++++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 3 +++ drivers/iommu/io-pgtable-arm.c | 27 ++++++++++++++----- include/linux/io-pgtable.h | 2 ++ 5 files changed, 38 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 91247a2a2d2c..a1c8fcd4797c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -220,9 +220,12 @@ struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, * Must support some way to prevent the VM from bypassing the cache * because VFIO currently does not do any cache maintenance. canwbs * indicates the device is fully coherent and no cache maintenance is - * ever required, even for PCI No-Snoop. + * ever required, even for PCI No-Snoop. S2FWB means the S1 can't make + * things non-coherent using the memattr, but No-Snoop behavior is not + * effected. 
*/ - if (!arm_smmu_master_canwbs(master)) + if (!arm_smmu_master_canwbs(master) && + !(smmu->features & ARM_SMMU_FEAT_S2FWB)) return ERR_PTR(-EOPNOTSUPP); vsmmu = iommufd_viommu_alloc(ictx, struct arm_vsmmu, core, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index d12711a83a1f..a248761c5658 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1046,7 +1046,8 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) /* S2 translates */ if (cfg & BIT(1)) { used_bits[1] |= - cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG); + cpu_to_le64(STRTAB_STE_1_S2FWB | STRTAB_STE_1_EATS | + STRTAB_STE_1_SHCFG); used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR | STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI | @@ -1654,6 +1655,8 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, FIELD_PREP(STRTAB_STE_1_EATS, ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); + if (pgtbl_cfg->quirks & IO_PGTABLE_QUIRK_ARM_S2FWB) + target->data[1] |= cpu_to_le64(STRTAB_STE_1_S2FWB); if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR) target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); @@ -2472,6 +2475,9 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, pgtbl_cfg.oas = smmu->oas; fmt = ARM_64_LPAE_S2; finalise_stage_fn = arm_smmu_domain_finalise_s2; + if ((smmu->features & ARM_SMMU_FEAT_S2FWB) && + (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)) + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_S2FWB; break; default: return -EINVAL; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 49922dfb74a8..d0d024446c60 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -58,6 +58,7 @@ struct arm_smmu_device; #define IDR1_SIDSIZE GENMASK(5, 0) #define ARM_SMMU_IDR3 0xc +#define IDR3_FWB (1 << 8) #define IDR3_RIL (1 << 10) #define ARM_SMMU_IDR5 0x14 @@ -265,6 +266,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_1_S1COR GENMASK_ULL(5, 4) #define STRTAB_STE_1_S1CSH GENMASK_ULL(7, 6) +#define STRTAB_STE_1_S2FWB (1UL << 25) #define STRTAB_STE_1_S1STALLD (1UL << 27) #define STRTAB_STE_1_EATS GENMASK_ULL(29, 28) @@ -740,6 +742,7 @@ struct arm_smmu_device { #define ARM_SMMU_FEAT_ATTR_TYPES_OVR (1 << 20) #define ARM_SMMU_FEAT_HA (1 << 21) #define ARM_SMMU_FEAT_HD (1 << 22) +#define ARM_SMMU_FEAT_S2FWB (1 << 23) u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index d3d70ba42365..19b529e28ec7 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -106,6 +106,18 @@ #define ARM_LPAE_PTE_HAP_FAULT (((arm_lpae_iopte)0) << 6) #define ARM_LPAE_PTE_HAP_READ (((arm_lpae_iopte)1) << 6) #define ARM_LPAE_PTE_HAP_WRITE (((arm_lpae_iopte)2) << 6) +/* + * For !FWB these code to: + * 1111 = Normal outer write back cachable / Inner Write Back Cachable + * Permit S1 to override + * 0101 = Normal Non-cachable / Inner Non-cachable + * 0001 = Device / Device-nGnRE + * For S2FWB these code: + * 0110 Force Normal Write Back + * 0101 Normal* is forced Normal-NC, Device unchanged + * 0001 Force Device-nGnRE + */ +#define ARM_LPAE_PTE_MEMATTR_FWB_WB (((arm_lpae_iopte)0x6) << 2) #define ARM_LPAE_PTE_MEMATTR_OIWB (((arm_lpae_iopte)0xf) << 2) #define ARM_LPAE_PTE_MEMATTR_NC (((arm_lpae_iopte)0x5) << 2) #define 
ARM_LPAE_PTE_MEMATTR_DEV (((arm_lpae_iopte)0x1) << 2) @@ -470,12 +482,16 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, */ if (data->iop.fmt == ARM_64_LPAE_S2 || data->iop.fmt == ARM_32_LPAE_S2) { - if (prot & IOMMU_MMIO) + if (prot & IOMMU_MMIO) { pte |= ARM_LPAE_PTE_MEMATTR_DEV; - else if (prot & IOMMU_CACHE) - pte |= ARM_LPAE_PTE_MEMATTR_OIWB; - else + } else if (prot & IOMMU_CACHE) { + if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_S2FWB) + pte |= ARM_LPAE_PTE_MEMATTR_FWB_WB; + else + pte |= ARM_LPAE_PTE_MEMATTR_OIWB; + } else { pte |= ARM_LPAE_PTE_MEMATTR_NC; + } } else { if (prot & IOMMU_MMIO) pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV @@ -983,8 +999,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie) struct arm_lpae_io_pgtable *data; typeof(&cfg->arm_lpae_s2_cfg.vtcr) vtcr = &cfg->arm_lpae_s2_cfg.vtcr; - /* The NS quirk doesn't apply at stage 2 */ - if (cfg->quirks) + if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_S2FWB)) return NULL; data = arm_lpae_alloc_pgtable(cfg); diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index b1ecfc3cd5bc..ce86b09ae80f 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -87,6 +87,7 @@ struct io_pgtable_cfg { * attributes set in the TCR for a non-coherent page-table walker. * * IO_PGTABLE_QUIRK_ARM_HD: Enables dirty tracking in stage 1 pagetable. + * IO_PGTABLE_QUIRK_ARM_S2FWB: Use the FWB format for the MemAttrs bits */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -95,6 +96,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) #define IO_PGTABLE_QUIRK_ARM_HD BIT(7) + #define IO_PGTABLE_QUIRK_ARM_S2FWB BIT(8) unsigned long quirks; unsigned long pgsize_bitmap; unsigned int ias; -- Gitee From 39a79616219c0cc32039616a249d531b7e833214 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 30 Oct 2024 21:20:55 -0300 Subject: [PATCH 143/278] iommu/arm-smmu-v3: Allow ATS for IOMMU_DOMAIN_NESTED ANBZ: #13617 commit f27298a82ba09a1c8aecee8a209b2a312beac672 upstream. The EATS flag needs to flow through the vSTE and into the pSTE, and ensure physical ATS is enabled on the PCI device. The physical ATS state must match the VM's idea of EATS as we rely on the VM to issue the ATS invalidation commands. Thus ATS must remain off at the device until EATS on a nesting domain turns it on. Attaching a nesting domain is the point where the invalidation responsibility transfers to userspace. Update the ATS logic to track EATS for nesting domains and flush the ATC whenever the S2 nesting parent changes. 
Link: https://patch.msgid.link/r/11-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Nicolin Chen Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 31 ++++++++++++++++--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 26 +++++++++++++--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 4 ++- include/uapi/linux/iommufd.h | 2 +- 4 files changed, 53 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index a1c8fcd4797c..84c8a21c00ae 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -95,8 +95,6 @@ static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, .master = master, .old_domain = iommu_get_domain_for_dev(dev), .ssid = IOMMU_NO_PASID, - /* Currently invalidation of ATC is not supported */ - .disable_ats = true, }; struct arm_smmu_ste ste; int ret; @@ -107,6 +105,15 @@ static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, return -EBUSY; mutex_lock(&arm_smmu_asid_lock); + /* + * The VM has to control the actual ATS state at the PCI device because + * we forward the invalidations directly from the VM. If the VM doesn't + * think ATS is on it will not generate ATC flushes and the ATC will + * become incoherent. Since we can't access the actual virtual PCI ATS + * config bit here base this off the EATS value in the STE. If the EATS + * is set then the VM must generate ATC flushes. + */ + state.disable_ats = !nested_domain->enable_ats; ret = arm_smmu_attach_prepare(&state, domain); if (ret) { mutex_unlock(&arm_smmu_asid_lock); @@ -131,8 +138,10 @@ static const struct iommu_domain_ops arm_smmu_nested_ops = { .free = arm_smmu_domain_nested_free, }; -static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg) +static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg, + bool *enable_ats) { + unsigned int eats; unsigned int cfg; if (!(arg->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) { @@ -149,6 +158,18 @@ static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg) if (cfg != STRTAB_STE_0_CFG_ABORT && cfg != STRTAB_STE_0_CFG_BYPASS && cfg != STRTAB_STE_0_CFG_S1_TRANS) return -EIO; + + /* + * Only Full ATS or ATS UR is supported + * The EATS field will be set by arm_smmu_make_nested_domain_ste() + */ + eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg->ste[1])); + arg->ste[1] &= ~cpu_to_le64(STRTAB_STE_1_EATS); + if (eats != STRTAB_STE_1_EATS_ABT && eats != STRTAB_STE_1_EATS_TRANS) + return -EIO; + + if (cfg == STRTAB_STE_0_CFG_S1_TRANS) + *enable_ats = (eats == STRTAB_STE_1_EATS_TRANS); return 0; } @@ -160,6 +181,7 @@ arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, const u32 SUPPORTED_FLAGS = IOMMU_HWPT_FAULT_ID_VALID; struct arm_smmu_nested_domain *nested_domain; struct iommu_hwpt_arm_smmuv3 arg; + bool enable_ats = false; int ret; /* @@ -175,7 +197,7 @@ arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, if (ret) return ERR_PTR(ret); - ret = arm_smmu_validate_vste(&arg); + ret = arm_smmu_validate_vste(&arg, &enable_ats); if (ret) return ERR_PTR(ret); @@ -185,6 +207,7 @@ arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, nested_domain->domain.type = IOMMU_DOMAIN_NESTED; nested_domain->domain.ops = &arm_smmu_nested_ops; + nested_domain->enable_ats = enable_ats; nested_domain->vsmmu = vsmmu; nested_domain->ste[0] = arg.ste[0]; 
nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index a248761c5658..033c41ac59e5 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2107,7 +2107,16 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, if (!master->ats_enabled) continue; - arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, &cmd); + if (master_domain->nested_ats_flush) { + /* + * If a S2 used as a nesting parent is changed we have + * no option but to completely flush the ATC. + */ + arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd); + } else { + arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, + &cmd); + } for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; @@ -2631,7 +2640,7 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master) static struct arm_smmu_master_domain * arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain, struct arm_smmu_master *master, - ioasid_t ssid) + ioasid_t ssid, bool nested_ats_flush) { struct arm_smmu_master_domain *master_domain; @@ -2640,7 +2649,8 @@ arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain, list_for_each_entry(master_domain, &smmu_domain->devices, devices_elm) { if (master_domain->master == master && - master_domain->ssid == ssid) + master_domain->ssid == ssid && + master_domain->nested_ats_flush == nested_ats_flush) return master_domain; } return NULL; @@ -2671,13 +2681,18 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master, { struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(domain); struct arm_smmu_master_domain *master_domain; + bool nested_ats_flush = false; unsigned long flags; if (!smmu_domain) return; + if (domain->type == IOMMU_DOMAIN_NESTED) + nested_ats_flush = to_smmu_nested_domain(domain)->enable_ats; + spin_lock_irqsave(&smmu_domain->devices_lock, flags); - master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid); + master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid, + nested_ats_flush); if (master_domain) { list_del(&master_domain->devices_elm); kfree(master_domain); @@ -2744,6 +2759,9 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, return -ENOMEM; master_domain->master = master; master_domain->ssid = state->ssid; + if (new_domain->type == IOMMU_DOMAIN_NESTED) + master_domain->nested_ats_flush = + to_smmu_nested_domain(new_domain)->enable_ats; /* * During prepare we want the current smmu_domain and new diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index d0d024446c60..24ede80f3eae 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -305,7 +305,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_1_NESTING_ALLOWED \ cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | \ STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | \ - STRTAB_STE_1_S1STALLD) + STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS) /* * Context descriptors. 
@@ -837,6 +837,7 @@ struct arm_smmu_domain { struct arm_smmu_nested_domain { struct iommu_domain domain; struct arm_vsmmu *vsmmu; + bool enable_ats : 1; __le64 ste[2]; }; @@ -878,6 +879,7 @@ struct arm_smmu_master_domain { struct list_head devices_elm; struct arm_smmu_master *master; ioasid_t ssid; + bool nested_ats_flush : 1; }; static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 9012ed22c2a2..6d3710e9c14a 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -437,7 +437,7 @@ struct iommu_hwpt_vtd_s1 { * the translation. Must be little-endian. * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax - * - word-1: S1DSS, S1CIR, S1COR, S1CSH, S1STALLD + * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD * * -EIO will be returned if @ste is not legal or contains any non-allowed field. * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass -- Gitee From ad2976c6b37fcde92ea545d82ceb8a28b1663f1f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 30 Oct 2024 21:20:56 -0300 Subject: [PATCH 144/278] iommu/arm-smmu-v3: Support IOMMU_HWPT_INVALIDATE using a VIOMMU object ANBZ: #13617 commit d68beb276ba26cec47350a6d468e967673ee0c56 upstream. Implement the vIOMMU's cache_invalidate op for user space to invalidate the IOTLB entries, Device ATS and CD entries that are cached by hardware. Add struct iommu_viommu_arm_smmuv3_invalidate defining invalidation entries that are simply in the native format of a 128-bit TLBI command. Scan those commands against the permitted command list and fix their VMID/SID fields to match what is stored in the vIOMMU. 
Link: https://patch.msgid.link/r/12-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Co-developed-by: Eric Auger Signed-off-by: Eric Auger Co-developed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 134 ++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 6 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 5 + include/uapi/linux/iommufd.h | 24 ++++ 4 files changed, 166 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 84c8a21c00ae..c96cab6521a4 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -215,8 +215,134 @@ arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, return &nested_domain->domain; } +static int arm_vsmmu_vsid_to_sid(struct arm_vsmmu *vsmmu, u32 vsid, u32 *sid) +{ + struct arm_smmu_master *master; + struct device *dev; + int ret = 0; + + xa_lock(&vsmmu->core.vdevs); + dev = iommufd_viommu_find_dev(&vsmmu->core, (unsigned long)vsid); + if (!dev) { + ret = -EIO; + goto unlock; + } + master = dev_iommu_priv_get(dev); + + /* At this moment, iommufd only supports PCI device that has one SID */ + if (sid) + *sid = master->streams[0].id; +unlock: + xa_unlock(&vsmmu->core.vdevs); + return ret; +} + +/* This is basically iommu_viommu_arm_smmuv3_invalidate in u64 for conversion */ +struct arm_vsmmu_invalidation_cmd { + union { + u64 cmd[2]; + struct iommu_viommu_arm_smmuv3_invalidate ucmd; + }; +}; + +/* + * Convert, in place, the raw invalidation command into an internal format that + * can be passed to arm_smmu_cmdq_issue_cmdlist(). Internally commands are + * stored in CPU endian. + * + * Enforce the VMID or SID on the command. 
+ */ +static int arm_vsmmu_convert_user_cmd(struct arm_vsmmu *vsmmu, + struct arm_vsmmu_invalidation_cmd *cmd) +{ + /* Commands are le64 stored in u64 */ + cmd->cmd[0] = le64_to_cpu(cmd->ucmd.cmd[0]); + cmd->cmd[1] = le64_to_cpu(cmd->ucmd.cmd[1]); + + switch (cmd->cmd[0] & CMDQ_0_OP) { + case CMDQ_OP_TLBI_NSNH_ALL: + /* Convert to NH_ALL */ + cmd->cmd[0] = CMDQ_OP_TLBI_NH_ALL | + FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid); + cmd->cmd[1] = 0; + break; + case CMDQ_OP_TLBI_NH_VA: + case CMDQ_OP_TLBI_NH_VAA: + case CMDQ_OP_TLBI_NH_ALL: + case CMDQ_OP_TLBI_NH_ASID: + cmd->cmd[0] &= ~CMDQ_TLBI_0_VMID; + cmd->cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid); + break; + case CMDQ_OP_ATC_INV: + case CMDQ_OP_CFGI_CD: + case CMDQ_OP_CFGI_CD_ALL: { + u32 sid, vsid = FIELD_GET(CMDQ_CFGI_0_SID, cmd->cmd[0]); + + if (arm_vsmmu_vsid_to_sid(vsmmu, vsid, &sid)) + return -EIO; + cmd->cmd[0] &= ~CMDQ_CFGI_0_SID; + cmd->cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, sid); + break; + } + default: + return -EIO; + } + return 0; +} + +static int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array) +{ + struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); + struct arm_smmu_device *smmu = vsmmu->smmu; + struct arm_vsmmu_invalidation_cmd *last; + struct arm_vsmmu_invalidation_cmd *cmds; + struct arm_vsmmu_invalidation_cmd *cur; + struct arm_vsmmu_invalidation_cmd *end; + int ret; + + cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL); + if (!cmds) + return -ENOMEM; + cur = cmds; + end = cmds + array->entry_num; + + static_assert(sizeof(*cmds) == 2 * sizeof(u64)); + ret = iommu_copy_struct_from_full_user_array( + cmds, sizeof(*cmds), array, + IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3); + if (ret) + goto out; + + last = cmds; + while (cur != end) { + ret = arm_vsmmu_convert_user_cmd(vsmmu, cur); + if (ret) + goto out; + + /* FIXME work in blocks of CMDQ_BATCH_ENTRIES and copy each block? */ + cur++; + if (cur != end && (cur - last) != CMDQ_BATCH_ENTRIES - 1) + continue; + + /* FIXME always uses the main cmdq rather than trying to group by type */ + ret = arm_smmu_cmdq_issue_cmdlist(smmu, &smmu->cmdq, last->cmd, + cur - last, true); + if (ret) { + cur--; + goto out; + } + last = cur; + } +out: + array->entry_num = cur - cmds; + kfree(cmds); + return ret; +} + static const struct iommufd_viommu_ops arm_vsmmu_ops = { .alloc_domain_nested = arm_vsmmu_alloc_domain_nested, + .cache_invalidate = arm_vsmmu_cache_invalidate, }; struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, @@ -239,6 +365,14 @@ struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, if (s2_parent->smmu != master->smmu) return ERR_PTR(-EINVAL); + /* + * FORCE_SYNC is not set with FEAT_NESTING. Some study of the exact HW + * defect is needed to determine if arm_vsmmu_cache_invalidate() needs + * any change to remove this. + */ + if (WARN_ON(smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) + return ERR_PTR(-EOPNOTSUPP); + /* * Must support some way to prevent the VM from bypassing the cache * because VFIO currently does not do any cache maintenance. 
canwbs diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 033c41ac59e5..b7aad0aa0e75 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -766,9 +766,9 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds, * insert their own list of commands then all of the commands from one * CPU will appear before any of the commands from the other CPU. */ -static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, - struct arm_smmu_cmdq *cmdq, - u64 *cmds, int n, bool sync) +int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, u64 *cmds, int n, + bool sync) { u64 cmd_sync[CMDQ_ENT_DWORDS]; u32 prod; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 24ede80f3eae..8d653425d84f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -529,6 +529,7 @@ struct arm_smmu_cmdq_ent { #define CMDQ_OP_TLBI_NH_ALL 0x10 #define CMDQ_OP_TLBI_NH_ASID 0x11 #define CMDQ_OP_TLBI_NH_VA 0x12 + #define CMDQ_OP_TLBI_NH_VAA 0x13 #define CMDQ_OP_TLBI_EL2_ALL 0x20 #define CMDQ_OP_TLBI_EL2_ASID 0x21 #define CMDQ_OP_TLBI_EL2_VA 0x22 @@ -951,6 +952,10 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state); void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, const struct arm_smmu_ste *target); +int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, u64 *cmds, int n, + bool sync); + #ifdef CONFIG_ARM_SMMU_V3_SVA bool arm_smmu_sva_supported(struct arm_smmu_device *smmu); bool arm_smmu_master_sva_supported(struct arm_smmu_master *master); diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 6d3710e9c14a..4117336fc75f 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -721,9 +721,11 @@ struct iommu_hwpt_get_dirty_bitmap { * enum iommu_hwpt_invalidate_data_type - IOMMU HWPT Cache Invalidation * Data Type * @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1 + * @IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3: Invalidation data for ARM SMMUv3 */ enum iommu_hwpt_invalidate_data_type { IOMMU_HWPT_INVALIDATE_DATA_VTD_S1 = 0, + IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3 = 1, }; /** @@ -762,6 +764,28 @@ struct iommu_hwpt_vtd_s1_invalidate { __u32 __reserved; }; +/** + * struct iommu_viommu_arm_smmuv3_invalidate - ARM SMMUv3 cahce invalidation + * (IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3) + * @cmd: 128-bit cache invalidation command that runs in SMMU CMDQ. + * Must be little-endian. + * + * Supported command list only when passing in a vIOMMU via @hwpt_id: + * CMDQ_OP_TLBI_NSNH_ALL + * CMDQ_OP_TLBI_NH_VA + * CMDQ_OP_TLBI_NH_VAA + * CMDQ_OP_TLBI_NH_ALL + * CMDQ_OP_TLBI_NH_ASID + * CMDQ_OP_ATC_INV + * CMDQ_OP_CFGI_CD + * CMDQ_OP_CFGI_CD_ALL + * + * -EIO will be returned if the command is not supported. + */ +struct iommu_viommu_arm_smmuv3_invalidate { + __aligned_le64 cmd[2]; +}; + /** * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE) * @size: sizeof(struct iommu_hwpt_invalidate) -- Gitee From ea06c81102d3c3a886c7a1f1a30194f0d515c386 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 14 Nov 2024 10:42:29 -0700 Subject: [PATCH 145/278] iommu/arm-smmu-v3: Import IOMMUFD module namespace ANBZ: #13617 commit 6d026e6d48cd2a95407c8fdd8d6187b871401c23 upstream. 
Commit 69d9b312f38a ("iommu/arm-smmu-v3: Support IOMMU_VIOMMU_ALLOC") started using _iommufd_object_alloc() without importing the IOMMUFD module namespace, resulting in a modpost warning: WARNING: modpost: module arm_smmu_v3 uses symbol _iommufd_object_alloc from namespace IOMMUFD, but does not import it. Commit d68beb276ba2 ("iommu/arm-smmu-v3: Support IOMMU_HWPT_INVALIDATE using a VIOMMU object") added another warning by using iommufd_viommu_find_dev(): WARNING: modpost: module arm_smmu_v3 uses symbol iommufd_viommu_find_dev from namespace IOMMUFD, but does not import it. Import the IOMMUFD module namespace to resolve the warnings. Fixes: 69d9b312f38a ("iommu/arm-smmu-v3: Support IOMMU_VIOMMU_ALLOC") Link: https://patch.msgid.link/r/20241114-arm-smmu-v3-import-iommufd-module-ns-v1-1-c551e7b972e9@kernel.org Signed-off-by: Nathan Chancellor Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index c96cab6521a4..6cc14d82399f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -397,3 +397,5 @@ struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, return &vsmmu->core; } + +MODULE_IMPORT_NS(IOMMUFD); -- Gitee From ecac5967bfd4057da3751600e1d67b9d9d8d8200 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 13 Nov 2024 11:51:34 -0800 Subject: [PATCH 146/278] iommufd: Export do_update_pinned ANBZ: #13617 commit 10caa8b45119fa19fa98ea304d49f7e1283062fc upstream. Export do_update_pinned. No functional change. Link: https://patch.msgid.link/r/1731527497-16091-2-git-send-email-steven.sistare@oracle.com Signed-off-by: Steve Sistare Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/io_pagetable.h | 5 +++++ drivers/iommu/iommufd/pages.c | 10 +++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 9b40b2237932..f5f20fa639ef 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -252,4 +252,9 @@ struct iopt_pages_access { unsigned int users; }; +struct pfn_reader_user; + +int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user); + #endif diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 8f249169831a..3427749bc5ce 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -985,8 +985,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, return rc; } -static int do_update_pinned(struct iopt_pages *pages, unsigned long npages, - bool inc, struct pfn_reader_user *user) +int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user) { int rc = 0; @@ -1020,8 +1020,8 @@ static void update_unpinned(struct iopt_pages *pages) return; if (pages->npinned == pages->last_npinned) return; - do_update_pinned(pages, pages->last_npinned - pages->npinned, false, - NULL); + iopt_pages_update_pinned(pages, pages->last_npinned - pages->npinned, + false, NULL); } /* @@ -1051,7 +1051,7 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, npages = pages->npinned - pages->last_npinned; inc = 
true; } - return do_update_pinned(pages, npages, inc, user); + return iopt_pages_update_pinned(pages, npages, inc, user); } /* -- Gitee From 1af926c2559ff534ddf4ad0e490409f1cfe1ef01 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 13 Nov 2024 11:51:35 -0800 Subject: [PATCH 147/278] iommufd: Lock all IOAS objects ANBZ: #13617 commit 051ae5aa73d782a8be2699a11e17c68f28b4e758 upstream. Define helpers to lock and unlock all IOAS objects. This will allow DMA mappings to be updated atomically during live update. This code is substantially the same as an initial version provided by Jason, plus fixes. Link: https://patch.msgid.link/r/1731527497-16091-3-git-send-email-steven.sistare@oracle.com Signed-off-by: Steve Sistare Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/ioas.c | 65 +++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 1 + drivers/iommu/iommufd/main.c | 1 + 3 files changed, 67 insertions(+) diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index c05d33fc3a50..c82ed5a92e3b 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -52,7 +52,10 @@ int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd) rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) goto out_table; + + down_read(&ucmd->ictx->ioas_creation_lock); iommufd_object_finalize(ucmd->ictx, &ioas->obj); + up_read(&ucmd->ictx->ioas_creation_lock); return 0; out_table: @@ -374,6 +377,68 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) return rc; } +static void iommufd_release_all_iova_rwsem(struct iommufd_ctx *ictx, + struct xarray *ioas_list) +{ + struct iommufd_ioas *ioas; + unsigned long index; + + xa_for_each(ioas_list, index, ioas) { + up_write(&ioas->iopt.iova_rwsem); + refcount_dec(&ioas->obj.users); + } + up_write(&ictx->ioas_creation_lock); + xa_destroy(ioas_list); +} + +static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx, + struct xarray *ioas_list) +{ + struct iommufd_object *obj; + unsigned long index; + int rc; + + /* + * This is very ugly, it is done instead of adding a lock around + * pages->source_mm, which is a performance path for mdev, we just + * obtain the write side of all the iova_rwsems which also protects the + * pages->source_*. Due to copies we can't know which IOAS could read + * from the pages, so we just lock everything. This is the only place + * locks are nested and they are uniformly taken in ID order. + * + * ioas_creation_lock prevents new IOAS from being installed in the + * xarray while we do this, and also prevents more than one thread from + * holding nested locks. 
+ */ + down_write(&ictx->ioas_creation_lock); + xa_lock(&ictx->objects); + xa_for_each(&ictx->objects, index, obj) { + struct iommufd_ioas *ioas; + + if (!obj || obj->type != IOMMUFD_OBJ_IOAS) + continue; + + if (!refcount_inc_not_zero(&obj->users)) + continue; + + xa_unlock(&ictx->objects); + + ioas = container_of(obj, struct iommufd_ioas, obj); + down_write_nest_lock(&ioas->iopt.iova_rwsem, + &ictx->ioas_creation_lock); + + rc = xa_err(xa_store(ioas_list, index, ioas, GFP_KERNEL)); + if (rc) { + iommufd_release_all_iova_rwsem(ictx, ioas_list); + return rc; + } + + xa_lock(&ictx->objects); + } + xa_unlock(&ictx->objects); + return 0; +} + int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 062656c19a07..57c0c8f0f6a5 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -24,6 +24,7 @@ struct iommufd_ctx { struct xarray objects; struct xarray groups; wait_queue_head_t destroy_wait; + struct rw_semaphore ioas_creation_lock; u8 account_mode; /* Compatibility with VFIO no iommu */ diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index d735fe04197f..13ac2286035e 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -222,6 +222,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n"); } + init_rwsem(&ictx->ioas_creation_lock); xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); xa_init(&ictx->groups); ictx->file = filp; -- Gitee From 8f39fdd37929ccc87472b85507c09047bb094858 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 13 Nov 2024 11:51:36 -0800 Subject: [PATCH 148/278] iommufd: Add IOMMU_IOAS_CHANGE_PROCESS ANBZ: #13617 commit 829ed626499c11c9d11c65e93febc1e0da7cd61b upstream. Add an ioctl that updates all DMA mappings to reflect the current process, Change the mm and transfer locked memory accounting from old to current mm. This will be used for live update, allowing an old process to hand the iommufd device descriptor to a new process. The new process calls the ioctl. IOMMU_IOAS_CHANGE_PROCESS only supports DMA mappings created with IOMMU_IOAS_MAP_FILE, because the kernel metadata for such mappings does not depend on the userland VA of the pages (which is different in the new process). IOMMU_IOAS_CHANGE_PROCESS fails if other types of mappings are present. This is a revised version of code originally provided by Jason. 
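As a rough illustration of the intended flow (not taken from this series; the function name take_over_iommufd and the descriptor handoff mechanism, e.g. SCM_RIGHTS over a unix socket, are assumed here), the new process that has received the iommufd file descriptor would issue the ioctl along these lines:

  #include <sys/ioctl.h>
  #include <linux/iommufd.h>

  /* Minimal sketch: claim the existing mappings in the receiving process */
  static int take_over_iommufd(int iommufd)
  {
          struct iommu_ioas_change_process cmd = {
                  .size = sizeof(cmd),
                  /* __reserved must be zero */
          };

          /* Fails with EINVAL if any non-IOMMU_IOAS_MAP_FILE mapping exists */
          return ioctl(iommufd, IOMMU_IOAS_CHANGE_PROCESS, &cmd);
  }

On success the pinned and locked memory accounting moves from the old process to the caller, which is what the selftest added later in this series verifies via /proc status counters.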
Link: https://patch.msgid.link/r/1731527497-16091-4-git-send-email-steven.sistare@oracle.com Suggested-by: Jason Gunthorpe Signed-off-by: Steve Sistare Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/io_pagetable.h | 1 + drivers/iommu/iommufd/ioas.c | 147 ++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 1 + drivers/iommu/iommufd/main.c | 2 + include/uapi/linux/iommufd.h | 23 ++++ 5 files changed, 174 insertions(+) diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index f5f20fa639ef..10c928a9a463 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -173,6 +173,7 @@ enum { IOPT_PAGES_ACCOUNT_NONE = 0, IOPT_PAGES_ACCOUNT_USER = 1, IOPT_PAGES_ACCOUNT_MM = 2, + IOPT_PAGES_ACCOUNT_MODE_NUM = 3, }; enum iopt_address_type { diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index c82ed5a92e3b..1542c5fd10a8 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -439,6 +439,153 @@ static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx, return 0; } +static bool need_charge_update(struct iopt_pages *pages) +{ + switch (pages->account_mode) { + case IOPT_PAGES_ACCOUNT_NONE: + return false; + case IOPT_PAGES_ACCOUNT_MM: + return pages->source_mm != current->mm; + case IOPT_PAGES_ACCOUNT_USER: + /* + * Update when mm changes because it also accounts + * in mm->pinned_vm. + */ + return (pages->source_user != current_user()) || + (pages->source_mm != current->mm); + } + return true; +} + +static int charge_current(unsigned long *npinned) +{ + struct iopt_pages tmp = { + .source_mm = current->mm, + .source_task = current->group_leader, + .source_user = current_user(), + }; + unsigned int account_mode; + int rc; + + for (account_mode = 0; account_mode != IOPT_PAGES_ACCOUNT_MODE_NUM; + account_mode++) { + if (!npinned[account_mode]) + continue; + + tmp.account_mode = account_mode; + rc = iopt_pages_update_pinned(&tmp, npinned[account_mode], true, + NULL); + if (rc) + goto err_undo; + } + return 0; + +err_undo: + while (account_mode != 0) { + account_mode--; + if (!npinned[account_mode]) + continue; + tmp.account_mode = account_mode; + iopt_pages_update_pinned(&tmp, npinned[account_mode], false, + NULL); + } + return rc; +} + +static void change_mm(struct iopt_pages *pages) +{ + struct task_struct *old_task = pages->source_task; + struct user_struct *old_user = pages->source_user; + struct mm_struct *old_mm = pages->source_mm; + + pages->source_mm = current->mm; + mmgrab(pages->source_mm); + mmdrop(old_mm); + + pages->source_task = current->group_leader; + get_task_struct(pages->source_task); + put_task_struct(old_task); + + pages->source_user = get_uid(current_user()); + free_uid(old_user); +} + +#define for_each_ioas_area(_xa, _index, _ioas, _area) \ + xa_for_each((_xa), (_index), (_ioas)) \ + for (_area = iopt_area_iter_first(&_ioas->iopt, 0, ULONG_MAX); \ + _area; \ + _area = iopt_area_iter_next(_area, 0, ULONG_MAX)) + +int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_change_process *cmd = ucmd->cmd; + struct iommufd_ctx *ictx = ucmd->ictx; + unsigned long all_npinned[IOPT_PAGES_ACCOUNT_MODE_NUM] = {}; + struct iommufd_ioas *ioas; + struct iopt_area *area; + struct iopt_pages *pages; + struct xarray ioas_list; + unsigned long index; + int rc; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + xa_init(&ioas_list); + rc = 
iommufd_take_all_iova_rwsem(ictx, &ioas_list); + if (rc) + return rc; + + for_each_ioas_area(&ioas_list, index, ioas, area) { + if (area->pages->type != IOPT_ADDRESS_FILE) { + rc = -EINVAL; + goto out; + } + } + + /* + * Count last_pinned pages, then clear it to avoid double counting + * if the same iopt_pages is visited multiple times in this loop. + * Since we are under all the locks, npinned == last_npinned, so we + * can easily restore last_npinned before we return. + */ + for_each_ioas_area(&ioas_list, index, ioas, area) { + pages = area->pages; + + if (need_charge_update(pages)) { + all_npinned[pages->account_mode] += pages->last_npinned; + pages->last_npinned = 0; + } + } + + rc = charge_current(all_npinned); + + if (rc) { + /* Charge failed. Fix last_npinned and bail. */ + for_each_ioas_area(&ioas_list, index, ioas, area) + area->pages->last_npinned = area->pages->npinned; + goto out; + } + + for_each_ioas_area(&ioas_list, index, ioas, area) { + pages = area->pages; + + /* Uncharge the old one (which also restores last_npinned) */ + if (need_charge_update(pages)) { + int r = iopt_pages_update_pinned(pages, pages->npinned, + false, NULL); + + if (WARN_ON(r)) + rc = r; + } + change_mm(pages); + } + +out: + iommufd_release_all_iova_rwsem(ictx, &ioas_list); + return rc; +} + int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 57c0c8f0f6a5..b6d706cf2c66 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -255,6 +255,7 @@ int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); int iommufd_ioas_map(struct iommufd_ucmd *ucmd); int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd); +int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd); int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); int iommufd_ioas_option(struct iommufd_ucmd *ucmd); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 13ac2286035e..0a96cc8f27da 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -349,6 +349,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { struct iommu_ioas_alloc, out_ioas_id), IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, struct iommu_ioas_allow_iovas, allowed_iovas), + IOCTL_OP(IOMMU_IOAS_CHANGE_PROCESS, iommufd_ioas_change_process, + struct iommu_ioas_change_process, __reserved), IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy, src_iova), IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 4117336fc75f..059b6537f2b7 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -54,6 +54,7 @@ enum { IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f, IOMMUFD_CMD_VIOMMU_ALLOC = 0x90, IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, + IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92, }; /** @@ -980,4 +981,26 @@ struct iommu_vdevice_alloc { __aligned_u64 virt_id; }; #define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC) + +/** + * struct iommu_ioas_change_process - ioctl(VFIO_IOAS_CHANGE_PROCESS) + * @size: sizeof(struct iommu_ioas_change_process) + * @__reserved: Must be 0 + * + * This transfers pinned memory counts for every memory map in every IOAS + * in the context to the current process. 
This only supports maps created + * with IOMMU_IOAS_MAP_FILE, and returns EINVAL if other maps are present. + * If the ioctl returns a failure status, then nothing is changed. + * + * This API is useful for transferring operation of a device from one process + * to another, such as during userland live update. + */ +struct iommu_ioas_change_process { + __u32 size; + __u32 __reserved; +}; + +#define IOMMU_IOAS_CHANGE_PROCESS \ + _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS) + #endif -- Gitee From f21ad1c163b866834b0ee5c2fdaf0f38ac6cfec1 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 13 Nov 2024 11:51:37 -0800 Subject: [PATCH 149/278] iommufd: IOMMU_IOAS_CHANGE_PROCESS selftest ANBZ: #13617 commit c0dec4b848ce5110e95095d0d0ae46724beb70ec upstream. Add selftest cases for IOMMU_IOAS_CHANGE_PROCESS. Link: https://patch.msgid.link/r/1731527497-16091-5-git-send-email-steven.sistare@oracle.com Signed-off-by: Steve Sistare Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- tools/testing/selftests/iommu/Makefile | 1 + tools/testing/selftests/iommu/iommufd.c | 141 ++++++++++++++++++ tools/testing/selftests/iommu/iommufd_utils.h | 6 + 3 files changed, 148 insertions(+) diff --git a/tools/testing/selftests/iommu/Makefile b/tools/testing/selftests/iommu/Makefile index 32c5fdfd0eef..f824582a253f 100644 --- a/tools/testing/selftests/iommu/Makefile +++ b/tools/testing/selftests/iommu/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O2 -Wno-unused-function CFLAGS += $(KHDR_INCLUDES) +LDLIBS += -lcap CFLAGS += -D_GNU_SOURCE diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 94fe038d2eee..a1b2b657999d 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -2,6 +2,7 @@ /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ #include #include +#include #include #include @@ -135,6 +136,8 @@ TEST_F(iommufd, cmd_length) TEST_LENGTH(iommu_ioas_map_file, IOMMU_IOAS_MAP_FILE, iova); TEST_LENGTH(iommu_viommu_alloc, IOMMU_VIOMMU_ALLOC, out_viommu_id); TEST_LENGTH(iommu_vdevice_alloc, IOMMU_VDEVICE_ALLOC, virt_id); + TEST_LENGTH(iommu_ioas_change_process, IOMMU_IOAS_CHANGE_PROCESS, + __reserved); #undef TEST_LENGTH } @@ -193,6 +196,144 @@ TEST_F(iommufd, global_options) EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_OPTION, &cmd)); } +static void drop_cap_ipc_lock(struct __test_metadata *_metadata) +{ + cap_t caps; + cap_value_t cap_list[1] = { CAP_IPC_LOCK }; + + caps = cap_get_proc(); + ASSERT_NE(caps, NULL); + ASSERT_NE(-1, + cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR)); + ASSERT_NE(-1, cap_set_proc(caps)); + cap_free(caps); +} + +static long get_proc_status_value(pid_t pid, const char *var) +{ + FILE *fp; + char buf[80], tag[80]; + long val = -1; + + snprintf(buf, sizeof(buf), "/proc/%d/status", pid); + fp = fopen(buf, "r"); + if (!fp) + return val; + + while (fgets(buf, sizeof(buf), fp)) + if (fscanf(fp, "%s %ld\n", tag, &val) == 2 && !strcmp(tag, var)) + break; + + fclose(fp); + return val; +} + +static long get_vm_pinned(pid_t pid) +{ + return get_proc_status_value(pid, "VmPin:"); +} + +static long get_vm_locked(pid_t pid) +{ + return get_proc_status_value(pid, "VmLck:"); +} + +FIXTURE(change_process) +{ + int fd; + uint32_t ioas_id; +}; + +FIXTURE_VARIANT(change_process) +{ + int accounting; +}; + +FIXTURE_SETUP(change_process) +{ + self->fd = open("/dev/iommu", O_RDWR); + ASSERT_NE(-1, self->fd); + + 
drop_cap_ipc_lock(_metadata); + if (variant->accounting != IOPT_PAGES_ACCOUNT_NONE) { + struct iommu_option set_limit_cmd = { + .size = sizeof(set_limit_cmd), + .option_id = IOMMU_OPTION_RLIMIT_MODE, + .op = IOMMU_OPTION_OP_SET, + .val64 = (variant->accounting == IOPT_PAGES_ACCOUNT_MM), + }; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &set_limit_cmd)); + } + + test_ioctl_ioas_alloc(&self->ioas_id); + test_cmd_mock_domain(self->ioas_id, NULL, NULL, NULL); +} + +FIXTURE_TEARDOWN(change_process) +{ + teardown_iommufd(self->fd, _metadata); +} + +FIXTURE_VARIANT_ADD(change_process, account_none) +{ + .accounting = IOPT_PAGES_ACCOUNT_NONE, +}; + +FIXTURE_VARIANT_ADD(change_process, account_user) +{ + .accounting = IOPT_PAGES_ACCOUNT_USER, +}; + +FIXTURE_VARIANT_ADD(change_process, account_mm) +{ + .accounting = IOPT_PAGES_ACCOUNT_MM, +}; + +TEST_F(change_process, basic) +{ + pid_t parent = getpid(); + pid_t child; + __u64 iova; + struct iommu_ioas_change_process cmd = { + .size = sizeof(cmd), + }; + + /* Expect failure if non-file maps exist */ + test_ioctl_ioas_map(buffer, PAGE_SIZE, &iova); + EXPECT_ERRNO(EINVAL, ioctl(self->fd, IOMMU_IOAS_CHANGE_PROCESS, &cmd)); + test_ioctl_ioas_unmap(iova, PAGE_SIZE); + + /* Change process works in current process. */ + test_ioctl_ioas_map_file(mfd, 0, PAGE_SIZE, &iova); + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_CHANGE_PROCESS, &cmd)); + + /* Change process works in another process */ + child = fork(); + if (!child) { + int nlock = PAGE_SIZE / 1024; + + /* Parent accounts for locked memory before */ + ASSERT_EQ(nlock, get_vm_pinned(parent)); + if (variant->accounting == IOPT_PAGES_ACCOUNT_MM) + ASSERT_EQ(nlock, get_vm_locked(parent)); + ASSERT_EQ(0, get_vm_pinned(getpid())); + ASSERT_EQ(0, get_vm_locked(getpid())); + + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_CHANGE_PROCESS, &cmd)); + + /* Child accounts for locked memory after */ + ASSERT_EQ(0, get_vm_pinned(parent)); + ASSERT_EQ(0, get_vm_locked(parent)); + ASSERT_EQ(nlock, get_vm_pinned(getpid())); + if (variant->accounting == IOPT_PAGES_ACCOUNT_MM) + ASSERT_EQ(nlock, get_vm_locked(getpid())); + + exit(0); + } + ASSERT_NE(-1, child); + ASSERT_EQ(child, waitpid(child, NULL, 0)); +} + FIXTURE(iommufd_ioas) { int fd; diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index c0239f86f2f8..d979f5b0efe8 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -22,6 +22,12 @@ #define BIT_MASK(nr) (1UL << ((nr) % __BITS_PER_LONG)) #define BIT_WORD(nr) ((nr) / __BITS_PER_LONG) +enum { + IOPT_PAGES_ACCOUNT_NONE = 0, + IOPT_PAGES_ACCOUNT_USER = 1, + IOPT_PAGES_ACCOUNT_MM = 2, +}; + #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) static inline void set_bit(unsigned int nr, unsigned long *addr) -- Gitee From 9bdfaf3918cf5a03e4242a9d86b8b3b811acc878 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 14 Nov 2024 15:55:30 -0400 Subject: [PATCH 150/278] iommu: Add ops->domain_alloc_nested() ANBZ: #13617 commit 64214c2b95364d26cdff045d8bbefd37380edbe1 upstream. It turns out all the drivers that are using this immediately call into another function, so just make that function directly into the op. This makes paging=NULL for domain_alloc_user and we can remove the argument in the next patch. The function mirrors the similar op in the viommu that allocates a nested domain on top of the viommu's nesting parent. This version supports cases where a viommu is not being used. 
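As a sketch of what a driver-side implementation of the new op looks like (the foo_* names and the data-type check are hypothetical, not from this series; only the op signature comes from this patch), the op receives the nesting parent directly and returns an IOMMU_DOMAIN_NESTED domain built on top of it:

  static struct iommu_domain *
  foo_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
                          u32 flags, const struct iommu_user_data *user_data)
  {
          struct foo_nested_domain *nested;

          if (flags)
                  return ERR_PTR(-EOPNOTSUPP);
          if (user_data->type != IOMMU_HWPT_DATA_FOO)
                  return ERR_PTR(-EOPNOTSUPP);

          nested = kzalloc(sizeof(*nested), GFP_KERNEL);
          if (!nested)
                  return ERR_PTR(-ENOMEM);

          nested->domain.type = IOMMU_DOMAIN_NESTED;
          nested->domain.ops = &foo_nested_domain_ops;
          nested->s2_parent = parent;
          return &nested->domain;
  }

The driver then wires up .domain_alloc_nested in its iommu_ops, as the Intel and mock-selftest drivers do in this patch, and iommufd_hwpt_nested_alloc() invokes the new op instead of domain_alloc_user().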
Link: https://patch.msgid.link/r/1-v1-c252ebdeb57b+329-iommu_paging_flags_jgg@nvidia.com Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 9 +++------ drivers/iommu/intel/iommu.h | 6 ++++-- drivers/iommu/intel/nested.c | 11 +++++++++-- drivers/iommu/iommufd/hw_pagetable.c | 8 ++++---- drivers/iommu/iommufd/selftest.c | 7 ++++--- include/linux/iommu.h | 9 +++++---- 6 files changed, 29 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 94b645966e95..95dea7bcdc27 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3399,12 +3399,8 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *domain; bool first_stage; - /* Must be NESTING domain */ - if (parent) { - if (!nested_supported(iommu) || flags) - return ERR_PTR(-EOPNOTSUPP); - return intel_nested_domain_alloc(parent, user_data); - } + if (parent) + return ERR_PTR(-EOPNOTSUPP); if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING @@ -4534,6 +4530,7 @@ const struct iommu_ops intel_iommu_ops = { .domain_alloc_user = intel_iommu_domain_alloc_user, .domain_alloc_sva = intel_svm_domain_alloc, .domain_alloc_paging = intel_iommu_domain_alloc_paging, + .domain_alloc_nested = intel_iommu_domain_alloc_nested, .probe_device = intel_iommu_probe_device, .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 26accaeca410..0a20b11b0726 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1265,8 +1265,10 @@ int __domain_setup_first_level(struct intel_iommu *iommu, int dmar_ir_support(void); void iommu_flush_write_buffer(struct intel_iommu *iommu); -struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, - const struct iommu_user_data *user_data); +struct iommu_domain * +intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, + u32 flags, + const struct iommu_user_data *user_data); struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid); enum cache_tag_type { diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 42c4533a6ea2..aba92c00b427 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -186,14 +186,21 @@ static const struct iommu_domain_ops intel_nested_domain_ops = { .cache_invalidate_user = intel_nested_cache_invalidate_user, }; -struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, - const struct iommu_user_data *user_data) +struct iommu_domain * +intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, + u32 flags, + const struct iommu_user_data *user_data) { + struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *s2_domain = to_dmar_domain(parent); + struct intel_iommu *iommu = info->iommu; struct iommu_hwpt_vtd_s1 vtd; struct dmar_domain *domain; int ret; + if (!nested_supported(iommu) || flags) + return ERR_PTR(-EOPNOTSUPP); + /* Must be nested domain */ if (user_data->type != IOMMU_HWPT_DATA_VTD_S1) return ERR_PTR(-EOPNOTSUPP); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 9236e8ca9aa8..ec3c64a8c796 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -227,7 +227,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, int rc; if ((flags & 
~IOMMU_HWPT_FAULT_ID_VALID) || - !user_data->len || !ops->domain_alloc_user) + !user_data->len || !ops->domain_alloc_nested) return ERR_PTR(-EOPNOTSUPP); if (parent->auto_domain || !parent->nest_parent || parent->common.domain->owner != ops) @@ -242,9 +242,9 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, refcount_inc(&parent->common.obj.users); hwpt_nested->parent = parent; - hwpt->domain = ops->domain_alloc_user(idev->dev, - flags & ~IOMMU_HWPT_FAULT_ID_VALID, - parent->common.domain, user_data); + hwpt->domain = ops->domain_alloc_nested( + idev->dev, parent->common.domain, + flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 2f9de177dffc..c58083c3660a 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -356,8 +356,8 @@ __mock_domain_alloc_nested(const struct iommu_user_data *user_data) } static struct iommu_domain * -mock_domain_alloc_nested(struct iommu_domain *parent, u32 flags, - const struct iommu_user_data *user_data) +mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, + u32 flags, const struct iommu_user_data *user_data) { struct mock_iommu_domain_nested *mock_nested; struct mock_iommu_domain *mock_parent; @@ -391,7 +391,7 @@ mock_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *domain; if (parent) - return mock_domain_alloc_nested(parent, flags, user_data); + return ERR_PTR(-EOPNOTSUPP); if (user_data) return ERR_PTR(-EOPNOTSUPP); @@ -719,6 +719,7 @@ static const struct iommu_ops mock_ops = { .hw_info = mock_domain_hw_info, .domain_alloc_paging = mock_domain_alloc_paging, .domain_alloc_user = mock_domain_alloc_user, + .domain_alloc_nested = mock_domain_alloc_nested, .capable = mock_domain_capable, .device_group = generic_device_group, .probe_device = mock_probe_device, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7188d60ecb28..4051ff9e54aa 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -567,15 +567,13 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * the caller iommu_domain_alloc() returns. * @domain_alloc_user: Allocate an iommu domain corresponding to the input * parameters as defined in include/uapi/linux/iommufd.h. - * Upon success, if the @user_data is valid and the @parent - * points to a kernel-managed domain, the new domain must be - * IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be - * NULL while the @user_data can be optionally provided, the + * The @user_data can be optionally provided, the * new domain must support __IOMMU_DOMAIN_PAGING. * Upon failure, ERR_PTR must be returned. * @domain_alloc_paging: Allocate an iommu_domain that can be used for * UNMANAGED, DMA, and DMA_FQ domain types. * @domain_alloc_sva: Allocate an iommu_domain for Shared Virtual Addressing. + * @domain_alloc_nested: Allocate an iommu_domain for nested translation. 
* @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -630,6 +628,9 @@ struct iommu_ops { struct iommu_domain *(*domain_alloc_paging)(struct device *dev); struct iommu_domain *(*domain_alloc_sva)(struct device *dev, struct mm_struct *mm); + struct iommu_domain *(*domain_alloc_nested)( + struct device *dev, struct iommu_domain *parent, u32 flags, + const struct iommu_user_data *user_data); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); -- Gitee From 6ab1039fe63bcb363c5f146a0f898521e1b8eb64 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 14 Nov 2024 15:55:31 -0400 Subject: [PATCH 151/278] iommu: Rename ops->domain_alloc_user() to domain_alloc_paging_flags() ANBZ: #13617 commit d53764723ecd639a0cc0c5ad24146847fc09f78d upstream. Now that the main domain allocating path is calling this function it doesn't make sense to leave it named _user. Change the name to alloc_paging_flags() to mirror the new iommu_paging_domain_alloc_flags() function. A driver should implement only one of ops->domain_alloc_paging() or ops->domain_alloc_paging_flags(). The former is a simpler interface with less boiler plate that the majority of drivers use. The latter is for drivers with a greater feature set (PASID, multiple page table support, advanced iommufd support, nesting, etc). Additional patches will be needed to achieve this. Link: https://patch.msgid.link/r/2-v1-c252ebdeb57b+329-iommu_paging_flags_jgg@nvidia.com Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen Conflicts: include/linux/iommu.h --- drivers/iommu/amd/iommu.c | 9 ++++----- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 ++++----- drivers/iommu/intel/iommu.c | 10 +++------- drivers/iommu/iommu.c | 4 ++-- drivers/iommu/iommufd/hw_pagetable.c | 8 ++++---- drivers/iommu/iommufd/selftest.c | 10 +++------- include/linux/iommu.h | 22 ++++++++++++--------- 7 files changed, 33 insertions(+), 39 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 3193a76e01f4..cabc336f1905 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2429,9 +2429,8 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type) } static struct iommu_domain * -amd_iommu_domain_alloc_user(struct device *dev, u32 flags, - struct iommu_domain *parent, - const struct iommu_user_data *user_data) +amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) { unsigned int type = IOMMU_DOMAIN_UNMANAGED; @@ -2442,7 +2441,7 @@ amd_iommu_domain_alloc_user(struct device *dev, u32 flags, if (dev) iommu = get_amd_iommu_from_dev(dev); - if ((flags & ~supported_flags) || parent || user_data) + if ((flags & ~supported_flags) || user_data) return ERR_PTR(-EOPNOTSUPP); /* Allocate domain with v2 page table if IOMMU supports PASID. 
*/ @@ -2906,7 +2905,7 @@ const struct iommu_ops amd_iommu_ops = { .release_domain = &release_domain, .identity_domain = &identity_domain.domain, .domain_alloc = amd_iommu_domain_alloc, - .domain_alloc_user = amd_iommu_domain_alloc_user, + .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, .domain_alloc_sva = amd_iommu_domain_alloc_sva, .probe_device = amd_iommu_probe_device, .release_device = amd_iommu_release_device, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index b7aad0aa0e75..4ff3b766e8cf 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3132,9 +3132,8 @@ static struct iommu_domain arm_smmu_blocked_domain = { }; static struct iommu_domain * -arm_smmu_domain_alloc_user(struct device *dev, u32 flags, - struct iommu_domain *parent, - const struct iommu_user_data *user_data) +arm_smmu_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | @@ -3144,7 +3143,7 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, if (flags & ~PAGING_FLAGS) return ERR_PTR(-EOPNOTSUPP); - if (parent || user_data) + if (user_data) return ERR_PTR(-EOPNOTSUPP); if (flags & IOMMU_HWPT_ALLOC_PASID) @@ -3556,7 +3555,7 @@ static struct iommu_ops arm_smmu_ops = { .hw_info = arm_smmu_hw_info, .domain_alloc_paging = arm_smmu_domain_alloc_paging, .domain_alloc_sva = arm_smmu_sva_domain_alloc, - .domain_alloc_user = arm_smmu_domain_alloc_user, + .domain_alloc_paging_flags = arm_smmu_domain_alloc_paging_flags, .probe_device = arm_smmu_probe_device, .release_device = arm_smmu_release_device, .device_group = arm_smmu_device_group, diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 95dea7bcdc27..1c8307d8a912 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3387,9 +3387,8 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st } static struct iommu_domain * -intel_iommu_domain_alloc_user(struct device *dev, u32 flags, - struct iommu_domain *parent, - const struct iommu_user_data *user_data) +intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) { struct device_domain_info *info = dev_iommu_priv_get(dev); bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; @@ -3399,9 +3398,6 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *domain; bool first_stage; - if (parent) - return ERR_PTR(-EOPNOTSUPP); - if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_FAULT_ID_VALID))) @@ -4527,7 +4523,7 @@ const struct iommu_ops intel_iommu_ops = { .identity_domain = &identity_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, - .domain_alloc_user = intel_iommu_domain_alloc_user, + .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags, .domain_alloc_sva = intel_svm_domain_alloc, .domain_alloc_paging = intel_iommu_domain_alloc_paging, .domain_alloc_nested = intel_iommu_domain_alloc_nested, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index dd8a4b2ea185..c407b03f90b3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2026,8 +2026,8 @@ __iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type, if (ops->domain_alloc_paging && !flags) domain = 
ops->domain_alloc_paging(dev); - else if (ops->domain_alloc_user) - domain = ops->domain_alloc_user(dev, flags, NULL, NULL); + else if (ops->domain_alloc_paging_flags) + domain = ops->domain_alloc_paging_flags(dev, flags, NULL); else if (ops->domain_alloc && !flags) domain = ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED); else diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index ec3c64a8c796..ce03c3804651 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -119,7 +119,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, lockdep_assert_held(&ioas->mutex); - if ((flags || user_data) && !ops->domain_alloc_user) + if ((flags || user_data) && !ops->domain_alloc_paging_flags) return ERR_PTR(-EOPNOTSUPP); if (flags & ~valid_flags) return ERR_PTR(-EOPNOTSUPP); @@ -139,9 +139,9 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, hwpt_paging->ioas = ioas; hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; - if (ops->domain_alloc_user) { - hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL, - user_data); + if (ops->domain_alloc_paging_flags) { + hwpt->domain = ops->domain_alloc_paging_flags(idev->dev, flags, + user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index c58083c3660a..a0de6d6d4e68 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -379,9 +379,8 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, } static struct iommu_domain * -mock_domain_alloc_user(struct device *dev, u32 flags, - struct iommu_domain *parent, - const struct iommu_user_data *user_data) +mock_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) { bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | @@ -390,9 +389,6 @@ mock_domain_alloc_user(struct device *dev, u32 flags, MOCK_FLAGS_DEVICE_NO_DIRTY; struct iommu_domain *domain; - if (parent) - return ERR_PTR(-EOPNOTSUPP); - if (user_data) return ERR_PTR(-EOPNOTSUPP); if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) @@ -718,7 +714,7 @@ static const struct iommu_ops mock_ops = { .pgsize_bitmap = MOCK_IO_PAGE_SIZE, .hw_info = mock_domain_hw_info, .domain_alloc_paging = mock_domain_alloc_paging, - .domain_alloc_user = mock_domain_alloc_user, + .domain_alloc_paging_flags = mock_domain_alloc_paging_flags, .domain_alloc_nested = mock_domain_alloc_nested, .capable = mock_domain_capable, .device_group = generic_device_group, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4051ff9e54aa..1c7fd0122e7a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -565,13 +565,17 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * @domain_alloc: allocate and return an iommu domain if success. Otherwise * NULL is returned. The domain is not fully initialized until * the caller iommu_domain_alloc() returns. - * @domain_alloc_user: Allocate an iommu domain corresponding to the input - * parameters as defined in include/uapi/linux/iommufd.h. - * The @user_data can be optionally provided, the - * new domain must support __IOMMU_DOMAIN_PAGING. - * Upon failure, ERR_PTR must be returned. 
-* @domain_alloc_paging: Allocate an iommu_domain that can be used for - * UNMANAGED, DMA, and DMA_FQ domain types. + * @domain_alloc_paging_flags: Allocate an iommu domain corresponding to the + * input parameters as defined in + * include/uapi/linux/iommufd.h. The @user_data can be + * optionally provided, the new domain must support + * __IOMMU_DOMAIN_PAGING. Upon failure, ERR_PTR must be + * returned. + * @domain_alloc_paging: Allocate an iommu_domain that can be used for + * UNMANAGED, DMA, and DMA_FQ domain types. This is the + * same as invoking domain_alloc_paging_flags() with + * @flags=0, @user_data=NULL. A driver should implement + * only one of the two ops. * @domain_alloc_sva: Allocate an iommu_domain for Shared Virtual Addressing. * @domain_alloc_nested: Allocate an iommu_domain for nested translation. * @probe_device: Add device to iommu driver handling @@ -622,8 +626,8 @@ struct iommu_ops { /* Domain allocation and freeing by the iommu driver */ struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); - struct iommu_domain *(*domain_alloc_user)( - struct device *dev, u32 flags, struct iommu_domain *parent, + struct iommu_domain *(*domain_alloc_paging_flags)( + struct device *dev, u32 flags, const struct iommu_user_data *user_data); struct iommu_domain *(*domain_alloc_paging)(struct device *dev); struct iommu_domain *(*domain_alloc_sva)(struct device *dev, -- Gitee From 6c9b4f0732f2b87d90a65022652a5bf31ab7a417 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Tue, 3 Dec 2024 18:49:05 +0000 Subject: [PATCH 152/278] iommu/arm-smmu-v3: Introduce struct arm_smmu_event ANBZ: #13617 commit 43ca55f5555b53d0c37039c62d75e882cccbfb69 upstream. Introduce `struct arm_smmu_event` to represent event records. Parse out relevant fields from raw event records for ease and use the new `struct arm_smmu_event` instead. Signed-off-by: Pranjal Shrivastava Link: https://lore.kernel.org/r/20241203184906.2264528-2-praan@google.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 55 ++++++++++++++------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 18 +++++++ 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 4ff3b766e8cf..ee65d7e825a3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1759,17 +1759,34 @@ arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid) } /* IRQ and event handlers */ -static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) +static void arm_smmu_decode_event(u64 *raw, struct arm_smmu_event *event) +{ + event->id = FIELD_GET(EVTQ_0_ID, raw[0]); + event->sid = FIELD_GET(EVTQ_0_SID, raw[0]); + event->ssv = FIELD_GET(EVTQ_0_SSV, raw[0]); + event->ssid = event->ssv ? 
FIELD_GET(EVTQ_0_SSID, raw[0]) : IOMMU_NO_PASID; + event->privileged = FIELD_GET(EVTQ_1_PnU, raw[1]); + event->instruction = FIELD_GET(EVTQ_1_InD, raw[1]); + event->s2 = FIELD_GET(EVTQ_1_S2, raw[1]); + event->read = FIELD_GET(EVTQ_1_RnW, raw[1]); + event->stag = FIELD_GET(EVTQ_1_STAG, raw[1]); + event->stall = FIELD_GET(EVTQ_1_STALL, raw[1]); + event->class = FIELD_GET(EVTQ_1_CLASS, raw[1]); + event->iova = FIELD_GET(EVTQ_2_ADDR, raw[2]); + event->ipa = raw[3] & EVTQ_3_IPA; + event->fetch_addr = raw[3] & EVTQ_3_FETCH_ADDR; +} + +static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, + struct arm_smmu_event *event) { int ret = 0; u32 perm = 0; struct arm_smmu_master *master; - bool ssid_valid = evt[0] & EVTQ_0_SSV; - u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]); struct iopf_fault fault_evt = { }; struct iommu_fault *flt = &fault_evt.fault; - switch (FIELD_GET(EVTQ_0_ID, evt[0])) { + switch (event->id) { case EVT_ID_TRANSLATION_FAULT: case EVT_ID_ADDR_SIZE_FAULT: case EVT_ID_ACCESS_FAULT: @@ -1779,35 +1796,35 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) return -EOPNOTSUPP; } - if (!(evt[1] & EVTQ_1_STALL)) + if (!event->stall) return -EOPNOTSUPP; - if (evt[1] & EVTQ_1_RnW) + if (event->read) perm |= IOMMU_FAULT_PERM_READ; else perm |= IOMMU_FAULT_PERM_WRITE; - if (evt[1] & EVTQ_1_InD) + if (event->instruction) perm |= IOMMU_FAULT_PERM_EXEC; - if (evt[1] & EVTQ_1_PnU) + if (event->privileged) perm |= IOMMU_FAULT_PERM_PRIV; flt->type = IOMMU_FAULT_PAGE_REQ; flt->prm = (struct iommu_fault_page_request) { .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, - .grpid = FIELD_GET(EVTQ_1_STAG, evt[1]), + .grpid = event->stag, .perm = perm, - .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]), + .addr = event->iova, }; - if (ssid_valid) { + if (event->ssv) { flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - flt->prm.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]); + flt->prm.pasid = event->ssid; } mutex_lock(&smmu->streams_mutex); - master = arm_smmu_find_master(smmu, sid); + master = arm_smmu_find_master(smmu, event->sid); if (!master) { ret = -EINVAL; goto out_unlock; @@ -1822,23 +1839,23 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) { int i, ret; + u64 evt[EVTQ_ENT_DWORDS]; + struct arm_smmu_event event = {0}; struct arm_smmu_device *smmu = dev; struct arm_smmu_queue *q = &smmu->evtq.q; struct arm_smmu_ll_queue *llq = &q->llq; static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - u64 evt[EVTQ_ENT_DWORDS]; do { while (!queue_remove_raw(q, evt)) { - u8 id = FIELD_GET(EVTQ_0_ID, evt[0]); - - ret = arm_smmu_handle_evt(smmu, evt); + arm_smmu_decode_event(evt, &event); + ret = arm_smmu_handle_evt(smmu, &event); if (!ret || !__ratelimit(&rs)) continue; - dev_info(smmu->dev, "event 0x%02x received:\n", id); - for (i = 0; i < ARRAY_SIZE(evt); ++i) + dev_info(smmu->dev, "event 0x%02x received:\n", event.id); + for (i = 0; i < EVTQ_ENT_DWORDS; ++i) dev_info(smmu->dev, "\t0x%016llx\n", (unsigned long long)evt[i]); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 8d653425d84f..1478049dea4b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -470,6 +470,7 @@ static inline unsigned int arm_smmu_cdtab_l2_idx(unsigned int ssid) #define EVTQ_1_TT_READ (1UL << 44) #define EVTQ_2_ADDR GENMASK_ULL(63, 0) #define EVTQ_3_IPA GENMASK_ULL(51, 12) +#define EVTQ_3_FETCH_ADDR 
GENMASK_ULL(51, 3) /* PRI queue */ #define PRIQ_ENT_SZ_SHIFT 4 @@ -789,6 +790,23 @@ struct arm_smmu_stream { struct rb_node node; }; +struct arm_smmu_event { + u8 stall : 1, + ssv : 1, + privileged : 1, + instruction : 1, + s2 : 1, + read : 1; + u8 id; + u8 class; + u16 stag; + u32 sid; + u32 ssid; + u64 iova; + u64 ipa; + u64 fetch_addr; +}; + /* SMMU private data for each master */ struct arm_smmu_master { struct arm_smmu_device *smmu; -- Gitee From c7aeece438f0ae7a9290b528b92407ce42ae1553 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Tue, 3 Dec 2024 18:49:06 +0000 Subject: [PATCH 153/278] iommu/arm-smmu-v3: Log better event records ANBZ: #13617 commit d814b70b9b901c823ddedd12757ca4a19b18c8f7 upstream. Currently, the driver dumps the raw hex for a received event record. Improve this by leveraging `struct arm_smmu_event` for event fields and log human-readable event records with meaningful information. Signed-off-by: Pranjal Shrivastava Link: https://lore.kernel.org/r/20241203184906.2264528-3-praan@google.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 114 +++++++++++++++++--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 14 ++- 2 files changed, 115 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ee65d7e825a3..e71a5cb6cf0d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -83,6 +83,28 @@ static struct arm_smmu_option_prop arm_smmu_options[] = { { 0, NULL}, }; +static const char * const event_str[] = { + [EVT_ID_BAD_STREAMID_CONFIG] = "C_BAD_STREAMID", + [EVT_ID_STE_FETCH_FAULT] = "F_STE_FETCH", + [EVT_ID_BAD_STE_CONFIG] = "C_BAD_STE", + [EVT_ID_STREAM_DISABLED_FAULT] = "F_STREAM_DISABLED", + [EVT_ID_BAD_SUBSTREAMID_CONFIG] = "C_BAD_SUBSTREAMID", + [EVT_ID_CD_FETCH_FAULT] = "F_CD_FETCH", + [EVT_ID_BAD_CD_CONFIG] = "C_BAD_CD", + [EVT_ID_TRANSLATION_FAULT] = "F_TRANSLATION", + [EVT_ID_ADDR_SIZE_FAULT] = "F_ADDR_SIZE", + [EVT_ID_ACCESS_FAULT] = "F_ACCESS", + [EVT_ID_PERMISSION_FAULT] = "F_PERMISSION", + [EVT_ID_VMS_FETCH_FAULT] = "F_VMS_FETCH", +}; + +static const char * const event_class_str[] = { + [0] = "CD fetch", + [1] = "Stage 1 translation table fetch", + [2] = "Input address caused fault", + [3] = "Reserved", +}; + static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, struct arm_smmu_device *smmu, u32 flags); static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master); @@ -1759,8 +1781,11 @@ arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid) } /* IRQ and event handlers */ -static void arm_smmu_decode_event(u64 *raw, struct arm_smmu_event *event) +static void arm_smmu_decode_event(struct arm_smmu_device *smmu, u64 *raw, + struct arm_smmu_event *event) { + struct arm_smmu_master *master; + event->id = FIELD_GET(EVTQ_0_ID, raw[0]); event->sid = FIELD_GET(EVTQ_0_SID, raw[0]); event->ssv = FIELD_GET(EVTQ_0_SSV, raw[0]); @@ -1775,9 +1800,21 @@ static void arm_smmu_decode_event(u64 *raw, struct arm_smmu_event *event) event->iova = FIELD_GET(EVTQ_2_ADDR, raw[2]); event->ipa = raw[3] & EVTQ_3_IPA; event->fetch_addr = raw[3] & EVTQ_3_FETCH_ADDR; + event->ttrnw = FIELD_GET(EVTQ_1_TT_READ, raw[1]); + event->class_tt = false; + event->dev = NULL; + + if (event->id == EVT_ID_PERMISSION_FAULT) + event->class_tt = (event->class == EVTQ_1_CLASS_TT); + + mutex_lock(&smmu->streams_mutex); + master = arm_smmu_find_master(smmu, event->sid); + if (master) 
+ event->dev = get_device(master->dev); + mutex_unlock(&smmu->streams_mutex); } -static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, +static int arm_smmu_handle_event(struct arm_smmu_device *smmu, struct arm_smmu_event *event) { int ret = 0; @@ -1836,9 +1873,67 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, return ret; } +static void arm_smmu_dump_raw_event(struct arm_smmu_device *smmu, u64 *raw, + struct arm_smmu_event *event) +{ + int i; + + dev_err(smmu->dev, "event 0x%02x received:\n", event->id); + + for (i = 0; i < EVTQ_ENT_DWORDS; ++i) + dev_err(smmu->dev, "\t0x%016llx\n", raw[i]); +} + +#define ARM_SMMU_EVT_KNOWN(e) ((e)->id < ARRAY_SIZE(event_str) && event_str[(e)->id]) +#define ARM_SMMU_LOG_EVT_STR(e) ARM_SMMU_EVT_KNOWN(e) ? event_str[(e)->id] : "UNKNOWN" +#define ARM_SMMU_LOG_CLIENT(e) (e)->dev ? dev_name((e)->dev) : "(unassigned sid)" + +static void arm_smmu_dump_event(struct arm_smmu_device *smmu, u64 *raw, + struct arm_smmu_event *evt, + struct ratelimit_state *rs) +{ + if (!__ratelimit(rs)) + return; + + arm_smmu_dump_raw_event(smmu, raw, evt); + + switch (evt->id) { + case EVT_ID_TRANSLATION_FAULT: + case EVT_ID_ADDR_SIZE_FAULT: + case EVT_ID_ACCESS_FAULT: + case EVT_ID_PERMISSION_FAULT: + dev_err(smmu->dev, "event: %s client: %s sid: %#x ssid: %#x iova: %#llx ipa: %#llx", + ARM_SMMU_LOG_EVT_STR(evt), ARM_SMMU_LOG_CLIENT(evt), + evt->sid, evt->ssid, evt->iova, evt->ipa); + + dev_err(smmu->dev, "%s %s %s %s \"%s\"%s%s stag: %#x", + evt->privileged ? "priv" : "unpriv", + evt->instruction ? "inst" : "data", + evt->read ? "read" : "write", + evt->s2 ? "s2" : "s1", event_class_str[evt->class], + evt->class_tt ? (evt->ttrnw ? " ttd_read" : " ttd_write") : "", + evt->stall ? " stall" : "", evt->stag); + + break; + + case EVT_ID_STE_FETCH_FAULT: + case EVT_ID_CD_FETCH_FAULT: + case EVT_ID_VMS_FETCH_FAULT: + dev_err(smmu->dev, "event: %s client: %s sid: %#x ssid: %#x fetch_addr: %#llx", + ARM_SMMU_LOG_EVT_STR(evt), ARM_SMMU_LOG_CLIENT(evt), + evt->sid, evt->ssid, evt->fetch_addr); + + break; + + default: + dev_err(smmu->dev, "event: %s client: %s sid: %#x ssid: %#x", + ARM_SMMU_LOG_EVT_STR(evt), ARM_SMMU_LOG_CLIENT(evt), + evt->sid, evt->ssid); + } +} + static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) { - int i, ret; u64 evt[EVTQ_ENT_DWORDS]; struct arm_smmu_event event = {0}; struct arm_smmu_device *smmu = dev; @@ -1849,16 +1944,11 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) do { while (!queue_remove_raw(q, evt)) { - arm_smmu_decode_event(evt, &event); - ret = arm_smmu_handle_evt(smmu, &event); - if (!ret || !__ratelimit(&rs)) - continue; - - dev_info(smmu->dev, "event 0x%02x received:\n", event.id); - for (i = 0; i < EVTQ_ENT_DWORDS; ++i) - dev_info(smmu->dev, "\t0x%016llx\n", - (unsigned long long)evt[i]); + arm_smmu_decode_event(smmu, evt, &event); + if (arm_smmu_handle_event(smmu, &event)) + arm_smmu_dump_event(smmu, evt, &event, &rs); + put_device(event.dev); cond_resched(); } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 1478049dea4b..2d073e0d0a81 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -452,10 +452,18 @@ static inline unsigned int arm_smmu_cdtab_l2_idx(unsigned int ssid) #define EVTQ_0_ID GENMASK_ULL(7, 0) +#define EVT_ID_BAD_STREAMID_CONFIG 0x02 +#define EVT_ID_STE_FETCH_FAULT 0x03 +#define EVT_ID_BAD_STE_CONFIG 0x04 +#define EVT_ID_STREAM_DISABLED_FAULT 0x06 +#define 
EVT_ID_BAD_SUBSTREAMID_CONFIG 0x08 +#define EVT_ID_CD_FETCH_FAULT 0x09 +#define EVT_ID_BAD_CD_CONFIG 0x0a #define EVT_ID_TRANSLATION_FAULT 0x10 #define EVT_ID_ADDR_SIZE_FAULT 0x11 #define EVT_ID_ACCESS_FAULT 0x12 #define EVT_ID_PERMISSION_FAULT 0x13 +#define EVT_ID_VMS_FETCH_FAULT 0x25 #define EVTQ_0_SSV (1UL << 11) #define EVTQ_0_SSID GENMASK_ULL(31, 12) @@ -467,6 +475,7 @@ static inline unsigned int arm_smmu_cdtab_l2_idx(unsigned int ssid) #define EVTQ_1_RnW (1UL << 35) #define EVTQ_1_S2 (1UL << 39) #define EVTQ_1_CLASS GENMASK_ULL(41, 40) +#define EVTQ_1_CLASS_TT 0x01 #define EVTQ_1_TT_READ (1UL << 44) #define EVTQ_2_ADDR GENMASK_ULL(63, 0) #define EVTQ_3_IPA GENMASK_ULL(51, 12) @@ -796,7 +805,9 @@ struct arm_smmu_event { privileged : 1, instruction : 1, s2 : 1, - read : 1; + read : 1, + ttrnw : 1, + class_tt : 1; u8 id; u8 class; u16 stag; @@ -805,6 +816,7 @@ struct arm_smmu_event { u64 iova; u64 ipa; u64 fetch_addr; + struct device *dev; }; /* SMMU private data for each master */ -- Gitee From 672730086d4890b57dfdf77075839e02a6631466 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 5 Dec 2024 16:33:55 +0000 Subject: [PATCH 154/278] iommu/arm-smmu: Make instance lookup robust ANBZ: #13617 commit 7d835134d4e13e9c30509fd24a42f8c2b94135ea upstream. Relying on the driver list was a cute idea for minimising the scope of our SMMU device lookups, however it turns out to have a subtle flaw. The SMMU device only gets added to that list after arm_smmu_device_probe() returns success, so there's actually no way the iommu_device_register() call from there could ever work as intended, even if it wasn't already hampered by the fwspec setup not happening early enough. Switch both arm_smmu_get_by_fwnode() implementations to use a platform bus lookup instead, which *will* reliably work. Also make sure that we don't register SMMUv2 instances until we've fully initialised them, to avoid similar consequences of the lookup now finding a device with no drvdata. Moving the error returns is also a perfect excuse to streamline them with dev_err_probe() in the process. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/6d7ce1dc31873abdb75c895fb8bd2097cce098b4.1733406914.git.robin.murphy@arm.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 4 +-- drivers/iommu/arm/arm-smmu/arm-smmu.c | 29 +++++++++------------ 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index e71a5cb6cf0d..7aae95cdb4a6 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3343,8 +3343,8 @@ static struct platform_driver arm_smmu_driver; static struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode) { - struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver, - fwnode); + struct device *dev = bus_find_device_by_fwnode(&platform_bus_type, fwnode); + put_device(dev); return dev ? 
dev_get_drvdata(dev) : NULL; } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index ed60224e5d88..098336f70171 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1416,8 +1416,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) static struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode) { - struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver, - fwnode); + struct device *dev = bus_find_device_by_fwnode(&platform_bus_type, fwnode); + put_device(dev); return dev ? dev_get_drvdata(dev) : NULL; } @@ -2250,29 +2250,26 @@ static int arm_smmu_device_probe(struct platform_device *pdev) i, irq); } + platform_set_drvdata(pdev, smmu); + + /* Check for RMRs and install bypass SMRs if any */ + arm_smmu_rmr_install_bypass_smr(smmu); + + arm_smmu_device_reset(smmu); + arm_smmu_test_smr_masks(smmu); + err = iommu_device_sysfs_add(&smmu->iommu, smmu->dev, NULL, "smmu.%pa", &smmu->ioaddr); - if (err) { - dev_err(dev, "Failed to register iommu in sysfs\n"); - return err; - } + if (err) + return dev_err_probe(dev, err, "Failed to register iommu in sysfs\n"); err = iommu_device_register(&smmu->iommu, &arm_smmu_ops, using_legacy_binding ? NULL : dev); if (err) { - dev_err(dev, "Failed to register iommu\n"); iommu_device_sysfs_remove(&smmu->iommu); - return err; + return dev_err_probe(dev, err, "Failed to register iommu\n"); } - platform_set_drvdata(pdev, smmu); - - /* Check for RMRs and install bypass SMRs if any */ - arm_smmu_rmr_install_bypass_smr(smmu); - - arm_smmu_device_reset(smmu); - arm_smmu_test_smr_masks(smmu); - /* * We want to avoid touching dev->power.lock in fastpaths unless * it's really going to do something useful - pm_runtime_enabled() -- Gitee From c53f0f2d51284276dbd98ef6fd1d3bb738ac2189 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 5 Dec 2024 16:33:56 +0000 Subject: [PATCH 155/278] iommu/arm-smmu: Retire probe deferral workaround ANBZ: #13617 commit 97cb1fa0272646c2a033b05338bb8e0260879968 upstream. This reverts commit 229e6ee43d2a160a1592b83aad620d6027084aad. Now that the fundamental ordering issue between arm_smmu_get_by_fwnode() and iommu_device_register() is resolved, the race condition for client probe no longer exists either, so retire the specific workaround. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/4167c5dfa052d4c8bb780f0a30af63dcfc4ce6c1.1733406914.git.robin.murphy@arm.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 098336f70171..9c6d3921a58a 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1442,17 +1442,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) goto out_free; } else { smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode); - - /* - * Defer probe if the relevant SMMU instance hasn't finished - * probing yet. This is a fragile hack and we'd ideally - * avoid this race in the core code. Until that's ironed - * out, however, this is the most pragmatic option on the - * table. 
- */ - if (!smmu) - return ERR_PTR(dev_err_probe(dev, -EPROBE_DEFER, - "smmu dev has not bound yet\n")); } #ifdef CONFIG_ARCH_PHYTIUM -- Gitee From af641f7fdddece65b32fb401aa6b0511fd2e93df Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 5 Dec 2024 16:33:58 +0000 Subject: [PATCH 156/278] iommu: Manage driver probe deferral better ANBZ: #13617 commit 46b3df8eb9bd035620bc48bd7a1f028490626621 upstream. Since iommu_fwspec_init() absorbed the basic driver probe deferral check to wait for an IOMMU to register, we may as well handle the probe deferral timeout there as well. The current inconsistency of callers results in client devices deferring forever on an arm64 ACPI system where an SMMU has failed its own driver probe. Acked-by: Will Deacon Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/41fa59f156ef8d196d08fa75c4901e6d4b12e6c4.1733406914.git.robin.murphy@arm.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 2 +- drivers/iommu/of_iommu.c | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index c407b03f90b3..00ede527e568 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2860,7 +2860,7 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode) struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); if (!ops) - return -EPROBE_DEFER; + return driver_deferred_probe_check_state(dev); if (fwspec) return ops == iommu_fwspec_ops(fwspec) ? 0 : -EINVAL; diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index e7a6a1611d19..97987cd78da9 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -29,8 +29,6 @@ static int of_iommu_xlate(struct device *dev, return -ENODEV; ret = iommu_fwspec_init(dev, of_fwnode_handle(iommu_spec->np)); - if (ret == -EPROBE_DEFER) - return driver_deferred_probe_check_state(dev); if (ret) return ret; -- Gitee From 37a193bc5d912c68b348a29de47c8c81a4b35e92 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 3 Dec 2024 00:02:55 -0800 Subject: [PATCH 157/278] iommufd/selftest: Cover IOMMU_FAULT_QUEUE_ALLOC in iommufd_fail_nth ANBZ: #13617 commit a8c9df25f90e5155b70f5e2474c7299841e3335a upstream. This was missing in the series introducing the fault object. Thus, add it. 
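As a rough illustration of the uAPI sequence this selftest now exercises under fail-injection (a sketch only, with struct fields abbreviated from include/uapi/linux/iommufd.h, not the selftest code itself):

	struct iommu_fault_alloc fault_cmd = { .size = sizeof(fault_cmd) };
	struct iommu_hwpt_alloc hwpt_cmd = { .size = sizeof(hwpt_cmd) };

	/* 1) Allocate the fault queue object; the kernel returns an ID and an FD. */
	if (ioctl(iommufd, IOMMU_FAULT_QUEUE_ALLOC, &fault_cmd))
		return -1;

	/* 2) Reference the fault object when allocating a HWPT so that IO page
	 *    faults on that page table can be delivered to userspace. */
	hwpt_cmd.flags = IOMMU_HWPT_FAULT_ID_VALID;
	hwpt_cmd.fault_id = fault_cmd.out_fault_id;
	/* dev_id, pt_id, data_type and data are filled in as for any nested HWPT. */
	if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &hwpt_cmd))
		return -1;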
Link: https://patch.msgid.link/r/d61b9b7f73276cc8f1aef9602bd35c486917506e.1733212723.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- tools/testing/selftests/iommu/iommufd_fail_nth.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 22f6fd5f0f74..64b1f8e1b0cf 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -615,7 +615,12 @@ TEST_FAIL_NTH(basic_fail_nth, access_pin_domain) /* device.c */ TEST_FAIL_NTH(basic_fail_nth, device) { + struct iommu_hwpt_selftest data = { + .iotlb = IOMMU_TEST_IOTLB_DEFAULT, + }; struct iommu_test_hw_info info; + uint32_t fault_id, fault_fd; + uint32_t fault_hwpt_id; uint32_t ioas_id; uint32_t ioas_id2; uint32_t stdev_id; @@ -678,6 +683,15 @@ TEST_FAIL_NTH(basic_fail_nth, device) if (_test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, 0, &vdev_id)) return -1; + if (_test_ioctl_fault_alloc(self->fd, &fault_id, &fault_fd)) + return -1; + close(fault_fd); + + if (_test_cmd_hwpt_alloc(self->fd, idev_id, hwpt_id, fault_id, + IOMMU_HWPT_FAULT_ID_VALID, &fault_hwpt_id, + IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data))) + return -1; + return 0; } -- Gitee From d05eb5802ab10e01849e86c4ef75f9fff66ab56b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Nov 2024 14:51:36 -0400 Subject: [PATCH 158/278] iommu/arm-smmu-v3: Improve uAPI comment for IOMMU_HW_INFO_TYPE_ARM_SMMUV3 ANBZ: #13617 commit 2ca704f55e22b7b00cc7025953091af3c82fa5c0 upstream. Be specific about what fields should be accessed in the idr result and give other guidance to the VMM on how it should generate the vIDR. Discussion on the list, and review of the qemu implementation understood this needs to be clearer and more detailed. Link: https://patch.msgid.link/r/0-v1-191e5e24cec3+3b0-iommufd_smmuv3_hwinf_jgg@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- include/uapi/linux/iommufd.h | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 059b6537f2b7..b5197e317415 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -558,16 +558,25 @@ struct iommu_hw_info_vtd { * For the details of @idr, @iidr and @aidr, please refer to the chapters * from 6.3.1 to 6.3.6 in the SMMUv3 Spec. * - * User space should read the underlying ARM SMMUv3 hardware information for - * the list of supported features. - * - * Note that these values reflect the raw HW capability, without any insight if - * any required kernel driver support is present. Bits may be set indicating the - * HW has functionality that is lacking kernel software support, such as BTM. If - * a VMM is using this information to construct emulated copies of these - * registers it should only forward bits that it knows it can support. - * - * In future, presence of required kernel support will be indicated in flags. + * This reports the raw HW capability, and not all bits are meaningful to be + * read by userspace. 
Only the following fields should be used: + * + * idr[0]: ST_LEVEL, TERM_MODEL, STALL_MODEL, TTENDIAN , CD2L, ASID16, TTF + * idr[1]: SIDSIZE, SSIDSIZE + * idr[3]: BBML, RIL + * idr[5]: VAX, GRAN64K, GRAN16K, GRAN4K + * + * - S1P should be assumed to be true if a NESTED HWPT can be created + * - VFIO/iommufd only support platforms with COHACC, it should be assumed to be + * true. + * - ATS is a per-device property. If the VMM describes any devices as ATS + * capable in ACPI/DT it should set the corresponding idr. + * + * This list may expand in future (eg E0PD, AIE, PBHA, D128, DS etc). It is + * important that VMMs do not read bits outside the list to allow for + * compatibility with future kernels. Several features in the SMMUv3 + * architecture are not currently supported by the kernel for nesting: HTTU, + * BTM, MPAM and others. */ struct iommu_hw_info_arm_smmuv3 { __u32 flags; -- Gitee From 95022421b9bdfd975bf155801a5575d544858791 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Sat, 9 Dec 2023 01:02:49 +0000 Subject: [PATCH 159/278] arm64/sysreg: add system register POR_EL{0,1} ANBZ: #13617 commit c0c5a8ea96b877e903b40ed2345e73f83b0ed612 upstream. Add POR_EL{0,1} according to DDI0601 2023-03. Signed-off-by: Joey Gouly Reviewed-by: Mark Brown Acked-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-3-45284e538474@kernel.org Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- arch/arm64/include/asm/sysreg.h | 13 +++++++++++++ arch/arm64/tools/sysreg | 12 ++++++++++++ 2 files changed, 25 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 94633246d311..13b9b000ed42 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1002,6 +1002,19 @@ #define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4)) +/* + * Permission Overlay Extension (POE) permission encodings. + */ +#define POE_NONE UL(0x0) +#define POE_R UL(0x1) +#define POE_X UL(0x2) +#define POE_RX UL(0x3) +#define POE_W UL(0x4) +#define POE_RW UL(0x5) +#define POE_XW UL(0x6) +#define POE_RXW UL(0x7) +#define POE_MASK UL(0xf) + #define ARM64_FEATURE_FIELD_BITS 4 /* Defined for compatibility only, do not add new users. */ diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 0e7d7f327410..10e82b882529 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2504,6 +2504,18 @@ Sysreg PIR_EL2 3 4 10 2 3 Fields PIRx_ELx EndSysreg +Sysreg POR_EL0 3 3 10 2 4 +Fields PIRx_ELx +EndSysreg + +Sysreg POR_EL1 3 0 10 2 4 +Fields PIRx_ELx +EndSysreg + +Sysreg POR_EL12 3 5 10 2 4 +Fields PIRx_ELx +EndSysreg + Sysreg LORSA_EL1 3 0 10 4 0 Res0 63:52 Field 51:16 SA -- Gitee From 7669d5d5b87e954edfbfac124a32f696c4b67b55 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:48 +0100 Subject: [PATCH 160/278] arm64: cpufeature: add Permission Overlay Extension cpucap ANBZ: #13617 commit 3496f69391eee225244f0d3f0404142a80b710f5 upstream This indicates if the system supports POE. This is a CPUCAP_BOOT_CPU_FEATURE as the boot CPU will enable POE if it has it, so secondary CPUs must also have this feature. 
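As a rough sketch of how a boot-CPU capability like this is typically consumed at runtime (illustrative names only; the subsequent patches in this series add the real system_supports_poe() helper and its users):

	/* Illustrative helper: compiles away entirely when CONFIG_ARM64_POE=n. */
	static inline bool poe_available(void)
	{
		return IS_ENABLED(CONFIG_ARM64_POE) &&
		       alternative_has_cap_unlikely(ARM64_HAS_S1POE);
	}

Because the capability is ARM64_CPUCAP_BOOT_CPU_FEATURE, a secondary CPU lacking S1POE is rejected at bring-up, so callers of such a helper never need per-CPU checks.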
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Acked-by: Catalin Marinas Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240822151113.1479789-6-joey.gouly@arm.com Signed-off-by: Will Deacon Conflicts: arch/arm64/kernel/cpufeature.c --- arch/arm64/kernel/cpufeature.c | 9 +++++++++ arch/arm64/tools/cpucaps | 1 + 2 files changed, 10 insertions(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 66faa0b57f12..0a7767b4d247 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2743,6 +2743,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, MPAM, 1) }, +#endif +#ifdef CONFIG_ARM64_POE + { + .desc = "Stage-1 Permission Overlay Extension (S1POE)", + .capability = ARM64_HAS_S1POE, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) + }, #endif {}, }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index bbb901027817..209facbec446 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -43,6 +43,7 @@ HAS_NO_FPSIMD HAS_NO_HW_PREFETCH HAS_PAN HAS_S1PIE +HAS_S1POE HAS_RAS_EXTN HAS_RNG HAS_SB -- Gitee From a7d2dce9c2e8411eaf5afc2a337c229e7cb57b49 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:49 +0100 Subject: [PATCH 161/278] arm64: context switch POR_EL0 register ANBZ: #13617 commit 160a8e13de6c36270e8c6537b8a944f4e73d2362 upstream. POR_EL0 is a register that can be modified by userspace directly, so it must be context switched. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20240822151113.1479789-7-joey.gouly@arm.com [will: Dropped unnecessary isb()s] Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- arch/arm64/include/asm/cpufeature.h | 6 ++++++ arch/arm64/include/asm/processor.h | 1 + arch/arm64/include/asm/sysreg.h | 3 +++ arch/arm64/kernel/process.c | 24 ++++++++++++++++++++++++ 4 files changed, 34 insertions(+) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index aebd82fa854f..d86a10521c20 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -838,6 +838,12 @@ static inline bool system_supports_tlb_range(void) cpus_have_const_cap(ARM64_HAS_TLB_RANGE); } +static inline bool system_supports_poe(void) +{ + return IS_ENABLED(CONFIG_ARM64_POE) && + alternative_has_cap_unlikely(ARM64_HAS_S1POE); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index e5bc54522e71..b3ad719c2d0c 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -179,6 +179,7 @@ struct thread_struct { u64 sctlr_user; u64 svcr; u64 tpidr2_el0; + u64 por_el0; }; static inline unsigned int thread_get_vl(struct thread_struct *thread, diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 13b9b000ed42..c003dbe3fa27 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1015,6 +1015,9 @@ #define POE_RXW UL(0x7) #define POE_MASK UL(0xf) +/* Initial value for Permission Overlay Extension for EL0 */ +#define POR_EL0_INIT POE_RXW + #define ARM64_FEATURE_FIELD_BITS 4 /* Defined for compatibility only, do not add new users. 
*/ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index f29e0cca53ef..c3b83023851f 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -272,12 +272,21 @@ static void flush_tagged_addr_state(void) clear_thread_flag(TIF_TAGGED_ADDR); } +static void flush_poe(void) +{ + if (!system_supports_poe()) + return; + + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); +} + void flush_thread(void) { fpsimd_flush_thread(); tls_thread_flush(); flush_ptrace_hw_breakpoint(current); flush_tagged_addr_state(); + flush_poe(); } void arch_release_task_struct(struct task_struct *tsk) @@ -375,6 +384,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (system_supports_tpidr2()) p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + if (system_supports_poe()) + p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + if (stack_start) { if (is_compat_thread(task_thread_info(p))) childregs->compat_sp = stack_start; @@ -499,6 +511,17 @@ static void erratum_1418040_new_exec(void) preempt_enable(); } +static void permission_overlay_switch(struct task_struct *next) +{ + if (!system_supports_poe()) + return; + + current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + if (current->thread.por_el0 != next->thread.por_el0) { + write_sysreg_s(next->thread.por_el0, SYS_POR_EL0); + } +} + /* * __switch_to() checks current->thread.sctlr_user as an optimisation. Therefore * this function must be called with preemption disabled and the update to @@ -534,6 +557,7 @@ struct task_struct *__switch_to(struct task_struct *prev, ssbs_thread_switch(next); erratum_1418040_thread_switch(next); ptrauth_thread_switch_user(next); + permission_overlay_switch(next); /* * Complete any pending TLB or cache maintenance on this CPU in case the -- Gitee From 18ed6f02571ff02cea9d04610d8c78c2d3621ec2 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:54 +0100 Subject: [PATCH 162/278] arm64: enable the Permission Overlay Extension for EL0 ANBZ: #13617 commit bf83dae90fbc01d66477a3440eaad07da6657fdc upstream Expose a HWCAP and ID_AA64MMFR3_EL1_S1POE to userspace, so they can be used to check if the CPU supports the feature. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240822151113.1479789-12-joey.gouly@arm.com Signed-off-by: Will Deacon [ Jay Chen: fix rebase conflicts in arch/arm64/kernel/cpufeature.c ] Signed-off-by: Jay Chen Conflicts: Documentation/arch/arm64/elf_hwcaps.rst arch/arm64/include/asm/hwcap.h arch/arm64/include/uapi/asm/hwcap.h arch/arm64/kernel/cpufeature.c arch/arm64/kernel/cpuinfo.c --- Documentation/arch/arm64/elf_hwcaps.rst | 3 +++ arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 1 + arch/arm64/kernel/cpufeature.c | 14 ++++++++++++++ arch/arm64/kernel/cpuinfo.c | 1 + 5 files changed, 20 insertions(+) diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index f88a24d621dd..ab5fdf95a51f 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -320,6 +320,9 @@ HWCAP2_MOPS HWCAP2_HBC Functionality implied by ID_AA64ISAR2_EL1.BC == 0b0001. +HWCAP2_POE + Functionality implied by ID_AA64MMFR3_EL1.S1POE == 0b0001. + 4. 
Unused AT_HWCAP bits ----------------------- diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 521267478d18..196f21b7d11b 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -139,6 +139,7 @@ #define KERNEL_HWCAP_SME_F16F16 __khwcap2_feature(SME_F16F16) #define KERNEL_HWCAP_MOPS __khwcap2_feature(MOPS) #define KERNEL_HWCAP_HBC __khwcap2_feature(HBC) +#define KERNEL_HWCAP_POE __khwcap2_feature(POE) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 53026f45a509..26981476131c 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -104,5 +104,6 @@ #define HWCAP2_SME_F16F16 (1UL << 42) #define HWCAP2_MOPS (1UL << 43) #define HWCAP2_HBC (1UL << 44) +#define HWCAP2_POE (1UL << 63) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 0a7767b4d247..a74f54e12476 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -403,6 +403,8 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE), + FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1POE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0), ARM64_FTR_END, @@ -2236,6 +2238,14 @@ static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused) sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_MSCEn); } +#ifdef CONFIG_ARM64_POE +static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1x_E0POE); + sysreg_clear_set(CPACR_EL1, 0, CPACR_ELx_E0POE); +} +#endif + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -2750,6 +2760,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_S1POE, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_poe, ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) }, #endif @@ -2920,6 +2931,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32), HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), #endif /* CONFIG_ARM64_SME */ +#ifdef CONFIG_ARM64_POE + HWCAP_CAP(ID_AA64MMFR3_EL1, S1POE, IMP, CAP_HWCAP, KERNEL_HWCAP_POE), +#endif {}, }; diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 00805a3bdf85..2e6abfd196d1 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -132,6 +132,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SME_F16F16] = "smef16f16", [KERNEL_HWCAP_MOPS] = "mops", [KERNEL_HWCAP_HBC] = "hbc", + [KERNEL_HWCAP_POE] = "poe", }; #ifdef CONFIG_COMPAT -- Gitee From a3208eaef44bd7d4ae3e2ecae94faf12f3b57efc Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:51 +0100 Subject: [PATCH 163/278] arm64/cpufeature: Runtime detection of Guarded Control Stack (GCS) ANBZ: #13617 commit 6487c963083c24ede289d4267ffa60a9db668cd4 upstream. 
Add a cpufeature for GCS, allowing other code to conditionally support it at runtime. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-12-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas Conflicts: arch/arm64/tools/cpucaps --- arch/arm64/include/asm/cpufeature.h | 6 ++++++ arch/arm64/kernel/cpufeature.c | 20 ++++++++++++++++++++ arch/arm64/tools/cpucaps | 1 + 3 files changed, 27 insertions(+) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index d86a10521c20..7a75fa43b113 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -844,6 +844,12 @@ static inline bool system_supports_poe(void) alternative_has_cap_unlikely(ARM64_HAS_S1POE); } +static inline bool system_supports_gcs(void) +{ + return IS_ENABLED(CONFIG_ARM64_GCS) && + alternative_has_cap_unlikely(ARM64_HAS_GCS); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index a74f54e12476..66edc2cbeb4d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -257,6 +257,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), @@ -2246,6 +2248,14 @@ static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) } #endif +#ifdef CONFIG_ARM64_GCS +static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused) +{ + /* GCSPR_EL0 is always readable */ + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); +} +#endif + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -2763,6 +2773,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_poe, ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) }, +#endif +#ifdef CONFIG_ARM64_GCS + { + .desc = "Guarded Control Stack (GCS)", + .capability = ARM64_HAS_GCS, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .cpu_enable = cpu_enable_gcs, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP) + }, #endif {}, }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 209facbec446..a3a48dd380f3 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -27,6 +27,7 @@ HAS_ECV_CNTPOFF HAS_EPAN HAS_EVT HAS_FGT +HAS_GCS HAS_GENERIC_AUTH HAS_GENERIC_AUTH_ARCH_QARMA3 HAS_GENERIC_AUTH_ARCH_QARMA5 -- Gitee From b4a8e4304158e830adcace847b7967edc32f657f Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 5 Dec 2024 13:48:09 +0000 Subject: [PATCH 164/278] iommu/arm-smmu-v3: Document SVA interaction with new pagetable features ANBZ: #13617 commit 6e192214c6c82c2f52238d5e1865f11594e58a6f upstream. Process pagetables may now be using new permission-indirection-based features which an SMMU may not understand when given such a table for SVA. 
Although SMMUv3.4 does add its own S1PIE feature, realistically we're still going to have to cope with feature mismatches between CPUs and SMMUs, so let's start simple and essentially just document the expectations for what falls out as-is. Although it seems unlikely for SVA applications to also depend on memory-hardening features, or vice-versa, the relative lifecycles make it tricky to enforce mutual exclusivity. Thankfully our PIE index allocation makes it relatively benign for an SMMU to keep interpreting them as direct permissions, the only real implication is that an SVA application cannot harden itself against its own devices with these features. Thus, inform the user about that just in case they have other expectations. Also we don't (yet) support LPA2, so deny SVA entirely if we're going to misunderstand the pagetable format altogether. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/68a37b00a720f0827cac0e4f40e4d3a688924054.1733406275.git.robin.murphy@arm.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 1d3e71569775..9ba596430e7c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -112,6 +112,15 @@ void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, * from the current CPU register */ target->data[3] = cpu_to_le64(read_sysreg(mair_el1)); + + /* + * Note that we don't bother with S1PIE on the SMMU, we just rely on + * our default encoding scheme matching direct permissions anyway. + * SMMU has no notion of S1POE nor GCS, so make sure that is clear if + * either is enabled for CPUs, just in case anyone imagines otherwise. + */ + if (system_supports_poe() || system_supports_gcs()) + dev_warn_once(master->smmu->dev, "SVA devices ignore permission overlays and GCS\n"); } EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_sva_cd); @@ -206,8 +215,12 @@ bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) unsigned long asid_bits; u32 feat_mask = ARM_SMMU_FEAT_COHERENCY; - if (vabits_actual == 52) + if (vabits_actual == 52) { + /* We don't support LPA2 */ + if (PAGE_SIZE != SZ_64K) + return false; feat_mask |= ARM_SMMU_FEAT_VAX; + } if ((smmu->features & feat_mask) != feat_mask) return false; -- Gitee From e0268bf4918871aca0351ad3a18dd912db325d81 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Dec 2024 11:43:27 -0400 Subject: [PATCH 165/278] iommu/arm-smmu-v3: Remove arm_smmu_domain_finalise() during attach ANBZ: #13617 commit 48e7b8e284e5be9fd1b54b60246bcbe9711d43e4 upstream. Domains are now always finalized during allocation because the core code no longer permits a NULL dev argument to domain_alloc_paging/_flags(). Remove the late finalize during attach that supported domains that were not fully initialized. 
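With finalisation moved to allocation time, the attach-time handling collapses to an ownership check, roughly (a sketch of the resulting shape; the exact code is in the diff below):

	/* The domain already records its owning SMMU after allocation. */
	if (smmu_domain->smmu != master->smmu)
		return -EINVAL;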
Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v1-0bb8d5313a27+27b-smmuv3_paging_flags_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 37 +++++---------------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 - 2 files changed, 9 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 7aae95cdb4a6..ac2ca54d5675 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2460,7 +2460,6 @@ struct arm_smmu_domain *arm_smmu_domain_alloc(void) if (!smmu_domain) return ERR_PTR(-ENOMEM); - mutex_init(&smmu_domain->init_mutex); INIT_LIST_HEAD(&smmu_domain->devices); spin_lock_init(&smmu_domain->devices_lock); @@ -2469,7 +2468,9 @@ struct arm_smmu_domain *arm_smmu_domain_alloc(void) static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) { + struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_domain *smmu_domain; + int ret; /* * Allocate the domain and initialise some of its data structures. @@ -2480,15 +2481,10 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) if (IS_ERR(smmu_domain)) return ERR_CAST(smmu_domain); - if (dev) { - struct arm_smmu_master *master = dev_iommu_priv_get(dev); - int ret; - - ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, 0); - if (ret) { - kfree(smmu_domain); - return ERR_PTR(ret); - } + ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, 0); + if (ret) { + kfree(smmu_domain); + return ERR_PTR(ret); } return &smmu_domain->domain; } @@ -2960,15 +2956,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) state.master = master = dev_iommu_priv_get(dev); smmu = master->smmu; - mutex_lock(&smmu_domain->init_mutex); - - if (!smmu_domain->smmu) { - ret = arm_smmu_domain_finalise(smmu_domain, smmu, 0); - } else if (smmu_domain->smmu != smmu) - ret = -EINVAL; - - mutex_unlock(&smmu_domain->init_mutex); - if (ret) + if (smmu_domain->smmu != smmu) return ret; if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { @@ -3025,16 +3013,9 @@ static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain, struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_cd target_cd; - int ret = 0; - mutex_lock(&smmu_domain->init_mutex); - if (!smmu_domain->smmu) - ret = arm_smmu_domain_finalise(smmu_domain, smmu, 0); - else if (smmu_domain->smmu != smmu) - ret = -EINVAL; - mutex_unlock(&smmu_domain->init_mutex); - if (ret) - return ret; + if (smmu_domain->smmu != smmu) + return -EINVAL; if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) return -EINVAL; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 2d073e0d0a81..8bbabd5b7dcd 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -843,7 +843,6 @@ enum arm_smmu_domain_stage { struct arm_smmu_domain { struct arm_smmu_device *smmu; - struct mutex init_mutex; /* Protects smmu pointer */ struct io_pgtable_ops *pgtbl_ops; atomic_t nr_ats_masters; -- Gitee From 493397f47f0494254401dbaeefea5db0d52a4a80 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Dec 2024 11:43:28 -0400 Subject: [PATCH 166/278] iommu/arm-smmu-v3: Make domain_alloc_paging_flags() directly determine the S1/S2 ANBZ: #13617 commit bb857c5c015033026d82d404061b26bbb37c821d 
upstream. The selection of S1/S2 is a bit indirect today, make domain_alloc_paging_flags() directly decode the flags and select the correct S1/S2 type. Directly reject flag combinations the HW doesn't support when processing the flags. Fix missing rejection of some flag combinations that are not supported today (ie NEST_PARENT | DIRTY_TRACKING) by using a switch statement to list out exactly the combinations that are currently supported. Move the determination of the stage out of arm_smmu_domain_finalise() and into both callers. As today the default stage is S1 if supported in HW. This makes arm_smmu_domain_alloc_paging_flags() self contained and no longer calling arm_smmu_domain_alloc_paging(). Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v1-0bb8d5313a27+27b-smmuv3_paging_flags_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 42 +++++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ac2ca54d5675..2bcf546d4e02 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2481,6 +2481,11 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) if (IS_ERR(smmu_domain)) return ERR_CAST(smmu_domain); + if (master->smmu->features & ARM_SMMU_FEAT_TRANS_S1) + smmu_domain->stage = ARM_SMMU_DOMAIN_S1; + else + smmu_domain->stage = ARM_SMMU_DOMAIN_S2; + ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, 0); if (ret) { kfree(smmu_domain); @@ -2554,12 +2559,6 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, struct arm_smmu_domain *smmu_domain); bool enable_dirty = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - /* Restrict the stage to what we can actually support */ - if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1)) - smmu_domain->stage = ARM_SMMU_DOMAIN_S2; - if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S2)) - smmu_domain->stage = ARM_SMMU_DOMAIN_S1; - pgtbl_cfg = (struct io_pgtable_cfg) { .pgsize_bitmap = smmu->pgsize_bitmap, .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY, @@ -3224,6 +3223,7 @@ arm_smmu_domain_alloc_paging_flags(struct device *dev, u32 flags, const struct iommu_user_data *user_data) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_device *smmu = master->smmu; const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_NEST_PARENT; struct arm_smmu_domain *smmu_domain; @@ -3234,25 +3234,43 @@ arm_smmu_domain_alloc_paging_flags(struct device *dev, u32 flags, if (user_data) return ERR_PTR(-EOPNOTSUPP); - if (flags & IOMMU_HWPT_ALLOC_PASID) - return arm_smmu_domain_alloc_paging(dev); - smmu_domain = arm_smmu_domain_alloc(); if (IS_ERR(smmu_domain)) return ERR_CAST(smmu_domain); - if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) { - if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING)) { + switch (flags) { + case 0: + /* Prefer S1 if available */ + if (smmu->features & ARM_SMMU_FEAT_TRANS_S1) + smmu_domain->stage = ARM_SMMU_DOMAIN_S1; + else + smmu_domain->stage = ARM_SMMU_DOMAIN_S2; + break; + case IOMMU_HWPT_ALLOC_NEST_PARENT: + if (!(smmu->features & ARM_SMMU_FEAT_NESTING)) { ret = -EOPNOTSUPP; goto err_free; } smmu_domain->stage = ARM_SMMU_DOMAIN_S2; smmu_domain->nest_parent = true; + break; + case IOMMU_HWPT_ALLOC_DIRTY_TRACKING: + case IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_PASID: + case IOMMU_HWPT_ALLOC_PASID: + if 
(!(smmu->features & ARM_SMMU_FEAT_TRANS_S1)) { + ret = -EOPNOTSUPP; + goto err_free; + } + smmu_domain->stage = ARM_SMMU_DOMAIN_S1; + break; + default: + ret = -EOPNOTSUPP; + goto err_free; } smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED; smmu_domain->domain.ops = arm_smmu_ops.default_domain_ops; - ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, flags); + ret = arm_smmu_domain_finalise(smmu_domain, smmu, flags); if (ret) goto err_free; return &smmu_domain->domain; -- Gitee From 89448777d001d093f8c79303b5e3188801c7f1d7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Dec 2024 11:43:29 -0400 Subject: [PATCH 167/278] iommu/arm-smmu-v3: Remove domain_alloc_paging() ANBZ: #13617 commit cdfb9840fcc60b6e493aec077b1eecaa3268640b upstream. arm_smmu_domain_alloc_paging_flags() with a flags = 0 now does the same thing as arm_smmu_domain_alloc_paging(), remove arm_smmu_domain_alloc_paging(). Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v1-0bb8d5313a27+27b-smmuv3_paging_flags_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 31 --------------------- 1 file changed, 31 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 2bcf546d4e02..d2d74fd2df61 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -105,8 +105,6 @@ static const char * const event_class_str[] = { [3] = "Reserved", }; -static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, - struct arm_smmu_device *smmu, u32 flags); static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master); static void parse_driver_options(struct arm_smmu_device *smmu) @@ -2466,34 +2464,6 @@ struct arm_smmu_domain *arm_smmu_domain_alloc(void) return smmu_domain; } -static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) -{ - struct arm_smmu_master *master = dev_iommu_priv_get(dev); - struct arm_smmu_domain *smmu_domain; - int ret; - - /* - * Allocate the domain and initialise some of its data structures. - * We can't really do anything meaningful until we've added a - * master. - */ - smmu_domain = arm_smmu_domain_alloc(); - if (IS_ERR(smmu_domain)) - return ERR_CAST(smmu_domain); - - if (master->smmu->features & ARM_SMMU_FEAT_TRANS_S1) - smmu_domain->stage = ARM_SMMU_DOMAIN_S1; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_S2; - - ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, 0); - if (ret) { - kfree(smmu_domain); - return ERR_PTR(ret); - } - return &smmu_domain->domain; -} - static void arm_smmu_domain_free_paging(struct iommu_domain *domain) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); @@ -3659,7 +3629,6 @@ static struct iommu_ops arm_smmu_ops = { .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, .hw_info = arm_smmu_hw_info, - .domain_alloc_paging = arm_smmu_domain_alloc_paging, .domain_alloc_sva = arm_smmu_sva_domain_alloc, .domain_alloc_paging_flags = arm_smmu_domain_alloc_paging_flags, .probe_device = arm_smmu_probe_device, -- Gitee From 7da937bf657a5c7ab6eb3bf2728ec8bb4423299f Mon Sep 17 00:00:00 2001 From: "Luis Claudio R. Goncalves" Date: Fri, 6 Dec 2024 10:01:14 -0300 Subject: [PATCH 168/278] iommu/tegra241-cmdqv: do not use smp_processor_id in preemptible context ANBZ: #13617 commit 1f806218164d1bb93f3db21eaf61254b08acdf03 upstream. During boot some of the calls to tegra241_cmdqv_get_cmdq() will happen in preemptible context. 
As this function calls smp_processor_id(), if CONFIG_DEBUG_PREEMPT is enabled, these calls will trigger a series of "BUG: using smp_processor_id() in preemptible" backtraces. As tegra241_cmdqv_get_cmdq() only calls smp_processor_id() to use the CPU number as a factor to balance out traffic on cmdq usage, it is safe to use raw_smp_processor_id() here. Cc: Fixes: 918eb5c856f6 ("iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV") Signed-off-by: Luis Claudio R. Goncalves Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Tested-by: Nicolin Chen Link: https://lore.kernel.org/r/Z1L1mja3nXzsJ0Pk@uudg.org Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index c8ec74f089f3..6e41ddaa24d6 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -339,7 +339,7 @@ tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu, * one CPU at a time can enter the process, while the others * will be spinning at the same lock. */ - lidx = smp_processor_id() % cmdqv->num_lvcmdqs_per_vintf; + lidx = raw_smp_processor_id() % cmdqv->num_lvcmdqs_per_vintf; vcmdq = vintf->lvcmdqs[lidx]; if (!vcmdq || !READ_ONCE(vcmdq->enabled)) return NULL; -- Gitee From 9659f8b82597f3f8e77adaf87d4d9311bcb44702 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Mon, 2 Dec 2024 14:06:04 +0000 Subject: [PATCH 169/278] iommu/io-pgtable-arm: Add coverage for different OAS in selftest ANBZ: #13617 commit 376ce8b35ed15d5deee57bdecd8449f6a4df4c42 upstream. Run selftests with different OAS values intead of hardcoding it to 48 bits. We always keep OAS >= IAS to make the config valid for stage-2. This can be further improved, if we split IAS/OAS configuration for stage-1 and stage-2 (to use input sizes compatible with VA_BITS as SMMUv3 does, or IAS > OAS which is valid for stage-1). However, that adds more complexity, and the current change improves coverage and makes it possible to test all concatenation cases. 
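A minimal sketch of the constraint the reworked selftest loop enforces (illustrative, not the patch code):

	/* Stage-2 configs require the input address size to be no larger
	 * than the output address size, so only ias <= oas pairs are run. */
	static bool cfg_pair_valid(unsigned int ias, unsigned int oas)
	{
		return ias <= oas;
	}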
Signed-off-by: Mostafa Saleh Link: https://lore.kernel.org/r/20241202140604.422235-3-smostafa@google.com Signed-off-by: Will Deacon [Jay Chen Conflicts from anolis patch e34309e426670db702ca9532241a9f154666089a ] Signed-off-by: Jay Chen --- drivers/iommu/io-pgtable-arm.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 19b529e28ec7..9f15fc09ac06 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -1347,11 +1347,11 @@ static int __init arm_lpae_do_selftests(void) SZ_64K | SZ_512M, }; - static const unsigned int ias[] __initconst = { + static const unsigned int address_size[] __initconst = { 32, 36, 40, 42, 44, 48, }; - int i, j, pass = 0, fail = 0; + int i, j, k, pass = 0, fail = 0; struct device *dev = kmalloc(sizeof(struct device), GFP_KERNEL | __GFP_NOFAIL); struct io_pgtable_cfg cfg = { .tlb = &dummy_tlb_ops, @@ -1364,15 +1364,19 @@ static int __init arm_lpae_do_selftests(void) set_dev_node(dev, NUMA_NO_NODE); for (i = 0; i < ARRAY_SIZE(pgsize); ++i) { - for (j = 0; j < ARRAY_SIZE(ias); ++j) { - cfg.pgsize_bitmap = pgsize[i]; - cfg.ias = ias[j]; - pr_info("selftest: pgsize_bitmap 0x%08lx, IAS %u\n", - pgsize[i], ias[j]); - if (arm_lpae_run_tests(&cfg)) - fail++; - else - pass++; + for (j = 0; j < ARRAY_SIZE(address_size); ++j) { + /* Don't use ias > oas as it is not valid for stage-2. */ + for (k = 0; k <= j; ++k) { + cfg.pgsize_bitmap = pgsize[i]; + cfg.ias = address_size[k]; + cfg.oas = address_size[j]; + pr_info("selftest: pgsize_bitmap 0x%08lx, IAS %u OAS %u\n", + pgsize[i], cfg.ias, cfg.oas); + if (arm_lpae_run_tests(&cfg)) + fail++; + else + pass++; + } } } -- Gitee From 99c16f68641ea3485961e35aedbb5103a051bc1f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Dec 2024 20:13:41 -0400 Subject: [PATCH 170/278] iommu/amd: Put list_add/del(dev_data) back under the domain->lock ANBZ: #13617 commit 4a552f7890f0870f6d9fd4fbc6c05cea7bfd4503 upstream. The list domain->dev_list is protected by the domain->lock spinlock. Any iteration, addition or removal must be under the lock. Move the list_del() up into the critical section. pdom_is_sva_capable(), and destroy_gcr3_table() do not interact with the list element. Wrap the list_add() in a lock, it would make more sense if this was under the same critical section as adjusting the refcounts earlier, but that requires more complications. 
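The rule being restored here is the usual one for domain->dev_list; as a minimal sketch (not the patch itself):

	/* Any mutation of domain->dev_list must happen inside domain->lock. */
	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);
	list_add(&dev_data->list, &domain->dev_list);
	spin_unlock_irqrestore(&domain->lock, flags);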
Fixes: d6b47dec3684 ("iommu/amd: Reduce domain lock scope in attach device path") Signed-off-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/1-v1-3b9edcf8067d+3975-amd_dev_list_locking_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index cabc336f1905..1f0d257ff8fa 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2082,6 +2082,7 @@ static int attach_device(struct device *dev, struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); struct pci_dev *pdev; + unsigned long flags; int ret = 0; mutex_lock(&dev_data->mutex); @@ -2122,7 +2123,9 @@ static int attach_device(struct device *dev, /* Update data structures */ dev_data->domain = domain; + spin_lock_irqsave(&domain->lock, flags); list_add(&dev_data->list, &domain->dev_list); + spin_unlock_irqrestore(&domain->lock, flags); /* Update device table */ dev_update_dte(dev_data, true); @@ -2169,6 +2172,7 @@ static void detach_device(struct device *dev) /* Flush IOTLB and wait for the flushes to finish */ spin_lock_irqsave(&domain->lock, flags); amd_iommu_domain_flush_all(domain); + list_del(&dev_data->list); spin_unlock_irqrestore(&domain->lock, flags); /* Clear GCR3 table */ @@ -2177,7 +2181,6 @@ static void detach_device(struct device *dev) /* Update data structures */ dev_data->domain = NULL; - list_del(&dev_data->list); /* decrease reference counters - needs to happen after the flushes */ pdom_detach_iommu(iommu, domain); -- Gitee From 3bbfeaa06a0e0c80af221327d0e0b3fe6f373b18 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Dec 2024 20:13:42 -0400 Subject: [PATCH 171/278] iommu/amd: Add lockdep asserts for domain->dev_list ANBZ: #13617 commit 91da87d38cca46be6f2f4f3c820f5d817c1866dc upstream. Add an assertion to all the iteration points that don't obviously have the lock held already. These all take the locker higher in their call chains. 
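A minimal sketch of the pattern added at each iteration point (illustrative):

	lockdep_assert_held(&pdom->lock);
	list_for_each_entry(dev_data, &pdom->dev_list, list) {
		/* per-device work; safe because the caller holds pdom->lock */
	}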
Signed-off-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/2-v1-3b9edcf8067d+3975-amd_dev_list_locking_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 1f0d257ff8fa..35322b7e788f 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1424,6 +1424,7 @@ static int domain_flush_pages_v2(struct protection_domain *pdom, struct iommu_cmd cmd; int ret = 0; + lockdep_assert_held(&pdom->lock); list_for_each_entry(dev_data, &pdom->dev_list, list) { struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); u16 domid = dev_data->gcr3_info.domid; @@ -1473,6 +1474,8 @@ static void __domain_flush_pages(struct protection_domain *domain, ioasid_t pasid = IOMMU_NO_PASID; bool gn = false; + lockdep_assert_held(&domain->lock); + if (pdom_is_v2_pgtbl_mode(domain)) { gn = true; ret = domain_flush_pages_v2(domain, address, size); @@ -1594,6 +1597,8 @@ void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) { struct iommu_dev_data *dev_data; + lockdep_assert_held(&domain->lock); + list_for_each_entry(dev_data, &domain->dev_list, list) { struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); -- Gitee From 0256a12c0454a6c6694ce2f2933d19d0cb977909 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 Dec 2024 16:13:39 -0400 Subject: [PATCH 172/278] iommufd/selftest: Remove domain_alloc_paging() ANBZ: #13617 commit d61927d784e25c0ce5ab6015538e6a82f152c24e upstream. Since this implements domain_alloc_paging_flags() it only needs one op. Fold mock_domain_alloc_paging() into mock_domain_alloc_paging_flags(). Link: https://patch.msgid.link/r/0-v1-8a3e7e21ff6a+1745d-iommufd_paging_flags_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/selftest.c | 43 ++++++++++++-------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index a0de6d6d4e68..6512c1d16348 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -311,25 +311,6 @@ static const struct iommu_dirty_ops dirty_ops = { .read_and_clear_dirty = mock_domain_read_and_clear_dirty, }; -static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) -{ - struct mock_dev *mdev = to_mock_dev(dev); - struct mock_iommu_domain *mock; - - mock = kzalloc(sizeof(*mock), GFP_KERNEL); - if (!mock) - return NULL; - mock->domain.geometry.aperture_start = MOCK_APERTURE_START; - mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; - mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; - if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA) - mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE; - mock->domain.ops = mock_ops.default_domain_ops; - mock->domain.type = IOMMU_DOMAIN_UNMANAGED; - xa_init(&mock->pfns); - return &mock->domain; -} - static struct mock_iommu_domain_nested * __mock_domain_alloc_nested(const struct iommu_user_data *user_data) { @@ -385,21 +366,30 @@ mock_domain_alloc_paging_flags(struct device *dev, u32 flags, bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_NEST_PARENT; - bool no_dirty_ops = to_mock_dev(dev)->flags & - MOCK_FLAGS_DEVICE_NO_DIRTY; - struct iommu_domain *domain; + struct mock_dev *mdev = 
to_mock_dev(dev); + bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; + struct mock_iommu_domain *mock; if (user_data) return ERR_PTR(-EOPNOTSUPP); if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) return ERR_PTR(-EOPNOTSUPP); - domain = mock_domain_alloc_paging(dev); - if (!domain) + mock = kzalloc(sizeof(*mock), GFP_KERNEL); + if (!mock) return ERR_PTR(-ENOMEM); + mock->domain.geometry.aperture_start = MOCK_APERTURE_START; + mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; + mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; + if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA) + mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE; + mock->domain.ops = mock_ops.default_domain_ops; + mock->domain.type = IOMMU_DOMAIN_UNMANAGED; + xa_init(&mock->pfns); + if (has_dirty_flag) - domain->dirty_ops = &dirty_ops; - return domain; + mock->domain.dirty_ops = &dirty_ops; + return &mock->domain; } static void mock_domain_free(struct iommu_domain *domain) @@ -713,7 +703,6 @@ static const struct iommu_ops mock_ops = { .owner = THIS_MODULE, .pgsize_bitmap = MOCK_IO_PAGE_SIZE, .hw_info = mock_domain_hw_info, - .domain_alloc_paging = mock_domain_alloc_paging, .domain_alloc_paging_flags = mock_domain_alloc_paging_flags, .domain_alloc_nested = mock_domain_alloc_nested, .capable = mock_domain_capable, -- Gitee From 4e3fc9f2cec9184332a1e2886a23e74fc0fbea7a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Sat, 7 Dec 2024 04:01:08 -0800 Subject: [PATCH 173/278] iommufd: Deal with IOMMU_HWPT_FAULT_ID_VALID in iommufd core ANBZ: #13617 commit 11534b4de2a1bcc438ed90d031184c9c847e8560 upstream. IOMMU_HWPT_FAULT_ID_VALID is used to mark if the fault_id field of iommu_hwp_alloc is valid or not. As the fault_id field is handled in the iommufd core, so it makes sense to sanitize the IOMMU_HWPT_FAULT_ID_VALID flag in the iommufd core, and mask it out before passing the user flags to the iommu drivers. Link: https://patch.msgid.link/r/20241207120108.5640-1-yi.l.liu@intel.com Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 8 +------- drivers/iommu/intel/iommu.c | 3 +-- drivers/iommu/iommufd/hw_pagetable.c | 10 +++++++--- drivers/iommu/iommufd/selftest.c | 2 +- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 6cc14d82399f..72050bb507fd 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -178,18 +178,12 @@ arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, const struct iommu_user_data *user_data) { struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); - const u32 SUPPORTED_FLAGS = IOMMU_HWPT_FAULT_ID_VALID; struct arm_smmu_nested_domain *nested_domain; struct iommu_hwpt_arm_smmuv3 arg; bool enable_ats = false; int ret; - /* - * Faults delivered to the nested domain are faults that originated by - * the S1 in the domain. 
The core code will match all PASIDs when - * delivering the fault due to user_pasid_table - */ - if (flags & ~SUPPORTED_FLAGS) + if (flags) return ERR_PTR(-EOPNOTSUPP); ret = iommu_copy_struct_from_user(&arg, user_data, diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 1c8307d8a912..f6d4e37ca038 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3399,8 +3399,7 @@ intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, bool first_stage; if (flags & - (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING - | IOMMU_HWPT_FAULT_ID_VALID))) + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) return ERR_PTR(-EOPNOTSUPP); if (nested_parent && !nested_supported(iommu)) return ERR_PTR(-EOPNOTSUPP); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index ce03c3804651..598be26a14e2 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -140,8 +140,8 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; if (ops->domain_alloc_paging_flags) { - hwpt->domain = ops->domain_alloc_paging_flags(idev->dev, flags, - user_data); + hwpt->domain = ops->domain_alloc_paging_flags(idev->dev, + flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; @@ -280,6 +280,8 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, struct iommufd_hw_pagetable *hwpt; int rc; + if (flags & ~IOMMU_HWPT_FAULT_ID_VALID) + return ERR_PTR(-EOPNOTSUPP); if (!user_data->len) return ERR_PTR(-EOPNOTSUPP); if (!viommu->ops || !viommu->ops->alloc_domain_nested) @@ -296,7 +298,9 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, hwpt_nested->parent = viommu->hwpt; hwpt->domain = - viommu->ops->alloc_domain_nested(viommu, flags, user_data); + viommu->ops->alloc_domain_nested(viommu, + flags & ~IOMMU_HWPT_FAULT_ID_VALID, + user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 6512c1d16348..d40deb0a4f06 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -585,7 +585,7 @@ mock_viommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, struct mock_viommu *mock_viommu = to_mock_viommu(viommu); struct mock_iommu_domain_nested *mock_nested; - if (flags & ~IOMMU_HWPT_FAULT_ID_VALID) + if (flags) return ERR_PTR(-EOPNOTSUPP); mock_nested = __mock_domain_alloc_nested(user_data); -- Gitee From 8e4712fc667aa086d60dd568464bb831dd1e92af Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 13 Dec 2024 09:17:50 +0800 Subject: [PATCH 174/278] iommu/vt-d: Remove cache tags before disabling ATS ANBZ: #13617 commit 1f2557e08a617a4b5e92a48a1a9a6f86621def18 upstream. The current implementation removes cache tags after disabling ATS, leading to potential memory leaks and kernel crashes. Specifically, CACHE_TAG_DEVTLB type cache tags may still remain in the list even after the domain is freed, causing a use-after-free condition. This issue really shows up when multiple VFs from different PFs passed through to a single user-space process via vfio-pci. 
In such cases, the kernel may crash with kernel messages like: BUG: kernel NULL pointer dereference, address: 0000000000000014 PGD 19036a067 P4D 1940a3067 PUD 136c9b067 PMD 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 74 UID: 0 PID: 3183 Comm: testCli Not tainted 6.11.9 #2 RIP: 0010:cache_tag_flush_range+0x9b/0x250 Call Trace: ? __die+0x1f/0x60 ? page_fault_oops+0x163/0x590 ? exc_page_fault+0x72/0x190 ? asm_exc_page_fault+0x22/0x30 ? cache_tag_flush_range+0x9b/0x250 ? cache_tag_flush_range+0x5d/0x250 intel_iommu_tlb_sync+0x29/0x40 intel_iommu_unmap_pages+0xfe/0x160 __iommu_unmap+0xd8/0x1a0 vfio_unmap_unpin+0x182/0x340 [vfio_iommu_type1] vfio_remove_dma+0x2a/0xb0 [vfio_iommu_type1] vfio_iommu_type1_ioctl+0xafa/0x18e0 [vfio_iommu_type1] Move cache_tag_unassign_domain() before iommu_disable_pci_caps() to fix it. Fixes: 3b1d9e2b2d68 ("iommu/vt-d: Add cache tag assignment interface") Cc: stable@vger.kernel.org Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241129020506.576413-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f6d4e37ca038..511e5be98250 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3279,6 +3279,9 @@ void device_block_translation(struct device *dev) struct intel_iommu *iommu = info->iommu; unsigned long flags; + if (info->domain) + cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); + iommu_disable_pci_caps(info); if (!dev_is_real_dma_subdevice(dev)) { if (sm_supported(iommu)) @@ -3295,7 +3298,6 @@ void device_block_translation(struct device *dev) list_del(&info->link); spin_unlock_irqrestore(&info->domain->lock, flags); - cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); domain_detach_iommu(info->domain, iommu); info->domain = NULL; } -- Gitee From be70f7c0abe519f6d7f5003de7bfbdc6236977a8 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 13 Dec 2024 09:17:51 +0800 Subject: [PATCH 175/278] iommu/vt-d: Fix qi_batch NULL pointer with nested parent domain ANBZ: #13617 commit 74536f91962d5f6af0a42414773ce61e653c10ee upstream. The qi_batch is allocated when assigning cache tag for a domain. While for nested parent domain, it is missed. Hence, when trying to map pages to the nested parent, NULL dereference occurred. Also, there is potential memleak since there is no lock around domain->qi_batch allocation. To solve it, add a helper for qi_batch allocation, and call it in both the __cache_tag_assign_domain() and __cache_tag_assign_parent_domain(). BUG: kernel NULL pointer dereference, address: 0000000000000200 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8104795067 P4D 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 223 UID: 0 PID: 4357 Comm: qemu-system-x86 Not tainted 6.13.0-rc1-00028-g4b50c3c3b998-dirty #2632 Call Trace: ? __die+0x24/0x70 ? page_fault_oops+0x80/0x150 ? do_user_addr_fault+0x63/0x7b0 ? exc_page_fault+0x7c/0x220 ? asm_exc_page_fault+0x26/0x30 ? cache_tag_flush_range_np+0x13c/0x260 intel_iommu_iotlb_sync_map+0x1a/0x30 iommu_map+0x61/0xf0 batch_to_domain+0x188/0x250 iopt_area_fill_domains+0x125/0x320 ? 
rcu_is_watching+0x11/0x50 iopt_map_pages+0x63/0x100 iopt_map_common.isra.0+0xa7/0x190 iopt_map_user_pages+0x6a/0x80 iommufd_ioas_map+0xcd/0x1d0 iommufd_fops_ioctl+0x118/0x1c0 __x64_sys_ioctl+0x93/0xc0 do_syscall_64+0x71/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: 705c1cdf1e73 ("iommu/vt-d: Introduce batched cache invalidation") Cc: stable@vger.kernel.org Co-developed-by: Lu Baolu Signed-off-by: Lu Baolu Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241210130322.17175-1-yi.l.liu@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/cache.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c index e5b89f728ad3..09694cca8752 100644 --- a/drivers/iommu/intel/cache.c +++ b/drivers/iommu/intel/cache.c @@ -105,12 +105,35 @@ static void cache_tag_unassign(struct dmar_domain *domain, u16 did, spin_unlock_irqrestore(&domain->cache_lock, flags); } +/* domain->qi_batch will be freed in iommu_free_domain() path. */ +static int domain_qi_batch_alloc(struct dmar_domain *domain) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&domain->cache_lock, flags); + if (domain->qi_batch) + goto out_unlock; + + domain->qi_batch = kzalloc(sizeof(*domain->qi_batch), GFP_ATOMIC); + if (!domain->qi_batch) + ret = -ENOMEM; +out_unlock: + spin_unlock_irqrestore(&domain->cache_lock, flags); + + return ret; +} + static int __cache_tag_assign_domain(struct dmar_domain *domain, u16 did, struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); int ret; + ret = domain_qi_batch_alloc(domain); + if (ret) + return ret; + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_IOTLB); if (ret || !info->ats_enabled) return ret; @@ -139,6 +162,10 @@ static int __cache_tag_assign_parent_domain(struct dmar_domain *domain, u16 did, struct device_domain_info *info = dev_iommu_priv_get(dev); int ret; + ret = domain_qi_batch_alloc(domain); + if (ret) + return ret; + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB); if (ret || !info->ats_enabled) return ret; @@ -190,13 +217,6 @@ int cache_tag_assign_domain(struct dmar_domain *domain, u16 did = domain_get_id_for_dev(domain, dev); int ret; - /* domain->qi_bach will be freed in iommu_free_domain() path. */ - if (!domain->qi_batch) { - domain->qi_batch = kzalloc(sizeof(*domain->qi_batch), GFP_KERNEL); - if (!domain->qi_batch) - return -ENOMEM; - } - ret = __cache_tag_assign_domain(domain, did, dev, pasid); if (ret || domain->domain.type != IOMMU_DOMAIN_NESTED) return ret; -- Gitee From 4b9ff0dd491897f4aaba7a31e74150101c8a8b5b Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 13 Dec 2024 09:17:52 +0800 Subject: [PATCH 176/278] iommu/vt-d: Avoid draining PRQ in sva mm release path ANBZ: #13617 commit dda2b8c3c6ccc50deae65cc75f246577348e2ec5 upstream. When a PASID is used for SVA by a device, it's possible that the PASID entry is cleared before the device flushes all ongoing DMA requests and removes the SVA domain. This can occur when an exception happens and the process terminates before the device driver stops DMA and calls the iommu driver to unbind the PASID. There's no need to drain the PRQ in the mm release path. Instead, the PRQ will be drained in the SVA unbind path. 
Unfortunately, commit c43e1ccdebf2 ("iommu/vt-d: Drain PRQs when domain removed from RID") changed this behavior by unconditionally draining the PRQ in intel_pasid_tear_down_entry(). This can lead to a potential sleeping-in-atomic-context issue. Smatch static checker warning: drivers/iommu/intel/prq.c:95 intel_iommu_drain_pasid_prq() warn: sleeping in atomic context To avoid this issue, prevent draining the PRQ in the SVA mm release path and restore the previous behavior. Fixes: c43e1ccdebf2 ("iommu/vt-d: Drain PRQs when domain removed from RID") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-iommu/c5187676-2fa2-4e29-94e0-4a279dc88b49@stanley.mountain/ Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241212021529.1104745-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/pasid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 0f2a926d3bd5..5b7d85f1e143 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -265,7 +265,8 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); devtlb_invalidation_with_pasid(iommu, dev, pasid); - intel_iommu_drain_pasid_prq(dev, pasid); + if (!fault_ignore) + intel_iommu_drain_pasid_prq(dev, pasid); } /* -- Gitee From 3d56461f3908d7bb3180f4840b399bfaa8f1cdd8 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 18 Nov 2024 05:49:29 +0000 Subject: [PATCH 177/278] iommu/amd: Misc ACPI IVRS debug info clean up ANBZ: #13617 commit f20a6e3eb2ef323aa0a6ac22f94293ce4f17d113 upstream. * Remove redundant AMD-Vi prefix. * Print IVHD device entry settings field using hex value. * Print root device of IVHD ACPI device entry using hex value. Signed-off-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20241118054937.5203-2-suravee.suthikulpanit@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 2 +- drivers/iommu/amd/init.c | 35 +++++++++++++---------------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index fdb0357e0bb9..af87b1d094c1 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -468,7 +468,7 @@ extern bool amd_iommu_dump; #define DUMP_printk(format, arg...) \ do { \ if (amd_iommu_dump) \ - pr_info("AMD-Vi: " format, ## arg); \ + pr_info(format, ## arg); \ } while(0); /* global flag if IOMMUs cache non-present entries */ diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index c205ac00130c..9b19570bfc50 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1238,7 +1238,7 @@ static int __init add_acpi_hid_device(u8 *hid, u8 *uid, u32 *devid, entry->cmd_line = cmd_line; entry->root_devid = (entry->devid & (~0x7)); - pr_info("%s, add hid:%s, uid:%s, rdevid:%d\n", + pr_info("%s, add hid:%s, uid:%s, rdevid:%#x\n", entry->cmd_line ? 
"cmd" : "ivrs", entry->hid, entry->uid, entry->root_devid); @@ -1330,15 +1330,14 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, switch (e->type) { case IVHD_DEV_ALL: - DUMP_printk(" DEV_ALL\t\t\tflags: %02x\n", e->flags); + DUMP_printk(" DEV_ALL\t\t\tsetting: %#02x\n", e->flags); for (dev_i = 0; dev_i <= pci_seg->last_bdf; ++dev_i) set_dev_entry_from_acpi(iommu, dev_i, e->flags, 0); break; case IVHD_DEV_SELECT: - DUMP_printk(" DEV_SELECT\t\t\t devid: %04x:%02x:%02x.%x " - "flags: %02x\n", + DUMP_printk(" DEV_SELECT\t\t\tdevid: %04x:%02x:%02x.%x flags: %#02x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid), @@ -1349,8 +1348,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_SELECT_RANGE_START: - DUMP_printk(" DEV_SELECT_RANGE_START\t " - "devid: %04x:%02x:%02x.%x flags: %02x\n", + DUMP_printk(" DEV_SELECT_RANGE_START\tdevid: %04x:%02x:%02x.%x flags: %#02x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid), @@ -1363,8 +1361,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_ALIAS: - DUMP_printk(" DEV_ALIAS\t\t\t devid: %04x:%02x:%02x.%x " - "flags: %02x devid_to: %02x:%02x.%x\n", + DUMP_printk(" DEV_ALIAS\t\t\tdevid: %04x:%02x:%02x.%x flags: %#02x devid_to: %02x:%02x.%x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid), @@ -1381,9 +1378,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_ALIAS_RANGE: - DUMP_printk(" DEV_ALIAS_RANGE\t\t " - "devid: %04x:%02x:%02x.%x flags: %02x " - "devid_to: %04x:%02x:%02x.%x\n", + DUMP_printk(" DEV_ALIAS_RANGE\t\tdevid: %04x:%02x:%02x.%x flags: %#02x devid_to: %04x:%02x:%02x.%x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid), @@ -1400,8 +1395,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_EXT_SELECT: - DUMP_printk(" DEV_EXT_SELECT\t\t devid: %04x:%02x:%02x.%x " - "flags: %02x ext: %08x\n", + DUMP_printk(" DEV_EXT_SELECT\t\tdevid: %04x:%02x:%02x.%x flags: %#02x ext: %08x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid), @@ -1413,8 +1407,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_EXT_SELECT_RANGE: - DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: " - "%04x:%02x:%02x.%x flags: %02x ext: %08x\n", + DUMP_printk(" DEV_EXT_SELECT_RANGE\tdevid: %04x:%02x:%02x.%x flags: %#02x ext: %08x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid), @@ -1427,7 +1420,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_RANGE_END: - DUMP_printk(" DEV_RANGE_END\t\t devid: %04x:%02x:%02x.%x\n", + DUMP_printk(" DEV_RANGE_END\t\tdevid: %04x:%02x:%02x.%x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid)); @@ -1460,11 +1453,12 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, else var = "UNKNOWN"; - DUMP_printk(" DEV_SPECIAL(%s[%d])\t\tdevid: %04x:%02x:%02x.%x\n", + DUMP_printk(" DEV_SPECIAL(%s[%d])\t\tdevid: %04x:%02x:%02x.%x, flags: %#02x\n", var, (int)handle, seg_id, PCI_BUS_NUM(devid), PCI_SLOT(devid), - PCI_FUNC(devid)); + PCI_FUNC(devid), + e->flags); ret = add_special_device(type, handle, &devid, false); if (ret) @@ -1524,11 +1518,12 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, } devid = PCI_SEG_DEVID_TO_SBDF(seg_id, e->devid); - DUMP_printk(" DEV_ACPI_HID(%s[%s])\t\tdevid: %04x:%02x:%02x.%x\n", + DUMP_printk(" 
DEV_ACPI_HID(%s[%s])\t\tdevid: %04x:%02x:%02x.%x, flags: %#02x\n", hid, uid, seg_id, PCI_BUS_NUM(devid), PCI_SLOT(devid), - PCI_FUNC(devid)); + PCI_FUNC(devid), + e->flags); flags = e->flags; -- Gitee From 8228eb4b04ccfe7809c3e1751610acb6f4e4f757 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 18 Nov 2024 05:49:30 +0000 Subject: [PATCH 178/278] iommu/amd: Disable AMD IOMMU if CMPXCHG16B feature is not supported ANBZ: #13617 commit 82582f85ed22ba6cd27fea76b4248745f3b9fdf7 upstream. According to the AMD IOMMU spec, IOMMU hardware reads the entire DTE in a single 256-bit transaction. It is recommended to update the DTE using a 128-bit operation followed by an INVALIDATE_DEVTAB_ENTRY command when IV=1b or V=1b before the change. According to the AMD BIOS and Kernel Developer's Guide (BDKG) dating back to the family 10h processor [1], which introduced the AMD IOMMU, AMD processors always have CPUID Fn0000_0001_ECX[CMPXCHG16B]=1. Therefore, it is safe to assume cmpxchg128 is available on all AMD processors with an IOMMU. In addition, the CMPXCHG16B feature has already been checked separately before enabling the GA, XT, and GAM modes. Consolidate the detection logic, and fail the IOMMU initialization if the feature is not supported. [1] https://www.amd.com/content/dam/amd/en/documents/archived-tech-docs/programmer-references/31116.pdf Reviewed-by: Jason Gunthorpe Suggested-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20241118054937.5203-3-suravee.suthikulpanit@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/init.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 9b19570bfc50..9a0979c888c2 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1751,13 +1751,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h, else iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET; - /* - * Note: GA (128-bit IRTE) mode requires cmpxchg16b supports. - * GAM also requires GA mode. Therefore, we need to - * check cmpxchg16b support before enabling it. - */ - if (!boot_cpu_has(X86_FEATURE_CX16) || - ((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0)) + /* GAM requires GA mode. */ + if ((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0) amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY; break; case 0x11: @@ -1767,13 +1762,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h, else iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET; - /* - * Note: GA (128-bit IRTE) mode requires cmpxchg16b supports. - * XT, GAM also requires GA mode. Therefore, we need to - * check cmpxchg16b support before enabling them. - */ - if (!boot_cpu_has(X86_FEATURE_CX16) || - ((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0)) { + /* XT and GAM require GA mode. */ + if ((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0) { amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY; break; } @@ -3035,6 +3025,11 @@ static int __init early_amd_iommu_init(void) return -EINVAL; } + if (!boot_cpu_has(X86_FEATURE_CX16)) { + pr_err("Failed to initialize. 
The CMPXCHG16B feature is required.\n"); + return -EINVAL; + } + /* * Validate checksum here so we don't need to do it when * we actually parse the table -- Gitee From e67ab321be3320183dd4a896a90c359a07dd7d1f Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 18 Nov 2024 05:49:31 +0000 Subject: [PATCH 179/278] iommu/amd: Introduce struct ivhd_dte_flags to store persistent DTE flags ANBZ: #13617 commit 7bea695ada0e84c40685551159068996cea29ef8 upstream. During early initialization, the driver parses IVRS IVHD block to get list of downstream devices along with their DTE flags (i.e INITPass, EIntPass, NMIPass, SysMgt, Lint0Pass, Lint1Pass). This information is currently store in the device DTE, and needs to be preserved when clearing and configuring each DTE, which makes it difficult to manage. Introduce struct ivhd_dte_flags to store IVHD DTE settings for a device or range of devices, which are stored in the amd_ivhd_dev_flags_list during initial IVHD parsing. Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20241118054937.5203-4-suravee.suthikulpanit@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 16 ++++ drivers/iommu/amd/init.c | 113 +++++++++++++++++++++------- 2 files changed, 100 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index af87b1d094c1..ae5f1e031722 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -220,6 +220,8 @@ #define DEV_ENTRY_EX 0x67 #define DEV_ENTRY_SYSMGT1 0x68 #define DEV_ENTRY_SYSMGT2 0x69 +#define DTE_DATA1_SYSMGT_MASK GENMASK_ULL(41, 40) + #define DEV_ENTRY_IRQ_TBL_EN 0x80 #define DEV_ENTRY_INIT_PASS 0xb8 #define DEV_ENTRY_EINT_PASS 0xb9 @@ -516,6 +518,9 @@ extern struct kmem_cache *amd_iommu_irq_cache; #define for_each_pdom_dev_data_safe(pdom_dev_data, next, pdom) \ list_for_each_entry_safe((pdom_dev_data), (next), &pdom->dev_data_list, list) +#define for_each_ivhd_dte_flags(entry) \ + list_for_each_entry((entry), &amd_ivhd_dev_flags_list, list) + struct amd_iommu; struct iommu_domain; struct irq_domain; @@ -884,6 +889,17 @@ struct dev_table_entry { u64 data[4]; }; +/* + * Structure to sture persistent DTE flags from IVHD + */ +struct ivhd_dte_flags { + struct list_head list; + u16 segid; + u16 devid_first; + u16 devid_last; + struct dev_table_entry dte; +}; + /* * One entry for unity mappings parsed out of the ACPI table. */ diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 9a0979c888c2..1cec9069910e 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -173,8 +173,8 @@ bool amd_iommu_snp_en; EXPORT_SYMBOL(amd_iommu_snp_en); LIST_HEAD(amd_iommu_pci_seg_list); /* list of all PCI segments */ -LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the - system */ +LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ +LIST_HEAD(amd_ivhd_dev_flags_list); /* list of all IVHD device entry settings */ /* Number of IOMMUs present in the system */ static int amd_iommus_present; @@ -983,6 +983,14 @@ static void iommu_enable_gt(struct amd_iommu *iommu) } /* sets a specific bit in the device table entry. 
*/ +static void set_dte_bit(struct dev_table_entry *dte, u8 bit) +{ + int i = (bit >> 6) & 0x03; + int _bit = bit & 0x3f; + + dte->data[i] |= (1UL << _bit); +} + static void __set_dev_entry_bit(struct dev_table_entry *dev_table, u16 devid, u8 bit) { @@ -1135,6 +1143,19 @@ static bool copy_device_table(void) return true; } +static bool search_ivhd_dte_flags(u16 segid, u16 first, u16 last) +{ + struct ivhd_dte_flags *e; + + for_each_ivhd_dte_flags(e) { + if ((e->segid == segid) && + (e->devid_first == first) && + (e->devid_last == last)) + return true; + } + return false; +} + void amd_iommu_apply_erratum_63(struct amd_iommu *iommu, u16 devid) { int sysmgt; @@ -1150,27 +1171,66 @@ void amd_iommu_apply_erratum_63(struct amd_iommu *iommu, u16 devid) * This function takes the device specific flags read from the ACPI * table and sets up the device table entry with that information */ -static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, - u16 devid, u32 flags, u32 ext_flags) +static void __init +set_dev_entry_from_acpi_range(struct amd_iommu *iommu, u16 first, u16 last, + u32 flags, u32 ext_flags) { - if (flags & ACPI_DEVFLAG_INITPASS) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_INIT_PASS); - if (flags & ACPI_DEVFLAG_EXTINT) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_EINT_PASS); - if (flags & ACPI_DEVFLAG_NMI) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_NMI_PASS); - if (flags & ACPI_DEVFLAG_SYSMGT1) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_SYSMGT1); - if (flags & ACPI_DEVFLAG_SYSMGT2) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_SYSMGT2); - if (flags & ACPI_DEVFLAG_LINT0) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_LINT0_PASS); - if (flags & ACPI_DEVFLAG_LINT1) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_LINT1_PASS); + int i; + struct dev_table_entry dte = {}; + + /* Parse IVHD DTE setting flags and store information */ + if (flags) { + struct ivhd_dte_flags *d; - amd_iommu_apply_erratum_63(iommu, devid); + if (search_ivhd_dte_flags(iommu->pci_seg->id, first, last)) + return; - amd_iommu_set_rlookup_table(iommu, devid); + d = kzalloc(sizeof(struct ivhd_dte_flags), GFP_KERNEL); + if (!d) + return; + + pr_debug("%s: devid range %#x:%#x\n", __func__, first, last); + + if (flags & ACPI_DEVFLAG_INITPASS) + set_dte_bit(&dte, DEV_ENTRY_INIT_PASS); + if (flags & ACPI_DEVFLAG_EXTINT) + set_dte_bit(&dte, DEV_ENTRY_EINT_PASS); + if (flags & ACPI_DEVFLAG_NMI) + set_dte_bit(&dte, DEV_ENTRY_NMI_PASS); + if (flags & ACPI_DEVFLAG_SYSMGT1) + set_dte_bit(&dte, DEV_ENTRY_SYSMGT1); + if (flags & ACPI_DEVFLAG_SYSMGT2) + set_dte_bit(&dte, DEV_ENTRY_SYSMGT2); + if (flags & ACPI_DEVFLAG_LINT0) + set_dte_bit(&dte, DEV_ENTRY_LINT0_PASS); + if (flags & ACPI_DEVFLAG_LINT1) + set_dte_bit(&dte, DEV_ENTRY_LINT1_PASS); + + /* Apply erratum 63, which needs info in initial_dte */ + if (FIELD_GET(DTE_DATA1_SYSMGT_MASK, dte.data[1]) == 0x1) + dte.data[0] |= DTE_FLAG_IW; + + memcpy(&d->dte, &dte, sizeof(dte)); + d->segid = iommu->pci_seg->id; + d->devid_first = first; + d->devid_last = last; + list_add_tail(&d->list, &amd_ivhd_dev_flags_list); + } + + for (i = first; i <= last; i++) { + if (flags) { + struct dev_table_entry *dev_table = get_dev_table(iommu); + + memcpy(&dev_table[i], &dte, sizeof(dte)); + } + amd_iommu_set_rlookup_table(iommu, i); + } +} + +static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, + u16 devid, u32 flags, u32 ext_flags) +{ + set_dev_entry_from_acpi_range(iommu, devid, devid, flags, ext_flags); } int __init add_special_device(u8 type, u8 id, u32 *devid, bool cmd_line) @@ 
-1331,9 +1391,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, case IVHD_DEV_ALL: DUMP_printk(" DEV_ALL\t\t\tsetting: %#02x\n", e->flags); - - for (dev_i = 0; dev_i <= pci_seg->last_bdf; ++dev_i) - set_dev_entry_from_acpi(iommu, dev_i, e->flags, 0); + set_dev_entry_from_acpi_range(iommu, 0, pci_seg->last_bdf, e->flags, 0); break; case IVHD_DEV_SELECT: @@ -1427,14 +1485,11 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { - if (alias) { + if (alias) pci_seg->alias_table[dev_i] = devid_to; - set_dev_entry_from_acpi(iommu, - devid_to, flags, ext_flags); - } - set_dev_entry_from_acpi(iommu, dev_i, - flags, ext_flags); } + set_dev_entry_from_acpi_range(iommu, devid_start, devid, flags, ext_flags); + set_dev_entry_from_acpi(iommu, devid_to, flags, ext_flags); break; case IVHD_DEV_SPECIAL: { u8 handle, type; -- Gitee From 62b8d90ac12a180fe4d698fd2e17e56da6443717 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 18 Nov 2024 05:49:32 +0000 Subject: [PATCH 180/278] iommu/amd: Introduce helper function to update 256-bit DTE ANBZ: #13617 commit 8b3f78733814b180089a400743b6f19d118aec62 upstream. The current implementation does not follow the 128-bit write requirement for updating the DTE as specified in the AMD I/O Virtualization Technology (IOMMU) Specification. Therefore, modify struct dev_table_entry to contain a union with a u128 data array, and introduce a helper function update_dte256() that updates the 256-bit DTE using two 128-bit cmpxchg operations on the modified structure, taking the DTE[V, GV] bits into account to ensure the proper order of DTE programming and flushing. In addition, introduce a per-DTE spin_lock, struct dev_data.dte_lock, to provide synchronization when updating the DTE and prevent cmpxchg128 failures.
Suggested-by: Jason Gunthorpe Suggested-by: Uros Bizjak Reviewed-by: Jason Gunthorpe Reviewed-by: Uros Bizjak Signed-off-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20241118054937.5203-5-suravee.suthikulpanit@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 10 ++- drivers/iommu/amd/iommu.c | 123 ++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index ae5f1e031722..ea7922b06325 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -427,9 +427,13 @@ #define DTE_GCR3_SHIFT_C 43 #define DTE_GPT_LEVEL_SHIFT 54 +#define DTE_GPT_LEVEL_MASK GENMASK_ULL(55, 54) #define GCR3_VALID 0x01ULL +/* DTE[128:179] | DTE[184:191] */ +#define DTE_DATA2_INTR_MASK ~GENMASK_ULL(55, 52) + #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR) #define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD) @@ -842,6 +846,7 @@ struct devid_map { struct iommu_dev_data { /*Protect against attach/detach races */ struct mutex mutex; + spinlock_t dte_lock; /* DTE lock for 256-bit access */ struct list_head list; /* For domain->dev_list */ struct llist_node dev_data_list; /* For global dev_data_list */ @@ -886,7 +891,10 @@ extern struct list_head amd_iommu_list; * Structure defining one entry in the device table */ struct dev_table_entry { - u64 data[4]; + union { + u64 data[4]; + u128 data128[2]; + }; }; /* diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 35322b7e788f..7088fd0f90f9 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -83,12 +83,125 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, static void set_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data); +static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid); + /**************************************************************************** * * Helper functions * ****************************************************************************/ +static __always_inline void amd_iommu_atomic128_set(__int128 *ptr, __int128 val) +{ + /* + * Note: + * We use arch_cmpxchg128_local() because: + * - Need cmpxchg16b instruction mainly for 128-bit store to DTE + * (not necessary for cmpxchg since this function is already + * protected by a spin_lock for this DTE). + * - Neither need LOCK_PREFIX nor try loop because of the spin_lock. + */ + arch_cmpxchg128_local(ptr, *ptr, val); +} + +static void write_dte_upper128(struct dev_table_entry *ptr, struct dev_table_entry *new) +{ + struct dev_table_entry old; + + old.data128[1] = ptr->data128[1]; + /* + * Preserve DTE_DATA2_INTR_MASK. This needs to be + * done here since it requires to be inside + * spin_lock(&dev_data->dte_lock) context. + */ + new->data[2] &= ~DTE_DATA2_INTR_MASK; + new->data[2] |= old.data[2] & DTE_DATA2_INTR_MASK; + + amd_iommu_atomic128_set(&ptr->data128[1], new->data128[1]); +} + +static void write_dte_lower128(struct dev_table_entry *ptr, struct dev_table_entry *new) +{ + amd_iommu_atomic128_set(&ptr->data128[0], new->data128[0]); +} + +/* + * Note: + * IOMMU reads the entire Device Table entry in a single 256-bit transaction + * but the driver is programming DTE using 2 128-bit cmpxchg. So, the driver + * need to ensure the following: + * - DTE[V|GV] bit is being written last when setting. + * - DTE[V|GV] bit is being written first when clearing. 
+ * + * This function is used only by code, which updates DMA translation part of the DTE. + * So, only consider control bits related to DMA when updating the entry. + */ +static void update_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, + struct dev_table_entry *new) +{ + unsigned long flags; + struct dev_table_entry *dev_table = get_dev_table(iommu); + struct dev_table_entry *ptr = &dev_table[dev_data->devid]; + + spin_lock_irqsave(&dev_data->dte_lock, flags); + + if (!(ptr->data[0] & DTE_FLAG_V)) { + /* Existing DTE is not valid. */ + write_dte_upper128(ptr, new); + write_dte_lower128(ptr, new); + iommu_flush_dte_sync(iommu, dev_data->devid); + } else if (!(new->data[0] & DTE_FLAG_V)) { + /* Existing DTE is valid. New DTE is not valid. */ + write_dte_lower128(ptr, new); + write_dte_upper128(ptr, new); + iommu_flush_dte_sync(iommu, dev_data->devid); + } else if (!FIELD_GET(DTE_FLAG_GV, ptr->data[0])) { + /* + * Both DTEs are valid. + * Existing DTE has no guest page table. + */ + write_dte_upper128(ptr, new); + write_dte_lower128(ptr, new); + iommu_flush_dte_sync(iommu, dev_data->devid); + } else if (!FIELD_GET(DTE_FLAG_GV, new->data[0])) { + /* + * Both DTEs are valid. + * Existing DTE has guest page table, + * new DTE has no guest page table, + */ + write_dte_lower128(ptr, new); + write_dte_upper128(ptr, new); + iommu_flush_dte_sync(iommu, dev_data->devid); + } else if (FIELD_GET(DTE_GPT_LEVEL_MASK, ptr->data[2]) != + FIELD_GET(DTE_GPT_LEVEL_MASK, new->data[2])) { + /* + * Both DTEs are valid and have guest page table, + * but have different number of levels. So, we need + * to upadte both upper and lower 128-bit value, which + * require disabling and flushing. + */ + struct dev_table_entry clear = {}; + + /* First disable DTE */ + write_dte_lower128(ptr, &clear); + iommu_flush_dte_sync(iommu, dev_data->devid); + + /* Then update DTE */ + write_dte_upper128(ptr, new); + write_dte_lower128(ptr, new); + iommu_flush_dte_sync(iommu, dev_data->devid); + } else { + /* + * Both DTEs are valid and have guest page table, + * and same number of levels. We just need to only + * update the lower 128-bit. So no need to disable DTE. + */ + write_dte_lower128(ptr, new); + } + + spin_unlock_irqrestore(&dev_data->dte_lock, flags); +} + static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom) { return (pdom && (pdom->pd_mode == PD_MODE_V2)); @@ -209,6 +322,7 @@ static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid) return NULL; mutex_init(&dev_data->mutex); + spin_lock_init(&dev_data->dte_lock); dev_data->devid = devid; ratelimit_default_init(&dev_data->rs); @@ -1269,6 +1383,15 @@ static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) return iommu_queue_command(iommu, &cmd); } +static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid) +{ + int ret; + + ret = iommu_flush_dte(iommu, devid); + if (!ret) + iommu_completion_wait(iommu); +} + static void amd_iommu_flush_dte_all(struct amd_iommu *iommu) { u32 devid; -- Gitee From 18755ddb3bba972c54e67bf09dd93b64bcb2b328 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 18 Nov 2024 05:49:33 +0000 Subject: [PATCH 181/278] iommu/amd: Modify set_dte_entry() to use 256-bit DTE helpers ANBZ: #13617 commit fd5dff9de4be29b8ebec63b7e916915d4c984027 upstream. Also, the set_dte_entry() is used to program several DTE fields (e.g. stage1 table, stage2 table, domain id, and etc.), which is difficult to keep track with current implementation. 
Therefore, separate logic for clearing DTE (i.e. make_clear_dte) and another function for setting up the GCR3 Table Root Pointer, GIOV, GV, GLX, and GuestPagingMode into another function set_dte_gcr3_table(). Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20241118054937.5203-6-suravee.suthikulpanit@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu.h | 2 + drivers/iommu/amd/amd_iommu_types.h | 13 +-- drivers/iommu/amd/init.c | 30 ++++++- drivers/iommu/amd/iommu.c | 129 ++++++++++++++++------------ 4 files changed, 106 insertions(+), 68 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index d639a2514d18..b97dc92bbfeb 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -185,3 +185,5 @@ struct dev_table_entry *get_dev_table(struct amd_iommu *iommu); extern bool amd_iommu_snp_en; #endif + +struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 segid, u16 devid); diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index ea7922b06325..0bbda60d3cdc 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -409,8 +409,7 @@ #define DTE_FLAG_HAD (3ULL << 7) #define DTE_FLAG_GIOV BIT_ULL(54) #define DTE_FLAG_GV BIT_ULL(55) -#define DTE_GLX_SHIFT (56) -#define DTE_GLX_MASK (3) +#define DTE_GLX GENMASK_ULL(57, 56) #define DTE_FLAG_IR BIT_ULL(61) #define DTE_FLAG_IW BIT_ULL(62) @@ -418,13 +417,9 @@ #define DTE_FLAG_MASK (0x3ffULL << 32) #define DEV_DOMID_MASK 0xffffULL -#define DTE_GCR3_VAL_A(x) (((x) >> 12) & 0x00007ULL) -#define DTE_GCR3_VAL_B(x) (((x) >> 15) & 0x0ffffULL) -#define DTE_GCR3_VAL_C(x) (((x) >> 31) & 0x1fffffULL) - -#define DTE_GCR3_SHIFT_A 58 -#define DTE_GCR3_SHIFT_B 16 -#define DTE_GCR3_SHIFT_C 43 +#define DTE_GCR3_14_12 GENMASK_ULL(60, 58) +#define DTE_GCR3_30_15 GENMASK_ULL(31, 16) +#define DTE_GCR3_51_31 GENMASK_ULL(63, 43) #define DTE_GPT_LEVEL_SHIFT 54 #define DTE_GPT_LEVEL_MASK GENMASK_ULL(55, 54) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 1cec9069910e..dee79591a212 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1088,11 +1088,9 @@ static bool __copy_device_table(struct amd_iommu *iommu) } /* If gcr3 table existed, mask it out */ if (old_devtb[devid].data[0] & DTE_FLAG_GV) { - tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B; - tmp |= DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C; + tmp = (DTE_GCR3_30_15 | DTE_GCR3_51_31); pci_seg->old_dev_tbl_cpy[devid].data[1] &= ~tmp; - tmp = DTE_GCR3_VAL_A(~0ULL) << DTE_GCR3_SHIFT_A; - tmp |= DTE_FLAG_GV; + tmp = (DTE_GCR3_14_12 | DTE_FLAG_GV); pci_seg->old_dev_tbl_cpy[devid].data[0] &= ~tmp; } } @@ -1143,6 +1141,30 @@ static bool copy_device_table(void) return true; } +struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 segid, u16 devid) +{ + struct ivhd_dte_flags *e; + unsigned int best_len = UINT_MAX; + struct dev_table_entry *dte = NULL; + + for_each_ivhd_dte_flags(e) { + /* + * Need to go through the whole list to find the smallest range, + * which contains the devid. 
+ */ + if ((e->segid == segid) && + (e->devid_first <= devid) && (devid <= e->devid_last)) { + unsigned int len = e->devid_last - e->devid_first; + + if (len < best_len) { + dte = &(e->dte); + best_len = len; + } + } + } + return dte; +} + static bool search_ivhd_dte_flags(u16 segid, u16 first, u16 last) { struct ivhd_dte_flags *e; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 7088fd0f90f9..6c7d6eaad198 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1958,90 +1958,109 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid) return ret; } +static void make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *ptr, + struct dev_table_entry *new) +{ + /* All existing DTE must have V bit set */ + new->data128[0] = DTE_FLAG_V; + new->data128[1] = 0; +} + +/* + * Note: + * The old value for GCR3 table and GPT have been cleared from caller. + */ +static void set_dte_gcr3_table(struct amd_iommu *iommu, + struct iommu_dev_data *dev_data, + struct dev_table_entry *target) +{ + struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; + u64 gcr3; + + if (!gcr3_info->gcr3_tbl) + return; + + pr_debug("%s: devid=%#x, glx=%#x, gcr3_tbl=%#llx\n", + __func__, dev_data->devid, gcr3_info->glx, + (unsigned long long)gcr3_info->gcr3_tbl); + + gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl); + + target->data[0] |= DTE_FLAG_GV | + FIELD_PREP(DTE_GLX, gcr3_info->glx) | + FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12); + if (pdom_is_v2_pgtbl_mode(dev_data->domain)) + target->data[0] |= DTE_FLAG_GIOV; + + target->data[1] |= FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) | + FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31); + + /* Guest page table can only support 4 and 5 levels */ + if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) + target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL); + else + target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL); +} + static void set_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data) { - u64 pte_root = 0; - u64 flags = 0; - u32 old_domid; - u16 devid = dev_data->devid; u16 domid; + u32 old_domid; + struct dev_table_entry *initial_dte; + struct dev_table_entry new = {}; struct protection_domain *domain = dev_data->domain; - struct dev_table_entry *dev_table = get_dev_table(iommu); struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; + struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; if (gcr3_info && gcr3_info->gcr3_tbl) domid = dev_data->gcr3_info.domid; else domid = domain->id; + make_clear_dte(dev_data, dte, &new); + if (domain->iop.mode != PAGE_MODE_NONE) - pte_root = iommu_virt_to_phys(domain->iop.root); + new.data[0] = iommu_virt_to_phys(domain->iop.root); - pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK) + new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; - pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V; + new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V; /* - * When SNP is enabled, Only set TV bit when IOMMU - * page translation is in use. + * When SNP is enabled, we can only support TV=1 with non-zero domain ID. + * This is prevented by the SNP-enable and IOMMU_DOMAIN_IDENTITY check in + * do_iommu_domain_alloc(). 
*/ - if (!amd_iommu_snp_en || (domid != 0)) - pte_root |= DTE_FLAG_TV; - - flags = dev_table[devid].data[1]; - - if (dev_data->ats_enabled) - flags |= DTE_FLAG_IOTLB; + WARN_ON(amd_iommu_snp_en && (domid == 0)); + new.data[0] |= DTE_FLAG_TV; if (dev_data->ppr) - pte_root |= 1ULL << DEV_ENTRY_PPR; + new.data[0] |= 1ULL << DEV_ENTRY_PPR; if (domain->dirty_tracking) - pte_root |= DTE_FLAG_HAD; - - if (gcr3_info && gcr3_info->gcr3_tbl) { - u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl); - u64 glx = gcr3_info->glx; - u64 tmp; - - pte_root |= DTE_FLAG_GV; - pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT; + new.data[0] |= DTE_FLAG_HAD; - /* First mask out possible old values for GCR3 table */ - tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B; - flags &= ~tmp; - - tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C; - flags &= ~tmp; - - /* Encode GCR3 table into DTE */ - tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A; - pte_root |= tmp; - - tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B; - flags |= tmp; - - tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C; - flags |= tmp; + if (dev_data->ats_enabled) + new.data[1] |= DTE_FLAG_IOTLB; - if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) { - dev_table[devid].data[2] |= - ((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT); - } + old_domid = READ_ONCE(dte->data[1]) & DEV_DOMID_MASK; + new.data[1] |= domid; - /* GIOV is supported with V2 page table mode only */ - if (pdom_is_v2_pgtbl_mode(domain)) - pte_root |= DTE_FLAG_GIOV; + /* + * Restore cached persistent DTE bits, which can be set by information + * in IVRS table. See set_dev_entry_from_acpi(). + */ + initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid); + if (initial_dte) { + new.data128[0] |= initial_dte->data128[0]; + new.data128[1] |= initial_dte->data128[1]; } - flags &= ~DEV_DOMID_MASK; - flags |= domid; + set_dte_gcr3_table(iommu, dev_data, &new); - old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK; - dev_table[devid].data[1] = flags; - dev_table[devid].data[0] = pte_root; + update_dte256(iommu, dev_data, &new); /* * A kdump kernel might be replacing a domain ID that was copied from -- Gitee From 9db3cebf0b1a11458ea6c23af42f3bbe257200eb Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 18 Nov 2024 05:49:34 +0000 Subject: [PATCH 182/278] iommu/amd: Introduce helper function get_dte256() ANBZ: #13617 commit a2ce608a1eb65c2af99c58b63eae557165a0da87 upstream. And use it in clone_alias() along with update_dte256(). Also use get_dte256() in dump_dte_entry(). 
Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20241118054937.5203-7-suravee.suthikulpanit@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 62 ++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 6c7d6eaad198..f5a386061faf 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -85,6 +85,8 @@ static void set_dte_entry(struct amd_iommu *iommu, static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid); +static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid); + /**************************************************************************** * * Helper functions @@ -202,6 +204,21 @@ static void update_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_da spin_unlock_irqrestore(&dev_data->dte_lock, flags); } +static void get_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, + struct dev_table_entry *dte) +{ + unsigned long flags; + struct dev_table_entry *ptr; + struct dev_table_entry *dev_table = get_dev_table(iommu); + + ptr = &dev_table[dev_data->devid]; + + spin_lock_irqsave(&dev_data->dte_lock, flags); + dte->data128[0] = ptr->data128[0]; + dte->data128[1] = ptr->data128[1]; + spin_unlock_irqrestore(&dev_data->dte_lock, flags); +} + static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom) { return (pdom && (pdom->pd_mode == PD_MODE_V2)); @@ -350,9 +367,11 @@ static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid static int clone_alias(struct pci_dev *pdev, u16 alias, void *data) { + struct dev_table_entry new; struct amd_iommu *iommu; - struct dev_table_entry *dev_table; + struct iommu_dev_data *dev_data, *alias_data; u16 devid = pci_dev_id(pdev); + int ret = 0; if (devid == alias) return 0; @@ -361,13 +380,27 @@ static int clone_alias(struct pci_dev *pdev, u16 alias, void *data) if (!iommu) return 0; - amd_iommu_set_rlookup_table(iommu, alias); - dev_table = get_dev_table(iommu); - memcpy(dev_table[alias].data, - dev_table[devid].data, - sizeof(dev_table[alias].data)); + /* Copy the data from pdev */ + dev_data = dev_iommu_priv_get(&pdev->dev); + if (!dev_data) { + pr_err("%s : Failed to get dev_data for 0x%x\n", __func__, devid); + ret = -EINVAL; + goto out; + } + get_dte256(iommu, dev_data, &new); - return 0; + /* Setup alias */ + alias_data = find_dev_data(iommu, alias); + if (!alias_data) { + pr_err("%s : Failed to get alias dev_data for 0x%x\n", __func__, alias); + ret = -EINVAL; + goto out; + } + update_dte256(iommu, alias_data, &new); + + amd_iommu_set_rlookup_table(iommu, alias); +out: + return ret; } static void clone_aliases(struct amd_iommu *iommu, struct device *dev) @@ -640,6 +673,12 @@ static int iommu_init_device(struct amd_iommu *iommu, struct device *dev) return -ENOMEM; dev_data->dev = dev; + + /* + * The dev_iommu_priv_set() needes to be called before setup_aliases. + * Otherwise, subsequent call to dev_iommu_priv_get() will fail. 
+ */ + dev_iommu_priv_set(dev, dev_data); setup_aliases(iommu, dev); /* @@ -653,8 +692,6 @@ static int iommu_init_device(struct amd_iommu *iommu, struct device *dev) dev_data->flags = pdev_get_caps(to_pci_dev(dev)); } - dev_iommu_priv_set(dev, dev_data); - return 0; } @@ -685,10 +722,13 @@ static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev) static void dump_dte_entry(struct amd_iommu *iommu, u16 devid) { int i; - struct dev_table_entry *dev_table = get_dev_table(iommu); + struct dev_table_entry dte; + struct iommu_dev_data *dev_data = find_dev_data(iommu, devid); + + get_dte256(iommu, dev_data, &dte); for (i = 0; i < 4; ++i) - pr_err("DTE[%d]: %016llx\n", i, dev_table[devid].data[i]); + pr_err("DTE[%d]: %016llx\n", i, dte.data[i]); } static void dump_command(unsigned long phys_addr) -- Gitee From 8479876d26dbcca1032bf2601df8022edebc8ac4 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 18 Nov 2024 05:49:35 +0000 Subject: [PATCH 183/278] iommu/amd: Modify clear_dte_entry() to avoid in-place update ANBZ: #13617 commit 66ea3f96ae2b02cf543f3373b67aa6b8c6794926 upstream. By reusing the make_clear_dte() and update_dte256(). Also, there is no need to set TV bit for non-SNP system when clearing DTE for blocked domain, and no longer need to apply erratum 63 in clear_dte() since it is already stored in struct ivhd_dte_flags and apply in set_dte_entry(). Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20241118054937.5203-8-suravee.suthikulpanit@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index f5a386061faf..1dff8954d60c 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2112,19 +2112,16 @@ static void set_dte_entry(struct amd_iommu *iommu, } } -static void clear_dte_entry(struct amd_iommu *iommu, u16 devid) +/* + * Clear DMA-remap related flags to block all DMA (blockeded domain) + */ +static void clear_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data) { - struct dev_table_entry *dev_table = get_dev_table(iommu); - - /* remove entry from the device table seen by the hardware */ - dev_table[devid].data[0] = DTE_FLAG_V; - - if (!amd_iommu_snp_en) - dev_table[devid].data[0] |= DTE_FLAG_TV; - - dev_table[devid].data[1] &= DTE_FLAG_MASK; + struct dev_table_entry new = {}; + struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; - amd_iommu_apply_erratum_63(iommu, devid); + make_clear_dte(dev_data, dte, &new); + update_dte256(iommu, dev_data, &new); } /* Update and flush DTE for the given device */ @@ -2135,7 +2132,7 @@ static void dev_update_dte(struct iommu_dev_data *dev_data, bool set) if (set) set_dte_entry(iommu, dev_data); else - clear_dte_entry(iommu, dev_data->devid); + clear_dte_entry(iommu, dev_data); clone_aliases(iommu, dev_data->dev); device_flush_dte(dev_data); -- Gitee From c5fdc71d3ac53af73b8379462d08113927f9387e Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 18 Nov 2024 05:49:36 +0000 Subject: [PATCH 184/278] iommu/amd: Lock DTE before updating the entry with WRITE_ONCE() ANBZ: #13617 commit 457da57646686fcb38b3f61b153920ca08200078 upstream. When updating only within a 64-bit tuple of a DTE, just lock the DTE and use WRITE_ONCE() because it is writing to memory read back by HW. 
Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20241118054937.5203-9-suravee.suthikulpanit@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu.h | 1 + drivers/iommu/amd/iommu.c | 43 +++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index b97dc92bbfeb..5d9ad7ffd238 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -187,3 +187,4 @@ extern bool amd_iommu_snp_en; #endif struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 segid, u16 devid); +struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 1dff8954d60c..b5c87ca3aaf4 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -347,7 +347,7 @@ static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid) return dev_data; } -static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid) +struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid) { struct iommu_dev_data *dev_data; struct llist_node *node; @@ -2867,12 +2867,12 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, bool enable) { struct protection_domain *pdomain = to_pdomain(domain); - struct dev_table_entry *dev_table; + struct dev_table_entry *dte; struct iommu_dev_data *dev_data; bool domain_flush = false; struct amd_iommu *iommu; unsigned long flags; - u64 pte_root; + u64 new; spin_lock_irqsave(&pdomain->lock, flags); if (!(pdomain->dirty_tracking ^ enable)) { @@ -2881,16 +2881,15 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, } list_for_each_entry(dev_data, &pdomain->dev_list, list) { + spin_lock(&dev_data->dte_lock); iommu = get_amd_iommu_from_dev_data(dev_data); - - dev_table = get_dev_table(iommu); - pte_root = dev_table[dev_data->devid].data[0]; - - pte_root = (enable ? pte_root | DTE_FLAG_HAD : - pte_root & ~DTE_FLAG_HAD); + dte = &get_dev_table(iommu)[dev_data->devid]; + new = dte->data[0]; + new = (enable ? 
new | DTE_FLAG_HAD : new & ~DTE_FLAG_HAD); + dte->data[0] = new; + spin_unlock(&dev_data->dte_lock); /* Flush device DTE */ - dev_table[dev_data->devid].data[0] = pte_root; device_flush_dte(dev_data); domain_flush = true; } @@ -3157,17 +3156,23 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid, struct irq_remap_table *table) { - u64 dte; - struct dev_table_entry *dev_table = get_dev_table(iommu); + u64 new; + struct dev_table_entry *dte = &get_dev_table(iommu)[devid]; + struct iommu_dev_data *dev_data = search_dev_data(iommu, devid); + + if (dev_data) + spin_lock(&dev_data->dte_lock); - dte = dev_table[devid].data[2]; - dte &= ~DTE_IRQ_PHYS_ADDR_MASK; - dte |= iommu_virt_to_phys(table->table); - dte |= DTE_IRQ_REMAP_INTCTL; - dte |= DTE_INTTABLEN; - dte |= DTE_IRQ_REMAP_ENABLE; + new = READ_ONCE(dte->data[2]); + new &= ~DTE_IRQ_PHYS_ADDR_MASK; + new |= iommu_virt_to_phys(table->table); + new |= DTE_IRQ_REMAP_INTCTL; + new |= DTE_INTTABLEN; + new |= DTE_IRQ_REMAP_ENABLE; + WRITE_ONCE(dte->data[2], new); - dev_table[devid].data[2] = dte; + if (dev_data) + spin_unlock(&dev_data->dte_lock); } static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid) -- Gitee From 216c46f8ed8f859c9acc3475993453c5ace35e3b Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 18 Nov 2024 05:49:37 +0000 Subject: [PATCH 185/278] iommu/amd: Remove amd_iommu_apply_erratum_63() ANBZ: #13617 commit b0988acc94c021d8d428c9496649a1b4e6203011 upstream. Also replace __set_dev_entry_bit() with set_dte_bit() and remove unused helper functions. Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20241118054937.5203-10-suravee.suthikulpanit@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu.h | 1 - drivers/iommu/amd/init.c | 50 +++-------------------------------- 2 files changed, 3 insertions(+), 48 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 5d9ad7ffd238..c19990a282e6 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -16,7 +16,6 @@ irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data); irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data); irqreturn_t amd_iommu_int_thread_galog(int irq, void *data); irqreturn_t amd_iommu_int_handler(int irq, void *data); -void amd_iommu_apply_erratum_63(struct amd_iommu *iommu, u16 devid); void amd_iommu_restart_log(struct amd_iommu *iommu, const char *evt_type, u8 cntrl_intr, u8 cntrl_log, u32 status_run_mask, u32 status_overflow_mask); diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index dee79591a212..cd37e3fa39b5 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -991,38 +991,6 @@ static void set_dte_bit(struct dev_table_entry *dte, u8 bit) dte->data[i] |= (1UL << _bit); } -static void __set_dev_entry_bit(struct dev_table_entry *dev_table, - u16 devid, u8 bit) -{ - int i = (bit >> 6) & 0x03; - int _bit = bit & 0x3f; - - dev_table[devid].data[i] |= (1UL << _bit); -} - -static void set_dev_entry_bit(struct amd_iommu *iommu, u16 devid, u8 bit) -{ - struct dev_table_entry *dev_table = get_dev_table(iommu); - - return __set_dev_entry_bit(dev_table, devid, bit); -} - -static int __get_dev_entry_bit(struct dev_table_entry *dev_table, - u16 devid, u8 bit) -{ - int i = (bit >> 6) & 0x03; - int _bit = bit & 0x3f; - - return (dev_table[devid].data[i] & (1UL 
<< _bit)) >> _bit; -} - -static int get_dev_entry_bit(struct amd_iommu *iommu, u16 devid, u8 bit) -{ - struct dev_table_entry *dev_table = get_dev_table(iommu); - - return __get_dev_entry_bit(dev_table, devid, bit); -} - static bool __copy_device_table(struct amd_iommu *iommu) { u64 int_ctl, int_tab_len, entry = 0; @@ -1178,17 +1146,6 @@ static bool search_ivhd_dte_flags(u16 segid, u16 first, u16 last) return false; } -void amd_iommu_apply_erratum_63(struct amd_iommu *iommu, u16 devid) -{ - int sysmgt; - - sysmgt = get_dev_entry_bit(iommu, devid, DEV_ENTRY_SYSMGT1) | - (get_dev_entry_bit(iommu, devid, DEV_ENTRY_SYSMGT2) << 1); - - if (sysmgt == 0x01) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_IW); -} - /* * This function takes the device specific flags read from the ACPI * table and sets up the device table entry with that information @@ -2636,9 +2593,9 @@ static void init_device_table_dma(struct amd_iommu_pci_seg *pci_seg) return; for (devid = 0; devid <= pci_seg->last_bdf; ++devid) { - __set_dev_entry_bit(dev_table, devid, DEV_ENTRY_VALID); + set_dte_bit(&dev_table[devid], DEV_ENTRY_VALID); if (!amd_iommu_snp_en) - __set_dev_entry_bit(dev_table, devid, DEV_ENTRY_TRANSLATION); + set_dte_bit(&dev_table[devid], DEV_ENTRY_TRANSLATION); } } @@ -2666,8 +2623,7 @@ static void init_device_table(void) for_each_pci_segment(pci_seg) { for (devid = 0; devid <= pci_seg->last_bdf; ++devid) - __set_dev_entry_bit(pci_seg->dev_table, - devid, DEV_ENTRY_IRQ_TBL_EN); + set_dte_bit(&pci_seg->dev_table[devid], DEV_ENTRY_IRQ_TBL_EN); } } -- Gitee From 9a75ab114ba3a9a3748eee3761d924437a3c369b Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 4 Dec 2024 04:29:22 -0800 Subject: [PATCH 186/278] iommu: Prevent pasid attach if no ops->remove_dev_pasid ANBZ: #13617 commit fb3de9f9b085d003a8a869ca6a4789d1bfbb3f22 upstream. A driver should implement both the set_dev_pasid and remove_dev_pasid ops; otherwise there is no way to detach a pasid. In reality, it is impossible for an iommu driver to implement set_dev_pasid() without a remove_dev_pasid() op, but it is still better to check for it. Move the group check to be the first as dev_iommu_ops() may fail when there is no valid group. Also take the chance to remove the dev_has_iommu() check as it duplicates the group check.
Reviewed-by: Jason Gunthorpe Signed-off-by: Yi Liu Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241204122928.11987-2-yi.l.liu@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 00ede527e568..80d1962f658a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3414,16 +3414,19 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; struct group_device *device; + const struct iommu_ops *ops; int ret; - if (!domain->ops->set_dev_pasid) - return -EOPNOTSUPP; - if (!group) return -ENODEV; - if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner || - pasid == IOMMU_NO_PASID) + ops = dev_iommu_ops(dev); + + if (!domain->ops->set_dev_pasid || + !ops->remove_dev_pasid) + return -EOPNOTSUPP; + + if (ops != domain->owner || pasid == IOMMU_NO_PASID) return -EINVAL; mutex_lock(&group->mutex); -- Gitee From 361c28993de01bd7f57443147e8bdd15042e6063 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 4 Dec 2024 04:29:23 -0800 Subject: [PATCH 187/278] iommu: Consolidate the ops->remove_dev_pasid usage into a helper ANBZ: #13617 commit 1fbf73425f5169e2d183b2ca67bfe2f1019d11c0 upstream. Add a wrapper for the ops->remove_dev_pasid, this consolidates the iommu_ops fetching and callback invoking. It is also a preparation for starting the transition from using remove_dev_pasid op to detach pasid to the way using blocked_domain to detach pasid. Reviewed-by: Vasant Hegde Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241204122928.11987-3-yi.l.liu@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 80d1962f658a..b78a8f755230 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3358,6 +3358,14 @@ bool iommu_group_dma_owner_claimed(struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed); +static void iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, + struct iommu_domain *domain) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + + ops->remove_dev_pasid(dev, pasid, domain); +} + static int __iommu_set_group_pasid(struct iommu_domain *domain, struct iommu_group *group, ioasid_t pasid) { @@ -3376,11 +3384,9 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain, err_revert: last_gdev = device; for_each_group_device(group, device) { - const struct iommu_ops *ops = dev_iommu_ops(device->dev); - if (device == last_gdev) break; - ops->remove_dev_pasid(device->dev, pasid, domain); + iommu_remove_dev_pasid(device->dev, pasid, domain); } return ret; } @@ -3390,12 +3396,9 @@ static void __iommu_remove_group_pasid(struct iommu_group *group, struct iommu_domain *domain) { struct group_device *device; - const struct iommu_ops *ops; - for_each_group_device(group, device) { - ops = dev_iommu_ops(device->dev); - ops->remove_dev_pasid(device->dev, pasid, domain); - } + for_each_group_device(group, device) + iommu_remove_dev_pasid(device->dev, pasid, domain); } /* -- Gitee From 7ecd038b197311b64ce03ba368df036d8a3fcd03 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 4 Dec 2024 04:29:24 -0800 Subject: [PATCH 
188/278] iommu: Detaching pasid by attaching to the blocked_domain ANBZ: #13617 commit b18301b9156a0d8a0094fcd16a1b98816539eab0 upstream. The iommu drivers are on the way to detach pasid by attaching to the blocked domain. However, this cannot be done in one shot. During the transition, iommu core would select between the remove_dev_pasid op and the blocked domain. Suggested-by: Kevin Tian Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241204122928.11987-4-yi.l.liu@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b78a8f755230..723676e41ef0 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3362,8 +3362,18 @@ static void iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct iommu_domain *domain) { const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_domain *blocked_domain = ops->blocked_domain; + int ret = 1; - ops->remove_dev_pasid(dev, pasid, domain); + if (blocked_domain && blocked_domain->ops->set_dev_pasid) { + ret = blocked_domain->ops->set_dev_pasid(blocked_domain, + dev, pasid, domain); + } else { + ops->remove_dev_pasid(dev, pasid, domain); + ret = 0; + } + + WARN_ON(ret); } static int __iommu_set_group_pasid(struct iommu_domain *domain, @@ -3426,7 +3436,9 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, ops = dev_iommu_ops(dev); if (!domain->ops->set_dev_pasid || - !ops->remove_dev_pasid) + (!ops->remove_dev_pasid && + (!ops->blocked_domain || + !ops->blocked_domain->ops->set_dev_pasid))) return -EOPNOTSUPP; if (ops != domain->owner || pasid == IOMMU_NO_PASID) -- Gitee From 360a72140dda4b7a1b51bd453d67731fed25bdb4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 Dec 2024 04:29:25 -0800 Subject: [PATCH 189/278] iommu/arm-smmu-v3: Make the blocked domain support PASID ANBZ: #13617 commit ef181762cb544efc8c88b79ea9224e21ca5da533 upstream. The blocked domain is used to park RID to be blocking DMA state. This can be extended to PASID as well. By this, the remove_dev_pasid() op of ARM SMMUv3 can be dropped. 
Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241204122928.11987-5-yi.l.liu@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index d2d74fd2df61..29dd2785c97a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3073,13 +3073,12 @@ int arm_smmu_set_pasid(struct arm_smmu_master *master, return ret; } -static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid, - struct iommu_domain *domain) +static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old_domain) { + struct arm_smmu_domain *smmu_domain = to_smmu_domain(old_domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); - struct arm_smmu_domain *smmu_domain; - - smmu_domain = to_smmu_domain(domain); mutex_lock(&arm_smmu_asid_lock); arm_smmu_clear_cd(master, pasid); @@ -3100,6 +3099,7 @@ static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid, sid_domain->type == IOMMU_DOMAIN_BLOCKED) sid_domain->ops->attach_dev(sid_domain, dev); } + return 0; } static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, @@ -3181,6 +3181,7 @@ static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, static const struct iommu_domain_ops arm_smmu_blocked_ops = { .attach_dev = arm_smmu_attach_dev_blocked, + .set_dev_pasid = arm_smmu_blocking_set_dev_pasid, }; static struct iommu_domain arm_smmu_blocked_domain = { @@ -3636,7 +3637,6 @@ static struct iommu_ops arm_smmu_ops = { .device_group = arm_smmu_device_group, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, - .remove_dev_pasid = arm_smmu_remove_dev_pasid, .dev_enable_feat = arm_smmu_dev_enable_feature, .dev_disable_feat = arm_smmu_dev_disable_feature, .page_response = arm_smmu_page_response, -- Gitee From c6b1086f25e4001e333d95d9fe7803273689a6a1 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 4 Dec 2024 04:29:26 -0800 Subject: [PATCH 190/278] iommu/vt-d: Make the blocked domain support PASID ANBZ: #13617 commit 4f0bdab175d6ea544c97766813db001ba28be3a7 upstream. The blocked domain can be extended to park PASID of a device to be the DMA blocking state. By this the remove_dev_pasid() op is dropped. 
Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241204122928.11987-6-yi.l.liu@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 511e5be98250..02ee216bbe20 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3309,10 +3309,15 @@ static int blocking_domain_attach_dev(struct iommu_domain *domain, return 0; } +static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old); + static struct iommu_domain blocking_domain = { .type = IOMMU_DOMAIN_BLOCKED, .ops = &(const struct iommu_domain_ops) { .attach_dev = blocking_domain_attach_dev, + .set_dev_pasid = blocking_domain_set_dev_pasid, } }; @@ -4157,13 +4162,16 @@ void domain_remove_dev_pasid(struct iommu_domain *domain, kfree(dev_pasid); } -static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, - struct iommu_domain *domain) +static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); intel_pasid_tear_down_entry(info->iommu, dev, pasid, false); - domain_remove_dev_pasid(domain, dev, pasid); + domain_remove_dev_pasid(old, dev, pasid); + + return 0; } struct dev_pasid_info * @@ -4536,7 +4544,6 @@ const struct iommu_ops intel_iommu_ops = { .dev_disable_feat = intel_iommu_dev_disable_feat, .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, - .remove_dev_pasid = intel_iommu_remove_dev_pasid, .pgsize_bitmap = SZ_4K, .page_response = intel_iommu_page_response, .default_domain_ops = &(const struct iommu_domain_ops) { -- Gitee From 01a23fcdf35ef75e7edd3768bcb96e3667cdd2f6 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 4 Dec 2024 04:29:27 -0800 Subject: [PATCH 191/278] iommu/amd: Make the blocked domain support PASID ANBZ: #13617 commit 5f53638882391646e203e064c3ef3cc401745da8 upstream. The blocked domain can be extended to park PASID of a device to be the DMA blocking state. By this the remove_dev_pasid() op is dropped. Remove PASID from old domain and device GCR3 table. No need to attach PASID to the blocked domain as clearing PASID from GCR3 table will make sure all DMAs for that PASID are blocked. 
Suggested-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241204122928.11987-7-yi.l.liu@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b5c87ca3aaf4..a1c2c046a0f0 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2676,10 +2676,19 @@ static int blocked_domain_attach_device(struct iommu_domain *domain, return 0; } +static int blocked_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + amd_iommu_remove_dev_pasid(dev, pasid, old); + return 0; +} + static struct iommu_domain blocked_domain = { .type = IOMMU_DOMAIN_BLOCKED, .ops = &(const struct iommu_domain_ops) { .attach_dev = blocked_domain_attach_device, + .set_dev_pasid = blocked_domain_set_dev_pasid, } }; @@ -3101,7 +3110,6 @@ const struct iommu_ops amd_iommu_ops = { .def_domain_type = amd_iommu_def_domain_type, .dev_enable_feat = amd_iommu_dev_enable_feature, .dev_disable_feat = amd_iommu_dev_disable_feature, - .remove_dev_pasid = amd_iommu_remove_dev_pasid, .page_response = amd_iommu_page_response, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = amd_iommu_attach_device, -- Gitee From 8957ca9613b309a6512bb862ed6cbf613b741587 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 4 Dec 2024 04:29:28 -0800 Subject: [PATCH 192/278] iommu: Remove the remove_dev_pasid op ANBZ: #13617 commit 647b7aad19490a7b90c52c883bda7df299457491 upstream. The iommu drivers that supports PASID have supported attaching pasid to the blocked_domain, hence remove the remove_dev_pasid op from the iommu_ops. 
Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241204122928.11987-8-yi.l.liu@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 17 ++++------------- include/linux/iommu.h | 5 ----- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 723676e41ef0..55a961297e97 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3363,17 +3363,9 @@ static void iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, { const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *blocked_domain = ops->blocked_domain; - int ret = 1; - if (blocked_domain && blocked_domain->ops->set_dev_pasid) { - ret = blocked_domain->ops->set_dev_pasid(blocked_domain, - dev, pasid, domain); - } else { - ops->remove_dev_pasid(dev, pasid, domain); - ret = 0; - } - - WARN_ON(ret); + WARN_ON(blocked_domain->ops->set_dev_pasid(blocked_domain, + dev, pasid, domain)); } static int __iommu_set_group_pasid(struct iommu_domain *domain, @@ -3436,9 +3428,8 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, ops = dev_iommu_ops(dev); if (!domain->ops->set_dev_pasid || - (!ops->remove_dev_pasid && - (!ops->blocked_domain || - !ops->blocked_domain->ops->set_dev_pasid))) + !ops->blocked_domain || + !ops->blocked_domain->ops->set_dev_pasid) return -EOPNOTSUPP; if (ops != domain->owner || pasid == IOMMU_NO_PASID) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1c7fd0122e7a..457bf70ff263 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -595,9 +595,6 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * - IOMMU_DOMAIN_DMA: must use a dma domain * - 0: use the default setting * @default_domain_ops: the default ops for domains - * @remove_dev_pasid: Remove any translation configurations of a specific - * pasid, so that any DMA transactions with this pasid - * will be blocked by the hardware. * @viommu_alloc: Allocate an iommufd_viommu on a physical IOMMU instance behind * the @dev, as the set of virtualization resources shared/passed * to user space IOMMU instance. And associate it with a nesting @@ -655,8 +652,6 @@ struct iommu_ops { struct iommu_page_response *msg); int (*def_domain_type)(struct device *dev); - void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid, - struct iommu_domain *domain); struct iommufd_viommu *(*viommu_alloc)( struct device *dev, struct iommu_domain *parent_domain, -- Gitee From 8fb0ba39c73f9784f855b9df3125d3abb8891db7 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Mon, 2 Dec 2024 14:06:03 +0000 Subject: [PATCH 193/278] iommu/io-pgtable-arm: Fix stage-2 concatenation with 16K ANBZ: #13617 commit 4dcac8407fe1be21990f356e2e8d8309ba63e346 upstream. At the moment, io-pgtable-arm uses concatenation only if it is possible at level 0, which misses a case where concatenation is mandatory at level 1 according to R_SRKBC in Arm spec DDI0487 K.a. Also, that means concatenation can be used when not mandated, contradicting the comment on the code. However, these cases can only happen if the SMMUv3 driver is changed to use ias != oas for stage-2. This patch re-writes the code to use concatenation only if mandatory, fixing the missing case for level-1 and granule 16K with PA = 40 bits. 
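As a worked example of the newly covered case (the numbers assume the standard 16KB LPAE geometry: a 14-bit page offset and 2048 eight-byte entries per table, i.e. 11 bits resolved per level): with ias = oas = 40, the non-concatenated walk starts at level 1 with only a 16-entry PGD (40 - 14 - 2*11 = 4 bits). R_SRKBC instead mandates starting at level 2 with 16 concatenated level-2 tables, so when arm_lpae_concat_mandatory() returns true the setup code adds bits_per_level to pgd_bits (4 -> 15, a 256KB PGD made of sixteen 16KB tables) and bumps start_level from 1 to 2.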
Signed-off-by: Mostafa Saleh Link: https://lore.kernel.org/r/20241202140604.422235-2-smostafa@google.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/io-pgtable-arm.c | 45 +++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 9f15fc09ac06..3abf81e800a9 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -223,6 +223,33 @@ static inline int arm_lpae_max_entries(int i, struct arm_lpae_io_pgtable *data) return ptes_per_table - (i & (ptes_per_table - 1)); } +/* + * Check if concatenated PGDs are mandatory according to Arm DDI0487 (K.a) + * 1) R_DXBSH: For 16KB, and 48-bit input size, use level 1 instead of 0. + * 2) R_SRKBC: After de-ciphering the table for PA size and valid initial lookup + * a) 40 bits PA size with 4K: use level 1 instead of level 0 (2 tables for ias = oas) + * b) 40 bits PA size with 16K: use level 2 instead of level 1 (16 tables for ias = oas) + * c) 42 bits PA size with 4K: use level 1 instead of level 0 (8 tables for ias = oas) + * d) 48 bits PA size with 16K: use level 1 instead of level 0 (2 tables for ias = oas) + */ +static inline bool arm_lpae_concat_mandatory(struct arm_lpae_io_pgtable *data) +{ + unsigned int ias = data->iop.cfg.ias; + unsigned int oas = data->iop.cfg.oas; + + /* Covers 1 and 2.d */ + if ((ARM_LPAE_GRANULE(data) == SZ_16K) && (data->start_level == 0)) + return (oas == 48) || (ias == 48); + + /* Covers 2.a and 2.c */ + if ((ARM_LPAE_GRANULE(data) == SZ_4K) && (data->start_level == 0)) + return (oas == 40) || (oas == 42); + + /* Case 2.b */ + return (ARM_LPAE_GRANULE(data) == SZ_16K) && + (data->start_level == 1) && (oas == 40); +} + static bool selftest_running = false; static dma_addr_t __arm_lpae_dma_addr(void *pages) @@ -1006,18 +1033,12 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie) if (!data) return NULL; - /* - * Concatenate PGDs at level 1 if possible in order to reduce - * the depth of the stage-2 walk. - */ - if (data->start_level == 0) { - unsigned long pgd_pages; - - pgd_pages = ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte); - if (pgd_pages <= ARM_LPAE_S2_MAX_CONCAT_PAGES) { - data->pgd_bits += data->bits_per_level; - data->start_level++; - } + if (arm_lpae_concat_mandatory(data)) { + if (WARN_ON((ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte)) > + ARM_LPAE_S2_MAX_CONCAT_PAGES)) + return NULL; + data->pgd_bits += data->bits_per_level; + data->start_level++; } /* VTCR */ -- Gitee From e7b26d901f5f8e0138b5fba9e70e141f81fc0f81 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Sun, 15 Dec 2024 20:04:11 +0000 Subject: [PATCH 194/278] iommu/io-pgtable-arm: Fix cfg reading in arm_lpae_concat_mandatory() ANBZ: #13617 commit b7b8a63055572f5baa78c1d9d048aad750b02ba5 upstream. The newly introduced arm_lpae_concat_mandatory() function reads the ias/oas fields from the 'io_pgtable_cfg' copy embedded inside the 'arm_lpae_io_pgtable' structure. However, this copy is not set until later in alloc_io_pgtable_ops() after the alloc() function has been called. Use the address sizes passed in the 'io_pgtable_cfg' structure when deciding whether or not to concatenate the PGD. 
Fixes: 4dcac8407fe1 ("iommu/io-pgtable-arm: Fix stage-2 concatenation with 16K") Signed-off-by: Mostafa Saleh Link: https://lore.kernel.org/r/20241215200412.561400-1-smostafa@google.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/io-pgtable-arm.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 3abf81e800a9..4426bf2ee939 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -232,12 +232,13 @@ static inline int arm_lpae_max_entries(int i, struct arm_lpae_io_pgtable *data) * c) 42 bits PA size with 4K: use level 1 instead of level 0 (8 tables for ias = oas) * d) 48 bits PA size with 16K: use level 1 instead of level 0 (2 tables for ias = oas) */ -static inline bool arm_lpae_concat_mandatory(struct arm_lpae_io_pgtable *data) +static inline bool arm_lpae_concat_mandatory(struct io_pgtable_cfg *cfg, + struct arm_lpae_io_pgtable *data) { - unsigned int ias = data->iop.cfg.ias; - unsigned int oas = data->iop.cfg.oas; + unsigned int ias = cfg->ias; + unsigned int oas = cfg->oas; - /* Covers 1 and 2.d */ + /* Covers 1 and 2.d */ if ((ARM_LPAE_GRANULE(data) == SZ_16K) && (data->start_level == 0)) return (oas == 48) || (ias == 48); @@ -1033,7 +1034,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie) if (!data) return NULL; - if (arm_lpae_concat_mandatory(data)) { + if (arm_lpae_concat_mandatory(cfg, data)) { if (WARN_ON((ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte)) > ARM_LPAE_S2_MAX_CONCAT_PAGES)) return NULL; -- Gitee From 15f0728e1dd9ddc39ff1ade22eaaabd6c974e458 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 18 Dec 2024 21:14:21 -0800 Subject: [PATCH 195/278] iommu/tegra241-cmdqv: Read SMMU IDR1.CMDQS instead of hardcoding ANBZ: #13617 commit e94dc6ddda8dd3770879a132d577accd2cce25f9 upstream. The hardware limitation "max=19" actually comes from SMMU Command Queue. So, it'd be more natural for tegra241-cmdqv driver to read it out rather than hardcoding it itself. This is not an issue yet for a kernel on a baremetal system, but a guest kernel setting the queue base/size in form of IPA/gPA might result in a noncontiguous queue in the physical address space, if underlying physical pages backing up the guest RAM aren't contiguous entirely: e.g. 2MB-page backed guest RAM cannot guarantee a contiguous queue if it is 8MB (capped to VCMDQ_LOG2SIZE_MAX=19). This might lead to command errors when HW does linear-read from a noncontiguous queue memory. Adding this extra IDR1.CMDQS cap (in the guest kernel) allows VMM to set SMMU's IDR1.CMDQS=17 for the case mentioned above, so a guest-level queue will be capped to maximum 2MB, ensuring a contiguous queue memory. 
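For scale (assuming the usual 16-byte SMMU command layout, i.e. CMDQ_ENT_DWORDS = 2 64-bit words per command): a queue of 2^19 entries occupies 2^19 * 16 = 8MB, whereas capping IDR1.CMDQS to 17 bounds it to 2^17 * 16 = 2MB, which guest RAM backed by 2MB pages can always supply as one physically contiguous block.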
Fixes: a3799717b881 ("iommu/tegra241-cmdqv: Fix alignment failure at max_n_shift") Reported-by: Ian Kalinowski Cc: stable@vger.kernel.org Signed-off-by: Nicolin Chen Link: https://lore.kernel.org/r/20241219051421.1850267-1-nicolinc@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index 6e41ddaa24d6..d525ab43a4ae 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -79,7 +79,6 @@ #define TEGRA241_VCMDQ_PAGE1(q) (TEGRA241_VCMDQ_PAGE1_BASE + 0x80*(q)) #define VCMDQ_ADDR GENMASK(47, 5) #define VCMDQ_LOG2SIZE GENMASK(4, 0) -#define VCMDQ_LOG2SIZE_MAX 19 #define TEGRA241_VCMDQ_BASE 0x00000 #define TEGRA241_VCMDQ_CONS_INDX_BASE 0x00008 @@ -505,12 +504,15 @@ static int tegra241_vcmdq_alloc_smmu_cmdq(struct tegra241_vcmdq *vcmdq) struct arm_smmu_cmdq *cmdq = &vcmdq->cmdq; struct arm_smmu_queue *q = &cmdq->q; char name[16]; + u32 regval; int ret; snprintf(name, 16, "vcmdq%u", vcmdq->idx); - /* Queue size, capped to ensure natural alignment */ - q->llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, VCMDQ_LOG2SIZE_MAX); + /* Cap queue size to SMMU's IDR1.CMDQS and ensure natural alignment */ + regval = readl_relaxed(smmu->base + ARM_SMMU_IDR1); + q->llq.max_n_shift = + min_t(u32, CMDQ_MAX_SZ_SHIFT, FIELD_GET(IDR1_CMDQS, regval)); /* Use the common helper to init the VCMDQ, and then... */ ret = arm_smmu_init_one_queue(smmu, q, vcmdq->page0, -- Gitee From 93cd1113f9c7673e076b229d07fe882a599405c7 Mon Sep 17 00:00:00 2001 From: Gao Shiyuan Date: Sat, 4 Jan 2025 00:58:08 +0800 Subject: [PATCH 196/278] iommu/amd: remove return value of amd_iommu_detect ANBZ: #13617 commit 5bb494d5cbb9a3403ba8b1c8bc145b42fc119078 upstream. The return value of amd_iommu_detect is not used, so remove it and is consistent with other iommu detect functions. 
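For context (assuming the current x86 init path, after the removal of the IOMMU table infrastructure): amd_iommu_detect() is called from pci_iommu_alloc(), which does not consume the result, and detect_intel_iommu() is already declared void, which is the consistency being referred to.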
Signed-off-by: Gao Shiyuan Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250103165808.80939-1-gaoshiyuan@baidu.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/init.c | 10 ++++------ include/linux/amd-iommu.h | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index cd37e3fa39b5..70134695d917 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3394,25 +3394,23 @@ static bool amd_iommu_sme_check(void) * IOMMUs * ****************************************************************************/ -int __init amd_iommu_detect(void) +void __init amd_iommu_detect(void) { int ret; if (no_iommu || (iommu_detected && !gart_iommu_aperture)) - return -ENODEV; + return; if (!amd_iommu_sme_check()) - return -ENODEV; + return; ret = iommu_go_to_state(IOMMU_IVRS_DETECTED); if (ret) - return ret; + return; amd_iommu_detected = true; iommu_detected = 1; x86_init.iommu.iommu_init = amd_iommu_init; - - return 1; } /**************************************************************************** diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index dc7ed2f46886..ab0b52a552cf 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -31,11 +31,11 @@ struct amd_iommu_pi_data { struct task_struct; struct pci_dev; -extern int amd_iommu_detect(void); +extern void amd_iommu_detect(void); #else /* CONFIG_AMD_IOMMU */ -static inline int amd_iommu_detect(void) { return -ENODEV; } +static inline void amd_iommu_detect(void) { } #endif /* CONFIG_AMD_IOMMU */ -- Gitee From 864df55afd2ed733519f0a94fb24bf8f5b862be7 Mon Sep 17 00:00:00 2001 From: Kees Bakker Date: Tue, 7 Jan 2025 10:17:42 +0800 Subject: [PATCH 197/278] iommu/vt-d: Avoid use of NULL after WARN_ON_ONCE ANBZ: #13617 commit 60f030f7418d3f1d94f2fb207fe3080e1844630b upstream. There is a WARN_ON_ONCE to catch an unlikely situation when domain_remove_dev_pasid can't find the `pasid`. In case it nevertheless happens we must avoid using a NULL pointer. Signed-off-by: Kees Bakker Link: https://lore.kernel.org/r/20241218201048.E544818E57E@bout3.ijzerbout.nl Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 02ee216bbe20..a77c0c455cf6 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4153,13 +4153,14 @@ void domain_remove_dev_pasid(struct iommu_domain *domain, break; } } - WARN_ON_ONCE(!dev_pasid); spin_unlock_irqrestore(&dmar_domain->lock, flags); cache_tag_unassign_domain(dmar_domain, dev, pasid); domain_detach_iommu(dmar_domain, iommu); - intel_iommu_debugfs_remove_dev_pasid(dev_pasid); - kfree(dev_pasid); + if (!WARN_ON_ONCE(!dev_pasid)) { + intel_iommu_debugfs_remove_dev_pasid(dev_pasid); + kfree(dev_pasid); + } } static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, -- Gitee From 616031c31e26499c2516b089897c8e250a88c786 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 7 Jan 2025 10:17:43 +0800 Subject: [PATCH 198/278] iommu/vt-d: Remove domain_alloc_paging() ANBZ: #13617 commit de1dda7e0b60c52204c623f288021b2a22636126 upstream. This is duplicated by intel_iommu_domain_alloc_paging_flags(), just remove it. 
Signed-off-by: Jason Gunthorpe Reviewed-by: Yi Liu Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/0-v1-b101d00c5ee5+17645-vtd_paging_flags_jgg@nvidia.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a77c0c455cf6..e20819c3fd44 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4512,21 +4512,6 @@ static struct iommu_domain identity_domain = { }, }; -static struct iommu_domain *intel_iommu_domain_alloc_paging(struct device *dev) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; - struct dmar_domain *dmar_domain; - bool first_stage; - - first_stage = first_level_by_default(iommu); - dmar_domain = paging_domain_alloc(dev, first_stage); - if (IS_ERR(dmar_domain)) - return ERR_CAST(dmar_domain); - - return &dmar_domain->domain; -} - const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, .release_domain = &blocking_domain, @@ -4535,7 +4520,6 @@ const struct iommu_ops intel_iommu_ops = { .hw_info = intel_iommu_hw_info, .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags, .domain_alloc_sva = intel_svm_domain_alloc, - .domain_alloc_paging = intel_iommu_domain_alloc_paging, .domain_alloc_nested = intel_iommu_domain_alloc_nested, .probe_device = intel_iommu_probe_device, .release_device = intel_iommu_release_device, -- Gitee From f9af596e5a5e83668eb1a06bddc0547898432f44 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 7 Jan 2025 10:17:44 +0800 Subject: [PATCH 199/278] iommu/vt-d: Remove iommu cap audit ANBZ: #13617 commit c2206299401b23bed9ef87f561d6f85f9351aa84 upstream. The capability audit code was introduced by commit "iommu/vt-d: Audit IOMMU Capabilities and add helper functions", aiming to verify the consistency of capabilities across all IOMMUs for supported features. Nowadays, all the kAPIs of the iommu subsystem have evolved to be device oriented, in preparation for supporting heterogeneous IOMMU architectures. There is no longer a need to require capability consistence among IOMMUs for any feature. Remove the iommu cap audit code to make the driver align with the design in the iommu core. 
Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241216071828.22962-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/Makefile | 2 +- drivers/iommu/intel/cap_audit.c | 217 ---------------------------- drivers/iommu/intel/cap_audit.h | 131 ----------------- drivers/iommu/intel/iommu.c | 9 -- drivers/iommu/intel/irq_remapping.c | 8 - 5 files changed, 1 insertion(+), 366 deletions(-) delete mode 100644 drivers/iommu/intel/cap_audit.c delete mode 100644 drivers/iommu/intel/cap_audit.h diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index d3bb0798092d..6c7528130cf9 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DMAR_TABLE) += dmar.o obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o cache.o prq.o -obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o +obj-$(CONFIG_DMAR_TABLE) += trace.o obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c deleted file mode 100644 index 9862dc20b35e..000000000000 --- a/drivers/iommu/intel/cap_audit.c +++ /dev/null @@ -1,217 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * cap_audit.c - audit iommu capabilities for boot time and hot plug - * - * Copyright (C) 2021 Intel Corporation - * - * Author: Kyung Min Park - * Lu Baolu - */ - -#define pr_fmt(fmt) "DMAR: " fmt - -#include "iommu.h" -#include "cap_audit.h" - -static u64 intel_iommu_cap_sanity; -static u64 intel_iommu_ecap_sanity; - -static inline void check_irq_capabilities(struct intel_iommu *a, - struct intel_iommu *b) -{ - CHECK_FEATURE_MISMATCH(a, b, cap, pi_support, CAP_PI_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, eim_support, ECAP_EIM_MASK); -} - -static inline void check_dmar_capabilities(struct intel_iommu *a, - struct intel_iommu *b) -{ - MINIMAL_FEATURE_IOMMU(b, cap, CAP_MAMV_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_NFR_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_SLLPS_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_FRO_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_MGAW_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_SAGAW_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_NDOMS_MASK); - MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_PSS_MASK); - MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_MHMV_MASK); - MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_IRO_MASK); - - CHECK_FEATURE_MISMATCH(a, b, cap, fl5lp_support, CAP_FL5LP_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, fl1gp_support, CAP_FL1GP_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, read_drain, CAP_RD_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, write_drain, CAP_WD_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, pgsel_inv, CAP_PSI_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, zlr, CAP_ZLR_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, caching_mode, CAP_CM_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, phmr, CAP_PHMR_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, plmr, CAP_PLMR_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, rwbf, CAP_RWBF_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, afl, CAP_AFL_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, rps, ECAP_RPS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, smpwc, ECAP_SMPWC_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, flts, ECAP_FLTS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, slts, ECAP_SLTS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, nwfs, ECAP_NWFS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, slads, ECAP_SLADS_MASK); - 
CHECK_FEATURE_MISMATCH(a, b, ecap, smts, ECAP_SMTS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, pds, ECAP_PDS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, dit, ECAP_DIT_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, pasid, ECAP_PASID_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, eafs, ECAP_EAFS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, srs, ECAP_SRS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, ers, ECAP_ERS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, prs, ECAP_PRS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, nest, ECAP_NEST_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, mts, ECAP_MTS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, sc_support, ECAP_SC_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, pass_through, ECAP_PT_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, dev_iotlb_support, ECAP_DT_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, qis, ECAP_QI_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, coherent, ECAP_C_MASK); -} - -static int cap_audit_hotplug(struct intel_iommu *iommu, enum cap_audit_type type) -{ - bool mismatch = false; - u64 old_cap = intel_iommu_cap_sanity; - u64 old_ecap = intel_iommu_ecap_sanity; - - if (type == CAP_AUDIT_HOTPLUG_IRQR) { - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pi_support, CAP_PI_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eim_support, ECAP_EIM_MASK); - goto out; - } - - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, fl5lp_support, CAP_FL5LP_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, fl1gp_support, CAP_FL1GP_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, read_drain, CAP_RD_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, write_drain, CAP_WD_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pgsel_inv, CAP_PSI_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, zlr, CAP_ZLR_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, caching_mode, CAP_CM_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, phmr, CAP_PHMR_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, plmr, CAP_PLMR_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, rwbf, CAP_RWBF_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, afl, CAP_AFL_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, rps, ECAP_RPS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smpwc, ECAP_SMPWC_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, flts, ECAP_FLTS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slts, ECAP_SLTS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nwfs, ECAP_NWFS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slads, ECAP_SLADS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smts, ECAP_SMTS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pds, ECAP_PDS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dit, ECAP_DIT_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pasid, ECAP_PASID_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eafs, ECAP_EAFS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, srs, ECAP_SRS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, ers, ECAP_ERS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, prs, ECAP_PRS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nest, ECAP_NEST_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, mts, ECAP_MTS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, sc_support, ECAP_SC_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pass_through, ECAP_PT_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dev_iotlb_support, ECAP_DT_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, qis, ECAP_QI_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, coherent, ECAP_C_MASK); - - /* 
Abort hot plug if the hot plug iommu feature is smaller than global */ - MINIMAL_FEATURE_HOTPLUG(iommu, cap, max_amask_val, CAP_MAMV_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, num_fault_regs, CAP_NFR_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, super_page_val, CAP_SLLPS_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, fault_reg_offset, CAP_FRO_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, mgaw, CAP_MGAW_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, sagaw, CAP_SAGAW_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, ndoms, CAP_NDOMS_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, ecap, pss, ECAP_PSS_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, ecap, max_handle_mask, ECAP_MHMV_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, ecap, iotlb_offset, ECAP_IRO_MASK, mismatch); - -out: - if (mismatch) { - intel_iommu_cap_sanity = old_cap; - intel_iommu_ecap_sanity = old_ecap; - return -EFAULT; - } - - return 0; -} - -static int cap_audit_static(struct intel_iommu *iommu, enum cap_audit_type type) -{ - struct dmar_drhd_unit *d; - struct intel_iommu *i; - int rc = 0; - - rcu_read_lock(); - if (list_empty(&dmar_drhd_units)) - goto out; - - for_each_active_iommu(i, d) { - if (!iommu) { - intel_iommu_ecap_sanity = i->ecap; - intel_iommu_cap_sanity = i->cap; - iommu = i; - continue; - } - - if (type == CAP_AUDIT_STATIC_DMAR) - check_dmar_capabilities(iommu, i); - else - check_irq_capabilities(iommu, i); - } - - /* - * If the system is sane to support scalable mode, either SL or FL - * should be sane. - */ - if (intel_cap_smts_sanity() && - !intel_cap_flts_sanity() && !intel_cap_slts_sanity()) - rc = -EOPNOTSUPP; - -out: - rcu_read_unlock(); - return rc; -} - -int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu) -{ - switch (type) { - case CAP_AUDIT_STATIC_DMAR: - case CAP_AUDIT_STATIC_IRQR: - return cap_audit_static(iommu, type); - case CAP_AUDIT_HOTPLUG_DMAR: - case CAP_AUDIT_HOTPLUG_IRQR: - return cap_audit_hotplug(iommu, type); - default: - break; - } - - return -EFAULT; -} - -bool intel_cap_smts_sanity(void) -{ - return ecap_smts(intel_iommu_ecap_sanity); -} - -bool intel_cap_pasid_sanity(void) -{ - return ecap_pasid(intel_iommu_ecap_sanity); -} - -bool intel_cap_nest_sanity(void) -{ - return ecap_nest(intel_iommu_ecap_sanity); -} - -bool intel_cap_flts_sanity(void) -{ - return ecap_flts(intel_iommu_ecap_sanity); -} - -bool intel_cap_slts_sanity(void) -{ - return ecap_slts(intel_iommu_ecap_sanity); -} diff --git a/drivers/iommu/intel/cap_audit.h b/drivers/iommu/intel/cap_audit.h deleted file mode 100644 index d07b75938961..000000000000 --- a/drivers/iommu/intel/cap_audit.h +++ /dev/null @@ -1,131 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * cap_audit.h - audit iommu capabilities header - * - * Copyright (C) 2021 Intel Corporation - * - * Author: Kyung Min Park - */ - -/* - * Capability Register Mask - */ -#define CAP_FL5LP_MASK BIT_ULL(60) -#define CAP_PI_MASK BIT_ULL(59) -#define CAP_FL1GP_MASK BIT_ULL(56) -#define CAP_RD_MASK BIT_ULL(55) -#define CAP_WD_MASK BIT_ULL(54) -#define CAP_MAMV_MASK GENMASK_ULL(53, 48) -#define CAP_NFR_MASK GENMASK_ULL(47, 40) -#define CAP_PSI_MASK BIT_ULL(39) -#define CAP_SLLPS_MASK GENMASK_ULL(37, 34) -#define CAP_FRO_MASK GENMASK_ULL(33, 24) -#define CAP_ZLR_MASK BIT_ULL(22) -#define CAP_MGAW_MASK GENMASK_ULL(21, 16) -#define CAP_SAGAW_MASK GENMASK_ULL(12, 8) -#define CAP_CM_MASK BIT_ULL(7) -#define CAP_PHMR_MASK BIT_ULL(6) -#define CAP_PLMR_MASK BIT_ULL(5) -#define CAP_RWBF_MASK 
BIT_ULL(4) -#define CAP_AFL_MASK BIT_ULL(3) -#define CAP_NDOMS_MASK GENMASK_ULL(2, 0) - -/* - * Extended Capability Register Mask - */ -#define ECAP_RPS_MASK BIT_ULL(49) -#define ECAP_SMPWC_MASK BIT_ULL(48) -#define ECAP_FLTS_MASK BIT_ULL(47) -#define ECAP_SLTS_MASK BIT_ULL(46) -#define ECAP_SLADS_MASK BIT_ULL(45) -#define ECAP_VCS_MASK BIT_ULL(44) -#define ECAP_SMTS_MASK BIT_ULL(43) -#define ECAP_PDS_MASK BIT_ULL(42) -#define ECAP_DIT_MASK BIT_ULL(41) -#define ECAP_PASID_MASK BIT_ULL(40) -#define ECAP_PSS_MASK GENMASK_ULL(39, 35) -#define ECAP_EAFS_MASK BIT_ULL(34) -#define ECAP_NWFS_MASK BIT_ULL(33) -#define ECAP_SRS_MASK BIT_ULL(31) -#define ECAP_ERS_MASK BIT_ULL(30) -#define ECAP_PRS_MASK BIT_ULL(29) -#define ECAP_NEST_MASK BIT_ULL(26) -#define ECAP_MTS_MASK BIT_ULL(25) -#define ECAP_MHMV_MASK GENMASK_ULL(23, 20) -#define ECAP_IRO_MASK GENMASK_ULL(17, 8) -#define ECAP_SC_MASK BIT_ULL(7) -#define ECAP_PT_MASK BIT_ULL(6) -#define ECAP_EIM_MASK BIT_ULL(4) -#define ECAP_DT_MASK BIT_ULL(2) -#define ECAP_QI_MASK BIT_ULL(1) -#define ECAP_C_MASK BIT_ULL(0) - -/* - * u64 intel_iommu_cap_sanity, intel_iommu_ecap_sanity will be adjusted as each - * IOMMU gets audited. - */ -#define DO_CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \ -do { \ - if (cap##_##feature(a) != cap##_##feature(b)) { \ - intel_iommu_##cap##_sanity &= ~(MASK); \ - pr_info("IOMMU feature %s inconsistent", #feature); \ - } \ -} while (0) - -#define CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \ - DO_CHECK_FEATURE_MISMATCH((a)->cap, (b)->cap, cap, feature, MASK) - -#define CHECK_FEATURE_MISMATCH_HOTPLUG(b, cap, feature, MASK) \ -do { \ - if (cap##_##feature(intel_iommu_##cap##_sanity)) \ - DO_CHECK_FEATURE_MISMATCH(intel_iommu_##cap##_sanity, \ - (b)->cap, cap, feature, MASK); \ -} while (0) - -#define MINIMAL_FEATURE_IOMMU(iommu, cap, MASK) \ -do { \ - u64 min_feature = intel_iommu_##cap##_sanity & (MASK); \ - min_feature = min_t(u64, min_feature, (iommu)->cap & (MASK)); \ - intel_iommu_##cap##_sanity = (intel_iommu_##cap##_sanity & ~(MASK)) | \ - min_feature; \ -} while (0) - -#define MINIMAL_FEATURE_HOTPLUG(iommu, cap, feature, MASK, mismatch) \ -do { \ - if ((intel_iommu_##cap##_sanity & (MASK)) > \ - (cap##_##feature((iommu)->cap))) \ - mismatch = true; \ - else \ - (iommu)->cap = ((iommu)->cap & ~(MASK)) | \ - (intel_iommu_##cap##_sanity & (MASK)); \ -} while (0) - -enum cap_audit_type { - CAP_AUDIT_STATIC_DMAR, - CAP_AUDIT_STATIC_IRQR, - CAP_AUDIT_HOTPLUG_DMAR, - CAP_AUDIT_HOTPLUG_IRQR, -}; - -bool intel_cap_smts_sanity(void); -bool intel_cap_pasid_sanity(void); -bool intel_cap_nest_sanity(void); -bool intel_cap_flts_sanity(void); -bool intel_cap_slts_sanity(void); - -static inline bool scalable_mode_support(void) -{ - return (intel_iommu_sm && intel_cap_smts_sanity()); -} - -static inline bool pasid_mode_support(void) -{ - return scalable_mode_support() && intel_cap_pasid_sanity(); -} - -static inline bool nested_mode_support(void) -{ - return scalable_mode_support() && intel_cap_nest_sanity(); -} - -int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e20819c3fd44..23460dccd7eb 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -29,7 +29,6 @@ #include "../irq_remapping.h" #include "../iommu-pages.h" #include "pasid.h" -#include "cap_audit.h" #include "perfmon.h" #define ROOT_SIZE VTD_PAGE_SIZE @@ -2118,10 +2117,6 @@ static int __init init_dmars(void) struct intel_iommu *iommu; int ret; - 
ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); - if (ret) - goto free_iommu; - for_each_iommu(iommu, drhd) { if (drhd->ignored) { iommu_disable_translation(iommu); @@ -2617,10 +2612,6 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) struct intel_iommu *iommu = dmaru->iommu; int ret; - ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); - if (ret) - goto out; - /* * Disable translation if already enabled prior to OS handover. */ diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index cd2ca9fbbdd2..1cfc33f139bd 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -24,7 +24,6 @@ #include "iommu.h" #include "../irq_remapping.h" #include "../iommu-pages.h" -#include "cap_audit.h" enum irq_mode { IRQ_REMAPPING, @@ -731,9 +730,6 @@ static int __init intel_prepare_irq_remapping(void) if (dmar_table_init() < 0) return -ENODEV; - if (intel_cap_audit(CAP_AUDIT_STATIC_IRQR, NULL)) - return -ENODEV; - if (!dmar_ir_support()) return -ENODEV; @@ -1547,10 +1543,6 @@ static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu) int ret; int eim = x2apic_enabled(); - ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_IRQR, iommu); - if (ret) - return ret; - if (eim && !ecap_eim_support(iommu->ecap)) { pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n", iommu->reg_phys, iommu->ecap); -- Gitee From 1e5f007ead4912622a7a2d153cccdc853755f43c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 7 Jan 2025 10:17:45 +0800 Subject: [PATCH 200/278] iommu/vt-d: Draining PRQ in sva unbind path when FPD bit set ANBZ: #13617 commit cf08ca81d08a04b3b304e8fb4e052f323a09783d upstream. When a device uses a PASID for SVA (Shared Virtual Address), it's possible that the PASID entry is marked as non-present and FPD bit set before the device flushes all ongoing DMA requests and removes the SVA domain. This can occur when an exception happens and the process terminates before the device driver stops DMA and calls the iommu driver to unbind the PASID. There's no need to drain the PRQ in the mm release path. Instead, the PRQ will be drained in the SVA unbind path. But in such case, intel_pasid_tear_down_entry() only checks the presence of the pasid entry and returns directly. Add the code to clear the FPD bit and drain the PRQ. Fixes: c43e1ccdebf2 ("iommu/vt-d: Drain PRQs when domain removed from RID") Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241217024240.139615-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/pasid.c | 22 +++++++++++++++++++++- drivers/iommu/intel/pasid.h | 6 ++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 5b7d85f1e143..fb59a7d35958 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -244,11 +244,31 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, spin_lock(&iommu->lock); pte = intel_pasid_get_entry(dev, pasid); - if (WARN_ON(!pte) || !pasid_pte_is_present(pte)) { + if (WARN_ON(!pte)) { spin_unlock(&iommu->lock); return; } + if (!pasid_pte_is_present(pte)) { + if (!pasid_pte_is_fault_disabled(pte)) { + WARN_ON(READ_ONCE(pte->val[0]) != 0); + spin_unlock(&iommu->lock); + return; + } + + /* + * When a PASID is used for SVA by a device, it's possible + * that the pasid entry is non-present with the Fault + * Processing Disabled bit set. 
Clear the pasid entry and + * drain the PRQ for the PASID before return. + */ + pasid_clear_entry(pte); + spin_unlock(&iommu->lock); + intel_iommu_drain_pasid_prq(dev, pasid); + + return; + } + did = pasid_get_domain_id(pte); pgtt = pasid_pte_get_pgtt(pte); intel_pasid_clear_entry(dev, pasid, fault_ignore); diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 082f4fe20216..668d8ece6b14 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -73,6 +73,12 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte) return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT; } +/* Get FPD(Fault Processing Disable) bit of a PASID table entry */ +static inline bool pasid_pte_is_fault_disabled(struct pasid_entry *pte) +{ + return READ_ONCE(pte->val[0]) & PASID_PTE_FPD; +} + /* Get PGTT field of a PASID table entry */ static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) { -- Gitee From e9d5a585e07afe82baeb699a82752cfca3e2e9e3 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 7 Jan 2025 10:17:46 +0800 Subject: [PATCH 201/278] iommu/vt-d: Link cache tags of same iommu unit together ANBZ: #13617 commit acf5d49aaf862333a7139adff52a6b153af2853a upstream. Cache tag invalidation requests for a domain are accumulated until a different iommu unit is found when traversing the cache_tags linked list. But cache tags of same iommu unit can be distributed in the linked list, this make batched flush less efficient. E.g., one device backed by iommu0 is attached to a domain in between two devices attaching backed by iommu1. Group cache tags together for same iommu unit in cache_tag_assign() to maximize the performance of batched flush. Co-developed-by: Lu Baolu Signed-off-by: Lu Baolu Signed-off-by: Zhenzhong Duan Link: https://lore.kernel.org/r/20241219054358.8654-1-zhenzhong.duan@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/cache.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c index 09694cca8752..fc35cba59145 100644 --- a/drivers/iommu/intel/cache.c +++ b/drivers/iommu/intel/cache.c @@ -47,6 +47,7 @@ static int cache_tag_assign(struct dmar_domain *domain, u16 did, struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct cache_tag *tag, *temp; + struct list_head *prev; unsigned long flags; tag = kzalloc(sizeof(*tag), GFP_KERNEL); @@ -65,6 +66,7 @@ static int cache_tag_assign(struct dmar_domain *domain, u16 did, tag->dev = iommu->iommu.dev; spin_lock_irqsave(&domain->cache_lock, flags); + prev = &domain->cache_tags; list_for_each_entry(temp, &domain->cache_tags, node) { if (cache_tage_match(temp, did, iommu, dev, pasid, type)) { temp->users++; @@ -73,8 +75,15 @@ static int cache_tag_assign(struct dmar_domain *domain, u16 did, trace_cache_tag_assign(temp); return 0; } + if (temp->iommu == iommu) + prev = &temp->node; } - list_add_tail(&tag->node, &domain->cache_tags); + /* + * Link cache tags of same iommu unit together, so corresponding + * flush ops can be batched for iommu unit. + */ + list_add(&tag->node, prev); + spin_unlock_irqrestore(&domain->cache_lock, flags); trace_cache_tag_assign(tag); -- Gitee From 4b69058bc85da46b6ced09e5ba64c5b48cd5b211 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 10 Dec 2024 08:51:19 -0800 Subject: [PATCH 202/278] iommu/io-pgtable-arm: Make pgtable walker more generic ANBZ: #13617 commit 821500d5c59737708b0959a6c71328ae5ba070ca upstream. 
We can re-use this basic pgtable walk logic in a few places. Signed-off-by: Rob Clark Reviewed-by: Mostafa Saleh Link: https://lore.kernel.org/r/20241210165127.600817-2-robdclark@gmail.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/io-pgtable-arm.c | 67 ++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 4426bf2ee939..36648f4e097f 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -741,33 +741,33 @@ static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, } struct io_pgtable_walk_data { - struct iommu_dirty_bitmap *dirty; + struct io_pgtable *iop; + void *data; + int (*visit)(struct io_pgtable_walk_data *walk_data, int lvl, + arm_lpae_iopte *ptep, size_t size); unsigned long flags; u64 addr; const u64 end; }; -static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data, - struct io_pgtable_walk_data *walk_data, - arm_lpae_iopte *ptep, - int lvl); +static int __arm_lpae_iopte_walk(struct arm_lpae_io_pgtable *data, + struct io_pgtable_walk_data *walk_data, + arm_lpae_iopte *ptep, + int lvl); -static int io_pgtable_visit_dirty(struct arm_lpae_io_pgtable *data, - struct io_pgtable_walk_data *walk_data, - arm_lpae_iopte *ptep, int lvl) +static int io_pgtable_visit(struct arm_lpae_io_pgtable *data, + struct io_pgtable_walk_data *walk_data, + arm_lpae_iopte *ptep, int lvl) { struct io_pgtable *iop = &data->iop; arm_lpae_iopte pte = READ_ONCE(*ptep); - if (iopte_leaf(pte, lvl, iop->fmt)) { - size_t size = ARM_LPAE_BLOCK_SIZE(lvl, data); + size_t size = ARM_LPAE_BLOCK_SIZE(lvl, data); + int ret = walk_data->visit(walk_data, lvl, ptep, size); + if (ret) + return ret; - if (iopte_writeable_dirty(pte)) { - iommu_dirty_bitmap_record(walk_data->dirty, - walk_data->addr, size); - if (!(walk_data->flags & IOMMU_DIRTY_NO_CLEAR)) - iopte_set_writeable_clean(ptep); - } + if (iopte_leaf(pte, lvl, iop->fmt)) { walk_data->addr += size; return 0; } @@ -776,13 +776,13 @@ static int io_pgtable_visit_dirty(struct arm_lpae_io_pgtable *data, return -EINVAL; ptep = iopte_deref(pte, data); - return __arm_lpae_iopte_walk_dirty(data, walk_data, ptep, lvl + 1); + return __arm_lpae_iopte_walk(data, walk_data, ptep, lvl + 1); } -static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data, - struct io_pgtable_walk_data *walk_data, - arm_lpae_iopte *ptep, - int lvl) +static int __arm_lpae_iopte_walk(struct arm_lpae_io_pgtable *data, + struct io_pgtable_walk_data *walk_data, + arm_lpae_iopte *ptep, + int lvl) { u32 idx; int max_entries, ret; @@ -797,7 +797,7 @@ static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data, for (idx = ARM_LPAE_LVL_IDX(walk_data->addr, lvl, data); (idx < max_entries) && (walk_data->addr < walk_data->end); ++idx) { - ret = io_pgtable_visit_dirty(data, walk_data, ptep + idx, lvl); + ret = io_pgtable_visit(data, walk_data, ptep + idx, lvl); if (ret) return ret; } @@ -805,6 +805,23 @@ static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data, return 0; } +static int visit_dirty(struct io_pgtable_walk_data *walk_data, int lvl, + arm_lpae_iopte *ptep, size_t size) +{ + struct iommu_dirty_bitmap *dirty = walk_data->data; + + if (!iopte_leaf(*ptep, lvl, walk_data->iop->fmt)) + return 0; + + if (iopte_writeable_dirty(*ptep)) { + iommu_dirty_bitmap_record(dirty, walk_data->addr, size); + if (!(walk_data->flags & IOMMU_DIRTY_NO_CLEAR)) + iopte_set_writeable_clean(ptep); + } + 
+ return 0; +} + static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops, unsigned long iova, size_t size, unsigned long flags, @@ -813,7 +830,9 @@ static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops, struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); struct io_pgtable_cfg *cfg = &data->iop.cfg; struct io_pgtable_walk_data walk_data = { - .dirty = dirty, + .iop = &data->iop, + .data = dirty, + .visit = visit_dirty, .flags = flags, .addr = iova, .end = iova + size, @@ -828,7 +847,7 @@ static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops, if (data->iop.fmt != ARM_64_LPAE_S1) return -EINVAL; - return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl); + return __arm_lpae_iopte_walk(data, &walk_data, ptep, lvl); } static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg) -- Gitee From 8bab99b9661fbb2611d370190002a86301197937 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 10 Dec 2024 08:51:20 -0800 Subject: [PATCH 203/278] iommu/io-pgtable-arm: Re-use the pgtable walk for iova_to_phys ANBZ: #13617 commit d9e589e6ad73e5e13f8ed5df9a3e4049001a95c1 upstream. Re-use the generic pgtable walk path. Signed-off-by: Rob Clark Reviewed-by: Mostafa Saleh Link: https://lore.kernel.org/r/20241210165127.600817-3-robdclark@gmail.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/io-pgtable-arm.c | 74 +++++++++++++++++----------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 36648f4e097f..792f2b0daf0d 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -704,42 +704,6 @@ static size_t arm_lpae_unmap_pages(struct io_pgtable_ops *ops, unsigned long iov data->start_level, ptep); } -static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, - unsigned long iova) -{ - struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); - arm_lpae_iopte pte, *ptep = data->pgd; - int lvl = data->start_level; - - do { - /* Valid IOPTE pointer? */ - if (!ptep) - return 0; - - /* Grab the IOPTE we're interested in */ - ptep += ARM_LPAE_LVL_IDX(iova, lvl, data); - pte = READ_ONCE(*ptep); - - /* Valid entry? */ - if (!pte) - return 0; - - /* Leaf entry? 
*/ - if (iopte_leaf(pte, lvl, data->iop.fmt)) - goto found_translation; - - /* Take it to the next level */ - ptep = iopte_deref(pte, data); - } while (++lvl < ARM_LPAE_MAX_LEVELS); - - /* Ran out of page tables to walk */ - return 0; - -found_translation: - iova &= (ARM_LPAE_BLOCK_SIZE(lvl, data) - 1); - return iopte_to_paddr(pte, data) | iova; -} - struct io_pgtable_walk_data { struct io_pgtable *iop; void *data; @@ -755,6 +719,41 @@ static int __arm_lpae_iopte_walk(struct arm_lpae_io_pgtable *data, arm_lpae_iopte *ptep, int lvl); +struct iova_to_phys_data { + arm_lpae_iopte pte; + int lvl; +}; + +static int visit_iova_to_phys(struct io_pgtable_walk_data *walk_data, int lvl, + arm_lpae_iopte *ptep, size_t size) +{ + struct iova_to_phys_data *data = walk_data->data; + data->pte = *ptep; + data->lvl = lvl; + return 0; +} + +static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, + unsigned long iova) +{ + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); + struct iova_to_phys_data d; + struct io_pgtable_walk_data walk_data = { + .data = &d, + .visit = visit_iova_to_phys, + .addr = iova, + .end = iova + 1, + }; + int ret; + + ret = __arm_lpae_iopte_walk(data, &walk_data, data->pgd, data->start_level); + if (ret) + return 0; + + iova &= (ARM_LPAE_BLOCK_SIZE(d.lvl, data) - 1); + return iopte_to_paddr(d.pte, data) | iova; +} + static int io_pgtable_visit(struct arm_lpae_io_pgtable *data, struct io_pgtable_walk_data *walk_data, arm_lpae_iopte *ptep, int lvl) @@ -772,8 +771,9 @@ static int io_pgtable_visit(struct arm_lpae_io_pgtable *data, return 0; } - if (WARN_ON(!iopte_table(pte, lvl))) + if (!iopte_table(pte, lvl)) { return -EINVAL; + } ptep = iopte_deref(pte, data); return __arm_lpae_iopte_walk(data, walk_data, ptep, lvl + 1); -- Gitee From 70c0ffd47155c4949c2246bf37ab6e337ef4eef6 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 10 Dec 2024 08:51:21 -0800 Subject: [PATCH 204/278] iommu/io-pgtable-arm: Add way to debug pgtable walk ANBZ: #13617 commit aff028a8192d056d346541c5fc7d88c0eb43412c upstream. Add an io-pgtable method to walk the pgtable returning the raw PTEs that would be traversed for a given iova access. 
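A possible debugging use, sketched with a hypothetical helper that is not part of this patch (it only relies on the op signature and the arm_lpae_io_pgtable_walk_data struct introduced here):

/* Hypothetical helper: dump the raw PTEs traversed for a faulting IOVA. */
static void example_dump_walk(struct io_pgtable_ops *ops, unsigned long iova)
{
	struct arm_lpae_io_pgtable_walk_data wd = {};
	int lvl;

	if (!ops->pgtable_walk || ops->pgtable_walk(ops, iova, &wd))
		return;

	for (lvl = 0; lvl < ARRAY_SIZE(wd.ptes); lvl++)
		pr_info("iova %#lx lvl %d: pte %#llx\n", iova, lvl, wd.ptes[lvl]);
}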
Signed-off-by: Rob Clark Reviewed-by: Mostafa Saleh Link: https://lore.kernel.org/r/20241210165127.600817-4-robdclark@gmail.com [will: Removed 'arm_lpae_io_pgtable_walk_data::level' per Mostafa] Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/io-pgtable-arm.c | 23 +++++++++++++++++++++++ include/linux/io-pgtable.h | 11 +++++++++++ 2 files changed, 34 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 792f2b0daf0d..8bc8bf45e795 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -754,6 +754,28 @@ static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, return iopte_to_paddr(d.pte, data) | iova; } +static int visit_pgtable_walk(struct io_pgtable_walk_data *walk_data, int lvl, + arm_lpae_iopte *ptep, size_t size) +{ + struct arm_lpae_io_pgtable_walk_data *data = walk_data->data; + data->ptes[lvl] = *ptep; + return 0; +} + +static int arm_lpae_pgtable_walk(struct io_pgtable_ops *ops, unsigned long iova, + void *wd) +{ + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); + struct io_pgtable_walk_data walk_data = { + .data = wd, + .visit = visit_pgtable_walk, + .addr = iova, + .end = iova + 1, + }; + + return __arm_lpae_iopte_walk(data, &walk_data, data->pgd, data->start_level); +} + static int io_pgtable_visit(struct arm_lpae_io_pgtable *data, struct io_pgtable_walk_data *walk_data, arm_lpae_iopte *ptep, int lvl) @@ -929,6 +951,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg) .unmap_pages = arm_lpae_unmap_pages, .iova_to_phys = arm_lpae_iova_to_phys, .read_and_clear_dirty = arm_lpae_read_and_clear_dirty, + .pgtable_walk = arm_lpae_pgtable_walk, }; return data; diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index ce86b09ae80f..bba2a51c87d2 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -180,12 +180,22 @@ struct io_pgtable_cfg { }; }; +/** + * struct arm_lpae_io_pgtable_walk_data - information from a pgtable walk + * + * @ptes: The recorded PTE values from the walk + */ +struct arm_lpae_io_pgtable_walk_data { + u64 ptes[4]; +}; + /** * struct io_pgtable_ops - Page table manipulation API for IOMMU drivers. * * @map_pages: Map a physically contiguous range of pages of the same size. * @unmap_pages: Unmap a range of virtually contiguous pages of the same size. * @iova_to_phys: Translate iova to physical address. + * @pgtable_walk: (optional) Perform a page table walk for a given iova. * * These functions map directly onto the iommu_ops member functions with * the same names. @@ -199,6 +209,7 @@ struct io_pgtable_ops { struct iommu_iotlb_gather *gather); phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops, unsigned long iova); + int (*pgtable_walk)(struct io_pgtable_ops *ops, unsigned long iova, void *wd); int (*read_and_clear_dirty)(struct io_pgtable_ops *ops, unsigned long iova, size_t size, unsigned long flags, -- Gitee From e854c2eff8fb1b3707640cf3e7fb5f5aa28918bc Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Tue, 7 Jan 2025 16:51:00 +0000 Subject: [PATCH 205/278] iommu/arm-smmu-v3: Use str_read_write helper w/ logs ANBZ: #13617 commit f2c77f6e41e68e1b24c165acbf6d4da6b3117e23 upstream. Adopt the `str_read_write` helper in event logging as suggested by the coccinelle tool. 
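For reference, the helper from <linux/string_choices.h> is essentially: static inline const char *str_read_write(bool v) { return v ? "read" : "write"; } so the logged strings are unchanged and only the open-coded ternary goes away.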
Signed-off-by: Pranjal Shrivastava Reviewed-by: Nicolin Chen Link: https://lore.kernel.org/all/20250107130053.GC6991@willie-the-truck/ Link: https://lore.kernel.org/r/20250107165100.1093357-1-praan@google.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 29dd2785c97a..be37d5746f6b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1907,7 +1907,7 @@ static void arm_smmu_dump_event(struct arm_smmu_device *smmu, u64 *raw, dev_err(smmu->dev, "%s %s %s %s \"%s\"%s%s stag: %#x", evt->privileged ? "priv" : "unpriv", evt->instruction ? "inst" : "data", - evt->read ? "read" : "write", + str_read_write(evt->read), evt->s2 ? "s2" : "s1", event_class_str[evt->class], evt->class_tt ? (evt->ttrnw ? " ttd_read" : " ttd_write") : "", evt->stall ? " stall" : "", evt->stag); -- Gitee From 096a17b71518625df287ba83582262cb53c32d1e Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 2 Oct 2024 20:53:31 -0700 Subject: [PATCH 206/278] iommu/hyper-v: Don't assume cpu_possible_mask is dense ANBZ: #13617 commit 4f6b64f3d3d96fb3796614362c64a4b73ddf3f7a upstream. Current code gets the APIC IDs for CPUs numbered 255 and lower. This code assumes cpu_possible_mask is dense, which is not true in the general case per [1]. If cpu_possible_mask contains holes, num_possible_cpus() is less than nr_cpu_ids, so some CPUs might get skipped. Furthermore, getting the APIC ID of a CPU that isn't in cpu_possible_mask is invalid. However, the configurations that Hyper-V provides to guest VMs on x86 hardware, in combination with how x86 code assigns Linux CPU numbers, *does* always produce a dense cpu_possible_mask. So the dense assumption is not currently causing failures. But for robustness against future changes in how cpu_possible_mask is populated, update the code to no longer assume dense. The correct approach is to determine the range to scan based on nr_cpu_ids, and skip any CPUs that are not in the cpu_possible_mask. [1] https://lore.kernel.org/lkml/SN6PR02MB4157210CC36B2593F8572E5ED4692@SN6PR02MB4157.namprd02.prod.outlook.com/ Signed-off-by: Michael Kelley Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241003035333.49261-4-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20241003035333.49261-4-mhklinux@outlook.com> Signed-off-by: Jay Chen --- drivers/iommu/hyperv-iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index 8a5c17b97310..2a86aa5d54c6 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -164,8 +164,8 @@ static int __init hyperv_prepare_irq_remapping(void) * max cpu affinity for IOAPIC irqs. Scan cpu 0-255 and set cpu * into ioapic_max_cpumask if its APIC ID is less than 256. 
*/ - for (i = min_t(unsigned int, num_possible_cpus() - 1, 255); i >= 0; i--) - if (cpu_physical_id(i) < 256) + for (i = min_t(unsigned int, nr_cpu_ids - 1, 255); i >= 0; i--) + if (cpu_possible(i) && cpu_physical_id(i) < 256) cpumask_set_cpu(i, &ioapic_max_cpumask); return 0; -- Gitee From 3361c261708ac18c4e4fea58adc0c36101c51138 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 10 Jan 2025 13:18:55 +0000 Subject: [PATCH 207/278] iommu/arm-smmu-v3: Add missing #include of linux/string_choices.h ANBZ: #13617 commit 1f3dc29d2445c89c85e451c02e41a8e0cd22423c upstream. Commit f2c77f6e41e6 ("iommu/arm-smmu-v3: Use str_read_write helper w/ logs") introduced a call to str_read_write() in the SMMUv3 driver but without an explicit #include of . This breaks the build for custom configurations where CONFIG_ACPI=n: drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c:1909:4: error: call to undeclared function 'str_read_write'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] 1909 | str_read_write(evt->read), | ^ Add the missing #include. Link: https://lore.kernel.org/r/d07e82a4-2880-4ae3-961b-471bfa7ac6c4@samsung.com Reported-by: Marek Szyprowski Fixes: f2c77f6e41e6 ("iommu/arm-smmu-v3: Use str_read_write helper w/ logs") Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index be37d5746f6b..2ffb4a0ea762 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include -- Gitee From 4a422e6e87b6c7012360105fc8dab90493f04eb9 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 7 Jan 2025 09:10:04 -0800 Subject: [PATCH 208/278] iommufd: Keep OBJ/IOCTL lists in an alphabetical order ANBZ: #13617 commit 442003f3a842dc374b1c706187778c3d57b84c23 upstream. Reorder the existing OBJ/IOCTL lists. Also run clang-format for the same coding style at line wrappings. No functional change. 
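Returning to the Hyper-V change above: the robust idiom when cpu_possible_mask may be sparse is to bound the scan by nr_cpu_ids and filter each index with cpu_possible(), rather than trusting num_possible_cpus() as an upper bound. A stand-alone sketch of that idiom (illustrative only, not the driver code):

#include <linux/cpumask.h>
#include <linux/minmax.h>

/* Visit every possible CPU numbered 255 or below, even when the
 * possible mask has holes; num_possible_cpus() could undercount. */
static void scan_low_numbered_cpus(void)
{
        int i;

        for (i = min_t(unsigned int, nr_cpu_ids - 1, 255); i >= 0; i--) {
                if (!cpu_possible(i))
                        continue;
                /* ... per-CPU work, e.g. reading its APIC ID ... */
        }
}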
Link: https://patch.msgid.link/r/c5e6d6e0a0bb7abc92ad26937fde19c9426bee96.1736237481.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/main.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 0a96cc8f27da..bffecd62775d 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -307,9 +307,9 @@ union ucmd_buffer { struct iommu_ioas_map map; struct iommu_ioas_unmap unmap; struct iommu_option option; + struct iommu_vdevice_alloc vdev; struct iommu_vfio_ioas vfio_ioas; struct iommu_viommu_alloc viommu; - struct iommu_vdevice_alloc vdev; #ifdef CONFIG_IOMMUFD_TEST struct iommu_test_cmd test; #endif @@ -333,8 +333,8 @@ struct iommufd_ioctl_op { } static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), - IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc, struct iommu_fault_alloc, - out_fault_fd), + IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc, + struct iommu_fault_alloc, out_fault_fd), IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info, __reserved), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, @@ -355,20 +355,18 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { src_iova), IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, struct iommu_ioas_iova_ranges, out_iova_alignment), - IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, - iova), + IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, iova), IOCTL_OP(IOMMU_IOAS_MAP_FILE, iommufd_ioas_map_file, struct iommu_ioas_map_file, iova), IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, length), - IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, - val64), + IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, val64), + IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl, + struct iommu_vdevice_alloc, virt_id), IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, __reserved), IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, struct iommu_viommu_alloc, out_viommu_id), - IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl, - struct iommu_vdevice_alloc, virt_id), #ifdef CONFIG_IOMMUFD_TEST IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), #endif @@ -490,8 +488,8 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_DEVICE] = { .destroy = iommufd_device_destroy, }, - [IOMMUFD_OBJ_IOAS] = { - .destroy = iommufd_ioas_destroy, + [IOMMUFD_OBJ_FAULT] = { + .destroy = iommufd_fault_destroy, }, [IOMMUFD_OBJ_HWPT_PAGING] = { .destroy = iommufd_hwpt_paging_destroy, @@ -501,15 +499,15 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { .destroy = iommufd_hwpt_nested_destroy, .abort = iommufd_hwpt_nested_abort, }, - [IOMMUFD_OBJ_FAULT] = { - .destroy = iommufd_fault_destroy, - }, - [IOMMUFD_OBJ_VIOMMU] = { - .destroy = iommufd_viommu_destroy, + [IOMMUFD_OBJ_IOAS] = { + .destroy = iommufd_ioas_destroy, }, [IOMMUFD_OBJ_VDEVICE] = { .destroy = iommufd_vdevice_destroy, }, + [IOMMUFD_OBJ_VIOMMU] = { + .destroy = iommufd_viommu_destroy, + }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { .destroy = iommufd_selftest_destroy, -- Gitee From 
9b4d9281791c7ff989b36fe4190982cdfd275e04 Mon Sep 17 00:00:00 2001 From: Alejandro Jimenez Date: Fri, 10 Jan 2025 12:34:59 -0400 Subject: [PATCH 209/278] iommu/amd: Remove unused amd_iommu_domain_update() ANBZ: #13617 commit 1a684b099fac9a37e6fe2f0e594adbb1eff5181a upstream. All the callers have been removed by the below commit, remove the implementation and prototypes. Fixes: 322d889ae7d3 ("iommu/amd: Remove amd_iommu_domain_update() from page table freeing") Reviewed-by: Vasant Hegde Signed-off-by: Alejandro Jimenez Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-9776c53c2966+1c7-amd_paging_flags_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu.h | 1 - drivers/iommu/amd/iommu.c | 9 --------- 2 files changed, 10 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index c19990a282e6..75c0eb480a65 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -88,7 +88,6 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag); */ void amd_iommu_flush_all_caches(struct amd_iommu *iommu); void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); -void amd_iommu_domain_update(struct protection_domain *domain); void amd_iommu_domain_flush_pages(struct protection_domain *domain, u64 address, size_t size); void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index a1c2c046a0f0..172382cd6f06 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1775,15 +1775,6 @@ void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) domain_flush_complete(domain); } -void amd_iommu_domain_update(struct protection_domain *domain) -{ - /* Update device table */ - amd_iommu_update_and_flush_device_table(domain); - - /* Flush domain TLB(s) and wait for completion */ - amd_iommu_domain_flush_all(domain); -} - int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag) { struct iommu_dev_data *dev_data; -- Gitee From 2a6f0b8bcccaefb8c6b43dac4156ef50098b910c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 10 Jan 2025 12:35:00 -0400 Subject: [PATCH 210/278] iommu/amd: Remove domain_alloc() ANBZ: #13617 commit f9b80f941e0e68c3347c5d22a17a0f636a064e2c upstream. IOMMU drivers should not be sensitive to the domain type, a paging domain should be created based only on the flags passed in, the same for all callers. AMD was using the domain_alloc() path to force VFIO into a v1 domain type, because v1 gives higher performance. However now that IOMMU_HWPT_ALLOC_PASID is present, and a NULL device is not possible, domain_alloc_paging_flags() will do the right thing for VFIO. When invoked from VFIO flags will be 0 and the amd_iommu_pgtable type of domain will be selected. This is v1 by default unless the kernel command line has overridden it to v2. If the admin is forcing v2 assume they know what they are doing so force it everywhere, including for VFIO. 
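The resulting page-table selection can be summarised roughly as follows (simplified sketch only; the feature checks such as amd_iommu_pasid_supported() and the hardware dirty-tracking test are omitted, and this is not the literal driver code, which appears in the later patches of this series):

/* Roughly: which AMD page-table flavour a new paging domain gets. */
static int demo_pick_amd_pgtable(u32 flags)
{
        if (flags & IOMMU_HWPT_ALLOC_PASID)
                return AMD_IOMMU_V2;    /* PASID support needs the v2 table */
        if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
                return AMD_IOMMU_V1;    /* dirty tracking needs the v1 table */
        return amd_iommu_pgtable;       /* flags == 0 (e.g. VFIO): command-line default */
}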
Reviewed-by: Vasant Hegde Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v2-9776c53c2966+1c7-amd_paging_flags_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 172382cd6f06..94f087d6c9f3 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2587,25 +2587,6 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, return &domain->domain; } -static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type) -{ - struct iommu_domain *domain; - int pgtable = amd_iommu_pgtable; - - /* - * Force IOMMU v1 page table when allocating - * domain for pass-through devices. - */ - if (type == IOMMU_DOMAIN_UNMANAGED) - pgtable = AMD_IOMMU_V1; - - domain = do_iommu_domain_alloc(type, NULL, 0, pgtable); - if (IS_ERR(domain)) - return NULL; - - return domain; -} - static struct iommu_domain * amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, const struct iommu_user_data *user_data) @@ -3090,7 +3071,6 @@ const struct iommu_ops amd_iommu_ops = { .blocked_domain = &blocked_domain, .release_domain = &release_domain, .identity_domain = &identity_domain.domain, - .domain_alloc = amd_iommu_domain_alloc, .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, .domain_alloc_sva = amd_iommu_domain_alloc_sva, .probe_device = amd_iommu_probe_device, -- Gitee From d2ae88cbf1b7a22bef025659f1467ff47d4ab0ca Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 10 Jan 2025 12:35:01 -0400 Subject: [PATCH 211/278] iommu/amd: Remove dev == NULL checks ANBZ: #13617 commit 02bcd1a8b991c6fc29271fa02250bea1b61fb742 upstream. This is no longer possible, amd_iommu_domain_alloc_paging_flags() is never called with dev = NULL from the core code. Similarly get_amd_iommu_from_dev() can never be NULL either. Reviewed-by: Vasant Hegde Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v2-9776c53c2966+1c7-amd_paging_flags_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 94f087d6c9f3..dcefcfa9f8ab 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2545,13 +2545,10 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, u32 flags, int pgtable) { bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); struct protection_domain *domain; - struct amd_iommu *iommu = NULL; int ret; - if (dev) - iommu = get_amd_iommu_from_dev(dev); - /* * Since DTE[Mode]=0 is prohibited on SNP-enabled system, * default to use IOMMU_DOMAIN_DMA[_FQ]. @@ -2559,8 +2556,7 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY)) return ERR_PTR(-EINVAL); - domain = protection_domain_alloc(type, - dev ? 
dev_to_node(dev) : NUMA_NO_NODE); + domain = protection_domain_alloc(type, dev_to_node(dev)); if (!domain) return ERR_PTR(-ENOMEM); @@ -2576,13 +2572,11 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, domain->domain.geometry.force_aperture = true; domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap; - if (iommu) { - domain->domain.type = type; - domain->domain.ops = iommu->iommu.ops->default_domain_ops; + domain->domain.type = type; + domain->domain.ops = iommu->iommu.ops->default_domain_ops; - if (dirty_tracking) - domain->domain.dirty_ops = &amd_dirty_ops; - } + if (dirty_tracking) + domain->domain.dirty_ops = &amd_dirty_ops; return &domain->domain; } @@ -2593,13 +2587,10 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, { unsigned int type = IOMMU_DOMAIN_UNMANAGED; - struct amd_iommu *iommu = NULL; + struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_PASID; - if (dev) - iommu = get_amd_iommu_from_dev(dev); - if ((flags & ~supported_flags) || user_data) return ERR_PTR(-EOPNOTSUPP); @@ -2613,10 +2604,9 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, /* Allocate domain with v1 page table for dirty tracking */ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) { - if (iommu && amd_iommu_hd_support(iommu)) { - return do_iommu_domain_alloc(type, dev, - flags, AMD_IOMMU_V1); - } + if (amd_iommu_hd_support(iommu)) + return do_iommu_domain_alloc(type, dev, flags, + AMD_IOMMU_V1); return ERR_PTR(-EOPNOTSUPP); } -- Gitee From 9d2253c4aa7033591960c02fd02739f3c303ebbb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 10 Jan 2025 12:35:02 -0400 Subject: [PATCH 212/278] iommu/amd: Remove type argument from do_iommu_domain_alloc() and related ANBZ: #13617 commit 55b237dd7f7ec2ee9c7986e0fc28c5867bf63282 upstream. do_iommu_domain_alloc() is only called from amd_iommu_domain_alloc_paging_flags() so type is always IOMMU_DOMAIN_UNMANAGED. Remove type and all the dead conditionals checking it. IOMMU_DOMAIN_IDENTITY checks are similarly obsolete as the conversion to the global static identity domain removed those call paths. The caller of protection_domain_alloc() should set the type, fix the miss in the SVA code. 
Reviewed-by: Vasant Hegde Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v2-9776c53c2966+1c7-amd_paging_flags_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu.h | 2 +- drivers/iommu/amd/iommu.c | 35 ++++++++++------------------------- drivers/iommu/amd/pasid.c | 3 ++- 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 75c0eb480a65..f063d181dd68 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -46,7 +46,7 @@ extern unsigned long amd_iommu_pgsize_bitmap; /* Protection domain ops */ void amd_iommu_init_identity_domain(void); -struct protection_domain *protection_domain_alloc(unsigned int type, int nid); +struct protection_domain *protection_domain_alloc(int nid); void protection_domain_free(struct protection_domain *domain); struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev, struct mm_struct *mm); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index dcefcfa9f8ab..6296a6e4664e 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2464,7 +2464,7 @@ static void protection_domain_init(struct protection_domain *domain, int nid) domain->iop.pgtbl.cfg.amd.nid = nid; } -struct protection_domain *protection_domain_alloc(unsigned int type, int nid) +struct protection_domain *protection_domain_alloc(int nid) { struct protection_domain *domain; int domid; @@ -2485,15 +2485,10 @@ struct protection_domain *protection_domain_alloc(unsigned int type, int nid) return domain; } -static int pdom_setup_pgtable(struct protection_domain *domain, - unsigned int type, int pgtable) +static int pdom_setup_pgtable(struct protection_domain *domain, int pgtable) { struct io_pgtable_ops *pgtbl_ops; - /* No need to allocate io pgtable ops in passthrough mode */ - if (!(type & __IOMMU_DOMAIN_PAGING)) - return 0; - switch (pgtable) { case AMD_IOMMU_V1: domain->pd_mode = PD_MODE_V1; @@ -2540,27 +2535,19 @@ static bool amd_iommu_hd_support(struct amd_iommu *iommu) return iommu && (iommu->features & FEATURE_HDSUP); } -static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, - struct device *dev, - u32 flags, int pgtable) +static struct iommu_domain *do_iommu_domain_alloc(struct device *dev, u32 flags, + int pgtable) { bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); struct protection_domain *domain; int ret; - /* - * Since DTE[Mode]=0 is prohibited on SNP-enabled system, - * default to use IOMMU_DOMAIN_DMA[_FQ]. 
- */ - if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY)) - return ERR_PTR(-EINVAL); - - domain = protection_domain_alloc(type, dev_to_node(dev)); + domain = protection_domain_alloc(dev_to_node(dev)); if (!domain) return ERR_PTR(-ENOMEM); - ret = pdom_setup_pgtable(domain, type, pgtable); + ret = pdom_setup_pgtable(domain, pgtable); if (ret) { pdom_id_free(domain->id); kfree(domain); @@ -2572,7 +2559,7 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, domain->domain.geometry.force_aperture = true; domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap; - domain->domain.type = type; + domain->domain.type = IOMMU_DOMAIN_UNMANAGED; domain->domain.ops = iommu->iommu.ops->default_domain_ops; if (dirty_tracking) @@ -2586,7 +2573,6 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, const struct iommu_user_data *user_data) { - unsigned int type = IOMMU_DOMAIN_UNMANAGED; struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_PASID; @@ -2599,20 +2585,19 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, if (!amd_iommu_pasid_supported()) return ERR_PTR(-EOPNOTSUPP); - return do_iommu_domain_alloc(type, dev, flags, AMD_IOMMU_V2); + return do_iommu_domain_alloc(dev, flags, AMD_IOMMU_V2); } /* Allocate domain with v1 page table for dirty tracking */ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) { if (amd_iommu_hd_support(iommu)) - return do_iommu_domain_alloc(type, dev, flags, - AMD_IOMMU_V1); + return do_iommu_domain_alloc(dev, flags, AMD_IOMMU_V1); return ERR_PTR(-EOPNOTSUPP); } /* If nothing specific is required use the kernel commandline default */ - return do_iommu_domain_alloc(type, dev, 0, amd_iommu_pgtable); + return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable); } void amd_iommu_domain_free(struct iommu_domain *dom) diff --git a/drivers/iommu/amd/pasid.c b/drivers/iommu/amd/pasid.c index 8c73a30c2800..9101d07b11d3 100644 --- a/drivers/iommu/amd/pasid.c +++ b/drivers/iommu/amd/pasid.c @@ -185,12 +185,13 @@ struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev, struct protection_domain *pdom; int ret; - pdom = protection_domain_alloc(IOMMU_DOMAIN_SVA, dev_to_node(dev)); + pdom = protection_domain_alloc(dev_to_node(dev)); if (!pdom) return ERR_PTR(-ENOMEM); pdom->domain.ops = &amd_sva_domain_ops; pdom->mn.ops = &sva_mn; + pdom->domain.type = IOMMU_DOMAIN_SVA; ret = mmu_notifier_register(&pdom->mn, mm); if (ret) { -- Gitee From f9a22c756627e948d4cbf9341c039b37e5335131 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 17 Jan 2025 11:29:01 -0800 Subject: [PATCH 213/278] iommufd/fault: Use a separate spinlock to protect fault->deliver list ANBZ: #13617 commit 3d49020a327cd7d069059317c11df24e407ccfa3 upstream. The fault->mutex serializes the fault read()/write() fops and the iommufd_fault_auto_response_faults(), mainly for fault->response. Also, it was conveniently used to fence the fault->deliver in poll() fop and iommufd_fault_iopf_handler(). However, copy_from/to_user() may sleep if pagefaults are enabled. Thus, they could take a long time to wait for user pages to swap in, blocking iommufd_fault_iopf_handler() and its caller that is typically a shared IRQ handler of an IOMMU driver, resulting in a potential global DOS. Instead of reusing the mutex to protect the fault->deliver list, add a separate spinlock, nested under the mutex, to do the job. iommufd_fault_iopf_handler() would no longer be blocked by copy_from/to_user(). 
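The shape of the fix is the classic pattern of a producer that only touches the list under a spinlock, while the consumer pops an entry and then does its sleeping work (copy_to_user() and friends) outside the lock. A generic sketch of that pattern (not the iommufd code itself):

#include <linux/list.h>
#include <linux/spinlock.h>

struct demo_queue {
        spinlock_t lock;                /* protects @deliver only */
        struct list_head deliver;
};

struct demo_item {
        struct list_head node;
};

static void demo_queue_init(struct demo_queue *q)
{
        spin_lock_init(&q->lock);
        INIT_LIST_HEAD(&q->deliver);
}

/* Producer: may run from a shared IRQ handler; never sleeps. */
static void demo_deliver(struct demo_queue *q, struct demo_item *item)
{
        spin_lock(&q->lock);
        list_add_tail(&item->node, &q->deliver);
        spin_unlock(&q->lock);
}

/* Consumer: detaches one entry under the lock, then may sleep freely
 * (e.g. in copy_to_user()) once the lock is dropped. */
static struct demo_item *demo_fetch(struct demo_queue *q)
{
        struct demo_item *item = NULL;

        spin_lock(&q->lock);
        if (!list_empty(&q->deliver)) {
                item = list_first_entry(&q->deliver, struct demo_item, node);
                list_del(&item->node);
        }
        spin_unlock(&q->lock);
        return item;
}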
Add a free_list in iommufd_auto_response_faults(), so the spinlock can simply fence a fast list_for_each_entry_safe routine. Provide two deliver list helpers for iommufd_fault_fops_read() to use: - Fetch the first iopf_group out of the fault->deliver list - Restore an iopf_group back to the head of the fault->deliver list Lastly, move the mutex closer to the response in the fault structure, and update its kdoc accordingly. Fixes: 07838f7fd529 ("iommufd: Add iommufd fault object") Link: https://patch.msgid.link/r/20250117192901.79491-1-nicolinc@nvidia.com Cc: stable@vger.kernel.org Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/fault.c | 34 ++++++++++++++++--------- drivers/iommu/iommufd/iommufd_private.h | 29 +++++++++++++++++++-- 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 8c8226f0dffd..6e3ee8d44419 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -98,15 +98,23 @@ static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, { struct iommufd_fault *fault = hwpt->fault; struct iopf_group *group, *next; + struct list_head free_list; unsigned long index; if (!fault) return; + INIT_LIST_HEAD(&free_list); mutex_lock(&fault->mutex); + spin_lock(&fault->lock); list_for_each_entry_safe(group, next, &fault->deliver, node) { if (group->attach_handle != &handle->handle) continue; + list_move(&group->node, &free_list); + } + spin_unlock(&fault->lock); + + list_for_each_entry_safe(group, next, &free_list, node) { list_del(&group->node); iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); iopf_free_group(group); @@ -253,17 +261,19 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, return -ESPIPE; mutex_lock(&fault->mutex); - while (!list_empty(&fault->deliver) && count > done) { - group = list_first_entry(&fault->deliver, - struct iopf_group, node); - - if (group->fault_count * fault_size > count - done) + while ((group = iommufd_fault_deliver_fetch(fault))) { + if (done >= count || + group->fault_count * fault_size > count - done) { + iommufd_fault_deliver_restore(fault, group); break; + } rc = xa_alloc(&fault->response, &group->cookie, group, xa_limit_32b, GFP_KERNEL); - if (rc) + if (rc) { + iommufd_fault_deliver_restore(fault, group); break; + } idev = to_iommufd_handle(group->attach_handle)->idev; list_for_each_entry(iopf, &group->faults, list) { @@ -272,13 +282,12 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, group->cookie); if (copy_to_user(buf + done, &data, fault_size)) { xa_erase(&fault->response, group->cookie); + iommufd_fault_deliver_restore(fault, group); rc = -EFAULT; break; } done += fault_size; } - - list_del(&group->node); } mutex_unlock(&fault->mutex); @@ -336,10 +345,10 @@ static __poll_t iommufd_fault_fops_poll(struct file *filep, __poll_t pollflags = EPOLLOUT; poll_wait(filep, &fault->wait_queue, wait); - mutex_lock(&fault->mutex); + spin_lock(&fault->lock); if (!list_empty(&fault->deliver)) pollflags |= EPOLLIN | EPOLLRDNORM; - mutex_unlock(&fault->mutex); + spin_unlock(&fault->lock); return pollflags; } @@ -382,6 +391,7 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) INIT_LIST_HEAD(&fault->deliver); xa_init_flags(&fault->response, XA_FLAGS_ALLOC1); mutex_init(&fault->mutex); + spin_lock_init(&fault->lock); 
init_waitqueue_head(&fault->wait_queue); filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops, @@ -432,9 +442,9 @@ int iommufd_fault_iopf_handler(struct iopf_group *group) hwpt = group->attach_handle->domain->fault_data; fault = hwpt->fault; - mutex_lock(&fault->mutex); + spin_lock(&fault->lock); list_add_tail(&group->node, &fault->deliver); - mutex_unlock(&fault->mutex); + spin_unlock(&fault->lock); wake_up_interruptible(&fault->wait_queue); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index b6d706cf2c66..0b1bafc7fd99 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -443,14 +443,39 @@ struct iommufd_fault { struct iommufd_ctx *ictx; struct file *filep; - /* The lists of outstanding faults protected by below mutex. */ - struct mutex mutex; + spinlock_t lock; /* protects the deliver list */ struct list_head deliver; + struct mutex mutex; /* serializes response flows */ struct xarray response; struct wait_queue_head wait_queue; }; +/* Fetch the first node out of the fault->deliver list */ +static inline struct iopf_group * +iommufd_fault_deliver_fetch(struct iommufd_fault *fault) +{ + struct list_head *list = &fault->deliver; + struct iopf_group *group = NULL; + + spin_lock(&fault->lock); + if (!list_empty(list)) { + group = list_first_entry(list, struct iopf_group, node); + list_del(&group->node); + } + spin_unlock(&fault->lock); + return group; +} + +/* Restore a node back to the head of the fault->deliver list */ +static inline void iommufd_fault_deliver_restore(struct iommufd_fault *fault, + struct iopf_group *group) +{ + spin_lock(&fault->lock); + list_add(&group->node, &fault->deliver); + spin_unlock(&fault->lock); +} + struct iommufd_attach_handle { struct iommu_attach_handle handle; struct iommufd_device *idev; -- Gitee From ed7cbb545e425587dde8ab879614813da7f00758 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 20 Jan 2025 11:50:51 -0800 Subject: [PATCH 214/278] iommufd: Fix struct iommu_hwpt_pgfault init and padding ANBZ: #13617 commit e721f619e3ec9bae08bf419c3944cf1e6966c821 upstream. The iommu_hwpt_pgfault is used to report IO page fault data to userspace, but iommufd_fault_fops_read was never zeroing its padding. This leaks the content of the kernel stack memory to userspace. Also, the iommufd uAPI requires explicit padding and use of __aligned_u64 to ensure ABI compatibility's with 32 bit. 
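The padding rule being applied: in a uAPI struct every 64-bit field should be declared __aligned_u64, and any hole before it must be filled with an explicit, zeroed reserved field, so that 32-bit and 64-bit userspace see the same layout. A minimal illustration with invented field names:

#include <linux/types.h>

/* Problematic: implicit 4-byte hole before @addr, and a plain __u64 may
 * be only 4-byte aligned when built for 32-bit, so layouts can differ. */
struct demo_bad {
        __u32 flags;
        __u64 addr;
};

/* Fixed: the hole is made explicit (and must be zeroed) and alignment is
 * forced, so the layout is identical for 32-bit and 64-bit userspace. */
struct demo_good {
        __u32 flags;
        __u32 __reserved;
        __aligned_u64 addr;
};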
pahole result, before: struct iommu_hwpt_pgfault { __u32 flags; /* 0 4 */ __u32 dev_id; /* 4 4 */ __u32 pasid; /* 8 4 */ __u32 grpid; /* 12 4 */ __u32 perm; /* 16 4 */ /* XXX 4 bytes hole, try to pack */ __u64 addr; /* 24 8 */ __u32 length; /* 32 4 */ __u32 cookie; /* 36 4 */ /* size: 40, cachelines: 1, members: 8 */ /* sum members: 36, holes: 1, sum holes: 4 */ /* last cacheline: 40 bytes */ }; pahole result, after: struct iommu_hwpt_pgfault { __u32 flags; /* 0 4 */ __u32 dev_id; /* 4 4 */ __u32 pasid; /* 8 4 */ __u32 grpid; /* 12 4 */ __u32 perm; /* 16 4 */ __u32 __reserved; /* 20 4 */ __u64 addr __attribute__((__aligned__(8))); /* 24 8 */ __u32 length; /* 32 4 */ __u32 cookie; /* 36 4 */ /* size: 40, cachelines: 1, members: 9 */ /* forced alignments: 1 */ /* last cacheline: 40 bytes */ } __attribute__((__aligned__(8))); Fixes: c714f15860fc ("iommufd: Add fault and response message definitions") Link: https://patch.msgid.link/r/20250120195051.2450-1-nicolinc@nvidia.com Cc: stable@vger.kernel.org Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/fault.c | 2 +- include/uapi/linux/iommufd.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 6e3ee8d44419..40864b450bea 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -250,7 +250,7 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, { size_t fault_size = sizeof(struct iommu_hwpt_pgfault); struct iommufd_fault *fault = filep->private_data; - struct iommu_hwpt_pgfault data; + struct iommu_hwpt_pgfault data = {}; struct iommufd_device *idev; struct iopf_group *group; struct iopf_fault *iopf; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index b5197e317415..462f74ee234e 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -868,6 +868,7 @@ enum iommu_hwpt_pgfault_perm { * @pasid: Process Address Space ID * @grpid: Page Request Group Index * @perm: Combination of enum iommu_hwpt_pgfault_perm + * @__reserved: Must be 0. * @addr: Fault address * @length: a hint of how much data the requestor is expecting to fetch. For * example, if the PRI initiator knows it is going to do a 10MB @@ -883,7 +884,8 @@ struct iommu_hwpt_pgfault { __u32 pasid; __u32 grpid; __u32 perm; - __u64 addr; + __u32 __reserved; + __aligned_u64 addr; __u32 length; __u32 cookie; }; -- Gitee From 902680bfd03f3c1d20d79160dc8b262a6a605f88 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 17 Jan 2025 13:58:00 +0800 Subject: [PATCH 215/278] iommu: Fix potential memory leak in iopf_queue_remove_device() ANBZ: #13617 commit 9759ae2cee7cd42b95f1c48aa3749bd02b5ddb08 upstream. The iopf_queue_remove_device() helper removes a device from the per-iommu iopf queue when PRI is disabled on the device. It responds to all outstanding iopf's with an IOMMU_PAGE_RESP_INVALID code and detaches the device from the queue. However, it fails to release the group structure that represents a group of iopf's awaiting for a response after responding to the hardware. This can cause a memory leak if iopf_queue_remove_device() is called with pending iopf's. Fix it by calling iopf_free_group() after the iopf group is responded. 
Fixes: 199112327135 ("iommu: Track iopf group instead of last fault") Cc: stable@vger.kernel.org Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250117055800.782462-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/io-pgfault.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 4674e618797c..8b5926c1452e 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -478,6 +478,7 @@ void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) ops->page_response(dev, iopf, &resp); list_del_init(&group->pending_node); + iopf_free_group(group); } mutex_unlock(&fault_param->lock); -- Gitee From a59663278da0a133e6c75731b3ba5f5cdb568072 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 3 Feb 2025 21:00:54 -0800 Subject: [PATCH 216/278] iommufd: Make attach_handle generic than fault specific ANBZ: #13617 commit fb21b1568adaa76af7a8c853f37c60fba8b28661 upstream. "attach_handle" was added exclusively for the iommufd_fault_iopf_handler() used by IOPF/PRI use cases. Now, both the MSI and PASID series require to reuse the attach_handle for non-fault cases. Add a set of new attach/detach/replace helpers that does the attach_handle allocation/releasing/replacement in the common path and also handles those fault specific routines such as iopf enabling/disabling and auto response. This covers both non-fault and fault cases in a clean way, replacing those inline helpers in the header. The following patch will clean up those old helpers in the fault.c file. Link: https://patch.msgid.link/r/32687df01c02291d89986a9fca897bbbe2b10987.1738645017.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/device.c | 105 ++++++++++++++++++++++++ drivers/iommu/iommufd/fault.c | 8 +- drivers/iommu/iommufd/iommufd_private.h | 33 +------- 3 files changed, 113 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index ba2aed1b7014..c5537e77c34c 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -352,6 +352,111 @@ iommufd_device_attach_reserved_iova(struct iommufd_device *idev, return 0; } +/* The device attach/detach/replace helpers for attach_handle */ + +static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + struct iommufd_attach_handle *handle; + int rc; + + lockdep_assert_held(&idev->igroup->lock); + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + if (hwpt->fault) { + rc = iommufd_fault_iopf_enable(idev); + if (rc) + goto out_free_handle; + } + + handle->idev = idev; + rc = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, + &handle->handle); + if (rc) + goto out_disable_iopf; + + return 0; + +out_disable_iopf: + if (hwpt->fault) + iommufd_fault_iopf_disable(idev); +out_free_handle: + kfree(handle); + return rc; +} + +static struct iommufd_attach_handle * +iommufd_device_get_attach_handle(struct iommufd_device *idev) +{ + struct iommu_attach_handle *handle; + + lockdep_assert_held(&idev->igroup->lock); + + handle = + iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0); + if (IS_ERR(handle)) + return NULL; + return to_iommufd_handle(handle); +} + +static void iommufd_hwpt_detach_device(struct 
iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + struct iommufd_attach_handle *handle; + + handle = iommufd_device_get_attach_handle(idev); + iommu_detach_group_handle(hwpt->domain, idev->igroup->group); + if (hwpt->fault) { + iommufd_auto_response_faults(hwpt, handle); + iommufd_fault_iopf_disable(idev); + } + kfree(handle); +} + +static int iommufd_hwpt_replace_device(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + struct iommufd_hw_pagetable *old) +{ + struct iommufd_attach_handle *handle, *old_handle = + iommufd_device_get_attach_handle(idev); + int rc; + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + if (hwpt->fault && !old->fault) { + rc = iommufd_fault_iopf_enable(idev); + if (rc) + goto out_free_handle; + } + + handle->idev = idev; + rc = iommu_replace_group_handle(idev->igroup->group, hwpt->domain, + &handle->handle); + if (rc) + goto out_disable_iopf; + + if (old->fault) { + iommufd_auto_response_faults(hwpt, old_handle); + if (!hwpt->fault) + iommufd_fault_iopf_disable(idev); + } + kfree(old_handle); + + return 0; + +out_disable_iopf: + if (hwpt->fault && !old->fault) + iommufd_fault_iopf_disable(idev); +out_free_handle: + kfree(handle); + return rc; +} + int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 40864b450bea..5de7dcd3a325 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -16,7 +16,7 @@ #include "../iommu-priv.h" #include "iommufd_private.h" -static int iommufd_fault_iopf_enable(struct iommufd_device *idev) +int iommufd_fault_iopf_enable(struct iommufd_device *idev) { struct device *dev = idev->dev; int ret; @@ -45,7 +45,7 @@ static int iommufd_fault_iopf_enable(struct iommufd_device *idev) return ret; } -static void iommufd_fault_iopf_disable(struct iommufd_device *idev) +void iommufd_fault_iopf_disable(struct iommufd_device *idev) { mutex_lock(&idev->iopf_lock); if (!WARN_ON(idev->iopf_enabled == 0)) { @@ -93,8 +93,8 @@ int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, return ret; } -static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, - struct iommufd_attach_handle *handle) +void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, + struct iommufd_attach_handle *handle) { struct iommufd_fault *fault = hwpt->fault; struct iopf_group *group, *next; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 0b1bafc7fd99..02fe1ada97cc 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -504,35 +504,10 @@ int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt, struct iommufd_hw_pagetable *old); -static inline int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - if (hwpt->fault) - return iommufd_fault_domain_attach_dev(hwpt, idev); - - return iommu_attach_group(hwpt->domain, idev->igroup->group); -} - -static inline void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - if (hwpt->fault) { - iommufd_fault_domain_detach_dev(hwpt, idev); - return; - } - - iommu_detach_group(hwpt->domain, idev->igroup->group); -} - -static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old) 
-{ - if (old->fault || hwpt->fault) - return iommufd_fault_domain_replace_dev(idev, hwpt, old); - - return iommu_group_replace_domain(idev->igroup->group, hwpt->domain); -} +int iommufd_fault_iopf_enable(struct iommufd_device *idev); +void iommufd_fault_iopf_disable(struct iommufd_device *idev); +void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, + struct iommufd_attach_handle *handle); static inline struct iommufd_viommu * iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) -- Gitee From 2e43c28df3860d54b05b8fb9fcc7d50024dc702a Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 3 Feb 2025 21:00:55 -0800 Subject: [PATCH 217/278] iommufd/fault: Remove iommufd_fault_domain_attach/detach/replace_dev() ANBZ: #13617 commit dc10ba25d43f433ad5d9e8e6be4f4d2bb3cd9ddb upstream. There are new attach/detach/replace helpers in device.c taking care of both the attach_handle and the fault specific routines for iopf_enable/disable() and auto response. Clean up these redundant functions in the fault.c file. Link: https://patch.msgid.link/r/3ca94625e9d78270d9a715fa0809414fddd57e58.1738645017.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/fault.c | 120 ------------------------ drivers/iommu/iommufd/iommufd_private.h | 8 -- 2 files changed, 128 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 5de7dcd3a325..6a88dbc1c2a2 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -55,44 +55,6 @@ void iommufd_fault_iopf_disable(struct iommufd_device *idev) mutex_unlock(&idev->iopf_lock); } -static int __fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - struct iommufd_attach_handle *handle; - int ret; - - handle = kzalloc(sizeof(*handle), GFP_KERNEL); - if (!handle) - return -ENOMEM; - - handle->idev = idev; - ret = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, - &handle->handle); - if (ret) - kfree(handle); - - return ret; -} - -int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - int ret; - - if (!hwpt->fault) - return -EINVAL; - - ret = iommufd_fault_iopf_enable(idev); - if (ret) - return ret; - - ret = __fault_domain_attach_dev(hwpt, idev); - if (ret) - iommufd_fault_iopf_disable(idev); - - return ret; -} - void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, struct iommufd_attach_handle *handle) { @@ -130,88 +92,6 @@ void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, mutex_unlock(&fault->mutex); } -static struct iommufd_attach_handle * -iommufd_device_get_attach_handle(struct iommufd_device *idev) -{ - struct iommu_attach_handle *handle; - - handle = iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0); - if (IS_ERR(handle)) - return NULL; - - return to_iommufd_handle(handle); -} - -void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - struct iommufd_attach_handle *handle; - - handle = iommufd_device_get_attach_handle(idev); - iommu_detach_group_handle(hwpt->domain, idev->igroup->group); - iommufd_auto_response_faults(hwpt, handle); - iommufd_fault_iopf_disable(idev); - kfree(handle); -} - -static int __fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old) -{ - struct iommufd_attach_handle *handle, *curr = NULL; - int ret; - - if 
(old->fault) - curr = iommufd_device_get_attach_handle(idev); - - if (hwpt->fault) { - handle = kzalloc(sizeof(*handle), GFP_KERNEL); - if (!handle) - return -ENOMEM; - - handle->idev = idev; - ret = iommu_replace_group_handle(idev->igroup->group, - hwpt->domain, &handle->handle); - } else { - ret = iommu_replace_group_handle(idev->igroup->group, - hwpt->domain, NULL); - } - - if (!ret && curr) { - iommufd_auto_response_faults(old, curr); - kfree(curr); - } - - return ret; -} - -int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old) -{ - bool iopf_off = !hwpt->fault && old->fault; - bool iopf_on = hwpt->fault && !old->fault; - int ret; - - if (iopf_on) { - ret = iommufd_fault_iopf_enable(idev); - if (ret) - return ret; - } - - ret = __fault_domain_replace_dev(idev, hwpt, old); - if (ret) { - if (iopf_on) - iommufd_fault_iopf_disable(idev); - return ret; - } - - if (iopf_off) - iommufd_fault_iopf_disable(idev); - - return 0; -} - void iommufd_fault_destroy(struct iommufd_object *obj) { struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 02fe1ada97cc..8e0e3ab64747 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -496,14 +496,6 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); void iommufd_fault_destroy(struct iommufd_object *obj); int iommufd_fault_iopf_handler(struct iopf_group *group); -int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); -void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); -int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old); - int iommufd_fault_iopf_enable(struct iommufd_device *idev); void iommufd_fault_iopf_disable(struct iommufd_device *idev); void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, -- Gitee From f641c4212f09acae0729963c0f92e32670c88f54 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 27 Jan 2025 09:44:11 +0000 Subject: [PATCH 218/278] iommu/amd: Expicitly enable CNTRL.EPHEn bit in resume path ANBZ: #13617 commit ef75966abf950c0539534effa4960caa29fb7167 upstream. With recent kernel, AMDGPU failed to resume after suspend on certain laptop. Sample log: ----------- Nov 14 11:52:19 Thinkbook kernel: iommu ivhd0: AMD-Vi: Event logged [ILLEGAL_DEV_TABLE_ENTRY device=0000:06:00.0 pasid=0x00000 address=0x135300000 flags=0x0080] Nov 14 11:52:19 Thinkbook kernel: AMD-Vi: DTE[0]: 7d90000000000003 Nov 14 11:52:19 Thinkbook kernel: AMD-Vi: DTE[1]: 0000100103fc0009 Nov 14 11:52:19 Thinkbook kernel: AMD-Vi: DTE[2]: 2000000117840013 Nov 14 11:52:19 Thinkbook kernel: AMD-Vi: DTE[3]: 0000000000000000 This is because in resume path, CNTRL[EPHEn] is not set. Fix this by setting CNTRL[EPHEn] to 1 in resume path if EFR[EPHSUP] is set. Note May be better approach is to save the control register in suspend path and restore it in resume path instead of trying to set indivisual bits. We will have separate patch for that. 
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219499 Fixes: c4cb23111103 ("iommu/amd: Add support for enable/disable IOPF") Tested-by: Hamish McIntyre-Bhatty Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20250127094411.5931-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 1 + drivers/iommu/amd/init.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 0bbda60d3cdc..23caea22f8dc 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -175,6 +175,7 @@ #define CONTROL_GAM_EN 25 #define CONTROL_GALOG_EN 28 #define CONTROL_GAINT_EN 29 +#define CONTROL_EPH_EN 45 #define CONTROL_XT_EN 50 #define CONTROL_INTCAPXT_EN 51 #define CONTROL_IRTCACHEDIS 59 diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 70134695d917..73ce2c4eded2 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -2652,6 +2652,10 @@ static void iommu_init_flags(struct amd_iommu *iommu) /* Set IOTLB invalidation timeout to 1s */ iommu_set_inv_tlb_timeout(iommu, CTRL_INV_TO_1S); + + /* Enable Enhanced Peripheral Page Request Handling */ + if (check_feature(FEATURE_EPHSUP)) + iommu_feature_enable(iommu, CONTROL_EPH_EN); } static void iommu_apply_resume_quirks(struct amd_iommu *iommu) -- Gitee From 33af84cd2f7350c70ff1eb601d628410bf71cff5 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Tue, 28 Jan 2025 19:05:21 +0000 Subject: [PATCH 219/278] iommu: Fix a spelling error ANBZ: #13617 commit 78be7f04537fa35f6cc694879e9a475ca1984936 upstream. Fix spelling error IDENITY -> IDENTITY in drivers/iommu/iommu.c. Signed-off-by: Easwar Hariharan Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250128190522.70800-1-eahariha@linux.microsoft.com [ joro: Add commit message ] Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 55a961297e97..10a6bfabdac9 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1770,7 +1770,7 @@ static int iommu_get_def_domain_type(struct iommu_group *group, group->id); /* - * Try to recover, drivers are allowed to force IDENITY or DMA, IDENTITY + * Try to recover, drivers are allowed to force IDENTITY or DMA, IDENTITY * takes precedence. */ if (type == IOMMU_DOMAIN_IDENTITY) -- Gitee From e439fae08ded13a2283c912dd1fec391966b9256 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 11 Feb 2025 08:55:12 +0800 Subject: [PATCH 220/278] iommu/vt-d: Make intel_iommu_drain_pasid_prq() cover faults for RID ANBZ: #13617 commit add43c4fbc92f8b48c1acd64e953af3b1be4cd9c upstream. This driver supports page faults on PCI RID since commit <9f831c16c69e> ("iommu/vt-d: Remove the pasid present check in prq_event_thread") by allowing the reporting of page faults with the pasid_present field cleared to the upper layer for further handling. The fundamental assumption here is that the detach or replace operations act as a fence for page faults. This implies that all pending page faults associated with a specific RID or PASID are flushed when a domain is detached or replaced from a device RID or PASID. However, the intel_iommu_drain_pasid_prq() helper does not correctly handle faults for RID. 
This leads to faults potentially remaining pending in the iommu hardware queue even after the domain is detached, thereby violating the aforementioned assumption. Fix this issue by extending intel_iommu_drain_pasid_prq() to cover faults for RID. Fixes: 9f831c16c69e ("iommu/vt-d: Remove the pasid present check in prq_event_thread") Cc: stable@vger.kernel.org Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20250121023150.815972-1-baolu.lu@linux.intel.com Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20250211005512.985563-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/prq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c index c2d792db52c3..064194399b38 100644 --- a/drivers/iommu/intel/prq.c +++ b/drivers/iommu/intel/prq.c @@ -87,7 +87,9 @@ void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) struct page_req_dsc *req; req = &iommu->prq[head / sizeof(*req)]; - if (!req->pasid_present || req->pasid != pasid) { + if (req->rid != sid || + (req->pasid_present && pasid != req->pasid) || + (!req->pasid_present && pasid != IOMMU_NO_PASID)) { head = (head + sizeof(*req)) & PRQ_RING_MASK; continue; } -- Gitee From b6c0dc459c5608fd9a69ec8b5e63f7726377ded7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 Feb 2025 21:34:12 +0200 Subject: [PATCH 221/278] x86/events/amd/iommu: Increase IOMMU_NAME_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #13617 commit 1623ced247f7cb1b48a27cca6b0f17fe5ab5942b upstream. The init_one_iommu() takes an unsigned int argument that can't be checked for the boundaries at compile time and GCC complains about that when build with `make W=1`: arch/x86/events/amd/iommu.c:441:53: note: directive argument in the range [0, 4294967294] arch/x86/events/amd/iommu.c:441:9: note: ‘snprintf’ output between 12 and 21 bytes into a destination of size 16 Increase the size to cover all possible cases. Signed-off-by: Andy Shevchenko Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250210193412.483233-1-andriy.shevchenko@linux.intel.com Signed-off-by: Jay Chen --- arch/x86/events/amd/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index b15f7b950d2e..f8228d8243f7 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -30,7 +30,7 @@ #define GET_DOMID_MASK(x) (((x)->conf1 >> 16) & 0xFFFFULL) #define GET_PASID_MASK(x) (((x)->conf1 >> 32) & 0xFFFFFULL) -#define IOMMU_NAME_SIZE 16 +#define IOMMU_NAME_SIZE 24 struct perf_amd_iommu { struct list_head list; -- Gitee From 20fc9898de4cf562b378da29730f78510f9abaff Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:37 -0800 Subject: [PATCH 222/278] genirq/msi: Refactor iommu_dma_compose_msi_msg() ANBZ: #13617 commit 9349887e93009331e751854843f73a086bef4018 upstream. The two-step process to translate the MSI address involves two functions, iommu_dma_prepare_msi() and iommu_dma_compose_msi_msg(). Previously iommu_dma_compose_msi_msg() needed to be in the iommu layer as it had to dereference the opaque cookie pointer. Now, the previous patch changed the cookie pointer into an integer so there is no longer any need for the iommu layer to be involved. 
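For orientation, the end state of the refactor is that each irqchip compose callback makes a single helper call; a sketch with a hypothetical irqchip and an assumed doorbell address (names invented, details below):

#include <linux/irq.h>
#include <linux/msi.h>

#define DEMO_DOORBELL_ADDR      0x10000000UL    /* assumed doorbell physical address */

static void demo_compose_msi_msg(struct irq_data *d, struct msi_msg *msg)
{
        msg->data = d->hwirq;
        /* Fills address_hi/lo from the doorbell address, or from the IOVA
         * previously recorded in the descriptor by iommu_dma_prepare_msi(). */
        msi_msg_set_addr(irq_data_get_msi_desc(d), msg, DEMO_DOORBELL_ADDR);
}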
Further, the call sites of iommu_dma_compose_msi_msg() all follow the same pattern of setting an MSI message address_hi/lo to non-translated and then immediately calling iommu_dma_compose_msi_msg(). Refactor iommu_dma_compose_msi_msg() into msi_msg_set_addr() that directly accepts the u64 version of the address and simplifies all the callers. Move the new helper to linux/msi.h since it has nothing to do with iommu. Aside from refactoring, this logically prepares for the next patch, which allows multiple implementation options for iommu_dma_prepare_msi(). So, it does not make sense to have the iommu_dma_compose_msi_msg() in dma-iommu.c as it no longer provides the only iommu_dma_prepare_msi() implementation. Link: https://patch.msgid.link/r/eda62a9bafa825e9cdabd7ddc61ad5a21c32af24.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen [Jay Chen Conflicts: drivers/irqchip/irq-gic-v3-its.c ft2000plus patch lead to some code conflicts ] --- drivers/iommu/dma-iommu.c | 18 ------------------ drivers/irqchip/irq-gic-v2m.c | 5 +---- drivers/irqchip/irq-gic-v3-its.c | 13 +++---------- drivers/irqchip/irq-gic-v3-mbi.c | 12 ++++-------- drivers/irqchip/irq-ls-scfg-msi.c | 5 ++--- include/linux/iommu.h | 6 ------ include/linux/msi.h | 28 ++++++++++++++++++++++++++++ 7 files changed, 38 insertions(+), 49 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index fc67f42450d8..df85080ff591 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1856,24 +1856,6 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) return 0; } -/** - * iommu_dma_compose_msi_msg() - Apply translation to an MSI message - * @desc: MSI descriptor prepared by iommu_dma_prepare_msi() - * @msg: MSI message containing target physical address - */ -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) -{ -#ifdef CONFIG_IRQ_MSI_IOMMU - if (desc->iommu_msi_shift) { - u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; - - msg->address_hi = upper_32_bits(msi_iova); - msg->address_lo = lower_32_bits(msi_iova) | - (msg->address_lo & ((1 << desc->iommu_msi_shift) - 1)); - } -#endif -} - static int iommu_dma_init(void) { if (is_kdump_kernel()) diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index a1e370d0200f..34f437207adf 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -87,9 +87,6 @@ static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) struct v2m_data *v2m = irq_data_get_irq_chip_data(data); phys_addr_t addr = gicv2m_get_msi_addr(v2m, data->hwirq); - msg->address_hi = upper_32_bits(addr); - msg->address_lo = lower_32_bits(addr); - if (v2m->flags & GICV2M_GRAVITON_ADDRESS_ONLY) msg->data = 0; else @@ -97,7 +94,7 @@ static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) if (v2m->flags & GICV2M_NEEDS_SPI_OFFSET) msg->data -= v2m->spi_offset; - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), msg, addr); } static struct irq_chip gicv2m_irq_chip = { diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index b2e3a8f19ba3..99488ff0bb13 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1731,22 +1731,15 @@ static u64 its_irq_get_msi_base(struct its_device *its_dev) static void its_irq_compose_msi_msg(struct irq_data *d, struct 
msi_msg *msg) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); - struct its_node *its; - u64 addr; - - its = its_dev->its; - addr = its->get_msi_base(its_dev); - - msg->address_lo = lower_32_bits(addr); - msg->address_hi = upper_32_bits(addr); - msg->data = its_get_event_id(d); #ifdef CONFIG_ARCH_PHYTIUM if (typeof_ft2000plus()) return; #endif - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg); + msg->data = its_get_event_id(d); + msi_msg_set_addr(irq_data_get_msi_desc(d), msg, + its_dev->its->get_msi_base(its_dev)); } static int its_irq_set_irqchip_state(struct irq_data *d, diff --git a/drivers/irqchip/irq-gic-v3-mbi.c b/drivers/irqchip/irq-gic-v3-mbi.c index 3fe870f8ee17..a6510128611e 100644 --- a/drivers/irqchip/irq-gic-v3-mbi.c +++ b/drivers/irqchip/irq-gic-v3-mbi.c @@ -147,22 +147,18 @@ static const struct irq_domain_ops mbi_domain_ops = { static void mbi_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - msg[0].address_hi = upper_32_bits(mbi_phys_base + GICD_SETSPI_NSR); - msg[0].address_lo = lower_32_bits(mbi_phys_base + GICD_SETSPI_NSR); msg[0].data = data->parent_data->hwirq; - - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), &msg[0], + mbi_phys_base + GICD_SETSPI_NSR); } static void mbi_compose_mbi_msg(struct irq_data *data, struct msi_msg *msg) { mbi_compose_msi_msg(data, msg); - msg[1].address_hi = upper_32_bits(mbi_phys_base + GICD_CLRSPI_NSR); - msg[1].address_lo = lower_32_bits(mbi_phys_base + GICD_CLRSPI_NSR); msg[1].data = data->parent_data->hwirq; - - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), &msg[1]); + msi_msg_set_addr(irq_data_get_msi_desc(data), &msg[1], + mbi_phys_base + GICD_CLRSPI_NSR); } static bool mbi_init_dev_msi_info(struct device *dev, struct irq_domain *domain, diff --git a/drivers/irqchip/irq-ls-scfg-msi.c b/drivers/irqchip/irq-ls-scfg-msi.c index f31a262fe438..818128606dc4 100644 --- a/drivers/irqchip/irq-ls-scfg-msi.c +++ b/drivers/irqchip/irq-ls-scfg-msi.c @@ -86,8 +86,6 @@ static void ls_scfg_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { struct ls_scfg_msi *msi_data = irq_data_get_irq_chip_data(data); - msg->address_hi = upper_32_bits(msi_data->msiir_addr); - msg->address_lo = lower_32_bits(msi_data->msiir_addr); msg->data = data->hwirq; if (msi_affinity_flag) { @@ -97,7 +95,8 @@ static void ls_scfg_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) msg->data |= cpumask_first(mask); } - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), msg, + msi_data->msiir_addr); } static int ls_scfg_msi_set_affinity(struct irq_data *irq_data, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 457bf70ff263..29ed79adf06b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1547,7 +1547,6 @@ static inline void iommu_debugfs_setup(void) {} int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg); #else /* CONFIG_IOMMU_DMA */ @@ -1563,11 +1562,6 @@ static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_a { return 0; } - -static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) -{ -} - #endif /* CONFIG_IOMMU_DMA */ /* diff --git a/include/linux/msi.h b/include/linux/msi.h index 4ebe9fd2cf3c..78046c06a0a1 100644 --- a/include/linux/msi.h +++ 
b/include/linux/msi.h @@ -302,6 +302,34 @@ static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc, u64 msi_io #endif } +/** + * msi_msg_set_addr() - Set MSI address in an MSI message + * + * @desc: MSI descriptor that may carry an IOVA base address for MSI via @iommu_msi_iova/shift + * @msg: Target MSI message to set its address_hi and address_lo + * @msi_addr: Physical address to set the MSI message + * + * Notes: + * - Override @msi_addr using the IOVA base address in the @desc if @iommu_msi_shift is set + * - Otherwise, simply set @msi_addr to @msg + */ +static inline void msi_msg_set_addr(struct msi_desc *desc, struct msi_msg *msg, + phys_addr_t msi_addr) +{ +#ifdef CONFIG_IRQ_MSI_IOMMU + if (desc->iommu_msi_shift) { + u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; + + msg->address_hi = upper_32_bits(msi_iova); + msg->address_lo = lower_32_bits(msi_iova) | + (msi_addr & ((1 << desc->iommu_msi_shift) - 1)); + return; + } +#endif + msg->address_hi = upper_32_bits(msi_addr); + msg->address_lo = lower_32_bits(msi_addr); +} + int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid, struct msi_desc *init_desc); /** -- Gitee From 1da89605fdce64cfd81ba90ba8c549d31d9136a5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:38 -0800 Subject: [PATCH 223/278] iommu: Make iommu_dma_prepare_msi() into a generic operation ANBZ: #13617 commit 288683c92b1abc32277c83819bea287af614d239 upstream. SW_MSI supports IOMMU to translate an MSI message before the MSI message is delivered to the interrupt controller. On such systems, an iommu_domain must have a translation for the MSI message for interrupts to work. The IRQ subsystem will call into IOMMU to request that a physical page be set up to receive MSI messages, and the IOMMU then sets an IOVA that maps to that physical page. Ultimately the IOVA is programmed into the device via the msi_msg. Generalize this by allowing iommu_domain owners to provide implementations of this mapping. Add a function pointer in struct iommu_domain to allow a domain owner to provide its own implementation. Have dma-iommu supply its implementation for IOMMU_DOMAIN_DMA types during the iommu_get_dma_cookie() path. For IOMMU_DOMAIN_UNMANAGED types used by VFIO (and iommufd for now), have the same iommu_dma_sw_msi set as well in the iommu_get_msi_cookie() path. Hold the group mutex while in iommu_dma_prepare_msi() to ensure the domain doesn't change or become freed while running. Races with IRQ operations from VFIO and domain changes from iommufd are possible here. Replace the msi_prepare_lock with a lockdep assertion for the group mutex as documentation. In dma-iommu.c, each iommu_domain is unique to a group.
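[Editor's note: the sketch below is illustrative and not part of the upstream commit. It assumes a hypothetical domain owner "foo" and a hypothetical helper foo_map_msi_doorbell(); the calls into the generic machinery (iommu_domain_set_sw_msi(), msi_desc_set_iommu_msi_iova(), iommu_dma_prepare_msi()) are the interfaces introduced or used by this series.]

	/* Hypothetical domain owner supplying its own MSI doorbell mapping. */
	static int foo_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
			      phys_addr_t msi_addr)
	{
		unsigned long iova;

		/* Map the doorbell page with the owner's own allocator
		 * (hypothetical helper), then record the IOVA so the irqchip
		 * can program it into the device via msi_msg_set_addr().
		 */
		iova = foo_map_msi_doorbell(domain, msi_addr & PAGE_MASK);
		msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT);
		return 0;
	}

	/* The owner registers the hook when the domain is created ... */
	iommu_domain_set_sw_msi(domain, foo_sw_msi);

	/* ... and the IRQ layer reaches it through the generic entry point: */
	iommu_dma_prepare_msi(desc, msi_addr); /* -> domain->sw_msi(domain, desc, msi_addr) */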
Link: https://patch.msgid.link/r/4ca696150d2baee03af27c4ddefdb7b0b0280e7b.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe [Jay Chen Conflicts: drivers/iommu/iommu.c minor: symbol literal confilct ] Signed-off-by: Jay Chen Signed-off-by: Shuai Xue --- drivers/iommu/dma-iommu.c | 33 +++++++++++++---------------- drivers/iommu/iommu.c | 29 ++++++++++++++++++++++++++ include/linux/iommu.h | 44 ++++++++++++++++++++++++++------------- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index df85080ff591..16593f218b9a 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,9 @@ static int __init iommu_dma_forcedac_setup(char *str) } early_param("iommu.forcedac", iommu_dma_forcedac_setup); +static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); + /* Number of entries per flush queue */ #define IOVA_DEFAULT_FQ_SIZE 256 #define IOVA_SINGLE_FQ_SIZE 32768 @@ -398,6 +402,7 @@ int iommu_get_dma_cookie(struct iommu_domain *domain) return -ENOMEM; mutex_init(&domain->iova_cookie->mutex); + iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); return 0; } @@ -429,6 +434,7 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) cookie->msi_iova = base; domain->iova_cookie = cookie; + iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); return 0; } EXPORT_SYMBOL(iommu_get_msi_cookie); @@ -443,6 +449,9 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iommu_dma_msi_page *msi, *tmp; + if (domain->sw_msi != iommu_dma_sw_msi) + return; + if (!cookie) return; @@ -1820,33 +1829,19 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, return NULL; } -/** - * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain - * @desc: MSI descriptor, will store the MSI page - * @msi_addr: MSI target address to be mapped - * - * Return: 0 on success or negative error code if the mapping failed. - */ -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) +static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct iommu_dma_msi_page *msi_page; - static DEFINE_MUTEX(msi_prepare_lock); /* see below */ + const struct iommu_dma_msi_page *msi_page; - if (!domain || !domain->iova_cookie) { + if (!domain->iova_cookie) { msi_desc_set_iommu_msi_iova(desc, 0, 0); return 0; } - /* - * In fact the whole prepare operation should already be serialised by - * irq_domain_mutex further up the callchain, but that's pretty subtle - * on its own, so consider this locking as failsafe documentation... 
- */ - mutex_lock(&msi_prepare_lock); + iommu_group_mutex_assert(dev); msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain); - mutex_unlock(&msi_prepare_lock); if (!msi_page) return -ENOMEM; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 10a6bfabdac9..7df07cec671f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3642,3 +3642,32 @@ int iommu_replace_group_handle(struct iommu_group *group, return ret; } EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, IOMMUFD_INTERNAL); + +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) +/** + * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain + * @desc: MSI descriptor, will store the MSI page + * @msi_addr: MSI target address to be mapped + * + * The implementation of sw_msi() should take msi_addr and map it to + * an IOVA in the domain and call msi_desc_set_iommu_msi_iova() with the + * mapping information. + * + * Return: 0 on success or negative error code if the mapping failed. + */ +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) +{ + struct device *dev = msi_desc_to_dev(desc); + struct iommu_group *group = dev->iommu_group; + int ret = 0; + + if (!group) + return 0; + + mutex_lock(&group->mutex); + if (group->domain && group->domain->sw_msi) + ret = group->domain->sw_msi(group->domain, desc, msi_addr); + mutex_unlock(&group->mutex); + return ret; +} +#endif /* CONFIG_IRQ_MSI_IOMMU */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 29ed79adf06b..5039745aefa5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -44,6 +44,8 @@ struct iommu_dma_cookie; struct iommu_fault_param; struct iommufd_ctx; struct iommufd_viommu; +struct msi_desc; +struct msi_msg; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -216,6 +218,12 @@ struct iommu_domain { struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; int (*iopf_handler)(struct iopf_group *group); + +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) + int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); +#endif + void *fault_data; union { struct { @@ -239,6 +247,16 @@ struct iommu_domain { CK_KABI_RESERVE(4) }; +static inline void iommu_domain_set_sw_msi( + struct iommu_domain *domain, + int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr)) +{ +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) + domain->sw_msi = sw_msi; +#endif +} + static inline bool iommu_is_dma_domain(struct iommu_domain *domain) { return domain->type & __IOMMU_DOMAIN_DMA_API; @@ -1509,6 +1527,18 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) static inline void iommu_free_global_pasid(ioasid_t pasid) {} #endif /* CONFIG_IOMMU_API */ +#ifdef CONFIG_IRQ_MSI_IOMMU +#ifdef CONFIG_IOMMU_API +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); +#else +static inline int iommu_dma_prepare_msi(struct msi_desc *desc, + phys_addr_t msi_addr) +{ + return 0; +} +#endif /* CONFIG_IOMMU_API */ +#endif /* CONFIG_IRQ_MSI_IOMMU */ + #if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API) void iommu_group_mutex_assert(struct device *dev); #else @@ -1542,26 +1572,12 @@ static inline void iommu_debugfs_setup(void) {} #endif #ifdef CONFIG_IOMMU_DMA -#include - int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); - -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); - #else /* CONFIG_IOMMU_DMA */ - -struct msi_desc; -struct msi_msg; - static inline int 
iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) { return -ENODEV; } - -static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) -{ - return 0; -} #endif /* CONFIG_IOMMU_DMA */ /* -- Gitee From 033b7605e9f10a89c5de68e09171813f2aae673a Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 19 Feb 2025 17:31:40 -0800 Subject: [PATCH 224/278] iommu: Turn fault_data to iommufd private pointer ANBZ: #13617 commit 748706d7ca06012621b32851b68136bf33613b9e upstream. A "fault_data" was added exclusively for the iommufd_fault_iopf_handler() used by IOPF/PRI use cases, along with the attach_handle. Now, the iommufd version of the sw_msi function will reuse the attach_handle and fault_data for a non-fault case. Rename "fault_data" to "iommufd_hwpt" so as not to confine it to a "fault" case. Move it into a union to be the iommufd private pointer. A following patch will move the iova_cookie to the union for dma-iommu too after the iommufd_sw_msi implementation is added. Since we have two unions now, add some simple comments for readability. Link: https://patch.msgid.link/r/ee5039503f28a16590916e9eef28b917e2d1607a.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/fault.c | 2 +- drivers/iommu/iommufd/hw_pagetable.c | 2 +- include/linux/iommu.h | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 6a88dbc1c2a2..aa839df42823 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -319,7 +319,7 @@ int iommufd_fault_iopf_handler(struct iopf_group *group) struct iommufd_hw_pagetable *hwpt; struct iommufd_fault *fault; - hwpt = group->attach_handle->domain->fault_data; + hwpt = group->attach_handle->domain->iommufd_hwpt; fault = hwpt->fault; spin_lock(&fault->lock); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 598be26a14e2..2641d50f46cf 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -406,10 +406,10 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) } hwpt->fault = fault; hwpt->domain->iopf_handler = iommufd_fault_iopf_handler; - hwpt->domain->fault_data = hwpt; refcount_inc(&fault->obj.users); iommufd_put_object(ucmd->ictx, &fault->obj); } + hwpt->domain->iommufd_hwpt = hwpt; cmd->out_hwpt_id = hwpt->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5039745aefa5..e4fc48d19cd4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -224,8 +224,10 @@ struct iommu_domain { phys_addr_t msi_addr); #endif - void *fault_data; - union { + union { /* Pointer usable by owner of the domain */ + struct iommufd_hw_pagetable *iommufd_hwpt; /* iommufd */ + }; + union { /* Fault handler */ struct { iommu_fault_handler_t handler; void *handler_token; -- Gitee From b3af5f4f167d65af74ea62b43fd1a419e9f0b3b0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:41 -0800 Subject: [PATCH 225/278] iommufd: Implement sw_msi support natively ANBZ: #13617 commit 40f5175d0eb77f902ba8e2a5df2a8f3a218c8843 upstream. iommufd has a model where the iommu_domain can be changed while the VFIO device is attached. In this case, the MSI should continue to work. This corner case has not worked because the dma-iommu implementation of sw_msi is tied to a single domain. 
Implement the sw_msi mapping directly and use a global per-fd table to associate assigned IOVA to the MSI pages. This allows the MSI pages to be loaded into a domain before it is attached ensuring that MSI is not disrupted. Link: https://patch.msgid.link/r/e13d23eeacd67c0a692fc468c85b483f4dd51c57.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/device.c | 161 ++++++++++++++++++++---- drivers/iommu/iommufd/hw_pagetable.c | 3 + drivers/iommu/iommufd/iommufd_private.h | 23 +++- drivers/iommu/iommufd/main.c | 9 ++ 4 files changed, 173 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index c5537e77c34c..e2e56c429ac6 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "../iommu-priv.h" #include "io_pagetable.h" @@ -293,36 +294,152 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) } EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD); +/* + * Get a iommufd_sw_msi_map for the msi physical address requested by the irq + * layer. The mapping to IOVA is global to the iommufd file descriptor, every + * domain that is attached to a device using the same MSI parameters will use + * the same IOVA. + */ +static __maybe_unused struct iommufd_sw_msi_map * +iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr, + phys_addr_t sw_msi_start) +{ + struct iommufd_sw_msi_map *cur; + unsigned int max_pgoff = 0; + + lockdep_assert_held(&ictx->sw_msi_lock); + + list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { + if (cur->sw_msi_start != sw_msi_start) + continue; + max_pgoff = max(max_pgoff, cur->pgoff + 1); + if (cur->msi_addr == msi_addr) + return cur; + } + + if (ictx->sw_msi_id >= + BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap)) + return ERR_PTR(-EOVERFLOW); + + cur = kzalloc(sizeof(*cur), GFP_KERNEL); + if (!cur) + return ERR_PTR(-ENOMEM); + + cur->sw_msi_start = sw_msi_start; + cur->msi_addr = msi_addr; + cur->pgoff = max_pgoff; + cur->id = ictx->sw_msi_id++; + list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list); + return cur; +} + +static int iommufd_sw_msi_install(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *hwpt_paging, + struct iommufd_sw_msi_map *msi_map) +{ + unsigned long iova; + + lockdep_assert_held(&ictx->sw_msi_lock); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + if (!test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap)) { + int rc; + + rc = iommu_map(hwpt_paging->common.domain, iova, + msi_map->msi_addr, PAGE_SIZE, + IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO, + GFP_KERNEL_ACCOUNT); + if (rc) + return rc; + __set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap); + } + return 0; +} + +/* + * Called by the irq code if the platform translates the MSI address through the + * IOMMU. msi_addr is the physical address of the MSI page. iommufd will + * allocate a fd global iova for the physical page that is the same on all + * domains and devices. 
+ */ +#ifdef CONFIG_IRQ_MSI_IOMMU +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) +{ + struct device *dev = msi_desc_to_dev(desc); + struct iommufd_hwpt_paging *hwpt_paging; + struct iommu_attach_handle *raw_handle; + struct iommufd_attach_handle *handle; + struct iommufd_sw_msi_map *msi_map; + struct iommufd_ctx *ictx; + unsigned long iova; + int rc; + + /* + * It is safe to call iommu_attach_handle_get() here because the iommu + * core code invokes this under the group mutex which also prevents any + * change of the attach handle for the duration of this function. + */ + iommu_group_mutex_assert(dev); + + raw_handle = + iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0); + if (IS_ERR(raw_handle)) + return 0; + hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt); + + handle = to_iommufd_handle(raw_handle); + /* No IOMMU_RESV_SW_MSI means no change to the msi_msg */ + if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX) + return 0; + + ictx = handle->idev->ictx; + guard(mutex)(&ictx->sw_msi_lock); + /* + * The input msi_addr is the exact byte offset of the MSI doorbell, we + * assume the caller has checked that it is contained with a MMIO region + * that is secure to map at PAGE_SIZE. + */ + msi_map = iommufd_sw_msi_get_map(handle->idev->ictx, + msi_addr & PAGE_MASK, + handle->idev->igroup->sw_msi_start); + if (IS_ERR(msi_map)) + return PTR_ERR(msi_map); + + rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map); + if (rc) + return rc; + __set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT); + return 0; +} +#endif + static int iommufd_group_setup_msi(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) { - phys_addr_t sw_msi_start = igroup->sw_msi_start; - int rc; + struct iommufd_ctx *ictx = igroup->ictx; + struct iommufd_sw_msi_map *cur; + + if (igroup->sw_msi_start == PHYS_ADDR_MAX) + return 0; /* - * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to - * call iommu_get_msi_cookie() on its behalf. This is necessary to setup - * the MSI window so iommu_dma_prepare_msi() can install pages into our - * domain after request_irq(). If it is not done interrupts will not - * work on this domain. - * - * FIXME: This is conceptually broken for iommufd since we want to allow - * userspace to change the domains, eg switch from an identity IOAS to a - * DMA IOAS. There is currently no way to create a MSI window that - * matches what the IRQ layer actually expects in a newly created - * domain. + * Install all the MSI pages the device has been using into the domain */ - if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) { - rc = iommu_get_msi_cookie(hwpt_paging->common.domain, - sw_msi_start); + guard(mutex)(&ictx->sw_msi_lock); + list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { + int rc; + + if (cur->sw_msi_start != igroup->sw_msi_start || + !test_bit(cur->id, igroup->required_sw_msi.bitmap)) + continue; + + rc = iommufd_sw_msi_install(ictx, hwpt_paging, cur); if (rc) return rc; - - /* - * iommu_get_msi_cookie() can only be called once per domain, - * it returns -EBUSY on later calls. 
- */ - hwpt_paging->msi_cookie = true; } return 0; } diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 2641d50f46cf..7de6e914232e 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -156,6 +156,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } } + iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); /* * Set the coherency mode before we do iopt_table_add_domain() as some @@ -251,6 +252,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, goto out_abort; } hwpt->domain->owner = ops; + iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EINVAL; @@ -307,6 +309,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, goto out_abort; } hwpt->domain->owner = viommu->iommu_dev->ops; + iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EINVAL; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 8e0e3ab64747..246297452a44 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -19,6 +19,22 @@ struct iommu_group; struct iommu_option; struct iommufd_device; +struct iommufd_sw_msi_map { + struct list_head sw_msi_item; + phys_addr_t sw_msi_start; + phys_addr_t msi_addr; + unsigned int pgoff; + unsigned int id; +}; + +/* Bitmap of struct iommufd_sw_msi_map::id */ +struct iommufd_sw_msi_maps { + DECLARE_BITMAP(bitmap, 64); +}; + +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); + struct iommufd_ctx { struct file *file; struct xarray objects; @@ -26,6 +42,10 @@ struct iommufd_ctx { wait_queue_head_t destroy_wait; struct rw_semaphore ioas_creation_lock; + struct mutex sw_msi_lock; + struct list_head sw_msi_list; + unsigned int sw_msi_id; + u8 account_mode; /* Compatibility with VFIO no iommu */ u8 no_iommu_mode; @@ -283,10 +303,10 @@ struct iommufd_hwpt_paging { struct iommufd_ioas *ioas; bool auto_domain : 1; bool enforce_cache_coherency : 1; - bool msi_cookie : 1; bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; + struct iommufd_sw_msi_maps present_sw_msi; }; struct iommufd_hwpt_nested { @@ -383,6 +403,7 @@ struct iommufd_group { struct iommu_group *group; struct iommufd_hw_pagetable *hwpt; struct list_head device_list; + struct iommufd_sw_msi_maps required_sw_msi; phys_addr_t sw_msi_start; }; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index bffecd62775d..d50907f69c94 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -227,6 +227,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) xa_init(&ictx->groups); ictx->file = filp; init_waitqueue_head(&ictx->destroy_wait); + mutex_init(&ictx->sw_msi_lock); + INIT_LIST_HEAD(&ictx->sw_msi_list); filp->private_data = ictx; return 0; } @@ -234,6 +236,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) static int iommufd_fops_release(struct inode *inode, struct file *filp) { struct iommufd_ctx *ictx = filp->private_data; + struct iommufd_sw_msi_map *next; + struct iommufd_sw_msi_map *cur; struct iommufd_object *obj; /* @@ -262,6 +266,11 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp) break; } WARN_ON(!xa_empty(&ictx->groups)); + + 
mutex_destroy(&ictx->sw_msi_lock); + list_for_each_entry_safe(cur, next, &ictx->sw_msi_list, sw_msi_item) + kfree(cur); + kfree(ictx); return 0; } -- Gitee From a0d914f2bb3b5d51344e97ca2a150d1501e311bd Mon Sep 17 00:00:00 2001 From: Alejandro Jimenez Date: Mon, 6 Jan 2025 19:14:13 +0000 Subject: [PATCH 226/278] iommu/amd: Preserve default DTE fields when updating Host Page Table Root ANBZ: #13617 commit c5b0320bbf79548fbf058a3925a07c8f281beeab upstream. When updating the page table root field on the DTE, avoid overwriting any bits that are already set. The earlier call to make_clear_dte() writes default values that all DTEs must have set (currently DTE[V]), and those must be preserved. Currently this doesn't cause problems since the page table root update is the first field that is set after make_clear_dte() is called, and DTE_FLAG_V is set again later along with the permission bits (IR/IW). Remove this redundant assignment too. Fixes: fd5dff9de4be ("iommu/amd: Modify set_dte_entry() to use 256-bit DTE helpers") Signed-off-by: Alejandro Jimenez Reviewed-by: Dheeraj Kumar Srivastava Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250106191413.3107140-1-alejandro.j.jimenez@oracle.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 6296a6e4664e..052e2fe25b97 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2052,12 +2052,12 @@ static void set_dte_entry(struct amd_iommu *iommu, make_clear_dte(dev_data, dte, &new); if (domain->iop.mode != PAGE_MODE_NONE) - new.data[0] = iommu_virt_to_phys(domain->iop.root); + new.data[0] |= iommu_virt_to_phys(domain->iop.root); new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; - new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V; + new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW; /* * When SNP is enabled, we can only support TV=1 with non-zero domain ID. -- Gitee From aaedad475ed9eaf23731359912e0997fed158a34 Mon Sep 17 00:00:00 2001 From: Jerry Snitselaar Date: Fri, 28 Feb 2025 18:27:25 +0800 Subject: [PATCH 227/278] iommu/vt-d: Remove device comparison in context_setup_pass_through_cb ANBZ: #13617 commit 64f792981e35e191eb619f6f2fefab76cc7d6112 upstream. Remove the device comparison check in context_setup_pass_through_cb. pci_for_each_dma_alias already makes a decision on whether the callback function should be called for a device. With the check in place it will fail to create context entries for aliases as it walks up to the root bus. 
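[Editor's note: illustrative sketch, not part of the commit. It shows the pci_for_each_dma_alias() pattern this fix relies on; foo_setup_context() is a hypothetical stand-in for the driver's per-alias work.]

	static int foo_alias_cb(struct pci_dev *pdev, u16 alias, void *data)
	{
		struct device *dev = data;

		/* pci_for_each_dma_alias() already decides which (pdev, alias)
		 * pairs the callback sees, including bridges up to the root
		 * bus, so the callback must not filter on dev != &pdev->dev or
		 * those upstream aliases never get context entries.
		 */
		return foo_setup_context(dev, PCI_BUS_NUM(alias), alias & 0xff);
	}

	pci_for_each_dma_alias(to_pci_dev(dev), foo_alias_cb, dev);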
Fixes: 2031c469f816 ("iommu/vt-d: Add support for static identity domain") Closes: https://lore.kernel.org/linux-iommu/82499eb6-00b7-4f83-879a-e97b4144f576@linux.intel.com/ Cc: stable@vger.kernel.org Signed-off-by: Jerry Snitselaar Link: https://lore.kernel.org/r/20250224180316.140123-1-jsnitsel@redhat.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 23460dccd7eb..64fc6d599b0d 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4437,9 +4437,6 @@ static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void * { struct device *dev = data; - if (dev != &pdev->dev) - return 0; - return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff); } -- Gitee From 8fb07d2b23d08535abdc5b97265ab6ab6a9bbd86 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 28 Feb 2025 18:27:26 +0800 Subject: [PATCH 228/278] iommu/vt-d: Fix suspicious RCU usage ANBZ: #13617 commit b150654f74bf0df8e6a7936d5ec51400d9ec06d8 upstream. Commit ("iommu/vt-d: Allocate DMAR fault interrupts locally") moved the call to enable_drhd_fault_handling() to a code path that does not hold any lock while traversing the drhd list. Fix it by ensuring the dmar_global_lock lock is held when traversing the drhd list. Without this fix, the following warning is triggered: ============================= WARNING: suspicious RCU usage 6.14.0-rc3 #55 Not tainted ----------------------------- drivers/iommu/intel/dmar.c:2046 RCU-list traversed in non-reader section!! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 2 locks held by cpuhp/1/23: #0: ffffffff84a67c50 (cpu_hotplug_lock){++++}-{0:0}, at: cpuhp_thread_fun+0x87/0x2c0 #1: ffffffff84a6a380 (cpuhp_state-up){+.+.}-{0:0}, at: cpuhp_thread_fun+0x87/0x2c0 stack backtrace: CPU: 1 UID: 0 PID: 23 Comm: cpuhp/1 Not tainted 6.14.0-rc3 #55 Call Trace: dump_stack_lvl+0xb7/0xd0 lockdep_rcu_suspicious+0x159/0x1f0 ? __pfx_enable_drhd_fault_handling+0x10/0x10 enable_drhd_fault_handling+0x151/0x180 cpuhp_invoke_callback+0x1df/0x990 cpuhp_thread_fun+0x1ea/0x2c0 smpboot_thread_fn+0x1f5/0x2e0 ? __pfx_smpboot_thread_fn+0x10/0x10 kthread+0x12a/0x2d0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x4a/0x60 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Holding the lock in enable_drhd_fault_handling() triggers a lockdep splat about a possible deadlock between dmar_global_lock and cpu_hotplug_lock. This is avoided by not holding dmar_global_lock when calling iommu_device_register(), which initiates the device probe process. Fixes: d74169ceb0d2 ("iommu/vt-d: Allocate DMAR fault interrupts locally") Reported-and-tested-by: Ido Schimmel Closes: https://lore.kernel.org/linux-iommu/Zx9OwdLIc_VoQ0-a@shredder.mtl.com/ Tested-by: Breno Leitao Cc: stable@vger.kernel.org Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20250218022422.2315082-1-baolu.lu@linux.intel.com Tested-by: Ido Schimmel Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/dmar.c | 1 + drivers/iommu/intel/iommu.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index b7625e2386c3..57e223e44d06 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -2100,6 +2100,7 @@ int enable_drhd_fault_handling(unsigned int cpu) /* * Enable fault control interrupt. 
*/ + guard(rwsem_read)(&dmar_global_lock); for_each_iommu(iommu, drhd) { u32 fault_status; int ret; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 64fc6d599b0d..130e74789c99 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3205,7 +3205,14 @@ int __init intel_iommu_init(void) iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, "%s", iommu->name); + /* + * The iommu device probe is protected by the iommu_probe_device_lock. + * Release the dmar_global_lock before entering the device probe path + * to avoid unnecessary lock order splat. + */ + up_read(&dmar_global_lock); iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); + down_read(&dmar_global_lock); iommu_pmu_register(iommu); } -- Gitee From 7bbaa2b3a2131a70762368cc8fcf5dff259f9784 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Feb 2025 17:18:46 -0800 Subject: [PATCH 229/278] iommu: Make @handle mandatory in iommu_{attach|replace}_group_handle() ANBZ: #13617 commit 237603a46abf9637f9b71c6225293fac2b4d6ef7 upstream. Caller of the two APIs always provide a valid handle, make @handle as mandatory parameter. Take this chance incoporate the handle->domain set under the protection of group->mutex in iommu_attach_group_handle(). Link: https://patch.msgid.link/r/20250226011849.5102-2-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 7df07cec671f..3f22de6c10ce 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3557,10 +3557,11 @@ int iommu_attach_group_handle(struct iommu_domain *domain, { int ret; - if (handle) - handle->domain = domain; + if (!handle) + return -EINVAL; mutex_lock(&group->mutex); + handle->domain = domain; ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); if (ret) goto err_unlock; @@ -3614,16 +3615,14 @@ int iommu_replace_group_handle(struct iommu_group *group, void *curr; int ret; - if (!new_domain) + if (!new_domain || !handle) return -EINVAL; mutex_lock(&group->mutex); - if (handle) { - ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); - if (ret) - goto err_unlock; - handle->domain = new_domain; - } + handle->domain = new_domain; + ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); + if (ret) + goto err_unlock; ret = __iommu_group_set_domain(group, new_domain); if (ret) -- Gitee From ec6ea7dd2beed04593ec78f5e588932231c819eb Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Feb 2025 17:18:47 -0800 Subject: [PATCH 230/278] iommu: Drop iommu_group_replace_domain() ANBZ: #13617 commit 473ec072a63370e37dddbadb2a7cc2419a0fdb28 upstream. iommufd does not use it now, so drop it. 
Link: https://patch.msgid.link/r/20250226011849.5102-3-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen [Jay Chen Conflicts: drivers/iommu/iommu.c minor: symbol literal confilct ] --- drivers/iommu/iommu-priv.h | 3 --- drivers/iommu/iommu.c | 35 ++++++----------------------------- 2 files changed, 6 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index de5b54eaa8bf..b4508423e13b 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -24,9 +24,6 @@ static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwsp return iommu_ops_from_fwnode(fwspec ? fwspec->iommu_fwnode : NULL); } -int iommu_group_replace_domain(struct iommu_group *group, - struct iommu_domain *new_domain); - int iommu_device_register_bus(struct iommu_device *iommu, const struct iommu_ops *ops, const struct bus_type *bus, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3f22de6c10ce..8e736d0195ef 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2226,32 +2226,6 @@ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_attach_group); -/** - * iommu_group_replace_domain - replace the domain that a group is attached to - * @group: IOMMU group that will be attached to the new domain - * @new_domain: new IOMMU domain to replace with - * - * This API allows the group to switch domains without being forced to go to - * the blocking domain in-between. - * - * If the currently attached domain is a core domain (e.g. a default_domain), - * it will act just like the iommu_attach_group(). - */ -int iommu_group_replace_domain(struct iommu_group *group, - struct iommu_domain *new_domain) -{ - int ret; - - if (!new_domain) - return -EINVAL; - - mutex_lock(&group->mutex); - ret = __iommu_group_set_domain(group, new_domain); - mutex_unlock(&group->mutex); - return ret; -} -EXPORT_SYMBOL_NS_GPL(iommu_group_replace_domain, IOMMUFD_INTERNAL); - static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, @@ -3604,9 +3578,12 @@ EXPORT_SYMBOL_NS_GPL(iommu_detach_group_handle, IOMMUFD_INTERNAL); * @new_domain: new IOMMU domain to replace with * @handle: attach handle * - * This is a variant of iommu_group_replace_domain(). It allows the caller to - * provide an attach handle for the new domain and use it when the domain is - * attached. + * This API allows the group to switch domains without being forced to go to + * the blocking domain in-between. It allows the caller to provide an attach + * handle for the new domain and use it when the domain is attached. + * + * If the currently attached domain is a core domain (e.g. a default_domain), + * it will act just like the iommu_attach_group_handle(). */ int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, -- Gitee From d82a899180f5c1038fefa88f9a4ab4cf0513d6e7 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Feb 2025 17:18:48 -0800 Subject: [PATCH 231/278] iommu: Store either domain or handle in group->pasid_array ANBZ: #13617 commit e1ea9d30d84c65e96eba27b240be5b6798350490 upstream. iommu_attach_device_pasid() only stores handle to group->pasid_array when there is a valid handle input. 
However, it makes the iommu_attach_device_pasid() unable to detect if the pasid has been attached or not previously. To be complete, let the iommu_attach_device_pasid() store the domain to group->pasid_array if no valid handle. The other users of the group->pasid_array should be updated to be consistent. e.g. the iommu_attach_group_handle() and iommu_replace_group_handle(). Link: https://patch.msgid.link/r/20250226011849.5102-4-yi.l.liu@intel.com Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 8e736d0195ef..5c3063c0f6f6 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -45,6 +45,9 @@ static unsigned int iommu_def_domain_type __read_mostly; static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT); static u32 iommu_cmd_line __read_mostly; +/* Tags used with xa_tag_pointer() in group->pasid_array */ +enum { IOMMU_PASID_ARRAY_DOMAIN = 0, IOMMU_PASID_ARRAY_HANDLE = 1 }; + struct iommu_group { struct kobject kobj; struct kobject *devices_kobj; @@ -2186,6 +2189,17 @@ struct iommu_domain *iommu_get_dma_domain(struct device *dev) return dev->iommu_group->default_domain; } +static void *iommu_make_pasid_array_entry(struct iommu_domain *domain, + struct iommu_attach_handle *handle) +{ + if (handle) { + handle->domain = domain; + return xa_tag_pointer(handle, IOMMU_PASID_ARRAY_HANDLE); + } + + return xa_tag_pointer(domain, IOMMU_PASID_ARRAY_DOMAIN); +} + static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { @@ -3394,6 +3408,7 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, struct iommu_group *group = dev->iommu_group; struct group_device *device; const struct iommu_ops *ops; + void *entry; int ret; if (!group) @@ -3417,10 +3432,9 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, } } - if (handle) - handle->domain = domain; + entry = iommu_make_pasid_array_entry(domain, handle); - ret = xa_insert(&group->pasid_array, pasid, handle, GFP_KERNEL); + ret = xa_insert(&group->pasid_array, pasid, entry, GFP_KERNEL); if (ret) goto out_unlock; @@ -3500,13 +3514,17 @@ struct iommu_attach_handle * iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int type) { struct iommu_attach_handle *handle; + void *entry; xa_lock(&group->pasid_array); - handle = xa_load(&group->pasid_array, pasid); - if (!handle) + entry = xa_load(&group->pasid_array, pasid); + if (!entry || xa_pointer_tag(entry) != IOMMU_PASID_ARRAY_HANDLE) { handle = ERR_PTR(-ENOENT); - else if (type && handle->domain->type != type) - handle = ERR_PTR(-EBUSY); + } else { + handle = xa_untag_pointer(entry); + if (type && handle->domain->type != type) + handle = ERR_PTR(-EBUSY); + } xa_unlock(&group->pasid_array); return handle; @@ -3529,14 +3547,15 @@ int iommu_attach_group_handle(struct iommu_domain *domain, struct iommu_group *group, struct iommu_attach_handle *handle) { + void *entry; int ret; if (!handle) return -EINVAL; mutex_lock(&group->mutex); - handle->domain = domain; - ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); + entry = iommu_make_pasid_array_entry(domain, handle); + ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); 
if (ret) goto err_unlock; @@ -3589,14 +3608,14 @@ int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, struct iommu_attach_handle *handle) { - void *curr; + void *curr, *entry; int ret; if (!new_domain || !handle) return -EINVAL; mutex_lock(&group->mutex); - handle->domain = new_domain; + entry = iommu_make_pasid_array_entry(new_domain, handle); ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); if (ret) goto err_unlock; @@ -3605,7 +3624,7 @@ int iommu_replace_group_handle(struct iommu_group *group, if (ret) goto err_release; - curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); + curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); WARN_ON(xa_is_err(curr)); mutex_unlock(&group->mutex); -- Gitee From 728f92e9fcf4ac73a6d6fb6aea6d01435ce0f3ae Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Feb 2025 17:18:49 -0800 Subject: [PATCH 232/278] iommu: Swap the order of setting group->pasid_array and calling attach op of iommu drivers ANBZ: #13617 commit 5e9f822c9c683ae884fa5e71df41d1647b2876c6 upstream. The current implementation stores entry to the group->pasid_array before the underlying iommu driver has successfully set the new domain. This can lead to issues where PRIs are received on the new domain before the attach operation is completed. This patch swaps the order of operations to ensure that the domain is set in the underlying iommu driver before updating the group->pasid_array. Link: https://patch.msgid.link/r/20250226011849.5102-5-yi.l.liu@intel.com Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 48 ++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5c3063c0f6f6..743d370394f1 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3434,13 +3434,29 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, entry = iommu_make_pasid_array_entry(domain, handle); - ret = xa_insert(&group->pasid_array, pasid, entry, GFP_KERNEL); + /* + * Entry present is a failure case. Use xa_insert() instead of + * xa_reserve(). + */ + ret = xa_insert(&group->pasid_array, pasid, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) goto out_unlock; ret = __iommu_set_group_pasid(domain, group, pasid); - if (ret) - xa_erase(&group->pasid_array, pasid); + if (ret) { + xa_release(&group->pasid_array, pasid); + goto out_unlock; + } + + /* + * The xa_insert() above reserved the memory, and the group->mutex is + * held, this cannot fail. The new domain cannot be visible until the + * operation succeeds as we cannot tolerate PRIs becoming concurrently + * queued and then failing attach. 
+ */ + WARN_ON(xa_is_err(xa_store(&group->pasid_array, + pasid, entry, GFP_KERNEL))); + out_unlock: mutex_unlock(&group->mutex); return ret; @@ -3555,19 +3571,27 @@ int iommu_attach_group_handle(struct iommu_domain *domain, mutex_lock(&group->mutex); entry = iommu_make_pasid_array_entry(domain, handle); - ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); + ret = xa_insert(&group->pasid_array, + IOMMU_NO_PASID, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) - goto err_unlock; + goto out_unlock; ret = __iommu_attach_group(domain, group); - if (ret) - goto err_erase; - mutex_unlock(&group->mutex); + if (ret) { + xa_release(&group->pasid_array, IOMMU_NO_PASID); + goto out_unlock; + } - return 0; -err_erase: - xa_erase(&group->pasid_array, IOMMU_NO_PASID); -err_unlock: + /* + * The xa_insert() above reserved the memory, and the group->mutex is + * held, this cannot fail. The new domain cannot be visible until the + * operation succeeds as we cannot tolerate PRIs becoming concurrently + * queued and then failing attach. + */ + WARN_ON(xa_is_err(xa_store(&group->pasid_array, + IOMMU_NO_PASID, entry, GFP_KERNEL))); + +out_unlock: mutex_unlock(&group->mutex); return ret; } -- Gitee From e44fa7722bfe2c7b00757fe672ace8b800b209bd Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 26 Feb 2025 02:40:12 -0800 Subject: [PATCH 233/278] iommufd: Disallow allocating nested parent domain with fault ID ANBZ: #13617 commit 1062d81086156e42878d701b816d2f368b53a77c upstream. Allocating a domain with a fault ID indicates that the domain is faultable. However, there is a gap for the nested parent domain to support PRI. Some hardware lacks the capability to distinguish whether PRI occurs at stage 1 or stage 2. This limitation may require software-based page table walking to resolve. Since no in-tree IOMMU driver currently supports this functionality, it is disallowed. For more details, refer to the related discussion at [1]. 
[1] https://lore.kernel.org/linux-iommu/bd1655c6-8b2f-4cfa-adb1-badc00d01811@intel.com/ Link: https://patch.msgid.link/r/20250226104012.82079-1-yi.l.liu@intel.com Suggested-by: Lu Baolu Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/hw_pagetable.c | 3 +++ tools/testing/selftests/iommu/iommufd.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 7de6e914232e..268315b1d8bc 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -126,6 +126,9 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && !device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) return ERR_PTR(-EOPNOTSUPP); + if ((flags & IOMMU_HWPT_FAULT_ID_VALID) && + (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)) + return ERR_PTR(-EOPNOTSUPP); hwpt_paging = __iommufd_object_alloc( ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index a1b2b657999d..618c03bb6509 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -439,6 +439,10 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) &test_hwpt_id); test_err_hwpt_alloc(EINVAL, self->device_id, self->device_id, 0, &test_hwpt_id); + test_err_hwpt_alloc(EOPNOTSUPP, self->device_id, self->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_FAULT_ID_VALID, + &test_hwpt_id); test_cmd_hwpt_alloc(self->device_id, self->ioas_id, IOMMU_HWPT_ALLOC_NEST_PARENT, -- Gitee From d5fe3e457e9da3f7a96a01f79f8e3e41ff4519aa Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 5 Mar 2025 13:18:00 -0800 Subject: [PATCH 234/278] iommufd: Set domain->iommufd_hwpt in all hwpt->domain allocators ANBZ: #13617 commit 897008d0f7672c3510281e826232a32d62710323 upstream. Setting domain->iommufd_hwpt in iommufd_hwpt_alloc() only covers the HWPT allocations from user space, but not for an auto domain. This resulted in a NULL pointer access in the auto domain pathway: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008 pc : iommufd_sw_msi+0x54/0x2b0 lr : iommufd_sw_msi+0x40/0x2b0 Call trace: iommufd_sw_msi+0x54/0x2b0 (P) iommu_dma_prepare_msi+0x64/0xa8 its_irq_domain_alloc+0xf0/0x2c0 irq_domain_alloc_irqs_parent+0x2c/0xa8 msi_domain_alloc+0xa0/0x1a8 Since iommufd_sw_msi() requires to access the domain->iommufd_hwpt, it is better to set that explicitly prior to calling iommu_domain_set_sw_msi(). 
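[Editor's note: a minimal editorial sketch, not part of the commit, of the ordering this fix enforces in each hwpt->domain allocation path, including auto domains:]

	/* iommufd_sw_msi() dereferences domain->iommufd_hwpt, so publish the
	 * back-pointer before the sw_msi hook can ever be invoked.
	 */
	hwpt->domain->iommufd_hwpt = hwpt;
	iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);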
Fixes: 748706d7ca06 ("iommu: Turn fault_data to iommufd private pointer") Link: https://patch.msgid.link/r/20250305211800.229465-1-nicolinc@nvidia.com Reported-by: Ankit Agrawal Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Tested-by: Ankit Agrawal Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/hw_pagetable.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 268315b1d8bc..1d4cfe3677dc 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -159,6 +159,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } } + hwpt->domain->iommufd_hwpt = hwpt; iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); /* @@ -255,6 +256,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, goto out_abort; } hwpt->domain->owner = ops; + hwpt->domain->iommufd_hwpt = hwpt; iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { @@ -311,6 +313,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, hwpt->domain = NULL; goto out_abort; } + hwpt->domain->iommufd_hwpt = hwpt; hwpt->domain->owner = viommu->iommu_dev->ops; iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); @@ -415,7 +418,6 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) refcount_inc(&fault->obj.users); iommufd_put_object(ucmd->ictx, &fault->obj); } - hwpt->domain->iommufd_hwpt = hwpt; cmd->out_hwpt_id = hwpt->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); -- Gitee From 3247636a3244d60c2b66b6f1b0e12a9bf83f04e7 Mon Sep 17 00:00:00 2001 From: Jay Chen Date: Thu, 21 Aug 2025 10:20:23 +0800 Subject: [PATCH 235/278] anolis: Revert "anolis: iommu/dma: Fix not fully traversing iova reservations issue" ANBZ: #13617 This reverts commit 8e688d7457cadbbde0ce56b2ce1983c69f70a7ca. --- drivers/iommu/dma-iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 16593f218b9a..a11e1425b02b 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -736,7 +736,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev } ret = 0; - goto iova_reserve; + goto done_unlock; } init_iova_domain(iovad, 1UL << order, base_pfn); @@ -751,7 +751,6 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev (!device_iommu_capable(dev, IOMMU_CAP_DEFERRED_FLUSH) || iommu_dma_init_fq(domain))) domain->type = IOMMU_DOMAIN_DMA; -iova_reserve: ret = iova_reserve_iommu_regions(dev, domain); done_unlock: -- Gitee From 5d783bd08fad87ee60e90a671feb388812808a64 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 26 Feb 2025 15:57:49 +0000 Subject: [PATCH 236/278] iommu/dma: Remove redundant locking ANBZ: #13617 commit 032d7e435cbd81ffa37b846aea5a236f3f5912a3 upstream. This reverts commit ac9a5d522bb80be50ea84965699e1c8257d745ce. iommu_dma_init_domain() is now only called under the group mutex, as it should be given that that the default domain belongs to the group, so the additional internal locking is no longer needed. 
Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/a943d4c198e6a1fffe998337d577dc3aa7f660a9.1740585469.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/dma-iommu.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index a11e1425b02b..94e85168a600 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -87,7 +87,6 @@ struct iommu_dma_cookie { struct iommu_domain *fq_domain; /* Options for dma-iommu use */ struct iommu_dma_options options; - struct mutex mutex; }; static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled); @@ -401,7 +400,6 @@ int iommu_get_dma_cookie(struct iommu_domain *domain) if (!domain->iova_cookie) return -ENOMEM; - mutex_init(&domain->iova_cookie->mutex); iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); return 0; } @@ -726,23 +724,20 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev domain->geometry.aperture_start >> order); /* start_pfn is always nonzero for an already-initialised domain */ - mutex_lock(&cookie->mutex); if (iovad->start_pfn) { if (1UL << order != iovad->granule || base_pfn != iovad->start_pfn) { pr_warn("Incompatible range for DMA domain\n"); - ret = -EFAULT; - goto done_unlock; + return -EFAULT; } - ret = 0; - goto done_unlock; + return 0; } init_iova_domain(iovad, 1UL << order, base_pfn); ret = iova_domain_init_rcaches(iovad); if (ret) - goto done_unlock; + return ret; iommu_dma_init_options(&cookie->options, dev); @@ -751,11 +746,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev (!device_iommu_capable(dev, IOMMU_CAP_DEFERRED_FLUSH) || iommu_dma_init_fq(domain))) domain->type = IOMMU_DOMAIN_DMA; - ret = iova_reserve_iommu_regions(dev, domain); - -done_unlock: - mutex_unlock(&cookie->mutex); - return ret; + return iova_reserve_iommu_regions(dev, domain); } /** -- Gitee From 36885e154dd8170fe21cc03ce428965da9f85fb9 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 27 Feb 2025 16:23:14 +0000 Subject: [PATCH 237/278] iommu/amd: Log IOMMU control register in event log path ANBZ: #13617 commit 558d2bbd457d513d9226f88fef0d9c782e595f6e upstream. Useful for debugging ILLEGAL_DEV_TABLE_ENTRY events as some of the DTE settings depends on Control register settings. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250227162320.5805-2-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 052e2fe25b97..26f1b7a104ec 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -868,7 +868,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) int type, devid, flags, tag; volatile u32 *event = __evt; int count = 0; - u64 address; + u64 address, ctrl; u32 pasid; retry: @@ -878,6 +878,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) (event[1] & EVENT_DOMID_MASK_LO); flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; address = (u64)(((u64)event[3]) << 32) | event[2]; + ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); if (type == 0) { /* Did we hit the erratum? 
*/ @@ -899,6 +900,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n", iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), pasid, address, flags); + dev_err(dev, "Control Reg : 0x%llx\n", ctrl); dump_dte_entry(iommu, devid); break; case EVENT_TYPE_DEV_TAB_ERR: -- Gitee From 0b256a880cedc76fcd3ab388ece5e74d6e6d071c Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 27 Feb 2025 16:23:15 +0000 Subject: [PATCH 238/278] iommu/amd: Remove unused variable ANBZ: #13617 commit e481f8a5db2e9d3a2fbb4872cc08ad11fbc1af68 upstream. Remove 'amd_iommu_aperture_order' as its not used since commit d9cfed925448. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250227162320.5805-3-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 23caea22f8dc..3f57bdd0a659 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -928,9 +928,6 @@ struct unity_map_entry { * Data structures for device handling */ -/* size of the dma_ops aperture as power of 2 */ -extern unsigned amd_iommu_aperture_order; - extern bool amd_iommu_force_isolation; /* Max levels of glxval supported */ -- Gitee From 33c6c806470cd0aa371c3ba123d5c84c815c31ed Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 27 Feb 2025 16:23:17 +0000 Subject: [PATCH 239/278] iommu/amd: Remove outdated comment ANBZ: #13617 commit 60785fceda6f4cbdee425d3320d0f541b166b5bf upstream. Remove outdated comment. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250227162320.5805-5-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/io_pgtable.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index f3399087859f..26cf562dde11 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -47,13 +47,6 @@ static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, return fpte; } -/**************************************************************************** - * - * The functions below are used the create the page table mappings for - * unity mapped regions. - * - ****************************************************************************/ - static void free_pt_page(u64 *pt, struct list_head *freelist) { struct page *p = virt_to_page(pt); -- Gitee From 588494486a5b4b8049760180dc01c07b59149d59 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 25 Jan 2024 22:11:01 -0600 Subject: [PATCH 240/278] x86/cpufeatures: Add SEV-SNP CPU feature ANBZ: #13617 commit b6e0f6666f74f0794530e3557f5b0a4ce37bd556 upstream. Add CPU feature detection for Secure Encrypted Virtualization with Secure Nested Paging. This feature adds a strong memory integrity protection to help prevent malicious hypervisor-based attacks like data replay, memory re-mapping, and more. Since enabling the SNP CPU feature imposes a number of additional requirements on host initialization and handling legacy firmware APIs for SEV/SEV-ES guests, only introduce the CPU feature bit so that the relevant handling can be added, but leave it disabled via a disabled-features mask. 
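[Editor's note: editorial sketch, not part of the commit, of how the disabled-features mask keeps the new bit inert; setup_snp_host() is a hypothetical placeholder for the host SNP setup added later in the series.]

	/* While X86_FEATURE_SEV_SNP sits in DISABLED_MASK19,
	 * cpu_feature_enabled() folds this test to false at compile time, so
	 * SNP-only paths can be merged without being reachable yet.
	 */
	if (cpu_feature_enabled(X86_FEATURE_SEV_SNP))
		setup_snp_host();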
Once all the necessary changes needed to maintain legacy SEV/SEV-ES support are introduced in subsequent patches, the SNP feature bit will be unmasked/enabled. Signed-off-by: Brijesh Singh Signed-off-by: Jarkko Sakkinen Signed-off-by: Ashish Kalra Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240126041126.1927228-2-michael.roth@amd.com Signed-off-by: sa-buc --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/disabled-features.h | 4 +++- arch/x86/kernel/cpu/amd.c | 5 +++-- tools/arch/x86/include/asm/cpufeatures.h | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index e68c256d3493..e1aa32f2ecb0 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -467,6 +467,7 @@ #define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ #define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ #define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SEV_SNP (19*32+ 4) /* AMD Secure Encrypted Virtualization - Secure Nested Paging */ #define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ #define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index c1e800b636f4..0568c79f1ae5 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -117,6 +117,8 @@ #define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) #endif +#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) + /* * Make sure to add features to the correct mask */ @@ -141,7 +143,7 @@ DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 (DISABLE_IBT) -#define DISABLED_MASK19 0 +#define DISABLED_MASK19 (DISABLE_SEV_SNP) #define DISABLED_MASK20 0 #define DISABLED_MASK21 0 #define DISABLED_MASK22 0 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1e51f90a7233..85b8f7d5aa0d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -671,8 +671,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) * SME feature (set in scattered.c). * If the kernel has not enabled SME via any means then * don't advertise the SME feature. - * For SEV: If BIOS has not enabled SEV then don't advertise the - * SEV and SEV_ES feature (set in scattered.c). + * For SEV: If BIOS has not enabled SEV then don't advertise SEV and + * any additional functionality based on it. * * In all cases, since support for SME and SEV requires long mode, * don't advertise the feature under CONFIG_X86_32. 
@@ -707,6 +707,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) clear_sev: setup_clear_cpu_cap(X86_FEATURE_SEV); setup_clear_cpu_cap(X86_FEATURE_SEV_ES); + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); } } diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index bc82ca1bb346..a65fe0a72c13 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -432,6 +432,7 @@ #define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ #define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ #define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SEV_SNP (19*32+ 4) /* AMD Secure Encrypted Virtualization - Secure Nested Paging */ #define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ #define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */ -- Gitee From 4edeb709b761d8f631aac5b7d4379da7f49bccd3 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 22 Nov 2024 22:07:07 +0100 Subject: [PATCH 241/278] x86/cpufeatures: Remove "AMD" from the comments to the AMD-specific leaf ANBZ: #13617 commit 288bba2f4c8be1e1b9c8bc2e087ce677faf9918a upstream. 0x8000001f.EAX is an AMD-specific leaf so there's no need to have "AMD" in almost every feature's comment. Zap it and make the text more readable this way. No functional changes. Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241122210707.12742-1-bp@kernel.org Signed-off-by: sa-buc [Jay Chen: fix merge conflict 408323581b722c9bd504dd296920f392049a7f52 ] Signed-off-by: Jay Chen --- arch/x86/include/asm/cpufeatures.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index e1aa32f2ecb0..ad798e03a12c 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -463,14 +463,14 @@ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ /* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ -#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */ -#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ -#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ -#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ -#define X86_FEATURE_SEV_SNP (19*32+ 4) /* AMD Secure Encrypted Virtualization - Secure Nested Paging */ -#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ -#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ -#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */ +#define X86_FEATURE_SME (19*32+ 0) /* "sme" Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* "sev" Secure Encrypted Virtualization */ +#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" Secure Encrypted Virtualization - Secure Nested Paging */ +#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */ +#define X86_FEATURE_SME_COHERENT
(19*32+10) /* hardware-enforced cache coherency */ +#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */ /* HYGON 3rd CSV */ #define X86_FEATURE_CSV3 (19*32 + 30) /* HYGON 3rd CSV */ -- Gitee From 9d61426f9e2e730c3a063637988a321e344cb1f3 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Thu, 25 Jan 2024 22:11:03 -0600 Subject: [PATCH 242/278] iommu/amd: Don't rely on external callers to enable IOMMU SNP support ANBZ: #13617 commit 04d65a9dbb33e20500005e151d720acead78c539 upstream. Currently, the expectation is that the kernel will call amd_iommu_snp_enable() to perform various checks and set the amd_iommu_snp_en flag that the IOMMU uses to adjust its setup routines to account for additional requirements on hosts where SNP is enabled. This is somewhat fragile as it relies on this call being done prior to IOMMU setup. It is more robust to just do this automatically as part of IOMMU initialization, so rework the code accordingly. There is still a need to export information about whether or not the IOMMU is configured in a manner compatible with SNP, so relocate the existing amd_iommu_snp_en flag so it can be used to convey that information in place of the return code that was previously provided by calls to amd_iommu_snp_enable(). While here, also adjust the kernel messages related to IOMMU SNP enablement for consistency/grammar/clarity. Suggested-by: Borislav Petkov (AMD) Signed-off-by: Ashish Kalra Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov (AMD) Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20240126041126.1927228-4-michael.roth@amd.com Signed-off-by: Jay Chen --- arch/x86/include/asm/iommu.h | 1 + drivers/iommu/amd/amd_iommu.h | 1 - drivers/iommu/amd/init.c | 69 ++++++++++++++++------------------- include/linux/amd-iommu.h | 4 -- 4 files changed, 32 insertions(+), 43 deletions(-) diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 2fd52b65deac..3be2451e7bc8 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -10,6 +10,7 @@ extern int force_iommu, no_iommu; extern int iommu_detected; extern int iommu_merge; extern int panic_on_overflow; +extern bool amd_iommu_snp_en; #ifdef CONFIG_SWIOTLB extern bool x86_swiotlb_enable; diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index f063d181dd68..d6f4cf823599 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -181,7 +181,6 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain, u64 *root, int mode); struct dev_table_entry *get_dev_table(struct amd_iommu *iommu); -extern bool amd_iommu_snp_en; #endif struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 segid, u16 devid); diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 73ce2c4eded2..aaa5bab512a3 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3205,6 +3205,36 @@ static bool __init detect_ivrs(void) return true; } +static void iommu_snp_enable(void) +{ +#ifdef CONFIG_KVM_AMD_SEV + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + return; + /* + * The SNP support requires that IOMMU must be enabled, and is + * not configured in the passthrough mode. 
+ */ + if (no_iommu || iommu_default_passthrough()) { + pr_err("SNP: IOMMU disabled or configured in passthrough mode, SNP cannot be supported.\n"); + return; + } + + amd_iommu_snp_en = check_feature(FEATURE_SNP); + if (!amd_iommu_snp_en) { + pr_err("SNP: IOMMU SNP feature not enabled, SNP cannot be supported.\n"); + return; + } + + pr_info("IOMMU SNP support enabled.\n"); + + /* Enforce IOMMU v1 pagetable when SNP is enabled. */ + if (amd_iommu_pgtable != AMD_IOMMU_V1) { + pr_warn("Forcing use of AMD IOMMU v1 page table due to SNP.\n"); + amd_iommu_pgtable = AMD_IOMMU_V1; + } +#endif +} + /**************************************************************************** * * AMD IOMMU Initialization State Machine @@ -3240,6 +3270,7 @@ static int __init state_next(void) break; case IOMMU_ENABLED: register_syscore_ops(&amd_iommu_syscore_ops); + iommu_snp_enable(); ret = amd_iommu_init_pci(); init_state = ret ? IOMMU_INIT_ERROR : IOMMU_PCI_INIT; break; @@ -3759,41 +3790,3 @@ int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, true); } - -#ifdef CONFIG_AMD_MEM_ENCRYPT -int amd_iommu_snp_enable(void) -{ - /* - * The SNP support requires that IOMMU must be enabled, and is - * not configured in the passthrough mode. - */ - if (no_iommu || iommu_default_passthrough()) { - pr_err("SNP: IOMMU is disabled or configured in passthrough mode, SNP cannot be supported"); - return -EINVAL; - } - - /* - * Prevent enabling SNP after IOMMU_ENABLED state because this process - * affect how IOMMU driver sets up data structures and configures - * IOMMU hardware. - */ - if (init_state > IOMMU_ENABLED) { - pr_err("SNP: Too late to enable SNP for IOMMU.\n"); - return -EINVAL; - } - - amd_iommu_snp_en = check_feature(FEATURE_SNP); - if (!amd_iommu_snp_en) - return -EINVAL; - - pr_info("SNP enabled\n"); - - /* Enforce IOMMU v1 pagetable when SNP is enabled. */ - if (amd_iommu_pgtable != AMD_IOMMU_V1) { - pr_warn("Force to using AMD IOMMU v1 page table due to SNP\n"); - amd_iommu_pgtable = AMD_IOMMU_V1; - } - - return 0; -} -#endif diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index ab0b52a552cf..1974c0d25031 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -85,8 +85,4 @@ int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value); struct amd_iommu *get_amd_iommu(unsigned int idx); -#ifdef CONFIG_AMD_MEM_ENCRYPT -int amd_iommu_snp_enable(void); -#endif - #endif /* _ASM_X86_AMD_IOMMU_H */ -- Gitee From 326149629c63bcbd0bac01655997b4cc6d843592 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 27 Feb 2025 16:23:18 +0000 Subject: [PATCH 243/278] iommu/amd: Fix header file ANBZ: #13617 commit ee4cf9260afe8e4be6b6d64f56fa7493d051d8de upstream. Move function declaration inside AMD_IOMMU_H defination. 
Fixes: fd5dff9de4be ("iommu/amd: Modify set_dte_entry() to use 256-bit DTE helpers") Fixes: 457da5764668 ("iommu/amd: Lock DTE before updating the entry with WRITE_ONCE()") Cc: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250227162320.5805-6-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index d6f4cf823599..4f57d62cc494 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -176,12 +176,11 @@ void amd_iommu_apply_ivrs_quirks(void); #else static inline void amd_iommu_apply_ivrs_quirks(void) { } #endif +struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 segid, u16 devid); void amd_iommu_domain_set_pgtable(struct protection_domain *domain, u64 *root, int mode); struct dev_table_entry *get_dev_table(struct amd_iommu *iommu); - -#endif - -struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 segid, u16 devid); struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid); + +#endif /* AMD_IOMMU_H */ -- Gitee From aa3c8e1b78f7fd58bb9233fa9a791d26ca59cdd1 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 27 Feb 2025 16:23:19 +0000 Subject: [PATCH 244/278] iommu/amd: Remove unused forward declaration ANBZ: #13617 commit 5536e19e940be0aad326ff80870e0261c0ba5ff7 upstream. Remove unused forward declaration. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250227162320.5805-7-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/init.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index aaa5bab512a3..94f9e55dad7e 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -218,7 +218,6 @@ static bool __initdata cmdline_maps; static enum iommu_init_state init_state = IOMMU_START_STATE; static int amd_iommu_enable_interrupts(void); -static int __init iommu_go_to_state(enum iommu_init_state state); static void init_device_table_dma(struct amd_iommu_pci_seg *pci_seg); static bool amd_iommu_pre_enabled = true; -- Gitee From 946c440cece8c0d9527572679d7f316438b71b22 Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Mon, 10 Mar 2025 10:47:44 +0800 Subject: [PATCH 245/278] iommu/vt-d: Fix system hang on reboot -f ANBZ: #13617 commit 9ce7603ad3cbae64003087de57834e8c5c856a20 upstream. We found that executing the command ./a.out &;reboot -f (where a.out is a program that only executes a while(1) infinite loop) can probabilistically cause the system to hang in the intel_iommu_shutdown() function, rendering it unresponsive. Through analysis, we identified that the factors contributing to this issue are as follows: 1. The reboot -f command does not prompt the kernel to notify the application layer to perform cleanup actions, allowing the application to continue running. 2. When the kernel reaches the intel_iommu_shutdown() function, only the BSP (Bootstrap Processor) CPU is operational in the system. 3. During the execution of intel_iommu_shutdown(), the function down_write (&dmar_global_lock) causes the process to sleep and be scheduled out. 4. At this point, though the processor's interrupt flag is not cleared, allowing interrupts to be accepted. 
However, only legacy devices and NMI (Non-Maskable Interrupt) interrupts could come in, as other interrupts routing have already been disabled. If no legacy or NMI interrupts occur at this stage, the scheduler will not be able to run. 5. If the application got scheduled at this time is executing a while(1)- type loop, it will be unable to be preempted, leading to an infinite loop and causing the system to become unresponsive. To resolve this issue, the intel_iommu_shutdown() function should not execute down_write(), which can potentially cause the process to be scheduled out. Furthermore, since only the BSP is running during the later stages of the reboot, there is no need for protection against parallel access to the DMAR (DMA Remapping) unit. Therefore, the following lines could be removed: down_write(&dmar_global_lock); up_write(&dmar_global_lock); After testing, the issue has been resolved. Fixes: 6c3a44ed3c55 ("iommu/vt-d: Turn off translations at shutdown") Co-developed-by: Ethan Zhao Signed-off-by: Ethan Zhao Signed-off-by: Yunhui Cui Link: https://lore.kernel.org/r/20250303062421.17929-1-cuiyunhui@bytedance.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 130e74789c99..57fe1d2667fa 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2889,16 +2889,19 @@ void intel_iommu_shutdown(void) if (no_iommu || dmar_disabled) return; - down_write(&dmar_global_lock); + /* + * All other CPUs were brought down, hotplug interrupts were disabled, + * no lock and RCU checking needed anymore + */ + list_for_each_entry(drhd, &dmar_drhd_units, list) { + iommu = drhd->iommu; - /* Disable PMRs explicitly here. */ - for_each_iommu(iommu, drhd) + /* Disable PMRs explicitly here. */ iommu_disable_protect_mem_regions(iommu); - /* Make sure the IOMMUs are switched off */ - intel_disable_iommus(); - - up_write(&dmar_global_lock); + /* Make sure the IOMMUs are switched off */ + iommu_disable_translation(iommu); + } } static struct intel_iommu *dev_to_intel_iommu(struct device *dev) -- Gitee From efabcceab30a528d01a4db820e2c9ea65caee071 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 10 Mar 2025 10:47:45 +0800 Subject: [PATCH 246/278] iommu/vt-d: Use virt_to_phys() ANBZ: #13617 commit a8653e5cc2047b6a3354dc072a9f1ebaff157eaa upstream. If all the inlines are unwound virt_to_dma_pfn() is simply: return page_to_pfn(virt_to_page(p)) << (PAGE_SHIFT - VTD_PAGE_SHIFT); Which can be re-arranged to: (page_to_pfn(virt_to_page(p)) << PAGE_SHIFT) >> VTD_PAGE_SHIFT The only caller is: ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) re-arranged to: ((page_to_pfn(virt_to_page(tmp_page)) << PAGE_SHIFT) >> VTD_PAGE_SHIFT) << VTD_PAGE_SHIFT Which simplifies to: page_to_pfn(virt_to_page(tmp_page)) << PAGE_SHIFT That is the same as virt_to_phys(tmp_page), so just remove all of this. 
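The chain of shifts above can be checked mechanically. A small stand-alone sketch (not part of the patch; the two shift values are assumed examples, with MM pages at least as large as VT-d pages, as the old helpers required) confirms the old expression and virt_to_phys() produce the same address bits:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      16      /* assumed example value (64K MM pages) */
#define VTD_PAGE_SHIFT  12      /* assumed example value (4K VT-d pages) */

int main(void)
{
        uint64_t pfn = 0x12345;                                  /* page_to_pfn(virt_to_page(p)) */
        uint64_t dma_pfn = pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); /* old virt_to_dma_pfn() */
        uint64_t old_addr = dma_pfn << VTD_PAGE_SHIFT;           /* old pteval address bits */
        uint64_t phys = pfn << PAGE_SHIFT;                       /* virt_to_phys() equivalent */

        /* The down shift and the up shift cancel, leaving the physical address. */
        assert(old_addr == phys);
        printf("0x%llx == 0x%llx\n",
               (unsigned long long)old_addr, (unsigned long long)phys);
        return 0;
}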
Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v3-e797f4dc6918+93057-iommu_pages_jgg@nvidia.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 3 ++- drivers/iommu/intel/iommu.h | 19 ------------------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 57fe1d2667fa..7874adc35991 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -737,7 +737,8 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, return NULL; domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); - pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; + pteval = virt_to_phys(tmp_page) | DMA_PTE_READ | + DMA_PTE_WRITE; if (domain->use_first_level) pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 0a20b11b0726..755cf097ddda 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -953,25 +953,6 @@ static inline unsigned long lvl_to_nr_pages(unsigned int lvl) return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); } -/* VT-d pages must always be _smaller_ than MM pages. Otherwise things - are never going to work. */ -static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn) -{ - return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); -} -static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn) -{ - return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1; -} -static inline unsigned long page_to_dma_pfn(struct page *pg) -{ - return mm_to_dma_pfn_start(page_to_pfn(pg)); -} -static inline unsigned long virt_to_dma_pfn(void *p) -{ - return page_to_dma_pfn(virt_to_page(p)); -} - static inline void context_set_present(struct context_entry *context) { context->lo |= 1; -- Gitee From 12272977a8bb8d08cc255d348a0333c909432f35 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 10 Mar 2025 10:47:46 +0800 Subject: [PATCH 247/278] iommu/vt-d: Check if SVA is supported when attaching the SVA domain ANBZ: #13617 commit 607ba1bb096110751f6aa4b46666e0ba024ab3c2 upstream. Attach of a SVA domain should fail if SVA is not supported, move the check for SVA support out of IOMMU_DEV_FEAT_SVA and into attach. Also check when allocating a SVA domain to match other drivers. 
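As a rough illustration of the pattern (all names below are invented for the sketch, not the driver's real API), the same support check now runs both when an SVA domain is allocated and when it is attached, so attaching to a device that lacks the capability fails on its own instead of depending on a prior feature-enable call:

#include <stdio.h>

struct fake_dev    { int sva_capable; };
struct fake_domain { int valid; };

static int sva_supported(const struct fake_dev *d)
{
        return d->sva_capable ? 0 : -1;
}

static int sva_domain_alloc(const struct fake_dev *d, struct fake_domain *dom)
{
        int ret = sva_supported(d);     /* check at allocation */

        if (ret)
                return ret;
        dom->valid = 1;
        return 0;
}

static int sva_domain_attach(const struct fake_dev *d, const struct fake_domain *dom)
{
        int ret = sva_supported(d);     /* re-check at attach */

        if (ret)
                return ret;
        return dom->valid ? 0 : -1;
}

int main(void)
{
        struct fake_dev capable = { 1 }, incapable = { 0 };
        struct fake_domain dom = { 0 };

        printf("alloc on capable device:    %d\n", sva_domain_alloc(&capable, &dom));
        printf("attach to incapable device: %d\n", sva_domain_attach(&incapable, &dom));
        return 0;
}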
Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Link: https://lore.kernel.org/r/20250228092631.3425464-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 37 +------------------------------ drivers/iommu/intel/svm.c | 43 +++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7874adc35991..9a4fc8d7c232 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3921,41 +3921,6 @@ static struct iommu_group *intel_iommu_device_group(struct device *dev) return generic_device_group(dev); } -static int intel_iommu_enable_sva(struct device *dev) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu; - - if (!info || dmar_disabled) - return -EINVAL; - - iommu = info->iommu; - if (!iommu) - return -EINVAL; - - if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) - return -ENODEV; - - if (!info->pasid_enabled || !info->ats_enabled) - return -EINVAL; - - /* - * Devices having device-specific I/O fault handling should not - * support PCI/PRI. The IOMMU side has no means to check the - * capability of device-specific IOPF. Therefore, IOMMU can only - * default that if the device driver enables SVA on a non-PRI - * device, it will handle IOPF in its own way. - */ - if (!info->pri_supported) - return 0; - - /* Devices supporting PRI should have it enabled. */ - if (!info->pri_enabled) - return -EINVAL; - - return 0; -} - static int context_flip_pri(struct device_domain_info *info, bool enable) { struct intel_iommu *iommu = info->iommu; @@ -4076,7 +4041,7 @@ intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) return intel_iommu_enable_iopf(dev); case IOMMU_DEV_FEAT_SVA: - return intel_iommu_enable_sva(dev); + return 0; default: return -ENODEV; diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index f5569347591f..ba93123cb4eb 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -110,6 +110,41 @@ static const struct mmu_notifier_ops intel_mmuops = { .free_notifier = intel_mm_free_notifier, }; +static int intel_iommu_sva_supported(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu; + + if (!info || dmar_disabled) + return -EINVAL; + + iommu = info->iommu; + if (!iommu) + return -EINVAL; + + if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) + return -ENODEV; + + if (!info->pasid_enabled || !info->ats_enabled) + return -EINVAL; + + /* + * Devices having device-specific I/O fault handling should not + * support PCI/PRI. The IOMMU side has no means to check the + * capability of device-specific IOPF. Therefore, IOMMU can only + * default that if the device driver enables SVA on a non-PRI + * device, it will handle IOPF in its own way. + */ + if (!info->pri_supported) + return 0; + + /* Devices supporting PRI should have it enabled. 
*/ + if (!info->pri_enabled) + return -EINVAL; + + return 0; +} + static int intel_svm_set_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old) @@ -121,6 +156,10 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, unsigned long sflags; int ret = 0; + ret = intel_iommu_sva_supported(dev); + if (ret) + return ret; + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); if (IS_ERR(dev_pasid)) return PTR_ERR(dev_pasid); @@ -161,6 +200,10 @@ struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct dmar_domain *domain; int ret; + ret = intel_iommu_sva_supported(dev); + if (ret) + return ERR_PTR(ret); + domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return ERR_PTR(-ENOMEM); -- Gitee From 8c2e7663ad202d34a255d69b61e59f93186e86b8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Mar 2025 10:47:47 +0800 Subject: [PATCH 248/278] iommu/vt-d: Move scalable mode ATS enablement to probe path ANBZ: #13617 commit 5518f239aff1baf772c5748da3add7243c5fb5df upstream. Device ATS is currently enabled when a domain is attached to the device and disabled when the domain is detached. This creates a limitation: when the IOMMU is operating in scalable mode and IOPF is enabled, the device's domain cannot be changed. The previous code enables ATS when a domain is set to a device's RID and disables it during RID domain switch. So, if a PASID is set with a domain requiring PRI, ATS should remain enabled until the domain is removed. During the PASID domain's lifecycle, if the RID's domain changes, PRI will be disrupted because it depends on ATS, which is disabled when the blocking domain is set for the device's RID. Remove this limitation by moving ATS enablement to the device probe path. 
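A toy model of the dependency described above (hypothetical names, not the driver's code): when ATS is torn down on every RID domain switch, a PRI user tied to a PASID domain loses its prerequisite; enabling ATS once at probe time keeps it available for the device's whole IOMMU lifetime:

#include <stdbool.h>
#include <stdio.h>

struct dev_state {
        bool ats;       /* prerequisite for PRI */
        bool pri;       /* in use by a PASID domain */
};

/* Old model: ATS follows the RID domain. */
static void rid_attach_old(struct dev_state *d) { d->ats = true; }
static void rid_detach_old(struct dev_state *d) { d->ats = false; }

int main(void)
{
        struct dev_state d = { 0 };

        rid_attach_old(&d);
        d.pri = true;                   /* PASID domain starts using PRI */
        rid_detach_old(&d);             /* RID domain switch tears down ATS */
        printf("old model: PRI still backed by ATS? %s\n",
               d.ats ? "yes" : "no");   /* "no": PRI is disrupted */

        /* New model: ATS enabled once at probe, independent of domain changes. */
        d = (struct dev_state){ .ats = true };
        d.pri = true;
        /* RID domain switches no longer touch d.ats. */
        printf("new model: PRI still backed by ATS? %s\n",
               d.ats ? "yes" : "no");   /* "yes" */
        return 0;
}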
Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Zhangfei Gao Link: https://lore.kernel.org/r/20250228092631.3425464-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 51 ++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9a4fc8d7c232..1c82e9140cfe 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1173,32 +1173,28 @@ static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) return true; } -static void iommu_enable_pci_caps(struct device_domain_info *info) +static void iommu_enable_pci_ats(struct device_domain_info *info) { struct pci_dev *pdev; - if (!dev_is_pci(info->dev)) + if (!info->ats_supported) return; pdev = to_pci_dev(info->dev); - if (info->ats_supported && pci_ats_page_aligned(pdev) && - !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) + if (!pci_ats_page_aligned(pdev)) + return; + + if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT)) info->ats_enabled = 1; } -static void iommu_disable_pci_caps(struct device_domain_info *info) +static void iommu_disable_pci_ats(struct device_domain_info *info) { - struct pci_dev *pdev; - - if (!dev_is_pci(info->dev)) + if (!info->ats_enabled) return; - pdev = to_pci_dev(info->dev); - - if (info->ats_enabled) { - pci_disable_ats(pdev); - info->ats_enabled = 0; - } + pci_disable_ats(to_pci_dev(info->dev)); + info->ats_enabled = 0; } static void intel_flush_iotlb_all(struct iommu_domain *domain) @@ -1557,12 +1553,19 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; u8 bus = info->bus, devfn = info->devfn; + int ret; if (!dev_is_pci(dev)) return domain_context_mapping_one(domain, iommu, bus, devfn); - return pci_for_each_dma_alias(to_pci_dev(dev), - domain_context_mapping_cb, domain); + ret = pci_for_each_dma_alias(to_pci_dev(dev), + domain_context_mapping_cb, domain); + if (ret) + return ret; + + iommu_enable_pci_ats(info); + + return 0; } /* Return largest possible superpage level for a given mapping */ @@ -1844,8 +1847,6 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (ret) goto out_block_translation; - iommu_enable_pci_caps(info); - ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); if (ret) goto out_block_translation; @@ -3268,6 +3269,7 @@ static void domain_context_clear(struct device_domain_info *info) pci_for_each_dma_alias(to_pci_dev(info->dev), &domain_context_clear_one_cb, info); + iommu_disable_pci_ats(info); } /* @@ -3284,7 +3286,6 @@ void device_block_translation(struct device *dev) if (info->domain) cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); - iommu_disable_pci_caps(info); if (!dev_is_real_dma_subdevice(dev)) { if (sm_supported(iommu)) intel_pasid_tear_down_entry(iommu, dev, @@ -3819,6 +3820,9 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) !pci_enable_pasid(pdev, info->pasid_supported & ~1)) info->pasid_enabled = 1; + if (sm_supported(iommu)) + iommu_enable_pci_ats(info); + return &iommu->iommu; free_table: intel_pasid_free_table(dev); @@ -3835,6 +3839,8 @@ static void intel_iommu_release_device(struct device *dev) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; + iommu_disable_pci_ats(info); + if (info->pasid_enabled) { pci_disable_pasid(to_pci_dev(dev)); info->pasid_enabled = 0; @@ 
-4438,13 +4444,10 @@ static int identity_domain_attach_dev(struct iommu_domain *domain, struct device if (dev_is_real_dma_subdevice(dev)) return 0; - if (sm_supported(iommu)) { + if (sm_supported(iommu)) ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); - if (!ret) - iommu_enable_pci_caps(info); - } else { + else ret = device_setup_pass_through(dev); - } return ret; } -- Gitee From 60d6ddb25ad47a46ec21023cae3fa4cc4d56d0bf Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Mar 2025 10:47:48 +0800 Subject: [PATCH 249/278] iommu/vt-d: Move PRI enablement in probe path ANBZ: #13617 commit 87caaba1d174d365a9717e01d2c42335ba1da7be upstream. Update PRI enablement to use the new method, similar to the amd iommu driver. Enable PRI in the device probe path and disable it when the device is released. PRI is enabled throughout the device's iommu lifecycle. The infrastructure for the iommu subsystem to handle iopf requests is created during iopf enablement and released during iopf disablement. All invalid page requests from the device are automatically handled by the iommu subsystem if iopf is not enabled. Add iopf_refcount to track the iopf enablement. Convert the return type of intel_iommu_disable_iopf() to void, as there is no way to handle a failure when disabling this feature. Make intel_iommu_enable/disable_iopf() helpers global, as they will be used beyond the current file in the subsequent patch. The iopf_refcount is not protected by any lock. This is acceptable, as there is no concurrent access to it in the current code. The following patch will address this by moving it to the domain attach/detach paths, which are protected by the iommu group mutex. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Zhangfei Gao Link: https://lore.kernel.org/r/20250228092631.3425464-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 137 +++++++++++++----------------------- drivers/iommu/intel/iommu.h | 4 ++ drivers/iommu/intel/pasid.c | 2 + drivers/iommu/intel/prq.c | 2 +- 4 files changed, 55 insertions(+), 90 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 1c82e9140cfe..92554403aabe 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1197,6 +1197,37 @@ static void iommu_disable_pci_ats(struct device_domain_info *info) info->ats_enabled = 0; } +static void iommu_enable_pci_pri(struct device_domain_info *info) +{ + struct pci_dev *pdev; + + if (!info->ats_enabled || !info->pri_supported) + return; + + pdev = to_pci_dev(info->dev); + /* PASID is required in PRG Response Message. 
*/ + if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) + return; + + if (pci_reset_pri(pdev)) + return; + + if (!pci_enable_pri(pdev, PRQ_DEPTH)) + info->pri_enabled = 1; +} + +static void iommu_disable_pci_pri(struct device_domain_info *info) +{ + if (!info->pri_enabled) + return; + + if (WARN_ON(info->iopf_refcount)) + iopf_queue_remove_device(info->iommu->iopf_queue, info->dev); + + pci_disable_pri(to_pci_dev(info->dev)); + info->pri_enabled = 0; +} + static void intel_flush_iotlb_all(struct iommu_domain *domain) { cache_tag_flush_all(to_dmar_domain(domain)); @@ -3822,6 +3853,7 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) if (sm_supported(iommu)) iommu_enable_pci_ats(info); + iommu_enable_pci_pri(info); return &iommu->iommu; free_table: @@ -3839,6 +3871,7 @@ static void intel_iommu_release_device(struct device *dev) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; + iommu_disable_pci_pri(info); iommu_disable_pci_ats(info); if (info->pasid_enabled) { @@ -3927,116 +3960,41 @@ static struct iommu_group *intel_iommu_device_group(struct device *dev) return generic_device_group(dev); } -static int context_flip_pri(struct device_domain_info *info, bool enable) -{ - struct intel_iommu *iommu = info->iommu; - u8 bus = info->bus, devfn = info->devfn; - struct context_entry *context; - u16 did; - - spin_lock(&iommu->lock); - if (context_copied(iommu, bus, devfn)) { - spin_unlock(&iommu->lock); - return -EINVAL; - } - - context = iommu_context_addr(iommu, bus, devfn, false); - if (!context || !context_present(context)) { - spin_unlock(&iommu->lock); - return -ENODEV; - } - did = context_domain_id(context); - - if (enable) - context_set_sm_pre(context); - else - context_clear_sm_pre(context); - - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(context, sizeof(*context)); - intel_context_flush_present(info, context, did, true); - spin_unlock(&iommu->lock); - - return 0; -} - -static int intel_iommu_enable_iopf(struct device *dev) +int intel_iommu_enable_iopf(struct device *dev) { - struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu; + struct intel_iommu *iommu = info->iommu; int ret; - if (!pdev || !info || !info->ats_enabled || !info->pri_supported) + if (!info->pri_enabled) return -ENODEV; - if (info->pri_enabled) - return -EBUSY; - - iommu = info->iommu; - if (!iommu) - return -EINVAL; - - /* PASID is required in PRG Response Message. 
*/ - if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) - return -EINVAL; - - ret = pci_reset_pri(pdev); - if (ret) - return ret; + if (info->iopf_refcount) { + info->iopf_refcount++; + return 0; + } ret = iopf_queue_add_device(iommu->iopf_queue, dev); if (ret) return ret; - ret = context_flip_pri(info, true); - if (ret) - goto err_remove_device; - - ret = pci_enable_pri(pdev, PRQ_DEPTH); - if (ret) - goto err_clear_pri; - - info->pri_enabled = 1; + info->iopf_refcount = 1; return 0; -err_clear_pri: - context_flip_pri(info, false); -err_remove_device: - iopf_queue_remove_device(iommu->iopf_queue, dev); - - return ret; } -static int intel_iommu_disable_iopf(struct device *dev) +void intel_iommu_disable_iopf(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; - if (!info->pri_enabled) - return -EINVAL; + if (WARN_ON(!info->pri_enabled || !info->iopf_refcount)) + return; - /* Disable new PRI reception: */ - context_flip_pri(info, false); + if (--info->iopf_refcount) + return; - /* - * Remove device from fault queue and acknowledge all outstanding - * PRQs to the device: - */ iopf_queue_remove_device(iommu->iopf_queue, dev); - - /* - * PCIe spec states that by clearing PRI enable bit, the Page - * Request Interface will not issue new page requests, but has - * outstanding page requests that have been transmitted or are - * queued for transmission. This is supposed to be called after - * the device driver has stopped DMA, all PASIDs have been - * unbound and the outstanding PRQs have been drained. - */ - pci_disable_pri(to_pci_dev(dev)); - info->pri_enabled = 0; - - return 0; } static int @@ -4059,7 +4017,8 @@ intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) { switch (feat) { case IOMMU_DEV_FEAT_IOPF: - return intel_iommu_disable_iopf(dev); + intel_iommu_disable_iopf(dev); + return 0; case IOMMU_DEV_FEAT_SVA: return 0; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 755cf097ddda..c5d4c276cf01 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -774,6 +774,7 @@ struct device_domain_info { u8 ats_enabled:1; u8 dtlb_extra_inval:1; /* Quirk for devices need extra flush */ u8 ats_qdep; + unsigned int iopf_refcount; struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ struct intel_iommu *iommu; /* IOMMU used by this device */ struct dmar_domain *domain; /* pointer to domain */ @@ -1295,6 +1296,9 @@ void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, struct iommu_page_response *msg); void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid); +int intel_iommu_enable_iopf(struct device *dev); +void intel_iommu_disable_iopf(struct device *dev); + #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); struct iommu_domain *intel_svm_domain_alloc(struct device *dev, diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index fb59a7d35958..c2742e256552 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -992,6 +992,8 @@ static int context_entry_set_pasid_table(struct context_entry *context, context_set_sm_dte(context); if (info->pasid_supported) context_set_pasid(context); + if (info->pri_supported) + context_set_sm_pre(context); context_set_fault_enable(context); context_set_present(context); diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c index 064194399b38..5b6a64d96850 100644 --- a/drivers/iommu/intel/prq.c +++ 
b/drivers/iommu/intel/prq.c @@ -67,7 +67,7 @@ void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) u16 sid, did; info = dev_iommu_priv_get(dev); - if (!info->pri_enabled) + if (!info->iopf_refcount) return; iommu = info->iommu; -- Gitee From 869b4412debc5f39738051718c486ae86ab5d05e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Mar 2025 10:47:49 +0800 Subject: [PATCH 250/278] iommu/vt-d: Cleanup intel_context_flush_present() ANBZ: #13617 commit 4c293add5874038dc82ef579663dd86744d8e872 upstream. The intel_context_flush_present() is called in places where either the scalable mode is disabled, or scalable mode is enabled but all PASID entries are known to be non-present. In these cases, the flush_domains path within intel_context_flush_present() will never execute. This dead code is therefore removed. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Zhangfei Gao Link: https://lore.kernel.org/r/20250228092631.3425464-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/intel/iommu.c | 2 +- drivers/iommu/intel/iommu.h | 5 ++--- drivers/iommu/intel/pasid.c | 41 +++++++------------------------------ 3 files changed, 10 insertions(+), 38 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 92554403aabe..fdd268da21b9 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1783,7 +1783,7 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - intel_context_flush_present(info, context, did, true); + intel_context_flush_no_pasid(info, context, did); } int __domain_setup_first_level(struct intel_iommu *iommu, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index c5d4c276cf01..117905394cdf 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1286,9 +1286,8 @@ void cache_tag_flush_all(struct dmar_domain *domain); void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, unsigned long end); -void intel_context_flush_present(struct device_domain_info *info, - struct context_entry *context, - u16 did, bool affect_domains); +void intel_context_flush_no_pasid(struct device_domain_info *info, + struct context_entry *context, u16 did); int intel_iommu_enable_prq(struct intel_iommu *iommu); int intel_iommu_finish_prq(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index c2742e256552..7ee18bb48bd4 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -932,7 +932,7 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - intel_context_flush_present(info, context, did, false); + intel_context_flush_no_pasid(info, context, did); } static int pci_pasid_table_teardown(struct pci_dev *pdev, u16 alias, void *data) @@ -1119,17 +1119,15 @@ static void __context_flush_dev_iotlb(struct device_domain_info *info) /* * Cache invalidations after change in a context table entry that was present - * according to the Spec 6.5.3.3 (Guidance to Software for Invalidations). If - * IOMMU is in scalable mode and all PASID table entries of the device were - * non-present, set flush_domains to false. Otherwise, true. + * according to the Spec 6.5.3.3 (Guidance to Software for Invalidations). 
+ * This helper can only be used when IOMMU is working in the legacy mode or + * IOMMU is in scalable mode but all PASID table entries of the device are + * non-present. */ -void intel_context_flush_present(struct device_domain_info *info, - struct context_entry *context, - u16 did, bool flush_domains) +void intel_context_flush_no_pasid(struct device_domain_info *info, + struct context_entry *context, u16 did) { struct intel_iommu *iommu = info->iommu; - struct pasid_entry *pte; - int i; /* * Device-selective context-cache invalidation. The Domain-ID field @@ -1152,30 +1150,5 @@ void intel_context_flush_present(struct device_domain_info *info, return; } - /* - * For scalable mode: - * - Domain-selective PASID-cache invalidation to affected domains - * - Domain-selective IOTLB invalidation to affected domains - * - Global Device-TLB invalidation to affected functions - */ - if (flush_domains) { - /* - * If the IOMMU is running in scalable mode and there might - * be potential PASID translations, the caller should hold - * the lock to ensure that context changes and cache flushes - * are atomic. - */ - assert_spin_locked(&iommu->lock); - for (i = 0; i < info->pasid_table->max_pasid; i++) { - pte = intel_pasid_get_entry(info->dev, i); - if (!pte || !pasid_pte_is_present(pte)) - continue; - - did = pasid_get_domain_id(pte); - qi_flush_pasid_cache(iommu, did, QI_PC_ALL_PASIDS, 0); - iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - } - } - __context_flush_dev_iotlb(info); } -- Gitee From 4b619207c44f444af3acfdc328e6897a58f00a62 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 27 Feb 2025 14:47:47 +0000 Subject: [PATCH 251/278] iommu: Unexport iommu_fwspec_free() ANBZ: #13617 commit 29c6e1c2b923b43e8082bba5c6675185a8fe305a upstream. The drivers doing their own fwspec parsing have no need to call iommu_fwspec_free() since fwspecs were moved into dev_iommu, as returning an error from .probe_device will tear down the whole lot anyway. Move it into the private interface now that it only serves for of_iommu to clean up in an error case. I have no idea what mtk_v1 was doing in effectively guaranteeing a NULL fwspec would be dereferenced if no "iommus" DT property was found, so add a check for that to at least make the code look sane. Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/36e245489361de2d13db22a510fa5c79e7126278.1740667667.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 1 - drivers/iommu/iommu-priv.h | 2 ++ drivers/iommu/iommu.c | 1 - drivers/iommu/mtk_iommu_v1.c | 14 ++++---------- drivers/iommu/tegra-smmu.c | 1 - include/linux/iommu.h | 5 ----- 6 files changed, 6 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 9c6d3921a58a..cc89287da504 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1503,7 +1503,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) out_cfg_free: kfree(cfg); out_free: - iommu_fwspec_free(dev); return ERR_PTR(ret); } diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index b4508423e13b..fed55fbfe99c 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -24,6 +24,8 @@ static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwsp return iommu_ops_from_fwnode(fwspec ? 
fwspec->iommu_fwnode : NULL); } +void iommu_fwspec_free(struct device *dev); + int iommu_device_register_bus(struct iommu_device *iommu, const struct iommu_ops *ops, const struct bus_type *bus, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 743d370394f1..747439afefb7 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2878,7 +2878,6 @@ void iommu_fwspec_free(struct device *dev) dev_iommu_fwspec_set(dev, NULL); } } -EXPORT_SYMBOL_GPL(iommu_fwspec_free); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids) { diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index ee4e55b6b190..21f37c3958d4 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -445,22 +445,13 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) { - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct iommu_fwspec *fwspec = NULL; struct of_phandle_args iommu_spec; struct mtk_iommu_v1_data *data; int err, idx = 0, larbid, larbidx; struct device_link *link; struct device *larbdev; - /* - * In the deferred case, free the existed fwspec. - * Always initialize the fwspec internally. - */ - if (fwspec) { - iommu_fwspec_free(dev); - fwspec = dev_iommu_fwspec_get(dev); - } - while (!of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells", idx, &iommu_spec)) { @@ -475,6 +466,9 @@ static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) idx++; } + if (!fwspec) + return ERR_PTR(-ENODEV); + data = dev_iommu_priv_get(dev); /* Link the consumer device with the smi-larb device(supplier) */ diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 7f633bb5efef..69d353e1df84 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -846,7 +846,6 @@ static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev, err = ops->of_xlate(dev, args); if (err < 0) { dev_err(dev, "failed to parse SW group ID: %d\n", err); - iommu_fwspec_free(dev); return err; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e4fc48d19cd4..da2f531340e0 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1133,7 +1133,6 @@ struct iommu_mm_data { }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); -void iommu_fwspec_free(struct device *dev); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) @@ -1449,10 +1448,6 @@ static inline int iommu_fwspec_init(struct device *dev, return -ENODEV; } -static inline void iommu_fwspec_free(struct device *dev) -{ -} - static inline int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) { -- Gitee From df8ee81418bd4eb2eef4506c62a80f1fa325cdae Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 28 Feb 2025 15:46:31 +0000 Subject: [PATCH 252/278] iommu: Resolve ops in iommu_init_device() ANBZ: #13617 commit fd598f71b66975c042f40ae7cd4310d6e699e149 upstream. Since iommu_init_device() was factored out, it is in fact the only consumer of the ops which __iommu_probe_device() is resolving, so let it do that itself rather than passing them in. This also puts the ops lookup at a more logical point relative to the rest of the flow through __iommu_probe_device(). 
Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/fa4b6cfc67a352488b7f4e0b736008307ce9ac2e.1740753261.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 747439afefb7..a5143993aca6 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -406,14 +406,28 @@ EXPORT_SYMBOL_GPL(dev_iommu_priv_set); * Init the dev->iommu and dev->iommu_group in the struct device and get the * driver probed */ -static int iommu_init_device(struct device *dev, const struct iommu_ops *ops) +static int iommu_init_device(struct device *dev) { + const struct iommu_ops *ops; struct iommu_device *iommu_dev; struct iommu_group *group; int ret; if (!dev_iommu_get(dev)) return -ENOMEM; + /* + * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU + * instances with non-NULL fwnodes, and client devices should have been + * identified with a fwspec by this point. Otherwise, we can currently + * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can + * be present, and that any of their registered instances has suitable + * ops for probing, and thus cheekily co-opt the same mechanism. + */ + ops = iommu_fwspec_ops(dev->iommu->fwspec); + if (!ops) { + ret = -ENODEV; + goto err_free; + } if (!try_module_get(ops->owner)) { ret = -EINVAL; @@ -516,22 +530,10 @@ DEFINE_MUTEX(iommu_probe_device_lock); static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { - const struct iommu_ops *ops; struct iommu_group *group; struct group_device *gdev; int ret; - /* - * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU - * instances with non-NULL fwnodes, and client devices should have been - * identified with a fwspec by this point. Otherwise, we can currently - * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can - * be present, and that any of their registered instances has suitable - * ops for probing, and thus cheekily co-opt the same mechanism. - */ - ops = iommu_fwspec_ops(dev_iommu_fwspec_get(dev)); - if (!ops) - return -ENODEV; /* * Serialise to avoid races between IOMMU drivers registering in * parallel and/or the "replay" calls from ACPI/OF code via client @@ -545,7 +547,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list if (dev->iommu_group) return 0; - ret = iommu_init_device(dev, ops); + ret = iommu_init_device(dev); if (ret) return ret; -- Gitee From 85b445ef7b7dec18152865d605d76cd280a9488f Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 28 Feb 2025 15:46:32 +0000 Subject: [PATCH 253/278] iommu: Keep dev->iommu state consistent ANBZ: #13617 commit 3832862eb9c4dfa0e80b2522bfaedbc8a43de97d upstream. At the moment, if of_iommu_configure() allocates dev->iommu itself via iommu_fwspec_init(), then suffers a DT parsing failure, it cleans up the fwspec but leaves the empty dev_iommu hanging around. So far this is benign (if a tiny bit wasteful), but we'd like to be able to reason about dev->iommu having a consistent and unambiguous lifecycle. Thus make sure that the of_iommu cleanup undoes precisely whatever it did. 
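The shape of the fix, reduced to a stand-alone sketch with invented stand-in structures (nothing here is the kernel's real API): note what already existed before parsing, and on failure free only the state this call itself created:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_dev {
        void *iommu;    /* stand-in for dev->iommu */
        void *fwspec;   /* stand-in for the fwspec hanging off it */
};

/* Parsing may allocate both structures as a side effect, then fail. */
static int parse_firmware(struct fake_dev *dev)
{
        if (!dev->iommu)
                dev->iommu = calloc(1, 32);
        dev->fwspec = calloc(1, 32);
        return -1;                      /* simulate a DT parsing failure */
}

static int configure(struct fake_dev *dev)
{
        bool had_iommu = dev->iommu;    /* record pre-existing state */
        int err = parse_firmware(dev);

        if (err) {
                free(dev->fwspec);      /* always undo the fwspec */
                dev->fwspec = NULL;
                if (!had_iommu) {       /* undo dev->iommu only if this call made it */
                        free(dev->iommu);
                        dev->iommu = NULL;
                }
        }
        return err;
}

int main(void)
{
        struct fake_dev dev = { 0 };

        configure(&dev);
        printf("after failed configure: iommu=%p fwspec=%p\n",
               dev.iommu, dev.fwspec);  /* both NULL: nothing left dangling */
        return 0;
}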
Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/d219663a3f23001f23d520a883ac622d70b4e642.1740753261.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu-priv.h | 2 ++ drivers/iommu/iommu.c | 2 +- drivers/iommu/of_iommu.c | 6 +++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index fed55fbfe99c..05fa6e682e88 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -17,6 +17,8 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) return dev->iommu->iommu_dev->ops; } +void dev_iommu_free(struct device *dev); + const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode); static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwspec) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a5143993aca6..06dfaca3b24a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -354,7 +354,7 @@ static struct dev_iommu *dev_iommu_get(struct device *dev) return param; } -static void dev_iommu_free(struct device *dev) +void dev_iommu_free(struct device *dev) { struct dev_iommu *param = dev->iommu; diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 97987cd78da9..e10a68b5ffde 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -116,6 +116,7 @@ static void of_pci_check_device_ats(struct device *dev, struct device_node *np) int of_iommu_configure(struct device *dev, struct device_node *master_np, const u32 *id) { + bool dev_iommu_present; int err; if (!master_np) @@ -127,6 +128,7 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, mutex_unlock(&iommu_probe_device_lock); return 0; } + dev_iommu_present = dev->iommu; /* * We don't currently walk up the tree looking for a parent IOMMU. @@ -147,8 +149,10 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, err = of_iommu_configure_device(master_np, dev, id); } - if (err) + if (err && dev_iommu_present) iommu_fwspec_free(dev); + else if (err && dev->iommu) + dev_iommu_free(dev); mutex_unlock(&iommu_probe_device_lock); if (!err && dev->bus) -- Gitee From cc42c0f483e6ad944a35bab3abefad2630513564 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 3 Oct 2023 17:21:20 +0300 Subject: [PATCH 254/278] driver core: platform: Refactor error path in a couple places ANBZ: #13617 commit a549e3aac29cde86c1ade76909df759918c11653 upstream. The usual pattern is to bail out on the error case. Besides that one of the labels is redundant as we may return directly. Refactor platform_device_add() and platform_dma_configure() accordingly. 
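The "bail out on error" shape referred to above, sketched with placeholder step functions rather than the platform core's real code: return directly while there is nothing to unwind, and jump to a cleanup label only once earlier steps have allocated state:

#include <stdio.h>

static int step_alloc_id(void)   { return 0; }  /* nothing to undo if this fails */
static int step_add_device(void) { return -1; } /* simulate a late failure */
static void undo_alloc_id(void)  { }

static int device_add_sketch(void)
{
        int ret;

        ret = step_alloc_id();
        if (ret < 0)
                return ret;             /* bail out directly: no state yet */

        ret = step_add_device();
        if (ret)
                goto failed;            /* unwind what step_alloc_id() set up */

        return 0;

failed:
        undo_alloc_id();
        return ret;
}

int main(void)
{
        printf("device_add_sketch() = %d\n", device_add_sketch());
        return 0;
}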
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231003142122.3072824-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jay Chen --- drivers/base/platform.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 76bfcba25003..c839aa0ef30b 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -681,7 +681,7 @@ int platform_device_add(struct platform_device *pdev) */ ret = ida_alloc(&platform_devid_ida, GFP_KERNEL); if (ret < 0) - goto err_out; + return ret; pdev->id = ret; pdev->id_auto = true; dev_set_name(&pdev->dev, "%s.%d.auto", pdev->name, pdev->id); @@ -715,8 +715,10 @@ int platform_device_add(struct platform_device *pdev) dev_name(&pdev->dev), dev_name(pdev->dev.parent)); ret = device_add(&pdev->dev); - if (ret == 0) - return ret; + if (ret) + goto failed; + + return 0; failed: if (pdev->id_auto) { @@ -730,7 +732,6 @@ int platform_device_add(struct platform_device *pdev) release_resource(r); } - err_out: return ret; } EXPORT_SYMBOL_GPL(platform_device_add); @@ -1456,12 +1457,12 @@ static int platform_dma_configure(struct device *dev) attr = acpi_get_dma_attr(to_acpi_device_node(dev->fwnode)); ret = acpi_dma_configure(dev, attr); } + if (ret || drv->driver_managed_dma) + return ret; - if (!ret && !drv->driver_managed_dma) { - ret = iommu_device_use_default_domain(dev); - if (ret) - arch_teardown_dma_ops(dev); - } + ret = iommu_device_use_default_domain(dev); + if (ret) + arch_teardown_dma_ops(dev); return ret; } -- Gitee From c1f46291a56766380e5873524662048cf51c6bc3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 28 Feb 2025 15:46:33 +0000 Subject: [PATCH 255/278] iommu: Get DT/ACPI parsing into the proper probe path ANBZ: #13617 commit bcb81ac6ae3c2ef95b44e7b54c3c9522364a245c upstream. In hindsight, there were some crucial subtleties overlooked when moving {of,acpi}_dma_configure() to driver probe time to allow waiting for IOMMU drivers with -EPROBE_DEFER, and these have become an ever-increasing source of problems. The IOMMU API has some fundamental assumptions that iommu_probe_device() is called for every device added to the system, in the order in which they are added. Calling it in a random order or not at all dependent on driver binding leads to malformed groups, a potential lack of isolation for devices with no driver, and all manner of unexpected concurrency and race conditions. We've attempted to mitigate the latter with point-fix bodges like iommu_probe_device_lock, but it's a losing battle and the time has come to bite the bullet and address the true source of the problem instead. The crux of the matter is that the firmware parsing actually serves two distinct purposes; one is identifying the IOMMU instance associated with a device so we can check its availability, the second is actually telling that instance about the relevant firmware-provided data for the device. However the latter also depends on the former, and at the time there was no good place to defer and retry that separately from the availability check we also wanted for client driver probe. Nowadays, though, we have a proper notion of multiple IOMMU instances in the core API itself, and each one gets a chance to probe its own devices upon registration, so we can finally make that work as intended for DT/IORT/VIOT platforms too. 
All we need is for iommu_probe_device() to be able to run the iommu_fwspec machinery currently buried deep in the wrong end of {of,acpi}_dma_configure(). Luckily it turns out to be surprisingly straightforward to bootstrap this transformation by pretty much just calling the same path twice. At client driver probe time, dev->driver is obviously set; conversely at device_add(), or a subsequent bus_iommu_probe(), any device waiting for an IOMMU really should *not* have a driver already, so we can use that as a condition to disambiguate the two cases, and avoid recursing back into the IOMMU core at the wrong times. Obviously this isn't the nicest thing, but for now it gives us a functional baseline to then unpick the layers in between without many more awkward cross-subsystem patches. There are some minor side-effects like dma_range_map potentially being created earlier, and some debug prints being repeated, but these aren't significantly detrimental. Let's make things work first, then deal with making them nice. With the basic flow finally in the right order again, the next step is probably turning the bus->dma_configure paths inside-out, since all we really need from bus code is its notion of which device and input ID(s) to parse the common firmware properties with... Acked-by: Bjorn Helgaas # pci-driver.c Acked-by: Rob Herring (Arm) # of/device.c Signed-off-by: Robin Murphy Reviewed-by: Lorenzo Pieralisi Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/e3b191e6fd6ca9a1e84c5e5e40044faf97abb874.1740753261.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/acpi/arm64/dma.c | 5 +++++ drivers/acpi/scan.c | 7 ------- drivers/amba/bus.c | 3 ++- drivers/base/platform.c | 3 ++- drivers/bus/fsl-mc/fsl-mc-bus.c | 3 ++- drivers/cdx/cdx.c | 3 ++- drivers/iommu/iommu.c | 24 +++++++++++++++++++++--- drivers/iommu/of_iommu.c | 7 ++++++- drivers/of/device.c | 7 ++++++- drivers/pci/pci-driver.c | 3 ++- 10 files changed, 48 insertions(+), 17 deletions(-) diff --git a/drivers/acpi/arm64/dma.c b/drivers/acpi/arm64/dma.c index 52b2abf88689..f30f138352b7 100644 --- a/drivers/acpi/arm64/dma.c +++ b/drivers/acpi/arm64/dma.c @@ -26,6 +26,11 @@ void acpi_arch_dma_setup(struct device *dev) else end = (1ULL << 32) - 1; + if (dev->dma_range_map) { + dev_dbg(dev, "dma_range_map already set\n"); + return; + } + ret = acpi_dma_get_range(dev, &map); if (!ret && map) { end = dma_range_map_max(map); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index be25e7015523..71f180792016 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1570,13 +1570,6 @@ static int acpi_iommu_configure_id(struct device *dev, const u32 *id_in) err = viot_iommu_configure(dev); mutex_unlock(&iommu_probe_device_lock); - /* - * If we have reason to believe the IOMMU driver missed the initial - * iommu_probe_device() call for dev, replay it to get things in order. 
- */ - if (!err && dev->bus) - err = iommu_probe_device(dev); - return err; } diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index 09e72967b8ab..fe1b86fc3fc4 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -363,7 +363,8 @@ static int amba_dma_configure(struct device *dev) ret = acpi_dma_configure(dev, attr); } - if (!ret && !drv->driver_managed_dma) { + /* @drv may not be valid when we're called from the IOMMU layer */ + if (!ret && dev->driver && !drv->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); diff --git a/drivers/base/platform.c b/drivers/base/platform.c index c839aa0ef30b..9ff253ff0628 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -1457,7 +1457,8 @@ static int platform_dma_configure(struct device *dev) attr = acpi_get_dma_attr(to_acpi_device_node(dev->fwnode)); ret = acpi_dma_configure(dev, attr); } - if (ret || drv->driver_managed_dma) + /* @drv may not be valid when we're called from the IOMMU layer */ + if (ret || !dev->driver || drv->driver_managed_dma) return ret; ret = iommu_device_use_default_domain(dev); diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index 350e7b24ee2b..05cf082878a0 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -153,7 +153,8 @@ static int fsl_mc_dma_configure(struct device *dev) else ret = acpi_dma_configure_id(dev, DEV_DMA_COHERENT, &input_id); - if (!ret && !mc_drv->driver_managed_dma) { + /* @mc_drv may not be valid when we're called from the IOMMU layer */ + if (!ret && dev->driver && !mc_drv->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c index a61834bc84a9..37c775350a79 100644 --- a/drivers/cdx/cdx.c +++ b/drivers/cdx/cdx.c @@ -270,7 +270,8 @@ static int cdx_dma_configure(struct device *dev) return ret; } - if (!ret && !cdx_drv->driver_managed_dma) { + /* @cdx_drv may not be valid when we're called from the IOMMU layer */ + if (!ret && dev->driver && !cdx_drv->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 06dfaca3b24a..7670508a33b8 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -416,9 +416,21 @@ static int iommu_init_device(struct device *dev) if (!dev_iommu_get(dev)) return -ENOMEM; /* - * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU - * instances with non-NULL fwnodes, and client devices should have been - * identified with a fwspec by this point. Otherwise, we can currently + * For FDT-based systems and ACPI IORT/VIOT, the common firmware parsing + * is buried in the bus dma_configure path. Properly unpicking that is + * still a big job, so for now just invoke the whole thing. The device + * already having a driver bound means dma_configure has already run and + * either found no IOMMU to wait for, or we're in its replay call right + * now, so either way there's no point calling it again. 
+ */ + if (!dev->driver && dev->bus->dma_configure) { + mutex_unlock(&iommu_probe_device_lock); + dev->bus->dma_configure(dev); + mutex_lock(&iommu_probe_device_lock); + } + /* + * At this point, relevant devices either now have a fwspec which will + * match ops registered with a non-NULL fwnode, or we can reasonably * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can * be present, and that any of their registered instances has suitable * ops for probing, and thus cheekily co-opt the same mechanism. @@ -428,6 +440,12 @@ static int iommu_init_device(struct device *dev) ret = -ENODEV; goto err_free; } + /* + * And if we do now see any replay calls, they would indicate someone + * misusing the dma_configure path outside bus code. + */ + if (dev->driver) + dev_WARN(dev, "late IOMMU probe at driver bind, something fishy here!\n"); if (!try_module_get(ops->owner)) { ret = -EINVAL; diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index e10a68b5ffde..6b989a62def2 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -155,7 +155,12 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, dev_iommu_free(dev); mutex_unlock(&iommu_probe_device_lock); - if (!err && dev->bus) + /* + * If we're not on the iommu_probe_device() path (as indicated by the + * initial dev->iommu) then try to simulate it. This should no longer + * happen unless of_dma_configure() is being misused outside bus code. + */ + if (!err && dev->bus && !dev_iommu_present) err = iommu_probe_device(dev); if (err && err != -EPROBE_DEFER) diff --git a/drivers/of/device.c b/drivers/of/device.c index dc98aae05677..073050bd7d1e 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -99,6 +99,11 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, bool coherent, set_map = false; int ret; + if (dev->dma_range_map) { + dev_dbg(dev, "dma_range_map already set\n"); + goto skip_map; + } + if (np == dev->of_node) bus_np = __of_get_dma_parent(np); else @@ -119,7 +124,7 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, end = dma_range_map_max(map); set_map = true; } - +skip_map: /* * If @dev is expected to be DMA-capable then the bus code that created * it should have initialised its dma_mask pointer by this point. For diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index b699839a7d4f..578f6f1565e2 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -1669,7 +1669,8 @@ static int pci_dma_configure(struct device *dev) pci_put_host_bridge_device(bridge); - if (!ret && !driver->driver_managed_dma) { + /* @driver may not be valid when we're called from the IOMMU layer */ + if (!ret && dev->driver && !driver->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); -- Gitee From 67b4fa288673939dd024b50a8549177f36756ae1 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 30 Jan 2024 13:28:32 +0200 Subject: [PATCH 256/278] PM: runtime: Add pm_runtime_put_autosuspend() replacement ANBZ: #13617 commit b7d46644e554ed017dfabc0841acf418d0584bc9 upstream. Add __pm_runtime_put_autosuspend() that replaces pm_runtime_put_autosuspend() for new users. The intent is to later re-purpose pm_runtime_put_autosuspend() to also mark the device's last busy stamp---which is what the vast majority of users actually need. This is also described in pm_runtime_put_autosuspend() documentation. Signed-off-by: Sakari Ailus Reviewed-by: Laurent Pinchart Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Jay Chen --- Documentation/power/runtime_pm.rst | 17 +++++++++++------ include/linux/pm_runtime.h | 12 ++++++++++++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index b6d5a3a8febc..9cd40c016ad3 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -154,7 +154,7 @@ suspending the device are satisfied) and to queue up a suspend request for the device in that case. If there is no idle callback, or if the callback returns 0, then the PM core will attempt to carry out a runtime suspend of the device, also respecting devices configured for autosuspend. In essence this means a -call to pm_runtime_autosuspend() (do note that drivers needs to update the +call to __pm_runtime_autosuspend() (do note that drivers needs to update the device last busy mark, pm_runtime_mark_last_busy(), to control the delay under this circumstance). To prevent this (for example, if the callback routine has started a delayed suspend), the routine must return a non-zero value. Negative @@ -412,6 +412,10 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: pm_request_idle(dev) and return its result `int pm_runtime_put_autosuspend(struct device *dev);` + - does the same as __pm_runtime_put_autosuspend() for now, but in the + future, will also call pm_runtime_mark_last_busy() as well, DO NOT USE! + + `int __pm_runtime_put_autosuspend(struct device *dev);` - decrement the device's usage counter; if the result is 0 then run pm_request_autosuspend(dev) and return its result @@ -542,6 +546,7 @@ It is safe to execute the following helper functions from interrupt context: - pm_runtime_put_noidle() - pm_runtime_put() - pm_runtime_put_autosuspend() +- __pm_runtime_put_autosuspend() - pm_runtime_enable() - pm_suspend_ignore_children() - pm_runtime_set_active() @@ -867,9 +872,9 @@ automatically be delayed until the desired period of inactivity has elapsed. Inactivity is determined based on the power.last_busy field. Drivers should call pm_runtime_mark_last_busy() to update this field after carrying out I/O, -typically just before calling pm_runtime_put_autosuspend(). The desired length -of the inactivity period is a matter of policy. Subsystems can set this length -initially by calling pm_runtime_set_autosuspend_delay(), but after device +typically just before calling __pm_runtime_put_autosuspend(). The desired +length of the inactivity period is a matter of policy. Subsystems can set this +length initially by calling pm_runtime_set_autosuspend_delay(), but after device registration the length should be controlled by user space, using the /sys/devices/.../power/autosuspend_delay_ms attribute. @@ -880,7 +885,7 @@ instead of the non-autosuspend counterparts:: Instead of: pm_runtime_suspend use: pm_runtime_autosuspend; Instead of: pm_schedule_suspend use: pm_request_autosuspend; - Instead of: pm_runtime_put use: pm_runtime_put_autosuspend; + Instead of: pm_runtime_put use: __pm_runtime_put_autosuspend; Instead of: pm_runtime_put_sync use: pm_runtime_put_sync_autosuspend. 
Drivers may also continue to use the non-autosuspend helper functions; they @@ -919,7 +924,7 @@ Here is a schematic pseudo-code example:: lock(&foo->private_lock); if (--foo->num_pending_requests == 0) { pm_runtime_mark_last_busy(&foo->dev); - pm_runtime_put_autosuspend(&foo->dev); + __pm_runtime_put_autosuspend(&foo->dev); } else { foo_process_next_request(foo); } diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 406855d73901..ad5f6be0fc36 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -462,6 +462,18 @@ static inline int pm_runtime_put(struct device *dev) return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } +/** + * __pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. + * @dev: Target device. + * + * Decrement the runtime PM usage counter of @dev and if it turns out to be + * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). + */ +static inline int __pm_runtime_put_autosuspend(struct device *dev) +{ + return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_ASYNC | RPM_AUTO); +} + /** * pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. -- Gitee From 6fb5720545cb5ddfd4b4e15a24a79612311c56d2 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Thu, 23 Jan 2025 19:56:36 +0000 Subject: [PATCH 257/278] iommu/arm-smmu: Set rpm auto_suspend once during probe ANBZ: #13617 commit 0a679336dc171db06d84679ae9180e5cda42166c upstream. The current code calls arm_smmu_rpm_use_autosuspend() during device attach, which seems unusual as it sets the autosuspend delay and the 'use_autosuspend' flag for the smmu device. These parameters can be simply set once during the smmu probe and in order to avoid bouncing rpm states, we can simply mark_last_busy() during a client dev attach as discussed in [1]. Move the handling of arm_smmu_rpm_use_autosuspend() to the SMMU probe and modify the arm_smmu_rpm_put() function to mark_last_busy() before calling __pm_runtime_put_autosuspend(). Additionally, s/pm_runtime_put_autosuspend/__pm_runtime_put_autosuspend/ to help with the refactor of the pm_runtime_put_autosuspend() API [2]. 
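The resulting driver-side idiom, in general form, is sketched below. This is a minimal sketch assuming a hypothetical "foo" driver that holds a struct device pointer; the actual arm-smmu change is in the hunk that follows:

    #include <linux/pm_runtime.h>

    /* 'foo' is a placeholder driver; only the put-side idiom matters here. */
    static void foo_rpm_put(struct device *dev)
    {
            /* Stamp last activity so the autosuspend delay restarts from now. */
            pm_runtime_mark_last_busy(dev);
            /* Drop the usage count; queue an autosuspend once it hits zero. */
            __pm_runtime_put_autosuspend(dev);
    }

Setting the delay itself (pm_runtime_set_autosuspend_delay() plus pm_runtime_use_autosuspend()) then becomes a one-time probe decision, which is exactly what moving arm_smmu_rpm_use_autosuspend() into arm_smmu_device_probe() achieves.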
Link: https://lore.kernel.org/r/20241023164835.GF29251@willie-the-truck [1] Link: https://git.kernel.org/linus/b7d46644e554 [2] Signed-off-by: Pranjal Shrivastava Link: https://lore.kernel.org/r/20250123195636.4182099-1-praan@google.com Signed-off-by: Will Deacon Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index cc89287da504..7897fdc6aac0 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -83,8 +83,11 @@ static inline int arm_smmu_rpm_get(struct arm_smmu_device *smmu) static inline void arm_smmu_rpm_put(struct arm_smmu_device *smmu) { - if (pm_runtime_enabled(smmu->dev)) - pm_runtime_put_autosuspend(smmu->dev); + if (pm_runtime_enabled(smmu->dev)) { + pm_runtime_mark_last_busy(smmu->dev); + __pm_runtime_put_autosuspend(smmu->dev); + + } } static void arm_smmu_rpm_use_autosuspend(struct arm_smmu_device *smmu) @@ -1199,7 +1202,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) /* Looks ok, so add the device to the domain */ arm_smmu_master_install_s2crs(cfg, S2CR_TYPE_TRANS, smmu_domain->cfg.cbndx, fwspec); - arm_smmu_rpm_use_autosuspend(smmu); rpm_put: arm_smmu_rpm_put(smmu); return ret; @@ -1222,7 +1224,6 @@ static int arm_smmu_attach_dev_type(struct device *dev, return ret; arm_smmu_master_install_s2crs(cfg, type, 0, fwspec); - arm_smmu_rpm_use_autosuspend(smmu); arm_smmu_rpm_put(smmu); return 0; } @@ -2267,6 +2268,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev) if (dev->pm_domain) { pm_runtime_set_active(dev); pm_runtime_enable(dev); + arm_smmu_rpm_use_autosuspend(smmu); } return 0; -- Gitee From 27b385975be9e9a572742f751848abf09ba6126e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 12 Mar 2025 15:01:31 +0000 Subject: [PATCH 258/278] iommu: Don't warn prematurely about dodgy probes ANBZ: #13617 commit 73d2f10957f517e5918c81b7e859e214ba71c044 upstream. The warning for suspect probe conditions inadvertently got moved too early in a prior respin - it happened to work out OK for fwspecs, but in general still needs to be after the ops->probe_device call so drivers which filter devices for themselves have a chance to do that. Signed-off-by: Robin Murphy Fixes: bcb81ac6ae3c ("iommu: Get DT/ACPI parsing into the proper probe path") Link: https://lore.kernel.org/r/72a4853e7ef36e7c1c4ca171ed4ed8e1a463a61a.1741791691.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 7670508a33b8..cc904316194d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -440,12 +440,6 @@ static int iommu_init_device(struct device *dev) ret = -ENODEV; goto err_free; } - /* - * And if we do now see any replay calls, they would indicate someone - * misusing the dma_configure path outside bus code. - */ - if (dev->driver) - dev_WARN(dev, "late IOMMU probe at driver bind, something fishy here!\n"); if (!try_module_get(ops->owner)) { ret = -EINVAL; @@ -568,6 +562,12 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list ret = iommu_init_device(dev); if (ret) return ret; + /* + * And if we do now see any replay calls, they would indicate someone + * misusing the dma_configure path outside bus code. 
+ */ + if (dev->driver) + dev_WARN(dev, "late IOMMU probe at driver bind, something fishy here!\n"); group = dev->iommu_group; gdev = iommu_group_alloc_device(group, dev); -- Gitee From fbdbf5a182643bdfdda81806dfc992d26ea23460 Mon Sep 17 00:00:00 2001 From: Sairaj Kodilkar Date: Fri, 7 Mar 2025 15:28:19 +0530 Subject: [PATCH 259/278] iommu/amd: Introduce generic function to set multibit feature value ANBZ: #13617 commit 1c608b0b280d8a33edee28f4fb531151b47ec33c upstream. Define generic function `iommu_feature_set()` to set the values in the feature control register and replace `iommu_set_inv_tlb_timeout()` with it. Signed-off-by: Sairaj Kodilkar Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250307095822.2274-2-sarunkod@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 2 +- drivers/iommu/amd/init.c | 27 ++++++++++----------------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 3f57bdd0a659..fe0672676551 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -181,7 +181,7 @@ #define CONTROL_IRTCACHEDIS 59 #define CONTROL_SNPAVIC_EN 61 -#define CTRL_INV_TO_MASK (7 << CONTROL_INV_TIMEOUT) +#define CTRL_INV_TO_MASK 7 #define CTRL_INV_TO_NONE 0 #define CTRL_INV_TO_1MS 1 #define CTRL_INV_TO_10MS 2 diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 94f9e55dad7e..af0cbb5e5756 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -410,33 +410,26 @@ static void iommu_set_device_table(struct amd_iommu *iommu) &entry, sizeof(entry)); } -/* Generic functions to enable/disable certain features of the IOMMU. */ -void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_set(struct amd_iommu *iommu, u64 val, u64 mask, u8 shift) { u64 ctrl; ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); - ctrl |= (1ULL << bit); + mask <<= shift; + ctrl &= ~mask; + ctrl |= (val << shift) & mask; writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } -static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) +/* Generic functions to enable/disable certain features of the IOMMU. 
*/ +void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { - u64 ctrl; - - ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); - ctrl &= ~(1ULL << bit); - writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); + iommu_feature_set(iommu, 1ULL, 1ULL, bit); } -static void iommu_set_inv_tlb_timeout(struct amd_iommu *iommu, int timeout) +static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) { - u64 ctrl; - - ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); - ctrl &= ~CTRL_INV_TO_MASK; - ctrl |= (timeout << CONTROL_INV_TIMEOUT) & CTRL_INV_TO_MASK; - writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); + iommu_feature_set(iommu, 0ULL, 1ULL, bit); } /* Function to enable the hardware */ @@ -2650,7 +2643,7 @@ static void iommu_init_flags(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_COHERENT_EN); /* Set IOTLB invalidation timeout to 1s */ - iommu_set_inv_tlb_timeout(iommu, CTRL_INV_TO_1S); + iommu_feature_set(iommu, CTRL_INV_TO_1S, CTRL_INV_TO_MASK, CONTROL_INV_TIMEOUT); /* Enable Enhanced Peripheral Page Request Handling */ if (check_feature(FEATURE_EPHSUP)) -- Gitee From 5cde35008016d91fc74be6bec7261eebce4f7ad8 Mon Sep 17 00:00:00 2001 From: Sairaj Kodilkar Date: Fri, 7 Mar 2025 15:28:20 +0530 Subject: [PATCH 260/278] iommu/amd: Replace slab cache allocator with page allocator ANBZ: #13617 commit eaf717fa1c3f01ed7762f0733787b07d1ab6c970 upstream. Commit 05152a049444 ("iommu/amd: Add slab-cache for irq remapping tables") introduces slab cache allocator. But slab cache allocator provides benefit only when the allocation and deallocation of many identical objects is frequent. The AMD IOMMU driver allocates Interrupt remapping table (IRT) when device driver requests IRQ for the first time and never frees it. Hence the slab allocator does not provide any benefit here. Signed-off-by: Sairaj Kodilkar Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250307095822.2274-3-sarunkod@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 4 ---- drivers/iommu/amd/init.c | 21 +-------------------- drivers/iommu/amd/iommu.c | 26 ++++++++++++++------------ 3 files changed, 15 insertions(+), 36 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index fe0672676551..487fbe97aeaf 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -313,7 +313,6 @@ * AMD IOMMU hardware only support 512 IRTEs despite * the architectural limitation of 2048 entries. 
*/ -#define DTE_INTTAB_ALIGNMENT 128 #define DTE_INTTABLEN_VALUE 9ULL #define DTE_INTTABLEN (DTE_INTTABLEN_VALUE << 1) #define DTE_INTTABLEN_MASK (0xfULL << 1) @@ -492,9 +491,6 @@ extern const struct iommu_ops amd_iommu_ops; /* IVRS indicates that pre-boot remapping was enabled */ extern bool amdr_ivrs_remap_support; -/* kmem_cache to get tables with 128 byte alignement */ -extern struct kmem_cache *amd_iommu_irq_cache; - #define PCI_SBDF_TO_SEGID(sbdf) (((sbdf) >> 16) & 0xffff) #define PCI_SBDF_TO_DEVID(sbdf) ((sbdf) & 0xffff) #define PCI_SEG_DEVID_TO_SBDF(seg, devid) ((((u32)(seg) & 0xffff) << 16) | \ diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index af0cbb5e5756..fde4f6a3059c 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -2930,9 +2929,6 @@ static struct syscore_ops amd_iommu_syscore_ops = { static void __init free_iommu_resources(void) { - kmem_cache_destroy(amd_iommu_irq_cache); - amd_iommu_irq_cache = NULL; - free_iommu_all(); free_pci_segments(); } @@ -3039,7 +3035,7 @@ static void __init ivinfo_init(void *ivrs) static int __init early_amd_iommu_init(void) { struct acpi_table_header *ivrs_base; - int remap_cache_sz, ret; + int ret; acpi_status status; if (!amd_iommu_detected) @@ -3101,22 +3097,7 @@ static int __init early_amd_iommu_init(void) if (amd_iommu_irq_remap) { struct amd_iommu_pci_seg *pci_seg; - /* - * Interrupt remapping enabled, create kmem_cache for the - * remapping tables. - */ ret = -ENOMEM; - if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) - remap_cache_sz = MAX_IRQS_PER_TABLE * sizeof(u32); - else - remap_cache_sz = MAX_IRQS_PER_TABLE * (sizeof(u64) * 2); - amd_iommu_irq_cache = kmem_cache_create("irq_remap_cache", - remap_cache_sz, - DTE_INTTAB_ALIGNMENT, - 0, NULL); - if (!amd_iommu_irq_cache) - goto out; - for_each_pci_segment(pci_seg) { if (alloc_irq_lookup_table(pci_seg)) goto out; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 26f1b7a104ec..6b789caa9161 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -75,8 +75,6 @@ struct iommu_cmd { */ DEFINE_IDA(pdom_ids); -struct kmem_cache *amd_iommu_irq_cache; - static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev); @@ -3149,7 +3147,7 @@ static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid) return table; } -static struct irq_remap_table *__alloc_irq_table(void) +static struct irq_remap_table *__alloc_irq_table(int nid, int order) { struct irq_remap_table *table; @@ -3157,19 +3155,13 @@ static struct irq_remap_table *__alloc_irq_table(void) if (!table) return NULL; - table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL); + table->table = iommu_alloc_pages_node(nid, GFP_KERNEL, order); if (!table->table) { kfree(table); return NULL; } raw_spin_lock_init(&table->lock); - if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) - memset(table->table, 0, - MAX_IRQS_PER_TABLE * sizeof(u32)); - else - memset(table->table, 0, - (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2))); return table; } @@ -3201,6 +3193,14 @@ static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias, return 0; } +static inline size_t get_irq_table_size(unsigned int max_irqs) +{ + if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) + return max_irqs * sizeof(u32); + + return max_irqs * (sizeof(u64) * 2); +} + static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, u16 devid, struct pci_dev *pdev) { @@ 
-3208,6 +3208,8 @@ static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, struct irq_remap_table *new_table = NULL; struct amd_iommu_pci_seg *pci_seg; unsigned long flags; + int order = get_order(get_irq_table_size(MAX_IRQS_PER_TABLE)); + int nid = iommu && iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; u16 alias; spin_lock_irqsave(&iommu_table_lock, flags); @@ -3226,7 +3228,7 @@ static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, spin_unlock_irqrestore(&iommu_table_lock, flags); /* Nothing there yet, allocate new irq remapping table */ - new_table = __alloc_irq_table(); + new_table = __alloc_irq_table(nid, order); if (!new_table) return NULL; @@ -3261,7 +3263,7 @@ static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, spin_unlock_irqrestore(&iommu_table_lock, flags); if (new_table) { - kmem_cache_free(amd_iommu_irq_cache, new_table->table); + iommu_free_pages(new_table->table, order); kfree(new_table); } return table; -- Gitee From 022871fc80e3dc8f8714d21d78a5dfe4ba675cdb Mon Sep 17 00:00:00 2001 From: Sairaj Kodilkar Date: Fri, 7 Mar 2025 15:28:21 +0530 Subject: [PATCH 261/278] iommu/amd: Rename DTE_INTTABLEN* and MAX_IRQS_PER_TABLE macro ANBZ: #13617 commit 950865c1b88abf190eaec5fa703e5fef05093e30 upstream. AMD iommu can support both 512 and 2K interrupts on newer platform. Hence add suffix "512" to the existing macros. Signed-off-by: Sairaj Kodilkar Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250307095822.2274-4-sarunkod@amd.com Signed-off-by: Joerg Roedel [Jay Chen This patch have a compiler bug, because of deleting the MAX_IRQS_PER_TABLE macro, but do not delete all code. Now we will delete the MAX_IRQS_PER_TABLE at next patch. ] Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 3 +++ drivers/iommu/amd/init.c | 2 +- drivers/iommu/amd/iommu.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 487fbe97aeaf..acefbd389223 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -317,6 +317,9 @@ #define DTE_INTTABLEN (DTE_INTTABLEN_VALUE << 1) #define DTE_INTTABLEN_MASK (0xfULL << 1) #define MAX_IRQS_PER_TABLE (1 << DTE_INTTABLEN_VALUE) +#define DTE_INTTABLEN_VALUE_512 9ULL +#define DTE_INTTABLEN_512 (DTE_INTTABLEN_VALUE_512 << 1) +#define MAX_IRQS_PER_TABLE_512 BIT(DTE_INTTABLEN_VALUE_512) #define PAGE_MODE_NONE 0x00 #define PAGE_MODE_1_LEVEL 0x01 diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index fde4f6a3059c..e3d0cf110f2d 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1059,7 +1059,7 @@ static bool __copy_device_table(struct amd_iommu *iommu) int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK; if (irq_v && (int_ctl || int_tab_len)) { if ((int_ctl != DTE_IRQ_REMAP_INTCTL) || - (int_tab_len != DTE_INTTABLEN)) { + (int_tab_len != DTE_INTTABLEN_512)) { pr_err("Wrong old irq remapping flag: %#x\n", devid); memunmap(old_devtb); return false; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 6b789caa9161..9c2d098a7fd8 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3121,7 +3121,7 @@ static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid, new &= ~DTE_IRQ_PHYS_ADDR_MASK; new |= iommu_virt_to_phys(table->table); new |= DTE_IRQ_REMAP_INTCTL; - new |= DTE_INTTABLEN; + new |= DTE_INTTABLEN_512; new |= DTE_IRQ_REMAP_ENABLE; WRITE_ONCE(dte->data[2], new); -- 
Gitee From 71a637462c7aaa0859d164b8cb79eadf76857d90 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Fri, 7 Mar 2025 15:28:22 +0530 Subject: [PATCH 262/278] iommu/amd: Enable support for up to 2K interrupts per function ANBZ: #13617 commit 19e5cc156cbc32421eb8237615f246c08eea8284 upstream. AMD IOMMU optionally supports up to 2K interrupts per function on newer platforms. Support for this feature is indicated through Extended Feature 2 Register (MMIO Offset 01A0h[NumIntRemapSup]). Allocate 2K IRTEs per device when this support is available. Co-developed-by: Sairaj Kodilkar Signed-off-by: Sairaj Kodilkar Signed-off-by: Kishon Vijay Abraham I Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250307095822.2274-5-sarunkod@amd.com Signed-off-by: Joerg Roedel [Jay Chen delete the macto MAX_IRQS_PER_TABLE ] Signed-off-by: Jay Chen --- drivers/iommu/amd/amd_iommu_types.h | 18 ++++++----- drivers/iommu/amd/init.c | 16 +++++++++- drivers/iommu/amd/iommu.c | 48 +++++++++++++++++++++++------ 3 files changed, 65 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index acefbd389223..5089b58e528a 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -112,6 +112,10 @@ #define FEATURE_SNPAVICSUP_GAM(x) \ (FIELD_GET(FEATURE_SNPAVICSUP, x) == 0x1) +#define FEATURE_NUM_INT_REMAP_SUP GENMASK_ULL(9, 8) +#define FEATURE_NUM_INT_REMAP_SUP_2K(x) \ + (FIELD_GET(FEATURE_NUM_INT_REMAP_SUP, x) == 0x1) + /* Note: * The current driver only support 16-bit PASID. * Currently, hardware only implement upto 16-bit PASID @@ -175,6 +179,9 @@ #define CONTROL_GAM_EN 25 #define CONTROL_GALOG_EN 28 #define CONTROL_GAINT_EN 29 +#define CONTROL_NUM_INT_REMAP_MODE 43 +#define CONTROL_NUM_INT_REMAP_MODE_MASK 0x03 +#define CONTROL_NUM_INT_REMAP_MODE_2K 0x01 #define CONTROL_EPH_EN 45 #define CONTROL_XT_EN 50 #define CONTROL_INTCAPXT_EN 51 @@ -309,17 +316,13 @@ #define DTE_IRQ_REMAP_INTCTL (2ULL << 60) #define DTE_IRQ_REMAP_ENABLE 1ULL -/* - * AMD IOMMU hardware only support 512 IRTEs despite - * the architectural limitation of 2048 entries. 
- */ -#define DTE_INTTABLEN_VALUE 9ULL -#define DTE_INTTABLEN (DTE_INTTABLEN_VALUE << 1) #define DTE_INTTABLEN_MASK (0xfULL << 1) -#define MAX_IRQS_PER_TABLE (1 << DTE_INTTABLEN_VALUE) #define DTE_INTTABLEN_VALUE_512 9ULL #define DTE_INTTABLEN_512 (DTE_INTTABLEN_VALUE_512 << 1) #define MAX_IRQS_PER_TABLE_512 BIT(DTE_INTTABLEN_VALUE_512) +#define DTE_INTTABLEN_VALUE_2K 11ULL +#define DTE_INTTABLEN_2K (DTE_INTTABLEN_VALUE_2K << 1) +#define MAX_IRQS_PER_TABLE_2K BIT(DTE_INTTABLEN_VALUE_2K) #define PAGE_MODE_NONE 0x00 #define PAGE_MODE_1_LEVEL 0x01 @@ -850,6 +853,7 @@ struct iommu_dev_data { struct device *dev; u16 devid; /* PCI Device ID */ + unsigned int max_irqs; /* Maximum IRQs supported by device */ u32 max_pasids; /* Max supported PASIDs */ u32 flags; /* Holds AMD_IOMMU_DEVICE_FLAG_<*> */ int ats_qdep; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index e3d0cf110f2d..3265a6fa500b 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1059,7 +1059,8 @@ static bool __copy_device_table(struct amd_iommu *iommu) int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK; if (irq_v && (int_ctl || int_tab_len)) { if ((int_ctl != DTE_IRQ_REMAP_INTCTL) || - (int_tab_len != DTE_INTTABLEN_512)) { + (int_tab_len != DTE_INTTABLEN_512 && + int_tab_len != DTE_INTTABLEN_2K)) { pr_err("Wrong old irq remapping flag: %#x\n", devid); memunmap(old_devtb); return false; @@ -2735,6 +2736,17 @@ static void iommu_enable_irtcachedis(struct amd_iommu *iommu) iommu->irtcachedis_enabled ? "disabled" : "enabled"); } +static void iommu_enable_2k_int(struct amd_iommu *iommu) +{ + if (!FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) + return; + + iommu_feature_set(iommu, + CONTROL_NUM_INT_REMAP_MODE_2K, + CONTROL_NUM_INT_REMAP_MODE_MASK, + CONTROL_NUM_INT_REMAP_MODE); +} + static void early_enable_iommu(struct amd_iommu *iommu) { iommu_disable(iommu); @@ -2747,6 +2759,7 @@ static void early_enable_iommu(struct amd_iommu *iommu) iommu_enable_ga(iommu); iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); + iommu_enable_2k_int(iommu); iommu_enable(iommu); amd_iommu_flush_all_caches(iommu); } @@ -2803,6 +2816,7 @@ static void early_enable_iommus(void) iommu_enable_ga(iommu); iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); + iommu_enable_2k_int(iommu); iommu_set_device_table(iommu); amd_iommu_flush_all_caches(iommu); } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 9c2d098a7fd8..861c415cc4b4 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2408,8 +2408,14 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev) } out_err: + iommu_completion_wait(iommu); + if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) + dev_data->max_irqs = MAX_IRQS_PER_TABLE_2K; + else + dev_data->max_irqs = MAX_IRQS_PER_TABLE_512; + if (dev_is_pci(dev)) pci_prepare_ats(to_pci_dev(dev), PAGE_SHIFT); @@ -3107,6 +3113,13 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) raw_spin_unlock_irqrestore(&iommu->lock, flags); } +static inline u8 iommu_get_int_tablen(struct iommu_dev_data *dev_data) +{ + if (dev_data && dev_data->max_irqs == MAX_IRQS_PER_TABLE_2K) + return DTE_INTTABLEN_2K; + return DTE_INTTABLEN_512; +} + static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid, struct irq_remap_table *table) { @@ -3121,7 +3134,7 @@ static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid, new &= ~DTE_IRQ_PHYS_ADDR_MASK; new |= iommu_virt_to_phys(table->table); new |= DTE_IRQ_REMAP_INTCTL; - new |= 
DTE_INTTABLEN_512; + new |= iommu_get_int_tablen(dev_data); new |= DTE_IRQ_REMAP_ENABLE; WRITE_ONCE(dte->data[2], new); @@ -3202,13 +3215,14 @@ static inline size_t get_irq_table_size(unsigned int max_irqs) } static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, - u16 devid, struct pci_dev *pdev) + u16 devid, struct pci_dev *pdev, + unsigned int max_irqs) { struct irq_remap_table *table = NULL; struct irq_remap_table *new_table = NULL; struct amd_iommu_pci_seg *pci_seg; unsigned long flags; - int order = get_order(get_irq_table_size(MAX_IRQS_PER_TABLE)); + int order = get_order(get_irq_table_size(max_irqs)); int nid = iommu && iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; u16 alias; @@ -3270,13 +3284,14 @@ static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, } static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count, - bool align, struct pci_dev *pdev) + bool align, struct pci_dev *pdev, + unsigned long max_irqs) { struct irq_remap_table *table; int index, c, alignment = 1; unsigned long flags; - table = alloc_irq_table(iommu, devid, pdev); + table = alloc_irq_table(iommu, devid, pdev, max_irqs); if (!table) return -ENODEV; @@ -3287,7 +3302,7 @@ static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count, /* Scan table for free entries */ for (index = ALIGN(table->min_index, alignment), c = 0; - index < MAX_IRQS_PER_TABLE;) { + index < max_irqs;) { if (!iommu->irte_ops->is_allocated(table, index)) { c += 1; } else { @@ -3557,6 +3572,14 @@ static void fill_msi_msg(struct msi_msg *msg, u32 index) msg->data = index; msg->address_lo = 0; msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + /* + * The struct msi_msg.dest_mode_logical is used to set the DM bit + * in MSI Message Address Register. For device w/ 2K int-remap support, + * this is bit must be set to 1 regardless of the actual destination + * mode, which is signified by the IRTE[DM]. + */ + if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) + msg->arch_addr_lo.dest_mode_logical = true; msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; } @@ -3619,6 +3642,8 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, struct amd_ir_data *data = NULL; struct amd_iommu *iommu; struct irq_cfg *cfg; + struct iommu_dev_data *dev_data; + unsigned long max_irqs; int i, ret, devid, seg, sbdf; int index; @@ -3637,6 +3662,9 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, if (!iommu) return -EINVAL; + dev_data = search_dev_data(iommu, devid); + max_irqs = dev_data ? 
dev_data->max_irqs : MAX_IRQS_PER_TABLE_512; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); if (ret < 0) return ret; @@ -3644,7 +3672,7 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) { struct irq_remap_table *table; - table = alloc_irq_table(iommu, devid, NULL); + table = alloc_irq_table(iommu, devid, NULL, max_irqs); if (table) { if (!table->min_index) { /* @@ -3665,9 +3693,11 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI); index = alloc_irq_index(iommu, devid, nr_irqs, align, - msi_desc_to_pci_dev(info->desc)); + msi_desc_to_pci_dev(info->desc), + max_irqs); } else { - index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL); + index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL, + max_irqs); } if (index < 0) { -- Gitee From 2888587a35a17f0a4243da79f991d3a27d00e154 Mon Sep 17 00:00:00 2001 From: Zhangfei Gao Date: Thu, 7 Nov 2024 04:37:11 +0000 Subject: [PATCH 263/278] iommufd: Allow fault reporting for non-PRI PCI devices ANBZ: #13617 commit c9d6ee6699fd6626b0974143a72c3406276e1c65 upstream. iommufd_fault_iopf_enable has limitation to PRI on PCI/SRIOV VFs because the PRI might be a shared resource and current iommu subsystem is not ready to support enabling/disabling PRI on a VF without any impact on others. However, we have devices that appear as PCI but are actually on the AMBA bus. These fake PCI devices have PASID capability, support stall as well as SRIOV, so remove the limitation for these devices. Link: https://patch.msgid.link/r/20241107043711.116-1-zhangfei.gao@linaro.org Co-developed-by: Lu Baolu Signed-off-by: Lu Baolu Reviewed-by: Lu Baolu Signed-off-by: Zhangfei Gao Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/fault.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index aa839df42823..1f703eb9daa8 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -27,8 +28,12 @@ int iommufd_fault_iopf_enable(struct iommufd_device *idev) * resource between PF and VFs. There is no coordination for this * shared capability. This waits for a vPRI reset to recover. */ - if (dev_is_pci(dev) && to_pci_dev(dev)->is_virtfn) - return -EINVAL; + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + + if (pdev->is_virtfn && pci_pri_supported(pdev)) + return -EINVAL; + } mutex_lock(&idev->iopf_lock); /* Device iopf has already been on. */ -- Gitee From b4bafcef44dce62f4e19aa93a3b7334b598bff45 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 3 Dec 2024 00:02:54 -0800 Subject: [PATCH 264/278] iommufd: Fix out_fput in iommufd_fault_alloc() ANBZ: #13617 commit af7f4780514f850322b2959032ecaa96e4b26472 upstream. As fput() calls the file->f_op->release op, where fault obj and ictx are getting released, there is no need to release these two after fput() one more time, which would result in imbalanced refcounts: refcount_t: decrement hit 0; leaking memory. WARNING: CPU: 48 PID: 2369 at lib/refcount.c:31 refcount_warn_saturate+0x60/0x230 Call trace: refcount_warn_saturate+0x60/0x230 (P) refcount_warn_saturate+0x60/0x230 (L) iommufd_fault_fops_release+0x9c/0xe0 [iommufd] ... 
VFS: Close: file count is 0 (f_op=iommufd_fops [iommufd]) WARNING: CPU: 48 PID: 2369 at fs/open.c:1507 filp_flush+0x3c/0xf0 Call trace: filp_flush+0x3c/0xf0 (P) filp_flush+0x3c/0xf0 (L) __arm64_sys_close+0x34/0x98 ... imbalanced put on file reference count WARNING: CPU: 48 PID: 2369 at fs/file.c:74 __file_ref_put+0x100/0x138 Call trace: __file_ref_put+0x100/0x138 (P) __file_ref_put+0x100/0x138 (L) __fput_sync+0x4c/0xd0 Drop those two lines to fix the warnings above. Cc: stable@vger.kernel.org Fixes: 07838f7fd529 ("iommufd: Add iommufd fault object") Link: https://patch.msgid.link/r/b5651beb3a6b1adeef26fffac24607353bf67ba1.1733212723.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/fault.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 1f703eb9daa8..ca2c16d7242b 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -311,8 +311,6 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) put_unused_fd(fdno); out_fput: fput(filep); - refcount_dec(&fault->obj.users); - iommufd_ctx_put(fault->ictx); out_abort: iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj); -- Gitee From 566e32951171839fa3b5213b3f1862d1a77d9029 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 14 Jan 2025 22:55:59 -0800 Subject: [PATCH 265/278] iommufd/fault: Destroy response and mutex in iommufd_fault_destroy() ANBZ: #13617 commit 3f4818ec139030f425476bf8a10b616bab53a7b5 upstream. Both were missing in the initial patch. Fixes: 07838f7fd529 ("iommufd: Add iommufd fault object") Link: https://patch.msgid.link/r/bc8bb13e215af27e62ee51bdba3648dd4ed2dce3.1736923732.git.nicolinc@nvidia.com Cc: stable@vger.kernel.org Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/fault.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index ca2c16d7242b..57a79475b58e 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -101,6 +101,7 @@ void iommufd_fault_destroy(struct iommufd_object *obj) { struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj); struct iopf_group *group, *next; + unsigned long index; /* * The iommufd object's reference count is zero at this point. @@ -113,6 +114,13 @@ void iommufd_fault_destroy(struct iommufd_object *obj) iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); iopf_free_group(group); } + xa_for_each(&fault->response, index, group) { + xa_erase(&fault->response, index); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + xa_destroy(&fault->response); + mutex_destroy(&fault->mutex); } static void iommufd_compose_fault_message(struct iommu_fault *fault, -- Gitee From fbcf62cf4987293c929114d18ad56941411b2ecd Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:19 -0700 Subject: [PATCH 266/278] iommufd/fault: Move two fault functions out of the header ANBZ: #13617 commit dbf00d7d89125802113a9d8ea4c77ab9f4de8866 upstream. There is no need to keep them in the header. The vEVENTQ version of these two functions will turn out to be a different implementation and will not share with this fault version. Thus, move them out of the header. 
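For context, the consumer side pairs the two helpers as sketched below. This is a condensed, hypothetical rendering of the read path (the real iommufd_fault_fops_read() loops over faults with full error handling); only the two helpers and copy_to_user() are taken as given:

    /* Hypothetical condensed consumer; helper names come from the hunks below. */
    static ssize_t fault_queue_read_one(struct iommufd_fault *fault,
                                        char __user *buf)
    {
            struct iommu_hwpt_pgfault data = {};
            struct iopf_group *group;

            group = iommufd_fault_deliver_fetch(fault);
            if (!group)
                    return 0;       /* nothing pending */

            /* ... fill 'data' from the group's faults ... */

            if (copy_to_user(buf, &data, sizeof(data))) {
                    /* Userspace consumed nothing: put the group back at the head. */
                    iommufd_fault_deliver_restore(fault, group);
                    return -EFAULT;
            }
            return sizeof(data);
    }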
Link: https://patch.msgid.link/r/7eebe32f3d354799f5e28128c693c3c284740b21.1741719725.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/fault.c | 25 +++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 25 ------------------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 57a79475b58e..cd8b8aeb91d0 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -138,6 +138,31 @@ static void iommufd_compose_fault_message(struct iommu_fault *fault, hwpt_fault->cookie = cookie; } +/* Fetch the first node out of the fault->deliver list */ +static struct iopf_group * +iommufd_fault_deliver_fetch(struct iommufd_fault *fault) +{ + struct list_head *list = &fault->deliver; + struct iopf_group *group = NULL; + + spin_lock(&fault->lock); + if (!list_empty(list)) { + group = list_first_entry(list, struct iopf_group, node); + list_del(&group->node); + } + spin_unlock(&fault->lock); + return group; +} + +/* Restore a node back to the head of the fault->deliver list */ +static void iommufd_fault_deliver_restore(struct iommufd_fault *fault, + struct iopf_group *group) +{ + spin_lock(&fault->lock); + list_add(&group->node, &fault->deliver); + spin_unlock(&fault->lock); +} + static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, size_t count, loff_t *ppos) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 246297452a44..1c58f5fe17b4 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -472,31 +472,6 @@ struct iommufd_fault { struct wait_queue_head wait_queue; }; -/* Fetch the first node out of the fault->deliver list */ -static inline struct iopf_group * -iommufd_fault_deliver_fetch(struct iommufd_fault *fault) -{ - struct list_head *list = &fault->deliver; - struct iopf_group *group = NULL; - - spin_lock(&fault->lock); - if (!list_empty(list)) { - group = list_first_entry(list, struct iopf_group, node); - list_del(&group->node); - } - spin_unlock(&fault->lock); - return group; -} - -/* Restore a node back to the head of the fault->deliver list */ -static inline void iommufd_fault_deliver_restore(struct iommufd_fault *fault, - struct iopf_group *group) -{ - spin_lock(&fault->lock); - list_add(&group->node, &fault->deliver); - spin_unlock(&fault->lock); -} - struct iommufd_attach_handle { struct iommu_attach_handle handle; struct iommufd_device *idev; -- Gitee From aa654042ccda46c2dbe48bba466b696474fdae69 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:20 -0700 Subject: [PATCH 267/278] iommufd/fault: Add an iommufd_fault_init() helper ANBZ: #13617 commit 927dabc9aa4dbebf92b34da9b7acd7d8d5c6331b upstream. The infrastructure of a fault object will be shared with a new vEVENTQ object in a following change. Add an iommufd_fault_init helper and an INIT_EVENTQ_FOPS marco for a vEVENTQ allocator to use too. Reorder the iommufd_ctx_get and refcount_inc, to keep them symmetrical with the iommufd_fault_fops_release(). Since the new vEVENTQ doesn't need "response" and its "mutex", so keep the xa_init_flags and mutex_init in their original locations. 
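A minimal sketch of the intended reuse is below, assuming the eventq naming and struct layout that the follow-up patches in this series settle on; the vEVENTQ allocator itself is not part of this patch, so everything except the init helper and the fops macro is a placeholder:

    /* Hypothetical vEVENTQ allocator fragment; iommufd_veventq_fops_read and
     * struct iommufd_veventq are placeholders introduced by later patches. */
    static const struct file_operations iommufd_veventq_fops =
            INIT_EVENTQ_FOPS(iommufd_veventq_fops_read, NULL);

    static int iommufd_veventq_setup_fd(struct iommufd_veventq *veventq,
                                        struct iommufd_ctx *ictx)
    {
            return iommufd_eventq_init(&veventq->common, "[iommufd-veventq]",
                                       ictx, &iommufd_veventq_fops);
    }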
Link: https://patch.msgid.link/r/a9522c521909baeb1bd843950b2490478f3d06e0.1741719725.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen [Jay Chen Conflicts: drivers/iommu/iommufd/fault.c fix the conflicts to delete the no_llseek ] --- drivers/iommu/iommufd/fault.c | 71 +++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index cd8b8aeb91d0..5d8de98732b6 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -280,21 +280,49 @@ static int iommufd_fault_fops_release(struct inode *inode, struct file *filep) return 0; } -static const struct file_operations iommufd_fault_fops = { - .owner = THIS_MODULE, - .open = nonseekable_open, - .read = iommufd_fault_fops_read, - .write = iommufd_fault_fops_write, - .poll = iommufd_fault_fops_poll, - .release = iommufd_fault_fops_release, - .llseek = no_llseek, -}; +#define INIT_FAULT_FOPS(read_op, write_op) \ + ((const struct file_operations){ \ + .owner = THIS_MODULE, \ + .open = nonseekable_open, \ + .read = read_op, \ + .write = write_op, \ + .poll = iommufd_fault_fops_poll, \ + .release = iommufd_fault_fops_release, \ + }) + +static int iommufd_fault_init(struct iommufd_fault *fault, char *name, + struct iommufd_ctx *ictx, + const struct file_operations *fops) +{ + struct file *filep; + int fdno; + + spin_lock_init(&fault->lock); + INIT_LIST_HEAD(&fault->deliver); + init_waitqueue_head(&fault->wait_queue); + + filep = anon_inode_getfile(name, fops, fault, O_RDWR); + if (IS_ERR(filep)) + return PTR_ERR(filep); + + fault->ictx = ictx; + iommufd_ctx_get(fault->ictx); + fault->filep = filep; + refcount_inc(&fault->obj.users); + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) + fput(filep); + return fdno; +} + +static const struct file_operations iommufd_fault_fops = + INIT_FAULT_FOPS(iommufd_fault_fops_read, iommufd_fault_fops_write); int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) { struct iommu_fault_alloc *cmd = ucmd->cmd; struct iommufd_fault *fault; - struct file *filep; int fdno; int rc; @@ -305,28 +333,14 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) if (IS_ERR(fault)) return PTR_ERR(fault); - fault->ictx = ucmd->ictx; - INIT_LIST_HEAD(&fault->deliver); xa_init_flags(&fault->response, XA_FLAGS_ALLOC1); mutex_init(&fault->mutex); - spin_lock_init(&fault->lock); - init_waitqueue_head(&fault->wait_queue); - - filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops, - fault, O_RDWR); - if (IS_ERR(filep)) { - rc = PTR_ERR(filep); - goto out_abort; - } - refcount_inc(&fault->obj.users); - iommufd_ctx_get(fault->ictx); - fault->filep = filep; - - fdno = get_unused_fd_flags(O_CLOEXEC); + fdno = iommufd_fault_init(fault, "[iommufd-pgfault]", ucmd->ictx, + &iommufd_fault_fops); if (fdno < 0) { rc = fdno; - goto out_fput; + goto out_abort; } cmd->out_fault_id = fault->obj.id; @@ -342,8 +356,7 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) return 0; out_put_fdno: put_unused_fd(fdno); -out_fput: - fput(filep); + fput(fault->filep); out_abort: iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj); -- Gitee From 530ba4667709e6adbdd2eff1b5f1a43d2455c893 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:21 -0700 Subject: [PATCH 268/278] iommufd: Abstract an iommufd_eventq from iommufd_fault ANBZ: #13617 commit 
5426a78bebefbb32643ee85320e977f3971c5521 upstream. The fault object was designed exclusively for hwpt's IO page faults (PRI). But its queue implementation can be reused for other purposes too, such as hardware IRQ and event injections to user space. Meanwhile, a fault object holds a list of faults. So it's more accurate to call it a "fault queue". Combining the reusing idea above, abstract a new iommufd_eventq as a common structure embedded into struct iommufd_fault, similar to hwpt_paging holding a common hwpt. Add a common iommufd_eventq_ops and iommufd_eventq_init to prepare for an IOMMUFD_OBJ_VEVENTQ (vIOMMU Event Queue). Link: https://patch.msgid.link/r/e7336a857954209aabb466e0694aab323da95d90.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/fault.c | 111 +++++++++++++----------- drivers/iommu/iommufd/hw_pagetable.c | 6 +- drivers/iommu/iommufd/iommufd_private.h | 28 ++++-- 3 files changed, 82 insertions(+), 63 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 5d8de98732b6..f8e60e5879d1 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -17,6 +17,8 @@ #include "../iommu-priv.h" #include "iommufd_private.h" +/* IOMMUFD_OBJ_FAULT Functions */ + int iommufd_fault_iopf_enable(struct iommufd_device *idev) { struct device *dev = idev->dev; @@ -73,13 +75,13 @@ void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, INIT_LIST_HEAD(&free_list); mutex_lock(&fault->mutex); - spin_lock(&fault->lock); - list_for_each_entry_safe(group, next, &fault->deliver, node) { + spin_lock(&fault->common.lock); + list_for_each_entry_safe(group, next, &fault->common.deliver, node) { if (group->attach_handle != &handle->handle) continue; list_move(&group->node, &free_list); } - spin_unlock(&fault->lock); + spin_unlock(&fault->common.lock); list_for_each_entry_safe(group, next, &free_list, node) { list_del(&group->node); @@ -99,7 +101,9 @@ void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, void iommufd_fault_destroy(struct iommufd_object *obj) { - struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj); + struct iommufd_eventq *eventq = + container_of(obj, struct iommufd_eventq, obj); + struct iommufd_fault *fault = eventq_to_fault(eventq); struct iopf_group *group, *next; unsigned long index; @@ -109,7 +113,7 @@ void iommufd_fault_destroy(struct iommufd_object *obj) * accessing this pointer. Therefore, acquiring the mutex here * is unnecessary. 
*/ - list_for_each_entry_safe(group, next, &fault->deliver, node) { + list_for_each_entry_safe(group, next, &fault->common.deliver, node) { list_del(&group->node); iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); iopf_free_group(group); @@ -142,15 +146,15 @@ static void iommufd_compose_fault_message(struct iommu_fault *fault, static struct iopf_group * iommufd_fault_deliver_fetch(struct iommufd_fault *fault) { - struct list_head *list = &fault->deliver; + struct list_head *list = &fault->common.deliver; struct iopf_group *group = NULL; - spin_lock(&fault->lock); + spin_lock(&fault->common.lock); if (!list_empty(list)) { group = list_first_entry(list, struct iopf_group, node); list_del(&group->node); } - spin_unlock(&fault->lock); + spin_unlock(&fault->common.lock); return group; } @@ -158,16 +162,17 @@ iommufd_fault_deliver_fetch(struct iommufd_fault *fault) static void iommufd_fault_deliver_restore(struct iommufd_fault *fault, struct iopf_group *group) { - spin_lock(&fault->lock); - list_add(&group->node, &fault->deliver); - spin_unlock(&fault->lock); + spin_lock(&fault->common.lock); + list_add(&group->node, &fault->common.deliver); + spin_unlock(&fault->common.lock); } static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, size_t count, loff_t *ppos) { size_t fault_size = sizeof(struct iommu_hwpt_pgfault); - struct iommufd_fault *fault = filep->private_data; + struct iommufd_eventq *eventq = filep->private_data; + struct iommufd_fault *fault = eventq_to_fault(eventq); struct iommu_hwpt_pgfault data = {}; struct iommufd_device *idev; struct iopf_group *group; @@ -216,7 +221,8 @@ static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *b size_t count, loff_t *ppos) { size_t response_size = sizeof(struct iommu_hwpt_page_response); - struct iommufd_fault *fault = filep->private_data; + struct iommufd_eventq *eventq = filep->private_data; + struct iommufd_fault *fault = eventq_to_fault(eventq); struct iommu_hwpt_page_response response; struct iopf_group *group; size_t done = 0; @@ -256,59 +262,61 @@ static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *b return done == 0 ? 
rc : done; } -static __poll_t iommufd_fault_fops_poll(struct file *filep, - struct poll_table_struct *wait) +/* Common Event Queue Functions */ + +static __poll_t iommufd_eventq_fops_poll(struct file *filep, + struct poll_table_struct *wait) { - struct iommufd_fault *fault = filep->private_data; + struct iommufd_eventq *eventq = filep->private_data; __poll_t pollflags = EPOLLOUT; - poll_wait(filep, &fault->wait_queue, wait); - spin_lock(&fault->lock); - if (!list_empty(&fault->deliver)) + poll_wait(filep, &eventq->wait_queue, wait); + spin_lock(&eventq->lock); + if (!list_empty(&eventq->deliver)) pollflags |= EPOLLIN | EPOLLRDNORM; - spin_unlock(&fault->lock); + spin_unlock(&eventq->lock); return pollflags; } -static int iommufd_fault_fops_release(struct inode *inode, struct file *filep) +static int iommufd_eventq_fops_release(struct inode *inode, struct file *filep) { - struct iommufd_fault *fault = filep->private_data; + struct iommufd_eventq *eventq = filep->private_data; - refcount_dec(&fault->obj.users); - iommufd_ctx_put(fault->ictx); + refcount_dec(&eventq->obj.users); + iommufd_ctx_put(eventq->ictx); return 0; } -#define INIT_FAULT_FOPS(read_op, write_op) \ +#define INIT_EVENTQ_FOPS(read_op, write_op) \ ((const struct file_operations){ \ .owner = THIS_MODULE, \ .open = nonseekable_open, \ .read = read_op, \ .write = write_op, \ - .poll = iommufd_fault_fops_poll, \ - .release = iommufd_fault_fops_release, \ + .poll = iommufd_eventq_fops_poll, \ + .release = iommufd_eventq_fops_release, \ }) -static int iommufd_fault_init(struct iommufd_fault *fault, char *name, - struct iommufd_ctx *ictx, - const struct file_operations *fops) +static int iommufd_eventq_init(struct iommufd_eventq *eventq, char *name, + struct iommufd_ctx *ictx, + const struct file_operations *fops) { struct file *filep; int fdno; - spin_lock_init(&fault->lock); - INIT_LIST_HEAD(&fault->deliver); - init_waitqueue_head(&fault->wait_queue); + spin_lock_init(&eventq->lock); + INIT_LIST_HEAD(&eventq->deliver); + init_waitqueue_head(&eventq->wait_queue); - filep = anon_inode_getfile(name, fops, fault, O_RDWR); + filep = anon_inode_getfile(name, fops, eventq, O_RDWR); if (IS_ERR(filep)) return PTR_ERR(filep); - fault->ictx = ictx; - iommufd_ctx_get(fault->ictx); - fault->filep = filep; - refcount_inc(&fault->obj.users); + eventq->ictx = ictx; + iommufd_ctx_get(eventq->ictx); + eventq->filep = filep; + refcount_inc(&eventq->obj.users); fdno = get_unused_fd_flags(O_CLOEXEC); if (fdno < 0) @@ -317,7 +325,7 @@ static int iommufd_fault_init(struct iommufd_fault *fault, char *name, } static const struct file_operations iommufd_fault_fops = - INIT_FAULT_FOPS(iommufd_fault_fops_read, iommufd_fault_fops_write); + INIT_EVENTQ_FOPS(iommufd_fault_fops_read, iommufd_fault_fops_write); int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) { @@ -329,36 +337,37 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) if (cmd->flags) return -EOPNOTSUPP; - fault = iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT); + fault = __iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT, + common.obj); if (IS_ERR(fault)) return PTR_ERR(fault); xa_init_flags(&fault->response, XA_FLAGS_ALLOC1); mutex_init(&fault->mutex); - fdno = iommufd_fault_init(fault, "[iommufd-pgfault]", ucmd->ictx, - &iommufd_fault_fops); + fdno = iommufd_eventq_init(&fault->common, "[iommufd-pgfault]", + ucmd->ictx, &iommufd_fault_fops); if (fdno < 0) { rc = fdno; goto out_abort; } - cmd->out_fault_id = fault->obj.id; + cmd->out_fault_id = fault->common.obj.id; 
cmd->out_fault_fd = fdno; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) goto out_put_fdno; - iommufd_object_finalize(ucmd->ictx, &fault->obj); + iommufd_object_finalize(ucmd->ictx, &fault->common.obj); - fd_install(fdno, fault->filep); + fd_install(fdno, fault->common.filep); return 0; out_put_fdno: put_unused_fd(fdno); - fput(fault->filep); + fput(fault->common.filep); out_abort: - iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj); + iommufd_object_abort_and_destroy(ucmd->ictx, &fault->common.obj); return rc; } @@ -371,11 +380,11 @@ int iommufd_fault_iopf_handler(struct iopf_group *group) hwpt = group->attach_handle->domain->iommufd_hwpt; fault = hwpt->fault; - spin_lock(&fault->lock); - list_add_tail(&group->node, &fault->deliver); - spin_unlock(&fault->lock); + spin_lock(&fault->common.lock); + list_add_tail(&group->node, &fault->common.deliver); + spin_unlock(&fault->common.lock); - wake_up_interruptible(&fault->wait_queue); + wake_up_interruptible(&fault->common.wait_queue); return 0; } diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 1d4cfe3677dc..9a89f3a28dc5 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -14,7 +14,7 @@ static void __iommufd_hwpt_destroy(struct iommufd_hw_pagetable *hwpt) iommu_domain_free(hwpt->domain); if (hwpt->fault) - refcount_dec(&hwpt->fault->obj.users); + refcount_dec(&hwpt->fault->common.obj.users); } void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) @@ -415,8 +415,8 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) } hwpt->fault = fault; hwpt->domain->iopf_handler = iommufd_fault_iopf_handler; - refcount_inc(&fault->obj.users); - iommufd_put_object(ucmd->ictx, &fault->obj); + refcount_inc(&fault->common.obj.users); + iommufd_put_object(ucmd->ictx, &fault->common.obj); } cmd->out_hwpt_id = hwpt->obj.id; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 1c58f5fe17b4..44fb30af10b0 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -454,20 +454,13 @@ void iopt_remove_access(struct io_pagetable *iopt, u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); -/* - * An iommufd_fault object represents an interface to deliver I/O page faults - * to the user space. These objects are created/destroyed by the user space and - * associated with hardware page table objects during page-table allocation. - */ -struct iommufd_fault { +struct iommufd_eventq { struct iommufd_object obj; struct iommufd_ctx *ictx; struct file *filep; spinlock_t lock; /* protects the deliver list */ struct list_head deliver; - struct mutex mutex; /* serializes response flows */ - struct xarray response; struct wait_queue_head wait_queue; }; @@ -480,12 +473,29 @@ struct iommufd_attach_handle { /* Convert an iommu attach handle to iommufd handle. */ #define to_iommufd_handle(hdl) container_of(hdl, struct iommufd_attach_handle, handle) +/* + * An iommufd_fault object represents an interface to deliver I/O page faults + * to the user space. These objects are created/destroyed by the user space and + * associated with hardware page table objects during page-table allocation. 
+ */ +struct iommufd_fault { + struct iommufd_eventq common; + struct mutex mutex; /* serializes response flows */ + struct xarray response; +}; + +static inline struct iommufd_fault * +eventq_to_fault(struct iommufd_eventq *eventq) +{ + return container_of(eventq, struct iommufd_fault, common); +} + static inline struct iommufd_fault * iommufd_get_fault(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_FAULT), - struct iommufd_fault, obj); + struct iommufd_fault, common.obj); } int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); -- Gitee From 3efa2ba61213cb213798206c80271daf19dc3e7d Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:22 -0700 Subject: [PATCH 269/278] iommufd: Rename fault.c to eventq.c ANBZ: #13617 commit 0507f337fc0c3a10f802b42834e6532edcf605be upstream. Rename the file, aligning with the new eventq object. Link: https://patch.msgid.link/r/d726397e2d08028e25a1cb6eb9febefac35a32ba.1741719725.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/Makefile | 2 +- drivers/iommu/iommufd/{fault.c => eventq.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename drivers/iommu/iommufd/{fault.c => eventq.c} (100%) diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index cb784da6cddc..71d692c9a8f4 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only iommufd-y := \ device.o \ - fault.o \ + eventq.o \ hw_pagetable.o \ io_pagetable.o \ ioas.o \ diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/eventq.c similarity index 100% rename from drivers/iommu/iommufd/fault.c rename to drivers/iommu/iommufd/eventq.c -- Gitee From fa86891618a6b3b63196fd9aa15c6813752e0109 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:23 -0700 Subject: [PATCH 270/278] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC ANBZ: #13617 commit e36ba5ab808ef6237c3148d469c8238674230e2b upstream. Introduce a new IOMMUFD_OBJ_VEVENTQ object for vIOMMU Event Queue that provides user space (VMM) another FD to read the vIOMMU Events. Allow a vIOMMU object to allocate vEVENTQs, with a condition that each vIOMMU can only have one single vEVENTQ per type. Add iommufd_veventq_alloc() with iommufd_veventq_ops for the new ioctl. Link: https://patch.msgid.link/r/21acf0751dd5c93846935ee06f93b9c65eff5e04.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/eventq.c | 209 +++++++++++++++++++++++- drivers/iommu/iommufd/iommufd_private.h | 82 ++++++++++ drivers/iommu/iommufd/main.c | 7 + drivers/iommu/iommufd/viommu.c | 2 + include/linux/iommufd.h | 3 + include/uapi/linux/iommufd.h | 82 ++++++++++ 6 files changed, 384 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/eventq.c b/drivers/iommu/iommufd/eventq.c index f8e60e5879d1..4c43ace8c725 100644 --- a/drivers/iommu/iommufd/eventq.c +++ b/drivers/iommu/iommufd/eventq.c @@ -262,13 +262,148 @@ static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *b return done == 0 ? 
rc : done; } +/* IOMMUFD_OBJ_VEVENTQ Functions */ + +void iommufd_veventq_abort(struct iommufd_object *obj) +{ + struct iommufd_eventq *eventq = + container_of(obj, struct iommufd_eventq, obj); + struct iommufd_veventq *veventq = eventq_to_veventq(eventq); + struct iommufd_viommu *viommu = veventq->viommu; + struct iommufd_vevent *cur, *next; + + lockdep_assert_held_write(&viommu->veventqs_rwsem); + + list_for_each_entry_safe(cur, next, &eventq->deliver, node) { + list_del(&cur->node); + if (cur != &veventq->lost_events_header) + kfree(cur); + } + + refcount_dec(&viommu->obj.users); + list_del(&veventq->node); +} + +void iommufd_veventq_destroy(struct iommufd_object *obj) +{ + struct iommufd_veventq *veventq = eventq_to_veventq( + container_of(obj, struct iommufd_eventq, obj)); + + down_write(&veventq->viommu->veventqs_rwsem); + iommufd_veventq_abort(obj); + up_write(&veventq->viommu->veventqs_rwsem); +} + +static struct iommufd_vevent * +iommufd_veventq_deliver_fetch(struct iommufd_veventq *veventq) +{ + struct iommufd_eventq *eventq = &veventq->common; + struct list_head *list = &eventq->deliver; + struct iommufd_vevent *vevent = NULL; + + spin_lock(&eventq->lock); + if (!list_empty(list)) { + struct iommufd_vevent *next; + + next = list_first_entry(list, struct iommufd_vevent, node); + /* Make a copy of the lost_events_header for copy_to_user */ + if (next == &veventq->lost_events_header) { + vevent = kzalloc(sizeof(*vevent), GFP_ATOMIC); + if (!vevent) + goto out_unlock; + } + list_del(&next->node); + if (vevent) + memcpy(vevent, next, sizeof(*vevent)); + else + vevent = next; + } +out_unlock: + spin_unlock(&eventq->lock); + return vevent; +} + +static void iommufd_veventq_deliver_restore(struct iommufd_veventq *veventq, + struct iommufd_vevent *vevent) +{ + struct iommufd_eventq *eventq = &veventq->common; + struct list_head *list = &eventq->deliver; + + spin_lock(&eventq->lock); + if (vevent_for_lost_events_header(vevent)) { + /* Remove the copy of the lost_events_header */ + kfree(vevent); + vevent = NULL; + /* An empty list needs the lost_events_header back */ + if (list_empty(list)) + vevent = &veventq->lost_events_header; + } + if (vevent) + list_add(&vevent->node, list); + spin_unlock(&eventq->lock); +} + +static ssize_t iommufd_veventq_fops_read(struct file *filep, char __user *buf, + size_t count, loff_t *ppos) +{ + struct iommufd_eventq *eventq = filep->private_data; + struct iommufd_veventq *veventq = eventq_to_veventq(eventq); + struct iommufd_vevent_header *hdr; + struct iommufd_vevent *cur; + size_t done = 0; + int rc = 0; + + if (*ppos) + return -ESPIPE; + + while ((cur = iommufd_veventq_deliver_fetch(veventq))) { + /* Validate the remaining bytes against the header size */ + if (done >= count || sizeof(*hdr) > count - done) { + iommufd_veventq_deliver_restore(veventq, cur); + break; + } + hdr = &cur->header; + + /* If being a normal vEVENT, validate against the full size */ + if (!vevent_for_lost_events_header(cur) && + sizeof(hdr) + cur->data_len > count - done) { + iommufd_veventq_deliver_restore(veventq, cur); + break; + } + + if (copy_to_user(buf + done, hdr, sizeof(*hdr))) { + iommufd_veventq_deliver_restore(veventq, cur); + rc = -EFAULT; + break; + } + done += sizeof(*hdr); + + if (cur->data_len && + copy_to_user(buf + done, cur->event_data, cur->data_len)) { + iommufd_veventq_deliver_restore(veventq, cur); + rc = -EFAULT; + break; + } + spin_lock(&eventq->lock); + veventq->num_events--; + spin_unlock(&eventq->lock); + done += cur->data_len; + kfree(cur); + } + + 
return done == 0 ? rc : done; +} + /* Common Event Queue Functions */ static __poll_t iommufd_eventq_fops_poll(struct file *filep, struct poll_table_struct *wait) { struct iommufd_eventq *eventq = filep->private_data; - __poll_t pollflags = EPOLLOUT; + __poll_t pollflags = 0; + + if (eventq->obj.type == IOMMUFD_OBJ_FAULT) + pollflags |= EPOLLOUT; poll_wait(filep, &eventq->wait_queue, wait); spin_lock(&eventq->lock); @@ -388,3 +523,75 @@ int iommufd_fault_iopf_handler(struct iopf_group *group) return 0; } + +static const struct file_operations iommufd_veventq_fops = + INIT_EVENTQ_FOPS(iommufd_veventq_fops_read, NULL); + +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd) +{ + struct iommu_veventq_alloc *cmd = ucmd->cmd; + struct iommufd_veventq *veventq; + struct iommufd_viommu *viommu; + int fdno; + int rc; + + if (cmd->flags || cmd->__reserved || + cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT) + return -EOPNOTSUPP; + if (!cmd->veventq_depth) + return -EINVAL; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + down_write(&viommu->veventqs_rwsem); + + if (iommufd_viommu_find_veventq(viommu, cmd->type)) { + rc = -EEXIST; + goto out_unlock_veventqs; + } + + veventq = __iommufd_object_alloc(ucmd->ictx, veventq, + IOMMUFD_OBJ_VEVENTQ, common.obj); + if (IS_ERR(veventq)) { + rc = PTR_ERR(veventq); + goto out_unlock_veventqs; + } + + veventq->type = cmd->type; + veventq->viommu = viommu; + refcount_inc(&viommu->obj.users); + veventq->depth = cmd->veventq_depth; + list_add_tail(&veventq->node, &viommu->veventqs); + veventq->lost_events_header.header.flags = + IOMMU_VEVENTQ_FLAG_LOST_EVENTS; + + fdno = iommufd_eventq_init(&veventq->common, "[iommufd-viommu-event]", + ucmd->ictx, &iommufd_veventq_fops); + if (fdno < 0) { + rc = fdno; + goto out_abort; + } + + cmd->out_veventq_id = veventq->common.obj.id; + cmd->out_veventq_fd = fdno; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put_fdno; + + iommufd_object_finalize(ucmd->ictx, &veventq->common.obj); + fd_install(fdno, veventq->common.filep); + goto out_unlock_veventqs; + +out_put_fdno: + put_unused_fd(fdno); + fput(veventq->common.filep); +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &veventq->common.obj); +out_unlock_veventqs: + up_write(&viommu->veventqs_rwsem); + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 44fb30af10b0..8cda9c4672eb 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -507,6 +507,74 @@ void iommufd_fault_iopf_disable(struct iommufd_device *idev); void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, struct iommufd_attach_handle *handle); +/* An iommufd_vevent represents a vIOMMU event in an iommufd_veventq */ +struct iommufd_vevent { + struct iommufd_vevent_header header; + struct list_head node; /* for iommufd_eventq::deliver */ + ssize_t data_len; + u64 event_data[] __counted_by(data_len); +}; + +#define vevent_for_lost_events_header(vevent) \ + (vevent->header.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS) + +/* + * An iommufd_veventq object represents an interface to deliver vIOMMU events to + * the user space. It is created/destroyed by the user space and associated with + * a vIOMMU object during the allocations. 
+ */ +struct iommufd_veventq { + struct iommufd_eventq common; + struct iommufd_viommu *viommu; + struct list_head node; /* for iommufd_viommu::veventqs */ + struct iommufd_vevent lost_events_header; + + unsigned int type; + unsigned int depth; + + /* Use common.lock for protection */ + u32 num_events; + u32 sequence; +}; + +static inline struct iommufd_veventq * +eventq_to_veventq(struct iommufd_eventq *eventq) +{ + return container_of(eventq, struct iommufd_veventq, common); +} + +static inline struct iommufd_veventq * +iommufd_get_veventq(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_VEVENTQ), + struct iommufd_veventq, common.obj); +} + +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd); +void iommufd_veventq_destroy(struct iommufd_object *obj); +void iommufd_veventq_abort(struct iommufd_object *obj); + +static inline void iommufd_vevent_handler(struct iommufd_veventq *veventq, + struct iommufd_vevent *vevent) +{ + struct iommufd_eventq *eventq = &veventq->common; + + lockdep_assert_held(&eventq->lock); + + /* + * Remove the lost_events_header and add the new node at the same time. + * Note the new node can be lost_events_header, for a sequence update. + */ + if (list_is_last(&veventq->lost_events_header.node, &eventq->deliver)) + list_del(&veventq->lost_events_header.node); + list_add_tail(&vevent->node, &eventq->deliver); + vevent->header.sequence = veventq->sequence; + veventq->sequence = (veventq->sequence + 1) & INT_MAX; + + wake_up_interruptible(&eventq->wait_queue); +} + static inline struct iommufd_viommu * iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) { @@ -515,6 +583,20 @@ iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) struct iommufd_viommu, obj); } +static inline struct iommufd_veventq * +iommufd_viommu_find_veventq(struct iommufd_viommu *viommu, u32 type) +{ + struct iommufd_veventq *veventq, *next; + + lockdep_assert_held(&viommu->veventqs_rwsem); + + list_for_each_entry_safe(veventq, next, &viommu->veventqs, node) { + if (veventq->type == type) + return veventq; + } + return NULL; +} + int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_viommu_destroy(struct iommufd_object *obj); int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index d50907f69c94..64834df5d20c 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -317,6 +317,7 @@ union ucmd_buffer { struct iommu_ioas_unmap unmap; struct iommu_option option; struct iommu_vdevice_alloc vdev; + struct iommu_veventq_alloc veventq; struct iommu_vfio_ioas vfio_ioas; struct iommu_viommu_alloc viommu; #ifdef CONFIG_IOMMUFD_TEST @@ -372,6 +373,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, val64), IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl, struct iommu_vdevice_alloc, virt_id), + IOCTL_OP(IOMMU_VEVENTQ_ALLOC, iommufd_veventq_alloc, + struct iommu_veventq_alloc, out_veventq_fd), IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, __reserved), IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, @@ -514,6 +517,10 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_VDEVICE] = { .destroy = iommufd_vdevice_destroy, }, + [IOMMUFD_OBJ_VEVENTQ] = { + .destroy = iommufd_veventq_destroy, + .abort = iommufd_veventq_abort, + }, [IOMMUFD_OBJ_VIOMMU] = { .destroy = iommufd_viommu_destroy, }, diff 
--git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 69b88e8c7c26..01df2b985f02 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -59,6 +59,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) viommu->ictx = ucmd->ictx; viommu->hwpt = hwpt_paging; refcount_inc(&viommu->hwpt->common.obj.users); + INIT_LIST_HEAD(&viommu->veventqs); + init_rwsem(&viommu->veventqs_rwsem); /* * It is the most likely case that a physical IOMMU is unpluggable. A * pluggable IOMMU instance (if exists) is responsible for refcounting diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 11110c749200..8948b1836940 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -34,6 +34,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_FAULT, IOMMUFD_OBJ_VIOMMU, IOMMUFD_OBJ_VDEVICE, + IOMMUFD_OBJ_VEVENTQ, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif @@ -93,6 +94,8 @@ struct iommufd_viommu { const struct iommufd_viommu_ops *ops; struct xarray vdevs; + struct list_head veventqs; + struct rw_semaphore veventqs_rwsem; unsigned int type; }; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 462f74ee234e..11ed647838d0 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -55,6 +55,7 @@ enum { IOMMUFD_CMD_VIOMMU_ALLOC = 0x90, IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92, + IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93, }; /** @@ -1014,4 +1015,85 @@ struct iommu_ioas_change_process { #define IOMMU_IOAS_CHANGE_PROCESS \ _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS) +/** + * enum iommu_veventq_flag - flag for struct iommufd_vevent_header + * @IOMMU_VEVENTQ_FLAG_LOST_EVENTS: vEVENTQ has lost vEVENTs + */ +enum iommu_veventq_flag { + IOMMU_VEVENTQ_FLAG_LOST_EVENTS = (1U << 0), +}; + +/** + * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status + * @flags: Combination of enum iommu_veventq_flag + * @sequence: The sequence index of a vEVENT in the vEVENTQ, with a range of + * [0, INT_MAX] where the following index of INT_MAX is 0 + * + * Each iommufd_vevent_header reports a sequence index of the following vEVENT: + * ------------------------------------------------------------------------- + * | header0 {sequence=0} | data0 | header1 {sequence=1} | data1 |...| dataN | + * ------------------------------------------------------------------------- + * And this sequence index is expected to be monotonic to the sequence index of + * the previous vEVENT. If two adjacent sequence indexes has a delta larger than + * 1, it means that delta - 1 number of vEVENTs has lost, e.g. two lost vEVENTs: + * ------------------------------------------------------------------------- + * | ... | header3 {sequence=3} | data3 | header6 {sequence=6} | data6 | ... 
| + * ------------------------------------------------------------------------- + * If a vEVENT lost at the tail of the vEVENTQ and there is no following vEVENT + * providing the next sequence index, an IOMMU_VEVENTQ_FLAG_LOST_EVENTS header + * would be added to the tail, and no data would follow this header: + * --------------------------------------------------------------------------- + * |..| header3 {sequence=3} | data3 | header4 {flags=LOST_EVENTS, sequence=4} | + * --------------------------------------------------------------------------- + */ +struct iommufd_vevent_header { + __u32 flags; + __u32 sequence; +}; + +/** + * enum iommu_veventq_type - Virtual Event Queue Type + * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use + */ +enum iommu_veventq_type { + IOMMU_VEVENTQ_TYPE_DEFAULT = 0, +}; + +/** + * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC) + * @size: sizeof(struct iommu_veventq_alloc) + * @flags: Must be 0 + * @viommu_id: virtual IOMMU ID to associate the vEVENTQ with + * @type: Type of the vEVENTQ. Must be defined in enum iommu_veventq_type + * @veventq_depth: Maximum number of events in the vEVENTQ + * @out_veventq_id: The ID of the new vEVENTQ + * @out_veventq_fd: The fd of the new vEVENTQ. User space must close the + * successfully returned fd after using it + * @__reserved: Must be 0 + * + * Explicitly allocate a virtual event queue interface for a vIOMMU. A vIOMMU + * can have multiple FDs for different types, but is confined to one per @type. + * User space should open the @out_veventq_fd to read vEVENTs out of a vEVENTQ, + * if there are vEVENTs available. A vEVENTQ will lose events due to overflow, + * if the number of the vEVENTs hits @veventq_depth. + * + * Each vEVENT in a vEVENTQ encloses a struct iommufd_vevent_header followed by + * a type-specific data structure, in a normal case: + * ------------------------------------------------------------- + * || header0 | data0 | header1 | data1 | ... | headerN | dataN || + * ------------------------------------------------------------- + * unless a tailing IOMMU_VEVENTQ_FLAG_LOST_EVENTS header is logged (refer to + * struct iommufd_vevent_header). + */ +struct iommu_veventq_alloc { + __u32 size; + __u32 flags; + __u32 viommu_id; + __u32 type; + __u32 veventq_depth; + __u32 out_veventq_id; + __u32 out_veventq_fd; + __u32 __reserved; +}; +#define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC) #endif -- Gitee From 5f7aee65a9ef9b0413feb3d63f4007f63f318d18 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:24 -0700 Subject: [PATCH 271/278] iommufd/viommu: Add iommufd_viommu_get_vdev_id helper ANBZ: #13617 commit ea94b211c5483080b749c142090f4c4de4926e51 upstream. This is a reverse search v.s. iommufd_viommu_find_dev, as drivers may want to convert a struct device pointer (physical) to its virtual device ID for an event injection to the user space VM. Again, this avoids exposing more core structures to the drivers, than the iommufd_viommu alone. 
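
As a rough illustration of the reverse lookup described above: a driver would normally resolve and cache the virtual device ID once, at attach time, rather than in its IRQ path, which is the pattern the mock driver and the SMMUv3 vmaster patches later in this series follow. In the sketch below, struct my_master and my_driver_attach_nested() are hypothetical; only iommufd_viommu_get_vdev_id() and its -ENOENT contract come from this patch.

/* Hedged sketch; "my_master" is a made-up per-device driver struct. */
#include <linux/iommufd.h>

struct my_master {
	struct device *dev;
	struct iommufd_viommu *viommu;
	unsigned long vdev_id;
};

static int my_driver_attach_nested(struct iommufd_viommu *viommu,
				   struct my_master *master)
{
	unsigned long vdev_id;
	int rc;

	/* -ENOENT if user space never allocated a vDEVICE for this device */
	rc = iommufd_viommu_get_vdev_id(viommu, master->dev, &vdev_id);
	if (rc)
		return rc;

	master->viommu = viommu;
	master->vdev_id = vdev_id;
	return 0;
}
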
Link: https://patch.msgid.link/r/18b8e8bc1b8104d43b205d21602c036fd0804e56.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen [ Shuai Xue: Convert string literal namespace to symbol due to missing commit cdd30ebb1b9f ] Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/driver.c | 24 ++++++++++++++++++++++++ include/linux/iommufd.h | 9 +++++++++ 2 files changed, 33 insertions(+) diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c index 7b67fdf44134..3bffa9a795c2 100644 --- a/drivers/iommu/iommufd/driver.c +++ b/drivers/iommu/iommufd/driver.c @@ -49,5 +49,29 @@ struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, } EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_dev, IOMMUFD); +/* Return -ENOENT if device is not associated to the vIOMMU */ +int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, unsigned long *vdev_id) +{ + struct iommufd_vdevice *vdev; + unsigned long index; + int rc = -ENOENT; + + if (WARN_ON_ONCE(!vdev_id)) + return -EINVAL; + + xa_lock(&viommu->vdevs); + xa_for_each(&viommu->vdevs, index, vdev) { + if (vdev->dev == dev) { + *vdev_id = vdev->id; + rc = 0; + break; + } + } + xa_unlock(&viommu->vdevs); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_get_vdev_id, IOMMUFD); + MODULE_DESCRIPTION("iommufd code shared with builtin modules"); MODULE_LICENSE("GPL"); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 8948b1836940..05cb393aff0a 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -190,6 +190,8 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, enum iommufd_object_type type); struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id); +int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, unsigned long *vdev_id); #else /* !CONFIG_IOMMUFD_DRIVER_CORE */ static inline struct iommufd_object * _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, @@ -203,6 +205,13 @@ iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id) { return NULL; } + +static inline int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, + unsigned long *vdev_id) +{ + return -ENOENT; +} #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ /* -- Gitee From 43462ff78d8dab626495d6cfeca1ea532ea3826a Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:25 -0700 Subject: [PATCH 272/278] iommufd/viommu: Add iommufd_viommu_report_event helper ANBZ: #13617 commit e8e1ef9b77a7a09b7809890a52229f24d3c8b532 upstream. Similar to iommu_report_device_fault, this allows IOMMU drivers to report vIOMMU events from threaded IRQ handlers to user space hypervisors. 
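
To sketch the intended call site, a threaded IRQ handler would translate the hardware event into the driver's uAPI payload and forward it with the ID cached at attach time (reusing the hypothetical struct my_master from the previous sketch). MY_VEVENTQ_TYPE and struct my_vevent are placeholders for whatever the driver adds to include/uapi/linux/iommufd.h; the ARM SMMUv3 patch later in this series is the concrete instance of this pattern.

/* Hedged sketch; the payload struct and type value are placeholders. */
#define MY_VEVENTQ_TYPE 0xbeefbeef	/* placeholder enum iommu_veventq_type value */

struct my_vevent {
	__u32 virt_id;
	__u32 reason;
};

static void my_driver_irq_forward(struct my_master *master, u32 reason)
{
	struct my_vevent vevt = {
		.virt_id = master->vdev_id,
		.reason = reason,
	};

	if (!master->viommu)
		return;
	/* Returns -EOPNOTSUPP if no vEVENTQ of this type was allocated */
	iommufd_viommu_report_event(master->viommu, MY_VEVENTQ_TYPE,
				    &vevt, sizeof(vevt));
}
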
Link: https://patch.msgid.link/r/44be825042c8255e75d0151b338ffd8ba0e4920b.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen [ Shuai Xue: Convert string literal namespace to symbol due to missing commit cdd30ebb1b9f ] Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/driver.c | 48 ++++++++++++++++++++++++++++++++++ include/linux/iommufd.h | 11 ++++++++ 2 files changed, 59 insertions(+) diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c index 3bffa9a795c2..c74bc6e0539b 100644 --- a/drivers/iommu/iommufd/driver.c +++ b/drivers/iommu/iommufd/driver.c @@ -73,5 +73,53 @@ int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, } EXPORT_SYMBOL_NS_GPL(iommufd_viommu_get_vdev_id, IOMMUFD); +/* + * Typically called in driver's threaded IRQ handler. + * The @type and @event_data must be defined in include/uapi/linux/iommufd.h + */ +int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, void *event_data, + size_t data_len) +{ + struct iommufd_veventq *veventq; + struct iommufd_vevent *vevent; + int rc = 0; + + if (WARN_ON_ONCE(!data_len || !event_data)) + return -EINVAL; + + down_read(&viommu->veventqs_rwsem); + + veventq = iommufd_viommu_find_veventq(viommu, type); + if (!veventq) { + rc = -EOPNOTSUPP; + goto out_unlock_veventqs; + } + + spin_lock(&veventq->common.lock); + if (veventq->num_events == veventq->depth) { + vevent = &veventq->lost_events_header; + goto out_set_header; + } + + vevent = kmalloc(struct_size(vevent, event_data, data_len), GFP_ATOMIC); + if (!vevent) { + rc = -ENOMEM; + vevent = &veventq->lost_events_header; + goto out_set_header; + } + memcpy(vevent->event_data, event_data, data_len); + vevent->data_len = data_len; + veventq->num_events++; + +out_set_header: + iommufd_vevent_handler(veventq, vevent); + spin_unlock(&veventq->common.lock); +out_unlock_veventqs: + up_read(&viommu->veventqs_rwsem); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_report_event, IOMMUFD); + MODULE_DESCRIPTION("iommufd code shared with builtin modules"); MODULE_LICENSE("GPL"); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 05cb393aff0a..60eff9272551 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -11,6 +11,7 @@ #include #include #include +#include struct device; struct file; @@ -192,6 +193,9 @@ struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id); int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, struct device *dev, unsigned long *vdev_id); +int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, void *event_data, + size_t data_len); #else /* !CONFIG_IOMMUFD_DRIVER_CORE */ static inline struct iommufd_object * _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, @@ -212,6 +216,13 @@ static inline int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, { return -ENOENT; } + +static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, + void *event_data, size_t data_len) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ /* -- Gitee From 6b7bf552de6bf4afce5d82c1bcdbd24625f4bc7a Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:26 -0700 Subject: [PATCH 273/278] iommufd/selftest: Require vdev_id when attaching to a nested domain ANBZ: #13617 commit 
941d0719aa66adb40b96f049635d86b447577970 upstream. When attaching a device to a vIOMMU-based nested domain, vdev_id must be present. Add a piece of code hard-requesting it, preparing for a vEVENTQ support in the following patch. Then, update the TEST_F. A HWPT-based nested domain will return a NULL new_viommu, thus no such a vDEVICE requirement. Link: https://patch.msgid.link/r/4051ca8a819e51cb30de6b4fe9e4d94d956afe3d.1741719725.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/selftest.c | 24 ++++++++++++++++++++++++ tools/testing/selftests/iommu/iommufd.c | 5 +++++ 2 files changed, 29 insertions(+) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index d40deb0a4f06..ba84bacbce2e 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -161,7 +161,10 @@ enum selftest_obj_type { struct mock_dev { struct device dev; + struct mock_viommu *viommu; + struct rw_semaphore viommu_rwsem; unsigned long flags; + unsigned long vdev_id; int id; u32 cache[MOCK_DEV_CACHE_NUM]; }; @@ -193,10 +196,30 @@ static int mock_domain_nop_attach(struct iommu_domain *domain, struct device *dev) { struct mock_dev *mdev = to_mock_dev(dev); + struct mock_viommu *new_viommu = NULL; + unsigned long vdev_id = 0; + int rc; if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) return -EINVAL; + iommu_group_mutex_assert(dev); + if (domain->type == IOMMU_DOMAIN_NESTED) { + new_viommu = to_mock_nested(domain)->mock_viommu; + if (new_viommu) { + rc = iommufd_viommu_get_vdev_id(&new_viommu->core, dev, + &vdev_id); + if (rc) + return rc; + } + } + if (new_viommu != mdev->viommu) { + down_write(&mdev->viommu_rwsem); + mdev->viommu = new_viommu; + mdev->vdev_id = vdev_id; + up_write(&mdev->viommu_rwsem); + } + return 0; } @@ -850,6 +873,7 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) if (!mdev) return ERR_PTR(-ENOMEM); + init_rwsem(&mdev->viommu_rwsem); device_initialize(&mdev->dev); mdev->flags = dev_flags; mdev->dev.release = mock_dev_release; diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 618c03bb6509..6a050a1d64ed 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -2740,6 +2740,7 @@ TEST_F(iommufd_viommu, viommu_alloc_nested_iopf) uint32_t iopf_hwpt_id; uint32_t fault_id; uint32_t fault_fd; + uint32_t vdev_id; if (self->device_id) { test_ioctl_fault_alloc(&fault_id, &fault_fd); @@ -2756,6 +2757,10 @@ TEST_F(iommufd_viommu, viommu_alloc_nested_iopf) &iopf_hwpt_id, IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data)); + /* Must allocate vdevice before attaching to a nested hwpt */ + test_err_mock_domain_replace(ENOENT, self->stdev_id, + iopf_hwpt_id); + test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id); test_cmd_mock_domain_replace(self->stdev_id, iopf_hwpt_id); EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, iopf_hwpt_id)); -- Gitee From 78b75e2406e10dfcdbbc0a1001de0c4d960d651c Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:27 -0700 Subject: [PATCH 274/278] iommufd/selftest: Add IOMMU_TEST_OP_TRIGGER_VEVENT for vEVENTQ coverage ANBZ: #13617 commit b3cc0b7599ccc128831fdc0fb71606a246d2a58a upstream. The handler will get vDEVICE object from the given mdev and convert it to its per-vIOMMU virtual ID to mimic a real IOMMU driver. 
Link: https://patch.msgid.link/r/1ea874d20e56d65e7cfd6e0e8e01bd3dbd038761.1741719725.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/iommufd/iommufd_test.h | 10 ++++++++++ drivers/iommu/iommufd/selftest.c | 30 ++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index a6b7a163f636..87e9165cea27 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -24,6 +24,7 @@ enum { IOMMU_TEST_OP_MD_CHECK_IOTLB, IOMMU_TEST_OP_TRIGGER_IOPF, IOMMU_TEST_OP_DEV_CHECK_CACHE, + IOMMU_TEST_OP_TRIGGER_VEVENT, }; enum { @@ -145,6 +146,9 @@ struct iommu_test_cmd { __u32 id; __u32 cache; } check_dev_cache; + struct { + __u32 dev_id; + } trigger_vevent; }; __u32 last; }; @@ -212,4 +216,10 @@ struct iommu_viommu_invalidate_selftest { __u32 cache_id; }; +#define IOMMU_VEVENTQ_TYPE_SELFTEST 0xbeefbeef + +struct iommu_viommu_event_selftest { + __u32 virt_id; +}; + #endif diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index ba84bacbce2e..d55dde28e9bc 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -1621,6 +1621,34 @@ static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd, return 0; } +static int iommufd_test_trigger_vevent(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct iommu_viommu_event_selftest test = {}; + struct iommufd_device *idev; + struct mock_dev *mdev; + int rc = -ENOENT; + + idev = iommufd_get_device(ucmd, cmd->trigger_vevent.dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + mdev = to_mock_dev(idev->dev); + + down_read(&mdev->viommu_rwsem); + if (!mdev->viommu || !mdev->vdev_id) + goto out_unlock; + + test.virt_id = mdev->vdev_id; + rc = iommufd_viommu_report_event(&mdev->viommu->core, + IOMMU_VEVENTQ_TYPE_SELFTEST, &test, + sizeof(test)); +out_unlock: + up_read(&mdev->viommu_rwsem); + iommufd_put_object(ucmd->ictx, &idev->obj); + + return rc; +} + void iommufd_selftest_destroy(struct iommufd_object *obj) { struct selftest_obj *sobj = to_selftest_obj(obj); @@ -1702,6 +1730,8 @@ int iommufd_test(struct iommufd_ucmd *ucmd) cmd->dirty.flags); case IOMMU_TEST_OP_TRIGGER_IOPF: return iommufd_test_trigger_iopf(ucmd, cmd); + case IOMMU_TEST_OP_TRIGGER_VEVENT: + return iommufd_test_trigger_vevent(ucmd, cmd); default: return -EOPNOTSUPP; } -- Gitee From a40dbb52d2137140f73783489e59d46da7f57045 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:28 -0700 Subject: [PATCH 275/278] iommufd/selftest: Add IOMMU_VEVENTQ_ALLOC test coverage ANBZ: #13617 commit 97717a1f283fee4e886bbe96c6a0ca460f71a4ab upstream. Trigger vEVENTs by feeding an idev ID and validating the returned output virt_ids whether they equal to the value that was set to the vDEVICE. 
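
On the user-space side, the flow these selftests exercise looks roughly like the sketch below: allocate a vEVENTQ against a vIOMMU, poll the returned fd, then read header-plus-payload records and check the header flags and sequence numbers for lost events. MY_VEVENTQ_TYPE and struct my_payload are placeholders for a real type/payload pair (the selftest value 0xbeefbeef is used here for concreteness), since IOMMU_VEVENTQ_TYPE_DEFAULT is rejected by the kernel.

/* User-space sketch; type and payload layout are placeholders. */
#include <poll.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

#define MY_VEVENTQ_TYPE 0xbeefbeef	/* placeholder; matches the selftest type */

struct my_payload { __u32 virt_id; };

static int drain_one_vevent(int iommufd, __u32 viommu_id)
{
	struct iommu_veventq_alloc cmd = {
		.size = sizeof(cmd),
		.viommu_id = viommu_id,
		.type = MY_VEVENTQ_TYPE,	/* must not be ..._TYPE_DEFAULT */
		.veventq_depth = 128,
	};
	struct {
		struct iommufd_vevent_header hdr;
		struct my_payload data;
	} rec;
	struct pollfd pfd;
	ssize_t got;

	if (ioctl(iommufd, IOMMU_VEVENTQ_ALLOC, &cmd))
		return -1;

	pfd.fd = cmd.out_veventq_fd;
	pfd.events = POLLIN;
	poll(&pfd, 1, -1);

	got = read(cmd.out_veventq_fd, &rec, sizeof(rec));
	if (got < (ssize_t)sizeof(rec.hdr))
		return -1;
	/* A LOST_EVENTS header carries no payload, only a sequence number */
	if (rec.hdr.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS)
		fprintf(stderr, "lost events before sequence %u\n", rec.hdr.sequence);
	else
		printf("sequence %u from virtual device 0x%x\n",
		       rec.hdr.sequence, rec.data.virt_id);

	close(cmd.out_veventq_fd);
	return 0;
}
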
Link: https://patch.msgid.link/r/e829532ec0a3927d61161b7674b20e731ecd495b.1741719725.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- tools/testing/selftests/iommu/iommufd.c | 31 +++++ .../selftests/iommu/iommufd_fail_nth.c | 7 ++ tools/testing/selftests/iommu/iommufd_utils.h | 115 ++++++++++++++++++ 3 files changed, 153 insertions(+) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 6a050a1d64ed..156c74da53cd 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -2778,15 +2778,46 @@ TEST_F(iommufd_viommu, vdevice_alloc) uint32_t viommu_id = self->viommu_id; uint32_t dev_id = self->device_id; uint32_t vdev_id = 0; + uint32_t veventq_id; + uint32_t veventq_fd; + int prev_seq = -1; if (dev_id) { + /* Must allocate vdevice before attaching to a nested hwpt */ + test_err_mock_domain_replace(ENOENT, self->stdev_id, + self->nested_hwpt_id); + + /* Allocate a vEVENTQ with veventq_depth=2 */ + test_cmd_veventq_alloc(viommu_id, IOMMU_VEVENTQ_TYPE_SELFTEST, + &veventq_id, &veventq_fd); + test_err_veventq_alloc(EEXIST, viommu_id, + IOMMU_VEVENTQ_TYPE_SELFTEST, NULL, NULL); /* Set vdev_id to 0x99, unset it, and set to 0x88 */ test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id); + test_cmd_mock_domain_replace(self->stdev_id, + self->nested_hwpt_id); + test_cmd_trigger_vevents(dev_id, 1); + test_cmd_read_vevents(veventq_fd, 1, 0x99, &prev_seq); test_err_vdevice_alloc(EEXIST, viommu_id, dev_id, 0x99, &vdev_id); + test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); test_ioctl_destroy(vdev_id); + + /* Try again with 0x88 */ test_cmd_vdevice_alloc(viommu_id, dev_id, 0x88, &vdev_id); + test_cmd_mock_domain_replace(self->stdev_id, + self->nested_hwpt_id); + /* Trigger an overflow with three events */ + test_cmd_trigger_vevents(dev_id, 3); + test_err_read_vevents(EOVERFLOW, veventq_fd, 3, 0x88, + &prev_seq); + /* Overflow must be gone after the previous reads */ + test_cmd_trigger_vevents(dev_id, 1); + test_cmd_read_vevents(veventq_fd, 1, 0x88, &prev_seq); + close(veventq_fd); + test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); test_ioctl_destroy(vdev_id); + test_ioctl_destroy(veventq_id); } else { test_err_vdevice_alloc(ENOENT, viommu_id, dev_id, 0x99, NULL); } diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 64b1f8e1b0cf..99a7f7897bb2 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -620,6 +620,7 @@ TEST_FAIL_NTH(basic_fail_nth, device) }; struct iommu_test_hw_info info; uint32_t fault_id, fault_fd; + uint32_t veventq_id, veventq_fd; uint32_t fault_hwpt_id; uint32_t ioas_id; uint32_t ioas_id2; @@ -692,6 +693,12 @@ TEST_FAIL_NTH(basic_fail_nth, device) IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data))) return -1; + if (_test_cmd_veventq_alloc(self->fd, viommu_id, + IOMMU_VEVENTQ_TYPE_SELFTEST, &veventq_id, + &veventq_fd)) + return -1; + close(veventq_fd); + return 0; } diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index d979f5b0efe8..6f2ba2fa8f76 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "../kselftest_harness.h" #include "../../../../drivers/iommu/iommufd/iommufd_test.h" @@ 
-936,3 +937,117 @@ static int _test_cmd_vdevice_alloc(int fd, __u32 viommu_id, __u32 idev_id, EXPECT_ERRNO(_errno, \ _test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, \ virt_id, vdev_id)) + +static int _test_cmd_veventq_alloc(int fd, __u32 viommu_id, __u32 type, + __u32 *veventq_id, __u32 *veventq_fd) +{ + struct iommu_veventq_alloc cmd = { + .size = sizeof(cmd), + .type = type, + .veventq_depth = 2, + .viommu_id = viommu_id, + }; + int ret; + + ret = ioctl(fd, IOMMU_VEVENTQ_ALLOC, &cmd); + if (ret) + return ret; + if (veventq_id) + *veventq_id = cmd.out_veventq_id; + if (veventq_fd) + *veventq_fd = cmd.out_veventq_fd; + return 0; +} + +#define test_cmd_veventq_alloc(viommu_id, type, veventq_id, veventq_fd) \ + ASSERT_EQ(0, _test_cmd_veventq_alloc(self->fd, viommu_id, type, \ + veventq_id, veventq_fd)) +#define test_err_veventq_alloc(_errno, viommu_id, type, veventq_id, \ + veventq_fd) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_veventq_alloc(self->fd, viommu_id, type, \ + veventq_id, veventq_fd)) + +static int _test_cmd_trigger_vevents(int fd, __u32 dev_id, __u32 nvevents) +{ + struct iommu_test_cmd trigger_vevent_cmd = { + .size = sizeof(trigger_vevent_cmd), + .op = IOMMU_TEST_OP_TRIGGER_VEVENT, + .trigger_vevent = { + .dev_id = dev_id, + }, + }; + int ret; + + while (nvevents--) { + ret = ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_VEVENT), + &trigger_vevent_cmd); + if (ret < 0) + return -1; + } + return ret; +} + +#define test_cmd_trigger_vevents(dev_id, nvevents) \ + ASSERT_EQ(0, _test_cmd_trigger_vevents(self->fd, dev_id, nvevents)) + +static int _test_cmd_read_vevents(int fd, __u32 event_fd, __u32 nvevents, + __u32 virt_id, int *prev_seq) +{ + struct pollfd pollfd = { .fd = event_fd, .events = POLLIN }; + struct iommu_viommu_event_selftest *event; + struct iommufd_vevent_header *hdr; + ssize_t bytes; + void *data; + int ret, i; + + ret = poll(&pollfd, 1, 1000); + if (ret < 0) + return -1; + + data = calloc(nvevents, sizeof(*hdr) + sizeof(*event)); + if (!data) { + errno = ENOMEM; + return -1; + } + + bytes = read(event_fd, data, + nvevents * (sizeof(*hdr) + sizeof(*event))); + if (bytes <= 0) { + errno = EFAULT; + ret = -1; + goto out_free; + } + + for (i = 0; i < nvevents; i++) { + hdr = data + i * (sizeof(*hdr) + sizeof(*event)); + + if (hdr->flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS || + hdr->sequence - *prev_seq > 1) { + *prev_seq = hdr->sequence; + errno = EOVERFLOW; + ret = -1; + goto out_free; + } + *prev_seq = hdr->sequence; + event = data + sizeof(*hdr); + if (event->virt_id != virt_id) { + errno = EINVAL; + ret = -1; + goto out_free; + } + } + + ret = 0; +out_free: + free(data); + return ret; +} + +#define test_cmd_read_vevents(event_fd, nvevents, virt_id, prev_seq) \ + ASSERT_EQ(0, _test_cmd_read_vevents(self->fd, event_fd, nvevents, \ + virt_id, prev_seq)) +#define test_err_read_vevents(_errno, event_fd, nvevents, virt_id, prev_seq) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_read_vevents(self->fd, event_fd, nvevents, \ + virt_id, prev_seq)) -- Gitee From 77c53fea72ed3d3a4cc7a12164c7be51b2b6fb4e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:30 -0700 Subject: [PATCH 276/278] iommu/arm-smmu-v3: Introduce struct arm_smmu_vmaster ANBZ: #13617 commit f0ea207ed7813bdf17e65aa042d023d1c59e49e3 upstream. Use it to store all vSMMU-related data. The vsid (Virtual Stream ID) will be the first use case. Since the vsid reader will be the eventq handler that already holds a streams_mutex, reuse that to fence the vmaster too. 
Also add a pair of arm_smmu_attach_prepare/commit_vmaster helpers to set or unset the master->vmaster pointer. Put the helpers inside the existing arm_smmu_attach_prepare/commit(). For identity/blocked ops that don't call arm_smmu_attach_prepare/commit(), add a simpler arm_smmu_master_clear_vmaster helper to unset the vmaster. Link: https://patch.msgid.link/r/a7f282e1a531279e25f06c651e95d56f6b120886.1741719725.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Acked-by: Will Deacon Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 41 +++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 18 +++++++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 28 +++++++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 72050bb507fd..a91170b44a0b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -85,6 +85,47 @@ static void arm_smmu_make_nested_domain_ste( } } +int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, + struct arm_smmu_nested_domain *nested_domain) +{ + struct arm_smmu_vmaster *vmaster; + unsigned long vsid; + int ret; + + iommu_group_mutex_assert(state->master->dev); + + ret = iommufd_viommu_get_vdev_id(&nested_domain->vsmmu->core, + state->master->dev, &vsid); + if (ret) + return ret; + + vmaster = kzalloc(sizeof(*vmaster), GFP_KERNEL); + if (!vmaster) + return -ENOMEM; + vmaster->vsmmu = nested_domain->vsmmu; + vmaster->vsid = vsid; + state->vmaster = vmaster; + + return 0; +} + +void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state) +{ + struct arm_smmu_master *master = state->master; + + mutex_lock(&master->smmu->streams_mutex); + kfree(master->vmaster); + master->vmaster = state->vmaster; + mutex_unlock(&master->smmu->streams_mutex); +} + +void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) +{ + struct arm_smmu_attach_state state = { .master = master }; + + arm_smmu_attach_commit_vmaster(&state); +} + static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, struct device *dev) { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 2ffb4a0ea762..aff9a2a52e41 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2803,6 +2803,7 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(new_domain); unsigned long flags; + int ret; /* * arm_smmu_share_asid() must not see two domains pointing to the same @@ -2827,9 +2828,18 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, } if (smmu_domain) { + if (new_domain->type == IOMMU_DOMAIN_NESTED) { + ret = arm_smmu_attach_prepare_vmaster( + state, to_smmu_nested_domain(new_domain)); + if (ret) + return ret; + } + master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL); - if (!master_domain) + if (!master_domain) { + kfree(state->vmaster); return -ENOMEM; + } master_domain->master = master; master_domain->ssid = state->ssid; if (new_domain->type == IOMMU_DOMAIN_NESTED) @@ -2856,6 +2866,7 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); kfree(master_domain); + kfree(state->vmaster); 
return -EINVAL; } @@ -2888,6 +2899,8 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) lockdep_assert_held(&arm_smmu_asid_lock); + arm_smmu_attach_commit_vmaster(state); + if (state->ats_enabled && !master->ats_enabled) { arm_smmu_enable_ats(master); } else if (state->ats_enabled && master->ats_enabled) { @@ -3155,6 +3168,7 @@ static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, struct arm_smmu_ste ste; struct arm_smmu_master *master = dev_iommu_priv_get(dev); + arm_smmu_master_clear_vmaster(master); arm_smmu_make_bypass_ste(master->smmu, &ste); arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS); return 0; @@ -3173,7 +3187,9 @@ static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, struct device *dev) { struct arm_smmu_ste ste; + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + arm_smmu_master_clear_vmaster(master); arm_smmu_make_abort_ste(&ste); arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_TERMINATE); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 8bbabd5b7dcd..3aa3a1e3b022 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -799,6 +799,11 @@ struct arm_smmu_stream { struct rb_node node; }; +struct arm_smmu_vmaster { + struct arm_vsmmu *vsmmu; + unsigned long vsid; +}; + struct arm_smmu_event { u8 stall : 1, ssv : 1, @@ -824,6 +829,7 @@ struct arm_smmu_master { struct arm_smmu_device *smmu; struct device *dev; struct arm_smmu_stream *streams; + struct arm_smmu_vmaster *vmaster; /* use smmu->streams_mutex */ /* Locked by the iommu core using the group mutex */ struct arm_smmu_ctx_desc_cfg cd_table; unsigned int num_streams; @@ -972,6 +978,7 @@ struct arm_smmu_attach_state { bool disable_ats; ioasid_t ssid; /* Resulting state */ + struct arm_smmu_vmaster *vmaster; bool ats_enabled; }; @@ -1055,9 +1062,30 @@ struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, struct iommu_domain *parent, struct iommufd_ctx *ictx, unsigned int viommu_type); +int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, + struct arm_smmu_nested_domain *nested_domain); +void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state); +void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master); #else #define arm_smmu_hw_info NULL #define arm_vsmmu_alloc NULL + +static inline int +arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, + struct arm_smmu_nested_domain *nested_domain) +{ + return 0; +} + +static inline void +arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state) +{ +} + +static inline void +arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) +{ +} #endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */ #endif /* _ARM_SMMU_V3_H */ -- Gitee From 3012300a02c3721df4a51d9c670174f1c961d5a4 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:31 -0700 Subject: [PATCH 277/278] iommu/arm-smmu-v3: Report events that belong to devices attached to vIOMMU ANBZ: #13617 commit e7d3fa3d29d5b2ed12d247cf57a0a34fffe89eb8 upstream. Aside from the IOPF framework, iommufd provides an additional pathway to report hardware events, via the vEVENTQ of vIOMMU infrastructure. Define an iommu_vevent_arm_smmuv3 uAPI structure, and report stage-1 events in the threaded IRQ handler. Also, add another four event record types that can be forwarded to a VM. 
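
For orientation, a hedged user-space sketch of how a VMM might decode one of these records after reading it from the vEVENTQ fd. The virtual StreamID substitution is what the patch describes; the exact bit positions (event type in bits 7:0, StreamID in bits 63:32 of word 0) follow the SMMUv3 event record layout and are an assumption of this sketch, not something restated by the patch.

/* User-space sketch; bit positions assume the SMMUv3 event record layout. */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <linux/iommufd.h>

static void decode_smmuv3_vevent(const struct iommufd_vevent_header *hdr,
				 const struct iommu_vevent_arm_smmuv3 *vevt)
{
	uint64_t w0 = le64toh(vevt->evt[0]);
	uint32_t vsid = (uint32_t)(w0 >> 32);	/* virtual StreamID == vDEVICE virt_id */
	uint8_t type = (uint8_t)(w0 & 0xff);	/* e.g. 0x10 F_TRANSLATION */

	printf("sequence %u: event 0x%02x on virtual device 0x%x\n",
	       hdr->sequence, type, vsid);
}
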
Link: https://patch.msgid.link/r/5cf6719682fdfdabffdb08374cdf31ad2466d75a.1741719725.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Acked-by: Will Deacon Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Conflicts: drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c Signed-off-by: Jay Chen [ Shuai Xue: Convert string literal namespace to symbol due to missing commit cdd30ebb1b9f. ] Signed-off-by: Shuai Xue --- .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 17 ++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 58 +++++++++++-------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 +++ include/uapi/linux/iommufd.h | 23 ++++++++ 4 files changed, 80 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index a91170b44a0b..c7ecf72f5b53 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -433,4 +433,21 @@ struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, return &vsmmu->core; } +int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt) +{ + struct iommu_vevent_arm_smmuv3 vevt; + int i; + + lockdep_assert_held(&vmaster->vsmmu->smmu->streams_mutex); + + vevt.evt[0] = cpu_to_le64((evt[0] & ~EVTQ_0_SID) | + FIELD_PREP(EVTQ_0_SID, vmaster->vsid)); + for (i = 1; i < EVTQ_ENT_DWORDS; i++) + vevt.evt[i] = cpu_to_le64(evt[i]); + + return iommufd_viommu_report_event(&vmaster->vsmmu->core, + IOMMU_VEVENTQ_TYPE_ARM_SMMUV3, &vevt, + sizeof(vevt)); +} + MODULE_IMPORT_NS(IOMMUFD); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index aff9a2a52e41..3fc688b5a65f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1813,8 +1813,8 @@ static void arm_smmu_decode_event(struct arm_smmu_device *smmu, u64 *raw, mutex_unlock(&smmu->streams_mutex); } -static int arm_smmu_handle_event(struct arm_smmu_device *smmu, - struct arm_smmu_event *event) +static int arm_smmu_handle_event(struct arm_smmu_device *smmu, u64 *evt, + struct arm_smmu_event *event) { int ret = 0; u32 perm = 0; @@ -1823,6 +1823,10 @@ static int arm_smmu_handle_event(struct arm_smmu_device *smmu, struct iommu_fault *flt = &fault_evt.fault; switch (event->id) { + case EVT_ID_BAD_STE_CONFIG: + case EVT_ID_STREAM_DISABLED_FAULT: + case EVT_ID_BAD_SUBSTREAMID_CONFIG: + case EVT_ID_BAD_CD_CONFIG: case EVT_ID_TRANSLATION_FAULT: case EVT_ID_ADDR_SIZE_FAULT: case EVT_ID_ACCESS_FAULT: @@ -1832,31 +1836,30 @@ static int arm_smmu_handle_event(struct arm_smmu_device *smmu, return -EOPNOTSUPP; } - if (!event->stall) - return -EOPNOTSUPP; - - if (event->read) - perm |= IOMMU_FAULT_PERM_READ; - else - perm |= IOMMU_FAULT_PERM_WRITE; + if (event->stall) { + if (event->read) + perm |= IOMMU_FAULT_PERM_READ; + else + perm |= IOMMU_FAULT_PERM_WRITE; - if (event->instruction) - perm |= IOMMU_FAULT_PERM_EXEC; + if (event->instruction) + perm |= IOMMU_FAULT_PERM_EXEC; - if (event->privileged) - perm |= IOMMU_FAULT_PERM_PRIV; + if (event->privileged) + perm |= IOMMU_FAULT_PERM_PRIV; - flt->type = IOMMU_FAULT_PAGE_REQ; - flt->prm = (struct iommu_fault_page_request) { - .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, - .grpid = event->stag, - .perm = perm, - .addr = event->iova, - }; + flt->type = IOMMU_FAULT_PAGE_REQ; + flt->prm = (struct iommu_fault_page_request){ + .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, + 
.grpid = event->stag, + .perm = perm, + .addr = event->iova, + }; - if (event->ssv) { - flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - flt->prm.pasid = event->ssid; + if (event->ssv) { + flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + flt->prm.pasid = event->ssid; + } } mutex_lock(&smmu->streams_mutex); @@ -1866,7 +1869,12 @@ static int arm_smmu_handle_event(struct arm_smmu_device *smmu, goto out_unlock; } - ret = iommu_report_device_fault(master->dev, &fault_evt); + if (event->stall) + ret = iommu_report_device_fault(master->dev, &fault_evt); + else if (master->vmaster && !event->s2) + ret = arm_vmaster_report_event(master->vmaster, evt); + else + ret = -EOPNOTSUPP; /* Unhandled events should be pinned */ out_unlock: mutex_unlock(&smmu->streams_mutex); return ret; @@ -1944,7 +1952,7 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) do { while (!queue_remove_raw(q, evt)) { arm_smmu_decode_event(smmu, evt, &event); - if (arm_smmu_handle_event(smmu, &event)) + if (arm_smmu_handle_event(smmu, evt, &event)) arm_smmu_dump_event(smmu, evt, &event, &rs); put_device(event.dev); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 3aa3a1e3b022..d82ee67975d7 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -1066,6 +1066,7 @@ int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, struct arm_smmu_nested_domain *nested_domain); void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state); void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master); +int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt); #else #define arm_smmu_hw_info NULL #define arm_vsmmu_alloc NULL @@ -1086,6 +1087,12 @@ static inline void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) { } + +static inline int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, + u64 *evt) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */ #endif /* _ARM_SMMU_V3_H */ diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 11ed647838d0..8fce5bed1a63 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -1054,9 +1054,32 @@ struct iommufd_vevent_header { /** * enum iommu_veventq_type - Virtual Event Queue Type * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use + * @IOMMU_VEVENTQ_TYPE_ARM_SMMUV3: ARM SMMUv3 Virtual Event Queue */ enum iommu_veventq_type { IOMMU_VEVENTQ_TYPE_DEFAULT = 0, + IOMMU_VEVENTQ_TYPE_ARM_SMMUV3 = 1, +}; + +/** + * struct iommu_vevent_arm_smmuv3 - ARM SMMUv3 Virtual Event + * (IOMMU_VEVENTQ_TYPE_ARM_SMMUV3) + * @evt: 256-bit ARM SMMUv3 Event record, little-endian. + * Reported event records: (Refer to "7.3 Event records" in SMMUv3 HW Spec) + * - 0x04 C_BAD_STE + * - 0x06 F_STREAM_DISABLED + * - 0x08 C_BAD_SUBSTREAMID + * - 0x0a C_BAD_CD + * - 0x10 F_TRANSLATION + * - 0x11 F_ADDR_SIZE + * - 0x12 F_ACCESS + * - 0x13 F_PERMISSION + * + * StreamID field reports a virtual device ID. To receive a virtual event for a + * device, a vDEVICE must be allocated via IOMMU_VDEVICE_ALLOC. + */ +struct iommu_vevent_arm_smmuv3 { + __aligned_le64 evt[4]; }; /** -- Gitee From 459a0363f9bf63f227cd0ade2890665451c7bbc4 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:32 -0700 Subject: [PATCH 278/278] iommu/arm-smmu-v3: Set MEV bit in nested STE for DoS mitigations ANBZ: #13617 commit da0c56520e880441d0503d0cf0d6853dcfb5f1a4 upstream. 
There is a DoS concern on the shared hardware event queue among devices passed through to VMs, that too many translation failures that belong to VMs could overflow the shared hardware event queue if those VMs or their VMMs don't handle/recover the devices properly. The MEV bit in the STE allows to configure the SMMU HW to merge similar event records, though there is no guarantee. Set it in a nested STE for DoS mitigations. In the future, we might want to enable the MEV for non-nested cases too such as domain->type == IOMMU_DOMAIN_UNMANAGED or even IOMMU_DOMAIN_DMA. Link: https://patch.msgid.link/r/8ed12feef67fc65273d0f5925f401a81f56acebe.1741719725.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Acked-by: Will Deacon Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 2 ++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 4 ++-- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index c7ecf72f5b53..0590e3b51745 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -43,6 +43,8 @@ static void arm_smmu_make_nested_cd_table_ste( target->data[0] |= nested_domain->ste[0] & ~cpu_to_le64(STRTAB_STE_0_CFG); target->data[1] |= nested_domain->ste[1]; + /* Merge events for DoS mitigations on eventq */ + target->data[1] |= cpu_to_le64(STRTAB_STE_1_MEV); } /* diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 3fc688b5a65f..564a70f313a2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1052,7 +1052,7 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW | - STRTAB_STE_1_EATS); + STRTAB_STE_1_EATS | STRTAB_STE_1_MEV); used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID); /* @@ -1068,7 +1068,7 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) if (cfg & BIT(1)) { used_bits[1] |= cpu_to_le64(STRTAB_STE_1_S2FWB | STRTAB_STE_1_EATS | - STRTAB_STE_1_SHCFG); + STRTAB_STE_1_SHCFG | STRTAB_STE_1_MEV); used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR | STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI | diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index d82ee67975d7..f011e2718918 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -266,6 +266,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_1_S1COR GENMASK_ULL(5, 4) #define STRTAB_STE_1_S1CSH GENMASK_ULL(7, 6) +#define STRTAB_STE_1_MEV (1UL << 19) #define STRTAB_STE_1_S2FWB (1UL << 25) #define STRTAB_STE_1_S1STALLD (1UL << 27) -- Gitee