From 8c62f9f540034c62c8a797c0fcc59af578c81462 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Sun, 26 Nov 2023 16:16:59 +0100 Subject: [PATCH 001/219] dt-bindings: iommu: dart: Add t8103-usb4-dart compatible ANBZ: #13617 commit a08bd8df97b7549fe0edc53d087322314a0c4dfe upstream. This DART variant is found in the t8103 (M1) SoCs and used for the USB4/Thunderbolt PCIe ports. Unlike the regular t8103 DART these support up to 64 SIDs and require a slightly different MMIO layout. Acked-by: Hector Martin Acked-by: Rob Herring Signed-off-by: Sven Peter Link: https://lore.kernel.org/r/20231126151701.16534-2-sven@svenpeter.dev Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- Documentation/devicetree/bindings/iommu/apple,dart.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/iommu/apple,dart.yaml b/Documentation/devicetree/bindings/iommu/apple,dart.yaml index 903edf85d72e..7adb1de455a5 100644 --- a/Documentation/devicetree/bindings/iommu/apple,dart.yaml +++ b/Documentation/devicetree/bindings/iommu/apple,dart.yaml @@ -24,6 +24,7 @@ properties: compatible: enum: - apple,t8103-dart + - apple,t8103-usb4-dart - apple,t8110-dart - apple,t6000-dart -- Gitee From 49dd8bbf18bfab9d4eb35be81a294cc51adc4ebf Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Sun, 26 Nov 2023 16:17:00 +0100 Subject: [PATCH 002/219] iommu/apple-dart: Write to all DART_T8020_STREAM_SELECT ANBZ: #13617 commit c58a17a99753749aabd979bc62babb2243266dcc upstream. We're about to add support for a DART variant that use more than 16 streams and requires writing to two separate stream select registers when issuing TLB flushes. Acked-by: Hector Martin Signed-off-by: Sven Peter Link: https://lore.kernel.org/r/20231126151701.16534-3-sven@svenpeter.dev Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/apple-dart.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index d5204c9f7186..7b461da8dbeb 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -368,12 +368,14 @@ apple_dart_t8020_hw_stream_command(struct apple_dart_stream_map *stream_map, u32 command) { unsigned long flags; - int ret; + int ret, i; u32 command_reg; spin_lock_irqsave(&stream_map->dart->lock, flags); - writel(stream_map->sidmap[0], stream_map->dart->regs + DART_T8020_STREAM_SELECT); + for (i = 0; i < BITS_TO_U32(stream_map->dart->num_streams); i++) + writel(stream_map->sidmap[i], + stream_map->dart->regs + DART_T8020_STREAM_SELECT + 4 * i); writel(command, stream_map->dart->regs + DART_T8020_STREAM_COMMAND); ret = readl_poll_timeout_atomic( -- Gitee From 42e0b9c6f9ed35c0af0d3a2e98269e1418ccc4ee Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Sun, 26 Nov 2023 16:17:01 +0100 Subject: [PATCH 003/219] iommu/apple-dart: Add support for t8103 USB4 DART ANBZ: #13617 commit 863c092323ab17a5bc4b2f3185d4136f9e57bbe3 upstream. This variant of the regular t8103 DART is used for the two USB4/Thunderbolt PCIe controllers. It supports 64 instead of 16 streams which requires a slightly different MMIO layout. Acked-by: Hector Martin Signed-off-by: Sven Peter Link: https://lore.kernel.org/r/20231126151701.16534-4-sven@svenpeter.dev Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/apple-dart.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 7b461da8dbeb..d6b5cac9aede 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -81,6 +81,7 @@ #define DART_T8020_TCR_BYPASS_DAPF BIT(12) #define DART_T8020_TTBR 0x200 +#define DART_T8020_USB4_TTBR 0x400 #define DART_T8020_TTBR_VALID BIT(31) #define DART_T8020_TTBR_ADDR_FIELD_SHIFT 0 #define DART_T8020_TTBR_SHIFT 12 @@ -1217,6 +1218,33 @@ static const struct apple_dart_hw apple_dart_hw_t8103 = { .ttbr_shift = DART_T8020_TTBR_SHIFT, .ttbr_count = 4, }; + +static const struct apple_dart_hw apple_dart_hw_t8103_usb4 = { + .type = DART_T8020, + .irq_handler = apple_dart_t8020_irq, + .invalidate_tlb = apple_dart_t8020_hw_invalidate_tlb, + .oas = 36, + .fmt = APPLE_DART, + .max_sid_count = 64, + + .enable_streams = DART_T8020_STREAMS_ENABLE, + .lock = DART_T8020_CONFIG, + .lock_bit = DART_T8020_CONFIG_LOCK, + + .error = DART_T8020_ERROR, + + .tcr = DART_T8020_TCR, + .tcr_enabled = DART_T8020_TCR_TRANSLATE_ENABLE, + .tcr_disabled = 0, + .tcr_bypass = 0, + + .ttbr = DART_T8020_USB4_TTBR, + .ttbr_valid = DART_T8020_TTBR_VALID, + .ttbr_addr_field_shift = DART_T8020_TTBR_ADDR_FIELD_SHIFT, + .ttbr_shift = DART_T8020_TTBR_SHIFT, + .ttbr_count = 4, +}; + static const struct apple_dart_hw apple_dart_hw_t6000 = { .type = DART_T6000, .irq_handler = apple_dart_t8020_irq, @@ -1309,6 +1337,7 @@ static DEFINE_SIMPLE_DEV_PM_OPS(apple_dart_pm_ops, apple_dart_suspend, apple_dar static const struct of_device_id apple_dart_of_match[] = { { .compatible = "apple,t8103-dart", .data = &apple_dart_hw_t8103 }, + { .compatible = "apple,t8103-usb4-dart", .data = &apple_dart_hw_t8103_usb4 }, { .compatible = "apple,t8110-dart", .data = &apple_dart_hw_t8110 }, { .compatible = "apple,t6000-dart", .data = &apple_dart_hw_t6000 }, {}, -- Gitee From 1e41565cba5d9696b4257f95fd0bd3c634cc49cc Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Sun, 26 Nov 2023 17:20:09 +0100 Subject: [PATCH 004/219] iommu/apple-dart: Use readl instead of readl_relaxed for consistency ANBZ: #13617 commit 60732292a135e8b8152858f219a563c05e9569e9 upstream. While the readl_relaxed in apple_dart_suspend is correct the rest of the driver uses the non-relaxed variants everywhere and the single readl_relaxed is inconsistent and possibly confusing. Signed-off-by: Sven Peter Acked-by: Hector Martin Reviewed-by: Neal Gompa Link: https://lore.kernel.org/r/20231126162009.17934-1-sven@svenpeter.dev Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/apple-dart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index d6b5cac9aede..39c517aa2721 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -1302,7 +1302,7 @@ static __maybe_unused int apple_dart_suspend(struct device *dev) unsigned int sid, idx; for (sid = 0; sid < dart->num_streams; sid++) { - dart->save_tcr[sid] = readl_relaxed(dart->regs + DART_TCR(dart, sid)); + dart->save_tcr[sid] = readl(dart->regs + DART_TCR(dart, sid)); for (idx = 0; idx < dart->hw->ttbr_count; idx++) dart->save_ttbr[sid][idx] = readl(dart->regs + DART_TTBR(dart, sid, idx)); -- Gitee From 65c6ec4d17da811e41c9cd51700755e490566b77 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 12 Nov 2023 14:50:13 -0400 Subject: [PATCH 005/219] iommufd: Add iommufd_ctx to iommufd_put_object() ANBZ: #13617 commit bd7a282650b8beb57bc9d19bfcb714b1ccae843a upstream. Will be used in the next patch. Link: https://lore.kernel.org/r/1-v2-ca9e00171c5b+123-iommufd_syz4_jgg@nvidia.com/ Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe [ Conflicts: drivers/iommu/iommufd/selftest.c bugfix fd4d5cd7a2e8 is rebased by stable which introduces __iommufd_test_syz_conv_iova ] Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/device.c | 14 +++++++------- drivers/iommu/iommufd/hw_pagetable.c | 8 ++++---- drivers/iommu/iommufd/ioas.c | 14 +++++++------- drivers/iommu/iommufd/iommufd_private.h | 3 ++- drivers/iommu/iommufd/selftest.c | 14 +++++++------- drivers/iommu/iommufd/vfio_compat.c | 18 +++++++++--------- 6 files changed, 36 insertions(+), 35 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index dbcb027857b6..68ac138d925c 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -587,7 +587,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, continue; destroy_hwpt = (*do_attach)(idev, hwpt); if (IS_ERR(destroy_hwpt)) { - iommufd_put_object(&hwpt->obj); + iommufd_put_object(idev->ictx, &hwpt->obj); /* * -EINVAL means the domain is incompatible with the * device. Other error codes should propagate to @@ -599,7 +599,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, goto out_unlock; } *pt_id = hwpt->obj.id; - iommufd_put_object(&hwpt->obj); + iommufd_put_object(idev->ictx, &hwpt->obj); goto out_unlock; } @@ -668,7 +668,7 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, destroy_hwpt = ERR_PTR(-EINVAL); goto out_put_pt_obj; } - iommufd_put_object(pt_obj); + iommufd_put_object(idev->ictx, pt_obj); /* This destruction has to be after we unlock everything */ if (destroy_hwpt) @@ -676,7 +676,7 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, return 0; out_put_pt_obj: - iommufd_put_object(pt_obj); + iommufd_put_object(idev->ictx, pt_obj); return PTR_ERR(destroy_hwpt); } @@ -808,7 +808,7 @@ static int iommufd_access_change_ioas_id(struct iommufd_access *access, u32 id) if (IS_ERR(ioas)) return PTR_ERR(ioas); rc = iommufd_access_change_ioas(access, ioas); - iommufd_put_object(&ioas->obj); + iommufd_put_object(access->ictx, &ioas->obj); return rc; } @@ -957,7 +957,7 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, access->ops->unmap(access->data, iova, length); - iommufd_put_object(&access->obj); + iommufd_put_object(access->ictx, &access->obj); xa_lock(&ioas->iopt.access_list); } xa_unlock(&ioas->iopt.access_list); @@ -1259,6 +1259,6 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) out_free: kfree(data); out_put: - iommufd_put_object(&idev->obj); + iommufd_put_object(ucmd->ictx, &idev->obj); return rc; } diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 5be7f513b622..29612106a962 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -320,9 +320,9 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) if (ioas) mutex_unlock(&ioas->mutex); out_put_pt: - iommufd_put_object(pt_obj); + iommufd_put_object(ucmd->ictx, pt_obj); out_put_idev: - iommufd_put_object(&idev->obj); + iommufd_put_object(ucmd->ictx, &idev->obj); return rc; } @@ -347,7 +347,7 @@ int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt_paging->common.domain, enable); - iommufd_put_object(&hwpt_paging->common.obj); + iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); return rc; } @@ -370,6 +370,6 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) rc = iopt_read_and_clear_dirty_data( &ioas->iopt, hwpt_paging->common.domain, cmd->flags, cmd); - iommufd_put_object(&hwpt_paging->common.obj); + iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); return rc; } diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 0407e2b758ef..157a89b993e4 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -105,7 +105,7 @@ int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd) rc = -EMSGSIZE; out_put: up_read(&ioas->iopt.iova_rwsem); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -175,7 +175,7 @@ int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd) interval_tree_remove(node, &allowed_iova); kfree(container_of(node, struct iopt_allowed, node)); } - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -232,7 +232,7 @@ int iommufd_ioas_map(struct iommufd_ucmd *ucmd) cmd->iova = iova; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_put: - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -266,7 +266,7 @@ int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) return PTR_ERR(src_ioas); rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length, &pages_list); - iommufd_put_object(&src_ioas->obj); + iommufd_put_object(ucmd->ictx, &src_ioas->obj); if (rc) return rc; @@ -287,7 +287,7 @@ int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) cmd->dst_iova = iova; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_put_dst: - iommufd_put_object(&dst_ioas->obj); + iommufd_put_object(ucmd->ictx, &dst_ioas->obj); out_pages: iopt_free_pages_list(&pages_list); return rc; @@ -323,7 +323,7 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_put: - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -401,6 +401,6 @@ int iommufd_ioas_option(struct iommufd_ucmd *ucmd) rc = -EOPNOTSUPP; } - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index a74cfefffbc6..f918a22f0d48 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -154,7 +154,8 @@ static inline bool iommufd_lock_obj(struct iommufd_object *obj) struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, enum iommufd_object_type type); -static inline void iommufd_put_object(struct iommufd_object *obj) +static inline void iommufd_put_object(struct iommufd_ctx *ictx, + struct iommufd_object *obj) { refcount_dec(&obj->users); up_read(&obj->destroy_rwsem); diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 67a7591390b9..d437ffb715e3 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -101,7 +101,7 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, if (IS_ERR(ioas)) return; *iova = __iommufd_test_syz_conv_iova(&ioas->iopt, iova); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); } struct mock_iommu_domain { @@ -515,7 +515,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, return hwpt; if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED || hwpt->domain->ops != mock_ops.default_domain_ops) { - iommufd_put_object(&hwpt->obj); + iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain); @@ -533,7 +533,7 @@ get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id, return hwpt; if (hwpt->domain->type != IOMMU_DOMAIN_NESTED || hwpt->domain->ops != &domain_nested_ops) { - iommufd_put_object(&hwpt->obj); + iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } *mock_nested = container_of(hwpt->domain, @@ -696,7 +696,7 @@ static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_dev_obj: - iommufd_put_object(dev_obj); + iommufd_put_object(ucmd->ictx, dev_obj); return rc; } @@ -714,7 +714,7 @@ static int iommufd_test_add_reserved(struct iommufd_ucmd *ucmd, down_write(&ioas->iopt.iova_rwsem); rc = iopt_reserve_iova(&ioas->iopt, start, start + length - 1, NULL); up_write(&ioas->iopt.iova_rwsem); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -769,7 +769,7 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, rc = 0; out_put: - iommufd_put_object(&hwpt->obj); + iommufd_put_object(ucmd->ictx, &hwpt->obj); return rc; } @@ -1248,7 +1248,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, out_free: kvfree(tmp); out_put: - iommufd_put_object(&hwpt->obj); + iommufd_put_object(ucmd->ictx, &hwpt->obj); return rc; } diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index 538fbf76354d..a3ad5f0b6c59 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -41,7 +41,7 @@ int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id) if (IS_ERR(ioas)) return PTR_ERR(ioas); *out_ioas_id = ioas->obj.id; - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return 0; } EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO); @@ -98,7 +98,7 @@ int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx) if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) { ret = 0; - iommufd_put_object(&ictx->vfio_ioas->obj); + iommufd_put_object(ictx, &ictx->vfio_ioas->obj); goto out_abort; } ictx->vfio_ioas = ioas; @@ -133,7 +133,7 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) if (IS_ERR(ioas)) return PTR_ERR(ioas); cmd->ioas_id = ioas->obj.id; - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); case IOMMU_VFIO_IOAS_SET: @@ -143,7 +143,7 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) xa_lock(&ucmd->ictx->objects); ucmd->ictx->vfio_ioas = ioas; xa_unlock(&ucmd->ictx->objects); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return 0; case IOMMU_VFIO_IOAS_CLEAR: @@ -190,7 +190,7 @@ static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd, iova = map.iova; rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr), map.size, iommu_prot, 0); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } @@ -249,7 +249,7 @@ static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd, rc = -EFAULT; err_put: - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } @@ -272,7 +272,7 @@ static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) } mutex_unlock(&ioas->mutex); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } @@ -349,7 +349,7 @@ static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type) */ if (type == VFIO_TYPE1_IOMMU) rc = iopt_disable_large_pages(&ioas->iopt); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } @@ -511,7 +511,7 @@ static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx, out_put: up_read(&ioas->iopt.iova_rwsem); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } -- Gitee From eb5519c7ac1c4492da968ed7168f953bf117fe72 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 12 Nov 2023 15:44:08 -0400 Subject: [PATCH 006/219] iommufd: Do not UAF during iommufd_put_object() ANBZ: #13617 commit 6f9c4d8c468c189d6dc470324bd52955f8aa0a10 upstream. The mixture of kernel and user space lifecycle objects continues to be complicated inside iommufd. The obj->destroy_rwsem is used to bring order to the kernel driver destruction sequence but it cannot be sequenced right with the other refcounts so we end up possibly UAF'ing: BUG: KASAN: slab-use-after-free in __up_read+0x627/0x750 kernel/locking/rwsem.c:1342 Read of size 8 at addr ffff888073cde868 by task syz-executor934/6535 CPU: 1 PID: 6535 Comm: syz-executor934 Not tainted 6.6.0-rc7-syzkaller-00195-g2af9b20dbb39 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x1b0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0xc4/0x620 mm/kasan/report.c:475 kasan_report+0xda/0x110 mm/kasan/report.c:588 __up_read+0x627/0x750 kernel/locking/rwsem.c:1342 iommufd_put_object drivers/iommu/iommufd/iommufd_private.h:149 [inline] iommufd_vfio_ioas+0x46c/0x580 drivers/iommu/iommufd/vfio_compat.c:146 iommufd_fops_ioctl+0x347/0x4d0 drivers/iommu/iommufd/main.c:398 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl fs/ioctl.c:857 [inline] __x64_sys_ioctl+0x18f/0x210 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd There are two races here, the more obvious one: CPU 0 CPU 1 iommufd_put_object() iommufd_destroy() refcount_dec(&obj->users) iommufd_object_remove() kfree() up_read(&obj->destroy_rwsem) // Boom And there is also perhaps some possibility that the rwsem could hit an issue: CPU 0 CPU 1 iommufd_put_object() iommufd_object_destroy_user() refcount_dec(&obj->users); down_write(&obj->destroy_rwsem) up_read(&obj->destroy_rwsem); atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); tmp = atomic_long_add_return_release() rwsem_try_write_lock() iommufd_object_remove() up_write(&obj->destroy_rwsem) kfree() clear_nonspinnable() // Boom Fix this by reorganizing this again so that two refcounts are used to keep track of things with a rule that users == 0 && shortterm_users == 0 means no other threads have that memory. Put a wait_queue in the iommufd_ctx object that is triggered when any sub object reaches a 0 shortterm_users. This allows the same wait for userspace ioctls to finish behavior that the rwsem was providing. This is weaker still than the prior versions: - There is no bias on shortterm_users so if some thread is waiting to destroy other threads can continue to get new read sides - If destruction fails, eg because of an active in-kernel user, then shortterm_users will have cycled to zero momentarily blocking new users - If userspace races destroy with other userspace operations they continue to get an EBUSY since we still can't intermix looking up an ID and sleeping for its unref In all cases these are things that userspace brings on itself, correct programs will not hit them. Fixes: 99f98a7c0d69 ("iommufd: IOMMUFD_DESTROY should not increase the refcount") Link: https://lore.kernel.org/all/2-v2-ca9e00171c5b+123-iommufd_syz4_jgg@nvidia.com/ Reported-by: syzbot+d31adfb277377ef8fcba@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/00000000000055ef9a0609336580@google.com Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/iommufd_private.h | 67 +++++++++-- drivers/iommu/iommufd/main.c | 146 +++++++++++++----------- 2 files changed, 135 insertions(+), 78 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index f918a22f0d48..abae041e256f 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -21,6 +21,7 @@ struct iommufd_ctx { struct file *file; struct xarray objects; struct xarray groups; + wait_queue_head_t destroy_wait; u8 account_mode; /* Compatibility with VFIO no iommu */ @@ -135,7 +136,7 @@ enum iommufd_object_type { /* Base struct for all objects with a userspace ID handle. */ struct iommufd_object { - struct rw_semaphore destroy_rwsem; + refcount_t shortterm_users; refcount_t users; enum iommufd_object_type type; unsigned int id; @@ -143,10 +144,15 @@ struct iommufd_object { static inline bool iommufd_lock_obj(struct iommufd_object *obj) { - if (!down_read_trylock(&obj->destroy_rwsem)) + if (!refcount_inc_not_zero(&obj->users)) return false; - if (!refcount_inc_not_zero(&obj->users)) { - up_read(&obj->destroy_rwsem); + if (!refcount_inc_not_zero(&obj->shortterm_users)) { + /* + * If the caller doesn't already have a ref on obj this must be + * called under the xa_lock. Otherwise the caller is holding a + * ref on users. Thus it cannot be one before this decrement. + */ + refcount_dec(&obj->users); return false; } return true; @@ -157,8 +163,13 @@ struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, static inline void iommufd_put_object(struct iommufd_ctx *ictx, struct iommufd_object *obj) { + /* + * Users first, then shortterm so that REMOVE_WAIT_SHORTTERM never sees + * a spurious !0 users with a 0 shortterm_users. + */ refcount_dec(&obj->users); - up_read(&obj->destroy_rwsem); + if (refcount_dec_and_test(&obj->shortterm_users)) + wake_up_interruptible_all(&ictx->destroy_wait); } void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj); @@ -166,17 +177,49 @@ void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj); void iommufd_object_finalize(struct iommufd_ctx *ictx, struct iommufd_object *obj); -void __iommufd_object_destroy_user(struct iommufd_ctx *ictx, - struct iommufd_object *obj, bool allow_fail); + +enum { + REMOVE_WAIT_SHORTTERM = 1, +}; +int iommufd_object_remove(struct iommufd_ctx *ictx, + struct iommufd_object *to_destroy, u32 id, + unsigned int flags); + +/* + * The caller holds a users refcount and wants to destroy the object. At this + * point the caller has no shortterm_users reference and at least the xarray + * will be holding one. + */ static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx, struct iommufd_object *obj) { - __iommufd_object_destroy_user(ictx, obj, false); + int ret; + + ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT_SHORTTERM); + + /* + * If there is a bug and we couldn't destroy the object then we did put + * back the caller's users refcount and will eventually try to free it + * again during close. + */ + WARN_ON(ret); } -static inline void iommufd_object_deref_user(struct iommufd_ctx *ictx, - struct iommufd_object *obj) + +/* + * The HWPT allocated by autodomains is used in possibly many devices and + * is automatically destroyed when its refcount reaches zero. + * + * If userspace uses the HWPT manually, even for a short term, then it will + * disrupt this refcounting and the auto-free in the kernel will not work. + * Userspace that tries to use the automatically allocated HWPT must be careful + * to ensure that it is consistently destroyed, eg by not racing accesses + * and by not attaching an automatic HWPT to a device manually. + */ +static inline void +iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx, + struct iommufd_object *obj) { - __iommufd_object_destroy_user(ictx, obj, true); + iommufd_object_remove(ictx, obj, obj->id, 0); } struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, @@ -312,7 +355,7 @@ static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, lockdep_assert_not_held(&hwpt_paging->ioas->mutex); if (hwpt_paging->auto_domain) { - iommufd_object_deref_user(ictx, &hwpt->obj); + iommufd_object_put_and_try_destroy(ictx, &hwpt->obj); return; } } diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 45b9d40773b1..c9091e46d208 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -33,7 +33,6 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type) { - static struct lock_class_key obj_keys[IOMMUFD_OBJ_MAX]; struct iommufd_object *obj; int rc; @@ -41,15 +40,8 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, if (!obj) return ERR_PTR(-ENOMEM); obj->type = type; - /* - * In most cases the destroy_rwsem is obtained with try so it doesn't - * interact with lockdep, however on destroy we have to sleep. This - * means if we have to destroy an object while holding a get on another - * object it triggers lockdep. Using one locking class per object type - * is a simple and reasonable way to avoid this. - */ - __init_rwsem(&obj->destroy_rwsem, "iommufd_object::destroy_rwsem", - &obj_keys[type]); + /* Starts out bias'd by 1 until it is removed from the xarray */ + refcount_set(&obj->shortterm_users, 1); refcount_set(&obj->users, 1); /* @@ -129,92 +121,113 @@ struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, return obj; } +static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, + struct iommufd_object *to_destroy) +{ + if (refcount_dec_and_test(&to_destroy->shortterm_users)) + return 0; + + if (wait_event_timeout(ictx->destroy_wait, + refcount_read(&to_destroy->shortterm_users) == + 0, + msecs_to_jiffies(10000))) + return 0; + + pr_crit("Time out waiting for iommufd object to become free\n"); + refcount_inc(&to_destroy->shortterm_users); + return -EBUSY; +} + /* * Remove the given object id from the xarray if the only reference to the - * object is held by the xarray. The caller must call ops destroy(). + * object is held by the xarray. */ -static struct iommufd_object *iommufd_object_remove(struct iommufd_ctx *ictx, - u32 id, bool extra_put) +int iommufd_object_remove(struct iommufd_ctx *ictx, + struct iommufd_object *to_destroy, u32 id, + unsigned int flags) { struct iommufd_object *obj; XA_STATE(xas, &ictx->objects, id); - - xa_lock(&ictx->objects); - obj = xas_load(&xas); - if (xa_is_zero(obj) || !obj) { - obj = ERR_PTR(-ENOENT); - goto out_xa; - } + bool zerod_shortterm = false; + int ret; /* - * If the caller is holding a ref on obj we put it here under the - * spinlock. + * The purpose of the shortterm_users is to ensure deterministic + * destruction of objects used by external drivers and destroyed by this + * function. Any temporary increment of the refcount must increment + * shortterm_users, such as during ioctl execution. */ - if (extra_put) + if (flags & REMOVE_WAIT_SHORTTERM) { + ret = iommufd_object_dec_wait_shortterm(ictx, to_destroy); + if (ret) { + /* + * We have a bug. Put back the callers reference and + * defer cleaning this object until close. + */ + refcount_dec(&to_destroy->users); + return ret; + } + zerod_shortterm = true; + } + + xa_lock(&ictx->objects); + obj = xas_load(&xas); + if (to_destroy) { + /* + * If the caller is holding a ref on obj we put it here under + * the spinlock. + */ refcount_dec(&obj->users); + if (WARN_ON(obj != to_destroy)) { + ret = -ENOENT; + goto err_xa; + } + } else if (xa_is_zero(obj) || !obj) { + ret = -ENOENT; + goto err_xa; + } + if (!refcount_dec_if_one(&obj->users)) { - obj = ERR_PTR(-EBUSY); - goto out_xa; + ret = -EBUSY; + goto err_xa; } xas_store(&xas, NULL); if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj)) ictx->vfio_ioas = NULL; - -out_xa: xa_unlock(&ictx->objects); - /* The returned object reference count is zero */ - return obj; -} - -/* - * The caller holds a users refcount and wants to destroy the object. Returns - * true if the object was destroyed. In all cases the caller no longer has a - * reference on obj. - */ -void __iommufd_object_destroy_user(struct iommufd_ctx *ictx, - struct iommufd_object *obj, bool allow_fail) -{ - struct iommufd_object *ret; - /* - * The purpose of the destroy_rwsem is to ensure deterministic - * destruction of objects used by external drivers and destroyed by this - * function. Any temporary increment of the refcount must hold the read - * side of this, such as during ioctl execution. - */ - down_write(&obj->destroy_rwsem); - ret = iommufd_object_remove(ictx, obj->id, true); - up_write(&obj->destroy_rwsem); - - if (allow_fail && IS_ERR(ret)) - return; - - /* - * If there is a bug and we couldn't destroy the object then we did put - * back the caller's refcount and will eventually try to free it again - * during close. + * Since users is zero any positive users_shortterm must be racing + * iommufd_put_object(), or we have a bug. */ - if (WARN_ON(IS_ERR(ret))) - return; + if (!zerod_shortterm) { + ret = iommufd_object_dec_wait_shortterm(ictx, obj); + if (WARN_ON(ret)) + return ret; + } iommufd_object_ops[obj->type].destroy(obj); kfree(obj); + return 0; + +err_xa: + if (zerod_shortterm) { + /* Restore the xarray owned reference */ + refcount_set(&obj->shortterm_users, 1); + } + xa_unlock(&ictx->objects); + + /* The returned object reference count is zero */ + return ret; } static int iommufd_destroy(struct iommufd_ucmd *ucmd) { struct iommu_destroy *cmd = ucmd->cmd; - struct iommufd_object *obj; - obj = iommufd_object_remove(ucmd->ictx, cmd->id, false); - if (IS_ERR(obj)) - return PTR_ERR(obj); - iommufd_object_ops[obj->type].destroy(obj); - kfree(obj); - return 0; + return iommufd_object_remove(ucmd->ictx, NULL, cmd->id, 0); } static int iommufd_fops_open(struct inode *inode, struct file *filp) @@ -238,6 +251,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); xa_init(&ictx->groups); ictx->file = filp; + init_waitqueue_head(&ictx->destroy_wait); filp->private_data = ictx; return 0; } -- Gitee From 7aadaddfde8e755683b96c68ca95effd94c8cb8d Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 17 Oct 2023 09:42:36 -0500 Subject: [PATCH 007/219] iommu/amd: Do not flush IRTE when only updating isRun and destination fields ANBZ: #13617 commit 57cdb720eaa5c4b2fcf04ba6ff6a26f638b0b474 upstream. According to the recent update in the AMD IOMMU spec [1], the IsRun and Destination fields of the Interrupt Remapping Table Entry (IRTE) are not cached by the IOMMU hardware. Therefore, do not issue the INVALIDATE_INTERRUPT_TABLE command when updating IRTE[IsRun] and IRTE[Destination] when IRTE[GuestMode]=1, which should help improve IOMMU AVIC/x2AVIC performance. References: [1] AMD IOMMU Spec Revision (Rev 3.08-PUB) (Link: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/specifications/48882_IOMMU.pdf) Cc: Joao Martins Cc: Alejandro Jimenez Signed-off-by: Suravee Suthikulpanit Reviewed-by: Vasant Hegde Tested-by: Alejandro Jimenez Link: https://lore.kernel.org/r/20231017144236.8287-1-suravee.suthikulpanit@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 2906a279b991..b6c6be2eb819 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3104,8 +3104,8 @@ static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count, return index; } -static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, - struct irte_ga *irte) +static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, + struct irte_ga *irte) { struct irq_remap_table *table; struct irte_ga *entry; @@ -3132,6 +3132,18 @@ static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, raw_spin_unlock_irqrestore(&table->lock, flags); + return 0; +} + +static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, + struct irte_ga *irte) +{ + bool ret; + + ret = __modify_irte_ga(iommu, devid, index, irte); + if (ret) + return ret; + iommu_flush_irt_and_complete(iommu, devid); return 0; @@ -3815,8 +3827,8 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data) } entry->lo.fields_vapic.is_run = is_run; - return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, - ir_data->irq_2_irte.index, entry); + return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, + ir_data->irq_2_irte.index, entry); } EXPORT_SYMBOL(amd_iommu_update_ga); #endif -- Gitee From 9b5b09e2539ea4b393a13b6b44aff5996e8bedd7 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:07 +0000 Subject: [PATCH 008/219] iommu/amd: Rename iommu_flush_all_caches() -> amd_iommu_flush_all_caches() ANBZ: #13617 commit af3263758bf0b65a040e90190ab708b8a4027947 upstream. Rename function inline with driver naming convention. No functional changes. Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-2-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu.h | 5 +++++ drivers/iommu/amd/amd_iommu_types.h | 6 ------ drivers/iommu/amd/init.c | 8 ++++---- drivers/iommu/amd/iommu.c | 2 +- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 978d3060a67e..746ed0b4a83e 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -53,6 +53,11 @@ int amd_iommu_pdev_enable_cap_pri(struct pci_dev *pdev); void amd_iommu_pdev_disable_cap_pri(struct pci_dev *pdev); int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, u64 address); +/* + * This function flushes all internal caches of + * the IOMMU used by this driver. + */ +void amd_iommu_flush_all_caches(struct amd_iommu *iommu); void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); void amd_iommu_domain_update(struct protection_domain *domain); void amd_iommu_domain_flush_complete(struct protection_domain *domain); diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 90b7d7950a9e..809d74faa1a5 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -902,12 +902,6 @@ extern int amd_iommu_max_glx_val; extern u64 amd_iommu_efr; extern u64 amd_iommu_efr2; -/* - * This function flushes all internal caches of - * the IOMMU used by this driver. - */ -void iommu_flush_all_caches(struct amd_iommu *iommu); - static inline int get_ioapic_devid(int id) { struct devid_map *entry; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index ad5070e33af2..2f7e1cca7d34 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -2235,7 +2235,7 @@ static int __init amd_iommu_init_pci(void) init_device_table_dma(pci_seg); for_each_iommu(iommu) - iommu_flush_all_caches(iommu); + amd_iommu_flush_all_caches(iommu); print_iommu_info(); @@ -2785,7 +2785,7 @@ static void early_enable_iommu(struct amd_iommu *iommu) iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); iommu_enable(iommu); - iommu_flush_all_caches(iommu); + amd_iommu_flush_all_caches(iommu); } /* @@ -2841,7 +2841,7 @@ static void early_enable_iommus(void) iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); iommu_set_device_table(iommu); - iommu_flush_all_caches(iommu); + amd_iommu_flush_all_caches(iommu); } } } @@ -3313,7 +3313,7 @@ static int __init state_next(void) uninit_device_table_dma(pci_seg); for_each_iommu(iommu) - iommu_flush_all_caches(iommu); + amd_iommu_flush_all_caches(iommu); } } return ret; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b6c6be2eb819..5fb5b7b3632d 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1398,7 +1398,7 @@ static void amd_iommu_flush_irt_all(struct amd_iommu *iommu) iommu_completion_wait(iommu); } -void iommu_flush_all_caches(struct amd_iommu *iommu) +void amd_iommu_flush_all_caches(struct amd_iommu *iommu) { if (check_feature(FEATURE_IA)) { amd_iommu_flush_all(iommu); -- Gitee From 020ccbc684c5da4484fd2b1f4e50ca0300cbc051 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:08 +0000 Subject: [PATCH 009/219] iommu/amd: Remove redundant domain flush from attach_device() ANBZ: #13617 commit 3f2571fed2fa9e9ea61fd990a9a4fc20ca83b62e upstream. Domain flush was introduced in attach_device() path to handle kdump scenario. Later init code was enhanced to handle kdump scenario where it also takes care of flushing everything including TLB (see early_enable_iommus()). Hence remove redundant flush from attach_device() function. Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-3-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 5fb5b7b3632d..c896b2d57cba 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1887,15 +1887,6 @@ static int attach_device(struct device *dev, do_attach(dev_data, domain); - /* - * We might boot into a crash-kernel here. The crashed kernel - * left the caches in the IOMMU dirty. So we have to flush - * here to evict all dirty stuff. - */ - amd_iommu_domain_flush_tlb_pde(domain); - - amd_iommu_domain_flush_complete(domain); - out: spin_unlock(&dev_data->lock); -- Gitee From 0fbab4e92ef7c8670f3cf6add081dfbdc482df16 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:09 +0000 Subject: [PATCH 010/219] iommu/amd: Remove redundant passing of PDE bit ANBZ: #13617 commit a976da66e8e547cd07a9792f5854c2f73c456a21 upstream. Current code always sets PDE bit in INVALIDATE_IOMMU_PAGES command. Hence get rid of 'pde' variable across functions. We can re-introduce this bit whenever its needed. Suggested-by: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-4-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index c896b2d57cba..be5bc25c65db 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1130,7 +1130,7 @@ static inline u64 build_inv_address(u64 address, size_t size) } static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, - size_t size, u16 domid, int pde) + size_t size, u16 domid) { u64 inv_address = build_inv_address(address, size); @@ -1139,8 +1139,8 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, cmd->data[2] = lower_32_bits(inv_address); cmd->data[3] = upper_32_bits(inv_address); CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); - if (pde) /* PDE bit - we want to flush everything, not only the PTEs */ - cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + /* PDE bit - we want to flush everything, not only the PTEs */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; } static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, @@ -1347,7 +1347,7 @@ static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu) for (dom_id = 0; dom_id <= last_bdf; ++dom_id) { struct iommu_cmd cmd; build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - dom_id, 1); + dom_id); iommu_queue_command(iommu, &cmd); } @@ -1358,8 +1358,7 @@ static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id) { struct iommu_cmd cmd; - build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - dom_id, 1); + build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, dom_id); iommu_queue_command(iommu, &cmd); iommu_completion_wait(iommu); @@ -1474,13 +1473,13 @@ static int device_flush_dte(struct iommu_dev_data *dev_data) * page. Otherwise it flushes the whole TLB of the IOMMU. */ static void __domain_flush_pages(struct protection_domain *domain, - u64 address, size_t size, int pde) + u64 address, size_t size) { struct iommu_dev_data *dev_data; struct iommu_cmd cmd; int ret = 0, i; - build_inv_iommu_pages(&cmd, address, size, domain->id, pde); + build_inv_iommu_pages(&cmd, address, size, domain->id); for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { if (!domain->dev_iommu[i]) @@ -1505,10 +1504,10 @@ static void __domain_flush_pages(struct protection_domain *domain, } static void domain_flush_pages(struct protection_domain *domain, - u64 address, size_t size, int pde) + u64 address, size_t size) { if (likely(!amd_iommu_np_cache)) { - __domain_flush_pages(domain, address, size, pde); + __domain_flush_pages(domain, address, size); return; } @@ -1541,7 +1540,7 @@ static void domain_flush_pages(struct protection_domain *domain, flush_size = 1ul << min_alignment; - __domain_flush_pages(domain, address, flush_size, pde); + __domain_flush_pages(domain, address, flush_size); address += flush_size; size -= flush_size; } @@ -1550,7 +1549,7 @@ static void domain_flush_pages(struct protection_domain *domain, /* Flush the whole IO/TLB for a given protection domain - including PDE */ void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain) { - domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); + domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS); } void amd_iommu_domain_flush_complete(struct protection_domain *domain) @@ -1577,7 +1576,7 @@ static void domain_flush_np_cache(struct protection_domain *domain, unsigned long flags; spin_lock_irqsave(&domain->lock, flags); - domain_flush_pages(domain, iova, size, 1); + domain_flush_pages(domain, iova, size); amd_iommu_domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -2586,7 +2585,7 @@ static void amd_iommu_iotlb_sync(struct iommu_domain *domain, unsigned long flags; spin_lock_irqsave(&dom->lock, flags); - domain_flush_pages(dom, gather->start, gather->end - gather->start + 1, 1); + domain_flush_pages(dom, gather->start, gather->end - gather->start + 1); amd_iommu_domain_flush_complete(dom); spin_unlock_irqrestore(&dom->lock, flags); } -- Gitee From 94ed982185afe372e45459df56131b5c73ddeef4 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:10 +0000 Subject: [PATCH 011/219] iommu/amd: Add support to invalidate multiple guest pages ANBZ: #13617 commit cf62924daf9f0363c82e4332d9ac9630f1d76c42 upstream. Current interface supports invalidating single page or entire guest translation information for a single process address space. IOMMU CMD_INV_IOMMU_PAGES and CMD_INV_IOTLB_PAGES commands supports invalidating range of pages. Add support to invalidate multiple pages. This is preparatory patch before consolidating host and guest invalidation code into single function. Following patches will consolidation tlb invalidation code. Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-5-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index be5bc25c65db..3dcbc5deb363 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1158,40 +1158,36 @@ static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, } static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, u32 pasid, - u64 address, bool size) + u64 address, size_t size) { - memset(cmd, 0, sizeof(*cmd)); + u64 inv_address = build_inv_address(address, size); - address &= ~(0xfffULL); + memset(cmd, 0, sizeof(*cmd)); cmd->data[0] = pasid; cmd->data[1] = domid; - cmd->data[2] = lower_32_bits(address); - cmd->data[3] = upper_32_bits(address); + cmd->data[2] = lower_32_bits(inv_address); + cmd->data[3] = upper_32_bits(inv_address); cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; - if (size) - cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); } static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid, - int qdep, u64 address, bool size) + int qdep, u64 address, size_t size) { - memset(cmd, 0, sizeof(*cmd)); + u64 inv_address = build_inv_address(address, size); - address &= ~(0xfffULL); + memset(cmd, 0, sizeof(*cmd)); cmd->data[0] = devid; cmd->data[0] |= ((pasid >> 8) & 0xff) << 16; cmd->data[0] |= (qdep & 0xff) << 24; cmd->data[1] = devid; cmd->data[1] |= (pasid & 0xff) << 16; - cmd->data[2] = lower_32_bits(address); + cmd->data[2] = lower_32_bits(inv_address); cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; - cmd->data[3] = upper_32_bits(address); - if (size) - cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + cmd->data[3] = upper_32_bits(inv_address); CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); } @@ -2651,7 +2647,7 @@ const struct iommu_ops amd_iommu_ops = { }; static int __flush_pasid(struct protection_domain *domain, u32 pasid, - u64 address, bool size) + u64 address, size_t size) { struct iommu_dev_data *dev_data; struct iommu_cmd cmd; @@ -2715,7 +2711,7 @@ static int __flush_pasid(struct protection_domain *domain, u32 pasid, static int __amd_iommu_flush_page(struct protection_domain *domain, u32 pasid, u64 address) { - return __flush_pasid(domain, pasid, address, false); + return __flush_pasid(domain, pasid, address, PAGE_SIZE); } int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, @@ -2734,8 +2730,7 @@ int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid) { - return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - true); + return __flush_pasid(domain, pasid, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS); } int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid) -- Gitee From 4347cb1c0ac325a0325926e901f89e5494bc8409 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:11 +0000 Subject: [PATCH 012/219] iommu/amd: Refactor IOMMU tlb invalidation code ANBZ: #13617 commit 4f0a600799237ed95b403f24354305b0f81ccbb4 upstream. build_inv_iommu_pages() and build_inv_iommu_pasid() pretty much duplicates the code. Hence enhance build_inv_iommu_pages() to invalidate guest pages as well. And remove build_inv_iommu_pasid(). Suggested-by: Kishon Vijay Abraham I Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-6-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 3dcbc5deb363..9ec2c5d3f76f 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1130,17 +1130,23 @@ static inline u64 build_inv_address(u64 address, size_t size) } static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, - size_t size, u16 domid) + size_t size, u16 domid, + ioasid_t pasid, bool gn) { u64 inv_address = build_inv_address(address, size); memset(cmd, 0, sizeof(*cmd)); + cmd->data[1] |= domid; cmd->data[2] = lower_32_bits(inv_address); cmd->data[3] = upper_32_bits(inv_address); - CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); /* PDE bit - we want to flush everything, not only the PTEs */ cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + if (gn) { + cmd->data[0] |= pasid; + cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; + } + CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); } static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, @@ -1157,22 +1163,6 @@ static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); } -static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, u32 pasid, - u64 address, size_t size) -{ - u64 inv_address = build_inv_address(address, size); - - memset(cmd, 0, sizeof(*cmd)); - - cmd->data[0] = pasid; - cmd->data[1] = domid; - cmd->data[2] = lower_32_bits(inv_address); - cmd->data[3] = upper_32_bits(inv_address); - cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; - cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; - CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); -} - static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid, int qdep, u64 address, size_t size) { @@ -1343,7 +1333,7 @@ static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu) for (dom_id = 0; dom_id <= last_bdf; ++dom_id) { struct iommu_cmd cmd; build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - dom_id); + dom_id, IOMMU_NO_PASID, false); iommu_queue_command(iommu, &cmd); } @@ -1354,7 +1344,8 @@ static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id) { struct iommu_cmd cmd; - build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, dom_id); + build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + dom_id, IOMMU_NO_PASID, false); iommu_queue_command(iommu, &cmd); iommu_completion_wait(iommu); @@ -1475,7 +1466,8 @@ static void __domain_flush_pages(struct protection_domain *domain, struct iommu_cmd cmd; int ret = 0, i; - build_inv_iommu_pages(&cmd, address, size, domain->id); + build_inv_iommu_pages(&cmd, address, size, domain->id, + IOMMU_NO_PASID, false); for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { if (!domain->dev_iommu[i]) @@ -2656,7 +2648,7 @@ static int __flush_pasid(struct protection_domain *domain, u32 pasid, if (!(domain->flags & PD_IOMMUV2_MASK)) return -EINVAL; - build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size); + build_inv_iommu_pages(&cmd, address, size, domain->id, pasid, true); /* * IOMMU TLB needs to be flushed before Device TLB to -- Gitee From dfcf7618b4d0ad30b46dec81d92c278b18172566 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:12 +0000 Subject: [PATCH 013/219] iommu/amd: Refactor device iotlb invalidation code ANBZ: #13617 commit bbf85fe10faadbd237a7b8df4423de4ab4bd67e7 upstream. build_inv_iotlb_pages() and build_inv_iotlb_pasid() pretty much duplicates the code. Enhance build_inv_iotlb_pages() to invalidate guest IOTLB as well. And remove build_inv_iotlb_pasid() function. Suggested-by: Kishon Vijay Abraham I Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-7-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 9ec2c5d3f76f..8e65f4f58634 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1150,34 +1150,24 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, } static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, - u64 address, size_t size) + u64 address, size_t size, + ioasid_t pasid, bool gn) { u64 inv_address = build_inv_address(address, size); memset(cmd, 0, sizeof(*cmd)); + cmd->data[0] = devid; cmd->data[0] |= (qdep & 0xff) << 24; cmd->data[1] = devid; cmd->data[2] = lower_32_bits(inv_address); cmd->data[3] = upper_32_bits(inv_address); - CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); -} - -static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid, - int qdep, u64 address, size_t size) -{ - u64 inv_address = build_inv_address(address, size); - - memset(cmd, 0, sizeof(*cmd)); + if (gn) { + cmd->data[0] |= ((pasid >> 8) & 0xff) << 16; + cmd->data[1] |= (pasid & 0xff) << 16; + cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; + } - cmd->data[0] = devid; - cmd->data[0] |= ((pasid >> 8) & 0xff) << 16; - cmd->data[0] |= (qdep & 0xff) << 24; - cmd->data[1] = devid; - cmd->data[1] |= (pasid & 0xff) << 16; - cmd->data[2] = lower_32_bits(inv_address); - cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; - cmd->data[3] = upper_32_bits(inv_address); CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); } @@ -1406,7 +1396,8 @@ static int device_flush_iotlb(struct iommu_dev_data *dev_data, int qdep = dev_data->ats_qdep; - build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size); + build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, + size, IOMMU_NO_PASID, false); return iommu_queue_command(iommu, &cmd); } @@ -2682,8 +2673,8 @@ static int __flush_pasid(struct protection_domain *domain, u32 pasid, iommu = rlookup_amd_iommu(dev_data->dev); if (!iommu) continue; - build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid, - qdep, address, size); + build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, + address, size, pasid, true); ret = iommu_queue_command(iommu, &cmd); if (ret != 0) -- Gitee From 22e7d41bbeed150c717b898b7aac8076336f6260 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:13 +0000 Subject: [PATCH 014/219] iommu/amd: Consolidate amd_iommu_domain_flush_complete() call ANBZ: #13617 commit 8d004ac1c67bc1584b4b6b90429487dac5b2d83f upstream. Call amd_iommu_domain_flush_complete() from domain_flush_pages(). That way we can remove explicit call of amd_iommu_domain_flush_complete() from various places. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-8-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/io_pgtable.c | 1 - drivers/iommu/amd/iommu.c | 21 ++++++++++----------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index 6c0621f6f572..ca22546e4d1a 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -425,7 +425,6 @@ static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, * increase_address_space(). */ amd_iommu_domain_flush_tlb_pde(dom); - amd_iommu_domain_flush_complete(dom); spin_unlock_irqrestore(&dom->lock, flags); } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 8e65f4f58634..ad5fd3ec18fc 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1487,6 +1487,10 @@ static void domain_flush_pages(struct protection_domain *domain, { if (likely(!amd_iommu_np_cache)) { __domain_flush_pages(domain, address, size); + + /* Wait until IOMMU TLB and all device IOTLB flushes are complete */ + amd_iommu_domain_flush_complete(domain); + return; } @@ -1523,6 +1527,9 @@ static void domain_flush_pages(struct protection_domain *domain, address += flush_size; size -= flush_size; } + + /* Wait until IOMMU TLB and all device IOTLB flushes are complete */ + amd_iommu_domain_flush_complete(domain); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ @@ -1556,7 +1563,6 @@ static void domain_flush_np_cache(struct protection_domain *domain, spin_lock_irqsave(&domain->lock, flags); domain_flush_pages(domain, iova, size); - amd_iommu_domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } } @@ -1827,12 +1833,9 @@ static void do_detach(struct iommu_dev_data *dev_data) /* Flush the DTE entry */ device_flush_dte(dev_data); - /* Flush IOTLB */ + /* Flush IOTLB and wait for the flushes to finish */ amd_iommu_domain_flush_tlb_pde(domain); - /* Wait for the flushes to finish */ - amd_iommu_domain_flush_complete(domain); - /* decrease reference counters - needs to happen after the flushes */ domain->dev_iommu[iommu->index] -= 1; domain->dev_cnt -= 1; @@ -2007,7 +2010,6 @@ void amd_iommu_domain_update(struct protection_domain *domain) /* Flush domain TLB(s) and wait for completion */ amd_iommu_domain_flush_tlb_pde(domain); - amd_iommu_domain_flush_complete(domain); } /***************************************************************************** @@ -2448,10 +2450,9 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, } /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */ - if (domain_flush) { + if (domain_flush) amd_iommu_domain_flush_tlb_pde(pdomain); - amd_iommu_domain_flush_complete(pdomain); - } + pdomain->dirty_tracking = enable; spin_unlock_irqrestore(&pdomain->lock, flags); @@ -2553,7 +2554,6 @@ static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) spin_lock_irqsave(&dom->lock, flags); amd_iommu_domain_flush_tlb_pde(dom); - amd_iommu_domain_flush_complete(dom); spin_unlock_irqrestore(&dom->lock, flags); } @@ -2565,7 +2565,6 @@ static void amd_iommu_iotlb_sync(struct iommu_domain *domain, spin_lock_irqsave(&dom->lock, flags); domain_flush_pages(dom, gather->start, gather->end - gather->start + 1); - amd_iommu_domain_flush_complete(dom); spin_unlock_irqrestore(&dom->lock, flags); } -- Gitee From 6111744a2717a6ebc6fe223de0d6e67378a133b5 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:14 +0000 Subject: [PATCH 015/219] iommu/amd: Make domain_flush_pages as global function ANBZ: #13617 commit 2c535dd37d677abd11b278fce89b8bdb0a55facb upstream. - Rename domain_flush_pages() -> amd_iommu_domain_flush_pages() and make it as global function. - Rename amd_iommu_domain_flush_tlb_pde() -> amd_iommu_domain_flush_all() and make it as static. - Convert v1 page table (io_pgtble.c) to use amd_iommu_domain_flush_pages(). Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-9-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu.h | 3 ++- drivers/iommu/amd/io_pgtable.c | 4 +++- drivers/iommu/amd/iommu.c | 22 ++++++++++++---------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 746ed0b4a83e..31275bc6e9e4 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -61,7 +61,8 @@ void amd_iommu_flush_all_caches(struct amd_iommu *iommu); void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); void amd_iommu_domain_update(struct protection_domain *domain); void amd_iommu_domain_flush_complete(struct protection_domain *domain); -void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain); +void amd_iommu_domain_flush_pages(struct protection_domain *domain, + u64 address, size_t size); int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid); int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid, unsigned long cr3); diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index ca22546e4d1a..2a0d1e97e52f 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -369,6 +369,8 @@ static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, bool updated = false; u64 __pte, *pte; int ret, i, count; + size_t size = pgcount << __ffs(pgsize); + unsigned long o_iova = iova; BUG_ON(!IS_ALIGNED(iova, pgsize)); BUG_ON(!IS_ALIGNED(paddr, pgsize)); @@ -424,7 +426,7 @@ static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, * Updates and flushing already happened in * increase_address_space(). */ - amd_iommu_domain_flush_tlb_pde(dom); + amd_iommu_domain_flush_pages(dom, o_iova, size); spin_unlock_irqrestore(&dom->lock, flags); } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index ad5fd3ec18fc..e108d11f1b3f 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1482,8 +1482,8 @@ static void __domain_flush_pages(struct protection_domain *domain, WARN_ON(ret); } -static void domain_flush_pages(struct protection_domain *domain, - u64 address, size_t size) +void amd_iommu_domain_flush_pages(struct protection_domain *domain, + u64 address, size_t size) { if (likely(!amd_iommu_np_cache)) { __domain_flush_pages(domain, address, size); @@ -1533,9 +1533,10 @@ static void domain_flush_pages(struct protection_domain *domain, } /* Flush the whole IO/TLB for a given protection domain - including PDE */ -void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain) +static void amd_iommu_domain_flush_all(struct protection_domain *domain) { - domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS); + amd_iommu_domain_flush_pages(domain, 0, + CMD_INV_IOMMU_ALL_PAGES_ADDRESS); } void amd_iommu_domain_flush_complete(struct protection_domain *domain) @@ -1562,7 +1563,7 @@ static void domain_flush_np_cache(struct protection_domain *domain, unsigned long flags; spin_lock_irqsave(&domain->lock, flags); - domain_flush_pages(domain, iova, size); + amd_iommu_domain_flush_pages(domain, iova, size); spin_unlock_irqrestore(&domain->lock, flags); } } @@ -1834,7 +1835,7 @@ static void do_detach(struct iommu_dev_data *dev_data) device_flush_dte(dev_data); /* Flush IOTLB and wait for the flushes to finish */ - amd_iommu_domain_flush_tlb_pde(domain); + amd_iommu_domain_flush_all(domain); /* decrease reference counters - needs to happen after the flushes */ domain->dev_iommu[iommu->index] -= 1; @@ -2009,7 +2010,7 @@ void amd_iommu_domain_update(struct protection_domain *domain) amd_iommu_update_and_flush_device_table(domain); /* Flush domain TLB(s) and wait for completion */ - amd_iommu_domain_flush_tlb_pde(domain); + amd_iommu_domain_flush_all(domain); } /***************************************************************************** @@ -2451,7 +2452,7 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */ if (domain_flush) - amd_iommu_domain_flush_tlb_pde(pdomain); + amd_iommu_domain_flush_all(pdomain); pdomain->dirty_tracking = enable; spin_unlock_irqrestore(&pdomain->lock, flags); @@ -2553,7 +2554,7 @@ static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) unsigned long flags; spin_lock_irqsave(&dom->lock, flags); - amd_iommu_domain_flush_tlb_pde(dom); + amd_iommu_domain_flush_all(dom); spin_unlock_irqrestore(&dom->lock, flags); } @@ -2564,7 +2565,8 @@ static void amd_iommu_iotlb_sync(struct iommu_domain *domain, unsigned long flags; spin_lock_irqsave(&dom->lock, flags); - domain_flush_pages(dom, gather->start, gather->end - gather->start + 1); + amd_iommu_domain_flush_pages(dom, gather->start, + gather->end - gather->start + 1); spin_unlock_irqrestore(&dom->lock, flags); } -- Gitee From 2f364bc6c3e396d73bc25171b25125adcfa6e0c2 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:15 +0000 Subject: [PATCH 016/219] iommu/amd/pgtbl_v2: Invalidate updated page ranges only ANBZ: #13617 commit c7fc12354be0ba47566d55f4ebdc6a47bd69d5ed upstream. Enhance __domain_flush_pages() to detect domain page table mode and use that info to build invalidation commands. So that we can use amd_iommu_domain_flush_pages() to invalidate v2 page table. Also pass PASID, gn variable to device_flush_iotlb() so that it can build IOTLB invalidation command for both v1 and v2 page table. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-10-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/io_pgtable_v2.c | 10 ++-------- drivers/iommu/amd/iommu.c | 28 ++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c index 4823c702290c..1187779c240e 100644 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ b/drivers/iommu/amd/io_pgtable_v2.c @@ -244,7 +244,6 @@ static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, unsigned long mapped_size = 0; unsigned long o_iova = iova; size_t size = pgcount << __ffs(pgsize); - int count = 0; int ret = 0; bool updated = false; @@ -265,19 +264,14 @@ static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, *pte = set_pte_attr(paddr, map_size, prot); - count++; iova += map_size; paddr += map_size; mapped_size += map_size; } out: - if (updated) { - if (count > 1) - amd_iommu_flush_tlb(&pdom->domain, 0); - else - amd_iommu_flush_page(&pdom->domain, 0, o_iova); - } + if (updated) + amd_iommu_domain_flush_pages(pdom, o_iova, size); if (mapped) *mapped += mapped_size; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index e108d11f1b3f..80c5efe345c1 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -85,6 +85,11 @@ static void detach_device(struct device *dev); * ****************************************************************************/ +static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom) +{ + return (pdom && (pdom->flags & PD_IOMMUV2_MASK)); +} + static inline int get_acpihid_device_id(struct device *dev, struct acpihid_map_entry **entry) { @@ -1388,8 +1393,8 @@ void amd_iommu_flush_all_caches(struct amd_iommu *iommu) /* * Command send function for flushing on-device TLB */ -static int device_flush_iotlb(struct iommu_dev_data *dev_data, - u64 address, size_t size) +static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address, + size_t size, ioasid_t pasid, bool gn) { struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); struct iommu_cmd cmd; @@ -1397,7 +1402,7 @@ static int device_flush_iotlb(struct iommu_dev_data *dev_data, int qdep = dev_data->ats_qdep; build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, - size, IOMMU_NO_PASID, false); + size, pasid, gn); return iommu_queue_command(iommu, &cmd); } @@ -1439,8 +1444,11 @@ static int device_flush_dte(struct iommu_dev_data *dev_data) return ret; } - if (dev_data->ats_enabled) - ret = device_flush_iotlb(dev_data, 0, ~0UL); + if (dev_data->ats_enabled) { + /* Invalidate the entire contents of an IOTLB */ + ret = device_flush_iotlb(dev_data, 0, ~0UL, + IOMMU_NO_PASID, false); + } return ret; } @@ -1456,9 +1464,13 @@ static void __domain_flush_pages(struct protection_domain *domain, struct iommu_dev_data *dev_data; struct iommu_cmd cmd; int ret = 0, i; + ioasid_t pasid = IOMMU_NO_PASID; + bool gn = false; + + if (pdom_is_v2_pgtbl_mode(domain)) + gn = true; - build_inv_iommu_pages(&cmd, address, size, domain->id, - IOMMU_NO_PASID, false); + build_inv_iommu_pages(&cmd, address, size, domain->id, pasid, gn); for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { if (!domain->dev_iommu[i]) @@ -1476,7 +1488,7 @@ static void __domain_flush_pages(struct protection_domain *domain, if (!dev_data->ats_enabled) continue; - ret |= device_flush_iotlb(dev_data, address, size); + ret |= device_flush_iotlb(dev_data, address, size, pasid, gn); } WARN_ON(ret); -- Gitee From f99007f1e7814b0392dbdfcd44b6e0786e74d39d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 9 Dec 2023 23:12:40 +0000 Subject: [PATCH 017/219] iommu/apple-dart: Fix spelling mistake "grups" -> "groups" ANBZ: #13617 commit b6b2264ba2090bb63665458e4b9369781df7dd4d upstream. There is a spelling mistake in a dev_err message. Fix it. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20231209231240.4056082-1-colin.i.king@gmail.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/apple-dart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 39c517aa2721..eb1e62cd499a 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -911,7 +911,7 @@ static struct iommu_group *apple_dart_device_group(struct device *dev) ret = apple_dart_merge_master_cfg(group_master_cfg, cfg); if (ret) { - dev_err(dev, "Failed to merge DART IOMMU grups.\n"); + dev_err(dev, "Failed to merge DART IOMMU groups.\n"); iommu_group_put(group); res = ERR_PTR(ret); goto out; -- Gitee From b81e1d83c08347dcfbd31a92d249ced8ec7934d5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Oct 2023 08:05:20 +0800 Subject: [PATCH 018/219] iommu: Change kconfig around IOMMU_SVA ANBZ: #13617 commit 8f23f5dba6b4693448144bde4dd6f537543442c2 upstream. Linus suggested that the kconfig here is confusing: https://lore.kernel.org/all/CAHk-=wgUiAtiszwseM1p2fCJ+sC4XWQ+YN4TanFhUgvUqjr9Xw@mail.gmail.com/ Let's break it into three kconfigs controlling distinct things: - CONFIG_IOMMU_MM_DATA controls if the mm_struct has the additional fields for the IOMMU. Currently only PASID, but later patches store a struct iommu_mm_data * - CONFIG_ARCH_HAS_CPU_PASID controls if the arch needs the scheduling bit for keeping track of the ENQCMD instruction. x86 will select this if IOMMU_SVA is enabled - IOMMU_SVA controls if the IOMMU core compiles in the SVA support code for iommu driver use and the IOMMU exported API This way ARM will not enable CONFIG_ARCH_HAS_CPU_PASID Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-2-tina.zhang@intel.com Signed-off-by: Joerg Roedel [ Shuai Xue: Conflicts: mm/Kconfig minor: config order conflict ] Signed-off-by: Shuai Xue --- arch/Kconfig | 5 +++++ arch/x86/Kconfig | 1 + arch/x86/kernel/traps.c | 2 +- drivers/iommu/Kconfig | 1 + include/linux/iommu.h | 2 +- include/linux/mm_types.h | 2 +- include/linux/sched.h | 2 +- kernel/fork.c | 2 +- mm/Kconfig | 3 +++ mm/init-mm.c | 2 +- 10 files changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index cf8b88fd863e..68131ee4ce68 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -312,6 +312,11 @@ config ARCH_HAS_DMA_CLEAR_UNCACHED config ARCH_HAS_CPU_FINALIZE_INIT bool +# The architecture has a per-task state that includes the mm's PASID +config ARCH_HAS_CPU_PASID + bool + select IOMMU_MM_DATA + # Select if arch init_task must go in the __init_task_data section config ARCH_TASK_STRUCT_ON_STACK bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 54606f41a45f..7a58df5bfaec 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -73,6 +73,7 @@ config X86 select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION select ARCH_HAS_CPU_FINALIZE_INIT + select ARCH_HAS_CPU_PASID if IOMMU_SVA select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9e76ddc06201..555a7b640437 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -668,7 +668,7 @@ static bool fixup_iopl_exception(struct pt_regs *regs) */ static bool try_fixup_enqcmd_gp(void) { -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID u32 pasid; /* diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 9bf45d11be10..0c1d9ecc7a77 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -160,6 +160,7 @@ config IOMMU_DMA # Shared Virtual Addressing config IOMMU_SVA + select IOMMU_MM_DATA bool config FSL_PAMU diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b9d74dcf3ffa..f25cf4537b43 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1382,7 +1382,7 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream return false; } -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_IOMMU_MM_DATA static inline void mm_pasid_init(struct mm_struct *mm) { mm->pasid = IOMMU_PASID_INVALID; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 99ce6a7c9dee..f3fa8c6d0497 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -992,7 +992,7 @@ struct mm_struct { #endif struct work_struct async_put_work; -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_IOMMU_MM_DATA u32 pasid; #endif #ifdef CONFIG_KSM diff --git a/include/linux/sched.h b/include/linux/sched.h index 2f93897057a7..93a839d8e086 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1037,7 +1037,7 @@ struct task_struct { /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif #ifdef CONFIG_CPU_SUP_INTEL diff --git a/kernel/fork.c b/kernel/fork.c index a0fb3e9810ce..bd1500d4a03b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1223,7 +1223,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->use_memdelay = 0; #endif -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID tsk->pasid_activated = 0; #endif diff --git a/mm/Kconfig b/mm/Kconfig index 890ea45e55d7..7f159aeb2bfe 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1315,6 +1315,9 @@ config DUPTEXT If unsure, say N. +config IOMMU_MM_DATA + bool + source "mm/damon/Kconfig" config ASYNC_FORK diff --git a/mm/init-mm.c b/mm/init-mm.c index cfd367822cdd..c52dc2740a3d 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -44,7 +44,7 @@ struct mm_struct init_mm = { #endif .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_IOMMU_MM_DATA .pasid = IOMMU_PASID_INVALID, #endif INIT_MM_CONTEXT(init_mm) -- Gitee From d8e0ace179ba07898f45f8bdf8cfe3d030ba8fda Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Fri, 27 Oct 2023 08:05:21 +0800 Subject: [PATCH 019/219] iommu/vt-d: Remove mm->pasid in intel_sva_bind_mm() ANBZ: #13617 commit 5c79705d7ce8986036c73e89024d189d4f9df1ff upstream. The pasid is passed in as a parameter through .set_dev_pasid() callback. Thus, intel_sva_bind_mm() can directly use it instead of retrieving the pasid value from mm->pasid. Suggested-by: Lu Baolu Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Signed-off-by: Tina Zhang Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-3-tina.zhang@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/svm.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 4072b43f28c8..1f3b9b8b6155 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -315,21 +315,22 @@ static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid, } static int intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, - struct mm_struct *mm) + struct iommu_domain *domain, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); + struct mm_struct *mm = domain->mm; struct intel_svm_dev *sdev; struct intel_svm *svm; unsigned long sflags; int ret = 0; - svm = pasid_private_find(mm->pasid); + svm = pasid_private_find(pasid); if (!svm) { svm = kzalloc(sizeof(*svm), GFP_KERNEL); if (!svm) return -ENOMEM; - svm->pasid = mm->pasid; + svm->pasid = pasid; svm->mm = mm; INIT_LIST_HEAD_RCU(&svm->devs); @@ -367,7 +368,7 @@ static int intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, /* Setup the pasid table: */ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; - ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid, + ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, pasid, FLPT_DEFAULT_DID, sflags); if (ret) goto free_sdev; @@ -381,7 +382,7 @@ static int intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, free_svm: if (list_empty(&svm->devs)) { mmu_notifier_unregister(&svm->notifier, mm); - pasid_private_remove(mm->pasid); + pasid_private_remove(pasid); kfree(svm); } @@ -821,9 +822,8 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; - struct mm_struct *mm = domain->mm; - return intel_svm_bind_mm(iommu, dev, mm); + return intel_svm_bind_mm(iommu, dev, domain, pasid); } static void intel_svm_domain_free(struct iommu_domain *domain) -- Gitee From 1439b50c4ae30c918c2d1207bc1272b763896b21 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Wed, 13 Aug 2025 18:44:14 +0800 Subject: [PATCH 020/219] anolis: Revert "NVIDIA: SAUCE: iommu/arm-smmu-v3: Allow default substream bypass with a pasid support" ANBZ: #13617 This reverts commit 2cb2cfd1b574a2dffae53a8498704fd6c2d64a58. --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 20 ++++---------------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 - 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index dd5628977628..8692fc8190eb 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1138,12 +1138,6 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) cd_table->stall_enabled = master->stall_enabled; cd_table->s1cdmax = master->ssid_bits; - - if (master->domain->domain.type == IOMMU_DOMAIN_IDENTITY) - cd_table->s1dss = STRTAB_STE_1_S1DSS_BYPASS; - else - cd_table->s1dss = STRTAB_STE_1_S1DSS_SSID0; - max_contexts = 1 << cd_table->s1cdmax; if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB) || @@ -1344,8 +1338,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, BUG_ON(ste_live); dst->data[1] = cpu_to_le64( - FIELD_PREP(STRTAB_STE_1_S1DSS, cd_table->s1dss) | - FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING) | + FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) | @@ -2153,8 +2146,7 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, return 0; } -static int arm_smmu_domain_finalise(struct iommu_domain *domain, - struct arm_smmu_master *master) +static int arm_smmu_domain_finalise(struct iommu_domain *domain) { int ret; unsigned long ias, oas; @@ -2166,11 +2158,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain, struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; - /* - * A master with a pasid capability might need a CD table, so only set - * ARM_SMMU_DOMAIN_BYPASS if IOMMU_DOMAIN_IDENTITY and non-pasid master - */ - if (domain->type == IOMMU_DOMAIN_IDENTITY && !master->ssid_bits) { + if (domain->type == IOMMU_DOMAIN_IDENTITY) { smmu_domain->stage = ARM_SMMU_DOMAIN_BYPASS; return 0; } @@ -2422,7 +2410,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) if (!smmu_domain->smmu) { smmu_domain->smmu = smmu; - ret = arm_smmu_domain_finalise(domain, master); + ret = arm_smmu_domain_finalise(domain); if (ret) smmu_domain->smmu = NULL; } else if (smmu_domain->smmu != smmu) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index f26dcb38deac..27ddf1acd12c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -601,7 +601,6 @@ struct arm_smmu_ctx_desc_cfg { struct arm_smmu_l1_ctx_desc *l1_desc; unsigned int num_l1_ents; u8 s1fmt; - u8 s1dss; /* log2 of the maximum number of CDs supported by this table */ u8 s1cdmax; /* Whether CD entries in this table have the stall bit set. */ -- Gitee From 36d6cd70525d955002ef2eb2646c9cc46d84fa36 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Wed, 13 Aug 2025 18:44:22 +0800 Subject: [PATCH 021/219] anolis: Revert "iommu/arm-smmu-v3: Do not use devm for the cd table allocations" ANBZ: #13617 This reverts commit c1b52734c83a3b4c3f07cef8817fcf3a03551c0c. --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 29 ++++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 8692fc8190eb..6386d79b6f3e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -999,8 +999,8 @@ static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu, { size_t size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3); - l1_desc->l2ptr = dma_alloc_coherent(smmu->dev, size, - &l1_desc->l2ptr_dma, GFP_KERNEL); + l1_desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, + &l1_desc->l2ptr_dma, GFP_KERNEL); if (!l1_desc->l2ptr) { dev_warn(smmu->dev, "failed to allocate context descriptor table\n"); @@ -1151,17 +1151,17 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) cd_table->num_l1_ents = DIV_ROUND_UP(max_contexts, CTXDESC_L2_ENTRIES); - cd_table->l1_desc = kcalloc(cd_table->num_l1_ents, - sizeof(*cd_table->l1_desc), - GFP_KERNEL); + cd_table->l1_desc = devm_kcalloc(smmu->dev, cd_table->num_l1_ents, + sizeof(*cd_table->l1_desc), + GFP_KERNEL); if (!cd_table->l1_desc) return -ENOMEM; l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); } - cd_table->cdtab = dma_alloc_coherent(smmu->dev, l1size, - &cd_table->cdtab_dma, GFP_KERNEL); + cd_table->cdtab = dmam_alloc_coherent(smmu->dev, l1size, &cd_table->cdtab_dma, + GFP_KERNEL); if (!cd_table->cdtab) { dev_warn(smmu->dev, "failed to allocate context descriptor\n"); ret = -ENOMEM; @@ -1172,7 +1172,7 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) err_free_l1: if (cd_table->l1_desc) { - kfree(cd_table->l1_desc); + devm_kfree(smmu->dev, cd_table->l1_desc); cd_table->l1_desc = NULL; } return ret; @@ -1192,18 +1192,21 @@ static void arm_smmu_free_cd_tables(struct arm_smmu_master *master) if (!cd_table->l1_desc[i].l2ptr) continue; - dma_free_coherent(smmu->dev, size, - cd_table->l1_desc[i].l2ptr, - cd_table->l1_desc[i].l2ptr_dma); + dmam_free_coherent(smmu->dev, size, + cd_table->l1_desc[i].l2ptr, + cd_table->l1_desc[i].l2ptr_dma); } - kfree(cd_table->l1_desc); + devm_kfree(smmu->dev, cd_table->l1_desc); + cd_table->l1_desc = NULL; l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); } else { l1size = cd_table->num_l1_ents * (CTXDESC_CD_DWORDS << 3); } - dma_free_coherent(smmu->dev, l1size, cd_table->cdtab, cd_table->cdtab_dma); + dmam_free_coherent(smmu->dev, l1size, cd_table->cdtab, cd_table->cdtab_dma); + cd_table->cdtab_dma = 0; + cd_table->cdtab = NULL; } bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd) -- Gitee From 82e4f1f6a52de08e9ad64ffba22a772d83d9120f Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Wed, 13 Aug 2025 18:44:31 +0800 Subject: [PATCH 022/219] anolis: Revert "iommu/arm-smmu-v3: Do not allow a SVA domain to be set on the wrong PASID" ANBZ: #13617 This reverts commit ae50d2c436c5bd9598b08062e2a65ea2f5a54d61. --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index f8531372ad96..4a27fbdb2d84 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -575,9 +575,6 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, int ret = 0; struct mm_struct *mm = domain->mm; - if (mm_get_enqcmd_pasid(mm) != id) - return -EINVAL; - mutex_lock(&sva_lock); ret = __arm_smmu_sva_bind(dev, id, mm); mutex_unlock(&sva_lock); -- Gitee From f92f641cc87864c2bc94f3487ec947bc0f3f89a5 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Wed, 13 Aug 2025 18:44:44 +0800 Subject: [PATCH 023/219] anolis: Revert "iommu/arm-smmu-v3: Hold arm_smmu_asid_lock during all of attach_dev" ANBZ: #13617 This reverts commit 24588ff9508c6ebffd484875ae667189651a68c9. --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 22 +++++++++------------ 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6386d79b6f3e..b3513a8bc8ad 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2409,6 +2409,8 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) return -EBUSY; } + arm_smmu_detach_dev(master); + mutex_lock(&smmu_domain->init_mutex); if (!smmu_domain->smmu) { @@ -2423,16 +2425,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) if (ret) return ret; - /* - * Prevent arm_smmu_share_asid() from trying to change the ASID - * of either the old or new domain while we are working on it. - * This allows the STE and the smmu_domain->devices list to - * be inconsistent during this routine. - */ - mutex_lock(&arm_smmu_asid_lock); - - arm_smmu_detach_dev(master); - master->domain = smmu_domain; /* @@ -2458,7 +2450,13 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) } } + /* + * Prevent SVA from concurrently modifying the CD or writing to + * the CD entry + */ + mutex_lock(&arm_smmu_asid_lock); ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, &smmu_domain->cd); + mutex_unlock(&arm_smmu_asid_lock); if (ret) { master->domain = NULL; goto out_list_del; @@ -2468,15 +2466,13 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) arm_smmu_install_ste_for_dev(master); arm_smmu_enable_ats(master); - goto out_unlock; + return 0; out_list_del: spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_del(&master->domain_head); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); -out_unlock: - mutex_unlock(&arm_smmu_asid_lock); return ret; } -- Gitee From d5dd642f84e975355b8500cc9bd84f62f07aadca Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Wed, 13 Aug 2025 18:44:55 +0800 Subject: [PATCH 024/219] anolis: Revert "iommu/arm-smmu-v3: Do not use GFP_KERNEL under as spinlock" ANBZ: #13617 This reverts commit cb0e3bd78b28698b07f2798ea0a4e867ddcf2880. --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 38 +++++++++++++------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 4a27fbdb2d84..05722121f00e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -292,8 +292,10 @@ arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain, struct mm_struct *mm) { int ret; + unsigned long flags; struct arm_smmu_ctx_desc *cd; struct arm_smmu_mmu_notifier *smmu_mn; + struct arm_smmu_master *master; list_for_each_entry(smmu_mn, &smmu_domain->mmu_notifiers, list) { if (smmu_mn->mn.mm == mm) { @@ -323,9 +325,28 @@ arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain, goto err_free_cd; } + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master, &smmu_domain->devices, domain_head) { + ret = arm_smmu_write_ctx_desc(master, mm_get_enqcmd_pasid(mm), + cd); + if (ret) { + list_for_each_entry_from_reverse( + master, &smmu_domain->devices, domain_head) + arm_smmu_write_ctx_desc( + master, mm_get_enqcmd_pasid(mm), NULL); + break; + } + } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + if (ret) + goto err_put_notifier; + list_add(&smmu_mn->list, &smmu_domain->mmu_notifiers); return smmu_mn; +err_put_notifier: + /* Frees smmu_mn */ + mmu_notifier_put(&smmu_mn->mn); err_free_cd: arm_smmu_free_shared_cd(cd); return ERR_PTR(ret); @@ -342,6 +363,9 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) list_del(&smmu_mn->list); + arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm), + NULL); + /* * If we went through clear(), we've already invalidated, and no * new TLB entry can have been formed. @@ -357,8 +381,7 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) arm_smmu_free_shared_cd(cd); } -static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid, - struct mm_struct *mm) +static int __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) { int ret; struct arm_smmu_bond *bond; @@ -381,15 +404,9 @@ static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid, goto err_free_bond; } - ret = arm_smmu_write_ctx_desc(master, pasid, bond->smmu_mn->cd); - if (ret) - goto err_put_notifier; - list_add(&bond->list, &master->bonds); return 0; -err_put_notifier: - arm_smmu_mmu_notifier_put(bond->smmu_mn); err_free_bond: kfree(bond); return ret; @@ -551,9 +568,6 @@ void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, struct arm_smmu_master *master = dev_iommu_priv_get(dev); mutex_lock(&sva_lock); - - arm_smmu_write_ctx_desc(master, id, NULL); - list_for_each_entry(t, &master->bonds, list) { if (t->mm == mm) { bond = t; @@ -576,7 +590,7 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, struct mm_struct *mm = domain->mm; mutex_lock(&sva_lock); - ret = __arm_smmu_sva_bind(dev, id, mm); + ret = __arm_smmu_sva_bind(dev, mm); mutex_unlock(&sva_lock); return ret; -- Gitee From 1b3beff07b58adda15d76b90c6e7c8decbee4c8b Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Wed, 13 Aug 2025 18:45:03 +0800 Subject: [PATCH 025/219] anolis: Revert "iommu/arm-smmu-v3: Remove ARM_SMMU_DOMAIN_NESTED" ANBZ: #13617 This reverts commit 9cb92161da66701037de73767a9c9074a382a5fd. --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 4 +++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index b3513a8bc8ad..1cc8db2c0461 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1289,6 +1289,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, cd_table = &master->cd_table; break; case ARM_SMMU_DOMAIN_S2: + case ARM_SMMU_DOMAIN_NESTED: s2_cfg = &smmu_domain->s2_cfg; break; default: @@ -2180,6 +2181,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) fmt = ARM_64_LPAE_S1; finalise_stage_fn = arm_smmu_domain_finalise_s1; break; + case ARM_SMMU_DOMAIN_NESTED: case ARM_SMMU_DOMAIN_S2: ias = smmu->ias; oas = smmu->oas; @@ -2738,7 +2740,7 @@ static int arm_smmu_enable_nesting(struct iommu_domain *domain) if (smmu_domain->smmu) ret = -EPERM; else - smmu_domain->stage = ARM_SMMU_DOMAIN_S2; + smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; mutex_unlock(&smmu_domain->init_mutex); return ret; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 27ddf1acd12c..03f9e526cbd9 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -715,6 +715,7 @@ struct arm_smmu_master { enum arm_smmu_domain_stage { ARM_SMMU_DOMAIN_S1 = 0, ARM_SMMU_DOMAIN_S2, + ARM_SMMU_DOMAIN_NESTED, ARM_SMMU_DOMAIN_BYPASS, }; -- Gitee From 029cb05b9a528503c76fb7d671a20e94b1687c53 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Wed, 13 Aug 2025 18:45:10 +0800 Subject: [PATCH 026/219] anolis: Revert "iommu/arm-smmu-v3: Master cannot be NULL in arm_smmu_write_strtab_ent()" ANBZ: #13617 This reverts commit 519ffa1a30560d326d59fb753f5e1d7ddbc3837a. --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 1cc8db2c0461..c1212239c6c2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1272,10 +1272,10 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, */ u64 val = le64_to_cpu(dst->data[0]); bool ste_live = false; - struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_device *smmu = NULL; struct arm_smmu_ctx_desc_cfg *cd_table = NULL; struct arm_smmu_s2_cfg *s2_cfg = NULL; - struct arm_smmu_domain *smmu_domain = master->domain; + struct arm_smmu_domain *smmu_domain = NULL; struct arm_smmu_cmdq_ent prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG, .prefetch = { @@ -1283,6 +1283,11 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, }, }; + if (master) { + smmu_domain = master->domain; + smmu = master->smmu; + } + if (smmu_domain) { switch (smmu_domain->stage) { case ARM_SMMU_DOMAIN_S1: -- Gitee From eee292fb44fb74af5e01a64131e8dd1c1bfe2a69 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Wed, 13 Aug 2025 18:45:17 +0800 Subject: [PATCH 027/219] anolis: Revert "iommu/arm-smmu-v3: Add a type for the STE" ANBZ: #13617 This reverts commit cce5d5065e710f9c87d7e8cec9861219a78f9f68. --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 59 +++++++++++---------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 +-- 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index c1212239c6c2..95d020a31c68 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1252,7 +1252,7 @@ static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid) } static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, - struct arm_smmu_ste *dst) + __le64 *dst) { /* * This is hideously complicated, but we only really care about @@ -1270,7 +1270,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, * 2. Write everything apart from dword 0, sync, write dword 0, sync * 3. Update Config, sync */ - u64 val = le64_to_cpu(dst->data[0]); + u64 val = le64_to_cpu(dst[0]); bool ste_live = false; struct arm_smmu_device *smmu = NULL; struct arm_smmu_ctx_desc_cfg *cd_table = NULL; @@ -1328,10 +1328,10 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, else val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); - dst->data[0] = cpu_to_le64(val); - dst->data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, + dst[0] = cpu_to_le64(val); + dst[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); - dst->data[2] = 0; /* Nuke the VMID */ + dst[2] = 0; /* Nuke the VMID */ /* * The SMMU can perform negative caching, so we must sync * the STE regardless of whether the old value was live. @@ -1346,7 +1346,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1; BUG_ON(ste_live); - dst->data[1] = cpu_to_le64( + dst[1] = cpu_to_le64( FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | @@ -1355,7 +1355,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, if (smmu->features & ARM_SMMU_FEAT_STALLS && !master->stall_enabled) - dst->data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); + dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); val |= (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) | @@ -1365,7 +1365,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, if (s2_cfg) { BUG_ON(ste_live); - dst->data[2] = cpu_to_le64( + dst[2] = cpu_to_le64( FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) | #ifdef __BIG_ENDIAN @@ -1374,18 +1374,18 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2R); - dst->data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); + dst[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS); } if (master->ats_enabled) - dst->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS, + dst[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_TRANS)); arm_smmu_sync_ste_for_sid(smmu, sid); /* See comment in arm_smmu_write_ctx_desc() */ - WRITE_ONCE(dst->data[0], cpu_to_le64(val)); + WRITE_ONCE(dst[0], cpu_to_le64(val)); arm_smmu_sync_ste_for_sid(smmu, sid); /* It's likely that we'll want to use the new STE soon */ @@ -1393,8 +1393,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd); } -static void arm_smmu_init_bypass_stes(struct arm_smmu_ste *strtab, - unsigned int nent, bool force) +static void arm_smmu_init_bypass_stes(__le64 *strtab, unsigned int nent, bool force) { unsigned int i; u64 val = STRTAB_STE_0_V; @@ -1405,11 +1404,11 @@ static void arm_smmu_init_bypass_stes(struct arm_smmu_ste *strtab, val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); for (i = 0; i < nent; ++i) { - strtab->data[0] = cpu_to_le64(val); - strtab->data[1] = cpu_to_le64(FIELD_PREP( - STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); - strtab->data[2] = 0; - strtab++; + strtab[0] = cpu_to_le64(val); + strtab[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, + STRTAB_STE_1_SHCFG_INCOMING)); + strtab[2] = 0; + strtab += STRTAB_STE_DWORDS; } } @@ -2224,23 +2223,26 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) return 0; } -static struct arm_smmu_ste * -arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) +static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) { + __le64 *step; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) { - unsigned int idx1, idx2; + struct arm_smmu_strtab_l1_desc *l1_desc; + int idx; /* Two-level walk */ - idx1 = (sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS; - idx2 = sid & ((1 << STRTAB_SPLIT) - 1); - return &cfg->l1_desc[idx1].l2ptr[idx2]; + idx = (sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS; + l1_desc = &cfg->l1_desc[idx]; + idx = (sid & ((1 << STRTAB_SPLIT) - 1)) * STRTAB_STE_DWORDS; + step = &l1_desc->l2ptr[idx]; } else { /* Simple linear lookup */ - return (struct arm_smmu_ste *)&cfg - ->strtab[sid * STRTAB_STE_DWORDS]; + step = &cfg->strtab[sid * STRTAB_STE_DWORDS]; } + + return step; } static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) @@ -2250,8 +2252,7 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) for (i = 0; i < master->num_streams; ++i) { u32 sid = master->streams[i].id; - struct arm_smmu_ste *step = - arm_smmu_get_step_for_sid(smmu, sid); + __le64 *step = arm_smmu_get_step_for_sid(smmu, sid); /* Bridged PCI devices may end up with duplicated IDs */ for (j = 0; j < i; j++) @@ -3774,7 +3775,7 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu) iort_get_rmr_sids(dev_fwnode(smmu->dev), &rmr_list); list_for_each_entry(e, &rmr_list, list) { - struct arm_smmu_ste *step; + __le64 *step; struct iommu_iort_rmr_data *rmr; int ret, i; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 03f9e526cbd9..961205ba86d2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -206,11 +206,6 @@ #define STRTAB_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 6) #define STRTAB_STE_DWORDS 8 - -struct arm_smmu_ste { - __le64 data[STRTAB_STE_DWORDS]; -}; - #define STRTAB_STE_0_V (1UL << 0) #define STRTAB_STE_0_CFG GENMASK_ULL(3, 1) #define STRTAB_STE_0_CFG_ABORT 0 @@ -576,7 +571,7 @@ struct arm_smmu_priq { struct arm_smmu_strtab_l1_desc { u8 span; - struct arm_smmu_ste *l2ptr; + __le64 *l2ptr; dma_addr_t l2ptr_dma; }; -- Gitee From b38f687ba4c26a6b424df69dcebcad2364caf495 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Wed, 13 Aug 2025 18:45:26 +0800 Subject: [PATCH 028/219] anolis: Revert "iommu/arm-smmu-v3: disable stall for quiet_cd" ANBZ: #13617 This reverts commit bb90b8fbc275baf9820e9fb722b53e709f0c9ba1. --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 95d020a31c68..dabc7bf95b91 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1063,7 +1063,6 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, bool cd_live; __le64 *cdptr; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; - struct arm_smmu_device *smmu = master->smmu; if (WARN_ON(ssid >= (1 << cd_table->s1cdmax))) return -E2BIG; @@ -1078,8 +1077,6 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, if (!cd) { /* (5) */ val = 0; } else if (cd == &quiet_cd) { /* (4) */ - if (!(smmu->features & ARM_SMMU_FEAT_STALL_FORCE)) - val &= ~(CTXDESC_CD_0_S | CTXDESC_CD_0_R); val |= CTXDESC_CD_0_TCR_EPD0; } else if (cd_live) { /* (3) */ val &= ~CTXDESC_CD_0_ASID; -- Gitee From 37b6f1352815e841fad2cf7ddcd8e028933d943b Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Wed, 13 Aug 2025 18:46:22 +0800 Subject: [PATCH 029/219] anolis: Revert "iommu: Add mm_get_enqcmd_pasid() helper function" ANBZ: #13617 This reverts commit 9e009f1923bbd9b745dc02ef0b79c8481caab658. --- arch/x86/kernel/traps.c | 2 +- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 23 +++++++------------ drivers/iommu/iommu-sva.c | 2 +- include/linux/iommu.h | 12 ---------- 4 files changed, 10 insertions(+), 29 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 555a7b640437..0933405f1ce6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -694,7 +694,7 @@ static bool try_fixup_enqcmd_gp(void) if (!mm_valid_pasid(current->mm)) return false; - pasid = mm_get_enqcmd_pasid(current->mm); + pasid = current->mm->pasid; /* * Did this thread already have its PASID activated? diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 05722121f00e..353248ab18e7 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -246,8 +246,7 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, smmu_domain); } - arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), start, - size); + arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, start, size); } static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) @@ -265,11 +264,10 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) * DMA may still be running. Keep the cd valid to avoid C_BAD_CD events, * but disable translation. */ - arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm), - &quiet_cd); + arm_smmu_update_ctx_desc_devices(smmu_domain, mm->pasid, &quiet_cd); arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, 0); + arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0); smmu_mn->cleared = true; mutex_unlock(&sva_lock); @@ -327,13 +325,10 @@ arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain, spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_for_each_entry(master, &smmu_domain->devices, domain_head) { - ret = arm_smmu_write_ctx_desc(master, mm_get_enqcmd_pasid(mm), - cd); + ret = arm_smmu_write_ctx_desc(master, mm->pasid, cd); if (ret) { - list_for_each_entry_from_reverse( - master, &smmu_domain->devices, domain_head) - arm_smmu_write_ctx_desc( - master, mm_get_enqcmd_pasid(mm), NULL); + list_for_each_entry_from_reverse(master, &smmu_domain->devices, domain_head) + arm_smmu_write_ctx_desc(master, mm->pasid, NULL); break; } } @@ -363,8 +358,7 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) list_del(&smmu_mn->list); - arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm), - NULL); + arm_smmu_update_ctx_desc_devices(smmu_domain, mm->pasid, NULL); /* * If we went through clear(), we've already invalidated, and no @@ -372,8 +366,7 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) */ if (!smmu_mn->cleared) { arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, - 0); + arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0); } /* Frees smmu_mn */ diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 4a2f5699747f..b78671a8a914 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -141,7 +141,7 @@ u32 iommu_sva_get_pasid(struct iommu_sva *handle) { struct iommu_domain *domain = handle->domain; - return mm_get_enqcmd_pasid(domain->mm); + return domain->mm->pasid; } EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f25cf4537b43..61e71e307fab 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1391,12 +1391,6 @@ static inline bool mm_valid_pasid(struct mm_struct *mm) { return mm->pasid != IOMMU_PASID_INVALID; } - -static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) -{ - return mm->pasid; -} - void mm_pasid_drop(struct mm_struct *mm); struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); @@ -1419,12 +1413,6 @@ static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) } static inline void mm_pasid_init(struct mm_struct *mm) {} static inline bool mm_valid_pasid(struct mm_struct *mm) { return false; } - -static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) -{ - return IOMMU_PASID_INVALID; -} - static inline void mm_pasid_drop(struct mm_struct *mm) {} #endif /* CONFIG_IOMMU_SVA */ -- Gitee From 596468f2eb17be2fd502eb724c118ba9b0e8d1fb Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Fri, 27 Oct 2023 08:05:22 +0800 Subject: [PATCH 030/219] iommu: Add mm_get_enqcmd_pasid() helper function ANBZ: #13617 commit 2396046d75d3c0b2cfead852a77efd023f8539dc upstream. mm_get_enqcmd_pasid() should be used by architecture code and closely related to learn the PASID value that the x86 ENQCMD operation should use for the mm. For the moment SMMUv3 uses this without any connection to ENQCMD, it will be cleaned up similar to how the prior patch made VT-d use the PASID argument of set_dev_pasid(). The motivation is to replace mm->pasid with an iommu private data structure that is introduced in a later patch. Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Tested-by: Nicolin Chen Signed-off-by: Tina Zhang Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-4-tina.zhang@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- arch/x86/kernel/traps.c | 2 +- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 23 ++++++++++++------- drivers/iommu/iommu-sva.c | 2 +- include/linux/iommu.h | 12 ++++++++++ 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 0933405f1ce6..555a7b640437 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -694,7 +694,7 @@ static bool try_fixup_enqcmd_gp(void) if (!mm_valid_pasid(current->mm)) return false; - pasid = current->mm->pasid; + pasid = mm_get_enqcmd_pasid(current->mm); /* * Did this thread already have its PASID activated? diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 353248ab18e7..05722121f00e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -246,7 +246,8 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, smmu_domain); } - arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, start, size); + arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), start, + size); } static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) @@ -264,10 +265,11 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) * DMA may still be running. Keep the cd valid to avoid C_BAD_CD events, * but disable translation. */ - arm_smmu_update_ctx_desc_devices(smmu_domain, mm->pasid, &quiet_cd); + arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm), + &quiet_cd); arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0); + arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, 0); smmu_mn->cleared = true; mutex_unlock(&sva_lock); @@ -325,10 +327,13 @@ arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain, spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_for_each_entry(master, &smmu_domain->devices, domain_head) { - ret = arm_smmu_write_ctx_desc(master, mm->pasid, cd); + ret = arm_smmu_write_ctx_desc(master, mm_get_enqcmd_pasid(mm), + cd); if (ret) { - list_for_each_entry_from_reverse(master, &smmu_domain->devices, domain_head) - arm_smmu_write_ctx_desc(master, mm->pasid, NULL); + list_for_each_entry_from_reverse( + master, &smmu_domain->devices, domain_head) + arm_smmu_write_ctx_desc( + master, mm_get_enqcmd_pasid(mm), NULL); break; } } @@ -358,7 +363,8 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) list_del(&smmu_mn->list); - arm_smmu_update_ctx_desc_devices(smmu_domain, mm->pasid, NULL); + arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm), + NULL); /* * If we went through clear(), we've already invalidated, and no @@ -366,7 +372,8 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) */ if (!smmu_mn->cleared) { arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0); + arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, + 0); } /* Frees smmu_mn */ diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index b78671a8a914..4a2f5699747f 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -141,7 +141,7 @@ u32 iommu_sva_get_pasid(struct iommu_sva *handle) { struct iommu_domain *domain = handle->domain; - return domain->mm->pasid; + return mm_get_enqcmd_pasid(domain->mm); } EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 61e71e307fab..f25cf4537b43 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1391,6 +1391,12 @@ static inline bool mm_valid_pasid(struct mm_struct *mm) { return mm->pasid != IOMMU_PASID_INVALID; } + +static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) +{ + return mm->pasid; +} + void mm_pasid_drop(struct mm_struct *mm); struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); @@ -1413,6 +1419,12 @@ static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) } static inline void mm_pasid_init(struct mm_struct *mm) {} static inline bool mm_valid_pasid(struct mm_struct *mm) { return false; } + +static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) +{ + return IOMMU_PASID_INVALID; +} + static inline void mm_pasid_drop(struct mm_struct *mm) {} #endif /* CONFIG_IOMMU_SVA */ -- Gitee From d6377ae9572a98954ed34e02deacebfdf44582fd Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Fri, 27 Oct 2023 08:05:23 +0800 Subject: [PATCH 031/219] mm: Add structure to keep sva information ANBZ: #13617 commit 541a3e257d48c16b77d19f39ed939ef5832046df upstream. Introduce iommu_mm_data structure to keep sva information (pasid and the related sva domains). Add iommu_mm pointer, pointing to an instance of iommu_mm_data structure, to mm. Reviewed-by: Vasant Hegde Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Tested-by: Nicolin Chen Signed-off-by: Tina Zhang Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-5-tina.zhang@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- include/linux/iommu.h | 5 +++++ include/linux/mm_types.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f25cf4537b43..54e6f50f9406 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -860,6 +860,11 @@ struct iommu_sva { struct iommu_domain *domain; }; +struct iommu_mm_data { + u32 pasid; + struct list_head sva_domains; +}; + int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, const struct iommu_ops *ops); void iommu_fwspec_free(struct device *dev); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f3fa8c6d0497..afadea7c21e0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -781,6 +781,7 @@ struct mm_cid { #endif struct kioctx_table; +struct iommu_mm_data; struct mm_struct { struct { /* @@ -994,6 +995,7 @@ struct mm_struct { #ifdef CONFIG_IOMMU_MM_DATA u32 pasid; + struct iommu_mm_data *iommu_mm; #endif #ifdef CONFIG_KSM /* -- Gitee From 55ca0d606bbdec3c26c643ef1fbdcf273e861c26 Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Fri, 27 Oct 2023 08:05:24 +0800 Subject: [PATCH 032/219] iommu: Support mm PASID 1:n with sva domains ANBZ: #13617 commit 092edaddb660376648acb97678570ed5d8299768 upstream. Each mm bound to devices gets a PASID and corresponding sva domains allocated in iommu_sva_bind_device(), which are referenced by iommu_mm field of the mm. The PASID is released in __mmdrop(), while a sva domain is released when no one is using it (the reference count is decremented in iommu_sva_unbind_device()). However, although sva domains and their PASID are separate objects such that their own life cycles could be handled independently, an enqcmd use case may require releasing the PASID in releasing the mm (i.e., once a PASID is allocated for a mm, it will be permanently used by the mm and won't be released until the end of mm) and only allows to drop the PASID after the sva domains are released. To this end, mmgrab() is called in iommu_sva_domain_alloc() to increment the mm reference count and mmdrop() is invoked in iommu_domain_free() to decrement the mm reference count. Since the required info of PASID and sva domains is kept in struct iommu_mm_data of a mm, use mm->iommu_mm field instead of the old pasid field in mm struct. The sva domain list is protected by iommu_sva_lock. Besides, this patch removes mm_pasid_init(), as with the introduced iommu_mm structure, initializing mm pasid in mm_init() is unnecessary. Reviewed-by: Lu Baolu Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Tested-by: Nicolin Chen Signed-off-by: Tina Zhang Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-6-tina.zhang@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu-sva.c | 92 +++++++++++++++++++++++---------------- include/linux/iommu.h | 23 ++++++++-- 2 files changed, 74 insertions(+), 41 deletions(-) diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 4a2f5699747f..5175e8d85247 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -12,32 +12,42 @@ static DEFINE_MUTEX(iommu_sva_lock); /* Allocate a PASID for the mm within range (inclusive) */ -static int iommu_sva_alloc_pasid(struct mm_struct *mm, struct device *dev) +static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct device *dev) { + struct iommu_mm_data *iommu_mm; ioasid_t pasid; - int ret = 0; + + lockdep_assert_held(&iommu_sva_lock); if (!arch_pgtable_dma_compat(mm)) - return -EBUSY; + return ERR_PTR(-EBUSY); - mutex_lock(&iommu_sva_lock); + iommu_mm = mm->iommu_mm; /* Is a PASID already associated with this mm? */ - if (mm_valid_pasid(mm)) { - if (mm->pasid >= dev->iommu->max_pasids) - ret = -EOVERFLOW; - goto out; + if (iommu_mm) { + if (iommu_mm->pasid >= dev->iommu->max_pasids) + return ERR_PTR(-EOVERFLOW); + return iommu_mm; } + iommu_mm = kzalloc(sizeof(struct iommu_mm_data), GFP_KERNEL); + if (!iommu_mm) + return ERR_PTR(-ENOMEM); + pasid = iommu_alloc_global_pasid(dev); if (pasid == IOMMU_PASID_INVALID) { - ret = -ENOSPC; - goto out; + kfree(iommu_mm); + return ERR_PTR(-ENOSPC); } - mm->pasid = pasid; - ret = 0; -out: - mutex_unlock(&iommu_sva_lock); - return ret; + iommu_mm->pasid = pasid; + INIT_LIST_HEAD(&iommu_mm->sva_domains); + /* + * Make sure the write to mm->iommu_mm is not reordered in front of + * initialization to iommu_mm fields. If it does, readers may see a + * valid iommu_mm with uninitialized values. + */ + smp_store_release(&mm->iommu_mm, iommu_mm); + return iommu_mm; } /** @@ -58,31 +68,33 @@ static int iommu_sva_alloc_pasid(struct mm_struct *mm, struct device *dev) */ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) { + struct iommu_mm_data *iommu_mm; struct iommu_domain *domain; struct iommu_sva *handle; int ret; + mutex_lock(&iommu_sva_lock); + /* Allocate mm->pasid if necessary. */ - ret = iommu_sva_alloc_pasid(mm, dev); - if (ret) - return ERR_PTR(ret); + iommu_mm = iommu_alloc_mm_data(mm, dev); + if (IS_ERR(iommu_mm)) { + ret = PTR_ERR(iommu_mm); + goto out_unlock; + } handle = kzalloc(sizeof(*handle), GFP_KERNEL); - if (!handle) - return ERR_PTR(-ENOMEM); - - mutex_lock(&iommu_sva_lock); - /* Search for an existing domain. */ - domain = iommu_get_domain_for_dev_pasid(dev, mm->pasid, - IOMMU_DOMAIN_SVA); - if (IS_ERR(domain)) { - ret = PTR_ERR(domain); + if (!handle) { + ret = -ENOMEM; goto out_unlock; } - if (domain) { - domain->users++; - goto out; + /* Search for an existing domain. */ + list_for_each_entry(domain, &mm->iommu_mm->sva_domains, next) { + ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid); + if (!ret) { + domain->users++; + goto out; + } } /* Allocate a new domain and set it on device pasid. */ @@ -92,23 +104,23 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm goto out_unlock; } - ret = iommu_attach_device_pasid(domain, dev, mm->pasid); + ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid); if (ret) goto out_free_domain; domain->users = 1; + list_add(&domain->next, &mm->iommu_mm->sva_domains); + out: mutex_unlock(&iommu_sva_lock); handle->dev = dev; handle->domain = domain; - return handle; out_free_domain: iommu_domain_free(domain); + kfree(handle); out_unlock: mutex_unlock(&iommu_sva_lock); - kfree(handle); - return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(iommu_sva_bind_device); @@ -124,12 +136,13 @@ EXPORT_SYMBOL_GPL(iommu_sva_bind_device); void iommu_sva_unbind_device(struct iommu_sva *handle) { struct iommu_domain *domain = handle->domain; - ioasid_t pasid = domain->mm->pasid; + struct iommu_mm_data *iommu_mm = domain->mm->iommu_mm; struct device *dev = handle->dev; mutex_lock(&iommu_sva_lock); + iommu_detach_device_pasid(domain, dev, iommu_mm->pasid); if (--domain->users == 0) { - iommu_detach_device_pasid(domain, dev, pasid); + list_del(&domain->next); iommu_domain_free(domain); } mutex_unlock(&iommu_sva_lock); @@ -205,8 +218,11 @@ iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) void mm_pasid_drop(struct mm_struct *mm) { - if (likely(!mm_valid_pasid(mm))) + struct iommu_mm_data *iommu_mm = mm->iommu_mm; + + if (!iommu_mm) return; - iommu_free_global_pasid(mm->pasid); + iommu_free_global_pasid(iommu_mm->pasid); + kfree(iommu_mm); } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 54e6f50f9406..bca9a2977986 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -121,6 +121,11 @@ struct iommu_domain { struct { /* IOMMU_DOMAIN_SVA */ struct mm_struct *mm; int users; + /* + * Next iommu_domain in mm->iommu_mm->sva-domains list + * protected by iommu_sva_lock. + */ + struct list_head next; }; }; @@ -1390,16 +1395,28 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream #ifdef CONFIG_IOMMU_MM_DATA static inline void mm_pasid_init(struct mm_struct *mm) { - mm->pasid = IOMMU_PASID_INVALID; + /* + * During dup_mm(), a new mm will be memcpy'd from an old one and that makes + * the new mm and the old one point to a same iommu_mm instance. When either + * one of the two mms gets released, the iommu_mm instance is freed, leaving + * the other mm running into a use-after-free/double-free problem. To avoid + * the problem, zeroing the iommu_mm pointer of a new mm is needed here. + */ + mm->iommu_mm = NULL; } + static inline bool mm_valid_pasid(struct mm_struct *mm) { - return mm->pasid != IOMMU_PASID_INVALID; + return READ_ONCE(mm->iommu_mm); } static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) { - return mm->pasid; + struct iommu_mm_data *iommu_mm = READ_ONCE(mm->iommu_mm); + + if (!iommu_mm) + return IOMMU_PASID_INVALID; + return iommu_mm->pasid; } void mm_pasid_drop(struct mm_struct *mm); -- Gitee From 31c5d51d7d7e5a43f44f03752aaa716b997913a4 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 8 Dec 2023 09:53:14 +0800 Subject: [PATCH 033/219] iommu: Set owner token to SVA domain ANBZ: #13617 commit 7be423336eccc872249d37900c19c1d24f171353 upstream. Commit a9c362db3920 ("iommu: Validate that devices match domains") added an owner token to the iommu_domain. This token is checked during domain attachment to RID or PASID through the generic iommu interfaces. The SVA domains are attached to PASIDs through those iommu interfaces. Therefore, they require the owner token to be set during allocation. Otherwise, they fail to attach. Set the owner token for SVA domains. Fixes: a9c362db3920 ("iommu: Validate that devices match domains") Cc: Robin Murphy Signed-off-by: Lu Baolu Reviewed-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231208015314.320663-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9db6b81d2caa..022ce458c3c6 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3653,6 +3653,7 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, domain->type = IOMMU_DOMAIN_SVA; mmgrab(mm); domain->mm = mm; + domain->owner = ops; domain->iopf_handler = iommu_sva_handle_iopf; domain->fault_data = mm; -- Gitee From 07fd0954017f88f1623d75ed13cff7ca335df8ad Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:08 -0400 Subject: [PATCH 034/219] iommu: Remove struct iommu_ops *iommu from arch_setup_dma_ops() ANBZ: #13617 commit 4720287c7bf76e59d19d4dfbdc3f54eeea6fd46b upstream. This is not being used to pass ops, it is just a way to tell if an iommu driver was probed. These days this can be detected directly via device_iommu_mapped(). Call device_iommu_mapped() in the two places that need to check it and remove the iommu parameter everywhere. Reviewed-by: Jerry Snitselaar Reviewed-by: Lu Baolu Reviewed-by: Moritz Fischer Acked-by: Christoph Hellwig Acked-by: Rob Herring Tested-by: Hector Martin Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- arch/arc/mm/dma.c | 2 +- arch/arm/mm/dma-mapping-nommu.c | 2 +- arch/arm/mm/dma-mapping.c | 10 +++++----- arch/arm64/mm/dma-mapping.c | 4 ++-- arch/mips/mm/dma-noncoherent.c | 2 +- arch/riscv/mm/dma-noncoherent.c | 2 +- drivers/acpi/scan.c | 3 +-- drivers/hv/hv_common.c | 2 +- drivers/of/device.c | 2 +- include/linux/dma-map-ops.h | 4 ++-- 10 files changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 2a7fbbb83b70..197707bc7658 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -91,7 +91,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, * Plug in direct dma map ops. */ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { /* * IOC hardware snoops all DMA traffic keeping the caches consistent diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c index cfd9c933d2f0..b94850b57995 100644 --- a/arch/arm/mm/dma-mapping-nommu.c +++ b/arch/arm/mm/dma-mapping-nommu.c @@ -34,7 +34,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, } void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { if (IS_ENABLED(CONFIG_CPU_V7M)) { /* diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 5409225b4abc..6c359a3af8d9 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -1713,7 +1713,7 @@ void arm_iommu_detach_device(struct device *dev) EXPORT_SYMBOL_GPL(arm_iommu_detach_device); static void arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { struct dma_iommu_mapping *mapping; @@ -1748,7 +1748,7 @@ static void arm_teardown_iommu_dma_ops(struct device *dev) #else static void arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { } @@ -1757,7 +1757,7 @@ static void arm_teardown_iommu_dma_ops(struct device *dev) { } #endif /* CONFIG_ARM_DMA_USE_IOMMU */ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { /* * Due to legacy code that sets the ->dma_coherent flag from a bus @@ -1776,8 +1776,8 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, if (dev->dma_ops) return; - if (iommu) - arm_setup_iommu_dma_ops(dev, dma_base, size, iommu, coherent); + if (device_iommu_mapped(dev)) + arm_setup_iommu_dma_ops(dev, dma_base, size, coherent); xen_setup_dma_ops(dev); dev->archdata.dma_ops_setup = true; diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 3cb101e8cb29..61886e43e3a1 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -47,7 +47,7 @@ void arch_teardown_dma_ops(struct device *dev) #endif void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { int cls = cache_line_size_of_cpu(); @@ -58,7 +58,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, ARCH_DMA_MINALIGN, cls); dev->dma_coherent = coherent; - if (iommu) + if (device_iommu_mapped(dev)) iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1); xen_setup_dma_ops(dev); diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c index 3c4fc97b9f39..0f3cec663a12 100644 --- a/arch/mips/mm/dma-noncoherent.c +++ b/arch/mips/mm/dma-noncoherent.c @@ -138,7 +138,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { dev->dma_coherent = coherent; } diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c index 8932a93ec50c..32031a7d96d4 100644 --- a/arch/riscv/mm/dma-noncoherent.c +++ b/arch/riscv/mm/dma-noncoherent.c @@ -130,7 +130,7 @@ void arch_dma_prep_coherent(struct page *page, size_t size) } void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { WARN_TAINT(!coherent && riscv_cbom_block_size > ARCH_DMA_MINALIGN, TAINT_CPU_OUT_OF_SPEC, diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index fd19b58c75da..001b4db120e3 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1640,8 +1640,7 @@ int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, if (PTR_ERR(iommu) == -EPROBE_DEFER) return -EPROBE_DEFER; - arch_setup_dma_ops(dev, 0, U64_MAX, - iommu, attr == DEV_DMA_COHERENT); + arch_setup_dma_ops(dev, 0, U64_MAX, attr == DEV_DMA_COHERENT); return 0; } diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index ccad7bca3fd3..fd938b6dfa7e 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -489,7 +489,7 @@ void hv_setup_dma_ops(struct device *dev, bool coherent) * Hyper-V does not offer a vIOMMU in the guest * VM, so pass 0/NULL for the IOMMU settings */ - arch_setup_dma_ops(dev, 0, 0, NULL, coherent); + arch_setup_dma_ops(dev, 0, 0, coherent); } EXPORT_SYMBOL_GPL(hv_setup_dma_ops); diff --git a/drivers/of/device.c b/drivers/of/device.c index 1ca42ad9dd15..65c71be71a8d 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -193,7 +193,7 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, dev_dbg(dev, "device is%sbehind an iommu\n", iommu ? " " : " not "); - arch_setup_dma_ops(dev, dma_start, size, iommu, coherent); + arch_setup_dma_ops(dev, dma_start, size, coherent); if (!iommu) of_dma_set_restricted_buffer(dev, np); diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 348ea5320e78..3888a21cf4ff 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -434,10 +434,10 @@ bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg, #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent); + bool coherent); #else static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base, - u64 size, const struct iommu_ops *iommu, bool coherent) + u64 size, bool coherent) { } #endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */ -- Gitee From b44c6d9cfca633c3789a11869d10ac2e5ef0479b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:09 -0400 Subject: [PATCH 035/219] iommmu/of: Do not return struct iommu_ops from of_iommu_configure() ANBZ: #13617 commit 6ff6e184f1f4d4993d45ca3f934c8288890965fe upstream. Nothing needs this pointer. Return a normal error code with the usual IOMMU semantic that ENODEV means 'there is no IOMMU driver'. Reviewed-by: Jerry Snitselaar Reviewed-by: Lu Baolu Acked-by: Rob Herring Tested-by: Hector Martin Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/of_iommu.c | 31 +++++++++++++++++++------------ drivers/of/device.c | 22 +++++++++++++++------- include/linux/of_iommu.h | 13 ++++++------- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 42cffb0ee5e2..ff327981001e 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -107,16 +107,22 @@ static int of_iommu_configure_device(struct device_node *master_np, of_iommu_configure_dev(master_np, dev); } -const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np, - const u32 *id) +/* + * Returns: + * 0 on success, an iommu was configured + * -ENODEV if the device does not have any IOMMU + * -EPROBEDEFER if probing should be tried again + * -errno fatal errors + */ +int of_iommu_configure(struct device *dev, struct device_node *master_np, + const u32 *id) { const struct iommu_ops *ops = NULL; struct iommu_fwspec *fwspec; int err = NO_IOMMU; if (!master_np) - return NULL; + return -ENODEV; /* Serialise to make dev->iommu stable under our potential fwspec */ mutex_lock(&iommu_probe_device_lock); @@ -124,7 +130,7 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, if (fwspec) { if (fwspec->ops) { mutex_unlock(&iommu_probe_device_lock); - return fwspec->ops; + return 0; } /* In the deferred case, start again from scratch */ iommu_fwspec_free(dev); @@ -169,14 +175,15 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, err = iommu_probe_device(dev); /* Ignore all other errors apart from EPROBE_DEFER */ - if (err == -EPROBE_DEFER) { - ops = ERR_PTR(err); - } else if (err < 0) { - dev_dbg(dev, "Adding to IOMMU failed: %d\n", err); - ops = NULL; + if (err < 0) { + if (err == -EPROBE_DEFER) + return err; + dev_dbg(dev, "Adding to IOMMU failed: %pe\n", ERR_PTR(err)); + return err; } - - return ops; + if (!ops) + return -ENODEV; + return 0; } static enum iommu_resv_type __maybe_unused diff --git a/drivers/of/device.c b/drivers/of/device.c index 65c71be71a8d..873d933e8e6d 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -93,12 +93,12 @@ of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) int of_dma_configure_id(struct device *dev, struct device_node *np, bool force_dma, const u32 *id) { - const struct iommu_ops *iommu; const struct bus_dma_region *map = NULL; struct device_node *bus_np; u64 dma_start = 0; u64 mask, end, size = 0; bool coherent; + int iommu_ret; int ret; if (np == dev->of_node) @@ -181,21 +181,29 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, dev_dbg(dev, "device is%sdma coherent\n", coherent ? " " : " not "); - iommu = of_iommu_configure(dev, np, id); - if (PTR_ERR(iommu) == -EPROBE_DEFER) { + iommu_ret = of_iommu_configure(dev, np, id); + if (iommu_ret == -EPROBE_DEFER) { /* Don't touch range map if it wasn't set from a valid dma-ranges */ if (!ret) dev->dma_range_map = NULL; kfree(map); return -EPROBE_DEFER; - } + } else if (iommu_ret == -ENODEV) { + dev_dbg(dev, "device is not behind an iommu\n"); + } else if (iommu_ret) { + dev_err(dev, "iommu configuration for device failed with %pe\n", + ERR_PTR(iommu_ret)); - dev_dbg(dev, "device is%sbehind an iommu\n", - iommu ? " " : " not "); + /* + * Historically this routine doesn't fail driver probing + * due to errors in of_iommu_configure() + */ + } else + dev_dbg(dev, "device is behind an iommu\n"); arch_setup_dma_ops(dev, dma_start, size, coherent); - if (!iommu) + if (iommu_ret) of_dma_set_restricted_buffer(dev, np); return 0; diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h index 9a5e6b410dd2..e61cbbe12dac 100644 --- a/include/linux/of_iommu.h +++ b/include/linux/of_iommu.h @@ -8,20 +8,19 @@ struct iommu_ops; #ifdef CONFIG_OF_IOMMU -extern const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np, - const u32 *id); +extern int of_iommu_configure(struct device *dev, struct device_node *master_np, + const u32 *id); extern void of_iommu_get_resv_regions(struct device *dev, struct list_head *list); #else -static inline const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np, - const u32 *id) +static inline int of_iommu_configure(struct device *dev, + struct device_node *master_np, + const u32 *id) { - return NULL; + return -ENODEV; } static inline void of_iommu_get_resv_regions(struct device *dev, -- Gitee From 5be7c27154591e9250d9efd1418529c49475c8d4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:10 -0400 Subject: [PATCH 036/219] iommu/of: Use -ENODEV consistently in of_iommu_configure() ANBZ: #13617 commit 5b4ea8b06eb79234a244ffc1f7405aa968f62069 upstream. Instead of returning 1 and trying to handle positive error codes just stick to the convention of returning -ENODEV. Remove references to ops from of_iommu_configure(), a NULL ops will already generate an error code. There is no reason to check dev->bus, if err=0 at this point then the called configure functions thought there was an iommu and we should try to probe it. Remove it. Reviewed-by: Jerry Snitselaar Reviewed-by: Moritz Fischer Tested-by: Hector Martin Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/of_iommu.c | 49 ++++++++++++---------------------------- 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index ff327981001e..719652b60840 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -17,8 +17,6 @@ #include #include -#define NO_IOMMU 1 - static int of_iommu_xlate(struct device *dev, struct of_phandle_args *iommu_spec) { @@ -29,7 +27,7 @@ static int of_iommu_xlate(struct device *dev, ops = iommu_ops_from_fwnode(fwnode); if ((ops && !ops->of_xlate) || !of_device_is_available(iommu_spec->np)) - return NO_IOMMU; + return -ENODEV; ret = iommu_fwspec_init(dev, &iommu_spec->np->fwnode, ops); if (ret) @@ -61,7 +59,7 @@ static int of_iommu_configure_dev_id(struct device_node *master_np, "iommu-map-mask", &iommu_spec.np, iommu_spec.args); if (err) - return err == -ENODEV ? NO_IOMMU : err; + return err; err = of_iommu_xlate(dev, &iommu_spec); of_node_put(iommu_spec.np); @@ -72,7 +70,7 @@ static int of_iommu_configure_dev(struct device_node *master_np, struct device *dev) { struct of_phandle_args iommu_spec; - int err = NO_IOMMU, idx = 0; + int err = -ENODEV, idx = 0; while (!of_parse_phandle_with_args(master_np, "iommus", "#iommu-cells", @@ -117,9 +115,8 @@ static int of_iommu_configure_device(struct device_node *master_np, int of_iommu_configure(struct device *dev, struct device_node *master_np, const u32 *id) { - const struct iommu_ops *ops = NULL; struct iommu_fwspec *fwspec; - int err = NO_IOMMU; + int err; if (!master_np) return -ENODEV; @@ -153,37 +150,21 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, } else { err = of_iommu_configure_device(master_np, dev, id); } - - /* - * Two success conditions can be represented by non-negative err here: - * >0 : there is no IOMMU, or one was unavailable for non-fatal reasons - * 0 : we found an IOMMU, and dev->fwspec is initialised appropriately - * <0 : any actual error - */ - if (!err) { - /* The fwspec pointer changed, read it again */ - fwspec = dev_iommu_fwspec_get(dev); - ops = fwspec->ops; - } mutex_unlock(&iommu_probe_device_lock); - /* - * If we have reason to believe the IOMMU driver missed the initial - * probe for dev, replay it to get things in order. - */ - if (!err && dev->bus) - err = iommu_probe_device(dev); - - /* Ignore all other errors apart from EPROBE_DEFER */ - if (err < 0) { - if (err == -EPROBE_DEFER) - return err; - dev_dbg(dev, "Adding to IOMMU failed: %pe\n", ERR_PTR(err)); + if (err == -ENODEV || err == -EPROBE_DEFER) return err; - } - if (!ops) - return -ENODEV; + if (err) + goto err_log; + + err = iommu_probe_device(dev); + if (err) + goto err_log; return 0; + +err_log: + dev_dbg(dev, "Adding to IOMMU failed: %pe\n", ERR_PTR(err)); + return err; } static enum iommu_resv_type __maybe_unused -- Gitee From ee38445aecfa0c78c63a2e79161364cbc3489eb1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:11 -0400 Subject: [PATCH 037/219] iommu: Mark dev_iommu_get() with lockdep ANBZ: #13617 commit 64945d1b0ed169aeffa59020941e4ac45ebc315a upstream. Allocation of dev->iommu must be done under the iommu_probe_device_lock. Mark this with lockdep to discourage future mistakes. Reviewed-by: Jerry Snitselaar Tested-by: Hector Martin Reviewed-by: Lu Baolu Reviewed-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 022ce458c3c6..ad458c0da661 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -334,6 +334,8 @@ static struct dev_iommu *dev_iommu_get(struct device *dev) { struct dev_iommu *param = dev->iommu; + lockdep_assert_held(&iommu_probe_device_lock); + if (param) return param; -- Gitee From 36bcdfb2bff3e4e0f4f28f58affc261a82e8a0c8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:13 -0400 Subject: [PATCH 038/219] acpi: Do not return struct iommu_ops from acpi_iommu_configure_id() ANBZ: #13617 commit cdbc723f2da11331bc92858da50dc7a7610ed9bd upstream. Nothing needs this pointer. Return a normal error code with the usual IOMMU semantic that ENODEV means 'there is no IOMMU driver'. Acked-by: Rafael J. Wysocki Reviewed-by: Jerry Snitselaar Reviewed-by: Lu Baolu Reviewed-by: Moritz Fischer Tested-by: Hector Martin Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/acpi/scan.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 001b4db120e3..ff7078afb9d5 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1561,8 +1561,7 @@ static inline const struct iommu_ops *acpi_iommu_fwspec_ops(struct device *dev) return fwspec ? fwspec->ops : NULL; } -static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev, - const u32 *id_in) +static int acpi_iommu_configure_id(struct device *dev, const u32 *id_in) { int err; const struct iommu_ops *ops; @@ -1576,7 +1575,7 @@ static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev, ops = acpi_iommu_fwspec_ops(dev); if (ops) { mutex_unlock(&iommu_probe_device_lock); - return ops; + return 0; } err = iort_iommu_configure_id(dev, id_in); @@ -1593,12 +1592,14 @@ static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev, /* Ignore all other errors apart from EPROBE_DEFER */ if (err == -EPROBE_DEFER) { - return ERR_PTR(err); + return err; } else if (err) { dev_dbg(dev, "Adding to IOMMU failed: %d\n", err); - return NULL; + return -ENODEV; } - return acpi_iommu_fwspec_ops(dev); + if (!acpi_iommu_fwspec_ops(dev)) + return -ENODEV; + return 0; } #else /* !CONFIG_IOMMU_API */ @@ -1610,10 +1611,9 @@ int acpi_iommu_fwspec_init(struct device *dev, u32 id, return -ENODEV; } -static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev, - const u32 *id_in) +static int acpi_iommu_configure_id(struct device *dev, const u32 *id_in) { - return NULL; + return -ENODEV; } #endif /* !CONFIG_IOMMU_API */ @@ -1627,7 +1627,7 @@ static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev, int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, const u32 *input_id) { - const struct iommu_ops *iommu; + int ret; if (attr == DEV_DMA_NOT_SUPPORTED) { set_dma_ops(dev, &dma_dummy_ops); @@ -1636,10 +1636,15 @@ int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, acpi_arch_dma_setup(dev); - iommu = acpi_iommu_configure_id(dev, input_id); - if (PTR_ERR(iommu) == -EPROBE_DEFER) + ret = acpi_iommu_configure_id(dev, input_id); + if (ret == -EPROBE_DEFER) return -EPROBE_DEFER; + /* + * Historically this routine doesn't fail driver probing due to errors + * in acpi_iommu_configure_id() + */ + arch_setup_dma_ops(dev, 0, U64_MAX, attr == DEV_DMA_COHERENT); return 0; -- Gitee From 0a12e72c7c531aff5666734775084c734564cd6e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:14 -0400 Subject: [PATCH 039/219] iommu/tegra: Use tegra_dev_iommu_get_stream_id() in the remaining places ANBZ: #13617 commit bf9cd9fef9f15531680325f956f81317d46a159d upstream. This API was defined to formalize the access to internal iommu details on some Tegra SOCs, but a few callers got missed. Add them. The helper already masks by 0xFFFF so remove this code from the callers. Suggested-by: Thierry Reding Reviewed-by: Thierry Reding Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/dma/tegra186-gpc-dma.c | 8 +++----- drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp10b.c | 9 ++------- drivers/memory/tegra/tegra186.c | 14 ++++++++------ 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/drivers/dma/tegra186-gpc-dma.c b/drivers/dma/tegra186-gpc-dma.c index 029f45f7e37f..182c054e7cd3 100644 --- a/drivers/dma/tegra186-gpc-dma.c +++ b/drivers/dma/tegra186-gpc-dma.c @@ -1361,8 +1361,8 @@ static int tegra_dma_program_sid(struct tegra_dma_channel *tdc, int stream_id) static int tegra_dma_probe(struct platform_device *pdev) { const struct tegra_dma_chip_data *cdata = NULL; - struct iommu_fwspec *iommu_spec; - unsigned int stream_id, i; + unsigned int i; + u32 stream_id; struct tegra_dma *tdma; int ret; @@ -1391,12 +1391,10 @@ static int tegra_dma_probe(struct platform_device *pdev) tdma->dma_dev.dev = &pdev->dev; - iommu_spec = dev_iommu_fwspec_get(&pdev->dev); - if (!iommu_spec) { + if (!tegra_dev_iommu_get_stream_id(&pdev->dev, &stream_id)) { dev_err(&pdev->dev, "Missing iommu stream-id\n"); return -EINVAL; } - stream_id = iommu_spec->ids[0] & 0xffff; ret = device_property_read_u32(&pdev->dev, "dma-channel-mask", &tdma->chan_mask); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp10b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp10b.c index e7e8fdf3adab..29682722b0b3 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp10b.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp10b.c @@ -28,19 +28,14 @@ static void gp10b_ltc_init(struct nvkm_ltc *ltc) { struct nvkm_device *device = ltc->subdev.device; - struct iommu_fwspec *spec; + u32 sid; nvkm_wr32(device, 0x17e27c, ltc->ltc_nr); nvkm_wr32(device, 0x17e000, ltc->ltc_nr); nvkm_wr32(device, 0x100800, ltc->ltc_nr); - spec = dev_iommu_fwspec_get(device->dev); - if (spec) { - u32 sid = spec->ids[0] & 0xffff; - - /* stream ID */ + if (tegra_dev_iommu_get_stream_id(device->dev, &sid)) nvkm_wr32(device, 0x160000, sid << 2); - } } static const struct nvkm_ltc_func diff --git a/drivers/memory/tegra/tegra186.c b/drivers/memory/tegra/tegra186.c index 7633481e547d..f915cf2972a6 100644 --- a/drivers/memory/tegra/tegra186.c +++ b/drivers/memory/tegra/tegra186.c @@ -114,9 +114,12 @@ static void tegra186_mc_client_sid_override(struct tegra_mc *mc, static int tegra186_mc_probe_device(struct tegra_mc *mc, struct device *dev) { #if IS_ENABLED(CONFIG_IOMMU_API) - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct of_phandle_args args; unsigned int i, index = 0; + u32 sid; + + if (!tegra_dev_iommu_get_stream_id(dev, &sid)) + return 0; while (!of_parse_phandle_with_args(dev->of_node, "interconnects", "#interconnect-cells", index, &args)) { @@ -124,11 +127,10 @@ static int tegra186_mc_probe_device(struct tegra_mc *mc, struct device *dev) for (i = 0; i < mc->soc->num_clients; i++) { const struct tegra_mc_client *client = &mc->soc->clients[i]; - if (client->id == args.args[0]) { - u32 sid = fwspec->ids[0] & MC_SID_STREAMID_OVERRIDE_MASK; - - tegra186_mc_client_sid_override(mc, client, sid); - } + if (client->id == args.args[0]) + tegra186_mc_client_sid_override( + mc, client, + sid & MC_SID_STREAMID_OVERRIDE_MASK); } } -- Gitee From e623a28b4bd7281a7b6d19afcc7effdcec898003 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 29 Nov 2023 15:44:02 +0100 Subject: [PATCH 040/219] iommu/arm-smmu-qcom: Add QCM2290 MDSS compatible ANBZ: #13617 commit 28af105cb65043f20a16c5e2519d66390b1f9bb9 upstream. Add the QCM2290 MDSS compatible to clients compatible list, as it also needs the workarounds. Reviewed-by: Dmitry Baryshkov Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20231125-topic-rb1_feat-v3-5-4cbb567743bb@linaro.org Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 4521101e648a..1566cfd6c7e8 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -246,6 +246,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,adreno-gmu" }, { .compatible = "qcom,mdp4" }, { .compatible = "qcom,mdss" }, + { .compatible = "qcom,qcm2290-mdss" }, { .compatible = "qcom,sc7180-mdss" }, { .compatible = "qcom,sc7180-mss-pil" }, { .compatible = "qcom,sc7280-mdss" }, -- Gitee From b279df0e85869bb48e74082247130bba9cf16b1e Mon Sep 17 00:00:00 2001 From: Vladimir Lypak Date: Wed, 11 Oct 2023 19:57:26 +0200 Subject: [PATCH 041/219] iommu/qcom: restore IOMMU state if needed ANBZ: #13617 commit 268dd4edb748a1e2f298b04854681453df6e77a2 upstream. If the IOMMU has a power domain then some state will be lost in qcom_iommu_suspend and TZ will reset device if we don't call qcom_scm_restore_sec_cfg before accessing it again. Signed-off-by: Vladimir Lypak [luca@z3ntu.xyz: reword commit message a bit] Signed-off-by: Luca Weiss Link: https://lore.kernel.org/r/20231011-msm8953-iommu-restore-v1-1-48a0c93809a2@z3ntu.xyz Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index 52f996ef9ca4..e079bb7a993e 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -891,8 +891,16 @@ static void qcom_iommu_device_remove(struct platform_device *pdev) static int __maybe_unused qcom_iommu_resume(struct device *dev) { struct qcom_iommu_dev *qcom_iommu = dev_get_drvdata(dev); + int ret; + + ret = clk_bulk_prepare_enable(CLK_NUM, qcom_iommu->clks); + if (ret < 0) + return ret; + + if (dev->pm_domain) + return qcom_scm_restore_sec_cfg(qcom_iommu->sec_id, 0); - return clk_bulk_prepare_enable(CLK_NUM, qcom_iommu->clks); + return ret; } static int __maybe_unused qcom_iommu_suspend(struct device *dev) -- Gitee From 489d56d416024148337bf276cb79204a7ebdeceb Mon Sep 17 00:00:00 2001 From: Wenkai Lin Date: Wed, 6 Dec 2023 08:57:27 +0800 Subject: [PATCH 042/219] iommu/arm-smmu-v3: disable stall for quiet_cd ANBZ: #13617 commit b41932f544586e606a7493cbc6dfb3c97c6b44d7 upstream. In the stall model, invalid transactions were expected to be stalled and aborted by the IOPF handler. However, when killing a test case with a huge amount of data, the accelerator streamline can not stop until all data is consumed even if the page fault handler reports errors. As a result, the kill may take a long time, about 10 seconds with numerous iopf interrupts. So disable stall for quiet_cd in the non-force stall model, since force stall model (STALL_MODEL==0b10) requires CD.S must be 1. Signed-off-by: Zhangfei Gao Signed-off-by: Wenkai Lin Suggested-by: Jean-Philippe Brucker Reviewed-by: Jason Gunthorpe Reviewed-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20231206005727.46150-1-zhangfei.gao@linaro.org Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index dabc7bf95b91..95d020a31c68 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1063,6 +1063,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, bool cd_live; __le64 *cdptr; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; + struct arm_smmu_device *smmu = master->smmu; if (WARN_ON(ssid >= (1 << cd_table->s1cdmax))) return -E2BIG; @@ -1077,6 +1078,8 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, if (!cd) { /* (5) */ val = 0; } else if (cd == &quiet_cd) { /* (4) */ + if (!(smmu->features & ARM_SMMU_FEAT_STALL_FORCE)) + val &= ~(CTXDESC_CD_0_S | CTXDESC_CD_0_R); val |= CTXDESC_CD_0_TCR_EPD0; } else if (cd_live) { /* (3) */ val &= ~CTXDESC_CD_0_ASID; -- Gitee From d2018e0c14fc0e8f747722be12e11465f03c594a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 5 Dec 2023 15:14:33 -0400 Subject: [PATCH 043/219] iommu/arm-smmu-v3: Add a type for the STE ANBZ: #13617 commit 57b89048874c9edd4a17f34c126b4a4188b7b444 upstream. Instead of passing a naked __le16 * around to represent a STE wrap it in a "struct arm_smmu_ste" with an array of the correct size. This makes it much clearer which functions will comprise the "STE API". Reviewed-by: Moritz Fischer Reviewed-by: Michael Shavit Reviewed-by: Eric Auger Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 59 ++++++++++----------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 ++- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 95d020a31c68..c1212239c6c2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1252,7 +1252,7 @@ static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid) } static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, - __le64 *dst) + struct arm_smmu_ste *dst) { /* * This is hideously complicated, but we only really care about @@ -1270,7 +1270,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, * 2. Write everything apart from dword 0, sync, write dword 0, sync * 3. Update Config, sync */ - u64 val = le64_to_cpu(dst[0]); + u64 val = le64_to_cpu(dst->data[0]); bool ste_live = false; struct arm_smmu_device *smmu = NULL; struct arm_smmu_ctx_desc_cfg *cd_table = NULL; @@ -1328,10 +1328,10 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, else val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); - dst[0] = cpu_to_le64(val); - dst[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, + dst->data[0] = cpu_to_le64(val); + dst->data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); - dst[2] = 0; /* Nuke the VMID */ + dst->data[2] = 0; /* Nuke the VMID */ /* * The SMMU can perform negative caching, so we must sync * the STE regardless of whether the old value was live. @@ -1346,7 +1346,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1; BUG_ON(ste_live); - dst[1] = cpu_to_le64( + dst->data[1] = cpu_to_le64( FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | @@ -1355,7 +1355,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, if (smmu->features & ARM_SMMU_FEAT_STALLS && !master->stall_enabled) - dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); + dst->data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); val |= (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) | @@ -1365,7 +1365,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, if (s2_cfg) { BUG_ON(ste_live); - dst[2] = cpu_to_le64( + dst->data[2] = cpu_to_le64( FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) | #ifdef __BIG_ENDIAN @@ -1374,18 +1374,18 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2R); - dst[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); + dst->data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS); } if (master->ats_enabled) - dst[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS, + dst->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_TRANS)); arm_smmu_sync_ste_for_sid(smmu, sid); /* See comment in arm_smmu_write_ctx_desc() */ - WRITE_ONCE(dst[0], cpu_to_le64(val)); + WRITE_ONCE(dst->data[0], cpu_to_le64(val)); arm_smmu_sync_ste_for_sid(smmu, sid); /* It's likely that we'll want to use the new STE soon */ @@ -1393,7 +1393,8 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd); } -static void arm_smmu_init_bypass_stes(__le64 *strtab, unsigned int nent, bool force) +static void arm_smmu_init_bypass_stes(struct arm_smmu_ste *strtab, + unsigned int nent, bool force) { unsigned int i; u64 val = STRTAB_STE_0_V; @@ -1404,11 +1405,11 @@ static void arm_smmu_init_bypass_stes(__le64 *strtab, unsigned int nent, bool fo val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); for (i = 0; i < nent; ++i) { - strtab[0] = cpu_to_le64(val); - strtab[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, - STRTAB_STE_1_SHCFG_INCOMING)); - strtab[2] = 0; - strtab += STRTAB_STE_DWORDS; + strtab->data[0] = cpu_to_le64(val); + strtab->data[1] = cpu_to_le64(FIELD_PREP( + STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); + strtab->data[2] = 0; + strtab++; } } @@ -2223,26 +2224,23 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) return 0; } -static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) +static struct arm_smmu_ste * +arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) { - __le64 *step; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) { - struct arm_smmu_strtab_l1_desc *l1_desc; - int idx; + unsigned int idx1, idx2; /* Two-level walk */ - idx = (sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS; - l1_desc = &cfg->l1_desc[idx]; - idx = (sid & ((1 << STRTAB_SPLIT) - 1)) * STRTAB_STE_DWORDS; - step = &l1_desc->l2ptr[idx]; + idx1 = (sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS; + idx2 = sid & ((1 << STRTAB_SPLIT) - 1); + return &cfg->l1_desc[idx1].l2ptr[idx2]; } else { /* Simple linear lookup */ - step = &cfg->strtab[sid * STRTAB_STE_DWORDS]; + return (struct arm_smmu_ste *)&cfg + ->strtab[sid * STRTAB_STE_DWORDS]; } - - return step; } static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) @@ -2252,7 +2250,8 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) for (i = 0; i < master->num_streams; ++i) { u32 sid = master->streams[i].id; - __le64 *step = arm_smmu_get_step_for_sid(smmu, sid); + struct arm_smmu_ste *step = + arm_smmu_get_step_for_sid(smmu, sid); /* Bridged PCI devices may end up with duplicated IDs */ for (j = 0; j < i; j++) @@ -3775,7 +3774,7 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu) iort_get_rmr_sids(dev_fwnode(smmu->dev), &rmr_list); list_for_each_entry(e, &rmr_list, list) { - __le64 *step; + struct arm_smmu_ste *step; struct iommu_iort_rmr_data *rmr; int ret, i; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 961205ba86d2..03f9e526cbd9 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -206,6 +206,11 @@ #define STRTAB_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 6) #define STRTAB_STE_DWORDS 8 + +struct arm_smmu_ste { + __le64 data[STRTAB_STE_DWORDS]; +}; + #define STRTAB_STE_0_V (1UL << 0) #define STRTAB_STE_0_CFG GENMASK_ULL(3, 1) #define STRTAB_STE_0_CFG_ABORT 0 @@ -571,7 +576,7 @@ struct arm_smmu_priq { struct arm_smmu_strtab_l1_desc { u8 span; - __le64 *l2ptr; + struct arm_smmu_ste *l2ptr; dma_addr_t l2ptr_dma; }; -- Gitee From 8befac3cdfa48975828beafd45109c301b4dc0af Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 5 Dec 2023 15:14:34 -0400 Subject: [PATCH 044/219] iommu/arm-smmu-v3: Master cannot be NULL in arm_smmu_write_strtab_ent() ANBZ: #13617 commit 12a48fe90d0919e4e594815122403f793a090803 upstream. The only caller is arm_smmu_install_ste_for_dev() which never has a NULL master. Remove the confusing if. Reviewed-by: Moritz Fischer Reviewed-by: Michael Shavit Reviewed-by: Eric Auger Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index c1212239c6c2..1cc8db2c0461 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1272,10 +1272,10 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, */ u64 val = le64_to_cpu(dst->data[0]); bool ste_live = false; - struct arm_smmu_device *smmu = NULL; + struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_ctx_desc_cfg *cd_table = NULL; struct arm_smmu_s2_cfg *s2_cfg = NULL; - struct arm_smmu_domain *smmu_domain = NULL; + struct arm_smmu_domain *smmu_domain = master->domain; struct arm_smmu_cmdq_ent prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG, .prefetch = { @@ -1283,11 +1283,6 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, }, }; - if (master) { - smmu_domain = master->domain; - smmu = master->smmu; - } - if (smmu_domain) { switch (smmu_domain->stage) { case ARM_SMMU_DOMAIN_S1: -- Gitee From f85cc2785ff341f54a905e921b309d2beefa0694 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 5 Dec 2023 15:14:35 -0400 Subject: [PATCH 045/219] iommu/arm-smmu-v3: Remove ARM_SMMU_DOMAIN_NESTED ANBZ: #13617 commit 9fde008337d3be8afa7b3b42c1787aac4b6853c9 upstream. Currently this is exactly the same as ARM_SMMU_DOMAIN_S2, so just remove it. The ongoing work to add nesting support through iommufd will do something a little different. Reviewed-by: Moritz Fischer Reviewed-by: Eric Auger Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 4 +--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 1cc8db2c0461..b3513a8bc8ad 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1289,7 +1289,6 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, cd_table = &master->cd_table; break; case ARM_SMMU_DOMAIN_S2: - case ARM_SMMU_DOMAIN_NESTED: s2_cfg = &smmu_domain->s2_cfg; break; default: @@ -2181,7 +2180,6 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) fmt = ARM_64_LPAE_S1; finalise_stage_fn = arm_smmu_domain_finalise_s1; break; - case ARM_SMMU_DOMAIN_NESTED: case ARM_SMMU_DOMAIN_S2: ias = smmu->ias; oas = smmu->oas; @@ -2740,7 +2738,7 @@ static int arm_smmu_enable_nesting(struct iommu_domain *domain) if (smmu_domain->smmu) ret = -EPERM; else - smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; + smmu_domain->stage = ARM_SMMU_DOMAIN_S2; mutex_unlock(&smmu_domain->init_mutex); return ret; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 03f9e526cbd9..27ddf1acd12c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -715,7 +715,6 @@ struct arm_smmu_master { enum arm_smmu_domain_stage { ARM_SMMU_DOMAIN_S1 = 0, ARM_SMMU_DOMAIN_S2, - ARM_SMMU_DOMAIN_NESTED, ARM_SMMU_DOMAIN_BYPASS, }; -- Gitee From e424730400c86b4a494eeacc7b9a71d186116116 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 17 Oct 2023 15:11:40 -0300 Subject: [PATCH 046/219] iommu/arm-smmu: Reorganize arm_smmu_domain_add_master() ANBZ: #13617 commit ff0f802974136c7f4c576da227565dc27cc1c69d upstream. Make arm_smmu_domain_add_master() not use the smmu_domain to detect the s2cr configuration, instead pass it in as a parameter. It always returns zero so make it return void. Since it no longer really does anything to do with a domain call it arm_smmu_master_install_s2crs(). This is done to make the next two patches able to re-use this code without forcing the creation of a struct arm_smmu_domain. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-c86cc8c2230e+160bb-smmu_newapi_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f029b19764e9..ae236ff92b52 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1086,21 +1086,14 @@ static void arm_smmu_master_free_smes(struct arm_smmu_master_cfg *cfg, mutex_unlock(&smmu->stream_map_mutex); } -static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain, - struct arm_smmu_master_cfg *cfg, - struct iommu_fwspec *fwspec) +static void arm_smmu_master_install_s2crs(struct arm_smmu_master_cfg *cfg, + enum arm_smmu_s2cr_type type, + u8 cbndx, struct iommu_fwspec *fwspec) { - struct arm_smmu_device *smmu = smmu_domain->smmu; + struct arm_smmu_device *smmu = cfg->smmu; struct arm_smmu_s2cr *s2cr = smmu->s2crs; - u8 cbndx = smmu_domain->cfg.cbndx; - enum arm_smmu_s2cr_type type; int i, idx; - if (smmu_domain->stage == ARM_SMMU_DOMAIN_BYPASS) - type = S2CR_TYPE_BYPASS; - else - type = S2CR_TYPE_TRANS; - for_each_cfg_sme(cfg, fwspec, i, idx) { if (type == s2cr[idx].type && cbndx == s2cr[idx].cbndx) continue; @@ -1110,7 +1103,6 @@ static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain, s2cr[idx].cbndx = cbndx; arm_smmu_write_s2cr(smmu, idx); } - return 0; } static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) @@ -1153,7 +1145,12 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) } /* Looks ok, so add the device to the domain */ - ret = arm_smmu_domain_add_master(smmu_domain, cfg, fwspec); + arm_smmu_master_install_s2crs(cfg, + smmu_domain->stage == + ARM_SMMU_DOMAIN_BYPASS ? + S2CR_TYPE_BYPASS : + S2CR_TYPE_TRANS, + smmu_domain->cfg.cbndx, fwspec); /* * Setup an autosuspend delay to avoid bouncing runpm state. -- Gitee From 91fcfce5abb5a50da2a9ae9dba8e4014e85b3bba Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 17 Oct 2023 15:11:41 -0300 Subject: [PATCH 047/219] iommu/arm-smmu: Convert to a global static identity domain ANBZ: #13617 commit 22bb7b41476a1b8dc282dc5b3ca5470090b52e46 upstream. Create a global static identity domain with it's own arm_smmu_attach_dev_identity() that simply calls arm_smmu_master_install_s2crs() with the identity parameters. This is done by giving the attach path for identity its own unique implementation that simply calls arm_smmu_master_install_s2crs(). Remove ARM_SMMU_DOMAIN_BYPASS and all checks of IOMMU_DOMAIN_IDENTITY. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v2-c86cc8c2230e+160bb-smmu_newapi_jgg@nvidia.com [will: Move duplicated autosuspend logic into a helper function] Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 81 ++++++++++++++++++--------- drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 - 2 files changed, 53 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index ae236ff92b52..d5d1351b963f 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -87,6 +87,23 @@ static inline void arm_smmu_rpm_put(struct arm_smmu_device *smmu) pm_runtime_put_autosuspend(smmu->dev); } +static void arm_smmu_rpm_use_autosuspend(struct arm_smmu_device *smmu) +{ + /* + * Setup an autosuspend delay to avoid bouncing runpm state. + * Otherwise, if a driver for a suspended consumer device + * unmaps buffers, it will runpm resume/suspend for each one. + * + * For example, when used by a GPU device, when an application + * or game exits, it can trigger unmapping 100s or 1000s of + * buffers. With a runpm cycle for each buffer, that adds up + * to 5-10sec worth of reprogramming the context bank, while + * the system appears to be locked up to the user. + */ + pm_runtime_set_autosuspend_delay(smmu->dev, 20); + pm_runtime_use_autosuspend(smmu->dev); +} + static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) { return container_of(dom, struct arm_smmu_domain, domain); @@ -629,12 +646,6 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, if (smmu_domain->smmu) goto out_unlock; - if (domain->type == IOMMU_DOMAIN_IDENTITY) { - smmu_domain->stage = ARM_SMMU_DOMAIN_BYPASS; - smmu_domain->smmu = smmu; - goto out_unlock; - } - /* * Mapping the requested stage onto what we support is surprisingly * complicated, mainly because the spec allows S1+S2 SMMUs without @@ -830,7 +841,7 @@ static void arm_smmu_destroy_domain_context(struct iommu_domain *domain) struct arm_smmu_cfg *cfg = &smmu_domain->cfg; int ret, irq; - if (!smmu || domain->type == IOMMU_DOMAIN_IDENTITY) + if (!smmu) return; ret = arm_smmu_rpm_get(smmu); @@ -859,7 +870,7 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) { struct arm_smmu_domain *smmu_domain; - if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_IDENTITY) { + if (type != IOMMU_DOMAIN_UNMANAGED) { if (using_legacy_binding || type != IOMMU_DOMAIN_DMA) return NULL; } @@ -1145,32 +1156,45 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) } /* Looks ok, so add the device to the domain */ - arm_smmu_master_install_s2crs(cfg, - smmu_domain->stage == - ARM_SMMU_DOMAIN_BYPASS ? - S2CR_TYPE_BYPASS : - S2CR_TYPE_TRANS, + arm_smmu_master_install_s2crs(cfg, S2CR_TYPE_TRANS, smmu_domain->cfg.cbndx, fwspec); - - /* - * Setup an autosuspend delay to avoid bouncing runpm state. - * Otherwise, if a driver for a suspended consumer device - * unmaps buffers, it will runpm resume/suspend for each one. - * - * For example, when used by a GPU device, when an application - * or game exits, it can trigger unmapping 100s or 1000s of - * buffers. With a runpm cycle for each buffer, that adds up - * to 5-10sec worth of reprogramming the context bank, while - * the system appears to be locked up to the user. - */ - pm_runtime_set_autosuspend_delay(smmu->dev, 20); - pm_runtime_use_autosuspend(smmu->dev); - + arm_smmu_rpm_use_autosuspend(smmu); rpm_put: arm_smmu_rpm_put(smmu); return ret; } +static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, + struct device *dev) +{ + struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev); + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct arm_smmu_device *smmu; + int ret; + + if (!cfg) + return -ENODEV; + smmu = cfg->smmu; + + ret = arm_smmu_rpm_get(smmu); + if (ret < 0) + return ret; + + arm_smmu_master_install_s2crs(cfg, S2CR_TYPE_BYPASS, 0, fwspec); + arm_smmu_rpm_use_autosuspend(smmu); + arm_smmu_rpm_put(smmu); + return 0; +} + +static const struct iommu_domain_ops arm_smmu_identity_ops = { + .attach_dev = arm_smmu_attach_dev_identity, +}; + +static struct iommu_domain arm_smmu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &arm_smmu_identity_ops, +}; + static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -1584,6 +1608,7 @@ static int arm_smmu_def_domain_type(struct device *dev) } static struct iommu_ops arm_smmu_ops = { + .identity_domain = &arm_smmu_identity_domain, .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, .probe_device = arm_smmu_probe_device, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 703fd5817ec1..836ed6799a80 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -361,7 +361,6 @@ enum arm_smmu_domain_stage { ARM_SMMU_DOMAIN_S1 = 0, ARM_SMMU_DOMAIN_S2, ARM_SMMU_DOMAIN_NESTED, - ARM_SMMU_DOMAIN_BYPASS, }; struct arm_smmu_domain { -- Gitee From 58f9fb878644940ca727a26694a455e98299d1e9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 17 Oct 2023 15:11:42 -0300 Subject: [PATCH 048/219] iommu/arm-smmu: Implement IOMMU_DOMAIN_BLOCKED ANBZ: #13617 commit bbbf11eea38c0cb167edc5ce5fe76324e87ad074 upstream. Using the same design as IDENTITY setup a S2CR_TYPE_FAULT s2cr for the device. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v2-c86cc8c2230e+160bb-smmu_newapi_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 28 ++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index d5d1351b963f..a16c65b2d5f1 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1164,8 +1164,8 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) return ret; } -static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) +static int arm_smmu_attach_dev_type(struct device *dev, + enum arm_smmu_s2cr_type type) { struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); @@ -1180,12 +1180,18 @@ static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, if (ret < 0) return ret; - arm_smmu_master_install_s2crs(cfg, S2CR_TYPE_BYPASS, 0, fwspec); + arm_smmu_master_install_s2crs(cfg, type, 0, fwspec); arm_smmu_rpm_use_autosuspend(smmu); arm_smmu_rpm_put(smmu); return 0; } +static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, + struct device *dev) +{ + return arm_smmu_attach_dev_type(dev, S2CR_TYPE_BYPASS); +} + static const struct iommu_domain_ops arm_smmu_identity_ops = { .attach_dev = arm_smmu_attach_dev_identity, }; @@ -1195,6 +1201,21 @@ static struct iommu_domain arm_smmu_identity_domain = { .ops = &arm_smmu_identity_ops, }; +static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, + struct device *dev) +{ + return arm_smmu_attach_dev_type(dev, S2CR_TYPE_FAULT); +} + +static const struct iommu_domain_ops arm_smmu_blocked_ops = { + .attach_dev = arm_smmu_attach_dev_blocked, +}; + +static struct iommu_domain arm_smmu_blocked_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &arm_smmu_blocked_ops, +}; + static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -1609,6 +1630,7 @@ static int arm_smmu_def_domain_type(struct device *dev) static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, + .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, .probe_device = arm_smmu_probe_device, -- Gitee From d4290693041610893d59af24f981a5d14ac468d8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 17 Oct 2023 15:11:43 -0300 Subject: [PATCH 049/219] iommu/arm-smmu: Pass arm_smmu_domain to internal functions ANBZ: #13617 commit e0976331ad114af8e379e18483c346c6c79ca858 upstream. Keep the types consistent, all the callers of these functions already have obtained a struct arm_smmu_domain, don't needlessly go to/from an iommu_domain through the internal call chains. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v2-c86cc8c2230e+160bb-smmu_newapi_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index a16c65b2d5f1..8385121db418 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -414,8 +414,7 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev) { u32 fsr, fsynr, cbfrsynra; unsigned long iova; - struct iommu_domain *domain = dev; - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_domain *smmu_domain = dev; struct arm_smmu_device *smmu = smmu_domain->smmu; int idx = smmu_domain->cfg.cbndx; int ret; @@ -428,7 +427,7 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev) iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR); cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx)); - ret = report_iommu_fault(domain, NULL, iova, + ret = report_iommu_fault(&smmu_domain->domain, NULL, iova, fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); if (ret == -ENOSYS) @@ -629,7 +628,7 @@ static int arm_smmu_alloc_context_bank(struct arm_smmu_domain *smmu_domain, return __arm_smmu_alloc_bitmap(smmu->context_map, start, smmu->num_context_banks); } -static int arm_smmu_init_domain_context(struct iommu_domain *domain, +static int arm_smmu_init_domain_context(struct arm_smmu_domain *smmu_domain, struct arm_smmu_device *smmu, struct device *dev) { @@ -638,7 +637,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, struct io_pgtable_ops *pgtbl_ops; struct io_pgtable_cfg pgtbl_cfg; enum io_pgtable_fmt fmt; - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct iommu_domain *domain = &smmu_domain->domain; struct arm_smmu_cfg *cfg = &smmu_domain->cfg; irqreturn_t (*context_fault)(int irq, void *dev); @@ -812,8 +811,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, else context_fault = arm_smmu_context_fault; - ret = devm_request_irq(smmu->dev, irq, context_fault, - IRQF_SHARED, "arm-smmu-context-fault", domain); + ret = devm_request_irq(smmu->dev, irq, context_fault, IRQF_SHARED, + "arm-smmu-context-fault", smmu_domain); if (ret < 0) { dev_err(smmu->dev, "failed to request context IRQ %d (%u)\n", cfg->irptndx, irq); @@ -834,9 +833,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, return ret; } -static void arm_smmu_destroy_domain_context(struct iommu_domain *domain) +static void arm_smmu_destroy_domain_context(struct arm_smmu_domain *smmu_domain) { - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_cfg *cfg = &smmu_domain->cfg; int ret, irq; @@ -857,7 +855,7 @@ static void arm_smmu_destroy_domain_context(struct iommu_domain *domain) if (cfg->irptndx != ARM_SMMU_INVALID_IRPTNDX) { irq = smmu->irqs[cfg->irptndx]; - devm_free_irq(smmu->dev, irq, domain); + devm_free_irq(smmu->dev, irq, smmu_domain); } free_io_pgtable_ops(smmu_domain->pgtbl_ops); @@ -897,7 +895,7 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) * Free the domain resources. We assume that all devices have * already been detached. */ - arm_smmu_destroy_domain_context(domain); + arm_smmu_destroy_domain_context(smmu_domain); kfree(smmu_domain); } @@ -1142,7 +1140,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) return ret; /* Ensure that the domain is finalised */ - ret = arm_smmu_init_domain_context(domain, smmu, dev); + ret = arm_smmu_init_domain_context(smmu_domain, smmu, dev); if (ret < 0) goto rpm_put; -- Gitee From 9ad63a54fc02bdcc1c7ed6375dd8bdfc6d28f9fd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 17 Oct 2023 15:11:44 -0300 Subject: [PATCH 050/219] iommu/arm-smmu: Convert to domain_alloc_paging() ANBZ: #13617 commit 9b3febc3a3da7fcd81ece10614b7fd6c729ba8b4 upstream. Now that the BLOCKED and IDENTITY behaviors are managed with their own domains change to the domain_alloc_paging() op. The check for using_legacy_binding is now redundant, arm_smmu_def_domain_type() always returns IOMMU_DOMAIN_IDENTITY for this mode, so the core code will never attempt to create a DMA domain in the first place. Since commit a4fdd9762272 ("iommu: Use flush queue capability") the core code only passes in IDENTITY/BLOCKED/UNMANAGED/DMA domain types. It will not pass in IDENTITY or BLOCKED if the global statics exist, so the test for DMA is also redundant now too. Call arm_smmu_init_domain_context() early if a dev is available. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v2-c86cc8c2230e+160bb-smmu_newapi_jgg@nvidia.com [will: Simplify arm_smmu_domain_alloc_paging() since 'cfg' cannot be NULL] Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 8385121db418..44cab3c72072 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -864,14 +864,10 @@ static void arm_smmu_destroy_domain_context(struct arm_smmu_domain *smmu_domain) arm_smmu_rpm_put(smmu); } -static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) +static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) { struct arm_smmu_domain *smmu_domain; - if (type != IOMMU_DOMAIN_UNMANAGED) { - if (using_legacy_binding || type != IOMMU_DOMAIN_DMA) - return NULL; - } /* * Allocate the domain and initialise some of its data structures. * We can't really do anything meaningful until we've added a @@ -884,6 +880,15 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) mutex_init(&smmu_domain->init_mutex); spin_lock_init(&smmu_domain->cb_lock); + if (dev) { + struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev); + + if (arm_smmu_init_domain_context(smmu_domain, cfg->smmu, dev)) { + kfree(smmu_domain); + return NULL; + } + } + return &smmu_domain->domain; } @@ -1630,7 +1635,7 @@ static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, - .domain_alloc = arm_smmu_domain_alloc, + .domain_alloc_paging = arm_smmu_domain_alloc_paging, .probe_device = arm_smmu_probe_device, .release_device = arm_smmu_release_device, .probe_finalize = arm_smmu_probe_finalize, -- Gitee From 40edc74daddface947455e8bcac83c765e95e2be Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Wed, 13 Dec 2023 03:14:50 -0800 Subject: [PATCH 051/219] iommu/sva: Fix memory leak in iommu_sva_bind_device() ANBZ: #13617 commit 9991a82a38174672273df44a3cc6c87ef74b18e9 upstream. Free the handle when the domain allocation fails before unlocking and returning. Fixes: 092edaddb660 ("iommu: Support mm PASID 1:n with sva domains") Signed-off-by: Harshit Mogalapalli Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20231213111450.2487861-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu-sva.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 5175e8d85247..c3fc9201d0be 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -101,7 +101,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm domain = iommu_sva_domain_alloc(dev, mm); if (!domain) { ret = -ENOMEM; - goto out_unlock; + goto out_free_handle; } ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid); @@ -118,6 +118,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm out_free_domain: iommu_domain_free(domain); +out_free_handle: kfree(handle); out_unlock: mutex_unlock(&iommu_sva_lock); -- Gitee From c26deb6344b2cc62af36b31133c8784d0cc71f28 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Dec 2023 15:34:42 +0800 Subject: [PATCH 052/219] iommu/vt-d: Refactor device_to_iommu() to retrieve iommu directly ANBZ: #13617 commit 1903ef8f0d77680e6eb36bd0cc3ea3aa6e95a050 upstream. The device_to_iommu() helper was originally designed to look up the DMAR ACPI table to retrieve the iommu device and the request ID for a given device. However, it was also being used in other places where there was no need to lookup the ACPI table at all. Retrieve the iommu device directly from the per-device iommu private data in functions called after device is probed. Rename the original device_to_iommu() function to a more meaningful name, device_lookup_iommu(), to avoid mis-using it. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20231116015048.29675-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel [ Shuai Xue: also replace device_to_iommu() to device_lookup_iommu() in acpi_rmrr_andd_probe() ] Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 32 +++++++++++--------------------- drivers/iommu/intel/iommu.h | 1 - drivers/iommu/intel/svm.c | 20 +++----------------- 3 files changed, 14 insertions(+), 39 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3af810b3fd67..a7e95fbb27f1 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -704,7 +704,7 @@ static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) return false; } -struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) +static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn) { struct dmar_drhd_unit *drhd = NULL; struct pci_dev *pdev = NULL; @@ -2094,14 +2094,11 @@ static int domain_context_mapping_cb(struct pci_dev *pdev, static int domain_context_mapping(struct dmar_domain *domain, struct device *dev) { + struct device_domain_info *info = dev_iommu_priv_get(dev); struct domain_context_mapping_data data; + struct intel_iommu *iommu = info->iommu; + u8 bus = info->bus, devfn = info->devfn; struct pasid_table *table; - struct intel_iommu *iommu; - u8 bus, devfn; - - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return -ENODEV; table = intel_pasid_get_table(dev); @@ -2462,15 +2459,10 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu; + struct intel_iommu *iommu = info->iommu; unsigned long flags; - u8 bus, devfn; int ret; - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return -ENODEV; - ret = domain_attach_iommu(domain, iommu); if (ret) return ret; @@ -3779,7 +3771,7 @@ static inline int acpi_rmrr_andd_probe(struct device *dev) ret = iommu_probe_device(dev); - iommu = device_to_iommu(dev, &bus, &devfn); + iommu = device_lookup_iommu(dev, &bus, &devfn); if (!iommu) { pr_info("dpoint-- cannot get acpi device corresponding iommu\n"); return -EINVAL; @@ -4193,14 +4185,11 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) int prepare_domain_attach_device(struct iommu_domain *domain, struct device *dev) { + struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct intel_iommu *iommu; + struct intel_iommu *iommu = info->iommu; int addr_width; - iommu = device_to_iommu(dev, NULL, NULL); - if (!iommu) - return -ENODEV; - if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) return -EINVAL; @@ -4477,7 +4466,7 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) u8 bus, devfn; int ret; - iommu = device_to_iommu(dev, &bus, &devfn); + iommu = device_lookup_iommu(dev, &bus, &devfn); if (!iommu || !iommu->iommu.ops) return ERR_PTR(-ENODEV); @@ -4814,8 +4803,9 @@ static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) { - struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); + struct device_domain_info *info = dev_iommu_priv_get(dev); struct dev_pasid_info *curr, *dev_pasid = NULL; + struct intel_iommu *iommu = info->iommu; struct dmar_domain *dmar_domain; struct iommu_domain *domain; unsigned long flags; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 4ec4be87f384..5a5d4ad58d6a 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -898,7 +898,6 @@ void domain_update_iommu_cap(struct dmar_domain *domain); int dmar_ir_support(void); void iommu_flush_write_buffer(struct intel_iommu *iommu); -struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, const struct iommu_user_data *user_data); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 1f3b9b8b6155..d4cf8bc3d02d 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -392,14 +392,9 @@ static int intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, void intel_svm_remove_dev_pasid(struct device *dev, u32 pasid) { struct intel_svm_dev *sdev; - struct intel_iommu *iommu; struct intel_svm *svm; struct mm_struct *mm; - iommu = device_to_iommu(dev, NULL, NULL); - if (!iommu) - return; - if (pasid_to_svm_sdev(dev, pasid, &svm, &sdev)) return; mm = svm->mm; @@ -750,25 +745,16 @@ int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg) { + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + u8 bus = info->bus, devfn = info->devfn; struct iommu_fault_page_request *prm; - struct intel_iommu *iommu; bool private_present; bool pasid_present; bool last_page; - u8 bus, devfn; int ret = 0; u16 sid; - if (!dev || !dev_is_pci(dev)) - return -ENODEV; - - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return -ENODEV; - - if (!msg || !evt) - return -EINVAL; - prm = &evt->fault.prm; sid = PCI_DEVID(bus, devfn); pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; -- Gitee From d361cd515d1ad2d65f6782b86ee29fcca496b489 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Dec 2023 15:34:43 +0800 Subject: [PATCH 053/219] iommu/vt-d: Remove unused parameter of intel_pasid_setup_pass_through() ANBZ: #13617 commit 47642bdd5a25cd46228221719af2902b583bd36c upstream. The domain parameter of this helper is unused and can be deleted to avoid dead code. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20231116015048.29675-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 5 ++--- drivers/iommu/intel/pasid.c | 1 - drivers/iommu/intel/pasid.h | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a7e95fbb27f1..522dae92f83f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2475,7 +2475,7 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { /* Setup the PASID entry for requests without PASID: */ if (hw_pass_through && domain_type_is_si(domain)) - ret = intel_pasid_setup_pass_through(iommu, domain, + ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); else if (domain->use_first_level) ret = domain_setup_first_level(iommu, domain, dev, @@ -4876,8 +4876,7 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, goto out_free; if (domain_type_is_si(dmar_domain)) - ret = intel_pasid_setup_pass_through(iommu, dmar_domain, - dev, pasid); + ret = intel_pasid_setup_pass_through(iommu, dev, pasid); else if (dmar_domain->use_first_level) ret = domain_setup_first_level(iommu, dmar_domain, dev, pasid); diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 9fb9b200112c..070e8571b9b0 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -770,7 +770,6 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, * Set up the scalable mode pasid entry for passthrough translation type. */ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, - struct dmar_domain *domain, struct device *dev, u32 pasid) { u16 did = FLPT_DEFAULT_DID; diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index dd37611175cc..16265bc1f7ec 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -111,7 +111,6 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool enabled); int intel_pasid_setup_pass_through(struct intel_iommu *iommu, - struct dmar_domain *domain, struct device *dev, u32 pasid); int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain); -- Gitee From f5a2097357a78967706b6c3aef65b3d1929eb5a6 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Dec 2023 15:34:44 +0800 Subject: [PATCH 054/219] iommu/vt-d: Remove unused vcmd interfaces ANBZ: #13617 commit d2b66903464ec29e54ac0991fa43c0da2c2a4892 upstream. Commit 99b5726b4423 ("iommu: Remove ioasid infrastructure") has removed ioasid allocation interfaces from the iommu subsystem. As a result, these vcmd interfaces have become obsolete. Remove them to avoid dead code. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20231116015048.29675-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/debugfs.c | 3 -- drivers/iommu/intel/iommu.h | 3 -- drivers/iommu/intel/pasid.c | 57 ----------------------------------- drivers/iommu/intel/pasid.h | 12 -------- 4 files changed, 75 deletions(-) diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index dee61e513be6..86b506af7daa 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -106,9 +106,6 @@ static const struct iommu_regset iommu_regs_64[] = { IOMMU_REGSET_ENTRY(MTRR_PHYSMASK8), IOMMU_REGSET_ENTRY(MTRR_PHYSBASE9), IOMMU_REGSET_ENTRY(MTRR_PHYSMASK9), - IOMMU_REGSET_ENTRY(VCCAP), - IOMMU_REGSET_ENTRY(VCMD), - IOMMU_REGSET_ENTRY(VCRSP), }; static struct dentry *intel_iommu_debug; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 5a5d4ad58d6a..392b081e0ded 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -140,9 +140,6 @@ #define DMAR_ECEO_REG 0x408 #define DMAR_ECRSP_REG 0x410 #define DMAR_ECCAP_REG 0x430 -#define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */ -#define DMAR_VCMD_REG 0xe00 /* Virtual command register */ -#define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */ #define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) #define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 070e8571b9b0..268000d8f90b 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -27,63 +27,6 @@ */ u32 intel_pasid_max_id = PASID_MAX; -int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid) -{ - unsigned long flags; - u8 status_code; - int ret = 0; - u64 res; - - raw_spin_lock_irqsave(&iommu->register_lock, flags); - dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC); - IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, - !(res & VCMD_VRSP_IP), res); - raw_spin_unlock_irqrestore(&iommu->register_lock, flags); - - status_code = VCMD_VRSP_SC(res); - switch (status_code) { - case VCMD_VRSP_SC_SUCCESS: - *pasid = VCMD_VRSP_RESULT_PASID(res); - break; - case VCMD_VRSP_SC_NO_PASID_AVAIL: - pr_info("IOMMU: %s: No PASID available\n", iommu->name); - ret = -ENOSPC; - break; - default: - ret = -ENODEV; - pr_warn("IOMMU: %s: Unexpected error code %d\n", - iommu->name, status_code); - } - - return ret; -} - -void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid) -{ - unsigned long flags; - u8 status_code; - u64 res; - - raw_spin_lock_irqsave(&iommu->register_lock, flags); - dmar_writeq(iommu->reg + DMAR_VCMD_REG, - VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE); - IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, - !(res & VCMD_VRSP_IP), res); - raw_spin_unlock_irqrestore(&iommu->register_lock, flags); - - status_code = VCMD_VRSP_SC(res); - switch (status_code) { - case VCMD_VRSP_SC_SUCCESS: - break; - case VCMD_VRSP_SC_INVALID_PASID: - pr_info("IOMMU: %s: Invalid PASID\n", iommu->name); - break; - default: - pr_warn("IOMMU: %s: Unexpected error code %d\n", - iommu->name, status_code); - } -} - /* * Per device pasid table management: */ diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 16265bc1f7ec..00401cfc2a40 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -22,16 +22,6 @@ #define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) #define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) -/* Virtual command interface for enlightened pasid management. */ -#define VCMD_CMD_ALLOC 0x1 -#define VCMD_CMD_FREE 0x2 -#define VCMD_VRSP_IP 0x1 -#define VCMD_VRSP_SC(e) (((e) & 0xff) >> 1) -#define VCMD_VRSP_SC_SUCCESS 0 -#define VCMD_VRSP_SC_NO_PASID_AVAIL 16 -#define VCMD_VRSP_SC_INVALID_PASID 16 -#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 16) & 0xfffff) -#define VCMD_CMD_OPERAND(e) ((e) << 16) /* * Domain ID reserved for pasid entries programmed for first-level * only and pass-through transfer modes. @@ -117,8 +107,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore); -int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid); -void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid); void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, struct device *dev, u32 pasid); #endif /* __INTEL_PASID_H */ -- Gitee From 83bde92408f4c107ad36aad8afd005be6fb3e39a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Dec 2023 15:34:45 +0800 Subject: [PATCH 055/219] iommu/vt-d: Move inline helpers to header files ANBZ: #13617 commit 80b79e141da7251d6ea82573f058ee9418896465 upstream. Move inline helpers to header files so that other files can use them without duplicating the code. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20231116015048.29675-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 204 ++-------------------------------- drivers/iommu/intel/iommu.h | 175 +++++++++++++++++++++++++++++ drivers/iommu/intel/pasid.c | 216 +----------------------------------- drivers/iommu/intel/pasid.h | 210 +++++++++++++++++++++++++++++++++++ 4 files changed, 400 insertions(+), 405 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 522dae92f83f..e30017575c36 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -47,9 +47,6 @@ #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 -#define MAX_AGAW_WIDTH 64 -#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) - #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) @@ -64,74 +61,6 @@ #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) -/* page table handling */ -#define LEVEL_STRIDE (9) -#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) - -static inline int agaw_to_level(int agaw) -{ - return agaw + 2; -} - -static inline int agaw_to_width(int agaw) -{ - return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); -} - -static inline int width_to_agaw(int width) -{ - return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); -} - -static inline unsigned int level_to_offset_bits(int level) -{ - return (level - 1) * LEVEL_STRIDE; -} - -static inline int pfn_level_offset(u64 pfn, int level) -{ - return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; -} - -static inline u64 level_mask(int level) -{ - return -1ULL << level_to_offset_bits(level); -} - -static inline u64 level_size(int level) -{ - return 1ULL << level_to_offset_bits(level); -} - -static inline u64 align_to_level(u64 pfn, int level) -{ - return (pfn + level_size(level) - 1) & level_mask(level); -} - -static inline unsigned long lvl_to_nr_pages(unsigned int lvl) -{ - return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); -} - -/* VT-d pages must always be _smaller_ than MM pages. Otherwise things - are never going to work. */ -static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn) -{ - return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); -} -static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn) -{ - return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1; -} -static inline unsigned long page_to_dma_pfn(struct page *pg) -{ - return mm_to_dma_pfn_start(page_to_pfn(pg)); -} -static inline unsigned long virt_to_dma_pfn(void *p) -{ - return page_to_dma_pfn(virt_to_page(p)); -} - static void __init check_tylersburg_isoch(void); static int rwbf_quirk; @@ -169,78 +98,6 @@ static phys_addr_t root_entry_uctp(struct root_entry *re) return re->hi & VTD_PAGE_MASK; } -static inline void context_set_present(struct context_entry *context) -{ - context->lo |= 1; -} - -static inline void context_set_fault_enable(struct context_entry *context) -{ - context->lo &= (((u64)-1) << 2) | 1; -} - -static inline void context_set_translation_type(struct context_entry *context, - unsigned long value) -{ - context->lo &= (((u64)-1) << 4) | 3; - context->lo |= (value & 3) << 2; -} - -static inline void context_set_address_root(struct context_entry *context, - unsigned long value) -{ - context->lo &= ~VTD_PAGE_MASK; - context->lo |= value & VTD_PAGE_MASK; -} - -static inline void context_set_address_width(struct context_entry *context, - unsigned long value) -{ - context->hi |= value & 7; -} - -static inline void context_set_domain_id(struct context_entry *context, - unsigned long value) -{ - context->hi |= (value & ((1 << 16) - 1)) << 8; -} - -static inline void context_set_pasid(struct context_entry *context) -{ - context->lo |= CONTEXT_PASIDE; -} - -static inline int context_domain_id(struct context_entry *c) -{ - return((c->hi >> 8) & 0xffff); -} - -static inline void context_clear_entry(struct context_entry *context) -{ - context->lo = 0; - context->hi = 0; -} - -static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) -{ - if (!iommu->copied_tables) - return false; - - return test_bit(((long)bus << 8) | devfn, iommu->copied_tables); -} - -static inline void -set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) -{ - set_bit(((long)bus << 8) | devfn, iommu->copied_tables); -} - -static inline void -clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) -{ - clear_bit(((long)bus << 8) | devfn, iommu->copied_tables); -} - /* * This domain is a statically identity mapping domain. * 1. This domain creats a static 1:1 mapping to all usable memory. @@ -384,13 +241,12 @@ void free_pgtable_page(void *vaddr) free_page((unsigned long)vaddr); } -static inline int domain_type_is_si(struct dmar_domain *domain) +static int domain_type_is_si(struct dmar_domain *domain) { return domain->domain.type == IOMMU_DOMAIN_IDENTITY; } -static inline int domain_pfn_supported(struct dmar_domain *domain, - unsigned long pfn) +static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) { int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; @@ -452,7 +308,7 @@ int iommu_calculate_agaw(struct intel_iommu *iommu) return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); } -static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) +static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) { return sm_supported(iommu) ? ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); @@ -1587,9 +1443,8 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, } /* Notification for newly created mappings */ -static inline void __mapping_notify_one(struct intel_iommu *iommu, - struct dmar_domain *domain, - unsigned long pfn, unsigned int pages) +static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain, + unsigned long pfn, unsigned int pages) { /* * It's a non-present to present mapping. Only flush if caching mode @@ -1856,7 +1711,7 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) spin_unlock(&iommu->lock); } -static inline int guestwidth_to_adjustwidth(int gaw) +static int guestwidth_to_adjustwidth(int gaw) { int agaw; int r = (gaw - 12) % 9; @@ -1890,7 +1745,7 @@ static void domain_exit(struct dmar_domain *domain) * Value of X in the PDTS field of a scalable mode context entry * indicates PASID directory with 2^(X + 7) entries. */ -static inline unsigned long context_get_sm_pds(struct pasid_table *table) +static unsigned long context_get_sm_pds(struct pasid_table *table) { unsigned long pds, max_pde; @@ -1902,38 +1757,6 @@ static inline unsigned long context_get_sm_pds(struct pasid_table *table) return pds - 7; } -/* - * Set the RID_PASID field of a scalable mode context entry. The - * IOMMU hardware will use the PASID value set in this field for - * DMA translations of DMA requests without PASID. - */ -static inline void -context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) -{ - context->hi |= pasid & ((1 << 20) - 1); -} - -/* - * Set the DTE(Device-TLB Enable) field of a scalable mode context - * entry. - */ -static inline void context_set_sm_dte(struct context_entry *context) -{ - context->lo |= BIT_ULL(2); -} - -/* - * Set the PRE(Page Request Enable) field of a scalable mode context - * entry. - */ -static inline void context_set_sm_pre(struct context_entry *context) -{ - context->lo |= BIT_ULL(4); -} - -/* Convert value to context PASID directory size field coding. */ -#define context_pdts(pds) (((pds) & 0x7) << 9) - static int domain_context_mapping_one(struct dmar_domain *domain, struct intel_iommu *iommu, struct pasid_table *table, @@ -2115,18 +1938,15 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev) } /* Returns a number of VTD pages, but aligned to MM page size */ -static inline unsigned long aligned_nrpages(unsigned long host_addr, - size_t size) +static unsigned long aligned_nrpages(unsigned long host_addr, size_t size) { host_addr &= ~PAGE_MASK; return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; } /* Return largest possible superpage level for a given mapping */ -static inline int hardware_largepage_caps(struct dmar_domain *domain, - unsigned long iov_pfn, - unsigned long phy_pfn, - unsigned long pages) +static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, + unsigned long phy_pfn, unsigned long pages) { int support, level = 1; unsigned long pfnmerge; @@ -3638,7 +3458,7 @@ void intel_iommu_shutdown(void) up_write(&dmar_global_lock); } -static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) +static struct intel_iommu *dev_to_intel_iommu(struct device *dev) { struct iommu_device *iommu_dev = dev_to_iommu_device(dev); @@ -3717,7 +3537,7 @@ const struct attribute_group *intel_iommu_groups[] = { NULL, }; -static inline bool has_external_pci(void) +static bool has_external_pci(void) { struct pci_dev *pdev = NULL; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 392b081e0ded..470dc0437000 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -851,6 +851,181 @@ static inline bool context_present(struct context_entry *context) return (context->lo & 1); } +#define LEVEL_STRIDE (9) +#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) +#define MAX_AGAW_WIDTH (64) +#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) + +static inline int agaw_to_level(int agaw) +{ + return agaw + 2; +} + +static inline int agaw_to_width(int agaw) +{ + return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); +} + +static inline int width_to_agaw(int width) +{ + return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); +} + +static inline unsigned int level_to_offset_bits(int level) +{ + return (level - 1) * LEVEL_STRIDE; +} + +static inline int pfn_level_offset(u64 pfn, int level) +{ + return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; +} + +static inline u64 level_mask(int level) +{ + return -1ULL << level_to_offset_bits(level); +} + +static inline u64 level_size(int level) +{ + return 1ULL << level_to_offset_bits(level); +} + +static inline u64 align_to_level(u64 pfn, int level) +{ + return (pfn + level_size(level) - 1) & level_mask(level); +} + +static inline unsigned long lvl_to_nr_pages(unsigned int lvl) +{ + return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); +} + +/* VT-d pages must always be _smaller_ than MM pages. Otherwise things + are never going to work. */ +static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn) +{ + return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); +} +static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn) +{ + return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1; +} +static inline unsigned long page_to_dma_pfn(struct page *pg) +{ + return mm_to_dma_pfn_start(page_to_pfn(pg)); +} +static inline unsigned long virt_to_dma_pfn(void *p) +{ + return page_to_dma_pfn(virt_to_page(p)); +} + +static inline void context_set_present(struct context_entry *context) +{ + context->lo |= 1; +} + +static inline void context_set_fault_enable(struct context_entry *context) +{ + context->lo &= (((u64)-1) << 2) | 1; +} + +static inline void context_set_translation_type(struct context_entry *context, + unsigned long value) +{ + context->lo &= (((u64)-1) << 4) | 3; + context->lo |= (value & 3) << 2; +} + +static inline void context_set_address_root(struct context_entry *context, + unsigned long value) +{ + context->lo &= ~VTD_PAGE_MASK; + context->lo |= value & VTD_PAGE_MASK; +} + +static inline void context_set_address_width(struct context_entry *context, + unsigned long value) +{ + context->hi |= value & 7; +} + +static inline void context_set_domain_id(struct context_entry *context, + unsigned long value) +{ + context->hi |= (value & ((1 << 16) - 1)) << 8; +} + +static inline void context_set_pasid(struct context_entry *context) +{ + context->lo |= CONTEXT_PASIDE; +} + +static inline int context_domain_id(struct context_entry *c) +{ + return((c->hi >> 8) & 0xffff); +} + +static inline void context_clear_entry(struct context_entry *context) +{ + context->lo = 0; + context->hi = 0; +} + +#ifdef CONFIG_INTEL_IOMMU +static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + if (!iommu->copied_tables) + return false; + + return test_bit(((long)bus << 8) | devfn, iommu->copied_tables); +} + +static inline void +set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + set_bit(((long)bus << 8) | devfn, iommu->copied_tables); +} + +static inline void +clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + clear_bit(((long)bus << 8) | devfn, iommu->copied_tables); +} +#endif /* CONFIG_INTEL_IOMMU */ + +/* + * Set the RID_PASID field of a scalable mode context entry. The + * IOMMU hardware will use the PASID value set in this field for + * DMA translations of DMA requests without PASID. + */ +static inline void +context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) +{ + context->hi |= pasid & ((1 << 20) - 1); +} + +/* + * Set the DTE(Device-TLB Enable) field of a scalable mode context + * entry. + */ +static inline void context_set_sm_dte(struct context_entry *context) +{ + context->lo |= BIT_ULL(2); +} + +/* + * Set the PRE(Page Request Enable) field of a scalable mode context + * entry. + */ +static inline void context_set_sm_pre(struct context_entry *context) +{ + context->lo |= BIT_ULL(4); +} + +/* Convert value to context PASID directory size field coding. */ +#define context_pdts(pds) (((pds) & 0x7) << 9) + struct dmar_drhd_unit *dmar_find_matched_drhd_unit(struct pci_dev *dev); int dmar_enable_qi(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 268000d8f90b..e1cdf8899fa9 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -173,30 +173,6 @@ static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) /* * Interfaces for PASID table entry manipulation: */ -static inline void pasid_clear_entry(struct pasid_entry *pe) -{ - WRITE_ONCE(pe->val[0], 0); - WRITE_ONCE(pe->val[1], 0); - WRITE_ONCE(pe->val[2], 0); - WRITE_ONCE(pe->val[3], 0); - WRITE_ONCE(pe->val[4], 0); - WRITE_ONCE(pe->val[5], 0); - WRITE_ONCE(pe->val[6], 0); - WRITE_ONCE(pe->val[7], 0); -} - -static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) -{ - WRITE_ONCE(pe->val[0], PASID_PTE_FPD); - WRITE_ONCE(pe->val[1], 0); - WRITE_ONCE(pe->val[2], 0); - WRITE_ONCE(pe->val[3], 0); - WRITE_ONCE(pe->val[4], 0); - WRITE_ONCE(pe->val[5], 0); - WRITE_ONCE(pe->val[6], 0); - WRITE_ONCE(pe->val[7], 0); -} - static void intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore) { @@ -212,192 +188,6 @@ intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore) pasid_clear_entry(pe); } -static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) -{ - u64 old; - - old = READ_ONCE(*ptr); - WRITE_ONCE(*ptr, (old & ~mask) | bits); -} - -static inline u64 pasid_get_bits(u64 *ptr) -{ - return READ_ONCE(*ptr); -} - -/* - * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode - * PASID entry. - */ -static inline void -pasid_set_domain_id(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[1], GENMASK_ULL(15, 0), value); -} - -/* - * Get domain ID value of a scalable mode PASID entry. - */ -static inline u16 -pasid_get_domain_id(struct pasid_entry *pe) -{ - return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0)); -} - -/* - * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63) - * of a scalable mode PASID entry. - */ -static inline void -pasid_set_slptr(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[0], VTD_PAGE_MASK, value); -} - -/* - * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID - * entry. - */ -static inline void -pasid_set_address_width(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2); -} - -/* - * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8) - * of a scalable mode PASID entry. - */ -static inline void -pasid_set_translation_type(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6); -} - -/* - * Enable fault processing by clearing the FPD(Fault Processing - * Disable) field (Bit 1) of a scalable mode PASID entry. - */ -static inline void pasid_set_fault_enable(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[0], 1 << 1, 0); -} - -/* - * Enable second level A/D bits by setting the SLADE (Second Level - * Access Dirty Enable) field (Bit 9) of a scalable mode PASID - * entry. - */ -static inline void pasid_set_ssade(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9); -} - -/* - * Disable second level A/D bits by clearing the SLADE (Second Level - * Access Dirty Enable) field (Bit 9) of a scalable mode PASID - * entry. - */ -static inline void pasid_clear_ssade(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[0], 1 << 9, 0); -} - -/* - * Checks if second level A/D bits specifically the SLADE (Second Level - * Access Dirty Enable) field (Bit 9) of a scalable mode PASID - * entry is set. - */ -static inline bool pasid_get_ssade(struct pasid_entry *pe) -{ - return pasid_get_bits(&pe->val[0]) & (1 << 9); -} - -/* - * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a - * scalable mode PASID entry. - */ -static inline void pasid_set_sre(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 0, 1); -} - -/* - * Setup the WPE(Write Protect Enable) field (Bit 132) of a - * scalable mode PASID entry. - */ -static inline void pasid_set_wpe(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 4, 1 << 4); -} - -/* - * Setup the P(Present) field (Bit 0) of a scalable mode PASID - * entry. - */ -static inline void pasid_set_present(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[0], 1 << 0, 1); -} - -/* - * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID - * entry. - */ -static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) -{ - pasid_set_bits(&pe->val[1], 1 << 23, value << 23); -} - -/* - * Setup No Execute Enable bit (Bit 133) of a scalable mode PASID - * entry. It is required when XD bit of the first level page table - * entry is about to be set. - */ -static inline void pasid_set_nxe(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 5, 1 << 5); -} - -/* - * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode - * PASID entry. - */ -static inline void -pasid_set_pgsnp(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24); -} - -/* - * Setup the First Level Page table Pointer field (Bit 140~191) - * of a scalable mode PASID entry. - */ -static inline void -pasid_set_flptr(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[2], VTD_PAGE_MASK, value); -} - -/* - * Setup the First Level Paging Mode field (Bit 130~131) of a - * scalable mode PASID entry. - */ -static inline void -pasid_set_flpm(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); -} - -/* - * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) - * of a scalable mode PASID entry. - */ -static inline void pasid_set_eafe(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); -} - static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) @@ -559,9 +349,9 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, * Skip top levels of page tables for iommu which has less agaw * than default. Unnecessary for PT mode. */ -static inline int iommu_skip_agaw(struct dmar_domain *domain, - struct intel_iommu *iommu, - struct dma_pte **pgd) +static int iommu_skip_agaw(struct dmar_domain *domain, + struct intel_iommu *iommu, + struct dma_pte **pgd) { int agaw; diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 00401cfc2a40..8d40d4c66e31 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -86,6 +86,216 @@ static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) return (u16)((READ_ONCE(pte->val[0]) >> 6) & 0x7); } +static inline void pasid_clear_entry(struct pasid_entry *pe) +{ + WRITE_ONCE(pe->val[0], 0); + WRITE_ONCE(pe->val[1], 0); + WRITE_ONCE(pe->val[2], 0); + WRITE_ONCE(pe->val[3], 0); + WRITE_ONCE(pe->val[4], 0); + WRITE_ONCE(pe->val[5], 0); + WRITE_ONCE(pe->val[6], 0); + WRITE_ONCE(pe->val[7], 0); +} + +static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) +{ + WRITE_ONCE(pe->val[0], PASID_PTE_FPD); + WRITE_ONCE(pe->val[1], 0); + WRITE_ONCE(pe->val[2], 0); + WRITE_ONCE(pe->val[3], 0); + WRITE_ONCE(pe->val[4], 0); + WRITE_ONCE(pe->val[5], 0); + WRITE_ONCE(pe->val[6], 0); + WRITE_ONCE(pe->val[7], 0); +} + +static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) +{ + u64 old; + + old = READ_ONCE(*ptr); + WRITE_ONCE(*ptr, (old & ~mask) | bits); +} + +static inline u64 pasid_get_bits(u64 *ptr) +{ + return READ_ONCE(*ptr); +} + +/* + * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode + * PASID entry. + */ +static inline void +pasid_set_domain_id(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[1], GENMASK_ULL(15, 0), value); +} + +/* + * Get domain ID value of a scalable mode PASID entry. + */ +static inline u16 +pasid_get_domain_id(struct pasid_entry *pe) +{ + return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0)); +} + +/* + * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_slptr(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], VTD_PAGE_MASK, value); +} + +/* + * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID + * entry. + */ +static inline void +pasid_set_address_width(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2); +} + +/* + * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_translation_type(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6); +} + +/* + * Enable fault processing by clearing the FPD(Fault Processing + * Disable) field (Bit 1) of a scalable mode PASID entry. + */ +static inline void pasid_set_fault_enable(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 1, 0); +} + +/* + * Enable second level A/D bits by setting the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9); +} + +/* + * Disable second level A/D bits by clearing the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_clear_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 0); +} + +/* + * Checks if second level A/D bits specifically the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry is set. + */ +static inline bool pasid_get_ssade(struct pasid_entry *pe) +{ + return pasid_get_bits(&pe->val[0]) & (1 << 9); +} + +/* + * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_sre(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 0, 1); +} + +/* + * Setup the WPE(Write Protect Enable) field (Bit 132) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_wpe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 4, 1 << 4); +} + +/* + * Setup the P(Present) field (Bit 0) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_present(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 0, 1); +} + +/* + * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) +{ + pasid_set_bits(&pe->val[1], 1 << 23, value << 23); +} + +/* + * Setup No Execute Enable bit (Bit 133) of a scalable mode PASID + * entry. It is required when XD bit of the first level page table + * entry is about to be set. + */ +static inline void pasid_set_nxe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 5, 1 << 5); +} + +/* + * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode + * PASID entry. + */ +static inline void +pasid_set_pgsnp(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24); +} + +/* + * Setup the First Level Page table Pointer field (Bit 140~191) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_flptr(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[2], VTD_PAGE_MASK, value); +} + +/* + * Setup the First Level Paging Mode field (Bit 130~131) of a + * scalable mode PASID entry. + */ +static inline void +pasid_set_flpm(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); +} + +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. + */ +static inline void pasid_set_eafe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} + extern unsigned int intel_pasid_max_id; int intel_pasid_alloc_table(struct device *dev); void intel_pasid_free_table(struct device *dev); -- Gitee From 6d2701eb1379fcd8e767475da48b0a9a8f44ba77 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 10 Jan 2024 20:10:08 -0800 Subject: [PATCH 056/219] iommu: Add cache_invalidate_user op ANBZ: #13617 commit f35b88b66fbb5c90298ce3aa483b8a2cf1f39ad0 upstream. The updates of the PTEs in the nested page table will be propagated to the hardware caches. Add a new domain op cache_invalidate_user() for the userspace to flush the hardware caches for a nested domain through iommufd. No wrapper for it, as it's only supposed to be used by iommufd. Then, pass in invalidation requests in form of a user data array containing a number of invalidation data entries. Link: https://lore.kernel.org/r/20240111041015.47920-2-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- include/linux/iommu.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bca9a2977986..01efc72700d6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -297,6 +297,23 @@ struct iommu_user_data { size_t len; }; +/** + * struct iommu_user_data_array - iommu driver specific user space data array + * @type: The data type of all the entries in the user buffer array + * @uptr: Pointer to the user buffer array + * @entry_len: The fixed-width length of an entry in the array, in bytes + * @entry_num: The number of total entries in the array + * + * The user buffer includes an array of requests with format defined in + * include/uapi/linux/iommufd.h + */ +struct iommu_user_data_array { + unsigned int type; + void __user *uptr; + size_t entry_len; + u32 entry_num; +}; + /** * __iommu_copy_struct_from_user - Copy iommu driver specific user space data * @dst_data: Pointer to an iommu driver specific user data that is defined in @@ -462,6 +479,13 @@ struct iommu_ops { * @iotlb_sync_map: Sync mappings created recently using @map to the hardware * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue + * @cache_invalidate_user: Flush hardware cache for user space IO page table. + * The @domain must be IOMMU_DOMAIN_NESTED. The @array + * passes in the cache invalidation requests, in form + * of a driver data structure. The driver must update + * array->entry_num to report the number of handled + * invalidation requests. The driver data structure + * must be defined in include/uapi/linux/iommufd.h * @iova_to_phys: translate iova to physical address * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE, * including no-snoop TLPs on PCIe or other platform @@ -487,6 +511,8 @@ struct iommu_domain_ops { size_t size); void (*iotlb_sync)(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather); + int (*cache_invalidate_user)(struct iommu_domain *domain, + struct iommu_user_data_array *array); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); -- Gitee From d8c87a89b05d344ca403510d7b4a0fb5294daa13 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 10 Jan 2024 20:10:09 -0800 Subject: [PATCH 057/219] iommufd: Add IOMMU_HWPT_INVALIDATE ANBZ: #13617 commit 8c6eabae3807e048b9f17733af5e20500fbf858c upstream. In nested translation, the stage-1 page table is user-managed but cached by the IOMMU hardware, so an update on present page table entries in the stage-1 page table should be followed with a cache invalidation. Add an IOMMU_HWPT_INVALIDATE ioctl to support such a cache invalidation. It takes hwpt_id to specify the iommu_domain, and a multi-entry array to support multiple invalidation data in one ioctl. enum iommu_hwpt_invalidate_data_type is defined to tag the data type of the entries in the multi-entry array. Link: https://lore.kernel.org/r/20240111041015.47920-3-yi.l.liu@intel.com Reviewed-by: Kevin Tian Co-developed-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/hw_pagetable.c | 41 +++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 10 ++++++ drivers/iommu/iommufd/main.c | 3 ++ include/uapi/linux/iommufd.h | 43 +++++++++++++++++++++++++ 4 files changed, 97 insertions(+) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 29612106a962..3f3f1fa1a0a9 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -373,3 +373,44 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); return rc; } + +int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_invalidate *cmd = ucmd->cmd; + struct iommu_user_data_array data_array = { + .type = cmd->data_type, + .uptr = u64_to_user_ptr(cmd->data_uptr), + .entry_len = cmd->entry_len, + .entry_num = cmd->entry_num, + }; + struct iommufd_hw_pagetable *hwpt; + u32 done_num = 0; + int rc; + + if (cmd->__reserved) { + rc = -EOPNOTSUPP; + goto out; + } + + if (cmd->entry_num && (!cmd->data_uptr || !cmd->entry_len)) { + rc = -EINVAL; + goto out; + } + + hwpt = iommufd_get_hwpt_nested(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out; + } + + rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, + &data_array); + done_num = data_array.entry_num; + + iommufd_put_object(ucmd->ictx, &hwpt->obj); +out: + cmd->entry_num = done_num; + if (iommufd_ucmd_respond(ucmd, sizeof(*cmd))) + return -EFAULT; + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index abae041e256f..991f864d1f9b 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -328,6 +328,15 @@ iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id) IOMMUFD_OBJ_HWPT_PAGING), struct iommufd_hwpt_paging, common.obj); } + +static inline struct iommufd_hw_pagetable * +iommufd_get_hwpt_nested(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_HWPT_NESTED), + struct iommufd_hw_pagetable, obj); +} + int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); @@ -345,6 +354,7 @@ void iommufd_hwpt_paging_abort(struct iommufd_object *obj); void iommufd_hwpt_nested_destroy(struct iommufd_object *obj); void iommufd_hwpt_nested_abort(struct iommufd_object *obj); int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); +int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index c9091e46d208..39b32932c61e 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -322,6 +322,7 @@ union ucmd_buffer { struct iommu_hw_info info; struct iommu_hwpt_alloc hwpt; struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; + struct iommu_hwpt_invalidate cache; struct iommu_hwpt_set_dirty_tracking set_dirty_tracking; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; @@ -360,6 +361,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { __reserved), IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap, struct iommu_hwpt_get_dirty_bitmap, data), + IOCTL_OP(IOMMU_HWPT_INVALIDATE, iommufd_hwpt_invalidate, + struct iommu_hwpt_invalidate, __reserved), IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking, struct iommu_hwpt_set_dirty_tracking, __reserved), IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 0b2bc6252e2c..824560c50ec6 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -49,6 +49,7 @@ enum { IOMMUFD_CMD_GET_HW_INFO, IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP, + IOMMUFD_CMD_HWPT_INVALIDATE, }; /** @@ -613,4 +614,46 @@ struct iommu_hwpt_get_dirty_bitmap { #define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \ IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP) +/** + * enum iommu_hwpt_invalidate_data_type - IOMMU HWPT Cache Invalidation + * Data Type + * @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1 + */ +enum iommu_hwpt_invalidate_data_type { + IOMMU_HWPT_INVALIDATE_DATA_VTD_S1, +}; + +/** + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE) + * @size: sizeof(struct iommu_hwpt_invalidate) + * @hwpt_id: ID of a nested HWPT for cache invalidation + * @data_uptr: User pointer to an array of driver-specific cache invalidation + * data. + * @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data + * type of all the entries in the invalidation request array. It + * should be a type supported by the hwpt pointed by @hwpt_id. + * @entry_len: Length (in bytes) of a request entry in the request array + * @entry_num: Input the number of cache invalidation requests in the array. + * Output the number of requests successfully handled by kernel. + * @__reserved: Must be 0. + * + * Invalidate the iommu cache for user-managed page table. Modifications on a + * user-managed page table should be followed by this operation to sync cache. + * Each ioctl can support one or more cache invalidation requests in the array + * that has a total size of @entry_len * @entry_num. + * + * An empty invalidation request array by setting @entry_num==0 is allowed, and + * @entry_len and @data_uptr would be ignored in this case. This can be used to + * check if the given @data_type is supported or not by kernel. + */ +struct iommu_hwpt_invalidate { + __u32 size; + __u32 hwpt_id; + __aligned_u64 data_uptr; + __u32 data_type; + __u32 entry_len; + __u32 entry_num; + __u32 __reserved; +}; +#define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE) #endif -- Gitee From 85cb63c3ef48f9104fd50b09af35862920c62279 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 10 Jan 2024 20:10:10 -0800 Subject: [PATCH 058/219] iommu: Add iommu_copy_struct_from_user_array helper ANBZ: #13617 commit 77785117f9c73fd71a440a5ac86dd80752967adc upstream. Wrap up the data pointer/num sanity and __iommu_copy_struct_from_user() call for iommu drivers to copy driver specific data at a specific location in the struct iommu_user_data_array. And expect it to be used in cache_invalidate_user ops for example. Link: https://lore.kernel.org/r/20240111041015.47920-4-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- include/linux/iommu.h | 51 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 01efc72700d6..8286b93f14ae 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -354,6 +354,57 @@ static inline int __iommu_copy_struct_from_user( sizeof(*kdst), \ offsetofend(typeof(*kdst), min_last)) +/** + * __iommu_copy_struct_from_user_array - Copy iommu driver specific user space + * data from an iommu_user_data_array + * @dst_data: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @src_array: Pointer to a struct iommu_user_data_array for a user space array + * @data_type: The data type of the @dst_data. Must match with @src_array.type + * @index: Index to the location in the array to copy user data from + * @data_len: Length of current user data structure, i.e. sizeof(struct _dst) + * @min_len: Initial length of user data structure for backward compatibility. + * This should be offsetofend using the last member in the user data + * struct that was initially added to include/uapi/linux/iommufd.h + */ +static inline int __iommu_copy_struct_from_user_array( + void *dst_data, const struct iommu_user_data_array *src_array, + unsigned int data_type, unsigned int index, size_t data_len, + size_t min_len) +{ + struct iommu_user_data src_data; + + if (WARN_ON(!src_array || index >= src_array->entry_num)) + return -EINVAL; + if (!src_array->entry_num) + return -EINVAL; + src_data.uptr = src_array->uptr + src_array->entry_len * index; + src_data.len = src_array->entry_len; + src_data.type = src_array->type; + + return __iommu_copy_struct_from_user(dst_data, &src_data, data_type, + data_len, min_len); +} + +/** + * iommu_copy_struct_from_user_array - Copy iommu driver specific user space + * data from an iommu_user_data_array + * @kdst: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @user_array: Pointer to a struct iommu_user_data_array for a user space + * array + * @data_type: The data type of the @kdst. Must match with @user_array->type + * @index: Index to the location in the array to copy user data from + * @min_last: The last member of the data structure @kdst points in the + * initial version. + * Return 0 for success, otherwise -error. + */ +#define iommu_copy_struct_from_user_array(kdst, user_array, data_type, index, \ + min_last) \ + __iommu_copy_struct_from_user_array( \ + kdst, user_array, data_type, index, sizeof(*(kdst)), \ + offsetofend(typeof(*(kdst)), min_last)) + /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability -- Gitee From 8e96fed3588669e888d889d0add26f78efcc7e40 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 10 Jan 2024 20:10:11 -0800 Subject: [PATCH 059/219] iommufd/selftest: Add mock_domain_cache_invalidate_user support ANBZ: #13617 commit ac8691203c07bb1897681d5c66374174336392fe upstream. Add mock_domain_cache_invalidate_user() data structure to support user space selftest program to cover user cache invalidation pathway. Link: https://lore.kernel.org/r/20240111041015.47920-5-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/iommufd_test.h | 18 ++++++++++ drivers/iommu/iommufd/selftest.c | 50 ++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 7910fbe1962d..09dfc8aa65c4 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -148,4 +148,22 @@ struct iommu_hwpt_selftest { __u32 iotlb; }; +/* Should not be equal to any defined value in enum iommu_hwpt_invalidate_data_type */ +#define IOMMU_HWPT_INVALIDATE_DATA_SELFTEST 0xdeadbeef +#define IOMMU_HWPT_INVALIDATE_DATA_SELFTEST_INVALID 0xdadbeef + +/** + * struct iommu_hwpt_invalidate_selftest - Invalidation data for Mock driver + * (IOMMU_HWPT_INVALIDATE_DATA_SELFTEST) + * @flags: Invalidate flags + * @iotlb_id: Invalidate iotlb entry index + * + * If IOMMU_TEST_INVALIDATE_ALL is set in @flags, @iotlb_id will be ignored + */ +struct iommu_hwpt_invalidate_selftest { +#define IOMMU_TEST_INVALIDATE_FLAG_ALL (1 << 0) + __u32 flags; + __u32 iotlb_id; +}; + #endif diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index d437ffb715e3..3fd47e6bf57c 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -488,9 +488,59 @@ static void mock_domain_free_nested(struct iommu_domain *domain) kfree(mock_nested); } +static int +mock_domain_cache_invalidate_user(struct iommu_domain *domain, + struct iommu_user_data_array *array) +{ + struct mock_iommu_domain_nested *mock_nested = + container_of(domain, struct mock_iommu_domain_nested, domain); + struct iommu_hwpt_invalidate_selftest inv; + u32 processed = 0; + int i = 0, j; + int rc = 0; + + if (array->type != IOMMU_HWPT_INVALIDATE_DATA_SELFTEST) { + rc = -EINVAL; + goto out; + } + + for ( ; i < array->entry_num; i++) { + rc = iommu_copy_struct_from_user_array(&inv, array, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + i, iotlb_id); + if (rc) + break; + + if (inv.flags & ~IOMMU_TEST_INVALIDATE_FLAG_ALL) { + rc = -EOPNOTSUPP; + break; + } + + if (inv.iotlb_id > MOCK_NESTED_DOMAIN_IOTLB_ID_MAX) { + rc = -EINVAL; + break; + } + + if (inv.flags & IOMMU_TEST_INVALIDATE_FLAG_ALL) { + /* Invalidate all mock iotlb entries and ignore iotlb_id */ + for (j = 0; j < MOCK_NESTED_DOMAIN_IOTLB_NUM; j++) + mock_nested->iotlb[j] = 0; + } else { + mock_nested->iotlb[inv.iotlb_id] = 0; + } + + processed++; + } + +out: + array->entry_num = processed; + return rc; +} + static struct iommu_domain_ops domain_nested_ops = { .free = mock_domain_free_nested, .attach_dev = mock_domain_nop_attach, + .cache_invalidate_user = mock_domain_cache_invalidate_user, }; static inline struct iommufd_hw_pagetable * -- Gitee From fa872283294ec1ca829d2a6b597592a587af677b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 10 Jan 2024 20:10:12 -0800 Subject: [PATCH 060/219] iommufd/selftest: Add IOMMU_TEST_OP_MD_CHECK_IOTLB test op ANBZ: #13617 commit e1fa6640d58e3529bd5e392dd92371cde2b31283 upstream. Allow to test whether IOTLB has been invalidated or not. Link: https://lore.kernel.org/r/20240111041015.47920-6-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/iommufd_test.h | 5 ++++ drivers/iommu/iommufd/selftest.c | 26 +++++++++++++++++++ tools/testing/selftests/iommu/iommufd.c | 4 +++ tools/testing/selftests/iommu/iommufd_utils.h | 24 +++++++++++++++++ 4 files changed, 59 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 09dfc8aa65c4..482d4059f5db 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -21,6 +21,7 @@ enum { IOMMU_TEST_OP_ACCESS_REPLACE_IOAS, IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS, IOMMU_TEST_OP_DIRTY, + IOMMU_TEST_OP_MD_CHECK_IOTLB, }; enum { @@ -121,6 +122,10 @@ struct iommu_test_cmd { __aligned_u64 uptr; __aligned_u64 out_nr_dirty; } dirty; + struct { + __u32 id; + __u32 iotlb; + } check_iotlb; }; __u32 last; }; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 3fd47e6bf57c..d8860785d3b4 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -858,6 +858,28 @@ static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd, return 0; } +static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, + u32 mockpt_id, unsigned int iotlb_id, + u32 iotlb) +{ + struct mock_iommu_domain_nested *mock_nested; + struct iommufd_hw_pagetable *hwpt; + int rc = 0; + + hwpt = get_md_pagetable_nested(ucmd, mockpt_id, &mock_nested); + if (IS_ERR(hwpt)) + return PTR_ERR(hwpt); + + mock_nested = container_of(hwpt->domain, + struct mock_iommu_domain_nested, domain); + + if (iotlb_id > MOCK_NESTED_DOMAIN_IOTLB_ID_MAX || + mock_nested->iotlb[iotlb_id] != iotlb) + rc = -EINVAL; + iommufd_put_object(ucmd->ictx, &hwpt->obj); + return rc; +} + struct selftest_access { struct iommufd_access *access; struct file *file; @@ -1339,6 +1361,10 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return iommufd_test_md_check_refs( ucmd, u64_to_user_ptr(cmd->check_refs.uptr), cmd->check_refs.length, cmd->check_refs.refs); + case IOMMU_TEST_OP_MD_CHECK_IOTLB: + return iommufd_test_md_check_iotlb(ucmd, cmd->id, + cmd->check_iotlb.id, + cmd->check_iotlb.iotlb); case IOMMU_TEST_OP_CREATE_ACCESS: return iommufd_test_create_access(ucmd, cmd->id, cmd->create_access.flags); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 32203f54dee8..09ec63cb3a82 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -330,6 +330,10 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) &nested_hwpt_id[1], IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data)); + test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[0], + IOMMU_TEST_IOTLB_DEFAULT); + test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[1], + IOMMU_TEST_IOTLB_DEFAULT); /* Negative test: a nested hwpt on top of a nested hwpt */ test_err_hwpt_alloc_nested(EINVAL, self->device_id, diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index ad9202335656..fe0a0f566b67 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -195,6 +195,30 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ hwpt_id, data_type, data, data_len)) +#define test_cmd_hwpt_check_iotlb(hwpt_id, iotlb_id, expected) \ + ({ \ + struct iommu_test_cmd test_cmd = { \ + .size = sizeof(test_cmd), \ + .op = IOMMU_TEST_OP_MD_CHECK_IOTLB, \ + .id = hwpt_id, \ + .check_iotlb = { \ + .id = iotlb_id, \ + .iotlb = expected, \ + }, \ + }; \ + ASSERT_EQ(0, \ + ioctl(self->fd, \ + _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_IOTLB), \ + &test_cmd)); \ + }) + +#define test_cmd_hwpt_check_iotlb_all(hwpt_id, expected) \ + ({ \ + int i; \ + for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++) \ + test_cmd_hwpt_check_iotlb(hwpt_id, i, expected); \ + }) + static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, unsigned int ioas_id) { -- Gitee From a511465a1ebc0558acf9b5d639b36e416b0b04f0 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 10 Jan 2024 20:10:13 -0800 Subject: [PATCH 061/219] iommufd/selftest: Add coverage for IOMMU_HWPT_INVALIDATE ioctl ANBZ: #13617 commit bf26eb83fd3b6f392cf68285c8858db5bfd5e570 upstream. Add test cases for the IOMMU_HWPT_INVALIDATE ioctl and verify it by using the new IOMMU_TEST_OP_MD_CHECK_IOTLB. Link: https://lore.kernel.org/r/20240111041015.47920-7-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- tools/testing/selftests/iommu/iommufd.c | 148 ++++++++++++++++++ tools/testing/selftests/iommu/iommufd_utils.h | 31 ++++ 2 files changed, 179 insertions(+) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 09ec63cb3a82..949f8fc03771 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -116,6 +116,7 @@ TEST_F(iommufd, cmd_length) TEST_LENGTH(iommu_destroy, IOMMU_DESTROY, id); TEST_LENGTH(iommu_hw_info, IOMMU_GET_HW_INFO, __reserved); TEST_LENGTH(iommu_hwpt_alloc, IOMMU_HWPT_ALLOC, __reserved); + TEST_LENGTH(iommu_hwpt_invalidate, IOMMU_HWPT_INVALIDATE, __reserved); TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC, out_ioas_id); TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES, out_iova_alignment); @@ -271,7 +272,9 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) struct iommu_hwpt_selftest data = { .iotlb = IOMMU_TEST_IOTLB_DEFAULT, }; + struct iommu_hwpt_invalidate_selftest inv_reqs[2] = {}; uint32_t nested_hwpt_id[2] = {}; + uint32_t num_inv; uint32_t parent_hwpt_id = 0; uint32_t parent_hwpt_id_not_work = 0; uint32_t test_hwpt_id = 0; @@ -344,6 +347,151 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, parent_hwpt_id)); + /* hwpt_invalidate only supports a user-managed hwpt (nested) */ + num_inv = 1; + test_err_hwpt_invalidate(ENOENT, parent_hwpt_id, inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Check data_type by passing zero-length array */ + num_inv = 0; + test_cmd_hwpt_invalidate(nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: Invalid data_type */ + num_inv = 1; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST_INVALID, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: structure size sanity */ + num_inv = 1; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs) + 1, &num_inv); + assert(!num_inv); + + num_inv = 1; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + 1, &num_inv); + assert(!num_inv); + + /* Negative test: invalid flag is passed */ + num_inv = 1; + inv_reqs[0].flags = 0xffffffff; + test_err_hwpt_invalidate(EOPNOTSUPP, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: invalid data_uptr when array is not empty */ + num_inv = 1; + inv_reqs[0].flags = 0; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], NULL, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: invalid entry_len when array is not empty */ + num_inv = 1; + inv_reqs[0].flags = 0; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + 0, &num_inv); + assert(!num_inv); + + /* Negative test: invalid iotlb_id */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].iotlb_id = MOCK_NESTED_DOMAIN_IOTLB_ID_MAX + 1; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* + * Invalidate the 1st iotlb entry but fail the 2nd request + * due to invalid flags configuration in the 2nd request. + */ + num_inv = 2; + inv_reqs[0].flags = 0; + inv_reqs[0].iotlb_id = 0; + inv_reqs[1].flags = 0xffffffff; + inv_reqs[1].iotlb_id = 1; + test_err_hwpt_invalidate(EOPNOTSUPP, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 0, 0); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 1, + IOMMU_TEST_IOTLB_DEFAULT); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 2, + IOMMU_TEST_IOTLB_DEFAULT); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 3, + IOMMU_TEST_IOTLB_DEFAULT); + + /* + * Invalidate the 1st iotlb entry but fail the 2nd request + * due to invalid iotlb_id configuration in the 2nd request. + */ + num_inv = 2; + inv_reqs[0].flags = 0; + inv_reqs[0].iotlb_id = 0; + inv_reqs[1].flags = 0; + inv_reqs[1].iotlb_id = MOCK_NESTED_DOMAIN_IOTLB_ID_MAX + 1; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 0, 0); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 1, + IOMMU_TEST_IOTLB_DEFAULT); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 2, + IOMMU_TEST_IOTLB_DEFAULT); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 3, + IOMMU_TEST_IOTLB_DEFAULT); + + /* Invalidate the 2nd iotlb entry and verify */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].iotlb_id = 1; + test_cmd_hwpt_invalidate(nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 0, 0); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 1, 0); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 2, + IOMMU_TEST_IOTLB_DEFAULT); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 3, + IOMMU_TEST_IOTLB_DEFAULT); + + /* Invalidate the 3rd and 4th iotlb entries and verify */ + num_inv = 2; + inv_reqs[0].flags = 0; + inv_reqs[0].iotlb_id = 2; + inv_reqs[1].flags = 0; + inv_reqs[1].iotlb_id = 3; + test_cmd_hwpt_invalidate(nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 2); + test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[0], 0); + + /* Invalidate all iotlb entries for nested_hwpt_id[1] and verify */ + num_inv = 1; + inv_reqs[0].flags = IOMMU_TEST_INVALIDATE_FLAG_ALL; + test_cmd_hwpt_invalidate(nested_hwpt_id[1], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[1], 0); + /* Attach device to nested_hwpt_id[0] that then will be busy */ test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id[0]); EXPECT_ERRNO(EBUSY, diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index fe0a0f566b67..c646264aa41f 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -219,6 +219,37 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, test_cmd_hwpt_check_iotlb(hwpt_id, i, expected); \ }) +static int _test_cmd_hwpt_invalidate(int fd, __u32 hwpt_id, void *reqs, + uint32_t data_type, uint32_t lreq, + uint32_t *nreqs) +{ + struct iommu_hwpt_invalidate cmd = { + .size = sizeof(cmd), + .hwpt_id = hwpt_id, + .data_type = data_type, + .data_uptr = (uint64_t)reqs, + .entry_len = lreq, + .entry_num = *nreqs, + }; + int rc = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cmd); + *nreqs = cmd.entry_num; + return rc; +} + +#define test_cmd_hwpt_invalidate(hwpt_id, reqs, data_type, lreq, nreqs) \ + ({ \ + ASSERT_EQ(0, \ + _test_cmd_hwpt_invalidate(self->fd, hwpt_id, reqs, \ + data_type, lreq, nreqs)); \ + }) +#define test_err_hwpt_invalidate(_errno, hwpt_id, reqs, data_type, lreq, \ + nreqs) \ + ({ \ + EXPECT_ERRNO(_errno, _test_cmd_hwpt_invalidate( \ + self->fd, hwpt_id, reqs, \ + data_type, lreq, nreqs)); \ + }) + static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, unsigned int ioas_id) { -- Gitee From bdd9705cdaa8b5607e78622ce66e380f3355dcff Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 10 Jan 2024 20:10:14 -0800 Subject: [PATCH 062/219] iommufd: Add data structure for Intel VT-d stage-1 cache invalidation ANBZ: #13617 commit 393a5778b72a7b551493d0fd3fbe0282154058fe upstream. This adds the data structure invalidating caches for the nested domain allocated with IOMMU_HWPT_DATA_VTD_S1 type. Link: https://lore.kernel.org/r/20240111041015.47920-8-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- include/uapi/linux/iommufd.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 824560c50ec6..1dfeaa2e649e 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -623,6 +623,42 @@ enum iommu_hwpt_invalidate_data_type { IOMMU_HWPT_INVALIDATE_DATA_VTD_S1, }; +/** + * enum iommu_hwpt_vtd_s1_invalidate_flags - Flags for Intel VT-d + * stage-1 cache invalidation + * @IOMMU_VTD_INV_FLAGS_LEAF: Indicates whether the invalidation applies + * to all-levels page structure cache or just + * the leaf PTE cache. + */ +enum iommu_hwpt_vtd_s1_invalidate_flags { + IOMMU_VTD_INV_FLAGS_LEAF = 1 << 0, +}; + +/** + * struct iommu_hwpt_vtd_s1_invalidate - Intel VT-d cache invalidation + * (IOMMU_HWPT_INVALIDATE_DATA_VTD_S1) + * @addr: The start address of the range to be invalidated. It needs to + * be 4KB aligned. + * @npages: Number of contiguous 4K pages to be invalidated. + * @flags: Combination of enum iommu_hwpt_vtd_s1_invalidate_flags + * @__reserved: Must be 0 + * + * The Intel VT-d specific invalidation data for user-managed stage-1 cache + * invalidation in nested translation. Userspace uses this structure to + * tell the impacted cache scope after modifying the stage-1 page table. + * + * Invalidating all the caches related to the page table by setting @addr + * to be 0 and @npages to be U64_MAX. + * + * The device TLB will be invalidated automatically if ATS is enabled. + */ +struct iommu_hwpt_vtd_s1_invalidate { + __aligned_u64 addr; + __aligned_u64 npages; + __u32 flags; + __u32 __reserved; +}; + /** * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE) * @size: sizeof(struct iommu_hwpt_invalidate) -- Gitee From a67c926f26f670e235eeb48674ab410aa4896bcd Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 10 Jan 2024 20:10:15 -0800 Subject: [PATCH 063/219] iommu/vt-d: Add iotlb flush for nested domain ANBZ: #13617 commit f6f3721244a8224fa97952b34fbb21e4ee40e8a8 upstream. This implements the .cache_invalidate_user() callback to support iotlb flush for nested domain. Link: https://lore.kernel.org/r/20240111041015.47920-9-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Lu Baolu Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/intel/nested.c | 88 ++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index b5a5563ab32c..f26c7f1c46cc 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -73,9 +73,97 @@ static void intel_nested_domain_free(struct iommu_domain *domain) kfree(to_dmar_domain(domain)); } +static void nested_flush_dev_iotlb(struct dmar_domain *domain, u64 addr, + unsigned int mask) +{ + struct device_domain_info *info; + unsigned long flags; + u16 sid, qdep; + + spin_lock_irqsave(&domain->lock, flags); + list_for_each_entry(info, &domain->devices, link) { + if (!info->ats_enabled) + continue; + sid = info->bus << 8 | info->devfn; + qdep = info->ats_qdep; + qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, + qdep, addr, mask); + quirk_extra_dev_tlb_flush(info, addr, mask, + IOMMU_NO_PASID, qdep); + } + spin_unlock_irqrestore(&domain->lock, flags); +} + +static void intel_nested_flush_cache(struct dmar_domain *domain, u64 addr, + unsigned long npages, bool ih) +{ + struct iommu_domain_info *info; + unsigned int mask; + unsigned long i; + + xa_for_each(&domain->iommu_array, i, info) + qi_flush_piotlb(info->iommu, + domain_id_iommu(domain, info->iommu), + IOMMU_NO_PASID, addr, npages, ih); + + if (!domain->has_iotlb_device) + return; + + if (npages == U64_MAX) + mask = 64 - VTD_PAGE_SHIFT; + else + mask = ilog2(__roundup_pow_of_two(npages)); + + nested_flush_dev_iotlb(domain, addr, mask); +} + +static int intel_nested_cache_invalidate_user(struct iommu_domain *domain, + struct iommu_user_data_array *array) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct iommu_hwpt_vtd_s1_invalidate inv_entry; + u32 index, processed = 0; + int ret = 0; + + if (array->type != IOMMU_HWPT_INVALIDATE_DATA_VTD_S1) { + ret = -EINVAL; + goto out; + } + + for (index = 0; index < array->entry_num; index++) { + ret = iommu_copy_struct_from_user_array(&inv_entry, array, + IOMMU_HWPT_INVALIDATE_DATA_VTD_S1, + index, __reserved); + if (ret) + break; + + if ((inv_entry.flags & ~IOMMU_VTD_INV_FLAGS_LEAF) || + inv_entry.__reserved) { + ret = -EOPNOTSUPP; + break; + } + + if (!IS_ALIGNED(inv_entry.addr, VTD_PAGE_SIZE) || + ((inv_entry.npages == U64_MAX) && inv_entry.addr)) { + ret = -EINVAL; + break; + } + + intel_nested_flush_cache(dmar_domain, inv_entry.addr, + inv_entry.npages, + inv_entry.flags & IOMMU_VTD_INV_FLAGS_LEAF); + processed++; + } + +out: + array->entry_num = processed; + return ret; +} + static const struct iommu_domain_ops intel_nested_domain_ops = { .attach_dev = intel_nested_attach_dev, .free = intel_nested_domain_free, + .cache_invalidate_user = intel_nested_cache_invalidate_user, }; struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, -- Gitee From b40c3183ae33c6cc647a9c816959fbfc779686c6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 3 Jan 2024 11:27:22 -0400 Subject: [PATCH 064/219] iommufd/selftest: Check the bus type during probe ANBZ: #13617 commit 47f2bd2ff382e5fe766b1322e354558a8da4a470 upstream. This relied on the probe function only being invoked by the bus type mock was registered on. The removal of the bus ops broke this assumption and the probe could be called on non-mock bus types like PCI. Check the bus type directly in probe. Fixes: 17de3f5fdd35 ("iommu: Retire bus ops") Link: https://lore.kernel.org/r/0-v1-82d59f7eab8c+40c-iommufd_mock_bus_jgg@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/selftest.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index d8860785d3b4..dd52d28d1423 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -25,6 +25,19 @@ static struct iommu_domain_ops domain_nested_ops; size_t iommufd_test_memory_limit = 65536; +struct mock_bus_type { + struct bus_type bus; + struct notifier_block nb; +}; + +static struct mock_bus_type iommufd_mock_bus_type = { + .bus = { + .name = "iommufd_mock", + }, +}; + +static atomic_t mock_dev_num; + enum { MOCK_DIRTY_TRACK = 1, MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, @@ -452,6 +465,8 @@ static struct iommu_device mock_iommu_device = { static struct iommu_device *mock_probe_device(struct device *dev) { + if (dev->bus != &iommufd_mock_bus_type.bus) + return ERR_PTR(-ENODEV); return &mock_iommu_device; } @@ -591,19 +606,6 @@ get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id, return hwpt; } -struct mock_bus_type { - struct bus_type bus; - struct notifier_block nb; -}; - -static struct mock_bus_type iommufd_mock_bus_type = { - .bus = { - .name = "iommufd_mock", - }, -}; - -static atomic_t mock_dev_num; - static void mock_dev_release(struct device *dev) { struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); -- Gitee From ca700674f78f460df628a9e7db422559bb9b39f5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Jan 2024 12:12:53 -0400 Subject: [PATCH 065/219] iommu: Allow ops->default_domain to work when !CONFIG_IOMMU_DMA ANBZ: #13617 commit 83b3836bf83f09beea5f592b126cfdd1bc921e48 upstream. The ops->default_domain flow used a 0 req_type to select the default domain and this was enforced by iommu_group_alloc_default_domain(). When !CONFIG_IOMMU_DMA started forcing the old ARM32 drivers into IDENTITY it also overroad the 0 req_type of the ops->default_domain drivers to IDENTITY which ends up causing failures during device probe. Make iommu_group_alloc_default_domain() accept a req_type that matches the ops->default_domain and have iommu_group_alloc_default_domain() generate a req_type that matches the default_domain. This way the req_type always describes what kind of domain should be attached and ops->default_domain overrides all other mechanisms to choose the default domain. Fixes: 2ad56efa80db ("powerpc/iommu: Setup a default domain and remove set_platform_dma_ops") Fixes: 0f6a90436a57 ("iommu: Do not use IOMMU_DOMAIN_DMA if CONFIG_IOMMU_DMA is not enabled") Reported-by: Ovidiu Panait Closes: https://lore.kernel.org/linux-iommu/20240123165829.630276-1-ovidiu.panait@windriver.com/ Reported-by: Shivaprasad G Bhat Closes: https://lore.kernel.org/linux-iommu/170618452753.3805.4425669653666211728.stgit@ltcd48-lp2.aus.stglab.ibm.com/ Tested-by: Ovidiu Panait Tested-by: Shivaprasad G Bhat Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-755bd21c4a64+525b8-iommu_def_dom_fix_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ad458c0da661..10673e3e1d80 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1810,7 +1810,7 @@ iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) * domain. Do not use in new drivers. */ if (ops->default_domain) { - if (req_type) + if (req_type != ops->default_domain->type) return ERR_PTR(-EINVAL); return ops->default_domain; } @@ -1882,10 +1882,18 @@ static int iommu_get_def_domain_type(struct iommu_group *group, const struct iommu_ops *ops = dev_iommu_ops(dev); int type; - if (!ops->def_domain_type) - return cur_type; - - type = ops->def_domain_type(dev); + if (ops->default_domain) { + /* + * Drivers that declare a global static default_domain will + * always choose that. + */ + type = ops->default_domain->type; + } else { + if (ops->def_domain_type) + type = ops->def_domain_type(dev); + else + return cur_type; + } if (!type || cur_type == type) return cur_type; if (!cur_type) -- Gitee From 8dd4961e754f20fdd0b3f906c745c616b05be0be Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Oct 2023 10:35:11 -0300 Subject: [PATCH 066/219] powerpc/iommu: Do not do platform domain attach atctions after probe ANBZ: #13617 commit a8ca9fc9134c1a43e6d4db7ff59496bbd7075def upstream. POWER throws a splat at boot, it looks like the DMA ops were probably changed while a driver was attached. Something is still weird about how power sequences its bootup. Previously this was hidden since the core iommu code did nothing during probe, now it calls spapr_tce_platform_iommu_attach_dev(). Make spapr_tce_platform_iommu_attach_dev() do nothing on the probe time call like it did before. WARNING: CPU: 0 PID: 8 at arch/powerpc/kernel/iommu.c:407 __iommu_free+0x1e4/0x1f0 Modules linked in: sd_mod t10_pi crc64_rocksoft crc64 sg ibmvfc mlx5_core(+) scsi_transport_fc ibmveth mlxfw psample dm_multipath dm_mirror dm_region_hash dm_log dm_mod fuse CPU: 0 PID: 8 Comm: kworker/0:0 Not tainted 6.6.0-rc3-next-20230929-auto #1 Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1030.30 (NH1030_062) hv:phyp pSeries Workqueue: events work_for_cpu_fn NIP: c00000000005f6d4 LR: c00000000005f6d0 CTR: 00000000005ca81c REGS: c000000003a27890 TRAP: 0700 Not tainted (6.6.0-rc3-next-20230929-auto) MSR: 800000000282b033 CR: 48000824 XER: 00000008 CFAR: c00000000020f738 IRQMASK: 0 GPR00: c00000000005f6d0 c000000003a27b30 c000000001481800 000000000000017 GPR04: 00000000ffff7fff c000000003a27950 c000000003a27948 0000000000000027 GPR08: c000000c18c07c10 0000000000000001 0000000000000027 c000000002ac8a08 GPR12: 0000000000000000 c000000002ff0000 c00000000019cc88 c000000003042300 GPR16: 0000000000000000 0000000000000000 0000000000000000 c000000003071ab0 GPR20: c00000000349f80d c000000003215440 c000000003215480 61c8864680b583eb GPR24: 0000000000000000 000000007fffffff 0800000020000000 0000000000000010 GPR28: 0000000000020000 0000800000020000 c00000000c5dc800 c00000000c5dc880 NIP [c00000000005f6d4] __iommu_free+0x1e4/0x1f0 LR [c00000000005f6d0] __iommu_free+0x1e0/0x1f0 Call Trace: [c000000003a27b30] [c00000000005f6d0] __iommu_free+0x1e0/0x1f0 (unreliable) [c000000003a27bc0] [c00000000005f848] iommu_free+0x28/0x70 [c000000003a27bf0] [c000000000061518] iommu_free_coherent+0x68/0xa0 [c000000003a27c20] [c00000000005e8d4] dma_iommu_free_coherent+0x24/0x40 [c000000003a27c40] [c00000000024698c] dma_free_attrs+0x10c/0x140 [c000000003a27c90] [c008000000dcb8d4] mlx5_cmd_cleanup+0x5c/0x90 [mlx5_core] [c000000003a27cc0] [c008000000dc45a0] mlx5_mdev_uninit+0xc8/0x100 [mlx5_core] [c000000003a27d00] [c008000000dc4ac4] probe_one+0x3ec/0x530 [mlx5_core] [c000000003a27d90] [c0000000008c5edc] local_pci_probe+0x6c/0x110 [c000000003a27e10] [c000000000189c98] work_for_cpu_fn+0x38/0x60 [c000000003a27e40] [c00000000018d1d0] process_scheduled_works+0x230/0x4f0 [c000000003a27f10] [c00000000018ff14] worker_thread+0x1e4/0x500 [c000000003a27f90] [c00000000019cdb8] kthread+0x138/0x140 [c000000003a27fe0] [c00000000000df98] start_kernel_thread+0x14/0x18 Code: 481b004d 60000000 e89e0028 3c62ffe0 3863dd20 481b0039 60000000 e89e0038 3c62ffe0 3863dd38 481b0025 60000000 <0fe00000> 4bffff20 60000000 3c4c0142 ---[ end trace 0000000000000000 ]--- iommu_free: invalid entry entry = 0x8000000203d0 dma_addr = 0x8000000203d0000 Table = 0xc00000000c5dc800 bus# = 0x1 size = 0x20000 startOff = 0x800000000000 index = 0x70200016 Fixes: 2ad56efa80db ("powerpc/iommu: Setup a default domain and remove set_platform_dma_ops") Reported-by: Tasmiya Nalatwad Link: https://lore.kernel.org/r/d06cee81-c47f-9d62-dfc6-4c77b60058db@linux.vnet.ibm.com Tested-by: Tasmiya Nalatwad Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-2b52423411b9+164fc-iommu_ppc_defdomain_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- arch/powerpc/kernel/iommu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 18c1b5ba4aea..3425b917c8d6 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1280,13 +1280,19 @@ struct iommu_table_group_ops spapr_tce_table_group_ops = { /* * A simple iommu_ops to allow less cruft in generic VFIO code. */ -static int spapr_tce_platform_iommu_attach_dev(struct iommu_domain *dom, - struct device *dev) +static int +spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain, + struct device *dev) { + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct iommu_group *grp = iommu_group_get(dev); struct iommu_table_group *table_group; int ret = -EINVAL; + /* At first attach the ownership is already set */ + if (!domain) + return 0; + if (!grp) return -ENODEV; -- Gitee From 67aa04c9bc2f89ba22ddf3d73341b818b9ea5bf7 Mon Sep 17 00:00:00 2001 From: Shivaprasad G Bhat Date: Fri, 26 Jan 2024 09:09:18 -0600 Subject: [PATCH 067/219] powerpc: iommu: Bring back table group release_ownership() call ANBZ: #13617 commit d2d00e15808c37ec476a5c040ee2cdd23854ef18 upstream. The commit 2ad56efa80db ("powerpc/iommu: Setup a default domain and remove set_platform_dma_ops") refactored the code removing the set_platform_dma_ops(). It missed out the table group release_ownership() call which would have got called otherwise during the guest shutdown via vfio_group_detach_container(). On PPC64, this particular call actually sets up the 32-bit TCE table, and enables the 64-bit DMA bypass etc. Now after guest shutdown, the subsequent host driver (e.g megaraid-sas) probe post unbind from vfio-pci fails like, megaraid_sas 0031:01:00.0: Warning: IOMMU dma not supported: mask 0x7fffffffffffffff, table unavailable megaraid_sas 0031:01:00.0: Warning: IOMMU dma not supported: mask 0xffffffff, table unavailable megaraid_sas 0031:01:00.0: Failed to set DMA mask megaraid_sas 0031:01:00.0: Failed from megasas_init_fw 6539 The patch brings back the call to table_group release_ownership() call when switching back to PLATFORM domain from BLOCKED, while also separates the domain_ops for both. Fixes: 2ad56efa80db ("powerpc/iommu: Setup a default domain and remove set_platform_dma_ops") Signed-off-by: Shivaprasad G Bhat Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/170628173462.3742.18330000394415935845.stgit@ltcd48-lp2.aus.stglab.ibm.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- arch/powerpc/kernel/iommu.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 3425b917c8d6..bcf15d93e1b1 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1287,20 +1287,20 @@ spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain, struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct iommu_group *grp = iommu_group_get(dev); struct iommu_table_group *table_group; - int ret = -EINVAL; /* At first attach the ownership is already set */ if (!domain) return 0; - if (!grp) - return -ENODEV; - table_group = iommu_group_get_iommudata(grp); - ret = table_group->ops->take_ownership(table_group); + /* + * The domain being set to PLATFORM from earlier + * BLOCKED. The table_group ownership has to be released. + */ + table_group->ops->release_ownership(table_group); iommu_group_put(grp); - return ret; + return 0; } static const struct iommu_domain_ops spapr_tce_platform_domain_ops = { @@ -1312,13 +1312,32 @@ static struct iommu_domain spapr_tce_platform_domain = { .ops = &spapr_tce_platform_domain_ops, }; -static struct iommu_domain spapr_tce_blocked_domain = { - .type = IOMMU_DOMAIN_BLOCKED, +static int +spapr_tce_blocked_iommu_attach_dev(struct iommu_domain *platform_domain, + struct device *dev) +{ + struct iommu_group *grp = iommu_group_get(dev); + struct iommu_table_group *table_group; + int ret = -EINVAL; + /* * FIXME: SPAPR mixes blocked and platform behaviors, the blocked domain * also sets the dma_api ops */ - .ops = &spapr_tce_platform_domain_ops, + table_group = iommu_group_get_iommudata(grp); + ret = table_group->ops->take_ownership(table_group); + iommu_group_put(grp); + + return ret; +} + +static const struct iommu_domain_ops spapr_tce_blocked_domain_ops = { + .attach_dev = spapr_tce_blocked_iommu_attach_dev, +}; + +static struct iommu_domain spapr_tce_blocked_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &spapr_tce_blocked_domain_ops, }; static bool spapr_tce_iommu_capable(struct device *dev, enum iommu_cap cap) -- Gitee From 806204fd6763c8ba78bf3bf33ed39d7a523c4596 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:09 +0000 Subject: [PATCH 068/219] iommufd/selftest: Test u64 unaligned bitmaps ANBZ: #13617 commit 42af95114535dd94c39714b97ad720602d406b9a upstream. Exercise the dirty tracking bitmaps with byte unaligned addresses in addition to the PAGE_SIZE unaligned bitmaps, using a address towards the end of the page boundary. In doing so, increase the tailroom we allocate for the bitmap from MOCK_PAGE_SIZE(2K) into PAGE_SIZE(4K), such that we can test end of bitmap boundary. Link: https://lore.kernel.org/r/20240202133415.23819-4-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- tools/testing/selftests/iommu/iommufd.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 949f8fc03771..bf1b979cf28d 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1741,9 +1741,9 @@ FIXTURE_SETUP(iommufd_dirty_tracking) self->bitmap_size = variant->buffer_size / self->page_size / BITS_PER_BYTE; - /* Provision with an extra (MOCK_PAGE_SIZE) for the unaligned case */ + /* Provision with an extra (PAGE_SIZE) for the unaligned case */ rc = posix_memalign(&self->bitmap, PAGE_SIZE, - self->bitmap_size + MOCK_PAGE_SIZE); + self->bitmap_size + PAGE_SIZE); assert(!rc); assert(self->bitmap); assert((uintptr_t)self->bitmap % PAGE_SIZE == 0); @@ -1873,6 +1873,13 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) self->bitmap + MOCK_PAGE_SIZE, self->bitmap_size, 0, _metadata); + /* u64 unaligned bitmap */ + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap + 0xff1, + self->bitmap_size, 0, _metadata); + + test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); } @@ -1907,6 +1914,14 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, _metadata); + /* u64 unaligned bitmap */ + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap + 0xff1, + self->bitmap_size, + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, + _metadata); + test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); } -- Gitee From 41886080c63e4e350605ca9b872aa6e8a1321099 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:10 +0000 Subject: [PATCH 069/219] iommufd/iova_bitmap: Handle recording beyond the mapped pages ANBZ: #13617 commit 2780025e01e2e1c92f83ee7da91d9727c2e58a3e upstream. IOVA bitmap is a zero-copy scheme of recording dirty bits that iterate the different bitmap user pages at chunks of a maximum of PAGE_SIZE/sizeof(struct page*) pages. When the iterations are split up into 64G, the end of the range may be broken up in a way that's aligned with a non base page PTE size. This leads to only part of the huge page being recorded in the bitmap. Note that in pratice this is only a problem for IOMMU dirty tracking i.e. when the backing PTEs are in IOMMU hugepages and the bitmap is in base page granularity. So far this not something that affects VF dirty trackers (which reports and records at the same granularity). To fix that, if there is a remainder of bits left to set in which the current IOVA bitmap doesn't cover, make a copy of the bitmap structure and iterate-and-set the rest of the bits remaining. Finally, when advancing the iterator, skip all the bits that were set ahead. Link: https://lore.kernel.org/r/20240202133415.23819-5-joao.m.martins@oracle.com Reported-by: Avihai Horon Fixes: f35f22cc760e ("iommu/vt-d: Access/Dirty bit support for SS domains") Fixes: 421a511a293f ("iommu/amd: Access/Dirty bit support in IOPTEs") Signed-off-by: Joao Martins Tested-by: Avihai Horon Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/iova_bitmap.c | 43 +++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index e9b08e16711f..89e6e11157f0 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -113,6 +113,9 @@ struct iova_bitmap { /* length of the IOVA range for the whole bitmap */ size_t length; + + /* length of the IOVA range set ahead the pinned pages */ + unsigned long set_ahead_length; }; /* @@ -342,6 +345,32 @@ static bool iova_bitmap_done(struct iova_bitmap *bitmap) return bitmap->mapped_base_index >= bitmap->mapped_total_index; } +static int iova_bitmap_set_ahead(struct iova_bitmap *bitmap, + size_t set_ahead_length) +{ + int ret = 0; + + while (set_ahead_length > 0 && !iova_bitmap_done(bitmap)) { + unsigned long length = iova_bitmap_mapped_length(bitmap); + unsigned long iova = iova_bitmap_mapped_iova(bitmap); + + ret = iova_bitmap_get(bitmap); + if (ret) + break; + + length = min(length, set_ahead_length); + iova_bitmap_set(bitmap, iova, length); + + set_ahead_length -= length; + bitmap->mapped_base_index += + iova_bitmap_offset_to_index(bitmap, length - 1) + 1; + iova_bitmap_put(bitmap); + } + + bitmap->set_ahead_length = 0; + return ret; +} + /* * Advances to the next range, releases the current pinned * pages and pins the next set of bitmap pages. @@ -358,6 +387,15 @@ static int iova_bitmap_advance(struct iova_bitmap *bitmap) if (iova_bitmap_done(bitmap)) return 0; + /* Iterate, set and skip any bits requested for next iteration */ + if (bitmap->set_ahead_length) { + int ret; + + ret = iova_bitmap_set_ahead(bitmap, bitmap->set_ahead_length); + if (ret) + return ret; + } + /* When advancing the index we pin the next set of bitmap pages */ return iova_bitmap_get(bitmap); } @@ -427,5 +465,10 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, kunmap_local(kaddr); cur_bit += nbits; } while (cur_bit <= last_bit); + + if (unlikely(cur_bit <= last_bit)) { + bitmap->set_ahead_length = + ((last_bit - cur_bit + 1) << bitmap->mapped.pgshift); + } } EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD); -- Gitee From 909aa7c1aed365efa365d181a3569d3876e73779 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:11 +0000 Subject: [PATCH 070/219] iommufd/selftest: Refactor dirty bitmap tests ANBZ: #13617 commit 407fc184f0e0bfde61026d6ce3fb1e70a15159a3 upstream. Rework the functions that test and set the bitmaps to receive a new parameter (the pte_page_size) that reflects the expected PTE size in the page tables. The same scheme is still used i.e. even bits are dirty and odd page indexes aren't dirty. Here it just refactors to consider the size of the PTE rather than hardcoded to IOMMU mock base page assumptions. While at it, refactor dirty bitmap tests to use the idev_id created by the fixture instead of creating a new one. This is in preparation for doing tests with IOMMU hugepages where multiple bits set as part of recording a whole hugepage as dirty and thus the pte_page_size will vary depending on io hugepages or io base pages. Link: https://lore.kernel.org/r/20240202133415.23819-6-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- tools/testing/selftests/iommu/iommufd.c | 28 ++++++------- tools/testing/selftests/iommu/iommufd_utils.h | 39 ++++++++++++------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index bf1b979cf28d..6a06135ba17b 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1849,7 +1849,7 @@ TEST_F(iommufd_dirty_tracking, device_dirty_capability) TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) { - uint32_t stddev_id; + uint32_t page_size = MOCK_PAGE_SIZE; uint32_t hwpt_id; uint32_t ioas_id; @@ -1859,34 +1859,31 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) test_cmd_hwpt_alloc(self->idev_id, ioas_id, IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); - test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); test_cmd_set_dirty_tracking(hwpt_id, true); test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, + MOCK_APERTURE_START, self->page_size, page_size, self->bitmap, self->bitmap_size, 0, _metadata); /* PAGE_SIZE unaligned bitmap */ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, + MOCK_APERTURE_START, self->page_size, page_size, self->bitmap + MOCK_PAGE_SIZE, self->bitmap_size, 0, _metadata); /* u64 unaligned bitmap */ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, - self->bitmap + 0xff1, - self->bitmap_size, 0, _metadata); - + MOCK_APERTURE_START, self->page_size, page_size, + self->bitmap + 0xff1, self->bitmap_size, 0, + _metadata); - test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); } TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) { - uint32_t stddev_id; + uint32_t page_size = MOCK_PAGE_SIZE; uint32_t hwpt_id; uint32_t ioas_id; @@ -1896,19 +1893,18 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) test_cmd_hwpt_alloc(self->idev_id, ioas_id, IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); - test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); test_cmd_set_dirty_tracking(hwpt_id, true); test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, + MOCK_APERTURE_START, self->page_size, page_size, self->bitmap, self->bitmap_size, IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, _metadata); /* Unaligned bitmap */ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, + MOCK_APERTURE_START, self->page_size, page_size, self->bitmap + MOCK_PAGE_SIZE, self->bitmap_size, IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, @@ -1916,13 +1912,11 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) /* u64 unaligned bitmap */ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, - self->bitmap + 0xff1, - self->bitmap_size, + MOCK_APERTURE_START, self->page_size, page_size, + self->bitmap + 0xff1, self->bitmap_size, IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, _metadata); - test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); } diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index c646264aa41f..8d2b46b2114d 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -344,16 +344,19 @@ static int _test_cmd_mock_domain_set_dirty(int fd, __u32 hwpt_id, size_t length, page_size, bitmap, nr)) static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, - __u64 iova, size_t page_size, __u64 *bitmap, + __u64 iova, size_t page_size, + size_t pte_page_size, __u64 *bitmap, __u64 bitmap_size, __u32 flags, struct __test_metadata *_metadata) { - unsigned long i, nbits = bitmap_size * BITS_PER_BYTE; - unsigned long nr = nbits / 2; + unsigned long npte = pte_page_size / page_size, pteset = 2 * npte; + unsigned long nbits = bitmap_size * BITS_PER_BYTE; + unsigned long j, i, nr = nbits / pteset ?: 1; __u64 out_dirty = 0; /* Mark all even bits as dirty in the mock domain */ - for (i = 0; i < nbits; i += 2) + memset(bitmap, 0, bitmap_size); + for (i = 0; i < nbits; i += pteset) set_bit(i, (unsigned long *)bitmap); test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, page_size, @@ -365,8 +368,12 @@ static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap, flags); /* Beware ASSERT_EQ() is two statements -- braces are not redundant! */ - for (i = 0; i < nbits; i++) { - ASSERT_EQ(!(i % 2), test_bit(i, (unsigned long *)bitmap)); + for (i = 0; i < nbits; i += pteset) { + for (j = 0; j < pteset; j++) { + ASSERT_EQ(j < npte, + test_bit(i + j, (unsigned long *)bitmap)); + } + ASSERT_EQ(!(i % pteset), test_bit(i, (unsigned long *)bitmap)); } memset(bitmap, 0, bitmap_size); @@ -374,19 +381,23 @@ static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, flags); /* It as read already -- expect all zeroes */ - for (i = 0; i < nbits; i++) { - ASSERT_EQ(!(i % 2) && (flags & - IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR), - test_bit(i, (unsigned long *)bitmap)); + for (i = 0; i < nbits; i += pteset) { + for (j = 0; j < pteset; j++) { + ASSERT_EQ( + (j < npte) && + (flags & + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR), + test_bit(i + j, (unsigned long *)bitmap)); + } } return 0; } -#define test_mock_dirty_bitmaps(hwpt_id, length, iova, page_size, bitmap, \ - bitmap_size, flags, _metadata) \ +#define test_mock_dirty_bitmaps(hwpt_id, length, iova, page_size, pte_size,\ + bitmap, bitmap_size, flags, _metadata) \ ASSERT_EQ(0, _test_mock_dirty_bitmaps(self->fd, hwpt_id, length, iova, \ - page_size, bitmap, bitmap_size, \ - flags, _metadata)) + page_size, pte_size, bitmap, \ + bitmap_size, flags, _metadata)) static int _test_cmd_create_access(int fd, unsigned int ioas_id, __u32 *access_id, unsigned int flags) -- Gitee From ed1d5cdbe104f21fb6c7f485a0f0df73f48b8d0d Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:12 +0000 Subject: [PATCH 071/219] iommufd/selftest: Refactor mock_domain_read_and_clear_dirty() ANBZ: #13617 commit 02a8c61a8b06a4a082b58c3e643b28036c6be60f upstream. Move the clearing of the dirty bit of the mock domain into mock_domain_test_and_clear_dirty() helper, simplifying the caller function. Additionally, rework the mock_domain_read_and_clear_dirty() loop to iterate over a potentially variable IO page size. No functional change intended with the loop refactor. This is in preparation for dirty tracking support for IOMMU hugepage mock domains. Link: https://lore.kernel.org/r/20240202133415.23819-7-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/selftest.c | 64 ++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index dd52d28d1423..01c7c132e975 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -206,6 +206,34 @@ static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, return 0; } +static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock, + unsigned long iova, size_t page_size, + unsigned long flags) +{ + unsigned long cur, end = iova + page_size - 1; + bool dirty = false; + void *ent, *old; + + for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) { + ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); + if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) + continue; + + dirty = true; + /* Clear dirty */ + if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { + unsigned long val; + + val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; + old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE, + xa_mk_value(val), GFP_KERNEL); + WARN_ON_ONCE(ent != old); + } + } + + return dirty; +} + static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, unsigned long iova, size_t size, unsigned long flags, @@ -213,31 +241,29 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, { struct mock_iommu_domain *mock = container_of(domain, struct mock_iommu_domain, domain); - unsigned long i, max = size / MOCK_IO_PAGE_SIZE; - void *ent, *old; + unsigned long end = iova + size; + void *ent; if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap) return -EINVAL; - for (i = 0; i < max; i++) { - unsigned long cur = iova + i * MOCK_IO_PAGE_SIZE; + do { + unsigned long pgsize = MOCK_IO_PAGE_SIZE; + unsigned long head; - ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); - if (ent && (xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) { - /* Clear dirty */ - if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { - unsigned long val; - - val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, - cur / MOCK_IO_PAGE_SIZE, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); - } - iommu_dirty_bitmap_record(dirty, cur, - MOCK_IO_PAGE_SIZE); + ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + if (!ent) { + iova += pgsize; + continue; } - } + + head = iova & ~(pgsize - 1); + + /* Clear dirty */ + if (mock_test_and_clear_dirty(mock, head, pgsize, flags)) + iommu_dirty_bitmap_record(dirty, head, pgsize); + iova = head + pgsize; + } while (iova < end); return 0; } -- Gitee From d4ce1da6d232b4940dfb47658ce1648ef85f75d5 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:13 +0000 Subject: [PATCH 072/219] iommufd/selftest: Hugepage mock domain support ANBZ: #13617 commit 7db521e23fe9e36855b61b01a67291281118570e upstream. Add support to mock iommu hugepages of 1M (for a 2K mock io page size). To avoid breaking test suite defaults, the way this is done is by explicitly creating a iommu mock device which has hugepage support (i.e. through MOCK_FLAGS_DEVICE_HUGE_IOVA). The same scheme is maintained of mock base page index tracking in the XArray, except that an extra bit is added to mark it as a hugepage. One subpage containing the dirty bit, means that the whole hugepage is dirty (similar to AMD IOMMU non-standard page sizes). For clearing, same thing applies, and it must clear all dirty subpages. This is in preparation for dirty tracking to mark mock hugepages as dirty to exercise all the iova-bitmap fixes. Link: https://lore.kernel.org/r/20240202133415.23819-8-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/iommufd_test.h | 1 + drivers/iommu/iommufd/selftest.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 482d4059f5db..e854d3f67205 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -45,6 +45,7 @@ enum { enum { MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, + MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1, }; enum { diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 01c7c132e975..f94bc0b0f341 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -41,6 +41,7 @@ static atomic_t mock_dev_num; enum { MOCK_DIRTY_TRACK = 1, MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, + MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE, /* * Like a real page table alignment requires the low bits of the address @@ -53,6 +54,7 @@ enum { MOCK_PFN_START_IOVA = _MOCK_PFN_START, MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1, + MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2, }; /* @@ -257,6 +259,8 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, continue; } + if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA) + pgsize = MOCK_HUGE_PAGE_SIZE; head = iova & ~(pgsize - 1); /* Clear dirty */ @@ -275,6 +279,7 @@ const struct iommu_dirty_ops dirty_ops = { static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) { + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); struct mock_iommu_domain *mock; mock = kzalloc(sizeof(*mock), GFP_KERNEL); @@ -283,6 +288,8 @@ static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) mock->domain.geometry.aperture_start = MOCK_APERTURE_START; mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; + if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA) + mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE; mock->domain.ops = mock_ops.default_domain_ops; mock->domain.type = IOMMU_DOMAIN_UNMANAGED; xa_init(&mock->pfns); @@ -328,7 +335,7 @@ mock_domain_alloc_user(struct device *dev, u32 flags, return ERR_PTR(-EOPNOTSUPP); if (user_data || (has_dirty_flag && no_dirty_ops)) return ERR_PTR(-EOPNOTSUPP); - domain = mock_domain_alloc_paging(NULL); + domain = mock_domain_alloc_paging(dev); if (!domain) return ERR_PTR(-ENOMEM); if (has_dirty_flag) @@ -391,6 +398,9 @@ static int mock_domain_map_pages(struct iommu_domain *domain, if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) flags = MOCK_PFN_LAST_IOVA; + if (pgsize != MOCK_IO_PAGE_SIZE) { + flags |= MOCK_PFN_HUGE_IOVA; + } old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE, xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) | flags), @@ -645,7 +655,8 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) struct mock_dev *mdev; int rc; - if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY)) + if (dev_flags & + ~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA)) return ERR_PTR(-EINVAL); mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); -- Gitee From e401764f6bfb37df6ebafc3da96ff3a6567cca8c Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:14 +0000 Subject: [PATCH 073/219] iommufd/selftest: Add mock IO hugepages tests ANBZ: #13617 commit fe13166f0562117c42fc60a109e664a914523e63 upstream. Leverage previously added MOCK_FLAGS_DEVICE_HUGE_IOVA flag to create an IOMMU domain with more than MOCK_IO_PAGE_SIZE supported. Plumb the hugetlb backing memory for buffer allocation and change the expected page size to MOCK_HUGE_PAGE_SIZE (1M) when hugepage variant test cases are used. These so far are limited to 128M and 256M IOVA range tests cases which is when 1M hugepages can be used. Link: https://lore.kernel.org/r/20240202133415.23819-9-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- tools/testing/selftests/iommu/iommufd.c | 45 +++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 6a06135ba17b..c8d0f13c91dd 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -12,6 +12,7 @@ static unsigned long HUGEPAGE_SIZE; #define MOCK_PAGE_SIZE (PAGE_SIZE / 2) +#define MOCK_HUGE_PAGE_SIZE (512 * MOCK_PAGE_SIZE) static unsigned long get_huge_page_size(void) { @@ -1716,10 +1717,12 @@ FIXTURE(iommufd_dirty_tracking) FIXTURE_VARIANT(iommufd_dirty_tracking) { unsigned long buffer_size; + bool hugepages; }; FIXTURE_SETUP(iommufd_dirty_tracking) { + int mmap_flags; void *vrc; int rc; @@ -1732,9 +1735,17 @@ FIXTURE_SETUP(iommufd_dirty_tracking) variant->buffer_size, rc); } + mmap_flags = MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED; + if (variant->hugepages) { + /* + * MAP_POPULATE will cause the kernel to fail mmap if THPs are + * not available. + */ + mmap_flags |= MAP_HUGETLB | MAP_POPULATE; + } assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0); vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + mmap_flags, -1, 0); assert(vrc == self->buffer); self->page_size = MOCK_PAGE_SIZE; @@ -1749,8 +1760,16 @@ FIXTURE_SETUP(iommufd_dirty_tracking) assert((uintptr_t)self->bitmap % PAGE_SIZE == 0); test_ioctl_ioas_alloc(&self->ioas_id); - test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id, - &self->idev_id); + /* Enable 1M mock IOMMU hugepages */ + if (variant->hugepages) { + test_cmd_mock_domain_flags(self->ioas_id, + MOCK_FLAGS_DEVICE_HUGE_IOVA, + &self->stdev_id, &self->hwpt_id, + &self->idev_id); + } else { + test_cmd_mock_domain(self->ioas_id, &self->stdev_id, + &self->hwpt_id, &self->idev_id); + } } FIXTURE_TEARDOWN(iommufd_dirty_tracking) @@ -1784,12 +1803,26 @@ FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M) .buffer_size = 128UL * 1024UL * 1024UL, }; +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M_huge) +{ + /* 4K bitmap (128M IOVA range) */ + .buffer_size = 128UL * 1024UL * 1024UL, + .hugepages = true, +}; + FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M) { /* 8K bitmap (256M IOVA range) */ .buffer_size = 256UL * 1024UL * 1024UL, }; +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M_huge) +{ + /* 8K bitmap (256M IOVA range) */ + .buffer_size = 256UL * 1024UL * 1024UL, + .hugepages = true, +}; + TEST_F(iommufd_dirty_tracking, enforce_dirty) { uint32_t ioas_id, stddev_id, idev_id; @@ -1853,6 +1886,9 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) uint32_t hwpt_id; uint32_t ioas_id; + if (variant->hugepages) + page_size = MOCK_HUGE_PAGE_SIZE; + test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); @@ -1887,6 +1923,9 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) uint32_t hwpt_id; uint32_t ioas_id; + if (variant->hugepages) + page_size = MOCK_HUGE_PAGE_SIZE; + test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); -- Gitee From 20e31533211cebfadf8159c19d7595b6634533b9 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 16 Dec 2023 18:26:58 +0200 Subject: [PATCH 074/219] iommu/msm-iommu: don't limit the driver too much ANBZ: #13617 commit 18368ee25d881981971fde430c7f33cbc27359d1 upstream. In preparation of dropping most of ARCH_QCOM subtypes, stop limiting the driver just to those machines. Allow it to be built for any 32-bit Qualcomm platform (ARCH_QCOM). Acked-by: Robin Murphy Acked-by: Joerg Roedel Signed-off-by: Dmitry Baryshkov Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20231216162700.863456-2-dmitry.baryshkov@linaro.org Signed-off-by: Bjorn Andersson Signed-off-by: Shuai Xue --- drivers/iommu/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 0c1d9ecc7a77..de8da50b2cd9 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -179,7 +179,7 @@ config FSL_PAMU config MSM_IOMMU bool "MSM IOMMU Support" depends on ARM - depends on ARCH_MSM8X60 || ARCH_MSM8960 || COMPILE_TEST + depends on ARCH_QCOM || COMPILE_TEST select IOMMU_API select IOMMU_IO_PGTABLE_ARMV7S help -- Gitee From 22b32ef6101309983388da04eb0364fdca3d9495 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 18 Jan 2024 09:01:00 +0000 Subject: [PATCH 075/219] iommu/amd: Remove unused PPR_* macros ANBZ: #13617 commit be4f5995875353efd5c438375b2a68453f137b6e upstream. Commit 5a0b11a180a ("iommu/amd: Remove iommu_v2 module") missed to remove PPR_* macros. Remove these macros as its not used anymore. No functional change intended. Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Alejandro Jimenez Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240118090105.5864-2-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 31275bc6e9e4..bdfe1405be1c 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -77,10 +77,6 @@ static inline int amd_iommu_create_irq_domain(struct amd_iommu *iommu) } #endif -#define PPR_SUCCESS 0x0 -#define PPR_INVALID 0x1 -#define PPR_FAILURE 0xf - int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, int status, int tag); -- Gitee From a94aaed1f260326a81eb0f04a4067d9a8875b9e1 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 18 Jan 2024 09:01:01 +0000 Subject: [PATCH 076/219] iommu/amd: Remove unused IOVA_* macro ANBZ: #13617 commit a4086637664926b52f5ecd087b3957f935ce48fb upstream. These macros are not used after commit ac6d704679d343 ("iommu/dma: Pass address limit rather than size to iommu_setup_dma_ops()"). No functional change intended. Signed-off-by: Vasant Hegde Cc: Lu Baolu Reviewed-by: Suravee Suthikulpanit Reviewed-by: Alejandro Jimenez Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240118090105.5864-3-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 80c5efe345c1..5425714cf5da 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -45,10 +45,6 @@ #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) -/* IO virtual address start page frame number */ -#define IOVA_START_PFN (1) -#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) - /* Reserved IOVA ranges */ #define MSI_RANGE_START (0xfee00000) #define MSI_RANGE_END (0xfeefffff) -- Gitee From 00fa64ee8536df7ec77f2700bce78b0b53032bca Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 18 Jan 2024 09:01:02 +0000 Subject: [PATCH 077/219] iommu/amd: Remove unused APERTURE_* macros ANBZ: #13617 commit 2edf056f57f59cf35980786f1ad4f681b9a5bebf upstream. These macros are not used after commit 518d9b450387 ("iommu/amd: Remove special mapping code for dma_ops path"). No functional change intended. Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Alejandro Jimenez Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240118090105.5864-4-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu_types.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 809d74faa1a5..fdf7807280bd 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -513,14 +513,6 @@ extern struct kmem_cache *amd_iommu_irq_cache; #define for_each_iommu_safe(iommu, next) \ list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list) -#define APERTURE_RANGE_SHIFT 27 /* 128 MB */ -#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT) -#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT) -#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */ -#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT) -#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL) - - struct amd_iommu; struct iommu_domain; struct irq_domain; -- Gitee From 42da0ca095d6af07c8cb62a3634a0d26023c916b Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 18 Jan 2024 09:01:03 +0000 Subject: [PATCH 078/219] iommu/amd: Remove duplicate function declarations from amd_iommu.h ANBZ: #13617 commit 773b05e7f407c377d64a9173ceed54489cc47c3a upstream. Perf counter related functions are defined in amd-iommu.h as well. Hence remove duplicate declarations. No functional change intended. Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Alejandro Jimenez Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240118090105.5864-5-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index bdfe1405be1c..4e1f9a444a1a 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -39,14 +39,6 @@ extern enum io_pgtable_fmt amd_iommu_pgtable; extern int amd_iommu_gpt_level; bool amd_iommu_v2_supported(void); -struct amd_iommu *get_amd_iommu(unsigned int idx); -u8 amd_iommu_pc_get_max_banks(unsigned int idx); -bool amd_iommu_pc_supported(void); -u8 amd_iommu_pc_get_max_counters(unsigned int idx); -int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, - u8 fxn, u64 *value); -int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, - u8 fxn, u64 *value); /* Device capabilities */ int amd_iommu_pdev_enable_cap_pri(struct pci_dev *pdev); -- Gitee From 5832e521b9f23d60abce0215cfa446abd2e7939a Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 18 Jan 2024 09:01:04 +0000 Subject: [PATCH 079/219] iommu/amd: Remove redundant error check in amd_iommu_probe_device() ANBZ: #13617 commit 2dc9506bfb135edb684eb4f0bf17ea847c18b2a0 upstream. iommu_init_device() is not returning -ENOTSUPP since commit 61289cbaf6c8 ("iommu/amd: Remove old alias handling code"). No functional change intended. Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Alejandro Jimenez Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240118090105.5864-6-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 5425714cf5da..281dfe134dbf 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1943,8 +1943,7 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev) ret = iommu_init_device(iommu, dev); if (ret) { - if (ret != -ENOTSUPP) - dev_err(dev, "Failed to initialize - trying to proceed anyway\n"); + dev_err(dev, "Failed to initialize - trying to proceed anyway\n"); iommu_dev = ERR_PTR(ret); iommu_ignore_device(iommu, dev); } else { -- Gitee From 297d6dd6a4105276285f03a7ac7dc205632e2d3d Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 18 Jan 2024 09:01:05 +0000 Subject: [PATCH 080/219] iommu/amd: Remove EXPORT_SYMBOL for perf counter related functions ANBZ: #13617 commit 108042db53a1eb192b972098669f36c6379ef159 upstream. .. as IOMMU perf counters are always built as part of kernel. No functional change intended. Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Alejandro Jimenez Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240118090105.5864-7-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/init.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 2f7e1cca7d34..226d641ddbf0 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3722,13 +3722,11 @@ u8 amd_iommu_pc_get_max_banks(unsigned int idx) return 0; } -EXPORT_SYMBOL(amd_iommu_pc_get_max_banks); bool amd_iommu_pc_supported(void) { return amd_iommu_pc_present; } -EXPORT_SYMBOL(amd_iommu_pc_supported); u8 amd_iommu_pc_get_max_counters(unsigned int idx) { @@ -3739,7 +3737,6 @@ u8 amd_iommu_pc_get_max_counters(unsigned int idx) return 0; } -EXPORT_SYMBOL(amd_iommu_pc_get_max_counters); static int iommu_pc_get_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value, bool is_write) -- Gitee From 0ec13a8b94c2a8e428dabbadc37eae660a207c7e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 5 Feb 2024 15:32:39 +0000 Subject: [PATCH 081/219] iommu/iova: Tidy up iova_cache_get() failure ANBZ: #13617 commit e7b3533c81386464dfdcb01193075f8a9557083a upstream. Failure handling in iova_cache_get() is a little messy, and we'd like to add some more to it, so let's tidy up a bit first. By leaving the hotplug handler until last we can take advantage of kmem_cache_destroy() being NULL-safe to have a single cleanup label. We can also improve the error reporting, noting that kmem_cache_create() already screams if it fails, so that one is redundant. Signed-off-by: Robin Murphy Acked-by: David Rientjes Reviewed-by: Pasha Tatashin Reviewed-by: John Garry Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/ae4a3bda2d6a9b738221553c838d30473bd624e7.1707144953.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iova.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index d30e453d0fb4..cf95001d85c0 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -254,26 +254,20 @@ static void free_iova_mem(struct iova *iova) int iova_cache_get(void) { + int err = -ENOMEM; + mutex_lock(&iova_cache_mutex); if (!iova_cache_users) { - int ret; - - ret = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", NULL, - iova_cpuhp_dead); - if (ret) { - mutex_unlock(&iova_cache_mutex); - pr_err("Couldn't register cpuhp handler\n"); - return ret; - } + iova_cache = kmem_cache_create("iommu_iova", sizeof(struct iova), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!iova_cache) + goto out_err; - iova_cache = kmem_cache_create( - "iommu_iova", sizeof(struct iova), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!iova_cache) { - cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD); - mutex_unlock(&iova_cache_mutex); - pr_err("Couldn't create iova cache\n"); - return -ENOMEM; + err = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", + NULL, iova_cpuhp_dead); + if (err) { + pr_err("IOVA: Couldn't register cpuhp handler: %pe\n", ERR_PTR(err)); + goto out_err; } } @@ -281,6 +275,11 @@ int iova_cache_get(void) mutex_unlock(&iova_cache_mutex); return 0; + +out_err: + kmem_cache_destroy(iova_cache); + mutex_unlock(&iova_cache_mutex); + return err; } EXPORT_SYMBOL_GPL(iova_cache_get); -- Gitee From e7bf51f57dc0b398b68af5a0a48096f806f79038 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 5 Feb 2024 15:32:40 +0000 Subject: [PATCH 082/219] iommu/iova: Reorganise some code ANBZ: #13617 commit 7f845d8b2eed0986a03a777d4956b52a57007974 upstream. The iova_cache_{get,put}() calls really represent top-level lifecycle management for the whole IOVA library, so it's long been rather confusing to have them buried right in the middle of the allocator implementation details. Move them to a more expected position at the end of the file, where it will then also be easier to expand them. With this, we can also move the rcache hotplug handler (plus another stray function) into the rcache portion of the file. Signed-off-by: Robin Murphy Acked-by: David Rientjes Reviewed-by: Pasha Tatashin Reviewed-by: John Garry Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/d4753562f4faa0e6b3aeebcbf88fdb60cc22d715.1707144953.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iova.c | 128 +++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index cf95001d85c0..b5de865ee50b 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -24,24 +24,8 @@ static bool iova_rcache_insert(struct iova_domain *iovad, static unsigned long iova_rcache_get(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn); -static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); static void free_iova_rcaches(struct iova_domain *iovad); - -unsigned long iova_rcache_range(void) -{ - return PAGE_SIZE << (IOVA_RANGE_CACHE_MAX_SIZE - 1); -} - -static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node) -{ - struct iova_domain *iovad; - - iovad = hlist_entry_safe(node, struct iova_domain, cpuhp_dead); - - free_cpu_cached_iovas(cpu, iovad); - return 0; -} - +static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); static void free_global_cached_iovas(struct iova_domain *iovad); static struct iova *to_iova(struct rb_node *node) @@ -252,53 +236,6 @@ static void free_iova_mem(struct iova *iova) kmem_cache_free(iova_cache, iova); } -int iova_cache_get(void) -{ - int err = -ENOMEM; - - mutex_lock(&iova_cache_mutex); - if (!iova_cache_users) { - iova_cache = kmem_cache_create("iommu_iova", sizeof(struct iova), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!iova_cache) - goto out_err; - - err = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", - NULL, iova_cpuhp_dead); - if (err) { - pr_err("IOVA: Couldn't register cpuhp handler: %pe\n", ERR_PTR(err)); - goto out_err; - } - } - - iova_cache_users++; - mutex_unlock(&iova_cache_mutex); - - return 0; - -out_err: - kmem_cache_destroy(iova_cache); - mutex_unlock(&iova_cache_mutex); - return err; -} -EXPORT_SYMBOL_GPL(iova_cache_get); - -void iova_cache_put(void) -{ - mutex_lock(&iova_cache_mutex); - if (WARN_ON(!iova_cache_users)) { - mutex_unlock(&iova_cache_mutex); - return; - } - iova_cache_users--; - if (!iova_cache_users) { - cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD); - kmem_cache_destroy(iova_cache); - } - mutex_unlock(&iova_cache_mutex); -} -EXPORT_SYMBOL_GPL(iova_cache_put); - /** * alloc_iova - allocates an iova * @iovad: - iova domain in question @@ -653,6 +590,11 @@ struct iova_rcache { struct delayed_work work; }; +unsigned long iova_rcache_range(void) +{ + return PAGE_SIZE << (IOVA_RANGE_CACHE_MAX_SIZE - 1); +} + static struct iova_magazine *iova_magazine_alloc(gfp_t flags) { struct iova_magazine *mag; @@ -989,5 +931,63 @@ static void free_global_cached_iovas(struct iova_domain *iovad) spin_unlock_irqrestore(&rcache->lock, flags); } } + +static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node) +{ + struct iova_domain *iovad; + + iovad = hlist_entry_safe(node, struct iova_domain, cpuhp_dead); + + free_cpu_cached_iovas(cpu, iovad); + return 0; +} + +int iova_cache_get(void) +{ + int err = -ENOMEM; + + mutex_lock(&iova_cache_mutex); + if (!iova_cache_users) { + iova_cache = kmem_cache_create("iommu_iova", sizeof(struct iova), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!iova_cache) + goto out_err; + + err = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", + NULL, iova_cpuhp_dead); + if (err) { + pr_err("IOVA: Couldn't register cpuhp handler: %pe\n", ERR_PTR(err)); + goto out_err; + } + } + + iova_cache_users++; + mutex_unlock(&iova_cache_mutex); + + return 0; + +out_err: + kmem_cache_destroy(iova_cache); + mutex_unlock(&iova_cache_mutex); + return err; +} +EXPORT_SYMBOL_GPL(iova_cache_get); + +void iova_cache_put(void) +{ + mutex_lock(&iova_cache_mutex); + if (WARN_ON(!iova_cache_users)) { + mutex_unlock(&iova_cache_mutex); + return; + } + iova_cache_users--; + if (!iova_cache_users) { + cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD); + kmem_cache_destroy(iova_cache); + } + mutex_unlock(&iova_cache_mutex); +} +EXPORT_SYMBOL_GPL(iova_cache_put); + MODULE_AUTHOR("Anil S Keshavamurthy "); MODULE_LICENSE("GPL"); -- Gitee From 82454e76653bebbb19c9e105be94a678003c2904 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Mon, 5 Feb 2024 15:32:41 +0000 Subject: [PATCH 083/219] iommu/iova: use named kmem_cache for iova magazines ANBZ: #13617 commit 84e6f56be9c68b59aca51544f7ee33706542d7bc upstream. The magazine buffers can take gigabytes of kmem memory, dominating all other allocations. For observability purpose create named slab cache so the iova magazine memory overhead can be clearly observed. With this change: > slabtop -o | head Active / Total Objects (% used) : 869731 / 952904 (91.3%) Active / Total Slabs (% used) : 103411 / 103974 (99.5%) Active / Total Caches (% used) : 135 / 211 (64.0%) Active / Total Size (% used) : 395389.68K / 411430.20K (96.1%) Minimum / Average / Maximum Object : 0.02K / 0.43K / 8.00K OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME 244412 244239 99% 1.00K 61103 4 244412K iommu_iova_magazine 91636 88343 96% 0.03K 739 124 2956K kmalloc-32 75744 74844 98% 0.12K 2367 32 9468K kernfs_node_cache On this machine it is now clear that magazine use 242M of kmem memory. Acked-by: David Rientjes Signed-off-by: Pasha Tatashin [ rm: adjust to rework of iova_cache_{get,put} ] Signed-off-by: Robin Murphy Reviewed-by: John Garry Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/dc5c51aaba50906a92b9ba1a5137ed462484a7be.1707144953.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iova.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index b5de865ee50b..d59d0ea2fd21 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -590,6 +590,8 @@ struct iova_rcache { struct delayed_work work; }; +static struct kmem_cache *iova_magazine_cache; + unsigned long iova_rcache_range(void) { return PAGE_SIZE << (IOVA_RANGE_CACHE_MAX_SIZE - 1); @@ -599,7 +601,7 @@ static struct iova_magazine *iova_magazine_alloc(gfp_t flags) { struct iova_magazine *mag; - mag = kmalloc(sizeof(*mag), flags); + mag = kmem_cache_alloc(iova_magazine_cache, flags); if (mag) mag->size = 0; @@ -608,7 +610,7 @@ static struct iova_magazine *iova_magazine_alloc(gfp_t flags) static void iova_magazine_free(struct iova_magazine *mag) { - kfree(mag); + kmem_cache_free(iova_magazine_cache, mag); } static void @@ -953,6 +955,12 @@ int iova_cache_get(void) if (!iova_cache) goto out_err; + iova_magazine_cache = kmem_cache_create("iommu_iova_magazine", + sizeof(struct iova_magazine), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!iova_magazine_cache) + goto out_err; + err = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", NULL, iova_cpuhp_dead); if (err) { @@ -968,6 +976,7 @@ int iova_cache_get(void) out_err: kmem_cache_destroy(iova_cache); + kmem_cache_destroy(iova_magazine_cache); mutex_unlock(&iova_cache_mutex); return err; } @@ -984,6 +993,7 @@ void iova_cache_put(void) if (!iova_cache_users) { cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD); kmem_cache_destroy(iova_cache); + kmem_cache_destroy(iova_magazine_cache); } mutex_unlock(&iova_cache_mutex); } -- Gitee From 5708bc316901ceb2ffb4f4358a4e9ad44b2f2724 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 5 Feb 2024 16:43:27 +0000 Subject: [PATCH 084/219] iommu/ipmmu-vmsa: Minor cleanups ANBZ: #13617 commit f2d6677ad5772d4972b11dc3a4c3e555973b27d9 upstream. Remove the of_match_ptr() which was supposed to have gone long ago, but managed to got lost in a fix-squashing mishap. On a similar theme, we may as well also modernise the PM ops to get rid of the clunky #ifdefs, and modernise the resource mapping to keep the checkpatch brigade happy. Link: https://lore.kernel.org/linux-iommu/Yxni3d6CdI3FZ5D+@8bytes.org/ Signed-off-by: Robin Murphy Reviewed-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/791877b0d310dc2ab7dc616d2786ab24252b9b8e.1707151207.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/ipmmu-vmsa.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index cd7219319c8b..b657cc09605f 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -1005,7 +1005,6 @@ static const struct of_device_id ipmmu_of_ids[] = { static int ipmmu_probe(struct platform_device *pdev) { struct ipmmu_vmsa_device *mmu; - struct resource *res; int irq; int ret; @@ -1025,8 +1024,7 @@ static int ipmmu_probe(struct platform_device *pdev) return ret; /* Map I/O memory and request IRQ. */ - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - mmu->base = devm_ioremap_resource(&pdev->dev, res); + mmu->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(mmu->base)) return PTR_ERR(mmu->base); @@ -1123,7 +1121,6 @@ static void ipmmu_remove(struct platform_device *pdev) ipmmu_device_reset(mmu); } -#ifdef CONFIG_PM_SLEEP static int ipmmu_resume_noirq(struct device *dev) { struct ipmmu_vmsa_device *mmu = dev_get_drvdata(dev); @@ -1153,18 +1150,14 @@ static int ipmmu_resume_noirq(struct device *dev) } static const struct dev_pm_ops ipmmu_pm = { - SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(NULL, ipmmu_resume_noirq) + NOIRQ_SYSTEM_SLEEP_PM_OPS(NULL, ipmmu_resume_noirq) }; -#define DEV_PM_OPS &ipmmu_pm -#else -#define DEV_PM_OPS NULL -#endif /* CONFIG_PM_SLEEP */ static struct platform_driver ipmmu_driver = { .driver = { .name = "ipmmu-vmsa", - .of_match_table = of_match_ptr(ipmmu_of_ids), - .pm = DEV_PM_OPS, + .of_match_table = ipmmu_of_ids, + .pm = pm_sleep_ptr(&ipmmu_pm), }, .probe = ipmmu_probe, .remove_new = ipmmu_remove, -- Gitee From 1d3763bc59234e8a8f1e6e03a8838520eb339458 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 5 Feb 2024 11:55:59 +0000 Subject: [PATCH 085/219] iommu/amd: Pass struct iommu_dev_data to set_dte_entry() ANBZ: #13617 commit a6ffb9b3d71ea21c9d56ac5856fe321ef584bb19 upstream. Pass iommu_dev_data structure instead of passing indivisual variables. No functional changes intended. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-2-vasant.hegde@amd.com Signed-off-by: Joerg Roedel [ Shuai Xue: Conflicts: drivers/iommu/amd/iommu.c commit 606309e30626 introduces get_amd_iommu_from_dev_data() ] Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 281dfe134dbf..03ff0bba484c 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1700,12 +1700,14 @@ static int setup_gcr3_table(struct protection_domain *domain, int pasids) return 0; } -static void set_dte_entry(struct amd_iommu *iommu, u16 devid, - struct protection_domain *domain, bool ats, bool ppr) +static void set_dte_entry(struct amd_iommu *iommu, + struct iommu_dev_data *dev_data) { u64 pte_root = 0; u64 flags = 0; u32 old_domid; + u16 devid = dev_data->devid; + struct protection_domain *domain = dev_data->domain; struct dev_table_entry *dev_table = get_dev_table(iommu); if (domain->iop.mode != PAGE_MODE_NONE) @@ -1725,10 +1727,10 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid, flags = dev_table[devid].data[1]; - if (ats) + if (dev_data->ats_enabled) flags |= DTE_FLAG_IOTLB; - if (ppr) + if (dev_data->ppr) pte_root |= 1ULL << DEV_ENTRY_PPR; if (domain->dirty_tracking) @@ -1804,9 +1806,6 @@ static void do_attach(struct iommu_dev_data *dev_data, struct protection_domain *domain) { struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - bool ats; - - ats = dev_data->ats_enabled; /* Update data structures */ dev_data->domain = domain; @@ -1821,8 +1820,7 @@ static void do_attach(struct iommu_dev_data *dev_data, domain->dev_cnt += 1; /* Update device table */ - set_dte_entry(iommu, dev_data->devid, domain, - ats, dev_data->ppr); + set_dte_entry(iommu, dev_data); clone_aliases(iommu, dev_data->dev); device_flush_dte(dev_data); @@ -1999,8 +1997,7 @@ static void update_device_table(struct protection_domain *domain) list_for_each_entry(dev_data, &domain->dev_list, list) { struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - set_dte_entry(iommu, dev_data->devid, domain, - dev_data->ats_enabled, dev_data->ppr); + set_dte_entry(iommu, dev_data); clone_aliases(iommu, dev_data->dev); } } -- Gitee From 40269f253f8cee965ce8d627b4133e628c73e692 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 5 Feb 2024 11:56:00 +0000 Subject: [PATCH 086/219] iommu/amd: Enable Guest Translation before registering devices ANBZ: #13617 commit 8e017973317235468a330aa197134c1aebbf13da upstream. IOMMU Guest Translation (GT) feature needs to be enabled before invalidating guest translations (CMD_INV_IOMMU_PAGES with GN=1). Currently GT feature is enabled after setting up interrupt handler. So far it was fine as we were not invalidating guest page table before this point. Upcoming series will introduce per device GCR3 table and it will invalidate guest pages after configuring. Hence move GT feature enablement to early_enable_iommu(). Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-3-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 226d641ddbf0..2e98a6b0c02e 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -2781,6 +2781,7 @@ static void early_enable_iommu(struct amd_iommu *iommu) iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); iommu_set_exclusion_range(iommu); + iommu_enable_gt(iommu); iommu_enable_ga(iommu); iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); @@ -2837,6 +2838,7 @@ static void early_enable_iommus(void) iommu_disable_irtcachedis(iommu); iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); + iommu_enable_gt(iommu); iommu_enable_ga(iommu); iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); @@ -2850,10 +2852,8 @@ static void enable_iommus_v2(void) { struct amd_iommu *iommu; - for_each_iommu(iommu) { + for_each_iommu(iommu) iommu_enable_ppr_log(iommu); - iommu_enable_gt(iommu); - } } static void enable_iommus_vapic(void) -- Gitee From 163fdfcfb3343cfbfe0394d2715450608d160732 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 5 Feb 2024 11:56:02 +0000 Subject: [PATCH 087/219] iommu/amd: Introduce struct protection_domain.pd_mode ANBZ: #13617 commit fda5108ebafe7797dc8e1279f04e6e9145c9ad10 upstream. This enum variable is used to track the type of page table used by the protection domain. It will replace the protection_domain.flags in subsequent series. Suggested-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-5-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu_types.h | 6 ++++++ drivers/iommu/amd/iommu.c | 2 ++ 2 files changed, 8 insertions(+) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index fdf7807280bd..d29d4a321e57 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -541,6 +541,11 @@ struct amd_io_pgtable { u64 *pgd; /* v2 pgtable pgd pointer */ }; +enum protection_domain_mode { + PD_MODE_V1 = 1, + PD_MODE_V2, +}; + /* * This structure contains generic data for IOMMU protection domains * independent of their use. @@ -556,6 +561,7 @@ struct protection_domain { int nid; /* Node ID */ u64 *gcr3_tbl; /* Guest CR3 table */ unsigned long flags; /* flags to find out type of domain */ + enum protection_domain_mode pd_mode; /* Track page table type */ bool dirty_tracking; /* dirty tracking is enabled in the domain */ unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 03ff0bba484c..1c902699800d 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2077,6 +2077,7 @@ static int protection_domain_init_v1(struct protection_domain *domain, int mode) return -ENOMEM; } + domain->pd_mode = PD_MODE_V1; amd_iommu_domain_set_pgtable(domain, pt_root, mode); return 0; @@ -2085,6 +2086,7 @@ static int protection_domain_init_v1(struct protection_domain *domain, int mode) static int protection_domain_init_v2(struct protection_domain *domain) { domain->flags |= PD_GIOV_MASK; + domain->pd_mode = PD_MODE_V2; domain->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; -- Gitee From f291ba2b96f49df7b158cbf798416f420ae8d50d Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 5 Feb 2024 11:56:03 +0000 Subject: [PATCH 088/219] iommu/amd: Introduce per-device GCR3 table ANBZ: #13617 commit b7731065523048276a80c8e16120208153438930 upstream. AMD IOMMU GCR3 table is indexed by PASID. Each entry stores guest CR3 register value, which is an address to the root of guest IO page table. The GCR3 table can be programmed per-device. However, Linux AMD IOMMU driver currently managing the table on a per-domain basis. PASID is a device feature. When SVA is enabled it will bind PASID to device, not domain. Hence it makes sense to have per device GCR3 table. Introduce struct iommu_dev_data.gcr3_tbl_info to keep track of GCR3 table configuration. This will eventually replaces gcr3 related variables in protection_domain structure. Suggested-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-6-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu_types.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index d29d4a321e57..aed3e88d23c7 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -533,6 +533,12 @@ struct amd_irte_ops; #define io_pgtable_cfg_to_data(x) \ container_of((x), struct amd_io_pgtable, pgtbl_cfg) +struct gcr3_tbl_info { + u64 *gcr3_tbl; /* Guest CR3 table */ + int glx; /* Number of levels for GCR3 table */ + u32 pasid_cnt; /* Track attached PASIDs */ +}; + struct amd_io_pgtable { struct io_pgtable_cfg pgtbl_cfg; struct io_pgtable iop; @@ -814,6 +820,7 @@ struct iommu_dev_data { struct list_head list; /* For domain->dev_list */ struct llist_node dev_data_list; /* For global dev_data_list */ struct protection_domain *domain; /* Domain the device is bound to */ + struct gcr3_tbl_info gcr3_info; /* Per-device GCR3 table */ struct device *dev; u16 devid; /* PCI Device ID */ -- Gitee From 14c741305f86381e65d113912ca6a62ac2a5388b Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 5 Feb 2024 11:56:04 +0000 Subject: [PATCH 089/219] iommu/amd: Use protection_domain.flags to check page table mode ANBZ: #13617 commit 7b4e5623d8e4f1475b7deac42035d0129fb36ed0 upstream. Page table mode (v1, v2 or pt) is per domain property. Recently we have enhanced protection_domain.pd_mode to track per domain page table mode. Use that variable to check the page table mode instead of global 'amd_iommu_pgtable' in {map/unmap}_pages path. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-7-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 1c902699800d..687512b0a3d8 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2332,7 +2332,7 @@ static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova, int prot = 0; int ret = -EINVAL; - if ((amd_iommu_pgtable == AMD_IOMMU_V1) && + if ((domain->pd_mode == PD_MODE_V1) && (domain->iop.mode == PAGE_MODE_NONE)) return -EINVAL; @@ -2378,7 +2378,7 @@ static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova struct io_pgtable_ops *ops = &domain->iop.iop.ops; size_t r; - if ((amd_iommu_pgtable == AMD_IOMMU_V1) && + if ((domain->pd_mode == PD_MODE_V1) && (domain->iop.mode == PAGE_MODE_NONE)) return 0; -- Gitee From 5a2ca76d90482d39851b1dd551429b3895636563 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 5 Feb 2024 11:56:05 +0000 Subject: [PATCH 090/219] iommu/amd: Add support for device based TLB invalidation ANBZ: #13617 commit 474bf01ed9f0358b243b264339baafcfbe6c6670 upstream. Add support to invalidate TLB/IOTLB for the given device. These functions will be used in subsequent patches where we will introduce per device GCR3 table and SVA support. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-8-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu.h | 5 +++++ drivers/iommu/amd/iommu.c | 23 +++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 4e1f9a444a1a..1f4bbc6bf1e5 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -55,6 +55,11 @@ void amd_iommu_domain_update(struct protection_domain *domain); void amd_iommu_domain_flush_complete(struct protection_domain *domain); void amd_iommu_domain_flush_pages(struct protection_domain *domain, u64 address, size_t size); +void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, + ioasid_t pasid, u64 address, size_t size); +void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data, + ioasid_t pasid); + int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid); int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid, unsigned long cr3); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 687512b0a3d8..53bf2c0087c0 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1547,6 +1547,29 @@ static void amd_iommu_domain_flush_all(struct protection_domain *domain) CMD_INV_IOMMU_ALL_PAGES_ADDRESS); } +void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, + ioasid_t pasid, u64 address, size_t size) +{ + struct iommu_cmd cmd; + struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); + + build_inv_iommu_pages(&cmd, address, size, + dev_data->domain->id, pasid, true); + iommu_queue_command(iommu, &cmd); + + if (dev_data->ats_enabled) + device_flush_iotlb(dev_data, address, size, pasid, true); + + iommu_completion_wait(iommu); +} + +void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data, + ioasid_t pasid) +{ + amd_iommu_dev_flush_pasid_pages(dev_data, 0, + CMD_INV_IOMMU_ALL_PAGES_ADDRESS, pasid); +} + void amd_iommu_domain_flush_complete(struct protection_domain *domain) { int i; -- Gitee From 691570e105f3e076bfe7d969f952041256e55f1e Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 5 Feb 2024 11:56:06 +0000 Subject: [PATCH 091/219] iommu/amd: Rearrange GCR3 table setup code ANBZ: #13617 commit b2e8a7f5d2c3d99285e13cdd330d339d9b040599 upstream. Consolidate GCR3 table related code in one place so that its easy to maintain. Note that this patch doesn't move __set_gcr3/__clear_gcr3. We are moving GCR3 table from per domain to per device. Following series will rework these functions. During that time I will move these functions as well. No functional changes intended. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-9-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 64 +++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 53bf2c0087c0..aa7dacc9e1ad 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1723,6 +1723,38 @@ static int setup_gcr3_table(struct protection_domain *domain, int pasids) return 0; } +static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc) +{ + int index; + u64 *pte; + + while (true) { + + index = (pasid >> (9 * level)) & 0x1ff; + pte = &root[index]; + + if (level == 0) + break; + + if (!(*pte & GCR3_VALID)) { + if (!alloc) + return NULL; + + root = (void *)get_zeroed_page(GFP_ATOMIC); + if (root == NULL) + return NULL; + + *pte = iommu_virt_to_phys(root) | GCR3_VALID; + } + + root = iommu_phys_to_virt(*pte & PAGE_MASK); + + level -= 1; + } + + return pte; +} + static void set_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data) { @@ -2759,38 +2791,6 @@ int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid) return ret; } -static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc) -{ - int index; - u64 *pte; - - while (true) { - - index = (pasid >> (9 * level)) & 0x1ff; - pte = &root[index]; - - if (level == 0) - break; - - if (!(*pte & GCR3_VALID)) { - if (!alloc) - return NULL; - - root = (void *)get_zeroed_page(GFP_ATOMIC); - if (root == NULL) - return NULL; - - *pte = iommu_virt_to_phys(root) | GCR3_VALID; - } - - root = iommu_phys_to_virt(*pte & PAGE_MASK); - - level -= 1; - } - - return pte; -} - static int __set_gcr3(struct protection_domain *domain, u32 pasid, unsigned long cr3) { -- Gitee From ee52a885ff8752c5aff35d111ec26fc5ab9fa3e4 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 5 Feb 2024 11:56:07 +0000 Subject: [PATCH 092/219] iommu: Introduce iommu_group_mutex_assert() ANBZ: #13617 commit bf8aff2945ba4091f503df673b9df33002546e6a upstream. Add function to check iommu group mutex lock. So that device drivers can rely on group mutex lock instead of adding another driver level lock before modifying driver specific device data structure. Suggested-by: Jason Gunthorpe Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-10-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu.c | 19 +++++++++++++++++++ include/linux/iommu.h | 8 ++++++++ 2 files changed, 27 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 10673e3e1d80..065bb3b0f63c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1259,6 +1259,25 @@ void iommu_group_remove_device(struct device *dev) } EXPORT_SYMBOL_GPL(iommu_group_remove_device); +#if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API) +/** + * iommu_group_mutex_assert - Check device group mutex lock + * @dev: the device that has group param set + * + * This function is called by an iommu driver to check whether it holds + * group mutex lock for the given device or not. + * + * Note that this function must be called after device group param is set. + */ +void iommu_group_mutex_assert(struct device *dev) +{ + struct iommu_group *group = dev->iommu_group; + + lockdep_assert_held(&group->mutex); +} +EXPORT_SYMBOL_GPL(iommu_group_mutex_assert); +#endif + static struct device *iommu_group_first_dev(struct iommu_group *group) { lockdep_assert_held(&group->mutex); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8286b93f14ae..4bf88290bfe4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1388,6 +1388,14 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) static inline void iommu_free_global_pasid(ioasid_t pasid) {} #endif /* CONFIG_IOMMU_API */ +#if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API) +void iommu_group_mutex_assert(struct device *dev); +#else +static inline void iommu_group_mutex_assert(struct device *dev) +{ +} +#endif + /** * iommu_map_sgtable - Map the given buffer to the IOMMU domain * @domain: The IOMMU domain to perform the mapping -- Gitee From 634e1257ce43af908d251b7f3b22395b2b38f3fc Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 5 Feb 2024 11:56:08 +0000 Subject: [PATCH 093/219] iommu/amd: Refactor helper function for setting / clearing GCR3 ANBZ: #13617 commit e8e1aac33458d80545e5ad37b2a3e4243eee278e upstream. Refactor GCR3 helper functions in preparation to use per device GCR3 table. * Add new function update_gcr3 to update per device GCR3 table * Remove per domain default GCR3 setup during v2 page table allocation. Subsequent patch will add support to setup default gcr3 while attaching device to domain. * Remove amd_iommu_domain_update() from V2 page table path as device detach path will take care of updating the domain. * Consolidate GCR3 table related code in one place so that its easy to maintain. * Rename functions to reflect its usage. Signed-off-by: Suravee Suthikulpanit Co-developed-by: Vasant Hegde Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-11-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu.h | 8 ++- drivers/iommu/amd/io_pgtable_v2.c | 21 +----- drivers/iommu/amd/iommu.c | 115 ++++++++++++++---------------- 3 files changed, 61 insertions(+), 83 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 1f4bbc6bf1e5..6e03ff2aaebe 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -44,6 +44,11 @@ bool amd_iommu_v2_supported(void); int amd_iommu_pdev_enable_cap_pri(struct pci_dev *pdev); void amd_iommu_pdev_disable_cap_pri(struct pci_dev *pdev); +/* GCR3 setup */ +int amd_iommu_set_gcr3(struct iommu_dev_data *dev_data, + ioasid_t pasid, unsigned long gcr3); +int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid); + int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, u64 address); /* * This function flushes all internal caches of @@ -61,9 +66,6 @@ void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data, ioasid_t pasid); int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid); -int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid, - unsigned long cr3); -int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid); #ifdef CONFIG_IRQ_REMAP int amd_iommu_create_irq_domain(struct amd_iommu *iommu); diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c index 1187779c240e..8929c484b11e 100644 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ b/drivers/iommu/amd/io_pgtable_v2.c @@ -350,38 +350,26 @@ static const struct iommu_flush_ops v2_flush_ops = { static void v2_free_pgtable(struct io_pgtable *iop) { - struct protection_domain *pdom; struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop); - pdom = container_of(pgtable, struct protection_domain, iop); - if (!(pdom->flags & PD_IOMMUV2_MASK)) + if (!pgtable || !pgtable->pgd) return; - /* Clear gcr3 entry */ - amd_iommu_domain_clear_gcr3(&pdom->domain, 0); - - /* Make changes visible to IOMMUs */ - amd_iommu_domain_update(pdom); - /* Free page table */ free_pgtable(pgtable->pgd, get_pgtable_level()); + pgtable->pgd = NULL; } static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) { struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); struct protection_domain *pdom = (struct protection_domain *)cookie; - int ret; int ias = IOMMU_IN_ADDR_BIT_SIZE; pgtable->pgd = alloc_pgtable_page(pdom->nid, GFP_ATOMIC); if (!pgtable->pgd) return NULL; - ret = amd_iommu_domain_set_gcr3(&pdom->domain, 0, iommu_virt_to_phys(pgtable->pgd)); - if (ret) - goto err_free_pgd; - if (get_pgtable_level() == PAGE_MODE_5_LEVEL) ias = 57; @@ -395,11 +383,6 @@ static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo cfg->tlb = &v2_flush_ops; return &pgtable->iop; - -err_free_pgd: - free_pgtable_page(pgtable->pgd); - - return NULL; } struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = { diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index aa7dacc9e1ad..12e6d7e186d2 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1723,10 +1723,13 @@ static int setup_gcr3_table(struct protection_domain *domain, int pasids) return 0; } -static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc) +static u64 *__get_gcr3_pte(struct gcr3_tbl_info *gcr3_info, + ioasid_t pasid, bool alloc) { int index; u64 *pte; + u64 *root = gcr3_info->gcr3_tbl; + int level = gcr3_info->glx; while (true) { @@ -1755,6 +1758,56 @@ static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc) return pte; } +static int update_gcr3(struct iommu_dev_data *dev_data, + ioasid_t pasid, unsigned long gcr3, bool set) +{ + struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; + u64 *pte; + + pte = __get_gcr3_pte(gcr3_info, pasid, true); + if (pte == NULL) + return -ENOMEM; + + if (set) + *pte = (gcr3 & PAGE_MASK) | GCR3_VALID; + else + *pte = 0; + + amd_iommu_dev_flush_pasid_all(dev_data, pasid); + return 0; +} + +int amd_iommu_set_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid, + unsigned long gcr3) +{ + struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; + int ret; + + iommu_group_mutex_assert(dev_data->dev); + + ret = update_gcr3(dev_data, pasid, gcr3, true); + if (ret) + return ret; + + gcr3_info->pasid_cnt++; + return ret; +} + +int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid) +{ + struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; + int ret; + + iommu_group_mutex_assert(dev_data->dev); + + ret = update_gcr3(dev_data, pasid, 0, false); + if (ret) + return ret; + + gcr3_info->pasid_cnt--; + return ret; +} + static void set_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data) { @@ -2791,66 +2844,6 @@ int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid) return ret; } -static int __set_gcr3(struct protection_domain *domain, u32 pasid, - unsigned long cr3) -{ - u64 *pte; - - if (domain->iop.mode != PAGE_MODE_NONE) - return -EINVAL; - - pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true); - if (pte == NULL) - return -ENOMEM; - - *pte = (cr3 & PAGE_MASK) | GCR3_VALID; - - return __amd_iommu_flush_tlb(domain, pasid); -} - -static int __clear_gcr3(struct protection_domain *domain, u32 pasid) -{ - u64 *pte; - - if (domain->iop.mode != PAGE_MODE_NONE) - return -EINVAL; - - pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false); - if (pte == NULL) - return 0; - - *pte = 0; - - return __amd_iommu_flush_tlb(domain, pasid); -} - -int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid, - unsigned long cr3) -{ - struct protection_domain *domain = to_pdomain(dom); - unsigned long flags; - int ret; - - spin_lock_irqsave(&domain->lock, flags); - ret = __set_gcr3(domain, pasid, cr3); - spin_unlock_irqrestore(&domain->lock, flags); - - return ret; -} - -int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid) -{ - struct protection_domain *domain = to_pdomain(dom); - unsigned long flags; - int ret; - - spin_lock_irqsave(&domain->lock, flags); - ret = __clear_gcr3(domain, pasid); - spin_unlock_irqrestore(&domain->lock, flags); - - return ret; -} - int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, int status, int tag) { -- Gitee From b86a3a071429a60928a26fa1bfc9d95c6776d827 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 5 Feb 2024 11:56:09 +0000 Subject: [PATCH 094/219] iommu/amd: Refactor attaching / detaching device functions ANBZ: #13617 commit 4ebd4c7f2501cc5876717e0f2382d5b825e59405 upstream. If domain is configured with V2 page table then setup default GCR3 with domain GCR3 pointer. So that all devices in the domain uses same page table for translation. Also return page table setup status from do_attach() function. Signed-off-by: Suravee Suthikulpanit Co-developed-by: Vasant Hegde Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-12-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 12e6d7e186d2..991dabb94cf1 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1910,10 +1910,11 @@ static void clear_dte_entry(struct amd_iommu *iommu, u16 devid) amd_iommu_apply_erratum_63(iommu, devid); } -static void do_attach(struct iommu_dev_data *dev_data, - struct protection_domain *domain) +static int do_attach(struct iommu_dev_data *dev_data, + struct protection_domain *domain) { struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + int ret = 0; /* Update data structures */ dev_data->domain = domain; @@ -1927,11 +1928,28 @@ static void do_attach(struct iommu_dev_data *dev_data, domain->dev_iommu[iommu->index] += 1; domain->dev_cnt += 1; + /* Init GCR3 table and update device table */ + if (domain->pd_mode == PD_MODE_V2) { + /* By default, setup GCR3 table to support single PASID */ + ret = setup_gcr3_table(dev_data->domain, 1); + if (ret) + return ret; + + ret = update_gcr3(dev_data, 0, + iommu_virt_to_phys(domain->iop.pgd), true); + if (ret) { + free_gcr3_table(dev_data->domain); + return ret; + } + } + /* Update device table */ set_dte_entry(iommu, dev_data); clone_aliases(iommu, dev_data->dev); device_flush_dte(dev_data); + + return ret; } static void do_detach(struct iommu_dev_data *dev_data) @@ -1939,6 +1957,12 @@ static void do_detach(struct iommu_dev_data *dev_data) struct protection_domain *domain = dev_data->domain; struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + /* Clear GCR3 table */ + if (domain->pd_mode == PD_MODE_V2) { + update_gcr3(dev_data, 0, 0, false); + free_gcr3_table(dev_data->domain); + } + /* Update data structures */ dev_data->domain = NULL; list_del(&dev_data->list); @@ -1981,7 +2005,7 @@ static int attach_device(struct device *dev, if (dev_is_pci(dev)) pdev_enable_caps(to_pci_dev(dev)); - do_attach(dev_data, domain); + ret = do_attach(dev_data, domain); out: spin_unlock(&dev_data->lock); -- Gitee From a47a31c9b2b6104eae3f5cc4ae061c172d0176ef Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 5 Feb 2024 11:56:10 +0000 Subject: [PATCH 095/219] iommu/amd: Refactor protection_domain helper functions ANBZ: #13617 commit fb575d17813f6aff2e8b7e051dac8def2c291bff upstream. To removes the code to setup GCR3 table, and only handle domain create / destroy, since GCR3 is no longer part of a domain. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-13-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 991dabb94cf1..52bcbe789e05 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2185,9 +2185,6 @@ static void protection_domain_free(struct protection_domain *domain) if (domain->iop.pgtbl_cfg.tlb) free_io_pgtable_ops(&domain->iop.iop.ops); - if (domain->flags & PD_IOMMUV2_MASK) - free_gcr3_table(domain); - if (domain->iop.root) free_page((unsigned long)domain->iop.root); @@ -2215,15 +2212,10 @@ static int protection_domain_init_v1(struct protection_domain *domain, int mode) return 0; } -static int protection_domain_init_v2(struct protection_domain *domain) +static int protection_domain_init_v2(struct protection_domain *pdom) { - domain->flags |= PD_GIOV_MASK; - domain->pd_mode = PD_MODE_V2; - - domain->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; - - if (setup_gcr3_table(domain, 1)) - return -ENOMEM; + pdom->pd_mode = PD_MODE_V2; + pdom->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; return 0; } -- Gitee From 3e8b3ff488c6cd822064c8c383135744ea8ec1e7 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 5 Feb 2024 11:56:11 +0000 Subject: [PATCH 096/219] iommu/amd: Refactor GCR3 table helper functions ANBZ: #13617 commit cf70873e3d01653df69115576501f8f7f6f2b203 upstream. To use the new per-device struct gcr3_tbl_info. Use GFP_KERNEL flag instead of GFP_ATOMIC for GCR3 table allocation. Also modify set_dte_entry() to use new per device GCR3 table. Also in free_gcr3_table() path replace BUG_ON with WARN_ON_ONCE(). Signed-off-by: Suravee Suthikulpanit Co-developed-by: Vasant Hegde Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-14-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 53 +++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 52bcbe789e05..599dd20c105f 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -75,6 +75,9 @@ struct kmem_cache *amd_iommu_irq_cache; static void detach_device(struct device *dev); +static void set_dte_entry(struct amd_iommu *iommu, + struct iommu_dev_data *dev_data); + /**************************************************************************** * * Helper functions @@ -1675,16 +1678,19 @@ static void free_gcr3_tbl_level2(u64 *tbl) } } -static void free_gcr3_table(struct protection_domain *domain) +static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info) { - if (domain->glx == 2) - free_gcr3_tbl_level2(domain->gcr3_tbl); - else if (domain->glx == 1) - free_gcr3_tbl_level1(domain->gcr3_tbl); + if (gcr3_info->glx == 2) + free_gcr3_tbl_level2(gcr3_info->gcr3_tbl); + else if (gcr3_info->glx == 1) + free_gcr3_tbl_level1(gcr3_info->gcr3_tbl); else - BUG_ON(domain->glx != 0); + WARN_ON_ONCE(gcr3_info->glx != 0); + + gcr3_info->glx = 0; - free_page((unsigned long)domain->gcr3_tbl); + free_page((unsigned long)gcr3_info->gcr3_tbl); + gcr3_info->gcr3_tbl = NULL; } /* @@ -1703,22 +1709,23 @@ static int get_gcr3_levels(int pasids) return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels; } -/* Note: This function expects iommu_domain->lock to be held prior calling the function. */ -static int setup_gcr3_table(struct protection_domain *domain, int pasids) +static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, + struct amd_iommu *iommu, int pasids) { int levels = get_gcr3_levels(pasids); + int nid = iommu ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; if (levels > amd_iommu_max_glx_val) return -EINVAL; - domain->gcr3_tbl = alloc_pgtable_page(domain->nid, GFP_ATOMIC); - if (domain->gcr3_tbl == NULL) - return -ENOMEM; + if (gcr3_info->gcr3_tbl) + return -EBUSY; - domain->glx = levels; - domain->flags |= PD_IOMMUV2_MASK; + gcr3_info->gcr3_tbl = alloc_pgtable_page(nid, GFP_KERNEL); + if (gcr3_info->gcr3_tbl == NULL) + return -ENOMEM; - amd_iommu_domain_update(domain); + gcr3_info->glx = levels; return 0; } @@ -1817,6 +1824,7 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid = dev_data->devid; struct protection_domain *domain = dev_data->domain; struct dev_table_entry *dev_table = get_dev_table(iommu); + struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; if (domain->iop.mode != PAGE_MODE_NONE) pte_root = iommu_virt_to_phys(domain->iop.root); @@ -1844,9 +1852,9 @@ static void set_dte_entry(struct amd_iommu *iommu, if (domain->dirty_tracking) pte_root |= DTE_FLAG_HAD; - if (domain->flags & PD_IOMMUV2_MASK) { - u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl); - u64 glx = domain->glx; + if (gcr3_info && gcr3_info->gcr3_tbl) { + u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl); + u64 glx = gcr3_info->glx; u64 tmp; pte_root |= DTE_FLAG_GV; @@ -1874,7 +1882,8 @@ static void set_dte_entry(struct amd_iommu *iommu, ((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT); } - if (domain->flags & PD_GIOV_MASK) + /* GIOV is supported with V2 page table mode only */ + if (pdom_is_v2_pgtbl_mode(domain)) pte_root |= DTE_FLAG_GIOV; } @@ -1931,14 +1940,14 @@ static int do_attach(struct iommu_dev_data *dev_data, /* Init GCR3 table and update device table */ if (domain->pd_mode == PD_MODE_V2) { /* By default, setup GCR3 table to support single PASID */ - ret = setup_gcr3_table(dev_data->domain, 1); + ret = setup_gcr3_table(&dev_data->gcr3_info, iommu, 1); if (ret) return ret; ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(domain->iop.pgd), true); if (ret) { - free_gcr3_table(dev_data->domain); + free_gcr3_table(&dev_data->gcr3_info); return ret; } } @@ -1960,7 +1969,7 @@ static void do_detach(struct iommu_dev_data *dev_data) /* Clear GCR3 table */ if (domain->pd_mode == PD_MODE_V2) { update_gcr3(dev_data, 0, 0, false); - free_gcr3_table(dev_data->domain); + free_gcr3_table(&dev_data->gcr3_info); } /* Update data structures */ -- Gitee From 6e5f8665c149445890746c6d89dd86d432a2b8d4 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 5 Feb 2024 11:56:12 +0000 Subject: [PATCH 097/219] iommu/amd: Remove unused flush pasid functions ANBZ: #13617 commit 02b990253db7cbdc3ef0631e9c202a69512a14c4 upstream. We have removed iommu_v2 module and converted v2 page table to use common flush functions. Also we have moved GCR3 table to per device. PASID related functions are not used. Hence remove these unused functions. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-15-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu.h | 3 - drivers/iommu/amd/iommu.c | 100 ---------------------------------- 2 files changed, 103 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 6e03ff2aaebe..bea910d9ace5 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -49,7 +49,6 @@ int amd_iommu_set_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid, unsigned long gcr3); int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid); -int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, u64 address); /* * This function flushes all internal caches of * the IOMMU used by this driver. @@ -65,8 +64,6 @@ void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data, ioasid_t pasid); -int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid); - #ifdef CONFIG_IRQ_REMAP int amd_iommu_create_irq_domain(struct amd_iommu *iommu); #else diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 599dd20c105f..4c2dd77b4240 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2769,106 +2769,6 @@ const struct iommu_ops amd_iommu_ops = { } }; -static int __flush_pasid(struct protection_domain *domain, u32 pasid, - u64 address, size_t size) -{ - struct iommu_dev_data *dev_data; - struct iommu_cmd cmd; - int i, ret; - - if (!(domain->flags & PD_IOMMUV2_MASK)) - return -EINVAL; - - build_inv_iommu_pages(&cmd, address, size, domain->id, pasid, true); - - /* - * IOMMU TLB needs to be flushed before Device TLB to - * prevent device TLB refill from IOMMU TLB - */ - for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { - if (domain->dev_iommu[i] == 0) - continue; - - ret = iommu_queue_command(amd_iommus[i], &cmd); - if (ret != 0) - goto out; - } - - /* Wait until IOMMU TLB flushes are complete */ - amd_iommu_domain_flush_complete(domain); - - /* Now flush device TLBs */ - list_for_each_entry(dev_data, &domain->dev_list, list) { - struct amd_iommu *iommu; - int qdep; - - /* - There might be non-IOMMUv2 capable devices in an IOMMUv2 - * domain. - */ - if (!dev_data->ats_enabled) - continue; - - qdep = dev_data->ats_qdep; - iommu = rlookup_amd_iommu(dev_data->dev); - if (!iommu) - continue; - build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, - address, size, pasid, true); - - ret = iommu_queue_command(iommu, &cmd); - if (ret != 0) - goto out; - } - - /* Wait until all device TLBs are flushed */ - amd_iommu_domain_flush_complete(domain); - - ret = 0; - -out: - - return ret; -} - -static int __amd_iommu_flush_page(struct protection_domain *domain, u32 pasid, - u64 address) -{ - return __flush_pasid(domain, pasid, address, PAGE_SIZE); -} - -int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, - u64 address) -{ - struct protection_domain *domain = to_pdomain(dom); - unsigned long flags; - int ret; - - spin_lock_irqsave(&domain->lock, flags); - ret = __amd_iommu_flush_page(domain, pasid, address); - spin_unlock_irqrestore(&domain->lock, flags); - - return ret; -} - -static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid) -{ - return __flush_pasid(domain, pasid, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS); -} - -int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid) -{ - struct protection_domain *domain = to_pdomain(dom); - unsigned long flags; - int ret; - - spin_lock_irqsave(&domain->lock, flags); - ret = __amd_iommu_flush_tlb(domain, pasid); - spin_unlock_irqrestore(&domain->lock, flags); - - return ret; -} - int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, int status, int tag) { -- Gitee From eec492e56c4e39b873b6077fbf64c98f807820a5 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 5 Feb 2024 11:56:13 +0000 Subject: [PATCH 098/219] iommu/amd: Rearrange device flush code ANBZ: #13617 commit a7b2aff3132562bd26657d9a707f8fa82967147f upstream. Consolidate all flush related code in one place so that its easy to maintain. No functional changes intended. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-16-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 92 ++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 49 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 4c2dd77b4240..2652df66fc03 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1614,6 +1614,49 @@ static void domain_flush_devices(struct protection_domain *domain) device_flush_dte(dev_data); } +static void update_device_table(struct protection_domain *domain) +{ + struct iommu_dev_data *dev_data; + + list_for_each_entry(dev_data, &domain->dev_list, list) { + struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); + + set_dte_entry(iommu, dev_data); + clone_aliases(iommu, dev_data->dev); + } +} + +void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) +{ + update_device_table(domain); + domain_flush_devices(domain); +} + +void amd_iommu_domain_update(struct protection_domain *domain) +{ + /* Update device table */ + amd_iommu_update_and_flush_device_table(domain); + + /* Flush domain TLB(s) and wait for completion */ + amd_iommu_domain_flush_all(domain); +} + +int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, + int status, int tag) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + struct iommu_cmd cmd; + + dev_data = dev_iommu_priv_get(&pdev->dev); + iommu = get_amd_iommu_from_dev(&pdev->dev); + + build_complete_ppr(&cmd, dev_data->devid, pasid, status, + tag, dev_data->pri_tlp); + + return iommu_queue_command(iommu, &cmd); +} + /**************************************************************************** * * The next functions belong to the domain allocation. A domain is @@ -2125,39 +2168,6 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev) return acpihid_device_group(dev); } -/***************************************************************************** - * - * The next functions belong to the dma_ops mapping/unmapping code. - * - *****************************************************************************/ - -static void update_device_table(struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data; - - list_for_each_entry(dev_data, &domain->dev_list, list) { - struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - - set_dte_entry(iommu, dev_data); - clone_aliases(iommu, dev_data->dev); - } -} - -void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) -{ - update_device_table(domain); - domain_flush_devices(domain); -} - -void amd_iommu_domain_update(struct protection_domain *domain) -{ - /* Update device table */ - amd_iommu_update_and_flush_device_table(domain); - - /* Flush domain TLB(s) and wait for completion */ - amd_iommu_domain_flush_all(domain); -} - /***************************************************************************** * * The following functions belong to the exported interface of AMD IOMMU @@ -2769,22 +2779,6 @@ const struct iommu_ops amd_iommu_ops = { } }; -int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, - int status, int tag) -{ - struct iommu_dev_data *dev_data; - struct amd_iommu *iommu; - struct iommu_cmd cmd; - - dev_data = dev_iommu_priv_get(&pdev->dev); - iommu = get_amd_iommu_from_dev(&pdev->dev); - - build_complete_ppr(&cmd, dev_data->devid, pasid, status, - tag, dev_data->pri_tlp); - - return iommu_queue_command(iommu, &cmd); -} - #ifdef CONFIG_IRQ_REMAP /***************************************************************************** -- Gitee From 3d78e7e6e4c43ce3731b2904184dcc08e9065127 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 5 Feb 2024 11:56:14 +0000 Subject: [PATCH 099/219] iommu/amd: Remove unused GCR3 table parameters from struct protection_domain ANBZ: #13617 commit c2a6af5e08c27350cb007db4cfeeb413017888b0 upstream. Since they are moved to struct iommu_dev_data, and the driver has been ported to use them. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-17-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu_types.h | 12 ------------ drivers/iommu/amd/iommu.c | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index aed3e88d23c7..43f0f1b13f2d 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -453,15 +453,6 @@ #define MAX_DOMAIN_ID 65536 -/* Protection domain flags */ -#define PD_DMA_OPS_MASK BIT(0) /* domain used for dma_ops */ -#define PD_DEFAULT_MASK BIT(1) /* domain is a default dma_ops - domain for an IOMMU */ -#define PD_PASSTHROUGH_MASK BIT(2) /* domain has no page - translation */ -#define PD_IOMMUV2_MASK BIT(3) /* domain has gcr3 table */ -#define PD_GIOV_MASK BIT(4) /* domain enable GIOV support */ - /* Timeout stuff */ #define LOOP_TIMEOUT 100000 #define MMIO_STATUS_TIMEOUT 2000000 @@ -563,10 +554,7 @@ struct protection_domain { struct amd_io_pgtable iop; spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ - int glx; /* Number of levels for GCR3 table */ int nid; /* Node ID */ - u64 *gcr3_tbl; /* Guest CR3 table */ - unsigned long flags; /* flags to find out type of domain */ enum protection_domain_mode pd_mode; /* Track page table type */ bool dirty_tracking; /* dirty tracking is enabled in the domain */ unsigned dev_cnt; /* devices assigned to this domain */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 2652df66fc03..7dce13850eb1 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -86,7 +86,7 @@ static void set_dte_entry(struct amd_iommu *iommu, static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom) { - return (pdom && (pdom->flags & PD_IOMMUV2_MASK)); + return (pdom && (pdom->pd_mode == PD_MODE_V2)); } static inline int get_acpihid_device_id(struct device *dev, -- Gitee From 0f56eba67fd94f6ece0d71701560da5ffda05750 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 5 Feb 2024 11:56:15 +0000 Subject: [PATCH 100/219] iommu/amd: Introduce per-device domain ID to fix potential TLB aliasing issue ANBZ: #13617 commit 87a6f1f22c9781b1b37847caf65e070bfa6dadb5 upstream. With v1 page table, the AMD IOMMU spec states that the hardware must use the domain ID to tag its internal translation caches. I/O devices with different v1 page tables must be given different domain IDs. I/O devices that share the same v1 page table __may__ be given the same domain ID. This domain ID management policy is currently implemented by the AMD IOMMU driver. In this case, only the domain ID is needed when issuing the INVALIDATE_IOMMU_PAGES command to invalidate the IOMMU translation cache (TLB). With v2 page table, the hardware uses domain ID and PASID as parameters to tag and issue the INVALIDATE_IOMMU_PAGES command. Since the GCR3 table is setup per-device, and there is no guarantee for PASID to be unique across multiple devices. The same PASID for different devices could have different v2 page tables. In such case, if multiple devices share the same domain ID, IOMMU translation cache for these devices would be polluted due to TLB aliasing. Hence, avoid the TLB aliasing issue with v2 page table by allocating unique domain ID for each device even when multiple devices are sharing the same v1 page table. Please note that this fix would result in multiple INVALIDATE_IOMMU_PAGES commands (one per domain id) when unmapping a translation. Domain ID can be shared until device starts using PASID. We will enhance this code later where we will allocate per device domain ID only when its needed. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-18-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu_types.h | 1 + drivers/iommu/amd/iommu.c | 80 +++++++++++++++++++++++------ 2 files changed, 64 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 43f0f1b13f2d..d1fed5fc219b 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -528,6 +528,7 @@ struct gcr3_tbl_info { u64 *gcr3_tbl; /* Guest CR3 table */ int glx; /* Number of levels for GCR3 table */ u32 pasid_cnt; /* Track attached PASIDs */ + u16 domid; /* Per device domain ID */ }; struct amd_io_pgtable { diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 7dce13850eb1..9ae25547f743 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1452,27 +1452,37 @@ static int device_flush_dte(struct iommu_dev_data *dev_data) return ret; } -/* - * TLB invalidation function which is called from the mapping functions. - * It invalidates a single PTE if the range to flush is within a single - * page. Otherwise it flushes the whole TLB of the IOMMU. - */ -static void __domain_flush_pages(struct protection_domain *domain, +static int domain_flush_pages_v2(struct protection_domain *pdom, u64 address, size_t size) { struct iommu_dev_data *dev_data; struct iommu_cmd cmd; - int ret = 0, i; - ioasid_t pasid = IOMMU_NO_PASID; - bool gn = false; + int ret = 0; - if (pdom_is_v2_pgtbl_mode(domain)) - gn = true; + list_for_each_entry(dev_data, &pdom->dev_list, list) { + struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); + u16 domid = dev_data->gcr3_info.domid; + + build_inv_iommu_pages(&cmd, address, size, + domid, IOMMU_NO_PASID, true); + + ret |= iommu_queue_command(iommu, &cmd); + } - build_inv_iommu_pages(&cmd, address, size, domain->id, pasid, gn); + return ret; +} + +static int domain_flush_pages_v1(struct protection_domain *pdom, + u64 address, size_t size) +{ + struct iommu_cmd cmd; + int ret = 0, i; + + build_inv_iommu_pages(&cmd, address, size, + pdom->id, IOMMU_NO_PASID, false); for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { - if (!domain->dev_iommu[i]) + if (!pdom->dev_iommu[i]) continue; /* @@ -1482,6 +1492,28 @@ static void __domain_flush_pages(struct protection_domain *domain, ret |= iommu_queue_command(amd_iommus[i], &cmd); } + return ret; +} + +/* + * TLB invalidation function which is called from the mapping functions. + * It flushes range of PTEs of the domain. + */ +static void __domain_flush_pages(struct protection_domain *domain, + u64 address, size_t size) +{ + struct iommu_dev_data *dev_data; + int ret = 0; + ioasid_t pasid = IOMMU_NO_PASID; + bool gn = false; + + if (pdom_is_v2_pgtbl_mode(domain)) { + gn = true; + ret = domain_flush_pages_v2(domain, address, size); + } else { + ret = domain_flush_pages_v1(domain, address, size); + } + list_for_each_entry(dev_data, &domain->dev_list, list) { if (!dev_data->ats_enabled) @@ -1557,7 +1589,7 @@ void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); build_inv_iommu_pages(&cmd, address, size, - dev_data->domain->id, pasid, true); + dev_data->gcr3_info.domid, pasid, true); iommu_queue_command(iommu, &cmd); if (dev_data->ats_enabled) @@ -1732,6 +1764,9 @@ static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info) gcr3_info->glx = 0; + /* Free per device domain ID */ + domain_id_free(gcr3_info->domid); + free_page((unsigned long)gcr3_info->gcr3_tbl); gcr3_info->gcr3_tbl = NULL; } @@ -1764,9 +1799,14 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, if (gcr3_info->gcr3_tbl) return -EBUSY; + /* Allocate per device domain ID */ + gcr3_info->domid = domain_id_alloc(); + gcr3_info->gcr3_tbl = alloc_pgtable_page(nid, GFP_KERNEL); - if (gcr3_info->gcr3_tbl == NULL) + if (gcr3_info->gcr3_tbl == NULL) { + domain_id_free(gcr3_info->domid); return -ENOMEM; + } gcr3_info->glx = levels; @@ -1865,10 +1905,16 @@ static void set_dte_entry(struct amd_iommu *iommu, u64 flags = 0; u32 old_domid; u16 devid = dev_data->devid; + u16 domid; struct protection_domain *domain = dev_data->domain; struct dev_table_entry *dev_table = get_dev_table(iommu); struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; + if (gcr3_info && gcr3_info->gcr3_tbl) + domid = dev_data->gcr3_info.domid; + else + domid = domain->id; + if (domain->iop.mode != PAGE_MODE_NONE) pte_root = iommu_virt_to_phys(domain->iop.root); @@ -1881,7 +1927,7 @@ static void set_dte_entry(struct amd_iommu *iommu, * When SNP is enabled, Only set TV bit when IOMMU * page translation is in use. */ - if (!amd_iommu_snp_en || (domain->id != 0)) + if (!amd_iommu_snp_en || (domid != 0)) pte_root |= DTE_FLAG_TV; flags = dev_table[devid].data[1]; @@ -1931,7 +1977,7 @@ static void set_dte_entry(struct amd_iommu *iommu, } flags &= ~DEV_DOMID_MASK; - flags |= domain->id; + flags |= domid; old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK; dev_table[devid].data[1] = flags; -- Gitee From 5605c5daf6b47cd8b94286cf7731b7e59900bd40 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 13 Feb 2024 13:31:56 +0200 Subject: [PATCH 101/219] Revert "iommu/arm-smmu: Convert to domain_alloc_paging()" ANBZ: #13617 commit b419c5e2d9787a98c1410dbe1d4a25906b96f000 upstream. This reverts commit 9b3febc3a3da ("iommu/arm-smmu: Convert to domain_alloc_paging()"). It breaks Qualcomm MSM8996 platform. Calling arm_smmu_write_context_bank() from new codepath results in the platform being reset because of the unclocked hardware access. Fixes: 9b3febc3a3da ("iommu/arm-smmu: Convert to domain_alloc_paging()") Signed-off-by: Dmitry Baryshkov Acked-by: Robin Murphy Link: https://lore.kernel.org/r/20240213-iommu-revert-domain-alloc-v1-1-325ff55dece4@linaro.org Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 44cab3c72072..8385121db418 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -864,10 +864,14 @@ static void arm_smmu_destroy_domain_context(struct arm_smmu_domain *smmu_domain) arm_smmu_rpm_put(smmu); } -static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) +static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) { struct arm_smmu_domain *smmu_domain; + if (type != IOMMU_DOMAIN_UNMANAGED) { + if (using_legacy_binding || type != IOMMU_DOMAIN_DMA) + return NULL; + } /* * Allocate the domain and initialise some of its data structures. * We can't really do anything meaningful until we've added a @@ -880,15 +884,6 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) mutex_init(&smmu_domain->init_mutex); spin_lock_init(&smmu_domain->cb_lock); - if (dev) { - struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev); - - if (arm_smmu_init_domain_context(smmu_domain, cfg->smmu, dev)) { - kfree(smmu_domain); - return NULL; - } - } - return &smmu_domain->domain; } @@ -1635,7 +1630,7 @@ static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, - .domain_alloc_paging = arm_smmu_domain_alloc_paging, + .domain_alloc = arm_smmu_domain_alloc, .probe_device = arm_smmu_probe_device, .release_device = arm_smmu_release_device, .probe_finalize = arm_smmu_probe_finalize, -- Gitee From b274bf09b755aa58c4c2df5b5ccebe86c4a06794 Mon Sep 17 00:00:00 2001 From: Shivaprasad G Bhat Date: Tue, 13 Feb 2024 10:05:22 -0600 Subject: [PATCH 102/219] powerpc/iommu: Fix the missing iommu_group_put() during platform domain attach ANBZ: #13617 commit 0846dd77c8349ec92ca0079c9c71d130f34cb192 upstream. The function spapr_tce_platform_iommu_attach_dev() is missing to call iommu_group_put() when the domain is already set. This refcount leak shows up with BUG_ON() during DLPAR remove operation as: KernelBug: Kernel bug in state 'None': kernel BUG at arch/powerpc/platforms/pseries/iommu.c:100! Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=8192 NUMA pSeries Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_016) hv:phyp pSeries NIP: c0000000000ff4d4 LR: c0000000000ff4cc CTR: 0000000000000000 REGS: c0000013aed5f840 TRAP: 0700 Tainted: G I (6.8.0-rc3-autotest-g99bd3cb0d12e) MSR: 8000000000029033 CR: 44002402 XER: 20040000 CFAR: c000000000a0d170 IRQMASK: 0 ... NIP iommu_reconfig_notifier+0x94/0x200 LR iommu_reconfig_notifier+0x8c/0x200 Call Trace: iommu_reconfig_notifier+0x8c/0x200 (unreliable) notifier_call_chain+0xb8/0x19c blocking_notifier_call_chain+0x64/0x98 of_reconfig_notify+0x44/0xdc of_detach_node+0x78/0xb0 ofdt_write.part.0+0x86c/0xbb8 proc_reg_write+0xf4/0x150 vfs_write+0xf8/0x488 ksys_write+0x84/0x140 system_call_exception+0x138/0x330 system_call_vectored_common+0x15c/0x2ec The patch adds the missing iommu_group_put() call. Fixes: a8ca9fc9134c ("powerpc/iommu: Do not do platform domain attach atctions after probe") Reported-by: Venkat Rao Bagalkote Closes: https://lore.kernel.org/all/274e0d2b-b5cc-475e-94e6-8427e88e271d@linux.vnet.ibm.com/ Signed-off-by: Shivaprasad G Bhat Tested-by: Venkat Rao Bagalkote Reviewed-by: Jason Gunthorpe Signed-off-by: Michael Ellerman Link: https://msgid.link/170784021983.6249.10039296655906636112.stgit@linux.ibm.com Signed-off-by: Shuai Xue --- arch/powerpc/kernel/iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index bcf15d93e1b1..1185efebf032 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1289,8 +1289,10 @@ spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain, struct iommu_table_group *table_group; /* At first attach the ownership is already set */ - if (!domain) + if (!domain) { + iommu_group_put(grp); return 0; + } table_group = iommu_group_get_iommudata(grp); /* -- Gitee From 40055cd3ac1b90dd2a9e7ddf819f277252016e8c Mon Sep 17 00:00:00 2001 From: Erick Archer Date: Sun, 11 Feb 2024 19:22:50 +0100 Subject: [PATCH 103/219] iommu/mtk_iommu: Use devm_kcalloc() instead of devm_kzalloc() ANBZ: #13617 commit b07cd3b746cfe9a4524a5a7337fcf4ccb933509b upstream. This is an effort to get rid of all multiplications from allocation functions in order to prevent integer overflows [1]. Here the multiplication is obviously safe because MTK_PROTECT_PA_ALIGN is defined as a literal value of 256 or 128. For the "mtk_iommu.c" file: 256 For the "mtk_iommu_v1.c" file: 128 However, using devm_kcalloc() is more appropriate [2] and improves readability. This patch has no effect on runtime behavior. Link: https://github.com/KSPP/linux/issues/162 [1] Link: https://www.kernel.org/doc/html/next/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments [2] Signed-off-by: Erick Archer Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Matthias Brugger Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20240211182250.12656-1-erick.archer@gmx.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/mtk_iommu.c | 2 +- drivers/iommu/mtk_iommu_v1.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index add2eaaaa7ee..32deab732209 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -1265,7 +1265,7 @@ static int mtk_iommu_probe(struct platform_device *pdev) data->plat_data = of_device_get_match_data(dev); /* Protect memory. HW will access here while translation fault.*/ - protect = devm_kzalloc(dev, MTK_PROTECT_PA_ALIGN * 2, GFP_KERNEL); + protect = devm_kcalloc(dev, 2, MTK_PROTECT_PA_ALIGN, GFP_KERNEL); if (!protect) return -ENOMEM; data->protect_base = ALIGN(virt_to_phys(protect), MTK_PROTECT_PA_ALIGN); diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 0ddcd153b568..d6e4002200bd 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -623,8 +623,8 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev) data->dev = dev; /* Protect memory. HW will access here while translation fault.*/ - protect = devm_kzalloc(dev, MTK_PROTECT_PA_ALIGN * 2, - GFP_KERNEL | GFP_DMA); + protect = devm_kcalloc(dev, 2, MTK_PROTECT_PA_ALIGN, + GFP_KERNEL | GFP_DMA); if (!protect) return -ENOMEM; data->protect_base = ALIGN(virt_to_phys(protect), MTK_PROTECT_PA_ALIGN); -- Gitee From 995c4d6b617fea95ad9fee0cc6d1321392a8afb8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:12 +0800 Subject: [PATCH 104/219] iommu: Move iommu fault data to linux/iommu.h ANBZ: #13617 commit 00a9bc6070434814d39118a0de70c1645f64bf60 upstream. The iommu fault data is currently defined in uapi/linux/iommu.h, but is only used inside the iommu subsystem. Move it to linux/iommu.h, where it will be more accessible to kernel drivers. With this done, uapi/linux/iommu.h becomes empty and can be removed from the tree. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- MAINTAINERS | 1 - include/linux/iommu.h | 152 +++++++++++++++++++++++++++++++++- include/uapi/linux/iommu.h | 161 ------------------------------------- 3 files changed, 151 insertions(+), 163 deletions(-) delete mode 100644 include/uapi/linux/iommu.h diff --git a/MAINTAINERS b/MAINTAINERS index 5065f48fc7fe..f329b6ac04cb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11013,7 +11013,6 @@ F: drivers/iommu/ F: include/linux/iommu.h F: include/linux/iova.h F: include/linux/of_iommu.h -F: include/uapi/linux/iommu.h IOMMUFD M: Jason Gunthorpe diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4bf88290bfe4..f4be71f16d0a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -14,7 +14,6 @@ #include #include #include -#include #define IOMMU_READ (1 << 0) #define IOMMU_WRITE (1 << 1) @@ -44,6 +43,157 @@ struct iommu_sva; struct iommu_fault_event; struct iommu_dma_cookie; +#define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ +#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ +#define IOMMU_FAULT_PERM_EXEC (1 << 2) /* exec */ +#define IOMMU_FAULT_PERM_PRIV (1 << 3) /* privileged */ + +/* Generic fault types, can be expanded IRQ remapping fault */ +enum iommu_fault_type { + IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */ + IOMMU_FAULT_PAGE_REQ, /* page request fault */ +}; + +enum iommu_fault_reason { + IOMMU_FAULT_REASON_UNKNOWN = 0, + + /* Could not access the PASID table (fetch caused external abort) */ + IOMMU_FAULT_REASON_PASID_FETCH, + + /* PASID entry is invalid or has configuration errors */ + IOMMU_FAULT_REASON_BAD_PASID_ENTRY, + + /* + * PASID is out of range (e.g. exceeds the maximum PASID + * supported by the IOMMU) or disabled. + */ + IOMMU_FAULT_REASON_PASID_INVALID, + + /* + * An external abort occurred fetching (or updating) a translation + * table descriptor + */ + IOMMU_FAULT_REASON_WALK_EABT, + + /* + * Could not access the page table entry (Bad address), + * actual translation fault + */ + IOMMU_FAULT_REASON_PTE_FETCH, + + /* Protection flag check failed */ + IOMMU_FAULT_REASON_PERMISSION, + + /* access flag check failed */ + IOMMU_FAULT_REASON_ACCESS, + + /* Output address of a translation stage caused Address Size fault */ + IOMMU_FAULT_REASON_OOR_ADDRESS, +}; + +/** + * struct iommu_fault_unrecoverable - Unrecoverable fault data + * @reason: reason of the fault, from &enum iommu_fault_reason + * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values) + * @pasid: Process Address Space ID + * @perm: requested permission access using by the incoming transaction + * (IOMMU_FAULT_PERM_* values) + * @addr: offending page address + * @fetch_addr: address that caused a fetch abort, if any + */ +struct iommu_fault_unrecoverable { + __u32 reason; +#define IOMMU_FAULT_UNRECOV_PASID_VALID (1 << 0) +#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1) +#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID (1 << 2) + __u32 flags; + __u32 pasid; + __u32 perm; + __u64 addr; + __u64 fetch_addr; +}; + +/** + * struct iommu_fault_page_request - Page Request data + * @flags: encodes whether the corresponding fields are valid and whether this + * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values). + * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response + * must have the same PASID value as the page request. When it is clear, + * the page response should not have a PASID. + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) + * @addr: page address + * @private_data: device-specific private information + */ +struct iommu_fault_page_request { +#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) +#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) +#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) +#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) + __u32 flags; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + __u64 private_data[2]; +}; + +/** + * struct iommu_fault - Generic fault data + * @type: fault type from &enum iommu_fault_type + * @padding: reserved for future use (should be zero) + * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV + * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ + * @padding2: sets the fault size to allow for future extensions + */ +struct iommu_fault { + __u32 type; + __u32 padding; + union { + struct iommu_fault_unrecoverable event; + struct iommu_fault_page_request prm; + __u8 padding2[56]; + }; +}; + +/** + * enum iommu_page_response_code - Return status of fault handlers + * @IOMMU_PAGE_RESP_SUCCESS: Fault has been handled and the page tables + * populated, retry the access. This is "Success" in PCI PRI. + * @IOMMU_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from + * this device if possible. This is "Response Failure" in PCI PRI. + * @IOMMU_PAGE_RESP_INVALID: Could not handle this fault, don't retry the + * access. This is "Invalid Request" in PCI PRI. + */ +enum iommu_page_response_code { + IOMMU_PAGE_RESP_SUCCESS = 0, + IOMMU_PAGE_RESP_INVALID, + IOMMU_PAGE_RESP_FAILURE, +}; + +/** + * struct iommu_page_response - Generic page response information + * @argsz: User filled size of this data + * @version: API version of this structure + * @flags: encodes whether the corresponding fields are valid + * (IOMMU_FAULT_PAGE_RESPONSE_* values) + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @code: response code from &enum iommu_page_response_code + */ +struct iommu_page_response { + __u32 argsz; +#define IOMMU_PAGE_RESP_VERSION_1 1 + __u32 version; +#define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) + __u32 flags; + __u32 pasid; + __u32 grpid; + __u32 code; +}; + + /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 #define IOMMU_FAULT_WRITE 0x1 diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h deleted file mode 100644 index 65d8b0234f69..000000000000 --- a/include/uapi/linux/iommu.h +++ /dev/null @@ -1,161 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * IOMMU user API definitions - */ - -#ifndef _UAPI_IOMMU_H -#define _UAPI_IOMMU_H - -#include - -#define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ -#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ -#define IOMMU_FAULT_PERM_EXEC (1 << 2) /* exec */ -#define IOMMU_FAULT_PERM_PRIV (1 << 3) /* privileged */ - -/* Generic fault types, can be expanded IRQ remapping fault */ -enum iommu_fault_type { - IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */ - IOMMU_FAULT_PAGE_REQ, /* page request fault */ -}; - -enum iommu_fault_reason { - IOMMU_FAULT_REASON_UNKNOWN = 0, - - /* Could not access the PASID table (fetch caused external abort) */ - IOMMU_FAULT_REASON_PASID_FETCH, - - /* PASID entry is invalid or has configuration errors */ - IOMMU_FAULT_REASON_BAD_PASID_ENTRY, - - /* - * PASID is out of range (e.g. exceeds the maximum PASID - * supported by the IOMMU) or disabled. - */ - IOMMU_FAULT_REASON_PASID_INVALID, - - /* - * An external abort occurred fetching (or updating) a translation - * table descriptor - */ - IOMMU_FAULT_REASON_WALK_EABT, - - /* - * Could not access the page table entry (Bad address), - * actual translation fault - */ - IOMMU_FAULT_REASON_PTE_FETCH, - - /* Protection flag check failed */ - IOMMU_FAULT_REASON_PERMISSION, - - /* access flag check failed */ - IOMMU_FAULT_REASON_ACCESS, - - /* Output address of a translation stage caused Address Size fault */ - IOMMU_FAULT_REASON_OOR_ADDRESS, -}; - -/** - * struct iommu_fault_unrecoverable - Unrecoverable fault data - * @reason: reason of the fault, from &enum iommu_fault_reason - * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values) - * @pasid: Process Address Space ID - * @perm: requested permission access using by the incoming transaction - * (IOMMU_FAULT_PERM_* values) - * @addr: offending page address - * @fetch_addr: address that caused a fetch abort, if any - */ -struct iommu_fault_unrecoverable { - __u32 reason; -#define IOMMU_FAULT_UNRECOV_PASID_VALID (1 << 0) -#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1) -#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID (1 << 2) - __u32 flags; - __u32 pasid; - __u32 perm; - __u64 addr; - __u64 fetch_addr; -}; - -/** - * struct iommu_fault_page_request - Page Request data - * @flags: encodes whether the corresponding fields are valid and whether this - * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values). - * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response - * must have the same PASID value as the page request. When it is clear, - * the page response should not have a PASID. - * @pasid: Process Address Space ID - * @grpid: Page Request Group Index - * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) - * @addr: page address - * @private_data: device-specific private information - */ -struct iommu_fault_page_request { -#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) -#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) -#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) -#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) - __u32 flags; - __u32 pasid; - __u32 grpid; - __u32 perm; - __u64 addr; - __u64 private_data[2]; -}; - -/** - * struct iommu_fault - Generic fault data - * @type: fault type from &enum iommu_fault_type - * @padding: reserved for future use (should be zero) - * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV - * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ - * @padding2: sets the fault size to allow for future extensions - */ -struct iommu_fault { - __u32 type; - __u32 padding; - union { - struct iommu_fault_unrecoverable event; - struct iommu_fault_page_request prm; - __u8 padding2[56]; - }; -}; - -/** - * enum iommu_page_response_code - Return status of fault handlers - * @IOMMU_PAGE_RESP_SUCCESS: Fault has been handled and the page tables - * populated, retry the access. This is "Success" in PCI PRI. - * @IOMMU_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from - * this device if possible. This is "Response Failure" in PCI PRI. - * @IOMMU_PAGE_RESP_INVALID: Could not handle this fault, don't retry the - * access. This is "Invalid Request" in PCI PRI. - */ -enum iommu_page_response_code { - IOMMU_PAGE_RESP_SUCCESS = 0, - IOMMU_PAGE_RESP_INVALID, - IOMMU_PAGE_RESP_FAILURE, -}; - -/** - * struct iommu_page_response - Generic page response information - * @argsz: User filled size of this data - * @version: API version of this structure - * @flags: encodes whether the corresponding fields are valid - * (IOMMU_FAULT_PAGE_RESPONSE_* values) - * @pasid: Process Address Space ID - * @grpid: Page Request Group Index - * @code: response code from &enum iommu_page_response_code - */ -struct iommu_page_response { - __u32 argsz; -#define IOMMU_PAGE_RESP_VERSION_1 1 - __u32 version; -#define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) - __u32 flags; - __u32 pasid; - __u32 grpid; - __u32 code; -}; - -#endif /* _UAPI_IOMMU_H */ -- Gitee From 57c86e7ebf8f49fdf05b2cc0b5b68af23212d72a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:13 +0800 Subject: [PATCH 105/219] iommu/arm-smmu-v3: Remove unrecoverable faults reporting ANBZ: #13617 commit 66014df73b302d326b995178141a150b9a3a52b7 upstream. No device driver registers fault handler to handle the reported unrecoveraable faults. Remove it to avoid dead code. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 46 ++++++--------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index b3513a8bc8ad..1450fcda121f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1472,7 +1472,6 @@ arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid) static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) { int ret; - u32 reason; u32 perm = 0; struct arm_smmu_master *master; bool ssid_valid = evt[0] & EVTQ_0_SSV; @@ -1482,16 +1481,9 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) switch (FIELD_GET(EVTQ_0_ID, evt[0])) { case EVT_ID_TRANSLATION_FAULT: - reason = IOMMU_FAULT_REASON_PTE_FETCH; - break; case EVT_ID_ADDR_SIZE_FAULT: - reason = IOMMU_FAULT_REASON_OOR_ADDRESS; - break; case EVT_ID_ACCESS_FAULT: - reason = IOMMU_FAULT_REASON_ACCESS; - break; case EVT_ID_PERMISSION_FAULT: - reason = IOMMU_FAULT_REASON_PERMISSION; break; default: return -EOPNOTSUPP; @@ -1501,6 +1493,9 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) if (evt[1] & EVTQ_1_S2) return -EFAULT; + if (!(evt[1] & EVTQ_1_STALL)) + return -EOPNOTSUPP; + if (evt[1] & EVTQ_1_RnW) perm |= IOMMU_FAULT_PERM_READ; else @@ -1512,32 +1507,17 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) if (evt[1] & EVTQ_1_PnU) perm |= IOMMU_FAULT_PERM_PRIV; - if (evt[1] & EVTQ_1_STALL) { - flt->type = IOMMU_FAULT_PAGE_REQ; - flt->prm = (struct iommu_fault_page_request) { - .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, - .grpid = FIELD_GET(EVTQ_1_STAG, evt[1]), - .perm = perm, - .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]), - }; - - if (ssid_valid) { - flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - flt->prm.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]); - } - } else { - flt->type = IOMMU_FAULT_DMA_UNRECOV; - flt->event = (struct iommu_fault_unrecoverable) { - .reason = reason, - .flags = IOMMU_FAULT_UNRECOV_ADDR_VALID, - .perm = perm, - .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]), - }; + flt->type = IOMMU_FAULT_PAGE_REQ; + flt->prm = (struct iommu_fault_page_request) { + .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, + .grpid = FIELD_GET(EVTQ_1_STAG, evt[1]), + .perm = perm, + .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]), + }; - if (ssid_valid) { - flt->event.flags |= IOMMU_FAULT_UNRECOV_PASID_VALID; - flt->event.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]); - } + if (ssid_valid) { + flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + flt->prm.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]); } mutex_lock(&smmu->streams_mutex); -- Gitee From 81ab38f5971ad4d1d4fd97db1881cdac8a014d5d Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:14 +0800 Subject: [PATCH 106/219] iommu: Remove unrecoverable fault data ANBZ: #13617 commit 0edeab66eba88947dabe8634a3efd136cc771750 upstream. The unrecoverable fault data is not used anywhere. Remove it to avoid dead code. Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- include/linux/iommu.h | 72 ++----------------------------------------- 1 file changed, 2 insertions(+), 70 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f4be71f16d0a..2d687da4dd12 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -50,67 +50,7 @@ struct iommu_dma_cookie; /* Generic fault types, can be expanded IRQ remapping fault */ enum iommu_fault_type { - IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */ - IOMMU_FAULT_PAGE_REQ, /* page request fault */ -}; - -enum iommu_fault_reason { - IOMMU_FAULT_REASON_UNKNOWN = 0, - - /* Could not access the PASID table (fetch caused external abort) */ - IOMMU_FAULT_REASON_PASID_FETCH, - - /* PASID entry is invalid or has configuration errors */ - IOMMU_FAULT_REASON_BAD_PASID_ENTRY, - - /* - * PASID is out of range (e.g. exceeds the maximum PASID - * supported by the IOMMU) or disabled. - */ - IOMMU_FAULT_REASON_PASID_INVALID, - - /* - * An external abort occurred fetching (or updating) a translation - * table descriptor - */ - IOMMU_FAULT_REASON_WALK_EABT, - - /* - * Could not access the page table entry (Bad address), - * actual translation fault - */ - IOMMU_FAULT_REASON_PTE_FETCH, - - /* Protection flag check failed */ - IOMMU_FAULT_REASON_PERMISSION, - - /* access flag check failed */ - IOMMU_FAULT_REASON_ACCESS, - - /* Output address of a translation stage caused Address Size fault */ - IOMMU_FAULT_REASON_OOR_ADDRESS, -}; - -/** - * struct iommu_fault_unrecoverable - Unrecoverable fault data - * @reason: reason of the fault, from &enum iommu_fault_reason - * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values) - * @pasid: Process Address Space ID - * @perm: requested permission access using by the incoming transaction - * (IOMMU_FAULT_PERM_* values) - * @addr: offending page address - * @fetch_addr: address that caused a fetch abort, if any - */ -struct iommu_fault_unrecoverable { - __u32 reason; -#define IOMMU_FAULT_UNRECOV_PASID_VALID (1 << 0) -#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1) -#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID (1 << 2) - __u32 flags; - __u32 pasid; - __u32 perm; - __u64 addr; - __u64 fetch_addr; + IOMMU_FAULT_PAGE_REQ = 1, /* page request fault */ }; /** @@ -142,19 +82,11 @@ struct iommu_fault_page_request { /** * struct iommu_fault - Generic fault data * @type: fault type from &enum iommu_fault_type - * @padding: reserved for future use (should be zero) - * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ - * @padding2: sets the fault size to allow for future extensions */ struct iommu_fault { __u32 type; - __u32 padding; - union { - struct iommu_fault_unrecoverable event; - struct iommu_fault_page_request prm; - __u8 padding2[56]; - }; + struct iommu_fault_page_request prm; }; /** -- Gitee From f2f588865cf13b95f2b2585a24d1f590a41d61b0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:15 +0800 Subject: [PATCH 107/219] iommu: Cleanup iopf data structure definitions ANBZ: #13617 commit 8b32a3bea2629049c484f595af7aad797e24453e upstream. struct iommu_fault_page_request and struct iommu_page_response are not part of uAPI anymore. Convert them to data structures for kAPI. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/io-pgfault.c | 1 - drivers/iommu/iommu.c | 4 ---- include/linux/iommu.h | 27 +++++++++++---------------- 3 files changed, 11 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index e5b8b9110c13..24b5545352ae 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -56,7 +56,6 @@ static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, enum iommu_page_response_code status) { struct iommu_page_response resp = { - .version = IOMMU_PAGE_RESP_VERSION_1, .pasid = iopf->fault.prm.pasid, .grpid = iopf->fault.prm.grpid, .code = status, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 065bb3b0f63c..0b92aa476037 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1524,10 +1524,6 @@ int iommu_page_response(struct device *dev, if (!param || !param->fault_param) return -EINVAL; - if (msg->version != IOMMU_PAGE_RESP_VERSION_1 || - msg->flags & ~IOMMU_PAGE_RESP_PASID_VALID) - return -EINVAL; - /* Only send response if there is a fault report pending */ mutex_lock(¶m->fault_param->lock); if (list_empty(¶m->fault_param->faults)) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2d687da4dd12..5ee20e461a08 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -71,12 +71,12 @@ struct iommu_fault_page_request { #define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) #define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) #define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) - __u32 flags; - __u32 pasid; - __u32 grpid; - __u32 perm; - __u64 addr; - __u64 private_data[2]; + u32 flags; + u32 pasid; + u32 grpid; + u32 perm; + u64 addr; + u64 private_data[2]; }; /** @@ -85,7 +85,7 @@ struct iommu_fault_page_request { * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ */ struct iommu_fault { - __u32 type; + u32 type; struct iommu_fault_page_request prm; }; @@ -106,8 +106,6 @@ enum iommu_page_response_code { /** * struct iommu_page_response - Generic page response information - * @argsz: User filled size of this data - * @version: API version of this structure * @flags: encodes whether the corresponding fields are valid * (IOMMU_FAULT_PAGE_RESPONSE_* values) * @pasid: Process Address Space ID @@ -115,14 +113,11 @@ enum iommu_page_response_code { * @code: response code from &enum iommu_page_response_code */ struct iommu_page_response { - __u32 argsz; -#define IOMMU_PAGE_RESP_VERSION_1 1 - __u32 version; #define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) - __u32 flags; - __u32 pasid; - __u32 grpid; - __u32 code; + u32 flags; + u32 pasid; + u32 grpid; + u32 code; }; -- Gitee From 3d5e58241c1258c5ba2104f76afe6074355543ea Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:16 +0800 Subject: [PATCH 108/219] iommu: Merge iopf_device_param into iommu_fault_param ANBZ: #13617 commit 15fc60cdd2d236a73b32c99d21fc0f7b7ce6cbbb upstream. The struct dev_iommu contains two pointers, fault_param and iopf_param. The fault_param pointer points to a data structure that is used to store pending faults that are awaiting responses. The iopf_param pointer points to a data structure that is used to store partial faults that are part of a Page Request Group. The fault_param and iopf_param pointers are essentially duplicate. This causes memory waste. Merge the iopf_device_param pointer into the iommu_fault_param pointer to consolidate the code and save memory. The consolidated pointer would be allocated on demand when the device driver enables the iopf on device, and would be freed after iopf is disabled. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/io-pgfault.c | 110 ++++++++++++++++++------------------- drivers/iommu/iommu.c | 34 ++---------- include/linux/iommu.h | 18 ++++-- 3 files changed, 72 insertions(+), 90 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 24b5545352ae..f948303b2a91 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -25,21 +25,6 @@ struct iopf_queue { struct mutex lock; }; -/** - * struct iopf_device_param - IO Page Fault data attached to a device - * @dev: the device that owns this param - * @queue: IOPF queue - * @queue_list: index into queue->devices - * @partial: faults that are part of a Page Request Group for which the last - * request hasn't been submitted yet. - */ -struct iopf_device_param { - struct device *dev; - struct iopf_queue *queue; - struct list_head queue_list; - struct list_head partial; -}; - struct iopf_fault { struct iommu_fault fault; struct list_head list; @@ -144,7 +129,7 @@ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) int ret; struct iopf_group *group; struct iopf_fault *iopf, *next; - struct iopf_device_param *iopf_param; + struct iommu_fault_param *iopf_param; struct device *dev = cookie; struct dev_iommu *param = dev->iommu; @@ -159,7 +144,7 @@ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) * As long as we're holding param->lock, the queue can't be unlinked * from the device and therefore cannot disappear. */ - iopf_param = param->iopf_param; + iopf_param = param->fault_param; if (!iopf_param) return -ENODEV; @@ -229,14 +214,14 @@ EXPORT_SYMBOL_GPL(iommu_queue_iopf); int iopf_queue_flush_dev(struct device *dev) { int ret = 0; - struct iopf_device_param *iopf_param; + struct iommu_fault_param *iopf_param; struct dev_iommu *param = dev->iommu; if (!param) return -ENODEV; mutex_lock(¶m->lock); - iopf_param = param->iopf_param; + iopf_param = param->fault_param; if (iopf_param) flush_workqueue(iopf_param->queue->wq); else @@ -260,7 +245,7 @@ EXPORT_SYMBOL_GPL(iopf_queue_flush_dev); int iopf_queue_discard_partial(struct iopf_queue *queue) { struct iopf_fault *iopf, *next; - struct iopf_device_param *iopf_param; + struct iommu_fault_param *iopf_param; if (!queue) return -EINVAL; @@ -287,34 +272,36 @@ EXPORT_SYMBOL_GPL(iopf_queue_discard_partial); */ int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) { - int ret = -EBUSY; - struct iopf_device_param *iopf_param; + int ret = 0; struct dev_iommu *param = dev->iommu; - - if (!param) - return -ENODEV; - - iopf_param = kzalloc(sizeof(*iopf_param), GFP_KERNEL); - if (!iopf_param) - return -ENOMEM; - - INIT_LIST_HEAD(&iopf_param->partial); - iopf_param->queue = queue; - iopf_param->dev = dev; + struct iommu_fault_param *fault_param; mutex_lock(&queue->lock); mutex_lock(¶m->lock); - if (!param->iopf_param) { - list_add(&iopf_param->queue_list, &queue->devices); - param->iopf_param = iopf_param; - ret = 0; + if (param->fault_param) { + ret = -EBUSY; + goto done_unlock; } + + fault_param = kzalloc(sizeof(*fault_param), GFP_KERNEL); + if (!fault_param) { + ret = -ENOMEM; + goto done_unlock; + } + + mutex_init(&fault_param->lock); + INIT_LIST_HEAD(&fault_param->faults); + INIT_LIST_HEAD(&fault_param->partial); + fault_param->dev = dev; + list_add(&fault_param->queue_list, &queue->devices); + fault_param->queue = queue; + + param->fault_param = fault_param; + +done_unlock: mutex_unlock(¶m->lock); mutex_unlock(&queue->lock); - if (ret) - kfree(iopf_param); - return ret; } EXPORT_SYMBOL_GPL(iopf_queue_add_device); @@ -330,34 +317,41 @@ EXPORT_SYMBOL_GPL(iopf_queue_add_device); */ int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) { - int ret = -EINVAL; + int ret = 0; struct iopf_fault *iopf, *next; - struct iopf_device_param *iopf_param; struct dev_iommu *param = dev->iommu; - - if (!param || !queue) - return -EINVAL; + struct iommu_fault_param *fault_param = param->fault_param; mutex_lock(&queue->lock); mutex_lock(¶m->lock); - iopf_param = param->iopf_param; - if (iopf_param && iopf_param->queue == queue) { - list_del(&iopf_param->queue_list); - param->iopf_param = NULL; - ret = 0; + if (!fault_param) { + ret = -ENODEV; + goto unlock; } - mutex_unlock(¶m->lock); - mutex_unlock(&queue->lock); - if (ret) - return ret; + + if (fault_param->queue != queue) { + ret = -EINVAL; + goto unlock; + } + + if (!list_empty(&fault_param->faults)) { + ret = -EBUSY; + goto unlock; + } + + list_del(&fault_param->queue_list); /* Just in case some faults are still stuck */ - list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) + list_for_each_entry_safe(iopf, next, &fault_param->partial, list) kfree(iopf); - kfree(iopf_param); + param->fault_param = NULL; + kfree(fault_param); +unlock: + mutex_unlock(¶m->lock); + mutex_unlock(&queue->lock); - return 0; + return ret; } EXPORT_SYMBOL_GPL(iopf_queue_remove_device); @@ -403,7 +397,7 @@ EXPORT_SYMBOL_GPL(iopf_queue_alloc); */ void iopf_queue_free(struct iopf_queue *queue) { - struct iopf_device_param *iopf_param, *next; + struct iommu_fault_param *iopf_param, *next; if (!queue) return; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 0b92aa476037..2afdfa952287 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1385,27 +1385,18 @@ int iommu_register_device_fault_handler(struct device *dev, struct dev_iommu *param = dev->iommu; int ret = 0; - if (!param) + if (!param || !param->fault_param) return -EINVAL; mutex_lock(¶m->lock); /* Only allow one fault handler registered for each device */ - if (param->fault_param) { + if (param->fault_param->handler) { ret = -EBUSY; goto done_unlock; } - get_device(dev); - param->fault_param = kzalloc(sizeof(*param->fault_param), GFP_KERNEL); - if (!param->fault_param) { - put_device(dev); - ret = -ENOMEM; - goto done_unlock; - } param->fault_param->handler = handler; param->fault_param->data = data; - mutex_init(¶m->fault_param->lock); - INIT_LIST_HEAD(¶m->fault_param->faults); done_unlock: mutex_unlock(¶m->lock); @@ -1426,29 +1417,16 @@ EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler); int iommu_unregister_device_fault_handler(struct device *dev) { struct dev_iommu *param = dev->iommu; - int ret = 0; - if (!param) + if (!param || !param->fault_param) return -EINVAL; mutex_lock(¶m->lock); - - if (!param->fault_param) - goto unlock; - - /* we cannot unregister handler if there are pending faults */ - if (!list_empty(¶m->fault_param->faults)) { - ret = -EBUSY; - goto unlock; - } - - kfree(param->fault_param); - param->fault_param = NULL; - put_device(dev); -unlock: + param->fault_param->handler = NULL; + param->fault_param->data = NULL; mutex_unlock(¶m->lock); - return ret; + return 0; } EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5ee20e461a08..c384f645c373 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -42,6 +42,7 @@ struct notifier_block; struct iommu_sva; struct iommu_fault_event; struct iommu_dma_cookie; +struct iopf_queue; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -689,21 +690,31 @@ struct iommu_fault_event { * struct iommu_fault_param - per-device IOMMU fault data * @handler: Callback function to handle IOMMU faults at device level * @data: handler private data - * @faults: holds the pending faults which needs response * @lock: protect pending faults list + * @dev: the device that owns this param + * @queue: IOPF queue + * @queue_list: index into queue->devices + * @partial: faults that are part of a Page Request Group for which the last + * request hasn't been submitted yet. + * @faults: holds the pending faults which need response */ struct iommu_fault_param { iommu_dev_fault_handler_t handler; void *data; - struct list_head faults; struct mutex lock; + + struct device *dev; + struct iopf_queue *queue; + struct list_head queue_list; + + struct list_head partial; + struct list_head faults; }; /** * struct dev_iommu - Collection of per-device IOMMU data * * @fault_param: IOMMU detected device fault reporting data - * @iopf_param: I/O Page Fault queue and data * @fwspec: IOMMU fwspec data * @iommu_dev: IOMMU device this device is linked to * @priv: IOMMU Driver private data @@ -719,7 +730,6 @@ struct iommu_fault_param { struct dev_iommu { struct mutex lock; struct iommu_fault_param *fault_param; - struct iopf_device_param *iopf_param; struct iommu_fwspec *fwspec; struct iommu_device *iommu_dev; void *priv; -- Gitee From 98eb0169c15c994b3b78cf34e62305f69be78a6e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:17 +0800 Subject: [PATCH 109/219] iommu: Remove iommu_[un]register_device_fault_handler() ANBZ: #13617 commit 1ff25d798e52943d037accf15c675a6845d9776f upstream. The individual iommu driver reports the iommu page faults by calling iommu_report_device_fault(), where a pre-registered device fault handler is called to route the fault to another fault handler installed on the corresponding iommu domain. The pre-registered device fault handler is static and won't be dynamic as the fault handler is eventually per iommu domain. Replace calling device fault handler with iommu_queue_iopf(). After this replacement, the registering and unregistering fault handler interfaces are not needed anywhere. Remove the interfaces and the related data structures to avoid dead code. Convert cookie parameter of iommu_queue_iopf() into a device pointer that is really passed. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 13 +--- drivers/iommu/intel/iommu.c | 24 ++---- drivers/iommu/io-pgfault.c | 6 +- drivers/iommu/iommu-sva.h | 4 +- drivers/iommu/iommu.c | 76 +------------------ include/linux/iommu.h | 23 ------ 6 files changed, 13 insertions(+), 133 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 05722121f00e..ab2b0a5e4369 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -487,7 +487,6 @@ bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master) static int arm_smmu_master_sva_enable_iopf(struct arm_smmu_master *master) { - int ret; struct device *dev = master->dev; /* @@ -500,16 +499,7 @@ static int arm_smmu_master_sva_enable_iopf(struct arm_smmu_master *master) if (!master->iopf_enabled) return -EINVAL; - ret = iopf_queue_add_device(master->smmu->evtq.iopf, dev); - if (ret) - return ret; - - ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); - if (ret) { - iopf_queue_remove_device(master->smmu->evtq.iopf, dev); - return ret; - } - return 0; + return iopf_queue_add_device(master->smmu->evtq.iopf, dev); } static void arm_smmu_master_sva_disable_iopf(struct arm_smmu_master *master) @@ -519,7 +509,6 @@ static void arm_smmu_master_sva_disable_iopf(struct arm_smmu_master *master) if (!master->iopf_enabled) return; - iommu_unregister_device_fault_handler(dev); iopf_queue_remove_device(master->smmu->evtq.iopf, dev); } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e30017575c36..ca02366f02d1 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4504,23 +4504,15 @@ static int intel_iommu_enable_iopf(struct device *dev) if (ret) return ret; - ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); - if (ret) - goto iopf_remove_device; - ret = pci_enable_pri(pdev, PRQ_DEPTH); - if (ret) - goto iopf_unregister_handler; + if (ret) { + iopf_queue_remove_device(iommu->iopf_queue, dev); + return ret; + } + info->pri_enabled = 1; return 0; - -iopf_unregister_handler: - iommu_unregister_device_fault_handler(dev); -iopf_remove_device: - iopf_queue_remove_device(iommu->iopf_queue, dev); - - return ret; } static int intel_iommu_disable_iopf(struct device *dev) @@ -4543,11 +4535,9 @@ static int intel_iommu_disable_iopf(struct device *dev) info->pri_enabled = 0; /* - * With PRI disabled and outstanding PRQs drained, unregistering - * fault handler and removing device from iopf queue should never - * fail. + * With PRI disabled and outstanding PRQs drained, removing device + * from iopf queue should never fail. */ - WARN_ON(iommu_unregister_device_fault_handler(dev)); WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev)); return 0; diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index f948303b2a91..4fda01de5589 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -87,7 +87,7 @@ static void iopf_handler(struct work_struct *work) /** * iommu_queue_iopf - IO Page Fault handler * @fault: fault event - * @cookie: struct device, passed to iommu_register_device_fault_handler. + * @dev: struct device. * * Add a fault to the device workqueue, to be handled by mm. * @@ -124,14 +124,12 @@ static void iopf_handler(struct work_struct *work) * * Return: 0 on success and <0 on error. */ -int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) +int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) { int ret; struct iopf_group *group; struct iopf_fault *iopf, *next; struct iommu_fault_param *iopf_param; - - struct device *dev = cookie; struct dev_iommu *param = dev->iommu; lockdep_assert_held(¶m->lock); diff --git a/drivers/iommu/iommu-sva.h b/drivers/iommu/iommu-sva.h index 54946b5a7caf..de7819c796ce 100644 --- a/drivers/iommu/iommu-sva.h +++ b/drivers/iommu/iommu-sva.h @@ -13,7 +13,7 @@ struct iommu_fault; struct iopf_queue; #ifdef CONFIG_IOMMU_SVA -int iommu_queue_iopf(struct iommu_fault *fault, void *cookie); +int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev); int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev); int iopf_queue_remove_device(struct iopf_queue *queue, @@ -26,7 +26,7 @@ enum iommu_page_response_code iommu_sva_handle_iopf(struct iommu_fault *fault, void *data); #else /* CONFIG_IOMMU_SVA */ -static inline int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) +static inline int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) { return -ENODEV; } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2afdfa952287..3d360b754000 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1360,76 +1360,6 @@ void iommu_group_put(struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_group_put); -/** - * iommu_register_device_fault_handler() - Register a device fault handler - * @dev: the device - * @handler: the fault handler - * @data: private data passed as argument to the handler - * - * When an IOMMU fault event is received, this handler gets called with the - * fault event and data as argument. The handler should return 0 on success. If - * the fault is recoverable (IOMMU_FAULT_PAGE_REQ), the consumer should also - * complete the fault by calling iommu_page_response() with one of the following - * response code: - * - IOMMU_PAGE_RESP_SUCCESS: retry the translation - * - IOMMU_PAGE_RESP_INVALID: terminate the fault - * - IOMMU_PAGE_RESP_FAILURE: terminate the fault and stop reporting - * page faults if possible. - * - * Return 0 if the fault handler was installed successfully, or an error. - */ -int iommu_register_device_fault_handler(struct device *dev, - iommu_dev_fault_handler_t handler, - void *data) -{ - struct dev_iommu *param = dev->iommu; - int ret = 0; - - if (!param || !param->fault_param) - return -EINVAL; - - mutex_lock(¶m->lock); - /* Only allow one fault handler registered for each device */ - if (param->fault_param->handler) { - ret = -EBUSY; - goto done_unlock; - } - - param->fault_param->handler = handler; - param->fault_param->data = data; - -done_unlock: - mutex_unlock(¶m->lock); - - return ret; -} -EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler); - -/** - * iommu_unregister_device_fault_handler() - Unregister the device fault handler - * @dev: the device - * - * Remove the device fault handler installed with - * iommu_register_device_fault_handler(). - * - * Return 0 on success, or an error. - */ -int iommu_unregister_device_fault_handler(struct device *dev) -{ - struct dev_iommu *param = dev->iommu; - - if (!param || !param->fault_param) - return -EINVAL; - - mutex_lock(¶m->lock); - param->fault_param->handler = NULL; - param->fault_param->data = NULL; - mutex_unlock(¶m->lock); - - return 0; -} -EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler); - /** * iommu_report_device_fault() - Report fault event to device driver * @dev: the device @@ -1454,10 +1384,6 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) /* we only report device fault if there is a handler registered */ mutex_lock(¶m->lock); fparam = param->fault_param; - if (!fparam || !fparam->handler) { - ret = -EINVAL; - goto done_unlock; - } if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { @@ -1472,7 +1398,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) mutex_unlock(&fparam->lock); } - ret = fparam->handler(&evt->fault, fparam->data); + ret = iommu_queue_iopf(&evt->fault, dev); if (ret && evt_pending) { mutex_lock(&fparam->lock); list_del(&evt_pending->list); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c384f645c373..ea1230302c74 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -128,7 +128,6 @@ struct iommu_page_response { typedef int (*iommu_fault_handler_t)(struct iommu_domain *, struct device *, unsigned long, int, void *); -typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault *, void *); struct iommu_domain_geometry { dma_addr_t aperture_start; /* First address that can be mapped */ @@ -688,8 +687,6 @@ struct iommu_fault_event { /** * struct iommu_fault_param - per-device IOMMU fault data - * @handler: Callback function to handle IOMMU faults at device level - * @data: handler private data * @lock: protect pending faults list * @dev: the device that owns this param * @queue: IOPF queue @@ -699,8 +696,6 @@ struct iommu_fault_event { * @faults: holds the pending faults which need response */ struct iommu_fault_param { - iommu_dev_fault_handler_t handler; - void *data; struct mutex lock; struct device *dev; @@ -839,11 +834,6 @@ extern int iommu_group_for_each_dev(struct iommu_group *group, void *data, extern struct iommu_group *iommu_group_get(struct device *dev); extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); -extern int iommu_register_device_fault_handler(struct device *dev, - iommu_dev_fault_handler_t handler, - void *data); - -extern int iommu_unregister_device_fault_handler(struct device *dev); extern int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt); @@ -1270,19 +1260,6 @@ static inline void iommu_group_put(struct iommu_group *group) { } -static inline -int iommu_register_device_fault_handler(struct device *dev, - iommu_dev_fault_handler_t handler, - void *data) -{ - return -ENODEV; -} - -static inline int iommu_unregister_device_fault_handler(struct device *dev) -{ - return 0; -} - static inline int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) { -- Gitee From b43de42c0fb64683d1c022bf8e46ac6e0d0dfe0b Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:18 +0800 Subject: [PATCH 110/219] iommu: Merge iommu_fault_event and iopf_fault ANBZ: #13617 commit 3f02a9dc70007c0e6299fda9c4f7a1e2277ec3d2 upstream. The iommu_fault_event and iopf_fault data structures store the same information about an iopf fault. They are also used in the same way. Merge these two data structures into a single one to make the code more concise and easier to maintain. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 4 +-- drivers/iommu/intel/iommu.h | 2 +- drivers/iommu/intel/svm.c | 5 ++-- drivers/iommu/io-pgfault.c | 5 ---- drivers/iommu/iommu.c | 8 +++--- include/linux/iommu.h | 27 ++++++--------------- 6 files changed, 17 insertions(+), 34 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 1450fcda121f..ad862163f1e0 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -922,7 +922,7 @@ static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu, } static int arm_smmu_page_response(struct device *dev, - struct iommu_fault_event *unused, + struct iopf_fault *unused, struct iommu_page_response *resp) { struct arm_smmu_cmdq_ent cmd = {0}; @@ -1476,7 +1476,7 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) struct arm_smmu_master *master; bool ssid_valid = evt[0] & EVTQ_0_SSV; u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]); - struct iommu_fault_event fault_evt = { }; + struct iopf_fault fault_evt = { }; struct iommu_fault *flt = &fault_evt.fault; switch (FIELD_GET(EVTQ_0_ID, evt[0])) { diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 470dc0437000..457a00748730 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1077,7 +1077,7 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, void intel_svm_check(struct intel_iommu *iommu); int intel_svm_enable_prq(struct intel_iommu *iommu); int intel_svm_finish_prq(struct intel_iommu *iommu); -int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, +int intel_svm_page_response(struct device *dev, struct iopf_fault *evt, struct iommu_page_response *msg); struct iommu_domain *intel_svm_domain_alloc(void); void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index d4cf8bc3d02d..2ca8532497e2 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -564,13 +564,12 @@ static int prq_to_iommu_prot(struct page_req_dsc *req) static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, struct page_req_dsc *desc) { - struct iommu_fault_event event; + struct iopf_fault event = { }; if (!dev || !dev_is_pci(dev)) return -ENODEV; /* Fill in event data for device specific processing */ - memset(&event, 0, sizeof(struct iommu_fault_event)); event.fault.type = IOMMU_FAULT_PAGE_REQ; event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; event.fault.prm.pasid = desc->pasid; @@ -742,7 +741,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) } int intel_svm_page_response(struct device *dev, - struct iommu_fault_event *evt, + struct iopf_fault *evt, struct iommu_page_response *msg) { struct device_domain_info *info = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 4fda01de5589..10d48eb72608 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -25,11 +25,6 @@ struct iopf_queue { struct mutex lock; }; -struct iopf_fault { - struct iommu_fault fault; - struct list_head list; -}; - struct iopf_group { struct iopf_fault last_fault; struct list_head faults; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3d360b754000..264d09aa6cc0 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1371,10 +1371,10 @@ EXPORT_SYMBOL_GPL(iommu_group_put); * * Return 0 on success, or an error. */ -int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { struct dev_iommu *param = dev->iommu; - struct iommu_fault_event *evt_pending = NULL; + struct iopf_fault *evt_pending = NULL; struct iommu_fault_param *fparam; int ret = 0; @@ -1387,7 +1387,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { - evt_pending = kmemdup(evt, sizeof(struct iommu_fault_event), + evt_pending = kmemdup(evt, sizeof(struct iopf_fault), GFP_KERNEL); if (!evt_pending) { ret = -ENOMEM; @@ -1416,7 +1416,7 @@ int iommu_page_response(struct device *dev, { bool needs_pasid; int ret = -EINVAL; - struct iommu_fault_event *evt; + struct iopf_fault *evt; struct iommu_fault_page_request *prm; struct dev_iommu *param = dev->iommu; const struct iommu_ops *ops = dev_iommu_ops(dev); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ea1230302c74..bbf31df06121 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -40,7 +40,6 @@ struct iommu_domain_ops; struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; -struct iommu_fault_event; struct iommu_dma_cookie; struct iopf_queue; @@ -121,6 +120,11 @@ struct iommu_page_response { u32 code; }; +struct iopf_fault { + struct iommu_fault fault; + /* node for pending lists */ + struct list_head list; +}; /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 @@ -561,7 +565,7 @@ struct iommu_ops { int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); int (*page_response)(struct device *dev, - struct iommu_fault_event *evt, + struct iopf_fault *evt, struct iommu_page_response *msg); int (*def_domain_type)(struct device *dev); @@ -671,20 +675,6 @@ struct iommu_device { u32 max_pasids; }; -/** - * struct iommu_fault_event - Generic fault event - * - * Can represent recoverable faults such as a page requests or - * unrecoverable faults such as DMA or IRQ remapping faults. - * - * @fault: fault descriptor - * @list: pending fault event list, used for tracking responses - */ -struct iommu_fault_event { - struct iommu_fault fault; - struct list_head list; -}; - /** * struct iommu_fault_param - per-device IOMMU fault data * @lock: protect pending faults list @@ -835,8 +825,7 @@ extern struct iommu_group *iommu_group_get(struct device *dev); extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); -extern int iommu_report_device_fault(struct device *dev, - struct iommu_fault_event *evt); +extern int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); extern int iommu_page_response(struct device *dev, struct iommu_page_response *msg); @@ -1261,7 +1250,7 @@ static inline void iommu_group_put(struct iommu_group *group) } static inline -int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { return -ENODEV; } -- Gitee From b5e86530c207d295f9a7e336f65a6fa9351ba486 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:19 +0800 Subject: [PATCH 111/219] iommu: Prepare for separating SVA and IOPF ANBZ: #13617 commit 24b5d268b5ab95c12b5ae58a054d04bfa442f58f upstream. Move iopf_group data structure to iommu.h to make it a minimal set of faults that a domain's page fault handler should handle. Add a new function, iopf_free_group(), to free a fault group after all faults in the group are handled. This function will be made global so that it can be called from other files, such as iommu-sva.c. Move iopf_queue data structure to iommu.h to allow the workqueue to be scheduled out of this file. This will simplify the sequential patches. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/io-pgfault.c | 39 ++++++++++++++------------------------ include/linux/iommu.h | 20 ++++++++++++++++++- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 10d48eb72608..c7e6bbed5c05 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -13,24 +13,17 @@ #include "iommu-sva.h" -/** - * struct iopf_queue - IO Page Fault queue - * @wq: the fault workqueue - * @devices: devices attached to this queue - * @lock: protects the device list - */ -struct iopf_queue { - struct workqueue_struct *wq; - struct list_head devices; - struct mutex lock; -}; - -struct iopf_group { - struct iopf_fault last_fault; - struct list_head faults; - struct work_struct work; - struct device *dev; -}; +static void iopf_free_group(struct iopf_group *group) +{ + struct iopf_fault *iopf, *next; + + list_for_each_entry_safe(iopf, next, &group->faults, list) { + if (!(iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) + kfree(iopf); + } + + kfree(group); +} static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, enum iommu_page_response_code status) @@ -50,9 +43,9 @@ static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, static void iopf_handler(struct work_struct *work) { + struct iopf_fault *iopf; struct iopf_group *group; struct iommu_domain *domain; - struct iopf_fault *iopf, *next; enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; group = container_of(work, struct iopf_group, work); @@ -61,7 +54,7 @@ static void iopf_handler(struct work_struct *work) if (!domain || !domain->iopf_handler) status = IOMMU_PAGE_RESP_INVALID; - list_for_each_entry_safe(iopf, next, &group->faults, list) { + list_for_each_entry(iopf, &group->faults, list) { /* * For the moment, errors are sticky: don't handle subsequent * faults in the group if there is an error. @@ -69,14 +62,10 @@ static void iopf_handler(struct work_struct *work) if (status == IOMMU_PAGE_RESP_SUCCESS) status = domain->iopf_handler(&iopf->fault, domain->fault_data); - - if (!(iopf->fault.prm.flags & - IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) - kfree(iopf); } iopf_complete_group(group->dev, &group->last_fault, status); - kfree(group); + iopf_free_group(group); } /** diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bbf31df06121..b6cb39629639 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -41,7 +41,6 @@ struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; -struct iopf_queue; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -126,6 +125,25 @@ struct iopf_fault { struct list_head list; }; +struct iopf_group { + struct iopf_fault last_fault; + struct list_head faults; + struct work_struct work; + struct device *dev; +}; + +/** + * struct iopf_queue - IO Page Fault queue + * @wq: the fault workqueue + * @devices: devices attached to this queue + * @lock: protects the device list + */ +struct iopf_queue { + struct workqueue_struct *wq; + struct list_head devices; + struct mutex lock; +}; + /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 #define IOMMU_FAULT_WRITE 0x1 -- Gitee From 60ba6dc2c798ea4d22c5a1f5a912ecca08b53efe Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:20 +0800 Subject: [PATCH 112/219] iommu: Make iommu_queue_iopf() more generic ANBZ: #13617 commit 351ffcb11ca0ff64e399982e279cfa131e7cb1aa upstream. Make iommu_queue_iopf() more generic by making the iopf_group a minimal set of iopf's that an iopf handler of domain should handle and respond to. Add domain parameter to struct iopf_group so that the handler can retrieve and use it directly. Change iommu_queue_iopf() to forward groups of iopf's to the domain's iopf handler. This is also a necessary step to decouple the sva iopf handling code from this interface. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/io-pgfault.c | 68 +++++++++++++++++++++++++++++++------- drivers/iommu/iommu-sva.c | 3 +- drivers/iommu/iommu-sva.h | 6 ++-- include/linux/iommu.h | 4 +-- 4 files changed, 61 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index c7e6bbed5c05..13cd0929e766 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -13,6 +13,9 @@ #include "iommu-sva.h" +enum iommu_page_response_code +iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm); + static void iopf_free_group(struct iopf_group *group) { struct iopf_fault *iopf, *next; @@ -45,29 +48,48 @@ static void iopf_handler(struct work_struct *work) { struct iopf_fault *iopf; struct iopf_group *group; - struct iommu_domain *domain; enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; group = container_of(work, struct iopf_group, work); - domain = iommu_get_domain_for_dev_pasid(group->dev, - group->last_fault.fault.prm.pasid, 0); - if (!domain || !domain->iopf_handler) - status = IOMMU_PAGE_RESP_INVALID; - list_for_each_entry(iopf, &group->faults, list) { /* * For the moment, errors are sticky: don't handle subsequent * faults in the group if there is an error. */ - if (status == IOMMU_PAGE_RESP_SUCCESS) - status = domain->iopf_handler(&iopf->fault, - domain->fault_data); + if (status != IOMMU_PAGE_RESP_SUCCESS) + break; + + status = iommu_sva_handle_mm(&iopf->fault, group->domain->mm); } iopf_complete_group(group->dev, &group->last_fault, status); iopf_free_group(group); } +static struct iommu_domain *get_domain_for_iopf(struct device *dev, + struct iommu_fault *fault) +{ + struct iommu_domain *domain; + + if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) { + domain = iommu_get_domain_for_dev_pasid(dev, fault->prm.pasid, 0); + if (IS_ERR(domain)) + domain = NULL; + } else { + domain = iommu_get_domain_for_dev(dev); + } + + if (!domain || !domain->iopf_handler) { + dev_warn_ratelimited(dev, + "iopf (pasid %d) without domain attached or handler installed\n", + fault->prm.pasid); + + return NULL; + } + + return domain; +} + /** * iommu_queue_iopf - IO Page Fault handler * @fault: fault event @@ -112,6 +134,7 @@ int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) { int ret; struct iopf_group *group; + struct iommu_domain *domain; struct iopf_fault *iopf, *next; struct iommu_fault_param *iopf_param; struct dev_iommu *param = dev->iommu; @@ -143,6 +166,12 @@ int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) return 0; } + domain = get_domain_for_iopf(dev, fault); + if (!domain) { + ret = -EINVAL; + goto cleanup_partial; + } + group = kzalloc(sizeof(*group), GFP_KERNEL); if (!group) { /* @@ -157,8 +186,8 @@ int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) group->dev = dev; group->last_fault.fault = *fault; INIT_LIST_HEAD(&group->faults); + group->domain = domain; list_add(&group->last_fault.list, &group->faults); - INIT_WORK(&group->work, iopf_handler); /* See if we have partial faults for this group */ list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { @@ -167,9 +196,13 @@ int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) list_move(&iopf->list, &group->faults); } - queue_work(iopf_param->queue->wq, &group->work); - return 0; + mutex_unlock(&iopf_param->lock); + ret = domain->iopf_handler(group); + mutex_lock(&iopf_param->lock); + if (ret) + iopf_free_group(group); + return ret; cleanup_partial: list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { if (iopf->fault.prm.grpid == fault->prm.grpid) { @@ -181,6 +214,17 @@ int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) } EXPORT_SYMBOL_GPL(iommu_queue_iopf); +int iommu_sva_handle_iopf(struct iopf_group *group) +{ + struct iommu_fault_param *fault_param = group->dev->iommu->fault_param; + + INIT_WORK(&group->work, iopf_handler); + if (!queue_work(fault_param->queue->wq, &group->work)) + return -EBUSY; + + return 0; +} + /** * iopf_queue_flush_dev - Ensure that all queued faults have been processed * @dev: the endpoint whose faults need to be flushed. diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index c3fc9201d0be..fcae7308fcb7 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -163,11 +163,10 @@ EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); * I/O page fault handler for SVA */ enum iommu_page_response_code -iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) +iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm) { vm_fault_t ret; struct vm_area_struct *vma; - struct mm_struct *mm = data; unsigned int access_flags = 0; unsigned int fault_flags = FAULT_FLAG_REMOTE; struct iommu_fault_page_request *prm = &fault->prm; diff --git a/drivers/iommu/iommu-sva.h b/drivers/iommu/iommu-sva.h index de7819c796ce..27c8da115b41 100644 --- a/drivers/iommu/iommu-sva.h +++ b/drivers/iommu/iommu-sva.h @@ -22,8 +22,7 @@ int iopf_queue_flush_dev(struct device *dev); struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); -enum iommu_page_response_code -iommu_sva_handle_iopf(struct iommu_fault *fault, void *data); +int iommu_sva_handle_iopf(struct iopf_group *group); #else /* CONFIG_IOMMU_SVA */ static inline int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) @@ -62,8 +61,7 @@ static inline int iopf_queue_discard_partial(struct iopf_queue *queue) return -ENODEV; } -static inline enum iommu_page_response_code -iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) +static inline int iommu_sva_handle_iopf(struct iopf_group *group) { return IOMMU_PAGE_RESP_INVALID; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b6cb39629639..f6a320a7888b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -130,6 +130,7 @@ struct iopf_group { struct list_head faults; struct work_struct work; struct device *dev; + struct iommu_domain *domain; }; /** @@ -209,8 +210,7 @@ struct iommu_domain { unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; - enum iommu_page_response_code (*iopf_handler)(struct iommu_fault *fault, - void *data); + int (*iopf_handler)(struct iopf_group *group); void *fault_data; union { struct { -- Gitee From 0e60fff008236ac7b1ad9214b36b20a76866db1e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:21 +0800 Subject: [PATCH 113/219] iommu: Separate SVA and IOPF ANBZ: #13617 commit 17c51a0ea36b800e7a5998a92d83016c82935dff upstream. Add CONFIG_IOMMU_IOPF for page fault handling framework and select it from its real consumer. Move iopf function declaration from iommu-sva.h to iommu.h and remove iommu-sva.h as it's empty now. Consolidate all SVA related code into iommu-sva.c: - Move iommu_sva_domain_alloc() from iommu.c to iommu-sva.c. - Move sva iopf handling code from io-pgfault.c to iommu-sva.c. Consolidate iommu_report_device_fault() and iommu_page_response() into io-pgfault.c. Export iopf_free_group() and iopf_group_response() for iopf handlers implemented in modules. Some functions are renamed with more meaningful names. No other intentional functionality changes. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-11-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel [ Shuai Xue: Conflicts: drivers/iommu/intel/iommu.c drivers/iommu/intel/svm.c minor: include conflict ] Signed-off-by: Shuai Xue --- drivers/iommu/Kconfig | 4 + drivers/iommu/Makefile | 3 +- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 1 - drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 - drivers/iommu/intel/Kconfig | 1 + drivers/iommu/intel/iommu.c | 1 - drivers/iommu/intel/svm.c | 1 - drivers/iommu/io-pgfault.c | 188 +++++++++++++----- drivers/iommu/iommu-sva.c | 68 ++++++- drivers/iommu/iommu-sva.h | 69 ------- drivers/iommu/iommu.c | 133 ------------- include/linux/iommu.h | 98 ++++++--- 12 files changed, 277 insertions(+), 291 deletions(-) delete mode 100644 drivers/iommu/iommu-sva.h diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index de8da50b2cd9..6a05de6125ff 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -163,6 +163,9 @@ config IOMMU_SVA select IOMMU_MM_DATA bool +config IOMMU_IOPF + bool + config FSL_PAMU bool "Freescale IOMMU support" depends on PCI @@ -399,6 +402,7 @@ config ARM_SMMU_V3_SVA bool "Shared Virtual Addressing support for the ARM SMMUv3" depends on ARM_SMMU_V3 select IOMMU_SVA + select IOMMU_IOPF select MMU_NOTIFIER help Support for sharing process address spaces with devices using the diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 57fbb37c1115..c936dbfc8387 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -26,7 +26,8 @@ obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o obj-$(CONFIG_S390_IOMMU) += s390-iommu.o obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o -obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o io-pgfault.o +obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o +obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o obj-$(CONFIG_APPLE_DART) += apple-dart.o obj-$(CONFIG_LOONGARCH_IOMMU) += loongarch_iommu.o diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index ab2b0a5e4369..6513a98fcb72 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -10,7 +10,6 @@ #include #include "arm-smmu-v3.h" -#include "../../iommu-sva.h" #include "../../io-pgtable-arm.h" struct arm_smmu_mmu_notifier { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ad862163f1e0..6bbaff674944 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -29,7 +29,6 @@ #include "arm-smmu-v3.h" #include "../../dma-iommu.h" -#include "../../iommu-sva.h" static bool disable_bypass = true; module_param(disable_bypass, bool, 0444); diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index f5348b80652b..f6f46b1b3dad 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -51,6 +51,7 @@ config INTEL_IOMMU_SVM depends on X86_64 select MMU_NOTIFIER select IOMMU_SVA + select IOMMU_IOPF help Shared Virtual Memory (SVM) provides a facility for devices to access DMA resources through process address space by diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index ca02366f02d1..5101570e6f7b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -27,7 +27,6 @@ #include "iommu.h" #include "../dma-iommu.h" #include "../irq_remapping.h" -#include "../iommu-sva.h" #include "../iommu-pages.h" #include "pasid.h" #include "cap_audit.h" diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 2ca8532497e2..07a863635862 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -23,7 +23,6 @@ #include "pasid.h" #include "perf.h" #include "../iommu-pages.h" -#include "../iommu-sva.h" #include "trace.h" static irqreturn_t prq_event_thread(int irq, void *d); diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 13cd0929e766..c1e88da973ce 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -11,12 +11,9 @@ #include #include -#include "iommu-sva.h" +#include "iommu-priv.h" -enum iommu_page_response_code -iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm); - -static void iopf_free_group(struct iopf_group *group) +void iopf_free_group(struct iopf_group *group) { struct iopf_fault *iopf, *next; @@ -27,44 +24,7 @@ static void iopf_free_group(struct iopf_group *group) kfree(group); } - -static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, - enum iommu_page_response_code status) -{ - struct iommu_page_response resp = { - .pasid = iopf->fault.prm.pasid, - .grpid = iopf->fault.prm.grpid, - .code = status, - }; - - if ((iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) && - (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)) - resp.flags = IOMMU_PAGE_RESP_PASID_VALID; - - return iommu_page_response(dev, &resp); -} - -static void iopf_handler(struct work_struct *work) -{ - struct iopf_fault *iopf; - struct iopf_group *group; - enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; - - group = container_of(work, struct iopf_group, work); - list_for_each_entry(iopf, &group->faults, list) { - /* - * For the moment, errors are sticky: don't handle subsequent - * faults in the group if there is an error. - */ - if (status != IOMMU_PAGE_RESP_SUCCESS) - break; - - status = iommu_sva_handle_mm(&iopf->fault, group->domain->mm); - } - - iopf_complete_group(group->dev, &group->last_fault, status); - iopf_free_group(group); -} +EXPORT_SYMBOL_GPL(iopf_free_group); static struct iommu_domain *get_domain_for_iopf(struct device *dev, struct iommu_fault *fault) @@ -91,7 +51,7 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev, } /** - * iommu_queue_iopf - IO Page Fault handler + * iommu_handle_iopf - IO Page Fault handler * @fault: fault event * @dev: struct device. * @@ -130,7 +90,7 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev, * * Return: 0 on success and <0 on error. */ -int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) +static int iommu_handle_iopf(struct iommu_fault *fault, struct device *dev) { int ret; struct iopf_group *group; @@ -212,18 +172,117 @@ int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) } return ret; } -EXPORT_SYMBOL_GPL(iommu_queue_iopf); -int iommu_sva_handle_iopf(struct iopf_group *group) +/** + * iommu_report_device_fault() - Report fault event to device driver + * @dev: the device + * @evt: fault event data + * + * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ + * handler. When this function fails and the fault is recoverable, it is the + * caller's responsibility to complete the fault. + * + * Return 0 on success, or an error. + */ +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { - struct iommu_fault_param *fault_param = group->dev->iommu->fault_param; + struct dev_iommu *param = dev->iommu; + struct iopf_fault *evt_pending = NULL; + struct iommu_fault_param *fparam; + int ret = 0; - INIT_WORK(&group->work, iopf_handler); - if (!queue_work(fault_param->queue->wq, &group->work)) - return -EBUSY; + if (!param || !evt) + return -EINVAL; - return 0; + /* we only report device fault if there is a handler registered */ + mutex_lock(¶m->lock); + fparam = param->fault_param; + + if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && + (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { + evt_pending = kmemdup(evt, sizeof(struct iopf_fault), + GFP_KERNEL); + if (!evt_pending) { + ret = -ENOMEM; + goto done_unlock; + } + mutex_lock(&fparam->lock); + list_add_tail(&evt_pending->list, &fparam->faults); + mutex_unlock(&fparam->lock); + } + + ret = iommu_handle_iopf(&evt->fault, dev); + if (ret && evt_pending) { + mutex_lock(&fparam->lock); + list_del(&evt_pending->list); + mutex_unlock(&fparam->lock); + kfree(evt_pending); + } +done_unlock: + mutex_unlock(¶m->lock); + return ret; +} +EXPORT_SYMBOL_GPL(iommu_report_device_fault); + +int iommu_page_response(struct device *dev, + struct iommu_page_response *msg) +{ + bool needs_pasid; + int ret = -EINVAL; + struct iopf_fault *evt; + struct iommu_fault_page_request *prm; + struct dev_iommu *param = dev->iommu; + const struct iommu_ops *ops = dev_iommu_ops(dev); + bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; + + if (!ops->page_response) + return -ENODEV; + + if (!param || !param->fault_param) + return -EINVAL; + + /* Only send response if there is a fault report pending */ + mutex_lock(¶m->fault_param->lock); + if (list_empty(¶m->fault_param->faults)) { + dev_warn_ratelimited(dev, "no pending PRQ, drop response\n"); + goto done_unlock; + } + /* + * Check if we have a matching page request pending to respond, + * otherwise return -EINVAL + */ + list_for_each_entry(evt, ¶m->fault_param->faults, list) { + prm = &evt->fault.prm; + if (prm->grpid != msg->grpid) + continue; + + /* + * If the PASID is required, the corresponding request is + * matched using the group ID, the PASID valid bit and the PASID + * value. Otherwise only the group ID matches request and + * response. + */ + needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; + if (needs_pasid && (!has_pasid || msg->pasid != prm->pasid)) + continue; + + if (!needs_pasid && has_pasid) { + /* No big deal, just clear it. */ + msg->flags &= ~IOMMU_PAGE_RESP_PASID_VALID; + msg->pasid = 0; + } + + ret = ops->page_response(dev, evt, msg); + list_del(&evt->list); + kfree(evt); + break; + } + +done_unlock: + mutex_unlock(¶m->fault_param->lock); + return ret; } +EXPORT_SYMBOL_GPL(iommu_page_response); /** * iopf_queue_flush_dev - Ensure that all queued faults have been processed @@ -258,6 +317,31 @@ int iopf_queue_flush_dev(struct device *dev) } EXPORT_SYMBOL_GPL(iopf_queue_flush_dev); +/** + * iopf_group_response - Respond a group of page faults + * @group: the group of faults with the same group id + * @status: the response code + * + * Return 0 on success and <0 on error. + */ +int iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status) +{ + struct iopf_fault *iopf = &group->last_fault; + struct iommu_page_response resp = { + .pasid = iopf->fault.prm.pasid, + .grpid = iopf->fault.prm.grpid, + .code = status, + }; + + if ((iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) && + (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)) + resp.flags = IOMMU_PAGE_RESP_PASID_VALID; + + return iommu_page_response(group->dev, &resp); +} +EXPORT_SYMBOL_GPL(iopf_group_response); + /** * iopf_queue_discard_partial - Remove all pending partial fault * @queue: the queue whose partial faults need to be discarded diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index fcae7308fcb7..9de878e40413 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -7,7 +7,7 @@ #include #include -#include "iommu-sva.h" +#include "iommu-priv.h" static DEFINE_MUTEX(iommu_sva_lock); @@ -159,10 +159,21 @@ u32 iommu_sva_get_pasid(struct iommu_sva *handle) } EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); +void mm_pasid_drop(struct mm_struct *mm) +{ + struct iommu_mm_data *iommu_mm = mm->iommu_mm; + + if (!iommu_mm) + return; + + iommu_free_global_pasid(iommu_mm->pasid); + kfree(iommu_mm); +} + /* * I/O page fault handler for SVA */ -enum iommu_page_response_code +static enum iommu_page_response_code iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm) { vm_fault_t ret; @@ -216,13 +227,54 @@ iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm) return status; } -void mm_pasid_drop(struct mm_struct *mm) +static void iommu_sva_handle_iopf(struct work_struct *work) { - struct iommu_mm_data *iommu_mm = mm->iommu_mm; + struct iopf_fault *iopf; + struct iopf_group *group; + enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; + + group = container_of(work, struct iopf_group, work); + list_for_each_entry(iopf, &group->faults, list) { + /* + * For the moment, errors are sticky: don't handle subsequent + * faults in the group if there is an error. + */ + if (status != IOMMU_PAGE_RESP_SUCCESS) + break; + + status = iommu_sva_handle_mm(&iopf->fault, group->domain->mm); + } - if (!iommu_mm) - return; + iopf_group_response(group, status); + iopf_free_group(group); +} - iommu_free_global_pasid(iommu_mm->pasid); - kfree(iommu_mm); +static int iommu_sva_iopf_handler(struct iopf_group *group) +{ + struct iommu_fault_param *fault_param = group->dev->iommu->fault_param; + + INIT_WORK(&group->work, iommu_sva_handle_iopf); + if (!queue_work(fault_param->queue->wq, &group->work)) + return -EBUSY; + + return 0; +} + +struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_domain *domain; + + domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); + if (!domain) + return NULL; + + domain->type = IOMMU_DOMAIN_SVA; + mmgrab(mm); + domain->mm = mm; + domain->owner = ops; + domain->iopf_handler = iommu_sva_iopf_handler; + + return domain; } diff --git a/drivers/iommu/iommu-sva.h b/drivers/iommu/iommu-sva.h deleted file mode 100644 index 27c8da115b41..000000000000 --- a/drivers/iommu/iommu-sva.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * SVA library for IOMMU drivers - */ -#ifndef _IOMMU_SVA_H -#define _IOMMU_SVA_H - -#include - -/* I/O Page fault */ -struct device; -struct iommu_fault; -struct iopf_queue; - -#ifdef CONFIG_IOMMU_SVA -int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev); - -int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev); -int iopf_queue_remove_device(struct iopf_queue *queue, - struct device *dev); -int iopf_queue_flush_dev(struct device *dev); -struct iopf_queue *iopf_queue_alloc(const char *name); -void iopf_queue_free(struct iopf_queue *queue); -int iopf_queue_discard_partial(struct iopf_queue *queue); -int iommu_sva_handle_iopf(struct iopf_group *group); - -#else /* CONFIG_IOMMU_SVA */ -static inline int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) -{ - return -ENODEV; -} - -static inline int iopf_queue_add_device(struct iopf_queue *queue, - struct device *dev) -{ - return -ENODEV; -} - -static inline int iopf_queue_remove_device(struct iopf_queue *queue, - struct device *dev) -{ - return -ENODEV; -} - -static inline int iopf_queue_flush_dev(struct device *dev) -{ - return -ENODEV; -} - -static inline struct iopf_queue *iopf_queue_alloc(const char *name) -{ - return NULL; -} - -static inline void iopf_queue_free(struct iopf_queue *queue) -{ -} - -static inline int iopf_queue_discard_partial(struct iopf_queue *queue) -{ - return -ENODEV; -} - -static inline int iommu_sva_handle_iopf(struct iopf_group *group) -{ - return IOMMU_PAGE_RESP_INVALID; -} -#endif /* CONFIG_IOMMU_SVA */ -#endif /* _IOMMU_SVA_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 264d09aa6cc0..97eb5742853a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -36,8 +36,6 @@ #include "dma-iommu.h" #include "iommu-priv.h" -#include "iommu-sva.h" - static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); static DEFINE_IDA(iommu_global_pasid_ida); @@ -1360,117 +1358,6 @@ void iommu_group_put(struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_group_put); -/** - * iommu_report_device_fault() - Report fault event to device driver - * @dev: the device - * @evt: fault event data - * - * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ - * handler. When this function fails and the fault is recoverable, it is the - * caller's responsibility to complete the fault. - * - * Return 0 on success, or an error. - */ -int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) -{ - struct dev_iommu *param = dev->iommu; - struct iopf_fault *evt_pending = NULL; - struct iommu_fault_param *fparam; - int ret = 0; - - if (!param || !evt) - return -EINVAL; - - /* we only report device fault if there is a handler registered */ - mutex_lock(¶m->lock); - fparam = param->fault_param; - - if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && - (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { - evt_pending = kmemdup(evt, sizeof(struct iopf_fault), - GFP_KERNEL); - if (!evt_pending) { - ret = -ENOMEM; - goto done_unlock; - } - mutex_lock(&fparam->lock); - list_add_tail(&evt_pending->list, &fparam->faults); - mutex_unlock(&fparam->lock); - } - - ret = iommu_queue_iopf(&evt->fault, dev); - if (ret && evt_pending) { - mutex_lock(&fparam->lock); - list_del(&evt_pending->list); - mutex_unlock(&fparam->lock); - kfree(evt_pending); - } -done_unlock: - mutex_unlock(¶m->lock); - return ret; -} -EXPORT_SYMBOL_GPL(iommu_report_device_fault); - -int iommu_page_response(struct device *dev, - struct iommu_page_response *msg) -{ - bool needs_pasid; - int ret = -EINVAL; - struct iopf_fault *evt; - struct iommu_fault_page_request *prm; - struct dev_iommu *param = dev->iommu; - const struct iommu_ops *ops = dev_iommu_ops(dev); - bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; - - if (!ops->page_response) - return -ENODEV; - - if (!param || !param->fault_param) - return -EINVAL; - - /* Only send response if there is a fault report pending */ - mutex_lock(¶m->fault_param->lock); - if (list_empty(¶m->fault_param->faults)) { - dev_warn_ratelimited(dev, "no pending PRQ, drop response\n"); - goto done_unlock; - } - /* - * Check if we have a matching page request pending to respond, - * otherwise return -EINVAL - */ - list_for_each_entry(evt, ¶m->fault_param->faults, list) { - prm = &evt->fault.prm; - if (prm->grpid != msg->grpid) - continue; - - /* - * If the PASID is required, the corresponding request is - * matched using the group ID, the PASID valid bit and the PASID - * value. Otherwise only the group ID matches request and - * response. - */ - needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; - if (needs_pasid && (!has_pasid || msg->pasid != prm->pasid)) - continue; - - if (!needs_pasid && has_pasid) { - /* No big deal, just clear it. */ - msg->flags &= ~IOMMU_PAGE_RESP_PASID_VALID; - msg->pasid = 0; - } - - ret = ops->page_response(dev, evt, msg); - list_del(&evt->list); - kfree(evt); - break; - } - -done_unlock: - mutex_unlock(¶m->fault_param->lock); - return ret; -} -EXPORT_SYMBOL_GPL(iommu_page_response); - /** * iommu_group_id - Return ID for a group * @group: the group to ID @@ -3569,26 +3456,6 @@ struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev, } EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev_pasid); -struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, - struct mm_struct *mm) -{ - const struct iommu_ops *ops = dev_iommu_ops(dev); - struct iommu_domain *domain; - - domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); - if (!domain) - return NULL; - - domain->type = IOMMU_DOMAIN_SVA; - mmgrab(mm); - domain->mm = mm; - domain->owner = ops; - domain->iopf_handler = iommu_sva_handle_iopf; - domain->fault_data = mm; - - return domain; -} - ioasid_t iommu_alloc_global_pasid(struct device *dev) { int ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f6a320a7888b..ffe81af33c33 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -843,10 +843,6 @@ extern struct iommu_group *iommu_group_get(struct device *dev); extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); -extern int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); -extern int iommu_page_response(struct device *dev, - struct iommu_page_response *msg); - extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); @@ -1077,8 +1073,6 @@ bool iommu_group_dma_owner_claimed(struct iommu_group *group); int iommu_device_claim_dma_owner(struct device *dev, void *owner); void iommu_device_release_dma_owner(struct device *dev); -struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, - struct mm_struct *mm); int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); void iommu_detach_device_pasid(struct iommu_domain *domain, @@ -1267,18 +1261,6 @@ static inline void iommu_group_put(struct iommu_group *group) { } -static inline -int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) -{ - return -ENODEV; -} - -static inline int iommu_page_response(struct device *dev, - struct iommu_page_response *msg) -{ - return -ENODEV; -} - static inline int iommu_group_id(struct iommu_group *group) { return -ENODEV; @@ -1427,12 +1409,6 @@ static inline int iommu_device_claim_dma_owner(struct device *dev, void *owner) return -ENODEV; } -static inline struct iommu_domain * -iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) -{ - return NULL; -} - static inline int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { @@ -1580,6 +1556,8 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); +struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm); #else static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) @@ -1604,6 +1582,78 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) } static inline void mm_pasid_drop(struct mm_struct *mm) {} + +static inline struct iommu_domain * +iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) +{ + return NULL; +} #endif /* CONFIG_IOMMU_SVA */ +#ifdef CONFIG_IOMMU_IOPF +int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev); +int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev); +int iopf_queue_flush_dev(struct device *dev); +struct iopf_queue *iopf_queue_alloc(const char *name); +void iopf_queue_free(struct iopf_queue *queue); +int iopf_queue_discard_partial(struct iopf_queue *queue); +void iopf_free_group(struct iopf_group *group); +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); +int iommu_page_response(struct device *dev, struct iommu_page_response *msg); +int iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status); +#else +static inline int +iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) +{ + return -ENODEV; +} + +static inline int +iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) +{ + return -ENODEV; +} + +static inline int iopf_queue_flush_dev(struct device *dev) +{ + return -ENODEV; +} + +static inline struct iopf_queue *iopf_queue_alloc(const char *name) +{ + return NULL; +} + +static inline void iopf_queue_free(struct iopf_queue *queue) +{ +} + +static inline int iopf_queue_discard_partial(struct iopf_queue *queue) +{ + return -ENODEV; +} + +static inline void iopf_free_group(struct iopf_group *group) +{ +} + +static inline int +iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) +{ + return -ENODEV; +} + +static inline int +iommu_page_response(struct device *dev, struct iommu_page_response *msg) +{ + return -ENODEV; +} + +static inline int iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status) +{ + return -ENODEV; +} +#endif /* CONFIG_IOMMU_IOPF */ #endif /* __LINUX_IOMMU_H */ -- Gitee From 017a1e31457263a1764228cf494f41ce512ad0fb Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:22 +0800 Subject: [PATCH 114/219] iommu: Refine locking for per-device fault data management ANBZ: #13617 commit cc7338e9d807e20e60e6720a62956f0e9d46f0f8 upstream. The per-device fault data is a data structure that is used to store information about faults that occur on a device. This data is allocated when IOPF is enabled on the device and freed when IOPF is disabled. The data is used in the paths of iopf reporting, handling, responding, and draining. The fault data is protected by two locks: - dev->iommu->lock: This lock is used to protect the allocation and freeing of the fault data. - dev->iommu->fault_parameter->lock: This lock is used to protect the fault data itself. Apply the locking mechanism to the fault reporting and responding paths. The fault_parameter->lock is also added in iopf_queue_discard_partial(). It does not fix any real issue, as iopf_queue_discard_partial() is only used in the VT-d driver's prq_event_thread(), which is a single-threaded path that reports the IOPFs. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/io-pgfault.c | 61 +++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index c1e88da973ce..5aea8402be47 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -53,7 +53,7 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev, /** * iommu_handle_iopf - IO Page Fault handler * @fault: fault event - * @dev: struct device. + * @iopf_param: the fault parameter of the device. * * Add a fault to the device workqueue, to be handled by mm. * @@ -90,29 +90,21 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev, * * Return: 0 on success and <0 on error. */ -static int iommu_handle_iopf(struct iommu_fault *fault, struct device *dev) +static int iommu_handle_iopf(struct iommu_fault *fault, + struct iommu_fault_param *iopf_param) { int ret; struct iopf_group *group; struct iommu_domain *domain; struct iopf_fault *iopf, *next; - struct iommu_fault_param *iopf_param; - struct dev_iommu *param = dev->iommu; + struct device *dev = iopf_param->dev; - lockdep_assert_held(¶m->lock); + lockdep_assert_held(&iopf_param->lock); if (fault->type != IOMMU_FAULT_PAGE_REQ) /* Not a recoverable page fault */ return -EOPNOTSUPP; - /* - * As long as we're holding param->lock, the queue can't be unlinked - * from the device and therefore cannot disappear. - */ - iopf_param = param->fault_param; - if (!iopf_param) - return -ENODEV; - if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { iopf = kzalloc(sizeof(*iopf), GFP_KERNEL); if (!iopf) @@ -186,18 +178,19 @@ static int iommu_handle_iopf(struct iommu_fault *fault, struct device *dev) */ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { - struct dev_iommu *param = dev->iommu; + struct iommu_fault_param *fault_param; struct iopf_fault *evt_pending = NULL; - struct iommu_fault_param *fparam; + struct dev_iommu *param = dev->iommu; int ret = 0; - if (!param || !evt) - return -EINVAL; - - /* we only report device fault if there is a handler registered */ mutex_lock(¶m->lock); - fparam = param->fault_param; + fault_param = param->fault_param; + if (!fault_param) { + mutex_unlock(¶m->lock); + return -EINVAL; + } + mutex_lock(&fault_param->lock); if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { evt_pending = kmemdup(evt, sizeof(struct iopf_fault), @@ -206,20 +199,18 @@ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) ret = -ENOMEM; goto done_unlock; } - mutex_lock(&fparam->lock); - list_add_tail(&evt_pending->list, &fparam->faults); - mutex_unlock(&fparam->lock); + list_add_tail(&evt_pending->list, &fault_param->faults); } - ret = iommu_handle_iopf(&evt->fault, dev); + ret = iommu_handle_iopf(&evt->fault, fault_param); if (ret && evt_pending) { - mutex_lock(&fparam->lock); list_del(&evt_pending->list); - mutex_unlock(&fparam->lock); kfree(evt_pending); } done_unlock: + mutex_unlock(&fault_param->lock); mutex_unlock(¶m->lock); + return ret; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); @@ -232,18 +223,23 @@ int iommu_page_response(struct device *dev, struct iopf_fault *evt; struct iommu_fault_page_request *prm; struct dev_iommu *param = dev->iommu; + struct iommu_fault_param *fault_param; const struct iommu_ops *ops = dev_iommu_ops(dev); bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; if (!ops->page_response) return -ENODEV; - if (!param || !param->fault_param) + mutex_lock(¶m->lock); + fault_param = param->fault_param; + if (!fault_param) { + mutex_unlock(¶m->lock); return -EINVAL; + } /* Only send response if there is a fault report pending */ - mutex_lock(¶m->fault_param->lock); - if (list_empty(¶m->fault_param->faults)) { + mutex_lock(&fault_param->lock); + if (list_empty(&fault_param->faults)) { dev_warn_ratelimited(dev, "no pending PRQ, drop response\n"); goto done_unlock; } @@ -251,7 +247,7 @@ int iommu_page_response(struct device *dev, * Check if we have a matching page request pending to respond, * otherwise return -EINVAL */ - list_for_each_entry(evt, ¶m->fault_param->faults, list) { + list_for_each_entry(evt, &fault_param->faults, list) { prm = &evt->fault.prm; if (prm->grpid != msg->grpid) continue; @@ -279,7 +275,8 @@ int iommu_page_response(struct device *dev, } done_unlock: - mutex_unlock(¶m->fault_param->lock); + mutex_unlock(&fault_param->lock); + mutex_unlock(¶m->lock); return ret; } EXPORT_SYMBOL_GPL(iommu_page_response); @@ -362,11 +359,13 @@ int iopf_queue_discard_partial(struct iopf_queue *queue) mutex_lock(&queue->lock); list_for_each_entry(iopf_param, &queue->devices, queue_list) { + mutex_lock(&iopf_param->lock); list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { list_del(&iopf->list); kfree(iopf); } + mutex_unlock(&iopf_param->lock); } mutex_unlock(&queue->lock); return 0; -- Gitee From c3e3f33b0fb1a014cb3fb5b91e955865204628d4 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:23 +0800 Subject: [PATCH 115/219] iommu: Use refcount for fault data access ANBZ: #13617 commit a74c077b9021b36c785095c571336e5b204d3c2d upstream. The per-device fault data structure stores information about faults occurring on a device. Its lifetime spans from IOPF enablement to disablement. Multiple paths, including IOPF reporting, handling, and responding, may access it concurrently. Previously, a mutex protected the fault data from use after free. But this is not performance friendly due to the critical nature of IOPF handling paths. Refine this with a refcount-based approach. The fault data pointer is obtained within an RCU read region with a refcount. The fault data pointer is returned for usage only when the pointer is valid and a refcount is successfully obtained. The fault data is freed with kfree_rcu(), ensuring data is only freed after all RCU critical regions complete. An iopf handling work starts once an iopf group is created. The handling work continues until iommu_page_response() is called to respond to the iopf and the iopf group is freed. During this time, the device fault parameter should always be available. Add a pointer to the device fault parameter in the iopf_group structure and hold the reference until the iopf_group is freed. Make iommu_page_response() static as it is only used in io-pgfault.c. Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Link: https://lore.kernel.org/r/20240212012227.119381-13-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/io-pgfault.c | 127 +++++++++++++++++++++++-------------- drivers/iommu/iommu-sva.c | 2 +- include/linux/iommu.h | 17 +++-- 3 files changed, 88 insertions(+), 58 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 5aea8402be47..ce7058892b59 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -13,6 +13,32 @@ #include "iommu-priv.h" +/* + * Return the fault parameter of a device if it exists. Otherwise, return NULL. + * On a successful return, the caller takes a reference of this parameter and + * should put it after use by calling iopf_put_dev_fault_param(). + */ +static struct iommu_fault_param *iopf_get_dev_fault_param(struct device *dev) +{ + struct dev_iommu *param = dev->iommu; + struct iommu_fault_param *fault_param; + + rcu_read_lock(); + fault_param = rcu_dereference(param->fault_param); + if (fault_param && !refcount_inc_not_zero(&fault_param->users)) + fault_param = NULL; + rcu_read_unlock(); + + return fault_param; +} + +/* Caller must hold a reference of the fault parameter. */ +static void iopf_put_dev_fault_param(struct iommu_fault_param *fault_param) +{ + if (refcount_dec_and_test(&fault_param->users)) + kfree_rcu(fault_param, rcu); +} + void iopf_free_group(struct iopf_group *group) { struct iopf_fault *iopf, *next; @@ -22,6 +48,8 @@ void iopf_free_group(struct iopf_group *group) kfree(iopf); } + /* Pair with iommu_report_device_fault(). */ + iopf_put_dev_fault_param(group->fault_param); kfree(group); } EXPORT_SYMBOL_GPL(iopf_free_group); @@ -135,7 +163,7 @@ static int iommu_handle_iopf(struct iommu_fault *fault, goto cleanup_partial; } - group->dev = dev; + group->fault_param = iopf_param; group->last_fault.fault = *fault; INIT_LIST_HEAD(&group->faults); group->domain = domain; @@ -178,64 +206,61 @@ static int iommu_handle_iopf(struct iommu_fault *fault, */ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { + bool last_prq = evt->fault.type == IOMMU_FAULT_PAGE_REQ && + (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE); struct iommu_fault_param *fault_param; - struct iopf_fault *evt_pending = NULL; - struct dev_iommu *param = dev->iommu; - int ret = 0; + struct iopf_fault *evt_pending; + int ret; - mutex_lock(¶m->lock); - fault_param = param->fault_param; - if (!fault_param) { - mutex_unlock(¶m->lock); + fault_param = iopf_get_dev_fault_param(dev); + if (!fault_param) return -EINVAL; - } mutex_lock(&fault_param->lock); - if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && - (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { + if (last_prq) { evt_pending = kmemdup(evt, sizeof(struct iopf_fault), GFP_KERNEL); if (!evt_pending) { ret = -ENOMEM; - goto done_unlock; + goto err_unlock; } list_add_tail(&evt_pending->list, &fault_param->faults); } ret = iommu_handle_iopf(&evt->fault, fault_param); - if (ret && evt_pending) { + if (ret) + goto err_free; + + mutex_unlock(&fault_param->lock); + /* The reference count of fault_param is now held by iopf_group. */ + if (!last_prq) + iopf_put_dev_fault_param(fault_param); + + return 0; +err_free: + if (last_prq) { list_del(&evt_pending->list); kfree(evt_pending); } -done_unlock: +err_unlock: mutex_unlock(&fault_param->lock); - mutex_unlock(¶m->lock); + iopf_put_dev_fault_param(fault_param); return ret; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); -int iommu_page_response(struct device *dev, - struct iommu_page_response *msg) +static int iommu_page_response(struct iopf_group *group, + struct iommu_page_response *msg) { bool needs_pasid; int ret = -EINVAL; struct iopf_fault *evt; struct iommu_fault_page_request *prm; - struct dev_iommu *param = dev->iommu; - struct iommu_fault_param *fault_param; + struct device *dev = group->fault_param->dev; const struct iommu_ops *ops = dev_iommu_ops(dev); bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; - - if (!ops->page_response) - return -ENODEV; - - mutex_lock(¶m->lock); - fault_param = param->fault_param; - if (!fault_param) { - mutex_unlock(¶m->lock); - return -EINVAL; - } + struct iommu_fault_param *fault_param = group->fault_param; /* Only send response if there is a fault report pending */ mutex_lock(&fault_param->lock); @@ -276,10 +301,9 @@ int iommu_page_response(struct device *dev, done_unlock: mutex_unlock(&fault_param->lock); - mutex_unlock(¶m->lock); + return ret; } -EXPORT_SYMBOL_GPL(iommu_page_response); /** * iopf_queue_flush_dev - Ensure that all queued faults have been processed @@ -295,22 +319,20 @@ EXPORT_SYMBOL_GPL(iommu_page_response); */ int iopf_queue_flush_dev(struct device *dev) { - int ret = 0; struct iommu_fault_param *iopf_param; - struct dev_iommu *param = dev->iommu; - if (!param) + /* + * It's a driver bug to be here after iopf_queue_remove_device(). + * Therefore, it's safe to dereference the fault parameter without + * holding the lock. + */ + iopf_param = rcu_dereference_check(dev->iommu->fault_param, true); + if (WARN_ON(!iopf_param)) return -ENODEV; - mutex_lock(¶m->lock); - iopf_param = param->fault_param; - if (iopf_param) - flush_workqueue(iopf_param->queue->wq); - else - ret = -ENODEV; - mutex_unlock(¶m->lock); + flush_workqueue(iopf_param->queue->wq); - return ret; + return 0; } EXPORT_SYMBOL_GPL(iopf_queue_flush_dev); @@ -335,7 +357,7 @@ int iopf_group_response(struct iopf_group *group, (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)) resp.flags = IOMMU_PAGE_RESP_PASID_VALID; - return iommu_page_response(group->dev, &resp); + return iommu_page_response(group, &resp); } EXPORT_SYMBOL_GPL(iopf_group_response); @@ -384,10 +406,15 @@ int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) int ret = 0; struct dev_iommu *param = dev->iommu; struct iommu_fault_param *fault_param; + const struct iommu_ops *ops = dev_iommu_ops(dev); + + if (!ops->page_response) + return -ENODEV; mutex_lock(&queue->lock); mutex_lock(¶m->lock); - if (param->fault_param) { + if (rcu_dereference_check(param->fault_param, + lockdep_is_held(¶m->lock))) { ret = -EBUSY; goto done_unlock; } @@ -402,10 +429,11 @@ int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) INIT_LIST_HEAD(&fault_param->faults); INIT_LIST_HEAD(&fault_param->partial); fault_param->dev = dev; + refcount_set(&fault_param->users, 1); list_add(&fault_param->queue_list, &queue->devices); fault_param->queue = queue; - param->fault_param = fault_param; + rcu_assign_pointer(param->fault_param, fault_param); done_unlock: mutex_unlock(¶m->lock); @@ -429,10 +457,12 @@ int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) int ret = 0; struct iopf_fault *iopf, *next; struct dev_iommu *param = dev->iommu; - struct iommu_fault_param *fault_param = param->fault_param; + struct iommu_fault_param *fault_param; mutex_lock(&queue->lock); mutex_lock(¶m->lock); + fault_param = rcu_dereference_check(param->fault_param, + lockdep_is_held(¶m->lock)); if (!fault_param) { ret = -ENODEV; goto unlock; @@ -454,8 +484,9 @@ int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) list_for_each_entry_safe(iopf, next, &fault_param->partial, list) kfree(iopf); - param->fault_param = NULL; - kfree(fault_param); + /* dec the ref owned by iopf_queue_add_device() */ + rcu_assign_pointer(param->fault_param, NULL); + iopf_put_dev_fault_param(fault_param); unlock: mutex_unlock(¶m->lock); mutex_unlock(&queue->lock); diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 9de878e40413..b51995b4fe90 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -251,7 +251,7 @@ static void iommu_sva_handle_iopf(struct work_struct *work) static int iommu_sva_iopf_handler(struct iopf_group *group) { - struct iommu_fault_param *fault_param = group->dev->iommu->fault_param; + struct iommu_fault_param *fault_param = group->fault_param; INIT_WORK(&group->work, iommu_sva_handle_iopf); if (!queue_work(fault_param->queue->wq, &group->work)) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ffe81af33c33..6c4544b39797 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -41,6 +41,7 @@ struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; +struct iommu_fault_param; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -129,8 +130,9 @@ struct iopf_group { struct iopf_fault last_fault; struct list_head faults; struct work_struct work; - struct device *dev; struct iommu_domain *domain; + /* The device's fault data parameter. */ + struct iommu_fault_param *fault_param; }; /** @@ -696,6 +698,8 @@ struct iommu_device { /** * struct iommu_fault_param - per-device IOMMU fault data * @lock: protect pending faults list + * @users: user counter to manage the lifetime of the data + * @rcu: rcu head for kfree_rcu() * @dev: the device that owns this param * @queue: IOPF queue * @queue_list: index into queue->devices @@ -705,6 +709,8 @@ struct iommu_device { */ struct iommu_fault_param { struct mutex lock; + refcount_t users; + struct rcu_head rcu; struct device *dev; struct iopf_queue *queue; @@ -732,7 +738,7 @@ struct iommu_fault_param { */ struct dev_iommu { struct mutex lock; - struct iommu_fault_param *fault_param; + struct iommu_fault_param __rcu *fault_param; struct iommu_fwspec *fwspec; struct iommu_device *iommu_dev; void *priv; @@ -1599,7 +1605,6 @@ void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); void iopf_free_group(struct iopf_group *group); int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); -int iommu_page_response(struct device *dev, struct iommu_page_response *msg); int iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status); #else @@ -1644,12 +1649,6 @@ iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) return -ENODEV; } -static inline int -iommu_page_response(struct device *dev, struct iommu_page_response *msg) -{ - return -ENODEV; -} - static inline int iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status) { -- Gitee From f1f530835161d2c7ed18137ca6886f06680e80a6 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:24 +0800 Subject: [PATCH 116/219] iommu: Improve iopf_queue_remove_device() ANBZ: #13617 commit 0095bf83554f8e7a681961656608101bdf40e9ef upstream. Convert iopf_queue_remove_device() to return void instead of an error code, as the return value is never used. This removal helper is designed to be never-failed, so there's no need for error handling. Ack all outstanding page requests from the device with the response code of IOMMU_PAGE_RESP_INVALID, indicating device should not attempt any retry. Add comments to this helper explaining the steps involved in removing a device from the iopf queue and disabling its PRI. The individual drivers are expected to be adjusted accordingly. Here we just define the expected behaviors of the individual iommu driver from the core's perspective. Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Tested-by: Yan Zhao Link: https://lore.kernel.org/r/20240212012227.119381-14-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 7 +---- drivers/iommu/io-pgfault.c | 57 ++++++++++++++++++++++++------------- include/linux/iommu.h | 5 ++-- 3 files changed, 40 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5101570e6f7b..49f25937ecd5 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4532,12 +4532,7 @@ static int intel_iommu_disable_iopf(struct device *dev) */ pci_disable_pri(to_pci_dev(dev)); info->pri_enabled = 0; - - /* - * With PRI disabled and outstanding PRQs drained, removing device - * from iopf queue should never fail. - */ - WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev)); + iopf_queue_remove_device(iommu->iopf_queue, dev); return 0; } diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index ce7058892b59..ece09552e5cf 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -448,41 +448,60 @@ EXPORT_SYMBOL_GPL(iopf_queue_add_device); * @queue: IOPF queue * @dev: device to remove * - * Caller makes sure that no more faults are reported for this device. + * Removing a device from an iopf_queue. It's recommended to follow these + * steps when removing a device: * - * Return: 0 on success and <0 on error. + * - Disable new PRI reception: Turn off PRI generation in the IOMMU hardware + * and flush any hardware page request queues. This should be done before + * calling into this helper. + * - Acknowledge all outstanding PRQs to the device: Respond to all outstanding + * page requests with IOMMU_PAGE_RESP_INVALID, indicating the device should + * not retry. This helper function handles this. + * - Disable PRI on the device: After calling this helper, the caller could + * then disable PRI on the device. + * + * Calling iopf_queue_remove_device() essentially disassociates the device. + * The fault_param might still exist, but iommu_page_response() will do + * nothing. The device fault parameter reference count has been properly + * passed from iommu_report_device_fault() to the fault handling work, and + * will eventually be released after iommu_page_response(). */ -int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) +void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) { - int ret = 0; struct iopf_fault *iopf, *next; + struct iommu_page_response resp; struct dev_iommu *param = dev->iommu; struct iommu_fault_param *fault_param; + const struct iommu_ops *ops = dev_iommu_ops(dev); mutex_lock(&queue->lock); mutex_lock(¶m->lock); fault_param = rcu_dereference_check(param->fault_param, lockdep_is_held(¶m->lock)); - if (!fault_param) { - ret = -ENODEV; - goto unlock; - } - if (fault_param->queue != queue) { - ret = -EINVAL; + if (WARN_ON(!fault_param || fault_param->queue != queue)) goto unlock; - } - if (!list_empty(&fault_param->faults)) { - ret = -EBUSY; - goto unlock; - } + mutex_lock(&fault_param->lock); + list_for_each_entry_safe(iopf, next, &fault_param->partial, list) + kfree(iopf); - list_del(&fault_param->queue_list); + list_for_each_entry_safe(iopf, next, &fault_param->faults, list) { + memset(&resp, 0, sizeof(struct iommu_page_response)); + resp.pasid = iopf->fault.prm.pasid; + resp.grpid = iopf->fault.prm.grpid; + resp.code = IOMMU_PAGE_RESP_INVALID; - /* Just in case some faults are still stuck */ - list_for_each_entry_safe(iopf, next, &fault_param->partial, list) + if (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID) + resp.flags = IOMMU_PAGE_RESP_PASID_VALID; + + ops->page_response(dev, iopf, &resp); + list_del(&iopf->list); kfree(iopf); + } + mutex_unlock(&fault_param->lock); + + list_del(&fault_param->queue_list); /* dec the ref owned by iopf_queue_add_device() */ rcu_assign_pointer(param->fault_param, NULL); @@ -490,8 +509,6 @@ int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) unlock: mutex_unlock(¶m->lock); mutex_unlock(&queue->lock); - - return ret; } EXPORT_SYMBOL_GPL(iopf_queue_remove_device); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 6c4544b39797..d68504ffdc47 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1598,7 +1598,7 @@ iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) #ifdef CONFIG_IOMMU_IOPF int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev); -int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev); +void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev); int iopf_queue_flush_dev(struct device *dev); struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); @@ -1614,10 +1614,9 @@ iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) return -ENODEV; } -static inline int +static inline void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) { - return -ENODEV; } static inline int iopf_queue_flush_dev(struct device *dev) -- Gitee From e2322b35f0fbba6d8aff791d19665229a75037e3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:25 +0800 Subject: [PATCH 117/219] iommu: Track iopf group instead of last fault ANBZ: #13617 commit 19911232713573a2ebea84a25bd4d71d024ed86b upstream. Previously, before a group of page faults was passed to the domain's iopf handler, the last page fault of the group was kept in the list of iommu_fault_param::faults. In the page fault response path, the group's last page fault was used to look up the list, and the page faults were responded to device only if there was a matched fault. The previous approach seems unnecessarily complex and not performance friendly. Put the page fault group itself to the outstanding fault list. It can be removed in the page fault response path or in the iopf_queue_remove_device() path. The pending list is protected by iommu_fault_param::lock. To allow checking for the group's presence in the list using list_empty(), the iopf group should be removed from the list with list_del_init(). IOMMU_PAGE_RESP_PASID_VALID is set in the code but not used anywhere. Remove it to make the code clean. IOMMU_PAGE_RESP_PASID_VALID is set in the response message indicating that the response message includes a valid PASID value. Actually, we should keep this hardware detail in the individual driver. When the page fault handling framework in IOMMU and IOMMUFD subsystems includes a valid PASID in the fault message, the response message should always contain the same PASID value. Individual drivers should be responsible for deciding whether to include the PASID in the messages they provide for the hardware. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Link: https://lore.kernel.org/r/20240212012227.119381-15-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/io-pgfault.c | 242 +++++++++++++------------------------ include/linux/iommu.h | 6 +- 2 files changed, 86 insertions(+), 162 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index ece09552e5cf..05e49e2e6a52 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -78,12 +78,33 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev, return domain; } +/* Non-last request of a group. Postpone until the last one. */ +static int report_partial_fault(struct iommu_fault_param *fault_param, + struct iommu_fault *fault) +{ + struct iopf_fault *iopf; + + iopf = kzalloc(sizeof(*iopf), GFP_KERNEL); + if (!iopf) + return -ENOMEM; + + iopf->fault = *fault; + + mutex_lock(&fault_param->lock); + list_add(&iopf->list, &fault_param->partial); + mutex_unlock(&fault_param->lock); + + return 0; +} + /** - * iommu_handle_iopf - IO Page Fault handler - * @fault: fault event - * @iopf_param: the fault parameter of the device. + * iommu_report_device_fault() - Report fault event to device driver + * @dev: the device + * @evt: fault event data * - * Add a fault to the device workqueue, to be handled by mm. + * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ + * handler. When this function fails and the fault is recoverable, it is the + * caller's responsibility to complete the fault. * * This module doesn't handle PCI PASID Stop Marker; IOMMU drivers must discard * them before reporting faults. A PASID Stop Marker (LRW = 0b100) doesn't @@ -118,34 +139,37 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev, * * Return: 0 on success and <0 on error. */ -static int iommu_handle_iopf(struct iommu_fault *fault, - struct iommu_fault_param *iopf_param) +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { - int ret; - struct iopf_group *group; - struct iommu_domain *domain; + struct iommu_fault *fault = &evt->fault; + struct iommu_fault_param *iopf_param; struct iopf_fault *iopf, *next; - struct device *dev = iopf_param->dev; - - lockdep_assert_held(&iopf_param->lock); + struct iommu_domain *domain; + struct iopf_group *group; + int ret; if (fault->type != IOMMU_FAULT_PAGE_REQ) - /* Not a recoverable page fault */ return -EOPNOTSUPP; - if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { - iopf = kzalloc(sizeof(*iopf), GFP_KERNEL); - if (!iopf) - return -ENOMEM; - - iopf->fault = *fault; + iopf_param = iopf_get_dev_fault_param(dev); + if (!iopf_param) + return -ENODEV; - /* Non-last request of a group. Postpone until the last one */ - list_add(&iopf->list, &iopf_param->partial); + if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { + ret = report_partial_fault(iopf_param, fault); + iopf_put_dev_fault_param(iopf_param); - return 0; + return ret; } + /* + * This is the last page fault of a group. Allocate an iopf group and + * pass it to domain's page fault handler. The group holds a reference + * count of the fault parameter. It will be released after response or + * error path of this function. If an error is returned, the caller + * will send a response to the hardware. We need to clean up before + * leaving, otherwise partial faults will be stuck. + */ domain = get_domain_for_iopf(dev, fault); if (!domain) { ret = -EINVAL; @@ -154,11 +178,6 @@ static int iommu_handle_iopf(struct iommu_fault *fault, group = kzalloc(sizeof(*group), GFP_KERNEL); if (!group) { - /* - * The caller will send a response to the hardware. But we do - * need to clean up before leaving, otherwise partial faults - * will be stuck. - */ ret = -ENOMEM; goto cleanup_partial; } @@ -166,145 +185,45 @@ static int iommu_handle_iopf(struct iommu_fault *fault, group->fault_param = iopf_param; group->last_fault.fault = *fault; INIT_LIST_HEAD(&group->faults); + INIT_LIST_HEAD(&group->pending_node); group->domain = domain; list_add(&group->last_fault.list, &group->faults); /* See if we have partial faults for this group */ + mutex_lock(&iopf_param->lock); list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { if (iopf->fault.prm.grpid == fault->prm.grpid) /* Insert *before* the last fault */ list_move(&iopf->list, &group->faults); } - + list_add(&group->pending_node, &iopf_param->faults); mutex_unlock(&iopf_param->lock); + ret = domain->iopf_handler(group); - mutex_lock(&iopf_param->lock); - if (ret) + if (ret) { + mutex_lock(&iopf_param->lock); + list_del_init(&group->pending_node); + mutex_unlock(&iopf_param->lock); iopf_free_group(group); + } return ret; + cleanup_partial: + mutex_lock(&iopf_param->lock); list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { if (iopf->fault.prm.grpid == fault->prm.grpid) { list_del(&iopf->list); kfree(iopf); } } - return ret; -} - -/** - * iommu_report_device_fault() - Report fault event to device driver - * @dev: the device - * @evt: fault event data - * - * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ - * handler. When this function fails and the fault is recoverable, it is the - * caller's responsibility to complete the fault. - * - * Return 0 on success, or an error. - */ -int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) -{ - bool last_prq = evt->fault.type == IOMMU_FAULT_PAGE_REQ && - (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE); - struct iommu_fault_param *fault_param; - struct iopf_fault *evt_pending; - int ret; - - fault_param = iopf_get_dev_fault_param(dev); - if (!fault_param) - return -EINVAL; - - mutex_lock(&fault_param->lock); - if (last_prq) { - evt_pending = kmemdup(evt, sizeof(struct iopf_fault), - GFP_KERNEL); - if (!evt_pending) { - ret = -ENOMEM; - goto err_unlock; - } - list_add_tail(&evt_pending->list, &fault_param->faults); - } - - ret = iommu_handle_iopf(&evt->fault, fault_param); - if (ret) - goto err_free; - - mutex_unlock(&fault_param->lock); - /* The reference count of fault_param is now held by iopf_group. */ - if (!last_prq) - iopf_put_dev_fault_param(fault_param); - - return 0; -err_free: - if (last_prq) { - list_del(&evt_pending->list); - kfree(evt_pending); - } -err_unlock: - mutex_unlock(&fault_param->lock); - iopf_put_dev_fault_param(fault_param); + mutex_unlock(&iopf_param->lock); + iopf_put_dev_fault_param(iopf_param); return ret; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); -static int iommu_page_response(struct iopf_group *group, - struct iommu_page_response *msg) -{ - bool needs_pasid; - int ret = -EINVAL; - struct iopf_fault *evt; - struct iommu_fault_page_request *prm; - struct device *dev = group->fault_param->dev; - const struct iommu_ops *ops = dev_iommu_ops(dev); - bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; - struct iommu_fault_param *fault_param = group->fault_param; - - /* Only send response if there is a fault report pending */ - mutex_lock(&fault_param->lock); - if (list_empty(&fault_param->faults)) { - dev_warn_ratelimited(dev, "no pending PRQ, drop response\n"); - goto done_unlock; - } - /* - * Check if we have a matching page request pending to respond, - * otherwise return -EINVAL - */ - list_for_each_entry(evt, &fault_param->faults, list) { - prm = &evt->fault.prm; - if (prm->grpid != msg->grpid) - continue; - - /* - * If the PASID is required, the corresponding request is - * matched using the group ID, the PASID valid bit and the PASID - * value. Otherwise only the group ID matches request and - * response. - */ - needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; - if (needs_pasid && (!has_pasid || msg->pasid != prm->pasid)) - continue; - - if (!needs_pasid && has_pasid) { - /* No big deal, just clear it. */ - msg->flags &= ~IOMMU_PAGE_RESP_PASID_VALID; - msg->pasid = 0; - } - - ret = ops->page_response(dev, evt, msg); - list_del(&evt->list); - kfree(evt); - break; - } - -done_unlock: - mutex_unlock(&fault_param->lock); - - return ret; -} - /** * iopf_queue_flush_dev - Ensure that all queued faults have been processed * @dev: the endpoint whose faults need to be flushed. @@ -346,18 +265,26 @@ EXPORT_SYMBOL_GPL(iopf_queue_flush_dev); int iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status) { + struct iommu_fault_param *fault_param = group->fault_param; struct iopf_fault *iopf = &group->last_fault; + struct device *dev = group->fault_param->dev; + const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_page_response resp = { .pasid = iopf->fault.prm.pasid, .grpid = iopf->fault.prm.grpid, .code = status, }; + int ret = -EINVAL; - if ((iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) && - (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)) - resp.flags = IOMMU_PAGE_RESP_PASID_VALID; + /* Only send response if there is a fault report pending */ + mutex_lock(&fault_param->lock); + if (!list_empty(&group->pending_node)) { + ret = ops->page_response(dev, &group->last_fault, &resp); + list_del_init(&group->pending_node); + } + mutex_unlock(&fault_param->lock); - return iommu_page_response(group, &resp); + return ret; } EXPORT_SYMBOL_GPL(iopf_group_response); @@ -468,8 +395,9 @@ EXPORT_SYMBOL_GPL(iopf_queue_add_device); */ void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) { - struct iopf_fault *iopf, *next; - struct iommu_page_response resp; + struct iopf_fault *partial_iopf; + struct iopf_fault *next; + struct iopf_group *group, *temp; struct dev_iommu *param = dev->iommu; struct iommu_fault_param *fault_param; const struct iommu_ops *ops = dev_iommu_ops(dev); @@ -483,21 +411,19 @@ void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) goto unlock; mutex_lock(&fault_param->lock); - list_for_each_entry_safe(iopf, next, &fault_param->partial, list) - kfree(iopf); - - list_for_each_entry_safe(iopf, next, &fault_param->faults, list) { - memset(&resp, 0, sizeof(struct iommu_page_response)); - resp.pasid = iopf->fault.prm.pasid; - resp.grpid = iopf->fault.prm.grpid; - resp.code = IOMMU_PAGE_RESP_INVALID; + list_for_each_entry_safe(partial_iopf, next, &fault_param->partial, list) + kfree(partial_iopf); - if (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID) - resp.flags = IOMMU_PAGE_RESP_PASID_VALID; + list_for_each_entry_safe(group, temp, &fault_param->faults, pending_node) { + struct iopf_fault *iopf = &group->last_fault; + struct iommu_page_response resp = { + .pasid = iopf->fault.prm.pasid, + .grpid = iopf->fault.prm.grpid, + .code = IOMMU_PAGE_RESP_INVALID + }; ops->page_response(dev, iopf, &resp); - list_del(&iopf->list); - kfree(iopf); + list_del_init(&group->pending_node); } mutex_unlock(&fault_param->lock); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d68504ffdc47..59a9e6516b77 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -106,15 +106,11 @@ enum iommu_page_response_code { /** * struct iommu_page_response - Generic page response information - * @flags: encodes whether the corresponding fields are valid - * (IOMMU_FAULT_PAGE_RESPONSE_* values) * @pasid: Process Address Space ID * @grpid: Page Request Group Index * @code: response code from &enum iommu_page_response_code */ struct iommu_page_response { -#define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) - u32 flags; u32 pasid; u32 grpid; u32 code; @@ -129,6 +125,8 @@ struct iopf_fault { struct iopf_group { struct iopf_fault last_fault; struct list_head faults; + /* list node for iommu_fault_param::faults */ + struct list_head pending_node; struct work_struct work; struct iommu_domain *domain; /* The device's fault data parameter. */ -- Gitee From 5b6298d250a0f9a58ea8d610684d2d3581f4903e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:26 +0800 Subject: [PATCH 118/219] iommu: Make iopf_group_response() return void ANBZ: #13617 commit b554e396e51ce3d378a560666f85c6836a8323fd upstream. The iopf_group_response() should return void, as nothing can do anything with the failure. This implies that ops->page_response() must also return void; this is consistent with what the drivers do. The failure paths, which are all integrity validations of the fault, should be WARN_ON'd, not return codes. If the iommu core fails to enqueue the fault, it should respond the fault directly by calling ops->page_response() instead of returning an error number and relying on the iommu drivers to do so. Consolidate the error fault handling code in the core. Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240212012227.119381-16-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 50 +++----- drivers/iommu/intel/iommu.h | 4 +- drivers/iommu/intel/svm.c | 18 +-- drivers/iommu/io-pgfault.c | 132 +++++++++++--------- include/linux/iommu.h | 14 +-- 5 files changed, 98 insertions(+), 120 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6bbaff674944..14d687222daa 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -920,31 +920,29 @@ static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu, return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true); } -static int arm_smmu_page_response(struct device *dev, - struct iopf_fault *unused, - struct iommu_page_response *resp) +static void arm_smmu_page_response(struct device *dev, struct iopf_fault *unused, + struct iommu_page_response *resp) { struct arm_smmu_cmdq_ent cmd = {0}; struct arm_smmu_master *master = dev_iommu_priv_get(dev); int sid = master->streams[0].id; - if (master->stall_enabled) { - cmd.opcode = CMDQ_OP_RESUME; - cmd.resume.sid = sid; - cmd.resume.stag = resp->grpid; - switch (resp->code) { - case IOMMU_PAGE_RESP_INVALID: - case IOMMU_PAGE_RESP_FAILURE: - cmd.resume.resp = CMDQ_RESUME_0_RESP_ABORT; - break; - case IOMMU_PAGE_RESP_SUCCESS: - cmd.resume.resp = CMDQ_RESUME_0_RESP_RETRY; - break; - default: - return -EINVAL; - } - } else { - return -ENODEV; + if (WARN_ON(!master->stall_enabled)) + return; + + cmd.opcode = CMDQ_OP_RESUME; + cmd.resume.sid = sid; + cmd.resume.stag = resp->grpid; + switch (resp->code) { + case IOMMU_PAGE_RESP_INVALID: + case IOMMU_PAGE_RESP_FAILURE: + cmd.resume.resp = CMDQ_RESUME_0_RESP_ABORT; + break; + case IOMMU_PAGE_RESP_SUCCESS: + cmd.resume.resp = CMDQ_RESUME_0_RESP_RETRY; + break; + default: + break; } arm_smmu_cmdq_issue_cmd(master->smmu, &cmd); @@ -954,8 +952,6 @@ static int arm_smmu_page_response(struct device *dev, * terminated... at some point in the future. PRI_RESP is fire and * forget. */ - - return 0; } /* Context descriptor manipulation functions */ @@ -1527,16 +1523,6 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) } ret = iommu_report_device_fault(master->dev, &fault_evt); - if (ret && flt->type == IOMMU_FAULT_PAGE_REQ) { - /* Nobody cared, abort the access */ - struct iommu_page_response resp = { - .pasid = flt->prm.pasid, - .grpid = flt->prm.grpid, - .code = IOMMU_PAGE_RESP_FAILURE, - }; - arm_smmu_page_response(master->dev, &fault_evt, &resp); - } - out_unlock: mutex_unlock(&smmu->streams_mutex); return ret; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 457a00748730..17fd5a76ff3c 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1077,8 +1077,8 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, void intel_svm_check(struct intel_iommu *iommu); int intel_svm_enable_prq(struct intel_iommu *iommu); int intel_svm_finish_prq(struct intel_iommu *iommu); -int intel_svm_page_response(struct device *dev, struct iopf_fault *evt, - struct iommu_page_response *msg); +void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg); struct iommu_domain *intel_svm_domain_alloc(void); void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid); void intel_drain_pasid_prq(struct device *dev, u32 pasid); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 07a863635862..66aec2d3c78b 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -739,9 +739,8 @@ static irqreturn_t prq_event_thread(int irq, void *d) return IRQ_RETVAL(handled); } -int intel_svm_page_response(struct device *dev, - struct iopf_fault *evt, - struct iommu_page_response *msg) +void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; @@ -750,7 +749,6 @@ int intel_svm_page_response(struct device *dev, bool private_present; bool pasid_present; bool last_page; - int ret = 0; u16 sid; prm = &evt->fault.prm; @@ -759,16 +757,6 @@ int intel_svm_page_response(struct device *dev, private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - if (!pasid_present) { - ret = -EINVAL; - goto out; - } - - if (prm->pasid == 0 || prm->pasid >= PASID_MAX) { - ret = -EINVAL; - goto out; - } - /* * Per VT-d spec. v3.0 ch7.7, system software must respond * with page group response if private data is present (PDP) @@ -797,8 +785,6 @@ int intel_svm_page_response(struct device *dev, qi_submit_sync(iommu, &desc, 1, 0); } -out: - return ret; } static int intel_svm_set_dev_pasid(struct iommu_domain *domain, diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 05e49e2e6a52..6a325bff8164 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -39,7 +39,7 @@ static void iopf_put_dev_fault_param(struct iommu_fault_param *fault_param) kfree_rcu(fault_param, rcu); } -void iopf_free_group(struct iopf_group *group) +static void __iopf_free_group(struct iopf_group *group) { struct iopf_fault *iopf, *next; @@ -50,6 +50,11 @@ void iopf_free_group(struct iopf_group *group) /* Pair with iommu_report_device_fault(). */ iopf_put_dev_fault_param(group->fault_param); +} + +void iopf_free_group(struct iopf_group *group) +{ + __iopf_free_group(group); kfree(group); } EXPORT_SYMBOL_GPL(iopf_free_group); @@ -97,14 +102,49 @@ static int report_partial_fault(struct iommu_fault_param *fault_param, return 0; } +static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, + struct iopf_fault *evt, + struct iopf_group *abort_group) +{ + struct iopf_fault *iopf, *next; + struct iopf_group *group; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) { + /* + * We always need to construct the group as we need it to abort + * the request at the driver if it can't be handled. + */ + group = abort_group; + } + + group->fault_param = iopf_param; + group->last_fault.fault = evt->fault; + INIT_LIST_HEAD(&group->faults); + INIT_LIST_HEAD(&group->pending_node); + list_add(&group->last_fault.list, &group->faults); + + /* See if we have partial faults for this group */ + mutex_lock(&iopf_param->lock); + list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { + if (iopf->fault.prm.grpid == evt->fault.prm.grpid) + /* Insert *before* the last fault */ + list_move(&iopf->list, &group->faults); + } + list_add(&group->pending_node, &iopf_param->faults); + mutex_unlock(&iopf_param->lock); + + return group; +} + /** * iommu_report_device_fault() - Report fault event to device driver * @dev: the device * @evt: fault event data * * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ - * handler. When this function fails and the fault is recoverable, it is the - * caller's responsibility to complete the fault. + * handler. If this function fails then ops->page_response() was called to + * complete evt if required. * * This module doesn't handle PCI PASID Stop Marker; IOMMU drivers must discard * them before reporting faults. A PASID Stop Marker (LRW = 0b100) doesn't @@ -143,22 +183,18 @@ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { struct iommu_fault *fault = &evt->fault; struct iommu_fault_param *iopf_param; - struct iopf_fault *iopf, *next; - struct iommu_domain *domain; + struct iopf_group abort_group = {}; struct iopf_group *group; int ret; - if (fault->type != IOMMU_FAULT_PAGE_REQ) - return -EOPNOTSUPP; - iopf_param = iopf_get_dev_fault_param(dev); - if (!iopf_param) + if (WARN_ON(!iopf_param)) return -ENODEV; if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { ret = report_partial_fault(iopf_param, fault); iopf_put_dev_fault_param(iopf_param); - + /* A request that is not the last does not need to be ack'd */ return ret; } @@ -170,56 +206,33 @@ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) * will send a response to the hardware. We need to clean up before * leaving, otherwise partial faults will be stuck. */ - domain = get_domain_for_iopf(dev, fault); - if (!domain) { - ret = -EINVAL; - goto cleanup_partial; - } - - group = kzalloc(sizeof(*group), GFP_KERNEL); - if (!group) { + group = iopf_group_alloc(iopf_param, evt, &abort_group); + if (group == &abort_group) { ret = -ENOMEM; - goto cleanup_partial; + goto err_abort; } - group->fault_param = iopf_param; - group->last_fault.fault = *fault; - INIT_LIST_HEAD(&group->faults); - INIT_LIST_HEAD(&group->pending_node); - group->domain = domain; - list_add(&group->last_fault.list, &group->faults); - - /* See if we have partial faults for this group */ - mutex_lock(&iopf_param->lock); - list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { - if (iopf->fault.prm.grpid == fault->prm.grpid) - /* Insert *before* the last fault */ - list_move(&iopf->list, &group->faults); - } - list_add(&group->pending_node, &iopf_param->faults); - mutex_unlock(&iopf_param->lock); - - ret = domain->iopf_handler(group); - if (ret) { - mutex_lock(&iopf_param->lock); - list_del_init(&group->pending_node); - mutex_unlock(&iopf_param->lock); - iopf_free_group(group); + group->domain = get_domain_for_iopf(dev, fault); + if (!group->domain) { + ret = -EINVAL; + goto err_abort; } - return ret; - -cleanup_partial: - mutex_lock(&iopf_param->lock); - list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { - if (iopf->fault.prm.grpid == fault->prm.grpid) { - list_del(&iopf->list); - kfree(iopf); - } - } - mutex_unlock(&iopf_param->lock); - iopf_put_dev_fault_param(iopf_param); + /* + * On success iopf_handler must call iopf_group_response() and + * iopf_free_group() + */ + ret = group->domain->iopf_handler(group); + if (ret) + goto err_abort; + return 0; +err_abort: + iopf_group_response(group, IOMMU_PAGE_RESP_FAILURE); + if (group == &abort_group) + __iopf_free_group(group); + else + iopf_free_group(group); return ret; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); @@ -259,11 +272,9 @@ EXPORT_SYMBOL_GPL(iopf_queue_flush_dev); * iopf_group_response - Respond a group of page faults * @group: the group of faults with the same group id * @status: the response code - * - * Return 0 on success and <0 on error. */ -int iopf_group_response(struct iopf_group *group, - enum iommu_page_response_code status) +void iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status) { struct iommu_fault_param *fault_param = group->fault_param; struct iopf_fault *iopf = &group->last_fault; @@ -274,17 +285,14 @@ int iopf_group_response(struct iopf_group *group, .grpid = iopf->fault.prm.grpid, .code = status, }; - int ret = -EINVAL; /* Only send response if there is a fault report pending */ mutex_lock(&fault_param->lock); if (!list_empty(&group->pending_node)) { - ret = ops->page_response(dev, &group->last_fault, &resp); + ops->page_response(dev, &group->last_fault, &resp); list_del_init(&group->pending_node); } mutex_unlock(&fault_param->lock); - - return ret; } EXPORT_SYMBOL_GPL(iopf_group_response); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 59a9e6516b77..53e9e4db1a19 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -582,9 +582,8 @@ struct iommu_ops { int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); - int (*page_response)(struct device *dev, - struct iopf_fault *evt, - struct iommu_page_response *msg); + void (*page_response)(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg); int (*def_domain_type)(struct device *dev); void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid); @@ -1603,8 +1602,8 @@ void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); void iopf_free_group(struct iopf_group *group); int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); -int iopf_group_response(struct iopf_group *group, - enum iommu_page_response_code status); +void iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status); #else static inline int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) @@ -1646,10 +1645,9 @@ iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) return -ENODEV; } -static inline int iopf_group_response(struct iopf_group *group, - enum iommu_page_response_code status) +static inline void iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status) { - return -ENODEV; } #endif /* CONFIG_IOMMU_IOPF */ #endif /* __LINUX_IOMMU_H */ -- Gitee From 679027b1c285d248a8cb57a730d8a23c2faa35a3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:27 +0800 Subject: [PATCH 119/219] iommu: Make iommu_report_device_fault() return void ANBZ: #13617 commit 3dfa64aecbafc288216b2790438d395add192c30 upstream. As the iommu_report_device_fault() has been converted to auto-respond a page fault if it fails to enqueue it, there's no need to return a code in any case. Make it return void. Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240212012227.119381-17-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 4 ++-- drivers/iommu/intel/svm.c | 19 ++++++---------- drivers/iommu/io-pgfault.c | 25 +++++++-------------- include/linux/iommu.h | 5 ++--- 4 files changed, 19 insertions(+), 34 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 14d687222daa..9a92f3042422 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1466,7 +1466,7 @@ arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid) /* IRQ and event handlers */ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) { - int ret; + int ret = 0; u32 perm = 0; struct arm_smmu_master *master; bool ssid_valid = evt[0] & EVTQ_0_SSV; @@ -1522,7 +1522,7 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) goto out_unlock; } - ret = iommu_report_device_fault(master->dev, &fault_evt); + iommu_report_device_fault(master->dev, &fault_evt); out_unlock: mutex_unlock(&smmu->streams_mutex); return ret; diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 66aec2d3c78b..8d7b8170f5f3 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -560,14 +560,11 @@ static int prq_to_iommu_prot(struct page_req_dsc *req) return prot; } -static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, - struct page_req_dsc *desc) +static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, + struct page_req_dsc *desc) { struct iopf_fault event = { }; - if (!dev || !dev_is_pci(dev)) - return -ENODEV; - /* Fill in event data for device specific processing */ event.fault.type = IOMMU_FAULT_PAGE_REQ; event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; @@ -600,7 +597,7 @@ static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, event.fault.prm.private_data[0] = ktime_to_ns(ktime_get()); } - return iommu_report_device_fault(dev, &event); + iommu_report_device_fault(dev, &event); } static void handle_bad_prq_event(struct intel_iommu *iommu, @@ -703,12 +700,10 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (!pdev) goto bad_req; - if (intel_svm_prq_report(iommu, &pdev->dev, req)) - handle_bad_prq_event(iommu, req, QI_RESP_INVALID); - else - trace_prq_report(iommu, &pdev->dev, req->qw_0, req->qw_1, - req->priv_data[0], req->priv_data[1], - iommu->prq_seq_number++); + intel_svm_prq_report(iommu, &pdev->dev, req); + trace_prq_report(iommu, &pdev->dev, req->qw_0, req->qw_1, + req->priv_data[0], req->priv_data[1], + iommu->prq_seq_number++); pci_dev_put(pdev); prq_advance: head = (head + sizeof(*req)) & PRQ_RING_MASK; diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 6a325bff8164..06d78fcc79fd 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -176,26 +176,22 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, * freed after the device has stopped generating page faults (or the iommu * hardware has been set to block the page faults) and the pending page faults * have been flushed. - * - * Return: 0 on success and <0 on error. */ -int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) +void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { struct iommu_fault *fault = &evt->fault; struct iommu_fault_param *iopf_param; struct iopf_group abort_group = {}; struct iopf_group *group; - int ret; iopf_param = iopf_get_dev_fault_param(dev); if (WARN_ON(!iopf_param)) - return -ENODEV; + return; if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { - ret = report_partial_fault(iopf_param, fault); + report_partial_fault(iopf_param, fault); iopf_put_dev_fault_param(iopf_param); /* A request that is not the last does not need to be ack'd */ - return ret; } /* @@ -207,25 +203,21 @@ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) * leaving, otherwise partial faults will be stuck. */ group = iopf_group_alloc(iopf_param, evt, &abort_group); - if (group == &abort_group) { - ret = -ENOMEM; + if (group == &abort_group) goto err_abort; - } group->domain = get_domain_for_iopf(dev, fault); - if (!group->domain) { - ret = -EINVAL; + if (!group->domain) goto err_abort; - } /* * On success iopf_handler must call iopf_group_response() and * iopf_free_group() */ - ret = group->domain->iopf_handler(group); - if (ret) + if (group->domain->iopf_handler(group)) goto err_abort; - return 0; + + return; err_abort: iopf_group_response(group, IOMMU_PAGE_RESP_FAILURE); @@ -233,7 +225,6 @@ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) __iopf_free_group(group); else iopf_free_group(group); - return ret; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 53e9e4db1a19..ac9062b004e1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1601,7 +1601,7 @@ struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); void iopf_free_group(struct iopf_group *group); -int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); +void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); void iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status); #else @@ -1639,10 +1639,9 @@ static inline void iopf_free_group(struct iopf_group *group) { } -static inline int +static inline void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { - return -ENODEV; } static inline void iopf_group_response(struct iopf_group *group, -- Gitee From 7de78eebe9d65b5e8aa4685787613512443658d8 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:52 +0800 Subject: [PATCH 120/219] iommu/vt-d: Track nested domains in parent ANBZ: #13617 commit 85ce8e1d6d73e8d54cb244d10dd4021771231746 upstream. Today the parent domain (s2_domain) is unaware of which DID's are used by and which devices are attached to nested domains (s1_domain) nested on it. This leads to a problem that some operations (flush iotlb/devtlb and enable dirty tracking) on parent domain only apply to DID's and devices directly tracked in the parent domain hence are incomplete. This tracks the nested domains in list in parent domain. With this, operations on parent domain can loop the nested domains and refer to the devices and iommu_array to ensure the operations on parent domain take effect on all the affected devices and iommus. Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-2-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 18 ++++++++++++++---- drivers/iommu/intel/iommu.h | 6 ++++++ drivers/iommu/intel/nested.c | 12 +++++++++++- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 49f25937ecd5..cfd787292ef0 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3956,6 +3956,7 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; struct intel_iommu *iommu = info->iommu; + struct dmar_domain *dmar_domain; struct iommu_domain *domain; /* Must be NESTING domain */ @@ -3981,11 +3982,16 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, if (!domain) return ERR_PTR(-ENOMEM); - if (nested_parent) - to_dmar_domain(domain)->nested_parent = true; + dmar_domain = to_dmar_domain(domain); + + if (nested_parent) { + dmar_domain->nested_parent = true; + INIT_LIST_HEAD(&dmar_domain->s1_domains); + spin_lock_init(&dmar_domain->s1_lock); + } if (dirty_tracking) { - if (to_dmar_domain(domain)->use_first_level) { + if (dmar_domain->use_first_level) { iommu_domain_free(domain); return ERR_PTR(-EOPNOTSUPP); } @@ -3997,8 +4003,12 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, static void intel_iommu_domain_free(struct iommu_domain *domain) { + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + WARN_ON(dmar_domain->nested_parent && + !list_empty(&dmar_domain->s1_domains)); if (domain != &si_domain->domain) - domain_exit(to_dmar_domain(domain)); + domain_exit(dmar_domain); } int prepare_domain_attach_device(struct iommu_domain *domain, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 17fd5a76ff3c..0a2adb961062 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -627,6 +627,10 @@ struct dmar_domain { int agaw; /* maximum mapped address */ u64 max_addr; + /* Protect the s1_domains list */ + spinlock_t s1_lock; + /* Track s1_domains nested on this domain */ + struct list_head s1_domains; }; /* Nested user domain */ @@ -637,6 +641,8 @@ struct dmar_domain { unsigned long s1_pgtbl; /* page table attributes */ struct iommu_hwpt_vtd_s1 s1_cfg; + /* link to parent domain siblings */ + struct list_head s2_link; }; }; diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index f26c7f1c46cc..6a75f6eb18f1 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -70,7 +70,13 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, static void intel_nested_domain_free(struct iommu_domain *domain) { - kfree(to_dmar_domain(domain)); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct dmar_domain *s2_domain = dmar_domain->s2_domain; + + spin_lock(&s2_domain->s1_lock); + list_del(&dmar_domain->s2_link); + spin_unlock(&s2_domain->s1_lock); + kfree(dmar_domain); } static void nested_flush_dev_iotlb(struct dmar_domain *domain, u64 addr, @@ -201,5 +207,9 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, spin_lock_init(&domain->lock); xa_init(&domain->iommu_array); + spin_lock(&s2_domain->s1_lock); + list_add(&domain->s2_link, &s2_domain->s1_domains); + spin_unlock(&s2_domain->s1_lock); + return &domain->domain; } -- Gitee From e0b03139f311797f33e1e0321a7941532711e298 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:53 +0800 Subject: [PATCH 121/219] iommu/vt-d: Add __iommu_flush_iotlb_psi() ANBZ: #13617 commit 0455d317f533e4427ddaa805563ff48711f05810 upstream. Add __iommu_flush_iotlb_psi() to do the psi iotlb flush with a DID input rather than calculating it within the helper. This is useful when flushing cache for parent domain which reuses DIDs of its nested domains. Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-3-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 78 ++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index cfd787292ef0..aa49f0833cce 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1380,6 +1380,46 @@ static void domain_flush_pasid_iotlb(struct intel_iommu *iommu, spin_unlock_irqrestore(&domain->lock, flags); } +static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, + unsigned long pfn, unsigned int pages, + int ih) +{ + unsigned int aligned_pages = __roundup_pow_of_two(pages); + unsigned long bitmask = aligned_pages - 1; + unsigned int mask = ilog2(aligned_pages); + u64 addr = (u64)pfn << VTD_PAGE_SHIFT; + + /* + * PSI masks the low order bits of the base address. If the + * address isn't aligned to the mask, then compute a mask value + * needed to ensure the target range is flushed. + */ + if (unlikely(bitmask & pfn)) { + unsigned long end_pfn = pfn + pages - 1, shared_bits; + + /* + * Since end_pfn <= pfn + bitmask, the only way bits + * higher than bitmask can differ in pfn and end_pfn is + * by carrying. This means after masking out bitmask, + * high bits starting with the first set bit in + * shared_bits are all equal in both pfn and end_pfn. + */ + shared_bits = ~(pfn ^ end_pfn) & ~bitmask; + mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; + } + + /* + * Fallback to domain selective flush if no PSI support or + * the size is too big. + */ + if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap)) + iommu->flush.flush_iotlb(iommu, did, 0, 0, + DMA_TLB_DSI_FLUSH); + else + iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, + DMA_TLB_PSI_FLUSH); +} + static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, struct dmar_domain *domain, unsigned long pfn, unsigned int pages, @@ -1396,42 +1436,10 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, if (ih) ih = 1 << 6; - if (domain->use_first_level) { + if (domain->use_first_level) domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih); - } else { - unsigned long bitmask = aligned_pages - 1; - - /* - * PSI masks the low order bits of the base address. If the - * address isn't aligned to the mask, then compute a mask value - * needed to ensure the target range is flushed. - */ - if (unlikely(bitmask & pfn)) { - unsigned long end_pfn = pfn + pages - 1, shared_bits; - - /* - * Since end_pfn <= pfn + bitmask, the only way bits - * higher than bitmask can differ in pfn and end_pfn is - * by carrying. This means after masking out bitmask, - * high bits starting with the first set bit in - * shared_bits are all equal in both pfn and end_pfn. - */ - shared_bits = ~(pfn ^ end_pfn) & ~bitmask; - mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; - } - - /* - * Fallback to domain selective flush if no PSI support or - * the size is too big. - */ - if (!cap_pgsel_inv(iommu->cap) || - mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, - DMA_TLB_PSI_FLUSH); - } + else + __iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih); /* * In caching mode, changes of pages from non-present to present require -- Gitee From 6c665221a41fe5bc064d3c037c1dec2ceb344627 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:54 +0800 Subject: [PATCH 122/219] iommu/vt-d: Add missing iotlb flush for parent domain ANBZ: #13617 commit 821985301124c0a5e7ca15fc69a9a531e9442d70 upstream. If a domain is used as the parent in nested translation its mappings might be cached using DID of the nested domain. But the existing code ignores this fact to only invalidate the iotlb entries tagged by the domain's own DID. Loop the s1_domains list, if any, to invalidate all iotlb entries related to the target s2 address range. According to VT-d spec there is no need for software to explicitly flush the affected s1 cache. It's implicitly done by HW when s2 cache is invalidated. Fixes: b41e38e22539 ("iommu/vt-d: Add nested domain allocation") Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-4-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel [ Shuai Xue: Conflicts: drivers/iommu/intel/iommu.c commit b600b9978904 introduces iommu_put_pages_list() in intel_iommu_tlb_sync ] Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index aa49f0833cce..42b2dd046f0b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1463,6 +1463,28 @@ static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain * iommu_flush_write_buffer(iommu); } +/* + * Flush the relevant caches in nested translation if the domain + * also serves as a parent + */ +static void parent_domain_flush(struct dmar_domain *domain, + unsigned long pfn, + unsigned long pages, int ih) +{ + struct dmar_domain *s1_domain; + + spin_lock(&domain->s1_lock); + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + struct iommu_domain_info *info; + unsigned long i; + + xa_for_each(&s1_domain->iommu_array, i, info) + __iommu_flush_iotlb_psi(info->iommu, info->did, + pfn, pages, ih); + } + spin_unlock(&domain->s1_lock); +} + static void intel_flush_iotlb_all(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); @@ -1482,6 +1504,9 @@ static void intel_flush_iotlb_all(struct iommu_domain *domain) if (!cap_caching_mode(iommu->cap)) iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); } + + if (dmar_domain->nested_parent) + parent_domain_flush(dmar_domain, 0, -1, 0); } static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) @@ -2005,6 +2030,9 @@ static void switch_to_super_page(struct dmar_domain *domain, iommu_flush_iotlb_psi(info->iommu, domain, start_pfn, lvl_pages, 0, 0); + if (domain->nested_parent) + parent_domain_flush(domain, start_pfn, + lvl_pages, 0); } pte++; @@ -4198,6 +4226,9 @@ static void intel_iommu_tlb_sync(struct iommu_domain *domain, start_pfn, nrpages, list_empty(&gather->freelist), 0); + if (dmar_domain->nested_parent) + parent_domain_flush(dmar_domain, start_pfn, nrpages, + list_empty(&gather->freelist)); iommu_put_pages_list(&gather->freelist); } -- Gitee From 88282511a56828191c7312d0aa98c1bf9c23b206 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:55 +0800 Subject: [PATCH 123/219] iommu/vt-d: Update iotlb in nested domain attach ANBZ: #13617 commit 29e10487d6df050afeee886b7c1da208f389cb5b upstream. Should call domain_update_iotlb() to update the has_iotlb_device flag of the domain after attaching device to nested domain. Without it, this flag is not set properly and would result in missing device TLB flush. Fixes: 9838f2bb6b6b ("iommu/vt-d: Set the nested domain to a device") Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-5-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 4 +--- drivers/iommu/intel/iommu.h | 1 + drivers/iommu/intel/nested.c | 2 ++ 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 42b2dd046f0b..e46dee72b50d 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -396,8 +396,6 @@ static int domain_update_device_node(struct dmar_domain *domain) return nid; } -static void domain_update_iotlb(struct dmar_domain *domain); - /* Return the super pagesize bitmap if supported. */ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) { @@ -1230,7 +1228,7 @@ domain_lookup_dev_info(struct dmar_domain *domain, return NULL; } -static void domain_update_iotlb(struct dmar_domain *domain) +void domain_update_iotlb(struct dmar_domain *domain) { struct dev_pasid_info *dev_pasid; struct device_domain_info *info; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 0a2adb961062..f5e0726258d9 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1066,6 +1066,7 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, */ #define QI_OPT_WAIT_DRAIN BIT(0) +void domain_update_iotlb(struct dmar_domain *domain); int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 6a75f6eb18f1..d5af5925a31c 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -65,6 +65,8 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, list_add(&info->link, &dmar_domain->devices); spin_unlock_irqrestore(&dmar_domain->lock, flags); + domain_update_iotlb(dmar_domain); + return 0; } -- Gitee From bb40f2e14a50371a5c1c89e03100e6255d3efc97 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:56 +0800 Subject: [PATCH 124/219] iommu/vt-d: Add missing device iotlb flush for parent domain ANBZ: #13617 commit 5e54e861f16f37c84ae68d6d4bbd3de54b668317 upstream. ATS-capable devices cache the result of nested translation. This result relies on the mappings in s2 domain (a.k.a. parent). When there are modifications in the s2 domain, the related nested translation caches on the device should be flushed. This includes the devices that are attached to the s1 domain. However, the existing code ignores this fact to only loops its own devices. As there is no easy way to identify the exact set of nested translations affected by the change of s2 domain. So, this just flushes the entire device iotlb on the device. As above, driver loops the s2 domain's s1_domains list and loops the devices list of each s1_domain to flush the entire device iotlb on the devices. Fixes: b41e38e22539 ("iommu/vt-d: Add nested domain allocation") Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-6-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e46dee72b50d..6a57c6b0a932 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1473,12 +1473,30 @@ static void parent_domain_flush(struct dmar_domain *domain, spin_lock(&domain->s1_lock); list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + struct device_domain_info *device_info; struct iommu_domain_info *info; + unsigned long flags; unsigned long i; xa_for_each(&s1_domain->iommu_array, i, info) __iommu_flush_iotlb_psi(info->iommu, info->did, pfn, pages, ih); + + if (!s1_domain->has_iotlb_device) + continue; + + spin_lock_irqsave(&s1_domain->lock, flags); + list_for_each_entry(device_info, &s1_domain->devices, link) + /* + * Address translation cache in device side caches the + * result of nested translation. There is no easy way + * to identify the exact set of nested translations + * affected by a change in S2. So just flush the entire + * device cache. + */ + __iommu_flush_dev_iotlb(device_info, 0, + MAX_AGAW_PFN_WIDTH); + spin_unlock_irqrestore(&s1_domain->lock, flags); } spin_unlock(&domain->s1_lock); } -- Gitee From 9580a244e77c046be04919bb4be4a2099da7a81c Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:57 +0800 Subject: [PATCH 125/219] iommu/vt-d: Remove domain parameter for intel_pasid_setup_dirty_tracking() ANBZ: #13617 commit 56ecaf6c5834ace14941d7f13dceb48bc3327111 upstream. The only usage of input @domain is to get the domain id (DID) to flush cache after setting dirty tracking. However, DID can be obtained from the pasid entry. So no need to pass in domain. This can make this helper cleaner when adding the missing dirty tracking for the parent domain, which needs to use the DID of nested domain. Signed-off-by: Yi Liu Reviewed-by: Joao Martins Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-7-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 7 +++---- drivers/iommu/intel/pasid.c | 3 +-- drivers/iommu/intel/pasid.h | 1 - 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6a57c6b0a932..8fed34b53537 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4802,8 +4802,7 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, goto out_unlock; list_for_each_entry(info, &dmar_domain->devices, link) { - ret = intel_pasid_setup_dirty_tracking(info->iommu, - info->domain, info->dev, + ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, IOMMU_NO_PASID, enable); if (ret) goto err_unwind; @@ -4817,8 +4816,8 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, err_unwind: list_for_each_entry(info, &dmar_domain->devices, link) - intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain, - info->dev, IOMMU_NO_PASID, + intel_pasid_setup_dirty_tracking(info->iommu, info->dev, + IOMMU_NO_PASID, dmar_domain->dirty_tracking); spin_unlock(&dmar_domain->lock); return ret; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index e1cdf8899fa9..dfe4257856c8 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -431,7 +431,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, * Set up dirty tracking on a second only or nested translation type. */ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, - struct dmar_domain *domain, struct device *dev, u32 pasid, bool enabled) { @@ -448,7 +447,7 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, return -ENODEV; } - did = domain_id_iommu(domain, iommu); + did = pasid_get_domain_id(pte); pgtt = pasid_pte_get_pgtt(pte); if (pgtt != PASID_ENTRY_PGTT_SL_ONLY && pgtt != PASID_ENTRY_PGTT_NESTED) { diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 8d40d4c66e31..487ede039bdd 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -307,7 +307,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid); int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, - struct dmar_domain *domain, struct device *dev, u32 pasid, bool enabled); int intel_pasid_setup_pass_through(struct intel_iommu *iommu, -- Gitee From b9bbbb5e75cb1da151fa1748cda4c02192318e02 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:58 +0800 Subject: [PATCH 126/219] iommu/vt-d: Wrap the dirty tracking loop to be a helper ANBZ: #13617 commit 0c7f2497b39da44253d7bcf2b41f52b0048859ad upstream. Add device_set_dirty_tracking() to loop all the devices and set the dirty tracking per the @enable parameter. Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Joao Martins Link: https://lore.kernel.org/r/20240208082307.15759-8-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8fed34b53537..6b3320b513f5 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4790,23 +4790,38 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) return vtd; } +/* + * Set dirty tracking for the device list of a domain. The caller must + * hold the domain->lock when calling it. + */ +static int device_set_dirty_tracking(struct list_head *devices, bool enable) +{ + struct device_domain_info *info; + int ret = 0; + + list_for_each_entry(info, devices, link) { + ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, + IOMMU_NO_PASID, enable); + if (ret) + break; + } + + return ret; +} + static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, bool enable) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct device_domain_info *info; int ret; spin_lock(&dmar_domain->lock); if (dmar_domain->dirty_tracking == enable) goto out_unlock; - list_for_each_entry(info, &dmar_domain->devices, link) { - ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, - IOMMU_NO_PASID, enable); - if (ret) - goto err_unwind; - } + ret = device_set_dirty_tracking(&dmar_domain->devices, enable); + if (ret) + goto err_unwind; dmar_domain->dirty_tracking = enable; out_unlock: @@ -4815,10 +4830,8 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, return 0; err_unwind: - list_for_each_entry(info, &dmar_domain->devices, link) - intel_pasid_setup_dirty_tracking(info->iommu, info->dev, - IOMMU_NO_PASID, - dmar_domain->dirty_tracking); + device_set_dirty_tracking(&dmar_domain->devices, + dmar_domain->dirty_tracking); spin_unlock(&dmar_domain->lock); return ret; } -- Gitee From b05d1b7ef08c712d5ae5f314032132fb08baafe3 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:59 +0800 Subject: [PATCH 127/219] iommu/vt-d: Add missing dirty tracking set for parent domain ANBZ: #13617 commit f1e1610950eac0af5e40f6ee02315952f78192f7 upstream. Setting dirty tracking for a s2 domain requires to loop all the related devices and set the dirty tracking enable bit in the PASID table entry. This includes the devices that are attached to the nested domains of a s2 domain if this s2 domain is used as parent. However, the existing dirty tracking set only loops s2 domain's own devices. It will miss dirty page logs in the parent domain. Now, the parent domain tracks the nested domains, so it can loop the nested domains and the devices attached to the nested domains to ensure dirty tracking on the parent is set completely. Fixes: b41e38e22539 ("iommu/vt-d: Add nested domain allocation") Signed-off-by: Yi Sun Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-9-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6b3320b513f5..4bb2fdc8d26d 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4809,6 +4809,35 @@ static int device_set_dirty_tracking(struct list_head *devices, bool enable) return ret; } +static int parent_domain_set_dirty_tracking(struct dmar_domain *domain, + bool enable) +{ + struct dmar_domain *s1_domain; + unsigned long flags; + int ret; + + spin_lock(&domain->s1_lock); + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + spin_lock_irqsave(&s1_domain->lock, flags); + ret = device_set_dirty_tracking(&s1_domain->devices, enable); + spin_unlock_irqrestore(&s1_domain->lock, flags); + if (ret) + goto err_unwind; + } + spin_unlock(&domain->s1_lock); + return 0; + +err_unwind: + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + spin_lock_irqsave(&s1_domain->lock, flags); + device_set_dirty_tracking(&s1_domain->devices, + domain->dirty_tracking); + spin_unlock_irqrestore(&s1_domain->lock, flags); + } + spin_unlock(&domain->s1_lock); + return ret; +} + static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, bool enable) { @@ -4823,6 +4852,12 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, if (ret) goto err_unwind; + if (dmar_domain->nested_parent) { + ret = parent_domain_set_dirty_tracking(dmar_domain, enable); + if (ret) + goto err_unwind; + } + dmar_domain->dirty_tracking = enable; out_unlock: spin_unlock(&dmar_domain->lock); -- Gitee From 4a11810a528b2d05ae7384600d84abe2682c7f6d Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:16:00 +0800 Subject: [PATCH 128/219] iommu/vt-d: Set SSADE when attaching to a parent with dirty tracking ANBZ: #13617 commit 1f0198fce68340e0da2d438f4ea9fc20d2c958da upstream. Should set the SSADE (Second Stage Access/Dirty bit Enable) bit of the pasid entry when attaching a device to a nested domain if its parent has already enabled dirty tracking. Fixes: 111bf85c68f6 ("iommu/vt-d: Add helper to setup pasid nested translation") Signed-off-by: Yi Liu Reviewed-by: Joao Martins Link: https://lore.kernel.org/r/20240208091414.28133-1-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/pasid.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index dfe4257856c8..1fd5e4f7b9e4 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -660,6 +660,8 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, pasid_set_domain_id(pte, did); pasid_set_address_width(pte, s2_domain->agaw); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (s2_domain->dirty_tracking) + pasid_set_ssade(pte); pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); pasid_set_present(pte); spin_unlock(&iommu->lock); -- Gitee From f5c8ed292f220dad465ddb826993fc43dae65dc8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 19 Feb 2024 19:16:01 +0800 Subject: [PATCH 129/219] iommu/vt-d: Fix constant-out-of-range warning ANBZ: #13617 commit 4578f989ed6b77c14af86bb882cc5ff0a46a69eb upstream. On 32-bit builds, the vt-d driver causes a warning with clang: drivers/iommu/intel/nested.c:112:13: error: result of comparison of constant 18446744073709551615 with expression of type 'unsigned long' is always false [-Werror,-Wtautological-constant-out-of-range-compare] 112 | if (npages == U64_MAX) | ~~~~~~ ^ ~~~~~~~ Make the variable a 64-bit type, which matches both the caller and the use anyway. Fixes: f6f3721244a8 ("iommu/vt-d: Add iotlb flush for nested domain") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240213095832.455245-1-arnd@kernel.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index d5af5925a31c..a7d68f3d518a 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -103,7 +103,7 @@ static void nested_flush_dev_iotlb(struct dmar_domain *domain, u64 addr, } static void intel_nested_flush_cache(struct dmar_domain *domain, u64 addr, - unsigned long npages, bool ih) + u64 npages, bool ih) { struct iommu_domain_info *info; unsigned int mask; -- Gitee From daf9dd47c3357a77d02abe729206d077878033da Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 20 Feb 2024 14:43:54 -0400 Subject: [PATCH 130/219] iommufd: Reject non-zero data_type if no data_len is provided ANBZ: #13617 commit 7adc0c1cfa7732b81bf7bf2ed16ffb99719ceebf upstream. Since the current design doesn't forward the data_type to the driver to check unless there is a data_len/uptr for a driver specific struct we should check and ensure that data_type is 0 if data_len is 0. Otherwise any value is permitted. Fixes: bd529dbb661d ("iommufd: Add a nested HW pagetable object") Link: https://lore.kernel.org/r/0-v1-9b1ea6869554+110c60-iommufd_ck_data_type_jgg@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/hw_pagetable.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 3f3f1fa1a0a9..33d142f8057d 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -263,7 +263,8 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) if (cmd->__reserved) return -EOPNOTSUPP; - if (cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len) + if ((cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len) || + (cmd->data_type != IOMMU_HWPT_DATA_NONE && !cmd->data_len)) return -EINVAL; idev = iommufd_get_device(ucmd, cmd->dev_id); -- Gitee From ef7c729386b63d93f01d86b44de7b9a833328530 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 21 Feb 2024 20:27:02 -0400 Subject: [PATCH 131/219] iommu/arm-smmu-v3: Do not use GFP_KERNEL under as spinlock ANBZ: #13617 commit b5bf7778b722105d7a04b1d51e884497b542638b upstream. If the SMMU is configured to use a two level CD table then arm_smmu_write_ctx_desc() allocates a CD table leaf internally using GFP_KERNEL. Due to recent changes this is being done under a spinlock to iterate over the device list - thus it will trigger a sleeping while atomic warning: arm_smmu_sva_set_dev_pasid() mutex_lock(&sva_lock); __arm_smmu_sva_bind() arm_smmu_mmu_notifier_get() spin_lock_irqsave() arm_smmu_write_ctx_desc() arm_smmu_get_cd_ptr() arm_smmu_alloc_cd_leaf_table() dmam_alloc_coherent(GFP_KERNEL) This is a 64K high order allocation and really should not be done atomically. At the moment the rework of the SVA to follow the new API is half finished. Recently the CD table memory was moved from the domain to the master, however we have the confusing situation where the SVA code is wrongly using the RID domains device's list to track which CD tables the SVA is installed in. Remove the logic to replicate the CD across all the domain's masters during attach. We know which master and which CD table the PASID should be installed in. Right now SVA only works when dma-iommu.c is in control of the RID translation, which means we have a single iommu_domain shared across the entire group and that iommu_domain is not shared outside the group. Critically this means that the iommu_group->devices list and RID's smmu_domain->devices list describe the same set of masters. For PCI cases the core code also insists on singleton groups so there is only one entry in the smmu_domain->devices list that is equal to the master being passed in to arm_smmu_sva_set_dev_pasid(). Only non-PCI cases may have multi-device groups. However, the core code will repeat the calls to arm_smmu_sva_set_dev_pasid() across the entire iommu_group->devices list. Instead of having arm_smmu_mmu_notifier_get() indirectly loop over all the devices in the group via the RID's smmu_domain, rely on __arm_smmu_sva_bind() to be called for each device in the group and install the repeated CD entry that way. This avoids taking the spinlock to access the devices list and permits the arm_smmu_write_ctx_desc() to use a sleeping allocation. Leave the arm_smmu_mm_release() as a confusing situation, this requires tracking attached masters inside the SVA domain. Removing the loop allows arm_smmu_write_ctx_desc() to be called outside the spinlock and thus is safe to use GFP_KERNEL. Move the clearing of the CD into arm_smmu_sva_remove_dev_pasid() so that arm_smmu_mmu_notifier_get/put() remain paired functions. Fixes: 24503148c545 ("iommu/arm-smmu-v3: Refactor write_ctx_desc") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/4e25d161-0cf8-4050-9aa3-dfa21cd63e56@moroto.mountain/ Signed-off-by: Jason Gunthorpe Reviewed-by: Michael Shavit Link: https://lore.kernel.org/r/0-v3-11978fc67151+112-smmu_cd_atomic_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 38 ++++++------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 6513a98fcb72..874d1f977d90 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -291,10 +291,8 @@ arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain, struct mm_struct *mm) { int ret; - unsigned long flags; struct arm_smmu_ctx_desc *cd; struct arm_smmu_mmu_notifier *smmu_mn; - struct arm_smmu_master *master; list_for_each_entry(smmu_mn, &smmu_domain->mmu_notifiers, list) { if (smmu_mn->mn.mm == mm) { @@ -324,28 +322,9 @@ arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain, goto err_free_cd; } - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) { - ret = arm_smmu_write_ctx_desc(master, mm_get_enqcmd_pasid(mm), - cd); - if (ret) { - list_for_each_entry_from_reverse( - master, &smmu_domain->devices, domain_head) - arm_smmu_write_ctx_desc( - master, mm_get_enqcmd_pasid(mm), NULL); - break; - } - } - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - if (ret) - goto err_put_notifier; - list_add(&smmu_mn->list, &smmu_domain->mmu_notifiers); return smmu_mn; -err_put_notifier: - /* Frees smmu_mn */ - mmu_notifier_put(&smmu_mn->mn); err_free_cd: arm_smmu_free_shared_cd(cd); return ERR_PTR(ret); @@ -362,9 +341,6 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) list_del(&smmu_mn->list); - arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm), - NULL); - /* * If we went through clear(), we've already invalidated, and no * new TLB entry can have been formed. @@ -380,7 +356,8 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) arm_smmu_free_shared_cd(cd); } -static int __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) +static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid, + struct mm_struct *mm) { int ret; struct arm_smmu_bond *bond; @@ -403,9 +380,15 @@ static int __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) goto err_free_bond; } + ret = arm_smmu_write_ctx_desc(master, pasid, bond->smmu_mn->cd); + if (ret) + goto err_put_notifier; + list_add(&bond->list, &master->bonds); return 0; +err_put_notifier: + arm_smmu_mmu_notifier_put(bond->smmu_mn); err_free_bond: kfree(bond); return ret; @@ -556,6 +539,9 @@ void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, struct arm_smmu_master *master = dev_iommu_priv_get(dev); mutex_lock(&sva_lock); + + arm_smmu_write_ctx_desc(master, id, NULL); + list_for_each_entry(t, &master->bonds, list) { if (t->mm == mm) { bond = t; @@ -578,7 +564,7 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, struct mm_struct *mm = domain->mm; mutex_lock(&sva_lock); - ret = __arm_smmu_sva_bind(dev, mm); + ret = __arm_smmu_sva_bind(dev, id, mm); mutex_unlock(&sva_lock); return ret; -- Gitee From 675a2f3c9505396012e4fdc34ec1e1de6c6add33 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Wed, 31 Jan 2024 14:35:16 +0200 Subject: [PATCH 132/219] iommu/arm-smmu-qcom: Add X1E80100 MDSS compatible ANBZ: #13617 commit 12721e66005798b2a07bd8309060c230e52ba2b0 upstream. Add the X1E80100 MDSS compatible to clients compatible list, as it also needs the workarounds. Signed-off-by: Abel Vesa Link: https://lore.kernel.org/r/20240131-x1e80100-iommu-arm-smmu-qcom-v1-1-c1240419c718@linaro.org Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 1566cfd6c7e8..416c71b684c4 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -260,6 +260,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,sm6375-mdss" }, { .compatible = "qcom,sm8150-mdss" }, { .compatible = "qcom,sm8250-mdss" }, + { .compatible = "qcom,x1e80100-mdss" }, { } }; -- Gitee From 34fec29c4d742ff776b609058cc56113eb803130 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Feb 2024 10:07:41 -0400 Subject: [PATCH 133/219] iommu/sva: Restore SVA handle sharing ANBZ: #13617 commit 65d4418c5002ec5b0e529455bf4152fd43459079 upstream. Prior to commit 092edaddb660 ("iommu: Support mm PASID 1:n with sva domains") the code allowed a SVA handle to be bound multiple times to the same (mm, device) pair. This was alluded to in the kdoc comment, but we had understood this to be more a remark about allowing multiple devices, not a literal same-driver re-opening the same SVA. It turns out uacce and idxd were both relying on the core code to handle reference counting for same-device same-mm scenarios. As this looks hard to resolve in the drivers bring it back to the core code. The new design has changed the meaning of the domain->users refcount to refer to the number of devices that are sharing that domain for the same mm. This is part of the design to lift the SVA domain de-duplication out of the drivers. Return the old behavior by explicitly de-duplicating the struct iommu_sva handle. The same (mm, device) will return the same handle pointer and the core code will handle tracking this. The last unbind of the handle will destroy it. Fixes: 092edaddb660 ("iommu: Support mm PASID 1:n with sva domains") Reported-by: Zhangfei Gao Closes: https://lore.kernel.org/all/20240221110658.529-1-zhangfei.gao@linaro.org/ Tested-by: Zhangfei Gao Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/0-v1-9455fc497a6f+3b4-iommu_sva_sharing_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu-sva.c | 17 +++++++++++++++++ include/linux/iommu.h | 3 +++ 2 files changed, 20 insertions(+) diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index b51995b4fe90..6bebaedb786a 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -41,6 +41,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de } iommu_mm->pasid = pasid; INIT_LIST_HEAD(&iommu_mm->sva_domains); + INIT_LIST_HEAD(&iommu_mm->sva_handles); /* * Make sure the write to mm->iommu_mm is not reordered in front of * initialization to iommu_mm fields. If it does, readers may see a @@ -82,6 +83,14 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm goto out_unlock; } + list_for_each_entry(handle, &mm->iommu_mm->sva_handles, handle_item) { + if (handle->dev == dev) { + refcount_inc(&handle->users); + mutex_unlock(&iommu_sva_lock); + return handle; + } + } + handle = kzalloc(sizeof(*handle), GFP_KERNEL); if (!handle) { ret = -ENOMEM; @@ -108,7 +117,9 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (ret) goto out_free_domain; domain->users = 1; + refcount_set(&handle->users, 1); list_add(&domain->next, &mm->iommu_mm->sva_domains); + list_add(&handle->handle_item, &mm->iommu_mm->sva_handles); out: mutex_unlock(&iommu_sva_lock); @@ -141,6 +152,12 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) struct device *dev = handle->dev; mutex_lock(&iommu_sva_lock); + if (!refcount_dec_and_test(&handle->users)) { + mutex_unlock(&iommu_sva_lock); + return; + } + list_del(&handle->handle_item); + iommu_detach_device_pasid(domain, dev, iommu_mm->pasid); if (--domain->users == 0) { list_del(&domain->next); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ac9062b004e1..02ea0db236e6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1023,11 +1023,14 @@ struct iommu_fwspec { struct iommu_sva { struct device *dev; struct iommu_domain *domain; + struct list_head handle_item; + refcount_t users; }; struct iommu_mm_data { u32 pasid; struct list_head sva_domains; + struct list_head sva_handles; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, -- Gitee From ce40726e529d434951214f8c79fa56e60db2ddcf Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 22 Feb 2024 13:23:46 -0800 Subject: [PATCH 134/219] iommufd/selftest: Fix mock_dev_num bug ANBZ: #13617 commit fde372df96afddcda3ec94944351f2a14f7cd98d upstream. Syzkaller reported the following bug: sysfs: cannot create duplicate filename '/devices/iommufd_mock4' Call Trace: sysfs_warn_dup+0x71/0x90 sysfs_create_dir_ns+0x1ee/0x260 ? sysfs_create_mount_point+0x80/0x80 ? spin_bug+0x1d0/0x1d0 ? do_raw_spin_unlock+0x54/0x220 kobject_add_internal+0x221/0x970 kobject_add+0x11c/0x1e0 ? lockdep_hardirqs_on_prepare+0x273/0x3e0 ? kset_create_and_add+0x160/0x160 ? kobject_put+0x5d/0x390 ? bus_get_dev_root+0x4a/0x60 ? kobject_put+0x5d/0x390 device_add+0x1d5/0x1550 ? __fw_devlink_link_to_consumers.isra.0+0x1f0/0x1f0 ? __init_waitqueue_head+0xcb/0x150 iommufd_test+0x462/0x3b60 ? lock_release+0x1fe/0x640 ? __might_fault+0x117/0x170 ? reacquire_held_locks+0x4b0/0x4b0 ? iommufd_selftest_destroy+0xd0/0xd0 ? __might_fault+0xbe/0x170 iommufd_fops_ioctl+0x256/0x350 ? iommufd_option+0x180/0x180 ? __lock_acquire+0x1755/0x45f0 __x64_sys_ioctl+0xa13/0x1640 The bug is triggered when Syzkaller created multiple mock devices but didn't destroy them in the same sequence, messing up the mock_dev_num counter. Replace the atomic with an mock_dev_ida. Cc: stable@vger.kernel.org Fixes: 23a1b46f15d5 ("iommufd/selftest: Make the mock iommu driver into a real driver") Link: https://lore.kernel.org/r/5af41d5af6d5c013cc51de01427abb8141b3587e.1708636627.git.nicolinc@nvidia.com Reported-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/selftest.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index f94bc0b0f341..d59e199a8705 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -36,7 +36,7 @@ static struct mock_bus_type iommufd_mock_bus_type = { }, }; -static atomic_t mock_dev_num; +static DEFINE_IDA(mock_dev_ida); enum { MOCK_DIRTY_TRACK = 1, @@ -138,6 +138,7 @@ enum selftest_obj_type { struct mock_dev { struct device dev; unsigned long flags; + int id; }; struct selftest_obj { @@ -646,7 +647,7 @@ static void mock_dev_release(struct device *dev) { struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); - atomic_dec(&mock_dev_num); + ida_free(&mock_dev_ida, mdev->id); kfree(mdev); } @@ -668,8 +669,12 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) mdev->dev.release = mock_dev_release; mdev->dev.bus = &iommufd_mock_bus_type.bus; - rc = dev_set_name(&mdev->dev, "iommufd_mock%u", - atomic_inc_return(&mock_dev_num)); + rc = ida_alloc(&mock_dev_ida, GFP_KERNEL); + if (rc < 0) + goto err_put; + mdev->id = rc; + + rc = dev_set_name(&mdev->dev, "iommufd_mock%u", mdev->id); if (rc) goto err_put; -- Gitee From 449f3669621c830a5e8b268bae91408cf12825cd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 23 Feb 2024 14:44:08 -0400 Subject: [PATCH 135/219] iommufd/selftest: Don't check map/unmap pairing with HUGE_PAGES ANBZ: #13617 commit bb04d13353885f81c87879b2deb296bd2adb6cab upstream. Since MOCK_HUGE_PAGE_SIZE was introduced it allows the core code to invoke mock with large page sizes. This confuses the validation logic that checks that map/unmap are paired. This is because the page size computed for map is based on the physical address and in many cases will always be the base page size, however the entire range generated by iommufd will be passed to map. Randomly iommufd can see small groups of physically contiguous pages, (say 8k unaligned and grouped together), but that group crosses a huge page boundary. The map side will observe this as a contiguous run and mark it accordingly, but there is a chance the unmap side will end up terminating interior huge pages in the middle of that group and trigger a validation failure. Meaning the validation only works if the core code passes the iova/length directly from iommufd to mock. syzkaller randomly hits this with failures like: WARNING: CPU: 0 PID: 11568 at drivers/iommu/iommufd/selftest.c:461 mock_domain_unmap_pages+0x1c0/0x250 Modules linked in: CPU: 0 PID: 11568 Comm: syz-executor.0 Not tainted 6.8.0-rc3+ #4 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:mock_domain_unmap_pages+0x1c0/0x250 Code: 2b e8 94 37 0f ff 48 d1 eb 31 ff 48 b8 00 00 00 00 00 00 20 00 48 21 c3 48 89 de e8 aa 32 0f ff 48 85 db 75 07 e8 70 37 0f ff <0f> 0b e8 69 37 0f ff 31 f6 31 ff e8 90 32 0f ff e8 5b 37 0f ff 4c RSP: 0018:ffff88800e707490 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff822dfae6 RDX: ffff88800cf86400 RSI: ffffffff822dfaf0 RDI: 0000000000000007 RBP: ffff88800e7074d8 R08: 0000000000000000 R09: ffffed1001167c90 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000001500000 R13: 0000000000083000 R14: 0000000000000001 R15: 0000000000000800 FS: 0000555556048480(0000) GS:ffff88806d400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2dc23000 CR3: 0000000008cbb000 CR4: 0000000000350eb0 Call Trace: __iommu_unmap+0x281/0x520 iommu_unmap+0xc9/0x180 iopt_area_unmap_domain_range+0x1b1/0x290 iopt_area_unpin_domain+0x590/0x800 __iopt_area_unfill_domain+0x22e/0x650 iopt_area_unfill_domain+0x47/0x60 iopt_unfill_domain+0x187/0x590 iopt_table_remove_domain+0x267/0x2d0 iommufd_hwpt_paging_destroy+0x1f1/0x370 iommufd_object_remove+0x2a3/0x490 iommufd_device_detach+0x23a/0x2c0 iommufd_selftest_destroy+0x7a/0xf0 iommufd_fops_release+0x1d3/0x340 __fput+0x272/0xb50 __fput_sync+0x4b/0x60 __x64_sys_close+0x8b/0x110 do_syscall_64+0x71/0x140 entry_SYSCALL_64_after_hwframe+0x46/0x4e Do the simple thing and just disable the validation when the huge page tests are being run. Fixes: 7db521e23fe9 ("iommufd/selftest: Hugepage mock domain support") Link: https://lore.kernel.org/r/0-v1-1e17e60a5c8a+103fb-iommufd_mock_hugepg_jgg@nvidia.com Reviewed-by: Joao Martins Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/selftest.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index d59e199a8705..7a2199470f31 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -446,20 +446,27 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain, /* * iommufd generates unmaps that must be a strict - * superset of the map's performend So every starting - * IOVA should have been an iova passed to map, and the + * superset of the map's performend So every + * starting/ending IOVA should have been an iova passed + * to map. * - * First IOVA must be present and have been a first IOVA - * passed to map_pages + * This simple logic doesn't work when the HUGE_PAGE is + * turned on since the core code will automatically + * switch between the two page sizes creating a break in + * the unmap calls. The break can land in the middle of + * contiguous IOVA. */ - if (first) { - WARN_ON(ent && !(xa_to_value(ent) & - MOCK_PFN_START_IOVA)); - first = false; + if (!(domain->pgsize_bitmap & MOCK_HUGE_PAGE_SIZE)) { + if (first) { + WARN_ON(ent && !(xa_to_value(ent) & + MOCK_PFN_START_IOVA)); + first = false; + } + if (pgcount == 1 && + cur + MOCK_IO_PAGE_SIZE == pgsize) + WARN_ON(ent && !(xa_to_value(ent) & + MOCK_PFN_LAST_IOVA)); } - if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) - WARN_ON(ent && !(xa_to_value(ent) & - MOCK_PFN_LAST_IOVA)); iova += MOCK_IO_PAGE_SIZE; ret += MOCK_IO_PAGE_SIZE; -- Gitee From c8d522d60fe8cd5b75c1caf444f0e39f18f26a79 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:12 -0400 Subject: [PATCH 136/219] iommu/arm-smmu-v3: Make STE programming independent of the callers ANBZ: #13617 commit 7da51af9125c624318c8099de13c5ddefd47e9e8 upstream. As the comment in arm_smmu_write_strtab_ent() explains, this routine has been limited to only work correctly in certain scenarios that the caller must ensure. Generally the caller must put the STE into ABORT or BYPASS before attempting to program it to something else. The iommu core APIs would ideally expect the driver to do a hitless change of iommu_domain in a number of cases: - RESV_DIRECT support wants IDENTITY -> DMA -> IDENTITY to be hitless for the RESV ranges - PASID upgrade has IDENTIY on the RID with no PASID then a PASID paging domain installed. The RID should not be impacted - PASID downgrade has IDENTIY on the RID and all PASID's removed. The RID should not be impacted - RID does PAGING -> BLOCKING with active PASID, PASID's should not be impacted - NESTING -> NESTING for carrying all the above hitless cases in a VM into the hypervisor. To comprehensively emulate the HW in a VM we should assume the VM OS is running logic like this and expecting hitless updates to be relayed to real HW. For CD updates arm_smmu_write_ctx_desc() has a similar comment explaining how limited it is, and the driver does have a need for hitless CD updates: - SMMUv3 BTM S1 ASID re-label - SVA mm release should change the CD to answert not-present to all requests without allowing logging (EPD0) The next patches/series are going to start removing some of this logic from the callers, and add more complex state combinations than currently. At the end everything that can be hitless will be hitless, including all of the above. Introduce arm_smmu_write_ste() which will run through the multi-qword programming sequence to avoid creating an incoherent 'torn' STE in the HW caches. It automatically detects which of two algorithms to use: 1) The disruptive V=0 update described in the spec which disrupts the entry and does three syncs to make the change: - Write V=0 to QWORD 0 - Write the entire STE except QWORD 0 - Write QWORD 0 2) A hitless update algorithm that follows the same rational that the driver already uses. It is safe to change IGNORED bits that HW doesn't use: - Write the target value into all currently unused bits - Write a single QWORD, this makes the new STE live atomically - Ensure now unused bits are 0 The detection of which path to use and the implementation of the hitless update rely on a "used bitmask" describing what bits the HW is actually using based on the V/CFG/etc bits. This flows from the spec language, typically indicated as IGNORED. Knowing which bits the HW is using we can update the bits it does not use and then compute how many QWORDS need to be changed. If only one qword needs to be updated the hitless algorithm is possible. Later patches will include CD updates in this mechanism so make the implementation generic using a struct arm_smmu_entry_writer and struct arm_smmu_entry_writer_ops to abstract the differences between STE and CD to be plugged in. At this point it generates the same sequence of updates as the current code, except that zeroing the VMID on entry to BYPASS/ABORT will do an extra sync (this seems to be an existing bug). Going forward this will use a V=0 transition instead of cycling through ABORT if a hitfull change is required. This seems more appropriate as ABORT will fail DMAs without any logging, but dropping a DMA due to transient V=0 is probably signaling a bug, so the C_BAD_STE is valuable. Add STRTAB_STE_1_SHCFG_INCOMING to s2_cfg, this was editing the STE in place and subtly inherited the value of data[1] from abort/bypass. Signed-off-by: Michael Shavit Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 275 +++++++++++++++----- 1 file changed, 211 insertions(+), 64 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 9a92f3042422..5b1be0f9c712 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -47,6 +47,9 @@ enum arm_smmu_msi_index { ARM_SMMU_MAX_MSIS, }; +static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, + ioasid_t sid); + static phys_addr_t arm_smmu_msi_cfg[ARM_SMMU_MAX_MSIS][3] = { [EVTQ_MSI_INDEX] = { ARM_SMMU_EVTQ_IRQ_CFG0, @@ -966,6 +969,199 @@ void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid) arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } +/* + * Based on the value of ent report which bits of the STE the HW will access. It + * would be nice if this was complete according to the spec, but minimally it + * has to capture the bits this driver uses. + */ +static void arm_smmu_get_ste_used(const struct arm_smmu_ste *ent, + struct arm_smmu_ste *used_bits) +{ + unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent->data[0])); + + used_bits->data[0] = cpu_to_le64(STRTAB_STE_0_V); + if (!(ent->data[0] & cpu_to_le64(STRTAB_STE_0_V))) + return; + + used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_CFG); + + /* S1 translates */ + if (cfg & BIT(0)) { + used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT | + STRTAB_STE_0_S1CTXPTR_MASK | + STRTAB_STE_0_S1CDMAX); + used_bits->data[1] |= + cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | + STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | + STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW | + STRTAB_STE_1_EATS); + used_bits->data[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID); + } + + /* S2 translates */ + if (cfg & BIT(1)) { + used_bits->data[1] |= + cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG); + used_bits->data[2] |= + cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR | + STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI | + STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R); + used_bits->data[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK); + } + + if (cfg == STRTAB_STE_0_CFG_BYPASS) + used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); +} + +/* + * Figure out if we can do a hitless update of entry to become target. Returns a + * bit mask where 1 indicates that qword needs to be set disruptively. + * unused_update is an intermediate value of entry that has unused bits set to + * their new values. + */ +static u8 arm_smmu_entry_qword_diff(const struct arm_smmu_ste *entry, + const struct arm_smmu_ste *target, + struct arm_smmu_ste *unused_update) +{ + struct arm_smmu_ste target_used = {}; + struct arm_smmu_ste cur_used = {}; + u8 used_qword_diff = 0; + unsigned int i; + + arm_smmu_get_ste_used(entry, &cur_used); + arm_smmu_get_ste_used(target, &target_used); + + for (i = 0; i != ARRAY_SIZE(target_used.data); i++) { + /* + * Check that masks are up to date, the make functions are not + * allowed to set a bit to 1 if the used function doesn't say it + * is used. + */ + WARN_ON_ONCE(target->data[i] & ~target_used.data[i]); + + /* Bits can change because they are not currently being used */ + unused_update->data[i] = (entry->data[i] & cur_used.data[i]) | + (target->data[i] & ~cur_used.data[i]); + /* + * Each bit indicates that a used bit in a qword needs to be + * changed after unused_update is applied. + */ + if ((unused_update->data[i] & target_used.data[i]) != + target->data[i]) + used_qword_diff |= 1 << i; + } + return used_qword_diff; +} + +static bool entry_set(struct arm_smmu_device *smmu, ioasid_t sid, + struct arm_smmu_ste *entry, + const struct arm_smmu_ste *target, unsigned int start, + unsigned int len) +{ + bool changed = false; + unsigned int i; + + for (i = start; len != 0; len--, i++) { + if (entry->data[i] != target->data[i]) { + WRITE_ONCE(entry->data[i], target->data[i]); + changed = true; + } + } + + if (changed) + arm_smmu_sync_ste_for_sid(smmu, sid); + return changed; +} + +/* + * Update the STE/CD to the target configuration. The transition from the + * current entry to the target entry takes place over multiple steps that + * attempts to make the transition hitless if possible. This function takes care + * not to create a situation where the HW can perceive a corrupted entry. HW is + * only required to have a 64 bit atomicity with stores from the CPU, while + * entries are many 64 bit values big. + * + * The difference between the current value and the target value is analyzed to + * determine which of three updates are required - disruptive, hitless or no + * change. + * + * In the most general disruptive case we can make any update in three steps: + * - Disrupting the entry (V=0) + * - Fill now unused qwords, execpt qword 0 which contains V + * - Make qword 0 have the final value and valid (V=1) with a single 64 + * bit store + * + * However this disrupts the HW while it is happening. There are several + * interesting cases where a STE/CD can be updated without disturbing the HW + * because only a small number of bits are changing (S1DSS, CONFIG, etc) or + * because the used bits don't intersect. We can detect this by calculating how + * many 64 bit values need update after adjusting the unused bits and skip the + * V=0 process. This relies on the IGNORED behavior described in the + * specification. + */ +static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, + struct arm_smmu_ste *entry, + const struct arm_smmu_ste *target) +{ + unsigned int num_entry_qwords = ARRAY_SIZE(target->data); + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_ste unused_update; + u8 used_qword_diff; + + used_qword_diff = + arm_smmu_entry_qword_diff(entry, target, &unused_update); + if (hweight8(used_qword_diff) == 1) { + /* + * Only one qword needs its used bits to be changed. This is a + * hitless update, update all bits the current STE is ignoring + * to their new values, then update a single "critical qword" to + * change the STE and finally 0 out any bits that are now unused + * in the target configuration. + */ + unsigned int critical_qword_index = ffs(used_qword_diff) - 1; + + /* + * Skip writing unused bits in the critical qword since we'll be + * writing it in the next step anyways. This can save a sync + * when the only change is in that qword. + */ + unused_update.data[critical_qword_index] = + entry->data[critical_qword_index]; + entry_set(smmu, sid, entry, &unused_update, 0, num_entry_qwords); + entry_set(smmu, sid, entry, target, critical_qword_index, 1); + entry_set(smmu, sid, entry, target, 0, num_entry_qwords); + } else if (used_qword_diff) { + /* + * At least two qwords need their inuse bits to be changed. This + * requires a breaking update, zero the V bit, write all qwords + * but 0, then set qword 0 + */ + unused_update.data[0] = entry->data[0] & (~STRTAB_STE_0_V); + entry_set(smmu, sid, entry, &unused_update, 0, 1); + entry_set(smmu, sid, entry, target, 1, num_entry_qwords - 1); + entry_set(smmu, sid, entry, target, 0, 1); + } else { + /* + * No inuse bit changed. Sanity check that all unused bits are 0 + * in the entry. The target was already sanity checked by + * compute_qword_diff(). + */ + WARN_ON_ONCE( + entry_set(smmu, sid, entry, target, 0, num_entry_qwords)); + } + + /* It's likely that we'll want to use the new STE soon */ + if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) { + struct arm_smmu_cmdq_ent + prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG, + .prefetch = { + .sid = sid, + } }; + + arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd); + } +} + static void arm_smmu_sync_cd(struct arm_smmu_master *master, int ssid, bool leaf) { @@ -1249,34 +1445,12 @@ static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid) static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, struct arm_smmu_ste *dst) { - /* - * This is hideously complicated, but we only really care about - * three cases at the moment: - * - * 1. Invalid (all zero) -> bypass/fault (init) - * 2. Bypass/fault -> translation/bypass (attach) - * 3. Translation/bypass -> bypass/fault (detach) - * - * Given that we can't update the STE atomically and the SMMU - * doesn't read the thing in a defined order, that leaves us - * with the following maintenance requirements: - * - * 1. Update Config, return (init time STEs aren't live) - * 2. Write everything apart from dword 0, sync, write dword 0, sync - * 3. Update Config, sync - */ - u64 val = le64_to_cpu(dst->data[0]); - bool ste_live = false; + u64 val; struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_ctx_desc_cfg *cd_table = NULL; struct arm_smmu_s2_cfg *s2_cfg = NULL; struct arm_smmu_domain *smmu_domain = master->domain; - struct arm_smmu_cmdq_ent prefetch_cmd = { - .opcode = CMDQ_OP_PREFETCH_CFG, - .prefetch = { - .sid = sid, - }, - }; + struct arm_smmu_ste target = {}; if (smmu_domain) { switch (smmu_domain->stage) { @@ -1291,22 +1465,6 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, } } - if (val & STRTAB_STE_0_V) { - switch (FIELD_GET(STRTAB_STE_0_CFG, val)) { - case STRTAB_STE_0_CFG_BYPASS: - break; - case STRTAB_STE_0_CFG_S1_TRANS: - case STRTAB_STE_0_CFG_S2_TRANS: - ste_live = true; - break; - case STRTAB_STE_0_CFG_ABORT: - BUG_ON(!disable_bypass); - break; - default: - BUG(); /* STE corruption */ - } - } - /* Nuke the existing STE_0 value, as we're going to rewrite it */ val = STRTAB_STE_0_V; @@ -1317,16 +1475,11 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, else val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); - dst->data[0] = cpu_to_le64(val); - dst->data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, + target.data[0] = cpu_to_le64(val); + target.data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); - dst->data[2] = 0; /* Nuke the VMID */ - /* - * The SMMU can perform negative caching, so we must sync - * the STE regardless of whether the old value was live. - */ - if (smmu) - arm_smmu_sync_ste_for_sid(smmu, sid); + target.data[2] = 0; /* Nuke the VMID */ + arm_smmu_write_ste(master, sid, dst, &target); return; } @@ -1334,8 +1487,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ? STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1; - BUG_ON(ste_live); - dst->data[1] = cpu_to_le64( + target.data[1] = cpu_to_le64( FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | @@ -1344,7 +1496,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, if (smmu->features & ARM_SMMU_FEAT_STALLS && !master->stall_enabled) - dst->data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); + target.data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); val |= (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) | @@ -1353,8 +1505,9 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, } if (s2_cfg) { - BUG_ON(ste_live); - dst->data[2] = cpu_to_le64( + target.data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, + STRTAB_STE_1_SHCFG_INCOMING)); + target.data[2] = cpu_to_le64( FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) | #ifdef __BIG_ENDIAN @@ -1363,23 +1516,17 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2R); - dst->data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); + target.data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS); } if (master->ats_enabled) - dst->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS, + target.data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_TRANS)); - arm_smmu_sync_ste_for_sid(smmu, sid); - /* See comment in arm_smmu_write_ctx_desc() */ - WRITE_ONCE(dst->data[0], cpu_to_le64(val)); - arm_smmu_sync_ste_for_sid(smmu, sid); - - /* It's likely that we'll want to use the new STE soon */ - if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) - arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd); + target.data[0] = cpu_to_le64(val); + arm_smmu_write_ste(master, sid, dst, &target); } static void arm_smmu_init_bypass_stes(struct arm_smmu_ste *strtab, -- Gitee From 445943e6ba7c1afedc15370fb84644db71af248e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:13 -0400 Subject: [PATCH 137/219] iommu/arm-smmu-v3: Consolidate the STE generation for abort/bypass ANBZ: #13617 commit 7686aa5f8d61388eeaa64730363ffc8df20a481d upstream. This allows writing the flow of arm_smmu_write_strtab_ent() around abort and bypass domains more naturally. Note that the core code no longer supplies NULL domains, though there is still a flow in the driver that end up in arm_smmu_write_strtab_ent() with NULL. A later patch will remove it. Remove the duplicate calculation of the STE in arm_smmu_init_bypass_stes() and remove the force parameter. arm_smmu_rmr_install_bypass_ste() can now simply invoke arm_smmu_make_bypass_ste() directly. Rename arm_smmu_init_bypass_stes() to arm_smmu_init_initial_stes() to better reflect its purpose. Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 97 ++++++++++++--------- 1 file changed, 55 insertions(+), 42 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 5b1be0f9c712..636a274d82f3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1442,6 +1442,24 @@ static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid) arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } +static void arm_smmu_make_abort_ste(struct arm_smmu_ste *target) +{ + memset(target, 0, sizeof(*target)); + target->data[0] = cpu_to_le64( + STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT)); +} + +static void arm_smmu_make_bypass_ste(struct arm_smmu_ste *target) +{ + memset(target, 0, sizeof(*target)); + target->data[0] = cpu_to_le64( + STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS)); + target->data[1] = cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); +} + static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, struct arm_smmu_ste *dst) { @@ -1452,37 +1470,31 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, struct arm_smmu_domain *smmu_domain = master->domain; struct arm_smmu_ste target = {}; - if (smmu_domain) { - switch (smmu_domain->stage) { - case ARM_SMMU_DOMAIN_S1: - cd_table = &master->cd_table; - break; - case ARM_SMMU_DOMAIN_S2: - s2_cfg = &smmu_domain->s2_cfg; - break; - default: - break; - } - } - - /* Nuke the existing STE_0 value, as we're going to rewrite it */ - val = STRTAB_STE_0_V; - - /* Bypass/fault */ - if (!smmu_domain || !(cd_table || s2_cfg)) { - if (!smmu_domain && disable_bypass) - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT); + if (!smmu_domain) { + if (disable_bypass) + arm_smmu_make_abort_ste(&target); else - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); + arm_smmu_make_bypass_ste(&target); + arm_smmu_write_ste(master, sid, dst, &target); + return; + } - target.data[0] = cpu_to_le64(val); - target.data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, - STRTAB_STE_1_SHCFG_INCOMING)); - target.data[2] = 0; /* Nuke the VMID */ + switch (smmu_domain->stage) { + case ARM_SMMU_DOMAIN_S1: + cd_table = &master->cd_table; + break; + case ARM_SMMU_DOMAIN_S2: + s2_cfg = &smmu_domain->s2_cfg; + break; + case ARM_SMMU_DOMAIN_BYPASS: + arm_smmu_make_bypass_ste(&target); arm_smmu_write_ste(master, sid, dst, &target); return; } + /* Nuke the existing STE_0 value, as we're going to rewrite it */ + val = STRTAB_STE_0_V; + if (cd_table) { u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ? STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1; @@ -1529,22 +1541,20 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, arm_smmu_write_ste(master, sid, dst, &target); } -static void arm_smmu_init_bypass_stes(struct arm_smmu_ste *strtab, - unsigned int nent, bool force) +/* + * This can safely directly manipulate the STE memory without a sync sequence + * because the STE table has not been installed in the SMMU yet. + */ +static void arm_smmu_init_initial_stes(struct arm_smmu_ste *strtab, + unsigned int nent) { unsigned int i; - u64 val = STRTAB_STE_0_V; - - if (disable_bypass && !force) - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT); - else - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); for (i = 0; i < nent; ++i) { - strtab->data[0] = cpu_to_le64(val); - strtab->data[1] = cpu_to_le64(FIELD_PREP( - STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); - strtab->data[2] = 0; + if (disable_bypass) + arm_smmu_make_abort_ste(strtab); + else + arm_smmu_make_bypass_ste(strtab); strtab++; } } @@ -1572,7 +1582,7 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) return -ENOMEM; } - arm_smmu_init_bypass_stes(desc->l2ptr, 1 << STRTAB_SPLIT, false); + arm_smmu_init_initial_stes(desc->l2ptr, 1 << STRTAB_SPLIT); arm_smmu_write_strtab_l1_desc(strtab, desc); return 0; } @@ -3167,7 +3177,7 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits); cfg->strtab_base_cfg = reg; - arm_smmu_init_bypass_stes(strtab, cfg->num_l1_ents, false); + arm_smmu_init_initial_stes(strtab, cfg->num_l1_ents); return 0; } @@ -3879,7 +3889,6 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu) iort_get_rmr_sids(dev_fwnode(smmu->dev), &rmr_list); list_for_each_entry(e, &rmr_list, list) { - struct arm_smmu_ste *step; struct iommu_iort_rmr_data *rmr; int ret, i; @@ -3892,8 +3901,12 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu) continue; } - step = arm_smmu_get_step_for_sid(smmu, rmr->sids[i]); - arm_smmu_init_bypass_stes(step, 1, true); + /* + * STE table is not programmed to HW, see + * arm_smmu_initial_bypass_stes() + */ + arm_smmu_make_bypass_ste( + arm_smmu_get_step_for_sid(smmu, rmr->sids[i])); } } -- Gitee From 6cbc165e9ec63ee8aa06ba550f157acf6e3363c1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:14 -0400 Subject: [PATCH 138/219] iommu/arm-smmu-v3: Move the STE generation for S1 and S2 domains into functions ANBZ: #13617 commit efe15df08727d483bd247ff905a828f0de955de6 upstream. This is preparation to move the STE calculation higher up in to the call chain and remove arm_smmu_write_strtab_ent(). These new functions will be called directly from attach_dev. Reviewed-by: Moritz Fischer Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 138 ++++++++++++-------- 1 file changed, 83 insertions(+), 55 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 636a274d82f3..a418f667f469 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1460,13 +1460,89 @@ static void arm_smmu_make_bypass_ste(struct arm_smmu_ste *target) FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); } +static void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master) +{ + struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; + struct arm_smmu_device *smmu = master->smmu; + + memset(target, 0, sizeof(*target)); + target->data[0] = cpu_to_le64( + STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) | + FIELD_PREP(STRTAB_STE_0_S1FMT, cd_table->s1fmt) | + (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | + FIELD_PREP(STRTAB_STE_0_S1CDMAX, cd_table->s1cdmax)); + + target->data[1] = cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | + FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | + FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | + FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) | + ((smmu->features & ARM_SMMU_FEAT_STALLS && + !master->stall_enabled) ? + STRTAB_STE_1_S1STALLD : + 0) | + FIELD_PREP(STRTAB_STE_1_EATS, + master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); + + if (smmu->features & ARM_SMMU_FEAT_E2H) { + /* + * To support BTM the streamworld needs to match the + * configuration of the CPU so that the ASID broadcasts are + * properly matched. This means either S/NS-EL2-E2H (hypervisor) + * or NS-EL1 (guest). Since an SVA domain can be installed in a + * PASID this should always use a BTM compatible configuration + * if the HW supports it. + */ + target->data[1] |= cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_STRW, STRTAB_STE_1_STRW_EL2)); + } else { + target->data[1] |= cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_STRW, STRTAB_STE_1_STRW_NSEL1)); + + /* + * VMID 0 is reserved for stage-2 bypass EL1 STEs, see + * arm_smmu_domain_alloc_id() + */ + target->data[2] = + cpu_to_le64(FIELD_PREP(STRTAB_STE_2_S2VMID, 0)); + } +} + +static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain) +{ + struct arm_smmu_s2_cfg *s2_cfg = &smmu_domain->s2_cfg; + + memset(target, 0, sizeof(*target)); + target->data[0] = cpu_to_le64( + STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS)); + + target->data[1] = cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_EATS, + master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0) | + FIELD_PREP(STRTAB_STE_1_SHCFG, + STRTAB_STE_1_SHCFG_INCOMING)); + + target->data[2] = cpu_to_le64( + FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | + FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) | + STRTAB_STE_2_S2AA64 | +#ifdef __BIG_ENDIAN + STRTAB_STE_2_S2ENDI | +#endif + STRTAB_STE_2_S2PTW | + STRTAB_STE_2_S2R); + + target->data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); +} + static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, struct arm_smmu_ste *dst) { - u64 val; - struct arm_smmu_device *smmu = master->smmu; - struct arm_smmu_ctx_desc_cfg *cd_table = NULL; - struct arm_smmu_s2_cfg *s2_cfg = NULL; struct arm_smmu_domain *smmu_domain = master->domain; struct arm_smmu_ste target = {}; @@ -1481,63 +1557,15 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, switch (smmu_domain->stage) { case ARM_SMMU_DOMAIN_S1: - cd_table = &master->cd_table; + arm_smmu_make_cdtable_ste(&target, master); break; case ARM_SMMU_DOMAIN_S2: - s2_cfg = &smmu_domain->s2_cfg; + arm_smmu_make_s2_domain_ste(&target, master, smmu_domain); break; case ARM_SMMU_DOMAIN_BYPASS: arm_smmu_make_bypass_ste(&target); - arm_smmu_write_ste(master, sid, dst, &target); - return; - } - - /* Nuke the existing STE_0 value, as we're going to rewrite it */ - val = STRTAB_STE_0_V; - - if (cd_table) { - u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ? - STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1; - - target.data[1] = cpu_to_le64( - FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | - FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | - FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | - FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) | - FIELD_PREP(STRTAB_STE_1_STRW, strw)); - - if (smmu->features & ARM_SMMU_FEAT_STALLS && - !master->stall_enabled) - target.data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); - - val |= (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | - FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) | - FIELD_PREP(STRTAB_STE_0_S1CDMAX, cd_table->s1cdmax) | - FIELD_PREP(STRTAB_STE_0_S1FMT, cd_table->s1fmt); - } - - if (s2_cfg) { - target.data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, - STRTAB_STE_1_SHCFG_INCOMING)); - target.data[2] = cpu_to_le64( - FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | - FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) | -#ifdef __BIG_ENDIAN - STRTAB_STE_2_S2ENDI | -#endif - STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 | - STRTAB_STE_2_S2R); - - target.data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); - - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS); + break; } - - if (master->ats_enabled) - target.data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS, - STRTAB_STE_1_EATS_TRANS)); - - target.data[0] = cpu_to_le64(val); arm_smmu_write_ste(master, sid, dst, &target); } -- Gitee From 773449c98c73a0ecb14fa1260af95cb2f23157a6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:15 -0400 Subject: [PATCH 139/219] iommu/arm-smmu-v3: Build the whole STE in arm_smmu_make_s2_domain_ste() ANBZ: #13617 commit 71b0aa10b18dd589a899449457e5ab7c1da00d01 upstream. Half the code was living in arm_smmu_domain_finalise_s2(), just move it here and take the values directly from the pgtbl_ops instead of storing copies. Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 27 ++++++++++++--------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 -- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index a418f667f469..eab81c58dcff 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1515,6 +1515,11 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, struct arm_smmu_domain *smmu_domain) { struct arm_smmu_s2_cfg *s2_cfg = &smmu_domain->s2_cfg; + const struct io_pgtable_cfg *pgtbl_cfg = + &io_pgtable_ops_to_pgtable(smmu_domain->pgtbl_ops)->cfg; + typeof(&pgtbl_cfg->arm_lpae_s2_cfg.vtcr) vtcr = + &pgtbl_cfg->arm_lpae_s2_cfg.vtcr; + u64 vtcr_val; memset(target, 0, sizeof(*target)); target->data[0] = cpu_to_le64( @@ -1527,9 +1532,16 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); + vtcr_val = FIELD_PREP(STRTAB_STE_2_VTCR_S2T0SZ, vtcr->tsz) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2SL0, vtcr->sl) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2IR0, vtcr->irgn) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2OR0, vtcr->orgn) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2SH0, vtcr->sh) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2TG, vtcr->tg) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2PS, vtcr->ps); target->data[2] = cpu_to_le64( FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | - FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) | + FIELD_PREP(STRTAB_STE_2_VTCR, vtcr_val) | STRTAB_STE_2_S2AA64 | #ifdef __BIG_ENDIAN STRTAB_STE_2_S2ENDI | @@ -1537,7 +1549,8 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R); - target->data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); + target->data[3] = cpu_to_le64(pgtbl_cfg->arm_lpae_s2_cfg.vttbr & + STRTAB_STE_3_S2TTB_MASK); } static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, @@ -2278,7 +2291,6 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, int vmid; struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg; - typeof(&pgtbl_cfg->arm_lpae_s2_cfg.vtcr) vtcr; /* Reserve VMID 0 for stage-2 bypass STEs */ vmid = ida_alloc_range(&smmu->vmid_map, 1, (1 << smmu->vmid_bits) - 1, @@ -2286,16 +2298,7 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, if (vmid < 0) return vmid; - vtcr = &pgtbl_cfg->arm_lpae_s2_cfg.vtcr; cfg->vmid = (u16)vmid; - cfg->vttbr = pgtbl_cfg->arm_lpae_s2_cfg.vttbr; - cfg->vtcr = FIELD_PREP(STRTAB_STE_2_VTCR_S2T0SZ, vtcr->tsz) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2SL0, vtcr->sl) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2IR0, vtcr->irgn) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2OR0, vtcr->orgn) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2SH0, vtcr->sh) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2TG, vtcr->tg) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2PS, vtcr->ps); return 0; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 27ddf1acd12c..1be0c1151c50 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -609,8 +609,6 @@ struct arm_smmu_ctx_desc_cfg { struct arm_smmu_s2_cfg { u16 vmid; - u64 vttbr; - u64 vtcr; }; struct arm_smmu_strtab_cfg { -- Gitee From 13778d7fb4a1c9c5d0ed4bc89a8b0f8b57fc82c5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:16 -0400 Subject: [PATCH 140/219] iommu/arm-smmu-v3: Hold arm_smmu_asid_lock during all of attach_dev ANBZ: #13617 commit 9f7c68911579bc15c57d227d021ccd253da2b635 upstream. The BTM support wants to be able to change the ASID of any smmu_domain. When it goes to do this it holds the arm_smmu_asid_lock and iterates over the target domain's devices list. During attach of a S1 domain we must ensure that the devices list and CD are in sync, otherwise we could miss CD updates or a parallel CD update could push an out of date CD. This is pretty complicated, and almost works today because arm_smmu_detach_dev() removes the master from the linked list before working on the CD entries, preventing parallel update of the CD. However, it does have an issue where the CD can remain programed while the domain appears to be unattached. arm_smmu_share_asid() will then not clear any CD entriess and install its own CD entry with the same ASID concurrently. This creates a small race window where the IOMMU can see two ASIDs pointing to different translations. CPU0 CPU1 arm_smmu_attach_dev() arm_smmu_detach_dev() spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_del(&master->domain_head); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); arm_smmu_mmu_notifier_get() arm_smmu_alloc_shared_cd() arm_smmu_share_asid(): // Does nothing due to list_del above arm_smmu_update_ctx_desc_devices() arm_smmu_tlb_inv_asid() arm_smmu_write_ctx_desc() ** Now the ASID is in two CDs with different translation arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, NULL); Solve this by wrapping most of the attach flow in the arm_smmu_asid_lock. This locks more than strictly needed to prepare for the next patch which will reorganize the order of the linked list, STE and CD changes. Move arm_smmu_detach_dev() till after we have initialized the domain so the lock can be held for less time. Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 22 ++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index eab81c58dcff..0477d5b716c2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2562,8 +2562,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) return -EBUSY; } - arm_smmu_detach_dev(master); - mutex_lock(&smmu_domain->init_mutex); if (!smmu_domain->smmu) { @@ -2578,6 +2576,16 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) if (ret) return ret; + /* + * Prevent arm_smmu_share_asid() from trying to change the ASID + * of either the old or new domain while we are working on it. + * This allows the STE and the smmu_domain->devices list to + * be inconsistent during this routine. + */ + mutex_lock(&arm_smmu_asid_lock); + + arm_smmu_detach_dev(master); + master->domain = smmu_domain; /* @@ -2603,13 +2611,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) } } - /* - * Prevent SVA from concurrently modifying the CD or writing to - * the CD entry - */ - mutex_lock(&arm_smmu_asid_lock); ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, &smmu_domain->cd); - mutex_unlock(&arm_smmu_asid_lock); if (ret) { master->domain = NULL; goto out_list_del; @@ -2619,13 +2621,15 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) arm_smmu_install_ste_for_dev(master); arm_smmu_enable_ats(master); - return 0; + goto out_unlock; out_list_del: spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_del(&master->domain_head); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); +out_unlock: + mutex_unlock(&arm_smmu_asid_lock); return ret; } -- Gitee From 1696a6b3ee4c29b6980a8c853be223f4c2ecf914 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:17 -0400 Subject: [PATCH 141/219] iommu/arm-smmu-v3: Compute the STE only once for each master ANBZ: #13617 commit 65547275d76965c3106fbcd4a9244242eb88224c upstream. Currently arm_smmu_install_ste_for_dev() iterates over every SID and computes from scratch an identical STE. Every SID should have the same STE contents. Turn this inside out so that the STE is supplied by the caller and arm_smmu_install_ste_for_dev() simply installs it to every SID. This is possible now that the STE generation does not inform what sequence should be used to program it. This allows splitting the STE calculation up according to the call site, which following patches will make use of, and removes the confusing NULL domain special case that only supported arm_smmu_detach_dev(). Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 57 ++++++++------------- 1 file changed, 22 insertions(+), 35 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 0477d5b716c2..ba6ada2482f8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1553,35 +1553,6 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, STRTAB_STE_3_S2TTB_MASK); } -static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, - struct arm_smmu_ste *dst) -{ - struct arm_smmu_domain *smmu_domain = master->domain; - struct arm_smmu_ste target = {}; - - if (!smmu_domain) { - if (disable_bypass) - arm_smmu_make_abort_ste(&target); - else - arm_smmu_make_bypass_ste(&target); - arm_smmu_write_ste(master, sid, dst, &target); - return; - } - - switch (smmu_domain->stage) { - case ARM_SMMU_DOMAIN_S1: - arm_smmu_make_cdtable_ste(&target, master); - break; - case ARM_SMMU_DOMAIN_S2: - arm_smmu_make_s2_domain_ste(&target, master, smmu_domain); - break; - case ARM_SMMU_DOMAIN_BYPASS: - arm_smmu_make_bypass_ste(&target); - break; - } - arm_smmu_write_ste(master, sid, dst, &target); -} - /* * This can safely directly manipulate the STE memory without a sync sequence * because the STE table has not been installed in the SMMU yet. @@ -2389,7 +2360,8 @@ arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) } } -static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) +static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, + const struct arm_smmu_ste *target) { int i, j; struct arm_smmu_device *smmu = master->smmu; @@ -2406,7 +2378,7 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) if (j < i) continue; - arm_smmu_write_strtab_ent(master, sid, step); + arm_smmu_write_ste(master, sid, step, target); } } @@ -2513,6 +2485,7 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master) static void arm_smmu_detach_dev(struct arm_smmu_master *master) { unsigned long flags; + struct arm_smmu_ste target; struct arm_smmu_domain *smmu_domain = master->domain; if (!smmu_domain) @@ -2526,7 +2499,11 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) master->domain = NULL; master->ats_enabled = false; - arm_smmu_install_ste_for_dev(master); + if (disable_bypass) + arm_smmu_make_abort_ste(&target); + else + arm_smmu_make_bypass_ste(&target); + arm_smmu_install_ste_for_dev(master, &target); /* * Clearing the CD entry isn't strictly required to detach the domain * since the table is uninstalled anyway, but it helps avoid confusion @@ -2541,6 +2518,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) { int ret = 0; unsigned long flags; + struct arm_smmu_ste target; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); @@ -2602,7 +2580,8 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) list_add(&master->domain_head, &smmu_domain->devices); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { + switch (smmu_domain->stage) { + case ARM_SMMU_DOMAIN_S1: if (!master->cd_table.cdtab) { ret = arm_smmu_alloc_cd_tables(master); if (ret) { @@ -2616,9 +2595,17 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) master->domain = NULL; goto out_list_del; } - } - arm_smmu_install_ste_for_dev(master); + arm_smmu_make_cdtable_ste(&target, master); + break; + case ARM_SMMU_DOMAIN_S2: + arm_smmu_make_s2_domain_ste(&target, master, smmu_domain); + break; + case ARM_SMMU_DOMAIN_BYPASS: + arm_smmu_make_bypass_ste(&target); + break; + } + arm_smmu_install_ste_for_dev(master, &target); arm_smmu_enable_ats(master); goto out_unlock; -- Gitee From b23862292e3e23ceeaabbe464415c18694fdd508 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:18 -0400 Subject: [PATCH 142/219] iommu/arm-smmu-v3: Do not change the STE twice during arm_smmu_attach_dev() ANBZ: #13617 commit 8c73c32c83ce7f3c31864cb044abd3daefacd996 upstream. This was needed because the STE code required the STE to be in ABORT/BYPASS inorder to program a cdtable or S2 STE. Now that the STE code can automatically handle all transitions we can remove this step from the attach_dev flow. A few small bugs exist because of this: 1) If the core code does BLOCKED -> UNMANAGED with disable_bypass=false then there will be a moment where the STE points at BYPASS. Since this can be done by VFIO/IOMMUFD it is a small security race. 2) If the core code does IDENTITY -> DMA then any IOMMU_RESV_DIRECT regions will temporarily become BLOCKED. We'd like drivers to work in a way that allows IOMMU_RESV_DIRECT to be continuously functional during these transitions. Make arm_smmu_release_device() put the STE back to the correct ABORT/BYPASS setting. Fix a bug where a IOMMU_RESV_DIRECT was ignored on this path. As noted before the reordering of the linked list/STE/CD changes is OK against concurrent arm_smmu_share_asid() because of the arm_smmu_asid_lock. Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ba6ada2482f8..80630a9c81e3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2485,7 +2485,6 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master) static void arm_smmu_detach_dev(struct arm_smmu_master *master) { unsigned long flags; - struct arm_smmu_ste target; struct arm_smmu_domain *smmu_domain = master->domain; if (!smmu_domain) @@ -2499,11 +2498,6 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) master->domain = NULL; master->ats_enabled = false; - if (disable_bypass) - arm_smmu_make_abort_ste(&target); - else - arm_smmu_make_bypass_ste(&target); - arm_smmu_install_ste_for_dev(master, &target); /* * Clearing the CD entry isn't strictly required to detach the domain * since the table is uninstalled anyway, but it helps avoid confusion @@ -2845,9 +2839,18 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) static void arm_smmu_release_device(struct device *dev) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_ste target; if (WARN_ON(arm_smmu_master_sva_enabled(master))) iopf_queue_remove_device(master->smmu->evtq.iopf, dev); + + /* Put the STE back to what arm_smmu_init_strtab() sets */ + if (disable_bypass && !dev->iommu->require_direct) + arm_smmu_make_abort_ste(&target); + else + arm_smmu_make_bypass_ste(&target); + arm_smmu_install_ste_for_dev(master, &target); + arm_smmu_detach_dev(master); arm_smmu_disable_pasid(master); arm_smmu_remove_master(master); -- Gitee From 58cd6f156f3ee9c4a2852635daf9225844b2b143 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:19 -0400 Subject: [PATCH 143/219] iommu/arm-smmu-v3: Put writing the context descriptor in the right order ANBZ: #13617 commit d2e053d73247b68144c7f44d002ebf56acaf2d48 upstream. Get closer to the IOMMU API ideal that changes between domains can be hitless. The ordering for the CD table entry is not entirely clean from this perspective. When switching away from a STE with a CD table programmed in it we should write the new STE first, then clear any old data in the CD entry. If we are programming a CD table for the first time to a STE then the CD entry should be programmed before the STE is loaded. If we are replacing a CD table entry when the STE already points at the CD entry then we just need to do the make/break sequence. Lift this code out of arm_smmu_detach_dev() so it can all be sequenced properly. The only other caller is arm_smmu_release_device() and it is going to free the cdtable anyhow, so it doesn't matter what is in it. Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 29 ++++++++++++++------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 80630a9c81e3..a6c6db63a3b6 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2498,14 +2498,6 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) master->domain = NULL; master->ats_enabled = false; - /* - * Clearing the CD entry isn't strictly required to detach the domain - * since the table is uninstalled anyway, but it helps avoid confusion - * in the call to arm_smmu_write_ctx_desc on the next attach (which - * expects the entry to be empty). - */ - if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && master->cd_table.cdtab) - arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, NULL); } static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) @@ -2582,6 +2574,17 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) master->domain = NULL; goto out_list_del; } + } else { + /* + * arm_smmu_write_ctx_desc() relies on the entry being + * invalid to work, clear any existing entry. + */ + ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, + NULL); + if (ret) { + master->domain = NULL; + goto out_list_del; + } } ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, &smmu_domain->cd); @@ -2591,15 +2594,23 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) } arm_smmu_make_cdtable_ste(&target, master); + arm_smmu_install_ste_for_dev(master, &target); break; case ARM_SMMU_DOMAIN_S2: arm_smmu_make_s2_domain_ste(&target, master, smmu_domain); + arm_smmu_install_ste_for_dev(master, &target); + if (master->cd_table.cdtab) + arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, + NULL); break; case ARM_SMMU_DOMAIN_BYPASS: arm_smmu_make_bypass_ste(&target); + arm_smmu_install_ste_for_dev(master, &target); + if (master->cd_table.cdtab) + arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, + NULL); break; } - arm_smmu_install_ste_for_dev(master, &target); arm_smmu_enable_ats(master); goto out_unlock; -- Gitee From 4ec04d6d6d593bbcfc7e89962329898d944a204b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:20 -0400 Subject: [PATCH 144/219] iommu/arm-smmu-v3: Pass smmu_domain to arm_enable/disable_ats() ANBZ: #13617 commit d550ddc5b789f258cb5ce3bfe74af6d5383589b5 upstream. The caller already has the domain, just pass it in. A following patch will remove master->domain. Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/9-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index a6c6db63a3b6..0724a94a1c84 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2397,12 +2397,12 @@ static bool arm_smmu_ats_supported(struct arm_smmu_master *master) return dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)); } -static void arm_smmu_enable_ats(struct arm_smmu_master *master) +static void arm_smmu_enable_ats(struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain) { size_t stu; struct pci_dev *pdev; struct arm_smmu_device *smmu = master->smmu; - struct arm_smmu_domain *smmu_domain = master->domain; /* Don't enable ATS at the endpoint if it's not enabled in the STE */ if (!master->ats_enabled) @@ -2418,10 +2418,9 @@ static void arm_smmu_enable_ats(struct arm_smmu_master *master) dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu); } -static void arm_smmu_disable_ats(struct arm_smmu_master *master) +static void arm_smmu_disable_ats(struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain) { - struct arm_smmu_domain *smmu_domain = master->domain; - if (!master->ats_enabled) return; @@ -2490,7 +2489,7 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) if (!smmu_domain) return; - arm_smmu_disable_ats(master); + arm_smmu_disable_ats(master, smmu_domain); spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_del(&master->domain_head); @@ -2612,7 +2611,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) break; } - arm_smmu_enable_ats(master); + arm_smmu_enable_ats(master, smmu_domain); goto out_unlock; out_list_del: -- Gitee From 6786dc7ad5e3cd0317516cfca94b7dd6ec0ca0ba Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:21 -0400 Subject: [PATCH 145/219] iommu/arm-smmu-v3: Remove arm_smmu_master->domain ANBZ: #13617 commit 1b50017d39f650d78a0066734d6fe05920a8c9e8 upstream. Introducing global statics which are of type struct iommu_domain, not struct arm_smmu_domain makes it difficult to retain arm_smmu_master->domain, as it can no longer point to an IDENTITY or BLOCKED domain. The only place that uses the value is arm_smmu_detach_dev(). Change things to work like other drivers and call iommu_get_domain_for_dev() to obtain the current domain. The master->domain is subtly protecting the master->domain_head against being unused as only PAGING domains will set master->domain and only paging domains use the master->domain_head. To make it simple keep the master->domain_head initialized so that the list_del() logic just does nothing for attached non-PAGING domains. Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/10-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 26 ++++++++------------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 - 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 0724a94a1c84..efb52530fd83 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2483,19 +2483,20 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master) static void arm_smmu_detach_dev(struct arm_smmu_master *master) { + struct iommu_domain *domain = iommu_get_domain_for_dev(master->dev); + struct arm_smmu_domain *smmu_domain; unsigned long flags; - struct arm_smmu_domain *smmu_domain = master->domain; - if (!smmu_domain) + if (!domain) return; + smmu_domain = to_smmu_domain(domain); arm_smmu_disable_ats(master, smmu_domain); spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_del(&master->domain_head); + list_del_init(&master->domain_head); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - master->domain = NULL; master->ats_enabled = false; } @@ -2549,8 +2550,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) arm_smmu_detach_dev(master); - master->domain = smmu_domain; - /* * The SMMU does not support enabling ATS with bypass. When the STE is * in bypass (STE.Config[2:0] == 0b100), ATS Translation Requests and @@ -2569,10 +2568,8 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) case ARM_SMMU_DOMAIN_S1: if (!master->cd_table.cdtab) { ret = arm_smmu_alloc_cd_tables(master); - if (ret) { - master->domain = NULL; + if (ret) goto out_list_del; - } } else { /* * arm_smmu_write_ctx_desc() relies on the entry being @@ -2580,17 +2577,13 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) */ ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, NULL); - if (ret) { - master->domain = NULL; + if (ret) goto out_list_del; - } } ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, &smmu_domain->cd); - if (ret) { - master->domain = NULL; + if (ret) goto out_list_del; - } arm_smmu_make_cdtable_ste(&target, master); arm_smmu_install_ste_for_dev(master, &target); @@ -2616,7 +2609,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) out_list_del: spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_del(&master->domain_head); + list_del_init(&master->domain_head); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); out_unlock: @@ -2811,6 +2804,7 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) master->dev = dev; master->smmu = smmu; INIT_LIST_HEAD(&master->bonds); + INIT_LIST_HEAD(&master->domain_head); dev_iommu_priv_set(dev, master); ret = arm_smmu_insert_master(smmu, master); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 1be0c1151c50..21f2f7350101 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -695,7 +695,6 @@ struct arm_smmu_stream { struct arm_smmu_master { struct arm_smmu_device *smmu; struct device *dev; - struct arm_smmu_domain *domain; struct list_head domain_head; struct arm_smmu_stream *streams; /* Locked by the iommu core using the group mutex */ -- Gitee From 6eb341668f92984077c76c2e425bc684bf373a82 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:22 -0400 Subject: [PATCH 146/219] iommu/arm-smmu-v3: Check that the RID domain is S1 in SVA ANBZ: #13617 commit ae91f6552c301e5e8569667e9d5440d5f75a90c4 upstream. The SVA code only works if the RID domain is a S1 domain and has already installed the cdtable. Originally the check for this was in arm_smmu_sva_bind() but when the op was removed the test didn't get copied over to the new arm_smmu_sva_set_dev_pasid(). Without the test wrong usage usually will hit a WARN_ON() in arm_smmu_write_ctx_desc() due to a missing ctx table. However, the next patches wil change things so that an IDENTITY domain is not a struct arm_smmu_domain and this will get into memory corruption if the struct is wrongly casted. Fail in arm_smmu_sva_set_dev_pasid() if the STE does not have a S1, which is a proxy for the STE having a pointer to the CD table. Write it in a way that will be compatible with the next patches. Fixes: 386fa64fd52b ("arm-smmu-v3/sva: Add SVA domain support") Reported-by: Shameerali Kolothum Thodi Closes: https://lore.kernel.org/linux-iommu/2a828e481416405fb3a4cceb9e075a59@huawei.com/ Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/11-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 874d1f977d90..2cd433a9c8a0 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -363,7 +363,13 @@ static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid, struct arm_smmu_bond *bond; struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_domain *smmu_domain; + + if (!(domain->type & __IOMMU_DOMAIN_PAGING)) + return -ENODEV; + smmu_domain = to_smmu_domain(domain); + if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) + return -ENODEV; if (!master || !master->sva_enabled) return -ENODEV; -- Gitee From f5928e7c721b3a8b815d4de5b1fbdcc3919368a1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:23 -0400 Subject: [PATCH 147/219] iommu/arm-smmu-v3: Add a global static IDENTITY domain ANBZ: #13617 commit 12dacfb5b938cdd90ced0109165eee9cb27061d9 upstream. Move to the new static global for identity domains. Move all the logic out of arm_smmu_attach_dev into an identity only function. Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/12-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 82 +++++++++++++++------ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 - 2 files changed, 58 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index efb52530fd83..d9043e1cd933 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2176,8 +2176,7 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) return arm_smmu_sva_domain_alloc(); if (type != IOMMU_DOMAIN_UNMANAGED && - type != IOMMU_DOMAIN_DMA && - type != IOMMU_DOMAIN_IDENTITY) + type != IOMMU_DOMAIN_DMA) return NULL; /* @@ -2285,11 +2284,6 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; - if (domain->type == IOMMU_DOMAIN_IDENTITY) { - smmu_domain->stage = ARM_SMMU_DOMAIN_BYPASS; - return 0; - } - /* Restrict the stage to what we can actually support */ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1)) smmu_domain->stage = ARM_SMMU_DOMAIN_S2; @@ -2487,7 +2481,7 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) struct arm_smmu_domain *smmu_domain; unsigned long flags; - if (!domain) + if (!domain || !(domain->type & __IOMMU_DOMAIN_PAGING)) return; smmu_domain = to_smmu_domain(domain); @@ -2550,15 +2544,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) arm_smmu_detach_dev(master); - /* - * The SMMU does not support enabling ATS with bypass. When the STE is - * in bypass (STE.Config[2:0] == 0b100), ATS Translation Requests and - * Translated transactions are denied as though ATS is disabled for the - * stream (STE.EATS == 0b00), causing F_BAD_ATS_TREQ and - * F_TRANSL_FORBIDDEN events (IHI0070Ea 5.2 Stream Table Entry). - */ - if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS) - master->ats_enabled = arm_smmu_ats_supported(master); + master->ats_enabled = arm_smmu_ats_supported(master); spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_add(&master->domain_head, &smmu_domain->devices); @@ -2595,13 +2581,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, NULL); break; - case ARM_SMMU_DOMAIN_BYPASS: - arm_smmu_make_bypass_ste(&target); - arm_smmu_install_ste_for_dev(master, &target); - if (master->cd_table.cdtab) - arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, - NULL); - break; } arm_smmu_enable_ats(master, smmu_domain); @@ -2617,6 +2596,60 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) return ret; } +static int arm_smmu_attach_dev_ste(struct device *dev, + struct arm_smmu_ste *ste) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + + if (arm_smmu_master_sva_enabled(master)) + return -EBUSY; + + /* + * Do not allow any ASID to be changed while are working on the STE, + * otherwise we could miss invalidations. + */ + mutex_lock(&arm_smmu_asid_lock); + + /* + * The SMMU does not support enabling ATS with bypass/abort. When the + * STE is in bypass (STE.Config[2:0] == 0b100), ATS Translation Requests + * and Translated transactions are denied as though ATS is disabled for + * the stream (STE.EATS == 0b00), causing F_BAD_ATS_TREQ and + * F_TRANSL_FORBIDDEN events (IHI0070Ea 5.2 Stream Table Entry). + */ + arm_smmu_detach_dev(master); + + arm_smmu_install_ste_for_dev(master, ste); + mutex_unlock(&arm_smmu_asid_lock); + + /* + * This has to be done after removing the master from the + * arm_smmu_domain->devices to avoid races updating the same context + * descriptor from arm_smmu_share_asid(). + */ + if (master->cd_table.cdtab) + arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, NULL); + return 0; +} + +static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, + struct device *dev) +{ + struct arm_smmu_ste ste; + + arm_smmu_make_bypass_ste(&ste); + return arm_smmu_attach_dev_ste(dev, &ste); +} + +static const struct iommu_domain_ops arm_smmu_identity_ops = { + .attach_dev = arm_smmu_attach_dev_identity, +}; + +static struct iommu_domain arm_smmu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &arm_smmu_identity_ops, +}; + static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -3001,6 +3034,7 @@ static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid) } static struct iommu_ops arm_smmu_ops = { + .identity_domain = &arm_smmu_identity_domain, .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, .probe_device = arm_smmu_probe_device, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 21f2f7350101..154808f96718 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -712,7 +712,6 @@ struct arm_smmu_master { enum arm_smmu_domain_stage { ARM_SMMU_DOMAIN_S1 = 0, ARM_SMMU_DOMAIN_S2, - ARM_SMMU_DOMAIN_BYPASS, }; struct arm_smmu_domain { -- Gitee From 6e9e392dbbdfc9e7f067081863addab5d054c8f3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:24 -0400 Subject: [PATCH 148/219] iommu/arm-smmu-v3: Add a global static BLOCKED domain ANBZ: #13617 commit 352bd64cd8288c5c6808735d52a75809dfef8635 upstream. Using the same design as the IDENTITY domain install an STRTAB_STE_0_CFG_ABORT STE. Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/13-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index d9043e1cd933..0e755aa82716 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2650,6 +2650,24 @@ static struct iommu_domain arm_smmu_identity_domain = { .ops = &arm_smmu_identity_ops, }; +static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, + struct device *dev) +{ + struct arm_smmu_ste ste; + + arm_smmu_make_abort_ste(&ste); + return arm_smmu_attach_dev_ste(dev, &ste); +} + +static const struct iommu_domain_ops arm_smmu_blocked_ops = { + .attach_dev = arm_smmu_attach_dev_blocked, +}; + +static struct iommu_domain arm_smmu_blocked_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &arm_smmu_blocked_ops, +}; + static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -3035,6 +3053,7 @@ static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid) static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, + .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, .probe_device = arm_smmu_probe_device, -- Gitee From f8e8e801b40d475f2f77b03cef615d58b4a45a65 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:25 -0400 Subject: [PATCH 149/219] iommu/arm-smmu-v3: Use the identity/blocked domain during release ANBZ: #13617 commit d36464f40f29c984984168ea89e08629bdba41df upstream. Consolidate some more code by having release call arm_smmu_attach_dev_identity/blocked() instead of open coding this. Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/14-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 0e755aa82716..348aad28f60d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2894,19 +2894,16 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) static void arm_smmu_release_device(struct device *dev) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); - struct arm_smmu_ste target; if (WARN_ON(arm_smmu_master_sva_enabled(master))) iopf_queue_remove_device(master->smmu->evtq.iopf, dev); /* Put the STE back to what arm_smmu_init_strtab() sets */ if (disable_bypass && !dev->iommu->require_direct) - arm_smmu_make_abort_ste(&target); + arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev); else - arm_smmu_make_bypass_ste(&target); - arm_smmu_install_ste_for_dev(master, &target); + arm_smmu_attach_dev_identity(&arm_smmu_identity_domain, dev); - arm_smmu_detach_dev(master); arm_smmu_disable_pasid(master); arm_smmu_remove_master(master); if (master->cd_table.cdtab) -- Gitee From 629c1a78542be854c7c7806c63061ee1f70d15ce Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:26 -0400 Subject: [PATCH 150/219] iommu/arm-smmu-v3: Pass arm_smmu_domain and arm_smmu_device to finalize ANBZ: #13617 commit d8cd200609cf6a404cda73794f0c8c4fd74c568c upstream. Instead of putting container_of() casts in the internals, use the proper type in this call chain. This makes it easier to check that the two global static domains are not leaking into call chains they should not. Passing the smmu avoids the only caller from having to set it and unset it in the error path. Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/15-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 35 +++++++++++---------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 348aad28f60d..56a1d3f650aa 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -88,6 +88,9 @@ static struct arm_smmu_option_prop arm_smmu_options[] = { { 0, NULL}, }; +static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, + struct arm_smmu_device *smmu); + static void parse_driver_options(struct arm_smmu_device *smmu) { int i = 0; @@ -2218,12 +2221,12 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) kfree(smmu_domain); } -static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain, +static int arm_smmu_domain_finalise_s1(struct arm_smmu_device *smmu, + struct arm_smmu_domain *smmu_domain, struct io_pgtable_cfg *pgtbl_cfg) { int ret; u32 asid; - struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_ctx_desc *cd = &smmu_domain->cd; typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr = &pgtbl_cfg->arm_lpae_s1_cfg.tcr; @@ -2255,11 +2258,11 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain, return ret; } -static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, +static int arm_smmu_domain_finalise_s2(struct arm_smmu_device *smmu, + struct arm_smmu_domain *smmu_domain, struct io_pgtable_cfg *pgtbl_cfg) { int vmid; - struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg; /* Reserve VMID 0 for stage-2 bypass STEs */ @@ -2272,17 +2275,17 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, return 0; } -static int arm_smmu_domain_finalise(struct iommu_domain *domain) +static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, + struct arm_smmu_device *smmu) { int ret; unsigned long ias, oas; enum io_pgtable_fmt fmt; struct io_pgtable_cfg pgtbl_cfg; struct io_pgtable_ops *pgtbl_ops; - int (*finalise_stage_fn)(struct arm_smmu_domain *, - struct io_pgtable_cfg *); - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct arm_smmu_device *smmu = smmu_domain->smmu; + int (*finalise_stage_fn)(struct arm_smmu_device *smmu, + struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg); /* Restrict the stage to what we can actually support */ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1)) @@ -2321,17 +2324,18 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) if (!pgtbl_ops) return -ENOMEM; - domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; - domain->geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1; - domain->geometry.force_aperture = true; + smmu_domain->domain.pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; + smmu_domain->domain.geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1; + smmu_domain->domain.geometry.force_aperture = true; - ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg); + ret = finalise_stage_fn(smmu, smmu_domain, &pgtbl_cfg); if (ret < 0) { free_io_pgtable_ops(pgtbl_ops); return ret; } smmu_domain->pgtbl_ops = pgtbl_ops; + smmu_domain->smmu = smmu; return 0; } @@ -2523,10 +2527,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) mutex_lock(&smmu_domain->init_mutex); if (!smmu_domain->smmu) { - smmu_domain->smmu = smmu; - ret = arm_smmu_domain_finalise(domain); - if (ret) - smmu_domain->smmu = NULL; + ret = arm_smmu_domain_finalise(smmu_domain, smmu); } else if (smmu_domain->smmu != smmu) ret = -EINVAL; -- Gitee From c00b840a422abfeda00e1187ed082ea0257ca0d4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:27 -0400 Subject: [PATCH 151/219] iommu/arm-smmu-v3: Convert to domain_alloc_paging() ANBZ: #13617 commit 327e10b47ae99f76ac53f0b8b73a0539f390d2d2 upstream. Now that the BLOCKED and IDENTITY behaviors are managed with their own domains change to the domain_alloc_paging() op. For now SVA remains using the old interface, eventually it will get its own op that can pass in the device and mm_struct which will let us have a sane lifetime for the mmu_notifier. Call arm_smmu_domain_finalise() early if dev is available. Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/16-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 22 ++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 56a1d3f650aa..b2c19f34ab26 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2173,14 +2173,15 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) { - struct arm_smmu_domain *smmu_domain; if (type == IOMMU_DOMAIN_SVA) return arm_smmu_sva_domain_alloc(); + return ERR_PTR(-EOPNOTSUPP); +} - if (type != IOMMU_DOMAIN_UNMANAGED && - type != IOMMU_DOMAIN_DMA) - return NULL; +static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) +{ + struct arm_smmu_domain *smmu_domain; /* * Allocate the domain and initialise some of its data structures. @@ -2189,13 +2190,23 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) */ smmu_domain = kzalloc(sizeof(*smmu_domain), GFP_KERNEL); if (!smmu_domain) - return NULL; + return ERR_PTR(-ENOMEM); mutex_init(&smmu_domain->init_mutex); INIT_LIST_HEAD(&smmu_domain->devices); spin_lock_init(&smmu_domain->devices_lock); INIT_LIST_HEAD(&smmu_domain->mmu_notifiers); + if (dev) { + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + int ret; + + ret = arm_smmu_domain_finalise(smmu_domain, master->smmu); + if (ret) { + kfree(smmu_domain); + return ERR_PTR(ret); + } + } return &smmu_domain->domain; } @@ -3054,6 +3065,7 @@ static struct iommu_ops arm_smmu_ops = { .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, + .domain_alloc_paging = arm_smmu_domain_alloc_paging, .probe_device = arm_smmu_probe_device, .release_device = arm_smmu_release_device, .device_group = arm_smmu_device_group, -- Gitee From 2717ea236915d3db0dcf373569502dc984693a4e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 16 Feb 2024 15:40:24 +0100 Subject: [PATCH 152/219] iommu: constify pointer to bus_type ANBZ: #13617 commit e70e9ecd7cb349d00736c594b8e9bada0a762238 upstream. Make pointer to bus_type a pointer to const for code safety. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20240216144027.185959-1-krzysztof.kozlowski@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu-priv.h | 5 +++-- drivers/iommu/iommu.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index 2024a2313348..5f731d994803 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -21,10 +21,11 @@ int iommu_group_replace_domain(struct iommu_group *group, struct iommu_domain *new_domain); int iommu_device_register_bus(struct iommu_device *iommu, - const struct iommu_ops *ops, struct bus_type *bus, + const struct iommu_ops *ops, + const struct bus_type *bus, struct notifier_block *nb); void iommu_device_unregister_bus(struct iommu_device *iommu, - struct bus_type *bus, + const struct bus_type *bus, struct notifier_block *nb); #endif /* __LINUX_IOMMU_PRIV_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 97eb5742853a..1665e36f1633 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(iommu_device_unregister); #if IS_ENABLED(CONFIG_IOMMUFD_TEST) void iommu_device_unregister_bus(struct iommu_device *iommu, - struct bus_type *bus, + const struct bus_type *bus, struct notifier_block *nb) { bus_unregister_notifier(bus, nb); @@ -303,7 +303,8 @@ EXPORT_SYMBOL_GPL(iommu_device_unregister_bus); * some memory to hold a notifier_block. */ int iommu_device_register_bus(struct iommu_device *iommu, - const struct iommu_ops *ops, struct bus_type *bus, + const struct iommu_ops *ops, + const struct bus_type *bus, struct notifier_block *nb) { int err; -- Gitee From 4ceb34db3847357de4ceac4f66a93f89e7461d1a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 16 Feb 2024 15:40:26 +0100 Subject: [PATCH 153/219] iommu: constify fwnode in iommu_ops_from_fwnode() ANBZ: #13617 commit 5896e6e39b86c1d820b3ccf5caea9aef40c2eacd upstream. Make pointer to fwnode_handle a pointer to const for code safety. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240216144027.185959-3-krzysztof.kozlowski@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu.c | 2 +- include/linux/iommu.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 1665e36f1633..16897f15b2e8 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2806,7 +2806,7 @@ bool iommu_default_passthrough(void) } EXPORT_SYMBOL_GPL(iommu_default_passthrough); -const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) +const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode) { const struct iommu_ops *ops = NULL; struct iommu_device *iommu; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 02ea0db236e6..308bcf09e6a5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1037,7 +1037,7 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, const struct iommu_ops *ops); void iommu_fwspec_free(struct device *dev); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids); -const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode); +const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { @@ -1360,7 +1360,7 @@ static inline int iommu_fwspec_add_ids(struct device *dev, u32 *ids, } static inline -const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) +const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode) { return NULL; } -- Gitee From 8052781393cfecde90d8e9b2cb9b07b61f033b41 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 16 Feb 2024 15:40:27 +0100 Subject: [PATCH 154/219] iommu: re-use local fwnode variable in iommu_ops_from_fwnode() ANBZ: #13617 commit b5a1f7513a2fcb1b9e646128ff0cb0dbba5ed6c1 upstream. iommu_ops_from_fwnode() stores &iommu_spec->np->fwnode in local variable, so use it to simplify the code (iommu_spec is not changed between these dereferences). Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240216144027.185959-4-krzysztof.kozlowski@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/of_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 719652b60840..3afe0b48a48d 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -29,7 +29,7 @@ static int of_iommu_xlate(struct device *dev, !of_device_is_available(iommu_spec->np)) return -ENODEV; - ret = iommu_fwspec_init(dev, &iommu_spec->np->fwnode, ops); + ret = iommu_fwspec_init(dev, fwnode, ops); if (ret) return ret; /* -- Gitee From 3b2addf090656a9f09700c146a1f7dcedd690992 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 27 Feb 2024 10:14:34 +0800 Subject: [PATCH 155/219] iommu/vt-d: Remove INTEL_IOMMU_BROKEN_GFX_WA ANBZ: #13617 commit 4b8d18c0c986877fe89ade2214e1e81ad817cef1 upstream. Commit 62edf5dc4a524 ("intel-iommu: Restore DMAR_BROKEN_GFX_WA option for broken graphics drivers") was introduced 24 years ago as a temporary workaround for graphics drivers that used physical addresses for DMA and avoided DMA APIs. This workaround was disabled by default. As 24 years have passed, it is expected that graphics driver developers have migrated their drivers to use kernel DMA APIs. Therefore, this workaround is no longer required and could been removed. The Intel iommu driver also provides a "igfx_off" option to turn off the DMA translation for the graphic dedicated IOMMU. Hence, there is really no good reason to keep this config option. Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240130060823.57990-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/Kconfig | 11 ----------- drivers/iommu/intel/iommu.c | 4 ---- 2 files changed, 15 deletions(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index f6f46b1b3dad..0a6b9eda12a3 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -65,17 +65,6 @@ config INTEL_IOMMU_DEFAULT_ON one is found. If this option is not selected, DMAR support can be enabled by passing intel_iommu=on to the kernel. -config INTEL_IOMMU_BROKEN_GFX_WA - bool "Workaround broken graphics drivers (going away soon)" - depends on BROKEN && X86 - help - Current Graphics drivers tend to use physical address - for DMA and avoid using DMA APIs. Setting this config - option permits the IOMMU driver to set a unity map for - all the OS-visible memory. Hence the driver can continue - to use physical addresses for DMA, at least until this - option is removed in the 2.6.32 kernel. - config INTEL_IOMMU_FLOPPY_WA def_bool y depends on X86 diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4bb2fdc8d26d..041d2f6b11d7 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2724,10 +2724,6 @@ static int __init init_dmars(void) iommu_set_root_entry(iommu); } -#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA - dmar_map_gfx = 0; -#endif - if (!dmar_map_gfx) iommu_identity_mapping |= IDENTMAP_GFX; -- Gitee From c1372103d6ca3c164cbf4b1d52f25bf4d0d849fd Mon Sep 17 00:00:00 2001 From: Erick Archer Date: Tue, 27 Feb 2024 10:14:35 +0800 Subject: [PATCH 156/219] iommu/vt-d: Use kcalloc() instead of kzalloc() ANBZ: #13617 commit 8379054869a0e8d0ebf8f2160ccc778ed6268b96 upstream. This is an effort to get rid of all multiplications from allocation functions in order to prevent integer overflows [1]. Here the multiplication is obviously safe because DMAR_LATENCY_NUM is the number of latency types defined in the "latency_type" enum. enum latency_type { DMAR_LATENCY_INV_IOTLB = 0, DMAR_LATENCY_INV_DEVTLB, DMAR_LATENCY_INV_IEC, DMAR_LATENCY_PRQ, DMAR_LATENCY_NUM }; However, using kcalloc() is more appropriate [2] and improves readability. This patch has no effect on runtime behavior. Link: https://github.com/KSPP/linux/issues/162 [1] Link: https://www.kernel.org/doc/html/next/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments [2] Signed-off-by: Erick Archer Reviewed-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20240211175143.9229-1-erick.archer@gmx.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c index 94ee70ac38e3..adc4de6bbd88 100644 --- a/drivers/iommu/intel/perf.c +++ b/drivers/iommu/intel/perf.c @@ -33,7 +33,7 @@ int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type) spin_lock_irqsave(&latency_lock, flags); if (!iommu->perf_statistic) { - iommu->perf_statistic = kzalloc(sizeof(*lstat) * DMAR_LATENCY_NUM, + iommu->perf_statistic = kcalloc(DMAR_LATENCY_NUM, sizeof(*lstat), GFP_ATOMIC); if (!iommu->perf_statistic) { ret = -ENOMEM; -- Gitee From 9c2b15d59139cfdf8b2c07a964f737db40d22521 Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Tue, 27 Feb 2024 10:14:37 +0800 Subject: [PATCH 157/219] iommu/vt-d: Remove treatment for revoking PASIDs with pending page faults ANBZ: #13617 commit b4b1054f6cdd08b040fc5132936a5700c1d2d05b upstream. Commit 2f26e0a9c986 ("iommu/vt-d: Add basic SVM PASID support") added a special treatment to mandate that no page faults may be outstanding for the PASID after intel_svm_unbind_mm() is called, as the PASID will be released and reused after unbind. This is unnecessary anymore as no outstanding page faults have been ensured in the driver's remove_dev_pasid path: - Tear down the pasid entry, which guarantees that new page faults for the PASID will be rejected by the iommu hardware. - All outstanding page faults have been responded to. - All hardware pending faults are drained in intel_drain_pasid_prq(). Remove this unnecessary code. Signed-off-by: Tina Zhang Link: https://lore.kernel.org/r/20240219125723.1645703-2-tina.zhang@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/svm.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 8d7b8170f5f3..97780aa03a92 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -406,13 +406,6 @@ void intel_svm_remove_dev_pasid(struct device *dev, u32 pasid) if (svm->notifier.ops) mmu_notifier_unregister(&svm->notifier, mm); pasid_private_remove(svm->pasid); - /* - * We mandate that no page faults may be outstanding - * for the PASID when intel_svm_unbind_mm() is called. - * If that is not obeyed, subtle errors will happen. - * Let's make them less subtle... - */ - memset(svm, 0x6b, sizeof(*svm)); kfree(svm); } } -- Gitee From 29527a1f0c4225d7bcbaea59dfb318a1d5077836 Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Tue, 27 Feb 2024 10:14:38 +0800 Subject: [PATCH 158/219] iommu/vt-d: Remove initialization for dynamically heap-allocated rcu_head ANBZ: #13617 commit cb0b95e56269bb6ca996373a02ff0b6edfa09d32 upstream. The rcu_head structures allocated dynamically in the heap don't need any initialization. Therefore, remove the init_rcu_head(). Signed-off-by: Tina Zhang Link: https://lore.kernel.org/r/20240219125723.1645703-3-tina.zhang@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/svm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 97780aa03a92..96ee7d1dbdd2 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -358,7 +358,6 @@ static int intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, sdev->iommu = iommu; sdev->did = FLPT_DEFAULT_DID; sdev->sid = PCI_DEVID(info->bus, info->devfn); - init_rcu_head(&sdev->rcu); if (info->ats_enabled) { sdev->qdep = info->ats_qdep; if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS) -- Gitee From e1b96ff95fe824358947e98d4599909c53479433 Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Tue, 27 Feb 2024 10:14:39 +0800 Subject: [PATCH 159/219] iommu/vt-d: Merge intel_svm_bind_mm() into its caller ANBZ: #13617 commit 8ca918cbc2522e72faa5c102d02059e53d5128d0 upstream. intel_svm_set_dev_pasid() is the only caller of intel_svm_bind_mm(). Merge them and remove intel_svm_bind_mm(). No functional change intended. Signed-off-by: Tina Zhang Link: https://lore.kernel.org/r/20240219125723.1645703-4-tina.zhang@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/svm.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 96ee7d1dbdd2..dc75434c14ab 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -313,10 +313,11 @@ static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid, return 0; } -static int intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, - struct iommu_domain *domain, ioasid_t pasid) +static int intel_svm_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; struct mm_struct *mm = domain->mm; struct intel_svm_dev *sdev; struct intel_svm *svm; @@ -774,15 +775,6 @@ void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, } } -static int intel_svm_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; - - return intel_svm_bind_mm(iommu, dev, domain, pasid); -} - static void intel_svm_domain_free(struct iommu_domain *domain) { kfree(to_dmar_domain(domain)); -- Gitee From 95e056f037233b1630239db9730a6d05026ff680 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 27 Feb 2024 10:14:40 +0800 Subject: [PATCH 160/219] iommu/vt-d: Use rbtree to track iommu probed devices ANBZ: #13617 commit 1a75cc710b956010137b4fe1d1fa3282bfd8f86c upstream. Use a red-black tree(rbtree) to track devices probed by the driver's probe_device callback. These devices need to be looked up quickly by a source ID when the hardware reports a fault, either recoverable or unrecoverable. Fault reporting paths are critical. Searching a list in this scenario is inefficient, with an algorithm complexity of O(n). An rbtree is a self-balancing binary search tree, offering an average search time complexity of O(log(n)). This significant performance improvement makes rbtrees a better choice. Furthermore, rbtrees are implemented on a per-iommu basis, eliminating the need for global searches and further enhancing efficiency in critical fault paths. The rbtree is protected by a spin lock with interrupts disabled to ensure thread-safe access even within interrupt contexts. Co-developed-by: Huang Jiaqing Signed-off-by: Huang Jiaqing Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240220065939.121116-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/dmar.c | 3 +- drivers/iommu/intel/iommu.c | 88 ++++++++++++++++++++++++++++++++++++- drivers/iommu/intel/iommu.h | 8 ++++ 3 files changed, 96 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 2bb40c1d7843..6d8c29a26df2 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1153,7 +1153,8 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) iommu->agaw = agaw; iommu->msagaw = msagaw; iommu->segment = drhd->segment; - + iommu->device_rbtree = RB_ROOT; + spin_lock_init(&iommu->device_rbtree_lock); iommu->node = NUMA_NO_NODE; ver = readl(iommu->reg + DMAR_VER_REG); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 041d2f6b11d7..a7b3a9cfca90 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -97,6 +97,81 @@ static phys_addr_t root_entry_uctp(struct root_entry *re) return re->hi & VTD_PAGE_MASK; } +static int device_rid_cmp_key(const void *key, const struct rb_node *node) +{ + struct device_domain_info *info = + rb_entry(node, struct device_domain_info, node); + const u16 *rid_lhs = key; + + if (*rid_lhs < PCI_DEVID(info->bus, info->devfn)) + return -1; + + if (*rid_lhs > PCI_DEVID(info->bus, info->devfn)) + return 1; + + return 0; +} + +static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs) +{ + struct device_domain_info *info = + rb_entry(lhs, struct device_domain_info, node); + u16 key = PCI_DEVID(info->bus, info->devfn); + + return device_rid_cmp_key(&key, rhs); +} + +/* + * Looks up an IOMMU-probed device using its source ID. + * + * Returns the pointer to the device if there is a match. Otherwise, + * returns NULL. + * + * Note that this helper doesn't guarantee that the device won't be + * released by the iommu subsystem after being returned. The caller + * should use its own synchronization mechanism to avoid the device + * being released during its use if its possibly the case. + */ +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid) +{ + struct device_domain_info *info = NULL; + struct rb_node *node; + unsigned long flags; + + spin_lock_irqsave(&iommu->device_rbtree_lock, flags); + node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key); + if (node) + info = rb_entry(node, struct device_domain_info, node); + spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); + + return info ? info->dev : NULL; +} + +static int device_rbtree_insert(struct intel_iommu *iommu, + struct device_domain_info *info) +{ + struct rb_node *curr; + unsigned long flags; + + spin_lock_irqsave(&iommu->device_rbtree_lock, flags); + curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp); + spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); + if (WARN_ON(curr)) + return -EEXIST; + + return 0; +} + +static void device_rbtree_remove(struct device_domain_info *info) +{ + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + + spin_lock_irqsave(&iommu->device_rbtree_lock, flags); + rb_erase(&info->node, &iommu->device_rbtree); + spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); +} + /* * This domain is a statically identity mapping domain. * 1. This domain creats a static 1:1 mapping to all usable memory. @@ -4399,25 +4474,34 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) } dev_iommu_priv_set(dev, info); + ret = device_rbtree_insert(iommu, info); + if (ret) + goto free; if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { ret = intel_pasid_alloc_table(dev); if (ret) { dev_err(dev, "PASID table allocation failed\n"); - kfree(info); - return ERR_PTR(ret); + goto clear_rbtree; } } intel_iommu_debugfs_create_dev(info); return &iommu->iommu; +clear_rbtree: + device_rbtree_remove(info); +free: + kfree(info); + + return ERR_PTR(ret); } static void intel_iommu_release_device(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); + device_rbtree_remove(info); dmar_remove_one_dev_info(dev); intel_pasid_free_table(dev); intel_iommu_debugfs_remove_dev(info); diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index f5e0726258d9..1231c80bc795 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -722,6 +722,11 @@ struct intel_iommu { struct q_inval *qi; /* Queued invalidation info */ u32 iommu_state[MAX_SR_DMAR_REGS]; /* Store iommu states between suspend and resume.*/ + /* rb tree for all probed devices */ + struct rb_root device_rbtree; + /* protect the device_rbtree */ + spinlock_t device_rbtree_lock; + #ifdef CONFIG_IRQ_REMAP struct ir_table *ir_table; /* Interrupt remapping info */ struct irq_domain *ir_domain; @@ -755,6 +760,8 @@ struct device_domain_info { struct intel_iommu *iommu; /* IOMMU used by this device */ struct dmar_domain *domain; /* pointer to domain */ struct pasid_table *pasid_table; /* pasid table */ + /* device tracking node(lookup by PCI RID) */ + struct rb_node node; #ifdef CONFIG_INTEL_IOMMU_DEBUGFS struct dentry *debugfs_dentry; /* pointer to device directory dentry */ #endif @@ -1079,6 +1086,7 @@ int dmar_ir_support(void); void iommu_flush_write_buffer(struct intel_iommu *iommu); struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, const struct iommu_user_data *user_data); +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid); #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); -- Gitee From 9833d6c66c102e5610945997f58d14e561e16362 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 27 Feb 2024 10:14:41 +0800 Subject: [PATCH 161/219] iommu/vt-d: Use device rbtree in iopf reporting path ANBZ: #13617 commit def054b01a867822254e1dda13d587f5c7a99e2a upstream. The existing I/O page fault handler currently locates the PCI device by calling pci_get_domain_bus_and_slot(). This function searches the list of all PCI devices until the desired device is found. To improve lookup efficiency, replace it with device_rbtree_find() to search the device within the probed device rbtree. The I/O page fault is initiated by the device, which does not have any synchronization mechanism with the software to ensure that the device stays in the probed device tree. Theoretically, a device could be released by the IOMMU subsystem after device_rbtree_find() and before iopf_get_dev_fault_param(), which would cause a use-after-free problem. Add a mutex to synchronize the I/O page fault reporting path and the IOMMU release device path. This lock doesn't introduce any performance overhead, as the conflict between I/O page fault reporting and device releasing is very rare. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240220065939.121116-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel [ Shuai Xue: Conflicts: drivers/iommu/intel/svm.c commit 3dfa64aecbaf make intel_svm_prq_report return void, but do not know why conflict here ] Signed-off-by: Shuai Xue --- drivers/iommu/intel/dmar.c | 1 + drivers/iommu/intel/iommu.c | 3 +++ drivers/iommu/intel/iommu.h | 2 ++ drivers/iommu/intel/svm.c | 17 +++++++++-------- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 6d8c29a26df2..0a5e46849634 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1155,6 +1155,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) iommu->segment = drhd->segment; iommu->device_rbtree = RB_ROOT; spin_lock_init(&iommu->device_rbtree_lock); + mutex_init(&iommu->iopf_lock); iommu->node = NUMA_NO_NODE; ver = readl(iommu->reg + DMAR_VER_REG); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a7b3a9cfca90..8e99c5d374af 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4500,8 +4500,11 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) static void intel_iommu_release_device(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + mutex_lock(&iommu->iopf_lock); device_rbtree_remove(info); + mutex_unlock(&iommu->iopf_lock); dmar_remove_one_dev_info(dev); intel_pasid_free_table(dev); intel_iommu_debugfs_remove_dev(info); diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 1231c80bc795..c769f57c44ae 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -719,6 +719,8 @@ struct intel_iommu { #endif struct iopf_queue *iopf_queue; unsigned char iopfq_name[16]; + /* Synchronization between fault report and iommu device release. */ + struct mutex iopf_lock; struct q_inval *qi; /* Queued invalidation info */ u32 iommu_state[MAX_SR_DMAR_REGS]; /* Store iommu states between suspend and resume.*/ diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index dc75434c14ab..71c1b2a0ca16 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -637,7 +637,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) struct intel_iommu *iommu = d; struct page_req_dsc *req; int head, tail, handled; - struct pci_dev *pdev; + struct device *dev; u64 address; /* @@ -683,21 +683,22 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (unlikely(req->lpig && !req->rd_req && !req->wr_req)) goto prq_advance; - pdev = pci_get_domain_bus_and_slot(iommu->segment, - PCI_BUS_NUM(req->rid), - req->rid & 0xff); /* * If prq is to be handled outside iommu driver via receiver of * the fault notifiers, we skip the page response here. */ - if (!pdev) + mutex_lock(&iommu->iopf_lock); + dev = device_rbtree_find(iommu, req->rid); + if (!dev) { + mutex_unlock(&iommu->iopf_lock); goto bad_req; + } - intel_svm_prq_report(iommu, &pdev->dev, req); - trace_prq_report(iommu, &pdev->dev, req->qw_0, req->qw_1, + intel_svm_prq_report(iommu, dev, req); + trace_prq_report(iommu, dev, req->qw_0, req->qw_1, req->priv_data[0], req->priv_data[1], iommu->prq_seq_number++); - pci_dev_put(pdev); + mutex_unlock(&iommu->iopf_lock); prq_advance: head = (head + sizeof(*req)) & PRQ_RING_MASK; } -- Gitee From a370fb23acaf5e6b095e64e74ae166245c70d21b Mon Sep 17 00:00:00 2001 From: Zhangfei Gao Date: Tue, 27 Feb 2024 06:48:21 +0000 Subject: [PATCH 162/219] iommu/sva: Fix SVA handle sharing in multi device case ANBZ: #13617 commit 6384c56c99d98c84afea5b9ec7029a6e153ae431 upstream. iommu_sva_bind_device will directly goto out in multi-device case when found existing domain, ignoring list_add handle, which causes the handle to fail to be shared. Fixes: 65d4418c5002 ("iommu/sva: Restore SVA handle sharing") Signed-off-by: Zhangfei Gao Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240227064821.128-1-zhangfei.gao@linaro.org Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu-sva.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 6bebaedb786a..640acc804e8c 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -117,11 +117,11 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (ret) goto out_free_domain; domain->users = 1; - refcount_set(&handle->users, 1); list_add(&domain->next, &mm->iommu_mm->sva_domains); - list_add(&handle->handle_item, &mm->iommu_mm->sva_handles); out: + refcount_set(&handle->users, 1); + list_add(&handle->handle_item, &mm->iommu_mm->sva_handles); mutex_unlock(&iommu_sva_lock); handle->dev = dev; handle->domain = domain; -- Gitee From 66faedc64381ffe96ba2267fea388a93d33a7dee Mon Sep 17 00:00:00 2001 From: Ethan Zhao Date: Tue, 5 Mar 2024 20:21:16 +0800 Subject: [PATCH 163/219] iommu/vt-d: Improve ITE fault handling if target device isn't present ANBZ: #13617 commit 80a9b50c0b9e297669a8a400eb35468cd87a9aed upstream. Because surprise removal could happen anytime, e.g. user could request safe removal to EP(endpoint device) via sysfs and brings its link down to do surprise removal cocurrently. such aggressive cases would cause ATS invalidation request issued to non-existence target device, then deadly loop to retry that request after ITE fault triggered in interrupt context. this patch aims to optimize the ITE handling by checking the target device presence state to avoid retrying the timeout request blindly, thus avoid hard lockup or system hang. Devices TLB should only be invalidated when devices are in the iommu->device_rbtree (probed, not released) and present. Fixes: 6ba6c3a4cacf ("VT-d: add device IOTLB invalidation support") Reviewed-by: Dan Carpenter Signed-off-by: Ethan Zhao Link: https://lore.kernel.org/r/20240301080727.3529832-4-haifeng.zhao@linux.intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/dmar.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 0a5e46849634..a2960a1c60a6 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1329,6 +1329,8 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) { u32 fault; int head, tail; + struct device *dev; + u64 iqe_err, ite_sid; struct q_inval *qi = iommu->qi; int shift = qi_shift(iommu); @@ -1373,6 +1375,13 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) tail = readl(iommu->reg + DMAR_IQT_REG); tail = ((tail >> shift) - 1 + QI_LENGTH) % QI_LENGTH; + /* + * SID field is valid only when the ITE field is Set in FSTS_REG + * see Intel VT-d spec r4.1, section 11.4.9.9 + */ + iqe_err = dmar_readq(iommu->reg + DMAR_IQER_REG); + ite_sid = DMAR_IQER_REG_ITESID(iqe_err); + writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG); pr_info("Invalidation Time-out Error (ITE) cleared\n"); @@ -1382,6 +1391,19 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) head = (head - 2 + QI_LENGTH) % QI_LENGTH; } while (head != tail); + /* + * If device was released or isn't present, no need to retry + * the ATS invalidate request anymore. + * + * 0 value of ite_sid means old VT-d device, no ite_sid value. + * see Intel VT-d spec r4.1, section 11.4.9.9 + */ + if (ite_sid) { + dev = device_rbtree_find(iommu, ite_sid); + if (!dev || !dev_is_pci(dev) || + !pci_device_is_present(to_pci_dev(dev))) + return -ETIMEDOUT; + } if (qi->desc_status[wait_index] == QI_ABORT) return -EAGAIN; } -- Gitee From 70d615b1f5fe606c9e8d0e8f3644eb25e83be5ff Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 5 Mar 2024 20:21:17 +0800 Subject: [PATCH 164/219] iommu: Add static iommu_ops->release_domain ANBZ: #13617 commit 0061ffe289e19caabeea8103e69cb0f1896e34d8 upstream. The current device_release callback for individual iommu drivers does the following: 1) Silent IOMMU DMA translation: It detaches any existing domain from the device and puts it into a blocking state (some drivers might use the identity state). 2) Resource release: It releases resources allocated during the device_probe callback and restores the device to its pre-probe state. Step 1 is challenging for individual iommu drivers because each must check if a domain is already attached to the device. Additionally, if a deferred attach never occurred, the device_release should avoid modifying hardware configuration regardless of the reason for its call. To simplify this process, introduce a static release_domain within the iommu_ops structure. It can be either a blocking or identity domain depending on the iommu hardware. The iommu core will decide whether to attach this domain before the device_release callback, eliminating the need for repetitive code in various drivers. Consequently, the device_release callback can focus solely on the opposite operations of device_probe, including releasing all resources allocated during that callback. Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240305013305.204605-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu.c | 19 +++++++++++++++---- include/linux/iommu.h | 1 + 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 16897f15b2e8..d044850618bf 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -462,13 +462,24 @@ static void iommu_deinit_device(struct device *dev) /* * release_device() must stop using any attached domain on the device. - * If there are still other devices in the group they are not effected + * If there are still other devices in the group, they are not affected * by this callback. * - * The IOMMU driver must set the device to either an identity or - * blocking translation and stop using any domain pointer, as it is - * going to be freed. + * If the iommu driver provides release_domain, the core code ensures + * that domain is attached prior to calling release_device. Drivers can + * use this to enforce a translation on the idle iommu. Typically, the + * global static blocked_domain is a good choice. + * + * Otherwise, the iommu driver must set the device to either an identity + * or a blocking translation in release_device() and stop using any + * domain pointer, as it is going to be freed. + * + * Regardless, if a delayed attach never occurred, then the release + * should still avoid touching any hardware configuration either. */ + if (!dev->iommu->attach_deferred && ops->release_domain) + ops->release_domain->ops->attach_dev(ops->release_domain, dev); + if (ops->release_device) ops->release_device(dev); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 308bcf09e6a5..8abb55865861 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -593,6 +593,7 @@ struct iommu_ops { struct module *owner; struct iommu_domain *identity_domain; struct iommu_domain *blocked_domain; + struct iommu_domain *release_domain; struct iommu_domain *default_domain; CK_KABI_RESERVE(1) -- Gitee From 5ddce62b6ba7df9018f048fefed76e43d4b3c950 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 5 Mar 2024 20:21:18 +0800 Subject: [PATCH 165/219] iommu/vt-d: Fix NULL domain on device release ANBZ: #13617 commit 81e921fd321614c2ad8ac333b041aae1da7a1c6d upstream. In the kdump kernel, the IOMMU operates in deferred_attach mode. In this mode, info->domain may not yet be assigned by the time the release_device function is called. It leads to the following crash in the crash kernel: BUG: kernel NULL pointer dereference, address: 000000000000003c ... RIP: 0010:do_raw_spin_lock+0xa/0xa0 ... _raw_spin_lock_irqsave+0x1b/0x30 intel_iommu_release_device+0x96/0x170 iommu_deinit_device+0x39/0xf0 __iommu_group_remove_device+0xa0/0xd0 iommu_bus_notifier+0x55/0xb0 notifier_call_chain+0x5a/0xd0 blocking_notifier_call_chain+0x41/0x60 bus_notify+0x34/0x50 device_del+0x269/0x3d0 pci_remove_bus_device+0x77/0x100 p2sb_bar+0xae/0x1d0 ... i801_probe+0x423/0x740 Use the release_domain mechanism to fix it. The scalable mode context entry which is not part of release domain should be cleared in release_device(). Fixes: 586081d3f6b1 ("iommu/vt-d: Remove DEFER_DEVICE_DOMAIN_INFO") Reported-by: Eric Badger Closes: https://lore.kernel.org/r/20240113181713.1817855-1-ebadger@purestorage.com Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240305013305.204605-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 31 ++++-------------- drivers/iommu/intel/pasid.c | 64 +++++++++++++++++++++++++++++++++++++ drivers/iommu/intel/pasid.h | 1 + 3 files changed, 71 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8e99c5d374af..6bc1e2ad050a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3943,30 +3943,6 @@ static void domain_context_clear(struct device_domain_info *info) &domain_context_clear_one_cb, info); } -static void dmar_remove_one_dev_info(struct device *dev) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *domain = info->domain; - struct intel_iommu *iommu = info->iommu; - unsigned long flags; - - if (!dev_is_real_dma_subdevice(info->dev)) { - if (dev_is_pci(info->dev) && sm_supported(iommu)) - intel_pasid_tear_down_entry(iommu, info->dev, - IOMMU_NO_PASID, false); - - iommu_disable_pci_caps(info); - domain_context_clear(info); - } - - spin_lock_irqsave(&domain->lock, flags); - list_del(&info->link); - spin_unlock_irqrestore(&domain->lock, flags); - - domain_detach_iommu(domain, iommu); - info->domain = NULL; -} - /* * Clear the page table pointer in context or pasid table entries so that * all DMA requests without PASID from the device are blocked. If the page @@ -4505,7 +4481,11 @@ static void intel_iommu_release_device(struct device *dev) mutex_lock(&iommu->iopf_lock); device_rbtree_remove(info); mutex_unlock(&iommu->iopf_lock); - dmar_remove_one_dev_info(dev); + + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && + !context_copied(iommu, info->bus, info->devfn)) + intel_pasid_teardown_sm_context(dev); + intel_pasid_free_table(dev); intel_iommu_debugfs_remove_dev(info); kfree(info); @@ -4999,6 +4979,7 @@ static const struct iommu_dirty_ops intel_dirty_ops = { const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, + .release_domain = &blocking_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, .domain_alloc = intel_iommu_domain_alloc, diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 1fd5e4f7b9e4..6ef582bfaea5 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -670,3 +670,67 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, return 0; } + +/* + * Interfaces to setup or teardown a pasid table to the scalable-mode + * context table entry: + */ + +static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct context_entry *context; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, bus, devfn, false); + if (!context) { + spin_unlock(&iommu->lock); + return; + } + + context_clear_entry(context); + __iommu_flush_cache(iommu, context, sizeof(*context)); + spin_unlock(&iommu->lock); + + /* + * Cache invalidation for changes to a scalable-mode context table + * entry. + * + * Section 6.5.3.3 of the VT-d spec: + * - Device-selective context-cache invalidation; + * - Domain-selective PASID-cache invalidation to affected domains + * (can be skipped if all PASID entries were not-present); + * - Domain-selective IOTLB invalidation to affected domains; + * - Global Device-TLB invalidation to affected functions. + * + * The iommu has been parked in the blocking state. All domains have + * been detached from the device or PASID. The PASID and IOTLB caches + * have been invalidated during the domain detach path. + */ + iommu->flush.flush_context(iommu, 0, PCI_DEVID(bus, devfn), + DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); + devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID); +} + +static int pci_pasid_table_teardown(struct pci_dev *pdev, u16 alias, void *data) +{ + struct device *dev = data; + + if (dev == &pdev->dev) + device_pasid_table_teardown(dev, PCI_BUS_NUM(alias), alias & 0xff); + + return 0; +} + +void intel_pasid_teardown_sm_context(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + if (!dev_is_pci(dev)) { + device_pasid_table_teardown(dev, info->bus, info->devfn); + return; + } + + pci_for_each_dma_alias(to_pci_dev(dev), pci_pasid_table_teardown, dev); +} diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 487ede039bdd..42fda97fd851 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -318,4 +318,5 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, bool fault_ignore); void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, struct device *dev, u32 pasid); +void intel_pasid_teardown_sm_context(struct device *dev); #endif /* __INTEL_PASID_H */ -- Gitee From 7ecf5ea736cd42b70b78432565f0c03686e56466 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 5 Mar 2024 20:21:19 +0800 Subject: [PATCH 166/219] iommu/vt-d: Setup scalable mode context entry in probe path ANBZ: #13617 commit 301f1a80487fd2f51012533792583d4425e8b8c0 upstream. In contrast to legacy mode, the DMA translation table is configured in the PASID table entry instead of the context entry for scalable mode. For this reason, it is more appropriate to set up the scalable mode context entry in the device_probe callback and direct it to the appropriate PASID table. The iommu domain attach/detach operations only affect the PASID table entry. Therefore, there is no need to modify the context entry when configuring the translation type and page table. The only exception is the kdump case, where context entry setup is postponed until the device driver invokes the first DMA interface. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240305013305.204605-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 12 ++++ drivers/iommu/intel/pasid.c | 138 ++++++++++++++++++++++++++++++++++++ drivers/iommu/intel/pasid.h | 1 + 3 files changed, 151 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6bc1e2ad050a..a33ba0e56d6b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4147,6 +4147,10 @@ int prepare_domain_attach_device(struct iommu_domain *domain, dmar_domain->agaw--; } + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && + context_copied(iommu, info->bus, info->devfn)) + return intel_pasid_setup_sm_context(dev); + return 0; } @@ -4460,11 +4464,19 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) dev_err(dev, "PASID table allocation failed\n"); goto clear_rbtree; } + + if (!context_copied(iommu, info->bus, info->devfn)) { + ret = intel_pasid_setup_sm_context(dev); + if (ret) + goto free_table; + } } intel_iommu_debugfs_create_dev(info); return &iommu->iommu; +free_table: + intel_pasid_free_table(dev); clear_rbtree: device_rbtree_remove(info); free: diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 6ef582bfaea5..abce19e2ad6f 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -734,3 +734,141 @@ void intel_pasid_teardown_sm_context(struct device *dev) pci_for_each_dma_alias(to_pci_dev(dev), pci_pasid_table_teardown, dev); } + +/* + * Get the PASID directory size for scalable mode context entry. + * Value of X in the PDTS field of a scalable mode context entry + * indicates PASID directory with 2^(X + 7) entries. + */ +static unsigned long context_get_sm_pds(struct pasid_table *table) +{ + unsigned long pds, max_pde; + + max_pde = table->max_pasid >> PASID_PDE_SHIFT; + pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); + if (pds < 7) + return 0; + + return pds - 7; +} + +static int context_entry_set_pasid_table(struct context_entry *context, + struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct pasid_table *table = info->pasid_table; + struct intel_iommu *iommu = info->iommu; + unsigned long pds; + + context_clear_entry(context); + + pds = context_get_sm_pds(table); + context->lo = (u64)virt_to_phys(table->table) | context_pdts(pds); + context_set_sm_rid2pasid(context, IOMMU_NO_PASID); + + if (info->ats_supported) + context_set_sm_dte(context); + if (info->pri_supported) + context_set_sm_pre(context); + if (info->pasid_supported) + context_set_pasid(context); + + context_set_fault_enable(context); + context_set_present(context); + __iommu_flush_cache(iommu, context, sizeof(*context)); + + return 0; +} + +static int device_pasid_table_setup(struct device *dev, u8 bus, u8 devfn) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct context_entry *context; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, bus, devfn, true); + if (!context) { + spin_unlock(&iommu->lock); + return -ENOMEM; + } + + if (context_present(context) && !context_copied(iommu, bus, devfn)) { + spin_unlock(&iommu->lock); + return 0; + } + + if (context_copied(iommu, bus, devfn)) { + context_clear_entry(context); + __iommu_flush_cache(iommu, context, sizeof(*context)); + + /* + * For kdump cases, old valid entries may be cached due to + * the in-flight DMA and copied pgtable, but there is no + * unmapping behaviour for them, thus we need explicit cache + * flushes for all affected domain IDs and PASIDs used in + * the copied PASID table. Given that we have no idea about + * which domain IDs and PASIDs were used in the copied tables, + * upgrade them to global PASID and IOTLB cache invalidation. + */ + iommu->flush.flush_context(iommu, 0, + PCI_DEVID(bus, devfn), + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); + devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID); + + /* + * At this point, the device is supposed to finish reset at + * its driver probe stage, so no in-flight DMA will exist, + * and we don't need to worry anymore hereafter. + */ + clear_context_copied(iommu, bus, devfn); + } + + context_entry_set_pasid_table(context, dev); + spin_unlock(&iommu->lock); + + /* + * It's a non-present to present mapping. If hardware doesn't cache + * non-present entry we don't need to flush the caches. If it does + * cache non-present entries, then it does so in the special + * domain #0, which we have to flush: + */ + if (cap_caching_mode(iommu->cap)) { + iommu->flush.flush_context(iommu, 0, + PCI_DEVID(bus, devfn), + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH); + } + + return 0; +} + +static int pci_pasid_table_setup(struct pci_dev *pdev, u16 alias, void *data) +{ + struct device *dev = data; + + if (dev != &pdev->dev) + return 0; + + return device_pasid_table_setup(dev, PCI_BUS_NUM(alias), alias & 0xff); +} + +/* + * Set the device's PASID table to its context table entry. + * + * The PASID table is set to the context entries of both device itself + * and its alias requester ID for DMA. + */ +int intel_pasid_setup_sm_context(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + if (!dev_is_pci(dev)) + return device_pasid_table_setup(dev, info->bus, info->devfn); + + return pci_for_each_dma_alias(to_pci_dev(dev), pci_pasid_table_setup, dev); +} diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 42fda97fd851..da9978fef7ac 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -318,5 +318,6 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, bool fault_ignore); void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, struct device *dev, u32 pasid); +int intel_pasid_setup_sm_context(struct device *dev); void intel_pasid_teardown_sm_context(struct device *dev); #endif /* __INTEL_PASID_H */ -- Gitee From 15ea3b4c40c82a8625572bc35f0a46d80991c221 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 5 Mar 2024 20:21:20 +0800 Subject: [PATCH 167/219] iommu/vt-d: Remove scalable mode context entry setup from attach_dev ANBZ: #13617 commit a016e53843ed8bb9cccb832768e2be9856b0a913 upstream. The scalable mode context entry is now setup in the probe_device path, eliminating the need to configure it in the attach_dev path. Removes the redundant code from the attach_dev path to avoid dead code. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240305013305.204605-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 156 ++++++++++-------------------------- 1 file changed, 44 insertions(+), 112 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a33ba0e56d6b..8343986d643e 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1863,34 +1863,17 @@ static void domain_exit(struct dmar_domain *domain) kfree(domain); } -/* - * Get the PASID directory size for scalable mode context entry. - * Value of X in the PDTS field of a scalable mode context entry - * indicates PASID directory with 2^(X + 7) entries. - */ -static unsigned long context_get_sm_pds(struct pasid_table *table) -{ - unsigned long pds, max_pde; - - max_pde = table->max_pasid >> PASID_PDE_SHIFT; - pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); - if (pds < 7) - return 0; - - return pds - 7; -} - static int domain_context_mapping_one(struct dmar_domain *domain, struct intel_iommu *iommu, - struct pasid_table *table, u8 bus, u8 devfn) { struct device_domain_info *info = domain_lookup_dev_info(domain, iommu, bus, devfn); u16 did = domain_id_iommu(domain, iommu); int translation = CONTEXT_TT_MULTI_LEVEL; + struct dma_pte *pgd = domain->pgd; struct context_entry *context; - int ret; + int agaw, ret; if (hw_pass_through && domain_type_is_si(domain)) translation = CONTEXT_TT_PASS_THROUGH; @@ -1933,65 +1916,37 @@ static int domain_context_mapping_one(struct dmar_domain *domain, } context_clear_entry(context); + context_set_domain_id(context, did); - if (sm_supported(iommu)) { - unsigned long pds; - - /* Setup the PASID DIR pointer: */ - pds = context_get_sm_pds(table); - context->lo = (u64)virt_to_phys(table->table) | - context_pdts(pds); - - /* Setup the RID_PASID field: */ - context_set_sm_rid2pasid(context, IOMMU_NO_PASID); - + if (translation != CONTEXT_TT_PASS_THROUGH) { /* - * Setup the Device-TLB enable bit and Page request - * Enable bit: + * Skip top levels of page tables for iommu which has + * less agaw than default. Unnecessary for PT mode. */ - if (info && info->ats_supported) - context_set_sm_dte(context); - if (info && info->pri_supported) - context_set_sm_pre(context); - if (info && info->pasid_supported) - context_set_pasid(context); - } else { - struct dma_pte *pgd = domain->pgd; - int agaw; - - context_set_domain_id(context, did); - - if (translation != CONTEXT_TT_PASS_THROUGH) { - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. - */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - ret = -ENOMEM; - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - goto out_unlock; - } - - if (info && info->ats_supported) - translation = CONTEXT_TT_DEV_IOTLB; - else - translation = CONTEXT_TT_MULTI_LEVEL; - - context_set_address_root(context, virt_to_phys(pgd)); - context_set_address_width(context, agaw); - } else { - /* - * In pass through mode, AW must be programmed to - * indicate the largest AGAW value supported by - * hardware. And ASR is ignored by hardware. - */ - context_set_address_width(context, iommu->msagaw); + for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { + ret = -ENOMEM; + pgd = phys_to_virt(dma_pte_addr(pgd)); + if (!dma_pte_present(pgd)) + goto out_unlock; } - context_set_translation_type(context, translation); + if (info && info->ats_supported) + translation = CONTEXT_TT_DEV_IOTLB; + else + translation = CONTEXT_TT_MULTI_LEVEL; + + context_set_address_root(context, virt_to_phys(pgd)); + context_set_address_width(context, agaw); + } else { + /* + * In pass through mode, AW must be programmed to + * indicate the largest AGAW value supported by + * hardware. And ASR is ignored by hardware. + */ + context_set_address_width(context, iommu->msagaw); } + context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); if (!ecap_coherent(iommu->ecap)) @@ -2021,43 +1976,29 @@ static int domain_context_mapping_one(struct dmar_domain *domain, return ret; } -struct domain_context_mapping_data { - struct dmar_domain *domain; - struct intel_iommu *iommu; - struct pasid_table *table; -}; - static int domain_context_mapping_cb(struct pci_dev *pdev, u16 alias, void *opaque) { - struct domain_context_mapping_data *data = opaque; + struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev); + struct intel_iommu *iommu = info->iommu; + struct dmar_domain *domain = opaque; - return domain_context_mapping_one(data->domain, data->iommu, - data->table, PCI_BUS_NUM(alias), - alias & 0xff); + return domain_context_mapping_one(domain, iommu, + PCI_BUS_NUM(alias), alias & 0xff); } static int domain_context_mapping(struct dmar_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct domain_context_mapping_data data; struct intel_iommu *iommu = info->iommu; u8 bus = info->bus, devfn = info->devfn; - struct pasid_table *table; - - table = intel_pasid_get_table(dev); if (!dev_is_pci(dev)) - return domain_context_mapping_one(domain, iommu, table, - bus, devfn); - - data.domain = domain; - data.iommu = iommu; - data.table = table; + return domain_context_mapping_one(domain, iommu, bus, devfn); return pci_for_each_dma_alias(to_pci_dev(dev), - &domain_context_mapping_cb, &data); + domain_context_mapping_cb, domain); } /* Returns a number of VTD pages, but aligned to MM page size */ @@ -2417,28 +2358,19 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, list_add(&info->link, &domain->devices); spin_unlock_irqrestore(&domain->lock, flags); - /* PASID table is mandatory for a PCI device in scalable mode. */ - if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { - /* Setup the PASID entry for requests without PASID: */ - if (hw_pass_through && domain_type_is_si(domain)) - ret = intel_pasid_setup_pass_through(iommu, - dev, IOMMU_NO_PASID); - else if (domain->use_first_level) - ret = domain_setup_first_level(iommu, domain, dev, - IOMMU_NO_PASID); - else - ret = intel_pasid_setup_second_level(iommu, domain, - dev, IOMMU_NO_PASID); - if (ret) { - dev_err(dev, "Setup RID2PASID failed\n"); - device_block_translation(dev); - return ret; - } - } + if (dev_is_real_dma_subdevice(dev)) + return 0; + + if (!sm_supported(iommu)) + ret = domain_context_mapping(domain, dev); + else if (hw_pass_through && domain_type_is_si(domain)) + ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); + else if (domain->use_first_level) + ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID); + else + ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID); - ret = domain_context_mapping(domain, dev); if (ret) { - dev_err(dev, "Domain context map failed\n"); device_block_translation(dev); return ret; } -- Gitee From dc04b140940c5685abe72efe586db2fe70b3169d Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 5 Mar 2024 20:21:21 +0800 Subject: [PATCH 168/219] iommu/vt-d: Remove scalabe mode in domain_context_clear_one() ANBZ: #13617 commit 80ca79f398bff2708ee7a19d5904804415680aad upstream. domain_context_clear_one() only handles the context entry teardown in legacy mode. Remove the scalable mode check in it to avoid dead code. Remove an unnecessary check in the code as well. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240305013305.204605-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8343986d643e..49ffec0b8f92 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2188,9 +2188,6 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 struct context_entry *context; u16 did_old; - if (!iommu) - return; - spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, 0); if (!context) { @@ -2198,14 +2195,7 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 return; } - if (sm_supported(iommu)) { - if (hw_pass_through && domain_type_is_si(info->domain)) - did_old = FLPT_DEFAULT_DID; - else - did_old = domain_id_iommu(info->domain, iommu); - } else { - did_old = context_domain_id(context); - } + did_old = context_domain_id(context); context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); @@ -2216,9 +2206,6 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); - if (sm_supported(iommu)) - qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); - iommu->flush.flush_iotlb(iommu, did_old, 0, -- Gitee From d0dd7e78bfe354e999d5c49812900c80235a3c97 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 4 Mar 2024 12:05:42 +0000 Subject: [PATCH 169/219] iommu/dma: Document min_align_mask assumption ANBZ: #13617 commit e2addba4930558a6f37a2e4b2cb6f726638a6dce upstream. iommu-dma does not explicitly reference min_align_mask since we already assume that will be less than or equal to any typical IOVA granule. We wouldn't realistically expect to see the case where it is larger, and that would be non-trivial to support, however for the sake of reasoning (particularly around the interaction with SWIOTLB), let's clearly enforce the assumption. Signed-off-by: Robin Murphy Reviewed-by: Michael Kelley Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/dbb4d2d8e5d1691ac9a6c67e9758904e6c447ba5.1709553942.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/dma-iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index c2926bd4f8fa..3ea2e569d356 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -879,6 +879,11 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, iommu_deferred_attach(dev, domain)) return DMA_MAPPING_ERROR; + /* If anyone ever wants this we'd need support in the IOVA allocator */ + if (dev_WARN_ONCE(dev, dma_get_min_align_mask(dev) > iova_mask(iovad), + "Unsupported alignment constraint\n")) + return DMA_MAPPING_ERROR; + size = iova_align(iovad, size + iova_off); iova = iommu_dma_alloc_iova(domain, size, dma_mask, dev); -- Gitee From 00f48ac3496404f58ae9017bbdb5bcc8c20827b2 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 7 Mar 2024 05:27:38 +0000 Subject: [PATCH 170/219] iommu/amd: Fix sleeping in atomic context ANBZ: #13617 commit a0c8bf0a474eea8ee2590332a1ebdec6048ce40c upstream. Commit cf70873e3d01 ("iommu/amd: Refactor GCR3 table helper functions") changed GFP flag we use for GCR3 table. Original plan was to move GCR3 table allocation outside spinlock. But this requires complete rework of attach device path. Hence we didn't do it as part of SVA series. For now revert the GFP flag to ATOMIC (same as original code). Fixes: cf70873e3d01 ("iommu/amd: Refactor GCR3 table helper functions") Reported-by: Dan Carpenter Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20240307052738.116035-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 9ae25547f743..56c546780c63 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1802,7 +1802,7 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, /* Allocate per device domain ID */ gcr3_info->domid = domain_id_alloc(); - gcr3_info->gcr3_tbl = alloc_pgtable_page(nid, GFP_KERNEL); + gcr3_info->gcr3_tbl = alloc_pgtable_page(nid, GFP_ATOMIC); if (gcr3_info->gcr3_tbl == NULL) { domain_id_free(gcr3_info->domid); return -ENOMEM; -- Gitee From 4369a124e010f1bc70fd97c540fd0aaaa443837a Mon Sep 17 00:00:00 2001 From: gaorui Date: Mon, 30 Jun 2025 15:57:49 +0800 Subject: [PATCH 171/219] iommu/riscv: Device directory management. ANBZ: #13617 commit 1bac10c557adb29891f938c0b5ff93f37e9ba8b1 upstream. Introduce device context allocation and device directory tree management including capabilities discovery sequence, as described in Chapter 2.1 of the RISC-V IOMMU Architecture Specification. Device directory mode will be auto detected using DDTP WARL property, using highest mode supported by the driver and hardware. If none supported can be configured, driver will fall back to global pass-through. First level DDTP page can be located in I/O (detected using DDTP WARL) and system memory. Only simple identity and blocking protection domains are supported by this implementation. Co-developed-by: Nick Kossifidis Signed-off-by: Nick Kossifidis Reviewed-by: Lu Baolu Reviewed-by: Zong Li Signed-off-by: Tomasz Jeznach Acked-by: Palmer Dabbelt Link: https://lore.kernel.org/r/e1c763aeccd2c05fd4ad3a32f6f2ff3b3148d907.1729059707.git.tjeznach@rivosinc.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui Reviewed-by: Ruidong Tian Reviewed-by: Guixin Liu Link: https://gitee.com/anolis/cloud-kernel/pulls/5473 Signed-off-by: Shuai Xue --- drivers/iommu/riscv/iommu.c | 398 +++++++++++++++++++++++++++++++++++- drivers/iommu/riscv/iommu.h | 5 + 2 files changed, 392 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index c54088bf138f..d702c8f6ddf7 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -16,15 +16,168 @@ #include #include #include +#include #include #include +#include "../iommu-pages.h" #include "iommu-bits.h" #include "iommu.h" /* Timeouts in [us] */ #define RISCV_IOMMU_DDTP_TIMEOUT 50000 +/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */ +#define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10)) +#define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12)) + +#define dev_to_iommu(dev) \ + iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu) + +/* Device resource-managed allocations */ +struct riscv_iommu_devres { + void *addr; + int order; +}; + +static void riscv_iommu_devres_pages_release(struct device *dev, void *res) +{ + struct riscv_iommu_devres *devres = res; + + iommu_free_pages(devres->addr, devres->order); +} + +static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p) +{ + struct riscv_iommu_devres *devres = res; + struct riscv_iommu_devres *target = p; + + return devres->addr == target->addr; +} + +static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, int order) +{ + struct riscv_iommu_devres *devres; + void *addr; + + addr = iommu_alloc_pages_node(dev_to_node(iommu->dev), + GFP_KERNEL_ACCOUNT, order); + if (unlikely(!addr)) + return NULL; + + devres = devres_alloc(riscv_iommu_devres_pages_release, + sizeof(struct riscv_iommu_devres), GFP_KERNEL); + + if (unlikely(!devres)) { + iommu_free_pages(addr, order); + return NULL; + } + + devres->addr = addr; + devres->order = order; + + devres_add(iommu->dev, devres); + + return addr; +} + +static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr) +{ + struct riscv_iommu_devres devres = { .addr = addr }; + + devres_release(iommu->dev, riscv_iommu_devres_pages_release, + riscv_iommu_devres_pages_match, &devres); +} + +/* Lookup and initialize device context info structure. */ +static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu, + unsigned int devid) +{ + const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT); + unsigned int depth; + unsigned long ddt, old, new; + void *ptr; + u8 ddi_bits[3] = { 0 }; + u64 *ddtp = NULL; + + /* Make sure the mode is valid */ + if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL || + iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL) + return NULL; + + /* + * Device id partitioning for base format: + * DDI[0]: bits 0 - 6 (1st level) (7 bits) + * DDI[1]: bits 7 - 15 (2nd level) (9 bits) + * DDI[2]: bits 16 - 23 (3rd level) (8 bits) + * + * For extended format: + * DDI[0]: bits 0 - 5 (1st level) (6 bits) + * DDI[1]: bits 6 - 14 (2nd level) (9 bits) + * DDI[2]: bits 15 - 23 (3rd level) (9 bits) + */ + if (base_format) { + ddi_bits[0] = 7; + ddi_bits[1] = 7 + 9; + ddi_bits[2] = 7 + 9 + 8; + } else { + ddi_bits[0] = 6; + ddi_bits[1] = 6 + 9; + ddi_bits[2] = 6 + 9 + 9; + } + + /* Make sure device id is within range */ + depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL; + if (devid >= (1 << ddi_bits[depth])) + return NULL; + + /* Get to the level of the non-leaf node that holds the device context */ + for (ddtp = iommu->ddt_root; depth-- > 0;) { + const int split = ddi_bits[depth]; + /* + * Each non-leaf node is 64bits wide and on each level + * nodes are indexed by DDI[depth]. + */ + ddtp += (devid >> split) & 0x1FF; + + /* + * Check if this node has been populated and if not + * allocate a new level and populate it. + */ + do { + ddt = READ_ONCE(*(unsigned long *)ddtp); + if (ddt & RISCV_IOMMU_DDTE_V) { + ddtp = __va(ppn_to_phys(ddt)); + break; + } + + ptr = riscv_iommu_get_pages(iommu, 0); + if (!ptr) + return NULL; + + new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V; + old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new); + + if (old == ddt) { + ddtp = (u64 *)ptr; + break; + } + + /* Race setting DDT detected, re-read and retry. */ + riscv_iommu_free_pages(iommu, ptr); + } while (1); + } + + /* + * Grab the node that matches DDI[depth], note that when using base + * format the device context is 4 * 64bits, and the extended format + * is 8 * 64bits, hence the (3 - base_format) below. + */ + ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format); + + return (struct riscv_iommu_dc *)ddtp; +} + /* * This is best effort IOMMU translation shutdown flow. * Disable IOMMU without waiting for hardware response. @@ -37,10 +190,201 @@ static void riscv_iommu_disable(struct riscv_iommu_device *iommu) riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0); } +#define riscv_iommu_read_ddtp(iommu) ({ \ + u64 ddtp; \ + riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \ + !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \ + RISCV_IOMMU_DDTP_TIMEOUT); \ + ddtp; }) + +static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu) +{ + u64 ddtp; + unsigned int mode; + + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + /* + * It is optional for the hardware to report a fixed address for device + * directory root page when DDT.MODE is OFF or BARE. + */ + mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); + if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE || + mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) { + /* Use WARL to discover hardware fixed DDT PPN */ + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, + FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode)); + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + iommu->ddt_phys = ppn_to_phys(ddtp); + if (iommu->ddt_phys) + iommu->ddt_root = devm_ioremap(iommu->dev, + iommu->ddt_phys, PAGE_SIZE); + if (iommu->ddt_root) + memset(iommu->ddt_root, 0, PAGE_SIZE); + } + + if (!iommu->ddt_root) { + iommu->ddt_root = riscv_iommu_get_pages(iommu, 0); + iommu->ddt_phys = __pa(iommu->ddt_root); + } + + if (!iommu->ddt_root) + return -ENOMEM; + + return 0; +} + +/* + * Discover supported DDT modes starting from requested value, + * configure DDTP register with accepted mode and root DDT address. + * Accepted iommu->ddt_mode is updated on success. + */ +static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, + unsigned int ddtp_mode) +{ + struct device *dev = iommu->dev; + u64 ddtp, rq_ddtp; + unsigned int mode, rq_mode = ddtp_mode; + + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + /* Disallow state transition from xLVL to xLVL. */ + mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); + if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && + mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF && + rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && + rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) + return -EINVAL; + + do { + rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode); + if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) + rq_ddtp |= phys_to_ppn(iommu->ddt_phys); + + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp); + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) { + dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n", + rq_mode, ddtp); + return -EBUSY; + } + + /* Verify IOMMU hardware accepts new DDTP config. */ + mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); + + if (rq_mode == mode) + break; + + /* Hardware mandatory DDTP mode has not been accepted. */ + if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) { + dev_err(dev, "DDTP update failed hw: %llx vs %llx\n", + ddtp, rq_ddtp); + return -EINVAL; + } + + /* + * Mode field is WARL, an IOMMU may support a subset of + * directory table levels in which case if we tried to set + * an unsupported number of levels we'll readback either + * a valid xLVL or off/bare. If we got off/bare, try again + * with a smaller xLVL. + */ + if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && + rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) { + dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode); + rq_mode--; + continue; + } + + /* + * We tried all supported modes and IOMMU hardware failed to + * accept new settings, something went very wrong since off/bare + * and at least one xLVL must be supported. + */ + dev_err(dev, "DDTP hw mode %u, failed to set %u\n", + mode, ddtp_mode); + return -EINVAL; + } while (1); + + iommu->ddt_mode = mode; + if (mode != ddtp_mode) + dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode); + + return 0; +} + +#define RISCV_IOMMU_FSC_BARE 0 + +/* + * Update IODIR for the device. + * + * During the execution of riscv_iommu_probe_device(), IODIR entries are + * allocated for the device's identifiers. Device context invalidation + * becomes necessary only if one of the updated entries was previously + * marked as valid, given that invalid device context entries are not + * cached by the IOMMU hardware. + * In this implementation, updating a valid device context while the + * device is not quiesced might be disruptive, potentially causing + * interim translation faults. + */ +static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu, + struct device *dev, u64 fsc, u64 ta) +{ + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct riscv_iommu_dc *dc; + u64 tc; + int i; + + /* Device context invalidation ignored for now. */ + + /* + * For device context with DC_TC_PDTV = 0, translation attributes valid bit + * is stored as DC_TC_V bit (both sharing the same location at BIT(0)). + */ + for (i = 0; i < fwspec->num_ids; i++) { + dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); + tc = READ_ONCE(dc->tc); + tc |= ta & RISCV_IOMMU_DC_TC_V; + + WRITE_ONCE(dc->fsc, fsc); + WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID); + /* Update device context, write TC.V as the last step. */ + dma_wmb(); + WRITE_ONCE(dc->tc, tc); + } +} + +static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, + struct device *dev) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + + riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0); + + return 0; +} + +static struct iommu_domain riscv_iommu_blocking_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = riscv_iommu_attach_blocking_domain, + } +}; + static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain, struct device *dev) { - /* Global pass-through already enabled, do nothing for now. */ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + + riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V); + return 0; } @@ -72,6 +416,9 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct riscv_iommu_device *iommu; + struct riscv_iommu_dc *dc; + u64 tc; + int i; if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids) return ERR_PTR(-ENODEV); @@ -80,12 +427,37 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev) if (!iommu) return ERR_PTR(-ENODEV); + /* + * IOMMU hardware operating in fail-over BARE mode will provide + * identity translation for all connected devices anyway... + */ + if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) + return ERR_PTR(-ENODEV); + + /* + * Allocate and pre-configure device context entries in + * the device directory. Do not mark the context valid yet. + */ + tc = 0; + if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD) + tc |= RISCV_IOMMU_DC_TC_SADE; + for (i = 0; i < fwspec->num_ids; i++) { + dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); + if (!dc) + return ERR_PTR(-ENODEV); + if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V) + dev_warn(dev, "already attached to IOMMU device directory\n"); + WRITE_ONCE(dc->tc, tc); + } + return &iommu->iommu; } static const struct iommu_ops riscv_iommu_ops = { .of_xlate = riscv_iommu_of_xlate, .identity_domain = &riscv_iommu_identity_domain, + .blocked_domain = &riscv_iommu_blocking_domain, + .release_domain = &riscv_iommu_blocking_domain, .def_domain_type = riscv_iommu_device_domain_type, .device_group = riscv_iommu_device_group, .probe_device = riscv_iommu_probe_device, @@ -131,6 +503,7 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu) { iommu_device_unregister(&iommu->iommu); iommu_device_sysfs_remove(&iommu->iommu); + riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); } int riscv_iommu_init(struct riscv_iommu_device *iommu) @@ -141,19 +514,20 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu) if (rc) return dev_err_probe(iommu->dev, rc, "unexpected device state\n"); - /* - * Placeholder for a complete IOMMU device initialization. For now, - * only bare minimum: enable global identity mapping mode and register sysfs. - */ - riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, - FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, - RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)); + rc = riscv_iommu_iodir_alloc(iommu); + if (rc) + return rc; + + rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX); + if (rc) + return rc; rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s", dev_name(iommu->dev)); - if (rc) - return dev_err_probe(iommu->dev, rc, - "cannot register sysfs interface\n"); + if (rc) { + dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n"); + goto err_iodir_off; + } rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev); if (rc) { @@ -165,5 +539,7 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu) err_remove_sysfs: iommu_device_sysfs_remove(&iommu->iommu); +err_iodir_off: + riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); return rc; } diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h index 700e33dc2446..f1696926582c 100644 --- a/drivers/iommu/riscv/iommu.h +++ b/drivers/iommu/riscv/iommu.h @@ -34,6 +34,11 @@ struct riscv_iommu_device { /* available interrupt numbers, MSI or WSI */ unsigned int irqs[RISCV_IOMMU_INTR_COUNT]; unsigned int irqs_count; + + /* device directory */ + unsigned int ddt_mode; + dma_addr_t ddt_phys; + u64 *ddt_root; }; int riscv_iommu_init(struct riscv_iommu_device *iommu); -- Gitee From f880331dac416c40c0d9a80ddec170ae5aeed7bf Mon Sep 17 00:00:00 2001 From: gaorui Date: Mon, 30 Jun 2025 16:03:13 +0800 Subject: [PATCH 172/219] iommu/riscv: Command and fault queue support ANBZ: #13617 commit 856c0cfe5c5f6a2cc8d995872eb67bff9c68c57c upstream. Introduce device command submission and fault reporting queues, as described in Chapter 3.1 and 3.2 of the RISC-V IOMMU Architecture Specification. Command and fault queues are instantiated in contiguous system memory local to IOMMU device domain, or mapped from fixed I/O space provided by the hardware implementation. Detection of the location and maximum allowed size of the queue utilize WARL properties of queue base control register. Driver implementation will try to allocate up to 128KB of system memory, while respecting hardware supported maximum queue size. Interrupts allocation is based on interrupt vectors availability and distributed to all queues in simple round-robin fashion. For hardware Implementation with fixed event type to interrupt vector assignment IVEC WARL property is used to discover such mappings. Address translation, command and queue fault handling in this change is limited to simple fault reporting without taking any action. Reviewed-by: Lu Baolu Reviewed-by: Zong Li Signed-off-by: Tomasz Jeznach Acked-by: Palmer Dabbelt Link: https://lore.kernel.org/r/c4735fb6829053eff37ce1bcca4906192afd743c.1729059707.git.tjeznach@rivosinc.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui Reviewed-by: Ruidong Tian Reviewed-by: Guixin Liu Link: https://gitee.com/anolis/cloud-kernel/pulls/5473 Signed-off-by: Shuai Xue --- drivers/iommu/riscv/iommu-bits.h | 75 +++++ drivers/iommu/riscv/iommu.c | 519 ++++++++++++++++++++++++++++++- drivers/iommu/riscv/iommu.h | 21 ++ 3 files changed, 613 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h index 3a67d4d664b0..98daf0e1a306 100644 --- a/drivers/iommu/riscv/iommu-bits.h +++ b/drivers/iommu/riscv/iommu-bits.h @@ -706,4 +706,79 @@ struct riscv_iommu_msipte { #define RISCV_IOMMU_MSIPTE_MRIF_NPPN RISCV_IOMMU_PPN_FIELD #define RISCV_IOMMU_MSIPTE_MRIF_NID_MSB BIT_ULL(60) +/* Helper functions: command structure builders. */ + +static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOTINVAL_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA); + cmd->dword1 = 0; +} + +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd, + u64 addr) +{ + cmd->dword1 = FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR, phys_to_pfn(addr)); + cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV; +} + +static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd, + int pscid) +{ + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) | + RISCV_IOMMU_CMD_IOTINVAL_PSCV; +} + +static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd, + int gscid) +{ + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) | + RISCV_IOMMU_CMD_IOTINVAL_GV; +} + +static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) | + RISCV_IOMMU_CMD_IOFENCE_PR | RISCV_IOMMU_CMD_IOFENCE_PW; + cmd->dword1 = 0; +} + +static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd, + u64 addr, u32 data) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) | + FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) | + RISCV_IOMMU_CMD_IOFENCE_AV; + cmd->dword1 = addr >> 2; +} + +static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT); + cmd->dword1 = 0; +} + +static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT); + cmd->dword1 = 0; +} + +static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd, + unsigned int devid) +{ + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) | + RISCV_IOMMU_CMD_IODIR_DV; +} + +static inline void riscv_iommu_cmd_iodir_set_pid(struct riscv_iommu_command *cmd, + unsigned int pasid) +{ + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_PID, pasid); +} + #endif /* _RISCV_IOMMU_BITS_H_ */ diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index d702c8f6ddf7..0c6263101815 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -25,7 +25,14 @@ #include "iommu.h" /* Timeouts in [us] */ -#define RISCV_IOMMU_DDTP_TIMEOUT 50000 +#define RISCV_IOMMU_QCSR_TIMEOUT 150000 +#define RISCV_IOMMU_QUEUE_TIMEOUT 150000 +#define RISCV_IOMMU_DDTP_TIMEOUT 10000000 +#define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000 + +/* Number of entries per CMD/FLT queue, should be <= INT_MAX */ +#define RISCV_IOMMU_DEF_CQ_COUNT 8192 +#define RISCV_IOMMU_DEF_FQ_COUNT 4096 /* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */ #define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10)) @@ -89,6 +96,458 @@ static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr) riscv_iommu_devres_pages_match, &devres); } +/* + * Hardware queue allocation and management. + */ + +/* Setup queue base, control registers and default queue length */ +#define RISCV_IOMMU_QUEUE_INIT(q, name) do { \ + struct riscv_iommu_queue *_q = q; \ + _q->qid = RISCV_IOMMU_INTR_ ## name; \ + _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \ + _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \ + _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\ +} while (0) + +/* Note: offsets are the same for all queues */ +#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB)) +#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB)) +#define Q_ITEM(q, index) ((q)->mask & (index)) +#define Q_IPSR(q) BIT((q)->qid) + +/* + * Discover queue ring buffer hardware configuration, allocate in-memory + * ring buffer or use fixed I/O memory location, configure queue base register. + * Must be called before hardware queue is enabled. + * + * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT() + * @entry_size - queue single element size in bytes. + */ +static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu, + struct riscv_iommu_queue *queue, + size_t entry_size) +{ + unsigned int logsz; + u64 qb, rb; + + /* + * Use WARL base register property to discover maximum allowed + * number of entries and optional fixed IO address for queue location. + */ + riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD); + qb = riscv_iommu_readq(iommu, queue->qbr); + + /* + * Calculate and verify hardware supported queue length, as reported + * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1). + * Update queue size based on hardware supported value. + */ + logsz = ilog2(queue->mask); + if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb)) + logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb); + + /* + * Use WARL base register property to discover an optional fixed IO + * address for queue ring buffer location. Otherwise allocate contiguous + * system memory. + */ + if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) { + const size_t queue_size = entry_size << (logsz + 1); + + queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)); + queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size); + } else { + do { + const size_t queue_size = entry_size << (logsz + 1); + const int order = get_order(queue_size); + + queue->base = riscv_iommu_get_pages(iommu, order); + queue->phys = __pa(queue->base); + } while (!queue->base && logsz-- > 0); + } + + if (!queue->base) + return -ENOMEM; + + qb = phys_to_ppn(queue->phys) | + FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz); + + /* Update base register and read back to verify hw accepted our write */ + riscv_iommu_writeq(iommu, queue->qbr, qb); + rb = riscv_iommu_readq(iommu, queue->qbr); + if (rb != qb) { + dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid); + return -ENODEV; + } + + /* Update actual queue mask */ + queue->mask = (2U << logsz) - 1; + + dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries", + queue->qid, logsz + 1); + + return 0; +} + +/* Check interrupt queue status, IPSR */ +static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data) +{ + struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; + + if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue)) + return IRQ_WAKE_THREAD; + + return IRQ_NONE; +} + +static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n) +{ + /* Reuse ICVEC.CIV mask for all interrupt vectors mapping. */ + return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV; +} + +/* + * Enable queue processing in the hardware, register interrupt handler. + * + * @queue - data structure, already allocated with riscv_iommu_queue_alloc() + * @irq_handler - threaded interrupt handler. + */ +static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu, + struct riscv_iommu_queue *queue, + irq_handler_t irq_handler) +{ + const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)]; + u32 csr; + int rc; + + if (queue->iommu) + return -EBUSY; + + /* Polling not implemented */ + if (!irq) + return -ENODEV; + + queue->iommu = iommu; + rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler, + IRQF_ONESHOT | IRQF_SHARED, + dev_name(iommu->dev), queue); + if (rc) { + queue->iommu = NULL; + return rc; + } + + /* + * Enable queue with interrupts, clear any memory fault if any. + * Wait for the hardware to acknowledge request and activate queue + * processing. + * Note: All CSR bitfields are in the same offsets for all queues. + */ + riscv_iommu_writel(iommu, queue->qcr, + RISCV_IOMMU_QUEUE_ENABLE | + RISCV_IOMMU_QUEUE_INTR_ENABLE | + RISCV_IOMMU_QUEUE_MEM_FAULT); + + riscv_iommu_readl_timeout(iommu, queue->qcr, + csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), + 10, RISCV_IOMMU_QCSR_TIMEOUT); + + if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE | + RISCV_IOMMU_QUEUE_BUSY | + RISCV_IOMMU_QUEUE_MEM_FAULT))) { + /* Best effort to stop and disable failing hardware queue. */ + riscv_iommu_writel(iommu, queue->qcr, 0); + free_irq(irq, queue); + queue->iommu = NULL; + dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid); + return -EBUSY; + } + + /* Clear any pending interrupt flag. */ + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); + + return 0; +} + +/* + * Disable queue. Wait for the hardware to acknowledge request and + * stop processing enqueued requests. Report errors but continue. + */ +static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue) +{ + struct riscv_iommu_device *iommu = queue->iommu; + u32 csr; + + if (!iommu) + return; + + free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue); + riscv_iommu_writel(iommu, queue->qcr, 0); + riscv_iommu_readl_timeout(iommu, queue->qcr, + csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), + 10, RISCV_IOMMU_QCSR_TIMEOUT); + + if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY)) + dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n", + queue->qid, csr); + + queue->iommu = NULL; +} + +/* + * Returns number of available valid queue entries and the first item index. + * Update shadow producer index if necessary. + */ +static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue, + unsigned int *index) +{ + unsigned int head = atomic_read(&queue->head); + unsigned int tail = atomic_read(&queue->tail); + unsigned int last = Q_ITEM(queue, tail); + int available = (int)(tail - head); + + *index = head; + + if (available > 0) + return available; + + /* read hardware producer index, check reserved register bits are not set. */ + if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue), + tail, (tail & ~queue->mask) == 0, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) { + dev_err_once(queue->iommu->dev, + "Hardware error: queue access timeout\n"); + return 0; + } + + if (tail == last) + return 0; + + /* update shadow producer index */ + return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head); +} + +/* + * Release processed queue entries, should match riscv_iommu_queue_consume() calls. + */ +static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count) +{ + const unsigned int head = atomic_add_return(count, &queue->head); + + riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head)); +} + +/* Return actual consumer index based on hardware reported queue head index. */ +static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue) +{ + const unsigned int cons = atomic_read(&queue->head); + const unsigned int last = Q_ITEM(queue, cons); + unsigned int head; + + if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, + !(head & ~queue->mask), + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + return cons; + + return cons + ((head - last) & queue->mask); +} + +/* Wait for submitted item to be processed. */ +static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue, + unsigned int index, + unsigned int timeout_us) +{ + unsigned int cons = atomic_read(&queue->head); + + /* Already processed by the consumer */ + if ((int)(cons - index) > 0) + return 0; + + /* Monitor consumer index */ + return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons, + (int)(cons - index) > 0, 0, timeout_us); +} + +/* Enqueue an entry and wait to be processed if timeout_us > 0 + * + * Error handling for IOMMU hardware not responding in reasonable time + * will be added as separate patch series along with other RAS features. + * For now, only report hardware failure and continue. + */ +static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue, + void *entry, size_t entry_size) +{ + unsigned int prod; + unsigned int head; + unsigned int tail; + unsigned long flags; + + /* Do not preempt submission flow. */ + local_irq_save(flags); + + /* 1. Allocate some space in the queue */ + prod = atomic_inc_return(&queue->prod) - 1; + head = atomic_read(&queue->head); + + /* 2. Wait for space availability. */ + if ((prod - head) > queue->mask) { + if (readx_poll_timeout(atomic_read, &queue->head, + head, (prod - head) < queue->mask, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + goto err_busy; + } else if ((prod - head) == queue->mask) { + const unsigned int last = Q_ITEM(queue, head); + + if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, + !(head & ~queue->mask) && head != last, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + goto err_busy; + atomic_add((head - last) & queue->mask, &queue->head); + } + + /* 3. Store entry in the ring buffer */ + memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size); + + /* 4. Wait for all previous entries to be ready */ + if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + goto err_busy; + + /* + * 5. Make sure the ring buffer update (whether in normal or I/O memory) is + * completed and visible before signaling the tail doorbell to fetch + * the next command. 'fence ow, ow' + */ + dma_wmb(); + riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1)); + + /* + * 6. Make sure the doorbell write to the device has finished before updating + * the shadow tail index in normal memory. 'fence o, w' + */ + mmiowb(); + atomic_inc(&queue->tail); + + /* 7. Complete submission and restore local interrupts */ + local_irq_restore(flags); + + return prod; + +err_busy: + local_irq_restore(flags); + dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n"); + + return prod; +} + +/* + * IOMMU Command queue chapter 3.1 + */ + +/* Command queue interrupt handler thread function */ +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data) +{ + const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; + unsigned int ctrl; + + /* Clear MF/CQ errors, complete error recovery to be implemented. */ + ctrl = riscv_iommu_readl(queue->iommu, queue->qcr); + if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO | + RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) { + riscv_iommu_writel(queue->iommu, queue->qcr, ctrl); + dev_warn(queue->iommu->dev, + "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n", + queue->qid, + !!(ctrl & RISCV_IOMMU_CQCSR_CQMF), + !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO), + !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL), + !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP)); + } + + /* Placeholder for command queue interrupt notifiers */ + + /* Clear command interrupt pending. */ + riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); + + return IRQ_HANDLED; +} + +/* Send command to the IOMMU command queue */ +static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu, + struct riscv_iommu_command *cmd) +{ + riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd)); +} + +/* Send IOFENCE.C command and wait for all scheduled commands to complete. */ +static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu, + unsigned int timeout_us) +{ + struct riscv_iommu_command cmd; + unsigned int prod; + + riscv_iommu_cmd_iofence(&cmd); + prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd)); + + if (!timeout_us) + return; + + if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us)) + dev_err_once(iommu->dev, + "Hardware error: command execution timeout\n"); +} + +/* + * IOMMU Fault/Event queue chapter 3.2 + */ + +static void riscv_iommu_fault(struct riscv_iommu_device *iommu, + struct riscv_iommu_fq_record *event) +{ + unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr); + unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr); + + /* Placeholder for future fault handling implementation, report only. */ + if (err) + dev_warn_ratelimited(iommu->dev, + "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n", + err, devid, event->iotval, event->iotval2); +} + +/* Fault queue interrupt handler thread function */ +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data) +{ + struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; + struct riscv_iommu_device *iommu = queue->iommu; + struct riscv_iommu_fq_record *events; + unsigned int ctrl, idx; + int cnt, len; + + events = (struct riscv_iommu_fq_record *)queue->base; + + /* Clear fault interrupt pending and process all received fault events. */ + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); + + do { + cnt = riscv_iommu_queue_consume(queue, &idx); + for (len = 0; len < cnt; idx++, len++) + riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]); + riscv_iommu_queue_release(queue, cnt); + } while (cnt > 0); + + /* Clear MF/OF errors, complete error recovery to be implemented. */ + ctrl = riscv_iommu_readl(iommu, queue->qcr); + if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) { + riscv_iommu_writel(iommu, queue->qcr, ctrl); + dev_warn(iommu->dev, + "Queue #%u error; memory fault:%d overflow:%d\n", + queue->qid, + !!(ctrl & RISCV_IOMMU_FQCSR_FQMF), + !!(ctrl & RISCV_IOMMU_FQCSR_FQOF)); + } + + return IRQ_HANDLED; +} + /* Lookup and initialize device context info structure. */ static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu, unsigned int devid) @@ -250,6 +709,7 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, struct device *dev = iommu->dev; u64 ddtp, rq_ddtp; unsigned int mode, rq_mode = ddtp_mode; + struct riscv_iommu_command cmd; ddtp = riscv_iommu_read_ddtp(iommu); if (ddtp & RISCV_IOMMU_DDTP_BUSY) @@ -317,6 +777,17 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, if (mode != ddtp_mode) dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode); + /* Invalidate device context cache */ + riscv_iommu_cmd_iodir_inval_ddt(&cmd); + riscv_iommu_cmd_send(iommu, &cmd); + + /* Invalidate address translation cache */ + riscv_iommu_cmd_inval_vma(&cmd); + riscv_iommu_cmd_send(iommu, &cmd); + + /* IOFENCE.C */ + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + return 0; } @@ -496,6 +967,24 @@ static int riscv_iommu_init_check(struct riscv_iommu_device *iommu) return -EINVAL; } + /* + * Distribute interrupt vectors, always use first vector for CIV. + * At least one interrupt is required. Read back and verify. + */ + if (!iommu->irqs_count) + return -EINVAL; + + iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) | + FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) | + FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count); + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec); + iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC); + if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec), + FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)), + max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec), + FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count) + return -EINVAL; + return 0; } @@ -504,12 +993,17 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu) iommu_device_unregister(&iommu->iommu); iommu_device_sysfs_remove(&iommu->iommu); riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); + riscv_iommu_queue_disable(&iommu->cmdq); + riscv_iommu_queue_disable(&iommu->fltq); } int riscv_iommu_init(struct riscv_iommu_device *iommu) { int rc; + RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ); + RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ); + rc = riscv_iommu_init_check(iommu); if (rc) return dev_err_probe(iommu->dev, rc, "unexpected device state\n"); @@ -518,10 +1012,28 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu) if (rc) return rc; - rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX); + rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq, + sizeof(struct riscv_iommu_command)); if (rc) return rc; + rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq, + sizeof(struct riscv_iommu_fq_record)); + if (rc) + return rc; + + rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process); + if (rc) + return rc; + + rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process); + if (rc) + goto err_queue_disable; + + rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX); + if (rc) + goto err_queue_disable; + rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s", dev_name(iommu->dev)); if (rc) { @@ -541,5 +1053,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu) iommu_device_sysfs_remove(&iommu->iommu); err_iodir_off: riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); +err_queue_disable: + riscv_iommu_queue_disable(&iommu->fltq); + riscv_iommu_queue_disable(&iommu->cmdq); return rc; } diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h index f1696926582c..b1c4664542b4 100644 --- a/drivers/iommu/riscv/iommu.h +++ b/drivers/iommu/riscv/iommu.h @@ -17,6 +17,22 @@ #include "iommu-bits.h" +struct riscv_iommu_device; + +struct riscv_iommu_queue { + atomic_t prod; /* unbounded producer allocation index */ + atomic_t head; /* unbounded shadow ring buffer consumer index */ + atomic_t tail; /* unbounded shadow ring buffer producer index */ + unsigned int mask; /* index mask, queue length - 1 */ + unsigned int irq; /* allocated interrupt number */ + struct riscv_iommu_device *iommu; /* iommu device handling the queue when active */ + void *base; /* ring buffer kernel pointer */ + dma_addr_t phys; /* ring buffer physical address */ + u16 qbr; /* base register offset, head and tail reference */ + u16 qcr; /* control and status register offset */ + u8 qid; /* queue identifier, same as RISCV_IOMMU_INTR_XX */ +}; + struct riscv_iommu_device { /* iommu core interface */ struct iommu_device iommu; @@ -34,6 +50,11 @@ struct riscv_iommu_device { /* available interrupt numbers, MSI or WSI */ unsigned int irqs[RISCV_IOMMU_INTR_COUNT]; unsigned int irqs_count; + unsigned int icvec; + + /* hardware queues */ + struct riscv_iommu_queue cmdq; + struct riscv_iommu_queue fltq; /* device directory */ unsigned int ddt_mode; -- Gitee From 533b49ed1c410bd0ed31eb455aec64feaa5a5fe5 Mon Sep 17 00:00:00 2001 From: gaorui Date: Mon, 30 Jun 2025 16:06:37 +0800 Subject: [PATCH 173/219] iommu/riscv: Paging domain support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #13617 commit 488ffbf181718b9ad8c1838cb249d60973e78eda upstream. Introduce first-stage address translation support. Page table configured by the IOMMU driver will use the highest mode implemented by the hardware, unless not known at the domain allocation time falling back to the CPU’s MMU page mode. This change introduces IOTINVAL.VMA command, required to invalidate any cached IOATC entries after mapping is updated and/or removed from the paging domain. Invalidations for the non-leaf page entries use IOTINVAL for all addresses assigned to the protection domain for hardware not supporting more granular non-leaf page table cache invalidations. Reviewed-by: Lu Baolu Reviewed-by: Zong Li Signed-off-by: Tomasz Jeznach Acked-by: Palmer Dabbelt Link: https://lore.kernel.org/r/1109202d389f51c7121cb1460eb2f21429b9bd5d.1729059707.git.tjeznach@rivosinc.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui Reviewed-by: Ruidong Tian Reviewed-by: Guixin Liu Link: https://gitee.com/anolis/cloud-kernel/pulls/5473 Signed-off-by: Shuai Xue --- drivers/iommu/riscv/iommu.c | 617 +++++++++++++++++++++++++++++++++++- 1 file changed, 609 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index 0c6263101815..8a05def774bd 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -41,6 +41,10 @@ #define dev_to_iommu(dev) \ iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu) +/* IOMMU PSCID allocation namespace. */ +static DEFINE_IDA(riscv_iommu_pscids); +#define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1) + /* Device resource-managed allocations */ struct riscv_iommu_devres { void *addr; @@ -791,6 +795,197 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, return 0; } +/* This struct contains protection domain specific IOMMU driver data. */ +struct riscv_iommu_domain { + struct iommu_domain domain; + struct list_head bonds; + spinlock_t lock; /* protect bonds list updates. */ + int pscid; + bool amo_enabled; + int numa_node; + unsigned int pgd_mode; + unsigned long *pgd_root; +}; + +#define iommu_domain_to_riscv(iommu_domain) \ + container_of(iommu_domain, struct riscv_iommu_domain, domain) + +/* Private IOMMU data for managed devices, dev_iommu_priv_* */ +struct riscv_iommu_info { + struct riscv_iommu_domain *domain; +}; + +/* + * Linkage between an iommu_domain and attached devices. + * + * Protection domain requiring IOATC and DevATC translation cache invalidations, + * should be linked to attached devices using a riscv_iommu_bond structure. + * Devices should be linked to the domain before first use and unlinked after + * the translations from the referenced protection domain can no longer be used. + * Blocking and identity domains are not tracked here, as the IOMMU hardware + * does not cache negative and/or identity (BARE mode) translations, and DevATC + * is disabled for those protection domains. + * + * The device pointer and IOMMU data remain stable in the bond struct after + * _probe_device() where it's attached to the managed IOMMU, up to the + * completion of the _release_device() call. The release of the bond structure + * is synchronized with the device release. + */ +struct riscv_iommu_bond { + struct list_head list; + struct rcu_head rcu; + struct device *dev; +}; + +static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain, + struct device *dev) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_bond *bond; + struct list_head *bonds; + + bond = kzalloc(sizeof(*bond), GFP_KERNEL); + if (!bond) + return -ENOMEM; + bond->dev = dev; + + /* + * List of devices attached to the domain is arranged based on + * managed IOMMU device. + */ + + spin_lock(&domain->lock); + list_for_each(bonds, &domain->bonds) + if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu) + break; + list_add_rcu(&bond->list, bonds); + spin_unlock(&domain->lock); + + /* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */ + smp_mb(); + + return 0; +} + +static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain, + struct device *dev) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_bond *bond, *found = NULL; + struct riscv_iommu_command cmd; + int count = 0; + + if (!domain) + return; + + spin_lock(&domain->lock); + list_for_each_entry(bond, &domain->bonds, list) { + if (found && count) + break; + else if (bond->dev == dev) + found = bond; + else if (dev_to_iommu(bond->dev) == iommu) + count++; + } + if (found) + list_del_rcu(&found->list); + spin_unlock(&domain->lock); + kfree_rcu(found, rcu); + + /* + * If this was the last bond between this domain and the IOMMU + * invalidate all cached entries for domain's PSCID. + */ + if (!count) { + riscv_iommu_cmd_inval_vma(&cmd); + riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); + riscv_iommu_cmd_send(iommu, &cmd); + + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + } +} + +/* + * Send IOTLB.INVAL for whole address space for ranges larger than 2MB. + * This limit will be replaced with range invalidations, if supported by + * the hardware, when RISC-V IOMMU architecture specification update for + * range invalidations update will be available. + */ +#define RISCV_IOMMU_IOTLB_INVAL_LIMIT (2 << 20) + +static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain, + unsigned long start, unsigned long end) +{ + struct riscv_iommu_bond *bond; + struct riscv_iommu_device *iommu, *prev; + struct riscv_iommu_command cmd; + unsigned long len = end - start + 1; + unsigned long iova; + + /* + * For each IOMMU linked with this protection domain (via bonds->dev), + * an IOTLB invaliation command will be submitted and executed. + * + * Possbile race with domain attach flow is handled by sequencing + * bond creation - riscv_iommu_bond_link(), and device directory + * update - riscv_iommu_iodir_update(). + * + * PTE Update / IOTLB Inval Device attach & directory update + * -------------------------- -------------------------- + * update page table entries add dev to the bond list + * FENCE RW,RW FENCE RW,RW + * For all IOMMUs: (can be empty) Update FSC/PSCID + * FENCE IOW,IOW FENCE IOW,IOW + * IOTLB.INVAL IODIR.INVAL + * IOFENCE.C + * + * If bond list is not updated with new device, directory context will + * be configured with already valid page table content. If an IOMMU is + * linked to the protection domain it will receive invalidation + * requests for updated page table entries. + */ + smp_mb(); + + rcu_read_lock(); + + prev = NULL; + list_for_each_entry_rcu(bond, &domain->bonds, list) { + iommu = dev_to_iommu(bond->dev); + + /* + * IOTLB invalidation request can be safely omitted if already sent + * to the IOMMU for the same PSCID, and with domain->bonds list + * arranged based on the device's IOMMU, it's sufficient to check + * last device the invalidation was sent to. + */ + if (iommu == prev) + continue; + + riscv_iommu_cmd_inval_vma(&cmd); + riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); + if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) { + for (iova = start; iova < end; iova += PAGE_SIZE) { + riscv_iommu_cmd_inval_set_addr(&cmd, iova); + riscv_iommu_cmd_send(iommu, &cmd); + } + } else { + riscv_iommu_cmd_send(iommu, &cmd); + } + prev = iommu; + } + + prev = NULL; + list_for_each_entry_rcu(bond, &domain->bonds, list) { + iommu = dev_to_iommu(bond->dev); + if (iommu == prev) + continue; + + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + prev = iommu; + } + rcu_read_unlock(); +} + #define RISCV_IOMMU_FSC_BARE 0 /* @@ -810,10 +1005,28 @@ static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu, { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct riscv_iommu_dc *dc; + struct riscv_iommu_command cmd; + bool sync_required = false; u64 tc; int i; - /* Device context invalidation ignored for now. */ + for (i = 0; i < fwspec->num_ids; i++) { + dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); + tc = READ_ONCE(dc->tc); + if (!(tc & RISCV_IOMMU_DC_TC_V)) + continue; + + WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V); + + /* Invalidate device context cached values */ + riscv_iommu_cmd_iodir_inval_ddt(&cmd); + riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); + riscv_iommu_cmd_send(iommu, &cmd); + sync_required = true; + } + + if (sync_required) + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); /* * For device context with DC_TC_PDTV = 0, translation attributes valid bit @@ -829,15 +1042,388 @@ static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu, /* Update device context, write TC.V as the last step. */ dma_wmb(); WRITE_ONCE(dc->tc, tc); + + /* Invalidate device context after update */ + riscv_iommu_cmd_iodir_inval_ddt(&cmd); + riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); + riscv_iommu_cmd_send(iommu, &cmd); } + + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); +} + +/* + * IOVA page translation tree management. + */ + +static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + + riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); +} + +static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain, + struct iommu_iotlb_gather *gather) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + + riscv_iommu_iotlb_inval(domain, gather->start, gather->end); +} + +#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t))) + +#define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE)) +#define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF) +#define _io_pte_none(pte) ((pte) == 0) +#define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot)) + +static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain, + unsigned long pte, struct list_head *freelist) +{ + unsigned long *ptr; + int i; + + if (!_io_pte_present(pte) || _io_pte_leaf(pte)) + return; + + ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); + + /* Recursively free all sub page table pages */ + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = READ_ONCE(ptr[i]); + if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte) + riscv_iommu_pte_free(domain, pte, freelist); + } + + if (freelist) + list_add_tail(&virt_to_page(ptr)->lru, freelist); + else + iommu_free_page(ptr); +} + +static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain, + unsigned long iova, size_t pgsize, + gfp_t gfp) +{ + unsigned long *ptr = domain->pgd_root; + unsigned long pte, old; + int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; + void *addr; + + do { + const int shift = PAGE_SHIFT + PT_SHIFT * level; + + ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); + /* + * Note: returned entry might be a non-leaf if there was + * existing mapping with smaller granularity. Up to the caller + * to replace and invalidate. + */ + if (((size_t)1 << shift) == pgsize) + return ptr; +pte_retry: + pte = READ_ONCE(*ptr); + /* + * This is very likely incorrect as we should not be adding + * new mapping with smaller granularity on top + * of existing 2M/1G mapping. Fail. + */ + if (_io_pte_present(pte) && _io_pte_leaf(pte)) + return NULL; + /* + * Non-leaf entry is missing, allocate and try to add to the + * page table. This might race with other mappings, retry. + */ + if (_io_pte_none(pte)) { + addr = iommu_alloc_page_node(domain->numa_node, gfp); + if (!addr) + return NULL; + old = pte; + pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE); + if (cmpxchg_relaxed(ptr, old, pte) != old) { + iommu_free_page(addr); + goto pte_retry; + } + } + ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); + } while (level-- > 0); + + return NULL; +} + +static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain, + unsigned long iova, size_t *pte_pgsize) +{ + unsigned long *ptr = domain->pgd_root; + unsigned long pte; + int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; + + do { + const int shift = PAGE_SHIFT + PT_SHIFT * level; + + ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); + pte = READ_ONCE(*ptr); + if (_io_pte_present(pte) && _io_pte_leaf(pte)) { + *pte_pgsize = (size_t)1 << shift; + return ptr; + } + if (_io_pte_none(pte)) + return NULL; + ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); + } while (level-- > 0); + + return NULL; +} + +static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain, + unsigned long iova, phys_addr_t phys, + size_t pgsize, size_t pgcount, int prot, + gfp_t gfp, size_t *mapped) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + size_t size = 0; + unsigned long *ptr; + unsigned long pte, old, pte_prot; + int rc = 0; + LIST_HEAD(freelist); + + if (!(prot & IOMMU_WRITE)) + pte_prot = _PAGE_BASE | _PAGE_READ; + else if (domain->amo_enabled) + pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE; + else + pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY; + + while (pgcount) { + ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp); + if (!ptr) { + rc = -ENOMEM; + break; + } + + old = READ_ONCE(*ptr); + pte = _io_pte_entry(phys_to_pfn(phys), pte_prot); + if (cmpxchg_relaxed(ptr, old, pte) != old) + continue; + + riscv_iommu_pte_free(domain, old, &freelist); + + size += pgsize; + iova += pgsize; + phys += pgsize; + --pgcount; + } + + *mapped = size; + + if (!list_empty(&freelist)) { + /* + * In 1.0 spec version, the smallest scope we can use to + * invalidate all levels of page table (i.e. leaf and non-leaf) + * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0. + * This will be updated with hardware support for + * capability.NL (non-leaf) IOTINVAL command. + */ + riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); + iommu_put_pages_list(&freelist); + } + + return rc; +} + +static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain, + unsigned long iova, size_t pgsize, + size_t pgcount, + struct iommu_iotlb_gather *gather) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + size_t size = pgcount << __ffs(pgsize); + unsigned long *ptr, old; + size_t unmapped = 0; + size_t pte_size; + + while (unmapped < size) { + ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); + if (!ptr) + return unmapped; + + /* partial unmap is not allowed, fail. */ + if (iova & (pte_size - 1)) + return unmapped; + + old = READ_ONCE(*ptr); + if (cmpxchg_relaxed(ptr, old, 0) != old) + continue; + + iommu_iotlb_gather_add_page(&domain->domain, gather, iova, + pte_size); + + iova += pte_size; + unmapped += pte_size; + } + + return unmapped; +} + +static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain, + dma_addr_t iova) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + unsigned long pte_size; + unsigned long *ptr; + + ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); + if (_io_pte_none(*ptr) || !_io_pte_present(*ptr)) + return 0; + + return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1)); +} + +static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + const unsigned long pfn = virt_to_pfn(domain->pgd_root); + + WARN_ON(!list_empty(&domain->bonds)); + + if ((int)domain->pscid > 0) + ida_free(&riscv_iommu_pscids, domain->pscid); + + riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL); + kfree(domain); +} + +static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode) +{ + switch (pgd_mode) { + case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39: + return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39; + + case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48: + return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48; + + case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57: + return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57; + } + return false; +} + +static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain, + struct device *dev) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + u64 fsc, ta; + + if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode)) + return -ENODEV; + + fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) | + FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root)); + ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) | + RISCV_IOMMU_PC_TA_V; + + if (riscv_iommu_bond_link(domain, dev)) + return -ENOMEM; + + riscv_iommu_iodir_update(iommu, dev, fsc, ta); + riscv_iommu_bond_unlink(info->domain, dev); + info->domain = domain; + + return 0; +} + +static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = { + .attach_dev = riscv_iommu_attach_paging_domain, + .free = riscv_iommu_free_paging_domain, + .map_pages = riscv_iommu_map_pages, + .unmap_pages = riscv_iommu_unmap_pages, + .iova_to_phys = riscv_iommu_iova_to_phys, + .iotlb_sync = riscv_iommu_iotlb_sync, + .flush_iotlb_all = riscv_iommu_iotlb_flush_all, +}; + +static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev) +{ + struct riscv_iommu_domain *domain; + struct riscv_iommu_device *iommu; + unsigned int pgd_mode; + dma_addr_t va_mask; + int va_bits; + + iommu = dev_to_iommu(dev); + if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) { + pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57; + va_bits = 57; + } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) { + pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48; + va_bits = 48; + } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) { + pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39; + va_bits = 39; + } else { + dev_err(dev, "cannot find supported page table mode\n"); + return ERR_PTR(-ENODEV); + } + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD_RCU(&domain->bonds); + spin_lock_init(&domain->lock); + domain->numa_node = dev_to_node(iommu->dev); + domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD); + domain->pgd_mode = pgd_mode; + domain->pgd_root = iommu_alloc_page_node(domain->numa_node, + GFP_KERNEL_ACCOUNT); + if (!domain->pgd_root) { + kfree(domain); + return ERR_PTR(-ENOMEM); + } + + domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1, + RISCV_IOMMU_MAX_PSCID, GFP_KERNEL); + if (domain->pscid < 0) { + iommu_free_page(domain->pgd_root); + kfree(domain); + return ERR_PTR(-ENOMEM); + } + + /* + * Note: RISC-V Privilege spec mandates that virtual addresses + * need to be sign-extended, so if (VA_BITS - 1) is set, all + * bits >= VA_BITS need to also be set or else we'll get a + * page fault. However the code that creates the mappings + * above us (e.g. iommu_dma_alloc_iova()) won't do that for us + * for now, so we'll end up with invalid virtual addresses + * to map. As a workaround until we get this sorted out + * limit the available virtual addresses to VA_BITS - 1. + */ + va_mask = DMA_BIT_MASK(va_bits - 1); + + domain->domain.geometry.aperture_start = 0; + domain->domain.geometry.aperture_end = va_mask; + domain->domain.geometry.force_aperture = true; + domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G); + + domain->domain.ops = &riscv_iommu_paging_domain_ops; + + return &domain->domain; } static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, struct device *dev) { struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + /* Make device context invalid, translation requests will fault w/ #258 */ riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0); + riscv_iommu_bond_unlink(info->domain, dev); + info->domain = NULL; return 0; } @@ -853,8 +1439,11 @@ static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain, struct device *dev) { struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V); + riscv_iommu_bond_unlink(info->domain, dev); + info->domain = NULL; return 0; } @@ -866,11 +1455,6 @@ static struct iommu_domain riscv_iommu_identity_domain = { } }; -static int riscv_iommu_device_domain_type(struct device *dev) -{ - return IOMMU_DOMAIN_IDENTITY; -} - static struct iommu_group *riscv_iommu_device_group(struct device *dev) { if (dev_is_pci(dev)) @@ -887,6 +1471,7 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct riscv_iommu_device *iommu; + struct riscv_iommu_info *info; struct riscv_iommu_dc *dc; u64 tc; int i; @@ -905,6 +1490,9 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev) if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) return ERR_PTR(-ENODEV); + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); /* * Allocate and pre-configure device context entries in * the device directory. Do not mark the context valid yet. @@ -914,24 +1502,37 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev) tc |= RISCV_IOMMU_DC_TC_SADE; for (i = 0; i < fwspec->num_ids; i++) { dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); - if (!dc) + if (!dc) { + kfree(info); return ERR_PTR(-ENODEV); + } if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V) dev_warn(dev, "already attached to IOMMU device directory\n"); WRITE_ONCE(dc->tc, tc); } + dev_iommu_priv_set(dev, info); + return &iommu->iommu; } +static void riscv_iommu_release_device(struct device *dev) +{ + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + + kfree_rcu_mightsleep(info); +} + static const struct iommu_ops riscv_iommu_ops = { + .pgsize_bitmap = SZ_4K, .of_xlate = riscv_iommu_of_xlate, .identity_domain = &riscv_iommu_identity_domain, .blocked_domain = &riscv_iommu_blocking_domain, .release_domain = &riscv_iommu_blocking_domain, - .def_domain_type = riscv_iommu_device_domain_type, + .domain_alloc_paging = riscv_iommu_alloc_paging_domain, .device_group = riscv_iommu_device_group, .probe_device = riscv_iommu_probe_device, + .release_device = riscv_iommu_release_device, }; static int riscv_iommu_init_check(struct riscv_iommu_device *iommu) -- Gitee From b6c0bfdec84de9ab40bba305e24e956395c7b99a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 4 Mar 2024 15:50:08 -0400 Subject: [PATCH 174/219] iommu/arm-smmu-v3: Add cpu_to_le64() around STRTAB_STE_0_V ANBZ: #13617 commit 0493e739ccc60a3e0870847f1a12d6d79b86a1fc upstream. STRTAB_STE_0_V is a CPU value, it needs conversion for sparse to be clean. The missing annotation was a mistake introduced by splitting the ops out from the STE writer. Fixes: 7da51af9125c ("iommu/arm-smmu-v3: Make STE programming independent of the callers") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403011441.5WqGrYjp-lkp@intel.com/ Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-98b23ebb0c84+9f-smmu_cputole_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index b2c19f34ab26..bf346bce1e46 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1139,7 +1139,8 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, * requires a breaking update, zero the V bit, write all qwords * but 0, then set qword 0 */ - unused_update.data[0] = entry->data[0] & (~STRTAB_STE_0_V); + unused_update.data[0] = entry->data[0] & + cpu_to_le64(~STRTAB_STE_0_V); entry_set(smmu, sid, entry, &unused_update, 0, 1); entry_set(smmu, sid, entry, target, 1, num_entry_qwords - 1); entry_set(smmu, sid, entry, target, 0, 1); -- Gitee From 39119cb5efc06ad57133601e9ce230db126fd238 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Sat, 23 Mar 2024 13:46:58 +0000 Subject: [PATCH 175/219] iommu/arm-smmu-v3: Fix access for STE.SHCFG ANBZ: #13617 commit ec9098d6bffea6e82d63640134c123a3d96e0781 upstream. STE attributes(NSCFG, PRIVCFG, INSTCFG) use value 0 for "Use Icomming", for some reason SHCFG doesn't follow that, and it is defined as "0b01". Currently the driver sets SHCFG to Use Incoming for stage-2 and bypass domains. However according to the User Manual (ARM IHI 0070 F.b): When SMMU_IDR1.ATTR_TYPES_OVR == 0, this field is RES0 and the incoming Shareability attribute is used. This patch adds a condition for writing SHCFG to Use incoming to be compliant with the architecture, and defines ATTR_TYPE_OVR as a new feature discovered from IDR1. This also required to propagate the SMMU through some functions args. There is no need to add similar condition for the newly introduced function arm_smmu_get_ste_used() as the values of the STE are the same before and after any transition, so this will not trigger any change. (we already do the same for the VMID). Although this is a misconfiguration from the driver, this has been there for a long time, so probably no HW running Linux is affected by it. Reported-by: Will Deacon Closes: https://lore.kernel.org/all/20240215134952.GA690@willie-the-truck/ Signed-off-by: Mostafa Saleh Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240323134658.464743-1-smostafa@google.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 35 ++++++++++++++------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 ++ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index bf346bce1e46..e2dac42158b0 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1454,14 +1454,17 @@ static void arm_smmu_make_abort_ste(struct arm_smmu_ste *target) FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT)); } -static void arm_smmu_make_bypass_ste(struct arm_smmu_ste *target) +static void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu, + struct arm_smmu_ste *target) { memset(target, 0, sizeof(*target)); target->data[0] = cpu_to_le64( STRTAB_STE_0_V | FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS)); - target->data[1] = cpu_to_le64( - FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); + + if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR) + target->data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, + STRTAB_STE_1_SHCFG_INCOMING)); } static void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, @@ -1524,6 +1527,7 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, typeof(&pgtbl_cfg->arm_lpae_s2_cfg.vtcr) vtcr = &pgtbl_cfg->arm_lpae_s2_cfg.vtcr; u64 vtcr_val; + struct arm_smmu_device *smmu = master->smmu; memset(target, 0, sizeof(*target)); target->data[0] = cpu_to_le64( @@ -1532,9 +1536,11 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, target->data[1] = cpu_to_le64( FIELD_PREP(STRTAB_STE_1_EATS, - master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0) | - FIELD_PREP(STRTAB_STE_1_SHCFG, - STRTAB_STE_1_SHCFG_INCOMING)); + master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); + + if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR) + target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, + STRTAB_STE_1_SHCFG_INCOMING)); vtcr_val = FIELD_PREP(STRTAB_STE_2_VTCR_S2T0SZ, vtcr->tsz) | FIELD_PREP(STRTAB_STE_2_VTCR_S2SL0, vtcr->sl) | @@ -1561,7 +1567,8 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, * This can safely directly manipulate the STE memory without a sync sequence * because the STE table has not been installed in the SMMU yet. */ -static void arm_smmu_init_initial_stes(struct arm_smmu_ste *strtab, +static void arm_smmu_init_initial_stes(struct arm_smmu_device *smmu, + struct arm_smmu_ste *strtab, unsigned int nent) { unsigned int i; @@ -1570,7 +1577,7 @@ static void arm_smmu_init_initial_stes(struct arm_smmu_ste *strtab, if (disable_bypass) arm_smmu_make_abort_ste(strtab); else - arm_smmu_make_bypass_ste(strtab); + arm_smmu_make_bypass_ste(smmu, strtab); strtab++; } } @@ -1598,7 +1605,7 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) return -ENOMEM; } - arm_smmu_init_initial_stes(desc->l2ptr, 1 << STRTAB_SPLIT); + arm_smmu_init_initial_stes(smmu, desc->l2ptr, 1 << STRTAB_SPLIT); arm_smmu_write_strtab_l1_desc(strtab, desc); return 0; } @@ -2649,8 +2656,9 @@ static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, struct device *dev) { struct arm_smmu_ste ste; + struct arm_smmu_master *master = dev_iommu_priv_get(dev); - arm_smmu_make_bypass_ste(&ste); + arm_smmu_make_bypass_ste(master->smmu, &ste); return arm_smmu_attach_dev_ste(dev, &ste); } @@ -3270,7 +3278,7 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits); cfg->strtab_base_cfg = reg; - arm_smmu_init_initial_stes(strtab, cfg->num_l1_ents); + arm_smmu_init_initial_stes(smmu, strtab, cfg->num_l1_ents); return 0; } @@ -3783,6 +3791,9 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) return -ENXIO; } + if (reg & IDR1_ATTR_TYPES_OVR) + smmu->features |= ARM_SMMU_FEAT_ATTR_TYPES_OVR; + /* Queue sizes, capped to ensure natural alignment */ smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, FIELD_GET(IDR1_CMDQS, reg)); @@ -3998,7 +4009,7 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu) * STE table is not programmed to HW, see * arm_smmu_initial_bypass_stes() */ - arm_smmu_make_bypass_ste( + arm_smmu_make_bypass_ste(smmu, arm_smmu_get_step_for_sid(smmu, rmr->sids[i])); } } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 154808f96718..8d181b5e5f1b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -44,6 +44,7 @@ #define IDR1_TABLES_PRESET (1 << 30) #define IDR1_QUEUES_PRESET (1 << 29) #define IDR1_REL (1 << 28) +#define IDR1_ATTR_TYPES_OVR (1 << 27) #define IDR1_CMDQS GENMASK(25, 21) #define IDR1_EVTQS GENMASK(20, 16) #define IDR1_PRIQS GENMASK(15, 11) @@ -647,6 +648,7 @@ struct arm_smmu_device { #define ARM_SMMU_FEAT_SVA (1 << 17) #define ARM_SMMU_FEAT_E2H (1 << 18) #define ARM_SMMU_FEAT_NESTING (1 << 19) +#define ARM_SMMU_FEAT_ATTR_TYPES_OVR (1 << 20) u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) -- Gitee From 3591a9de9bdec094646bbe5825664da055d42a11 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 27 Mar 2024 10:41:39 -0300 Subject: [PATCH 176/219] iommu: Validate the PASID in iommu_attach_device_pasid() ANBZ: #13617 commit c404f55c26fc23c70a9f2262f3f36a69fc46289b upstream. The SVA code checks that the PASID is valid for the device when assigning the PASID to the MM, but the normal PAGING related path does not check it. Devices that don't support PASID or PASID values too large for the device should not invoke the driver callback. The drivers should rely on the core code for this enforcement. Fixes: 16603704559c7a68 ("iommu: Add attach/detach_dev_pasid iommu interfaces") Signed-off-by: Jason Gunthorpe Reviewed-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/0-v1-460705442b30+659-iommu_check_pasid_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d044850618bf..e4fd05bcf22a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3383,6 +3383,7 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; + struct group_device *device; void *curr; int ret; @@ -3392,10 +3393,18 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, if (!group) return -ENODEV; - if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner) + if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner || + pasid == IOMMU_NO_PASID) return -EINVAL; mutex_lock(&group->mutex); + for_each_group_device(group, device) { + if (pasid >= device->dev->iommu->max_pasids) { + ret = -EINVAL; + goto out_unlock; + } + } + curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, domain, GFP_KERNEL); if (curr) { ret = xa_err(curr) ? : -EBUSY; -- Gitee From a587df16afe54e2d70ab4d870444b02437bfdd06 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 5 Apr 2024 17:52:07 +0100 Subject: [PATCH 177/219] iommu/arm-smmu-v3: Retire disable_bypass parameter ANBZ: #13617 commit 734554fdfce6731b22f0777ec3f1e4a853354883 upstream. The disable_bypass parameter has been mostly meaningless for a long time since the introduction of default domains. Its original intent is now fulfilled by the controls users have over the default domain type, and its remaining effect in the brief window between Stream Table initialisation and default domain creation hardly seems worth the complication. Furthermore, thanks to 2-level Stream Tables, disabling disable_bypass (there's another reason not to like it right there) has never guaranteed that any particular StreamID *will* bypass anyway - any device which might actually care about that wants RMRs - so there's not really much lost by taking away that option (which has already been non-default for nearing 6 years now). As part of this, also remove the weird behaviour where we "successfully" probe and register a non-functional SMMU if the DT "#iommu-cells" property is wrong. I have no memory of what possessed me to think that was a good idea at the time, and by now I suspect it's likely to break things worse than simply failing probe would. Signed-off-by: Robin Murphy Reviewed-by: Mostafa Saleh Link: https://lore.kernel.org/r/ea3ac4cd595a81b5511729601b2f7d4668178438.1712335927.git.robin.murphy@arm.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 46 ++++++--------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index e2dac42158b0..895719ab1db2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -30,11 +30,6 @@ #include "arm-smmu-v3.h" #include "../../dma-iommu.h" -static bool disable_bypass = true; -module_param(disable_bypass, bool, 0444); -MODULE_PARM_DESC(disable_bypass, - "Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU."); - static bool disable_msipolling; module_param(disable_msipolling, bool, 0444); MODULE_PARM_DESC(disable_msipolling, @@ -1567,17 +1562,13 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, * This can safely directly manipulate the STE memory without a sync sequence * because the STE table has not been installed in the SMMU yet. */ -static void arm_smmu_init_initial_stes(struct arm_smmu_device *smmu, - struct arm_smmu_ste *strtab, +static void arm_smmu_init_initial_stes(struct arm_smmu_ste *strtab, unsigned int nent) { unsigned int i; for (i = 0; i < nent; ++i) { - if (disable_bypass) - arm_smmu_make_abort_ste(strtab); - else - arm_smmu_make_bypass_ste(smmu, strtab); + arm_smmu_make_abort_ste(strtab); strtab++; } } @@ -1605,7 +1596,7 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) return -ENOMEM; } - arm_smmu_init_initial_stes(smmu, desc->l2ptr, 1 << STRTAB_SPLIT); + arm_smmu_init_initial_stes(desc->l2ptr, 1 << STRTAB_SPLIT); arm_smmu_write_strtab_l1_desc(strtab, desc); return 0; } @@ -2920,10 +2911,10 @@ static void arm_smmu_release_device(struct device *dev) iopf_queue_remove_device(master->smmu->evtq.iopf, dev); /* Put the STE back to what arm_smmu_init_strtab() sets */ - if (disable_bypass && !dev->iommu->require_direct) - arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev); - else + if (dev->iommu->require_direct) arm_smmu_attach_dev_identity(&arm_smmu_identity_domain, dev); + else + arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev); arm_smmu_disable_pasid(master); arm_smmu_remove_master(master); @@ -3278,7 +3269,7 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits); cfg->strtab_base_cfg = reg; - arm_smmu_init_initial_stes(smmu, strtab, cfg->num_l1_ents); + arm_smmu_init_initial_stes(strtab, cfg->num_l1_ents); return 0; } @@ -3508,7 +3499,7 @@ static int arm_smmu_device_disable(struct arm_smmu_device *smmu) return ret; } -static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) +static int arm_smmu_device_reset(struct arm_smmu_device *smmu) { int ret; u32 reg, enables; @@ -3518,7 +3509,6 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) reg = readl_relaxed(smmu->base + ARM_SMMU_CR0); if (reg & CR0_SMMUEN) { dev_warn(smmu->dev, "SMMU currently enabled! Resetting...\n"); - WARN_ON(is_kdump_kernel() && !disable_bypass); arm_smmu_update_gbpa(smmu, GBPA_ABORT, 0); } @@ -3625,14 +3615,8 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) if (is_kdump_kernel()) enables &= ~(CR0_EVTQEN | CR0_PRIQEN); - /* Enable the SMMU interface, or ensure bypass */ - if (!bypass || disable_bypass) { - enables |= CR0_SMMUEN; - } else { - ret = arm_smmu_update_gbpa(smmu, 0, GBPA_ABORT); - if (ret) - return ret; - } + /* Enable the SMMU interface */ + enables |= CR0_SMMUEN; ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0, ARM_SMMU_CR0ACK); if (ret) { @@ -4024,7 +4008,6 @@ static int arm_smmu_device_probe(struct platform_device *pdev) resource_size_t ioaddr; struct arm_smmu_device *smmu; struct device *dev = &pdev->dev; - bool bypass; smmu = devm_kzalloc(dev, sizeof(*smmu), GFP_KERNEL); if (!smmu) @@ -4035,12 +4018,9 @@ static int arm_smmu_device_probe(struct platform_device *pdev) ret = arm_smmu_device_dt_probe(pdev, smmu); } else { ret = arm_smmu_device_acpi_probe(pdev, smmu); - if (ret == -ENODEV) - return ret; } - - /* Set bypass mode according to firmware probing result */ - bypass = !!ret; + if (ret) + return ret; /* Base address */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); @@ -4104,7 +4084,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev) arm_smmu_rmr_install_bypass_ste(smmu); /* Reset the device */ - ret = arm_smmu_device_reset(smmu, bypass); + ret = arm_smmu_device_reset(smmu); if (ret) goto err_disable; -- Gitee From cf6a15fd3985befdc671fa52cd2d9544db804523 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 27 Mar 2024 15:07:49 -0300 Subject: [PATCH 178/219] iommu/arm-smmu-v3: Do not allow a SVA domain to be set on the wrong PASID ANBZ: #13617 commit fdc69d39e77f88264ee6e8174ff9aaf0953aecd9 upstream. The SVA code is wired to assume that the SVA is programmed onto the mm->pasid. The current core code always does this, so it is fine. Add a check for clarity. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v6-228e7adf25eb+4155-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 2cd433a9c8a0..41b44baef15e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -569,6 +569,9 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, int ret = 0; struct mm_struct *mm = domain->mm; + if (mm_get_enqcmd_pasid(mm) != id) + return -EINVAL; + mutex_lock(&sva_lock); ret = __arm_smmu_sva_bind(dev, id, mm); mutex_unlock(&sva_lock); -- Gitee From 547b1c76d63c2818a070432157c71eafce15f937 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 27 Mar 2024 15:07:50 -0300 Subject: [PATCH 179/219] iommu/arm-smmu-v3: Do not ATC invalidate the entire domain ANBZ: #13617 commit 86e5ca098dd9f8c5b80a6205395aea0535018837 upstream. At this point we know which master we are going to change the PCI config on, this is the only device we need to invalidate. Switch arm_smmu_atc_inv_domain() for arm_smmu_atc_inv_master(). Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Moritz Fischer Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v6-228e7adf25eb+4155-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 895719ab1db2..6a7c871841cc 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2421,7 +2421,10 @@ static void arm_smmu_enable_ats(struct arm_smmu_master *master, pdev = to_pci_dev(master->dev); atomic_inc(&smmu_domain->nr_ats_masters); - arm_smmu_atc_inv_domain(smmu_domain, IOMMU_NO_PASID, 0, 0); + /* + * ATC invalidation of PASID 0 causes the entire ATC to be flushed. + */ + arm_smmu_atc_inv_master(master); if (pci_enable_ats(pdev, stu)) dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu); } -- Gitee From 04c070d6f1f01394d79adbd9d01b544a9c697714 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 27 Mar 2024 15:07:51 -0300 Subject: [PATCH 180/219] iommu/arm-smmu-v3: Add a type for the CD entry ANBZ: #13617 commit e8e4398d53f98be7ac48e0bda9ea6e26df24136d upstream. Instead of passing a naked __le16 * around to represent a CD table entry wrap it in a "struct arm_smmu_cd" with an array of the correct size. This makes it much clearer which functions will comprise the "CD API". Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Michael Shavit Reviewed-by: Moritz Fischer Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v6-228e7adf25eb+4155-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 20 +++++++++++--------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 ++++++- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6a7c871841cc..0b42059a7866 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1209,7 +1209,8 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst, WRITE_ONCE(*dst, cpu_to_le64(val)); } -static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid) +static struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, + u32 ssid) { __le64 *l1ptr; unsigned int idx; @@ -1218,7 +1219,8 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid) struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_LINEAR) - return cd_table->cdtab + ssid * CTXDESC_CD_DWORDS; + return (struct arm_smmu_cd *)(cd_table->cdtab + + ssid * CTXDESC_CD_DWORDS); idx = ssid >> CTXDESC_SPLIT; l1_desc = &cd_table->l1_desc[idx]; @@ -1232,7 +1234,7 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid) arm_smmu_sync_cd(master, ssid, false); } idx = ssid & (CTXDESC_L2_ENTRIES - 1); - return l1_desc->l2ptr + idx * CTXDESC_CD_DWORDS; + return &l1_desc->l2ptr[idx]; } int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, @@ -1251,7 +1253,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, */ u64 val; bool cd_live; - __le64 *cdptr; + struct arm_smmu_cd *cdptr; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; struct arm_smmu_device *smmu = master->smmu; @@ -1262,7 +1264,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, if (!cdptr) return -ENOMEM; - val = le64_to_cpu(cdptr[0]); + val = le64_to_cpu(cdptr->data[0]); cd_live = !!(val & CTXDESC_CD_0_V); if (!cd) { /* (5) */ @@ -1279,9 +1281,9 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, * this substream's traffic */ } else { /* (1) and (2) */ - cdptr[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK); - cdptr[2] = 0; - cdptr[3] = cpu_to_le64(cd->mair); + cdptr->data[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK); + cdptr->data[2] = 0; + cdptr->data[3] = cpu_to_le64(cd->mair); /* * STE may be live, and the SMMU might read dwords of this CD in any @@ -1313,7 +1315,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, * field within an aligned 64-bit span of a structure can be altered * without first making the structure invalid. */ - WRITE_ONCE(cdptr[0], cpu_to_le64(val)); + WRITE_ONCE(cdptr->data[0], cpu_to_le64(val)); arm_smmu_sync_cd(master, ssid, true); return 0; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 8d181b5e5f1b..2da767a542ff 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -283,6 +283,11 @@ struct arm_smmu_ste { #define CTXDESC_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 12) #define CTXDESC_CD_DWORDS 8 + +struct arm_smmu_cd { + __le64 data[CTXDESC_CD_DWORDS]; +}; + #define CTXDESC_CD_0_TCR_T0SZ GENMASK_ULL(5, 0) #define CTXDESC_CD_0_TCR_TG0 GENMASK_ULL(7, 6) #define CTXDESC_CD_0_TCR_IRGN0 GENMASK_ULL(9, 8) @@ -592,7 +597,7 @@ struct arm_smmu_ctx_desc { }; struct arm_smmu_l1_ctx_desc { - __le64 *l2ptr; + struct arm_smmu_cd *l2ptr; dma_addr_t l2ptr_dma; }; -- Gitee From fcde6a7b9eff64b07b69516ed81d7609fe98a6b5 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 4 Apr 2024 10:27:17 +0000 Subject: [PATCH 181/219] iommu/amd: Fix possible irq lock inversion dependency issue ANBZ: #13617 commit 84b1cec4fac5889b6d138d4b5ff6f2d5ef126c5e upstream. LOCKDEP detector reported below warning: ---------------------------------------- [ 23.796949] ======================================================== [ 23.796950] WARNING: possible irq lock inversion dependency detected [ 23.796952] 6.8.0fix+ #811 Not tainted [ 23.796954] -------------------------------------------------------- [ 23.796954] kworker/0:1/8 just changed the state of lock: [ 23.796956] ff365325e084a9b8 (&domain->lock){..-.}-{3:3}, at: amd_iommu_flush_iotlb_all+0x1f/0x50 [ 23.796969] but this lock took another, SOFTIRQ-unsafe lock in the past: [ 23.796970] (pd_bitmap_lock){+.+.}-{3:3} [ 23.796972] and interrupts could create inverse lock ordering between them. [ 23.796973] other info that might help us debug this: [ 23.796974] Chain exists of: &domain->lock --> &dev_data->lock --> pd_bitmap_lock [ 23.796980] Possible interrupt unsafe locking scenario: [ 23.796981] CPU0 CPU1 [ 23.796982] ---- ---- [ 23.796983] lock(pd_bitmap_lock); [ 23.796985] local_irq_disable(); [ 23.796985] lock(&domain->lock); [ 23.796988] lock(&dev_data->lock); [ 23.796990] [ 23.796991] lock(&domain->lock); Fix this issue by disabling interrupt when acquiring pd_bitmap_lock. Note that this is temporary fix. We have a plan to replace custom bitmap allocator with IDA allocator. Fixes: 87a6f1f22c97 ("iommu/amd: Introduce per-device domain ID to fix potential TLB aliasing issue") Reviewed-by: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20240404102717.6705-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/iommu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 56c546780c63..55f004a21795 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1701,26 +1701,29 @@ int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, static u16 domain_id_alloc(void) { + unsigned long flags; int id; - spin_lock(&pd_bitmap_lock); + spin_lock_irqsave(&pd_bitmap_lock, flags); id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); BUG_ON(id == 0); if (id > 0 && id < MAX_DOMAIN_ID) __set_bit(id, amd_iommu_pd_alloc_bitmap); else id = 0; - spin_unlock(&pd_bitmap_lock); + spin_unlock_irqrestore(&pd_bitmap_lock, flags); return id; } static void domain_id_free(int id) { - spin_lock(&pd_bitmap_lock); + unsigned long flags; + + spin_lock_irqsave(&pd_bitmap_lock, flags); if (id > 0 && id < MAX_DOMAIN_ID) __clear_bit(id, amd_iommu_pd_alloc_bitmap); - spin_unlock(&pd_bitmap_lock); + spin_unlock_irqrestore(&pd_bitmap_lock, flags); } static void free_gcr3_tbl_level1(u64 *tbl) -- Gitee From 66e2efd6de6f8407e096b0972c21a032f3808fb7 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 11 Apr 2024 11:07:44 +0800 Subject: [PATCH 182/219] iommu/vt-d: Fix WARN_ON in iommu probe path ANBZ: #13617 commit 89436f4f54125b1297aec1f466efd8acb4ec613d upstream. Commit 1a75cc710b95 ("iommu/vt-d: Use rbtree to track iommu probed devices") adds all devices probed by the iommu driver in a rbtree indexed by the source ID of each device. It assumes that each device has a unique source ID. This assumption is incorrect and the VT-d spec doesn't state this requirement either. The reason for using a rbtree to track devices is to look up the device with PCI bus and devfunc in the paths of handling ATS invalidation time out error and the PRI I/O page faults. Both are PCI ATS feature related. Only track the devices that have PCI ATS capabilities in the rbtree to avoid unnecessary WARN_ON in the iommu probe path. Otherwise, on some platforms below kernel splat will be displayed and the iommu probe results in failure. WARNING: CPU: 3 PID: 166 at drivers/iommu/intel/iommu.c:158 intel_iommu_probe_device+0x319/0xd90 Call Trace: ? __warn+0x7e/0x180 ? intel_iommu_probe_device+0x319/0xd90 ? report_bug+0x1f8/0x200 ? handle_bug+0x3c/0x70 ? exc_invalid_op+0x18/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? intel_iommu_probe_device+0x319/0xd90 ? debug_mutex_init+0x37/0x50 __iommu_probe_device+0xf2/0x4f0 iommu_probe_device+0x22/0x70 iommu_bus_notifier+0x1e/0x40 notifier_call_chain+0x46/0x150 blocking_notifier_call_chain+0x42/0x60 bus_notify+0x2f/0x50 device_add+0x5ed/0x7e0 platform_device_add+0xf5/0x240 mfd_add_devices+0x3f9/0x500 ? preempt_count_add+0x4c/0xa0 ? up_write+0xa2/0x1b0 ? __debugfs_create_file+0xe3/0x150 intel_lpss_probe+0x49f/0x5b0 ? pci_conf1_write+0xa3/0xf0 intel_lpss_pci_probe+0xcf/0x110 [intel_lpss_pci] pci_device_probe+0x95/0x120 really_probe+0xd9/0x370 ? __pfx___driver_attach+0x10/0x10 __driver_probe_device+0x73/0x150 driver_probe_device+0x19/0xa0 __driver_attach+0xb6/0x180 ? __pfx___driver_attach+0x10/0x10 bus_for_each_dev+0x77/0xd0 bus_add_driver+0x114/0x210 driver_register+0x5b/0x110 ? __pfx_intel_lpss_pci_driver_init+0x10/0x10 [intel_lpss_pci] do_one_initcall+0x57/0x2b0 ? kmalloc_trace+0x21e/0x280 ? do_init_module+0x1e/0x210 do_init_module+0x5f/0x210 load_module+0x1d37/0x1fc0 ? init_module_from_file+0x86/0xd0 init_module_from_file+0x86/0xd0 idempotent_init_module+0x17c/0x230 __x64_sys_finit_module+0x56/0xb0 do_syscall_64+0x6e/0x140 entry_SYSCALL_64_after_hwframe+0x71/0x79 Fixes: 1a75cc710b95 ("iommu/vt-d: Use rbtree to track iommu probed devices") Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10689 Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20240407011429.136282-1-baolu.lu@linux.intel.com Reviewed-by: Kevin Tian Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 49ffec0b8f92..9c1ec1a5f271 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4373,9 +4373,11 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) } dev_iommu_priv_set(dev, info); - ret = device_rbtree_insert(iommu, info); - if (ret) - goto free; + if (pdev && pci_ats_supported(pdev)) { + ret = device_rbtree_insert(iommu, info); + if (ret) + goto free; + } if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { ret = intel_pasid_alloc_table(dev); @@ -4410,7 +4412,8 @@ static void intel_iommu_release_device(struct device *dev) struct intel_iommu *iommu = info->iommu; mutex_lock(&iommu->iopf_lock); - device_rbtree_remove(info); + if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) + device_rbtree_remove(info); mutex_unlock(&iommu->iopf_lock); if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && -- Gitee From 66f16c72fcdf99dd4cff396bfbf28d27e7b42c7b Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 28 Mar 2024 05:29:58 -0700 Subject: [PATCH 183/219] iommu: Pass domain to remove_dev_pasid() op ANBZ: #13617 commit d2f85a263883b679f87ed8f911746105658e9c47 upstream. Existing remove_dev_pasid() callbacks of the underlying iommu drivers get the attached domain from the group->pasid_array. However, the domain stored in group->pasid_array is not always correct in all scenarios. A wrong domain may result in failure in remove_dev_pasid() callback. To avoid such problems, it is more reliable to pass the domain to the remove_dev_pasid() op. Suggested-by: Jason Gunthorpe Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240328122958.83332-3-yi.l.liu@intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 ++------- drivers/iommu/intel/iommu.c | 11 +++-------- drivers/iommu/iommu.c | 9 +++++---- include/linux/iommu.h | 3 ++- 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 0b42059a7866..e064fd8bec6d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3054,14 +3054,9 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } -static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid) +static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid, + struct iommu_domain *domain) { - struct iommu_domain *domain; - - domain = iommu_get_domain_for_dev_pasid(dev, pasid, IOMMU_DOMAIN_SVA); - if (WARN_ON(IS_ERR(domain)) || !domain) - return; - arm_smmu_sva_remove_dev_pasid(domain, dev, pasid); } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9c1ec1a5f271..a7890a92665b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4667,19 +4667,15 @@ static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, return 0; } -static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) +static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, + struct iommu_domain *domain) { struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct dev_pasid_info *curr, *dev_pasid = NULL; struct intel_iommu *iommu = info->iommu; - struct dmar_domain *dmar_domain; - struct iommu_domain *domain; unsigned long flags; - domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0); - if (WARN_ON_ONCE(!domain)) - goto out_tear_down; - /* * The SVA implementation needs to handle its own stuffs like the mm * notification. Before consolidating that code into iommu core, let @@ -4690,7 +4686,6 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) goto out_tear_down; } - dmar_domain = to_dmar_domain(domain); spin_lock_irqsave(&dmar_domain->lock, flags); list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { if (curr->dev == dev && curr->pasid == pasid) { diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index e4fd05bcf22a..9265670edf0b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3353,20 +3353,21 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain, if (device == last_gdev) break; - ops->remove_dev_pasid(device->dev, pasid); + ops->remove_dev_pasid(device->dev, pasid, domain); } return ret; } static void __iommu_remove_group_pasid(struct iommu_group *group, - ioasid_t pasid) + ioasid_t pasid, + struct iommu_domain *domain) { struct group_device *device; const struct iommu_ops *ops; for_each_group_device(group, device) { ops = dev_iommu_ops(device->dev); - ops->remove_dev_pasid(device->dev, pasid); + ops->remove_dev_pasid(device->dev, pasid, domain); } } @@ -3436,7 +3437,7 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, struct iommu_group *group = dev->iommu_group; mutex_lock(&group->mutex); - __iommu_remove_group_pasid(group, pasid); + __iommu_remove_group_pasid(group, pasid, domain); WARN_ON(xa_erase(&group->pasid_array, pasid) != domain); mutex_unlock(&group->mutex); } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8abb55865861..20399f549fad 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -586,7 +586,8 @@ struct iommu_ops { struct iommu_page_response *msg); int (*def_domain_type)(struct device *dev); - void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid); + void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid, + struct iommu_domain *domain); const struct iommu_domain_ops *default_domain_ops; unsigned long pgsize_bitmap; -- Gitee From a1221fb7eef70553a4efde38dde20d0c4765f714 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 4 Apr 2024 21:05:14 -0300 Subject: [PATCH 184/219] iommufd: Add missing IOMMUFD_DRIVER kconfig for the selftest ANBZ: #13617 commit 8541323285994528ad5be2c1bdc759e6c83b936e upstream. Some kconfigs don't automatically include this symbol which results in sub functions for some of the dirty tracking related things that are non-functional. Thus the test suite will fail. select IOMMUFD_DRIVER in the IOMMUFD_TEST kconfig to fix it. Fixes: a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP") Link: https://lore.kernel.org/r/20240327182050.GA1363414@ziepe.ca Tested-by: Muhammad Usama Anjum Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- drivers/iommu/iommufd/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 99d4b075df49..76656fe0470d 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -37,6 +37,7 @@ config IOMMUFD_TEST depends on DEBUG_KERNEL depends on FAULT_INJECTION depends on RUNTIME_TESTING_MENU + select IOMMUFD_DRIVER default n help This is dangerous, do not enable unless running -- Gitee From 6a6046ed3b1b5215b8aaba0a5c37b45220bec860 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 25 Mar 2024 14:00:48 +0500 Subject: [PATCH 185/219] iommufd: Add config needed for iommufd_fail_nth ANBZ: #13617 commit 2760c51b8040d7cffedc337939e7475a17cc4b19 upstream. Add FAULT_INJECTION_DEBUG_FS and FAILSLAB configurations to the kconfig fragment for the iommfd selftests. These kconfigs are needed by the iommufd_fail_nth test. Fixes: a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP") Link: https://lore.kernel.org/r/20240325090048.1423908-1-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Jason Gunthorpe Signed-off-by: Shuai Xue --- tools/testing/selftests/iommu/config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/iommu/config b/tools/testing/selftests/iommu/config index 110d73917615..02a2a1b267c1 100644 --- a/tools/testing/selftests/iommu/config +++ b/tools/testing/selftests/iommu/config @@ -1,3 +1,5 @@ CONFIG_IOMMUFD=y +CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FAULT_INJECTION=y CONFIG_IOMMUFD_TEST=y +CONFIG_FAILSLAB=y -- Gitee From c253fb800a27b48c6c2ab1b788cde6fe63ccd461 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:12 +0000 Subject: [PATCH 186/219] iommu/vt-d: add wrapper functions for page allocations ANBZ: #13617 commit 06c375053cefc3a2f383d200596abe5ab3fb35f9 upstream. In order to improve observability and accountability of IOMMU layer, we must account the number of pages that are allocated by functions that are calling directly into buddy allocator. This is achieved by first wrapping the allocation related functions into a separate inline functions in new file: drivers/iommu/iommu-pages.h Convert all page allocation calls under iommu/intel to use these new functions. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-2-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a7890a92665b..5952f7083f78 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -299,22 +299,6 @@ static int __init intel_iommu_setup(char *str) } __setup("intel_iommu=", intel_iommu_setup); -void *alloc_pgtable_page(int node, gfp_t gfp) -{ - struct page *page; - void *vaddr = NULL; - - page = alloc_pages_node(node, gfp | __GFP_ZERO, 0); - if (page) - vaddr = page_address(page); - return vaddr; -} - -void free_pgtable_page(void *vaddr) -{ - free_page((unsigned long)vaddr); -} - static int domain_type_is_si(struct dmar_domain *domain) { return domain->domain.type == IOMMU_DOMAIN_IDENTITY; -- Gitee From 76dd4434d922c2d9324825ea634f95522529ed40 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:13 +0000 Subject: [PATCH 187/219] iommu/dma: use iommu_put_pages_list() to releae freelist ANBZ: #13617 commit 95b18ef9c69157ded5ece1136377cf8123b597f0 upstream. Free the IOMMU page tables via iommu_put_pages_list(). The page tables were allocated via iommu_alloc_* functions in architecture specific places, but are released in dma-iommu if the freelist is gathered during map/unmap operations into iommu_iotlb_gather data structure. Currently, only iommu/intel that does that. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Link: https://lore.kernel.org/r/20240413002522.1101315-3-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/dma-iommu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 3ea2e569d356..3020b3b295d2 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -32,6 +32,7 @@ #include #include "dma-iommu.h" +#include "iommu-pages.h" struct iommu_dma_msi_page { struct list_head list; @@ -156,7 +157,7 @@ static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq if (fq->entries[idx].counter >= counter) break; - put_pages_list(&fq->entries[idx].freelist); + iommu_put_pages_list(&fq->entries[idx].freelist); free_iova_fast(&cookie->iovad, fq->entries[idx].iova_pfn, fq->entries[idx].pages); @@ -254,7 +255,7 @@ static void iommu_dma_free_fq_single(struct iova_fq *fq) int idx; fq_ring_for_each(idx, fq) - put_pages_list(&fq->entries[idx].freelist); + iommu_put_pages_list(&fq->entries[idx].freelist); vfree(fq); } @@ -267,7 +268,7 @@ static void iommu_dma_free_fq_percpu(struct iova_fq __percpu *percpu_fq) struct iova_fq *fq = per_cpu_ptr(percpu_fq, cpu); fq_ring_for_each(idx, fq) - put_pages_list(&fq->entries[idx].freelist); + iommu_put_pages_list(&fq->entries[idx].freelist); } free_percpu(percpu_fq); -- Gitee From 55c442b2ba1fe6832befe6ff95e50b227e124d2c Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:14 +0000 Subject: [PATCH 188/219] iommu/amd: use page allocation function provided by iommu-pages.h ANBZ: #13617 commit 75114cbaa136fc50de3a339c85124b20466a7c46 upstream. Convert iommu/amd/* files to use the new page allocation functions provided in iommu-pages.h. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-4-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu.h | 8 --- drivers/iommu/amd/init.c | 91 ++++++++++++++----------------- drivers/iommu/amd/io_pgtable.c | 13 +++-- drivers/iommu/amd/io_pgtable_v2.c | 18 +++--- drivers/iommu/amd/iommu.c | 11 ++-- 5 files changed, 62 insertions(+), 79 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index bea910d9ace5..0416f0c186c7 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -134,14 +134,6 @@ static inline int get_pci_sbdf_id(struct pci_dev *pdev) return PCI_SEG_DEVID_TO_SBDF(seg, devid); } -static inline void *alloc_pgtable_page(int nid, gfp_t gfp) -{ - struct page *page; - - page = alloc_pages_node(nid, gfp | __GFP_ZERO, 0); - return page ? page_address(page) : NULL; -} - /* * This must be called after device probe completes. During probe * use rlookup_amd_iommu() get the iommu. diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 2e98a6b0c02e..a78307dc96f3 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -35,6 +35,7 @@ #include "amd_iommu.h" #include "../irq_remapping.h" +#include "../iommu-pages.h" /* * definitions for the ACPI scanning code @@ -648,8 +649,8 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table, u16 pci_ /* Allocate per PCI segment device table */ static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg) { - pci_seg->dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO | GFP_DMA32, - get_order(pci_seg->dev_table_size)); + pci_seg->dev_table = iommu_alloc_pages(GFP_KERNEL | GFP_DMA32, + get_order(pci_seg->dev_table_size)); if (!pci_seg->dev_table) return -ENOMEM; @@ -658,17 +659,16 @@ static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg) static inline void free_dev_table(struct amd_iommu_pci_seg *pci_seg) { - free_pages((unsigned long)pci_seg->dev_table, - get_order(pci_seg->dev_table_size)); + iommu_free_pages(pci_seg->dev_table, + get_order(pci_seg->dev_table_size)); pci_seg->dev_table = NULL; } /* Allocate per PCI segment IOMMU rlookup table. */ static inline int __init alloc_rlookup_table(struct amd_iommu_pci_seg *pci_seg) { - pci_seg->rlookup_table = (void *)__get_free_pages( - GFP_KERNEL | __GFP_ZERO, - get_order(pci_seg->rlookup_table_size)); + pci_seg->rlookup_table = iommu_alloc_pages(GFP_KERNEL, + get_order(pci_seg->rlookup_table_size)); if (pci_seg->rlookup_table == NULL) return -ENOMEM; @@ -677,16 +677,15 @@ static inline int __init alloc_rlookup_table(struct amd_iommu_pci_seg *pci_seg) static inline void free_rlookup_table(struct amd_iommu_pci_seg *pci_seg) { - free_pages((unsigned long)pci_seg->rlookup_table, - get_order(pci_seg->rlookup_table_size)); + iommu_free_pages(pci_seg->rlookup_table, + get_order(pci_seg->rlookup_table_size)); pci_seg->rlookup_table = NULL; } static inline int __init alloc_irq_lookup_table(struct amd_iommu_pci_seg *pci_seg) { - pci_seg->irq_lookup_table = (void *)__get_free_pages( - GFP_KERNEL | __GFP_ZERO, - get_order(pci_seg->rlookup_table_size)); + pci_seg->irq_lookup_table = iommu_alloc_pages(GFP_KERNEL, + get_order(pci_seg->rlookup_table_size)); kmemleak_alloc(pci_seg->irq_lookup_table, pci_seg->rlookup_table_size, 1, GFP_KERNEL); if (pci_seg->irq_lookup_table == NULL) @@ -698,8 +697,8 @@ static inline int __init alloc_irq_lookup_table(struct amd_iommu_pci_seg *pci_se static inline void free_irq_lookup_table(struct amd_iommu_pci_seg *pci_seg) { kmemleak_free(pci_seg->irq_lookup_table); - free_pages((unsigned long)pci_seg->irq_lookup_table, - get_order(pci_seg->rlookup_table_size)); + iommu_free_pages(pci_seg->irq_lookup_table, + get_order(pci_seg->rlookup_table_size)); pci_seg->irq_lookup_table = NULL; } @@ -707,8 +706,8 @@ static int __init alloc_alias_table(struct amd_iommu_pci_seg *pci_seg) { int i; - pci_seg->alias_table = (void *)__get_free_pages(GFP_KERNEL, - get_order(pci_seg->alias_table_size)); + pci_seg->alias_table = iommu_alloc_pages(GFP_KERNEL, + get_order(pci_seg->alias_table_size)); if (!pci_seg->alias_table) return -ENOMEM; @@ -723,8 +722,8 @@ static int __init alloc_alias_table(struct amd_iommu_pci_seg *pci_seg) static void __init free_alias_table(struct amd_iommu_pci_seg *pci_seg) { - free_pages((unsigned long)pci_seg->alias_table, - get_order(pci_seg->alias_table_size)); + iommu_free_pages(pci_seg->alias_table, + get_order(pci_seg->alias_table_size)); pci_seg->alias_table = NULL; } @@ -735,8 +734,8 @@ static void __init free_alias_table(struct amd_iommu_pci_seg *pci_seg) */ static int __init alloc_command_buffer(struct amd_iommu *iommu) { - iommu->cmd_buf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(CMD_BUFFER_SIZE)); + iommu->cmd_buf = iommu_alloc_pages(GFP_KERNEL, + get_order(CMD_BUFFER_SIZE)); return iommu->cmd_buf ? 0 : -ENOMEM; } @@ -844,19 +843,19 @@ static void iommu_disable_command_buffer(struct amd_iommu *iommu) static void __init free_command_buffer(struct amd_iommu *iommu) { - free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); + iommu_free_pages(iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); } static void *__init iommu_alloc_4k_pages(struct amd_iommu *iommu, gfp_t gfp, size_t size) { int order = get_order(size); - void *buf = (void *)__get_free_pages(gfp, order); + void *buf = iommu_alloc_pages(gfp, order); if (buf && check_feature(FEATURE_SNP) && set_memory_4k((unsigned long)buf, (1 << order))) { - free_pages((unsigned long)buf, order); + iommu_free_pages(buf, order); buf = NULL; } @@ -866,7 +865,7 @@ static void *__init iommu_alloc_4k_pages(struct amd_iommu *iommu, /* allocates the memory where the IOMMU will log its events to */ static int __init alloc_event_buffer(struct amd_iommu *iommu) { - iommu->evt_buf = iommu_alloc_4k_pages(iommu, GFP_KERNEL | __GFP_ZERO, + iommu->evt_buf = iommu_alloc_4k_pages(iommu, GFP_KERNEL, EVT_BUFFER_SIZE); return iommu->evt_buf ? 0 : -ENOMEM; @@ -900,14 +899,13 @@ static void iommu_disable_event_buffer(struct amd_iommu *iommu) static void __init free_event_buffer(struct amd_iommu *iommu) { - free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE)); + iommu_free_pages(iommu->evt_buf, get_order(EVT_BUFFER_SIZE)); } /* allocates the memory where the IOMMU will log its events to */ static int __init alloc_ppr_log(struct amd_iommu *iommu) { - iommu->ppr_log = iommu_alloc_4k_pages(iommu, GFP_KERNEL | __GFP_ZERO, - PPR_LOG_SIZE); + iommu->ppr_log = iommu_alloc_4k_pages(iommu, GFP_KERNEL, PPR_LOG_SIZE); return iommu->ppr_log ? 0 : -ENOMEM; } @@ -936,14 +934,14 @@ static void iommu_enable_ppr_log(struct amd_iommu *iommu) static void __init free_ppr_log(struct amd_iommu *iommu) { - free_pages((unsigned long)iommu->ppr_log, get_order(PPR_LOG_SIZE)); + iommu_free_pages(iommu->ppr_log, get_order(PPR_LOG_SIZE)); } static void free_ga_log(struct amd_iommu *iommu) { #ifdef CONFIG_IRQ_REMAP - free_pages((unsigned long)iommu->ga_log, get_order(GA_LOG_SIZE)); - free_pages((unsigned long)iommu->ga_log_tail, get_order(8)); + iommu_free_pages(iommu->ga_log, get_order(GA_LOG_SIZE)); + iommu_free_pages(iommu->ga_log_tail, get_order(8)); #endif } @@ -988,13 +986,11 @@ static int iommu_init_ga_log(struct amd_iommu *iommu) if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) return 0; - iommu->ga_log = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(GA_LOG_SIZE)); + iommu->ga_log = iommu_alloc_pages(GFP_KERNEL, get_order(GA_LOG_SIZE)); if (!iommu->ga_log) goto err_out; - iommu->ga_log_tail = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(8)); + iommu->ga_log_tail = iommu_alloc_pages(GFP_KERNEL, get_order(8)); if (!iommu->ga_log_tail) goto err_out; @@ -1007,7 +1003,7 @@ static int iommu_init_ga_log(struct amd_iommu *iommu) static int __init alloc_cwwb_sem(struct amd_iommu *iommu) { - iommu->cmd_sem = iommu_alloc_4k_pages(iommu, GFP_KERNEL | __GFP_ZERO, 1); + iommu->cmd_sem = iommu_alloc_4k_pages(iommu, GFP_KERNEL, 1); return iommu->cmd_sem ? 0 : -ENOMEM; } @@ -1015,7 +1011,7 @@ static int __init alloc_cwwb_sem(struct amd_iommu *iommu) static void __init free_cwwb_sem(struct amd_iommu *iommu) { if (iommu->cmd_sem) - free_page((unsigned long)iommu->cmd_sem); + iommu_free_page((void *)iommu->cmd_sem); } static void iommu_enable_xt(struct amd_iommu *iommu) @@ -1080,7 +1076,6 @@ static bool __copy_device_table(struct amd_iommu *iommu) u32 lo, hi, devid, old_devtb_size; phys_addr_t old_devtb_phys; u16 dom_id, dte_v, irq_v; - gfp_t gfp_flag; u64 tmp; /* Each IOMMU use separate device table with the same size */ @@ -1114,9 +1109,8 @@ static bool __copy_device_table(struct amd_iommu *iommu) if (!old_devtb) return false; - gfp_flag = GFP_KERNEL | __GFP_ZERO | GFP_DMA32; - pci_seg->old_dev_tbl_cpy = (void *)__get_free_pages(gfp_flag, - get_order(pci_seg->dev_table_size)); + pci_seg->old_dev_tbl_cpy = iommu_alloc_pages(GFP_KERNEL | GFP_DMA32, + get_order(pci_seg->dev_table_size)); if (pci_seg->old_dev_tbl_cpy == NULL) { pr_err("Failed to allocate memory for copying old device table!\n"); memunmap(old_devtb); @@ -2813,8 +2807,8 @@ static void early_enable_iommus(void) for_each_pci_segment(pci_seg) { if (pci_seg->old_dev_tbl_cpy != NULL) { - free_pages((unsigned long)pci_seg->old_dev_tbl_cpy, - get_order(pci_seg->dev_table_size)); + iommu_free_pages(pci_seg->old_dev_tbl_cpy, + get_order(pci_seg->dev_table_size)); pci_seg->old_dev_tbl_cpy = NULL; } } @@ -2827,8 +2821,8 @@ static void early_enable_iommus(void) pr_info("Copied DEV table from previous kernel.\n"); for_each_pci_segment(pci_seg) { - free_pages((unsigned long)pci_seg->dev_table, - get_order(pci_seg->dev_table_size)); + iommu_free_pages(pci_seg->dev_table, + get_order(pci_seg->dev_table_size)); pci_seg->dev_table = pci_seg->old_dev_tbl_cpy; } @@ -3038,8 +3032,8 @@ static bool __init check_ioapic_information(void) static void __init free_dma_resources(void) { - free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, - get_order(MAX_DOMAIN_ID/8)); + iommu_free_pages(amd_iommu_pd_alloc_bitmap, + get_order(MAX_DOMAIN_ID / 8)); amd_iommu_pd_alloc_bitmap = NULL; free_unity_maps(); @@ -3111,9 +3105,8 @@ static int __init early_amd_iommu_init(void) /* Device table - directly used by all IOMMUs */ ret = -ENOMEM; - amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( - GFP_KERNEL | __GFP_ZERO, - get_order(MAX_DOMAIN_ID/8)); + amd_iommu_pd_alloc_bitmap = iommu_alloc_pages(GFP_KERNEL, + get_order(MAX_DOMAIN_ID / 8)); if (amd_iommu_pd_alloc_bitmap == NULL) goto out; diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index 2a0d1e97e52f..9d9a7fde59e7 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -22,6 +22,7 @@ #include "amd_iommu_types.h" #include "amd_iommu.h" +#include "../iommu-pages.h" static void v1_tlb_flush_all(void *cookie) { @@ -156,7 +157,7 @@ static bool increase_address_space(struct protection_domain *domain, bool ret = true; u64 *pte; - pte = alloc_pgtable_page(domain->nid, gfp); + pte = iommu_alloc_page_node(domain->nid, gfp); if (!pte) return false; @@ -187,7 +188,7 @@ static bool increase_address_space(struct protection_domain *domain, out: spin_unlock_irqrestore(&domain->lock, flags); - free_page((unsigned long)pte); + iommu_free_page(pte); return ret; } @@ -250,7 +251,7 @@ static u64 *alloc_pte(struct protection_domain *domain, if (!IOMMU_PTE_PRESENT(__pte) || pte_level == PAGE_MODE_NONE) { - page = alloc_pgtable_page(domain->nid, gfp); + page = iommu_alloc_page_node(domain->nid, gfp); if (!page) return NULL; @@ -259,7 +260,7 @@ static u64 *alloc_pte(struct protection_domain *domain, /* pte could have been changed somewhere. */ if (!try_cmpxchg64(pte, &__pte, __npte)) - free_page((unsigned long)page); + iommu_free_page(page); else if (IOMMU_PTE_PRESENT(__pte)) *updated = true; @@ -431,7 +432,7 @@ static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, } /* Everything flushed out, free pages now */ - put_pages_list(&freelist); + iommu_put_pages_list(&freelist); return ret; } @@ -580,7 +581,7 @@ static void v1_free_pgtable(struct io_pgtable *iop) /* Make changes visible to IOMMUs */ amd_iommu_domain_update(dom); - put_pages_list(&freelist); + iommu_put_pages_list(&freelist); } static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c index 8929c484b11e..e1d6ad01c641 100644 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ b/drivers/iommu/amd/io_pgtable_v2.c @@ -18,6 +18,7 @@ #include "amd_iommu_types.h" #include "amd_iommu.h" +#include "../iommu-pages.h" #define IOMMU_PAGE_PRESENT BIT_ULL(0) /* Is present */ #define IOMMU_PAGE_RW BIT_ULL(1) /* Writeable */ @@ -99,11 +100,6 @@ static inline int page_size_to_level(u64 pg_size) return PAGE_MODE_1_LEVEL; } -static inline void free_pgtable_page(u64 *pt) -{ - free_page((unsigned long)pt); -} - static void free_pgtable(u64 *pt, int level) { u64 *p; @@ -125,10 +121,10 @@ static void free_pgtable(u64 *pt, int level) if (level > 2) free_pgtable(p, level - 1); else - free_pgtable_page(p); + iommu_free_page(p); } - free_pgtable_page(pt); + iommu_free_page(pt); } /* Allocate page table */ @@ -156,14 +152,14 @@ static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova, } if (!IOMMU_PTE_PRESENT(__pte)) { - page = alloc_pgtable_page(nid, gfp); + page = iommu_alloc_page_node(nid, gfp); if (!page) return NULL; __npte = set_pgtable_attr(page); /* pte could have been changed somewhere. */ if (cmpxchg64(pte, __pte, __npte) != __pte) - free_pgtable_page(page); + iommu_free_page(page); else if (IOMMU_PTE_PRESENT(__pte)) *updated = true; @@ -185,7 +181,7 @@ static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova, if (pg_size == IOMMU_PAGE_SIZE_1G) free_pgtable(__pte, end_level - 1); else if (pg_size == IOMMU_PAGE_SIZE_2M) - free_pgtable_page(__pte); + iommu_free_page(__pte); } return pte; @@ -366,7 +362,7 @@ static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo struct protection_domain *pdom = (struct protection_domain *)cookie; int ias = IOMMU_IN_ADDR_BIT_SIZE; - pgtable->pgd = alloc_pgtable_page(pdom->nid, GFP_ATOMIC); + pgtable->pgd = iommu_alloc_page_node(pdom->nid, GFP_ATOMIC); if (!pgtable->pgd) return NULL; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 55f004a21795..654a75041ea6 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -42,6 +42,7 @@ #include "amd_iommu.h" #include "../dma-iommu.h" #include "../irq_remapping.h" +#include "../iommu-pages.h" #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) @@ -1737,7 +1738,7 @@ static void free_gcr3_tbl_level1(u64 *tbl) ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); - free_page((unsigned long)ptr); + iommu_free_page(ptr); } } @@ -1770,7 +1771,7 @@ static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info) /* Free per device domain ID */ domain_id_free(gcr3_info->domid); - free_page((unsigned long)gcr3_info->gcr3_tbl); + iommu_free_page(gcr3_info->gcr3_tbl); gcr3_info->gcr3_tbl = NULL; } @@ -1805,7 +1806,7 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, /* Allocate per device domain ID */ gcr3_info->domid = domain_id_alloc(); - gcr3_info->gcr3_tbl = alloc_pgtable_page(nid, GFP_ATOMIC); + gcr3_info->gcr3_tbl = iommu_alloc_page_node(nid, GFP_ATOMIC); if (gcr3_info->gcr3_tbl == NULL) { domain_id_free(gcr3_info->domid); return -ENOMEM; @@ -2254,7 +2255,7 @@ static void protection_domain_free(struct protection_domain *domain) free_io_pgtable_ops(&domain->iop.iop.ops); if (domain->iop.root) - free_page((unsigned long)domain->iop.root); + iommu_free_page(domain->iop.root); if (domain->id) domain_id_free(domain->id); @@ -2269,7 +2270,7 @@ static int protection_domain_init_v1(struct protection_domain *domain, int mode) BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL); if (mode != PAGE_MODE_NONE) { - pt_root = (void *)get_zeroed_page(GFP_KERNEL); + pt_root = iommu_alloc_page(GFP_KERNEL); if (!pt_root) return -ENOMEM; } -- Gitee From 1e7271e8b5c21775b88c9273b62940df5821e031 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:15 +0000 Subject: [PATCH 189/219] iommu/io-pgtable-arm: use page allocation function provided by iommu-pages.h ANBZ: #13617 commit 9a3dd4c1ee7a183229163320eb38f15b17342f78 upstream. Convert iommu/io-pgtable-arm.c to use the new page allocation functions provided in iommu-pages.h. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-5-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/io-pgtable-arm.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index a9dd40bd459d..38ebd192e5db 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -21,6 +21,7 @@ #include #include "io-pgtable-arm.h" +#include "iommu-pages.h" #define ARM_LPAE_MAX_ADDR_BITS 52 #define ARM_LPAE_S2_MAX_CONCAT_PAGES 16 @@ -210,14 +211,10 @@ static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp, VM_BUG_ON((gfp & __GFP_HIGHMEM)); - if (cfg->alloc) { + if (cfg->alloc) pages = cfg->alloc(cookie, size, gfp); - } else { - struct page *p; - - p = alloc_pages_node(dev_to_node(dev), gfp | __GFP_ZERO, order); - pages = p ? page_address(p) : NULL; - } + else + pages = iommu_alloc_pages_node(dev_to_node(dev), gfp, order); if (!pages) return NULL; @@ -245,7 +242,7 @@ static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp, if (cfg->free) cfg->free(cookie, pages, size); else - free_pages((unsigned long)pages, order); + iommu_free_pages(pages, order); return NULL; } @@ -261,7 +258,7 @@ static void __arm_lpae_free_pages(void *pages, size_t size, if (cfg->free) cfg->free(cookie, pages, size); else - free_pages((unsigned long)pages, get_order(size)); + iommu_free_pages(pages, get_order(size)); } static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries, -- Gitee From d27f5ff10f9ab4afd291f75b2cab4e0e3d428f17 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:16 +0000 Subject: [PATCH 190/219] iommu/io-pgtable-dart: use page allocation function provided by iommu-pages.h ANBZ: #13617 commit 4a0b77e7c899d9a64b597ae8c9624a066e95f555 upstream. Convert iommu/io-pgtable-dart.c to use the new page allocation functions provided in iommu-pages.h., and remove unnecessary struct io_pgtable_cfg argument from __dart_alloc_pages(). Signed-off-by: Pasha Tatashin Reviewed-by: Janne Grunau Acked-by: David Rientjes Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-6-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/io-pgtable-dart.c | 37 +++++++++++++-------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/io-pgtable-dart.c b/drivers/iommu/io-pgtable-dart.c index 10811e0b773d..c004640640ee 100644 --- a/drivers/iommu/io-pgtable-dart.c +++ b/drivers/iommu/io-pgtable-dart.c @@ -23,6 +23,7 @@ #include #include +#include "iommu-pages.h" #define DART1_MAX_ADDR_BITS 36 @@ -106,18 +107,12 @@ static phys_addr_t iopte_to_paddr(dart_iopte pte, return paddr; } -static void *__dart_alloc_pages(size_t size, gfp_t gfp, - struct io_pgtable_cfg *cfg) +static void *__dart_alloc_pages(size_t size, gfp_t gfp) { int order = get_order(size); - struct page *p; VM_BUG_ON((gfp & __GFP_HIGHMEM)); - p = alloc_pages(gfp | __GFP_ZERO, order); - if (!p) - return NULL; - - return page_address(p); + return iommu_alloc_pages(gfp, order); } static int dart_init_pte(struct dart_io_pgtable *data, @@ -261,13 +256,13 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova, /* no L2 table present */ if (!pte) { - cptep = __dart_alloc_pages(tblsz, gfp, cfg); + cptep = __dart_alloc_pages(tblsz, gfp); if (!cptep) return -ENOMEM; pte = dart_install_table(cptep, ptep, 0, data); if (pte) - free_pages((unsigned long)cptep, get_order(tblsz)); + iommu_free_pages(cptep, get_order(tblsz)); /* L2 table is present (now) */ pte = READ_ONCE(*ptep); @@ -418,8 +413,7 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) cfg->apple_dart_cfg.n_ttbrs = 1 << data->tbl_bits; for (i = 0; i < cfg->apple_dart_cfg.n_ttbrs; ++i) { - data->pgd[i] = __dart_alloc_pages(DART_GRANULE(data), GFP_KERNEL, - cfg); + data->pgd[i] = __dart_alloc_pages(DART_GRANULE(data), GFP_KERNEL); if (!data->pgd[i]) goto out_free_data; cfg->apple_dart_cfg.ttbr[i] = virt_to_phys(data->pgd[i]); @@ -428,9 +422,10 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) return &data->iop; out_free_data: - while (--i >= 0) - free_pages((unsigned long)data->pgd[i], - get_order(DART_GRANULE(data))); + while (--i >= 0) { + iommu_free_pages(data->pgd[i], + get_order(DART_GRANULE(data))); + } kfree(data); return NULL; } @@ -438,6 +433,7 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) static void apple_dart_free_pgtable(struct io_pgtable *iop) { struct dart_io_pgtable *data = io_pgtable_to_data(iop); + int order = get_order(DART_GRANULE(data)); dart_iopte *ptep, *end; int i; @@ -448,15 +444,10 @@ static void apple_dart_free_pgtable(struct io_pgtable *iop) while (ptep != end) { dart_iopte pte = *ptep++; - if (pte) { - unsigned long page = - (unsigned long)iopte_deref(pte, data); - - free_pages(page, get_order(DART_GRANULE(data))); - } + if (pte) + iommu_free_pages(iopte_deref(pte, data), order); } - free_pages((unsigned long)data->pgd[i], - get_order(DART_GRANULE(data))); + iommu_free_pages(data->pgd[i], order); } kfree(data); -- Gitee From 740aa04bcf3cfc5e05fa83c3e79eebf9b4039177 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:17 +0000 Subject: [PATCH 191/219] iommu/exynos: use page allocation function provided by iommu-pages.h ANBZ: #13617 commit fe046f1bf820ab721fbd49aba6e5db39329c9eaa upstream. Convert iommu/exynos-iommu.c to use the new page allocation functions provided in iommu-pages.h. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Acked-by: Marek Szyprowski Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-7-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/exynos-iommu.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index d98c9161948a..c666ecab955d 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -22,6 +22,8 @@ #include #include +#include "iommu-pages.h" + typedef u32 sysmmu_iova_t; typedef u32 sysmmu_pte_t; static struct iommu_domain exynos_identity_domain; @@ -900,11 +902,11 @@ static struct iommu_domain *exynos_iommu_domain_alloc_paging(struct device *dev) if (!domain) return NULL; - domain->pgtable = (sysmmu_pte_t *)__get_free_pages(GFP_KERNEL, 2); + domain->pgtable = iommu_alloc_pages(GFP_KERNEL, 2); if (!domain->pgtable) goto err_pgtable; - domain->lv2entcnt = (short *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + domain->lv2entcnt = iommu_alloc_pages(GFP_KERNEL, 1); if (!domain->lv2entcnt) goto err_counter; @@ -930,9 +932,9 @@ static struct iommu_domain *exynos_iommu_domain_alloc_paging(struct device *dev) return &domain->domain; err_lv2ent: - free_pages((unsigned long)domain->lv2entcnt, 1); + iommu_free_pages(domain->lv2entcnt, 1); err_counter: - free_pages((unsigned long)domain->pgtable, 2); + iommu_free_pages(domain->pgtable, 2); err_pgtable: kfree(domain); return NULL; @@ -973,8 +975,8 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain) phys_to_virt(base)); } - free_pages((unsigned long)domain->pgtable, 2); - free_pages((unsigned long)domain->lv2entcnt, 1); + iommu_free_pages(domain->pgtable, 2); + iommu_free_pages(domain->lv2entcnt, 1); kfree(domain); } -- Gitee From 13977e05e0ad3692b5cb5a3c9b5913c89844f0f1 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:18 +0000 Subject: [PATCH 192/219] iommu/rockchip: use page allocation function provided by iommu-pages.h ANBZ: #13617 commit 5404ccaaff357d2d8d40e1a879446da9c9da5879 upstream. Convert iommu/rockchip-iommu.c to use the new page allocation functions provided in iommu-pages.h. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-8-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/rockchip-iommu.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index e77a20f1dff8..4c7f470a4752 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -26,6 +26,8 @@ #include #include +#include "iommu-pages.h" + /** MMU register offsets */ #define RK_MMU_DTE_ADDR 0x00 /* Directory table address */ #define RK_MMU_STATUS 0x04 @@ -727,14 +729,14 @@ static u32 *rk_dte_get_page_table(struct rk_iommu_domain *rk_domain, if (rk_dte_is_pt_valid(dte)) goto done; - page_table = (u32 *)get_zeroed_page(GFP_ATOMIC | rk_ops->gfp_flags); + page_table = iommu_alloc_page(GFP_ATOMIC | rk_ops->gfp_flags); if (!page_table) return ERR_PTR(-ENOMEM); pt_dma = dma_map_single(dma_dev, page_table, SPAGE_SIZE, DMA_TO_DEVICE); if (dma_mapping_error(dma_dev, pt_dma)) { dev_err(dma_dev, "DMA mapping error while allocating page table\n"); - free_page((unsigned long)page_table); + iommu_free_page(page_table); return ERR_PTR(-ENOMEM); } @@ -1061,7 +1063,7 @@ static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev) * Each level1 (dt) and level2 (pt) table has 1024 4-byte entries. * Allocate one 4 KiB page for each table. */ - rk_domain->dt = (u32 *)get_zeroed_page(GFP_KERNEL | rk_ops->gfp_flags); + rk_domain->dt = iommu_alloc_page(GFP_KERNEL | rk_ops->gfp_flags); if (!rk_domain->dt) goto err_free_domain; @@ -1083,7 +1085,7 @@ static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev) return &rk_domain->domain; err_free_dt: - free_page((unsigned long)rk_domain->dt); + iommu_free_page(rk_domain->dt); err_free_domain: kfree(rk_domain); @@ -1104,13 +1106,13 @@ static void rk_iommu_domain_free(struct iommu_domain *domain) u32 *page_table = phys_to_virt(pt_phys); dma_unmap_single(dma_dev, pt_phys, SPAGE_SIZE, DMA_TO_DEVICE); - free_page((unsigned long)page_table); + iommu_free_page(page_table); } } dma_unmap_single(dma_dev, rk_domain->dt_dma, SPAGE_SIZE, DMA_TO_DEVICE); - free_page((unsigned long)rk_domain->dt); + iommu_free_page(rk_domain->dt); kfree(rk_domain); } -- Gitee From b5ddbd254d99c5dedf790fc0b8c81785b24b61d2 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:19 +0000 Subject: [PATCH 193/219] iommu/sun50i: use page allocation function provided by iommu-pages.h ANBZ: #13617 commit cb06b259e166e69eaa07b348202a6205346e2791 upstream. Convert iommu/sun50i-iommu.c to use the new page allocation functions provided in iommu-pages.h. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Acked-by: Jernej Skrabec Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-9-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/sun50i-iommu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index 9896b940df66..dd3f07384624 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -26,6 +26,8 @@ #include #include +#include "iommu-pages.h" + #define IOMMU_RESET_REG 0x010 #define IOMMU_RESET_RELEASE_ALL 0xffffffff #define IOMMU_ENABLE_REG 0x020 @@ -680,8 +682,7 @@ sun50i_iommu_domain_alloc_paging(struct device *dev) if (!sun50i_domain) return NULL; - sun50i_domain->dt = (u32 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(DT_SIZE)); + sun50i_domain->dt = iommu_alloc_pages(GFP_KERNEL, get_order(DT_SIZE)); if (!sun50i_domain->dt) goto err_free_domain; @@ -703,7 +704,7 @@ static void sun50i_iommu_domain_free(struct iommu_domain *domain) { struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); - free_pages((unsigned long)sun50i_domain->dt, get_order(DT_SIZE)); + iommu_free_pages(sun50i_domain->dt, get_order(DT_SIZE)); sun50i_domain->dt = NULL; kfree(sun50i_domain); -- Gitee From 2ffe67bc4c80f2100ac06cca609c5ac7b059b244 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:20 +0000 Subject: [PATCH 194/219] iommu/tegra-smmu: use page allocation function provided by iommu-pages.h ANBZ: #13617 commit 8e8b4ac5b0ab759fffcd4d802ab3e084e3609191 upstream. Convert iommu/tegra-smmu.c to use the new page allocation functions provided in iommu-pages.h. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Acked-by: Thierry Reding Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-10-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/tegra-smmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 14e525bd0d9b..f86c7ae91814 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -19,6 +19,8 @@ #include #include +#include "iommu-pages.h" + struct tegra_smmu_group { struct list_head list; struct tegra_smmu *smmu; @@ -282,7 +284,7 @@ static struct iommu_domain *tegra_smmu_domain_alloc_paging(struct device *dev) as->attr = SMMU_PD_READABLE | SMMU_PD_WRITABLE | SMMU_PD_NONSECURE; - as->pd = alloc_page(GFP_KERNEL | __GFP_DMA | __GFP_ZERO); + as->pd = __iommu_alloc_pages(GFP_KERNEL | __GFP_DMA, 0); if (!as->pd) { kfree(as); return NULL; @@ -290,7 +292,7 @@ static struct iommu_domain *tegra_smmu_domain_alloc_paging(struct device *dev) as->count = kcalloc(SMMU_NUM_PDE, sizeof(u32), GFP_KERNEL); if (!as->count) { - __free_page(as->pd); + __iommu_free_pages(as->pd, 0); kfree(as); return NULL; } @@ -298,7 +300,7 @@ static struct iommu_domain *tegra_smmu_domain_alloc_paging(struct device *dev) as->pts = kcalloc(SMMU_NUM_PDE, sizeof(*as->pts), GFP_KERNEL); if (!as->pts) { kfree(as->count); - __free_page(as->pd); + __iommu_free_pages(as->pd, 0); kfree(as); return NULL; } @@ -599,14 +601,14 @@ static u32 *as_get_pte(struct tegra_smmu_as *as, dma_addr_t iova, dma = dma_map_page(smmu->dev, page, 0, SMMU_SIZE_PT, DMA_TO_DEVICE); if (dma_mapping_error(smmu->dev, dma)) { - __free_page(page); + __iommu_free_pages(page, 0); return NULL; } if (!smmu_dma_addr_valid(smmu, dma)) { dma_unmap_page(smmu->dev, dma, SMMU_SIZE_PT, DMA_TO_DEVICE); - __free_page(page); + __iommu_free_pages(page, 0); return NULL; } @@ -649,7 +651,7 @@ static void tegra_smmu_pte_put_use(struct tegra_smmu_as *as, unsigned long iova) tegra_smmu_set_pde(as, iova, 0); dma_unmap_page(smmu->dev, pte_dma, SMMU_SIZE_PT, DMA_TO_DEVICE); - __free_page(page); + __iommu_free_pages(page, 0); as->pts[pde] = NULL; } } @@ -688,7 +690,7 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as, if (gfpflags_allow_blocking(gfp)) spin_unlock_irqrestore(&as->lock, *flags); - page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO); + page = __iommu_alloc_pages(gfp | __GFP_DMA, 0); if (gfpflags_allow_blocking(gfp)) spin_lock_irqsave(&as->lock, *flags); @@ -700,7 +702,7 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as, */ if (as->pts[pde]) { if (page) - __free_page(page); + __iommu_free_pages(page, 0); page = as->pts[pde]; } -- Gitee From 150b62f154d49a2d595332ff049445c3b19aa703 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:21 +0000 Subject: [PATCH 195/219] iommu: observability of the IOMMU allocations ANBZ: #13617 commit bd3520a93a84cd8c3897283e5891a9106fcf5acc upstream. Add NR_IOMMU_PAGES into node_stat_item that counts number of pages that are allocated by the IOMMU subsystem. The allocations can be view per-node via: /sys/devices/system/node/nodeN/vmstat. For example: $ grep iommu /sys/devices/system/node/node*/vmstat /sys/devices/system/node/node0/vmstat:nr_iommu_pages 106025 /sys/devices/system/node/node1/vmstat:nr_iommu_pages 3464 The value is in page-count, therefore, in the above example the iommu allocations amount to ~428M. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-11-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu-pages.h | 30 ++++++++++++++++++++++++++++++ include/linux/mmzone.h | 3 +++ mm/vmstat.c | 3 +++ 3 files changed, 36 insertions(+) diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h index 5a222d0ad25c..1264b0f6b6c3 100644 --- a/drivers/iommu/iommu-pages.h +++ b/drivers/iommu/iommu-pages.h @@ -20,6 +20,30 @@ * large, i.e. multiple gigabytes in size. */ +/** + * __iommu_alloc_account - account for newly allocated page. + * @page: head struct page of the page. + * @order: order of the page + */ +static inline void __iommu_alloc_account(struct page *page, int order) +{ + const long pgcnt = 1l << order; + + mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, pgcnt); +} + +/** + * __iommu_free_account - account a page that is about to be freed. + * @page: head struct page of the page. + * @order: order of the page + */ +static inline void __iommu_free_account(struct page *page, int order) +{ + const long pgcnt = 1l << order; + + mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, -pgcnt); +} + /** * __iommu_alloc_pages - allocate a zeroed page of a given order. * @gfp: buddy allocator flags @@ -35,6 +59,8 @@ static inline struct page *__iommu_alloc_pages(gfp_t gfp, int order) if (unlikely(!page)) return NULL; + __iommu_alloc_account(page, order); + return page; } @@ -48,6 +74,7 @@ static inline void __iommu_free_pages(struct page *page, int order) if (!page) return; + __iommu_free_account(page, order); __free_pages(page, order); } @@ -67,6 +94,8 @@ static inline void *iommu_alloc_pages_node(int nid, gfp_t gfp, int order) if (unlikely(!page)) return NULL; + __iommu_alloc_account(page, order); + return page_address(page); } @@ -147,6 +176,7 @@ static inline void iommu_put_pages_list(struct list_head *page) struct page *p = list_entry(page->prev, struct page, lru); list_del(&p->lru); + __iommu_free_account(p, 0); put_page(p); } } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 966613cf4575..82825889c33b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -202,6 +202,9 @@ enum node_stat_item { #endif NR_PAGETABLE, /* used for pagetables */ NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. KVM pagetables */ +#ifdef CONFIG_IOMMU_SUPPORT + NR_IOMMU_PAGES, /* # of pages allocated by IOMMU */ +#endif #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 5131433cfcc6..05b871ea2686 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1238,6 +1238,9 @@ const char * const vmstat_text[] = { #endif "nr_page_table_pages", "nr_sec_page_table_pages", +#ifdef CONFIG_IOMMU_SUPPORT + "nr_iommu_pages", +#endif #ifdef CONFIG_SWAP "nr_swapcached", #endif -- Gitee From 4ea3fbae56cda3985deb2b748a97e1eb0072bab6 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:22 +0000 Subject: [PATCH 196/219] iommu: account IOMMU allocated memory ANBZ: #13617 commit 212c5c078d83d780cf2873ca931df135771e8bb7 upstream. In order to be able to limit the amount of memory that is allocated by IOMMU subsystem, the memory must be accounted. Account IOMMU as part of the secondary pagetables as it was discussed at LPC. The value of SecPageTables now contains mmeory allocation by IOMMU and KVM. There is a difference between GFP_ACCOUNT and what NR_IOMMU_PAGES shows. GFP_ACCOUNT is set only where it makes sense to charge to user processes, i.e. IOMMU Page Tables, but there more IOMMU shared data that should not really be charged to a specific process. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-12-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- Documentation/admin-guide/cgroup-v2.rst | 2 +- Documentation/filesystems/proc.rst | 4 ++-- drivers/iommu/iommu-pages.h | 2 ++ include/linux/mmzone.h | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 7d76752ed5ed..ec0d91c9f36a 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1424,7 +1424,7 @@ PAGE_SIZE multiple when read back. sec_pagetables Amount of memory allocated for secondary page tables, this currently includes KVM mmu allocations on x86 - and arm64. + and arm64 and IOMMU page tables. percpu (npn) Amount of memory used for storing per-cpu kernel diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 6652b658ee77..9573dd155d09 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1104,8 +1104,8 @@ KernelStack PageTables Memory consumed by userspace page tables SecPageTables - Memory consumed by secondary page tables, this currently - currently includes KVM mmu allocations on x86 and arm64. + Memory consumed by secondary page tables, this currently includes + KVM mmu and IOMMU allocations on x86 and arm64. NFS_Unstable Always zero. Previous counted pages which had been written to the server, but has not been committed to stable storage. diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h index 1264b0f6b6c3..82ebf0033081 100644 --- a/drivers/iommu/iommu-pages.h +++ b/drivers/iommu/iommu-pages.h @@ -30,6 +30,7 @@ static inline void __iommu_alloc_account(struct page *page, int order) const long pgcnt = 1l << order; mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, pgcnt); + mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, pgcnt); } /** @@ -42,6 +43,7 @@ static inline void __iommu_free_account(struct page *page, int order) const long pgcnt = 1l << order; mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, -pgcnt); + mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, -pgcnt); } /** diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 82825889c33b..9d6ba75aabcf 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -201,7 +201,7 @@ enum node_stat_item { NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ - NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. KVM pagetables */ + NR_SECONDARY_PAGETABLE, /* secondary pagetables, KVM & IOMMU */ #ifdef CONFIG_IOMMU_SUPPORT NR_IOMMU_PAGES, /* # of pages allocated by IOMMU */ #endif -- Gitee From 50b69814f86d5fe1c2bafa748e5d900a6b4a6b54 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Apr 2024 14:12:11 -0300 Subject: [PATCH 197/219] iommu/arm-smmu: Convert to domain_alloc_paging() ANBZ: #13617 commit d75d7dc26f29141fe31167c5414605d4087f0abb upstream. Now that the BLOCKED and IDENTITY behaviors are managed with their own domains change to the domain_alloc_paging() op. The check for using_legacy_binding is now redundant, arm_smmu_def_domain_type() always returns IOMMU_DOMAIN_IDENTITY for this mode, so the core code will never attempt to create a DMA domain in the first place. Since commit a4fdd9762272 ("iommu: Use flush queue capability") the core code only passes in IDENTITY/BLOCKED/UNMANAGED/DMA domain types. It will not pass in IDENTITY or BLOCKED if the global statics exist, so the test for DMA is also redundant now too. Cc: Dmitry Baryshkov Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-3632c65678e0+2f1-smmu_alloc_paging_jgg@nvidia.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 8385121db418..929e360a3e71 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -864,14 +864,10 @@ static void arm_smmu_destroy_domain_context(struct arm_smmu_domain *smmu_domain) arm_smmu_rpm_put(smmu); } -static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) +static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) { struct arm_smmu_domain *smmu_domain; - if (type != IOMMU_DOMAIN_UNMANAGED) { - if (using_legacy_binding || type != IOMMU_DOMAIN_DMA) - return NULL; - } /* * Allocate the domain and initialise some of its data structures. * We can't really do anything meaningful until we've added a @@ -1630,7 +1626,7 @@ static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, - .domain_alloc = arm_smmu_domain_alloc, + .domain_alloc_paging = arm_smmu_domain_alloc_paging, .probe_device = arm_smmu_probe_device, .release_device = arm_smmu_release_device, .probe_finalize = arm_smmu_probe_finalize, -- Gitee From d96e97717f35284b58b4aca0d2ded2c536182ab0 Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Wed, 17 Apr 2024 06:37:26 -0700 Subject: [PATCH 198/219] iommu/arm-smmu-qcom-debug: Add support for TBUs ANBZ: #13617 commit 414ecb030870a31262e5fd29fd372ee73b32a6be upstream. Operating the TBUs (Translation Buffer Units) from Linux on Qualcomm platforms can help with debugging context faults. To help with that, the TBUs can run ATOS (Address Translation Operations) to manually trigger address translation of IOVA to physical address in hardware and provide more details when a context fault happens. The driver will control the resources needed by the TBU to allow running the debug operations such as ATOS, check for outstanding transactions, do snapshot capture etc. Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/20240417133731.2055383-3-quic_c_gdjako@quicinc.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/Kconfig | 12 +- .../iommu/arm/arm-smmu/arm-smmu-qcom-debug.c | 353 ++++++++++++++++++ drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 2 + 4 files changed, 365 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 6a05de6125ff..9cb6193b3aaf 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -380,10 +380,14 @@ config ARM_SMMU_QCOM_DEBUG depends on ARM_SMMU_QCOM help Support for implementation specific debug features in ARM SMMU - hardware found in QTI platforms. - - Say Y here to enable debug for issues such as TLB sync timeouts - which requires implementation defined register dumps. + hardware found in QTI platforms. This include support for + the Translation Buffer Units (TBU) that can be used to obtain + additional information when debugging memory management issues + like context faults. + + Say Y here to enable debug for issues such as context faults + or TLB sync timeouts which requires implementation defined + register dumps. config ARM_SMMU_V3 tristate "ARM Ltd. System MMU Version 3 (SMMUv3) Support" diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c index bb89d49adf8d..eff7ca94ec8d 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c @@ -1,15 +1,66 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. */ +#include #include +#include #include +#include +#include +#include +#include +#include #include +#include #include "arm-smmu.h" #include "arm-smmu-qcom.h" +#define TBU_DBG_TIMEOUT_US 100 +#define DEBUG_AXUSER_REG 0x30 +#define DEBUG_AXUSER_CDMID GENMASK_ULL(43, 36) +#define DEBUG_AXUSER_CDMID_VAL 0xff +#define DEBUG_PAR_REG 0x28 +#define DEBUG_PAR_FAULT_VAL BIT(0) +#define DEBUG_PAR_PA GENMASK_ULL(47, 12) +#define DEBUG_SID_HALT_REG 0x0 +#define DEBUG_SID_HALT_VAL BIT(16) +#define DEBUG_SID_HALT_SID GENMASK(9, 0) +#define DEBUG_SR_HALT_ACK_REG 0x20 +#define DEBUG_SR_HALT_ACK_VAL BIT(1) +#define DEBUG_SR_ECATS_RUNNING_VAL BIT(0) +#define DEBUG_TXN_AXCACHE GENMASK(5, 2) +#define DEBUG_TXN_AXPROT GENMASK(8, 6) +#define DEBUG_TXN_AXPROT_PRIV 0x1 +#define DEBUG_TXN_AXPROT_NSEC 0x2 +#define DEBUG_TXN_TRIGG_REG 0x18 +#define DEBUG_TXN_TRIGGER BIT(0) +#define DEBUG_VA_ADDR_REG 0x8 + +static LIST_HEAD(tbu_list); +static DEFINE_MUTEX(tbu_list_lock); +static DEFINE_SPINLOCK(atos_lock); + +struct qcom_tbu { + struct device *dev; + struct device_node *smmu_np; + u32 sid_range[2]; + struct list_head list; + struct clk *clk; + struct icc_path *path; + void __iomem *base; + spinlock_t halt_lock; /* multiple halt or resume can't execute concurrently */ + int halt_count; +}; + +static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu) +{ + return container_of(smmu, struct qcom_smmu, smmu); +} + void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu) { int ret; @@ -49,3 +100,305 @@ void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu) tbu_pwr_status, sync_inv_ack, sync_inv_progress); } } + +static struct qcom_tbu *qcom_find_tbu(struct qcom_smmu *qsmmu, u32 sid) +{ + struct qcom_tbu *tbu; + u32 start, end; + + guard(mutex)(&tbu_list_lock); + + if (list_empty(&tbu_list)) + return NULL; + + list_for_each_entry(tbu, &tbu_list, list) { + start = tbu->sid_range[0]; + end = start + tbu->sid_range[1]; + + if (qsmmu->smmu.dev->of_node == tbu->smmu_np && + start <= sid && sid < end) + return tbu; + } + dev_err(qsmmu->smmu.dev, "Unable to find TBU for sid 0x%x\n", sid); + + return NULL; +} + +static int qcom_tbu_halt(struct qcom_tbu *tbu, struct arm_smmu_domain *smmu_domain) +{ + struct arm_smmu_device *smmu = smmu_domain->smmu; + int ret = 0, idx = smmu_domain->cfg.cbndx; + u32 val, fsr, status; + + guard(spinlock_irqsave)(&tbu->halt_lock); + if (tbu->halt_count) { + tbu->halt_count++; + return ret; + } + + val = readl_relaxed(tbu->base + DEBUG_SID_HALT_REG); + val |= DEBUG_SID_HALT_VAL; + writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG); + + fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); + if ((fsr & ARM_SMMU_FSR_FAULT) && (fsr & ARM_SMMU_FSR_SS)) { + u32 sctlr_orig, sctlr; + + /* + * We are in a fault. Our request to halt the bus will not + * complete until transactions in front of us (such as the fault + * itself) have completed. Disable iommu faults and terminate + * any existing transactions. + */ + sctlr_orig = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_SCTLR); + sctlr = sctlr_orig & ~(ARM_SMMU_SCTLR_CFCFG | ARM_SMMU_SCTLR_CFIE); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, ARM_SMMU_RESUME_TERMINATE); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr_orig); + } + + if (readl_poll_timeout_atomic(tbu->base + DEBUG_SR_HALT_ACK_REG, status, + (status & DEBUG_SR_HALT_ACK_VAL), + 0, TBU_DBG_TIMEOUT_US)) { + dev_err(tbu->dev, "Timeout while trying to halt TBU!\n"); + ret = -ETIMEDOUT; + + val = readl_relaxed(tbu->base + DEBUG_SID_HALT_REG); + val &= ~DEBUG_SID_HALT_VAL; + writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG); + + return ret; + } + + tbu->halt_count = 1; + + return ret; +} + +static void qcom_tbu_resume(struct qcom_tbu *tbu) +{ + u32 val; + + guard(spinlock_irqsave)(&tbu->halt_lock); + if (!tbu->halt_count) { + WARN(1, "%s: halt_count is 0", dev_name(tbu->dev)); + return; + } + + if (tbu->halt_count > 1) { + tbu->halt_count--; + return; + } + + val = readl_relaxed(tbu->base + DEBUG_SID_HALT_REG); + val &= ~DEBUG_SID_HALT_VAL; + writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG); + + tbu->halt_count = 0; +} + +static phys_addr_t qcom_tbu_trigger_atos(struct arm_smmu_domain *smmu_domain, + struct qcom_tbu *tbu, dma_addr_t iova, u32 sid) +{ + bool atos_timedout = false; + phys_addr_t phys = 0; + ktime_t timeout; + u64 val; + + /* Set address and stream-id */ + val = readq_relaxed(tbu->base + DEBUG_SID_HALT_REG); + val &= ~DEBUG_SID_HALT_SID; + val |= FIELD_PREP(DEBUG_SID_HALT_SID, sid); + writeq_relaxed(val, tbu->base + DEBUG_SID_HALT_REG); + writeq_relaxed(iova, tbu->base + DEBUG_VA_ADDR_REG); + val = FIELD_PREP(DEBUG_AXUSER_CDMID, DEBUG_AXUSER_CDMID_VAL); + writeq_relaxed(val, tbu->base + DEBUG_AXUSER_REG); + + /* Write-back read and write-allocate */ + val = FIELD_PREP(DEBUG_TXN_AXCACHE, 0xf); + + /* Non-secure access */ + val |= FIELD_PREP(DEBUG_TXN_AXPROT, DEBUG_TXN_AXPROT_NSEC); + + /* Privileged access */ + val |= FIELD_PREP(DEBUG_TXN_AXPROT, DEBUG_TXN_AXPROT_PRIV); + + val |= DEBUG_TXN_TRIGGER; + writeq_relaxed(val, tbu->base + DEBUG_TXN_TRIGG_REG); + + timeout = ktime_add_us(ktime_get(), TBU_DBG_TIMEOUT_US); + for (;;) { + val = readl_relaxed(tbu->base + DEBUG_SR_HALT_ACK_REG); + if (!(val & DEBUG_SR_ECATS_RUNNING_VAL)) + break; + val = readl_relaxed(tbu->base + DEBUG_PAR_REG); + if (val & DEBUG_PAR_FAULT_VAL) + break; + if (ktime_compare(ktime_get(), timeout) > 0) { + atos_timedout = true; + break; + } + } + + val = readq_relaxed(tbu->base + DEBUG_PAR_REG); + if (val & DEBUG_PAR_FAULT_VAL) + dev_err(tbu->dev, "ATOS generated a fault interrupt! PAR = %llx, SID=0x%x\n", + val, sid); + else if (atos_timedout) + dev_err_ratelimited(tbu->dev, "ATOS translation timed out!\n"); + else + phys = FIELD_GET(DEBUG_PAR_PA, val); + + /* Reset hardware */ + writeq_relaxed(0, tbu->base + DEBUG_TXN_TRIGG_REG); + writeq_relaxed(0, tbu->base + DEBUG_VA_ADDR_REG); + val = readl_relaxed(tbu->base + DEBUG_SID_HALT_REG); + val &= ~DEBUG_SID_HALT_SID; + writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG); + + return phys; +} + +static phys_addr_t qcom_iova_to_phys(struct arm_smmu_domain *smmu_domain, + dma_addr_t iova, u32 sid) +{ + struct arm_smmu_device *smmu = smmu_domain->smmu; + struct qcom_smmu *qsmmu = to_qcom_smmu(smmu); + int idx = smmu_domain->cfg.cbndx; + struct qcom_tbu *tbu; + u32 sctlr_orig, sctlr; + phys_addr_t phys = 0; + int attempt = 0; + int ret; + u64 fsr; + + tbu = qcom_find_tbu(qsmmu, sid); + if (!tbu) + return 0; + + ret = icc_set_bw(tbu->path, 0, UINT_MAX); + if (ret) + return ret; + + ret = clk_prepare_enable(tbu->clk); + if (ret) + goto disable_icc; + + ret = qcom_tbu_halt(tbu, smmu_domain); + if (ret) + goto disable_clk; + + /* + * ATOS/ECATS can trigger the fault interrupt, so disable it temporarily + * and check for an interrupt manually. + */ + sctlr_orig = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_SCTLR); + sctlr = sctlr_orig & ~(ARM_SMMU_SCTLR_CFCFG | ARM_SMMU_SCTLR_CFIE); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr); + + fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); + if (fsr & ARM_SMMU_FSR_FAULT) { + /* Clear pending interrupts */ + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + + /* + * TBU halt takes care of resuming any stalled transcation. + * Kept it here for completeness sake. + */ + if (fsr & ARM_SMMU_FSR_SS) + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, + ARM_SMMU_RESUME_TERMINATE); + } + + /* Only one concurrent atos operation */ + scoped_guard(spinlock_irqsave, &atos_lock) { + /* + * If the translation fails, attempt the lookup more time." + */ + do { + phys = qcom_tbu_trigger_atos(smmu_domain, tbu, iova, sid); + + fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); + if (fsr & ARM_SMMU_FSR_FAULT) { + /* Clear pending interrupts */ + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + + if (fsr & ARM_SMMU_FSR_SS) + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, + ARM_SMMU_RESUME_TERMINATE); + } + } while (!phys && attempt++ < 2); + + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr_orig); + } + qcom_tbu_resume(tbu); + + /* Read to complete prior write transcations */ + readl_relaxed(tbu->base + DEBUG_SR_HALT_ACK_REG); + +disable_clk: + clk_disable_unprepare(tbu->clk); +disable_icc: + icc_set_bw(tbu->path, 0, 0); + + return phys; +} + +static int qcom_tbu_probe(struct platform_device *pdev) +{ + struct of_phandle_args args = { .args_count = 2 }; + struct device_node *np = pdev->dev.of_node; + struct device *dev = &pdev->dev; + struct qcom_tbu *tbu; + + tbu = devm_kzalloc(dev, sizeof(*tbu), GFP_KERNEL); + if (!tbu) + return -ENOMEM; + + tbu->dev = dev; + INIT_LIST_HEAD(&tbu->list); + spin_lock_init(&tbu->halt_lock); + + if (of_parse_phandle_with_args(np, "qcom,stream-id-range", "#iommu-cells", 0, &args)) { + dev_err(dev, "Cannot parse the 'qcom,stream-id-range' DT property\n"); + return -EINVAL; + } + + tbu->smmu_np = args.np; + tbu->sid_range[0] = args.args[0]; + tbu->sid_range[1] = args.args[1]; + of_node_put(args.np); + + tbu->base = devm_of_iomap(dev, np, 0, NULL); + if (IS_ERR(tbu->base)) + return PTR_ERR(tbu->base); + + tbu->clk = devm_clk_get_optional(dev, NULL); + if (IS_ERR(tbu->clk)) + return PTR_ERR(tbu->clk); + + tbu->path = devm_of_icc_get(dev, NULL); + if (IS_ERR(tbu->path)) + return PTR_ERR(tbu->path); + + guard(mutex)(&tbu_list_lock); + list_add_tail(&tbu->list, &tbu_list); + + return 0; +} + +static const struct of_device_id qcom_tbu_of_match[] = { + { .compatible = "qcom,sc7280-tbu" }, + { .compatible = "qcom,sdm845-tbu" }, + { } +}; + +static struct platform_driver qcom_tbu_driver = { + .driver = { + .name = "qcom_tbu", + .of_match_table = qcom_tbu_of_match, + }, + .probe = qcom_tbu_probe, +}; +builtin_platform_driver(qcom_tbu_driver); diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h index 593910567b88..9bb3ae7d62da 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h @@ -30,6 +30,8 @@ struct qcom_smmu_match_data { const struct arm_smmu_impl *adreno_impl; }; +irqreturn_t qcom_smmu_context_fault(int irq, void *dev); + #ifdef CONFIG_ARM_SMMU_QCOM_DEBUG void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu); #else diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 836ed6799a80..1670e95c4637 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -136,6 +136,7 @@ enum arm_smmu_cbar_type { #define ARM_SMMU_CBAR_VMID GENMASK(7, 0) #define ARM_SMMU_GR1_CBFRSYNRA(n) (0x400 + ((n) << 2)) +#define ARM_SMMU_CBFRSYNRA_SID GENMASK(15, 0) #define ARM_SMMU_GR1_CBA2R(n) (0x800 + ((n) << 2)) #define ARM_SMMU_CBA2R_VMID16 GENMASK(31, 16) @@ -238,6 +239,7 @@ enum arm_smmu_cbar_type { #define ARM_SMMU_CB_ATSR 0x8f0 #define ARM_SMMU_ATSR_ACTIVE BIT(0) +#define ARM_SMMU_RESUME_TERMINATE BIT(0) /* Maximum number of context banks per SMMU */ #define ARM_SMMU_MAX_CBS 128 -- Gitee From b7e0ff5b5273ff66156898d719c2ec32d1794431 Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Wed, 17 Apr 2024 06:37:27 -0700 Subject: [PATCH 199/219] iommu/arm-smmu: Allow using a threaded handler for context interrupts ANBZ: #13617 commit 960be6e10d4fcb18cfe863e0ceb3213a75eecb81 upstream. Threaded IRQ handlers run in a less critical context compared to normal IRQs, so they can perform more complex and time-consuming operations without causing significant delays in other parts of the kernel. During a context fault, it might be needed to do more processing and gather debug information from TBUs in the handler. These operations may sleep, so add an option to use a threaded IRQ handler in these cases. Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/20240417133731.2055383-4-quic_c_gdjako@quicinc.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 12 ++++++++++-- drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 929e360a3e71..bd09113cdbc6 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -811,8 +811,16 @@ static int arm_smmu_init_domain_context(struct arm_smmu_domain *smmu_domain, else context_fault = arm_smmu_context_fault; - ret = devm_request_irq(smmu->dev, irq, context_fault, IRQF_SHARED, - "arm-smmu-context-fault", smmu_domain); + if (smmu->impl && smmu->impl->context_fault_needs_threaded_irq) + ret = devm_request_threaded_irq(smmu->dev, irq, NULL, + context_fault, + IRQF_ONESHOT | IRQF_SHARED, + "arm-smmu-context-fault", + smmu_domain); + else + ret = devm_request_irq(smmu->dev, irq, context_fault, IRQF_SHARED, + "arm-smmu-context-fault", smmu_domain); + if (ret < 0) { dev_err(smmu->dev, "failed to request context IRQ %d (%u)\n", cfg->irptndx, irq); diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 1670e95c4637..4765c6945c34 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -438,6 +438,7 @@ struct arm_smmu_impl { int (*def_domain_type)(struct device *dev); irqreturn_t (*global_fault)(int irq, void *dev); irqreturn_t (*context_fault)(int irq, void *dev); + bool context_fault_needs_threaded_irq; int (*alloc_context_bank)(struct arm_smmu_domain *smmu_domain, struct arm_smmu_device *smmu, struct device *dev, int start); -- Gitee From 9af35bb8a1225e71a47b6a3f9c8ea84547f6246e Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Wed, 17 Apr 2024 06:37:28 -0700 Subject: [PATCH 200/219] iommu/arm-smmu-qcom: Use a custom context fault handler for sdm845 ANBZ: #13617 commit d374555ef993433f4d2e08b700dbd27788427d61 upstream. The sdm845 platform now supports TBUs, so let's get additional debug info from the TBUs when a context fault occurs. Implement a custom context fault handler that does both software + hardware page table walks and TLB Invalidate All. Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/20240417133731.2055383-5-quic_c_gdjako@quicinc.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- .../iommu/arm/arm-smmu/arm-smmu-qcom-debug.c | 143 ++++++++++++++++++ drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 4 + 2 files changed, 147 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c index eff7ca94ec8d..552199cbd9e2 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c @@ -345,6 +345,149 @@ static phys_addr_t qcom_iova_to_phys(struct arm_smmu_domain *smmu_domain, return phys; } +static phys_addr_t qcom_smmu_iova_to_phys_hard(struct arm_smmu_domain *smmu_domain, dma_addr_t iova) +{ + struct arm_smmu_device *smmu = smmu_domain->smmu; + int idx = smmu_domain->cfg.cbndx; + u32 frsynra; + u16 sid; + + frsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx)); + sid = FIELD_GET(ARM_SMMU_CBFRSYNRA_SID, frsynra); + + return qcom_iova_to_phys(smmu_domain, iova, sid); +} + +static phys_addr_t qcom_smmu_verify_fault(struct arm_smmu_domain *smmu_domain, dma_addr_t iova, u32 fsr) +{ + struct io_pgtable *iop = io_pgtable_ops_to_pgtable(smmu_domain->pgtbl_ops); + struct arm_smmu_device *smmu = smmu_domain->smmu; + phys_addr_t phys_post_tlbiall; + phys_addr_t phys; + + phys = qcom_smmu_iova_to_phys_hard(smmu_domain, iova); + io_pgtable_tlb_flush_all(iop); + phys_post_tlbiall = qcom_smmu_iova_to_phys_hard(smmu_domain, iova); + + if (phys != phys_post_tlbiall) { + dev_err(smmu->dev, + "ATOS results differed across TLBIALL... (before: %pa after: %pa)\n", + &phys, &phys_post_tlbiall); + } + + return (phys == 0 ? phys_post_tlbiall : phys); +} + +irqreturn_t qcom_smmu_context_fault(int irq, void *dev) +{ + struct arm_smmu_domain *smmu_domain = dev; + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; + struct arm_smmu_device *smmu = smmu_domain->smmu; + u32 fsr, fsynr, cbfrsynra, resume = 0; + int idx = smmu_domain->cfg.cbndx; + phys_addr_t phys_soft; + unsigned long iova; + int ret, tmp; + + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); + if (!(fsr & ARM_SMMU_FSR_FAULT)) + return IRQ_NONE; + + fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0); + iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR); + cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx)); + + if (list_empty(&tbu_list)) { + ret = report_iommu_fault(&smmu_domain->domain, NULL, iova, + fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); + + if (ret == -ENOSYS) + dev_err_ratelimited(smmu->dev, + "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", + fsr, iova, fsynr, cbfrsynra, idx); + + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + return IRQ_HANDLED; + } + + phys_soft = ops->iova_to_phys(ops, iova); + + tmp = report_iommu_fault(&smmu_domain->domain, NULL, iova, + fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); + if (!tmp || tmp == -EBUSY) { + dev_dbg(smmu->dev, + "Context fault handled by client: iova=0x%08lx, fsr=0x%x, fsynr=0x%x, cb=%d\n", + iova, fsr, fsynr, idx); + dev_dbg(smmu->dev, "soft iova-to-phys=%pa\n", &phys_soft); + ret = IRQ_HANDLED; + resume = ARM_SMMU_RESUME_TERMINATE; + } else { + phys_addr_t phys_atos = qcom_smmu_verify_fault(smmu_domain, iova, fsr); + + if (__ratelimit(&_rs)) { + dev_err(smmu->dev, + "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", + fsr, iova, fsynr, cbfrsynra, idx); + dev_err(smmu->dev, + "FSR = %08x [%s%s%s%s%s%s%s%s%s], SID=0x%x\n", + fsr, + (fsr & 0x02) ? "TF " : "", + (fsr & 0x04) ? "AFF " : "", + (fsr & 0x08) ? "PF " : "", + (fsr & 0x10) ? "EF " : "", + (fsr & 0x20) ? "TLBMCF " : "", + (fsr & 0x40) ? "TLBLKF " : "", + (fsr & 0x80) ? "MHF " : "", + (fsr & 0x40000000) ? "SS " : "", + (fsr & 0x80000000) ? "MULTI " : "", + cbfrsynra); + + dev_err(smmu->dev, + "soft iova-to-phys=%pa\n", &phys_soft); + if (!phys_soft) + dev_err(smmu->dev, + "SOFTWARE TABLE WALK FAILED! Looks like %s accessed an unmapped address!\n", + dev_name(smmu->dev)); + if (phys_atos) + dev_err(smmu->dev, "hard iova-to-phys (ATOS)=%pa\n", + &phys_atos); + else + dev_err(smmu->dev, "hard iova-to-phys (ATOS) failed\n"); + } + ret = IRQ_NONE; + resume = ARM_SMMU_RESUME_TERMINATE; + } + + /* + * If the client returns -EBUSY, do not clear FSR and do not RESUME + * if stalled. This is required to keep the IOMMU client stalled on + * the outstanding fault. This gives the client a chance to take any + * debug action and then terminate the stalled transaction. + * So, the sequence in case of stall on fault should be: + * 1) Do not clear FSR or write to RESUME here + * 2) Client takes any debug action + * 3) Client terminates the stalled transaction and resumes the IOMMU + * 4) Client clears FSR. The FSR should only be cleared after 3) and + * not before so that the fault remains outstanding. This ensures + * SCTLR.HUPCF has the desired effect if subsequent transactions also + * need to be terminated. + */ + if (tmp != -EBUSY) { + /* Clear the faulting FSR */ + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + + /* Retry or terminate any stalled transactions */ + if (fsr & ARM_SMMU_FSR_SS) + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, resume); + } + + return ret; +} + static int qcom_tbu_probe(struct platform_device *pdev) { struct of_phandle_args args = { .args_count = 2 }; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 416c71b684c4..afe70c5404b9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -449,6 +449,10 @@ static const struct arm_smmu_impl sdm845_smmu_500_impl = { .reset = qcom_sdm845_smmu500_reset, .write_s2cr = qcom_smmu_write_s2cr, .tlb_sync = qcom_smmu_tlb_sync, +#ifdef CONFIG_ARM_SMMU_QCOM_DEBUG + .context_fault = qcom_smmu_context_fault, + .context_fault_needs_threaded_irq = true, +#endif }; static const struct arm_smmu_impl qcom_adreno_smmu_v2_impl = { -- Gitee From ace0b01a4cef0d61b930e94aa2ef687f5c471785 Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Wed, 17 Apr 2024 06:37:30 -0700 Subject: [PATCH 201/219] iommu/arm-smmu-qcom: Use the custom fault handler on more platforms ANBZ: #13617 commit b8ca7ce709f8210c13eec022e87a12111db5d745 upstream. The TBU support is now available, so let's allow it to be used on other platforms that have the Qualcomm SMMU-500 implementation with TBUs. This will allow the context fault handler to query the TBUs when a context fault occurs. Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/20240417133731.2055383-7-quic_c_gdjako@quicinc.com Signed-off-by: Will Deacon Signed-off-by: Shuai Xue --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index afe70c5404b9..ee318cd0e6f1 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -440,6 +440,10 @@ static const struct arm_smmu_impl qcom_smmu_500_impl = { .reset = arm_mmu500_reset, .write_s2cr = qcom_smmu_write_s2cr, .tlb_sync = qcom_smmu_tlb_sync, +#ifdef CONFIG_ARM_SMMU_QCOM_DEBUG + .context_fault = qcom_smmu_context_fault, + .context_fault_needs_threaded_irq = true, +#endif }; static const struct arm_smmu_impl sdm845_smmu_500_impl = { -- Gitee From 84919fa9ca2b886ad88af04e8642cac713d91356 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 24 Apr 2024 15:16:27 +0800 Subject: [PATCH 202/219] iommu/vt-d: Remove redundant assignment to variable err ANBZ: #13617 commit a770ccd91d99f997e023b554d9e2033ce79e019e upstream. Variable err is being assigned a value that is never read. It is either being re-assigned later on error exit paths, or never referenced on the non-error path. Cleans up clang scan build warning: drivers/iommu/intel/dmar.c:1070:2: warning: Value stored to 'err' is never read [deadcode.DeadStores]` Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240411090535.306326-1-colin.i.king@gmail.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/dmar.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index a2960a1c60a6..5232df02f779 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1125,7 +1125,6 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) goto error_free_seq_id; } - err = -EINVAL; if (!cap_sagaw(iommu->cap) && (!ecap_smts(iommu->ecap) || ecap_slts(iommu->ecap))) { pr_info("%s: No supported address widths. Not attempting DMA translation.\n", -- Gitee From faec28445dacccfa28f89117f20e29fa73504268 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 24 Apr 2024 15:16:28 +0800 Subject: [PATCH 203/219] iommu/vt-d: Use try_cmpxchg64{,_local}() in iommu.c ANBZ: #13617 commit 9e7ee0f045395dc8aa55fbdc164c062484f4c88d upstream. Replace this pattern in iommu.c: cmpxchg64{,_local}(*ptr, 0, new) != 0 ... with the simpler and faster: !try_cmpxchg64{,_local}(*ptr, &tmp, new) The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after the CMPXCHG. No functional change intended. Signed-off-by: Uros Bizjak Cc: David Woodhouse Cc: Lu Baolu Cc: Joerg Roedel Cc: Will Deacon Cc: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240414162454.49584-1-ubizjak@gmail.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5952f7083f78..d493c31d71dc 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -862,7 +862,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, break; if (!dma_pte_present(pte)) { - uint64_t pteval; + uint64_t pteval, tmp; tmp_page = iommu_alloc_page_node(domain->nid, gfp); @@ -874,7 +874,8 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, if (domain->use_first_level) pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; - if (cmpxchg64(&pte->val, 0ULL, pteval)) + tmp = 0ULL; + if (!try_cmpxchg64(&pte->val, &tmp, pteval)) /* Someone else set it while we were thinking; use theirs. */ iommu_free_page(tmp_page); else @@ -2125,8 +2126,8 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, /* We don't need lock here, nobody else * touches the iova range */ - tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); - if (tmp) { + tmp = 0ULL; + if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) { static int dumps = 5; pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", iov_pfn, tmp, (unsigned long long)pteval); -- Gitee From 66b4190789e142206ab56994c435f4970fe9e8ef Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 24 Apr 2024 15:16:29 +0800 Subject: [PATCH 204/219] iommu/vt-d: Allocate DMAR fault interrupts locally ANBZ: #13617 commit d74169ceb0d2e32438946a2f1f9fc8c803304bd6 upstream. The Intel IOMMU code currently tries to allocate all DMAR fault interrupt vectors on the boot cpu. On large systems with high DMAR counts this results in vector exhaustion, and most of the vectors are not initially allocated socket local. Instead, have a cpu on each node do the vector allocation for the DMARs on that node. The boot cpu still does the allocation for its node during its boot sequence. Signed-off-by: Dimitri Sivanich Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/Zfydpp2Hm+as16TY@hpe.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/amd/amd_iommu.h | 2 +- drivers/iommu/amd/init.c | 2 +- drivers/iommu/intel/dmar.c | 9 +++++++-- drivers/iommu/irq_remapping.c | 5 ++++- drivers/iommu/irq_remapping.h | 2 +- include/linux/dmar.h | 2 +- 6 files changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 0416f0c186c7..325f63923257 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -33,7 +33,7 @@ int amd_iommu_prepare(void); int amd_iommu_enable(void); void amd_iommu_disable(void); int amd_iommu_reenable(int mode); -int amd_iommu_enable_faulting(void); +int amd_iommu_enable_faulting(unsigned int cpu); extern int amd_iommu_guest_ir; extern enum io_pgtable_fmt amd_iommu_pgtable; extern int amd_iommu_gpt_level; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index a78307dc96f3..62317af56d17 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3367,7 +3367,7 @@ int amd_iommu_reenable(int mode) return 0; } -int __init amd_iommu_enable_faulting(void) +int __init amd_iommu_enable_faulting(unsigned int cpu) { /* We enable MSI later when PCI is initialized */ return 0; diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 5232df02f779..36c98d87677b 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -2186,7 +2186,7 @@ int dmar_set_interrupt(struct intel_iommu *iommu) return ret; } -int __init enable_drhd_fault_handling(void) +int enable_drhd_fault_handling(unsigned int cpu) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; @@ -2196,7 +2196,12 @@ int __init enable_drhd_fault_handling(void) */ for_each_iommu(iommu, drhd) { u32 fault_status; - int ret = dmar_set_interrupt(iommu); + int ret; + + if (iommu->irq || iommu->node != cpu_to_node(cpu)) + continue; + + ret = dmar_set_interrupt(iommu); if (ret) { pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n", diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index ee59647c2050..2f7281ccc05f 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -151,7 +151,10 @@ int __init irq_remap_enable_fault_handling(void) if (!remap_ops->enable_faulting) return -ENODEV; - return remap_ops->enable_faulting(); + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "dmar:enable_fault_handling", + remap_ops->enable_faulting, NULL); + + return remap_ops->enable_faulting(smp_processor_id()); } void panic_if_irq_remap(const char *msg) diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h index 8c89cb947cdb..0d6f140b5e01 100644 --- a/drivers/iommu/irq_remapping.h +++ b/drivers/iommu/irq_remapping.h @@ -41,7 +41,7 @@ struct irq_remap_ops { int (*reenable)(int); /* Enable fault handling */ - int (*enable_faulting)(void); + int (*enable_faulting)(unsigned int); }; extern struct irq_remap_ops intel_irq_remap_ops; diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 543c53e84a70..33c72667ece2 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -120,7 +120,7 @@ extern int dmar_remove_dev_scope(struct dmar_pci_notify_info *info, int count); /* Intel IOMMU detection */ void detect_intel_iommu(void); -extern int enable_drhd_fault_handling(void); +extern int enable_drhd_fault_handling(unsigned int cpu); extern int dmar_device_add(acpi_handle handle); extern int dmar_device_remove(acpi_handle handle); -- Gitee From 23ad901531cc3874c704a3142baa7c6ddde3f674 Mon Sep 17 00:00:00 2001 From: Jingqi Liu Date: Wed, 24 Apr 2024 15:16:30 +0800 Subject: [PATCH 205/219] iommu/vt-d: Remove debugfs use of private data field ANBZ: #13617 commit cc9e49d35b4de47d6b656ac144cb22b11dc65c2e upstream. Since the page fault report and response have been tracked by ftrace, the users can easily calculate the time used for a page fault handling. There's no need to expose the similar functionality in debugfs. Hence, remove the corresponding operations in debugfs. Signed-off-by: Jingqi Liu Link: https://lore.kernel.org/r/20240308103811.76744-2-Jingqi.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/debugfs.c | 7 ------- drivers/iommu/intel/perf.h | 1 - drivers/iommu/intel/svm.c | 9 --------- 3 files changed, 17 deletions(-) diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index 86b506af7daa..affbf4a1558d 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -706,7 +706,6 @@ static ssize_t dmar_perf_latency_write(struct file *filp, dmar_latency_disable(iommu, DMAR_LATENCY_INV_IOTLB); dmar_latency_disable(iommu, DMAR_LATENCY_INV_DEVTLB); dmar_latency_disable(iommu, DMAR_LATENCY_INV_IEC); - dmar_latency_disable(iommu, DMAR_LATENCY_PRQ); } rcu_read_unlock(); break; @@ -728,12 +727,6 @@ static ssize_t dmar_perf_latency_write(struct file *filp, dmar_latency_enable(iommu, DMAR_LATENCY_INV_IEC); rcu_read_unlock(); break; - case 4: - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) - dmar_latency_enable(iommu, DMAR_LATENCY_PRQ); - rcu_read_unlock(); - break; default: return -EINVAL; } diff --git a/drivers/iommu/intel/perf.h b/drivers/iommu/intel/perf.h index fd6db8049d1a..df9a36942d64 100644 --- a/drivers/iommu/intel/perf.h +++ b/drivers/iommu/intel/perf.h @@ -11,7 +11,6 @@ enum latency_type { DMAR_LATENCY_INV_IOTLB = 0, DMAR_LATENCY_INV_DEVTLB, DMAR_LATENCY_INV_IEC, - DMAR_LATENCY_PRQ, DMAR_LATENCY_NUM }; diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 71c1b2a0ca16..ded2100afe51 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -582,12 +582,6 @@ static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; event.fault.prm.private_data[0] = desc->priv_data[0]; event.fault.prm.private_data[1] = desc->priv_data[1]; - } else if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ)) { - /* - * If the private data fields are not used by hardware, use it - * to monitor the prq handle latency. - */ - event.fault.prm.private_data[0] = ktime_to_ns(ktime_get()); } iommu_report_device_fault(dev, &event); @@ -767,9 +761,6 @@ void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, if (private_present) { desc.qw2 = prm->private_data[0]; desc.qw3 = prm->private_data[1]; - } else if (prm->private_data[0]) { - dmar_latency_update(iommu, DMAR_LATENCY_PRQ, - ktime_to_ns(ktime_get()) - prm->private_data[0]); } qi_submit_sync(iommu, &desc, 1, 0); -- Gitee From 949f5f57c15439d079898bc5013707102590104f Mon Sep 17 00:00:00 2001 From: Jingqi Liu Date: Wed, 24 Apr 2024 15:16:31 +0800 Subject: [PATCH 206/219] iommu/vt-d: Remove private data use in fault message ANBZ: #13617 commit 621b7e54f288c5e4e32d1dd81a926b8ecb547c60 upstream. According to Intel VT-d specification revision 4.0, "Private Data" field has been removed from Page Request/Response. Since the private data field is not used in fault message, remove the related definitions in page request descriptor and remove the related code in page request/response handler, as Intel hasn't shipped any products which support private data in the page request message. Signed-off-by: Jingqi Liu Link: https://lore.kernel.org/r/20240308103811.76744-3-Jingqi.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.h | 1 - drivers/iommu/intel/svm.c | 75 ++++++++----------------------------- include/linux/iommu.h | 3 +- 3 files changed, 16 insertions(+), 63 deletions(-) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index c769f57c44ae..ec90d89e55ce 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -455,7 +455,6 @@ enum { /* Page group response descriptor QW0 */ #define QI_PGRP_PASID_P(p) (((u64)(p)) << 4) -#define QI_PGRP_PDP(p) (((u64)(p)) << 5) #define QI_PGRP_RESP_CODE(res) (((u64)(res)) << 12) #define QI_PGRP_DID(rid) (((u64)(rid)) << 16) #define QI_PGRP_PASID(pasid) (((u64)(pasid)) << 32) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index ded2100afe51..4afa3388a42c 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -417,8 +417,7 @@ struct page_req_dsc { struct { u64 type:8; u64 pasid_present:1; - u64 priv_data_present:1; - u64 rsvd:6; + u64 rsvd:7; u64 rid:16; u64 pasid:20; u64 exe_req:1; @@ -437,7 +436,8 @@ struct page_req_dsc { }; u64 qw_1; }; - u64 priv_data[2]; + u64 qw_2; + u64 qw_3; }; static bool is_canonical_address(u64 addr) @@ -571,18 +571,6 @@ static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; } - if (desc->priv_data_present) { - /* - * Set last page in group bit if private data is present, - * page response is required as it does for LPIG. - * iommu_report_device_fault() doesn't understand this vendor - * specific requirement thus we set last_page as a workaround. - */ - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; - event.fault.prm.private_data[0] = desc->priv_data[0]; - event.fault.prm.private_data[1] = desc->priv_data[1]; - } iommu_report_device_fault(dev, &event); } @@ -590,39 +578,23 @@ static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, static void handle_bad_prq_event(struct intel_iommu *iommu, struct page_req_dsc *req, int result) { - struct qi_desc desc; + struct qi_desc desc = { }; pr_err("%s: Invalid page request: %08llx %08llx\n", iommu->name, ((unsigned long long *)req)[0], ((unsigned long long *)req)[1]); - /* - * Per VT-d spec. v3.0 ch7.7, system software must - * respond with page group response if private data - * is present (PDP) or last page in group (LPIG) bit - * is set. This is an additional VT-d feature beyond - * PCI ATS spec. - */ - if (!req->lpig && !req->priv_data_present) + if (!req->lpig) return; desc.qw0 = QI_PGRP_PASID(req->pasid) | QI_PGRP_DID(req->rid) | QI_PGRP_PASID_P(req->pasid_present) | - QI_PGRP_PDP(req->priv_data_present) | QI_PGRP_RESP_CODE(result) | QI_PGRP_RESP_TYPE; desc.qw1 = QI_PGRP_IDX(req->prg_index) | QI_PGRP_LPIG(req->lpig); - if (req->priv_data_present) { - desc.qw2 = req->priv_data[0]; - desc.qw3 = req->priv_data[1]; - } else { - desc.qw2 = 0; - desc.qw3 = 0; - } - qi_submit_sync(iommu, &desc, 1, 0); } @@ -690,7 +662,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) intel_svm_prq_report(iommu, dev, req); trace_prq_report(iommu, dev, req->qw_0, req->qw_1, - req->priv_data[0], req->priv_data[1], + req->qw_2, req->qw_3, iommu->prq_seq_number++); mutex_unlock(&iommu->iopf_lock); prq_advance: @@ -729,7 +701,7 @@ void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, struct intel_iommu *iommu = info->iommu; u8 bus = info->bus, devfn = info->devfn; struct iommu_fault_page_request *prm; - bool private_present; + struct qi_desc desc; bool pasid_present; bool last_page; u16 sid; @@ -737,34 +709,17 @@ void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, prm = &evt->fault.prm; sid = PCI_DEVID(bus, devfn); pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - /* - * Per VT-d spec. v3.0 ch7.7, system software must respond - * with page group response if private data is present (PDP) - * or last page in group (LPIG) bit is set. This is an - * additional VT-d requirement beyond PCI ATS spec. - */ - if (last_page || private_present) { - struct qi_desc desc; - - desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | - QI_PGRP_PASID_P(pasid_present) | - QI_PGRP_PDP(private_present) | - QI_PGRP_RESP_CODE(msg->code) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); - desc.qw2 = 0; - desc.qw3 = 0; - - if (private_present) { - desc.qw2 = prm->private_data[0]; - desc.qw3 = prm->private_data[1]; - } + desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | + QI_PGRP_PASID_P(pasid_present) | + QI_PGRP_RESP_CODE(msg->code) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); + desc.qw2 = 0; + desc.qw3 = 0; - qi_submit_sync(iommu, &desc, 1, 0); - } + qi_submit_sync(iommu, &desc, 1, 0); } static void intel_svm_domain_free(struct iommu_domain *domain) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 20399f549fad..a15e70cea420 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -69,8 +69,7 @@ enum iommu_fault_type { struct iommu_fault_page_request { #define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) #define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) -#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) -#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) +#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 2) u32 flags; u32 pasid; u32 grpid; -- Gitee From cef2085b071b9214b46e6d44590e191ac9d2ab09 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 24 Apr 2024 15:16:32 +0800 Subject: [PATCH 207/219] iommu/vt-d: Remove caching mode check before device TLB flush ANBZ: #13617 commit 304b3bde24b58515a75fd198beb52ca57df6275f upstream. The Caching Mode (CM) of the Intel IOMMU indicates if the hardware implementation caches not-present or erroneous translation-structure entries except for the first-stage translation. The caching mode is irrelevant to the device TLB, therefore there is no need to check it before a device TLB invalidation operation. Remove two caching mode checks before device TLB invalidation in the driver. The removal of these checks doesn't change the driver's behavior in critical map/unmap paths. Hence, there is no functionality or performance impact, especially since commit <29b32839725f> ("iommu/vt-d: Do not use flush-queue when caching-mode is on") has already disabled flush-queue for caching mode. Therefore, caching mode will never call intel_flush_iotlb_all(). Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20240415013835.9527-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index d493c31d71dc..4a76092fe26a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1499,11 +1499,7 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, else __iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih); - /* - * In caching mode, changes of pages from non-present to present require - * flush. However, device IOTLB doesn't need to be flushed in this case. - */ - if (!cap_caching_mode(iommu->cap) || !map) + if (!map) iommu_flush_dev_iotlb(domain, addr, mask); } @@ -1577,8 +1573,7 @@ static void intel_flush_iotlb_all(struct iommu_domain *domain) iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - if (!cap_caching_mode(iommu->cap)) - iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); + iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); } if (dmar_domain->nested_parent) -- Gitee From c34c90b2ca5cf44bf72cb0d0aa5ad7ddebed9c92 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 24 Apr 2024 15:16:33 +0800 Subject: [PATCH 208/219] iommu/vt-d: Add cache tag assignment interface ANBZ: #13617 commit 3b1d9e2b2d6856eabf5faa12d20c97fef657999f upstream. Caching tag is a combination of tags used by the hardware to cache various translations. Whenever a mapping in a domain is changed, the IOMMU driver should invalidate the caches with the caching tags. The VT-d specification describes caching tags in section 6.2.1, Tagging of Cached Translations. Add interface to assign caching tags to an IOMMU domain when attached to a RID or PASID, and unassign caching tags when a domain is detached from a RID or PASID. All caching tags are listed in the per-domain tag list and are protected by a dedicated lock. In addition to the basic IOTLB and devTLB caching tag types, NESTING_IOTLB and NESTING_DEVTLB tag types are also introduced. These tags are used for caches that store translations for DMA accesses through a nested user domain. They are affected by changes to mappings in the parent domain. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240416080656.60968-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel [ Shuai Xue: Do not know why conflict Conflicts: drivers/iommu/intel/iommu.c commit ecec12ee763 fix to access dmar_domain by to_dmar_domain() ] Signed-off-by: Shuai Xue --- drivers/iommu/intel/Makefile | 2 +- drivers/iommu/intel/cache.c | 214 +++++++++++++++++++++++++++++++++++ drivers/iommu/intel/iommu.c | 26 ++++- drivers/iommu/intel/iommu.h | 31 +++++ drivers/iommu/intel/nested.c | 19 +++- drivers/iommu/intel/svm.c | 10 +- 6 files changed, 294 insertions(+), 8 deletions(-) create mode 100644 drivers/iommu/intel/cache.c diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index 5402b699a122..c8beb0281559 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DMAR_TABLE) += dmar.o -obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o +obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o cache.o obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c new file mode 100644 index 000000000000..296f1645a739 --- /dev/null +++ b/drivers/iommu/intel/cache.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * cache.c - Intel VT-d cache invalidation + * + * Copyright (C) 2024 Intel Corporation + * + * Author: Lu Baolu + */ + +#define pr_fmt(fmt) "DMAR: " fmt + +#include +#include +#include +#include + +#include "iommu.h" +#include "pasid.h" + +/* Check if an existing cache tag can be reused for a new association. */ +static bool cache_tage_match(struct cache_tag *tag, u16 domain_id, + struct intel_iommu *iommu, struct device *dev, + ioasid_t pasid, enum cache_tag_type type) +{ + if (tag->type != type) + return false; + + if (tag->domain_id != domain_id || tag->pasid != pasid) + return false; + + if (type == CACHE_TAG_IOTLB || type == CACHE_TAG_NESTING_IOTLB) + return tag->iommu == iommu; + + if (type == CACHE_TAG_DEVTLB || type == CACHE_TAG_NESTING_DEVTLB) + return tag->dev == dev; + + return false; +} + +/* Assign a cache tag with specified type to domain. */ +static int cache_tag_assign(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid, + enum cache_tag_type type) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct cache_tag *tag, *temp; + unsigned long flags; + + tag = kzalloc(sizeof(*tag), GFP_KERNEL); + if (!tag) + return -ENOMEM; + + tag->type = type; + tag->iommu = iommu; + tag->domain_id = did; + tag->pasid = pasid; + tag->users = 1; + + if (type == CACHE_TAG_DEVTLB || type == CACHE_TAG_NESTING_DEVTLB) + tag->dev = dev; + else + tag->dev = iommu->iommu.dev; + + spin_lock_irqsave(&domain->cache_lock, flags); + list_for_each_entry(temp, &domain->cache_tags, node) { + if (cache_tage_match(temp, did, iommu, dev, pasid, type)) { + temp->users++; + spin_unlock_irqrestore(&domain->cache_lock, flags); + kfree(tag); + return 0; + } + } + list_add_tail(&tag->node, &domain->cache_tags); + spin_unlock_irqrestore(&domain->cache_lock, flags); + + return 0; +} + +/* Unassign a cache tag with specified type from domain. */ +static void cache_tag_unassign(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid, + enum cache_tag_type type) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct cache_tag *tag; + unsigned long flags; + + spin_lock_irqsave(&domain->cache_lock, flags); + list_for_each_entry(tag, &domain->cache_tags, node) { + if (cache_tage_match(tag, did, iommu, dev, pasid, type)) { + if (--tag->users == 0) { + list_del(&tag->node); + kfree(tag); + } + break; + } + } + spin_unlock_irqrestore(&domain->cache_lock, flags); +} + +static int __cache_tag_assign_domain(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + int ret; + + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_IOTLB); + if (ret || !info->ats_enabled) + return ret; + + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_DEVTLB); + if (ret) + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_IOTLB); + + return ret; +} + +static void __cache_tag_unassign_domain(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_IOTLB); + + if (info->ats_enabled) + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_DEVTLB); +} + +static int __cache_tag_assign_parent_domain(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + int ret; + + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB); + if (ret || !info->ats_enabled) + return ret; + + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_NESTING_DEVTLB); + if (ret) + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB); + + return ret; +} + +static void __cache_tag_unassign_parent_domain(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB); + + if (info->ats_enabled) + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_NESTING_DEVTLB); +} + +static u16 domain_get_id_for_dev(struct dmar_domain *domain, struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + /* + * The driver assigns different domain IDs for all domains except + * the SVA type. + */ + if (domain->domain.type == IOMMU_DOMAIN_SVA) + return FLPT_DEFAULT_DID; + + return domain_id_iommu(domain, iommu); +} + +/* + * Assign cache tags to a domain when it's associated with a device's + * PASID using a specific domain ID. + * + * On success (return value of 0), cache tags are created and added to the + * domain's cache tag list. On failure (negative return value), an error + * code is returned indicating the reason for the failure. + */ +int cache_tag_assign_domain(struct dmar_domain *domain, + struct device *dev, ioasid_t pasid) +{ + u16 did = domain_get_id_for_dev(domain, dev); + int ret; + + ret = __cache_tag_assign_domain(domain, did, dev, pasid); + if (ret || domain->domain.type != IOMMU_DOMAIN_NESTED) + return ret; + + ret = __cache_tag_assign_parent_domain(domain->s2_domain, did, dev, pasid); + if (ret) + __cache_tag_unassign_domain(domain, did, dev, pasid); + + return ret; +} + +/* + * Remove the cache tags associated with a device's PASID when the domain is + * detached from the device. + * + * The cache tags must be previously assigned to the domain by calling the + * assign interface. + */ +void cache_tag_unassign_domain(struct dmar_domain *domain, + struct device *dev, ioasid_t pasid) +{ + u16 did = domain_get_id_for_dev(domain, dev); + + __cache_tag_unassign_domain(domain, did, dev, pasid); + if (domain->domain.type == IOMMU_DOMAIN_NESTED) + __cache_tag_unassign_parent_domain(domain->s2_domain, did, dev, pasid); +} diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4a76092fe26a..79b722d70ebc 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1743,7 +1743,9 @@ static struct dmar_domain *alloc_domain(unsigned int type) domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); INIT_LIST_HEAD(&domain->dev_pasids); + INIT_LIST_HEAD(&domain->cache_tags); spin_lock_init(&domain->lock); + spin_lock_init(&domain->cache_lock); xa_init(&domain->iommu_array); return domain; @@ -1755,6 +1757,9 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) unsigned long ndomains; int num, ret = -ENOSPC; + if (domain->domain.type == IOMMU_DOMAIN_SVA) + return 0; + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; @@ -1802,6 +1807,9 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info; + if (domain->domain.type == IOMMU_DOMAIN_SVA) + return; + spin_lock(&iommu->lock); info = xa_load(&domain->iommu_array, iommu->seq_id); if (--info->refcnt == 0) { @@ -2320,6 +2328,13 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, ret = domain_attach_iommu(domain, iommu); if (ret) return ret; + + ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); + if (ret) { + domain_detach_iommu(domain, iommu); + return ret; + } + info->domain = domain; spin_lock_irqsave(&domain->lock, flags); list_add(&info->link, &domain->devices); @@ -3869,6 +3884,7 @@ void device_block_translation(struct device *dev) list_del(&info->link); spin_unlock_irqrestore(&info->domain->lock, flags); + cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); domain_detach_iommu(info->domain, iommu); info->domain = NULL; } @@ -4663,6 +4679,7 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, */ if (domain->type == IOMMU_DOMAIN_SVA) { intel_svm_remove_dev_pasid(dev, pasid); + cache_tag_unassign_domain(dmar_domain, dev, pasid); goto out_tear_down; } @@ -4677,6 +4694,7 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, WARN_ON_ONCE(!dev_pasid); spin_unlock_irqrestore(&dmar_domain->lock, flags); + cache_tag_unassign_domain(dmar_domain, dev, pasid); domain_detach_iommu(dmar_domain, iommu); intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); @@ -4716,6 +4734,10 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (ret) goto out_free; + ret = cache_tag_assign_domain(dmar_domain, dev, pasid); + if (ret) + goto out_detach_iommu; + if (domain_type_is_si(dmar_domain)) ret = intel_pasid_setup_pass_through(iommu, dev, pasid); else if (dmar_domain->use_first_level) @@ -4725,7 +4747,7 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, ret = intel_pasid_setup_second_level(iommu, dmar_domain, dev, pasid); if (ret) - goto out_detach_iommu; + goto out_unassign_tag; dev_pasid->dev = dev; dev_pasid->pasid = pasid; @@ -4737,6 +4759,8 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, intel_iommu_debugfs_create_dev_pasid(dev_pasid); return 0; +out_unassign_tag: + cache_tag_unassign_domain(dmar_domain, dev, pasid); out_detach_iommu: domain_detach_iommu(dmar_domain, iommu); out_free: diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index ec90d89e55ce..d0146acce059 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -606,6 +606,9 @@ struct dmar_domain { struct list_head devices; /* all devices' list */ struct list_head dev_pasids; /* all attached pasids */ + spinlock_t cache_lock; /* Protect the cache tag list */ + struct list_head cache_tags; /* Cache tag list */ + int iommu_superpage;/* Level of superpages supported: 0 == 4KiB (no superpages), 1 == 2MiB, 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ @@ -1089,6 +1092,34 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, const struct iommu_user_data *user_data); struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid); +enum cache_tag_type { + CACHE_TAG_IOTLB, + CACHE_TAG_DEVTLB, + CACHE_TAG_NESTING_IOTLB, + CACHE_TAG_NESTING_DEVTLB, +}; + +struct cache_tag { + struct list_head node; + enum cache_tag_type type; + struct intel_iommu *iommu; + /* + * The @dev field represents the location of the cache. For IOTLB, it + * resides on the IOMMU hardware. @dev stores the device pointer to + * the IOMMU hardware. For DevTLB, it locates in the PCIe endpoint. + * @dev stores the device pointer to that endpoint. + */ + struct device *dev; + u16 domain_id; + ioasid_t pasid; + unsigned int users; +}; + +int cache_tag_assign_domain(struct dmar_domain *domain, + struct device *dev, ioasid_t pasid); +void cache_tag_unassign_domain(struct dmar_domain *domain, + struct device *dev, ioasid_t pasid); + #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); int intel_svm_enable_prq(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index a7d68f3d518a..13406ee742bf 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -52,13 +52,14 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, return ret; } + ret = cache_tag_assign_domain(dmar_domain, dev, IOMMU_NO_PASID); + if (ret) + goto detach_iommu; + ret = intel_pasid_setup_nested(iommu, dev, IOMMU_NO_PASID, dmar_domain); - if (ret) { - domain_detach_iommu(dmar_domain, iommu); - dev_err_ratelimited(dev, "Failed to setup pasid entry\n"); - return ret; - } + if (ret) + goto unassign_tag; info->domain = dmar_domain; spin_lock_irqsave(&dmar_domain->lock, flags); @@ -68,6 +69,12 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, domain_update_iotlb(dmar_domain); return 0; +unassign_tag: + cache_tag_unassign_domain(dmar_domain, dev, IOMMU_NO_PASID); +detach_iommu: + domain_detach_iommu(dmar_domain, iommu); + + return ret; } static void intel_nested_domain_free(struct iommu_domain *domain) @@ -206,7 +213,9 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, domain->domain.type = IOMMU_DOMAIN_NESTED; INIT_LIST_HEAD(&domain->devices); INIT_LIST_HEAD(&domain->dev_pasids); + INIT_LIST_HEAD(&domain->cache_tags); spin_lock_init(&domain->lock); + spin_lock_init(&domain->cache_lock); xa_init(&domain->iommu_array); spin_lock(&s2_domain->s1_lock); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 4afa3388a42c..d74699c6035f 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -365,17 +365,23 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, sdev->qdep = 0; } + ret = cache_tag_assign_domain(to_dmar_domain(domain), dev, pasid); + if (ret) + goto free_sdev; + /* Setup the pasid table: */ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, pasid, FLPT_DEFAULT_DID, sflags); if (ret) - goto free_sdev; + goto unassign_tag; list_add_rcu(&sdev->list, &svm->devs); return 0; +unassign_tag: + cache_tag_unassign_domain(to_dmar_domain(domain), dev, pasid); free_sdev: kfree(sdev); free_svm: @@ -740,6 +746,8 @@ struct iommu_domain *intel_svm_domain_alloc(void) if (!domain) return NULL; domain->domain.ops = &intel_svm_domain_ops; + INIT_LIST_HEAD(&domain->cache_tags); + spin_lock_init(&domain->cache_lock); return &domain->domain; } -- Gitee From d73c41ac5d73ff01a2dcadcbbb4730b1bdf7acb1 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 24 Apr 2024 15:16:34 +0800 Subject: [PATCH 209/219] iommu/vt-d: Add cache tag invalidation helpers ANBZ: #13617 commit c4d27ffaa8eb034ec438a9aedfe202ce81e15312 upstream. Add several helpers to invalidate the caches after mappings in the affected domain are changed. - cache_tag_flush_range() invalidates a range of caches after mappings within this range are changed. It uses the page-selective cache invalidation methods. - cache_tag_flush_all() invalidates all caches tagged by a domain ID. It uses the domain-selective cache invalidation methods. - cache_tag_flush_range_np() invalidates a range of caches when new mappings are created in the domain and the corresponding page table entries change from non-present to present. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240416080656.60968-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/cache.c | 195 ++++++++++++++++++++++++++++++++++++ drivers/iommu/intel/iommu.c | 12 --- drivers/iommu/intel/iommu.h | 14 +++ 3 files changed, 209 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c index 296f1645a739..0539275a9d20 100644 --- a/drivers/iommu/intel/cache.c +++ b/drivers/iommu/intel/cache.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "iommu.h" @@ -212,3 +213,197 @@ void cache_tag_unassign_domain(struct dmar_domain *domain, if (domain->domain.type == IOMMU_DOMAIN_NESTED) __cache_tag_unassign_parent_domain(domain->s2_domain, did, dev, pasid); } + +static unsigned long calculate_psi_aligned_address(unsigned long start, + unsigned long end, + unsigned long *_pages, + unsigned long *_mask) +{ + unsigned long pages = aligned_nrpages(start, end - start + 1); + unsigned long aligned_pages = __roundup_pow_of_two(pages); + unsigned long bitmask = aligned_pages - 1; + unsigned long mask = ilog2(aligned_pages); + unsigned long pfn = IOVA_PFN(start); + + /* + * PSI masks the low order bits of the base address. If the + * address isn't aligned to the mask, then compute a mask value + * needed to ensure the target range is flushed. + */ + if (unlikely(bitmask & pfn)) { + unsigned long end_pfn = pfn + pages - 1, shared_bits; + + /* + * Since end_pfn <= pfn + bitmask, the only way bits + * higher than bitmask can differ in pfn and end_pfn is + * by carrying. This means after masking out bitmask, + * high bits starting with the first set bit in + * shared_bits are all equal in both pfn and end_pfn. + */ + shared_bits = ~(pfn ^ end_pfn) & ~bitmask; + mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; + } + + *_pages = aligned_pages; + *_mask = mask; + + return ALIGN_DOWN(start, VTD_PAGE_SIZE << mask); +} + +/* + * Invalidates a range of IOVA from @start (inclusive) to @end (inclusive) + * when the memory mappings in the target domain have been modified. + */ +void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, + unsigned long end, int ih) +{ + unsigned long pages, mask, addr; + struct cache_tag *tag; + unsigned long flags; + + addr = calculate_psi_aligned_address(start, end, &pages, &mask); + + spin_lock_irqsave(&domain->cache_lock, flags); + list_for_each_entry(tag, &domain->cache_tags, node) { + struct intel_iommu *iommu = tag->iommu; + struct device_domain_info *info; + u16 sid; + + switch (tag->type) { + case CACHE_TAG_IOTLB: + case CACHE_TAG_NESTING_IOTLB: + if (domain->use_first_level) { + qi_flush_piotlb(iommu, tag->domain_id, + tag->pasid, addr, pages, ih); + } else { + /* + * Fallback to domain selective flush if no + * PSI support or the size is too big. + */ + if (!cap_pgsel_inv(iommu->cap) || + mask > cap_max_amask_val(iommu->cap)) + iommu->flush.flush_iotlb(iommu, tag->domain_id, + 0, 0, DMA_TLB_DSI_FLUSH); + else + iommu->flush.flush_iotlb(iommu, tag->domain_id, + addr | ih, mask, + DMA_TLB_PSI_FLUSH); + } + break; + case CACHE_TAG_NESTING_DEVTLB: + /* + * Address translation cache in device side caches the + * result of nested translation. There is no easy way + * to identify the exact set of nested translations + * affected by a change in S2. So just flush the entire + * device cache. + */ + addr = 0; + mask = MAX_AGAW_PFN_WIDTH; + fallthrough; + case CACHE_TAG_DEVTLB: + info = dev_iommu_priv_get(tag->dev); + sid = PCI_DEVID(info->bus, info->devfn); + + if (tag->pasid == IOMMU_NO_PASID) + qi_flush_dev_iotlb(iommu, sid, info->pfsid, + info->ats_qdep, addr, mask); + else + qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid, + tag->pasid, info->ats_qdep, + addr, mask); + + quirk_extra_dev_tlb_flush(info, addr, mask, tag->pasid, info->ats_qdep); + break; + } + } + spin_unlock_irqrestore(&domain->cache_lock, flags); +} + +/* + * Invalidates all ranges of IOVA when the memory mappings in the target + * domain have been modified. + */ +void cache_tag_flush_all(struct dmar_domain *domain) +{ + struct cache_tag *tag; + unsigned long flags; + + spin_lock_irqsave(&domain->cache_lock, flags); + list_for_each_entry(tag, &domain->cache_tags, node) { + struct intel_iommu *iommu = tag->iommu; + struct device_domain_info *info; + u16 sid; + + switch (tag->type) { + case CACHE_TAG_IOTLB: + case CACHE_TAG_NESTING_IOTLB: + if (domain->use_first_level) + qi_flush_piotlb(iommu, tag->domain_id, + tag->pasid, 0, -1, 0); + else + iommu->flush.flush_iotlb(iommu, tag->domain_id, + 0, 0, DMA_TLB_DSI_FLUSH); + break; + case CACHE_TAG_DEVTLB: + case CACHE_TAG_NESTING_DEVTLB: + info = dev_iommu_priv_get(tag->dev); + sid = PCI_DEVID(info->bus, info->devfn); + + qi_flush_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, + 0, MAX_AGAW_PFN_WIDTH); + quirk_extra_dev_tlb_flush(info, 0, MAX_AGAW_PFN_WIDTH, + IOMMU_NO_PASID, info->ats_qdep); + break; + } + } + spin_unlock_irqrestore(&domain->cache_lock, flags); +} + +/* + * Invalidate a range of IOVA when new mappings are created in the target + * domain. + * + * - VT-d spec, Section 6.1 Caching Mode: When the CM field is reported as + * Set, any software updates to remapping structures other than first- + * stage mapping requires explicit invalidation of the caches. + * - VT-d spec, Section 6.8 Write Buffer Flushing: For hardware that requires + * write buffer flushing, software must explicitly perform write-buffer + * flushing, if cache invalidation is not required. + */ +void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, + unsigned long end) +{ + unsigned long pages, mask, addr; + struct cache_tag *tag; + unsigned long flags; + + addr = calculate_psi_aligned_address(start, end, &pages, &mask); + + spin_lock_irqsave(&domain->cache_lock, flags); + list_for_each_entry(tag, &domain->cache_tags, node) { + struct intel_iommu *iommu = tag->iommu; + + if (!cap_caching_mode(iommu->cap) || domain->use_first_level) { + iommu_flush_write_buffer(iommu); + continue; + } + + if (tag->type == CACHE_TAG_IOTLB || + tag->type == CACHE_TAG_NESTING_IOTLB) { + /* + * Fallback to domain selective flush if no + * PSI support or the size is too big. + */ + if (!cap_pgsel_inv(iommu->cap) || + mask > cap_max_amask_val(iommu->cap)) + iommu->flush.flush_iotlb(iommu, tag->domain_id, + 0, 0, DMA_TLB_DSI_FLUSH); + else + iommu->flush.flush_iotlb(iommu, tag->domain_id, + addr, mask, + DMA_TLB_PSI_FLUSH); + } + } + spin_unlock_irqrestore(&domain->cache_lock, flags); +} diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 79b722d70ebc..24c5f0d4f66c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -55,11 +55,6 @@ __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) -/* IO virtual address start page frame number */ -#define IOVA_START_PFN (1) - -#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) - static void __init check_tylersburg_isoch(void); static int rwbf_quirk; @@ -1989,13 +1984,6 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev) domain_context_mapping_cb, domain); } -/* Returns a number of VTD pages, but aligned to MM page size */ -static unsigned long aligned_nrpages(unsigned long host_addr, size_t size) -{ - host_addr &= ~PAGE_MASK; - return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; -} - /* Return largest possible superpage level for a given mapping */ static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long phy_pfn, unsigned long pages) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index d0146acce059..95ed406d53a6 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -35,6 +35,8 @@ #define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT) #define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK) +#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) + #define VTD_STRIDE_SHIFT (9) #define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT) @@ -1040,6 +1042,13 @@ static inline void context_set_sm_pre(struct context_entry *context) context->lo |= BIT_ULL(4); } +/* Returns a number of VTD pages, but aligned to MM page size */ +static inline unsigned long aligned_nrpages(unsigned long host_addr, size_t size) +{ + host_addr &= ~PAGE_MASK; + return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; +} + /* Convert value to context PASID directory size field coding. */ #define context_pdts(pds) (((pds) & 0x7) << 9) @@ -1119,6 +1128,11 @@ int cache_tag_assign_domain(struct dmar_domain *domain, struct device *dev, ioasid_t pasid); void cache_tag_unassign_domain(struct dmar_domain *domain, struct device *dev, ioasid_t pasid); +void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, + unsigned long end, int ih); +void cache_tag_flush_all(struct dmar_domain *domain); +void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, + unsigned long end); #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); -- Gitee From f81d401174b32f337c6f23e6d43141236057eb5c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 24 Apr 2024 15:16:35 +0800 Subject: [PATCH 210/219] iommu/vt-d: Add trace events for cache tag interface ANBZ: #13617 commit 446a68c58d2e5b8140d474f1a74082aebeee9bb0 upstream. Add trace events for cache tag assign/unassign/flush operations and trace the events in the interfaces. These trace events will improve debugging capabilities by providing detailed information about cache tag activity. A sample of the traced messages looks like below [messages have been stripped and wrapped to make the line short]. cache_tag_assign: dmar9/0000:00:01.0 type iotlb did 1 pasid 9 ref 1 cache_tag_assign: dmar9/0000:00:01.0 type devtlb did 1 pasid 9 ref 1 cache_tag_flush_all: dmar6/0000:8a:00.0 type iotlb did 7 pasid 0 ref 1 cache_tag_flush_range: dmar1 0000:00:1b.0[0] type iotlb did 9 [0xeab00000-0xeab1afff] addr 0xeab00000 pages 0x20 mask 0x5 cache_tag_flush_range: dmar1 0000:00:1b.0[0] type iotlb did 9 [0xeab20000-0xeab31fff] addr 0xeab20000 pages 0x20 mask 0x5 cache_tag_flush_range: dmar1 0000:00:1b.0[0] type iotlb did 9 [0xeaa40000-0xeaa51fff] addr 0xeaa40000 pages 0x20 mask 0x5 cache_tag_flush_range: dmar1 0000:00:1b.0[0] type iotlb did 9 [0x98de0000-0x98de4fff] addr 0x98de0000 pages 0x8 mask 0x3 cache_tag_flush_range: dmar1 0000:00:1b.0[0] type iotlb did 9 [0xe9828000-0xe9828fff] addr 0xe9828000 pages 0x1 mask 0x0 cache_tag_unassign: dmar9/0000:00:01.0 type iotlb did 1 pasid 9 ref 1 cache_tag_unassign: dmar9/0000:00:01.0 type devtlb did 1 pasid 9 ref 1 Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240416080656.60968-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/cache.c | 10 ++++ drivers/iommu/intel/trace.h | 97 +++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c index 0539275a9d20..e8418cdd8331 100644 --- a/drivers/iommu/intel/cache.c +++ b/drivers/iommu/intel/cache.c @@ -17,6 +17,7 @@ #include "iommu.h" #include "pasid.h" +#include "trace.h" /* Check if an existing cache tag can be reused for a new association. */ static bool cache_tage_match(struct cache_tag *tag, u16 domain_id, @@ -69,11 +70,13 @@ static int cache_tag_assign(struct dmar_domain *domain, u16 did, temp->users++; spin_unlock_irqrestore(&domain->cache_lock, flags); kfree(tag); + trace_cache_tag_assign(temp); return 0; } } list_add_tail(&tag->node, &domain->cache_tags); spin_unlock_irqrestore(&domain->cache_lock, flags); + trace_cache_tag_assign(tag); return 0; } @@ -91,6 +94,7 @@ static void cache_tag_unassign(struct dmar_domain *domain, u16 did, spin_lock_irqsave(&domain->cache_lock, flags); list_for_each_entry(tag, &domain->cache_tags, node) { if (cache_tage_match(tag, did, iommu, dev, pasid, type)) { + trace_cache_tag_unassign(tag); if (--tag->users == 0) { list_del(&tag->node); kfree(tag); @@ -316,6 +320,8 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, quirk_extra_dev_tlb_flush(info, addr, mask, tag->pasid, info->ats_qdep); break; } + + trace_cache_tag_flush_range(tag, start, end, addr, pages, mask); } spin_unlock_irqrestore(&domain->cache_lock, flags); } @@ -356,6 +362,8 @@ void cache_tag_flush_all(struct dmar_domain *domain) IOMMU_NO_PASID, info->ats_qdep); break; } + + trace_cache_tag_flush_all(tag); } spin_unlock_irqrestore(&domain->cache_lock, flags); } @@ -404,6 +412,8 @@ void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, addr, mask, DMA_TLB_PSI_FLUSH); } + + trace_cache_tag_flush_range_np(tag, start, end, addr, pages, mask); } spin_unlock_irqrestore(&domain->cache_lock, flags); } diff --git a/drivers/iommu/intel/trace.h b/drivers/iommu/intel/trace.h index 93d96f93a89b..961ac1c1bc21 100644 --- a/drivers/iommu/intel/trace.h +++ b/drivers/iommu/intel/trace.h @@ -89,6 +89,103 @@ TRACE_EVENT(prq_report, __entry->dw1, __entry->dw2, __entry->dw3) ) ); + +DECLARE_EVENT_CLASS(cache_tag_log, + TP_PROTO(struct cache_tag *tag), + TP_ARGS(tag), + TP_STRUCT__entry( + __string(iommu, tag->iommu->name) + __string(dev, dev_name(tag->dev)) + __field(u16, type) + __field(u16, domain_id) + __field(u32, pasid) + __field(u32, users) + ), + TP_fast_assign( + __assign_str(iommu, tag->iommu->name); + __assign_str(dev, dev_name(tag->dev)); + __entry->type = tag->type; + __entry->domain_id = tag->domain_id; + __entry->pasid = tag->pasid; + __entry->users = tag->users; + ), + TP_printk("%s/%s type %s did %d pasid %d ref %d", + __get_str(iommu), __get_str(dev), + __print_symbolic(__entry->type, + { CACHE_TAG_IOTLB, "iotlb" }, + { CACHE_TAG_DEVTLB, "devtlb" }, + { CACHE_TAG_NESTING_IOTLB, "nesting_iotlb" }, + { CACHE_TAG_NESTING_DEVTLB, "nesting_devtlb" }), + __entry->domain_id, __entry->pasid, __entry->users + ) +); + +DEFINE_EVENT(cache_tag_log, cache_tag_assign, + TP_PROTO(struct cache_tag *tag), + TP_ARGS(tag) +); + +DEFINE_EVENT(cache_tag_log, cache_tag_unassign, + TP_PROTO(struct cache_tag *tag), + TP_ARGS(tag) +); + +DEFINE_EVENT(cache_tag_log, cache_tag_flush_all, + TP_PROTO(struct cache_tag *tag), + TP_ARGS(tag) +); + +DECLARE_EVENT_CLASS(cache_tag_flush, + TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, + unsigned long addr, unsigned long pages, unsigned long mask), + TP_ARGS(tag, start, end, addr, pages, mask), + TP_STRUCT__entry( + __string(iommu, tag->iommu->name) + __string(dev, dev_name(tag->dev)) + __field(u16, type) + __field(u16, domain_id) + __field(u32, pasid) + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned long, addr) + __field(unsigned long, pages) + __field(unsigned long, mask) + ), + TP_fast_assign( + __assign_str(iommu, tag->iommu->name); + __assign_str(dev, dev_name(tag->dev)); + __entry->type = tag->type; + __entry->domain_id = tag->domain_id; + __entry->pasid = tag->pasid; + __entry->start = start; + __entry->end = end; + __entry->addr = addr; + __entry->pages = pages; + __entry->mask = mask; + ), + TP_printk("%s %s[%d] type %s did %d [0x%lx-0x%lx] addr 0x%lx pages 0x%lx mask 0x%lx", + __get_str(iommu), __get_str(dev), __entry->pasid, + __print_symbolic(__entry->type, + { CACHE_TAG_IOTLB, "iotlb" }, + { CACHE_TAG_DEVTLB, "devtlb" }, + { CACHE_TAG_NESTING_IOTLB, "nesting_iotlb" }, + { CACHE_TAG_NESTING_DEVTLB, "nesting_devtlb" }), + __entry->domain_id, __entry->start, __entry->end, + __entry->addr, __entry->pages, __entry->mask + ) +); + +DEFINE_EVENT(cache_tag_flush, cache_tag_flush_range, + TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, + unsigned long addr, unsigned long pages, unsigned long mask), + TP_ARGS(tag, start, end, addr, pages, mask) +); + +DEFINE_EVENT(cache_tag_flush, cache_tag_flush_range_np, + TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, + unsigned long addr, unsigned long pages, unsigned long mask), + TP_ARGS(tag, start, end, addr, pages, mask) +); #endif /* _TRACE_INTEL_IOMMU_H */ /* This part must be outside protection */ -- Gitee From 5b3010fb5880e4bc413aff4c72890f22bd273afd Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 24 Apr 2024 15:16:36 +0800 Subject: [PATCH 211/219] iommu/vt-d: Use cache_tag_flush_all() in flush_iotlb_all ANBZ: #13617 commit 4e589a53685c7658f9055fcedbce15bd4cc41134 upstream. The flush_iotlb_all callback is called by the iommu core to flush all caches for the affected domain. Use cache_tag_flush_all() in this callback. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240416080656.60968-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 24c5f0d4f66c..0b15f81096f3 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1554,25 +1554,7 @@ static void parent_domain_flush(struct dmar_domain *domain, static void intel_flush_iotlb_all(struct iommu_domain *domain) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct iommu_domain_info *info; - unsigned long idx; - - xa_for_each(&dmar_domain->iommu_array, idx, info) { - struct intel_iommu *iommu = info->iommu; - u16 did = domain_id_iommu(dmar_domain, iommu); - - if (dmar_domain->use_first_level) - domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0); - else - iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH); - - iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); - } - - if (dmar_domain->nested_parent) - parent_domain_flush(dmar_domain, 0, -1, 0); + cache_tag_flush_all(to_dmar_domain(domain)); } static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) -- Gitee From 14af92a2d0ad8e81690381668adee7a737d733d3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 24 Apr 2024 15:16:37 +0800 Subject: [PATCH 212/219] iommu/vt-d: Use cache_tag_flush_range() in tlb_sync ANBZ: #13617 commit a600ccd0a347e8f9c9f35f229e1ccd0dca9374c4 upstream. The tlb_sync callback is called by the iommu core to flush a range of caches for the affected domain. Use cache_tag_flush_range() in this callback. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240416080656.60968-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel [ Shuai Xue: Conflicts: drivers/iommu/intel/iommu.c commit b600b9978904 replace put_pages_list to iommu_put_pages_list in intel_iommu_tlb_sync() ] Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 0b15f81096f3..b4b37fdcd5b1 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4162,25 +4162,8 @@ static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, static void intel_iommu_tlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long iova_pfn = IOVA_PFN(gather->start); - size_t size = gather->end - gather->start; - struct iommu_domain_info *info; - unsigned long start_pfn; - unsigned long nrpages; - unsigned long i; - - nrpages = aligned_nrpages(gather->start, size); - start_pfn = mm_to_dma_pfn_start(iova_pfn); - - xa_for_each(&dmar_domain->iommu_array, i, info) - iommu_flush_iotlb_psi(info->iommu, dmar_domain, - start_pfn, nrpages, - list_empty(&gather->freelist), 0); - - if (dmar_domain->nested_parent) - parent_domain_flush(dmar_domain, start_pfn, nrpages, - list_empty(&gather->freelist)); + cache_tag_flush_range(to_dmar_domain(domain), gather->start, + gather->end, list_empty(&gather->freelist)); iommu_put_pages_list(&gather->freelist); } -- Gitee From 8a5836fb693d8506fae808f2f09cb6ddf60fed12 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 24 Apr 2024 15:16:38 +0800 Subject: [PATCH 213/219] iommu/vt-d: Use cache_tag_flush_range_np() in iotlb_sync_map ANBZ: #13617 commit 129dab6e1286525fe5baed860d3dfcd9c6b4b327 upstream. The iotlb_sync_map callback is called by the iommu core after non-present to present mappings are created. The iommu driver uses this callback to invalidate caches if IOMMU is working in caching mode and second-only translation is used for the domain. Use cache_tag_flush_range_np() in this callback. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240416080656.60968-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index b4b37fdcd5b1..65c80e06e3bf 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1498,20 +1498,6 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, iommu_flush_dev_iotlb(domain, addr, mask); } -/* Notification for newly created mappings */ -static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain, - unsigned long pfn, unsigned int pages) -{ - /* - * It's a non-present to present mapping. Only flush if caching mode - * and second level. - */ - if (cap_caching_mode(iommu->cap) && !domain->use_first_level) - iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); - else - iommu_flush_write_buffer(iommu); -} - /* * Flush the relevant caches in nested translation if the domain * also serves as a parent @@ -4605,14 +4591,8 @@ static bool risky_device(struct pci_dev *pdev) static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long pages = aligned_nrpages(iova, size); - unsigned long pfn = iova >> VTD_PAGE_SHIFT; - struct iommu_domain_info *info; - unsigned long i; + cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1); - xa_for_each(&dmar_domain->iommu_array, i, info) - __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); return 0; } -- Gitee From 30521d7498821d86848f7e2be261514f4f4abc17 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 24 Apr 2024 15:16:39 +0800 Subject: [PATCH 214/219] iommu/vt-d: Cleanup use of iommu_flush_iotlb_psi() ANBZ: #13617 commit 06792d06798931696dd78f65024de4cc66f2fd5b upstream. Use cache_tag_flush_range() in switch_to_super_page() to invalidate the necessary caches when switching mappings from normal to super pages. The iommu_flush_iotlb_psi() call in intel_iommu_memory_notifier() is unnecessary since there should be no cache invalidation for the identity domain. Clean up iommu_flush_iotlb_psi() after the last call site is removed. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240416080656.60968-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel [ Shuai Xue: Conflicts: drivers/iommu/intel/iommu.c commit b600b9978904 replace put_pages_list to iommu_put_pages_list in intel_iommu_memory_notifier() ] Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 171 +----------------------------------- 1 file changed, 2 insertions(+), 169 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 65c80e06e3bf..69c11b9c67d1 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1387,157 +1387,6 @@ static void __iommu_flush_dev_iotlb(struct device_domain_info *info, quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep); } -static void iommu_flush_dev_iotlb(struct dmar_domain *domain, - u64 addr, unsigned mask) -{ - struct dev_pasid_info *dev_pasid; - struct device_domain_info *info; - unsigned long flags; - - if (!domain->has_iotlb_device) - return; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) - __iommu_flush_dev_iotlb(info, addr, mask); - - list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { - info = dev_iommu_priv_get(dev_pasid->dev); - - if (!info->ats_enabled) - continue; - - qi_flush_dev_iotlb_pasid(info->iommu, - PCI_DEVID(info->bus, info->devfn), - info->pfsid, dev_pasid->pasid, - info->ats_qdep, addr, - mask); - } - spin_unlock_irqrestore(&domain->lock, flags); -} - -static void domain_flush_pasid_iotlb(struct intel_iommu *iommu, - struct dmar_domain *domain, u64 addr, - unsigned long npages, bool ih) -{ - u16 did = domain_id_iommu(domain, iommu); - struct dev_pasid_info *dev_pasid; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) - qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih); - - if (!list_empty(&domain->devices)) - qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih); - spin_unlock_irqrestore(&domain->lock, flags); -} - -static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, - unsigned long pfn, unsigned int pages, - int ih) -{ - unsigned int aligned_pages = __roundup_pow_of_two(pages); - unsigned long bitmask = aligned_pages - 1; - unsigned int mask = ilog2(aligned_pages); - u64 addr = (u64)pfn << VTD_PAGE_SHIFT; - - /* - * PSI masks the low order bits of the base address. If the - * address isn't aligned to the mask, then compute a mask value - * needed to ensure the target range is flushed. - */ - if (unlikely(bitmask & pfn)) { - unsigned long end_pfn = pfn + pages - 1, shared_bits; - - /* - * Since end_pfn <= pfn + bitmask, the only way bits - * higher than bitmask can differ in pfn and end_pfn is - * by carrying. This means after masking out bitmask, - * high bits starting with the first set bit in - * shared_bits are all equal in both pfn and end_pfn. - */ - shared_bits = ~(pfn ^ end_pfn) & ~bitmask; - mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; - } - - /* - * Fallback to domain selective flush if no PSI support or - * the size is too big. - */ - if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, - DMA_TLB_PSI_FLUSH); -} - -static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, - struct dmar_domain *domain, - unsigned long pfn, unsigned int pages, - int ih, int map) -{ - unsigned int aligned_pages = __roundup_pow_of_two(pages); - unsigned int mask = ilog2(aligned_pages); - uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; - u16 did = domain_id_iommu(domain, iommu); - - if (WARN_ON(!pages)) - return; - - if (ih) - ih = 1 << 6; - - if (domain->use_first_level) - domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih); - else - __iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih); - - if (!map) - iommu_flush_dev_iotlb(domain, addr, mask); -} - -/* - * Flush the relevant caches in nested translation if the domain - * also serves as a parent - */ -static void parent_domain_flush(struct dmar_domain *domain, - unsigned long pfn, - unsigned long pages, int ih) -{ - struct dmar_domain *s1_domain; - - spin_lock(&domain->s1_lock); - list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { - struct device_domain_info *device_info; - struct iommu_domain_info *info; - unsigned long flags; - unsigned long i; - - xa_for_each(&s1_domain->iommu_array, i, info) - __iommu_flush_iotlb_psi(info->iommu, info->did, - pfn, pages, ih); - - if (!s1_domain->has_iotlb_device) - continue; - - spin_lock_irqsave(&s1_domain->lock, flags); - list_for_each_entry(device_info, &s1_domain->devices, link) - /* - * Address translation cache in device side caches the - * result of nested translation. There is no easy way - * to identify the exact set of nested translations - * affected by a change in S2. So just flush the entire - * device cache. - */ - __iommu_flush_dev_iotlb(device_info, 0, - MAX_AGAW_PFN_WIDTH); - spin_unlock_irqrestore(&s1_domain->lock, flags); - } - spin_unlock(&domain->s1_lock); -} - static void intel_flush_iotlb_all(struct iommu_domain *domain) { cache_tag_flush_all(to_dmar_domain(domain)); @@ -1988,9 +1837,7 @@ static void switch_to_super_page(struct dmar_domain *domain, unsigned long end_pfn, int level) { unsigned long lvl_pages = lvl_to_nr_pages(level); - struct iommu_domain_info *info; struct dma_pte *pte = NULL; - unsigned long i; while (start_pfn <= end_pfn) { if (!pte) @@ -2002,13 +1849,8 @@ static void switch_to_super_page(struct dmar_domain *domain, start_pfn + lvl_pages - 1, level + 1); - xa_for_each(&domain->iommu_array, i, info) - iommu_flush_iotlb_psi(info->iommu, domain, - start_pfn, lvl_pages, - 0, 0); - if (domain->nested_parent) - parent_domain_flush(domain, start_pfn, - lvl_pages, 0); + cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT, + end_pfn << VTD_PAGE_SHIFT, 0); } pte++; @@ -3396,18 +3238,9 @@ static int intel_iommu_memory_notifier(struct notifier_block *nb, case MEM_OFFLINE: case MEM_CANCEL_ONLINE: { - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; LIST_HEAD(freelist); domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); - - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) - iommu_flush_iotlb_psi(iommu, si_domain, - start_vpfn, mhp->nr_pages, - list_empty(&freelist), 0); - rcu_read_unlock(); iommu_put_pages_list(&freelist); } break; -- Gitee From 539e81327e248692d355fdfb105642d0d3d24d00 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 24 Apr 2024 15:16:40 +0800 Subject: [PATCH 215/219] iommu/vt-d: Use cache_tag_flush_range() in cache_invalidate_user ANBZ: #13617 commit 8ebc22366ed8bb2cbcf215bfa3fabb4f36d10a91 upstream. The cache_invalidate_user callback is called to invalidate a range of caches for the affected user domain. Use cache_tag_flush_range() in this callback. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240416080656.60968-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.h | 6 +++++ drivers/iommu/intel/nested.c | 50 +++--------------------------------- 2 files changed, 9 insertions(+), 47 deletions(-) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 95ed406d53a6..8061cd7ed8c5 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1049,6 +1049,12 @@ static inline unsigned long aligned_nrpages(unsigned long host_addr, size_t size return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; } +/* Return a size from number of VTD pages. */ +static inline unsigned long nrpages_to_size(unsigned long npages) +{ + return npages << VTD_PAGE_SHIFT; +} + /* Convert value to context PASID directory size field coding. */ #define context_pdts(pds) (((pds) & 0x7) << 9) diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 13406ee742bf..16a2bcf5cfeb 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -88,50 +88,6 @@ static void intel_nested_domain_free(struct iommu_domain *domain) kfree(dmar_domain); } -static void nested_flush_dev_iotlb(struct dmar_domain *domain, u64 addr, - unsigned int mask) -{ - struct device_domain_info *info; - unsigned long flags; - u16 sid, qdep; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) { - if (!info->ats_enabled) - continue; - sid = info->bus << 8 | info->devfn; - qdep = info->ats_qdep; - qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, - qdep, addr, mask); - quirk_extra_dev_tlb_flush(info, addr, mask, - IOMMU_NO_PASID, qdep); - } - spin_unlock_irqrestore(&domain->lock, flags); -} - -static void intel_nested_flush_cache(struct dmar_domain *domain, u64 addr, - u64 npages, bool ih) -{ - struct iommu_domain_info *info; - unsigned int mask; - unsigned long i; - - xa_for_each(&domain->iommu_array, i, info) - qi_flush_piotlb(info->iommu, - domain_id_iommu(domain, info->iommu), - IOMMU_NO_PASID, addr, npages, ih); - - if (!domain->has_iotlb_device) - return; - - if (npages == U64_MAX) - mask = 64 - VTD_PAGE_SHIFT; - else - mask = ilog2(__roundup_pow_of_two(npages)); - - nested_flush_dev_iotlb(domain, addr, mask); -} - static int intel_nested_cache_invalidate_user(struct iommu_domain *domain, struct iommu_user_data_array *array) { @@ -164,9 +120,9 @@ static int intel_nested_cache_invalidate_user(struct iommu_domain *domain, break; } - intel_nested_flush_cache(dmar_domain, inv_entry.addr, - inv_entry.npages, - inv_entry.flags & IOMMU_VTD_INV_FLAGS_LEAF); + cache_tag_flush_range(dmar_domain, inv_entry.addr, + inv_entry.addr + nrpages_to_size(inv_entry.npages) - 1, + inv_entry.flags & IOMMU_VTD_INV_FLAGS_LEAF); processed++; } -- Gitee From 9f3f2c76d0bac963d3b9a9311d742f07b3db2356 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 24 Apr 2024 15:16:41 +0800 Subject: [PATCH 216/219] iommu/vt-d: Use cache helpers in arch_invalidate_secondary_tlbs ANBZ: #13617 commit 4f609dbff51b3c98d5fc841758a20be74ab3f132 upstream. The arch_invalidate_secondary_tlbs callback is called in the SVA mm notification path. It invalidates all or a range of caches after the CPU page table is modified. Use the cache tag helps in this path. The mm_types defines vm_end as the first byte after the end address which is different from the iommu gather API, hence convert the end parameter from mm_types to iommu gather scheme before calling the cache_tag helper. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240416080656.60968-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.h | 1 + drivers/iommu/intel/svm.c | 81 +++++-------------------------------- 2 files changed, 11 insertions(+), 71 deletions(-) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 8061cd7ed8c5..3c28b0ddd187 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1164,6 +1164,7 @@ struct intel_svm { struct mm_struct *mm; u32 pasid; struct list_head devs; + struct dmar_domain *domain; }; #else static inline void intel_svm_check(struct intel_iommu *iommu) {} diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index d74699c6035f..269c91559192 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -167,88 +167,25 @@ void intel_svm_check(struct intel_iommu *iommu) iommu->flags |= VTD_FLAG_SVM_CAPABLE; } -static void __flush_svm_range_dev(struct intel_svm *svm, - struct intel_svm_dev *sdev, - unsigned long address, - unsigned long pages, int ih) -{ - struct device_domain_info *info = dev_iommu_priv_get(sdev->dev); - - if (WARN_ON(!pages)) - return; - - qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih); - if (info->ats_enabled) { - qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid, - svm->pasid, sdev->qdep, address, - order_base_2(pages)); - quirk_extra_dev_tlb_flush(info, address, order_base_2(pages), - svm->pasid, sdev->qdep); - } -} - -static void intel_flush_svm_range_dev(struct intel_svm *svm, - struct intel_svm_dev *sdev, - unsigned long address, - unsigned long pages, int ih) -{ - unsigned long shift = ilog2(__roundup_pow_of_two(pages)); - unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift)); - unsigned long start = ALIGN_DOWN(address, align); - unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align); - - while (start < end) { - __flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih); - start += align; - } -} - -static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address, - unsigned long pages, int ih) -{ - struct intel_svm_dev *sdev; - - rcu_read_lock(); - list_for_each_entry_rcu(sdev, &svm->devs, list) - intel_flush_svm_range_dev(svm, sdev, address, pages, ih); - rcu_read_unlock(); -} - -static void intel_flush_svm_all(struct intel_svm *svm) -{ - struct device_domain_info *info; - struct intel_svm_dev *sdev; - - rcu_read_lock(); - list_for_each_entry_rcu(sdev, &svm->devs, list) { - info = dev_iommu_priv_get(sdev->dev); - - qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, 0, -1UL, 0); - if (info->ats_enabled) { - qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid, - svm->pasid, sdev->qdep, - 0, 64 - VTD_PAGE_SHIFT); - quirk_extra_dev_tlb_flush(info, 0, 64 - VTD_PAGE_SHIFT, - svm->pasid, sdev->qdep); - } - } - rcu_read_unlock(); -} - /* Pages have been freed at this point */ static void intel_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); + struct dmar_domain *domain = svm->domain; if (start == 0 && end == -1UL) { - intel_flush_svm_all(svm); + cache_tag_flush_all(domain); return; } - intel_flush_svm_range(svm, start, - (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0); + /* + * The mm_types defines vm_end as the first byte after the end address, + * different from IOMMU subsystem using the last address of an address + * range. + */ + cache_tag_flush_range(domain, start, end - 1, 0); } static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) @@ -335,6 +272,7 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, INIT_LIST_HEAD_RCU(&svm->devs); svm->notifier.ops = &intel_mmuops; + svm->domain = to_dmar_domain(domain); ret = mmu_notifier_register(&svm->notifier, mm); if (ret) { kfree(svm); @@ -746,6 +684,7 @@ struct iommu_domain *intel_svm_domain_alloc(void) if (!domain) return NULL; domain->domain.ops = &intel_svm_domain_ops; + domain->use_first_level = true; INIT_LIST_HEAD(&domain->cache_tags); spin_lock_init(&domain->cache_lock); -- Gitee From ba5a9b39e090625ca94d587c857d7fe083f4be0f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 24 Apr 2024 15:16:42 +0800 Subject: [PATCH 217/219] iommu/vt-d: Remove intel_svm_dev ANBZ: #13617 commit deda9a7bf38fb9846ff3fb83179ca07437b1511c upstream. The intel_svm_dev data structure used in the sva implementation for the Intel IOMMU driver stores information about a device attached to an SVA domain. It is a duplicate of dev_pasid_info that serves the same purpose. Replace intel_svm_dev with dev_pasid_info and clean up the use of intel_svm_dev. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240416080656.60968-11-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel [ Shuai Xue: remove unused out_tear_down label in intel_iommu_remove_dev_pasid which is also removed by latter merge commit in upstream ] Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 8 +-- drivers/iommu/intel/iommu.h | 15 +---- drivers/iommu/intel/svm.c | 130 ++++++++++-------------------------- 3 files changed, 42 insertions(+), 111 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 69c11b9c67d1..f14d60742599 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4443,11 +4443,8 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, * notification. Before consolidating that code into iommu core, let * the intel sva code handle it. */ - if (domain->type == IOMMU_DOMAIN_SVA) { - intel_svm_remove_dev_pasid(dev, pasid); - cache_tag_unassign_domain(dmar_domain, dev, pasid); - goto out_tear_down; - } + if (domain->type == IOMMU_DOMAIN_SVA) + intel_svm_remove_dev_pasid(domain); spin_lock_irqsave(&dmar_domain->lock, flags); list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { @@ -4464,7 +4461,6 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, domain_detach_iommu(dmar_domain, iommu); intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); -out_tear_down: intel_pasid_tear_down_entry(iommu, dev, pasid, false); intel_drain_pasid_prq(dev, pasid); } diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 3c28b0ddd187..5734e8848c2e 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -649,6 +649,7 @@ struct dmar_domain { struct list_head s2_link; }; }; + struct intel_svm *svm; struct iommu_domain domain; /* generic domain data structure for iommu core */ @@ -1147,23 +1148,13 @@ int intel_svm_finish_prq(struct intel_iommu *iommu); void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, struct iommu_page_response *msg); struct iommu_domain *intel_svm_domain_alloc(void); -void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid); +void intel_svm_remove_dev_pasid(struct iommu_domain *domain); void intel_drain_pasid_prq(struct device *dev, u32 pasid); -struct intel_svm_dev { - struct list_head list; - struct rcu_head rcu; - struct device *dev; - struct intel_iommu *iommu; - u16 did; - u16 sid, qdep; -}; - struct intel_svm { struct mmu_notifier notifier; struct mm_struct *mm; u32 pasid; - struct list_head devs; struct dmar_domain *domain; }; #else @@ -1174,7 +1165,7 @@ static inline struct iommu_domain *intel_svm_domain_alloc(void) return NULL; } -static inline void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid) +static inline void intel_svm_remove_dev_pasid(struct iommu_domain *domain) { } #endif diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 269c91559192..26aa43c7eb7e 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -44,23 +44,6 @@ static void *pasid_private_find(ioasid_t pasid) return xa_load(&pasid_private_array, pasid); } -static struct intel_svm_dev * -svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev) -{ - struct intel_svm_dev *sdev = NULL, *t; - - rcu_read_lock(); - list_for_each_entry_rcu(t, &svm->devs, list) { - if (t->dev == dev) { - sdev = t; - break; - } - } - rcu_read_unlock(); - - return sdev; -} - int intel_svm_enable_prq(struct intel_iommu *iommu) { struct iopf_queue *iopfq; @@ -191,7 +174,10 @@ static void intel_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); - struct intel_svm_dev *sdev; + struct dmar_domain *domain = svm->domain; + struct dev_pasid_info *dev_pasid; + struct device_domain_info *info; + unsigned long flags; /* This might end up being called from exit_mmap(), *before* the page * tables are cleared. And __mmu_notifier_release() will delete us from @@ -205,11 +191,13 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) * page) so that we end up taking a fault that the hardware really * *has* to handle gracefully without affecting other processes. */ - rcu_read_lock(); - list_for_each_entry_rcu(sdev, &svm->devs, list) - intel_pasid_tear_down_entry(sdev->iommu, sdev->dev, - svm->pasid, true); - rcu_read_unlock(); + spin_lock_irqsave(&domain->lock, flags); + list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { + info = dev_iommu_priv_get(dev_pasid->dev); + intel_pasid_tear_down_entry(info->iommu, dev_pasid->dev, + dev_pasid->pasid, true); + } + spin_unlock_irqrestore(&domain->lock, flags); } @@ -218,47 +206,17 @@ static const struct mmu_notifier_ops intel_mmuops = { .arch_invalidate_secondary_tlbs = intel_arch_invalidate_secondary_tlbs, }; -static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid, - struct intel_svm **rsvm, - struct intel_svm_dev **rsdev) -{ - struct intel_svm_dev *sdev = NULL; - struct intel_svm *svm; - - if (pasid == IOMMU_PASID_INVALID || pasid >= PASID_MAX) - return -EINVAL; - - svm = pasid_private_find(pasid); - if (IS_ERR(svm)) - return PTR_ERR(svm); - - if (!svm) - goto out; - - /* - * If we found svm for the PASID, there must be at least one device - * bond. - */ - if (WARN_ON(list_empty(&svm->devs))) - return -EINVAL; - sdev = svm_lookup_device_by_dev(svm, dev); - -out: - *rsvm = svm; - *rsdev = sdev; - - return 0; -} - static int intel_svm_set_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; struct mm_struct *mm = domain->mm; - struct intel_svm_dev *sdev; + struct dev_pasid_info *dev_pasid; struct intel_svm *svm; unsigned long sflags; + unsigned long flags; int ret = 0; svm = pasid_private_find(pasid); @@ -269,7 +227,6 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, svm->pasid = pasid; svm->mm = mm; - INIT_LIST_HEAD_RCU(&svm->devs); svm->notifier.ops = &intel_mmuops; svm->domain = to_dmar_domain(domain); @@ -287,25 +244,17 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, } } - sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); - if (!sdev) { - ret = -ENOMEM; + dmar_domain->svm = svm; + dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); + if (!dev_pasid) goto free_svm; - } - sdev->dev = dev; - sdev->iommu = iommu; - sdev->did = FLPT_DEFAULT_DID; - sdev->sid = PCI_DEVID(info->bus, info->devfn); - if (info->ats_enabled) { - sdev->qdep = info->ats_qdep; - if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS) - sdev->qdep = 0; - } + dev_pasid->dev = dev; + dev_pasid->pasid = pasid; ret = cache_tag_assign_domain(to_dmar_domain(domain), dev, pasid); if (ret) - goto free_sdev; + goto free_dev_pasid; /* Setup the pasid table: */ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; @@ -314,16 +263,18 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, if (ret) goto unassign_tag; - list_add_rcu(&sdev->list, &svm->devs); + spin_lock_irqsave(&dmar_domain->lock, flags); + list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); + spin_unlock_irqrestore(&dmar_domain->lock, flags); return 0; unassign_tag: cache_tag_unassign_domain(to_dmar_domain(domain), dev, pasid); -free_sdev: - kfree(sdev); +free_dev_pasid: + kfree(dev_pasid); free_svm: - if (list_empty(&svm->devs)) { + if (list_empty(&dmar_domain->dev_pasids)) { mmu_notifier_unregister(&svm->notifier, mm); pasid_private_remove(pasid); kfree(svm); @@ -332,26 +283,17 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, return ret; } -void intel_svm_remove_dev_pasid(struct device *dev, u32 pasid) +void intel_svm_remove_dev_pasid(struct iommu_domain *domain) { - struct intel_svm_dev *sdev; - struct intel_svm *svm; - struct mm_struct *mm; - - if (pasid_to_svm_sdev(dev, pasid, &svm, &sdev)) - return; - mm = svm->mm; - - if (sdev) { - list_del_rcu(&sdev->list); - kfree_rcu(sdev, rcu); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_svm *svm = dmar_domain->svm; + struct mm_struct *mm = domain->mm; - if (list_empty(&svm->devs)) { - if (svm->notifier.ops) - mmu_notifier_unregister(&svm->notifier, mm); - pasid_private_remove(svm->pasid); - kfree(svm); - } + if (list_empty(&dmar_domain->dev_pasids)) { + if (svm->notifier.ops) + mmu_notifier_unregister(&svm->notifier, mm); + pasid_private_remove(svm->pasid); + kfree(svm); } } @@ -685,8 +627,10 @@ struct iommu_domain *intel_svm_domain_alloc(void) return NULL; domain->domain.ops = &intel_svm_domain_ops; domain->use_first_level = true; + INIT_LIST_HEAD(&domain->dev_pasids); INIT_LIST_HEAD(&domain->cache_tags); spin_lock_init(&domain->cache_lock); + spin_lock_init(&domain->lock); return &domain->domain; } -- Gitee From 90698f1727b7e309d379d89ba96875f10a6b67be Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 24 Apr 2024 15:16:43 +0800 Subject: [PATCH 218/219] iommu: Add ops->domain_alloc_sva() ANBZ: #13617 commit 65442507026a79e2de8014880e6ba376b8b44584 upstream. Make a new op that receives the device and the mm_struct that the SVA domain should be created for. Unlike domain_alloc_paging() the dev argument is never NULL here. This allows drivers to fully initialize the SVA domain and allocate the mmu_notifier during allocation. It allows the notifier lifetime to follow the lifetime of the iommu_domain. Since we have only one call site, upgrade the new op to return ERR_PTR instead of NULL. Signed-off-by: Jason Gunthorpe Signed-off-by: Vasant Hegde Reviewed-by: Tina Zhang Link: https://lore.kernel.org/r/20240311090843.133455-15-vasant.hegde@amd.com Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240416080656.60968-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Shuai Xue --- drivers/iommu/iommu-sva.c | 16 +++++++++++----- include/linux/iommu.h | 3 +++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 640acc804e8c..18a35e798b72 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -108,8 +108,8 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm /* Allocate a new domain and set it on device pasid. */ domain = iommu_sva_domain_alloc(dev, mm); - if (!domain) { - ret = -ENOMEM; + if (IS_ERR(domain)) { + ret = PTR_ERR(domain); goto out_free_handle; } @@ -283,9 +283,15 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; - domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); - if (!domain) - return NULL; + if (ops->domain_alloc_sva) { + domain = ops->domain_alloc_sva(dev, mm); + if (IS_ERR(domain)) + return domain; + } else { + domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); + if (!domain) + return ERR_PTR(-ENOMEM); + } domain->type = IOMMU_DOMAIN_SVA; mmgrab(mm); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a15e70cea420..4edcc5a979b2 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -525,6 +525,7 @@ static inline int __iommu_copy_struct_from_user_array( * Upon failure, ERR_PTR must be returned. * @domain_alloc_paging: Allocate an iommu_domain that can be used for * UNMANAGED, DMA, and DMA_FQ domain types. + * @domain_alloc_sva: Allocate an iommu_domain for Shared Virtual Addressing. * @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -565,6 +566,8 @@ struct iommu_ops { struct device *dev, u32 flags, struct iommu_domain *parent, const struct iommu_user_data *user_data); struct iommu_domain *(*domain_alloc_paging)(struct device *dev); + struct iommu_domain *(*domain_alloc_sva)(struct device *dev, + struct mm_struct *mm); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); -- Gitee From c1f0f9902a130cad7210f5b109dc7ba50ea4f7e8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 24 Apr 2024 15:16:44 +0800 Subject: [PATCH 219/219] iommu/vt-d: Remove struct intel_svm ANBZ: #13617 commit 886f816c2f01f768384e78f91b3c0ff66920b53f upstream. The struct intel_svm was used for keeping attached devices info for sva domain. Since sva domain is a kind of iommu_domain, the struct dmar_domain should centralize all info of a sva domain, including the info of attached devices. Therefore, retire struct intel_svm and clean up the code. Besides, register mmu notifier callback in domain_alloc_sva() callback which allows the memory management notifier lifetime to follow the lifetime of the iommu_domain. Call mmu_notifier_put() in the domain free and defer the real free to the mmu free_notifier callback. Co-developed-by: Tina Zhang Signed-off-by: Tina Zhang Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240416080656.60968-13-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel [ Shuai Xue: Conflicts: drivers/iommu/intel/iommu.c commit ecec12ee763 fix to access dmar_domain by to_dmar_domain() in intel_iommu_remove_dev_pasid() ] Signed-off-by: Shuai Xue --- drivers/iommu/intel/iommu.c | 11 +---- drivers/iommu/intel/iommu.h | 26 ++++------ drivers/iommu/intel/svm.c | 99 ++++++++++--------------------------- 3 files changed, 37 insertions(+), 99 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f14d60742599..fa0d1f19699e 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3741,8 +3741,6 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return domain; case IOMMU_DOMAIN_IDENTITY: return &si_domain->domain; - case IOMMU_DOMAIN_SVA: - return intel_svm_domain_alloc(); default: return NULL; } @@ -4438,14 +4436,6 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct intel_iommu *iommu = info->iommu; unsigned long flags; - /* - * The SVA implementation needs to handle its own stuffs like the mm - * notification. Before consolidating that code into iommu core, let - * the intel sva code handle it. - */ - if (domain->type == IOMMU_DOMAIN_SVA) - intel_svm_remove_dev_pasid(domain); - spin_lock_irqsave(&dmar_domain->lock, flags); list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { if (curr->dev == dev && curr->pasid == pasid) { @@ -4679,6 +4669,7 @@ const struct iommu_ops intel_iommu_ops = { .hw_info = intel_iommu_hw_info, .domain_alloc = intel_iommu_domain_alloc, .domain_alloc_user = intel_iommu_domain_alloc_user, + .domain_alloc_sva = intel_svm_domain_alloc, .probe_device = intel_iommu_probe_device, .probe_finalize = intel_iommu_probe_finalize, .release_device = intel_iommu_release_device, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 5734e8848c2e..83d394f166a7 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -648,8 +648,12 @@ struct dmar_domain { /* link to parent domain siblings */ struct list_head s2_link; }; + + /* SVA domain */ + struct { + struct mmu_notifier notifier; + }; }; - struct intel_svm *svm; struct iommu_domain domain; /* generic domain data structure for iommu core */ @@ -1147,26 +1151,16 @@ int intel_svm_enable_prq(struct intel_iommu *iommu); int intel_svm_finish_prq(struct intel_iommu *iommu); void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, struct iommu_page_response *msg); -struct iommu_domain *intel_svm_domain_alloc(void); -void intel_svm_remove_dev_pasid(struct iommu_domain *domain); +struct iommu_domain *intel_svm_domain_alloc(struct device *dev, + struct mm_struct *mm); void intel_drain_pasid_prq(struct device *dev, u32 pasid); - -struct intel_svm { - struct mmu_notifier notifier; - struct mm_struct *mm; - u32 pasid; - struct dmar_domain *domain; -}; #else static inline void intel_svm_check(struct intel_iommu *iommu) {} static inline void intel_drain_pasid_prq(struct device *dev, u32 pasid) {} -static inline struct iommu_domain *intel_svm_domain_alloc(void) -{ - return NULL; -} - -static inline void intel_svm_remove_dev_pasid(struct iommu_domain *domain) +static inline struct iommu_domain *intel_svm_domain_alloc(struct device *dev, + struct mm_struct *mm) { + return ERR_PTR(-ENODEV); } #endif diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 26aa43c7eb7e..0e3a9b38bef2 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -27,23 +27,6 @@ static irqreturn_t prq_event_thread(int irq, void *d); -static DEFINE_XARRAY_ALLOC(pasid_private_array); -static int pasid_private_add(ioasid_t pasid, void *priv) -{ - return xa_alloc(&pasid_private_array, &pasid, priv, - XA_LIMIT(pasid, pasid), GFP_ATOMIC); -} - -static void pasid_private_remove(ioasid_t pasid) -{ - xa_erase(&pasid_private_array, pasid); -} - -static void *pasid_private_find(ioasid_t pasid) -{ - return xa_load(&pasid_private_array, pasid); -} - int intel_svm_enable_prq(struct intel_iommu *iommu) { struct iopf_queue *iopfq; @@ -155,10 +138,9 @@ static void intel_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { - struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); - struct dmar_domain *domain = svm->domain; + struct dmar_domain *domain = container_of(mn, struct dmar_domain, notifier); - if (start == 0 && end == -1UL) { + if (start == 0 && end == ULONG_MAX) { cache_tag_flush_all(domain); return; } @@ -173,8 +155,7 @@ static void intel_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); - struct dmar_domain *domain = svm->domain; + struct dmar_domain *domain = container_of(mn, struct dmar_domain, notifier); struct dev_pasid_info *dev_pasid; struct device_domain_info *info; unsigned long flags; @@ -201,9 +182,15 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) } +static void intel_mm_free_notifier(struct mmu_notifier *mn) +{ + kfree(container_of(mn, struct dmar_domain, notifier)); +} + static const struct mmu_notifier_ops intel_mmuops = { .release = intel_mm_release, .arch_invalidate_secondary_tlbs = intel_arch_invalidate_secondary_tlbs, + .free_notifier = intel_mm_free_notifier, }; static int intel_svm_set_dev_pasid(struct iommu_domain *domain, @@ -214,40 +201,13 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, struct intel_iommu *iommu = info->iommu; struct mm_struct *mm = domain->mm; struct dev_pasid_info *dev_pasid; - struct intel_svm *svm; unsigned long sflags; unsigned long flags; int ret = 0; - svm = pasid_private_find(pasid); - if (!svm) { - svm = kzalloc(sizeof(*svm), GFP_KERNEL); - if (!svm) - return -ENOMEM; - - svm->pasid = pasid; - svm->mm = mm; - - svm->notifier.ops = &intel_mmuops; - svm->domain = to_dmar_domain(domain); - ret = mmu_notifier_register(&svm->notifier, mm); - if (ret) { - kfree(svm); - return ret; - } - - ret = pasid_private_add(svm->pasid, svm); - if (ret) { - mmu_notifier_unregister(&svm->notifier, mm); - kfree(svm); - return ret; - } - } - - dmar_domain->svm = svm; dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); if (!dev_pasid) - goto free_svm; + return -ENOMEM; dev_pasid->dev = dev; dev_pasid->pasid = pasid; @@ -273,30 +233,10 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, cache_tag_unassign_domain(to_dmar_domain(domain), dev, pasid); free_dev_pasid: kfree(dev_pasid); -free_svm: - if (list_empty(&dmar_domain->dev_pasids)) { - mmu_notifier_unregister(&svm->notifier, mm); - pasid_private_remove(pasid); - kfree(svm); - } return ret; } -void intel_svm_remove_dev_pasid(struct iommu_domain *domain) -{ - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct intel_svm *svm = dmar_domain->svm; - struct mm_struct *mm = domain->mm; - - if (list_empty(&dmar_domain->dev_pasids)) { - if (svm->notifier.ops) - mmu_notifier_unregister(&svm->notifier, mm); - pasid_private_remove(svm->pasid); - kfree(svm); - } -} - /* Page request queue descriptor */ struct page_req_dsc { union { @@ -610,7 +550,10 @@ void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, static void intel_svm_domain_free(struct iommu_domain *domain) { - kfree(to_dmar_domain(domain)); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + /* dmar_domain free is deferred to the mmu free_notifier callback. */ + mmu_notifier_put(&dmar_domain->notifier); } static const struct iommu_domain_ops intel_svm_domain_ops = { @@ -618,13 +561,16 @@ static const struct iommu_domain_ops intel_svm_domain_ops = { .free = intel_svm_domain_free }; -struct iommu_domain *intel_svm_domain_alloc(void) +struct iommu_domain *intel_svm_domain_alloc(struct device *dev, + struct mm_struct *mm) { struct dmar_domain *domain; + int ret; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) - return NULL; + return ERR_PTR(-ENOMEM); + domain->domain.ops = &intel_svm_domain_ops; domain->use_first_level = true; INIT_LIST_HEAD(&domain->dev_pasids); @@ -632,5 +578,12 @@ struct iommu_domain *intel_svm_domain_alloc(void) spin_lock_init(&domain->cache_lock); spin_lock_init(&domain->lock); + domain->notifier.ops = &intel_mmuops; + ret = mmu_notifier_register(&domain->notifier, mm); + if (ret) { + kfree(domain); + return ERR_PTR(ret); + } + return &domain->domain; } -- Gitee