diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 09efa9fc66b021caedd7e32ff7caeed3f38d97a5..5b4bfce70152a8aaf84d0aa6f2103749569d9b4b 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6520,6 +6520,7 @@ CONFIG_ARM_SMMU_V3=y CONFIG_ARM_SMMU_V3_SVA=y # CONFIG_ARM_SMMU_V3_PM is not set CONFIG_ARM_SMMU_V3_HTTU=y +CONFIG_ARM_SMMU_V3_ECMDQ=y # CONFIG_QCOM_IOMMU is not set # CONFIG_VIRTIO_IOMMU is not set CONFIG_SMMU_BYPASS_DEV=y diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index c2e7b2e23051541e38af42180054ab26461c6372..69ffbeb8a4bcde7f0e85e3fb5e17abaafa020b03 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -430,6 +430,17 @@ config ARM_SMMU_V3_HTTU help Add support for Hardware Translation Table Update arm smmu v3. +config ARM_SMMU_V3_ECMDQ + bool "Add arm_smmu_v3 ecmdq support" + depends on ARM_SMMU_V3 + default n + help + Add support for ARM_SMMU_V3 ECMDQ. One smmu can have multiple + ECMDQs which can be used to reduce competition when smmu try + to send commands. + + If not sure, say no. + config S390_IOMMU def_bool y if S390 && PCI depends on S390 && PCI diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index f513ac761f7359912ef5ddc992b509ee2f1fa0da..de95468a129342af8224d3976f6a0ba9486ff91f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -41,6 +41,12 @@ module_param(disable_msipolling, bool, 0444); MODULE_PARM_DESC(disable_msipolling, "Disable MSI-based polling for CMD_SYNC completion."); +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ +static bool disable_ecmdq; +module_param(disable_ecmdq, bool, 0444); +MODULE_PARM_DESC(disable_ecmdq, "Disable the use of ECMDQs"); +#endif + #ifdef CONFIG_SMMU_BYPASS_DEV struct smmu_bypass_device { unsigned short vendor; @@ -296,6 +302,24 @@ static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent) return 0; } +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ +static void arm_smmu_preempt_disable(struct arm_smmu_device *smmu) +{ + if (smmu->ecmdq_enabled) + preempt_disable(); +} + +static void arm_smmu_preempt_enable(struct arm_smmu_device *smmu) +{ + if (smmu->ecmdq_enabled) + preempt_enable(); +} +#else +static void arm_smmu_preempt_disable(struct arm_smmu_device *smmu) {} +static void arm_smmu_preempt_enable(struct arm_smmu_device *smmu) {} +#endif + + /* High-level queue accessors */ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) { @@ -400,6 +424,16 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu) { +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ + if (smmu->ecmdq_enabled) { + struct arm_smmu_ecmdq *ecmdq; + + ecmdq = *this_cpu_ptr(smmu->ecmdqs); + + return &ecmdq->cmdq; + } +#endif + return &smmu->cmdq; } @@ -482,6 +516,40 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) __arm_smmu_cmdq_skip_err(smmu, &smmu->cmdq.q); } +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ +static void arm_smmu_ecmdq_skip_err(struct arm_smmu_device *smmu) +{ + int i; + u32 prod, cons; + struct arm_smmu_queue *q; + struct arm_smmu_ecmdq *ecmdq; + + for (i = 0; i < smmu->nr_ecmdq; i++) { + unsigned long flags; + + ecmdq = *per_cpu_ptr(smmu->ecmdqs, i); + q = &ecmdq->cmdq.q; + + prod = readl_relaxed(q->prod_reg); + cons = readl_relaxed(q->cons_reg); + if (((prod ^ cons) & ECMDQ_CONS_ERR) == 0) + continue; + + __arm_smmu_cmdq_skip_err(smmu, q); + + write_lock_irqsave(&q->ecmdq_lock, flags); + q->ecmdq_prod &= ~ECMDQ_PROD_ERRACK; + q->ecmdq_prod |= cons & ECMDQ_CONS_ERR; + + prod = readl_relaxed(q->prod_reg); + prod &= ~ECMDQ_PROD_ERRACK; + prod |= cons & ECMDQ_CONS_ERR; + writel(prod, q->prod_reg); + write_unlock_irqrestore(&q->ecmdq_lock, flags); + } +} +#endif + /* * Command queue locking. * This is a form of bastardised rwlock with the following major changes: @@ -778,6 +846,89 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds, } } +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ +/* + * The function is used when the current core exclusively occupies an ECMDQ. + * This is a reduced version of arm_smmu_cmdq_issue_cmdlist(), which eliminates + * a lot of unnecessary inter-core competition considerations. + */ +static int arm_smmu_ecmdq_issue_cmdlist(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, + u64 *cmds, int n, bool sync) +{ + u32 prod; + unsigned long flags; + struct arm_smmu_ll_queue llq = { + .max_n_shift = cmdq->q.llq.max_n_shift, + }, head; + int ret = 0; + + /* 1. Allocate some space in the queue */ + local_irq_save(flags); + llq.val = READ_ONCE(cmdq->q.llq.val); + do { + u64 old; + + while (!queue_has_space(&llq, n + sync)) { + local_irq_restore(flags); + if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq)) + dev_err_ratelimited(smmu->dev, "ECMDQ timeout\n"); + local_irq_save(flags); + } + + head.cons = llq.cons; + head.prod = queue_inc_prod_n(&llq, n + sync); + + old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val); + if (old == llq.val) + break; + + llq.val = old; + } while (1); + + /* 2. Write our commands into the queue */ + arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n); + if (sync) { + u64 cmd_sync[CMDQ_ENT_DWORDS]; + + prod = queue_inc_prod_n(&llq, n); + arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, &cmdq->q, prod); + queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS); + } + + /* 3. Ensuring commands are visible first */ + dma_wmb(); + + /* 4. Advance the hardware prod pointer */ + read_lock(&cmdq->q.ecmdq_lock); + writel_relaxed(head.prod | cmdq->q.ecmdq_prod, cmdq->q.prod_reg); + read_unlock(&cmdq->q.ecmdq_lock); + + /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */ + if (sync) { + llq.prod = queue_inc_prod_n(&llq, n); + ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq); + if (ret) { + dev_err_ratelimited(smmu->dev, + "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n", + llq.prod, + readl_relaxed(cmdq->q.prod_reg), + readl_relaxed(cmdq->q.cons_reg)); + } + + /* + * Update cmdq->q.llq.cons, to improve the success rate of + * queue_has_space() when some new commands are inserted next + * time. + */ + WRITE_ONCE(cmdq->q.llq.cons, llq.cons); + } + + local_irq_restore(flags); + return ret; +} +#endif + /* * This is the actual insertion function, and provides the following * ordering guarantees to callers: @@ -805,6 +956,11 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, struct arm_smmu_ll_queue llq, head; int ret = 0; +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ + if (!cmdq->shared) + return arm_smmu_ecmdq_issue_cmdlist(smmu, cmdq, cmds, n, sync); +#endif + llq.max_n_shift = cmdq->q.llq.max_n_shift; /* 1. Allocate some space in the queue */ @@ -878,7 +1034,14 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, * d. Advance the hardware prod pointer * Control dependency ordering from the entries becoming valid. */ - writel_relaxed(prod, cmdq->q.prod_reg); +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ + if (smmu->ecmdq_enabled) { + read_lock(&cmdq->q.ecmdq_lock); + writel_relaxed(prod | cmdq->q.ecmdq_prod, cmdq->q.prod_reg); + read_unlock(&cmdq->q.ecmdq_lock); + } else +#endif + writel_relaxed(prod, cmdq->q.prod_reg); /* * e. Tell the next owner we're done @@ -1042,6 +1205,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain, cmds.num = 0; + arm_smmu_preempt_disable(smmu); spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_for_each_entry(master, &smmu_domain->devices, domain_head) { for (i = 0; i < master->num_streams; i++) { @@ -1052,6 +1216,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain, spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); arm_smmu_cmdq_batch_submit(smmu, &cmds); + arm_smmu_preempt_enable(smmu); } static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu, @@ -1764,6 +1929,11 @@ static irqreturn_t arm_smmu_gerror_handler(int irq, void *dev) if (active & GERROR_CMDQ_ERR) arm_smmu_cmdq_skip_err(smmu); +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ + if (active & GERROR_CMDQP_ERR) + arm_smmu_ecmdq_skip_err(smmu); +#endif + writel(gerror, smmu->base + ARM_SMMU_GERRORN); return IRQ_HANDLED; } @@ -1854,25 +2024,28 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, size_t size, static int arm_smmu_atc_inv_master(struct arm_smmu_master *master) { - int i; + int i, ret; struct arm_smmu_cmdq_ent cmd; struct arm_smmu_cmdq_batch cmds; arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd); cmds.num = 0; + arm_smmu_preempt_disable(master->smmu); for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd); } - return arm_smmu_cmdq_batch_submit(master->smmu, &cmds); + ret = arm_smmu_cmdq_batch_submit(master->smmu, &cmds); + arm_smmu_preempt_enable(master->smmu); + return ret; } int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, unsigned long iova, size_t size) { - int i; + int i, ret; unsigned long flags; struct arm_smmu_cmdq_ent cmd; struct arm_smmu_master *master; @@ -1902,6 +2075,7 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, cmds.num = 0; + arm_smmu_preempt_disable(smmu_domain->smmu); spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_for_each_entry(master, &smmu_domain->devices, domain_head) { if (!master->ats_enabled) @@ -1914,7 +2088,10 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, } spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds); + ret = arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds); + arm_smmu_preempt_enable(smmu_domain->smmu); + + return ret; } /* IO_PGTABLE API */ @@ -1979,6 +2156,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, cmds.num = 0; + arm_smmu_preempt_disable(smmu); while (iova < end) { if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) { /* @@ -2010,6 +2188,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, iova += inv_range; } arm_smmu_cmdq_batch_submit(smmu, &cmds); + arm_smmu_preempt_enable(smmu); } static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size, @@ -3241,6 +3420,9 @@ static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu) struct arm_smmu_cmdq *cmdq = &smmu->cmdq; unsigned int nents = 1 << cmdq->q.llq.max_n_shift; +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ + cmdq->shared = 1; +#endif atomic_set(&cmdq->owner_prod, 0); atomic_set(&cmdq->lock, 0); @@ -3252,6 +3434,22 @@ static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu) return 0; } +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ +static int arm_smmu_ecmdq_init(struct arm_smmu_cmdq *cmdq) +{ + unsigned int nents = 1 << cmdq->q.llq.max_n_shift; + + atomic_set(&cmdq->owner_prod, 0); + atomic_set(&cmdq->lock, 0); + + cmdq->valid_map = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL); + if (!cmdq->valid_map) + return -ENOMEM; + + return 0; +} +#endif + static int arm_smmu_init_queues(struct arm_smmu_device *smmu) { int ret; @@ -3697,6 +3895,55 @@ static int arm_smmu_device_disable(struct arm_smmu_device *smmu) return ret; } +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ +static int arm_smmu_ecmdq_reset(struct arm_smmu_device *smmu) +{ + int i, cpu, ret = 0; + u32 reg; + + if (!smmu->nr_ecmdq) + return 0; + + i = 0; + for_each_possible_cpu(cpu) { + struct arm_smmu_ecmdq *ecmdq; + struct arm_smmu_queue *q; + + ecmdq = *per_cpu_ptr(smmu->ecmdqs, cpu); + if (ecmdq != per_cpu_ptr(smmu->ecmdq, cpu)) + continue; + + q = &ecmdq->cmdq.q; + i++; + + if (WARN_ON(q->llq.prod != q->llq.cons)) { + q->llq.prod = 0; + q->llq.cons = 0; + } + writeq_relaxed(q->q_base, ecmdq->base + ARM_SMMU_ECMDQ_BASE); + writel_relaxed(q->llq.prod, ecmdq->base + ARM_SMMU_ECMDQ_PROD); + writel_relaxed(q->llq.cons, ecmdq->base + ARM_SMMU_ECMDQ_CONS); + + /* enable ecmdq */ + writel(ECMDQ_PROD_EN | q->llq.prod, q->prod_reg); + ret = readl_relaxed_poll_timeout(q->cons_reg, reg, reg & ECMDQ_CONS_ENACK, + 1, ARM_SMMU_POLL_TIMEOUT_US); + if (ret) { + dev_err(smmu->dev, "ecmdq[%d] enable failed\n", i); + smmu->ecmdq_enabled = 0; + break; + } + } + + return ret; +} +#else +static int arm_smmu_ecmdq_reset(struct arm_smmu_device *smmu) +{ + return 0; +} +#endif + static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool resume) { int ret; @@ -3743,6 +3990,8 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool resume) writel_relaxed(smmu->cmdq.q.llq.prod, smmu->base + ARM_SMMU_CMDQ_PROD); writel_relaxed(smmu->cmdq.q.llq.cons, smmu->base + ARM_SMMU_CMDQ_CONS); + arm_smmu_ecmdq_reset(smmu); + enables = CR0_CMDQEN; ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0, ARM_SMMU_CR0ACK); @@ -3832,6 +4081,136 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool resume) return 0; } +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ +static int arm_smmu_ecmdq_layout(struct arm_smmu_device *smmu) +{ + int cpu, host_cpu; + struct arm_smmu_ecmdq *ecmdq; + + ecmdq = devm_alloc_percpu(smmu->dev, *ecmdq); + if (!ecmdq) + return -ENOMEM; + smmu->ecmdq = ecmdq; + + /* A core requires at most one ECMDQ */ + if (num_possible_cpus() < smmu->nr_ecmdq) + smmu->nr_ecmdq = num_possible_cpus(); + + for_each_possible_cpu(cpu) { + if (cpu < smmu->nr_ecmdq) { + *per_cpu_ptr(smmu->ecmdqs, cpu) = per_cpu_ptr(smmu->ecmdq, cpu); + } else { + host_cpu = cpu % smmu->nr_ecmdq; + ecmdq = per_cpu_ptr(smmu->ecmdq, host_cpu); + ecmdq->cmdq.shared = 1; + *per_cpu_ptr(smmu->ecmdqs, cpu) = ecmdq; + } + } + + return 0; +} + +static int arm_smmu_ecmdq_probe(struct arm_smmu_device *smmu) +{ + int ret, cpu; + u32 i, nump, numq, gap; + u32 reg, shift_increment; + u64 addr, smmu_dma_base; + void __iomem *cp_regs, *cp_base; + + /* IDR6 */ + reg = readl_relaxed(smmu->base + ARM_SMMU_IDR6); + nump = 1 << FIELD_GET(IDR6_LOG2NUMP, reg); + numq = 1 << FIELD_GET(IDR6_LOG2NUMQ, reg); + smmu->nr_ecmdq = nump * numq; + gap = ECMDQ_CP_RRESET_SIZE >> FIELD_GET(IDR6_LOG2NUMQ, reg); + if (!smmu->nr_ecmdq) + return -EOPNOTSUPP; + + smmu_dma_base = (vmalloc_to_pfn(smmu->base) << PAGE_SHIFT); + cp_regs = ioremap(smmu_dma_base + ARM_SMMU_ECMDQ_CP_BASE, PAGE_SIZE); + if (!cp_regs) + return -ENOMEM; + + for (i = 0; i < nump; i++) { + u64 val, pre_addr; + + val = readq_relaxed(cp_regs + 32 * i); + if (!(val & ECMDQ_CP_PRESET)) { + iounmap(cp_regs); + dev_err(smmu->dev, "ecmdq control page %u is memory mode\n", i); + return -EFAULT; + } + + if (i && ((val & ECMDQ_CP_ADDR) != (pre_addr + ECMDQ_CP_RRESET_SIZE))) { + iounmap(cp_regs); + dev_err(smmu->dev, "ecmdq_cp memory region is not contiguous\n"); + return -EFAULT; + } + + pre_addr = val & ECMDQ_CP_ADDR; + } + + addr = readl_relaxed(cp_regs) & ECMDQ_CP_ADDR; + iounmap(cp_regs); + + cp_base = devm_ioremap(smmu->dev, smmu_dma_base + addr, ECMDQ_CP_RRESET_SIZE * nump); + if (!cp_base) + return -ENOMEM; + + smmu->ecmdqs = devm_alloc_percpu(smmu->dev, struct arm_smmu_ecmdq *); + if (!smmu->ecmdqs) + return -ENOMEM; + + ret = arm_smmu_ecmdq_layout(smmu); + if (ret) + return ret; + + shift_increment = order_base_2(num_possible_cpus() / smmu->nr_ecmdq); + + addr = 0; + for_each_possible_cpu(cpu) { + struct arm_smmu_ecmdq *ecmdq; + struct arm_smmu_queue *q; + + ecmdq = *per_cpu_ptr(smmu->ecmdqs, cpu); + q = &ecmdq->cmdq.q; + + /* + * The boot option "maxcpus=" can limit the number of online + * CPUs. The CPUs that are not selected are not showed in + * cpumask_of_node(node), their 'ecmdq' may be NULL. + * + * (ecmdq != per_cpu_ptr(smmu->ecmdq, cpu)) indicates that the + * ECMDQ is shared by multiple cores and should be initialized + * only by the first owner. + */ + if (!ecmdq || (ecmdq != per_cpu_ptr(smmu->ecmdq, cpu))) + continue; + ecmdq->base = cp_base + addr; + + q->llq.max_n_shift = ECMDQ_MAX_SZ_SHIFT + shift_increment; + ret = arm_smmu_init_one_queue(smmu, q, ecmdq->base, ARM_SMMU_ECMDQ_PROD, + ARM_SMMU_ECMDQ_CONS, CMDQ_ENT_DWORDS, "ecmdq"); + if (ret) + return ret; + + q->ecmdq_prod = ECMDQ_PROD_EN; + rwlock_init(&q->ecmdq_lock); + + ret = arm_smmu_ecmdq_init(&ecmdq->cmdq); + if (ret) { + dev_err(smmu->dev, "ecmdq[%d] init failed\n", i); + return ret; + } + + addr += gap; + } + + return 0; +} +#endif + #define IIDR_IMPLEMENTER_ARM 0x43b #define IIDR_PRODUCTID_ARM_MMU_600 0x483 #define IIDR_PRODUCTID_ARM_MMU_700 0x487 @@ -4004,6 +4383,11 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) return -ENXIO; } +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ + if (reg & IDR1_ECMDQ) + smmu->features |= ARM_SMMU_FEAT_ECMDQ; +#endif + /* Queue sizes, capped to ensure natural alignment */ smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, FIELD_GET(IDR1_CMDQS, reg)); @@ -4124,6 +4508,18 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) dev_info(smmu->dev, "ias %lu-bit, oas %lu-bit (features 0x%08x)\n", smmu->ias, smmu->oas, smmu->features); + +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ + if (smmu->features & ARM_SMMU_FEAT_ECMDQ && !disable_ecmdq) { + int err; + + err = arm_smmu_ecmdq_probe(smmu); + if (err) { + dev_err(smmu->dev, "suppress ecmdq feature, errno=%d\n", err); + smmu->ecmdq_enabled = 0; + } + } +#endif return 0; } @@ -4216,8 +4612,70 @@ static void __iomem *arm_smmu_ioremap(struct device *dev, resource_size_t start, } #ifdef CONFIG_ARM_SMMU_V3_PM +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ +static int arm_smmu_ecmdq_disable(struct device *dev) +{ + int i, j; + int ret, nr_fail = 0, n = 100; + u32 reg, prod, cons; + struct arm_smmu_ecmdq *ecmdq; + struct arm_smmu_queue *q; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + + for (i = 0; i < smmu->nr_ecmdq; i++) { + ecmdq = *per_cpu_ptr(smmu->ecmdqs, i); + q = &ecmdq->cmdq.q; + + prod = readl_relaxed(q->prod_reg); + cons = readl_relaxed(q->cons_reg); + if ((prod & ECMDQ_PROD_EN) == 0) + continue; + + for (j = 0; j < n; j++) { + if (Q_IDX(&q->llq, prod) == Q_IDX(&q->llq, cons) && + Q_WRP(&q->llq, prod) == Q_WRP(&q->llq, cons)) + break; + + /* Wait a moment, so ECMDQ has a chance to finish */ + udelay(1); + cons = readl_relaxed(q->cons_reg); + } + WARN_ON(prod != readl_relaxed(q->prod_reg)); + if (j >= n) + dev_warn(smmu->dev, + "Forcibly disabling ecmdq[%d]: prod=%08x, cons=%08x\n", + i, prod, cons); + + /* disable ecmdq */ + prod &= ~ECMDQ_PROD_EN; + writel(prod, q->prod_reg); + ret = readl_relaxed_poll_timeout(q->cons_reg, reg, !(reg & ECMDQ_CONS_ENACK), + 1, ARM_SMMU_POLL_TIMEOUT_US); + if (ret) { + nr_fail++; + dev_err(smmu->dev, "ecmdq[%d] disable failed\n", i); + } + } + + if (nr_fail) { + smmu->ecmdq_enabled = 0; + pr_warn("Suppress ecmdq feature, switch to normal cmdq\n"); + return -EIO; + } + + return 0; +} +#else +static int arm_smmu_ecmdq_disable(struct device *dev) +{ + return 0; +} +#endif + static int arm_smmu_suspend(struct device *dev) { + arm_smmu_ecmdq_disable(dev); + /* * The smmu is powered off and related registers are automatically * cleared when suspend. No need to do anything. diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index cf8eb14d1a9cd0b158d15986824c94de5048f526..a6381d33647c76e5e0f7d0705a4451f082829d98 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -45,6 +45,7 @@ #define IDR0_S2P (1 << 0) #define ARM_SMMU_IDR1 0x4 +#define IDR1_ECMDQ (1 << 31) #define IDR1_TABLES_PRESET (1 << 30) #define IDR1_QUEUES_PRESET (1 << 29) #define IDR1_REL (1 << 28) @@ -121,6 +122,7 @@ #define ARM_SMMU_IRQ_CTRLACK 0x54 #define ARM_SMMU_GERROR 0x60 +#define GERROR_CMDQP_ERR (1 << 9) #define GERROR_SFM_ERR (1 << 8) #define GERROR_MSI_GERROR_ABT_ERR (1 << 7) #define GERROR_MSI_PRIQ_ABT_ERR (1 << 6) @@ -137,6 +139,26 @@ #define ARM_SMMU_GERROR_IRQ_CFG1 0x70 #define ARM_SMMU_GERROR_IRQ_CFG2 0x74 +#define ARM_SMMU_IDR6 0x190 +#define IDR6_LOG2NUMP GENMASK(27, 24) +#define IDR6_LOG2NUMQ GENMASK(19, 16) +#define IDR6_BA_DOORBELLS GENMASK(9, 0) + +#define ARM_SMMU_ECMDQ_BASE 0x00 +#define ARM_SMMU_ECMDQ_PROD 0x08 +#define ARM_SMMU_ECMDQ_CONS 0x0c +#define ECMDQ_MAX_SZ_SHIFT 8 +#define ECMDQ_PROD_EN (1 << 31) +#define ECMDQ_CONS_ENACK (1 << 31) +#define ECMDQ_CONS_ERR (1 << 23) +#define ECMDQ_PROD_ERRACK (1 << 23) + +#define ARM_SMMU_ECMDQ_CP_BASE 0x4000 +#define ECMDQ_CP_ADDR GENMASK_ULL(51, 12) +#define ECMDQ_CP_CMDQGS GENMASK_ULL(2, 1) +#define ECMDQ_CP_PRESET (1UL << 0) +#define ECMDQ_CP_RRESET_SIZE 0x10000 + #define ARM_SMMU_STRTAB_BASE 0x80 #define STRTAB_BASE_RA (1UL << 62) #define STRTAB_BASE_ADDR_MASK GENMASK_ULL(51, 6) @@ -538,6 +560,10 @@ struct arm_smmu_ll_queue { struct arm_smmu_queue { struct arm_smmu_ll_queue llq; int irq; /* Wired interrupt */ +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ + u32 ecmdq_prod; + rwlock_t ecmdq_lock; +#endif __le64 *base; dma_addr_t base_dma; @@ -561,8 +587,18 @@ struct arm_smmu_cmdq { atomic_long_t *valid_map; atomic_t owner_prod; atomic_t lock; +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ + int shared; +#endif }; +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ +struct arm_smmu_ecmdq { + struct arm_smmu_cmdq cmdq; + void __iomem *base; +}; +#endif + struct arm_smmu_cmdq_batch { u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS]; int num; @@ -661,6 +697,7 @@ struct arm_smmu_device { #define ARM_SMMU_FEAT_NESTING (1 << 21) #define ARM_SMMU_FEAT_BBML1 (1 << 22) #define ARM_SMMU_FEAT_BBML2 (1 << 23) +#define ARM_SMMU_FEAT_ECMDQ (1 << 24) u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) @@ -669,6 +706,16 @@ struct arm_smmu_device { #define ARM_SMMU_OPT_CMDQ_FORCE_SYNC (1 << 3) u32 options; +#ifdef CONFIG_ARM_SMMU_V3_ECMDQ + union { + u32 nr_ecmdq; + u32 ecmdq_enabled; + }; + + struct arm_smmu_ecmdq *__percpu *ecmdqs; + struct arm_smmu_ecmdq __percpu *ecmdq; +#endif + struct arm_smmu_cmdq cmdq; struct arm_smmu_evtq evtq; struct arm_smmu_priq priq;