From c3f9e46f96d83efbbade056f29f3ba75b7d9daa1 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 22 Oct 2025 20:12:33 +0800 Subject: [PATCH 1/2] anolis: nvme: introduce panic_on_double_cqe param ANBZ: #27022 Add a new debug switch to control whether to trigger a kernel crash when duplicate CQEs are detected, in order to preserve the kernel context, such as sq, cq, and so on, for subsequent debugging and analysis. Signed-off-by: Guixin Liu --- drivers/nvme/host/core.c | 5 +++++ drivers/nvme/host/nvme.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e359f841c07b..1ff34199efe7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -60,6 +60,11 @@ static bool streams; module_param(streams, bool, 0644); MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); +bool panic_on_double_cqe; +EXPORT_SYMBOL_GPL(panic_on_double_cqe); +module_param(panic_on_double_cqe, bool, 0444); +MODULE_PARM_DESC(panic_on_double_cqe, "crash the kernel to save the scene"); + /* * nvme_wq - hosts nvme related works that are not reset or delete * nvme_reset_wq - hosts nvme reset works diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6e4d385eea0c..c6425e7f5ba3 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -26,6 +26,8 @@ extern unsigned int nvme_io_timeout; extern unsigned int admin_timeout; #define ADMIN_TIMEOUT (admin_timeout * HZ) +extern bool panic_on_double_cqe; + #define NVME_DEFAULT_KATO 5 #define NVME_KATO_GRACE 10 @@ -536,12 +538,14 @@ static inline struct request *nvme_find_rq(struct blk_mq_tags *tags, if (unlikely(!rq)) { pr_err("could not locate request for tag %#x\n", tag); + BUG_ON(panic_on_double_cqe); return NULL; } if (unlikely(nvme_genctr_mask(nvme_req(rq)->genctr) != genctr)) { dev_err(nvme_req(rq)->ctrl->device, "request %#x genctr mismatch (got %#x expected %#x)\n", tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr)); + 
BUG_ON(panic_on_double_cqe); return NULL; } return rq; -- Gitee From a887936a4eebdb357cc0dd382bae6913288ffab6 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Tue, 4 Nov 2025 09:55:51 +0800 Subject: [PATCH 2/2] anolis: nvme: pci: invalidate next cmd when putting cmd to sq ANBZ: #27022 Invalidate the next cmd by: 1. setting opcode to 0x01. 2. setting prp1 to 0x1000. 3. setting slba to U64_MAX. This lets the Moc's TDC hold for debugging. Signed-off-by: Guixin Liu --- drivers/nvme/host/pci.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e95a458310ab..7c9306bd988b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -67,6 +67,12 @@ MODULE_PARM_DESC(sgl_threshold, "Use SGLs when average request segment size is larger or equal to " "this size. Use 0 to disable SGLs."); +static bool invalid_next_sqe; +module_param(invalid_next_sqe, bool, 0444); +MODULE_PARM_DESC(invalid_next_sqe, + "Invalid the next sqe, when target obtain sqe incorrectly, they " + "can discover and hold on for debugging."); + static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, @@ -524,11 +530,23 @@ static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq) static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, bool write_sq) { + void *cur; + struct nvme_command *next; + spin_lock(&nvmeq->sq_lock); - memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), - cmd, sizeof(*cmd)); + cur = nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; + + if (invalid_next_sqe && nvmeq->qid) { + next = (struct nvme_command *)(nvmeq->sq_cmds + + (nvmeq->sq_tail << nvmeq->sqes)); + next->rw.opcode = 0x01; + next->rw.dptr.prp1 = cpu_to_le64(0x1000); + next->rw.slba = cpu_to_le64(U64_MAX); + } + + memcpy(cur, cmd, 
sizeof(*cmd)); nvme_write_sq_db(nvmeq, write_sq); spin_unlock(&nvmeq->sq_lock); } -- Gitee