From a67ab0c665fc4efef85dadd98dac5418e86e84fb Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Sat, 3 Jun 2023 14:50:27 +0800
Subject: [PATCH 01/25] blk-iocost: don't allow to configure bio based device

hulk inclusion
category: bugfix
bugzilla: 188033, https://gitee.com/openeuler/kernel/issues/I663ZP
CVE: NA

--------------------------------

iocost is based on rq_qos, which can only work for request based device,
thus it doesn't make sense to configure iocost for bio based device.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit e11c64a9d36c54cf6d8103eb56bd385e9e791f3e)
---
 block/blk-iocost.c | 8 ++++++++
 1 file changed, 8 insertions(+)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 2c30aae1a664..ea1fac82807c 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3166,6 +3166,10 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 	disk = blkcg_conf_get_disk(&input);
 	if (IS_ERR(disk))
 		return PTR_ERR(disk);
+	if (!queue_is_mq(disk->queue)) {
+		ret = -EOPNOTSUPP;
+		goto err;
+	}
 
 	ioc = q_to_ioc(disk->queue);
 	if (!ioc) {
@@ -3333,6 +3337,10 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 	disk = blkcg_conf_get_disk(&input);
 	if (IS_ERR(disk))
 		return PTR_ERR(disk);
+	if (!queue_is_mq(disk->queue)) {
+		ret = -EOPNOTSUPP;
+		goto err;
+	}
 
 	ioc = q_to_ioc(disk->queue);
 	if (!ioc) {
-- 
Gitee


From fe2e495677ba4d474457f9ab0c4276b184275d3f Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:28 +0800
Subject: [PATCH 02/25] blk-iocost: use spin_lock_irqsave in
 adjust_inuse_and_calc_cost

hulk inclusion
category: bugfix
bugzilla: 188152, https://gitee.com/openeuler/kernel/issues/I67BPT
CVE: NA

-------------------------------

adjust_inuse_and_calc_cost() uses spin_lock_irq(), so IRQs are
unconditionally enabled on unlock. A deadlock might happen if we already
hold other locks when it is called:

  ================================
  WARNING: inconsistent lock state
  5.10.0-02758-g8e5f91fd772f #26 Not tainted
  --------------------------------
  inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
  kworker/2:3/388 [HC0[0]:SC0[0]:HE0:SE1] takes:
  ffff888118c00c28 (&bfqd->lock){?.-.}-{2:2}, at: spin_lock_irq
  ffff888118c00c28 (&bfqd->lock){?.-.}-{2:2}, at: bfq_bio_merge+0x141/0x390
  {IN-HARDIRQ-W} state was registered at:
    __lock_acquire+0x3d7/0x1070
    lock_acquire+0x197/0x4a0
    __raw_spin_lock_irqsave
    _raw_spin_lock_irqsave+0x3b/0x60
    bfq_idle_slice_timer_body
    bfq_idle_slice_timer+0x53/0x1d0
    __run_hrtimer+0x477/0xa70
    __hrtimer_run_queues+0x1c6/0x2d0
    hrtimer_interrupt+0x302/0x9e0
    local_apic_timer_interrupt
    __sysvec_apic_timer_interrupt+0xfd/0x420
    run_sysvec_on_irqstack_cond
    sysvec_apic_timer_interrupt+0x46/0xa0
    asm_sysvec_apic_timer_interrupt+0x12/0x20
  irq event stamp: 837522
  hardirqs last  enabled at (837521): [<ffffffff84b9419d>] __raw_spin_unlock_irqrestore
  hardirqs last  enabled at (837521): [<ffffffff84b9419d>] _raw_spin_unlock_irqrestore+0x3d/0x40
  hardirqs last disabled at (837522): [<ffffffff84b93fa3>] __raw_spin_lock_irq
  hardirqs last disabled at (837522): [<ffffffff84b93fa3>] _raw_spin_lock_irq+0x43/0x50
  softirqs last  enabled at (835852): [<ffffffff84e00558>] __do_softirq+0x558/0x8ec
  softirqs last disabled at (835845): [<ffffffff84c010ff>] asm_call_irq_on_stack+0xf/0x20

  other info that might help us debug this:
   Possible unsafe locking scenario:

         CPU0
         ----
    lock(&bfqd->lock);
    <Interrupt>
      lock(&bfqd->lock);

   *** DEADLOCK ***

  3 locks held by kworker/2:3/388:
   #0: ffff888107af0f38 ((wq_completion)kthrotld){+.+.}-{0:0}, at: process_one_work+0x742/0x13f0
   #1: ffff8881176bfdd8 ((work_completion)(&td->dispatch_work)){+.+.}-{0:0}, at: process_one_work+0x777/0x13f0
   #2: ffff888118c00c28 (&bfqd->lock){?.-.}-{2:2}, at: spin_lock_irq
   #2: ffff888118c00c28 (&bfqd->lock){?.-.}-{2:2}, at: bfq_bio_merge+0x141/0x390

  stack backtrace:
  CPU: 2 PID: 388 Comm: kworker/2:3 Not tainted 5.10.0-02758-g8e5f91fd772f #26
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
  Workqueue: kthrotld blk_throtl_dispatch_work_fn
  Call Trace:
   __dump_stack lib/dump_stack.c:77 [inline]
   dump_stack+0x107/0x167
   print_usage_bug
   valid_state
   mark_lock_irq.cold+0x32/0x3a
   mark_lock+0x693/0xbc0
   mark_held_locks+0x9e/0xe0
   __trace_hardirqs_on_caller
   lockdep_hardirqs_on_prepare.part.0+0x151/0x360
   trace_hardirqs_on+0x5b/0x180
   __raw_spin_unlock_irq
   _raw_spin_unlock_irq+0x24/0x40
   spin_unlock_irq
   adjust_inuse_and_calc_cost+0x4fb/0x970
   ioc_rqos_merge+0x277/0x740
   __rq_qos_merge+0x62/0xb0
   rq_qos_merge
   bio_attempt_back_merge+0x12c/0x4a0
   blk_mq_sched_try_merge+0x1b6/0x4d0
   bfq_bio_merge+0x24a/0x390
   __blk_mq_sched_bio_merge+0xa6/0x460
   blk_mq_sched_bio_merge
   blk_mq_submit_bio+0x2e7/0x1ee0
   __submit_bio_noacct_mq+0x175/0x3b0
   submit_bio_noacct+0x1fb/0x270
   blk_throtl_dispatch_work_fn+0x1ef/0x2b0
   process_one_work+0x83e/0x13f0
   process_scheduled_works
   worker_thread+0x7e3/0xd80
   kthread+0x353/0x470
   ret_from_fork+0x1f/0x30

Fixes: b0853ab4a238 ("blk-iocost: revamp in-period donation snapbacks")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 60e8843caffcd1cef0e82be9aac917512163c1ca)
---
 block/blk-iocost.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index ea1fac82807c..3f2c670f7c99 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2414,6 +2414,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
 	u32 hwi, adj_step;
 	s64 margin;
 	u64 cost, new_inuse;
+	unsigned long flags;
 
 	current_hweight(iocg, NULL, &hwi);
 	old_hwi = hwi;
@@ -2432,11 +2433,11 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
 	    iocg->inuse == iocg->active)
 		return cost;
 
-	spin_lock_irq(&ioc->lock);
+	spin_lock_irqsave(&ioc->lock, flags);
 
 	/* we own inuse only when @iocg is in the normal active state */
 	if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
-		spin_unlock_irq(&ioc->lock);
+		spin_unlock_irqrestore(&ioc->lock, flags);
 		return cost;
 	}
 
@@ -2457,7 +2458,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
 	} while (time_after64(vtime + cost, now->vnow) &&
 		 iocg->inuse != iocg->active);
 
-	spin_unlock_irq(&ioc->lock);
+	spin_unlock_irqrestore(&ioc->lock, flags);
 
 	TRACE_IOCG_PATH(inuse_adjust, iocg, now,
 			old_inuse, iocg->inuse, old_hwi, hwi);
-- 
Gitee


From 12a4bf85848d92e2e6e5dab0a1fe4f8b2ddfe486 Mon Sep 17 00:00:00 2001
From: Jinke Han <hanjinke.666@bytedance.com>
Date: Sat, 3 Jun 2023 14:50:29 +0800
Subject: [PATCH 03/25] block: don't allow the same type rq_qos add more than
 once

mainline inclusion
from mainline-v6.0-rc1
commit 14a6e2eb7df5c7897c15b109cba29ab0c4a791b6
category: bugfix
bugzilla: 188088, https://gitee.com/openeuler/kernel/issues/I66GIL
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=14a6e2eb7df5c7897c15b109cba29ab0c4a791b6

----------------------------------------------------------------------

In our test of iocost, we encountered some list add/del corruptions of
inner_walk list in ioc_timer_fn.

The reason can be described as follows:

cpu 0					cpu 1
ioc_qos_write				ioc_qos_write

ioc = q_to_ioc(queue);
if (!ioc) {
        ioc = kzalloc();
					ioc = q_to_ioc(queue);
					if (!ioc) {
						ioc = kzalloc();
						...
						rq_qos_add(q, rqos);
					}
        ...
        rq_qos_add(q, rqos);
        ...
}

When the io.cost.qos file is written by two cpus concurrently, rq_qos may
be added to one disk twice. In that case, there will be two iocs enabled
and running on one disk. They own different iocgs on their active list. In
the ioc_timer_fn function, because the iocgs from the two iocs share the
same root iocg, the root iocg's walk_list may be overwritten by either of
them, and this leads to list add/del corruptions when building or
destroying the inner_walk list.

So far, the blk-rq-qos framework has assumed by default that there is only
one instance of each rq_qos type per queue. This patch makes this explicit
and also fixes the crash above.

Signed-off-by: Jinke Han <hanjinke.666@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/r/20220720093616.70584-1-hanjinke.666@bytedance.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Conflicts:
	block/blk-rq-qos.h
	block/blk-wbt.c

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 8ce9b7c45a833430a904922d586dd5f7f523166f)
---
 block/blk-iocost.c    | 20 +++++++++++++-------
 block/blk-iolatency.c | 18 +++++++++++-------
 block/blk-rq-qos.h    | 11 ++++++++++-
 block/blk-wbt.c       | 12 +++++++++++-
 4 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 3f2c670f7c99..4738cd314106 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2874,15 +2874,21 @@ static int blk_iocost_init(struct request_queue *q)
 	 * called before policy activation completion, can't assume that the
 	 * target bio has an iocg associated and need to test for NULL iocg.
 	 */
-	rq_qos_add(q, rqos);
+	ret = rq_qos_add(q, rqos);
+	if (ret)
+		goto err_free_ioc;
+
 	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
-	if (ret) {
-		rq_qos_del(q, rqos);
-		free_percpu(ioc->pcpu_stat);
-		kfree(ioc);
-		return ret;
-	}
+	if (ret)
+		goto err_del_qos;
 	return 0;
+
+err_del_qos:
+	rq_qos_del(q, rqos);
+err_free_ioc:
+	free_percpu(ioc->pcpu_stat);
+	kfree(ioc);
+	return ret;
 }
 
 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 74511a060d59..9811ee74b69f 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -772,19 +772,23 @@ int blk_iolatency_init(struct request_queue *q)
 	rqos->ops = &blkcg_iolatency_ops;
 	rqos->q = q;
 
-	rq_qos_add(q, rqos);
-
+	ret = rq_qos_add(q, rqos);
+	if (ret)
+		goto err_free;
 	ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
-	if (ret) {
-		rq_qos_del(q, rqos);
-		kfree(blkiolat);
-		return ret;
-	}
+	if (ret)
+		goto err_qos_del;
 
 	timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
 	INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn);
 
 	return 0;
+
+err_qos_del:
+	rq_qos_del(q, rqos);
+err_free:
+	kfree(blkiolat);
+	return ret;
 }
 
 static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 2bcb3495e376..37c59d7d6ba7 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -98,7 +98,7 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
 	init_waitqueue_head(&rq_wait->wait);
 }
 
-static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
+static inline int rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
 {
 	/*
 	 * No IO can be in-flight when adding rqos, so freeze queue, which
@@ -110,6 +110,8 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
 	blk_mq_freeze_queue(q);
 
 	spin_lock_irq(&q->queue_lock);
+	if (rq_qos_id(q, rqos->id))
+		goto ebusy;
 	rqos->next = q->rq_qos;
 	q->rq_qos = rqos;
 	spin_unlock_irq(&q->queue_lock);
@@ -118,6 +120,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
 
 	if (rqos->ops->debugfs_attrs)
 		blk_mq_debugfs_register_rqos(rqos);
+
+	return 0;
+ebusy:
+	spin_unlock_irq(&q->queue_lock);
+	blk_mq_unfreeze_queue(q);
+	return -EBUSY;
+
 }
 
 static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 6f63920f073c..28eb25b947cd 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -818,6 +818,7 @@ int wbt_init(struct request_queue *q)
 {
 	struct rq_wb *rwb;
 	int i;
+	int ret;
 
 	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
 	if (!rwb)
@@ -847,8 +848,17 @@ int wbt_init(struct request_queue *q)
 	/*
 	 * Assign rwb and add the stats callback.
 	 */
-	rq_qos_add(q, &rwb->rqos);
+	ret = rq_qos_add(q, &rwb->rqos);
+	if (ret)
+		goto err_free;
+
 	blk_stat_add_callback(q, rwb->cb);
 
 	return 0;
+
+err_free:
+	blk_stat_free_callback(rwb->cb);
+	kfree(rwb);
+	return ret;
+
 }
-- 
Gitee


From 40125ec77eb3a889c09939918c5d97a8e861a772 Mon Sep 17 00:00:00 2001
From: David Sloan <david.sloan@eideticom.com>
Date: Sat, 3 Jun 2023 14:50:30 +0800
Subject: [PATCH 04/25] md: Flush workqueue md_rdev_misc_wq in md_alloc()

mainline inclusion
from mainline-v6.0-rc3
commit 5e8daf906f890560df430d30617c692a794acb73
category: bugfix
bugzilla: 188015, https://gitee.com/openeuler/kernel/issues/I6OERX
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=5e8daf906f890560df430d30617c692a794acb73

--------------------------------

A race condition still exists when removing and re-creating md devices
in test cases. However, it is only seen on some setups.

The race condition was tracked down to a reference still being held
to the kobject by the rdev in the md_rdev_misc_wq which will be released
in rdev_delayed_delete().

md_alloc() waits for previous deletions by waiting on the md_misc_wq,
but the md_rdev_misc_wq may still be holding a reference to a recently
removed device.

To fix this, also flush the md_rdev_misc_wq in md_alloc().

Signed-off-by: David Sloan <david.sloan@eideticom.com>
[logang@deltatee.com: rewrote commit message]
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Song Liu <song@kernel.org>

Conflict:
	drivers/md/md.c

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 5fa419171f34bdf4966a60fdfd79ecdf1fa848d4)
---
 drivers/md/md.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 61f68689ddfd..6980f8b207c5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5743,6 +5743,7 @@ static int md_alloc(dev_t dev, char *name)
 	 * completely removed (mddev_delayed_delete).
 	 */
 	flush_workqueue(md_misc_wq);
+	flush_workqueue(md_rdev_misc_wq);
 
 	mutex_lock(&disks_mutex);
 	error = -EEXIST;
-- 
Gitee


From 459c6a0a745a7ffe7ea63eb56e4f5595917e9c28 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:31 +0800
Subject: [PATCH 05/25] md: replace invalid function flush_rdev_wq() with
 flush_workqueue()

hulk inclusion
category: bugfix
bugzilla: 188553, https://gitee.com/openeuler/kernel/issues/I6TNFX
CVE: NA

--------------------------------

If we want to remove a device, first we delete it from mddev->disks list,
then init rdev->del_work to put it (see unbind_rdev_from_array()).

flush_rdev_wq() traverses mddev->disks to check if there is any pending
rdev->del_work, if so, flush it. However, rdev will not be in the list of
mddev->disks if rdev->del_work exists, and flush_workqueue() will never be
executed.

Replace it with flush_workqueue() to ensure del_work has been completed
when adding devices.

Fixes: cc1ffe61c026 ("md: add new workqueue for delete rdev")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit ff461e2dbd2b4362a5fea8378289cfbfff33c967)
---
 drivers/md/md.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6980f8b207c5..9c96ea461e37 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4592,20 +4592,6 @@ null_show(struct mddev *mddev, char *page)
 	return -EINVAL;
 }
 
-/* need to ensure rdev_delayed_delete() has completed */
-static void flush_rdev_wq(struct mddev *mddev)
-{
-	struct md_rdev *rdev;
-
-	rcu_read_lock();
-	rdev_for_each_rcu(rdev, mddev)
-		if (work_pending(&rdev->del_work)) {
-			flush_workqueue(md_rdev_misc_wq);
-			break;
-		}
-	rcu_read_unlock();
-}
-
 static ssize_t
 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
 {
@@ -4633,7 +4619,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
 	    minor != MINOR(dev))
 		return -EOVERFLOW;
 
-	flush_rdev_wq(mddev);
+	flush_workqueue(md_rdev_misc_wq);
 	err = mddev_lock(mddev);
 	if (err)
 		return err;
@@ -7647,7 +7633,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	}
 
 	if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
-		flush_rdev_wq(mddev);
+		flush_workqueue(md_rdev_misc_wq);
 
 	if (cmd == HOT_REMOVE_DISK)
 		/* need to ensure recovery thread has run */
-- 
Gitee


From 6794045a68a99353a92048e98541449b22d6d6b5 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:32 +0800
Subject: [PATCH 06/25] md: fix sysfs duplicate file while adding rdev

hulk inclusion
category: bugfix
bugzilla: 188553, https://gitee.com/openeuler/kernel/issues/I6TNFX
CVE: NA

--------------------------------

If two threads add and remove the same device concurrently,
rdev->del_work may not have been queued to md_rdev_misc_wq yet when
flush_workqueue() runs, so it will not be flushed. sysfs might then WARN
about a duplicate filename as below.

    //T1	             //T2
    mdadm write super
			     add success
			     remove
			      unbind_rdev_from_array

    md_ioctl
     flush_workqueue
			      INIT_WORK
                               queue_work
     md_add_new_disk
      duplicate filename dev-xxx

Check if there is any kobj with the same name, and return busy if true.

Fixes: 5792a2856a63 ("md: avoid a deadlock when removing a device from an md array via sysfs")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 5815341fde0f9fbbe1ec5bb9d4aaffbeb61c72c3)
---
 drivers/md/md.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9c96ea461e37..4d744cf8ffc6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2402,8 +2402,9 @@ EXPORT_SYMBOL(md_integrity_add_rdev);
 
 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 {
-	char b[BDEVNAME_SIZE];
+	char b[BDEVNAME_SIZE + 4];
 	struct kobject *ko;
+	struct kernfs_node *sysfs_rdev;
 	int err;
 
 	/* prevent duplicates */
@@ -2454,7 +2455,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 			mdname(mddev), mddev->max_disks);
 		return -EBUSY;
 	}
-	bdevname(rdev->bdev,b);
+	memcpy(b, "dev-", 4);
+	bdevname(rdev->bdev, b + 4);
 	strreplace(b, '/', '!');
 
 	rdev->mddev = mddev;
@@ -2463,7 +2465,15 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 	if (mddev->raid_disks)
 		mddev_create_serial_pool(mddev, rdev, false);
 
-	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
+	sysfs_rdev = sysfs_get_dirent_safe(mddev->kobj.sd, b);
+	if (sysfs_rdev) {
+		sysfs_put(sysfs_rdev);
+		err = -EBUSY;
+		goto fail;
+	}
+
+	err = kobject_add(&rdev->kobj, &mddev->kobj, b);
+	if (err)
 		goto fail;
 
 	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
@@ -2484,7 +2494,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 	return 0;
 
  fail:
-	pr_warn("md: failed to register dev-%s for %s\n",
+	pr_warn("md: failed to register %s for %s\n",
 		b, mdname(mddev));
 	return err;
 }
-- 
Gitee


From 9205a5c874fde038413b1c8028139753fa4375b5 Mon Sep 17 00:00:00 2001
From: Jiang Li <jiang.li@ugreen.com>
Date: Sat, 3 Jun 2023 14:50:33 +0800
Subject: [PATCH 07/25] md/raid1: stop mdx_raid1 thread when raid1 array run
 failed

mainline inclusion
from mainline-v6.2-rc1
commit b611ad14006e5be2170d9e8e611bf49dff288911
category: bugfix
bugzilla: 188662, https://gitee.com/openeuler/kernel/issues/I6UMUF
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=b611ad14006e5be2170d9e8e611bf49dff288911

--------------------------------

Running a raid1 array fails when we assemble the array with only an
inactive disk, but the mdx_raid1 thread is not stopped even though the
associated resources have been released. This causes a NULL pointer
dereference when we power off.

This causes the following Oops:
    [  287.587787] BUG: kernel NULL pointer dereference, address: 0000000000000070
    [  287.594762] #PF: supervisor read access in kernel mode
    [  287.599912] #PF: error_code(0x0000) - not-present page
    [  287.605061] PGD 0 P4D 0
    [  287.607612] Oops: 0000 [#1] SMP NOPTI
    [  287.611287] CPU: 3 PID: 5265 Comm: md0_raid1 Tainted: G     U            5.10.146 #0
    [  287.619029] Hardware name: xxxxxxx/To be filled by O.E.M, BIOS 5.19 06/16/2022
    [  287.626775] RIP: 0010:md_check_recovery+0x57/0x500 [md_mod]
    [  287.632357] Code: fe 01 00 00 48 83 bb 10 03 00 00 00 74 08 48 89 ......
    [  287.651118] RSP: 0018:ffffc90000433d78 EFLAGS: 00010202
    [  287.656347] RAX: 0000000000000000 RBX: ffff888105986800 RCX: 0000000000000000
    [  287.663491] RDX: ffffc90000433bb0 RSI: 00000000ffffefff RDI: ffff888105986800
    [  287.670634] RBP: ffffc90000433da0 R08: 0000000000000000 R09: c0000000ffffefff
    [  287.677771] R10: 0000000000000001 R11: ffffc90000433ba8 R12: ffff888105986800
    [  287.684907] R13: 0000000000000000 R14: fffffffffffffe00 R15: ffff888100b6b500
    [  287.692052] FS:  0000000000000000(0000) GS:ffff888277f80000(0000) knlGS:0000000000000000
    [  287.700149] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    [  287.705897] CR2: 0000000000000070 CR3: 000000000320a000 CR4: 0000000000350ee0
    [  287.713033] Call Trace:
    [  287.715498]  raid1d+0x6c/0xbbb [raid1]
    [  287.719256]  ? __schedule+0x1ff/0x760
    [  287.722930]  ? schedule+0x3b/0xb0
    [  287.726260]  ? schedule_timeout+0x1ed/0x290
    [  287.730456]  ? __switch_to+0x11f/0x400
    [  287.734219]  md_thread+0xe9/0x140 [md_mod]
    [  287.738328]  ? md_thread+0xe9/0x140 [md_mod]
    [  287.742601]  ? wait_woken+0x80/0x80
    [  287.746097]  ? md_register_thread+0xe0/0xe0 [md_mod]
    [  287.751064]  kthread+0x11a/0x140
    [  287.754300]  ? kthread_park+0x90/0x90
    [  287.757974]  ret_from_fork+0x1f/0x30

In fact, when raid1 array run fail, we need to do
md_unregister_thread() before raid1_free().

Signed-off-by: Jiang Li <jiang.li@ugreen.com>
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 22eeb5d1f2f6f33be7fbc1a83e4a0f6e86c438b9)
---
 drivers/md/raid1.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8780c95f9b86..00915e6ec410 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3158,6 +3158,7 @@ static int raid1_run(struct mddev *mddev)
 	 * RAID1 needs at least one disk in active
 	 */
 	if (conf->raid_disks - mddev->degraded < 1) {
+		md_unregister_thread(&conf->thread);
 		ret = -EINVAL;
 		goto abort;
 	}
-- 
Gitee


From e7725377d66b9a2fbe0e998b415da82df0a485a9 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:34 +0800
Subject: [PATCH 08/25] md/raid10: fix softlockup in raid10_unplug

hulk inclusion
category: bugfix
bugzilla: 188628, https://gitee.com/openeuler/kernel/issues/I6WKDR
CVE: NA

--------------------------------

There is no limit to the number of io for raid10 plug, which may result
in excessive memory usage and a potential softlockup when a large number
of io are submitted at once. There is no good way to fix it now, so just
add a schedule point to prevent the softlockup.

Fixes: 57c67df48866 ("md/raid10: submit IO from originating thread instead of md thread.")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit f8cecf7a7767753e24280343b4a8acccba06cc60)
---
 drivers/md/raid10.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 04869394e345..ba136d4c6b89 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -896,6 +896,7 @@ static void flush_pending_writes(struct r10conf *conf)
 			else
 				submit_bio_noacct(bio);
 			bio = next;
+			cond_resched();
 		}
 		blk_finish_plug(&plug);
 	} else
@@ -1089,6 +1090,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		else
 			submit_bio_noacct(bio);
 		bio = next;
+		cond_resched();
 	}
 	kfree(plug);
 }
-- 
Gitee


From afdec8215131086dc21fb64d0869c06ae8355b57 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Sat, 3 Jun 2023 14:50:35 +0800
Subject: [PATCH 09/25] md/raid10: factor out code from wait_barrier() to
 stop_waiting_barrier()

mainline inclusion
from mainline-v6.1-rc1
commit ed2e063f92c44c891ccd883e289dde6ca870edcc
category: bugfix
bugzilla: 188380, https://gitee.com/openeuler/kernel/issues/I6GISC
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=ed2e063f92c44c891ccd883e289dde6ca870edcc

--------------------------------

Currently the nasty condition in wait_barrier() is hard to read. This
patch factors out the condition into a function.

There are no functional changes.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Acked-by: Paul Menzel <pmenzel@molgen.mpg.de>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Acked-by: Guoqing Jiang <guoqing.jiang@linux.dev>
Signed-off-by: Song Liu <song@kernel.org>

conflict:
	drivers/md/raid10.c

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 7aad54e07ac9445466d48d3555c9de35c8349d66)
---
 drivers/md/raid10.c | 49 +++++++++++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ba136d4c6b89..74a4ba9f9cee 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -953,36 +953,41 @@ static void lower_barrier(struct r10conf *conf)
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
+static bool stop_waiting_barrier(struct r10conf *conf)
+{
+	struct bio_list *bio_list = current->bio_list;
+
+	/* barrier is dropped */
+	if (!conf->barrier)
+		return true;
+
+	/*
+	 * If there are already pending requests (preventing the barrier from
+	 * rising completely), and the pre-process bio queue isn't empty, then
+	 * don't wait, as we need to empty that queue to get the nr_pending
+	 * count down.
+	 */
+	if (atomic_read(&conf->nr_pending) && bio_list &&
+	    (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
+		return true;
+
+	/* move on if recovery thread is blocked by us */
+	if (conf->mddev->thread->tsk == current &&
+	    test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) &&
+	    conf->nr_queued > 0)
+		return true;
+
+	return false;
+}
 
 static void wait_barrier(struct r10conf *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
 	if (conf->barrier) {
-		struct bio_list *bio_list = current->bio_list;
 		conf->nr_waiting++;
-		/* Wait for the barrier to drop.
-		 * However if there are already pending
-		 * requests (preventing the barrier from
-		 * rising completely), and the
-		 * pre-process bio queue isn't empty,
-		 * then don't wait, as we need to empty
-		 * that queue to get the nr_pending
-		 * count down.
-		 */
 		raid10_log(conf->mddev, "wait barrier");
 		wait_event_lock_irq(conf->wait_barrier,
-				    !conf->barrier ||
-				    (atomic_read(&conf->nr_pending) &&
-				     bio_list &&
-				     (!bio_list_empty(&bio_list[0]) ||
-				      !bio_list_empty(&bio_list[1]))) ||
-				     /* move on if recovery thread is
-				      * blocked by us
-				      */
-				     (conf->mddev->thread->tsk == current &&
-				      test_bit(MD_RECOVERY_RUNNING,
-					       &conf->mddev->recovery) &&
-				      conf->nr_queued > 0),
+				    stop_waiting_barrier(conf),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
-- 
Gitee


From 4727ec78057711989d46e0a678c61fb46ee505a9 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:36 +0800
Subject: [PATCH 10/25] md/raid10: fix task hung in raid10d

hulk inclusion
category: bugfix
bugzilla: 188380, https://gitee.com/openeuler/kernel/issues/I6GISC
CVE: NA

--------------------------------

commit fe630de009d0 ("md/raid10: avoid deadlock on recovery.") allowed
normal io and sync io to exist at the same time. Task hung will occur as
below:

T1                      T2		T3		T4
raid10d
 handle_read_error
  allow_barrier
   conf->nr_pending--
    -> 0
                        //submit sync io
                        raid10_sync_request
                         raise_barrier
			  ->will not be blocked
			  ...
			//submit to drivers
  raid10_read_request
   wait_barrier
    conf->nr_pending++
     -> 1
					//retry read fail
					raid10_end_read_request
					 reschedule_retry
					  add to retry_list
					  conf->nr_queued++
					   -> 1
							//sync io fail
							end_sync_read
							 __end_sync_read
							  reschedule_retry
							   add to retry_list
					                    conf->nr_queued++
							     -> 2
 ...
 handle_read_error
  freeze_array
   wait nr_pending == nr_queued+1
        ->1	      ->3
   //task hung

The retried read and the sync io will both be added to retry_list
(nr_queued -> 2) if they fail. raid10d() then calls handle_read_error()
and hangs in freeze_array(): nr_queued will not decrease because raid10d
is blocked, and nr_pending will not increase because conf->barrier is
not released.

Fix it by moving allow_barrier() after raid10_read_request().
raise_barrier() will wait for nr_waiting to become 0. Therefore, sync io
and regular io will not be issued at the same time.

Also remove the check of nr_queued: it can be 0, but the io does not need
to be blocked in that case. The MD_RECOVERY_RUNNING check is removed too,
since the flag is always set here after this patch, because all sync io is
waiting in raise_barrier().

Fixes: fe630de009d0 ("md/raid10: avoid deadlock on recovery.")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 1fe782f0126cb4cb0b4e8d87b5c9458d1640698e)
---
 drivers/md/raid10.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 74a4ba9f9cee..4bdf14f459fe 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -971,11 +971,15 @@ static bool stop_waiting_barrier(struct r10conf *conf)
 	    (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
 		return true;
 
-	/* move on if recovery thread is blocked by us */
-	if (conf->mddev->thread->tsk == current &&
-	    test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) &&
-	    conf->nr_queued > 0)
+	/*
+	 * move on if io is issued from raid10d(), nr_pending is not released
+	 * from original io(see handle_read_error()). All raise barrier is
+	 * blocked until this io is done.
+	 */
+	if (conf->mddev->thread->tsk == current) {
+		WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0);
 		return true;
+	}
 
 	return false;
 }
@@ -2605,9 +2609,13 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 		md_error(mddev, rdev);
 
 	rdev_dec_pending(rdev, mddev);
-	allow_barrier(conf);
 	r10_bio->state = 0;
 	raid10_read_request(mddev, r10_bio->master_bio, r10_bio, true);
+	/*
+	 * allow_barrier after re-submit to ensure no sync io
+	 * can be issued while regular io pending.
+	 */
+	allow_barrier(conf);
 }
 
 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
-- 
Gitee


From a58a60a1f3ce182a24ee22de6c69fbedf1a1d87b Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:37 +0800
Subject: [PATCH 11/25] md/raid10: fix a race between removing rdev and access
 conf->mirrors[i].rdev

hulk inclusion
category: bugfix
bugzilla: 188533, https://gitee.com/openeuler/kernel/issues/I6O7YB
CVE: NA

--------------------------------

Commit ceff49d9cb24 ("md/raid1: fix a race between removing rdev and
access conf->mirrors[i].rdev") fixed a null-ptr-deref in raid1. The
same bug exists in raid10; fix it in the same way.

There is no sync_thread running while removing rdev, so there is no need
to check the flag in raid10_sync_request().

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 4461a62eae3d96f63ee024a28f6b5af6bf40e60c)
---
 drivers/md/raid10.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 4bdf14f459fe..e298903aa72f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -753,9 +753,11 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 		disk = r10_bio->devs[slot].devnum;
 		rdev = rcu_dereference(conf->mirrors[disk].replacement);
 		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
+		    test_bit(WantRemove, &rdev->flags) ||
 		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
 			rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (rdev == NULL ||
+		    test_bit(WantRemove, &rdev->flags) ||
 		    test_bit(Faulty, &rdev->flags))
 			continue;
 		if (!test_bit(In_sync, &rdev->flags) &&
@@ -1376,9 +1378,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 			blocked_rdev = rrdev;
 			break;
 		}
-		if (rdev && (test_bit(Faulty, &rdev->flags)))
+		if (rdev && (test_bit(Faulty, &rdev->flags) ||
+		    test_bit(WantRemove, &rdev->flags)))
 			rdev = NULL;
-		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
+		if (rrdev && (test_bit(Faulty, &rrdev->flags) ||
+		    test_bit(WantRemove, &rrdev->flags)))
 			rrdev = NULL;
 
 		r10_bio->devs[i].bio = NULL;
@@ -1790,6 +1794,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 				continue;
 			clear_bit(In_sync, &rdev->flags);
 			set_bit(Replacement, &rdev->flags);
+			clear_bit(WantRemove, &rdev->flags);
 			rdev->raid_disk = mirror;
 			err = 0;
 			if (mddev->gendisk)
@@ -1807,6 +1812,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		p->head_position = 0;
 		p->recovery_disabled = mddev->recovery_disabled - 1;
 		rdev->raid_disk = mirror;
+		clear_bit(WantRemove, &rdev->flags);
 		err = 0;
 		if (rdev->saved_raid_disk != mirror)
 			conf->fullsync = 1;
@@ -1855,16 +1861,22 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		err = -EBUSY;
 		goto abort;
 	}
-	*rdevp = NULL;
+	/*
+	 * Before set p->rdev = NULL, we set WantRemove bit avoiding
+	 * race between rdev remove and issue bio, which can cause
+	 * NULL pointer deference of rdev by conf->mirrors[i].rdev.
+	 */
+	set_bit(WantRemove, &rdev->flags);
 	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
 		synchronize_rcu();
 		if (atomic_read(&rdev->nr_pending)) {
 			/* lost the race, try later */
 			err = -EBUSY;
-			*rdevp = rdev;
+			clear_bit(WantRemove, &rdev->flags);
 			goto abort;
 		}
 	}
+	*rdevp = NULL;
 	if (p->replacement) {
 		/* We must have just cleared 'rdev' */
 		p->rdev = p->replacement;
-- 
Gitee


From f6175b417625899f3da295a91fe74b6c70d82f70 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:38 +0800
Subject: [PATCH 12/25] md: fix io loss when remove rdev fail

hulk inclusion
category: bugfix, https://gitee.com/openeuler/kernel/issues/I71EKW
bugzilla: 188628
CVE: NA

--------------------------------

We first set WantRemove on rdev, then check whether there is any io
pending. If so, we clear the flag and return busy in
raid10_remove_disk(). io will be lost as below:

  raid10_remove_disk
   set WantRemove
			write rdev
			 if WantRemove
			  do not submit io
   if rdev->nr_pending
    clear WantRemove
    return BUSY
					read rdev
					 get error data

Fix it by calling md_error() on the rdev that still has io pending while
being removed. When the code reaches this point, this rdev will be
removed later anyway, so setting it as faulty has little impact.

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 894f89faa888615a5d5fd5e9be987a8cae39e00e)
---
 drivers/md/raid10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e298903aa72f..190e6f18d0e2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1872,7 +1872,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		if (atomic_read(&rdev->nr_pending)) {
 			/* lost the race, try later */
 			err = -EBUSY;
-			clear_bit(WantRemove, &rdev->flags);
+			md_error(rdev->mddev, rdev);
 			goto abort;
 		}
 	}
-- 
Gitee


From ce90c45a7c490942a1ca287620ae0c6da04ec59a Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:39 +0800
Subject: [PATCH 13/25] md/raid10: prioritize adding disk to 'removed' mirror

hulk inclusion
category: bugfix
bugzilla: 188804, https://gitee.com/openeuler/kernel/issues/I78YIS
CVE: NA

--------------------------------

When adding a new disk to raid10, raid10_add_disk() traverses
conf->mirrors from the start and picks the first mirror that matches one
of the following:
  1. mirror->rdev is set to WantReplacement and it has no replacement:
     set the new disk as mirror->replacement.
  2. no rdev: set the new disk as mirror->rdev.

Consider the array below (sda is set to WantReplacement):

    Number   Major   Minor   RaidDevice State
       0       8        0        0      active sync set-A   /dev/sda
       -       0        0        1      removed
       2       8       32        2      active sync set-A   /dev/sdc
       3       8       48        3      active sync set-B   /dev/sdd

When 'mdadm --add' is used to add a new disk to this array, the new disk
becomes sda's replacement instead of being added to the removed
position, which is confusing for users. Meanwhile, after the new disk
recovers successfully, sda will be set to Faulty.

Prioritizing adding the disk to the 'removed' mirror is a better choice.
In the above scenario, the behavior is the same as before, except that
sda will not be deleted. Until other disks are added, continuing to use
sda is more reliable.

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 2e2e7ab61cf3d16c3edf176250585a34405b7324)
---
 drivers/md/raid10.c | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 190e6f18d0e2..2c41b201cfb4 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1760,9 +1760,10 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r10conf *conf = mddev->private;
 	int err = -EEXIST;
-	int mirror;
+	int mirror, repl_slot = -1;
 	int first = 0;
 	int last = conf->geo.raid_disks - 1;
+	struct raid10_info *p;
 
 	if (mddev->recovery_cp < MaxSector)
 		/* only hot-add to in-sync arrays, as recovery is
@@ -1785,24 +1786,14 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	else
 		mirror = first;
 	for ( ; mirror <= last ; mirror++) {
-		struct raid10_info *p = &conf->mirrors[mirror];
+		p = &conf->mirrors[mirror];
 		if (p->recovery_disabled == mddev->recovery_disabled)
 			continue;
 		if (p->rdev) {
-			if (!test_bit(WantReplacement, &p->rdev->flags) ||
-			    p->replacement != NULL)
-				continue;
-			clear_bit(In_sync, &rdev->flags);
-			set_bit(Replacement, &rdev->flags);
-			clear_bit(WantRemove, &rdev->flags);
-			rdev->raid_disk = mirror;
-			err = 0;
-			if (mddev->gendisk)
-				disk_stack_limits(mddev->gendisk, rdev->bdev,
-						  rdev->data_offset << 9);
-			conf->fullsync = 1;
-			rcu_assign_pointer(p->replacement, rdev);
-			break;
+			if (test_bit(WantReplacement, &p->rdev->flags) &&
+			    p->replacement == NULL && repl_slot < 0)
+				repl_slot = mirror;
+			continue;
 		}
 
 		if (mddev->gendisk)
@@ -1819,6 +1810,21 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		rcu_assign_pointer(p->rdev, rdev);
 		break;
 	}
+
+	if (err && repl_slot >= 0) {
+		p = &conf->mirrors[repl_slot];
+		clear_bit(In_sync, &rdev->flags);
+		set_bit(Replacement, &rdev->flags);
+		clear_bit(WantRemove, &rdev->flags);
+		rdev->raid_disk = repl_slot;
+		err = 0;
+		if (mddev->gendisk)
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
+		conf->fullsync = 1;
+		rcu_assign_pointer(p->replacement, rdev);
+	}
+
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
 
-- 
Gitee


From ba96180f60c1475e511d03b445eaedc4d339aea9 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:40 +0800
Subject: [PATCH 14/25] md/raid10: fix io loss while replacement replace rdev

hulk inclusion
category: bugfix
bugzilla: 188787, https://gitee.com/openeuler/kernel/issues/I78YIW
CVE: NA

--------------------------------

When we remove a disk which has a replacement, we first set rdev to
NULL, then assign replacement to rdev, and finally set replacement to
NULL (see raid10_remove_disk()). If io is submitted at the same time, it
might read both rdev and replacement as NULL, and io will not be
submitted.

  rdev -> NULL
                        read rdev
  replacement -> NULL
                        read replacement

Fix it by reading replacement first and rdev later, meanwhile, use smp_mb()
to prevent memory reordering.

Fixes: 475b0321a4df ("md/raid10: writes should get directed to replacement as well as original.")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit e80258500706fb91183565d00e0c0168df30e743)
---
 drivers/md/raid10.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2c41b201cfb4..e04182abfb08 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -754,8 +754,16 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 		rdev = rcu_dereference(conf->mirrors[disk].replacement);
 		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
 		    test_bit(WantRemove, &rdev->flags) ||
-		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
+		    r10_bio->devs[slot].addr + sectors >
+		    rdev->recovery_offset) {
+			/*
+			 * Read replacement first to prevent reading both rdev
+			 * and replacement as NULL during replacement replace
+			 * rdev
+			 */
+			smp_mb();
 			rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		}
 		if (rdev == NULL ||
 		    test_bit(WantRemove, &rdev->flags) ||
 		    test_bit(Faulty, &rdev->flags))
@@ -1363,9 +1371,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 
 	for (i = 0;  i < conf->copies; i++) {
 		int d = r10_bio->devs[i].devnum;
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
-		struct md_rdev *rrdev = rcu_dereference(
-			conf->mirrors[d].replacement);
+		struct md_rdev *rrdev, *rdev;
+
+		rrdev = rcu_dereference(conf->mirrors[d].replacement);
+		/*
+		 * Read replacement first to Prevent reading both rdev and
+		 * replacement as NULL during replacement replace rdev.
+		 */
+		smp_mb();
+		rdev = rcu_dereference(conf->mirrors[d].rdev);
 		if (rdev == rrdev)
 			rrdev = NULL;
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-- 
Gitee


From 6f0d05db7c02832f12e1bfa4a91150765d715791 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:41 +0800
Subject: [PATCH 15/25] md/raid10: fix null-ptr-deref of mreplace in
 raid10_sync_request

hulk inclusion
category: bugfix
bugzilla: 188527, https://gitee.com/openeuler/kernel/issues/I6O3HO
CVE: NA

--------------------------------

need_replace will be set to 1 if a non-Faulty mreplace exists, and
mreplace will be dereferenced later. However, the later check of
mreplace might set mreplace to NULL, and a null-ptr-deref occurs if
need_replace is 1 at this time.

Fix it by merging two checks into one.

Fixes: ee37d7314a32 ("md/raid10: Fix raid10 replace hang when new added disk faulty")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 7718714eb5c5604bcde44c1a09fd1acb060a2f41)
---
 drivers/md/raid10.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e04182abfb08..38f4d427a919 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3105,6 +3105,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			if (mreplace != NULL &&
 			    !test_bit(Faulty, &mreplace->flags))
 				need_replace = 1;
+			else
+				mreplace = NULL;
 
 			if (!need_recover && !need_replace) {
 				rcu_read_unlock();
@@ -3122,8 +3124,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				rcu_read_unlock();
 				continue;
 			}
-			if (mreplace && test_bit(Faulty, &mreplace->flags))
-				mreplace = NULL;
 			/* Unless we are doing a full sync, or a replacement
 			 * we only need to recover the block if it is set in
 			 * the bitmap
-- 
Gitee


From d18e63e86ece8ebead63952cb1ba4fe2d83a1ba3 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:42 +0800
Subject: [PATCH 16/25] md/raid10: fix uaf if replacement replaces rdev

hulk inclusion
category: bugfix
bugzilla: 188377, https://gitee.com/openeuler/kernel/issues/I6GOYF
CVE: NA

--------------------------------

After commit 4ca40c2ce099 ("md/raid10: Allow replacement device to be
replace old drive.") mirrors->replacement can replace rdev while the
replacement's io is pending, and repl_bio will write rdev (see
raid10_write_one_disk()). We will then get the wrong device from r10conf
in raid10_end_write_request(). In that case,
r10_bio->devs[slot].repl_bio will be put but not set to IO_MADE_GOOD,
and it will be put again later in raid_end_bio_io(), so a uaf occurs.

Fix it by using r10_bio to record rdev. Handle the io-failure and
no-replacement cases together, so there is no need to change repl.

  ==================================================================
  BUG: KASAN: use-after-free in bio_flagged include/linux/bio.h:238 [inline]
  BUG: KASAN: use-after-free in bio_put+0x78/0x80 block/bio.c:650
  Read of size 2 at addr ffff888116524dd4 by task md0_raid10/2618

  CPU: 0 PID: 2618 Comm: md0_raid10 Not tainted 5.10.0+ #3
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
  sd 0:0:0:0: rejecting I/O to offline device
  Call Trace:
   __dump_stack lib/dump_stack.c:77 [inline]
   dump_stack+0x107/0x167 lib/dump_stack.c:118
   print_address_description.constprop.0+0x1c/0x270 mm/kasan/report.c:390
   __kasan_report mm/kasan/report.c:550 [inline]
   kasan_report.cold+0x22/0x3a mm/kasan/report.c:567
   bio_flagged include/linux/bio.h:238 [inline]
   bio_put+0x78/0x80 block/bio.c:650
   put_all_bios drivers/md/raid10.c:248 [inline]
   free_r10bio drivers/md/raid10.c:257 [inline]
   raid_end_bio_io+0x3b5/0x590 drivers/md/raid10.c:309
   handle_write_completed drivers/md/raid10.c:2699 [inline]
   raid10d+0x2f85/0x5af0 drivers/md/raid10.c:2759
   md_thread+0x444/0x4b0 drivers/md/md.c:7932
   kthread+0x38c/0x470 kernel/kthread.c:313
   ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:299

  Allocated by task 1400:
   kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48
   kasan_set_track mm/kasan/common.c:56 [inline]
   set_alloc_info mm/kasan/common.c:498 [inline]
   __kasan_kmalloc.constprop.0+0xb5/0xe0 mm/kasan/common.c:530
   slab_post_alloc_hook mm/slab.h:512 [inline]
   slab_alloc_node mm/slub.c:2923 [inline]
   slab_alloc mm/slub.c:2931 [inline]
   kmem_cache_alloc+0x144/0x360 mm/slub.c:2936
   mempool_alloc+0x146/0x360 mm/mempool.c:391
   bio_alloc_bioset+0x375/0x610 block/bio.c:486
   bio_clone_fast+0x20/0x50 block/bio.c:711
   raid10_write_one_disk+0x166/0xd30 drivers/md/raid10.c:1240
   raid10_write_request+0x1600/0x2c90 drivers/md/raid10.c:1484
   __make_request drivers/md/raid10.c:1508 [inline]
   raid10_make_request+0x376/0x620 drivers/md/raid10.c:1537
   md_handle_request+0x699/0x970 drivers/md/md.c:451
   md_submit_bio+0x204/0x400 drivers/md/md.c:489
   __submit_bio block/blk-core.c:959 [inline]
   __submit_bio_noacct block/blk-core.c:1007 [inline]
   submit_bio_noacct+0x2e3/0xcf0 block/blk-core.c:1086
   submit_bio+0x1a0/0x3a0 block/blk-core.c:1146
   submit_bh_wbc+0x685/0x8e0 fs/buffer.c:3053
   ext4_commit_super+0x37e/0x6c0 fs/ext4/super.c:5696
   flush_stashed_error_work+0x28b/0x400 fs/ext4/super.c:791
   process_one_work+0x9a6/0x1590 kernel/workqueue.c:2280
   worker_thread+0x61d/0x1310 kernel/workqueue.c:2426
   kthread+0x38c/0x470 kernel/kthread.c:313
   ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:299

  Freed by task 2618:
   kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48
   kasan_set_track+0x1c/0x30 mm/kasan/common.c:56
   kasan_set_free_info+0x20/0x40 mm/kasan/generic.c:361
   __kasan_slab_free+0x151/0x180 mm/kasan/common.c:482
   slab_free_hook mm/slub.c:1569 [inline]
   slab_free_freelist_hook+0xa9/0x180 mm/slub.c:1608
   slab_free mm/slub.c:3179 [inline]
   kmem_cache_free+0xcd/0x3d0 mm/slub.c:3196
   mempool_free+0xe3/0x3b0 mm/mempool.c:500
   bio_free+0xe2/0x140 block/bio.c:266
   bio_put+0x58/0x80 block/bio.c:651
   raid10_end_write_request+0x885/0xb60 drivers/md/raid10.c:516
   bio_endio+0x376/0x6a0 block/bio.c:1465
   req_bio_endio block/blk-core.c:289 [inline]
   blk_update_request+0x5f5/0xf40 block/blk-core.c:1525
   blk_mq_end_request+0x4c/0x510 block/blk-mq.c:654
   blk_flush_complete_seq+0x835/0xd80 block/blk-flush.c:204
   flush_end_io+0x7b7/0xb90 block/blk-flush.c:261
   __blk_mq_end_request+0x282/0x4c0 block/blk-mq.c:645
   scsi_end_request+0x3a8/0x850 drivers/scsi/scsi_lib.c:607
   scsi_io_completion+0x3f5/0x1320 drivers/scsi/scsi_lib.c:970
   scsi_softirq_done+0x11b/0x490 drivers/scsi/scsi_lib.c:1448
   blk_mq_complete_request block/blk-mq.c:788 [inline]
   blk_mq_complete_request+0x84/0xb0 block/blk-mq.c:785
   scsi_mq_done+0x155/0x360 drivers/scsi/scsi_lib.c:1603
   virtscsi_vq_done drivers/scsi/virtio_scsi.c:184 [inline]
   virtscsi_req_done+0x14c/0x220 drivers/scsi/virtio_scsi.c:199
   vring_interrupt drivers/virtio/virtio_ring.c:2061 [inline]
   vring_interrupt+0x27a/0x300 drivers/virtio/virtio_ring.c:2047
   __handle_irq_event_percpu+0x2f8/0x830 kernel/irq/handle.c:156
   handle_irq_event_percpu kernel/irq/handle.c:196 [inline]
   handle_irq_event+0x105/0x280 kernel/irq/handle.c:213
   handle_edge_irq+0x258/0xd20 kernel/irq/chip.c:828
   asm_call_irq_on_stack+0xf/0x20
   __run_irq_on_irqstack arch/x86/include/asm/irq_stack.h:48 [inline]
   run_irq_on_irqstack_cond arch/x86/include/asm/irq_stack.h:101 [inline]
   handle_irq arch/x86/kernel/irq.c:230 [inline]
   __common_interrupt arch/x86/kernel/irq.c:249 [inline]
   common_interrupt+0xe2/0x190 arch/x86/kernel/irq.c:239
   asm_common_interrupt+0x1e/0x40 arch/x86/include/asm/idtentry.h:626

Fixes: 4ca40c2ce099 ("md/raid10: Allow replacement device to be replace old drive.")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit af959500142ff9bde1c79ac0fb42dc386e3794da)
---
 drivers/md/raid10.c | 86 +++++++++++++++++++++++++--------------------
 drivers/md/raid10.h | 12 +++----
 2 files changed, 53 insertions(+), 45 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 38f4d427a919..878e7b07fc48 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -441,47 +441,52 @@ static void raid10_end_write_request(struct bio *bio)
 
 	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 
-	if (repl)
-		rdev = conf->mirrors[dev].replacement;
-	if (!rdev) {
-		smp_rmb();
-		repl = 0;
-		rdev = conf->mirrors[dev].rdev;
+	if (repl) {
+		rdev = r10_bio->devs[slot].replacement;
+		if (rdev == conf->mirrors[dev].replacement) {
+			if (bio->bi_status && !discard_error) {
+				/*
+				 * Never record new bad blocks to replacement,
+				 * just fail it.
+				 */
+				md_error(rdev->mddev, rdev);
+				goto out;
+			}
+		} else {
+			WARN_ON_ONCE(rdev != conf->mirrors[dev].rdev);
+		}
+	} else {
+		rdev = r10_bio->devs[slot].rdev;
 	}
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
 	if (bio->bi_status && !discard_error) {
-		if (repl)
-			/* Never record new bad blocks to replacement,
-			 * just fail it.
-			 */
-			md_error(rdev->mddev, rdev);
-		else {
-			set_bit(WriteErrorSeen,	&rdev->flags);
-			if (!test_and_set_bit(WantReplacement, &rdev->flags))
-				set_bit(MD_RECOVERY_NEEDED,
-					&rdev->mddev->recovery);
+		set_bit(WriteErrorSeen,	&rdev->flags);
+		if (!test_and_set_bit(WantReplacement, &rdev->flags))
+			set_bit(MD_RECOVERY_NEEDED,
+				&rdev->mddev->recovery);
 
-			dec_rdev = 0;
-			if (test_bit(FailFast, &rdev->flags) &&
-			    (bio->bi_opf & MD_FAILFAST)) {
-				md_error(rdev->mddev, rdev);
-			}
+		dec_rdev = 0;
+		if (test_bit(FailFast, &rdev->flags) &&
+		    (bio->bi_opf & MD_FAILFAST))
+			md_error(rdev->mddev, rdev);
 
-			/*
-			 * When the device is faulty, it is not necessary to
-			 * handle write error.
-			 */
-			if (!test_bit(Faulty, &rdev->flags))
-				set_bit(R10BIO_WriteError, &r10_bio->state);
-			else {
-				/* Fail the request */
-				set_bit(R10BIO_Degraded, &r10_bio->state);
+		/*
+		 * When the device is faulty, it is not necessary to
+		 * handle write error.
+		 */
+		if (!test_bit(Faulty, &rdev->flags)) {
+			set_bit(R10BIO_WriteError, &r10_bio->state);
+		} else {
+			/* Fail the request */
+			set_bit(R10BIO_Degraded, &r10_bio->state);
+			if (repl)
+				r10_bio->devs[slot].repl_bio = NULL;
+			else
 				r10_bio->devs[slot].bio = NULL;
-				to_put = bio;
-				dec_rdev = 1;
-			}
+			to_put = bio;
+			dec_rdev = 1;
 		}
 	} else {
 		/*
@@ -513,16 +518,17 @@ static void raid10_end_write_request(struct bio *bio)
 				r10_bio->devs[slot].addr,
 				r10_bio->sectors,
 				&first_bad, &bad_sectors) && !discard_error) {
-			bio_put(bio);
 			if (repl)
 				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
 			else
 				r10_bio->devs[slot].bio = IO_MADE_GOOD;
+			bio_put(bio);
 			dec_rdev = 0;
 			set_bit(R10BIO_MadeGood, &r10_bio->state);
 		}
 	}
 
+out:
 	/*
 	 *
 	 * Let's see if all mirrored write operations have finished
@@ -1259,10 +1265,13 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 		rdev = conf->mirrors[devnum].rdev;
 
 	mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
-	if (replacement)
+	if (replacement) {
 		r10_bio->devs[n_copy].repl_bio = mbio;
-	else
+		r10_bio->devs[n_copy].replacement = rdev;
+	} else {
 		r10_bio->devs[n_copy].bio = mbio;
+		r10_bio->devs[n_copy].rdev = rdev;
+	}
 
 	mbio->bi_iter.bi_sector	= (r10_bio->devs[n_copy].addr +
 				   choose_data_offset(r10_bio, rdev));
@@ -2703,9 +2712,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 	} else {
 		bool fail = false;
 		for (m = 0; m < conf->copies; m++) {
-			int dev = r10_bio->devs[m].devnum;
 			struct bio *bio = r10_bio->devs[m].bio;
-			rdev = conf->mirrors[dev].rdev;
+			rdev = r10_bio->devs[m].rdev;
 			if (bio == IO_MADE_GOOD) {
 				rdev_clear_badblocks(
 					rdev,
@@ -2722,7 +2730,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 			bio = r10_bio->devs[m].repl_bio;
-			rdev = conf->mirrors[dev].replacement;
+			rdev = r10_bio->devs[m].replacement;
 			if (rdev && bio == IO_MADE_GOOD) {
 				rdev_clear_badblocks(
 					rdev,
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 5420250d4bd6..73d243e12363 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -146,12 +146,12 @@ struct r10bio {
 	 */
 	struct r10dev {
 		struct bio	*bio;
-		union {
-			struct bio	*repl_bio; /* used for resync and
-						    * writes */
-			struct md_rdev	*rdev;	   /* used for reads
-						    * (read_slot >= 0) */
-		};
+		/* Currently just used for normal reads and writes */
+		struct md_rdev	*rdev;
+		/* used for resync and writes */
+		struct bio	*repl_bio;
+		/* Currently just used for normal writes */
+		struct md_rdev	*replacement;
 		sector_t	addr;
 		int		devnum;
 	} devs[];
-- 
Gitee


From 75d8f605d070214d62ab335a12784fe1474630f9 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:43 +0800
Subject: [PATCH 17/25] md/raid10: remove WANR_ON_ONCE in
 raid10_end_write_request

hulk inclusion
category: bugfix
bugzilla: 188605, https://gitee.com/openeuler/kernel/issues/I6GOYF
CVE: NA

--------------------------------

raid10_end_write_request() might read mirror->rdev first and then
mirror->replacement because of memory reordering, and the WARN_ON
triggers if we remove a disk at the same time.

  T1 remove			T2 io end
  raid10_remove_disk		raid10_end_write_request
   p->rdev = NULL
				 read rdev -> NULL
   smp_mb
   p->replacement = NULL
				 read replacement -> NULL

It is meaningless to compare rdev with mirror->rdev after we get it from
r10_bio in raid10_end_write_request(). Remove this WARN_ON_ONCE.

Fixes: 2ecf5e6ecbfd ("md/raid10: fix uaf if replacement replaces rdev")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit a3ebeed79257c6c4f1a29bc650f831d39aa60022)
---
 drivers/md/raid10.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 878e7b07fc48..92cb493a140a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -452,8 +452,6 @@ static void raid10_end_write_request(struct bio *bio)
 				md_error(rdev->mddev, rdev);
 				goto out;
 			}
-		} else {
-			WARN_ON_ONCE(rdev != conf->mirrors[dev].rdev);
 		}
 	} else {
 		rdev = r10_bio->devs[slot].rdev;
-- 
Gitee


From 3aa6cf6b7a2f9ed9af61496311ecd36a2d33591f Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:44 +0800
Subject: [PATCH 18/25] md/raid10: fix incorrect counting of rdev->nr_pending

hulk inclusion
category: bugfix
bugzilla: 188605, https://gitee.com/openeuler/kernel/issues/I6ZJ3T
CVE: NA

--------------------------------

We get rdev from mirrors->replacement twice in raid10_write_request().
If replacement changes between the two reads, we will increase
A->nr_pending and decrease B->nr_pending.

  T1 (write)	   T2 (remove)	    T3 (add)
                   raid10_remove_disk

  raid10_write_request
   rrdev = conf->mirrors[d].replacement; ->rdev A
   A nr_pending++

                    p->rdev = p->replacement; ->rdev A
                    p->replacement = NULL;

				    //A it set to WantReplacement
                                    raid10_add_disk
				     p->replacement = rdev; ->rdev B

   if blocked_rdev
    rdev = conf->mirrors[d].replacement; ->rdev B
    B nr_pending--

We will record rdev in r10bio, and get rdev from r10bio to fix it.

Fixes: 475b0321a4df ("md/raid10: writes should get directed to replacement as well as original.")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 7b3b818774cdafe3c50f227ef76271b2277bd368)
---
 drivers/md/raid10.c | 39 +++++++++------------------------------
 1 file changed, 9 insertions(+), 30 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 92cb493a140a..ec5f0a1794e9 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1252,23 +1252,13 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 	int devnum = r10_bio->devs[n_copy].devnum;
 	struct bio *mbio;
 
-	if (replacement) {
-		rdev = conf->mirrors[devnum].replacement;
-		if (rdev == NULL) {
-			/* Replacement just got moved to main 'rdev' */
-			smp_mb();
-			rdev = conf->mirrors[devnum].rdev;
-		}
-	} else
-		rdev = conf->mirrors[devnum].rdev;
-
 	mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
 	if (replacement) {
 		r10_bio->devs[n_copy].repl_bio = mbio;
-		r10_bio->devs[n_copy].replacement = rdev;
+		rdev = r10_bio->devs[n_copy].replacement;
 	} else {
 		r10_bio->devs[n_copy].bio = mbio;
-		r10_bio->devs[n_copy].rdev = rdev;
+		rdev = r10_bio->devs[n_copy].rdev;
 	}
 
 	mbio->bi_iter.bi_sector	= (r10_bio->devs[n_copy].addr +
@@ -1276,8 +1266,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 	bio_set_dev(mbio, rdev->bdev);
 	mbio->bi_end_io	= raid10_end_write_request;
 	bio_set_op_attrs(mbio, op, do_sync | do_fua);
-	if (!replacement && test_bit(FailFast,
-				     &conf->mirrors[devnum].rdev->flags)
+	if (!replacement && test_bit(FailFast, &rdev->flags)
 			 && enough(conf, devnum))
 		mbio->bi_opf |= MD_FAILFAST;
 	mbio->bi_private = r10_bio;
@@ -1456,10 +1445,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		}
 		if (rdev) {
 			r10_bio->devs[i].bio = bio;
+			r10_bio->devs[i].rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
 		if (rrdev) {
 			r10_bio->devs[i].repl_bio = bio;
+			r10_bio->devs[i].replacement = rrdev;
 			atomic_inc(&rrdev->nr_pending);
 		}
 	}
@@ -1468,24 +1459,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	if (unlikely(blocked_rdev)) {
 		/* Have to wait for this device to get unblocked, then retry */
 		int j;
-		int d;
 
 		for (j = 0; j < i; j++) {
-			if (r10_bio->devs[j].bio) {
-				d = r10_bio->devs[j].devnum;
-				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
-			}
-			if (r10_bio->devs[j].repl_bio) {
-				struct md_rdev *rdev;
-				d = r10_bio->devs[j].devnum;
-				rdev = conf->mirrors[d].replacement;
-				if (!rdev) {
-					/* Race with remove_disk */
-					smp_mb();
-					rdev = conf->mirrors[d].rdev;
-				}
-				rdev_dec_pending(rdev, mddev);
-			}
+			if (r10_bio->devs[j].bio)
+				rdev_dec_pending(r10_bio->devs[j].rdev, mddev);
+			if (r10_bio->devs[j].repl_bio)
+				rdev_dec_pending(r10_bio->devs[j].replacement, mddev);
 		}
 		allow_barrier(conf);
 		raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
-- 
Gitee


From cfa461c72ea2303f160325cca755e7f60a03578f Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:45 +0800
Subject: [PATCH 19/25] block: Only set bb->changed when badblocks changes

hulk inclusion
category: bugfix
bugzilla: 188569, https://gitee.com/openeuler/kernel/issues/I6XBZQ
CVE: NA

--------------------------------

bb->changed and unacked_exist are set and badblocks_update_acked() is
invoked even if no badblocks change in badblocks_set(). Only update
them when badblocks change.

Fixes: 9e0e252a048b ("badblocks: Add core badblock management code")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 78cba1632180c1736b093d87c5077e434c49ac0a)
---
 block/badblocks.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/block/badblocks.c b/block/badblocks.c
index d39056630d9c..0ee3cbacb236 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -165,7 +165,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
 {
 	u64 *p;
 	int lo, hi;
-	int rv = 0;
+	int rv = 0, changed = 0;
 	unsigned long flags;
 
 	if (bb->shift < 0)
@@ -230,6 +230,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
 				s = a + BB_MAX_LEN;
 			}
 			sectors = e - s;
+			changed = 1;
 		}
 	}
 	if (sectors && hi < bb->count) {
@@ -260,6 +261,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
 			sectors = e - s;
 			lo = hi;
 			hi++;
+			changed = 1;
 		}
 	}
 	if (sectors == 0 && hi < bb->count) {
@@ -278,6 +280,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
 			memmove(p + hi, p + hi + 1,
 				(bb->count - hi - 1) * 8);
 			bb->count--;
+			changed = 1;
 		}
 	}
 	while (sectors) {
@@ -300,14 +303,17 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
 			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
 			sectors -= this_sectors;
 			s += this_sectors;
+			changed = 1;
 		}
 	}
 
-	bb->changed = 1;
-	if (!acknowledged)
-		bb->unacked_exist = 1;
-	else
-		badblocks_update_acked(bb);
+	if (changed) {
+		bb->changed = changed;
+		if (!acknowledged)
+			bb->unacked_exist = 1;
+		else
+			badblocks_update_acked(bb);
+	}
 	write_sequnlock_irqrestore(&bb->lock, flags);
 
 	return rv;
-- 
Gitee


From ed6fe40d392e4cf295b858004d4e903dfbf92db3 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:46 +0800
Subject: [PATCH 20/25] md/raid10: fix io hung in md_wait_for_blocked_rdev()

hulk inclusion
category: bugfix
bugzilla: 188569, https://gitee.com/openeuler/kernel/issues/I6XBZQ
CVE: NA

--------------------------------

If badblocks are merged but bb->count is exceeded, badblocks_set() will
return 1 and the merged badblocks will become un-acked.
rdev_set_badblocks() will then not set sb_flags or wake up
mddev->thread, so io waiting in md_wait_for_blocked_rdev() will hang
because BlockedBadBlocks may not be cleared.

Fix it by checking badblocks->changed instead of return value. This flag
is set when badblocks changes.

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit c23e1cd19b200f40df955a828422f46848c4f74b)
---
 drivers/md/md.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4d744cf8ffc6..f50e0255f3f8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9577,13 +9577,13 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 		       int is_new)
 {
 	struct mddev *mddev = rdev->mddev;
-	int rv;
+
 	if (is_new)
 		s += rdev->new_data_offset;
 	else
 		s += rdev->data_offset;
-	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
-	if (rv == 0) {
+	badblocks_set(&rdev->badblocks, s, sectors, 0);
+	if (rdev->badblocks.changed) {
 		/* Make sure they get written out promptly */
 		if (test_bit(ExternalBbl, &rdev->flags))
 			sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
-- 
Gitee


From 9933399c05be51b4f4a4a3cb8d2f80437a8bf778 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:47 +0800
Subject: [PATCH 21/25] md: fix unexpected changes of return value in
 rdev_set_badblocks

hulk inclusion
category: bugfix
bugzilla: 188569, https://gitee.com/openeuler/kernel/issues/I6XBZQ
CVE: NA

--------------------------------

If setting any badblocks fails, we will remove this rdev (set it to Faulty
or set recovery_disabled). The previous patch "md/raid10: fix io hung in
md_wait_for_blocked_rdev()" checks badblocks->changed instead of the return
value in rdev_set_badblocks(), but the return value of this function also
changed accordingly, which is not what we expected.

Keep the return value consistent with before.

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit bebf3d97ffcb05ae63da64f8dcb7335f596ac863)
---
 drivers/md/md.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f50e0255f3f8..4563ef0df4c5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9577,12 +9577,13 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 		       int is_new)
 {
 	struct mddev *mddev = rdev->mddev;
+	int rv;
 
 	if (is_new)
 		s += rdev->new_data_offset;
 	else
 		s += rdev->data_offset;
-	badblocks_set(&rdev->badblocks, s, sectors, 0);
+	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
 	if (rdev->badblocks.changed) {
 		/* Make sure they get written out promptly */
 		if (test_bit(ExternalBbl, &rdev->flags))
@@ -9591,9 +9592,8 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 		set_mask_bits(&mddev->sb_flags, 0,
 			      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
 		md_wakeup_thread(rdev->mddev->thread);
-		return 1;
-	} else
-		return 0;
+	}
+	return !rv;
 }
 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
 
-- 
Gitee


From 73f3c62850905d8bd79312c6350a3a009f2e3cb5 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:48 +0800
Subject: [PATCH 22/25] block/badblocks: fix the bug of reverse order

hulk inclusion
category: bugfix
bugzilla: 188569, https://gitee.com/openeuler/kernel/issues/I6ZG5B
CVE: NA

--------------------------------

The order of badblocks will be reversed if we set a large area at once.
Leaving 'hi' unchanged while adding continuous badblocks is wrong: the
next range to set is greater than 'hi', so it should be added at the next
position. Increment 'hi' on each cycle.

  # echo 0 2048 > bad_blocks
  # cat bad_blocks
    1536 512
    1024 512
    512 512
    0 512

Fixes: 9e0e252a048b ("badblocks: Add core badblock management code")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit f9a3eea02f861de2d183d62ceebe17e9b51cb6ce)
---
 block/badblocks.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/badblocks.c b/block/badblocks.c
index 0ee3cbacb236..4c0f39625e89 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -303,6 +303,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
 			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
 			sectors -= this_sectors;
 			s += this_sectors;
+			hi++;
 			changed = 1;
 		}
 	}
-- 
Gitee


From 5c9062906b66342b1eea000d6b8cedeaf8fd0061 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:49 +0800
Subject: [PATCH 23/25] block/badblocks: fix badblocks loss when badblocks
 combine

hulk inclusion
category: bugfix
bugzilla: 188569, https://gitee.com/openeuler/kernel/issues/I6ZG5B
CVE: NA

--------------------------------

Badblocks will be lost if we set them as below:

  # echo 1 1 > bad_blocks
  # echo 3 1 > bad_blocks
  # echo 1 5 > bad_blocks
  # cat bad_blocks
    1 3

We will combine badblocks if there is an intersection between p[lo] and
p[hi] in badblocks_set(). The end of the new badblocks is currently p[hi]'s
end, but p[lo] may cross p[hi], so the new end should be the larger of the
ends of p[lo] and p[hi].
  lo: |------------------------|
  hi:		|--------|

Fixes: 9e0e252a048b ("badblocks: Add core badblock management code")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit e35a77628aa041f9d94921992ba9e8d3c6dbe8ba)
---
 block/badblocks.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/block/badblocks.c b/block/badblocks.c
index 4c0f39625e89..b387109df02e 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -267,16 +267,14 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
 	if (sectors == 0 && hi < bb->count) {
 		/* we might be able to combine lo and hi */
 		/* Note: 's' is at the end of 'lo' */
-		sector_t a = BB_OFFSET(p[hi]);
-		int lolen = BB_LEN(p[lo]);
-		int hilen = BB_LEN(p[hi]);
-		int newlen = lolen + hilen - (s - a);
+		sector_t a = BB_OFFSET(p[lo]);
+		int newlen = max(s, BB_OFFSET(p[hi]) + BB_LEN(p[hi])) - a;
 
-		if (s >= a && newlen < BB_MAX_LEN) {
+		if (s >= BB_OFFSET(p[hi]) && newlen < BB_MAX_LEN) {
 			/* yes, we can combine them */
 			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
 
-			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+			p[lo] = BB_MAKE(a, newlen, ack);
 			memmove(p + hi, p + hi + 1,
 				(bb->count - hi - 1) * 8);
 			bb->count--;
-- 
Gitee


From d44d924fb1e81379c61b89174f6adfd9a7ce68dc Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:50 +0800
Subject: [PATCH 24/25] md/raid10: fix null-ptr-deref in raid10_sync_request

hulk inclusion
category: bugfix
bugzilla: 188378, https://gitee.com/openeuler/kernel/issues/I6GGV7
CVE: NA

--------------------------------

init_resync() initializes the mempool and sets conf->have_replacement at
the beginning of sync; close_sync() frees the mempool when sync is completed.

After commit 7e83ccbecd60 ("md/raid10: Allow skipping recovery when clean
arrays are assembled"), recovery might be skipped, in which case
init_resync() is called but close_sync() is not. A null-ptr-deref occurs
as below:
  1) create an array, wait for resync to complete, mddev->recovery_cp is
     set to MaxSector.
  2) recovery is woken and it is skipped. conf->have_replacement is set to
     0 in init_resync(). close_sync() is not called.
  3) some io errors occur and rdev A is set to WantReplacement.
  4) a new device is added and set as A's replacement.
  5) recovery is woken, A has a replacement, but conf->have_replacement is
     0. r10bio->dev[i].repl_bio will not be allocated and a null-ptr-deref
     occurs.

Fix it by not calling init_resync() if recovery is skipped.

Fixes: 7e83ccbecd60 ("md/raid10: Allow skipping recovery when clean arrays are assembled")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit 2de30b8ffb1a687170b4e21b99dee0aafd0e9a58)
---
 drivers/md/raid10.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ec5f0a1794e9..badf02d8e848 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2941,10 +2941,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	sector_t chunk_mask = conf->geo.chunk_mask;
 	int page_idx = 0;
 
-	if (!mempool_initialized(&conf->r10buf_pool))
-		if (init_resync(conf))
-			return 0;
-
 	/*
 	 * Allow skipping a full rebuild for incremental assembly
 	 * of a clean array, like RAID1 does.
@@ -2960,6 +2956,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		return mddev->dev_sectors - sector_nr;
 	}
 
+	if (!mempool_initialized(&conf->r10buf_pool))
+		if (init_resync(conf))
+			return 0;
+
  skipped:
 	max_sector = mddev->dev_sectors;
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
-- 
Gitee


From d06eb933f0231c3c4a40f5df27476b64f410c563 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 3 Jun 2023 14:50:51 +0800
Subject: [PATCH 25/25] md/raid10: fix incorrect done of recovery

hulk inclusion
category: bugfix
bugzilla: 188535, https://gitee.com/openeuler/kernel/issues/I6O61Q
CVE: NA

--------------------------------

Recovery will go to giveup and increment chunks_skipped in
raid10_sync_request() if there are some bad_blocks, and it will return
max_sector when chunks_skipped >= geo.raid_disks. At that point recovery
has failed and data is inconsistent, but the user thinks recovery is done,
which is wrong.

Fix it by setting the mirror's recovery_disabled; a spare device shouldn't
be added here.

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
(cherry picked from commit b0ac58c910a3fb1039bb2205a850c588f80f8c8f)
---
 drivers/md/raid10.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index badf02d8e848..babc5d29f0e3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2940,6 +2940,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	int chunks_skipped = 0;
 	sector_t chunk_mask = conf->geo.chunk_mask;
 	int page_idx = 0;
+	int error_disk = -1;
 
 	/*
 	 * Allow skipping a full rebuild for incremental assembly
@@ -3023,7 +3024,18 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		return reshape_request(mddev, sector_nr, skipped);
 
 	if (chunks_skipped >= conf->geo.raid_disks) {
-		/* if there has been nothing to do on any drive,
+		pr_err("md/raid10:%s: %s fail\n", mdname(mddev),
+			test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?  "resync" : "recovery");
+		if (error_disk >= 0 && !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+			/*
+			 * recovery fail, set mirrors.recovory_disabled,
+			 * device shouldn't be added to there.
+			 */
+			conf->mirrors[error_disk].recovery_disabled = mddev->recovery_disabled;
+			return 0;
+		}
+		/*
+		 * if there has been nothing to do on any drive,
 		 * then there is nothing to do at all..
 		 */
 		*skipped = 1;
@@ -3280,6 +3292,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 						       mdname(mddev));
 					mirror->recovery_disabled
 						= mddev->recovery_disabled;
+				} else {
+					error_disk = i;
 				}
 				put_buf(r10_bio);
 				if (rb2)
-- 
Gitee