diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index e842beb89858b2a7b9b2dd11a021262e5dcd27c0..59f43f5b780396631c76fdb5d0d8029fbb489e8c 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1001,7 +1001,6 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
 	return set;
 }
 
-
 /* this gets called when the md device is ready to unplug its underlying
  * (slave) device queues -- before we let any writes go down, we need to
  * sync the dirty pages of the bitmap file to disk */
@@ -1011,8 +1010,7 @@ void md_bitmap_unplug(struct bitmap *bitmap)
 	int dirty, need_write;
 	int writing = 0;
 
-	if (!bitmap || !bitmap->storage.filemap ||
-	    test_bit(BITMAP_STALE, &bitmap->flags))
+	if (!md_bitmap_enabled(bitmap))
 		return;
 
 	/* look at each page to see if there are any set bits that need to be
@@ -1041,6 +1039,43 @@ void md_bitmap_unplug(struct bitmap *bitmap)
 }
 EXPORT_SYMBOL(md_bitmap_unplug);
 
+/* On-stack context for running md_bitmap_unplug() from a work item. */
+struct bitmap_unplug_work {
+	struct work_struct work;
+	struct bitmap *bitmap;
+	struct completion *done;
+};
+
+/* Work handler: flush the bitmap, then signal the waiting submitter. */
+static void md_bitmap_unplug_fn(struct work_struct *work)
+{
+	struct bitmap_unplug_work *unplug_work =
+		container_of(work, struct bitmap_unplug_work, work);
+
+	md_bitmap_unplug(unplug_work->bitmap);
+	complete(unplug_work->done);
+}
+
+/*
+ * Run md_bitmap_unplug() on md_bitmap_wq and block until it has finished.
+ * The work item and completion live on this stack frame.
+ */
+void md_bitmap_unplug_async(struct bitmap *bitmap)
+{
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct bitmap_unplug_work unplug_work;
+
+	INIT_WORK_ONSTACK(&unplug_work.work, md_bitmap_unplug_fn);
+	unplug_work.bitmap = bitmap;
+	unplug_work.done = &done;
+
+	queue_work(md_bitmap_wq, &unplug_work.work);
+	wait_for_completion(&done);
+	/* Pair with INIT_WORK_ONSTACK() before the stack frame goes away. */
+	destroy_work_on_stack(&unplug_work.work);
+}
+EXPORT_SYMBOL(md_bitmap_unplug_async);
+
 static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
 /* * bitmap_init_from_disk -- called at bitmap_create time to initialize
  * the in-memory bitmap from the on-disk bitmap -- also, sets up the
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index cfd7395de8fd36c779092617c23ad6d36fad7bea..8a3788c9bfef854fde3abc8193e74565619e53b3 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -264,6 +264,7 @@ void md_bitmap_sync_with_cluster(struct mddev *mddev,
 				 sector_t new_lo, sector_t new_hi);
 
 void md_bitmap_unplug(struct bitmap *bitmap);
+void md_bitmap_unplug_async(struct bitmap *bitmap);
 void md_bitmap_daemon_work(struct mddev *mddev);
 
 int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
@@ -273,6 +274,17 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
 			     sector_t *lo, sector_t *hi, bool clear_bits);
 void md_bitmap_free(struct bitmap *bitmap);
 void md_bitmap_wait_behind_writes(struct mddev *mddev);
+
+/*
+ * True when @bitmap is non-NULL, has storage.filemap set and is not
+ * flagged BITMAP_STALE.
+ */
+static inline bool md_bitmap_enabled(struct bitmap *bitmap)
+{
+	return bitmap && bitmap->storage.filemap &&
+	       !test_bit(BITMAP_STALE, &bitmap->flags);
+}
+
 #endif
 
 #endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 06dcdf817d12b72b0d67d01ea21d78e0c9c57c2c..1378aab2e8a6e56c6ed3df41523ca4d32e66a7a4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -82,6 +82,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 static struct workqueue_struct *md_wq;
 static struct workqueue_struct *md_misc_wq;
 static struct workqueue_struct *md_rdev_misc_wq;
+struct workqueue_struct *md_bitmap_wq;
 
 static int remove_and_add_spares(struct mddev *mddev,
 				 struct md_rdev *this);
@@ -9677,6 +9678,11 @@ static int __init md_init(void)
 	if (!md_rdev_misc_wq)
 		goto err_rdev_misc_wq;
 
+	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
+				       0);
+	if (!md_bitmap_wq)
+		goto err_bitmap_wq;
+
 	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
 		goto err_md;
 
@@ -9698,6 +9704,8 @@ static int __init md_init(void)
 err_mdp:
 	unregister_blkdev(MD_MAJOR, "md");
 err_md:
+	destroy_workqueue(md_bitmap_wq);
+err_bitmap_wq:
 	destroy_workqueue(md_rdev_misc_wq);
 err_rdev_misc_wq:
 	destroy_workqueue(md_misc_wq);
@@ -9993,6 +10001,7 @@ static __exit void md_exit(void)
 	}
 	destroy_workqueue(md_rdev_misc_wq);
 	destroy_workqueue(md_misc_wq);
+	destroy_workqueue(md_bitmap_wq);
 	destroy_workqueue(md_wq);
 }
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 845ccd8429759184798aab77add76af041f18ef7..aa596ead1b8598ae6b58dfb1d4665e8fcd791724 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -839,6 +839,7 @@ struct mdu_array_info_s;
 struct mdu_disk_info_s;
 
 extern int mdp_major;
+extern struct workqueue_struct *md_bitmap_wq;
 void md_autostart_arrays(int part);
 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 54db341639687e47b5d32c5d9bc08e1d2e761840..5cada8fa4af75b7d79f36971b38c9712347a784b 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -21,12 +21,7 @@
 #define IO_MADE_GOOD ((struct bio *)2)
 
 #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
-/* When there are this many requests queue to be written by
- * the raid thread, we become 'congested' to provide back-pressure
- * for writeback.
- */
-static int max_queued_requests = 1024;
+#define MAX_PLUG_BIO 32
 
 /* for managing resync I/O pages */
 struct resync_pages {
@@ -34,6 +29,12 @@ struct resync_pages {
 	struct page *pages[RESYNC_PAGES];
 };
 
+struct raid1_plug_cb {
+	struct blk_plug_cb cb;
+	struct bio_list pending;
+	unsigned int count;
+};
+
 static void rbio_pool_free(void *rbio, void *data)
 {
 	kfree(rbio);
@@ -110,3 +111,71 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
 		size -= len;
 	} while (idx++ < RESYNC_PAGES && size > 0);
 }
+
+/*
+ * Submit one queued write clone. bio->bi_disk holds the target md_rdev
+ * pointer here; the real block device is set before the bio is issued.
+ */
+static inline void raid1_submit_write(struct bio *bio)
+{
+	struct md_rdev *rdev = (void *)bio->bi_disk;
+
+	bio->bi_next = NULL;
+	bio_set_dev(bio, rdev->bdev);
+	if (test_bit(Faulty, &rdev->flags))
+		bio_io_error(bio);
+	else if (unlikely(bio_op(bio) == REQ_OP_DISCARD &&
+			  !blk_queue_discard(bio->bi_disk->queue)))
+		/* Just ignore it */
+		bio_endio(bio);
+	else
+		submit_bio_noacct(bio);
+}
+
+/*
+ * Add @bio to the plug callback obtained via blk_check_plugged(), and flush
+ * it through its callback once it has collected MAX_PLUG_BIO * @copies bios.
+ * Returns false if no plug callback is available, so the caller must queue
+ * @bio itself.
+ */
+static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
+				      blk_plug_cb_fn unplug, int copies)
+{
+	struct raid1_plug_cb *plug = NULL;
+	struct blk_plug_cb *cb;
+
+	/*
+	 * If bitmap is not enabled, it's safe to submit the io directly, and
+	 * this can get optimal performance.
+	 */
+	if (!md_bitmap_enabled(mddev->bitmap)) {
+		raid1_submit_write(bio);
+		return true;
+	}
+
+	cb = blk_check_plugged(unplug, mddev, sizeof(*plug));
+	if (!cb)
+		return false;
+
+	plug = container_of(cb, struct raid1_plug_cb, cb);
+	bio_list_add(&plug->pending, bio);
+	if (++plug->count / MAX_PLUG_BIO >= copies) {
+		list_del(&cb->list);
+		cb->callback(cb, false);
+	}
+
+	return true;
+}
+
+/*
+ * current->bio_list will be set under submit_bio() context, in this case bitmap
+ * io will be added to the list and wait for current io submission to finish,
+ * while current io submission must wait for bitmap io to be done. In order to
+ * avoid such deadlock, submit bitmap io asynchronously.
+ */
+static inline void raid1_prepare_flush_writes(struct bitmap *bitmap)
+{
+	if (current->bio_list)
+		md_bitmap_unplug_async(bitmap);
+	else
+		md_bitmap_unplug(bitmap);
+}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e511354c6e8c06c400d963df85938581c9ce304c..ebb7adba44d6cfdc66f70db3edf96f8a8add40aa 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -793,22 +793,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 static void flush_bio_list(struct r1conf *conf, struct bio *bio)
 {
 	/* flush any pending bitmap writes to disk before proceeding w/ I/O */
-	md_bitmap_unplug(conf->mddev->bitmap);
+	raid1_prepare_flush_writes(conf->mddev->bitmap);
 	wake_up(&conf->wait_barrier);
 
 	while (bio) { /* submit pending writes */
 		struct bio *next = bio->bi_next;
-		struct md_rdev *rdev = (void *)bio->bi_disk;
-		bio->bi_next = NULL;
-		bio_set_dev(bio, rdev->bdev);
-		if (test_bit(Faulty, &rdev->flags)) {
-			bio_io_error(bio);
-		} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
-				    !blk_queue_discard(bio->bi_disk->queue)))
-			/* Just ignore it */
-			bio_endio(bio);
-		else
-			submit_bio_noacct(bio);
+
+		raid1_submit_write(bio);
 		bio = next;
 		cond_resched();
 	}
@@ -826,7 +817,6 @@ static void flush_pending_writes(struct r1conf *conf)
 		struct bio *bio;
 
 		bio = bio_list_get(&conf->pending_bio_list);
-		conf->pending_count = 0;
 		spin_unlock_irq(&conf->device_lock);
 
 		/*
@@ -1148,12 +1138,6 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio,
 	bio_put(behind_bio);
 }
 
-struct raid1_plug_cb {
-	struct blk_plug_cb cb;
-	struct bio_list pending;
-	int pending_cnt;
-};
-
 static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 {
 	struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
@@ -1162,10 +1146,9 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	struct r1conf *conf = mddev->private;
 	struct bio *bio;
 
-	if (from_schedule || current->bio_list) {
+	if (from_schedule) {
 		spin_lock_irq(&conf->device_lock);
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
-		conf->pending_count += plug->pending_cnt;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(mddev->thread);
@@ -1327,8 +1310,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	struct bitmap *bitmap = mddev->bitmap;
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
-	struct blk_plug_cb *cb;
-	struct raid1_plug_cb *plug = NULL;
 	int first_clone;
 	int max_sectors;
 	bool write_behind = false;
@@ -1360,12 +1341,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	r1_bio = alloc_r1bio(mddev, bio);
 	r1_bio->sectors = max_write_sectors;
 
-	if (conf->pending_count >= max_queued_requests) {
-		md_wakeup_thread(mddev->thread);
-		raid1_log(mddev, "wait queued");
-		wait_event(conf->wait_barrier,
-			   conf->pending_count < max_queued_requests);
-	}
 	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev. Record them by setting
 	 * bios[x] to bio
@@ -1553,18 +1528,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		/* flush_pending_writes() needs access to the rdev so...*/
 		mbio->bi_disk = (void *)conf->mirrors[i].rdev;
 
-		cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
-		if (cb)
-			plug = container_of(cb, struct raid1_plug_cb, cb);
-		else
-			plug = NULL;
-		if (plug) {
-			bio_list_add(&plug->pending, mbio);
-			plug->pending_cnt++;
-		} else {
+		if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
 			spin_lock_irqsave(&conf->device_lock, flags);
 			bio_list_add(&conf->pending_bio_list, mbio);
-			conf->pending_count++;
 			spin_unlock_irqrestore(&conf->device_lock, flags);
 			md_wakeup_thread(mddev->thread);
 		}
@@ -3045,7 +3011,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	init_waitqueue_head(&conf->wait_barrier);
 
 	bio_list_init(&conf->pending_bio_list);
-	conf->pending_count = 0;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
 	err = -EIO;
@@ -3431,4 +3396,3 @@ MODULE_ALIAS("md-personality-3"); /* RAID1 */
 MODULE_ALIAS("md-raid1");
 MODULE_ALIAS("md-level-1");
 
-module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index ff30681d753c7501392c7a7ca63251174bfa24e3..468f189da7a05042fcf82f3dde2c6bf621dc1456 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -87,7 +87,6 @@ struct r1conf {
 
 	/* queue pending writes to be submitted on unplug */
 	struct bio_list pending_bio_list;
-	int pending_count;
 
 	/* for use when syncing mirrors:
 	 * We don't allow both normal IO and resync/recovery IO at
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 222d9ed76b4bdaa0723fd3f6cb0b7aa0ec491dbb..e037ef90abdd6628d01d493512263d6dece37cd5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -876,7 +876,6 @@ static void flush_pending_writes(struct r10conf *conf)
 		struct bio *bio;
 
 		bio = bio_list_get(&conf->pending_bio_list);
-		conf->pending_count = 0;
 		spin_unlock_irq(&conf->device_lock);
 
 		/*
@@ -891,24 +890,13 @@ static void flush_pending_writes(struct r10conf *conf)
 		__set_current_state(TASK_RUNNING);
 
 		blk_start_plug(&plug);
-		/* flush any pending bitmap writes to disk
-		 * before proceeding w/ I/O */
-		md_bitmap_unplug(conf->mddev->bitmap);
+		raid1_prepare_flush_writes(conf->mddev->bitmap);
 		wake_up(&conf->wait_barrier);
 
 		while (bio) { /* submit pending writes */
 			struct bio *next = bio->bi_next;
-			struct md_rdev *rdev = (void*)bio->bi_disk;
-			bio->bi_next = NULL;
-			bio_set_dev(bio, rdev->bdev);
-			if (test_bit(Faulty, &rdev->flags)) {
-				bio_io_error(bio);
-			} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
-					    !blk_queue_discard(bio->bi_disk->queue)))
-				/* Just ignore it */
-				bio_endio(bio);
-			else
-				submit_bio_noacct(bio);
+
+			raid1_submit_write(bio);
 			bio = next;
 			cond_resched();
 		}
@@ -1072,24 +1060,16 @@ static sector_t choose_data_offset(struct r10bio *r10_bio,
 		return rdev->new_data_offset;
 }
 
-struct raid10_plug_cb {
-	struct blk_plug_cb cb;
-	struct bio_list pending;
-	int pending_cnt;
-};
-
 static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 {
-	struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
-						   cb);
+	struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, cb);
 	struct mddev *mddev = plug->cb.data;
 	struct r10conf *conf = mddev->private;
 	struct bio *bio;
 
-	if (from_schedule || current->bio_list) {
+	if (from_schedule) {
 		spin_lock_irq(&conf->device_lock);
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
-		conf->pending_count += plug->pending_cnt;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(mddev->thread);
@@ -1099,22 +1079,13 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 
 	/* we aren't scheduling, so we can do the write-out directly. */
 	bio = bio_list_get(&plug->pending);
-	md_bitmap_unplug(mddev->bitmap);
+	raid1_prepare_flush_writes(mddev->bitmap);
 	wake_up(&conf->wait_barrier);
 
 	while (bio) { /* submit pending writes */
 		struct bio *next = bio->bi_next;
-		struct md_rdev *rdev = (void*)bio->bi_disk;
-		bio->bi_next = NULL;
-		bio_set_dev(bio, rdev->bdev);
-		if (test_bit(Faulty, &rdev->flags)) {
-			bio_io_error(bio);
-		} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
-				    !blk_queue_discard(bio->bi_disk->queue)))
-			/* Just ignore it */
-			bio_endio(bio);
-		else
-			submit_bio_noacct(bio);
+
+		raid1_submit_write(bio);
 		bio = next;
 		cond_resched();
 	}
@@ -1248,8 +1219,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
 	const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
 	unsigned long flags;
-	struct blk_plug_cb *cb;
-	struct raid10_plug_cb *plug = NULL;
 	struct r10conf *conf = mddev->private;
 	struct md_rdev *rdev;
 	int devnum = r10_bio->devs[n_copy].devnum;
@@ -1283,18 +1252,9 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 
 	atomic_inc(&r10_bio->remaining);
 
-	cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
-	if (cb)
-		plug = container_of(cb, struct raid10_plug_cb, cb);
-	else
-		plug = NULL;
-	if (plug) {
-		bio_list_add(&plug->pending, mbio);
-		plug->pending_cnt++;
-	} else {
+	if (!raid1_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) {
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
-		conf->pending_count++;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 		md_wakeup_thread(mddev->thread);
 	}
@@ -1345,12 +1305,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		conf->reshape_safe = mddev->reshape_position;
 	}
 
-	if (conf->pending_count >= max_queued_requests) {
-		md_wakeup_thread(mddev->thread);
-		raid10_log(mddev, "wait queued");
-		wait_event(conf->wait_barrier,
-			   conf->pending_count < max_queued_requests);
-	}
 	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev. Record them by setting
 	 * bios[x] to bio
@@ -4976,4 +4930,3 @@ MODULE_ALIAS("md-personality-9"); /* RAID10 */
 MODULE_ALIAS("md-raid10");
 MODULE_ALIAS("md-level-10");
 
-module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 4f627ad16becf5c47a0060820d64ad0e7dc520e4..6570f96e8f47f796c00a6e7480317fe3c9409815 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -75,7 +75,6 @@ struct r10conf {
 
 	/* queue pending writes and submit them on unplug */
 	struct bio_list pending_bio_list;
-	int pending_count;
 
 	spinlock_t resync_lock;
 	atomic_t nr_pending;