399 Star 1.4K Fork 1.3K

GVPopenEuler / kernel

 / 详情

【OLK-5.10】文件系统回写和fsync竞争导致死锁触发Hung task

已完成
缺陷
创建于  
2022-04-29 10:15

【标题描述】能够简要描述问题:说明什么场景下,做了什么操作,出现什么问题(尽量使用正向表达方式)
往硬件队列深度为1的U盘(vfat文件系统)拷贝大文件,并同时执行fsync时,进程卡住
输入图片说明
【环境信息】
硬件信息:
1) arm32
软件信息:
1) OLK5.10
【问题复现步骤】
具体操作步骤

  1. 打入内核延时补丁,打开CONFIG_VFAT_FS,CONFIG_FAT_FS
  2. 编译执行b.c

出现概率(是否必现,概率性错误)必现
【预期结果】
fsync进程卡住,内核报hung task
【实际结果】

[   45.714127] kworker/u8:4 cannot lock for inode ffff8881058003f8 bh ffff88810fd05958
[   45.874154] kworker/u8:4 cannot lock for inode ffff8881058003f8 bh ffff88810fd05958
[   46.034180] kworker/u8:4 cannot lock for inode ffff8881058003f8 bh ffff88810fd05958
[   46.194204] kworker/u8:4 cannot lock for inode ffff8881058003f8 bh ffff88810fd05958
[   46.281892] INFO: task bb:2694 blocked for more than 15 seconds.
[   46.286758]       Not tainted 5.10.0-13176-ga304d9b8bdc9-dirty #321
[   46.287606] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[   46.289334] task:bb              state:D stack:    0 pid: 2694 ppid:  2412 flags:0x00004000
[   46.289351] Call Trace:
[   46.289380]  __schedule+0x31a/0xb90
[   46.289387]  schedule+0x7e/0x1a0
[   46.289392]  io_schedule+0x22/0x50
[   46.289400]  blk_mq_get_tag+0x21a/0x3d0
[   46.289412]  ? woken_wake_function+0x30/0x30
[   46.289418]  __blk_mq_alloc_request+0x123/0x1b0
[   46.289422]  blk_mq_submit_bio+0x163/0x9f0
[   46.289428]  submit_bio_noacct+0x4dd/0x5f0
[   46.289435]  ? vprintk_func+0x79/0x180
[   46.289439]  submit_bio+0x3f/0x1b0
[   46.289444]  submit_bh_wbc+0x1c8/0x2f0
[   46.289449]  __sync_dirty_buffer+0xc4/0x360
[   46.289453]  sync_dirty_buffer+0x17/0x20
[   46.289458]  __fat_write_inode+0x294/0x320
[   46.289463]  fat_write_inode+0x2a/0xa0
[   46.289469]  ? filemap_fdatawait_range+0x26/0x30
[   46.289474]  __writeback_single_inode+0x435/0x560
[   46.289478]  writeback_single_inode+0x154/0x220
[   46.289482]  sync_inode_metadata+0x45/0x70
[   46.289487]  __generic_file_fsync+0xa3/0x150
[   46.289491]  fat_file_fsync+0x1d/0x90
[   46.289496]  vfs_fsync_range+0x40/0xb0
[   46.289501]  ? exit_to_user_mode_prepare+0x2a/0x210
[   46.289505]  do_fsync+0x48/0xa0
[   46.289509]  __x64_sys_fsync+0x18/0x30
[   46.289514]  do_syscall_64+0x45/0x70
[   46.289518]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

【附件信息】
延时补丁:

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 873eeb97fde4..fdcb5adb63cb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -507,6 +507,11 @@ static void __blk_mq_free_request(struct request *rq)
 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 	const int sched_tag = rq->internal_tag;
 
+	if (rq->rq_disk && rq->rq_disk->first_minor == 16) {
+		global_sync_cond2 = 0;
+		smp_wmb();
+	}
+
 	blk_crypto_free_request(rq);
 	blk_pm_mark_last_busy(rq);
 	rq->mq_hctx = NULL;
@@ -2158,6 +2163,7 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
  *
  * Returns: Request queue cookie.
  */
+int global_sync_cond2 = 0;
 blk_qc_t blk_mq_submit_bio(struct bio *bio)
 {
 	struct request_queue *q = bio->bi_disk->queue;
@@ -2236,6 +2242,11 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
 		else
 			last = list_entry_rq(plug->mq_list.prev);
 
+		if (rq->rq_disk->first_minor == 16) {
+			global_sync_cond2 = 1;
+			smp_wmb();
+		}
+
 		if (request_count >= blk_plug_max_rq_count(plug) || (last &&
 		    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
 			blk_flush_plug_list(plug, false);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9352a309fe55..019b89b3becb 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1908,8 +1908,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
 		tag_set->ops = &scsi_mq_ops;
 	else
 		tag_set->ops = &scsi_mq_ops_no_commit;
-	tag_set->nr_hw_queues = shost->nr_hw_queues ? : 1;
-	tag_set->queue_depth = shost->can_queue;
+	tag_set->nr_hw_queues = 1;
+	tag_set->queue_depth = 1;
 	tag_set->cmd_size = cmd_size;
 	tag_set->numa_node = NUMA_NO_NODE;
 	tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index fabbad5cda60..863d8cae4e71 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -877,10 +877,8 @@ static int virtscsi_probe(struct virtio_device *vdev)
 	if (err)
 		goto virtscsi_init_failed;
 
-	shost->can_queue = virtqueue_get_vring_size(vscsi->req_vqs[0].vq);
-
-	cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
-	shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
+	shost->can_queue = 1;
+	shost->cmd_per_lun = 1;
 	shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
 
 	/* LUNs > 256 are reported with format 1, so they go in the range
diff --git a/fs/buffer.c b/fs/buffer.c
index 23f645657488..32de2592134a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1709,6 +1709,7 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode *
  * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
  * causes the writes to be flagged as synchronous writes.
  */
+#include <linux/delay.h>
 int __block_write_full_page(struct inode *inode, struct page *page,
 			get_block_t *get_block, struct writeback_control *wbc,
 			bh_end_io_t *handler)
@@ -1784,9 +1785,14 @@ int __block_write_full_page(struct inode *inode, struct page *page,
 		 * and kswapd activity, but those code paths have their own
 		 * higher-level throttling.
 		 */
+		smp_rmb();
+		if (global_sync_cond && global_sync_cond2 && global_sync_cond3) {
+			mdelay(20);
+		}
 		if (wbc->sync_mode != WB_SYNC_NONE) {
 			lock_buffer(bh);
 		} else if (!trylock_buffer(bh)) {
+			pr_info("%s cannot lock for inode %px bh %px\n", current->comm, inode, bh);
 			redirty_page_for_writepage(wbc, page);
 			continue;
 		}
@@ -3119,13 +3125,19 @@ EXPORT_SYMBOL(ll_rw_block);
 void write_dirty_buffer(struct buffer_head *bh, int op_flags)
 {
 	lock_buffer(bh);
+	if (!strcmp(bh->b_page->mapping->host->i_sb->s_type->name, "bdev") && MINOR(I_BDEV(bh->b_page->mapping->host)->bd_dev) == 16 && bh->b_assoc_map && !strcmp(bh->b_assoc_map->host->i_sb->s_type->name, "vfat"))
+		pr_err("%s:%s lock for inode %px bh %px\n", current->comm, __func__, bh->b_page->mapping->host, bh);
 	if (!test_clear_buffer_dirty(bh)) {
+	if (!strcmp(bh->b_page->mapping->host->i_sb->s_type->name, "bdev") && MINOR(I_BDEV(bh->b_page->mapping->host)->bd_dev) == 16 && bh->b_assoc_map && !strcmp(bh->b_assoc_map->host->i_sb->s_type->name, "vfat"))
+		pr_err("%s:%s unlock for inode %px bh %px\n", current->comm, __func__, bh->b_page->mapping->host, bh);
 		unlock_buffer(bh);
 		return;
 	}
 	bh->b_end_io = end_buffer_write_sync;
 	get_bh(bh);
 	submit_bh(REQ_OP_WRITE, op_flags, bh);
+	if (!strcmp(bh->b_page->mapping->host->i_sb->s_type->name, "bdev") && MINOR(I_BDEV(bh->b_page->mapping->host)->bd_dev) == 16 && bh->b_assoc_map && !strcmp(bh->b_assoc_map->host->i_sb->s_type->name, "vfat"))
+		pr_err("%s:%s unlock2 for inode %px bh %px\n", current->comm, __func__, bh->b_page->mapping->host, bh);
 }
 EXPORT_SYMBOL(write_dirty_buffer);
 
@@ -3140,6 +3152,8 @@ int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
 
 	WARN_ON(atomic_read(&bh->b_count) < 1);
 	lock_buffer(bh);
+	if (!strcmp(bh->b_page->mapping->host->i_sb->s_type->name, "bdev") && MINOR(I_BDEV(bh->b_page->mapping->host)->bd_dev) == 16 && bh->b_assoc_map && !strcmp(bh->b_assoc_map->host->i_sb->s_type->name, "vfat"))
+		pr_err("%s:%s lock for inode %px bh %px\n", current->comm, __func__, bh->b_page->mapping->host, bh);
 	if (test_clear_buffer_dirty(bh)) {
 		/*
 		 * The bh should be mapped, but it might not be if the
@@ -3147,6 +3161,8 @@ int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
 		 */
 		if (!buffer_mapped(bh)) {
 			unlock_buffer(bh);
+			if (!strcmp(bh->b_page->mapping->host->i_sb->s_type->name, "bdev") && MINOR(I_BDEV(bh->b_page->mapping->host)->bd_dev) == 16 && bh->b_assoc_map && !strcmp(bh->b_assoc_map->host->i_sb->s_type->name, "vfat"))
+				pr_err("%s:%s unlock for inode %px bh %px\n", current->comm, __func__, bh->b_page->mapping->host, bh);
 			return -EIO;
 		}
 
@@ -3159,6 +3175,8 @@ int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
 	} else {
 		unlock_buffer(bh);
 	}
+	if (!strcmp(bh->b_page->mapping->host->i_sb->s_type->name, "bdev") && MINOR(I_BDEV(bh->b_page->mapping->host)->bd_dev) == 16 && bh->b_assoc_map && !strcmp(bh->b_assoc_map->host->i_sb->s_type->name, "vfat"))
+		pr_err("%s:%s unlock2 for inode %px bh %px\n", current->comm, __func__, bh->b_page->mapping->host, bh);
 	return ret;
 }
 EXPORT_SYMBOL(__sync_dirty_buffer);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f9ee27cf4d7c..56a9aefc6451 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -182,6 +182,7 @@ static int fat_file_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+#include <linux/delay.h>
 int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 050d40c465bc..cbed73becf9e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1408,6 +1408,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 		 * writeback is not making progress due to locked
 		 * buffers. Skip this inode for now.
 		 */
+//		if (!strcmp("bdev", inode->i_sb->s_type->name))
+//			pr_info("requeue inode %px\n", inode);
 		redirty_tail_locked(inode, wb);
 		return;
 	}
@@ -1514,8 +1516,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	 */
 	smp_mb();
 
-	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+//		if (!strcmp("bdev", inode->i_sb->s_type->name))
+//			pr_info("tag dirty inode %px\n", inode);
 		inode->i_state |= I_DIRTY_PAGES;
+	}
 
 	spin_unlock(&inode->i_lock);
 
@@ -1776,11 +1781,13 @@ static long writeback_sb_inodes(struct super_block *sb,
 	return wrote;
 }
 
+int global_sync_cond = 0;
 static long __writeback_inodes_wb(struct bdi_writeback *wb,
 				  struct wb_writeback_work *work)
 {
 	unsigned long start_time = jiffies;
 	long wrote = 0;
+	int has_vfat = 0;
 
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
@@ -1795,7 +1802,15 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
 			redirty_tail(inode, wb);
 			continue;
 		}
+
+		if (!strcmp(sb->s_type->name, "bdev") && MINOR(I_BDEV(inode)->bd_dev) == 16 && has_vfat && wb_inode(wb->b_io.prev) == wb_inode(wb->b_io.next)) {
+			global_sync_cond = 1;
+			smp_wmb();
+		}
+
 		wrote += writeback_sb_inodes(sb, wb, work);
+		if (!strcmp(sb->s_type->name, "vfat"))
+			has_vfat++;
 		up_read(&sb->s_umount);
 
 		/* refer to the same tests at the end of writeback_sb_inodes */
@@ -1873,8 +1888,12 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * after the other works are all done.
 		 */
 		if ((work->for_background || work->for_kupdate) &&
-		    !list_empty(&wb->work_list))
+		    !list_empty(&wb->work_list)) {
+			if (!list_empty(&wb->b_dirty)) {
+				pr_err("exit %s\n", current->comm);
+			}
 			break;
+		}
 
 		/*
 		 * For background writeout, stop when we are below the
@@ -1936,6 +1955,8 @@ static long wb_writeback(struct bdi_writeback *wb,
 	}
 	spin_unlock(&wb->list_lock);
 	blk_finish_plug(&plug);
+	global_sync_cond = 0;
+	smp_wmb();
 
 	return nr_pages - work->nr_pages;
 }
diff --git a/fs/sync.c b/fs/sync.c
index 1373a610dc78..2a888f53fb28 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -112,10 +112,10 @@ void ksys_sync(void)
 
 	wakeup_flusher_threads(WB_REASON_SYNC);
 	iterate_supers(sync_inodes_one_sb, NULL);
-	iterate_supers(sync_fs_one_sb, &nowait);
+	/*iterate_supers(sync_fs_one_sb, &nowait);
 	iterate_supers(sync_fs_one_sb, &wait);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
-	iterate_bdevs(fdatawait_one_bdev, NULL);
+	iterate_bdevs(fdatawait_one_bdev, NULL);*/
 	if (unlikely(laptop_mode))
 		laptop_sync_completion();
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b7f42d3dce26..1725352a8c14 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -44,6 +44,10 @@
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
 
+extern int global_sync_cond;	// There are bdev inode and fat inode in a same work, and bdev inode is the last inode to writeback
+extern int global_sync_cond2;	// The only one tag is gotten by one request, and the request is still in plug->mq_list
+extern int global_sync_cond3;	// wb_writeback is writing last page of bdev inode
+
 struct backing_dev_info;
 struct bdi_writeback;
 struct bio;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b72da123f242..ad560bb8550d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2160,6 +2160,7 @@ EXPORT_SYMBOL(tag_pages_for_writeback);
  *
  * Return: %0 on success, negative error code otherwise
  */
+int global_sync_cond3 = 0;
 int write_cache_pages(struct address_space *mapping,
 		      struct writeback_control *wbc, writepage_t writepage,
 		      void *data)
@@ -2175,6 +2176,8 @@ int write_cache_pages(struct address_space *mapping,
 	int range_whole = 0;
 	xa_mark_t tag;
 
+	global_sync_cond3 = 0;
+	smp_wmb();
 	pagevec_init(&pvec);
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* prev offset */
@@ -2237,6 +2240,11 @@ int write_cache_pages(struct address_space *mapping,
 			if (!clear_page_dirty_for_io(page))
 				goto continue_unlock;
 
+			if (i == nr_pages - 1 && !strcmp(mapping->host->i_sb->s_type->name, "bdev") && MINOR(I_BDEV(mapping->host)->bd_dev) == 16) {
+				global_sync_cond3 = 1;
+				smp_wmb();
+			}
+
 			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
 			error = (*writepage)(page, wbc, data);
 			if (unlikely(error)) {
-- 
2.31.1

b.c

#define _GNU_SOURCE

#include <dirent.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#define BUFF_SIZE (1024 * 600)
#define DEV "sdb"

/*
 * Reproducer driver: re-create a FAT filesystem on /dev/DEV, mount it on
 * /root/temp, lower vm.dirty_background_bytes and select the "none" I/O
 * scheduler, then repeatedly write BUFF_SIZE bytes to a file and fsync()
 * it after every write.
 *
 * Returns 0 when the loop finishes, 1 if the environment could not be set
 * up (mkfs/mount/open failed).
 */
int main(void) {
	/*
	 * static: keeps the large buffer off the stack and guarantees it is
	 * zero-initialized, so no indeterminate bytes are written to disk.
	 */
	static char buf[BUFF_SIZE];
	int fd;
	int i;

	/*
	 * Best effort: the mount point may simply not be mounted yet.
	 * Use the same absolute path as the mount below so the cleanup does
	 * not depend on the current working directory.
	 */
	system("umount /root/temp");
	if (system("mkfs.fat /dev/" DEV) != 0) {
		fprintf(stderr, "mkfs.fat /dev/" DEV " failed\n");
		return 1;
	}
	system("sysctl -w vm.dirty_background_bytes=20");
	system("echo none > /sys/block/" DEV "/queue/scheduler");
	/* Without a successful mount the writes would land on the root fs. */
	if (system("mount /dev/" DEV " /root/temp") != 0) {
		fprintf(stderr, "mount /dev/" DEV " /root/temp failed\n");
		return 1;
	}

	/* O_CREAT requires an explicit mode argument. */
	fd = open("/root/temp/file", O_RDWR | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	for (i = 0; i < 1000; ++i) {
		if (write(fd, buf, BUFF_SIZE) < 0) {
			/* e.g. the device is full: nothing more to reproduce. */
			perror("write");
			break;
		}
		printf("fsync %d\n", i);
		if (fsync(fd) < 0)
			perror("fsync");
		printf("done %d\n", i);
	}

	close(fd);
	return 0;
}

评论 (1)

chengzhihao 创建了缺陷

Hi czh549642238, welcome to the openEuler Community.
I'm the Bot here serving you. You can find the instructions on how to interact with me at Here.
If you have any questions, please contact the SIG: Kernel, and any of the maintainers: @YangYingliang , @pi3orama , @成坚 (CHENG Jian) , @jiaoff , @Qiuuuuu , @zhengzengkai , @刘勇强 , @Xie XiuQi

登录 后才可以发表评论

状态
负责人
项目
里程碑
Pull Requests
关联的 Pull Requests 被合并后可能会关闭此 issue
分支
开始日期   -   截止日期
-
置顶选项
优先级
预计工期 (小时)
参与者(2)
5329419 openeuler ci bot 1632792936
C
1
https://gitee.com/openeuler/kernel.git
git@gitee.com:openeuler/kernel.git
openeuler
kernel
kernel

搜索帮助