diff --git a/Documentation/filesystems/path-lookup.rst b/Documentation/filesystems/path-lookup.rst index c482e1619e7757c6a008305ab755693d2c9978e0..ede67f705787e26224d5c316070e67e1f45dca45 100644 --- a/Documentation/filesystems/path-lookup.rst +++ b/Documentation/filesystems/path-lookup.rst @@ -1321,18 +1321,18 @@ to lookup: RCU-walk, REF-walk, and REF-walk with forced revalidation. yet. This is primarily used to tell the audit subsystem the full context of a particular access being audited. -``LOOKUP_ROOT`` indicates that the ``root`` field in the ``nameidata`` was +``ND_ROOT_PRESET`` indicates that the ``root`` field in the ``nameidata`` was provided by the caller, so it shouldn't be released when it is no longer needed. -``LOOKUP_JUMPED`` means that the current dentry was chosen not because +``ND_JUMPED`` means that the current dentry was chosen not because it had the right name but for some other reason. This happens when following "``..``", following a symlink to ``/``, crossing a mount point or accessing a "``/proc/$PID/fd/$FD``" symlink (also known as a "magic link"). In this case the filesystem has not been asked to revalidate the name (with ``d_revalidate()``). In such cases the inode may still need to be revalidated, so ``d_op->d_weak_revalidate()`` is called if -``LOOKUP_JUMPED`` is set when the look completes - which may be at the +``ND_JUMPED`` is set when the lookup completes - which may be at the final component or, when creating, unlinking, or renaming, at the penultimate component. Resolution-restriction flags diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 867036aa90b8394cea582fdbd183e747a630488a..b00fb6313b58b779fe87f88e0fd20c281539de19 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -865,3 +865,12 @@ no matter what. Everything is handled by the caller. clone_private_mount() returns a longterm mount now, so the proper destructor of its result is kern_unmount() or kern_unmount_array(). + +--- + +**mandatory** + +Calling conventions for file_open_root() changed; now it takes struct path * +instead of passing mount and dentry separately. For callers that used to +pass a <mnt, mnt->mnt_root> pair (i.e. the root of the given mount), a new helper +is provided - file_open_root_mnt(). In-tree users adjusted. diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 02deac76673fde03255bf28785903cd988861ce8..2175465c9bf2a6e804a1b2b66d63de0e5cdffe8e 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -41,17 +41,17 @@ size change due to this facility. - Without page owner:: text data bss dec hex filename - 40662 1493 644 42799 a72f mm/page_alloc.o + 48392 2333 644 51369 c8a9 mm/page_alloc.o - With page owner:: text data bss dec hex filename - 40892 1493 644 43029 a815 mm/page_alloc.o - 1427 24 8 1459 5b3 mm/page_ext.o - 2722 50 0 2772 ad4 mm/page_owner.o + 48800 2445 644 51889 cab1 mm/page_alloc.o + 6662 108 29 6799 1a8f mm/page_owner.o + 1025 8 8 1041 411 mm/page_ext.o -Although, roughly, 4 KB code is added in total, page_alloc.o increase by -230 bytes and only half of it is in hotpath. Building the kernel with +Although, roughly, 8 KB of code is added in total, page_alloc.o increases by +520 bytes and less than half of it is in the hotpath. Building the kernel with page owner and turning it on if needed would be great option to debug kernel memory problem.
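A minimal sketch of the file_open_root() caller conversion described in the porting note above, modeled on the mconsole_kern.c hunk later in this patch; the helper name comes from that note, while the wrapper function here is purely illustrative:

	/* Sketch only: converting a caller that opens a path under the
	 * root of a given mount, per the porting.rst entry above. */
	#include <linux/fs.h>

	static struct file *open_under_mount(struct vfsmount *mnt, const char *name)
	{
		/* Old convention: dentry and mount passed separately. */
		/* return file_open_root(mnt->mnt_root, mnt, name, O_RDONLY, 0); */

		/* New convention: for the <mnt, mnt->mnt_root> case, use the helper. */
		return file_open_root_mnt(mnt, name, O_RDONLY, 0);
	}
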
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index dc8f152f35566710060f1d82e03d464200ee4bc5..307f381eee71b5762faacd9151f108c170ed3cdb 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -38,9 +38,10 @@ icache_size: * procedures. */ ENTRY(v7_invalidate_l1) - mov r0, #0 - mcr p15, 2, r0, c0, c0, 0 - mrc p15, 1, r0, c0, c0, 0 + mov r0, #0 + mcr p15, 2, r0, c0, c0, 0 @ select L1 data cache in CSSELR + isb + mrc p15, 1, r0, c0, c0, 0 @ read cache geometry from CCSIDR movw r1, #0x7fff and r2, r1, r0, lsr #13 diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index a5636524af7693a5f7900c3e92f87a31d9e0323c..e2af6b172200ed40ddd2e7aee4de301c0a6c9feb 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -446,7 +446,8 @@ void __init acpi_numa_fixup(void) if (srat_num_cpus == 0) { node_set_online(0); node_cpuid[0].phys_id = hard_smp_processor_id(); - return; + slit_distance(0, 0) = LOCAL_DISTANCE; + goto out; } /* @@ -489,7 +490,7 @@ void __init acpi_numa_fixup(void) for (j = 0; j < MAX_NUMNODES; j++) slit_distance(i, j) = i == j ? LOCAL_DISTANCE : REMOTE_DISTANCE; - return; + goto out; } memset(numa_slit, -1, sizeof(numa_slit)); @@ -514,6 +515,8 @@ void __init acpi_numa_fixup(void) printk("\n"); } #endif +out: + node_possible_map = node_online_map; } #endif /* CONFIG_ACPI_NUMA */ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 3beeb030cd78e9bbcababbf4de178c588fbe0eee..424635b24b521128e71daf9ca8914166ea346992 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -65,6 +65,11 @@ static inline int early_cpu_to_node(int cpu) int of_drconf_to_nid_single(struct drmem_lmb *lmb); +extern void map_cpu_to_node(int cpu, int node); +#ifdef CONFIG_HOTPLUG_CPU +extern void unmap_cpu_from_node(unsigned long cpu); +#endif /* CONFIG_HOTPLUG_CPU */ + #else static inline int early_cpu_to_node(int cpu) { return 0; } @@ -93,6 +98,14 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb) return first_online_node; } + +#ifdef CONFIG_SMP +static inline void map_cpu_to_node(int cpu, int node) {} +#ifdef CONFIG_HOTPLUG_CPU +static inline void unmap_cpu_from_node(unsigned long cpu) {} +#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_SMP */ + #endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 452cbf98bfd714aefddc1b9373663add4e7ee948..b7bed52a5a73d2eae94f5de4ae81bb519f2a989f 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1300,6 +1300,8 @@ static void remove_cpu_from_masks(int cpu) struct cpumask *(*mask_fn)(int) = cpu_sibling_mask; int i; + unmap_cpu_from_node(cpu); + if (shared_caches) mask_fn = cpu_l2_cache_mask; @@ -1384,6 +1386,7 @@ static void add_cpu_to_masks(int cpu) * This CPU will not be in the online mask yet so we need to manually * add it to it's own thread sibling mask. 
*/ + map_cpu_to_node(cpu, cpu_to_node(cpu)); cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(cpu)); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 094a1076fd1fe3601f445048c669d836de6e4ec1..23bfe00811aea0427a0154b30d3c2f3297c3c6b3 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -137,7 +137,7 @@ static void reset_numa_cpu_lookup_table(void) numa_cpu_lookup_table[cpu] = -1; } -static void map_cpu_to_node(int cpu, int node) +void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); @@ -148,7 +148,7 @@ static void map_cpu_to_node(int cpu, int node) } #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) -static void unmap_cpu_from_node(unsigned long cpu) +void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; @@ -598,9 +598,6 @@ static int ppc_numa_cpu_prepare(unsigned int cpu) static int ppc_numa_cpu_dead(unsigned int cpu) { -#ifdef CONFIG_HOTPLUG_CPU - unmap_cpu_from_node(cpu); -#endif return 0; } diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 5218f5da27370b29b8d1a98aaaf8753596236ef1..30551bbd7988aca459748df92ffe7300c3bd68c4 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -380,6 +380,8 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, /* Remove link to a group from table's list of attached groups */ found = false; + + rcu_read_lock(); list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { if (tgl->table_group == table_group) { list_del_rcu(&tgl->next); @@ -388,6 +390,8 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, break; } } + rcu_read_unlock(); + if (WARN_ON(!found)) return; diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index a2e680f7d39f25af3bcfc09bfaac464de7faec66..6a22ead31c5b8159537e12abd2ce1f263a56d5ee 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -140,7 +140,7 @@ void mconsole_proc(struct mc_request *req) mconsole_reply(req, "Proc not available", 1, 0); goto out; } - file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0); + file = file_open_root_mnt(mnt, ptr, O_RDONLY, 0); if (IS_ERR(file)) { mconsole_reply(req, "Failed to open file", 1, 0); printk(KERN_ERR "open /proc/%s: %ld\n", ptr, PTR_ERR(file)); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index b8aee71840ae507a0b93f3187350c75799dc8f94..7694c541e3d804240e1e383b780cdb4759c13c8c 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -154,7 +154,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) if (num_entries > LDT_ENTRIES) return NULL; - new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); + new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT); if (!new_ldt) return NULL; @@ -168,9 +168,9 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) * than PAGE_SIZE. 
*/ if (alloc_size > PAGE_SIZE) - new_ldt->entries = vzalloc(alloc_size); + new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else - new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL); + new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!new_ldt->entries) { kfree(new_ldt); diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b791e2041e49b3bbdf1c3e41fc8ea1fb589e647a..a6bcc779c9124e64f959bb53e4e5271c9b24387d 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { if (blkg_rwstat_init(&stats->bytes, gfp) || blkg_rwstat_init(&stats->ios, gfp)) - return -ENOMEM; + goto error; #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || @@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) bfq_stat_init(&stats->dequeue, gfp) || bfq_stat_init(&stats->group_wait_time, gfp) || bfq_stat_init(&stats->idle_time, gfp) || - bfq_stat_init(&stats->empty_time, gfp)) { - bfqg_stats_exit(stats); - return -ENOMEM; - } + bfq_stat_init(&stats->empty_time, gfp)) + goto error; #endif return 0; + +error: + bfqg_stats_exit(stats); + return -ENOMEM; } static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fd3c23d516b8a808c2111fd601c4c9084058b62c..aa1a808fa0724dfaeeb4fd823c98c776ca926b19 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2235,9 +2235,9 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock_irq(&bfqd->lock); if (free) blk_mq_free_request(free); - spin_unlock_irq(&bfqd->lock); return ret; } @@ -2326,7 +2326,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, *next_bfqq = bfq_init_rq(next); if (!bfqq) - return; + goto remove; /* * If next and rq belong to the same bfq_queue and next is older @@ -2349,6 +2349,14 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +remove: + /* Merged request may be in the IO scheduler. Remove it. 
*/ + if (!RB_EMPTY_NODE(&next->rb_node)) { + bfq_remove_request(next->q, next); + if (next_bfqq) + bfqg_stats_update_io_remove(bfqq_group(next_bfqq), + next->cmd_flags); + } } /* Must be called with bfqq != NULL */ @@ -5500,14 +5508,16 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct bfq_queue *bfqq; bool idle_timer_disabled = false; unsigned int cmd_flags; + LIST_HEAD(free); #ifdef CONFIG_BFQ_GROUP_IOSCHED if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) bfqg_stats_update_legacy_io(q, rq); #endif spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); + blk_mq_free_requests(&free); return; } @@ -5901,6 +5911,7 @@ static void bfq_finish_requeue_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd; + unsigned long flags; /* * rq either is not associated with any icq, or is an already @@ -5918,40 +5929,16 @@ static void bfq_finish_requeue_request(struct request *rq) rq->io_start_time_ns, rq->cmd_flags); + spin_lock_irqsave(&bfqd->lock, flags); if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); - if (rq == bfqd->waited_rq) bfq_update_inject_limit(bfqd, bfqq); bfq_completed_request(bfqq, bfqd); - bfq_finish_requeue_request_body(bfqq); atomic_dec(&rq->mq_hctx->elevator_queued); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, - * in which case we need to remove it (this should - * never happen in case of requeue). And we cannot - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. - * This situation seems to occur only in process - * context, as a consequence of a merge. In the - * current version of the code, this implies that the - * lock is held. - */ - - if (!RB_EMPTY_NODE(&rq->rb_node)) { - bfq_remove_request(rq->q, rq); - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } - bfq_finish_requeue_request_body(bfqq); } + bfq_finish_requeue_request_body(bfqq); + spin_unlock_irqrestore(&bfqd->lock, flags); /* * Reset private fields. In case of a requeue, this allows diff --git a/block/bio.c b/block/bio.c index 0703a208ca248cfaf41a779ad434db50d89ca7b2..b2e0f0c94c5f71b526aa2a570f639b8d6d18fd00 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1433,7 +1433,7 @@ void bio_endio(struct bio *bio) if (!bio_integrity_endio(bio)) return; - if (bio->bi_disk) + if (bio->bi_disk && bio_flagged(bio, BIO_TRACKED)) rq_qos_done_bio(bio->bi_disk->queue, bio); /* diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5b19665bc486a8b410b48a349529675f7c299a1c..40f15807efec50d6f2e0b139d83cb07513e8295d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -620,6 +620,14 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, q = disk->queue; + /* + * blkcg_deactivate_policy() requires the queue to be frozen. Grab + * q_usage_counter to prevent this from racing with blkcg_deactivate_policy().
+ */ + ret = blk_queue_enter(q, 0); + if (ret) + goto fail; + rcu_read_lock(); spin_lock_irq(&q->queue_lock); @@ -654,13 +662,13 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, new_blkg = blkg_alloc(pos, q, GFP_KERNEL); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto fail; + goto fail_exit_queue; } if (radix_tree_preload(GFP_KERNEL)) { blkg_free(new_blkg); ret = -ENOMEM; - goto fail; + goto fail_exit_queue; } rcu_read_lock(); @@ -689,6 +697,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto success; } success: + blk_queue_exit(q); ctx->disk = disk; ctx->blkg = blkg; ctx->body = input; @@ -699,6 +708,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, fail_unlock: spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); +fail_exit_queue: + blk_queue_exit(q); fail: put_disk_and_module(disk); /* diff --git a/block/blk-core.c b/block/blk-core.c index 63e9c0d78537bc5f955ae04f87481200a02167fc..a50205bcb755419fd161c440f2e99746029cb753 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -897,10 +897,8 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (unlikely(!current->io_context)) create_task_io_context(current, GFP_ATOMIC, q->node); - if (blk_throtl_bio(bio)) { - blkcg_bio_issue_init(bio); + if (blk_throtl_bio(bio)) return false; - } blk_cgroup_bio_start(bio); blkcg_bio_issue_init(bio); diff --git a/block/blk-exec.c b/block/blk-exec.c index 85324d53d072f0809c567f98214e1bc552c145c4..b2676de4c6a57dd66618273be5a0b2b62a034af6 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -21,7 +21,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) { struct completion *waiting = rq->end_io_data; - rq->end_io_data = NULL; + rq->end_io_data = (void *)(uintptr_t)error; /* * complete last, if this is a stack request the process (and thus @@ -75,8 +75,9 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. + * Return: The blk_status_t result provided to blk_mq_end_request(). 
*/ -void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, +blk_status_t blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, struct request *rq, int at_head) { DECLARE_COMPLETION_ONSTACK(wait); @@ -91,5 +92,7 @@ void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); else wait_for_completion_io(&wait); + + return (blk_status_t)(uintptr_t)rq->end_io_data; } EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 57299f860d41ebb9d09414fe976def0e5ab6ac51..1d6ba8ff5a663077790278bc8b7c3ee64fe97d8e 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -242,9 +242,9 @@ void ioc_clear_queue(struct request_queue *q) spin_lock_irq(&q->queue_lock); list_splice_init(&q->icq_list, &icq_list); - spin_unlock_irq(&q->queue_lock); __ioc_clear_queue(&icq_list); + spin_unlock_irq(&q->queue_lock); } int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) diff --git a/block/blk-merge.c b/block/blk-merge.c index 26f4bcc10de9d7d550d78ef765b50fad80bd5276..6518e0ae283516039419341010e242fe7d69654f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -831,18 +831,15 @@ static struct request *attempt_front_merge(struct request_queue *q, return NULL; } -int blk_attempt_req_merge(struct request_queue *q, struct request *rq, - struct request *next) +/* + * Try to merge 'next' into 'rq'. Return true if the merge happened, false + * otherwise. The caller is responsible for freeing 'next' if the merge + * happened. + */ +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next) { - struct request *free; - - free = attempt_merge(q, rq, next); - if (free) { - blk_put_request(free); - return 1; - } - - return 0; + return attempt_merge(q, rq, next); } bool blk_rq_merge_ok(struct request *rq, struct bio *bio) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 24c08963890e97e2089feaf015b111f3f7baf44d..606bef13f1c210d88dd7c970b3670125cf63c6d8 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -381,9 +381,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, return ret; } -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { - return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); + return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free); } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); @@ -506,19 +507,6 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, percpu_ref_put(&q->q_usage_counter); } -static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, - struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) -{ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; - - if (hctx->sched_tags) { - blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); - blk_mq_free_rq_map(hctx->sched_tags, flags); - hctx->sched_tags = NULL; - } -} - static int blk_mq_sched_alloc_tags(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) @@ -534,8 +522,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, return -ENOMEM; ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); - if (ret) - blk_mq_sched_free_tags(set, hctx, hctx_idx); + if (ret) { + blk_mq_free_rq_map(hctx->sched_tags, flags); + hctx->sched_tags = NULL; + } return ret; } diff --git a/block/blk-mq-sched.h 
b/block/blk-mq-sched.h index 0476360f05f18a23c01bfc92c0ba2d891c88ce2e..15f3d611db1048760d71710f290c69e60643d7ce 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -12,7 +12,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); diff --git a/block/blk-mq.c b/block/blk-mq.c index b0e7ec85a41cf128d4d2ef6b4288c69903300b5c..a9aac1dd0f899168eeb81cc29666e71e4d24d294 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -206,7 +206,12 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); */ void blk_mq_quiesce_queue_nowait(struct request_queue *q) { - blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); + unsigned long flags; + + spin_lock_irqsave(&q->queue_lock, flags); + if (!q->quiesce_depth++) + blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); + spin_unlock_irqrestore(&q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); @@ -247,10 +252,21 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); */ void blk_mq_unquiesce_queue(struct request_queue *q) { - blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + unsigned long flags; + bool run_queue = false; + + spin_lock_irqsave(&q->queue_lock, flags); + if (WARN_ON_ONCE(q->quiesce_depth <= 0)) { + ; + } else if (!--q->quiesce_depth) { + blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + run_queue = true; + } + spin_unlock_irqrestore(&q->queue_lock, flags); /* dispatch requests which are inserted during quiescing */ - blk_mq_run_hw_queues(q, true); + if (run_queue) + blk_mq_run_hw_queues(q, true); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); @@ -3239,8 +3255,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { - if (hctx->tags) - blk_mq_free_map_and_requests(set, j); blk_mq_exit_hctx(q, set, hctx, j); hctxs[j] = NULL; } @@ -3726,8 +3740,13 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { + int i = prev_nr_hw_queues; + pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); + for (; i < set->nr_hw_queues; i++) + blk_mq_free_map_and_requests(set, i); + set->nr_hw_queues = prev_nr_hw_queues; blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); goto fallback; @@ -3973,6 +3992,19 @@ unsigned int blk_mq_rq_cpu(struct request *rq) } EXPORT_SYMBOL(blk_mq_rq_cpu); +void blk_mq_cancel_work_sync(struct request_queue *q) +{ + if (queue_is_mq(q)) { + struct blk_mq_hw_ctx *hctx; + int i; + + cancel_delayed_work_sync(&q->requeue_work); + + queue_for_each_hw_ctx(q, hctx, i) + cancel_delayed_work_sync(&hctx->run_work); + } +} + static int __init blk_mq_init(void) { int i; diff --git a/block/blk-mq.h b/block/blk-mq.h index f792a0920ebb13e3857e6a5d3397e13775c67e53..7f3194657dffbd150fd33fc72c316e1d47c56f22 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -129,6 +129,7 @@ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); 
+void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_release(struct request_queue *q); static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, @@ -282,6 +283,17 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q, return NULL; } +/* Free all requests on the list */ +static inline void blk_mq_free_requests(struct list_head *list) +{ + while (!list_empty(list)) { + struct request *rq = list_entry_rq(list->next); + + list_del_init(&rq->queuelist); + blk_mq_free_request(rq); + } +} + /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b513f1683af06f8cb8d43361a13620cb14c6d20c..66765740902bb6d911719c1fbe07393bc26a692f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -910,6 +910,9 @@ int blk_register_queue(struct gendisk *disk) blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); blk_throtl_register_queue(q); + spin_lock_irq(&q->queue_lock); + blk_queue_flag_set(QUEUE_FLAG_THROTL_INIT_DONE, q); + spin_unlock_irq(&q->queue_lock); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&q->kobj, KOBJ_ADD); @@ -942,6 +945,10 @@ void blk_unregister_queue(struct gendisk *disk) if (!blk_queue_registered(q)) return; + spin_lock_irq(&q->queue_lock); + blk_queue_flag_clear(QUEUE_FLAG_THROTL_INIT_DONE, q); + spin_unlock_irq(&q->queue_lock); + /* * Since sysfs_remove_dir() prevents adding new directory entries * before removal of existing entries starts, protect against diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c53a254171a29e0dac95596b5232230f32e48356..4087dcc13f7db2f9ad2f637a544dd9350369296b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -11,6 +11,7 @@ #include #include #include +#include <linux/delay.h> #include "blk.h" #include "blk-cgroup-rwstat.h" @@ -1456,6 +1457,31 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) } } +static inline int throtl_check_init_done(struct request_queue *q) +{ + if (test_bit(QUEUE_FLAG_THROTL_INIT_DONE, &q->queue_flags)) + return 0; + + return blk_queue_dying(q) ? -ENODEV : -EBUSY; +} + +/* + * If throtl_check_init_done() returns -EBUSY, retry after a short + * msleep(), since throttle init will be completed in blk_register_queue() + * soon.
+ */ +static inline int throtl_restart_syscall_when_busy(int errno) +{ + int ret = errno; + + if (ret == -EBUSY) { + msleep(10); + ret = restart_syscall(); + } + + return ret; +} + static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { @@ -1469,6 +1495,10 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, if (ret) return ret; + ret = throtl_check_init_done(ctx.disk->queue); + if (ret) + goto out_finish; + ret = -EINVAL; if (sscanf(ctx.body, "%llu", &v) != 1) goto out_finish; @@ -1486,6 +1516,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, ret = 0; out_finish: blkg_conf_finish(&ctx); + ret = throtl_restart_syscall_when_busy(ret); return ret ?: nbytes; } @@ -1661,8 +1692,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, if (ret) return ret; - tg = blkg_to_tg(ctx.blkg); + ret = throtl_check_init_done(ctx.disk->queue); + if (ret) + goto out_finish; + tg = blkg_to_tg(ctx.blkg); v[0] = tg->bps_conf[READ][index]; v[1] = tg->bps_conf[WRITE][index]; v[2] = tg->iops_conf[READ][index]; @@ -1758,6 +1792,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, ret = 0; out_finish: blkg_conf_finish(&ctx); + ret = throtl_restart_syscall_when_busy(ret); return ret ?: nbytes; } diff --git a/block/blk.h b/block/blk.h index ab99c522a31e3387c20ad9f25ebb6416bdbd2f83..ca4de4985946d5a0acf1fc30335b4dd93b3ddc67 100644 --- a/block/blk.h +++ b/block/blk.h @@ -229,7 +229,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *, void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); -int blk_attempt_req_merge(struct request_queue *q, struct request *rq, +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); diff --git a/block/elevator.c b/block/elevator.c index 2a525863d4e92ab9b19bcb1b40d923365e5bbdef..65dfc7559a369ed7a708c12fd99744f6708e9ad5 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -353,9 +353,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, * we can append 'rq' to an existing request, so we can throw 'rq' away * afterwards. * - * Returns true if we merged, false otherwise + * Returns true if we merged, false otherwise. 'free' will contain all + * requests that need to be freed. */ -bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) +bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { struct request *__rq; bool ret; @@ -366,8 +368,10 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) /* * First try one-hit cache. 
*/ - if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) + if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) { + list_add(&rq->queuelist, free); return true; + } if (blk_queue_noxmerges(q)) return false; @@ -381,6 +385,7 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq, if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) break; + list_add(&rq->queuelist, free); /* The merged request could be merged with others, try again */ ret = true; rq = __rq; @@ -624,6 +629,9 @@ static inline bool elv_support_iosched(struct request_queue *q) */ static struct elevator_type *elevator_get_default(struct request_queue *q) { + if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) + return NULL; + if (q->nr_hw_queues != 1) return NULL; @@ -681,12 +689,18 @@ void elevator_init_mq(struct request_queue *q) if (!e) return; + /* + * We are called before the disk is added, when there is no FS I/O, + * so freezing the queue plus canceling dispatch work is enough to + * drain any dispatch activity originating from passthrough + * requests; there is then no need to quiesce the queue, which could + * add long boot latency, especially when many disks are involved. + */ blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); + blk_mq_cancel_work_sync(q); err = blk_mq_init_sched(q, e); - blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); if (err) { diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 7f9ef773bf44419c317f0a7a4f8c6bbfc7064b6d..d2648356e430070fb4c81f9a2e4005638c2539f0 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -353,19 +353,9 @@ static void kyber_timer_fn(struct timer_list *t) } } -static unsigned int kyber_sched_tags_shift(struct request_queue *q) -{ - /* - * All of the hardware queues have the same depth, so we can just grab - * the shift of the first one.
- */ - return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift; -} - static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) { struct kyber_queue_data *kqd; - unsigned int shift; int ret = -ENOMEM; int i; @@ -400,9 +390,6 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) kqd->latency_targets[i] = kyber_latency_targets[i]; } - shift = kyber_sched_tags_shift(q); - kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; - return kqd; err_buckets: @@ -458,9 +445,19 @@ static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq) INIT_LIST_HEAD(&kcq->rq_list[i]); } -static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) { struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int shift = tags->bitmap_tags->sb.shift; + + kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; + + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth); +} + +static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ struct kyber_hctx_data *khd; int i; @@ -502,8 +499,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) khd->batching = 0; hctx->sched_data = khd; - sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags, - kqd->async_depth); + kyber_depth_updated(hctx); return 0; @@ -1023,6 +1019,7 @@ static struct elevator_type kyber_sched = { .completed_request = kyber_completed_request, .dispatch_request = kyber_dispatch_request, .has_work = kyber_has_work, + .depth_updated = kyber_depth_updated, }, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = kyber_queue_debugfs_attrs, diff --git a/block/mq-deadline.c b/block/mq-deadline.c index e4e90761eab35108566458caeb74c64292864a63..43994cce1eb26ba65f23a0f25da72c96362291ea 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -489,6 +489,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + LIST_HEAD(free); /* * This may be a requeue of a write request that has locked its @@ -496,8 +497,10 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ blk_req_zone_write_unlock(rq); - if (blk_mq_sched_try_insert_merge(q, rq)) + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { + blk_mq_free_requests(&free); return; + } blk_mq_sched_request_inserted(rq); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f0fa0c8e7ec60c151c0562bba147e8553dcd11f2..84f3191f4566c89e04c19f583f963cc0b6febf59 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -233,7 +233,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) * @bsize: size to validate */ static int -loop_validate_block_size(unsigned short bsize) +loop_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) return -EINVAL; @@ -2117,7 +2117,8 @@ static int loop_add(struct loop_device **l, int i) lo->tag_set.queue_depth = 128; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING | + BLK_MQ_F_NO_SCHED_BY_DEFAULT; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); diff --git 
a/drivers/block/nbd.c b/drivers/block/nbd.c index f7f1d9dbdc806d2de30bcaf57d1ad5f73f9d4050..07b06fc6f70e0d0c412a0520f9091f9b34054e4e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -114,14 +114,20 @@ struct nbd_device { struct workqueue_struct *recv_workq; struct list_head list; - struct task_struct *task_recv; struct task_struct *task_setup; struct completion *destroy_complete; unsigned long flags; + pid_t pid; /* pid of nbd-client, if attached */ }; #define NBD_CMD_REQUEUED 1 +/* + * This flag will be set if nbd_queue_rq() succeeds, and will be checked and + * cleared on completion. Both setting and clearing of the flag are protected + * by cmd->lock. + */ +#define NBD_CMD_INFLIGHT 2 struct nbd_cmd { struct nbd_device *nbd; @@ -208,7 +214,7 @@ static ssize_t pid_show(struct device *dev, struct gendisk *disk = dev_to_disk(dev); struct nbd_device *nbd = (struct nbd_device *)disk->private_data; - return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv)); + return sprintf(buf, "%d\n", nbd->pid); } static const struct device_attribute pid_attr = { @@ -327,7 +333,7 @@ static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize, struct nbd_config *config = nbd->config; config->blksize = blocksize; config->bytesize = blocksize * nr_blocks; - if (nbd->task_recv != NULL) + if (nbd->pid) nbd_size_update(nbd, false); } @@ -389,6 +395,11 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, if (!mutex_trylock(&cmd->lock)) return BLK_EH_RESET_TIMER; + if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + mutex_unlock(&cmd->lock); + return BLK_EH_DONE; + } + if (!refcount_inc_not_zero(&nbd->config_refs)) { cmd->status = BLK_STS_TIMEOUT; mutex_unlock(&cmd->lock); @@ -468,7 +479,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, } /* - * Send or receive packet. + * Send or receive packet. Return a positive value on success and + * a negative value on failure, and never return 0. */ static int sock_xmit(struct nbd_device *nbd, int index, int send, struct iov_iter *iter, int msg_flags, int *sent) @@ -594,7 +606,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) result = sock_xmit(nbd, index, 1, &from, (type == NBD_CMD_WRITE) ?
MSG_MORE : 0, &sent); trace_nbd_header_sent(req, handle); - if (result <= 0) { + if (result < 0) { if (was_interrupted(result)) { /* If we havne't sent anything we can just return BUSY, * however if we have sent something we need to make @@ -638,7 +650,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) skip = 0; } result = sock_xmit(nbd, index, 1, &from, flags, &sent); - if (result <= 0) { + if (result < 0) { if (was_interrupted(result)) { /* We've already sent the header, we * have no choice but to set pending and @@ -672,38 +684,45 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) return 0; } -/* NULL returned = something went wrong, inform userspace */ -static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) +static int nbd_read_reply(struct nbd_device *nbd, int index, + struct nbd_reply *reply) { - struct nbd_config *config = nbd->config; - int result; - struct nbd_reply reply; - struct nbd_cmd *cmd; - struct request *req = NULL; - u64 handle; - u16 hwq; - u32 tag; - struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)}; + struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)}; struct iov_iter to; - int ret = 0; + int result; - reply.magic = 0; - iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply)); + reply->magic = 0; + iov_iter_kvec(&to, READ, &iov, 1, sizeof(*reply)); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); - if (result <= 0) { - if (!nbd_disconnected(config)) + if (result < 0) { + if (!nbd_disconnected(nbd->config)) dev_err(disk_to_dev(nbd->disk), "Receive control failed (result %d)\n", result); - return ERR_PTR(result); + return result; } - if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { + if (ntohl(reply->magic) != NBD_REPLY_MAGIC) { dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", - (unsigned long)ntohl(reply.magic)); - return ERR_PTR(-EPROTO); + (unsigned long)ntohl(reply->magic)); + return -EPROTO; } - memcpy(&handle, reply.handle, sizeof(handle)); + return 0; +} + +/* NULL returned = something went wrong, inform userspace */ +static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index, + struct nbd_reply *reply) +{ + int result; + struct nbd_cmd *cmd; + struct request *req = NULL; + u64 handle; + u16 hwq; + u32 tag; + int ret = 0; + + memcpy(&handle, reply->handle, sizeof(handle)); tag = nbd_handle_to_tag(handle); hwq = blk_mq_unique_tag_to_hwq(tag); if (hwq < nbd->tag_set.nr_hw_queues) @@ -718,6 +737,16 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) cmd = blk_mq_rq_to_pdu(req); mutex_lock(&cmd->lock); + if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)", + tag, cmd->status, cmd->flags); + ret = -ENOENT; + goto out; + } + if (cmd->index != index) { + dev_err(disk_to_dev(nbd->disk), "Unexpected reply %d from different sock %d (expected %d)", + tag, index, cmd->index); + } if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) { dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n", req, cmd->cmd_cookie, nbd_handle_to_cookie(handle)); @@ -736,9 +765,9 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) ret = -ENOENT; goto out; } - if (ntohl(reply.error)) { + if (ntohl(reply->error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", - ntohl(reply.error)); + ntohl(reply->error)); cmd->status = BLK_STS_IOERR; goto out; } @@ -747,11 +776,12 @@ static 
struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) if (rq_data_dir(req) != WRITE) { struct req_iterator iter; struct bio_vec bvec; + struct iov_iter to; rq_for_each_segment(bvec, req, iter) { iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); - if (result <= 0) { + if (result < 0) { dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); /* @@ -760,7 +790,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) * and let the timeout stuff handle resubmitting * this request onto another connection. */ - if (nbd_disconnected(config)) { + if (nbd_disconnected(nbd->config)) { cmd->status = BLK_STS_IOERR; goto out; } @@ -784,24 +814,46 @@ static void recv_work(struct work_struct *work) work); struct nbd_device *nbd = args->nbd; struct nbd_config *config = nbd->config; + struct request_queue *q = nbd->disk->queue; + struct nbd_sock *nsock; struct nbd_cmd *cmd; struct request *rq; while (1) { - cmd = nbd_read_stat(nbd, args->index); - if (IS_ERR(cmd)) { - struct nbd_sock *nsock = config->socks[args->index]; + struct nbd_reply reply; - mutex_lock(&nsock->tx_lock); - nbd_mark_nsock_dead(nbd, nsock, 1); - mutex_unlock(&nsock->tx_lock); + if (nbd_read_reply(nbd, args->index, &reply)) + break; + + /* + * Grab .q_usage_counter so the request pool won't go away; then no + * request use-after-free is possible during nbd_handle_reply(). + * If the queue is frozen, there won't be any inflight requests, and we + * need not handle the incoming garbage message. + */ + if (!percpu_ref_tryget(&q->q_usage_counter)) { + dev_err(disk_to_dev(nbd->disk), "%s: no io inflight\n", + __func__); + break; + } + + cmd = nbd_handle_reply(nbd, args->index, &reply); + if (IS_ERR(cmd)) { + percpu_ref_put(&q->q_usage_counter); break; } rq = blk_mq_rq_from_pdu(cmd); if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); + percpu_ref_put(&q->q_usage_counter); } + + nsock = config->socks[args->index]; + mutex_lock(&nsock->tx_lock); + nbd_mark_nsock_dead(nbd, nsock, 1); + mutex_unlock(&nsock->tx_lock); + nbd_config_put(nbd); atomic_dec(&config->recv_threads); wake_up(&config->recv_wq); @@ -817,6 +869,10 @@ static bool nbd_clear_req(struct request *req, void *data, bool reserved) return true; mutex_lock(&cmd->lock); + if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + mutex_unlock(&cmd->lock); + return true; + } cmd->status = BLK_STS_IOERR; mutex_unlock(&cmd->lock); @@ -898,7 +954,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) if (!refcount_inc_not_zero(&nbd->config_refs)) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Socks array is empty\n"); - blk_mq_start_request(req); return -EINVAL; } config = nbd->config; @@ -907,7 +962,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) dev_err_ratelimited(disk_to_dev(nbd->disk), "Attempted send on invalid socket\n"); nbd_config_put(nbd); - blk_mq_start_request(req); return -EINVAL; } cmd->status = BLK_STS_OK; @@ -931,7 +985,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) */ sock_shutdown(nbd); nbd_config_put(nbd); - blk_mq_start_request(req); return -EIO; } goto again; @@ -953,7 +1006,13 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) * returns EAGAIN can be retried on a different socket. */ ret = nbd_send_cmd(nbd, cmd, index); - if (ret == -EAGAIN) { + /* + * Access to this flag is protected by cmd->lock, thus it's safe to set + * the flag after nbd_send_cmd() succeeds in sending the request to the server.
+ */ + if (!ret) + __set_bit(NBD_CMD_INFLIGHT, &cmd->flags); + else if (ret == -EAGAIN) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Request send failed, requeueing\n"); nbd_mark_nsock_dead(nbd, nsock, 1); @@ -1190,7 +1249,7 @@ static void send_disconnects(struct nbd_device *nbd) iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); mutex_lock(&nsock->tx_lock); ret = sock_xmit(nbd, i, 1, &from, 0, NULL); - if (ret <= 0) + if (ret < 0) dev_err(disk_to_dev(nbd->disk), "Send disconnect failed %d\n", ret); mutex_unlock(&nsock->tx_lock); @@ -1225,7 +1284,7 @@ static void nbd_config_put(struct nbd_device *nbd) if (test_and_clear_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags)) device_remove_file(disk_to_dev(nbd->disk), &pid_attr); - nbd->task_recv = NULL; + nbd->pid = 0; nbd_clear_sock(nbd); if (config->num_connections) { int i; @@ -1260,7 +1319,7 @@ static int nbd_start_device(struct nbd_device *nbd) int num_connections = config->num_connections; int error = 0, i; - if (nbd->task_recv) + if (nbd->pid) return -EBUSY; if (!config->socks) return -EINVAL; @@ -1279,7 +1338,7 @@ static int nbd_start_device(struct nbd_device *nbd) } blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections); - nbd->task_recv = current; + nbd->pid = task_pid_nr(current); nbd_parse_flags(nbd); @@ -1385,6 +1444,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) { struct nbd_config *config = nbd->config; + loff_t bytesize; switch (cmd) { case NBD_DISCONNECT: @@ -1407,6 +1467,8 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, div_s64(arg, config->blksize)); return 0; case NBD_SET_SIZE_BLOCKS: + if (check_mul_overflow((loff_t)arg, config->blksize, &bytesize)) + return -EINVAL; nbd_size_set(nbd, config->blksize, arg); return 0; case NBD_SET_TIMEOUT: @@ -1550,8 +1612,8 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused) { struct nbd_device *nbd = s->private; - if (nbd->task_recv) - seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv)); + if (nbd->pid) + seq_printf(s, "recv: %d\n", nbd->pid); return 0; } @@ -1759,7 +1821,18 @@ static int nbd_dev_add(int index) refcount_set(&nbd->refs, 1); INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; + + /* + * A too-large index can cause duplicate creation of sysfs files/links, + * because MKDEV() expects the max first minor to be MINORMASK, and + * index << part_shift can overflow.
+ */ disk->first_minor = index << part_shift; + if (disk->first_minor < index || disk->first_minor > MINORMASK) { + err = -EINVAL; + goto out_free_tags; + } + disk->fops = &nbd_fops; disk->private_data = nbd; sprintf(disk->disk_name, "nbd%d", index); @@ -2126,7 +2199,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) mutex_lock(&nbd->config_lock); config = nbd->config; if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || - !nbd->task_recv) { + !nbd->pid) { dev_err(nbd_to_dev(nbd), "not configured, cannot reconfigure\n"); ret = -EINVAL; diff --git a/drivers/iio/buffer/industrialio-buffer-cb.c b/drivers/iio/buffer/industrialio-buffer-cb.c index 47c96f7f4976279abed928635245832d0b5b620f..4c12b7a94af59ae9c3ac01b17b8704f85ec22e87 100644 --- a/drivers/iio/buffer/industrialio-buffer-cb.c +++ b/drivers/iio/buffer/industrialio-buffer-cb.c @@ -54,6 +54,11 @@ struct iio_cb_buffer *iio_channel_get_all_cb(struct device *dev, struct iio_cb_buffer *cb_buff; struct iio_channel *chan; + if (!cb) { + dev_err(dev, "Invalid arguments: A callback must be provided!\n"); + return ERR_PTR(-EINVAL); + } + cb_buff = kzalloc(sizeof(*cb_buff), GFP_KERNEL); if (cb_buff == NULL) return ERR_PTR(-ENOMEM); diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 261d3b17edc9a9da82066a31d155b8194471c802..ea98aad9fb818d46b53debc0298c2e6bc68ac178 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1720,6 +1720,7 @@ static const struct iio_buffer_setup_ops noop_ring_setup_ops; int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod) { + const char *label; int ret; if (!indio_dev->info) @@ -1730,8 +1731,9 @@ int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod) if (!indio_dev->dev.of_node && indio_dev->dev.parent) indio_dev->dev.of_node = indio_dev->dev.parent->of_node; - indio_dev->label = of_get_property(indio_dev->dev.of_node, "label", - NULL); + label = of_get_property(indio_dev->dev.of_node, "label", NULL); + if (label) + indio_dev->label = label; ret = iio_check_unique_scan_index(indio_dev); if (ret < 0) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 19a70f434029b53ce90161c46f14aa6a96a50cbf..3403722b1688225ace20223fbadeda2872a78c1b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2038,16 +2038,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, dm_table_event_callback(t, event_callback, md); - /* - * The queue hasn't been stopped yet, if the old table type wasn't - * for request-based during suspension. So stop it to prevent - * I/O mapping before resume. - * This must be done before setting the queue restrictions, - * because request-based dm may be run just after the setting. - */ - if (request_based) - dm_stop_queue(q); - if (request_based) { /* * Leverage the fact that request-based DM targets are diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index e85b04e9716b21523155864eed7b331651da9178..4153e0d15c5f913711507fbaf35e50e2ae297173 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -350,9 +350,6 @@ static ssize_t dev_attribute_show(struct device *dev, * we still can use 'ubi->ubi_num'. 
*/ ubi = container_of(dev, struct ubi_device, dev); - ubi = ubi_get_device(ubi->ubi_num); - if (!ubi) - return -ENODEV; if (attr == &dev_eraseblock_size) ret = sprintf(buf, "%d\n", ubi->leb_size); @@ -381,7 +378,6 @@ static ssize_t dev_attribute_show(struct device *dev, else ret = -EINVAL; - ubi_put_device(ubi); return ret; } @@ -980,9 +976,6 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, goto out_detach; } - /* Make device "available" before it becomes accessible via sysfs */ - ubi_devices[ubi_num] = ubi; - err = uif_init(ubi); if (err) goto out_detach; @@ -1027,6 +1020,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, wake_up_process(ubi->bgt_thread); spin_unlock(&ubi->wl_lock); + ubi_devices[ubi_num] = ubi; ubi_notify_all(ubi, UBI_VOLUME_ADDED, NULL); return ubi_num; @@ -1035,7 +1029,6 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, out_uif: uif_close(ubi); out_detach: - ubi_devices[ubi_num] = NULL; ubi_wl_close(ubi); ubi_free_all_volumes(ubi); vfree(ubi->vtbl); diff --git a/drivers/mtd/ubi/fastmap.c b/drivers/mtd/ubi/fastmap.c index 022af59906aa9a6d2b9bfd59e375964c21b3e884..88fdf8f5709fa0d25f3de95e9cc119c41358ff45 100644 --- a/drivers/mtd/ubi/fastmap.c +++ b/drivers/mtd/ubi/fastmap.c @@ -468,7 +468,9 @@ static int scan_pool(struct ubi_device *ubi, struct ubi_attach_info *ai, if (err == UBI_IO_FF_BITFLIPS) scrub = 1; - add_aeb(ai, free, pnum, ec, scrub); + ret = add_aeb(ai, free, pnum, ec, scrub); + if (ret) + goto out; continue; } else if (err == 0 || err == UBI_IO_BITFLIPS) { dbg_bld("Found non empty PEB:%i in pool", pnum); @@ -638,8 +640,10 @@ static int ubi_attach_fastmap(struct ubi_device *ubi, if (fm_pos >= fm_size) goto fail_bad; - add_aeb(ai, &ai->free, be32_to_cpu(fmec->pnum), - be32_to_cpu(fmec->ec), 0); + ret = add_aeb(ai, &ai->free, be32_to_cpu(fmec->pnum), + be32_to_cpu(fmec->ec), 0); + if (ret) + goto fail; } /* read EC values from used list */ @@ -649,8 +653,10 @@ static int ubi_attach_fastmap(struct ubi_device *ubi, if (fm_pos >= fm_size) goto fail_bad; - add_aeb(ai, &used, be32_to_cpu(fmec->pnum), - be32_to_cpu(fmec->ec), 0); + ret = add_aeb(ai, &used, be32_to_cpu(fmec->pnum), + be32_to_cpu(fmec->ec), 0); + if (ret) + goto fail; } /* read EC values from scrub list */ @@ -660,8 +666,10 @@ static int ubi_attach_fastmap(struct ubi_device *ubi, if (fm_pos >= fm_size) goto fail_bad; - add_aeb(ai, &used, be32_to_cpu(fmec->pnum), - be32_to_cpu(fmec->ec), 1); + ret = add_aeb(ai, &used, be32_to_cpu(fmec->pnum), + be32_to_cpu(fmec->ec), 1); + if (ret) + goto fail; } /* read EC values from erase list */ @@ -671,8 +679,10 @@ static int ubi_attach_fastmap(struct ubi_device *ubi, if (fm_pos >= fm_size) goto fail_bad; - add_aeb(ai, &ai->erase, be32_to_cpu(fmec->pnum), - be32_to_cpu(fmec->ec), 1); + ret = add_aeb(ai, &ai->erase, be32_to_cpu(fmec->pnum), + be32_to_cpu(fmec->ec), 1); + if (ret) + goto fail; } ai->mean_ec = div_u64(ai->ec_sum, ai->ec_count); @@ -818,24 +828,6 @@ static int find_fm_anchor(struct ubi_attach_info *ai) return ret; } -static struct ubi_ainf_peb *clone_aeb(struct ubi_attach_info *ai, - struct ubi_ainf_peb *old) -{ - struct ubi_ainf_peb *new; - - new = ubi_alloc_aeb(ai, old->pnum, old->ec); - if (!new) - return NULL; - - new->vol_id = old->vol_id; - new->sqnum = old->sqnum; - new->lnum = old->lnum; - new->scrub = old->scrub; - new->copy_flag = old->copy_flag; - - return new; -} - /** * ubi_scan_fastmap - scan the fastmap. 
* @ubi: UBI device object @@ -865,15 +857,11 @@ int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai, if (fm_anchor < 0) return UBI_NO_FASTMAP; - /* Copy all (possible) fastmap blocks into our new attach structure. */ + /* Add fastmap blocks (pnum < UBI_FM_MAX_START) into attach structure. */ list_for_each_entry(aeb, &scan_ai->fastmap, u.list) { - struct ubi_ainf_peb *new; - - new = clone_aeb(ai, aeb); - if (!new) - return -ENOMEM; - - list_add(&new->u.list, &ai->fastmap); + ret = add_aeb(ai, &ai->fastmap, aeb->pnum, aeb->ec, 0); + if (ret) + return ret; } down_write(&ubi->fm_protect); @@ -1019,6 +1007,17 @@ int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai, "err: %i)", i, pnum, ret); goto free_hdr; } + + /* + * Add the remaining fastmap blocks (pnum >= UBI_FM_MAX_START) into + * the attach structure. + */ + if (pnum >= UBI_FM_MAX_START) { + ret = add_aeb(ai, &ai->fastmap, pnum, + be64_to_cpu(ech->ec), 0); + if (ret) + goto free_hdr; + } } kfree(fmsb); diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 139ee132bfbcfa432bb8b75f3df93858900c07f9..04d0e217ea1c246c5c0907d614b8abd0fff25658 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -56,16 +56,11 @@ static ssize_t vol_attribute_show(struct device *dev, { int ret; struct ubi_volume *vol = container_of(dev, struct ubi_volume, dev); - struct ubi_device *ubi; - - ubi = ubi_get_device(vol->ubi->ubi_num); - if (!ubi) - return -ENODEV; + struct ubi_device *ubi = vol->ubi; spin_lock(&ubi->volumes_lock); if (!ubi->volumes[vol->vol_id]) { spin_unlock(&ubi->volumes_lock); - ubi_put_device(ubi); return -ENODEV; } /* Take a reference to prevent volume removal */ @@ -103,7 +98,6 @@ static ssize_t vol_attribute_show(struct device *dev, vol->ref_count -= 1; ubi_assert(vol->ref_count >= 0); spin_unlock(&ubi->volumes_lock); - ubi_put_device(ubi); return ret; } @@ -415,6 +409,7 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) struct ubi_device *ubi = vol->ubi; struct ubi_vtbl_record vtbl_rec; struct ubi_eba_table *new_eba_tbl = NULL; + struct ubi_eba_table *old_eba_tbl = NULL; int vol_id = vol->vol_id; if (ubi->ro_mode) @@ -460,10 +455,13 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) err = -ENOSPC; goto out_free; } + ubi->avail_pebs -= pebs; ubi->rsvd_pebs += pebs; ubi_eba_copy_table(vol, new_eba_tbl, vol->reserved_pebs); - ubi_eba_replace_table(vol, new_eba_tbl); + old_eba_tbl = vol->eba_tbl; + vol->eba_tbl = new_eba_tbl; + vol->reserved_pebs = reserved_pebs; spin_unlock(&ubi->volumes_lock); } @@ -471,14 +469,16 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) for (i = 0; i < -pebs; i++) { err = ubi_eba_unmap_leb(ubi, vol, reserved_pebs + i); if (err) - goto out_acc; + goto out_free; } spin_lock(&ubi->volumes_lock); ubi->rsvd_pebs += pebs; ubi->avail_pebs -= pebs; ubi_update_reserved(ubi); ubi_eba_copy_table(vol, new_eba_tbl, reserved_pebs); - ubi_eba_replace_table(vol, new_eba_tbl); + old_eba_tbl = vol->eba_tbl; + vol->eba_tbl = new_eba_tbl; + vol->reserved_pebs = reserved_pebs; spin_unlock(&ubi->volumes_lock); } @@ -500,27 +500,32 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) if (err) goto out_acc; - vol->reserved_pebs = reserved_pebs; if (vol->vol_type == UBI_DYNAMIC_VOLUME) { vol->used_ebs = reserved_pebs; vol->last_eb_bytes = vol->usable_leb_size; vol->used_bytes = (long long)vol->used_ebs * vol->usable_leb_size; } - + /* destroy old table */ + ubi_eba_destroy_table(old_eba_tbl);
ubi_volume_notify(ubi, vol, UBI_VOLUME_RESIZED); self_check_volumes(ubi); return err; out_acc: + spin_lock(&ubi->volumes_lock); + vol->reserved_pebs = reserved_pebs - pebs; if (pebs > 0) { - spin_lock(&ubi->volumes_lock); ubi->rsvd_pebs -= pebs; ubi->avail_pebs += pebs; - spin_unlock(&ubi->volumes_lock); + ubi_eba_copy_table(vol, old_eba_tbl, vol->reserved_pebs); + } else { + ubi_eba_copy_table(vol, old_eba_tbl, reserved_pebs); } + vol->eba_tbl = old_eba_tbl; + spin_unlock(&ubi->volumes_lock); out_free: - kfree(new_eba_tbl); + ubi_eba_destroy_table(new_eba_tbl); return err; } diff --git a/drivers/net/bonding/bond_sysfs_slave.c b/drivers/net/bonding/bond_sysfs_slave.c index fd07561da0348abdaf7f7d9d06e061c4d5361db2..6a6cdd0bb2585bd3f77d2cb590da9910f89f0337 100644 --- a/drivers/net/bonding/bond_sysfs_slave.c +++ b/drivers/net/bonding/bond_sysfs_slave.c @@ -108,15 +108,15 @@ static ssize_t ad_partner_oper_port_state_show(struct slave *slave, char *buf) } static SLAVE_ATTR_RO(ad_partner_oper_port_state); -static const struct slave_attribute *slave_attrs[] = { - &slave_attr_state, - &slave_attr_mii_status, - &slave_attr_link_failure_count, - &slave_attr_perm_hwaddr, - &slave_attr_queue_id, - &slave_attr_ad_aggregator_id, - &slave_attr_ad_actor_oper_port_state, - &slave_attr_ad_partner_oper_port_state, +static const struct attribute *slave_attrs[] = { + &slave_attr_state.attr, + &slave_attr_mii_status.attr, + &slave_attr_link_failure_count.attr, + &slave_attr_perm_hwaddr.attr, + &slave_attr_queue_id.attr, + &slave_attr_ad_aggregator_id.attr, + &slave_attr_ad_actor_oper_port_state.attr, + &slave_attr_ad_partner_oper_port_state.attr, NULL }; @@ -137,24 +137,10 @@ const struct sysfs_ops slave_sysfs_ops = { int bond_sysfs_slave_add(struct slave *slave) { - const struct slave_attribute **a; - int err; - - for (a = slave_attrs; *a; ++a) { - err = sysfs_create_file(&slave->kobj, &((*a)->attr)); - if (err) { - kobject_put(&slave->kobj); - return err; - } - } - - return 0; + return sysfs_create_files(&slave->kobj, slave_attrs); } void bond_sysfs_slave_del(struct slave *slave) { - const struct slave_attribute **a; - - for (a = slave_attrs; *a; ++a) - sysfs_remove_file(&slave->kobj, &((*a)->attr)); + sysfs_remove_files(&slave->kobj, slave_attrs); } diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 5ee7cde0c2e97876564c95a2c2f48e970d1f193d..db7866b6f75259e7de9fd07c0490ccc69e211336 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -831,7 +831,12 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev, phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl; /* Restart the PHY */ - _phy_start_aneg(phydev); + if (phy_is_started(phydev)) { + phydev->state = PHY_UP; + phy_trigger_machine(phydev); + } else { + _phy_start_aneg(phydev); + } mutex_unlock(&phydev->lock); return 0; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 99b5152482fe420f26015676285cdcf2b8059e19..9ccf44592fe42ee75ac9645263b95201dc0a044b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -102,26 +102,6 @@ static void nvme_update_bdev_size(struct gendisk *disk) } } -/* - * Prepare a queue for teardown. - * - * This must forcibly unquiesce queues to avoid blocking dispatch, and only set - * the capacity to 0 after that to avoid blocking dispatchers that may be - * holding bd_butex. This will end buffered writers dirtying pages that can't - * be synced. 
- */ -static void nvme_set_queue_dying(struct nvme_ns *ns) -{ - if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) - return; - - blk_set_queue_dying(ns->queue); - blk_mq_unquiesce_queue(ns->queue); - - set_capacity(ns->disk, 0); - nvme_update_bdev_size(ns->disk); -} - static void nvme_queue_scan(struct nvme_ctrl *ctrl) { /* @@ -4540,6 +4520,38 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, } EXPORT_SYMBOL_GPL(nvme_init_ctrl); +static void nvme_start_ns_queue(struct nvme_ns *ns) +{ + if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags)) + blk_mq_unquiesce_queue(ns->queue); +} + +static void nvme_stop_ns_queue(struct nvme_ns *ns) +{ + if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags)) + blk_mq_quiesce_queue(ns->queue); +} + +/* + * Prepare a queue for teardown. + * + * This must forcibly unquiesce queues to avoid blocking dispatch, and only set + * the capacity to 0 after that to avoid blocking dispatchers that may be + * holding bd_butex. This will end buffered writers dirtying pages that can't + * be synced. + */ +static void nvme_set_queue_dying(struct nvme_ns *ns) +{ + if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + return; + + blk_set_queue_dying(ns->queue); + nvme_start_ns_queue(ns); + + set_capacity(ns->disk, 0); + nvme_update_bdev_size(ns->disk); +} + /** * nvme_kill_queues(): Ends all namespace queues * @ctrl: the dead controller that needs to end @@ -4555,7 +4567,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) /* Forcibly unquiesce queues to avoid blocking dispatch */ if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q)) - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); list_for_each_entry(ns, &ctrl->namespaces, list) nvme_set_queue_dying(ns); @@ -4618,7 +4630,7 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl) down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) - blk_mq_quiesce_queue(ns->queue); + nvme_stop_ns_queue(ns); up_read(&ctrl->namespaces_rwsem); } EXPORT_SYMBOL_GPL(nvme_stop_queues); @@ -4629,11 +4641,25 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) - blk_mq_unquiesce_queue(ns->queue); + nvme_start_ns_queue(ns); up_read(&ctrl->namespaces_rwsem); } EXPORT_SYMBOL_GPL(nvme_start_queues); +void nvme_stop_admin_queue(struct nvme_ctrl *ctrl) +{ + if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) + blk_mq_quiesce_queue(ctrl->admin_q); +} +EXPORT_SYMBOL_GPL(nvme_stop_admin_queue); + +void nvme_start_admin_queue(struct nvme_ctrl *ctrl) +{ + if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) + blk_mq_unquiesce_queue(ctrl->admin_q); +} +EXPORT_SYMBOL_GPL(nvme_start_admin_queue); + void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 906cab35afe7adc3ace9a500f77ab1cfd3bcadc0..b534a85e2bf1f60621e8134dd6f5b6f098fc8ed2 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2381,7 +2381,7 @@ nvme_fc_ctrl_free(struct kref *ref) list_del(&ctrl->ctrl_list); spin_unlock_irqrestore(&ctrl->rport->lock, flags); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); blk_cleanup_queue(ctrl->ctrl.admin_q); blk_cleanup_queue(ctrl->ctrl.fabrics_q); blk_mq_free_tag_set(&ctrl->admin_tag_set); @@ -2509,7 +2509,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) /* * clean up the admin queue. Same thing as above. 
*/ - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -3098,7 +3098,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments << (ilog2(SZ_4K) - 9); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); ret = nvme_init_identify(&ctrl->ctrl); if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) @@ -3245,7 +3245,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) nvme_fc_free_queue(&ctrl->queues[0]); /* re-enable the admin_q so anything new can fast fail */ - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); /* resume the io queues so that things will fast fail */ nvme_start_queues(&ctrl->ctrl); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5dd1dd8021ba1df7dac38ff690bd9a591ee4beea..ad46208eac765786278dab2fd6afe95ab878ecef 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -338,6 +338,8 @@ struct nvme_ctrl { u16 icdoff; u16 maxcmd; int nr_reconnects; + unsigned long flags; +#define NVME_CTRL_ADMIN_Q_STOPPED 0 struct nvmf_ctrl_options *opts; struct page *discard_page; @@ -449,6 +451,7 @@ struct nvme_ns { #define NVME_NS_REMOVING 0 #define NVME_NS_DEAD 1 #define NVME_NS_ANA_PENDING 2 +#define NVME_NS_STOPPED 3 struct nvme_fault_inject fault_inject; @@ -647,6 +650,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); +void nvme_stop_admin_queue(struct nvme_ctrl *ctrl); +void nvme_start_admin_queue(struct nvme_ctrl *ctrl); void nvme_kill_queues(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_sync_io_queues(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1b85349f57af0f278f10874c4149a4ff50254af5..224907d8d5dc34ce933541cfa56af5912119d428 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1408,7 +1408,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) nvmeq->dev->online_queues--; if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) - blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); + nvme_stop_admin_queue(&nvmeq->dev->ctrl); if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags)) pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); return 0; @@ -1640,7 +1640,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) * user requests may be waiting on a stopped queue. Start the * queue to flush these to completion. 
*/ - blk_mq_unquiesce_queue(dev->ctrl.admin_q); + nvme_start_admin_queue(&dev->ctrl); blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } @@ -1674,7 +1674,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) return -ENODEV; } } else - blk_mq_unquiesce_queue(dev->ctrl.admin_q); + nvme_start_admin_queue(&dev->ctrl); return 0; } @@ -2516,7 +2516,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) if (shutdown) { nvme_start_queues(&dev->ctrl); if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) - blk_mq_unquiesce_queue(dev->ctrl.admin_q); + nvme_start_admin_queue(&dev->ctrl); } mutex_unlock(&dev->shutdown_lock); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 51f4647ea21428819e1d659d440b89c496afaf1e..5e2c86adeb27a21e11656440ebb8671befdc3998 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -918,7 +918,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, else ctrl->ctrl.max_integrity_segments = 0; - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); error = nvme_init_identify(&ctrl->ctrl); if (error) @@ -927,7 +927,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, return 0; out_quiesce_queue: - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); out_stop_queue: nvme_rdma_stop_queue(&ctrl->queues[0]); @@ -1025,7 +1025,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); if (ctrl->ctrl.admin_tagset) { @@ -1034,7 +1034,7 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset); } if (remove) - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); nvme_rdma_destroy_admin_queue(ctrl, remove); } @@ -1161,7 +1161,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) nvme_rdma_destroy_io_queues(ctrl, new); } destroy_admin: - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); @@ -1201,7 +1201,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_rdma_teardown_io_queues(ctrl, false); nvme_start_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ @@ -2237,7 +2237,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&ctrl->reconnect_work); nvme_rdma_teardown_io_queues(ctrl, shutdown); - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); else diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index e99d43989418738738524c631b343b084cb9f8d7..c014c5adbac5ef0b4e57daeeb2e201b15f245135 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1889,7 +1889,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl 
*ctrl, bool new) if (error) goto out_stop_queue; - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); error = nvme_init_identify(ctrl); if (error) @@ -1898,7 +1898,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) return 0; out_quiesce_queue: - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); out_stop_queue: nvme_tcp_stop_queue(ctrl, 0); @@ -1920,7 +1920,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove) { - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); if (ctrl->admin_tagset) { @@ -1929,7 +1929,7 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); } if (remove) - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); nvme_tcp_destroy_admin_queue(ctrl, remove); } @@ -1938,7 +1938,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); nvme_start_freeze(ctrl); nvme_stop_queues(ctrl); nvme_sync_io_queues(ctrl); @@ -2030,7 +2030,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) nvme_tcp_destroy_io_queues(ctrl, new); } destroy_admin: - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); @@ -2073,7 +2073,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) /* unquiesce to fail fast pending requests */ nvme_start_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ @@ -2091,7 +2091,7 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); nvme_tcp_teardown_io_queues(ctrl, shutdown); - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); if (shutdown) nvme_shutdown_ctrl(ctrl); else diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index e17710e405a31f04c75c94952d2d48316a4658da..a05dc29a43f99fdf029031b15212153ba1fd0966 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -382,6 +382,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) error = PTR_ERR(ctrl->ctrl.admin_q); goto out_cleanup_fabrics_q; } + /* reset stopped state for the fresh admin queue */ + clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags); error = nvmf_connect_admin_queue(&ctrl->ctrl); if (error) @@ -396,7 +398,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); error = nvme_init_identify(&ctrl->ctrl); if (error) @@ -426,7 +428,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) nvme_loop_destroy_io_queues(ctrl); } - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); if (ctrl->ctrl.state == NVME_CTRL_LIVE) nvme_shutdown_ctrl(&ctrl->ctrl); diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c 
index 043b5f63b94a167b8641dfbc92699b23cd89a1b0..a6d27334a71d20cd49de05389dcced0341b4eca3 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1445,6 +1445,8 @@ static int set_machine_constraints(struct regulator_dev *rdev) if (rdev->constraints->always_on) rdev->use_count++; + } else if (rdev->desc->off_on_delay) { + rdev->last_off_jiffy = jiffies; } print_constraints(rdev); @@ -4058,6 +4060,10 @@ int regulator_set_voltage_time(struct regulator *regulator, for (i = 0; i < rdev->desc->n_voltages; i++) { /* We only look for exact voltage matches here */ + + if (old_sel >= 0 && new_sel >= 0) + break; + voltage = regulator_list_voltage(regulator, i); if (voltage < 0) return -EINVAL; diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index b6540b92f56610829e3cda18199755a70aaf3830..3fc7c2a31c1913d468a4eb7d83a4367a95d9ffe2 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1855,7 +1855,7 @@ static int resp_readcap16(struct scsi_cmnd *scp, { unsigned char *cmd = scp->cmnd; unsigned char arr[SDEBUG_READCAP16_ARR_SZ]; - int alloc_len; + u32 alloc_len; alloc_len = get_unaligned_be32(cmd + 10); /* following just in case virtual_gb changed */ @@ -1884,7 +1884,7 @@ static int resp_readcap16(struct scsi_cmnd *scp, } return fill_from_dev_buffer(scp, arr, - min_t(int, alloc_len, SDEBUG_READCAP16_ARR_SZ)); + min_t(u32, alloc_len, SDEBUG_READCAP16_ARR_SZ)); } #define SDEBUG_MAX_TGTPGS_ARR_SZ 1412 @@ -1895,8 +1895,9 @@ static int resp_report_tgtpgs(struct scsi_cmnd *scp, unsigned char *cmd = scp->cmnd; unsigned char *arr; int host_no = devip->sdbg_host->shost->host_no; - int n, ret, alen, rlen; int port_group_a, port_group_b, port_a, port_b; + u32 alen, n, rlen; + int ret; alen = get_unaligned_be32(cmd + 6); arr = kzalloc(SDEBUG_MAX_TGTPGS_ARR_SZ, GFP_ATOMIC); @@ -1958,9 +1959,9 @@ static int resp_report_tgtpgs(struct scsi_cmnd *scp, * - The constructed command length * - The maximum array size */ - rlen = min_t(int, alen, n); + rlen = min(alen, n); ret = fill_from_dev_buffer(scp, arr, - min_t(int, rlen, SDEBUG_MAX_TGTPGS_ARR_SZ)); + min_t(u32, rlen, SDEBUG_MAX_TGTPGS_ARR_SZ)); kfree(arr); return ret; } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d89db29fa829c066b501f0dadc64e1ca8870193e..9582aabf2ff1641f44d605459a20814f153b1123 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1193,9 +1193,9 @@ static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev, } cmd->cmd_len = scsi_req(req)->cmd_len; + cmd->cmnd = scsi_req(req)->cmd; if (cmd->cmd_len == 0) cmd->cmd_len = scsi_command_size(cmd->cmnd); - cmd->cmnd = scsi_req(req)->cmd; cmd->transfersize = blk_rq_bytes(req); cmd->allowed = scsi_req(req)->retries; return BLK_STS_OK; @@ -2618,6 +2618,48 @@ scsi_target_resume(struct scsi_target *starget) } EXPORT_SYMBOL(scsi_target_resume); +static int __scsi_internal_device_block_nowait(struct scsi_device *sdev) +{ + if (scsi_device_set_state(sdev, SDEV_BLOCK)) + return scsi_device_set_state(sdev, SDEV_CREATED_BLOCK); + + return 0; +} + +static DEFINE_SPINLOCK(sdev_queue_stop_lock); + +void scsi_start_queue(struct scsi_device *sdev) +{ + bool need_start; + unsigned long flags; + + spin_lock_irqsave(&sdev_queue_stop_lock, flags); + need_start = sdev->queue_stopped; + sdev->queue_stopped = 0; + spin_unlock_irqrestore(&sdev_queue_stop_lock, flags); + + if (need_start) + blk_mq_unquiesce_queue(sdev->request_queue); +} + +static void scsi_stop_queue(struct scsi_device *sdev, bool nowait) +{ + bool need_stop; + 
unsigned long flags; + + spin_lock_irqsave(&sdev_queue_stop_lock, flags); + need_stop = !sdev->queue_stopped; + sdev->queue_stopped = 1; + spin_unlock_irqrestore(&sdev_queue_stop_lock, flags); + + if (need_stop) { + if (nowait) + blk_mq_quiesce_queue_nowait(sdev->request_queue); + else + blk_mq_quiesce_queue(sdev->request_queue); + } +} + /** * scsi_internal_device_block_nowait - try to transition to the SDEV_BLOCK state * @sdev: device to block @@ -2634,24 +2676,16 @@ EXPORT_SYMBOL(scsi_target_resume); */ int scsi_internal_device_block_nowait(struct scsi_device *sdev) { - struct request_queue *q = sdev->request_queue; - int err = 0; - - err = scsi_device_set_state(sdev, SDEV_BLOCK); - if (err) { - err = scsi_device_set_state(sdev, SDEV_CREATED_BLOCK); - - if (err) - return err; - } + int ret = __scsi_internal_device_block_nowait(sdev); /* * The device has transitioned to SDEV_BLOCK. Stop the * block layer from calling the midlayer with this device's * request queue. */ - blk_mq_quiesce_queue_nowait(q); - return 0; + if (!ret) + scsi_stop_queue(sdev, true); + return ret; } EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait); @@ -2672,25 +2706,17 @@ EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait); */ static int scsi_internal_device_block(struct scsi_device *sdev) { - struct request_queue *q = sdev->request_queue; int err; mutex_lock(&sdev->state_mutex); - err = scsi_internal_device_block_nowait(sdev); + err = __scsi_internal_device_block_nowait(sdev); if (err == 0) - blk_mq_quiesce_queue(q); + scsi_stop_queue(sdev, false); mutex_unlock(&sdev->state_mutex); return err; } -void scsi_start_queue(struct scsi_device *sdev) -{ - struct request_queue *q = sdev->request_queue; - - blk_mq_unquiesce_queue(q); -} - /** * scsi_internal_device_unblock_nowait - resume a device after a block request * @sdev: device to resume diff --git a/fs/coredump.c b/fs/coredump.c index c56a3bdce7cd4552168f6d9ac52b2cdb39221ed3..335c98787e668640f075728e33f104117c5f6dbf 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -755,8 +755,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) task_lock(&init_task); get_fs_root(init_task.fs, &root); task_unlock(&init_task); - cprm.file = file_open_root(root.dentry, root.mnt, - cn.corename, open_flags, 0600); + cprm.file = file_open_root(&root, cn.corename, + open_flags, 0600); path_put(&root); } else { cprm.file = filp_open(cn.corename, open_flags, 0600); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c859ec4544bcb651d0545ed5a481842ec0458a3b..bb3adca53d9375aad51060a5691d24cde2190fb9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3437,9 +3437,6 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping, unsigned flags, struct page **pagep, void **fsdata); -extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index aa4d74f9d1623a364d4ef5a551f8cf562e3c30de..398df993ccd88a63f18dbfd9073d18198c974c73 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -136,14 +136,24 @@ int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, static int ext4_ext_get_access(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { + int err = 0; + if (path->p_bh) { /* path points to block */ BUFFER_TRACE(path->p_bh, "get_write_access"); - return ext4_journal_get_write_access(handle, 
path->p_bh);
+		err = ext4_journal_get_write_access(handle, path->p_bh);
+		/*
+		 * The extent buffer's verified bit will be set again in
+		 * __ext4_ext_dirty(). We could be left with an inconsistent
+		 * buffer if the extent update procedure breaks off due to
+		 * an error, so force the buffer to be checked again.
+		 */
+		if (!err)
+			clear_buffer_verified(path->p_bh);
 	}
 	/* path points to leaf/index in inode body */
 	/* we use in-core data, no need to protect them */
-	return 0;
+	return err;
 }
 
 /*
@@ -164,6 +174,9 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
 		/* path points to block */
 		err = __ext4_handle_dirty_metadata(where, line, handle,
 						   inode, path->p_bh);
+		/* Extent update done, set the verified flag again */
+		if (!err)
+			set_buffer_verified(path->p_bh);
 	} else {
 		/* path points to leaf/index in inode body */
 		err = ext4_mark_inode_dirty(handle, inode);
@@ -353,9 +366,13 @@ static int ext4_valid_extent_idx(struct inode *inode,
 
 static int ext4_valid_extent_entries(struct inode *inode,
 				     struct ext4_extent_header *eh,
-				     ext4_fsblk_t *pblk, int depth)
+				     ext4_lblk_t lblk, ext4_fsblk_t *pblk,
+				     int depth)
 {
 	unsigned short entries;
+	ext4_lblk_t lblock = 0;
+	ext4_lblk_t prev = 0;
+
 	if (eh->eh_entries == 0)
 		return 1;
 
@@ -364,31 +381,51 @@ static int ext4_valid_extent_entries(struct inode *inode,
 	if (depth == 0) {
 		/* leaf entries */
 		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
-		ext4_lblk_t lblock = 0;
-		ext4_lblk_t prev = 0;
-		int len = 0;
+
+		/*
+		 * The logical block in the first entry should equal the
+		 * number in the index block.
+		 */
+		if (depth != ext_depth(inode) &&
+		    lblk != le32_to_cpu(ext->ee_block))
+			return 0;
 		while (entries) {
 			if (!ext4_valid_extent(inode, ext))
 				return 0;
 
 			/* Check for overlapping extents */
 			lblock = le32_to_cpu(ext->ee_block);
-			len = ext4_ext_get_actual_len(ext);
 			if ((lblock <= prev) && prev) {
 				*pblk = ext4_ext_pblock(ext);
 				return 0;
 			}
+			prev = lblock + ext4_ext_get_actual_len(ext) - 1;
 			ext++;
 			entries--;
-			prev = lblock + len - 1;
 		}
 	} else {
 		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
+
+		/*
+		 * The logical block in the first entry should equal the
+		 * number in the parent index block.
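+		 * A mismatch means the linkage between the parent index
+		 * block and this block is corrupted.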
+ */ + if (depth != ext_depth(inode) && + lblk != le32_to_cpu(ext_idx->ei_block)) + return 0; while (entries) { if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; + + /* Check for overlapping index extents */ + lblock = le32_to_cpu(ext_idx->ei_block); + if ((lblock <= prev) && prev) { + *pblk = ext4_idx_pblock(ext_idx); + return 0; + } ext_idx++; entries--; + prev = lblock; } } return 1; @@ -396,7 +433,7 @@ static int ext4_valid_extent_entries(struct inode *inode, static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, - int depth, ext4_fsblk_t pblk) + int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk) { const char *error_msg; int max = 0, err = -EFSCORRUPTED; @@ -422,7 +459,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid eh_entries"; goto corrupted; } - if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) { + if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; } @@ -452,7 +489,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, } #define ext4_ext_check(inode, eh, depth, pblk) \ - __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk)) + __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0) int ext4_ext_check_inode(struct inode *inode) { @@ -485,16 +522,18 @@ static void ext4_cache_extents(struct inode *inode, static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, - struct inode *inode, ext4_fsblk_t pblk, int depth, - int flags) + struct inode *inode, struct ext4_extent_idx *idx, + int depth, int flags) { struct buffer_head *bh; int err; gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS; + ext4_fsblk_t pblk; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; + pblk = ext4_idx_pblock(idx); bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); @@ -505,13 +544,24 @@ __read_extent_tree_block(const char *function, unsigned int line, if (err < 0) goto errout; } - if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) - return bh; - err = __ext4_ext_check(function, line, inode, - ext_block_hdr(bh), depth, pblk); - if (err) - goto errout; - set_buffer_verified(bh); + if (buffer_verified(bh)) { + if (unlikely(ext_block_hdr(bh)->eh_magic != EXT4_EXT_MAGIC)) { + err = -EFSCORRUPTED; + ext4_error_inode(inode, function, line, 0, + "invalid magic for verified extent block %llu", + (unsigned long long)bh->b_blocknr); + goto errout; + } + + if (!(flags & EXT4_EX_FORCE_CACHE)) + return bh; + } else { + err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), + depth, pblk, le32_to_cpu(idx->ei_block)); + if (err) + goto errout; + set_buffer_verified(bh); + } /* * If this is a leaf block, cache all of its entries */ @@ -526,8 +576,8 @@ __read_extent_tree_block(const char *function, unsigned int line, } -#define read_extent_tree_block(inode, pblk, depth, flags) \ - __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ +#define read_extent_tree_block(inode, idx, depth, flags) \ + __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \ (depth), (flags)) /* @@ -577,8 +627,7 @@ int ext4_ext_precache(struct inode *inode) i--; continue; } - bh = read_extent_tree_block(inode, - ext4_idx_pblock(path[i].p_idx++), + bh = read_extent_tree_block(inode, path[i].p_idx++, depth - i - 1, EXT4_EX_FORCE_CACHE); if (IS_ERR(bh)) { @@ -883,8 +932,7 @@ ext4_find_extent(struct inode *inode, 
ext4_lblk_t block, path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = read_extent_tree_block(inode, path[ppos].p_block, --i, - flags); + bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags); if (IS_ERR(bh)) { ret = PTR_ERR(bh); goto err; @@ -1489,7 +1537,6 @@ static int ext4_ext_search_right(struct inode *inode, struct ext4_extent_header *eh; struct ext4_extent_idx *ix; struct ext4_extent *ex; - ext4_fsblk_t block; int depth; /* Note, NOT eh_depth; depth from top of tree */ int ee_len; @@ -1556,20 +1603,17 @@ static int ext4_ext_search_right(struct inode *inode, * follow it and find the closest allocated * block to the right */ ix++; - block = ext4_idx_pblock(ix); while (++depth < path->p_depth) { /* subtract from p_depth to get proper eh_depth */ - bh = read_extent_tree_block(inode, block, - path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); ix = EXT_FIRST_INDEX(eh); - block = ext4_idx_pblock(ix); put_bh(bh); } - bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -2948,9 +2992,9 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext_debug(inode, "move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); - bh = read_extent_tree_block(inode, - ext4_idx_pblock(path[i].p_idx), depth - i - 1, - EXT4_EX_NOCACHE); + bh = read_extent_tree_block(inode, path[i].p_idx, + depth - i - 1, + EXT4_EX_NOCACHE); if (IS_ERR(bh)) { /* should we reset i_size? */ err = PTR_ERR(bh); @@ -3568,7 +3612,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_len - ee_block); err = ext4_ext_zeroout(inode, &zero_ex1); if (err) - goto out; + goto fallback; split_map.m_len = allocated; } if (split_map.m_lblk - ee_block + split_map.m_len < @@ -3582,7 +3626,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_pblock(ex)); err = ext4_ext_zeroout(inode, &zero_ex2); if (err) - goto out; + goto fallback; } split_map.m_len += split_map.m_lblk - ee_block; @@ -3591,6 +3635,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } } +fallback: err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, flags); if (err > 0) @@ -4970,36 +5015,6 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo); } -/* - * ext4_access_path: - * Function to access the path buffer for marking it dirty. - * It also checks if there are sufficient credits left in the journal handle - * to update path. 
- */ -static int -ext4_access_path(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) -{ - int credits, err; - - if (!ext4_handle_valid(handle)) - return 0; - - /* - * Check if need to extend journal credits - * 3 for leaf, sb, and inode plus 2 (bmap and group - * descriptor) for each block group; assume two block - * groups - */ - credits = ext4_writepage_trans_blocks(inode); - err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0); - if (err < 0) - return err; - - err = ext4_ext_get_access(handle, inode, path); - return err; -} - /* * ext4_ext_shift_path_extents: * Shift the extents of a path structure lying between path[depth].p_ext @@ -5014,6 +5029,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, int depth, err = 0; struct ext4_extent *ex_start, *ex_last; bool update = false; + int credits, restart_credits; depth = path->p_depth; while (depth >= 0) { @@ -5023,13 +5039,26 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, return -EFSCORRUPTED; ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); + /* leaf + sb + inode */ + credits = 3; + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) { + update = true; + /* extent tree + sb + inode */ + credits = depth + 2; + } - err = ext4_access_path(handle, inode, path + depth); - if (err) + restart_credits = ext4_writepage_trans_blocks(inode); + err = ext4_datasem_ensure_credits(handle, inode, credits, + restart_credits, 0); + if (err) { + if (err > 0) + err = -EAGAIN; goto out; + } - if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) - update = true; + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; while (ex_start <= ex_last) { if (SHIFT == SHIFT_LEFT) { @@ -5060,7 +5089,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, } /* Update index too */ - err = ext4_access_path(handle, inode, path + depth); + err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; @@ -5099,6 +5128,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, int ret = 0, depth; struct ext4_extent *extent; ext4_lblk_t stop, *iterator, ex_start, ex_end; + ext4_lblk_t tmp = EXT_MAX_BLOCKS; /* Let path point to the last extent */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, @@ -5152,11 +5182,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * till we reach stop. In case of right shift, iterator points to stop * and it is decreased till we reach start. */ +again: if (SHIFT == SHIFT_LEFT) iterator = &start; else iterator = &stop; + if (tmp != EXT_MAX_BLOCKS) + *iterator = tmp; + /* * Its safe to start updating extents. 
Start and stop are unsigned, so * in case of right shift if extent with 0 block is reached, iterator @@ -5185,6 +5219,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, } } + tmp = *iterator; if (SHIFT == SHIFT_LEFT) { extent = EXT_LAST_EXTENT(path[depth].p_hdr); *iterator = le32_to_cpu(extent->ee_block) + @@ -5203,6 +5238,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, } ret = ext4_ext_shift_path_extents(path, shift, inode, handle, SHIFT); + /* iterator can be NULL which means we should break */ + if (ret == -EAGAIN) + goto again; if (ret) break; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a96b688a0410fbf97579b37deb4c8af07e8be685..cb42b2245c21cd1736e9378fa184d5b025f31f51 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -729,40 +729,83 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { - int ret, no_expand; + handle_t *handle = ext4_journal_current_handle(); + int no_expand; void *kaddr; struct ext4_iloc iloc; + int ret = 0, ret2; if (unlikely(copied < len) && !PageUptodate(page)) - return 0; + copied = 0; - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) { - ext4_std_error(inode->i_sb, ret); - return ret; - } + if (likely(copied)) { + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + unlock_page(page); + put_page(page); + ext4_std_error(inode->i_sb, ret); + goto out; + } + ext4_write_lock_xattr(inode, &no_expand); + BUG_ON(!ext4_has_inline_data(inode)); - ext4_write_lock_xattr(inode, &no_expand); - BUG_ON(!ext4_has_inline_data(inode)); + /* + * ei->i_inline_off may have changed since + * ext4_write_begin() called + * ext4_try_to_write_inline_data() + */ + (void) ext4_find_inline_data_nolock(inode); - /* - * ei->i_inline_off may have changed since ext4_write_begin() - * called ext4_try_to_write_inline_data() - */ - (void) ext4_find_inline_data_nolock(inode); + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); + kunmap_atomic(kaddr); + SetPageUptodate(page); + /* clear page dirty so that writepages wouldn't work for us. */ + ClearPageDirty(page); - kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); - kunmap_atomic(kaddr); - SetPageUptodate(page); - /* clear page dirty so that writepages wouldn't work for us. */ - ClearPageDirty(page); + ext4_write_unlock_xattr(inode, &no_expand); + brelse(iloc.bh); - ext4_write_unlock_xattr(inode, &no_expand); - brelse(iloc.bh); - mark_inode_dirty(inode); + /* + * It's important to update i_size while still holding page + * lock: page writeout could otherwise come in and zero + * beyond i_size. + */ + ext4_update_inode_size(inode, pos + copied); + } + unlock_page(page); + put_page(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (likely(copied)) + mark_inode_dirty(inode); +out: + /* + * If we didn't copy as much data as expected, we need to trim back + * size of xattr containing inline data. 
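+	 * The ext4_orphan_add() call below ensures the inode gets
+	 * truncated back during orphan cleanup if we crash first.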
+ */ + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + ext4_orphan_add(handle, inode); - return copied; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret ? ret : copied; } struct buffer_head * @@ -943,43 +986,6 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, return ret; } -int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page) -{ - int ret; - - ret = ext4_write_inline_data_end(inode, pos, len, copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - return ret; - } - copied = ret; - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - * - * But it's important to update i_size while still holding page lock: - * page writeout could otherwise come in and zero beyond i_size. - */ - if (pos+copied > inode->i_size) - i_size_write(inode, pos+copied); - unlock_page(page); - put_page(page); - - /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling - * filesystems. - */ - mark_inode_dirty(inode); - - return copied; -} - #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 05ab7b65ea5253bbab7cb1cc1e3067283064d2ad..ef102cca42dc5e358d698534c68eb6cf1a00c900 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1283,23 +1283,14 @@ static int ext4_write_end(struct file *file, loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; - int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); - if (inline_data) { - ret = ext4_write_inline_data_end(inode, pos, len, - copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - goto errout; - } - copied = ret; - ret = 0; - } else - copied = block_write_end(file, mapping, pos, - len, copied, page, fsdata); + + if (ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1320,10 +1311,9 @@ static int ext4_write_end(struct file *file, * ordering of page lock and transaction start for journaling * filesystems. */ - if (i_size_changed || inline_data) + if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); -errout: if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. 
We will have blocks allocated outside @@ -1395,7 +1385,6 @@ static int ext4_journalled_write_end(struct file *file, int partial = 0; unsigned from, to; int size_changed = 0; - int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); @@ -1404,17 +1393,10 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (inline_data) { - ret = ext4_write_inline_data_end(inode, pos, len, - copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - goto errout; - } - copied = ret; - ret = 0; - } else if (unlikely(copied < len) && !PageUptodate(page)) { + if (ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + + if (unlikely(copied < len) && !PageUptodate(page)) { copied = 0; ext4_journalled_zero_new_buffers(handle, page, from, to); } else { @@ -1437,13 +1419,12 @@ static int ext4_journalled_write_end(struct file *file, if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); - if (size_changed || inline_data) { + if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } -errout: if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside @@ -1956,6 +1937,20 @@ static int __ext4_journalled_writepage(struct page *page, return ret; } +static void cancel_page_dirty_status(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + unsigned long flags; + + cancel_dirty_page(page); + xa_lock_irqsave(&mapping->i_pages, flags); + __xa_clear_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); + __xa_clear_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_TOWRITE); + xa_unlock_irqrestore(&mapping->i_pages, flags); +} + /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). 
We even don't @@ -2014,6 +2009,12 @@ static int ext4_writepage(struct page *page, return -EIO; } + if (WARN_ON(!page_has_buffers(page))) { + cancel_page_dirty_status(page); + unlock_page(page); + return 0; + } + trace_ext4_writepage(page); size = i_size_read(inode); if (page->index == size >> PAGE_SHIFT && @@ -2622,6 +2623,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) continue; } + if (WARN_ON(!page_has_buffers(page))) { + cancel_page_dirty_status(page); + unlock_page(page); + continue; + } + wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); @@ -2935,19 +2942,6 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } -/* We always reserve for an inode update; the superblock could be there too */ -static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) -{ - if (likely(ext4_has_feature_large_file(inode->i_sb))) - return 1; - - if (pos + len <= 0x7fffffffULL) - return 1; - - /* We might need to update the superblock to set LARGE_FILE */ - return 2; -} - static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -2956,7 +2950,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct page *page; pgoff_t index; struct inode *inode = mapping->host; - handle_t *handle; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; @@ -2982,41 +2975,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, return 0; } - /* - * grab_cache_page_write_begin() can take a long time if the - * system is thrashing due to memory pressure, or if the page - * is being written back. So grab it first before we start - * the transaction handle. This also allows us to allocate - * the page (if needed) without using GFP_NOFS. - */ -retry_grab: +retry: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; - unlock_page(page); - /* - * With delayed allocation, we don't log the i_disksize update - * if there is delayed block allocation. But we still need - * to journalling the i_disksize update if writes to the end - * of file which has an already mapped buffer. - */ -retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_da_write_credits(inode, pos, len)); - if (IS_ERR(handle)) { - put_page(page); - return PTR_ERR(handle); - } - - lock_page(page); - if (page->mapping != mapping) { - /* The page got truncated from under us */ - unlock_page(page); - put_page(page); - ext4_journal_stop(handle); - goto retry_grab; - } /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); @@ -3028,20 +2991,18 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, #endif if (ret < 0) { unlock_page(page); - ext4_journal_stop(handle); + put_page(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. + * i_size_read because we hold inode lock. 
*/ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry_journal; - - put_page(page); + goto retry; return ret; } @@ -3078,8 +3039,6 @@ static int ext4_da_write_end(struct file *file, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - int ret = 0, ret2; - handle_t *handle = ext4_journal_current_handle(); loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; @@ -3089,6 +3048,12 @@ static int ext4_da_write_end(struct file *file, len, copied, page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); + + if (write_mode != CONVERT_INLINE_DATA && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && + ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + start = pos & (PAGE_SIZE - 1); end = start + copied - 1; @@ -3108,27 +3073,11 @@ static int ext4_da_write_end(struct file *file, * ext4_da_write_inline_data_end(). */ new_i_size = pos + copied; - if (copied && new_i_size > inode->i_size) { - if (ext4_has_inline_data(inode) || - ext4_da_should_update_i_disksize(page, end)) - ext4_update_i_disksize(inode, new_i_size); - } - - if (write_mode != CONVERT_INLINE_DATA && - ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && - ext4_has_inline_data(inode)) - ret = ext4_da_write_inline_data_end(inode, pos, len, copied, - page); - else - ret = generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - - copied = ret; - ret2 = ext4_journal_stop(handle); - if (unlikely(ret2 && !ret)) - ret = ret2; + if (copied && new_i_size > inode->i_size && + ext4_da_should_update_i_disksize(page, end)) + ext4_update_i_disksize(inode, new_i_size); - return ret ? ret : copied; + return generic_write_end(file, mapping, pos, len, copied, page, fsdata); } /* @@ -4303,14 +4252,161 @@ int ext4_truncate(struct inode *inode) return err; } +static inline u64 ext4_inode_peek_iversion(const struct inode *inode) +{ + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + return inode_peek_iversion_raw(inode); + else + return inode_peek_iversion(inode); +} + +static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + struct inode *inode = &(ei->vfs_inode); + u64 i_blocks = READ_ONCE(inode->i_blocks); + struct super_block *sb = inode->i_sb; + + if (i_blocks <= ~0U) { + /* + * i_blocks can be represented in a 32 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = 0; + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + return 0; + } + + /* + * This should never happen since sb->s_maxbytes should not have + * allowed this, sb->s_maxbytes was set according to the huge_file + * feature in ext4_fill_super(). 
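+	 * Treat it as on-disk corruption rather than writing out an
+	 * i_blocks value that the inode format cannot represent.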
+ */ + if (!ext4_has_feature_huge_file(sb)) + return -EFSCORRUPTED; + + if (i_blocks <= 0xffffffffffffULL) { + /* + * i_blocks can be represented in a 48 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + } else { + ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); + /* i_block is stored in file system block size */ + i_blocks = i_blocks >> (inode->i_blkbits - 9); + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + } + return 0; +} + +static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + uid_t i_uid; + gid_t i_gid; + projid_t i_projid; + int block; + int err; + + err = ext4_inode_blocks_set(raw_inode, ei); + + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); + i_projid = from_kprojid(&init_user_ns, ei->i_projid); + if (!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); + /* + * Fix up interoperability with old kernels. Otherwise, + * old inodes get re-used with the upper 16 bits of the + * uid/gid intact. + */ + if (ei->i_dtime && list_empty(&ei->i_orphan)) { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } else { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(i_gid)); + } + } else { + raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + + EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); + + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); + ext4_isize_set(raw_inode, ei->i_disksize); + + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else if (!ext4_has_inline_data(inode)) { + for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + } + + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + u64 ivers = ext4_inode_peek_iversion(inode); + + raw_inode->i_disk_version = cpu_to_le32(ivers); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(ivers >> 32); + raw_inode->i_extra_isize = + cpu_to_le16(ei->i_extra_isize); + } + } + + if (i_projid != EXT4_DEF_PROJID && + !ext4_has_feature_project(inode->i_sb)) + err = err ?: -EFSCORRUPTED; + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, 
i_projid)) + raw_inode->i_projid = cpu_to_le32(i_projid); + + ext4_inode_csum_set(inode, raw_inode, ei); + return err; +} + /* * ext4_get_inode_loc returns with an extra refcount against the inode's - * underlying buffer_head on success. If 'in_mem' is true, we have all - * data in memory that is needed to recreate the on-disk version of this - * inode. + * underlying buffer_head on success. If we pass 'inode' and it does not + * have in-inode xattr, we have all inode data in memory that is needed + * to recreate the on-disk version of this inode. */ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, - struct ext4_iloc *iloc, int in_mem, + struct inode *inode, struct ext4_iloc *iloc, ext4_fsblk_t *ret_block) { struct ext4_group_desc *gdp; @@ -4341,8 +4437,6 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; - if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)) - goto simulate_eio; if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -4357,7 +4451,7 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, * is the only valid inode in the block, we need not read the * block. */ - if (in_mem) { + if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct buffer_head *bitmap_bh; int i, start; @@ -4385,8 +4479,13 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, } brelse(bitmap_bh); if (i == start + inodes_per_block) { + struct ext4_inode *raw_inode = + (struct ext4_inode *) (bh->b_data + iloc->offset); + /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); + if (!ext4_test_inode_state(inode, EXT4_STATE_NEW)) + ext4_fill_raw_inode(inode, raw_inode); set_buffer_uptodate(bh); unlock_buffer(bh); goto has_buffer; @@ -4429,8 +4528,8 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); blk_finish_plug(&plug); wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { - simulate_eio: if (ret_block) *ret_block = block; brelse(bh); @@ -4448,7 +4547,7 @@ static int __ext4_get_inode_loc_noinmem(struct inode *inode, ext4_fsblk_t err_blk; int ret; - ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0, + ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, &err_blk); if (ret == -EIO) @@ -4463,9 +4562,8 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) ext4_fsblk_t err_blk; int ret; - /* We have all inode data except xattrs in memory here. 
*/ - ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, - !ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk); + ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, + &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, @@ -4478,7 +4576,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { - return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL); + return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL); } static bool ext4_should_enable_dax(struct inode *inode) @@ -4599,13 +4697,6 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) else inode_set_iversion_queried(inode, val); } -static inline u64 ext4_inode_peek_iversion(const struct inode *inode) -{ - if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return inode_peek_iversion_raw(inode); - else - return inode_peek_iversion(inode); -} struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, @@ -4921,45 +5012,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, return ERR_PTR(ret); } -static int ext4_inode_blocks_set(handle_t *handle, - struct ext4_inode *raw_inode, - struct ext4_inode_info *ei) -{ - struct inode *inode = &(ei->vfs_inode); - u64 i_blocks = READ_ONCE(inode->i_blocks); - struct super_block *sb = inode->i_sb; - - if (i_blocks <= ~0U) { - /* - * i_blocks can be represented in a 32 bit variable - * as multiple of 512 bytes - */ - raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); - raw_inode->i_blocks_high = 0; - ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); - return 0; - } - if (!ext4_has_feature_huge_file(sb)) - return -EFBIG; - - if (i_blocks <= 0xffffffffffffULL) { - /* - * i_blocks can be represented in a 48 bit variable - * as multiple of 512 bytes - */ - raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); - raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); - ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); - } else { - ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); - /* i_block is stored in file system block size */ - i_blocks = i_blocks >> (inode->i_blkbits - 9); - raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); - raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); - } - return 0; -} - static void __ext4_update_other_inode_time(struct super_block *sb, unsigned long orig_ino, unsigned long ino, @@ -5039,113 +5091,33 @@ static int ext4_do_update_inode(handle_t *handle, struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; struct super_block *sb = inode->i_sb; - int err = 0, block; + int err; int need_datasync = 0, set_large_file = 0; - uid_t i_uid; - gid_t i_gid; - projid_t i_projid; spin_lock(&ei->i_raw_lock); - /* For fields not tracked in the in-memory inode, - * initialise them to zero for new inodes. */ + /* + * For fields not tracked in the in-memory inode, initialise them + * to zero for new inodes. 
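+	 * (the on-disk buffer may still hold stale data from a
+	 * previously freed inode)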
+ */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); - err = ext4_inode_blocks_set(handle, raw_inode, ei); - if (err) { - spin_unlock(&ei->i_raw_lock); - goto out_brelse; - } - - raw_inode->i_mode = cpu_to_le16(inode->i_mode); - i_uid = i_uid_read(inode); - i_gid = i_gid_read(inode); - i_projid = from_kprojid(&init_user_ns, ei->i_projid); - if (!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); -/* - * Fix up interoperability with old kernels. Otherwise, old inodes get - * re-used with the upper 16 bits of the uid/gid intact - */ - if (ei->i_dtime && list_empty(&ei->i_orphan)) { - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; - } else { - raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(i_uid)); - raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(i_gid)); - } - } else { - raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); - raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; - } - raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - - EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); - EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - - raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); - if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) - raw_inode->i_file_acl_high = - cpu_to_le16(ei->i_file_acl >> 32); - raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) { - ext4_isize_set(raw_inode, ei->i_disksize); + if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) need_datasync = 1; - } if (ei->i_disksize > 0x7fffffffULL) { if (!ext4_has_feature_large_file(sb) || - EXT4_SB(sb)->s_es->s_rev_level == - cpu_to_le32(EXT4_GOOD_OLD_REV)) + EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) set_large_file = 1; } - raw_inode->i_generation = cpu_to_le32(inode->i_generation); - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (old_valid_dev(inode->i_rdev)) { - raw_inode->i_block[0] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - raw_inode->i_block[1] = 0; - } else { - raw_inode->i_block[0] = 0; - raw_inode->i_block[1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - raw_inode->i_block[2] = 0; - } - } else if (!ext4_has_inline_data(inode)) { - for (block = 0; block < EXT4_N_BLOCKS; block++) - raw_inode->i_block[block] = ei->i_data[block]; - } - if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { - u64 ivers = ext4_inode_peek_iversion(inode); - - raw_inode->i_disk_version = cpu_to_le32(ivers); - if (ei->i_extra_isize) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - raw_inode->i_version_hi = - cpu_to_le32(ivers >> 32); - raw_inode->i_extra_isize = - cpu_to_le16(ei->i_extra_isize); - } + err = ext4_fill_raw_inode(inode, raw_inode); + spin_unlock(&ei->i_raw_lock); + if (err) { + EXT4_ERROR_INODE(inode, "corrupted inode contents"); + goto out_brelse; } - BUG_ON(!ext4_has_feature_project(inode->i_sb) && - i_projid != EXT4_DEF_PROJID); - - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && - EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) - raw_inode->i_projid = cpu_to_le32(i_projid); - - ext4_inode_csum_set(inode, raw_inode, ei); - spin_unlock(&ei->i_raw_lock); if 
(inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); @@ -5153,13 +5125,13 @@ static int ext4_do_update_inode(handle_t *handle, BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) - goto out_brelse; + goto out_error; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (err) - goto out_brelse; + goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); @@ -5169,9 +5141,10 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); +out_error: + ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); - ext4_std_error(inode->i_sb, err); return err; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index bc364c119af6ac00bcfedcec131af0178ecfa428..cebea4270817e7c0218cc2f3034449ce0d0ef342 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -138,7 +138,7 @@ static int kmmpd(void *data) unsigned mmp_check_interval; unsigned long last_update_time; unsigned long diff; - int retval; + int retval = 0; mmp_block = le64_to_cpu(es->s_mmp_block); mmp = (struct mmp_struct *)(bh->b_data); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dd0846321c2f2890ff2de2da95c19fa7daeb65d5..68aa31dc30dd355b0a13024c5a6d90d48bde4970 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -661,7 +661,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. */ - if (continue_fs) + if (continue_fs && journal) schedule_work(&EXT4_SB(sb)->s_error_work); else ext4_commit_super(sb); @@ -5001,6 +5001,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } + /* + * Update the checksum after updating free space/inode + * counters. Otherwise the superblock can have an incorrect + * checksum in the buffer cache until it is written out and + * e2fsprogs programs trying to open a file system immediately + * after it is mounted can fail. + */ + ext4_superblock_csum_set(sb); if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sb), GFP_KERNEL); @@ -5143,12 +5151,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { + /* flush s_error_work before journal destroy. 
*/ + flush_work(&sbi->s_error_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: + /* flush s_error_work before sbi destroy */ flush_work(&sbi->s_error_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); diff --git a/fs/fcntl.c b/fs/fcntl.c index 71b43538fa44cf2eb46f3168336f54caa1f7cc50..3eebcbead8bfa444cffd2f0abc3a632e5bf51f8b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -1042,7 +1042,8 @@ static int __init fcntl_init(void) __FMODE_EXEC | __FMODE_NONOTIFY)); fasync_cache = kmem_cache_create("fasync_cache", - sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); + sizeof(struct fasync_struct), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } diff --git a/fs/fhandle.c b/fs/fhandle.c index 01263ffbc4c0873544c28f68e408ce314ccddcc9..718defdf1e0e2d4a8c4486ee00234f475022ca45 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -229,7 +229,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, path_put(&path); return fd; } - file = file_open_root(path.dentry, path.mnt, "", open_flag, 0); + file = file_open_root(&path, "", open_flag, 0); if (IS_ERR(file)) { put_unused_fd(fd); retval = PTR_ERR(file); diff --git a/fs/fs_context.c b/fs/fs_context.c index 2834d1afa6e8053e93c6e78122be8d58107a36c9..b7e43a780a625bca1b0faeba53e2702463ad0496 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -79,6 +79,35 @@ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) return -ENOPARAM; } +/** + * vfs_parse_fs_param_source - Handle setting "source" via parameter + * @fc: The filesystem context to modify + * @param: The parameter + * + * This is a simple helper for filesystems to verify that the "source" they + * accept is sane. + * + * Returns 0 on success, -ENOPARAM if this is not the "source" parameter, and + * -EINVAL otherwise. In the event of failure, supplementary error information + * is logged. + */ +int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param) +{ + if (strcmp(param->key, "source") != 0) + return -ENOPARAM; + + if (param->type != fs_value_is_string) + return invalf(fc, "Non-string source"); + + if (fc->source) + return invalf(fc, "Multiple sources"); + + fc->source = param->string; + param->string = NULL; + return 0; +} +EXPORT_SYMBOL(vfs_parse_fs_param_source); + /** * vfs_parse_fs_param - Add a single parameter to a superblock config * @fc: The filesystem context to modify @@ -122,15 +151,9 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) /* If the filesystem doesn't take any arguments, give it the * default handling of source.
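The kernel-doc above fixes the helper's contract: 0 when "source" was consumed, -ENOPARAM when the key is not "source", -EINVAL on a malformed source. Below is a schematic sketch of how a filesystem's ->parse_param() chains into it; "myfs" and myfs_fs_parameters are placeholders, and the shape follows the ramfs conversion later in this series:

```c
#include <linux/fs_context.h>
#include <linux/fs_parser.h>

static int myfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, myfs_fs_parameters, param, &result);
	if (opt == -ENOPARAM) {
		/* not one of ours: maybe it is "source" */
		opt = vfs_parse_fs_param_source(fc, param);
		if (opt != -ENOPARAM)
			return opt;	/* consumed, or -EINVAL */
		return -ENOPARAM;	/* genuinely unknown key */
	}
	if (opt < 0)
		return opt;

	/* ... act on the recognized option via 'result' ... */
	return 0;
}
```

(ramfs instead returns 0 for unknown keys, since it traditionally ignores all mount options.)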
*/ - if (strcmp(param->key, "source") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "VFS: Non-string source"); - if (fc->source) - return invalf(fc, "VFS: Multiple sources"); - fc->source = param->string; - param->string = NULL; - return 0; - } + ret = vfs_parse_fs_param_source(fc, param); + if (ret != -ENOPARAM) + return ret; return invalf(fc, "%s: Unknown parameter '%s'", fc->fs_type->name, param->key); @@ -231,7 +254,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, struct fs_context *fc; int ret = -ENOMEM; - fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); + fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT); if (!fc) return ERR_PTR(-ENOMEM); @@ -504,16 +527,11 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) struct legacy_fs_context *ctx = fc->fs_private; unsigned int size = ctx->data_size; size_t len = 0; + int ret; - if (strcmp(param->key, "source") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "VFS: Legacy: Non-string source"); - if (fc->source) - return invalf(fc, "VFS: Legacy: Multiple sources"); - fc->source = param->string; - param->string = NULL; - return 0; - } + ret = vfs_parse_fs_param_source(fc, param); + if (ret != -ENOPARAM) + return ret; if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); @@ -631,7 +649,7 @@ const struct fs_context_operations legacy_fs_context_ops = { */ static int legacy_init_fs_context(struct fs_context *fc) { - fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL); + fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT); if (!fc->fs_private) return -ENOMEM; fc->ops = &legacy_fs_context_ops; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5fc9ccab907c3f1d1e13e2c4b3bc0fcfe499d8bf..c7dec890f9d9520dbc181739a2dff6613f67cfaa 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -655,7 +655,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * as well as being converted to page offsets. 
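Further down, the hugetlbfs fallocate hunk replaces an open-coded round-up shift with DIV_ROUND_UP_ULL(). For a power-of-two huge-page size the two forms compute the same ceiling; a toy self-check of that identity (userspace, not kernel code):

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t hpage_size = 1ULL << 21;	/* 2 MiB huge page */
	unsigned int hpage_shift = 21;
	uint64_t x;

	for (x = 0; x < 4 * hpage_size; x += 4096) {
		uint64_t shifted = (x + hpage_size - 1) >> hpage_shift;
		uint64_t divided = x / hpage_size + (x % hpage_size != 0);

		assert(shifted == divided);	/* same ceiling */
	}
	return 0;
}
```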
*/ start = offset >> hpage_shift; - end = (offset + len + hpage_size - 1) >> hpage_shift; + end = DIV_ROUND_UP_ULL(offset + len, hpage_size); inode_lock(inode); diff --git a/fs/internal.h b/fs/internal.h index 5155f6ce95c79498be285643bae25019cd5cd8d9..0d4f7e4e2f3aa8cf49d5682ee9157a94e405364a 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -128,7 +128,7 @@ struct open_flags { }; extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); -extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, +extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); diff --git a/fs/io_uring.c b/fs/io_uring.c index 104dff9c7131495e0a676b4c4e3c5c2816eeb6c4..f3078708fdb86cf394dea9d0b28cd27d876277e0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2056,6 +2056,16 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) return ret; } +static void io_req_task_work_add_fallback(struct io_kiocb *req, + void (*cb)(struct callback_head *)) +{ + struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq); + + init_task_work(&req->task_work, cb); + task_work_add(tsk, &req->task_work, TWA_NONE); + wake_up_process(tsk); +} + static void __io_req_task_cancel(struct io_kiocb *req, int error) { struct io_ring_ctx *ctx = req->ctx; @@ -2113,14 +2123,8 @@ static void io_req_task_queue(struct io_kiocb *req) percpu_ref_get(&req->ctx->refs); ret = io_req_task_work_add(req, true); - if (unlikely(ret)) { - struct task_struct *tsk; - - init_task_work(&req->task_work, io_req_task_cancel); - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); - } + if (unlikely(ret)) + io_req_task_work_add_fallback(req, io_req_task_cancel); } static void io_queue_next(struct io_kiocb *req) @@ -2239,13 +2243,8 @@ static void io_free_req_deferred(struct io_kiocb *req) init_task_work(&req->task_work, io_put_req_deferred_cb); ret = io_req_task_work_add(req, true); - if (unlikely(ret)) { - struct task_struct *tsk; - - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); - } + if (unlikely(ret)) + io_req_task_work_add_fallback(req, io_put_req_deferred_cb); } static inline void io_put_req_deferred(struct io_kiocb *req, int refs) @@ -3345,15 +3344,8 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); ret = io_req_task_work_add(req, true); - if (unlikely(ret)) { - struct task_struct *tsk; - - /* queue just for cancelation */ - init_task_work(&req->task_work, io_req_task_cancel); - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); - } + if (unlikely(ret)) + io_req_task_work_add_fallback(req, io_req_task_cancel); return 1; } @@ -4963,12 +4955,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, */ ret = io_req_task_work_add(req, twa_signal_ok); if (unlikely(ret)) { - struct task_struct *tsk; - WRITE_ONCE(poll->canceled, true); - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); + io_req_task_work_add_fallback(req, func); } return 1; } @@ -6147,7 +6135,11 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) /* if 
NO_CANCEL is set, we must still run the work */ if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == IO_WQ_WORK_CANCEL) { - ret = -ECANCELED; + /* io-wq is going to take down one */ + refcount_inc(&req->refs); + percpu_ref_get(&req->ctx->refs); + io_req_task_work_add_fallback(req, io_req_task_cancel); + return io_steal_work(req); } if (!ret) { diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index c84d87f558cb62eeb312ad9f74abc59aa257f9a1..1b07550485b96415f0a79a7029b10bf5d78c643e 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -160,7 +160,7 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset, get_fs_root(init_task.fs, &root); task_unlock(&init_task); - file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0); + file = file_open_root(&root, path, O_RDONLY, 0); path_put(&root); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/fs/locks.c b/fs/locks.c index 32c948fe294487ca17b471aebb97ea41f40f34c5..6e118955b9b648938d49312594fec0782deb1a7e 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -542,7 +542,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, if (l->l_len > 0) { if (l->l_len - 1 > OFFSET_MAX - fl->fl_start) return -EOVERFLOW; - fl->fl_end = fl->fl_start + l->l_len - 1; + fl->fl_end = fl->fl_start + (l->l_len - 1); } else if (l->l_len < 0) { if (fl->fl_start + l->l_len < 0) diff --git a/fs/namei.c b/fs/namei.c index 4c9d0c36545d3d850d96dec993290b28389b012a..c94a814e86b22802d43915e9fd0c782701e5c60a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -504,7 +504,7 @@ struct nameidata { struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ - unsigned int flags; + unsigned int flags, state; unsigned seq, m_seq, r_seq; int last_type; unsigned depth; @@ -523,6 +523,10 @@ struct nameidata { umode_t dir_mode; } __randomize_layout; +#define ND_ROOT_PRESET 1 +#define ND_ROOT_GRABBED 2 +#define ND_JUMPED 4 + static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; @@ -531,6 +535,7 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) p->name = name; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; + p->state = 0; current->nameidata = p; } @@ -593,9 +598,9 @@ static void terminate_walk(struct nameidata *nd) path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); - if (nd->flags & LOOKUP_ROOT_GRABBED) { + if (nd->state & ND_ROOT_GRABBED) { path_put(&nd->root); - nd->flags &= ~LOOKUP_ROOT_GRABBED; + nd->state &= ~ND_ROOT_GRABBED; } } else { nd->flags &= ~LOOKUP_RCU; @@ -651,9 +656,9 @@ static bool legitimize_root(struct nameidata *nd) if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED)) return false; /* Nothing to do if nd->root is zero or is managed by the VFS user. */ - if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT)) + if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq); } @@ -790,8 +795,9 @@ static int complete_walk(struct nameidata *nd) * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. 
*/ - if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED))) - nd->root.mnt = NULL; + if (!(nd->state & ND_ROOT_PRESET)) + if (!(nd->flags & LOOKUP_IS_SCOPED)) + nd->root.mnt = NULL; if (!try_to_unlazy(nd)) return -ECHILD; } @@ -817,7 +823,7 @@ static int complete_walk(struct nameidata *nd) return -EXDEV; } - if (likely(!(nd->flags & LOOKUP_JUMPED))) + if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) @@ -855,7 +861,7 @@ static int set_root(struct nameidata *nd) } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; } return 0; } @@ -888,7 +894,7 @@ static int nd_jump_root(struct nameidata *nd) path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; return 0; } @@ -916,7 +922,7 @@ int nd_jump_link(struct path *path) path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; return 0; err: @@ -1338,7 +1344,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, if (mounted) { path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; *seqp = read_seqcount_begin(&dentry->d_seq); *inode = dentry->d_inode; /* @@ -1383,7 +1389,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, if (unlikely(nd->flags & LOOKUP_NO_XDEV)) ret = -EXDEV; else - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; } if (unlikely(ret)) { dput(path->dentry); @@ -2129,7 +2135,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) case 2: if (name[1] == '.') { type = LAST_DOTDOT; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; } break; case 1: @@ -2137,7 +2143,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) } if (likely(type == LAST_NORM)) { struct dentry *parent = nd->path.dentry; - nd->flags &= ~LOOKUP_JUMPED; + nd->state &= ~ND_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { struct qstr this = { { .hash_len = hash_len }, .name = name }; err = parent->d_op->d_hash(parent, &this); @@ -2207,14 +2213,15 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (flags & LOOKUP_RCU) rcu_read_lock(); - nd->flags = flags | LOOKUP_JUMPED; + nd->flags = flags; + nd->state |= ND_JUMPED; nd->depth = 0; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); - if (flags & LOOKUP_ROOT) { + if (nd->state & ND_ROOT_PRESET) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) @@ -2291,7 +2298,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->root_seq = nd->seq; } else { path_get(&nd->root); - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; } } return s; @@ -2330,7 +2337,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path ; if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd); - nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... + nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... 
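One subtlety of moving this state out of nd->flags shows up in the filename_lookup() hunk below: set_nameidata() now initializes nd->state to 0, so the preset-root bit has to be set after that call, which is why the patch reorders the two. A condensed sketch of the resulting sequence (schematic, not the kernel code):

```c
static int lookup_with_preset_root(struct nameidata *nd, int dfd,
				   struct filename *name,
				   const struct path *root,
				   struct path *result)
{
	set_nameidata(nd, dfd, name);		/* zeroes nd->state */
	if (root) {
		nd->root = *root;
		nd->state = ND_ROOT_PRESET;	/* survives: set afterwards */
	}
	return path_lookupat(nd, LOOKUP_RCU, result);
}
```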
} if (!err) err = complete_walk(nd); @@ -2354,11 +2361,11 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); + set_nameidata(&nd, dfd, name); if (unlikely(root)) { nd.root = *root; - flags |= LOOKUP_ROOT; + nd.state = ND_ROOT_PRESET; } - set_nameidata(&nd, dfd, name); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); @@ -3393,18 +3400,15 @@ struct file *do_filp_open(int dfd, struct filename *pathname, return filp; } -struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, +struct file *do_file_open_root(const struct path *root, const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; struct filename *filename; - int flags = op->lookup_flags | LOOKUP_ROOT; - - nd.root.mnt = mnt; - nd.root.dentry = dentry; + int flags = op->lookup_flags; - if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) + if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); filename = getname_kernel(name); @@ -3412,6 +3416,8 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, return ERR_CAST(filename); set_nameidata(&nd, -1, filename); + nd.root = *root; + nd.state = ND_ROOT_PRESET; file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) file = path_openat(&nd, op, flags); diff --git a/fs/namespace.c b/fs/namespace.c index 046b084136c51a0a36972c326f63d65bae5d4e7d..6e76f2a72cfca0a691f125e2ffafaa569a585fd4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -183,7 +183,8 @@ static struct mount *alloc_vfsmnt(const char *name) goto out_free_cache; if (name) { - mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL); + mnt->mnt_devname = kstrdup_const(name, + GFP_KERNEL_ACCOUNT); if (!mnt->mnt_devname) goto out_free_id; } @@ -3282,7 +3283,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!ucounts) return ERR_PTR(-ENOSPC); - new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); @@ -3840,7 +3841,7 @@ void __init mnt_init(void) int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 5a59dcdce0b2b2a1c150acdb9a9c7f4281d101b9..cb7f49723bbf1d6958ef41933ade0ccd36cc9c59 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -271,8 +271,6 @@ TRACE_DEFINE_ENUM(LOOKUP_OPEN); TRACE_DEFINE_ENUM(LOOKUP_CREATE); TRACE_DEFINE_ENUM(LOOKUP_EXCL); TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET); -TRACE_DEFINE_ENUM(LOOKUP_JUMPED); -TRACE_DEFINE_ENUM(LOOKUP_ROOT); TRACE_DEFINE_ENUM(LOOKUP_EMPTY); TRACE_DEFINE_ENUM(LOOKUP_DOWN); @@ -288,8 +286,6 @@ TRACE_DEFINE_ENUM(LOOKUP_DOWN); { LOOKUP_CREATE, "CREATE" }, \ { LOOKUP_EXCL, "EXCL" }, \ { LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \ - { LOOKUP_JUMPED, "JUMPED" }, \ - { LOOKUP_ROOT, "ROOT" }, \ { LOOKUP_EMPTY, "EMPTY" }, \ { LOOKUP_DOWN, "DOWN" }) diff --git a/fs/open.c b/fs/open.c index 3aaaad47d9cac24c0351c8d5a86ec0b7dbd67ea4..7ce64c08bb40a0672fa807bb29513087431fb837 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1149,7 +1149,7 @@ struct file *filp_open(const char *filename, 
int flags, umode_t mode) } EXPORT_SYMBOL(filp_open); -struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, +struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; @@ -1157,7 +1157,7 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); - return do_file_open_root(dentry, mnt, filename, &op); + return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index f7135777cb4eb0567752a818eed883ea6b9d6819..d1ae643a999a5ca54a8e846f3eacb947f845a13e 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -17,6 +17,7 @@ struct ovl_aio_req { struct kiocb iocb; + refcount_t ref; struct kiocb *orig_iocb; struct fd fd; }; @@ -257,6 +258,14 @@ static rwf_t ovl_iocb_to_rwf(int ifl) return flags; } +static inline void ovl_aio_put(struct ovl_aio_req *aio_req) +{ + if (refcount_dec_and_test(&aio_req->ref)) { + fdput(aio_req->fd); + kmem_cache_free(ovl_aio_request_cachep, aio_req); + } +} + static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) { struct kiocb *iocb = &aio_req->iocb; @@ -273,8 +282,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) } orig_iocb->ki_pos = iocb->ki_pos; - fdput(aio_req->fd); - kmem_cache_free(ovl_aio_request_cachep, aio_req); + ovl_aio_put(aio_req); } static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) @@ -324,7 +332,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) aio_req->orig_iocb = iocb; kiocb_clone(&aio_req->iocb, iocb, real.file); aio_req->iocb.ki_complete = ovl_aio_rw_complete; + refcount_set(&aio_req->ref, 2); ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); + ovl_aio_put(aio_req); if (ret != -EIOCBQUEUED) ovl_aio_cleanup_handler(aio_req); } @@ -395,7 +405,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) kiocb_clone(&aio_req->iocb, iocb, real.file); aio_req->iocb.ki_flags = ifl; aio_req->iocb.ki_complete = ovl_aio_rw_complete; + refcount_set(&aio_req->ref, 2); ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); + ovl_aio_put(aio_req); if (ret != -EIOCBQUEUED) ovl_aio_cleanup_handler(aio_req); } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 092812c2f118a971f4d98cac5620827b55aa5369..b1c8a38573c93f607196a4346a7be1c2652bba3c 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -1004,6 +1004,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * Just make sure a corresponding data dentry has been found. 
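The overlayfs hunks above fix the AIO completion path by giving each request two references, one owned by the submitter and one by the completion handler, so the request is freed exactly once by whichever side finishes last. The scheme in isolation, with placeholder names and a hypothetical start_async_io() standing in for the real submission call:

```c
#include <linux/refcount.h>
#include <linux/slab.h>

struct aio_req {
	refcount_t ref;
	/* ... request state ... */
};

static ssize_t start_async_io(struct aio_req *req);	/* placeholder */

static void aio_req_put(struct aio_req *req)
{
	/* frees on the last put, whichever side runs last */
	if (refcount_dec_and_test(&req->ref))
		kfree(req);
}

/* completion side, runs asynchronously when the I/O finishes */
static void aio_complete(struct aio_req *req)
{
	/* ... propagate the result to the original iocb ... */
	aio_req_put(req);
}

/* submit side */
static ssize_t aio_submit(struct aio_req *req)
{
	ssize_t ret;

	refcount_set(&req->ref, 2);	/* submitter + completion handler */
	ret = start_async_io(req);
	aio_req_put(req);		/* drop the submitter's reference */
	return ret;
}
```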
*/ if (d.metacopy || (uppermetacopy && !ctr)) { + pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n", + dentry); err = -EIO; goto out_put; } else if (!d.is_dir && upperdentry && !ctr && origin_path) { diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 070d2df8ab9cf02cccfdb276c570331ac9d3065f..ffed75f833b7b362a1e5266f94d76c989b467b98 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1803,7 +1803,7 @@ static int process_sysctl_arg(char *param, char *val, panic("%s: Failed to allocate path for %s\n", __func__, param); strreplace(path, '.', '/'); - file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0); + file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0); if (IS_ERR(file)) { err = PTR_ERR(file); if (err == -ENOENT) diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index c5562c871c8beff8e12bb92dfa75b1066e3ff3fc..1a188fbdf34e5cff916063ea6071d0f74ed0a088 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -423,6 +423,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, quota_error(dquot->dq_sb, "Quota structure has offset to " "other block (%u) than it should (%u)", blk, (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); + ret = -EIO; goto out_buf; } ret = read_blk(info, blk, buf); @@ -488,6 +489,13 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) { + quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", + newblk, info->dqi_blocks); + ret = -EUCLEAN; + goto out_buf; + } + if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); newblk = 0; @@ -587,6 +595,13 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); if (!blk) /* No reference? */ goto out_buf; + if (blk < QT_TREEOFF || blk >= info->dqi_blocks) { + quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", + blk, info->dqi_blocks); + ret = -EUCLEAN; + goto out_buf; + } + if (depth < info->dqi_qtree_depth - 1) ret = find_tree_dqentry(info, dquot, blk, depth+1); else diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index ee179a81b3da8f2a987716944b842073f81e0780..0163dcff9318b4fcdbb79239e4c488c1942ba9ae 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -193,17 +193,20 @@ static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param) int opt; opt = fs_parse(fc, ramfs_fs_parameters, param, &result); - if (opt < 0) { + if (opt == -ENOPARAM) { + opt = vfs_parse_fs_param_source(fc, param); + if (opt != -ENOPARAM) + return opt; /* * We might like to report bad mount options here; * but traditionally ramfs has ignored all mount options, * and as it is used as a !CONFIG_SHMEM simple substitute * for tmpfs, better continue to ignore other mount options. 
*/ - if (opt == -ENOPARAM) - opt = 0; - return opt; + return 0; } + if (opt < 0) + return opt; switch (opt) { case Opt_mode: diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 88cc94be10765c7f76b223536a8195eeb30f0296..f7128ae7b949cead57ee0b2a7f4450908587dfd2 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -64,6 +65,24 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( return decompressor; } +static int squashfs_bdi_init(struct super_block *sb) +{ + int err; + unsigned int major = MAJOR(sb->s_dev); + unsigned int minor = MINOR(sb->s_dev); + + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + + err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); + if (err) + return err; + + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; + + return 0; +} static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { @@ -78,6 +97,19 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Entered squashfs_fill_superblock\n"); + /* + * squashfs provides 'backing_dev_info' in order to disable read-ahead. For + * squashfs, I/O is not deferred, it is done immediately in readpage, + * which means the user would have to wait not just for their own I/O + * but the read-ahead I/O as well, i.e. completely pointless. squashfs_bdi_init() + * will set sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0. + */ + err = squashfs_bdi_init(sb); + if (err) { + errorf(fc, "squashfs init bdi failed"); + return err; + } + sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index b5cdac9b0368dca93491f0331695f475032b1db4..c4fc1047fc0797f492a8f0cdf596f9f2215fd907 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -701,13 +701,13 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) out_dump: ubifs_err(c, "dumping index node (iip=%d)", i->iip); - ubifs_dump_node(c, idx); + ubifs_dump_node(c, idx, ubifs_idx_node_sz(c, c->fanout)); list_del(&i->list); kfree(i); if (!list_empty(&list)) { i = list_entry(list.prev, struct idx_node, list); ubifs_err(c, "dumping parent index node"); - ubifs_dump_node(c, &i->idx); + ubifs_dump_node(c, &i->idx, ubifs_idx_node_sz(c, c->fanout)); } out_free: while (!list_empty(&list)) { diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index ebff43f8009c2966a2a1d585a70713a02c3ef4ef..89320c89cf0d1bd028d0e4d6284bb132a92710be 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -291,9 +291,9 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) kfree(pdent); } -void ubifs_dump_node(const struct ubifs_info *c, const void *node) +void ubifs_dump_node(const struct ubifs_info *c, const void *node, int node_len) { - int i, n; + int i, n, type, safe_len, max_node_len, min_node_len; union ubifs_key key; const struct ubifs_ch *ch = node; char key_buf[DBG_KEY_BUF_LEN]; @@ -306,10 +306,40 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) return; } + /* Skip dumping nodes of unknown type */ + type = ch->node_type; + if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) { + pr_err("node type %d was not recognized\n", type); + return; + } + spin_lock(&dbg_lock); dump_ch(node); - switch (ch->node_type) { + if (c->ranges[type].max_len == 0) { + max_node_len = min_node_len = c->ranges[type].len; + } else { + max_node_len =
c->ranges[type].max_len; + min_node_len = c->ranges[type].min_len; + } + safe_len = le32_to_cpu(ch->len); + safe_len = safe_len > 0 ? safe_len : 0; + safe_len = min3(safe_len, max_node_len, node_len); + if (safe_len < min_node_len) { + pr_err("node len(%d) is too short for %s, left %d bytes:\n", + safe_len, dbg_ntype(type), + safe_len > UBIFS_CH_SZ ? + safe_len - (int)UBIFS_CH_SZ : 0); + if (safe_len > UBIFS_CH_SZ) + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1, + (void *)node + UBIFS_CH_SZ, + safe_len - UBIFS_CH_SZ, 0); + goto out_unlock; + } + if (safe_len != le32_to_cpu(ch->len)) + pr_err("\ttruncated node length %d\n", safe_len); + + switch (type) { case UBIFS_PAD_NODE: { const struct ubifs_pad_node *pad = node; @@ -453,7 +483,8 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) pr_err("\tnlen %d\n", nlen); pr_err("\tname "); - if (nlen > UBIFS_MAX_NLEN) + if (nlen > UBIFS_MAX_NLEN || + nlen > safe_len - UBIFS_DENT_NODE_SZ) pr_err("(bad name length, not printing, bad or corrupted node)"); else { for (i = 0; i < nlen && dent->name[i]; i++) @@ -467,7 +498,6 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) case UBIFS_DATA_NODE: { const struct ubifs_data_node *dn = node; - int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; key_read(c, &dn->key, &key); pr_err("\tkey %s\n", @@ -475,10 +505,13 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) pr_err("\tsize %u\n", le32_to_cpu(dn->size)); pr_err("\tcompr_typ %d\n", (int)le16_to_cpu(dn->compr_type)); - pr_err("\tdata size %d\n", dlen); - pr_err("\tdata:\n"); + pr_err("\tdata size %u\n", + le32_to_cpu(ch->len) - (unsigned int)UBIFS_DATA_NODE_SZ); + pr_err("\tdata (length = %d):\n", + safe_len - (int)UBIFS_DATA_NODE_SZ); print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1, - (void *)&dn->data, dlen, 0); + (void *)&dn->data, + safe_len - (int)UBIFS_DATA_NODE_SZ, 0); break; } case UBIFS_TRUN_NODE: @@ -495,13 +528,16 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) case UBIFS_IDX_NODE: { const struct ubifs_idx_node *idx = node; + int max_child_cnt = (safe_len - UBIFS_IDX_NODE_SZ) / + (ubifs_idx_node_sz(c, 1) - + UBIFS_IDX_NODE_SZ); - n = le16_to_cpu(idx->child_cnt); - pr_err("\tchild_cnt %d\n", n); + n = min_t(int, le16_to_cpu(idx->child_cnt), max_child_cnt); + pr_err("\tchild_cnt %d\n", (int)le16_to_cpu(idx->child_cnt)); pr_err("\tlevel %d\n", (int)le16_to_cpu(idx->level)); pr_err("\tBranches:\n"); - for (i = 0; i < n && i < c->fanout - 1; i++) { + for (i = 0; i < n && i < c->fanout; i++) { const struct ubifs_branch *br; br = ubifs_idx_branch(c, idx, i); @@ -525,7 +561,7 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) le64_to_cpu(orph->cmt_no) & LLONG_MAX); pr_err("\tlast node flag %llu\n", (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); - n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; + n = (safe_len - UBIFS_ORPH_NODE_SZ) >> 3; pr_err("\t%d orphan inode numbers:\n", n); for (i = 0; i < n; i++) pr_err("\t ino %llu\n", @@ -537,9 +573,10 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) break; } default: - pr_err("node type %d was not recognized\n", - (int)ch->node_type); + pr_err("node type %d was not recognized\n", type); } + +out_unlock: spin_unlock(&dbg_lock); } @@ -791,22 +828,6 @@ void ubifs_dump_lpt_info(struct ubifs_info *c) spin_unlock(&dbg_lock); } -void ubifs_dump_sleb(const struct ubifs_info *c, - const struct ubifs_scan_leb *sleb, int offs) -{ - struct ubifs_scan_node *snod; - - 
pr_err("(pid %d) start dumping scanned data from LEB %d:%d\n", - current->pid, sleb->lnum, offs); - - list_for_each_entry(snod, &sleb->nodes, list) { - cond_resched(); - pr_err("Dumping node at LEB %d:%d len %d\n", - sleb->lnum, snod->offs, snod->len); - ubifs_dump_node(c, snod->node); - } -} - void ubifs_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; @@ -834,7 +855,7 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) cond_resched(); pr_err("Dumping node at LEB %d:%d len %d\n", lnum, snod->offs, snod->len); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, c->leb_size - snod->offs); } pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum); @@ -1212,7 +1233,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, ubifs_err(c, "but it should have key %s according to tnc", dbg_snprintf_key(c, &zbr1->key, key_buf, DBG_KEY_BUF_LEN)); - ubifs_dump_node(c, dent1); + ubifs_dump_node(c, dent1, UBIFS_MAX_DENT_NODE_SZ); goto out_free; } @@ -1224,7 +1245,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, ubifs_err(c, "but it should have key %s according to tnc", dbg_snprintf_key(c, &zbr2->key, key_buf, DBG_KEY_BUF_LEN)); - ubifs_dump_node(c, dent2); + ubifs_dump_node(c, dent2, UBIFS_MAX_DENT_NODE_SZ); goto out_free; } @@ -1243,9 +1264,9 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); ubifs_msg(c, "first node at %d:%d\n", zbr1->lnum, zbr1->offs); - ubifs_dump_node(c, dent1); + ubifs_dump_node(c, dent1, UBIFS_MAX_DENT_NODE_SZ); ubifs_msg(c, "second node at %d:%d\n", zbr2->lnum, zbr2->offs); - ubifs_dump_node(c, dent2); + ubifs_dump_node(c, dent2, UBIFS_MAX_DENT_NODE_SZ); out_free: kfree(dent2); @@ -2110,7 +2131,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, out_dump: ubifs_msg(c, "dump of node at LEB %d:%d", zbr->lnum, zbr->offs); - ubifs_dump_node(c, node); + ubifs_dump_node(c, node, zbr->len); out_free: kfree(node); return err; @@ -2243,7 +2264,7 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd) ubifs_msg(c, "dump of the inode %lu sitting in LEB %d:%d", (unsigned long)fscki->inum, zbr->lnum, zbr->offs); - ubifs_dump_node(c, ino); + ubifs_dump_node(c, ino, zbr->len); kfree(ino); return -EINVAL; } @@ -2314,12 +2335,12 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) if (sa->type != UBIFS_DATA_NODE) { ubifs_err(c, "bad node type %d", sa->type); - ubifs_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node, c->leb_size - sa->offs); return -EINVAL; } if (sb->type != UBIFS_DATA_NODE) { ubifs_err(c, "bad node type %d", sb->type); - ubifs_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node, c->leb_size - sb->offs); return -EINVAL; } @@ -2350,8 +2371,8 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) return 0; error_dump: - ubifs_dump_node(c, sa->node); - ubifs_dump_node(c, sb->node); + ubifs_dump_node(c, sa->node, c->leb_size - sa->offs); + ubifs_dump_node(c, sb->node, c->leb_size - sb->offs); return -EINVAL; } @@ -2382,13 +2403,13 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && sa->type != UBIFS_XENT_NODE) { ubifs_err(c, "bad node type %d", sa->type); - ubifs_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node, c->leb_size - sa->offs); return -EINVAL; } if (sb->type != 
UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE && sb->type != UBIFS_XENT_NODE) { ubifs_err(c, "bad node type %d", sb->type); - ubifs_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node, c->leb_size - sb->offs); return -EINVAL; } @@ -2438,11 +2459,10 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) error_dump: ubifs_msg(c, "dumping first node"); - ubifs_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node, c->leb_size - sa->offs); ubifs_msg(c, "dumping second node"); - ubifs_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node, c->leb_size - sb->offs); return -EINVAL; - return 0; } static inline int chance(unsigned int n, unsigned int out_of) diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 7763639a426bab78773138f29db5d276f2990f3e..ed966108da806793f6f6991d9dff67d573709907 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -242,7 +242,8 @@ const char *dbg_get_key_dump(const struct ubifs_info *c, const char *dbg_snprintf_key(const struct ubifs_info *c, const union ubifs_key *key, char *buffer, int len); void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode); -void ubifs_dump_node(const struct ubifs_info *c, const void *node); +void ubifs_dump_node(const struct ubifs_info *c, const void *node, + int node_len); void ubifs_dump_budget_req(const struct ubifs_budget_req *req); void ubifs_dump_lstats(const struct ubifs_lp_stats *lst); void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); @@ -251,8 +252,6 @@ void ubifs_dump_lprop(const struct ubifs_info *c, void ubifs_dump_lprops(struct ubifs_info *c); void ubifs_dump_lpt_info(struct ubifs_info *c); void ubifs_dump_leb(const struct ubifs_info *c, int lnum); -void ubifs_dump_sleb(const struct ubifs_info *c, - const struct ubifs_scan_leb *sleb, int offs); void ubifs_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode); void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ad90a3a64293eb648d1929ed1e9d66aea29f6611..3db1c75fda5bbc326496ff8b199451b36ce8824b 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -348,20 +348,97 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return err; } -static int do_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode, struct inode **whiteout) +static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) { + int err; + umode_t mode = S_IFCHR | WHITEOUT_MODE; struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1}; + struct fscrypt_name nm; + + /* + * Create an inode ('nlink = 1') for the whiteout without updating the + * journal; let ubifs_jnl_rename() store it on flash to complete the + * rename whiteout atomically. + */ + + dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dentry, mode, dir->i_ino); + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + return ERR_PTR(err); + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); + + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + + /* The dir size is updated by do_rename.
*/ + insert_inode_hash(inode); + + return inode; + +out_inode: + make_bad_inode(inode); + iput(inode); +out_free: + fscrypt_free_filename(&nm); + ubifs_err(c, "cannot create whiteout file, error %d", err); + return ERR_PTR(err); +} + +/** + * lock_2_inodes - a wrapper for locking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. + */ +static void lock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); +} + +/** + * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + */ +static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + +static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + struct inode *inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; - struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir); + struct ubifs_inode *ui; int err, instantiated = 0; struct fscrypt_name nm; /* - * Budget request settings: new dirty inode, new direntry, - * budget for dirtied inode will be released via writeback. + * Budget request settings: new inode, new direntry, changing the + * parent directory inode. + * Allocate budget separately for new dirtied inode, the budget will + * be released via writeback. */ dbg_gen("dent '%pd', mode %#hx in dir ino %lu", @@ -391,42 +468,30 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, } ui = ubifs_inode(inode); - if (whiteout) { - init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); - ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); - } - err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) goto out_inode; mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); - - if (whiteout) { - mark_inode_dirty(inode); - drop_nlink(inode); - *whiteout = inode; - } else { - d_tmpfile(dentry, inode); - } + d_tmpfile(dentry, inode); ubifs_assert(c, ui->dirty); instantiated = 1; mutex_unlock(&ui->ui_mutex); - mutex_lock(&dir_ui->ui_mutex); + lock_2_inodes(dir, inode); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); return 0; out_cancel: - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); out_inode: make_bad_inode(inode); if (!instantiated) @@ -440,12 +505,6 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, return err; } -static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode) -{ - return do_tmpfile(dir, dentry, mode, NULL); -} - /** * vfs_dent_type - get VFS directory entry type. * @type: UBIFS directory entry type @@ -659,32 +718,6 @@ static int ubifs_dir_release(struct inode *dir, struct file *file) return 0; } -/** - * lock_2_inodes - a wrapper for locking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - * - * We do not implement any tricks to guarantee strict lock ordering, because - * VFS has already done it for us on the @i_mutex. 
So this is just a simple - * wrapper function. - */ -static void lock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); - mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); -} - -/** - * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - */ -static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_unlock(&ubifs_inode(inode2)->ui_mutex); - mutex_unlock(&ubifs_inode(inode1)->ui_mutex); -} - static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { @@ -947,7 +980,8 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, sz_change; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct fscrypt_name nm; /* @@ -1262,17 +1296,19 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; + struct ubifs_budget_req wht_req; struct timespec64 time; unsigned int saved_nlink; struct fscrypt_name old_nm, new_nm; /* - * Budget request settings: deletion direntry, new direntry, removing - * the old inode, and changing old and new parent directory inodes. + * Budget request settings: + * req: deletion direntry, new direntry, removing the old inode, + * and changing old and new parent directory inodes. * - * However, this operation also marks the target inode as dirty and - * does not write it, so we allocate budget for the target inode - * separately. + * wht_req: new whiteout inode for RENAME_WHITEOUT. + * + * ino_req: marks the target inode as dirty and does not write it. */ dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x", @@ -1329,20 +1365,41 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_release; } - err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout); - if (err) { + /* + * The whiteout inode has no dentry and is pinned in memory; + * unmount cannot happen during the rename process because we + * hold the parent dentry. + */ + whiteout = create_whiteout(old_dir, old_dentry); + if (IS_ERR(whiteout)) { + err = PTR_ERR(whiteout); kfree(dev); goto out_release; } - spin_lock(&whiteout->i_lock); - whiteout->i_state |= I_LINKABLE; - spin_unlock(&whiteout->i_lock); - whiteout_ui = ubifs_inode(whiteout); whiteout_ui->data = dev; whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); ubifs_assert(c, !whiteout_ui->dirty); + + memset(&wht_req, 0, sizeof(struct ubifs_budget_req)); + wht_req.new_ino = 1; + wht_req.new_ino_d = ALIGN(whiteout_ui->data_len, 8); + /* + * To avoid a deadlock between the space budget (which holds + * ui_mutex and waits for the wb work) and the writeback work + * (which waits for ui_mutex), take the space budget before the + * ubifs inodes are locked. + */ + err = ubifs_budget_space(c, &wht_req); + if (err) { + /* + * The whiteout inode cannot be written on flash by + * ubifs_jnl_write_inode(), because it is neither + * dirty nor zero-nlink.
+ */ + iput(whiteout); + goto out_release; + } } lock_4_inodes(old_dir, new_dir, new_inode, whiteout); @@ -1414,29 +1471,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); if (unlink && IS_SYNC(new_inode)) sync = 1; - } - - if (whiteout) { - struct ubifs_budget_req wht_req = { .dirtied_ino = 1, - .dirtied_ino_d = \ - ALIGN(ubifs_inode(whiteout)->data_len, 8) }; - - err = ubifs_budget_space(c, &wht_req); - if (err) { - kfree(whiteout_ui->data); - whiteout_ui->data_len = 0; - iput(whiteout); - goto out_release; - } - - inc_nlink(whiteout); - mark_inode_dirty(whiteout); - - spin_lock(&whiteout->i_lock); - whiteout->i_state &= ~I_LINKABLE; - spin_unlock(&whiteout->i_lock); - - iput(whiteout); + /* + * The whiteout's S_SYNC flag is inherited from old_dir, and we + * have already checked the old dir inode, so there is no need + * to check the whiteout. + */ } err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir, @@ -1447,6 +1486,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); ubifs_release_budget(c, &req); + if (whiteout) { + ubifs_release_budget(c, &wht_req); + iput(whiteout); + } + mutex_lock(&old_inode_ui->ui_mutex); release = old_inode_ui->dirty; mark_inode_dirty_sync(old_inode); @@ -1455,11 +1499,16 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (release) ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) - err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); + /* + * The rename has finished here. Although the old inode cannot + * be updated on flash, a stale ctime is not a big problem, so + * don't return an error code to userspace. + */ + old_inode->i_sb->s_op->write_inode(old_inode, NULL); fscrypt_free_filename(&old_nm); fscrypt_free_filename(&new_nm); - return err; + return 0; out_cancel: if (unlink) { @@ -1480,11 +1529,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, inc_nlink(old_dir); } } + unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); if (whiteout) { - drop_nlink(whiteout); + ubifs_release_budget(c, &wht_req); iput(whiteout); } - unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); out_release: ubifs_release_budget(c, &ino_req); ubifs_release_budget(c, &req); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f4826b6da6828aa1818c42919c8cc0179f208441..df46f2d3ff8bfe9b798c25723bf409b8a91a9a71 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -92,7 +92,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, dump: ubifs_err(c, "bad data node (block %u, inode %lu)", block, inode->i_ino); - ubifs_dump_node(c, dn); + ubifs_dump_node(c, dn, UBIFS_MAX_DATA_NODE_SZ); return -EINVAL; } @@ -570,7 +570,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, } if (!PagePrivate(page)) { - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } @@ -947,7 +947,7 @@ static int do_writepage(struct page *page, int len) release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); kunmap(page); @@ -1303,7 +1303,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset, release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); } @@ -1470,8 +1470,8 @@ static int
ubifs_migrate_page(struct address_space *mapping, return rc; if (PagePrivate(page)) { - ClearPagePrivate(page); - SetPagePrivate(newpage); + detach_page_private(page); + attach_page_private(newpage, (void *)1); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -1495,7 +1495,7 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) return 0; ubifs_assert(c, PagePrivate(page)); ubifs_assert(c, 0); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); return 1; } @@ -1566,7 +1566,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) else { if (!PageChecked(page)) ubifs_convert_page_budget(c); - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index dc3e26e9ed7b2b905f97f46d7f2b68f542ba8024..e43c8161423dd19308614f00948909d100a3b5c3 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -692,6 +692,9 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) for (i = 0; ; i++) { int space_before, space_after; + /* Reset lp.lnum: we may 'continue' after finding an LEB or break before finding one */ + lp.lnum = -1; + cond_resched(); /* Give the commit an opportunity to run */ @@ -753,8 +756,16 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) * caller instead of the original '-EAGAIN'. */ err = ubifs_return_leb(c, lp.lnum); - if (err) + if (err) { ret = err; + /* The LEB may stay "taken" forever, so + * switch ubifs to read-only mode. Syncing + * the wbuf will then return -EROFS and we + * go to "out". + */ + ubifs_ro_mode(c, ret); + } + /* Avoid returning the LEB a second time on the way out */ + lp.lnum = -1; break; } goto out; @@ -843,7 +854,8 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) ubifs_wbuf_sync_nolock(wbuf); ubifs_ro_mode(c, ret); mutex_unlock(&wbuf->io_mutex); - ubifs_return_leb(c, lp.lnum); + if (lp.lnum != -1) + ubifs_return_leb(c, lp.lnum); return ret; } diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index eae9cf5a57b05948fb083131eb3a743ac195d109..997450410408aed5f264a2040437cbb17c138351 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -198,6 +198,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum) * ubifs_check_node - check node. * @c: UBIFS file-system description object * @buf: node to check + * @len: node length * @lnum: logical eraseblock number * @offs: offset within the logical eraseblock * @quiet: print no messages @@ -222,10 +223,10 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum) * This function returns zero in case of success and %-EUCLEAN in case of bad * CRC or magic.
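For reference, the common-header validation that ubifs_check_node() performs before anything else reduces to roughly the sketch below. The offsets follow the UBIFS on-media layout as I recall it (the stored CRC covers the node after the 8-byte magic + crc prefix); treat the details as assumptions rather than a quote of the function:

```c
#include <linux/crc32.h>
/* struct ubifs_ch, UBIFS_NODE_MAGIC, UBIFS_CRC32_INIT from ubifs-media.h */

static int check_common_header(const void *buf)
{
	const struct ubifs_ch *ch = buf;
	int node_len = le32_to_cpu(ch->len);
	uint32_t crc;

	if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC)
		return -EUCLEAN;

	/* the stored CRC covers everything after the magic and crc words */
	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
	if (crc != le32_to_cpu(ch->crc))
		return -EUCLEAN;

	return 0;
}
```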
*/ -int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet, int must_chk_crc) +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len, + int lnum, int offs, int quiet, int must_chk_crc) { - int err = -EINVAL, type, node_len, dump_node = 1; + int err = -EINVAL, type, node_len; uint32_t crc, node_crc, magic; const struct ubifs_ch *ch = buf; @@ -278,22 +279,10 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, out_len: if (!quiet) ubifs_err(c, "bad node length %d", node_len); - if (type == UBIFS_DATA_NODE && node_len > UBIFS_DATA_NODE_SZ) - dump_node = 0; out: if (!quiet) { ubifs_err(c, "bad node at LEB %d:%d", lnum, offs); - if (dump_node) { - ubifs_dump_node(c, buf); - } else { - int safe_len = min3(node_len, c->leb_size - offs, - (int)UBIFS_MAX_DATA_NODE_SZ); - pr_err("\tprevent out-of-bounds memory access\n"); - pr_err("\ttruncated data node length %d\n", safe_len); - pr_err("\tcorrupted data node:\n"); - print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1, - buf, safe_len, 0); - } + ubifs_dump_node(c, buf, len); dump_stack(); } return err; @@ -730,7 +719,7 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c) int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) { struct ubifs_info *c = wbuf->c; - int err, written, n, aligned_len = ALIGN(len, 8); + int err, n, written = 0, aligned_len = ALIGN(len, 8); dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, dbg_ntype(((struct ubifs_ch *)buf)->node_type), @@ -797,8 +786,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) goto exit; } - written = 0; - if (wbuf->used) { /* * The node is large enough and does not fit entirely within @@ -846,16 +833,42 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) */ n = aligned_len >> c->max_write_shift; if (n) { - n <<= c->max_write_shift; + int m = n - 1; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, wbuf->offs); - err = ubifs_leb_write(c, wbuf->lnum, buf + written, - wbuf->offs, n); + + if (m) { + /* '(n - 1) << c->max_write_shift < len' is always true. */ + m <<= c->max_write_shift; + err = ubifs_leb_write(c, wbuf->lnum, buf + written, + wbuf->offs, m); + if (err) + goto out; + wbuf->offs += m; + aligned_len -= m; + len -= m; + written += m; + } + + /* + * The non-written len of buf may be less than 'n' because + * parameter 'len' is not 8-byte aligned, so here we read + * min(len, n) bytes from buf.
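The arithmetic of the new split is easier to see with numbers. With a 512-byte max write unit, a 1300-byte node is written as one 512-byte chunk taken directly from the caller's buffer (the '(n - 1) << c->max_write_shift < len' invariant guarantees those bytes exist), plus one padded 512-byte chunk staged through wbuf->buf; the remaining aligned bytes stay in the write-buffer. A standalone check of those numbers:

```c
#include <assert.h>
#include <stdio.h>

#define ALIGN8(x) (((x) + 7) & ~7)

int main(void)
{
	int max_write_shift = 9;		/* 512-byte max write unit */
	int len = 1300;				/* node length */
	int aligned_len = ALIGN8(len);		/* 1304 */
	int n = aligned_len >> max_write_shift;	/* whole units: 2 */
	int m = (n - 1) << max_write_shift;	/* straight from buf: 512 */

	assert(m < len);	/* the invariant quoted above */
	/* one more unit goes through wbuf->buf, padded past 'len' */
	printf("direct=%d staged=%d leftover=%d\n",
	       m, 1 << max_write_shift,
	       aligned_len - m - (1 << max_write_shift));	/* 512 512 280 */
	return 0;
}
```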
+ */ + n = 1 << c->max_write_shift; + memcpy(wbuf->buf, buf + written, min(len, n)); + if (n > len) { + ubifs_assert(c, n - len < 8); + ubifs_pad(c, wbuf->buf + len, n - len); + } + + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n); if (err) goto out; wbuf->offs += n; aligned_len -= n; - len -= n; + len -= min(len, n); written += n; } @@ -899,7 +912,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) out: ubifs_err(c, "cannot write %d bytes to LEB %d:%d, error %d", len, wbuf->lnum, wbuf->offs, err); - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, written + len); dump_stack(); ubifs_dump_leb(c, wbuf->lnum); return err; @@ -942,7 +955,7 @@ int ubifs_write_node_hmac(struct ubifs_info *c, void *buf, int len, int lnum, err = ubifs_leb_write(c, lnum, buf, offs, buf_len); if (err) - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, len); return err; } @@ -1025,7 +1038,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + err = ubifs_check_node(c, buf, len, lnum, offs, 0, 0); if (err) { ubifs_err(c, "expected node type %d", type); return err; @@ -1041,7 +1054,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, out: ubifs_err(c, "bad node at LEB %d:%d", lnum, offs); - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, len); dump_stack(); return -EINVAL; } @@ -1081,7 +1094,7 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + err = ubifs_check_node(c, buf, len, lnum, offs, 0, 0); if (err) { ubifs_errc(c, "expected node type %d", type); return err; @@ -1099,7 +1112,7 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, ubifs_errc(c, "bad node at LEB %d:%d, LEB mapping status %d", lnum, offs, ubi_is_mapped(c->ubi, lnum)); if (!c->probing) { - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, len); dump_stack(); } return -EINVAL; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 4363d85a3fd4010f7b189ce419edecc3a76e0cba..fb0fc0e6be99fb24cd91e1224210a44ca749d2c7 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -107,7 +107,7 @@ static int setflags(struct inode *inode, int flags) struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 7274bd23881b2dc256a769608c224f2eeb8fc91d..72586512e51f1a05029efde00a61a78508d22be0 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1207,9 +1207,9 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, * @sync: non-zero if the write-buffer has to be synchronized * * This function implements the re-name operation which may involve writing up - * to 4 inodes and 2 directory entries. It marks the written inodes as clean - * and returns zero on success. In case of failure, a negative error code is - * returned. + * to 4 inodes (new inode, whiteout inode, old and new parent directory inodes) + * and 2 directory entries. It marks the written inodes as clean and returns + * zero on success. In case of failure, a negative error code is returned.
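Condensed from ubifs_jnl_rename() below, this is how the journal entry is now sized once a whiteout inode may be packed in as well; dlen1/dlen2 are the two dentry node lengths, ilen the optional new inode, wlen the optional whiteout inode, plen one parent inode:

```c
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))	/* as in the kernel */

static int rename_entry_len(int dlen1, int dlen2, int ilen, int wlen,
			    int plen, int move)
{
	int len = ALIGN(dlen1, 8) + ALIGN(dlen2, 8) + ALIGN(ilen, 8) +
		  ALIGN(wlen, 8) + ALIGN(plen, 8);

	/* a cross-directory move journals both parent inodes */
	if (move)
		len += plen;
	return len;
}
```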
int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
		     const struct inode *old_inode,
@@ -1222,14 +1222,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 	void *p;
 	union ubifs_key key;
 	struct ubifs_dent_node *dent, *dent2;
-	int err, dlen1, dlen2, ilen, lnum, offs, len, orphan_added = 0;
+	int err, dlen1, dlen2, ilen, wlen, lnum, offs, len, orphan_added = 0;
 	int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
 	int last_reference = !!(new_inode && new_inode->i_nlink == 0);
 	int move = (old_dir != new_dir);
-	struct ubifs_inode *new_ui;
+	struct ubifs_inode *new_ui, *whiteout_ui;
 	u8 hash_old_dir[UBIFS_HASH_ARR_SZ];
 	u8 hash_new_dir[UBIFS_HASH_ARR_SZ];
 	u8 hash_new_inode[UBIFS_HASH_ARR_SZ];
+	u8 hash_whiteout_inode[UBIFS_HASH_ARR_SZ];
 	u8 hash_dent1[UBIFS_HASH_ARR_SZ];
 	u8 hash_dent2[UBIFS_HASH_ARR_SZ];
@@ -1249,9 +1250,20 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 	} else
 		ilen = 0;

+	if (whiteout) {
+		whiteout_ui = ubifs_inode(whiteout);
+		ubifs_assert(c, mutex_is_locked(&whiteout_ui->ui_mutex));
+		ubifs_assert(c, whiteout->i_nlink == 1);
+		ubifs_assert(c, !whiteout_ui->dirty);
+		wlen = UBIFS_INO_NODE_SZ;
+		wlen += whiteout_ui->data_len;
+	} else
+		wlen = 0;
+
 	aligned_dlen1 = ALIGN(dlen1, 8);
 	aligned_dlen2 = ALIGN(dlen2, 8);
-	len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
+	len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) +
+	      ALIGN(wlen, 8) + ALIGN(plen, 8);
 	if (move)
 		len += plen;
@@ -1313,6 +1325,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 		p += ALIGN(ilen, 8);
 	}

+	if (whiteout) {
+		pack_inode(c, p, whiteout, 0);
+		err = ubifs_node_calc_hash(c, p, hash_whiteout_inode);
+		if (err)
+			goto out_release;
+
+		p += ALIGN(wlen, 8);
+	}
+
 	if (!move) {
 		pack_inode(c, p, old_dir, 1);
 		err = ubifs_node_calc_hash(c, p, hash_old_dir);
@@ -1352,6 +1373,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 		if (new_inode)
 			ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
 						  new_inode->i_ino);
+		if (whiteout)
+			ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
+						  whiteout->i_ino);
 	}
 	release_head(c, BASEHD);
@@ -1368,8 +1392,6 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 		err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, old_nm);
 		if (err)
 			goto out_ro;
-
-		ubifs_delete_orphan(c, whiteout->i_ino);
 	} else {
 		err = ubifs_add_dirt(c, lnum, dlen2);
 		if (err)
@@ -1390,6 +1412,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 		offs += ALIGN(ilen, 8);
 	}

+	if (whiteout) {
+		ino_key_init(c, &key, whiteout->i_ino);
+		err = ubifs_tnc_add(c, &key, lnum, offs, wlen,
+				    hash_whiteout_inode);
+		if (err)
+			goto out_ro;
+		offs += ALIGN(wlen, 8);
+	}
+
 	ino_key_init(c, &key, old_dir->i_ino);
 	err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_old_dir);
 	if (err)
@@ -1410,6 +1441,11 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 		new_ui->synced_i_size = new_ui->ui_size;
 		spin_unlock(&new_ui->ui_lock);
 	}
+	/*
+	 * No need to mark the whiteout inode clean here: whiteouts
+	 * never have a non-zero size, so synced_i_size does not need
+	 * updating for whiteout_ui.
+ */ mark_inode_clean(c, ubifs_inode(old_dir)); if (move) mark_inode_clean(c, ubifs_inode(new_dir)); @@ -1560,7 +1596,8 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); - ubifs_dump_node(c, dn); + ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ - + UBIFS_TRUN_NODE_SZ); goto out_free; } diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 911d0555b9f2b1be56fb05f983940e281b636f18..0df9a3dd0aaa2063dfbb75177b5518a51cbd831e 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -314,7 +314,7 @@ static int validate_master(const struct ubifs_info *c) out: ubifs_err(c, "bad master node at offset %d error %d", c->mst_offs, err); - ubifs_dump_node(c, c->mst_node); + ubifs_dump_node(c, c->mst_node, c->mst_node_alsz); return -EINVAL; } @@ -392,7 +392,7 @@ int ubifs_read_master(struct ubifs_info *c) if (c->leb_cnt < old_leb_cnt || c->leb_cnt < UBIFS_MIN_LEB_CNT) { ubifs_err(c, "bad leb_cnt on master node"); - ubifs_dump_node(c, c->mst_node); + ubifs_dump_node(c, c->mst_node, c->mst_node_alsz); return -EINVAL; } diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 0fb61956146da6efe34315eb97c08d7cf45fae0e..4909321d84cf83ccd84bb0a5396dbbf1a6a24b71 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -646,7 +646,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (snod->type != UBIFS_ORPH_NODE) { ubifs_err(c, "invalid node type %d in orphan area at %d:%d", snod->type, sleb->lnum, snod->offs); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, + c->leb_size - snod->offs); err = -EINVAL; goto out_free; } @@ -674,7 +675,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (!first) { ubifs_err(c, "out of order commit number %llu in orphan node at %d:%d", cmt_no, sleb->lnum, snod->offs); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, + c->leb_size - snod->offs); err = -EINVAL; goto out_free; } diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index f116f7b3f9e5fa5761efc7c1cc375c2dd18ffd21..f0d51dd21c9e13d5ba13e29f3caa703bc84e8b61 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -352,11 +352,11 @@ int ubifs_recover_master_node(struct ubifs_info *c) ubifs_err(c, "failed to recover master node"); if (mst1) { ubifs_err(c, "dumping first master node"); - ubifs_dump_node(c, mst1); + ubifs_dump_node(c, mst1, c->leb_size - ((void *)mst1 - buf1)); } if (mst2) { ubifs_err(c, "dumping second master node"); - ubifs_dump_node(c, mst2); + ubifs_dump_node(c, mst2, c->leb_size - ((void *)mst2 - buf2)); } vfree(buf2); vfree(buf1); @@ -469,7 +469,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, * The area after the common header size is not empty, so the common * header must be intact. Check it. 
*/ - if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) { + if (ubifs_check_node(c, buf, len, lnum, offs, 1, 0) != -EUCLEAN) { dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs); return 0; } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index b2f5563d1489b8e851ee04b950045125d13506fe..6283adeb21e06c5f978c779e294058a64a1eea2b 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -830,7 +830,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) out_dump: ubifs_err(c, "bad node is at LEB %d:%d", lnum, snod->offs); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, c->leb_size - snod->offs); ubifs_scan_destroy(sleb); return -EINVAL; } @@ -1126,7 +1126,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) out_dump: ubifs_err(c, "log error detected while replaying the log at LEB %d:%d", lnum, offs + snod->offs); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, c->leb_size - snod->offs); ubifs_scan_destroy(sleb); return -EINVAL; } diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index c0d3e4008d23460de8f2dfe60f67a6065cc2d4ac..c160f718c28822e2c91f4b3e4bf12e408af64275 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -503,7 +503,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) failed: ubifs_err(c, "bad superblock, error %d", err); - ubifs_dump_node(c, sup); + ubifs_dump_node(c, sup, ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size)); return -EINVAL; } diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index c69cdb5e65bcede44ade118bbb7134009e7d0314..84a9157dcc32ca2b8df1bbe14cd3eb8dd051da36 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -76,7 +76,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, dbg_scan("scanning %s at LEB %d:%d", dbg_ntype(ch->node_type), lnum, offs); - if (ubifs_check_node(c, buf, lnum, offs, quiet, 1)) + if (ubifs_check_node(c, buf, len, lnum, offs, quiet, 1)) return SCANNED_A_CORRUPT_NODE; if (ch->node_type == UBIFS_PAD_NODE) { @@ -90,7 +90,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, if (!quiet) { ubifs_err(c, "bad pad node at LEB %d:%d", lnum, offs); - ubifs_dump_node(c, pad); + ubifs_dump_node(c, pad, len); } return SCANNED_A_BAD_PAD_NODE; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index cfd46753a685656b75eef1cf74b1e2e5da46277c..98cbd9cac2469d08a5ab2e139f8369885b1b8652 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -253,7 +253,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) out_invalid: ubifs_err(c, "inode %lu validation failed, error %d", inode->i_ino, err); - ubifs_dump_node(c, ino); + ubifs_dump_node(c, ino, UBIFS_MAX_INO_NODE_SZ); ubifs_dump_inode(c, inode); err = -EINVAL; out_ino: diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 894f1ab14616e831321e86a1b6677b898e67eb01..414266d4d1ba33256784b5865d8e362e2aa5157d 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -316,7 +316,7 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, err = ubifs_validate_entry(c, dent); if (err) { dump_stack(); - ubifs_dump_node(c, dent); + ubifs_dump_node(c, dent, zbr->len); return err; } @@ -349,7 +349,7 @@ static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, err = ubifs_validate_entry(c, node); if (err) { dump_stack(); - ubifs_dump_node(c, node); + ubifs_dump_node(c, node, zbr->len); return err; } @@ -1699,7 +1699,7 @@ static int validate_data_node(struct ubifs_info *c, void *buf, goto out_err; } - err = 
ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0); + err = ubifs_check_node(c, buf, zbr->len, zbr->lnum, zbr->offs, 0, 0); if (err) { ubifs_err(c, "expected node type %d", UBIFS_DATA_NODE); goto out; @@ -1733,7 +1733,7 @@ static int validate_data_node(struct ubifs_info *c, void *buf, err = -EINVAL; out: ubifs_err(c, "bad node at LEB %d:%d", zbr->lnum, zbr->offs); - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, zbr->len); dump_stack(); return err; } diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index ccaf94ea5be356ca2a56910d8c736facd7db556a..5517a56b4138a5462f186eb3f1db97ccfd2d382b 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -390,7 +390,7 @@ static int read_znode(struct ubifs_info *c, struct ubifs_zbranch *zzbr, out_dump: ubifs_err(c, "bad indexing node at LEB %d:%d, error %d", lnum, offs, err); - ubifs_dump_node(c, idx); + ubifs_dump_node(c, idx, c->max_idx_node_sz); kfree(idx); return -EINVAL; } @@ -489,7 +489,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, zbr->lnum, zbr->offs); dbg_tnck(key, "looked for key "); dbg_tnck(&key1, "but found node's key "); - ubifs_dump_node(c, node); + ubifs_dump_node(c, node, zbr->len); return -EINVAL; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index e7e48f3b179ab3133fb837fbe134fc3363f20c6d..9a4a3191ed07ca0b1ca2f53da66dd37d4e260271 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -372,7 +372,7 @@ struct ubifs_gced_idx_leb { * @ui_mutex exists for two main reasons. At first it prevents inodes from * being written back while UBIFS changing them, being in the middle of an VFS * operation. This way UBIFS makes sure the inode fields are consistent. For - * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and + * example, in 'ubifs_rename()' we change 4 inodes simultaneously, and * write-back must not write any of them before we have finished. * * The second reason is budgeting - UBIFS has to budget all operations. If an @@ -1721,8 +1721,8 @@ int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, int offs); int ubifs_write_node_hmac(struct ubifs_info *c, void *buf, int len, int lnum, int offs, int hmac_offs); -int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet, int must_chk_crc); +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len, + int lnum, int offs, int quiet, int must_chk_crc); void ubifs_init_node(struct ubifs_info *c, void *buf, int len, int pad); void ubifs_crc_node(struct ubifs_info *c, void *buf, int len); void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 92bbc9a72355abcc1c6af6ed6b6842e0382fa770..eee2c8a166015c419d0862786124b7a7591923e0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -399,7 +399,13 @@ enum { BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 5, + /* Do not allow an I/O scheduler to be configured. */ BLK_MQ_F_NO_SCHED = 1 << 6, + /* + * Select 'none' during queue registration in case of a single hwq + * or shared hwqs instead of 'mq-deadline'. 
+ */ + BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 7, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8aae375864b6bc46cf8677db6b216f28b2542bff..31febd3e14de71181ee6270877f0d6d0b5627ba3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,6 +571,8 @@ struct request_queue { */ struct mutex mq_freeze_lock; + int quiesce_depth; + struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; struct bio_set bio_split; @@ -593,6 +595,7 @@ struct request_queue { /* Keep blk_queue_flag_name[] in sync with the definitions below */ #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ +#define QUEUE_FLAG_THROTL_INIT_DONE 2 /* io throttle can be online */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ @@ -948,10 +951,10 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns extern int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); -extern void blk_execute_rq(struct request_queue *, struct gendisk *, - struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); +blk_status_t blk_execute_rq(struct request_queue *, struct gendisk *, + struct request *, int); /* Helper to convert REQ_OP_XXX to its string format XXX */ extern const char *blk_op_str(unsigned int op); diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 91b9669785418d4a36d2c196d06e34605c820cf8..5120ca319ff4e64287cd6405469bf5623405c68a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -27,19 +27,6 @@ struct task_struct; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) -#define BPF_CGROUP_STORAGE_NEST_MAX 8 - -struct bpf_cgroup_storage_info { - struct task_struct *task; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; -}; - -/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks - * to use bpf cgroup storage simultaneously. 
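The per-CPU bpf_cgroup_storage_info table removed here capped nesting at BPF_CGROUP_STORAGE_NEST_MAX tasks per CPU and could fail with -EBUSY. Its replacement hangs a run context off the current task, and helpers recover the enclosing context with container_of(). A standalone model of that pattern (illustrative; current_ctx stands in for current->bpf_ctx)::

  #include <assert.h>
  #include <stddef.h>
  #include <stdio.h>

  #define container_of(ptr, type, member) \
          ((type *)((char *)(ptr) - offsetof(type, member)))

  struct run_ctx { int unused; };          /* base, like bpf_run_ctx */

  struct cg_run_ctx {
          struct run_ctx base;
          int prog_item;                   /* stands in for prog_item */
  };

  static struct run_ctx *current_ctx;      /* like current->bpf_ctx */

  static struct run_ctx *set_run_ctx(struct run_ctx *new_ctx)
  {
          struct run_ctx *old = current_ctx;

          current_ctx = new_ctx;
          return old;
  }

  /* A "helper" finds its caller's context without per-CPU tables. */
  static int helper_read_item(void)
  {
          struct cg_run_ctx *ctx =
                  container_of(current_ctx, struct cg_run_ctx, base);

          return ctx->prog_item;
  }

  int main(void)
  {
          struct cg_run_ctx run_ctx = { .prog_item = 7 };
          struct run_ctx *old = set_run_ctx(&run_ctx.base);

          assert(helper_read_item() == 7);
          set_run_ctx(old);                /* like bpf_reset_run_ctx() */
          printf("ok\n");
          return 0;
  }

Because the context lives on the caller's stack and in the task struct, nesting depth is bounded only by the call chain and the set/unset pair can never fail.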
- */ -DECLARE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -167,44 +154,6 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type( return BPF_CGROUP_STORAGE_SHARED; } -static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage - *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) -{ - enum bpf_cgroup_storage_type stype; - int i, err = 0; - - preempt_disable(); - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, current); - for_each_cgroup_storage_type(stype) - this_cpu_write(bpf_cgroup_storage_info[i].storage[stype], - storage[stype]); - goto out; - } - err = -EBUSY; - WARN_ON_ONCE(1); - -out: - preempt_enable(); - return err; -} - -static inline void bpf_cgroup_storage_unset(void) -{ - int i; - - for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { - if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); - return; - } -} - struct bpf_cgroup_storage * cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, void *key, bool locked); @@ -450,9 +399,6 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } -static inline int bpf_cgroup_storage_set( - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; } -static inline void bpf_cgroup_storage_unset(void) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 474a0d852614f586ec5d628b5714d1c8760fb95a..88245386a4b6f44c4469053c73b10b3dded42c63 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1081,11 +1081,20 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *include_prog, struct bpf_prog_array **new_array); +struct bpf_run_ctx {}; + +struct bpf_cg_run_ctx { + struct bpf_run_ctx run_ctx; + struct bpf_prog_array_item *prog_item; +}; + #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \ ({ \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 _ret = 1; \ migrate_disable(); \ rcu_read_lock(); \ @@ -1093,17 +1102,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, if (unlikely(check_non_null && !_array))\ goto _out; \ _item = &_array->items[0]; \ - while ((_prog = READ_ONCE(_item->prog))) { \ - if (!set_cg_storage) { \ - _ret &= func(_prog, ctx); \ - } else { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ - _ret &= func(_prog, ctx); \ - bpf_cgroup_storage_unset(); \ - } \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\ + while ((_prog = READ_ONCE(_item->prog))) { \ + run_ctx.prog_item = _item; \ + _ret &= func(_prog, ctx); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ _out: \ rcu_read_unlock(); \ migrate_enable(); \ @@ -1137,6 +1142,8 @@ _out: \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 ret; \ u32 _ret = 1; \ u32 _cn = 0; \ @@ -1144,15 +1151,15 @@ _out: \ 
rcu_read_lock(); \ _array = rcu_dereference(array); \ _item = &_array->items[0]; \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); \ while ((_prog = READ_ONCE(_item->prog))) { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ + run_ctx.prog_item = _item; \ ret = func(_prog, ctx); \ - bpf_cgroup_storage_unset(); \ _ret &= (ret & 1); \ _cn |= (ret & 2); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ rcu_read_unlock(); \ migrate_enable(); \ if (_ret) \ @@ -1202,6 +1209,20 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } +static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) +{ + struct bpf_run_ctx *old_ctx; + + old_ctx = current->bpf_ctx; + current->bpf_ctx = new_ctx; + return old_ctx; +} + +static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) +{ + current->bpf_ctx = old_ctx; +} + extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ffec16930b00d7b4741978c2d06650d4899cbfa5..97ce0a12367c6cce36b7f558168682e0b2041161 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -766,107 +766,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains * per-socket cgroup information except for memcg association. * - * On legacy hierarchies, net_prio and net_cls controllers directly set - * attributes on each sock which can then be tested by the network layer. - * On the default hierarchy, each sock is associated with the cgroup it was - * created in and the networking layer can match the cgroup directly. - * - * To avoid carrying all three cgroup related fields separately in sock, - * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer. - * On boot, sock_cgroup_data records the cgroup that the sock was created - * in so that cgroup2 matches can be made; however, once either net_prio or - * net_cls starts being used, the area is overriden to carry prioidx and/or - * classid. The two modes are distinguished by whether the lowest bit is - * set. Clear bit indicates cgroup pointer while set bit prioidx and - * classid. - * - * While userland may start using net_prio or net_cls at any time, once - * either is used, cgroup2 matching no longer works. There is no reason to - * mix the two and this is in line with how legacy and v2 compatibility is - * handled. On mode switch, cgroup references which are already being - * pointed to by socks may be leaked. While this can be remedied by adding - * synchronization around sock_cgroup_data, given that the number of leaked - * cgroups is bound and highly unlikely to be high, this seems to be the - * better trade-off. + * On legacy hierarchies, net_prio and net_cls controllers directly + * set attributes on each sock which can then be tested by the network + * layer. On the default hierarchy, each sock is associated with the + * cgroup it was created in and the networking layer can match the + * cgroup directly. 
*/ struct sock_cgroup_data { - union { -#ifdef __LITTLE_ENDIAN - struct { - u8 is_data : 1; - u8 no_refcnt : 1; - u8 unused : 6; - u8 padding; - u16 prioidx; - u32 classid; - } __packed; -#else - struct { - u32 classid; - u16 prioidx; - u8 padding; - u8 unused : 6; - u8 no_refcnt : 1; - u8 is_data : 1; - } __packed; + struct cgroup *cgroup; /* v2 */ +#ifdef CONFIG_CGROUP_NET_CLASSID + u32 classid; /* v1 */ +#endif +#ifdef CONFIG_CGROUP_NET_PRIO + u16 prioidx; /* v1 */ #endif - u64 val; - }; }; -/* - * There's a theoretical window where the following accessors race with - * updaters and return part of the previous pointer as the prioidx or - * classid. Such races are short-lived and the result isn't critical. - */ static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) { - /* fallback to 1 which is always the ID of the root cgroup */ - return (skcd->is_data & 1) ? skcd->prioidx : 1; +#ifdef CONFIG_CGROUP_NET_PRIO + return READ_ONCE(skcd->prioidx); +#else + return 1; +#endif } static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { - /* fallback to 0 which is the unconfigured default classid */ - return (skcd->is_data & 1) ? skcd->classid : 0; +#ifdef CONFIG_CGROUP_NET_CLASSID + return READ_ONCE(skcd->classid); +#else + return 0; +#endif } -/* - * If invoked concurrently, the updaters may clobber each other. The - * caller is responsible for synchronization. - */ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; - - if (sock_cgroup_prioidx(&skcd_buf) == prioidx) - return; - - if (!(skcd_buf.is_data & 1)) { - skcd_buf.val = 0; - skcd_buf.is_data = 1; - } - - skcd_buf.prioidx = prioidx; - WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +#ifdef CONFIG_CGROUP_NET_PRIO + WRITE_ONCE(skcd->prioidx, prioidx); +#endif } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; - - if (sock_cgroup_classid(&skcd_buf) == classid) - return; - - if (!(skcd_buf.is_data & 1)) { - skcd_buf.val = 0; - skcd_buf.is_data = 1; - } - - skcd_buf.classid = classid; - WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +#ifdef CONFIG_CGROUP_NET_CLASSID + WRITE_ONCE(skcd->classid, classid); +#endif } #else /* CONFIG_SOCK_CGROUP_DATA */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 618838c48313cd82a9ce917378a5508396b0dcee..9c88b7da324597e76d69d7e2630fd9cc66a7ead6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -816,33 +816,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task, */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) -extern spinlock_t cgroup_sk_update_lock; -#endif - -void cgroup_sk_alloc_disable(void); void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - unsigned long v; - - /* - * @skcd->val is 64bit but the following is safe on 32bit too as we - * just need the lower ulong to be written and read atomically. 
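For contrast, a minimal model of the one-word encoding that the new plain-field layout replaces (illustrative only: the kernel packed prioidx/classid into a u64 union with the pointer, not this simplified shift scheme)::

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /* One word: either a pointer (low bit clear) or data (low bit set). */
  struct tagged_word { uintptr_t val; };

  static int is_data(struct tagged_word w)
  {
          return w.val & 1;
  }

  static void set_prioidx(struct tagged_word *w, uint16_t prioidx)
  {
          /* overwrites the pointer: cgroup2 matching stops working */
          w->val = ((uintptr_t)prioidx << 1) | 1;
  }

  static uint16_t get_prioidx(struct tagged_word w)
  {
          /* fall back to 1, the ID of the root cgroup */
          return is_data(w) ? (uint16_t)(w.val >> 1) : 1;
  }

  int main(void)
  {
          struct tagged_word w = { .val = 0 };

          assert(get_prioidx(w) == 1);  /* pointer mode */
          set_prioidx(&w, 42);
          assert(get_prioidx(w) == 42); /* data mode */
          printf("ok\n");
          return 0;
  }

Keeping separate fields, as the patch does, trades a little space for accessors that are plain READ_ONCE/WRITE_ONCE with no mode switch.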
- */ - v = READ_ONCE(skcd->val); - - if (v & 3) - return &cgrp_dfl_root.cgrp; - - return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; -#else - return (struct cgroup *)(unsigned long)skcd->val; -#endif + return skcd->cgroup; } #else /* CONFIG_CGROUP_DATA */ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index bc26b4e11f62f1e0618b0a12d50825514f7ec81f..0bb7489e0cfb902fca42750102b9ed98916ace28 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -117,7 +117,8 @@ extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, enum elv_merge); -extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); +extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, + struct list_head *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); diff --git a/include/linux/fs.h b/include/linux/fs.h index c4e366a3226e20b2983063e4b9a5a8bc62447a51..3de2e6c714aaef6e721d0319dc0325da4075d516 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2527,8 +2528,14 @@ extern long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); -extern struct file *file_open_root(struct dentry *, struct vfsmount *, +extern struct file *file_open_root(const struct path *, const char *, int, umode_t); +static inline struct file *file_open_root_mnt(struct vfsmount *mnt, + const char *name, int flags, umode_t mode) +{ + return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root}, + name, flags, mode); +} extern struct file * dentry_open(const struct path *, int, const struct cred *); extern struct file * open_with_fake_path(const struct path *, int, struct inode*, const struct cred *); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 5b44b0195a28a07a0990748aedd591b2fb86e352..6b54982fc5f378ec704f56def2fdb299e7d8b42d 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -139,6 +139,8 @@ extern int vfs_parse_fs_string(struct fs_context *fc, const char *key, extern int generic_parse_monolithic(struct fs_context *fc, void *data); extern int vfs_get_tree(struct fs_context *fc); extern void put_fs_context(struct fs_context *fc); +extern int vfs_parse_fs_param_source(struct fs_context *fc, + struct fs_parameter *param); extern void fc_drop_locked(struct fs_context *fc); /* diff --git a/include/linux/namei.h b/include/linux/namei.h index a4bb992623c411e95266ecfd5e9aac5a54275f1b..ca94eb5d2b16676fc999e649357c569bdfe3cb13 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -36,9 +36,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* internal use only */ #define LOOKUP_PARENT 0x0010 -#define LOOKUP_JUMPED 0x1000 -#define LOOKUP_ROOT 0x2000 -#define LOOKUP_ROOT_GRABBED 0x0008 /* Scoping flags for lookup. */ #define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. 
*/ diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index cfce186f0c4e0ecd8be7c211e8a3fbb95c963242..aff81ba31bd8bdbe845cb2852aebffdea66c1bbb 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -44,8 +44,12 @@ static inline void page_ext_init_flatmem(void) { } extern void page_ext_init(void); +static inline void page_ext_init_flatmem_late(void) +{ +} #else extern void page_ext_init_flatmem(void); +extern void page_ext_init_flatmem_late(void); static inline void page_ext_init(void) { } @@ -76,6 +80,10 @@ static inline void page_ext_init(void) { } +static inline void page_ext_init_flatmem_late(void) +{ +} + static inline void page_ext_init_flatmem(void) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index b85b26d9ccefeda08bb8be7d5d2f54f89d66741b..53198ac3d154ac270a19b5af9050bbda65a1f572 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -40,6 +40,7 @@ struct audit_context; struct backing_dev_info; struct bio_list; struct blk_plug; +struct bpf_run_ctx; struct capture_control; struct cfs_rq; struct fs_struct; @@ -1340,6 +1341,10 @@ struct task_struct { /* Used by LSM modules for access restriction: */ void *security; #endif +#ifdef CONFIG_BPF_SYSCALL + /* Used for BPF run context */ + struct bpf_run_ctx *bpf_ctx; +#endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; diff --git a/include/linux/topology.h b/include/linux/topology.h index ad03df1cc2667d090361ca21aa12d827314884f1..7634cd737061c85c567e7254ca437f0787750b60 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -48,6 +48,7 @@ int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 +#define DISTANCE_BITS 8 #ifndef node_distance #define node_distance(from,to) ((from) == (to) ? 
LOCAL_DISTANCE : REMOTE_DISTANCE) #endif diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 1a5c9a3df6d69ba59c50270470f6ec3cbeb05152..2b8c3df03f1394ebfcaa2cd85c9b58a27fc05c3b 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -204,6 +204,7 @@ struct scsi_device { unsigned unmap_limit_for_ws:1; /* Use the UNMAP limit for WRITE SAME */ unsigned rpm_autosuspend:1; /* Enable runtime autosuspend at device * creation time */ + unsigned queue_stopped:1; /* request queue is quiesced */ bool offline_already; /* Device offline message logged */ diff --git a/init/main.c b/init/main.c index dd26a42e80a87cf613982e6cf70a1293440d8529..63cdf49f7f82b13611a91ce4db1d8a59651a8c8c 100644 --- a/init/main.c +++ b/init/main.c @@ -828,6 +828,8 @@ static void __init mm_init(void) init_debug_pagealloc(); report_meminit(); mem_init(); + /* page_owner must be initialized after buddy is ready */ + page_ext_init_flatmem_late(); kmem_cache_init(); kmemleak_init(); pgtable_init(); diff --git a/ipc/namespace.c b/ipc/namespace.c index 24e7b45320f724be853fe15be748229f7732beb3..c94c05846141cc33fb62f909d85aa5ce78b43b31 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -42,7 +42,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL); + ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT); if (ns == NULL) goto fail_dec; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0efe7c7bfe5e913d2fde22b3eafd7245699488f1..bb4350de9f110e51bda6c97ad6c4df69869d07b0 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -372,8 +372,6 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { }; #ifdef CONFIG_CGROUP_BPF -DECLARE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { @@ -382,17 +380,13 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) * verifier checks that its value is correct. 
*/ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage = NULL; + struct bpf_cgroup_storage *storage; + struct bpf_cg_run_ctx *ctx; void *ptr; - int i; - for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { - if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); - break; - } + /* get current cgroup storage from BPF run context */ + ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + storage = ctx->prog_item->cgroup_storage[stype]; if (stype == BPF_CGROUP_STORAGE_SHARED) ptr = &READ_ONCE(storage->buf)->data[0]; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index b139247d2dd33daaa375b1341f93064ed57fac59..49ca8771d4701cb2bd4530de304d63a8b45afd92 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -11,9 +11,6 @@ #ifdef CONFIG_CGROUP_BPF -DEFINE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #include "../cgroup/cgroup-internal.h" #define LOCAL_STORAGE_CREATE_FLAG_MASK \ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 812dd4ed01293fb3d58e1566b18006b993656ec7..88b72815aa9d0ffd512535bc676b09ab4de8e18d 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -915,15 +915,11 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { - if (strcmp(param->key, "source") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "Non-string source"); - if (fc->source) - return invalf(fc, "Multiple sources not supported"); - fc->source = param->string; - param->string = NULL; - return 0; - } + int ret; + + ret = vfs_parse_fs_param_source(fc, param); + if (ret != -ENOPARAM) + return ret; for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name)) continue; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 017bfc698109208d2aa555583dc5fe76c9ff1308..350297ad63f162ede07a0828d8cb641f7144af9c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6439,74 +6439,51 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - -DEFINE_SPINLOCK(cgroup_sk_update_lock); -static bool cgroup_sk_alloc_disabled __read_mostly; - -void cgroup_sk_alloc_disable(void) -{ - if (cgroup_sk_alloc_disabled) - return; - pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); - cgroup_sk_alloc_disabled = true; -} - -#else - -#define cgroup_sk_alloc_disabled false - -#endif - void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) { - skcd->no_refcnt = 1; - return; - } - - /* Don't associate the sock with unrelated interrupted task's cgroup. */ - if (in_interrupt()) - return; + struct cgroup *cgroup; rcu_read_lock(); + /* Don't associate the sock with unrelated interrupted task's cgroup. 
*/ + if (in_interrupt()) { + cgroup = &cgrp_dfl_root.cgrp; + cgroup_get(cgroup); + goto out; + } while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->val = (unsigned long)cset->dfl_cgrp; - cgroup_bpf_get(cset->dfl_cgrp); + cgroup = cset->dfl_cgrp; break; } cpu_relax(); } - +out: + skcd->cgroup = cgroup; + cgroup_bpf_get(cgroup); rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { - if (skcd->val) { - if (skcd->no_refcnt) - return; - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); - } + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(cgrp); + cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); - if (skcd->no_refcnt) - return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 812a61afd538a180d64b3ec35dbf3d978c7a059a..12c5110466bc0774f62b691e2399ca784d0a20d9 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) struct cgroup_namespace *new_ns; int ret; - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_alloc_inum(&new_ns->ns); diff --git a/kernel/fork.c b/kernel/fork.c index f019852b2e611150438d06e7fde23b1345c97619..39b1783a7613e2bdc67c8201af1f91d8c401b2fc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2072,6 +2072,9 @@ static __latent_entropy struct task_struct *copy_process( p->sequential_io = 0; p->sequential_io_avg = 0; #endif +#ifdef CONFIG_BPF_SYSCALL + p->bpf_ctx = NULL; +#endif /* Perform scheduler related setup. Assign this task to a CPU. 
*/ retval = sched_fork(clone_flags, p); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 12dd41b39a7ffe0ae01b7ad3ec58c30c075ef551..3d5a5faf91b51ba42f1f44b55a6f977e339ef494 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -573,6 +573,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) int __init nsproxy_cache_init(void) { - nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); + nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT); return 0; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ef8733e2a476e44552cd2f817c1c1de1a34a3838..52c017feabcb3343ce96370868c489818b12990c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -457,7 +457,7 @@ const struct proc_ns_operations pidns_for_children_operations = { static __init int pid_namespaces_init(void) { - pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 72e33054a2e1b72d7b9c529d59105a51ea9dc9e6..c9126606fa6f4eddbdd8ace5cc345f027b41c72a 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1521,9 +1521,10 @@ int swsusp_read(unsigned int *flags_p) int swsusp_check(void) { int error; + void *holder; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - FMODE_READ, NULL); + FMODE_READ | FMODE_EXCL, &holder); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); @@ -1545,7 +1546,7 @@ int swsusp_check(void) put: if (error) - blkdev_put(hib_resume_bdev, FMODE_READ); + blkdev_put(hib_resume_bdev, FMODE_READ | FMODE_EXCL); else pr_debug("Image signature found, resuming\n"); } else { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ad9bec90fb7a8b042b8695afaafdd9adfa21a270..dae1e8eaa98329177da6affd4717042bd0826d54 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -944,6 +944,18 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } +static inline void try_start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + raw_spin_lock(&rt_b->rt_runtime_lock); + if (!rt_b->rt_period_active) { + rt_b->rt_period_active = 1; + hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + hrtimer_start_expires(&rt_b->rt_period_timer, + HRTIMER_MODE_ABS_PINNED_HARD); + } + raw_spin_unlock(&rt_b->rt_runtime_lock); +} + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -1020,13 +1032,17 @@ static void update_curr_rt(struct rq *rq) for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + int exceeded; if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) + exceeded = sched_rt_runtime_exceeded(rt_rq); + if (exceeded) resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (exceeded) + try_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } } @@ -2721,8 +2737,10 @@ static int sched_rt_global_validate(void) static void sched_rt_do_global(void) { + raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock); def_rt_bandwidth.rt_runtime = global_rt_runtime(); def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); + raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock); } int sched_rt_handler(struct ctl_table *table, int write, void *buffer, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 
dd777022608697d23d0163dc1d44162f59619ae8..004e9505f7ad6e2f7eb8d7e688831c57772784a6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -674,6 +674,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; + int numa_distance = 0; /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { @@ -705,6 +706,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } + for (tmp = sd; tmp; tmp = tmp->parent) + numa_distance += !!(tmp->flags & SD_NUMA); + sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); @@ -935,6 +939,31 @@ static void init_overlap_sched_group(struct sched_domain *sd, sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; } +static struct sched_domain * +find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling) +{ + /* + * The proper descendant would be the one whose child won't span out + * of sd + */ + while (sibling->child && + !cpumask_subset(sched_domain_span(sibling->child), + sched_domain_span(sd))) + sibling = sibling->child; + + /* + * As we are referencing sgc across different topology level, we need + * to go down to skip those sched_domains which don't contribute to + * scheduling because they will be degenerated in cpu_attach_domain + */ + while (sibling->child && + cpumask_equal(sched_domain_span(sibling->child), + sched_domain_span(sibling))) + sibling = sibling->child; + + return sibling; +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -968,6 +997,41 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; + /* + * Usually we build sched_group by sibling's child sched_domain + * But for machines whose NUMA diameter are 3 or above, we move + * to build sched_group by sibling's proper descendant's child + * domain because sibling's child sched_domain will span out of + * the sched_domain being built as below. + * + * Smallest diameter=3 topology is: + * + * node 0 1 2 3 + * 0: 10 20 30 40 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 40 30 20 10 + * + * 0 --- 1 --- 2 --- 3 + * + * NUMA-3 0-3 N/A N/A 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-2 0-2 0-3 0-3 1-3 + * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} + * + * NUMA-1 0-1 0-2 1-3 2-3 + * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} + * + * NUMA-0 0 1 2 3 + * + * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the + * group span isn't a subset of the domain span. + */ + if (sibling->child && + !cpumask_subset(sched_domain_span(sibling->child), span)) + sibling = find_descended_sibling(sd, sibling); + sg = build_group_from_child_sched_domain(sibling, cpu); if (!sg) goto fail; @@ -975,7 +1039,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) sg_span = sched_group_span(sg); cpumask_or(covered, covered, sg_span); - init_overlap_sched_group(sd, sg); + init_overlap_sched_group(sibling, sg); if (!first) first = sg; @@ -1549,66 +1613,57 @@ static void init_numa_topology_type(void) } } +#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) + void sched_init_numa(void) { - int next_distance, curr_distance = node_distance(0, 0); struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - - sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); - if (!sched_domains_numa_distance) - return; - - /* Includes NUMA identity node at level 0. 
*/
-	sched_domains_numa_distance[level++] = curr_distance;
-	sched_domains_numa_levels = level;
+	unsigned long *distance_map;
+	int nr_levels = 0;
+	int i, j;

 	/*
 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
 	 * unique distances in the node_distance() table.
-	 *
-	 * Assumes node_distance(0,j) includes all distances in
-	 * node_distance(i,j) in order to avoid cubic time.
 	 */
-	next_distance = curr_distance;
+	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+	if (!distance_map)
+		return;
+
+	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
 	for (i = 0; i < nr_node_ids; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
-			for (k = 0; k < nr_node_ids; k++) {
-				int distance = node_distance(i, k);
-
-				if (distance > curr_distance &&
-				    (distance < next_distance ||
-				     next_distance == curr_distance))
-					next_distance = distance;
+			int distance = node_distance(i, j);

-				/*
-				 * While not a strong assumption it would be nice to know
-				 * about cases where if node A is connected to B, B is not
-				 * equally connected to A.
-				 */
-				if (sched_debug() && node_distance(k, i) != distance)
-					sched_numa_warn("Node-distance not symmetric");
-
-				if (sched_debug() && i && !find_numa_distance(distance))
-					sched_numa_warn("Node-0 not representative");
+			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+				sched_numa_warn("Invalid distance value range");
+				return;
 			}
-			if (next_distance != curr_distance) {
-				sched_domains_numa_distance[level++] = next_distance;
-				sched_domains_numa_levels = level;
-				curr_distance = next_distance;
-			} else break;
+
+			bitmap_set(distance_map, distance, 1);
 		}
+	}
+	/*
+	 * We can now figure out how many unique distance values there are and
+	 * allocate memory accordingly.
+	 */
+	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);

-		/*
-		 * In case of sched_debug() we verify the above assumption.
-		 */
-		if (!sched_debug())
-			break;
+	sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+	if (!sched_domains_numa_distance) {
+		bitmap_free(distance_map);
+		return;
 	}

+	for (i = 0, j = 0; i < nr_levels; i++, j++) {
+		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+		sched_domains_numa_distance[i] = j;
+	}
+
+	bitmap_free(distance_map);
+
 	/*
-	 * 'level' contains the number of unique distances
+	 * 'nr_levels' contains the number of unique distances
 	 *
 	 * The sched_domains_numa_distance[] array includes the actual distance
 	 * numbers.
@@ -1617,15 +1672,15 @@ void sched_init_numa(void)
 	/*
 	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
 	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
-	 * the array will contain less then 'level' members. This could be
+	 * the array will contain less than 'nr_levels' members. This could be
 	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
 	 * in other functions.
 	 *
-	 * We reset it to 'level' at the end of this function.
+	 * We reset it to 'nr_levels' at the end of this function.
 	 */
 	sched_domains_numa_levels = 0;

-	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
 	if (!sched_domains_numa_masks)
 		return;

 	/*
 	 * Now for each level, construct a mask per node which contains all
 	 * CPUs of nodes that are that many hops away from us.
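A standalone model of the new deduplication pass (illustrative: a byte array stands in for bitmap_alloc()/bitmap_set()/find_next_bit(), and the distance table is the diameter-3 example quoted earlier in this series)::

  #include <stdio.h>

  #define NR_DISTANCE_VALUES 256  /* 1 << DISTANCE_BITS */

  int main(void)
  {
          /* hypothetical 4-node table: 0 --- 1 --- 2 --- 3 */
          int dist[4][4] = {
                  { 10, 20, 30, 40 },
                  { 20, 10, 20, 30 },
                  { 30, 20, 10, 20 },
                  { 40, 30, 20, 10 },
          };
          unsigned char map[NR_DISTANCE_VALUES] = { 0 };
          int nr_levels = 0;

          for (int i = 0; i < 4; i++)
                  for (int j = 0; j < 4; j++)
                          map[dist[i][j]] = 1;    /* bitmap_set() */

          for (int d = 0; d < NR_DISTANCE_VALUES; d++)
                  if (map[d])                     /* find_next_bit() */
                          printf("level %d: distance %d\n",
                                 nr_levels++, d);
          /* prints 4 levels: 10, 20, 30, 40 */
          return 0;
  }

Scanning the whole table is O(nr_nodes^2) and, unlike the removed selection sort, does not assume that node 0's row contains every distance in the system.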
*/ - for (i = 0; i < level; i++) { + for (i = 0; i < nr_levels; i++) { sched_domains_numa_masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); if (!sched_domains_numa_masks[i]) @@ -1641,12 +1696,17 @@ void sched_init_numa(void) for (j = 0; j < nr_node_ids; j++) { struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + int k; + if (!mask) return; sched_domains_numa_masks[i][j] = mask; for_each_node(k) { + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + sched_numa_warn("Node-distance not symmetric"); + if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -1658,7 +1718,7 @@ void sched_init_numa(void) /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + level + 1) * + tl = kzalloc((i + nr_levels + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -1681,7 +1741,7 @@ void sched_init_numa(void) /* * .. and append 'j' levels of NUMA goodness. */ - for (j = 1; j < level; i++, j++) { + for (j = 1; j < nr_levels; i++, j++) { tl[i] = (struct sched_domain_topology_level){ .mask = sd_numa_mask, .sd_flags = cpu_numa_flags, @@ -1693,8 +1753,8 @@ void sched_init_numa(void) sched_domain_topology = tl; - sched_domains_numa_levels = level; - sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + sched_domains_numa_levels = nr_levels; + sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; init_numa_topology_type(); } diff --git a/kernel/signal.c b/kernel/signal.c index ef8f2a28d37c525b0994701448c026469a533259..b125bc8961380dd364dc09daccbf944d8aa94f69 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4595,7 +4595,7 @@ void __init signals_init(void) { siginfo_buildtime_checks(); - sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT); } #ifdef CONFIG_KGDB_KDB diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index afc65e6be33e4ce79e899e11db2739f23fe8224d..00c20f7fdc02b96359dafd8be85d05ae5a98e1ce 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -88,13 +88,13 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL); + ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; kref_init(&ns->kref); - ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd5697d7347b1daaa0e843dd1e23abc8c547ddfc..7363f81dc31add5f2ffa83f2c8f0008a91935880 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -273,8 +273,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) static __init int init_posix_timers(void) { posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); + sizeof(struct k_itimer), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } __initcall(init_posix_timers); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ce396ea4de60891b4040da0c04e0facf96b81332..2c15bf6680c336c9a9a621601306852d62d17515 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1378,7 +1378,7 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { - user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); + user_ns_cachep = 
KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init); diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index bb7bb3b478abfa5c6dbb81466125858db1f90d5b..9dae1f6487136a3b9132c6f18c5c2aba2a028fc4 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -26,7 +26,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na if (IS_ERR(mnt)) return mnt; - file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700); + file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700); if (IS_ERR(file)) { mntput(mnt); return ERR_CAST(file); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 17679e80fbc400b52a64cfb37427dbf30ef44b61..4ecf9a0622dfcd090bb3d82c106a49d6ccd36cc6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1353,9 +1353,26 @@ static void update_and_free_page(struct hstate *h, struct page *page) } VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + /* + * Very subtle + * + * For non-gigantic pages set the destructor to the normal compound + * page dtor. This is needed in case someone takes an additional + * temporary ref to the page, and freeing is delayed until they drop + * their reference. + * + * For gigantic pages set the destructor to the null dtor. This + * destructor will never be called. Before freeing the gigantic + * page destroy_compound_gigantic_page will turn the compound page + * into a simple group of pages. After this the destructor does not + * apply. + * + * This handles the case where more than one ref is held when and + * after update_and_free_page is called. + */ set_page_refcounted(page); if (hstate_is_gigantic(h)) { + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); /* * Temporarily drop the hugetlb_lock, because * we might block in free_gigantic_page(). 
@@ -1365,6 +1382,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) free_gigantic_page(page, huge_page_order(h)); spin_lock(&hugetlb_lock); } else { + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); __free_pages(page, huge_page_order(h)); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00156aebbcadb43ccd9cf0bd0005ab1587ffcc3f..167169b3907d7153ee7e9b159ea8f6dbeaa670e5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4184,7 +4184,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - if (val > 100) + if (val > 200) return -EINVAL; if (css->parent) diff --git a/mm/page_ext.c b/mm/page_ext.c index a3616f7a0e9e923c2add3a8459c8b2a82da524f7..16b161f28a31d974bc7651c5ae4a0af9ee6b9637 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -99,12 +99,19 @@ static void __init invoke_init_callbacks(void) } } +#ifndef CONFIG_SPARSEMEM +void __init page_ext_init_flatmem_late(void) +{ + invoke_init_callbacks(); +} +#endif + static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * index; } -#if !defined(CONFIG_SPARSEMEM) +#ifndef CONFIG_SPARSEMEM void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) @@ -177,7 +184,6 @@ void __init page_ext_init_flatmem(void) goto fail; } pr_info("allocated %ld bytes of page_ext\n", total_usage); - invoke_init_callbacks(); return; fail: diff --git a/mm/page_owner.c b/mm/page_owner.c index b735a8eafcdbc605bc7e4e226fcbaf27abbcd4cb..5b93fc85dc739fb85a863208641b435b562e07b1 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "internal.h" @@ -25,6 +26,9 @@ struct page_owner { gfp_t gfp_mask; depot_stack_handle_t handle; depot_stack_handle_t free_handle; + u64 ts_nsec; + u64 free_ts_nsec; + pid_t pid; }; static bool page_owner_enabled = false; @@ -145,6 +149,7 @@ void __reset_page_owner(struct page *page, unsigned int order) struct page_ext *page_ext; depot_stack_handle_t handle = 0; struct page_owner *page_owner; + u64 free_ts_nsec = local_clock(); handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); @@ -155,6 +160,7 @@ void __reset_page_owner(struct page *page, unsigned int order) __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); page_owner = get_page_owner(page_ext); page_owner->free_handle = handle; + page_owner->free_ts_nsec = free_ts_nsec; page_ext = page_ext_next(page_ext); } } @@ -172,6 +178,8 @@ static inline void __set_page_owner_handle(struct page *page, page_owner->order = order; page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; + page_owner->pid = current->pid; + page_owner->ts_nsec = local_clock(); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); @@ -236,6 +244,9 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) new_page_owner->last_migrate_reason = old_page_owner->last_migrate_reason; new_page_owner->handle = old_page_owner->handle; + new_page_owner->pid = old_page_owner->pid; + new_page_owner->ts_nsec = old_page_owner->ts_nsec; + new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -349,9 +360,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg)\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n", page_owner->order, 
-			&page_owner->gfp_mask);
+			&page_owner->gfp_mask, page_owner->pid,
+			page_owner->ts_nsec, page_owner->free_ts_nsec);
 
 	if (ret >= count)
 		goto err;
@@ -427,8 +439,9 @@ void __dump_page_owner(struct page *page)
 	else
 		pr_alert("page_owner tracks the page as freed\n");
 
-	pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
-		 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
+	pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n",
+		 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
+		 page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec);
 
 	handle = READ_ONCE(page_owner->handle);
 	if (!handle) {
diff --git a/mm/page_poison.c b/mm/page_poison.c
index a08423cb11ca9b6beb4375ba44232895abab464d..51f9cb913e0c19f83bee0db2fc233066db8af56d 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -2,6 +2,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/mm.h>
+#include <linux/mmdebug.h>
 #include <linux/highmem.h>
 #include <linux/page_ext.h>
 #include <linux/poison.h>
@@ -73,7 +74,7 @@ static bool single_bit_flip(unsigned char a, unsigned char b)
 	return error && !(error & (error - 1));
 }
 
-static void check_poison_mem(unsigned char *mem, size_t bytes)
+static void check_poison_mem(struct page *page, unsigned char *mem, size_t bytes)
 {
 	static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
 	unsigned char *start;
@@ -101,6 +102,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
 	print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
 			end - start + 1, 1);
 	dump_stack();
+	dump_page(page, "pagealloc: corrupted page details");
 }
 
 static void unpoison_page(struct page *page)
@@ -114,7 +116,7 @@
 	 * that is freed to buddy. Thus no extra check is done to
 	 * see if a page was poisoned.
 	 */
-	check_poison_mem(kasan_reset_tag(addr), PAGE_SIZE);
+	check_poison_mem(page, kasan_reset_tag(addr), PAGE_SIZE);
 	kasan_enable_current();
 	kunmap_atomic(addr);
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 14f84f70c5571879f58336c763d1888a2060415e..cdf549f6f617c43d0beec58e7e663f898ae850b7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -413,8 +413,15 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
 		list_del(&avc->same_vma);
 		anon_vma_chain_free(avc);
 	}
-	if (vma->anon_vma)
+	if (vma->anon_vma) {
 		vma->anon_vma->degree--;
+
+		/*
+		 * vma would still be needed after the unlink, and the anon_vma
+		 * will be set up again at the next page fault.
+		 */
+		vma->anon_vma = NULL;
+	}
 	unlock_anon_vma_root(root);
 
 	/*
diff --git a/mm/slub.c b/mm/slub.c
index 1384dc90683379f266f91b56dd25fb25149eb8db..e6c532123de1ad0cb7e4040651ee1354050608aa 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1346,12 +1346,13 @@ parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
 static int __init setup_slub_debug(char *str)
 {
 	slab_flags_t flags;
+	slab_flags_t global_flags;
 	char *saved_str;
 	char *slab_list;
 	bool global_slub_debug_changed = false;
 	bool slab_list_specified = false;
 
-	slub_debug = DEBUG_DEFAULT_FLAGS;
+	global_flags = DEBUG_DEFAULT_FLAGS;
 	if (*str++ != '=' || !*str)
 		/*
 		 * No options specified. Switch on full debugging.
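The setup_slub_debug() rework, which this hunk starts and the next two hunks finish, routes every update through a local global_flags that is committed to the global slub_debug exactly once, at the out: label; a boot option naming only a slab list therefore no longer clobbers a CONFIG_SLUB_DEBUG_ON default. A minimal user-space sketch of that control flow follows; the helper name toy_setup and the flag values are invented for illustration and are not mm/slub.c itself.

/* Build: cc -Wall toy_slub_debug.c */
#include <stdbool.h>
#include <stdio.h>

static unsigned int slub_debug;		/* global, mirrors mm/slub.c */
#define DEBUG_DEFAULT_FLAGS 0xffu	/* stand-in for the real flag mask */

static void toy_setup(const char *opt, bool has_slab_list, unsigned int parsed)
{
	unsigned int global_flags = DEBUG_DEFAULT_FLAGS;
	bool global_changed = false;

	if (!has_slab_list) {		/* flags given without a slab list */
		global_flags = parsed;
		global_changed = true;
	}
	/* A slab list alone must leave the global setting untouched. */
	if (has_slab_list && !global_changed)
		global_flags = slub_debug;

	slub_debug = global_flags;	/* written exactly once, as at "out:" */
	printf("%s -> slub_debug=%#x\n", opt, slub_debug);
}

int main(void)
{
	slub_debug = DEBUG_DEFAULT_FLAGS;		/* boot-time default */
	toy_setup("slub_debug=,kmalloc-64", true, 0x2);	/* global stays 0xff */
	toy_setup("slub_debug=P", false, 0x1);		/* global becomes 0x1 */
	return 0;
}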
@@ -1363,7 +1364,7 @@ static int __init setup_slub_debug(char *str)
 		str = parse_slub_debug_flags(str, &flags, &slab_list, true);
 
 		if (!slab_list) {
-			slub_debug = flags;
+			global_flags = flags;
 			global_slub_debug_changed = true;
 		} else {
 			slab_list_specified = true;
@@ -1372,16 +1373,18 @@
 
 	/*
 	 * For backwards compatibility, a single list of flags with list of
-	 * slabs means debugging is only enabled for those slabs, so the global
-	 * slub_debug should be 0. We can extended that to multiple lists as
+	 * slabs means debugging is only changed for those slabs, so the global
+	 * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
+	 * on CONFIG_SLUB_DEBUG_ON). We can extend that to multiple lists as
 	 * long as there is no option specifying flags without a slab list.
 	 */
 	if (slab_list_specified) {
 		if (!global_slub_debug_changed)
-			slub_debug = 0;
+			global_flags = slub_debug;
 		slub_debug_string = saved_str;
 	}
 out:
+	slub_debug = global_flags;
 	if (slub_debug != 0 || slub_debug_string)
 		static_branch_enable(&slub_debug_enabled);
 	if ((static_branch_unlikely(&init_on_alloc) ||
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6a6dd347bcb3fbb4c90a1f21dcf3e224b8a5f0e2..9f292132ed88997a45bffb88f3adb4b2f6e54228 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -689,6 +689,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 void drop_slab_node(int nid)
 {
 	unsigned long freed;
+	int shift = 0;
 
 	do {
 		struct mem_cgroup *memcg = NULL;
@@ -701,7 +702,7 @@ void drop_slab_node(int nid)
 		do {
 			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
 		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
-	} while (freed > 10);
+	} while ((freed >> shift++) > 1);
 }
 
 void drop_slab(void)
@@ -4174,11 +4175,13 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 		.may_swap = 1,
 		.reclaim_idx = gfp_zone(gfp_mask),
 	};
+	unsigned long pflags;
 
 	trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
 					   sc.gfp_mask);
 
 	cond_resched();
+	psi_memstall_enter(&pflags);
 	fs_reclaim_acquire(sc.gfp_mask);
 	/*
 	 * We need to be able to allocate from the reserves for RECLAIM_UNMAP
@@ -4203,6 +4206,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	current->flags &= ~PF_SWAPWRITE;
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(sc.gfp_mask);
+	psi_memstall_leave(&pflags);
 
 	trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
 
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 15bbfaf943fd12dea78bf8eda8f024a49005e6ef..8b644113715e9db49effcaa60cfe45572304bb13 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -284,9 +284,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
 	return 0;
 
 out_free_newdev:
-	if (new_dev->reg_state == NETREG_UNINITIALIZED ||
-	    new_dev->reg_state == NETREG_UNREGISTERED)
-		free_netdev(new_dev);
+	free_netdev(new_dev);
 	return err;
 }
 
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index eb684f31fd6988b9aaf8863dadef96fb8736aba1..99712d35e5351e72f5e46caac6ba141a9d16221d 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -19,18 +19,20 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 			u32 *retval, u32 *time, bool xdp)
 {
-	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL };
+	struct bpf_prog_array_item item = {.prog = prog};
+	struct bpf_run_ctx *old_ctx;
+	struct bpf_cg_run_ctx run_ctx;
 	enum bpf_cgroup_storage_type stype;
 	u64 time_start, time_spent = 0;
 	int ret = 0;
 	u32 i;
 
 	for_each_cgroup_storage_type(stype) {
-		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
-		if (IS_ERR(storage[stype])) {
-			storage[stype] = NULL;
+		item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
+		if (IS_ERR(item.cgroup_storage[stype])) {
+			item.cgroup_storage[stype] = NULL;
 			for_each_cgroup_storage_type(stype)
-				bpf_cgroup_storage_free(storage[stype]);
+				bpf_cgroup_storage_free(item.cgroup_storage[stype]);
 			return -ENOMEM;
 		}
 	}
 
@@ -41,18 +43,15 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 	rcu_read_lock();
 	migrate_disable();
 	time_start = ktime_get_ns();
+	old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 	for (i = 0; i < repeat; i++) {
-		ret = bpf_cgroup_storage_set(storage);
-		if (ret)
-			break;
+		run_ctx.prog_item = &item;
 		if (xdp)
 			*retval = bpf_prog_run_xdp(prog, ctx);
 		else
 			*retval = BPF_PROG_RUN(prog, ctx);
 
-		bpf_cgroup_storage_unset();
-
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			break;
 		}
@@ -70,6 +69,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 			time_start = ktime_get_ns();
 		}
 	}
+	bpf_reset_run_ctx(old_ctx);
 	time_spent += ktime_get_ns() - time_start;
 	migrate_enable();
 	rcu_read_unlock();
@@ -78,7 +78,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 	*time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
 
 	for_each_cgroup_storage_type(stype)
-		bpf_cgroup_storage_free(storage[stype]);
+		bpf_cgroup_storage_free(item.cgroup_storage[stype]);
 
 	return ret;
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index ae237a498569fa297f192ef9c08de4474807ff77..b0c3dd268913d2c7f869f5b6f19660c5e9c4260c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10661,6 +10661,17 @@ void free_netdev(struct net_device *dev)
 	struct napi_struct *p, *n;
 
 	might_sleep();
+
+	/* When called immediately after register_netdevice() failed, the unwind
+	 * handling may still be dismantling the device. Handle that case by
+	 * deferring the free.
+	 */
+	if (dev->reg_state == NETREG_UNREGISTERING) {
+		ASSERT_RTNL();
+		dev->needs_free_netdev = true;
+		return;
+	}
+
 	netif_free_tx_queues(dev);
 	netif_free_rx_queues(dev);
 
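The free_netdev() hunk above resolves the register_netdevice()-failure race by deferring the free while the unregistration unwind still owns the device, which is what lets the register_vlan_device() and __rtnl_newlink() error paths in this same series call free_netdev() unconditionally. A toy user-space model of that handshake follows; the names (toy_netdev, toy_unregister_done) are invented for illustration and are not the netdev API.

/* Build: cc -Wall toy_free_netdev.c */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

enum toy_reg_state { TOY_UNINITIALIZED, TOY_UNREGISTERING, TOY_UNREGISTERED };

struct toy_netdev {
	enum toy_reg_state reg_state;
	bool needs_free;
};

static void toy_free_netdev(struct toy_netdev *dev)
{
	/* The unwind of a failed register still owns the device: defer. */
	if (dev->reg_state == TOY_UNREGISTERING) {
		dev->needs_free = true;
		return;
	}
	printf("freeing device\n");
	free(dev);
}

/* The unregister path finishes dismantling, then honours the deferral. */
static void toy_unregister_done(struct toy_netdev *dev)
{
	dev->reg_state = TOY_UNREGISTERED;
	if (dev->needs_free)
		toy_free_netdev(dev);
}

int main(void)
{
	struct toy_netdev *dev = calloc(1, sizeof(*dev));

	if (!dev)
		return 1;
	dev->reg_state = TOY_UNREGISTERING;	/* register just failed */
	toy_free_netdev(dev);			/* caller frees: deferred */
	toy_unregister_done(dev);		/* actual free happens here */
	return 0;
}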
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 41b24cd31562addb9c6fa87cf87a282ceda62836..b6de5ee22391ce991960b59ee5fc1df13f29608b 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -72,11 +72,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
 	struct update_classid_context *ctx = (void *)v;
 	struct socket *sock = sock_from_file(file, &err);
 
-	if (sock) {
-		spin_lock(&cgroup_sk_update_lock);
+	if (sock)
 		sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
-		spin_unlock(&cgroup_sk_update_lock);
-	}
 	if (--ctx->batch == 0) {
 		ctx->batch = UPDATE_CLASSID_BATCH;
 		return n + 1;
@@ -122,8 +119,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
 	struct css_task_iter it;
 	struct task_struct *p;
 
-	cgroup_sk_alloc_disable();
-
 	cs->classid = (u32)value;
 
 	css_task_iter_start(css, 0, &it);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 9bd4cab7d510f234eb83d5e88640474d5d4de072..34c3e49814e26723039f6f7919fff83e8625ff01 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
 	if (!dev)
 		return -ENODEV;
 
-	cgroup_sk_alloc_disable();
-
 	rtnl_lock();
 
 	ret = netprio_set_prio(of_css(of), dev, prio);
@@ -222,12 +220,11 @@ static int update_netprio(const void *v, struct file *file, unsigned n)
 {
 	int err;
 	struct socket *sock = sock_from_file(file, &err);
-	if (sock) {
-		spin_lock(&cgroup_sk_update_lock);
+
+	if (sock)
 		sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
 					(unsigned long)v);
-		spin_unlock(&cgroup_sk_update_lock);
-	}
+
 	return 0;
 }
 
@@ -236,8 +233,6 @@ static void net_prio_attach(struct cgroup_taskset *tset)
 	struct task_struct *p;
 	struct cgroup_subsys_state *css;
 
-	cgroup_sk_alloc_disable();
-
 	cgroup_taskset_for_each(p, css, tset) {
 		void *v = (void *)(unsigned long)css->id;
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 27ffa83ffeb3ca8318efcd05abd7b53184f2f341..e08b506163f5f588c33078f2d9852aa2ec405bef 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3438,26 +3438,15 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	dev->ifindex = ifm->ifi_index;
 
-	if (ops->newlink) {
+	if (ops->newlink)
 		err = ops->newlink(link_net ? : net, dev, tb, data, extack);
-		/* Drivers should call free_netdev() in ->destructor
-		 * and unregister it on failure after registration
-		 * so that device could be finally freed in rtnl_unlock.
-		 */
-		if (err < 0) {
-			/* If device is not registered at all, free it now */
-			if (dev->reg_state == NETREG_UNINITIALIZED ||
-			    dev->reg_state == NETREG_UNREGISTERED)
-				free_netdev(dev);
-			goto out;
-		}
-	} else {
+	else
 		err = register_netdevice(dev);
-		if (err < 0) {
-			free_netdev(dev);
-			goto out;
-		}
+	if (err < 0) {
+		free_netdev(dev);
+		goto out;
 	}
+
 	err = rtnl_configure_link(dev, ifm);
 	if (err < 0)
 		goto out_unregister;
diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
index e29bea3dd4ccd157b50404db9b15f99902e52a89..a5ded568cc169b8c20536ac1e68dca902035744f 100644
--- a/security/integrity/ima/ima_kexec.c
+++ b/security/integrity/ima/ima_kexec.c
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 #include "ima.h"
 
 #ifdef CONFIG_IMA_KEXEC
diff --git a/sound/core/timer.c b/sound/core/timer.c
index c15c8314671b7a5ba2339d7fb22aa5dcab5fa682..963ddd9e7715851b5de3cc75f7e1d8f1e84b4979 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -624,13 +624,13 @@ static int snd_timer_stop1(struct snd_timer_instance *timeri, bool stop)
 	if (!timer)
 		return -EINVAL;
 	spin_lock_irqsave(&timer->lock, flags);
+	list_del_init(&timeri->ack_list);
+	list_del_init(&timeri->active_list);
 	if (!(timeri->flags & (SNDRV_TIMER_IFLG_RUNNING |
 			       SNDRV_TIMER_IFLG_START))) {
 		result = -EBUSY;
 		goto unlock;
 	}
-	list_del_init(&timeri->ack_list);
-	list_del_init(&timeri->active_list);
 	if (timer->card && timer->card->shutdown)
 		goto unlock;
 	if (stop) {
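The snd_timer_stop1() hunk above moves the two list_del_init() calls in front of the RUNNING/START check, so a stop request always unlinks the instance from the timer's ack and active lists, including on the -EBUSY early return that previously left a stale entry behind. A minimal user-space model of the pattern follows; the node and instance types are invented for illustration and are not the ALSA API. The key property is that list_del_init() on an already self-pointing node is a no-op, which is what makes the unconditional unlink safe.

/* Build: cc -Wall toy_timer_stop.c */
#include <stdbool.h>
#include <stdio.h>

struct list_node {
	struct list_node *prev, *next;
};

/* Self-pointing node means "not on any list", as with INIT_LIST_HEAD(). */
static void node_init(struct list_node *n)
{
	n->prev = n->next = n;
}

/* Unlink and re-init, like list_del_init(); safe to call repeatedly. */
static void node_del_init(struct list_node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	node_init(n);
}

static void node_add(struct list_node *n, struct list_node *head)
{
	n->next = head->next;
	n->prev = head;
	head->next->prev = n;
	head->next = n;
}

struct toy_instance {
	struct list_node ack_list;
	struct list_node active_list;
	bool running;
};

/*
 * Before the fix: returning -EBUSY here while the instance was still
 * queued left a dangling list entry behind. After the fix the unlink
 * happens unconditionally, before the state check.
 */
static int toy_stop(struct toy_instance *ti)
{
	node_del_init(&ti->ack_list);
	node_del_init(&ti->active_list);
	if (!ti->running)
		return -16;	/* -EBUSY */
	ti->running = false;
	return 0;
}

int main(void)
{
	struct list_node ack_head;
	struct toy_instance ti = { .running = false };
	int err;

	node_init(&ack_head);
	node_init(&ti.ack_list);
	node_init(&ti.active_list);
	node_add(&ti.ack_list, &ack_head);	/* queued as a pending ack */

	err = toy_stop(&ti);	/* error path, but instance is unlinked */
	printf("stop -> %d, ack list empty: %s\n", err,
	       ack_head.next == &ack_head ? "yes" : "no");
	return 0;
}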