diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8d95f4c66275317099a8eeb06ea95a5ae04cf2fb..3586f15e58e4a6ac51cbf327043c64d59ddf2d51 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -82,7 +82,12 @@ struct bpf_map_ops {
 	/* funcs called by prog_array and perf_event_array map */
 	void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
 				int fd);
-	void (*map_fd_put_ptr)(void *ptr);
+	/* If need_defer is true, the implementation should guarantee that
+	 * the to-be-put element is still alive before the bpf program, which
+	 * may manipulate it, exits.
+	 */
+	KABI_BROKEN_REPLACE(void (*map_fd_put_ptr)(void *ptr),
+			    void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer))
 	int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
 	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
@@ -164,14 +169,21 @@ struct bpf_map {
 	u32 btf_vmlinux_value_type_id;
 	bool bypass_spec_v1;
 	bool frozen; /* write-once; write-protected by freeze_mutex */
-	/* 22 bytes hole */
+	KABI_EXTEND(bool free_after_mult_rcu_gp)
+	/* 17 bytes hole */
 
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
 	 */
 	atomic64_t refcnt ____cacheline_aligned;
 	atomic64_t usercnt;
-	struct work_struct work;
+	/* rcu is used before freeing and work is only used during freeing */
+	KABI_BROKEN_REPLACE(
+	struct work_struct work,
+	union {
+		struct work_struct work;
+		struct rcu_head rcu;
+	})
 	struct mutex freeze_mutex;
 	atomic64_t writecnt;
 };
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index f241bda2679d43d765c08eec6a6287bd6a8e4fa5..b5b8b57dc212072e5e491cc8677f7324697a3098 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -764,11 +764,11 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 	}
 
 	if (old_ptr)
-		map->ops->map_fd_put_ptr(old_ptr);
+		map->ops->map_fd_put_ptr(map, old_ptr, true);
 	return 0;
 }
 
-static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
+static int __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	void *old_ptr;
@@ -787,13 +787,18 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 	}
 
 	if (old_ptr) {
-		map->ops->map_fd_put_ptr(old_ptr);
+		map->ops->map_fd_put_ptr(map, old_ptr, need_defer);
 		return 0;
 	} else {
 		return -ENOENT;
 	}
 }
 
+static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+	return __fd_array_map_delete_elem(map, key, true);
+}
+
 static void *prog_fd_array_get_ptr(struct bpf_map *map,
 				   struct file *map_file, int fd)
 {
@@ -811,8 +816,9 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
 	return prog;
 }
 
-static void prog_fd_array_put_ptr(void *ptr)
+static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
+	/* bpf_prog is freed after one RCU or tasks trace grace period */
 	bpf_prog_put(ptr);
 }
 
@@ -822,13 +828,13 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr)
 }
 
 /* decrement refcnt of all bpf_progs that are stored in this map */
-static void bpf_fd_array_map_clear(struct bpf_map *map)
+static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
 
 	for (i = 0; i < array->map.max_entries; i++)
-		fd_array_map_delete_elem(map, &i);
+		__fd_array_map_delete_elem(map, &i, need_defer);
 }
 
 static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -1007,7 +1013,7 @@ static void prog_array_map_clear_deferred(struct work_struct *work)
 {
 	struct bpf_map *map = container_of(work, struct bpf_array_aux,
 					   work)->map;
-	bpf_fd_array_map_clear(map);
+	bpf_fd_array_map_clear(map, true);
 	bpf_map_put(map);
 }
 
@@ -1139,8 +1145,9 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
 	return ee;
 }
 
-static void perf_event_fd_array_put_ptr(void *ptr)
+static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
+	/* bpf_perf_event is freed after one RCU grace period */
 	bpf_event_entry_free_rcu(ptr);
 }
 
@@ -1158,7 +1165,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
 	for (i = 0; i < array->map.max_entries; i++) {
 		ee = READ_ONCE(array->ptrs[i]);
 		if (ee && ee->map_file == map_file)
-			fd_array_map_delete_elem(map, &i);
+			__fd_array_map_delete_elem(map, &i, true);
 	}
 	rcu_read_unlock();
 }
@@ -1166,7 +1173,7 @@
 static void perf_event_fd_array_map_free(struct bpf_map *map)
 {
 	if (map->map_flags & BPF_F_PRESERVE_ELEMS)
-		bpf_fd_array_map_clear(map);
+		bpf_fd_array_map_clear(map, false);
 	fd_array_map_free(map);
 }
 
@@ -1195,7 +1202,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
 	return cgroup_get_from_fd(fd);
 }
 
-static void cgroup_fd_array_put_ptr(void *ptr)
+static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
 	/* cgroup_put free cgrp after a rcu grace period */
 	cgroup_put(ptr);
@@ -1203,7 +1210,7 @@
 
 static void cgroup_fd_array_free(struct bpf_map *map)
 {
-	bpf_fd_array_map_clear(map);
+	bpf_fd_array_map_clear(map, false);
 	fd_array_map_free(map);
 }
 
@@ -1249,7 +1256,7 @@ static void array_of_map_free(struct bpf_map *map)
 	 * is protected by fdget/fdput.
 	 */
 	bpf_map_meta_free(map->inner_map_meta);
-	bpf_fd_array_map_clear(map);
+	bpf_fd_array_map_clear(map, false);
 	fd_array_map_free(map);
 }
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 0ce445aadfdfb19e3b7032dead3e73a36339c169..ec849731427251a119d5267d2f2efd3e2e5cfd93 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -786,7 +786,7 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
 
 	if (map->ops->map_fd_put_ptr) {
 		ptr = fd_htab_map_get_ptr(map, l);
-		map->ops->map_fd_put_ptr(ptr);
+		map->ops->map_fd_put_ptr(map, ptr, true);
 	}
 }
 
@@ -2023,7 +2023,7 @@ static void fd_htab_map_free(struct bpf_map *map)
 		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
 			void *ptr = fd_htab_map_get_ptr(map, l);
 
-			map->ops->map_fd_put_ptr(ptr);
+			map->ops->map_fd_put_ptr(map, ptr, false);
 		}
 	}
 
@@ -2064,7 +2064,7 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 
 	ret = htab_map_update_elem(map, key, &ptr, map_flags);
 	if (ret)
-		map->ops->map_fd_put_ptr(ptr);
+		map->ops->map_fd_put_ptr(map, ptr, false);
 
 	return ret;
 }
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 39ab0b68cade574acb7a8226566e30aa91307ee1..caa1a17cbae1533cfdd5be89e4baef3edfc549d1 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -100,12 +100,17 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
 	return inner_map;
 }
 
-void bpf_map_fd_put_ptr(void *ptr)
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
-	/* ptr->ops->map_free() has to go through one
-	 * rcu grace period by itself.
+	struct bpf_map *inner_map = ptr;
+
+	/* The inner map may still be used by both non-sleepable and sleepable
+	 * bpf program, so free it after one RCU grace period and one tasks
+	 * trace RCU grace period.
 	 */
-	bpf_map_put(ptr);
+	if (need_defer)
+		WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+	bpf_map_put(inner_map);
 }
 
 u32 bpf_map_fd_sys_lookup_elem(void *ptr)
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index bcb7534afb3c0d073222a20bc7d38538e986ee27..7d61602354de8074126944aeea4bb1d116b939e1 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -13,7 +13,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
 void bpf_map_meta_free(struct bpf_map *map_meta);
 void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
 			 int ufd);
-void bpf_map_fd_put_ptr(void *ptr);
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer);
 u32 bpf_map_fd_sys_lookup_elem(void *ptr);
 
 #endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 419dbc3d060ee1fea6835a59291bf36b9c570d7e..cc20df2c3465fb908a2628ffe96e06d2759cf9c7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -493,6 +493,22 @@ static void bpf_map_put_uref(struct bpf_map *map)
 	}
 }
 
+static void bpf_map_free_in_work(struct bpf_map *map)
+{
+	INIT_WORK(&map->work, bpf_map_free_deferred);
+	schedule_work(&map->work);
+}
+
+static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
+{
+	bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
+}
+
+static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
+{
+	call_rcu(rcu, bpf_map_free_rcu_gp);
+}
+
 /* decrement map refcnt and schedule it for freeing via workqueue
  * (unrelying map implementation ops->map_free() might sleep)
  */
@@ -502,8 +518,11 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
 		/* bpf_map_free_id() must be called first */
 		bpf_map_free_id(map, do_idr_lock);
 		btf_put(map->btf);
-		INIT_WORK(&map->work, bpf_map_free_deferred);
-		schedule_work(&map->work);
+
+		if (READ_ONCE(map->free_after_mult_rcu_gp))
+			call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
+		else
+			bpf_map_free_in_work(map);
 	}
 }
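Taken together, the kernel/bpf/syscall.c hunks chain two grace periods in front of the existing workqueue free. The block below is an illustrative, annotated restatement of the three helpers introduced above; it is not additional code to apply and adds nothing beyond the patch itself. The comments rely on the convention that sleepable BPF programs run under rcu_read_lock_trace() while non-sleepable ones run under rcu_read_lock().

/* Illustrative restatement (not part of the patch) of the deferred free
 * chain added in kernel/bpf/syscall.c, annotated with the ordering it
 * provides.
 */
static void bpf_map_free_in_work(struct bpf_map *map)
{
	/* Workqueue context: ops->map_free() is allowed to sleep here. */
	INIT_WORK(&map->work, bpf_map_free_deferred);
	schedule_work(&map->work);
}

static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
{
	/* Runs after grace period #2 (call_rcu): no rcu_read_lock() user,
	 * i.e. no non-sleepable BPF program, can still see the inner map.
	 */
	bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
}

static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
{
	/* Runs after grace period #1 (call_rcu_tasks_trace): no sleepable
	 * BPF program, protected by rcu_read_lock_trace(), still holds a
	 * pointer obtained from the deleted inner map. Reusing the same
	 * rcu_head for the second callback is safe because it is only
	 * re-queued from inside the first callback.
	 */
	call_rcu(rcu, bpf_map_free_rcu_gp);
}

This ordering is also why struct bpf_map can overlay work and rcu in a union: rcu is only used while the callbacks above are pending, and work only once both grace periods have elapsed, never both at the same time.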