From ef9958ab7e5ebd2818fbecaf62b1930d5af1c2a4 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 29 Oct 2020 00:19:24 -0700 Subject: [PATCH 01/14] bpf: Use separate lockdep class for each hashtab ANBZ: #5320 commit c50eb518e262fa06bd334e6eec172eaf5d7a5bd9 upstream. If a hashtab is accessed in both NMI and non-NMI contexts, it may cause deadlock in bucket->lock. LOCKDEP NMI warning highlighted this issue: ./test_progs -t stacktrace [ 74.828970] [ 74.828971] ================================ [ 74.828972] WARNING: inconsistent lock state [ 74.828973] 5.9.0-rc8+ #275 Not tainted [ 74.828974] -------------------------------- [ 74.828975] inconsistent {INITIAL USE} -> {IN-NMI} usage. [ 74.828976] taskset/1174 [HC2[2]:SC0[0]:HE0:SE1] takes: [ 74.828977] ffffc90000ee96b0 (&htab->buckets[i].raw_lock){....}-{2:2}, at: htab_map_update_elem+0x271/0x5a0 [ 74.828981] {INITIAL USE} state was registered at: [ 74.828982] lock_acquire+0x137/0x510 [ 74.828983] _raw_spin_lock_irqsave+0x43/0x90 [ 74.828984] htab_map_update_elem+0x271/0x5a0 [ 74.828984] 0xffffffffa0040b34 [ 74.828985] trace_call_bpf+0x159/0x310 [ 74.828986] perf_trace_run_bpf_submit+0x5f/0xd0 [ 74.828987] perf_trace_urandom_read+0x1be/0x220 [ 74.828988] urandom_read_nowarn.isra.0+0x26f/0x380 [ 74.828989] vfs_read+0xf8/0x280 [ 74.828989] ksys_read+0xc9/0x160 [ 74.828990] do_syscall_64+0x33/0x40 [ 74.828991] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 74.828992] irq event stamp: 1766 [ 74.828993] hardirqs last enabled at (1765): [] asm_exc_page_fault+0x1e/0x30 [ 74.828994] hardirqs last disabled at (1766): [] irqentry_enter+0x37/0x60 [ 74.828995] softirqs last enabled at (856): [] fpu__clear+0xac/0x120 [ 74.828996] softirqs last disabled at (854): [] fpu__clear+0x20/0x120 [ 74.828997] [ 74.828998] other info that might help us debug this: [ 74.828999] Possible unsafe locking scenario: [ 74.828999] [ 74.829000] CPU0 [ 74.829001] ---- [ 74.829001] lock(&htab->buckets[i].raw_lock); [ 74.829003] [ 74.829004] lock(&htab->buckets[i].raw_lock); [ 74.829006] [ 74.829006] *** DEADLOCK *** [ 74.829007] [ 74.829008] 1 lock held by taskset/1174: [ 74.829008] #0: ffff8883ec3fd020 (&cpuctx_lock){-...}-{2:2}, at: perf_event_task_tick+0x101/0x650 [ 74.829012] [ 74.829013] stack backtrace: [ 74.829014] CPU: 0 PID: 1174 Comm: taskset Not tainted 5.9.0-rc8+ #275 [ 74.829015] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 [ 74.829016] Call Trace: [ 74.829016] [ 74.829017] dump_stack+0x9a/0xd0 [ 74.829018] lock_acquire+0x461/0x510 [ 74.829019] ? lock_release+0x6b0/0x6b0 [ 74.829020] ? stack_map_get_build_id_offset+0x45e/0x800 [ 74.829021] ? htab_map_update_elem+0x271/0x5a0 [ 74.829022] ? rcu_read_lock_held_common+0x1a/0x50 [ 74.829022] ? rcu_read_lock_held+0x5f/0xb0 [ 74.829023] _raw_spin_lock_irqsave+0x43/0x90 [ 74.829024] ? htab_map_update_elem+0x271/0x5a0 [ 74.829025] htab_map_update_elem+0x271/0x5a0 [ 74.829026] bpf_prog_1fd9e30e1438d3c5_oncpu+0x9c/0xe88 [ 74.829027] bpf_overflow_handler+0x127/0x320 [ 74.829028] ? perf_event_text_poke_output+0x4d0/0x4d0 [ 74.829029] ? sched_clock_cpu+0x18/0x130 [ 74.829030] __perf_event_overflow+0xae/0x190 [ 74.829030] handle_pmi_common+0x34c/0x470 [ 74.829031] ? intel_pmu_save_and_restart+0x90/0x90 [ 74.829032] ? lock_acquire+0x3f8/0x510 [ 74.829033] ? lock_release+0x6b0/0x6b0 [ 74.829034] intel_pmu_handle_irq+0x11e/0x240 [ 74.829034] perf_event_nmi_handler+0x40/0x60 [ 74.829035] nmi_handle+0x110/0x360 [ 74.829036] ? 
__intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829037] default_do_nmi+0x6b/0x170 [ 74.829038] exc_nmi+0x106/0x130 [ 74.829038] end_repeat_nmi+0x16/0x55 [ 74.829039] RIP: 0010:__intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829042] Code: 2f 1f 03 48 8d bb b8 0c 00 00 e8 29 09 41 00 48 ... [ 74.829043] RSP: 0000:ffff8880a604fc90 EFLAGS: 00000002 [ 74.829044] RAX: 000000070000000f RBX: ffff8883ec2195a0 RCX: 000000000000038f [ 74.829045] RDX: 0000000000000007 RSI: ffffffff82e72c20 RDI: ffff8883ec21a258 [ 74.829046] RBP: 000000070000000f R08: ffffffff8101b013 R09: fffffbfff0a7982d [ 74.829047] R10: ffffffff853cc167 R11: fffffbfff0a7982c R12: 0000000000000000 [ 74.829049] R13: ffff8883ec3f0af0 R14: ffff8883ec3fd120 R15: ffff8883e9c92098 [ 74.829049] ? intel_pmu_lbr_enable_all+0x43/0x240 [ 74.829050] ? __intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829051] ? __intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829052] [ 74.829053] perf_event_task_tick+0x48d/0x650 [ 74.829054] scheduler_tick+0x129/0x210 [ 74.829054] update_process_times+0x37/0x70 [ 74.829055] tick_sched_handle.isra.0+0x35/0x90 [ 74.829056] tick_sched_timer+0x8f/0xb0 [ 74.829057] __hrtimer_run_queues+0x364/0x7d0 [ 74.829058] ? tick_sched_do_timer+0xa0/0xa0 [ 74.829058] ? enqueue_hrtimer+0x1e0/0x1e0 [ 74.829059] ? recalibrate_cpu_khz+0x10/0x10 [ 74.829060] ? ktime_get_update_offsets_now+0x1a3/0x360 [ 74.829061] hrtimer_interrupt+0x1bb/0x360 [ 74.829062] ? rcu_read_lock_sched_held+0xa1/0xd0 [ 74.829063] __sysvec_apic_timer_interrupt+0xed/0x3d0 [ 74.829064] sysvec_apic_timer_interrupt+0x3f/0xd0 [ 74.829064] ? asm_sysvec_apic_timer_interrupt+0xa/0x20 [ 74.829065] asm_sysvec_apic_timer_interrupt+0x12/0x20 [ 74.829066] RIP: 0033:0x7fba18d579b4 [ 74.829068] Code: 74 54 44 0f b6 4a 04 41 83 e1 0f 41 80 f9 ... [ 74.829069] RSP: 002b:00007ffc9ba69570 EFLAGS: 00000206 [ 74.829071] RAX: 00007fba192084c0 RBX: 00007fba18c24d28 RCX: 00000000000007a4 [ 74.829072] RDX: 00007fba18c30488 RSI: 0000000000000000 RDI: 000000000000037b [ 74.829073] RBP: 00007fba18ca5760 R08: 00007fba18c248fc R09: 00007fba18c94c30 [ 74.829074] R10: 000000000000002f R11: 0000000000073c30 R12: 00007ffc9ba695e0 [ 74.829075] R13: 00000000000003f3 R14: 00007fba18c21ac8 R15: 00000000000058d6 However, such warning should not apply across multiple hashtabs. The system will not deadlock if one hashtab is used in NMI, while another hashtab is used in non-NMI. Use separate lockdep class for each hashtab, so that we don't get this false alert. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201029071925.3103400-2-songliubraving@fb.com [dtcccc: backport because relied by lookup&delete for hashtab.] 
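For illustration only (not part of the patch): a minimal sketch of the per-object lockdep class pattern that this change applies to each hashtab. The struct and function names below are invented for the example; the real change is in the diff that follows.

#include <linux/lockdep.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct obj {
	raw_spinlock_t lock;
	struct lock_class_key lockdep_key;	/* one lockdep class per object */
};

static struct obj *obj_alloc(void)
{
	struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (!o)
		return NULL;
	/* Register a key unique to this instance... */
	lockdep_register_key(&o->lockdep_key);
	raw_spin_lock_init(&o->lock);
	/* ...and attach the lock to it, so two objects never share a class. */
	lockdep_set_class(&o->lock, &o->lockdep_key);
	return o;
}

static void obj_free(struct obj *o)
{
	/* The key must be unregistered before the memory holding it is freed. */
	lockdep_unregister_key(&o->lockdep_key);
	kfree(o);
}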
Signed-off-by: Tianchen Ding --- kernel/bpf/hashtab.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 6c444e815406..3a7223cef363 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -99,6 +99,7 @@ struct bpf_htab { u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; + struct lock_class_key lockdep_key; }; /* each htab element is struct htab_elem + key + value */ @@ -136,12 +137,18 @@ static void htab_init_buckets(struct bpf_htab *htab) { unsigned i; + lockdep_register_key(&htab->lockdep_key); for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - if (htab_use_raw_lock(htab)) + if (htab_use_raw_lock(htab)) { raw_spin_lock_init(&htab->buckets[i].raw_lock); - else + lockdep_set_class(&htab->buckets[i].raw_lock, + &htab->lockdep_key); + } else { spin_lock_init(&htab->buckets[i].lock); + lockdep_set_class(&htab->buckets[i].lock, + &htab->lockdep_key); + } } } @@ -1338,6 +1345,7 @@ static void htab_map_free(struct bpf_map *map) free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); + lockdep_unregister_key(&htab->lockdep_key); kfree(htab); } -- Gitee From 073d979daacc1603f41b3fbf62b404269ee29836 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 29 Oct 2020 00:19:25 -0700 Subject: [PATCH 02/14] bpf: Avoid hashtab deadlock with map_locked ANBZ: #5320 commit 20b6cc34ea74b6a84599c1f8a70f3315b56a1883 upstream. If a hashtab is accessed in both non-NMI and NMI context, the system may deadlock on bucket->lock. Fix this issue with percpu counter map_locked. map_locked rejects concurrent access to the same bucket from the same CPU. To reduce memory overhead, map_locked is not added per bucket. Instead, 8 percpu counters are added to each hashtab. buckets are assigned to these counters based on the lower bits of its hash. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201029071925.3103400-3-songliubraving@fb.com [dtcccc: backport because relied by lookup&delete for hashtab.] 
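The sketch below is a simplified restatement of the locking helper introduced in the diff that follows, shown in isolation so the counter selection and the NMI re-entrancy case are easier to read. The helper name is invented; HASHTAB_MAP_LOCK_MASK is 7 for the 8 counters.

/*
 * With 8 counters, the guard index is hash & 7: a bucket whose hash is
 * 0x2c is guarded by counter 4 (0x2c & 7), together with every other
 * bucket whose hash ends in the same three bits.
 */
static int try_lock_bucket_group(struct bpf_htab *htab, u32 hash)
{
	int __percpu *cnt = htab->map_locked[hash & HASHTAB_MAP_LOCK_MASK];

	migrate_disable();
	if (unlikely(__this_cpu_inc_return(*cnt) != 1)) {
		/*
		 * This CPU is already inside the same lock group, e.g. an NMI
		 * interrupted the first update: back off with -EBUSY instead
		 * of spinning on a lock this context can never see released.
		 */
		__this_cpu_dec(*cnt);
		migrate_enable();
		return -EBUSY;
	}
	return 0;	/* the caller now takes bucket->raw_lock */
}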
Signed-off-by: Tianchen Ding --- kernel/bpf/hashtab.c | 114 +++++++++++++++++++++++++++++++------------ 1 file changed, 82 insertions(+), 32 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3a7223cef363..a61c40dd5976 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -86,6 +86,9 @@ struct bucket { }; }; +#define HASHTAB_MAP_LOCK_COUNT 8 +#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) + struct bpf_htab { struct bpf_map map; struct bucket *buckets; @@ -100,6 +103,7 @@ struct bpf_htab { u32 elem_size; /* size of each element in bytes */ u32 hashrnd; struct lock_class_key lockdep_key; + int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; }; /* each htab element is struct htab_elem + key + value */ @@ -152,26 +156,41 @@ static void htab_init_buckets(struct bpf_htab *htab) } } -static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab, - struct bucket *b) +static inline int htab_lock_bucket(const struct bpf_htab *htab, + struct bucket *b, u32 hash, + unsigned long *pflags) { unsigned long flags; + hash = hash & HASHTAB_MAP_LOCK_MASK; + + migrate_disable(); + if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { + __this_cpu_dec(*(htab->map_locked[hash])); + migrate_enable(); + return -EBUSY; + } + if (htab_use_raw_lock(htab)) raw_spin_lock_irqsave(&b->raw_lock, flags); else spin_lock_irqsave(&b->lock, flags); - return flags; + *pflags = flags; + + return 0; } static inline void htab_unlock_bucket(const struct bpf_htab *htab, - struct bucket *b, + struct bucket *b, u32 hash, unsigned long flags) { + hash = hash & HASHTAB_MAP_LOCK_MASK; if (htab_use_raw_lock(htab)) raw_spin_unlock_irqrestore(&b->raw_lock, flags); else spin_unlock_irqrestore(&b->lock, flags); + __this_cpu_dec(*(htab->map_locked[hash])); + migrate_enable(); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); @@ -429,8 +448,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; + int err, i; u64 cost; - int err; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) @@ -487,6 +506,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab->buckets) goto free_charge; + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { + htab->map_locked[i] = __alloc_percpu_gfp(sizeof(int), + sizeof(int), GFP_USER); + if (!htab->map_locked[i]) + goto free_map_locked; + } + if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else @@ -497,7 +523,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (prealloc) { err = prealloc_init(htab); if (err) - goto free_buckets; + goto free_map_locked; if (!percpu && !lru) { /* lru itself can remove the least used element, so @@ -513,7 +539,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) free_prealloc: prealloc_destroy(htab); -free_buckets: +free_map_locked: + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) + free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); free_charge: bpf_map_charge_finish(&htab->map.memory); @@ -694,12 +722,15 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) struct hlist_nulls_node *n; unsigned long flags; struct bucket *b; + int ret; tgt_l = container_of(node, struct htab_elem, lru_node); b = __select_bucket(htab, tgt_l->hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); + if (ret) + 
return false; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { @@ -707,7 +738,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) break; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, tgt_l->hash, flags); return l == tgt_l; } @@ -1005,7 +1036,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, */ } - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1046,7 +1079,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1084,7 +1117,9 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1103,7 +1138,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (ret) bpf_lru_push_free(&htab->lru, &l_new->lru_node); @@ -1138,7 +1173,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1161,7 +1198,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1201,7 +1238,9 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, return -ENOMEM; } - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1223,7 +1262,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (l_new) bpf_lru_push_free(&htab->lru, &l_new->lru_node); return ret; @@ -1251,7 +1290,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) struct htab_elem *l; unsigned long flags; u32 hash, key_size; - int ret = -ENOENT; + int ret; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); @@ -1261,17 +1300,20 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l = lookup_elem_raw(head, hash, key, key_size); if (l) { hlist_nulls_del_rcu(&l->hash_node); free_htab_elem(htab, l); - ret = 0; + } else { + ret = -ENOENT; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1283,7 +1325,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) struct htab_elem *l; unsigned long flags; u32 hash, key_size; - int ret = -ENOENT; + int ret; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); @@ -1293,16 +1335,18 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) b = 
__select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l = lookup_elem_raw(head, hash, key, key_size); - if (l) { + if (l) hlist_nulls_del_rcu(&l->hash_node); - ret = 0; - } + else + ret = -ENOENT; - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (l) bpf_lru_push_free(&htab->lru, &l->lru_node); return ret; @@ -1328,6 +1372,7 @@ static void delete_all_elements(struct bpf_htab *htab) static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + int i; /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. * bpf_free_used_maps() is called after bpf prog is no longer executing. @@ -1346,6 +1391,8 @@ static void htab_map_free(struct bpf_map *map) free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); lockdep_unregister_key(&htab->lockdep_key); + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) + free_percpu(htab->map_locked[i]); kfree(htab); } @@ -1449,8 +1496,11 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, b = &htab->buckets[batch]; head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). */ - if (locked) - flags = htab_lock_bucket(htab, b); + if (locked) { + ret = htab_lock_bucket(htab, b, batch, &flags); + if (ret) + goto next_batch; + } bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) @@ -1467,7 +1517,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); rcu_read_unlock(); bpf_enable_instrumentation(); goto after_loop; @@ -1478,7 +1528,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); rcu_read_unlock(); bpf_enable_instrumentation(); kvfree(keys); @@ -1531,7 +1581,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, dst_val += value_size; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); locked = false; while (node_to_free) { -- Gitee From d6214caeeb41ca7b0305f515ab5b60f71d6201c2 Mon Sep 17 00:00:00 2001 From: Denis Salopek Date: Tue, 11 May 2021 23:00:04 +0200 Subject: [PATCH 03/14] bpf: Add lookup_and_delete_elem support to hashtab ANBZ: #5320 commit 3e87f192b405960c0fe83e0925bd0dadf4f8cf43 upstream. Extend the existing bpf_map_lookup_and_delete_elem() functionality to hashtab map types, in addition to stacks and queues. Create a new hashtab bpf_map_ops function that does lookup and deletion of the element under the same bucket lock and add the created map_ops to bpf.h. 
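As a usage illustration only (the map fd, key width, and helper name are assumptions, not part of the patch), a userspace caller would consume the new hashtab support through the existing libbpf wrapper like this:

#include <errno.h>
#include <stdio.h>
#include <bpf/bpf.h>

static int pop_entry(int map_fd, __u64 key)
{
	__u64 value;

	/* Lookup and delete happen atomically under the bucket lock. */
	if (bpf_map_lookup_and_delete_elem(map_fd, &key, &value)) {
		if (errno == ENOENT)
			fprintf(stderr, "key %llu not present\n",
				(unsigned long long)key);
		return -errno;
	}
	printf("key %llu held %llu, now deleted\n",
	       (unsigned long long)key, (unsigned long long)value);
	return 0;
}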
Signed-off-by: Denis Salopek Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/4d18480a3e990ffbf14751ddef0325eed3be2966.1620763117.git.denis.salopek@sartura.hr Signed-off-by: Tianchen Ding --- include/linux/bpf.h | 2 + kernel/bpf/hashtab.c | 98 ++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 34 +++++++++++++-- 3 files changed, 130 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5300954833bd..ad875245353f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -64,6 +64,8 @@ struct bpf_map_ops { void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key); int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); + int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key, + void *value, u64 flags); int (*map_lookup_and_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a61c40dd5976..4cb0461e4fe2 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1417,6 +1417,100 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, + void *value, bool is_lru_map, + bool is_percpu, u64 flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_head *head; + unsigned long bflags; + struct htab_elem *l; + u32 hash, key_size; + struct bucket *b; + int ret; + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size, htab->hashrnd); + b = __select_bucket(htab, hash); + head = &b->head; + + ret = htab_lock_bucket(htab, b, hash, &bflags); + if (ret) + return ret; + + l = lookup_elem_raw(head, hash, key, key_size); + if (!l) { + ret = -ENOENT; + } else { + if (is_percpu) { + u32 roundup_value_size = round_up(map->value_size, 8); + void __percpu *pptr; + int off = 0, cpu; + + pptr = htab_elem_get_ptr(l, key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(pptr, cpu), + roundup_value_size); + off += roundup_value_size; + } + } else { + u32 roundup_key_size = round_up(map->key_size, 8); + + if (flags & BPF_F_LOCK) + copy_map_value_locked(map, value, l->key + + roundup_key_size, + true); + else + copy_map_value(map, value, l->key + + roundup_key_size); + check_and_init_map_lock(map, value); + } + + hlist_nulls_del_rcu(&l->hash_node); + if (!is_lru_map) + free_htab_elem(htab, l); + } + + htab_unlock_bucket(htab, b, hash, bflags); + + if (is_lru_map && l) + bpf_lru_push_free(&htab->lru, &l->lru_node); + + return ret; +} + +static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return __htab_map_lookup_and_delete_elem(map, key, value, false, false, + flags); +} + +static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map, + void *key, void *value, + u64 flags) +{ + return __htab_map_lookup_and_delete_elem(map, key, value, false, true, + flags); +} + +static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return __htab_map_lookup_and_delete_elem(map, key, value, true, false, + flags); +} + +static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map, + void *key, void *value, + u64 flags) +{ + return __htab_map_lookup_and_delete_elem(map, key, value, true, true, + flags); +} + static int 
__htab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, @@ -1893,6 +1987,7 @@ const struct bpf_map_ops htab_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_map_lookup_elem, + .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, @@ -1911,6 +2006,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_lru_map_lookup_elem, + .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, @@ -2032,6 +2128,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, @@ -2049,6 +2146,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_lru_percpu_map_lookup_elem, + .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 20160386c7fb..87536b2283a9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1481,7 +1481,7 @@ int generic_map_lookup_batch(struct bpf_map *map, return err; } -#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags static int map_lookup_and_delete_elem(union bpf_attr *attr) { @@ -1497,6 +1497,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) return -EINVAL; + if (attr->flags & ~BPF_F_LOCK) + return -EINVAL; + f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) @@ -1508,24 +1511,47 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) goto err_put; } + if (attr->flags && + (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK)) { + err = -EINVAL; + goto err_put; + } + + if ((attr->flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + err = -EINVAL; + goto err_put; + } + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; } - value_size = map->value_size; + value_size = bpf_map_value_size(map); err = -ENOMEM; value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; + err = -ENOTSUPP; if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK) { err = map->ops->map_pop_elem(map, value); - } else { - err = -ENOTSUPP; + } else if (map->map_type == BPF_MAP_TYPE_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + if (!bpf_map_is_dev_bound(map)) { + bpf_disable_instrumentation(); + rcu_read_lock(); + err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); + rcu_read_unlock(); + bpf_enable_instrumentation(); + } } if 
(err) -- Gitee From 430b6949c5509b8c79de85936156d62c7d43897b Mon Sep 17 00:00:00 2001 From: Denis Salopek Date: Tue, 11 May 2021 23:00:05 +0200 Subject: [PATCH 04/14] bpf: Extend libbpf with bpf_map_lookup_and_delete_elem_flags ANBZ: #5320 commit d59b9f2d1b2211e948044a099fd0a65941d06570 upstream. Add bpf_map_lookup_and_delete_elem_flags() libbpf API in order to use the BPF_F_LOCK flag with the map_lookup_and_delete_elem() function. Signed-off-by: Denis Salopek Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/15b05dafe46c7e0750d110f233977372029d1f62.1620763117.git.denis.salopek@sartura.hr [dtcccc: update libbpf.map to 0.5.0 to sync with upstream.] Signed-off-by: Tianchen Ding --- tools/lib/bpf/bpf.c | 13 +++++++++++++ tools/lib/bpf/bpf.h | 2 ++ tools/lib/bpf/libbpf.map | 11 +++++++++++ 3 files changed, 26 insertions(+) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index d27e34133973..b2fe13422d79 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -418,6 +418,19 @@ int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value) return sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr)); } +int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, void *value, __u64 flags) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + attr.flags = flags; + + return sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr)); +} + int bpf_map_delete_elem(int fd, const void *key) { union bpf_attr attr; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 875dde20d56e..4f758f8f50cd 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -124,6 +124,8 @@ LIBBPF_API int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, __u64 flags); LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value); +LIBBPF_API int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, + void *value, __u64 flags); LIBBPF_API int bpf_map_delete_elem(int fd, const void *key); LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key); LIBBPF_API int bpf_map_freeze(int fd); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 4ebfadf45b47..b45e7fcc1cab 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -337,3 +337,14 @@ LIBBPF_0.2.0 { perf_buffer__consume_buffer; xsk_socket__create_shared; } LIBBPF_0.1.0; + +LIBBPF_0.3.0 { +} LIBBPF_0.2.0; + +LIBBPF_0.4.0 { +} LIBBPF_0.3.0; + +LIBBPF_0.5.0 { + global: + bpf_map_lookup_and_delete_elem_flags; +} LIBBPF_0.4.0; -- Gitee From aca0153977b883f58eef2c16af3a9dce60566c4b Mon Sep 17 00:00:00 2001 From: Denis Salopek Date: Tue, 11 May 2021 23:00:06 +0200 Subject: [PATCH 05/14] selftests/bpf: Add bpf_lookup_and_delete_elem tests ANBZ: #5320 commit 49c299b69444d58a1d234769a13a3697841deb54 upstream. Add bpf selftests and extend existing ones for a new function bpf_lookup_and_delete_elem() for (percpu) hash and (percpu) LRU hash map types. In test_lru_map and test_maps we add an element, lookup_and_delete it, then check whether it's deleted. The newly added lookup_and_delete prog tests practically do the same thing but additionally use a BPF program to change the value of the element for LRU maps. 
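Since the selftests below mostly exercise the plain helper, here is a hedged sketch of the flag-taking variant added in the previous patch. The value layout, map fd, and helper name are assumptions; BPF_F_LOCK additionally requires the map value to embed a struct bpf_spin_lock described by BTF.

#include <errno.h>
#include <bpf/bpf.h>

/* BPF_F_LOCK needs a value layout that contains a struct bpf_spin_lock. */
struct val {
	int counter;
	struct bpf_spin_lock lock;
};

static int pop_locked(int map_fd, __u32 key)
{
	struct val v;

	if (bpf_map_lookup_and_delete_elem_flags(map_fd, &key, &v, BPF_F_LOCK))
		return -errno;
	return v.counter;
}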
Signed-off-by: Denis Salopek Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/d30d3e0060c1f750e133579623cf1c60ff58f3d9.1620763117.git.denis.salopek@sartura.hr [dtcccc: add ASSERT_GE() and other macros in test_progs.h for future preparation.] Signed-off-by: Tianchen Ding --- .../bpf/prog_tests/lookup_and_delete.c | 288 ++++++++++++++++++ .../bpf/progs/test_lookup_and_delete.c | 26 ++ tools/testing/selftests/bpf/test_lru_map.c | 8 + tools/testing/selftests/bpf/test_maps.c | 17 ++ tools/testing/selftests/bpf/test_progs.h | 44 +++ 5 files changed, 383 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c create mode 100644 tools/testing/selftests/bpf/progs/test_lookup_and_delete.c diff --git a/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c b/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c new file mode 100644 index 000000000000..beebfa9730e1 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include "test_lookup_and_delete.skel.h" + +#define START_VALUE 1234 +#define NEW_VALUE 4321 +#define MAX_ENTRIES 2 + +static int duration; +static int nr_cpus; + +static int fill_values(int map_fd) +{ + __u64 key, value = START_VALUE; + int err; + + for (key = 1; key < MAX_ENTRIES + 1; key++) { + err = bpf_map_update_elem(map_fd, &key, &value, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + return -1; + } + + return 0; +} + +static int fill_values_percpu(int map_fd) +{ + __u64 key, value[nr_cpus]; + int i, err; + + for (i = 0; i < nr_cpus; i++) + value[i] = START_VALUE; + + for (key = 1; key < MAX_ENTRIES + 1; key++) { + err = bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + return -1; + } + + return 0; +} + +static struct test_lookup_and_delete *setup_prog(enum bpf_map_type map_type, + int *map_fd) +{ + struct test_lookup_and_delete *skel; + int err; + + skel = test_lookup_and_delete__open(); + if (!ASSERT_OK_PTR(skel, "test_lookup_and_delete__open")) + return NULL; + + err = bpf_map__set_type(skel->maps.hash_map, map_type); + if (!ASSERT_OK(err, "bpf_map__set_type")) + goto cleanup; + + err = bpf_map__set_max_entries(skel->maps.hash_map, MAX_ENTRIES); + if (!ASSERT_OK(err, "bpf_map__set_max_entries")) + goto cleanup; + + err = test_lookup_and_delete__load(skel); + if (!ASSERT_OK(err, "test_lookup_and_delete__load")) + goto cleanup; + + *map_fd = bpf_map__fd(skel->maps.hash_map); + if (!ASSERT_GE(*map_fd, 0, "bpf_map__fd")) + goto cleanup; + + return skel; + +cleanup: + test_lookup_and_delete__destroy(skel); + return NULL; +} + +/* Triggers BPF program that updates map with given key and value */ +static int trigger_tp(struct test_lookup_and_delete *skel, __u64 key, + __u64 value) +{ + int err; + + skel->bss->set_pid = getpid(); + skel->bss->set_key = key; + skel->bss->set_value = value; + + err = test_lookup_and_delete__attach(skel); + if (!ASSERT_OK(err, "test_lookup_and_delete__attach")) + return -1; + + syscall(__NR_getpgid); + + test_lookup_and_delete__detach(skel); + + return 0; +} + +static void test_lookup_and_delete_hash(void) +{ + struct test_lookup_and_delete *skel; + __u64 key, value; + int map_fd, err; + + /* Setup program and fill the map. 
*/ + skel = setup_prog(BPF_MAP_TYPE_HASH, &map_fd); + if (!ASSERT_OK_PTR(skel, "setup_prog")) + return; + + err = fill_values(map_fd); + if (!ASSERT_OK(err, "fill_values")) + goto cleanup; + + /* Lookup and delete element. */ + key = 1; + err = bpf_map_lookup_and_delete_elem(map_fd, &key, &value); + if (!ASSERT_OK(err, "bpf_map_lookup_and_delete_elem")) + goto cleanup; + + /* Fetched value should match the initially set value. */ + if (CHECK(value != START_VALUE, "bpf_map_lookup_and_delete_elem", + "unexpected value=%lld\n", value)) + goto cleanup; + + /* Check that the entry is non existent. */ + err = bpf_map_lookup_elem(map_fd, &key, &value); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem")) + goto cleanup; + +cleanup: + test_lookup_and_delete__destroy(skel); +} + +static void test_lookup_and_delete_percpu_hash(void) +{ + struct test_lookup_and_delete *skel; + __u64 key, val, value[nr_cpus]; + int map_fd, err, i; + + /* Setup program and fill the map. */ + skel = setup_prog(BPF_MAP_TYPE_PERCPU_HASH, &map_fd); + if (!ASSERT_OK_PTR(skel, "setup_prog")) + return; + + err = fill_values_percpu(map_fd); + if (!ASSERT_OK(err, "fill_values_percpu")) + goto cleanup; + + /* Lookup and delete element. */ + key = 1; + err = bpf_map_lookup_and_delete_elem(map_fd, &key, value); + if (!ASSERT_OK(err, "bpf_map_lookup_and_delete_elem")) + goto cleanup; + + for (i = 0; i < nr_cpus; i++) { + val = value[i]; + + /* Fetched value should match the initially set value. */ + if (CHECK(val != START_VALUE, "map value", + "unexpected for cpu %d: %lld\n", i, val)) + goto cleanup; + } + + /* Check that the entry is non existent. */ + err = bpf_map_lookup_elem(map_fd, &key, value); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem")) + goto cleanup; + +cleanup: + test_lookup_and_delete__destroy(skel); +} + +static void test_lookup_and_delete_lru_hash(void) +{ + struct test_lookup_and_delete *skel; + __u64 key, value; + int map_fd, err; + + /* Setup program and fill the LRU map. */ + skel = setup_prog(BPF_MAP_TYPE_LRU_HASH, &map_fd); + if (!ASSERT_OK_PTR(skel, "setup_prog")) + return; + + err = fill_values(map_fd); + if (!ASSERT_OK(err, "fill_values")) + goto cleanup; + + /* Insert new element at key=3, should reuse LRU element. */ + key = 3; + err = trigger_tp(skel, key, NEW_VALUE); + if (!ASSERT_OK(err, "trigger_tp")) + goto cleanup; + + /* Lookup and delete element 3. */ + err = bpf_map_lookup_and_delete_elem(map_fd, &key, &value); + if (!ASSERT_OK(err, "bpf_map_lookup_and_delete_elem")) + goto cleanup; + + /* Value should match the new value. */ + if (CHECK(value != NEW_VALUE, "bpf_map_lookup_and_delete_elem", + "unexpected value=%lld\n", value)) + goto cleanup; + + /* Check that entries 3 and 1 are non existent. */ + err = bpf_map_lookup_elem(map_fd, &key, &value); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem")) + goto cleanup; + + key = 1; + err = bpf_map_lookup_elem(map_fd, &key, &value); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem")) + goto cleanup; + +cleanup: + test_lookup_and_delete__destroy(skel); +} + +static void test_lookup_and_delete_lru_percpu_hash(void) +{ + struct test_lookup_and_delete *skel; + __u64 key, val, value[nr_cpus]; + int map_fd, err, i, cpucnt = 0; + + /* Setup program and fill the LRU map. */ + skel = setup_prog(BPF_MAP_TYPE_LRU_PERCPU_HASH, &map_fd); + if (!ASSERT_OK_PTR(skel, "setup_prog")) + return; + + err = fill_values_percpu(map_fd); + if (!ASSERT_OK(err, "fill_values_percpu")) + goto cleanup; + + /* Insert new element at key=3, should reuse LRU element 1. 
*/ + key = 3; + err = trigger_tp(skel, key, NEW_VALUE); + if (!ASSERT_OK(err, "trigger_tp")) + goto cleanup; + + /* Clean value. */ + for (i = 0; i < nr_cpus; i++) + value[i] = 0; + + /* Lookup and delete element 3. */ + err = bpf_map_lookup_and_delete_elem(map_fd, &key, value); + if (!ASSERT_OK(err, "bpf_map_lookup_and_delete_elem")) { + goto cleanup; + } + + /* Check if only one CPU has set the value. */ + for (i = 0; i < nr_cpus; i++) { + val = value[i]; + if (val) { + if (CHECK(val != NEW_VALUE, "map value", + "unexpected for cpu %d: %lld\n", i, val)) + goto cleanup; + cpucnt++; + } + } + if (CHECK(cpucnt != 1, "map value", "set for %d CPUs instead of 1!\n", + cpucnt)) + goto cleanup; + + /* Check that entries 3 and 1 are non existent. */ + err = bpf_map_lookup_elem(map_fd, &key, &value); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem")) + goto cleanup; + + key = 1; + err = bpf_map_lookup_elem(map_fd, &key, &value); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem")) + goto cleanup; + +cleanup: + test_lookup_and_delete__destroy(skel); +} + +void test_lookup_and_delete(void) +{ + nr_cpus = bpf_num_possible_cpus(); + + if (test__start_subtest("lookup_and_delete")) + test_lookup_and_delete_hash(); + if (test__start_subtest("lookup_and_delete_percpu")) + test_lookup_and_delete_percpu_hash(); + if (test__start_subtest("lookup_and_delete_lru")) + test_lookup_and_delete_lru_hash(); + if (test__start_subtest("lookup_and_delete_lru_percpu")) + test_lookup_and_delete_lru_percpu_hash(); +} diff --git a/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c b/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c new file mode 100644 index 000000000000..3a193f42c7e7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include + +__u32 set_pid = 0; +__u64 set_key = 0; +__u64 set_value = 0; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 2); + __type(key, __u64); + __type(value, __u64); +} hash_map SEC(".maps"); + +SEC("tp/syscalls/sys_enter_getpgid") +int bpf_lookup_and_delete_test(const void *ctx) +{ + if (set_pid == bpf_get_current_pid_tgid() >> 32) + bpf_map_update_elem(&hash_map, &set_key, &set_value, BPF_NOEXIST); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c index 6a5349f9eb14..7e9049fa3edf 100644 --- a/tools/testing/selftests/bpf/test_lru_map.c +++ b/tools/testing/selftests/bpf/test_lru_map.c @@ -231,6 +231,14 @@ static void test_lru_sanity0(int map_type, int map_flags) assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 && errno == ENOENT); + /* lookup elem key=1 and delete it, then check it doesn't exist */ + key = 1; + assert(!bpf_map_lookup_and_delete_elem(lru_map_fd, &key, &value)); + assert(value[0] == 1234); + + /* remove the same element from the expected map */ + assert(!bpf_map_delete_elem(expected_map_fd, &key)); + assert(map_equal(lru_map_fd, expected_map_fd)); close(expected_map_fd); diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 179e680e8d13..c9f395f10988 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -65,6 +65,13 @@ static void test_hashmap(unsigned int task, void *data) assert(bpf_map_lookup_elem(fd, &key, &value) == 0 && value == 1234); key = 2; + value = 1234; + /* Insert key=2 element. 
*/ + assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0); + + /* Check that key=2 matches the value and delete it */ + assert(bpf_map_lookup_and_delete_elem(fd, &key, &value) == 0 && value == 1234); + /* Check that key=2 is not found. */ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT); @@ -166,6 +173,16 @@ static void test_hashmap_percpu(unsigned int task, void *data) /* Insert key=1 element. */ assert(!(expected_key_mask & key)); assert(bpf_map_update_elem(fd, &key, value, BPF_ANY) == 0); + + /* Lookup and delete elem key=1 and check value. */ + assert(bpf_map_lookup_and_delete_elem(fd, &key, value) == 0 && + bpf_percpu(value,0) == 100); + + for (i = 0; i < nr_cpus; i++) + bpf_percpu(value,i) = i + 100; + + /* Insert key=1 element which should not exist. */ + assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) == 0); expected_key_mask |= key; /* BPF_NOEXIST means add new element if it doesn't exist. */ diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 238f5f61189e..91ca714ef1d5 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -141,6 +141,50 @@ extern int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_LT(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act < ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld >= expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + +#define ASSERT_LE(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act <= ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld > expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + +#define ASSERT_GT(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act > ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld <= expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + +#define ASSERT_GE(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act >= ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld < expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + #define ASSERT_STREQ(actual, expected, name) ({ \ static int duration = 0; \ const char *___act = actual; \ -- Gitee From b4bc680df35ee0ea0a43ab9406454961ca2d6b26 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 31 Aug 2022 12:26:28 +0800 Subject: [PATCH 06/14] bpf: Propagate error from htab_lock_bucket() to userspace ANBZ: #5320 commit 66a7a92e4d0d091e79148a4c6ec15d1da65f4280 upstream. In __htab_map_lookup_and_delete_batch() if htab_lock_bucket() returns -EBUSY, it will go to next bucket. Going to next bucket may not only skip the elements in current bucket silently, but also incur out-of-bound memory access or expose kernel memory to userspace if current bucket_cnt is greater than bucket_size or zero. Fixing it by stopping batch operation and returning -EBUSY when htab_lock_bucket() fails, and the application can retry or skip the busy batch as needed. 
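A hedged userspace sketch of the retry pattern suggested above (the buffer types, sizes, and helper name are assumptions, not part of the patch):

#include <errno.h>
#include <bpf/bpf.h>

static int drain_map(int map_fd, __u64 *keys, __u64 *vals, __u32 batch_sz)
{
	__u32 batch, count;
	void *in = NULL;
	int err;

	for (;;) {
		count = batch_sz;
		err = bpf_map_lookup_and_delete_batch(map_fd, in, &batch,
						      keys, vals, &count, NULL);
		if (err && errno == EBUSY)
			continue;	/* bucket was contended: retry this batch */
		if (err && errno != ENOENT)
			return -errno;
		/* ... consume the 'count' key/value pairs returned here ... */
		if (err)
			return 0;	/* ENOENT: the whole map has been walked */
		in = &batch;
	}
}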
Fixes: 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked") Reported-by: Hao Sun Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20220831042629.130006-3-houtao@huaweicloud.com Signed-off-by: Martin KaFai Lau Signed-off-by: Tianchen Ding --- kernel/bpf/hashtab.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4cb0461e4fe2..eee3bc009866 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1592,8 +1592,11 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, /* do not grab the lock unless need it (bucket_cnt > 0). */ if (locked) { ret = htab_lock_bucket(htab, b, batch, &flags); - if (ret) - goto next_batch; + if (ret) { + rcu_read_unlock(); + bpf_enable_instrumentation(); + goto after_loop; + } } bucket_cnt = 0; -- Gitee From 1e2e6436e25bae79340393f7b219d34ad56a222a Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Wed, 11 Jan 2023 17:29:01 +0800 Subject: [PATCH 07/14] bpf: hash map, avoid deadlock with suitable hash mask ANBZ: #5320 commit 9f907439dc80e4a2fcfb949927b36c036468dbb3 upstream. The deadlock still may occur while accessed in NMI and non-NMI context. Because in NMI, we still may access the same bucket but with different map_locked index. For example, on the same CPU, .max_entries = 2, we update the hash map, with key = 4, while running bpf prog in NMI nmi_handle(), to update hash map with key = 20, so it will have the same bucket index but have different map_locked index. To fix this issue, using min mask to hash again. Fixes: 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked") Signed-off-by: Tonghao Zhang Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: John Fastabend Cc: KP Singh Cc: Stanislav Fomichev Cc: Hao Luo Cc: Jiri Olsa Cc: Hou Tao Acked-by: Yonghong Song Acked-by: Hou Tao Link: https://lore.kernel.org/r/20230111092903.92389-1-tong@infragraf.org Signed-off-by: Martin KaFai Lau Signed-off-by: Tianchen Ding --- kernel/bpf/hashtab.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index eee3bc009866..cccabeb0f0f1 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -162,7 +162,7 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab, { unsigned long flags; - hash = hash & HASHTAB_MAP_LOCK_MASK; + hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); migrate_disable(); if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { @@ -184,7 +184,7 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab, struct bucket *b, u32 hash, unsigned long flags) { - hash = hash & HASHTAB_MAP_LOCK_MASK; + hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); if (htab_use_raw_lock(htab)) raw_spin_unlock_irqrestore(&b->raw_lock, flags); else -- Gitee From e136fbe1a2d644b0f8d1a7a53a061586c084d0a7 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 27 Oct 2021 16:45:00 -0700 Subject: [PATCH 08/14] bpf: Add bloom filter map implementation ANBZ: #5320 commit 9330986c03006ab1d33d243b7cfe598a7a3c1baa upstream. This patch adds the kernel-side changes for the implementation of a bpf bloom filter map. 
The bloom filter map supports peek (determining whether an element is present in the map) and push (adding an element to the map) operations. These operations are exposed to userspace applications through the already existing syscalls in the following way:

BPF_MAP_LOOKUP_ELEM -> peek
BPF_MAP_UPDATE_ELEM -> push

The bloom filter map does not have keys, only values. In light of this, the bloom filter map's API matches that of queue stack maps: user applications use BPF_MAP_LOOKUP_ELEM/BPF_MAP_UPDATE_ELEM which correspond internally to bpf_map_peek_elem/bpf_map_push_elem, and bpf programs must use the bpf_map_peek_elem and bpf_map_push_elem APIs to query or add an element to the bloom filter map.

When the bloom filter map is created, it must be created with a key_size of 0. For updates, the user will pass in the element to add to the map as the value, with a NULL key. For lookups, the user will pass in the element to query in the map as the value, with a NULL key.

In the verifier layer, this requires us to modify the argument type of a bloom filter's BPF_FUNC_map_peek_elem call to ARG_PTR_TO_MAP_VALUE; as well, in the syscall layer, we need to copy over the user value so that in bpf_map_peek_elem, we know which specific value to query.

A few things to please take note of:
* If there are any concurrent lookups + updates, the user is responsible for synchronizing this to ensure no false negative lookups occur.
* The number of hashes to use for the bloom filter is configurable from userspace. If no number is specified, the default used will be 5 hash functions. The benchmarks later in this patchset can help compare the performance of using different numbers of hashes on different entry sizes. In general, using more hashes decreases both the false positive rate and the speed of a lookup.
* Deleting an element in the bloom filter map is not supported.
* The bloom filter map may be used as an inner map.
* The "max_entries" size that is specified at map creation time is used to approximate a reasonable bitmap size for the bloom filter, and is not otherwise strictly enforced. If the user wishes to insert more entries into the bloom filter than "max_entries", they may do so but they should be aware that this may lead to a higher false positive rate.
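For illustration, a sketch of the BPF-program side of this API (the map name, sizing, and tracepoint hook are assumptions; with map_extra left at 0 the default of 5 hash functions applies):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_BLOOM_FILTER);
	__uint(max_entries, 1000);
	__type(value, __u64);
} seen SEC(".maps");

SEC("tp/syscalls/sys_enter_getpgid")
int bloom_demo(void *ctx)
{
	__u64 id = bpf_get_current_pid_tgid();

	/* 0 means "possibly present" (false positives are possible). */
	if (bpf_map_peek_elem(&seen, &id) == 0)
		return 0;

	/* -ENOENT means "definitely absent": record the new id. */
	bpf_map_push_elem(&seen, &id, BPF_ANY);
	return 0;
}

char _license[] SEC("license") = "GPL";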
Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211027234504.30744-2-joannekoong@fb.com Signed-off-by: Tianchen Ding --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 9 ++ kernel/bpf/Makefile | 2 +- kernel/bpf/bloom_filter.c | 195 +++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 24 +++- kernel/bpf/verifier.c | 19 +++- tools/include/uapi/linux/bpf.h | 9 ++ 8 files changed, 253 insertions(+), 7 deletions(-) create mode 100644 kernel/bpf/bloom_filter.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ad875245353f..622c88236169 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -161,6 +161,7 @@ struct bpf_map { u32 value_size; u32 max_entries; u32 map_flags; + u64 map_extra; /* any per-map-type extra fields */ int spin_lock_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index e256d6ef4765..4cfa1fc39f30 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -125,6 +125,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f5bf3d62b960..df47f25c5ce9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -158,6 +158,7 @@ enum bpf_map_type { BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, + BPF_MAP_TYPE_BLOOM_FILTER, }; /* Note that tracing related programs such as @@ -503,6 +504,13 @@ union bpf_attr { * struct stored as the * map value */ + /* Any per-map-type extra fields + * + * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the + * number of hash functions (if 0, the bloom filter will default + * to using 5 hash functions). 
+ */ + __u64 map_extra; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -4499,6 +4507,7 @@ struct bpf_map_info { __u32 btf_id; __u32 btf_key_type_id; __u32 btf_value_type_id; + __u64 map_extra; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index d1249340fd6b..05593c2b0411 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -7,7 +7,7 @@ endif CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-${CONFIG_BPF_LSM} += bpf_task_storage.o diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c new file mode 100644 index 000000000000..7c50232b7571 --- /dev/null +++ b/kernel/bpf/bloom_filter.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#include +#include +#include +#include +#include + +#define BLOOM_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK) + +struct bpf_bloom_filter { + struct bpf_map map; + u32 bitset_mask; + u32 hash_seed; + /* If the size of the values in the bloom filter is u32 aligned, + * then it is more performant to use jhash2 as the underlying hash + * function, else we use jhash. This tracks the number of u32s + * in an u32-aligned value size. If the value size is not u32 aligned, + * this will be 0. 
+ */ + u32 aligned_u32_count; + u32 nr_hash_funcs; + unsigned long bitset[]; +}; + +static u32 hash(struct bpf_bloom_filter *bloom, void *value, + u32 value_size, u32 index) +{ + u32 h; + + if (bloom->aligned_u32_count) + h = jhash2(value, bloom->aligned_u32_count, + bloom->hash_seed + index); + else + h = jhash(value, value_size, bloom->hash_seed + index); + + return h & bloom->bitset_mask; +} + +static int peek_elem(struct bpf_map *map, void *value) +{ + struct bpf_bloom_filter *bloom = + container_of(map, struct bpf_bloom_filter, map); + u32 i, h; + + for (i = 0; i < bloom->nr_hash_funcs; i++) { + h = hash(bloom, value, map->value_size, i); + if (!test_bit(h, bloom->bitset)) + return -ENOENT; + } + + return 0; +} + +static int push_elem(struct bpf_map *map, void *value, u64 flags) +{ + struct bpf_bloom_filter *bloom = + container_of(map, struct bpf_bloom_filter, map); + u32 i, h; + + if (flags != BPF_ANY) + return -EINVAL; + + for (i = 0; i < bloom->nr_hash_funcs; i++) { + h = hash(bloom, value, map->value_size, i); + set_bit(h, bloom->bitset); + } + + return 0; +} + +static int pop_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + +static struct bpf_map *map_alloc(union bpf_attr *attr) +{ + u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits; + int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_bloom_filter *bloom; + + if (!bpf_capable()) + return ERR_PTR(-EPERM); + + if (attr->key_size != 0 || attr->value_size == 0 || + attr->max_entries == 0 || + attr->map_flags & ~BLOOM_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || + (attr->map_extra & ~0xF)) + return ERR_PTR(-EINVAL); + + /* The lower 4 bits of map_extra specify the number of hash functions */ + nr_hash_funcs = attr->map_extra & 0xF; + if (nr_hash_funcs == 0) + /* Default to using 5 hash functions if unspecified */ + nr_hash_funcs = 5; + + /* For the bloom filter, the optimal bit array size that minimizes the + * false positive probability is n * k / ln(2) where n is the number of + * expected entries in the bloom filter and k is the number of hash + * functions. We use 7 / 5 to approximate 1 / ln(2). + * + * We round this up to the nearest power of two to enable more efficient + * hashing using bitmasks. The bitmask will be the bit array size - 1. + * + * If this overflows a u32, the bit array size will have 2^32 (4 + * GB) bits. 
+ */ + if (check_mul_overflow(attr->max_entries, nr_hash_funcs, &nr_bits) || + check_mul_overflow(nr_bits / 5, (u32)7, &nr_bits) || + nr_bits > (1UL << 31)) { + /* The bit array size is 2^32 bits but to avoid overflowing the + * u32, we use U32_MAX, which will round up to the equivalent + * number of bytes + */ + bitset_bytes = BITS_TO_BYTES(U32_MAX); + bitset_mask = U32_MAX; + } else { + if (nr_bits <= BITS_PER_LONG) + nr_bits = BITS_PER_LONG; + else + nr_bits = roundup_pow_of_two(nr_bits); + bitset_bytes = BITS_TO_BYTES(nr_bits); + bitset_mask = nr_bits - 1; + } + + bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long)); + bloom = bpf_map_area_alloc(sizeof(*bloom) + bitset_bytes, numa_node); + + if (!bloom) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&bloom->map, attr); + + bloom->nr_hash_funcs = nr_hash_funcs; + bloom->bitset_mask = bitset_mask; + + /* Check whether the value size is u32-aligned */ + if ((attr->value_size & (sizeof(u32) - 1)) == 0) + bloom->aligned_u32_count = + attr->value_size / sizeof(u32); + + if (!(attr->map_flags & BPF_F_ZERO_SEED)) + bloom->hash_seed = get_random_int(); + + return &bloom->map; +} + +static void map_free(struct bpf_map *map) +{ + struct bpf_bloom_filter *bloom = + container_of(map, struct bpf_bloom_filter, map); + + bpf_map_area_free(bloom); +} + +static void *lookup_elem(struct bpf_map *map, void *key) +{ + /* The eBPF program should use map_peek_elem instead */ + return ERR_PTR(-EINVAL); +} + +static int update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + /* The eBPF program should use map_push_elem instead */ + return -EINVAL; +} + +static int check_btf(const struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + /* Bloom filter maps are keyless */ + return btf_type_is_void(key_type) ? 
0 : -EINVAL; +} + +static int bpf_bloom_btf_id; +const struct bpf_map_ops bloom_filter_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = map_alloc, + .map_free = map_free, + .map_push_elem = push_elem, + .map_peek_elem = peek_elem, + .map_pop_elem = pop_elem, + .map_lookup_elem = lookup_elem, + .map_update_elem = update_elem, + .map_check_btf = check_btf, + .map_btf_name = "bpf_bloom_filter", + .map_btf_id = &bpf_bloom_btf_id, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 87536b2283a9..da57b0322c1e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -210,7 +210,8 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, err = bpf_fd_reuseport_array_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_QUEUE || - map->map_type == BPF_MAP_TYPE_STACK) { + map->map_type == BPF_MAP_TYPE_STACK || + map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_push_elem(map, value, flags); } else { rcu_read_lock(); @@ -249,7 +250,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_QUEUE || - map->map_type == BPF_MAP_TYPE_STACK) { + map->map_type == BPF_MAP_TYPE_STACK || + map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_peek_elem(map, value); } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { /* struct_ops map requires directly updating "value" */ @@ -355,6 +357,7 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) map->max_entries = attr->max_entries; map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); map->numa_node = bpf_map_attr_numa_node(attr); + map->map_extra = attr->map_extra; } static int bpf_charge_memlock(struct user_struct *user, u32 pages) @@ -563,6 +566,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "value_size:\t%u\n" "max_entries:\t%u\n" "map_flags:\t%#x\n" + "map_extra:\t%#llx\n" "memlock:\t%llu\n" "map_id:\t%u\n" "frozen:\t%u\n", @@ -571,6 +575,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->value_size, map->max_entries, map->map_flags, + (unsigned long long)map->map_extra, map->memory.pages * 1ULL << PAGE_SHIFT, map->id, READ_ONCE(map->frozen)); @@ -803,7 +808,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return ret; } -#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id +#define BPF_MAP_CREATE_LAST_FIELD map_extra /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -825,6 +830,10 @@ static int map_create(union bpf_attr *attr) return -EINVAL; } + if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && + attr->map_extra != 0) + return -EINVAL; + f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) return f_flags; @@ -1057,6 +1066,14 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; + if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { + if (copy_from_user(value, uvalue, value_size)) + err = -EFAULT; + else + err = bpf_map_copy_value(map, key, value, attr->flags); + goto free_value; + } + err = bpf_map_copy_value(map, key, value, attr->flags); if (err) goto free_value; @@ -3738,6 +3755,7 @@ static int bpf_map_get_info_by_fd(struct file *file, info.value_size = map->value_size; info.max_entries = map->max_entries; info.map_flags = map->map_flags; + info.map_extra = map->map_extra; 
memcpy(info.name, map->name, sizeof(map->name)); if (map->btf) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d8ec405ce4bf..707b9e8c2084 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4313,7 +4313,10 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, return -EINVAL; } break; - + case BPF_MAP_TYPE_BLOOM_FILTER: + if (meta->func_id == BPF_FUNC_map_peek_elem) + *arg_type = ARG_PTR_TO_MAP_VALUE; + break; default: break; } @@ -4828,6 +4831,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_task_storage_delete) goto error; break; + case BPF_MAP_TYPE_BLOOM_FILTER: + if (func_id != BPF_FUNC_map_peek_elem && + func_id != BPF_FUNC_map_push_elem) + goto error; + break; default: break; } @@ -4895,13 +4903,18 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; - case BPF_FUNC_map_peek_elem: case BPF_FUNC_map_pop_elem: - case BPF_FUNC_map_push_elem: if (map->map_type != BPF_MAP_TYPE_QUEUE && map->map_type != BPF_MAP_TYPE_STACK) goto error; break; + case BPF_FUNC_map_peek_elem: + case BPF_FUNC_map_push_elem: + if (map->map_type != BPF_MAP_TYPE_QUEUE && + map->map_type != BPF_MAP_TYPE_STACK && + map->map_type != BPF_MAP_TYPE_BLOOM_FILTER) + goto error; + break; case BPF_FUNC_sk_storage_get: case BPF_FUNC_sk_storage_delete: if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b481addbd92a..81a23646a8e0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -158,6 +158,7 @@ enum bpf_map_type { BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, + BPF_MAP_TYPE_BLOOM_FILTER, }; /* Note that tracing related programs such as @@ -503,6 +504,13 @@ union bpf_attr { * struct stored as the * map value */ + /* Any per-map-type extra fields + * + * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the + * number of hash functions (if 0, the bloom filter will default + * to using 5 hash functions). + */ + __u64 map_extra; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -4499,6 +4507,7 @@ struct bpf_map_info { __u32 btf_id; __u32 btf_key_type_id; __u32 btf_value_type_id; + __u64 map_extra; } __attribute__((aligned(8))); struct bpf_btf_info { -- Gitee From a558ab1b2c02ce8c534249eb03aa980808df92e7 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 27 Oct 2021 16:45:01 -0700 Subject: [PATCH 09/14] libbpf: Add "map_extra" as a per-map-type extra flag ANBZ: #5320 commit 47512102cde2d252d7b984d9675cfd3420b48ad9 upstream. This patch adds the libbpf infrastructure for supporting a per-map-type "map_extra" field, whose definition will be idiosyncratic depending on map type. For example, for the bloom filter map, the lower 4 bits of map_extra is used to denote the number of hash functions. Please note that until libbpf 1.0 is here, the "bpf_create_map_params" struct is used as a temporary means for propagating the map_extra field to the kernel. Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211027234504.30744-3-joannekoong@fb.com [dtcccc: adapt to ANCK 5.10 since we have no gen_loader and map def refactor by c7ef5ec9573f0 ("libbpf: Refactor BTF map definition parsing").] 
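A minimal userspace sketch of how the plumbing added below is meant to be used once this patch lands: open the BPF object, pick the number of hash functions through the new bpf_map__set_map_extra() setter before load, and let libbpf carry the value to the kernel via bpf_create_map_params. The object file name ("bloom_filter.bpf.o") and map name ("map_bloom") are placeholders and not part of this series.

	#include <bpf/libbpf.h>

	int request_three_hash_funcs(void)
	{
		struct bpf_object *obj;
		struct bpf_map *map;
		int err = -1;

		obj = bpf_object__open("bloom_filter.bpf.o");	/* placeholder object name */
		if (libbpf_get_error(obj))
			return -1;

		map = bpf_object__find_map_by_name(obj, "map_bloom");	/* placeholder map name */
		if (!map)
			goto out;

		/* Lower 4 bits of map_extra select the number of hash functions.
		 * Must be set before bpf_object__load(); the setter returns -EBUSY
		 * once the map has already been created.
		 */
		err = bpf_map__set_map_extra(map, 3);
		if (err)
			goto out;

		err = bpf_object__load(obj);
	out:
		bpf_object__close(obj);
		return err;
	}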
Signed-off-by: Tianchen Ding --- tools/lib/bpf/bpf.c | 27 ++++++++++++++++++++++++++- tools/lib/bpf/libbpf.c | 33 +++++++++++++++++++++++++++++---- tools/lib/bpf/libbpf.h | 3 +++ tools/lib/bpf/libbpf.map | 6 ++++++ tools/lib/bpf/libbpf_internal.h | 21 +++++++++++++++++++++ 5 files changed, 85 insertions(+), 5 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index b2fe13422d79..a3009cd60878 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -76,7 +76,7 @@ static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size) return fd; } -int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr) +int libbpf__bpf_create_map_xattr(const struct bpf_create_map_params *create_attr) { union bpf_attr attr; @@ -100,10 +100,35 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr) create_attr->btf_vmlinux_value_type_id; else attr.inner_map_fd = create_attr->inner_map_fd; + attr.map_extra = create_attr->map_extra; return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } +int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr) +{ + struct bpf_create_map_params p = {}; + + p.map_type = create_attr->map_type; + p.key_size = create_attr->key_size; + p.value_size = create_attr->value_size; + p.max_entries = create_attr->max_entries; + p.map_flags = create_attr->map_flags; + p.name = create_attr->name; + p.numa_node = create_attr->numa_node; + p.btf_fd = create_attr->btf_fd; + p.btf_key_type_id = create_attr->btf_key_type_id; + p.btf_value_type_id = create_attr->btf_value_type_id; + p.map_ifindex = create_attr->map_ifindex; + if (p.map_type == BPF_MAP_TYPE_STRUCT_OPS) + p.btf_vmlinux_value_type_id = + create_attr->btf_vmlinux_value_type_id; + else + p.inner_map_fd = create_attr->inner_map_fd; + + return libbpf__bpf_create_map_xattr(&p); +} + int bpf_create_map_node(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, __u32 map_flags, int node) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3e52e5a3f4c7..2552dae45197 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -355,6 +355,7 @@ struct bpf_map { char *pin_path; bool pinned; bool reused; + __u64 map_extra; }; enum extern_type { @@ -2193,6 +2194,14 @@ static int parse_btf_map_def(struct bpf_object *obj, return err; } } + } else if (strcmp(name, "map_extra") == 0) { + __u32 map_extra; + + if (!get_map_field_int(map->name, obj->btf, m, &map_extra)) + return -EINVAL; + pr_debug("map '%s': found map_extra = 0x%llx.\n", map->name, + (unsigned long long)map_extra); + map->map_extra = map_extra; } else { if (strict) { pr_warn("map '%s': unknown field '%s'.\n", @@ -3698,6 +3707,7 @@ int bpf_map__reuse_fd(struct bpf_map *map, int fd) map->btf_key_type_id = info.btf_key_type_id; map->btf_value_type_id = info.btf_value_type_id; map->reused = true; + map->map_extra = info.map_extra; return 0; @@ -4090,7 +4100,8 @@ static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd) map_info.key_size == map->def.key_size && map_info.value_size == map->def.value_size && map_info.max_entries == map->def.max_entries && - map_info.map_flags == map->def.map_flags); + map_info.map_flags == map->def.map_flags && + map_info.map_extra == map->map_extra); } static int @@ -4166,7 +4177,7 @@ static void bpf_map__destroy(struct bpf_map *map); static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map) { - struct bpf_create_map_attr create_attr; + struct bpf_create_map_params create_attr; struct 
bpf_map_def *def = &map->def; int err = 0; @@ -4180,6 +4191,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map) create_attr.key_size = def->key_size; create_attr.value_size = def->value_size; create_attr.numa_node = map->numa_node; + create_attr.map_extra = map->map_extra; if (def->type == BPF_MAP_TYPE_PERF_EVENT_ARRAY && !def->max_entries) { int nr_cpus; @@ -4223,7 +4235,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map) create_attr.inner_map_fd = map->inner_map_fd; } - map->fd = bpf_create_map_xattr(&create_attr); + map->fd = libbpf__bpf_create_map_xattr(&create_attr); if (map->fd < 0 && (create_attr.btf_key_type_id || create_attr.btf_value_type_id)) { char *cp, errmsg[STRERR_BUFSIZE]; @@ -4237,7 +4249,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map) create_attr.btf_value_type_id = 0; map->btf_key_type_id = 0; map->btf_value_type_id = 0; - map->fd = bpf_create_map_xattr(&create_attr); + map->fd = libbpf__bpf_create_map_xattr(&create_attr); } err = map->fd < 0 ? -errno : 0; @@ -8855,6 +8867,19 @@ int bpf_map__set_map_flags(struct bpf_map *map, __u32 flags) return 0; } +__u64 bpf_map__map_extra(const struct bpf_map *map) +{ + return map->map_extra; +} + +int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra) +{ + if (map->fd >= 0) + return -EBUSY; + map->map_extra = map_extra; + return 0; +} + __u32 bpf_map__numa_node(const struct bpf_map *map) { return map->numa_node; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 57d10b779dea..0208f1e5c5d4 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -463,6 +463,9 @@ LIBBPF_API __u32 bpf_map__btf_value_type_id(const struct bpf_map *map); /* get/set map if_index */ LIBBPF_API __u32 bpf_map__ifindex(const struct bpf_map *map); LIBBPF_API int bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); +/* get/set map map_extra flags */ +LIBBPF_API __u64 bpf_map__map_extra(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra); typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *); LIBBPF_API int bpf_map__set_priv(struct bpf_map *map, void *priv, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index b45e7fcc1cab..06b6eb2c113c 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -348,3 +348,9 @@ LIBBPF_0.5.0 { global: bpf_map_lookup_and_delete_elem_flags; } LIBBPF_0.4.0; + +LIBBPF_0.6.0 { + global: + bpf_map__map_extra; + bpf_map__set_map_extra; +} LIBBPF_0.5.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index d99bc847bf84..8bf152688ddc 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -156,6 +156,27 @@ int bpf_object__section_size(const struct bpf_object *obj, const char *name, int bpf_object__variable_offset(const struct bpf_object *obj, const char *name, __u32 *off); +struct bpf_create_map_params { + const char *name; + enum bpf_map_type map_type; + __u32 map_flags; + __u32 key_size; + __u32 value_size; + __u32 max_entries; + __u32 numa_node; + __u32 btf_fd; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 map_ifindex; + union { + __u32 inner_map_fd; + __u32 btf_vmlinux_value_type_id; + }; + __u64 map_extra; +}; + +int libbpf__bpf_create_map_xattr(const struct bpf_create_map_params *create_attr); + struct btf_ext_info { /* * info points to the individual info section (e.g. 
func_info and -- Gitee From 9c0400ab9e386bfb95ac217fc534461ad74a6cec Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 27 Oct 2021 16:45:02 -0700 Subject: [PATCH 10/14] selftests/bpf: Add bloom filter map test cases ANBZ: #5320 commit ed9109ad643cfbe69670a37cdbaf2da9f409fed0 upstream. This patch adds test cases for bpf bloom filter maps. They include tests checking against invalid operations by userspace, tests for using the bloom filter map as an inner map, and a bpf program that queries the bloom filter map for values added by a userspace program. Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211027234504.30744-4-joannekoong@fb.com [dtcccc: we have not bpf_for_each_map_elem() helper, so use for-loop instead. bpf_map_update_elem() will return -1 but not -errno on fail because our libbpf has not libbpf_err_errno(), fix it manually.] Signed-off-by: Tianchen Ding --- .../bpf/prog_tests/bloom_filter_map.c | 209 ++++++++++++++++++ .../selftests/bpf/progs/bloom_filter_map.c | 77 +++++++ 2 files changed, 286 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c create mode 100644 tools/testing/selftests/bpf/progs/bloom_filter_map.c diff --git a/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c new file mode 100644 index 000000000000..7b7b0874d17e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#include +#include "bloom_filter_map.skel.h" + +static void test_fail_cases(void) +{ + struct bpf_create_map_attr xattr = { + .name = "bloom_filter_map", + .map_type = BPF_MAP_TYPE_BLOOM_FILTER, + .max_entries = 100, + .value_size = 11, + }; + __u32 value; + int fd, err; + + /* Invalid key size */ + xattr.key_size = 4; + fd = bpf_create_map_xattr(&xattr); + if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid key size")) + close(fd); + xattr.key_size = 0; + + /* Invalid value size */ + xattr.value_size = 0; + fd = bpf_create_map_xattr(&xattr); + if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid value size 0")) + close(fd); + xattr.value_size = 11; + + /* Invalid max entries size */ + xattr.max_entries = 0; + fd = bpf_create_map_xattr(&xattr); + if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid max entries size")) + close(fd); + xattr.max_entries = 100; + + /* Bloom filter maps do not support BPF_F_NO_PREALLOC */ + xattr.map_flags = BPF_F_NO_PREALLOC; + fd = bpf_create_map_xattr(&xattr); + if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid flags")) + close(fd); + xattr.map_flags = 0; + + fd = bpf_create_map_xattr(&xattr); + if (!ASSERT_GE(fd, 0, "bpf_create_map bloom filter")) + return; + + /* Test invalid flags */ + err = bpf_map_update_elem(fd, NULL, &value, -1); + err = err < 0 ? -errno : err; + ASSERT_EQ(err, -EINVAL, "bpf_map_update_elem bloom filter invalid flags"); + + err = bpf_map_update_elem(fd, NULL, &value, BPF_EXIST); + err = err < 0 ? -errno : err; + ASSERT_EQ(err, -EINVAL, "bpf_map_update_elem bloom filter invalid flags"); + + err = bpf_map_update_elem(fd, NULL, &value, BPF_F_LOCK); + err = err < 0 ? -errno : err; + ASSERT_EQ(err, -EINVAL, "bpf_map_update_elem bloom filter invalid flags"); + + err = bpf_map_update_elem(fd, NULL, &value, BPF_NOEXIST); + err = err < 0 ? 
-errno : err; + ASSERT_EQ(err, -EINVAL, "bpf_map_update_elem bloom filter invalid flags"); + + err = bpf_map_update_elem(fd, NULL, &value, 10000); + err = err < 0 ? -errno : err; + ASSERT_EQ(err, -EINVAL, "bpf_map_update_elem bloom filter invalid flags"); + + close(fd); +} + +static void check_bloom(struct bloom_filter_map *skel) +{ + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.check_bloom); + if (!ASSERT_OK_PTR(link, "link")) + return; + + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->error, 0, "error"); + + bpf_link__destroy(link); +} + +static void test_inner_map(struct bloom_filter_map *skel, const __u32 *rand_vals, + __u32 nr_rand_vals) +{ + int outer_map_fd, inner_map_fd, err, i, key = 0; + struct bpf_create_map_attr xattr = { + .name = "bloom_filter_inner_map", + .map_type = BPF_MAP_TYPE_BLOOM_FILTER, + .value_size = sizeof(__u32), + .max_entries = nr_rand_vals, + }; + struct bpf_link *link; + + /* Create a bloom filter map that will be used as the inner map */ + inner_map_fd = bpf_create_map_xattr(&xattr); + if (!ASSERT_GE(inner_map_fd, 0, "bpf_create_map bloom filter inner map")) + return; + + for (i = 0; i < nr_rand_vals; i++) { + err = bpf_map_update_elem(inner_map_fd, NULL, rand_vals + i, BPF_ANY); + if (!ASSERT_OK(err, "Add random value to inner_map_fd")) + goto done; + } + + /* Add the bloom filter map to the outer map */ + outer_map_fd = bpf_map__fd(skel->maps.outer_map); + err = bpf_map_update_elem(outer_map_fd, &key, &inner_map_fd, BPF_ANY); + if (!ASSERT_OK(err, "Add bloom filter map to outer map")) + goto done; + + /* Attach the bloom_filter_inner_map prog */ + link = bpf_program__attach(skel->progs.inner_map); + if (!ASSERT_OK_PTR(link, "link")) + goto delete_inner_map; + + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->error, 0, "error"); + + bpf_link__destroy(link); + +delete_inner_map: + /* Ensure the inner bloom filter map can be deleted */ + err = bpf_map_delete_elem(outer_map_fd, &key); + ASSERT_OK(err, "Delete inner bloom filter map"); + +done: + close(inner_map_fd); +} + +static int setup_progs(struct bloom_filter_map **out_skel, __u32 **out_rand_vals, + __u32 *out_nr_rand_vals) +{ + struct bloom_filter_map *skel; + int random_data_fd, bloom_fd; + __u32 *rand_vals = NULL; + __u32 map_size, val; + int err, i; + + /* Set up a bloom filter map skeleton */ + skel = bloom_filter_map__open_and_load(); + if (!ASSERT_OK_PTR(skel, "bloom_filter_map__open_and_load")) + return -EINVAL; + + /* Set up rand_vals */ + map_size = bpf_map__max_entries(skel->maps.map_random_data); + rand_vals = malloc(sizeof(*rand_vals) * map_size); + if (!rand_vals) { + err = -ENOMEM; + goto error; + } + + /* Generate random values and populate both skeletons */ + random_data_fd = bpf_map__fd(skel->maps.map_random_data); + bloom_fd = bpf_map__fd(skel->maps.map_bloom); + for (i = 0; i < map_size; i++) { + val = rand(); + + err = bpf_map_update_elem(random_data_fd, &i, &val, BPF_ANY); + if (!ASSERT_OK(err, "Add random value to map_random_data")) + goto error; + + err = bpf_map_update_elem(bloom_fd, NULL, &val, BPF_ANY); + if (!ASSERT_OK(err, "Add random value to map_bloom")) + goto error; + + rand_vals[i] = val; + } + + *out_skel = skel; + *out_rand_vals = rand_vals; + *out_nr_rand_vals = map_size; + + return 0; + +error: + bloom_filter_map__destroy(skel); + if (rand_vals) + free(rand_vals); + return err; +} + +void test_bloom_filter_map(void) +{ + __u32 *rand_vals, nr_rand_vals; + struct bloom_filter_map *skel; + int err; + + test_fail_cases(); + + err = setup_progs(&skel, 
&rand_vals, &nr_rand_vals); + if (err) + return; + + test_inner_map(skel, rand_vals, nr_rand_vals); + free(rand_vals); + + check_bloom(skel); + + bloom_filter_map__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/bloom_filter_map.c b/tools/testing/selftests/bpf/progs/bloom_filter_map.c new file mode 100644 index 000000000000..5286af5f8aa1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bloom_filter_map.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct bpf_map; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, 1000); +} map_random_data SEC(".maps"); + +struct map_bloom_type { + __uint(type, BPF_MAP_TYPE_BLOOM_FILTER); + __type(value, __u32); + __uint(max_entries, 10000); + __uint(map_extra, 5); +} map_bloom SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __type(key, int); + __type(value, int); + __uint(max_entries, 1); + __array(values, struct map_bloom_type); +} outer_map SEC(".maps"); + +int error = 0; + +static void check_elem(struct bpf_map *map) +{ + int err, i, key, *map_random_val; + + for (i = 0; i < 1000; i++) { + key = i; + map_random_val = bpf_map_lookup_elem(&map_random_data, &key); + if (!map_random_val) { + error |= 2; + return; + } + err = bpf_map_peek_elem(map, map_random_val); + if (err) { + error |= 1; + return; + } + } +} + +SEC("fentry/__x64_sys_getpgid") +int inner_map(void *ctx) +{ + struct bpf_map *inner_map; + int key = 0; + + inner_map = bpf_map_lookup_elem(&outer_map, &key); + if (!inner_map) { + error |= 2; + return 0; + } + + check_elem(inner_map); + + return 0; +} + +SEC("fentry/__x64_sys_getpgid") +int check_bloom(void *ctx) +{ + check_elem(&map_bloom); + + return 0; +} -- Gitee From 10218c23ffc85ec0df6d5071256a36357baf869d Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 29 Oct 2021 15:49:07 -0700 Subject: [PATCH 11/14] bpf: Bloom filter map naming fixups ANBZ: #5320 commit 6fdc348006fe2c8f0765f6eecf2e3cbab06c60b5 upstream. This patch has two changes in the kernel bloom filter map implementation: 1) Change the names of map-ops functions to include the "bloom_map" prefix. As Martin pointed out on a previous patchset, having generic map-ops names may be confusing in tracing and in perf-report. 2) Drop the "& 0xF" when getting nr_hash_funcs, since we already ascertain that no other bits in map_extra beyond the first 4 bits can be set. 
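To make point (2) concrete, a condensed sketch of the relevant bloom_map_alloc() logic after this change (surrounding checks omitted): the create-time validation already rejects any map_extra with bits set above the low nibble, so the later read can take the value as-is.

	if (attr->map_extra & ~0xF)		/* only bits 0-3 may be set */
		return ERR_PTR(-EINVAL);

	nr_hash_funcs = attr->map_extra;	/* no "& 0xF" needed anymore */
	if (nr_hash_funcs == 0)
		nr_hash_funcs = 5;		/* default to 5 hash functions */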
Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211029224909.1721024-2-joannekoong@fb.com Signed-off-by: Tianchen Ding --- kernel/bpf/bloom_filter.c | 49 +++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 7c50232b7571..073c2f2cab8b 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -40,7 +40,7 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value, return h & bloom->bitset_mask; } -static int peek_elem(struct bpf_map *map, void *value) +static int bloom_map_peek_elem(struct bpf_map *map, void *value) { struct bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); @@ -55,7 +55,7 @@ static int peek_elem(struct bpf_map *map, void *value) return 0; } -static int push_elem(struct bpf_map *map, void *value, u64 flags) +static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags) { struct bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); @@ -72,12 +72,12 @@ static int push_elem(struct bpf_map *map, void *value, u64 flags) return 0; } -static int pop_elem(struct bpf_map *map, void *value) +static int bloom_map_pop_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } -static struct bpf_map *map_alloc(union bpf_attr *attr) +static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) { u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits; int numa_node = bpf_map_attr_numa_node(attr); @@ -90,11 +90,13 @@ static struct bpf_map *map_alloc(union bpf_attr *attr) attr->max_entries == 0 || attr->map_flags & ~BLOOM_CREATE_FLAG_MASK || !bpf_map_flags_access_ok(attr->map_flags) || + /* The lower 4 bits of map_extra (0xF) specify the number + * of hash functions + */ (attr->map_extra & ~0xF)) return ERR_PTR(-EINVAL); - /* The lower 4 bits of map_extra specify the number of hash functions */ - nr_hash_funcs = attr->map_extra & 0xF; + nr_hash_funcs = attr->map_extra; if (nr_hash_funcs == 0) /* Default to using 5 hash functions if unspecified */ nr_hash_funcs = 5; @@ -150,7 +152,7 @@ static struct bpf_map *map_alloc(union bpf_attr *attr) return &bloom->map; } -static void map_free(struct bpf_map *map) +static void bloom_map_free(struct bpf_map *map) { struct bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); @@ -158,38 +160,39 @@ static void map_free(struct bpf_map *map) bpf_map_area_free(bloom); } -static void *lookup_elem(struct bpf_map *map, void *key) +static void *bloom_map_lookup_elem(struct bpf_map *map, void *key) { /* The eBPF program should use map_peek_elem instead */ return ERR_PTR(-EINVAL); } -static int update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static int bloom_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { /* The eBPF program should use map_push_elem instead */ return -EINVAL; } -static int check_btf(const struct bpf_map *map, const struct btf *btf, - const struct btf_type *key_type, - const struct btf_type *value_type) +static int bloom_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) { /* Bloom filter maps are keyless */ return btf_type_is_void(key_type) ? 
0 : -EINVAL; } -static int bpf_bloom_btf_id; +static int bpf_bloom_map_btf_id; const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, - .map_alloc = map_alloc, - .map_free = map_free, - .map_push_elem = push_elem, - .map_peek_elem = peek_elem, - .map_pop_elem = pop_elem, - .map_lookup_elem = lookup_elem, - .map_update_elem = update_elem, - .map_check_btf = check_btf, + .map_alloc = bloom_map_alloc, + .map_free = bloom_map_free, + .map_push_elem = bloom_map_push_elem, + .map_peek_elem = bloom_map_peek_elem, + .map_pop_elem = bloom_map_pop_elem, + .map_lookup_elem = bloom_map_lookup_elem, + .map_update_elem = bloom_map_update_elem, + .map_check_btf = bloom_map_check_btf, .map_btf_name = "bpf_bloom_filter", - .map_btf_id = &bpf_bloom_btf_id, + .map_btf_id = &bpf_bloom_map_btf_id, }; -- Gitee From d2ecc79210b5d0c6e4558394ea9fc4830165d460 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 29 Oct 2021 15:49:09 -0700 Subject: [PATCH 12/14] selftests/bpf: Add bloom map success test for userspace calls ANBZ: #5320 commit 7a67087250f0003acc1b9e20eda01635e1e51f9a upstream. This patch has two changes: 1) Adds a new function "test_success_cases" to test successfully creating + adding + looking up a value in a bloom filter map from the userspace side. 2) Use bpf_create_map instead of bpf_create_map_xattr in the "test_fail_cases" and test_inner_map to make the code look cleaner. Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211029224909.1721024-4-joannekoong@fb.com Signed-off-by: Tianchen Ding --- .../bpf/prog_tests/bloom_filter_map.c | 59 +++++++++++-------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c index 7b7b0874d17e..6a946e3043df 100644 --- a/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c +++ b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c @@ -7,44 +7,31 @@ static void test_fail_cases(void) { - struct bpf_create_map_attr xattr = { - .name = "bloom_filter_map", - .map_type = BPF_MAP_TYPE_BLOOM_FILTER, - .max_entries = 100, - .value_size = 11, - }; __u32 value; int fd, err; /* Invalid key size */ - xattr.key_size = 4; - fd = bpf_create_map_xattr(&xattr); + fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 4, sizeof(value), 100, 0); if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid key size")) close(fd); - xattr.key_size = 0; /* Invalid value size */ - xattr.value_size = 0; - fd = bpf_create_map_xattr(&xattr); + fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, 0, 100, 0); if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid value size 0")) close(fd); - xattr.value_size = 11; /* Invalid max entries size */ - xattr.max_entries = 0; - fd = bpf_create_map_xattr(&xattr); + fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(value), 0, 0); if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid max entries size")) close(fd); - xattr.max_entries = 100; /* Bloom filter maps do not support BPF_F_NO_PREALLOC */ - xattr.map_flags = BPF_F_NO_PREALLOC; - fd = bpf_create_map_xattr(&xattr); + fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(value), 100, + BPF_F_NO_PREALLOC); if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid flags")) close(fd); - xattr.map_flags = 0; - fd = bpf_create_map_xattr(&xattr); + fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(value), 100, 0); if (!ASSERT_GE(fd, 0, 
"bpf_create_map bloom filter")) return; @@ -72,6 +59,30 @@ static void test_fail_cases(void) close(fd); } +static void test_success_cases(void) +{ + char value[11]; + int fd, err; + + /* Create a map */ + fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(value), 100, + BPF_F_ZERO_SEED | BPF_F_NUMA_NODE); + if (!ASSERT_GE(fd, 0, "bpf_create_map bloom filter success case")) + return; + + /* Add a value to the bloom filter */ + err = bpf_map_update_elem(fd, NULL, &value, 0); + if (!ASSERT_OK(err, "bpf_map_update_elem bloom filter success case")) + goto done; + + /* Lookup a value in the bloom filter */ + err = bpf_map_lookup_elem(fd, NULL, &value); + ASSERT_OK(err, "bpf_map_update_elem bloom filter success case"); + +done: + close(fd); +} + static void check_bloom(struct bloom_filter_map *skel) { struct bpf_link *link; @@ -91,16 +102,11 @@ static void test_inner_map(struct bloom_filter_map *skel, const __u32 *rand_vals __u32 nr_rand_vals) { int outer_map_fd, inner_map_fd, err, i, key = 0; - struct bpf_create_map_attr xattr = { - .name = "bloom_filter_inner_map", - .map_type = BPF_MAP_TYPE_BLOOM_FILTER, - .value_size = sizeof(__u32), - .max_entries = nr_rand_vals, - }; struct bpf_link *link; /* Create a bloom filter map that will be used as the inner map */ - inner_map_fd = bpf_create_map_xattr(&xattr); + inner_map_fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(*rand_vals), + nr_rand_vals, 0); if (!ASSERT_GE(inner_map_fd, 0, "bpf_create_map bloom filter inner map")) return; @@ -195,6 +201,7 @@ void test_bloom_filter_map(void) int err; test_fail_cases(); + test_success_cases(); err = setup_progs(&skel, &rand_vals, &nr_rand_vals); if (err) -- Gitee From 5973b00e02f9f14999ef7061d020896b92a765e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 31 Oct 2021 10:13:53 -0700 Subject: [PATCH 13/14] bpf: Add missing map_delete_elem method to bloom filter map ANBZ: #5320 commit ad10c381d133d9f786564bff4b223be1372f6c2a upstream. Without it, kernel crashes in map_delete_elem(), as reported by syzbot. BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 72c97067 P4D 72c97067 PUD 1e20c067 PMD 0 Oops: 0010 [#1] PREEMPT SMP KASAN CPU: 0 PID: 6518 Comm: syz-executor196 Not tainted 5.15.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:0x0 Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6. 
RSP: 0018:ffffc90002bafcb8 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: 1ffff92000575f9f RCX: 0000000000000000 RDX: 1ffffffff1327aba RSI: 0000000000000000 RDI: ffff888025a30c00 RBP: ffffc90002baff08 R08: 0000000000000000 R09: 0000000000000001 R10: ffffffff818525d8 R11: 0000000000000000 R12: ffffffff8993d560 R13: ffff888025a30c00 R14: ffff888024bc0000 R15: 0000000000000000 FS: 0000555557491300(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 0000000070189000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: map_delete_elem kernel/bpf/syscall.c:1220 [inline] __sys_bpf+0x34f1/0x5ee0 kernel/bpf/syscall.c:4606 __do_sys_bpf kernel/bpf/syscall.c:4719 [inline] __se_sys_bpf kernel/bpf/syscall.c:4717 [inline] __x64_sys_bpf+0x75/0xb0 kernel/bpf/syscall.c:4717 do_syscall_x64 arch/x86/entry/common.c:50 [inline] Fixes: 9330986c0300 ("bpf: Add bloom filter map implementation") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211031171353.4092388-1-eric.dumazet@gmail.com Signed-off-by: Tianchen Ding --- kernel/bpf/bloom_filter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 073c2f2cab8b..277a05e9c984 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -77,6 +77,11 @@ static int bloom_map_pop_elem(struct bpf_map *map, void *value) return -EOPNOTSUPP; } +static int bloom_map_delete_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) { u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits; @@ -192,6 +197,7 @@ const struct bpf_map_ops bloom_filter_map_ops = { .map_pop_elem = bloom_map_pop_elem, .map_lookup_elem = bloom_map_lookup_elem, .map_update_elem = bloom_map_update_elem, + .map_delete_elem = bloom_map_delete_elem, .map_check_btf = bloom_map_check_btf, .map_btf_name = "bpf_bloom_filter", .map_btf_id = &bpf_bloom_map_btf_id, -- Gitee From ad5ae32e0b7956d3413bdaa41cbb77de0859d5f1 Mon Sep 17 00:00:00 2001 From: Haimin Zhang Date: Wed, 29 Dec 2021 19:20:02 +0800 Subject: [PATCH 14/14] bpf: Add missing map_get_next_key method to bloom filter map. ANBZ: #5320 commit 3ccdcee28415c4226de05438b4d89eb5514edf73 upstream. Without it, kernel crashes in map_get_next_key(). 
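For context, the crash comes from the generic syscall handler, which calls the per-map callback without a NULL check; a simplified sketch of map_get_next_key() in kernel/bpf/syscall.c shows why a map type that leaves .map_get_next_key unset jumps to address 0:

	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);	/* NULL for bloom filter maps before this patch */
	rcu_read_unlock();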
Fixes: 9330986c0300 ("bpf: Add bloom filter map implementation") Reported-by: TCS Robot Signed-off-by: Haimin Zhang Signed-off-by: Alexei Starovoitov Acked-by: Joanne Koong Link: https://lore.kernel.org/bpf/1640776802-22421-1-git-send-email-tcs.kernel@gmail.com Signed-off-by: Tianchen Ding --- kernel/bpf/bloom_filter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 277a05e9c984..b141a1346f72 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -82,6 +82,11 @@ static int bloom_map_delete_elem(struct bpf_map *map, void *value) return -EOPNOTSUPP; } +static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EOPNOTSUPP; +} + static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) { u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits; @@ -192,6 +197,7 @@ const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = bloom_map_alloc, .map_free = bloom_map_free, + .map_get_next_key = bloom_map_get_next_key, .map_push_elem = bloom_map_push_elem, .map_peek_elem = bloom_map_peek_elem, .map_pop_elem = bloom_map_pop_elem, -- Gitee
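Taken together, a small userspace sketch (not part of any patch above) of what the series enables end to end: create a bloom filter map, add a value, and query membership. Bloom filter maps are keyless, so the key argument is always NULL; the map size (100) and the value (42) below are arbitrary.

	#include <unistd.h>
	#include <bpf/bpf.h>

	int bloom_demo(void)
	{
		__u32 value = 42;
		int fd, err;

		/* key_size must be 0: bloom filter maps are keyless */
		fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(value), 100, 0);
		if (fd < 0)
			return fd;

		/* Adding a value goes through the push/update path */
		err = bpf_map_update_elem(fd, NULL, &value, BPF_ANY);
		if (!err)
			/* Membership query goes through the peek/lookup path:
			 * 0 means "probably present", non-zero means definitely absent
			 */
			err = bpf_map_lookup_elem(fd, NULL, &value);

		close(fd);
		return err;
	}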