From 2b9e6b63d7d07aff031a42ea80ea526b56857839 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 15 Nov 2019 13:37:22 +0100 Subject: [PATCH 01/25] bpf: Support doubleword alignment in bpf_jit_binary_alloc ANBZ: #5530 commit b7b3fc8dd95bc02bd30680da258e09dda55270db upstream. Currently, passing an alignment greater than 4 to bpf_jit_binary_alloc does not work: in such cases it silently aligns only to 4 bytes. On s390, in order to load a constant from memory in a large (>512k) BPF program, one must use the lgrl instruction, whose memory operand must be aligned on an 8-byte boundary. This patch makes it possible to request 8-byte alignment from bpf_jit_binary_alloc, and also makes it issue a warning when an unsupported alignment is requested. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191115123722.58462-1-iii@linux.ibm.com Signed-off-by: Yuanhe Shu --- include/linux/filter.h | 6 ++++-- kernel/bpf/core.c | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 878e1614a379..8ce3993340f5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -538,10 +538,12 @@ struct sock_fprog_kern { struct sock_filter *filter; }; +/* Some arches need doubleword alignment for their instructions and/or data */ +#define BPF_IMAGE_ALIGNMENT 8 + struct bpf_binary_header { u32 pages; - /* Some arches need word alignment for their instructions */ - u8 image[] __aligned(4); + u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; struct bpf_prog { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e4e1962aed60..590e4aa2456d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -35,6 +35,7 @@ #include #include #include +#include <linux/log2.h> #include /* Registers */ @@ -819,6 +820,9 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, struct bpf_binary_header *hdr; u32 size, hole, start, pages; + WARN_ON_ONCE(!is_power_of_2(alignment) || + alignment > BPF_IMAGE_ALIGNMENT); + /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. -- Gitee From 8fc9bff32ecc6b2aa0815bca1322e3eb33481219 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:02 -0800 Subject: [PATCH 02/25] bpf: Switch bpf_map ref counter to atomic64_t so bpf_map_inc() never fails ANBZ: #5530 commit 1e0bd5a091e5d9e0f1d5b0e6329b87bb1792f784 upstream. 92117d8443bc ("bpf: fix refcnt overflow") turned refcounting of bpf_map into a potentially failing operation, when the refcount reaches the BPF_MAX_REFCNT limit (32k). Due to using a 32-bit counter, it's possible in practice to overflow the refcounter and make it wrap around to 0, causing an erroneous map free while there are still references to it, causing use-after-free problems. But failing refcounting operations are problematic in some cases. One example is the mmap() interface. After establishing an initial memory-mapping, the user is allowed to arbitrarily map/remap/unmap parts of the mapped memory, arbitrarily splitting it into multiple non-contiguous regions. All this happens without any control from the users of the mmap subsystem. Rather, the mmap subsystem sends notifications to the original creator of the memory mapping through open/close callbacks, which are optionally specified during initial memory mapping creation. These callbacks are used to maintain an accurate refcount for bpf_map (see next patch in this series).
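For orientation, these hooks have roughly the following shape (a minimal sketch of the vm_operations_struct callbacks involved, assuming <linux/mm.h>; the names are illustrative and this is not code added by this series -- the actual BPF map callbacks only appear two patches later):

  #include <linux/mm.h>

  /* Sketch only: illustrative names, not the eventual bpf_map_mmap_open/close. */
  static void sketch_mmap_open(struct vm_area_struct *vma)
  {
          /* a new VMA referencing the mapping was created */
  }

  static void sketch_mmap_close(struct vm_area_struct *vma)
  {
          /* called once for every VMA that is unmapped */
  }

  static const struct vm_operations_struct sketch_vmops = {
          .open  = sketch_mmap_open,
          .close = sketch_mmap_close,
  };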
The problem is that the open() callback is not supposed to fail, because by then the memory-mapped resource is set up and properly referenced. This poses a problem for using memory-mapping with BPF maps. One solution is to maintain a separate refcount for just memory-mappings and do a single bpf_map_inc/bpf_map_put when it goes from/to zero, respectively. There are similar use cases in current work on tcp-bpf, necessitating an extra counter as well. This seems like a rather unfortunate and ugly solution that doesn't scale well to various new use cases. Another approach is to use the non-failing refcount_t type, which uses a 32-bit counter internally but, once reaching the overflow state at UINT_MAX, stays there. This ultimately causes a memory leak, but prevents use-after-free. But given that refcounting is not the most performance-critical operation with BPF maps (it's not used from running BPF program code), we can also just switch to a 64-bit counter that can't overflow in practice, potentially disadvantaging 32-bit platforms a tiny bit. This simplifies the semantics and means the above-described scenarios need not worry about a failing refcount increment operation. In terms of struct bpf_map size, we are still good and use the same amount of space: BEFORE (3 cache lines, 8 bytes of padding at the end): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic_t refcnt __attribute__((__aligned__(64))); /* 128 4 */ atomic_t usercnt; /* 132 4 */ struct work_struct work; /* 136 32 */ char name[16]; /* 168 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 146, holes: 1, sum holes: 38 */ /* padding: 8 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); AFTER (same 3 cache lines, no extra padding now): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic64_t refcnt __attribute__((__aligned__(64))); /* 128 8 */ atomic64_t usercnt; /* 136 8 */ struct work_struct work; /* 144 32 */ char name[16]; /* 176 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 154, holes: 1, sum holes: 38 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); This patch, while modifying all users of bpf_map_inc, also cleans up
its interface to match bpf_map_put with separate operations for bpf_map_inc and bpf_map_inc_with_uref (to match bpf_map_put and bpf_map_put_with_uref, respectively). Also, given there are no users of bpf_map_inc_not_zero specifying uref=true, remove uref flag and default to uref=false internally. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191117172806.2195367-2-andriin@fb.com Signed-off-by: Yuanhe Shu --- .../net/ethernet/netronome/nfp/bpf/offload.c | 4 +- include/linux/bpf.h | 10 ++-- kernel/bpf/inode.c | 2 +- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/syscall.c | 51 ++++++++----------- kernel/bpf/verifier.c | 6 +-- kernel/bpf/xskmap.c | 6 +-- net/core/bpf_sk_storage.c | 2 +- 8 files changed, 34 insertions(+), 49 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 436311c7f4fe..be1d21f71de1 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -75,9 +75,7 @@ nfp_map_ptr_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, /* Grab a single ref to the map for our record. The prog destroy ndo * happens after free_used_maps(). */ - map = bpf_map_inc(map, false); - if (IS_ERR(map)) - return PTR_ERR(map); + bpf_map_inc(map); record = kmalloc(sizeof(*record), GFP_KERNEL); if (!record) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e4fa080b001..9df172ab2502 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -104,8 +104,8 @@ struct bpf_map { /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ - atomic_t refcnt ____cacheline_aligned; - atomic_t usercnt; + atomic64_t refcnt ____cacheline_aligned; + atomic64_t usercnt; struct work_struct work; char name[BPF_OBJ_NAME_LEN]; @@ -690,9 +690,9 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); -struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); -struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map, - bool uref); +void bpf_map_inc(struct bpf_map *map); +void bpf_map_inc_with_uref(struct bpf_map *map); +struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 84a80b02db99..783f2b7a5086 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -36,7 +36,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) raw = bpf_prog_inc(raw); break; case BPF_TYPE_MAP: - raw = bpf_map_inc(raw, true); + bpf_map_inc_with_uref(raw); break; default: WARN_ON_ONCE(1); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 3dff41403583..c145e765d1ac 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -101,7 +101,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, return inner_map; if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) - inner_map = bpf_map_inc(inner_map, false); + bpf_map_inc(inner_map); else inner_map = ERR_PTR(-EINVAL); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 41b23a16ee0a..ed97fe69b2d2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -323,7 +323,7 @@ static void bpf_map_free_deferred(struct work_struct *work) static 
void bpf_map_put_uref(struct bpf_map *map) { - if (atomic_dec_and_test(&map->usercnt)) { + if (atomic64_dec_and_test(&map->usercnt)) { if (map->ops->map_release_uref) map->ops->map_release_uref(map); } @@ -334,7 +334,7 @@ static void bpf_map_put_uref(struct bpf_map *map) */ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) { - if (atomic_dec_and_test(&map->refcnt)) { + if (atomic64_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map, do_idr_lock); btf_put(map->btf); @@ -587,8 +587,8 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map; - atomic_set(&map->refcnt, 1); - atomic_set(&map->usercnt, 1); + atomic64_set(&map->refcnt, 1); + atomic64_set(&map->usercnt, 1); if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; @@ -665,21 +665,19 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -/* prog's and map's refcnt limit */ -#define BPF_MAX_REFCNT 32768 - -struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) +void bpf_map_inc(struct bpf_map *map) { - if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) { - atomic_dec(&map->refcnt); - return ERR_PTR(-EBUSY); - } - if (uref) - atomic_inc(&map->usercnt); - return map; + atomic64_inc(&map->refcnt); } EXPORT_SYMBOL_GPL(bpf_map_inc); +void bpf_map_inc_with_uref(struct bpf_map *map) +{ + atomic64_inc(&map->refcnt); + atomic64_inc(&map->usercnt); +} +EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); + struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); @@ -689,38 +687,30 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) if (IS_ERR(map)) return map; - map = bpf_map_inc(map, true); + bpf_map_inc_with_uref(map); fdput(f); return map; } /* map_idr_lock should have been held */ -static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, - bool uref) +static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) { int refold; - refold = atomic_fetch_add_unless(&map->refcnt, 1, 0); - - if (refold >= BPF_MAX_REFCNT) { - __bpf_map_put(map, false); - return ERR_PTR(-EBUSY); - } - + refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); - if (uref) - atomic_inc(&map->usercnt); + atomic64_inc(&map->usercnt); return map; } -struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref) +struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { spin_lock_bh(&map_idr_lock); - map = __bpf_map_inc_not_zero(map, uref); + map = __bpf_map_inc_not_zero(map, false); spin_unlock_bh(&map_idr_lock); return map; @@ -1470,6 +1460,9 @@ static struct bpf_prog *____bpf_prog_get(struct fd f) return f.file->private_data; } +/* prog's refcnt limit */ +#define BPF_MAX_REFCNT 32768 + struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) { if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b6f0d846b7fb..82c1ea634504 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8366,11 +8366,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) * will be used by the valid program until it's unloaded * and all maps are released in free_used_maps() */ - map = bpf_map_inc(map, false); - if (IS_ERR(map)) { - fdput(f); - return PTR_ERR(map); - } + bpf_map_inc(map); aux->map_index = env->used_map_cnt; env->used_maps[env->used_map_cnt++] = map; diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 6a2f6f6dacd4..0e0188580cf5 100644 --- a/kernel/bpf/xskmap.c +++ 
b/kernel/bpf/xskmap.c @@ -11,10 +11,8 @@ int xsk_map_inc(struct xsk_map *map) { - struct bpf_map *m = &map->map; - - m = bpf_map_inc(m, false); - return PTR_ERR_OR_ZERO(m); + bpf_map_inc(&map->map); + return 0; } void xsk_map_put(struct xsk_map *map) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index aba3767e11d1..872fa42c87f0 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -799,7 +799,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) * Try to grab map refcnt to make sure that it's still * alive and prevent concurrent removal. */ - map = bpf_map_inc_not_zero(&smap->map, false); + map = bpf_map_inc_not_zero(&smap->map); if (IS_ERR(map)) continue; -- Gitee From 2c6ea713ce0784aec0e34b4202fb2fc39dd24f29 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:03 -0800 Subject: [PATCH 03/25] bpf: Convert bpf_prog refcnt to atomic64_t ANBZ: #5530 commit 85192dbf4de08795afe2b88e52a36fc6abfc3dba upstream. Similarly to bpf_map's refcnt/usercnt, convert bpf_prog's refcnt to atomic64 and remove the artificial 32k limit. This makes bpf_prog's refcounting non-failing, simplifying the logic of users of bpf_prog_add/bpf_prog_inc. Validated compilation by running an allyesconfig kernel build. [backport note] Ignore drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c as we don't have this driver. Suggested-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191117172806.2195367-3-andriin@fb.com Signed-off-by: Yuanhe Shu --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 ++---- .../net/ethernet/cavium/thunder/nicvf_main.c | 9 ++---- .../net/ethernet/mellanox/mlx4/en_netdev.c | 24 ++++----------- .../net/ethernet/mellanox/mlx5/core/en_main.c | 18 ++++------- drivers/net/ethernet/qlogic/qede/qede_main.c | 8 ++--- drivers/net/virtio_net.c | 7 ++--- include/linux/bpf.h | 13 ++++---- kernel/bpf/inode.c | 5 ++-- kernel/bpf/syscall.c | 30 ++++++------------- kernel/events/core.c | 7 ++--- 10 files changed, 38 insertions(+), 92 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 3f8beffa846d..9bacb868654e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2600,13 +2600,8 @@ static int bnxt_init_one_rx_ring(struct bnxt *bp, int ring_nr) bnxt_init_rxbd_pages(ring, type); if (BNXT_RX_PAGE_MODE(bp) && bp->xdp_prog) { - rxr->xdp_prog = bpf_prog_add(bp->xdp_prog, 1); - if (IS_ERR(rxr->xdp_prog)) { - int rc = PTR_ERR(rxr->xdp_prog); - - rxr->xdp_prog = NULL; - return rc; - } + bpf_prog_add(bp->xdp_prog, 1); + rxr->xdp_prog = bp->xdp_prog; } prod = rxr->rx_prod; for (i = 0; i < bp->rx_ring_size; i++) { diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index dca02b35c231..63d08d7887c5 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1844,13 +1844,8 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog) if (nic->xdp_prog) { /* Attach BPF program */ - nic->xdp_prog = bpf_prog_add(nic->xdp_prog, nic->rx_queues - 1); - if (!IS_ERR(nic->xdp_prog)) { - bpf_attached = true; - } else { - ret = PTR_ERR(nic->xdp_prog); - nic->xdp_prog = NULL; - } + bpf_prog_add(nic->xdp_prog, nic->rx_queues - 1); + bpf_attached = true; } /* Calculate Tx queues needed for XDP and network stack */ diff --git
a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 5868ec11db1a..b8e1a90a185d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2296,11 +2296,7 @@ int mlx4_en_try_alloc_resources(struct mlx4_en_priv *priv, lockdep_is_held(&priv->mdev->state_lock)); if (xdp_prog && carry_xdp_prog) { - xdp_prog = bpf_prog_add(xdp_prog, tmp->rx_ring_num); - if (IS_ERR(xdp_prog)) { - mlx4_en_free_resources(tmp); - return PTR_ERR(xdp_prog); - } + bpf_prog_add(xdp_prog, tmp->rx_ring_num); for (i = 0; i < tmp->rx_ring_num; i++) rcu_assign_pointer(tmp->rx_ring[i]->xdp_prog, xdp_prog); @@ -2808,11 +2804,9 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) * program for a new one. */ if (priv->tx_ring_num[TX_XDP] == xdp_ring_num) { - if (prog) { - prog = bpf_prog_add(prog, priv->rx_ring_num - 1); - if (IS_ERR(prog)) - return PTR_ERR(prog); - } + if (prog) + bpf_prog_add(prog, priv->rx_ring_num - 1); + mutex_lock(&mdev->state_lock); for (i = 0; i < priv->rx_ring_num; i++) { old_prog = rcu_dereference_protected( @@ -2833,13 +2827,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) if (!tmp) return -ENOMEM; - if (prog) { - prog = bpf_prog_add(prog, priv->rx_ring_num - 1); - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto out; - } - } + if (prog) + bpf_prog_add(prog, priv->rx_ring_num - 1); mutex_lock(&mdev->state_lock); memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile)); @@ -2888,7 +2877,6 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) unlock_out: mutex_unlock(&mdev->state_lock); -out: kfree(tmp); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 7e6706333fa8..dd995a0a9b15 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -499,12 +499,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); rq->stats = &c->priv->channel_stats[c->ix].rq; - rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL; - if (IS_ERR(rq->xdp_prog)) { - err = PTR_ERR(rq->xdp_prog); - rq->xdp_prog = NULL; - goto err_rq_wq_destroy; - } + if (params->xdp_prog) + bpf_prog_inc(params->xdp_prog); + rq->xdp_prog = params->xdp_prog; err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix); if (err < 0) @@ -4272,16 +4269,11 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) if (was_opened && reset) mlx5e_close_locked(netdev); - if (was_opened && !reset) { + if (was_opened && !reset) /* num_channels is invariant here, so we can take the * batched reference right upfront. */ - prog = bpf_prog_add(prog, priv->channels.num); - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto unlock; - } - } + bpf_prog_add(prog, priv->channels.num); /* exchange programs, extra prog reference we got from caller * as long as we don't fail from this point onwards. 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 630b13a9c3d5..f5af33226c0c 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -2010,12 +2010,8 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats) if (rc) goto out; - fp->rxq->xdp_prog = bpf_prog_add(edev->xdp_prog, 1); - if (IS_ERR(fp->rxq->xdp_prog)) { - rc = PTR_ERR(fp->rxq->xdp_prog); - fp->rxq->xdp_prog = NULL; - goto out; - } + bpf_prog_add(edev->xdp_prog, 1); + fp->rxq->xdp_prog = edev->xdp_prog; } if (fp->type & QEDE_FASTPATH_TX) { diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 504f21d10099..167455b7db8a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2608,11 +2608,8 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, if (!prog && !old_prog) return 0; - if (prog) { - prog = bpf_prog_add(prog, vi->max_queue_pairs - 1); - if (IS_ERR(prog)) - return PTR_ERR(prog); - } + if (prog) + bpf_prog_add(prog, vi->max_queue_pairs - 1); /* Make sure NAPI is not using any XDP TX queues for RX. */ if (netif_running(dev)) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9df172ab2502..5196a586ba68 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -393,7 +393,7 @@ struct bpf_prog_stats { } __aligned(2 * sizeof(u64)); struct bpf_prog_aux { - atomic_t refcnt; + atomic64_t refcnt; u32 used_map_cnt; u32 max_ctx_offset; u32 max_pkt_offset; @@ -677,9 +677,9 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv); -struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); +void bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); -struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); +void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); int __bpf_prog_charge(struct user_struct *user, u32 pages); @@ -816,10 +816,8 @@ static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, return ERR_PTR(-EOPNOTSUPP); } -static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, - int i) +static inline void bpf_prog_add(struct bpf_prog *prog, int i) { - return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_prog_sub(struct bpf_prog *prog, int i) @@ -830,9 +828,8 @@ static inline void bpf_prog_put(struct bpf_prog *prog) { } -static inline struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog) +static inline void bpf_prog_inc(struct bpf_prog *prog) { - return ERR_PTR(-EOPNOTSUPP); } static inline struct bpf_prog *__must_check diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 783f2b7a5086..49122de0b573 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -33,7 +33,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: - raw = bpf_prog_inc(raw); + bpf_prog_inc(raw); break; case BPF_TYPE_MAP: bpf_map_inc_with_uref(raw); @@ -536,7 +536,8 @@ static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type if (!bpf_prog_get_ok(prog, &type, false)) return ERR_PTR(-EINVAL); - return bpf_prog_inc(prog); + bpf_prog_inc(prog); + return prog; } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) diff 
--git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ed97fe69b2d2..ae1c71982198 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1354,7 +1354,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { - if (atomic_dec_and_test(&prog->aux->refcnt)) { + if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); @@ -1460,16 +1460,9 @@ static struct bpf_prog *____bpf_prog_get(struct fd f) return f.file->private_data; } -/* prog's refcnt limit */ -#define BPF_MAX_REFCNT 32768 - -struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) +void bpf_prog_add(struct bpf_prog *prog, int i) { - if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { - atomic_sub(i, &prog->aux->refcnt); - return ERR_PTR(-EBUSY); - } - return prog; + atomic64_add(i, &prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_add); @@ -1480,13 +1473,13 @@ void bpf_prog_sub(struct bpf_prog *prog, int i) * path holds a reference to the program, thus atomic_sub() can * be safely used in such cases! */ - WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0); + WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); } EXPORT_SYMBOL_GPL(bpf_prog_sub); -struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +void bpf_prog_inc(struct bpf_prog *prog) { - return bpf_prog_add(prog, 1); + atomic64_inc(&prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_inc); @@ -1495,12 +1488,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) { int refold; - refold = atomic_fetch_add_unless(&prog->aux->refcnt, 1, 0); - - if (refold >= BPF_MAX_REFCNT) { - __bpf_prog_put(prog, false); - return ERR_PTR(-EBUSY); - } + refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); @@ -1538,7 +1526,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, goto out; } - prog = bpf_prog_inc(prog); + bpf_prog_inc(prog); out: fdput(f); return prog; @@ -1718,7 +1706,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->orig_prog = NULL; prog->jited = 0; - atomic_set(&prog->aux->refcnt, 1); + atomic64_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 1 : 0; if (bpf_prog_is_dev_bound(prog->aux)) { diff --git a/kernel/events/core.c b/kernel/events/core.c index effff5703446..787d708eb1b2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10439,12 +10439,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) if (overflow_handler == bpf_overflow_handler) { - struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + struct bpf_prog *prog = parent_event->prog; - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto err_ns; - } + bpf_prog_inc(prog); event->prog = prog; event->orig_overflow_handler = parent_event->orig_overflow_handler; -- Gitee From 0fa64aef5800af45dfabf3169b72ada39167e7fb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:04 -0800 Subject: [PATCH 04/25] bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY ANBZ: #5530 commit fc9702273e2edb90400a34b3be76f7b08fa3344b upstream. Add ability to memory-map contents of BPF array map. This is extremely useful for working with BPF global data from userspace programs. 
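For illustration, user-space access then looks roughly like this (a sketch, not part of this patch; map_fd and map_size are assumed to refer to an already-created BPF_F_MMAPABLE array map and its page-aligned value area size):

  #include <stdint.h>
  #include <sys/mman.h>

  /* Sketch: plain loads/stores replace bpf_map_lookup/update_elem() calls. */
  static uint64_t *map_array_values(int map_fd, size_t map_size)
  {
          void *p = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
                         MAP_SHARED, map_fd, 0);

          return p == MAP_FAILED ? NULL : p;
  }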
Memory-mapping allows avoiding typical bpf_map_{lookup,update}_elem operations, improving both performance and usability. There had to be special considerations for map freezing, to avoid having a writable memory view into a frozen map. To solve this issue, map freezing and mmap-ing now happen under a mutex: - if map is already frozen, no writable mapping is allowed; - if map has writable memory mappings active (accounted in map->writecnt), map freezing will keep failing with -EBUSY; - once the number of writable memory mappings drops to zero, map freezing can be performed again. Only non-per-CPU plain arrays are supported right now. Maps with spinlocks can't be memory mapped either. For a BPF_F_MMAPABLE array, memory allocation has to be done through vmalloc() to be mmap()'able. We also need to make sure that array data memory is page-sized and page-aligned, so we over-allocate memory in such a way that struct bpf_array is at the end of a single page of memory with array->value being aligned with the start of the second page. On deallocation we need to accommodate this memory arrangement to free vmalloc()'ed memory correctly. One important consideration is how the memory-mapping subsystem functions: it provides a few optional callbacks, among them open() and close(). close() is called for each memory region that is unmapped, so that users can decrease their reference counters and free up resources, if necessary. open() is *almost* symmetrical: it's called for each memory region that is being mapped, **except** the very first one. So bpf_map_mmap does the initial refcnt bump, while open() will do any extra ones after that. Thus the number of close() calls is equal to the number of open() calls plus one more. [backport note] Include mutex.h to implement mutex. Change u64 size -> size_t size in bpf_map_area_alloc to fix a compilation error and prepare for ff1c08e1f74b ("bpf: Switch bpf_map_{area_alloc, area_mmapable_alloc}() to u64 size"). Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: John Fastabend Acked-by: Johannes Weiner Link: https://lore.kernel.org/bpf/20191117172806.2195367-4-andriin@fb.com Signed-off-by: Yuanhe Shu --- include/linux/bpf.h | 14 +++-- include/linux/vmalloc.h | 1 + include/uapi/linux/bpf.h | 3 ++ kernel/bpf/arraymap.c | 58 +++++++++++++++++--- kernel/bpf/syscall.c | 99 ++++++++++++++++++++++++++++++++-- mm/vmalloc.c | 20 +++++++ tools/include/uapi/linux/bpf.h | 3 ++ 7 files changed, 185 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5196a586ba68..1f8e3d548745 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -15,8 +15,10 @@ #include #include #include +#include <linux/mm_types.h> #include #include +#include <linux/mutex.h> struct bpf_verifier_env; struct bpf_verifier_log; @@ -69,6 +71,7 @@ struct bpf_map_ops { u64 *imm, u32 off); int (*map_direct_value_meta)(const struct bpf_map *map, u64 imm, u32 *off); + int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); }; struct bpf_map_memory { @@ -97,9 +100,10 @@ struct bpf_map { u32 btf_value_type_id; struct btf *btf; struct bpf_map_memory memory; + char name[BPF_OBJ_NAME_LEN]; bool unpriv_array; - bool frozen; /* write-once */ - /* 48 bytes hole */ + bool frozen; /* write-once; write-protected by freeze_mutex */ + /* 22 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting.
@@ -107,7 +111,8 @@ struct bpf_map { atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; struct work_struct work; - char name[BPF_OBJ_NAME_LEN]; + struct mutex freeze_mutex; + u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */ CK_HOTFIX_RESERVE(1) CK_HOTFIX_RESERVE(2) @@ -701,7 +706,8 @@ int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size); void bpf_map_charge_finish(struct bpf_map_memory *mem); void bpf_map_charge_move(struct bpf_map_memory *dst, struct bpf_map_memory *src); -void *bpf_map_area_alloc(u64 size, int numa_node); +void *bpf_map_area_alloc(size_t size, int numa_node); +void *bpf_map_area_mmapable_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 239571c69d6b..005172f6668a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -78,6 +78,7 @@ extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); extern void *vzalloc_node(unsigned long size, int node); +extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8a56b1163d5d..026bd8d9a243 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -346,6 +346,9 @@ enum bpf_attach_type { /* Clone map from listener for newly accepted socket */ #define BPF_F_CLONE (1U << 9) +/* Enable memory-mapping BPF map */ +#define BPF_F_MMAPABLE (1U << 10) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 488a630f7424..0f032c0f990d 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -22,7 +22,7 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) + (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -67,6 +67,10 @@ int array_map_alloc_check(union bpf_attr *attr) (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; + if (attr->map_type != BPF_MAP_TYPE_ARRAY && + attr->map_flags & BPF_F_MMAPABLE) + return -EINVAL; + if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. 
@@ -110,10 +114,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } array_size = sizeof(*array); - if (percpu) + if (percpu) { array_size += (u64) max_entries * sizeof(void *); - else - array_size += (u64) max_entries * elem_size; + } else { + /* rely on vmalloc() to return page-aligned memory and + * ensure array->value is exactly page-aligned + */ + if (attr->map_flags & BPF_F_MMAPABLE) { + array_size = PAGE_ALIGN(array_size); + array_size += PAGE_ALIGN((u64) max_entries * elem_size); + } else { + array_size += (u64) max_entries * elem_size; + } + } /* make sure there is no u32 overflow later in round_up() */ cost = array_size; @@ -125,7 +138,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ - array = bpf_map_area_alloc(array_size, numa_node); + if (attr->map_flags & BPF_F_MMAPABLE) { + void *data; + + /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ + data = bpf_map_area_mmapable_alloc(array_size, numa_node); + if (!data) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + array = data + PAGE_ALIGN(sizeof(struct bpf_array)) + - offsetof(struct bpf_array, value); + } else { + array = bpf_map_area_alloc(array_size, numa_node); + } if (!array) { bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); @@ -358,6 +384,11 @@ static int array_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } +static void *array_map_vmalloc_addr(struct bpf_array *array) +{ + return (void *)round_down((unsigned long)array, PAGE_SIZE); +} + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { @@ -373,7 +404,10 @@ static void array_map_free(struct bpf_map *map) if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); - bpf_map_area_free(array); + if (array->map.map_flags & BPF_F_MMAPABLE) + bpf_map_area_free(array_map_vmalloc_addr(array)); + else + bpf_map_area_free(array); } static void array_map_seq_show_elem(struct bpf_map *map, void *key, @@ -429,6 +463,17 @@ static int array_map_check_btf(const struct bpf_map *map, return 0; } +int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT; + + if (!(map->map_flags & BPF_F_MMAPABLE)) + return -EINVAL; + + return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff); +} + const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -440,6 +485,7 @@ const struct bpf_map_ops array_map_ops = { .map_gen_lookup = array_map_gen_lookup, .map_direct_value_addr = array_map_direct_value_addr, .map_direct_value_meta = array_map_direct_value_meta, + .map_mmap = array_map_mmap, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae1c71982198..62520c1a6bfc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -136,7 +136,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -void *bpf_map_area_alloc(u64 size, int numa_node) +static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, @@ -154,18 +154,33 @@ void *bpf_map_area_alloc(u64 
size, int numa_node) if (size >= SIZE_MAX) return NULL; - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + /* kmalloc()'ed memory can't be mmap()'ed */ + if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, numa_node); if (area != NULL) return area; } - + if (mmapable) { + BUG_ON(!PAGE_ALIGNED(size)); + return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | + __GFP_RETRY_MAYFAIL | flags); + } return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags, __builtin_return_address(0)); } +void *bpf_map_area_alloc(size_t size, int numa_node) +{ + return __bpf_map_area_alloc(size, numa_node, false); +} + +void *bpf_map_area_mmapable_alloc(size_t size, int numa_node) +{ + return __bpf_map_area_alloc(size, numa_node, true); +} + void bpf_map_area_free(void *area) { kvfree(area); @@ -437,6 +452,74 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, return -EINVAL; } +/* called for any extra memory-mapped regions (except initial) */ +static void bpf_map_mmap_open(struct vm_area_struct *vma) +{ + struct bpf_map *map = vma->vm_file->private_data; + + bpf_map_inc_with_uref(map); + + if (vma->vm_flags & VM_WRITE) { + mutex_lock(&map->freeze_mutex); + map->writecnt++; + mutex_unlock(&map->freeze_mutex); + } +} + +/* called for all unmapped memory region (including initial) */ +static void bpf_map_mmap_close(struct vm_area_struct *vma) +{ + struct bpf_map *map = vma->vm_file->private_data; + + if (vma->vm_flags & VM_WRITE) { + mutex_lock(&map->freeze_mutex); + map->writecnt--; + mutex_unlock(&map->freeze_mutex); + } + + bpf_map_put_with_uref(map); +} + +static const struct vm_operations_struct bpf_map_default_vmops = { + .open = bpf_map_mmap_open, + .close = bpf_map_mmap_close, +}; + +static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct bpf_map *map = filp->private_data; + int err; + + if (!map->ops->map_mmap || map_value_has_spin_lock(map)) + return -ENOTSUPP; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + mutex_lock(&map->freeze_mutex); + + if ((vma->vm_flags & VM_WRITE) && map->frozen) { + err = -EPERM; + goto out; + } + + /* set default open/close callbacks */ + vma->vm_ops = &bpf_map_default_vmops; + vma->vm_private_data = map; + + err = map->ops->map_mmap(map, vma); + if (err) + goto out; + + bpf_map_inc_with_uref(map); + + if (vma->vm_flags & VM_WRITE) + map->writecnt++; +out: + mutex_unlock(&map->freeze_mutex); + return err; +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -444,6 +527,7 @@ const struct file_operations bpf_map_fops = { .release = bpf_map_release, .read = bpf_dummy_read, .write = bpf_dummy_write, + .mmap = bpf_map_mmap, }; int bpf_map_new_fd(struct bpf_map *map, int flags) @@ -589,6 +673,7 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); + mutex_init(&map->freeze_mutex); if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; @@ -1179,6 +1264,13 @@ static int map_freeze(const union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + + mutex_lock(&map->freeze_mutex); + + if (map->writecnt) { + err = -EBUSY; + goto err_put; + } if (READ_ONCE(map->frozen)) { err = -EBUSY; goto err_put; @@ -1190,6 +1282,7 @@ static int map_freeze(const union bpf_attr *attr) WRITE_ONCE(map->frozen, true); err_put: + mutex_unlock(&map->freeze_mutex); 
fdput(f); return err; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 951dfa59dd3b..a8b972a70eda 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2419,6 +2419,26 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); +/** + * vmalloc_user_node_flags - allocate memory for userspace on a specific node + * @size: allocation size + * @node: numa node + * @flags: flags for the page level allocator + * + * The resulting memory area is zeroed so it can be mapped to userspace + * without leaking data. + * + * Return: pointer to the allocated memory or %NULL on error + */ +void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) +{ + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + flags | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, node, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_user_node_flags); + /** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9e4db9d972f4..7bf94cba7a9c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -346,6 +346,9 @@ enum bpf_attach_type { /* Clone map from listener for newly accepted socket */ #define BPF_F_CLONE (1U << 9) +/* Enable memory-mapping BPF map */ +#define BPF_F_MMAPABLE (1U << 10) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) -- Gitee From 684ecb71dcaf6ca966cb1e1554c2a7c1583a0f0d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:05 -0800 Subject: [PATCH 05/25] libbpf: Make global data internal arrays mmap()-able, if possible ANBZ: #5530 commit 7fe74b436236b17ac57e46527166d22bcc230175 upstream. Add detection of BPF_F_MMAPABLE flag support for arrays and add it as an extra flag to internal global data maps, if supported by kernel. This allows users to memory-map global data and use it without BPF map operations, greatly simplifying user experience. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20191117172806.2195367-5-andriin@fb.com Signed-off-by: Yuanhe Shu --- tools/lib/bpf/libbpf.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 88eee4162be5..c9ecce23b108 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -142,6 +142,8 @@ struct bpf_capabilities { __u32 btf_func:1; /* BTF_KIND_VAR and BTF_KIND_DATASEC support */ __u32 btf_datasec:1; + /* BPF_F_MMAPABLE is supported for arrays */ + __u32 array_mmap:1; /* BTF_FUNC_GLOBAL is supported */ __u32 btf_func_global:1; }; @@ -858,8 +860,6 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, pr_warn("failed to alloc map name\n"); return -ENOMEM; } - pr_debug("map '%s' (global data): at sec_idx %d, offset %zu.\n", - map_name, map->sec_idx, map->sec_offset); def = &map->def; def->type = BPF_MAP_TYPE_ARRAY; @@ -867,6 +867,12 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, def->value_size = data->d_size; def->max_entries = 1; def->map_flags = type == LIBBPF_MAP_RODATA ? 
BPF_F_RDONLY_PROG : 0; + if (obj->caps.array_mmap) + def->map_flags |= BPF_F_MMAPABLE; + + pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n", + map_name, map->sec_idx, map->sec_offset, def->map_flags); + if (data_buff) { *data_buff = malloc(data->d_size); if (!*data_buff) { @@ -2191,6 +2197,27 @@ static int bpf_object__probe_btf_datasec(struct bpf_object *obj) return 0; } +static int bpf_object__probe_array_mmap(struct bpf_object *obj) +{ + struct bpf_create_map_attr attr = { + .map_type = BPF_MAP_TYPE_ARRAY, + .map_flags = BPF_F_MMAPABLE, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1, + }; + int fd; + + fd = bpf_create_map_xattr(&attr); + if (fd >= 0) { + obj->caps.array_mmap = 1; + close(fd); + return 1; + } + + return 0; +} + static int bpf_object__probe_caps(struct bpf_object *obj) { @@ -2200,6 +2227,7 @@ bpf_object__probe_caps(struct bpf_object *obj) bpf_object__probe_btf_func, bpf_object__probe_btf_func_global, bpf_object__probe_btf_datasec, + bpf_object__probe_array_mmap, }; int i, ret; -- Gitee From ec8f0da36e8f3141101f5fc67e350632aa0d740a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:06 -0800 Subject: [PATCH 06/25] selftests/bpf: Add BPF_TYPE_MAP_ARRAY mmap() tests ANBZ: #5530 commit 5051b384523be92925d13694fabbc6bedf2f907b upstream. Add selftests validating mmap()-ing BPF array maps: both single-element and multi-element ones. Check that plain bpf_map_update_elem() and bpf_map_lookup_elem() work correctly with memory-mapped array. Also convert CO-RE relocation tests to use memory-mapped views of global data. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191117172806.2195367-6-andriin@fb.com Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/progs/test_mmap.c | 44 +++ tools/testing/selftests/bpf/test_progs.c | 257 ++++++++++++++++-- 2 files changed, 283 insertions(+), 18 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_mmap.c diff --git a/tools/testing/selftests/bpf/progs/test_mmap.c b/tools/testing/selftests/bpf/progs/test_mmap.c new file mode 100644 index 000000000000..0089b9dec005 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_mmap.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook + +#include <linux/bpf.h> +#include <stdint.h> +#include "bpf_helpers.h" + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 512 * 4); /* at least 4 pages of data */ + __uint(map_flags, BPF_F_MMAPABLE); + __type(key, __u32); + __type(value, __u64); +} data_map SEC(".maps"); + +static volatile __u64 in_val; +static volatile __u64 out_val; + +SEC("raw_tracepoint/sys_enter") +int test_mmap(void *ctx) +{ + int zero = 0, one = 1, two = 2, far = 1500; + __u64 val, *p; + + out_val = in_val; + + /* data_map[2] = in_val; */ + bpf_map_update_elem(&data_map, &two, (const void *)&in_val, 0); + + /* data_map[1] = data_map[0] * 2; */ + p = bpf_map_lookup_elem(&data_map, &zero); + if (p) { + val = (*p) * 2; + bpf_map_update_elem(&data_map, &one, &val, 0); + } + + /* data_map[far] = in_val * 3; */ + val = in_val * 3; + bpf_map_update_elem(&data_map, &far, &val, 0); + + return 0; +} diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 3fdde68a1490..6b7835c0bc63 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -3331,6 +3331,7 @@ static void
test_perf_buffer(void) } #include "progs/core_reloc_types.h" +#include <sys/mman.h> #define STRUCT_TO_CHAR_PTR(struct_name) (const char *)&(struct struct_name) @@ -3783,8 +3784,15 @@ struct data { char out[256]; }; +static size_t roundup_page(size_t sz) +{ + long page_size = sysconf(_SC_PAGE_SIZE); + return (sz + page_size - 1) / page_size * page_size; +} + static void test_core_reloc(void) { + const size_t mmap_sz = roundup_page(sizeof(struct data)); struct bpf_object_load_attr load_attr = {}; struct core_reloc_test_case *test_case; const char *tp_name, *probe_name; int err, duration = 0, i, equal; struct bpf_link *link = NULL; struct bpf_map *data_map; struct bpf_program *prog; struct bpf_object *obj; - const int zero = 0; - struct data data; + struct data *data; + void *mmap_data = NULL; for (i = 0; i < ARRAY_SIZE(test_cases); i++) { test_case = &test_cases[i]; @@ -3804,8 +3812,7 @@ static void test_core_reloc(void) ); obj = bpf_object__open_file(test_case->bpf_obj_file, &opts); - if (CHECK(IS_ERR_OR_NULL(obj), "obj_open", - "failed to open '%s': %ld\n", + if (CHECK(IS_ERR(obj), "obj_open", "failed to open '%s': %ld\n", test_case->bpf_obj_file, PTR_ERR(obj))) continue; @@ -3847,24 +3854,22 @@ static void test_core_reloc(void) if (CHECK(!data_map, "find_data_map", "data map not found\n")) goto cleanup; - memset(&data, 0, sizeof(data)); - memcpy(data.in, test_case->input, test_case->input_len); - - err = bpf_map_update_elem(bpf_map__fd(data_map), - &zero, &data, 0); - if (CHECK(err, "update_data_map", - "failed to update .data map: %d\n", err)) + mmap_data = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, + MAP_SHARED, bpf_map__fd(data_map), 0); + if (CHECK(mmap_data == MAP_FAILED, "mmap", + ".bss mmap failed: %d", errno)) { + mmap_data = NULL; goto cleanup; + } + data = mmap_data; + + memset(mmap_data, 0, sizeof(*data)); + memcpy(data->in, test_case->input, test_case->input_len); /* trigger test run */ usleep(1); - err = bpf_map_lookup_elem(bpf_map__fd(data_map), &zero, &data); - if (CHECK(err, "get_result", - "failed to get output data: %d\n", err)) - goto cleanup; - - equal = memcmp(data.out, test_case->output, + equal = memcmp(data->out, test_case->output, test_case->output_len) == 0; if (CHECK(!equal, "check_result", "input/output data don't match\n")) { @@ -3876,12 +3881,16 @@ static void test_core_reloc(void) } for (j = 0; j < test_case->output_len; j++) { printf("output byte #%d: EXP 0x%02hhx GOT 0x%02hhx\n", - j, test_case->output[j], data.out[j]); + j, test_case->output[j], data->out[j]); } goto cleanup; } cleanup: + if (mmap_data) { + CHECK_FAIL(munmap(mmap_data, mmap_sz)); + mmap_data = NULL; + } if (!IS_ERR_OR_NULL(link)) { bpf_link__destroy(link); link = NULL; @@ -4235,6 +4244,217 @@ static void test_pinning(void) bpf_object__close(obj); } +struct map_data { + __u64 val[512 * 4]; +}; + +struct bss_data { + __u64 in_val; + __u64 out_val; +}; + +static void test_mmap(void) +{ + const char *file = "test_mmap.o"; + const char *probe_name = "raw_tracepoint/sys_enter"; + const char *tp_name = "sys_enter"; + const size_t bss_sz = roundup_page(sizeof(struct bss_data)); + const size_t map_sz = roundup_page(sizeof(struct map_data)); + const int zero = 0, one = 1, two = 2, far = 1500; + const long page_size = sysconf(_SC_PAGE_SIZE); + int err, duration = 0, i, data_map_fd; + struct bpf_program *prog; + struct bpf_object *obj; + struct bpf_link *link = NULL; + struct bpf_map *data_map, *bss_map; + void *bss_mmaped = NULL, *map_mmaped = NULL, *tmp1, *tmp2; + volatile struct bss_data *bss_data; + volatile struct
map_data *map_data; + __u64 val = 0; + + obj = bpf_object__open_file("test_mmap.o", NULL); + if (CHECK(IS_ERR(obj), "obj_open", "failed to open '%s': %ld\n", + file, PTR_ERR(obj))) + return; + prog = bpf_object__find_program_by_title(obj, probe_name); + if (CHECK(!prog, "find_probe", "prog '%s' not found\n", probe_name)) + goto cleanup; + err = bpf_object__load(obj); + if (CHECK(err, "obj_load", "failed to load prog '%s': %d\n", + probe_name, err)) + goto cleanup; + + bss_map = bpf_object__find_map_by_name(obj, "test_mma.bss"); + if (CHECK(!bss_map, "find_bss_map", ".bss map not found\n")) + goto cleanup; + data_map = bpf_object__find_map_by_name(obj, "data_map"); + if (CHECK(!data_map, "find_data_map", "data_map map not found\n")) + goto cleanup; + data_map_fd = bpf_map__fd(data_map); + + bss_mmaped = mmap(NULL, bss_sz, PROT_READ | PROT_WRITE, MAP_SHARED, + bpf_map__fd(bss_map), 0); + if (CHECK(bss_mmaped == MAP_FAILED, "bss_mmap", + ".bss mmap failed: %d\n", errno)) { + bss_mmaped = NULL; + goto cleanup; + } + /* map as R/W first */ + map_mmaped = mmap(NULL, map_sz, PROT_READ | PROT_WRITE, MAP_SHARED, + data_map_fd, 0); + if (CHECK(map_mmaped == MAP_FAILED, "data_mmap", + "data_map mmap failed: %d\n", errno)) { + map_mmaped = NULL; + goto cleanup; + } + + bss_data = bss_mmaped; + map_data = map_mmaped; + + CHECK_FAIL(bss_data->in_val); + CHECK_FAIL(bss_data->out_val); + CHECK_FAIL(map_data->val[0]); + CHECK_FAIL(map_data->val[1]); + CHECK_FAIL(map_data->val[2]); + CHECK_FAIL(map_data->val[far]); + + link = bpf_program__attach_raw_tracepoint(prog, tp_name); + if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", PTR_ERR(link))) + goto cleanup; + + bss_data->in_val = 123; + val = 111; + CHECK_FAIL(bpf_map_update_elem(data_map_fd, &zero, &val, 0)); + + usleep(1); + + CHECK_FAIL(bss_data->in_val != 123); + CHECK_FAIL(bss_data->out_val != 123); + CHECK_FAIL(map_data->val[0] != 111); + CHECK_FAIL(map_data->val[1] != 222); + CHECK_FAIL(map_data->val[2] != 123); + CHECK_FAIL(map_data->val[far] != 3 * 123); + + CHECK_FAIL(bpf_map_lookup_elem(data_map_fd, &zero, &val)); + CHECK_FAIL(val != 111); + CHECK_FAIL(bpf_map_lookup_elem(data_map_fd, &one, &val)); + CHECK_FAIL(val != 222); + CHECK_FAIL(bpf_map_lookup_elem(data_map_fd, &two, &val)); + CHECK_FAIL(val != 123); + CHECK_FAIL(bpf_map_lookup_elem(data_map_fd, &far, &val)); + CHECK_FAIL(val != 3 * 123); + + /* data_map freeze should fail due to R/W mmap() */ + err = bpf_map_freeze(data_map_fd); + if (CHECK(!err || errno != EBUSY, "no_freeze", + "data_map freeze succeeded: err=%d, errno=%d\n", err, errno)) + goto cleanup; + + /* unmap R/W mapping */ + err = munmap(map_mmaped, map_sz); + map_mmaped = NULL; + if (CHECK(err, "data_map_munmap", "data_map munmap failed: %d\n", errno)) + goto cleanup; + + /* re-map as R/O now */ + map_mmaped = mmap(NULL, map_sz, PROT_READ, MAP_SHARED, data_map_fd, 0); + if (CHECK(map_mmaped == MAP_FAILED, "data_mmap", + "data_map R/O mmap failed: %d\n", errno)) { + map_mmaped = NULL; + goto cleanup; + } + map_data = map_mmaped; + + /* map/unmap in a loop to test ref counting */ + for (i = 0; i < 10; i++) { + int flags = i % 2 ? 
PROT_READ : PROT_WRITE; + void *p; + + p = mmap(NULL, map_sz, flags, MAP_SHARED, data_map_fd, 0); + if (CHECK_FAIL(p == MAP_FAILED)) + goto cleanup; + err = munmap(p, map_sz); + if (CHECK_FAIL(err)) + goto cleanup; + } + + /* data_map freeze should now succeed due to no R/W mapping */ + err = bpf_map_freeze(data_map_fd); + if (CHECK(err, "freeze", "data_map freeze failed: err=%d, errno=%d\n", + err, errno)) + goto cleanup; + + /* mapping as R/W now should fail */ + tmp1 = mmap(NULL, map_sz, PROT_READ | PROT_WRITE, MAP_SHARED, + data_map_fd, 0); + if (CHECK(tmp1 != MAP_FAILED, "data_mmap", "mmap succeeded\n")) { + munmap(tmp1, map_sz); + goto cleanup; + } + + bss_data->in_val = 321; + usleep(1); + CHECK_FAIL(bss_data->in_val != 321); + CHECK_FAIL(bss_data->out_val != 321); + CHECK_FAIL(map_data->val[0] != 111); + CHECK_FAIL(map_data->val[1] != 222); + CHECK_FAIL(map_data->val[2] != 321); + CHECK_FAIL(map_data->val[far] != 3 * 321); + + /* check some more advanced mmap() manipulations */ + + /* map all but last page: pages 1-3 mapped */ + tmp1 = mmap(NULL, 3 * page_size, PROT_READ, MAP_SHARED, + data_map_fd, 0); + if (CHECK(tmp1 == MAP_FAILED, "adv_mmap1", "errno %d\n", errno)) + goto cleanup; + + /* unmap second page: pages 1, 3 mapped */ + err = munmap(tmp1 + page_size, page_size); + if (CHECK(err, "adv_mmap2", "errno %d\n", errno)) { + munmap(tmp1, map_sz); + goto cleanup; + } + + /* map page 2 back */ + tmp2 = mmap(tmp1 + page_size, page_size, PROT_READ, + MAP_SHARED | MAP_FIXED, data_map_fd, 0); + if (CHECK(tmp2 == MAP_FAILED, "adv_mmap3", "errno %d\n", errno)) { + munmap(tmp1, page_size); + munmap(tmp1 + 2*page_size, page_size); + goto cleanup; + } + CHECK(tmp1 + page_size != tmp2, "adv_mmap4", + "tmp1: %p, tmp2: %p\n", tmp1, tmp2); + + /* re-map all 4 pages */ + tmp2 = mmap(tmp1, 4 * page_size, PROT_READ, MAP_SHARED | MAP_FIXED, + data_map_fd, 0); + if (CHECK(tmp2 == MAP_FAILED, "adv_mmap5", "errno %d\n", errno)) { + munmap(tmp1, 3 * page_size); /* unmap page 1 */ + goto cleanup; + } + CHECK(tmp1 != tmp2, "adv_mmap6", "tmp1: %p, tmp2: %p\n", tmp1, tmp2); + + map_data = tmp2; + CHECK_FAIL(bss_data->in_val != 321); + CHECK_FAIL(bss_data->out_val != 321); + CHECK_FAIL(map_data->val[0] != 111); + CHECK_FAIL(map_data->val[1] != 222); + CHECK_FAIL(map_data->val[2] != 321); + CHECK_FAIL(map_data->val[far] != 3 * 321); + + munmap(tmp2, 4 * page_size); +cleanup: + if (bss_mmaped) + CHECK_FAIL(munmap(bss_mmaped, bss_sz)); + if (map_mmaped) + CHECK_FAIL(munmap(map_mmaped, map_sz)); + if (!IS_ERR_OR_NULL(link)) + bpf_link__destroy(link); + bpf_object__close(obj); +} + int main(int ac, char **av) { srand(time(NULL)); @@ -4280,6 +4500,7 @@ int main(int ac, char **av) test_core_reloc(); test_kfree_skb(); test_pinning(); + test_mmap(); printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS; -- Gitee From b2dcab9f6bfe0fe0132b84117803c9145e2243eb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 23 Nov 2019 14:08:35 -0800 Subject: [PATCH 07/25] mm: Implement no-MMU variant of vmalloc_user_node_flags ANBZ: #5530 commit ed81745a4c96841937f1da35c0eb66ac312e1480 upstream. To fix build with !CONFIG_MMU, implement it for no-MMU configurations as well. 
Fixes: fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") Reported-by: kbuild test robot Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Johannes Weiner Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20191123220835.1237773-1-andriin@fb.com Signed-off-by: Yuanhe Shu --- mm/nommu.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 86e1da96bc13..8a7958f2b3b6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -242,11 +242,11 @@ void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) return __vmalloc(size, flags, PAGE_KERNEL); } -void *vmalloc_user(unsigned long size) +static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) { void *ret; - ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + ret = __vmalloc(size, flags, PAGE_KERNEL); if (ret) { struct vm_area_struct *vma; @@ -259,8 +259,19 @@ void *vmalloc_user(unsigned long size) return ret; } + +void *vmalloc_user(unsigned long size) +{ + return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO); +} EXPORT_SYMBOL(vmalloc_user); +void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) +{ + return __vmalloc_user_flags(size, flags | __GFP_ZERO); +} +EXPORT_SYMBOL(vmalloc_user_node_flags); + struct page *vmalloc_to_page(const void *addr) { return virt_to_page(addr); -- Gitee From 8c459e0216e8cf9073ead73a941eceebf8485c38 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 10 Apr 2020 13:26:12 -0700 Subject: [PATCH 08/25] bpf: Prevent re-mmap()'ing BPF map as writable for initially r/o mapping ANBZ: #5530 commit 1f6cb19be2e231fe092f40decb71f066eba090d7 upstream. The VM_MAYWRITE flag set during initial memory mapping determines whether already mmap()'ed pages can later be remapped as writable through an mprotect() call. To prevent a user application from rewriting the contents of a BPF map that was memory-mapped as read-only and subsequently frozen, remove the VM_MAYWRITE flag completely on an initially read-only mapping. Alternatively, we could treat any memory-mapping on an unfrozen map as writable and bump writecnt instead. But there is little legitimate reason to map a BPF map as read-only and then re-mmap() it as writable through mprotect(), instead of just mmap()'ing it as read/write from the very beginning. Also, at the suggestion of Jann Horn, drop unnecessary refcounting in the mmap operations. We can just rely on the VMA properly holding a reference to the BPF map's file.
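To illustrate what this closes off, a hedged user-space sketch (map_fd is assumed to be an mmap()-able BPF array map, map_sz its mappable size):

  /* Initially read-only mapping of a BPF map. */
  void *p = mmap(NULL, map_sz, PROT_READ, MAP_SHARED, map_fd, 0);
  /* ... map gets frozen via BPF_MAP_FREEZE ... */
  /* Previously VM_MAYWRITE stayed set, so this upgrade could succeed
   * and yield a writable view of a frozen map: */
  int err = mprotect(p, map_sz, PROT_READ | PROT_WRITE);
  /* With this patch, the initial r/o mmap() clears VM_MAYWRITE, so
   * the mprotect() above fails with EACCES. */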
Fixes: fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") Reported-by: Jann Horn Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Jann Horn Link: https://lore.kernel.org/bpf/20200410202613.3679837-1-andriin@fb.com Signed-off-by: Yuanhe Shu --- kernel/bpf/syscall.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 62520c1a6bfc..4d1ed414475f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -457,9 +457,7 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - bpf_map_inc_with_uref(map); - - if (vma->vm_flags & VM_WRITE) { + if (vma->vm_flags & VM_MAYWRITE) { mutex_lock(&map->freeze_mutex); map->writecnt++; mutex_unlock(&map->freeze_mutex); @@ -471,13 +469,11 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - if (vma->vm_flags & VM_WRITE) { + if (vma->vm_flags & VM_MAYWRITE) { mutex_lock(&map->freeze_mutex); map->writecnt--; mutex_unlock(&map->freeze_mutex); } - - bpf_map_put_with_uref(map); } static const struct vm_operations_struct bpf_map_default_vmops = { @@ -506,14 +502,16 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; + vma->vm_flags &= ~VM_MAYEXEC; + if (!(vma->vm_flags & VM_WRITE)) + /* disallow re-mapping with PROT_WRITE */ + vma->vm_flags &= ~VM_MAYWRITE; err = map->ops->map_mmap(map, vma); if (err) goto out; - bpf_map_inc_with_uref(map); - - if (vma->vm_flags & VM_WRITE) + if (vma->vm_flags & VM_MAYWRITE) map->writecnt++; out: mutex_unlock(&map->freeze_mutex); -- Gitee From 79b67c854d3f2911c6efedd51039de8354d00f84 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 12 May 2020 16:59:25 -0700 Subject: [PATCH 09/25] bpf: Fix bug in mmap() implementation for BPF array map ANBZ: #5530 commit 333291ce5055f2039afc907badaf5b66bc1adfdc upstream. The mmap() subsystem allows a user-space application to memory-map a region with an initial page offset. This wasn't taken into account in the initial implementation of BPF array memory-mapping, so the wrong pages, ignoring the requested page offset, were memory-mapped into user-space. This patch fixes this gap and adds a test for such a scenario.
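A short illustration of the scenario being fixed (map_fd and page_size are assumed from the selftest context):

  /* Request the *second* page of the array's data region. */
  void *p = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
                 map_fd, page_size /* pgoff = 1 page */);

Before this fix, remap_vmalloc_range() was handed only the internal struct bpf_array offset, so the mapping silently started at the first data page regardless of the requested offset; with the fix, vma->vm_pgoff is added in and out-of-bounds offsets are rejected with -EINVAL.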
Fixes: fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200512235925.3817805-1-andriin@fb.com Signed-off-by: Yuanhe Shu --- kernel/bpf/arraymap.c | 7 ++++++- tools/testing/selftests/bpf/test_progs.c | 9 +++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 0f032c0f990d..f60919ad984e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -471,7 +471,12 @@ int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) if (!(map->map_flags & BPF_F_MMAPABLE)) return -EINVAL; - return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff); + if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > + PAGE_ALIGN((u64)array->map.max_entries * array->elem_size)) + return -EINVAL; + + return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), + vma->vm_pgoff + pgoff); } const struct bpf_map_ops array_map_ops = { diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 6b7835c0bc63..a0d4d97cac55 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -4445,6 +4445,15 @@ static void test_mmap(void) CHECK_FAIL(map_data->val[far] != 3 * 321); munmap(tmp2, 4 * page_size); + + /* map all 4 pages, but with pg_off=1 page, should fail */ + tmp1 = mmap(NULL, 4 * page_size, PROT_READ, MAP_SHARED | MAP_FIXED, + data_map_fd, page_size /* initial page shift */); + if (CHECK(tmp1 != MAP_FAILED, "adv_mmap7", "unexpected success")) { + munmap(tmp1, 4 * page_size); + goto cleanup; + } + cleanup: if (bss_mmaped) CHECK_FAIL(munmap(bss_mmaped, bss_sz)); -- Gitee From 7c7dbb2f6a9ad6e0ff32422f167782cb22d45ef1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 18 May 2020 22:38:24 -0700 Subject: [PATCH 10/25] bpf: Prevent mmap()'ing read-only maps as writable ANBZ: #5530 commit dfeb376dd4cb2c5004aeb625e2475f58a5ff2ea7 upstream. As discussed in [0], it's dangerous to allow mapping a BPF map that is meant to be frozen and is read-only on the BPF program side, because that allows user-space to retain a writable view of the page even after it is frozen. This is exacerbated by the BPF verifier making a strong assumption that the contents of such a frozen map will remain unchanged. To prevent this, disallow mapping BPF_F_RDONLY_PROG mmap()'able BPF maps as writable, ever. [0] https://lore.kernel.org/bpf/CAEf4BzYGWYhXdp6BJ7_=9OQPJxQpgug080MMjdSB72i9R+5c6g@mail.gmail.com/ [backport note] Drop selftest changes as we have not introduced the earlier changes.
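A hedged sketch of the now-forbidden combination; the attr fields are abbreviated for illustration:

  union bpf_attr attr = {
          .map_type    = BPF_MAP_TYPE_ARRAY,
          .key_size    = 4,
          .value_size  = 8,
          .max_entries = 512,
          .map_flags   = BPF_F_MMAPABLE | BPF_F_RDONLY_PROG,
  };
  int fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
  /* With this patch, a writable mapping of a prog-read-only map is
   * rejected with EACCES, even before the map is frozen: */
  void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  /* p == MAP_FAILED, errno == EACCES */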
Fixes: fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") Suggested-by: Jann Horn Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Jann Horn Link: https://lore.kernel.org/bpf/20200519053824.1089415-1-andriin@fb.com Signed-off-by: Yuanhe Shu --- kernel/bpf/syscall.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4d1ed414475f..1baede09c2c9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -494,9 +494,20 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) mutex_lock(&map->freeze_mutex); - if ((vma->vm_flags & VM_WRITE) && map->frozen) { - err = -EPERM; - goto out; + if (vma->vm_flags & VM_WRITE) { + if (map->frozen) { + err = -EPERM; + goto out; + } + /* map is meant to be read-only, so do not allow mapping as + * writable, because it's possible to leak a writable page + * reference and allows user-space to still modify it after + * freezing, while verifier will assume contents do not change + */ + if (map->map_flags & BPF_F_RDONLY_PROG) { + err = -EACCES; + goto out; + } } /* set default open/close callbacks */ -- Gitee From b3b103356f03b73d6546b9478f2eaad0dd7f6dc6 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 19 Nov 2019 11:17:06 +0000 Subject: [PATCH 11/25] tools, bpftool: Fix warning on ignored return value for 'read' ANBZ: #5530 commit a0f17cc6665c80ab2765f9244c41ec127821f343 upstream. When building bpftool, a warning was introduced by commit a94364603610 ("bpftool: Allow to read btf as raw data"), because the return value from a call to 'read()' is ignored. Let's address it. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191119111706.22440-1-quentin.monnet@netronome.com Signed-off-by: Yuanhe Shu --- tools/bpf/bpftool/btf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 964f58ff44cb..7595f72373ae 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -428,15 +428,15 @@ static struct btf *btf__parse_raw(const char *file) static bool is_btf_raw(const char *file) { __u16 magic = 0; - int fd; + int fd, nb_read; fd = open(file, O_RDONLY); if (fd < 0) return false; - read(fd, &magic, sizeof(magic)); + nb_read = read(fd, &magic, sizeof(magic)); close(fd); - return magic == BTF_MAGIC; + return nb_read == sizeof(magic) && magic == BTF_MAGIC; } static int do_dump(int argc, char **argv) -- Gitee From 51d7e75a90fa302b3af2cf3998e1d7bdd778c6d3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Nov 2019 14:44:47 -0800 Subject: [PATCH 12/25] libbpf: Fix call relocation offset calculation bug ANBZ: #5530 commit a0d7da26ce86a25e97ae191cb90574ada6daea98 upstream. When relocating subprogram call, libbpf doesn't take into account relo->text_off, which comes from symbol's value. This generally works fine for subprograms implemented as static functions, but breaks for global functions. 
Taking a simplified test_pkt_access.c as an example: __attribute__ ((noinline)) static int test_pkt_access_subprog1(volatile struct __sk_buff *skb) { return skb->len * 2; } __attribute__ ((noinline)) static int test_pkt_access_subprog2(int val, volatile struct __sk_buff *skb) { return skb->len + val; } SEC("classifier/test_pkt_access") int test_pkt_access(struct __sk_buff *skb) { if (test_pkt_access_subprog1(skb) != skb->len * 2) return TC_ACT_SHOT; if (test_pkt_access_subprog2(2, skb) != skb->len + 2) return TC_ACT_SHOT; return TC_ACT_UNSPEC; } When compiled, we get two relocations, pointing to '.text' symbol. .text has st_value set to 0 (it points to the beginning of .text section): 0000000000000008 000000050000000a R_BPF_64_32 0000000000000000 .text 0000000000000040 000000050000000a R_BPF_64_32 0000000000000000 .text test_pkt_access_subprog1 and test_pkt_access_subprog2 offsets (targets of two calls) are encoded within call instruction's imm32 part as -1 and 2, respectively: 0000000000000000 test_pkt_access_subprog1: 0: 61 10 00 00 00 00 00 00 r0 = *(u32 *)(r1 + 0) 1: 64 00 00 00 01 00 00 00 w0 <<= 1 2: 95 00 00 00 00 00 00 00 exit 0000000000000018 test_pkt_access_subprog2: 3: 61 10 00 00 00 00 00 00 r0 = *(u32 *)(r1 + 0) 4: 04 00 00 00 02 00 00 00 w0 += 2 5: 95 00 00 00 00 00 00 00 exit 0000000000000000 test_pkt_access: 0: bf 16 00 00 00 00 00 00 r6 = r1 ===> 1: 85 10 00 00 ff ff ff ff call -1 2: bc 01 00 00 00 00 00 00 w1 = w0 3: b4 00 00 00 02 00 00 00 w0 = 2 4: 61 62 00 00 00 00 00 00 r2 = *(u32 *)(r6 + 0) 5: 64 02 00 00 01 00 00 00 w2 <<= 1 6: 5e 21 08 00 00 00 00 00 if w1 != w2 goto +8 7: bf 61 00 00 00 00 00 00 r1 = r6 ===> 8: 85 10 00 00 02 00 00 00 call 2 9: bc 01 00 00 00 00 00 00 w1 = w0 10: 61 62 00 00 00 00 00 00 r2 = *(u32 *)(r6 + 0) 11: 04 02 00 00 02 00 00 00 w2 += 2 12: b4 00 00 00 ff ff ff ff w0 = -1 13: 1e 21 01 00 00 00 00 00 if w1 == w2 goto +1 14: b4 00 00 00 02 00 00 00 w0 = 2 0000000000000078 LBB0_3: 15: 95 00 00 00 00 00 00 00 exit Now, if we compile example with global functions, the setup changes. 
Relocations are now against specifically test_pkt_access_subprog1 and test_pkt_access_subprog2 symbols, with test_pkt_access_subprog2 pointing 24 bytes into its respective section (.text), i.e., 3 instructions in: 0000000000000008 000000070000000a R_BPF_64_32 0000000000000000 test_pkt_access_subprog1 0000000000000048 000000080000000a R_BPF_64_32 0000000000000018 test_pkt_access_subprog2 Call instructions now encode offsets relative to function symbols and are both set to -1: 0000000000000000 test_pkt_access_subprog1: 0: 61 10 00 00 00 00 00 00 r0 = *(u32 *)(r1 + 0) 1: 64 00 00 00 01 00 00 00 w0 <<= 1 2: 95 00 00 00 00 00 00 00 exit 0000000000000018 test_pkt_access_subprog2: 3: 61 20 00 00 00 00 00 00 r0 = *(u32 *)(r2 + 0) 4: 0c 10 00 00 00 00 00 00 w0 += w1 5: 95 00 00 00 00 00 00 00 exit 0000000000000000 test_pkt_access: 0: bf 16 00 00 00 00 00 00 r6 = r1 ===> 1: 85 10 00 00 ff ff ff ff call -1 2: bc 01 00 00 00 00 00 00 w1 = w0 3: b4 00 00 00 02 00 00 00 w0 = 2 4: 61 62 00 00 00 00 00 00 r2 = *(u32 *)(r6 + 0) 5: 64 02 00 00 01 00 00 00 w2 <<= 1 6: 5e 21 09 00 00 00 00 00 if w1 != w2 goto +9 7: b4 01 00 00 02 00 00 00 w1 = 2 8: bf 62 00 00 00 00 00 00 r2 = r6 ===> 9: 85 10 00 00 ff ff ff ff call -1 10: bc 01 00 00 00 00 00 00 w1 = w0 11: 61 62 00 00 00 00 00 00 r2 = *(u32 *)(r6 + 0) 12: 04 02 00 00 02 00 00 00 w2 += 2 13: b4 00 00 00 ff ff ff ff w0 = -1 14: 1e 21 01 00 00 00 00 00 if w1 == w2 goto +1 15: b4 00 00 00 02 00 00 00 w0 = 2 0000000000000080 LBB2_3: 16: 95 00 00 00 00 00 00 00 exit Thus the right formula to calculate the target call offset after relocation should take into account the relocation's target symbol value (offset within the section), the call instruction's imm32 offset, and (subtracting, to get the relative instruction offset) the instruction index of the call instruction itself. All that is shifted by the number of instructions in the main program, given all sub-programs are copied over after the main program. Convert a few selftests relying on bpf-to-bpf calls to use global functions instead of static ones.
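To make the formula concrete, a worked example based on the listings above (assuming the 17 main-program instructions are laid out first and .text is appended after them, so main_prog_cnt = 17):

  /* Second call: insn_idx = 9, imm = -1, relocation against
   * test_pkt_access_subprog2 with st_value = 0x18 (3 insns in).
   *   new imm = imm + st_value/8 + main_prog_cnt - insn_idx
   *           = -1 + 3 + 17 - 9 = 10
   * Call target = insn_idx + new imm + 1 = 9 + 10 + 1 = 20
   *             = 17 (main) + 3 (subprog2's offset), as expected.
   */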
Fixes: 48cca7e44f9f ("libbpf: add support for bpf_call") Reported-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191119224447.3781271-1-andriin@fb.com Signed-off-by: Yuanhe Shu --- tools/lib/bpf/libbpf.c | 8 ++++++-- tools/testing/selftests/bpf/progs/test_btf_haskv.c | 4 ++-- tools/testing/selftests/bpf/progs/test_btf_newkv.c | 4 ++-- tools/testing/selftests/bpf/progs/test_btf_nokv.c | 4 ++-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c9ecce23b108..53f89d8eef74 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1880,9 +1880,13 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, pr_warn("incorrect bpf_call opcode\n"); return -LIBBPF_ERRNO__RELOC; } + if (sym.st_value % 8) { + pr_warn("bad call relo offset: %lu\n", sym.st_value); + return -LIBBPF_ERRNO__RELOC; + } prog->reloc_desc[i].type = RELO_CALL; prog->reloc_desc[i].insn_idx = insn_idx; - prog->reloc_desc[i].text_off = sym.st_value; + prog->reloc_desc[i].text_off = sym.st_value / 8; obj->has_pseudo_calls = true; continue; } @@ -3616,7 +3620,7 @@ bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj, prog->section_name); } insn = &prog->insns[relo->insn_idx]; - insn->imm += prog->main_prog_cnt - relo->insn_idx; + insn->imm += relo->text_off + prog->main_prog_cnt - relo->insn_idx; return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_btf_haskv.c b/tools/testing/selftests/bpf/progs/test_btf_haskv.c index 763c51447c19..62ad7e22105e 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_haskv.c +++ b/tools/testing/selftests/bpf/progs/test_btf_haskv.c @@ -26,7 +26,7 @@ struct dummy_tracepoint_args { }; __attribute__((noinline)) -static int test_long_fname_2(struct dummy_tracepoint_args *arg) +int test_long_fname_2(struct dummy_tracepoint_args *arg) { struct ipv_counts *counts; int key = 0; @@ -44,7 +44,7 @@ static int test_long_fname_2(struct dummy_tracepoint_args *arg) } __attribute__((noinline)) -static int test_long_fname_1(struct dummy_tracepoint_args *arg) +int test_long_fname_1(struct dummy_tracepoint_args *arg) { return test_long_fname_2(arg); } diff --git a/tools/testing/selftests/bpf/progs/test_btf_newkv.c b/tools/testing/selftests/bpf/progs/test_btf_newkv.c index 96f9e8451029..fb8d91a1dbe0 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_newkv.c +++ b/tools/testing/selftests/bpf/progs/test_btf_newkv.c @@ -34,7 +34,7 @@ struct dummy_tracepoint_args { }; __attribute__((noinline)) -static int test_long_fname_2(struct dummy_tracepoint_args *arg) +int test_long_fname_2(struct dummy_tracepoint_args *arg) { struct ipv_counts *counts; int key = 0; @@ -57,7 +57,7 @@ static int test_long_fname_2(struct dummy_tracepoint_args *arg) } __attribute__((noinline)) -static int test_long_fname_1(struct dummy_tracepoint_args *arg) +int test_long_fname_1(struct dummy_tracepoint_args *arg) { return test_long_fname_2(arg); } diff --git a/tools/testing/selftests/bpf/progs/test_btf_nokv.c b/tools/testing/selftests/bpf/progs/test_btf_nokv.c index 434188c37774..3f4422044759 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_nokv.c +++ b/tools/testing/selftests/bpf/progs/test_btf_nokv.c @@ -23,7 +23,7 @@ struct dummy_tracepoint_args { }; __attribute__((noinline)) -static int test_long_fname_2(struct dummy_tracepoint_args *arg) +int test_long_fname_2(struct dummy_tracepoint_args *arg) { struct ipv_counts 
*counts; int key = 0; @@ -41,7 +41,7 @@ static int test_long_fname_2(struct dummy_tracepoint_args *arg) } __attribute__((noinline)) -static int test_long_fname_1(struct dummy_tracepoint_args *arg) +int test_long_fname_1(struct dummy_tracepoint_args *arg) { return test_long_fname_2(arg); } -- Gitee From a26b08196ad0c8a36b0fc0154564a8825950c511 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 19 Nov 2019 22:21:13 +0800 Subject: [PATCH 13/25] bpf: Make array_map_mmap static ANBZ: #5530 commit b2e2f0e6a6f910c906c083584b6e0afd12266f22 upstream. Fix sparse warning: kernel/bpf/arraymap.c:481:5: warning: symbol 'array_map_mmap' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191119142113.15388-1-yuehaibing@huawei.com Signed-off-by: Yuanhe Shu --- kernel/bpf/arraymap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f60919ad984e..f621daad642a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -463,7 +463,7 @@ static int array_map_check_btf(const struct bpf_map *map, return 0; } -int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_array *array = container_of(map, struct bpf_array, map); pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT; -- Gitee From 7059cb0c63e746005a0d34c82e01444fbd09a510 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Nov 2019 23:04:44 +0100 Subject: [PATCH 14/25] bpf: Switch bpf_map_{area_alloc,area_mmapable_alloc}() to u64 size ANBZ: #5530 commit ff1c08e1f74b6864854c39be48aa799a6a2e4d2b upstream. Given we recently extended the original bpf_map_area_alloc() helper in commit fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY"), we need to apply the same logic as in ff1c08e1f74b ("bpf: Change size to u64 for bpf_map_{area_alloc, charge_init}()"). To avoid conflicts, extend it for bpf-next. 
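To see why the wider type matters, an illustrative (hypothetical) map definition on a 32-bit kernel:

  __u32 max_entries = 1U << 22;                  /* 4M entries      */
  __u32 value_size  = 2048;
  __u64 size = (__u64)max_entries * value_size;  /* 8 GiB, fits u64 */
  size_t truncated = (size_t)size;               /* 0 on 32-bit!    */

Passing size_t through the allocation path would silently truncate such a request; taking u64 lets the helper detect and reject oversized allocations instead.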
Reported-by: Stephen Rothwell Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/linux/bpf.h | 4 ++-- kernel/bpf/syscall.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1f8e3d548745..095bb8d2abee 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -706,8 +706,8 @@ int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size); void bpf_map_charge_finish(struct bpf_map_memory *mem); void bpf_map_charge_move(struct bpf_map_memory *dst, struct bpf_map_memory *src); -void *bpf_map_area_alloc(size_t size, int numa_node); -void *bpf_map_area_mmapable_alloc(size_t size, int numa_node); +void *bpf_map_area_alloc(u64 size, int numa_node); +void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1baede09c2c9..da608f46976f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -136,7 +136,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) +static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, @@ -171,12 +171,12 @@ static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) flags, __builtin_return_address(0)); } -void *bpf_map_area_alloc(size_t size, int numa_node) +void *bpf_map_area_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, false); } -void *bpf_map_area_mmapable_alloc(size_t size, int numa_node) +void *bpf_map_area_mmapable_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, true); } -- Gitee From dff8bf9c6f433069540cbf2b2f8b53a3f2c230ce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 20 Nov 2019 23:07:41 -0800 Subject: [PATCH 15/25] libbpf: Refactor relocation handling ANBZ: #5530 commit 1f8e2bcb2cd5ee1a731fb625a5438e2c305f6a7c upstream. Relocation handling code is convoluted and unnecessarily deeply nested. Split out per-relocation logic into a separate function. Also refactor the logic to be more of a sequence of per-relocation type checks and processing steps, making it simpler to follow control flow. This makes it easier to further extend it to new kinds of relocations (e.g., support for extern variables). This patch also makes relocation's section verification more robust. Previously, relocations against not yet supported externs were silently ignored because obj->efile.text_shndx was zero when all BPF programs had custom section names and there was no .text section. Also, invalid LDIMM64 relocations against non-map sections were passed through if they were pointing to a .text section (or 0, which is an invalid section). All these bugs are fixed within this refactoring and checks are made more appropriate for each type of relocation.
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121070743.1309473-3-andriin@fb.com Signed-off-by: Yuanhe Shu --- tools/lib/bpf/libbpf.c | 261 ++++++++++++++++++++++------------------- 1 file changed, 143 insertions(+), 118 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 53f89d8eef74..c43730a64f05 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -277,8 +277,8 @@ struct bpf_object { struct { GElf_Shdr shdr; Elf_Data *data; - } *reloc; - int nr_reloc; + } *reloc_sects; + int nr_reloc_sects; int maps_shndx; int btf_maps_shndx; int text_shndx; @@ -576,8 +576,8 @@ static void bpf_object__elf_finish(struct bpf_object *obj) obj->efile.rodata = NULL; obj->efile.bss = NULL; - zfree(&obj->efile.reloc); - obj->efile.nr_reloc = 0; + zfree(&obj->efile.reloc_sects); + obj->efile.nr_reloc_sects = 0; zclose(obj->efile.fd); obj->efile.obj_buf = NULL; obj->efile.obj_buf_sz = 0; @@ -1703,8 +1703,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps, pr_debug("skip section(%d) %s\n", idx, name); } } else if (sh.sh_type == SHT_REL) { - int nr_reloc = obj->efile.nr_reloc; - void *reloc = obj->efile.reloc; + int nr_sects = obj->efile.nr_reloc_sects; + void *sects = obj->efile.reloc_sects; int sec = sh.sh_info; /* points to other section */ /* Only do relo for section with exec instructions */ @@ -1714,18 +1714,18 @@ static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps, continue; } - reloc = reallocarray(reloc, nr_reloc + 1, - sizeof(*obj->efile.reloc)); - if (!reloc) { - pr_warn("realloc failed\n"); + sects = reallocarray(sects, nr_sects + 1, + sizeof(*obj->efile.reloc_sects)); + if (!sects) { + pr_warn("reloc_sects realloc failed\n"); return -ENOMEM; } - obj->efile.reloc = reloc; - obj->efile.nr_reloc++; + obj->efile.reloc_sects = sects; + obj->efile.nr_reloc_sects++; - obj->efile.reloc[nr_reloc].shdr = sh; - obj->efile.reloc[nr_reloc].data = data; + obj->efile.reloc_sects[nr_sects].shdr = sh; + obj->efile.reloc_sects[nr_sects].data = data; } else if (sh.sh_type == SHT_NOBITS && strcmp(name, ".bss") == 0) { obj->efile.bss = data; obj->efile.bss_shndx = idx; @@ -1790,14 +1790,6 @@ static bool bpf_object__shndx_is_maps(const struct bpf_object *obj, shndx == obj->efile.btf_maps_shndx; } -static bool bpf_object__relo_in_known_section(const struct bpf_object *obj, - int shndx) -{ - return shndx == obj->efile.text_shndx || - bpf_object__shndx_is_maps(obj, shndx) || - bpf_object__shndx_is_data(obj, shndx); -} - static enum libbpf_map_type bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) { @@ -1811,14 +1803,124 @@ bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) return LIBBPF_MAP_UNSPEC; } +static int bpf_program__record_reloc(struct bpf_program *prog, + struct reloc_desc *reloc_desc, + __u32 insn_idx, const char *name, + const GElf_Sym *sym, const GElf_Rel *rel) +{ + struct bpf_insn *insn = &prog->insns[insn_idx]; + size_t map_idx, nr_maps = prog->obj->nr_maps; + struct bpf_object *obj = prog->obj; + __u32 shdr_idx = sym->st_shndx; + enum libbpf_map_type type; + struct bpf_map *map; + + /* sub-program call relocation */ + if (insn->code == (BPF_JMP | BPF_CALL)) { + if (insn->src_reg != BPF_PSEUDO_CALL) { + pr_warn("incorrect bpf_call opcode\n"); + return -LIBBPF_ERRNO__RELOC; + } + /* text_shndx can be 0, if no default "main" program exists */ + if (!shdr_idx || shdr_idx != obj->efile.text_shndx) { + 
pr_warn("bad call relo against section %u\n", shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + if (sym->st_value % 8) { + pr_warn("bad call relo offset: %lu\n", sym->st_value); + return -LIBBPF_ERRNO__RELOC; + } + reloc_desc->type = RELO_CALL; + reloc_desc->insn_idx = insn_idx; + reloc_desc->text_off = sym->st_value / 8; + obj->has_pseudo_calls = true; + return 0; + } + + if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) { + pr_warn("bpf: relocation: invalid relo for insns[%d].code 0x%x\n", + insn_idx, insn->code); + return -LIBBPF_ERRNO__RELOC; + } + if (!shdr_idx || shdr_idx >= SHN_LORESERVE) { + pr_warn("relocation: not yet supported relo for non-static global \'%s\' variable in special section (0x%x) found in insns[%d].code 0x%x\n", + name, shdr_idx, insn_idx, insn->code); + return -LIBBPF_ERRNO__RELOC; + } + + type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); + + /* generic map reference relocation */ + if (type == LIBBPF_MAP_UNSPEC) { + if (!bpf_object__shndx_is_maps(obj, shdr_idx)) { + pr_warn("bad map relo against section %u\n", + shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + map = &obj->maps[map_idx]; + if (map->libbpf_type != type || + map->sec_idx != sym->st_shndx || + map->sec_offset != sym->st_value) + continue; + pr_debug("found map %zd (%s, sec %d, off %zu) for insn %u\n", + map_idx, map->name, map->sec_idx, + map->sec_offset, insn_idx); + break; + } + if (map_idx >= nr_maps) { + pr_warn("map relo failed to find map for sec %u, off %llu\n", + shdr_idx, (__u64)sym->st_value); + return -LIBBPF_ERRNO__RELOC; + } + reloc_desc->type = RELO_LD64; + reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = map_idx; + return 0; + } + + /* global data map relocation */ + if (!bpf_object__shndx_is_data(obj, shdr_idx)) { + pr_warn("bad data relo against section %u\n", shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + if (GELF_ST_BIND(sym->st_info) == STB_GLOBAL) { + pr_warn("relocation: not yet supported relo for non-static global \'%s\' variable found in insns[%d].code 0x%x\n", + name, insn_idx, insn->code); + return -LIBBPF_ERRNO__RELOC; + } + if (!obj->caps.global_data) { + pr_warn("relocation: kernel does not support global \'%s\' variable access in insns[%d]\n", + name, insn_idx); + return -LIBBPF_ERRNO__RELOC; + } + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + map = &obj->maps[map_idx]; + if (map->libbpf_type != type) + continue; + pr_debug("found data map %zd (%s, sec %d, off %zu) for insn %u\n", + map_idx, map->name, map->sec_idx, map->sec_offset, + insn_idx); + break; + } + if (map_idx >= nr_maps) { + pr_warn("data relo failed to find map for sec %u\n", + shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + + reloc_desc->type = RELO_DATA; + reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = map_idx; + return 0; +} + static int bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, Elf_Data *data, struct bpf_object *obj) { Elf_Data *symbols = obj->efile.symbols; - struct bpf_map *maps = obj->maps; - size_t nr_maps = obj->nr_maps; - int i, nrels; + int err, i, nrels; pr_debug("collecting relocating info for: '%s'\n", prog->section_name); nrels = shdr->sh_size / shdr->sh_entsize; @@ -1831,12 +1933,8 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, prog->nr_reloc = nrels; for (i = 0; i < nrels; i++) { - struct bpf_insn *insns = prog->insns; - enum libbpf_map_type type; - unsigned int insn_idx; - unsigned int shdr_idx; const char *name; - size_t map_idx; + __u32 insn_idx; 
GElf_Sym sym; GElf_Rel rel; @@ -1844,101 +1942,28 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, pr_warn("relocation: failed to get %d reloc\n", i); return -LIBBPF_ERRNO__FORMAT; } - if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) { pr_warn("relocation: symbol %"PRIx64" not found\n", GELF_R_SYM(rel.r_info)); return -LIBBPF_ERRNO__FORMAT; } + if (rel.r_offset % sizeof(struct bpf_insn)) + return -LIBBPF_ERRNO__FORMAT; + insn_idx = rel.r_offset / sizeof(struct bpf_insn); name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, sym.st_name) ? : ""; - pr_debug("relo for %lld value %lld name %d (\'%s\')\n", - (long long) (rel.r_info >> 32), - (long long) sym.st_value, sym.st_name, name); - - shdr_idx = sym.st_shndx; - insn_idx = rel.r_offset / sizeof(struct bpf_insn); - pr_debug("relocation: insn_idx=%u, shdr_idx=%u\n", - insn_idx, shdr_idx); - - if (shdr_idx >= SHN_LORESERVE) { - pr_warn("relocation: not yet supported relo for non-static global \'%s\' variable in special section (0x%x) found in insns[%d].code 0x%x\n", - name, shdr_idx, insn_idx, - insns[insn_idx].code); - return -LIBBPF_ERRNO__RELOC; - } - if (!bpf_object__relo_in_known_section(obj, shdr_idx)) { - pr_warn("Program '%s' contains unrecognized relo data pointing to section %u\n", - prog->section_name, shdr_idx); - return -LIBBPF_ERRNO__RELOC; - } - - if (insns[insn_idx].code == (BPF_JMP | BPF_CALL)) { - if (insns[insn_idx].src_reg != BPF_PSEUDO_CALL) { - pr_warn("incorrect bpf_call opcode\n"); - return -LIBBPF_ERRNO__RELOC; - } - if (sym.st_value % 8) { - pr_warn("bad call relo offset: %lu\n", sym.st_value); - return -LIBBPF_ERRNO__RELOC; - } - prog->reloc_desc[i].type = RELO_CALL; - prog->reloc_desc[i].insn_idx = insn_idx; - prog->reloc_desc[i].text_off = sym.st_value / 8; - obj->has_pseudo_calls = true; - continue; - } - - if (insns[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { - pr_warn("bpf: relocation: invalid relo for insns[%d].code 0x%x\n", - insn_idx, insns[insn_idx].code); - return -LIBBPF_ERRNO__RELOC; - } - - if (bpf_object__shndx_is_maps(obj, shdr_idx) || - bpf_object__shndx_is_data(obj, shdr_idx)) { - type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); - if (type != LIBBPF_MAP_UNSPEC) { - if (GELF_ST_BIND(sym.st_info) == STB_GLOBAL) { - pr_warn("bpf: relocation: not yet supported relo for non-static global \'%s\' variable found in insns[%d].code 0x%x\n", - name, insn_idx, insns[insn_idx].code); - return -LIBBPF_ERRNO__RELOC; - } - if (!obj->caps.global_data) { - pr_warn("bpf: relocation: kernel does not support global \'%s\' variable access in insns[%d]\n", - name, insn_idx); - return -LIBBPF_ERRNO__RELOC; - } - } - - for (map_idx = 0; map_idx < nr_maps; map_idx++) { - if (maps[map_idx].libbpf_type != type) - continue; - if (type != LIBBPF_MAP_UNSPEC || - (maps[map_idx].sec_idx == sym.st_shndx && - maps[map_idx].sec_offset == sym.st_value)) { - pr_debug("relocation: found map %zd (%s, sec_idx %d, offset %zu) for insn %u\n", - map_idx, maps[map_idx].name, - maps[map_idx].sec_idx, - maps[map_idx].sec_offset, - insn_idx); - break; - } - } - - if (map_idx >= nr_maps) { - pr_warn("bpf relocation: map_idx %d larger than %d\n", - (int)map_idx, (int)nr_maps - 1); - return -LIBBPF_ERRNO__RELOC; - } + pr_debug("relo for shdr %u, symb %llu, value %llu, type %d, bind %d, name %d (\'%s\'), insn %u\n", + (__u32)sym.st_shndx, (__u64)GELF_R_SYM(rel.r_info), + (__u64)sym.st_value, GELF_ST_TYPE(sym.st_info), + GELF_ST_BIND(sym.st_info), sym.st_name, name, + insn_idx); - 
prog->reloc_desc[i].type = type != LIBBPF_MAP_UNSPEC ? - RELO_DATA : RELO_LD64; - prog->reloc_desc[i].insn_idx = insn_idx; - prog->reloc_desc[i].map_idx = map_idx; - } + err = bpf_program__record_reloc(prog, &prog->reloc_desc[i], + insn_idx, name, &sym, &rel); + if (err) + return err; } return 0; } @@ -3714,9 +3739,9 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) return -LIBBPF_ERRNO__INTERNAL; } - for (i = 0; i < obj->efile.nr_reloc; i++) { - GElf_Shdr *shdr = &obj->efile.reloc[i].shdr; - Elf_Data *data = obj->efile.reloc[i].data; + for (i = 0; i < obj->efile.nr_reloc_sects; i++) { + GElf_Shdr *shdr = &obj->efile.reloc_sects[i].shdr; + Elf_Data *data = obj->efile.reloc_sects[i].data; int idx = shdr->sh_info; struct bpf_program *prog; -- Gitee From bdde0ac2203e10a6c98dd91cff25477781e94fe2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 20 Nov 2019 23:07:42 -0800 Subject: [PATCH 16/25] libbpf: Fix various errors and warning reported by checkpatch.pl ANBZ: #5530 commit 8983b731ceb42939acaa6158abcf8adb56f834bf upstream. Fix a bunch of warnings and errors reported by checkpatch.pl, to make it easier to spot new problems. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121070743.1309473-4-andriin@fb.com Signed-off-by: Yuanhe Shu --- tools/lib/bpf/libbpf.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c43730a64f05..555f68a16496 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -105,7 +105,7 @@ void libbpf_print(enum libbpf_print_level level, const char *format, ...) err = action; \ if (err) \ goto out; \ -} while(0) +} while (0) /* Copied from tools/perf/util/util.h */ @@ -966,8 +966,7 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) obj->path, nr_maps, data->d_size); if (!data->d_size || nr_maps == 0 || (data->d_size % nr_maps) != 0) { - pr_warn("unable to determine map definition size " - "section %s, %d maps in %zd bytes\n", + pr_warn("unable to determine map definition size section %s, %d maps in %zd bytes\n", obj->path, nr_maps, data->d_size); return -EINVAL; } @@ -1031,12 +1030,11 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) * incompatible. 
*/ char *b; + for (b = ((char *)def) + sizeof(struct bpf_map_def); b < ((char *)def) + map_def_sz; b++) { if (*b != 0) { - pr_warn("maps section in %s: \"%s\" " - "has unrecognized, non-zero " - "options\n", + pr_warn("maps section in %s: \"%s\" has unrecognized, non-zero options\n", obj->path, map_name); if (strict) return -EINVAL; @@ -1074,7 +1072,8 @@ skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) */ static bool get_map_field_int(const char *map_name, const struct btf *btf, const struct btf_type *def, - const struct btf_member *m, __u32 *res) { + const struct btf_member *m, __u32 *res) +{ const struct btf_type *t = skip_mods_and_typedefs(btf, m->type, NULL); const char *name = btf__name_by_offset(btf, m->name_off); const struct btf_array *arr_info; @@ -1388,7 +1387,8 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, for (i = 0; i < vlen; i++) { err = bpf_object__init_user_btf_map(obj, sec, i, obj->efile.btf_maps_shndx, - data, strict, pin_root_path); + data, strict, + pin_root_path); if (err) return err; } @@ -1683,12 +1683,14 @@ static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps, if (strcmp(name, ".text") == 0) obj->efile.text_shndx = idx; err = bpf_object__add_program(obj, data->d_buf, - data->d_size, name, idx); + data->d_size, + name, idx); if (err) { char errmsg[STRERR_BUFSIZE]; - char *cp = libbpf_strerror_r(-err, errmsg, - sizeof(errmsg)); + char *cp; + cp = libbpf_strerror_r(-err, errmsg, + sizeof(errmsg)); pr_warn("failed to alloc program %s (%s): %s", name, obj->path, cp); return err; @@ -1838,7 +1840,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, } if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) { - pr_warn("bpf: relocation: invalid relo for insns[%d].code 0x%x\n", + pr_warn("invalid relo for insns[%d].code 0x%x\n", insn_idx, insn->code); return -LIBBPF_ERRNO__RELOC; } @@ -2149,7 +2151,7 @@ bpf_object__probe_global_data(struct bpf_object *obj) static int bpf_object__probe_btf_func(struct bpf_object *obj) { - const char strs[] = "\0int\0x\0a"; + static const char strs[] = "\0int\0x\0a"; /* void x(int a) {} */ __u32 types[] = { /* int */ @@ -2201,7 +2203,7 @@ static int bpf_object__probe_btf_func_global(struct bpf_object *obj) static int bpf_object__probe_btf_datasec(struct bpf_object *obj) { - const char strs[] = "\0x\0.data"; + static const char strs[] = "\0x\0.data"; /* static int a; */ __u32 types[] = { /* int */ @@ -5142,7 +5144,7 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, *expected_attach_type = section_names[i].expected_attach_type; return 0; } - pr_warn("failed to guess program type based on ELF section name '%s'\n", name); + pr_warn("failed to guess program type from ELF section '%s'\n", name); type_names = libbpf_get_type_names(false); if (type_names != NULL) { pr_info("supported section(type) names are:%s\n", type_names); @@ -6309,7 +6311,8 @@ static struct bpf_prog_info_array_desc bpf_prog_info_array_desc[] = { }; -static __u32 bpf_prog_info_read_offset_u32(struct bpf_prog_info *info, int offset) +static __u32 bpf_prog_info_read_offset_u32(struct bpf_prog_info *info, + int offset) { __u32 *array = (__u32 *)info; @@ -6318,7 +6321,8 @@ static __u32 bpf_prog_info_read_offset_u32(struct bpf_prog_info *info, int offse return -(int)offset; } -static __u64 bpf_prog_info_read_offset_u64(struct bpf_prog_info *info, int offset) +static __u64 bpf_prog_info_read_offset_u64(struct bpf_prog_info *info, + int offset) { __u64 *array = (__u64 
*)info; -- Gitee From 6eae7cdb678cfa4931758eaf793dfe8bf3d19fc3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 20 Nov 2019 23:07:43 -0800 Subject: [PATCH 17/25] libbpf: Support initialized global variables ANBZ: #5530 commit 393cdfbee809891dc6ba859a44cc6441fa8dce9e upstream. Initialized global variables are no different in ELF from static variables, and don't require any extra support from libbpf. But they are matching semantics of global data (backed by BPF maps) more closely, preventing LLVM/Clang from aggressively inlining constant values and not requiring volatile incantations to prevent those. This patch enables global variables. It still disables uninitialized variables, which will be put into special COM (common) ELF section, because BPF doesn't allow uninitialized data to be accessed. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121070743.1309473-5-andriin@fb.com Signed-off-by: Yuanhe Shu --- tools/lib/bpf/libbpf.c | 9 ++------- .../testing/selftests/bpf/progs/test_core_reloc_arrays.c | 4 ++-- .../bpf/progs/test_core_reloc_bitfields_direct.c | 4 ++-- .../bpf/progs/test_core_reloc_bitfields_probed.c | 4 ++-- .../selftests/bpf/progs/test_core_reloc_existence.c | 4 ++-- .../selftests/bpf/progs/test_core_reloc_flavors.c | 4 ++-- tools/testing/selftests/bpf/progs/test_core_reloc_ints.c | 4 ++-- .../testing/selftests/bpf/progs/test_core_reloc_kernel.c | 4 ++-- tools/testing/selftests/bpf/progs/test_core_reloc_misc.c | 4 ++-- tools/testing/selftests/bpf/progs/test_core_reloc_mods.c | 4 ++-- .../selftests/bpf/progs/test_core_reloc_nesting.c | 4 ++-- .../selftests/bpf/progs/test_core_reloc_primitives.c | 4 ++-- .../selftests/bpf/progs/test_core_reloc_ptr_as_arr.c | 4 ++-- tools/testing/selftests/bpf/progs/test_core_reloc_size.c | 4 ++-- 14 files changed, 28 insertions(+), 33 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 555f68a16496..b5f29b80564b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1845,8 +1845,8 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return -LIBBPF_ERRNO__RELOC; } if (!shdr_idx || shdr_idx >= SHN_LORESERVE) { - pr_warn("relocation: not yet supported relo for non-static global \'%s\' variable in special section (0x%x) found in insns[%d].code 0x%x\n", - name, shdr_idx, insn_idx, insn->code); + pr_warn("invalid relo for \'%s\' in special section 0x%x; forgot to initialize global var?..\n", + name, shdr_idx); return -LIBBPF_ERRNO__RELOC; } @@ -1886,11 +1886,6 @@ static int bpf_program__record_reloc(struct bpf_program *prog, pr_warn("bad data relo against section %u\n", shdr_idx); return -LIBBPF_ERRNO__RELOC; } - if (GELF_ST_BIND(sym->st_info) == STB_GLOBAL) { - pr_warn("relocation: not yet supported relo for non-static global \'%s\' variable found in insns[%d].code 0x%x\n", - name, insn_idx, insn->code); - return -LIBBPF_ERRNO__RELOC; - } if (!obj->caps.global_data) { pr_warn("relocation: kernel does not support global \'%s\' variable access in insns[%d]\n", name, insn_idx); diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c b/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c index 402d746e78f8..00039b2ff855 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct 
core_reloc_arrays_output { int a2; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c index ce116698f5af..2618c5398561 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_bitfields { /* unsigned bitfields */ diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c index 7d0df1b2a334..5e40dc156ad2 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_bitfields { /* unsigned bitfields */ diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c b/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c index 29265201c125..3685efbb9be0 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_existence_output { int a_exists; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c b/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c index b4d0820561ac..0114e6cc4699 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_flavors { int a; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c b/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c index d34d576ae083..ecc0219a51f3 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_ints { uint8_t u8_field; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c index 13b75f68f01d..ee135803ea7d 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_kernel_output { int valid[10]; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c b/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c index 69c2760828de..2a032c147b34 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char 
out[256]; -} data; +} data = {}; struct core_reloc_misc_output { int a, b, c; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c b/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c index c6fc95a40b7c..0811d88c95d9 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_mods_output { int a, b, c, d, e, f, g, h; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c b/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c index ed3ae46d609c..83506fc0cc36 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_nesting_substruct { int a; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c b/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c index bfb49b1b1b9a..c70b2d1512d5 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; enum core_reloc_primitives_enum { A = 0, diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c b/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c index 281fc7832e82..746b1be6f5ca 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_ptr_as_arr { int a; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c index e7fbd145a211..07dac4ab2b46 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_size_output { int int_sz; -- Gitee From 09a70e6bd2972526626ea4881065a58253523a5b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 20 Nov 2019 23:07:40 -0800 Subject: [PATCH 18/25] selftests/bpf: Ensure no DWARF relocations for BPF object files ANBZ: #5530 commit ffc88174cdcf5f51fb7f6298fe9203a36c904f1f upstream. Add -mattr=dwarfris attribute to llc to avoid having relocations against DWARF data. These relocations make it impossible to inspect DWARF contents: all strings are invalid. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121070743.1309473-2-andriin@fb.com Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e86772886a44..b35364d5fcb4 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -109,7 +109,7 @@ force: $(BPFOBJ): force $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ -PROBE := $(shell $(LLC) -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1) +PROBE := $(shell $(LLC) -mattr=dwarfris -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1) # Let newer LLVM versions transparently probe the kernel for availability # of full BPF instruction set. -- Gitee From f153adfe893acaf8b48ef2834c68be67866c1740 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 27 Nov 2019 12:06:50 -0800 Subject: [PATCH 19/25] libbpf: Fix global variable relocation ANBZ: #5530 commit 53f8dd434b6fe666b1c4e0be80a8727e8fa9839f upstream. Similarly to a0d7da26ce86 ("libbpf: Fix call relocation offset calculation bug"), relocations against global variables need to take into account the referenced symbol's st_value, which holds the offset into a corresponding data section (and, subsequently, the offset into the internal backing map). For static variables this offset is always zero and the data offset is completely described by the respective instruction's imm field. Convert a bunch of selftests to global variables. Previously they were relying on the `static volatile` trick to ensure Clang doesn't inline static variables, which with global variables is not necessary anymore. [backport note] Drop fentry/fexit testcase as we haven't introduced it.
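A hypothetical example of why st_value now matters:

  /* Two initialized globals, both backed by the same .data map: */
  __u64 first  = 1;   /* symbol st_value = 0 within .data */
  __u64 second = 2;   /* symbol st_value = 8 within .data */

A load of 'second' relocates against the 'second' symbol itself, so the final ld_imm64 offset must be insn->imm plus the symbol's st_value (8 here); for static variables st_value is always 0, which is why the bug went unnoticed.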
Fixes: 393cdfbee809 ("libbpf: Support initialized global variables") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191127200651.1381348-1-andriin@fb.com Signed-off-by: Yuanhe Shu --- tools/lib/bpf/libbpf.c | 43 ++++++++----------- tools/testing/selftests/bpf/progs/test_mmap.c | 4 +- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b5f29b80564b..f0620fe2dd85 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -173,10 +173,8 @@ struct bpf_program { RELO_DATA, } type; int insn_idx; - union { - int map_idx; - int text_off; - }; + int map_idx; + int sym_off; } *reloc_desc; int nr_reloc; int log_level; @@ -1834,7 +1832,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, } reloc_desc->type = RELO_CALL; reloc_desc->insn_idx = insn_idx; - reloc_desc->text_off = sym->st_value / 8; + reloc_desc->sym_off = sym->st_value; obj->has_pseudo_calls = true; return 0; } @@ -1878,6 +1876,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, reloc_desc->type = RELO_LD64; reloc_desc->insn_idx = insn_idx; reloc_desc->map_idx = map_idx; + reloc_desc->sym_off = 0; /* sym->st_value determines map_idx */ return 0; } @@ -1909,6 +1908,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, reloc_desc->type = RELO_DATA; reloc_desc->insn_idx = insn_idx; reloc_desc->map_idx = map_idx; + reloc_desc->sym_off = sym->st_value; return 0; } @@ -3606,8 +3606,8 @@ bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj, return -LIBBPF_ERRNO__RELOC; if (prog->idx == obj->efile.text_shndx) { - pr_warn("relo in .text insn %d into off %d\n", - relo->insn_idx, relo->text_off); + pr_warn("relo in .text insn %d into off %d (insn #%d)\n", + relo->insn_idx, relo->sym_off, relo->sym_off / 8); return -LIBBPF_ERRNO__RELOC; } @@ -3642,7 +3642,7 @@ bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj, prog->section_name); } insn = &prog->insns[relo->insn_idx]; - insn->imm += relo->text_off + prog->main_prog_cnt - relo->insn_idx; + insn->imm += relo->sym_off / 8 + prog->main_prog_cnt - relo->insn_idx; return 0; } @@ -3665,31 +3665,26 @@ bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj) return 0; for (i = 0; i < prog->nr_reloc; i++) { - if (prog->reloc_desc[i].type == RELO_LD64 || - prog->reloc_desc[i].type == RELO_DATA) { - bool relo_data = prog->reloc_desc[i].type == RELO_DATA; - struct bpf_insn *insns = prog->insns; - int insn_idx, map_idx; + struct reloc_desc *relo = &prog->reloc_desc[i]; - insn_idx = prog->reloc_desc[i].insn_idx; - map_idx = prog->reloc_desc[i].map_idx; + if (relo->type == RELO_LD64 || relo->type == RELO_DATA) { + struct bpf_insn *insn = &prog->insns[relo->insn_idx]; - if (insn_idx + 1 >= (int)prog->insns_cnt) { + if (relo->insn_idx + 1 >= (int)prog->insns_cnt) { pr_warn("relocation out of range: '%s'\n", prog->section_name); return -LIBBPF_ERRNO__RELOC; } - if (!relo_data) { - insns[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; + if (relo->type != RELO_DATA) { + insn[0].src_reg = BPF_PSEUDO_MAP_FD; } else { - insns[insn_idx].src_reg = BPF_PSEUDO_MAP_VALUE; - insns[insn_idx + 1].imm = insns[insn_idx].imm; + insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; + insn[1].imm = insn[0].imm + relo->sym_off; } - insns[insn_idx].imm = obj->maps[map_idx].fd; - } else if (prog->reloc_desc[i].type == RELO_CALL) { - err = bpf_program__reloc_text(prog, obj, - &prog->reloc_desc[i]); + 
insn[0].imm = obj->maps[relo->map_idx].fd; + } else if (relo->type == RELO_CALL) { + err = bpf_program__reloc_text(prog, obj, relo); if (err) return err; } diff --git a/tools/testing/selftests/bpf/progs/test_mmap.c b/tools/testing/selftests/bpf/progs/test_mmap.c index 0089b9dec005..e285f126cd4e 100644 --- a/tools/testing/selftests/bpf/progs/test_mmap.c +++ b/tools/testing/selftests/bpf/progs/test_mmap.c @@ -15,8 +15,8 @@ struct { __type(value, __u64); } data_map SEC(".maps"); -static volatile __u64 in_val; -static volatile __u64 out_val; +__u64 in_val = 0; +__u64 out_val = 0; SEC("raw_tracepoint/sys_enter") int test_mmap(void *ctx) -- Gitee From 812f560ccdd0346f8f8ba0db8bd4f70b00fded55 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 27 Nov 2019 17:46:56 -0800 Subject: [PATCH 20/25] libbpf: Fix sym->st_value print on 32-bit arches ANBZ: #5530 commit 7c3977d1e80401b1a25efded698b05d60ee26e31 upstream. The st_value field is a 64-bit value and causes this error on 32-bit arches: In file included from libbpf.c:52: libbpf.c: In function 'bpf_program__record_reloc': libbpf_internal.h:59:22: error: format '%lu' expects argument of type 'long unsigned int', but argument 3 has type 'Elf64_Addr' {aka 'const long long unsigned int'} [-Werror=format=] Fix it with a (__u64) cast. Fixes: 1f8e2bcb2cd5 ("libbpf: Refactor relocation handling") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index f0620fe2dd85..599d30b6092a 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1827,7 +1827,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return -LIBBPF_ERRNO__RELOC; } if (sym->st_value % 8) { - pr_warn("bad call relo offset: %lu\n", sym->st_value); + pr_warn("bad call relo offset: %llu\n", (__u64)sym->st_value); return -LIBBPF_ERRNO__RELOC; } reloc_desc->type = RELO_CALL; -- Gitee From b6200ac686022a1819270f34c4dca6019c016cc2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 12 Dec 2019 09:19:18 -0800 Subject: [PATCH 21/25] libbpf: Fix printf compilation warnings on ppc64le arch ANBZ: #5530 commit 679152d3a32e305c213f83160c328c37566ae8bc upstream. On ppc64le, __u64 and __s64 are defined as unsigned long int and long int, respectively. This causes the compiler to emit a warning when %lld/%llu are used to printf 64-bit numbers. Fix this by casting to size_t/ssize_t with the %zu and %zd format specifiers, respectively. v1->v2: - use size_t/ssize_t instead of custom typedefs (Martin).
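The portable pattern adopted below, sketched for illustration:

  /* size_t is 64-bit on the affected 64-bit targets, and %zu matches
   * it everywhere, regardless of how __u64 is spelled: */
  printf("off %zu\n", (size_t)sym.st_value);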
Fixes: 1f8e2bcb2cd5 ("libbpf: Refactor relocation handling") Fixes: abd29c931459 ("libbpf: allow specifying map definitions using BTF") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191212171918.638010-1-andriin@fb.com Signed-off-by: Yuanhe Shu --- tools/lib/bpf/libbpf.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 599d30b6092a..3086365eafa2 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1243,15 +1243,15 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, } sz = btf__resolve_size(obj->btf, t->type); if (sz < 0) { - pr_warn("map '%s': can't determine key size for type [%u]: %lld.\n", - map_name, t->type, sz); + pr_warn("map '%s': can't determine key size for type [%u]: %zd.\n", + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found key [%u], sz = %lld.\n", - map_name, t->type, sz); + pr_debug("map '%s': found key [%u], sz = %zd.\n", + map_name, t->type, (ssize_t)sz); if (map->def.key_size && map->def.key_size != sz) { - pr_warn("map '%s': conflicting key size %u != %lld.\n", - map_name, map->def.key_size, sz); + pr_warn("map '%s': conflicting key size %u != %zd.\n", + map_name, map->def.key_size, (ssize_t)sz); return -EINVAL; } map->def.key_size = sz; @@ -1286,15 +1286,15 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, } sz = btf__resolve_size(obj->btf, t->type); if (sz < 0) { - pr_warn("map '%s': can't determine value size for type [%u]: %lld.\n", - map_name, t->type, sz); + pr_warn("map '%s': can't determine value size for type [%u]: %zd.\n", + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found value [%u], sz = %lld.\n", - map_name, t->type, sz); + pr_debug("map '%s': found value [%u], sz = %zd.\n", + map_name, t->type, (ssize_t)sz); if (map->def.value_size && map->def.value_size != sz) { - pr_warn("map '%s': conflicting value size %u != %lld.\n", - map_name, map->def.value_size, sz); + pr_warn("map '%s': conflicting value size %u != %zd.\n", + map_name, map->def.value_size, (ssize_t)sz); return -EINVAL; } map->def.value_size = sz; @@ -1827,7 +1827,8 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return -LIBBPF_ERRNO__RELOC; } if (sym->st_value % 8) { - pr_warn("bad call relo offset: %llu\n", (__u64)sym->st_value); + pr_warn("bad call relo offset: %zu\n", + (size_t)sym->st_value); return -LIBBPF_ERRNO__RELOC; } reloc_desc->type = RELO_CALL; @@ -1869,8 +1870,8 @@ static int bpf_program__record_reloc(struct bpf_program *prog, break; } if (map_idx >= nr_maps) { - pr_warn("map relo failed to find map for sec %u, off %llu\n", - shdr_idx, (__u64)sym->st_value); + pr_warn("map relo failed to find map for sec %u, off %zu\n", + shdr_idx, (size_t)sym->st_value); return -LIBBPF_ERRNO__RELOC; } reloc_desc->type = RELO_LD64; @@ -1951,9 +1952,9 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, sym.st_name) ? 
: ""; - pr_debug("relo for shdr %u, symb %llu, value %llu, type %d, bind %d, name %d (\'%s\'), insn %u\n", - (__u32)sym.st_shndx, (__u64)GELF_R_SYM(rel.r_info), - (__u64)sym.st_value, GELF_ST_TYPE(sym.st_info), + pr_debug("relo for shdr %u, symb %zu, value %zu, type %d, bind %d, name %d (\'%s\'), insn %u\n", + (__u32)sym.st_shndx, (size_t)GELF_R_SYM(rel.r_info), + (size_t)sym.st_value, GELF_ST_TYPE(sym.st_info), GELF_ST_BIND(sym.st_info), sym.st_name, name, insn_idx); -- Gitee From f878e9c3167e39bea3ed93f23e6bd498987c7ea4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Nov 2019 09:06:50 -0800 Subject: [PATCH 22/25] bpf: Provide better register bounds after jmp32 instructions ANBZ: #5530 commit 581738a681b6faae5725c2555439189ca81c0f1f upstream. With latest llvm (trunk https://github.com/llvm/llvm-project), test_progs, which has +alu32 enabled, failed for strobemeta.o. The verifier output below has been edited to replace large decimal numbers with hex ones. 193: (85) call bpf_probe_read_user_str#114 R0=inv(id=0) 194: (26) if w0 > 0x1 goto pc+4 R0_w=inv(id=0,umax_value=0xffffffff00000001) 195: (6b) *(u16 *)(r7 +80) = r0 196: (bc) w6 = w0 R6_w=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) 197: (67) r6 <<= 32 R6_w=inv(id=0,smax_value=0x7fffffff00000000,umax_value=0xffffffff00000000, var_off=(0x0; 0xffffffff00000000)) 198: (77) r6 >>= 32 R6=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 201: (79) r8 = *(u64 *)(r10 -416) R8_w=map_value(id=0,off=40,ks=4,vs=13872,imm=0) 202: (0f) r8 += r6 R8_w=map_value(id=0,off=40,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) 203: (07) r8 += 9696 R8_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 255: (bf) r1 = r8 R1_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 257: (85) call bpf_probe_read_user_str#114 R1 unbounded memory access, make sure to bounds check any array access into a map The value range for register r6 at insn 198 should really be just 0/1. The umax_value=0xffffffff caused later verification failure. After jmp instructions, the current verifier already tries to use the just-obtained information to get a better register range. The current mechanism covers 64-bit registers only. This patch tightens the range for 32-bit sub-registers after jmp32 instructions. With the patch, we have the below ranges for the above code sequence: 193: (85) call bpf_probe_read_user_str#114 R0=inv(id=0) 194: (26) if w0 > 0x1 goto pc+4 R0_w=inv(id=0,smax_value=0x7fffffff00000001,umax_value=0xffffffff00000001, var_off=(0x0; 0xffffffff00000001)) 195: (6b) *(u16 *)(r7 +80) = r0 196: (bc) w6 = w0 R6_w=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0x1)) 197: (67) r6 <<= 32 R6_w=inv(id=0,umax_value=0x100000000,var_off=(0x0; 0x100000000)) 198: (77) r6 >>= 32 R6=inv(id=0,umax_value=1,var_off=(0x0; 0x1)) ... 201: (79) r8 = *(u64 *)(r10 -416) R8_w=map_value(id=0,off=40,ks=4,vs=13872,imm=0) 202: (0f) r8 += r6 R8_w=map_value(id=0,off=40,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) 203: (07) r8 += 9696 R8_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) ... 255: (bf) r1 = r8 R1_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) ... 257: (85) call bpf_probe_read_user_str#114 ... At insn 194, the register R0 has better var_off.mask and smax_value. In particular, the var_off.mask ensures the later lshift and rshift maintain a proper value range.
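For reference, the tnum arithmetic the new helper combines can be exercised in plain user space. The sketch below uses simplified re-implementations modeled on kernel/bpf/tnum.c (not the kernel code itself) and reproduces the (0x0; 0x1) var_off from the patched log above:

  #include <stdint.h>
  #include <stdio.h>

  /* Each bit of a tnum is either known (mask bit clear, value bit
   * holds it) or unknown (mask bit set). */
  struct tnum { uint64_t value; uint64_t mask; };

  static int fls64(uint64_t x)
  {
  	return x ? 64 - __builtin_clzll(x) : 0;
  }

  /* Tightest tnum containing every integer in [min, max]. */
  static struct tnum tnum_range(uint64_t min, uint64_t max)
  {
  	uint64_t chi = min ^ max, delta;
  	int bits = fls64(chi);

  	if (bits > 63)
  		return (struct tnum){ 0, ~0ULL };
  	delta = (1ULL << bits) - 1;
  	return (struct tnum){ min & ~delta, delta };
  }

  /* Combine two views of the same value: bits known in either stay known. */
  static struct tnum tnum_intersect(struct tnum a, struct tnum b)
  {
  	uint64_t v = a.value | b.value;
  	uint64_t mu = a.mask & b.mask;

  	return (struct tnum){ v & ~mu, mu };
  }

  int main(void)
  {
  	/* Fall-through of "if w0 > 0x1": the low 32 bits are in [0, 1],
  	 * but var_off knows nothing about them yet. */
  	struct tnum lo32 = { 0, 0xffffffffULL };
  	struct tnum t = tnum_intersect(lo32, tnum_range(0, 1));

  	/* Prints var_off=(0x0; 0x1): exactly the R6 var_off in the
  	 * patched log, which is why r6 <<= 32; r6 >>= 32 now keeps
  	 * umax_value = 1. */
  	printf("var_off=(0x%llx; 0x%llx)\n",
  	       (unsigned long long)t.value, (unsigned long long)t.mask);
  	return 0;
  }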
Suggested-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121170650.449030-1-yhs@fb.com Signed-off-by: Yuanhe Shu --- kernel/bpf/verifier.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 82c1ea634504..89925ebb1416 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1015,6 +1015,17 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->umax_value)); } +static void __reg_bound_offset32(struct bpf_reg_state *reg) +{ + u64 mask = 0xffffFFFF; + struct tnum range = tnum_range(reg->umin_value & mask, + reg->umax_value & mask); + struct tnum lo32 = tnum_cast(reg->var_off, 4); + struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32); + + reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range)); +} + /* Reset the min/max bounds of a register */ static void __mark_reg_unbounded(struct bpf_reg_state *reg) { @@ -5769,6 +5780,10 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); + if (is_jmp32) { + __reg_bound_offset32(false_reg); + __reg_bound_offset32(true_reg); + } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. @@ -5878,6 +5893,10 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); + if (is_jmp32) { + __reg_bound_offset32(false_reg); + __reg_bound_offset32(true_reg); + } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. -- Gitee From 09d2bde4d30431a32fbd45d1de6bac3cf62cdf8f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Nov 2019 09:06:51 -0800 Subject: [PATCH 23/25] selftests/bpf: Add verifier tests for better jmp32 register bounds ANBZ: #5530 commit 260cb5df9d16c5715b32d73cc8af26ad9a17a792 upstream. Three test cases are added. Test 1: jmp32 'reg op imm'. 
Test 2: jmp32 'reg op reg' where dst 'reg' has unknown constant and src 'reg' has known constant Test 3: jmp32 'reg op reg' where dst 'reg' has known constant and src 'reg' has unknown constant Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121170651.449096-1-yhs@fb.com Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/test_verifier.c | 83 +++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index dbc9be56cb75..9e6b1b321d97 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -15976,6 +15976,89 @@ static struct bpf_test tests[] = { .result = ACCEPT, .retval = 2, }, + { + "jgt32: range bound deduction, reg op imm", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_cgroup_classid), + BPF_JMP32_IMM(BPF_JGT, BPF_REG_0, 1, 5), + BPF_MOV32_REG(BPF_REG_6, BPF_REG_0), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 32), + BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_6), + BPF_ST_MEM(BPF_B, BPF_REG_8, 0, 0), + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .fixup_map_hash_48b = { 4 }, + .result = ACCEPT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, + }, + { + "jgt32: range bound deduction, reg1 op reg2, reg1 unknown", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_cgroup_classid), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_JMP32_REG(BPF_JGT, BPF_REG_0, BPF_REG_2, 5), + BPF_MOV32_REG(BPF_REG_6, BPF_REG_0), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 32), + BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_6), + BPF_ST_MEM(BPF_B, BPF_REG_8, 0, 0), + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .fixup_map_hash_48b = { 4 }, + .result = ACCEPT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, + }, + { + "jle32: range bound deduction, reg1 op reg2, reg2 unknown", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_cgroup_classid), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_JMP32_REG(BPF_JLE, BPF_REG_2, BPF_REG_0, 5), + BPF_MOV32_REG(BPF_REG_6, BPF_REG_0), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 32), + BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_6), + BPF_ST_MEM(BPF_B, BPF_REG_8, 0, 0), + BPF_MOV32_IMM(BPF_REG_0, 
0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .fixup_map_hash_48b = { 4 }, + .result = ACCEPT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, +}, { "spin_lock: test1 success", .insns = { -- Gitee From df19706770a0ad934fcfb6a029db584e800fca75 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 30 Mar 2020 18:03:22 +0200 Subject: [PATCH 24/25] bpf: Undo incorrect __reg_bound_offset32 handling ANBZ: #5530 commit f2d67fec0b43edce8c416101cdc52e71145b5fef upstream. Anatoly has been fuzzing with kBdysch harness and reported a hang in one of the outcomes: 0: (b7) r0 = 808464432 1: (7f) r0 >>= r0 2: (14) w0 -= 808464432 3: (07) r0 += 808464432 4: (b7) r1 = 808464432 5: (de) if w1 s<= w0 goto pc+0 R0_w=invP(id=0,umin_value=808464432,umax_value=5103431727,var_off=(0x30303020;0x10000001f)) R1_w=invP808464432 R10=fp0 6: (07) r0 += -2144337872 7: (14) w0 -= -1607454672 8: (25) if r0 > 0x30303030 goto pc+0 R0_w=invP(id=0,umin_value=271581184,umax_value=271581311,var_off=(0x10300000;0x7f)) R1_w=invP808464432 R10=fp0 9: (76) if w0 s>= 0x303030 goto pc+2 12: (95) exit from 8 to 9: safe from 5 to 6: R0_w=invP(id=0,umin_value=808464432,umax_value=5103431727,var_off=(0x30303020;0x10000001f)) R1_w=invP808464432 R10=fp0 6: (07) r0 += -2144337872 7: (14) w0 -= -1607454672 8: (25) if r0 > 0x30303030 goto pc+0 R0_w=invP(id=0,umin_value=271581184,umax_value=271581311,var_off=(0x10300000;0x7f)) R1_w=invP808464432 R10=fp0 9: safe from 8 to 9: safe verification time 589 usec stack depth 0 processed 17 insns (limit 1000000) [...] The underlying program was xlated as follows: # bpftool p d x i 9 0: (b7) r0 = 808464432 1: (7f) r0 >>= r0 2: (14) w0 -= 808464432 3: (07) r0 += 808464432 4: (b7) r1 = 808464432 5: (de) if w1 s<= w0 goto pc+0 6: (07) r0 += -2144337872 7: (14) w0 -= -1607454672 8: (25) if r0 > 0x30303030 goto pc+0 9: (76) if w0 s>= 0x303030 goto pc+2 10: (05) goto pc-1 11: (05) goto pc-1 12: (95) exit The verifier rewrote original instructions it recognized as dead code with 'goto pc-1', but reality differs from verifier simulation in that we're actually able to trigger a hang due to hitting the 'goto pc-1' instructions. Taking different examples to make the issue more obvious: in this example we're probing bounds on a completely unknown scalar variable in r1: [...] 
5: R0_w=inv1 R1_w=inv(id=0) R10=fp0 5: (18) r2 = 0x4000000000 7: R0_w=inv1 R1_w=inv(id=0) R2_w=inv274877906944 R10=fp0 7: (18) r3 = 0x2000000000 9: R0_w=inv1 R1_w=inv(id=0) R2_w=inv274877906944 R3_w=inv137438953472 R10=fp0 9: (18) r4 = 0x400 11: R0_w=inv1 R1_w=inv(id=0) R2_w=inv274877906944 R3_w=inv137438953472 R4_w=inv1024 R10=fp0 11: (18) r5 = 0x200 13: R0_w=inv1 R1_w=inv(id=0) R2_w=inv274877906944 R3_w=inv137438953472 R4_w=inv1024 R5_w=inv512 R10=fp0 13: (2d) if r1 > r2 goto pc+4 R0_w=inv1 R1_w=inv(id=0,umax_value=274877906944,var_off=(0x0; 0x7fffffffff)) R2_w=inv274877906944 R3_w=inv137438953472 R4_w=inv1024 R5_w=inv512 R10=fp0 14: R0_w=inv1 R1_w=inv(id=0,umax_value=274877906944,var_off=(0x0; 0x7fffffffff)) R2_w=inv274877906944 R3_w=inv137438953472 R4_w=inv1024 R5_w=inv512 R10=fp0 14: (ad) if r1 < r3 goto pc+3 R0_w=inv1 R1_w=inv(id=0,umin_value=137438953472,umax_value=274877906944,var_off=(0x0; 0x7fffffffff)) R2_w=inv274877906944 R3_w=inv137438953472 R4_w=inv1024 R5_w=inv512 R10=fp0 15: R0=inv1 R1=inv(id=0,umin_value=137438953472,umax_value=274877906944,var_off=(0x0; 0x7fffffffff)) R2=inv274877906944 R3=inv137438953472 R4=inv1024 R5=inv512 R10=fp0 15: (2e) if w1 > w4 goto pc+2 R0=inv1 R1=inv(id=0,umin_value=137438953472,umax_value=274877906944,var_off=(0x0; 0x7f00000000)) R2=inv274877906944 R3=inv137438953472 R4=inv1024 R5=inv512 R10=fp0 16: R0=inv1 R1=inv(id=0,umin_value=137438953472,umax_value=274877906944,var_off=(0x0; 0x7f00000000)) R2=inv274877906944 R3=inv137438953472 R4=inv1024 R5=inv512 R10=fp0 16: (ae) if w1 < w5 goto pc+1 R0=inv1 R1=inv(id=0,umin_value=137438953472,umax_value=274877906944,var_off=(0x0; 0x7f00000000)) R2=inv274877906944 R3=inv137438953472 R4=inv1024 R5=inv512 R10=fp0 [...] We first probe lower/upper bounds via jmp64; later we do a similar check via jmp32 and examine the resulting var_off there. After fall-through in insn 14, we get the following bounded r1 with 0x7fffffffff unknown marked bits in the variable section. Thus, after knowing r1 <= 0x4000000000 and r1 >= 0x2000000000: max: 0b100000000000000000000000000000000000000 / 0x4000000000 var: 0b111111111111111111111111111111111111111 / 0x7fffffffff min: 0b010000000000000000000000000000000000000 / 0x2000000000 Now, in insns 15 and 16, we perform a similar probe with lower/upper bounds in jmp32. Thus, after knowing r1 <= 0x4000000000 and r1 >= 0x2000000000 and w1 <= 0x400 and w1 >= 0x200: max: 0b100000000000000000000000000000000000000 / 0x4000000000 var: 0b111111100000000000000000000000000000000 / 0x7f00000000 min: 0b010000000000000000000000000000000000000 / 0x2000000000 The lower/upper bounds haven't changed since they have high bits set in u64 space and the jmp32 tests can only refine bounds in the low bits. However, for the var part the expectation would have been 0x7f000007ff or something less precise up to 0x7fffffffff. An outcome of 0x7f00000000 is not correct since it would contradict the earlier probed bounds where we know that the result should have been in [0x200,0x400] in u32 space. Therefore, tests with such info will lead to wrong verifier assumptions later on like falsely predicting conditional jumps to be always taken, etc.
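The arithmetic behind that wrong outcome can be reproduced stand-alone (a plain C sketch using the walk-through's bounds; see also the analysis that follows):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
  	uint64_t umin = 0x2000000000ULL, umax = 0x4000000000ULL;
  	uint64_t mask = 0xffffffffULL;
  	uint64_t sample = 0x2000001234ULL;	/* any value in [umin, umax] */

  	/* Masking both bounds first collapses the range to [0x0, 0x0],
  	 * which the derived tnum then treats as "low 32 bits are
  	 * constant zero". */
  	printf("masked bounds: [0x%llx, 0x%llx]\n",
  	       (unsigned long long)(umin & mask),
  	       (unsigned long long)(umax & mask));

  	/* Yet this in-range value has non-zero low bits, so the only
  	 * sound u32 interval here is [0x0, 0xffffffff]. */
  	printf("sample 0x%llx has low32 0x%llx\n",
  	       (unsigned long long)sample,
  	       (unsigned long long)(sample & mask));
  	return 0;
  }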
The issue here is that __reg_bound_offset32()'s implementation from commit 581738a681b6 ("bpf: Provide better register bounds after jmp32 instructions") makes an incorrect range assumption: static void __reg_bound_offset32(struct bpf_reg_state *reg) { u64 mask = 0xffffFFFF; struct tnum range = tnum_range(reg->umin_value & mask, reg->umax_value & mask); struct tnum lo32 = tnum_cast(reg->var_off, 4); struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32); reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range)); } In the above walk-through example, __reg_bound_offset32() as-is chose a range after masking with 0xffffffff of [0x0,0x0] since umin:0x2000000000 and umax:0x4000000000, and therefore the lo32 part was clamped to 0x0 as well. However, in the umin:0x2000000000 and umax:0x4000000000 range above we'd end up with an actual possible interval of [0x0,0xffffffff] for u32 space instead. In the case of the original reproducer, the situation looked as follows at insn 5 for r0: [...] 5: R0_w=invP(id=0,umin_value=808464432,umax_value=5103431727,var_off=(0x0; 0x1ffffffff)) R1_w=invP808464432 R10=fp0 0x30303030 0x13030302f 5: (de) if w1 s<= w0 goto pc+0 R0_w=invP(id=0,umin_value=808464432,umax_value=5103431727,var_off=(0x30303020; 0x10000001f)) R1_w=invP808464432 R10=fp0 0x30303030 0x13030302f [...] After the fall-through, we similarly forced the var_off result into the wrong range [0x30303030,0x3030302f] suggesting later on that fixed bits must only be of 0x30303020 with 0x10000001f unknowns, whereas such an assumption can only be made when both bounds in the hi32 range match. Originally, I was thinking to fix this by moving reg into a temp reg and using the proper coerce_reg_to_size() helper on the temp reg, where we can then, based on that, define the range tnum for later intersection: static void __reg_bound_offset32(struct bpf_reg_state *reg) { struct bpf_reg_state tmp = *reg; struct tnum lo32, hi32, range; coerce_reg_to_size(&tmp, 4); range = tnum_range(tmp.umin_value, tmp.umax_value); lo32 = tnum_cast(reg->var_off, 4); hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32); reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range)); } In the case of the concrete example, this gives us a more conservative unknown section. Thus, after knowing r1 <= 0x4000000000 and r1 >= 0x2000000000 and w1 <= 0x400 and w1 >= 0x200: max: 0b100000000000000000000000000000000000000 / 0x4000000000 var: 0b111111111111111111111111111111111111111 / 0x7fffffffff min: 0b010000000000000000000000000000000000000 / 0x2000000000 However, the above new __reg_bound_offset32() has no effect on refining the knowledge of the register contents. Meaning, if the bounds in the hi32 range mismatch, we'll get the identity function, given the range reg spans [0x0,0xffffffff] and we cast var_off into lo32 only to later binary-or it again with the hi32. Likewise, if the bounds in the hi32 range match, then we mask both bounds with 0xffffffff and use the resulting umin/umax for the range to later intersect the lo32 with it. However, the _prior_ called __reg_bound_offset() already did such an intersection on the full reg, and we would therefore only repeat the same operation on the lo32 part twice. Given this has no effect and the original commit had false assumptions, this patch reverts the code entirely, which is also more straightforward for stable trees: apparently 581738a681b6 got auto-selected by Sasha's ML system and misclassified as a fix, so it got sucked into v5.4 where it should never have landed.
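The identity-function observation above can likewise be checked numerically (a stand-alone sketch, with fls64() re-typed from the kernel for the demo):

  #include <stdint.h>
  #include <stdio.h>

  static int fls64(uint64_t x)
  {
  	return x ? 64 - __builtin_clzll(x) : 0;
  }

  int main(void)
  {
  	/* With mismatching hi32 bounds, coerce_reg_to_size(&tmp, 4)
  	 * yields umin=0x0, umax=0xffffffff, so the derived range tnum
  	 * is "all low 32 bits unknown": intersecting lo32 with it
  	 * cannot add any knowledge. */
  	uint64_t min = 0x0, max = 0xffffffffULL;
  	uint64_t delta = (1ULL << fls64(min ^ max)) - 1;

  	/* Prints (value=0x0, mask=0xffffffff), i.e. fully unknown. */
  	printf("range tnum: (value=0x%llx, mask=0x%llx)\n",
  	       (unsigned long long)(min & ~delta),
  	       (unsigned long long)delta);
  	return 0;
  }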
A revert is also low-risk from a user PoV since it requires a recent kernel and llc to opt into -mcpu=v3 BPF CPU to generate jmp32 instructions. A proper bounds refinement would need a significantly more complex approach, which is currently being worked on, but is not stable material [0]. Hence the revert is the best option for stable. After the revert, the originally reported program gets rejected as follows: 1: (7f) r0 >>= r0 2: (14) w0 -= 808464432 3: (07) r0 += 808464432 4: (b7) r1 = 808464432 5: (de) if w1 s<= w0 goto pc+0 R0_w=invP(id=0,umin_value=808464432,umax_value=5103431727,var_off=(0x0; 0x1ffffffff)) R1_w=invP808464432 R10=fp0 6: (07) r0 += -2144337872 7: (14) w0 -= -1607454672 8: (25) if r0 > 0x30303030 goto pc+0 R0_w=invP(id=0,umax_value=808464432,var_off=(0x0; 0x3fffffff)) R1_w=invP808464432 R10=fp0 9: (76) if w0 s>= 0x303030 goto pc+2 R0=invP(id=0,umax_value=3158063,var_off=(0x0; 0x3fffff)) R1=invP808464432 R10=fp0 10: (30) r0 = *(u8 *)skb[808464432] BPF_LD_[ABS|IND] uses reserved fields processed 11 insns (limit 1000000) [...] [0] https://lore.kernel.org/bpf/158507130343.15666.8018068546764556975.stgit@john-Precision-5820-Tower/T/ Fixes: 581738a681b6 ("bpf: Provide better register bounds after jmp32 instructions") Reported-by: Anatoly Trosinenko Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200330160324.15259-2-daniel@iogearbox.net Signed-off-by: Yuanhe Shu --- kernel/bpf/verifier.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 89925ebb1416..82c1ea634504 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1015,17 +1015,6 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->umax_value)); } -static void __reg_bound_offset32(struct bpf_reg_state *reg) -{ - u64 mask = 0xffffFFFF; - struct tnum range = tnum_range(reg->umin_value & mask, - reg->umax_value & mask); - struct tnum lo32 = tnum_cast(reg->var_off, 4); - struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32); - - reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range)); -} - /* Reset the min/max bounds of a register */ static void __mark_reg_unbounded(struct bpf_reg_state *reg) { @@ -5780,10 +5769,6 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); - if (is_jmp32) { - __reg_bound_offset32(false_reg); - __reg_bound_offset32(true_reg); - } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. @@ -5893,10 +5878,6 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); - if (is_jmp32) { - __reg_bound_offset32(false_reg); - __reg_bound_offset32(true_reg); - } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. -- Gitee From 020a092fa1f522716d24afde49a6f428750750f5 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 30 Mar 2020 18:03:23 +0200 Subject: [PATCH 25/25] bpf: Fix tnum constraints for 32-bit comparisons ANBZ: #5530 commit 604dca5e3af1db98bd123b7bfc02b017af99e3a0 upstream.
The BPF verifier tried to track values based on 32-bit comparisons by (ab)using the tnum state via 581738a681b6 ("bpf: Provide better register bounds after jmp32 instructions"). The idea is that after a check like this: if ((u32)r0 > 3) exit We can't meaningfully constrain the arithmetic-range-based tracking, but we can update the tnum state to (value=0,mask=0xffff'ffff'0000'0003). However, the implementation from 581738a681b6 didn't compute the tnum constraint based on the fixed operand, but instead derived it from the arithmetic-range-based tracking. This means that after the following sequence of operations: if (r0 >= 0x1'0000'0001) exit if ((u32)r0 > 7) exit The verifier assumed that the lower half of r0 is in the range (0, 0) and applied the tnum constraint (value=0,mask=0xffff'ffff'0000'0000), thus causing the overall tnum to be (value=0,mask=0x1'0000'0000), which was incorrect. Provide a fixed implementation. Fixes: 581738a681b6 ("bpf: Provide better register bounds after jmp32 instructions") Signed-off-by: Jann Horn Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200330160324.15259-3-daniel@iogearbox.net Signed-off-by: Yuanhe Shu --- kernel/bpf/verifier.c | 108 ++++++++++++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 36 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 82c1ea634504..95d197eb4ab9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5653,6 +5653,70 @@ static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg) reg->smax_value <= 0 && reg->smin_value >= S32_MIN); } +/* Constrain the possible values of @reg with unsigned upper bound @bound. + * If @is_exclusive, @bound is an exclusive limit, otherwise it is inclusive. + * If @is_jmp32, @bound is a 32-bit value that only constrains the low 32 bits + * of @reg. + */ +static void set_upper_bound(struct bpf_reg_state *reg, u64 bound, bool is_jmp32, + bool is_exclusive) +{ + if (is_exclusive) { + /* There are no values for `reg` that make `reg<0` true. */ + if (bound == 0) + return; + bound--; + } + if (is_jmp32) { + /* Constrain the register's value in the tnum representation. + * For 64-bit comparisons this happens later in + * __reg_bound_offset(), but for 32-bit comparisons, we can be + * more precise than what can be derived from the updated + * numeric bounds. + */ + struct tnum t = tnum_range(0, bound); + + t.mask |= ~0xffffffffULL; /* upper half is unknown */ + reg->var_off = tnum_intersect(reg->var_off, t); + + /* Compute the 64-bit bound from the 32-bit bound. */ + bound += gen_hi_max(reg->var_off); + } + reg->umax_value = min(reg->umax_value, bound); +} + +/* Constrain the possible values of @reg with unsigned lower bound @bound. + * If @is_exclusive, @bound is an exclusive limit, otherwise it is inclusive. + * If @is_jmp32, @bound is a 32-bit value that only constrains the low 32 bits + * of @reg. + */ +static void set_lower_bound(struct bpf_reg_state *reg, u64 bound, bool is_jmp32, + bool is_exclusive) +{ + if (is_exclusive) { + /* There are no values for `reg` that make `reg>MAX` true. */ + if (bound == (is_jmp32 ? U32_MAX : U64_MAX)) + return; + bound++; + } + if (is_jmp32) { + /* Constrain the register's value in the tnum representation. + * For 64-bit comparisons this happens later in + * __reg_bound_offset(), but for 32-bit comparisons, we can be + * more precise than what can be derived from the updated + * numeric bounds.
+ */ + struct tnum t = tnum_range(bound, U32_MAX); + + t.mask |= ~0xffffffffULL; /* upper half is unknown */ + reg->var_off = tnum_intersect(reg->var_off, t); + + /* Compute the 64-bit bound from the 32-bit bound. */ + bound += gen_hi_min(reg->var_off); + } + reg->umin_value = max(reg->umin_value, bound); +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. @@ -5708,15 +5772,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JGE: case BPF_JGT: { - u64 false_umax = opcode == BPF_JGT ? val : val - 1; - u64 true_umin = opcode == BPF_JGT ? val + 1 : val; - - if (is_jmp32) { - false_umax += gen_hi_max(false_reg->var_off); - true_umin += gen_hi_min(true_reg->var_off); - } - false_reg->umax_value = min(false_reg->umax_value, false_umax); - true_reg->umin_value = max(true_reg->umin_value, true_umin); + set_upper_bound(false_reg, val, is_jmp32, opcode == BPF_JGE); + set_lower_bound(true_reg, val, is_jmp32, opcode == BPF_JGT); break; } case BPF_JSGE: @@ -5737,15 +5794,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JLE: case BPF_JLT: { - u64 false_umin = opcode == BPF_JLT ? val : val + 1; - u64 true_umax = opcode == BPF_JLT ? val - 1 : val; - - if (is_jmp32) { - false_umin += gen_hi_min(false_reg->var_off); - true_umax += gen_hi_max(true_reg->var_off); - } - false_reg->umin_value = max(false_reg->umin_value, false_umin); - true_reg->umax_value = min(true_reg->umax_value, true_umax); + set_lower_bound(false_reg, val, is_jmp32, opcode == BPF_JLE); + set_upper_bound(true_reg, val, is_jmp32, opcode == BPF_JLT); break; } case BPF_JSLE: @@ -5820,15 +5870,8 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JGE: case BPF_JGT: { - u64 false_umin = opcode == BPF_JGT ? val : val + 1; - u64 true_umax = opcode == BPF_JGT ? val - 1 : val; - - if (is_jmp32) { - false_umin += gen_hi_min(false_reg->var_off); - true_umax += gen_hi_max(true_reg->var_off); - } - false_reg->umin_value = max(false_reg->umin_value, false_umin); - true_reg->umax_value = min(true_reg->umax_value, true_umax); + set_lower_bound(false_reg, val, is_jmp32, opcode == BPF_JGE); + set_upper_bound(true_reg, val, is_jmp32, opcode == BPF_JGT); break; } case BPF_JSGE: @@ -5846,15 +5889,8 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JLE: case BPF_JLT: { - u64 false_umax = opcode == BPF_JLT ? val : val - 1; - u64 true_umin = opcode == BPF_JLT ? val + 1 : val; - - if (is_jmp32) { - false_umax += gen_hi_max(false_reg->var_off); - true_umin += gen_hi_min(true_reg->var_off); - } - false_reg->umax_value = min(false_reg->umax_value, false_umax); - true_reg->umin_value = max(true_reg->umin_value, true_umin); + set_upper_bound(false_reg, val, is_jmp32, opcode == BPF_JLE); + set_lower_bound(true_reg, val, is_jmp32, opcode == BPF_JLT); break; } case BPF_JSLE: -- Gitee
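To illustrate the fixed derivation (a stand-alone sketch, not verifier code): for "if ((u32)r0 > 3) exit" falling through, the constraint is now computed from the fixed operand, matching the (value=0,mask=0xffff'ffff'0000'0003) state quoted in the commit message:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
  	/* tnum_range(0, 3): chi = 0 ^ 3, fls64(chi) = 2, so
  	 * delta = (1 << 2) - 1 = 3 -> (value=0, mask=0x3). */
  	uint64_t value = 0, mask = 0x3;

  	/* Only the low 32 bits are bounded by the jmp32 comparison;
  	 * the upper half of the register stays unknown. */
  	mask |= ~0xffffffffULL;

  	/* Prints (value=0x0, mask=0xffffffff00000003). */
  	printf("tnum constraint: (value=0x%llx, mask=0x%llx)\n",
  	       (unsigned long long)value, (unsigned long long)mask);
  	return 0;
  }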