From 66d1fdb1c82e580ec975df41ba9e4ddb656cd1c8 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 25 May 2019 09:37:40 -0700 Subject: [PATCH 01/22] selftests/bpf: convert test_cgrp2_attach2 example into kselftest ANBZ: #5530 commit ba0c0cc05dda2c56e23c88e4610ce2ac65ff86ec upstream. Convert test_cgrp2_attach2 example into a proper test_cgroup_attach kselftest. It's better because we do run kselftest on a constant basis, so there are better chances to spot a potential regression. Also make it slightly less verbose to conform kselftests output style. Output example: $ ./test_cgroup_attach #override:PASS #multi:PASS test_cgroup_attach:PASS Signed-off-by: Roman Gushchin Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- samples/bpf/Makefile | 2 - tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/test_cgroup_attach.c | 50 ++++++++++++------- 4 files changed, 36 insertions(+), 20 deletions(-) rename samples/bpf/test_cgrp2_attach2.c => tools/testing/selftests/bpf/test_cgroup_attach.c (91%) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 99afd0dee2ca..f4767c9c41d6 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -26,7 +26,6 @@ hostprogs-y += map_perf_test hostprogs-y += test_overhead hostprogs-y += test_cgrp2_array_pin hostprogs-y += test_cgrp2_attach -hostprogs-y += test_cgrp2_attach2 hostprogs-y += test_cgrp2_sock hostprogs-y += test_cgrp2_sock2 hostprogs-y += xdp1 @@ -82,7 +81,6 @@ map_perf_test-objs := bpf_load.o map_perf_test_user.o test_overhead-objs := bpf_load.o test_overhead_user.o test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o test_cgrp2_attach-objs := test_cgrp2_attach.o -test_cgrp2_attach2-objs := test_cgrp2_attach2.o $(CGROUP_HELPERS) test_cgrp2_sock-objs := test_cgrp2_sock.o test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o xdp1-objs := xdp1_user.o diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index f1587b4171a7..e401075d0957 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -23,6 +23,7 @@ test_lirc_mode2_user get_cgroup_id_user test_socket_cookie test_skb_cgroup_id_user +test_cgroup_attach test_cgroup_storage test_select_reuseport test_flow_dissector diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 94353d2fa3d7..b34ff6fd1e4c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -25,7 +25,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \ test_socket_cookie test_cgroup_storage test_select_reuseport test_section_names \ - test_netcnt test_tcpnotify_user test_sock_fields test_sysctl + test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_cgroup_attach BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c))) TEST_GEN_FILES = $(BPF_OBJ_FILES) @@ -91,6 +91,7 @@ $(OUTPUT)/test_cgroup_storage: cgroup_helpers.c $(OUTPUT)/test_netcnt: cgroup_helpers.c $(OUTPUT)/test_sock_fields: cgroup_helpers.c $(OUTPUT)/test_sysctl: cgroup_helpers.c +$(OUTPUT)/test_cgroup_attach: cgroup_helpers.c .PHONY: force diff --git a/samples/bpf/test_cgrp2_attach2.c b/tools/testing/selftests/bpf/test_cgroup_attach.c similarity index 91% rename from samples/bpf/test_cgrp2_attach2.c rename to tools/testing/selftests/bpf/test_cgroup_attach.c index 180f9d813bca..85def14bcc80 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/tools/testing/selftests/bpf/test_cgroup_attach.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + /* eBPF example program: * * - Creates arraymap in kernel with 4 bytes keys and 8 byte values @@ -25,20 +27,27 @@ #include #include #include +#include #include #include -#include "bpf_insn.h" +#include "bpf_util.h" #include "bpf_rlimit.h" #include "cgroup_helpers.h" #define FOO "/foo" #define BAR "/foo/bar/" -#define PING_CMD "ping -c1 -w1 127.0.0.1 > /dev/null" +#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" char bpf_log_buf[BPF_LOG_BUF_SIZE]; +#ifdef DEBUG +#define debug(args...) printf(args) +#else +#define debug(args...) +#endif + static int prog_load(int verdict) { int ret; @@ -89,7 +98,7 @@ static int test_foo_bar(void) goto err; } - printf("Attached DROP prog. This ping in cgroup /foo should fail...\n"); + debug("Attached DROP prog. This ping in cgroup /foo should fail...\n"); assert(system(PING_CMD) != 0); /* Create cgroup /foo/bar, get fd, and join it */ @@ -100,7 +109,7 @@ static int test_foo_bar(void) if (join_cgroup(BAR)) goto err; - printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); + debug("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, @@ -109,7 +118,7 @@ static int test_foo_bar(void) goto err; } - printf("Attached PASS prog. This ping in cgroup /foo/bar should pass...\n"); + debug("Attached PASS prog. This ping in cgroup /foo/bar should pass...\n"); assert(system(PING_CMD) == 0); if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { @@ -117,7 +126,7 @@ static int test_foo_bar(void) goto err; } - printf("Detached PASS from /foo/bar while DROP is attached to /foo.\n" + debug("Detached PASS from /foo/bar while DROP is attached to /foo.\n" "This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); @@ -132,7 +141,7 @@ static int test_foo_bar(void) goto err; } - printf("Attached PASS from /foo/bar and detached DROP from /foo.\n" + debug("Attached PASS from /foo/bar and detached DROP from /foo.\n" "This ping in cgroup /foo/bar should pass...\n"); assert(system(PING_CMD) == 0); @@ -199,9 +208,9 @@ static int test_foo_bar(void) close(bar); cleanup_cgroup_environment(); if (!rc) - printf("### override:PASS\n"); + printf("#override:PASS\n"); else - printf("### override:FAIL\n"); + printf("#override:FAIL\n"); return rc; } @@ -424,19 +433,26 @@ static int test_multiprog(void) close(cg5); cleanup_cgroup_environment(); if (!rc) - printf("### multi:PASS\n"); + printf("#multi:PASS\n"); else - printf("### multi:FAIL\n"); + printf("#multi:FAIL\n"); return rc; } -int main(int argc, char **argv) +int main(void) { - int rc = 0; + int (*tests[])(void) = {test_foo_bar, test_multiprog}; + int errors = 0; + int i; - rc = test_foo_bar(); - if (rc) - return rc; + for (i = 0; i < ARRAY_SIZE(tests); i++) + if (tests[i]()) + errors++; + + if (errors) + printf("test_cgroup_attach:FAIL\n"); + else + printf("test_cgroup_attach:PASS\n"); - return test_multiprog(); + return errors ? EXIT_FAILURE : EXIT_SUCCESS; } -- Gitee From 93ee12533a935a6ea76da520377928474edd34aa Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 25 May 2019 09:37:41 -0700 Subject: [PATCH 02/22] selftests/bpf: enable all available cgroup v2 controllers ANBZ: #5530 commit 596092ef8beaa73891da4f406446b283cc566cdb upstream. Enable all available cgroup v2 controllers when setting up the environment for the bpf kselftests. It's required to properly test the bpf prog auto-detach feature. Also it will generally increase the code coverage. Signed-off-by: Roman Gushchin Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/cgroup_helpers.c | 57 ++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index cf16948aad4a..c5e368ddd953 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -33,6 +33,60 @@ snprintf(buf, sizeof(buf), "%s%s%s", CGROUP_MOUNT_PATH, \ CGROUP_WORK_DIR, path) +/** + * enable_all_controllers() - Enable all available cgroup v2 controllers + * + * Enable all available cgroup v2 controllers in order to increase + * the code coverage. + * + * If successful, 0 is returned. + */ +int enable_all_controllers(char *cgroup_path) +{ + char path[PATH_MAX + 1]; + char buf[PATH_MAX]; + char *c, *c2; + int fd, cfd; + size_t len; + + snprintf(path, sizeof(path), "%s/cgroup.controllers", cgroup_path); + fd = open(path, O_RDONLY); + if (fd < 0) { + log_err("Opening cgroup.controllers: %s", path); + return 1; + } + + len = read(fd, buf, sizeof(buf) - 1); + if (len < 0) { + close(fd); + log_err("Reading cgroup.controllers: %s", path); + return 1; + } + buf[len] = 0; + close(fd); + + /* No controllers available? We're probably on cgroup v1. */ + if (len == 0) + return 0; + + snprintf(path, sizeof(path), "%s/cgroup.subtree_control", cgroup_path); + cfd = open(path, O_RDWR); + if (cfd < 0) { + log_err("Opening cgroup.subtree_control: %s", path); + return 1; + } + + for (c = strtok_r(buf, " ", &c2); c; c = strtok_r(NULL, " ", &c2)) { + if (dprintf(cfd, "+%s\n", c) <= 0) { + log_err("Enabling controller %s: %s", c, path); + close(cfd); + return 1; + } + } + close(cfd); + return 0; +} + /** * setup_cgroup_environment() - Setup the cgroup environment * @@ -71,6 +125,9 @@ int setup_cgroup_environment(void) return 1; } + if (enable_all_controllers(cgroup_workdir)) + return 1; + return 0; } -- Gitee From a6e7eee38ca3dce1c48c95b3f08101c3cb5cb3a8 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 25 May 2019 09:37:42 -0700 Subject: [PATCH 03/22] selftests/bpf: add auto-detach test ANBZ: #5530 commit d5506591d54b915aaa9d489c9289af2cc88f47a4 upstream. Add a kselftest to cover bpf auto-detachment functionality. The test creates a cgroup, associates some resources with it, attaches a couple of bpf programs and deletes the cgroup. Then it checks that bpf programs are going away in 5 seconds. Expected output: $ ./test_cgroup_attach #override:PASS #multi:PASS #autodetach:PASS test_cgroup_attach:PASS On a kernel without auto-detaching: $ ./test_cgroup_attach #override:PASS #multi:PASS #autodetach:FAIL test_cgroup_attach:FAIL Signed-off-by: Roman Gushchin Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- .../selftests/bpf/test_cgroup_attach.c | 98 ++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_cgroup_attach.c b/tools/testing/selftests/bpf/test_cgroup_attach.c index 85def14bcc80..9e64c2eca2ff 100644 --- a/tools/testing/selftests/bpf/test_cgroup_attach.c +++ b/tools/testing/selftests/bpf/test_cgroup_attach.c @@ -439,9 +439,105 @@ static int test_multiprog(void) return rc; } +static int test_autodetach(void) +{ + __u32 prog_cnt = 4, attach_flags; + int allow_prog[2] = {0}; + __u32 prog_ids[2] = {0}; + int cg = 0, i, rc = -1; + void *ptr = NULL; + int attempts; + + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { + allow_prog[i] = prog_load_cnt(1, 1 << i); + if (!allow_prog[i]) + goto err; + } + + if (setup_cgroup_environment()) + goto err; + + /* create a cgroup, attach two programs and remember their ids */ + cg = create_and_get_cgroup("/cg_autodetach"); + if (cg < 0) + goto err; + + if (join_cgroup("/cg_autodetach")) + goto err; + + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { + if (bpf_prog_attach(allow_prog[i], cg, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI)) { + log_err("Attaching prog[%d] to cg:egress", i); + goto err; + } + } + + /* make sure that programs are attached and run some traffic */ + assert(bpf_prog_query(cg, BPF_CGROUP_INET_EGRESS, 0, &attach_flags, + prog_ids, &prog_cnt) == 0); + assert(system(PING_CMD) == 0); + + /* allocate some memory (4Mb) to pin the original cgroup */ + ptr = malloc(4 * (1 << 20)); + if (!ptr) + goto err; + + /* close programs and cgroup fd */ + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { + close(allow_prog[i]); + allow_prog[i] = 0; + } + + close(cg); + cg = 0; + + /* leave the cgroup and remove it. don't detach programs */ + cleanup_cgroup_environment(); + + /* wait for the asynchronous auto-detachment. + * wait for no more than 5 sec and give up. + */ + for (i = 0; i < ARRAY_SIZE(prog_ids); i++) { + for (attempts = 5; attempts >= 0; attempts--) { + int fd = bpf_prog_get_fd_by_id(prog_ids[i]); + + if (fd < 0) + break; + + /* don't leave the fd open */ + close(fd); + + if (!attempts) + goto err; + + sleep(1); + } + } + + rc = 0; +err: + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) + if (allow_prog[i] > 0) + close(allow_prog[i]); + if (cg) + close(cg); + free(ptr); + cleanup_cgroup_environment(); + if (!rc) + printf("#autodetach:PASS\n"); + else + printf("#autodetach:FAIL\n"); + return rc; +} + int main(void) { - int (*tests[])(void) = {test_foo_bar, test_multiprog}; + int (*tests[])(void) = { + test_foo_bar, + test_multiprog, + test_autodetach, + }; int errors = 0; int i; -- Gitee From d020c0bf70022c043bf2336666b5875b96e64076 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 May 2019 14:14:42 -0700 Subject: [PATCH 04/22] bpf: media: properly use bpf_prog_array api ANBZ: #5530 commit 02205d2ed6fe26a8f4fd9e9cec251d1dc7f79316 upstream. Now that we don't have __rcu markers on the bpf_prog_array helpers, let's use proper rcu_dereference_protected to obtain array pointer under mutex. Cc: linux-media@vger.kernel.org Cc: Mauro Carvalho Chehab Cc: Sean Young Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- drivers/media/rc/bpf-lirc.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index ee657003c1a1..0a0ce620e4a2 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -8,6 +8,9 @@ #include #include "rc-core-priv.h" +#define lirc_rcu_dereference(p) \ + rcu_dereference_protected(p, lockdep_is_held(&ir_raw_handler_lock)) + /* * BPF interface for raw IR */ @@ -136,7 +139,7 @@ const struct bpf_verifier_ops lirc_mode2_verifier_ops = { static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog) { - struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; struct ir_raw_event_ctrl *raw; int ret; @@ -154,12 +157,12 @@ static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog) goto unlock; } - if (raw->progs && bpf_prog_array_length(raw->progs) >= BPF_MAX_PROGS) { + old_array = lirc_rcu_dereference(raw->progs); + if (old_array && bpf_prog_array_length(old_array) >= BPF_MAX_PROGS) { ret = -E2BIG; goto unlock; } - old_array = raw->progs; ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); if (ret < 0) goto unlock; @@ -174,7 +177,7 @@ static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog) static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog) { - struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; struct ir_raw_event_ctrl *raw; int ret; @@ -192,7 +195,7 @@ static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog) goto unlock; } - old_array = raw->progs; + old_array = lirc_rcu_dereference(raw->progs); ret = bpf_prog_array_copy(old_array, prog, NULL, &new_array); /* * Do not use bpf_prog_array_delete_safe() as we would end up @@ -223,21 +226,22 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample) /* * This should be called once the rc thread has been stopped, so there can be * no concurrent bpf execution. + * + * Should be called with the ir_raw_handler_lock held. */ void lirc_bpf_free(struct rc_dev *rcdev) { struct bpf_prog_array_item *item; + struct bpf_prog_array *array; - if (!rcdev->raw->progs) + array = lirc_rcu_dereference(rcdev->raw->progs); + if (!array) return; - item = rcu_dereference(rcdev->raw->progs)->items; - while (item->prog) { + for (item = array->items; item->prog; item++) bpf_prog_put(item->prog); - item++; - } - bpf_prog_array_free(rcdev->raw->progs); + bpf_prog_array_free(array); } int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -290,7 +294,7 @@ int lirc_prog_detach(const union bpf_attr *attr) int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - struct bpf_prog_array __rcu *progs; + struct bpf_prog_array *progs; struct rc_dev *rcdev; u32 cnt, flags = 0; int ret; @@ -311,7 +315,7 @@ int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) if (ret) goto put; - progs = rcdev->raw->progs; + progs = lirc_rcu_dereference(rcdev->raw->progs); cnt = progs ? bpf_prog_array_length(progs) : 0; if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) { -- Gitee From cfd079e159bf87517fad158f5e8b67c4ab9571d1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 May 2019 14:14:44 -0700 Subject: [PATCH 05/22] bpf: tracing: properly use bpf_prog_array api ANBZ: #5530 commit e672db03ab0e43e41ab6f8b2156a10d6e40f243d upstream. Now that we don't have __rcu markers on the bpf_prog_array helpers, let's use proper rcu_dereference_protected to obtain array pointer under mutex. Cc: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/trace/bpf_trace.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index feb088bd3c31..e7a900ff1f20 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -19,6 +19,9 @@ #include "trace_probe.h" #include "trace.h" +#define bpf_event_rcu_dereference(p) \ + rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) + #ifdef CONFIG_MODULES struct bpf_trace_module { struct module *module; @@ -1178,7 +1181,7 @@ static DEFINE_MUTEX(bpf_event_mutex); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { - struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; int ret = -EEXIST; @@ -1196,7 +1199,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event, if (event->prog) goto unlock; - old_array = event->tp_event->prog_array; + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); if (old_array && bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { ret = -E2BIG; @@ -1219,7 +1222,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event, void perf_event_detach_bpf_prog(struct perf_event *event) { - struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; int ret; @@ -1228,7 +1231,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) if (!event->prog) goto unlock; - old_array = event->tp_event->prog_array; + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); if (ret == -ENOENT) goto unlock; @@ -1250,6 +1253,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; + struct bpf_prog_array *progs; u32 *ids, prog_cnt, ids_len; int ret; @@ -1274,10 +1278,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) */ mutex_lock(&bpf_event_mutex); - ret = bpf_prog_array_copy_info(event->tp_event->prog_array, - ids, - ids_len, - &prog_cnt); + progs = bpf_event_rcu_dereference(event->tp_event->prog_array); + ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt); mutex_unlock(&bpf_event_mutex); if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || -- Gitee From 0a87894e0f91849870db98b55feeefb3643042cb Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:35 -0700 Subject: [PATCH 06/22] bpf: Create BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY ANBZ: #5530 commit 1f52f6c0b0e846908e9c1082dab1b3f7088b82ac upstream. Create new macro BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY() to be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs so BPF programs can request cwr for TCP packets. Current cgroup skb programs can only return 0 or 1 (0 to drop the packet. This macro changes the behavior so the low order bit indicates whether the packet should be dropped (0) or not (1) and the next bit is used for congestion notification (cn). Hence, new allowed return values of CGROUP EGRESS BPF programs are: 0: drop packet 1: keep packet 2: drop packet and call cwr 3: keep packet and call cwr This macro then converts it to one of NET_XMIT values or -EPERM that has the effect of dropping the packet with no cn. 0: NET_XMIT_SUCCESS skb should be transmitted (no cn) 1: NET_XMIT_DROP skb should be dropped and cwr called 2: NET_XMIT_CN skb should be transmitted and cwr called 3: -EPERM skb should be dropped (no cn) Note that when more than one BPF program is called, the packet is dropped if at least one of programs requests it be dropped, and there is cn if at least one program returns cn. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- include/linux/bpf.h | 50 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 27329939bc55..95b3f6d8cd08 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -562,6 +562,56 @@ _out: \ _ret; \ }) +/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs + * so BPF programs can request cwr for TCP packets. + * + * Current cgroup skb programs can only return 0 or 1 (0 to drop the + * packet. This macro changes the behavior so the low order bit + * indicates whether the packet should be dropped (0) or not (1) + * and the next bit is a congestion notification bit. This could be + * used by TCP to call tcp_enter_cwr() + * + * Hence, new allowed return values of CGROUP EGRESS BPF programs are: + * 0: drop packet + * 1: keep packet + * 2: drop packet and cn + * 3: keep packet and cn + * + * This macro then converts it to one of the NET_XMIT or an error + * code that is then interpreted as drop packet (and no cn): + * 0: NET_XMIT_SUCCESS skb should be transmitted + * 1: NET_XMIT_DROP skb should be dropped and cn + * 2: NET_XMIT_CN skb should be transmitted and cn + * 3: -EPERM skb should be dropped + */ +#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ + ({ \ + struct bpf_prog_array_item *_item; \ + struct bpf_prog *_prog; \ + struct bpf_prog_array *_array; \ + u32 ret; \ + u32 _ret = 1; \ + u32 _cn = 0; \ + preempt_disable(); \ + rcu_read_lock(); \ + _array = rcu_dereference(array); \ + _item = &_array->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ + ret = func(_prog, ctx); \ + _ret &= (ret & 1); \ + _cn |= (ret & 2); \ + _item++; \ + } \ + rcu_read_unlock(); \ + preempt_enable(); \ + if (_ret) \ + _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ + else \ + _ret = (_cn ? NET_XMIT_DROP : -EPERM); \ + _ret; \ + }) + #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ __BPF_PROG_RUN_ARRAY(array, ctx, func, false) -- Gitee From 31d2b15af91dea7884ea46c224f9768e42aca793 Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:36 -0700 Subject: [PATCH 07/22] bpf: cgroup inet skb programs can return 0 to 3 ANBZ: #5530 commit 5cf1e91456301f8c4f6bbc63ff76cff12f92f31b upstream. Allows cgroup inet skb programs to return values in the range [0, 3]. The second bit is used to deterine if congestion occurred and higher level protocol should decrease rate. E.g. TCP would call tcp_enter_cwr() The bpf_prog must set expected_attach_type to BPF_CGROUP_INET_EGRESS at load time if it uses the new return values (i.e. 2 or 3). The expected_attach_type is currently not enforced for BPF_PROG_TYPE_CGROUP_SKB. e.g Meaning the current bpf_prog with expected_attach_type setting to BPF_CGROUP_INET_EGRESS can attach to BPF_CGROUP_INET_INGRESS. Blindly enforcing expected_attach_type will break backward compatibility. This patch adds a enforce_expected_attach_type bit to only enforce the expected_attach_type when it uses the new return value. Signed-off-by: Lawrence Brakmo Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- include/linux/filter.h | 3 ++- kernel/bpf/syscall.c | 12 ++++++++++++ kernel/bpf/verifier.c | 9 +++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index baee4e3c6b35..49af826011df 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -550,7 +550,8 @@ struct bpf_prog { blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ - has_callchain_buf:1; /* callchain buffer allocated? */ + has_callchain_buf:1, /* callchain buffer allocated? */ + enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 12c571099575..82d253631325 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1592,6 +1592,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_CGROUP_SKB: + switch (expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + return 0; + default: + return -EINVAL; + } default: return 0; } @@ -1849,6 +1857,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + case BPF_PROG_TYPE_CGROUP_SKB: + return prog->enforce_expected_attach_type && + prog->expected_attach_type != attach_type ? + -EINVAL : 0; default: return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ba9985cba943..4528301eb59b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5685,6 +5685,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) static int check_return_code(struct bpf_verifier_env *env) { + struct tnum enforce_attach_type_range = tnum_unknown; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); @@ -5694,6 +5695,10 @@ static int check_return_code(struct bpf_verifier_env *env) env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) range = tnum_range(1, 1); case BPF_PROG_TYPE_CGROUP_SKB: + if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { + range = tnum_range(0, 3); + enforce_attach_type_range = tnum_range(2, 3); + } case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: @@ -5724,6 +5729,10 @@ static int check_return_code(struct bpf_verifier_env *env) verbose(env, " should have been in %s\n", tn_buf); return -EINVAL; } + + if (!tnum_is_unknown(enforce_attach_type_range) && + tnum_in(enforce_attach_type_range, reg->var_off)) + env->prog->enforce_expected_attach_type = 1; return 0; } -- Gitee From b0a90116ade57d67c11f450c6a9f171d8f63eeea Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:37 -0700 Subject: [PATCH 08/22] bpf: Update __cgroup_bpf_run_filter_skb with cn ANBZ: #5530 commit e7a3160d092aa4dd7fb2ef23335cb3a98400aec6 upstream. For egress packets, __cgroup_bpf_fun_filter_skb() will now call BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY() instead of PROG_CGROUP_RUN_ARRAY() in order to propagate congestion notifications (cn) requests to TCP callers. For egress packets, this function can return: NET_XMIT_SUCCESS (0) - continue with packet output NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr NET_XMIT_CN (2) - continue with packet output and notify TCP to call cwr -EPERM - drop packet For ingress packets, this function will return -EPERM if any attached program was found and if it returned != 1 during execution. Otherwise 0 is returned. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- kernel/bpf/cgroup.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c689cd554ace..c9a6bde0c6a0 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -614,8 +614,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * The program type passed in via @type must be suitable for network * filtering. No further check is performed to assert that. * - * This function will return %-EPERM if any if an attached program was found - * and if it returned != 1 during execution. In all other cases, 0 is returned. + * For egress packets, this function can return: + * NET_XMIT_SUCCESS (0) - continue with packet output + * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr + * NET_XMIT_CN (2) - continue with packet output and notify TCP + * to call cwr + * -EPERM - drop packet + * + * For ingress packets, this function will return -EPERM if any + * attached program was found and if it returned != 1 during execution. + * Otherwise 0 is returned. */ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -641,12 +649,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, - __bpf_prog_run_save_cb); + if (type == BPF_CGROUP_INET_EGRESS) { + ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( + cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + } else { + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + __bpf_prog_run_save_cb); + ret = (ret == 1 ? 0 : -EPERM); + } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; - return ret == 1 ? 0 : -EPERM; + + return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); -- Gitee From 93488942c8ecfac11b7178368f59608036323a01 Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:38 -0700 Subject: [PATCH 09/22] bpf: Update BPF_CGROUP_RUN_PROG_INET_EGRESS calls ANBZ: #5530 commit 956fe2190820df3a6ee530204e059da508159319 upstream. Update BPF_CGROUP_RUN_PROG_INET_EGRESS() callers to support returning congestion notifications from the BPF programs. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- net/ipv4/ip_output.c | 34 +++++++++++++++++++++++----------- net/ipv6/ip6_output.c | 26 +++++++++++++++++--------- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5f70ea3640d6..71ad09c2ba39 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -296,16 +296,9 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, return ret; } -static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ @@ -324,18 +317,37 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk return ip_finish_output2(net, sk, skb); } +static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) { + case NET_XMIT_SUCCESS: + return __ip_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip_finish_output(net, sk, skb) ? : ret; + default: + kfree_skb(skb); + return ret; + } +} + static int ip_mc_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { int ret; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { + switch (ret) { + case NET_XMIT_SUCCESS: + return dev_loopback_xmit(net, sk, skb); + case NET_XMIT_CN: + return dev_loopback_xmit(net, sk, skb) ? : ret; + default: kfree_skb(skb); return ret; } - - return dev_loopback_xmit(net, sk, skb); } int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 86358a6c835c..879f20b34e9b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -128,16 +128,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * return -EINVAL; } -static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } - #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -154,6 +146,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s return ip6_finish_output2(net, sk, skb); } +static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) { + case NET_XMIT_SUCCESS: + return __ip6_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip6_finish_output(net, sk, skb) ? : ret; + default: + kfree_skb(skb); + return ret; + } +} + int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; -- Gitee From 075d713e11691c05899c4cee705f30fe80de4c33 Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:39 -0700 Subject: [PATCH 10/22] bpf: Add cn support to hbm_out_kern.c ANBZ: #5530 commit ffd81558d56c611b1e93f856c77f42046a2deab5 upstream. Update hbm_out_kern.c to support returning cn notifications. Also updates relevant files to allow disabling cn notifications. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- samples/bpf/do_hbm_test.sh | 10 +++++++--- samples/bpf/hbm.c | 18 +++++++++++++++--- samples/bpf/hbm.h | 3 ++- samples/bpf/hbm_out_kern.c | 26 +++++++++++++++++++++----- 4 files changed, 45 insertions(+), 12 deletions(-) diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh index 56c8b4115c95..e48b047d4646 100755 --- a/samples/bpf/do_hbm_test.sh +++ b/samples/bpf/do_hbm_test.sh @@ -13,10 +13,10 @@ Usage() { echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create" echo "loads. The output is the goodput in Mbps (unless -D was used)." echo "" - echo "USAGE: $name [out] [-b=|--bpf=] [-c=|--cc=] [-D]" - echo " [-d=|--delay=] [--debug] [-E]" + echo "USAGE: $name [out] [-b=|--bpf=] [-c=|--cc=]" + echo " [-D] [-d=|--delay=] [--debug] [-E]" echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=|--id=]" - echo " [-l] [-N] [-p=|--port=] [-P]" + echo " [-l] [-N] [--no_cn] [-p=|--port=] [-P]" echo " [-q=] [-R] [-s=|--server=|--time=