From 270f49a9dbe66d44b1ad9bab75f02b3c45e518be Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:39 -0800 Subject: [PATCH 01/23] bpf: enable program stats ANBZ: #5530 commit 492ecee892c2a4ba6a14903d5d586ff750b7e805 upstream. JITed BPF programs are indistinguishable from kernel functions, but unlike kernel code BPF code can be changed often. Typical approach of "perf record" + "perf report" profiling and tuning of kernel code works just as well for BPF programs, but kernel code doesn't need to be monitored whereas BPF programs do. Users load and run large amount of BPF programs. These BPF stats allow tools monitor the usage of BPF on the server. The monitoring tools will turn sysctl kernel.bpf_stats_enabled on and off for few seconds to sample average cost of the programs. Aggregated data over hours and days will provide an insight into cost of BPF and alarms can trigger in case given program suddenly gets more expensive. The cost of two sched_clock() per program invocation adds ~20 nsec. Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down from ~10 nsec to ~30 nsec. static_key minimizes the cost of the stats collection. There is no measurable difference before/after this patch with kernel.bpf_stats_enabled=0 Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/linux/bpf.h | 9 +++++++++ include/linux/filter.h | 20 +++++++++++++++++++- kernel/bpf/core.c | 31 +++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++++-- kernel/bpf/verifier.c | 7 ++++++- kernel/sysctl.c | 34 ++++++++++++++++++++++++++++++++++ 6 files changed, 129 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4bbc1dc3386..832088d0bc1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -16,6 +16,7 @@ #include #include #include +#include struct bpf_verifier_env; struct perf_event; @@ -345,6 +346,12 @@ enum bpf_cgroup_storage_type { #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX +struct bpf_prog_stats { + u64 cnt; + u64 nsecs; + struct u64_stats_sync syncp; +}; + struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -394,6 +401,7 @@ struct bpf_prog_aux { * main prog always has linfo_idx == 0 */ u32 linfo_idx; + struct bpf_prog_stats __percpu *stats; union { struct work_struct work; struct rcu_head rcu; @@ -570,6 +578,7 @@ void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); extern int sysctl_unprivileged_bpf_disabled; +extern int sysctl_bpf_stats_enabled; int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); diff --git a/include/linux/filter.h b/include/linux/filter.h index dc3c1044f247..59f15a2aeee1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -563,7 +563,24 @@ struct sk_filter { struct bpf_prog *prog; }; -#define BPF_PROG_RUN(filter, ctx) ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); }) +DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); + +#define BPF_PROG_RUN(prog, ctx) ({ \ + u32 ret; \ + cant_sleep(); \ + if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ + struct bpf_prog_stats *stats; \ + u64 start = sched_clock(); \ + ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \ + stats = this_cpu_ptr(prog->aux->stats); \ + u64_stats_update_begin(&stats->syncp); \ + stats->cnt++; \ + stats->nsecs += sched_clock() - start; \ + u64_stats_update_end(&stats->syncp); \ + } else { \ + ret = 
(*(prog)->bpf_func)(ctx, (prog)->insnsi); \ + } \ + ret; }) #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN @@ -796,6 +813,7 @@ void bpf_prog_free_jited_linfo(struct bpf_prog *prog); void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog); struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0d1586d4f3a0..6db62aa6d928 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -78,7 +78,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } -struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; @@ -104,6 +104,26 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) return fp; } + +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + struct bpf_prog *prog; + + prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); + if (!prog) + return NULL; + + prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); + if (!prog->aux->stats) { + kfree(prog->aux); + vfree(prog); + return NULL; + } + + u64_stats_init(&prog->aux->stats->syncp); + return prog; +} EXPORT_SYMBOL_GPL(bpf_prog_alloc); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) @@ -231,7 +251,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, void __bpf_prog_free(struct bpf_prog *fp) { - kfree(fp->aux); + if (fp->aux) { + free_percpu(fp->aux->stats); + kfree(fp->aux); + } vfree(fp); } @@ -2067,6 +2090,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } +DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); +EXPORT_SYMBOL(bpf_stats_enabled_key); +int sysctl_bpf_stats_enabled __read_mostly; + /* All definitions of tracepoints related to BPF. 
*/ #define CREATE_TRACE_POINTS #include diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7ddd79726778..5f616df11ff1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1291,24 +1291,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +static void bpf_prog_get_stats(const struct bpf_prog *prog, + struct bpf_prog_stats *stats) +{ + u64 nsecs = 0, cnt = 0; + int cpu; + + for_each_possible_cpu(cpu) { + const struct bpf_prog_stats *st; + unsigned int start; + u64 tnsecs, tcnt; + + st = per_cpu_ptr(prog->aux->stats, cpu); + do { + start = u64_stats_fetch_begin_irq(&st->syncp); + tnsecs = st->nsecs; + tcnt = st->cnt; + } while (u64_stats_fetch_retry_irq(&st->syncp, start)); + nsecs += tnsecs; + cnt += tcnt; + } + stats->nsecs = nsecs; + stats->cnt = cnt; +} + #ifdef CONFIG_PROC_FS static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_prog *prog = filp->private_data; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; + struct bpf_prog_stats stats; + bpf_prog_get_stats(prog, &stats); bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "prog_type:\t%u\n" "prog_jited:\t%u\n" "prog_tag:\t%s\n" "memlock:\t%llu\n" - "prog_id:\t%u\n", + "prog_id:\t%u\n" + "run_time_ns:\t%llu\n" + "run_cnt:\t%llu\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, - prog->aux->id); + prog->aux->id, + stats.nsecs, + stats.cnt); } #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 148c41290c32..c283210c53c6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7532,7 +7532,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; - func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); + /* BPF_PROG_RUN doesn't call subprogs directly, + * hence main prog stats include the runtime of subprogs. 
+ * subprogs don't have IDs and not reachable via prog_get_next_id + * func[i]->aux->stats will never be accessed and stays NULL + */ + func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) goto out_free; memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0b6dc1f8fdda..827ca50fecc5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -240,6 +240,9 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #endif static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses its own private copy */ @@ -1413,6 +1416,15 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif + { + .procname = "bpf_stats_enabled", + .data = &sysctl_bpf_stats_enabled, + .maxlen = sizeof(sysctl_bpf_stats_enabled), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_bpf_stats, + .extra1 = &zero, + .extra2 = &one, + }, #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) { .procname = "panic_on_rcu_stall", @@ -3738,6 +3750,28 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, bpf_stats = *(int *)table->data; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &bpf_stats; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + *(int *)table->data = bpf_stats; + if (bpf_stats) + static_branch_enable(&bpf_stats_enabled_key); + else + static_branch_disable(&bpf_stats_enabled_key); + } + return ret; +} + /* * No sense putting this after each symbol definition, twice, * exception granted :-) -- Gitee From c45c5b53a3effb7db12162f36f58a650d50bf7f8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:40 -0800 Subject: [PATCH 02/23] bpf: expose program stats via bpf_prog_info ANBZ: #5530 commit 5f8f8b93aeb8371c54af08bece2bd04bc2d48707 upstream. 
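This change exposes the counters collected by the previous patch to user space. As a hedged illustration (not part of the patch), a monitoring tool could read them roughly as follows, assuming libbpf's bpf_obj_get_info_by_fd() wrapper and a prog_fd obtained elsewhere; the counters only advance while kernel.bpf_stats_enabled (previous patch) is set to 1:

	struct bpf_prog_info info = {};
	__u32 info_len = sizeof(info);

	/* run_time_ns/run_cnt stay at zero unless stats collection is enabled */
	if (!bpf_obj_get_info_by_fd(prog_fd, &info, &info_len))
		printf("avg cost: %llu ns over %llu runs\n",
		       info.run_cnt ? info.run_time_ns / info.run_cnt : 0ULL,
		       info.run_cnt);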
Return bpf program run_time_ns and run_cnt via bpf_prog_info Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 991b92610531..d6b8a1dc60e8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2814,6 +2814,8 @@ struct bpf_prog_info { __u32 jited_line_info_rec_size; __u32 nr_prog_tags; __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5f616df11ff1..2c2ee3b61884 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2171,6 +2171,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_prog_info info = {}; u32 info_len = attr->info.info_len; + struct bpf_prog_stats stats; char __user *uinsns; u32 ulen; int err; @@ -2210,6 +2211,10 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, if (err) return err; + bpf_prog_get_stats(prog, &stats); + info.run_time_ns = stats.nsecs; + info.run_cnt = stats.cnt; + if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; -- Gitee From 94bc38da7c5e36a22146086c44b0bd3cc047349c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:41 -0800 Subject: [PATCH 03/23] tools/bpf: sync bpf.h into tools ANBZ: #5530 commit b1eca86db68b48f59b9f195c8b4f8114d2a9918c upstream. sync bpf.h into tools directory Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a837314ae674..73a4b9a51b95 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2809,6 +2809,8 @@ struct bpf_prog_info { __u32 jited_line_info_rec_size; __u32 nr_prog_tags; __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; } __attribute__((aligned(8))); struct bpf_map_info { -- Gitee From c2e95c5594c1385cfbcf26200d60f336370a74e5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:42 -0800 Subject: [PATCH 04/23] tools/bpftool: recognize bpf_prog_info run_time_ns and run_cnt ANBZ: #5530 commit 88ad472b8a4ad2292d11835652462fd9f745245e upstream. 
$ bpftool p s 1: kprobe tag a56587d488d216c9 gpl run_time_ns 79786 run_cnt 8 loaded_at 2019-02-22T12:22:51-0800 uid 0 xlated 352B not jited memlock 4096B $ bpftool --json --pretty p s [{ "id": 1, "type": "kprobe", "tag": "a56587d488d216c9", "gpl_compatible": true, "run_time_ns": 79786, "run_cnt": 8, "loaded_at": 1550866971, "uid": 0, "bytes_xlated": 352, "jited": false, "bytes_memlock": 4096 } ] Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 4 +++- tools/bpf/bpftool/prog.c | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 12bc1e2d4b46..9386bd6e0396 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -171,7 +171,7 @@ EXAMPLES :: - 10: xdp name some_prog tag 005a3d2123620c8b gpl + 10: xdp name some_prog tag 005a3d2123620c8b gpl run_time_ns 81632 run_cnt 10 loaded_at 2017-09-29T20:11:00+0000 uid 0 xlated 528B jited 370B memlock 4096B map_ids 10 @@ -184,6 +184,8 @@ EXAMPLES "type": "xdp", "tag": "005a3d2123620c8b", "gpl_compatible": true, + "run_time_ns": 81632, + "run_cnt": 10, "loaded_at": 1506715860, "uid": 0, "bytes_xlated": 528, diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 759c4ea9b56e..9027554a2d37 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -214,6 +214,10 @@ static void print_prog_json(struct bpf_prog_info *info, int fd) info->tag[4], info->tag[5], info->tag[6], info->tag[7]); jsonw_bool_field(json_wtr, "gpl_compatible", info->gpl_compatible); + if (info->run_time_ns) { + jsonw_uint_field(json_wtr, "run_time_ns", info->run_time_ns); + jsonw_uint_field(json_wtr, "run_cnt", info->run_cnt); + } print_dev_json(info->ifindex, info->netns_dev, info->netns_ino); @@ -277,6 +281,9 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd) fprint_hex(stdout, info->tag, BPF_TAG_SIZE, ""); print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino); printf("%s", info->gpl_compatible ? " gpl" : ""); + if (info->run_time_ns) + printf(" run_time_ns %lld run_cnt %lld", + info->run_time_ns, info->run_cnt); printf("\n"); if (info->load_time) { -- Gitee From a281928e6f31a605f5375849503f6b2c9fdc0773 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 27 Feb 2019 18:30:44 -0800 Subject: [PATCH 05/23] bpf: fix build without bpf_syscall ANBZ: #5530 commit 3fcc5530bcb2a879e32bd940e6eafee328ac3647 upstream. 
wrap bpf_stats_enabled sysctl with #ifdef Reported-by: Stephen Rothwell Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/sysctl.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 827ca50fecc5..d8015709ffb2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -240,9 +240,11 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #endif static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_BPF_SYSCALL static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses its own private copy */ @@ -1415,7 +1417,6 @@ static struct ctl_table kern_table[] = { .extra1 = &one, .extra2 = &one, }, -#endif { .procname = "bpf_stats_enabled", .data = &sysctl_bpf_stats_enabled, @@ -1425,6 +1426,7 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#endif #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) { .procname = "panic_on_rcu_stall", @@ -3750,6 +3752,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ +#ifdef CONFIG_BPF_SYSCALL static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -3771,7 +3774,7 @@ static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, } return ret; } - +#endif /* * No sense putting this after each symbol definition, twice, * exception granted :-) -- Gitee From 91d9cf7e1234f2831141c53e3deab5cf818af1ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Mar 2019 14:33:11 -0800 Subject: [PATCH 06/23] bpf: fix u64_stats_init() usage in bpf_prog_alloc() ANBZ: #5530 commit 4b9113045b1745ec8512d6743680809edca6a74e upstream. We need to iterate through all possible cpus. Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Eric Dumazet Reported-by: Guenter Roeck Tested-by: Guenter Roeck Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/bpf/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6db62aa6d928..594dd9fe6d51 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -109,6 +109,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *prog; + int cpu; prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); if (!prog) @@ -121,7 +122,12 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) return NULL; } - u64_stats_init(&prog->aux->stats->syncp); + for_each_possible_cpu(cpu) { + struct bpf_prog_stats *pstats; + + pstats = per_cpu_ptr(prog->aux->stats, cpu); + u64_stats_init(&pstats->syncp); + } return prog; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); -- Gitee From f3d6a9d6821f7a81bc8354f99d6259e20b23fd9e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Mar 2019 21:34:12 +0100 Subject: [PATCH 07/23] bpf: fix sysctl.c warning ANBZ: #5530 commit 78c3aff834f7939d1a2570e366e33f2a880d4d9e upstream. 
When CONFIG_BPF_SYSCALL or CONFIG_SYSCTL is disabled, we get a warning about an unused function: kernel/sysctl.c:3331:12: error: 'proc_dointvec_minmax_bpf_stats' defined but not used [-Werror=unused-function] static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, The CONFIG_BPF_SYSCALL check was already handled, but the SYSCTL check is needed on top. Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Arnd Bergmann Reviewed-by: Kees Cook Reviewed-by: Christian Brauner Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d8015709ffb2..803c16c30bd7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3752,7 +3752,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ -#ifdef CONFIG_BPF_SYSCALL +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) -- Gitee From a23aa009990da1c196a235732587105486601404 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 27 Feb 2019 11:08:06 -0500 Subject: [PATCH 08/23] bpf: add missing entries to bpf_helpers.h ANBZ: #5530 commit f2bb53887eb3e8f859ac7cfc09d1a3801492c009 upstream. This header defines the BPF functions enumerated in uapi/linux.bpf.h in a callable format. Expand to include all registered functions. Signed-off-by: Willem de Bruijn Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/bpf_helpers.h | 30 +++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index dc94528078d3..a908dd501bdd 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -230,6 +230,36 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) = (void *) BPF_FUNC_skb_change_head; static int (*bpf_skb_pull_data)(void *, int len) = (void *) BPF_FUNC_skb_pull_data; +static unsigned int (*bpf_get_cgroup_classid)(void *ctx) = + (void *) BPF_FUNC_get_cgroup_classid; +static unsigned int (*bpf_get_route_realm)(void *ctx) = + (void *) BPF_FUNC_get_route_realm; +static int (*bpf_skb_change_proto)(void *ctx, __be16 proto, __u64 flags) = + (void *) BPF_FUNC_skb_change_proto; +static int (*bpf_skb_change_type)(void *ctx, __u32 type) = + (void *) BPF_FUNC_skb_change_type; +static unsigned int (*bpf_get_hash_recalc)(void *ctx) = + (void *) BPF_FUNC_get_hash_recalc; +static unsigned long long (*bpf_get_current_task)(void *ctx) = + (void *) BPF_FUNC_get_current_task; +static int (*bpf_skb_change_tail)(void *ctx, __u32 len, __u64 flags) = + (void *) BPF_FUNC_skb_change_tail; +static long long (*bpf_csum_update)(void *ctx, __u32 csum) = + (void *) BPF_FUNC_csum_update; +static void (*bpf_set_hash_invalid)(void *ctx) = + (void *) BPF_FUNC_set_hash_invalid; +static int (*bpf_get_numa_node_id)(void) = + (void *) BPF_FUNC_get_numa_node_id; +static int (*bpf_probe_read_str)(void *ctx, __u32 size, + const void *unsafe_ptr) = + (void *) BPF_FUNC_probe_read_str; +static unsigned int (*bpf_get_socket_uid)(void *ctx) = + (void *) BPF_FUNC_get_socket_uid; +static unsigned int (*bpf_set_hash)(void *ctx, __u32 hash) = + (void *) BPF_FUNC_set_hash; +static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, + 
unsigned long long flags) = + (void *) BPF_FUNC_skb_adjust_room; /* Scan the ARCH passed in from ARCH env variable (see Makefile) */ #if defined(__TARGET_ARCH_x86) -- Gitee From ec449d797f18d3656f1b6069e4e83da696b40ddc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Feb 2019 19:04:12 -0800 Subject: [PATCH 09/23] tools: libbpf: add a correctly named define for map iteration ANBZ: #5530 commit f74a53d9a567f6bc6f6d8460e84c76bd2a45d016 upstream. For historical reasons the helper to loop over maps in an object is called bpf_map__for_each while it really should be called bpf_object__for_each_map. Rename and add a correctly named define for backward compatibility. Switch all in-tree users to the correct name (Quentin). Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/bpf/bpftool/prog.c | 4 ++-- tools/lib/bpf/libbpf.c | 8 ++++---- tools/lib/bpf/libbpf.h | 3 ++- tools/perf/util/bpf-loader.c | 4 ++-- tools/testing/selftests/bpf/test_libbpf_open.c | 2 +- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 9027554a2d37..9d3a891a34ad 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1057,7 +1057,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) j = 0; while (j < old_map_fds && map_replace[j].name) { i = 0; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { if (!strcmp(bpf_map__name(map), map_replace[j].name)) { map_replace[j].idx = i; break; @@ -1078,7 +1078,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) /* Set ifindex and name reuse */ j = 0; idx = 0; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { if (!bpf_map__is_offload_neutral(map)) bpf_map__set_ifindex(map, ifindex); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b6e9d22ee0ae..638c6bd89b84 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2109,7 +2109,7 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path) if (err) return err; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { char buf[PATH_MAX]; int len; @@ -2156,7 +2156,7 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) if (!obj) return -ENOENT; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { char buf[PATH_MAX]; int len; @@ -2844,7 +2844,7 @@ bpf_object__find_map_by_name(struct bpf_object *obj, const char *name) { struct bpf_map *pos; - bpf_map__for_each(pos, obj) { + bpf_object__for_each_map(pos, obj) { if (pos->name && !strcmp(pos->name, name)) return pos; } @@ -2937,7 +2937,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, first_prog = prog; } - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { if (!bpf_map__is_offload_neutral(map)) map->map_ifindex = attr->ifindex; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 0409781c44cc..aa1521a51687 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -279,10 +279,11 @@ bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset); LIBBPF_API struct bpf_map * bpf_map__next(struct bpf_map *map, struct bpf_object *obj); -#define bpf_map__for_each(pos, obj) \ +#define bpf_object__for_each_map(pos, obj) \ for ((pos) = bpf_map__next(NULL, (obj)); \ (pos) != NULL; \ (pos) = bpf_map__next((pos), (obj))) +#define bpf_map__for_each bpf_object__for_each_map LIBBPF_API struct bpf_map 
* bpf_map__prev(struct bpf_map *map, struct bpf_object *obj); diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 30f28daaf51a..0e03c4e9351a 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -1489,7 +1489,7 @@ apply_obj_config_object(struct bpf_object *obj) struct bpf_map *map; int err; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { err = apply_obj_config_map(map); if (err) return err; @@ -1513,7 +1513,7 @@ int bpf__apply_obj_config(void) #define bpf__for_each_map(pos, obj, objtmp) \ bpf_object__for_each_safe(obj, objtmp) \ - bpf_map__for_each(pos, obj) + bpf_object__for_each_map(pos, obj) #define bpf__for_each_map_named(pos, obj, objtmp, name) \ bpf__for_each_map(pos, obj, objtmp) \ diff --git a/tools/testing/selftests/bpf/test_libbpf_open.c b/tools/testing/selftests/bpf/test_libbpf_open.c index c413fb96afef..9e9db202d218 100644 --- a/tools/testing/selftests/bpf/test_libbpf_open.c +++ b/tools/testing/selftests/bpf/test_libbpf_open.c @@ -69,7 +69,7 @@ int test_walk_maps(struct bpf_object *obj, bool verbose) struct bpf_map *map; int cnt = 0; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { cnt++; if (verbose) printf("Map (count:%d) name: %s\n", cnt, -- Gitee From 04448655f37da3de33ff2cac1e4378ac9966d5b5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 15:31:22 -0800 Subject: [PATCH 10/23] libbpf: fix formatting for btf_ext__get_raw_data ANBZ: #5530 commit 1baabdc1089eb807cdcabebad50b36c8b9895a48 upstream. Fix invalid formatting of pointer arg. Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/lib/bpf/btf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 94bbc249b0f1..b60bb7cf5fff 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -76,7 +76,7 @@ LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, LIBBPF_API struct btf_ext *btf_ext__new(__u8 *data, __u32 size); LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); -LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext* btf_ext, +LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size); LIBBPF_API int btf_ext__reloc_func_info(const struct btf *btf, const struct btf_ext *btf_ext, -- Gitee From 87b6c26ddbf2da3b04f164ac75c897966e740a40 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 15:31:23 -0800 Subject: [PATCH 11/23] btf: allow to customize dedup hash table size ANBZ: #5530 commit 51edf5f6e015c48b62e24ab2fbcad8885ca1c74e upstream. Default size of dedup table (16k) is good enough for most binaries, even typical vmlinux images. But there are cases of binaries with huge amount of BTF types (e.g., allyesconfig variants of kernel), which benefit from having bigger dedup table size to lower amount of unnecessary hash collisions. Tools like pahole, thus, can tune this parameter to reach optimal performance. This change also serves double purpose of allowing tests to force hash collisions to test some corner cases, used in follow up patch. 
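A hedged usage sketch (not part of the patch; btf and btf_ext are assumed to be already loaded by the caller). libbpf rounds the requested size up to a power of two and caps it at 2^31, falling back to the 16k default when 0 is passed:

	struct btf_dedup_opts opts = {
		.dedup_table_size = 1U << 20,	/* e.g. for allyesconfig-sized BTF */
		.dont_resolve_fwds = false,
	};
	int err = btf__dedup(btf, btf_ext, &opts);

	if (err)
		fprintf(stderr, "btf__dedup failed: %d\n", err);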
Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/lib/bpf/btf.c | 53 ++++++++++++++++++++++++++++++--------------- tools/lib/bpf/btf.h | 1 + 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 9dc1c1cccf63..2b179c68722d 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1069,8 +1069,8 @@ int btf__dedup(struct btf *btf, struct btf_ext *btf_ext, return err; } -#define BTF_DEDUP_TABLE_SIZE_LOG 14 -#define BTF_DEDUP_TABLE_MOD ((1 << BTF_DEDUP_TABLE_SIZE_LOG) - 1) +#define BTF_DEDUP_TABLE_DEFAULT_SIZE (1 << 14) +#define BTF_DEDUP_TABLE_MAX_SIZE_LOG 31 #define BTF_UNPROCESSED_ID ((__u32)-1) #define BTF_IN_PROGRESS_ID ((__u32)-2) @@ -1127,18 +1127,21 @@ static inline __u32 hash_combine(__u32 h, __u32 value) #undef GOLDEN_RATIO_PRIME } -#define for_each_hash_node(table, hash, node) \ - for (node = table[hash & BTF_DEDUP_TABLE_MOD]; node; node = node->next) +#define for_each_dedup_cand(d, hash, node) \ + for (node = d->dedup_table[hash & (d->opts.dedup_table_size - 1)]; \ + node; \ + node = node->next) static int btf_dedup_table_add(struct btf_dedup *d, __u32 hash, __u32 type_id) { struct btf_dedup_node *node = malloc(sizeof(struct btf_dedup_node)); + int bucket = hash & (d->opts.dedup_table_size - 1); if (!node) return -ENOMEM; node->type_id = type_id; - node->next = d->dedup_table[hash & BTF_DEDUP_TABLE_MOD]; - d->dedup_table[hash & BTF_DEDUP_TABLE_MOD] = node; + node->next = d->dedup_table[bucket]; + d->dedup_table[bucket] = node; return 0; } @@ -1176,7 +1179,7 @@ static void btf_dedup_table_free(struct btf_dedup *d) if (!d->dedup_table) return; - for (i = 0; i < (1 << BTF_DEDUP_TABLE_SIZE_LOG); i++) { + for (i = 0; i < d->opts.dedup_table_size; i++) { while (d->dedup_table[i]) { tmp = d->dedup_table[i]; d->dedup_table[i] = tmp->next; @@ -1211,19 +1214,37 @@ static void btf_dedup_free(struct btf_dedup *d) free(d); } +/* Find closest power of two >= to size, capped at 2^max_size_log */ +static __u32 roundup_pow2_max(__u32 size, int max_size_log) +{ + int i; + + for (i = 0; i < max_size_log && (1U << i) < size; i++) + ; + return 1U << i; +} + + static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, const struct btf_dedup_opts *opts) { struct btf_dedup *d = calloc(1, sizeof(struct btf_dedup)); int i, err = 0; + __u32 sz; if (!d) return ERR_PTR(-ENOMEM); + d->opts.dont_resolve_fwds = opts && opts->dont_resolve_fwds; + sz = opts && opts->dedup_table_size ? 
opts->dedup_table_size + : BTF_DEDUP_TABLE_DEFAULT_SIZE; + sz = roundup_pow2_max(sz, BTF_DEDUP_TABLE_MAX_SIZE_LOG); + d->opts.dedup_table_size = sz; + d->btf = btf; d->btf_ext = btf_ext; - d->dedup_table = calloc(1 << BTF_DEDUP_TABLE_SIZE_LOG, + d->dedup_table = calloc(d->opts.dedup_table_size, sizeof(struct btf_dedup_node *)); if (!d->dedup_table) { err = -ENOMEM; @@ -1248,8 +1269,6 @@ static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, for (i = 0; i <= btf->nr_types; i++) d->hypot_map[i] = BTF_UNPROCESSED_ID; - d->opts.dont_resolve_fwds = opts && opts->dont_resolve_fwds; - done: if (err) { btf_dedup_free(d); @@ -1823,7 +1842,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_INT: h = btf_hash_int(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_int(t, cand)) { new_id = cand_node->type_id; @@ -1834,7 +1853,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_ENUM: h = btf_hash_enum(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_enum(t, cand)) { new_id = cand_node->type_id; @@ -1845,7 +1864,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_FWD: h = btf_hash_common(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_common(t, cand)) { new_id = cand_node->type_id; @@ -2262,7 +2281,7 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) return 0; h = btf_hash_struct(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { int eq; btf_dedup_clear_hypot_map(d); @@ -2349,7 +2368,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) t->type = ref_type_id; h = btf_hash_common(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_common(t, cand)) { new_id = cand_node->type_id; @@ -2372,7 +2391,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) info->index_type = ref_type_id; h = btf_hash_array(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_array(t, cand)) { new_id = cand_node->type_id; @@ -2403,7 +2422,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) } h = btf_hash_fnproto(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_fnproto(t, cand)) { new_id = cand_node->type_id; diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index b60bb7cf5fff..28a1e1e59861 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -90,6 +90,7 @@ LIBBPF_API __u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext); LIBBPF_API __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext); struct btf_dedup_opts { + unsigned int dedup_table_size; bool dont_resolve_fwds; }; -- Gitee From 8a8aa2467fb3f606f4f80d0fc4ff95327039dcb2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 15:31:24 -0800 Subject: [PATCH 12/23] btf: fix bug with resolving STRUCT/UNION into corresponding FWD ANBZ: #5530 commit 
91097fbee4c025ac72f91ae41feba3a822cc1316 upstream. When checking available canonical candidates for struct/union algorithm utilizes btf_dedup_is_equiv to determine if candidate is suitable. This check is not enough when candidate is corresponding FWD for that struct/union, because according to equivalence logic they are equivalent. When it so happens that FWD and STRUCT/UNION end in hashing to the same bucket, it's possible to create remapping loop from FWD to STRUCT and STRUCT to same FWD, which will cause btf_dedup() to loop forever. This patch fixes the issue by additionally checking that type and canonical candidate are strictly equal (utilizing btf_equal_struct). Fixes: d5caef5b5655 ("btf: add BTF types deduplication algorithm") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/lib/bpf/btf.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 2b179c68722d..8aeb80d73285 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1662,7 +1662,7 @@ static __u32 btf_hash_struct(struct btf_type *t) * IDs. This check is performed during type graph equivalence check and * referenced types equivalence is checked separately. */ -static bool btf_equal_struct(struct btf_type *t1, struct btf_type *t2) +static bool btf_shallow_equal_struct(struct btf_type *t1, struct btf_type *t2) { struct btf_member *m1, *m2; __u16 vlen; @@ -2123,7 +2123,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, struct btf_member *cand_m, *canon_m; __u16 vlen; - if (!btf_equal_struct(cand_type, canon_type)) + if (!btf_shallow_equal_struct(cand_type, canon_type)) return 0; vlen = BTF_INFO_VLEN(cand_type->info); cand_m = (struct btf_member *)(cand_type + 1); @@ -2264,7 +2264,7 @@ static void btf_dedup_merge_hypot_map(struct btf_dedup *d) static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) { struct btf_dedup_node *cand_node; - struct btf_type *t; + struct btf_type *cand_type, *t; /* if we don't find equivalent type, then we are canonical */ __u32 new_id = type_id; __u16 kind; @@ -2284,6 +2284,20 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) for_each_dedup_cand(d, h, cand_node) { int eq; + /* + * Even though btf_dedup_is_equiv() checks for + * btf_shallow_equal_struct() internally when checking two + * structs (unions) for equivalence, we need to guard here + * from picking matching FWD type as a dedup candidate. + * This can happen due to hash collision. In such case just + * relying on btf_dedup_is_equiv() would lead to potentially + * creating a loop (FWD -> STRUCT and STRUCT -> FWD), because + * FWD and compatible STRUCT/UNION are considered equivalent. + */ + cand_type = d->btf->types[cand_node->type_id]; + if (!btf_shallow_equal_struct(t, cand_type)) + continue; + btf_dedup_clear_hypot_map(d); eq = btf_dedup_is_equiv(d, type_id, cand_node->type_id); if (eq < 0) -- Gitee From e8406d8a4cddafcb76761e006c33563ee71b4ea3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 15:31:25 -0800 Subject: [PATCH 13/23] selftests/bpf: add btf_dedup test of FWD/STRUCT resolution ANBZ: #5530 commit 7c7a4890c87dd2eb77ef144e5153a7df4c0d6f53 upstream. This patch adds a btf_dedup test exercising logic of STRUCT<->FWD resolution and validating that STRUCT is not resolved to a FWD. 
It also forces hash collisions, forcing both FWD and STRUCT to be candidates for each other. Previously this condition caused infinite loop due to FWD pointing to STRUCT and STRUCT pointing to its FWD. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/test_btf.c | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 591c9b40dae6..f01006873b09 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -5731,6 +5731,51 @@ const struct btf_dedup_test dedup_tests[] = { .dont_resolve_fwds = false, }, }, +{ + .descr = "dedup: struct <-> fwd resolution w/ hash collision", + /* + * // CU 1: + * struct x; + * struct s { + * struct x *x; + * }; + * // CU 2: + * struct x {}; + * struct s { + * struct x *x; + * }; + */ + .input = { + .raw_types = { + /* CU 1 */ + BTF_FWD_ENC(NAME_TBD, 0 /* struct fwd */), /* [1] fwd x */ + BTF_PTR_ENC(1), /* [2] ptr -> [1] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [3] struct s */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + /* CU 2 */ + BTF_STRUCT_ENC(NAME_TBD, 0, 0), /* [4] struct x */ + BTF_PTR_ENC(4), /* [5] ptr -> [4] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [6] struct s */ + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_END_RAW, + }, + BTF_STR_SEC("\0x\0s\0x\0x\0s\0x\0"), + }, + .expect = { + .raw_types = { + BTF_PTR_ENC(3), /* [1] ptr -> [3] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [2] struct s */ + BTF_MEMBER_ENC(NAME_TBD, 1, 0), + BTF_STRUCT_ENC(NAME_NTH(2), 0, 0), /* [3] struct x */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0s\0x"), + }, + .opts = { + .dont_resolve_fwds = false, + .dedup_table_size = 1, /* force hash collisions */ + }, +}, { .descr = "dedup: all possible kinds (no duplicates)", .input = { -- Gitee From c9daac93ed5f4390669cb4c877b2837a9d8ad0ad Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:46 -0800 Subject: [PATCH 14/23] bpf: add bpf helper bpf_skb_ecn_set_ce ANBZ: #5530 commit f7c917ba11a67632a8452ea99fe132f626a7a2cc upstream. This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce "int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can be attached to the ingress and egress path. The helper is needed because his type of bpf_prog cannot modify the skb directly. This helper is used to set the ECN field of ECN capable IP packets to ce (congestion encountered) in the IPv6 or IPv4 header of the skb. It can be used by a bpf_prog to manage egress or ingress network bandwdith limit per cgroupv2 by inducing an ECN response in the TCP sender. This works best when using DCTCP. [backport note] The former patch "bpf: Add bpf_get_listener_sock(struct bpf_sock *sk) helper" introduced bpf_get_listener_sock which is introduced after bpf_skb_ecn_set_ce in upstream so code change in description and __BPF_FUNC_MAPPER in include/uapi/linux/bpf.h has already applied in the former patch. 
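For illustration only, a minimal (hypothetical) cgroup-skb egress program using the new helper, written against the selftests' bpf_helpers.h declarations; the HBM sample added later in this series is the real consumer and gates the call on its credit state:

	#include <linux/bpf.h>
	#include "bpf_helpers.h"

	SEC("cgroup_skb/egress")
	int ecn_ce_example(struct __sk_buff *skb)
	{
		/* Unconditionally CE-mark ECN-capable IPv4/IPv6 packets;
		 * the helper is a no-op for everything else.
		 */
		bpf_skb_ecn_set_ce(skb);
		return 1;	/* 1 == allow packet for cgroup skb programs */
	}

	char _license[] SEC("license") = "GPL";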
Signed-off-by: Lawrence Brakmo Signed-off-by: Martin KaFai Lau Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- net/core/filter.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index da08f2b51743..f250d398015f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5380,6 +5380,32 @@ static const struct bpf_func_proto bpf_get_listener_sock_proto = { .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) +{ + unsigned int iphdr_len; + + if (skb->protocol == cpu_to_be16(ETH_P_IP)) + iphdr_len = sizeof(struct iphdr); + else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) + iphdr_len = sizeof(struct ipv6hdr); + else + return 0; + + if (skb_headlen(skb) < iphdr_len) + return 0; + + if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) + return 0; + + return INET_ECN_set_ce(skb); +} + +static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { + .func = bpf_skb_ecn_set_ce, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5538,6 +5564,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; + case BPF_FUNC_skb_ecn_set_ce: + return &bpf_skb_ecn_set_ce_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; #endif -- Gitee From ef0efa031717f75ad74ba0814d465b2ba982efe3 Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:47 -0800 Subject: [PATCH 15/23] bpf: sync bpf.h to tools and update bpf_helpers.h ANBZ: #5530 commit 5cce85c640ccc9d9aab8b05c77d7d076a44d4db2 upstream. This patch syncs the uapi bpf.h to tools/ and also updates bpf_herlpers.h in tools/ Signed-off-by: Lawrence Brakmo Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/bpf_helpers.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index a908dd501bdd..eaf911037a11 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -176,6 +176,8 @@ static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) BPF_FUNC_sk_fullsock; static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) BPF_FUNC_tcp_sock; +static int (*bpf_skb_ecn_set_ce)(void *ctx) = + (void *) BPF_FUNC_skb_ecn_set_ce; static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) BPF_FUNC_get_listener_sock; -- Gitee From 8960db216a46fba3fda838b085bd9c02e38aa784 Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:48 -0800 Subject: [PATCH 16/23] bpf: Sample HBM BPF program to limit egress bw ANBZ: #5530 commit 187d0738ff351f725a58be3d606d3a7fc8db8aed upstream. A cgroup skb BPF program to limit cgroup output bandwidth. It uses a modified virtual token bucket queue to limit average egress bandwidth. The implementation uses credits instead of tokens. Negative credits imply that queueing would have happened (this is a virtual queue, so no queueing is done by it. However, queueing may occur at the actual qdisc (which is not used for rate limiting). 
This implementation uses 3 thresholds, one to start marking packets and the other two to drop packets: CREDIT - <--------------------------|------------------------> + | | | 0 | Large pkt | | drop thresh | Small pkt drop Mark threshold thresh The effect of marking depends on the type of packet: a) If the packet is ECN enabled, then the packet is ECN ce marked. The current mark threshold is tuned for DCTCP. c) Else, it is dropped if it is a large packet. If the credit is below the drop threshold, the packet is dropped. Note that dropping a packet through the BPF program does not trigger CWR (Congestion Window Reduction) in TCP packets. A future patch will add support for triggering CWR. This BPF program actually uses 2 drop thresholds, one threshold for larger packets (>= 120 bytes) and another for smaller packets. This protects smaller packets such as SYNs, ACKs, etc. The default bandwidth limit is set at 1Gbps but this can be changed by a user program through a shared BPF map. In addition, by default this BPF program does not limit connections using loopback. This behavior can be overwritten by the user program. There is also an option to calculate some statistics, such as percent of packets marked or dropped, which the user program can access. A latter patch provides such a program (hbm.c) Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- samples/bpf/Makefile | 2 + samples/bpf/hbm.h | 31 ++++++++ samples/bpf/hbm_kern.h | 137 ++++++++++++++++++++++++++++++++ samples/bpf/hbm_out_kern.c | 157 +++++++++++++++++++++++++++++++++++++ 4 files changed, 327 insertions(+) create mode 100644 samples/bpf/hbm.h create mode 100644 samples/bpf/hbm_kern.h create mode 100644 samples/bpf/hbm_out_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index d6b9da7b5bcf..ef32859701fe 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -166,6 +166,7 @@ always += xdpsock_kern.o always += xdp_fwd_kern.o always += task_fd_query_kern.o always += xdp_sample_pkts_kern.o +always += hbm_out_kern.o KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -267,6 +268,7 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) $(src)/*.c: verify_target_bpf $(LIBBPF) $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h +$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h new file mode 100644 index 000000000000..518e8147d084 --- /dev/null +++ b/samples/bpf/hbm.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Include file for Host Bandwidth Management (HBM) programs + */ +struct hbm_vqueue { + struct bpf_spin_lock lock; + /* 4 byte hole */ + unsigned long long lasttime; /* In ns */ + int credit; /* In bytes */ + unsigned int rate; /* In bytes per NS << 20 */ +}; + +struct hbm_queue_stats { + unsigned long rate; /* in Mbps*/ + unsigned long stats:1, /* get HBM stats (marked, dropped,..) 
*/ + loopback:1; /* also limit flows using loopback */ + unsigned long long pkts_marked; + unsigned long long bytes_marked; + unsigned long long pkts_dropped; + unsigned long long bytes_dropped; + unsigned long long pkts_total; + unsigned long long bytes_total; + unsigned long long firstPacketTime; + unsigned long long lastPacketTime; +}; diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h new file mode 100644 index 000000000000..c5635d924193 --- /dev/null +++ b/samples/bpf/hbm_kern.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Include file for sample Host Bandwidth Manager (HBM) BPF programs + */ +#define KBUILD_MODNAME "foo" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_endian.h" +#include "bpf_helpers.h" +#include "hbm.h" + +#define DROP_PKT 0 +#define ALLOW_PKT 1 +#define TCP_ECN_OK 1 + +#define HBM_DEBUG 0 // Set to 1 to enable debugging +#if HBM_DEBUG +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) +#else +#define bpf_printk(fmt, ...) +#endif + +#define INITIAL_CREDIT_PACKETS 100 +#define MAX_BYTES_PER_PACKET 1500 +#define MARK_THRESH (40 * MAX_BYTES_PER_PACKET) +#define DROP_THRESH (80 * 5 * MAX_BYTES_PER_PACKET) +#define LARGE_PKT_DROP_THRESH (DROP_THRESH - (15 * MAX_BYTES_PER_PACKET)) +#define MARK_REGION_SIZE (LARGE_PKT_DROP_THRESH - MARK_THRESH) +#define LARGE_PKT_THRESH 120 +#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET) +#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET) + +// rate in bytes per ns << 20 +#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) + +struct bpf_map_def SEC("maps") queue_state = { + .type = BPF_MAP_TYPE_CGROUP_STORAGE, + .key_size = sizeof(struct bpf_cgroup_storage_key), + .value_size = sizeof(struct hbm_vqueue), +}; +BPF_ANNOTATE_KV_PAIR(queue_state, struct bpf_cgroup_storage_key, + struct hbm_vqueue); + +struct bpf_map_def SEC("maps") queue_stats = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct hbm_queue_stats), + .max_entries = 1, +}; +BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats); + +struct hbm_pkt_info { + bool is_ip; + bool is_tcp; + short ecn; +}; + +static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, + struct hbm_pkt_info *pkti) +{ + struct iphdr iph; + struct ipv6hdr *ip6h; + + bpf_skb_load_bytes(skb, 0, &iph, 12); + if (iph.version == 6) { + ip6h = (struct ipv6hdr *)&iph; + pkti->is_ip = true; + pkti->is_tcp = (ip6h->nexthdr == 6); + pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK; + } else if (iph.version == 4) { + pkti->is_ip = true; + pkti->is_tcp = (iph.protocol == 6); + pkti->ecn = iph.tos & INET_ECN_MASK; + } else { + pkti->is_ip = false; + pkti->is_tcp = false; + pkti->ecn = 0; + } +} + +static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) +{ + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = bpf_ktime_get_ns(); + qdp->credit = INIT_CREDIT; + qdp->rate = rate * 128; +} + +static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, + int len, + unsigned long long curtime, + bool congestion_flag, + bool drop_flag) +{ 
+ if (qsp != NULL) { + // Following is needed for work conserving + __sync_add_and_fetch(&(qsp->bytes_total), len); + if (qsp->stats) { + // Optionally update statistics + if (qsp->firstPacketTime == 0) + qsp->firstPacketTime = curtime; + qsp->lastPacketTime = curtime; + __sync_add_and_fetch(&(qsp->pkts_total), 1); + if (congestion_flag || drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_marked), 1); + __sync_add_and_fetch(&(qsp->bytes_marked), len); + } + if (drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_dropped), 1); + __sync_add_and_fetch(&(qsp->bytes_dropped), + len); + } + } + } +} diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c new file mode 100644 index 000000000000..f806863d0b79 --- /dev/null +++ b/samples/bpf/hbm_out_kern.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Sample Host Bandwidth Manager (HBM) BPF program. + * + * A cgroup skb BPF egress program to limit cgroup output bandwidth. + * It uses a modified virtual token bucket queue to limit average + * egress bandwidth. The implementation uses credits instead of tokens. + * Negative credits imply that queueing would have happened (this is + * a virtual queue, so no queueing is done by it. However, queueing may + * occur at the actual qdisc (which is not used for rate limiting). + * + * This implementation uses 3 thresholds, one to start marking packets and + * the other two to drop packets: + * CREDIT + * - <--------------------------|------------------------> + + * | | | 0 + * | Large pkt | + * | drop thresh | + * Small pkt drop Mark threshold + * thresh + * + * The effect of marking depends on the type of packet: + * a) If the packet is ECN enabled and it is a TCP packet, then the packet + * is ECN marked. + * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr + * to reduce the congestion window. The current implementation uses a linear + * distribution (0% probability at marking threshold, 100% probability + * at drop threshold). + * c) If the packet is not a TCP packet, then it is dropped. + * + * If the credit is below the drop threshold, the packet is dropped. If it + * is a TCP packet, then it also calls tcp_cwr since packets dropped by + * by a cgroup skb BPF program do not automatically trigger a call to + * tcp_cwr in the current kernel code. + * + * This BPF program actually uses 2 drop thresholds, one threshold + * for larger packets (>= 120 bytes) and another for smaller packets. This + * protects smaller packets such as SYNs, ACKs, etc. + * + * The default bandwidth limit is set at 1Gbps but this can be changed by + * a user program through a shared BPF map. In addition, by default this BPF + * program does not limit connections using loopback. This behavior can be + * overwritten by the user program. There is also an option to calculate + * some statistics, such as percent of packets marked or dropped, which + * the user program can access. 
+ * + * A latter patch provides such a program (hbm.c) + */ + +#include "hbm_kern.h" + +SEC("cgroup_skb/egress") +int _hbm_out_cg(struct __sk_buff *skb) +{ + struct hbm_pkt_info pkti; + int len = skb->len; + unsigned int queue_index = 0; + unsigned long long curtime; + int credit; + signed long long delta = 0, zero = 0; + int max_credit = MAX_CREDIT; + bool congestion_flag = false; + bool drop_flag = false; + bool cwr_flag = false; + struct hbm_vqueue *qdp; + struct hbm_queue_stats *qsp = NULL; + int rv = ALLOW_PKT; + + qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); + if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) + return ALLOW_PKT; + + hbm_get_pkt_info(skb, &pkti); + + // We may want to account for the length of headers in len + // calculation, like ETH header + overhead, specially if it + // is a gso packet. But I am not doing it right now. + + qdp = bpf_get_local_storage(&queue_state, 0); + if (!qdp) + return ALLOW_PKT; + else if (qdp->lasttime == 0) + hbm_init_vqueue(qdp, 1024); + + curtime = bpf_ktime_get_ns(); + + // Begin critical section + bpf_spin_lock(&qdp->lock); + credit = qdp->credit; + delta = curtime - qdp->lasttime; + /* delta < 0 implies that another process with a curtime greater + * than ours beat us to the critical section and already added + * the new credit, so we should not add it ourselves + */ + if (delta > 0) { + qdp->lasttime = curtime; + credit += CREDIT_PER_NS(delta, qdp->rate); + if (credit > MAX_CREDIT) + credit = MAX_CREDIT; + } + credit -= len; + qdp->credit = credit; + bpf_spin_unlock(&qdp->lock); + // End critical section + + // Check if we should update rate + if (qsp != NULL && (qsp->rate * 128) != qdp->rate) { + qdp->rate = qsp->rate * 128; + bpf_printk("Updating rate: %d (1sec:%llu bits)\n", + (int)qdp->rate, + CREDIT_PER_NS(1000000000, qdp->rate) * 8); + } + + // Set flags (drop, congestion, cwr) + // Dropping => we are congested, so ignore congestion flag + if (credit < -DROP_THRESH || + (len > LARGE_PKT_THRESH && + credit < -LARGE_PKT_DROP_THRESH)) { + // Very congested, set drop flag + drop_flag = true; + } else if (credit < 0) { + // Congested, set congestion flag + if (pkti.ecn) { + if (credit < -MARK_THRESH) + congestion_flag = true; + else + congestion_flag = false; + } else { + congestion_flag = true; + } + } + + if (congestion_flag) { + if (!bpf_skb_ecn_set_ce(skb)) { + if (len > LARGE_PKT_THRESH) { + // Problem if too many small packets? + drop_flag = true; + } + } + } + + if (drop_flag) + rv = DROP_PKT; + + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag); + + if (rv == DROP_PKT) + __sync_add_and_fetch(&(qdp->credit), len); + + return rv; +} +char _license[] SEC("license") = "GPL"; -- Gitee From 5549cbc0580ab41a9a2f7c43ce19f6bdf749fc18 Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:49 -0800 Subject: [PATCH 17/23] bpf: User program for testing HBM ANBZ: #5530 commit a1270fe95b74eb3195b107c494ed1f11b932a278 upstream. The program nrm creates a cgroup and attaches a BPF program to the cgroup for testing HBM (Host Bandwidth Manager) for egress traffic. One still needs to create network traffic. This can be done through netesto, netperf or iperf3. A follow-up patch contains a script to create traffic. USAGE: hbm [-d] [-l] [-n ] [-r ] [-s] [-t ] [-w] [-h] [prog] Where: -d Print BPF trace debug buffer -l Also limit flows doing loopback -n <#> To create cgroup "/hbm#" and attach prog. 
              Default is /nrm1
              This is convenient when testing HBM in more than 1 cgroup
    -r        Rate limit in Mbps
    -s        Get HBM stats (marked, dropped, etc.)
    -t