From a80f4c48e0e18a309d54c1a512f2ffaf6836fb58 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:40 -0700 Subject: [PATCH 01/21] bpf: add verifier stats and log_level bit 2 ANBZ: #5530 commit 06ee7115b0d1742de745ad143fb5e06d77d27fba upstream. In order to understand the verifier bottlenecks add various stats and extend log_level: log_level 1 and 2 are kept as-is: bit 0 - level=1 - print every insn and verifier state at branch points bit 1 - level=2 - print every insn and verifier state at every insn bit 2 - level=4 - print verifier error and stats at the end of verification When verifier rejects the program the libbpf is trying to load the program twice. Once with log_level=0 (no messages, only error code is reported to user space) and second time with log_level=1 to tell the user why the verifier rejected it. With introduction of bit 2 - level=4 the libbpf can choose to always use that level and load programs once, since the verification speed is not affected and in case of error the verbose message will be available. Note that the verifier stats are not part of uapi just like all other verbose messages. They're expected to change in the future. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/linux/bpf_verifier.h | 21 ++++++++++ kernel/bpf/verifier.c | 76 ++++++++++++++++++++++++------------ 2 files changed, 73 insertions(+), 24 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index cecdce3d0491..603320963729 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -249,6 +249,12 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) return log->len_used >= log->len_total - 1; } +#define BPF_LOG_LEVEL1 1 +#define BPF_LOG_LEVEL2 2 +#define BPF_LOG_STATS 4 +#define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) +#define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) + static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log->level && log->ubuf && !bpf_verifier_log_full(log); @@ -285,6 +291,21 @@ struct bpf_verifier_env { struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; + /* number of instructions analyzed by the verifier */ + u32 insn_processed; + /* total verification time */ + u64 verification_time; + /* maximum number of verifier states kept in 'branching' instructions */ + u32 max_states_per_insn; + /* total number of allocated verifier states */ + u32 total_states; + /* some states are freed during program analysis. + * this is peak number of states. 
this number dominates kernel + * memory consumption during verification + */ + u32 peak_states; + /* longest register parentage chain walked for liveness marking */ + u32 longest_mark_read_walk; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 28725053aa0f..f77c6c71d04b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1090,7 +1090,7 @@ static int check_subprogs(struct bpf_verifier_env *env) */ subprog[env->subprog_cnt].start = insn_cnt; - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); @@ -1137,6 +1137,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, struct bpf_reg_state *parent) { bool writes = parent == state->parent; /* Observe write marks */ + int cnt = 0; while (parent) { /* if read wasn't screened by an earlier write ... */ @@ -1153,7 +1154,11 @@ static int mark_reg_read(struct bpf_verifier_env *env, state = parent; parent = state->parent; writes = true; + cnt++; } + + if (env->longest_mark_read_walk < cnt) + env->longest_mark_read_walk = cnt; return 0; } @@ -1448,7 +1453,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, state); /* The minimum value is only important with signed @@ -2981,7 +2986,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* and go analyze first insn of the callee */ *insn_idx = target_insn; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); print_verifier_state(env, caller); verbose(env, "callee:\n"); @@ -3021,7 +3026,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; *insn_idx = callee->callsite + 1; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); print_verifier_state(env, callee); verbose(env, "to caller at %d:\n", *insn_idx); @@ -5231,7 +5236,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->dst_reg); return -EACCES; } - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -6416,6 +6421,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) states_cnt++; } + if (env->max_states_per_insn < states_cnt) + env->max_states_per_insn = states_cnt; + if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return 0; @@ -6429,6 +6437,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; + env->total_states++; + env->peak_states++; /* add new state to the head of linked list */ new = &new_sl->state; @@ -6513,8 +6523,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; - int insn_cnt = env->prog->len, i; - int insn_processed = 0; + int insn_cnt = env->prog->len; bool do_print_state = false; env->prev_linfo = NULL; @@ -6549,10 +6558,10 @@ static int do_check(struct bpf_verifier_env *env) insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); - if (++insn_processed > 
BPF_COMPLEXITY_LIMIT_INSNS) { + if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose(env, "BPF program is too large. Processed %d insn\n", - insn_processed); + env->insn_processed); return -E2BIG; } @@ -6561,7 +6570,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { if (do_print_state) verbose(env, "\nfrom %d to %d%s: safe\n", env->prev_insn_idx, env->insn_idx, @@ -6579,8 +6588,9 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (env->log.level > 1 || (env->log.level && do_print_state)) { - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2 || + (env->log.level & BPF_LOG_LEVEL && do_print_state)) { + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "%d:", env->insn_idx); else verbose(env, "\nfrom %d to %d%s:", @@ -6591,7 +6601,7 @@ static int do_check(struct bpf_verifier_env *env) do_print_state = false; } - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { const struct bpf_insn_cbs cbs = { .cb_print = verbose, .private_data = env, @@ -6856,16 +6866,6 @@ static int do_check(struct bpf_verifier_env *env) env->insn_idx++; } - verbose(env, "processed %d insns (limit %d), stack depth ", - insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); - for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_info[i].stack_depth; - - verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt) - verbose(env, "+"); - } - verbose(env, "\n"); env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -8094,9 +8094,34 @@ static void free_states(struct bpf_verifier_env *env) kfree(env->explored_states); } +static void print_verification_stats(struct bpf_verifier_env *env) +{ + int i; + + if (env->log.level & BPF_LOG_STATS) { + verbose(env, "verification time %lld usec\n", + div_u64(env->verification_time, 1000)); + verbose(env, "stack depth "); + for (i = 0; i < env->subprog_cnt; i++) { + u32 depth = env->subprog_info[i].stack_depth; + + verbose(env, "%d", depth); + if (i + 1 < env->subprog_cnt) + verbose(env, "+"); + } + verbose(env, "\n"); + } + verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " + "total_states %d peak_states %d mark_read %d\n", + env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS, + env->max_states_per_insn, env->total_states, + env->peak_states, env->longest_mark_read_walk); +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, union bpf_attr __user *uattr) { + u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; struct bpf_verifier_log *log; int i, len, ret = -EINVAL; @@ -8139,7 +8164,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, ret = -EINVAL; /* log attributes have to be sane */ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || - !log->level || !log->ubuf) + !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) goto err_unlock; } @@ -8220,6 +8245,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret == 0) ret = fixup_call_args(env); + env->verification_time = ktime_get_ns() - start_time; + print_verification_stats(env); + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { -- Gitee From 92878735c86c0c533bd4bab3dcb6c7192e5adc86 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:41 -0700 Subject: [PATCH 02/21] bpf: improve verification speed by droping states ANBZ: #5530 commit 
9f4686c41bdff051f557accb531af79dd1773687 upstream. Branch instructions, branch targets and calls in a bpf program are the places where the verifier remembers states that led to successful verification of the program. These states are used to prune brute force program analysis. For unprivileged programs there is a limit of 64 states per such 'branching' instructions (maximum length is tracked by max_states_per_insn counter introduced in the previous patch). Simply reducing this threshold to 32 or lower increases insn_processed metric to the point that small valid programs get rejected. For root programs there is no limit and cilium programs can have max_states_per_insn to be 100 or higher. Walking 100+ states multiplied by number of 'branching' insns during verification consumes significant amount of cpu time. Turned out simple LRU-like mechanism can be used to remove states that unlikely will be helpful in future search pruning. This patch introduces hit_cnt and miss_cnt counters: hit_cnt - this many times this state successfully pruned the search miss_cnt - this many times this state was not equivalent to other states (and that other states were added to state list) The heuristic introduced in this patch is: if (sl->miss_cnt > sl->hit_cnt * 3 + 3) /* drop this state from future considerations */ Higher numbers increase max_states_per_insn (allow more states to be considered for pruning) and slow verification speed, but do not meaningfully reduce insn_processed metric. Lower numbers drop too many states and insn_processed increases too much. Many different formulas were considered. This one is simple and works well enough in practice. (the analysis was done on selftests/progs/* and on cilium programs) The end result is this heuristic improves verification speed by 10 times. Large synthetic programs that used to take a second more now take 1/10 of a second. In cases where max_states_per_insn used to be 100 or more, now it's ~10. There is a slight increase in insn_processed for cilium progs: before after bpf_lb-DLB_L3.o 1831 1838 bpf_lb-DLB_L4.o 3029 3218 bpf_lb-DUNKNOWN.o 1064 1064 bpf_lxc-DDROP_ALL.o 26309 26935 bpf_lxc-DUNKNOWN.o 33517 34439 bpf_netdev.o 9713 9721 bpf_overlay.o 6184 6184 bpf_lcx_jit.o 37335 39389 And 2-3 times improvement in the verification speed. Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/verifier.c | 44 +++++++++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 603320963729..859e3b6bf67e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -207,6 +207,7 @@ struct bpf_verifier_state { struct bpf_verifier_state_list { struct bpf_verifier_state state; struct bpf_verifier_state_list *next; + int miss_cnt, hit_cnt; }; /* Possible states for alu_state member. 
*/ @@ -281,6 +282,7 @@ struct bpf_verifier_env { bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ + struct bpf_verifier_state_list *free_list; struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f77c6c71d04b..6c2fc7e8a9c1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6387,11 +6387,13 @@ static int propagate_liveness(struct bpf_verifier_env *env, static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; - struct bpf_verifier_state_list *sl; + struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - sl = env->explored_states[insn_idx]; + pprev = &env->explored_states[insn_idx]; + sl = *pprev; + if (!sl) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here @@ -6402,6 +6404,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) while (sl != STATE_LIST_MARK) { if (states_equal(env, &sl->state, cur)) { + sl->hit_cnt++; /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. @@ -6417,8 +6420,35 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return err; return 1; } - sl = sl->next; states_cnt++; + sl->miss_cnt++; + /* heuristic to determine whether this state is beneficial + * to keep checking from state equivalence point of view. + * Higher numbers increase max_states_per_insn and verification time, + * but do not meaningfully decrease insn_processed. + */ + if (sl->miss_cnt > sl->hit_cnt * 3 + 3) { + /* the state is unlikely to be useful. Remove it to + * speed up verification + */ + *pprev = sl->next; + if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + free_verifier_state(&sl->state, false); + kfree(sl); + env->peak_states--; + } else { + /* cannot free this state, since parentage chain may + * walk it later. Add it for free_list instead to + * be freed at the end of verification + */ + sl->next = env->free_list; + env->free_list = sl; + } + sl = *pprev; + continue; + } + pprev = &sl->next; + sl = *pprev; } if (env->max_states_per_insn < states_cnt) @@ -8076,6 +8106,14 @@ static void free_states(struct bpf_verifier_env *env) struct bpf_verifier_state_list *sl, *sln; int i; + sl = env->free_list; + while (sl) { + sln = sl->next; + free_verifier_state(&sl->state, false); + kfree(sl); + sl = sln; + } + if (!env->explored_states) return; -- Gitee From 9da70b1bcad69559749cb534a6cf0452499cf96a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:42 -0700 Subject: [PATCH 03/21] bpf: improve verification speed by not remarking live_read ANBZ: #5530 commit 25af32dad8047d180e70e233c85b909dd6587cc5 upstream. With large verifier speed improvement brought by the previous patch mark_reg_read() becomes the hottest function during verification. On a typical program it consumes 40% of cpu. mark_reg_read() walks parentage chain of registers to mark parents as LIVE_READ. Once the register is marked there is no need to remark it again in the future. Hence stop walking the chain once first LIVE_READ is seen. 
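The shape of the change, as a standalone C sketch (simplified struct, hypothetical helper name, illustrative flag value; the real change is in the diff below):

    #define REG_LIVE_READ 0x2       /* illustrative value */

    struct reg {
            struct reg *parent;
            unsigned int live;
    };

    /* Mark every ancestor in the parentage chain as read, stopping at
     * the first ancestor that already carries the mark.  The chain
     * never changes, so everything above a marked node must already
     * be marked as well.
     */
    static int mark_read_chain(struct reg *state)
    {
            struct reg *parent = state->parent;
            int cnt = 0;

            while (parent) {
                    if (parent->live & REG_LIVE_READ)
                            break;  /* rest of the chain is done */
                    parent->live |= REG_LIVE_READ;
                    parent = parent->parent;
                    cnt++;
            }
            return cnt;     /* chain length actually walked */
    }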
This optimization drops mark_reg_read() time from 40% of cpu to <1% and overall 2x improvement of verification speed. For some programs the longest_mark_read_walk counter improves from ~500 to ~5 Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Reviewed-by: Edward Cree Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/bpf/verifier.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6c2fc7e8a9c1..b86d5fb6b0af 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1149,6 +1149,15 @@ static int mark_reg_read(struct bpf_verifier_env *env, parent->var_off.value, parent->off); return -EFAULT; } + if (parent->live & REG_LIVE_READ) + /* The parentage chain never changes and + * this parent was already marked as LIVE_READ. + * There is no need to keep walking the chain again and + * keep re-marking all parents as LIVE_READ. + * This case happens when the same register is read + * multiple times without writes into it in-between. + */ + break; /* ... then we depend on parent's value */ parent->live |= REG_LIVE_READ; state = parent; -- Gitee From ef2b5b322a4a50d2c9ff282fbaaa521b2055ab54 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:43 -0700 Subject: [PATCH 04/21] bpf: convert temp arrays to kvcalloc ANBZ: #5530 commit 71dde681a8cea1ccff2c7b3be83c043ab6b2a977 upstream. Temporary arrays used during program verification need to be vmalloc-ed to support large bpf programs. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/bpf/verifier.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b86d5fb6b0af..73a0cd1cae2b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5548,13 +5548,13 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; - insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; - insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_stack) { - kfree(insn_state); + kvfree(insn_state); return -ENOMEM; } @@ -5652,8 +5652,8 @@ static int check_cfg(struct bpf_verifier_env *env) ret = 0; /* cfg looks good */ err_free: - kfree(insn_state); - kfree(insn_stack); + kvfree(insn_state); + kvfree(insn_stack); return ret; } @@ -8138,7 +8138,7 @@ static void free_states(struct bpf_verifier_env *env) } } - kfree(env->explored_states); + kvfree(env->explored_states); } static void print_verification_stats(struct bpf_verifier_env *env) @@ -8234,7 +8234,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - env->explored_states = kcalloc(env->prog->len, + env->explored_states = kvcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; -- Gitee From 2a7132627751ebc5219deb93a28569bb19266bb4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:44 -0700 Subject: [PATCH 05/21] bpf: verbose jump offset overflow check ANBZ: #5530 commit 4f73379ec5c2891598aa715c6df7ac9afdc86fbf upstream. Larger programs may trigger 16-bit jump offset overflow check during instruction patching. Make this error verbose otherwise users cannot decipher error code without printks in the verifier. 
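Because a bare NULL return cannot carry a reason, the fix also converts bpf_patch_insn_single() to the kernel's ERR_PTR convention. A minimal kernel-style sketch of that pattern (hypothetical helper, not the verifier code itself):

    #include <linux/err.h>
    #include <linux/types.h>

    /* Encode the errno in the returned pointer so the caller can tell
     * the 16-bit jump offset overflow (-ERANGE) apart from -ENOMEM.
     */
    static void *patch_one(void *buf, bool off_overflows_s16)
    {
            if (off_overflows_s16)
                    return ERR_PTR(-ERANGE);
            if (!buf)
                    return ERR_PTR(-ENOMEM);
            return buf;
    }

    static int apply_patch(void *buf)
    {
            void *p = patch_one(buf, false);

            if (IS_ERR(p))
                    return PTR_ERR(p);      /* -ERANGE or -ENOMEM */
            return 0;
    }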
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/bpf/core.c | 11 ++++++----- kernel/bpf/verifier.c | 7 ++++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 594dd9fe6d51..9eb50de49662 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -438,6 +438,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; + int err; /* Since our patchlet doesn't expand the image, we're done. */ if (insn_delta == 0) { @@ -453,8 +454,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && - bpf_adj_branches(prog, off, off + 1, off + len, true)) - return NULL; + (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) + return ERR_PTR(err); /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as @@ -463,7 +464,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), GFP_USER); if (!prog_adj) - return NULL; + return ERR_PTR(-ENOMEM); prog_adj->len = insn_adj_cnt; @@ -1093,13 +1094,13 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) continue; tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); - if (!tmp) { + if (IS_ERR(tmp)) { /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. */ bpf_jit_prog_release_other(prog, clone); - return ERR_PTR(-ENOMEM); + return tmp; } clone = tmp; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 73a0cd1cae2b..f4c2048f7c32 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7170,8 +7170,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of struct bpf_prog *new_prog; new_prog = bpf_patch_insn_single(env->prog, off, patch, len); - if (!new_prog) + if (IS_ERR(new_prog)) { + if (PTR_ERR(new_prog) == -ERANGE) + verbose(env, + "insn %d cannot be patched due to 16-bit range\n", + env->insn_aux_data[off].orig_idx); return NULL; + } if (adjust_insn_aux_data(env, new_prog->len, off, len)) return NULL; adjust_subprog_starts(env, off, len); -- Gitee From 9f0ca071da98a3977d5e5d2ad090939b03b1e01a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:46 -0700 Subject: [PATCH 06/21] bpf: increase verifier log limit ANBZ: #5530 commit 7a9f5c65abcc9644b11738ca0815510cb5510eaf upstream. The existing 16Mbyte verifier log limit is not enough for log_level=2 even for small programs. Increase it to 1Gbyte. Note it's not a kernel memory limit. It's an amount of memory user space provides to store the verifier log. The kernel populates it 1k at a time. 
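As an illustrative userspace sketch (hypothetical wrapper; the 64MB size is an example, and the caller is assumed to have set attr->log_level, e.g. to 1|2|4), a loader can now hand the verifier a log buffer far above the old 16MB cap:

    #include <stdlib.h>
    #include <bpf/bpf.h>    /* tools/lib/bpf */

    int load_with_big_log(const struct bpf_load_program_attr *attr,
                          char **log_out)
    {
            size_t log_sz = 64UL * 1024 * 1024;     /* example: 64MB */
            char *log_buf = malloc(log_sz);
            int fd;

            if (!log_buf)
                    return -1;
            /* attr->log_level already set by the caller */
            fd = bpf_load_program_xattr(attr, log_buf, log_sz);
            *log_out = log_buf;     /* caller prints and frees the log */
            return fd;
    }

If the kernel still returns ENOSPC, the buffer can simply be doubled and the load retried, which is what libbpf is taught to do in a later patch of this series.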
Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f4c2048f7c32..6716b8ec149d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8215,7 +8215,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, ret = -EINVAL; /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) goto err_unlock; } -- Gitee From 0f2d339d82d38f646555bbc133a7319b3cd12de2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:47 -0700 Subject: [PATCH 07/21] libbpf: teach libbpf about log_level bit 2 ANBZ: #5530 commit da11b417583ece875f862d84578259a2ab27ad86 upstream. Allow bpf_prog_load_xattr() to specify log_level for program loading. Teach libbpf to accept log_level with bit 2 set. Increase default BPF_LOG_BUF_SIZE from 256k to 16M. There is no downside to increase it to a maximum allowed by old kernels. Existing 256k limit caused ENOSPC errors and users were not able to see verifier error which is printed at the end of the verifier log. If ENOSPC is hit, double the verifier log and try again to capture the verifier error. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/lib/bpf/bpf.c | 2 +- tools/lib/bpf/bpf.h | 2 +- tools/lib/bpf/libbpf.c | 16 ++++++++++++++-- tools/lib/bpf/libbpf.h | 1 + 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 78f5a42ccbbf..1fc7beabf658 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -225,7 +225,7 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, return -EINVAL; log_level = load_attr->log_level; - if (log_level > 2 || (log_level && !log_buf)) + if (log_level > (4 | 2 | 1) || (log_level && !log_buf)) return -EINVAL; name_len = load_attr->name ? 
strlen(load_attr->name) : 0; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 6dc1f418034f..151fc1710380 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -93,7 +93,7 @@ struct bpf_load_program_attr { #define MAPS_RELAX_COMPAT 0x01 /* Recommend log buffer size */ -#define BPF_LOG_BUF_SIZE (256 * 1024) +#define BPF_LOG_BUF_SIZE (16 * 1024 * 1024) /* verifier maximum in kernels <= 5.1 */ LIBBPF_API int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, char *log_buf, size_t log_buf_sz); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0db9ba284a68..e0ae11880944 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -152,6 +152,7 @@ struct bpf_program { }; } *reloc_desc; int nr_reloc; + int log_level; struct { int nr; @@ -1494,6 +1495,7 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, { struct bpf_load_program_attr load_attr; char *cp, errmsg[STRERR_BUFSIZE]; + int log_buf_size = BPF_LOG_BUF_SIZE; char *log_buf; int ret; @@ -1514,21 +1516,30 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, load_attr.line_info = prog->line_info; load_attr.line_info_rec_size = prog->line_info_rec_size; load_attr.line_info_cnt = prog->line_info_cnt; + load_attr.log_level = prog->log_level; if (!load_attr.insns || !load_attr.insns_cnt) return -EINVAL; - log_buf = malloc(BPF_LOG_BUF_SIZE); +retry_load: + log_buf = malloc(log_buf_size); if (!log_buf) pr_warning("Alloc log buffer for bpf loader error, continue without log\n"); - ret = bpf_load_program_xattr(&load_attr, log_buf, BPF_LOG_BUF_SIZE); + ret = bpf_load_program_xattr(&load_attr, log_buf, log_buf_size); if (ret >= 0) { + if (load_attr.log_level) + pr_debug("verifier log:\n%s", log_buf); *pfd = ret; ret = 0; goto out; } + if (errno == ENOSPC) { + log_buf_size <<= 1; + free(log_buf); + goto retry_load; + } ret = -LIBBPF_ERRNO__LOAD; cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); pr_warning("load bpf program failed: %s\n", cp); @@ -2941,6 +2952,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, bpf_program__set_expected_attach_type(prog, expected_attach_type); + prog->log_level = attr->log_level; if (!bpf_program__is_function_storage(prog, obj) && !first_prog) first_prog = prog; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index c70785cc8ef5..531323391d07 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -314,6 +314,7 @@ struct bpf_prog_load_attr { enum bpf_prog_type prog_type; enum bpf_attach_type expected_attach_type; int ifindex; + int log_level; }; LIBBPF_API int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, -- Gitee From 8e20d6d0ee4f565dd4483131b3e63328f96df481 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:48 -0700 Subject: [PATCH 08/21] selftests/bpf: add few verifier scale tests ANBZ: #5530 commit e5e7a8f2d858a91b79c4afc51a3f15edcbf9cb60 upstream. Add 3 basic tests that stress verifier scalability. test_verif_scale1.c calls non-inlined jhash() function 90 times on different position in the packet. This test simulates network packet parsing. jhash function is ~140 instructions and main program is ~1200 insns. test_verif_scale2.c force inlines jhash() function 90 times. This program is ~15k instructions long. test_verif_scale3.c calls non-inlined jhash() function 90 times on But this time jhash has to process 32-bytes from the packet instead of 14-bytes in tests 1 and 2. 
jhash function is ~230 insns and main program is ~1200 insns. $ test_progs -s can be used to see verifier stats. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- .../testing/selftests/bpf/progs/test_jhash.h | 70 +++++++++++++++++++ .../selftests/bpf/progs/test_verif_scale1.c | 30 ++++++++ .../selftests/bpf/progs/test_verif_scale2.c | 30 ++++++++ .../selftests/bpf/progs/test_verif_scale3.c | 30 ++++++++ tools/testing/selftests/bpf/test_progs.c | 54 +++++++++++++- 5 files changed, 213 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_jhash.h create mode 100644 tools/testing/selftests/bpf/progs/test_verif_scale1.c create mode 100644 tools/testing/selftests/bpf/progs/test_verif_scale2.c create mode 100644 tools/testing/selftests/bpf/progs/test_verif_scale3.c diff --git a/tools/testing/selftests/bpf/progs/test_jhash.h b/tools/testing/selftests/bpf/progs/test_jhash.h new file mode 100644 index 000000000000..3d12c11a8d47 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_jhash.h @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook + +typedef unsigned int u32; + +static __attribute__((always_inline)) u32 rol32(u32 word, unsigned int shift) +{ + return (word << shift) | (word >> ((-shift) & 31)); +} + +#define __jhash_mix(a, b, c) \ +{ \ + a -= c; a ^= rol32(c, 4); c += b; \ + b -= a; b ^= rol32(a, 6); a += c; \ + c -= b; c ^= rol32(b, 8); b += a; \ + a -= c; a ^= rol32(c, 16); c += b; \ + b -= a; b ^= rol32(a, 19); a += c; \ + c -= b; c ^= rol32(b, 4); b += a; \ +} + +#define __jhash_final(a, b, c) \ +{ \ + c ^= b; c -= rol32(b, 14); \ + a ^= c; a -= rol32(c, 11); \ + b ^= a; b -= rol32(a, 25); \ + c ^= b; c -= rol32(b, 16); \ + a ^= c; a -= rol32(c, 4); \ + b ^= a; b -= rol32(a, 14); \ + c ^= b; c -= rol32(b, 24); \ +} + +#define JHASH_INITVAL 0xdeadbeef + +static ATTR +u32 jhash(const void *key, u32 length, u32 initval) +{ + u32 a, b, c; + const unsigned char *k = key; + + a = b = c = JHASH_INITVAL + length + initval; + + while (length > 12) { + a += *(volatile u32 *)(k); + b += *(volatile u32 *)(k + 4); + c += *(volatile u32 *)(k + 8); + __jhash_mix(a, b, c); + length -= 12; + k += 12; + } + switch (length) { + case 12: c += (u32)k[11]<<24; + case 11: c += (u32)k[10]<<16; + case 10: c += (u32)k[9]<<8; + case 9: c += k[8]; + case 8: b += (u32)k[7]<<24; + case 7: b += (u32)k[6]<<16; + case 6: b += (u32)k[5]<<8; + case 5: b += k[4]; + case 4: a += (u32)k[3]<<24; + case 3: a += (u32)k[2]<<16; + case 2: a += (u32)k[1]<<8; + case 1: a += k[0]; + c ^= a; + __jhash_final(a, b, c); + case 0: /* Nothing left to add */ + break; + } + + return c; +} diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale1.c b/tools/testing/selftests/bpf/progs/test_verif_scale1.c new file mode 100644 index 000000000000..f3236ce35f31 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_verif_scale1.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook +#include +#include "bpf_helpers.h" +#define ATTR __attribute__((noinline)) +#include "test_jhash.h" + +SEC("scale90_noinline") +int balancer_ingress(struct __sk_buff *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + void *ptr; + int ret = 0, nh_off, i = 0; + + nh_off = 14; + + /* pragma unroll doesn't work on large loops */ + +#define C do { \ + ptr = data + i; \ + if (ptr + nh_off > data_end) \ + break; \ + ctx->tc_index = jhash(ptr, nh_off, ctx->cb[0] + 
i++); \ + } while (0); +#define C30 C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C; + C30;C30;C30; /* 90 calls */ + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale2.c b/tools/testing/selftests/bpf/progs/test_verif_scale2.c new file mode 100644 index 000000000000..77830693eccb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_verif_scale2.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook +#include +#include "bpf_helpers.h" +#define ATTR __attribute__((always_inline)) +#include "test_jhash.h" + +SEC("scale90_inline") +int balancer_ingress(struct __sk_buff *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + void *ptr; + int ret = 0, nh_off, i = 0; + + nh_off = 14; + + /* pragma unroll doesn't work on large loops */ + +#define C do { \ + ptr = data + i; \ + if (ptr + nh_off > data_end) \ + break; \ + ctx->tc_index = jhash(ptr, nh_off, ctx->cb[0] + i++); \ + } while (0); +#define C30 C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C; + C30;C30;C30; /* 90 calls */ + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale3.c b/tools/testing/selftests/bpf/progs/test_verif_scale3.c new file mode 100644 index 000000000000..1848da04ea41 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_verif_scale3.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook +#include +#include "bpf_helpers.h" +#define ATTR __attribute__((noinline)) +#include "test_jhash.h" + +SEC("scale90_noinline32") +int balancer_ingress(struct __sk_buff *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + void *ptr; + int ret = 0, nh_off, i = 0; + + nh_off = 32; + + /* pragma unroll doesn't work on large loops */ + +#define C do { \ + ptr = data + i; \ + if (ptr + nh_off > data_end) \ + break; \ + ctx->tc_index = jhash(ptr, nh_off, ctx->cb[0] + i++); \ + } while (0); +#define C30 C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C; + C30;C30;C30; /* 90 calls */ + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index b5a8706a1f36..12d57d3f9dce 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -46,6 +46,7 @@ typedef __u16 __sum16; static int error_cnt, pass_cnt; static bool jit_enabled; +bool verifier_stats = false; #define MAGIC_BYTES 123 @@ -2218,12 +2219,62 @@ static void test_signal_pending(enum bpf_prog_type prog_type) signal(SIGALRM, SIG_DFL); } -int main(void) +static int libbpf_debug_print_verifier_scale(enum libbpf_print_level level, + const char *format, va_list args) +{ + if (level != LIBBPF_DEBUG) + return 0; + + if (!strstr(format, "verifier log")) + return 0; + return vfprintf(stderr, "%s", args); +} + +static int check_load(const char *file) +{ + struct bpf_prog_load_attr attr; + struct bpf_object *obj; + int err, prog_fd; + + memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); + attr.file = file; + attr.prog_type = BPF_PROG_TYPE_SCHED_CLS; + attr.log_level = 4; + err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + bpf_object__close(obj); + if (err) + error_cnt++; + return err; +} + +void test_bpf_verif_scale(void) +{ + const char *file1 = "./test_verif_scale1.o"; + const char *file2 = "./test_verif_scale2.o"; + const char *file3 
= "./test_verif_scale3.o"; + int err; + + if (verifier_stats) + libbpf_set_print(libbpf_debug_print_verifier_scale); + + err = check_load(file1); + err |= check_load(file2); + err |= check_load(file3); + if (!err) + printf("test_verif_scale:OK\n"); + else + printf("test_verif_scale:FAIL\n"); +} + +int main(int ac, char **av) { srand(time(NULL)); jit_enabled = is_jit_enabled(); + if (ac == 2 && strcmp(av[1], "-s") == 0) + verifier_stats = true; + test_pkt_access(); test_prog_run_xattr(); test_xdp(); @@ -2250,6 +2301,7 @@ int main(void) test_map_lock(); test_signal_pending(BPF_PROG_TYPE_SOCKET_FILTER); test_signal_pending(BPF_PROG_TYPE_FLOW_DISSECTOR); + test_bpf_verif_scale(); printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS; -- Gitee From fad8f4335e726d7e5601564ad78355096135f6a0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:49 -0700 Subject: [PATCH 09/21] selftests/bpf: synthetic tests to push verifier limits ANBZ: #5530 commit 8aa2d4b4b92cd534d53353b0c2fb079572b97fdf upstream. Add a test to generate 1m ld_imm64 insns to stress the verifier. Bump the size of fill_ld_abs_vlan_push_pop test from 4k to 29k and jump_around_ld_abs from 4k to 5.5k. Larger sizes are not possible due to 16-bit offset encoding in jump instructions. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/test_verifier.c | 44 ++++++++++++++++----- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index ed2d803d5fdf..3630869a3653 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -49,6 +49,7 @@ #include "../../../include/linux/filter.h" #define MAX_INSNS BPF_MAXINSNS +#define MAX_TEST_INSNS 1000000 #define MAX_FIXUPS 8 #define MAX_NR_MAPS 14 #define MAX_TEST_RUNS 8 @@ -65,6 +66,7 @@ static int skips; struct bpf_test { const char *descr; struct bpf_insn insns[MAX_INSNS]; + struct bpf_insn *fill_insns; int fixup_map_hash_8b[MAX_FIXUPS]; int fixup_map_hash_48b[MAX_FIXUPS]; int fixup_map_hash_16b[MAX_FIXUPS]; @@ -82,6 +84,7 @@ struct bpf_test { const char *errstr; const char *errstr_unpriv; uint32_t retval, retval_unpriv, insn_processed; + int prog_len; enum { UNDEF, ACCEPT, @@ -118,10 +121,11 @@ struct other_val { static void bpf_fill_ld_abs_vlan_push_pop(struct bpf_test *self) { - /* test: {skb->data[0], vlan_push} x 68 + {skb->data[0], vlan_pop} x 68 */ + /* test: {skb->data[0], vlan_push} x 51 + {skb->data[0], vlan_pop} x 51 */ #define PUSH_CNT 51 - unsigned int len = BPF_MAXINSNS; - struct bpf_insn *insn = self->insns; + /* jump range is limited to 16 bit. PUSH_CNT of ld_abs needs room */ + unsigned int len = (1 << 15) - PUSH_CNT * 2 * 5 * 6; + struct bpf_insn *insn = self->fill_insns; int i = 0, j, k = 0; insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); @@ -155,12 +159,14 @@ static void bpf_fill_ld_abs_vlan_push_pop(struct bpf_test *self) for (; i < len - 1; i++) insn[i] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 0xbef); insn[len - 1] = BPF_EXIT_INSN(); + self->prog_len = len; } static void bpf_fill_jump_around_ld_abs(struct bpf_test *self) { - struct bpf_insn *insn = self->insns; - unsigned int len = BPF_MAXINSNS; + struct bpf_insn *insn = self->fill_insns; + /* jump range is limited to 16 bit. 
every ld_abs is replaced by 6 insns */ + unsigned int len = (1 << 15) / 6; int i = 0; insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); @@ -170,11 +176,12 @@ static void bpf_fill_jump_around_ld_abs(struct bpf_test *self) while (i < len - 1) insn[i++] = BPF_LD_ABS(BPF_B, 1); insn[i] = BPF_EXIT_INSN(); + self->prog_len = i + 1; } static void bpf_fill_rand_ld_dw(struct bpf_test *self) { - struct bpf_insn *insn = self->insns; + struct bpf_insn *insn = self->fill_insns; uint64_t res = 0; int i = 0; @@ -192,6 +199,7 @@ static void bpf_fill_rand_ld_dw(struct bpf_test *self) insn[i++] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 32); insn[i++] = BPF_ALU64_REG(BPF_XOR, BPF_REG_0, BPF_REG_1); insn[i] = BPF_EXIT_INSN(); + self->prog_len = i + 1; res ^= (res >> 32); self->retval = (uint32_t)res; } @@ -13441,6 +13449,15 @@ static struct bpf_test tests[] = { .result = ACCEPT, .retval = 5, }, + { + "ld_dw: xor semi-random 64 bit imms, test 5", + .insns = { }, + .data = { }, + .fill_helper = bpf_fill_rand_ld_dw, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1000000 - 6, + }, { "pass unmodified ctx pointer to helper", .insns = { @@ -16499,8 +16516,10 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_map_type prog_type, int *fixup_percpu_cgroup_storage = test->fixup_percpu_cgroup_storage; int *fixup_map_spin_lock = test->fixup_map_spin_lock; - if (test->fill_helper) + if (test->fill_helper) { + test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn)); test->fill_helper(test); + } /* Allocating HTs with 1 elem is fine here, since we only test * for verifier and not do a runtime lookup, so the only thing @@ -16696,12 +16715,17 @@ static void do_test_single(struct bpf_test *test, bool unpriv, prog_type = BPF_PROG_TYPE_SOCKET_FILTER; fixup_skips = skips; do_test_fixup(test, prog_type, prog, map_fds); + if (test->fill_insns) { + prog = test->fill_insns; + prog_len = test->prog_len; + } else { + prog_len = probe_filter_length(prog); + } /* If there were some map skips during fixup due to missing bpf * features, skip this test. */ if (fixup_skips != skips) return; - prog_len = probe_filter_length(prog); pflags = 0; if (test->flags & F_LOAD_WITH_STRICT_ALIGNMENT) @@ -16709,7 +16733,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, if (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS) pflags |= BPF_F_ANY_ALIGNMENT; fd_prog = bpf_verify_program(prog_type, prog, prog_len, pflags, - "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 1); + "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 4); if (fd_prog < 0 && !bpf_probe_prog_type(prog_type, 0)) { printf("SKIP (unsupported program type %d)\n", prog_type); skips++; @@ -16808,6 +16832,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv, goto fail_log; } close_fds: + if (test->fill_insns) + free(test->fill_insns); close(fd_prog); for (i = 0; i < MAX_NR_MAPS; i++) close(map_fds[i]); -- Gitee From 74fd46098802177c639d43a7c28940ea38cdb643 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 10 Apr 2019 08:54:16 +0200 Subject: [PATCH 10/21] libbpf: fix crash in XDP socket part with new larger BPF_LOG_BUF_SIZE ANBZ: #5530 commit 50bd645b3a21a374dbd0fa8273a5f4e98001fb05 upstream. In commit da11b417583e ("libbpf: teach libbpf about log_level bit 2"), the BPF_LOG_BUF_SIZE was increased to 16M. The XDP socket part of libbpf allocated the log_buf on the stack, but for the new 16M buffer size this is not going to work. Change the code so it uses a 16K buffer instead. 
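The bug class, as a short sketch mirroring the fix below (names follow the xsk code):

    /* After BPF_LOG_BUF_SIZE grew to 16MB, this overflows a typical
     * 8MB thread stack:
     *
     *         char bpf_log_buf[BPF_LOG_BUF_SIZE];    // 16MB: crash
     *
     * A small fixed-size buffer stays well within stack limits:
     */
    static const int log_buf_size = 16 * 1024;

    void load_prog(void)
    {
            char log_buf[log_buf_size];     /* 16KB, safe on the stack */

            log_buf[0] = '\0';
            /* ... pass log_buf/log_buf_size to bpf_load_program() ... */
    }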
Fixes: da11b417583e ("libbpf: teach libbpf about log_level bit 2") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/lib/bpf/xsk.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 1676651c35e6..a3d1a302bc9c 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -258,7 +258,8 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size, static int xsk_load_xdp_prog(struct xsk_socket *xsk) { - char bpf_log_buf[BPF_LOG_BUF_SIZE]; + static const int log_buf_size = 16 * 1024; + char log_buf[log_buf_size]; int err, prog_fd; /* This is the C-program: @@ -307,10 +308,10 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk) size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, prog, insns_cnt, - "LGPL-2.1 or BSD-2-Clause", 0, bpf_log_buf, - BPF_LOG_BUF_SIZE); + "LGPL-2.1 or BSD-2-Clause", 0, log_buf, + log_buf_size); if (prog_fd < 0) { - pr_warning("BPF log buffer:\n%s", bpf_log_buf); + pr_warning("BPF log buffer:\n%s", log_buf); return prog_fd; } -- Gitee From 013bb22ebd12b8f670454907dc6c6d7f23659ed5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Sat, 6 Apr 2019 22:37:34 -0700 Subject: [PATCH 11/21] libbpf: Ignore -Wformat-nonliteral warning ANBZ: #5530 commit ff466b58055f2d28d8ddc1388af312e87a693efe upstream. vsprintf() in __base_pr() uses nonliteral format string and it breaks compilation for those who provide corresponding extra CFLAGS, e.g.: https://github.com/libbpf/libbpf/issues/27 If libbpf is built with the flags from PR: libbpf.c:68:26: error: format string is not a string literal [-Werror,-Wformat-nonliteral] return vfprintf(stderr, format, args); ^~~~~~ 1 error generated. Ignore this warning since the use case in libbpf.c is legit. Signed-off-by: Andrey Ignatov Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/lib/bpf/libbpf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e0ae11880944..077d91f0ca62 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -52,6 +52,11 @@ #define BPF_FS_MAGIC 0xcafe4a11 #endif +/* vsprintf() in __base_pr() uses nonliteral format string. It may break + * compilation if user enables corresponding warning. Disable it explicitly. + */ +#pragma GCC diagnostic ignored "-Wformat-nonliteral" + #define __printf(a, b) __attribute__((format(printf, a, b))) static int __base_pr(enum libbpf_print_level level, const char *format, -- Gitee From fbbceaaf29fcf0c540d14e0e8c996e7b9a1f59b3 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 9 Apr 2019 11:49:09 -0700 Subject: [PATCH 12/21] bpf: support input __sk_buff context in BPF_PROG_TEST_RUN ANBZ: #5530 commit b0b9395d865e3060d97658fbc9ba3f77fecc8da1 upstream. Add new set of arguments to bpf_attr for BPF_PROG_TEST_RUN: * ctx_in/ctx_size_in - input context * ctx_out/ctx_size_out - output context The intended use case is to pass some meta data to the test runs that operate on skb (this has being brought up on recent LPC). For programs that use bpf_prog_test_run_skb, support __sk_buff input and output. Initially, from input __sk_buff, copy _only_ cb and priority into skb, all other non-zero fields are prohibited (with EINVAL). If the user has set ctx_out/ctx_size_out, copy the potentially modified __sk_buff back to the userspace. 
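Illustrative userspace usage (a sketch built on the bpf_prog_test_run_xattr extension added later in this series; field values are examples):

    #include <linux/bpf.h>
    #include <bpf/bpf.h>

    /* Run a BPF_PROG_TYPE_SCHED_CLS program once, seeding skb cb[]
     * and priority, and read the possibly-modified context back.
     */
    int test_run_with_ctx(int prog_fd, void *pkt, int pkt_len)
    {
            struct __sk_buff ctx_in = {0}, ctx_out = {0};
            struct bpf_prog_test_run_attr attr = {0};
            char data_out[1500];

            ctx_in.priority = 1;            /* allowed input field */
            ctx_in.cb[0] = 0x1234;          /* allowed input field */

            attr.prog_fd = prog_fd;
            attr.repeat = 1;
            attr.data_in = pkt;
            attr.data_size_in = pkt_len;
            attr.data_out = data_out;
            attr.data_size_out = sizeof(data_out);
            attr.ctx_in = &ctx_in;
            attr.ctx_size_in = sizeof(ctx_in);
            attr.ctx_out = &ctx_out;
            attr.ctx_size_out = sizeof(ctx_out);

            return bpf_prog_test_run_xattr(&attr);
    }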
We require all fields of input __sk_buff except the ones we explicitly support to be set to zero. The expectation is that in the future we might add support for more fields and we want to fail explicitly if the user runs the program on the kernel where we don't yet support them. The API is intentionally vague (i.e. we don't explicitly add __sk_buff to bpf_attr, but ctx_in) to potentially let other test_run types use this interface in the future (this can be xdp_md for xdp types for example). v4: * don't copy more than allowed in bpf_ctx_init [Martin] v3: * handle case where ctx_in is NULL, but ctx_out is not [Martin] * convert size==0 checks to ptr==NULL checks and add some extra ptr checks [Martin] v2: * Addressed comments from Martin Lau Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/uapi/linux/bpf.h | 7 ++ kernel/bpf/syscall.c | 10 ++- net/bpf/test_run.c | 143 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 151 insertions(+), 9 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ea1066927b6c..d69aa9e45e63 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -400,6 +400,13 @@ union bpf_attr { __aligned_u64 data_out; __u32 repeat; __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small. + */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1cc3d6865d75..8f936948e97a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1972,7 +1972,7 @@ static int bpf_prog_query(const union bpf_attr *attr, return cgroup_bpf_prog_query(attr, uattr); } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration +#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -1985,6 +1985,14 @@ static int bpf_prog_test_run(const union bpf_attr *attr, if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; + if ((attr->test.ctx_size_in && !attr->test.ctx_in) || + (!attr->test.ctx_size_in && attr->test.ctx_in)) + return -EINVAL; + + if ((attr->test.ctx_size_out && !attr->test.ctx_out) || + (!attr->test.ctx_size_out && attr->test.ctx_out)) + return -EINVAL; + prog = bpf_prog_get(attr->test.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index fab142b796ef..cbd4fb65aa4f 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -123,12 +123,126 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, return data; } +static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) +{ + void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); + void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + u32 size = kattr->test.ctx_size_in; + void *data; + int err; + + if (!data_in && !data_out) + return NULL; + + data = kzalloc(max_size, GFP_USER); + if (!data) + return ERR_PTR(-ENOMEM); + + if (data_in) { + err = bpf_check_uarg_tail_zero(data_in, max_size, size); + if (err) { + kfree(data); + return ERR_PTR(err); + } + + size = min_t(u32, max_size, size); + if (copy_from_user(data, data_in, size)) { + kfree(data); + return ERR_PTR(-EFAULT); + } + } + return data; +} + +static int bpf_ctx_finish(const union bpf_attr *kattr, + union bpf_attr __user *uattr, const void *data, + u32 size) +{ + 
void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + int err = -EFAULT; + u32 copy_size = size; + + if (!data || !data_out) + return 0; + + if (copy_size > kattr->test.ctx_size_out) { + copy_size = kattr->test.ctx_size_out; + err = -ENOSPC; + } + + if (copy_to_user(data_out, data, copy_size)) + goto out; + if (copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size))) + goto out; + if (err != -ENOSPC) + err = 0; +out: + return err; +} + +/** + * range_is_zero - test whether buffer is initialized + * @buf: buffer to check + * @from: check from this position + * @to: check up until (excluding) this position + * + * This function returns true if the there is a non-zero byte + * in the buf in the range [from,to). + */ +static inline bool range_is_zero(void *buf, size_t from, size_t to) +{ + return !memchr_inv((u8 *)buf + from, 0, to - from); +} + +static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return 0; + + /* make sure the fields we don't use are zeroed */ + if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, priority))) + return -EINVAL; + + /* priority is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) + + FIELD_SIZEOF(struct __sk_buff, priority), + offsetof(struct __sk_buff, cb))) + return -EINVAL; + + /* cb is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + + FIELD_SIZEOF(struct __sk_buff, cb), + sizeof(struct __sk_buff))) + return -EINVAL; + + skb->priority = __skb->priority; + memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); + + return 0; +} + +static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return; + + __skb->priority = skb->priority; + memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); +} + int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool is_l2 = false, is_direct_pkt_access = false; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; + struct __sk_buff *ctx = NULL; u32 retval, duration; int hh_len = ETH_HLEN; struct sk_buff *skb; @@ -141,6 +255,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(data)) return PTR_ERR(data); + ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); + if (IS_ERR(ctx)) { + kfree(data); + return PTR_ERR(ctx); + } + switch (prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: @@ -158,6 +278,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, sk = kzalloc(sizeof(struct sock), GFP_USER); if (!sk) { kfree(data); + kfree(ctx); return -ENOMEM; } sock_net_set(sk, current->nsproxy->net_ns); @@ -166,6 +287,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, skb = build_skb(data, 0); if (!skb) { kfree(data); + kfree(ctx); kfree(sk); return -ENOMEM; } @@ -180,32 +302,37 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); + ret = convert___skb_to_skb(skb, ctx); + if (ret) + goto out; ret = bpf_test_run(prog, skb, repeat, &retval, &duration); - if (ret) { - kfree_skb(skb); - kfree(sk); - return ret; - } + if (ret) + goto out; if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); if (pskb_expand_head(skb, nhead, 0, 
GFP_USER)) { - kfree_skb(skb); - kfree(sk); - return -ENOMEM; + ret = -ENOMEM; + goto out; } } memset(__skb_push(skb, hh_len), 0, hh_len); } + convert_skb_to___skb(skb, ctx); size = skb->len; /* bpf program can never convert linear skb to non-linear */ if (WARN_ON_ONCE(skb_is_nonlinear(skb))) size = skb_headlen(skb); ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, ctx, + sizeof(struct __sk_buff)); +out: kfree_skb(skb); kfree(sk); + kfree(ctx); return ret; } -- Gitee From c0f5d7f3604b1848412cbc683550ebff991939c9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 9 Apr 2019 11:49:10 -0700 Subject: [PATCH 13/21] libbpf: add support for ctx_{size, }_{in, out} in BPF_PROG_TEST_RUN ANBZ: #5530 commit 5e903c656b98614698a891c6e098186272cbba14 upstream. Support recently introduced input/output context for test runs. We extend only bpf_prog_test_run_xattr. bpf_prog_test_run is unextendable and left as is. Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/include/uapi/linux/bpf.h | 7 +++++++ tools/lib/bpf/bpf.c | 5 +++++ tools/lib/bpf/bpf.h | 5 +++++ 3 files changed, 17 insertions(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 361a22796976..7bb339198bd1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -398,6 +398,13 @@ union bpf_attr { __aligned_u64 data_out; __u32 repeat; __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small. + */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 1fc7beabf658..764ed5fa7da8 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -548,10 +548,15 @@ int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr) attr.test.data_out = ptr_to_u64(test_attr->data_out); attr.test.data_size_in = test_attr->data_size_in; attr.test.data_size_out = test_attr->data_size_out; + attr.test.ctx_in = ptr_to_u64(test_attr->ctx_in); + attr.test.ctx_out = ptr_to_u64(test_attr->ctx_out); + attr.test.ctx_size_in = test_attr->ctx_size_in; + attr.test.ctx_size_out = test_attr->ctx_size_out; attr.test.repeat = test_attr->repeat; ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr)); test_attr->data_size_out = attr.test.data_size_out; + test_attr->ctx_size_out = attr.test.ctx_size_out; test_attr->retval = attr.test.retval; test_attr->duration = attr.test.duration; return ret; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 151fc1710380..d42f97426634 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -136,6 +136,11 @@ struct bpf_prog_test_run_attr { * out: length of data_out */ __u32 retval; /* out: return code of the BPF program */ __u32 duration; /* out: average per repetition in ns */ + const void *ctx_in; /* optional */ + __u32 ctx_size_in; + void *ctx_out; /* optional */ + __u32 ctx_size_out; /* in: max length of ctx_out + * out: length of cxt_out */ }; LIBBPF_API int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr); -- Gitee From 7824d5c3b2339a5ce2aef0d6f548cdfdadf12ef9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 11 Apr 2019 09:12:02 -0700 Subject: [PATCH 14/21] bpf: fix missing bpf_check_uarg_tail_zero in BPF_PROG_TEST_RUN MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit c695865c5c9803f14eef2c99d8a49d9ad60a3383 upstream. Commit b0b9395d865e ("bpf: support input __sk_buff context in BPF_PROG_TEST_RUN") started using bpf_check_uarg_tail_zero in BPF_PROG_TEST_RUN. However, bpf_check_uarg_tail_zero is not defined for !CONFIG_BPF_SYSCALL: net/bpf/test_run.c: In function ‘bpf_ctx_init’: net/bpf/test_run.c:142:9: error: implicit declaration of function ‘bpf_check_uarg_tail_zero’ [-Werror=implicit-function-declaration] err = bpf_check_uarg_tail_zero(data_in, max_size, size); ^~~~~~~~~~~~~~~~~~~~~~~~ Let's not build net/bpf/test_run.c when CONFIG_BPF_SYSCALL is not set. Reported-by: kbuild test robot Fixes: b0b9395d865e ("bpf: support input __sk_buff context in BPF_PROG_TEST_RUN") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/linux/bpf.h | 37 +++++++++++++++++++++++++++++-------- net/bpf/Makefile | 2 +- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dafb8a5d17b7..cbe634602ec6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -460,14 +460,6 @@ typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type, u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); -int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, - union bpf_attr __user *uattr); -int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, - union bpf_attr __user *uattr); -int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, - const union bpf_attr *kattr, - union bpf_attr __user *uattr); - /* an array of programs to be executed under rcu_lock. * * Typical usage: @@ -663,6 +655,14 @@ static inline bool unprivileged_ebpf_enabled(void) return !sysctl_unprivileged_bpf_disabled; } +int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -775,6 +775,27 @@ static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, return ERR_PTR(-EOPNOTSUPP); } +static inline int bpf_prog_test_run_xdp(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + +static inline int bpf_prog_test_run_skb(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + +static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + static inline bool unprivileged_ebpf_enabled(void) { return false; diff --git a/net/bpf/Makefile b/net/bpf/Makefile index 27b2992a0692..b0ca361742e4 100644 --- a/net/bpf/Makefile +++ b/net/bpf/Makefile @@ -1 +1 @@ -obj-y := test_run.o +obj-$(CONFIG_BPF_SYSCALL) := test_run.o -- Gitee From 4902ca70bca6009f1eee4ed1dd09fba95880bc06 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 11 Apr 2019 15:47:07 -0700 Subject: [PATCH 15/21] bpf: explicitly prohibit ctx_{in, out} in non-skb BPF_PROG_TEST_RUN ANBZ: #5530 commit 947e8b595b82d3551750641445d0a97b8f29b536 upstream. 
This should allow us later to extend BPF_PROG_TEST_RUN for non-skb case and be sure that nobody is erroneously setting ctx_{in,out}. Fixes: b0b9395d865e ("bpf: support input __sk_buff context in BPF_PROG_TEST_RUN") Reported-by: Daniel Borkmann Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- net/bpf/test_run.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index cbd4fb65aa4f..2221573dacdb 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -347,6 +347,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, void *data; int ret; + if (kattr->test.ctx_in || kattr->test.ctx_out) + return -EINVAL; + data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0); if (IS_ERR(data)) return PTR_ERR(data); @@ -390,6 +393,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; + if (kattr->test.ctx_in || kattr->test.ctx_out) + return -EINVAL; + data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) -- Gitee From 5e568629ab4aa6a7e35176d3b9e9043e412dd87a Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 9 Apr 2019 15:06:40 +0100 Subject: [PATCH 16/21] selftests_bpf: extend test_tc_tunnel for UDP encap ANBZ: #5530 commit 166b5a7f2ca3803ab0a7bb33ac2300e616de2470 upstream. commit 868d523535c2 ("bpf: add bpf_skb_adjust_room encap flags") introduced support to bpf_skb_adjust_room for GSO-friendly GRE and UDP encapsulation and later introduced associated test_tc_tunnel tests. Here those tests are extended to cover UDP encapsulation also. Signed-off-by: Alan Maguire Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/config | 4 + .../selftests/bpf/progs/test_tc_tunnel.c | 138 ++++++++++++------ tools/testing/selftests/bpf/test_tc_tunnel.sh | 47 +++++- 3 files changed, 142 insertions(+), 47 deletions(-) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 13825ebc9472..3a156eddb652 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -20,3 +20,7 @@ CONFIG_CRYPTO_SHA256=m CONFIG_VXLAN=y CONFIG_GENEVE=y CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_FOU=m +CONFIG_NET_FOU_IP_TUNNELS=y +CONFIG_IPV6_FOU=m +CONFIG_IPV6_FOU_TUNNEL=m \ No newline at end of file diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 4cbbab2de5e6..762d01dc0e06 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -20,16 +21,27 @@ static const int cfg_port = 8000; -struct grev4hdr { - struct iphdr ip; +static const int cfg_udp_src = 20000; +static const int cfg_udp_dst = 5555; + +struct gre_hdr { __be16 flags; __be16 protocol; } __attribute__((packed)); -struct grev6hdr { +union l4hdr { + struct udphdr udp; + struct gre_hdr gre; +}; + +struct v4hdr { + struct iphdr ip; + union l4hdr l4hdr; +} __attribute__((packed)); + +struct v6hdr { struct ipv6hdr ip; - __be16 flags; - __be16 protocol; + union l4hdr l4hdr; } __attribute__((packed)); static __always_inline void set_ipv4_csum(struct iphdr *iph) @@ -47,10 +59,10 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) iph->check = ~((csum & 0xffff) + (csum >> 16)); } -static 
__always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) +static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto) { - struct grev4hdr h_outer; struct iphdr iph_inner; + struct v4hdr h_outer; struct tcphdr tcph; __u64 flags; int olen; @@ -70,12 +82,29 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; + olen = sizeof(h_outer.ip); + flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4; - if (with_gre) { + switch (encap_proto) { + case IPPROTO_GRE: flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE; - olen = sizeof(h_outer); - } else { - olen = sizeof(h_outer.ip); + olen += sizeof(h_outer.l4hdr.gre); + h_outer.l4hdr.gre.protocol = bpf_htons(ETH_P_IP); + h_outer.l4hdr.gre.flags = 0; + break; + case IPPROTO_UDP: + flags |= BPF_F_ADJ_ROOM_ENCAP_L4_UDP; + olen += sizeof(h_outer.l4hdr.udp); + h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src); + h_outer.l4hdr.udp.dest = __bpf_constant_htons(cfg_udp_dst); + h_outer.l4hdr.udp.check = 0; + h_outer.l4hdr.udp.len = bpf_htons(bpf_ntohs(iph_inner.tot_len) + + sizeof(h_outer.l4hdr.udp)); + break; + case IPPROTO_IPIP: + break; + default: + return TC_ACT_OK; } /* add room between mac and network header */ @@ -85,16 +114,10 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) /* prepare new outer network header */ h_outer.ip = iph_inner; h_outer.ip.tot_len = bpf_htons(olen + - bpf_htons(h_outer.ip.tot_len)); - if (with_gre) { - h_outer.ip.protocol = IPPROTO_GRE; - h_outer.protocol = bpf_htons(ETH_P_IP); - h_outer.flags = 0; - } else { - h_outer.ip.protocol = IPPROTO_IPIP; - } + bpf_ntohs(h_outer.ip.tot_len)); + h_outer.ip.protocol = encap_proto; - set_ipv4_csum((void *)&h_outer.ip); + set_ipv4_csum(&h_outer.ip); /* store new outer network header */ if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen, @@ -104,11 +127,12 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) return TC_ACT_OK; } -static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) +static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto) { struct ipv6hdr iph_inner; - struct grev6hdr h_outer; + struct v6hdr h_outer; struct tcphdr tcph; + __u16 tot_len; __u64 flags; int olen; @@ -124,15 +148,32 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; + olen = sizeof(h_outer.ip); + flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6; - if (with_gre) { + switch (encap_proto) { + case IPPROTO_GRE: flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE; - olen = sizeof(h_outer); - } else { - olen = sizeof(h_outer.ip); + olen += sizeof(h_outer.l4hdr.gre); + h_outer.l4hdr.gre.protocol = bpf_htons(ETH_P_IPV6); + h_outer.l4hdr.gre.flags = 0; + break; + case IPPROTO_UDP: + flags |= BPF_F_ADJ_ROOM_ENCAP_L4_UDP; + olen += sizeof(h_outer.l4hdr.udp); + h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src); + h_outer.l4hdr.udp.dest = __bpf_constant_htons(cfg_udp_dst); + tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) + + sizeof(h_outer.l4hdr.udp); + h_outer.l4hdr.udp.check = 0; + h_outer.l4hdr.udp.len = bpf_htons(tot_len); + break; + case IPPROTO_IPV6: + break; + default: + return TC_ACT_OK; } - /* add room between mac and network header */ if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags)) return TC_ACT_SHOT; @@ -141,13 +182,8 @@ static __always_inline int 
encap_ipv6(struct __sk_buff *skb, bool with_gre) h_outer.ip = iph_inner; h_outer.ip.payload_len = bpf_htons(olen + bpf_ntohs(h_outer.ip.payload_len)); - if (with_gre) { - h_outer.ip.nexthdr = IPPROTO_GRE; - h_outer.protocol = bpf_htons(ETH_P_IPV6); - h_outer.flags = 0; - } else { - h_outer.ip.nexthdr = IPPROTO_IPV6; - } + + h_outer.ip.nexthdr = encap_proto; /* store new outer network header */ if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen, @@ -161,7 +197,7 @@ SEC("encap_ipip") int __encap_ipip(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) - return encap_ipv4(skb, false); + return encap_ipv4(skb, IPPROTO_IPIP); else return TC_ACT_OK; } @@ -170,7 +206,16 @@ SEC("encap_gre") int __encap_gre(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) - return encap_ipv4(skb, true); + return encap_ipv4(skb, IPPROTO_GRE); + else + return TC_ACT_OK; +} + +SEC("encap_udp") +int __encap_udp(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return encap_ipv4(skb, IPPROTO_UDP); else return TC_ACT_OK; } @@ -179,7 +224,7 @@ SEC("encap_ip6tnl") int __encap_ip6tnl(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) - return encap_ipv6(skb, false); + return encap_ipv6(skb, IPPROTO_IPV6); else return TC_ACT_OK; } @@ -188,22 +233,33 @@ SEC("encap_ip6gre") int __encap_ip6gre(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) - return encap_ipv6(skb, true); + return encap_ipv6(skb, IPPROTO_GRE); + else + return TC_ACT_OK; +} + +SEC("encap_ip6udp") +int __encap_ip6udp(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) + return encap_ipv6(skb, IPPROTO_UDP); else return TC_ACT_OK; } static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) { - int olen; + int olen = len; switch (proto) { case IPPROTO_IPIP: case IPPROTO_IPV6: - olen = len; break; case IPPROTO_GRE: - olen = len + 4 /* gre hdr */; + olen += sizeof(struct gre_hdr); + break; + case IPPROTO_UDP: + olen += sizeof(struct udphdr); break; default: return TC_ACT_OK; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index c805adb88f3a..64f30910d39a 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -15,6 +15,9 @@ readonly ns2_v4=192.168.1.2 readonly ns1_v6=fd::1 readonly ns2_v6=fd::2 +# Must match port used by bpf program +readonly udpport=5555 + readonly infile="$(mktemp)" readonly outfile="$(mktemp)" @@ -38,8 +41,8 @@ setup() { # clamp route to reserve room for tunnel headers ip -netns "${ns1}" -4 route flush table main ip -netns "${ns1}" -6 route flush table main - ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1476 dev veth1 - ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1456 dev veth1 + ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1472 dev veth1 + ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1452 dev veth1 sleep 1 @@ -103,6 +106,18 @@ if [[ "$#" -eq "0" ]]; then echo "ip6 gre gso" $0 ipv6 ip6gre 2000 + echo "ip udp" + $0 ipv4 udp 100 + + echo "ip6 udp" + $0 ipv6 ip6udp 100 + + echo "ip udp gso" + $0 ipv4 udp 2000 + + echo "ip6 udp gso" + $0 ipv6 ip6udp 2000 + echo "OK. 
All tests passed" exit 0 fi @@ -117,12 +132,20 @@ case "$1" in "ipv4") readonly addr1="${ns1_v4}" readonly addr2="${ns2_v4}" - readonly netcat_opt=-4 + readonly ipproto=4 + readonly netcat_opt=-${ipproto} + readonly foumod=fou + readonly foutype=ipip + readonly fouproto=4 ;; "ipv6") readonly addr1="${ns1_v6}" readonly addr2="${ns2_v6}" - readonly netcat_opt=-6 + readonly ipproto=6 + readonly netcat_opt=-${ipproto} + readonly foumod=fou6 + readonly foutype=ip6tnl + readonly fouproto="41 -6" ;; *) echo "unknown arg: $1" @@ -155,11 +178,23 @@ echo "test bpf encap without decap (expect failure)" server_listen ! client_connect +if [[ "$tuntype" =~ "udp" ]]; then + # Set up fou tunnel. + ttype="${foutype}" + targs="encap fou encap-sport auto encap-dport $udpport" + # fou may be a module; allow this to fail. + modprobe "${foumod}" ||true + ip netns exec "${ns2}" ip fou add port 5555 ipproto ${fouproto} +else + ttype=$tuntype + targs="" +fi + # serverside, insert decap module # server is still running # client can connect again -ip netns exec "${ns2}" ip link add dev testtun0 type "${tuntype}" \ - remote "${addr1}" local "${addr2}" +ip netns exec "${ns2}" ip link add name testtun0 type "${ttype}" \ + remote "${addr1}" local "${addr2}" $targs # Because packets are decapped by the tunnel they arrive on testtun0 from # the IP stack perspective. Ensure reverse path filtering is disabled # otherwise we drop the TCP SYN as arriving on testtun0 instead of the -- Gitee From ebbc2102313c4f90e538f0e1c143efed633acbf7 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 9 Apr 2019 15:06:41 +0100 Subject: [PATCH 17/21] bpf: add layer 2 encap support to bpf_skb_adjust_room ANBZ: #5530 commit 58dfc900faff6db7eb9bf01555622e0b6c74c262 upstream. commit 868d523535c2 ("bpf: add bpf_skb_adjust_room encap flags") introduced support to bpf_skb_adjust_room for GSO-friendly GRE and UDP encapsulation. For GSO to work for skbs, the inner headers (mac and network) need to be marked. For L3 encapsulation using bpf_skb_adjust_room, the mac and network headers are identical. Here we provide a way of specifying the inner mac header length for cases where L2 encap is desired. Such an approach can support encapsulated ethernet headers, MPLS headers etc. For example to convert from a packet of form [eth][ip][tcp] to [eth][ip][udp][inner mac][ip][tcp], something like the following could be done: headroom = sizeof(iph) + sizeof(struct udphdr) + inner_maclen; ret = bpf_skb_adjust_room(skb, headroom, BPF_ADJ_ROOM_MAC, BPF_F_ADJ_ROOM_ENCAP_L4_UDP | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | BPF_F_ADJ_ROOM_ENCAP_L2(inner_maclen)); Signed-off-by: Alan Maguire Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/uapi/linux/bpf.h | 10 ++++++++++ net/core/filter.c | 12 ++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d69aa9e45e63..77d82bcfe747 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1511,6 +1511,10 @@ union bpf_attr { * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: * Use with ENCAP_L3 flags to further specify the tunnel type. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2(len) **: + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; **len** is the length of the inner MAC header. + * * A call to this helper is susceptible to change the underlaying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -2767,10 +2771,16 @@ enum bpf_func_id { /* BPF_FUNC_sysctl_get_name flags. */ #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 + #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { diff --git a/net/core/filter.c b/net/core/filter.c index 3e1613324362..d06c552a53db 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2975,11 +2975,14 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ - BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2( \ + BPF_ADJ_ROOM_ENCAP_L2_MASK)) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { + u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; u16 mac_len = 0, inner_net = 0, inner_trans = 0; unsigned int gso_type = SKB_GSO_DODGY; @@ -3014,6 +3017,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, mac_len = skb->network_header - skb->mac_header; inner_net = skb->network_header; + if (inner_mac_len > len_diff) + return -EINVAL; inner_trans = skb->transport_header; } @@ -3022,8 +3027,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, return ret; if (encap) { - /* inner mac == inner_net on l3 encap */ - skb->inner_mac_header = inner_net; + skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; skb_set_inner_protocol(skb, skb->protocol); @@ -3037,7 +3041,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, gso_type |= SKB_GSO_GRE; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) gso_type |= SKB_GSO_IPXIP6; - else + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) gso_type |= SKB_GSO_IPXIP4; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || -- Gitee From f3dbc2cb968e9dd3bc9ae46363f0015b9b1669db Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 9 Apr 2019 15:06:42 +0100 Subject: [PATCH 18/21] bpf: sync bpf.h to tools/ for BPF_F_ADJ_ROOM_ENCAP_L2 ANBZ: #5530 commit 1db04c300a41e17892bf83ed0d1aa681416ee150 upstream. Sync include/uapi/linux/bpf.h with tools/ equivalent to add BPF_F_ADJ_ROOM_ENCAP_L2(len) macro. Signed-off-by: Alan Maguire Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7bb339198bd1..efa51b93df9f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1509,6 +1509,10 @@ union bpf_attr { * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: * Use with ENCAP_L3 flags to further specify the tunnel type. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2(len) **: + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; **len** is the length of the inner MAC header. + * * A call to this helper is susceptible to change the underlaying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -2759,10 +2763,16 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. */ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 + #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* BPF_FUNC_sysctl_get_name flags. */ #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) -- Gitee From 918f53605e6ae26f0ffc3f035532c62cf68500f8 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 9 Apr 2019 15:06:43 +0100 Subject: [PATCH 19/21] selftests_bpf: add L2 encap to test_tc_tunnel ANBZ: #5530 commit 3ec61df82ba0c2d2455da838ee46bf60f2256b56 upstream. Update test_tc_tunnel to verify adding inner L2 header encapsulation (an MPLS label or ethernet header) works. Signed-off-by: Alan Maguire Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/config | 6 +- .../selftests/bpf/progs/test_tc_tunnel.c | 219 +++++++++++++++--- tools/testing/selftests/bpf/test_tc_tunnel.sh | 113 ++++++--- 3 files changed, 278 insertions(+), 60 deletions(-) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 3a156eddb652..9942a810abd6 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -23,4 +23,8 @@ CONFIG_NET_CLS_FLOWER=m CONFIG_NET_FOU=m CONFIG_NET_FOU_IP_TUNNELS=y CONFIG_IPV6_FOU=m -CONFIG_IPV6_FOU_TUNNEL=m \ No newline at end of file +CONFIG_IPV6_FOU_TUNNEL=m +CONFIG_MPLS=y +CONFIG_NET_MPLS_GSO=m +CONFIG_MPLS_ROUTING=m +CONFIG_MPLS_IPTUNNEL=m \ No newline at end of file diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 762d01dc0e06..f147eebe96da 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -22,7 +23,14 @@ static const int cfg_port = 8000; static const int cfg_udp_src = 20000; -static const int cfg_udp_dst = 5555; + +#define UDP_PORT 5555 +#define MPLS_OVER_UDP_PORT 6635 +#define ETH_OVER_UDP_PORT 7777 + +/* MPLS label 1000 with S bit (last label) set and ttl of 255. 
*/ +static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 | + MPLS_LS_S_MASK | 0xff); struct gre_hdr { __be16 flags; @@ -37,11 +45,13 @@ union l4hdr { struct v4hdr { struct iphdr ip; union l4hdr l4hdr; + __u8 pad[16]; /* enough space for L2 header */ } __attribute__((packed)); struct v6hdr { struct ipv6hdr ip; union l4hdr l4hdr; + __u8 pad[16]; /* enough space for L2 header */ } __attribute__((packed)); static __always_inline void set_ipv4_csum(struct iphdr *iph) @@ -59,13 +69,15 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) iph->check = ~((csum & 0xffff) + (csum >> 16)); } -static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto) +static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto) { + __u16 udp_dst = UDP_PORT; struct iphdr iph_inner; struct v4hdr h_outer; struct tcphdr tcph; + int olen, l2_len; __u64 flags; - int olen; if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, sizeof(iph_inner)) < 0) @@ -83,23 +95,38 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto) return TC_ACT_OK; olen = sizeof(h_outer.ip); + l2_len = 0; flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4; + + switch (l2_proto) { + case ETH_P_MPLS_UC: + l2_len = sizeof(mpls_label); + udp_dst = MPLS_OVER_UDP_PORT; + break; + case ETH_P_TEB: + l2_len = ETH_HLEN; + udp_dst = ETH_OVER_UDP_PORT; + break; + } + flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len); + switch (encap_proto) { case IPPROTO_GRE: flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE; olen += sizeof(h_outer.l4hdr.gre); - h_outer.l4hdr.gre.protocol = bpf_htons(ETH_P_IP); + h_outer.l4hdr.gre.protocol = bpf_htons(l2_proto); h_outer.l4hdr.gre.flags = 0; break; case IPPROTO_UDP: flags |= BPF_F_ADJ_ROOM_ENCAP_L4_UDP; olen += sizeof(h_outer.l4hdr.udp); h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src); - h_outer.l4hdr.udp.dest = __bpf_constant_htons(cfg_udp_dst); + h_outer.l4hdr.udp.dest = bpf_htons(udp_dst); h_outer.l4hdr.udp.check = 0; h_outer.l4hdr.udp.len = bpf_htons(bpf_ntohs(iph_inner.tot_len) + - sizeof(h_outer.l4hdr.udp)); + sizeof(h_outer.l4hdr.udp) + + l2_len); break; case IPPROTO_IPIP: break; @@ -107,6 +134,19 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto) return TC_ACT_OK; } + /* add L2 encap (if specified) */ + switch (l2_proto) { + case ETH_P_MPLS_UC: + *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; + break; + case ETH_P_TEB: + if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, + ETH_HLEN)) + return TC_ACT_SHOT; + break; + } + olen += l2_len; + /* add room between mac and network header */ if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags)) return TC_ACT_SHOT; @@ -127,14 +167,16 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto) return TC_ACT_OK; } -static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto) +static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto) { + __u16 udp_dst = UDP_PORT; struct ipv6hdr iph_inner; struct v6hdr h_outer; struct tcphdr tcph; + int olen, l2_len; __u16 tot_len; __u64 flags; - int olen; if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, sizeof(iph_inner)) < 0) @@ -149,20 +191,34 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto) return TC_ACT_OK; olen = sizeof(h_outer.ip); + l2_len = 0; flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6; + + switch (l2_proto) { + case ETH_P_MPLS_UC: + l2_len = sizeof(mpls_label); + 
udp_dst = MPLS_OVER_UDP_PORT; + break; + case ETH_P_TEB: + l2_len = ETH_HLEN; + udp_dst = ETH_OVER_UDP_PORT; + break; + } + flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len); + switch (encap_proto) { case IPPROTO_GRE: flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE; olen += sizeof(h_outer.l4hdr.gre); - h_outer.l4hdr.gre.protocol = bpf_htons(ETH_P_IPV6); + h_outer.l4hdr.gre.protocol = bpf_htons(l2_proto); h_outer.l4hdr.gre.flags = 0; break; case IPPROTO_UDP: flags |= BPF_F_ADJ_ROOM_ENCAP_L4_UDP; olen += sizeof(h_outer.l4hdr.udp); h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src); - h_outer.l4hdr.udp.dest = __bpf_constant_htons(cfg_udp_dst); + h_outer.l4hdr.udp.dest = bpf_htons(udp_dst); tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) + sizeof(h_outer.l4hdr.udp); h_outer.l4hdr.udp.check = 0; @@ -174,6 +230,19 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto) return TC_ACT_OK; } + /* add L2 encap (if specified) */ + switch (l2_proto) { + case ETH_P_MPLS_UC: + *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; + break; + case ETH_P_TEB: + if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, + ETH_HLEN)) + return TC_ACT_SHOT; + break; + } + olen += l2_len; + /* add room between mac and network header */ if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags)) return TC_ACT_SHOT; @@ -193,62 +262,136 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto) return TC_ACT_OK; } -SEC("encap_ipip") -int __encap_ipip(struct __sk_buff *skb) +SEC("encap_ipip_none") +int __encap_ipip_none(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return encap_ipv4(skb, IPPROTO_IPIP, ETH_P_IP); + else + return TC_ACT_OK; +} + +SEC("encap_gre_none") +int __encap_gre_none(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return encap_ipv4(skb, IPPROTO_GRE, ETH_P_IP); + else + return TC_ACT_OK; +} + +SEC("encap_gre_mpls") +int __encap_gre_mpls(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return encap_ipv4(skb, IPPROTO_GRE, ETH_P_MPLS_UC); + else + return TC_ACT_OK; +} + +SEC("encap_gre_eth") +int __encap_gre_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) - return encap_ipv4(skb, IPPROTO_IPIP); + return encap_ipv4(skb, IPPROTO_GRE, ETH_P_TEB); else return TC_ACT_OK; } -SEC("encap_gre") -int __encap_gre(struct __sk_buff *skb) +SEC("encap_udp_none") +int __encap_udp_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) - return encap_ipv4(skb, IPPROTO_GRE); + return encap_ipv4(skb, IPPROTO_UDP, ETH_P_IP); else return TC_ACT_OK; } -SEC("encap_udp") -int __encap_udp(struct __sk_buff *skb) +SEC("encap_udp_mpls") +int __encap_udp_mpls(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) - return encap_ipv4(skb, IPPROTO_UDP); + return encap_ipv4(skb, IPPROTO_UDP, ETH_P_MPLS_UC); + else + return TC_ACT_OK; +} + +SEC("encap_udp_eth") +int __encap_udp_eth(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return encap_ipv4(skb, IPPROTO_UDP, ETH_P_TEB); + else + return TC_ACT_OK; +} + +SEC("encap_ip6tnl_none") +int __encap_ip6tnl_none(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) + return encap_ipv6(skb, IPPROTO_IPV6, ETH_P_IPV6); + else + return TC_ACT_OK; +} + +SEC("encap_ip6gre_none") +int __encap_ip6gre_none(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) + return 
encap_ipv6(skb, IPPROTO_GRE, ETH_P_IPV6); + else + return TC_ACT_OK; +} + +SEC("encap_ip6gre_mpls") +int __encap_ip6gre_mpls(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) + return encap_ipv6(skb, IPPROTO_GRE, ETH_P_MPLS_UC); + else + return TC_ACT_OK; +} + +SEC("encap_ip6gre_eth") +int __encap_ip6gre_eth(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) + return encap_ipv6(skb, IPPROTO_GRE, ETH_P_TEB); else return TC_ACT_OK; } -SEC("encap_ip6tnl") -int __encap_ip6tnl(struct __sk_buff *skb) +SEC("encap_ip6udp_none") +int __encap_ip6udp_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) - return encap_ipv6(skb, IPPROTO_IPV6); + return encap_ipv6(skb, IPPROTO_UDP, ETH_P_IPV6); else return TC_ACT_OK; } -SEC("encap_ip6gre") -int __encap_ip6gre(struct __sk_buff *skb) +SEC("encap_ip6udp_mpls") +int __encap_ip6udp_mpls(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) - return encap_ipv6(skb, IPPROTO_GRE); + return encap_ipv6(skb, IPPROTO_UDP, ETH_P_MPLS_UC); else return TC_ACT_OK; } -SEC("encap_ip6udp") -int __encap_ip6udp(struct __sk_buff *skb) +SEC("encap_ip6udp_eth") +int __encap_ip6udp_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) - return encap_ipv6(skb, IPPROTO_UDP); + return encap_ipv6(skb, IPPROTO_UDP, ETH_P_TEB); else return TC_ACT_OK; } static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) { + struct gre_hdr greh; + struct udphdr udph; int olen = len; switch (proto) { @@ -257,9 +400,29 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) break; case IPPROTO_GRE: olen += sizeof(struct gre_hdr); + if (bpf_skb_load_bytes(skb, off + len, &greh, sizeof(greh)) < 0) + return TC_ACT_OK; + switch (bpf_ntohs(greh.protocol)) { + case ETH_P_MPLS_UC: + olen += sizeof(mpls_label); + break; + case ETH_P_TEB: + olen += ETH_HLEN; + break; + } break; case IPPROTO_UDP: olen += sizeof(struct udphdr); + if (bpf_skb_load_bytes(skb, off + len, &udph, sizeof(udph)) < 0) + return TC_ACT_OK; + switch (bpf_ntohs(udph.dest)) { + case MPLS_OVER_UDP_PORT: + olen += sizeof(mpls_label); + break; + case ETH_OVER_UDP_PORT: + olen += ETH_HLEN; + break; + } break; default: return TC_ACT_OK; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 64f30910d39a..d4d8d5d3b06e 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -17,6 +17,9 @@ readonly ns2_v6=fd::2 # Must match port used by bpf program readonly udpport=5555 +# MPLSoverUDP +readonly mplsudpport=6635 +readonly mplsproto=137 readonly infile="$(mktemp)" readonly outfile="$(mktemp)" @@ -41,8 +44,8 @@ setup() { # clamp route to reserve room for tunnel headers ip -netns "${ns1}" -4 route flush table main ip -netns "${ns1}" -6 route flush table main - ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1472 dev veth1 - ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1452 dev veth1 + ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1458 dev veth1 + ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1438 dev veth1 sleep 1 @@ -89,42 +92,44 @@ set -e # no arguments: automated test, run all if [[ "$#" -eq "0" ]]; then echo "ipip" - $0 ipv4 ipip 100 + $0 ipv4 ipip none 100 echo "ip6ip6" - $0 ipv6 ip6tnl 100 + $0 ipv6 ip6tnl none 100 - echo "ip gre" - $0 ipv4 gre 100 + for mac in none mpls eth ; do + echo "ip gre $mac" + $0 ipv4 gre $mac 100 - 
echo "ip6 gre" - $0 ipv6 ip6gre 100 + echo "ip6 gre $mac" + $0 ipv6 ip6gre $mac 100 - echo "ip gre gso" - $0 ipv4 gre 2000 + echo "ip gre $mac gso" + $0 ipv4 gre $mac 2000 - echo "ip6 gre gso" - $0 ipv6 ip6gre 2000 + echo "ip6 gre $mac gso" + $0 ipv6 ip6gre $mac 2000 - echo "ip udp" - $0 ipv4 udp 100 + echo "ip udp $mac" + $0 ipv4 udp $mac 100 - echo "ip6 udp" - $0 ipv6 ip6udp 100 + echo "ip6 udp $mac" + $0 ipv6 ip6udp $mac 100 - echo "ip udp gso" - $0 ipv4 udp 2000 + echo "ip udp $mac gso" + $0 ipv4 udp $mac 2000 - echo "ip6 udp gso" - $0 ipv6 ip6udp 2000 + echo "ip6 udp $mac gso" + $0 ipv6 ip6udp $mac 2000 + done echo "OK. All tests passed" exit 0 fi -if [[ "$#" -ne "3" ]]; then +if [[ "$#" -ne "4" ]]; then echo "Usage: $0" - echo " or: $0 " + echo " or: $0 " exit 1 fi @@ -137,6 +142,8 @@ case "$1" in readonly foumod=fou readonly foutype=ipip readonly fouproto=4 + readonly fouproto_mpls=${mplsproto} + readonly gretaptype=gretap ;; "ipv6") readonly addr1="${ns1_v6}" @@ -146,6 +153,8 @@ case "$1" in readonly foumod=fou6 readonly foutype=ip6tnl readonly fouproto="41 -6" + readonly fouproto_mpls="${mplsproto} -6" + readonly gretaptype=ip6gretap ;; *) echo "unknown arg: $1" @@ -154,9 +163,10 @@ case "$1" in esac readonly tuntype=$2 -readonly datalen=$3 +readonly mac=$3 +readonly datalen=$4 -echo "encap ${addr1} to ${addr2}, type ${tuntype}, len ${datalen}" +echo "encap ${addr1} to ${addr2}, type ${tuntype}, mac ${mac} len ${datalen}" trap cleanup EXIT @@ -173,7 +183,7 @@ verify_data ip netns exec "${ns1}" tc qdisc add dev veth1 clsact ip netns exec "${ns1}" tc filter add dev veth1 egress \ bpf direct-action object-file ./test_tc_tunnel.o \ - section "encap_${tuntype}" + section "encap_${tuntype}_${mac}" echo "test bpf encap without decap (expect failure)" server_listen ! client_connect @@ -184,7 +194,18 @@ if [[ "$tuntype" =~ "udp" ]]; then targs="encap fou encap-sport auto encap-dport $udpport" # fou may be a module; allow this to fail. modprobe "${foumod}" ||true - ip netns exec "${ns2}" ip fou add port 5555 ipproto ${fouproto} + if [[ "$mac" == "mpls" ]]; then + dport=${mplsudpport} + dproto=${fouproto_mpls} + tmode="mode any ttl 255" + else + dport=${udpport} + dproto=${fouproto} + fi + ip netns exec "${ns2}" ip fou add port $dport ipproto ${dproto} + targs="encap fou encap-sport auto encap-dport $dport" +elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then + ttype=$gretaptype else ttype=$tuntype targs="" @@ -194,7 +215,31 @@ fi # server is still running # client can connect again ip netns exec "${ns2}" ip link add name testtun0 type "${ttype}" \ - remote "${addr1}" local "${addr2}" $targs + ${tmode} remote "${addr1}" local "${addr2}" $targs + +expect_tun_fail=0 + +if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then + # No support for MPLS IPv6 fou tunnel; expect failure. + expect_tun_fail=1 +elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then + # No support for TEB fou tunnel; expect failure. + expect_tun_fail=1 +elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then + # Share ethernet address between tunnel/veth2 so L2 decap works. 
+ ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \ + awk '/ether/ { print $2 }') + ip netns exec "${ns2}" ip link set testtun0 address $ethaddr +elif [[ "$mac" == "mpls" ]]; then + modprobe mpls_iptunnel ||true + modprobe mpls_gso ||true + ip netns exec "${ns2}" sysctl -qw net.mpls.platform_labels=65536 + ip netns exec "${ns2}" ip -f mpls route add 1000 dev lo + ip netns exec "${ns2}" ip link set lo up + ip netns exec "${ns2}" sysctl -qw net.mpls.conf.testtun0.input=1 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.lo.rp_filter=0 +fi + # Because packets are decapped by the tunnel they arrive on testtun0 from # the IP stack perspective. Ensure reverse path filtering is disabled # otherwise we drop the TCP SYN as arriving on testtun0 instead of the @@ -204,16 +249,22 @@ ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0 # selected as the max of the "all" and device-specific values. ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.testtun0.rp_filter=0 ip netns exec "${ns2}" ip link set dev testtun0 up -echo "test bpf encap with tunnel device decap" -client_connect -verify_data +if [[ "$expect_tun_fail" == 1 ]]; then + # This tunnel mode is not supported, so we expect failure. + echo "test bpf encap with tunnel device decap (expect failure)" + ! client_connect +else + echo "test bpf encap with tunnel device decap" + client_connect + verify_data + server_listen +fi # serverside, use BPF for decap ip netns exec "${ns2}" ip link del dev testtun0 ip netns exec "${ns2}" tc qdisc add dev veth2 clsact ip netns exec "${ns2}" tc filter add dev veth2 ingress \ bpf direct-action object-file ./test_tc_tunnel.o section decap -server_listen echo "test bpf encap with bpf decap" client_connect verify_data -- Gitee From 170c0fd08882383eaef67b672bc58ecc56ced698 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 11 Apr 2019 15:53:16 -0700 Subject: [PATCH 20/21] selftests/bpf: bring back (void *) cast to set_ipv4_csum in test_tc_tunnel ANBZ: #5530 commit bcbccad694b7ef0de9a993ecd918231c10a1496a upstream. It was removed in commit 166b5a7f2ca3 ("selftests_bpf: extend test_tc_tunnel for UDP encap") without any explanation. Otherwise I see: progs/test_tc_tunnel.c:160:17: warning: taking address of packed member 'ip' of class or structure 'v4hdr' may result in an unaligned pointer value [-Waddress-of-packed-member] set_ipv4_csum(&h_outer.ip); ^~~~~~~~~~ 1 warning generated. 
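For context, here is a minimal standalone reproduction of this warning class; the struct and function names are invented for the illustration. Taking the address of a member of a packed struct and binding it to a typed pointer with stricter alignment trips clang's -Waddress-of-packed-member, while passing it through void * (which promises no alignment) does not — exactly what the restored cast relies on.

/*
 * Hedged sketch, not from the patch: 'outer'/'inner'/'touch' are made
 * up; the pattern mirrors set_ipv4_csum((void *)&h_outer.ip).
 */
#include <stdio.h>

struct inner {
	unsigned int word;
};

struct outer {
	unsigned char tag;	/* pushes 'in' to an unaligned offset */
	struct inner in;
} __attribute__((packed));

static void touch(struct inner *p)
{
	printf("%u\n", p->word);
}

int main(void)
{
	struct outer o = { .tag = 1, .in = { .word = 42 } };

	/* touch(&o.in);  -- this form draws the clang warning */
	touch((void *)&o.in);	/* cast through void * keeps clang quiet */
	return 0;
}

Note the cast only quiets the diagnostic; it does not make a genuinely unaligned dereference safe on strict-alignment targets, so it is appropriate here only because the access pattern itself is fine.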
Cc: Alan Maguire Cc: Willem de Bruijn Fixes: 166b5a7f2ca3 ("selftests_bpf: extend test_tc_tunnel for UDP encap") Signed-off-by: Stanislav Fomichev Acked-by: Song Liu Reviewed-by: Alan Maguire Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index f147eebe96da..3261ceb38337 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -157,7 +157,7 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, bpf_ntohs(h_outer.ip.tot_len)); h_outer.ip.protocol = encap_proto; - set_ipv4_csum(&h_outer.ip); + set_ipv4_csum((void *)&h_outer.ip); /* store new outer network header */ if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen, -- Gitee From 9812e73015d78b5b165229c8b0c5c2a1ac761a5d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Apr 2019 17:37:41 -0700 Subject: [PATCH 21/21] bpf, bpftool: fix a few ubsan warnings ANBZ: #5530 commit 69a0f9ecef22131982ba328e6b74ebb082bc0992 upstream. The issue is reported at https://github.com/libbpf/libbpf/issues/28. Basically, per C standard, for void *memcpy(void *dest, const void *src, size_t n) if "dest" or "src" is NULL, regardless of whether "n" is 0 or not, the result of memcpy is undefined. clang ubsan reported three such instances in bpf.c with the following pattern: memcpy(dest, 0, 0). Although in practice, no known compiler will cause issues when copy size is 0. Let us still fix the issue to silence ubsan warnings. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/lib/bpf/bpf.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 764ed5fa7da8..4e4c2ff857a7 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -81,7 +81,6 @@ static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size) int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr) { - __u32 name_len = create_attr->name ? strlen(create_attr->name) : 0; union bpf_attr attr; memset(&attr, '\0', sizeof(attr)); @@ -91,8 +90,9 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr) attr.value_size = create_attr->value_size; attr.max_entries = create_attr->max_entries; attr.map_flags = create_attr->map_flags; - memcpy(attr.map_name, create_attr->name, - min(name_len, BPF_OBJ_NAME_LEN - 1)); + if (create_attr->name) + memcpy(attr.map_name, create_attr->name, + min(strlen(create_attr->name), BPF_OBJ_NAME_LEN - 1)); attr.numa_node = create_attr->numa_node; attr.btf_fd = create_attr->btf_fd; attr.btf_key_type_id = create_attr->btf_key_type_id; @@ -157,7 +157,6 @@ int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, int key_size, int inner_map_fd, int max_entries, __u32 map_flags, int node) { - __u32 name_len = name ? 
strlen(name) : 0; union bpf_attr attr; memset(&attr, '\0', sizeof(attr)); @@ -168,7 +167,9 @@ int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, attr.inner_map_fd = inner_map_fd; attr.max_entries = max_entries; attr.map_flags = map_flags; - memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1)); + if (name) + memcpy(attr.map_name, name, + min(strlen(name), BPF_OBJ_NAME_LEN - 1)); if (node >= 0) { attr.map_flags |= BPF_F_NUMA_NODE; @@ -218,7 +219,6 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, void *finfo = NULL, *linfo = NULL; union bpf_attr attr; __u32 log_level; - __u32 name_len; int fd; if (!load_attr || !log_buf != !log_buf_sz) @@ -228,8 +228,6 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, if (log_level > (4 | 2 | 1) || (log_level && !log_buf)) return -EINVAL; - name_len = load_attr->name ? strlen(load_attr->name) : 0; - memset(&attr, 0, sizeof(attr)); attr.prog_type = load_attr->prog_type; attr.expected_attach_type = load_attr->expected_attach_type; @@ -255,8 +253,9 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, attr.line_info_rec_size = load_attr->line_info_rec_size; attr.line_info_cnt = load_attr->line_info_cnt; attr.line_info = ptr_to_u64(load_attr->line_info); - memcpy(attr.prog_name, load_attr->name, - min(name_len, BPF_OBJ_NAME_LEN - 1)); + if (load_attr->name) + memcpy(attr.prog_name, load_attr->name, + min(strlen(load_attr->name), BPF_OBJ_NAME_LEN - 1)); fd = sys_bpf_prog_load(&attr, sizeof(attr)); if (fd >= 0) -- Gitee
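To close, a hedged distillation of the pattern this last patch applies; the helper names below (set_obj_name, OBJ_NAME_LEN, min_sz) are invented for the illustration. C11 7.24.1p2 requires every pointer handed to memcpy() to be valid even when the length is zero, so the fix gates the copy on the optional name being non-NULL instead of ever reaching memcpy(dst, NULL, 0).

/*
 * Hedged sketch of the guard pattern, not part of the patch.
 */
#include <stddef.h>
#include <string.h>

#define OBJ_NAME_LEN 16U

static size_t min_sz(size_t a, size_t b)
{
	return a < b ? a : b;
}

static void set_obj_name(char dst[OBJ_NAME_LEN], const char *name)
{
	memset(dst, 0, OBJ_NAME_LEN);
	if (name)	/* guard: memcpy(dst, NULL, 0) would be UB */
		memcpy(dst, name,
		       min_sz(strlen(name), OBJ_NAME_LEN - 1));
}

As the commit message notes, no known compiler miscompiles the zero-length case in practice; the guard simply keeps ubsan quiet and the code strictly conforming.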