diff --git a/include/linux/filter.h b/include/linux/filter.h index 834aa6e011dc70aace9069263964ecd7c027ce96..da6ccd833b7534eb61ff5402e941b279e341a78b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -23,6 +23,7 @@ #include +#include #include #include @@ -777,7 +778,19 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) return size <= size_default && (size & (size - 1)) == 0; } -#define bpf_ctx_wide_store_ok(off, size, type, field) \ +static inline u8 +bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default) +{ + u8 access_off = off & (size_default - 1); + +#ifdef __LITTLE_ENDIAN + return access_off; +#else + return size_default - (access_off + size); +#endif +} + +#define bpf_ctx_wide_access_ok(off, size, type, field) \ (size == sizeof(__u64) && \ off >= offsetof(type, field) && \ off + sizeof(__u64) <= offsetofend(type, field) && \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ac787c72912985c8f8d41b25d5b7b9648e1af77a..57f96a592ad52eb5c3452cc0fa38cc8a9ed7c1ce 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3226,7 +3226,7 @@ struct bpf_sock_addr { __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __u32 user_port; /* Allows 4-byte read and write. @@ -3238,7 +3238,7 @@ struct bpf_sock_addr { __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __bpf_md_ptr(struct bpf_sock *, sk); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8782379d99f7c494116c18a7c0ea094c5e1cbb70..a310a71f82a85b2e2d49ea25155ebc40a9dcf29a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1073,11 +1073,18 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, !btf_type_is_var(size_type))) return NULL; - size = btf->resolved_sizes[size_type_id]; size_type_id = btf->resolved_ids[size_type_id]; size_type = btf_type_by_id(btf, size_type_id); if (btf_type_nosize_or_null(size_type)) return NULL; + else if (btf_type_has_size(size_type)) + size = size_type->size; + else if (btf_type_is_array(size_type)) + size = btf->resolved_sizes[size_type_id]; + else if (btf_type_is_ptr(size_type)) + size = sizeof(void *); + else + return NULL; } *type_id = size_type_id; @@ -1602,7 +1609,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, const struct btf_type *next_type; u32 next_type_id = t->type; struct btf *btf = env->btf; - u32 next_type_size = 0; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { @@ -1620,7 +1626,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, * save us a few type-following when we use it later (e.g. in * pretty print). 
*/ - if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + if (!btf_type_id_size(btf, &next_type_id, NULL)) { if (env_type_is_resolved(env, next_type_id)) next_type = btf_type_id_resolve(btf, &next_type_id); @@ -1633,7 +1639,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, } } - env_stack_pop_resolved(env, next_type_id, next_type_size); + env_stack_pop_resolved(env, next_type_id, 0); return 0; } @@ -1645,7 +1651,6 @@ static int btf_var_resolve(struct btf_verifier_env *env, const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; - u32 next_type_size; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { @@ -1675,12 +1680,12 @@ static int btf_var_resolve(struct btf_verifier_env *env, * forward types or similar that would resolve to size of * zero is allowed. */ - if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + if (!btf_type_id_size(btf, &next_type_id, NULL)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } - env_stack_pop_resolved(env, next_type_id, next_type_size); + env_stack_pop_resolved(env, next_type_id, 0); return 0; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8a3623733094d1e28550b165f30c8e75280286e4..122c6761758388bf0aef847023d4db420fffbccc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1336,6 +1336,7 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + u32 read_size; switch (si->off) { case offsetof(struct bpf_sysctl, write): @@ -1367,7 +1368,9 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, ppos)); *insn++ = BPF_STX_MEM( - BPF_SIZEOF(u32), treg, si->src_reg, 0); + BPF_SIZEOF(u32), treg, si->src_reg, + bpf_ctx_narrow_access_offset( + 0, sizeof(u32), sizeof(loff_t))); *insn++ = BPF_LDX_MEM( BPF_DW, treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, tmp_reg)); @@ -1376,8 +1379,11 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), si->dst_reg, si->src_reg, offsetof(struct bpf_sysctl_kern, ppos)); + read_size = bpf_size_to_bytes(BPF_SIZE(si->code)); *insn++ = BPF_LDX_MEM( - BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); + BPF_SIZE(si->code), si->dst_reg, si->dst_reg, + bpf_ctx_narrow_access_offset( + 0, read_size, sizeof(loff_t))); } *target_size = sizeof(u32); break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ebd14096aaad895145980ba72150c692e5dd9266..709c7a350a1d95417bd93d4d7048c51a307c6569 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6304,11 +6304,13 @@ static int check_return_code(struct bpf_verifier_env *env) if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) range = tnum_range(1, 1); + break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { range = tnum_range(0, 3); enforce_attach_type_range = tnum_range(2, 3); } + break; case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: @@ -8821,8 +8823,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (is_narrower_load && size < target_size) { - u8 shift = (off & (size_default - 1)) * 8; - + u8 shift = bpf_ctx_narrow_access_offset( + off, size, size_default) * 8; if (ctx_field_size <= 4) { if (shift) 
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, diff --git a/net/core/filter.c b/net/core/filter.c index 289fca1671d02773783f1a8948abaab13fae7151..84ea1683e75153b663c04f21bb3399edae071391 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6838,20 +6838,30 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): - /* Only narrow read access allowed for now. */ if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + user_ip6)) + return true; + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) + return true; + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { - if (bpf_ctx_wide_store_ok(off, size, - struct bpf_sock_addr, - user_ip6)) + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + user_ip6)) return true; - if (bpf_ctx_wide_store_ok(off, size, - struct bpf_sock_addr, - msg_src_ip6)) + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) return true; if (size != size_default) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 1df637f85f9436dbac1662274f691dd7fefd67cd..7a374b3c851d0577afa9cf725ac6a7085187e9e4 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -29,6 +29,7 @@ PROG COMMANDS | **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] | **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] | **bpftool** **prog tracelog** +| **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] | **bpftool** **prog help** | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } @@ -146,6 +147,39 @@ DESCRIPTION streaming data from BPF programs to user space, one can use perf events (see also **bpftool-map**\ (8)). + **bpftool prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] + Run BPF program *PROG* in the kernel testing infrastructure + for BPF, meaning that the program works on the data and + context provided by the user, and not on actual packets or + monitored functions etc. Return value and duration for the + test run are printed out to the console. + + Input data is read from the *FILE* passed with **data_in**. + If this *FILE* is "**-**", input data is read from standard + input. Input context, if any, is read from *FILE* passed with + **ctx_in**. Again, "**-**" can be used to read from standard + input, but only if standard input is not already in use for + input data. If a *FILE* is passed with **data_out**, output + data is written to that file. Similarly, output context is + written to the *FILE* passed with **ctx_out**. For both + output flows, "**-**" can be used to print to the standard + output (as plain text, or JSON if relevant option was + passed). If output keywords are omitted, output data and + context are discarded. Keywords **data_size_out** and + **ctx_size_out** are used to pass the size (in bytes) for the + output buffers to the kernel, although the default of 32 kB + should be more than enough for most cases. 
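
Under the hood, **bpftool prog run** drives the kernel's BPF_PROG_TEST_RUN machinery through libbpf's bpf_prog_test_run_xattr(), which the new do_run() further down in this patch calls. A minimal sketch of the equivalent library call, assuming an already-loaded program FD whose type supports test runs (the function and buffer names here are illustrative):

```c
#include <bpf.h>	/* libbpf; installed headers may live at <bpf/bpf.h> */
#include <stdio.h>

static int run_prog_once(int prog_fd, void *pkt, __u32 pkt_len,
			 void *out, __u32 out_len)
{
	struct bpf_prog_test_run_attr attr = {
		.prog_fd	= prog_fd,
		.repeat		= 1,
		.data_in	= pkt,
		.data_size_in	= pkt_len,
		.data_out	= out,
		/* in: capacity of "out"; out: bytes actually written */
		.data_size_out	= out_len,
	};
	int err = bpf_prog_test_run_xattr(&attr);

	if (err)
		return err;
	printf("retval=%u duration=%uns data_out=%u bytes\n",
	       attr.retval, attr.duration, attr.data_size_out);
	return 0;
}
```
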
+ + Keyword **repeat** is used to indicate the number of + consecutive runs to perform. Note that output data and + context printed to files correspond to the last of those + runs. The duration printed out at the end of the runs is an + average over all runs performed by the command. + + Not all program types support test run. Among those which do, + not all of them can take the **ctx_in**/**ctx_out** + arguments. bpftool does not perform checks on program types. + **bpftool prog help** Print short help message. diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 4ad1f0894d5312ef81e162cc5ecd34f114405725..453a9e1e8549e219682379df5875efdcf001b411 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -40,9 +40,8 @@ CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-ini CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \ -I$(srctree)/kernel/bpf/ \ -I$(srctree)/tools/include \ - -I$(srctree)/tools/include/uapi \ -I$(srctree)/tools/lib/bpf \ - -I$(srctree)/tools/perf + -I$(srctree)/tools/include/uapi CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' ifneq ($(EXTRA_CFLAGS),) CFLAGS += $(EXTRA_CFLAGS) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index ba37095e1f62e64b2ebcb7d7c51c6ad8d48d22a2..c8f42e1fcbc994d859ede556b1a6a5a4e4e75e1b 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -342,6 +342,13 @@ _bpftool() load|loadall) local obj + # Propose "load/loadall" to complete "bpftool prog load", + # or bash tries to complete "load" as a filename below. + if [[ ${#words[@]} -eq 3 ]]; then + COMPREPLY=( $( compgen -W "load loadall" -- "$cur" ) ) + return 0 + fi + if [[ ${#words[@]} -lt 6 ]]; then _filedir return 0 @@ -408,10 +415,34 @@ _bpftool() tracelog) return 0 ;; + run) + if [[ ${#words[@]} -lt 5 ]]; then + _filedir + return 0 + fi + case $prev in + id) + _bpftool_get_prog_ids + return 0 + ;; + data_in|data_out|ctx_in|ctx_out) + _filedir + return 0 + ;; + repeat|data_size_out|ctx_size_out) + return 0 + ;; + *) + _bpftool_once_attr 'data_in data_out data_size_out \ + ctx_in ctx_out ctx_size_out repeat' + return 0 + ;; + esac + ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'dump help pin attach detach load \ - show list tracelog' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'dump help pin attach detach \ + load loadall show list tracelog run' -- "$cur" ) ) ;; esac ;; diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index d26c498b376c3666984acdb3e9198173610c07ce..68babde3cee6eddb719198003c0aa260b4a24787 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -117,6 +117,35 @@ bool is_prefix(const char *pfx, const char *str) return !memcmp(str, pfx, strlen(pfx)); } +/* Last argument MUST be NULL pointer */ +int detect_common_prefix(const char *arg, ...) 
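+/* e.g. "data_" is a prefix of data_in, data_out and data_size_out at
+ * once, so do_run() must reject it as ambiguous rather than letting
+ * is_prefix() match whichever keyword happens to be tested first.
+ */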
+{ + unsigned int count = 0; + const char *ref; + char msg[256]; + va_list ap; + + snprintf(msg, sizeof(msg), "ambiguous prefix: '%s' could be '", arg); + va_start(ap, arg); + while ((ref = va_arg(ap, const char *))) { + if (!is_prefix(arg, ref)) + continue; + count++; + if (count > 1) + strncat(msg, "' or '", sizeof(msg) - strlen(msg) - 1); + strncat(msg, ref, sizeof(msg) - strlen(msg) - 1); + } + va_end(ap); + strncat(msg, "'", sizeof(msg) - strlen(msg) - 1); + + if (count >= 2) { + p_err("%s", msg); + return -1; + } + + return 0; +} + void fprint_hex(FILE *f, void *arg, unsigned int n, const char *sep) { unsigned char *data = arg; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 673869822c114ca842fc2cafde9e704f24d95ea4..cf76a6767d5340f1c387acd55f50eaf687f50f0e 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -102,6 +102,7 @@ void p_err(const char *fmt, ...); void p_info(const char *fmt, ...); bool is_prefix(const char *pfx, const char *str); +int detect_common_prefix(const char *arg, ...); void fprint_hex(FILE *f, void *arg, unsigned int n, const char *sep); void usage(void) __noreturn; diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c index 0507dfaf7a8f93f5c0e2daf8de837e1163dff203..48924d4939cd6ad6b1a9eb8a6595fe14fe20997f 100644 --- a/tools/bpf/bpftool/map_perf_ring.c +++ b/tools/bpf/bpftool/map_perf_ring.c @@ -22,13 +22,12 @@ #include #include -#include #include "main.h" #define MMAP_PAGE_CNT 16 -static bool stop; +static volatile bool stop; struct event_ring_info { int fd; @@ -44,32 +43,44 @@ struct perf_event_sample { unsigned char data[]; }; +struct perf_event_lost { + struct perf_event_header header; + __u64 id; + __u64 lost; +}; + static void int_exit(int signo) { fprintf(stderr, "Stopping...\n"); stop = true; } +struct event_pipe_ctx { + bool all_cpus; + int cpu; + int idx; +}; + static enum bpf_perf_event_ret -print_bpf_output(struct perf_event_header *event, void *private_data) +print_bpf_output(void *private_data, int cpu, struct perf_event_header *event) { - struct perf_event_sample *e = container_of(event, struct perf_event_sample, + struct perf_event_sample *e = container_of(event, + struct perf_event_sample, header); - struct event_ring_info *ring = private_data; - struct { - struct perf_event_header header; - __u64 id; - __u64 lost; - } *lost = (typeof(lost))event; + struct perf_event_lost *lost = container_of(event, + struct perf_event_lost, + header); + struct event_pipe_ctx *ctx = private_data; + int idx = ctx->all_cpus ? 
cpu : ctx->idx; if (json_output) { jsonw_start_object(json_wtr); jsonw_name(json_wtr, "type"); jsonw_uint(json_wtr, e->header.type); jsonw_name(json_wtr, "cpu"); - jsonw_uint(json_wtr, ring->cpu); + jsonw_uint(json_wtr, cpu); jsonw_name(json_wtr, "index"); - jsonw_uint(json_wtr, ring->key); + jsonw_uint(json_wtr, idx); if (e->header.type == PERF_RECORD_SAMPLE) { jsonw_name(json_wtr, "timestamp"); jsonw_uint(json_wtr, e->time); @@ -89,7 +100,7 @@ print_bpf_output(struct perf_event_header *event, void *private_data) if (e->header.type == PERF_RECORD_SAMPLE) { printf("== @%lld.%09lld CPU: %d index: %d =====\n", e->time / 1000000000ULL, e->time % 1000000000ULL, - ring->cpu, ring->key); + cpu, idx); fprint_hex(stdout, e->data, e->size, " "); printf("\n"); } else if (e->header.type == PERF_RECORD_LOST) { @@ -103,87 +114,25 @@ print_bpf_output(struct perf_event_header *event, void *private_data) return LIBBPF_PERF_EVENT_CONT; } -static void -perf_event_read(struct event_ring_info *ring, void **buf, size_t *buf_len) -{ - enum bpf_perf_event_ret ret; - - ret = bpf_perf_event_read_simple(ring->mem, - MMAP_PAGE_CNT * get_page_size(), - get_page_size(), buf, buf_len, - print_bpf_output, ring); - if (ret != LIBBPF_PERF_EVENT_CONT) { - fprintf(stderr, "perf read loop failed with %d\n", ret); - stop = true; - } -} - -static int perf_mmap_size(void) -{ - return get_page_size() * (MMAP_PAGE_CNT + 1); -} - -static void *perf_event_mmap(int fd) -{ - int mmap_size = perf_mmap_size(); - void *base; - - base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (base == MAP_FAILED) { - p_err("event mmap failed: %s\n", strerror(errno)); - return NULL; - } - - return base; -} - -static void perf_event_unmap(void *mem) -{ - if (munmap(mem, perf_mmap_size())) - fprintf(stderr, "Can't unmap ring memory!\n"); -} - -static int bpf_perf_event_open(int map_fd, int key, int cpu) +int do_event_pipe(int argc, char **argv) { - struct perf_event_attr attr = { + struct perf_event_attr perf_attr = { .sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_TIME, .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_BPF_OUTPUT, + .sample_period = 1, + .wakeup_events = 1, }; - int pmu_fd; - - pmu_fd = sys_perf_event_open(&attr, -1, cpu, -1, 0); - if (pmu_fd < 0) { - p_err("failed to open perf event %d for CPU %d", key, cpu); - return -1; - } - - if (bpf_map_update_elem(map_fd, &key, &pmu_fd, BPF_ANY)) { - p_err("failed to update map for event %d for CPU %d", key, cpu); - goto err_close; - } - if (ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) { - p_err("failed to enable event %d for CPU %d", key, cpu); - goto err_close; - } - - return pmu_fd; - -err_close: - close(pmu_fd); - return -1; -} - -int do_event_pipe(int argc, char **argv) -{ - int i, nfds, map_fd, index = -1, cpu = -1; struct bpf_map_info map_info = {}; - struct event_ring_info *rings; - size_t tmp_buf_sz = 0; - void *tmp_buf = NULL; - struct pollfd *pfds; + struct perf_buffer_raw_opts opts = {}; + struct event_pipe_ctx ctx = { + .all_cpus = true, + .cpu = -1, + .idx = -1, + }; + struct perf_buffer *pb; __u32 map_info_len; - bool do_all = true; + int err, map_fd; map_info_len = sizeof(map_info); map_fd = map_parse_fd_and_info(&argc, &argv, &map_info, &map_info_len); @@ -205,7 +154,7 @@ int do_event_pipe(int argc, char **argv) char *endptr; NEXT_ARG(); - cpu = strtoul(*argv, &endptr, 0); + ctx.cpu = strtoul(*argv, &endptr, 0); if (*endptr) { p_err("can't parse %s as CPU ID", **argv); goto err_close_map; @@ -216,7 +165,7 @@ int do_event_pipe(int argc, char **argv) char 
*endptr; NEXT_ARG(); - index = strtoul(*argv, &endptr, 0); + ctx.idx = strtoul(*argv, &endptr, 0); if (*endptr) { p_err("can't parse %s as index", **argv); goto err_close_map; @@ -228,45 +177,32 @@ int do_event_pipe(int argc, char **argv) goto err_close_map; } - do_all = false; + ctx.all_cpus = false; } - if (!do_all) { - if (index == -1 || cpu == -1) { + if (!ctx.all_cpus) { + if (ctx.idx == -1 || ctx.cpu == -1) { p_err("cpu and index must be specified together"); goto err_close_map; } - - nfds = 1; } else { - nfds = min(get_possible_cpus(), map_info.max_entries); - cpu = 0; - index = 0; + ctx.cpu = 0; + ctx.idx = 0; } - rings = calloc(nfds, sizeof(rings[0])); - if (!rings) + opts.attr = &perf_attr; + opts.event_cb = print_bpf_output; + opts.ctx = &ctx; + opts.cpu_cnt = ctx.all_cpus ? 0 : 1; + opts.cpus = &ctx.cpu; + opts.map_keys = &ctx.idx; + + pb = perf_buffer__new_raw(map_fd, MMAP_PAGE_CNT, &opts); + err = libbpf_get_error(pb); + if (err) { + p_err("failed to create perf buffer: %s (%d)", + strerror(err), err); goto err_close_map; - - pfds = calloc(nfds, sizeof(pfds[0])); - if (!pfds) - goto err_free_rings; - - for (i = 0; i < nfds; i++) { - rings[i].cpu = cpu + i; - rings[i].key = index + i; - - rings[i].fd = bpf_perf_event_open(map_fd, rings[i].key, - rings[i].cpu); - if (rings[i].fd < 0) - goto err_close_fds_prev; - - rings[i].mem = perf_event_mmap(rings[i].fd); - if (!rings[i].mem) - goto err_close_fds_current; - - pfds[i].fd = rings[i].fd; - pfds[i].events = POLLIN; } signal(SIGINT, int_exit); @@ -277,34 +213,24 @@ int do_event_pipe(int argc, char **argv) jsonw_start_array(json_wtr); while (!stop) { - poll(pfds, nfds, 200); - for (i = 0; i < nfds; i++) - perf_event_read(&rings[i], &tmp_buf, &tmp_buf_sz); + err = perf_buffer__poll(pb, 200); + if (err < 0 && err != -EINTR) { + p_err("perf buffer polling failed: %s (%d)", + strerror(err), err); + goto err_close_pb; + } } - free(tmp_buf); if (json_output) jsonw_end_array(json_wtr); - for (i = 0; i < nfds; i++) { - perf_event_unmap(rings[i].mem); - close(rings[i].fd); - } - free(pfds); - free(rings); + perf_buffer__free(pb); close(map_fd); return 0; -err_close_fds_prev: - while (i--) { - perf_event_unmap(rings[i].mem); -err_close_fds_current: - close(rings[i].fd); - } - free(pfds); -err_free_rings: - free(rings); +err_close_pb: + perf_buffer__free(pb); err_close_map: close(map_fd); return -1; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 616a9c0b7f66073d6625c709a94eee0c9876bda8..43fdbbfe41bb24eef44e91c984c4dd30f41a6798 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -750,6 +751,344 @@ static int do_detach(int argc, char **argv) return 0; } +static int check_single_stdin(char *file_data_in, char *file_ctx_in) +{ + if (file_data_in && file_ctx_in && + !strcmp(file_data_in, "-") && !strcmp(file_ctx_in, "-")) { + p_err("cannot use standard input for both data_in and ctx_in"); + return -1; + } + + return 0; +} + +static int get_run_data(const char *fname, void **data_ptr, unsigned int *size) +{ + size_t block_size = 256; + size_t buf_size = block_size; + size_t nb_read = 0; + void *tmp; + FILE *f; + + if (!fname) { + *data_ptr = NULL; + *size = 0; + return 0; + } + + if (!strcmp(fname, "-")) + f = stdin; + else + f = fopen(fname, "r"); + if (!f) { + p_err("failed to open %s: %s", fname, strerror(errno)); + return -1; + } + + *data_ptr = malloc(block_size); + if (!*data_ptr) { + p_err("failed to allocate memory for 
data_in/ctx_in: %s", + strerror(errno)); + goto err_fclose; + } + + while ((nb_read += fread(*data_ptr + nb_read, 1, block_size, f))) { + if (feof(f)) + break; + if (ferror(f)) { + p_err("failed to read data_in/ctx_in from %s: %s", + fname, strerror(errno)); + goto err_free; + } + if (nb_read > buf_size - block_size) { + if (buf_size == UINT32_MAX) { + p_err("data_in/ctx_in is too long (max: %d)", + UINT32_MAX); + goto err_free; + } + /* No space for fread()-ing next chunk; realloc() */ + buf_size *= 2; + tmp = realloc(*data_ptr, buf_size); + if (!tmp) { + p_err("failed to reallocate data_in/ctx_in: %s", + strerror(errno)); + goto err_free; + } + *data_ptr = tmp; + } + } + if (f != stdin) + fclose(f); + + *size = nb_read; + return 0; + +err_free: + free(*data_ptr); + *data_ptr = NULL; +err_fclose: + if (f != stdin) + fclose(f); + return -1; +} + +static void hex_print(void *data, unsigned int size, FILE *f) +{ + size_t i, j; + char c; + + for (i = 0; i < size; i += 16) { + /* Row offset */ + fprintf(f, "%07zx\t", i); + + /* Hexadecimal values */ + for (j = i; j < i + 16 && j < size; j++) + fprintf(f, "%02x%s", *(uint8_t *)(data + j), + j % 2 ? " " : ""); + for (; j < i + 16; j++) + fprintf(f, " %s", j % 2 ? " " : ""); + + /* ASCII values (if relevant), '.' otherwise */ + fprintf(f, "| "); + for (j = i; j < i + 16 && j < size; j++) { + c = *(char *)(data + j); + if (c < ' ' || c > '~') + c = '.'; + fprintf(f, "%c%s", c, j == i + 7 ? " " : ""); + } + + fprintf(f, "\n"); + } +} + +static int +print_run_output(void *data, unsigned int size, const char *fname, + const char *json_key) +{ + size_t nb_written; + FILE *f; + + if (!fname) + return 0; + + if (!strcmp(fname, "-")) { + f = stdout; + if (json_output) { + jsonw_name(json_wtr, json_key); + print_data_json(data, size); + } else { + hex_print(data, size, f); + } + return 0; + } + + f = fopen(fname, "w"); + if (!f) { + p_err("failed to open %s: %s", fname, strerror(errno)); + return -1; + } + + nb_written = fwrite(data, 1, size, f); + fclose(f); + if (nb_written != size) { + p_err("failed to write output data/ctx: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int alloc_run_data(void **data_ptr, unsigned int size_out) +{ + *data_ptr = calloc(size_out, 1); + if (!*data_ptr) { + p_err("failed to allocate memory for output data/ctx: %s", + strerror(errno)); + return -1; + } + + return 0; +} + +static int do_run(int argc, char **argv) +{ + char *data_fname_in = NULL, *data_fname_out = NULL; + char *ctx_fname_in = NULL, *ctx_fname_out = NULL; + struct bpf_prog_test_run_attr test_attr = {0}; + const unsigned int default_size = SZ_32K; + void *data_in = NULL, *data_out = NULL; + void *ctx_in = NULL, *ctx_out = NULL; + unsigned int repeat = 1; + int fd, err; + + if (!REQ_ARGS(4)) + return -1; + + fd = prog_parse_fd(&argc, &argv); + if (fd < 0) + return -1; + + while (argc) { + if (detect_common_prefix(*argv, "data_in", "data_out", + "data_size_out", NULL)) + return -1; + if (detect_common_prefix(*argv, "ctx_in", "ctx_out", + "ctx_size_out", NULL)) + return -1; + + if (is_prefix(*argv, "data_in")) { + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + data_fname_in = GET_ARG(); + if (check_single_stdin(data_fname_in, ctx_fname_in)) + return -1; + } else if (is_prefix(*argv, "data_out")) { + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + data_fname_out = GET_ARG(); + } else if (is_prefix(*argv, "data_size_out")) { + char *endptr; + + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + test_attr.data_size_out = strtoul(*argv, &endptr, 
0); + if (*endptr) { + p_err("can't parse %s as output data size", + *argv); + return -1; + } + NEXT_ARG(); + } else if (is_prefix(*argv, "ctx_in")) { + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + ctx_fname_in = GET_ARG(); + if (check_single_stdin(data_fname_in, ctx_fname_in)) + return -1; + } else if (is_prefix(*argv, "ctx_out")) { + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + ctx_fname_out = GET_ARG(); + } else if (is_prefix(*argv, "ctx_size_out")) { + char *endptr; + + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + test_attr.ctx_size_out = strtoul(*argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as output context size", + *argv); + return -1; + } + NEXT_ARG(); + } else if (is_prefix(*argv, "repeat")) { + char *endptr; + + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + repeat = strtoul(*argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as repeat number", + *argv); + return -1; + } + NEXT_ARG(); + } else { + p_err("expected no more arguments, 'data_in', 'data_out', 'data_size_out', 'ctx_in', 'ctx_out', 'ctx_size_out' or 'repeat', got: '%s'?", + *argv); + return -1; + } + } + + err = get_run_data(data_fname_in, &data_in, &test_attr.data_size_in); + if (err) + return -1; + + if (data_in) { + if (!test_attr.data_size_out) + test_attr.data_size_out = default_size; + err = alloc_run_data(&data_out, test_attr.data_size_out); + if (err) + goto free_data_in; + } + + err = get_run_data(ctx_fname_in, &ctx_in, &test_attr.ctx_size_in); + if (err) + goto free_data_out; + + if (ctx_in) { + if (!test_attr.ctx_size_out) + test_attr.ctx_size_out = default_size; + err = alloc_run_data(&ctx_out, test_attr.ctx_size_out); + if (err) + goto free_ctx_in; + } + + test_attr.prog_fd = fd; + test_attr.repeat = repeat; + test_attr.data_in = data_in; + test_attr.data_out = data_out; + test_attr.ctx_in = ctx_in; + test_attr.ctx_out = ctx_out; + + err = bpf_prog_test_run_xattr(&test_attr); + if (err) { + p_err("failed to run program: %s", strerror(errno)); + goto free_ctx_out; + } + + err = 0; + + if (json_output) + jsonw_start_object(json_wtr); /* root */ + + /* Do not exit on errors occurring when printing output data/context, + * we still want to print return value and duration for program run. + */ + if (test_attr.data_size_out) + err += print_run_output(test_attr.data_out, + test_attr.data_size_out, + data_fname_out, "data_out"); + if (test_attr.ctx_size_out) + err += print_run_output(test_attr.ctx_out, + test_attr.ctx_size_out, + ctx_fname_out, "ctx_out"); + + if (json_output) { + jsonw_uint_field(json_wtr, "retval", test_attr.retval); + jsonw_uint_field(json_wtr, "duration", test_attr.duration); + jsonw_end_object(json_wtr); /* root */ + } else { + fprintf(stdout, "Return value: %u, duration%s: %uns\n", + test_attr.retval, + repeat > 1 ? 
" (average)" : "", test_attr.duration); + } + +free_ctx_out: + free(ctx_out); +free_ctx_in: + free(ctx_in); +free_data_out: + free(data_out); +free_data_in: + free(data_in); + + return err; +} + static int load_with_options(int argc, char **argv, bool first_prog_only) { struct bpf_object_load_attr load_attr = { 0 }; @@ -1060,6 +1399,11 @@ static int do_help(int argc, char **argv) " [pinmaps MAP_DIR]\n" " %s %s attach PROG ATTACH_TYPE [MAP]\n" " %s %s detach PROG ATTACH_TYPE [MAP]\n" + " %s %s run PROG \\\n" + " data_in FILE \\\n" + " [data_out FILE [data_size_out L]] \\\n" + " [ctx_in FILE [ctx_out FILE [ctx_size_out M]]] \\\n" + " [repeat N]\n" " %s %s tracelog\n" " %s %s help\n" "\n" @@ -1081,7 +1425,8 @@ static int do_help(int argc, char **argv) "", bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], + bin_name, argv[-2]); return 0; } @@ -1097,6 +1442,7 @@ static const struct cmd cmds[] = { { "attach", do_attach }, { "detach", do_detach }, { "tracelog", do_tracelog }, + { "run", do_run }, { 0 } }; diff --git a/tools/include/linux/sizes.h b/tools/include/linux/sizes.h new file mode 100644 index 0000000000000000000000000000000000000000..1cbb4c4d016e37c23fa456f2b5e977ba357c385f --- /dev/null +++ b/tools/include/linux/sizes.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * include/linux/sizes.h + */ +#ifndef __LINUX_SIZES_H__ +#define __LINUX_SIZES_H__ + +#include + +#define SZ_1 0x00000001 +#define SZ_2 0x00000002 +#define SZ_4 0x00000004 +#define SZ_8 0x00000008 +#define SZ_16 0x00000010 +#define SZ_32 0x00000020 +#define SZ_64 0x00000040 +#define SZ_128 0x00000080 +#define SZ_256 0x00000100 +#define SZ_512 0x00000200 + +#define SZ_1K 0x00000400 +#define SZ_2K 0x00000800 +#define SZ_4K 0x00001000 +#define SZ_8K 0x00002000 +#define SZ_16K 0x00004000 +#define SZ_32K 0x00008000 +#define SZ_64K 0x00010000 +#define SZ_128K 0x00020000 +#define SZ_256K 0x00040000 +#define SZ_512K 0x00080000 + +#define SZ_1M 0x00100000 +#define SZ_2M 0x00200000 +#define SZ_4M 0x00400000 +#define SZ_8M 0x00800000 +#define SZ_16M 0x01000000 +#define SZ_32M 0x02000000 +#define SZ_64M 0x04000000 +#define SZ_128M 0x08000000 +#define SZ_256M 0x10000000 +#define SZ_512M 0x20000000 + +#define SZ_1G 0x40000000 +#define SZ_2G 0x80000000 + +#define SZ_4G _AC(0x100000000, ULL) + +#endif /* __LINUX_SIZES_H__ */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 46eed9ee7cca8350f170e9527e093b4485088eb6..5ac3af15dc9488e48fd4cc6df3c18db4b8069c11 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3223,7 +3223,7 @@ struct bpf_sock_addr { __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __u32 user_port; /* Allows 4-byte read and write. @@ -3235,7 +3235,7 @@ struct bpf_sock_addr { __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. 
*/ __bpf_md_ptr(struct bpf_sock *, sk); diff --git a/tools/lib/bpf/README.rst b/tools/lib/bpf/README.rst index e9892da79e98917b2d5e65822631fcc42f856e2a..8e603565ccded907e4a90e4aa7df473c8cd2e87c 100644 --- a/tools/lib/bpf/README.rst +++ b/tools/lib/bpf/README.rst @@ -9,7 +9,8 @@ described here. It's recommended to follow these conventions whenever a new function or type is added to keep libbpf API clean and consistent. All types and functions provided by libbpf API should have one of the -following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``. +following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``, +``perf_buffer_``. System call wrappers -------------------- diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3f21172d6d34981534fb95edb502a42c07312287..4d8ec9068c496d718c18c8152ebbcefec3918d45 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -33,7 +33,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -1379,8 +1381,13 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj) if (!has_datasec && kind == BTF_KIND_VAR) { /* replace VAR with INT */ t->info = BTF_INFO_ENC(BTF_KIND_INT, 0, 0); - t->size = sizeof(int); - *(int *)(t+1) = BTF_INT_ENC(0, 0, 32); + /* + * using size = 1 is the safest choice, 4 will be too + * big and cause kernel BTF validation failure if + * original variable took less than 4 bytes + */ + t->size = 1; + *(int *)(t+1) = BTF_INT_ENC(0, 0, 8); } else if (!has_datasec && kind == BTF_KIND_DATASEC) { /* replace DATASEC with STRUCT */ struct btf_var_secinfo *v = (void *)(t + 1); @@ -1776,15 +1783,22 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, (long long) sym.st_value, sym.st_name, name); shdr_idx = sym.st_shndx; + insn_idx = rel.r_offset / sizeof(struct bpf_insn); + pr_debug("relocation: insn_idx=%u, shdr_idx=%u\n", + insn_idx, shdr_idx); + + if (shdr_idx >= SHN_LORESERVE) { + pr_warning("relocation: not yet supported relo for non-static global \'%s\' variable in special section (0x%x) found in insns[%d].code 0x%x\n", + name, shdr_idx, insn_idx, + insns[insn_idx].code); + return -LIBBPF_ERRNO__RELOC; + } if (!bpf_object__relo_in_known_section(obj, shdr_idx)) { pr_warning("Program '%s' contains unrecognized relo data pointing to section %u\n", prog->section_name, shdr_idx); return -LIBBPF_ERRNO__RELOC; } - insn_idx = rel.r_offset / sizeof(struct bpf_insn); - pr_debug("relocation: insn_idx=%u\n", insn_idx); - if (insns[insn_idx].code == (BPF_JMP | BPF_CALL)) { if (insns[insn_idx].src_reg != BPF_PSEUDO_CALL) { pr_warning("incorrect bpf_call opcode\n"); @@ -2159,6 +2173,7 @@ static int bpf_object__create_maps(struct bpf_object *obj) { struct bpf_create_map_attr create_attr = {}; + int nr_cpus = 0; unsigned int i; int err; @@ -2181,7 +2196,22 @@ bpf_object__create_maps(struct bpf_object *obj) create_attr.map_flags = def->map_flags; create_attr.key_size = def->key_size; create_attr.value_size = def->value_size; - create_attr.max_entries = def->max_entries; + if (def->type == BPF_MAP_TYPE_PERF_EVENT_ARRAY && + !def->max_entries) { + if (!nr_cpus) + nr_cpus = libbpf_num_possible_cpus(); + if (nr_cpus < 0) { + pr_warning("failed to determine number of system CPUs: %d\n", + nr_cpus); + err = nr_cpus; + goto err_out; + } + pr_debug("map '%s': setting size to %d\n", + map->name, nr_cpus); + create_attr.max_entries = nr_cpus; + } else { + create_attr.max_entries = def->max_entries; + } create_attr.btf_fd = 0; create_attr.btf_key_type_id = 0; 
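
The PERF_EVENT_ARRAY branch above means such maps no longer need a hard-coded max_entries. A sketch of a program-side map definition relying on this, using the classic bpf_map_def convention from the selftests (map name illustrative):

```c
#include <linux/bpf.h>
#include "bpf_helpers.h"	/* selftests header providing SEC()/bpf_map_def */

/* max_entries deliberately left at 0: at map creation time libbpf
 * substitutes libbpf_num_possible_cpus(), one slot per possible CPU.
 */
struct bpf_map_def SEC("maps") events = {
	.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size	= sizeof(int),
	.value_size	= sizeof(int),
};
```
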
create_attr.btf_value_type_id = 0; @@ -2198,9 +2228,10 @@ bpf_object__create_maps(struct bpf_object *obj) *pfd = bpf_create_map_xattr(&create_attr); if (*pfd < 0 && (create_attr.btf_key_type_id || create_attr.btf_value_type_id)) { - cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", - map->name, cp, errno); + map->name, cp, err); create_attr.btf_fd = 0; create_attr.btf_key_type_id = 0; create_attr.btf_value_type_id = 0; @@ -2212,11 +2243,11 @@ bpf_object__create_maps(struct bpf_object *obj) if (*pfd < 0) { size_t j; - err = *pfd; + err = -errno; err_out: - cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); - pr_warning("failed to create map (name: '%s'): %s\n", - map->name, cp); + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warning("failed to create map (name: '%s'): %s(%d)\n", + map->name, cp, err); for (j = 0; j < i; j++) zclose(obj->maps[j].fd); return err; @@ -4401,6 +4432,391 @@ bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, return ret; } +struct perf_buffer; + +struct perf_buffer_params { + struct perf_event_attr *attr; + /* if event_cb is specified, it takes precendence */ + perf_buffer_event_fn event_cb; + /* sample_cb and lost_cb are higher-level common-case callbacks */ + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; + int cpu_cnt; + int *cpus; + int *map_keys; +}; + +struct perf_cpu_buf { + struct perf_buffer *pb; + void *base; /* mmap()'ed memory */ + void *buf; /* for reconstructing segmented data */ + size_t buf_size; + int fd; + int cpu; + int map_key; +}; + +struct perf_buffer { + perf_buffer_event_fn event_cb; + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; /* passed into callbacks */ + + size_t page_size; + size_t mmap_size; + struct perf_cpu_buf **cpu_bufs; + struct epoll_event *events; + int cpu_cnt; /* number of allocated CPU buffers */ + int epoll_fd; /* perf event FD */ + int map_fd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */ +}; + +static void perf_buffer__free_cpu_buf(struct perf_buffer *pb, + struct perf_cpu_buf *cpu_buf) +{ + if (!cpu_buf) + return; + if (cpu_buf->base && + munmap(cpu_buf->base, pb->mmap_size + pb->page_size)) + pr_warning("failed to munmap cpu_buf #%d\n", cpu_buf->cpu); + if (cpu_buf->fd >= 0) { + ioctl(cpu_buf->fd, PERF_EVENT_IOC_DISABLE, 0); + close(cpu_buf->fd); + } + free(cpu_buf->buf); + free(cpu_buf); +} + +void perf_buffer__free(struct perf_buffer *pb) +{ + int i; + + if (!pb) + return; + if (pb->cpu_bufs) { + for (i = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i]; + + if (!cpu_buf) + continue; + + bpf_map_delete_elem(pb->map_fd, &cpu_buf->map_key); + perf_buffer__free_cpu_buf(pb, cpu_buf); + } + free(pb->cpu_bufs); + } + if (pb->epoll_fd >= 0) + close(pb->epoll_fd); + free(pb->events); + free(pb); +} + +static struct perf_cpu_buf * +perf_buffer__open_cpu_buf(struct perf_buffer *pb, struct perf_event_attr *attr, + int cpu, int map_key) +{ + struct perf_cpu_buf *cpu_buf; + char msg[STRERR_BUFSIZE]; + int err; + + cpu_buf = calloc(1, sizeof(*cpu_buf)); + if (!cpu_buf) + return ERR_PTR(-ENOMEM); + + cpu_buf->pb = pb; + cpu_buf->cpu = cpu; + cpu_buf->map_key = map_key; + + cpu_buf->fd = syscall(__NR_perf_event_open, attr, -1 /* pid */, cpu, + -1, PERF_FLAG_FD_CLOEXEC); + if (cpu_buf->fd < 0) { + err = -errno; + pr_warning("failed to open perf 
buffer event on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + cpu_buf->base = mmap(NULL, pb->mmap_size + pb->page_size, + PROT_READ | PROT_WRITE, MAP_SHARED, + cpu_buf->fd, 0); + if (cpu_buf->base == MAP_FAILED) { + cpu_buf->base = NULL; + err = -errno; + pr_warning("failed to mmap perf buffer on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + if (ioctl(cpu_buf->fd, PERF_EVENT_IOC_ENABLE, 0) < 0) { + err = -errno; + pr_warning("failed to enable perf buffer event on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + return cpu_buf; + +error: + perf_buffer__free_cpu_buf(pb, cpu_buf); + return (struct perf_cpu_buf *)ERR_PTR(err); +} + +static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, + struct perf_buffer_params *p); + +struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt, + const struct perf_buffer_opts *opts) +{ + struct perf_buffer_params p = {}; + struct perf_event_attr attr = { 0, }; + + attr.config = PERF_COUNT_SW_BPF_OUTPUT, + attr.type = PERF_TYPE_SOFTWARE; + attr.sample_type = PERF_SAMPLE_RAW; + attr.sample_period = 1; + attr.wakeup_events = 1; + + p.attr = &attr; + p.sample_cb = opts ? opts->sample_cb : NULL; + p.lost_cb = opts ? opts->lost_cb : NULL; + p.ctx = opts ? opts->ctx : NULL; + + return __perf_buffer__new(map_fd, page_cnt, &p); +} + +struct perf_buffer * +perf_buffer__new_raw(int map_fd, size_t page_cnt, + const struct perf_buffer_raw_opts *opts) +{ + struct perf_buffer_params p = {}; + + p.attr = opts->attr; + p.event_cb = opts->event_cb; + p.ctx = opts->ctx; + p.cpu_cnt = opts->cpu_cnt; + p.cpus = opts->cpus; + p.map_keys = opts->map_keys; + + return __perf_buffer__new(map_fd, page_cnt, &p); +} + +static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, + struct perf_buffer_params *p) +{ + const char *online_cpus_file = "/sys/devices/system/cpu/online"; + struct bpf_map_info map = {}; + char msg[STRERR_BUFSIZE]; + struct perf_buffer *pb; + bool *online = NULL; + __u32 map_info_len; + int err, i, j, n; + + if (page_cnt & (page_cnt - 1)) { + pr_warning("page count should be power of two, but is %zu\n", + page_cnt); + return ERR_PTR(-EINVAL); + } + + map_info_len = sizeof(map); + err = bpf_obj_get_info_by_fd(map_fd, &map, &map_info_len); + if (err) { + err = -errno; + pr_warning("failed to get map info for map FD %d: %s\n", + map_fd, libbpf_strerror_r(err, msg, sizeof(msg))); + return ERR_PTR(err); + } + + if (map.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) { + pr_warning("map '%s' should be BPF_MAP_TYPE_PERF_EVENT_ARRAY\n", + map.name); + return ERR_PTR(-EINVAL); + } + + pb = calloc(1, sizeof(*pb)); + if (!pb) + return ERR_PTR(-ENOMEM); + + pb->event_cb = p->event_cb; + pb->sample_cb = p->sample_cb; + pb->lost_cb = p->lost_cb; + pb->ctx = p->ctx; + + pb->page_size = getpagesize(); + pb->mmap_size = pb->page_size * page_cnt; + pb->map_fd = map_fd; + + pb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (pb->epoll_fd < 0) { + err = -errno; + pr_warning("failed to create epoll instance: %s\n", + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + if (p->cpu_cnt > 0) { + pb->cpu_cnt = p->cpu_cnt; + } else { + pb->cpu_cnt = libbpf_num_possible_cpus(); + if (pb->cpu_cnt < 0) { + err = pb->cpu_cnt; + goto error; + } + if (map.max_entries < pb->cpu_cnt) + pb->cpu_cnt = map.max_entries; + } + + pb->events = calloc(pb->cpu_cnt, sizeof(*pb->events)); + if (!pb->events) { + err = -ENOMEM; + 
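
For reference, a minimal consumer of the perf_buffer API being introduced here, wiring up the common-case callbacks declared in the libbpf.h hunk below (callback and function names are illustrative):

```c
#include <stdio.h>
#include <libbpf.h>	/* or <bpf/libbpf.h>, depending on installation */

static void handle_sample(void *ctx, int cpu, void *data, __u32 size)
{
	printf("cpu %d: sample of %u bytes\n", cpu, size);
}

static void handle_lost(void *ctx, int cpu, __u64 cnt)
{
	fprintf(stderr, "cpu %d: lost %llu events\n", cpu,
		(unsigned long long)cnt);
}

static int consume(int map_fd)
{
	struct perf_buffer_opts opts = {
		.sample_cb = handle_sample,
		.lost_cb   = handle_lost,
	};
	struct perf_buffer *pb;
	int err;

	/* page_cnt must be a power of two, per __perf_buffer__new() */
	pb = perf_buffer__new(map_fd, 8, &opts);
	err = libbpf_get_error(pb);
	if (err)
		return err;

	/* perf_buffer__poll() invokes the callbacks for each record */
	while ((err = perf_buffer__poll(pb, 100 /* ms */)) >= 0)
		;

	perf_buffer__free(pb);
	return err;
}
```
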
pr_warning("failed to allocate events: out of memory\n"); + goto error; + } + pb->cpu_bufs = calloc(pb->cpu_cnt, sizeof(*pb->cpu_bufs)); + if (!pb->cpu_bufs) { + err = -ENOMEM; + pr_warning("failed to allocate buffers: out of memory\n"); + goto error; + } + + err = parse_cpu_mask_file(online_cpus_file, &online, &n); + if (err) { + pr_warning("failed to get online CPU mask: %d\n", err); + goto error; + } + + for (i = 0, j = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf; + int cpu, map_key; + + cpu = p->cpu_cnt > 0 ? p->cpus[i] : i; + map_key = p->cpu_cnt > 0 ? p->map_keys[i] : i; + + /* in case user didn't explicitly requested particular CPUs to + * be attached to, skip offline/not present CPUs + */ + if (p->cpu_cnt <= 0 && (cpu >= n || !online[cpu])) + continue; + + cpu_buf = perf_buffer__open_cpu_buf(pb, p->attr, cpu, map_key); + if (IS_ERR(cpu_buf)) { + err = PTR_ERR(cpu_buf); + goto error; + } + + pb->cpu_bufs[j] = cpu_buf; + + err = bpf_map_update_elem(pb->map_fd, &map_key, + &cpu_buf->fd, 0); + if (err) { + err = -errno; + pr_warning("failed to set cpu #%d, key %d -> perf FD %d: %s\n", + cpu, map_key, cpu_buf->fd, + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + pb->events[j].events = EPOLLIN; + pb->events[j].data.ptr = cpu_buf; + if (epoll_ctl(pb->epoll_fd, EPOLL_CTL_ADD, cpu_buf->fd, + &pb->events[j]) < 0) { + err = -errno; + pr_warning("failed to epoll_ctl cpu #%d perf FD %d: %s\n", + cpu, cpu_buf->fd, + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + j++; + } + pb->cpu_cnt = j; + free(online); + + return pb; + +error: + free(online); + if (pb) + perf_buffer__free(pb); + return ERR_PTR(err); +} + +struct perf_sample_raw { + struct perf_event_header header; + uint32_t size; + char data[0]; +}; + +struct perf_sample_lost { + struct perf_event_header header; + uint64_t id; + uint64_t lost; + uint64_t sample_id; +}; + +static enum bpf_perf_event_ret +perf_buffer__process_record(struct perf_event_header *e, void *ctx) +{ + struct perf_cpu_buf *cpu_buf = ctx; + struct perf_buffer *pb = cpu_buf->pb; + void *data = e; + + /* user wants full control over parsing perf event */ + if (pb->event_cb) + return pb->event_cb(pb->ctx, cpu_buf->cpu, e); + + switch (e->type) { + case PERF_RECORD_SAMPLE: { + struct perf_sample_raw *s = data; + + if (pb->sample_cb) + pb->sample_cb(pb->ctx, cpu_buf->cpu, s->data, s->size); + break; + } + case PERF_RECORD_LOST: { + struct perf_sample_lost *s = data; + + if (pb->lost_cb) + pb->lost_cb(pb->ctx, cpu_buf->cpu, s->lost); + break; + } + default: + pr_warning("unknown perf sample type %d\n", e->type); + return LIBBPF_PERF_EVENT_ERROR; + } + return LIBBPF_PERF_EVENT_CONT; +} + +static int perf_buffer__process_records(struct perf_buffer *pb, + struct perf_cpu_buf *cpu_buf) +{ + enum bpf_perf_event_ret ret; + + ret = bpf_perf_event_read_simple(cpu_buf->base, pb->mmap_size, + pb->page_size, &cpu_buf->buf, + &cpu_buf->buf_size, + perf_buffer__process_record, cpu_buf); + if (ret != LIBBPF_PERF_EVENT_CONT) + return ret; + return 0; +} + +int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms) +{ + int i, cnt, err; + + cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, timeout_ms); + for (i = 0; i < cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->events[i].data.ptr; + + err = perf_buffer__process_records(pb, cpu_buf); + if (err) { + pr_warning("error while processing records: %d\n", err); + return err; + } + } + return cnt < 0 ? -errno : cnt; +} + struct bpf_prog_info_array_desc { int array_offset; /* e.g. 
offset of jited_prog_insns */ int count_offset; /* e.g. offset of jited_prog_len */ @@ -4647,62 +5063,104 @@ void bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear) } } -int libbpf_num_possible_cpus(void) +int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz) { - static const char *fcpu = "/sys/devices/system/cpu/possible"; - int len = 0, n = 0, il = 0, ir = 0; - unsigned int start = 0, end = 0; - int tmp_cpus = 0; - static int cpus; - char buf[128]; - int error = 0; - int fd = -1; + int err = 0, n, len, start, end = -1; + bool *tmp; - tmp_cpus = READ_ONCE(cpus); - if (tmp_cpus > 0) - return tmp_cpus; + *mask = NULL; + *mask_sz = 0; + + /* Each sub string separated by ',' has format \d+-\d+ or \d+ */ + while (*s) { + if (*s == ',' || *s == '\n') { + s++; + continue; + } + n = sscanf(s, "%d%n-%d%n", &start, &len, &end, &len); + if (n <= 0 || n > 2) { + pr_warning("Failed to get CPU range %s: %d\n", s, n); + err = -EINVAL; + goto cleanup; + } else if (n == 1) { + end = start; + } + if (start < 0 || start > end) { + pr_warning("Invalid CPU range [%d,%d] in %s\n", + start, end, s); + err = -EINVAL; + goto cleanup; + } + tmp = realloc(*mask, end + 1); + if (!tmp) { + err = -ENOMEM; + goto cleanup; + } + *mask = tmp; + memset(tmp + *mask_sz, 0, start - *mask_sz); + memset(tmp + start, 1, end - start + 1); + *mask_sz = end + 1; + s += len; + } + if (!*mask_sz) { + pr_warning("Empty CPU range\n"); + return -EINVAL; + } + return 0; +cleanup: + free(*mask); + *mask = NULL; + return err; +} + +int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz) +{ + int fd, err = 0, len; + char buf[128]; fd = open(fcpu, O_RDONLY); if (fd < 0) { - error = errno; - pr_warning("Failed to open file %s: %s\n", - fcpu, strerror(error)); - return -error; + err = -errno; + pr_warning("Failed to open cpu mask file %s: %d\n", fcpu, err); + return err; } len = read(fd, buf, sizeof(buf)); close(fd); if (len <= 0) { - error = len ? errno : EINVAL; - pr_warning("Failed to read # of possible cpus from %s: %s\n", - fcpu, strerror(error)); - return -error; + err = len ? 
-errno : -EINVAL; + pr_warning("Failed to read cpu mask from %s: %d\n", fcpu, err); + return err; } - if (len == sizeof(buf)) { - pr_warning("File %s size overflow\n", fcpu); - return -EOVERFLOW; + if (len >= sizeof(buf)) { + pr_warning("CPU mask is too big in file %s\n", fcpu); + return -E2BIG; } buf[len] = '\0'; - for (ir = 0, tmp_cpus = 0; ir <= len; ir++) { - /* Each sub string separated by ',' has format \d+-\d+ or \d+ */ - if (buf[ir] == ',' || buf[ir] == '\0') { - buf[ir] = '\0'; - n = sscanf(&buf[il], "%u-%u", &start, &end); - if (n <= 0) { - pr_warning("Failed to get # CPUs from %s\n", - &buf[il]); - return -EINVAL; - } else if (n == 1) { - end = start; - } - tmp_cpus += end - start + 1; - il = ir + 1; - } - } - if (tmp_cpus <= 0) { - pr_warning("Invalid #CPUs %d from %s\n", tmp_cpus, fcpu); - return -EINVAL; + return parse_cpu_mask_str(buf, mask, mask_sz); +} + +int libbpf_num_possible_cpus(void) +{ + static const char *fcpu = "/sys/devices/system/cpu/possible"; + static int cpus; + int err, n, i, tmp_cpus; + bool *mask; + + tmp_cpus = READ_ONCE(cpus); + if (tmp_cpus > 0) + return tmp_cpus; + + err = parse_cpu_mask_file(fcpu, &mask, &n); + if (err) + return err; + + tmp_cpus = 0; + for (i = 0; i < n; i++) { + if (mask[i]) + tmp_cpus++; } + free(mask); WRITE_ONCE(cpus, tmp_cpus); return tmp_cpus; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index f55933784f95208b230b283355aad53bfc92708f..5cbf459ece0b29eb3e36981d0b3cfe5a951ae562 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -358,6 +358,26 @@ LIBBPF_API int bpf_prog_load(const char *file, enum bpf_prog_type type, LIBBPF_API int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags); +struct perf_buffer; + +typedef void (*perf_buffer_sample_fn)(void *ctx, int cpu, + void *data, __u32 size); +typedef void (*perf_buffer_lost_fn)(void *ctx, int cpu, __u64 cnt); + +/* common use perf buffer options */ +struct perf_buffer_opts { + /* if specified, sample_cb is called for each sample */ + perf_buffer_sample_fn sample_cb; + /* if specified, lost_cb is called for each batch of lost samples */ + perf_buffer_lost_fn lost_cb; + /* ctx is provided to sample_cb and lost_cb */ + void *ctx; +}; + +LIBBPF_API struct perf_buffer * +perf_buffer__new(int map_fd, size_t page_cnt, + const struct perf_buffer_opts *opts); + enum bpf_perf_event_ret { LIBBPF_PERF_EVENT_DONE = 0, LIBBPF_PERF_EVENT_ERROR = -1, @@ -365,6 +385,35 @@ enum bpf_perf_event_ret { }; struct perf_event_header; + +typedef enum bpf_perf_event_ret +(*perf_buffer_event_fn)(void *ctx, int cpu, struct perf_event_header *event); + +/* raw perf buffer options, giving most power and control */ +struct perf_buffer_raw_opts { + /* perf event attrs passed directly into perf_event_open() */ + struct perf_event_attr *attr; + /* raw event callback */ + perf_buffer_event_fn event_cb; + /* ctx is provided to event_cb */ + void *ctx; + /* if cpu_cnt == 0, open all on all possible CPUs (up to the number of + * max_entries of given PERF_EVENT_ARRAY map) + */ + int cpu_cnt; + /* if cpu_cnt > 0, cpus is an array of CPUs to open ring buffers on */ + int *cpus; + /* if cpu_cnt > 0, map_keys specify map keys to set per-CPU FDs for */ + int *map_keys; +}; + +LIBBPF_API struct perf_buffer * +perf_buffer__new_raw(int map_fd, size_t page_cnt, + const struct perf_buffer_raw_opts *opts); + +LIBBPF_API void perf_buffer__free(struct perf_buffer *pb); +LIBBPF_API int perf_buffer__poll(struct perf_buffer 
*pb, int timeout_ms); + typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr, void *private_data); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index e6b7d4edbc936e461cb960155047dab429951c5e..f9d316e873d8d2d7ff37d138fdef0cbbbd425bcd 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -179,4 +179,8 @@ LIBBPF_0.0.4 { btf_dump__new; btf__parse_elf; libbpf_num_possible_cpus; + perf_buffer__free; + perf_buffer__new; + perf_buffer__new_raw; + perf_buffer__poll; } LIBBPF_0.0.3; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 2ac29bd362269ad8f901e06bc03033c86fbf3474..86df374629144e0cc1665f84f1b4bd327539c37c 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -43,6 +43,8 @@ do { \ #define pr_info(fmt, ...) __pr(LIBBPF_INFO, fmt, ##__VA_ARGS__) #define pr_debug(fmt, ...) __pr(LIBBPF_DEBUG, fmt, ##__VA_ARGS__) +int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz); +int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); int libbpf__load_raw_btf(const char *raw_types, size_t types_len, const char *str_sec, size_t str_len); diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 9dfaff57daea085f9cffb3a11a3d97152464d32d..f5357230e16c1fdac975c9bcbb598fdf1ae82c64 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -317,15 +317,14 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk) static int xsk_get_max_queues(struct xsk_socket *xsk) { - struct ethtool_channels channels; - struct ifreq ifr; + struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; + struct ifreq ifr = {}; int fd, err, ret; fd = socket(AF_INET, SOCK_DGRAM, 0); if (fd < 0) return -errno; - channels.cmd = ETHTOOL_GCHANNELS; ifr.ifr_data = (void *)&channels; strncpy(ifr.ifr_name, xsk->ifname, IFNAMSIZ - 1); ifr.ifr_name[IFNAMSIZ - 1] = '\0'; @@ -335,7 +334,7 @@ static int xsk_get_max_queues(struct xsk_socket *xsk) goto out; } - if (channels.max_combined == 0 || errno == EOPNOTSUPP) + if (err || channels.max_combined == 0) /* If the device says it has no channels, then all traffic * is sent to a single stream, so max queues = 1. 
*/ @@ -520,7 +519,8 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, err = -errno; goto out_socket; } - strncpy(xsk->ifname, ifname, IFNAMSIZ); + strncpy(xsk->ifname, ifname, IFNAMSIZ - 1); + xsk->ifname[IFNAMSIZ - 1] = '\0'; err = xsk_set_xdp_socket_config(&xsk->config, usr_config); if (err) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4ee5694f5602eafa06c6a6cdc5c00fd88b4e1b5e..40ecae377d154f7d1c66c070725c3095b5768b44 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -26,7 +26,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \ test_socket_cookie test_cgroup_storage test_select_reuseport test_section_names \ test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_cgroup_attach test_sockopt \ - test_sockopt_sk test_sockopt_multi + test_sockopt_sk test_sockopt_multi test_tcp_rtt BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c))) TEST_GEN_FILES = $(BPF_OBJ_FILES) @@ -96,6 +96,7 @@ $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c $(OUTPUT)/test_sockopt: cgroup_helpers.c $(OUTPUT)/test_sockopt_sk: cgroup_helpers.c $(OUTPUT)/test_sockopt_multi: cgroup_helpers.c +$(OUTPUT)/test_tcp_rtt: cgroup_helpers.c .PHONY: force diff --git a/tools/testing/selftests/bpf/progs/tcp_rtt.c b/tools/testing/selftests/bpf/progs/tcp_rtt.c new file mode 100644 index 0000000000000000000000000000000000000000..233bdcb1659ef7ab3ce9a3be7d8745a865529f0d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_rtt.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "bpf_helpers.h" + +char _license[] SEC("license") = "GPL"; +__u32 _version SEC("version") = 1; + +struct tcp_rtt_storage { + __u32 invoked; + __u32 dsack_dups; + __u32 delivered; + __u32 delivered_ce; + __u32 icsk_retransmits; +}; + +struct bpf_map_def SEC("maps") socket_storage_map = { + .type = BPF_MAP_TYPE_SK_STORAGE, + .key_size = sizeof(int), + .value_size = sizeof(struct tcp_rtt_storage), + .map_flags = BPF_F_NO_PREALLOC, +}; +BPF_ANNOTATE_KV_PAIR(socket_storage_map, int, struct tcp_rtt_storage); + +SEC("sockops") +int _sockops(struct bpf_sock_ops *ctx) +{ + struct tcp_rtt_storage *storage; + struct bpf_tcp_sock *tcp_sk; + int op = (int) ctx->op; + struct bpf_sock *sk; + + sk = ctx->sk; + if (!sk) + return 1; + + storage = bpf_sk_storage_get(&socket_storage_map, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!storage) + return 1; + + if (op == BPF_SOCK_OPS_TCP_CONNECT_CB) { + bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG); + return 1; + } + + if (op != BPF_SOCK_OPS_RTT_CB) + return 1; + + tcp_sk = bpf_tcp_sock(sk); + if (!tcp_sk) + return 1; + + storage->invoked++; + + storage->dsack_dups = tcp_sk->dsack_dups; + storage->delivered = tcp_sk->delivered; + storage->delivered_ce = tcp_sk->delivered_ce; + storage->icsk_retransmits = tcp_sk->icsk_retransmits; + + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/test_perf_buffer.c b/tools/testing/selftests/bpf/progs/test_perf_buffer.c new file mode 100644 index 0000000000000000000000000000000000000000..876c27deb65a3333e1cdfb460c8e4c6dda16cd03 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_perf_buffer.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook + +#include +#include +#include "bpf_helpers.h" + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + 
diff --git a/tools/testing/selftests/bpf/progs/test_perf_buffer.c b/tools/testing/selftests/bpf/progs/test_perf_buffer.c
new file mode 100644
index 0000000000000000000000000000000000000000..876c27deb65a3333e1cdfb460c8e4c6dda16cd03
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_perf_buffer.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(int));
+} perf_buf_map SEC(".maps");
+
+SEC("kprobe/sys_nanosleep")
+int handle_sys_nanosleep_entry(struct pt_regs *ctx)
+{
+	int cpu = bpf_get_smp_processor_id();
+
+	bpf_perf_event_output(ctx, &perf_buf_map, BPF_F_CURRENT_CPU,
+			      &cpu, sizeof(cpu));
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
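The consumer side of this program is the new perf_buffer API exported in libbpf.map above and exercised by test_progs.c below. A minimal hedged sketch of that flow; drain_perf_buf() is a hypothetical helper and error handling is reduced to libbpf_get_error():

#include <bpf/libbpf.h>

static void handle_sample(void *ctx, int cpu, void *data, __u32 size)
{
	/* "data" points at the bytes bpf_perf_event_output() emitted;
	 * for the program above that is a single int (the CPU number) */
}

static int drain_perf_buf(int map_fd)
{
	struct perf_buffer_opts pb_opts = { .sample_cb = handle_sample };
	struct perf_buffer *pb;
	long err;

	/* 1 page of ring buffer per CPU, as the selftest below uses */
	pb = perf_buffer__new(map_fd, 1, &pb_opts);
	err = libbpf_get_error(pb);
	if (err)
		return (int)err;

	/* dispatch pending samples, waiting up to 100 ms */
	err = perf_buffer__poll(pb, 100);
	perf_buffer__free(pb);
	return err < 0 ? (int)err : 0;
}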
diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c
index 0ae7a741541464390d7e4a43aa941028d0f2a430..6cc29b58d6707e90411f8ca3b01162a72a65517d 100644
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/test_align.c
@@ -180,7 +180,7 @@ static struct bpf_align_test tests[] = {
 	},
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.matches = {
-		{7, "R0=pkt(id=0,off=8,r=8,imm=0)"},
+		{7, "R0_w=pkt(id=0,off=8,r=8,imm=0)"},
		{7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
		{8, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
		{9, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
@@ -315,7 +315,7 @@ static struct bpf_align_test tests[] = {
		/* Calculated offset in R6 has unknown value, but known
		 * alignment of 4.
		 */
-		{8, "R2=pkt(id=0,off=0,r=8,imm=0)"},
+		{8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
		{8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
		/* Offset is added to packet pointer R5, resulting in
		 * known fixed offset, and variable offset from R6.
@@ -405,7 +405,7 @@ static struct bpf_align_test tests[] = {
		/* Calculated offset in R6 has unknown value, but known
		 * alignment of 4.
		 */
-		{8, "R2=pkt(id=0,off=0,r=8,imm=0)"},
+		{8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
		{8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
		/* Adding 14 makes R6 be (4n+2) */
		{9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
@@ -473,12 +473,12 @@ static struct bpf_align_test tests[] = {
		/* (4n) + 14 == (4n+2).  We blow our bounds, because
		 * the add could overflow.
		 */
-		{7, "R5=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
+		{7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
		/* Checked s>=0 */
		{9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
		/* packet pointer + nonnegative (4n+2) */
		{11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
-		{13, "R4=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
+		{13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
		/* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
		 * We checked the bounds, but it might have been able
		 * to overflow if the packet pointer started in the
@@ -486,7 +486,7 @@ static struct bpf_align_test tests[] = {
		 * So we did not get a 'range' on R6, and the access
		 * attempt will fail.
		 */
-		{15, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
+		{15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
	}
 },
 {
@@ -521,7 +521,7 @@ static struct bpf_align_test tests[] = {
		/* Calculated offset in R6 has unknown value, but known
		 * alignment of 4.
		 */
-		{7, "R2=pkt(id=0,off=0,r=8,imm=0)"},
+		{7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
		{9, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
		/* Adding 14 makes R6 be (4n+2) */
		{10, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
@@ -575,7 +575,7 @@ static struct bpf_align_test tests[] = {
		/* Calculated offset in R6 has unknown value, but known
		 * alignment of 4.
		 */
-		{7, "R2=pkt(id=0,off=0,r=8,imm=0)"},
+		{7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
		{10, "R6_w=inv(id=0,umax_value=60,var_off=(0x0; 0x3c))"},
		/* Adding 14 makes R6 be (4n+2) */
		{11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"},
diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c
index 781c7de343be005a92bffd7a2f6da0f0eaae22bb..1b25a7e348dc477caa4d806d6ed2168ea7e2da23 100644
--- a/tools/testing/selftests/bpf/test_lru_map.c
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -18,9 +18,11 @@
 #include
 #include
+#include
 
 #include "bpf_util.h"
 #include "bpf_rlimit.h"
+#include "../../../include/linux/filter.h"
 
 #define LOCAL_FREE_TARGET	(128)
 #define PERCPU_FREE_TARGET	(4)
@@ -40,6 +42,68 @@ static int create_map(int map_type, int map_flags, unsigned int size)
 	return map_fd;
 }
 
+static int bpf_map_lookup_elem_with_ref_bit(int fd, unsigned long long key,
+					    void *value)
+{
+	struct bpf_load_program_attr prog;
+	struct bpf_create_map_attr map;
+	struct bpf_insn insns[] = {
+		BPF_LD_MAP_VALUE(BPF_REG_9, 0, 0),
+		BPF_LD_MAP_FD(BPF_REG_1, fd),
+		BPF_LD_IMM64(BPF_REG_3, key),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+		BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
+		BPF_STX_MEM(BPF_DW, BPF_REG_9, BPF_REG_1, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 42),
+		BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+		BPF_MOV64_IMM(BPF_REG_0, 1),
+		BPF_EXIT_INSN(),
+	};
+	__u8 data[64] = {};
+	int mfd, pfd, ret, zero = 0;
+	__u32 retval = 0;
+
+	memset(&map, 0, sizeof(map));
+	map.map_type = BPF_MAP_TYPE_ARRAY;
+	map.key_size = sizeof(int);
+	map.value_size = sizeof(unsigned long long);
+	map.max_entries = 1;
+
+	mfd = bpf_create_map_xattr(&map);
+	if (mfd < 0)
+		return -1;
+
+	insns[0].imm = mfd;
+
+	memset(&prog, 0, sizeof(prog));
+	prog.prog_type = BPF_PROG_TYPE_SCHED_CLS;
+	prog.insns = insns;
+	prog.insns_cnt = ARRAY_SIZE(insns);
+	prog.license = "GPL";
+
+	pfd = bpf_load_program_xattr(&prog, NULL, 0);
+	if (pfd < 0) {
+		close(mfd);
+		return -1;
+	}
+
+	ret = bpf_prog_test_run(pfd, 1, data, sizeof(data),
+				NULL, NULL, &retval, NULL);
+	if (ret < 0 || retval != 42) {
+		ret = -1;
+	} else {
+		assert(!bpf_map_lookup_elem(mfd, &zero, value));
+		ret = 0;
+	}
+	close(pfd);
+	close(mfd);
+	return ret;
+}
+
 static int map_subset(int map0, int map1)
 {
 	unsigned long long next_key = 0;
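The point of bpf_map_lookup_elem_with_ref_bit() is that a lookup through the bpf(2) syscall does not mark the LRU reference bit; only a lookup from a running program does, which the helper arranges by driving a SCHED_CLS program through bpf_prog_test_run(). A usage contrast, assuming the map fds and nr_cpus from the surrounding test file:

#include <assert.h>
#include <bpf/bpf.h>

static void mark_ref_bit_demo(int lru_map_fd, unsigned int nr_cpus)
{
	unsigned long long key = 1, value[nr_cpus];

	/* syscall-side lookup: finds the element but does NOT set the
	 * LRU ref bit, so the entry stays eviction-eligible */
	assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));

	/* datapath lookup via the bpf_prog_test_run()-based helper
	 * above: same element, but now the ref bit is marked and the
	 * LRU will prefer evicting some other entry */
	assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
}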
@@ -87,7 +151,7 @@ static int sched_next_online(int pid, int *next_to_try)
 	return ret;
 }
 
-/* Size of the LRU amp is 2
+/* Size of the LRU map is 2
  * Add key=1 (+1 key)
  * Add key=2 (+1 key)
  * Lookup Key=1
@@ -157,7 +221,7 @@ static void test_lru_sanity0(int map_type, int map_flags)
 	 * stop LRU from removing key=1
 	 */
 	key = 1;
-	assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+	assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
 	assert(value[0] == 1234);
 
 	key = 3;
@@ -167,7 +231,8 @@ static void test_lru_sanity0(int map_type, int map_flags)
 
 	/* key=2 has been removed from the LRU */
 	key = 2;
-	assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1);
+	assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+	       errno == ENOENT);
 
 	assert(map_equal(lru_map_fd, expected_map_fd));
 
@@ -221,7 +286,7 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 	/* Lookup 1 to tgt_free/2 */
 	end_key = 1 + batch_size;
 	for (key = 1; key < end_key; key++) {
-		assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+		assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
 		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
					    BPF_NOEXIST));
 	}
@@ -322,10 +387,11 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 	end_key = 1 + batch_size;
 	value[0] = 4321;
 	for (key = 1; key < end_key; key++) {
-		assert(bpf_map_lookup_elem(lru_map_fd, &key, value));
+		assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+		       errno == ENOENT);
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
					    BPF_NOEXIST));
-		assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+		assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
 		assert(value[0] == 4321);
 		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
					    BPF_NOEXIST));
@@ -404,7 +470,7 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
 	/* Lookup key 1 to tgt_free*3/2 */
 	end_key = tgt_free + batch_size;
 	for (key = 1; key < end_key; key++) {
-		assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+		assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
 		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
					    BPF_NOEXIST));
 	}
@@ -463,7 +529,7 @@ static void test_lru_sanity4(int map_type, int map_flags, unsigned int tgt_free)
 		assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
 
 	for (key = 1; key <= tgt_free; key++) {
-		assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+		assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
 		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
					    BPF_NOEXIST));
 	}
@@ -494,16 +560,16 @@ static void do_test_lru_sanity5(unsigned long long last_key, int map_fd)
 	unsigned long long key, value[nr_cpus];
 
 	/* Ensure the last key inserted by previous CPU can be found */
-	assert(!bpf_map_lookup_elem(map_fd, &last_key, value));
-
+	assert(!bpf_map_lookup_elem_with_ref_bit(map_fd, last_key, value));
 	value[0] = 1234;
 
 	key = last_key + 1;
 	assert(!bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST));
-	assert(!bpf_map_lookup_elem(map_fd, &key, value));
+	assert(!bpf_map_lookup_elem_with_ref_bit(map_fd, key, value));
 
 	/* Cannot find the last key because it was removed by LRU */
-	assert(bpf_map_lookup_elem(map_fd, &last_key, value));
+	assert(bpf_map_lookup_elem(map_fd, &last_key, value) == -1 &&
+	       errno == ENOENT);
 }
 
 /* Test map with only one element */
@@ -590,8 +656,8 @@ static void test_lru_sanity6(int map_type, int map_flags, int tgt_free)
 	/* Make ref bit sticky for key: [1, tgt_free] */
 	for (stable_key = 1; stable_key <= tgt_free; stable_key++) {
 		/* Mark the ref bit */
-		assert(!bpf_map_lookup_elem(lru_map_fd, &stable_key,
-					    value));
+		assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd,
+							 stable_key, value));
 	}
 	assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
@@ -612,6 +678,198 @@ static void test_lru_sanity6(int map_type, int map_flags, int tgt_free)
 	printf("Pass\n");
 }
 
+/* Size of the LRU map is 2
+ * Add key=1 (+1 key)
+ * Add key=2 (+1 key)
+ * Lookup Key=1 (datapath)
+ * Lookup Key=2 (syscall)
+ * Add Key=3
+ *	=> Key=2 will be removed by LRU
+ * Iterate map.  Only found key=1 and key=3
+ */
+static void test_lru_sanity7(int map_type, int map_flags)
+{
+	unsigned long long key, value[nr_cpus];
+	int lru_map_fd, expected_map_fd;
+	int next_cpu = 0;
+
+	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+	       map_flags);
+
+	assert(sched_next_online(0, &next_cpu) != -1);
+
+	if (map_flags & BPF_F_NO_COMMON_LRU)
+		lru_map_fd = create_map(map_type, map_flags, 2 * nr_cpus);
+	else
+		lru_map_fd = create_map(map_type, map_flags, 2);
+	assert(lru_map_fd != -1);
+
+	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, 2);
+	assert(expected_map_fd != -1);
+
+	value[0] = 1234;
+
+	/* insert key=1 element */
+
+	key = 1;
+	assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+	assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+				    BPF_NOEXIST));
+
+	/* BPF_NOEXIST means: add new element if it doesn't exist */
+	assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST) == -1
+	       /* key=1 already exists */
+	       && errno == EEXIST);
+
+	/* insert key=2 element */
+
+	/* check that key=2 is not found */
+	key = 2;
+	assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+	       errno == ENOENT);
+
+	/* BPF_EXIST means: update existing element */
+	assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_EXIST) == -1 &&
+	       /* key=2 is not there */
+	       errno == ENOENT);
+
+	assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+
+	/* insert key=3 element */
+
+	/* check that key=3 is not found */
+	key = 3;
+	assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+	       errno == ENOENT);
+
+	/* check that key=1 can be found and mark the ref bit to
+	 * stop LRU from removing key=1
+	 */
+	key = 1;
+	assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
+	assert(value[0] == 1234);
+
+	/* check that key=2 can be found and do _not_ mark ref bit.
+	 * this will be evicted on next update.
+	 */
+	key = 2;
+	assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+	assert(value[0] == 1234);
+
+	key = 3;
+	assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+	assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+				    BPF_NOEXIST));
+
+	/* key=2 has been removed from the LRU */
+	key = 2;
+	assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+	       errno == ENOENT);
+
+	assert(map_equal(lru_map_fd, expected_map_fd));
+
+	close(expected_map_fd);
+	close(lru_map_fd);
+
+	printf("Pass\n");
+}
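The final "Iterate map" step is what map_equal() checks; spelled out by hand it is a bpf_map_get_next_key() walk. A hedged sketch (dump_keys() is a hypothetical helper) that should print only key=1 and key=3 after test_lru_sanity7() runs:

#include <stdio.h>
#include <bpf/bpf.h>

static void dump_keys(int map_fd)
{
	unsigned long long key, next_key;
	int err;

	/* passing NULL as the current key returns the first key */
	err = bpf_map_get_next_key(map_fd, NULL, &next_key);
	while (!err) {
		printf("key=%llu\n", next_key);
		key = next_key;
		err = bpf_map_get_next_key(map_fd, &key, &next_key);
	}
}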
+
+/* Size of the LRU map is 2
+ * Add key=1 (+1 key)
+ * Add key=2 (+1 key)
+ * Lookup Key=1 (syscall)
+ * Lookup Key=2 (datapath)
+ * Add Key=3
+ *	=> Key=1 will be removed by LRU
+ * Iterate map.  Only found key=2 and key=3
+ */
+static void test_lru_sanity8(int map_type, int map_flags)
+{
+	unsigned long long key, value[nr_cpus];
+	int lru_map_fd, expected_map_fd;
+	int next_cpu = 0;
+
+	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+	       map_flags);
+
+	assert(sched_next_online(0, &next_cpu) != -1);
+
+	if (map_flags & BPF_F_NO_COMMON_LRU)
+		lru_map_fd = create_map(map_type, map_flags, 2 * nr_cpus);
+	else
+		lru_map_fd = create_map(map_type, map_flags, 2);
+	assert(lru_map_fd != -1);
+
+	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, 2);
+	assert(expected_map_fd != -1);
+
+	value[0] = 1234;
+
+	/* insert key=1 element */
+
+	key = 1;
+	assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+
+	/* BPF_NOEXIST means: add new element if it doesn't exist */
+	assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST) == -1
+	       /* key=1 already exists */
+	       && errno == EEXIST);
+
+	/* insert key=2 element */
+
+	/* check that key=2 is not found */
+	key = 2;
+	assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+	       errno == ENOENT);
+
+	/* BPF_EXIST means: update existing element */
+	assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_EXIST) == -1 &&
+	       /* key=2 is not there */
+	       errno == ENOENT);
+
+	assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+	assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+				    BPF_NOEXIST));
+
+	/* insert key=3 element */
+
+	/* check that key=3 is not found */
+	key = 3;
+	assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+	       errno == ENOENT);
+
+	/* check that key=1 can be found and do _not_ mark ref bit.
+	 * this will be evicted on next update.
+	 */
+	key = 1;
+	assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+	assert(value[0] == 1234);
+
+	/* check that key=2 can be found and mark the ref bit to
+	 * stop LRU from removing key=2
+	 */
+	key = 2;
+	assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
+	assert(value[0] == 1234);
+
+	key = 3;
+	assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+	assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+				    BPF_NOEXIST));
+
+	/* key=1 has been removed from the LRU */
+	key = 1;
+	assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+	       errno == ENOENT);
+
+	assert(map_equal(lru_map_fd, expected_map_fd));
+
+	close(expected_map_fd);
+	close(lru_map_fd);
+
+	printf("Pass\n");
+}
+
 int main(int argc, char **argv)
 {
 	int map_types[] = {BPF_MAP_TYPE_LRU_HASH,
@@ -637,6 +895,8 @@ int main(int argc, char **argv)
 			test_lru_sanity4(map_types[t], map_flags[f], tgt_free);
 			test_lru_sanity5(map_types[t], map_flags[f]);
 			test_lru_sanity6(map_types[t], map_flags[f], tgt_free);
+			test_lru_sanity7(map_types[t], map_flags[f]);
+			test_lru_sanity8(map_types[t], map_flags[f]);
 
 			printf("\n");
 		}
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index fb5566eb06808c1b372c9956a2769d009567cc74..bbe72073daee577233e430ce3be769aeb2c15f08 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -4,6 +4,8 @@
  * modify it under the terms of version 2 of the GNU General Public
  * License as published by the Free Software Foundation.
  */
+
+#define _GNU_SOURCE
 #include
 #include
 #include
@@ -13,6 +15,7 @@
 #include
 #include
 #include
+#include
 #include
 
 typedef __u16 __sum16;
@@ -34,6 +37,7 @@ typedef __u16 __sum16;
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -54,6 +58,12 @@ bool verifier_stats = false;
 
 #define MAGIC_BYTES 123
 
+#ifdef __x86_64__
+#define SYS_KPROBE_NAME "__x64_sys_nanosleep"
+#else
+#define SYS_KPROBE_NAME "sys_nanosleep"
+#endif
+
 /* ipv4 test vector */
 static struct {
 	struct ethhdr eth;
@@ -2978,6 +2988,94 @@ static void test_send_signal(void)
 		printf("test_send_signal:FAIL\n");
 }
 
+static void on_sample(void *ctx, int cpu, void *data, __u32 size)
+{
+	int cpu_data = *(int *)data, duration = 0;
+	cpu_set_t *cpu_seen = ctx;
+
+	if (cpu_data != cpu)
+		CHECK(cpu_data != cpu, "check_cpu_data",
+		      "cpu_data %d != cpu %d\n", cpu_data, cpu);
+
+	CPU_SET(cpu, cpu_seen);
+}
+
+static void test_perf_buffer(void)
+{
+	int err, prog_fd, nr_cpus, i, duration = 0;
+	const char *prog_name = "kprobe/sys_nanosleep";
+	const char *file = "./test_perf_buffer.o";
+	struct perf_buffer_opts pb_opts = {};
+	struct bpf_map *perf_buf_map;
+	cpu_set_t cpu_set, cpu_seen;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	struct perf_buffer *pb;
+	struct bpf_link *link;
+
+	nr_cpus = libbpf_num_possible_cpus();
+	if (CHECK(nr_cpus < 0, "nr_cpus", "err %d\n", nr_cpus))
+		return;
+
+	/* load program */
+	err = bpf_prog_load(file, BPF_PROG_TYPE_KPROBE, &obj, &prog_fd);
+	if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno))
+		return;
+
+	prog = bpf_object__find_program_by_title(obj, prog_name);
+	if (CHECK(!prog, "find_probe", "prog '%s' not found\n", prog_name))
+		goto out_close;
+
+	/* load map */
+	perf_buf_map = bpf_object__find_map_by_name(obj, "perf_buf_map");
+	if (CHECK(!perf_buf_map, "find_perf_buf_map", "not found\n"))
+		goto out_close;
+
+	/* attach kprobe */
+	link = bpf_program__attach_kprobe(prog, false /* retprobe */,
+					  SYS_KPROBE_NAME);
+	if (CHECK(IS_ERR(link), "attach_kprobe", "err %ld\n", PTR_ERR(link)))
+		goto out_close;
+
+	/* set up perf buffer */
+	pb_opts.sample_cb = on_sample;
+	pb_opts.ctx = &cpu_seen;
+	pb = perf_buffer__new(bpf_map__fd(perf_buf_map), 1, &pb_opts);
+	if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
+		goto out_detach;
+
+	/* trigger kprobe on every CPU */
+	CPU_ZERO(&cpu_seen);
+	for (i = 0; i < nr_cpus; i++) {
+		CPU_ZERO(&cpu_set);
+		CPU_SET(i, &cpu_set);
+
+		err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set),
+					     &cpu_set);
+		if (err && CHECK(err, "set_affinity", "cpu #%d, err %d\n",
+				 i, err))
+			goto out_detach;
+
+		usleep(1);
+	}
+
+	/* read perf buffer */
+	err = perf_buffer__poll(pb, 100);
+	if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err))
+		goto out_free_pb;
+
+	if (CHECK(CPU_COUNT(&cpu_seen) != nr_cpus, "seen_cpu_cnt",
+		  "expect %d, seen %d\n", nr_cpus, CPU_COUNT(&cpu_seen)))
+		goto out_free_pb;
+
+out_free_pb:
+	perf_buffer__free(pb);
+out_detach:
+	bpf_link__destroy(link);
+out_close:
+	bpf_object__close(obj);
+}
+
 int main(int ac, char **av)
 {
 	srand(time(NULL));
@@ -3019,6 +3117,7 @@ int main(int ac, char **av)
 	test_raw_tp_writable_reject_nbd_invalid();
 	test_raw_tp_writable_test_run();
 	test_send_signal();
+	test_perf_buffer();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
index a3bebd7c68ddc6262c5eaf6021bd8c8304b6a006..abc26248a7f182dc7d3a0e6372ef0196bae6b308 100644
--- a/tools/testing/selftests/bpf/test_sysctl.c
+++ b/tools/testing/selftests/bpf/test_sysctl.c
@@ -31,6 +31,7 @@ struct sysctl_test {
 	enum bpf_attach_type attach_type;
 	const char *sysctl;
 	int open_flags;
+	int seek;
 	const char *newval;
 	const char *oldval;
 	enum {
@@ -139,7 +140,7 @@ static struct sysctl_test tests[] = {
 			/* If (file_pos == X) */
 			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
				    offsetof(struct bpf_sysctl, file_pos)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 2),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 3, 2),
 
			/* return ALLOW; */
			BPF_MOV64_IMM(BPF_REG_0, 1),
@@ -152,6 +153,7 @@ static struct sysctl_test tests[] = {
		.attach_type = BPF_CGROUP_SYSCTL,
		.sysctl = "kernel/ostype",
		.open_flags = O_RDONLY,
+		.seek = 3,
		.result = SUCCESS,
	},
	{
@@ -1442,6 +1444,11 @@ static int access_sysctl(const char *sysctl_path,
	if (fd < 0)
		return fd;
 
+	if (test->seek && lseek(fd, test->seek, SEEK_SET) == -1) {
+		log_err("lseek(%d) failed", test->seek);
+		goto err;
+	}
+
	if (test->open_flags == O_RDONLY) {
		char buf[128];
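The new .seek field models a reader seeking before consuming a sysctl value; the updated test then expects the BPF program to observe that offset in ctx->file_pos (now computed with the endianness-aware narrow-access offset). A hedged user-space equivalent of what access_sysctl() now does; read_sysctl_at() is a hypothetical helper:

#include <fcntl.h>
#include <unistd.h>

static int read_sysctl_at(const char *path, off_t pos, char *buf, size_t sz)
{
	int fd = open(path, O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return -1;

	/* an attached BPF_CGROUP_SYSCTL program sees file_pos == pos
	 * when this read happens */
	if (lseek(fd, pos, SEEK_SET) == -1) {
		close(fd);
		return -1;
	}

	n = read(fd, buf, sz);
	close(fd);
	return n < 0 ? -1 : (int)n;
}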
diff --git a/tools/testing/selftests/bpf/test_tcp_rtt.c b/tools/testing/selftests/bpf/test_tcp_rtt.c
new file mode 100644
index 0000000000000000000000000000000000000000..90c3862f74a856cc10685130a3313dedcdb1dd34
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcp_rtt.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+
+#define CG_PATH	"/tcp_rtt"
+
+struct tcp_rtt_storage {
+	__u32 invoked;
+	__u32 dsack_dups;
+	__u32 delivered;
+	__u32 delivered_ce;
+	__u32 icsk_retransmits;
+};
+
+static void send_byte(int fd)
+{
+	char b = 0x55;
+
+	if (write(fd, &b, sizeof(b)) != 1)
+		error(1, errno, "Failed to send single byte");
+}
+
+static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked,
+		     __u32 dsack_dups, __u32 delivered, __u32 delivered_ce,
+		     __u32 icsk_retransmits)
+{
+	int err = 0;
+	struct tcp_rtt_storage val;
+
+	if (bpf_map_lookup_elem(map_fd, &client_fd, &val) < 0)
+		error(1, errno, "Failed to read socket storage");
+
+	if (val.invoked != invoked) {
+		log_err("%s: unexpected bpf_tcp_sock.invoked %d != %d",
+			msg, val.invoked, invoked);
+		err++;
+	}
+
+	if (val.dsack_dups != dsack_dups) {
+		log_err("%s: unexpected bpf_tcp_sock.dsack_dups %d != %d",
+			msg, val.dsack_dups, dsack_dups);
+		err++;
+	}
+
+	if (val.delivered != delivered) {
+		log_err("%s: unexpected bpf_tcp_sock.delivered %d != %d",
+			msg, val.delivered, delivered);
+		err++;
+	}
+
+	if (val.delivered_ce != delivered_ce) {
+		log_err("%s: unexpected bpf_tcp_sock.delivered_ce %d != %d",
+			msg, val.delivered_ce, delivered_ce);
+		err++;
+	}
+
+	if (val.icsk_retransmits != icsk_retransmits) {
+		log_err("%s: unexpected bpf_tcp_sock.icsk_retransmits %d != %d",
+			msg, val.icsk_retransmits, icsk_retransmits);
+		err++;
+	}
+
+	return err;
+}
+
+static int connect_to_server(int server_fd)
+{
+	struct sockaddr_storage addr;
+	socklen_t len = sizeof(addr);
+	int fd;
+
+	fd = socket(AF_INET, SOCK_STREAM, 0);
+	if (fd < 0) {
+		log_err("Failed to create client socket");
+		return -1;
+	}
+
+	if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
+		log_err("Failed to get server addr");
+		goto out;
+	}
+
+	if (connect(fd, (const struct sockaddr *)&addr, len) < 0) {
+		log_err("Failed to connect to server");
+		goto out;
+	}
+
+	return fd;
+
+out:
+	close(fd);
+	return -1;
+}
+
+static int run_test(int cgroup_fd, int server_fd)
+{
+	struct bpf_prog_load_attr attr = {
+		.prog_type = BPF_PROG_TYPE_SOCK_OPS,
+		.file = "./tcp_rtt.o",
+		.expected_attach_type = BPF_CGROUP_SOCK_OPS,
+	};
+	struct bpf_object *obj;
+	struct bpf_map *map;
+	int client_fd;
+	int prog_fd;
+	int map_fd;
+	int err;
+
+	err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
+	if (err) {
+		log_err("Failed to load BPF object");
+		return -1;
+	}
+
+	map = bpf_map__next(NULL, obj);
+	map_fd = bpf_map__fd(map);
+
+	err = bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_SOCK_OPS, 0);
+	if (err) {
+		log_err("Failed to attach BPF program");
+		goto close_bpf_object;
+	}
+
+	client_fd = connect_to_server(server_fd);
+	if (client_fd < 0) {
+		err = -1;
+		goto close_bpf_object;
+	}
+
+	err += verify_sk(map_fd, client_fd, "syn-ack",
+			 /*invoked=*/1,
+			 /*dsack_dups=*/0,
+			 /*delivered=*/1,
+			 /*delivered_ce=*/0,
+			 /*icsk_retransmits=*/0);
+
+	send_byte(client_fd);
+
+	err += verify_sk(map_fd, client_fd, "first payload byte",
+			 /*invoked=*/2,
+			 /*dsack_dups=*/0,
+			 /*delivered=*/2,
+			 /*delivered_ce=*/0,
+			 /*icsk_retransmits=*/0);
+
+	close(client_fd);
+
+close_bpf_object:
+	bpf_object__close(obj);
+	return err;
+}
+
+static int start_server(void)
+{
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
+	};
+	int fd;
+
+	fd = socket(AF_INET, SOCK_STREAM, 0);
+	if (fd < 0) {
+		log_err("Failed to create server socket");
+		return -1;
+	}
+
+	if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) {
+		log_err("Failed to bind socket");
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+static void *server_thread(void *arg)
+{
+	struct sockaddr_storage addr;
+	socklen_t len = sizeof(addr);
+	int fd = *(int *)arg;
+	int client_fd;
+
+	if (listen(fd, 1) < 0)
+		error(1, errno, "Failed to listen on socket");
+
+	client_fd = accept(fd, (struct sockaddr *)&addr, &len);
+	if (client_fd < 0)
+		error(1, errno, "Failed to accept client");
+
+	/* Wait for the next connection (that never arrives) to keep
+	 * this thread alive and prevent calling close() on client_fd.
+	 */
+	if (accept(fd, (struct sockaddr *)&addr, &len) >= 0)
+		error(1, errno, "Unexpected success in second accept");
+
+	close(client_fd);
+
+	return NULL;
+}
+
+int main(int args, char **argv)
+{
+	int server_fd, cgroup_fd;
+	int err = EXIT_SUCCESS;
+	pthread_t tid;
+
+	if (setup_cgroup_environment())
+		goto cleanup_obj;
+
+	cgroup_fd = create_and_get_cgroup(CG_PATH);
+	if (cgroup_fd < 0)
+		goto cleanup_cgroup_env;
+
+	if (join_cgroup(CG_PATH))
+		goto cleanup_cgroup;
+
+	server_fd = start_server();
+	if (server_fd < 0) {
+		err = EXIT_FAILURE;
+		goto cleanup_cgroup;
+	}
+
+	pthread_create(&tid, NULL, server_thread, (void *)&server_fd);
+
+	if (run_test(cgroup_fd, server_fd))
+		err = EXIT_FAILURE;
+
+	close(server_fd);
+
+	printf("test_tcp_rtt: %s\n",
+	       err == EXIT_SUCCESS ? "PASSED" : "FAILED");
+
+cleanup_cgroup:
+	close(cgroup_fd);
+cleanup_cgroup_env:
+	cleanup_cgroup_environment();
+cleanup_obj:
+	return err;
+}
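Note that run_test() never detaches the sockops program; the test relies on cgroup teardown in main() to drop it. A long-lived process that keeps the cgroup around would detach explicitly. A hedged sketch of that cleanup; detach_sockops() is a hypothetical helper:

#include <stdio.h>
#include <bpf/bpf.h>

static void detach_sockops(int prog_fd, int cgroup_fd)
{
	/* undoes the bpf_prog_attach(..., BPF_CGROUP_SOCK_OPS, 0) above */
	if (bpf_prog_detach2(prog_fd, cgroup_fd, BPF_CGROUP_SOCK_OPS))
		perror("bpf_prog_detach2");
}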
"PASSED" : "FAILED"); + +cleanup_cgroup: + close(cgroup_fd); +cleanup_cgroup_env: + cleanup_cgroup_environment(); +cleanup_obj: + return err; +} diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 2072b96cd3855ba0f009ac02a4ee35383bbeed72..0c255ee925b4cc83e46ed963533e9d5ebe41a5d9 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -108,6 +108,7 @@ struct bpf_test { __u64 data64[TEST_DATA_LEN / 8]; }; } retvals[MAX_TEST_RUNS]; + enum bpf_attach_type expected_attach_type; }; /* Note we want this to be 64 bit aligned so that the end of our array is @@ -17381,6 +17382,79 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL, }, + #define BPF_SOCK_ADDR_STORE(field, off, res, err) \ + { \ + "wide store to bpf_sock_addr." #field "[" #off "]", \ + .insns = { \ + BPF_MOV64_IMM(BPF_REG_0, 1), \ + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, \ + offsetof(struct bpf_sock_addr, field[off])), \ + BPF_EXIT_INSN(), \ + }, \ + .result = res, \ + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ + .errstr = err, \ + } + + /* user_ip6[0] is u64 aligned */ + BPF_SOCK_ADDR_STORE(user_ip6, 0, ACCEPT, + NULL), + BPF_SOCK_ADDR_STORE(user_ip6, 1, REJECT, + "invalid bpf_context access off=12 size=8"), + BPF_SOCK_ADDR_STORE(user_ip6, 2, ACCEPT, + NULL), + BPF_SOCK_ADDR_STORE(user_ip6, 3, REJECT, + "invalid bpf_context access off=20 size=8"), + + /* msg_src_ip6[0] is _not_ u64 aligned */ + BPF_SOCK_ADDR_STORE(msg_src_ip6, 0, REJECT, + "invalid bpf_context access off=44 size=8"), + BPF_SOCK_ADDR_STORE(msg_src_ip6, 1, ACCEPT, + NULL), + BPF_SOCK_ADDR_STORE(msg_src_ip6, 2, REJECT, + "invalid bpf_context access off=52 size=8"), + BPF_SOCK_ADDR_STORE(msg_src_ip6, 3, REJECT, + "invalid bpf_context access off=56 size=8"), + + #undef BPF_SOCK_ADDR_STORE + + #define BPF_SOCK_ADDR_LOAD(field, off, res, err) \ + { \ + "wide load from bpf_sock_addr." 
#field "[" #off "]", \ + .insns = { \ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, \ + offsetof(struct bpf_sock_addr, field[off])), \ + BPF_MOV64_IMM(BPF_REG_0, 1), \ + BPF_EXIT_INSN(), \ + }, \ + .result = res, \ + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ + .errstr = err, \ + } + + /* user_ip6[0] is u64 aligned */ + BPF_SOCK_ADDR_LOAD(user_ip6, 0, ACCEPT, + NULL), + BPF_SOCK_ADDR_LOAD(user_ip6, 1, REJECT, + "invalid bpf_context access off=12 size=8"), + BPF_SOCK_ADDR_LOAD(user_ip6, 2, ACCEPT, + NULL), + BPF_SOCK_ADDR_LOAD(user_ip6, 3, REJECT, + "invalid bpf_context access off=20 size=8"), + + /* msg_src_ip6[0] is _not_ u64 aligned */ + BPF_SOCK_ADDR_LOAD(msg_src_ip6, 0, REJECT, + "invalid bpf_context access off=44 size=8"), + BPF_SOCK_ADDR_LOAD(msg_src_ip6, 1, ACCEPT, + NULL), + BPF_SOCK_ADDR_LOAD(msg_src_ip6, 2, REJECT, + "invalid bpf_context access off=52 size=8"), + BPF_SOCK_ADDR_LOAD(msg_src_ip6, 3, REJECT, + "invalid bpf_context access off=56 size=8"), + + #undef BPF_SOCK_ADDR_LOAD }; static int probe_filter_length(const struct bpf_insn *fp) @@ -17885,6 +17959,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, int fd_prog, expected_ret, alignment_prevented_execution; int prog_len, prog_type = test->prog_type; struct bpf_insn *prog = test->insns; + struct bpf_load_program_attr attr; int run_errs, run_successes; int map_fds[MAX_NR_MAPS]; const char *expected_err; @@ -17917,8 +17992,16 @@ static void do_test_single(struct bpf_test *test, bool unpriv, pflags |= BPF_F_STRICT_ALIGNMENT; if (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS) pflags |= BPF_F_ANY_ALIGNMENT; - fd_prog = bpf_verify_program(prog_type, prog, prog_len, pflags, - "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 4); + memset(&attr, 0, sizeof(attr)); + attr.prog_type = prog_type; + attr.expected_attach_type = test->expected_attach_type; + attr.insns = prog; + attr.insns_cnt = prog_len; + attr.license = "GPL"; + attr.log_level = 4; + attr.prog_flags = pflags; + + fd_prog = bpf_load_program_xattr(&attr, bpf_vlog, sizeof(bpf_vlog)); saved_errno = errno; if (fd_prog < 0 && !bpf_probe_prog_type(prog_type, 0)) { printf("SKIP (unsupported program type %d)\n", prog_type); @@ -17949,7 +18032,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, printf("FAIL\nUnexpected success to load!\n"); goto fail_log; } - if (!strstr(bpf_vlog, expected_err)) { + if (!expected_err || !strstr(bpf_vlog, expected_err)) { printf("FAIL\nUnexpected error message!\n\tEXP: %s\n\tRES: %s\n", expected_err, bpf_vlog); goto fail_log;