diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 37946e09e6b1eebabe766338c090c32c40b0b010..ca63617c7457a82eaea87fbb4da85b36e53b0a9d 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -91,6 +91,7 @@ CONFIG_BPF_LSM=y CONFIG_BPF_SCHED=y # end of BPF subsystem +CONFIG_BPF_RVI=y CONFIG_PREEMPT_NONE_BUILD=y CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 105a76c5af6805aa5c819b3e5faf49c0c887cd9a..bf3606a61ea535107802b6065714b8e10dc1622d 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -111,6 +111,7 @@ CONFIG_BPF_LSM=y CONFIG_BPF_SCHED=y # end of BPF subsystem +CONFIG_BPF_RVI=y CONFIG_PREEMPT_NONE_BUILD=y CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index e10099f4a0afcc13b2dc694825464b63a80ec871..b96c8413bbc320f61ecbaea4efb34e9bbfcfcc6b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o obj-$(CONFIG_ACRN_GUEST) += acrn.o +obj-$(CONFIG_BPF_RVI) += bpf-rvi.o proc.o quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^ diff --git a/arch/x86/kernel/cpu/bpf-rvi.c b/arch/x86/kernel/cpu/bpf-rvi.c new file mode 100644 index 0000000000000000000000000000000000000000..dc71657623ed32af16b3f7f4fb79dca5c5499f5f --- /dev/null +++ b/arch/x86/kernel/cpu/bpf-rvi.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Huawei Technologies Co., Ltd */ +#include +#include +#include +#include + +struct cpuinfo_x86_bpf { + unsigned int cpu_khz; + unsigned int siblings; +}; + +struct bpf_iter__cpuinfo_x86 { + __bpf_md_ptr(struct 
bpf_iter_meta *, meta); + __bpf_md_ptr(struct cpuinfo_x86 *, cpuinfo); + __bpf_md_ptr(struct cpuinfo_x86_bpf *, cpuinfo_bpf); +}; + +struct cpuinfo_x86_seq_priv { + cpumask_t allowed_mask; +}; + +static void *bpf_c_start(struct seq_file *m, loff_t *pos) +{ + struct cpuinfo_x86_seq_priv *priv = m->private; + struct task_struct *reaper = get_current_level1_reaper(); + + task_effective_cpumask(reaper ?: current, &priv->allowed_mask); + if (reaper) + put_task_struct(reaper); + + /* + * DO NOT use cpumask_first() here: sys_read may start from somewhere in + * the middle of the file, and *pos may contain a value from the last + * read. + */ + *pos = cpumask_next(*pos - 1, &priv->allowed_mask); + if ((*pos) < nr_cpu_ids) + return &cpu_data(*pos); + return NULL; +} + +static void *bpf_c_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct cpuinfo_x86_seq_priv *priv = m->private; + + *pos = cpumask_next(*pos, &priv->allowed_mask); + if ((*pos) < nr_cpu_ids) + return &cpu_data(*pos); + return NULL; +} + +int show_cpuinfo(struct seq_file *m, void *v); + +static int bpf_show_cpuinfo(struct seq_file *m, void *v) +{ + struct bpf_iter__cpuinfo_x86 ctx; + struct bpf_iter_meta meta; + struct cpuinfo_x86_bpf cpuinfo_bpf; + struct bpf_prog *prog; + + meta.seq = m; + prog = bpf_iter_get_info(&meta, false); + if (!prog) + return show_cpuinfo(m, v); + + ctx.meta = &meta; + ctx.cpuinfo = (struct cpuinfo_x86 *)v; + + cpuinfo_bpf.cpu_khz = cpu_khz; + cpuinfo_bpf.siblings = cpumask_weight(topology_core_cpumask(ctx.cpuinfo->cpu_index)); + ctx.cpuinfo_bpf = &cpuinfo_bpf; + + return bpf_iter_run_prog(prog, &ctx); +} + +static void bpf_c_stop(struct seq_file *m, void *v) +{ +} + +const struct seq_operations bpf_cpuinfo_op = { + .start = bpf_c_start, + .next = bpf_c_next, + .stop = bpf_c_stop, + .show = bpf_show_cpuinfo, +}; + +DEFINE_BPF_ITER_FUNC(cpuinfo_x86, struct bpf_iter_meta *meta, struct cpuinfo_x86 *cpuinfo, + struct cpuinfo_x86_bpf *cpuinfo_bpf) + +BTF_ID_LIST(btf_cpuinfo_x86_id) 
+BTF_ID(struct, cpuinfo_x86) +BTF_ID(struct, cpuinfo_x86_bpf) + +static const struct bpf_iter_seq_info cpuinfo_x86_seq_info = { + .seq_ops = &bpf_cpuinfo_op, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct cpuinfo_x86_seq_priv), +}; + +static struct bpf_iter_reg cpuinfo_x86_reg_info = { + .target = "cpuinfo_x86", + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__cpuinfo_x86, cpuinfo), + PTR_TO_BTF_ID, }, + { offsetof(struct bpf_iter__cpuinfo_x86, cpuinfo_bpf), + PTR_TO_BTF_ID, }, + }, + .seq_info = &cpuinfo_x86_seq_info, +}; + +static int __init cpuinfo_iter_init(void) +{ + cpuinfo_x86_reg_info.ctx_arg_info[0].btf_id = btf_cpuinfo_x86_id[0]; + cpuinfo_x86_reg_info.ctx_arg_info[1].btf_id = btf_cpuinfo_x86_id[1]; + return bpf_iter_reg_target(&cpuinfo_x86_reg_info); +} +late_initcall(cpuinfo_iter_init); + +enum arch_flags_type { + X86_CAP, + X86_BUG, + X86_POWER, + X86_POWER_SIZE, +}; + +/* + * Kfunc callable from BPF programs: 'i' is attacker/program-controlled and the + * verifier does not range-check scalar kfunc args, so every table access must + * be bounds-checked here to avoid an out-of-bounds kernel read. + */ +__bpf_kfunc const char *bpf_arch_flags(enum arch_flags_type t, int i) +{ + switch (t) { + case X86_CAP: + return (i >= 0 && i < 32 * NCAPINTS) ? x86_cap_flags[i] : NULL; + case X86_BUG: + return (i >= 0 && i < 32 * NBUGINTS) ? x86_bug_flags[i] : NULL; + case X86_POWER: + return (i >= 0 && i < (int)ARRAY_SIZE(x86_power_flags)) ? x86_power_flags[i] : NULL; + case X86_POWER_SIZE: + return (void *)ARRAY_SIZE(x86_power_flags); + default: + return NULL; + } +} + +BTF_SET8_START(bpf_arch_flags_kfunc_ids) +BTF_ID_FLAGS(func, bpf_arch_flags) +BTF_SET8_END(bpf_arch_flags_kfunc_ids) + +static const struct btf_kfunc_id_set bpf_arch_flags_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_arch_flags_kfunc_ids, +}; + +static int __init bpf_arch_flags_kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &bpf_arch_flags_kfunc_set); +} +late_initcall(bpf_arch_flags_kfunc_init); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index a0f81db51eac371ab8bac8cd72f45e5b1829194d..1aaf5be60a82a5ab4ea929b73408bf5c92be3759 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -60,7 +60,7 @@ static void 
show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) } #endif -static int show_cpuinfo(struct seq_file *m, void *v) +int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; unsigned int cpu; diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 64d602e7219f2490f1fbfe248fb8b20023457fd3..407178c86b1e80c3ffa59b36d27e504ef438bf9d 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -75,6 +75,9 @@ extern void dec_dl_tasks_cs(struct task_struct *task); extern void cpuset_lock(void); extern void cpuset_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); +#ifdef CONFIG_BPF_RVI +extern void task_effective_cpumask(struct task_struct *p, struct cpumask *mask); +#endif extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) @@ -199,6 +202,14 @@ static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_copy(mask, task_cpu_possible_mask(p)); } +#ifdef CONFIG_BPF_RVI +static inline void task_effective_cpumask(struct task_struct *p, + struct cpumask *mask) +{ + cpuset_cpus_allowed(p, mask); +} +#endif + static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) { return false; diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7ab8c213f13b101b498b634a5549314ef71dfed3..28161eefca5da292d9389571182cd19d9512dbc2 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -130,4 +130,8 @@ static inline bool task_is_in_init_pid_ns(struct task_struct *tsk) return task_active_pid_ns(tsk) == &init_pid_ns; } +#ifdef CONFIG_BPF_RVI +extern struct task_struct *get_current_level1_reaper(void); +#endif + #endif /* _LINUX_PID_NS_H */ diff --git a/init/Kconfig b/init/Kconfig index 22d9ac8ca08fadf435599b48515d3485e5ac76d5..80624b3e525784aeb6e15fc48b43fc024f770412 100644 --- a/init/Kconfig +++ 
b/init/Kconfig @@ -486,6 +486,7 @@ config AUDITSYSCALL source "kernel/irq/Kconfig" source "kernel/time/Kconfig" source "kernel/bpf/Kconfig" +source "kernel/bpf-rvi/Kconfig" source "kernel/Kconfig.preempt" menu "CPU/Task time and stats accounting" diff --git a/kernel/Makefile b/kernel/Makefile index 1fe46db40806212be0f6b554d15992c73a87ed93..da4c2d1838dc9a4dbac5757743f330fa192fe650 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -140,6 +140,8 @@ KCOV_INSTRUMENT_stackleak.o := n obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o +obj-$(CONFIG_BPF_RVI) += bpf-rvi/ + $(obj)/configs.o: $(obj)/config_data.gz targets += config_data config_data.gz diff --git a/kernel/bpf-rvi/Kconfig b/kernel/bpf-rvi/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..0e356ae6fc85e2745777851244be2d901d6acb96 --- /dev/null +++ b/kernel/bpf-rvi/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (c) 2025 Huawei Technologies Co., Ltd + +config BPF_RVI + bool "Resource View Isolation with BPF" + depends on BPF_EVENTS + depends on BPF_SYSCALL + depends on BPF_JIT + depends on CPUSETS + default n + help + A resource view is a bundle of interfaces under /proc and /sys + providing system resource information, e.g. /proc/{cpu,mem}info. This + feature implements a container-sensitive resource view, whose output + can be customized based on the container's configuration and its + limitation on many kinds of system resources. This feature aims to + be a substitute for LXCFS with better performance and scalability. + + If you are unsure how to answer this question, answer N. 
diff --git a/kernel/bpf-rvi/Makefile b/kernel/bpf-rvi/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8c226d5f1b3e4831509f57b7c5eb8ed0b52278a0 --- /dev/null +++ b/kernel/bpf-rvi/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2025 Huawei Technologies Co., Ltd + +obj-y := generic_single_iter.o diff --git a/kernel/bpf-rvi/generic_single_iter.c b/kernel/bpf-rvi/generic_single_iter.c new file mode 100644 index 0000000000000000000000000000000000000000..c8b46242736656ca45e8a7d00e6c916f0b8c711a --- /dev/null +++ b/kernel/bpf-rvi/generic_single_iter.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Huawei Technologies Co., Ltd */ + +#include +#include +#include +#include + +static void *generic_single_seq_start(struct seq_file *seq, loff_t *pos) +{ + return *pos < 1 ? (void *)1 : NULL; +} + +static void *generic_single_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return NULL; +} + +static void generic_single_seq_stop(struct seq_file *seq, void *v) +{ +} + +struct bpf_iter__generic_single { + __bpf_md_ptr(struct bpf_iter_meta *, meta); +}; + +static int generic_single_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_iter__generic_single ctx; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (!prog) + return 0; + + ctx.meta = &meta; + return bpf_iter_run_prog(prog, &ctx); +} + +static const struct seq_operations generic_single_seq_ops = { + .start = generic_single_seq_start, + .next = generic_single_seq_next, + .stop = generic_single_seq_stop, + .show = generic_single_seq_show, +}; + +/* + * Users of "generic_single" iter type: + * - cpu_online + */ +DEFINE_BPF_ITER_FUNC(generic_single, struct bpf_iter_meta *meta) + +static const struct bpf_iter_seq_info generic_single_seq_info = { + .seq_ops = &generic_single_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + 
.seq_priv_size = 0, +}; + +static struct bpf_iter_reg generic_single_reg_info = { + .target = "generic_single", + .seq_info = &generic_single_seq_info, +}; + +static int __init generic_single_iter_init(void) +{ + return bpf_iter_reg_target(&generic_single_reg_info); +} +late_initcall(generic_single_iter_init); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f5226ddf87b69c57ec7af473a780473d02fe621a..140097c8198e7c372c2350b9a9a03361016ac4d7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2362,6 +2362,32 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) return p; } +#ifdef CONFIG_BPF_RVI +/** + * bpf_current_level1_reaper - Find the reaper task in the level-1 parent of + * current pid namespace. If a task is returned, it must either be stored in + * a map, or released with bpf_task_release(). + */ +__bpf_kfunc struct task_struct *bpf_current_level1_reaper(void) +{ + struct task_struct *p; + struct pid_namespace *ns; + + ns = task_active_pid_ns(current); + while (ns->level > 1) { // not !=, as ns could be init_pid_ns + ns = ns->parent; + } + + read_lock(&tasklist_lock); + p = ns->child_reaper; + if (p) + p = bpf_task_acquire(p); + read_unlock(&tasklist_lock); + + return p; +} +#endif + /** * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. 
* @ptr: The dynptr whose data slice to retrieve @@ -2604,6 +2630,9 @@ BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) +#ifdef CONFIG_BPF_RVI +BTF_ID_FLAGS(func, bpf_current_level1_reaper, KF_ACQUIRE | KF_RET_NULL) +#endif BTF_SET8_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 26d3a17e8806d45f9cfbffa5641bdfabaa742af1..417827f2c043d745973e2be1f64661acbc9e40e6 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -46,6 +46,12 @@ #include #include +#ifdef CONFIG_BPF_RVI +#include +#include +#include +#endif + DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -5200,3 +5206,47 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) seq_printf(m, "Mems_allowed_list:\t%*pbl\n", nodemask_pr_args(&task->mems_allowed)); } + +#ifdef CONFIG_BPF_RVI +void task_effective_cpumask(struct task_struct *tsk, struct cpumask *pmask) +{ + struct cpuset *cs; + + if (!tsk) + cpumask_clear(pmask); + + rcu_read_lock(); + cs = task_cs(tsk); + cpumask_copy(pmask, cs->effective_cpus); + rcu_read_unlock(); +} + +__bpf_kfunc struct cpuset *bpf_cpuset_from_task(struct task_struct *task) +{ + if (!task) + return NULL; + return task_cs(task); +} + +__bpf_kfunc unsigned int bpf_cpumask_weight(struct cpumask *pmask) +{ + return cpumask_weight(pmask); +} + +BTF_SET8_START(bpf_cpuset_kfunc_ids) +BTF_ID_FLAGS(func, bpf_cpuset_from_task, KF_RET_NULL | KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_weight) +BTF_SET8_END(bpf_cpuset_kfunc_ids) + +static const struct btf_kfunc_id_set bpf_cpuset_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_cpuset_kfunc_ids, +}; + +static int __init bpf_cpuset_kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + 
&bpf_cpuset_kfunc_set); +} +late_initcall(bpf_cpuset_kfunc_init); +#endif diff --git a/kernel/pid.c b/kernel/pid.c index db99713600fc6e54b7e611bb5f3137b9328003c9..8000cf32798564d50a95db490f50f07756a7bf4f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -797,3 +797,28 @@ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, fdput(f); return ret; } + +#ifdef CONFIG_BPF_RVI +/* + * We assume that containers should start at root ns, which means that + * containers themselves are level-1 ns. + */ +struct task_struct *get_current_level1_reaper(void) +{ + struct task_struct *reaper; + struct pid_namespace *ns; + + ns = task_active_pid_ns(current); + while (ns->level > 1) { // not !=, as ns could be init_pid_ns + ns = ns->parent; + } + + read_lock(&tasklist_lock); + reaper = ns->child_reaper; + if (reaper) + get_task_struct(reaper); + read_unlock(&tasklist_lock); + + return reaper; +} +#endif /* CONFIG_BPF_RVI */ diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 3fa16412db15cafa9538605a7db975ffe365ad31..7627f996b5e5cdc7c5200d0468d9bbe75130ec41 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -149,6 +149,10 @@ always-y += task_fd_query_kern.o always-y += ibumad_kern.o always-y += hbm_out_kern.o always-y += hbm_edt_kern.o +ifeq ($(ARCH), x86) +always-$(CONFIG_BPF_RVI) += bpf_rvi_cpuinfo_x86.bpf.o +endif +always-$(CONFIG_BPF_RVI) += bpf_rvi_cpu_online.bpf.o ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux diff --git a/samples/bpf/bpf_rvi_cpu_online.bpf.c b/samples/bpf/bpf_rvi_cpu_online.bpf.c new file mode 100644 index 0000000000000000000000000000000000000000..da031f7e5a7be8497e9bd9d6ef69417a939b9a6d --- /dev/null +++ b/samples/bpf/bpf_rvi_cpu_online.bpf.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Huawei Technologies Co., Ltd */ +#include +#include +#include + +void bpf_task_release(struct task_struct *p) __ksym; +struct task_struct *bpf_current_level1_reaper(void) __ksym; 
+struct cpuset *bpf_cpuset_from_task(struct task_struct *p) __ksym; +unsigned int bpf_cpumask_weight(struct cpumask *pmask) __ksym; + +char _license[] SEC("license") = "GPL"; + + +#define RET_OK 0 +#define RET_FAIL 1 +#define RET_SKIP -1 + +static int task_effective_cpus_num(struct task_struct *reaper) +{ + struct cpuset *cpuset; + + cpuset = bpf_cpuset_from_task(reaper); + if (!cpuset) + return -1; + + return bpf_cpumask_weight(cpuset->effective_cpus); +} + +SEC("iter/generic_single") +s64 dump_cpu_online(struct bpf_iter__generic_single *ctx) +{ + struct seq_file *m = ctx->meta->seq; + struct task_struct *reaper; + int ncpus; + int ret = RET_OK; + + reaper = bpf_current_level1_reaper(); + if (!reaper) + return RET_FAIL; + ncpus = task_effective_cpus_num(reaper); + if (ncpus == -1) { + ret = RET_FAIL; + goto err; + } + + if (ncpus > 1) + BPF_SEQ_PRINTF(m, "0-%u\n", ncpus - 1); + else + BPF_SEQ_PRINTF(m, "0\n"); + +err: + bpf_task_release(reaper); + return ret; +} diff --git a/samples/bpf/bpf_rvi_cpuinfo_x86.bpf.c b/samples/bpf/bpf_rvi_cpuinfo_x86.bpf.c new file mode 100644 index 0000000000000000000000000000000000000000..b4921d5c9e642bb8674592c325b1f0f4ad3e8180 --- /dev/null +++ b/samples/bpf/bpf_rvi_cpuinfo_x86.bpf.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Huawei Technologies Co., Ltd */ +#include +#include +#include + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) +#define BITS_PER_BYTE 8 +#define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) + + +/* Check generic_test_bit() in include/asm-generic/bitops/generic-non-atomic.h */ +static inline bool test_bit(unsigned long nr, const unsigned long *addr) +{ + return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); +} + +const char *bpf_arch_flags(enum arch_flags_type t, int i) __ksym; + +char _license[] SEC("license") = "GPL"; + +#define NCAPINTS 31 /* N 32-bit words worth of info */ +#define NBUGINTS 4 /* N 32-bit bug flags */ 
+#define X86_FEATURE_TSC (0 * 32 + 4) /* Time Stamp Counter */ +#define HZ 1000 + +/* + * Reference: arch/x86/include/asm/cpufeature.h + * Treats cpu_has(c, bit) == test_cpu_cap(c, bit) + */ +#define cpu_has(c, bit) test_bit(bit, (unsigned long *)((c)->x86_capability)) +#define cpu_has_bug(c, bit) cpu_has((c), (bit)) + +#define MAX_CPUS (sizeof(struct cpumask) * BITS_PER_BYTE) + +#define RET_OK 0 +#define RET_FAIL 1 +/* + * For bpf prog, -1 means to skip the current object, + * so we can use ctx->meta->seq_num as new cpu index. + */ +#define RET_SKIP -1 + +/* + * Relevant concepts: + * - CPU socket/chip, could be many on one motherboard + * - Physical CPU core + * - Logical processor, from e.g. SMT + * + * Field explanation: + * - physical id: id of CPU socket (i.e. CPU chip) + * - siblings: number of logical processors per CPU chip + * - core id: id of physical core + * - cpu cores: number of physical cores per CPU chip + * - (initial) apicid: something given by BIOS + * + * Use a designed pattern to print physical info that could leak the real number + * of CPUs on the host. 
+ * + */ +static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, + unsigned int cpu, unsigned int siblings) +{ + BPF_SEQ_PRINTF(m, "physical id\t: %d\n", cpu); + BPF_SEQ_PRINTF(m, "siblings\t: 1\n"); + BPF_SEQ_PRINTF(m, "core id\t\t: 0\n"); + BPF_SEQ_PRINTF(m, "cpu cores\t: 1\n"); + BPF_SEQ_PRINTF(m, "apicid\t\t: %d\n", cpu); + BPF_SEQ_PRINTF(m, "initial apicid\t: %d\n", cpu); +} + +/* this for CONFIG_X86 64 bit*/ +static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) +{ + BPF_SEQ_PRINTF(m, + "fpu\t\t: yes\n" + "fpu_exception\t: yes\n" + "cpuid level\t: %d\n" + "wp\t\t: yes\n", + c->cpuid_level); +} + +/* + * Reference: arch/x86/kernel/cpu/proc.c + */ +SEC("iter/cpuinfo_x86") +s64 dump_cpuinfo_x86(struct bpf_iter__cpuinfo_x86 *ctx) +{ + struct seq_file *m = ctx->meta->seq; + struct cpuinfo_x86 *c = ctx->cpuinfo; + struct cpuinfo_x86_bpf *c_bpf = ctx->cpuinfo_bpf; + unsigned long x86_power_flags_size; + unsigned int virtual_cpu; + int i; + + virtual_cpu = ctx->meta->seq_num; + BPF_SEQ_PRINTF(m, "processor\t: %u\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %u\n" + "model name\t: %s\n", + virtual_cpu, + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", + c->x86, + c->x86_model, + c->x86_model_id[0] ? 
c->x86_model_id : "unknown"); + if (c->x86_stepping || c->cpuid_level >= 0) + BPF_SEQ_PRINTF(m, "stepping\t: %d\n", c->x86_stepping); + else + BPF_SEQ_PRINTF(m, "stepping\t: unknown\n"); + if (c->microcode) + BPF_SEQ_PRINTF(m, "microcode\t: 0x%x\n", c->microcode); + + if (cpu_has(c, X86_FEATURE_TSC)) { + unsigned int freq = c_bpf->cpu_khz;//bpf_arch_freq_get_on_cpu(cpu); + + BPF_SEQ_PRINTF(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); + } + /* Cache size */ + if (c->x86_cache_size) + BPF_SEQ_PRINTF(m, "cache size\t: %u KB\n", c->x86_cache_size); + show_cpuinfo_core(m, c, virtual_cpu, c_bpf->siblings); + show_cpuinfo_misc(m, c); + + BPF_SEQ_PRINTF(m, "flags\t\t:"); + for (i = 0; i < 32*NCAPINTS; i++) + if (cpu_has(c, i) && bpf_arch_flags(X86_CAP, i)) + BPF_SEQ_PRINTF(m, " %s", bpf_arch_flags(X86_CAP, i)); + + BPF_SEQ_PRINTF(m, "\nbugs\t\t:"); + for (i = 0; i < 32*NBUGINTS; i++) { + unsigned int bug_bit = 32*NCAPINTS + i; + + if (cpu_has_bug(c, bug_bit) && bpf_arch_flags(X86_BUG, i)) + BPF_SEQ_PRINTF(m, " %s", bpf_arch_flags(X86_BUG, i)); + } + + BPF_SEQ_PRINTF(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), + (c->loops_per_jiffy/(5000/HZ)) % 100); + + //#ifdef CONFIG_X86_64 + if (c->x86_tlbsize > 0) + BPF_SEQ_PRINTF(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); + + BPF_SEQ_PRINTF(m, "clflush size\t: %u\n", c->x86_clflush_size); + BPF_SEQ_PRINTF(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); + BPF_SEQ_PRINTF(m, "address sizes\t: %u bits physical, %u bits virtual\n", + c->x86_phys_bits, c->x86_virt_bits); + + BPF_SEQ_PRINTF(m, "power management:"); + x86_power_flags_size = (unsigned long)bpf_arch_flags(X86_POWER_SIZE, 0); + for (i = 0; i < 32; i++) { + if (c->x86_power & (1 << i)) { + if (i < x86_power_flags_size && + bpf_arch_flags(X86_POWER, i)) + BPF_SEQ_PRINTF(m, "%s%s", + bpf_arch_flags(X86_POWER, i)[0] ? 
" " : "", + bpf_arch_flags(X86_POWER, i)); + else + BPF_SEQ_PRINTF(m, " [%d]", i); + } + } + + BPF_SEQ_PRINTF(m, "\n\n"); + + return RET_OK; +}