From fa75636857993e25030c3db31444d2aa01d5d9de Mon Sep 17 00:00:00 2001 From: Shengwei Luo Date: Wed, 23 Feb 2022 16:53:40 +0800 Subject: [PATCH] RAS: Report ARM processor information to userspace kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OUGN?from=project-issue CVE: NA ---------------- The original arm_event trace code only traces out the ARM processor error information data. This is not enough for the user to take appropriate action. According to the UEFI 2.9 specification, section N.2.4.4, the ARM processor error section includes several ARM processor error information structures, several ARM processor context information structures and several vendor specific error information structures. In addition to this information, there are the error severity and the logical CPU index of the event. Report all of this information to userspace via the perf interface, so that the user can do CPU core isolation according to the error severity and other info. Original-Author: Jason Tian Signed-off-by: Shengwei Luo Reviewed-by: Lv Ying Reviewed-by: Tan Xiaofei Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai Signed-off-by: zhoukaiqi modified: arch/arm64/configs/openeuler_defconfig modified: drivers/acpi/apei/ghes.c modified: drivers/ras/Kconfig modified: drivers/ras/ras.c modified: include/linux/ras.h modified: include/ras/ras_event.h --- arch/arm64/configs/openeuler_defconfig | 1 + drivers/acpi/apei/ghes.c | 7 +++- drivers/ras/Kconfig | 11 +++++ drivers/ras/ras.c | 50 +++++++++++++++++++++++ include/linux/ras.h | 23 +++++++++++ include/ras/ras_event.h | 56 +++++++++++++++++++++++++- 6 files changed, 145 insertions(+), 3 deletions(-) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index e1cf25de0102..497b36a8598a 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6769,6 +6769,7 @@ CONFIG_HNS3_PMU=m # end of Performance monitor support CONFIG_RAS=y +CONFIG_RAS_ARM_EVENT_INFO=y CONFIG_PAGE_EJECT=m 
CONFIG_USB4=m # CONFIG_USB4_DEBUGFS_WRITE is not set diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index bf1b9252a8da..3fde9388b0f4 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -518,9 +518,12 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int s int sec_sev, i; char *p; - log_arm_hw_error(err); - sec_sev = ghes_severity(gdata->error_severity); +#ifdef CONFIG_RAS_ARM_EVENT_INFO + log_arm_hw_error(err, sec_sev); +#else + log_arm_hw_error(err); +#endif if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE) return false; diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig index f8554a940316..84b6d569cfda 100644 --- a/drivers/ras/Kconfig +++ b/drivers/ras/Kconfig @@ -29,6 +29,17 @@ menuconfig RAS so have ideal availability, but may be unreliable, with frequent data corruption. +config RAS_ARM_EVENT_INFO + bool "RAS feature: report all the arm processor info in arm event" + default y + depends on ARM64 + help + This option allows to report several ARM processor error information, + context information, vendor specific error information, error severity + and cpu logical index about the event to userspace via perf i/f. So + that the user can do cpu core isolation according to error severity + and other info. 
+ if RAS source "arch/x86/ras/Kconfig" diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index 95540ea8dd9d..9a12398ee0a8 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -21,9 +21,59 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, trace_non_standard_event(sec_type, fru_id, fru_text, sev, err, len); } +#ifdef CONFIG_RAS_ARM_EVENT_INFO +void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) +#else void log_arm_hw_error(struct cper_sec_proc_arm *err) +#endif { +#ifdef CONFIG_RAS_ARM_EVENT_INFO + u32 pei_len; + u32 ctx_len = 0; + s32 vsei_len; + u8 *pei_err; + u8 *ctx_err; + u8 *ven_err_data; + struct cper_arm_err_info *err_info; + struct cper_arm_ctx_info *ctx_info; + int n, sz; + int cpu; + + pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num; + pei_err = (u8 *)err + sizeof(struct cper_sec_proc_arm); + + err_info = (struct cper_arm_err_info *)(err + 1); + ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num); + ctx_err = (u8 *)ctx_info; + for (n = 0; n < err->context_info_num; n++) { + sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size; + ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz); + ctx_len += sz; + } + + vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) + + pei_len + ctx_len); + if (vsei_len < 0) { + pr_warn(FW_BUG + "section length: %d\n", err->section_length); + pr_warn(FW_BUG + "section length is too small\n"); + pr_warn(FW_BUG + "firmware-generated error record is incorrect\n"); + vsei_len = 0; + } + ven_err_data = (u8 *)ctx_info; + + cpu = GET_LOGICAL_INDEX(err->mpidr); + /* when return value is invalid, set cpu index to -1 */ + if (cpu < 0) + cpu = -1; + + trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len, + ven_err_data, (u32)vsei_len, sev, cpu); +#else trace_arm_event(err); +#endif } static int __init ras_init(void) diff --git a/include/linux/ras.h b/include/linux/ras.h index 1f4048bf2674..8cdd011a9735 100644 --- a/include/linux/ras.h 
+++ b/include/linux/ras.h @@ -24,15 +24,38 @@ int __init parse_cec_param(char *str); void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, const char *fru_text, const u8 sev, const u8 *err, const u32 len); + +#ifdef CONFIG_RAS_ARM_EVENT_INFO +void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev); +#else void log_arm_hw_error(struct cper_sec_proc_arm *err); +#endif + #else static inline void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, const char *fru_text, const u8 sev, const u8 *err, const u32 len) { return; } + +#ifdef CONFIG_RAS_ARM_EVENT_INFO static inline void +log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { return; } +#else log_arm_hw_error(struct cper_sec_proc_arm *err) { return; } #endif +#endif + +#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) +#include +/* + * Include ARM specific SMP header which provides a function mapping mpidr to + * cpu logical index. + */ +#define GET_LOGICAL_INDEX(mpidr) get_logical_index(mpidr & MPIDR_HWID_BITMASK) +#else +#define GET_LOGICAL_INDEX(mpidr) -EINVAL +#endif /* CONFIG_ARM || CONFIG_ARM64 */ + #endif /* __RAS_H__ */ diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index e7d9470a27cd..9dbca0e03b3d 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -168,11 +168,29 @@ TRACE_EVENT(mc_event, * This event is generated when hardware detects an ARM processor error * has occurred. UEFI 2.6 spec section N.2.4.4. 
*/ +#define APEIL "ARM Processor Err Info data len" +#define APEID "ARM Processor Err Info raw data" +#define APECIL "ARM Processor Err Context Info data len" +#define APECID "ARM Processor Err Context Info raw data" +#define VSEIL "Vendor Specific Err Info data len" +#define VSEID "Vendor Specific Err Info raw data" TRACE_EVENT(arm_event, - +#ifdef CONFIG_RAS_ARM_EVENT_INFO + TP_PROTO(const struct cper_sec_proc_arm *proc, const u8 *pei_err, + const u32 pei_len, + const u8 *ctx_err, + const u32 ctx_len, + const u8 *oem, + const u32 oem_len, + u8 sev, + int cpu), + + TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev, cpu), +#else TP_PROTO(const struct cper_sec_proc_arm *proc), TP_ARGS(proc), +#endif TP_STRUCT__entry( __field(u64, mpidr) @@ -180,6 +198,16 @@ TRACE_EVENT(arm_event, __field(u32, running_state) __field(u32, psci_state) __field(u8, affinity) +#ifdef CONFIG_RAS_ARM_EVENT_INFO + __field(u32, pei_len) + __dynamic_array(u8, buf, pei_len) + __field(u32, ctx_len) + __dynamic_array(u8, buf1, ctx_len) + __field(u32, oem_len) + __dynamic_array(u8, buf2, oem_len) + __field(u8, sev) + __field(int, cpu) +#endif ), TP_fast_assign( @@ -199,12 +227,38 @@ TRACE_EVENT(arm_event, __entry->running_state = ~0; __entry->psci_state = ~0; } +#ifdef CONFIG_RAS_ARM_EVENT_INFO + __entry->pei_len = pei_len; + memcpy(__get_dynamic_array(buf), pei_err, pei_len); + __entry->ctx_len = ctx_len; + memcpy(__get_dynamic_array(buf1), ctx_err, ctx_len); + __entry->oem_len = oem_len; + memcpy(__get_dynamic_array(buf2), oem, oem_len); + __entry->sev = sev; + __entry->cpu = cpu; +#endif ), +#ifdef CONFIG_RAS_ARM_EVENT_INFO + TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR: %016llx; " + "running state: %d; PSCI state: %d; " + "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s", + __entry->cpu, + __entry->sev, + __entry->affinity, __entry->mpidr, __entry->midr, + __entry->running_state, __entry->psci_state, + APEIL, __entry->pei_len, APEID, + 
__print_hex(__get_dynamic_array(buf), __entry->pei_len), + APECIL, __entry->ctx_len, APECID, + __print_hex(__get_dynamic_array(buf1), __entry->ctx_len), + VSEIL, __entry->oem_len, VSEID, + __print_hex(__get_dynamic_array(buf2), __entry->oem_len)) +#else TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; " "running state: %d; PSCI state: %d", __entry->affinity, __entry->mpidr, __entry->midr, __entry->running_state, __entry->psci_state) +#endif ); /* -- Gitee