diff --git a/0001-revert-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiS.patch b/0001-revert-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiS.patch new file mode 100644 index 0000000000000000000000000000000000000000..e5b61be6b8edfe088404b53e30ef860eeead4386 --- /dev/null +++ b/0001-revert-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiS.patch @@ -0,0 +1,274 @@ +From 77c2ccb26b5da0c24a82ae956164fe527723dabd Mon Sep 17 00:00:00 2001 +From: Junhao He +Date: Mon, 2 Sep 2024 10:24:59 +0800 +Subject: [PATCH] revert "rasdaemon: Add HBM Memory ACLS support for HiSilicon" + +The HBM ACLS scheme that rasdaemon relies on has changed. Moreover, the +new solution only needs to be merged into the openEuler-22.03-LTS-SP4 +branch, so this branch drops support for HiSilicon HBM Memory ACLS. + +Signed-off-by: Junhao He +--- + configure.ac | 11 ---- + misc/rasdaemon.env | 5 -- + non-standard-hisilicon.c | 110 ------------------------------------- + ras-events.c | 3 - + ras-non-standard-handler.c | 32 ----------- + ras-non-standard-handler.h | 8 --- + 6 files changed, 169 deletions(-) + +diff --git a/configure.ac b/configure.ac +index 30c90d2..d098fcf 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -171,16 +171,6 @@ AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "x + AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"]) + +-AC_ARG_ENABLE([hisi_hbm_memory_acls], +- AS_HELP_STRING([--enable-hisi-hbm-memory-acls], [enable HiSilicon HBM Memory ACLS])) +- +-AS_IF([test "x$enable_hisi_hbm_memory_acls" = "xyes" || test "x$enable_all" == "xyes"], [ +- AC_DEFINE(HAVE_HISI_HBM_MEMORY_ACLS,1,"have HiSilicon HBM Memory ACLS") +- AC_SUBST([WITH_HISI_HBM_MEMORY_ACLS]) +-]) +-AM_CONDITIONAL([WITH_HISI_HBM_MEMORY_ACLS], [test x$enable_hisi_hbm_memory_acls = xyes || test x$enable_all == xyes]) 
+-AM_COND_IF([WITH_HISI_HBM_MEMORY_ACLS], [USE_HISI_HBM_MEMORY_ACLS="yes"], [USE_HISI_HBM_MEMORY_ACLS="no"]) +- + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -222,5 +212,4 @@ compile time options summary + Memory CE PFA : $USE_MEMORY_CE_PFA + AMP RAS errors : $USE_AMP_NS_DECODE + CPU fault isolation : $USE_CPU_FAULT_ISOLATION +- HISI HBM Memory ACLS: $USE_HISI_HBM_MEMORY_ACLS + EOF +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index d754128..ca12a1a 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -45,10 +45,5 @@ CPU_ISOLATION_CYCLE="24h" + # Prevent excessive isolation from causing an avalanche effect + CPU_ISOLATION_LIMIT="10" + +-# Support the HBM Memory ACLS (Adaptive Cache Line Sparing) on HiSilicon platform (yes|no). +-HISI_HBM_MEMORY_ACLS="no" +-# Specify rasdaemon to isolation the error page which fails to be repaired by HiSilicon HBM ACLS (yes|no). +-HISI_HBM_ISOLATION_PAGE="no" +- + # Disable specified events by config + DISABLE="block:block_rq_complete" +\ No newline at end of file +diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c +index 25c4903..7296d28 100644 +--- a/non-standard-hisilicon.c ++++ b/non-standard-hisilicon.c +@@ -19,17 +19,6 @@ + #define HISI_BUF_LEN 2048 + #define HISI_PCIE_INFO_BUF_LEN 256 + +-#ifdef HAVE_HISI_HBM_MEMORY_ACLS +-# define HISI_TYPE_UINT32_WIDTH 32 +-/* Specify the Hisilicon HBMC HBM error type */ +-# define HISI_HBM_ERR_TYPE 0 +-# define HISI_HBM_ERR_ACLS BIT(0) +-# define HISI_HBM_ACLS_ADDL 1 +-# define HISI_HBM_ACLS_ADDH 2 +-# define HISI_HBM_ACLS_ARRAY_SIZE 12 +-# define HISI_SUBMOD_HBMC_HBM 6 +-#endif +- + struct hisi_common_error_section { + uint32_t val_bits; + uint8_t version; +@@ -369,100 +358,6 @@ static int add_hisi_common_table(struct ras_events *ras, + return 0; + } + +-#ifdef HAVE_HISI_HBM_MEMORY_ACLS +-#include +-#include +- +-static int write_file(const char *name, unsigned long long 
value) +-{ +- char fname[MAX_PATH + 1] = "/sys/kernel/"; +- FILE *file; +- int ret; +- +- strcat(fname, name); +- if (access(fname, W_OK)) { +- log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot access '%s': %s\n", +- fname, strerror(errno)); +- return -errno; +- } +- +- file = fopen(fname, "w"); +- if (!file) { +- log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot to open '%s': %s\n", +- fname, strerror(errno)); +- return -errno; +- } +- +- ret = fprintf(file, "0x%llx\n", value); +- if (ret < 0) +- log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Failed to set %s (0x%llx): %s\n", +- fname, value, strerror(errno)); +- +- fclose(file); +- return ret; +-} +- +-static int hisi_hbmc_hbm_acls(const struct hisi_common_error_section *err, +- int page_size) +-{ +- unsigned long long paddr; +- unsigned long long pfn; +- int ret; +- +- if (err->reg_array_size < HISI_HBM_ACLS_ARRAY_SIZE) { +- log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: No valid address array length (%d)\n", +- err->reg_array_size); +- return -1; +- } +- +- if (!page_size) +- return -1; +- +- paddr = err->reg_array[HISI_HBM_ACLS_ADDH]; +- paddr <<= HISI_TYPE_UINT32_WIDTH; +- paddr += err->reg_array[HISI_HBM_ACLS_ADDL]; +- pfn = paddr / page_size; +- +- ret = write_file("hbm_memory/acls/acls_query", paddr); +- if (ret < 0) +- return ret; +- +- ret = write_file("debug/hwpoison/corrupt-pfn", pfn); +- if (ret < 0) +- return ret; +- +- ret = write_file("hbm_memory/acls/acls_repair", paddr); +- if (ret < 0 && ras_ns_hisi_hbm_isolation_page_enabled()) { +- log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Keep the pfn (0x%llx) offline\n", +- pfn); +- return ret; +- } +- +- ret = write_file("debug/hwpoison/unpoison-pfn", pfn); +- if (ret < 0) +- return ret; +- +- return 0; +-} +- +-static void hisi_hbm_acls_handler(const struct hisi_common_error_section *err, +- int page_size) +-{ +- if (strcmp(module_name[err->module_id], "HBMC") || +- err->submodule_id != HISI_SUBMOD_HBMC_HBM) +- return; +- +- if (!(err->val_bits & 
BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE))) +- return; +- +- if (!(err->reg_array[HISI_HBM_ERR_TYPE] & HISI_HBM_ERR_ACLS)) +- return; +- +- if (hisi_hbmc_hbm_acls(err, page_size)) +- log(TERM, LOG_WARNING, "Failed to handler HiSilicon HBM ACLS\n"); +-} +-#endif +- + static int decode_hisi_common_section(struct ras_events *ras, + struct ras_ns_ev_decoder *ev_decoder, + struct trace_seq *s, +@@ -498,11 +393,6 @@ static int decode_hisi_common_section(struct ras_events *ras, + step_vendor_data_tab(ev_decoder, "hisi_common_section_tab"); + } + +-#ifdef HAVE_HISI_HBM_MEMORY_ACLS +- if (ras_ns_hisi_hbm_acls_enabled()) +- hisi_hbm_acls_handler(err, ras->page_size); +-#endif +- + return 0; + } + +diff --git a/ras-events.c b/ras-events.c +index d2a7a4e..ed2198b 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -951,9 +951,6 @@ int handle_ras_events(int record_events) + #endif + + #ifdef HAVE_NON_STANDARD +-#ifdef HAVE_HISI_HBM_MEMORY_ACLS +- ras_ns_hisi_hbm_param_init(); +-#endif + if (is_disabled_event("ras", "non_standard_event")) { + log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n", + "ras", "non_standard_event"); +diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c +index 3ed0900..20d514b 100644 +--- a/ras-non-standard-handler.c ++++ b/ras-non-standard-handler.c +@@ -24,38 +24,6 @@ + + static struct ras_ns_ev_decoder *ras_ns_ev_dec_list; + +-#ifdef HAVE_HISI_HBM_MEMORY_ACLS +-static bool ras_ns_hisi_hbm_acls; +-static bool ras_ns_hisi_hbm_isolation_page; +- +-void ras_ns_hisi_hbm_param_init(void) +-{ +- char *env; +- +- env = getenv("HISI_HBM_MEMORY_ACLS"); +- if (env && strcasecmp(env, "yes") == 0) { +- log(TERM, LOG_INFO, "HiSilicon HBM Memory ACLS is enabled\n"); +- ras_ns_hisi_hbm_acls = true; +- } +- +- env = getenv("HISI_HBM_ISOLATION_PAGE"); +- if (env && strcasecmp(env, "yes") == 0) { +- log(TERM, LOG_INFO, "HiSilicon HBM ACLS page isolation is enabled\n"); +- ras_ns_hisi_hbm_isolation_page = true; +- } +-} +- +-bool 
ras_ns_hisi_hbm_acls_enabled(void) +-{ +- return ras_ns_hisi_hbm_acls; +-} +- +-bool ras_ns_hisi_hbm_isolation_page_enabled(void) +-{ +- return ras_ns_hisi_hbm_isolation_page; +-} +-#endif +- + void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) { + trace_seq_printf(s, "%02x%02x%02x%02x", buf[index+3], buf[index+2], buf[index+1], buf[index]); + } +diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h +index 1c2a6e7..341206a 100644 +--- a/ras-non-standard-handler.h ++++ b/ras-non-standard-handler.h +@@ -46,12 +46,4 @@ void ras_ns_finalize_vendor_tables(void); + static inline int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) { return 0; }; + #endif + +-#ifdef HAVE_HISI_HBM_MEMORY_ACLS +-#include +- +-void ras_ns_hisi_hbm_param_init(void); +-bool ras_ns_hisi_hbm_acls_enabled(void); +-bool ras_ns_hisi_hbm_isolation_page_enabled(void); +-#endif +- + #endif +-- +2.33.0 + diff --git a/rasdaemon.spec b/rasdaemon.spec index e70df0768e6801640153c2ac45c7aadbb24e493e..3b57ee11f5636f8a40de04d905819de5c9024f36 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon Version: 0.6.7 -Release: 20 +Release: 21 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events URL: https://github.com/mchehab/rasdaemon.git @@ -60,6 +60,7 @@ Patch9008: 0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch Patch9009: add-dynamic-switch-of-ras-events-support-and-disable-block-rq-complete.patch Patch9010: fix-rasdaemon-print-loading-config-logs-multiple-times.patch Patch9011: bugfix-fix-cpu-isolate-errors-when-some-cpus-are-.patch +Patch9012: 0001-revert-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiS.patch %description @@ -79,7 +80,7 @@ autoheader libtoolize --automake --copy --debug --force automake --add-missing %ifarch %{arm} aarch64 -%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report 
--enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation --enable-hisi-hbm-memory-acls +%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation %else %configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror %endif @@ -114,6 +115,12 @@ if [ $1 -eq 0 ] ; then fi %changelog +* Mon Sep 2 2024 Junhao He - 0.6.7-21 +- Type:bugfix +- ID:NA +- SUG:NA +- DESC:Remove the support for HiSilicon HBM Memory ACLS. + * Thu Apr 25 2024 yangjunshuo - 0.6.7-20 - Type:bugfix - ID:NA