From ca1694e2b1d88f6813330da013b8ecb5e6603ec2 Mon Sep 17 00:00:00 2001 From: luckky Date: Mon, 28 Oct 2024 18:39:25 +0800 Subject: [PATCH] fix hbm online repair logic --- ...-online-repair-notice-and-efi-create.patch | 508 ++++++++++++++++++ sysSentry.spec | 9 +- 2 files changed, 516 insertions(+), 1 deletion(-) create mode 100644 fix-hbm-online-repair-notice-and-efi-create.patch diff --git a/fix-hbm-online-repair-notice-and-efi-create.patch b/fix-hbm-online-repair-notice-and-efi-create.patch new file mode 100644 index 0000000..90633c5 --- /dev/null +++ b/fix-hbm-online-repair-notice-and-efi-create.patch @@ -0,0 +1,508 @@ +From c547cd3721412c6d240e07a75a65425201c4c57a Mon Sep 17 00:00:00 2001 +From: luckky +Date: Mon, 28 Oct 2024 18:34:34 +0800 +Subject: [PATCH] fix hbm online repair notice and efi create + +--- + src/c/hbm_online_repair/hbm_online_repair.c | 5 +- + .../non-standard-hbm-repair.c | 194 +++++++++--------- + .../non-standard-hbm-repair.h | 2 +- + src/c/hbm_online_repair/ras-events.c | 1 - + .../ras-non-standard-handler.c | 33 +-- + .../ras-non-standard-handler.h | 1 + + 6 files changed, 116 insertions(+), 120 deletions(-) + +diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c +index 3ace206..b3b2742 100644 +--- a/src/c/hbm_online_repair/hbm_online_repair.c ++++ b/src/c/hbm_online_repair/hbm_online_repair.c +@@ -127,10 +127,7 @@ int main(int argc, char *argv[]) + return -1; + } + +- ret = init_all_flash(); +- if (ret < 0) { +- log(LOG_ERROR, "flash writer init failed\n"); +- } ++ get_flash_total_size(); + + handle_ras_events(ras); + +diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c +index b175e14..f26d8ae 100644 +--- a/src/c/hbm_online_repair/non-standard-hbm-repair.c ++++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c +@@ -15,7 +15,7 @@ + #include "non-standard-hbm-repair.h" + + extern int page_isolation_threshold; +-size_t total_size = 0; ++size_t flash_total_size = 0; + struct hisi_common_error_section { + uint32_t val_bits; + uint8_t version; +@@ -122,28 +122,58 @@ static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned + info_struct->crc8 = (uint32_t)fault_addr; + } + +-static bool variable_existed(char *name, char *guid) ++static bool is_variable_existing(char *name, char *guid) + { ++ char filename[PATH_MAX]; ++ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); ++ ++ return access(filename, F_OK | R_OK) == 0; ++} ++ ++static size_t get_var_size(char *name, char *guid) { + char filename[PATH_MAX]; + int fd; ++ struct stat stat; + + snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); + + // open var file + fd = open(filename, O_RDONLY); + if (fd < 0) { +- log(LOG_WARNING, "open file %s failed\n", filename); +- return false; ++ log(LOG_WARNING, "open %s failed\n", filename); ++ goto err; ++ } ++ // read stat ++ if (fstat(fd, &stat) != 0) { ++ log(LOG_WARNING, "fstat %s failed\n", filename); ++ goto err; + } + close(fd); +- return true; ++ return stat.st_size; ++err: ++ if (fd >= 0) ++ close(fd); ++ return (size_t)-1; + } + +-static uint32_t read_variable_attribute(char *name, char *guid) { ++void get_flash_total_size() { ++ for (int i = 0; i < FLASH_ENTRY_NUM; i++) { ++ if (is_variable_existing(flash_names[i], flash_guids[i])) { ++ flash_total_size += get_var_size(flash_names[i], flash_guids[i]); ++ } ++ } ++ // check total entry size ++ log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n", ++ flash_total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE); ++ if (flash_total_size > MAX_VAR_SIZE) { ++ log(LOG_WARNING, "fault info storage %zu reach threshold, cannot save new record\n", flash_total_size); ++ } ++} ++ ++static int read_variable_attribute(char *name, char *guid, uint32_t *attribute) { + char filename[PATH_MAX]; + int fd; + size_t readsize; +- uint32_t attribute = (uint32_t)-1; + + snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); + +@@ -151,17 +181,18 @@ static uint32_t read_variable_attribute(char *name, char *guid) { + fd = open(filename, O_RDONLY); + if (fd < 0) { + log(LOG_ERROR, "open %s failed\n", filename); +- return attribute; ++ return -1; + } + + // read attributes from first 4 bytes +- readsize = read(fd, &attribute, sizeof(uint32_t)); ++ readsize = read(fd, attribute, sizeof(uint32_t)); + if (readsize != sizeof(uint32_t)) { + log(LOG_ERROR, "read attribute of %s failed\n", filename); ++ return -1; + } + + close(fd); +- return attribute; ++ return 0; + } + + static int efivarfs_set_mutable(char *name, char *guid, bool mutable) +@@ -205,8 +236,8 @@ err: + return -1; + } + +-static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute) { +- int fd, mode; ++static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute, bool is_existing) { ++ int fd = -1, mode; + size_t writesize; + void *buffer; + unsigned long total; +@@ -225,16 +256,13 @@ static int write_variable(char *name, char *guid, void *value, unsigned long siz + memcpy(buffer + sizeof(uint32_t), value, size); + + // change attr +- if (efivarfs_set_mutable(name, guid, 1) != 0) { ++ if (is_existing && efivarfs_set_mutable(name, guid, 1) != 0) { + log(LOG_ERROR, "set mutable for %s failed\n", filename); + goto err; + } + + mode = O_WRONLY; +- if (attribute & EFI_VARIABLE_APPEND_WRITE) +- mode |= O_APPEND; +- else +- mode |= O_CREAT; ++ mode |= is_existing ? O_APPEND : O_CREAT; + + // open var file + fd = open(filename, mode, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); +@@ -252,7 +280,7 @@ static int write_variable(char *name, char *guid, void *value, unsigned long siz + + close(fd); + free(buffer); +- if (efivarfs_set_mutable(name, guid, 0) != 0) { ++ if (is_existing && efivarfs_set_mutable(name, guid, 0) != 0) { + log(LOG_ERROR, "set immutable for %s failed\n", filename); + } + return 0; +@@ -261,86 +289,21 @@ err: + close(fd); + if (buffer) + free(buffer); +- if (efivarfs_set_mutable(name, guid, 0) != 0) { ++ if (is_existing && efivarfs_set_mutable(name, guid, 0) != 0) { + log(LOG_ERROR, "set immutable for %s failed\n", filename); + } + return -1; + } + +-static int append_variable(char *name, char *guid, void *data, unsigned long size) { +- // prepare append attribute +- uint32_t attribute = read_variable_attribute(name, guid); +- if (attribute == (uint32_t)-1) { +- log(LOG_ERROR, "read %s-%s attribute failed\n", name, guid); +- return -1; +- } +- attribute |= EFI_VARIABLE_APPEND_WRITE; +- +- return write_variable(name, guid, data, size, attribute); +-} +- +-static size_t get_var_size(char *name, char *guid) { +- char filename[PATH_MAX]; +- int fd; +- struct stat stat; +- +- snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); +- +- // open var file +- fd = open(filename, O_RDONLY); +- if (fd < 0) { +- log(LOG_WARNING, "open %s failed\n", filename); +- goto err; +- } +- // read stat +- if (fstat(fd, &stat) != 0) { +- log(LOG_WARNING, "fstat %s failed\n", filename); +- goto err; +- } +- close(fd); +- return stat.st_size; +-err: +- if (fd >= 0) +- close(fd); +- return (size_t)-1; +-} +- +-int init_all_flash() { +- for (int i = 0; i < FLASH_ENTRY_NUM; i++) { +- // check existed entry +- if (variable_existed(flash_names[i], flash_guids[i])) { +- total_size += get_var_size(flash_names[i], flash_guids[i]); +- continue; +- } +- // create new entry +- uint32_t attribute = EFI_VARIABLE_NON_VOLATILE | +- EFI_VARIABLE_BOOTSERVICE_ACCESS | +- EFI_VARIABLE_RUNTIME_ACCESS; +- char *data = ""; +- unsigned long size = 1; +- int ret = write_variable(flash_names[i], flash_guids[i], data, size, attribute); +- if (ret) { +- log(LOG_ERROR, "init %s-%s failed, fault info storage funtion not enabled\n", flash_names[i], flash_guids[i]); +- return -1; +- } +- total_size += sizeof(uint32_t) + 1; +- } +- // check total entry size +- log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n", +- total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE); +- if (total_size > MAX_VAR_SIZE) { +- log(LOG_ERROR, "fault info storage reach threshold, cannot save new record\n"); +- } +- return 0; +-} +- + static int write_fault_info_to_flash(const struct hisi_common_error_section *err) { + int ret, guid_index; + uint32_t reg_size; + uint64_t fault_addr; ++ bool is_existing; ++ uint32_t attribute = -1; + + // check flash usage threshold +- if (total_size + sizeof(uint64_t) > MAX_VAR_SIZE) { ++ if (flash_total_size + sizeof(uint64_t) > MAX_VAR_SIZE) { + log(LOG_WARNING, "fault info storage reach threshold, cannot save new record into flash\n"); + return -1; + } +@@ -359,14 +322,29 @@ static int write_fault_info_to_flash(const struct hisi_common_error_section *err + log(LOG_ERROR, "invalid fault info\n"); + return -1; + } ++ ++ // judge if the efivar is existing to set the attribute ++ is_existing = is_variable_existing(flash_names[guid_index], flash_guids[guid_index]); ++ attribute = EFI_VARIABLE_NON_VOLATILE | ++ EFI_VARIABLE_BOOTSERVICE_ACCESS | ++ EFI_VARIABLE_RUNTIME_ACCESS; ++ if (is_existing) { ++ ret = read_variable_attribute(flash_names[guid_index], flash_guids[guid_index], &attribute); ++ if (ret < 0) { ++ log(LOG_ERROR, "read variable %s-%s attribute failed, stop writing\n", flash_names[guid_index], flash_guids[guid_index]); ++ return -1; ++ } ++ attribute |= EFI_VARIABLE_APPEND_WRITE; ++ } ++ + // record physical addr in flash +- ret = append_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t)); ++ ret = write_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t), attribute, is_existing); + if (ret < 0) { +- log(LOG_ERROR, "append to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]); ++ log(LOG_ERROR, "write to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]); + return -1; + } +- total_size += sizeof(uint64_t); +- log(LOG_INFO, "write hbm fault info to flash success\n"); ++ flash_total_size += sizeof(uint64_t); ++ log(LOG_INFO, "write hbm fault info to flash %s-%s success\n", flash_names[guid_index], flash_guids[guid_index]); + return 0; + } + +@@ -421,7 +399,7 @@ static int get_hardware_corrupted_size() + return hardware_corrupted_size; + } + +-static uint8_t get_repair_result_code(int ret) ++static uint8_t get_repair_failed_result_code(int ret) + { + if (ret == -ENOSPC) { + return REPAIR_FAILED_NO_RESOURCE; +@@ -582,11 +560,11 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err) + static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) + { + int ret; +- if (repair_ret < 0) { ++ if (repair_ret <= 0) { + log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? "ACLS" : "SPPR", paddr); + /* not much we can do about errors here */ + (void)write_file("/sys/kernel/page_eject", "remove_page", paddr); +- return get_repair_result_code(repair_ret); ++ return get_repair_failed_result_code(repair_ret); + } + + ret = write_file("/sys/kernel/page_eject", "online_page", paddr); +@@ -615,9 +593,13 @@ static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char + err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_PSUE_ACLS; + + ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr); +- if (ret < 0) { +- notice_BMC(err, get_repair_result_code(ret)); +- log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); ++ ++ /* Only positive num means the error is supported to repair */ ++ if (ret <= 0) { ++ if (ret != -ENXIO) { ++ notice_BMC(err, get_repair_failed_result_code(ret)); ++ log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); ++ } + return ret; + } + +@@ -642,8 +624,9 @@ static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char + all_online_success = false; + } + } +- if (ret < 0) { +- notice_BMC(err, get_repair_result_code(ret)); ++ /* The ret is from the acls/sppr repair, and only positive num means the error is repaired successfully */ ++ if (ret <= 0) { ++ notice_BMC(err, get_repair_failed_result_code(ret)); + return ret; + } else if (all_online_success) { + notice_BMC(err, ISOLATE_REPAIR_ONLINE_SUCCESS); +@@ -698,7 +681,7 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err) + struct dirent *dent; + DIR *dir; + int ret; +- bool find_device = false, find_hbm_mem = false; ++ bool find_device = false, find_hbm_mem = false, addr_in_hbm_device = false; + + ret = hbmc_hbm_page_isolate(err); + if (ret < 0) { +@@ -723,10 +706,13 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err) + if (hbmc_get_memory_type(path) == HBM_HBM_MEMORY) { + find_hbm_mem = true; + ret = hbmc_hbm_repair(err, path); +- if (ret != -ENXIO) ++ if (ret != -ENXIO) { ++ addr_in_hbm_device = true; + break; ++ } + } + } ++ + if (!find_device) { + log(LOG_ERROR, "Repair driver is not loaded, skip error, error_type is %u\n", + err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); +@@ -735,6 +721,10 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err) + log(LOG_ERROR, "No HBM device memory type found, skip error, error_type is %u\n", + err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); + notice_BMC(err, REPAIR_FAILED_OTHER_REASON); ++ } else if (!addr_in_hbm_device) { ++ log(LOG_ERROR, "Err addr is not in device, skip error, error_type is %u\n", ++ err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); ++ notice_BMC(err, REPAIR_FAILED_INVALID_PARAM); + } + + closedir(dir); +@@ -769,7 +759,7 @@ static bool hbm_repair_validate(const struct hisi_common_error_section *err) + (err->reg_array_size == HBM_CACHE_ARRAY_SIZE); + + if (!(is_acls_valid || is_sppr_valid || is_cache_mode)) { +- log(LOG_DEBUG, "err type (%u) is unknown or address array length (%u) is invalid\n", ++ log(LOG_WARNING, "err type (%u) is unknown or address array length (%u) is invalid\n", + hbm_repair_reg_type, err->reg_array_size); + return false; + } +diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.h b/src/c/hbm_online_repair/non-standard-hbm-repair.h +index 7e8e448..ecb04fe 100644 +--- a/src/c/hbm_online_repair/non-standard-hbm-repair.h ++++ b/src/c/hbm_online_repair/non-standard-hbm-repair.h +@@ -84,6 +84,6 @@ + #define FLASH_ENTRY_NUM 8 + #define KB_SIZE 1024 + +-extern int init_all_flash(); ++extern void get_flash_total_size(); + + #endif +diff --git a/src/c/hbm_online_repair/ras-events.c b/src/c/hbm_online_repair/ras-events.c +index 0b12329..4d281ad 100644 +--- a/src/c/hbm_online_repair/ras-events.c ++++ b/src/c/hbm_online_repair/ras-events.c +@@ -348,7 +348,6 @@ static int read_ras_event_all_cpus(struct pcpu_data *pdata, + "Error on CPU %i\n", i); + warnonce[i]++; + } +- continue; + } + if (!(fds[i].revents & POLLIN)) { + count_nready++; +diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.c b/src/c/hbm_online_repair/ras-non-standard-handler.c +index 1d1fd04..48ffa70 100644 +--- a/src/c/hbm_online_repair/ras-non-standard-handler.c ++++ b/src/c/hbm_online_repair/ras-non-standard-handler.c +@@ -7,17 +7,21 @@ + #include "ras-non-standard-handler.h" + #include "logger.h" + +-static char *uuid_le(const char *uu) ++static int uuid_le(const char *uu, char* uuid) + { +- static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; + if (!uu) { + log(LOG_ERROR, "uuid_le failed: uu is empty"); +- return uuid; ++ return -1; + } + size_t uu_len = strlen(uu); +- if (uu_len < SECTION_TYPE_UUID_LEN) { +- log(LOG_ERROR, "uuid_le failed: uu is too short"); +- return uuid; ++ if (uu_len != SECTION_TYPE_UUID_LEN) { ++ log(LOG_ERROR, "uuid_le failed: uu len is incorrect"); ++ return -1; ++ } ++ size_t uuid_len = strlen(uuid); ++ if (uuid_len != strlen(UUID_STR_TYPE)) { ++ log(LOG_ERROR, "uuid_le failed: uuid len is incorrect"); ++ return -1; + } + + char *p = uuid; +@@ -38,7 +42,7 @@ static char *uuid_le(const char *uu) + + *p = 0; + +- return uuid; ++ return 0; + } + + int ras_non_standard_event_handler(struct trace_seq *s, +@@ -52,15 +56,20 @@ int ras_non_standard_event_handler(struct trace_seq *s, + ev.sec_type = tep_get_field_raw(s, event, "sec_type", + record, &len, 1); + if(!ev.sec_type) { +- log(LOG_WARNING, "get event section type failed"); ++ log(LOG_WARNING, "get event section type failed\n"); + return -1; + } + + trace_seq_printf(s, "\n"); +- trace_seq_printf(s, "sec_type: %s\n", uuid_le(ev.sec_type)); ++ char uuid[sizeof(UUID_STR_TYPE)] = UUID_STR_TYPE; ++ if (uuid_le(ev.sec_type, uuid) < 0) { ++ log(LOG_WARNING, "get uuid failed\n"); ++ return -1; ++ } ++ trace_seq_printf(s, "sec_type: %s\n", uuid); + + if (tep_get_field_val(s, event, "len", record, &val, 1) < 0) { +- log(LOG_WARNING, "tep get field val failed"); ++ log(LOG_WARNING, "tep get field val failed\n"); + return -1; + } + +@@ -69,11 +78,11 @@ int ras_non_standard_event_handler(struct trace_seq *s, + + ev.error = tep_get_field_raw(s, event, "buf", record, &len, 1); + if(!ev.error || ev.length != len) { +- log(LOG_WARNING, "get event error failed"); ++ log(LOG_WARNING, "get event error failed\n"); + return -1; + } + +- if (strcmp(uuid_le(ev.sec_type), HISI_COMMON_SECTION_TYPE_UUID) == 0) { ++ if (strcmp(uuid, HISI_COMMON_SECTION_TYPE_UUID) == 0) { + decode_hisi_common_section(&ev); + } + +diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.h b/src/c/hbm_online_repair/ras-non-standard-handler.h +index 0272dc1..15a37ee 100644 +--- a/src/c/hbm_online_repair/ras-non-standard-handler.h ++++ b/src/c/hbm_online_repair/ras-non-standard-handler.h +@@ -7,6 +7,7 @@ + #define BIT(nr) (1UL << (nr)) + + #define SECTION_TYPE_UUID_LEN 16 ++#define UUID_STR_TYPE "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + #define HISI_COMMON_SECTION_TYPE_UUID "c8b328a8-9917-4af6-9a13-2e08ab2e7586" + + struct ras_non_standard_event { +-- +2.43.0 + diff --git a/sysSentry.spec b/sysSentry.spec index 3d21a4b..ecbbc28 100644 --- a/sysSentry.spec +++ b/sysSentry.spec @@ -4,7 +4,7 @@ Summary: System Inspection Framework Name: sysSentry Version: 1.0.2 -Release: 16 +Release: 17 License: Mulan PSL v2 Group: System Environment/Daemons Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz @@ -27,6 +27,7 @@ Patch14: over-threshold-should-be-warn-level-log-in-cat-cli.patch Patch15: add-separator-to-err-info.patch Patch16: remove-threshold-max-cpu-cores.patch Patch17: add-hbm-online-repair.patch +Patch18: fix-hbm-online-repair-notice-and-efi-create.patch BuildRequires: cmake gcc-c++ BuildRequires: python3 python3-setuptools @@ -216,6 +217,12 @@ rm -rf %{buildroot} %attr(0550,root,root) %{python3_sitelib}/syssentry/bmc_alarm.py %changelog +* Mon Oct 28 2024 luckky - 1.0.2-17 +- Type:bugfix +- CVE:NA +- SUG:NA +- DESC:fix hbm_online_repair notice BMC function and create efi fd + * Mon Oct 21 2024 luckky - 1.0.2-16 - Type:requirement - CVE:NA -- Gitee