From f1612b1b026444f1b6a93c526f062dce3c726def Mon Sep 17 00:00:00 2001
From: luckky
Date: Tue, 15 Oct 2024 12:03:00 +0000
Subject: [PATCH] add hbm_online_repair

Signed-off-by: luckky
---
 add-hbm-online-repair.patch | 4090 +++++++++++++++++++++++++++++++++++
 sysSentry.spec              |   41 +-
 2 files changed, 4130 insertions(+), 1 deletion(-)
 create mode 100644 add-hbm-online-repair.patch

diff --git a/add-hbm-online-repair.patch b/add-hbm-online-repair.patch
new file mode 100644
index 0000000..3804a75
--- /dev/null
+++ b/add-hbm-online-repair.patch
@@ -0,0 +1,4090 @@
+From 844e1ad845c58a98fc803b754c0e44ad071bb707 Mon Sep 17 00:00:00 2001
+From: luckky
+Date: Tue, 22 Oct 2024 18:09:30 +0800
+Subject: [PATCH] add hbm online repair
+
+---
+ build/build.sh                                |  16 +-
+ config/collector.conf                         |   7 +
+ config/plugins/avg_block_io.ini               |  21 +
+ config/tasks/avg_block_io.mod                 |   5 +
+ config/tasks/hbm_online_repair.mod            |   9 +
+ service/sentryCollector.service               |  12 +
+ .../plugin/cpu_patrol/cpu_patrol_result.c     |   4 +-
+ .../plugin/cpu_patrol/cpu_patrol_result.h     |   4 +-
+ src/c/hbm_online_repair/.gitignore            |   6 +
+ src/c/hbm_online_repair/Makefile              |  25 +
+ src/c/hbm_online_repair/hbm_online_repair.c   | 144 ++++
+ src/c/hbm_online_repair/hbm_online_repair.env |   2 +
+ src/c/hbm_online_repair/logger.h              |  31 +
+ .../non-standard-hbm-repair.c                 | 799 ++++++++++++++++++
+ .../non-standard-hbm-repair.h                 |  89 ++
+ src/c/hbm_online_repair/ras-events.c          | 534 ++++++++++++
+ src/c/hbm_online_repair/ras-events.h          |  28 +
+ .../ras-non-standard-handler.c                |  81 ++
+ .../ras-non-standard-handler.h                |  25 +
+ src/python/.gitignore                         |   1 +
+ src/python/sentryCollector/__init__.py        |   0
+ src/python/sentryCollector/__main__.py        |  17 +
+ src/python/sentryCollector/collect_config.py  | 118 +++
+ src/python/sentryCollector/collect_io.py      | 243 ++++++
+ src/python/sentryCollector/collect_plugin.py  | 276 ++++++
+ src/python/sentryCollector/collect_server.py  | 285 +++++++
+ src/python/sentryCollector/collectd.py        |  97 +++
+ src/python/sentryPlugins/__init__.py          |   0
+ .../sentryPlugins/avg_block_io/__init__.py    |   0
+ .../avg_block_io/avg_block_io.py              | 257 ++++++
+ .../sentryPlugins/avg_block_io/module_conn.py |  86 ++
+ .../avg_block_io/stage_window.py              |  47 ++
+ .../sentryPlugins/avg_block_io/utils.py       |  86 ++
+ src/python/setup.py                           |   4 +-
+ src/python/syssentry/bmc_alarm.py             | 159 ++++
+ src/python/syssentry/callbacks.py             |   2 +-
+ src/python/syssentry/cpu_alarm.py             |   1 +
+ src/python/syssentry/cpu_sentry.py            |  36 +-
+ src/python/syssentry/cron_process.py          |   2 +-
+ src/python/syssentry/syssentry.py             |  79 +-
+ src/python/xalarm/xalarm_api.py               |   2 +-
+ 41 files changed, 3592 insertions(+), 48 deletions(-)
+ create mode 100644 config/collector.conf
+ create mode 100644 config/plugins/avg_block_io.ini
+ create mode 100644 config/tasks/avg_block_io.mod
+ create mode 100644 config/tasks/hbm_online_repair.mod
+ create mode 100644 service/sentryCollector.service
+ create mode 100644 src/c/hbm_online_repair/.gitignore
+ create mode 100644 src/c/hbm_online_repair/Makefile
+ create mode 100644 src/c/hbm_online_repair/hbm_online_repair.c
+ create mode 100644 src/c/hbm_online_repair/hbm_online_repair.env
+ create mode 100644 src/c/hbm_online_repair/logger.h
+ create mode 100644 src/c/hbm_online_repair/non-standard-hbm-repair.c
+ create mode 100644 src/c/hbm_online_repair/non-standard-hbm-repair.h
+ create mode 100644 src/c/hbm_online_repair/ras-events.c
+ create mode 100644 src/c/hbm_online_repair/ras-events.h
+ create mode 100644 src/c/hbm_online_repair/ras-non-standard-handler.c
+ create
mode 100644 src/c/hbm_online_repair/ras-non-standard-handler.h + create mode 100644 src/python/.gitignore + create mode 100644 src/python/sentryCollector/__init__.py + create mode 100644 src/python/sentryCollector/__main__.py + create mode 100644 src/python/sentryCollector/collect_config.py + create mode 100644 src/python/sentryCollector/collect_io.py + create mode 100644 src/python/sentryCollector/collect_plugin.py + create mode 100644 src/python/sentryCollector/collect_server.py + create mode 100644 src/python/sentryCollector/collectd.py + create mode 100644 src/python/sentryPlugins/__init__.py + create mode 100644 src/python/sentryPlugins/avg_block_io/__init__.py + create mode 100644 src/python/sentryPlugins/avg_block_io/avg_block_io.py + create mode 100644 src/python/sentryPlugins/avg_block_io/module_conn.py + create mode 100644 src/python/sentryPlugins/avg_block_io/stage_window.py + create mode 100644 src/python/sentryPlugins/avg_block_io/utils.py + create mode 100644 src/python/syssentry/bmc_alarm.py + +diff --git a/build/build.sh b/build/build.sh +index 17af8a0..e2442e6 100644 +--- a/build/build.sh ++++ b/build/build.sh +@@ -43,23 +43,23 @@ function install() + } + + [ "$1" == "-b" ] && { +- INTALL_DIR=$2 ++ INSTALL_DIR=$2 + [ -z $2 ] && { +- INTALL_DIR=/usr/lib64 +- mkdir -p ${INTALL_DIR} ++ INSTALL_DIR=/usr/lib64 ++ mkdir -p ${INSTALL_DIR} + } + +- build ${INTALL_DIR} ++ build ${INSTALL_DIR} + exit 0 + } + + [ "$1" == "-i" ] && { +- INTALL_DIR=$2 ++ INSTALL_DIR=$2 + [ -z $2 ] && { +- INTALL_DIR=/usr/lib64 +- mkdir -p ${INTALL_DIR} ++ INSTALL_DIR=/usr/lib64 ++ mkdir -p ${INSTALL_DIR} + } +- install ${INTALL_DIR} ++ install ${INSTALL_DIR} + exit 0 + } + +diff --git a/config/collector.conf b/config/collector.conf +new file mode 100644 +index 0000000..9baa086 +--- /dev/null ++++ b/config/collector.conf +@@ -0,0 +1,7 @@ ++[common] ++modules=io ++ ++[io] ++period_time=1 ++max_save=10 ++disk=default +\ No newline at end of file +diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini +new file mode 100644 +index 0000000..bc33dde +--- /dev/null ++++ b/config/plugins/avg_block_io.ini +@@ -0,0 +1,21 @@ ++[common] ++disk=default ++stage=default ++iotype=read,write ++period_time=1 ++ ++[algorithm] ++win_size=30 ++win_threshold=6 ++ ++[latency] ++read_avg_lim=10 ++write_avg_lim=10 ++read_avg_time=3 ++write_avg_time=3 ++read_tot_lim=50 ++write_tot_lim=50 ++ ++[iodump] ++read_iodump_lim=0 ++write_iodump_lim=0 +diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod +new file mode 100644 +index 0000000..75c7299 +--- /dev/null ++++ b/config/tasks/avg_block_io.mod +@@ -0,0 +1,5 @@ ++[common] ++enabled=yes ++task_start=/usr/bin/avg_block_io ++task_stop=pkill avg_block_io ++type=oneshot +\ No newline at end of file +diff --git a/config/tasks/hbm_online_repair.mod b/config/tasks/hbm_online_repair.mod +new file mode 100644 +index 0000000..77dd73e +--- /dev/null ++++ b/config/tasks/hbm_online_repair.mod +@@ -0,0 +1,9 @@ ++[common] ++enabled=yes ++task_start=/usr/bin/hbm_online_repair ++task_stop=kill $pid ++type=period ++interval=180 ++onstart=yes ++env_file=/etc/sysconfig/hbm_online_repair.env ++conflict=up +\ No newline at end of file +diff --git a/service/sentryCollector.service b/service/sentryCollector.service +new file mode 100644 +index 0000000..2e50d7a +--- /dev/null ++++ b/service/sentryCollector.service +@@ -0,0 +1,12 @@ ++[Unit] ++Description = Collection module added for sysSentry and kernel lock-free collection ++ ++[Service] 
++ExecStart=/usr/bin/sentryCollector
++ExecStop=/bin/kill $MAINPID
++KillMode=process
++Restart=on-failure
++RestartSec=10s
++
++[Install]
++WantedBy = multi-user.target
+diff --git a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c
+index 8e31312..9f8d80c 100644
+--- a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c
++++ b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c
+@@ -22,8 +22,8 @@ static cat_return_t insert_core_to_list(core_list_st *core_list, int coreid)
+         CAT_LOG_W("Core %d is a special core and cannot be isolated", coreid);
+         return CAT_OK;
+     }
+-    if (coreid < 0) {
+-        CAT_LOG_W("Inner error, coreid is a negative number");
++    if ((core_list->current_nums == MAX_ISOLATE_CORES_PER_PATROL) || (coreid < 0)) {
++        CAT_LOG_E("Insert error, core id(%d)", coreid);
+         return CAT_ERR;
+     }
+ 
+diff --git a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.h b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.h
+index 9722ec9..92dcdc3 100644
+--- a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.h
++++ b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.h
+@@ -30,9 +30,9 @@ typedef enum {
+ #define CAT_LOG_W(...) CAT_LOG("WARN", __VA_ARGS__)
+ #define CAT_LOG_E(...) CAT_LOG("ERROR", __VA_ARGS__)
+ 
+-#define MAX_CPU_CORES 4096
++#define MAX_ISOLATE_CORES_PER_PATROL 64 // Maximum number of faulty cores that can be isolated in one patrol; detecting more than two faulty cores in a single patrol is very unlikely
+ typedef struct {
+-    unsigned int order_list[MAX_CPU_CORES];
++    unsigned int order_list[MAX_ISOLATE_CORES_PER_PATROL];
+     unsigned short current_nums;
+ } core_list_st;
+ 
+diff --git a/src/c/hbm_online_repair/.gitignore b/src/c/hbm_online_repair/.gitignore
+new file mode 100644
+index 0000000..a577882
+--- /dev/null
++++ b/src/c/hbm_online_repair/.gitignore
+@@ -0,0 +1,6 @@
++*.o
++*.c~
++*.h~
++hbm_online_repair
++
++.vscode/
+diff --git a/src/c/hbm_online_repair/Makefile b/src/c/hbm_online_repair/Makefile
+new file mode 100644
+index 0000000..16ebcd8
+--- /dev/null
++++ b/src/c/hbm_online_repair/Makefile
+@@ -0,0 +1,25 @@
++CC = gcc
++
++CFLAGS = -Wall -O3
++
++LDFLAGS = -ltraceevent
++
++SRC = $(wildcard *.c)
++HDR = $(wildcard *.h)
++
++OBJ = $(SRC:.c=.o)
++
++TARGET = hbm_online_repair
++
++all: $(TARGET)
++
++$(TARGET): $(OBJ)
++	$(CC) $(OBJ) -o $@ $(LDFLAGS)
++
++%.o: %.c $(HDR)
++	$(CC) $(CFLAGS) -c $< -o $@
++
++clean:
++	rm -f $(OBJ) $(TARGET)
++
++.PHONY: all clean
+diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c
+new file mode 100644
+index 0000000..3ace206
+--- /dev/null
++++ b/src/c/hbm_online_repair/hbm_online_repair.c
+@@ -0,0 +1,144 @@
++#include
++#include
++#include
++#include
++#include
++
++#include "logger.h"
++#include "ras-events.h"
++#include "non-standard-hbm-repair.h"
++
++#define DEFAULT_LOG_LEVEL LOG_INFO
++#define DEFAULT_PAGE_ISOLATION_THRESHOLD 128
++
++int global_level_setting;
++int page_isolation_threshold;
++
++int string2int(const char* str, int* value)
++{
++    if (!str) {
++        return -1;
++    }
++    char *endptr;
++    errno = 0;
++    long val = strtol(str, &endptr, 10);
++    if (errno != 0 || *endptr != '\0') {
++        return -1;
++    }
++    *value = (int)val;
++    if (val != (long)*value) {
++        return -1;
++    }
++    return 0;
++}
++
++int execute_command(const char *command)
++{
++    FILE *fp;
++    char buffer[128] = {0};
++    int ret;
++    fp = popen(command, "r");
++    if (!fp) {
++        log(LOG_ERROR, "popen failed\n");
++        return -1;
++    }
++
++    fgets(buffer, sizeof(buffer), fp);
++    log(LOG_DEBUG, "output of command is: 
%s\n", buffer); ++ ++ ret = pclose(fp); ++ if (ret < 0) { ++ log(LOG_ERROR, "pclose failed\n"); ++ return -1; ++ } ++ ++ if (!WIFEXITED(ret)) { ++ log(LOG_ERROR, "command did not terminate normally\n"); ++ return -1; ++ } ++ ++ ret = WEXITSTATUS(ret); ++ log(LOG_DEBUG, "command exited with status: %d\n", ret); ++ return ret; ++} ++ ++int load_required_driver(void) ++{ ++ int ret; ++ ret = execute_command("modprobe hisi_mem_ras 2>&1"); ++ if (ret < 0) { ++ log(LOG_ERROR, "load repair driver failed\n"); ++ return ret; ++ } ++ ret = execute_command("modprobe page_eject 2>&1"); ++ if (ret < 0) { ++ log(LOG_ERROR, "load page driver failed\n"); ++ return ret; ++ } ++ log(LOG_INFO, "load required driver success\n"); ++ return ret; ++} ++ ++void hbm_param_init(void) ++{ ++ int ret; ++ char *env; ++ ++ env = getenv("HBM_ONLINE_REPAIR_LOG_LEVEL"); ++ ret = string2int(env, &global_level_setting); ++ if (ret < 0) { ++ global_level_setting = DEFAULT_LOG_LEVEL; ++ log(LOG_WARNING, "Get log level from config failed, set the default value %d\n", DEFAULT_LOG_LEVEL); ++ } else { ++ log(LOG_INFO, "log level: %d\n", global_level_setting); ++ } ++ ++ env = getenv("PAGE_ISOLATION_THRESHOLD"); ++ ret = string2int(env, &page_isolation_threshold); ++ if (ret < 0) { ++ page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD; ++ log(LOG_WARNING, "Get page_isolation_threshold from config failed, set the default value %d\n", DEFAULT_PAGE_ISOLATION_THRESHOLD); ++ } else { ++ log(LOG_INFO, "page_isolation_threshold: %d\n", page_isolation_threshold); ++ } ++} ++ ++ ++int main(int argc, char *argv[]) ++{ ++ int ret; ++ ++ hbm_param_init(); ++ ++ ret = load_required_driver(); ++ if (ret < 0) { ++ log(LOG_DEBUG, "load required driver failed\n"); ++ return ret; ++ } ++ ++ struct ras_events *ras = init_trace_instance(); ++ if (!ras) ++ return -1; ++ ++ ret = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 1); ++ if (ret < 0) { ++ log(LOG_WARNING, "unable to enable ras non_standard_event.\n"); ++ free(ras); ++ return -1; ++ } ++ ++ ret = init_all_flash(); ++ if (ret < 0) { ++ log(LOG_ERROR, "flash writer init failed\n"); ++ } ++ ++ handle_ras_events(ras); ++ ++ ret = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 0); ++ if (ret < 0) { ++ log(LOG_WARNING, "unable to disable ras non_standard_event.\n"); ++ } ++ ++ free(ras); ++ return ret; ++} +diff --git a/src/c/hbm_online_repair/hbm_online_repair.env b/src/c/hbm_online_repair/hbm_online_repair.env +new file mode 100644 +index 0000000..de56079 +--- /dev/null ++++ b/src/c/hbm_online_repair/hbm_online_repair.env +@@ -0,0 +1,2 @@ ++HBM_ONLINE_REPAIR_LOG_LEVEL=1 ++PAGE_ISOLATION_THRESHOLD=128 +diff --git a/src/c/hbm_online_repair/logger.h b/src/c/hbm_online_repair/logger.h +new file mode 100644 +index 0000000..ddfa932 +--- /dev/null ++++ b/src/c/hbm_online_repair/logger.h +@@ -0,0 +1,31 @@ ++#ifndef __LOGGER_H ++#define __LOGGER_H ++ ++#define TOOL_NAME "hbm_online_repair" ++ ++#define LOG_DEBUG 0 ++#define LOG_INFO 1 ++#define LOG_WARNING 2 ++#define LOG_ERROR 3 ++ ++extern int global_level_setting; ++ ++#define log_prefix(level) \ ++ (level == LOG_DEBUG ? "DEBUG" : \ ++ level == LOG_INFO ? "INFO" : \ ++ level == LOG_WARNING ? "WARNING" : \ ++ level == LOG_ERROR ? "ERROR" : \ ++ "UNKNOWN_LEVEL") ++ ++#define log_fd(level) \ ++ (level == LOG_ERROR ? stderr : stdout) ++ ++#define log(level, fmt, args...) 
do {\ ++ if (level >= global_level_setting) {\ ++ fprintf(log_fd(level), "[%s] %s: ", log_prefix(level), TOOL_NAME);\ ++ fprintf(log_fd(level), fmt, ##args);\ ++ fflush(log_fd(level));\ ++ }\ ++} while (0) ++ ++#endif +diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c +new file mode 100644 +index 0000000..b175e14 +--- /dev/null ++++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c +@@ -0,0 +1,799 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "logger.h" ++#include "non-standard-hbm-repair.h" ++ ++extern int page_isolation_threshold; ++size_t total_size = 0; ++struct hisi_common_error_section { ++ uint32_t val_bits; ++ uint8_t version; ++ uint8_t soc_id; ++ uint8_t socket_id; ++ uint8_t totem_id; ++ uint8_t nimbus_id; ++ uint8_t subsystem_id; ++ uint8_t module_id; ++ uint8_t submodule_id; ++ uint8_t core_id; ++ uint8_t port_id; ++ uint16_t err_type; ++ struct { ++ uint8_t function; ++ uint8_t device; ++ uint16_t segment; ++ uint8_t bus; ++ uint8_t reserved[3]; ++ } pcie_info; ++ uint8_t err_severity; ++ uint8_t reserved[3]; ++ uint32_t reg_array_size; ++ uint32_t reg_array[]; ++}; ++ ++struct fault_addr_info { ++ uint32_t processer_id; ++ uint32_t die_id; ++ uint32_t stack_id; ++ uint32_t sid; ++ uint32_t channel_id; ++ uint32_t bankgroup_id; ++ uint32_t bank_id; ++ uint32_t row_id; ++ uint32_t column_id; ++ uint32_t error_type; ++ uint32_t repair_type; ++ uint32_t reserved; ++ uint32_t crc8; ++}; ++ ++typedef struct { ++ const char *VariableName; ++ const char *VendorGuid; ++ uint32_t DataSize; ++ uint8_t *Data; ++ uint32_t Attributes; ++} efi_variable_t; ++ ++char* flash_names[FLASH_ENTRY_NUM] = { ++ "repair0000", ++ "repair0001", ++ "repair0100", ++ "repair0101", ++ "repair0200", ++ "repair0201", ++ "repair0300", ++ "repair0301", ++}; ++char *flash_guids[FLASH_ENTRY_NUM] = { ++ "CD2FF4D9-D937-4e1d-B810-A1A568C37C01", ++ "DD92CC91-43E6-4c69-A42A-B08F72FCB157", ++ "4A8E0D1E-4CFA-47b2-9359-DA3A0006878B", ++ "733F9979-4ED4-478d-BD6A-E4D0F0390FDB", ++ "9BFBBA1F-5A93-4d36-AD47-D3C2D714D914", ++ "A0920D6F-78B8-4c09-9F61-7CEC845F116C", ++ "0049CE5E-8C18-414c-BDC1-A87E60CEEFD7", ++ "6AED17B4-50C7-4a40-A5A7-48AF55DD8EAC" ++}; ++ ++static int get_guid_index(uint32_t socket_id, uint32_t error_type) { ++ if (2 * socket_id + error_type >= FLASH_ENTRY_NUM) ++ return -1; ++ return 2 * socket_id + error_type; ++} ++ ++static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned long long fault_addr) ++{ ++ info_struct->processer_id = fault_addr & FAULT_ADDR_PROCESSOR_ID_MASK; ++ fault_addr >>= FAULT_ADDR_PROCESSOR_ID_LEN; ++ info_struct->die_id = fault_addr & FAULT_ADDR_DIE_ID_MASK; ++ fault_addr >>= FAULT_ADDR_DIE_ID_LEN; ++ info_struct->stack_id = fault_addr & FAULT_ADDR_STACK_ID_MASK; ++ fault_addr >>= FAULT_ADDR_STACK_ID_LEN; ++ info_struct->sid = fault_addr & FAULT_ADDR_SID_MASK; ++ fault_addr >>= FAULT_ADDR_SID_LEN; ++ info_struct->channel_id = fault_addr & FAULT_ADDR_CHANNEL_ID_MASK; ++ fault_addr >>= FAULT_ADDR_CHANNEL_ID_LEN; ++ info_struct->bankgroup_id = fault_addr & FAULT_ADDR_BANKGROUP_ID_MASK; ++ fault_addr >>= FAULT_ADDR_BANKGROUP_ID_LEN; ++ info_struct->bank_id = fault_addr & FAULT_ADDR_BANK_ID_MASK; ++ fault_addr >>= FAULT_ADDR_BANK_ID_LEN; ++ info_struct->row_id = fault_addr & FAULT_ADDR_ROW_ID_MASK; ++ fault_addr >>= FAULT_ADDR_ROW_ID_LEN; ++ info_struct->column_id = fault_addr & 
FAULT_ADDR_COLUMN_ID_MASK; ++ fault_addr >>= FAULT_ADDR_CHANNEL_ID_LEN; ++ info_struct->error_type = fault_addr & FAULT_ADDR_ERROR_TYPE_MASK; ++ fault_addr >>= FAULT_ADDR_ERROR_TYPE_LEN; ++ info_struct->repair_type = fault_addr & FAULT_ADDR_REPAIR_TYPE_MASK; ++ fault_addr >>= FAULT_ADDR_REPAIR_TYPE_LEN; ++ info_struct->reserved = fault_addr & FAULT_ADDR_RESERVED_MASK; ++ fault_addr >>= FAULT_ADDR_RESERVED_LEN; ++ info_struct->crc8 = (uint32_t)fault_addr; ++} ++ ++static bool variable_existed(char *name, char *guid) ++{ ++ char filename[PATH_MAX]; ++ int fd; ++ ++ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); ++ ++ // open var file ++ fd = open(filename, O_RDONLY); ++ if (fd < 0) { ++ log(LOG_WARNING, "open file %s failed\n", filename); ++ return false; ++ } ++ close(fd); ++ return true; ++} ++ ++static uint32_t read_variable_attribute(char *name, char *guid) { ++ char filename[PATH_MAX]; ++ int fd; ++ size_t readsize; ++ uint32_t attribute = (uint32_t)-1; ++ ++ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); ++ ++ // open var file ++ fd = open(filename, O_RDONLY); ++ if (fd < 0) { ++ log(LOG_ERROR, "open %s failed\n", filename); ++ return attribute; ++ } ++ ++ // read attributes from first 4 bytes ++ readsize = read(fd, &attribute, sizeof(uint32_t)); ++ if (readsize != sizeof(uint32_t)) { ++ log(LOG_ERROR, "read attribute of %s failed\n", filename); ++ } ++ ++ close(fd); ++ return attribute; ++} ++ ++static int efivarfs_set_mutable(char *name, char *guid, bool mutable) ++{ ++ unsigned long orig_attrs, new_attrs; ++ char filename[PATH_MAX]; ++ int fd; ++ ++ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); ++ ++ fd = open(filename, O_RDONLY); ++ if (fd < 0) { ++ log(LOG_ERROR, "open %s failed\n", filename); ++ goto err; ++ } ++ ++ if (ioctl(fd, FS_IOC_GETFLAGS, &orig_attrs) == -1) { ++ log(LOG_ERROR, "ioctl FS_IOC_GETFLAGS failed\n"); ++ goto err; ++ } ++ ++ if (mutable) ++ new_attrs = orig_attrs & ~(unsigned long)FS_IMMUTABLE_FL; ++ else ++ new_attrs = orig_attrs | FS_IMMUTABLE_FL; ++ ++ if (new_attrs == orig_attrs) { ++ close(fd); ++ return 0; ++ } ++ ++ if (ioctl(fd, FS_IOC_SETFLAGS, &new_attrs) == -1) { ++ log(LOG_ERROR, "ioctl FS_IOC_SETFLAGS failed\n"); ++ goto err; ++ } ++ close(fd); ++ return 0; ++err: ++ if (fd >= 0) ++ close(fd); ++ return -1; ++} ++ ++static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute) { ++ int fd, mode; ++ size_t writesize; ++ void *buffer; ++ unsigned long total; ++ char filename[PATH_MAX]; ++ ++ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); ++ ++ // prepare attributes(size 4 bytes) and data ++ total = size + sizeof(uint32_t); ++ buffer = malloc(total); ++ if (buffer == NULL) { ++ log(LOG_ERROR, "malloc data for %s failed\n", filename); ++ goto err; ++ } ++ memcpy(buffer, &attribute, sizeof(uint32_t)); ++ memcpy(buffer + sizeof(uint32_t), value, size); ++ ++ // change attr ++ if (efivarfs_set_mutable(name, guid, 1) != 0) { ++ log(LOG_ERROR, "set mutable for %s failed\n", filename); ++ goto err; ++ } ++ ++ mode = O_WRONLY; ++ if (attribute & EFI_VARIABLE_APPEND_WRITE) ++ mode |= O_APPEND; ++ else ++ mode |= O_CREAT; ++ ++ // open var file ++ fd = open(filename, mode, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); ++ if (fd < 0) { ++ log(LOG_ERROR, "open %s failed\n", filename); ++ goto err; ++ } ++ ++ // write to var file ++ writesize = write(fd, buffer, total); ++ if (writesize != total) { ++ log(LOG_ERROR, "write %s 
failed\n", filename); ++ goto err; ++ } ++ ++ close(fd); ++ free(buffer); ++ if (efivarfs_set_mutable(name, guid, 0) != 0) { ++ log(LOG_ERROR, "set immutable for %s failed\n", filename); ++ } ++ return 0; ++err: ++ if (fd >= 0) ++ close(fd); ++ if (buffer) ++ free(buffer); ++ if (efivarfs_set_mutable(name, guid, 0) != 0) { ++ log(LOG_ERROR, "set immutable for %s failed\n", filename); ++ } ++ return -1; ++} ++ ++static int append_variable(char *name, char *guid, void *data, unsigned long size) { ++ // prepare append attribute ++ uint32_t attribute = read_variable_attribute(name, guid); ++ if (attribute == (uint32_t)-1) { ++ log(LOG_ERROR, "read %s-%s attribute failed\n", name, guid); ++ return -1; ++ } ++ attribute |= EFI_VARIABLE_APPEND_WRITE; ++ ++ return write_variable(name, guid, data, size, attribute); ++} ++ ++static size_t get_var_size(char *name, char *guid) { ++ char filename[PATH_MAX]; ++ int fd; ++ struct stat stat; ++ ++ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); ++ ++ // open var file ++ fd = open(filename, O_RDONLY); ++ if (fd < 0) { ++ log(LOG_WARNING, "open %s failed\n", filename); ++ goto err; ++ } ++ // read stat ++ if (fstat(fd, &stat) != 0) { ++ log(LOG_WARNING, "fstat %s failed\n", filename); ++ goto err; ++ } ++ close(fd); ++ return stat.st_size; ++err: ++ if (fd >= 0) ++ close(fd); ++ return (size_t)-1; ++} ++ ++int init_all_flash() { ++ for (int i = 0; i < FLASH_ENTRY_NUM; i++) { ++ // check existed entry ++ if (variable_existed(flash_names[i], flash_guids[i])) { ++ total_size += get_var_size(flash_names[i], flash_guids[i]); ++ continue; ++ } ++ // create new entry ++ uint32_t attribute = EFI_VARIABLE_NON_VOLATILE | ++ EFI_VARIABLE_BOOTSERVICE_ACCESS | ++ EFI_VARIABLE_RUNTIME_ACCESS; ++ char *data = ""; ++ unsigned long size = 1; ++ int ret = write_variable(flash_names[i], flash_guids[i], data, size, attribute); ++ if (ret) { ++ log(LOG_ERROR, "init %s-%s failed, fault info storage funtion not enabled\n", flash_names[i], flash_guids[i]); ++ return -1; ++ } ++ total_size += sizeof(uint32_t) + 1; ++ } ++ // check total entry size ++ log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n", ++ total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE); ++ if (total_size > MAX_VAR_SIZE) { ++ log(LOG_ERROR, "fault info storage reach threshold, cannot save new record\n"); ++ } ++ return 0; ++} ++ ++static int write_fault_info_to_flash(const struct hisi_common_error_section *err) { ++ int ret, guid_index; ++ uint32_t reg_size; ++ uint64_t fault_addr; ++ ++ // check flash usage threshold ++ if (total_size + sizeof(uint64_t) > MAX_VAR_SIZE) { ++ log(LOG_WARNING, "fault info storage reach threshold, cannot save new record into flash\n"); ++ return -1; ++ } ++ ++ // parse physical addr ++ reg_size = err->reg_array_size / sizeof(uint32_t); ++ fault_addr = err->reg_array[reg_size - 1]; ++ fault_addr <<= TYPE_UINT32_WIDTH; ++ fault_addr += err->reg_array[reg_size - 2]; ++ ++ // get guid ++ struct fault_addr_info info_struct; ++ parse_fault_addr_info(&info_struct, fault_addr); ++ guid_index = get_guid_index(info_struct.processer_id, info_struct.error_type); ++ if (guid_index < 0) { ++ log(LOG_ERROR, "invalid fault info\n"); ++ return -1; ++ } ++ // record physical addr in flash ++ ret = append_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t)); ++ if (ret < 0) { ++ log(LOG_ERROR, "append to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]); ++ return -1; ++ } ++ total_size += 
sizeof(uint64_t); ++ log(LOG_INFO, "write hbm fault info to flash success\n"); ++ return 0; ++} ++ ++static int write_file(char *path, const char *name, unsigned long long value) ++{ ++ char fname[MAX_PATH]; ++ char buf[20]; ++ int ret; ++ int fd; ++ ++ snprintf(fname, MAX_PATH, "%s/%s", path, name); ++ ++ fd = open(fname, O_WRONLY); ++ if (fd < 0) { ++ log(LOG_WARNING, "HBM ACLS: Cannot to open '%s': %s\n", ++ fname, strerror(errno)); ++ return -errno; ++ } ++ ++ snprintf(buf, sizeof(buf), "0x%llx\n", value); ++ ret = write(fd, buf, strlen(buf)); ++ if (ret <= 0) ++ log(LOG_WARNING, "HBM ACLS: Failed to set %s (0x%llx): %s\n", ++ fname, value, strerror(errno)); ++ ++ close(fd); ++ return ret > 0 ? 0 : -errno; ++} ++ ++static int get_hardware_corrupted_size() ++{ ++ FILE *fp; ++ char line[256]; ++ int hardware_corrupted_size = -1; ++ char *key = "HardwareCorrupted:"; ++ ++ fp = fopen("/proc/meminfo", "r"); ++ if (fp == NULL) { ++ log(LOG_ERROR, "Failed to open /proc/meminfo\n"); ++ return -1; ++ } ++ ++ while (fgets(line, sizeof(line), fp) != NULL) { ++ char *pos; ++ if ((pos = strstr(line, key)) != NULL) { ++ sscanf(pos, "HardwareCorrupted: %5d kB\n", &hardware_corrupted_size); ++ break; ++ } ++ } ++ ++ fclose(fp); ++ return hardware_corrupted_size; ++} ++ ++static uint8_t get_repair_result_code(int ret) ++{ ++ if (ret == -ENOSPC) { ++ return REPAIR_FAILED_NO_RESOURCE; ++ } else if (ret == -EIO) { ++ return REPAIR_FAILED_OTHER_REASON; ++ } else if (ret == -ENXIO || ret == -EINVAL) { ++ return REPAIR_FAILED_INVALID_PARAM; ++ } ++ return REPAIR_FAILED_OTHER_REASON; ++} ++ ++static int notice_BMC(const struct hisi_common_error_section *err, uint8_t repair_result_code) ++{ ++ int sockfd; ++ struct sockaddr_un addr; ++ char bmc_msg[sizeof(BMC_REPORT_FORMAT)] = {0}; ++ uint8_t repair_type_code, isolation_type_code; ++ uint32_t repair_type; ++ unsigned long long fault_addr; ++ ++ sockfd = socket(AF_UNIX, SOCK_STREAM, 0); ++ if (sockfd < 0) { ++ log(LOG_ERROR, "Failed to create BMC notice socket\n"); ++ return -1; ++ } ++ ++ memset(&addr, 0, sizeof(struct sockaddr_un)); ++ addr.sun_family = AF_UNIX; ++ strncpy(addr.sun_path, BMC_SOCKET_PATH, sizeof(addr.sun_path) - 1); ++ if (connect(sockfd, (struct sockaddr *)&addr, sizeof(struct sockaddr_un)) < 0) { ++ log(LOG_ERROR, "Failed to connect BMC notice socket\n"); ++ close(sockfd); ++ return -1; ++ } ++ ++ /* assemble bmc specific msg */ ++ repair_type_code = 0; ++ isolation_type_code = 0; ++ repair_type = err->reg_array[HBM_REPAIR_REQ_TYPE]; ++ if (repair_type & HBM_CE_ACLS) { ++ repair_type_code = 0; ++ isolation_type_code = SINGLE_ADDR_FAULT; ++ } else if (repair_type & HBM_PSUE_ACLS) { ++ repair_type_code = 1; ++ isolation_type_code = SINGLE_ADDR_FAULT; ++ } else if (repair_type & HBM_CE_SPPR) { ++ repair_type_code = 2; ++ isolation_type_code = ROW_FAULT; ++ } else if (repair_type & HBM_PSUE_SPPR) { ++ repair_type_code = 3; ++ isolation_type_code = ROW_FAULT; ++ } ++ ++ const uint32_t reg_size = err->reg_array_size / sizeof(uint32_t); ++ ++ fault_addr = err->reg_array[reg_size - 1]; ++ fault_addr <<= TYPE_UINT32_WIDTH; ++ fault_addr += err->reg_array[reg_size - 2]; ++ ++ log(LOG_DEBUG, "Get the fault addr is %llu\n", fault_addr); ++ ++ struct fault_addr_info info_struct; ++ parse_fault_addr_info(&info_struct, fault_addr); ++ ++ log(LOG_DEBUG, "info_struct.processer_id is %u\n", info_struct.processer_id); ++ log(LOG_DEBUG, "info_struct.die_id is %u\n", info_struct.die_id); ++ log(LOG_DEBUG, "info_struct.stack_id is %u\n", info_struct.stack_id); ++ 
log(LOG_DEBUG, "info_struct.sid is %u\n", info_struct.sid); ++ log(LOG_DEBUG, "info_struct.channel_id is %u\n", info_struct.channel_id); ++ log(LOG_DEBUG, "info_struct.bankgroup_id is %u\n", info_struct.bankgroup_id); ++ log(LOG_DEBUG, "info_struct.bank_id is %u\n", info_struct.bank_id); ++ log(LOG_DEBUG, "info_struct.row_id is %u\n", info_struct.row_id); ++ log(LOG_DEBUG, "info_struct.column_id is %u\n", info_struct.column_id); ++ log(LOG_DEBUG, "info_struct.error_type is %u\n", info_struct.error_type); ++ log(LOG_DEBUG, "info_struct.repair_type is %u\n", info_struct.repair_type); ++ log(LOG_DEBUG, "info_struct.reserved is %u\n", info_struct.reserved); ++ log(LOG_DEBUG, "info_struct.crc8 is %u\n", info_struct.crc8); ++ ++ snprintf(bmc_msg, sizeof(BMC_REPORT_FORMAT), BMC_REPORT_FORMAT, ++ repair_type_code, ++ repair_result_code, ++ isolation_type_code, ++ info_struct.processer_id, ++ info_struct.die_id, ++ info_struct.stack_id, ++ info_struct.sid, ++ info_struct.channel_id, ++ info_struct.bankgroup_id, ++ info_struct.bank_id, ++ info_struct.row_id, ++ info_struct.column_id ++ ); ++ ++ log(LOG_DEBUG, "Send msg to sysSentry, bmc msg is %s\n", bmc_msg); ++ ++ if (write(sockfd, bmc_msg, strlen(bmc_msg)) <= 0) { ++ log(LOG_ERROR, "Failed to send data to BMC notice socket\n"); ++ close(sockfd); ++ return -1; ++ } ++ ++ close(sockfd); ++ return 0; ++} ++ ++static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err) ++{ ++ unsigned long long paddr; ++ int ret; ++ bool is_acls = err->reg_array[HBM_REPAIR_REQ_TYPE] & (HBM_CE_ACLS | HBM_PSUE_ACLS); ++ int required_isolate_size = (is_acls ? HBM_ACLS_ADDR_NUM : HBM_SPPR_ADDR_NUM) * DEFAULT_PAGE_SIZE_KB; ++ int hardware_corrupted_size = get_hardware_corrupted_size(); ++ if (hardware_corrupted_size < 0) { ++ log(LOG_ERROR, "Page isolate failed: Get hardware_corrupted_size failed"); ++ notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); ++ return -1; ++ } ++ if ((required_isolate_size + hardware_corrupted_size) > page_isolation_threshold) { ++ log(LOG_INFO, "Page isolate failed: the isolation resource is not enough\n"); ++ notice_BMC(err, ISOLATE_FAILED_OVER_THRESHOLD); ++ return -1; ++ } ++ if (is_acls) { ++ /* ACLS */ ++ paddr = err->reg_array[HBM_ADDH]; ++ paddr <<= TYPE_UINT32_WIDTH; ++ paddr += err->reg_array[HBM_ADDL]; ++ ++ ret = write_file("/sys/kernel/page_eject", "offline_page", paddr); ++ if (ret < 0) { ++ notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); ++ log(LOG_WARNING, "HBM: ACLS offline failed, address is 0x%llx \n", paddr); ++ return ret; ++ } ++ } else { ++ /* SPPR */ ++ bool all_success = true; ++ uint32_t i; ++ for (i = 0; i < HBM_SPPR_ADDR_NUM; i++) { ++ paddr = err->reg_array[2 * i + HBM_ADDH]; ++ paddr <<= TYPE_UINT32_WIDTH; ++ paddr += err->reg_array[2 * i + HBM_ADDL]; ++ ret = write_file("/sys/kernel/page_eject", "offline_page", paddr); ++ if (ret < 0) { ++ all_success = false; ++ log(LOG_WARNING, "HBM: SPPR offline failed, address is 0x%llx \n", paddr); ++ continue; ++ } ++ } ++ if (!all_success) { ++ notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); ++ ret = -1; ++ } ++ } ++ return ret < 0 ? ret : 0; ++} ++ ++static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) ++{ ++ int ret; ++ if (repair_ret < 0) { ++ log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? 
"ACLS" : "SPPR", paddr); ++ /* not much we can do about errors here */ ++ (void)write_file("/sys/kernel/page_eject", "remove_page", paddr); ++ return get_repair_result_code(repair_ret); ++ } ++ ++ ret = write_file("/sys/kernel/page_eject", "online_page", paddr); ++ if (ret < 0) { ++ log(LOG_WARNING, "HBM %s: Page (0x%llx) online failed\n",is_acls ? "ACLS" : "SPPR", paddr); ++ return ONLINE_PAGE_FAILED; ++ } else { ++ log(LOG_INFO, "HBM %s: Page (0x%llx) repair and online success\n",is_acls ? "ACLS" : "SPPR", paddr); ++ return ISOLATE_REPAIR_ONLINE_SUCCESS; ++ } ++} ++ ++static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path) ++{ ++ unsigned long long paddr; ++ int ret; ++ uint8_t repair_result_code; ++ bool is_acls; ++ ++ /* Both ACLS and SPPR only repair the first address */ ++ paddr = err->reg_array[HBM_ADDH]; ++ paddr <<= TYPE_UINT32_WIDTH; ++ paddr += err->reg_array[HBM_ADDL]; ++ ++ is_acls = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_CE_ACLS || ++ err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_PSUE_ACLS; ++ ++ ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr); ++ if (ret < 0) { ++ notice_BMC(err, get_repair_result_code(ret)); ++ log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); ++ return ret; ++ } ++ ++ ret = write_file(path, is_acls ? "acls_repair" : "sppr_repair", paddr); ++ ++ if (is_acls) { ++ /* ACLS */ ++ repair_result_code = hbmc_hbm_after_repair(is_acls, ret, paddr); ++ notice_BMC(err, repair_result_code); ++ return ret; ++ } else { ++ /* SPPR */ ++ bool all_online_success = true; ++ uint32_t i; ++ for (i = 0; i < HBM_SPPR_ADDR_NUM; i++) { ++ paddr = err->reg_array[2 * i + HBM_ADDH]; ++ paddr <<= TYPE_UINT32_WIDTH; ++ paddr += err->reg_array[2 * i + HBM_ADDL]; ++ ++ repair_result_code = hbmc_hbm_after_repair(is_acls, ret, paddr); ++ if (repair_result_code != ISOLATE_REPAIR_ONLINE_SUCCESS) { ++ all_online_success = false; ++ } ++ } ++ if (ret < 0) { ++ notice_BMC(err, get_repair_result_code(ret)); ++ return ret; ++ } else if (all_online_success) { ++ notice_BMC(err, ISOLATE_REPAIR_ONLINE_SUCCESS); ++ return 0; ++ } else { ++ notice_BMC(err, ONLINE_PAGE_FAILED); ++ return ret; ++ } ++ } ++ /* The final return code is not necessary */ ++ return ret < 0 ? 
ret : 0; ++} ++ ++static int hbmc_get_memory_type(char *path) ++{ ++ int type = HBM_UNKNOWN; ++ char fname[MAX_PATH]; ++ char buf[128]; ++ FILE *file; ++ ++ snprintf(fname, MAX_PATH, "%s/%s", path, "memory_type"); ++ file = fopen(fname, "r"); ++ if (!file) { ++ log(LOG_WARNING, "HBM: Cannot to open '%s': %s\n", ++ fname, strerror(errno)); ++ return -errno; ++ } ++ ++ if (!fgets(buf, sizeof(buf), file)) { ++ log(LOG_WARNING, "HBM: Failed to read %s\n", fname); ++ goto err; ++ } ++ ++ /* Remove the last '\n' */ ++ buf[strlen(buf) - 1] = 0; ++ ++ if (strcmp(buf, "HBM") == 0) ++ type = HBM_HBM_MEMORY; ++ else if (strcmp(buf, "DDR") == 0) ++ type = HBM_DDR_MEMORY; ++ ++err: ++ fclose(file); ++ return type; ++} ++ ++static void hbm_repair_handler(const struct hisi_common_error_section *err) ++{ ++ log(LOG_DEBUG, "Received ACLS/SPPR flat mode repair request, try to repair\n"); ++ char *sys_dev_path = "/sys/devices/platform"; ++ char path[MAX_PATH]; ++ struct dirent *dent; ++ DIR *dir; ++ int ret; ++ bool find_device = false, find_hbm_mem = false; ++ ++ ret = hbmc_hbm_page_isolate(err); ++ if (ret < 0) { ++ return; ++ } ++ ++ dir = opendir(sys_dev_path); ++ if (!dir) { ++ log(LOG_WARNING, "Can't read '%s': %s\n", ++ sys_dev_path, strerror(errno)); ++ notice_BMC(err, REPAIR_FAILED_OTHER_REASON); ++ return; ++ } ++ ++ while ((dent = readdir(dir))) { ++ if (!strstr(dent->d_name, HBM_MEM_RAS_NAME)) ++ continue; ++ find_device = true; ++ ++ snprintf(path, MAX_PATH, "%s/%s", sys_dev_path, dent->d_name); ++ ++ if (hbmc_get_memory_type(path) == HBM_HBM_MEMORY) { ++ find_hbm_mem = true; ++ ret = hbmc_hbm_repair(err, path); ++ if (ret != -ENXIO) ++ break; ++ } ++ } ++ if (!find_device) { ++ log(LOG_ERROR, "Repair driver is not loaded, skip error, error_type is %u\n", ++ err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); ++ notice_BMC(err, REPAIR_FAILED_OTHER_REASON); ++ } else if (!find_hbm_mem) { ++ log(LOG_ERROR, "No HBM device memory type found, skip error, error_type is %u\n", ++ err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); ++ notice_BMC(err, REPAIR_FAILED_OTHER_REASON); ++ } ++ ++ closedir(dir); ++} ++ ++static bool hbm_repair_validate(const struct hisi_common_error_section *err) ++{ ++ if (!((err->val_bits & BIT(COMMON_VALID_MODULE_ID)) && ++ (err->val_bits & BIT(COMMON_VALID_SUBMODULE_ID)) && ++ (err->val_bits & BIT(COMMON_VALID_REG_ARRAY_SIZE)) ++ )) { ++ log(LOG_DEBUG, "Err val_bits validate failed, val_bits is %u\n", err->val_bits); ++ return false; ++ } ++ log(LOG_DEBUG, "err->module_id: %u\n", err->module_id); ++ log(LOG_DEBUG, "err->submodule_id: %u\n", err->submodule_id); ++ log(LOG_DEBUG, "err->val_bits: 0x%x\n", err->val_bits); ++ log(LOG_DEBUG, "err->reg_array_size: %u\n", err->reg_array_size); ++ ++ if (err->module_id != HBMC_MODULE_ID || ++ err->submodule_id != HBMC_SUBMOD_HBM_REPAIR) { ++ log(LOG_DEBUG, "err module_id or sub_module id doesn't not match\n"); ++ return false; ++ } ++ ++ uint32_t hbm_repair_reg_type = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK; ++ bool is_acls_valid = (hbm_repair_reg_type & (HBM_CE_ACLS | HBM_PSUE_ACLS)) && ++ (err->reg_array_size == HBM_ACLS_ARRAY_SIZE); ++ bool is_sppr_valid = (hbm_repair_reg_type & (HBM_CE_SPPR | HBM_PSUE_SPPR)) && ++ (err->reg_array_size == HBM_SPPR_ARRAY_SIZE); ++ bool is_cache_mode = (hbm_repair_reg_type & HBM_CACHE_MODE) && ++ (err->reg_array_size == HBM_CACHE_ARRAY_SIZE); ++ ++ if (!(is_acls_valid || is_sppr_valid || is_cache_mode)) { ++ log(LOG_DEBUG, "err type (%u) is unknown or address array length (%u) is 
invalid\n", ++ hbm_repair_reg_type, err->reg_array_size); ++ return false; ++ } ++ ++ log(LOG_INFO, "Received ACLS/SPPR repair request\n"); ++ return true; ++} ++ ++static bool hbm_flat_mode_validate(const struct hisi_common_error_section *err) ++{ ++ uint32_t hbm_repair_reg_type = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK; ++ return !(hbm_repair_reg_type & HBM_CACHE_MODE); ++} ++ ++int decode_hisi_common_section(struct ras_non_standard_event *event) ++{ ++ const struct hisi_common_error_section *err = (struct hisi_common_error_section *)event->error; ++ ++ if (hbm_repair_validate(err)) { ++ write_fault_info_to_flash(err); ++ if (hbm_flat_mode_validate(err)) { ++ hbm_repair_handler(err); ++ } ++ } ++ ++ return 0; ++} +diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.h b/src/c/hbm_online_repair/non-standard-hbm-repair.h +new file mode 100644 +index 0000000..7e8e448 +--- /dev/null ++++ b/src/c/hbm_online_repair/non-standard-hbm-repair.h +@@ -0,0 +1,89 @@ ++#ifndef __NON_STANDARD_HBM_REPAIR ++#define __NON_STANDARD_HBM_REPAIR ++ ++#include "ras-non-standard-handler.h" ++ ++#define DEFAULT_PAGE_SIZE_KB 4 ++#define HBM_MEM_RAS_NAME "HISI0521" ++#define HBM_UNKNOWN 0 ++#define HBM_HBM_MEMORY 1 ++#define HBM_DDR_MEMORY 2 ++ ++#define TYPE_UINT32_WIDTH 32 ++#define HBM_REPAIR_REQ_TYPE 0 ++#define HBM_CE_ACLS BIT(0) ++#define HBM_PSUE_ACLS BIT(1) ++#define HBM_CE_SPPR BIT(2) ++#define HBM_PSUE_SPPR BIT(3) ++#define HBM_CACHE_MODE (BIT(4) | BIT(5) | BIT(6) | BIT(7)) ++#define HBM_ERROR_MASK 0b11111111 ++#define HBM_ADDL 1 ++#define HBM_ADDH 2 ++#define HBM_ERROR_TYPE_SIZE 4 ++#define HBM_ADDR_SIZE 8 ++#define HBM_ACLS_ADDR_NUM 1 ++#define HBM_SPPR_ADDR_NUM 16 ++#define HBM_ACLS_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE * HBM_ACLS_ADDR_NUM + HBM_ADDR_SIZE) ++#define HBM_SPPR_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE * HBM_SPPR_ADDR_NUM + HBM_ADDR_SIZE) ++#define HBM_CACHE_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE) ++#define HBMC_MODULE_ID 0x28 ++#define HBMC_SUBMOD_HBM_REPAIR 6 ++#define COMMON_VALID_MODULE_ID 5 ++#define COMMON_VALID_SUBMODULE_ID 6 ++#define COMMON_VALID_REG_ARRAY_SIZE 12 ++ ++#define BMC_SOCKET_PATH "/var/run/sysSentry/bmc.sock" ++#define BMC_REPORT_FORMAT "REP00%02x%02x%02x0000000000000000%02x%02x%02x00%02x00%02x%02x%02x%08x%08x0000000000" ++ ++#define ISOLATE_FAILED_OVER_THRESHOLD 0b10000001 ++#define ISOLATE_FAILED_OTHER_REASON 0b10000010 ++#define REPAIR_FAILED_NO_RESOURCE 0b10010100 ++#define REPAIR_FAILED_INVALID_PARAM 0b10011000 ++#define REPAIR_FAILED_OTHER_REASON 0b10011100 ++#define ONLINE_PAGE_FAILED 0b10100000 ++#define ISOLATE_REPAIR_ONLINE_SUCCESS 0b00000000 ++ ++#define ROW_FAULT 1 ++#define SINGLE_ADDR_FAULT 6 ++ ++#define FAULT_ADDR_PROCESSOR_ID_LEN 2 ++#define FAULT_ADDR_DIE_ID_LEN 1 ++#define FAULT_ADDR_STACK_ID_LEN 3 ++#define FAULT_ADDR_SID_LEN 3 ++#define FAULT_ADDR_CHANNEL_ID_LEN 8 ++#define FAULT_ADDR_BANKGROUP_ID_LEN 3 ++#define FAULT_ADDR_BANK_ID_LEN 3 ++#define FAULT_ADDR_ROW_ID_LEN 17 ++#define FAULT_ADDR_COLUMN_ID_LEN 10 ++#define FAULT_ADDR_ERROR_TYPE_LEN 2 ++#define FAULT_ADDR_REPAIR_TYPE_LEN 2 ++#define FAULT_ADDR_RESERVED_LEN 2 ++#define FAULT_ADDR_CRC8_LEN 8 ++ ++#define FAULT_ADDR_PROCESSOR_ID_MASK ((1 << FAULT_ADDR_PROCESSOR_ID_LEN ) - 1) ++#define FAULT_ADDR_DIE_ID_MASK ((1 << FAULT_ADDR_DIE_ID_LEN ) - 1) ++#define FAULT_ADDR_STACK_ID_MASK ((1 << FAULT_ADDR_STACK_ID_LEN ) - 1) ++#define FAULT_ADDR_SID_MASK ((1 << FAULT_ADDR_SID_LEN ) - 1) ++#define FAULT_ADDR_CHANNEL_ID_MASK ((1 << 
FAULT_ADDR_CHANNEL_ID_LEN ) - 1) ++#define FAULT_ADDR_BANKGROUP_ID_MASK ((1 << FAULT_ADDR_BANKGROUP_ID_LEN ) - 1) ++#define FAULT_ADDR_BANK_ID_MASK ((1 << FAULT_ADDR_BANK_ID_LEN ) - 1) ++#define FAULT_ADDR_ROW_ID_MASK ((1 << FAULT_ADDR_ROW_ID_LEN ) - 1) ++#define FAULT_ADDR_COLUMN_ID_MASK ((1 << FAULT_ADDR_COLUMN_ID_LEN ) - 1) ++#define FAULT_ADDR_ERROR_TYPE_MASK ((1 << FAULT_ADDR_ERROR_TYPE_LEN ) - 1) ++#define FAULT_ADDR_REPAIR_TYPE_MASK ((1 << FAULT_ADDR_REPAIR_TYPE_LEN ) - 1) ++#define FAULT_ADDR_RESERVED_MASK ((1 << FAULT_ADDR_RESERVED_LEN ) - 1) ++#define FAULT_ADDR_CRC8_MASK ((1 << FAULT_ADDR_CRC8_LEN ) - 1) ++ ++#define EFI_VARIABLE_NON_VOLATILE 0x1 ++#define EFI_VARIABLE_BOOTSERVICE_ACCESS 0x2 ++#define EFI_VARIABLE_RUNTIME_ACCESS 0x4 ++#define EFI_VARIABLE_APPEND_WRITE 0x40 ++ ++#define EFIVARFS_PATH "/sys/firmware/efi/efivars" ++#define MAX_VAR_SIZE (128 * 1024) ++#define FLASH_ENTRY_NUM 8 ++#define KB_SIZE 1024 ++ ++extern int init_all_flash(); ++ ++#endif +diff --git a/src/c/hbm_online_repair/ras-events.c b/src/c/hbm_online_repair/ras-events.c +new file mode 100644 +index 0000000..0b12329 +--- /dev/null ++++ b/src/c/hbm_online_repair/ras-events.c +@@ -0,0 +1,534 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include "ras-non-standard-handler.h" ++#include "logger.h" ++ ++/* ++ * Polling time, if read() doesn't block. Currently, trace_pipe_raw never ++ * blocks on read(). So, we need to sleep for a while, to avoid spending ++ * too much CPU cycles. A fix for it is expected for 3.10. ++ */ ++#define POLLING_TIME 3 ++ ++/* Test for a little-endian machine */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ #define ENDIAN KBUFFER_ENDIAN_LITTLE ++#else ++ #define ENDIAN KBUFFER_ENDIAN_BIG ++#endif ++ ++static int get_debugfs_dir(char *debugfs_dir, size_t len) ++{ ++ FILE *fp; ++ char line[MAX_PATH + 1 + 256]; ++ ++ fp = fopen("/proc/mounts","r"); ++ if (!fp) { ++ log(LOG_INFO, "Can't open /proc/mounts"); ++ return errno; ++ } ++ ++ do { ++ char *p, *type, *dir; ++ if (!fgets(line, sizeof(line), fp)) ++ break; ++ ++ p = strtok(line, " \t"); ++ if (!p) ++ break; ++ ++ dir = strtok(NULL, " \t"); ++ if (!dir) ++ break; ++ ++ type = strtok(NULL, " \t"); ++ if (!type) ++ break; ++ ++ if (!strcmp(type, "debugfs")) { ++ fclose(fp); ++ strncpy(debugfs_dir, dir, len - 1); ++ debugfs_dir[len - 1] = '\0'; ++ return 0; ++ } ++ } while(1); ++ ++ fclose(fp); ++ log(LOG_INFO, "Can't find debugfs\n"); ++ return ENOENT; ++} ++ ++ ++static int open_trace(char *trace_dir, char *name, int flags) ++{ ++ int ret; ++ char fname[MAX_PATH + 1]; ++ ++ strcpy(fname, trace_dir); ++ strcat(fname, "/"); ++ strcat(fname, name); ++ ++ ret = open(fname, flags); ++ if (ret < 0) ++ log(LOG_WARNING, "open_trace() failed, fname=%s ret=%d errno=%d\n", fname, ret, errno); ++ ++ return ret; ++} ++ ++static int create_trace_instance(char *trace_instance_dir) ++{ ++ char fname[MAX_PATH + 1]; ++ int rc; ++ ++ get_debugfs_dir(fname, sizeof(fname)); ++ strcat(fname, "/tracing/instances/"TOOL_NAME); ++ rc = mkdir(fname, S_IRWXU); ++ if (rc < 0 && errno != EEXIST) { ++ log(LOG_INFO, "Unable to create " TOOL_NAME " instance at %s\n", fname); ++ return -1; ++ } ++ strcpy(trace_instance_dir, fname); ++ return 0; ++} ++ ++struct ras_events *init_trace_instance(void) ++{ ++ struct ras_events *ras = calloc(1, sizeof(*ras)); ++ if (!ras) { ++ log(LOG_ERROR, "Can't allocate memory for ras 
struct\n"); ++ return NULL; ++ } ++ int rc = create_trace_instance(ras->tracing); ++ if (rc < 0) { ++ free(ras); ++ return NULL; ++ } ++ return ras; ++} ++ ++/* ++ * Tracing enable/disable code ++ */ ++int toggle_ras_event(char *trace_dir, char *group, char *event, int enable) ++{ ++ int fd, rc; ++ char fname[MAX_PATH + 1]; ++ ++ snprintf(fname, sizeof(fname), "%s%s:%s\n", ++ enable ? "" : "!", ++ group, event); ++ ++ /* Enable RAS events */ ++ fd = open_trace(trace_dir, "set_event", O_RDWR | O_APPEND); ++ if (fd < 0) { ++ log(LOG_WARNING, "Can't open set_event\n"); ++ rc = -errno; ++ goto err; ++ } ++ ++ rc = write(fd, fname, strlen(fname)); ++ close(fd); ++ if (rc <= 0) { ++ log(LOG_WARNING, "Can't write to set_event\n"); ++ rc = -EIO; ++ goto err; ++ } ++ ++ log(LOG_INFO, "%s:%s event %s\n", ++ group, event, ++ enable ? "enabled" : "disabled"); ++ return 0; ++err: ++ log(LOG_ERROR, "Can't %s %s:%s tracing\n", ++ enable ? "enable" : "disable", group, event); ++ return rc; ++} ++ ++static int parse_header_page(struct ras_events *ras, struct tep_handle *pevent) ++{ ++ int fd, len, page_size = DEFAULT_PAGE_SIZE; ++ char buf[page_size]; ++ ++ fd = open_trace(ras->tracing, "events/header_page", O_RDONLY); ++ if (fd < 0) { ++ log(LOG_WARNING, "Open event header page failed\n"); ++ return -1; ++ } ++ ++ len = read(fd, buf, page_size); ++ close(fd); ++ if (len <= 0) { ++ log(LOG_WARNING, "Read event header page failed\n"); ++ return -1; ++ } ++ ++ if (tep_parse_header_page(pevent, buf, len, sizeof(long))) { ++ log(LOG_WARNING, "Parse event header page failed\n"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static void parse_ras_data(struct pcpu_data *pdata, struct kbuffer *kbuf, ++ void *data, unsigned long long time_stamp) ++{ ++ struct tep_record record; ++ struct trace_seq s; ++ ++ record.ts = time_stamp; ++ record.size = kbuffer_event_size(kbuf); ++ record.data = data; ++ record.offset = kbuffer_curr_offset(kbuf); ++ record.cpu = pdata->cpu; ++ ++ /* note offset is just offset in subbuffer */ ++ record.missed_events = kbuffer_missed_events(kbuf); ++ record.record_size = kbuffer_curr_size(kbuf); ++ ++ trace_seq_init(&s); ++ tep_print_event(pdata->ras->pevent, &s, &record, "%s-%s-%d-%s", ++ TEP_PRINT_NAME, TEP_PRINT_COMM, TEP_PRINT_TIME, TEP_PRINT_INFO); ++ trace_seq_do_printf(&s); ++ fflush(stdout); ++ trace_seq_destroy(&s); ++} ++ ++static int get_num_cpus() ++{ ++ return sysconf(_SC_NPROCESSORS_ONLN); ++} ++ ++static int set_buffer_percent(struct ras_events *ras, int percent) ++{ ++ int res = 0; ++ int fd; ++ ++ fd = open_trace(ras->tracing, "buffer_percent", O_WRONLY); ++ if (fd >= 0) { ++ char buf[16]; ++ ssize_t size; ++ snprintf(buf, sizeof(buf), "%d", percent); ++ size = write(fd, buf, strlen(buf)); ++ if (size <= 0) { ++ log(LOG_WARNING, "can't write to buffer_percent\n"); ++ res = -1; ++ } ++ close(fd); ++ } else { ++ log(LOG_WARNING, "Can't open buffer_percent\n"); ++ res = -1; ++ } ++ ++ return res; ++} ++ ++static int read_ras_event_all_cpus(struct pcpu_data *pdata, ++ unsigned n_cpus) ++{ ++ ssize_t size; ++ unsigned long long time_stamp; ++ void *data; ++ int ready, i, count_nready; ++ struct kbuffer *kbuf; ++ void *page; ++ struct pollfd fds[n_cpus + 1]; ++ struct signalfd_siginfo fdsiginfo; ++ sigset_t mask; ++ int warnonce[n_cpus]; ++ char pipe_raw[PATH_MAX]; ++ ++ memset(&warnonce, 0, sizeof(warnonce)); ++ ++ page = malloc(pdata[0].ras->page_size); ++ if (!page) { ++ log(LOG_ERROR, "Can't allocate page\n"); ++ return -ENOMEM; ++ } ++ ++ kbuf = kbuffer_alloc(KBUFFER_LSIZE_8, 
ENDIAN); ++ if (!kbuf) { ++ log(LOG_ERROR, "Can't allocate kbuf\n"); ++ free(page); ++ return -ENOMEM; ++ } ++ ++ /* Fix for poll() on the per_cpu trace_pipe and trace_pipe_raw blocks ++ * indefinitely with the default buffer_percent in the kernel trace system, ++ * which is introduced by the following change in the kernel. ++ * https://lore.kernel.org/all/20221020231427.41be3f26@gandalf.local.home/T/#u. ++ * Set buffer_percent to 0 so that poll() will return immediately ++ * when the trace data is available in the ras per_cpu trace pipe_raw ++ */ ++ if (set_buffer_percent(pdata[0].ras, 0)) ++ log(LOG_WARNING, "Set buffer_percent failed\n"); ++ ++ for (i = 0; i < (n_cpus + 1); i++) ++ fds[i].fd = -1; ++ ++ for (i = 0; i < n_cpus; i++) { ++ fds[i].events = POLLIN; ++ ++ snprintf(pipe_raw, sizeof(pipe_raw), ++ "per_cpu/cpu%d/trace_pipe_raw", i); ++ ++ fds[i].fd = open_trace(pdata[0].ras->tracing, pipe_raw, O_RDONLY); ++ if (fds[i].fd < 0) { ++ log(LOG_ERROR, "Can't open trace_pipe_raw\n"); ++ goto error; ++ } ++ } ++ ++ sigemptyset(&mask); ++ sigaddset(&mask, SIGINT); ++ sigaddset(&mask, SIGTERM); ++ sigaddset(&mask, SIGHUP); ++ sigaddset(&mask, SIGQUIT); ++ if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1) ++ log(LOG_WARNING, "sigprocmask\n"); ++ fds[n_cpus].events = POLLIN; ++ fds[n_cpus].fd = signalfd(-1, &mask, 0); ++ if (fds[n_cpus].fd < 0) { ++ log(LOG_WARNING, "signalfd\n"); ++ goto error; ++ } ++ ++ log(LOG_INFO, "Listening to events for cpus 0 to %u\n", n_cpus - 1); ++ ++ do { ++ ready = poll(fds, (n_cpus + 1), -1); ++ if (ready < 0) { ++ log(LOG_WARNING, "poll\n"); ++ } ++ ++ /* check for the signal */ ++ if (fds[n_cpus].revents & POLLIN) { ++ size = read(fds[n_cpus].fd, &fdsiginfo, ++ sizeof(struct signalfd_siginfo)); ++ if (size != sizeof(struct signalfd_siginfo)) { ++ log(LOG_WARNING, "signalfd read\n"); ++ continue; ++ } ++ ++ if (fdsiginfo.ssi_signo == SIGINT || ++ fdsiginfo.ssi_signo == SIGTERM || ++ fdsiginfo.ssi_signo == SIGHUP || ++ fdsiginfo.ssi_signo == SIGQUIT) { ++ log(LOG_INFO, "Recevied signal=%d\n", ++ fdsiginfo.ssi_signo); ++ goto error; ++ } else { ++ log(LOG_INFO, ++ "Received unexpected signal=%d\n", ++ fdsiginfo.ssi_signo); ++ continue; ++ } ++ } ++ ++ count_nready = 0; ++ for (i = 0; i < n_cpus; i++) { ++ if (fds[i].revents & POLLERR) { ++ if (!warnonce[i]) { ++ log(LOG_INFO, ++ "Error on CPU %i\n", i); ++ warnonce[i]++; ++ } ++ continue; ++ } ++ if (!(fds[i].revents & POLLIN)) { ++ count_nready++; ++ continue; ++ } ++ size = read(fds[i].fd, page, pdata[i].ras->page_size); ++ if (size < 0) { ++ log(LOG_WARNING, "read\n"); ++ goto error; ++ } else if (size > 0) { ++ log(LOG_DEBUG, "cpu %d receive %ld bytes data\n", i, size); ++ kbuffer_load_subbuffer(kbuf, page); ++ ++ while ((data = kbuffer_read_event(kbuf, &time_stamp))) { ++ if (kbuffer_curr_size(kbuf) < 0) { ++ log(LOG_ERROR, "invalid kbuf data, discard\n"); ++ break; ++ } ++ ++ log(LOG_DEBUG, "parse_ras_data\n"); ++ parse_ras_data(&pdata[i], ++ kbuf, data, time_stamp); ++ ++ /* increment to read next event */ ++ log(LOG_DEBUG, "kbuffer_next_event\n"); ++ kbuffer_next_event(kbuf, NULL); ++ } ++ } else { ++ count_nready++; ++ } ++ } ++ ++ /* ++ * If count_nready == n_cpus, there is no cpu fd in POLLIN state, ++ * so we need to break the cycle ++ */ ++ if (count_nready == n_cpus) { ++ log(LOG_ERROR, "no cpu fd in POLLIN state, stop running\n"); ++ break; ++ } ++ } while (1); ++ ++error: ++ kbuffer_free(kbuf); ++ free(page); ++ sigprocmask(SIG_UNBLOCK, &mask, NULL); ++ ++ for (i = 0; i < (n_cpus + 1); i++) { ++ 
if (fds[i].fd > 0) ++ close(fds[i].fd); ++ } ++ ++ return -1; ++} ++ ++static int init_header_page(struct ras_events *ras, struct tep_handle *pevent) ++{ ++ int rc; ++ ++ rc = parse_header_page(ras, pevent); ++ if (rc) { ++ log(LOG_ERROR, "cannot read trace header_page: %d\n", rc); ++ return rc; ++ } ++ return 0; ++} ++ ++static int init_event_format(struct ras_events *ras, struct tep_handle *pevent, ++ char *group, char *event) ++{ ++ char *page, fname[MAX_PATH + 1]; ++ int fd, size, rc, page_size = DEFAULT_PAGE_SIZE; ++ ++ // read one page from format ++ snprintf(fname, sizeof(fname), "events/%s/%s/format", group, event); ++ fd = open_trace(ras->tracing, fname, O_RDONLY); ++ if (fd < 0) { ++ log(LOG_ERROR, ++ "Can't get %s:%s traces. Perhaps this feature is not supported on your system.\n", ++ group, event); ++ return errno; ++ } ++ ++ log(LOG_INFO, "page_size: %d\n", page_size); ++ ras->page_size = page_size; ++ page = malloc(page_size); ++ if (!page) { ++ log(LOG_ERROR, "Can't allocate page to read %s:%s format\n", ++ group, event); ++ rc = errno; ++ close(fd); ++ return rc; ++ } ++ ++ size = read(fd, page, page_size); ++ close(fd); ++ if (size < 0) { ++ log(LOG_ERROR, "Can't read format\n"); ++ free(page); ++ return size; ++ } ++ ++ // parse event format ++ rc = tep_parse_event(pevent, page, size, group); ++ if (rc) { ++ log(LOG_ERROR, "Can't parse event %s:%s\n", group, event); ++ free(page); ++ return EINVAL; ++ } ++ return 0; ++} ++ ++static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent, ++ char *group, char *event, ++ tep_event_handler_func func) ++{ ++ int rc; ++ ++ rc = init_event_format(ras, pevent, group, event); ++ if (rc) { ++ log(LOG_ERROR, "init_event_format for %s:%s failed\n", group, event); ++ return rc; ++ } ++ ++ /* Registers the special event handlers */ ++ rc = tep_register_event_handler(pevent, -1, group, event, func, ras); ++ if (rc < 0) { ++ log(LOG_ERROR, "Can't register event handler for %s:%s\n", ++ group, event); ++ return EINVAL; ++ } ++ ++ return 0; ++} ++ ++int handle_ras_events(struct ras_events *ras) ++{ ++ int rc, i; ++ unsigned cpus; ++ struct tep_handle *pevent = NULL; ++ struct pcpu_data *data = NULL; ++ ++ pevent = tep_alloc(); ++ if (!pevent) { ++ log(LOG_ERROR, "Can't allocate pevent\n"); ++ rc = errno; ++ goto err; ++ } ++ ras->pevent = pevent; ++ ++ rc = init_header_page(ras, pevent); ++ if (rc) { ++ log(LOG_ERROR, "init_header_page failed\n"); ++ goto err; ++ } ++ ++ rc = add_event_handler(ras, pevent, "ras", "non_standard_event", ++ ras_non_standard_event_handler); ++ if (rc) { ++ log(LOG_ERROR, "Can't get traces from %s:%s\n", ++ "ras", "non_standard_event"); ++ goto err; ++ } ++ log(LOG_INFO, "add_event_handler done\n"); ++ ++ cpus = get_num_cpus(); ++ data = calloc(sizeof(*data), cpus); ++ if (!data) ++ goto err; ++ ++ for (i = 0; i < cpus; i++) { ++ data[i].ras = ras; ++ data[i].cpu = i; ++ } ++ rc = read_ras_event_all_cpus(data, cpus); ++ ++err: ++ if (data) ++ free(data); ++ if (pevent) ++ tep_free(pevent); ++ return rc; ++} +diff --git a/src/c/hbm_online_repair/ras-events.h b/src/c/hbm_online_repair/ras-events.h +new file mode 100644 +index 0000000..4218d93 +--- /dev/null ++++ b/src/c/hbm_online_repair/ras-events.h +@@ -0,0 +1,28 @@ ++#ifndef __RAS_EVENTS_H ++#define __RAS_EVENTS_H ++ ++#include ++#include ++ ++#define MAX_PATH 1024 ++ ++#define DEFAULT_PAGE_SIZE 4096 ++ ++struct ras_events { ++ char tracing[MAX_PATH + 1]; ++ struct tep_handle *pevent; ++ int page_size; ++}; ++ ++struct pcpu_data { ++ struct 
tep_handle *pevent; ++ struct ras_events *ras; ++ int cpu; ++}; ++ ++/* Function prototypes */ ++int toggle_ras_event(char *trace_dir, char *group, char *event, int enable); ++int handle_ras_events(struct ras_events *ras); ++struct ras_events *init_trace_instance(void); ++ ++#endif +diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.c b/src/c/hbm_online_repair/ras-non-standard-handler.c +new file mode 100644 +index 0000000..1d1fd04 +--- /dev/null ++++ b/src/c/hbm_online_repair/ras-non-standard-handler.c +@@ -0,0 +1,81 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "ras-non-standard-handler.h" ++#include "logger.h" ++ ++static char *uuid_le(const char *uu) ++{ ++ static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; ++ if (!uu) { ++ log(LOG_ERROR, "uuid_le failed: uu is empty"); ++ return uuid; ++ } ++ size_t uu_len = strlen(uu); ++ if (uu_len < SECTION_TYPE_UUID_LEN) { ++ log(LOG_ERROR, "uuid_le failed: uu is too short"); ++ return uuid; ++ } ++ ++ char *p = uuid; ++ int i; ++ static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; ++ ++ for (i = 0; i < 16; i++) { ++ p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); ++ switch (i) { ++ case 3: ++ case 5: ++ case 7: ++ case 9: ++ *p++ = '-'; ++ break; ++ } ++ } ++ ++ *p = 0; ++ ++ return uuid; ++} ++ ++int ras_non_standard_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len; ++ unsigned long long val; ++ struct ras_non_standard_event ev; ++ ++ ev.sec_type = tep_get_field_raw(s, event, "sec_type", ++ record, &len, 1); ++ if(!ev.sec_type) { ++ log(LOG_WARNING, "get event section type failed"); ++ return -1; ++ } ++ ++ trace_seq_printf(s, "\n"); ++ trace_seq_printf(s, "sec_type: %s\n", uuid_le(ev.sec_type)); ++ ++ if (tep_get_field_val(s, event, "len", record, &val, 1) < 0) { ++ log(LOG_WARNING, "tep get field val failed"); ++ return -1; ++ } ++ ++ ev.length = val; ++ trace_seq_printf(s, "length: %d\n", ev.length); ++ ++ ev.error = tep_get_field_raw(s, event, "buf", record, &len, 1); ++ if(!ev.error || ev.length != len) { ++ log(LOG_WARNING, "get event error failed"); ++ return -1; ++ } ++ ++ if (strcmp(uuid_le(ev.sec_type), HISI_COMMON_SECTION_TYPE_UUID) == 0) { ++ decode_hisi_common_section(&ev); ++ } ++ ++ return 0; ++} +diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.h b/src/c/hbm_online_repair/ras-non-standard-handler.h +new file mode 100644 +index 0000000..0272dc1 +--- /dev/null ++++ b/src/c/hbm_online_repair/ras-non-standard-handler.h +@@ -0,0 +1,25 @@ ++#ifndef __RAS_NON_STANDARD_HANDLER_H ++#define __RAS_NON_STANDARD_HANDLER_H ++ ++#include ++#include "ras-events.h" ++ ++#define BIT(nr) (1UL << (nr)) ++ ++#define SECTION_TYPE_UUID_LEN 16 ++#define HISI_COMMON_SECTION_TYPE_UUID "c8b328a8-9917-4af6-9a13-2e08ab2e7586" ++ ++struct ras_non_standard_event { ++ char timestamp[64]; ++ const char *sec_type; ++ const uint8_t *error; ++ uint32_t length; ++}; ++ ++int ras_non_standard_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); ++ ++int decode_hisi_common_section(struct ras_non_standard_event *event); ++ ++#endif +diff --git a/src/python/.gitignore b/src/python/.gitignore +new file mode 100644 +index 0000000..58200d4 +--- /dev/null ++++ b/src/python/.gitignore +@@ -0,0 +1 @@ ++__pycache__/ +diff --git a/src/python/sentryCollector/__init__.py b/src/python/sentryCollector/__init__.py +new file mode 100644 +index 
0000000..e69de29 +diff --git a/src/python/sentryCollector/__main__.py b/src/python/sentryCollector/__main__.py +new file mode 100644 +index 0000000..9c2ae50 +--- /dev/null ++++ b/src/python/sentryCollector/__main__.py +@@ -0,0 +1,17 @@ ++# coding: utf-8 ++# Copyright (c) 2023 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. ++ ++""" ++main ++""" ++from collectd import collectd ++ ++collectd.main() +diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py +new file mode 100644 +index 0000000..b6cc75c +--- /dev/null ++++ b/src/python/sentryCollector/collect_config.py +@@ -0,0 +1,118 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. ++ ++""" ++Read and save collector.conf value. ++""" ++import configparser ++import logging ++import os ++import re ++ ++ ++COLLECT_CONF_PATH = "/etc/sysSentry/collector.conf" ++ ++CONF_COMMON = 'common' ++CONF_MODULES = 'modules' ++ ++# io ++CONF_IO = 'io' ++CONF_IO_PERIOD_TIME = 'period_time' ++CONF_IO_MAX_SAVE = 'max_save' ++CONF_IO_DISK = 'disk' ++CONF_IO_PERIOD_TIME_DEFAULT = 1 ++CONF_IO_MAX_SAVE_DEFAULT = 10 ++CONF_IO_DISK_DEFAULT = "default" ++ ++class CollectConfig: ++ def __init__(self, filename=COLLECT_CONF_PATH): ++ ++ self.filename = filename ++ self.modules = [] ++ self.module_count = 0 ++ self.load_config() ++ ++ def load_config(self): ++ if not os.path.exists(self.filename): ++ logging.error("%s is not exists", self.filename) ++ return ++ ++ try: ++ self.config = configparser.ConfigParser() ++ self.config.read(self.filename) ++ except configparser.Error: ++ logging.error("collectd configure file read failed") ++ return ++ ++ try: ++ common_config = self.config[CONF_COMMON] ++ modules_str = common_config[CONF_MODULES] ++ # remove space ++ modules_list = modules_str.replace(" ", "").split(',') ++ except KeyError as e: ++ logging.error("read config data failed, %s", e) ++ return ++ ++ pattern = r'^[a-zA-Z0-9-_]+$' ++ for module_name in modules_list: ++ if not re.match(pattern, module_name): ++ logging.warning("module_name: %s is invalid", module_name) ++ continue ++ if not self.config.has_section(module_name): ++ logging.warning("module_name: %s config is incorrect", module_name) ++ continue ++ self.modules.append(module_name) ++ ++ def load_module_config(self, module_name): ++ module_name = module_name.strip().lower() ++ if module_name in self.modules and self.config.has_section(module_name): ++ return {key.lower(): value for key, value in self.config[module_name].items()} ++ else: ++ raise ValueError(f"Module '{module_name}' not found in 
configuration") ++ ++ def get_io_config(self): ++ result_io_config = {} ++ io_map_value = self.load_module_config(CONF_IO) ++ # period_time ++ period_time = io_map_value.get(CONF_IO_PERIOD_TIME) ++ if period_time and period_time.isdigit() and int(period_time) >= 1 and int(period_time) <= 300: ++ result_io_config[CONF_IO_PERIOD_TIME] = int(period_time) ++ else: ++ logging.warning("module_name = %s section, field = %s is incorrect, use default %d", ++ CONF_IO, CONF_IO_PERIOD_TIME, CONF_IO_PERIOD_TIME_DEFAULT) ++ result_io_config[CONF_IO_PERIOD_TIME] = CONF_IO_PERIOD_TIME_DEFAULT ++ # max_save ++ max_save = io_map_value.get(CONF_IO_MAX_SAVE) ++ if max_save and max_save.isdigit() and int(max_save) >= 1 and int(max_save) <= 300: ++ result_io_config[CONF_IO_MAX_SAVE] = int(max_save) ++ else: ++ logging.warning("module_name = %s section, field = %s is incorrect, use default %d", ++ CONF_IO, CONF_IO_MAX_SAVE, CONF_IO_MAX_SAVE_DEFAULT) ++ result_io_config[CONF_IO_MAX_SAVE] = CONF_IO_MAX_SAVE_DEFAULT ++ # disk ++ disk = io_map_value.get(CONF_IO_DISK) ++ if disk: ++ disk_str = disk.replace(" ", "") ++ pattern = r'^[a-zA-Z0-9-_,]+$' ++ if not re.match(pattern, disk_str): ++ logging.warning("module_name = %s section, field = %s is incorrect, use default %s", ++ CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT) ++ disk_str = CONF_IO_DISK_DEFAULT ++ result_io_config[CONF_IO_DISK] = disk_str ++ else: ++ logging.warning("module_name = %s section, field = %s is incorrect, use default %s", ++ CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT) ++ result_io_config[CONF_IO_DISK] = CONF_IO_DISK_DEFAULT ++ logging.info("config get_io_config: %s", result_io_config) ++ return result_io_config ++ ++ def get_common_config(self): ++ return {key.lower(): value for key, value in self.config['common'].items()} +diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py +new file mode 100644 +index 0000000..104b734 +--- /dev/null ++++ b/src/python/sentryCollector/collect_io.py +@@ -0,0 +1,243 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. 
++ ++""" ++collect module ++""" ++import os ++import time ++import logging ++import threading ++ ++from .collect_config import CollectConfig ++ ++Io_Category = ["read", "write", "flush", "discard"] ++IO_GLOBAL_DATA = {} ++IO_CONFIG_DATA = [] ++ ++class IoStatus(): ++ TOTAL = 0 ++ FINISH = 1 ++ LATENCY = 2 ++ ++class CollectIo(): ++ ++ def __init__(self, module_config): ++ ++ io_config = module_config.get_io_config() ++ ++ self.period_time = io_config['period_time'] ++ self.max_save = io_config['max_save'] ++ disk_str = io_config['disk'] ++ ++ self.disk_map_stage = {} ++ self.window_value = {} ++ ++ self.loop_all = False ++ ++ if disk_str == "default": ++ self.loop_all = True ++ else: ++ self.disk_list = disk_str.strip().split(',') ++ ++ self.stop_event = threading.Event() ++ ++ IO_CONFIG_DATA.append(self.period_time) ++ IO_CONFIG_DATA.append(self.max_save) ++ ++ def get_blk_io_hierarchy(self, disk_name, stage_list): ++ stats_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/stats'.format(disk_name) ++ try: ++ with open(stats_file, 'r') as file: ++ lines = file.read() ++ except FileNotFoundError: ++ logging.error("The file %s does not exist", stats_file) ++ return -1 ++ except Exception as e: ++ logging.error("An error occurred3: %s", e) ++ return -1 ++ ++ curr_value = lines.strip().split('\n') ++ ++ for stage_val in curr_value: ++ stage = stage_val.split(' ')[0] ++ if (len(self.window_value[disk_name][stage])) >= 2: ++ self.window_value[disk_name][stage].pop(0) ++ ++ curr_stage_value = stage_val.split(' ')[1:-1] ++ self.window_value[disk_name][stage].append(curr_stage_value) ++ return 0 ++ ++ def append_period_lat(self, disk_name, stage_list): ++ for stage in stage_list: ++ if len(self.window_value[disk_name][stage]) < 2: ++ return ++ curr_stage_value = self.window_value[disk_name][stage][-1] ++ last_stage_value = self.window_value[disk_name][stage][-2] ++ ++ for index in range(len(Io_Category)): ++ # read=0, write=1, flush=2, discard=3 ++ if (len(IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]])) >= self.max_save: ++ IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]].pop() ++ ++ curr_lat = self.get_latency_value(curr_stage_value, last_stage_value, index) ++ curr_iops = self.get_iops(curr_stage_value, last_stage_value, index) ++ curr_io_length = self.get_io_length(curr_stage_value, last_stage_value, index) ++ curr_io_dump = self.get_io_dump(disk_name, stage, index) ++ ++ IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]].insert(0, [curr_lat, curr_io_dump, curr_io_length, curr_iops]) ++ ++ def get_iops(self, curr_stage_value, last_stage_value, category): ++ try: ++ finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) ++ except ValueError as e: ++ logging.error("get_iops convert to int failed, %s", e) ++ return 0 ++ value = finish / self.period_time ++ if value.is_integer(): ++ return int(value) ++ else: ++ return round(value, 1) ++ ++ def get_latency_value(self, curr_stage_value, last_stage_value, category): ++ try: ++ finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) ++ lat_time = (int(curr_stage_value[category * 3 + IoStatus.LATENCY]) - int(last_stage_value[category * 3 + IoStatus.LATENCY])) ++ except ValueError as e: ++ logging.error("get_latency_value convert to int failed, %s", e) ++ return 0 ++ if finish <= 0 or lat_time <= 0: ++ return 0 ++ value = lat_time / finish / 1000 / 1000 ++ if value.is_integer(): ++ return int(value) ++ else: ++ 
return round(value, 1) ++ ++ def get_io_length(self, curr_stage_value, last_stage_value, category): ++ try: ++ finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) ++ except ValueError as e: ++ logging.error("get_io_length convert to int failed, %s", e) ++ return 0 ++ value = finish / self.period_time / 1000 / 1000 ++ if value.is_integer(): ++ return int(value) ++ else: ++ return round(value, 1) ++ ++ def get_io_dump(self, disk_name, stage, category): ++ io_dump_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/io_dump'.format(disk_name, stage) ++ count = 0 ++ try: ++ with open(io_dump_file, 'r') as file: ++ for line in file: ++ count += line.count('.op=' + Io_Category[category]) ++ except FileNotFoundError: ++ logging.error("The file %s does not exist.", io_dump_file) ++ return count ++ except Exception as e: ++ logging.error("An error occurred1: %s", e) ++ return count ++ return count ++ ++ def extract_first_column(self, file_path): ++ column_names = [] ++ try: ++ with open(file_path, 'r') as file: ++ for line in file: ++ parts = line.strip().split() ++ if parts: ++ column_names.append(parts[0]) ++ except FileNotFoundError: ++ logging.error("The file %s does not exist.", file_path) ++ except Exception as e: ++ logging.error("An error occurred2: %s", e) ++ return column_names ++ ++ def task_loop(self): ++ if self.stop_event.is_set(): ++ logging.info("collect io thread exit") ++ return ++ ++ for disk_name, stage_list in self.disk_map_stage.items(): ++ if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: ++ continue ++ self.append_period_lat(disk_name, stage_list) ++ ++ threading.Timer(self.period_time, self.task_loop).start() ++ ++ def is_kernel_avaliable(self): ++ base_path = '/sys/kernel/debug/block' ++ for disk_name in os.listdir(base_path): ++ if not self.loop_all and disk_name not in self.disk_list: ++ continue ++ ++ disk_path = os.path.join(base_path, disk_name) ++ blk_io_hierarchy_path = os.path.join(disk_path, 'blk_io_hierarchy') ++ ++ if not os.path.exists(blk_io_hierarchy_path): ++ logging.error("no blk_io_hierarchy directory found in %s, skipping.", disk_name) ++ continue ++ ++ for file_name in os.listdir(blk_io_hierarchy_path): ++ file_path = os.path.join(blk_io_hierarchy_path, file_name) ++ ++ if file_name == 'stats': ++ stage_list = self.extract_first_column(file_path) ++ self.disk_map_stage[disk_name] = stage_list ++ self.window_value[disk_name] = {} ++ IO_GLOBAL_DATA[disk_name] = {} ++ ++ return len(IO_GLOBAL_DATA) != 0 ++ ++ def main_loop(self): ++ logging.info("collect io thread start") ++ ++ if not self.is_kernel_avaliable() or len(self.disk_map_stage) == 0: ++ logging.warning("no disks meet the requirements. 
collect io thread exits") ++ return ++ ++ for disk_name, stage_list in self.disk_map_stage.items(): ++ for stage in stage_list: ++ self.window_value[disk_name][stage] = [] ++ IO_GLOBAL_DATA[disk_name][stage] = {} ++ for category in Io_Category: ++ IO_GLOBAL_DATA[disk_name][stage][category] = [] ++ ++ while True: ++ start_time = time.time() ++ ++ if self.stop_event.is_set(): ++ logging.debug("collect io thread exit") ++ return ++ ++ for disk_name, stage_list in self.disk_map_stage.items(): ++ if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: ++ continue ++ self.append_period_lat(disk_name, stage_list) ++ ++ elapsed_time = time.time() - start_time ++ sleep_time = self.period_time - elapsed_time ++ if sleep_time < 0: ++ continue ++ while sleep_time > 1: ++ if self.stop_event.is_set(): ++ logging.debug("collect io thread exit") ++ return ++ time.sleep(1) ++ sleep_time -= 1 ++ time.sleep(sleep_time) ++ ++ # set stop event, notify thread exit ++ def stop_thread(self): ++ logging.debug("collect io thread is preparing to exit") ++ self.stop_event.set() +diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py +new file mode 100644 +index 0000000..49ce0a8 +--- /dev/null ++++ b/src/python/sentryCollector/collect_plugin.py +@@ -0,0 +1,276 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. ++ ++""" ++collcet plugin ++""" ++import json ++import socket ++import logging ++import re ++ ++COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" ++ ++# data length param ++CLT_MSG_HEAD_LEN = 9 #3+2+4 ++CLT_MSG_PRO_LEN = 2 ++CLT_MSG_MAGIC_LEN = 3 ++CLT_MSG_LEN_LEN = 4 ++ ++CLT_MAGIC = "CLT" ++RES_MAGIC = "RES" ++ ++# disk limit ++LIMIT_DISK_CHAR_LEN = 32 ++LIMIT_DISK_LIST_LEN = 10 ++ ++# stage limit ++LIMIT_STAGE_CHAR_LEN = 20 ++LIMIT_STAGE_LIST_LEN = 15 ++ ++#iotype limit ++LIMIT_IOTYPE_CHAR_LEN = 7 ++LIMIT_IOTYPE_LIST_LEN = 4 ++ ++#period limit ++LIMIT_PERIOD_MIN_LEN = 1 ++LIMIT_PERIOD_MAX_LEN = 300 ++ ++# interface protocol ++class ClientProtocol(): ++ IS_IOCOLLECT_VALID = 0 ++ GET_IO_DATA = 1 ++ PRO_END = 3 ++ ++class ResultMessage(): ++ RESULT_SUCCEED = 0 ++ RESULT_UNKNOWN = 1 # unknown error ++ RESULT_NOT_PARAM = 2 # the parameter does not exist or the type does not match. ++ RESULT_INVALID_LENGTH = 3 # invalid parameter length. ++ RESULT_EXCEED_LIMIT = 4 # the parameter length exceeds the limit. 
++ RESULT_PARSE_FAILED = 5 # parse failed ++ RESULT_INVALID_CHAR = 6 # invalid char ++ ++Result_Messages = { ++ ResultMessage.RESULT_SUCCEED: "Succeed", ++ ResultMessage.RESULT_UNKNOWN: "Unknown error", ++ ResultMessage.RESULT_NOT_PARAM: "The parameter does not exist or the type does not match", ++ ResultMessage.RESULT_INVALID_LENGTH: "Invalid parameter length", ++ ResultMessage.RESULT_EXCEED_LIMIT: "The parameter length exceeds the limit", ++ ResultMessage.RESULT_PARSE_FAILED: "Parse failed", ++ ResultMessage.RESULT_INVALID_CHAR: "Invalid char" ++} ++ ++ ++def client_send_and_recv(request_data, data_str_len, protocol): ++ """client socket send and recv message""" ++ try: ++ client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) ++ except socket.error: ++ print("collect_plugin: client creat socket error") ++ return None ++ ++ try: ++ client_socket.connect(COLLECT_SOCKET_PATH) ++ except OSError: ++ client_socket.close() ++ print("collect_plugin: client connect error") ++ return None ++ ++ req_data_len = len(request_data) ++ request_msg = CLT_MAGIC + str(protocol).zfill(CLT_MSG_PRO_LEN) + str(req_data_len).zfill(CLT_MSG_LEN_LEN) + request_data ++ ++ try: ++ client_socket.send(request_msg.encode()) ++ res_data = client_socket.recv(len(RES_MAGIC) + CLT_MSG_PRO_LEN + data_str_len) ++ res_data = res_data.decode() ++ except (OSError, UnicodeError): ++ client_socket.close() ++ print("collect_plugin: client communicate error") ++ return None ++ ++ res_magic = res_data[:CLT_MSG_MAGIC_LEN] ++ if res_magic != "RES": ++ print("res msg format error") ++ return None ++ ++ protocol_str = res_data[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN] ++ try: ++ protocol_id = int(protocol_str) ++ except ValueError: ++ print("recv msg protocol id is invalid %s", protocol_str) ++ return None ++ ++ if protocol_id >= ClientProtocol.PRO_END: ++ print("protocol id is invalid") ++ return None ++ ++ try: ++ res_data_len = int(res_data[CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN:]) ++ res_msg_data = client_socket.recv(res_data_len) ++ res_msg_data = res_msg_data.decode() ++ return res_msg_data ++ except (OSError, ValueError, UnicodeError): ++ print("collect_plugin: client recv res msg error") ++ finally: ++ client_socket.close() ++ ++ return None ++ ++def validate_parameters(param, len_limit, char_limit): ++ ret = ResultMessage.RESULT_SUCCEED ++ if not param: ++ print("parm is invalid") ++ ret = ResultMessage.RESULT_NOT_PARAM ++ return [False, ret] ++ ++ if not isinstance(param, list): ++ print(f"{param} is not list type.") ++ ret = ResultMessage.RESULT_NOT_PARAM ++ return [False, ret] ++ ++ if len(param) <= 0: ++ print(f"{param} length is 0.") ++ ret = ResultMessage.RESULT_INVALID_LENGTH ++ return [False, ret] ++ ++ if len(param) > len_limit: ++ print(f"{param} length more than {len_limit}") ++ ret = ResultMessage.RESULT_EXCEED_LIMIT ++ return [False, ret] ++ ++ pattern = r'^[a-zA-Z0-9_-]+$' ++ for info in param: ++ if len(info) > char_limit: ++ print(f"{info} length more than {char_limit}") ++ ret = ResultMessage.RESULT_EXCEED_LIMIT ++ return [False, ret] ++ if not re.match(pattern, info): ++ print(f"{info} is invalid char") ++ ret = ResultMessage.RESULT_INVALID_CHAR ++ return [False, ret] ++ ++ return [True, ret] ++ ++def is_iocollect_valid(period, disk_list=None, stage=None): ++ result = inter_is_iocollect_valid(period, disk_list, stage) ++ error_code = result['ret'] ++ if error_code != ResultMessage.RESULT_SUCCEED: ++ result['message'] = Result_Messages[error_code] ++ return result ++ ++def 
inter_is_iocollect_valid(period, disk_list=None, stage=None): ++ result = {} ++ result['ret'] = ResultMessage.RESULT_UNKNOWN ++ result['message'] = "" ++ ++ if not period or not isinstance(period, int): ++ result['ret'] = ResultMessage.RESULT_NOT_PARAM ++ return result ++ if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN: ++ result['ret'] = ResultMessage.RESULT_INVALID_LENGTH ++ return result ++ ++ if not disk_list: ++ disk_list = [] ++ else: ++ res = validate_parameters(disk_list, LIMIT_DISK_LIST_LEN, LIMIT_DISK_CHAR_LEN) ++ if not res[0]: ++ result['ret'] = res[1] ++ return result ++ ++ if not stage: ++ stage = [] ++ else: ++ res = validate_parameters(stage, LIMIT_STAGE_LIST_LEN, LIMIT_STAGE_CHAR_LEN) ++ if not res[0]: ++ result['ret'] = res[1] ++ return result ++ ++ req_msg_struct = { ++ 'disk_list': json.dumps(disk_list), ++ 'period': period, ++ 'stage': json.dumps(stage) ++ } ++ request_message = json.dumps(req_msg_struct) ++ result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.IS_IOCOLLECT_VALID) ++ if not result_message: ++ print("collect_plugin: client_send_and_recv failed") ++ return result ++ ++ try: ++ json.loads(result_message) ++ except json.JSONDecodeError: ++ print("is_iocollect_valid: json decode error") ++ result['ret'] = ResultMessage.RESULT_PARSE_FAILED ++ return result ++ ++ result['ret'] = ResultMessage.RESULT_SUCCEED ++ result['message'] = result_message ++ return result ++ ++def get_io_data(period, disk_list, stage, iotype): ++ result = inter_get_io_data(period, disk_list, stage, iotype) ++ error_code = result['ret'] ++ if error_code != ResultMessage.RESULT_SUCCEED: ++ result['message'] = Result_Messages[error_code] ++ return result ++ ++def inter_get_io_data(period, disk_list, stage, iotype): ++ result = {} ++ result['ret'] = ResultMessage.RESULT_UNKNOWN ++ result['message'] = "" ++ ++ if not isinstance(period, int): ++ result['ret'] = ResultMessage.RESULT_NOT_PARAM ++ return result ++ if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN: ++ result['ret'] = ResultMessage.RESULT_INVALID_LENGTH ++ return result ++ ++ res = validate_parameters(disk_list, LIMIT_DISK_LIST_LEN, LIMIT_DISK_CHAR_LEN) ++ if not res[0]: ++ result['ret'] = res[1] ++ return result ++ ++ res = validate_parameters(stage, LIMIT_STAGE_LIST_LEN, LIMIT_STAGE_CHAR_LEN) ++ if not res[0]: ++ result['ret'] = res[1] ++ return result ++ ++ res = validate_parameters(iotype, LIMIT_IOTYPE_LIST_LEN, LIMIT_IOTYPE_CHAR_LEN) ++ if not res[0]: ++ result['ret'] = res[1] ++ return result ++ ++ req_msg_struct = { ++ 'disk_list': json.dumps(disk_list), ++ 'period': period, ++ 'stage': json.dumps(stage), ++ 'iotype' : json.dumps(iotype) ++ } ++ ++ request_message = json.dumps(req_msg_struct) ++ result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.GET_IO_DATA) ++ if not result_message: ++ print("collect_plugin: client_send_and_recv failed") ++ return result ++ try: ++ json.loads(result_message) ++ except json.JSONDecodeError: ++ print("get_io_data: json decode error") ++ result['ret'] = ResultMessage.RESULT_PARSE_FAILED ++ return result ++ ++ result['ret'] = ResultMessage.RESULT_SUCCEED ++ result['message'] = result_message ++ return result ++ +diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py +new file mode 100644 +index 0000000..bab4e56 +--- /dev/null ++++ b/src/python/sentryCollector/collect_server.py +@@ -0,0 +1,285 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei 
Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. ++ ++""" ++listen module ++""" ++import sys ++import signal ++import traceback ++import socket ++import os ++import json ++import logging ++import fcntl ++import select ++import threading ++import time ++ ++from .collect_io import IO_GLOBAL_DATA, IO_CONFIG_DATA ++from .collect_config import CollectConfig ++ ++SENTRY_RUN_DIR = "/var/run/sysSentry" ++COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" ++ ++# socket param ++CLT_LISTEN_QUEUE_LEN = 5 ++SERVER_EPOLL_TIMEOUT = 0.3 ++ ++# data length param ++CLT_MSG_HEAD_LEN = 9 #3+2+4 ++CLT_MSG_PRO_LEN = 2 ++CLT_MSG_MAGIC_LEN = 3 ++CLT_MSG_LEN_LEN = 4 ++ ++# data flag param ++CLT_MAGIC = "CLT" ++RES_MAGIC = "RES" ++ ++# interface protocol ++class ServerProtocol(): ++ IS_IOCOLLECT_VALID = 0 ++ GET_IO_DATA = 1 ++ PRO_END = 3 ++ ++class CollectServer(): ++ ++ def __init__(self): ++ ++ self.io_global_data = {} ++ ++ self.stop_event = threading.Event() ++ ++ def is_iocollect_valid(self, data_struct): ++ ++ result_rev = {} ++ self.io_global_data = IO_GLOBAL_DATA ++ ++ if len(IO_CONFIG_DATA) == 0: ++ logging.error("the collect thread is not started, the data is invalid. ") ++ return json.dumps(result_rev) ++ ++ period_time = IO_CONFIG_DATA[0] ++ max_save = IO_CONFIG_DATA[1] ++ ++ disk_list = json.loads(data_struct['disk_list']) ++ period = int(data_struct['period']) ++ stage_list = json.loads(data_struct['stage']) ++ ++ if (period < period_time) or (period > period_time * max_save) or (period % period_time): ++ logging.error("is_iocollect_valid: period time: %d is invalid", period) ++ return json.dumps(result_rev) ++ ++ for disk_name, stage_info in self.io_global_data.items(): ++ if len(disk_list) > 0 and disk_name not in disk_list: ++ continue ++ result_rev[disk_name] = [] ++ if len(stage_list) == 0: ++ result_rev[disk_name] = list(stage_info.keys()) ++ continue ++ for stage_name, stage_data in stage_info.items(): ++ if stage_name in stage_list: ++ result_rev[disk_name].append(stage_name) ++ ++ return json.dumps(result_rev) ++ ++ def get_io_data(self, data_struct): ++ result_rev = {} ++ self.io_global_data = IO_GLOBAL_DATA ++ ++ if len(IO_CONFIG_DATA) == 0: ++ logging.error("the collect thread is not started, the data is invalid. 
") ++ return json.dumps(result_rev) ++ period_time = IO_CONFIG_DATA[0] ++ max_save = IO_CONFIG_DATA[1] ++ ++ period = int(data_struct['period']) ++ disk_list = json.loads(data_struct['disk_list']) ++ stage_list = json.loads(data_struct['stage']) ++ iotype_list = json.loads(data_struct['iotype']) ++ ++ if (period < period_time) or (period > period_time * max_save) or (period % period_time): ++ logging.error("get_io_data: period time: %d is invalid", period) ++ return json.dumps(result_rev) ++ ++ collect_index = period // period_time - 1 ++ logging.debug("period: %d, collect_index: %d", period, collect_index) ++ ++ for disk_name, stage_info in self.io_global_data.items(): ++ if disk_name not in disk_list: ++ continue ++ result_rev[disk_name] = {} ++ for stage_name, iotype_info in stage_info.items(): ++ if len(stage_list) > 0 and stage_name not in stage_list: ++ continue ++ result_rev[disk_name][stage_name] = {} ++ for iotype_name, iotype_info in iotype_info.items(): ++ if iotype_name not in iotype_list: ++ continue ++ if len(iotype_info) < collect_index: ++ continue ++ result_rev[disk_name][stage_name][iotype_name] = iotype_info[collect_index] ++ ++ return json.dumps(result_rev) ++ ++ def msg_data_process(self, msg_data, protocal_id): ++ """message data process""" ++ logging.debug("msg_data %s", msg_data) ++ protocol_name = msg_data[0] ++ try: ++ data_struct = json.loads(msg_data) ++ except json.JSONDecodeError: ++ logging.error("msg data process: json decode error") ++ return "Request message decode failed" ++ ++ if protocal_id == ServerProtocol.IS_IOCOLLECT_VALID: ++ res_msg = self.is_iocollect_valid(data_struct) ++ elif protocal_id == ServerProtocol.GET_IO_DATA: ++ res_msg = self.get_io_data(data_struct) ++ ++ return res_msg ++ ++ def msg_head_process(self, msg_head): ++ """message head process""" ++ ctl_magic = msg_head[:CLT_MSG_MAGIC_LEN] ++ if ctl_magic != CLT_MAGIC: ++ logging.error("recv msg head magic invalid") ++ return None ++ ++ protocol_str = msg_head[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN] ++ try: ++ protocol_id = int(protocol_str) ++ except ValueError: ++ logging.error("recv msg protocol id is invalid") ++ return None ++ ++ data_len_str = msg_head[CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN:CLT_MSG_HEAD_LEN] ++ try: ++ data_len = int(data_len_str) ++ except ValueError: ++ logging.error("recv msg data len is invalid %s", data_len_str) ++ return None ++ ++ return [protocol_id, data_len] ++ ++ def server_recv(self, server_socket: socket.socket): ++ """server receive""" ++ try: ++ client_socket, _ = server_socket.accept() ++ logging.debug("server_fd listen ok") ++ except socket.error: ++ logging.error("server accept failed, %s", socket.error) ++ return ++ ++ try: ++ msg_head = client_socket.recv(CLT_MSG_HEAD_LEN) ++ logging.debug("recv msg head: %s", msg_head.decode()) ++ head_info = self.msg_head_process(msg_head.decode()) ++ except (OSError, UnicodeError): ++ client_socket.close() ++ logging.error("server recv HEAD failed") ++ return ++ ++ protocol_id = head_info[0] ++ data_len = head_info[1] ++ logging.debug("msg protocol id: %d, data length: %d", protocol_id, data_len) ++ if protocol_id >= ServerProtocol.PRO_END: ++ client_socket.close() ++ logging.error("protocol id is invalid") ++ return ++ ++ if data_len < 0: ++ client_socket.close() ++ logging.error("msg head parse failed") ++ return ++ ++ try: ++ msg_data = client_socket.recv(data_len) ++ msg_data_decode = msg_data.decode() ++ logging.debug("msg data %s", msg_data_decode) ++ except (OSError, UnicodeError): ++ 
client_socket.close() ++ logging.error("server recv MSG failed") ++ return ++ ++ res_data = self.msg_data_process(msg_data_decode, protocol_id) ++ logging.debug("res data %s", res_data) ++ ++ # server send ++ res_head = RES_MAGIC ++ res_head += str(protocol_id).zfill(CLT_MSG_PRO_LEN) ++ res_data_len = str(len(res_data)).zfill(CLT_MSG_LEN_LEN) ++ res_head += res_data_len ++ logging.debug("res head %s", res_head) ++ ++ res_msg = res_head + res_data ++ logging.debug("res msg %s", res_msg) ++ ++ try: ++ client_socket.send(res_msg.encode()) ++ except OSError: ++ logging.error("server recv failed") ++ finally: ++ client_socket.close() ++ return ++ ++ def server_fd_create(self): ++ """create server fd""" ++ if not os.path.exists(SENTRY_RUN_DIR): ++ logging.error("%s not exist, failed", SENTRY_RUN_DIR) ++ return None ++ ++ try: ++ server_fd = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) ++ server_fd.setblocking(False) ++ if os.path.exists(COLLECT_SOCKET_PATH): ++ os.remove(COLLECT_SOCKET_PATH) ++ ++ server_fd.bind(COLLECT_SOCKET_PATH) ++ os.chmod(COLLECT_SOCKET_PATH, 0o600) ++ server_fd.listen(CLT_LISTEN_QUEUE_LEN) ++ logging.debug("%s bind and listen", COLLECT_SOCKET_PATH) ++ except socket.error: ++ logging.error("server fd create failed") ++ server_fd = None ++ ++ return server_fd ++ ++ ++ def server_loop(self): ++ """main loop""" ++ logging.info("collect listen thread start") ++ server_fd = self.server_fd_create() ++ if not server_fd: ++ return ++ ++ epoll_fd = select.epoll() ++ epoll_fd.register(server_fd.fileno(), select.EPOLLIN) ++ ++ logging.debug("start server_loop loop") ++ while True: ++ if self.stop_event.is_set(): ++ logging.debug("collect listen thread exit") ++ server_fd = None ++ return ++ try: ++ events_list = epoll_fd.poll(SERVER_EPOLL_TIMEOUT) ++ for event_fd, _ in events_list: ++ if event_fd == server_fd.fileno(): ++ self.server_recv(server_fd) ++ else: ++ continue ++ except socket.error: ++ pass ++ ++ def stop_thread(self): ++ logging.debug("collect listen thread is preparing to exit") ++ self.stop_event.set() +diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py +new file mode 100644 +index 0000000..3a836df +--- /dev/null ++++ b/src/python/sentryCollector/collectd.py +@@ -0,0 +1,97 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. ++ ++""" ++main loop for collect. 
++""" ++import sys ++import signal ++import traceback ++import socket ++import os ++import json ++import logging ++import fcntl ++import select ++ ++import threading ++ ++from .collect_io import CollectIo ++from .collect_server import CollectServer ++from .collect_config import CollectConfig ++ ++SENTRY_RUN_DIR = "/var/run/sysSentry" ++COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" ++SENTRY_RUN_DIR_PERM = 0o750 ++ ++COLLECT_LOG_FILE = "/var/log/sysSentry/collector.log" ++Thread_List = [] ++Module_Map_Class = {"io" : CollectIo} ++ ++def remove_sock_file(): ++ try: ++ os.unlink(COLLECT_SOCKET_PATH) ++ except FileNotFoundError: ++ pass ++ ++def sig_handler(signum, _f): ++ if signum not in (signal.SIGINT, signal.SIGTERM): ++ return ++ for i in range(len(Thread_List)): ++ Thread_List[i][0].stop_thread() ++ ++ remove_sock_file() ++ ++def main(): ++ """main ++ """ ++ if not os.path.exists(SENTRY_RUN_DIR): ++ os.mkdir(SENTRY_RUN_DIR) ++ os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM) ++ ++ logging.basicConfig(filename=COLLECT_LOG_FILE, level=logging.INFO) ++ os.chmod(COLLECT_LOG_FILE, 0o600) ++ ++ try: ++ signal.signal(signal.SIGINT, sig_handler) ++ signal.signal(signal.SIGTERM, sig_handler) ++ ++ logging.info("finish main parse_args") ++ ++ module_config = CollectConfig() ++ module_list = module_config.modules ++ ++ # listen thread ++ cs = CollectServer() ++ listen_thread = threading.Thread(target=cs.server_loop) ++ listen_thread.start() ++ Thread_List.append([cs, listen_thread]) ++ ++ # collect thread ++ for info in module_list: ++ class_name = Module_Map_Class.get(info) ++ if not class_name: ++ logging.info("%s correspond to class is not exists", info) ++ continue ++ cn = class_name(module_config) ++ collect_thread = threading.Thread(target=cn.main_loop) ++ collect_thread.start() ++ Thread_List.append([cn, collect_thread]) ++ ++ for i in range(len(Thread_List)): ++ Thread_List[i][1].join() ++ ++ except Exception: ++ logging.error('%s', traceback.format_exc()) ++ finally: ++ pass ++ ++ logging.info("All threads have finished. Main thread is exiting.") +\ No newline at end of file +diff --git a/src/python/sentryPlugins/__init__.py b/src/python/sentryPlugins/__init__.py +new file mode 100644 +index 0000000..e69de29 +diff --git a/src/python/sentryPlugins/avg_block_io/__init__.py b/src/python/sentryPlugins/avg_block_io/__init__.py +new file mode 100644 +index 0000000..e69de29 +diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py +new file mode 100644 +index 0000000..ff2071d +--- /dev/null ++++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py +@@ -0,0 +1,257 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. 
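The collectd entry point above treats collection modules generically: every name listed in [common] modules that has a class registered in Module_Map_Class is instantiated with the parsed CollectConfig and driven in its own thread, and sig_handler shuts it down through stop_thread(). A rough sketch of the interface a hypothetical extra module would have to provide ("foo" and CollectFoo are made-up names used only for illustration):

    import threading

    class CollectFoo:
        def __init__(self, module_config):
            self.stop_event = threading.Event()

        def main_loop(self):
            # periodic collection work; poll self.stop_event to exit promptly
            ...

        def stop_thread(self):
            self.stop_event.set()

    # registration: Module_Map_Class = {"io": CollectIo, "foo": CollectFoo}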
++import logging ++import signal ++import configparser ++import time ++ ++from .stage_window import IoWindow, IoDumpWindow ++from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler ++from .utils import update_avg_and_check_abnormal ++ ++CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" ++ ++def log_invalid_keys(not_in_list, keys_name, config_list, default_list): ++ """print invalid log""" ++ if config_list and default_list: ++ logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list)) ++ elif config_list == ["default"]: ++ logging.warning("Default {} use {}".format(keys_name, default_list)) ++ ++ ++def read_config_common(config): ++ """read config file, get [common] section value""" ++ try: ++ common_sec = config['common'] ++ except configparser.NoSectionError: ++ report_alarm_fail("Cannot find common section in config file") ++ ++ try: ++ period_time = int(common_sec.get("period_time", 1)) ++ if not (1 <= period_time <= 300): ++ raise ValueError("Invalid period_time") ++ except ValueError: ++ period_time = 1 ++ logging.warning("Invalid period_time, set to 1s") ++ ++ disk = common_sec.get('disk').split(",") if common_sec.get('disk') not in [None, 'default'] else [] ++ stage = common_sec.get('stage').split(",") if common_sec.get('stage') not in [None, 'default'] else [] ++ ++ if len(disk) > 10: ++ logging.warning("Too many disks, record only max 10 disks") ++ disk = disk[:10] ++ ++ iotype = common_sec.get('iotype', 'read,write').split(",") ++ iotype_list = [rw.lower() for rw in iotype if rw.lower() in ['read', 'write', 'flush', 'discard']] ++ err_iotype = [rw for rw in iotype if rw.lower() not in ['read', 'write', 'flush', 'discard']] ++ ++ if err_iotype: ++ logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list)) ++ ++ return period_time, disk, stage, iotype_list ++ ++ ++def read_config_algorithm(config): ++ """read config file, get [algorithm] section value""" ++ if not config.has_section("algorithm"): ++ report_alarm_fail("Cannot find algorithm section in config file") ++ ++ try: ++ win_size = int(config.get("algorithm", "win_size")) ++ if not (1 <= win_size <= 300): ++ raise ValueError("Invalid win_size") ++ win_threshold = int(config.get("algorithm", "win_threshold")) ++ if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size: ++ raise ValueError("Invalid win_threshold") ++ except ValueError: ++ report_alarm_fail("Invalid win_threshold or win_size") ++ ++ return win_size, win_threshold ++ ++ ++def read_config_lat_iodump(io_dic, config): ++ """read config file, get [latency] [iodump] section value""" ++ common_param = {} ++ for io_type in io_dic["iotype_list"]: ++ common_param[io_type] = {} ++ ++ latency_keys = { ++ "avg_lim": "{}_avg_lim".format(io_type), ++ "avg_time": "{}_avg_time".format(io_type), ++ "tot_lim": "{}_tot_lim".format(io_type), ++ } ++ iodump_key = "{}_iodump_lim".format(io_type) ++ ++ for key_suffix, key_template in latency_keys.items(): ++ if key_template in config["latency"] and config["latency"][key_template].isdecimal(): ++ common_param[io_type][key_template] = int(config["latency"][key_template]) ++ ++ if iodump_key in config["iodump"] and config["iodump"][iodump_key].isdecimal(): ++ common_param[io_type][iodump_key] = int(config["iodump"][iodump_key]) ++ ++ return common_param ++ ++ ++def read_config_stage(config, stage, iotype_list): ++ """read config file, get [STAGE_NAME] section 
value""" ++ res = {} ++ if not stage in config: ++ return res ++ ++ for key in config[stage]: ++ if config[stage][key].isdecimal(): ++ res[key] = int(config[stage][key]) ++ ++ return res ++ ++ ++def init_io_win(io_dic, config, common_param): ++ """initialize windows of latency, iodump, and dict of avg_value""" ++ iotype_list = io_dic["iotype_list"] ++ io_data = {} ++ io_avg_value = {} ++ for disk_name in io_dic["disk_list"]: ++ io_data[disk_name] = {} ++ io_avg_value[disk_name] = {} ++ for stage_name in io_dic["stage_list"]: ++ io_data[disk_name][stage_name] = {} ++ io_avg_value[disk_name][stage_name] = {} ++ # step3. 解析stage配置 ++ curr_stage_param = read_config_stage(config, stage_name, iotype_list) ++ for rw in iotype_list: ++ io_data[disk_name][stage_name][rw] = {} ++ io_avg_value[disk_name][stage_name][rw] = [0, 0] ++ ++ # 对每个rw创建latency和iodump窗口 ++ avg_lim_key = "{}_avg_lim".format(rw) ++ avg_time_key = "{}_avg_time".format(rw) ++ tot_lim_key = "{}_tot_lim".format(rw) ++ iodump_lim_key = "{}_iodump_lim".format(rw) ++ ++ # 获取值,优先从 curr_stage_param 获取,如果不存在,则从 common_param 获取 ++ avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(rw, {}).get(avg_lim_key)) ++ avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(rw, {}).get(avg_time_key)) ++ tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(rw, {}).get(tot_lim_key)) ++ iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get(rw, {}).get(iodump_lim_key)) ++ ++ if avg_lim_value and avg_time_value and tot_lim_value: ++ io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value) ++ ++ if iodump_lim_value is not None: ++ io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value) ++ return io_data, io_avg_value ++ ++ ++def get_valid_disk_stage_list(io_dic, config_disk, config_stage): ++ """get disk_list and stage_list by sentryCollector""" ++ json_data = avg_is_iocollect_valid(io_dic, config_disk, config_stage) ++ ++ all_disk_set = json_data.keys() ++ all_stage_set = set() ++ for disk_stage_list in json_data.values(): ++ all_stage_set.update(disk_stage_list) ++ ++ disk_list = [key for key in config_disk if key in all_disk_set] ++ not_in_disk_list = [key for key in config_disk if key not in all_disk_set] ++ ++ stage_list = [key for key in config_stage if key in all_stage_set] ++ not_in_stage_list = [key for key in config_stage if key not in all_stage_set] ++ ++ if not config_disk: ++ disk_list = [key for key in all_disk_set] ++ ++ if not config_stage: ++ stage_list = [key for key in all_stage_set] ++ ++ if config_disk and not disk_list: ++ logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk)) ++ disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage) ++ ++ if config_stage and not stage_list: ++ logging.warning("Cannot get valid stage by stage={}, set to default".format(config_stage)) ++ disk_list, stage_list = get_valid_disk_stage_list(io_dic, config_disk, []) ++ ++ if not stage_list or not disk_list: ++ report_alarm_fail("Cannot get valid disk name or stage name.") ++ ++ log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list) ++ log_invalid_keys(not_in_stage_list, 'stage', config_stage, stage_list) ++ ++ return disk_list, stage_list ++ ++ ++def 
main_loop(io_dic, io_data, io_avg_value): ++ """main loop of avg_block_io""" ++ period_time = io_dic["period_time"] ++ disk_list = io_dic["disk_list"] ++ stage_list = io_dic["stage_list"] ++ iotype_list = io_dic["iotype_list"] ++ win_size = io_dic["win_size"] ++ # 开始循环 ++ while True: ++ # 等待x秒 ++ time.sleep(period_time) ++ ++ # 采集模块对接,获取周期数据 ++ curr_period_data = avg_get_io_data(io_dic) ++ ++ # 处理周期数据 ++ reach_size = False ++ for disk_name in disk_list: ++ for stage_name in stage_list: ++ for rw in iotype_list: ++ if disk_name in curr_period_data and stage_name in curr_period_data[disk_name] and rw in curr_period_data[disk_name][stage_name]: ++ io_key = (disk_name, stage_name, rw) ++ reach_size = update_avg_and_check_abnormal(curr_period_data, io_key, win_size, io_avg_value, io_data) ++ ++ # win_size不满时不进行告警判断 ++ if not reach_size: ++ continue ++ ++ # 判断异常窗口、异常场景 ++ for disk_name in disk_list: ++ for rw in iotype_list: ++ process_report_data(disk_name, rw, io_data) ++ ++ ++def main(): ++ """main func""" ++ # 注册停止信号-2/-15 ++ signal.signal(signal.SIGINT, sig_handler) ++ signal.signal(signal.SIGTERM, sig_handler) ++ ++ # 初始化配置读取 ++ config = configparser.ConfigParser(comment_prefixes=('#', ';')) ++ try: ++ config.read(CONFIG_FILE) ++ except configparser.Error: ++ report_alarm_fail("Failed to read config file") ++ ++ io_dic = {} ++ ++ # 读取配置文件 -- common段 ++ io_dic["period_time"], disk, stage, io_dic["iotype_list"] = read_config_common(config) ++ ++ # 采集模块对接,is_iocollect_valid() ++ io_dic["disk_list"], io_dic["stage_list"] = get_valid_disk_stage_list(io_dic, disk, stage) ++ ++ if "bio" not in io_dic["stage_list"]: ++ report_alarm_fail("Cannot run avg_block_io without bio stage") ++ ++ # 初始化窗口 -- config读取,对应is_iocollect_valid返回的结果 ++ # step1. 解析公共配置 --- algorithm ++ io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config) ++ ++ # step2. 循环创建窗口 ++ common_param = read_config_lat_iodump(io_dic, config) ++ io_data, io_avg_value = init_io_win(io_dic, config, common_param) ++ ++ main_loop(io_dic, io_data, io_avg_value) +diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py +new file mode 100644 +index 0000000..0da4208 +--- /dev/null ++++ b/src/python/sentryPlugins/avg_block_io/module_conn.py +@@ -0,0 +1,86 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. 
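One detail of init_io_win above worth noting: a per-stage section overrides the per-iotype defaults read from [latency] and [iodump]. A stand-alone illustration of that lookup order (the section name and the numbers are made up):

    # what read_config_lat_iodump() might return for iotype "read"
    common_param = {"read": {"read_avg_lim": 10, "read_avg_time": 3, "read_tot_lim": 50}}
    # what read_config_stage() might return for a hypothetical [rq_driver] section
    curr_stage_param = {"read_avg_lim": 20}

    # the same expression init_io_win uses: stage value first, common value as fallback
    avg_lim_value = curr_stage_param.get("read_avg_lim",
                                         common_param.get("read", {}).get("read_avg_lim"))
    print(avg_lim_value)  # 20 for this stage; 10 for any stage without an override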
++import json ++import logging ++import sys ++import time ++ ++from .utils import is_abnormal ++from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages ++from syssentry.result import ResultLevel, report_result ++ ++ ++TASK_NAME = "avg_block_io" ++ ++def sig_handler(signum, _f): ++ """stop avg_block_io""" ++ report_result(TASK_NAME, ResultLevel.PASS, json.dumps({})) ++ logging.info("Finished avg_block_io plugin running.") ++ sys.exit(0) ++ ++def avg_get_io_data(io_dic): ++ """get_io_data from sentryCollector""" ++ res = get_io_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"]) ++ return check_result_validation(res, 'get io data') ++ ++ ++def avg_is_iocollect_valid(io_dic, config_disk, config_stage): ++ """is_iocollect_valid from sentryCollector""" ++ res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage) ++ return check_result_validation(res, 'check config validation') ++ ++ ++def check_result_validation(res, reason): ++ """check validation of result from sentryCollector""" ++ if not 'ret' in res or not 'message' in res: ++ err_msg = "Failed to {}: Cannot connect to sentryCollector.".format(reason) ++ report_alarm_fail(err_msg) ++ if res['ret'] != 0: ++ err_msg = "Failed to {}: {}".format(reason, Result_Messages[res['ret']]) ++ report_alarm_fail(err_msg) ++ ++ try: ++ json_data = json.loads(res['message']) ++ except json.JSONDecodeError: ++ err_msg = "Failed to {}: invalid return message".format(reason) ++ report_alarm_fail(err_msg) ++ ++ return json_data ++ ++ ++def report_alarm_fail(alarm_info): ++ """report result to xalarmd""" ++ report_result(TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": alarm_info})) ++ logging.error(alarm_info) ++ sys.exit(1) ++ ++ ++def process_report_data(disk_name, rw, io_data): ++ """check abnormal window and report to xalarm""" ++ if not is_abnormal((disk_name, 'bio', rw), io_data): ++ return ++ ++ ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq'] ++ for stage_name in ctrl_stage: ++ if is_abnormal((disk_name, stage_name, rw), io_data): ++ logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw)) ++ return ++ ++ if is_abnormal((disk_name, 'rq_driver', rw), io_data): ++ logging.warning("{} - {} - {} report driver".format(time.ctime(), disk_name, rw)) ++ return ++ ++ kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue'] ++ for stage_name in kernel_stage: ++ if is_abnormal((disk_name, stage_name, rw), io_data): ++ logging.warning("{} - {} - {} report kernel".format(time.ctime(), disk_name, rw)) ++ return ++ logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw)) +diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py +new file mode 100644 +index 0000000..9b0ce79 +--- /dev/null ++++ b/src/python/sentryPlugins/avg_block_io/stage_window.py +@@ -0,0 +1,47 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. 
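check_result_validation above only accepts a reply that carries both 'ret' and 'message', with ret equal to 0 and message holding JSON text; anything else ends in report_alarm_fail(). A self-contained illustration of the accepted shape (the disk and stage names are example values):

    import json

    res = {"ret": 0, "message": json.dumps({"sda": ["bio", "rq_driver"]})}
    assert "ret" in res and "message" in res and res["ret"] == 0
    print(json.loads(res["message"]))   # {'sda': ['bio', 'rq_driver']}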
++ ++class AbnormalWindowBase: ++ def __init__(self, window_size=10, window_threshold=7): ++ self.window_size = window_size ++ self.window_threshold = window_threshold ++ self.abnormal_window = [False] * window_size ++ ++ def append_new_period(self, ab_res, avg_val=0): ++ self.abnormal_window.pop(0) ++ if self.is_abnormal_period(ab_res, avg_val): ++ self.abnormal_window.append(True) ++ else: ++ self.abnormal_window.append(False) ++ ++ def is_abnormal_window(self): ++ return sum(self.abnormal_window) > self.window_threshold ++ ++ ++class IoWindow(AbnormalWindowBase): ++ def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40): ++ super().__init__(window_size, window_threshold) ++ self.abnormal_multiple = abnormal_multiple ++ self.abnormal_multiple_lim = abnormal_multiple_lim ++ self.abnormal_time = abnormal_time ++ ++ def is_abnormal_period(self, value, avg_val): ++ return (value > avg_val * self.abnormal_multiple and value > self.abnormal_multiple_lim) or \ ++ (value > self.abnormal_time) ++ ++ ++class IoDumpWindow(AbnormalWindowBase): ++ def __init__(self, window_size=10, window_threshold=7, abnormal_time=40): ++ super().__init__(window_size, window_threshold) ++ self.abnormal_time = abnormal_time ++ ++ def is_abnormal_period(self, value, avg_val=0): ++ return value > self.abnormal_time +diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py +new file mode 100644 +index 0000000..54ed080 +--- /dev/null ++++ b/src/python/sentryPlugins/avg_block_io/utils.py +@@ -0,0 +1,86 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. 
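The window classes above flag a problem only once enough periods inside the sliding window are individually abnormal. A usage sketch with made-up thresholds, assuming IoWindow is imported from the installed plugin package:

    from sentryPlugins.avg_block_io.stage_window import IoWindow

    win = IoWindow(window_size=30, window_threshold=6,
                   abnormal_multiple=3, abnormal_multiple_lim=10, abnormal_time=50)
    for _ in range(7):                # seven periods at 60 ms latency against a 5 ms average
        win.append_new_period(60, 5)  # 60 > 5 * 3 and 60 > 10, so each period counts as abnormal
    print(win.is_abnormal_window())   # True: 7 abnormal periods exceed the threshold of 6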
++AVG_VALUE = 0 ++AVG_COUNT = 1 ++ ++ ++def get_nested_value(data, keys): ++ """get data from nested dict""" ++ for key in keys: ++ if key in data: ++ data = data[key] ++ else: ++ return None ++ return data ++ ++ ++def set_nested_value(data, keys, value): ++ """set data to nested dict""" ++ for key in keys[:-1]: ++ if key in data: ++ data = data[key] ++ else: ++ return False ++ data[keys[-1]] = value ++ return True ++ ++ ++def is_abnormal(io_key, io_data): ++ """check if latency and iodump win abnormal""" ++ for key in ['latency', 'iodump']: ++ all_keys = get_nested_value(io_data, io_key) ++ if all_keys and key in all_keys: ++ win = get_nested_value(io_data, io_key + (key,)) ++ if win and win.is_abnormal_window(): ++ return True ++ return False ++ ++ ++def update_io_avg(old_avg, period_value, win_size): ++ """update average of latency window""" ++ if old_avg[AVG_COUNT] < win_size: ++ new_avg_count = old_avg[AVG_COUNT] + 1 ++ new_avg_value = (old_avg[AVG_VALUE] * old_avg[AVG_COUNT] + period_value[0]) / new_avg_count ++ else: ++ new_avg_count = old_avg[AVG_COUNT] ++ new_avg_value = (old_avg[AVG_VALUE] * (old_avg[AVG_COUNT] - 1) + period_value[0]) / new_avg_count ++ return [new_avg_value, new_avg_count] ++ ++ ++def update_io_data(old_avg, period_value, win_size, io_data, io_key): ++ """update data of latency and iodump window""" ++ all_wins = get_nested_value(io_data, io_key) ++ if all_wins and "latency" in all_wins: ++ io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE]) ++ if all_wins and "iodump" in all_wins: ++ io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1]) ++ ++ ++def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data): ++ """update avg and check abonrmal, return true if win_size full""" ++ period_value = get_nested_value(data, io_key) ++ old_avg = get_nested_value(io_avg_value, io_key) ++ ++ # 更新avg数据 ++ if old_avg[AVG_COUNT] < win_size: ++ set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size)) ++ return False ++ ++ # 更新win数据 -- 判断异常周期 ++ update_io_data(old_avg, period_value, win_size, io_data, io_key) ++ all_wins = get_nested_value(io_data, io_key) ++ if all_wins and 'latency' not in all_wins: ++ return True ++ period = get_nested_value(io_data, io_key + ("latency",)) ++ if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]): ++ return True ++ set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size)) ++ return True +diff --git a/src/python/setup.py b/src/python/setup.py +index f96a96e..c28c691 100644 +--- a/src/python/setup.py ++++ b/src/python/setup.py +@@ -31,7 +31,9 @@ setup( + 'console_scripts': [ + 'cpu_sentry=syssentry.cpu_sentry:main', + 'syssentry=syssentry.syssentry:main', +- 'xalarmd=xalarm.xalarm_daemon:alarm_process_create' ++ 'xalarmd=xalarm.xalarm_daemon:alarm_process_create', ++ 'sentryCollector=sentryCollector.collectd:main', ++ 'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main' + ] + }, + ) +diff --git a/src/python/syssentry/bmc_alarm.py b/src/python/syssentry/bmc_alarm.py +new file mode 100644 +index 0000000..5956538 +--- /dev/null ++++ b/src/python/syssentry/bmc_alarm.py +@@ -0,0 +1,159 @@ ++import logging ++import socket ++from enum import Enum ++ ++from .utils import execute_command ++ ++HEX_CHAR_LEN = 2 ++SOCKET_RECEIVE_LEN = 128 ++BMC_DATA_HEAD = "REP" ++BMC_REPORT_TYPE_BIT = 0 ++HBMC_REPAIR_TYPE_BIT = 1 ++HBMC_REPAIR_RESULT_BIT = 2 
++HBMC_ISOLATION_TYPE_BIT = 3 ++HBMC_SEND_HEAD_LEN = 4 # "ipmtool", "raw", "0x30", "0x92" ++HBMC_SEND_ROW_BIT = 26 + HBMC_SEND_HEAD_LEN ++HBMC_SEND_COL_BIT = 30 + HBMC_SEND_HEAD_LEN ++HBMC_REPAIR_TYPE_OFFSET = 7 ++ ++HBMC_SEND_SUCCESS_CODE = "db 07 00" ++ ++ ++class ReportType(Enum): ++ HBMC_REPAIR_BMC = 0x00 ++ ++ ++class HBMCRepairType(Enum): ++ CE_ACLS = 7 ++ PS_UCE_ACLS = 8 ++ CE_SPPR = 9 ++ PS_UCE_SPPR = 10 ++ ++ ++class HBMCRepairResultType(Enum): ++ ISOLATE_FAILED_OVER_THRESHOLD = 0b10000001 ++ ISOLATE_FAILED_OTHER_REASON = 0b10000010 ++ REPAIR_FAILED_NO_RESOURCE = 0b10010100 ++ REPAIR_FAILED_INVALID_PARAM = 0b10011000 ++ REPAIR_FAILED_OTHER_REASON = 0b10011100 ++ ONLINE_PAGE_FAILED = 0b10100000 ++ ISOLATE_REPAIR_ONLINE_SUCCESS = 0b00000000 ++ ++ ++class HBMCIsolationType(Enum): ++ ROW_FAULT = 1 ++ SINGLE_ADDR_FAULT = 6 ++ ++ ++def find_value_is_in_enum(value: int, enum: Enum): ++ for item in enum: ++ if value == item.value: ++ return True ++ return False ++ ++ ++def convert_hex_char_to_int(data, bit): ++ if len(data) < (bit+1)*HEX_CHAR_LEN: ++ logging.error(f"Data {data} len is too short, current convert bit is {bit}") ++ char = data[bit*HEX_CHAR_LEN:(bit+1)*HEX_CHAR_LEN] ++ try: ++ value = int(char, 16) ++ except ValueError: ++ logging.error(f"Cannot convert char [{char}] to int") ++ raise ValueError ++ return value ++ ++ ++def reverse_byte(data): ++ return data[3], data[2], data[1], data[0] ++ ++ ++def parse_hbmc_report(data: str): ++ logging.debug(f"bmc receive raw data is {data}") ++ repair_type = convert_hex_char_to_int(data, HBMC_REPAIR_TYPE_BIT) ++ repair_type += HBMC_REPAIR_TYPE_OFFSET ++ if not find_value_is_in_enum(repair_type, HBMCRepairType): ++ logging.warning(f"HBMC msg repair type ({repair_type}) is unknown") ++ raise ValueError ++ ++ repair_result = convert_hex_char_to_int(data, HBMC_REPAIR_RESULT_BIT) ++ if not find_value_is_in_enum(repair_result, HBMCRepairResultType): ++ logging.warning(f"HBMC msg repair result ({repair_result}) is unknown") ++ raise ValueError ++ ++ isolation_type = convert_hex_char_to_int(data, HBMC_ISOLATION_TYPE_BIT) ++ if not find_value_is_in_enum(isolation_type, HBMCIsolationType): ++ logging.warning(f"HBMC msg isolation type ({isolation_type}) is unknown") ++ raise ValueError ++ ++ cmd_list = [ ++ "ipmitool", ++ "raw", ++ "0x30", # Netfn ++ "0x92", # cmd ++ "0xdb", ++ "0x07", ++ "0x00", ++ "0x65", # sub command ++ "0x01", # SystemId ++ "0x00", # LocalSystemId ++ "{:#04X}".format(repair_type), ++ "{:#04X}".format(repair_result), ++ "{:#04X}".format(isolation_type), ++ ] ++ # send the remain data directly ++ data = data[(HBMC_ISOLATION_TYPE_BIT + 1) * HEX_CHAR_LEN:] ++ other_info_str = [] ++ for i in range(len(data) // 2): ++ other_info_str.append("{:#04X}".format(convert_hex_char_to_int(data, i))) ++ cmd_list.extend(other_info_str) ++ ++ cmd_list[HBMC_SEND_ROW_BIT:HBMC_SEND_ROW_BIT + 4] = reverse_byte(cmd_list[HBMC_SEND_ROW_BIT:HBMC_SEND_ROW_BIT + 4]) ++ cmd_list[HBMC_SEND_COL_BIT:HBMC_SEND_COL_BIT + 4] = reverse_byte(cmd_list[HBMC_SEND_COL_BIT:HBMC_SEND_COL_BIT + 4]) ++ ++ logging.info(f"Send bmc alarm command is {cmd_list}") ++ ++ ret = execute_command(cmd_list) ++ if HBMC_SEND_SUCCESS_CODE not in ret: ++ logging.warning(f"Send bmc alarm failed, error code is {ret}") ++ raise ValueError ++ logging.debug("Send bmc alarm success") ++ ++ ++PARSE_REPORT_MSG_FUNC_DICT = { ++ ReportType.HBMC_REPAIR_BMC.value: parse_hbmc_report, ++} ++ ++ ++def bmc_recv(server_socket: socket.socket): ++ logging.debug("Get hbm socket connection request") ++ try: 
++ client_socket, _ = server_socket.accept() ++ logging.debug("cpu alarm fd listen ok") ++ ++ data = client_socket.recv(SOCKET_RECEIVE_LEN) ++ data = data.decode() ++ ++ data_head = data[0:len(BMC_DATA_HEAD)] ++ if data_head != BMC_DATA_HEAD: ++ logging.warning(f"The head of the msg is incorrect, head is {data_head}") ++ raise ValueError ++ ++ # remove the data head ++ data = data[len(BMC_DATA_HEAD):] ++ logging.info(f"Remove head data is {data}") ++ ++ report_type = convert_hex_char_to_int(data, BMC_REPORT_TYPE_BIT) ++ if report_type not in PARSE_REPORT_MSG_FUNC_DICT.keys(): ++ logging.warning(f"The type of the msg ({report_type}) is unknown") ++ raise ValueError ++ ++ PARSE_REPORT_MSG_FUNC_DICT[report_type](data) ++ ++ except socket.error: ++ logging.error("socket error") ++ return ++ except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError): ++ logging.error("server recv bmc msg failed!") ++ client_socket.close() ++ return +diff --git a/src/python/syssentry/callbacks.py b/src/python/syssentry/callbacks.py +index d0d0719..b38b381 100644 +--- a/src/python/syssentry/callbacks.py ++++ b/src/python/syssentry/callbacks.py +@@ -53,7 +53,7 @@ def task_stop(mod_name): + return "failed", "mod is not enabled" + logging.info("%s stop", mod_name) + if task.runtime_status == EXITED_STATUS: +- return "success", "task already stoped" ++ return "success", "task already stopped" + if task.runtime_status == WAITING_STATUS: + set_runtime_status(task.name, EXITED_STATUS) + return "success", "" +diff --git a/src/python/syssentry/cpu_alarm.py b/src/python/syssentry/cpu_alarm.py +index 0b1642b..1fce462 100644 +--- a/src/python/syssentry/cpu_alarm.py ++++ b/src/python/syssentry/cpu_alarm.py +@@ -249,3 +249,4 @@ def cpu_alarm_recv(server_socket: socket.socket): + + upload_bmc(_type, module, command, event_type, socket_id, core_id) + ++ +diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py +index 2f18d14..72925eb 100644 +--- a/src/python/syssentry/cpu_sentry.py ++++ b/src/python/syssentry/cpu_sentry.py +@@ -26,8 +26,6 @@ CPU_SENTRY_PARAM_CONFIG = "/etc/sysSentry/plugins/cpu_sentry.ini" + # Inspection commands running at the bottom layer + LOW_LEVEL_INSPECT_CMD = "cat-cli" + +-# max length of msg in details +-DETAILS_LOG_MSG_MAX_LEN = 255 + + class CpuSentry: + """ +@@ -96,10 +94,22 @@ class CpuSentry: + self.send_result["details"]["msg"] = "cpu_sentry task is killed!" 
+ return + ++ if "ERROR" in stdout: ++ self.send_result["result"] = ResultLevel.FAIL ++ self.send_result["details"]["code"] = 1004 ++ ++ # Remove ANSI escape sequences ++ error_info = stdout.split("\n")[0] ++ if error_info.startswith("\u001b"): ++ ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' ++ error_info = re.sub(ansi_escape, '', error_info) ++ ++ self.send_result["details"]["msg"] = error_info ++ return ++ + out_split = stdout.split("\n") +- isolated_cores_number = -1 ++ isolated_cores_number = 0 + found_fault_cores_list = [] +- error_msg_list = [] + for out_line_i in out_split: + if "handle_patrol_result: Found fault cores" in out_line_i: + cores_number_tmp = out_line_i.split("Found fault cores:")[1] +@@ -111,25 +121,9 @@ class CpuSentry: + elif out_line_i.startswith(''): + self.send_result["details"]["isolated_cpu_list"] = out_line_i.split(':')[1] + break +- elif "ERROR" in out_line_i: +- logging.error("[cat-cli error] - %s\n", out_line_i) +- error_msg_list.append(out_line_i) + + found_fault_cores_number = len(set(found_fault_cores_list)) +- if isolated_cores_number == -1: +- self.send_result["result"] = ResultLevel.FAIL +- self.send_result["details"]["code"] = 1004 +- +- send_error_msg = "" +- # Remove ANSI escape sequences +- for error_info in error_msg_list: +- if error_info.startswith("\u001b"): +- ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' +- error_info = re.sub(ansi_escape, '', error_info) +- if len(send_error_msg) + len(error_info) < DETAILS_LOG_MSG_MAX_LEN: +- send_error_msg += ";" + error_info +- self.send_result["details"]["msg"] = send_error_msg +- elif found_fault_cores_number == 0: ++ if found_fault_cores_number == 0: + self.send_result["details"]["code"] = 0 + self.send_result["result"] = ResultLevel.PASS + elif 0 in found_fault_cores_list: +diff --git a/src/python/syssentry/cron_process.py b/src/python/syssentry/cron_process.py +index 50780b3..f161f1f 100644 +--- a/src/python/syssentry/cron_process.py ++++ b/src/python/syssentry/cron_process.py +@@ -21,7 +21,7 @@ import subprocess + from .utils import get_current_time_string + from .result import ResultLevel, RESULT_LEVEL_ERR_MSG_DICT + from .global_values import InspectTask +-from .task_map import TasksMap, PERIOD_TYPE, ONESHOT_TYPE ++from .task_map import TasksMap, PERIOD_TYPE + from .mod_status import set_runtime_status, WAITING_STATUS, RUNNING_STATUS, \ + FAILED_STATUS, EXITED_STATUS + +diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py +index 776971f..debff4e 100644 +--- a/src/python/syssentry/syssentry.py ++++ b/src/python/syssentry/syssentry.py +@@ -44,6 +44,12 @@ try: + except ImportError: + CPU_EXIST = False + ++BMC_EXIST = True ++try: ++ from .bmc_alarm import bmc_recv ++except ImportError: ++ BMC_EXIST = False ++ + + INSPECTOR = None + +@@ -83,6 +89,9 @@ RESULT_SOCKET_PATH = "/var/run/sysSentry/result.sock" + + CPU_ALARM_SOCKET_PATH = "/var/run/sysSentry/report.sock" + ++BMC_SOCKET_PATH = "/var/run/sysSentry/bmc.sock" ++ ++fd_list = [] + + def msg_data_process(msg_data): + """message data process""" +@@ -325,6 +334,41 @@ def cpu_alarm_fd_create(): + + return cpu_alarm_fd + ++def bmc_fd_create(): ++ """create bmc fd""" ++ if not os.path.exists(SENTRY_RUN_DIR): ++ logging.debug("%s not exist", SENTRY_RUN_DIR) ++ return None ++ ++ try: ++ bmc_fd = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) ++ except socket.error: ++ logging.error("bmc fd create failed") ++ return None ++ ++ bmc_fd.setblocking(False) ++ if os.path.exists(BMC_SOCKET_PATH): ++ 
os.remove(BMC_SOCKET_PATH) ++ ++ try: ++ bmc_fd.bind(BMC_SOCKET_PATH) ++ except OSError: ++ logging.error("bmc fd bind failed") ++ bmc_fd.close() ++ return None ++ ++ os.chmod(BMC_SOCKET_PATH, 0o600) ++ try: ++ bmc_fd.listen(5) ++ except OSError: ++ logging.error("bmc fd listen failed") ++ bmc_fd.close() ++ return None ++ ++ logging.debug("%s bind and listen", BMC_SOCKET_PATH) ++ ++ return bmc_fd ++ + + def server_result_recv(server_socket: socket.socket): + """server result receive""" +@@ -398,35 +442,47 @@ def server_result_fd_create(): + return server_result_fd + + ++def close_all_fd(): ++ for fd in fd_list: ++ fd.close() ++ ++ + def main_loop(): + """main loop""" ++ + server_fd = server_fd_create() + if not server_fd: ++ close_all_fd() + return ++ fd_list.append(server_fd) + + server_result_fd = server_result_fd_create() + if not server_result_fd: +- server_fd.close() ++ close_all_fd() + return ++ fd_list.append(server_result_fd) + + heartbeat_fd = heartbeat_fd_create() + if not heartbeat_fd: +- server_fd.close() +- server_result_fd.close() ++ close_all_fd() + return ++ fd_list.append(heartbeat_fd) + + cpu_alarm_fd = cpu_alarm_fd_create() + if not cpu_alarm_fd: +- server_fd.close() +- heartbeat_fd.close() +- server_result_fd.close() ++ close_all_fd() + return ++ fd_list.append(cpu_alarm_fd) ++ ++ bmc_fd = bmc_fd_create() ++ if not bmc_fd: ++ close_all_fd() ++ return ++ fd_list.append(bmc_fd) + + epoll_fd = select.epoll() +- epoll_fd.register(server_fd.fileno(), select.EPOLLIN) +- epoll_fd.register(server_result_fd.fileno(), select.EPOLLIN) +- epoll_fd.register(heartbeat_fd.fileno(), select.EPOLLIN) +- epoll_fd.register(cpu_alarm_fd.fileno(), select.EPOLLIN) ++ for fd in fd_list: ++ epoll_fd.register(fd.fileno(), select.EPOLLIN) + + logging.debug("start main loop") + # onstart_tasks_handle() +@@ -449,6 +505,8 @@ def main_loop(): + heartbeat_recv(heartbeat_fd) + elif CPU_EXIST and event_fd == cpu_alarm_fd.fileno(): + cpu_alarm_recv(cpu_alarm_fd) ++ elif BMC_EXIST and event_fd == bmc_fd.fileno(): ++ bmc_recv(bmc_fd) + else: + continue + +@@ -587,4 +645,3 @@ def main(): + logging.error('%s', traceback.format_exc()) + finally: + release_pidfile() +- +diff --git a/src/python/xalarm/xalarm_api.py b/src/python/xalarm/xalarm_api.py +index 94d7638..a2cdb25 100644 +--- a/src/python/xalarm/xalarm_api.py ++++ b/src/python/xalarm/xalarm_api.py +@@ -98,7 +98,7 @@ class Xalarm: + """msg1 setter + """ + if len(msg) > 512: +- raise ValueError("msg1 length must below 255") ++ raise ValueError("msg1 length must below 512") + self._msg1 = msg + + +-- +2.27.0 + diff --git a/sysSentry.spec b/sysSentry.spec index 1497a74..3d21a4b 100644 --- a/sysSentry.spec +++ b/sysSentry.spec @@ -4,7 +4,7 @@ Summary: System Inspection Framework Name: sysSentry Version: 1.0.2 -Release: 15 +Release: 16 License: Mulan PSL v2 Group: System Environment/Daemons Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz @@ -26,6 +26,7 @@ Patch13: optimize-the-handing-of-cat-cli-error-msg-in-cpu_sentry.patch Patch14: over-threshold-should-be-warn-level-log-in-cat-cli.patch Patch15: add-separator-to-err-info.patch Patch16: remove-threshold-max-cpu-cores.patch +Patch17: add-hbm-online-repair.patch BuildRequires: cmake gcc-c++ BuildRequires: python3 python3-setuptools @@ -62,6 +63,16 @@ Recommends: ipmitool %description -n cpu_sentry This package provides CPU fault detection +%package -n hbm_online_repair +Summary: hbm_online_repair for the sysSentry +Provides: hbm_online_repair = %{version} 
+BuildRequires: libtraceevent-devel +Requires: libtraceevent ipmitool +Requires: sysSentry = %{version}-%{release} + +%description -n hbm_online_repair +This package provides hbm_online_repair for the sysSentry. + %prep %autosetup -n %{name}-%{version} -p1 @@ -81,6 +92,11 @@ make popd popd +# hbm_online_repair +pushd src/c/hbm_online_repair +make +popd + %install # sysSentry mkdir -p %{buildroot}%{_bindir} @@ -109,6 +125,12 @@ install config/plugins/cpu_sentry.ini %{buildroot}/etc/sysSentry/plugins/cpu_sen install src/c/catcli/catlib/build/cat-cli %{buildroot}%{_bindir}/cat-cli install src/c/catcli/catlib/build/plugin/cpu_patrol/libcpu_patrol.so %{buildroot}%{_libdir} +# hbm_online_repair +mkdir -p %{buildroot}/etc/sysconfig/ +install config/tasks/hbm_online_repair.mod %{buildroot}/etc/sysSentry/tasks/ +install src/c/hbm_online_repair/hbm_online_repair %{buildroot}%{_bindir} +install src/c/hbm_online_repair/hbm_online_repair.env %{buildroot}/etc/sysconfig/hbm_online_repair.env + chrpath -d %{buildroot}%{_bindir}/cat-cli chrpath -d %{buildroot}%{_libdir}/libcpu_patrol.so @@ -166,6 +188,11 @@ rm -rf %{buildroot} %exclude %{python3_sitelib}/syssentry/cpu_* %exclude %{python3_sitelib}/syssentry/*/cpu_* +# hbm repair module +%exclude %{_sysconfdir}/sysSentry/tasks/hbm_online_repair.mod +%exclude %{python3_sitelib}/syssentry/bmc_* +%exclude %{python3_sitelib}/syssentry/*/bmc_* + %files -n libxalarm %attr(0550,root,root) %{_libdir}/libxalarm.so @@ -182,7 +209,19 @@ rm -rf %{buildroot} %attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/cpu_sentry.ini %attr(0550,root,root) %{python3_sitelib}/syssentry/cpu_* +%files -n hbm_online_repair +%attr(0550,root,root) %{_bindir}/hbm_online_repair +%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysconfig/hbm_online_repair.env +%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/hbm_online_repair.mod +%attr(0550,root,root) %{python3_sitelib}/syssentry/bmc_alarm.py + %changelog +* Mon Oct 21 2024 luckky - 1.0.2-16 +- Type:requirement +- CVE:NA +- SUG:NA +- DESC:add hbm_online_repair + * Wed Sep 25 2024 shixuantong - 1.0.2-15 - Type:bugfix - CVE:NA -- Gitee
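
A note for reviewers on the byte layout assumed by parse_hbmc_report() above: each byte of the BMC payload is addressed by index through convert_hex_char_to_int(), and the 4-byte row/column addresses are byte-swapped with reverse_byte() before being handed to ipmitool. The standalone sketch below restates those two helpers under the assumption that HEX_CHAR_LEN is 2 (two hex characters per byte); that constant and the sample payload are not part of this hunk, so treat the values as illustrative only. Unlike the patched helper, this version raises immediately when the payload is too short instead of logging and falling through.

# Standalone illustration of the hex-string helpers used by parse_hbmc_report().
# Assumption: HEX_CHAR_LEN == 2, i.e. each payload byte is encoded as two hex
# characters with no separators. The payload below is invented.

HEX_CHAR_LEN = 2  # assumed value; the real constant lives earlier in bmc_alarm.py


def convert_hex_char_to_int(data: str, bit: int) -> int:
    """Return byte number `bit` of a hex string as an int."""
    if len(data) < (bit + 1) * HEX_CHAR_LEN:
        raise ValueError(f"data too short for byte index {bit}")
    return int(data[bit * HEX_CHAR_LEN:(bit + 1) * HEX_CHAR_LEN], 16)


def reverse_byte(chunk):
    """Reverse a 4-item sequence, e.g. to swap a little-endian row/column address."""
    return chunk[3], chunk[2], chunk[1], chunk[0]


if __name__ == "__main__":
    payload = "02000106aabbccdd"                            # invented example
    print(convert_hex_char_to_int(payload, 0))              # 2
    print(convert_hex_char_to_int(payload, 3))              # 6
    print(reverse_byte(["0xAA", "0xBB", "0xCC", "0xDD"]))   # ('0xDD', '0xCC', '0xBB', '0xAA')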
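
The listener added by bmc_fd_create() and bmc_recv() expects a SOCK_STREAM client on /var/run/sysSentry/bmc.sock that sends BMC_DATA_HEAD followed by a hex-encoded repair report. BMC_DATA_HEAD and the exact report layout are defined earlier in bmc_alarm.py and are not visible in this hunk, so the header and payload in the sketch below are placeholders rather than working values; it only shows how a local test client would talk to the socket.

# Hypothetical test client for the bmc.sock listener added in syssentry.py and
# bmc_alarm.py. The header string and payload bytes are placeholders; substitute
# the real BMC_DATA_HEAD and a report that parse_hbmc_report() accepts.
import socket

BMC_SOCKET_PATH = "/var/run/sysSentry/bmc.sock"   # path used by bmc_fd_create()
DATA_HEAD = "BMC"                                 # placeholder, not the real value
PAYLOAD_HEX = "00" * 40                           # placeholder report body


def send_fake_report():
    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
        sock.connect(BMC_SOCKET_PATH)             # the daemon must be running (root-owned socket)
        sock.sendall((DATA_HEAD + PAYLOAD_HEX).encode())


if __name__ == "__main__":
    send_fake_report()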
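
The main_loop() rework above replaces the per-socket cleanup ladder with a shared fd_list, a single close_all_fd() bail-out path, and one registration loop over the epoll instance. The minimal sketch below shows the same shape in isolation with invented socket paths; it is not sysSentry code, just the pattern the refactor follows.

# Minimal sketch of the fd_list/epoll pattern used by the reworked main_loop():
# create each listener, close everything created so far if one fails, register
# the survivors once, then dispatch on fileno(). Socket paths are invented.
import os
import select
import socket

fd_list = []


def close_all_fd():
    for fd in fd_list:
        fd.close()


def make_listener(path):
    try:
        fd = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        if os.path.exists(path):
            os.remove(path)
        fd.bind(path)
        fd.listen(5)
        return fd
    except OSError:
        return None


def main_loop():
    for path in ("/tmp/demo_a.sock", "/tmp/demo_b.sock"):   # invented paths
        fd = make_listener(path)
        if not fd:
            close_all_fd()
            return
        fd_list.append(fd)

    epoll_fd = select.epoll()
    for fd in fd_list:
        epoll_fd.register(fd.fileno(), select.EPOLLIN)

    while True:
        for event_fd, _ in epoll_fd.poll(1):
            for fd in fd_list:
                if event_fd == fd.fileno():
                    conn, _ = fd.accept()
                    conn.close()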