From dc10d90dfd4fc6cc9fcd95da2cc616b0614aedc8 Mon Sep 17 00:00:00 2001 From: shixuantong Date: Tue, 11 Jun 2024 16:47:46 +0800 Subject: [PATCH 01/24] fix version in setup.py --- src/python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/setup.py b/src/python/setup.py index 21dbe9f..f96a96e 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -17,7 +17,7 @@ from setuptools import setup, find_packages setup( name="syssentry", - version="1.0.1", + version="1.0.2", description="System inspection framework tool set", packages=find_packages(), include_package_data=True, -- Gitee From a52c0a51e047a1dba0a0ce4e1b71445ab06ddc5b Mon Sep 17 00:00:00 2001 From: shixuantong Date: Wed, 24 Jul 2024 16:17:54 +0800 Subject: [PATCH 02/24] Fix the problem that function cpu_report_result() is called more than once when task is running, user to exec "sentryctl stop cpu_sentry", cpu_report_result() will be called twice. This will cause the log to be printed twice --- src/python/syssentry/cpu_sentry.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py index 7e77654..3c4d58d 100644 --- a/src/python/syssentry/cpu_sentry.py +++ b/src/python/syssentry/cpu_sentry.py @@ -133,6 +133,7 @@ class CpuSentry: result_level = self.send_result.get("result", ResultLevel.FAIL) report_result(task_name, result_level, details) + self.init_send_result() def kill_process(signum, _f, cpu_sentry_obj): """kill process by 'pkill -9'""" @@ -179,6 +180,6 @@ def main(): cpu_sentry_task.send_result["result"] = ResultLevel.FAIL cpu_sentry_task.send_result["details"]["code"] = 1004 cpu_sentry_task.send_result["details"]["msg"] = "run cmd [%s] raise Error" % cpu_sentry_task_cmd - finally: cpu_sentry_task.cpu_report_result() - cpu_sentry_task.init_send_result() + else: + cpu_sentry_task.cpu_report_result() -- Gitee From 0e7b82b86c8d9e5b2d7406b9246c4f003c3e622a Mon Sep 17 00:00:00 2001 
From: shixuantong Date: Wed, 24 Jul 2024 17:53:58 +0800 Subject: [PATCH 03/24] fix error handling --- src/python/syssentry/cpu_sentry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py index 3c4d58d..d0bafa8 100644 --- a/src/python/syssentry/cpu_sentry.py +++ b/src/python/syssentry/cpu_sentry.py @@ -87,7 +87,7 @@ class CpuSentry: } def handle_cpu_output(self, stdout: str): - if "" in stdout: + if "ERROR" in stdout: self.send_result["result"] = ResultLevel.FAIL self.send_result["details"]["code"] = 1004 self.send_result["details"]["msg"] = stdout.split("\n")[0] -- Gitee From 7ec585b5f71d47244ffc85dc98c6ef809b7160da Mon Sep 17 00:00:00 2001 From: shixuantong Date: Fri, 26 Jul 2024 15:59:42 +0800 Subject: [PATCH 04/24] fix result when process output is None --- src/python/syssentry/cpu_sentry.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py index d0bafa8..9287e2f 100644 --- a/src/python/syssentry/cpu_sentry.py +++ b/src/python/syssentry/cpu_sentry.py @@ -87,11 +87,19 @@ class CpuSentry: } def handle_cpu_output(self, stdout: str): + if not stdout: + logging.error("%s process output is None, it may be killed!", LOW_LEVEL_INSPECT_CMD) + self.send_result["result"] = ResultLevel.FAIL + self.send_result["details"]["code"] = 1005 + self.send_result["details"]["msg"] = "cpu_sentry task is killed!" 
+ return + if "ERROR" in stdout: self.send_result["result"] = ResultLevel.FAIL self.send_result["details"]["code"] = 1004 self.send_result["details"]["msg"] = stdout.split("\n")[0] return + out_split = stdout.split("\n") isolated_cores_number = 0 found_fault_cores_list = [] -- Gitee From a5cc5bb73fc22092d20429acfaf79ae960d0157f Mon Sep 17 00:00:00 2001 From: jwolf <523083921@qq.com> Date: Mon, 22 Jul 2024 14:58:27 +0800 Subject: [PATCH 05/24] cpu_utility and cpu_patrol must be an integer --- src/c/catcli/catlib/cli_param_checker.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c index a1aa636..e400428 100644 --- a/src/c/catcli/catlib/cli_param_checker.c +++ b/src/c/catcli/catlib/cli_param_checker.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include "cli_common.h" @@ -13,7 +14,7 @@ void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs) { long cpu_utility = strtol(getopt_optarg, NULL, DECIMAL); - if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX) { + if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) { strncpy(errs->patrol_module_err, "\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN); } @@ -68,7 +69,7 @@ void checkset_cpulist(char *getopt_optarg, catcli_request_body *p_request_body, void checkset_patrol_time(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs) { long second = strtol(getopt_optarg, NULL, DECIMAL); - if (second <= 0 || second > INT_MAX) { + if (second <= 0 || second > INT_MAX || strchr(getopt_optarg, '.') != NULL) { strncpy(errs->patrol_time_err, "\"patrol_second\" must be a number in the range of (0,INT_MAX] ,correct \"-t, --patrol_second\"\n", MAX_ERR_LEN); -- Gitee From 
31bde41eb4f02a0af3be72e7e229abaa3305e6c2 Mon Sep 17 00:00:00 2001 From: jwolf <523083921@qq.com> Date: Fri, 30 Aug 2024 14:30:46 +0800 Subject: [PATCH 06/24] must be integer --- src/c/catcli/catlib/cli_param_checker.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c index e400428..5b38402 100644 --- a/src/c/catcli/catlib/cli_param_checker.c +++ b/src/c/catcli/catlib/cli_param_checker.c @@ -17,8 +17,9 @@ void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_r if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) { strncpy(errs->patrol_module_err, "\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN); + } else { + p_request_body->cpu_utility = (int)cpu_utility; } - p_request_body->cpu_utility = (int)cpu_utility; } void checkset_cpulist(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs) @@ -73,8 +74,9 @@ void checkset_patrol_time(char *getopt_optarg, catcli_request_body *p_request_bo strncpy(errs->patrol_time_err, "\"patrol_second\" must be a number in the range of (0,INT_MAX] ,correct \"-t, --patrol_second\"\n", MAX_ERR_LEN); + } else { + p_request_body->patrol_second = (int)second; } - p_request_body->patrol_second = (int)second; } void checkset_patrol_type(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs) -- Gitee From 676178cbbe7ca2166b4c34971f26b1fff46f8501 Mon Sep 17 00:00:00 2001 From: jwolf <523083921@qq.com> Date: Fri, 30 Aug 2024 16:59:56 +0800 Subject: [PATCH 07/24] param must be integer --- src/c/catcli/catlib/cli_param_checker.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c index 5b38402..71edf17 100644 --- a/src/c/catcli/catlib/cli_param_checker.c +++ 
b/src/c/catcli/catlib/cli_param_checker.c @@ -17,6 +17,7 @@ void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_r if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) { strncpy(errs->patrol_module_err, "\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN); + p_request_body->cpu_utility = 0; } else { p_request_body->cpu_utility = (int)cpu_utility; } -- Gitee From c10f6716daaa427dbc55299ff3c052fe9fc8bafd Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Fri, 30 Aug 2024 19:58:41 +0800 Subject: [PATCH 08/24] add deleted code to plugin rasdaemon --- src/python/syssentry/syssentry.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py index 32b81e3..3d5cb8d 100644 --- a/src/python/syssentry/syssentry.py +++ b/src/python/syssentry/syssentry.py @@ -462,6 +462,14 @@ def main_loop(): epoll_fd.register(cpu_alarm_fd.fileno(), select.EPOLLIN) logging.debug("start main loop") + # onstart_tasks_handle() + for task_type in TasksMap.tasks_dict: + for task_name in TasksMap.tasks_dict.get(task_type): + task = TasksMap.tasks_dict.get(task_type).get(task_name) + if not task: + continue + task.onstart_handle() + while True: try: events_list = epoll_fd.poll(SERVER_EPOLL_TIMEOUT) -- Gitee From 07d8bef745a309717a342e263d6d6c4b576aa863 Mon Sep 17 00:00:00 2001 From: shixuantong Date: Sun, 11 Aug 2024 18:36:23 +0800 Subject: [PATCH 09/24] Remove ANSI escape sequences --- src/python/syssentry/cpu_sentry.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py index 9287e2f..99af127 100644 --- a/src/python/syssentry/cpu_sentry.py +++ b/src/python/syssentry/cpu_sentry.py @@ -97,7 +97,14 @@ class CpuSentry: if "ERROR" in stdout: self.send_result["result"] = ResultLevel.FAIL 
self.send_result["details"]["code"] = 1004 - self.send_result["details"]["msg"] = stdout.split("\n")[0] + + # Remove ANSI escape sequences + error_info = stdout.split("\n")[0] + if error_info.startswith("\u001b"): + ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' + error_info = re.sub(ansi_escape, '', error_info) + + self.send_result["details"]["msg"] = error_info return out_split = stdout.split("\n") -- Gitee From 65e24b5c57ae43309c308d54cc6d80de4b9707a0 Mon Sep 17 00:00:00 2001 From: gaoruoshu Date: Wed, 14 Aug 2024 21:10:20 +0800 Subject: [PATCH 10/24] split cpu_sentry and syssentry --- src/python/syssentry/cpu_alarm.py | 42 +++++++++++++++++++++++++ src/python/syssentry/syssentry.py | 52 ++++++------------------------- 2 files changed, 52 insertions(+), 42 deletions(-) diff --git a/src/python/syssentry/cpu_alarm.py b/src/python/syssentry/cpu_alarm.py index d972c42..0b1642b 100644 --- a/src/python/syssentry/cpu_alarm.py +++ b/src/python/syssentry/cpu_alarm.py @@ -1,6 +1,7 @@ import re import math import logging +import socket from enum import Enum from .utils import execute_command @@ -15,6 +16,12 @@ BINARY = 2 MIN_DATA_LEN = 0 MAX_DATA_LEN = 999 +PARAM_REP_LEN = 3 +PARAM_TYPE_LEN = 1 +PARAM_MODULE_LEN = 1 +PARAM_TRANS_TO_LEN = 2 +PARAM_DATA_LEN = 3 + class Type(Enum): CE = 0x00 @@ -207,3 +214,38 @@ def check_fixed_param(data, expect): raise ValueError("expected str param is not valid") return data raise NotImplementedError("unexpected param type") + + +def cpu_alarm_recv(server_socket: socket.socket): + try: + client_socket, _ = server_socket.accept() + logging.debug("cpu alarm fd listen ok") + + data = client_socket.recv(PARAM_REP_LEN) + check_fixed_param(data, "REP") + + data = client_socket.recv(PARAM_TYPE_LEN) + _type = check_fixed_param(data, Type) + + data = client_socket.recv(PARAM_MODULE_LEN) + module = check_fixed_param(data, Module) + + data = client_socket.recv(PARAM_TRANS_TO_LEN) + trans_to = check_fixed_param(data, TransTo) + + data = 
client_socket.recv(PARAM_DATA_LEN) + data_len = check_fixed_param(data, (MIN_DATA_LEN, MAX_DATA_LEN)) + + data = client_socket.recv(data_len) + + command, event_type, socket_id, core_id = parser_cpu_alarm_info(data) + except socket.error: + logging.error("socket error") + return + except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError): + logging.error("server recv cpu alarm msg failed!") + client_socket.close() + return + + upload_bmc(_type, module, command, event_type, socket_id, core_id) + diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py index 3d5cb8d..f93956e 100644 --- a/src/python/syssentry/syssentry.py +++ b/src/python/syssentry/syssentry.py @@ -36,8 +36,15 @@ from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create, from .result import RESULT_MSG_HEAD_LEN, RESULT_MSG_MAGIC_LEN, RESULT_MAGIC from .result import RESULT_LEVEL_ERR_MSG_DICT, ResultLevel from .utils import get_current_time_string -from .cpu_alarm import (upload_bmc, check_fixed_param, parser_cpu_alarm_info, - Type, Module, TransTo, MIN_DATA_LEN, MAX_DATA_LEN) + + +CPU_EXIST = True +try: + from .cpu_alarm import cpu_alarm_recv +except ImportError: + CPU_EXIST = False + logging.debug("Cannot find cpu sentry mod") + INSPECTOR = None @@ -76,45 +83,6 @@ PID_FILE_FLOCK = None RESULT_SOCKET_PATH = "/var/run/sysSentry/result.sock" CPU_ALARM_SOCKET_PATH = "/var/run/sysSentry/report.sock" -PARAM_REP_LEN = 3 -PARAM_TYPE_LEN = 1 -PARAM_MODULE_LEN = 1 -PARAM_TRANS_TO_LEN = 2 -PARAM_DATA_LEN = 3 - - -def cpu_alarm_recv(server_socket: socket.socket): - try: - client_socket, _ = server_socket.accept() - logging.debug("cpu alarm fd listen ok") - - data = client_socket.recv(PARAM_REP_LEN) - check_fixed_param(data, "REP") - - data = client_socket.recv(PARAM_TYPE_LEN) - _type = check_fixed_param(data, Type) - - data = client_socket.recv(PARAM_MODULE_LEN) - module = check_fixed_param(data, Module) - - data = client_socket.recv(PARAM_TRANS_TO_LEN) - 
trans_to = check_fixed_param(data, TransTo) - - data = client_socket.recv(PARAM_DATA_LEN) - data_len = check_fixed_param(data, (MIN_DATA_LEN, MAX_DATA_LEN)) - - data = client_socket.recv(data_len) - - command, event_type, socket_id, core_id = parser_cpu_alarm_info(data) - except socket.error: - logging.error("socket error") - return - except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError): - logging.error("server recv cpu alarm msg failed!") - client_socket.close() - return - - upload_bmc(_type, module, command, event_type, socket_id, core_id) def msg_data_process(msg_data): @@ -480,7 +448,7 @@ def main_loop(): server_result_recv(server_result_fd) elif event_fd == heartbeat_fd.fileno(): heartbeat_recv(heartbeat_fd) - elif event_fd == cpu_alarm_fd.fileno(): + elif CPU_EXIST and event_fd == cpu_alarm_fd.fileno(): cpu_alarm_recv(cpu_alarm_fd) else: continue -- Gitee From 692bb8088310b877c9dfeea1ed4df467c17efb3f Mon Sep 17 00:00:00 2001 From: shixuantong Date: Wed, 11 Sep 2024 10:23:41 +0800 Subject: [PATCH 11/24] fix configparser.InterpolationSyntaxError --- src/python/syssentry/sentry_config.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/python/syssentry/sentry_config.py b/src/python/syssentry/sentry_config.py index 01f3df8..a0e7b79 100644 --- a/src/python/syssentry/sentry_config.py +++ b/src/python/syssentry/sentry_config.py @@ -103,14 +103,18 @@ class CpuPluginsParamsConfig: """read config file""" config_param_section_args = {} if os.path.exists(self.config_file): - self.config.read(self.config_file) try: + self.config.read(self.config_file) config_param_section_args = dict(self.config[self.param_section_name]) - except (ValueError, KeyError): + except (ValueError, KeyError, configparser.InterpolationSyntaxError): config_param_section_args = {} + logging.error("Failed to parse cpu_sentry.ini!") return config_param_section_args def join_cpu_start_cmd(self, cpu_param_dict: dict) -> str: + if not cpu_param_dict: + 
return "" + cpu_list = cpu_param_dict.get("cpu_list", "default") if cpu_list == "default": cpu_list = CpuPluginsParamsConfig.get_cpu_info() -- Gitee From a30dc4d1fb30f8a4104bfe0d6605ffaa5f92e142 Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Sat, 14 Sep 2024 09:28:00 +0800 Subject: [PATCH 12/24] fix syssentry fails to be started when cpu_sentry is not installed --- src/python/syssentry/syssentry.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py index f93956e..776971f 100644 --- a/src/python/syssentry/syssentry.py +++ b/src/python/syssentry/syssentry.py @@ -43,7 +43,6 @@ try: from .cpu_alarm import cpu_alarm_recv except ImportError: CPU_EXIST = False - logging.debug("Cannot find cpu sentry mod") INSPECTOR = None @@ -563,20 +562,21 @@ def main(): if not os.path.exists(SENTRY_RUN_DIR): os.mkdir(SENTRY_RUN_DIR) os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM) - if not chk_and_set_pidfile(): - logging.error("get pid file lock failed, exist") - sys.exit(17) logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=logging.INFO) os.chmod(SYSSENTRY_LOG_FILE, 0o600) + if not chk_and_set_pidfile(): + logging.error("get pid file lock failed, exist") + sys.exit(17) + try: signal.signal(signal.SIGINT, sig_handler) signal.signal(signal.SIGTERM, sig_handler) signal.signal(signal.SIGHUP, sig_handler) signal.signal(signal.SIGCHLD, sigchld_handler) - logging.debug("finish main parse_args") + logging.info("finish main parse_args") _ = SentryConfig.init_param() TasksMap.init_task_map() @@ -587,3 +587,4 @@ def main(): logging.error('%s', traceback.format_exc()) finally: release_pidfile() + -- Gitee From 3edc711179c9471d0f2c425082753e8e2942d770 Mon Sep 17 00:00:00 2001 From: shixuantong Date: Sat, 21 Sep 2024 09:53:42 +0800 Subject: [PATCH 13/24] optimize the handling of cat-cli error msg in cpu_sentry --- src/python/syssentry/cpu_sentry.py | 36 +++++++++++++++++------------- 1 file 
changed, 21 insertions(+), 15 deletions(-) diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py index 99af127..582d4b3 100644 --- a/src/python/syssentry/cpu_sentry.py +++ b/src/python/syssentry/cpu_sentry.py @@ -26,6 +26,8 @@ CPU_SENTRY_PARAM_CONFIG = "/etc/sysSentry/plugins/cpu_sentry.ini" # Inspection commands running at the bottom layer LOW_LEVEL_INSPECT_CMD = "cat-cli" +# max length of msg in details +DETAILS_LOG_MSG_MAX_LEN = 255 class CpuSentry: """ @@ -94,22 +96,10 @@ class CpuSentry: self.send_result["details"]["msg"] = "cpu_sentry task is killed!" return - if "ERROR" in stdout: - self.send_result["result"] = ResultLevel.FAIL - self.send_result["details"]["code"] = 1004 - - # Remove ANSI escape sequences - error_info = stdout.split("\n")[0] - if error_info.startswith("\u001b"): - ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' - error_info = re.sub(ansi_escape, '', error_info) - - self.send_result["details"]["msg"] = error_info - return - out_split = stdout.split("\n") - isolated_cores_number = 0 + isolated_cores_number = -1 found_fault_cores_list = [] + error_msg_list = [] for out_line_i in out_split: if "handle_patrol_result: Found fault cores" in out_line_i: cores_number_tmp = out_line_i.split("Found fault cores:")[1] @@ -121,9 +111,25 @@ class CpuSentry: elif out_line_i.startswith(''): self.send_result["details"]["isolated_cpu_list"] = out_line_i.split(':')[1] break + elif "ERROR" in out_line_i: + logging.error("[cat-cli error] - %s\n", out_line_i) + error_msg_list.append(out_line_i) found_fault_cores_number = len(set(found_fault_cores_list)) - if found_fault_cores_number == 0: + if isolated_cores_number == -1: + self.send_result["result"] = ResultLevel.FAIL + self.send_result["details"]["code"] = 1004 + + send_error_msg = "" + # Remove ANSI escape sequences + for error_info in error_msg_list: + if error_info.startswith("\u001b"): + ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' + error_info = re.sub(ansi_escape, 
'', error_info) + if len(send_error_msg) + len(error_info) < DETAILS_LOG_MSG_MAX_LEN: + send_error_msg += error_info + self.send_result["details"]["msg"] = send_error_msg + elif found_fault_cores_number == 0: self.send_result["details"]["code"] = 0 self.send_result["result"] = ResultLevel.PASS elif 0 in found_fault_cores_list: -- Gitee From 47032b1d83d965e2418c016f918fb723085eacc5 Mon Sep 17 00:00:00 2001 From: jwolf <523083921@qq.com> Date: Fri, 20 Sep 2024 15:59:40 +0800 Subject: [PATCH 14/24] should be warn-level log --- src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c index 9f8d80c..f4f3172 100644 --- a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c +++ b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c @@ -23,7 +23,7 @@ static cat_return_t insert_core_to_list(core_list_st *core_list, int coreid) return CAT_OK; } if ((core_list->current_nums == MAX_ISOLATE_CORES_PER_PATROL) || (coreid < 0)) { - CAT_LOG_E("Insert error, core id(%d)", coreid); + CAT_LOG_W("Too many cores need to isolate,do not isolate core(%d)", coreid); return CAT_ERR; } -- Gitee From d2aa53bd465b8b370fa0d227efa35acc1572b78e Mon Sep 17 00:00:00 2001 From: shixuantong Date: Wed, 25 Sep 2024 10:38:46 +0800 Subject: [PATCH 15/24] add separator to err info --- src/python/syssentry/cpu_sentry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py index 582d4b3..2f18d14 100644 --- a/src/python/syssentry/cpu_sentry.py +++ b/src/python/syssentry/cpu_sentry.py @@ -127,7 +127,7 @@ class CpuSentry: ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' error_info = re.sub(ansi_escape, '', error_info) if len(send_error_msg) + len(error_info) < DETAILS_LOG_MSG_MAX_LEN: - send_error_msg += error_info + send_error_msg 
+= ";" + error_info self.send_result["details"]["msg"] = send_error_msg elif found_fault_cores_number == 0: self.send_result["details"]["code"] = 0 -- Gitee From 56c9e24a6c9bdce9332250573be32b398978d1b4 Mon Sep 17 00:00:00 2001 From: jwolf <523083921@qq.com> Date: Wed, 25 Sep 2024 16:09:17 +0800 Subject: [PATCH 16/24] remove threshold:max cpu cores --- src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c | 4 ++-- src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c index f4f3172..8e31312 100644 --- a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c +++ b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c @@ -22,8 +22,8 @@ static cat_return_t insert_core_to_list(core_list_st *core_list, int coreid) CAT_LOG_W("Core %d is a special core and cannot be isolated", coreid); return CAT_OK; } - if ((core_list->current_nums == MAX_ISOLATE_CORES_PER_PATROL) || (coreid < 0)) { - CAT_LOG_W("Too many cores need to isolate,do not isolate core(%d)", coreid); + if (coreid < 0) { + CAT_LOG_W("Inner error, coreid is a negative number"); return CAT_ERR; } diff --git a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.h b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.h index 92dcdc3..9722ec9 100644 --- a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.h +++ b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.h @@ -30,9 +30,9 @@ typedef enum { #define CAT_LOG_W(...) CAT_LOG("WARN", __VA_ARGS__) #define CAT_LOG_E(...) 
CAT_LOG("ERROR", __VA_ARGS__) -#define MAX_ISOLATE_CORES_PER_PATROL 64 // 一次巡检最大支持隔离故障核数量,一次巡检同时检测到2个以上故障核的概率非常低 +#define MAX_CPU_CORES 4096 typedef struct { - unsigned int order_list[MAX_ISOLATE_CORES_PER_PATROL]; + unsigned int order_list[MAX_CPU_CORES]; unsigned short current_nums; } core_list_st; -- Gitee From e44e7a565e66a408279161a706535491e8236e46 Mon Sep 17 00:00:00 2001 From: luckky Date: Fri, 11 Oct 2024 09:49:40 +0000 Subject: [PATCH 17/24] add hbm online repair --- config/tasks/hbm_online_repair.mod | 9 + src/c/hbm_online_repair/.gitignore | 6 + src/c/hbm_online_repair/Makefile | 25 + src/c/hbm_online_repair/hbm_online_repair.c | 144 ++++ src/c/hbm_online_repair/hbm_online_repair.env | 2 + src/c/hbm_online_repair/logger.h | 31 + .../non-standard-hbm-repair.c | 799 ++++++++++++++++++ .../non-standard-hbm-repair.h | 89 ++ src/c/hbm_online_repair/ras-events.c | 534 ++++++++++++ src/c/hbm_online_repair/ras-events.h | 28 + .../ras-non-standard-handler.c | 81 ++ .../ras-non-standard-handler.h | 25 + src/python/.gitignore | 1 + src/python/syssentry/bmc_alarm.py | 159 ++++ src/python/syssentry/syssentry.py | 79 +- 15 files changed, 2001 insertions(+), 11 deletions(-) create mode 100644 config/tasks/hbm_online_repair.mod create mode 100644 src/c/hbm_online_repair/.gitignore create mode 100644 src/c/hbm_online_repair/Makefile create mode 100644 src/c/hbm_online_repair/hbm_online_repair.c create mode 100644 src/c/hbm_online_repair/hbm_online_repair.env create mode 100644 src/c/hbm_online_repair/logger.h create mode 100644 src/c/hbm_online_repair/non-standard-hbm-repair.c create mode 100644 src/c/hbm_online_repair/non-standard-hbm-repair.h create mode 100644 src/c/hbm_online_repair/ras-events.c create mode 100644 src/c/hbm_online_repair/ras-events.h create mode 100644 src/c/hbm_online_repair/ras-non-standard-handler.c create mode 100644 src/c/hbm_online_repair/ras-non-standard-handler.h create mode 100644 src/python/.gitignore create mode 100644 
src/python/syssentry/bmc_alarm.py diff --git a/config/tasks/hbm_online_repair.mod b/config/tasks/hbm_online_repair.mod new file mode 100644 index 0000000..77dd73e --- /dev/null +++ b/config/tasks/hbm_online_repair.mod @@ -0,0 +1,9 @@ +[common] +enabled=yes +task_start=/usr/bin/hbm_online_repair +task_stop=kill $pid +type=period +interval=180 +onstart=yes +env_file=/etc/sysconfig/hbm_online_repair.env +conflict=up \ No newline at end of file diff --git a/src/c/hbm_online_repair/.gitignore b/src/c/hbm_online_repair/.gitignore new file mode 100644 index 0000000..a577882 --- /dev/null +++ b/src/c/hbm_online_repair/.gitignore @@ -0,0 +1,6 @@ +*.o +*.c~ +*.h~ +hbm_online_repair + +.vscode/ diff --git a/src/c/hbm_online_repair/Makefile b/src/c/hbm_online_repair/Makefile new file mode 100644 index 0000000..16ebcd8 --- /dev/null +++ b/src/c/hbm_online_repair/Makefile @@ -0,0 +1,25 @@ +CC = gcc + +CFLAGS = -Wall -o3 + +LDFLAGS = -ltraceevent + +SRC = $(wildcard *.c) +HDR = $(wildcard *.h) + +OBJ = $(SRC:.c=.o) + +TARGET = hbm_online_repair + +all: $(TARGET) + +$(TARGET): $(OBJ) + $(CC) $(OBJ) -o $@ $(LDFLAGS) + +%.o: %.c $(HDR) + $(CC) $(CFLAGS) -c $< -o $@ + +clean: + rm -f $(OBJ) $(TARGET) + +.PHONY: all clean diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c new file mode 100644 index 0000000..3ace206 --- /dev/null +++ b/src/c/hbm_online_repair/hbm_online_repair.c @@ -0,0 +1,144 @@ +#include +#include +#include +#include +#include + +#include "logger.h" +#include "ras-events.h" +#include "non-standard-hbm-repair.h" + +#define DEFAULT_LOG_LEVEL LOG_INFO +#define DEFAULT_PAGE_ISOLATION_THRESHOLD 128 + +int global_level_setting; +int page_isolation_threshold; + +int string2int(const char* str, int* value) +{ + if (!str) { + return -1; + } + char *endptr; + errno = 0; + long val = strtol(str, &endptr, 10); + if (errno != 0 || *endptr != '\0') { + return -1; + } + *value = (int)val; + if (val != (long)*value) { + return -1; 
+ } + return 0; +} + +int execute_command(const char *command) +{ + FILE *fp; + char buffer[128] = {0}; + int ret; + fp = popen(command, "r"); + if (!fp) { + log(LOG_ERROR, "popen failed\n"); + return -1; + } + + fgets(buffer, sizeof(buffer), fp); + log(LOG_DEBUG, "output of command is: %s\n", buffer); + + ret = pclose(fp); + if (ret < 0) { + log(LOG_ERROR, "pclose failed\n"); + return -1; + } + + if (!WIFEXITED(ret)) { + log(LOG_ERROR, "command did not terminate normally\n"); + return -1; + } + + ret = WEXITSTATUS(ret); + log(LOG_DEBUG, "command exited with status: %d\n", ret); + return ret; +} + +int load_required_driver(void) +{ + int ret; + ret = execute_command("modprobe hisi_mem_ras 2>&1"); + if (ret < 0) { + log(LOG_ERROR, "load repair driver failed\n"); + return ret; + } + ret = execute_command("modprobe page_eject 2>&1"); + if (ret < 0) { + log(LOG_ERROR, "load page driver failed\n"); + return ret; + } + log(LOG_INFO, "load required driver success\n"); + return ret; +} + +void hbm_param_init(void) +{ + int ret; + char *env; + + env = getenv("HBM_ONLINE_REPAIR_LOG_LEVEL"); + ret = string2int(env, &global_level_setting); + if (ret < 0) { + global_level_setting = DEFAULT_LOG_LEVEL; + log(LOG_WARNING, "Get log level from config failed, set the default value %d\n", DEFAULT_LOG_LEVEL); + } else { + log(LOG_INFO, "log level: %d\n", global_level_setting); + } + + env = getenv("PAGE_ISOLATION_THRESHOLD"); + ret = string2int(env, &page_isolation_threshold); + if (ret < 0) { + page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD; + log(LOG_WARNING, "Get page_isolation_threshold from config failed, set the default value %d\n", DEFAULT_PAGE_ISOLATION_THRESHOLD); + } else { + log(LOG_INFO, "page_isolation_threshold: %d\n", page_isolation_threshold); + } +} + + +int main(int argc, char *argv[]) +{ + int ret; + + hbm_param_init(); + + ret = load_required_driver(); + if (ret < 0) { + log(LOG_DEBUG, "load required driver failed\n"); + return ret; + } + + struct 
ras_events *ras = init_trace_instance(); + if (!ras) + return -1; + + ret = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 1); + if (ret < 0) { + log(LOG_WARNING, "unable to enable ras non_standard_event.\n"); + free(ras); + return -1; + } + + ret = init_all_flash(); + if (ret < 0) { + log(LOG_ERROR, "flash writer init failed\n"); + } + + handle_ras_events(ras); + + ret = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 0); + if (ret < 0) { + log(LOG_WARNING, "unable to disable ras non_standard_event.\n"); + } + + free(ras); + return ret; +} diff --git a/src/c/hbm_online_repair/hbm_online_repair.env b/src/c/hbm_online_repair/hbm_online_repair.env new file mode 100644 index 0000000..de56079 --- /dev/null +++ b/src/c/hbm_online_repair/hbm_online_repair.env @@ -0,0 +1,2 @@ +HBM_ONLINE_REPAIR_LOG_LEVEL=1 +PAGE_ISOLATION_THRESHOLD=128 diff --git a/src/c/hbm_online_repair/logger.h b/src/c/hbm_online_repair/logger.h new file mode 100644 index 0000000..ddfa932 --- /dev/null +++ b/src/c/hbm_online_repair/logger.h @@ -0,0 +1,31 @@ +#ifndef __LOGGER_H +#define __LOGGER_H + +#define TOOL_NAME "hbm_online_repair" + +#define LOG_DEBUG 0 +#define LOG_INFO 1 +#define LOG_WARNING 2 +#define LOG_ERROR 3 + +extern int global_level_setting; + +#define log_prefix(level) \ + (level == LOG_DEBUG ? "DEBUG" : \ + level == LOG_INFO ? "INFO" : \ + level == LOG_WARNING ? "WARNING" : \ + level == LOG_ERROR ? "ERROR" : \ + "UNKNOWN_LEVEL") + +#define log_fd(level) \ + (level == LOG_ERROR ? stderr : stdout) + +#define log(level, fmt, args...) 
do {\ + if (level >= global_level_setting) {\ + fprintf(log_fd(level), "[%s] %s: ", log_prefix(level), TOOL_NAME);\ + fprintf(log_fd(level), fmt, ##args);\ + fflush(log_fd(level));\ + }\ +} while (0) + +#endif diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c new file mode 100644 index 0000000..b175e14 --- /dev/null +++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c @@ -0,0 +1,799 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "logger.h" +#include "non-standard-hbm-repair.h" + +extern int page_isolation_threshold; +size_t total_size = 0; +struct hisi_common_error_section { + uint32_t val_bits; + uint8_t version; + uint8_t soc_id; + uint8_t socket_id; + uint8_t totem_id; + uint8_t nimbus_id; + uint8_t subsystem_id; + uint8_t module_id; + uint8_t submodule_id; + uint8_t core_id; + uint8_t port_id; + uint16_t err_type; + struct { + uint8_t function; + uint8_t device; + uint16_t segment; + uint8_t bus; + uint8_t reserved[3]; + } pcie_info; + uint8_t err_severity; + uint8_t reserved[3]; + uint32_t reg_array_size; + uint32_t reg_array[]; +}; + +struct fault_addr_info { + uint32_t processer_id; + uint32_t die_id; + uint32_t stack_id; + uint32_t sid; + uint32_t channel_id; + uint32_t bankgroup_id; + uint32_t bank_id; + uint32_t row_id; + uint32_t column_id; + uint32_t error_type; + uint32_t repair_type; + uint32_t reserved; + uint32_t crc8; +}; + +typedef struct { + const char *VariableName; + const char *VendorGuid; + uint32_t DataSize; + uint8_t *Data; + uint32_t Attributes; +} efi_variable_t; + +char* flash_names[FLASH_ENTRY_NUM] = { + "repair0000", + "repair0001", + "repair0100", + "repair0101", + "repair0200", + "repair0201", + "repair0300", + "repair0301", +}; +char *flash_guids[FLASH_ENTRY_NUM] = { + "CD2FF4D9-D937-4e1d-B810-A1A568C37C01", + "DD92CC91-43E6-4c69-A42A-B08F72FCB157", + 
"4A8E0D1E-4CFA-47b2-9359-DA3A0006878B", + "733F9979-4ED4-478d-BD6A-E4D0F0390FDB", + "9BFBBA1F-5A93-4d36-AD47-D3C2D714D914", + "A0920D6F-78B8-4c09-9F61-7CEC845F116C", + "0049CE5E-8C18-414c-BDC1-A87E60CEEFD7", + "6AED17B4-50C7-4a40-A5A7-48AF55DD8EAC" +}; + +static int get_guid_index(uint32_t socket_id, uint32_t error_type) { + if (2 * socket_id + error_type >= FLASH_ENTRY_NUM) + return -1; + return 2 * socket_id + error_type; +} + +static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned long long fault_addr) +{ + info_struct->processer_id = fault_addr & FAULT_ADDR_PROCESSOR_ID_MASK; + fault_addr >>= FAULT_ADDR_PROCESSOR_ID_LEN; + info_struct->die_id = fault_addr & FAULT_ADDR_DIE_ID_MASK; + fault_addr >>= FAULT_ADDR_DIE_ID_LEN; + info_struct->stack_id = fault_addr & FAULT_ADDR_STACK_ID_MASK; + fault_addr >>= FAULT_ADDR_STACK_ID_LEN; + info_struct->sid = fault_addr & FAULT_ADDR_SID_MASK; + fault_addr >>= FAULT_ADDR_SID_LEN; + info_struct->channel_id = fault_addr & FAULT_ADDR_CHANNEL_ID_MASK; + fault_addr >>= FAULT_ADDR_CHANNEL_ID_LEN; + info_struct->bankgroup_id = fault_addr & FAULT_ADDR_BANKGROUP_ID_MASK; + fault_addr >>= FAULT_ADDR_BANKGROUP_ID_LEN; + info_struct->bank_id = fault_addr & FAULT_ADDR_BANK_ID_MASK; + fault_addr >>= FAULT_ADDR_BANK_ID_LEN; + info_struct->row_id = fault_addr & FAULT_ADDR_ROW_ID_MASK; + fault_addr >>= FAULT_ADDR_ROW_ID_LEN; + info_struct->column_id = fault_addr & FAULT_ADDR_COLUMN_ID_MASK; + fault_addr >>= FAULT_ADDR_COLUMN_ID_LEN; /* skip the column field by its own width, not the channel width */ + info_struct->error_type = fault_addr & FAULT_ADDR_ERROR_TYPE_MASK; + fault_addr >>= FAULT_ADDR_ERROR_TYPE_LEN; + info_struct->repair_type = fault_addr & FAULT_ADDR_REPAIR_TYPE_MASK; + fault_addr >>= FAULT_ADDR_REPAIR_TYPE_LEN; + info_struct->reserved = fault_addr & FAULT_ADDR_RESERVED_MASK; + fault_addr >>= FAULT_ADDR_RESERVED_LEN; + info_struct->crc8 = (uint32_t)fault_addr; +} + +static bool variable_existed(char *name, char *guid) +{ + char filename[PATH_MAX]; + int fd; + + 
snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); + + // open var file + fd = open(filename, O_RDONLY); + if (fd < 0) { + log(LOG_WARNING, "open file %s failed\n", filename); + return false; + } + close(fd); + return true; +} + +static uint32_t read_variable_attribute(char *name, char *guid) { + char filename[PATH_MAX]; + int fd; + size_t readsize; + uint32_t attribute = (uint32_t)-1; + + snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); + + // open var file + fd = open(filename, O_RDONLY); + if (fd < 0) { + log(LOG_ERROR, "open %s failed\n", filename); + return attribute; + } + + // read attributes from first 4 bytes + readsize = read(fd, &attribute, sizeof(uint32_t)); + if (readsize != sizeof(uint32_t)) { + log(LOG_ERROR, "read attribute of %s failed\n", filename); + } + + close(fd); + return attribute; +} + +static int efivarfs_set_mutable(char *name, char *guid, bool mutable) +{ + unsigned long orig_attrs, new_attrs; + char filename[PATH_MAX]; + int fd; + + snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); + + fd = open(filename, O_RDONLY); + if (fd < 0) { + log(LOG_ERROR, "open %s failed\n", filename); + goto err; + } + + if (ioctl(fd, FS_IOC_GETFLAGS, &orig_attrs) == -1) { + log(LOG_ERROR, "ioctl FS_IOC_GETFLAGS failed\n"); + goto err; + } + + if (mutable) + new_attrs = orig_attrs & ~(unsigned long)FS_IMMUTABLE_FL; + else + new_attrs = orig_attrs | FS_IMMUTABLE_FL; + + if (new_attrs == orig_attrs) { + close(fd); + return 0; + } + + if (ioctl(fd, FS_IOC_SETFLAGS, &new_attrs) == -1) { + log(LOG_ERROR, "ioctl FS_IOC_SETFLAGS failed\n"); + goto err; + } + close(fd); + return 0; +err: + if (fd >= 0) + close(fd); + return -1; +} + +static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute) { + int fd = -1, mode; /* -1 so the err path never closes a descriptor that was not opened here */ + size_t writesize; + void *buffer; + unsigned long total; + char filename[PATH_MAX]; + + snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, 
name, guid); + + // prepare attributes(size 4 bytes) and data + total = size + sizeof(uint32_t); + buffer = malloc(total); + if (buffer == NULL) { + log(LOG_ERROR, "malloc data for %s failed\n", filename); + goto err; + } + memcpy(buffer, &attribute, sizeof(uint32_t)); + memcpy(buffer + sizeof(uint32_t), value, size); + + // change attr + if (efivarfs_set_mutable(name, guid, 1) != 0) { + log(LOG_ERROR, "set mutable for %s failed\n", filename); + goto err; + } + + mode = O_WRONLY; + if (attribute & EFI_VARIABLE_APPEND_WRITE) + mode |= O_APPEND; + else + mode |= O_CREAT; + + // open var file + fd = open(filename, mode, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd < 0) { + log(LOG_ERROR, "open %s failed\n", filename); + goto err; + } + + // write to var file + writesize = write(fd, buffer, total); + if (writesize != total) { + log(LOG_ERROR, "write %s failed\n", filename); + goto err; + } + + close(fd); + free(buffer); + if (efivarfs_set_mutable(name, guid, 0) != 0) { + log(LOG_ERROR, "set immutable for %s failed\n", filename); + } + return 0; +err: + if (fd >= 0) + close(fd); + if (buffer) + free(buffer); + if (efivarfs_set_mutable(name, guid, 0) != 0) { + log(LOG_ERROR, "set immutable for %s failed\n", filename); + } + return -1; +} + +static int append_variable(char *name, char *guid, void *data, unsigned long size) { + // prepare append attribute + uint32_t attribute = read_variable_attribute(name, guid); + if (attribute == (uint32_t)-1) { + log(LOG_ERROR, "read %s-%s attribute failed\n", name, guid); + return -1; + } + attribute |= EFI_VARIABLE_APPEND_WRITE; + + return write_variable(name, guid, data, size, attribute); +} + +static size_t get_var_size(char *name, char *guid) { + char filename[PATH_MAX]; + int fd; + struct stat stat; + + snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); + + // open var file + fd = open(filename, O_RDONLY); + if (fd < 0) { + log(LOG_WARNING, "open %s failed\n", filename); + goto err; + } + // read stat + 
if (fstat(fd, &stat) != 0) { + log(LOG_WARNING, "fstat %s failed\n", filename); + goto err; + } + close(fd); + return stat.st_size; +err: + if (fd >= 0) + close(fd); + return (size_t)-1; +} + +int init_all_flash() { + for (int i = 0; i < FLASH_ENTRY_NUM; i++) { + // check existed entry + if (variable_existed(flash_names[i], flash_guids[i])) { + total_size += get_var_size(flash_names[i], flash_guids[i]); + continue; + } + // create new entry + uint32_t attribute = EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS; + char *data = ""; + unsigned long size = 1; + int ret = write_variable(flash_names[i], flash_guids[i], data, size, attribute); + if (ret) { + log(LOG_ERROR, "init %s-%s failed, fault info storage funtion not enabled\n", flash_names[i], flash_guids[i]); + return -1; + } + total_size += sizeof(uint32_t) + 1; + } + // check total entry size + log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n", + total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE); + if (total_size > MAX_VAR_SIZE) { + log(LOG_ERROR, "fault info storage reach threshold, cannot save new record\n"); + } + return 0; +} + +static int write_fault_info_to_flash(const struct hisi_common_error_section *err) { + int ret, guid_index; + uint32_t reg_size; + uint64_t fault_addr; + + // check flash usage threshold + if (total_size + sizeof(uint64_t) > MAX_VAR_SIZE) { + log(LOG_WARNING, "fault info storage reach threshold, cannot save new record into flash\n"); + return -1; + } + + // parse physical addr + reg_size = err->reg_array_size / sizeof(uint32_t); + fault_addr = err->reg_array[reg_size - 1]; + fault_addr <<= TYPE_UINT32_WIDTH; + fault_addr += err->reg_array[reg_size - 2]; + + // get guid + struct fault_addr_info info_struct; + parse_fault_addr_info(&info_struct, fault_addr); + guid_index = get_guid_index(info_struct.processer_id, info_struct.error_type); + if (guid_index < 0) { + log(LOG_ERROR, "invalid fault info\n"); + return 
-1; + } + // record physical addr in flash + ret = append_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t)); + if (ret < 0) { + log(LOG_ERROR, "append to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]); + return -1; + } + total_size += sizeof(uint64_t); + log(LOG_INFO, "write hbm fault info to flash success\n"); + return 0; +} + +static int write_file(char *path, const char *name, unsigned long long value) +{ + char fname[MAX_PATH]; + char buf[20]; + int ret; + int fd; + + snprintf(fname, MAX_PATH, "%s/%s", path, name); + + fd = open(fname, O_WRONLY); + if (fd < 0) { + log(LOG_WARNING, "HBM ACLS: Cannot to open '%s': %s\n", + fname, strerror(errno)); + return -errno; + } + + snprintf(buf, sizeof(buf), "0x%llx\n", value); + ret = write(fd, buf, strlen(buf)); + if (ret <= 0) + log(LOG_WARNING, "HBM ACLS: Failed to set %s (0x%llx): %s\n", + fname, value, strerror(errno)); + + close(fd); + return ret > 0 ? 0 : -errno; +} + +static int get_hardware_corrupted_size() +{ + FILE *fp; + char line[256]; + int hardware_corrupted_size = -1; + char *key = "HardwareCorrupted:"; + + fp = fopen("/proc/meminfo", "r"); + if (fp == NULL) { + log(LOG_ERROR, "Failed to open /proc/meminfo\n"); + return -1; + } + + while (fgets(line, sizeof(line), fp) != NULL) { + char *pos; + if ((pos = strstr(line, key)) != NULL) { + sscanf(pos, "HardwareCorrupted: %5d kB\n", &hardware_corrupted_size); + break; + } + } + + fclose(fp); + return hardware_corrupted_size; +} + +static uint8_t get_repair_result_code(int ret) +{ + if (ret == -ENOSPC) { + return REPAIR_FAILED_NO_RESOURCE; + } else if (ret == -EIO) { + return REPAIR_FAILED_OTHER_REASON; + } else if (ret == -ENXIO || ret == -EINVAL) { + return REPAIR_FAILED_INVALID_PARAM; + } + return REPAIR_FAILED_OTHER_REASON; +} + +static int notice_BMC(const struct hisi_common_error_section *err, uint8_t repair_result_code) +{ + int sockfd; + struct sockaddr_un addr; + char 
bmc_msg[sizeof(BMC_REPORT_FORMAT)] = {0}; + uint8_t repair_type_code, isolation_type_code; + uint32_t repair_type; + unsigned long long fault_addr; + + sockfd = socket(AF_UNIX, SOCK_STREAM, 0); + if (sockfd < 0) { + log(LOG_ERROR, "Failed to create BMC notice socket\n"); + return -1; + } + + memset(&addr, 0, sizeof(struct sockaddr_un)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, BMC_SOCKET_PATH, sizeof(addr.sun_path) - 1); + if (connect(sockfd, (struct sockaddr *)&addr, sizeof(struct sockaddr_un)) < 0) { + log(LOG_ERROR, "Failed to connect BMC notice socket\n"); + close(sockfd); + return -1; + } + + /* assemble bmc specific msg */ + repair_type_code = 0; + isolation_type_code = 0; + repair_type = err->reg_array[HBM_REPAIR_REQ_TYPE]; + if (repair_type & HBM_CE_ACLS) { + repair_type_code = 0; + isolation_type_code = SINGLE_ADDR_FAULT; + } else if (repair_type & HBM_PSUE_ACLS) { + repair_type_code = 1; + isolation_type_code = SINGLE_ADDR_FAULT; + } else if (repair_type & HBM_CE_SPPR) { + repair_type_code = 2; + isolation_type_code = ROW_FAULT; + } else if (repair_type & HBM_PSUE_SPPR) { + repair_type_code = 3; + isolation_type_code = ROW_FAULT; + } + + const uint32_t reg_size = err->reg_array_size / sizeof(uint32_t); + + fault_addr = err->reg_array[reg_size - 1]; + fault_addr <<= TYPE_UINT32_WIDTH; + fault_addr += err->reg_array[reg_size - 2]; + + log(LOG_DEBUG, "Get the fault addr is %llu\n", fault_addr); + + struct fault_addr_info info_struct; + parse_fault_addr_info(&info_struct, fault_addr); + + log(LOG_DEBUG, "info_struct.processer_id is %u\n", info_struct.processer_id); + log(LOG_DEBUG, "info_struct.die_id is %u\n", info_struct.die_id); + log(LOG_DEBUG, "info_struct.stack_id is %u\n", info_struct.stack_id); + log(LOG_DEBUG, "info_struct.sid is %u\n", info_struct.sid); + log(LOG_DEBUG, "info_struct.channel_id is %u\n", info_struct.channel_id); + log(LOG_DEBUG, "info_struct.bankgroup_id is %u\n", info_struct.bankgroup_id); + log(LOG_DEBUG, 
"info_struct.bank_id is %u\n", info_struct.bank_id); + log(LOG_DEBUG, "info_struct.row_id is %u\n", info_struct.row_id); + log(LOG_DEBUG, "info_struct.column_id is %u\n", info_struct.column_id); + log(LOG_DEBUG, "info_struct.error_type is %u\n", info_struct.error_type); + log(LOG_DEBUG, "info_struct.repair_type is %u\n", info_struct.repair_type); + log(LOG_DEBUG, "info_struct.reserved is %u\n", info_struct.reserved); + log(LOG_DEBUG, "info_struct.crc8 is %u\n", info_struct.crc8); + + snprintf(bmc_msg, sizeof(BMC_REPORT_FORMAT), BMC_REPORT_FORMAT, + repair_type_code, + repair_result_code, + isolation_type_code, + info_struct.processer_id, + info_struct.die_id, + info_struct.stack_id, + info_struct.sid, + info_struct.channel_id, + info_struct.bankgroup_id, + info_struct.bank_id, + info_struct.row_id, + info_struct.column_id + ); + + log(LOG_DEBUG, "Send msg to sysSentry, bmc msg is %s\n", bmc_msg); + + if (write(sockfd, bmc_msg, strlen(bmc_msg)) <= 0) { + log(LOG_ERROR, "Failed to send data to BMC notice socket\n"); + close(sockfd); + return -1; + } + + close(sockfd); + return 0; +} + +static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err) +{ + unsigned long long paddr; + int ret; + bool is_acls = err->reg_array[HBM_REPAIR_REQ_TYPE] & (HBM_CE_ACLS | HBM_PSUE_ACLS); + int required_isolate_size = (is_acls ? 
HBM_ACLS_ADDR_NUM : HBM_SPPR_ADDR_NUM) * DEFAULT_PAGE_SIZE_KB; + int hardware_corrupted_size = get_hardware_corrupted_size(); + if (hardware_corrupted_size < 0) { + log(LOG_ERROR, "Page isolate failed: Get hardware_corrupted_size failed\n"); + notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); + return -1; + } + if ((required_isolate_size + hardware_corrupted_size) > page_isolation_threshold) { + log(LOG_INFO, "Page isolate failed: the isolation resource is not enough\n"); + notice_BMC(err, ISOLATE_FAILED_OVER_THRESHOLD); + return -1; + } + if (is_acls) { + /* ACLS */ + paddr = err->reg_array[HBM_ADDH]; + paddr <<= TYPE_UINT32_WIDTH; + paddr += err->reg_array[HBM_ADDL]; + + ret = write_file("/sys/kernel/page_eject", "offline_page", paddr); + if (ret < 0) { + notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); + log(LOG_WARNING, "HBM: ACLS offline failed, address is 0x%llx \n", paddr); + return ret; + } + } else { + /* SPPR */ + bool all_success = true; + uint32_t i; + for (i = 0; i < HBM_SPPR_ADDR_NUM; i++) { + paddr = err->reg_array[2 * i + HBM_ADDH]; + paddr <<= TYPE_UINT32_WIDTH; + paddr += err->reg_array[2 * i + HBM_ADDL]; + ret = write_file("/sys/kernel/page_eject", "offline_page", paddr); + if (ret < 0) { + all_success = false; + log(LOG_WARNING, "HBM: SPPR offline failed, address is 0x%llx \n", paddr); + continue; + } + } + if (!all_success) { + notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); + ret = -1; + } + } + return ret < 0 ? ret : 0; +} + +static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) +{ + int ret; + if (repair_ret < 0) { + log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? 
"ACLS" : "SPPR", paddr); + /* not much we can do about errors here */ + (void)write_file("/sys/kernel/page_eject", "remove_page", paddr); + return get_repair_result_code(repair_ret); + } + + ret = write_file("/sys/kernel/page_eject", "online_page", paddr); + if (ret < 0) { + log(LOG_WARNING, "HBM %s: Page (0x%llx) online failed\n",is_acls ? "ACLS" : "SPPR", paddr); + return ONLINE_PAGE_FAILED; + } else { + log(LOG_INFO, "HBM %s: Page (0x%llx) repair and online success\n",is_acls ? "ACLS" : "SPPR", paddr); + return ISOLATE_REPAIR_ONLINE_SUCCESS; + } +} + +static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path) /* returns 0 or a negative errno; must be int because the caller compares the result with -ENXIO */ +{ + unsigned long long paddr; + int ret; + uint8_t repair_result_code; + bool is_acls; + + /* Both ACLS and SPPR only repair the first address */ + paddr = err->reg_array[HBM_ADDH]; + paddr <<= TYPE_UINT32_WIDTH; + paddr += err->reg_array[HBM_ADDL]; + + is_acls = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_CE_ACLS || + err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_PSUE_ACLS; + + ret = write_file(path, is_acls ? 
"acls_repair" : "sppr_repair", paddr); + + if (is_acls) { + /* ACLS */ + repair_result_code = hbmc_hbm_after_repair(is_acls, ret, paddr); + notice_BMC(err, repair_result_code); + return ret; + } else { + /* SPPR */ + bool all_online_success = true; + uint32_t i; + for (i = 0; i < HBM_SPPR_ADDR_NUM; i++) { + paddr = err->reg_array[2 * i + HBM_ADDH]; + paddr <<= TYPE_UINT32_WIDTH; + paddr += err->reg_array[2 * i + HBM_ADDL]; + + repair_result_code = hbmc_hbm_after_repair(is_acls, ret, paddr); + if (repair_result_code != ISOLATE_REPAIR_ONLINE_SUCCESS) { + all_online_success = false; + } + } + if (ret < 0) { + notice_BMC(err, get_repair_result_code(ret)); + return ret; + } else if (all_online_success) { + notice_BMC(err, ISOLATE_REPAIR_ONLINE_SUCCESS); + return 0; + } else { + notice_BMC(err, ONLINE_PAGE_FAILED); + return ret; + } + } + /* The final return code is not necessary */ + return ret < 0 ? ret : 0; +} + +static int hbmc_get_memory_type(char *path) +{ + int type = HBM_UNKNOWN; + char fname[MAX_PATH]; + char buf[128]; + FILE *file; + + snprintf(fname, MAX_PATH, "%s/%s", path, "memory_type"); + file = fopen(fname, "r"); + if (!file) { + log(LOG_WARNING, "HBM: Cannot to open '%s': %s\n", + fname, strerror(errno)); + return -errno; + } + + if (!fgets(buf, sizeof(buf), file)) { + log(LOG_WARNING, "HBM: Failed to read %s\n", fname); + goto err; + } + + /* Remove the last '\n' */ + buf[strlen(buf) - 1] = 0; + + if (strcmp(buf, "HBM") == 0) + type = HBM_HBM_MEMORY; + else if (strcmp(buf, "DDR") == 0) + type = HBM_DDR_MEMORY; + +err: + fclose(file); + return type; +} + +static void hbm_repair_handler(const struct hisi_common_error_section *err) +{ + log(LOG_DEBUG, "Received ACLS/SPPR flat mode repair request, try to repair\n"); + char *sys_dev_path = "/sys/devices/platform"; + char path[MAX_PATH]; + struct dirent *dent; + DIR *dir; + int ret; + bool find_device = false, find_hbm_mem = false; + + ret = hbmc_hbm_page_isolate(err); + if (ret < 0) { + return; + } + + dir = 
opendir(sys_dev_path); + if (!dir) { + log(LOG_WARNING, "Can't read '%s': %s\n", + sys_dev_path, strerror(errno)); + notice_BMC(err, REPAIR_FAILED_OTHER_REASON); + return; + } + + while ((dent = readdir(dir))) { + if (!strstr(dent->d_name, HBM_MEM_RAS_NAME)) + continue; + find_device = true; + + snprintf(path, MAX_PATH, "%s/%s", sys_dev_path, dent->d_name); + + if (hbmc_get_memory_type(path) == HBM_HBM_MEMORY) { + find_hbm_mem = true; + ret = hbmc_hbm_repair(err, path); + if (ret != -ENXIO) + break; + } + } + if (!find_device) { + log(LOG_ERROR, "Repair driver is not loaded, skip error, error_type is %u\n", + err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); + notice_BMC(err, REPAIR_FAILED_OTHER_REASON); + } else if (!find_hbm_mem) { + log(LOG_ERROR, "No HBM device memory type found, skip error, error_type is %u\n", + err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); + notice_BMC(err, REPAIR_FAILED_OTHER_REASON); + } + + closedir(dir); +} + +static bool hbm_repair_validate(const struct hisi_common_error_section *err) +{ + if (!((err->val_bits & BIT(COMMON_VALID_MODULE_ID)) && + (err->val_bits & BIT(COMMON_VALID_SUBMODULE_ID)) && + (err->val_bits & BIT(COMMON_VALID_REG_ARRAY_SIZE)) + )) { + log(LOG_DEBUG, "Err val_bits validate failed, val_bits is %u\n", err->val_bits); + return false; + } + log(LOG_DEBUG, "err->module_id: %u\n", err->module_id); + log(LOG_DEBUG, "err->submodule_id: %u\n", err->submodule_id); + log(LOG_DEBUG, "err->val_bits: 0x%x\n", err->val_bits); + log(LOG_DEBUG, "err->reg_array_size: %u\n", err->reg_array_size); + + if (err->module_id != HBMC_MODULE_ID || + err->submodule_id != HBMC_SUBMOD_HBM_REPAIR) { + log(LOG_DEBUG, "err module_id or sub_module id doesn't not match\n"); + return false; + } + + uint32_t hbm_repair_reg_type = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK; + bool is_acls_valid = (hbm_repair_reg_type & (HBM_CE_ACLS | HBM_PSUE_ACLS)) && + (err->reg_array_size == HBM_ACLS_ARRAY_SIZE); + bool is_sppr_valid = 
(hbm_repair_reg_type & (HBM_CE_SPPR | HBM_PSUE_SPPR)) && + (err->reg_array_size == HBM_SPPR_ARRAY_SIZE); + bool is_cache_mode = (hbm_repair_reg_type & HBM_CACHE_MODE) && + (err->reg_array_size == HBM_CACHE_ARRAY_SIZE); + + if (!(is_acls_valid || is_sppr_valid || is_cache_mode)) { + log(LOG_DEBUG, "err type (%u) is unknown or address array length (%u) is invalid\n", + hbm_repair_reg_type, err->reg_array_size); + return false; + } + + log(LOG_INFO, "Received ACLS/SPPR repair request\n"); + return true; +} + +static bool hbm_flat_mode_validate(const struct hisi_common_error_section *err) +{ + uint32_t hbm_repair_reg_type = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK; + return !(hbm_repair_reg_type & HBM_CACHE_MODE); +} + +int decode_hisi_common_section(struct ras_non_standard_event *event) +{ + const struct hisi_common_error_section *err = (struct hisi_common_error_section *)event->error; + + if (hbm_repair_validate(err)) { + write_fault_info_to_flash(err); + if (hbm_flat_mode_validate(err)) { + hbm_repair_handler(err); + } + } + + return 0; +} diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.h b/src/c/hbm_online_repair/non-standard-hbm-repair.h new file mode 100644 index 0000000..7e8e448 --- /dev/null +++ b/src/c/hbm_online_repair/non-standard-hbm-repair.h @@ -0,0 +1,89 @@ +#ifndef __NON_STANDARD_HBM_REPAIR +#define __NON_STANDARD_HBM_REPAIR + +#include "ras-non-standard-handler.h" + +#define DEFAULT_PAGE_SIZE_KB 4 +#define HBM_MEM_RAS_NAME "HISI0521" +#define HBM_UNKNOWN 0 +#define HBM_HBM_MEMORY 1 +#define HBM_DDR_MEMORY 2 + +#define TYPE_UINT32_WIDTH 32 +#define HBM_REPAIR_REQ_TYPE 0 +#define HBM_CE_ACLS BIT(0) +#define HBM_PSUE_ACLS BIT(1) +#define HBM_CE_SPPR BIT(2) +#define HBM_PSUE_SPPR BIT(3) +#define HBM_CACHE_MODE (BIT(4) | BIT(5) | BIT(6) | BIT(7)) +#define HBM_ERROR_MASK 0b11111111 +#define HBM_ADDL 1 +#define HBM_ADDH 2 +#define HBM_ERROR_TYPE_SIZE 4 +#define HBM_ADDR_SIZE 8 +#define HBM_ACLS_ADDR_NUM 1 +#define HBM_SPPR_ADDR_NUM 16 
+#define HBM_ACLS_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE * HBM_ACLS_ADDR_NUM + HBM_ADDR_SIZE) +#define HBM_SPPR_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE * HBM_SPPR_ADDR_NUM + HBM_ADDR_SIZE) +#define HBM_CACHE_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE) +#define HBMC_MODULE_ID 0x28 +#define HBMC_SUBMOD_HBM_REPAIR 6 +#define COMMON_VALID_MODULE_ID 5 +#define COMMON_VALID_SUBMODULE_ID 6 +#define COMMON_VALID_REG_ARRAY_SIZE 12 + +#define BMC_SOCKET_PATH "/var/run/sysSentry/bmc.sock" +#define BMC_REPORT_FORMAT "REP00%02x%02x%02x0000000000000000%02x%02x%02x00%02x00%02x%02x%02x%08x%08x0000000000" + +#define ISOLATE_FAILED_OVER_THRESHOLD 0b10000001 +#define ISOLATE_FAILED_OTHER_REASON 0b10000010 +#define REPAIR_FAILED_NO_RESOURCE 0b10010100 +#define REPAIR_FAILED_INVALID_PARAM 0b10011000 +#define REPAIR_FAILED_OTHER_REASON 0b10011100 +#define ONLINE_PAGE_FAILED 0b10100000 +#define ISOLATE_REPAIR_ONLINE_SUCCESS 0b00000000 + +#define ROW_FAULT 1 +#define SINGLE_ADDR_FAULT 6 + +#define FAULT_ADDR_PROCESSOR_ID_LEN 2 +#define FAULT_ADDR_DIE_ID_LEN 1 +#define FAULT_ADDR_STACK_ID_LEN 3 +#define FAULT_ADDR_SID_LEN 3 +#define FAULT_ADDR_CHANNEL_ID_LEN 8 +#define FAULT_ADDR_BANKGROUP_ID_LEN 3 +#define FAULT_ADDR_BANK_ID_LEN 3 +#define FAULT_ADDR_ROW_ID_LEN 17 +#define FAULT_ADDR_COLUMN_ID_LEN 10 +#define FAULT_ADDR_ERROR_TYPE_LEN 2 +#define FAULT_ADDR_REPAIR_TYPE_LEN 2 +#define FAULT_ADDR_RESERVED_LEN 2 +#define FAULT_ADDR_CRC8_LEN 8 + +#define FAULT_ADDR_PROCESSOR_ID_MASK ((1 << FAULT_ADDR_PROCESSOR_ID_LEN ) - 1) +#define FAULT_ADDR_DIE_ID_MASK ((1 << FAULT_ADDR_DIE_ID_LEN ) - 1) +#define FAULT_ADDR_STACK_ID_MASK ((1 << FAULT_ADDR_STACK_ID_LEN ) - 1) +#define FAULT_ADDR_SID_MASK ((1 << FAULT_ADDR_SID_LEN ) - 1) +#define FAULT_ADDR_CHANNEL_ID_MASK ((1 << FAULT_ADDR_CHANNEL_ID_LEN ) - 1) +#define FAULT_ADDR_BANKGROUP_ID_MASK ((1 << FAULT_ADDR_BANKGROUP_ID_LEN ) - 1) +#define FAULT_ADDR_BANK_ID_MASK ((1 << FAULT_ADDR_BANK_ID_LEN ) - 1) +#define 
FAULT_ADDR_ROW_ID_MASK ((1 << FAULT_ADDR_ROW_ID_LEN ) - 1) +#define FAULT_ADDR_COLUMN_ID_MASK ((1 << FAULT_ADDR_COLUMN_ID_LEN ) - 1) +#define FAULT_ADDR_ERROR_TYPE_MASK ((1 << FAULT_ADDR_ERROR_TYPE_LEN ) - 1) +#define FAULT_ADDR_REPAIR_TYPE_MASK ((1 << FAULT_ADDR_REPAIR_TYPE_LEN ) - 1) +#define FAULT_ADDR_RESERVED_MASK ((1 << FAULT_ADDR_RESERVED_LEN ) - 1) +#define FAULT_ADDR_CRC8_MASK ((1 << FAULT_ADDR_CRC8_LEN ) - 1) + +#define EFI_VARIABLE_NON_VOLATILE 0x1 +#define EFI_VARIABLE_BOOTSERVICE_ACCESS 0x2 +#define EFI_VARIABLE_RUNTIME_ACCESS 0x4 +#define EFI_VARIABLE_APPEND_WRITE 0x40 + +#define EFIVARFS_PATH "/sys/firmware/efi/efivars" +#define MAX_VAR_SIZE (128 * 1024) +#define FLASH_ENTRY_NUM 8 +#define KB_SIZE 1024 + +extern int init_all_flash(); + +#endif diff --git a/src/c/hbm_online_repair/ras-events.c b/src/c/hbm_online_repair/ras-events.c new file mode 100644 index 0000000..0b12329 --- /dev/null +++ b/src/c/hbm_online_repair/ras-events.c @@ -0,0 +1,534 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "ras-non-standard-handler.h" +#include "logger.h" + +/* + * Polling time, if read() doesn't block. Currently, trace_pipe_raw never + * blocks on read(). So, we need to sleep for a while, to avoid spending + * too much CPU cycles. A fix for it is expected for 3.10. 
+ */ +#define POLLING_TIME 3 + +/* Test for a little-endian machine */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #define ENDIAN KBUFFER_ENDIAN_LITTLE +#else + #define ENDIAN KBUFFER_ENDIAN_BIG +#endif + +static int get_debugfs_dir(char *debugfs_dir, size_t len) +{ + FILE *fp; + char line[MAX_PATH + 1 + 256]; + + fp = fopen("/proc/mounts","r"); + if (!fp) { + log(LOG_INFO, "Can't open /proc/mounts\n"); + return errno; + } + + do { + char *p, *type, *dir; + if (!fgets(line, sizeof(line), fp)) + break; + + p = strtok(line, " \t"); + if (!p) + break; + + dir = strtok(NULL, " \t"); + if (!dir) + break; + + type = strtok(NULL, " \t"); + if (!type) + break; + + if (!strcmp(type, "debugfs")) { + fclose(fp); + strncpy(debugfs_dir, dir, len - 1); + debugfs_dir[len - 1] = '\0'; + return 0; + } + } while(1); + + fclose(fp); + log(LOG_INFO, "Can't find debugfs\n"); + return ENOENT; +} + + +static int open_trace(char *trace_dir, char *name, int flags) +{ + int ret; + char fname[MAX_PATH + 1]; + + strcpy(fname, trace_dir); + strcat(fname, "/"); + strcat(fname, name); + + ret = open(fname, flags); + if (ret < 0) + log(LOG_WARNING, "open_trace() failed, fname=%s ret=%d errno=%d\n", fname, ret, errno); + + return ret; +} + +static int create_trace_instance(char *trace_instance_dir) +{ + char fname[MAX_PATH + 1]; + int rc; + + if (get_debugfs_dir(fname, sizeof(fname)) != 0) return -1; /* fname is not filled in on failure, so it must not reach strcat() */ + strcat(fname, "/tracing/instances/"TOOL_NAME); + rc = mkdir(fname, S_IRWXU); + if (rc < 0 && errno != EEXIST) { + log(LOG_INFO, "Unable to create " TOOL_NAME " instance at %s\n", fname); + return -1; + } + strcpy(trace_instance_dir, fname); + return 0; +} + +struct ras_events *init_trace_instance(void) +{ + struct ras_events *ras = calloc(1, sizeof(*ras)); + if (!ras) { + log(LOG_ERROR, "Can't allocate memory for ras struct\n"); + return NULL; + } + int rc = create_trace_instance(ras->tracing); + if (rc < 0) { + free(ras); + return NULL; + } + return ras; +} + +/* + * Tracing enable/disable code + */ +int 
toggle_ras_event(char *trace_dir, char *group, char *event, int enable) +{ + int fd, rc; + char fname[MAX_PATH + 1]; + + snprintf(fname, sizeof(fname), "%s%s:%s\n", + enable ? "" : "!", + group, event); + + /* Enable RAS events */ + fd = open_trace(trace_dir, "set_event", O_RDWR | O_APPEND); + if (fd < 0) { + log(LOG_WARNING, "Can't open set_event\n"); + rc = -errno; + goto err; + } + + rc = write(fd, fname, strlen(fname)); + close(fd); + if (rc <= 0) { + log(LOG_WARNING, "Can't write to set_event\n"); + rc = -EIO; + goto err; + } + + log(LOG_INFO, "%s:%s event %s\n", + group, event, + enable ? "enabled" : "disabled"); + return 0; +err: + log(LOG_ERROR, "Can't %s %s:%s tracing\n", + enable ? "enable" : "disable", group, event); + return rc; +} + +static int parse_header_page(struct ras_events *ras, struct tep_handle *pevent) +{ + int fd, len, page_size = DEFAULT_PAGE_SIZE; + char buf[page_size]; + + fd = open_trace(ras->tracing, "events/header_page", O_RDONLY); + if (fd < 0) { + log(LOG_WARNING, "Open event header page failed\n"); + return -1; + } + + len = read(fd, buf, page_size); + close(fd); + if (len <= 0) { + log(LOG_WARNING, "Read event header page failed\n"); + return -1; + } + + if (tep_parse_header_page(pevent, buf, len, sizeof(long))) { + log(LOG_WARNING, "Parse event header page failed\n"); + return -1; + } + + return 0; +} + +static void parse_ras_data(struct pcpu_data *pdata, struct kbuffer *kbuf, + void *data, unsigned long long time_stamp) +{ + struct tep_record record; + struct trace_seq s; + + record.ts = time_stamp; + record.size = kbuffer_event_size(kbuf); + record.data = data; + record.offset = kbuffer_curr_offset(kbuf); + record.cpu = pdata->cpu; + + /* note offset is just offset in subbuffer */ + record.missed_events = kbuffer_missed_events(kbuf); + record.record_size = kbuffer_curr_size(kbuf); + + trace_seq_init(&s); + tep_print_event(pdata->ras->pevent, &s, &record, "%s-%s-%d-%s", + TEP_PRINT_NAME, TEP_PRINT_COMM, TEP_PRINT_TIME, 
TEP_PRINT_INFO); + trace_seq_do_printf(&s); + fflush(stdout); + trace_seq_destroy(&s); +} + +static int get_num_cpus() +{ + return sysconf(_SC_NPROCESSORS_ONLN); +} + +static int set_buffer_percent(struct ras_events *ras, int percent) +{ + int res = 0; + int fd; + + fd = open_trace(ras->tracing, "buffer_percent", O_WRONLY); + if (fd >= 0) { + char buf[16]; + ssize_t size; + snprintf(buf, sizeof(buf), "%d", percent); + size = write(fd, buf, strlen(buf)); + if (size <= 0) { + log(LOG_WARNING, "can't write to buffer_percent\n"); + res = -1; + } + close(fd); + } else { + log(LOG_WARNING, "Can't open buffer_percent\n"); + res = -1; + } + + return res; +} + +static int read_ras_event_all_cpus(struct pcpu_data *pdata, + unsigned n_cpus) +{ + ssize_t size; + unsigned long long time_stamp; + void *data; + int ready, i, count_nready; + struct kbuffer *kbuf; + void *page; + struct pollfd fds[n_cpus + 1]; + struct signalfd_siginfo fdsiginfo; + sigset_t mask; + int warnonce[n_cpus]; + char pipe_raw[PATH_MAX]; + + memset(&warnonce, 0, sizeof(warnonce)); + + page = malloc(pdata[0].ras->page_size); + if (!page) { + log(LOG_ERROR, "Can't allocate page\n"); + return -ENOMEM; + } + + kbuf = kbuffer_alloc(KBUFFER_LSIZE_8, ENDIAN); + if (!kbuf) { + log(LOG_ERROR, "Can't allocate kbuf\n"); + free(page); + return -ENOMEM; + } + + /* Fix for poll() on the per_cpu trace_pipe and trace_pipe_raw blocks + * indefinitely with the default buffer_percent in the kernel trace system, + * which is introduced by the following change in the kernel. + * https://lore.kernel.org/all/20221020231427.41be3f26@gandalf.local.home/T/#u. 
+ * Set buffer_percent to 0 so that poll() will return immediately + * when the trace data is available in the ras per_cpu trace pipe_raw + */ + if (set_buffer_percent(pdata[0].ras, 0)) + log(LOG_WARNING, "Set buffer_percent failed\n"); + + for (i = 0; i < (n_cpus + 1); i++) + fds[i].fd = -1; + + for (i = 0; i < n_cpus; i++) { + fds[i].events = POLLIN; + + snprintf(pipe_raw, sizeof(pipe_raw), + "per_cpu/cpu%d/trace_pipe_raw", i); + + fds[i].fd = open_trace(pdata[0].ras->tracing, pipe_raw, O_RDONLY); + if (fds[i].fd < 0) { + log(LOG_ERROR, "Can't open trace_pipe_raw\n"); + goto error; + } + } + + sigemptyset(&mask); + sigaddset(&mask, SIGINT); + sigaddset(&mask, SIGTERM); + sigaddset(&mask, SIGHUP); + sigaddset(&mask, SIGQUIT); + if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1) + log(LOG_WARNING, "sigprocmask\n"); + fds[n_cpus].events = POLLIN; + fds[n_cpus].fd = signalfd(-1, &mask, 0); + if (fds[n_cpus].fd < 0) { + log(LOG_WARNING, "signalfd\n"); + goto error; + } + + log(LOG_INFO, "Listening to events for cpus 0 to %u\n", n_cpus - 1); + + do { + ready = poll(fds, (n_cpus + 1), -1); + if (ready < 0) { + log(LOG_WARNING, "poll\n"); + } + + /* check for the signal */ + if (fds[n_cpus].revents & POLLIN) { + size = read(fds[n_cpus].fd, &fdsiginfo, + sizeof(struct signalfd_siginfo)); + if (size != sizeof(struct signalfd_siginfo)) { + log(LOG_WARNING, "signalfd read\n"); + continue; + } + + if (fdsiginfo.ssi_signo == SIGINT || + fdsiginfo.ssi_signo == SIGTERM || + fdsiginfo.ssi_signo == SIGHUP || + fdsiginfo.ssi_signo == SIGQUIT) { + log(LOG_INFO, "Recevied signal=%d\n", + fdsiginfo.ssi_signo); + goto error; + } else { + log(LOG_INFO, + "Received unexpected signal=%d\n", + fdsiginfo.ssi_signo); + continue; + } + } + + count_nready = 0; + for (i = 0; i < n_cpus; i++) { + if (fds[i].revents & POLLERR) { + if (!warnonce[i]) { + log(LOG_INFO, + "Error on CPU %i\n", i); + warnonce[i]++; + } + continue; + } + if (!(fds[i].revents & POLLIN)) { + count_nready++; + continue; + } + 
size = read(fds[i].fd, page, pdata[i].ras->page_size); + if (size < 0) { + log(LOG_WARNING, "read\n"); + goto error; + } else if (size > 0) { + log(LOG_DEBUG, "cpu %d receive %ld bytes data\n", i, size); + kbuffer_load_subbuffer(kbuf, page); + + while ((data = kbuffer_read_event(kbuf, &time_stamp))) { + if (kbuffer_curr_size(kbuf) < 0) { + log(LOG_ERROR, "invalid kbuf data, discard\n"); + break; + } + + log(LOG_DEBUG, "parse_ras_data\n"); + parse_ras_data(&pdata[i], + kbuf, data, time_stamp); + + /* increment to read next event */ + log(LOG_DEBUG, "kbuffer_next_event\n"); + kbuffer_next_event(kbuf, NULL); + } + } else { + count_nready++; + } + } + + /* + * If count_nready == n_cpus, there is no cpu fd in POLLIN state, + * so we need to break the cycle + */ + if (count_nready == n_cpus) { + log(LOG_ERROR, "no cpu fd in POLLIN state, stop running\n"); + break; + } + } while (1); + +error: + kbuffer_free(kbuf); + free(page); + sigprocmask(SIG_UNBLOCK, &mask, NULL); + + for (i = 0; i < (n_cpus + 1); i++) { + if (fds[i].fd > 0) + close(fds[i].fd); + } + + return -1; +} + +static int init_header_page(struct ras_events *ras, struct tep_handle *pevent) +{ + int rc; + + rc = parse_header_page(ras, pevent); + if (rc) { + log(LOG_ERROR, "cannot read trace header_page: %d\n", rc); + return rc; + } + return 0; +} + +static int init_event_format(struct ras_events *ras, struct tep_handle *pevent, + char *group, char *event) +{ + char *page, fname[MAX_PATH + 1]; + int fd, size, rc, page_size = DEFAULT_PAGE_SIZE; + + // read one page from format + snprintf(fname, sizeof(fname), "events/%s/%s/format", group, event); + fd = open_trace(ras->tracing, fname, O_RDONLY); + if (fd < 0) { + log(LOG_ERROR, + "Can't get %s:%s traces. 
Perhaps this feature is not supported on your system.\n", + group, event); + return errno; + } + + log(LOG_INFO, "page_size: %d\n", page_size); + ras->page_size = page_size; + page = malloc(page_size); + if (!page) { + log(LOG_ERROR, "Can't allocate page to read %s:%s format\n", + group, event); + rc = errno; + close(fd); + return rc; + } + + size = read(fd, page, page_size); + close(fd); + if (size < 0) { + log(LOG_ERROR, "Can't read format\n"); + free(page); + return size; + } + + // parse event format + rc = tep_parse_event(pevent, page, size, group); + if (rc) { + log(LOG_ERROR, "Can't parse event %s:%s\n", group, event); + free(page); + return EINVAL; + } + return 0; +} + +static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent, + char *group, char *event, + tep_event_handler_func func) +{ + int rc; + + rc = init_event_format(ras, pevent, group, event); + if (rc) { + log(LOG_ERROR, "init_event_format for %s:%s failed\n", group, event); + return rc; + } + + /* Registers the special event handlers */ + rc = tep_register_event_handler(pevent, -1, group, event, func, ras); + if (rc < 0) { + log(LOG_ERROR, "Can't register event handler for %s:%s\n", + group, event); + return EINVAL; + } + + return 0; +} + +int handle_ras_events(struct ras_events *ras) +{ + int rc, i; + unsigned cpus; + struct tep_handle *pevent = NULL; + struct pcpu_data *data = NULL; + + pevent = tep_alloc(); + if (!pevent) { + log(LOG_ERROR, "Can't allocate pevent\n"); + rc = errno; + goto err; + } + ras->pevent = pevent; + + rc = init_header_page(ras, pevent); + if (rc) { + log(LOG_ERROR, "init_header_page failed\n"); + goto err; + } + + rc = add_event_handler(ras, pevent, "ras", "non_standard_event", + ras_non_standard_event_handler); + if (rc) { + log(LOG_ERROR, "Can't get traces from %s:%s\n", + "ras", "non_standard_event"); + goto err; + } + log(LOG_INFO, "add_event_handler done\n"); + + cpus = get_num_cpus(); + data = calloc(sizeof(*data), cpus); + if (!data) + goto 
err; + + for (i = 0; i < cpus; i++) { + data[i].ras = ras; + data[i].cpu = i; + } + rc = read_ras_event_all_cpus(data, cpus); + +err: + if (data) + free(data); + if (pevent) + tep_free(pevent); + return rc; +} diff --git a/src/c/hbm_online_repair/ras-events.h b/src/c/hbm_online_repair/ras-events.h new file mode 100644 index 0000000..4218d93 --- /dev/null +++ b/src/c/hbm_online_repair/ras-events.h @@ -0,0 +1,28 @@ +#ifndef __RAS_EVENTS_H +#define __RAS_EVENTS_H + +#include +#include + +#define MAX_PATH 1024 + +#define DEFAULT_PAGE_SIZE 4096 + +struct ras_events { + char tracing[MAX_PATH + 1]; + struct tep_handle *pevent; + int page_size; +}; + +struct pcpu_data { + struct tep_handle *pevent; + struct ras_events *ras; + int cpu; +}; + +/* Function prototypes */ +int toggle_ras_event(char *trace_dir, char *group, char *event, int enable); +int handle_ras_events(struct ras_events *ras); +struct ras_events *init_trace_instance(void); + +#endif diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.c b/src/c/hbm_online_repair/ras-non-standard-handler.c new file mode 100644 index 0000000..1d1fd04 --- /dev/null +++ b/src/c/hbm_online_repair/ras-non-standard-handler.c @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include +#include "ras-non-standard-handler.h" +#include "logger.h" + +static char *uuid_le(const char *uu) +{ + static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; + if (!uu) { + log(LOG_ERROR, "uuid_le failed: uu is empty"); + return uuid; + } + size_t uu_len = strlen(uu); + if (uu_len < SECTION_TYPE_UUID_LEN) { + log(LOG_ERROR, "uuid_le failed: uu is too short"); + return uuid; + } + + char *p = uuid; + int i; + static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; + + for (i = 0; i < 16; i++) { + p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); + switch (i) { + case 3: + case 5: + case 7: + case 9: + *p++ = '-'; + break; + } + } + + *p = 0; + + return uuid; +} + +int 
ras_non_standard_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context) +{ + int len; + unsigned long long val; + struct ras_non_standard_event ev; + + ev.sec_type = tep_get_field_raw(s, event, "sec_type", + record, &len, 1); + if(!ev.sec_type) { + log(LOG_WARNING, "get event section type failed"); + return -1; + } + + trace_seq_printf(s, "\n"); + trace_seq_printf(s, "sec_type: %s\n", uuid_le(ev.sec_type)); + + if (tep_get_field_val(s, event, "len", record, &val, 1) < 0) { + log(LOG_WARNING, "tep get field val failed"); + return -1; + } + + ev.length = val; + trace_seq_printf(s, "length: %d\n", ev.length); + + ev.error = tep_get_field_raw(s, event, "buf", record, &len, 1); + if(!ev.error || ev.length != len) { + log(LOG_WARNING, "get event error failed"); + return -1; + } + + if (strcmp(uuid_le(ev.sec_type), HISI_COMMON_SECTION_TYPE_UUID) == 0) { + decode_hisi_common_section(&ev); + } + + return 0; +} diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.h b/src/c/hbm_online_repair/ras-non-standard-handler.h new file mode 100644 index 0000000..0272dc1 --- /dev/null +++ b/src/c/hbm_online_repair/ras-non-standard-handler.h @@ -0,0 +1,25 @@ +#ifndef __RAS_NON_STANDARD_HANDLER_H +#define __RAS_NON_STANDARD_HANDLER_H + +#include +#include "ras-events.h" + +#define BIT(nr) (1UL << (nr)) + +#define SECTION_TYPE_UUID_LEN 16 +#define HISI_COMMON_SECTION_TYPE_UUID "c8b328a8-9917-4af6-9a13-2e08ab2e7586" + +struct ras_non_standard_event { + char timestamp[64]; + const char *sec_type; + const uint8_t *error; + uint32_t length; +}; + +int ras_non_standard_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); + +int decode_hisi_common_section(struct ras_non_standard_event *event); + +#endif diff --git a/src/python/.gitignore b/src/python/.gitignore new file mode 100644 index 0000000..c18dd8d --- /dev/null +++ b/src/python/.gitignore @@ -0,0 +1 @@ +__pycache__/ diff 
--git a/src/python/syssentry/bmc_alarm.py b/src/python/syssentry/bmc_alarm.py new file mode 100644 index 0000000..5956538 --- /dev/null +++ b/src/python/syssentry/bmc_alarm.py @@ -0,0 +1,159 @@ +import logging +import socket +from enum import Enum + +from .utils import execute_command + +HEX_CHAR_LEN = 2 +SOCKET_RECEIVE_LEN = 128 +BMC_DATA_HEAD = "REP" +BMC_REPORT_TYPE_BIT = 0 +HBMC_REPAIR_TYPE_BIT = 1 +HBMC_REPAIR_RESULT_BIT = 2 +HBMC_ISOLATION_TYPE_BIT = 3 +HBMC_SEND_HEAD_LEN = 4 # "ipmtool", "raw", "0x30", "0x92" +HBMC_SEND_ROW_BIT = 26 + HBMC_SEND_HEAD_LEN +HBMC_SEND_COL_BIT = 30 + HBMC_SEND_HEAD_LEN +HBMC_REPAIR_TYPE_OFFSET = 7 + +HBMC_SEND_SUCCESS_CODE = "db 07 00" + + +class ReportType(Enum): + HBMC_REPAIR_BMC = 0x00 + + +class HBMCRepairType(Enum): + CE_ACLS = 7 + PS_UCE_ACLS = 8 + CE_SPPR = 9 + PS_UCE_SPPR = 10 + + +class HBMCRepairResultType(Enum): + ISOLATE_FAILED_OVER_THRESHOLD = 0b10000001 + ISOLATE_FAILED_OTHER_REASON = 0b10000010 + REPAIR_FAILED_NO_RESOURCE = 0b10010100 + REPAIR_FAILED_INVALID_PARAM = 0b10011000 + REPAIR_FAILED_OTHER_REASON = 0b10011100 + ONLINE_PAGE_FAILED = 0b10100000 + ISOLATE_REPAIR_ONLINE_SUCCESS = 0b00000000 + + +class HBMCIsolationType(Enum): + ROW_FAULT = 1 + SINGLE_ADDR_FAULT = 6 + + +def find_value_is_in_enum(value: int, enum: Enum): + for item in enum: + if value == item.value: + return True + return False + + +def convert_hex_char_to_int(data, bit): + if len(data) < (bit+1)*HEX_CHAR_LEN: + logging.error(f"Data {data} len is too short, current convert bit is {bit}") + char = data[bit*HEX_CHAR_LEN:(bit+1)*HEX_CHAR_LEN] + try: + value = int(char, 16) + except ValueError: + logging.error(f"Cannot convert char [{char}] to int") + raise ValueError + return value + + +def reverse_byte(data): + return data[3], data[2], data[1], data[0] + + +def parse_hbmc_report(data: str): + logging.debug(f"bmc receive raw data is {data}") + repair_type = convert_hex_char_to_int(data, HBMC_REPAIR_TYPE_BIT) + repair_type += 
HBMC_REPAIR_TYPE_OFFSET + if not find_value_is_in_enum(repair_type, HBMCRepairType): + logging.warning(f"HBMC msg repair type ({repair_type}) is unknown") + raise ValueError + + repair_result = convert_hex_char_to_int(data, HBMC_REPAIR_RESULT_BIT) + if not find_value_is_in_enum(repair_result, HBMCRepairResultType): + logging.warning(f"HBMC msg repair result ({repair_result}) is unknown") + raise ValueError + + isolation_type = convert_hex_char_to_int(data, HBMC_ISOLATION_TYPE_BIT) + if not find_value_is_in_enum(isolation_type, HBMCIsolationType): + logging.warning(f"HBMC msg isolation type ({isolation_type}) is unknown") + raise ValueError + + cmd_list = [ + "ipmitool", + "raw", + "0x30", # Netfn + "0x92", # cmd + "0xdb", + "0x07", + "0x00", + "0x65", # sub command + "0x01", # SystemId + "0x00", # LocalSystemId + "{:#04X}".format(repair_type), + "{:#04X}".format(repair_result), + "{:#04X}".format(isolation_type), + ] + # send the remain data directly + data = data[(HBMC_ISOLATION_TYPE_BIT + 1) * HEX_CHAR_LEN:] + other_info_str = [] + for i in range(len(data) // 2): + other_info_str.append("{:#04X}".format(convert_hex_char_to_int(data, i))) + cmd_list.extend(other_info_str) + + cmd_list[HBMC_SEND_ROW_BIT:HBMC_SEND_ROW_BIT + 4] = reverse_byte(cmd_list[HBMC_SEND_ROW_BIT:HBMC_SEND_ROW_BIT + 4]) + cmd_list[HBMC_SEND_COL_BIT:HBMC_SEND_COL_BIT + 4] = reverse_byte(cmd_list[HBMC_SEND_COL_BIT:HBMC_SEND_COL_BIT + 4]) + + logging.info(f"Send bmc alarm command is {cmd_list}") + + ret = execute_command(cmd_list) + if HBMC_SEND_SUCCESS_CODE not in ret: + logging.warning(f"Send bmc alarm failed, error code is {ret}") + raise ValueError + logging.debug("Send bmc alarm success") + + +PARSE_REPORT_MSG_FUNC_DICT = { + ReportType.HBMC_REPAIR_BMC.value: parse_hbmc_report, +} + + +def bmc_recv(server_socket: socket.socket): + logging.debug("Get hbm socket connection request") + try: + client_socket, _ = server_socket.accept() + logging.debug("cpu alarm fd listen ok") + + data = 
client_socket.recv(SOCKET_RECEIVE_LEN) + data = data.decode() + + data_head = data[0:len(BMC_DATA_HEAD)] + if data_head != BMC_DATA_HEAD: + logging.warning(f"The head of the msg is incorrect, head is {data_head}") + raise ValueError + + # remove the data head + data = data[len(BMC_DATA_HEAD):] + logging.info(f"Remove head data is {data}") + + report_type = convert_hex_char_to_int(data, BMC_REPORT_TYPE_BIT) + if report_type not in PARSE_REPORT_MSG_FUNC_DICT.keys(): + logging.warning(f"The type of the msg ({report_type}) is unknown") + raise ValueError + + PARSE_REPORT_MSG_FUNC_DICT[report_type](data) + + except socket.error: + logging.error("socket error") + return + except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError): + logging.error("server recv bmc msg failed!") + client_socket.close() + return diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py index 776971f..debff4e 100644 --- a/src/python/syssentry/syssentry.py +++ b/src/python/syssentry/syssentry.py @@ -44,6 +44,12 @@ try: except ImportError: CPU_EXIST = False +BMC_EXIST = True +try: + from .bmc_alarm import bmc_recv +except ImportError: + BMC_EXIST = False + INSPECTOR = None @@ -83,6 +89,9 @@ RESULT_SOCKET_PATH = "/var/run/sysSentry/result.sock" CPU_ALARM_SOCKET_PATH = "/var/run/sysSentry/report.sock" +BMC_SOCKET_PATH = "/var/run/sysSentry/bmc.sock" + +fd_list = [] def msg_data_process(msg_data): """message data process""" @@ -325,6 +334,41 @@ def cpu_alarm_fd_create(): return cpu_alarm_fd +def bmc_fd_create(): + """create bmc fd""" + if not os.path.exists(SENTRY_RUN_DIR): + logging.debug("%s not exist", SENTRY_RUN_DIR) + return None + + try: + bmc_fd = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + except socket.error: + logging.error("bmc fd create failed") + return None + + bmc_fd.setblocking(False) + if os.path.exists(BMC_SOCKET_PATH): + os.remove(BMC_SOCKET_PATH) + + try: + bmc_fd.bind(BMC_SOCKET_PATH) + except OSError: + logging.error("bmc fd 
bind failed") + bmc_fd.close() + return None + + os.chmod(BMC_SOCKET_PATH, 0o600) + try: + bmc_fd.listen(5) + except OSError: + logging.error("bmc fd listen failed") + bmc_fd.close() + return None + + logging.debug("%s bind and listen", BMC_SOCKET_PATH) + + return bmc_fd + def server_result_recv(server_socket: socket.socket): """server result receive""" @@ -398,35 +442,47 @@ def server_result_fd_create(): return server_result_fd +def close_all_fd(): + for fd in fd_list: + fd.close() + + def main_loop(): """main loop""" + server_fd = server_fd_create() if not server_fd: + close_all_fd() return + fd_list.append(server_fd) server_result_fd = server_result_fd_create() if not server_result_fd: - server_fd.close() + close_all_fd() return + fd_list.append(server_result_fd) heartbeat_fd = heartbeat_fd_create() if not heartbeat_fd: - server_fd.close() - server_result_fd.close() + close_all_fd() return + fd_list.append(heartbeat_fd) cpu_alarm_fd = cpu_alarm_fd_create() if not cpu_alarm_fd: - server_fd.close() - heartbeat_fd.close() - server_result_fd.close() + close_all_fd() return + fd_list.append(cpu_alarm_fd) + + bmc_fd = bmc_fd_create() + if not bmc_fd: + close_all_fd() + return + fd_list.append(bmc_fd) epoll_fd = select.epoll() - epoll_fd.register(server_fd.fileno(), select.EPOLLIN) - epoll_fd.register(server_result_fd.fileno(), select.EPOLLIN) - epoll_fd.register(heartbeat_fd.fileno(), select.EPOLLIN) - epoll_fd.register(cpu_alarm_fd.fileno(), select.EPOLLIN) + for fd in fd_list: + epoll_fd.register(fd.fileno(), select.EPOLLIN) logging.debug("start main loop") # onstart_tasks_handle() @@ -449,6 +505,8 @@ def main_loop(): heartbeat_recv(heartbeat_fd) elif CPU_EXIST and event_fd == cpu_alarm_fd.fileno(): cpu_alarm_recv(cpu_alarm_fd) + elif BMC_EXIST and event_fd == bmc_fd.fileno(): + bmc_recv(bmc_fd) else: continue @@ -587,4 +645,3 @@ def main(): logging.error('%s', traceback.format_exc()) finally: release_pidfile() - -- Gitee From 
9bbc6a4a59efad15aafe34361e06ec1fb7161206 Mon Sep 17 00:00:00 2001 From: luckky Date: Mon, 28 Oct 2024 18:34:34 +0800 Subject: [PATCH 18/24] fix hbm online repair notice and efi create --- src/c/hbm_online_repair/hbm_online_repair.c | 5 +- .../non-standard-hbm-repair.c | 194 +++++++++--------- .../non-standard-hbm-repair.h | 2 +- src/c/hbm_online_repair/ras-events.c | 1 - .../ras-non-standard-handler.c | 33 +-- .../ras-non-standard-handler.h | 1 + 6 files changed, 116 insertions(+), 120 deletions(-) diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c index 3ace206..b3b2742 100644 --- a/src/c/hbm_online_repair/hbm_online_repair.c +++ b/src/c/hbm_online_repair/hbm_online_repair.c @@ -127,10 +127,7 @@ int main(int argc, char *argv[]) return -1; } - ret = init_all_flash(); - if (ret < 0) { - log(LOG_ERROR, "flash writer init failed\n"); - } + get_flash_total_size(); handle_ras_events(ras); diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c index b175e14..f26d8ae 100644 --- a/src/c/hbm_online_repair/non-standard-hbm-repair.c +++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c @@ -15,7 +15,7 @@ #include "non-standard-hbm-repair.h" extern int page_isolation_threshold; -size_t total_size = 0; +size_t flash_total_size = 0; struct hisi_common_error_section { uint32_t val_bits; uint8_t version; @@ -122,28 +122,58 @@ static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned info_struct->crc8 = (uint32_t)fault_addr; } -static bool variable_existed(char *name, char *guid) +static bool is_variable_existing(char *name, char *guid) { + char filename[PATH_MAX]; + snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); + + return access(filename, F_OK | R_OK) == 0; +} + +static size_t get_var_size(char *name, char *guid) { char filename[PATH_MAX]; int fd; + struct stat stat; snprintf(filename, PATH_MAX - 1, "%s/%s-%s", 
EFIVARFS_PATH, name, guid); // open var file fd = open(filename, O_RDONLY); if (fd < 0) { - log(LOG_WARNING, "open file %s failed\n", filename); - return false; + log(LOG_WARNING, "open %s failed\n", filename); + goto err; + } + // read stat + if (fstat(fd, &stat) != 0) { + log(LOG_WARNING, "fstat %s failed\n", filename); + goto err; } close(fd); - return true; + return stat.st_size; +err: + if (fd >= 0) + close(fd); + return (size_t)-1; } -static uint32_t read_variable_attribute(char *name, char *guid) { +void get_flash_total_size() { + for (int i = 0; i < FLASH_ENTRY_NUM; i++) { + if (is_variable_existing(flash_names[i], flash_guids[i])) { + flash_total_size += get_var_size(flash_names[i], flash_guids[i]); + } + } + // check total entry size + log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n", + flash_total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE); + if (flash_total_size > MAX_VAR_SIZE) { + log(LOG_WARNING, "fault info storage %zu reach threshold, cannot save new record\n", flash_total_size); + } +} + +static int read_variable_attribute(char *name, char *guid, uint32_t *attribute) { char filename[PATH_MAX]; int fd; size_t readsize; - uint32_t attribute = (uint32_t)-1; snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); @@ -151,17 +181,18 @@ static uint32_t read_variable_attribute(char *name, char *guid) { fd = open(filename, O_RDONLY); if (fd < 0) { log(LOG_ERROR, "open %s failed\n", filename); - return attribute; + return -1; } // read attributes from first 4 bytes - readsize = read(fd, &attribute, sizeof(uint32_t)); + readsize = read(fd, attribute, sizeof(uint32_t)); if (readsize != sizeof(uint32_t)) { log(LOG_ERROR, "read attribute of %s failed\n", filename); + return -1; } close(fd); - return attribute; + return 0; } static int efivarfs_set_mutable(char *name, char *guid, bool mutable) @@ -205,8 +236,8 @@ err: return -1; } -static int write_variable(char *name, char *guid, void *value, unsigned long size, 
uint32_t attribute) { - int fd, mode; +static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute, bool is_existing) { + int fd = -1, mode; size_t writesize; void *buffer; unsigned long total; @@ -225,16 +256,13 @@ static int write_variable(char *name, char *guid, void *value, unsigned long siz memcpy(buffer + sizeof(uint32_t), value, size); // change attr - if (efivarfs_set_mutable(name, guid, 1) != 0) { + if (is_existing && efivarfs_set_mutable(name, guid, 1) != 0) { log(LOG_ERROR, "set mutable for %s failed\n", filename); goto err; } mode = O_WRONLY; - if (attribute & EFI_VARIABLE_APPEND_WRITE) - mode |= O_APPEND; - else - mode |= O_CREAT; + mode |= is_existing ? O_APPEND : O_CREAT; // open var file fd = open(filename, mode, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); @@ -252,7 +280,7 @@ static int write_variable(char *name, char *guid, void *value, unsigned long siz close(fd); free(buffer); - if (efivarfs_set_mutable(name, guid, 0) != 0) { + if (is_existing && efivarfs_set_mutable(name, guid, 0) != 0) { log(LOG_ERROR, "set immutable for %s failed\n", filename); } return 0; @@ -261,86 +289,21 @@ err: close(fd); if (buffer) free(buffer); - if (efivarfs_set_mutable(name, guid, 0) != 0) { + if (is_existing && efivarfs_set_mutable(name, guid, 0) != 0) { log(LOG_ERROR, "set immutable for %s failed\n", filename); } return -1; } -static int append_variable(char *name, char *guid, void *data, unsigned long size) { - // prepare append attribute - uint32_t attribute = read_variable_attribute(name, guid); - if (attribute == (uint32_t)-1) { - log(LOG_ERROR, "read %s-%s attribute failed\n", name, guid); - return -1; - } - attribute |= EFI_VARIABLE_APPEND_WRITE; - - return write_variable(name, guid, data, size, attribute); -} - -static size_t get_var_size(char *name, char *guid) { - char filename[PATH_MAX]; - int fd; - struct stat stat; - - snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); - - // open var file - fd 
= open(filename, O_RDONLY); - if (fd < 0) { - log(LOG_WARNING, "open %s failed\n", filename); - goto err; - } - // read stat - if (fstat(fd, &stat) != 0) { - log(LOG_WARNING, "fstat %s failed\n", filename); - goto err; - } - close(fd); - return stat.st_size; -err: - if (fd >= 0) - close(fd); - return (size_t)-1; -} - -int init_all_flash() { - for (int i = 0; i < FLASH_ENTRY_NUM; i++) { - // check existed entry - if (variable_existed(flash_names[i], flash_guids[i])) { - total_size += get_var_size(flash_names[i], flash_guids[i]); - continue; - } - // create new entry - uint32_t attribute = EFI_VARIABLE_NON_VOLATILE | - EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS; - char *data = ""; - unsigned long size = 1; - int ret = write_variable(flash_names[i], flash_guids[i], data, size, attribute); - if (ret) { - log(LOG_ERROR, "init %s-%s failed, fault info storage funtion not enabled\n", flash_names[i], flash_guids[i]); - return -1; - } - total_size += sizeof(uint32_t) + 1; - } - // check total entry size - log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n", - total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE); - if (total_size > MAX_VAR_SIZE) { - log(LOG_ERROR, "fault info storage reach threshold, cannot save new record\n"); - } - return 0; -} - static int write_fault_info_to_flash(const struct hisi_common_error_section *err) { int ret, guid_index; uint32_t reg_size; uint64_t fault_addr; + bool is_existing; + uint32_t attribute = -1; // check flash usage threshold - if (total_size + sizeof(uint64_t) > MAX_VAR_SIZE) { + if (flash_total_size + sizeof(uint64_t) > MAX_VAR_SIZE) { log(LOG_WARNING, "fault info storage reach threshold, cannot save new record into flash\n"); return -1; } @@ -359,14 +322,29 @@ static int write_fault_info_to_flash(const struct hisi_common_error_section *err log(LOG_ERROR, "invalid fault info\n"); return -1; } + + // judge if the efivar is existing to set the attribute + is_existing = 
is_variable_existing(flash_names[guid_index], flash_guids[guid_index]); + attribute = EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS; + if (is_existing) { + ret = read_variable_attribute(flash_names[guid_index], flash_guids[guid_index], &attribute); + if (ret < 0) { + log(LOG_ERROR, "read variable %s-%s attribute failed, stop writing\n", flash_names[guid_index], flash_guids[guid_index]); + return -1; + } + attribute |= EFI_VARIABLE_APPEND_WRITE; + } + // record physical addr in flash - ret = append_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t)); + ret = write_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t), attribute, is_existing); if (ret < 0) { - log(LOG_ERROR, "append to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]); + log(LOG_ERROR, "write to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]); return -1; } - total_size += sizeof(uint64_t); - log(LOG_INFO, "write hbm fault info to flash success\n"); + flash_total_size += sizeof(uint64_t); + log(LOG_INFO, "write hbm fault info to flash %s-%s success\n", flash_names[guid_index], flash_guids[guid_index]); return 0; } @@ -421,7 +399,7 @@ static int get_hardware_corrupted_size() return hardware_corrupted_size; } -static uint8_t get_repair_result_code(int ret) +static uint8_t get_repair_failed_result_code(int ret) { if (ret == -ENOSPC) { return REPAIR_FAILED_NO_RESOURCE; @@ -582,11 +560,11 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err) static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) { int ret; - if (repair_ret < 0) { + if (repair_ret <= 0) { log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? 
"ACLS" : "SPPR", paddr); /* not much we can do about errors here */ (void)write_file("/sys/kernel/page_eject", "remove_page", paddr); - return get_repair_result_code(repair_ret); + return get_repair_failed_result_code(repair_ret); } ret = write_file("/sys/kernel/page_eject", "online_page", paddr); @@ -615,9 +593,13 @@ static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_PSUE_ACLS; ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr); - if (ret < 0) { - notice_BMC(err, get_repair_result_code(ret)); - log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); + + /* Only positive num means the error is supported to repair */ + if (ret <= 0) { + if (ret != -ENXIO) { + notice_BMC(err, get_repair_failed_result_code(ret)); + log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); + } return ret; } @@ -642,8 +624,9 @@ static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char all_online_success = false; } } - if (ret < 0) { - notice_BMC(err, get_repair_result_code(ret)); + /* The ret is from the acls/sppr repair, and only positive num means the error is repaired successfully */ + if (ret <= 0) { + notice_BMC(err, get_repair_failed_result_code(ret)); return ret; } else if (all_online_success) { notice_BMC(err, ISOLATE_REPAIR_ONLINE_SUCCESS); @@ -698,7 +681,7 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err) struct dirent *dent; DIR *dir; int ret; - bool find_device = false, find_hbm_mem = false; + bool find_device = false, find_hbm_mem = false, addr_in_hbm_device = false; ret = hbmc_hbm_page_isolate(err); if (ret < 0) { @@ -723,10 +706,13 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err) if (hbmc_get_memory_type(path) == HBM_HBM_MEMORY) { find_hbm_mem = true; ret = hbmc_hbm_repair(err, path); - if (ret != -ENXIO) 
+ if (ret != -ENXIO) { + addr_in_hbm_device = true; break; + } } } + if (!find_device) { log(LOG_ERROR, "Repair driver is not loaded, skip error, error_type is %u\n", err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); @@ -735,6 +721,10 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err) log(LOG_ERROR, "No HBM device memory type found, skip error, error_type is %u\n", err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); notice_BMC(err, REPAIR_FAILED_OTHER_REASON); + } else if (!addr_in_hbm_device) { + log(LOG_ERROR, "Err addr is not in device, skip error, error_type is %u\n", + err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); + notice_BMC(err, REPAIR_FAILED_INVALID_PARAM); } closedir(dir); @@ -769,7 +759,7 @@ static bool hbm_repair_validate(const struct hisi_common_error_section *err) (err->reg_array_size == HBM_CACHE_ARRAY_SIZE); if (!(is_acls_valid || is_sppr_valid || is_cache_mode)) { - log(LOG_DEBUG, "err type (%u) is unknown or address array length (%u) is invalid\n", + log(LOG_WARNING, "err type (%u) is unknown or address array length (%u) is invalid\n", hbm_repair_reg_type, err->reg_array_size); return false; } diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.h b/src/c/hbm_online_repair/non-standard-hbm-repair.h index 7e8e448..ecb04fe 100644 --- a/src/c/hbm_online_repair/non-standard-hbm-repair.h +++ b/src/c/hbm_online_repair/non-standard-hbm-repair.h @@ -84,6 +84,6 @@ #define FLASH_ENTRY_NUM 8 #define KB_SIZE 1024 -extern int init_all_flash(); +extern void get_flash_total_size(); #endif diff --git a/src/c/hbm_online_repair/ras-events.c b/src/c/hbm_online_repair/ras-events.c index 0b12329..4d281ad 100644 --- a/src/c/hbm_online_repair/ras-events.c +++ b/src/c/hbm_online_repair/ras-events.c @@ -348,7 +348,6 @@ static int read_ras_event_all_cpus(struct pcpu_data *pdata, "Error on CPU %i\n", i); warnonce[i]++; } - continue; } if (!(fds[i].revents & POLLIN)) { count_nready++; diff --git 
a/src/c/hbm_online_repair/ras-non-standard-handler.c b/src/c/hbm_online_repair/ras-non-standard-handler.c index 1d1fd04..48ffa70 100644 --- a/src/c/hbm_online_repair/ras-non-standard-handler.c +++ b/src/c/hbm_online_repair/ras-non-standard-handler.c @@ -7,17 +7,21 @@ #include "ras-non-standard-handler.h" #include "logger.h" -static char *uuid_le(const char *uu) +static int uuid_le(const char *uu, char* uuid) { - static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; if (!uu) { log(LOG_ERROR, "uuid_le failed: uu is empty"); - return uuid; + return -1; } size_t uu_len = strlen(uu); - if (uu_len < SECTION_TYPE_UUID_LEN) { - log(LOG_ERROR, "uuid_le failed: uu is too short"); - return uuid; + if (uu_len != SECTION_TYPE_UUID_LEN) { + log(LOG_ERROR, "uuid_le failed: uu len is incorrect"); + return -1; + } + size_t uuid_len = strlen(uuid); + if (uuid_len != strlen(UUID_STR_TYPE)) { + log(LOG_ERROR, "uuid_le failed: uuid len is incorrect"); + return -1; } char *p = uuid; @@ -38,7 +42,7 @@ static char *uuid_le(const char *uu) *p = 0; - return uuid; + return 0; } int ras_non_standard_event_handler(struct trace_seq *s, @@ -52,15 +56,20 @@ int ras_non_standard_event_handler(struct trace_seq *s, ev.sec_type = tep_get_field_raw(s, event, "sec_type", record, &len, 1); if(!ev.sec_type) { - log(LOG_WARNING, "get event section type failed"); + log(LOG_WARNING, "get event section type failed\n"); return -1; } trace_seq_printf(s, "\n"); - trace_seq_printf(s, "sec_type: %s\n", uuid_le(ev.sec_type)); + char uuid[sizeof(UUID_STR_TYPE)] = UUID_STR_TYPE; + if (uuid_le(ev.sec_type, uuid) < 0) { + log(LOG_WARNING, "get uuid failed\n"); + return -1; + } + trace_seq_printf(s, "sec_type: %s\n", uuid); if (tep_get_field_val(s, event, "len", record, &val, 1) < 0) { - log(LOG_WARNING, "tep get field val failed"); + log(LOG_WARNING, "tep get field val failed\n"); return -1; } @@ -69,11 +78,11 @@ int ras_non_standard_event_handler(struct trace_seq *s, ev.error = tep_get_field_raw(s, event, 
"buf", record, &len, 1); if(!ev.error || ev.length != len) { - log(LOG_WARNING, "get event error failed"); + log(LOG_WARNING, "get event error failed\n"); return -1; } - if (strcmp(uuid_le(ev.sec_type), HISI_COMMON_SECTION_TYPE_UUID) == 0) { + if (strcmp(uuid, HISI_COMMON_SECTION_TYPE_UUID) == 0) { decode_hisi_common_section(&ev); } diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.h b/src/c/hbm_online_repair/ras-non-standard-handler.h index 0272dc1..15a37ee 100644 --- a/src/c/hbm_online_repair/ras-non-standard-handler.h +++ b/src/c/hbm_online_repair/ras-non-standard-handler.h @@ -7,6 +7,7 @@ #define BIT(nr) (1UL << (nr)) #define SECTION_TYPE_UUID_LEN 16 +#define UUID_STR_TYPE "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" #define HISI_COMMON_SECTION_TYPE_UUID "c8b328a8-9917-4af6-9a13-2e08ab2e7586" struct ras_non_standard_event { -- Gitee From 3e30a27620491012b2f0e11c95b920d21231f1b3 Mon Sep 17 00:00:00 2001 From: luckky Date: Fri, 1 Nov 2024 15:01:39 +0800 Subject: [PATCH 19/24] fix uint8 bug and change isolation default value --- src/c/hbm_online_repair/hbm_online_repair.env | 2 +- src/c/hbm_online_repair/non-standard-hbm-repair.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/c/hbm_online_repair/hbm_online_repair.env b/src/c/hbm_online_repair/hbm_online_repair.env index de56079..7166c8d 100644 --- a/src/c/hbm_online_repair/hbm_online_repair.env +++ b/src/c/hbm_online_repair/hbm_online_repair.env @@ -1,2 +1,2 @@ HBM_ONLINE_REPAIR_LOG_LEVEL=1 -PAGE_ISOLATION_THRESHOLD=128 +PAGE_ISOLATION_THRESHOLD=3355443 diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c index f26d8ae..b8dde7a 100644 --- a/src/c/hbm_online_repair/non-standard-hbm-repair.c +++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c @@ -359,7 +359,7 @@ static int write_file(char *path, const char *name, unsigned long long value) fd = open(fname, O_WRONLY); if (fd < 0) { - log(LOG_WARNING, "HBM ACLS: 
Cannot to open '%s': %s\n", + log(LOG_WARNING, "HBM: Cannot to open '%s': %s\n", fname, strerror(errno)); return -errno; } @@ -367,7 +367,7 @@ static int write_file(char *path, const char *name, unsigned long long value) snprintf(buf, sizeof(buf), "0x%llx\n", value); ret = write(fd, buf, strlen(buf)); if (ret <= 0) - log(LOG_WARNING, "HBM ACLS: Failed to set %s (0x%llx): %s\n", + log(LOG_WARNING, "HBM: Failed to set %s (0x%llx): %s\n", fname, value, strerror(errno)); close(fd); @@ -557,7 +557,7 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err) return ret < 0 ? ret : 0; } -static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) +static uint8_t hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) { int ret; if (repair_ret <= 0) { @@ -577,7 +577,7 @@ static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsig } } -static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path) +static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path) { unsigned long long paddr; int ret; -- Gitee From d1bf2c9a460822d13ead316712f61963fd2a8280 Mon Sep 17 00:00:00 2001 From: luckky Date: Mon, 4 Nov 2024 20:18:05 +0800 Subject: [PATCH 20/24] fix write file return code bug Set the return code 0 to -EINVAL to unify the processing of return code. 
--- .../hbm_online_repair/non-standard-hbm-repair.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c index b8dde7a..97cb9a7 100644 --- a/src/c/hbm_online_repair/non-standard-hbm-repair.c +++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c @@ -112,7 +112,7 @@ static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned info_struct->row_id = fault_addr & FAULT_ADDR_ROW_ID_MASK; fault_addr >>= FAULT_ADDR_ROW_ID_LEN; info_struct->column_id = fault_addr & FAULT_ADDR_COLUMN_ID_MASK; - fault_addr >>= FAULT_ADDR_CHANNEL_ID_LEN; + fault_addr >>= FAULT_ADDR_COLUMN_ID_LEN; info_struct->error_type = fault_addr & FAULT_ADDR_ERROR_TYPE_MASK; fault_addr >>= FAULT_ADDR_ERROR_TYPE_LEN; info_struct->repair_type = fault_addr & FAULT_ADDR_REPAIR_TYPE_MASK; @@ -371,7 +371,12 @@ static int write_file(char *path, const char *name, unsigned long long value) fname, value, strerror(errno)); close(fd); - return ret > 0 ? 0 : -errno; + if (ret == 0) { + ret = -EINVAL; + } else if (ret < 0) { + ret = -errno; + } + return ret; } static int get_hardware_corrupted_size() @@ -560,7 +565,7 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err) static uint8_t hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) { int ret; - if (repair_ret <= 0) { + if (repair_ret < 0) { log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? "ACLS" : "SPPR", paddr); /* not much we can do about errors here */ (void)write_file("/sys/kernel/page_eject", "remove_page", paddr); @@ -594,8 +599,7 @@ static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *pa ret = write_file(path, is_acls ? 
"acls_query" : "sppr_query", paddr); - /* Only positive num means the error is supported to repair */ - if (ret <= 0) { + if (ret < 0) { if (ret != -ENXIO) { notice_BMC(err, get_repair_failed_result_code(ret)); log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); @@ -624,8 +628,7 @@ static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *pa all_online_success = false; } } - /* The ret is from the acls/sppr repair, and only positive num means the error is repaired successfully */ - if (ret <= 0) { + if (ret < 0) { notice_BMC(err, get_repair_failed_result_code(ret)); return ret; } else if (all_online_success) { -- Gitee From 368aeb2b7442800d7c98c1e6b615f56c4f005cff Mon Sep 17 00:00:00 2001 From: luckky Date: Tue, 5 Nov 2024 18:53:49 +0800 Subject: [PATCH 21/24] update the commit of the log level and format of syssentry --- config/inspect.conf | 5 +++- config/tasks/hbm_online_repair.mod | 2 +- src/c/hbm_online_repair/hbm_online_repair.c | 8 +++--- src/python/syssentry/sentry_config.py | 28 +++++++++++++++++++++ src/python/syssentry/syssentry.py | 16 +++++++----- 5 files changed, 47 insertions(+), 12 deletions(-) diff --git a/config/inspect.conf b/config/inspect.conf index 071cca1..f451d9e 100644 --- a/config/inspect.conf +++ b/config/inspect.conf @@ -1,2 +1,5 @@ [inspect] -Interval=3 \ No newline at end of file +Interval=3 + +[log] +level=info diff --git a/config/tasks/hbm_online_repair.mod b/config/tasks/hbm_online_repair.mod index 77dd73e..4dcef43 100644 --- a/config/tasks/hbm_online_repair.mod +++ b/config/tasks/hbm_online_repair.mod @@ -3,7 +3,7 @@ enabled=yes task_start=/usr/bin/hbm_online_repair task_stop=kill $pid type=period -interval=180 +interval=10 onstart=yes env_file=/etc/sysconfig/hbm_online_repair.env conflict=up \ No newline at end of file diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c index b3b2742..943f201 100644 --- 
a/src/c/hbm_online_repair/hbm_online_repair.c +++ b/src/c/hbm_online_repair/hbm_online_repair.c @@ -9,7 +9,7 @@ #include "non-standard-hbm-repair.h" #define DEFAULT_LOG_LEVEL LOG_INFO -#define DEFAULT_PAGE_ISOLATION_THRESHOLD 128 +#define DEFAULT_PAGE_ISOLATION_THRESHOLD 3355443 int global_level_setting; int page_isolation_threshold; @@ -44,7 +44,7 @@ int execute_command(const char *command) } fgets(buffer, sizeof(buffer), fp); - log(LOG_DEBUG, "output of command is: %s\n", buffer); + log(LOG_DEBUG, "output of command %s is: %s\n", command, buffer); ret = pclose(fp); if (ret < 0) { @@ -53,12 +53,12 @@ int execute_command(const char *command) } if (!WIFEXITED(ret)) { - log(LOG_ERROR, "command did not terminate normally\n"); + log(LOG_ERROR, "command %s did not terminate normally\n", command); return -1; } ret = WEXITSTATUS(ret); - log(LOG_DEBUG, "command exited with status: %d\n", ret); + log(LOG_DEBUG, "command %s exited with status: %d\n", command, ret); return ret; } diff --git a/src/python/syssentry/sentry_config.py b/src/python/syssentry/sentry_config.py index a0e7b79..1169887 100644 --- a/src/python/syssentry/sentry_config.py +++ b/src/python/syssentry/sentry_config.py @@ -21,6 +21,34 @@ import sys DEFAULT_INSPECT_DELAY = 3 INSPECT_CONF_PATH = "/etc/sysSentry/inspect.conf" +CONF_LOG = 'log' +CONF_LOG_LEVEL = 'level' +LogLevel = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL +} + + +def get_log_level(filename=INSPECT_CONF_PATH): + if not os.path.exists(filename): + return logging.INFO + + try: + config = configparser.ConfigParser() + config.read(filename) + if not config.has_option(CONF_LOG, CONF_LOG_LEVEL): + return logging.INFO + log_level = config.get(CONF_LOG, CONF_LOG_LEVEL) + + if log_level.lower() in LogLevel: + return LogLevel.get(log_level.lower()) + return logging.INFO + except configparser.Error: + return logging.INFO + class SentryConfig: """ diff --git 
a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py index debff4e..0956e1e 100644 --- a/src/python/syssentry/syssentry.py +++ b/src/python/syssentry/syssentry.py @@ -23,7 +23,7 @@ import fcntl import select -from .sentry_config import SentryConfig +from .sentry_config import SentryConfig, get_log_level from .task_map import TasksMap from .global_values import SENTRY_RUN_DIR, CTL_SOCKET_PATH, SENTRY_RUN_DIR_PERM @@ -112,15 +112,16 @@ def msg_data_process(msg_data): cmd_type = data_struct['type'] if cmd_type not in type_func and cmd_type not in type_func_void: - logging.error("recv invaild cmd type: %s", cmd_type) - return "Invaild cmd type" + logging.error("recv invalid cmd type: %s", cmd_type) + return "Invalid cmd type" cmd_param = data_struct['data'] - logging.debug("msg_data_process cmd_type:%s cmd_param:%s", cmd_type, cmd_param) + logging.debug("msg_data_process cmd_type:%s cmd_param:%s", cmd_type, str(cmd_param)) if cmd_type in type_func: ret, res_data = type_func[cmd_type](cmd_param) else: ret, res_data = type_func_void[cmd_type]() + logging.debug("msg_data_process res_data:%s",str(res_data)) res_msg_struct = {"ret": ret, "data": res_data} res_msg = json.dumps(res_msg_struct) @@ -414,7 +415,7 @@ def server_result_recv(server_socket: socket.socket): try: client_socket.send(process_plugins_result.encode()) except OSError: - logging.warning("server send reponse to plugins failed") + logging.warning("server send response to plugins failed") finally: client_socket.close() return @@ -621,7 +622,10 @@ def main(): os.mkdir(SENTRY_RUN_DIR) os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM) - logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=logging.INFO) + log_level = get_log_level() + log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" + + logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=log_level, format=log_format) os.chmod(SYSSENTRY_LOG_FILE, 0o600) if not chk_and_set_pidfile(): -- Gitee From 
85249f8022ac49c0bf89d79294c39664e40298b0 Mon Sep 17 00:00:00 2001 From: luckky Date: Wed, 6 Nov 2024 11:42:53 +0800 Subject: [PATCH 22/24] add boundary check for settings 1. add two boundary checks for page_isolation_threshold and hbm_online_repair_log_level (0 <= page_isolation_threshold) (0(LOG_DEBUG) <= hbm_online_repair_log_level <= 3(LOG_ERROR)) --- src/c/hbm_online_repair/hbm_online_repair.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c index 943f201..00c9c0b 100644 --- a/src/c/hbm_online_repair/hbm_online_repair.c +++ b/src/c/hbm_online_repair/hbm_online_repair.c @@ -89,6 +89,9 @@ void hbm_param_init(void) if (ret < 0) { global_level_setting = DEFAULT_LOG_LEVEL; log(LOG_WARNING, "Get log level from config failed, set the default value %d\n", DEFAULT_LOG_LEVEL); + } else if (global_level_setting < LOG_DEBUG || global_level_setting > LOG_ERROR) { + log(LOG_WARNING, "The log level value %d in config is out of range, set the default value %d\n", global_level_setting, DEFAULT_LOG_LEVEL); + global_level_setting = DEFAULT_LOG_LEVEL; } else { log(LOG_INFO, "log level: %d\n", global_level_setting); } @@ -98,6 +101,9 @@ void hbm_param_init(void) if (ret < 0) { page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD; log(LOG_WARNING, "Get page_isolation_threshold from config failed, set the default value %d\n", DEFAULT_PAGE_ISOLATION_THRESHOLD); + } else if (page_isolation_threshold < 0) { + log(LOG_WARNING, "The page_isolation_threshold %d in config is out of range, set the default value %d\n", page_isolation_threshold, DEFAULT_PAGE_ISOLATION_THRESHOLD); + page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD; } else { log(LOG_INFO, "page_isolation_threshold: %d\n", page_isolation_threshold); } -- Gitee From 84170e03f6e8bf301e4883e25a7432a33a6a44ca Mon Sep 17 00:00:00 2001 From: caixiaomeng Date: Thu, 7 Nov 2024 11:44:26 +0800 Subject: [PATCH 23/24] fix xalarm 
not reject alarm msg exceeds max length --- src/libso/xalarm/register_xalarm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c index 152c078..9eeed74 100644 --- a/src/libso/xalarm/register_xalarm.c +++ b/src/libso/xalarm/register_xalarm.c @@ -363,6 +363,11 @@ int xalarm_Report(unsigned short usAlarmId, unsigned char ucAlarmLevel, return -1; } + if (pucParas == NULL || (int)strlen(pucParas) > MAX_PARAS_LEN) { + fprintf(stderr, "%s: alarm info invalid\n", __func__); + return -1; + } + if (memset(&info, 0, sizeof(struct alarm_info)) == NULL) { fprintf(stderr, "%s: memset info failed, ret: %d\n", __func__, ret); return -1; -- Gitee From 756aa5d21eeb6aa517b8aaf497bc0286a854fbf7 Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Fri, 13 Dec 2024 11:20:55 +0800 Subject: [PATCH 24/24] change status of period task and sort mod file --- src/python/syssentry/cron_process.py | 1 + src/python/syssentry/load_mods.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/python/syssentry/cron_process.py b/src/python/syssentry/cron_process.py index 50780b3..5543d67 100644 --- a/src/python/syssentry/cron_process.py +++ b/src/python/syssentry/cron_process.py @@ -144,6 +144,7 @@ def period_tasks_handle(): if not task.onstart: logging.debug("period onstart not enabled, task: %s", task.name) + task.runtime_status = EXITED_STATUS continue if task.runtime_status == WAITING_STATUS and \ diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py index 48d7e66..5be5540 100644 --- a/src/python/syssentry/load_mods.py +++ b/src/python/syssentry/load_mods.py @@ -224,6 +224,7 @@ def load_tasks(): return "failed", "" mod_files = os.listdir(TASKS_STORAGE_PATH) + mod_files.sort() for mod_file in mod_files: logging.debug("find mod, path is %s", mod_file) if not mod_file.endswith(MOD_FILE_SUFFIX): -- Gitee