From 3b937362779cf67aba61c475a23174443bd0ee90 Mon Sep 17 00:00:00 2001 From: shixuantong Date: Tue, 11 Jun 2024 16:47:46 +0800 Subject: [PATCH 01/76] fix version in setup.py --- src/python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/setup.py b/src/python/setup.py index 21dbe9f..f96a96e 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -17,7 +17,7 @@ from setuptools import setup, find_packages setup( name="syssentry", - version="1.0.1", + version="1.0.2", description="System inspection framework tool set", packages=find_packages(), include_package_data=True, -- Gitee From 73ecab48704d1ece815d3bdba91358fbf157cf74 Mon Sep 17 00:00:00 2001 From: shixuantong Date: Wed, 24 Jul 2024 16:17:54 +0800 Subject: [PATCH 02/76] Fix the problem that function cpu_report_result() is called more than once when task is running, user to exec "sentryctl stop cpu_sentry", cpu_report_result() will be called twice. This will cause the log to be printed twice --- src/python/syssentry/cpu_sentry.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py index 7e77654..3c4d58d 100644 --- a/src/python/syssentry/cpu_sentry.py +++ b/src/python/syssentry/cpu_sentry.py @@ -133,6 +133,7 @@ class CpuSentry: result_level = self.send_result.get("result", ResultLevel.FAIL) report_result(task_name, result_level, details) + self.init_send_result() def kill_process(signum, _f, cpu_sentry_obj): """kill process by 'pkill -9'""" @@ -179,6 +180,6 @@ def main(): cpu_sentry_task.send_result["result"] = ResultLevel.FAIL cpu_sentry_task.send_result["details"]["code"] = 1004 cpu_sentry_task.send_result["details"]["msg"] = "run cmd [%s] raise Error" % cpu_sentry_task_cmd - finally: cpu_sentry_task.cpu_report_result() - cpu_sentry_task.init_send_result() + else: + cpu_sentry_task.cpu_report_result() -- Gitee From 8c20697608fab68c7e8779a5409f8745aafa3575 Mon Sep 17 00:00:00 2001 From: shixuantong Date: Wed, 24 Jul 2024 17:53:58 +0800 Subject: [PATCH 03/76] fix error handling --- src/python/syssentry/cpu_sentry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py index 3c4d58d..d0bafa8 100644 --- a/src/python/syssentry/cpu_sentry.py +++ b/src/python/syssentry/cpu_sentry.py @@ -87,7 +87,7 @@ class CpuSentry: } def handle_cpu_output(self, stdout: str): - if "" in stdout: + if "ERROR" in stdout: self.send_result["result"] = ResultLevel.FAIL self.send_result["details"]["code"] = 1004 self.send_result["details"]["msg"] = stdout.split("\n")[0] -- Gitee From bb5a87b4b199214e9d01f420d5c687458207ad7b Mon Sep 17 00:00:00 2001 From: shixuantong Date: Fri, 26 Jul 2024 15:59:42 +0800 Subject: [PATCH 04/76] fix result when process output is None --- src/python/syssentry/cpu_sentry.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py index d0bafa8..9287e2f 100644 --- a/src/python/syssentry/cpu_sentry.py +++ b/src/python/syssentry/cpu_sentry.py @@ -87,11 +87,19 @@ class CpuSentry: } def handle_cpu_output(self, stdout: str): + if not stdout: + logging.error("%s process output is None, it may be killed!", LOW_LEVEL_INSPECT_CMD) + self.send_result["result"] = ResultLevel.FAIL + self.send_result["details"]["code"] = 1005 + self.send_result["details"]["msg"] = "cpu_sentry task is killed!" 
+ return + if "ERROR" in stdout: self.send_result["result"] = ResultLevel.FAIL self.send_result["details"]["code"] = 1004 self.send_result["details"]["msg"] = stdout.split("\n")[0] return + out_split = stdout.split("\n") isolated_cores_number = 0 found_fault_cores_list = [] -- Gitee From 576c863b623f39880c1dd7f247bb22d6716973cb Mon Sep 17 00:00:00 2001 From: jwolf <523083921@qq.com> Date: Mon, 22 Jul 2024 14:58:27 +0800 Subject: [PATCH 05/76] cpu_utility and cpu_patrol musht be an integer --- src/c/catcli/catlib/cli_param_checker.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c index a1aa636..e400428 100644 --- a/src/c/catcli/catlib/cli_param_checker.c +++ b/src/c/catcli/catlib/cli_param_checker.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include "cli_common.h" @@ -13,7 +14,7 @@ void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs) { long cpu_utility = strtol(getopt_optarg, NULL, DECIMAL); - if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX) { + if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) { strncpy(errs->patrol_module_err, "\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN); } @@ -68,7 +69,7 @@ void checkset_cpulist(char *getopt_optarg, catcli_request_body *p_request_body, void checkset_patrol_time(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs) { long second = strtol(getopt_optarg, NULL, DECIMAL); - if (second <= 0 || second > INT_MAX) { + if (second <= 0 || second > INT_MAX || strchr(getopt_optarg, '.') != NULL) { strncpy(errs->patrol_time_err, "\"patrol_second\" must be a number in the range of (0,INT_MAX] ,correct \"-t, --patrol_second\"\n", MAX_ERR_LEN); -- Gitee From 5954c7f95aa4d3589a16cfad6803a52705facc50 Mon Sep 17 00:00:00 2001 From: jwolf <523083921@qq.com> Date: Fri, 30 Aug 2024 14:30:46 +0800 Subject: [PATCH 06/76] must be integer --- src/c/catcli/catlib/cli_param_checker.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c index e400428..5b38402 100644 --- a/src/c/catcli/catlib/cli_param_checker.c +++ b/src/c/catcli/catlib/cli_param_checker.c @@ -17,8 +17,9 @@ void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_r if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) { strncpy(errs->patrol_module_err, "\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN); + } else { + p_request_body->cpu_utility = (int)cpu_utility; } - p_request_body->cpu_utility = (int)cpu_utility; } void checkset_cpulist(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs) @@ -73,8 +74,9 @@ void checkset_patrol_time(char *getopt_optarg, catcli_request_body *p_request_bo strncpy(errs->patrol_time_err, "\"patrol_second\" must be a number in the range of (0,INT_MAX] ,correct \"-t, --patrol_second\"\n", MAX_ERR_LEN); + } else { + p_request_body->patrol_second = (int)second; } - p_request_body->patrol_second = (int)second; } void checkset_patrol_type(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs) -- Gitee From 42e141ccc468671099eaf2152b9d00bf41c33e27 
Mon Sep 17 00:00:00 2001 From: jwolf <523083921@qq.com> Date: Fri, 30 Aug 2024 16:59:56 +0800 Subject: [PATCH 07/76] param must be integer --- src/c/catcli/catlib/cli_param_checker.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c index 5b38402..71edf17 100644 --- a/src/c/catcli/catlib/cli_param_checker.c +++ b/src/c/catcli/catlib/cli_param_checker.c @@ -17,6 +17,7 @@ void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_r if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) { strncpy(errs->patrol_module_err, "\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN); + p_request_body->cpu_utility = 0; } else { p_request_body->cpu_utility = (int)cpu_utility; } -- Gitee From b28bbd473ded5fa2a67459cf92c80f6077848699 Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Fri, 30 Aug 2024 19:58:41 +0800 Subject: [PATCH 08/76] add deleted code to plugin rasdaemon --- src/python/syssentry/syssentry.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py index 32b81e3..3d5cb8d 100644 --- a/src/python/syssentry/syssentry.py +++ b/src/python/syssentry/syssentry.py @@ -462,6 +462,14 @@ def main_loop(): epoll_fd.register(cpu_alarm_fd.fileno(), select.EPOLLIN) logging.debug("start main loop") + # onstart_tasks_handle() + for task_type in TasksMap.tasks_dict: + for task_name in TasksMap.tasks_dict.get(task_type): + task = TasksMap.tasks_dict.get(task_type).get(task_name) + if not task: + continue + task.onstart_handle() + while True: try: events_list = epoll_fd.poll(SERVER_EPOLL_TIMEOUT) -- Gitee From cd5d20f196b4a4e0767b901566ac5eb1d27aa180 Mon Sep 17 00:00:00 2001 From: shixuantong Date: Sun, 11 Aug 2024 18:36:23 +0800 Subject: [PATCH 09/76] Remove ANSI escape sequences --- src/python/syssentry/cpu_sentry.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py index 9287e2f..99af127 100644 --- a/src/python/syssentry/cpu_sentry.py +++ b/src/python/syssentry/cpu_sentry.py @@ -97,7 +97,14 @@ class CpuSentry: if "ERROR" in stdout: self.send_result["result"] = ResultLevel.FAIL self.send_result["details"]["code"] = 1004 - self.send_result["details"]["msg"] = stdout.split("\n")[0] + + # Remove ANSI escape sequences + error_info = stdout.split("\n")[0] + if error_info.startswith("\u001b"): + ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' + error_info = re.sub(ansi_escape, '', error_info) + + self.send_result["details"]["msg"] = error_info return out_split = stdout.split("\n") -- Gitee From f5591b20bd0d30931111efeb66424d70c24ec527 Mon Sep 17 00:00:00 2001 From: gaoruoshu Date: Wed, 14 Aug 2024 21:10:20 +0800 Subject: [PATCH 10/76] split cpu_sentry and syssentry --- src/python/syssentry/cpu_alarm.py | 42 +++++++++++++++++++++++++ src/python/syssentry/syssentry.py | 52 ++++++------------------------- 2 files changed, 52 insertions(+), 42 deletions(-) diff --git a/src/python/syssentry/cpu_alarm.py b/src/python/syssentry/cpu_alarm.py index d972c42..0b1642b 100644 --- a/src/python/syssentry/cpu_alarm.py +++ b/src/python/syssentry/cpu_alarm.py @@ -1,6 +1,7 @@ import re import math import logging +import socket from enum import Enum from .utils import execute_command @@ -15,6 +16,12 @@ BINARY = 2 MIN_DATA_LEN = 0 MAX_DATA_LEN = 999 +PARAM_REP_LEN 
= 3 +PARAM_TYPE_LEN = 1 +PARAM_MODULE_LEN = 1 +PARAM_TRANS_TO_LEN = 2 +PARAM_DATA_LEN = 3 + class Type(Enum): CE = 0x00 @@ -207,3 +214,38 @@ def check_fixed_param(data, expect): raise ValueError("expected str param is not valid") return data raise NotImplementedError("unexpected param type") + + +def cpu_alarm_recv(server_socket: socket.socket): + try: + client_socket, _ = server_socket.accept() + logging.debug("cpu alarm fd listen ok") + + data = client_socket.recv(PARAM_REP_LEN) + check_fixed_param(data, "REP") + + data = client_socket.recv(PARAM_TYPE_LEN) + _type = check_fixed_param(data, Type) + + data = client_socket.recv(PARAM_MODULE_LEN) + module = check_fixed_param(data, Module) + + data = client_socket.recv(PARAM_TRANS_TO_LEN) + trans_to = check_fixed_param(data, TransTo) + + data = client_socket.recv(PARAM_DATA_LEN) + data_len = check_fixed_param(data, (MIN_DATA_LEN, MAX_DATA_LEN)) + + data = client_socket.recv(data_len) + + command, event_type, socket_id, core_id = parser_cpu_alarm_info(data) + except socket.error: + logging.error("socket error") + return + except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError): + logging.error("server recv cpu alarm msg failed!") + client_socket.close() + return + + upload_bmc(_type, module, command, event_type, socket_id, core_id) + diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py index 3d5cb8d..f93956e 100644 --- a/src/python/syssentry/syssentry.py +++ b/src/python/syssentry/syssentry.py @@ -36,8 +36,15 @@ from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create, from .result import RESULT_MSG_HEAD_LEN, RESULT_MSG_MAGIC_LEN, RESULT_MAGIC from .result import RESULT_LEVEL_ERR_MSG_DICT, ResultLevel from .utils import get_current_time_string -from .cpu_alarm import (upload_bmc, check_fixed_param, parser_cpu_alarm_info, - Type, Module, TransTo, MIN_DATA_LEN, MAX_DATA_LEN) + + +CPU_EXIST = True +try: + from .cpu_alarm import cpu_alarm_recv +except ImportError: + CPU_EXIST = False + logging.debug("Cannot find cpu sentry mod") + INSPECTOR = None @@ -76,45 +83,6 @@ PID_FILE_FLOCK = None RESULT_SOCKET_PATH = "/var/run/sysSentry/result.sock" CPU_ALARM_SOCKET_PATH = "/var/run/sysSentry/report.sock" -PARAM_REP_LEN = 3 -PARAM_TYPE_LEN = 1 -PARAM_MODULE_LEN = 1 -PARAM_TRANS_TO_LEN = 2 -PARAM_DATA_LEN = 3 - - -def cpu_alarm_recv(server_socket: socket.socket): - try: - client_socket, _ = server_socket.accept() - logging.debug("cpu alarm fd listen ok") - - data = client_socket.recv(PARAM_REP_LEN) - check_fixed_param(data, "REP") - - data = client_socket.recv(PARAM_TYPE_LEN) - _type = check_fixed_param(data, Type) - - data = client_socket.recv(PARAM_MODULE_LEN) - module = check_fixed_param(data, Module) - - data = client_socket.recv(PARAM_TRANS_TO_LEN) - trans_to = check_fixed_param(data, TransTo) - - data = client_socket.recv(PARAM_DATA_LEN) - data_len = check_fixed_param(data, (MIN_DATA_LEN, MAX_DATA_LEN)) - - data = client_socket.recv(data_len) - - command, event_type, socket_id, core_id = parser_cpu_alarm_info(data) - except socket.error: - logging.error("socket error") - return - except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError): - logging.error("server recv cpu alarm msg failed!") - client_socket.close() - return - - upload_bmc(_type, module, command, event_type, socket_id, core_id) def msg_data_process(msg_data): @@ -480,7 +448,7 @@ def main_loop(): server_result_recv(server_result_fd) elif event_fd == heartbeat_fd.fileno(): heartbeat_recv(heartbeat_fd) - elif 
event_fd == cpu_alarm_fd.fileno(): + elif CPU_EXIST and event_fd == cpu_alarm_fd.fileno(): cpu_alarm_recv(cpu_alarm_fd) else: continue -- Gitee From 799c1a439ebc3fc8d4811786379b2a05098cc059 Mon Sep 17 00:00:00 2001 From: shixuantong Date: Wed, 11 Sep 2024 10:23:41 +0800 Subject: [PATCH 11/76] fix configparser.InterpolationSyntaxError --- src/python/syssentry/sentry_config.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/python/syssentry/sentry_config.py b/src/python/syssentry/sentry_config.py index 01f3df8..a0e7b79 100644 --- a/src/python/syssentry/sentry_config.py +++ b/src/python/syssentry/sentry_config.py @@ -103,14 +103,18 @@ class CpuPluginsParamsConfig: """read config file""" config_param_section_args = {} if os.path.exists(self.config_file): - self.config.read(self.config_file) try: + self.config.read(self.config_file) config_param_section_args = dict(self.config[self.param_section_name]) - except (ValueError, KeyError): + except (ValueError, KeyError, configparser.InterpolationSyntaxError): config_param_section_args = {} + logging.error("Failed to parse cpu_sentry.ini!") return config_param_section_args def join_cpu_start_cmd(self, cpu_param_dict: dict) -> str: + if not cpu_param_dict: + return "" + cpu_list = cpu_param_dict.get("cpu_list", "default") if cpu_list == "default": cpu_list = CpuPluginsParamsConfig.get_cpu_info() -- Gitee From ecdf8804ccb08ac97fba1566530a7bda78b9b39c Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Sat, 14 Sep 2024 09:28:00 +0800 Subject: [PATCH 12/76] fix syssentry fails to be started when cpu_sentry is not installed --- src/python/syssentry/syssentry.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py index f93956e..776971f 100644 --- a/src/python/syssentry/syssentry.py +++ b/src/python/syssentry/syssentry.py @@ -43,7 +43,6 @@ try: from .cpu_alarm import cpu_alarm_recv except ImportError: CPU_EXIST = False - logging.debug("Cannot find cpu sentry mod") INSPECTOR = None @@ -563,20 +562,21 @@ def main(): if not os.path.exists(SENTRY_RUN_DIR): os.mkdir(SENTRY_RUN_DIR) os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM) - if not chk_and_set_pidfile(): - logging.error("get pid file lock failed, exist") - sys.exit(17) logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=logging.INFO) os.chmod(SYSSENTRY_LOG_FILE, 0o600) + if not chk_and_set_pidfile(): + logging.error("get pid file lock failed, exist") + sys.exit(17) + try: signal.signal(signal.SIGINT, sig_handler) signal.signal(signal.SIGTERM, sig_handler) signal.signal(signal.SIGHUP, sig_handler) signal.signal(signal.SIGCHLD, sigchld_handler) - logging.debug("finish main parse_args") + logging.info("finish main parse_args") _ = SentryConfig.init_param() TasksMap.init_task_map() @@ -587,3 +587,4 @@ def main(): logging.error('%s', traceback.format_exc()) finally: release_pidfile() + -- Gitee From 666bd7964477f1a9eb516a818bca8e49a9a2a1f4 Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Thu, 12 Sep 2024 11:29:01 +0800 Subject: [PATCH 13/76] add collect module to sysSentry --- config/collector.conf | 7 + service/sentryCollector.service | 12 + service/sysSentry.service | 2 +- src/python/sentryCollector/__init__.py | 0 src/python/sentryCollector/__main__.py | 17 ++ src/python/sentryCollector/collect_config.py | 118 ++++++++ src/python/sentryCollector/collect_io.py | 239 ++++++++++++++++ src/python/sentryCollector/collect_plugin.py | 276 ++++++++++++++++++ 
src/python/sentryCollector/collect_server.py | 285 +++++++++++++++++++ src/python/sentryCollector/collectd.py | 99 +++++++ src/python/setup.py | 4 +- 11 files changed, 1057 insertions(+), 2 deletions(-) create mode 100644 config/collector.conf create mode 100644 service/sentryCollector.service create mode 100644 src/python/sentryCollector/__init__.py create mode 100644 src/python/sentryCollector/__main__.py create mode 100644 src/python/sentryCollector/collect_config.py create mode 100644 src/python/sentryCollector/collect_io.py create mode 100644 src/python/sentryCollector/collect_plugin.py create mode 100644 src/python/sentryCollector/collect_server.py create mode 100644 src/python/sentryCollector/collectd.py diff --git a/config/collector.conf b/config/collector.conf new file mode 100644 index 0000000..52e91b1 --- /dev/null +++ b/config/collector.conf @@ -0,0 +1,7 @@ +[common] +modules=io + +[io] +period_time=1 +max_save=10 +disk=default \ No newline at end of file diff --git a/service/sentryCollector.service b/service/sentryCollector.service new file mode 100644 index 0000000..4ee07d5 --- /dev/null +++ b/service/sentryCollector.service @@ -0,0 +1,12 @@ +[Unit] +Description = Collection module added for sysSentry and kernel lock-free collection + +[Service] +ExecStart=/usr/bin/python3 /usr/bin/sentryCollector +ExecStop=/bin/kill $MAINPID +KillMode=process +Restart=on-failure +RestartSec=10s + +[Install] +WantedBy = multi-user.target diff --git a/service/sysSentry.service b/service/sysSentry.service index 4d85a6c..1d8338f 100644 --- a/service/sysSentry.service +++ b/service/sysSentry.service @@ -2,7 +2,7 @@ Description=EulerOS System Inspection Frame [Service] -ExecStart=/usr/bin/syssentry +ExecStart=/usr/bin/python3 /usr/bin/syssentry ExecStop=/bin/kill $MAINPID KillMode=process Restart=on-failure diff --git a/src/python/sentryCollector/__init__.py b/src/python/sentryCollector/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/python/sentryCollector/__main__.py b/src/python/sentryCollector/__main__.py new file mode 100644 index 0000000..9c2ae50 --- /dev/null +++ b/src/python/sentryCollector/__main__.py @@ -0,0 +1,17 @@ +# coding: utf-8 +# Copyright (c) 2023 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +main +""" +from collectd import collectd + +collectd.main() diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py new file mode 100644 index 0000000..b6cc75c --- /dev/null +++ b/src/python/sentryCollector/collect_config.py @@ -0,0 +1,118 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. 
+# See the Mulan PSL v2 for more details. + +""" +Read and save collector.conf value. +""" +import configparser +import logging +import os +import re + + +COLLECT_CONF_PATH = "/etc/sysSentry/collector.conf" + +CONF_COMMON = 'common' +CONF_MODULES = 'modules' + +# io +CONF_IO = 'io' +CONF_IO_PERIOD_TIME = 'period_time' +CONF_IO_MAX_SAVE = 'max_save' +CONF_IO_DISK = 'disk' +CONF_IO_PERIOD_TIME_DEFAULT = 1 +CONF_IO_MAX_SAVE_DEFAULT = 10 +CONF_IO_DISK_DEFAULT = "default" + +class CollectConfig: + def __init__(self, filename=COLLECT_CONF_PATH): + + self.filename = filename + self.modules = [] + self.module_count = 0 + self.load_config() + + def load_config(self): + if not os.path.exists(self.filename): + logging.error("%s is not exists", self.filename) + return + + try: + self.config = configparser.ConfigParser() + self.config.read(self.filename) + except configparser.Error: + logging.error("collectd configure file read failed") + return + + try: + common_config = self.config[CONF_COMMON] + modules_str = common_config[CONF_MODULES] + # remove space + modules_list = modules_str.replace(" ", "").split(',') + except KeyError as e: + logging.error("read config data failed, %s", e) + return + + pattern = r'^[a-zA-Z0-9-_]+$' + for module_name in modules_list: + if not re.match(pattern, module_name): + logging.warning("module_name: %s is invalid", module_name) + continue + if not self.config.has_section(module_name): + logging.warning("module_name: %s config is incorrect", module_name) + continue + self.modules.append(module_name) + + def load_module_config(self, module_name): + module_name = module_name.strip().lower() + if module_name in self.modules and self.config.has_section(module_name): + return {key.lower(): value for key, value in self.config[module_name].items()} + else: + raise ValueError(f"Module '{module_name}' not found in configuration") + + def get_io_config(self): + result_io_config = {} + io_map_value = self.load_module_config(CONF_IO) + # period_time + period_time = io_map_value.get(CONF_IO_PERIOD_TIME) + if period_time and period_time.isdigit() and int(period_time) >= 1 and int(period_time) <= 300: + result_io_config[CONF_IO_PERIOD_TIME] = int(period_time) + else: + logging.warning("module_name = %s section, field = %s is incorrect, use default %d", + CONF_IO, CONF_IO_PERIOD_TIME, CONF_IO_PERIOD_TIME_DEFAULT) + result_io_config[CONF_IO_PERIOD_TIME] = CONF_IO_PERIOD_TIME_DEFAULT + # max_save + max_save = io_map_value.get(CONF_IO_MAX_SAVE) + if max_save and max_save.isdigit() and int(max_save) >= 1 and int(max_save) <= 300: + result_io_config[CONF_IO_MAX_SAVE] = int(max_save) + else: + logging.warning("module_name = %s section, field = %s is incorrect, use default %d", + CONF_IO, CONF_IO_MAX_SAVE, CONF_IO_MAX_SAVE_DEFAULT) + result_io_config[CONF_IO_MAX_SAVE] = CONF_IO_MAX_SAVE_DEFAULT + # disk + disk = io_map_value.get(CONF_IO_DISK) + if disk: + disk_str = disk.replace(" ", "") + pattern = r'^[a-zA-Z0-9-_,]+$' + if not re.match(pattern, disk_str): + logging.warning("module_name = %s section, field = %s is incorrect, use default %s", + CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT) + disk_str = CONF_IO_DISK_DEFAULT + result_io_config[CONF_IO_DISK] = disk_str + else: + logging.warning("module_name = %s section, field = %s is incorrect, use default %s", + CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT) + result_io_config[CONF_IO_DISK] = CONF_IO_DISK_DEFAULT + logging.info("config get_io_config: %s", result_io_config) + return result_io_config + + def get_common_config(self): + return 
{key.lower(): value for key, value in self.config['common'].items()} diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py new file mode 100644 index 0000000..b826dc4 --- /dev/null +++ b/src/python/sentryCollector/collect_io.py @@ -0,0 +1,239 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +collect module +""" +import os +import time +import logging +import threading + +from .collect_config import CollectConfig + +Io_Category = ["read", "write", "flush", "discard"] +IO_GLOBAL_DATA = {} +IO_CONFIG_DATA = [] + +class IoStatus(): + TOTAL = 0 + FINISH = 1 + LATENCY = 2 + +class CollectIo(): + + def __init__(self, module_config): + + io_config = module_config.get_io_config() + + self.period_time = io_config['period_time'] + self.max_save = io_config['max_save'] + disk_str = io_config['disk'] + + self.disk_map_stage = {} + self.window_value = {} + + self.loop_all = False + + if disk_str == "default": + self.loop_all = True + else: + self.disk_list = disk_str.strip().split(',') + + self.stop_event = threading.Event() + + IO_CONFIG_DATA.append(self.period_time) + IO_CONFIG_DATA.append(self.max_save) + + def get_blk_io_hierarchy(self, disk_name, stage_list): + stats_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/stats'.format(disk_name) + try: + with open(stats_file, 'r') as file: + lines = file.read() + except FileNotFoundError: + logging.error("The file %s does not exist", stats_file) + return -1 + except Exception as e: + logging.error("An error occurred3: %s", e) + return -1 + + curr_value = lines.strip().split('\n') + + for stage_val in curr_value: + stage = stage_val.split(' ')[0] + if (len(self.window_value[disk_name][stage])) >= 2: + self.window_value[disk_name][stage].pop(0) + + curr_stage_value = stage_val.split(' ')[1:-1] + self.window_value[disk_name][stage].append(curr_stage_value) + return 0 + + def append_period_lat(self, disk_name, stage_list): + for stage in stage_list: + if len(self.window_value[disk_name][stage]) < 2: + return + curr_stage_value = self.window_value[disk_name][stage][-1] + last_stage_value = self.window_value[disk_name][stage][-2] + + for index in range(len(Io_Category)): + # read=0, write=1, flush=2, discard=3 + if (len(IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]])) >= self.max_save: + IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]].pop() + + curr_lat = self.get_latency_value(curr_stage_value, last_stage_value, index) + curr_iops = self.get_iops(curr_stage_value, last_stage_value, index) + curr_io_length = self.get_io_length(curr_stage_value, last_stage_value, index) + curr_io_dump = self.get_io_dump(disk_name, stage, index) + + IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]].insert(0, [curr_lat, curr_io_dump, curr_io_length, curr_iops]) + + def get_iops(self, curr_stage_value, last_stage_value, category): + try: + finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) + except ValueError as e: + logging.error("get_iops convert to int 
failed, %s", e) + return 0 + value = finish / self.period_time + if value.is_integer(): + return int(value) + else: + return round(value, 1) + + def get_latency_value(self, curr_stage_value, last_stage_value, category): + try: + finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) + lat_time = (int(curr_stage_value[category * 3 + IoStatus.LATENCY]) - int(last_stage_value[category * 3 + IoStatus.LATENCY])) + except ValueError as e: + logging.error("get_latency_value convert to int failed, %s", e) + return 0 + if finish <= 0 or lat_time <= 0: + return 0 + value = lat_time / finish / 1000 / 1000 + if value.is_integer(): + return int(value) + else: + return round(value, 1) + + def get_io_length(self, curr_stage_value, last_stage_value, category): + try: + finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) + except ValueError as e: + logging.error("get_io_length convert to int failed, %s", e) + return 0 + value = finish / self.period_time / 1000 / 1000 + if value.is_integer(): + return int(value) + else: + return round(value, 1) + + def get_io_dump(self, disk_name, stage, category): + io_dump_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/io_dump'.format(disk_name, stage) + count = 0 + try: + with open(io_dump_file, 'r') as file: + for line in file: + count += line.count('.op=' + Io_Category[category]) + except FileNotFoundError: + logging.error("The file %s does not exist.", io_dump_file) + return count + except Exception as e: + logging.error("An error occurred1: %s", e) + return count + return count + + def extract_first_column(self, file_path): + column_names = [] + try: + with open(file_path, 'r') as file: + for line in file: + parts = line.strip().split() + if parts: + column_names.append(parts[0]) + except FileNotFoundError: + logging.error("The file %s does not exist.", file_path) + except Exception as e: + logging.error("An error occurred2: %s", e) + return column_names + + def task_loop(self): + if self.stop_event.is_set(): + logging.info("collect io thread exit") + return + + for disk_name, stage_list in self.disk_map_stage.items(): + if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: + continue + self.append_period_lat(disk_name, stage_list) + + threading.Timer(self.period_time, self.task_loop).start() + + def main_loop(self): + logging.info("collect io thread start") + base_path = '/sys/kernel/debug/block' + for disk_name in os.listdir(base_path): + if not self.loop_all and disk_name not in self.disk_list: + continue + + disk_path = os.path.join(base_path, disk_name) + blk_io_hierarchy_path = os.path.join(disk_path, 'blk_io_hierarchy') + + if not os.path.exists(blk_io_hierarchy_path): + logging.error("no blk_io_hierarchy directory found in %s, skipping.", disk_name) + continue + + for file_name in os.listdir(blk_io_hierarchy_path): + file_path = os.path.join(blk_io_hierarchy_path, file_name) + + if file_name == 'stats': + stage_list = self.extract_first_column(file_path) + self.disk_map_stage[disk_name] = stage_list + self.window_value[disk_name] = {} + IO_GLOBAL_DATA[disk_name] = {} + + if len(self.disk_map_stage) == 0: + logging.warning("no disks meet the requirements. 
the thread exits") + return + + for disk_name, stage_list in self.disk_map_stage.items(): + for stage in stage_list: + self.window_value[disk_name][stage] = [] + IO_GLOBAL_DATA[disk_name][stage] = {} + for category in Io_Category: + IO_GLOBAL_DATA[disk_name][stage][category] = [] + + while True: + start_time = time.time() + + if self.stop_event.is_set(): + logging.info("collect io thread exit") + return + + for disk_name, stage_list in self.disk_map_stage.items(): + if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: + continue + self.append_period_lat(disk_name, stage_list) + + elapsed_time = time.time() - start_time + sleep_time = self.period_time - elapsed_time + if sleep_time < 0: + continue + while sleep_time > 1: + if self.stop_event.is_set(): + logging.info("collect io thread exit") + return + time.sleep(1) + sleep_time -= 1 + time.sleep(sleep_time) + + # set stop event, notify thread exit + def stop_thread(self): + logging.info("collect io thread is preparing to exit") + self.stop_event.set() diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py new file mode 100644 index 0000000..49ce0a8 --- /dev/null +++ b/src/python/sentryCollector/collect_plugin.py @@ -0,0 +1,276 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +collcet plugin +""" +import json +import socket +import logging +import re + +COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" + +# data length param +CLT_MSG_HEAD_LEN = 9 #3+2+4 +CLT_MSG_PRO_LEN = 2 +CLT_MSG_MAGIC_LEN = 3 +CLT_MSG_LEN_LEN = 4 + +CLT_MAGIC = "CLT" +RES_MAGIC = "RES" + +# disk limit +LIMIT_DISK_CHAR_LEN = 32 +LIMIT_DISK_LIST_LEN = 10 + +# stage limit +LIMIT_STAGE_CHAR_LEN = 20 +LIMIT_STAGE_LIST_LEN = 15 + +#iotype limit +LIMIT_IOTYPE_CHAR_LEN = 7 +LIMIT_IOTYPE_LIST_LEN = 4 + +#period limit +LIMIT_PERIOD_MIN_LEN = 1 +LIMIT_PERIOD_MAX_LEN = 300 + +# interface protocol +class ClientProtocol(): + IS_IOCOLLECT_VALID = 0 + GET_IO_DATA = 1 + PRO_END = 3 + +class ResultMessage(): + RESULT_SUCCEED = 0 + RESULT_UNKNOWN = 1 # unknown error + RESULT_NOT_PARAM = 2 # the parameter does not exist or the type does not match. + RESULT_INVALID_LENGTH = 3 # invalid parameter length. + RESULT_EXCEED_LIMIT = 4 # the parameter length exceeds the limit. 
+ RESULT_PARSE_FAILED = 5 # parse failed + RESULT_INVALID_CHAR = 6 # invalid char + +Result_Messages = { + ResultMessage.RESULT_SUCCEED: "Succeed", + ResultMessage.RESULT_UNKNOWN: "Unknown error", + ResultMessage.RESULT_NOT_PARAM: "The parameter does not exist or the type does not match", + ResultMessage.RESULT_INVALID_LENGTH: "Invalid parameter length", + ResultMessage.RESULT_EXCEED_LIMIT: "The parameter length exceeds the limit", + ResultMessage.RESULT_PARSE_FAILED: "Parse failed", + ResultMessage.RESULT_INVALID_CHAR: "Invalid char" +} + + +def client_send_and_recv(request_data, data_str_len, protocol): + """client socket send and recv message""" + try: + client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + except socket.error: + print("collect_plugin: client creat socket error") + return None + + try: + client_socket.connect(COLLECT_SOCKET_PATH) + except OSError: + client_socket.close() + print("collect_plugin: client connect error") + return None + + req_data_len = len(request_data) + request_msg = CLT_MAGIC + str(protocol).zfill(CLT_MSG_PRO_LEN) + str(req_data_len).zfill(CLT_MSG_LEN_LEN) + request_data + + try: + client_socket.send(request_msg.encode()) + res_data = client_socket.recv(len(RES_MAGIC) + CLT_MSG_PRO_LEN + data_str_len) + res_data = res_data.decode() + except (OSError, UnicodeError): + client_socket.close() + print("collect_plugin: client communicate error") + return None + + res_magic = res_data[:CLT_MSG_MAGIC_LEN] + if res_magic != "RES": + print("res msg format error") + return None + + protocol_str = res_data[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN] + try: + protocol_id = int(protocol_str) + except ValueError: + print("recv msg protocol id is invalid %s", protocol_str) + return None + + if protocol_id >= ClientProtocol.PRO_END: + print("protocol id is invalid") + return None + + try: + res_data_len = int(res_data[CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN:]) + res_msg_data = client_socket.recv(res_data_len) + res_msg_data = res_msg_data.decode() + return res_msg_data + except (OSError, ValueError, UnicodeError): + print("collect_plugin: client recv res msg error") + finally: + client_socket.close() + + return None + +def validate_parameters(param, len_limit, char_limit): + ret = ResultMessage.RESULT_SUCCEED + if not param: + print("parm is invalid") + ret = ResultMessage.RESULT_NOT_PARAM + return [False, ret] + + if not isinstance(param, list): + print(f"{param} is not list type.") + ret = ResultMessage.RESULT_NOT_PARAM + return [False, ret] + + if len(param) <= 0: + print(f"{param} length is 0.") + ret = ResultMessage.RESULT_INVALID_LENGTH + return [False, ret] + + if len(param) > len_limit: + print(f"{param} length more than {len_limit}") + ret = ResultMessage.RESULT_EXCEED_LIMIT + return [False, ret] + + pattern = r'^[a-zA-Z0-9_-]+$' + for info in param: + if len(info) > char_limit: + print(f"{info} length more than {char_limit}") + ret = ResultMessage.RESULT_EXCEED_LIMIT + return [False, ret] + if not re.match(pattern, info): + print(f"{info} is invalid char") + ret = ResultMessage.RESULT_INVALID_CHAR + return [False, ret] + + return [True, ret] + +def is_iocollect_valid(period, disk_list=None, stage=None): + result = inter_is_iocollect_valid(period, disk_list, stage) + error_code = result['ret'] + if error_code != ResultMessage.RESULT_SUCCEED: + result['message'] = Result_Messages[error_code] + return result + +def inter_is_iocollect_valid(period, disk_list=None, stage=None): + result = {} + result['ret'] = ResultMessage.RESULT_UNKNOWN + 
result['message'] = "" + + if not period or not isinstance(period, int): + result['ret'] = ResultMessage.RESULT_NOT_PARAM + return result + if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN: + result['ret'] = ResultMessage.RESULT_INVALID_LENGTH + return result + + if not disk_list: + disk_list = [] + else: + res = validate_parameters(disk_list, LIMIT_DISK_LIST_LEN, LIMIT_DISK_CHAR_LEN) + if not res[0]: + result['ret'] = res[1] + return result + + if not stage: + stage = [] + else: + res = validate_parameters(stage, LIMIT_STAGE_LIST_LEN, LIMIT_STAGE_CHAR_LEN) + if not res[0]: + result['ret'] = res[1] + return result + + req_msg_struct = { + 'disk_list': json.dumps(disk_list), + 'period': period, + 'stage': json.dumps(stage) + } + request_message = json.dumps(req_msg_struct) + result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.IS_IOCOLLECT_VALID) + if not result_message: + print("collect_plugin: client_send_and_recv failed") + return result + + try: + json.loads(result_message) + except json.JSONDecodeError: + print("is_iocollect_valid: json decode error") + result['ret'] = ResultMessage.RESULT_PARSE_FAILED + return result + + result['ret'] = ResultMessage.RESULT_SUCCEED + result['message'] = result_message + return result + +def get_io_data(period, disk_list, stage, iotype): + result = inter_get_io_data(period, disk_list, stage, iotype) + error_code = result['ret'] + if error_code != ResultMessage.RESULT_SUCCEED: + result['message'] = Result_Messages[error_code] + return result + +def inter_get_io_data(period, disk_list, stage, iotype): + result = {} + result['ret'] = ResultMessage.RESULT_UNKNOWN + result['message'] = "" + + if not isinstance(period, int): + result['ret'] = ResultMessage.RESULT_NOT_PARAM + return result + if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN: + result['ret'] = ResultMessage.RESULT_INVALID_LENGTH + return result + + res = validate_parameters(disk_list, LIMIT_DISK_LIST_LEN, LIMIT_DISK_CHAR_LEN) + if not res[0]: + result['ret'] = res[1] + return result + + res = validate_parameters(stage, LIMIT_STAGE_LIST_LEN, LIMIT_STAGE_CHAR_LEN) + if not res[0]: + result['ret'] = res[1] + return result + + res = validate_parameters(iotype, LIMIT_IOTYPE_LIST_LEN, LIMIT_IOTYPE_CHAR_LEN) + if not res[0]: + result['ret'] = res[1] + return result + + req_msg_struct = { + 'disk_list': json.dumps(disk_list), + 'period': period, + 'stage': json.dumps(stage), + 'iotype' : json.dumps(iotype) + } + + request_message = json.dumps(req_msg_struct) + result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.GET_IO_DATA) + if not result_message: + print("collect_plugin: client_send_and_recv failed") + return result + try: + json.loads(result_message) + except json.JSONDecodeError: + print("get_io_data: json decode error") + result['ret'] = ResultMessage.RESULT_PARSE_FAILED + return result + + result['ret'] = ResultMessage.RESULT_SUCCEED + result['message'] = result_message + return result + diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py new file mode 100644 index 0000000..fa49781 --- /dev/null +++ b/src/python/sentryCollector/collect_server.py @@ -0,0 +1,285 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. 
+# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +listen module +""" +import sys +import signal +import traceback +import socket +import os +import json +import logging +import fcntl +import select +import threading +import time + +from .collect_io import IO_GLOBAL_DATA, IO_CONFIG_DATA +from .collect_config import CollectConfig + +SENTRY_RUN_DIR = "/var/run/sysSentry" +COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" + +# socket param +CLT_LISTEN_QUEUE_LEN = 5 +SERVER_EPOLL_TIMEOUT = 0.3 + +# data length param +CLT_MSG_HEAD_LEN = 9 #3+2+4 +CLT_MSG_PRO_LEN = 2 +CLT_MSG_MAGIC_LEN = 3 +CLT_MSG_LEN_LEN = 4 + +# data flag param +CLT_MAGIC = "CLT" +RES_MAGIC = "RES" + +# interface protocol +class ServerProtocol(): + IS_IOCOLLECT_VALID = 0 + GET_IO_DATA = 1 + PRO_END = 3 + +class CollectServer(): + + def __init__(self): + + self.io_global_data = {} + + self.stop_event = threading.Event() + + def is_iocollect_valid(self, data_struct): + + result_rev = {} + self.io_global_data = IO_GLOBAL_DATA + + if len(IO_CONFIG_DATA) == 0: + logging.error("the collect thread is not started, the data is invalid. ") + return json.dumps(result_rev) + + period_time = IO_CONFIG_DATA[0] + max_save = IO_CONFIG_DATA[1] + + disk_list = json.loads(data_struct['disk_list']) + period = int(data_struct['period']) + stage_list = json.loads(data_struct['stage']) + + if (period < period_time) or (period > period_time * max_save) or (period % period_time): + logging.error("is_iocollect_valid: period time: %d is invalid", period) + return json.dumps(result_rev) + + for disk_name, stage_info in self.io_global_data.items(): + if len(disk_list) > 0 and disk_name not in disk_list: + continue + result_rev[disk_name] = [] + if len(stage_list) == 0: + result_rev[disk_name] = list(stage_info.keys()) + continue + for stage_name, stage_data in stage_info.items(): + if stage_name in stage_list: + result_rev[disk_name].append(stage_name) + + return json.dumps(result_rev) + + def get_io_data(self, data_struct): + result_rev = {} + self.io_global_data = IO_GLOBAL_DATA + + if len(IO_CONFIG_DATA) == 0: + logging.error("the collect thread is not started, the data is invalid. 
") + return json.dumps(result_rev) + period_time = IO_CONFIG_DATA[0] + max_save = IO_CONFIG_DATA[1] + + period = int(data_struct['period']) + disk_list = json.loads(data_struct['disk_list']) + stage_list = json.loads(data_struct['stage']) + iotype_list = json.loads(data_struct['iotype']) + + if (period < period_time) or (period > period_time * max_save) or (period % period_time): + logging.error("get_io_data: period time: %d is invalid", period) + return json.dumps(result_rev) + + collect_index = period // period_time - 1 + logging.debug("period: %d, collect_index: %d", period, collect_index) + + for disk_name, stage_info in self.io_global_data.items(): + if disk_name not in disk_list: + continue + result_rev[disk_name] = {} + for stage_name, iotype_info in stage_info.items(): + if len(stage_list) > 0 and stage_name not in stage_list: + continue + result_rev[disk_name][stage_name] = {} + for iotype_name, iotype_info in iotype_info.items(): + if iotype_name not in iotype_list: + continue + if len(iotype_info) < collect_index: + continue + result_rev[disk_name][stage_name][iotype_name] = iotype_info[collect_index] + + return json.dumps(result_rev) + + def msg_data_process(self, msg_data, protocal_id): + """message data process""" + logging.debug("msg_data %s", msg_data) + protocol_name = msg_data[0] + try: + data_struct = json.loads(msg_data) + except json.JSONDecodeError: + logging.error("msg data process: json decode error") + return "Request message decode failed" + + if protocal_id == ServerProtocol.IS_IOCOLLECT_VALID: + res_msg = self.is_iocollect_valid(data_struct) + elif protocal_id == ServerProtocol.GET_IO_DATA: + res_msg = self.get_io_data(data_struct) + + return res_msg + + def msg_head_process(self, msg_head): + """message head process""" + ctl_magic = msg_head[:CLT_MSG_MAGIC_LEN] + if ctl_magic != CLT_MAGIC: + logging.error("recv msg head magic invalid") + return None + + protocol_str = msg_head[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN] + try: + protocol_id = int(protocol_str) + except ValueError: + logging.error("recv msg protocol id is invalid") + return None + + data_len_str = msg_head[CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN:CLT_MSG_HEAD_LEN] + try: + data_len = int(data_len_str) + except ValueError: + logging.error("recv msg data len is invalid %s", data_len_str) + return None + + return [protocol_id, data_len] + + def server_recv(self, server_socket: socket.socket): + """server receive""" + try: + client_socket, _ = server_socket.accept() + logging.debug("server_fd listen ok") + except socket.error: + logging.error("server accept failed, %s", socket.error) + return + + try: + msg_head = client_socket.recv(CLT_MSG_HEAD_LEN) + logging.debug("recv msg head: %s", msg_head.decode()) + head_info = self.msg_head_process(msg_head.decode()) + except (OSError, UnicodeError): + client_socket.close() + logging.error("server recv HEAD failed") + return + + protocol_id = head_info[0] + data_len = head_info[1] + logging.debug("msg protocol id: %d, data length: %d", protocol_id, data_len) + if protocol_id >= ServerProtocol.PRO_END: + client_socket.close() + logging.error("protocol id is invalid") + return + + if data_len < 0: + client_socket.close() + logging.error("msg head parse failed") + return + + try: + msg_data = client_socket.recv(data_len) + msg_data_decode = msg_data.decode() + logging.debug("msg data %s", msg_data_decode) + except (OSError, UnicodeError): + client_socket.close() + logging.error("server recv MSG failed") + return + + res_data = 
self.msg_data_process(msg_data_decode, protocol_id) + logging.debug("res data %s", res_data) + + # server send + res_head = RES_MAGIC + res_head += str(protocol_id).zfill(CLT_MSG_PRO_LEN) + res_data_len = str(len(res_data)).zfill(CLT_MSG_LEN_LEN) + res_head += res_data_len + logging.debug("res head %s", res_head) + + res_msg = res_head + res_data + logging.debug("res msg %s", res_msg) + + try: + client_socket.send(res_msg.encode()) + except OSError: + logging.error("server recv failed") + finally: + client_socket.close() + return + + def server_fd_create(self): + """create server fd""" + if not os.path.exists(SENTRY_RUN_DIR): + logging.error("%s not exist, failed", SENTRY_RUN_DIR) + return None + + try: + server_fd = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + server_fd.setblocking(False) + if os.path.exists(COLLECT_SOCKET_PATH): + os.remove(COLLECT_SOCKET_PATH) + + server_fd.bind(COLLECT_SOCKET_PATH) + os.chmod(COLLECT_SOCKET_PATH, 0o600) + server_fd.listen(CLT_LISTEN_QUEUE_LEN) + logging.debug("%s bind and listen", COLLECT_SOCKET_PATH) + except socket.error: + logging.error("server fd create failed") + server_fd = None + + return server_fd + + + def server_loop(self): + """main loop""" + logging.info("collect server thread start") + server_fd = self.server_fd_create() + if not server_fd: + return + + epoll_fd = select.epoll() + epoll_fd.register(server_fd.fileno(), select.EPOLLIN) + + logging.debug("start server_loop loop") + while True: + if self.stop_event.is_set(): + logging.info("collect server thread exit") + server_fd = None + return + try: + events_list = epoll_fd.poll(SERVER_EPOLL_TIMEOUT) + for event_fd, _ in events_list: + if event_fd == server_fd.fileno(): + self.server_recv(server_fd) + else: + continue + except socket.error: + pass + + def stop_thread(self): + logging.info("collect server thread is preparing to exit") + self.stop_event.set() diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py new file mode 100644 index 0000000..b77c642 --- /dev/null +++ b/src/python/sentryCollector/collectd.py @@ -0,0 +1,99 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +main loop for collect. 
+""" +import sys +import signal +import traceback +import socket +import os +import json +import logging +import fcntl +import select + +import threading + +from .collect_io import CollectIo +from .collect_server import CollectServer +from .collect_config import CollectConfig + +SENTRY_RUN_DIR = "/var/run/sysSentry" +COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" +SENTRY_RUN_DIR_PERM = 0o750 + +COLLECT_LOG_FILE = "/var/log/sysSentry/collector.log" +Thread_List = [] +Module_Map_Class = {"io" : CollectIo} + +def remove_sock_file(): + try: + os.unlink(COLLECT_SOCKET_PATH) + except FileNotFoundError: + pass + +def sig_handler(signum, _f): + if signum not in (signal.SIGINT, signal.SIGTERM): + return + for i in range(len(Thread_List)): + Thread_List[i][0].stop_thread() + + remove_sock_file() + sys.exit(0) + +def main(): + """main + """ + if not os.path.exists(SENTRY_RUN_DIR): + os.mkdir(SENTRY_RUN_DIR) + os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM) + + logging.basicConfig(filename=COLLECT_LOG_FILE, level=logging.INFO) + os.chmod(COLLECT_LOG_FILE, 0o600) + + try: + signal.signal(signal.SIGINT, sig_handler) + signal.signal(signal.SIGTERM, sig_handler) + signal.signal(signal.SIGHUP, sig_handler) + + logging.info("finish main parse_args") + + module_config = CollectConfig() + module_list = module_config.modules + + # listen thread + cs = CollectServer() + listen_thread = threading.Thread(target=cs.server_loop) + listen_thread.start() + Thread_List.append([cs, listen_thread]) + + # collect thread + for info in module_list: + class_name = Module_Map_Class.get(info) + if not class_name: + logging.info("%s correspond to class is not exists", info) + continue + cn = class_name(module_config) + collect_thread = threading.Thread(target=cn.main_loop) + collect_thread.start() + Thread_List.append([cn, collect_thread]) + + for i in range(len(Thread_List)): + Thread_List[i][1].join() + + except Exception: + logging.error('%s', traceback.format_exc()) + finally: + pass + + logging.info("All threads have finished. 
Main thread is exiting.") \ No newline at end of file diff --git a/src/python/setup.py b/src/python/setup.py index f96a96e..c28c691 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -31,7 +31,9 @@ setup( 'console_scripts': [ 'cpu_sentry=syssentry.cpu_sentry:main', 'syssentry=syssentry.syssentry:main', - 'xalarmd=xalarm.xalarm_daemon:alarm_process_create' + 'xalarmd=xalarm.xalarm_daemon:alarm_process_create', + 'sentryCollector=sentryCollector.collectd:main', + 'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main' ] }, ) -- Gitee From 5ac1052811af59ee6401d401fddc1ba47571cc3c Mon Sep 17 00:00:00 2001 From: gaoruoshu Date: Thu, 12 Sep 2024 11:31:34 +0800 Subject: [PATCH 14/76] add avg_block_io plugin --- config/plugins/avg_block_io.ini | 21 ++ config/tasks/avg_block_io.mod | 5 + src/python/sentryPlugins/__init__.py | 0 .../sentryPlugins/avg_block_io/__init__.py | 0 .../avg_block_io/avg_block_io.py | 257 ++++++++++++++++++ .../sentryPlugins/avg_block_io/module_conn.py | 86 ++++++ .../avg_block_io/stage_window.py | 47 ++++ .../sentryPlugins/avg_block_io/utils.py | 86 ++++++ 8 files changed, 502 insertions(+) create mode 100644 config/plugins/avg_block_io.ini create mode 100644 config/tasks/avg_block_io.mod create mode 100644 src/python/sentryPlugins/__init__.py create mode 100644 src/python/sentryPlugins/avg_block_io/__init__.py create mode 100644 src/python/sentryPlugins/avg_block_io/avg_block_io.py create mode 100644 src/python/sentryPlugins/avg_block_io/module_conn.py create mode 100644 src/python/sentryPlugins/avg_block_io/stage_window.py create mode 100644 src/python/sentryPlugins/avg_block_io/utils.py diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini new file mode 100644 index 0000000..bc33dde --- /dev/null +++ b/config/plugins/avg_block_io.ini @@ -0,0 +1,21 @@ +[common] +disk=default +stage=default +iotype=read,write +period_time=1 + +[algorithm] +win_size=30 +win_threshold=6 + +[latency] +read_avg_lim=10 +write_avg_lim=10 +read_avg_time=3 +write_avg_time=3 +read_tot_lim=50 +write_tot_lim=50 + +[iodump] +read_iodump_lim=0 +write_iodump_lim=0 diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod new file mode 100644 index 0000000..814c483 --- /dev/null +++ b/config/tasks/avg_block_io.mod @@ -0,0 +1,5 @@ +[common] +enabled=yes +task_start=/usr/bin/python3 /usr/bin/avg_block_io +task_stop=pkill avg_block_io +type=oneshot \ No newline at end of file diff --git a/src/python/sentryPlugins/__init__.py b/src/python/sentryPlugins/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/python/sentryPlugins/avg_block_io/__init__.py b/src/python/sentryPlugins/avg_block_io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py new file mode 100644 index 0000000..ff2071d --- /dev/null +++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py @@ -0,0 +1,257 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. 
+import logging
+import signal
+import configparser
+import time
+
+from .stage_window import IoWindow, IoDumpWindow
+from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler
+from .utils import update_avg_and_check_abnormal
+
+CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
+
+def log_invalid_keys(not_in_list, keys_name, config_list, default_list):
+    """print invalid log"""
+    if config_list and default_list:
+        logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list))
+    elif config_list == ["default"]:
+        logging.warning("Default {} use {}".format(keys_name, default_list))
+
+
+def read_config_common(config):
+    """read config file, get [common] section value"""
+    try:
+        common_sec = config['common']
+    except KeyError:
+        report_alarm_fail("Cannot find common section in config file")
+
+    try:
+        period_time = int(common_sec.get("period_time", 1))
+        if not (1 <= period_time <= 300):
+            raise ValueError("Invalid period_time")
+    except ValueError:
+        period_time = 1
+        logging.warning("Invalid period_time, set to 1s")
+
+    disk = common_sec.get('disk').split(",") if common_sec.get('disk') not in [None, 'default'] else []
+    stage = common_sec.get('stage').split(",") if common_sec.get('stage') not in [None, 'default'] else []
+
+    if len(disk) > 10:
+        logging.warning("Too many disks, record only max 10 disks")
+        disk = disk[:10]
+
+    iotype = common_sec.get('iotype', 'read,write').split(",")
+    iotype_list = [rw.lower() for rw in iotype if rw.lower() in ['read', 'write', 'flush', 'discard']]
+    err_iotype = [rw for rw in iotype if rw.lower() not in ['read', 'write', 'flush', 'discard']]
+
+    if err_iotype:
+        logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list))
+
+    return period_time, disk, stage, iotype_list
+
+
+def read_config_algorithm(config):
+    """read config file, get [algorithm] section value"""
+    if not config.has_section("algorithm"):
+        report_alarm_fail("Cannot find algorithm section in config file")
+
+    try:
+        win_size = int(config.get("algorithm", "win_size"))
+        if not (1 <= win_size <= 300):
+            raise ValueError("Invalid win_size")
+        win_threshold = int(config.get("algorithm", "win_threshold"))
+        if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
+            raise ValueError("Invalid win_threshold")
+    except (ValueError, configparser.NoOptionError):
+        report_alarm_fail("Invalid win_threshold or win_size")
+
+    return win_size, win_threshold
+
+
+def read_config_lat_iodump(io_dic, config):
+    """read config file, get [latency] [iodump] section value"""
+    common_param = {}
+    for io_type in io_dic["iotype_list"]:
+        common_param[io_type] = {}
+
+        latency_keys = {
+            "avg_lim": "{}_avg_lim".format(io_type),
+            "avg_time": "{}_avg_time".format(io_type),
+            "tot_lim": "{}_tot_lim".format(io_type),
+        }
+        iodump_key = "{}_iodump_lim".format(io_type)
+
+        for key_template in latency_keys.values():
+            if key_template in config["latency"] and config["latency"][key_template].isdecimal():
+                common_param[io_type][key_template] = int(config["latency"][key_template])
+
+        if iodump_key in config["iodump"] and config["iodump"][iodump_key].isdecimal():
+            common_param[io_type][iodump_key] = int(config["iodump"][iodump_key])
+
+    return common_param
+
+
+def read_config_stage(config, stage, iotype_list):
+    """read config file, get [STAGE_NAME] section value"""
+    res = {}
+    if stage not in config:
+        return res
+
+    for key in config[stage]:
+        if config[stage][key].isdecimal():
+            res[key] = int(config[stage][key])
+
+    return res
+
+
+def init_io_win(io_dic, config, common_param):
+    """initialize windows of latency, iodump, and dict of avg_value"""
+    iotype_list = io_dic["iotype_list"]
+    io_data = {}
+    io_avg_value = {}
+    for disk_name in io_dic["disk_list"]:
+        io_data[disk_name] = {}
+        io_avg_value[disk_name] = {}
+        for stage_name in io_dic["stage_list"]:
+            io_data[disk_name][stage_name] = {}
+            io_avg_value[disk_name][stage_name] = {}
+            # step3. parse the per-stage config
+            curr_stage_param = read_config_stage(config, stage_name, iotype_list)
+            for rw in iotype_list:
+                io_data[disk_name][stage_name][rw] = {}
+                io_avg_value[disk_name][stage_name][rw] = [0, 0]
+
+                # create the latency and iodump windows for each rw
+                avg_lim_key = "{}_avg_lim".format(rw)
+                avg_time_key = "{}_avg_time".format(rw)
+                tot_lim_key = "{}_tot_lim".format(rw)
+                iodump_lim_key = "{}_iodump_lim".format(rw)
+
+                # look the values up in curr_stage_param first; fall back to common_param when absent
+                avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(rw, {}).get(avg_lim_key))
+                avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(rw, {}).get(avg_time_key))
+                tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(rw, {}).get(tot_lim_key))
+                iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get(rw, {}).get(iodump_lim_key))
+
+                if avg_lim_value and avg_time_value and tot_lim_value:
+                    io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value)
+
+                if iodump_lim_value is not None:
+                    io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value)
+    return io_data, io_avg_value
+
+
+def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
+    """get disk_list and stage_list by sentryCollector"""
+    json_data = avg_is_iocollect_valid(io_dic, config_disk, config_stage)
+
+    all_disk_set = json_data.keys()
+    all_stage_set = set()
+    for disk_stage_list in json_data.values():
+        all_stage_set.update(disk_stage_list)
+
+    disk_list = [key for key in config_disk if key in all_disk_set]
+    not_in_disk_list = [key for key in config_disk if key not in all_disk_set]
+
+    stage_list = [key for key in config_stage if key in all_stage_set]
+    not_in_stage_list = [key for key in config_stage if key not in all_stage_set]
+
+    if not config_disk:
+        disk_list = [key for key in all_disk_set]
+
+    if not config_stage:
+        stage_list = [key for key in all_stage_set]
+
+    if config_disk and not disk_list:
+        logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk))
+        disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage)
+
+    if config_stage and not stage_list:
+        logging.warning("Cannot get valid stage by stage={}, set to default".format(config_stage))
+        disk_list, stage_list = get_valid_disk_stage_list(io_dic, config_disk, [])
+
+    if not stage_list or not disk_list:
+        report_alarm_fail("Cannot get valid disk name or stage name.")
+
+    log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list)
+    log_invalid_keys(not_in_stage_list, 'stage', config_stage, stage_list)
+
+    return disk_list, stage_list
+
+
+def main_loop(io_dic, io_data, io_avg_value):
+    """main loop of avg_block_io"""
+    period_time = io_dic["period_time"]
+    disk_list = io_dic["disk_list"]
+    stage_list = io_dic["stage_list"]
+    iotype_list = io_dic["iotype_list"]
+    win_size = io_dic["win_size"]
+    # start the main loop
+    while True:
+        # wait for period_time seconds
+        time.sleep(period_time)
+
+        # talk to the collect module and fetch this period's data
+        curr_period_data = avg_get_io_data(io_dic)
+
+        # process the period data
+        reach_size = False
+        for disk_name in disk_list:
+            for stage_name in stage_list:
+                for rw in iotype_list:
+                    if disk_name in curr_period_data and stage_name in curr_period_data[disk_name] and rw in curr_period_data[disk_name][stage_name]:
+                        io_key = (disk_name, stage_name, rw)
+                        reach_size = update_avg_and_check_abnormal(curr_period_data, io_key, win_size, io_avg_value, io_data)
+
+        # skip the alarm checks until the window has filled up to win_size
+        if not reach_size:
+            continue
+
+        # check for abnormal windows and abnormal scenarios
+        for disk_name in disk_list:
+            for rw in iotype_list:
+                process_report_data(disk_name, rw, io_data)
+
+
+def main():
+    """main func"""
+    # register handlers for the stop signals SIGINT(-2) and SIGTERM(-15)
+    signal.signal(signal.SIGINT, sig_handler)
+    signal.signal(signal.SIGTERM, sig_handler)
+
+    # initialize the config reader
+    config = configparser.ConfigParser(comment_prefixes=('#', ';'))
+    try:
+        config.read(CONFIG_FILE)
+    except configparser.Error:
+        report_alarm_fail("Failed to read config file")
+
+    io_dic = {}
+
+    # read the config file -- [common] section
+    io_dic["period_time"], disk, stage, io_dic["iotype_list"] = read_config_common(config)
+
+    # check against the collect module, is_iocollect_valid()
+    io_dic["disk_list"], io_dic["stage_list"] = get_valid_disk_stage_list(io_dic, disk, stage)
+
+    if "bio" not in io_dic["stage_list"]:
+        report_alarm_fail("Cannot run avg_block_io without bio stage")
+
+    # initialize the windows -- read config keys matching what is_iocollect_valid returned
+    # step1. parse the common config --- [algorithm]
+    io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config)
+
+    # step2. create the windows in a loop
+    common_param = read_config_lat_iodump(io_dic, config)
+    io_data, io_avg_value = init_io_win(io_dic, config, common_param)
+
+    main_loop(io_dic, io_data, io_avg_value)
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
new file mode 100644
index 0000000..0da4208
--- /dev/null
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
@@ -0,0 +1,86 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+import json +import logging +import sys +import time + +from .utils import is_abnormal +from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages +from syssentry.result import ResultLevel, report_result + + +TASK_NAME = "avg_block_io" + +def sig_handler(signum, _f): + """stop avg_block_io""" + report_result(TASK_NAME, ResultLevel.PASS, json.dumps({})) + logging.info("Finished avg_block_io plugin running.") + sys.exit(0) + +def avg_get_io_data(io_dic): + """get_io_data from sentryCollector""" + res = get_io_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"]) + return check_result_validation(res, 'get io data') + + +def avg_is_iocollect_valid(io_dic, config_disk, config_stage): + """is_iocollect_valid from sentryCollector""" + res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage) + return check_result_validation(res, 'check config validation') + + +def check_result_validation(res, reason): + """check validation of result from sentryCollector""" + if not 'ret' in res or not 'message' in res: + err_msg = "Failed to {}: Cannot connect to sentryCollector.".format(reason) + report_alarm_fail(err_msg) + if res['ret'] != 0: + err_msg = "Failed to {}: {}".format(reason, Result_Messages[res['ret']]) + report_alarm_fail(err_msg) + + try: + json_data = json.loads(res['message']) + except json.JSONDecodeError: + err_msg = "Failed to {}: invalid return message".format(reason) + report_alarm_fail(err_msg) + + return json_data + + +def report_alarm_fail(alarm_info): + """report result to xalarmd""" + report_result(TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": alarm_info})) + logging.error(alarm_info) + sys.exit(1) + + +def process_report_data(disk_name, rw, io_data): + """check abnormal window and report to xalarm""" + if not is_abnormal((disk_name, 'bio', rw), io_data): + return + + ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq'] + for stage_name in ctrl_stage: + if is_abnormal((disk_name, stage_name, rw), io_data): + logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw)) + return + + if is_abnormal((disk_name, 'rq_driver', rw), io_data): + logging.warning("{} - {} - {} report driver".format(time.ctime(), disk_name, rw)) + return + + kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue'] + for stage_name in kernel_stage: + if is_abnormal((disk_name, stage_name, rw), io_data): + logging.warning("{} - {} - {} report kernel".format(time.ctime(), disk_name, rw)) + return + logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw)) diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py new file mode 100644 index 0000000..9b0ce79 --- /dev/null +++ b/src/python/sentryPlugins/avg_block_io/stage_window.py @@ -0,0 +1,47 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. 
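Before the window classes below, a standalone sketch of the boolean-window bookkeeping they implement: every period appends one abnormal/normal flag, the oldest flag drops out, and the window trips once the abnormal count passes window_threshold. The latency values and the limit here are made up.

window_size, window_threshold = 5, 3
abnormal_window = [False] * window_size

def append_new_period(is_abnormal_period):
    # drop the oldest flag, append the newest one
    abnormal_window.pop(0)
    abnormal_window.append(is_abnormal_period)

for latency, limit in [(10, 40), (50, 40), (55, 40), (60, 40), (70, 40)]:
    append_new_period(latency > limit)

print(sum(abnormal_window) > window_threshold)  # True: 4 abnormal periods out of 5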
+ +class AbnormalWindowBase: + def __init__(self, window_size=10, window_threshold=7): + self.window_size = window_size + self.window_threshold = window_threshold + self.abnormal_window = [False] * window_size + + def append_new_period(self, ab_res, avg_val=0): + self.abnormal_window.pop(0) + if self.is_abnormal_period(ab_res, avg_val): + self.abnormal_window.append(True) + else: + self.abnormal_window.append(False) + + def is_abnormal_window(self): + return sum(self.abnormal_window) > self.window_threshold + + +class IoWindow(AbnormalWindowBase): + def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40): + super().__init__(window_size, window_threshold) + self.abnormal_multiple = abnormal_multiple + self.abnormal_multiple_lim = abnormal_multiple_lim + self.abnormal_time = abnormal_time + + def is_abnormal_period(self, value, avg_val): + return (value > avg_val * self.abnormal_multiple and value > self.abnormal_multiple_lim) or \ + (value > self.abnormal_time) + + +class IoDumpWindow(AbnormalWindowBase): + def __init__(self, window_size=10, window_threshold=7, abnormal_time=40): + super().__init__(window_size, window_threshold) + self.abnormal_time = abnormal_time + + def is_abnormal_period(self, value, avg_val=0): + return value > self.abnormal_time diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py new file mode 100644 index 0000000..54ed080 --- /dev/null +++ b/src/python/sentryPlugins/avg_block_io/utils.py @@ -0,0 +1,86 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. 
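A worked sketch of the running-average update that update_io_avg() below performs: while the window is filling it is a plain incremental mean; once the count reaches win_size, the count stays fixed and each new sample displaces one share of the old mean. The sample series is made up.

win_size = 3
avg = [0.0, 0]  # [AVG_VALUE, AVG_COUNT]
for sample in (10.0, 20.0, 30.0, 90.0):
    value, count = avg
    if count < win_size:
        avg = [(value * count + sample) / (count + 1), count + 1]  # filling: true mean
    else:
        avg = [(value * (count - 1) + sample) / count, count]      # full: damped update
    print(avg)
# [10.0, 1] -> [15.0, 2] -> [20.0, 3] -> [43.33..., 3]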
+AVG_VALUE = 0
+AVG_COUNT = 1
+
+
+def get_nested_value(data, keys):
+    """get data from nested dict"""
+    for key in keys:
+        if key in data:
+            data = data[key]
+        else:
+            return None
+    return data
+
+
+def set_nested_value(data, keys, value):
+    """set data to nested dict"""
+    for key in keys[:-1]:
+        if key in data:
+            data = data[key]
+        else:
+            return False
+    data[keys[-1]] = value
+    return True
+
+
+def is_abnormal(io_key, io_data):
+    """check if latency and iodump win abnormal"""
+    for key in ['latency', 'iodump']:
+        all_keys = get_nested_value(io_data, io_key)
+        if all_keys and key in all_keys:
+            win = get_nested_value(io_data, io_key + (key,))
+            if win and win.is_abnormal_window():
+                return True
+    return False
+
+
+def update_io_avg(old_avg, period_value, win_size):
+    """update average of latency window"""
+    if old_avg[AVG_COUNT] < win_size:
+        new_avg_count = old_avg[AVG_COUNT] + 1
+        new_avg_value = (old_avg[AVG_VALUE] * old_avg[AVG_COUNT] + period_value[0]) / new_avg_count
+    else:
+        new_avg_count = old_avg[AVG_COUNT]
+        new_avg_value = (old_avg[AVG_VALUE] * (old_avg[AVG_COUNT] - 1) + period_value[0]) / new_avg_count
+    return [new_avg_value, new_avg_count]
+
+
+def update_io_data(old_avg, period_value, win_size, io_data, io_key):
+    """update data of latency and iodump window"""
+    all_wins = get_nested_value(io_data, io_key)
+    if all_wins and "latency" in all_wins:
+        io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE])
+    if all_wins and "iodump" in all_wins:
+        io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1])
+
+
+def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data):
+    """update avg and check abnormal, return true if win_size full"""
+    period_value = get_nested_value(data, io_key)
+    old_avg = get_nested_value(io_avg_value, io_key)
+
+    # update the avg data
+    if old_avg[AVG_COUNT] < win_size:
+        set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
+        return False
+
+    # update the window data -- check for an abnormal period
+    update_io_data(old_avg, period_value, win_size, io_data, io_key)
+    all_wins = get_nested_value(io_data, io_key)
+    if all_wins and 'latency' not in all_wins:
+        return True
+    period = get_nested_value(io_data, io_key + ("latency",))
+    if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
+        return True
+    set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
+    return True
-- 
Gitee

From 534eee9a8ee77bf928a8d5cf574893fda8c67039 Mon Sep 17 00:00:00 2001
From: zhuofeng
Date: Fri, 20 Sep 2024 14:35:39 +0800
Subject: [PATCH 15/76] fix some issues in the collect module and avg_block_io
---
 config/tasks/avg_block_io.mod                  |  4 ++--
 src/python/sentryCollector/collect_io.py       | 18 +++++++++++-------
 src/python/sentryCollector/collect_plugin.py   | 17 ++++++++---------
 src/python/sentryCollector/collect_server.py   |  6 +++---
 src/python/sentryCollector/collectd.py         |  2 --
 .../sentryPlugins/avg_block_io/avg_block_io.py | 13 ++++++++++---
 6 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod
index 814c483..b9b6f34 100644
--- a/config/tasks/avg_block_io.mod
+++ b/config/tasks/avg_block_io.mod
@@ -1,5 +1,5 @@
 [common]
 enabled=yes
 task_start=/usr/bin/python3 /usr/bin/avg_block_io
-task_stop=pkill avg_block_io
-type=oneshot
\ No newline at end of file
+task_stop=pkill -f /usr/bin/avg_block_io
+type=oneshot
diff --git a/src/python/sentryCollector/collect_io.py 
b/src/python/sentryCollector/collect_io.py index b826dc4..104b734 100644 --- a/src/python/sentryCollector/collect_io.py +++ b/src/python/sentryCollector/collect_io.py @@ -175,8 +175,7 @@ class CollectIo(): threading.Timer(self.period_time, self.task_loop).start() - def main_loop(self): - logging.info("collect io thread start") + def is_kernel_avaliable(self): base_path = '/sys/kernel/debug/block' for disk_name in os.listdir(base_path): if not self.loop_all and disk_name not in self.disk_list: @@ -198,8 +197,13 @@ class CollectIo(): self.window_value[disk_name] = {} IO_GLOBAL_DATA[disk_name] = {} - if len(self.disk_map_stage) == 0: - logging.warning("no disks meet the requirements. the thread exits") + return len(IO_GLOBAL_DATA) != 0 + + def main_loop(self): + logging.info("collect io thread start") + + if not self.is_kernel_avaliable() or len(self.disk_map_stage) == 0: + logging.warning("no disks meet the requirements. collect io thread exits") return for disk_name, stage_list in self.disk_map_stage.items(): @@ -213,7 +217,7 @@ class CollectIo(): start_time = time.time() if self.stop_event.is_set(): - logging.info("collect io thread exit") + logging.debug("collect io thread exit") return for disk_name, stage_list in self.disk_map_stage.items(): @@ -227,7 +231,7 @@ class CollectIo(): continue while sleep_time > 1: if self.stop_event.is_set(): - logging.info("collect io thread exit") + logging.debug("collect io thread exit") return time.sleep(1) sleep_time -= 1 @@ -235,5 +239,5 @@ class CollectIo(): # set stop event, notify thread exit def stop_thread(self): - logging.info("collect io thread is preparing to exit") + logging.debug("collect io thread is preparing to exit") self.stop_event.set() diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py index 49ce0a8..9132473 100644 --- a/src/python/sentryCollector/collect_plugin.py +++ b/src/python/sentryCollector/collect_plugin.py @@ -142,22 +142,21 @@ def validate_parameters(param, len_limit, char_limit): ret = ResultMessage.RESULT_INVALID_LENGTH return [False, ret] - if len(param) > len_limit: - print(f"{param} length more than {len_limit}") - ret = ResultMessage.RESULT_EXCEED_LIMIT - return [False, ret] - pattern = r'^[a-zA-Z0-9_-]+$' for info in param: - if len(info) > char_limit: - print(f"{info} length more than {char_limit}") - ret = ResultMessage.RESULT_EXCEED_LIMIT - return [False, ret] if not re.match(pattern, info): print(f"{info} is invalid char") ret = ResultMessage.RESULT_INVALID_CHAR return [False, ret] + # length of len_limit is exceeded, keep len_limit + if len(param) > len_limit: + print(f"{param} length more than {len_limit}, keep the first {len_limit}") + param[:] = param[0:len_limit] + + # only keep elements under the char_limit length + param[:] = [elem for elem in param if len(elem) <= char_limit] + return [True, ret] def is_iocollect_valid(period, disk_list=None, stage=None): diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py index fa49781..bab4e56 100644 --- a/src/python/sentryCollector/collect_server.py +++ b/src/python/sentryCollector/collect_server.py @@ -256,7 +256,7 @@ class CollectServer(): def server_loop(self): """main loop""" - logging.info("collect server thread start") + logging.info("collect listen thread start") server_fd = self.server_fd_create() if not server_fd: return @@ -267,7 +267,7 @@ class CollectServer(): logging.debug("start server_loop loop") while True: if self.stop_event.is_set(): - 
logging.info("collect server thread exit") + logging.debug("collect listen thread exit") server_fd = None return try: @@ -281,5 +281,5 @@ class CollectServer(): pass def stop_thread(self): - logging.info("collect server thread is preparing to exit") + logging.debug("collect listen thread is preparing to exit") self.stop_event.set() diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py index b77c642..3a836df 100644 --- a/src/python/sentryCollector/collectd.py +++ b/src/python/sentryCollector/collectd.py @@ -49,7 +49,6 @@ def sig_handler(signum, _f): Thread_List[i][0].stop_thread() remove_sock_file() - sys.exit(0) def main(): """main @@ -64,7 +63,6 @@ def main(): try: signal.signal(signal.SIGINT, sig_handler) signal.signal(signal.SIGTERM, sig_handler) - signal.signal(signal.SIGHUP, sig_handler) logging.info("finish main parse_args") diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py index ff2071d..73f0b22 100644 --- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py @@ -21,7 +21,7 @@ CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" def log_invalid_keys(not_in_list, keys_name, config_list, default_list): """print invalid log""" - if config_list and default_list: + if config_list and not_in_list: logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list)) elif config_list == ["default"]: logging.warning("Default {} use {}".format(keys_name, default_list)) @@ -144,9 +144,11 @@ def init_io_win(io_dic, config, common_param): if avg_lim_value and avg_time_value and tot_lim_value: io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value) + logging.debug("Successfully create {}-{}-{} latency window".format(disk_name, stage_name, rw)) if iodump_lim_value is not None: io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value) + logging.debug("Successfully create {}-{}-{} iodump window".format(disk_name, stage_name, rw)) return io_data, io_avg_value @@ -159,10 +161,10 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage): for disk_stage_list in json_data.values(): all_stage_set.update(disk_stage_list) - disk_list = [key for key in config_disk if key in all_disk_set] + disk_list = [key for key in all_disk_set if key in config_disk] not_in_disk_list = [key for key in config_disk if key not in all_disk_set] - stage_list = [key for key in config_stage if key in all_stage_set] + stage_list = [key for key in all_stage_set if key in config_stage] not_in_stage_list = [key for key in config_stage if key not in all_stage_set] if not config_disk: @@ -171,6 +173,9 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage): if not config_stage: stage_list = [key for key in all_stage_set] + disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list + stage_list = stage_list[:15] if len(stage_list) > 15 else stage_list + if config_disk and not disk_list: logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk)) disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage) @@ -228,6 +233,8 @@ def main(): signal.signal(signal.SIGINT, 
sig_handler) signal.signal(signal.SIGTERM, sig_handler) + logging.basicConfig(level=logging.INFO) + # 初始化配置读取 config = configparser.ConfigParser(comment_prefixes=('#', ';')) try: -- Gitee From d2a44b1004a3ed542225b0996a54866622b6701f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> Date: Mon, 23 Sep 2024 11:03:26 +0800 Subject: [PATCH 16/76] add ai threshold slow io detection to sysSentry --- .../ai_threshold_slow_io_detection.ini | 16 ++ .../tasks/ai_threshold_slow_io_detection.mod | 5 + .../test_ai_threshold_slow_io_detection.py | 165 ++++++++++++++++++ .../ai_threshold_slow_io_detection/README.md | 2 + .../__init__.py | 0 .../alarm_report.py | 49 ++++++ .../config_parser.py | 141 +++++++++++++++ .../data_access.py | 91 ++++++++++ .../detector.py | 48 +++++ .../ai_threshold_slow_io_detection/io_data.py | 74 ++++++++ .../sliding_window.py | 113 ++++++++++++ .../slow_io_detection.py | 133 ++++++++++++++ .../threshold.py | 160 +++++++++++++++++ .../ai_threshold_slow_io_detection/utils.py | 67 +++++++ src/python/setup.py | 3 +- 15 files changed, 1066 insertions(+), 1 deletion(-) create mode 100644 config/plugins/ai_threshold_slow_io_detection.ini create mode 100644 config/tasks/ai_threshold_slow_io_detection.mod create mode 100644 selftest/test/test_ai_threshold_slow_io_detection.py create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py diff --git a/config/plugins/ai_threshold_slow_io_detection.ini b/config/plugins/ai_threshold_slow_io_detection.ini new file mode 100644 index 0000000..44eb928 --- /dev/null +++ b/config/plugins/ai_threshold_slow_io_detection.ini @@ -0,0 +1,16 @@ +[common] +absolute_threshold=40 +slow_io_detect_frequency=1 +log_level=info + +[algorithm] +train_data_duration=0.1 +train_update_duration=0.02 +algorithm_type=n_sigma +boxplot_parameter=1.5 +n_sigma_parameter=3 + +[sliding_window] +sliding_window_type=not_continuous +window_size=30 +window_minimum_threshold=6 \ No newline at end of file diff --git a/config/tasks/ai_threshold_slow_io_detection.mod b/config/tasks/ai_threshold_slow_io_detection.mod new file mode 100644 index 0000000..2729f72 --- /dev/null +++ b/config/tasks/ai_threshold_slow_io_detection.mod @@ -0,0 +1,5 @@ +[common] +enabled=yes +task_start=/usr/bin/python3 /usr/bin/ai_threshold_slow_io_detection +task_stop=pkill -f /usr/bin/ai_threshold_slow_io_detection +type=oneshot \ No newline at end of file diff --git a/selftest/test/test_ai_threshold_slow_io_detection.py b/selftest/test/test_ai_threshold_slow_io_detection.py new file mode 100644 index 0000000..c36fef5 --- /dev/null +++ 
b/selftest/test/test_ai_threshold_slow_io_detection.py
@@ -0,0 +1,165 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+import unittest
+import numpy as np
+
+from sentryPlugins.ai_threshold_slow_io_detection.threshold import AbsoluteThreshold, BoxplotThreshold, NSigmaThreshold
+from sentryPlugins.ai_threshold_slow_io_detection.sliding_window import (NotContinuousSlidingWindow,
+                                                                         ContinuousSlidingWindow, MedianSlidingWindow)
+
+
+def _get_boxplot_threshold(data_list: list, parameter):
+    q1 = np.percentile(data_list, 25)
+    q3 = np.percentile(data_list, 75)
+    iqr = q3 - q1
+    return q3 + parameter * iqr
+
+
+def _get_n_sigma_threshold(data_list: list, parameter):
+    mean = np.mean(data_list)
+    std = np.std(data_list)
+    return mean + parameter * std
+
+
+class Test(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        print("UnitTest Begin...")
+
+    @classmethod
+    def tearDownClass(cls):
+        print("UnitTest End...")
+
+    def setUp(self):
+        print("Begin...")
+
+    def tearDown(self):
+        print("End...")
+
+    def test_absolute_threshold(self):
+        absolute = AbsoluteThreshold()
+        self.assertEqual(None, absolute.get_threshold())
+        self.assertFalse(absolute.is_abnormal(5000))
+        absolute.set_threshold(40)
+        self.assertEqual(40, absolute.get_threshold())
+        self.assertTrue(absolute.is_abnormal(50))
+
+    def test_boxplot_threshold(self):
+        boxplot = BoxplotThreshold(1.5, 5, 1)
+        # phase 1: not initialized yet
+        self.assertEqual(None, boxplot.get_threshold())
+        self.assertFalse(boxplot.is_abnormal(5000))
+        # after pushing 5 elements into the boxplot, a threshold is generated
+        data_list = [20, 20, 20, 30, 10]
+        for data in data_list:
+            boxplot.push_latest_data_to_queue(data)
+        # phase 2: initialized
+        boxplot_threshold = boxplot.get_threshold()
+        self.assertEqual(_get_boxplot_threshold(data_list, 1.5), boxplot_threshold)
+        self.assertTrue(boxplot.is_abnormal(5000))
+        data_list.pop(0)
+        data_list.append(100)
+        boxplot.push_latest_data_to_queue(100)
+        # phase 3: threshold updated
+        boxplot_threshold = boxplot.get_threshold()
+        self.assertEqual(_get_boxplot_threshold(data_list, 1.5), boxplot_threshold)
+
+    def test_n_sigma_threshold(self):
+        n_sigma = NSigmaThreshold(3, 5, 1)
+        self.assertEqual(None, n_sigma.get_threshold())
+        self.assertFalse(n_sigma.is_abnormal(5000))
+        data_list = [20, 20, 20, 30, 10]
+        for data in data_list:
+            n_sigma.push_latest_data_to_queue(data)
+        n_sigma_threshold = n_sigma.get_threshold()
+        self.assertEqual(_get_n_sigma_threshold(data_list, 3), n_sigma_threshold)
+        self.assertTrue(n_sigma.is_abnormal(5000))
+        data_list.pop(0)
+        data_list.append(100)
+        n_sigma.push_latest_data_to_queue(100)
+        # phase 3: threshold updated
+        n_sigma_threshold = n_sigma.get_threshold()
+        self.assertEqual(_get_n_sigma_threshold(data_list, 3), n_sigma_threshold)
+
+    def test_not_continuous_sliding_window(self):
+        not_continuous = NotContinuousSlidingWindow(5, 3)
+        boxplot_threshold = BoxplotThreshold(1.5, 10, 8)
+        boxplot_threshold.attach_observer(not_continuous)
+        data_list1 = [19, 20, 20, 20, 20, 20, 22, 24, 23, 20]
+        for data in data_list1:
+            boxplot_threshold.push_latest_data_to_queue(data)
+            result = not_continuous.is_slow_io_event(data)
+            self.assertFalse(result[0])
+        self.assertEqual(23.75, boxplot_threshold.get_threshold())
+        boxplot_threshold.push_latest_data_to_queue(24)
+        result = not_continuous.is_slow_io_event(24)
+        self.assertFalse(result[0])
+        boxplot_threshold.push_latest_data_to_queue(25)
+        result = not_continuous.is_slow_io_event(25)
+        self.assertTrue(result[0])
+        data_list2 = [20, 20, 20, 20, 20, 20]
+        for data in data_list2:
+            boxplot_threshold.push_latest_data_to_queue(data)
+            result = not_continuous.is_slow_io_event(data)
+            self.assertFalse(result[0])
+        self.assertEqual(25.625, boxplot_threshold.get_threshold())
+
+    def test_continuous_sliding_window(self):
+        continuous = ContinuousSlidingWindow(5, 3)
+        boxplot_threshold = BoxplotThreshold(1.5, 10, 8)
+        boxplot_threshold.attach_observer(continuous)
+        data_list = [19, 20, 20, 20, 20, 20, 22, 24, 23, 20]
+        for data in data_list:
+            boxplot_threshold.push_latest_data_to_queue(data)
+            result = continuous.is_slow_io_event(data)
+            self.assertFalse(result[0])
+        self.assertEqual(23.75, boxplot_threshold.get_threshold())
+        # fewer than three abnormal points so far
+        self.assertFalse(continuous.is_slow_io_event(25)[0])
+        # three abnormal points, but not consecutive
+        self.assertFalse(continuous.is_slow_io_event(25)[0])
+        # three consecutive abnormal points
+        self.assertTrue(continuous.is_slow_io_event(25)[0])
+
+    def test_median_sliding_window(self):
+        median = MedianSlidingWindow(5, 3)
+        absolute_threshold = AbsoluteThreshold(10, 8)
+        absolute_threshold.attach_observer(median)
+        absolute_threshold.set_threshold(24.5)
+        data_list = [24, 24, 24, 25, 25]
+        for data in data_list:
+            self.assertFalse(median.is_slow_io_event(data)[0])
+        self.assertTrue(median.is_slow_io_event(25)[0])
+
+    def test_parse_collect_data(self):
+        collect = {
+            "read": [1.0, 2.0, 3.0, 4.0],
+            "write": [5.0, 6.0, 7.0, 8.0],
+            "flush": [9.0, 10.0, 11.0, 12.0],
+            "discard": [13.0, 14.0, 15.0, 16.0],
+        }
+        from io_data import BaseData
+        from data_access import _get_io_stage_data
+
+        io_data = _get_io_stage_data(collect)
+        self.assertEqual(
+            io_data.read, BaseData(latency=1.0, io_dump=2.0, io_length=3.0, iops=4.0)
+        )
+        self.assertEqual(
+            io_data.write, BaseData(latency=5.0, io_dump=6.0, io_length=7.0, iops=8.0)
+        )
+        self.assertEqual(
+            io_data.flush, BaseData(latency=9.0, io_dump=10.0, io_length=11.0, iops=12.0)
+        )
+        self.assertEqual(
+            io_data.discard, BaseData(latency=13.0, io_dump=14.0, io_length=15.0, iops=16.0)
+        )
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md b/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md
new file mode 100644
index 0000000..f9b8388
--- /dev/null
+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md
@@ -0,0 +1,2 @@
+# slow_io_detection
+
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py
new file mode 100644
index 0000000..3f4f34e
--- /dev/null
+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py
@@ -0,0 +1,49 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. + +from syssentry.result import ResultLevel, report_result +import logging +import json + + +class AlarmReport: + TASK_NAME = "SLOW_IO_DETECTION" + + @staticmethod + def report_pass(info: str): + report_result(AlarmReport.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info})) + logging.info(f'Report {AlarmReport.TASK_NAME} PASS: {info}') + + @staticmethod + def report_fail(info: str): + report_result(AlarmReport.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info})) + logging.info(f'Report {AlarmReport.TASK_NAME} FAIL: {info}') + + @staticmethod + def report_skip(info: str): + report_result(AlarmReport.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info})) + logging.info(f'Report {AlarmReport.TASK_NAME} SKIP: {info}') + + @staticmethod + def report_minor_alm(info: str): + report_result(AlarmReport.TASK_NAME, ResultLevel.MINOR_ALM, json.dumps({"msg": info})) + logging.info(f'Report {AlarmReport.TASK_NAME} MINOR_ALM: {info}') + + @staticmethod + def report_major_alm(info: str): + report_result(AlarmReport.TASK_NAME, ResultLevel.MAJOR_ALM, json.dumps({"msg": info})) + logging.info(f'Report {AlarmReport.TASK_NAME} MAJOR_ALM: {info}') + + @staticmethod + def report_critical_alm(info: str): + report_result(AlarmReport.TASK_NAME, ResultLevel.CRITICAL_ALM, json.dumps({"msg": info})) + logging.info(f'Report {AlarmReport.TASK_NAME} CRITICAL_ALM: {info}') + diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py new file mode 100644 index 0000000..cd4e6f1 --- /dev/null +++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py @@ -0,0 +1,141 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. 
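The ConfigParser class below applies one pattern to every option: read it from the section dict, attempt the type conversion, and fall back to a module default with a warning on ValueError. A condensed, runnable sketch of that pattern under assumed option names (read_int is our own helper, not part of the plugin):

import configparser
import logging

DEFAULT_WINDOW_SIZE = 30

def read_int(section, option, default):
    # fall back to the default when the value is missing or not an integer
    try:
        return int(section.get(option, default))
    except ValueError:
        logging.warning('%s type conversion has error, use default value.', option)
        return default

con = configparser.ConfigParser()
con.read_string("[sliding_window]\nwindow_size=oops\n")
section = dict(con.items('sliding_window'))
print(read_int(section, 'window_size', DEFAULT_WINDOW_SIZE))  # 30, after the warning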
+ +import configparser +import logging + + +class ConfigParser: + + DEFAULT_ABSOLUTE_THRESHOLD = 40 + DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1 + DEFAULT_LOG_LEVEL = 'info' + DEFAULT_TRAIN_DATA_DURATION = 24 + DEFAULT_TRAIN_UPDATE_DURATION = 2 + DEFAULT_ALGORITHM_TYPE = 'boxplot' + DEFAULT_N_SIGMA_PARAMETER = 3 + DEFAULT_BOXPLOT_PARAMETER = 1.5 + DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous' + DEFAULT_WINDOW_SIZE = 30 + DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6 + + def __init__(self, config_file_name): + self.__boxplot_parameter = None + self.__window_minimum_threshold = None + self.__window_size = None + self.__sliding_window_type = None + self.__n_sigma_parameter = None + self.__algorithm_type = None + self.__train_update_duration = None + self.__log_level = None + self.__slow_io_detect_frequency = None + self.__absolute_threshold = None + self.__train_data_duration = None + self.__config_file_name = config_file_name + + def read_config_from_file(self): + + con = configparser.ConfigParser() + con.read(self.__config_file_name, encoding='utf-8') + + items_common = dict(con.items('common')) + items_algorithm = dict(con.items('algorithm')) + items_sliding_window = dict(con.items('sliding_window')) + + try: + self.__absolute_threshold = int(items_common.get('absolute_threshold', + ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD)) + except ValueError: + self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD + logging.warning('absolute threshold type conversion has error, use default value.') + + try: + self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency', + ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY)) + except ValueError: + self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY + logging.warning('slow_io_detect_frequency type conversion has error, use default value.') + + self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL) + + try: + self.__train_data_duration = float(items_algorithm.get('train_data_duration', + ConfigParser.DEFAULT_TRAIN_DATA_DURATION)) + except ValueError: + self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION + logging.warning('train_data_duration type conversion has error, use default value.') + + try: + self.__train_update_duration = float(items_algorithm.get('train_update_duration', + ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION)) + except ValueError: + self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION + logging.warning('train_update_duration type conversion has error, use default value.') + + try: + self.__algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE) + except ValueError: + self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE + logging.warning('algorithmType type conversion has error, use default value.') + + if self.__algorithm_type == 'n_sigma': + try: + self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter', + ConfigParser.DEFAULT_N_SIGMA_PARAMETER)) + except ValueError: + self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER + logging.warning('n_sigma_parameter type conversion has error, use default value.') + elif self.__algorithm_type == 'boxplot': + try: + self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter', + ConfigParser.DEFAULT_BOXPLOT_PARAMETER)) + except ValueError: + self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER + logging.warning('boxplot_parameter type conversion has error, use default value.') + + 
self.__sliding_window_type = items_sliding_window.get('sliding_window_type', + ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE) + + try: + self.__window_size = int(items_sliding_window.get('window_size', + ConfigParser.DEFAULT_WINDOW_SIZE)) + except ValueError: + self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE + logging.warning('window_size type conversion has error, use default value.') + + try: + self.__window_minimum_threshold = ( + int(items_sliding_window.get('window_minimum_threshold', + ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD))) + except ValueError: + self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD + logging.warning('window_minimum_threshold type conversion has error, use default value.') + + def get_slow_io_detect_frequency(self): + return self.__slow_io_detect_frequency + + def get_algorithm_type(self): + return self.__algorithm_type + + def get_sliding_window_type(self): + return self.__sliding_window_type + + def get_train_data_duration_and_train_update_duration(self): + return self.__train_data_duration, self.__train_update_duration + + def get_window_size_and_window_minimum_threshold(self): + return self.__window_size, self.__window_minimum_threshold + + def get_absolute_threshold(self): + return self.__absolute_threshold + + def get_log_level(self): + return self.__log_level diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py new file mode 100644 index 0000000..d9f3460 --- /dev/null +++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py @@ -0,0 +1,91 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. 
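Every accessor in data_access.py below consumes the same envelope from sentryCollector: a dict with an integer "ret" and a JSON string "message". A toy round-trip with made-up payload values; the [latency, io_dump, io_length, iops] ordering matches _get_io_stage_data below.

import json

res = {"ret": 0, "message": json.dumps({"sda": {"bio": {"read": [1.0, 2.0, 3.0, 4.0]}}})}
if res.get("ret") == 0:
    data = json.loads(res["message"])
    print(list(data.keys()))           # ['sda'] -- disk names, as in check_collect_valid
    print(data["sda"]["bio"]["read"])  # [latency, io_dump, io_length, iops]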
+ +import json +import logging + +from sentryCollector.collect_plugin import ( + Result_Messages, + get_io_data, + is_iocollect_valid, +) +from .io_data import IOStageData, IOData + +COLLECT_STAGES = [ + "throtl", + "wbt", + "gettag", + "plug", + "bfq", + "hctx", + "requeue", + "rq_driver", + "bio", + "iocost", +] + +def check_collect_valid(period): + data_raw = is_iocollect_valid(period) + if data_raw["ret"] == 0: + try: + data = json.loads(data_raw["message"]) + except Exception as e: + logging.warning(f"get io data failed, {e}") + return [] + return [k for k in data.keys()] + else: + return [] + + +def _get_raw_data(period, disk_list): + return get_io_data( + period, + disk_list, + COLLECT_STAGES, + ["read", "write", "flush", "discard"], + ) + + +def _get_io_stage_data(data): + io_stage_data = IOStageData() + for data_type in ('read', 'write', 'flush', 'discard'): + if data_type in data: + getattr(io_stage_data, data_type).latency = data[data_type][0] + getattr(io_stage_data, data_type).io_dump = data[data_type][1] + getattr(io_stage_data, data_type).io_length = data[data_type][2] + getattr(io_stage_data, data_type).iops = data[data_type][3] + return io_stage_data + + +def get_io_data_from_collect_plug(period, disk_list): + data_raw = _get_raw_data(period, disk_list) + if data_raw["ret"] == 0: + ret = {} + try: + data = json.loads(data_raw["message"]) + except json.decoder.JSONDecodeError as e: + logging.warning(f"get io data failed, {e}") + return None + + for disk in data: + disk_data = data[disk] + disk_ret = IOData() + for k, v in disk_data.items(): + try: + getattr(disk_ret, k) + setattr(disk_ret, k, _get_io_stage_data(v)) + except AttributeError: + logging.debug(f'no attr {k}') + continue + ret[disk] = disk_ret + return ret + logging.warning(f'get io data failed with message: {data_raw["message"]}') + return None diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py new file mode 100644 index 0000000..eda9825 --- /dev/null +++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py @@ -0,0 +1,48 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. 
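The Detector defined below only wires a Threshold to a SlidingWindow through the observer hook and feeds both once per period. The same wiring can be driven directly, mirroring the selftest earlier in this patch (assuming the package is importable under the same path the selftest uses):

from sentryPlugins.ai_threshold_slow_io_detection.threshold import BoxplotThreshold
from sentryPlugins.ai_threshold_slow_io_detection.sliding_window import NotContinuousSlidingWindow

threshold = BoxplotThreshold(1.5, 10, 8)   # boxplot parameter, queue size, update batch size
window = NotContinuousSlidingWindow(5, 3)  # window length, abnormal-count threshold
threshold.attach_observer(window)          # threshold refreshes propagate into the window

for latency in [19, 20, 20, 20, 20, 20, 22, 24, 23, 20, 25, 26]:
    threshold.push_latest_data_to_queue(latency)
    is_slow, win, thresh = window.is_slow_io_event(latency)
    if is_slow:  # trips on the final 26: three of the last five periods exceed 23.75
        print("slow io event: window={}, threshold={}".format(win, thresh))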
+import logging + +from .io_data import MetricName +from .threshold import Threshold +from .sliding_window import SlidingWindow +from .utils import get_metric_value_from_io_data_dict_by_metric_name + + +class Detector: + _metric_name: MetricName = None + _threshold: Threshold = None + _slidingWindow: SlidingWindow = None + + def __init__(self, metric_name: MetricName, threshold: Threshold, sliding_window: SlidingWindow): + self._metric_name = metric_name + self._threshold = threshold + self._slidingWindow = sliding_window + self._threshold.attach_observer(self._slidingWindow) + + def get_metric_name(self): + return self._metric_name + + def is_slow_io_event(self, io_data_dict_with_disk_name: dict): + logging.debug(f'Enter Detector: {self}') + metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) + if metric_value > 1e-6: + logging.debug(f'Input metric value: {str(metric_value)}') + self._threshold.push_latest_data_to_queue(metric_value) + detection_result = self._slidingWindow.is_slow_io_event(metric_value) + logging.debug(f'Detection result: {str(detection_result)}') + logging.debug(f'Exit Detector: {self}') + return detection_result + + def __repr__(self): + return (f'disk_name: {self._metric_name.get_disk_name()}, stage_name: {self._metric_name.get_stage_name()},' + f' access_type_name: {self._metric_name.get_io_access_type_name()},' + f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},' + f' sliding_window_type: {self._slidingWindow}') diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py new file mode 100644 index 0000000..0e17051 --- /dev/null +++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py @@ -0,0 +1,74 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. 
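The dataclasses below are navigated generically with getattr/setattr, which is what data_access._get_io_stage_data above relies on. A trimmed-down sketch of that access pattern (only two fields kept, placeholder numbers):

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class BaseData:
    latency: Optional[float] = None
    io_dump: Optional[int] = None

@dataclass
class IOStageData:
    read: BaseData = field(default_factory=BaseData)
    write: BaseData = field(default_factory=BaseData)

stage = IOStageData()
getattr(stage, "read").latency = 1.0  # same shape as the real classes below
print(stage.read)                     # BaseData(latency=1.0, io_dump=None)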
+ +from dataclasses import dataclass, field +from datetime import datetime +from typing import Optional + + +@dataclass +class BaseData: + latency: Optional[float] = field(default_factory=lambda: None) + io_dump: Optional[int] = field(default_factory=lambda: None) + io_length: Optional[int] = field(default_factory=lambda: None) + iops: Optional[int] = field(default_factory=lambda: None) + + +@dataclass +class IOStageData: + read: BaseData = field(default_factory=lambda: BaseData()) + write: BaseData = field(default_factory=lambda: BaseData()) + flush: BaseData = field(default_factory=lambda: BaseData()) + discard: BaseData = field(default_factory=lambda: BaseData()) + + +@dataclass +class IOData: + throtl: IOStageData = field(default_factory=lambda: IOStageData()) + wbt: IOStageData = field(default_factory=lambda: IOStageData()) + gettag: IOStageData = field(default_factory=lambda: IOStageData()) + iocost: IOStageData = field(default_factory=lambda: IOStageData()) + plug: IOStageData = field(default_factory=lambda: IOStageData()) + bfq: IOStageData = field(default_factory=lambda: IOStageData()) + hctx: IOStageData = field(default_factory=lambda: IOStageData()) + requeue: IOStageData = field(default_factory=lambda: IOStageData()) + rq_driver: IOStageData = field(default_factory=lambda: IOStageData()) + bio: IOStageData = field(default_factory=lambda: IOStageData()) + time_stamp: float = field(default_factory=lambda: datetime.now().timestamp()) + + +class MetricName: + _disk_name: str = None + _stage_name: str = None + _io_access_type_name: str = None + _metric_name: str = None + + def __init__(self, disk_name: str, stage_name: str, io_access_type_name: str, metric_name: str): + self._disk_name = disk_name + self._stage_name = stage_name + self._io_access_type_name = io_access_type_name + self._metric_name = metric_name + + def get_disk_name(self): + return self._disk_name + + def get_stage_name(self): + return self._stage_name + + def get_io_access_type_name(self): + return self._io_access_type_name + + def get_metric_name(self): + return self._metric_name + + def __repr__(self): + return (f'disk: {self._disk_name}, stage: {self._stage_name}, io_access_type: {self._io_access_type_name},' + f'metric: {self._metric_name}') diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py new file mode 100644 index 0000000..d395d48 --- /dev/null +++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py @@ -0,0 +1,113 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. 
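The three window flavors defined below differ only in their trip condition over the same tagged queue: not_continuous counts abnormal periods anywhere in the window, continuous requires a consecutive run, and median compares the window median against the threshold. A toy comparison of the three conditions over one sequence (abnormal limit 40, required count 3):

import numpy as np

values = [10, 50, 50, 10, 50]  # window contents
tags = [v >= 40 for v in values]

not_continuous = sum(tags) >= 3   # True: three abnormal periods in total
longest_run = max(len(run) for run in ''.join('x' if t else '.' for t in tags).split('.'))
continuous = longest_run >= 3     # False: the longest consecutive run is 2
median = np.median(values) >= 40  # True: the median of the window is 50

print(not_continuous, continuous, median)  # True False True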
+ +from enum import Enum, unique +import numpy as np + + +@unique +class SlidingWindowType(Enum): + NotContinuousSlidingWindow = 0 + ContinuousSlidingWindow = 1 + MedianSlidingWindow = 2 + + +class SlidingWindow: + _ai_threshold = None + _queue_length = None + _queue_threshold = None + _io_data_queue: list = None + _io_data_queue_abnormal_tag: list = None + + def __init__(self, queue_length: int, threshold: int): + self._queue_length = queue_length + self._queue_threshold = threshold + self._io_data_queue = [] + self._io_data_queue_abnormal_tag = [] + + def push(self, data: float): + if len(self._io_data_queue) == self._queue_length: + self._io_data_queue.pop(0) + self._io_data_queue_abnormal_tag.pop(0) + self._io_data_queue.append(data) + self._io_data_queue_abnormal_tag.append(data >= self._ai_threshold if self._ai_threshold is not None else False) + + def update(self, threshold): + if self._ai_threshold == threshold: + return + self._ai_threshold = threshold + self._io_data_queue_abnormal_tag.clear() + for data in self._io_data_queue: + self._io_data_queue_abnormal_tag.append(data >= self._ai_threshold) + + def is_slow_io_event(self, data): + return False, None, None + + def __repr__(self): + return "SlidingWindow" + + +class NotContinuousSlidingWindow(SlidingWindow): + def is_slow_io_event(self, data): + super().push(data) + if len(self._io_data_queue) < self._queue_length or self._ai_threshold is None: + return False, self._io_data_queue, self._ai_threshold + if self._io_data_queue_abnormal_tag.count(True) >= self._queue_threshold: + return True, self._io_data_queue, self._ai_threshold + return False, self._io_data_queue, self._ai_threshold + + def __repr__(self): + return "NotContinuousSlidingWindow" + + +class ContinuousSlidingWindow(SlidingWindow): + def is_slow_io_event(self, data): + super().push(data) + if len(self._io_data_queue) < self._queue_length or self._ai_threshold is None: + return False, self._io_data_queue, self._ai_threshold + consecutive_count = 0 + for tag in self._io_data_queue_abnormal_tag: + if tag: + consecutive_count += 1 + if consecutive_count >= self._queue_threshold: + return True, self._io_data_queue, self._ai_threshold + else: + consecutive_count = 0 + return False, self._io_data_queue, self._ai_threshold + + def __repr__(self): + return "ContinuousSlidingWindow" + + +class MedianSlidingWindow(SlidingWindow): + def is_slow_io_event(self, data): + super().push(data) + if len(self._io_data_queue) < self._queue_length or self._ai_threshold is None: + return False, self._io_data_queue, self._ai_threshold + median = np.median(self._io_data_queue) + if median >= self._ai_threshold: + return True, self._io_data_queue, self._ai_threshold + return False, self._io_data_queue, self._ai_threshold + + def __repr__(self): + return "MedianSlidingWindow" + + +class SlidingWindowFactory: + def get_sliding_window(self, sliding_window_type: SlidingWindowType, *args, **kwargs): + if sliding_window_type == SlidingWindowType.NotContinuousSlidingWindow: + return NotContinuousSlidingWindow(*args, **kwargs) + elif sliding_window_type == SlidingWindowType.ContinuousSlidingWindow: + return ContinuousSlidingWindow(*args, **kwargs) + elif sliding_window_type == SlidingWindowType.MedianSlidingWindow: + return MedianSlidingWindow(*args, **kwargs) + else: + return NotContinuousSlidingWindow(*args, **kwargs) diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py new file mode 
100644 index 0000000..43cf770 --- /dev/null +++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py @@ -0,0 +1,133 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. + +import time +import signal +import logging + +from .detector import Detector +from .threshold import ThresholdFactory, AbsoluteThreshold +from .sliding_window import SlidingWindowFactory +from .utils import (get_threshold_type_enum, get_sliding_window_type_enum, get_data_queue_size_and_update_size, + get_log_level) +from .config_parser import ConfigParser +from .data_access import get_io_data_from_collect_plug, check_collect_valid +from .io_data import MetricName +from .alarm_report import AlarmReport + +CONFIG_FILE = "/etc/sysSentry/plugins/ai_threshold_slow_io_detection.ini" + + +def sig_handler(signum, frame): + logging.info("receive signal: %d", signum) + AlarmReport().report_fail(f"receive signal: {signum}") + exit(signum) + + +class SlowIODetection: + _config_parser = None + _disk_list = None + _detector_name_list = [] + _detectors = {} + + def __init__(self, config_parser: ConfigParser): + self._config_parser = config_parser + self.__set_log_format() + self.__init_detector_name_list() + self.__init_detector() + + def __set_log_format(self): + log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" + log_level = get_log_level(self._config_parser.get_log_level()) + logging.basicConfig(level=log_level, format=log_format) + + def __init_detector_name_list(self): + self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency()) + for disk in self._disk_list: + self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) + self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) + + def __init_detector(self): + train_data_duration, train_update_duration = (self._config_parser. 
+                                                              get_train_data_duration_and_train_update_duration())
+        slow_io_detection_frequency = self._config_parser.get_slow_io_detect_frequency()
+        threshold_type = get_threshold_type_enum(self._config_parser.get_algorithm_type())
+        data_queue_size, update_size = get_data_queue_size_and_update_size(train_data_duration,
+                                                                           train_update_duration,
+                                                                           slow_io_detection_frequency)
+        sliding_window_type = get_sliding_window_type_enum(self._config_parser.get_sliding_window_type())
+        window_size, window_threshold = self._config_parser.get_window_size_and_window_minimum_threshold()
+
+        for detector_name in self._detector_name_list:
+            threshold = ThresholdFactory().get_threshold(threshold_type, data_queue_size=data_queue_size,
+                                                         data_queue_update_size=update_size)
+            sliding_window = SlidingWindowFactory().get_sliding_window(sliding_window_type, queue_length=window_size,
+                                                                       threshold=window_threshold)
+            detector = Detector(detector_name, threshold, sliding_window)
+            # initialize the threshold value for the absolute-threshold type
+            if isinstance(threshold, AbsoluteThreshold):
+                threshold.set_threshold(self._config_parser.get_absolute_threshold())
+            self._detectors[detector_name] = detector
+            logging.info(f"add detector: {detector}")
+
+    def launch(self):
+        while True:
+            logging.debug('step0. AI threshold slow io event detection is looping.')
+
+            # Step 1: fetch the IO data
+            io_data_dict_with_disk_name = get_io_data_from_collect_plug(
+                self._config_parser.get_slow_io_detect_frequency(), self._disk_list
+            )
+            logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}')
+            if io_data_dict_with_disk_name is None:
+                continue
+            # Step 2: slow IO detection
+            logging.debug('step2. Start to detect slow io events.')
+            slow_io_event_list = []
+            for metric_name, detector in self._detectors.items():
+                result = detector.is_slow_io_event(io_data_dict_with_disk_name)
+                if result[0]:
+                    slow_io_event_list.append((detector.get_metric_name(), result))
+            logging.debug('step2. End of slow io event detection.')
+
+            # Step 3: report slow IO events
+            logging.debug('step3. Report slow io event to sysSentry.')
+            for slow_io_event in slow_io_event_list:
+                metric_name: MetricName = slow_io_event[0]
+                result = slow_io_event[1]
+                AlarmReport.report_major_alm(f"disk {metric_name.get_disk_name()} has slow io event. "
+                                             f"stage: {metric_name.get_stage_name()}, "
+                                             f"type: {metric_name.get_io_access_type_name()}, "
+                                             f"metric: {metric_name.get_metric_name()}, "
+                                             f"current window: {result[1]}, "
+                                             f"threshold: {result[2]}")
+                logging.error(f"slow io event happened: {str(slow_io_event)}")
+
+            # Step 4: wait for the next detection period
+            logging.debug('step4. Wait to start next slow io event detection loop.')
+            time.sleep(self._config_parser.get_slow_io_detect_frequency())
+
+
+def main():
+    # Step 1: register the signal handlers
+    signal.signal(signal.SIGINT, sig_handler)
+    signal.signal(signal.SIGTERM, sig_handler)
+    # Step 2: resume from a checkpoint
+    # todo:
+
+    # Step 3: read the config
+    config_file_name = CONFIG_FILE
+    config = ConfigParser(config_file_name)
+    config.read_config_from_file()
+
+    # Step 4: start slow IO detection
+    slow_io_detection = SlowIODetection(config)
+    slow_io_detection.launch()
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py
new file mode 100644
index 0000000..9e1ca7b
--- /dev/null
+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py
@@ -0,0 +1,160 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. +import logging +from enum import Enum +import queue +import numpy as np +import math + +from .sliding_window import SlidingWindow + + +class ThresholdState(Enum): + INIT = 0 + START = 1 + + +class Threshold: + threshold = None + data_queue: queue.Queue = None + data_queue_update_size: int = None + new_data_size: int = None + threshold_state: ThresholdState = None + + def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000): + self._observer = None + self.data_queue = queue.Queue(data_queue_size) + self.data_queue_update_size = data_queue_update_size + self.new_data_size = 0 + self.threshold_state = ThresholdState.INIT + self.threshold = math.inf + + def set_threshold(self, threshold): + self.threshold = threshold + self.threshold_state = ThresholdState.START + self.notify_observer() + + def get_threshold(self): + if self.threshold_state == ThresholdState.INIT: + return None + return self.threshold + + def is_abnormal(self, data): + if self.threshold_state == ThresholdState.INIT: + return False + return data >= self.threshold + + # 使用观察者模式,当阈值更新时,自动同步刷新滑窗中的阈值 + def attach_observer(self, observer: SlidingWindow): + self._observer = observer + + def notify_observer(self): + if self._observer is not None: + self._observer.update(self.threshold) + + def push_latest_data_to_queue(self, data): + pass + + def __repr__(self): + return "Threshold" + + +class AbsoluteThreshold(Threshold): + def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000): + super().__init__(data_queue_size, data_queue_update_size) + + def push_latest_data_to_queue(self, data): + pass + + def __repr__(self): + return "AbsoluteThreshold" + + +class BoxplotThreshold(Threshold): + def __init__(self, parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000): + super().__init__(data_queue_size, data_queue_update_size) + self.parameter = parameter + + def _update_threshold(self): + data = list(self.data_queue.queue) + q1 = np.percentile(data, 25) + q3 = np.percentile(data, 75) + iqr = q3 - q1 + self.threshold = q3 + self.parameter * iqr + if self.threshold_state == ThresholdState.INIT: + self.threshold_state = ThresholdState.START + self.notify_observer() + + def push_latest_data_to_queue(self, data): + try: + self.data_queue.put(data, block=False) + except queue.Full: + self.data_queue.get() + self.data_queue.put(data) + self.new_data_size += 1 + if (self.data_queue.full() and (self.threshold_state == ThresholdState.INIT or + (self.threshold_state == ThresholdState.START and + self.new_data_size >= self.data_queue_update_size))): + self._update_threshold() + self.new_data_size = 0 + + def __repr__(self): + return "BoxplotThreshold" + + +class NSigmaThreshold(Threshold): + def __init__(self, parameter: float = 2.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000): + super().__init__(data_queue_size, data_queue_update_size) + self.parameter = parameter + + def _update_threshold(self): + data = list(self.data_queue.queue) + mean = np.mean(data) + std = np.std(data) + self.threshold = mean + self.parameter * std + if self.threshold_state == ThresholdState.INIT: + self.threshold_state = 
ThresholdState.START + self.notify_observer() + + def push_latest_data_to_queue(self, data): + try: + self.data_queue.put(data, block=False) + except queue.Full: + self.data_queue.get() + self.data_queue.put(data) + self.new_data_size += 1 + if (self.data_queue.full() and (self.threshold_state == ThresholdState.INIT or + (self.threshold_state == ThresholdState.START and + self.new_data_size >= self.data_queue_update_size))): + self._update_threshold() + self.new_data_size = 0 + + def __repr__(self): + return "NSigmaThreshold" + + +class ThresholdType(Enum): + AbsoluteThreshold = 0 + BoxplotThreshold = 1 + NSigmaThreshold = 2 + + +class ThresholdFactory: + def get_threshold(self, threshold_type: ThresholdType, *args, **kwargs): + if threshold_type == ThresholdType.AbsoluteThreshold: + return AbsoluteThreshold(*args, **kwargs) + elif threshold_type == ThresholdType.BoxplotThreshold: + return BoxplotThreshold(*args, **kwargs) + elif threshold_type == ThresholdType.NSigmaThreshold: + return NSigmaThreshold(*args, **kwargs) + else: + raise ValueError(f"Invalid threshold type: {threshold_type}") + diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py new file mode 100644 index 0000000..f66e5ed --- /dev/null +++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py @@ -0,0 +1,67 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. 
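[Editor's aside, not part of the patch] The two adaptive strategies above each reduce to one line of statistics. A standalone illustration — it assumes numpy is available, and the sample latencies are made up:

    import numpy as np

    data = [1.0, 1.2, 0.9, 1.1, 1.3, 5.0]          # hypothetical per-period latencies

    # BoxplotThreshold._update_threshold(): a Tukey fence above the third quartile.
    q1 = np.percentile(data, 25)
    q3 = np.percentile(data, 75)
    boxplot_threshold = q3 + 1.5 * (q3 - q1)       # parameter defaults to 1.5

    # NSigmaThreshold._update_threshold(): mean plus N standard deviations.
    n_sigma_threshold = np.mean(data) + 2.0 * np.std(data)  # parameter defaults to 2.0

    print(f"boxplot: {boxplot_threshold:.3f}, n-sigma: {n_sigma_threshold:.3f}")

Note that both classes recompute only once the training queue is full, and then again after every data_queue_update_size new samples — that is what the counter bookkeeping in push_latest_data_to_queue() above enforces.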
+import logging +from dataclasses import asdict + +from .threshold import ThresholdType +from .sliding_window import SlidingWindowType +from .io_data import MetricName, IOData + +def get_threshold_type_enum(algorithm_type: str): + if algorithm_type.lower() == 'absolute': + return ThresholdType.AbsoluteThreshold + if algorithm_type.lower() == 'boxplot': + return ThresholdType.BoxplotThreshold + if algorithm_type.lower() == 'n_sigma': + return ThresholdType.NSigmaThreshold + logging.info('not found correct algorithm type, use default: boxplot.') + return ThresholdType.BoxplotThreshold + + +def get_sliding_window_type_enum(sliding_window_type: str): + if sliding_window_type.lower() == 'not_continuous': + return SlidingWindowType.NotContinuousSlidingWindow + if sliding_window_type.lower() == 'continuous': + return SlidingWindowType.ContinuousSlidingWindow + if sliding_window_type.lower() == 'median': + return SlidingWindowType.MedianSlidingWindow + logging.info('not found correct sliding window type, use default: not_continuous.') + return SlidingWindowType.NotContinuousSlidingWindow + + +def get_metric_value_from_io_data_dict_by_metric_name(io_data_dict: dict, metric_name: MetricName): + try: + io_data: IOData = io_data_dict[metric_name.get_disk_name()] + io_stage_data = asdict(io_data)[metric_name.get_stage_name()] + base_data = io_stage_data[metric_name.get_io_access_type_name()] + metric_value = base_data[metric_name.get_metric_name()] + return metric_value + except KeyError: + return None + + +def get_data_queue_size_and_update_size(training_data_duration: float, train_update_duration: float, + slow_io_detect_frequency: int): + data_queue_size = int(training_data_duration * 60 * 60 / slow_io_detect_frequency) + update_size = int(train_update_duration * 60 * 60 / slow_io_detect_frequency) + return data_queue_size, update_size + + +def get_log_level(log_level: str): + if log_level.lower() == 'debug': + return logging.DEBUG + elif log_level.lower() == 'info': + return logging.INFO + elif log_level.lower() == 'warning': + return logging.WARNING + elif log_level.lower() == 'fatal': + return logging.FATAL + return None diff --git a/src/python/setup.py b/src/python/setup.py index c28c691..dac6481 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -33,7 +33,8 @@ setup( 'syssentry=syssentry.syssentry:main', 'xalarmd=xalarm.xalarm_daemon:alarm_process_create', 'sentryCollector=sentryCollector.collectd:main', - 'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main' + 'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main', + 'ai_threshold_slow_io_detection=sentryPlugins.ai_threshold_slow_io_detection.slow_io_detection:main' ] }, ) -- Gitee From f12a420ca3e71d58738d69fb289af2a84d269410 Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Wed, 25 Sep 2024 11:03:29 +0800 Subject: [PATCH 17/76] fix-bug-step-2-about-collect-module-and-avg-block-io --- src/python/sentryCollector/collect_config.py | 11 ++- src/python/sentryCollector/collect_io.py | 25 ++--- src/python/sentryCollector/collect_plugin.py | 6 +- src/python/sentryCollector/collect_server.py | 1 - src/python/sentryCollector/collectd.py | 4 +- .../avg_block_io/avg_block_io.py | 92 ++++++++++++++----- 6 files changed, 96 insertions(+), 43 deletions(-) diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py index b6cc75c..0fdd9f0 100644 --- a/src/python/sentryCollector/collect_config.py +++ b/src/python/sentryCollector/collect_config.py @@ -49,14 +49,14 @@ class CollectConfig: 
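[Editor's aside, not part of the patch] A worked example for the get_data_queue_size_and_update_size() helper introduced above, since the unit mix (durations in hours, frequency in seconds) is easy to misread. The values below are one plausible configuration, not a requirement:

    train_data_duration = 24       # hours of history used to fit the threshold
    train_update_duration = 2      # hours between threshold refreshes
    slow_io_detect_frequency = 1   # seconds per sampling/detection loop

    data_queue_size = int(train_data_duration * 60 * 60 / slow_io_detect_frequency)
    update_size = int(train_update_duration * 60 * 60 / slow_io_detect_frequency)

    print(data_queue_size, update_size)   # 86400 samples kept, refit every 7200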
self.config = configparser.ConfigParser() self.config.read(self.filename) except configparser.Error: - logging.error("collectd configure file read failed") + logging.error("collect configure file read failed") return try: common_config = self.config[CONF_COMMON] - modules_str = common_config[CONF_MODULES] + modules_str = common_config[CONF_MODULES].lower() # remove space - modules_list = modules_str.replace(" ", "").split(',') + modules_list = set(modules_str.replace(" ", "").split(',')) except KeyError as e: logging.error("read config data failed, %s", e) return @@ -98,7 +98,7 @@ class CollectConfig: CONF_IO, CONF_IO_MAX_SAVE, CONF_IO_MAX_SAVE_DEFAULT) result_io_config[CONF_IO_MAX_SAVE] = CONF_IO_MAX_SAVE_DEFAULT # disk - disk = io_map_value.get(CONF_IO_DISK) + disk = io_map_value.get(CONF_IO_DISK).lower() if disk: disk_str = disk.replace(" ", "") pattern = r'^[a-zA-Z0-9-_,]+$' @@ -106,12 +106,13 @@ class CollectConfig: logging.warning("module_name = %s section, field = %s is incorrect, use default %s", CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT) disk_str = CONF_IO_DISK_DEFAULT + disk_str = ",".join(set(disk_str.split(','))) result_io_config[CONF_IO_DISK] = disk_str else: logging.warning("module_name = %s section, field = %s is incorrect, use default %s", CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT) result_io_config[CONF_IO_DISK] = CONF_IO_DISK_DEFAULT - logging.info("config get_io_config: %s", result_io_config) + logging.debug("config get_io_config: %s", result_io_config) return result_io_config def get_common_config(self): diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py index 104b734..9c8dae7 100644 --- a/src/python/sentryCollector/collect_io.py +++ b/src/python/sentryCollector/collect_io.py @@ -177,10 +177,8 @@ class CollectIo(): def is_kernel_avaliable(self): base_path = '/sys/kernel/debug/block' + all_disk = [] for disk_name in os.listdir(base_path): - if not self.loop_all and disk_name not in self.disk_list: - continue - disk_path = os.path.join(base_path, disk_name) blk_io_hierarchy_path = os.path.join(disk_path, 'blk_io_hierarchy') @@ -190,12 +188,18 @@ class CollectIo(): for file_name in os.listdir(blk_io_hierarchy_path): file_path = os.path.join(blk_io_hierarchy_path, file_name) - if file_name == 'stats': - stage_list = self.extract_first_column(file_path) - self.disk_map_stage[disk_name] = stage_list - self.window_value[disk_name] = {} - IO_GLOBAL_DATA[disk_name] = {} + all_disk.append(disk_name) + + for disk_name in self.disk_list: + if not self.loop_all and disk_name not in all_disk: + logging.warning("the %s disk not exist!", disk_name) + continue + stats_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/stats'.format(disk_name) + stage_list = self.extract_first_column(stats_file) + self.disk_map_stage[disk_name] = stage_list + self.window_value[disk_name] = {} + IO_GLOBAL_DATA[disk_name] = {} return len(IO_GLOBAL_DATA) != 0 @@ -203,7 +207,7 @@ class CollectIo(): logging.info("collect io thread start") if not self.is_kernel_avaliable() or len(self.disk_map_stage) == 0: - logging.warning("no disks meet the requirements. collect io thread exits") + logging.warning("no disks meet the requirements. 
collect io thread exit") return for disk_name, stage_list in self.disk_map_stage.items(): @@ -239,5 +243,4 @@ class CollectIo(): # set stop event, notify thread exit def stop_thread(self): - logging.debug("collect io thread is preparing to exit") - self.stop_event.set() + self.stop_event.set() diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py index 9132473..1faa5e3 100644 --- a/src/python/sentryCollector/collect_plugin.py +++ b/src/python/sentryCollector/collect_plugin.py @@ -10,7 +10,7 @@ # See the Mulan PSL v2 for more details. """ -collcet plugin +collect plugin """ import json import socket @@ -75,7 +75,7 @@ def client_send_and_recv(request_data, data_str_len, protocol): try: client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) except socket.error: - print("collect_plugin: client creat socket error") + print("collect_plugin: client create socket error") return None try: @@ -128,7 +128,7 @@ def client_send_and_recv(request_data, data_str_len, protocol): def validate_parameters(param, len_limit, char_limit): ret = ResultMessage.RESULT_SUCCEED if not param: - print("parm is invalid") + print("param is invalid") ret = ResultMessage.RESULT_NOT_PARAM return [False, ret] diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py index bab4e56..11d1af0 100644 --- a/src/python/sentryCollector/collect_server.py +++ b/src/python/sentryCollector/collect_server.py @@ -281,5 +281,4 @@ class CollectServer(): pass def stop_thread(self): - logging.debug("collect listen thread is preparing to exit") self.stop_event.set() diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py index 3a836df..d9d8862 100644 --- a/src/python/sentryCollector/collectd.py +++ b/src/python/sentryCollector/collectd.py @@ -79,7 +79,7 @@ def main(): for info in module_list: class_name = Module_Map_Class.get(info) if not class_name: - logging.info("%s correspond to class is not exists", info) + logging.info("%s correspond to class is not exist", info) continue cn = class_name(module_config) collect_thread = threading.Thread(target=cn.main_loop) @@ -94,4 +94,4 @@ def main(): finally: pass - logging.info("All threads have finished. Main thread is exiting.") \ No newline at end of file + logging.info("all threads have finished. 
main thread exit.") \ No newline at end of file diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py index 73f0b22..ac35be2 100644 --- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py @@ -28,33 +28,53 @@ def log_invalid_keys(not_in_list, keys_name, config_list, default_list): def read_config_common(config): - """read config file, get [common] section value""" - try: - common_sec = config['common'] - except configparser.NoSectionError: + """read config file, get [common] section value""" + if not config.has_section("common"): report_alarm_fail("Cannot find common section in config file") try: - period_time = int(common_sec.get("period_time", 1)) - if not (1 <= period_time <= 300): - raise ValueError("Invalid period_time") - except ValueError: - period_time = 1 - logging.warning("Invalid period_time, set to 1s") + disk_name = config.get("common", "disk") + disk = [] if disk_name == "default" else disk_name.split(",") + except configparser.NoOptionError: + disk = [] + logging.warning("Unset disk, set to default") - disk = common_sec.get('disk').split(",") if common_sec.get('disk') not in [None, 'default'] else [] - stage = common_sec.get('stage').split(",") if common_sec.get('stage') not in [None, 'default'] else [] + try: + stage_name = config.get("common", "stage") + stage = [] if stage_name == "default" else stage_name.split(",") + except configparser.NoOptionError: + stage = [] + logging.warning("Unset stage, set to read,write") if len(disk) > 10: logging.warning("Too many disks, record only max 10 disks") disk = disk[:10] - iotype = common_sec.get('iotype', 'read,write').split(",") - iotype_list = [rw.lower() for rw in iotype if rw.lower() in ['read', 'write', 'flush', 'discard']] - err_iotype = [rw for rw in iotype if rw.lower() not in ['read', 'write', 'flush', 'discard']] + try: + iotype_name = config.get("common", "iotype").split(",") + iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write', 'flush', 'discard']] + err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write', 'flush', 'discard']] + + if iotype_list in [None, []]: + iotype_list = ["read", "write"] + except configparser.NoOptionError: + iotype = ["read", "write"] + logging.warning("Unset iotype, set to default") if err_iotype: logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list)) + + + try: + period_time = int(config.get("common", "period_time")) + if not (1 <= period_time <= 300): + raise ValueError("Invalid period_time") + except ValueError: + period_time = 1 + logging.warning("Invalid period_time, set to 1s") + except configparser.NoOptionError: + period_time = 1 + logging.warning("Unset period_time, use 1s as default") return period_time, disk, stage, iotype_list @@ -68,11 +88,23 @@ def read_config_algorithm(config): win_size = int(config.get("algorithm", "win_size")) if not (1 <= win_size <= 300): raise ValueError("Invalid win_size") + except ValueError: + win_size = 30 + logging.warning("Invalid win_size, set to 30") + except configparser.NoOptionError: + win_size = 30 + logging.warning("Unset win_size, use 30 as default") + + try: win_threshold = int(config.get("algorithm", "win_threshold")) if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size: raise ValueError("Invalid win_threshold") except ValueError: - report_alarm_fail("Invalid win_threshold or win_size") + 
win_threshold = 6 + logging.warning("Invalid win_threshold, set to 6") + except configparser.NoOptionError: + win_threshold = 6 + logging.warning("Unset win_threshold, use 6 as default") return win_size, win_threshold @@ -80,6 +112,21 @@ def read_config_algorithm(config): def read_config_lat_iodump(io_dic, config): """read config file, get [latency] [iodump] section value""" common_param = {} + lat_sec = None + if not config.has_section("latency"): + logging.warning("Cannot find algorithm section in config file") + else: + lat_sec = config["latency"] + + iodump_sec = None + if not config.has_section("iodump"): + logging.warning("Cannot find iodump section in config file") + else: + lat_sec = config["iodump"] + + if not lat_sec and not iodump_sec: + return common_param + for io_type in io_dic["iotype_list"]: common_param[io_type] = {} @@ -90,13 +137,16 @@ def read_config_lat_iodump(io_dic, config): } iodump_key = "{}_iodump_lim".format(io_type) + if iodump_sec and iodump_key in iodump_sec and iodump_sec[iodump_key].isdecimal(): + common_param[io_type][iodump_key] = int(iodump_sec[iodump_key]) + + if not lat_sec: + continue + for key_suffix, key_template in latency_keys.items(): - if key_template in config["latency"] and config["latency"][key_template].isdecimal(): - common_param[io_type][key_template] = int(config["latency"][key_template]) + if key_template in lat_sec and lat_sec[key_template].isdecimal(): + common_param[io_type][key_template] = int(lat_sec[key_template]) - if iodump_key in config["iodump"] and config["iodump"][iodump_key].isdecimal(): - common_param[io_type][iodump_key] = int(config["iodump"][iodump_key]) - return common_param -- Gitee From f49dcfb148930e5b01796d5a74994ca948c6ca75 Mon Sep 17 00:00:00 2001 From: gaoruoshu Date: Fri, 27 Sep 2024 14:10:18 +0800 Subject: [PATCH 18/76] add log level and change log format --- config/collector.conf | 5 ++- config/inspect.conf | 5 ++- config/plugins/avg_block_io.ini | 5 ++- config/xalarm.conf | 3 ++ src/python/sentryCollector/collect_config.py | 29 ++++++++++++++++ src/python/sentryCollector/collect_io.py | 15 ++------- src/python/sentryCollector/collect_plugin.py | 32 +++++++++--------- src/python/sentryCollector/collectd.py | 6 ++-- .../avg_block_io/avg_block_io.py | 7 ++-- .../sentryPlugins/avg_block_io/utils.py | 32 ++++++++++++++++++ src/python/syssentry/sentry_config.py | 28 ++++++++++++++++ src/python/syssentry/syssentry.py | 7 ++-- src/python/xalarm/xalarm_config.py | 33 +++++++++++++++++-- src/python/xalarm/xalarm_daemon.py | 7 ++-- 14 files changed, 172 insertions(+), 42 deletions(-) diff --git a/config/collector.conf b/config/collector.conf index 52e91b1..b2567d8 100644 --- a/config/collector.conf +++ b/config/collector.conf @@ -4,4 +4,7 @@ modules=io [io] period_time=1 max_save=10 -disk=default \ No newline at end of file +disk=default + +[log] +level=info \ No newline at end of file diff --git a/config/inspect.conf b/config/inspect.conf index 071cca1..f451d9e 100644 --- a/config/inspect.conf +++ b/config/inspect.conf @@ -1,2 +1,5 @@ [inspect] -Interval=3 \ No newline at end of file +Interval=3 + +[log] +level=info diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini index bc33dde..858db18 100644 --- a/config/plugins/avg_block_io.ini +++ b/config/plugins/avg_block_io.ini @@ -1,8 +1,11 @@ +[log] +level=info + [common] disk=default stage=default iotype=read,write -period_time=1 +period_time=1 [algorithm] win_size=30 diff --git a/config/xalarm.conf b/config/xalarm.conf index 14c6d39..323d2dd 
100644 --- a/config/xalarm.conf +++ b/config/xalarm.conf @@ -1,2 +1,5 @@ [filter] id_mask = 1001-1128 + +[log] +level=info diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py index 0fdd9f0..5aa38ec 100644 --- a/src/python/sentryCollector/collect_config.py +++ b/src/python/sentryCollector/collect_config.py @@ -32,6 +32,35 @@ CONF_IO_PERIOD_TIME_DEFAULT = 1 CONF_IO_MAX_SAVE_DEFAULT = 10 CONF_IO_DISK_DEFAULT = "default" +# log +CONF_LOG = 'log' +CONF_LOG_LEVEL = 'level' +LogLevel = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL +} + + +def get_log_level(filename=COLLECT_CONF_PATH): + if not os.path.exists(filename): + return logging.INFO + + try: + config = configparser.ConfigParser() + config.read(filename) + if not config.has_option(CONF_LOG, CONF_LOG_LEVEL): + return logging.INFO + log_level = config.get(CONF_LOG, CONF_LOG_LEVEL) + if log_level.lower() in LogLevel: + return LogLevel.get(log_level.lower()) + return logging.INFO + except configparser.Error: + return logging.INFO + + class CollectConfig: def __init__(self, filename=COLLECT_CONF_PATH): diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py index 9c8dae7..019d174 100644 --- a/src/python/sentryCollector/collect_io.py +++ b/src/python/sentryCollector/collect_io.py @@ -163,18 +163,6 @@ class CollectIo(): logging.error("An error occurred2: %s", e) return column_names - def task_loop(self): - if self.stop_event.is_set(): - logging.info("collect io thread exit") - return - - for disk_name, stage_list in self.disk_map_stage.items(): - if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: - continue - self.append_period_lat(disk_name, stage_list) - - threading.Timer(self.period_time, self.task_loop).start() - def is_kernel_avaliable(self): base_path = '/sys/kernel/debug/block' all_disk = [] @@ -191,6 +179,9 @@ class CollectIo(): if file_name == 'stats': all_disk.append(disk_name) + if self.loop_all: + self.disk_list = all_disk + for disk_name in self.disk_list: if not self.loop_all and disk_name not in all_disk: logging.warning("the %s disk not exist!", disk_name) diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py index 1faa5e3..3e2cf4c 100644 --- a/src/python/sentryCollector/collect_plugin.py +++ b/src/python/sentryCollector/collect_plugin.py @@ -75,14 +75,14 @@ def client_send_and_recv(request_data, data_str_len, protocol): try: client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) except socket.error: - print("collect_plugin: client create socket error") + logging.error("collect_plugin: client create socket error") return None try: client_socket.connect(COLLECT_SOCKET_PATH) except OSError: client_socket.close() - print("collect_plugin: client connect error") + logging.error("collect_plugin: client connect error") return None req_data_len = len(request_data) @@ -94,23 +94,23 @@ def client_send_and_recv(request_data, data_str_len, protocol): res_data = res_data.decode() except (OSError, UnicodeError): client_socket.close() - print("collect_plugin: client communicate error") + logging.error("collect_plugin: client communicate error") return None res_magic = res_data[:CLT_MSG_MAGIC_LEN] if res_magic != "RES": - print("res msg format error") + logging.error("res msg format error") return None protocol_str = res_data[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN] try: protocol_id = 
int(protocol_str) except ValueError: - print("recv msg protocol id is invalid %s", protocol_str) + logging.error("recv msg protocol id is invalid %s", protocol_str) return None if protocol_id >= ClientProtocol.PRO_END: - print("protocol id is invalid") + logging.error("protocol id is invalid") return None try: @@ -119,7 +119,7 @@ def client_send_and_recv(request_data, data_str_len, protocol): res_msg_data = res_msg_data.decode() return res_msg_data except (OSError, ValueError, UnicodeError): - print("collect_plugin: client recv res msg error") + logging.error("collect_plugin: client recv res msg error") finally: client_socket.close() @@ -128,30 +128,30 @@ def client_send_and_recv(request_data, data_str_len, protocol): def validate_parameters(param, len_limit, char_limit): ret = ResultMessage.RESULT_SUCCEED if not param: - print("param is invalid") + logging.error("param is invalid, param = %s", param) ret = ResultMessage.RESULT_NOT_PARAM return [False, ret] if not isinstance(param, list): - print(f"{param} is not list type.") + logging.error("%s is not list type.", param) ret = ResultMessage.RESULT_NOT_PARAM return [False, ret] if len(param) <= 0: - print(f"{param} length is 0.") + logging.error("%s length is 0.", param) ret = ResultMessage.RESULT_INVALID_LENGTH return [False, ret] pattern = r'^[a-zA-Z0-9_-]+$' for info in param: if not re.match(pattern, info): - print(f"{info} is invalid char") + logging.error("%s is invalid char", info) ret = ResultMessage.RESULT_INVALID_CHAR return [False, ret] # length of len_limit is exceeded, keep len_limit if len(param) > len_limit: - print(f"{param} length more than {len_limit}, keep the first {len_limit}") + logging.error("%s length more than %d, keep the first %d", param, len_limit, len_limit) param[:] = param[0:len_limit] # only keep elements under the char_limit length @@ -202,13 +202,13 @@ def inter_is_iocollect_valid(period, disk_list=None, stage=None): request_message = json.dumps(req_msg_struct) result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.IS_IOCOLLECT_VALID) if not result_message: - print("collect_plugin: client_send_and_recv failed") + logging.error("collect_plugin: client_send_and_recv failed") return result try: json.loads(result_message) except json.JSONDecodeError: - print("is_iocollect_valid: json decode error") + logging.error("is_iocollect_valid: json decode error") result['ret'] = ResultMessage.RESULT_PARSE_FAILED return result @@ -260,12 +260,12 @@ def inter_get_io_data(period, disk_list, stage, iotype): request_message = json.dumps(req_msg_struct) result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.GET_IO_DATA) if not result_message: - print("collect_plugin: client_send_and_recv failed") + logging.error("collect_plugin: client_send_and_recv failed") return result try: json.loads(result_message) except json.JSONDecodeError: - print("get_io_data: json decode error") + logging.error("get_io_data: json decode error") result['ret'] = ResultMessage.RESULT_PARSE_FAILED return result diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py index d9d8862..33f4b04 100644 --- a/src/python/sentryCollector/collectd.py +++ b/src/python/sentryCollector/collectd.py @@ -26,7 +26,7 @@ import threading from .collect_io import CollectIo from .collect_server import CollectServer -from .collect_config import CollectConfig +from .collect_config import CollectConfig, get_log_level SENTRY_RUN_DIR = "/var/run/sysSentry" COLLECT_SOCKET_PATH = 
"/var/run/sysSentry/collector.sock" @@ -57,7 +57,9 @@ def main(): os.mkdir(SENTRY_RUN_DIR) os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM) - logging.basicConfig(filename=COLLECT_LOG_FILE, level=logging.INFO) + log_level = get_log_level() + log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" + logging.basicConfig(filename=COLLECT_LOG_FILE, level=log_level, format=log_format) os.chmod(COLLECT_LOG_FILE, 0o600) try: diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py index ac35be2..b6b3b28 100644 --- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py @@ -15,7 +15,7 @@ import time from .stage_window import IoWindow, IoDumpWindow from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler -from .utils import update_avg_and_check_abnormal +from .utils import update_avg_and_check_abnormal, get_log_level CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" @@ -283,7 +283,10 @@ def main(): signal.signal(signal.SIGINT, sig_handler) signal.signal(signal.SIGTERM, sig_handler) - logging.basicConfig(level=logging.INFO) + log_level = get_log_level(CONFIG_FILE) + log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" + + logging.basicConfig(level=log_level, format=log_format) # 初始化配置读取 config = configparser.ConfigParser(comment_prefixes=('#', ';')) diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py index 54ed080..2de9a46 100644 --- a/src/python/sentryPlugins/avg_block_io/utils.py +++ b/src/python/sentryPlugins/avg_block_io/utils.py @@ -8,9 +8,41 @@ # IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR # PURPOSE. # See the Mulan PSL v2 for more details. 
+import configparser +import logging +import os + AVG_VALUE = 0 AVG_COUNT = 1 +CONF_LOG = 'log' +CONF_LOG_LEVEL = 'level' +LogLevel = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL +} + + +def get_log_level(filename): + if not os.path.exists(filename): + return logging.INFO + + try: + config = configparser.ConfigParser() + config.read(filename) + if not config.has_option(CONF_LOG, CONF_LOG_LEVEL): + return logging.INFO + log_level = config.get(CONF_LOG, CONF_LOG_LEVEL) + + if log_level.lower() in LogLevel: + return LogLevel.get(log_level.lower()) + return logging.INFO + except configparser.Error: + return logging.INFO + def get_nested_value(data, keys): """get data from nested dict""" diff --git a/src/python/syssentry/sentry_config.py b/src/python/syssentry/sentry_config.py index a0e7b79..1169887 100644 --- a/src/python/syssentry/sentry_config.py +++ b/src/python/syssentry/sentry_config.py @@ -21,6 +21,34 @@ import sys DEFAULT_INSPECT_DELAY = 3 INSPECT_CONF_PATH = "/etc/sysSentry/inspect.conf" +CONF_LOG = 'log' +CONF_LOG_LEVEL = 'level' +LogLevel = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL +} + + +def get_log_level(filename=INSPECT_CONF_PATH): + if not os.path.exists(filename): + return logging.INFO + + try: + config = configparser.ConfigParser() + config.read(filename) + if not config.has_option(CONF_LOG, CONF_LOG_LEVEL): + return logging.INFO + log_level = config.get(CONF_LOG, CONF_LOG_LEVEL) + + if log_level.lower() in LogLevel: + return LogLevel.get(log_level.lower()) + return logging.INFO + except configparser.Error: + return logging.INFO + class SentryConfig: """ diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py index 776971f..9ef0203 100644 --- a/src/python/syssentry/syssentry.py +++ b/src/python/syssentry/syssentry.py @@ -23,7 +23,7 @@ import fcntl import select -from .sentry_config import SentryConfig +from .sentry_config import SentryConfig, get_log_level from .task_map import TasksMap from .global_values import SENTRY_RUN_DIR, CTL_SOCKET_PATH, SENTRY_RUN_DIR_PERM @@ -563,7 +563,10 @@ def main(): os.mkdir(SENTRY_RUN_DIR) os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM) - logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=logging.INFO) + log_level = get_log_level() + log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" + + logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=log_level, format=log_format) os.chmod(SYSSENTRY_LOG_FILE, 0o600) if not chk_and_set_pidfile(): diff --git a/src/python/xalarm/xalarm_config.py b/src/python/xalarm/xalarm_config.py index 8e56d10..754a816 100644 --- a/src/python/xalarm/xalarm_config.py +++ b/src/python/xalarm/xalarm_config.py @@ -15,9 +15,10 @@ Create: 2023-11-02 """ import re +import os import dataclasses import logging -from configparser import ConfigParser +import configparser MAIN_CONFIG_PATH = '/etc/sysSentry/xalarm.conf' @@ -27,6 +28,34 @@ MIN_ID_NUMBER = 1001 MAX_ID_NUMBER = 1128 MAX_ID_MASK_CAPACITY = 128 +# log +CONF_LOG = 'log' +CONF_LOG_LEVEL = 'level' +LogLevel = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL +} + + +def get_log_level(filename=MAIN_CONFIG_PATH): + if not os.path.exists(filename): + return logging.INFO + + try: + config = configparser.ConfigParser() + 
config.read(filename) + if not config.has_option(CONF_LOG, CONF_LOG_LEVEL): + return logging.INFO + log_level = config.get(CONF_LOG, CONF_LOG_LEVEL) + if log_level.lower() in LogLevel: + return LogLevel.get(log_level.lower()) + return logging.INFO + except configparser.Error: + return logging.INFO + @dataclasses.dataclass class AlarmConfig: @@ -106,7 +135,7 @@ def config_init(): """ alarm_config = AlarmConfig() - cfg = ConfigParser() + cfg = configparser.ConfigParser() cfg.read(MAIN_CONFIG_PATH) id_mask = parse_id_mask(cfg) diff --git a/src/python/xalarm/xalarm_daemon.py b/src/python/xalarm/xalarm_daemon.py index 00e8886..3ab211c 100644 --- a/src/python/xalarm/xalarm_daemon.py +++ b/src/python/xalarm/xalarm_daemon.py @@ -21,7 +21,7 @@ import signal import fcntl import socket -from .xalarm_config import config_init +from .xalarm_config import config_init, get_log_level from .xalarm_server import server_loop, SOCK_FILE ALARM_DIR = "/var/run/xalarm" @@ -120,9 +120,10 @@ def alarm_process_create(): os.mkdir(ALARM_DIR) os.chmod(ALARM_DIR, ALARM_DIR_PERMISSION) + log_level = get_log_level() + log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" - logging.basicConfig(filename=ALARM_LOGFILE, level=logging.INFO, - format='%(asctime)s|%(levelname)s| %(message)s') + logging.basicConfig(filename=ALARM_LOGFILE, level=log_level, format=log_format) signal.signal(signal.SIGTERM, signal_handler) -- Gitee From 2f4497678fc6e05afcb141db81bd0d675715e95b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> Date: Mon, 30 Sep 2024 00:15:29 +0800 Subject: [PATCH 19/76] fix ai_block_io some issues --- ..._slow_io_detection.ini => ai_block_io.ini} | 6 +- config/tasks/ai_block_io.mod | 5 + .../tasks/ai_threshold_slow_io_detection.mod | 5 - ...ow_io_detection.py => test_ai_block_io.py} | 0 .../README.md | 0 .../__init__.py | 0 .../ai_block_io.py} | 57 ++-- .../alarm_report.py | 2 +- .../ai_block_io/config_parser.py | 256 ++++++++++++++++++ .../data_access.py | 3 + .../detector.py | 17 +- .../io_data.py | 0 .../sliding_window.py | 0 .../threshold.py | 13 +- .../utils.py | 15 +- .../config_parser.py | 141 ---------- src/python/setup.py | 2 +- 17 files changed, 336 insertions(+), 186 deletions(-) rename config/plugins/{ai_threshold_slow_io_detection.ini => ai_block_io.ini} (75%) create mode 100644 config/tasks/ai_block_io.mod delete mode 100644 config/tasks/ai_threshold_slow_io_detection.mod rename selftest/test/{test_ai_threshold_slow_io_detection.py => test_ai_block_io.py} (100%) rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/README.md (100%) rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/__init__.py (100%) rename src/python/sentryPlugins/{ai_threshold_slow_io_detection/slow_io_detection.py => ai_block_io/ai_block_io.py} (66%) rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/alarm_report.py (98%) create mode 100644 src/python/sentryPlugins/ai_block_io/config_parser.py rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/data_access.py (99%) rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/detector.py (77%) rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/io_data.py (100%) rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/sliding_window.py (100%) rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/threshold.py 
(92%) rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/utils.py (86%) delete mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py diff --git a/config/plugins/ai_threshold_slow_io_detection.ini b/config/plugins/ai_block_io.ini similarity index 75% rename from config/plugins/ai_threshold_slow_io_detection.ini rename to config/plugins/ai_block_io.ini index 44eb928..01ce266 100644 --- a/config/plugins/ai_threshold_slow_io_detection.ini +++ b/config/plugins/ai_block_io.ini @@ -4,9 +4,9 @@ slow_io_detect_frequency=1 log_level=info [algorithm] -train_data_duration=0.1 -train_update_duration=0.02 -algorithm_type=n_sigma +train_data_duration=24 +train_update_duration=2 +algorithm_type=boxplot boxplot_parameter=1.5 n_sigma_parameter=3 diff --git a/config/tasks/ai_block_io.mod b/config/tasks/ai_block_io.mod new file mode 100644 index 0000000..1971d7d --- /dev/null +++ b/config/tasks/ai_block_io.mod @@ -0,0 +1,5 @@ +[common] +enabled=yes +task_start=/usr/bin/python3 /usr/bin/ai_block_io +task_stop=pkill -f /usr/bin/ai_block_io +type=oneshot \ No newline at end of file diff --git a/config/tasks/ai_threshold_slow_io_detection.mod b/config/tasks/ai_threshold_slow_io_detection.mod deleted file mode 100644 index 2729f72..0000000 --- a/config/tasks/ai_threshold_slow_io_detection.mod +++ /dev/null @@ -1,5 +0,0 @@ -[common] -enabled=yes -task_start=/usr/bin/python3 /usr/bin/ai_threshold_slow_io_detection -task_stop=pkill -f /usr/bin/ai_threshold_slow_io_detection -type=oneshot \ No newline at end of file diff --git a/selftest/test/test_ai_threshold_slow_io_detection.py b/selftest/test/test_ai_block_io.py similarity index 100% rename from selftest/test/test_ai_threshold_slow_io_detection.py rename to selftest/test/test_ai_block_io.py diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md b/src/python/sentryPlugins/ai_block_io/README.md similarity index 100% rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md rename to src/python/sentryPlugins/ai_block_io/README.md diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py b/src/python/sentryPlugins/ai_block_io/__init__.py similarity index 100% rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py rename to src/python/sentryPlugins/ai_block_io/__init__.py diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py similarity index 66% rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py rename to src/python/sentryPlugins/ai_block_io/ai_block_io.py index 43cf770..31b8a97 100644 --- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py +++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py @@ -23,7 +23,7 @@ from .data_access import get_io_data_from_collect_plug, check_collect_valid from .io_data import MetricName from .alarm_report import AlarmReport -CONFIG_FILE = "/etc/sysSentry/plugins/ai_threshold_slow_io_detection.ini" +CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini" def sig_handler(signum, frame): @@ -40,34 +40,48 @@ class SlowIODetection: def __init__(self, config_parser: ConfigParser): self._config_parser = config_parser - self.__set_log_format() self.__init_detector_name_list() self.__init_detector() - def __set_log_format(self): - log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" - log_level = 
get_log_level(self._config_parser.get_log_level()) - logging.basicConfig(level=log_level, format=log_format) - def __init_detector_name_list(self): self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency()) - for disk in self._disk_list: - self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) - self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) + disks_to_detection: list = self._config_parser.get_disks_to_detection() + # 情况1:None,则启用所有磁盘检测 + # 情况2:is not None and len = 0,则不启动任何磁盘检测 + # 情况3:len != 0,则取交集 + if disks_to_detection is None: + for disk in self._disk_list: + self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) + self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) + elif len(disks_to_detection) == 0: + logging.warning('please attention: conf file not specify any disk to detection, ' + 'so it will not start ai block io.') + else: + disks_name_to_detection = [] + for disk_name_to_detection in disks_to_detection: + disks_name_to_detection.append(disk_name_to_detection.get_disk_name()) + disk_intersection = [disk for disk in self._disk_list if disk in disks_name_to_detection] + for disk in disk_intersection: + self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) + self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) + logging.info(f'start to detection follow disk and it\'s metric: {self._detector_name_list}') def __init_detector(self): train_data_duration, train_update_duration = (self._config_parser. get_train_data_duration_and_train_update_duration()) slow_io_detection_frequency = self._config_parser.get_slow_io_detect_frequency() - threshold_type = get_threshold_type_enum(self._config_parser.get_algorithm_type()) + threshold_type = self._config_parser.get_algorithm_type() data_queue_size, update_size = get_data_queue_size_and_update_size(train_data_duration, train_update_duration, slow_io_detection_frequency) - sliding_window_type = get_sliding_window_type_enum(self._config_parser.get_sliding_window_type()) + sliding_window_type = self._config_parser.get_sliding_window_type() window_size, window_threshold = self._config_parser.get_window_size_and_window_minimum_threshold() for detector_name in self._detector_name_list: - threshold = ThresholdFactory().get_threshold(threshold_type, data_queue_size=data_queue_size, + threshold = ThresholdFactory().get_threshold(threshold_type, + boxplot_parameter=self._config_parser.get_boxplot_parameter(), + n_sigma_paramter=self._config_parser.get_n_sigma_parameter(), + data_queue_size=data_queue_size, data_queue_update_size=update_size) sliding_window = SlidingWindowFactory().get_sliding_window(sliding_window_type, queue_length=window_size, threshold=window_threshold) @@ -89,6 +103,7 @@ class SlowIODetection: logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}') if io_data_dict_with_disk_name is None: continue + # Step2:慢IO检测 logging.debug('step2. Start to detection slow io event.') slow_io_event_list = [] @@ -103,13 +118,14 @@ class SlowIODetection: for slow_io_event in slow_io_event_list: metric_name: MetricName = slow_io_event[0] result = slow_io_event[1] - AlarmReport.report_major_alm(f"disk {metric_name.get_disk_name()} has slow io event." 
- f"stage: {metric_name.get_metric_name()}," - f"type: {metric_name.get_io_access_type_name()}," - f"metric: {metric_name.get_metric_name()}," - f"current window: {result[1]}," - f"threshold: {result[2]}") - logging.error(f"slow io event happen: {str(slow_io_event)}") + alarm_content = (f"disk {metric_name.get_disk_name()} has slow io event. " + f"stage is: {metric_name.get_stage_name()}, " + f"io access type is: {metric_name.get_io_access_type_name()}, " + f"metric is: {metric_name.get_metric_name()}, " + f"current window is: {result[1]}, " + f"threshold is: {result[2]}") + AlarmReport.report_major_alm(alarm_content) + logging.warning(alarm_content) # Step4:等待检测时间 logging.debug('step4. Wait to start next slow io event detection loop.') @@ -120,6 +136,7 @@ def main(): # Step1:注册消息处理函数 signal.signal(signal.SIGINT, sig_handler) signal.signal(signal.SIGTERM, sig_handler) + # Step2:断点恢复 # todo: diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py similarity index 98% rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py rename to src/python/sentryPlugins/ai_block_io/alarm_report.py index 3f4f34e..230c8cd 100644 --- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py +++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py @@ -15,7 +15,7 @@ import json class AlarmReport: - TASK_NAME = "SLOW_IO_DETECTION" + TASK_NAME = "ai_block_io" @staticmethod def report_pass(info: str): diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py new file mode 100644 index 0000000..632391d --- /dev/null +++ b/src/python/sentryPlugins/ai_block_io/config_parser.py @@ -0,0 +1,256 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +import configparser +import json +import logging + +from .io_data import MetricName +from .threshold import ThresholdType +from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level + +LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" + + +def init_log_format(log_level: str): + logging.basicConfig(level=get_log_level(log_level), format=LOG_FORMAT) + + +class ConfigParser: + DEFAULT_ABSOLUTE_THRESHOLD = 40 + DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1 + DEFAULT_LOG_LEVEL = 'info' + + DEFAULT_ALGORITHM_TYPE = 'boxplot' + DEFAULT_TRAIN_DATA_DURATION = 24 + DEFAULT_TRAIN_UPDATE_DURATION = 2 + DEFAULT_BOXPLOT_PARAMETER = 1.5 + DEFAULT_N_SIGMA_PARAMETER = 3 + + DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous' + DEFAULT_WINDOW_SIZE = 30 + DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6 + + def __init__(self, config_file_name): + self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD + self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY + self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL + self.__disks_to_detection: list = [] + + self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE + self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION + self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION + self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER + self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER + + self.__sliding_window_type = ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE + self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE + self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD + + self.__config_file_name = config_file_name + + def __read_absolute_threshold(self, items_common: dict): + try: + self.__absolute_threshold = float(items_common.get('absolute_threshold', + ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD)) + if self.__absolute_threshold <= 0: + logging.warning( + f'the_absolute_threshold: {self.__absolute_threshold} you set is invalid, use default value: {ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD}.') + self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD + except ValueError: + self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD + logging.warning( + f'the_absolute_threshold type conversion has error, use default value: {self.__absolute_threshold}.') + + def __read__slow_io_detect_frequency(self, items_common: dict): + try: + self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency', + ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY)) + if self.__slow_io_detect_frequency < 1 or self.__slow_io_detect_frequency > 10: + logging.warning( + f'the slow_io_detect_frequency: {self.__slow_io_detect_frequency} you set is invalid, use default value: {ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY}.') + self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY + except ValueError: + self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY + logging.warning(f'slow_io_detect_frequency type conversion has error, use default value: {self.__slow_io_detect_frequency}.') + + def __read__disks_to_detect(self, items_common: dict): + disks_to_detection = items_common.get('disks_to_detect') + if disks_to_detection is None: + logging.warning(f'config of disks_to_detect not found, the default value be used.') + self.__disks_to_detection = None + return + try: + disks_to_detection_list = json.loads(disks_to_detection) + for 
disk_to_detection in disks_to_detection_list: + disk_name = disk_to_detection.get('disk_name', None) + stage_name = disk_to_detection.get('stage_name', None) + io_access_type_name = disk_to_detection.get('io_access_type_name', None) + metric_name = disk_to_detection.get('metric_name', None) + if not (disk_name is None or stage_name is None or io_access_type_name is None or metric_name is None): + metric_name_object = MetricName(disk_name, stage_name, io_access_type_name, metric_name) + self.__disks_to_detection.append(metric_name_object) + else: + logging.warning(f'config of disks_to_detect\'s some part has some error: {disk_to_detection}, it will be ignored.') + except json.decoder.JSONDecodeError as e: + logging.warning(f'config of disks_to_detect is error: {e}, it will be ignored and default value be used.') + self.__disks_to_detection = None + + def __read__train_data_duration(self, items_algorithm: dict): + try: + self.__train_data_duration = float(items_algorithm.get('train_data_duration', + ConfigParser.DEFAULT_TRAIN_DATA_DURATION)) + if self.__train_data_duration <= 0 or self.__train_data_duration > 720: + logging.warning( + f'the train_data_duration: {self.__train_data_duration} you set is invalid, use default value: {ConfigParser.DEFAULT_TRAIN_DATA_DURATION}.') + self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION + except ValueError: + self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION + logging.warning(f'the train_data_duration type conversion has error, use default value: {self.__train_data_duration}.') + + def __read__train_update_duration(self, items_algorithm: dict): + default_train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION + if default_train_update_duration > self.__train_data_duration: + default_train_update_duration = self.__train_data_duration / 2 + + try: + self.__train_update_duration = float(items_algorithm.get('train_update_duration', + ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION)) + if self.__train_update_duration <= 0 or self.__train_update_duration > self.__train_data_duration: + logging.warning( + f'the train_update_duration: {self.__train_update_duration} you set is invalid, use default value: {default_train_update_duration}.') + self.__train_update_duration = default_train_update_duration + except ValueError: + self.__train_update_duration = default_train_update_duration + logging.warning(f'the train_update_duration type conversion has error, use default value: {self.__train_update_duration}.') + + def __read__algorithm_type_and_parameter(self, items_algorithm: dict): + algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE) + self.__algorithm_type = get_threshold_type_enum(algorithm_type) + + if self.__algorithm_type == ThresholdType.NSigmaThreshold: + try: + self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter', + ConfigParser.DEFAULT_N_SIGMA_PARAMETER)) + if self.__n_sigma_parameter <= 0 or self.__n_sigma_parameter > 10: + logging.warning( + f'the n_sigma_parameter: {self.__n_sigma_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_N_SIGMA_PARAMETER}.') + self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER + except ValueError: + self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER + logging.warning(f'the n_sigma_parameter type conversion has error, use default value: {self.__n_sigma_parameter}.') + elif self.__algorithm_type == ThresholdType.BoxplotThreshold: + try: + self.__boxplot_parameter = 
float(items_algorithm.get('boxplot_parameter', + ConfigParser.DEFAULT_BOXPLOT_PARAMETER)) + if self.__boxplot_parameter <= 0 or self.__boxplot_parameter > 10: + logging.warning( + f'the boxplot_parameter: {self.__boxplot_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_BOXPLOT_PARAMETER}.') + self.__n_sigma_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER + except ValueError: + self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER + logging.warning(f'the boxplot_parameter type conversion has error, use default value: {self.__boxplot_parameter}.') + + def __read__window_size(self, items_sliding_window: dict): + try: + self.__window_size = int(items_sliding_window.get('window_size', + ConfigParser.DEFAULT_WINDOW_SIZE)) + if self.__window_size < 1 or self.__window_size > 3600: + logging.warning( + f'the window_size: {self.__window_size} you set is invalid, use default value: {ConfigParser.DEFAULT_WINDOW_SIZE}.') + self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE + except ValueError: + self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE + logging.warning(f'window_size type conversion has error, use default value: {self.__window_size}.') + + def __read__window_minimum_threshold(self, items_sliding_window: dict): + default_window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD + if default_window_minimum_threshold > self.__window_size: + default_window_minimum_threshold = self.__window_size / 2 + try: + self.__window_minimum_threshold = ( + int(items_sliding_window.get('window_minimum_threshold', + ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD))) + if self.__window_minimum_threshold < 1 or self.__window_minimum_threshold > self.__window_size: + logging.warning( + f'the window_minimum_threshold: {self.__window_minimum_threshold} you set is invalid, use default value: {default_window_minimum_threshold}.') + self.__window_minimum_threshold = default_window_minimum_threshold + except ValueError: + self.__window_minimum_threshold = default_window_minimum_threshold + logging.warning(f'window_minimum_threshold type conversion has error, use default value: {self.__window_minimum_threshold}.') + + def read_config_from_file(self): + con = configparser.ConfigParser() + con.read(self.__config_file_name, encoding='utf-8') + + if con.has_section('common'): + items_common = dict(con.items('common')) + self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL) + init_log_format(self.__log_level) + self.__read_absolute_threshold(items_common) + self.__read__slow_io_detect_frequency(items_common) + self.__read__disks_to_detect(items_common) + else: + init_log_format(self.__log_level) + logging.warning("common section parameter not found, it will be set to default value.") + + if con.has_section('algorithm'): + items_algorithm = dict(con.items('algorithm')) + self.__read__train_data_duration(items_algorithm) + self.__read__train_update_duration(items_algorithm) + self.__read__algorithm_type_and_parameter(items_algorithm) + else: + logging.warning("algorithm section parameter not found, it will be set to default value.") + + if con.has_section('sliding_window'): + items_sliding_window = dict(con.items('sliding_window')) + sliding_window_type = items_sliding_window.get('sliding_window_type', + ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE) + self.__sliding_window_type = get_sliding_window_type_enum(sliding_window_type) + self.__read__window_size(items_sliding_window) + self.__read__window_minimum_threshold(items_sliding_window) + else: + 
logging.warning("sliding_window section parameter not found, it will be set to default value.") + + self.__print_all_config_value() + + def __print_all_config_value(self): + pass + + def get_slow_io_detect_frequency(self): + return self.__slow_io_detect_frequency + + def get_algorithm_type(self): + return self.__algorithm_type + + def get_sliding_window_type(self): + return self.__sliding_window_type + + def get_train_data_duration_and_train_update_duration(self): + return self.__train_data_duration, self.__train_update_duration + + def get_window_size_and_window_minimum_threshold(self): + return self.__window_size, self.__window_minimum_threshold + + def get_absolute_threshold(self): + return self.__absolute_threshold + + def get_log_level(self): + return self.__log_level + + def get_disks_to_detection(self): + return self.__disks_to_detection + + def get_boxplot_parameter(self): + return self.__boxplot_parameter + + def get_n_sigma_parameter(self): + return self.__n_sigma_parameter diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py similarity index 99% rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py rename to src/python/sentryPlugins/ai_block_io/data_access.py index d9f3460..01c5315 100644 --- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py +++ b/src/python/sentryPlugins/ai_block_io/data_access.py @@ -17,6 +17,8 @@ from sentryCollector.collect_plugin import ( get_io_data, is_iocollect_valid, ) + + from .io_data import IOStageData, IOData COLLECT_STAGES = [ @@ -32,6 +34,7 @@ COLLECT_STAGES = [ "iocost", ] + def check_collect_valid(period): data_raw = is_iocollect_valid(period) if data_raw["ret"] == 0: diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py similarity index 77% rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py rename to src/python/sentryPlugins/ai_block_io/detector.py index eda9825..bcf62cb 100644 --- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py +++ b/src/python/sentryPlugins/ai_block_io/detector.py @@ -26,19 +26,26 @@ class Detector: self._threshold = threshold self._slidingWindow = sliding_window self._threshold.attach_observer(self._slidingWindow) + self._count = 0 def get_metric_name(self): return self._metric_name def is_slow_io_event(self, io_data_dict_with_disk_name: dict): - logging.debug(f'Enter Detector: {self}') + self._count += 1 + if self._count % 15 == 0: + self._count = 0 + logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") + logging.debug(f'enter Detector: {self}') metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) - if metric_value > 1e-6: - logging.debug(f'Input metric value: {str(metric_value)}') - self._threshold.push_latest_data_to_queue(metric_value) + if metric_value is None: + logging.debug('not found metric value, so return None.') + return False, None, None + logging.debug(f'input metric value: {str(metric_value)}') + self._threshold.push_latest_data_to_queue(metric_value) detection_result = self._slidingWindow.is_slow_io_event(metric_value) logging.debug(f'Detection result: {str(detection_result)}') - logging.debug(f'Exit Detector: {self}') + logging.debug(f'exit Detector: {self}') return detection_result def __repr__(self): diff --git 
a/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py b/src/python/sentryPlugins/ai_block_io/io_data.py similarity index 100% rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py rename to src/python/sentryPlugins/ai_block_io/io_data.py diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py similarity index 100% rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py rename to src/python/sentryPlugins/ai_block_io/sliding_window.py diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py similarity index 92% rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py rename to src/python/sentryPlugins/ai_block_io/threshold.py index 9e1ca7b..ff85d85 100644 --- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py +++ b/src/python/sentryPlugins/ai_block_io/threshold.py @@ -79,9 +79,9 @@ class AbsoluteThreshold(Threshold): class BoxplotThreshold(Threshold): - def __init__(self, parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000): + def __init__(self, boxplot_parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs): super().__init__(data_queue_size, data_queue_update_size) - self.parameter = parameter + self.parameter = boxplot_parameter def _update_threshold(self): data = list(self.data_queue.queue) @@ -94,6 +94,8 @@ class BoxplotThreshold(Threshold): self.notify_observer() def push_latest_data_to_queue(self, data): + if data < 1e-6: + return try: self.data_queue.put(data, block=False) except queue.Full: @@ -111,9 +113,9 @@ class BoxplotThreshold(Threshold): class NSigmaThreshold(Threshold): - def __init__(self, parameter: float = 2.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000): + def __init__(self, n_sigma_parameter: float = 3.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs): super().__init__(data_queue_size, data_queue_update_size) - self.parameter = parameter + self.parameter = n_sigma_parameter def _update_threshold(self): data = list(self.data_queue.queue) @@ -125,6 +127,8 @@ class NSigmaThreshold(Threshold): self.notify_observer() def push_latest_data_to_queue(self, data): + if data < 1e-6: + return try: self.data_queue.put(data, block=False) except queue.Full: @@ -157,4 +161,3 @@ class ThresholdFactory: return NSigmaThreshold(*args, **kwargs) else: raise ValueError(f"Invalid threshold type: {threshold_type}") - diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py similarity index 86% rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py rename to src/python/sentryPlugins/ai_block_io/utils.py index f66e5ed..8dbba06 100644 --- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py +++ b/src/python/sentryPlugins/ai_block_io/utils.py @@ -8,13 +8,16 @@ # IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR # PURPOSE. # See the Mulan PSL v2 for more details. 
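+# A minimal usage sketch of the mapping helpers below, assuming the strings
+# come from the ai_block_io config file:
+#     get_threshold_type_enum('boxplot')      -> ThresholdType.BoxplotThreshold
+#     get_sliding_window_type_enum('median')  -> SlidingWindowType.MedianSlidingWindow
+#     get_log_level('warning')                -> logging.WARNING
+# Unrecognized strings fall back to the defaults (boxplot / not_continuous / INFO).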
+ import logging from dataclasses import asdict + from .threshold import ThresholdType from .sliding_window import SlidingWindowType from .io_data import MetricName, IOData + def get_threshold_type_enum(algorithm_type: str): if algorithm_type.lower() == 'absolute': return ThresholdType.AbsoluteThreshold @@ -22,7 +25,7 @@ def get_threshold_type_enum(algorithm_type: str): return ThresholdType.BoxplotThreshold if algorithm_type.lower() == 'n_sigma': return ThresholdType.NSigmaThreshold - logging.info('not found correct algorithm type, use default: boxplot.') + logging.warning(f"the algorithm type: {algorithm_type} you set is invalid, use default value: boxplot") return ThresholdType.BoxplotThreshold @@ -33,7 +36,7 @@ def get_sliding_window_type_enum(sliding_window_type: str): return SlidingWindowType.ContinuousSlidingWindow if sliding_window_type.lower() == 'median': return SlidingWindowType.MedianSlidingWindow - logging.info('not found correct sliding window type, use default: not_continuous.') + logging.warning(f"the sliding window type: {sliding_window_type} you set is invalid, use default value: not_continuous") return SlidingWindowType.NotContinuousSlidingWindow @@ -62,6 +65,8 @@ def get_log_level(log_level: str): return logging.INFO elif log_level.lower() == 'warning': return logging.WARNING - elif log_level.lower() == 'fatal': - return logging.FATAL - return None + elif log_level.lower() == 'error': + return logging.ERROR + elif log_level.lower() == 'critical': + return logging.CRITICAL + return logging.INFO diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py deleted file mode 100644 index cd4e6f1..0000000 --- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py +++ /dev/null @@ -1,141 +0,0 @@ -# coding: utf-8 -# Copyright (c) 2024 Huawei Technologies Co., Ltd. -# sysSentry is licensed under the Mulan PSL v2. -# You can use this software according to the terms and conditions of the Mulan PSL v2. -# You may obtain a copy of Mulan PSL v2 at: -# http://license.coscl.org.cn/MulanPSL2 -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -# PURPOSE. -# See the Mulan PSL v2 for more details. 
- -import configparser -import logging - - -class ConfigParser: - - DEFAULT_ABSOLUTE_THRESHOLD = 40 - DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1 - DEFAULT_LOG_LEVEL = 'info' - DEFAULT_TRAIN_DATA_DURATION = 24 - DEFAULT_TRAIN_UPDATE_DURATION = 2 - DEFAULT_ALGORITHM_TYPE = 'boxplot' - DEFAULT_N_SIGMA_PARAMETER = 3 - DEFAULT_BOXPLOT_PARAMETER = 1.5 - DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous' - DEFAULT_WINDOW_SIZE = 30 - DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6 - - def __init__(self, config_file_name): - self.__boxplot_parameter = None - self.__window_minimum_threshold = None - self.__window_size = None - self.__sliding_window_type = None - self.__n_sigma_parameter = None - self.__algorithm_type = None - self.__train_update_duration = None - self.__log_level = None - self.__slow_io_detect_frequency = None - self.__absolute_threshold = None - self.__train_data_duration = None - self.__config_file_name = config_file_name - - def read_config_from_file(self): - - con = configparser.ConfigParser() - con.read(self.__config_file_name, encoding='utf-8') - - items_common = dict(con.items('common')) - items_algorithm = dict(con.items('algorithm')) - items_sliding_window = dict(con.items('sliding_window')) - - try: - self.__absolute_threshold = int(items_common.get('absolute_threshold', - ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD)) - except ValueError: - self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD - logging.warning('absolute threshold type conversion has error, use default value.') - - try: - self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency', - ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY)) - except ValueError: - self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY - logging.warning('slow_io_detect_frequency type conversion has error, use default value.') - - self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL) - - try: - self.__train_data_duration = float(items_algorithm.get('train_data_duration', - ConfigParser.DEFAULT_TRAIN_DATA_DURATION)) - except ValueError: - self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION - logging.warning('train_data_duration type conversion has error, use default value.') - - try: - self.__train_update_duration = float(items_algorithm.get('train_update_duration', - ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION)) - except ValueError: - self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION - logging.warning('train_update_duration type conversion has error, use default value.') - - try: - self.__algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE) - except ValueError: - self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE - logging.warning('algorithmType type conversion has error, use default value.') - - if self.__algorithm_type == 'n_sigma': - try: - self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter', - ConfigParser.DEFAULT_N_SIGMA_PARAMETER)) - except ValueError: - self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER - logging.warning('n_sigma_parameter type conversion has error, use default value.') - elif self.__algorithm_type == 'boxplot': - try: - self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter', - ConfigParser.DEFAULT_BOXPLOT_PARAMETER)) - except ValueError: - self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER - logging.warning('boxplot_parameter type conversion has error, use default value.') - - 
self.__sliding_window_type = items_sliding_window.get('sliding_window_type', - ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE) - - try: - self.__window_size = int(items_sliding_window.get('window_size', - ConfigParser.DEFAULT_WINDOW_SIZE)) - except ValueError: - self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE - logging.warning('window_size type conversion has error, use default value.') - - try: - self.__window_minimum_threshold = ( - int(items_sliding_window.get('window_minimum_threshold', - ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD))) - except ValueError: - self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD - logging.warning('window_minimum_threshold type conversion has error, use default value.') - - def get_slow_io_detect_frequency(self): - return self.__slow_io_detect_frequency - - def get_algorithm_type(self): - return self.__algorithm_type - - def get_sliding_window_type(self): - return self.__sliding_window_type - - def get_train_data_duration_and_train_update_duration(self): - return self.__train_data_duration, self.__train_update_duration - - def get_window_size_and_window_minimum_threshold(self): - return self.__window_size, self.__window_minimum_threshold - - def get_absolute_threshold(self): - return self.__absolute_threshold - - def get_log_level(self): - return self.__log_level diff --git a/src/python/setup.py b/src/python/setup.py index dac6481..9e26a10 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -34,7 +34,7 @@ setup( 'xalarmd=xalarm.xalarm_daemon:alarm_process_create', 'sentryCollector=sentryCollector.collectd:main', 'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main', - 'ai_threshold_slow_io_detection=sentryPlugins.ai_threshold_slow_io_detection.slow_io_detection:main' + 'ai_block_io=sentryPlugins.ai_block_io.ai_block_io:main' ] }, ) -- Gitee From b96011196229c362d1c1d4078a1d2ed972b6e6a4 Mon Sep 17 00:00:00 2001 From: zhangnan Date: Fri, 27 Sep 2024 11:36:41 +0800 Subject: [PATCH 20/76] add ebpf collector --- src/c/ebpf_collector/Makefile | 101 ++ src/c/ebpf_collector/bpf_helpers.h | 535 +++++++ src/c/ebpf_collector/bpf_load.c | 709 +++++++++ src/c/ebpf_collector/ebpf_collector.bpf.c | 1408 +++++++++++++++++ src/c/ebpf_collector/ebpf_collector.c | 270 ++++ src/c/ebpf_collector/ebpf_collector.h | 77 + src/python/sentryCollector/collect_io.py | 241 ++- .../avg_block_io/avg_block_io.py | 4 +- 8 files changed, 3328 insertions(+), 17 deletions(-) create mode 100644 src/c/ebpf_collector/Makefile create mode 100644 src/c/ebpf_collector/bpf_helpers.h create mode 100644 src/c/ebpf_collector/bpf_load.c create mode 100644 src/c/ebpf_collector/ebpf_collector.bpf.c create mode 100644 src/c/ebpf_collector/ebpf_collector.c create mode 100644 src/c/ebpf_collector/ebpf_collector.h diff --git a/src/c/ebpf_collector/Makefile b/src/c/ebpf_collector/Makefile new file mode 100644 index 0000000..20a9b72 --- /dev/null +++ b/src/c/ebpf_collector/Makefile @@ -0,0 +1,101 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. 
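+# Build sketch (assumes a kernel-source-4.19 RPM plus clang/llc are installed;
+# KERNEL_VERSION below is derived from that RPM's name):
+#     make          # compiles output/ebpf_collector.bpf.o and links ebpf_collector
+#     make clean    # removes output/ and the ebpf_collector binary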
+# Description: ebpf collector program
+ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
+	| sed 's/arm.*/arm/' \
+	| sed 's/aarch64/arm64/' \
+	| sed 's/ppc64le/powerpc/' \
+	| sed 's/mips.*/mips/' \
+	| sed 's/riscv64/riscv/' \
+	| sed 's/loongarch64/loongarch/')
+
+KERNEL_VERSION ?= $(shell rpm -qa | grep "kernel-source-4.19" | cut -d' ' -f1 | sed 's/kernel-source-//')
+KERNEL_SRC := /usr/src/kernels/$(KERNEL_VERSION)
+KERNEL_PATH := /usr/src/linux-$(KERNEL_VERSION)
+GCC_ARCH ?= $(shell gcc -dumpmachine)
+GCC_VERSION ?= $(shell gcc -dumpversion)
+
+LINUX_INCLUDE := -I$(KERNEL_SRC)/include/
+LINUX_INCLUDE += -I$(KERNEL_SRC)/arch/$(ARCH)/include/
+LINUX_INCLUDE += -I$(KERNEL_SRC)/arch/$(ARCH)/include/generated
+LINUX_INCLUDE += -I$(KERNEL_SRC)/arch/$(ARCH)/include/uapi
+LINUX_INCLUDE += -I$(KERNEL_SRC)/arch/$(ARCH)/include/uapi/linux
+LINUX_INCLUDE += -I$(KERNEL_SRC)/arch/$(ARCH)/include/generated/uapi
+LINUX_INCLUDE += -I$(KERNEL_SRC)/include/uapi
+LINUX_INCLUDE += -I$(KERNEL_SRC)/include/generated/uapi
+LINUX_INCLUDE += -include $(KERNEL_SRC)/include/linux/kconfig.h
+LINUX_INCLUDE += -I$(KERNEL_PATH)/samples/bpf
+LINUX_INCLUDE += -I$(KERNEL_SRC)/tools/lib/
+LINUX_INCLUDE += -I/usr/src/kernels/$(KERNEL_VERSION)/samples/bpf
+LINUX_INCLUDE += -I$(KERNEL_SRC)/tools/perf/include/bpf
+LINUX_INCLUDE += -I/usr/include/libbpf/src/bpf
+LINUX_INCLUDE += -I/usr/src/kernels/$(KERNEL_VERSION)/include/uapi/linux/
+LINUX_INCLUDE += -I/usr/include/bpf/
+LINUX_INCLUDE += -I/usr/include/
+BPF_LOAD_INCLUDE := -I/usr/include
+BPF_LOAD_INCLUDE += -I$(KERNEL_SRC)/include/
+BPF_LOAD_INCLUDE += -I/usr/src/kernels/$(KERNEL_VERSION)/include/
+KBUILD_HOSTCFLAGS := -I$(KERNEL_PATH)/include/
+KBUILD_HOSTCFLAGS += -I$(KERNEL_PATH)/tools/lib/ -I$(KERNEL_PATH)/tools/include
+KBUILD_HOSTCFLAGS += -I$(KERNEL_PATH)/tools/perf
+NOSTDINC_FLAGS := -nostdinc
+EXTRA_CFLAGS := -isystem /usr/lib/gcc/$(GCC_ARCH)/$(GCC_VERSION)/include
+CFLAGS := -g -Wall -w
+
+CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+APPS = ebpf_collector
+
+CC = gcc
+LLC ?= llc
+CLANG ?= clang
+
+USER_CFLAGS = -I. -I/usr/src/kernels/$(KERNEL_VERSION)/include/uapi/linux/ -I/usr/src/kernel/include -Wall
+KERNEL_CFLAGS = -I. -I/usr/src/kernels/$(KERNEL_VERSION)/include/uapi/linux/ -Wall
+LOADER_CFLAGS = -I. 
-I/usr/src/kernels/$(KERNEL_VERSION)/include/uapi/linux/ -I/usr/src/kernel/include +CLANG_FLAGS = -O2 -emit-llvm -c +LLC_FLAGS = -march=bpf -filetype=obj + +OUTPUT := output + +.PHONY: all +all: $(APPS) + +.PHONY: clean +clean: + $(call msg,CLEAN) + $(Q)rm -rf $(OUTPUT) $(APPS) + +$(OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $@ + +$(OUTPUT)/%.bpf.o: %.bpf.c + $(call msg,BPF,$@) + $(CLANG) $(NOSTDINC_FLAGS) $(EXTRA_CFLAGS) $(LINUX_INCLUDE) $(KBUILD_HOSTCFLAGS) \ + -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \ + -D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \ + -Wno-gnu-variable-sized-type-not-at-end \ + -Wno-address-of-packed-member -Wno-tautological-compare \ + -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ + -O2 -emit-llvm -c $< -o -| $(LLC) $(LLC_FLAGS) -o $@ + +$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.bpf.o + +$(OUTPUT)/bpf_load.o: bpf_load.c | $(OUTPUT) + $(call msg,CC,$@) + $(CC) $(NOSTDINC_FLAGS) $(EXTRA_CFLAGS) $(CFLAGS) -I$(KERNEL_PATH)/samples/bpf -I$(KERNEL_PATH)/tools/perf $(BPF_LOAD_INCLUDE) \ + -I$(KERNEL_PATH)/tools/lib/ -I$(KERNEL_PATH)/tools/include \ + -c $(filter %.c,$^) -o $@ + +$(OUTPUT)/%.o: %.c | $(OUTPUT) + $(call msg,CC,$@) + $(CC) $(CFLAGS) $(INCLUDES) -I$(KERNEL_PATH)/samples/bpf -c $(filter %.c,$^) -o $@ + +$(APPS): %: $(OUTPUT)/%.o $(OUTPUT)/bpf_load.o | $(OUTPUT) + $(call msg,BINARY,$@) + $(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -I$(KERNEL_PATH)/samples/bpf -lelf -lbpf -lz -o $@ + +.DELETE_ON_ERROR: + +.SECONDARY: diff --git a/src/c/ebpf_collector/bpf_helpers.h b/src/c/ebpf_collector/bpf_helpers.h new file mode 100644 index 0000000..352965a --- /dev/null +++ b/src/c/ebpf_collector/bpf_helpers.h @@ -0,0 +1,535 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __BPF_HELPERS__ +#define __BPF_HELPERS__ + +#define __uint(name, val) int (*name)[val] +#define __type(name, val) val *name + +/* helper macro to print out debug messages */ +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +#ifdef __clang__ + +/* helper macro to place programs, maps, license in + * different sections in elf_bpf file. Section names + * are interpreted by elf_bpf loader + */ +#define SEC(NAME) __attribute__((section(NAME), used)) + +/* helper functions called from eBPF programs written in C */ +static void *(*bpf_map_lookup_elem)(void *map, const void *key) = + (void *) BPF_FUNC_map_lookup_elem; +static int (*bpf_map_update_elem)(void *map, const void *key, const void *value, + unsigned long long flags) = + (void *) BPF_FUNC_map_update_elem; +static int (*bpf_map_delete_elem)(void *map, const void *key) = + (void *) BPF_FUNC_map_delete_elem; +static int (*bpf_map_push_elem)(void *map, const void *value, + unsigned long long flags) = + (void *) BPF_FUNC_map_push_elem; +static int (*bpf_map_pop_elem)(void *map, void *value) = + (void *) BPF_FUNC_map_pop_elem; +static int (*bpf_map_peek_elem)(void *map, void *value) = + (void *) BPF_FUNC_map_peek_elem; +static int (*bpf_probe_read)(void *dst, int size, const void *unsafe_ptr) = + (void *) BPF_FUNC_probe_read; +static unsigned long long (*bpf_ktime_get_ns)(void) = + (void *) BPF_FUNC_ktime_get_ns; +static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) 
= + (void *) BPF_FUNC_trace_printk; +static void (*bpf_tail_call)(void *ctx, void *map, int index) = + (void *) BPF_FUNC_tail_call; +static unsigned long long (*bpf_get_smp_processor_id)(void) = + (void *) BPF_FUNC_get_smp_processor_id; +static unsigned long long (*bpf_get_current_pid_tgid)(void) = + (void *) BPF_FUNC_get_current_pid_tgid; +static unsigned long long (*bpf_get_current_uid_gid)(void) = + (void *) BPF_FUNC_get_current_uid_gid; +static int (*bpf_get_current_comm)(void *buf, int buf_size) = + (void *) BPF_FUNC_get_current_comm; +static unsigned long long (*bpf_perf_event_read)(void *map, + unsigned long long flags) = + (void *) BPF_FUNC_perf_event_read; +static int (*bpf_clone_redirect)(void *ctx, int ifindex, int flags) = + (void *) BPF_FUNC_clone_redirect; +static int (*bpf_redirect)(int ifindex, int flags) = + (void *) BPF_FUNC_redirect; +static int (*bpf_redirect_map)(void *map, int key, int flags) = + (void *) BPF_FUNC_redirect_map; +static int (*bpf_perf_event_output)(void *ctx, void *map, + unsigned long long flags, void *data, + int size) = + (void *) BPF_FUNC_perf_event_output; +static int (*bpf_get_stackid)(void *ctx, void *map, int flags) = + (void *) BPF_FUNC_get_stackid; +static int (*bpf_probe_write_user)(void *dst, const void *src, int size) = + (void *) BPF_FUNC_probe_write_user; +static int (*bpf_current_task_under_cgroup)(void *map, int index) = + (void *) BPF_FUNC_current_task_under_cgroup; +static int (*bpf_skb_get_tunnel_key)(void *ctx, void *key, int size, int flags) = + (void *) BPF_FUNC_skb_get_tunnel_key; +static int (*bpf_skb_set_tunnel_key)(void *ctx, void *key, int size, int flags) = + (void *) BPF_FUNC_skb_set_tunnel_key; +static int (*bpf_skb_get_tunnel_opt)(void *ctx, void *md, int size) = + (void *) BPF_FUNC_skb_get_tunnel_opt; +static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) = + (void *) BPF_FUNC_skb_set_tunnel_opt; +static unsigned long long (*bpf_get_prandom_u32)(void) = + (void *) BPF_FUNC_get_prandom_u32; +static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = + (void *) BPF_FUNC_xdp_adjust_head; +static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) = + (void *) BPF_FUNC_xdp_adjust_meta; +static int (*bpf_get_socket_cookie)(void *ctx) = + (void *) BPF_FUNC_get_socket_cookie; +static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, + int optlen) = + (void *) BPF_FUNC_setsockopt; +static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval, + int optlen) = + (void *) BPF_FUNC_getsockopt; +static int (*bpf_sock_ops_cb_flags_set)(void *ctx, int flags) = + (void *) BPF_FUNC_sock_ops_cb_flags_set; +static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) = + (void *) BPF_FUNC_sk_redirect_map; +static int (*bpf_sk_redirect_hash)(void *ctx, void *map, void *key, int flags) = + (void *) BPF_FUNC_sk_redirect_hash; +static int (*bpf_sock_map_update)(void *map, void *key, void *value, + unsigned long long flags) = + (void *) BPF_FUNC_sock_map_update; +static int (*bpf_sock_hash_update)(void *map, void *key, void *value, + unsigned long long flags) = + (void *) BPF_FUNC_sock_hash_update; +static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags, + void *buf, unsigned int buf_size) = + (void *) BPF_FUNC_perf_event_read_value; +static int (*bpf_perf_prog_read_value)(void *ctx, void *buf, + unsigned int buf_size) = + (void *) BPF_FUNC_perf_prog_read_value; +static int (*bpf_override_return)(void *ctx, unsigned long rc) = + (void *) 
BPF_FUNC_override_return; +static int (*bpf_msg_redirect_map)(void *ctx, void *map, int key, int flags) = + (void *) BPF_FUNC_msg_redirect_map; +static int (*bpf_msg_redirect_hash)(void *ctx, + void *map, void *key, int flags) = + (void *) BPF_FUNC_msg_redirect_hash; +static int (*bpf_msg_apply_bytes)(void *ctx, int len) = + (void *) BPF_FUNC_msg_apply_bytes; +static int (*bpf_msg_cork_bytes)(void *ctx, int len) = + (void *) BPF_FUNC_msg_cork_bytes; +static int (*bpf_msg_pull_data)(void *ctx, int start, int end, int flags) = + (void *) BPF_FUNC_msg_pull_data; +static int (*bpf_msg_push_data)(void *ctx, int start, int end, int flags) = + (void *) BPF_FUNC_msg_push_data; +static int (*bpf_msg_pop_data)(void *ctx, int start, int cut, int flags) = + (void *) BPF_FUNC_msg_pop_data; +static int (*bpf_bind)(void *ctx, void *addr, int addr_len) = + (void *) BPF_FUNC_bind; +static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) = + (void *) BPF_FUNC_xdp_adjust_tail; +static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state, + int size, int flags) = + (void *) BPF_FUNC_skb_get_xfrm_state; +static int (*bpf_sk_select_reuseport)(void *ctx, void *map, void *key, __u32 flags) = + (void *) BPF_FUNC_sk_select_reuseport; +static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) = + (void *) BPF_FUNC_get_stack; +static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, + int plen, __u32 flags) = + (void *) BPF_FUNC_fib_lookup; +static int (*bpf_lwt_push_encap)(void *ctx, unsigned int type, void *hdr, + unsigned int len) = + (void *) BPF_FUNC_lwt_push_encap; +static int (*bpf_lwt_seg6_store_bytes)(void *ctx, unsigned int offset, + void *from, unsigned int len) = + (void *) BPF_FUNC_lwt_seg6_store_bytes; +static int (*bpf_lwt_seg6_action)(void *ctx, unsigned int action, void *param, + unsigned int param_len) = + (void *) BPF_FUNC_lwt_seg6_action; +static int (*bpf_lwt_seg6_adjust_srh)(void *ctx, unsigned int offset, + unsigned int len) = + (void *) BPF_FUNC_lwt_seg6_adjust_srh; +static int (*bpf_rc_repeat)(void *ctx) = + (void *) BPF_FUNC_rc_repeat; +static int (*bpf_rc_keydown)(void *ctx, unsigned int protocol, + unsigned long long scancode, unsigned int toggle) = + (void *) BPF_FUNC_rc_keydown; +static unsigned long long (*bpf_get_current_cgroup_id)(void) = + (void *) BPF_FUNC_get_current_cgroup_id; +static void *(*bpf_get_local_storage)(void *map, unsigned long long flags) = + (void *) BPF_FUNC_get_local_storage; +static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) = + (void *) BPF_FUNC_skb_cgroup_id; +static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) = + (void *) BPF_FUNC_skb_ancestor_cgroup_id; +static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, + struct bpf_sock_tuple *tuple, + int size, unsigned long long netns_id, + unsigned long long flags) = + (void *) BPF_FUNC_sk_lookup_tcp; +static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, + struct bpf_sock_tuple *tuple, + int size, unsigned long long netns_id, + unsigned long long flags) = + (void *) BPF_FUNC_skc_lookup_tcp; +static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, + struct bpf_sock_tuple *tuple, + int size, unsigned long long netns_id, + unsigned long long flags) = + (void *) BPF_FUNC_sk_lookup_udp; +static int (*bpf_sk_release)(struct bpf_sock *sk) = + (void *) BPF_FUNC_sk_release; +static int (*bpf_skb_vlan_push)(void *ctx, __be16 vlan_proto, __u16 vlan_tci) = + (void *) BPF_FUNC_skb_vlan_push; +static int (*bpf_skb_vlan_pop)(void *ctx) = + (void *) 
BPF_FUNC_skb_vlan_pop; +static int (*bpf_rc_pointer_rel)(void *ctx, int rel_x, int rel_y) = + (void *) BPF_FUNC_rc_pointer_rel; +static void (*bpf_spin_lock)(struct bpf_spin_lock *lock) = + (void *) BPF_FUNC_spin_lock; +static void (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = + (void *) BPF_FUNC_spin_unlock; +static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = + (void *) BPF_FUNC_sk_fullsock; +static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = + (void *) BPF_FUNC_tcp_sock; +static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = + (void *) BPF_FUNC_get_listener_sock; +static int (*bpf_skb_ecn_set_ce)(void *ctx) = + (void *) BPF_FUNC_skb_ecn_set_ce; +static int (*bpf_tcp_check_syncookie)(struct bpf_sock *sk, + void *ip, int ip_len, void *tcp, int tcp_len) = + (void *) BPF_FUNC_tcp_check_syncookie; +static int (*bpf_sysctl_get_name)(void *ctx, char *buf, + unsigned long long buf_len, + unsigned long long flags) = + (void *) BPF_FUNC_sysctl_get_name; +static int (*bpf_sysctl_get_current_value)(void *ctx, char *buf, + unsigned long long buf_len) = + (void *) BPF_FUNC_sysctl_get_current_value; +static int (*bpf_sysctl_get_new_value)(void *ctx, char *buf, + unsigned long long buf_len) = + (void *) BPF_FUNC_sysctl_get_new_value; +static int (*bpf_sysctl_set_new_value)(void *ctx, const char *buf, + unsigned long long buf_len) = + (void *) BPF_FUNC_sysctl_set_new_value; +static int (*bpf_strtol)(const char *buf, unsigned long long buf_len, + unsigned long long flags, long *res) = + (void *) BPF_FUNC_strtol; +static int (*bpf_strtoul)(const char *buf, unsigned long long buf_len, + unsigned long long flags, unsigned long *res) = + (void *) BPF_FUNC_strtoul; +static void *(*bpf_sk_storage_get)(void *map, struct bpf_sock *sk, + void *value, __u64 flags) = + (void *) BPF_FUNC_sk_storage_get; +static int (*bpf_sk_storage_delete)(void *map, struct bpf_sock *sk) = + (void *)BPF_FUNC_sk_storage_delete; +static int (*bpf_send_signal)(unsigned sig) = (void *)BPF_FUNC_send_signal; +static long long (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *ip, + int ip_len, void *tcp, int tcp_len) = + (void *) BPF_FUNC_tcp_gen_syncookie; + +/* llvm builtin functions that eBPF C program may use to + * emit BPF_LD_ABS and BPF_LD_IND instructions + */ +struct sk_buff; +unsigned long long load_byte(void *skb, + unsigned long long off) asm("llvm.bpf.load.byte"); +unsigned long long load_half(void *skb, + unsigned long long off) asm("llvm.bpf.load.half"); +unsigned long long load_word(void *skb, + unsigned long long off) asm("llvm.bpf.load.word"); + +/* a helper structure used by eBPF C program + * to describe map attributes to elf_bpf loader + */ +struct bpf_map_def { + unsigned int type; + unsigned int key_size; + unsigned int value_size; + unsigned int max_entries; + unsigned int map_flags; + unsigned int inner_map_idx; + unsigned int numa_node; +}; + +#else + +#include + +#endif + +#define BPF_ANNOTATE_KV_PAIR(name, type_key, type_val) \ + struct ____btf_map_##name { \ + type_key key; \ + type_val value; \ + }; \ + struct ____btf_map_##name \ + __attribute__ ((section(".maps." 
#name), used)) \ + ____btf_map_##name = { } + +static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = + (void *) BPF_FUNC_skb_load_bytes; +static int (*bpf_skb_load_bytes_relative)(void *ctx, int off, void *to, int len, __u32 start_header) = + (void *) BPF_FUNC_skb_load_bytes_relative; +static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = + (void *) BPF_FUNC_skb_store_bytes; +static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) = + (void *) BPF_FUNC_l3_csum_replace; +static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = + (void *) BPF_FUNC_l4_csum_replace; +static int (*bpf_csum_diff)(void *from, int from_size, void *to, int to_size, int seed) = + (void *) BPF_FUNC_csum_diff; +static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) = + (void *) BPF_FUNC_skb_under_cgroup; +static int (*bpf_skb_change_head)(void *, int len, int flags) = + (void *) BPF_FUNC_skb_change_head; +static int (*bpf_skb_pull_data)(void *, int len) = + (void *) BPF_FUNC_skb_pull_data; +static unsigned int (*bpf_get_cgroup_classid)(void *ctx) = + (void *) BPF_FUNC_get_cgroup_classid; +static unsigned int (*bpf_get_route_realm)(void *ctx) = + (void *) BPF_FUNC_get_route_realm; +static int (*bpf_skb_change_proto)(void *ctx, __be16 proto, __u64 flags) = + (void *) BPF_FUNC_skb_change_proto; +static int (*bpf_skb_change_type)(void *ctx, __u32 type) = + (void *) BPF_FUNC_skb_change_type; +static unsigned int (*bpf_get_hash_recalc)(void *ctx) = + (void *) BPF_FUNC_get_hash_recalc; +static unsigned long long (*bpf_get_current_task)(void) = + (void *) BPF_FUNC_get_current_task; +static int (*bpf_skb_change_tail)(void *ctx, __u32 len, __u64 flags) = + (void *) BPF_FUNC_skb_change_tail; +static long long (*bpf_csum_update)(void *ctx, __u32 csum) = + (void *) BPF_FUNC_csum_update; +static void (*bpf_set_hash_invalid)(void *ctx) = + (void *) BPF_FUNC_set_hash_invalid; +static int (*bpf_get_numa_node_id)(void) = + (void *) BPF_FUNC_get_numa_node_id; +static int (*bpf_probe_read_str)(void *ctx, __u32 size, + const void *unsafe_ptr) = + (void *) BPF_FUNC_probe_read_str; +static unsigned int (*bpf_get_socket_uid)(void *ctx) = + (void *) BPF_FUNC_get_socket_uid; +static unsigned int (*bpf_set_hash)(void *ctx, __u32 hash) = + (void *) BPF_FUNC_set_hash; +static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, + unsigned long long flags) = + (void *) BPF_FUNC_skb_adjust_room; + +/* Scan the ARCH passed in from ARCH env variable (see Makefile) */ +#if defined(__TARGET_ARCH_x86) + #define bpf_target_x86 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_s390) + #define bpf_target_s390 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_arm) + #define bpf_target_arm + #define bpf_target_defined +#elif defined(__TARGET_ARCH_arm64) + #define bpf_target_arm64 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_mips) + #define bpf_target_mips + #define bpf_target_defined +#elif defined(__TARGET_ARCH_powerpc) + #define bpf_target_powerpc + #define bpf_target_defined +#elif defined(__TARGET_ARCH_sparc) + #define bpf_target_sparc + #define bpf_target_defined +#else + #undef bpf_target_defined +#endif + +/* Fall back to what the compiler says */ +#ifndef bpf_target_defined +#if defined(__x86_64__) + #define bpf_target_x86 +#elif defined(__s390__) + #define bpf_target_s390 +#elif defined(__arm__) + #define bpf_target_arm +#elif defined(__aarch64__) + #define bpf_target_arm64 +#elif 
defined(__mips__) + #define bpf_target_mips +#elif defined(__powerpc__) + #define bpf_target_powerpc +#elif defined(__sparc__) + #define bpf_target_sparc +#endif +#endif + +#if defined(bpf_target_x86) + +#ifdef __KERNEL__ +#define PT_REGS_PARM1(x) ((x)->di) +#define PT_REGS_PARM2(x) ((x)->si) +#define PT_REGS_PARM3(x) ((x)->dx) +#define PT_REGS_PARM4(x) ((x)->cx) +#define PT_REGS_PARM5(x) ((x)->r8) +#define PT_REGS_RET(x) ((x)->sp) +#define PT_REGS_FP(x) ((x)->bp) +#define PT_REGS_RC(x) ((x)->ax) +#define PT_REGS_SP(x) ((x)->sp) +#define PT_REGS_IP(x) ((x)->ip) +#else +#ifdef __i386__ +/* i386 kernel is built with -mregparm=3 */ +#define PT_REGS_PARM1(x) ((x)->eax) +#define PT_REGS_PARM2(x) ((x)->edx) +#define PT_REGS_PARM3(x) ((x)->ecx) +#define PT_REGS_PARM4(x) 0 +#define PT_REGS_PARM5(x) 0 +#define PT_REGS_RET(x) ((x)->esp) +#define PT_REGS_FP(x) ((x)->ebp) +#define PT_REGS_RC(x) ((x)->eax) +#define PT_REGS_SP(x) ((x)->esp) +#define PT_REGS_IP(x) ((x)->eip) +#else +#define PT_REGS_PARM1(x) ((x)->rdi) +#define PT_REGS_PARM2(x) ((x)->rsi) +#define PT_REGS_PARM3(x) ((x)->rdx) +#define PT_REGS_PARM4(x) ((x)->rcx) +#define PT_REGS_PARM5(x) ((x)->r8) +#define PT_REGS_RET(x) ((x)->rsp) +#define PT_REGS_FP(x) ((x)->rbp) +#define PT_REGS_RC(x) ((x)->rax) +#define PT_REGS_SP(x) ((x)->rsp) +#define PT_REGS_IP(x) ((x)->rip) +#endif +#endif + +#elif defined(bpf_target_s390) + +/* s390 provides user_pt_regs instead of struct pt_regs to userspace */ +struct pt_regs; +#define PT_REGS_S390 const volatile user_pt_regs +#define PT_REGS_PARM1(x) (((PT_REGS_S390 *)(x))->gprs[2]) +#define PT_REGS_PARM2(x) (((PT_REGS_S390 *)(x))->gprs[3]) +#define PT_REGS_PARM3(x) (((PT_REGS_S390 *)(x))->gprs[4]) +#define PT_REGS_PARM4(x) (((PT_REGS_S390 *)(x))->gprs[5]) +#define PT_REGS_PARM5(x) (((PT_REGS_S390 *)(x))->gprs[6]) +#define PT_REGS_RET(x) (((PT_REGS_S390 *)(x))->gprs[14]) +/* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_FP(x) (((PT_REGS_S390 *)(x))->gprs[11]) +#define PT_REGS_RC(x) (((PT_REGS_S390 *)(x))->gprs[2]) +#define PT_REGS_SP(x) (((PT_REGS_S390 *)(x))->gprs[15]) +#define PT_REGS_IP(x) (((PT_REGS_S390 *)(x))->psw.addr) + +#elif defined(bpf_target_arm) + +#define PT_REGS_PARM1(x) ((x)->uregs[0]) +#define PT_REGS_PARM2(x) ((x)->uregs[1]) +#define PT_REGS_PARM3(x) ((x)->uregs[2]) +#define PT_REGS_PARM4(x) ((x)->uregs[3]) +#define PT_REGS_PARM5(x) ((x)->uregs[4]) +#define PT_REGS_RET(x) ((x)->uregs[14]) +#define PT_REGS_FP(x) ((x)->uregs[11]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->uregs[0]) +#define PT_REGS_SP(x) ((x)->uregs[13]) +#define PT_REGS_IP(x) ((x)->uregs[12]) + +#elif defined(bpf_target_arm64) + +/* arm64 provides struct user_pt_regs instead of struct pt_regs to userspace */ +struct pt_regs; +#define PT_REGS_ARM64 const volatile struct user_pt_regs +#define PT_REGS_PARM1(x) (((PT_REGS_ARM64 *)(x))->regs[0]) +#define PT_REGS_PARM2(x) (((PT_REGS_ARM64 *)(x))->regs[1]) +#define PT_REGS_PARM3(x) (((PT_REGS_ARM64 *)(x))->regs[2]) +#define PT_REGS_PARM4(x) (((PT_REGS_ARM64 *)(x))->regs[3]) +#define PT_REGS_PARM5(x) (((PT_REGS_ARM64 *)(x))->regs[4]) +#define PT_REGS_RET(x) (((PT_REGS_ARM64 *)(x))->regs[30]) +/* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_FP(x) (((PT_REGS_ARM64 *)(x))->regs[29]) +#define PT_REGS_RC(x) (((PT_REGS_ARM64 *)(x))->regs[0]) +#define PT_REGS_SP(x) (((PT_REGS_ARM64 *)(x))->sp) +#define PT_REGS_IP(x) (((PT_REGS_ARM64 *)(x))->pc) + +#elif defined(bpf_target_mips) + +#define PT_REGS_PARM1(x) ((x)->regs[4]) +#define 
PT_REGS_PARM2(x) ((x)->regs[5]) +#define PT_REGS_PARM3(x) ((x)->regs[6]) +#define PT_REGS_PARM4(x) ((x)->regs[7]) +#define PT_REGS_PARM5(x) ((x)->regs[8]) +#define PT_REGS_RET(x) ((x)->regs[31]) +#define PT_REGS_FP(x) ((x)->regs[30]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->regs[1]) +#define PT_REGS_SP(x) ((x)->regs[29]) +#define PT_REGS_IP(x) ((x)->cp0_epc) + +#elif defined(bpf_target_powerpc) + +#define PT_REGS_PARM1(x) ((x)->gpr[3]) +#define PT_REGS_PARM2(x) ((x)->gpr[4]) +#define PT_REGS_PARM3(x) ((x)->gpr[5]) +#define PT_REGS_PARM4(x) ((x)->gpr[6]) +#define PT_REGS_PARM5(x) ((x)->gpr[7]) +#define PT_REGS_RC(x) ((x)->gpr[3]) +#define PT_REGS_SP(x) ((x)->sp) +#define PT_REGS_IP(x) ((x)->nip) + +#elif defined(bpf_target_sparc) + +#define PT_REGS_PARM1(x) ((x)->u_regs[UREG_I0]) +#define PT_REGS_PARM2(x) ((x)->u_regs[UREG_I1]) +#define PT_REGS_PARM3(x) ((x)->u_regs[UREG_I2]) +#define PT_REGS_PARM4(x) ((x)->u_regs[UREG_I3]) +#define PT_REGS_PARM5(x) ((x)->u_regs[UREG_I4]) +#define PT_REGS_RET(x) ((x)->u_regs[UREG_I7]) +#define PT_REGS_RC(x) ((x)->u_regs[UREG_I0]) +#define PT_REGS_SP(x) ((x)->u_regs[UREG_FP]) + +/* Should this also be a bpf_target check for the sparc case? */ +#if defined(__arch64__) +#define PT_REGS_IP(x) ((x)->tpc) +#else +#define PT_REGS_IP(x) ((x)->pc) +#endif + +#endif + +#if defined(bpf_target_powerpc) +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP +#elif defined(bpf_target_sparc) +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP +#else +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ \ + bpf_probe_read(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); }) +#define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ \ + bpf_probe_read(&(ip), sizeof(ip), \ + (void *)(PT_REGS_FP(ctx) + sizeof(ip))); }) +#endif + +/* + * BPF_CORE_READ abstracts away bpf_probe_read() call and captures offset + * relocation for source address using __builtin_preserve_access_index() + * built-in, provided by Clang. + * + * __builtin_preserve_access_index() takes as an argument an expression of + * taking an address of a field within struct/union. It makes compiler emit + * a relocation, which records BTF type ID describing root struct/union and an + * accessor string which describes exact embedded field that was used to take + * an address. See detailed description of this relocation format and + * semantics in comments to struct bpf_offset_reloc in libbpf_internal.h. + * + * This relocation allows libbpf to adjust BPF instruction to use correct + * actual field offset, based on target kernel BTF type that matches original + * (local) BTF, used to record relocation. 
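+ *
+ * A minimal usage sketch (task here is a hypothetical struct task_struct
+ * pointer, not part of this header):
+ *
+ *	u32 pid;
+ *	BPF_CORE_READ(&pid, &task->pid);
+ *
+ * which expands to bpf_probe_read(&pid, sizeof(task->pid),
+ * __builtin_preserve_access_index(&task->pid)).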
+ */ +#define BPF_CORE_READ(dst, src) \ + bpf_probe_read((dst), sizeof(*(src)), \ + __builtin_preserve_access_index(src)) + +#endif diff --git a/src/c/ebpf_collector/bpf_load.c b/src/c/ebpf_collector/bpf_load.c new file mode 100644 index 0000000..db33eb1 --- /dev/null +++ b/src/c/ebpf_collector/bpf_load.c @@ -0,0 +1,709 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUGFS "/sys/kernel/debug/tracing/" + +static char license[128]; +static int kern_version; +static bool processed_sec[128]; +char bpf_log_buf[BPF_LOG_BUF_SIZE]; +int map_fd[MAX_MAPS]; +int prog_fd[MAX_PROGS]; +int event_fd[MAX_PROGS]; +int prog_cnt; +int prog_array_fd = -1; + +struct bpf_map_data map_data[MAX_MAPS]; +int map_data_count = 0; + +static int populate_prog_array(const char *event, int prog_fd) +{ + int ind = atoi(event), err; + + err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY); + if (err < 0) { + printf("failed to store prog_fd in prog_array\n"); + return -1; + } + return 0; +} + +static int write_kprobe_events(const char *val) +{ + int fd, ret, flags; + + if (val == NULL) + return -1; + else if (val[0] == '\0') + flags = O_WRONLY | O_TRUNC; + else + flags = O_WRONLY | O_APPEND; + + fd = open("/sys/kernel/debug/tracing/kprobe_events", flags); + + ret = write(fd, val, strlen(val)); + close(fd); + + return ret; +} + +static int load_and_attach(const char *event, struct bpf_insn *prog, int size) +{ + bool is_socket = strncmp(event, "socket", 6) == 0; + bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; + bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; + bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; + bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0; + bool is_xdp = strncmp(event, "xdp", 3) == 0; + bool is_perf_event = strncmp(event, "perf_event", 10) == 0; + bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; + bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; + bool is_sockops = strncmp(event, "sockops", 7) == 0; + bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; + bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0; + size_t insns_cnt = size / sizeof(struct bpf_insn); + enum bpf_prog_type prog_type; + char buf[256]; + int fd, efd, err, id; + struct perf_event_attr attr = {}; + + attr.type = PERF_TYPE_TRACEPOINT; + attr.sample_type = PERF_SAMPLE_RAW; + attr.sample_period = 1; + attr.wakeup_events = 1; + + if (is_socket) { + prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + } else if (is_kprobe || is_kretprobe) { + prog_type = BPF_PROG_TYPE_KPROBE; + } else if (is_tracepoint) { + prog_type = BPF_PROG_TYPE_TRACEPOINT; + } else if (is_raw_tracepoint) { + prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT; + } else if (is_xdp) { + prog_type = BPF_PROG_TYPE_XDP; + } else if (is_perf_event) { + prog_type = BPF_PROG_TYPE_PERF_EVENT; + } else if (is_cgroup_skb) { + prog_type = BPF_PROG_TYPE_CGROUP_SKB; + } else if (is_cgroup_sk) { + prog_type = BPF_PROG_TYPE_CGROUP_SOCK; + } else if (is_sockops) { + prog_type = BPF_PROG_TYPE_SOCK_OPS; + } else if (is_sk_skb) { + prog_type = BPF_PROG_TYPE_SK_SKB; + } else if (is_sk_msg) { + prog_type = BPF_PROG_TYPE_SK_MSG; + } else { + printf("Unknown event '%s'\n", event); + return -1; + } + + if (prog_cnt == MAX_PROGS) + return -1; + + fd = 
bpf_load_program(prog_type, prog, insns_cnt, license, kern_version, + bpf_log_buf, BPF_LOG_BUF_SIZE); + if (fd < 0) { + printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf); + return -1; + } + + prog_fd[prog_cnt++] = fd; + + if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) + return 0; + + if (is_socket || is_sockops || is_sk_skb || is_sk_msg) { + if (is_socket) + event += 6; + else + event += 7; + if (*event != '/') + return 0; + event++; + if (!isdigit(*event)) { + printf("invalid prog number\n"); + return -1; + } + return populate_prog_array(event, fd); + } + + if (is_raw_tracepoint) { + efd = bpf_raw_tracepoint_open(event + 15, fd); + if (efd < 0) { + printf("tracepoint %s %s\n", event + 15, strerror(errno)); + return -1; + } + event_fd[prog_cnt - 1] = efd; + return 0; + } + + if (is_kprobe || is_kretprobe) { + bool need_normal_check = true; + const char *event_prefix = ""; + + if (is_kprobe) + event += 7; + else + event += 10; + + if (*event == 0) { + printf("event name cannot be empty\n"); + return -1; + } + + if (isdigit(*event)) + return populate_prog_array(event, fd); + +#ifdef __x86_64__ + if (strncmp(event, "sys_", 4) == 0) { + snprintf(buf, sizeof(buf), "%c:__x64_%s __x64_%s", + is_kprobe ? 'p' : 'r', event, event); + err = write_kprobe_events(buf); + if (err >= 0) { + need_normal_check = false; + event_prefix = "__x64_"; + } + } +#endif + if (need_normal_check) { + if (strcmp("wbt_wait", event) == 0 || strcmp("blk_mq_get_tag", event) == 0) { + if (is_kprobe) { + snprintf(buf, sizeof(buf), "%c:%s_1 %s", + is_kprobe ? 'p' : 'r', event, event); + } + else { + snprintf(buf, sizeof(buf), "%c:%s_2 %s", + is_kprobe ? 'p' : 'r', event, event); + } + } + else { + snprintf(buf, sizeof(buf), "%c:%s %s", + is_kprobe ? 'p' : 'r', event, event); + } + err = write_kprobe_events(buf); + if (err < 0) { + printf("failed to create kprobe '%s' error '%s'\n", + event, strerror(errno)); + return -1; + } + } + + strcpy(buf, DEBUGFS); + strcat(buf, "events/kprobes/"); + strcat(buf, event_prefix); + strcat(buf, event); + + if (strcmp("wbt_wait", event) == 0 || strcmp("blk_mq_get_tag", event) == 0) { + if (is_kprobe) { + strcat(buf, "_1"); + } + else { + strcat(buf, "_2"); + } + } + strcat(buf, "/id"); + } else if (is_tracepoint) { + event += 11; + + if (*event == 0) { + printf("event name cannot be empty\n"); + return -1; + } + strcpy(buf, DEBUGFS); + strcat(buf, "events/"); + strcat(buf, event); + strcat(buf, "/id"); + } + + efd = open(buf, O_RDONLY, 0); + if (efd < 0) { + printf("failed to open event %s\n", event); + return -1; + } + + err = read(efd, buf, sizeof(buf)); + if (err < 0 || err >= sizeof(buf)) { + printf("read from '%s' failed '%s'\n", event, strerror(errno)); + return -1; + } + + close(efd); + + buf[err] = 0; + id = atoi(buf); + attr.config = id; + + efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); + if (efd < 0) { + printf("event %d fd %d err %s\n", id, efd, strerror(errno)); + return -1; + } + event_fd[prog_cnt - 1] = efd; + err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); + if (err < 0) { + printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n", + strerror(errno)); + return -1; + } + err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); + if (err < 0) { + printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n", + strerror(errno)); + return -1; + } + + return 0; +} + +static int load_maps(struct bpf_map_data *maps, int nr_maps, + fixup_map_cb fixup_map) +{ + int i, numa_node; + + for (i = 0; i < nr_maps; i++) { + if (fixup_map) { + fixup_map(&maps[i], i); + 
/* Allow userspace to assign map FD prior to creation */ + if (maps[i].fd != -1) { + map_fd[i] = maps[i].fd; + continue; + } + } + + numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ? + maps[i].def.numa_node : -1; + + if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { + int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; + + map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type, + maps[i].name, + maps[i].def.key_size, + inner_map_fd, + maps[i].def.max_entries, + maps[i].def.map_flags, + numa_node); + } else { + map_fd[i] = bpf_create_map_node(maps[i].def.type, + maps[i].name, + maps[i].def.key_size, + maps[i].def.value_size, + maps[i].def.max_entries, + maps[i].def.map_flags, + numa_node); + } + if (map_fd[i] < 0) { + printf("failed to create a map: %d %s\n", + errno, strerror(errno)); + return 1; + } + maps[i].fd = map_fd[i]; + + if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) + prog_array_fd = map_fd[i]; + } + return 0; +} + +static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, + GElf_Shdr *shdr, Elf_Data **data) +{ + Elf_Scn *scn; + + scn = elf_getscn(elf, i); + if (!scn) + return 1; + + if (gelf_getshdr(scn, shdr) != shdr) + return 2; + + *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name); + if (!*shname || !shdr->sh_size) + return 3; + + *data = elf_getdata(scn, 0); + if (!*data || elf_getdata(scn, *data) != NULL) + return 4; + + return 0; +} + +static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, + GElf_Shdr *shdr, struct bpf_insn *insn, + struct bpf_map_data *maps, int nr_maps) +{ + int i, nrels; + + nrels = shdr->sh_size / shdr->sh_entsize; + + for (i = 0; i < nrels; i++) { + GElf_Sym sym; + GElf_Rel rel; + unsigned int insn_idx; + bool match = false; + int j, map_idx; + + gelf_getrel(data, i, &rel); + + insn_idx = rel.r_offset / sizeof(struct bpf_insn); + + gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym); + + if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { + printf("invalid relo for insn[%d].code 0x%x\n", + insn_idx, insn[insn_idx].code); + return 1; + } + insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; + + /* Match FD relocation against recorded map_data[] offset */ + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + if (maps[map_idx].elf_offset == sym.st_value) { + match = true; + break; + } + } + if (match) { + insn[insn_idx].imm = maps[map_idx].fd; + } else { + printf("invalid relo for insn[%d] no map_data match\n", + insn_idx); + return 1; + } + } + + return 0; +} + +static int cmp_symbols(const void *l, const void *r) +{ + const GElf_Sym *lsym = (const GElf_Sym *)l; + const GElf_Sym *rsym = (const GElf_Sym *)r; + + if (lsym->st_value < rsym->st_value) + return -1; + else if (lsym->st_value > rsym->st_value) + return 1; + else + return 0; +} + +static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, + Elf *elf, Elf_Data *symbols, int strtabidx) +{ + int map_sz_elf, map_sz_copy; + bool validate_zero = false; + Elf_Data *data_maps; + int i, nr_maps; + GElf_Sym *sym; + Elf_Scn *scn; + int copy_sz; + + if (maps_shndx < 0) + return -EINVAL; + if (!symbols) + return -EINVAL; + + /* Get data for maps section via elf index */ + scn = elf_getscn(elf, maps_shndx); + if (scn) + data_maps = elf_getdata(scn, NULL); + if (!scn || !data_maps) { + printf("Failed to get Elf_Data from maps section %d\n", + maps_shndx); + return -EINVAL; + } + + /* For each map get corrosponding symbol table entry */ + sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym)); + for (i = 0, nr_maps 
= 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { + assert(nr_maps < MAX_MAPS+1); + if (!gelf_getsym(symbols, i, &sym[nr_maps])) + continue; + if (sym[nr_maps].st_shndx != maps_shndx) + continue; + /* Only increment iif maps section */ + nr_maps++; + } + + /* Align to map_fd[] order, via sort on offset in sym.st_value */ + qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); + + /* Keeping compatible with ELF maps section changes + * ------------------------------------------------ + * The program size of struct bpf_load_map_def is known by loader + * code, but struct stored in ELF file can be different. + * + * Unfortunately sym[i].st_size is zero. To calculate the + * struct size stored in the ELF file, assume all struct have + * the same size, and simply divide with number of map + * symbols. + */ + map_sz_elf = data_maps->d_size / nr_maps; + map_sz_copy = sizeof(struct bpf_load_map_def); + if (map_sz_elf < map_sz_copy) { + /* + * Backward compat, loading older ELF file with + * smaller struct, keeping remaining bytes zero. + */ + map_sz_copy = map_sz_elf; + } else if (map_sz_elf > map_sz_copy) { + /* + * Forward compat, loading newer ELF file with larger + * struct with unknown features. Assume zero means + * feature not used. Thus, validate rest of struct + * data is zero. + */ + validate_zero = true; + } + + /* Memcpy relevant part of ELF maps data to loader maps */ + for (i = 0; i < nr_maps; i++) { + struct bpf_load_map_def *def; + unsigned char *addr, *end; + const char *map_name; + size_t offset; + + map_name = elf_strptr(elf, strtabidx, sym[i].st_name); + maps[i].name = strdup(map_name); + if (!maps[i].name) { + printf("strdup(%s): %s(%d)\n", map_name, + strerror(errno), errno); + free(sym); + return -errno; + } + + /* Symbol value is offset into ELF maps section data area */ + offset = sym[i].st_value; + def = (struct bpf_load_map_def *)(data_maps->d_buf + offset); + maps[i].elf_offset = offset; + memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def)); + memcpy(&maps[i].def, def, map_sz_copy); + + /* Verify no newer features were requested */ + if (validate_zero) { + addr = (unsigned char*) def + map_sz_copy; + end = (unsigned char*) def + map_sz_elf; + for (; addr < end; addr++) { + if (*addr != 0) { + free(sym); + return -EFBIG; + } + } + } + } + + free(sym); + return nr_maps; +} + +static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) +{ + int fd, i, ret, maps_shndx = -1, strtabidx = -1; + Elf *elf; + GElf_Ehdr ehdr; + GElf_Shdr shdr, shdr_prog; + Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL; + char *shname, *shname_prog; + int nr_maps = 0; + + /* reset global variables */ + kern_version = 0; + memset(license, 0, sizeof(license)); + memset(processed_sec, 0, sizeof(processed_sec)); + + if (elf_version(EV_CURRENT) == EV_NONE) + return 1; + + fd = open(path, O_RDONLY, 0); + if (fd < 0) + return 1; + + elf = elf_begin(fd, ELF_C_READ, NULL); + + if (!elf) + return 1; + + if (gelf_getehdr(elf, &ehdr) != &ehdr) + return 1; + + /* clear all kprobes */ + i = write_kprobe_events(""); + + /* scan over all elf sections to get license and map info */ + for (i = 1; i < ehdr.e_shnum; i++) { + + if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) + continue; + + if (0) /* helpful for llvm debugging */ + printf("section %d:%s data %p size %zd link %d flags %d\n", + i, shname, data->d_buf, data->d_size, + shdr.sh_link, (int) shdr.sh_flags); + + if (strcmp(shname, "license") == 0) { + processed_sec[i] = true; + memcpy(license, data->d_buf, data->d_size); + } 
else if (strcmp(shname, "version") == 0) { + processed_sec[i] = true; + if (data->d_size != sizeof(int)) { + printf("invalid size of version section %zd\n", + data->d_size); + return 1; + } + memcpy(&kern_version, data->d_buf, sizeof(int)); + } else if (strcmp(shname, "maps") == 0) { + int j; + + maps_shndx = i; + data_maps = data; + for (j = 0; j < MAX_MAPS; j++) + map_data[j].fd = -1; + } else if (shdr.sh_type == SHT_SYMTAB) { + strtabidx = shdr.sh_link; + symbols = data; + } + } + + ret = 1; + + if (!symbols) { + printf("missing SHT_SYMTAB section\n"); + goto done; + } + + if (data_maps) { + nr_maps = load_elf_maps_section(map_data, maps_shndx, + elf, symbols, strtabidx); + if (nr_maps < 0) { + printf("Error: Failed loading ELF maps (errno:%d):%s\n", + nr_maps, strerror(-nr_maps)); + goto done; + } + if (load_maps(map_data, nr_maps, fixup_map)) + goto done; + map_data_count = nr_maps; + + processed_sec[maps_shndx] = true; + } + + /* process all relo sections, and rewrite bpf insns for maps */ + for (i = 1; i < ehdr.e_shnum; i++) { + if (processed_sec[i]) + continue; + + if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) + continue; + + if (shdr.sh_type == SHT_REL) { + struct bpf_insn *insns; + + /* locate prog sec that need map fixup (relocations) */ + if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog, + &shdr_prog, &data_prog)) + continue; + + if (shdr_prog.sh_type != SHT_PROGBITS || + !(shdr_prog.sh_flags & SHF_EXECINSTR)) + continue; + + insns = (struct bpf_insn *) data_prog->d_buf; + processed_sec[i] = true; /* relo section */ + + if (parse_relo_and_apply(data, symbols, &shdr, insns, + map_data, nr_maps)) + continue; + } + } + + /* load programs */ + for (i = 1; i < ehdr.e_shnum; i++) { + + if (processed_sec[i]) + continue; + + if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) + continue; + + if (memcmp(shname, "kprobe/", 7) == 0 || + memcmp(shname, "kretprobe/", 10) == 0 || + memcmp(shname, "tracepoint/", 11) == 0 || + memcmp(shname, "raw_tracepoint/", 15) == 0 || + memcmp(shname, "xdp", 3) == 0 || + memcmp(shname, "perf_event", 10) == 0 || + memcmp(shname, "socket", 6) == 0 || + memcmp(shname, "cgroup/", 7) == 0 || + memcmp(shname, "sockops", 7) == 0 || + memcmp(shname, "sk_skb", 6) == 0 || + memcmp(shname, "sk_msg", 6) == 0) { + ret = load_and_attach(shname, data->d_buf, + data->d_size); + if (ret != 0) + goto done; + } + } + +done: + close(fd); + return ret; +} + +int load_bpf_file(char *path) +{ + return do_load_bpf_file(path, NULL); +} + +int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map) +{ + return do_load_bpf_file(path, fixup_map); +} + +void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[4096]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} diff --git a/src/c/ebpf_collector/ebpf_collector.bpf.c b/src/c/ebpf_collector/ebpf_collector.bpf.c new file mode 100644 index 0000000..28cdde2 --- /dev/null +++ b/src/c/ebpf_collector/ebpf_collector.bpf.c @@ -0,0 +1,1408 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. 
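+ * Layout note: each traced stage (blk, bio, wbt, tag) pairs a hash map of
+ * in-flight io_counter entries with an array map of per-stage stage_data
+ * results; wbt and tag additionally keep small u32 -> u64 args maps for
+ * their kprobe/kretprobe handlers. The userspace side of this patch
+ * (ebpf_collector.c) is expected to read the *_res arrays.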
+ * Description: ebpf collector program + * Author: Zhang Nan + * Create: 2024-09-27 + */ +#define KBUILD_MODNAME "foo" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "ebpf_collector.h" + +#define _(P) ({typeof(P) val; bpf_probe_read(&val, sizeof(val), &P); val;}) + +struct bpf_map_def SEC("maps") blk_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(struct io_counter), + .max_entries = 10000, +}; + +struct bpf_map_def SEC("maps") blk_res = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct stage_data), + .max_entries = 128, +}; + +struct bpf_map_def SEC("maps") bio_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(struct io_counter), + .max_entries = 10000, +}; + +struct bpf_map_def SEC("maps") bio_res = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct stage_data), + .max_entries = 128, +}; + +struct bpf_map_def SEC("maps") wbt_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(struct io_counter), + .max_entries = 10000, +}; + +struct bpf_map_def SEC("maps") wbt_res = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct stage_data), + .max_entries = 128, +}; + +struct bpf_map_def SEC("maps") wbt_args = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = 1000, +}; + +struct bpf_map_def SEC("maps") tag_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(struct io_counter), + .max_entries = 10000, +}; + +struct bpf_map_def SEC("maps") tag_res = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct stage_data), + .max_entries = 128, +}; + +struct bpf_map_def SEC("maps") tag_args = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = 1000, +}; + +struct blk_mq_alloc_data { + /* input parameter */ + struct request_queue *q; + blk_mq_req_flags_t flags; + unsigned int shallow_depth; + + /* input & output parameter */ + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; +}; + +static __always_inline void blk_fill_rwbs(char *rwbs, unsigned int op) +{ + switch (op & REQ_OP_MASK) { + case REQ_OP_WRITE: + case REQ_OP_WRITE_SAME: + rwbs[0] = 'W'; + break; + case REQ_OP_DISCARD: + rwbs[0] = 'D'; + break; + case REQ_OP_SECURE_ERASE: + rwbs[0] = 'E'; + break; + case REQ_OP_FLUSH: + rwbs[0] = 'F'; + break; + case REQ_OP_READ: + rwbs[0] = 'R'; + break; + default: + rwbs[0] = 'N'; + } + + if (op & REQ_FUA) { + rwbs[1] = 'F'; + } else { + rwbs[1] = '#'; + } + if (op & REQ_RAHEAD) { + rwbs[2] = 'A'; + } else { + rwbs[2] = '#'; + } + if (op & REQ_SYNC) { + rwbs[3] = 'S'; + } else { + rwbs[3] = '#'; + } + if (op & REQ_META) { + rwbs[4] = 'M'; + } else { + rwbs[4] = '#'; + } +} + +void update_new_data_in_start(struct stage_data *new_data, struct update_params *params) { + blk_fill_rwbs(new_data->io_type, params->cmd_flags); + if (new_data->bucket[params->update_bucket].start_range == params->curr_start_range){ + new_data->bucket[params->update_bucket].io_count += 1; + } else { + new_data->bucket[MAX_BUCKETS].io_count += new_data->bucket[params->update_bucket].io_count; + new_data->bucket[params->update_bucket].io_count = 1; + new_data->bucket[params->update_bucket].start_range = params->curr_start_range; + } +} + +void 
update_curr_data_in_start(struct stage_data *curr_data, struct update_params *params) { + if (curr_data && params) { + curr_data->start_count += 1; + curr_data->major = params->major; + curr_data->first_minor = params->first_minor; + blk_fill_rwbs(curr_data->io_type, params->cmd_flags); + if (curr_data->bucket[params->update_bucket].start_range == params->curr_start_range) { + curr_data->bucket[params->update_bucket].io_count += 1; + } else { + curr_data->bucket[MAX_BUCKETS].io_count += curr_data->bucket[params->update_bucket].io_count; + curr_data->bucket[params->update_bucket].io_count = 1; + } + curr_data->bucket[params->update_bucket].start_range = params->curr_start_range; + } +} + +void update_new_data_in_finish(struct stage_data *new_data, struct update_params *params) { + blk_fill_rwbs(new_data->io_type, params->cmd_flags); + if (new_data->bucket[params->update_bucket].start_range == params->curr_start_range){ + new_data->bucket[params->update_bucket].io_count = (new_data->bucket[params->update_bucket].io_count > 1) ? new_data->bucket[params->update_bucket].io_count - 1 : 0; + } else { + new_data->bucket[MAX_BUCKETS].io_count = (new_data->bucket[MAX_BUCKETS].io_count > 1) ? new_data->bucket[MAX_BUCKETS].io_count - 1 : 0; + } +} + +void update_curr_data_in_finish(struct stage_data *curr_data, struct update_params *params, u64 duration) { + if (curr_data && params) { + curr_data->finish_count += 1; + curr_data->major = params->major; + curr_data->first_minor = params->first_minor; + blk_fill_rwbs(curr_data->io_type, params->cmd_flags); + if (duration > DURATION_THRESHOLD) { + curr_data->finish_over_time += 1; + } + } +} + +static void init_io_counter(struct io_counter *counterp, int major, int first_minor) { + if (counterp) { + counterp->start_time = bpf_ktime_get_ns(); + counterp->major = major; + counterp->first_minor = first_minor; + } +} + + +u32 find_matching_tag_1_keys(int major, int minor) { + u32 key = 0; + struct stage_data *curr_data = bpf_map_lookup_elem(&tag_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 1; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&tag_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 2; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&tag_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_tag_2_keys(int major, int minor) { + u32 key = 3; + struct stage_data *curr_data = bpf_map_lookup_elem(&tag_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 4; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&tag_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 5; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&tag_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_tag_3_keys(int major, int minor) { + u32 key = 6; + struct stage_data *curr_data = bpf_map_lookup_elem(&tag_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 
key_2 = 7; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&tag_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 8; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&tag_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_tag_4_keys(int major, int minor) { + u32 key = 9; + struct stage_data *curr_data = bpf_map_lookup_elem(&tag_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 10; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&tag_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 11; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&tag_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_tag_5_keys(int major, int minor) { + u32 key = 12; + struct stage_data *curr_data = bpf_map_lookup_elem(&tag_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 13; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&tag_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 14; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&tag_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_blk_1_keys(int major, int minor) { + u32 key = 0; + struct stage_data *curr_data = bpf_map_lookup_elem(&blk_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 1; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&blk_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 2; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&blk_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_blk_2_keys(int major, int minor) { + u32 key = 3; + struct stage_data *curr_data = bpf_map_lookup_elem(&blk_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 4; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&blk_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 5; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&blk_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_blk_3_keys(int major, int minor) { + u32 key = 6; + struct stage_data *curr_data = bpf_map_lookup_elem(&blk_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 7; + struct 
stage_data *curr_data_2 = bpf_map_lookup_elem(&blk_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 8; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&blk_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_blk_4_keys(int major, int minor) { + u32 key = 9; + struct stage_data *curr_data = bpf_map_lookup_elem(&blk_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 10; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&blk_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 11; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&blk_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_blk_5_keys(int major, int minor) { + u32 key = 12; + struct stage_data *curr_data = bpf_map_lookup_elem(&blk_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 13; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&blk_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 14; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&blk_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_bio_1_keys(int major, int minor) { + u32 key = 0; + struct stage_data *curr_data = bpf_map_lookup_elem(&bio_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 1; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&bio_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 2; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&bio_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_bio_2_keys(int major, int minor) { + u32 key = 3; + struct stage_data *curr_data = bpf_map_lookup_elem(&bio_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 4; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&bio_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 5; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&bio_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_bio_3_keys(int major, int minor) { + u32 key = 6; + struct stage_data *curr_data = bpf_map_lookup_elem(&bio_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 7; + struct stage_data *curr_data_2 = 
bpf_map_lookup_elem(&bio_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 8; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&bio_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_bio_4_keys(int major, int minor) { + u32 key = 9; + struct stage_data *curr_data = bpf_map_lookup_elem(&bio_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 10; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&bio_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 11; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&bio_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_bio_5_keys(int major, int minor) { + u32 key = 12; + struct stage_data *curr_data = bpf_map_lookup_elem(&bio_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 13; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&bio_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 14; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&bio_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_wbt_1_keys(int major, int minor) { + u32 key = 0; + struct stage_data *curr_data = bpf_map_lookup_elem(&wbt_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 1; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&wbt_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 2; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&wbt_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_wbt_2_keys(int major, int minor) { + u32 key = 3; + struct stage_data *curr_data = bpf_map_lookup_elem(&wbt_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 4; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&wbt_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 5; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&wbt_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_wbt_3_keys(int major, int minor) { + u32 key = 6; + struct stage_data *curr_data = bpf_map_lookup_elem(&wbt_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 7; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&wbt_res, 
&key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 8; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&wbt_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_wbt_4_keys(int major, int minor) { + u32 key = 9; + struct stage_data *curr_data = bpf_map_lookup_elem(&wbt_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 10; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&wbt_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 11; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&wbt_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +u32 find_matching_wbt_5_keys(int major, int minor) { + u32 key = 12; + struct stage_data *curr_data = bpf_map_lookup_elem(&wbt_res, &key); + + if (curr_data != NULL && curr_data->major == major && curr_data->first_minor == minor) { + return key; + } + + u32 key_2 = 13; + struct stage_data *curr_data_2 = bpf_map_lookup_elem(&wbt_res, &key_2); + + if (curr_data_2 != NULL && curr_data_2->major == major && curr_data_2->first_minor == minor) { + return key_2; + } + + u32 key_3 = 14; + struct stage_data *curr_data_3 = bpf_map_lookup_elem(&wbt_res, &key_3); + + if (curr_data_3 != NULL && curr_data_3->major == major && curr_data_3->first_minor == minor) { + return key_3; + } + + return MAP_SIZE + 1; +} + +SEC("kprobe/blk_mq_start_request") +int kprobe_blk_mq_start_request(struct pt_regs *regs) +{ + struct request *rq = (struct request *)PT_REGS_PARM1(regs); + struct gendisk *curr_rq_disk = _(rq->rq_disk); + int major = _(curr_rq_disk->major); + int first_minor = _(curr_rq_disk->first_minor); + unsigned int cmd_flags = _(rq->cmd_flags); + + struct io_counter *counterp, zero = {}; + + u32 key = find_matching_blk_1_keys(major, first_minor); + if (key >= MAP_SIZE){ + key = find_matching_blk_2_keys(major, first_minor); + if (key >= MAP_SIZE){ + key = find_matching_blk_3_keys(major, first_minor); + if (key >= MAP_SIZE){ + key = find_matching_blk_4_keys(major, first_minor); + if (key >= MAP_SIZE){ + key = find_matching_blk_5_keys(major, first_minor); + if (key >= MAP_SIZE){ + return 0; + } + } + } + } + } + + init_io_counter(&zero, major, first_minor); + + counterp = bpf_map_lookup_elem(&blk_map, &rq); + if (counterp || major == 0) + return 0; + long err = bpf_map_update_elem(&blk_map, &rq, &zero, BPF_NOEXIST); + if (err) + return 0; + + u64 curr_start_range = zero.start_time / THRESHOLD / MAX_BUCKETS; + u64 update_bucket = curr_start_range % MAX_BUCKETS; + + struct update_params params = { + .major = major, + .first_minor = first_minor, + .cmd_flags = cmd_flags, + .update_bucket = update_bucket, + .curr_start_range = curr_start_range, + }; + + struct stage_data *curr_data; + curr_data = bpf_map_lookup_elem(&blk_res, &key); + if (!curr_data) { + struct stage_data new_data = { + .start_count = 1, + .finish_count = 0, + .finish_over_time = 0, + .duration = 0, + .major = major, + .first_minor = first_minor, + .io_type = "", + .bucket = { + [0] = {.start_range = 0, .io_count = 0}, + [1] = {.start_range = 0, .io_count = 0}, + }, + }; + 
update_new_data_in_start(&new_data, &params);
+        bpf_map_update_elem(&blk_res, &key, &new_data, 0);
+    } else {
+        update_curr_data_in_start(curr_data, &params);
+    }
+
+    return 0;
+}
+
+SEC("kprobe/blk_mq_free_request")
+int kprobe_blk_mq_free_request(struct pt_regs *regs)
+{
+    struct request *rq = (struct request *)PT_REGS_PARM1(regs);
+    struct gendisk *curr_rq_disk = _(rq->rq_disk);
+    int major = _(curr_rq_disk->major);
+    int first_minor = _(curr_rq_disk->first_minor);
+    unsigned int cmd_flags = _(rq->cmd_flags);
+
+    struct io_counter *counterp;
+    u32 key = find_matching_blk_1_keys(major, first_minor);
+    if (key >= MAP_SIZE){
+        key = find_matching_blk_2_keys(major, first_minor);
+        if (key >= MAP_SIZE){
+            key = find_matching_blk_3_keys(major, first_minor);
+            if (key >= MAP_SIZE){
+                key = find_matching_blk_4_keys(major, first_minor);
+                if (key >= MAP_SIZE){
+                    key = find_matching_blk_5_keys(major, first_minor);
+                    if (key >= MAP_SIZE){
+                        return 0;
+                    }
+                }
+            }
+        }
+    }
+
+    counterp = bpf_map_lookup_elem(&blk_map, &rq);
+
+    if (!counterp) {
+        return 0;
+    }
+
+    u64 duration = bpf_ktime_get_ns() - counterp->start_time;
+    u64 curr_start_range = counterp->start_time / THRESHOLD / MAX_BUCKETS;
+    u64 update_bucket = curr_start_range % MAX_BUCKETS;
+
+    struct update_params params = {
+        .major = major,
+        .first_minor = first_minor,
+        .cmd_flags = cmd_flags,
+        .update_bucket = update_bucket,
+        .curr_start_range = curr_start_range,
+    };
+
+    struct stage_data *curr_data;
+    curr_data = bpf_map_lookup_elem(&blk_res, &key);
+    if (curr_data == NULL && duration > DURATION_THRESHOLD) {
+        struct stage_data new_data = {
+            .start_count = 1,
+            .finish_count = 1,
+            .finish_over_time = 1,
+            .duration = 0,
+            .major = major,
+            .first_minor = first_minor,
+            .io_type = "",
+            .bucket = {
+                [0] = {.start_range = 0, .io_count = 0},
+                [1] = {.start_range = 0, .io_count = 0},
+            },
+        };
+        update_new_data_in_finish(&new_data, &params);
+        bpf_map_update_elem(&blk_res, &key, &new_data, 0);
+    } else if (curr_data == NULL) {
+        struct stage_data new_data = {
+            .start_count = 1,
+            .finish_count = 1,
+            .finish_over_time = 0,
+            .duration = 0,
+            .major = major,
+            .first_minor = first_minor,
+            .io_type = "",
+            .bucket = {
+                [0] = {.start_range = 0, .io_count = 0},
+                [1] = {.start_range = 0, .io_count = 0},
+            },
+        };
+        update_new_data_in_finish(&new_data, &params);
+        bpf_map_update_elem(&blk_res, &key, &new_data, 0);
+    } else {
+        if (curr_data->bucket[update_bucket].start_range == curr_start_range) {
+            curr_data->bucket[update_bucket].io_count = (curr_data->bucket[update_bucket].io_count > 1) ? curr_data->bucket[update_bucket].io_count - 1 : 0;
+        } else {
+            curr_data->bucket[MAX_BUCKETS].io_count = (curr_data->bucket[MAX_BUCKETS].io_count > 1) ? curr_data->bucket[MAX_BUCKETS].io_count - 1 : 0;
+        }
+        curr_data->duration += duration;
+        update_curr_data_in_finish(curr_data, &params, duration);
+    }
+
+    bpf_map_delete_elem(&blk_map, &rq);
+    return 0;
+}
+
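+/*
+ * bio stage: the same start/finish accounting as the rq_driver probes
+ * above, but keyed by the struct bio pointer (bio_map) and reported via
+ * bio_res. blk_mq_make_request marks submission, bio_endio completion.
+ */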
+SEC("kprobe/blk_mq_make_request")
+int kprobe_blk_mq_make_request(struct pt_regs *regs)
+{
+    struct bio *bio = (struct bio *)PT_REGS_PARM2(regs);
+    struct gendisk *curr_rq_disk = _(bio->bi_disk);
+    int major = _(curr_rq_disk->major);
+    int first_minor = _(curr_rq_disk->first_minor);
+    unsigned int cmd_flags = _(bio->bi_opf);
+
+    struct io_counter *counterp, zero = {};
+    u32 key = find_matching_bio_1_keys(major, first_minor);
+    if (key >= MAP_SIZE){
+        key = find_matching_bio_2_keys(major, first_minor);
+        if (key >= MAP_SIZE){
+            key = find_matching_bio_3_keys(major, first_minor);
+            if (key >= MAP_SIZE){
+                key = find_matching_bio_4_keys(major, first_minor);
+                if (key >= MAP_SIZE){
+                    key = find_matching_bio_5_keys(major, first_minor);
+                    if (key >= MAP_SIZE){
+                        return 0;
+                    }
+                }
+            }
+        }
+    }
+
+    init_io_counter(&zero, major, first_minor);
+
+    counterp = bpf_map_lookup_elem(&bio_map, &bio);
+    if (counterp || major == 0)
+        return 0;
+
+    long err = bpf_map_update_elem(&bio_map, &bio, &zero, BPF_NOEXIST);
+    if (err && err != -EEXIST)
+        return 0;
+
+    u64 curr_start_range = zero.start_time / THRESHOLD / MAX_BUCKETS;
+    u64 update_bucket = curr_start_range % MAX_BUCKETS;
+
+    struct update_params params = {
+        .major = major,
+        .first_minor = first_minor,
+        .cmd_flags = cmd_flags,
+        .update_bucket = update_bucket,
+        .curr_start_range = curr_start_range,
+    };
+
+    struct stage_data *curr_data;
+    curr_data = bpf_map_lookup_elem(&bio_res, &key);
+    if (curr_data == NULL) {
+        struct stage_data new_data = {
+            .start_count = 1,
+            .finish_count = 0,
+            .finish_over_time = 0,
+            .duration = 0,
+            .major = major,
+            .first_minor = first_minor,
+            .io_type = "",
+            .bucket = {
+                [0] = {.start_range = 0, .io_count = 0},
+                [1] = {.start_range = 0, .io_count = 0},
+            },
+        };
+        update_new_data_in_start(&new_data, &params);
+        bpf_map_update_elem(&bio_res, &key, &new_data, 0);
+    } else {
+        update_curr_data_in_start(curr_data, &params);
+    }
+
+    return 0;
+}
+
+SEC("kprobe/bio_endio")
+int kprobe_bio_endio(struct pt_regs *regs)
+{
+    struct bio *bio = (struct bio *)PT_REGS_PARM1(regs);
+    struct gendisk *curr_rq_disk = _(bio->bi_disk);
+    int major = _(curr_rq_disk->major);
+    int first_minor = _(curr_rq_disk->first_minor);
+    unsigned int cmd_flags = _(bio->bi_opf);
+
+    struct io_counter *counterp;
+    void *delete_map = NULL;
+    u32 key = find_matching_bio_1_keys(major, first_minor);
+    if (key >= MAP_SIZE){
+        key = find_matching_bio_2_keys(major, first_minor);
+        if (key >= MAP_SIZE){
+            key = find_matching_bio_3_keys(major, first_minor);
+            if (key >= MAP_SIZE){
+                key = find_matching_bio_4_keys(major, first_minor);
+                if (key >= MAP_SIZE){
+                    key = find_matching_bio_5_keys(major, first_minor);
+                    if (key >= MAP_SIZE){
+                        return 0;
+                    }
+                }
+            }
+        }
+    }
+
+    counterp = bpf_map_lookup_elem(&bio_map, &bio);
+
+    if (!counterp) {
+        return 0;
+    }
+
+    delete_map = &bio_map;
+
+    u64 duration = bpf_ktime_get_ns() - counterp->start_time;
+    u64 curr_start_range = counterp->start_time / THRESHOLD / MAX_BUCKETS;
+    u64 update_bucket = curr_start_range % MAX_BUCKETS;
+
+    struct update_params params = {
+        .major = major,
+        .first_minor = first_minor,
+        .cmd_flags = cmd_flags,
+        .update_bucket = update_bucket,
+        .curr_start_range = curr_start_range,
+    };
+
+    struct stage_data *curr_data;
+    curr_data = bpf_map_lookup_elem(&bio_res, &key);
+    if (curr_data == NULL && duration > DURATION_THRESHOLD) {
+        struct stage_data new_data = {
+            .start_count = 1,
+            .finish_count = 1,
+            .finish_over_time = 1,
+            .duration = 0,
+            .major = major,
+            .first_minor = first_minor,
+            .io_type = "",
+            .bucket = {
+                [0] = {.start_range = 0, .io_count = 0},
+                [1] = {.start_range = 0, .io_count = 0},
+            },
+        };
+        update_new_data_in_finish(&new_data, &params);
+        bpf_map_update_elem(&bio_res, &key, &new_data, 0);
+    } else if (curr_data == NULL) {
+        struct stage_data new_data = {
+            .start_count = 1,
+            .finish_count = 1,
+            .finish_over_time = 0,
+            .duration = 0,
+            .major = major,
+            .first_minor = first_minor,
+            .io_type = "",
+            .bucket = {
+                [0] = {.start_range = 0, .io_count = 0},
+                [1] = {.start_range = 0, .io_count = 0},
+            },
+        };
+        update_new_data_in_finish(&new_data, &params);
+        bpf_map_update_elem(&bio_res, &key, &new_data, 0);
+    } else {
+        if (curr_data->bucket[update_bucket].start_range == curr_start_range) {
+            curr_data->bucket[update_bucket].io_count = (curr_data->bucket[update_bucket].io_count > 1) ? curr_data->bucket[update_bucket].io_count - 1 : 0;
+        } else {
+            curr_data->bucket[MAX_BUCKETS].io_count = (curr_data->bucket[MAX_BUCKETS].io_count > 1) ? curr_data->bucket[MAX_BUCKETS].io_count - 1 : 0;
+        }
+        curr_data->duration += duration;
+        update_curr_data_in_finish(curr_data, &params, duration);
+    }
+
+    bpf_map_delete_elem(delete_map, &bio);
+    return 0;
+}
+
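+/*
+ * wbt stage: wbt_wait() blocks the submitting task while writeback
+ * throttling applies, so no bio pointer is directly available when the
+ * function returns. The entry probe therefore stashes the bio argument
+ * in wbt_args keyed by bpf_get_current_task(), and the return probe
+ * looks it up again to charge the time the task spent throttled.
+ */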
+SEC("kprobe/wbt_wait")
+int kprobe_wbt_wait(struct pt_regs *regs)
+{
+    u64 wbtkey = bpf_get_current_task();
+    u64 value = (u64)PT_REGS_PARM2(regs);
+    (void)bpf_map_update_elem(&wbt_args, &wbtkey, &value, BPF_ANY);
+    struct bio *bio = (struct bio *)value;
+    struct gendisk *curr_rq_disk = _(bio->bi_disk);
+    int major = _(curr_rq_disk->major);
+    int first_minor = _(curr_rq_disk->first_minor);
+    unsigned int cmd_flags = _(bio->bi_opf);
+
+    struct io_counter *counterp, zero = {};
+    u32 key = find_matching_wbt_1_keys(major, first_minor);
+    if (key >= MAP_SIZE){
+        key = find_matching_wbt_2_keys(major, first_minor);
+        if (key >= MAP_SIZE){
+            key = find_matching_wbt_3_keys(major, first_minor);
+            if (key >= MAP_SIZE){
+                key = find_matching_wbt_4_keys(major, first_minor);
+                if (key >= MAP_SIZE){
+                    key = find_matching_wbt_5_keys(major, first_minor);
+                    if (key >= MAP_SIZE){
+                        return 0;
+                    }
+                }
+            }
+        }
+    }
+
+    init_io_counter(&zero, major, first_minor);
+
+    counterp = bpf_map_lookup_elem(&wbt_map, &wbtkey);
+
+    if (counterp || major == 0)
+        return 0;
+    long err = bpf_map_update_elem(&wbt_map, &wbtkey, &zero, BPF_NOEXIST);
+    if (err)
+        return 0;
+
+    u64 curr_start_range = zero.start_time / THRESHOLD / MAX_BUCKETS;
+    u64 update_bucket = curr_start_range % MAX_BUCKETS;
+
+    struct update_params params = {
+        .major = major,
+        .first_minor = first_minor,
+        .cmd_flags = cmd_flags,
+        .update_bucket = update_bucket,
+        .curr_start_range = curr_start_range,
+    };
+
+    struct stage_data *curr_data;
+    curr_data = bpf_map_lookup_elem(&wbt_res, &key);
+    if (!curr_data) {
+        struct stage_data new_data = {
+            .start_count = 1,
+            .finish_count = 0,
+            .finish_over_time = 0,
+            .duration = 0,
+            .major = major,
+            .first_minor = first_minor,
+            .io_type = "",
+            .bucket = {
+                [0] = {.start_range = 0, .io_count = 0},
+                [1] = {.start_range = 0, .io_count = 0},
+            },
+        };
+        update_new_data_in_start(&new_data, &params);
+        bpf_map_update_elem(&wbt_res, &key, &new_data, 0);
+    } else {
+        update_curr_data_in_start(curr_data, &params);
+    }
+
+    return 0;
+}
+
+SEC("kretprobe/wbt_wait")
+int kretprobe_wbt_wait(struct pt_regs *regs)
+{
+    struct bio *bio = NULL;
+    u64 *wbtargs = NULL;
+    u64 wbtkey = bpf_get_current_task();
+    wbtargs = (u64 *)bpf_map_lookup_elem(&wbt_args, &wbtkey);
+    if (wbtargs == NULL) {
+        bpf_map_delete_elem(&wbt_args, &wbtkey);
+        return 0;
+    }
+    bio = (struct bio *)(*wbtargs);
+    struct gendisk *curr_rq_disk = _(bio->bi_disk);
+    int major = _(curr_rq_disk->major);
+    int first_minor = _(curr_rq_disk->first_minor);
+    unsigned int cmd_flags = _(bio->bi_opf);
+
+    struct io_counter *counterp;
+    u32 key = find_matching_wbt_1_keys(major, first_minor);
+    if (key >= MAP_SIZE){
+        key = find_matching_wbt_2_keys(major, first_minor);
+        if (key >= MAP_SIZE){
+            key = find_matching_wbt_3_keys(major, first_minor);
+            if (key >= MAP_SIZE){
+                key = find_matching_wbt_4_keys(major, first_minor);
+                if (key >= MAP_SIZE){
+                    key = find_matching_wbt_5_keys(major, first_minor);
+                    if (key >= MAP_SIZE){
+                        return 0;
+                    }
+                }
+            }
+        }
+    }
+
+    counterp = bpf_map_lookup_elem(&wbt_map, &wbtkey);
+
+    if (!counterp)
+        return 0;
+
+    u64 duration = bpf_ktime_get_ns() - counterp->start_time;
+    u64 curr_start_range = counterp->start_time / THRESHOLD / MAX_BUCKETS;
+    u64 update_bucket = curr_start_range % MAX_BUCKETS;
+
+    struct update_params params = {
+        .major = major,
+        .first_minor = first_minor,
+        .cmd_flags = cmd_flags,
+        .update_bucket = update_bucket,
+        .curr_start_range = curr_start_range,
+    };
+
+    struct stage_data *curr_data;
+    curr_data = bpf_map_lookup_elem(&wbt_res, &key);
+    if (curr_data == NULL && duration > DURATION_THRESHOLD) {
+        struct stage_data new_data = {
+            .start_count = 1,
+            .finish_count = 1,
+            .finish_over_time = 1,
+            .duration = 0,
+            .major = major,
+            .first_minor = first_minor,
+            .io_type = "",
+            .bucket = {
+                [0] = {.start_range = 0, .io_count = 0},
+                [1] = {.start_range = 0, .io_count = 0},
+            },
+        };
+        update_new_data_in_finish(&new_data, &params);
+        bpf_map_update_elem(&wbt_res, &key, &new_data, 0);
+    } else if (curr_data == NULL) {
+        struct stage_data new_data = {
+            .start_count = 1,
+            .finish_count = 1,
+            .finish_over_time = 0,
+            .duration = 0,
+            .io_type = "",
+            .major = major,
+            .first_minor = first_minor,
+            .bucket = {
+                [0] = {.start_range = 0, .io_count = 0},
+                [1] = {.start_range = 0, .io_count = 0},
+            },
+        };
+        update_new_data_in_finish(&new_data, &params);
+        bpf_map_update_elem(&wbt_res, &key, &new_data, 0);
+    } else {
+        if (curr_data->bucket[update_bucket].start_range == curr_start_range) {
+            curr_data->bucket[update_bucket].io_count = (curr_data->bucket[update_bucket].io_count > 1) ? curr_data->bucket[update_bucket].io_count - 1 : 0;
+        } else {
+            curr_data->bucket[MAX_BUCKETS].io_count = (curr_data->bucket[MAX_BUCKETS].io_count > 1) ? curr_data->bucket[MAX_BUCKETS].io_count - 1 : 0;
+        }
+        curr_data->duration += duration;
+        update_curr_data_in_finish(curr_data, &params, duration);
+    }
+
+    bpf_map_delete_elem(&wbt_map, &wbtkey);
+    bpf_map_delete_elem(&wbt_args, &wbtkey);
+    return 0;
+}
+
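+/*
+ * gettag stage: times blk_mq_get_tag(), i.e. waiting for a free driver
+ * tag. No request exists yet at this point, so the device is resolved
+ * from the blk_mq_alloc_data argument through
+ * q->backing_dev_info->owner->devt, and cmd_flags stays 0 because the
+ * operation type is unknown here. The same task-keyed stash pattern as
+ * the wbt probes is used, via tag_args.
+ */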
+SEC("kprobe/blk_mq_get_tag")
+int kprobe_blk_mq_get_tag(struct pt_regs *regs)
+{
+    u64 tagkey = bpf_get_current_task();
+    u64 value = (u64)PT_REGS_PARM1(regs);
+    (void)bpf_map_update_elem(&tag_args, &tagkey, &value, BPF_ANY);
+    struct blk_mq_alloc_data *bd = (struct blk_mq_alloc_data *)value;
+    struct request_queue *q = _(bd->q);
+    struct backing_dev_info *backing_dev_info = _(q->backing_dev_info);
+    struct device *owner = _(backing_dev_info->owner);
+    dev_t devt = _(owner->devt);
+    int major = MAJOR(devt);
+    int first_minor = MINOR(devt);
+    unsigned int cmd_flags = 0;
+
+    struct io_counter *counterp, zero = {};
+    u32 key = find_matching_tag_1_keys(major, first_minor);
+    if (key >= MAP_SIZE){
+        key = find_matching_tag_2_keys(major, first_minor);
+        if (key >= MAP_SIZE){
+            key = find_matching_tag_3_keys(major, first_minor);
+            if (key >= MAP_SIZE){
+                key = find_matching_tag_4_keys(major, first_minor);
+                if (key >= MAP_SIZE){
+                    key = find_matching_tag_5_keys(major, first_minor);
+                    if (key >= MAP_SIZE){
+                        return 0;
+                    }
+                }
+            }
+        }
+    }
+
+    init_io_counter(&zero, major, first_minor);
+
+    counterp = bpf_map_lookup_elem(&tag_map, &tagkey);
+    if (counterp || major == 0)
+        return 0;
+    long err = bpf_map_update_elem(&tag_map, &tagkey, &zero, BPF_NOEXIST);
+    if (err)
+        return 0;
+
+    u64 curr_start_range = zero.start_time / THRESHOLD / MAX_BUCKETS;
+    u64 update_bucket = curr_start_range % MAX_BUCKETS;
+
+    struct update_params params = {
+        .major = major,
+        .first_minor = first_minor,
+        .cmd_flags = cmd_flags,
+        .update_bucket = update_bucket,
+        .curr_start_range = curr_start_range,
+    };
+
+    struct stage_data *curr_data;
+    curr_data = bpf_map_lookup_elem(&tag_res, &key);
+    if (!curr_data) {
+        struct stage_data new_data = {
+            .start_count = 1,
+            .finish_count = 0,
+            .finish_over_time = 0,
+            .duration = 0,
+            .major = major,
+            .first_minor = first_minor,
+            .io_type = "",
+            .bucket = {
+                [0] = {.start_range = 0, .io_count = 0},
+                [1] = {.start_range = 0, .io_count = 0},
+            },
+        };
+        update_new_data_in_start(&new_data, &params);
+        bpf_map_update_elem(&tag_res, &key, &new_data, 0);
+    } else {
+        update_curr_data_in_start(curr_data, &params);
+    }
+
+    return 0;
+}
+
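+/*
+ * Return probe: re-resolves the device from the stashed allocation data,
+ * charges the elapsed tag-wait time to the matching tag_res slot, then
+ * drops both the tag_map counter and the stashed argument for this task.
+ */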
+SEC("kretprobe/blk_mq_get_tag")
+int kretprobe_blk_mq_get_tag(struct pt_regs *regs)
+{
+    u64 tagkey = bpf_get_current_task();
+    u64 *tagargs = NULL;
+    struct blk_mq_alloc_data *bd = NULL;
+
+    tagargs = (u64 *)bpf_map_lookup_elem(&tag_args, &tagkey);
+    if (tagargs == NULL) {
+        bpf_map_delete_elem(&tag_args, &tagkey);
+        return 0;
+    }
+    bd = (struct blk_mq_alloc_data *)(*tagargs);
+    struct request_queue *q = _(bd->q);
+    struct backing_dev_info *backing_dev_info = _(q->backing_dev_info);
+    struct device *owner = _(backing_dev_info->owner);
+    dev_t devt = _(owner->devt);
+    int major = MAJOR(devt);
+    int first_minor = MINOR(devt);
+    unsigned int cmd_flags = 0;
+
+    struct io_counter *counterp;
+    u32 key = find_matching_tag_1_keys(major, first_minor);
+    if (key >= MAP_SIZE){
+        key = find_matching_tag_2_keys(major, first_minor);
+        if (key >= MAP_SIZE){
+            key = find_matching_tag_3_keys(major, first_minor);
+            if (key >= MAP_SIZE){
+                key = find_matching_tag_4_keys(major, first_minor);
+                if (key >= MAP_SIZE){
+                    key = find_matching_tag_5_keys(major, first_minor);
+                    if (key >= MAP_SIZE){
+                        return 0;
+                    }
+                }
+            }
+        }
+    }
+
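+/*
+ * kprobe-type programs need a GPL-compatible license for helpers such as
+ * bpf_probe_read, and on older kernels the "version" section must match
+ * the running LINUX_VERSION_CODE for the loader to accept the program.
+ */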
+    counterp = bpf_map_lookup_elem(&tag_map, &tagkey);
+
+    if (!counterp)
+        return 0;
+
+    u64 duration = bpf_ktime_get_ns() - counterp->start_time;
+    u64 curr_start_range = counterp->start_time / THRESHOLD / MAX_BUCKETS;
+    u64 update_bucket = curr_start_range % MAX_BUCKETS;
+
+    struct update_params params = {
+        .major = major,
+        .first_minor = first_minor,
+        .cmd_flags = cmd_flags,
+        .update_bucket = update_bucket,
+        .curr_start_range = curr_start_range,
+    };
+
+    struct stage_data *curr_data;
+    curr_data = bpf_map_lookup_elem(&tag_res, &key);
+    if (curr_data == NULL && duration > DURATION_THRESHOLD) {
+        struct stage_data new_data = {
+            .start_count = 1,
+            .finish_count = 1,
+            .finish_over_time = 1,
+            .duration = 0,
+            .major = major,
+            .first_minor = first_minor,
+            .io_type = "",
+            .bucket = {
+                [0] = {.start_range = 0, .io_count = 0},
+                [1] = {.start_range = 0, .io_count = 0},
+            },
+        };
+        update_new_data_in_finish(&new_data, &params);
+        bpf_map_update_elem(&tag_res, &key, &new_data, 0);
+    } else if (curr_data == NULL) {
+        struct stage_data new_data = {
+            .start_count = 1,
+            .finish_count = 1,
+            .finish_over_time = 0,
+            .duration = 0,
+            .major = major,
+            .first_minor = first_minor,
+            .io_type = "",
+            .bucket = {
+                [0] = {.start_range = 0, .io_count = 0},
+                [1] = {.start_range = 0, .io_count = 0},
+            },
+        };
+        update_new_data_in_finish(&new_data, &params);
+        bpf_map_update_elem(&tag_res, &key, &new_data, 0);
+    } else {
+        if (curr_data->bucket[update_bucket].start_range == curr_start_range) {
+            curr_data->bucket[update_bucket].io_count = (curr_data->bucket[update_bucket].io_count > 1) ? curr_data->bucket[update_bucket].io_count - 1 : 0;
+        } else {
+            curr_data->bucket[MAX_BUCKETS].io_count = (curr_data->bucket[MAX_BUCKETS].io_count > 1) ? curr_data->bucket[MAX_BUCKETS].io_count - 1 : 0;
+        }
+        curr_data->duration += duration;
+        update_curr_data_in_finish(curr_data, &params, duration);
+    }
+    bpf_map_delete_elem(&tag_map, &tagkey);
+    bpf_map_delete_elem(&tag_args, &tagkey);
+    return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
+
diff --git a/src/c/ebpf_collector/ebpf_collector.c b/src/c/ebpf_collector/ebpf_collector.c
new file mode 100644
index 0000000..a949ae8
--- /dev/null
+++ b/src/c/ebpf_collector/ebpf_collector.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ * Description: ebpf collector program
+ * Author: Zhang Nan
+ * Create: 2024-09-27
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "ebpf_collector.h"
+
+#define BLK_MAP (map_fd[0])
+#define BLK_RES (map_fd[1])
+#define BIO_MAP (map_fd[2])
+#define BIO_RES (map_fd[3])
+#define WBT_MAP (map_fd[4])
+#define WBT_RES (map_fd[5])
+#define TAG_MAP (map_fd[7])
+#define TAG_RES (map_fd[8])
+#define BPF_FILE "/usr/lib/ebpf_collector.bpf.o"
+
+typedef struct {
+    int major;
+    int minor;
+} DeviceInfo;
+
+static volatile bool exiting;
+
+const char argp_program_doc[] =
+"Show block device I/O pattern.\n"
+"\n"
+"USAGE: ebpf_collector [--help]\n"
+"\n"
+"EXAMPLES:\n"
+"    ebpf_collector              # show block I/O pattern\n";
+
+static const struct argp_option opts[] = {
+    { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" },
+    {},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state) {
+    static int pos_args;
+
+    switch (key) {
+    case 'h':
+        argp_state_help(state, stderr, ARGP_HELP_STD_HELP);
+        break;
+    default:
+        return ARGP_ERR_UNKNOWN;
+    }
+    return 0;
+}
+
+static void sig_handler(int sig)
+{
+    exiting = true;
+}
+
+char* extract_device_name(const char *path) {
+    const char *dev_dir = "/dev/";
+    char *name = strrchr(path, '/') + 1;
+    if (strncmp(dev_dir, path, strlen(dev_dir)) == 0) {
+        return strdup(name);
+    }
+    return NULL;
+}
+
+char* find_device_name(dev_t dev) {
+    DIR *dir;
+    struct dirent *entry;
+    struct stat sb;
+    char *device_name = NULL;
+    char path[1024];
+
+    dir = opendir("/dev");
+    if (dir == NULL) {
+        perror("Failed to open /dev");
+        return NULL;
+    }
+
+    while ((entry = readdir(dir)) != NULL) {
+        snprintf(path, sizeof(path), "/dev/%s", entry->d_name);
+
+        if (entry->d_type == DT_DIR || entry->d_type == DT_LNK) {
+            continue;
+        }
+
+        if (stat(path, &sb) == -1) {
+            continue;
+        }
+
+        if (major(sb.st_rdev) == major(dev) && minor(sb.st_rdev) == minor(dev)) {
+            device_name = extract_device_name(path);
+            break;
+        }
+    }
+
+    closedir(dir);
+    return device_name;
+}
+
+static int print_map_res(int map_res, char *stage, int map_size)
+{
+    struct stage_data counter;
+    int key = 0;
+
+    struct sysinfo info;
+    sysinfo(&info);
+
+    for (key = 0; key < map_size; key++) {
+        int err;
+        err = bpf_map_lookup_elem(map_res, &key, &counter);
+        if (err < 0) {
+            fprintf(stderr, "failed to lookup %s map_res: %d\n", stage, err);
+            return -1;
+        }
+
+        size_t length = strlen(counter.io_type);
+        char io_type;
+        if (length > 0) {
+            io_type = counter.io_type[0];
+        } else {
+            io_type = '\0';
+        }
+        int major = counter.major;
+        int first_minor = counter.first_minor;
+        dev_t dev = makedev(major, first_minor);
+        char *device_name = find_device_name(dev);
+        if (device_name && io_type) {
+            printf("%-7s %10llu %10llu %u %c %s\n",
+                stage,
+                counter.finish_count,
+                counter.duration,
+                counter.bucket[MAX_BUCKETS].io_count,
+                io_type,
+                device_name
+            );
+            fflush(stdout);
+        }
+    }
+
+    return 0;
+}
+
+int init_map(int map_fd, const char *map_name, int map_size, DeviceInfo *devices) {
+    struct stage_data init_data = {0};
+    memset(init_data.io_type, 0, sizeof(init_data.io_type));
+    memset(init_data.bucket, 0, sizeof(init_data.bucket));
+
+    for (int i = 0; i < map_size; i++) {
+        init_data.major = devices[i].major;
+        init_data.first_minor = devices[i].minor;
+        if (bpf_map_update_elem(map_fd, &i, &init_data, BPF_ANY) != 0) {
+            printf("Failed to initialize map %s at index %d\n", map_name, i);
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+int main(int argc, char **argv) {
+    struct partitions *partitions = NULL;
+    const struct partition *partition;
+    static const struct argp argp = {
+        .options = opts,
+        .parser = parse_arg,
+        .doc = argp_program_doc,
+    };
+    int err;
+    char filename[256];
+    DIR *dir;
+    struct dirent *entry;
+    char path[1024];
+    int major, minor;
+    DeviceInfo devices[MAP_SIZE];
+    int device_count = 0;
+    struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+    setrlimit(RLIMIT_MEMLOCK, &r);
+
+    err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
+    if (err)
+        return err;
+
+    snprintf(filename, sizeof(filename), BPF_FILE);
+
+    if (load_bpf_file(filename)) {
+        return 1;
+    }
+
+    signal(SIGINT, sig_handler);
+
+    dir = opendir("/dev");
+    if (dir == NULL) {
+        printf("Failed to open /dev directory");
+        return EXIT_FAILURE;
+    }
+
+    while ((entry = readdir(dir)) != NULL) {
+        if (entry->d_type == DT_BLK) {
+            snprintf(path, sizeof(path), "/dev/%s", entry->d_name);
+            struct stat statbuf;
+            if (lstat(path, &statbuf) == 0) {
+                if (S_ISBLK(statbuf.st_mode)) {
+                    devices[device_count].major = major(statbuf.st_rdev);
+                    devices[device_count].minor = minor(statbuf.st_rdev);
+                    device_count++;
+                    if (device_count >= MAP_SIZE) {
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    closedir(dir);
+
+    if (init_map(BLK_RES, "blk_res_map", device_count, devices) != 0) {
+        return 1;
+    }
+    if (init_map(BIO_RES, "bio_res_map", device_count, devices) != 0) {
+        return 1;
+    }
+    if (init_map(WBT_RES, "wbt_res_map", device_count, devices) != 0) {
+        return 1;
+    }
+    if (init_map(TAG_RES, "tag_res_map", device_count, devices) != 0) {
+        return 1;
+    }
+
+    for (;;) {
+
+        sleep(1);
+
+        err = print_map_res(BLK_RES, "rq_driver", device_count);
+        if (err)
+            break;
+
+        err = print_map_res(BIO_RES, "bio", device_count);
+        if (err)
+            break;
+
+        err = print_map_res(TAG_RES, "gettag", device_count);
+        if (err)
+            break;
+
+        err = print_map_res(WBT_RES, "wbt", device_count);
+        if (err)
+            break;
+
+        if (exiting)
+            break;
+    }
+
+    return -err;
+}
diff --git a/src/c/ebpf_collector/ebpf_collector.h b/src/c/ebpf_collector/ebpf_collector.h
new file mode 100644
index 0000000..1ae33de
--- /dev/null
+++ b/src/c/ebpf_collector/ebpf_collector.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ * Description: ebpf collector program + * Author: Zhang Nan + * Create: 2024-09-27 + */ +#ifndef __EBPFCOLLECTOR_H +#define __EBPFCOLLECTOR_H + +typedef long long unsigned int u64; +typedef unsigned int u32; + +#define MAX_BUCKETS 1 +#define THRESHOLD 1000 +#define DURATION_THRESHOLD 500000000 + +#define RWBS_LEN 8 + +#define REQ_OP_BITS 8 +#define REQ_OP_MASK ((1 << REQ_OP_BITS) - 1) +#define REQ_FUA (1ULL << __REQ_FUA) +#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) +#define REQ_SYNC (1ULL << __REQ_SYNC) +#define REQ_META (1ULL << __REQ_META) +#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) +#define REQ_OP_READ 0 +#define REQ_OP_WRITE 1 +#define REQ_OP_FLUSH 2 +#define REQ_OP_DISCARD 3 +#define REQ_OP_SECURE_ERASE 5 +#define REQ_OP_WRITE_SAME 7 +#define MAP_SIZE 128 + +enum stage_type { + BIO=0, + WBT, + GET_TAG, + DEADLINE, + BFQ, + KYBER, + RQ_DRIVER, + MAX_STAGE_TYPE, +}; + +struct time_bucket { + u64 start_range; + u32 io_count; +}; + +struct stage_data { + u64 start_count; + u64 finish_count; + u64 finish_over_time; + u64 duration; + int major; + int first_minor; + char io_type[RWBS_LEN]; + struct time_bucket bucket[MAX_BUCKETS+1]; +}; + +struct io_counter { + u64 duration; + u64 start_time; + u32 isend; + int major; + int first_minor; +}; + +struct update_params { + int major; + int first_minor; + unsigned int cmd_flags; + u64 update_bucket; + u64 curr_start_range; +}; + +#endif /* __EBPFCOLLECTOR_H */ diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py index 019d174..e45947a 100644 --- a/src/python/sentryCollector/collect_io.py +++ b/src/python/sentryCollector/collect_io.py @@ -16,12 +16,18 @@ import os import time import logging import threading +import subprocess +from typing import Union from .collect_config import CollectConfig Io_Category = ["read", "write", "flush", "discard"] IO_GLOBAL_DATA = {} IO_CONFIG_DATA = [] +EBPF_GLOBAL_DATA = [] +EBPF_PROCESS = None +EBPF_STAGE_LIST = ["wbt", "rq_driver", "bio", "gettag"] +EBPF_SUPPORT_VERSION = ["4.19.90"] class IoStatus(): TOTAL = 0 @@ -41,6 +47,8 @@ class CollectIo(): self.disk_map_stage = {} self.window_value = {} + self.ebpf_base_path = 'ebpf_collector' + self.loop_all = False if disk_str == "default": @@ -62,7 +70,7 @@ class CollectIo(): logging.error("The file %s does not exist", stats_file) return -1 except Exception as e: - logging.error("An error occurred3: %s", e) + logging.error("An error occurred: %s", e) return -1 curr_value = lines.strip().split('\n') @@ -193,33 +201,109 @@ class CollectIo(): IO_GLOBAL_DATA[disk_name] = {} return len(IO_GLOBAL_DATA) != 0 - - def main_loop(self): - logging.info("collect io thread start") + + def is_ebpf_avaliable(self): + with open('/proc/version', 'r') as f: + kernel_version = f.read().split()[2] + major_version = kernel_version.split('-')[0] + + base_path = '/sys/kernel/debug/block' + for disk_name in os.listdir(base_path): + if not self.loop_all and disk_name not in self.disk_list: + continue + self.disk_map_stage[disk_name] = EBPF_STAGE_LIST + self.window_value[disk_name] = {} + IO_GLOBAL_DATA[disk_name] = {} - if not self.is_kernel_avaliable() or len(self.disk_map_stage) == 0: - logging.warning("no disks meet the requirements. 
collect io thread exit") - return - for disk_name, stage_list in self.disk_map_stage.items(): for stage in stage_list: - self.window_value[disk_name][stage] = [] + self.window_value[disk_name][stage] = {} IO_GLOBAL_DATA[disk_name][stage] = {} for category in Io_Category: IO_GLOBAL_DATA[disk_name][stage][category] = [] + self.window_value[disk_name][stage][category] = [[0,0,0], [0,0,0]] - while True: - start_time = time.time() + return major_version in EBPF_SUPPORT_VERSION and os.path.exists('/usr/bin/ebpf_collector') and len(IO_GLOBAL_DATA) != 0 + + def get_ebpf_raw_data( + self + ) -> None: + global EBPF_PROCESS + global EBPF_GLOBAL_DATA + while True: if self.stop_event.is_set(): logging.debug("collect io thread exit") return + line = EBPF_PROCESS.stdout.readline() + if not line: + logging.info("no ebpf data found, wait for collect") + break + EBPF_GLOBAL_DATA.append(line.strip()) + time.sleep(0.1) + + def update_ebpf_collector_data( + self, + ) -> None: + global EBPF_GLOBAL_DATA + while True: + if self.stop_event.is_set(): + logging.debug("collect io thread exit") + return + if EBPF_GLOBAL_DATA: + for data in EBPF_GLOBAL_DATA: + data_list = data.split() + stage, finish_count, latency, io_dump, io_type ,disk_name = data_list + if disk_name not in self.window_value: + continue + io_type = self.get_ebpf_io_type(io_type) + if not io_type: + continue + if (len(self.window_value[disk_name][stage][io_type])) >= 2: + self.window_value[disk_name][stage][io_type].pop() + self.window_value[disk_name][stage][io_type].append([int(finish_count), int(latency), int(io_dump)]) + EBPF_GLOBAL_DATA.clear() + time.sleep(0.1) + + def get_ebpf_io_type( + self, + io_type: str + ) -> str: + io_type_mapping = { + "R": "read", + "W": "write", + "F": "flush", + "D": "discard" + } + io_type = io_type_mapping.get(io_type, None) + return io_type + + def append_ebpf_period_data( + self, + ) -> None: + global IO_GLOBAL_DATA + while True: + if self.stop_event.is_set(): + logging.debug("collect io thread exit") + return + start_time = time.time() for disk_name, stage_list in self.disk_map_stage.items(): - if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: - continue - self.append_period_lat(disk_name, stage_list) - + for stage in stage_list: + for io_type in Io_Category: + if len(self.window_value[disk_name][stage][io_type]) < 2: + return + if (len(IO_GLOBAL_DATA[disk_name][stage][io_type])) >= self.max_save: + IO_GLOBAL_DATA[disk_name][stage][io_type].pop() + curr_finish_count, curr_latency, curr_io_dump_count = self.window_value[disk_name][stage][io_type][-1] + prev_finish_count, prev_latency, prev_io_dump_count = self.window_value[disk_name][stage][io_type][-2] + self.window_value[disk_name][stage][io_type].pop(0) + self.window_value[disk_name][stage][io_type].insert(1, self.window_value[disk_name][stage][io_type][0]) + curr_lat = self.get_ebpf_latency_value(curr_latency=curr_latency, prev_latency=prev_latency, curr_finish_count=curr_finish_count, prev_finish_count=prev_finish_count) + curr_iops = self.get_ebpf_iops(curr_finish_count=curr_finish_count, prev_finish_count=prev_finish_count) + curr_io_length = self.get_ebpf_io_length(curr_latency=curr_latency, prev_latency=prev_latency) + curr_io_dump = self.get_ebpf_io_dump(curr_io_dump_count=curr_io_dump_count, prev_io_dump_count=prev_io_dump_count) + IO_GLOBAL_DATA[disk_name][stage][io_type].insert(0, [curr_lat, curr_iops, curr_io_length, curr_io_dump]) elapsed_time = time.time() - start_time sleep_time = self.period_time - elapsed_time if sleep_time < 0: @@ 
-231,6 +315,133 @@ class CollectIo(): time.sleep(1) sleep_time -= 1 time.sleep(sleep_time) + + def get_ebpf_latency_value( + self, + curr_latency: int, + prev_latency: int, + curr_finish_count: int, + prev_finish_count: int + ) -> Union[int, float]: + finish = curr_finish_count - prev_finish_count + lat_time = curr_latency - prev_latency + if finish <= 0 or lat_time <= 0: + return 0 + value = lat_time / finish / 1000 / 1000 + if value.is_integer(): + return int(value) + else: + return round(value, 1) + + def get_ebpf_iops( + self, + curr_finish_count: int, + prev_finish_count: int + ) -> Union[int, float]: + finish = curr_finish_count - prev_finish_count + if finish <= 0: + return 0 + value = finish / self.period_time / 1000 / 1000 + if value.is_integer(): + return int(value) + else: + return round(value, 1) + + def get_ebpf_io_length( + self, + curr_latency: int, + prev_latency: int, + ) -> Union[int, float]: + lat_time = curr_latency - prev_latency + if lat_time <= 0: + return 0 + value = lat_time / self.period_time + if value.is_integer(): + return int(value) + else: + return round(value, 1) + + def get_ebpf_io_dump( + self, + curr_io_dump_count: int, + prev_io_dump_count: int + ) -> Union[int, float]: + io_dump_count = curr_io_dump_count - prev_io_dump_count + if io_dump_count <= 0: + return 0 + value = io_dump_count + return int(value) + + def start_ebpf_subprocess( + self + ) -> None: + global EBPF_PROCESS + EBPF_PROCESS = subprocess.Popen(self.ebpf_base_path, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + + def stop_ebpf_subprocess( + self + ) -> None: + global EBPF_PROCESS + if EBPF_PROCESS: + EBPF_PROCESS.terminate() + EBPF_PROCESS.wait() + logging.info("ebpf collector thread exit") + + def main_loop(self): + global IO_GLOBAL_DATA + logging.info("collect io thread start") + + if self.is_kernel_avaliable() and len(self.disk_map_stage) != 0: + for disk_name, stage_list in self.disk_map_stage.items(): + for stage in stage_list: + self.window_value[disk_name][stage] = [] + IO_GLOBAL_DATA[disk_name][stage] = {} + for category in Io_Category: + IO_GLOBAL_DATA[disk_name][stage][category] = [] + + while True: + start_time = time.time() + + if self.stop_event.is_set(): + logging.debug("collect io thread exit") + return + + for disk_name, stage_list in self.disk_map_stage.items(): + if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: + continue + self.append_period_lat(disk_name, stage_list) + + elapsed_time = time.time() - start_time + sleep_time = self.period_time - elapsed_time + if sleep_time < 0: + continue + while sleep_time > 1: + if self.stop_event.is_set(): + logging.debug("collect io thread exit") + return + time.sleep(1) + sleep_time -= 1 + time.sleep(sleep_time) + elif self.is_ebpf_avaliable(): + self.start_ebpf_subprocess() + + thread_get_data = threading.Thread(target=self.get_ebpf_raw_data) + thread_update_data = threading.Thread(target=self.update_ebpf_collector_data) + thread_append_data = threading.Thread(target=self.append_ebpf_period_data) + + thread_get_data.start() + thread_update_data.start() + thread_append_data.start() + + thread_get_data.join() + thread_update_data.join() + thread_append_data.join() + + self.stop_ebpf_subprocess() + logging.info("ebpf collector thread exits") + else: + logging.warning("fail to start ebpf collector thread. 
collect io thread exits") + return # set stop event, notify thread exit def stop_thread(self): diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py index b6b3b28..26a60c5 100644 --- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py @@ -114,7 +114,7 @@ def read_config_lat_iodump(io_dic, config): common_param = {} lat_sec = None if not config.has_section("latency"): - logging.warning("Cannot find algorithm section in config file") + logging.warning("Cannot find latency section in config file") else: lat_sec = config["latency"] @@ -122,7 +122,7 @@ def read_config_lat_iodump(io_dic, config): if not config.has_section("iodump"): logging.warning("Cannot find iodump section in config file") else: - lat_sec = config["iodump"] + iodump_sec = config["iodump"] if not lat_sec and not iodump_sec: return common_param -- Gitee From 0ed8f361ea590a9ba037a9df3d42e65033a2d7ee Mon Sep 17 00:00:00 2001 From: PshySimon Date: Thu, 26 Sep 2024 16:12:25 +0800 Subject: [PATCH 21/76] add pyxalarm and pySentryNotify, add multi users support for xalarmd and adapt libxalarm --- src/libso/xalarm/register_xalarm.c | 41 ++---- src/libso/xalarm/register_xalarm.h | 10 +- src/python/xalarm/register_xalarm.py | 192 +++++++++++++++++++++++++++ src/python/xalarm/sentry_notify.py | 71 ++++++++++ src/python/xalarm/xalarm_api.py | 18 ++- src/python/xalarm/xalarm_server.py | 40 +++++- src/python/xalarm/xalarm_transfer.py | 96 ++++++++++++-- 7 files changed, 408 insertions(+), 60 deletions(-) create mode 100644 src/python/xalarm/register_xalarm.py create mode 100644 src/python/xalarm/sentry_notify.py diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c index 152c078..21a419f 100644 --- a/src/libso/xalarm/register_xalarm.c +++ b/src/libso/xalarm/register_xalarm.c @@ -35,7 +35,7 @@ #define ALARM_SOCKET_PERMISSION 0700 #define TIME_UNIT_MILLISECONDS 1000 -#define MAX_PARAS_LEN 511 +#define MAX_PARAS_LEN 1023 #define MIN_ALARM_ID 1001 #define MAX_ALARM_ID (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) @@ -91,7 +91,7 @@ static int create_unix_socket(const char *path) return -1; } - fd = socket(AF_UNIX, SOCK_DGRAM, 0); + fd = socket(AF_UNIX, SOCK_STREAM, 0); if (fd < 0) { printf("socket failed:%s\n", strerror(errno)); return -1; @@ -103,14 +103,6 @@ static int create_unix_socket(const char *path) goto release_socket; } - if (access(PATH_REG_ALARM, F_OK) == 0) { - ret = unlink(PATH_REG_ALARM); - if (ret != 0) { - printf("unlink register socket file failed\n"); - goto release_socket; - } - } - if (access(DIR_XALARM, F_OK) == -1) { if (mkdir(DIR_XALARM, ALARM_DIR_PERMISSION) == -1) { printf("mkdir %s failed\n", DIR_XALARM); @@ -120,32 +112,22 @@ static int create_unix_socket(const char *path) if (memset(&alarm_addr, 0, sizeof(alarm_addr)) == NULL) { printf("create_unix_socket: memset alarm_addr failed, ret: %d\n", ret); - goto remove_dir; + goto release_socket; } alarm_addr.sun_family = AF_UNIX; strncpy(alarm_addr.sun_path, path, sizeof(alarm_addr.sun_path) - 1); - if (bind(fd, (struct sockaddr *)&alarm_addr, sizeof(alarm_addr.sun_family) + strlen(alarm_addr.sun_path)) < 0) { - printf("bind socket failed:%s\n", strerror(errno)); - goto remove_dir; + if (connect(fd, (struct sockaddr*)&alarm_addr, sizeof(alarm_addr)) == -1) { + printf("create_unix_socket: connect alarm_addr failed, ret: %d\n", ret); + goto release_socket; } if (chmod(path, ALARM_SOCKET_PERMISSION) < 0) { printf("chmod 
%s failed: %s\n", path, strerror(errno)); - goto unlink_sockfile; + goto release_socket; } return fd; -unlink_sockfile: - ret = unlink(PATH_REG_ALARM); - if (ret != 0) { - printf("unlink register socket file failed\n"); - } -remove_dir: - ret = rmdir(DIR_XALARM); - if (ret != 0) { - printf("rmdir %s failed: %s\n", path, strerror(errno)); - } release_socket: (void)close(fd); @@ -271,8 +253,6 @@ int xalarm_Register(alarm_callback_func callback, struct alarm_subscription_info void xalarm_UnRegister(int client_id) { - int ret; - if (!g_register_info.is_registered) { printf("%s: alarm has not registered\n", __func__); return; @@ -292,10 +272,6 @@ void xalarm_UnRegister(int client_id) if (g_register_info.register_fd != -1) { (void)close(g_register_info.register_fd); g_register_info.register_fd = -1; - ret = unlink(PATH_REG_ALARM); - if (ret != 0) { - printf("%s: unlink register socket file failed\n", __func__); - } } memset(g_register_info.alarm_enable_bitmap, 0, MAX_NUM_OF_ALARM_ID * sizeof(char)); @@ -357,7 +333,7 @@ int xalarm_Report(unsigned short usAlarmId, unsigned char ucAlarmLevel, struct sockaddr_un alarm_addr; if ((usAlarmId < MIN_ALARM_ID || usAlarmId > MAX_ALARM_ID) || - (ucAlarmLevel < ALARM_LEVEL_FATAL || ucAlarmLevel > ALARM_LEVEL_DEBUG) || + (ucAlarmLevel < MINOR_ALM || ucAlarmLevel > CRITICAL_ALM) || (ucAlarmType < ALARM_TYPE_OCCUR || ucAlarmType > ALARM_TYPE_RECOVER)) { fprintf(stderr, "%s: alarm info invalid\n", __func__); return -1; @@ -666,3 +642,4 @@ int report_result(const char *task_name, enum RESULT_LEVEL result_level, const c return RETURE_CODE_SUCCESS; } + diff --git a/src/libso/xalarm/register_xalarm.h b/src/libso/xalarm/register_xalarm.h index 1f26c6a..fef9482 100644 --- a/src/libso/xalarm/register_xalarm.h +++ b/src/libso/xalarm/register_xalarm.h @@ -11,7 +11,7 @@ #include #include -#define ALARM_INFO_MAX_PARAS_LEN 512 +#define ALARM_INFO_MAX_PARAS_LEN 1024 #define MAX_STRERROR_SIZE 1024 #define MAX_ALARM_TYEPS 1024 #define MIN_ALARM_ID 1001 @@ -19,11 +19,9 @@ #define MEMORY_ALARM_ID 1001 -#define ALARM_LEVEL_FATAL 1 -#define ALARM_LEVEL_ERROR 2 -#define ALARM_LEVEL_WARNING 3 -#define ALARM_LEVEL_INFO 4 -#define ALARM_LEVEL_DEBUG 5 +#define MINOR_ALM 1 +#define MAJOR_ALM 2 +#define CRITICAL_ALM 3 #define ALARM_TYPE_OCCUR 1 #define ALARM_TYPE_RECOVER 2 diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py new file mode 100644 index 0000000..e58343d --- /dev/null +++ b/src/python/xalarm/register_xalarm.py @@ -0,0 +1,192 @@ +import os +import sys +import socket +import logging +import threading +import time +import fcntl +import inspect +from struct import error as StructParseError + +from .xalarm_api import Xalarm, alarm_bin2stu + + +ALARM_REPORT_LEN = 1048 +MAX_NUM_OF_ALARM_ID=128 +MIN_ALARM_ID = 1001 +MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) +DIR_XALARM = "/var/run/xalarm" +PATH_REG_ALARM = "/var/run/xalarm/alarm" +PATH_REPORT_ALARM = "/var/run/xalarm/report" +ALARM_DIR_PERMISSION = 0o0750 +ALARM_REG_SOCK_PERMISSION = 0o0700 +ALARM_SOCKET_PERMISSION = 0o0700 +TIME_UNIT_MILLISECONDS = 1000 +ALARM_REGISTER_INFO = None + + +class AlarmRegister: + def __init__(self, id_filter: list[bool], callback: callable): + self.id_filter = id_filter + self.callback = callback + self.socket = self.create_unix_socket() + self.is_registered = True + self.thread = threading.Thread(target=self.alarm_recv) + self.thread_should_stop = False + + def check_params(self) -> bool: + if (len(self.id_filter) != MAX_NUM_OF_ALARM_ID): + 
sys.stderr.write("check_params: invalid param id_filter\n") + return False + + sig = inspect.signature(self.callback) + if len(sig.parameters) != 1: + sys.stderr.write("check_params: invalid param callback\n") + return False + + if self.socket is None: + sys.stderr.write("check_params: scoket create failed\n") + return False + return True + + def set_id_filter(self, id_filter: list[bool]) -> bool: + if (len(id_filter) > MAX_NUM_OF_ALARM_ID): + sys.stderr.write("set_id_filter: invalid param id_filter\n") + return False + self.id_filter = id_filter + + def id_is_registered(self, alarm_id) -> bool: + if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID: + return False + return self.id_filter[alarm_id - MIN_ALARM_ID] + + def put_alarm_info(self, alarm_info: Xalarm) -> None: + if not self.callback or not alarm_info: + return + if not self.id_is_registered(alarm_info.alarm_id): + return + self.callback(alarm_info) + + def create_unix_socket(self) -> socket.socket: + try: + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock.setblocking(False) + + if not os.access(DIR_XALARM, os.F_OK): + os.makedirs(DIR_XALARM) + os.chmod(DIR_XALARM, ALARM_DIR_PERMISSION) + + sock.connect(PATH_REG_ALARM) + return sock + except (IOError, OSError, FileNotFoundError) as e: + sock.close() + sys.stderr.write(f"create_unix_socket: create socket error:{e}\n") + return None + + def alarm_recv(self): + while not self.thread_should_stop: + try: + data = self.socket.recv(ALARM_REPORT_LEN) + if not data: + sys.stderr.write("connection closed by xalarmd, maybe connections reach max num or service stopped.\n") + self.thread_should_stop = True + break + if len(data) != ALARM_REPORT_LEN: + sys.stderr.write(f"server receive report msg length wrong {len(data)}\n") + continue + + alarm_info = alarm_bin2stu(data) + self.put_alarm_info(alarm_info) + except (BlockingIOError) as e: + time.sleep(0.1) + except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError): + sys.stderr.write("Connection closed by the server.\n") + self.thread_should_stop = True + except (ValueError, StructParseError, InterruptedError) as e: + sys.stderr.write(f"{e}\n") + except Exception as e: + sys.stderr.write(f"{e}\n") + self.thread_should_stop = True + + def start_thread(self) -> None: + self.thread.daemon = True + self.thread.start() + + def stop_thread(self) -> None: + self.thread_should_stop = True + self.thread.join() + self.socket.close() + + +def xalarm_register(callback: callable, id_filter: list[bool]) -> int: + global ALARM_REGISTER_INFO + + if ALARM_REGISTER_INFO is not None: + sys.stderr.write("xalarm_register: alarm has registered\n") + return -1 + + ALARM_REGISTER_INFO = AlarmRegister(id_filter, callback) + if not ALARM_REGISTER_INFO.check_params(): + return -1 + + ALARM_REGISTER_INFO.start_thread() + + return 0 + + +def xalarm_unregister(clientId: int) -> None: + global ALARM_REGISTER_INFO + if clientId < 0: + sys.stderr.write("xalarm_unregister: invalid client\n") + return + + if ALARM_REGISTER_INFO is None: + sys.stderr.write("xalarm_unregister: alarm has not registered\n") + return + + ALARM_REGISTER_INFO.stop_thread() + ALARM_REGISTER_INFO = None + + +def xalarm_upgrade(clientId: int, id_filter: list[bool]) -> None: + global ALARM_REGISTER_INFO + if clientId < 0: + sys.stderr.write("xalarm_unregister: invalid client\n") + return + if ALARM_REGISTER_INFO is None: + sys.stderr.write("xalarm_unregister: alarm has not registered\n") + return + ALARM_REGISTER_INFO.id_filter = id_filter + + +def xalarm_getid(alarm_info: 
Xalarm) -> int: + if not alarm_info: + return 0 + return alarm_info.alarm_id + + +def xalarm_getlevel(alarm_info: Xalarm) -> int: + if not alarm_info: + return 0 + return alarm_info.alarm_level + + +def xalarm_gettype(alarm_info: Xalarm) -> int: + if not alarm_info: + return 0 + return alarm_info.alarm_type + + +def xalarm_gettime(alarm_info: Xalarm) -> int: + if not alarm_info: + return 0 + return alarm_info.timetamp.tv_sec * TIME_UNIT_MILLISECONDS + alarm_info.timetamp.tv_usec / TIME_UNIT_MILLISECONDS + +def xalarm_getdesc(alarm_info: Xalarm) -> str: + if not alarm_info: + return None + try: + desc_str = alarm_info.msg1.rstrip(b'\x00').decode('utf-8') + except UnicodeError: + desc_str = None + return desc_str diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py new file mode 100644 index 0000000..a19e5b3 --- /dev/null +++ b/src/python/xalarm/sentry_notify.py @@ -0,0 +1,71 @@ +import os +import sys +import time +import socket +from struct import error as StructParseError + +from .xalarm_api import alarm_stu2bin, Xalarm + +MAX_NUM_OF_ALARM_ID = 128 +MIN_ALARM_ID = 1001 +MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) + +MINOR_ALM = 1 +MAJOR_ALM = 2 +CRITICAL_ALM = 3 + +ALARM_TYPE_OCCUR = 1 +ALARM_TYPE_RECOVER = 2 + +MAX_PUC_PARAS_LEN = 1024 + +DIR_XALARM = "/var/run/xalarm" +PATH_REPORT_ALARM = "/var/run/xalarm/report" +ALARM_DIR_PERMISSION = 0o750 +ALARM_SOCKET_PERMISSION = 0o700 + + +def check_params(alarm_id, alarm_level, alarm_type, puc_paras) -> bool: + if not os.path.exists(DIR_XALARM): + sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed") + return False + + if not os.path.exists(PATH_REPORT_ALARM): + sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed") + return False + + if (alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID or + alarm_level < MINOR_ALM or alarm_level > CRITICAL_ALM or + alarm_type < ALARM_TYPE_OCCUR or alarm_type > ALARM_TYPE_RECOVER): + sys.stderr.write("check_params: alarm info invalid\n") + return False + + if len(puc_paras) >= MAX_PUC_PARAS_LEN: + sys.stderr.write(f"check_params: alarm msg should be less than {MAX_PUC_PARAS_LEN}\n") + return False + + return True + +def xalarm_report(alarm_id, alarm_level, alarm_type, puc_paras) -> bool: + if not check_params(alarm_id, alarm_level, alarm_type, puc_paras): + return False + + try: + sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) + + current_time = time.time() + current_time_seconds = int(current_time) + current_microseconds = int((current_time - current_time_seconds) * 1_000_000) + alarm_info = Xalarm(alarm_id, alarm_type, alarm_level, + current_time_seconds, current_microseconds, puc_paras) + + sock.sendto(alarm_stu2bin(alarm_info), PATH_REPORT_ALARM) + except (FileNotFoundError, StructParseError, socket.error, OSError, UnicodeError) as e: + sys.stderr.write(f"check_params: error occurs when sending msg.{e}\n") + return False + finally: + sock.close() + + return True + + diff --git a/src/python/xalarm/xalarm_api.py b/src/python/xalarm/xalarm_api.py index 94d7638..99eabf5 100644 --- a/src/python/xalarm/xalarm_api.py +++ b/src/python/xalarm/xalarm_api.py @@ -23,6 +23,7 @@ ALARM_LEVELS = (1, 2, 3, 4, 5) ALARM_SOCK_PATH = "/var/run/xalarm/report" MIN_ALARM_ID = 1001 MAX_ALARM_ID = 1128 +MAX_MSG_LEN = 1024 @dataclasses.dataclass @@ -97,15 +98,15 @@ class Xalarm: def msg1(self, msg): """msg1 setter """ - if len(msg) > 512: - raise ValueError("msg1 length must below 255") + if len(msg) > MAX_MSG_LEN: + raise ValueError(f"msg1 length must 
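
The producer side added in sentry_notify.py is used the same way by the later plugin patches; a minimal sketch (hypothetical alarm id and payload; assumes xalarmd has created /var/run/xalarm/report):

    import json
    from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR

    ok = xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR,
                       json.dumps({"alarm_source": "demo"}))  # returns True on success
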
below {MAX_MSG_LEN}") self._msg1 = msg def alarm_bin2stu(bin_data): """alarm binary to struct """ - struct_data = struct.unpack("@HBBll512s", bin_data) + struct_data = struct.unpack(f"@HBBll{MAX_MSG_LEN}s", bin_data) alarm_info = Xalarm(1001, 2, 1, 0, 0, "") alarm_info.alarm_id = struct_data[0] @@ -116,3 +117,14 @@ def alarm_bin2stu(bin_data): alarm_info.msg1 = struct_data[5] return alarm_info + + +def alarm_stu2bin(alarm_info: Xalarm): + return struct.pack( + f'@HBBll{MAX_MSG_LEN}s', + alarm_info.alarm_id, + alarm_info.alarm_level, + alarm_info.alarm_type, + alarm_info.timetamp.tv_sec, + alarm_info.timetamp.tv_usec, + alarm_info.msg1.encode('utf-8')) diff --git a/src/python/xalarm/xalarm_server.py b/src/python/xalarm/xalarm_server.py index 84db273..fcaf393 100644 --- a/src/python/xalarm/xalarm_server.py +++ b/src/python/xalarm/xalarm_server.py @@ -17,16 +17,20 @@ Create: 2023-11-02 import socket import os import logging +import select +import threading from struct import error as StructParseError from .xalarm_api import alarm_bin2stu -from .xalarm_transfer import check_filter, transmit_alarm +from .xalarm_transfer import check_filter, transmit_alarm, wait_for_connection ALARM_DIR = "/var/run/xalarm" +USER_RECV_SOCK = "/var/run/xalarm/alarm" SOCK_FILE = "/var/run/xalarm/report" -ALARM_REPORT_LEN = 536 +ALARM_REPORT_LEN = 1048 ALARM_DIR_PERMISSION = 0o750 +ALARM_LISTEN_QUEUE_LEN = 5 def clear_sock_path(): @@ -37,6 +41,8 @@ def clear_sock_path(): os.chmod(ALARM_DIR, ALARM_DIR_PERMISSION) if os.path.exists(SOCK_FILE): os.unlink(SOCK_FILE) + if os.path.exists(USER_RECV_SOCK): + os.unlink(USER_RECV_SOCK) def server_loop(alarm_config): @@ -49,6 +55,21 @@ def server_loop(alarm_config): sock.bind(SOCK_FILE) os.chmod(SOCK_FILE, 0o600) + alarm_sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + alarm_sock.bind(USER_RECV_SOCK) + os.chmod(USER_RECV_SOCK, 0o600) + alarm_sock.listen(ALARM_LISTEN_QUEUE_LEN) + alarm_sock.setblocking(False) + + epoll = select.epoll() + epoll.register(alarm_sock.fileno(), select.EPOLLIN) + fd_to_socket = {alarm_sock.fileno(): alarm_sock,} + thread_should_stop = False + + thread = threading.Thread(target=wait_for_connection, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop)) + thread.daemon = True + thread.start() + while True: try: data, _ = sock.recvfrom(ALARM_REPORT_LEN) @@ -58,14 +79,21 @@ def server_loop(alarm_config): logging.debug("server receive report msg length wrong %d", len(data)) continue - alarm_info = alarm_bin2stu(data) logging.debug("server bin2stu msg") if not check_filter(alarm_info, alarm_config): continue + transmit_alarm(alarm_sock, epoll, fd_to_socket, data) + except Exception as e: + logging.error(f"Error server:{e}") + + thread_should_stop = True + thread.join() - transmit_alarm(data) - except (ValueError, StructParseError): - pass + epoll.unregister(alarm_sock.fileno()) + epoll.close() + alarm_sock.close() + os.unlink(USER_RECV_SOCK) sock.close() + diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py index b590b43..42137d8 100644 --- a/src/python/xalarm/xalarm_transfer.py +++ b/src/python/xalarm/xalarm_transfer.py @@ -16,10 +16,12 @@ Create: 2023-11-02 import socket import logging +import select -USER_RECV_SOCK = "/var/run/xalarm/alarm" MIN_ID_NUMBER = 1001 MAX_ID_NUMBER = 1128 +MAX_CONNECTION_NUM = 100 +TEST_CONNECT_BUFFER_SIZE = 32 def check_filter(alarm_info, alarm_filter): @@ -35,16 +37,84 @@ def check_filter(alarm_info, alarm_filter): return True -def transmit_alarm(bin_data): - """forward 
alarm message +def cleanup_closed_connections(server_sock, epoll, fd_to_socket): """ - sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) - try: - sock.sendto(bin_data, USER_RECV_SOCK) - logging.debug("transfer alarm success") - except ConnectionRefusedError: - logging.debug("transfer sendto failed") - except FileNotFoundError: - logging.debug("transfer sendto failed") - finally: - sock.close() + clean invalid client socket connections saved in 'fd_to_socket' + :param server_sock: server socket instance of alarm + :param epoll: epoll instance, used to unregister invalid client connections + :param fd_to_socket: dict instance, used to hold client connections and server connections + """ + to_remove = [] + for fileno, connection in fd_to_socket.items(): + if connection is server_sock: + continue + try: + # test whether connection still alive, use MSG_DONTWAIT to avoid blocking thread + # use MSG_PEEK to avoid consuming buffer data + data = connection.recv(TEST_CONNECT_BUFFER_SIZE, socket.MSG_DONTWAIT | socket.MSG_PEEK) + if not data: + to_remove.append(fileno) + except BlockingIOError: + pass + except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError): + to_remove.append(fileno) + + for fileno in to_remove: + epoll.unregister(fileno) + fd_to_socket[fileno].close() + del fd_to_socket[fileno] + logging.info(f"cleaned up connection {fileno} for client lost connection.") + + +def wait_for_connection(server_sock, epoll, fd_to_socket, thread_should_stop): + """ + thread function for catch and save client connection + :param server_sock: server socket instance of alarm + :param epoll: epoll instance, used to unregister invalid client connections + :param fd_to_socket: dict instance, used to hold client connections and server connections + :param thread_should_stop: bool instance + """ + while not thread_should_stop: + try: + events = epoll.poll(1) + + for fileno, event in events: + if fileno == server_sock.fileno(): + connection, client_address = server_sock.accept() + # if reach max connection, cleanup closed connections + if len(fd_to_socket) - 1 >= MAX_CONNECTION_NUM: + cleanup_closed_connections(server_sock, epoll, fd_to_socket) + # if connections still reach max num, close this connection automatically + if len(fd_to_socket) - 1 >= MAX_CONNECTION_NUM: + logging.info(f"connection reach max num of {MAX_CONNECTION_NUM}, closed current connection!") + connection.close() + continue + epoll.register(connection.fileno(), select.EPOLLOUT) + fd_to_socket[connection.fileno()] = connection + except socket.error as e: + logging.debug(f"socket error, reason is {e}") + break + except (KeyError, OSError, ValueError) as e: + logging.debug(f"wait for connection failed {e}") + + +def transmit_alarm(server_sock, epoll, fd_to_socket, bin_data): + """ + this function is to broadcast alarm data to client, if fail to send data, remove connections held by fd_to_socket + :param server_sock: server socket instance of alarm + :param epoll: epoll instance, used to unregister invalid client connections + :param fd_to_socket: dict instance, used to hold client connections and server connections + :param bin_data: binary instance, alarm info data in C-style struct format defined in xalarm_api.py + """ + to_remove = [] + for fileno, connection in fd_to_socket.items(): + if connection is not server_sock: + try: + connection.sendall(bin_data) + except (BrokenPipeError, ConnectionResetError): + to_remove.append(fileno) + for fileno in to_remove: + epoll.unregister(fileno) + fd_to_socket[fileno].close() + del 
fd_to_socket[fileno] + -- Gitee From d7a1eb6b0dcadbccc8162133f77448153988f7a5 Mon Sep 17 00:00:00 2001 From: jinsaihang Date: Sun, 29 Sep 2024 02:04:52 +0000 Subject: [PATCH 22/76] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=91=8A=E8=AD=A6?= =?UTF-8?q?=E4=BA=8B=E4=BB=B6=E6=9F=A5=E8=AF=A2=E5=8A=9F=E8=83=BD=EF=BC=9A?= =?UTF-8?q?sentryctl=20get=5Falarm=20=20-s=20?= =?UTF-8?q?=20-d?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: jinsaihang --- src/python/syssentry/alarm.py | 142 ++++++++++++++++++++++++++ src/python/syssentry/callbacks.py | 17 +++ src/python/syssentry/global_values.py | 4 + src/python/syssentry/load_mods.py | 16 +++ src/python/syssentry/sentryctl | 20 +++- src/python/syssentry/syssentry.py | 13 ++- src/python/syssentry/task_map.py | 5 +- 7 files changed, 212 insertions(+), 5 deletions(-) create mode 100644 src/python/syssentry/alarm.py diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py new file mode 100644 index 0000000..3b80769 --- /dev/null +++ b/src/python/syssentry/alarm.py @@ -0,0 +1,142 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +use for report alarm +""" +import threading +from typing import Dict, List +from datetime import datetime +import time +import logging +import json + +from xalarm.register_xalarm import xalarm_register,xalarm_getid,xalarm_getlevel,xalarm_gettype,xalarm_gettime,xalarm_getdesc +from xalarm.xalarm_api import Xalarm + +from .global_values import InspectTask +from .task_map import TasksMap + +# 告警ID映射字典,key为插件名,value为告警ID(类型为数字) +task_alarm_id_dict: Dict[str, int] = {} + +# 告警老化时间字典,key为告警ID,value为老化时间(类型为数字,单位为秒) +alarm_id_clear_time_dict: Dict[int, int] = {} + +# 告警事件列表,key为告警ID,value为告警ID对应的告警事件列表(类型为list) +alarm_list_dict: Dict[int, List[Xalarm]] = {} +# 告警事件列表锁 +alarm_list_lock = threading.Lock() + +id_filter = [] +id_base = 1001 +clientId = -1 + +MILLISECONDS_UNIT_SECONDS = 1000 + +def update_alarm_list(alarm_info: Xalarm): + alarm_id = xalarm_getid(alarm_info) + timestamp = xalarm_gettime(alarm_info) + if not timestamp: + logging.error("Retrieve timestamp failed") + return + alarm_list_lock.acquire() + try: + # new alarm is inserted into list head + if alarm_id not in alarm_list_dict: + logging.warning(f"update_alarm_list: alarm_id {alarm_id} not found in alarm_list_dict") + return + alarm_list = alarm_list_dict[alarm_id] + + alarm_list.insert(0, alarm_info) + # clear alarm_info older than clear time threshold + clear_index = -1 + clear_time = alarm_id_clear_time_dict[alarm_id] + for i in range(len(alarm_list)): + if (timestamp - xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > clear_time: + clear_index = i + break + if clear_index >= 0: + alarm_list_dict[alarm_id] = alarm_list[:clear_index] + finally: + alarm_list_lock.release() + +def alarm_register(): + logging.debug(f"alarm_register: enter") + # 初始化告警ID映射字典、告警老化时间字典 + for task_type in TasksMap.tasks_dict: + for task_name in TasksMap.tasks_dict[task_type]: + logging.info(f"alarm_register: {task_name} is 
registered") + task = TasksMap.tasks_dict[task_type][task_name] + alarm_id = task.alarm_id + alarm_clear_time = task.alarm_clear_time + alarm_list_dict[alarm_id] = [] + task_alarm_id_dict[task_name] = alarm_id + if alarm_id not in alarm_id_clear_time_dict: + alarm_id_clear_time_dict[alarm_id] = alarm_clear_time + else: + alarm_id_clear_time_dict[alarm_id] = max(alarm_clear_time, alarm_id_clear_time_dict[alarm_id]) + # 注册告警回调 + id_filter = [True] * 128 + clientId = xalarm_register(update_alarm_list, id_filter) + if clientId < 0: + logging.info(f'register xalarm: failed') + return clientId + logging.info('register xalarm: success') + return clientId + +def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Dict]: + alarm_list_lock.acquire() + try: + if task_name not in task_alarm_id_dict: + logging.debug("task_name does not exist") + return [] + alarm_id = task_alarm_id_dict[task_name] + if alarm_id not in alarm_list_dict: + logging.debug("alarm_id does not exist") + return [] + alarm_list = alarm_list_dict[alarm_id] + logging.debug(f"get_alarm_result: alarm_list of {alarm_id} has {len(alarm_list)} elements") + # clear alarm_info older than clear time threshold + stop_index = -1 + timestamp = int(datetime.now().timestamp()) + for i in range(len(alarm_list)): + logging.debug(f"timestamp, alarm_list[{i}].timestamp: {timestamp}, {xalarm_gettime(alarm_list[i])}") + if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > int(time_range): + stop_index = i + break + if stop_index >= 0: + alarm_list = alarm_list[:stop_index] + logging.debug(f"get_alarm_result: final alarm_list of {alarm_id} has {len(alarm_list)} elements") + + def xalarm_to_dict(alarm_info: Xalarm) -> dict: + return { + 'alarm_id': xalarm_getid(alarm_info), + 'alarm_type': xalarm_gettype(alarm_info), + 'alarm_level': xalarm_getlevel(alarm_info), + 'timetamp': xalarm_gettime(alarm_info), + 'msg1': xalarm_getdesc(alarm_info) + } + + alarm_list = [xalarm_to_dict(alarm) for alarm in alarm_list] + + # keep detail + for alarm in alarm_list: + alarm_info = alarm['msg1'] + alarm_info = json.loads(alarm_info) + if not detailed: + if 'details' in alarm_info: + alarm_info.pop('details', None) + alarm.pop('msg1', None) + alarm['alarm_info'] = alarm_info + return alarm_list + finally: + alarm_list_lock.release() diff --git a/src/python/syssentry/callbacks.py b/src/python/syssentry/callbacks.py index d0d0719..d2dff24 100644 --- a/src/python/syssentry/callbacks.py +++ b/src/python/syssentry/callbacks.py @@ -18,6 +18,7 @@ import logging from .task_map import TasksMap, ONESHOT_TYPE, PERIOD_TYPE from .mod_status import EXITED_STATUS, RUNNING_STATUS, WAITING_STATUS, set_runtime_status +from .alarm import get_alarm_result def task_get_status(mod_name): @@ -41,6 +42,22 @@ def task_get_result(mod_name): return "success", task.get_result() +def task_get_alarm(data): + """get alarm by mod name""" + task_name = data['task_name'] + time_range = data['time_range'] + try: + detailed = data['detailed'] + except KeyError: + logging.debug("Key 'detailed' does not exist in the dictionary") + detailed = None + task = TasksMap.get_task_by_name(task_name) + if not task: + return "failed", f"cannot find task by name {task_name}" + if not task.load_enabled: + return "failed", f"mod {task_name} is not enabled" + + return "success", get_alarm_result(task_name, time_range, detailed) def task_stop(mod_name): """stop by mod name""" diff --git a/src/python/syssentry/global_values.py b/src/python/syssentry/global_values.py index 
483d544..b123b2d 100644 --- a/src/python/syssentry/global_values.py +++ b/src/python/syssentry/global_values.py @@ -27,6 +27,7 @@ CTL_SOCKET_PATH = "/var/run/sysSentry/control.sock" SYSSENTRY_CONF_PATH = "/etc/sysSentry" INSPECT_CONF_PATH = "/etc/sysSentry/inspect.conf" TASK_LOG_DIR = "/var/log/sysSentry" +DEFAULT_ALARM_CLEAR_TIME = 15 SENTRY_RUN_DIR_PERM = 0o750 @@ -76,6 +77,9 @@ class InspectTask: self.env_file = "" # start mode self.conflict = "up" + # alarm id + self.alarm_id = -1 + self.alarm_clear_time = DEFAULT_ALARM_CLEAR_TIME def start(self): """ diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py index 48d7e66..ae05e57 100644 --- a/src/python/syssentry/load_mods.py +++ b/src/python/syssentry/load_mods.py @@ -24,6 +24,7 @@ from .task_map import TasksMap, ONESHOT_TYPE, PERIOD_TYPE from .cron_process import PeriodTask from .mod_status import set_task_status +from xalarm.register_xalarm import MIN_ALARM_ID, MAX_ALARM_ID ONESHOT_CONF = 'oneshot' PERIOD_CONF = 'period' @@ -41,6 +42,8 @@ CONF_TASK_RESTART = 'task_restart' CONF_ONSTART = 'onstart' CONF_ENV_FILE = 'env_file' CONF_CONFLICT = 'conflict' +CONF_ALARM_ID = 'alarm_id' +CONF_ALARM_CLEAR_TIME = 'alarm_clear_time' MOD_FILE_SUFFIX = '.mod' MOD_SUFFIX_LEN = 4 @@ -194,6 +197,18 @@ def parse_mod_conf(mod_name, mod_conf): task.heartbeat_interval = heartbeat_interval task.load_enabled = is_enabled + try: + task.alarm_id = int(mod_conf.get(CONF_TASK, CONF_ALARM_ID)) + task.alarm_clear_time = int(mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME)) + if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID): + raise ValueError("Invalid alarm_id") + except ValueError: + task.alarm_id = -1 + logging.warning("Invalid alarm_id, set to -1") + except configparser.NoOptionError: + task.alarm_id = -1 + logging.warning("Unset alarm_id and alarm_clear_time, use -1 and 15s as default") + if CONF_ONSTART in mod_conf.options(CONF_TASK): is_onstart = (mod_conf.get(CONF_TASK, CONF_ONSTART) == 'yes') if task_type == PERIOD_CONF: @@ -327,3 +342,4 @@ def reload_single_mod(mod_name): res, ret = reload_mod_by_name(mod_name) return res, ret + diff --git a/src/python/syssentry/sentryctl b/src/python/syssentry/sentryctl index e94491f..675c17a 100644 --- a/src/python/syssentry/sentryctl +++ b/src/python/syssentry/sentryctl @@ -25,6 +25,7 @@ MAX_PARAM_LENGTH = 256 RESULT_MSG_DATA_LEN = 4 CTL_MSG_LEN_LEN = 3 +DEFAULT_ALARM_TIME_RANGE = 10 def status_output_format(res_data): """format output""" @@ -57,6 +58,8 @@ def res_output_handle(res_struct, req_type): status_output_format(res_struct['data']) elif req_type == 'get_result': result_output_format(res_struct['data']) + elif req_type == 'get_alarm': + result_output_format(res_struct['data']) elif res_struct['ret'] == "failed": print(res_struct['data']) @@ -75,6 +78,7 @@ def client_send_and_recv(request_data, data_str_len): print("sentryctl: client creat socket error") return None + # connect to syssentry try: client_socket.connect(CTL_SOCKET_PATH) except OSError: @@ -82,6 +86,7 @@ def client_send_and_recv(request_data, data_str_len): print("sentryctl: client connect error") return None + # msg: CTL{len}{data} req_data_len = len(request_data) request_msg = "CTL" + str(req_data_len).zfill(3) + request_data @@ -94,8 +99,8 @@ def client_send_and_recv(request_data, data_str_len): print("sentryctl: client communicate error") return None + # res: RES{len}{data} res_magic = res_data[:3] - if res_magic != "RES": print("res msg format error") return None @@ -128,6 +133,10 @@ if __name__ == '__main__': 
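
The framing spelled out by the new comments is a fixed ASCII envelope: a 3-byte magic, a 3-digit zero-padded length, then the JSON payload. As a worked example (hypothetical task name), the 36-byte request {"type": "get_status", "data": "xx"} goes out as:

    request_msg = "CTL" + str(len(request_data)).zfill(3) + request_data   # -> CTL036{"type": ...}

and the reply comes back as RES{len}{json}, which is why the client slices res_data[:3] before parsing.
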
parser_status.add_argument('task_name') parser_get_result = subparsers.add_parser('get_result', help='get task result') parser_get_result.add_argument('task_name') + parser_get_alarm = subparsers.add_parser('get_alarm', help='get task alarm') + parser_get_alarm.add_argument('task_name') + parser_get_alarm.add_argument('-s', '--time_range', type=str, default=DEFAULT_ALARM_TIME_RANGE, help='Specified time range') + parser_get_alarm.add_argument('-d', '--detailed', action='store_true', help='Print Detailed Information') parser_list = subparsers.add_parser('list', help='show all loaded task mod') client_args = parser.parse_args() @@ -142,6 +151,15 @@ if __name__ == '__main__': req_msg_struct = {"type": "get_status", "data": client_args.task_name} elif client_args.cmd_type == 'get_result': req_msg_struct = {"type": "get_result", "data": client_args.task_name} + elif client_args.cmd_type == 'get_alarm': + req_msg_struct = { + "type": "get_alarm", + "data": { + 'task_name': client_args.task_name, + 'time_range': client_args.time_range, + 'detailed': client_args.detailed, + } + } elif client_args.cmd_type == 'reload': req_msg_struct = {"type": "reload", "data": client_args.task_name} else: diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py index 9ef0203..c2dee85 100644 --- a/src/python/syssentry/syssentry.py +++ b/src/python/syssentry/syssentry.py @@ -28,7 +28,7 @@ from .sentry_config import SentryConfig, get_log_level from .task_map import TasksMap from .global_values import SENTRY_RUN_DIR, CTL_SOCKET_PATH, SENTRY_RUN_DIR_PERM from .cron_process import period_tasks_handle -from .callbacks import mod_list_show, task_start, task_get_status, task_stop, task_get_result +from .callbacks import mod_list_show, task_start, task_get_status, task_stop, task_get_result, task_get_alarm from .mod_status import get_task_by_pid, set_runtime_status from .load_mods import load_tasks, reload_single_mod from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create, @@ -36,7 +36,11 @@ from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create, from .result import RESULT_MSG_HEAD_LEN, RESULT_MSG_MAGIC_LEN, RESULT_MAGIC from .result import RESULT_LEVEL_ERR_MSG_DICT, ResultLevel from .utils import get_current_time_string +from .alarm import alarm_register +from xalarm.register_xalarm import xalarm_unregister + +clientId = -1 CPU_EXIST = True try: @@ -62,6 +66,7 @@ type_func = { 'stop': task_stop, 'get_status': task_get_status, 'get_result': task_get_result, + 'get_alarm': task_get_alarm, 'reload': reload_single_mod } @@ -107,11 +112,12 @@ def msg_data_process(msg_data): return "Invaild cmd type" cmd_param = data_struct['data'] - logging.debug("msg_data_process cmd_type:%s cmd_param:%s", cmd_type, cmd_param) + logging.debug("msg_data_process cmd_type:%s cmd_param:%s", cmd_type, str(cmd_param)) if cmd_type in type_func: ret, res_data = type_func[cmd_type](cmd_param) else: ret, res_data = type_func_void[cmd_type]() + logging.debug("msg_data_process res_data:%s",str(res_data)) res_msg_struct = {"ret": ret, "data": res_data} res_msg = json.dumps(res_msg_struct) @@ -584,10 +590,13 @@ def main(): _ = SentryConfig.init_param() TasksMap.init_task_map() load_tasks() + clientId = alarm_register() main_loop() except Exception: logging.error('%s', traceback.format_exc()) finally: + if clientId != -1: + xalarm_unregister(clientId) release_pidfile() diff --git a/src/python/syssentry/task_map.py b/src/python/syssentry/task_map.py index 70aa19d..27e97ff 100644 --- 
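
End to end, the new subcommand (hypothetical task name) looks like:

    sentryctl get_alarm my_task -s 30 -d

which sends {"type": "get_alarm", "data": {"task_name": "my_task", "time_range": "30", "detailed": true}}. Note that time_range travels as a string (argparse type=str), so the server side casts it with int(time_range) inside get_alarm_result.
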
a/src/python/syssentry/task_map.py +++ b/src/python/syssentry/task_map.py @@ -13,16 +13,16 @@ tasks map class and initialize function. """ import logging +from typing import Dict ONESHOT_TYPE = "ONESHOT" PERIOD_TYPE = "PERIOD" TASKS_MAP = None - class TasksMap: """task map class""" - tasks_dict = {} + tasks_dict: Dict[str, Dict] = {} @classmethod def init_task_map(cls): @@ -65,3 +65,4 @@ class TasksMap: logging.debug("getting task by name: %s", res) break return res + -- Gitee From a50c27908e57a7e02d81063b8c0f321b556b3401 Mon Sep 17 00:00:00 2001 From: PshySimon Date: Wed, 9 Oct 2024 10:20:34 +0800 Subject: [PATCH 23/76] fix python 3.7 not support list[bool] type --- src/python/xalarm/register_xalarm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py index e58343d..6756b1b 100644 --- a/src/python/xalarm/register_xalarm.py +++ b/src/python/xalarm/register_xalarm.py @@ -26,7 +26,7 @@ ALARM_REGISTER_INFO = None class AlarmRegister: - def __init__(self, id_filter: list[bool], callback: callable): + def __init__(self, id_filter: list, callback: callable): self.id_filter = id_filter self.callback = callback self.socket = self.create_unix_socket() @@ -49,7 +49,7 @@ class AlarmRegister: return False return True - def set_id_filter(self, id_filter: list[bool]) -> bool: + def set_id_filter(self, id_filter: list) -> bool: if (len(id_filter) > MAX_NUM_OF_ALARM_ID): sys.stderr.write("set_id_filter: invalid param id_filter\n") return False @@ -118,7 +118,7 @@ class AlarmRegister: self.socket.close() -def xalarm_register(callback: callable, id_filter: list[bool]) -> int: +def xalarm_register(callback: callable, id_filter: list) -> int: global ALARM_REGISTER_INFO if ALARM_REGISTER_INFO is not None: @@ -148,7 +148,7 @@ def xalarm_unregister(clientId: int) -> None: ALARM_REGISTER_INFO = None -def xalarm_upgrade(clientId: int, id_filter: list[bool]) -> None: +def xalarm_upgrade(clientId: int, id_filter: list) -> None: global ALARM_REGISTER_INFO if clientId < 0: sys.stderr.write("xalarm_unregister: invalid client\n") -- Gitee From 1ae65f281375d6c0c9968810e2592e3438e07cbf Mon Sep 17 00:00:00 2001 From: gaoruoshu Date: Wed, 9 Oct 2024 14:22:38 +0800 Subject: [PATCH 24/76] avg_block_io send alarm to xalarmd --- config/tasks/avg_block_io.mod | 2 ++ .../sentryPlugins/avg_block_io/module_conn.py | 23 +++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod index b9b6f34..bcd063b 100644 --- a/config/tasks/avg_block_io.mod +++ b/config/tasks/avg_block_io.mod @@ -3,3 +3,5 @@ enabled=yes task_start=/usr/bin/python3 /usr/bin/avg_block_io task_stop=pkill -f /usr/bin/avg_block_io type=oneshot +alarm_id=1002 +alarm_clear_time=5 diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py index 0da4208..2fc5a83 100644 --- a/src/python/sentryPlugins/avg_block_io/module_conn.py +++ b/src/python/sentryPlugins/avg_block_io/module_conn.py @@ -16,6 +16,7 @@ import time from .utils import is_abnormal from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages from syssentry.result import ResultLevel, report_result +from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR TASK_NAME = "avg_block_io" @@ -68,19 +69,33 @@ def process_report_data(disk_name, rw, io_data): if not is_abnormal((disk_name, 'bio', rw), io_data): return + msg = 
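
Background on the PATCH 23 fix just above: subscripted builtins such as list[bool] only work as annotations on Python 3.9+ (PEP 585); on 3.7 the annotation is evaluated at definition time and raises TypeError. Bare list is the smallest change; a typed, 3.7-compatible alternative would be a sketch like:

    from typing import List

    def xalarm_register(callback: callable, id_filter: List[bool]) -> int:
        ...
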
{"alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw} + ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq'] for stage_name in ctrl_stage: if is_abnormal((disk_name, stage_name, rw), io_data): - logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw)) + msg["reason"] = "IO press slow" + msg["block_stack"] = f"bio,{stage_name}" + logging.warning("{} - {} report IO press slow".format(disk_name, rw)) + xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) return if is_abnormal((disk_name, 'rq_driver', rw), io_data): - logging.warning("{} - {} - {} report driver".format(time.ctime(), disk_name, rw)) + msg["reason"] = "driver slow" + msg["block_stack"] = "bio,rq_driver" + logging.warning("{} - {} report driver slow".format(disk_name, rw)) + xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) return kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue'] for stage_name in kernel_stage: if is_abnormal((disk_name, stage_name, rw), io_data): - logging.warning("{} - {} - {} report kernel".format(time.ctime(), disk_name, rw)) + msg["reason"] = "kernel slow" + msg["block_stack"] = f"bio,{stage_name}" + logging.warning("{} - {} report kernel slow".format(disk_name, rw)) + xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) return - logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw)) + msg["reason"] = "unknown" + msg["block_stack"] = "bio" + logging.warning("{} - {} report UNKNOWN slow".format(disk_name, rw)) + xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) -- Gitee From 92842a334e2c96fcf654bbd4b7a03e0d653b3bd9 Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Wed, 9 Oct 2024 16:37:24 +0800 Subject: [PATCH 25/76] update log when it is not lock collect --- src/python/sentryCollector/collect_io.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py index e45947a..997fbe8 100644 --- a/src/python/sentryCollector/collect_io.py +++ b/src/python/sentryCollector/collect_io.py @@ -179,13 +179,17 @@ class CollectIo(): blk_io_hierarchy_path = os.path.join(disk_path, 'blk_io_hierarchy') if not os.path.exists(blk_io_hierarchy_path): - logging.error("no blk_io_hierarchy directory found in %s, skipping.", disk_name) + logging.warning("no blk_io_hierarchy directory found in %s, skipping.", disk_name) continue for file_name in os.listdir(blk_io_hierarchy_path): file_path = os.path.join(blk_io_hierarchy_path, file_name) if file_name == 'stats': all_disk.append(disk_name) + + if len(all_disk) == 0: + logging.debug("no blk_io_hierarchy disk, it is not lock-free collection") + return False if self.loop_all: self.disk_list = all_disk -- Gitee From b9064aecf74284442b6c25e113900692c35b4cc6 Mon Sep 17 00:00:00 2001 From: zhangnan Date: Wed, 9 Oct 2024 16:46:24 +0800 Subject: [PATCH 26/76] ebpf log update --- src/python/sentryCollector/collect_io.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py index 997fbe8..9d61f60 100644 --- a/src/python/sentryCollector/collect_io.py +++ b/src/python/sentryCollector/collect_io.py @@ -258,6 +258,8 @@ class CollectIo(): if EBPF_GLOBAL_DATA: for data in EBPF_GLOBAL_DATA: data_list = data.split() + if len(data_list) != 6: + continue stage, finish_count, latency, io_dump, io_type ,disk_name = data_list if disk_name not in self.window_value: continue @@ 
-345,7 +347,7 @@ class CollectIo(): finish = curr_finish_count - prev_finish_count if finish <= 0: return 0 - value = finish / self.period_time / 1000 / 1000 + value = finish / self.period_time if value.is_integer(): return int(value) else: @@ -359,7 +361,7 @@ class CollectIo(): lat_time = curr_latency - prev_latency if lat_time <= 0: return 0 - value = lat_time / self.period_time + value = lat_time / self.period_time / 1000 / 1000 if value.is_integer(): return int(value) else: @@ -427,6 +429,7 @@ class CollectIo(): sleep_time -= 1 time.sleep(sleep_time) elif self.is_ebpf_avaliable(): + logging.info("ebpf collector thread start") self.start_ebpf_subprocess() thread_get_data = threading.Thread(target=self.get_ebpf_raw_data) @@ -442,7 +445,6 @@ class CollectIo(): thread_append_data.join() self.stop_ebpf_subprocess() - logging.info("ebpf collector thread exits") else: logging.warning("fail to start ebpf collector thread. collect io thread exits") return -- Gitee From e7f7371541bdbf029f1a6a76b55bbbe0efd76a36 Mon Sep 17 00:00:00 2001 From: jinsaihang Date: Wed, 9 Oct 2024 11:55:35 +0800 Subject: [PATCH 27/76] change alarm length Signed-off-by: jinsaihang --- src/python/syssentry/sentryctl | 3 +++ src/python/syssentry/syssentry.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/python/syssentry/sentryctl b/src/python/syssentry/sentryctl index 675c17a..3de93d0 100644 --- a/src/python/syssentry/sentryctl +++ b/src/python/syssentry/sentryctl @@ -25,6 +25,7 @@ MAX_PARAM_LENGTH = 256 RESULT_MSG_DATA_LEN = 4 CTL_MSG_LEN_LEN = 3 +ALARM_MSG_DATA_LEN = 6 DEFAULT_ALARM_TIME_RANGE = 10 def status_output_format(res_data): @@ -173,6 +174,8 @@ if __name__ == '__main__': request_message = json.dumps(req_msg_struct) if client_args.cmd_type == 'get_result': result_message = client_send_and_recv(request_message, RESULT_MSG_DATA_LEN) + elif client_args.cmd_type == 'get_alarm': + result_message = client_send_and_recv(request_message, ALARM_MSG_DATA_LEN) else: result_message = client_send_and_recv(request_message, CTL_MSG_LEN_LEN) if not result_message: diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py index c2dee85..ea09095 100644 --- a/src/python/syssentry/syssentry.py +++ b/src/python/syssentry/syssentry.py @@ -56,6 +56,7 @@ CTL_MSG_MAGIC_LEN = 3 CTL_MSG_LEN_LEN = 3 CTL_MAGIC = "CTL" RES_MAGIC = "RES" +ALARM_MSG_DATA_LEN = 6 CTL_LISTEN_QUEUE_LEN = 5 SERVER_EPOLL_TIMEOUT = 0.3 @@ -256,6 +257,8 @@ def server_recv(server_socket: socket.socket): res_head = RES_MAGIC if cmd_type == "get_result": res_data_len = str(len(res_data)).zfill(RESULT_MSG_HEAD_LEN - RESULT_MSG_MAGIC_LEN) + elif cmd_type == "get_alarm": + res_data_len = str(len(res_data)).zfill(ALARM_MSG_DATA_LEN) else: res_data_len = str(len(res_data)).zfill(CTL_MSG_MAGIC_LEN) res_head += res_data_len -- Gitee From aff563fb902c82bddb33550a490eeed1a8c85411 Mon Sep 17 00:00:00 2001 From: jinsaihang Date: Wed, 9 Oct 2024 08:09:04 +0000 Subject: [PATCH 28/76] add detail time Signed-off-by: jinsaihang --- src/python/syssentry/alarm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py index 3b80769..7ba7e80 100644 --- a/src/python/syssentry/alarm.py +++ b/src/python/syssentry/alarm.py @@ -118,11 +118,13 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di logging.debug(f"get_alarm_result: final alarm_list of {alarm_id} has {len(alarm_list)} elements") def xalarm_to_dict(alarm_info: Xalarm) -> dict: + timestamp = 
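
The two hunks above swap a misplaced unit conversion: the completion count needs no scaling, while the accumulated latency delta does (nanosecond counters, assumed from the eBPF side). In effect:

    iops    = finish_delta / period_seconds                   # completions per second
    latency = latency_delta / period_seconds / 1000 / 1000    # ns accumulation scaled down
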
alarm_info.timetamp.tv_sec + alarm_info.timetamp.tv_usec / 1000000 + dt_object = datetime.fromtimestamp(int(timestamp)) return { 'alarm_id': xalarm_getid(alarm_info), 'alarm_type': xalarm_gettype(alarm_info), 'alarm_level': xalarm_getlevel(alarm_info), - 'timetamp': xalarm_gettime(alarm_info), + 'timestamp': dt_object.strftime("%Y-%m-%d %H:%M:%S"), 'msg1': xalarm_getdesc(alarm_info) } -- Gitee From c1dcbff263323e2c144846d413ddc88e47a38adb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> Date: Wed, 9 Oct 2024 16:19:52 +0800 Subject: [PATCH 29/76] fix config relative some issues --- .../sentryPlugins/ai_block_io/README.md | 1 - .../sentryPlugins/ai_block_io/ai_block_io.py | 21 +++++----- .../ai_block_io/config_parser.py | 42 +++++++++---------- .../sentryPlugins/ai_block_io/detector.py | 2 +- .../ai_block_io/sliding_window.py | 8 ++-- .../sentryPlugins/ai_block_io/threshold.py | 6 +-- 6 files changed, 39 insertions(+), 41 deletions(-) diff --git a/src/python/sentryPlugins/ai_block_io/README.md b/src/python/sentryPlugins/ai_block_io/README.md index f9b8388..95c1111 100644 --- a/src/python/sentryPlugins/ai_block_io/README.md +++ b/src/python/sentryPlugins/ai_block_io/README.md @@ -1,2 +1 @@ # slow_io_detection - diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py index 31b8a97..3b00ef3 100644 --- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py @@ -16,8 +16,7 @@ import logging from .detector import Detector from .threshold import ThresholdFactory, AbsoluteThreshold from .sliding_window import SlidingWindowFactory -from .utils import (get_threshold_type_enum, get_sliding_window_type_enum, get_data_queue_size_and_update_size, - get_log_level) +from .utils import get_data_queue_size_and_update_size from .config_parser import ConfigParser from .data_access import get_io_data_from_collect_plug, check_collect_valid from .io_data import MetricName @@ -45,25 +44,25 @@ class SlowIODetection: def __init_detector_name_list(self): self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency()) + logging.info(f"ai_block_io plug has found disks: {self._disk_list}") disks_to_detection: list = self._config_parser.get_disks_to_detection() # 情况1:None,则启用所有磁盘检测 # 情况2:is not None and len = 0,则不启动任何磁盘检测 # 情况3:len != 0,则取交集 if disks_to_detection is None: + logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.") for disk in self._disk_list: self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) elif len(disks_to_detection) == 0: - logging.warning('please attention: conf file not specify any disk to detection, ' - 'so it will not start ai block io.') + logging.warning('please attention: conf file not specify any disk to detection, so it will not start ai block io.') else: - disks_name_to_detection = [] - for disk_name_to_detection in disks_to_detection: - disks_name_to_detection.append(disk_name_to_detection.get_disk_name()) - disk_intersection = [disk for disk in self._disk_list if disk in disks_name_to_detection] - for disk in disk_intersection: - self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) - self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) + for disk_to_detection in disks_to_detection: + if disk_to_detection in 
self._disk_list: + self._detector_name_list.append(MetricName(disk_to_detection, "bio", "read", "latency")) + self._detector_name_list.append(MetricName(disk_to_detection, "bio", "write", "latency")) + else: + logging.warning(f"disk:[{disk_to_detection}] not in available disk list, so it will be ignored.") logging.info(f'start to detection follow disk and it\'s metric: {self._detector_name_list}') def __init_detector(self): diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py index 632391d..354c122 100644 --- a/src/python/sentryPlugins/ai_block_io/config_parser.py +++ b/src/python/sentryPlugins/ai_block_io/config_parser.py @@ -10,18 +10,19 @@ # See the Mulan PSL v2 for more details. import configparser -import json import logging -from .io_data import MetricName from .threshold import ThresholdType from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level + LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" def init_log_format(log_level: str): - logging.basicConfig(level=get_log_level(log_level), format=LOG_FORMAT) + logging.basicConfig(level=get_log_level(log_level.lower()), format=LOG_FORMAT) + if log_level.lower() not in ('info', 'warning', 'error', 'debug'): + logging.warning(f'the log_level: {log_level} you set is invalid, use default value: info.') class ConfigParser: @@ -43,7 +44,7 @@ class ConfigParser: self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL - self.__disks_to_detection: list = [] + self.__disks_to_detection = None self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION @@ -83,26 +84,20 @@ class ConfigParser: logging.warning(f'slow_io_detect_frequency type conversion has error, use default value: {self.__slow_io_detect_frequency}.') def __read__disks_to_detect(self, items_common: dict): - disks_to_detection = items_common.get('disks_to_detect') + disks_to_detection = items_common.get('disk') if disks_to_detection is None: - logging.warning(f'config of disks_to_detect not found, the default value be used.') + logging.warning(f'config of disk not found, the default value will be used.') self.__disks_to_detection = None return - try: - disks_to_detection_list = json.loads(disks_to_detection) - for disk_to_detection in disks_to_detection_list: - disk_name = disk_to_detection.get('disk_name', None) - stage_name = disk_to_detection.get('stage_name', None) - io_access_type_name = disk_to_detection.get('io_access_type_name', None) - metric_name = disk_to_detection.get('metric_name', None) - if not (disk_name is None or stage_name is None or io_access_type_name is None or metric_name is None): - metric_name_object = MetricName(disk_name, stage_name, io_access_type_name, metric_name) - self.__disks_to_detection.append(metric_name_object) - else: - logging.warning(f'config of disks_to_detect\'s some part has some error: {disk_to_detection}, it will be ignored.') - except json.decoder.JSONDecodeError as e: - logging.warning(f'config of disks_to_detect is error: {e}, it will be ignored and default value be used.') + disk_list = disks_to_detection.split(',') + if len(disk_list) == 0 or (len(disk_list) == 1 and disk_list[0] == ''): + logging.warning("you don't specify any disk.") + self.__disks_to_detection = [] + return + if 
len(disk_list) == 1 and disk_list[0] == 'default': self.__disks_to_detection = None + return + self.__disks_to_detection = disk_list def __read__train_data_duration(self, items_algorithm: dict): try: @@ -189,7 +184,12 @@ class ConfigParser: def read_config_from_file(self): con = configparser.ConfigParser() - con.read(self.__config_file_name, encoding='utf-8') + try: + con.read(self.__config_file_name, encoding='utf-8') + except configparser.Error as e: + init_log_format(self.__log_level) + logging.critical(f'config file read error: {e}, ai_block_io plug will exit.') + exit(1) if con.has_section('common'): items_common = dict(con.items('common')) diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py index bcf62cb..a48144f 100644 --- a/src/python/sentryPlugins/ai_block_io/detector.py +++ b/src/python/sentryPlugins/ai_block_io/detector.py @@ -50,6 +50,6 @@ class Detector: def __repr__(self): return (f'disk_name: {self._metric_name.get_disk_name()}, stage_name: {self._metric_name.get_stage_name()},' - f' access_type_name: {self._metric_name.get_io_access_type_name()},' + f' io_type_name: {self._metric_name.get_io_access_type_name()},' f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},' f' sliding_window_type: {self._slidingWindow}') diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py index d395d48..89191e5 100644 --- a/src/python/sentryPlugins/ai_block_io/sliding_window.py +++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py @@ -52,7 +52,7 @@ class SlidingWindow: return False, None, None def __repr__(self): - return "SlidingWindow" + return "[SlidingWindow]" class NotContinuousSlidingWindow(SlidingWindow): @@ -65,7 +65,7 @@ class NotContinuousSlidingWindow(SlidingWindow): return False, self._io_data_queue, self._ai_threshold def __repr__(self): - return "NotContinuousSlidingWindow" + return f"[NotContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]" class ContinuousSlidingWindow(SlidingWindow): @@ -84,7 +84,7 @@ class ContinuousSlidingWindow(SlidingWindow): return False, self._io_data_queue, self._ai_threshold def __repr__(self): - return "ContinuousSlidingWindow" + return f"[ContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]" class MedianSlidingWindow(SlidingWindow): @@ -98,7 +98,7 @@ class MedianSlidingWindow(SlidingWindow): return False, self._io_data_queue, self._ai_threshold def __repr__(self): - return "MedianSlidingWindow" + return f"[MedianSlidingWindow, window size: {self._queue_length}]" class SlidingWindowFactory: diff --git a/src/python/sentryPlugins/ai_block_io/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py index ff85d85..3b7a5a8 100644 --- a/src/python/sentryPlugins/ai_block_io/threshold.py +++ b/src/python/sentryPlugins/ai_block_io/threshold.py @@ -75,7 +75,7 @@ class AbsoluteThreshold(Threshold): pass def __repr__(self): - return "AbsoluteThreshold" + return "[AbsoluteThreshold]" class BoxplotThreshold(Threshold): @@ -109,7 +109,7 @@ class BoxplotThreshold(Threshold): self.new_data_size = 0 def __repr__(self): - return "BoxplotThreshold" + return f"[BoxplotThreshold, param is: {self.parameter}]" class NSigmaThreshold(Threshold): @@ -142,7 +142,7 @@ class NSigmaThreshold(Threshold): self.new_data_size = 0 def __repr__(self): - return "NSigmaThreshold" + return f"[NSigmaThreshold, param is: 
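
Net behavior of the reworked disk option (as the parsing above implements it): a missing option or the literal value default enables every disk the collector reports; an explicitly empty value disables detection; otherwise the comma-separated names are intersected with the available disks, and unknown names are logged and skipped. For example (hypothetical device names):

    [common]
    disk=sda,sdb
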
{self.parameter}]" class ThresholdType(Enum): -- Gitee From f7811376596833d4f2caa8d50017d3a9e73a6a0c Mon Sep 17 00:00:00 2001 From: zhangnan Date: Thu, 10 Oct 2024 12:27:06 +0800 Subject: [PATCH 30/76] ebpf fix alarm bug --- src/c/ebpf_collector/ebpf_collector.h | 2 +- src/python/sentryCollector/collect_io.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/c/ebpf_collector/ebpf_collector.h b/src/c/ebpf_collector/ebpf_collector.h index 1ae33de..dca04d8 100644 --- a/src/c/ebpf_collector/ebpf_collector.h +++ b/src/c/ebpf_collector/ebpf_collector.h @@ -11,7 +11,7 @@ typedef long long unsigned int u64; typedef unsigned int u32; #define MAX_BUCKETS 1 -#define THRESHOLD 1000 +#define THRESHOLD 1000000000 #define DURATION_THRESHOLD 500000000 #define RWBS_LEN 8 diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py index 9d61f60..ba94b9d 100644 --- a/src/python/sentryCollector/collect_io.py +++ b/src/python/sentryCollector/collect_io.py @@ -309,7 +309,8 @@ class CollectIo(): curr_iops = self.get_ebpf_iops(curr_finish_count=curr_finish_count, prev_finish_count=prev_finish_count) curr_io_length = self.get_ebpf_io_length(curr_latency=curr_latency, prev_latency=prev_latency) curr_io_dump = self.get_ebpf_io_dump(curr_io_dump_count=curr_io_dump_count, prev_io_dump_count=prev_io_dump_count) - IO_GLOBAL_DATA[disk_name][stage][io_type].insert(0, [curr_lat, curr_iops, curr_io_length, curr_io_dump]) + IO_GLOBAL_DATA[disk_name][stage][io_type].insert(0, [curr_lat, curr_io_dump, curr_io_length, curr_iops]) + logging.debug(f"ebpf collect data : {IO_GLOBAL_DATA}") elapsed_time = time.time() - start_time sleep_time = self.period_time - elapsed_time if sleep_time < 0: -- Gitee From 0e628c7ba9efe30cac96a27564b5c7f4e2cb3dc0 Mon Sep 17 00:00:00 2001 From: PshySimon Date: Thu, 10 Oct 2024 16:15:52 +0800 Subject: [PATCH 31/76] xalarm add alarm msg length to 8192 --- src/libso/xalarm/register_xalarm.c | 2 +- src/libso/xalarm/register_xalarm.h | 2 +- src/python/xalarm/register_xalarm.py | 2 +- src/python/xalarm/sentry_notify.py | 2 +- src/python/xalarm/xalarm_api.py | 8 ++++++-- src/python/xalarm/xalarm_server.py | 2 +- 6 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c index 21a419f..5aff2bc 100644 --- a/src/libso/xalarm/register_xalarm.c +++ b/src/libso/xalarm/register_xalarm.c @@ -35,7 +35,7 @@ #define ALARM_SOCKET_PERMISSION 0700 #define TIME_UNIT_MILLISECONDS 1000 -#define MAX_PARAS_LEN 1023 +#define MAX_PARAS_LEN 8191 #define MIN_ALARM_ID 1001 #define MAX_ALARM_ID (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) diff --git a/src/libso/xalarm/register_xalarm.h b/src/libso/xalarm/register_xalarm.h index fef9482..dcf4f03 100644 --- a/src/libso/xalarm/register_xalarm.h +++ b/src/libso/xalarm/register_xalarm.h @@ -11,7 +11,7 @@ #include #include -#define ALARM_INFO_MAX_PARAS_LEN 1024 +#define ALARM_INFO_MAX_PARAS_LEN 8192 #define MAX_STRERROR_SIZE 1024 #define MAX_ALARM_TYEPS 1024 #define MIN_ALARM_ID 1001 diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py index 6756b1b..edd9994 100644 --- a/src/python/xalarm/register_xalarm.py +++ b/src/python/xalarm/register_xalarm.py @@ -11,7 +11,7 @@ from struct import error as StructParseError from .xalarm_api import Xalarm, alarm_bin2stu -ALARM_REPORT_LEN = 1048 +ALARM_REPORT_LEN = 8216 MAX_NUM_OF_ALARM_ID=128 MIN_ALARM_ID = 1001 MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) diff --git 
a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py index a19e5b3..c763a24 100644 --- a/src/python/xalarm/sentry_notify.py +++ b/src/python/xalarm/sentry_notify.py @@ -17,7 +17,7 @@ CRITICAL_ALM = 3 ALARM_TYPE_OCCUR = 1 ALARM_TYPE_RECOVER = 2 -MAX_PUC_PARAS_LEN = 1024 +MAX_PUC_PARAS_LEN = 8192 DIR_XALARM = "/var/run/xalarm" PATH_REPORT_ALARM = "/var/run/xalarm/report" diff --git a/src/python/xalarm/xalarm_api.py b/src/python/xalarm/xalarm_api.py index 99eabf5..863bd02 100644 --- a/src/python/xalarm/xalarm_api.py +++ b/src/python/xalarm/xalarm_api.py @@ -23,7 +23,7 @@ ALARM_LEVELS = (1, 2, 3, 4, 5) ALARM_SOCK_PATH = "/var/run/xalarm/report" MIN_ALARM_ID = 1001 MAX_ALARM_ID = 1128 -MAX_MSG_LEN = 1024 +MAX_MSG_LEN = 8192 @dataclasses.dataclass @@ -120,6 +120,10 @@ def alarm_bin2stu(bin_data): def alarm_stu2bin(alarm_info: Xalarm): + alarm_msg = alarm_info.msg1 + padding_length = MAX_MSG_LEN - len(alarm_msg) + if padding_length > 0: + alarm_msg = alarm_msg + ('\x00' * padding_length) return struct.pack( f'@HBBll{MAX_MSG_LEN}s', alarm_info.alarm_id, @@ -127,4 +131,4 @@ def alarm_stu2bin(alarm_info: Xalarm): alarm_info.alarm_type, alarm_info.timetamp.tv_sec, alarm_info.timetamp.tv_usec, - alarm_info.msg1.encode('utf-8')) + alarm_msg.encode('utf-8')) diff --git a/src/python/xalarm/xalarm_server.py b/src/python/xalarm/xalarm_server.py index fcaf393..2882609 100644 --- a/src/python/xalarm/xalarm_server.py +++ b/src/python/xalarm/xalarm_server.py @@ -28,7 +28,7 @@ from .xalarm_transfer import check_filter, transmit_alarm, wait_for_connection ALARM_DIR = "/var/run/xalarm" USER_RECV_SOCK = "/var/run/xalarm/alarm" SOCK_FILE = "/var/run/xalarm/report" -ALARM_REPORT_LEN = 1048 +ALARM_REPORT_LEN = 8216 ALARM_DIR_PERMISSION = 0o750 ALARM_LISTEN_QUEUE_LEN = 5 -- Gitee From f87500c73456a9d931963b928a5ac7f6f695cb0c Mon Sep 17 00:00:00 2001 From: gaoruoshu Date: Thu, 10 Oct 2024 15:07:29 +0800 Subject: [PATCH 32/76] add log for improving maintainability --- .../avg_block_io/avg_block_io.py | 4 +- .../sentryPlugins/avg_block_io/module_conn.py | 57 ++++++++++------- .../avg_block_io/stage_window.py | 8 +++ .../sentryPlugins/avg_block_io/utils.py | 63 +++++++++++++++++-- 4 files changed, 103 insertions(+), 29 deletions(-) diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py index 26a60c5..cf2ded3 100644 --- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py @@ -194,11 +194,11 @@ def init_io_win(io_dic, config, common_param): if avg_lim_value and avg_time_value and tot_lim_value: io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value) - logging.debug("Successfully create {}-{}-{} latency window".format(disk_name, stage_name, rw)) + logging.debug("Successfully create {}-{}-{}-latency window".format(disk_name, stage_name, rw)) if iodump_lim_value is not None: io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value) - logging.debug("Successfully create {}-{}-{} iodump window".format(disk_name, stage_name, rw)) + logging.debug("Successfully create {}-{}-{}-iodump window".format(disk_name, stage_name, rw)) return io_data, io_avg_value diff --git 
a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
index 2fc5a83..40b3fcc 100644
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
@@ -13,7 +13,7 @@ import logging
 import sys
 import time
 
-from .utils import is_abnormal
+from .utils import is_abnormal, get_win_data, log_slow_win
 from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
 from syssentry.result import ResultLevel, report_result
 from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR
@@ -66,36 +66,51 @@ def report_alarm_fail(alarm_info):
 
 def process_report_data(disk_name, rw, io_data):
     """check abnormal window and report to xalarm"""
-    if not is_abnormal((disk_name, 'bio', rw), io_data):
+    abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data)
+    if not abnormal:
         return
 
-    msg = {"alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw}
+    msg = {
+        "alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw,
+        "reason": "unknown", "block_stack": "bio", "alarm_type": abnormal_list,
+        "details": get_win_data(disk_name, rw, io_data)
+    }
 
+    # io press
     ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
     for stage_name in ctrl_stage:
-        if is_abnormal((disk_name, stage_name, rw), io_data):
-            msg["reason"] = "IO press slow"
-            msg["block_stack"] = f"bio,{stage_name}"
-            logging.warning("{} - {} report IO press slow".format(disk_name, rw))
-            xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
-            return
-
-    if is_abnormal((disk_name, 'rq_driver', rw), io_data):
+        abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data)
+        if not abnormal:
+            continue
+        msg["reason"] = "IO press"
+        msg["block_stack"] = f"bio,{stage_name}"
+        msg["alarm_type"] = abnormal_list
+        log_slow_win(msg, "IO press")
+        xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
+        return
+
+    # driver slow
+    abnormal, abnormal_list = is_abnormal((disk_name, 'rq_driver', rw), io_data)
+    if abnormal:
         msg["reason"] = "driver slow"
         msg["block_stack"] = "bio,rq_driver"
-        logging.warning("{} - {} report driver slow".format(disk_name, rw))
+        msg["alarm_type"] = abnormal_list
+        log_slow_win(msg, "driver slow")
         xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
         return
 
+    # kernel slow
     kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue']
     for stage_name in kernel_stage:
-        if is_abnormal((disk_name, stage_name, rw), io_data):
-            msg["reason"] = "kernel slow"
-            msg["block_stack"] = f"bio,{stage_name}"
-            logging.warning("{} - {} report kernel slow".format(disk_name, rw))
-            xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
-            return
-
-    msg["reason"] = "unknown"
-    msg["block_stack"] = "bio"
-    logging.warning("{} - {} report UNKNOWN slow".format(disk_name, rw))
+        abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data)
+        if not abnormal:
+            continue
+        msg["reason"] = "kernel slow"
+        msg["block_stack"] = f"bio,{stage_name}"
+        msg["alarm_type"] = abnormal_list
+        log_slow_win(msg, "kernel slow")
+        xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
+        return
+
+    log_slow_win(msg, "unknown")
     xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
 
 
diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py
index 9b0ce79..5113782 100644
--- a/src/python/sentryPlugins/avg_block_io/stage_window.py
+++ 
b/src/python/sentryPlugins/avg_block_io/stage_window.py @@ -14,6 +14,11 @@ class AbnormalWindowBase: self.window_size = window_size self.window_threshold = window_threshold self.abnormal_window = [False] * window_size + self.window_data = [-1] * window_size + + def append_new_data(self, ab_res): + self.window_data.pop(0) + self.window_data.append(ab_res) def append_new_period(self, ab_res, avg_val=0): self.abnormal_window.pop(0) @@ -25,6 +30,9 @@ class AbnormalWindowBase: def is_abnormal_window(self): return sum(self.abnormal_window) > self.window_threshold + def window_data_to_string(self): + return ",".join(str(x) for x in self.window_data) + class IoWindow(AbnormalWindowBase): def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40): diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py index 2de9a46..3b7f027 100644 --- a/src/python/sentryPlugins/avg_block_io/utils.py +++ b/src/python/sentryPlugins/avg_block_io/utils.py @@ -65,15 +65,32 @@ def set_nested_value(data, keys, value): return True +def get_win_data(disk_name, rw, io_data): + """get latency and iodump win data""" + latency = '' + iodump = '' + for stage_name in io_data[disk_name]: + if 'latency' in io_data[disk_name][stage_name][rw]: + latency_list = io_data[disk_name][stage_name][rw]['latency'].window_data_to_string() + latency += f'{stage_name}: [{latency_list}], ' + if 'iodump' in io_data[disk_name][stage_name][rw]: + iodump_list = io_data[disk_name][stage_name][rw]['iodump'].window_data_to_string() + iodump += f'{stage_name}: [{iodump_list}], ' + return {"latency": latency[:-2], "iodump": iodump[:-2]} + + def is_abnormal(io_key, io_data): """check if latency and iodump win abnormal""" + abnormal_list = '' for key in ['latency', 'iodump']: all_keys = get_nested_value(io_data, io_key) if all_keys and key in all_keys: win = get_nested_value(io_data, io_key + (key,)) if win and win.is_abnormal_window(): - return True - return False + abnormal_list += key + ', ' + if not abnormal_list: + return False, abnormal_list + return True, abnormal_list[:-2] def update_io_avg(old_avg, period_value, win_size): @@ -87,8 +104,8 @@ def update_io_avg(old_avg, period_value, win_size): return [new_avg_value, new_avg_count] -def update_io_data(old_avg, period_value, win_size, io_data, io_key): - """update data of latency and iodump window""" +def update_io_period(old_avg, period_value, io_data, io_key): + """update period of latency and iodump window""" all_wins = get_nested_value(io_data, io_key) if all_wins and "latency" in all_wins: io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE]) @@ -96,20 +113,54 @@ def update_io_data(old_avg, period_value, win_size, io_data, io_key): io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1]) +def update_io_data(period_value, io_data, io_key): + """update data of latency and iodump window""" + all_wins = get_nested_value(io_data, io_key) + if all_wins and "latency" in all_wins: + io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_data(period_value[0]) + if all_wins and "iodump" in all_wins: + io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_data(period_value[1]) + + +def log_abnormal_period(old_avg, period_value, io_data, io_key): + """record log of abnormal period""" + all_wins = get_nested_value(io_data, io_key) + if all_wins and "latency" in all_wins: + if 
all_wins["latency"].is_abnormal_period(period_value[0], old_avg[AVG_VALUE]): + logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, " + f"type: latency, avg: {round(old_avg[AVG_VALUE], 3)}, curr_val: {period_value[0]}") + if all_wins and "iodump" in all_wins: + if all_wins["iodump"].is_abnormal_period(period_value[1]): + logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, " + f"type: iodump, curr_val: {period_value[1]}") + + +def log_slow_win(msg, reason): + """record log of slow win""" + logging.warning(f"[SLOW IO] disk: {msg['driver_name']}, stage: {msg['block_stack']}, " + f"iotype: {msg['io_type']}, type: {msg['alarm_type']}, reason: {reason}") + logging.info(f"latency: {msg['details']['latency']}") + logging.info(f"iodump: {msg['details']['iodump']}") + + def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data): """update avg and check abonrmal, return true if win_size full""" period_value = get_nested_value(data, io_key) old_avg = get_nested_value(io_avg_value, io_key) # 更新avg数据 + update_io_data(period_value, io_data, io_key) if old_avg[AVG_COUNT] < win_size: set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size)) return False + # 打印异常周期数据 + log_abnormal_period(old_avg, period_value, io_data, io_key) + # 更新win数据 -- 判断异常周期 - update_io_data(old_avg, period_value, win_size, io_data, io_key) + update_io_period(old_avg, period_value, io_data, io_key) all_wins = get_nested_value(io_data, io_key) - if all_wins and 'latency' not in all_wins: + if not all_wins or 'latency' not in all_wins: return True period = get_nested_value(io_data, io_key + ("latency",)) if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]): -- Gitee From 958d41a6c796244bf42e9e29a021c3ef46d11c68 Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Thu, 10 Oct 2024 20:17:34 +0800 Subject: [PATCH 33/76] add get_disk_type and fix some bugs --- service/sentryCollector.service | 2 +- src/python/sentryCollector/collect_io.py | 26 ++++++-- src/python/sentryCollector/collect_plugin.py | 68 +++++++++++++++++++- 3 files changed, 89 insertions(+), 7 deletions(-) diff --git a/service/sentryCollector.service b/service/sentryCollector.service index 4ee07d5..e09ddb3 100644 --- a/service/sentryCollector.service +++ b/service/sentryCollector.service @@ -1,5 +1,5 @@ [Unit] -Description = Collection module added for sysSentry and kernel lock-free collection +Description = Collection module added for sysSentry [Service] ExecStart=/usr/bin/python3 /usr/bin/sentryCollector diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py index ba94b9d..d734734 100644 --- a/src/python/sentryCollector/collect_io.py +++ b/src/python/sentryCollector/collect_io.py @@ -124,7 +124,7 @@ class CollectIo(): return 0 if finish <= 0 or lat_time <= 0: return 0 - value = lat_time / finish / 1000 / 1000 + value = lat_time / finish / 1000 if value.is_integer(): return int(value) else: @@ -132,11 +132,17 @@ class CollectIo(): def get_io_length(self, curr_stage_value, last_stage_value, category): try: - finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) + lat_time = (int(curr_stage_value[category * 3 + IoStatus.LATENCY]) - int(last_stage_value[category * 3 + IoStatus.LATENCY])) except ValueError as e: logging.error("get_io_length convert to int failed, %s", e) return 0 - value = finish / self.period_time / 1000 / 
From 958d41a6c796244bf42e9e29a021c3ef46d11c68 Mon Sep 17 00:00:00 2001
From: zhuofeng
Date: Thu, 10 Oct 2024 20:17:34 +0800
Subject: [PATCH 33/76] add get_disk_type and fix some bugs

---
 service/sentryCollector.service              |  2 +-
 src/python/sentryCollector/collect_io.py     | 26 ++++++--
 src/python/sentryCollector/collect_plugin.py | 68 +++++++++++++++++++-
 3 files changed, 89 insertions(+), 7 deletions(-)

diff --git a/service/sentryCollector.service b/service/sentryCollector.service
index 4ee07d5..e09ddb3 100644
--- a/service/sentryCollector.service
+++ b/service/sentryCollector.service
@@ -1,5 +1,5 @@
 [Unit]
-Description = Collection module added for sysSentry and kernel lock-free collection
+Description = Collection module added for sysSentry
 
 [Service]
 ExecStart=/usr/bin/python3 /usr/bin/sentryCollector
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
index ba94b9d..d734734 100644
--- a/src/python/sentryCollector/collect_io.py
+++ b/src/python/sentryCollector/collect_io.py
@@ -124,7 +124,7 @@ class CollectIo():
             return 0
         if finish <= 0 or lat_time <= 0:
             return 0
-        value = lat_time / finish / 1000 / 1000
+        value = lat_time / finish / 1000
         if value.is_integer():
             return int(value)
         else:
@@ -132,11 +132,17 @@ class CollectIo():
     def get_io_length(self, curr_stage_value, last_stage_value, category):
         try:
-            finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH])
+            lat_time = (int(curr_stage_value[category * 3 + IoStatus.LATENCY]) - int(last_stage_value[category * 3 + IoStatus.LATENCY]))
         except ValueError as e:
             logging.error("get_io_length convert to int failed, %s", e)
             return 0
-        value = finish / self.period_time / 1000 / 1000
+        if lat_time <= 0:
+            return 0
+        # convert ns to us
+        lat_time = lat_time / 1000
+        # convert s to us
+        period_time = self.period_time * 1000 * 1000
+        value = lat_time / period_time
         if value.is_integer():
             return int(value)
         else:
@@ -149,6 +155,8 @@ class CollectIo():
             with open(io_dump_file, 'r') as file:
                 for line in file:
                     count += line.count('.op=' + Io_Category[category])
+            if count > 0:
+                logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}")
         except FileNotFoundError:
             logging.error("The file %s does not exist.", io_dump_file)
             return count
@@ -309,6 +317,8 @@ class CollectIo():
                 curr_iops = self.get_ebpf_iops(curr_finish_count=curr_finish_count, prev_finish_count=prev_finish_count)
                 curr_io_length = self.get_ebpf_io_length(curr_latency=curr_latency, prev_latency=prev_latency)
                 curr_io_dump = self.get_ebpf_io_dump(curr_io_dump_count=curr_io_dump_count, prev_io_dump_count=prev_io_dump_count)
+                if curr_io_dump > 0:
+                    logging.info(f"ebpf io_dump info : {disk_name}, {stage}, {category}, {curr_io_dump}")
                 IO_GLOBAL_DATA[disk_name][stage][io_type].insert(0, [curr_lat, curr_io_dump, curr_io_length, curr_iops])
                 logging.debug(f"ebpf collect data : {IO_GLOBAL_DATA}")
                 elapsed_time = time.time() - start_time
@@ -334,7 +344,7 @@ class CollectIo():
         lat_time = curr_latency - prev_latency
         if finish <= 0 or lat_time <= 0:
             return 0
-        value = lat_time / finish / 1000 / 1000
+        value = lat_time / finish / 1000
         if value.is_integer():
             return int(value)
         else:
@@ -362,7 +372,11 @@ class CollectIo():
         lat_time = curr_latency - prev_latency
         if lat_time <= 0:
             return 0
-        value = lat_time / self.period_time / 1000 / 1000
+        # convert ns to us
+        lat_time = lat_time / 1000
+        # convert s to us
+        period_time = self.period_time * 1000 * 1000
+        value = lat_time / period_time
         if value.is_integer():
             return int(value)
         else:
@@ -417,6 +431,8 @@ class CollectIo():
             if self.get_blk_io_hierarchy(disk_name, stage_list) < 0:
                 continue
             self.append_period_lat(disk_name, stage_list)
+
+        logging.debug(f"no-lock collect data : {IO_GLOBAL_DATA}")
 
         elapsed_time = time.time() - start_time
         sleep_time = self.period_time - elapsed_time
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
index 3e2cf4c..31bf11b 100644
--- a/src/python/sentryCollector/collect_plugin.py
+++ b/src/python/sentryCollector/collect_plugin.py
@@ -16,6 +16,7 @@ import json
 import socket
 import logging
 import re
+import os
 
 COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock"
 
@@ -58,6 +59,8 @@ class ResultMessage():
     RESULT_EXCEED_LIMIT = 4        # the parameter length exceeds the limit.
     RESULT_PARSE_FAILED = 5        # parse failed
     RESULT_INVALID_CHAR = 6        # invalid char
+    RESULT_DISK_NOEXIST = 7        # disk is not exist
+    RESULT_DISK_TYPE_MISMATCH = 8  # disk type mismatch
 
 Result_Messages = {
     ResultMessage.RESULT_SUCCEED: "Succeed",
@@ -66,9 +69,15 @@ Result_Messages = {
     ResultMessage.RESULT_INVALID_LENGTH: "Invalid parameter length",
     ResultMessage.RESULT_EXCEED_LIMIT: "The parameter length exceeds the limit",
     ResultMessage.RESULT_PARSE_FAILED: "Parse failed",
-    ResultMessage.RESULT_INVALID_CHAR: "Invalid char"
+    ResultMessage.RESULT_INVALID_CHAR: "Invalid char",
+    ResultMessage.RESULT_DISK_NOEXIST: "Disk is not exist",
+    ResultMessage.RESULT_DISK_TYPE_MISMATCH: "Disk type mismatch"
 }
 
+class DiskType():
+    TYPE_NVME_SSD = 0
+    TYPE_SATA_SSD = 1
+    TYPE_SATA_HDD = 2
 
 def client_send_and_recv(request_data, data_str_len, protocol):
     """client socket send and recv message"""
     try:
@@ -273,3 +282,60 @@ def inter_get_io_data(period, disk_list, stage, iotype):
     result['message'] = result_message
     return result
 
+def get_disk_type(disk):
+    result = {}
+    result['ret'] = ResultMessage.RESULT_UNKNOWN
+    result['message'] = ""
+    if not disk:
+        logging.error("param is invalid")
+        result['ret'] = ResultMessage.RESULT_NOT_PARAM
+        return result
+    if len(disk) <= 0 or len(disk) > LIMIT_DISK_CHAR_LEN:
+        logging.error("invalid disk length")
+        result['ret'] = ResultMessage.RESULT_INVALID_LENGTH
+        return result
+    pattern = r'^[a-zA-Z0-9_-]+$'
+    if not re.match(pattern, disk):
+        logging.error("%s is invalid char", disk)
+        result['ret'] = ResultMessage.RESULT_INVALID_CHAR
+        return result
+
+    base_path = '/sys/block'
+    all_disk = []
+    for disk_name in os.listdir(base_path):
+        all_disk.append(disk_name)
+
+    if disk not in all_disk:
+        logging.error("disk %s is not exist", disk)
+        result['ret'] = ResultMessage.RESULT_DISK_NOEXIST
+        return result
+
+    if disk[0:4] == "nvme":
+        result['message'] = str(DiskType.TYPE_NVME_SSD)
+    elif disk[0:2] == "sd":
+        disk_file = '/sys/block/{}/queue/rotational'.format(disk)
+        try:
+            with open(disk_file, 'r') as file:
+                num = int(file.read())
+                if num == 1:
+                    result['message'] = str(DiskType.TYPE_SATA_HDD)
+                elif num == 0:
+                    result['message'] = str(DiskType.TYPE_SATA_SSD)
+                else:
+                    logging.error("disk %s is not support, num = %d", disk, num)
+                    result['ret'] = ResultMessage.RESULT_DISK_TYPE_MISMATCH
+                    return result
+        except FileNotFoundError:
+            logging.error("The disk_file [%s] does not exist", disk_file)
+            result['ret'] = ResultMessage.RESULT_DISK_NOEXIST
+            return result
+        except Exception as e:
+            logging.error("open disk_file %s happen an error: %s", disk_file, e)
+            return result
+    else:
+        logging.error("disk %s is not support", disk)
+        result['ret'] = ResultMessage.RESULT_DISK_TYPE_MISMATCH
+        return result
+
+    result['ret'] = ResultMessage.RESULT_SUCCEED
+    return result
\ No newline at end of file
--
Gitee
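For reference, the sysfs probe behind get_disk_type() condenses to the sketch below. NVMe devices are classified by name prefix; for sd* devices the kernel's rotational flag distinguishes spinning disks (1) from solid-state drives (0). Error handling and the sd-prefix restriction are trimmed here, so treat it as an illustration rather than a drop-in replacement, and note it only runs on a Linux host with sysfs.

# Sketch of the probe get_disk_type() performs (error handling trimmed).
# /sys/block/<disk>/queue/rotational: 1 = spinning disk, 0 = solid state.
import os

def probe_disk_type(disk: str) -> str:
    if disk.startswith("nvme"):
        return "nvme_ssd"
    with open(f"/sys/block/{disk}/queue/rotational") as f:
        rotational = int(f.read().strip())
    return "sata_hdd" if rotational == 1 else "sata_ssd"

if __name__ == "__main__":
    for name in os.listdir("/sys/block"):
        print(name, probe_disk_type(name))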
From b801c2b875a75a7a2426ff7829d1530382e39396 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Thu, 10 Oct 2024 17:21:48 +0800
Subject: [PATCH 34/76] ai_block_io adapt alarm module

---
 config/tasks/ai_block_io.mod                  |  4 +-
 .../sentryPlugins/ai_block_io/ai_block_io.py  | 28 +++++---
 .../sentryPlugins/ai_block_io/alarm_report.py | 65 ++++++++++++++-----
 .../sentryPlugins/ai_block_io/data_access.py  |  5 +-
 .../sentryPlugins/ai_block_io/detector.py     |  2 +-
 5 files changed, 73 insertions(+), 31 deletions(-)

diff --git a/config/tasks/ai_block_io.mod b/config/tasks/ai_block_io.mod
index 1971d7d..82f4f0b 100644
--- a/config/tasks/ai_block_io.mod
+++ b/config/tasks/ai_block_io.mod
@@ -2,4 +2,6 @@ enabled=yes
 task_start=/usr/bin/python3 /usr/bin/ai_block_io
 task_stop=pkill -f /usr/bin/ai_block_io
-type=oneshot
\ No newline at end of file
+type=oneshot
+alarm_id=1002
+alarm_clear_time=5
\ No newline at end of file
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index 3b00ef3..77104a9 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -20,14 +20,14 @@ from .utils import get_data_queue_size_and_update_size
 from .config_parser import ConfigParser
 from .data_access import get_io_data_from_collect_plug, check_collect_valid
 from .io_data import MetricName
-from .alarm_report import AlarmReport
+from .alarm_report import Xalarm, Report
 
 CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini"
 
 def sig_handler(signum, frame):
     logging.info("receive signal: %d", signum)
-    AlarmReport().report_fail(f"receive signal: {signum}")
+    Report.report_pass(f"receive signal: {signum}, exiting...")
     exit(signum)
@@ -44,6 +44,10 @@ class SlowIODetection:
 
     def __init_detector_name_list(self):
         self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
+        if self._disk_list is None:
+            Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...")
+            exit(1)
+
         logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
         disks_to_detection: list = self._config_parser.get_disks_to_detection()
         # 情况1:None,则启用所有磁盘检测
@@ -101,7 +105,8 @@ class SlowIODetection:
             )
             logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}')
             if io_data_dict_with_disk_name is None:
-                continue
+                Report.report_pass("get io data error, please check if the collector plug is enable. exitting...")
+                exit(1)
 
             # Step2:慢IO检测
             logging.debug('step2. Start to detection slow io event.')
@@ -117,13 +122,16 @@ class SlowIODetection:
             for slow_io_event in slow_io_event_list:
                 metric_name: MetricName = slow_io_event[0]
                 result = slow_io_event[1]
-                alarm_content = (f"disk {metric_name.get_disk_name()} has slow io event. "
-                                 f"stage is: {metric_name.get_stage_name()}, "
-                                 f"io access type is: {metric_name.get_io_access_type_name()}, "
-                                 f"metric is: {metric_name.get_metric_name()}, "
-                                 f"current window is: {result[1]}, "
-                                 f"threshold is: {result[2]}")
-                AlarmReport.report_major_alm(alarm_content)
+                alarm_content = {
+                    "driver_name": f"{metric_name.get_disk_name()}",
+                    "reason": "disk_slow",
+                    "block_stack": f"{metric_name.get_stage_name()}",
+                    "io_type": f"{metric_name.get_io_access_type_name()}",
+                    "alarm_source": "ai_block_io",
+                    "alarm_type": "latency",
+                    "details": f"current window is: {result[1]}, threshold is: {result[2]}.",
+                }
+                Xalarm.major(alarm_content)
                 logging.warning(alarm_content)
 
             # Step4:等待检测时间
diff --git a/src/python/sentryPlugins/ai_block_io/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py
index 230c8cd..92bd6e3 100644
--- a/src/python/sentryPlugins/ai_block_io/alarm_report.py
+++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py
@@ -9,41 +9,72 @@
 # PURPOSE.
 # See the Mulan PSL v2 for more details.
-from syssentry.result import ResultLevel, report_result import logging import json +from xalarm.sentry_notify import ( + xalarm_report, + MINOR_ALM, + MAJOR_ALM, + CRITICAL_ALM, + ALARM_TYPE_OCCUR, + ALARM_TYPE_RECOVER, +) + +from syssentry.result import ResultLevel, report_result + -class AlarmReport: +class Report: TASK_NAME = "ai_block_io" @staticmethod def report_pass(info: str): - report_result(AlarmReport.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info})) - logging.info(f'Report {AlarmReport.TASK_NAME} PASS: {info}') + report_result(Report.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info})) + logging.info(f'Report {Report.TASK_NAME} PASS: {info}') @staticmethod def report_fail(info: str): - report_result(AlarmReport.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info})) - logging.info(f'Report {AlarmReport.TASK_NAME} FAIL: {info}') + report_result(Report.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info})) + logging.info(f'Report {Report.TASK_NAME} FAIL: {info}') @staticmethod def report_skip(info: str): - report_result(AlarmReport.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info})) - logging.info(f'Report {AlarmReport.TASK_NAME} SKIP: {info}') + report_result(Report.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info})) + logging.info(f'Report {Report.TASK_NAME} SKIP: {info}') + + +class Xalarm: + ALARM_ID = 1002 @staticmethod - def report_minor_alm(info: str): - report_result(AlarmReport.TASK_NAME, ResultLevel.MINOR_ALM, json.dumps({"msg": info})) - logging.info(f'Report {AlarmReport.TASK_NAME} MINOR_ALM: {info}') + def minor(info: dict): + info_str = json.dumps(info) + xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_OCCUR, info_str) + logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}") @staticmethod - def report_major_alm(info: str): - report_result(AlarmReport.TASK_NAME, ResultLevel.MAJOR_ALM, json.dumps({"msg": info})) - logging.info(f'Report {AlarmReport.TASK_NAME} MAJOR_ALM: {info}') + def major(info: dict): + info_str = json.dumps(info) + xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_OCCUR, info_str) + logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}") @staticmethod - def report_critical_alm(info: str): - report_result(AlarmReport.TASK_NAME, ResultLevel.CRITICAL_ALM, json.dumps({"msg": info})) - logging.info(f'Report {AlarmReport.TASK_NAME} CRITICAL_ALM: {info}') + def critical(info: dict): + info_str = json.dumps(info) + xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_OCCUR, info_str) + logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}") + + def minor_recover(info: dict): + info_str = json.dumps(info) + xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_RECOVER, info_str) + logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}") + + def major_recover(info: dict): + info_str = json.dumps(info) + xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_RECOVER, info_str) + logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}") + def critical_recover(info: dict): + info_str = json.dumps(info) + xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_RECOVER, info_str) + logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}") diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py index 01c5315..c7679cd 100644 --- a/src/python/sentryPlugins/ai_block_io/data_access.py +++ b/src/python/sentryPlugins/ai_block_io/data_access.py @@ -42,10 +42,11 @@ def check_collect_valid(period): data = 
json.loads(data_raw["message"]) except Exception as e: logging.warning(f"get io data failed, {e}") - return [] + return None return [k for k in data.keys()] else: - return [] + logging.warning(f"get io data failed, return {data_raw}") + return None def _get_raw_data(period, disk_list): diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py index a48144f..0ed282b 100644 --- a/src/python/sentryPlugins/ai_block_io/detector.py +++ b/src/python/sentryPlugins/ai_block_io/detector.py @@ -35,7 +35,7 @@ class Detector: self._count += 1 if self._count % 15 == 0: self._count = 0 - logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") + logging.debug(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") logging.debug(f'enter Detector: {self}') metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) if metric_value is None: -- Gitee From d1c9dc14243526b6252f17adb56852288c6db9a1 Mon Sep 17 00:00:00 2001 From: gaoruoshu Date: Fri, 11 Oct 2024 10:48:35 +0800 Subject: [PATCH 35/76] diff disk type use diff config --- config/plugins/avg_block_io.ini | 26 +++- src/python/sentryCollector/collect_plugin.py | 6 + .../avg_block_io/avg_block_io.py | 144 ++++++++---------- .../sentryPlugins/avg_block_io/module_conn.py | 19 ++- .../sentryPlugins/avg_block_io/utils.py | 43 ++++++ 5 files changed, 146 insertions(+), 92 deletions(-) diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini index 858db18..5c4b9b0 100644 --- a/config/plugins/avg_block_io.ini +++ b/config/plugins/avg_block_io.ini @@ -11,13 +11,29 @@ period_time=1 win_size=30 win_threshold=6 -[latency] -read_avg_lim=10 -write_avg_lim=10 +[latency_nvme_ssd] +read_avg_lim=300 +write_avg_lim=300 read_avg_time=3 write_avg_time=3 -read_tot_lim=50 -write_tot_lim=50 +read_tot_lim=500 +write_tot_lim=500 + +[latency_sata_ssd] +read_avg_lim=10000 +write_avg_lim=10000 +read_avg_time=3 +write_avg_time=3 +read_tot_lim=50000 +write_tot_lim=50000 + +[latency_sata_hdd] +read_avg_lim=15000 +write_avg_lim=15000 +read_avg_time=3 +write_avg_time=3 +read_tot_lim=50000 +write_tot_lim=50000 [iodump] read_iodump_lim=0 diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py index 31bf11b..bec405a 100644 --- a/src/python/sentryCollector/collect_plugin.py +++ b/src/python/sentryCollector/collect_plugin.py @@ -79,6 +79,12 @@ class DiskType(): TYPE_SATA_SSD = 1 TYPE_SATA_HDD = 2 +Disk_Type = { + DiskType.TYPE_NVME_SSD: "nvme_ssd", + DiskType.TYPE_SATA_SSD: "sata_ssd", + DiskType.TYPE_SATA_HDD: "sata_hdd" +} + def client_send_and_recv(request_data, data_str_len, protocol): """client socket send and recv message""" try: diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py index cf2ded3..f3ade09 100644 --- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py @@ -14,8 +14,9 @@ import configparser import time from .stage_window import IoWindow, IoDumpWindow -from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler -from .utils import update_avg_and_check_abnormal, get_log_level +from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name +from .utils import 
update_avg_and_check_abnormal, get_log_level, get_section_value +from sentryCollector.collect_plugin import Disk_Type CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" @@ -37,44 +38,40 @@ def read_config_common(config): disk = [] if disk_name == "default" else disk_name.split(",") except configparser.NoOptionError: disk = [] - logging.warning("Unset disk, set to default") + logging.warning("Unset common.disk, set to default") try: stage_name = config.get("common", "stage") stage = [] if stage_name == "default" else stage_name.split(",") except configparser.NoOptionError: stage = [] - logging.warning("Unset stage, set to read,write") + logging.warning("Unset common.stage, set to default") if len(disk) > 10: - logging.warning("Too many disks, record only max 10 disks") + logging.warning("Too many common.disks, record only max 10 disks") disk = disk[:10] try: iotype_name = config.get("common", "iotype").split(",") - iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write', 'flush', 'discard']] - err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write', 'flush', 'discard']] + iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']] + err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']] - if iotype_list in [None, []]: - iotype_list = ["read", "write"] - except configparser.NoOptionError: - iotype = ["read", "write"] - logging.warning("Unset iotype, set to default") + if err_iotype: + report_alarm_fail("Invalid common.iotype config") - if err_iotype: - logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list)) - + except configparser.NoOptionError: + iotype_list = ["read", "write"] + logging.warning("Unset common.iotype, set to read,write") try: period_time = int(config.get("common", "period_time")) if not (1 <= period_time <= 300): raise ValueError("Invalid period_time") except ValueError: - period_time = 1 - logging.warning("Invalid period_time, set to 1s") + report_alarm_fail("Invalid common.period_time") except configparser.NoOptionError: period_time = 1 - logging.warning("Unset period_time, use 1s as default") + logging.warning("Unset common.period_time, use 1s as default") return period_time, disk, stage, iotype_list @@ -87,76 +84,56 @@ def read_config_algorithm(config): try: win_size = int(config.get("algorithm", "win_size")) if not (1 <= win_size <= 300): - raise ValueError("Invalid win_size") + raise ValueError("Invalid algorithm.win_size") except ValueError: - win_size = 30 - logging.warning("Invalid win_size, set to 30") + report_alarm_fail("Invalid algorithm.win_size config") except configparser.NoOptionError: win_size = 30 - logging.warning("Unset win_size, use 30 as default") + logging.warning("Unset algorithm.win_size, use 30 as default") try: win_threshold = int(config.get("algorithm", "win_threshold")) if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size: - raise ValueError("Invalid win_threshold") + raise ValueError("Invalid algorithm.win_threshold") except ValueError: - win_threshold = 6 - logging.warning("Invalid win_threshold, set to 6") + report_alarm_fail("Invalid algorithm.win_threshold config") except configparser.NoOptionError: win_threshold = 6 - logging.warning("Unset win_threshold, use 6 as default") + logging.warning("Unset algorithm.win_threshold, use 6 as default") return win_size, win_threshold -def read_config_lat_iodump(io_dic, config): - """read config file, get [latency] [iodump] 
section value""" +def read_config_latency(config): + """read config file, get [latency_xxx] section value""" common_param = {} - lat_sec = None - if not config.has_section("latency"): - logging.warning("Cannot find latency section in config file") - else: - lat_sec = config["latency"] - - iodump_sec = None - if not config.has_section("iodump"): - logging.warning("Cannot find iodump section in config file") - else: - iodump_sec = config["iodump"] - - if not lat_sec and not iodump_sec: - return common_param - - for io_type in io_dic["iotype_list"]: - common_param[io_type] = {} - - latency_keys = { - "avg_lim": "{}_avg_lim".format(io_type), - "avg_time": "{}_avg_time".format(io_type), - "tot_lim": "{}_tot_lim".format(io_type), - } - iodump_key = "{}_iodump_lim".format(io_type) + for type_name in Disk_Type: + section_name = f"latency_{Disk_Type[type_name]}" + if not config.has_section(section_name): + report_alarm_fail(f"Cannot find {section_name} section in config file") - if iodump_sec and iodump_key in iodump_sec and iodump_sec[iodump_key].isdecimal(): - common_param[io_type][iodump_key] = int(iodump_sec[iodump_key]) + common_param[Disk_Type[type_name]] = get_section_value(section_name, config) + return common_param - if not lat_sec: - continue - for key_suffix, key_template in latency_keys.items(): - if key_template in lat_sec and lat_sec[key_template].isdecimal(): - common_param[io_type][key_template] = int(lat_sec[key_template]) +def read_config_iodump(config): + """read config file, get [iodump] section value""" + common_param = {} + section_name = "iodump" + if not config.has_section(section_name): + report_alarm_fail(f"Cannot find {section_name} section in config file") - return common_param + return get_section_value(section_name, config) -def read_config_stage(config, stage, iotype_list): - """read config file, get [STAGE_NAME] section value""" +def read_config_stage(config, stage, iotype_list, curr_disk_type): + """read config file, get [STAGE_NAME_diskType] section value""" res = {} - if not stage in config: + section_name = f"{stage}_{curr_disk_type}" + if not config.has_section(section_name): return res - for key in config[stage]: + for key in config[section_name]: if config[stage][key].isdecimal(): res[key] = int(config[stage][key]) @@ -171,11 +148,12 @@ def init_io_win(io_dic, config, common_param): for disk_name in io_dic["disk_list"]: io_data[disk_name] = {} io_avg_value[disk_name] = {} + curr_disk_type = get_disk_type_by_name(disk_name) for stage_name in io_dic["stage_list"]: io_data[disk_name][stage_name] = {} io_avg_value[disk_name][stage_name] = {} - # step3. 
解析stage配置 - curr_stage_param = read_config_stage(config, stage_name, iotype_list) + # 解析stage配置 + curr_stage_param = read_config_stage(config, stage_name, iotype_list, curr_disk_type) for rw in iotype_list: io_data[disk_name][stage_name][rw] = {} io_avg_value[disk_name][stage_name][rw] = [0, 0] @@ -187,10 +165,10 @@ def init_io_win(io_dic, config, common_param): iodump_lim_key = "{}_iodump_lim".format(rw) # 获取值,优先从 curr_stage_param 获取,如果不存在,则从 common_param 获取 - avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(rw, {}).get(avg_lim_key)) - avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(rw, {}).get(avg_time_key)) - tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(rw, {}).get(tot_lim_key)) - iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get(rw, {}).get(iodump_lim_key)) + avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(curr_disk_type, {}).get(avg_lim_key)) + avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(curr_disk_type, {}).get(avg_time_key)) + tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(curr_disk_type, {}).get(tot_lim_key)) + iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get("iodump", {}).get(iodump_lim_key)) if avg_lim_value and avg_time_value and tot_lim_value: io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value) @@ -217,28 +195,21 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage): stage_list = [key for key in all_stage_set if key in config_stage] not_in_stage_list = [key for key in config_stage if key not in all_stage_set] - if not config_disk: + if not_in_stage_list: + report_alarm_fail(f"Invalid common.stage_list config, cannot set {not_in_stage_list}") + + if not config_disk and not not_in_disk_list: disk_list = [key for key in all_disk_set] - if not config_stage: + if not config_stage and not not_in_stage_list: stage_list = [key for key in all_stage_set] disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list - stage_list = stage_list[:15] if len(stage_list) > 15 else stage_list - - if config_disk and not disk_list: - logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk)) - disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage) - - if config_stage and not stage_list: - logging.warning("Cannot get valid stage by stage={}, set to default".format(config_stage)) - disk_list, stage_list = get_valid_disk_stage_list(io_dic, config_disk, []) if not stage_list or not disk_list: report_alarm_fail("Cannot get valid disk name or stage name.") log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list) - log_invalid_keys(not_in_stage_list, 'stage', config_stage, stage_list) return disk_list, stage_list @@ -310,8 +281,13 @@ def main(): # step1. 解析公共配置 --- algorithm io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config) - # step2. 循环创建窗口 - common_param = read_config_lat_iodump(io_dic, config) + # step2. 解析公共配置 --- latency_xxx + common_param = read_config_latency(config) + + # step3. 解析公共配置 --- iodump + common_param['iodump'] = read_config_iodump(config) + + # step4. 
循环创建窗口
     io_data, io_avg_value = init_io_win(io_dic, config, common_param)
 
     main_loop(io_dic, io_data, io_avg_value)
 
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
index 40b3fcc..8d6f429 100644
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
@@ -14,7 +14,7 @@ import sys
 import time
 
 from .utils import is_abnormal, get_win_data, log_slow_win
-from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
+from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages, get_disk_type, Disk_Type
 from syssentry.result import ResultLevel, report_result
 from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR
@@ -51,7 +51,7 @@ def check_result_validation(res, reason):
     try:
         json_data = json.loads(res['message'])
     except json.JSONDecodeError:
-        err_msg = "Failed to {}: invalid return message".format(reason)
+        err_msg = f"Failed to {reason}: invalid return message"
         report_alarm_fail(err_msg)
 
     return json_data
@@ -60,7 +60,7 @@ def report_alarm_fail(alarm_info):
     """report result to xalarmd"""
     report_result(TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": alarm_info}))
-    logging.error(alarm_info)
+    logging.critical(alarm_info)
     sys.exit(1)
 
@@ -114,3 +114,16 @@ def process_report_data(disk_name, rw, io_data):
 
     log_slow_win(msg, "unknown")
     xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
+
+
+def get_disk_type_by_name(disk_name):
+    res = get_disk_type(disk_name)
+    disk_type_str = check_result_validation(res, f'Invalid disk type {disk_name}')
+    try:
+        curr_disk_type = int(disk_type_str)
+        if curr_disk_type not in Disk_Type:
+            raise ValueError
+    except ValueError:
+        report_alarm_fail(f"Failed to get disk type for {disk_name}")
+
+    return Disk_Type[curr_disk_type]
\ No newline at end of file
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
index 3b7f027..c381c07 100644
--- a/src/python/sentryPlugins/avg_block_io/utils.py
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
@@ -26,6 +26,49 @@ LogLevel = {
 }
 
 
+DEFAULT_PARAM = {
+    'latency_nvme_ssd': {
+        'read_avg_lim': 300,
+        'write_avg_lim': 300,
+        'read_avg_time': 3,
+        'write_avg_time': 3,
+        'read_tot_lim': 500,
+        'write_tot_lim': 500,
+    }, 'latency_sata_ssd' : {
+        'read_avg_lim': 10000,
+        'write_avg_lim': 10000,
+        'read_avg_time': 3,
+        'write_avg_time': 3,
+        'read_tot_lim': 50000,
+        'write_tot_lim': 50000,
+    }, 'latency_sata_hdd' : {
+        'read_avg_lim': 15000,
+        'write_avg_lim': 15000,
+        'read_avg_time': 3,
+        'write_avg_time': 3,
+        'read_tot_lim': 50000,
+        'write_tot_lim': 50000
+    }, 'iodump': {
+        'read_iodump_lim': 0,
+        'write_iodump_lim': 0
+    }
+}
+
+
+def get_section_value(section_name, config):
+    common_param = {}
+    config_sec = config[section_name]
+    for config_key in DEFAULT_PARAM[section_name]:
+        if config_key in config_sec:
+            if not config_sec[config_key].isdecimal():
+                report_alarm_fail(f"Invalid {section_name}.{config_key} config.")
+            common_param[config_key] = int(config_sec[config_key])
+        else:
+            logging.warning(f"Unset {section_name}.{config_key} in config file, use {DEFAULT_PARAM[section_name][config_key]} as default")
+            common_param[config_key] = DEFAULT_PARAM[section_name][config_key]
+    return common_param
+
+
 def get_log_level(filename):
     if not os.path.exists(filename):
         return logging.INFO
--
Gitee
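The lookup order used by init_io_win() above is worth spelling out: a stage-specific section ([STAGE_NAME_diskType]) overrides the per-disk-type default from [latency_xxx], which itself falls back to DEFAULT_PARAM. A compressed sketch of that resolution, with invented numbers:

# Sketch of the config resolution in init_io_win() (patch 35); values invented.
def resolve(curr_stage_param: dict, common_param: dict, disk_type: str, key: str):
    # a stage-specific value wins; otherwise fall back to the disk-type default
    return curr_stage_param.get(key, common_param.get(disk_type, {}).get(key))

common_param = {"sata_hdd": {"read_tot_lim": 50000}}
print(resolve({}, common_param, "sata_hdd", "read_tot_lim"))                       # 50000
print(resolve({"read_tot_lim": 20000}, common_param, "sata_hdd", "read_tot_lim"))  # 20000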
From d180f2993ec9e49af03f348b4c46710a292a642f Mon Sep 17 00:00:00 2001
From: jinsaihang
Date: Fri, 11 Oct 2024 15:35:43 +0800
Subject: [PATCH 36/76] add parameter time_range, alarm_id and alarm_clear_time
 validation

Signed-off-by: jinsaihang
---
 src/python/syssentry/alarm.py     | 19 +++++++++++++++++++
 src/python/syssentry/load_mods.py |  6 ++----
 src/python/syssentry/sentryctl    |  4 +++-
 3 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
index 7ba7e80..fd379d7 100644
--- a/src/python/syssentry/alarm.py
+++ b/src/python/syssentry/alarm.py
@@ -18,6 +18,7 @@ from datetime import datetime
 import time
 import logging
 import json
+import sys
 
 from xalarm.register_xalarm import xalarm_register,xalarm_getid,xalarm_getlevel,xalarm_gettype,xalarm_gettime,xalarm_getdesc
 from xalarm.xalarm_api import Xalarm
@@ -41,9 +42,15 @@ id_base = 1001
 clientId = -1
 
 MILLISECONDS_UNIT_SECONDS = 1000
+MAX_NUM_OF_ALARM_ID = 128
+MIN_ALARM_ID = 1001
+MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
 
 def update_alarm_list(alarm_info: Xalarm):
     alarm_id = xalarm_getid(alarm_info)
+    if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
+        logging.warning(f"Invalid alarm_id {alarm_id}")
+        return
     timestamp = xalarm_gettime(alarm_info)
     if not timestamp:
         logging.error("Retrieve timestamp failed")
@@ -77,7 +84,19 @@ def alarm_register():
         logging.info(f"alarm_register: {task_name} is registered")
         task = TasksMap.tasks_dict[task_type][task_name]
         alarm_id = task.alarm_id
+        if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
+            logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
+            continue
         alarm_clear_time = task.alarm_clear_time
+        try:
+            alarm_clear_time = int(alarm_clear_time)
+            if alarm_clear_time <= 0:
+                raise ValueError("Not a positive integer")
+            if alarm_clear_time > sys.maxsize:
+                raise ValueError("Exceeds maximum value for int")
+        except (ValueError, OverflowError, TypeError) as e:
+            logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
+            continue
         alarm_list_dict[alarm_id] = []
         task_alarm_id_dict[task_name] = alarm_id
         if alarm_id not in alarm_id_clear_time_dict:
diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py
index ae05e57..7daf17d 100644
--- a/src/python/syssentry/load_mods.py
+++ b/src/python/syssentry/load_mods.py
@@ -203,11 +203,9 @@ def parse_mod_conf(mod_name, mod_conf):
             if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID):
                 raise ValueError("Invalid alarm_id")
         except ValueError:
-            task.alarm_id = -1
-            logging.warning("Invalid alarm_id, set to -1")
+            logging.warning("Invalid alarm_id")
         except configparser.NoOptionError:
-            task.alarm_id = -1
-            logging.warning("Unset alarm_id and alarm_clear_time, use -1 and 15s as default")
+            logging.warning("Unset alarm_clear_time, use 15s as default")
 
     if CONF_ONSTART in mod_conf.options(CONF_TASK):
         is_onstart = (mod_conf.get(CONF_TASK, CONF_ONSTART) == 'yes')
diff --git a/src/python/syssentry/sentryctl b/src/python/syssentry/sentryctl
index 3de93d0..c2e3cef 100644
--- a/src/python/syssentry/sentryctl
+++ b/src/python/syssentry/sentryctl
@@ -136,7 +136,7 @@ if __name__ == '__main__':
     parser_get_result.add_argument('task_name')
     parser_get_alarm = subparsers.add_parser('get_alarm', help='get task alarm')
     parser_get_alarm.add_argument('task_name')
-    parser_get_alarm.add_argument('-s', '--time_range', type=str, default=DEFAULT_ALARM_TIME_RANGE, help='Specified time range')
+    parser_get_alarm.add_argument('-s', '--time_range', type=int, default=DEFAULT_ALARM_TIME_RANGE, help='Specified time range')
     parser_get_alarm.add_argument('-d', '--detailed', action='store_true', help='Print Detailed Information')
 
     parser_list = subparsers.add_parser('list', help='show all loaded task mod')
@@ -153,6 +153,8 @@ if __name__ == '__main__':
     elif client_args.cmd_type == 'get_result':
         req_msg_struct = {"type": "get_result", "data": client_args.task_name}
     elif client_args.cmd_type == 'get_alarm':
+        if not isinstance(client_args.time_range, int) or client_args.time_range <= 0:
+            print(f"time_range is not a positive integer: {client_args.time_range}")
         req_msg_struct = {
             "type": "get_alarm",
             "data": {
--
Gitee
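The checks patch 36 adds boil down to two small predicates, sketched standalone below with the constants visible in alarm.py; anything outside these ranges is logged and the task's alarm registration is skipped.

# Standalone sketch of the validation added in patch 36.
MIN_ALARM_ID = 1001
MAX_NUM_OF_ALARM_ID = 128
MAX_ALARM_ID = MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1  # 1128

def valid_alarm_id(alarm_id) -> bool:
    return isinstance(alarm_id, int) and MIN_ALARM_ID <= alarm_id <= MAX_ALARM_ID

def valid_clear_time(value) -> bool:
    try:
        return int(value) > 0
    except (TypeError, ValueError):
        return False

assert valid_alarm_id(1002) and not valid_alarm_id(2000)
assert valid_clear_time("15") and not valid_clear_time("-1")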
From 3359b91ee9622ef3714a427d107d709e187b8f21 Mon Sep 17 00:00:00 2001
From: caixiaomeng
Date: Fri, 11 Oct 2024 12:12:53 +0800
Subject: [PATCH 37/76] fix xalarm_Report function not refusing alarm msg that
 exceeds the maximum length

---
 src/libso/xalarm/register_xalarm.c   | 5 +++++
 src/python/xalarm/register_xalarm.py | 6 +++---
 src/python/xalarm/sentry_notify.py   | 4 ++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c
index 5aff2bc..fd7339a 100644
--- a/src/libso/xalarm/register_xalarm.c
+++ b/src/libso/xalarm/register_xalarm.c
@@ -339,6 +339,11 @@ int xalarm_Report(unsigned short usAlarmId, unsigned char ucAlarmLevel,
         return -1;
     }
 
+    if (pucParas == NULL || (int)strlen(pucParas) > MAX_PARAS_LEN) {
+        fprintf(stderr, "%s: alarm info invalid\n", __func__);
+        return -1;
+    }
+
     if (memset(&info, 0, sizeof(struct alarm_info)) == NULL) {
         fprintf(stderr, "%s: memset info failed, ret: %d\n", __func__, ret);
         return -1;
diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py
index edd9994..39623bd 100644
--- a/src/python/xalarm/register_xalarm.py
+++ b/src/python/xalarm/register_xalarm.py
@@ -45,7 +45,7 @@ class AlarmRegister:
             return False
 
         if self.socket is None:
-            sys.stderr.write("check_params: scoket create failed\n")
+            sys.stderr.write("check_params: socket create failed\n")
             return False
 
         return True
@@ -151,10 +151,10 @@ def xalarm_unregister(clientId: int) -> None:
 def xalarm_upgrade(clientId: int, id_filter: list) -> None:
     global ALARM_REGISTER_INFO
     if clientId < 0:
-        sys.stderr.write("xalarm_unregister: invalid client\n")
+        sys.stderr.write("xalarm_upgrade: invalid client\n")
         return
     if ALARM_REGISTER_INFO is None:
-        sys.stderr.write("xalarm_unregister: alarm has not registered\n")
+        sys.stderr.write("xalarm_upgrade: alarm has not registered\n")
         return
     ALARM_REGISTER_INFO.id_filter = id_filter
 
diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py
index c763a24..5838473 100644
--- a/src/python/xalarm/sentry_notify.py
+++ b/src/python/xalarm/sentry_notify.py
@@ -27,11 +27,11 @@ ALARM_SOCKET_PERMISSION = 0o700
 
 def check_params(alarm_id, alarm_level, alarm_type, puc_paras) -> bool:
     if not os.path.exists(DIR_XALARM):
-        sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed")
+        sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed\n")
         return False
 
     if not os.path.exists(PATH_REPORT_ALARM):
-        sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed")
+        sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed\n")
         return False
 
     if (alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID or
--
Gitee
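A note on the sizes enforced here and introduced in patch 31: ALARM_REPORT_LEN is the packed size of the '@HBBll' wire header plus the fixed message buffer. With 64-bit native alignment the header packs to 24 bytes (2+1+1, four bytes of padding, then two 8-byte longs), hence 24 + 8192 = 8216, just as the old value 1048 was 24 + 1024. The check below assumes a 64-bit Linux build, where struct's native 'l' is 8 bytes.

# Why ALARM_REPORT_LEN == 8216 (assumes 64-bit native alignment).
import struct

MAX_MSG_LEN = 8192
print(struct.calcsize(f"@HBBll{MAX_MSG_LEN}s"))  # 8216 on 64-bit Linux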
From 01e867b5627094c221d2cc6c2976915ce8b223a5 Mon Sep 17 00:00:00 2001
From: caixiaomeng
Date: Fri, 11 Oct 2024 17:54:04 +0800
Subject: [PATCH 38/76] fix xalarm_upgrade not returning a value and failing
 when the alarm thread has stopped

---
 src/libso/xalarm/register_xalarm.c   | 11 ++++++++++-
 src/python/xalarm/register_xalarm.py | 10 +++++++---
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c
index fd7339a..fe15dc8 100644
--- a/src/libso/xalarm/register_xalarm.c
+++ b/src/libso/xalarm/register_xalarm.c
@@ -156,7 +156,11 @@ static void *alarm_recv(void *arg)
                 continue;
             }
             printf("recv error len:%d errno:%d\n", recvlen, errno);
-        }
+        } else if (recvlen == 0) {
+            printf("connection closed by xalarmd, maybe connections reach max num or service stopped.\n");
+            g_register_info.thread_should_stop = 1;
+            break;
+        }
     }
     return NULL;
 }
@@ -211,6 +215,11 @@ bool xalarm_Upgrade(struct alarm_subscription_info id_filter, int client_id)
         printf("%s: invalid args\n", __func__);
         return false;
     }
+
+    if (g_register_info.thread_should_stop) {
+        printf("%s: upgrade failed, alarm thread has stopped\n", __func__);
+        return false;
+    }
 
     set_alarm_id(id_filter);
     return true;
diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py
index 39623bd..2a6dabf 100644
--- a/src/python/xalarm/register_xalarm.py
+++ b/src/python/xalarm/register_xalarm.py
@@ -148,15 +148,19 @@ def xalarm_unregister(clientId: int) -> None:
         ALARM_REGISTER_INFO = None
 
 
-def xalarm_upgrade(clientId: int, id_filter: list) -> None:
+def xalarm_upgrade(id_filter: list, clientId: int) -> bool:
     global ALARM_REGISTER_INFO
     if clientId < 0:
         sys.stderr.write("xalarm_upgrade: invalid client\n")
-        return
+        return False
     if ALARM_REGISTER_INFO is None:
         sys.stderr.write("xalarm_upgrade: alarm has not registered\n")
-        return
+        return False
+    if ALARM_REGISTER_INFO.thread_should_stop:
+        sys.stderr.write("xalarm_upgrade: upgrade failed, alarm thread has stopped\n")
+        return False
     ALARM_REGISTER_INFO.id_filter = id_filter
+    return True
 
 
 def xalarm_getid(alarm_info: Xalarm) -> int:
--
Gitee

From c2c219c668e2092ac6e48e589c9f8434b8c12fd0 Mon Sep 17 00:00:00 2001
From: caixiaomeng
Date: Fri, 11 Oct 2024 17:59:54 +0800
Subject: [PATCH 39/76] add log for xalarm when sending msg and clean invalid
 client socket

---
 src/python/xalarm/xalarm_transfer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py
index 42137d8..90dccbc 100644
--- a/src/python/xalarm/xalarm_transfer.py
+++ b/src/python/xalarm/xalarm_transfer.py
@@ -117,4 +117,5 @@ def transmit_alarm(server_sock, epoll, fd_to_socket, bin_data):
                 epoll.unregister(fileno)
                 fd_to_socket[fileno].close()
                 del fd_to_socket[fileno]
+                logging.info(f"cleaned up connection {fileno} for client lost connection.")
--
Gitee
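Both the C client above (recvlen == 0 in patch 38) and the server-side cleanup rely on the same stream-socket convention: a read that returns zero bytes means the peer has closed. A minimal Python rendering of that liveness test follows; the MSG_PEEK variant here is an assumption for illustration, not a verbatim copy of cleanup_closed_connections().

# Sketch of the "peer closed?" test behind patches 38-40 (assumed, not verbatim).
import socket

def peer_alive(sock: socket.socket) -> bool:
    try:
        # peek so a real consumer still sees the data later
        data = sock.recv(32, socket.MSG_DONTWAIT | socket.MSG_PEEK)
        return data != b""       # b"" -> orderly shutdown by the peer
    except BlockingIOError:
        return True              # nothing pending, connection still up
    except OSError:
        return False             # reset or otherwise unusable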
From ff23301c53e2de60943ae52848d5885eb8425e17 Mon Sep 17 00:00:00 2001
From: caixiaomeng
Date: Fri, 11 Oct 2024 18:12:21 +0800
Subject: [PATCH 40/76] add xalarm cleanup invalid server socket periodically

---
 src/python/xalarm/xalarm_server.py   | 20 +++++++++++++++-----
 src/python/xalarm/xalarm_transfer.py |  8 ++++++++
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/src/python/xalarm/xalarm_server.py b/src/python/xalarm/xalarm_server.py
index 2882609..f90a0e2 100644
--- a/src/python/xalarm/xalarm_server.py
+++ b/src/python/xalarm/xalarm_server.py
@@ -22,7 +22,12 @@ import threading
 from struct import error as StructParseError
 
 from .xalarm_api import alarm_bin2stu
-from .xalarm_transfer import check_filter, transmit_alarm, wait_for_connection
+from .xalarm_transfer import (
+    check_filter,
+    transmit_alarm,
+    wait_for_connection,
+    period_task_to_cleanup_connections
+)
 
 ALARM_DIR = "/var/run/xalarm"
@@ -66,9 +71,13 @@ def server_loop(alarm_config):
     fd_to_socket = {alarm_sock.fileno(): alarm_sock,}
 
     thread_should_stop = False
-    thread = threading.Thread(target=wait_for_connection, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop))
-    thread.daemon = True
-    thread.start()
+    conn_thread = threading.Thread(target=wait_for_connection, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop))
+    conn_thread.daemon = True
+    conn_thread.start()
+
+    cleanup_thread = threading.Thread(target=period_task_to_cleanup_connections, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop))
+    cleanup_thread.daemon = True
+    cleanup_thread.start()
 
     while True:
         try:
@@ -88,7 +97,8 @@ def server_loop(alarm_config):
             logging.error(f"Error server:{e}")
 
     thread_should_stop = True
-    thread.join()
+    conn_thread.join()
+    cleanup_thread.join()
 
     epoll.unregister(alarm_sock.fileno())
     epoll.close()
diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py
index 90dccbc..75807e0 100644
--- a/src/python/xalarm/xalarm_transfer.py
+++ b/src/python/xalarm/xalarm_transfer.py
@@ -17,11 +17,13 @@ Create: 2023-11-02
 import socket
 import logging
 import select
+from time import sleep
 
 MIN_ID_NUMBER = 1001
 MAX_ID_NUMBER = 1128
 MAX_CONNECTION_NUM = 100
 TEST_CONNECT_BUFFER_SIZE = 32
+PERIOD_SCAN_TIME = 60
 
 def check_filter(alarm_info, alarm_filter):
@@ -66,6 +68,12 @@ def cleanup_closed_connections(server_sock, epoll, fd_to_socket):
             logging.info(f"cleaned up connection {fileno} for client lost connection.")
 
 
+def period_task_to_cleanup_connections(server_sock, epoll, fd_to_socket, thread_should_stop):
+    while not thread_should_stop:
+        sleep(PERIOD_SCAN_TIME)
+        cleanup_closed_connections(server_sock, epoll, fd_to_socket)
+
+
 def wait_for_connection(server_sock, epoll, fd_to_socket, thread_should_stop):
     """
     thread function for catch and save client connection
--
Gitee
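One caveat on the new cleanup thread: server_loop passes a plain bool, which is bound by value when the thread starts, so reassigning the local variable later will not stop the loop. A threading.Event, as sketched below, is the usual pattern for a periodic task that also wakes promptly on shutdown; this is an alternative sketch, not what the patch ships.

# Alternative sketch of a stoppable periodic cleanup using threading.Event.
import threading

PERIOD_SCAN_TIME = 60

def periodic_cleanup(stop_event: threading.Event, cleanup):
    # wait() sleeps one period but returns True immediately once the event is set
    while not stop_event.wait(PERIOD_SCAN_TIME):
        cleanup()

stop = threading.Event()
worker = threading.Thread(
    target=periodic_cleanup, args=(stop, lambda: print("scan")), daemon=True)
worker.start()
# ... on shutdown:
stop.set()
worker.join()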
From bb99ee6766d55f23427581a2a0a3cdfdbfc00c2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Fri, 11 Oct 2024 21:50:32 +0800
Subject: [PATCH 41/76] ai_block_io support stage and iotype

---
 config/plugins/ai_block_io.ini                |   7 +-
 .../sentryPlugins/ai_block_io/ai_block_io.py  | 126 +++--
 .../ai_block_io/config_parser.py              | 471 +++++++++++++-----
 .../sentryPlugins/ai_block_io/data_access.py  |  11 +-
 .../sentryPlugins/ai_block_io/detector.py     |  25 +
 src/python/sentryPlugins/ai_block_io/utils.py |   3 +-
 6 files changed, 453 insertions(+), 190 deletions(-)

diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini
index 01ce266..a814d52 100644
--- a/config/plugins/ai_block_io.ini
+++ b/config/plugins/ai_block_io.ini
@@ -1,7 +1,12 @@
+[log]
+level=info
+
 [common]
 absolute_threshold=40
 slow_io_detect_frequency=1
-log_level=info
+disk=default
+stage=bio
+iotype=read,write
 
 [algorithm]
 train_data_duration=24
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index 77104a9..e1052ec 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -13,7 +13,7 @@ import time
 import signal
 import logging
 
-from .detector import Detector
+from .detector import Detector, DiskDetector
 from .threshold import ThresholdFactory, AbsoluteThreshold
 from .sliding_window import SlidingWindowFactory
 from .utils import get_data_queue_size_and_update_size
@@ -34,8 +34,8 @@ def sig_handler(signum, frame):
 class SlowIODetection:
     _config_parser = None
     _disk_list = None
-    _detector_name_list = []
-    _detectors = {}
+    _detector_name_list = {}
+    _disk_detectors = {}
 
     def __init__(self, config_parser: ConfigParser):
         self._config_parser = config_parser
@@ -43,85 +43,101 @@ class SlowIODetection:
         self.__init_detector()
 
     def __init_detector_name_list(self):
-        self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
+        self._disk_list = check_collect_valid(self._config_parser.slow_io_detect_frequency)
         if self._disk_list is None:
             Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...")
             exit(1)
 
         logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
-        disks_to_detection: list = self._config_parser.get_disks_to_detection()
+        disks: list = self._config_parser.disks_to_detection
+        stages: list = self._config_parser.stage
+        iotypes: list = self._config_parser.iotype
         # 情况1:None,则启用所有磁盘检测
         # 情况2:is not None and len = 0,则不启动任何磁盘检测
         # 情况3:len != 0,则取交集
-        if disks_to_detection is None:
+        if disks is None:
             logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.")
             for disk in self._disk_list:
-                self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
-                self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
-        elif len(disks_to_detection) == 0:
-            logging.warning('please attention: conf file not specify any disk to detection, so it will not start ai block io.')
+                for stage in stages:
+                    for iotype in iotypes:
+                        if disk not in self._detector_name_list:
+                            self._detector_name_list[disk] = []
+                        self._detector_name_list[disk].append(MetricName(disk, stage, iotype, "latency"))
         else:
-            for disk_to_detection in disks_to_detection:
-                if disk_to_detection in self._disk_list:
-                    self._detector_name_list.append(MetricName(disk_to_detection, "bio", "read", "latency"))
-                    self._detector_name_list.append(MetricName(disk_to_detection, "bio", "write", "latency"))
+            for disk in disks:
+                if disk in self._disk_list:
+                    for stage in stages:
+                        for iotype in iotypes:
+                            if disk not in self._detector_name_list:
+                                self._detector_name_list[disk] = []
+                            self._detector_name_list[disk].append(MetricName(disk, stage, iotype, "latency"))
                 else:
-                    logging.warning(f"disk:[{disk_to_detection}] not in available disk list, so it will be ignored.")
-        logging.info(f'start to detection follow disk and it\'s metric: {self._detector_name_list}')
+                    logging.warning("disk: [%s] not in available disk list, so it will be ignored.", disk)
+        if len(self._detector_name_list) == 0:
+            logging.critical("the disks to detection is empty, ai_block_io will exit.")
+            Report.report_pass("the disks to detection is empty, ai_block_io will exit.")
+            exit(1)
 
     def __init_detector(self):
-        train_data_duration, train_update_duration = (self._config_parser.
- get_train_data_duration_and_train_update_duration()) - slow_io_detection_frequency = self._config_parser.get_slow_io_detect_frequency() - threshold_type = self._config_parser.get_algorithm_type() - data_queue_size, update_size = get_data_queue_size_and_update_size(train_data_duration, - train_update_duration, - slow_io_detection_frequency) - sliding_window_type = self._config_parser.get_sliding_window_type() - window_size, window_threshold = self._config_parser.get_window_size_and_window_minimum_threshold() - - for detector_name in self._detector_name_list: - threshold = ThresholdFactory().get_threshold(threshold_type, - boxplot_parameter=self._config_parser.get_boxplot_parameter(), - n_sigma_paramter=self._config_parser.get_n_sigma_parameter(), - data_queue_size=data_queue_size, - data_queue_update_size=update_size) - sliding_window = SlidingWindowFactory().get_sliding_window(sliding_window_type, queue_length=window_size, - threshold=window_threshold) - detector = Detector(detector_name, threshold, sliding_window) - # 绝对阈值的阈值初始化 - if isinstance(threshold, AbsoluteThreshold): - threshold.set_threshold(self._config_parser.get_absolute_threshold()) - self._detectors[detector_name] = detector - logging.info(f"add detector: {detector}") + train_data_duration, train_update_duration = ( + self._config_parser.get_train_data_duration_and_train_update_duration() + ) + slow_io_detection_frequency = self._config_parser.slow_io_detect_frequency + threshold_type = self._config_parser.algorithm_type + data_queue_size, update_size = get_data_queue_size_and_update_size( + train_data_duration, train_update_duration, slow_io_detection_frequency + ) + sliding_window_type = self._config_parser.sliding_window_type + window_size, window_threshold = (self._config_parser.get_window_size_and_window_minimum_threshold()) + + for disk, metric_name_list in self._detector_name_list.items(): + threshold = ThresholdFactory().get_threshold( + threshold_type, + boxplot_parameter=self._config_parser.boxplot_parameter, + n_sigma_paramter=self._config_parser.n_sigma_parameter, + data_queue_size=data_queue_size, + data_queue_update_size=update_size, + ) + sliding_window = SlidingWindowFactory().get_sliding_window( + sliding_window_type, + queue_length=window_size, + threshold=window_threshold, + ) + disk_detector = DiskDetector(disk) + for metric_name in metric_name_list: + detector = Detector(metric_name, threshold, sliding_window) + disk_detector.add_detector(detector) + logging.info(f'disk: [{disk}] add detector:\n [{disk_detector}]') + self._disk_detectors[disk] = disk_detector def launch(self): while True: - logging.debug('step0. AI threshold slow io event detection is looping.') + logging.debug("step0. AI threshold slow io event detection is looping.") # Step1:获取IO数据 io_data_dict_with_disk_name = get_io_data_from_collect_plug( - self._config_parser.get_slow_io_detect_frequency(), self._disk_list + self._config_parser.slow_io_detect_frequency, self._disk_list ) - logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}') + logging.debug(f"step1. Get io data: {str(io_data_dict_with_disk_name)}") if io_data_dict_with_disk_name is None: - Report.report_pass("get io data error, please check if the collector plug is enable. exitting...") + Report.report_pass( + "get io data error, please check if the collector plug is enable. exitting..." + ) exit(1) # Step2:慢IO检测 - logging.debug('step2. Start to detection slow io event.') + logging.debug("step2. 
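
get_data_queue_size_and_update_size turns the training durations and the detection frequency into sample counts; the helper itself is not part of this patch, so the following is only a plausible reconstruction, assuming durations are in hours and the frequency is seconds per sample:

def get_data_queue_size_and_update_size(train_data_duration,
                                        train_update_duration,
                                        detect_frequency):
    # assumed semantics: hours * 3600 / (seconds per sample) = sample count
    data_queue_size = int(train_data_duration * 3600 / detect_frequency)
    update_size = int(train_update_duration * 3600 / detect_frequency)
    return data_queue_size, update_size

# e.g. 24 h of history sampled once per second -> a queue of 86400 points
print(get_data_queue_size_and_update_size(24, 2, 1))   # (86400, 7200)
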
Start to detection slow io event.") slow_io_event_list = [] - for metric_name, detector in self._detectors.items(): - result = detector.is_slow_io_event(io_data_dict_with_disk_name) + for disk, disk_detector in self._disk_detectors.items(): + result = disk_detector.is_slow_io_event(io_data_dict_with_disk_name) if result[0]: - slow_io_event_list.append((detector.get_metric_name(), result)) - logging.debug('step2. End to detection slow io event.') + slow_io_event_list.append(result) + logging.debug("step2. End to detection slow io event.") # Step3:慢IO事件上报 - logging.debug('step3. Report slow io event to sysSentry.') + logging.debug("step3. Report slow io event to sysSentry.") for slow_io_event in slow_io_event_list: - metric_name: MetricName = slow_io_event[0] - result = slow_io_event[1] + metric_name: MetricName = slow_io_event[1] alarm_content = { "driver_name": f"{metric_name.get_disk_name()}", "reason": "disk_slow", @@ -129,14 +145,14 @@ class SlowIODetection: "io_type": f"{metric_name.get_io_access_type_name()}", "alarm_source": "ai_block_io", "alarm_type": "latency", - "details": f"current window is: {result[1]}, threshold is: {result[2]}.", + "details": f"current window is: {slow_io_event[2]}, threshold is: {slow_io_event[3]}.", } Xalarm.major(alarm_content) logging.warning(alarm_content) # Step4:等待检测时间 - logging.debug('step4. Wait to start next slow io event detection loop.') - time.sleep(self._config_parser.get_slow_io_detect_frequency()) + logging.debug("step4. Wait to start next slow io event detection loop.") + time.sleep(self._config_parser.slow_io_detect_frequency) def main(): diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py index 354c122..a357766 100644 --- a/src/python/sentryPlugins/ai_block_io/config_parser.py +++ b/src/python/sentryPlugins/ai_block_io/config_parser.py @@ -9,44 +9,60 @@ # PURPOSE. # See the Mulan PSL v2 for more details. +import os import configparser import logging +from .alarm_report import Report from .threshold import ThresholdType from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" +ALL_STAGE_LIST = ['throtl', 'wbt', 'gettag', 'plug', 'deadline', 'hctx', 'requeue', 'rq_driver', 'bio'] +ALL_IOTPYE_LIST = ['read', 'write'] + def init_log_format(log_level: str): logging.basicConfig(level=get_log_level(log_level.lower()), format=LOG_FORMAT) - if log_level.lower() not in ('info', 'warning', 'error', 'debug'): - logging.warning(f'the log_level: {log_level} you set is invalid, use default value: info.') + if log_level.lower() not in ("info", "warning", "error", "debug"): + logging.warning( + f"the log_level: {log_level} you set is invalid, use default value: info." 
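
When a window trips, step 3 of the loop above hands Xalarm.major a flat dict. Abridged to the keys visible in these hunks, a reported event looks like this (values illustrative):

alarm_content = {
    "driver_name": "sda",
    "reason": "disk_slow",
    "io_type": "read",
    "alarm_source": "ai_block_io",
    "alarm_type": "latency",
    "details": "current window is: [45.2, 51.7, ...], threshold is: 40.0.",
}
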
+ ) class ConfigParser: DEFAULT_ABSOLUTE_THRESHOLD = 40 DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1 - DEFAULT_LOG_LEVEL = 'info' + DEFAULT_LOG_LEVEL = "info" + + DEFAULT_STAGE = 'throtl,wbt,gettag,plug,deadline,hctx,requeue,rq_driver,bio' + DEFAULT_IOTYPE = 'read,write' - DEFAULT_ALGORITHM_TYPE = 'boxplot' + DEFAULT_ALGORITHM_TYPE = "boxplot" DEFAULT_TRAIN_DATA_DURATION = 24 DEFAULT_TRAIN_UPDATE_DURATION = 2 DEFAULT_BOXPLOT_PARAMETER = 1.5 DEFAULT_N_SIGMA_PARAMETER = 3 - DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous' + DEFAULT_SLIDING_WINDOW_TYPE = "not_continuous" DEFAULT_WINDOW_SIZE = 30 DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6 def __init__(self, config_file_name): self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD - self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY + self.__slow_io_detect_frequency = ( + ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY + ) self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL self.__disks_to_detection = None + self.__stage = ConfigParser.DEFAULT_STAGE + self.__iotype = ConfigParser.DEFAULT_IOTYPE - self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE + self.__algorithm_type = get_threshold_type_enum( + ConfigParser.DEFAULT_ALGORITHM_TYPE + ) self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER @@ -58,199 +74,398 @@ class ConfigParser: self.__config_file_name = config_file_name - def __read_absolute_threshold(self, items_common: dict): + def _get_config_value( + self, + config_items: dict, + key: str, + value_type, + default_value=None, + gt=None, + ge=None, + lt=None, + le=None, + ): + value = config_items.get(key) + if value is None: + logging.warning( + "config of %s not found, the default value %s will be used.", + key, + default_value, + ) + value = default_value + if not value: + logging.critical( + "the value of %s is empty, ai_block_io plug will exit.", key + ) + Report.report_pass( + f"the value of {key} is empty, ai_block_io plug will exit." + ) + exit(1) try: - self.__absolute_threshold = float(items_common.get('absolute_threshold', - ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD)) - if self.__absolute_threshold <= 0: - logging.warning( - f'the_absolute_threshold: {self.__absolute_threshold} you set is invalid, use default value: {ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD}.') - self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD + value = value_type(value) except ValueError: - self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD - logging.warning( - f'the_absolute_threshold type conversion has error, use default value: {self.__absolute_threshold}.') + logging.critical( + "the value of %s is not a valid %s, ai_block_io plug will exit.", + key, + value_type, + ) + Report.report_pass( + f"the value of {key} is not a valid {value_type}, ai_block_io plug will exit." + ) + exit(1) + if gt is not None and value <= gt: + logging.critical( + "the value of %s is not greater than %s, ai_block_io plug will exit.", + key, + gt, + ) + Report.report_pass( + f"the value of {key} is not greater than {gt}, ai_block_io plug will exit." + ) + exit(1) + if ge is not None and value < ge: + logging.critical( + "the value of %s is not greater than or equal to %s, ai_block_io plug will exit.", + key, + ge, + ) + Report.report_pass( + f"the value of {key} is not greater than or equal to {ge}, ai_block_io plug will exit." 
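
_get_config_value centralizes the parse-validate-or-exit pattern that the old per-option readers duplicated. Its semantics, condensed into a standalone sketch (the real method logs critical, calls Report.report_pass, and exit(1) instead of raising):

def get_config_value(items, key, value_type, default=None,
                     gt=None, ge=None, lt=None, le=None):
    raw = items.get(key, default)          # missing key falls back to default
    if raw is None or raw == "":
        raise SystemExit(f"{key} is empty")
    value = value_type(raw)                # ValueError -> report and exit
    if gt is not None and value <= gt:
        raise SystemExit(f"{key} must be > {gt}")
    if ge is not None and value < ge:
        raise SystemExit(f"{key} must be >= {ge}")
    if lt is not None and value >= lt:
        raise SystemExit(f"{key} must be < {lt}")
    if le is not None and value > le:
        raise SystemExit(f"{key} must be <= {le}")
    return value

print(get_config_value({"window_size": "30"}, "window_size", int,
                       gt=0, le=3600))     # 30
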
+ ) + exit(1) + if lt is not None and value >= lt: + logging.critical( + "the value of %s is not less than %s, ai_block_io plug will exit.", + key, + lt, + ) + Report.report_pass( + f"the value of {key} is not less than {lt}, ai_block_io plug will exit." + ) + exit(1) + if le is not None and value > le: + logging.critical( + "the value of %s is not less than or equal to %s, ai_block_io plug will exit.", + key, + le, + ) + Report.report_pass( + f"the value of {key} is not less than or equal to {le}, ai_block_io plug will exit." + ) + exit(1) + + return value + + def __read_absolute_threshold(self, items_common: dict): + self.__absolute_threshold = self._get_config_value( + items_common, + "absolute_threshold", + float, + ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD, + gt=0, + ) def __read__slow_io_detect_frequency(self, items_common: dict): - try: - self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency', - ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY)) - if self.__slow_io_detect_frequency < 1 or self.__slow_io_detect_frequency > 10: - logging.warning( - f'the slow_io_detect_frequency: {self.__slow_io_detect_frequency} you set is invalid, use default value: {ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY}.') - self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY - except ValueError: - self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY - logging.warning(f'slow_io_detect_frequency type conversion has error, use default value: {self.__slow_io_detect_frequency}.') + self.__slow_io_detect_frequency = self._get_config_value( + items_common, + "slow_io_detect_frequency", + int, + ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY, + gt=0, + le=300, + ) def __read__disks_to_detect(self, items_common: dict): - disks_to_detection = items_common.get('disk') + disks_to_detection = items_common.get("disk") if disks_to_detection is None: - logging.warning(f'config of disk not found, the default value will be used.') + logging.warning("config of disk not found, the default value will be used.") self.__disks_to_detection = None return - disk_list = disks_to_detection.split(',') - if len(disk_list) == 0 or (len(disk_list) == 1 and disk_list[0] == ''): - logging.warning("you don't specify any disk.") - self.__disks_to_detection = [] - return - if len(disk_list) == 1 and disk_list[0] == 'default': + disks_to_detection = disks_to_detection.strip() + if not disks_to_detection: + logging.critical("the value of disk is empty, ai_block_io plug will exit.") + Report.report_pass( + "the value of disk is empty, ai_block_io plug will exit." 
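
__read__disks_to_detect, completed just below, distinguishes four shapes of the disk option; in short:

# disk option parsing, per __read__disks_to_detect:
#   key absent      -> None, detect every available disk
#   disk=default    -> None, same as absent
#   disk=sda,sdb    -> ["sda", "sdb"], intersected with available disks later
#   disk= (empty)   -> critical log, Report.report_pass, exit(1)
raw = "sda,sdb".strip()
disks = None if raw == "default" else raw.split(",")
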
+ ) + exit(1) + disk_list = disks_to_detection.split(",") + if len(disk_list) == 1 and disk_list[0] == "default": self.__disks_to_detection = None return self.__disks_to_detection = disk_list def __read__train_data_duration(self, items_algorithm: dict): - try: - self.__train_data_duration = float(items_algorithm.get('train_data_duration', - ConfigParser.DEFAULT_TRAIN_DATA_DURATION)) - if self.__train_data_duration <= 0 or self.__train_data_duration > 720: - logging.warning( - f'the train_data_duration: {self.__train_data_duration} you set is invalid, use default value: {ConfigParser.DEFAULT_TRAIN_DATA_DURATION}.') - self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION - except ValueError: - self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION - logging.warning(f'the train_data_duration type conversion has error, use default value: {self.__train_data_duration}.') + self.__train_data_duration = self._get_config_value( + items_algorithm, + "train_data_duration", + float, + ConfigParser.DEFAULT_TRAIN_DATA_DURATION, + gt=0, + le=720, + ) def __read__train_update_duration(self, items_algorithm: dict): default_train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION if default_train_update_duration > self.__train_data_duration: default_train_update_duration = self.__train_data_duration / 2 - - try: - self.__train_update_duration = float(items_algorithm.get('train_update_duration', - ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION)) - if self.__train_update_duration <= 0 or self.__train_update_duration > self.__train_data_duration: - logging.warning( - f'the train_update_duration: {self.__train_update_duration} you set is invalid, use default value: {default_train_update_duration}.') - self.__train_update_duration = default_train_update_duration - except ValueError: - self.__train_update_duration = default_train_update_duration - logging.warning(f'the train_update_duration type conversion has error, use default value: {self.__train_update_duration}.') + self.__train_update_duration = self._get_config_value( + items_algorithm, + "train_update_duration", + float, + default_train_update_duration, + gt=0, + le=self.__train_data_duration, + ) def __read__algorithm_type_and_parameter(self, items_algorithm: dict): - algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE) + algorithm_type = items_algorithm.get( + "algorithm_type", ConfigParser.DEFAULT_ALGORITHM_TYPE + ) self.__algorithm_type = get_threshold_type_enum(algorithm_type) + if self.__algorithm_type is None: + logging.critical( + "the algorithm_type: %s you set is invalid. ai_block_io plug will exit.", + algorithm_type, + ) + Report.report_pass( + f"the algorithm_type: {algorithm_type} you set is invalid. ai_block_io plug will exit." 
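
The update-duration default above is clamped before validation: if the stock default (2 h) exceeds the configured train_data_duration, half the data window is used instead, and the value is then checked with gt=0, le=train_data_duration. The same halving fallback is applied further down to window_minimum_threshold against window_size. In isolation:

DEFAULT_TRAIN_UPDATE_DURATION = 2        # hours
train_data_duration = 1.0                # hours, already read from config

default_update = DEFAULT_TRAIN_UPDATE_DURATION
if default_update > train_data_duration:
    default_update = train_data_duration / 2   # 0.5 h fallback
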
+ ) + exit(1) if self.__algorithm_type == ThresholdType.NSigmaThreshold: - try: - self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter', - ConfigParser.DEFAULT_N_SIGMA_PARAMETER)) - if self.__n_sigma_parameter <= 0 or self.__n_sigma_parameter > 10: - logging.warning( - f'the n_sigma_parameter: {self.__n_sigma_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_N_SIGMA_PARAMETER}.') - self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER - except ValueError: - self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER - logging.warning(f'the n_sigma_parameter type conversion has error, use default value: {self.__n_sigma_parameter}.') + self.__n_sigma_parameter = self._get_config_value( + items_algorithm, + "n_sigma_parameter", + float, + ConfigParser.DEFAULT_N_SIGMA_PARAMETER, + gt=0, + le=10, + ) elif self.__algorithm_type == ThresholdType.BoxplotThreshold: - try: - self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter', - ConfigParser.DEFAULT_BOXPLOT_PARAMETER)) - if self.__boxplot_parameter <= 0 or self.__boxplot_parameter > 10: - logging.warning( - f'the boxplot_parameter: {self.__boxplot_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_BOXPLOT_PARAMETER}.') - self.__n_sigma_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER - except ValueError: - self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER - logging.warning(f'the boxplot_parameter type conversion has error, use default value: {self.__boxplot_parameter}.') + self.__boxplot_parameter = self._get_config_value( + items_algorithm, + "boxplot_parameter", + float, + ConfigParser.DEFAULT_BOXPLOT_PARAMETER, + gt=0, + le=10, + ) + + def __read__stage(self, items_algorithm: dict): + stage_str = items_algorithm.get('stage', ConfigParser.DEFAULT_STAGE) + stage_list = stage_str.split(',') + if len(stage_list) == 1 and stage_list[0] == '': + logging.critical('stage value not allow is empty, exiting...') + exit(1) + if len(stage_list) == 1 and stage_list[0] == 'default': + logging.warning(f'stage will enable default value: {ConfigParser.DEFAULT_STAGE}') + self.__stage = ALL_STAGE_LIST + return + for stage in stage_list: + if stage not in ALL_STAGE_LIST: + logging.critical(f'stage: {stage} is not valid stage, ai_block_io will exit...') + exit(1) + dup_stage_list = set(stage_list) + if 'bio' not in dup_stage_list: + logging.critical('stage must contains bio stage, exiting...') + exit(1) + self.__stage = dup_stage_list + + def __read__iotype(self, items_algorithm: dict): + iotype_str = items_algorithm.get('iotype', ConfigParser.DEFAULT_IOTYPE) + iotype_list = iotype_str.split(',') + if len(iotype_list) == 1 and iotype_list[0] == '': + logging.critical('iotype value not allow is empty, exiting...') + exit(1) + if len(iotype_list) == 1 and iotype_list[0] == 'default': + logging.warning(f'iotype will enable default value: {ConfigParser.DEFAULT_IOTYPE}') + self.__iotype = ALL_IOTPYE_LIST + return + for iotype in iotype_list: + if iotype not in ALL_IOTPYE_LIST: + logging.critical(f'iotype: {iotype} is not valid iotype, ai_block_io will exit...') + exit(1) + dup_iotype_list = set(iotype_list) + self.__iotype = dup_iotype_list def __read__window_size(self, items_sliding_window: dict): - try: - self.__window_size = int(items_sliding_window.get('window_size', - ConfigParser.DEFAULT_WINDOW_SIZE)) - if self.__window_size < 1 or self.__window_size > 3600: - logging.warning( - f'the window_size: {self.__window_size} you set is invalid, use default 
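
Stage parsing above normalizes the list through a set and insists on bio, since DiskDetector (added later in this patch) only reports a slow-IO event when the bio stage is abnormal. A condensed model:

ALL_STAGE_LIST = ['throtl', 'wbt', 'gettag', 'plug', 'deadline',
                  'hctx', 'requeue', 'rq_driver', 'bio']

stage_list = "bio,wbt,bio".split(',')
if any(s not in ALL_STAGE_LIST for s in stage_list):
    raise SystemExit("invalid stage")   # the plugin logs critical and exits
stages = set(stage_list)                # duplicates collapse: {'bio', 'wbt'}
if 'bio' not in stages:
    raise SystemExit("stage must contain bio")
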
value: {ConfigParser.DEFAULT_WINDOW_SIZE}.') - self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE - except ValueError: - self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE - logging.warning(f'window_size type conversion has error, use default value: {self.__window_size}.') + self.__window_size = self._get_config_value( + items_sliding_window, + "window_size", + int, + ConfigParser.DEFAULT_WINDOW_SIZE, + gt=0, + le=3600, + ) def __read__window_minimum_threshold(self, items_sliding_window: dict): default_window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD if default_window_minimum_threshold > self.__window_size: default_window_minimum_threshold = self.__window_size / 2 - try: - self.__window_minimum_threshold = ( - int(items_sliding_window.get('window_minimum_threshold', - ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD))) - if self.__window_minimum_threshold < 1 or self.__window_minimum_threshold > self.__window_size: - logging.warning( - f'the window_minimum_threshold: {self.__window_minimum_threshold} you set is invalid, use default value: {default_window_minimum_threshold}.') - self.__window_minimum_threshold = default_window_minimum_threshold - except ValueError: - self.__window_minimum_threshold = default_window_minimum_threshold - logging.warning(f'window_minimum_threshold type conversion has error, use default value: {self.__window_minimum_threshold}.') + self.__window_minimum_threshold = self._get_config_value( + items_sliding_window, + "window_minimum_threshold", + int, + default_window_minimum_threshold, + gt=0, + le=self.__window_size, + ) def read_config_from_file(self): + if not os.path.exists(self.__config_file_name): + init_log_format(self.__log_level) + logging.critical( + "config file %s not found, ai_block_io plug will exit.", + self.__config_file_name, + ) + Report.report_pass( + f"config file {self.__config_file_name} not found, ai_block_io plug will exit." + ) + exit(1) + con = configparser.ConfigParser() try: - con.read(self.__config_file_name, encoding='utf-8') + con.read(self.__config_file_name, encoding="utf-8") except configparser.Error as e: init_log_format(self.__log_level) - logging.critical(f'config file read error: {e}, ai_block_io plug will exit.') + logging.critical( + f"config file read error: %s, ai_block_io plug will exit.", e + ) + Report.report_pass( + f"config file read error: {e}, ai_block_io plug will exit." + ) exit(1) - if con.has_section('common'): - items_common = dict(con.items('common')) - self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL) + if con.has_section('log'): + items_log = dict(con.items('log')) + # 情况一:没有log,则使用默认值 + # 情况二:有log,值为空或异常,使用默认值 + # 情况三:有log,值正常,则使用该值 + self.__log_level = items_log.get('level', ConfigParser.DEFAULT_LOG_LEVEL) init_log_format(self.__log_level) + else: + init_log_format(self.__log_level) + logging.warning(f"log section parameter not found, it will be set to default value.") + + if con.has_section("common"): + items_common = dict(con.items("common")) self.__read_absolute_threshold(items_common) self.__read__slow_io_detect_frequency(items_common) self.__read__disks_to_detect(items_common) + self.__read__stage(items_common) + self.__read__iotype(items_common) else: - init_log_format(self.__log_level) - logging.warning("common section parameter not found, it will be set to default value.") + logging.warning( + "common section parameter not found, it will be set to default value." 
+ ) - if con.has_section('algorithm'): - items_algorithm = dict(con.items('algorithm')) + if con.has_section("algorithm"): + items_algorithm = dict(con.items("algorithm")) self.__read__train_data_duration(items_algorithm) self.__read__train_update_duration(items_algorithm) self.__read__algorithm_type_and_parameter(items_algorithm) else: - logging.warning("algorithm section parameter not found, it will be set to default value.") - - if con.has_section('sliding_window'): - items_sliding_window = dict(con.items('sliding_window')) - sliding_window_type = items_sliding_window.get('sliding_window_type', - ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE) - self.__sliding_window_type = get_sliding_window_type_enum(sliding_window_type) + logging.warning( + "algorithm section parameter not found, it will be set to default value." + ) + + if con.has_section("sliding_window"): + items_sliding_window = dict(con.items("sliding_window")) + sliding_window_type = items_sliding_window.get( + "sliding_window_type", ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE + ) + self.__sliding_window_type = get_sliding_window_type_enum( + sliding_window_type + ) self.__read__window_size(items_sliding_window) self.__read__window_minimum_threshold(items_sliding_window) else: - logging.warning("sliding_window section parameter not found, it will be set to default value.") + logging.warning( + "sliding_window section parameter not found, it will be set to default value." + ) self.__print_all_config_value() + def __repr__(self): + config_str = { + 'log.level': self.__log_level, + 'common.absolute_threshold': self.__absolute_threshold, + 'common.slow_io_detect_frequency': self.__slow_io_detect_frequency, + 'common.disk': self.__disks_to_detection, + 'common.stage': self.__stage, + 'common.iotype': self.__iotype, + 'algorithm.train_data_duration': self.__train_data_duration, + 'algorithm.train_update_duration': self.__train_update_duration, + 'algorithm.algorithm_type': self.__algorithm_type, + 'algorithm.boxplot_parameter': self.__boxplot_parameter, + 'algorithm.n_sigma_parameter': self.__n_sigma_parameter, + 'sliding_window.sliding_window_type': self.__sliding_window_type, + 'sliding_window.window_size': self.__window_size, + 'sliding_window.window_minimum_threshold': self.__window_minimum_threshold + } + return str(config_str) + def __print_all_config_value(self): - pass + logging.info(f"all config is follow:\n {self}") + + def get_train_data_duration_and_train_update_duration(self): + return self.__train_data_duration, self.__train_update_duration - def get_slow_io_detect_frequency(self): + def get_window_size_and_window_minimum_threshold(self): + return self.__window_size, self.__window_minimum_threshold + + @property + def slow_io_detect_frequency(self): return self.__slow_io_detect_frequency - def get_algorithm_type(self): + @property + def algorithm_type(self): return self.__algorithm_type - def get_sliding_window_type(self): + @property + def sliding_window_type(self): return self.__sliding_window_type - def get_train_data_duration_and_train_update_duration(self): - return self.__train_data_duration, self.__train_update_duration + @property + def train_data_duration(self): + return self.__train_data_duration - def get_window_size_and_window_minimum_threshold(self): - return self.__window_size, self.__window_minimum_threshold + @property + def train_update_duration(self): + return self.__train_update_duration + + @property + def window_size(self): + return self.__window_size - def get_absolute_threshold(self): + @property + def 
window_minimum_threshold(self): + return self.__window_minimum_threshold + + @property + def absolute_threshold(self): return self.__absolute_threshold - def get_log_level(self): + @property + def log_level(self): return self.__log_level - def get_disks_to_detection(self): + @property + def disks_to_detection(self): return self.__disks_to_detection - def get_boxplot_parameter(self): + @property + def stage(self): + return self.__stage + + @property + def iotype(self): + return self.__iotype + + @property + def boxplot_parameter(self): return self.__boxplot_parameter - def get_n_sigma_parameter(self): + @property + def n_sigma_parameter(self): return self.__n_sigma_parameter diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py index c7679cd..ed997e6 100644 --- a/src/python/sentryPlugins/ai_block_io/data_access.py +++ b/src/python/sentryPlugins/ai_block_io/data_access.py @@ -41,11 +41,14 @@ def check_collect_valid(period): try: data = json.loads(data_raw["message"]) except Exception as e: - logging.warning(f"get io data failed, {e}") + logging.warning(f"get valid devices failed, occur exception: {e}") + return None + if not data: + logging.warning(f"get valid devices failed, return {data_raw}") return None return [k for k in data.keys()] else: - logging.warning(f"get io data failed, return {data_raw}") + logging.warning(f"get valid devices failed, return {data_raw}") return None @@ -60,7 +63,7 @@ def _get_raw_data(period, disk_list): def _get_io_stage_data(data): io_stage_data = IOStageData() - for data_type in ('read', 'write', 'flush', 'discard'): + for data_type in ("read", "write", "flush", "discard"): if data_type in data: getattr(io_stage_data, data_type).latency = data[data_type][0] getattr(io_stage_data, data_type).io_dump = data[data_type][1] @@ -87,7 +90,7 @@ def get_io_data_from_collect_plug(period, disk_list): getattr(disk_ret, k) setattr(disk_ret, k, _get_io_stage_data(v)) except AttributeError: - logging.debug(f'no attr {k}') + logging.debug(f"no attr {k}") continue ret[disk] = disk_ret return ret diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py index 0ed282b..e710ddd 100644 --- a/src/python/sentryPlugins/ai_block_io/detector.py +++ b/src/python/sentryPlugins/ai_block_io/detector.py @@ -53,3 +53,28 @@ class Detector: f' io_type_name: {self._metric_name.get_io_access_type_name()},' f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},' f' sliding_window_type: {self._slidingWindow}') + + +class DiskDetector: + + def __init__(self, disk_name: str): + self._disk_name = disk_name + self._detector_list = [] + + def add_detector(self, detector: Detector): + self._detector_list.append(detector) + + def is_slow_io_event(self, io_data_dict_with_disk_name: dict): + # 只有bio阶段发生异常,就认为发生了慢IO事件 + # todo:根因诊断 + for detector in self._detector_list: + result = detector.is_slow_io_event(io_data_dict_with_disk_name) + if result[0] and detector.get_metric_name().get_stage_name() == 'bio': + return result[0], detector.get_metric_name(), result[1], result[2] + return False, None, None, None + + def __repr__(self): + msg = f'disk: {self._disk_name}, ' + for detector in self._detector_list: + msg += f'\n detector: [{detector}]' + return msg diff --git a/src/python/sentryPlugins/ai_block_io/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py index 8dbba06..0ed37b9 100644 --- a/src/python/sentryPlugins/ai_block_io/utils.py +++ 
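
DiskDetector above groups every per-stage detector of one disk and reports only when the bio-stage detector trips; the other stages are kept for the root-cause analysis that the todo comment defers (the Chinese comment reads: only a bio-stage anomaly counts as a slow-IO event; todo: root cause analysis). Assumed wiring, one instance per disk, with Detector and the threshold/window objects coming from the modules patched above:

disk_detector = DiskDetector("sda")              # illustrative disk
for metric_name in metric_name_list:             # all (stage, iotype) metrics of sda
    disk_detector.add_detector(Detector(metric_name, threshold, sliding_window))

is_slow, metric, window, limit = disk_detector.is_slow_io_event(io_data_dict)
if is_slow:
    # is_slow_io_event only returns True for the bio stage
    print("slow IO on", metric.get_disk_name(), window, limit)
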
b/src/python/sentryPlugins/ai_block_io/utils.py @@ -25,8 +25,7 @@ def get_threshold_type_enum(algorithm_type: str): return ThresholdType.BoxplotThreshold if algorithm_type.lower() == 'n_sigma': return ThresholdType.NSigmaThreshold - logging.warning(f"the algorithm type: {algorithm_type} you set is invalid, use default value: boxplot") - return ThresholdType.BoxplotThreshold + return None def get_sliding_window_type_enum(sliding_window_type: str): -- Gitee From d1300f24b91b4be649fe3e50687b9887f763ed75 Mon Sep 17 00:00:00 2001 From: zhangnan Date: Sat, 12 Oct 2024 11:24:32 +0800 Subject: [PATCH 42/76] ebpf fix collect iodump --- src/c/ebpf_collector/ebpf_collector.bpf.c | 292 +++++++++++----------- src/c/ebpf_collector/ebpf_collector.c | 51 +++- src/c/ebpf_collector/ebpf_collector.h | 18 +- 3 files changed, 200 insertions(+), 161 deletions(-) diff --git a/src/c/ebpf_collector/ebpf_collector.bpf.c b/src/c/ebpf_collector/ebpf_collector.bpf.c index 28cdde2..870a677 100644 --- a/src/c/ebpf_collector/ebpf_collector.bpf.c +++ b/src/c/ebpf_collector/ebpf_collector.bpf.c @@ -92,6 +92,35 @@ struct bpf_map_def SEC("maps") tag_args = { .max_entries = 1000, }; +struct bpf_map_def SEC("maps") blk_res_2 = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u64), + .value_size = sizeof(struct time_range_io_count), + .max_entries = MAX_IO_TIME, +}; + +struct bpf_map_def SEC("maps") bio_res_2 = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u64), + .value_size = sizeof(struct time_range_io_count), + .max_entries = MAX_IO_TIME, +}; + +struct bpf_map_def SEC("maps") wbt_res_2 = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u64), + .value_size = sizeof(struct time_range_io_count), + .max_entries = MAX_IO_TIME, +}; + +struct bpf_map_def SEC("maps") tag_res_2 = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u64), + .value_size = sizeof(struct time_range_io_count), + .max_entries = MAX_IO_TIME, +}; + + struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; @@ -148,39 +177,12 @@ static __always_inline void blk_fill_rwbs(char *rwbs, unsigned int op) } } -void update_new_data_in_start(struct stage_data *new_data, struct update_params *params) { - blk_fill_rwbs(new_data->io_type, params->cmd_flags); - if (new_data->bucket[params->update_bucket].start_range == params->curr_start_range){ - new_data->bucket[params->update_bucket].io_count += 1; - } else { - new_data->bucket[MAX_BUCKETS].io_count += new_data->bucket[params->update_bucket].io_count; - new_data->bucket[params->update_bucket].io_count = 1; - new_data->bucket[params->update_bucket].start_range = params->curr_start_range; - } -} - void update_curr_data_in_start(struct stage_data *curr_data, struct update_params *params) { if (curr_data && params) { curr_data->start_count += 1; curr_data->major = params->major; curr_data->first_minor = params->first_minor; blk_fill_rwbs(curr_data->io_type, params->cmd_flags); - if (curr_data->bucket[params->update_bucket].start_range == params->curr_start_range) { - curr_data->bucket[params->update_bucket].io_count += 1; - } else { - curr_data->bucket[MAX_BUCKETS].io_count += curr_data->bucket[params->update_bucket].io_count; - curr_data->bucket[params->update_bucket].io_count = 1; - } - curr_data->bucket[params->update_bucket].start_range = params->curr_start_range; - } -} - -void update_new_data_in_finish(struct stage_data *new_data, struct update_params *params) { - blk_fill_rwbs(new_data->io_type, params->cmd_flags); - if (new_data->bucket[params->update_bucket].start_range == 
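
Patch 42 replaces the per-request two-bucket ring with four <stage>_res_2 hash maps keyed by the IO's start second (start_time / THRESHOLD, with THRESHOLD = 1e9 ns); each value is a per-device array of in-flight counts. A minimal Python model of the accounting the probes perform:

from collections import defaultdict

MAP_SIZE = 15                    # per ebpf_collector.h after this patch
NSEC_PER_SEC = 1_000_000_000     # THRESHOLD in the BPF code

blk_res_2 = defaultdict(lambda: [0] * MAP_SIZE)   # second -> per-device counts

def on_start(start_ns, dev_key):          # request enters the stage
    blk_res_2[start_ns // NSEC_PER_SEC][dev_key] += 1

def on_finish(start_ns, dev_key):         # request leaves the stage
    counts = blk_res_2[start_ns // NSEC_PER_SEC]
    if counts[dev_key] > 0:
        counts[dev_key] -= 1

A bucket that never drains back to zero therefore marks IOs that entered in that second and are still stuck, which is exactly what the userspace side later sums up as io_dump.
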
params->curr_start_range){ - new_data->bucket[params->update_bucket].io_count = (new_data->bucket[params->update_bucket].io_count > 1) ? new_data->bucket[params->update_bucket].io_count - 1 : 0; - } else { - new_data->bucket[MAX_BUCKETS].io_count = (new_data->bucket[MAX_BUCKETS].io_count > 1) ? new_data->bucket[MAX_BUCKETS].io_count - 1 : 0; } } @@ -204,7 +206,6 @@ static void init_io_counter(struct io_counter *counterp, int major, int first_mi } } - u32 find_matching_tag_1_keys(int major, int minor) { u32 key = 0; struct stage_data *curr_data = bpf_map_lookup_elem(&tag_res, &key); @@ -705,6 +706,7 @@ u32 find_matching_wbt_5_keys(int major, int minor) { return MAP_SIZE + 1; } +// start rq_driver SEC("kprobe/blk_mq_start_request") int kprobe_blk_mq_start_request(struct pt_regs *regs) { @@ -742,14 +744,12 @@ int kprobe_blk_mq_start_request(struct pt_regs *regs) if (err) return 0; - u64 curr_start_range = zero.start_time / THRESHOLD / MAX_BUCKETS; - u64 update_bucket = curr_start_range % MAX_BUCKETS; + u64 curr_start_range = zero.start_time / THRESHOLD; struct update_params params = { .major = major, .first_minor = first_minor, .cmd_flags = cmd_flags, - .update_bucket = update_bucket, .curr_start_range = curr_start_range, }; @@ -764,20 +764,28 @@ int kprobe_blk_mq_start_request(struct pt_regs *regs) .major = major, .first_minor = first_minor, .io_type = "", - .bucket = { - [0] = {.start_range = 0, .io_count = 0}, - [1] = {.start_range = 0, .io_count = 0}, - }, }; - update_new_data_in_start(&new_data, ¶ms); + blk_fill_rwbs(new_data.io_type, cmd_flags); bpf_map_update_elem(&blk_res, &key, &new_data, 0); } else { update_curr_data_in_start(curr_data, ¶ms); } + struct time_range_io_count *curr_data_time_range; + curr_data_time_range = bpf_map_lookup_elem(&blk_res_2, &curr_start_range); + if (curr_data_time_range == NULL) { + struct time_range_io_count new_data = { .count = {0} }; + bpf_map_update_elem(&blk_res_2, &curr_start_range, &new_data, 0); + } else { + if (key < MAP_SIZE) { + __sync_fetch_and_add(&curr_data_time_range->count[key], 1); + } + } + return 0; } +// finish rq_driver SEC("kprobe/blk_mq_free_request") int kprobe_blk_mq_free_request(struct pt_regs *regs) { @@ -811,15 +819,13 @@ int kprobe_blk_mq_free_request(struct pt_regs *regs) return 0; } - u64 duration = bpf_ktime_get_ns() - counterp->start_time; - u64 curr_start_range = counterp->start_time / THRESHOLD / MAX_BUCKETS; - u64 update_bucket = curr_start_range % MAX_BUCKETS; + u64 duration = bpf_ktime_get_ns() - counterp->start_time; + u64 curr_start_range = counterp->start_time / THRESHOLD; struct update_params params = { .major = major, .first_minor = first_minor, .cmd_flags = cmd_flags, - .update_bucket = update_bucket, .curr_start_range = curr_start_range, }; @@ -834,12 +840,8 @@ int kprobe_blk_mq_free_request(struct pt_regs *regs) .major = major, .first_minor = first_minor, .io_type = "", - .bucket = { - [0] = {.start_range = 0, .io_count = 0}, - [1] = {.start_range = 0, .io_count = 0}, - }, }; - update_new_data_in_finish(&new_data, ¶ms); + blk_fill_rwbs(new_data.io_type, cmd_flags); bpf_map_update_elem(&blk_res, &key, &new_data, 0); } else if (curr_data == NULL) { struct stage_data new_data = { @@ -850,28 +852,30 @@ int kprobe_blk_mq_free_request(struct pt_regs *regs) .major = major, .first_minor = first_minor, .io_type = "", - .bucket = { - [0] = {.start_range = 0, .io_count = 0}, - [1] = {.start_range = 0, .io_count = 0}, - }, }; - update_new_data_in_finish(&new_data, ¶ms); + blk_fill_rwbs(new_data.io_type, cmd_flags); 
bpf_map_update_elem(&blk_res, &key, &new_data, 0); } else { - if (curr_data->bucket[update_bucket].start_range == curr_start_range) { - curr_data->bucket[update_bucket].io_count = (curr_data->bucket[update_bucket].io_count > 1) ? curr_data->bucket[update_bucket].io_count - 1 : 0; - } else { - curr_data->bucket[MAX_BUCKETS].io_count = (curr_data->bucket[MAX_BUCKETS].io_count > 1) ? curr_data->bucket[MAX_BUCKETS].io_count - 1 : 0; - - } curr_data->duration += duration; update_curr_data_in_finish(curr_data, ¶ms, &duration); } + struct time_range_io_count *curr_data_time_range; + curr_data_time_range = bpf_map_lookup_elem(&blk_res_2, &curr_start_range); + if (curr_data_time_range == NULL) { + struct time_range_io_count new_data = { .count = {0} }; + bpf_map_update_elem(&blk_res_2, &curr_start_range, &new_data, 0); + } else { + if (key < MAP_SIZE && curr_data_time_range->count[key] > 0) { + __sync_fetch_and_add(&curr_data_time_range->count[key], -1); + } + } + bpf_map_delete_elem(&blk_map, &rq); return 0; } +// start bio SEC("kprobe/blk_mq_make_request") int kprobe_blk_mq_make_request(struct pt_regs *regs) { @@ -909,20 +913,18 @@ int kprobe_blk_mq_make_request(struct pt_regs *regs) if (err && err != -EEXIST) return 0; - u64 curr_start_range = zero.start_time / THRESHOLD / MAX_BUCKETS; - u64 update_bucket = curr_start_range % MAX_BUCKETS; + u64 curr_start_range = zero.start_time / THRESHOLD; struct update_params params = { .major = major, .first_minor = first_minor, .cmd_flags = cmd_flags, - .update_bucket = update_bucket, .curr_start_range = curr_start_range, }; struct stage_data *curr_data; curr_data = bpf_map_lookup_elem(&bio_res, &key); - if (curr_data == NULL) { + if (curr_data == NULL) { struct stage_data new_data = { .start_count = 1, .finish_count = 0, @@ -931,20 +933,28 @@ int kprobe_blk_mq_make_request(struct pt_regs *regs) .major = major, .first_minor = first_minor, .io_type = "", - .bucket = { - [0] = {.start_range = 0, .io_count = 0}, - [1] = {.start_range = 0, .io_count = 0}, - }, }; - update_new_data_in_start(&new_data, ¶ms); + blk_fill_rwbs(new_data.io_type, cmd_flags); bpf_map_update_elem(&bio_res, &key, &new_data, 0); - } else { + } else { update_curr_data_in_start(curr_data, ¶ms); } + struct time_range_io_count *curr_data_time_range; + curr_data_time_range = bpf_map_lookup_elem(&bio_res_2, &curr_start_range); + if (curr_data_time_range == NULL) { + struct time_range_io_count new_data = { .count = {0} }; + bpf_map_update_elem(&bio_res_2, &curr_start_range, &new_data, 0); + } else { + if (key < MAP_SIZE) { + __sync_fetch_and_add(&curr_data_time_range->count[key], 1); + } + } + return 0; } +// finish bio SEC("kprobe/bio_endio") int kprobe_bio_endio(struct pt_regs *regs) { @@ -982,20 +992,18 @@ int kprobe_bio_endio(struct pt_regs *regs) delete_map = &bio_map; u64 duration = bpf_ktime_get_ns() - counterp->start_time; - u64 curr_start_range = counterp->start_time / THRESHOLD / MAX_BUCKETS; - u64 update_bucket = curr_start_range % MAX_BUCKETS; + u64 curr_start_range = counterp->start_time / THRESHOLD; struct update_params params = { .major = major, .first_minor = first_minor, .cmd_flags = cmd_flags, - .update_bucket = update_bucket, .curr_start_range = curr_start_range, }; struct stage_data *curr_data; curr_data = bpf_map_lookup_elem(&bio_res, &key); - if (curr_data == NULL && duration > DURATION_THRESHOLD) { + if (curr_data == NULL && duration > DURATION_THRESHOLD) { struct stage_data new_data = { .start_count = 1, .finish_count = 1, @@ -1004,14 +1012,10 @@ int 
kprobe_bio_endio(struct pt_regs *regs) .major = major, .first_minor = first_minor, .io_type = "", - .bucket = { - [0] = {.start_range = 0, .io_count = 0}, - [1] = {.start_range = 0, .io_count = 0}, - }, }; - update_new_data_in_finish(&new_data, ¶ms); + blk_fill_rwbs(new_data.io_type, cmd_flags); bpf_map_update_elem(&bio_res, &key, &new_data, 0); - } else if (curr_data == NULL) { + } else if (curr_data == NULL) { struct stage_data new_data = { .start_count = 1, .finish_count = 1, @@ -1020,28 +1024,30 @@ int kprobe_bio_endio(struct pt_regs *regs) .major = major, .first_minor = first_minor, .io_type = "", - .bucket = { - [0] = {.start_range = 0, .io_count = 0}, - [1] = {.start_range = 0, .io_count = 0}, - }, }; - update_new_data_in_finish(&new_data, ¶ms); + blk_fill_rwbs(new_data.io_type, cmd_flags); bpf_map_update_elem(&bio_res, &key, &new_data, 0); } else { - if (curr_data->bucket[update_bucket].start_range == curr_start_range) { - curr_data->bucket[update_bucket].io_count = (curr_data->bucket[update_bucket].io_count > 1) ? curr_data->bucket[update_bucket].io_count - 1 : 0; - } else { - curr_data->bucket[MAX_BUCKETS].io_count = (curr_data->bucket[MAX_BUCKETS].io_count > 1) ? curr_data->bucket[MAX_BUCKETS].io_count - 1 : 0; - - } curr_data->duration += duration; update_curr_data_in_finish(curr_data, ¶ms, &duration); } + struct time_range_io_count *curr_data_time_range; + curr_data_time_range = bpf_map_lookup_elem(&bio_res_2, &curr_start_range); + if (curr_data_time_range == NULL) { + struct time_range_io_count new_data = { .count = {0} }; + bpf_map_update_elem(&bio_res_2, &curr_start_range, &new_data, 0); + } else { + if (key < MAP_SIZE && curr_data_time_range->count[key] > 0) { + __sync_fetch_and_add(&curr_data_time_range->count[key], -1); + } + } + bpf_map_delete_elem(delete_map, &bio); return 0; } +// start wbt SEC("kprobe/wbt_wait") int kprobe_wbt_wait(struct pt_regs *regs) { @@ -1082,14 +1088,12 @@ int kprobe_wbt_wait(struct pt_regs *regs) if (err) return 0; - u64 curr_start_range = zero.start_time / THRESHOLD / MAX_BUCKETS; - u64 update_bucket = curr_start_range % MAX_BUCKETS; + u64 curr_start_range = zero.start_time / THRESHOLD; struct update_params params = { .major = major, .first_minor = first_minor, .cmd_flags = cmd_flags, - .update_bucket = update_bucket, .curr_start_range = curr_start_range, }; @@ -1104,20 +1108,28 @@ int kprobe_wbt_wait(struct pt_regs *regs) .major = major, .first_minor = first_minor, .io_type = "", - .bucket = { - [0] = {.start_range = 0, .io_count = 0}, - [1] = {.start_range = 0, .io_count = 0}, - }, }; - update_new_data_in_start(&new_data, ¶ms); + blk_fill_rwbs(new_data.io_type, cmd_flags); bpf_map_update_elem(&wbt_res, &key, &new_data, 0); } else { update_curr_data_in_start(curr_data, ¶ms); } + struct time_range_io_count *curr_data_time_range; + curr_data_time_range = bpf_map_lookup_elem(&wbt_res_2, &curr_start_range); + if (curr_data_time_range == NULL) { + struct time_range_io_count new_data = { .count = {0} }; + bpf_map_update_elem(&wbt_res_2, &curr_start_range, &new_data, 0); + } else { + if (key < MAP_SIZE) { + __sync_fetch_and_add(&curr_data_time_range->count[key], 1); + } + } + return 0; } +// finish wbt SEC("kretprobe/wbt_wait") int kretprobe_wbt_wait(struct pt_regs *regs) { @@ -1159,14 +1171,12 @@ int kretprobe_wbt_wait(struct pt_regs *regs) return 0; u64 duration = bpf_ktime_get_ns() - counterp->start_time; - u64 curr_start_range = counterp->start_time / THRESHOLD / MAX_BUCKETS; - u64 update_bucket = curr_start_range % MAX_BUCKETS; + u64 
curr_start_range = counterp->start_time / THRESHOLD; struct update_params params = { .major = major, .first_minor = first_minor, .cmd_flags = cmd_flags, - .update_bucket = update_bucket, .curr_start_range = curr_start_range, }; @@ -1181,12 +1191,8 @@ int kretprobe_wbt_wait(struct pt_regs *regs) .major = major, .first_minor = first_minor, .io_type = "", - .bucket = { - [0] = {.start_range = 0, .io_count = 0}, - [1] = {.start_range = 0, .io_count = 0}, - }, }; - update_new_data_in_finish(&new_data, ¶ms); + blk_fill_rwbs(new_data.io_type, cmd_flags); bpf_map_update_elem(&wbt_res, &key, &new_data, 0); } else if (curr_data == NULL) { struct stage_data new_data = { @@ -1197,29 +1203,31 @@ int kretprobe_wbt_wait(struct pt_regs *regs) .io_type = "", .major = major, .first_minor = first_minor, - .bucket = { - [0] = {.start_range = 0, .io_count = 0}, - [1] = {.start_range = 0, .io_count = 0}, - }, }; - update_new_data_in_finish(&new_data, ¶ms); + blk_fill_rwbs(new_data.io_type, cmd_flags); bpf_map_update_elem(&wbt_res, &key, &new_data, 0); } else { - if (curr_data->bucket[update_bucket].start_range == curr_start_range) { - curr_data->bucket[update_bucket].io_count = (curr_data->bucket[update_bucket].io_count > 1) ? curr_data->bucket[update_bucket].io_count - 1 : 0; - } else { - curr_data->bucket[MAX_BUCKETS].io_count = (curr_data->bucket[MAX_BUCKETS].io_count > 1) ? curr_data->bucket[MAX_BUCKETS].io_count - 1 : 0; - - } curr_data->duration += duration; update_curr_data_in_finish(curr_data, ¶ms, &duration); } + struct time_range_io_count *curr_data_time_range; + curr_data_time_range = bpf_map_lookup_elem(&wbt_res_2, &curr_start_range); + if (curr_data_time_range == NULL) { + struct time_range_io_count new_data = { .count = {0} }; + bpf_map_update_elem(&wbt_res_2, &curr_start_range, &new_data, 0); + } else { + if (key < MAP_SIZE && curr_data_time_range->count[key] > 0) { + __sync_fetch_and_add(&curr_data_time_range->count[key], -1); + } + } + bpf_map_delete_elem(&wbt_map, &wbtkey); bpf_map_delete_elem(&wbt_args, &wbtkey); return 0; } +// start get_tag SEC("kprobe/blk_mq_get_tag") int kprobe_blk_mq_get_tag(struct pt_regs *regs) { @@ -1262,14 +1270,12 @@ int kprobe_blk_mq_get_tag(struct pt_regs *regs) if (err) return 0; - u64 curr_start_range = zero.start_time / THRESHOLD / MAX_BUCKETS; - u64 update_bucket = curr_start_range % MAX_BUCKETS; + u64 curr_start_range = zero.start_time / THRESHOLD; struct update_params params = { .major = major, .first_minor = first_minor, .cmd_flags = cmd_flags, - .update_bucket = update_bucket, .curr_start_range = curr_start_range, }; @@ -1284,20 +1290,28 @@ int kprobe_blk_mq_get_tag(struct pt_regs *regs) .major = major, .first_minor = first_minor, .io_type = "", - .bucket = { - [0] = {.start_range = 0, .io_count = 0}, - [1] = {.start_range = 0, .io_count = 0}, - }, }; - update_new_data_in_start(&new_data, ¶ms); + blk_fill_rwbs(new_data.io_type, cmd_flags); bpf_map_update_elem(&tag_res, &key, &new_data, 0); } else { update_curr_data_in_start(curr_data, ¶ms); } + struct time_range_io_count *curr_data_time_range; + curr_data_time_range = bpf_map_lookup_elem(&tag_res_2, &curr_start_range); + if (curr_data_time_range == NULL) { + struct time_range_io_count new_data = { .count = {0} }; + bpf_map_update_elem(&tag_res_2, &curr_start_range, &new_data, 0); + } else { + if (key < MAP_SIZE) { + __sync_fetch_and_add(&curr_data_time_range->count[key], 1); + } + } + return 0; } +// finish get_tag SEC("kretprobe/blk_mq_get_tag") int kretprobe_blk_mq_get_tag(struct pt_regs *regs) { @@ 
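
For the stages that can sleep (wbt_wait, blk_mq_get_tag) the collector pairs an entry kprobe with a kretprobe: the entry probe stores a timestamp keyed by the request, the return probe computes the stage latency and updates the accounting. Schematically:

start_times = {}                     # role of wbt_map / tag_map

def on_entry(req_key, now_ns):       # kprobe: the wait begins
    start_times[req_key] = now_ns

def on_return(req_key, now_ns):      # kretprobe: the wait is over
    start_ns = start_times.pop(req_key, None)
    if start_ns is None:
        return None                  # entry never seen, ignore
    return now_ns - start_ns         # added to stage_data.duration
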
-1343,14 +1357,12 @@ int kretprobe_blk_mq_get_tag(struct pt_regs *regs) return 0; u64 duration = bpf_ktime_get_ns() - counterp->start_time; - u64 curr_start_range = counterp->start_time / THRESHOLD / MAX_BUCKETS; - u64 update_bucket = curr_start_range % MAX_BUCKETS; + u64 curr_start_range = counterp->start_time / THRESHOLD; struct update_params params = { .major = major, .first_minor = first_minor, .cmd_flags = cmd_flags, - .update_bucket = update_bucket, .curr_start_range = curr_start_range, }; @@ -1365,12 +1377,8 @@ int kretprobe_blk_mq_get_tag(struct pt_regs *regs) .major = major, .first_minor = first_minor, .io_type = "", - .bucket = { - [0] = {.start_range = 0, .io_count = 0}, - [1] = {.start_range = 0, .io_count = 0}, - }, }; - update_new_data_in_finish(&new_data, ¶ms); + blk_fill_rwbs(new_data.io_type, cmd_flags); bpf_map_update_elem(&tag_res, &key, &new_data, 0); } else if (curr_data == NULL) { struct stage_data new_data = { @@ -1381,23 +1389,25 @@ int kretprobe_blk_mq_get_tag(struct pt_regs *regs) .major = major, .first_minor = first_minor, .io_type = "", - .bucket = { - [0] = {.start_range = 0, .io_count = 0}, - [1] = {.start_range = 0, .io_count = 0}, - }, }; - update_new_data_in_finish(&new_data, ¶ms); + blk_fill_rwbs(new_data.io_type, cmd_flags); bpf_map_update_elem(&tag_res, &key, &new_data, 0); } else { - if (curr_data->bucket[update_bucket].start_range == curr_start_range) { - curr_data->bucket[update_bucket].io_count = (curr_data->bucket[update_bucket].io_count > 1) ? curr_data->bucket[update_bucket].io_count - 1 : 0; - } else { - curr_data->bucket[MAX_BUCKETS].io_count = (curr_data->bucket[MAX_BUCKETS].io_count > 1) ? curr_data->bucket[MAX_BUCKETS].io_count - 1 : 0; - - } curr_data->duration += duration; update_curr_data_in_finish(curr_data, ¶ms, &duration); } + + struct time_range_io_count *curr_data_time_range; + curr_data_time_range = bpf_map_lookup_elem(&tag_res_2, &curr_start_range); + if (curr_data_time_range == NULL) { + struct time_range_io_count new_data = { .count = {0} }; + bpf_map_update_elem(&tag_res_2, &curr_start_range, &new_data, 0); + } else { + if (key < MAP_SIZE && curr_data_time_range->count[key] > 0) { + __sync_fetch_and_add(&curr_data_time_range->count[key], -1); + } + } + bpf_map_delete_elem(&tag_map, &tagkey); bpf_map_delete_elem(&tag_args, &tagkey); return 0; diff --git a/src/c/ebpf_collector/ebpf_collector.c b/src/c/ebpf_collector/ebpf_collector.c index a949ae8..6e981da 100644 --- a/src/c/ebpf_collector/ebpf_collector.c +++ b/src/c/ebpf_collector/ebpf_collector.c @@ -30,6 +30,10 @@ #define WBT_RES (map_fd[5]) #define TAG_MAP (map_fd[7]) #define TAG_RES (map_fd[8]) +#define BLK_RES_2 (map_fd[10]) +#define BIO_RES_2 (map_fd[11]) +#define WBT_RES_2 (map_fd[12]) +#define TAG_RES_2 (map_fd[13]) #define BPF_FILE "/usr/lib/ebpf_collector.bpf.o" typedef struct { @@ -113,16 +117,40 @@ char* find_device_name(dev_t dev) { return device_name; } -static int print_map_res(struct bpf_map *map_res, char *stage, int *map_size) +static int print_map_res(struct bpf_map *map_res, struct bpf_map *map_res_2, char *stage, int *map_size) { + int err; struct stage_data counter; - int key = 0; + struct time_range_io_count time_count; + int key = 0; + int io_dump[MAP_SIZE] = {0}; + u32 io_dump_key = 0, io_dump_next_key = 0; struct sysinfo info; - sysinfo(&info); + sysinfo(&info); + + while (bpf_map_get_next_key(map_res_2, &io_dump_key, &io_dump_next_key) == 0) { + err = bpf_map_lookup_elem(map_res_2, &io_dump_next_key, &time_count); + if (err < 0) { + fprintf(stderr, 
"failed to lookup %s io dump: %d\n", stage, err); + continue; + } + io_dump_key = io_dump_next_key; + if ((info.uptime - io_dump_key) > 2) { + int isempty = 1; + for (key = 0; key < map_size; key++){ + if (time_count.count[key] > 0) { + io_dump[key] += time_count.count[key]; + isempty = 0; + } + } + if (isempty || (info.uptime - io_dump_key) > IO_DUMP_THRESHOLD) { + bpf_map_delete_elem(map_res_2, &io_dump_key); + } + } + } for (key = 0; key < map_size; key++) { - int err; err = bpf_map_lookup_elem(map_res, &key, &counter); if (err < 0) { fprintf(stderr, "failed to lookup %s map_res: %d\n", stage, err); @@ -141,11 +169,11 @@ static int print_map_res(struct bpf_map *map_res, char *stage, int *map_size) dev_t dev = makedev(major, first_minor); char *device_name = find_device_name(dev); if (device_name && io_type) { - printf("%-7s %10llu %10llu %u %c %s\n", + printf("%-7s %10llu %10llu %d %c %s\n", stage, counter.finish_count, counter.duration, - counter.bucket[MAX_BUCKETS].io_count, + io_dump[key], io_type, device_name ); @@ -158,8 +186,8 @@ static int print_map_res(struct bpf_map *map_res, char *stage, int *map_size) int init_map(int *map_fd, const char *map_name, int *map_size, DeviceInfo *devices) { struct stage_data init_data = {0}; + memset(init_data.io_type, 0, sizeof(init_data.io_type)); - memset(init_data.bucket, 0, sizeof(init_data.bucket)); for (int i = 0; i < map_size; i++) { init_data.major = devices[i].major; @@ -246,19 +274,19 @@ int main(int argc, char **argv) { sleep(1); - err = print_map_res(BLK_RES, "rq_driver", device_count); + err = print_map_res(BLK_RES, BLK_RES_2, "rq_driver", device_count); if (err) break; - err = print_map_res(BIO_RES, "bio", device_count); + err = print_map_res(BIO_RES, BIO_RES_2, "bio", device_count); if (err) break; - err = print_map_res(TAG_RES, "gettag", device_count); + err = print_map_res(TAG_RES, TAG_RES_2, "gettag", device_count); if (err) break; - err = print_map_res(WBT_RES, "wbt", device_count); + err = print_map_res(WBT_RES, WBT_RES_2, "wbt", device_count); if (err) break; @@ -268,3 +296,4 @@ int main(int argc, char **argv) { return -err; } + diff --git a/src/c/ebpf_collector/ebpf_collector.h b/src/c/ebpf_collector/ebpf_collector.h index dca04d8..fcebc93 100644 --- a/src/c/ebpf_collector/ebpf_collector.h +++ b/src/c/ebpf_collector/ebpf_collector.h @@ -10,7 +10,8 @@ typedef long long unsigned int u64; typedef unsigned int u32; -#define MAX_BUCKETS 1 +#define MAX_IO_TIME 130 +#define IO_DUMP_THRESHOLD 120 #define THRESHOLD 1000000000 #define DURATION_THRESHOLD 500000000 @@ -29,7 +30,7 @@ typedef unsigned int u32; #define REQ_OP_DISCARD 3 #define REQ_OP_SECURE_ERASE 5 #define REQ_OP_WRITE_SAME 7 -#define MAP_SIZE 128 +#define MAP_SIZE 15 enum stage_type { BIO=0, @@ -42,11 +43,6 @@ enum stage_type { MAX_STAGE_TYPE, }; -struct time_bucket { - u64 start_range; - u32 io_count; -}; - struct stage_data { u64 start_count; u64 finish_count; @@ -55,7 +51,6 @@ struct stage_data { int major; int first_minor; char io_type[RWBS_LEN]; - struct time_bucket bucket[MAX_BUCKETS+1]; }; struct io_counter { @@ -70,8 +65,13 @@ struct update_params { int major; int first_minor; unsigned int cmd_flags; - u64 update_bucket; u64 curr_start_range; }; +struct time_range_io_count +{ + u32 count[MAP_SIZE]; +}; + #endif /* __EBPFCOLLECTOR_H */ + -- Gitee From dfcbed79ecbbfd8d69973731f945eb2db214aec6 Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Sat, 12 Oct 2024 14:51:51 +0800 Subject: [PATCH 43/76] fix io_dump for collect module --- src/python/sentryCollector/collect_io.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py index d734734..2b10cde 100644 --- a/src/python/sentryCollector/collect_io.py +++ b/src/python/sentryCollector/collect_io.py @@ -154,7 +154,7 @@ class CollectIo(): try: with open(io_dump_file, 'r') as file: for line in file: - count += line.count('.op=' + Io_Category[category]) + count += line.count('.op=' + Io_Category[category].upper()) if count > 0: logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}") except FileNotFoundError: -- Gitee From 3606087536b2cbd4b75ce68c443ab36868fd1f3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> Date: Sat, 12 Oct 2024 21:59:18 +0800 Subject: [PATCH 44/76] add root cause analysis --- config/plugins/ai_block_io.ini | 15 +- .../sentryPlugins/ai_block_io/ai_block_io.py | 133 +++-- .../ai_block_io/config_parser.py | 465 +++++++++++------- .../sentryPlugins/ai_block_io/data_access.py | 1 + .../sentryPlugins/ai_block_io/detector.py | 54 +- .../sentryPlugins/ai_block_io/io_data.py | 32 +- .../ai_block_io/sliding_window.py | 57 ++- src/python/sentryPlugins/ai_block_io/utils.py | 44 +- 8 files changed, 491 insertions(+), 310 deletions(-) diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini index a814d52..422cfa3 100644 --- a/config/plugins/ai_block_io.ini +++ b/config/plugins/ai_block_io.ini @@ -2,7 +2,6 @@ level=info [common] -absolute_threshold=40 slow_io_detect_frequency=1 disk=default stage=bio @@ -18,4 +17,16 @@ n_sigma_parameter=3 [sliding_window] sliding_window_type=not_continuous window_size=30 -window_minimum_threshold=6 \ No newline at end of file +window_minimum_threshold=6 + +[latency_sata_ssd] +read_tot_lim=50000 +write_tot_lim=50000 + +[latency_nvme_ssd] +read_tot_lim=500 +write_tot_lim=500 + +[latency_sata_hdd] +read_tot_lim=50000 +write_tot_lim=50000 \ No newline at end of file diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py index e1052ec..dd661a1 100644 --- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py @@ -12,13 +12,18 @@ import time import signal import logging +from collections import defaultdict from .detector import Detector, DiskDetector -from .threshold import ThresholdFactory, AbsoluteThreshold +from .threshold import ThresholdFactory from .sliding_window import SlidingWindowFactory from .utils import get_data_queue_size_and_update_size from .config_parser import ConfigParser -from .data_access import get_io_data_from_collect_plug, check_collect_valid +from .data_access import ( + get_io_data_from_collect_plug, + check_collect_valid, + get_disk_type, +) from .io_data import MetricName from .alarm_report import Xalarm, Report @@ -34,7 +39,7 @@ def sig_handler(signum, frame): class SlowIODetection: _config_parser = None _disk_list = None - _detector_name_list = {} + _detector_name_list = defaultdict(list) _disk_detectors = {} def __init__(self, config_parser: ConfigParser): @@ -43,9 +48,13 @@ class SlowIODetection: self.__init_detector() def __init_detector_name_list(self): - self._disk_list = check_collect_valid(self._config_parser.slow_io_detect_frequency) + self._disk_list = check_collect_valid( + self._config_parser.slow_io_detect_frequency + ) if self._disk_list is None: - Report.report_pass("get available disk error, please check if the collector plug is enable. 
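
Patch 44 keys the absolute latency limits by disk type: get_disk_type returns a code of '0', '1' or '2', which selects one of the three new [latency_*] sections. The code-to-section mapping below is an assumption consistent with these hunks; the mapping itself is not shown in the patch:

from configparser import ConfigParser as IniParser

DISK_TYPE_SECTION = {      # assumed mapping; only the codes appear in this patch
    0: "latency_sata_ssd",
    1: "latency_nvme_ssd",
    2: "latency_sata_hdd",
}

def get_tot_lim(ini: IniParser, disk_type: int, io_type: str):
    # sketch of ConfigParser.get_tot_lim: read_tot_lim / write_tot_lim
    # from the section matching the disk type, None when unknown
    section = DISK_TYPE_SECTION.get(disk_type)
    if section is None:
        return None
    return ini.getint(section, f"{io_type}_tot_lim", fallback=None)
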
exiting...") + Report.report_pass( + "get available disk error, please check if the collector plug is enable. exiting..." + ) exit(1) logging.info(f"ai_block_io plug has found disks: {self._disk_list}") @@ -56,27 +65,45 @@ class SlowIODetection: # 情况2:is not None and len = 0,则不启动任何磁盘检测 # 情况3:len != 0,则取交集 if disks is None: - logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.") - for disk in self._disk_list: - for stage in stages: - for iotype in iotypes: - if disk not in self._detector_name_list: - self._detector_name_list[disk] = [] - self._detector_name_list[disk].append(MetricName(disk, stage, iotype, "latency")) - else: - for disk in disks: - if disk in self._disk_list: - for stage in stages: - for iotype in iotypes: - if disk not in self._detector_name_list: - self._detector_name_list[disk] = [] - self._detector_name_list[disk].append(MetricName(disk, stage, iotype, "latency")) - else: - logging.warning("disk: [%s] not in available disk list, so it will be ignored.", disk) - if len(self._detector_name_list) == 0: - logging.critical("the disks to detection is empty, ai_block_io will exit.") - Report.report_pass("the disks to detection is empty, ai_block_io will exit.") - exit(1) + logging.warning( + "you not specify any disk or use default, so ai_block_io will enable all available disk." + ) + for disk in self._disk_list: + if disks is not None: + if disk not in disks: + continue + disks.remove(disk) + + disk_type_result = get_disk_type(disk) + if disk_type_result["ret"] == 0 and disk_type_result["message"] in ( + '0', + '1', + '2', + ): + disk_type = int(disk_type_result["message"]) + else: + logging.warning( + "%s get disk type error, return %s, so it will be ignored.", + disk, + disk_type_result, + ) + continue + for stage in stages: + for iotype in iotypes: + self._detector_name_list[disk].append( + MetricName(disk, disk_type, stage, iotype, "latency") + ) + if disks: + logging.warning( + "disks: %s not in available disk list, so they will be ignored.", + disks, + ) + if not self._detector_name_list: + logging.critical("the disks to detection is empty, ai_block_io will exit.") + Report.report_pass( + "the disks to detection is empty, ai_block_io will exit." 
+ ) + exit(1) def __init_detector(self): train_data_duration, train_update_duration = ( @@ -88,26 +115,39 @@ class SlowIODetection: train_data_duration, train_update_duration, slow_io_detection_frequency ) sliding_window_type = self._config_parser.sliding_window_type - window_size, window_threshold = (self._config_parser.get_window_size_and_window_minimum_threshold()) + window_size, window_threshold = ( + self._config_parser.get_window_size_and_window_minimum_threshold() + ) for disk, metric_name_list in self._detector_name_list.items(): - threshold = ThresholdFactory().get_threshold( - threshold_type, - boxplot_parameter=self._config_parser.boxplot_parameter, - n_sigma_paramter=self._config_parser.n_sigma_parameter, - data_queue_size=data_queue_size, - data_queue_update_size=update_size, - ) - sliding_window = SlidingWindowFactory().get_sliding_window( - sliding_window_type, - queue_length=window_size, - threshold=window_threshold, - ) disk_detector = DiskDetector(disk) for metric_name in metric_name_list: + threshold = ThresholdFactory().get_threshold( + threshold_type, + boxplot_parameter=self._config_parser.boxplot_parameter, + n_sigma_paramter=self._config_parser.n_sigma_parameter, + data_queue_size=data_queue_size, + data_queue_update_size=update_size, + ) + abs_threshold = self._config_parser.get_tot_lim( + metric_name.disk_type, metric_name.io_access_type_name + ) + if abs_threshold is None: + logging.warning( + "disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.", + disk, + metric_name.disk_type, + metric_name.io_access_type_name, + ) + sliding_window = SlidingWindowFactory().get_sliding_window( + sliding_window_type, + queue_length=window_size, + threshold=window_threshold, + abs_threshold=abs_threshold, + ) detector = Detector(metric_name, threshold, sliding_window) disk_detector.add_detector(detector) - logging.info(f'disk: [{disk}] add detector:\n [{disk_detector}]') + logging.info(f"disk: [{disk}] add detector:\n [{disk_detector}]") self._disk_detectors[disk] = disk_detector def launch(self): @@ -138,14 +178,17 @@ class SlowIODetection: logging.debug("step3. 
Report slow io event to sysSentry.") for slow_io_event in slow_io_event_list: metric_name: MetricName = slow_io_event[1] + window_info = slow_io_event[2] + root_cause = slow_io_event[3] alarm_content = { - "driver_name": f"{metric_name.get_disk_name()}", - "reason": "disk_slow", - "block_stack": f"{metric_name.get_stage_name()}", - "io_type": f"{metric_name.get_io_access_type_name()}", + "driver_name": f"{metric_name.disk_name}", + "reason": root_cause, + "block_stack": f"{metric_name.stage_name}", + "io_type": f"{metric_name.io_access_type_name}", "alarm_source": "ai_block_io", "alarm_type": "latency", - "details": f"current window is: {slow_io_event[2]}, threshold is: {slow_io_event[3]}.", + "details": f"disk type: {metric_name.disk_type}, current window: {window_info[1]}, " + f"ai threshold: {window_info[2]}, abs threshold: {window_info[3]}.", } Xalarm.major(alarm_content) logging.warning(alarm_content) diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py index a357766..3388cd4 100644 --- a/src/python/sentryPlugins/ai_block_io/config_parser.py +++ b/src/python/sentryPlugins/ai_block_io/config_parser.py @@ -20,59 +20,62 @@ from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_lo LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" -ALL_STAGE_LIST = ['throtl', 'wbt', 'gettag', 'plug', 'deadline', 'hctx', 'requeue', 'rq_driver', 'bio'] -ALL_IOTPYE_LIST = ['read', 'write'] +ALL_STAGE_LIST = [ + "throtl", + "wbt", + "gettag", + "plug", + "deadline", + "hctx", + "requeue", + "rq_driver", + "bio", +] +ALL_IOTPYE_LIST = ["read", "write"] +DISK_TYPE_MAP = { + 0: "nvme_ssd", + 1: "sata_ssd", + 2: "sata_hdd", +} def init_log_format(log_level: str): logging.basicConfig(level=get_log_level(log_level.lower()), format=LOG_FORMAT) if log_level.lower() not in ("info", "warning", "error", "debug"): logging.warning( - f"the log_level: {log_level} you set is invalid, use default value: info." 
+ "the log_level: %s you set is invalid, use default value: info.", log_level ) class ConfigParser: - DEFAULT_ABSOLUTE_THRESHOLD = 40 - DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1 - DEFAULT_LOG_LEVEL = "info" - - DEFAULT_STAGE = 'throtl,wbt,gettag,plug,deadline,hctx,requeue,rq_driver,bio' - DEFAULT_IOTYPE = 'read,write' - - DEFAULT_ALGORITHM_TYPE = "boxplot" - DEFAULT_TRAIN_DATA_DURATION = 24 - DEFAULT_TRAIN_UPDATE_DURATION = 2 - DEFAULT_BOXPLOT_PARAMETER = 1.5 - DEFAULT_N_SIGMA_PARAMETER = 3 - - DEFAULT_SLIDING_WINDOW_TYPE = "not_continuous" - DEFAULT_WINDOW_SIZE = 30 - DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6 + DEFAULT_CONF = { + "log": {"level": "info"}, + "common": { + "slow_io_detect_frequency": 1, + "disk": None, + "stage": "throtl,wbt,gettag,plug,deadline,hctx,requeue,rq_driver,bio", + "iotype": "read,write", + }, + "algorithm": { + "train_data_duration": 24.0, + "train_update_duration": 2.0, + "algorithm_type": get_threshold_type_enum("boxplot"), + "boxplot_parameter": 1.5, + "n_sigma_parameter": 3.0, + }, + "sliding_window": { + "sliding_window_type": get_sliding_window_type_enum("not_continuous"), + "window_size": 30, + "window_minimum_threshold": 6, + }, + "latency_sata_ssd": {"read_tot_lim": 50000, "write_tot_lim": 50000}, + "latency_nvme_ssd": {"read_tot_lim": 500, "write_tot_lim": 500}, + "latency_sata_hdd": {"read_tot_lim": 50000, "write_tot_lim": 50000}, + } def __init__(self, config_file_name): - self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD - self.__slow_io_detect_frequency = ( - ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY - ) - self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL - self.__disks_to_detection = None - self.__stage = ConfigParser.DEFAULT_STAGE - self.__iotype = ConfigParser.DEFAULT_IOTYPE - - self.__algorithm_type = get_threshold_type_enum( - ConfigParser.DEFAULT_ALGORITHM_TYPE - ) - self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION - self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION - self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER - self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER - - self.__sliding_window_type = ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE - self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE - self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD - - self.__config_file_name = config_file_name + self._conf = ConfigParser.DEFAULT_CONF + self._config_file_name = config_file_name def _get_config_value( self, @@ -156,30 +159,21 @@ class ConfigParser: return value - def __read_absolute_threshold(self, items_common: dict): - self.__absolute_threshold = self._get_config_value( - items_common, - "absolute_threshold", - float, - ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD, - gt=0, - ) - - def __read__slow_io_detect_frequency(self, items_common: dict): - self.__slow_io_detect_frequency = self._get_config_value( + def _read_slow_io_detect_frequency(self, items_common: dict): + self._conf["common"]["slow_io_detect_frequency"] = self._get_config_value( items_common, "slow_io_detect_frequency", int, - ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY, + self.DEFAULT_CONF["common"]["slow_io_detect_frequency"], gt=0, le=300, ) - def __read__disks_to_detect(self, items_common: dict): + def _read_disks_to_detect(self, items_common: dict): disks_to_detection = items_common.get("disk") if disks_to_detection is None: logging.warning("config of disk not found, the default value will be used.") - self.__disks_to_detection = None + 
self._conf["common"]["disk"] = None return disks_to_detection = disks_to_detection.strip() if not disks_to_detection: @@ -189,40 +183,46 @@ class ConfigParser: ) exit(1) disk_list = disks_to_detection.split(",") + disk_list = [disk.strip() for disk in disk_list] if len(disk_list) == 1 and disk_list[0] == "default": - self.__disks_to_detection = None + self._conf["common"]["disk"] = None return - self.__disks_to_detection = disk_list + self._conf["common"]["disk"] = disk_list - def __read__train_data_duration(self, items_algorithm: dict): - self.__train_data_duration = self._get_config_value( + def _read_train_data_duration(self, items_algorithm: dict): + self._conf["common"]["train_data_duration"] = self._get_config_value( items_algorithm, "train_data_duration", float, - ConfigParser.DEFAULT_TRAIN_DATA_DURATION, + self.DEFAULT_CONF["algorithm"]["train_data_duration"], gt=0, le=720, ) - def __read__train_update_duration(self, items_algorithm: dict): - default_train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION - if default_train_update_duration > self.__train_data_duration: - default_train_update_duration = self.__train_data_duration / 2 - self.__train_update_duration = self._get_config_value( + def _read_train_update_duration(self, items_algorithm: dict): + default_train_update_duration = self.DEFAULT_CONF["algorithm"][ + "train_update_duration" + ] + if default_train_update_duration > self._conf["common"]["train_data_duration"]: + default_train_update_duration = ( + self._conf["common"]["train_data_duration"] / 2 + ) + self._conf["common"]["train_update_duration"] = self._get_config_value( items_algorithm, "train_update_duration", float, default_train_update_duration, gt=0, - le=self.__train_data_duration, + le=self._conf["common"]["train_data_duration"], ) - def __read__algorithm_type_and_parameter(self, items_algorithm: dict): - algorithm_type = items_algorithm.get( - "algorithm_type", ConfigParser.DEFAULT_ALGORITHM_TYPE - ) - self.__algorithm_type = get_threshold_type_enum(algorithm_type) - if self.__algorithm_type is None: + def _read_algorithm_type_and_parameter(self, items_algorithm: dict): + algorithm_type = items_algorithm.get("algorithm_type") + if algorithm_type is not None: + self._conf["algorithm"]["algorithm_type"] = get_threshold_type_enum( + algorithm_type + ) + if self._conf["algorithm"]["algorithm_type"] is None: logging.critical( "the algorithm_type: %s you set is invalid. ai_block_io plug will exit.", algorithm_type, @@ -231,129 +231,175 @@ class ConfigParser: f"the algorithm_type: {algorithm_type} you set is invalid. ai_block_io plug will exit." 
) exit(1) - - if self.__algorithm_type == ThresholdType.NSigmaThreshold: - self.__n_sigma_parameter = self._get_config_value( + elif self._conf["algorithm"]["algorithm_type"] == ThresholdType.NSigmaThreshold: + self._conf["algorithm"]["n_sigma_parameter"] = self._get_config_value( items_algorithm, "n_sigma_parameter", float, - ConfigParser.DEFAULT_N_SIGMA_PARAMETER, + self.DEFAULT_CONF["algorithm"]["n_sigma_parameter"], gt=0, le=10, ) - elif self.__algorithm_type == ThresholdType.BoxplotThreshold: - self.__boxplot_parameter = self._get_config_value( + elif ( + self._conf["algorithm"]["algorithm_type"] == ThresholdType.BoxplotThreshold + ): + self._conf["algorithm"]["boxplot_parameter"] = self._get_config_value( items_algorithm, "boxplot_parameter", float, - ConfigParser.DEFAULT_BOXPLOT_PARAMETER, + self.DEFAULT_CONF["algorithm"]["boxplot_parameter"], gt=0, le=10, ) - def __read__stage(self, items_algorithm: dict): - stage_str = items_algorithm.get('stage', ConfigParser.DEFAULT_STAGE) - stage_list = stage_str.split(',') - if len(stage_list) == 1 and stage_list[0] == '': - logging.critical('stage value not allow is empty, exiting...') + def _read_stage(self, items_algorithm: dict): + stage_str = items_algorithm.get( + "stage", self.DEFAULT_CONF["common"]["stage"] + ).strip() + stage_list = stage_str.split(",") + stage_list = [stage.strip() for stage in stage_list] + if len(stage_list) == 1 and stage_list[0] == "": + logging.critical("stage value not allow is empty, exiting...") exit(1) - if len(stage_list) == 1 and stage_list[0] == 'default': - logging.warning(f'stage will enable default value: {ConfigParser.DEFAULT_STAGE}') - self.__stage = ALL_STAGE_LIST + if len(stage_list) == 1 and stage_list[0] == "default": + logging.warning( + "stage will enable default value: %s", + self.DEFAULT_CONF["common"]["stage"], + ) + self._conf["common"]["stage"] = ALL_STAGE_LIST return for stage in stage_list: if stage not in ALL_STAGE_LIST: - logging.critical(f'stage: {stage} is not valid stage, ai_block_io will exit...') + logging.critical( + "stage: %s is not valid stage, ai_block_io will exit...", stage + ) exit(1) dup_stage_list = set(stage_list) - if 'bio' not in dup_stage_list: - logging.critical('stage must contains bio stage, exiting...') + if "bio" not in dup_stage_list: + logging.critical("stage must contains bio stage, exiting...") exit(1) - self.__stage = dup_stage_list - - def __read__iotype(self, items_algorithm: dict): - iotype_str = items_algorithm.get('iotype', ConfigParser.DEFAULT_IOTYPE) - iotype_list = iotype_str.split(',') - if len(iotype_list) == 1 and iotype_list[0] == '': - logging.critical('iotype value not allow is empty, exiting...') + self._conf["common"]["stage"] = dup_stage_list + + def _read_iotype(self, items_algorithm: dict): + iotype_str = items_algorithm.get( + "iotype", self.DEFAULT_CONF["common"]["iotype"] + ).strip() + iotype_list = iotype_str.split(",") + iotype_list = [iotype.strip() for iotype in iotype_list] + if len(iotype_list) == 1 and iotype_list[0] == "": + logging.critical("iotype value not allow is empty, exiting...") exit(1) - if len(iotype_list) == 1 and iotype_list[0] == 'default': - logging.warning(f'iotype will enable default value: {ConfigParser.DEFAULT_IOTYPE}') - self.__iotype = ALL_IOTPYE_LIST + if len(iotype_list) == 1 and iotype_list[0] == "default": + logging.warning( + "iotype will enable default value: %s", + self.DEFAULT_CONF["common"]["iotype"], + ) + self._conf["common"]["iotype"] = ALL_IOTPYE_LIST return for iotype in iotype_list: if iotype 
not in ALL_IOTPYE_LIST: - logging.critical(f'iotype: {iotype} is not valid iotype, ai_block_io will exit...') + logging.critical( + "iotype: %s is not valid iotype, ai_block_io will exit...", iotype + ) exit(1) dup_iotype_list = set(iotype_list) - self.__iotype = dup_iotype_list + self._conf["common"]["iotype"] = dup_iotype_list + + def _read_sliding_window_type(self, items_sliding_window: dict): + sliding_window_type = items_sliding_window.get("sliding_window_type") + if sliding_window_type is not None: + self._conf["sliding_window"]["sliding_window_type"] = ( + get_sliding_window_type_enum(sliding_window_type) + ) + if self._conf["sliding_window"]["sliding_window_type"] is None: + logging.critical( + "the sliding_window_type: %s you set is invalid. ai_block_io plug will exit.", + sliding_window_type, + ) + Report.report_pass( + f"the sliding_window_type: {sliding_window_type} you set is invalid. ai_block_io plug will exit." + ) + exit(1) - def __read__window_size(self, items_sliding_window: dict): - self.__window_size = self._get_config_value( + def _read_window_size(self, items_sliding_window: dict): + self._conf["sliding_window"]["window_size"] = self._get_config_value( items_sliding_window, "window_size", int, - ConfigParser.DEFAULT_WINDOW_SIZE, + self.DEFAULT_CONF["sliding_window"]["window_size"], gt=0, le=3600, ) - def __read__window_minimum_threshold(self, items_sliding_window: dict): - default_window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD - if default_window_minimum_threshold > self.__window_size: - default_window_minimum_threshold = self.__window_size / 2 - self.__window_minimum_threshold = self._get_config_value( - items_sliding_window, - "window_minimum_threshold", - int, - default_window_minimum_threshold, - gt=0, - le=self.__window_size, + def _read_window_minimum_threshold(self, items_sliding_window: dict): + default_window_minimum_threshold = self.DEFAULT_CONF["sliding_window"][ + "window_minimum_threshold" + ] + if ( + default_window_minimum_threshold + > self._conf["sliding_window"]["window_size"] + ): + default_window_minimum_threshold = ( + self._conf["sliding_window"]["window_size"] / 2 + ) + self._conf["sliding_window"]["window_minimum_threshold"] = ( + self._get_config_value( + items_sliding_window, + "window_minimum_threshold", + int, + default_window_minimum_threshold, + gt=0, + le=self._conf["sliding_window"]["window_size"], + ) ) def read_config_from_file(self): - if not os.path.exists(self.__config_file_name): - init_log_format(self.__log_level) + if not os.path.exists(self._config_file_name): + init_log_format(self._conf["log"]["level"]) logging.critical( "config file %s not found, ai_block_io plug will exit.", - self.__config_file_name, + self._config_file_name, ) Report.report_pass( - f"config file {self.__config_file_name} not found, ai_block_io plug will exit." + f"config file {self._config_file_name} not found, ai_block_io plug will exit." ) exit(1) con = configparser.ConfigParser() try: - con.read(self.__config_file_name, encoding="utf-8") + con.read(self._config_file_name, encoding="utf-8") except configparser.Error as e: - init_log_format(self.__log_level) + init_log_format(self._conf["log"]["level"]) logging.critical( - f"config file read error: %s, ai_block_io plug will exit.", e + "config file read error: %s, ai_block_io plug will exit.", e ) Report.report_pass( f"config file read error: {e}, ai_block_io plug will exit." 
) exit(1) - if con.has_section('log'): - items_log = dict(con.items('log')) + if con.has_section("log"): + items_log = dict(con.items("log")) # 情况一:没有log,则使用默认值 # 情况二:有log,值为空或异常,使用默认值 # 情况三:有log,值正常,则使用该值 - self.__log_level = items_log.get('level', ConfigParser.DEFAULT_LOG_LEVEL) - init_log_format(self.__log_level) + self._conf["log"]["level"] = items_log.get( + "level", self.DEFAULT_CONF["log"]["level"] + ) + init_log_format(self._conf["log"]["level"]) else: - init_log_format(self.__log_level) - logging.warning(f"log section parameter not found, it will be set to default value.") + init_log_format(self._conf["log"]["level"]) + logging.warning( + "log section parameter not found, it will be set to default value." + ) if con.has_section("common"): items_common = dict(con.items("common")) - self.__read_absolute_threshold(items_common) - self.__read__slow_io_detect_frequency(items_common) - self.__read__disks_to_detect(items_common) - self.__read__stage(items_common) - self.__read__iotype(items_common) + + self._read_slow_io_detect_frequency(items_common) + self._read_disks_to_detect(items_common) + self._read_stage(items_common) + self._read_iotype(items_common) else: logging.warning( "common section parameter not found, it will be set to default value." @@ -361,9 +407,9 @@ class ConfigParser: if con.has_section("algorithm"): items_algorithm = dict(con.items("algorithm")) - self.__read__train_data_duration(items_algorithm) - self.__read__train_update_duration(items_algorithm) - self.__read__algorithm_type_and_parameter(items_algorithm) + self._read_train_data_duration(items_algorithm) + self._read_train_update_duration(items_algorithm) + self._read_algorithm_type_and_parameter(items_algorithm) else: logging.warning( "algorithm section parameter not found, it will be set to default value." @@ -371,101 +417,162 @@ class ConfigParser: if con.has_section("sliding_window"): items_sliding_window = dict(con.items("sliding_window")) - sliding_window_type = items_sliding_window.get( - "sliding_window_type", ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE + + self._read_window_size(items_sliding_window) + self._read_window_minimum_threshold(items_sliding_window) + else: + logging.warning( + "sliding_window section parameter not found, it will be set to default value." + ) + + if con.has_section("latency_sata_ssd"): + items_latency_sata_ssd = dict(con.items("latency_sata_ssd")) + self._conf["latency_sata_ssd"]["read_tot_lim"] = self._get_config_value( + items_latency_sata_ssd, + "read_tot_lim", + int, + self.DEFAULT_CONF["latency_sata_ssd"]["read_tot_lim"], + gt=0, ) - self.__sliding_window_type = get_sliding_window_type_enum( - sliding_window_type + self._conf["latency_sata_ssd"]["write_tot_lim"] = self._get_config_value( + items_latency_sata_ssd, + "write_tot_lim", + int, + self.DEFAULT_CONF["latency_sata_ssd"]["write_tot_lim"], + gt=0, ) - self.__read__window_size(items_sliding_window) - self.__read__window_minimum_threshold(items_sliding_window) else: logging.warning( - "sliding_window section parameter not found, it will be set to default value." + "latency_sata_ssd section parameter not found, it will be set to default value." 
+ ) + if con.has_section("latency_nvme_ssd"): + items_latency_nvme_ssd = dict(con.items("latency_nvme_ssd")) + self._conf["latency_nvme_ssd"]["read_tot_lim"] = self._get_config_value( + items_latency_nvme_ssd, + "read_tot_lim", + int, + self.DEFAULT_CONF["latency_nvme_ssd"]["read_tot_lim"], + gt=0, + ) + self._conf["latency_nvme_ssd"]["write_tot_lim"] = self._get_config_value( + items_latency_nvme_ssd, + "write_tot_lim", + int, + self.DEFAULT_CONF["latency_nvme_ssd"]["write_tot_lim"], + gt=0, + ) + else: + logging.warning( + "latency_nvme_ssd section parameter not found, it will be set to default value." + ) + if con.has_section("latency_sata_hdd"): + items_latency_sata_hdd = dict(con.items("latency_sata_hdd")) + self._conf["latency_sata_hdd"]["read_tot_lim"] = self._get_config_value( + items_latency_sata_hdd, + "read_tot_lim", + int, + self.DEFAULT_CONF["latency_sata_hdd"]["read_tot_lim"], + gt=0, + ) + self._conf["latency_sata_hdd"]["write_tot_lim"] = self._get_config_value( + items_latency_sata_hdd, + "write_tot_lim", + int, + self.DEFAULT_CONF["latency_sata_hdd"]["write_tot_lim"], + gt=0, + ) + else: + logging.warning( + "latency_sata_hdd section parameter not found, it will be set to default value." ) self.__print_all_config_value() - def __repr__(self): - config_str = { - 'log.level': self.__log_level, - 'common.absolute_threshold': self.__absolute_threshold, - 'common.slow_io_detect_frequency': self.__slow_io_detect_frequency, - 'common.disk': self.__disks_to_detection, - 'common.stage': self.__stage, - 'common.iotype': self.__iotype, - 'algorithm.train_data_duration': self.__train_data_duration, - 'algorithm.train_update_duration': self.__train_update_duration, - 'algorithm.algorithm_type': self.__algorithm_type, - 'algorithm.boxplot_parameter': self.__boxplot_parameter, - 'algorithm.n_sigma_parameter': self.__n_sigma_parameter, - 'sliding_window.sliding_window_type': self.__sliding_window_type, - 'sliding_window.window_size': self.__window_size, - 'sliding_window.window_minimum_threshold': self.__window_minimum_threshold - } - return str(config_str) + def __repr__(self) -> str: + return str(self._conf) + + def __str__(self) -> str: + return str(self._conf) def __print_all_config_value(self): - logging.info(f"all config is follow:\n {self}") + logging.info("all config is follow:\n %s", self) + + def get_tot_lim(self, disk_type, io_type): + if io_type == "read": + return self._conf.get( + f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {} + ).get("read_tot_lim", None) + elif io_type == "write": + return self._conf.get( + f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {} + ).get("write_tot_lim", None) + else: + return None def get_train_data_duration_and_train_update_duration(self): - return self.__train_data_duration, self.__train_update_duration + return ( + self._conf["common"]["train_data_duration"], + self._conf["common"]["train_update_duration"], + ) def get_window_size_and_window_minimum_threshold(self): - return self.__window_size, self.__window_minimum_threshold + return ( + self._conf["sliding_window"]["window_size"], + self._conf["sliding_window"]["window_minimum_threshold"], + ) @property def slow_io_detect_frequency(self): - return self.__slow_io_detect_frequency + return self._conf["common"]["slow_io_detect_frequency"] @property def algorithm_type(self): - return self.__algorithm_type + return self._conf["algorithm"]["algorithm_type"] @property def sliding_window_type(self): - return self.__sliding_window_type + return self._conf["sliding_window"]["sliding_window_type"] 
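
For reference, the per-disk-type limits read above are resolved through get_tot_lim(); the snippet below is a minimal standalone sketch of that lookup, not part of the patch — DISK_TYPE_MAP and the default values mirror this patch, while the harness itself (the module-level conf dict and asserts) is illustrative only:

    # Illustrative harness for the get_tot_lim() lookup; the values are the
    # shipped defaults from ai_block_io.ini, not authoritative.
    DISK_TYPE_MAP = {0: "nvme_ssd", 1: "sata_ssd", 2: "sata_hdd"}

    conf = {
        "latency_nvme_ssd": {"read_tot_lim": 500, "write_tot_lim": 500},
        "latency_sata_ssd": {"read_tot_lim": 50000, "write_tot_lim": 50000},
        "latency_sata_hdd": {"read_tot_lim": 50000, "write_tot_lim": 50000},
    }

    def get_tot_lim(disk_type, io_type):
        # An unknown disk type or io type falls through to None, which the
        # caller treats as "no absolute threshold for this detector".
        if io_type not in ("read", "write"):
            return None
        section = "latency_" + DISK_TYPE_MAP.get(disk_type, "")
        return conf.get(section, {}).get(io_type + "_tot_lim")

    assert get_tot_lim(0, "read") == 500      # nvme_ssd
    assert get_tot_lim(2, "write") == 50000   # sata_hdd
    assert get_tot_lim(9, "read") is None     # unknown disk type
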
@property def train_data_duration(self): - return self.__train_data_duration + return self._conf["common"]["train_data_duration"] @property def train_update_duration(self): - return self.__train_update_duration + return self._conf["common"]["train_update_duration"] @property def window_size(self): - return self.__window_size + return self._conf["sliding_window"]["window_size"] @property def window_minimum_threshold(self): - return self.__window_minimum_threshold + return self._conf["sliding_window"]["window_minimum_threshold"] @property def absolute_threshold(self): - return self.__absolute_threshold + return self._conf["common"]["absolute_threshold"] @property def log_level(self): - return self.__log_level + return self._conf["log"]["level"] @property def disks_to_detection(self): - return self.__disks_to_detection + return self._conf["common"]["disk"] @property def stage(self): - return self.__stage + return self._conf["common"]["stage"] @property def iotype(self): - return self.__iotype + return self._conf["common"]["iotype"] @property def boxplot_parameter(self): - return self.__boxplot_parameter + return self._conf["algorithm"]["boxplot_parameter"] @property def n_sigma_parameter(self): - return self.__n_sigma_parameter + return self._conf["algorithm"]["n_sigma_parameter"] diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py index ed997e6..1bc5ed8 100644 --- a/src/python/sentryPlugins/ai_block_io/data_access.py +++ b/src/python/sentryPlugins/ai_block_io/data_access.py @@ -16,6 +16,7 @@ from sentryCollector.collect_plugin import ( Result_Messages, get_io_data, is_iocollect_valid, + get_disk_type ) diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py index e710ddd..87bd1dd 100644 --- a/src/python/sentryPlugins/ai_block_io/detector.py +++ b/src/python/sentryPlugins/ai_block_io/detector.py @@ -17,9 +17,6 @@ from .utils import get_metric_value_from_io_data_dict_by_metric_name class Detector: - _metric_name: MetricName = None - _threshold: Threshold = None - _slidingWindow: SlidingWindow = None def __init__(self, metric_name: MetricName, threshold: Threshold, sliding_window: SlidingWindow): self._metric_name = metric_name @@ -40,18 +37,24 @@ class Detector: metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) if metric_value is None: logging.debug('not found metric value, so return None.') - return False, None, None + return (False, False), None, None, None logging.debug(f'input metric value: {str(metric_value)}') self._threshold.push_latest_data_to_queue(metric_value) detection_result = self._slidingWindow.is_slow_io_event(metric_value) - logging.debug(f'Detection result: {str(detection_result)}') + # 检测到慢周期,由Detector负责打印info级别日志 + if detection_result[0][1]: + logging.info(f'[abnormal period happen]: disk info: {self._metric_name}, window: {detection_result[1]}, ' + f'current value: {metric_value}, ai threshold: {detection_result[2]}, ' + f'absolute threshold: {detection_result[3]}') + else: + logging.debug(f'Detection result: {str(detection_result)}') logging.debug(f'exit Detector: {self}') return detection_result def __repr__(self): - return (f'disk_name: {self._metric_name.get_disk_name()}, stage_name: {self._metric_name.get_stage_name()},' - f' io_type_name: {self._metric_name.get_io_access_type_name()},' - f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},' + 
return (f'disk_name: {self._metric_name.disk_name}, stage_name: {self._metric_name.stage_name},' + f' io_type_name: {self._metric_name.io_access_type_name},' + f' metric_name: {self._metric_name.metric_name}, threshold_type: {self._threshold},' f' sliding_window_type: {self._slidingWindow}') @@ -65,13 +68,38 @@ class DiskDetector: self._detector_list.append(detector) def is_slow_io_event(self, io_data_dict_with_disk_name: dict): - # 只有bio阶段发生异常,就认为发生了慢IO事件 - # todo:根因诊断 + """ + 根因诊断逻辑:只有bio阶段发生异常,才认为发生了慢IO事件,即bio阶段异常是慢IO事件的必要条件 + 情况一:bio异常,rq_driver也异常,则慢盘 + 情况二:bio异常,rq_driver无异常,且有内核IO栈任意阶段异常,则IO栈异常 + 情况三:bio异常,rq_driver无异常,且无内核IO栈任意阶段异常,则IO压力大 + 情况四:bio异常,则UNKNOWN + """ + diagnosis_info = {"bio": [], "rq_driver": [], "io_stage": []} for detector in self._detector_list: + # result返回内容:(是否检测到慢IO,是否检测到慢周期)、窗口、ai阈值、绝对阈值 + # 示例: (False, False), self._io_data_queue, self._ai_threshold, self._abs_threshold result = detector.is_slow_io_event(io_data_dict_with_disk_name) - if result[0] and detector.get_metric_name().get_stage_name() == 'bio': - return result[0], detector.get_metric_name(), result[1], result[2] - return False, None, None, None + if result[0][0]: + if detector.get_metric_name().stage_name == "bio": + diagnosis_info["bio"].append((detector.get_metric_name(), result)) + elif detector.get_metric_name().stage_name == "rq_driver": + diagnosis_info["rq_driver"].append((detector.get_metric_name(), result)) + else: + diagnosis_info["io_stage"].append((detector.get_metric_name(), result)) + + # 返回内容:(1)是否检测到慢IO事件、(2)MetricName、(3)滑动窗口及阈值、(4)慢IO事件根因 + root_cause = None + if len(diagnosis_info["bio"]) == 0: + return False, None, None, None + elif len(diagnosis_info["rq_driver"]) != 0: + root_cause = "[Root Cause:disk slow]" + elif len(diagnosis_info["io_stage"]) != 0: + stage = diagnosis_info["io_stage"][0][1].get_stage_name() + root_cause = f"[Root Cause:io stage slow, stage: {stage}]" + if root_cause is None: + root_cause = "[Root Cause:high io pressure]" + return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause def __repr__(self): msg = f'disk: {self._disk_name}, ' diff --git a/src/python/sentryPlugins/ai_block_io/io_data.py b/src/python/sentryPlugins/ai_block_io/io_data.py index 0e17051..d341b55 100644 --- a/src/python/sentryPlugins/ai_block_io/io_data.py +++ b/src/python/sentryPlugins/ai_block_io/io_data.py @@ -45,30 +45,10 @@ class IOData: time_stamp: float = field(default_factory=lambda: datetime.now().timestamp()) +@dataclass(frozen=True) class MetricName: - _disk_name: str = None - _stage_name: str = None - _io_access_type_name: str = None - _metric_name: str = None - - def __init__(self, disk_name: str, stage_name: str, io_access_type_name: str, metric_name: str): - self._disk_name = disk_name - self._stage_name = stage_name - self._io_access_type_name = io_access_type_name - self._metric_name = metric_name - - def get_disk_name(self): - return self._disk_name - - def get_stage_name(self): - return self._stage_name - - def get_io_access_type_name(self): - return self._io_access_type_name - - def get_metric_name(self): - return self._metric_name - - def __repr__(self): - return (f'disk: {self._disk_name}, stage: {self._stage_name}, io_access_type: {self._io_access_type_name},' - f'metric: {self._metric_name}') + disk_name: str + disk_type: str + stage_name: str + io_access_type_name: str + metric_name: str diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py index 89191e5..d7c402a 
100644 --- a/src/python/sentryPlugins/ai_block_io/sliding_window.py +++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py @@ -21,15 +21,11 @@ class SlidingWindowType(Enum): class SlidingWindow: - _ai_threshold = None - _queue_length = None - _queue_threshold = None - _io_data_queue: list = None - _io_data_queue_abnormal_tag: list = None - - def __init__(self, queue_length: int, threshold: int): + def __init__(self, queue_length: int, threshold: int, abs_threshold: int = None): self._queue_length = queue_length self._queue_threshold = threshold + self._ai_threshold = None + self._abs_threshold = abs_threshold self._io_data_queue = [] self._io_data_queue_abnormal_tag = [] @@ -38,7 +34,12 @@ class SlidingWindow: self._io_data_queue.pop(0) self._io_data_queue_abnormal_tag.pop(0) self._io_data_queue.append(data) - self._io_data_queue_abnormal_tag.append(data >= self._ai_threshold if self._ai_threshold is not None else False) + tag = False + if ((self._ai_threshold is not None and data >= self._ai_threshold) or + (self._abs_threshold is not None and data >= self._abs_threshold)): + tag = True + self._io_data_queue_abnormal_tag.append(tag) + return tag def update(self, threshold): if self._ai_threshold == threshold: @@ -49,7 +50,7 @@ class SlidingWindow: self._io_data_queue_abnormal_tag.append(data >= self._ai_threshold) def is_slow_io_event(self, data): - return False, None, None + return False, None, None, None def __repr__(self): return "[SlidingWindow]" @@ -57,12 +58,13 @@ class SlidingWindow: class NotContinuousSlidingWindow(SlidingWindow): def is_slow_io_event(self, data): - super().push(data) - if len(self._io_data_queue) < self._queue_length or self._ai_threshold is None: - return False, self._io_data_queue, self._ai_threshold + is_abnormal_period = super().push(data) + is_slow_io_event = False + if len(self._io_data_queue) < self._queue_length or (self._ai_threshold is None and self._abs_threshold is None): + is_slow_io_event = False if self._io_data_queue_abnormal_tag.count(True) >= self._queue_threshold: - return True, self._io_data_queue, self._ai_threshold - return False, self._io_data_queue, self._ai_threshold + is_slow_io_event = True + return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold def __repr__(self): return f"[NotContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]" @@ -70,18 +72,20 @@ class NotContinuousSlidingWindow(SlidingWindow): class ContinuousSlidingWindow(SlidingWindow): def is_slow_io_event(self, data): - super().push(data) - if len(self._io_data_queue) < self._queue_length or self._ai_threshold is None: - return False, self._io_data_queue, self._ai_threshold + is_abnormal_period = super().push(data) + is_slow_io_event = False + if len(self._io_data_queue) < self._queue_length or (self._ai_threshold is None and self._abs_threshold is None): + is_slow_io_event = False consecutive_count = 0 for tag in self._io_data_queue_abnormal_tag: if tag: consecutive_count += 1 if consecutive_count >= self._queue_threshold: - return True, self._io_data_queue, self._ai_threshold + is_slow_io_event = True + break else: consecutive_count = 0 - return False, self._io_data_queue, self._ai_threshold + return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold def __repr__(self): return f"[ContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]" @@ -89,20 +93,23 @@ class 
ContinuousSlidingWindow(SlidingWindow): class MedianSlidingWindow(SlidingWindow): def is_slow_io_event(self, data): - super().push(data) - if len(self._io_data_queue) < self._queue_length or self._ai_threshold is None: - return False, self._io_data_queue, self._ai_threshold + is_abnormal_period = super().push(data) + is_slow_io_event = False + if len(self._io_data_queue) < self._queue_length or (self._ai_threshold is None and self._abs_threshold is None): + is_slow_io_event = False median = np.median(self._io_data_queue) if median >= self._ai_threshold: - return True, self._io_data_queue, self._ai_threshold - return False, self._io_data_queue, self._ai_threshold + is_slow_io_event = True + return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold def __repr__(self): return f"[MedianSlidingWindow, window size: {self._queue_length}]" class SlidingWindowFactory: - def get_sliding_window(self, sliding_window_type: SlidingWindowType, *args, **kwargs): + def get_sliding_window( + self, sliding_window_type: SlidingWindowType, *args, **kwargs + ): if sliding_window_type == SlidingWindowType.NotContinuousSlidingWindow: return NotContinuousSlidingWindow(*args, **kwargs) elif sliding_window_type == SlidingWindowType.ContinuousSlidingWindow: diff --git a/src/python/sentryPlugins/ai_block_io/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py index 0ed37b9..d6f4067 100644 --- a/src/python/sentryPlugins/ai_block_io/utils.py +++ b/src/python/sentryPlugins/ai_block_io/utils.py @@ -19,53 +19,57 @@ from .io_data import MetricName, IOData def get_threshold_type_enum(algorithm_type: str): - if algorithm_type.lower() == 'absolute': + if algorithm_type.lower() == "absolute": return ThresholdType.AbsoluteThreshold - if algorithm_type.lower() == 'boxplot': + if algorithm_type.lower() == "boxplot": return ThresholdType.BoxplotThreshold - if algorithm_type.lower() == 'n_sigma': + if algorithm_type.lower() == "n_sigma": return ThresholdType.NSigmaThreshold return None def get_sliding_window_type_enum(sliding_window_type: str): - if sliding_window_type.lower() == 'not_continuous': + if sliding_window_type.lower() == "not_continuous": return SlidingWindowType.NotContinuousSlidingWindow - if sliding_window_type.lower() == 'continuous': + if sliding_window_type.lower() == "continuous": return SlidingWindowType.ContinuousSlidingWindow - if sliding_window_type.lower() == 'median': + if sliding_window_type.lower() == "median": return SlidingWindowType.MedianSlidingWindow - logging.warning(f"the sliding window type: {sliding_window_type} you set is invalid, use default value: not_continuous") - return SlidingWindowType.NotContinuousSlidingWindow + return None -def get_metric_value_from_io_data_dict_by_metric_name(io_data_dict: dict, metric_name: MetricName): +def get_metric_value_from_io_data_dict_by_metric_name( + io_data_dict: dict, metric_name: MetricName +): try: - io_data: IOData = io_data_dict[metric_name.get_disk_name()] - io_stage_data = asdict(io_data)[metric_name.get_stage_name()] - base_data = io_stage_data[metric_name.get_io_access_type_name()] - metric_value = base_data[metric_name.get_metric_name()] + io_data: IOData = io_data_dict[metric_name.disk_name] + io_stage_data = asdict(io_data)[metric_name.stage_name] + base_data = io_stage_data[metric_name.io_access_type_name] + metric_value = base_data[metric_name.metric_name] return metric_value except KeyError: return None -def get_data_queue_size_and_update_size(training_data_duration: float, 
train_update_duration: float, - slow_io_detect_frequency: int): +def get_data_queue_size_and_update_size( + training_data_duration: float, + train_update_duration: float, + slow_io_detect_frequency: int, +): data_queue_size = int(training_data_duration * 60 * 60 / slow_io_detect_frequency) update_size = int(train_update_duration * 60 * 60 / slow_io_detect_frequency) return data_queue_size, update_size def get_log_level(log_level: str): - if log_level.lower() == 'debug': + if log_level.lower() == "debug": return logging.DEBUG - elif log_level.lower() == 'info': + elif log_level.lower() == "info": return logging.INFO - elif log_level.lower() == 'warning': + elif log_level.lower() == "warning": return logging.WARNING - elif log_level.lower() == 'error': + elif log_level.lower() == "error": return logging.ERROR - elif log_level.lower() == 'critical': + elif log_level.lower() == "critical": return logging.CRITICAL return logging.INFO -- Gitee From d127200c9dde34fc3c129def9e277486aff5dc3f Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Sat, 12 Oct 2024 16:06:32 +0800 Subject: [PATCH 45/76] update collect log --- src/python/sentryCollector/collect_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py index 2b10cde..945ccbc 100644 --- a/src/python/sentryCollector/collect_io.py +++ b/src/python/sentryCollector/collect_io.py @@ -156,7 +156,7 @@ class CollectIo(): for line in file: count += line.count('.op=' + Io_Category[category].upper()) if count > 0: - logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}") + logging.info(f"io_dump info : {disk_name}, {stage}, {Io_Category[category]}, {count}") except FileNotFoundError: logging.error("The file %s does not exist.", io_dump_file) return count @@ -318,7 +318,7 @@ class CollectIo(): curr_io_length = self.get_ebpf_io_length(curr_latency=curr_latency, prev_latency=prev_latency) curr_io_dump = self.get_ebpf_io_dump(curr_io_dump_count=curr_io_dump_count, prev_io_dump_count=prev_io_dump_count) if curr_io_dump > 0: - logging.info(f"ebpf io_dump info : {disk_name}, {stage}, {category}, {curr_io_dump}") + logging.info(f"ebpf io_dump info : {disk_name}, {stage}, {io_type}, {curr_io_dump}") IO_GLOBAL_DATA[disk_name][stage][io_type].insert(0, [curr_lat, curr_io_dump, curr_io_length, curr_iops]) logging.debug(f"ebpf collect data : {IO_GLOBAL_DATA}") elapsed_time = time.time() - start_time -- Gitee From 70e2e973dc3c81f5c095c4c87c94cd0b0ff9d9c1 Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Sat, 12 Oct 2024 17:57:01 +0800 Subject: [PATCH 46/76] modify abnormal stack when the disk field is not configured --- src/python/sentryCollector/collect_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py index 5aa38ec..7ca9898 100644 --- a/src/python/sentryCollector/collect_config.py +++ b/src/python/sentryCollector/collect_config.py @@ -127,9 +127,9 @@ class CollectConfig: CONF_IO, CONF_IO_MAX_SAVE, CONF_IO_MAX_SAVE_DEFAULT) result_io_config[CONF_IO_MAX_SAVE] = CONF_IO_MAX_SAVE_DEFAULT # disk - disk = io_map_value.get(CONF_IO_DISK).lower() + disk = io_map_value.get(CONF_IO_DISK) if disk: - disk_str = disk.replace(" ", "") + disk_str = disk.lower().replace(" ", "") pattern = r'^[a-zA-Z0-9-_,]+$' if not re.match(pattern, disk_str): logging.warning("module_name = %s section, field = %s is incorrect, use default %s", -- Gitee From 
6edaefc73088e5878d7b5f416dc19e1922890e6b Mon Sep 17 00:00:00 2001 From: jinsaihang Date: Sat, 12 Oct 2024 16:51:37 +0800 Subject: [PATCH 47/76] precise alarm query time Signed-off-by: jinsaihang --- src/python/syssentry/alarm.py | 25 +++++++++++++++++++++++-- src/python/syssentry/load_mods.py | 3 ++- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py index fd379d7..117acf5 100644 --- a/src/python/syssentry/alarm.py +++ b/src/python/syssentry/alarm.py @@ -76,6 +76,18 @@ def update_alarm_list(alarm_info: Xalarm): finally: alarm_list_lock.release() +def check_alarm_id_if_number(alarm_id): + if isinstance(alarm_id, int): + return True + else: + return False + +def check_alarm_clear_time_if_positive_integer(alarm_clear_time): + if isinstance(alarm_clear_time, int) and alarm_clear_time > 0: + return True + else: + return False + def alarm_register(): logging.debug(f"alarm_register: enter") # 初始化告警ID映射字典、告警老化时间字典 @@ -84,10 +96,16 @@ def alarm_register(): logging.info(f"alarm_register: {task_name} is registered") task = TasksMap.tasks_dict[task_type][task_name] alarm_id = task.alarm_id + if not check_alarm_id_if_number(alarm_id): + logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") + continue if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID: logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") continue alarm_clear_time = task.alarm_clear_time + if not check_alarm_clear_time_if_positive_integer(alarm_clear_time): + logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") + continue try: alarm_clear_time = int(alarm_clear_time) if alarm_clear_time <= 0: @@ -119,6 +137,9 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di logging.debug("task_name does not exist") return [] alarm_id = task_alarm_id_dict[task_name] + clear_time = alarm_id_clear_time_dict[alarm_id] + if clear_time < int(time_range): + return [] if alarm_id not in alarm_list_dict: logging.debug("alarm_id does not exist") return [] @@ -126,10 +147,10 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di logging.debug(f"get_alarm_result: alarm_list of {alarm_id} has {len(alarm_list)} elements") # clear alarm_info older than clear time threshold stop_index = -1 - timestamp = int(datetime.now().timestamp()) + timestamp = datetime.now().timestamp() for i in range(len(alarm_list)): logging.debug(f"timestamp, alarm_list[{i}].timestamp: {timestamp}, {xalarm_gettime(alarm_list[i])}") - if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > int(time_range): + if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > time_range: stop_index = i break if stop_index >= 0: diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py index 7daf17d..f74f165 100644 --- a/src/python/syssentry/load_mods.py +++ b/src/python/syssentry/load_mods.py @@ -203,7 +203,8 @@ def parse_mod_conf(mod_name, mod_conf): if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID): raise ValueError("Invalid alarm_id") except ValueError: - logging.warning("Invalid alarm_id") + task.alarm_id = mod_conf.get(CONF_TASK, CONF_ALARM_ID) + task.alarm_clear_time = mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME) except configparser.NoOptionError: logging.warning("Unset alarm_clear_time, use 15s as default") -- Gitee From e2778f7d6f7113172bc2102e490f55bafe8dbc59 Mon Sep 17 00:00:00 2001 From: zhangnan Date: Mon, 
14 Oct 2024 12:01:27 +0800 Subject: [PATCH 48/76] ebpf fix dead loop --- src/c/ebpf_collector/ebpf_collector.c | 53 +++++++++++++++++---------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/src/c/ebpf_collector/ebpf_collector.c b/src/c/ebpf_collector/ebpf_collector.c index 6e981da..0885a5f 100644 --- a/src/c/ebpf_collector/ebpf_collector.c +++ b/src/c/ebpf_collector/ebpf_collector.c @@ -117,41 +117,46 @@ char* find_device_name(dev_t dev) { return device_name; } -static int print_map_res(struct bpf_map *map_res, struct bpf_map *map_res_2, char *stage, int *map_size) -{ - int err; - struct stage_data counter; +static void update_io_dump(struct bpf_map *map_res, int *io_dump, int *map_size, char *stage) { struct time_range_io_count time_count; - int key = 0; - int io_dump[MAP_SIZE] = {0}; u32 io_dump_key = 0, io_dump_next_key = 0; - struct sysinfo info; sysinfo(&info); - while (bpf_map_get_next_key(map_res_2, &io_dump_key, &io_dump_next_key) == 0) { - err = bpf_map_lookup_elem(map_res_2, &io_dump_next_key, &time_count); + while (bpf_map_get_next_key(map_res, &io_dump_key, &io_dump_next_key) == 0) { + int err = bpf_map_lookup_elem(map_res, &io_dump_next_key, &time_count); if (err < 0) { fprintf(stderr, "failed to lookup %s io dump: %d\n", stage, err); - continue; + break; + } + if (io_dump_key == io_dump_next_key) { + break; } + io_dump_key = io_dump_next_key; - if ((info.uptime - io_dump_key) > 2) { + + if ((info.uptime - io_dump_key) >= 2) { int isempty = 1; - for (key = 0; key < map_size; key++){ + for (int key = 0; key < map_size; key++) { if (time_count.count[key] > 0) { io_dump[key] += time_count.count[key]; isempty = 0; } } if (isempty || (info.uptime - io_dump_key) > IO_DUMP_THRESHOLD) { - bpf_map_delete_elem(map_res_2, &io_dump_key); + bpf_map_delete_elem(map_res, &io_dump_key); } } } +} + +static int print_map_res(struct bpf_map *map_res, char *stage, int *map_size, int *io_dump) +{ + struct stage_data counter; + int key = 0; for (key = 0; key < map_size; key++) { - err = bpf_map_lookup_elem(map_res, &key, &counter); + int err = bpf_map_lookup_elem(map_res, &key, &counter); if (err < 0) { fprintf(stderr, "failed to lookup %s map_res: %d\n", stage, err); return -1; @@ -274,19 +279,27 @@ int main(int argc, char **argv) { sleep(1); - err = print_map_res(BLK_RES, BLK_RES_2, "rq_driver", device_count); + int io_dump_blk[MAP_SIZE] = {0}; + update_io_dump(BLK_RES_2, io_dump_blk, device_count,"rq_driver"); + err = print_map_res(BLK_RES, "rq_driver", device_count, io_dump_blk); if (err) break; - err = print_map_res(BIO_RES, BIO_RES_2, "bio", device_count); + int io_dump_bio[MAP_SIZE] = {0}; + update_io_dump(BIO_RES_2, io_dump_bio, device_count,"bio"); + err = print_map_res(BIO_RES, "bio", device_count, io_dump_bio); if (err) break; - - err = print_map_res(TAG_RES, TAG_RES_2, "gettag", device_count); + + int io_dump_tag[MAP_SIZE] = {0}; + update_io_dump(TAG_RES_2, io_dump_tag, device_count,"gettag"); + err = print_map_res(TAG_RES, "gettag", device_count, io_dump_tag); if (err) break; - - err = print_map_res(WBT_RES, WBT_RES_2, "wbt", device_count); + + int io_dump_wbt[MAP_SIZE] = {0}; + update_io_dump(WBT_RES_2, io_dump_wbt, device_count,"wbt"); + err = print_map_res(WBT_RES, "wbt", device_count, io_dump_wbt); if (err) break; -- Gitee From 485a984e02f219b9ff8768a115ae032f9f38cd99 Mon Sep 17 00:00:00 2001 From: jinsaihang Date: Mon, 14 Oct 2024 11:30:58 +0800 Subject: [PATCH 49/76] fix word error Signed-off-by: jinsaihang --- src/python/syssentry/alarm.py | 10 +++++----- 1 file 
changed, 5 insertions(+), 5 deletions(-) diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py index 117acf5..c0b0e8a 100644 --- a/src/python/syssentry/alarm.py +++ b/src/python/syssentry/alarm.py @@ -49,7 +49,7 @@ MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) def update_alarm_list(alarm_info: Xalarm): alarm_id = xalarm_getid(alarm_info) if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID: - logging.warnning(f"Invalid alarm_id {alarm_id}") + logging.warning(f"Invalid alarm_id {alarm_id}") return timestamp = xalarm_gettime(alarm_info) if not timestamp: @@ -97,14 +97,14 @@ def alarm_register(): task = TasksMap.tasks_dict[task_type][task_name] alarm_id = task.alarm_id if not check_alarm_id_if_number(alarm_id): - logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") + logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") continue if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID: - logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") + logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") continue alarm_clear_time = task.alarm_clear_time if not check_alarm_clear_time_if_positive_integer(alarm_clear_time): - logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") + logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") continue try: alarm_clear_time = int(alarm_clear_time) @@ -113,7 +113,7 @@ def alarm_register(): if alarm_clear_time > sys.maxsize: raise ValueError("Exceeds maximum value for int") except (ValueError, OverflowError, TypeError) as e: - logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") + logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") continue alarm_list_dict[alarm_id] = [] task_alarm_id_dict[task_name] = alarm_id -- Gitee From 77367098826890225fd3c4e9b2ff009786c70c5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> Date: Mon, 14 Oct 2024 23:16:46 +0800 Subject: [PATCH 50/76] ai_block_io fix some bugs --- .../sentryPlugins/ai_block_io/ai_block_io.py | 1 + .../ai_block_io/config_parser.py | 20 ++++++++++--------- .../sentryPlugins/ai_block_io/detector.py | 18 ++++++++++++----- .../sentryPlugins/ai_block_io/io_data.py | 2 +- .../sentryPlugins/ai_block_io/threshold.py | 17 +++++++++------- 5 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py index dd661a1..4eecd43 100644 --- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py @@ -55,6 +55,7 @@ class SlowIODetection: Report.report_pass( "get available disk error, please check if the collector plug is enable. exiting..." ) + logging.critical("get available disk error, please check if the collector plug is enable. 
exiting...") exit(1) logging.info(f"ai_block_io plug has found disks: {self._disk_list}") diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py index 3388cd4..7b0cd29 100644 --- a/src/python/sentryPlugins/ai_block_io/config_parser.py +++ b/src/python/sentryPlugins/ai_block_io/config_parser.py @@ -190,7 +190,7 @@ class ConfigParser: self._conf["common"]["disk"] = disk_list def _read_train_data_duration(self, items_algorithm: dict): - self._conf["common"]["train_data_duration"] = self._get_config_value( + self._conf["algorithm"]["train_data_duration"] = self._get_config_value( items_algorithm, "train_data_duration", float, @@ -203,17 +203,17 @@ class ConfigParser: default_train_update_duration = self.DEFAULT_CONF["algorithm"][ "train_update_duration" ] - if default_train_update_duration > self._conf["common"]["train_data_duration"]: + if default_train_update_duration > self._conf["algorithm"]["train_data_duration"]: default_train_update_duration = ( - self._conf["common"]["train_data_duration"] / 2 + self._conf["algorithm"]["train_data_duration"] / 2 ) - self._conf["common"]["train_update_duration"] = self._get_config_value( + self._conf["algorithm"]["train_update_duration"] = self._get_config_value( items_algorithm, "train_update_duration", float, default_train_update_duration, gt=0, - le=self._conf["common"]["train_data_duration"], + le=self._conf["algorithm"]["train_data_duration"], ) def _read_algorithm_type_and_parameter(self, items_algorithm: dict): @@ -401,6 +401,8 @@ class ConfigParser: self._read_stage(items_common) self._read_iotype(items_common) else: + self._conf["common"]["stage"] = ALL_STAGE_LIST + self._conf["common"]["iotype"] = ALL_IOTPYE_LIST logging.warning( "common section parameter not found, it will be set to default value." ) @@ -511,8 +513,8 @@ class ConfigParser: def get_train_data_duration_and_train_update_duration(self): return ( - self._conf["common"]["train_data_duration"], - self._conf["common"]["train_update_duration"], + self._conf["algorithm"]["train_data_duration"], + self._conf["algorithm"]["train_update_duration"], ) def get_window_size_and_window_minimum_threshold(self): @@ -535,11 +537,11 @@ class ConfigParser: @property def train_data_duration(self): - return self._conf["common"]["train_data_duration"] + return self._conf["algorithm"]["train_data_duration"] @property def train_update_duration(self): - return self._conf["common"]["train_update_duration"] + return self._conf["algorithm"]["train_update_duration"] @property def window_size(self): diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py index 87bd1dd..5b21714 100644 --- a/src/python/sentryPlugins/ai_block_io/detector.py +++ b/src/python/sentryPlugins/ai_block_io/detector.py @@ -9,6 +9,7 @@ # PURPOSE. # See the Mulan PSL v2 for more details. 
import logging +from datetime import datetime from .io_data import MetricName from .threshold import Threshold @@ -21,18 +22,25 @@ class Detector: def __init__(self, metric_name: MetricName, threshold: Threshold, sliding_window: SlidingWindow): self._metric_name = metric_name self._threshold = threshold + # for when threshold update, it can print latest threshold with metric name + self._threshold.set_metric_name(self._metric_name) self._slidingWindow = sliding_window self._threshold.attach_observer(self._slidingWindow) - self._count = 0 + self._count = None def get_metric_name(self): return self._metric_name def is_slow_io_event(self, io_data_dict_with_disk_name: dict): - self._count += 1 - if self._count % 15 == 0: - self._count = 0 - logging.debug(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") + if self._count is None: + self._count = datetime.now() + else: + now_time = datetime.now() + time_diff = (now_time - self._count).total_seconds() + if time_diff >= 60: + logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") + self._count = None + logging.debug(f'enter Detector: {self}') metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) if metric_value is None: diff --git a/src/python/sentryPlugins/ai_block_io/io_data.py b/src/python/sentryPlugins/ai_block_io/io_data.py index d341b55..6042911 100644 --- a/src/python/sentryPlugins/ai_block_io/io_data.py +++ b/src/python/sentryPlugins/ai_block_io/io_data.py @@ -48,7 +48,7 @@ class IOData: @dataclass(frozen=True) class MetricName: disk_name: str - disk_type: str + disk_type: int stage_name: str io_access_type_name: str metric_name: str diff --git a/src/python/sentryPlugins/ai_block_io/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py index 3b7a5a8..600d041 100644 --- a/src/python/sentryPlugins/ai_block_io/threshold.py +++ b/src/python/sentryPlugins/ai_block_io/threshold.py @@ -23,11 +23,6 @@ class ThresholdState(Enum): class Threshold: - threshold = None - data_queue: queue.Queue = None - data_queue_update_size: int = None - new_data_size: int = None - threshold_state: ThresholdState = None def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000): self._observer = None @@ -36,12 +31,16 @@ class Threshold: self.new_data_size = 0 self.threshold_state = ThresholdState.INIT self.threshold = math.inf + self.metric_name = None def set_threshold(self, threshold): self.threshold = threshold self.threshold_state = ThresholdState.START self.notify_observer() + def set_metric_name(self, metric_name): + self.metric_name = metric_name + def get_threshold(self): if self.threshold_state == ThresholdState.INIT: return None @@ -84,6 +83,7 @@ class BoxplotThreshold(Threshold): self.parameter = boxplot_parameter def _update_threshold(self): + old_threshold = self.threshold data = list(self.data_queue.queue) q1 = np.percentile(data, 25) q3 = np.percentile(data, 75) @@ -91,6 +91,7 @@ class BoxplotThreshold(Threshold): self.threshold = q3 + self.parameter * iqr if self.threshold_state == ThresholdState.INIT: self.threshold_state = ThresholdState.START + logging.info(f"MetricName: [{self.metric_name}]'s threshold update, old is: {old_threshold} -> new is: {self.threshold}") self.notify_observer() def push_latest_data_to_queue(self, data): @@ -109,7 +110,7 @@ class BoxplotThreshold(Threshold): self.new_data_size = 0 def __repr__(self): - return f"[BoxplotThreshold, param is: 
{self.parameter}]" + return f"[BoxplotThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]" class NSigmaThreshold(Threshold): @@ -118,12 +119,14 @@ class NSigmaThreshold(Threshold): self.parameter = n_sigma_parameter def _update_threshold(self): + old_threshold = self.threshold data = list(self.data_queue.queue) mean = np.mean(data) std = np.std(data) self.threshold = mean + self.parameter * std if self.threshold_state == ThresholdState.INIT: self.threshold_state = ThresholdState.START + logging.info(f"MetricName: [{self.metric_name}]'s threshold update, old is: {old_threshold} -> new is: {self.threshold}") self.notify_observer() def push_latest_data_to_queue(self, data): @@ -142,7 +145,7 @@ class NSigmaThreshold(Threshold): self.new_data_size = 0 def __repr__(self): - return f"[NSigmaThreshold, param is: {self.parameter}]" + return f"[NSigmaThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]" class ThresholdType(Enum): -- Gitee From db1c81d28c92142cad1d1d3ed65e19d21f621d60 Mon Sep 17 00:00:00 2001 From: gaoruoshu Date: Tue, 15 Oct 2024 10:00:07 +0000 Subject: [PATCH 51/76] refactor config.py and bugfix incorrect slow io report Signed-off-by: gaoruoshu --- .../avg_block_io/avg_block_io.py | 155 ++----------- .../sentryPlugins/avg_block_io/config.py | 208 ++++++++++++++++++ .../sentryPlugins/avg_block_io/module_conn.py | 9 +- .../sentryPlugins/avg_block_io/utils.py | 72 ------ 4 files changed, 238 insertions(+), 206 deletions(-) create mode 100644 src/python/sentryPlugins/avg_block_io/config.py diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py index f3ade09..cd47919 100644 --- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py @@ -13,132 +13,13 @@ import signal import configparser import time +from .config import read_config_log, read_config_common, read_config_algorithm, read_config_latency, read_config_iodump, read_config_stage from .stage_window import IoWindow, IoDumpWindow from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name -from .utils import update_avg_and_check_abnormal, get_log_level, get_section_value -from sentryCollector.collect_plugin import Disk_Type +from .utils import update_avg_and_check_abnormal CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" -def log_invalid_keys(not_in_list, keys_name, config_list, default_list): - """print invalid log""" - if config_list and not_in_list: - logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list)) - elif config_list == ["default"]: - logging.warning("Default {} use {}".format(keys_name, default_list)) - - -def read_config_common(config): - """read config file, get [common] section value""" - if not config.has_section("common"): - report_alarm_fail("Cannot find common section in config file") - - try: - disk_name = config.get("common", "disk") - disk = [] if disk_name == "default" else disk_name.split(",") - except configparser.NoOptionError: - disk = [] - logging.warning("Unset common.disk, set to default") - - try: - stage_name = config.get("common", "stage") - stage = [] if stage_name == "default" else stage_name.split(",") - except configparser.NoOptionError: - stage = [] - logging.warning("Unset common.stage, 
set to default") - - if len(disk) > 10: - logging.warning("Too many common.disks, record only max 10 disks") - disk = disk[:10] - - try: - iotype_name = config.get("common", "iotype").split(",") - iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']] - err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']] - - if err_iotype: - report_alarm_fail("Invalid common.iotype config") - - except configparser.NoOptionError: - iotype_list = ["read", "write"] - logging.warning("Unset common.iotype, set to read,write") - - try: - period_time = int(config.get("common", "period_time")) - if not (1 <= period_time <= 300): - raise ValueError("Invalid period_time") - except ValueError: - report_alarm_fail("Invalid common.period_time") - except configparser.NoOptionError: - period_time = 1 - logging.warning("Unset common.period_time, use 1s as default") - - return period_time, disk, stage, iotype_list - - -def read_config_algorithm(config): - """read config file, get [algorithm] section value""" - if not config.has_section("algorithm"): - report_alarm_fail("Cannot find algorithm section in config file") - - try: - win_size = int(config.get("algorithm", "win_size")) - if not (1 <= win_size <= 300): - raise ValueError("Invalid algorithm.win_size") - except ValueError: - report_alarm_fail("Invalid algorithm.win_size config") - except configparser.NoOptionError: - win_size = 30 - logging.warning("Unset algorithm.win_size, use 30 as default") - - try: - win_threshold = int(config.get("algorithm", "win_threshold")) - if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size: - raise ValueError("Invalid algorithm.win_threshold") - except ValueError: - report_alarm_fail("Invalid algorithm.win_threshold config") - except configparser.NoOptionError: - win_threshold = 6 - logging.warning("Unset algorithm.win_threshold, use 6 as default") - - return win_size, win_threshold - - -def read_config_latency(config): - """read config file, get [latency_xxx] section value""" - common_param = {} - for type_name in Disk_Type: - section_name = f"latency_{Disk_Type[type_name]}" - if not config.has_section(section_name): - report_alarm_fail(f"Cannot find {section_name} section in config file") - - common_param[Disk_Type[type_name]] = get_section_value(section_name, config) - return common_param - - -def read_config_iodump(config): - """read config file, get [iodump] section value""" - common_param = {} - section_name = "iodump" - if not config.has_section(section_name): - report_alarm_fail(f"Cannot find {section_name} section in config file") - - return get_section_value(section_name, config) - - -def read_config_stage(config, stage, iotype_list, curr_disk_type): - """read config file, get [STAGE_NAME_diskType] section value""" - res = {} - section_name = f"{stage}_{curr_disk_type}" - if not config.has_section(section_name): - return res - - for key in config[section_name]: - if config[stage][key].isdecimal(): - res[key] = int(config[stage][key]) - - return res - def init_io_win(io_dic, config, common_param): """initialize windows of latency, iodump, and dict of avg_value""" @@ -192,24 +73,33 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage): disk_list = [key for key in all_disk_set if key in config_disk] not_in_disk_list = [key for key in config_disk if key not in all_disk_set] + if not config_disk and not not_in_disk_list: + disk_list = [key for key in all_disk_set] + + if not disk_list: + report_alarm_fail("Cannot get valid disk name") + 
+ disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list + + if not config_disk: + logging.info(f"Default common.disk using disk={disk_list}") + elif sorted(disk_list) != sorted(config_disk): + logging.warning(f"Set common.disk to {disk_list}") + stage_list = [key for key in all_stage_set if key in config_stage] not_in_stage_list = [key for key in config_stage if key not in all_stage_set] if not_in_stage_list: report_alarm_fail(f"Invalid common.stage_list config, cannot set {not_in_stage_list}") - if not config_disk and not not_in_disk_list: - disk_list = [key for key in all_disk_set] - - if not config_stage and not not_in_stage_list: + if not config_stage: stage_list = [key for key in all_stage_set] - disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list - - if not stage_list or not disk_list: - report_alarm_fail("Cannot get valid disk name or stage name.") + if not stage_list: + report_alarm_fail("Cannot get valid stage name.") - log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list) + if not config_stage: + logging.info(f"Default common.stage using stage={stage_list}") return disk_list, stage_list @@ -254,9 +144,8 @@ def main(): signal.signal(signal.SIGINT, sig_handler) signal.signal(signal.SIGTERM, sig_handler) - log_level = get_log_level(CONFIG_FILE) + log_level = read_config_log(CONFIG_FILE) log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" - logging.basicConfig(level=log_level, format=log_format) # 初始化配置读取 @@ -274,6 +163,8 @@ def main(): # 采集模块对接,is_iocollect_valid() io_dic["disk_list"], io_dic["stage_list"] = get_valid_disk_stage_list(io_dic, disk, stage) + logging.debug(f"disk={io_dic['disk_list']}, stage={io_dic['stage_list']}") + if "bio" not in io_dic["stage_list"]: report_alarm_fail("Cannot run avg_block_io without bio stage") diff --git a/src/python/sentryPlugins/avg_block_io/config.py b/src/python/sentryPlugins/avg_block_io/config.py new file mode 100644 index 0000000..c8f45ce --- /dev/null +++ b/src/python/sentryPlugins/avg_block_io/config.py @@ -0,0 +1,208 @@ +import configparser +import logging +import os + +from .module_conn import report_alarm_fail +from sentryCollector.collect_plugin import Disk_Type + + +CONF_LOG = 'log' +CONF_LOG_LEVEL = 'level' +LogLevel = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL +} + +CONF_COMMON = 'common' +CONF_COMMON_DISK = 'disk' +CONF_COMMON_STAGE = 'stage' +CONF_COMMON_IOTYPE = 'iotype' +CONF_COMMON_PER_TIME = 'period_time' + +CONF_ALGO = 'algorithm' +CONF_ALGO_SIZE = 'win_size' +CONF_ALGO_THRE = 'win_threshold' + +CONF_LATENCY = 'latency_{}' +CONF_IODUMP = 'iodump' + + +DEFAULT_PARAM = { + CONF_LOG: { + CONF_LOG_LEVEL: 'info' + }, CONF_COMMON: { + CONF_COMMON_DISK: 'default', + CONF_COMMON_STAGE: 'default', + CONF_COMMON_IOTYPE: 'read,write', + CONF_COMMON_PER_TIME: 1 + }, CONF_ALGO: { + CONF_ALGO_SIZE: 30, + CONF_ALGO_THRE: 6 + }, 'latency_nvme_ssd': { + 'read_avg_lim': 300, + 'write_avg_lim': 300, + 'read_avg_time': 3, + 'write_avg_time': 3, + 'read_tot_lim': 500, + 'write_tot_lim': 500, + }, 'latency_sata_ssd' : { + 'read_avg_lim': 10000, + 'write_avg_lim': 10000, + 'read_avg_time': 3, + 'write_avg_time': 3, + 'read_tot_lim': 50000, + 'write_tot_lim': 50000, + }, 'latency_sata_hdd' : { + 'read_avg_lim': 15000, + 'write_avg_lim': 15000, + 'read_avg_time': 3, + 'write_avg_time': 3, + 'read_tot_lim': 50000, + 'write_tot_lim': 50000 + }, CONF_IODUMP: { + 'read_iodump_lim': 0, + 
'write_iodump_lim': 0 + } +} + + +def get_section_value(section_name, config): + common_param = {} + config_sec = config[section_name] + for config_key in DEFAULT_PARAM[section_name]: + if config_key in config_sec: + if not config_sec[config_key].isdecimal(): + report_alarm_fail(f"Invalid {section_name}.{config_key} config.") + common_param[config_key] = int(config_sec[config_key]) + else: + common_param[config_key] = DEFAULT_PARAM[section_name][config_key] + logging.warning(f"Unset {section_name}.{config_key} in config file, use {common_param[config_key]} as default") + return common_param + + +def read_config_log(filename): + """read config file, get [log] section value""" + default_log_level = DEFAULT_PARAM[CONF_LOG][CONF_LOG_LEVEL] + if not os.path.exists(filename): + return LogLevel.get(default_log_level) + + config = configparser.ConfigParser() + config.read(filename) + + log_level = config.get(CONF_LOG, CONF_LOG_LEVEL, fallback=default_log_level) + if log_level.lower() in LogLevel: + return LogLevel.get(log_level.lower()) + return LogLevel.get(default_log_level) + + +def read_config_common(config): + """read config file, get [common] section value""" + if not config.has_section(CONF_COMMON): + report_alarm_fail(f"Cannot find {CONF_COMMON} section in config file") + + try: + disk_name = config.get(CONF_COMMON, CONF_COMMON_DISK).lower() + disk = [] if disk_name == "default" else disk_name.split(",") + except configparser.NoOptionError: + disk = [] + logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_DISK}, set to default") + + try: + stage_name = config.get(CONF_COMMON, CONF_COMMON_STAGE).lower() + stage = [] if stage_name == "default" else stage_name.split(",") + except configparser.NoOptionError: + stage = [] + logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_STAGE}, set to default") + + if len(disk) > 10: + logging.warning(f"Too many {CONF_COMMON}.disks, record only max 10 disks") + disk = disk[:10] + + try: + iotype_name = config.get(CONF_COMMON, CONF_COMMON_IOTYPE).lower().split(",") + iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']] + err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']] + + if err_iotype: + report_alarm_fail(f"Invalid {CONF_COMMON}.{CONF_COMMON_IOTYPE} config") + + except configparser.NoOptionError: + iotype_list = DEFAULT_PARAM[CONF_COMMON][CONF_COMMON_IOTYPE] + logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_IOTYPE}, use {iotype_list} as default") + + try: + period_time = int(config.get(CONF_COMMON, CONF_COMMON_PER_TIME)) + if not (1 <= period_time <= 300): + raise ValueError("Invalid period_time") + except ValueError: + report_alarm_fail(f"Invalid {CONF_COMMON}.{CONF_COMMON_PER_TIME}") + except configparser.NoOptionError: + period_time = DEFAULT_PARAM[CONF_COMMON][CONF_COMMON_PER_TIME] + logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_PER_TIME}, use {period_time} as default") + + return period_time, disk, stage, iotype_list + + +def read_config_algorithm(config): + """read config file, get [algorithm] section value""" + if not config.has_section(CONF_ALGO): + report_alarm_fail(f"Cannot find {CONF_ALGO} section in config file") + + try: + win_size = int(config.get(CONF_ALGO, CONF_ALGO_SIZE)) + if not (1 <= win_size <= 300): + raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE}") + except ValueError: + report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE} config") + except configparser.NoOptionError: + win_size = DEFAULT_PARAM[CONF_ALGO][CONF_ALGO_SIZE] + logging.warning(f"Unset 
{CONF_ALGO}.{CONF_ALGO_SIZE}, use {win_size} as default") + + try: + win_threshold = int(config.get(CONF_ALGO, CONF_ALGO_THRE)) + if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size: + raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE}") + except ValueError: + report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE} config") + except configparser.NoOptionError: + win_threshold = DEFAULT_PARAM[CONF_ALGO]['win_threshold'] + logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_THRE}, use {win_threshold} as default") + + return win_size, win_threshold + + +def read_config_latency(config): + """read config file, get [latency_xxx] section value""" + common_param = {} + for type_name in Disk_Type: + section_name = CONF_LATENCY.format(Disk_Type[type_name]) + if not config.has_section(section_name): + report_alarm_fail(f"Cannot find {section_name} section in config file") + + common_param[Disk_Type[type_name]] = get_section_value(section_name, config) + return common_param + + +def read_config_iodump(config): + """read config file, get [iodump] section value""" + if not config.has_section(CONF_IODUMP): + report_alarm_fail(f"Cannot find {CONF_IODUMP} section in config file") + + return get_section_value(CONF_IODUMP, config) + + +def read_config_stage(config, stage, iotype_list, curr_disk_type): + """read config file, get [STAGE_NAME_diskType] section value""" + res = {} + section_name = f"{stage}_{curr_disk_type}" + if not config.has_section(section_name): + return res + + for key in config[section_name]: + if config[stage][key].isdecimal(): + res[key] = int(config[stage][key]) + + return res diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py index 8d6f429..cbdaad4 100644 --- a/src/python/sentryPlugins/avg_block_io/module_conn.py +++ b/src/python/sentryPlugins/avg_block_io/module_conn.py @@ -29,12 +29,16 @@ def sig_handler(signum, _f): def avg_get_io_data(io_dic): """get_io_data from sentryCollector""" + logging.debug(f"send to sentryCollector get_io_data: period={io_dic['period_time']}, " + f"disk={io_dic['disk_list']}, stage={io_dic['stage_list']}, iotype={io_dic['iotype_list']}") res = get_io_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"]) return check_result_validation(res, 'get io data') def avg_is_iocollect_valid(io_dic, config_disk, config_stage): """is_iocollect_valid from sentryCollector""" + logging.debug(f"send to sentryCollector is_iocollect_valid: period={io_dic['period_time']}, " + f"disk={config_disk}, stage={config_stage}") res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage) return check_result_validation(res, 'check config validation') @@ -79,7 +83,7 @@ def process_report_data(disk_name, rw, io_data): # io press ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq'] for stage_name in ctrl_stage: - abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data) + abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data) if not abnormal: continue msg["reason"] = "IO press" @@ -117,6 +121,7 @@ def process_report_data(disk_name, rw, io_data): def get_disk_type_by_name(disk_name): + logging.debug(f"send to sentryCollector get_disk_type: disk_name={disk_name}") res = get_disk_type(disk_name) disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}') try: @@ -126,4 +131,4 @@ def get_disk_type_by_name(disk_name): except ValueError: report_alarm_fail(f"Failed to get disk type 
for {disk_name}") - return Disk_Type[curr_disk_type] \ No newline at end of file + return Disk_Type[curr_disk_type] diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py index c381c07..1bfd4e8 100644 --- a/src/python/sentryPlugins/avg_block_io/utils.py +++ b/src/python/sentryPlugins/avg_block_io/utils.py @@ -8,84 +8,12 @@ # IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR # PURPOSE. # See the Mulan PSL v2 for more details. -import configparser import logging import os AVG_VALUE = 0 AVG_COUNT = 1 -CONF_LOG = 'log' -CONF_LOG_LEVEL = 'level' -LogLevel = { - "debug": logging.DEBUG, - "info": logging.INFO, - "warning": logging.WARNING, - "error": logging.ERROR, - "critical": logging.CRITICAL -} - - -DEFAULT_PARAM = { - 'latency_nvme_ssd': { - 'read_avg_lim': 300, - 'write_avg_lim': 300, - 'read_avg_time': 3, - 'write_avg_time': 3, - 'read_tot_lim': 500, - 'write_tot_lim': 500, - }, 'latency_sata_ssd' : { - 'read_avg_lim': 10000, - 'write_avg_lim': 10000, - 'read_avg_time': 3, - 'write_avg_time': 3, - 'read_tot_lim': 50000, - 'write_tot_lim': 50000, - }, 'latency_sata_hdd' : { - 'read_avg_lim': 15000, - 'write_avg_lim': 15000, - 'read_avg_time': 3, - 'write_avg_time': 3, - 'read_tot_lim': 50000, - 'write_tot_lim': 50000 - }, 'iodump': { - 'read_iodump_lim': 0, - 'write_iodump_lim': 0 - } -} - - -def get_section_value(section_name, config): - common_param = {} - config_sec = config[section_name] - for config_key in DEFAULT_PARAM[section_name]: - if config_key in config_sec: - if not config_sec[config_key].isdecimal(): - report_alarm_fail(f"Invalid {section_name}.{config_key} config.") - common_param[config_key] = int(config_sec[config_key]) - else: - logging.warning(f"Unset {section_name}.{config_key} in config file, use {DEFAULT_PARAM[section_name][config_key]} as default") - common_param[config_key] = DEFAULT_PARAM[section_name][config_key] - return common_param - - -def get_log_level(filename): - if not os.path.exists(filename): - return logging.INFO - - try: - config = configparser.ConfigParser() - config.read(filename) - if not config.has_option(CONF_LOG, CONF_LOG_LEVEL): - return logging.INFO - log_level = config.get(CONF_LOG, CONF_LOG_LEVEL) - - if log_level.lower() in LogLevel: - return LogLevel.get(log_level.lower()) - return logging.INFO - except configparser.Error: - return logging.INFO - def get_nested_value(data, keys): """get data from nested dict""" -- Gitee From 4681062b4e5efa066fd4fa5b39dabdc5428c3ca3 Mon Sep 17 00:00:00 2001 From: gaoruoshu Date: Tue, 15 Oct 2024 21:21:10 +0800 Subject: [PATCH 52/76] get_io_data failed wont stop avg_block_io and del disk not support --- src/python/sentryCollector/collect_plugin.py | 14 ++++----- .../avg_block_io/avg_block_io.py | 9 ++++-- .../sentryPlugins/avg_block_io/module_conn.py | 31 +++++++++++++------ 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py index bec405a..53dddec 100644 --- a/src/python/sentryCollector/collect_plugin.py +++ b/src/python/sentryCollector/collect_plugin.py @@ -90,14 +90,14 @@ def client_send_and_recv(request_data, data_str_len, protocol): try: client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) except socket.error: - logging.error("collect_plugin: client create socket error") + logging.debug("collect_plugin: client create socket error") return None try: client_socket.connect(COLLECT_SOCKET_PATH) 
except OSError: client_socket.close() - logging.error("collect_plugin: client connect error") + logging.debug("collect_plugin: client connect error") return None req_data_len = len(request_data) @@ -109,23 +109,23 @@ def client_send_and_recv(request_data, data_str_len, protocol): res_data = res_data.decode() except (OSError, UnicodeError): client_socket.close() - logging.error("collect_plugin: client communicate error") + logging.debug("collect_plugin: client communicate error") return None res_magic = res_data[:CLT_MSG_MAGIC_LEN] if res_magic != "RES": - logging.error("res msg format error") + logging.debug("res msg format error") return None protocol_str = res_data[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN] try: protocol_id = int(protocol_str) except ValueError: - logging.error("recv msg protocol id is invalid %s", protocol_str) + logging.debug("recv msg protocol id is invalid %s", protocol_str) return None if protocol_id >= ClientProtocol.PRO_END: - logging.error("protocol id is invalid") + logging.debug("protocol id is invalid") return None try: @@ -134,7 +134,7 @@ def client_send_and_recv(request_data, data_str_len, protocol): res_msg_data = res_msg_data.decode() return res_msg_data except (OSError, ValueError, UnicodeError): - logging.error("collect_plugin: client recv res msg error") + logging.debug("collect_plugin: client recv res msg error") finally: client_socket.close() diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py index cd47919..899d517 100644 --- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py @@ -15,7 +15,7 @@ import time from .config import read_config_log, read_config_common, read_config_algorithm, read_config_latency, read_config_iodump, read_config_stage from .stage_window import IoWindow, IoDumpWindow -from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name +from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name, check_disk_list_validation from .utils import update_avg_and_check_abnormal CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" @@ -79,6 +79,8 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage): if not disk_list: report_alarm_fail("Cannot get valid disk name") + disk_list = check_disk_list_validation(disk_list) + disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list if not config_disk: @@ -117,7 +119,10 @@ def main_loop(io_dic, io_data, io_avg_value): time.sleep(period_time) # 采集模块对接,获取周期数据 - curr_period_data = avg_get_io_data(io_dic) + is_success, curr_period_data = avg_get_io_data(io_dic) + if not is_success: + logging.error(f"{curr_period_data['msg']}") + continue # 处理周期数据 reach_size = False diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py index cbdaad4..a67ef45 100644 --- a/src/python/sentryPlugins/avg_block_io/module_conn.py +++ b/src/python/sentryPlugins/avg_block_io/module_conn.py @@ -40,25 +40,25 @@ def avg_is_iocollect_valid(io_dic, config_disk, config_stage): logging.debug(f"send to sentryCollector is_iocollect_valid: period={io_dic['period_time']}, " f"disk={config_disk}, stage={config_stage}") res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage) - return check_result_validation(res, 'check config 
validation') + is_success, data = check_result_validation(res, 'check config validation') + if not is_success: + report_alarm_fail(f"{data['msg']}") + return data def check_result_validation(res, reason): """check validation of result from sentryCollector""" if not 'ret' in res or not 'message' in res: - err_msg = "Failed to {}: Cannot connect to sentryCollector.".format(reason) - report_alarm_fail(err_msg) + return False, {'msg': f"Failed to {reason}: Cannot connect to sentryCollector"} if res['ret'] != 0: - err_msg = "Failed to {}: {}".format(reason, Result_Messages[res['ret']]) - report_alarm_fail(err_msg) + return False, {'msg': f"Failed to {reason}: {Result_Messages[res['ret']]}"} try: json_data = json.loads(res['message']) except json.JSONDecodeError: - err_msg = f"Failed to {reason}: invalid return message" - report_alarm_fail(err_msg) + return False, {'msg': f"Failed to {reason}: invalid return message"} - return json_data + return True, json_data def report_alarm_fail(alarm_info): @@ -120,10 +120,21 @@ def process_report_data(disk_name, rw, io_data): xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) +def check_disk_list_validation(disk_list): + valid_disk_list = [] + for disk_name in disk_list: + is_success, _ = check_result_validation(get_disk_type(disk_name), "") + if not is_success: + continue + valid_disk_list.append(disk_name) + return valid_disk_list + + def get_disk_type_by_name(disk_name): logging.debug(f"send to sentryCollector get_disk_type: disk_name={disk_name}") - res = get_disk_type(disk_name) - disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}') + is_success, disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}') + if not is_success: + report_alarm_fail(f"{disk_type_str['msg']}") try: curr_disk_type = int(disk_type_str) if curr_disk_type not in Disk_Type: -- Gitee From 96aa422bbf12ab00d97dd8cee7f9bec461e18cba Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Wed, 16 Oct 2024 11:30:30 +0800 Subject: [PATCH 53/76] listen thread of collect module exits occasionally --- src/python/sentryCollector/collect_io.py | 4 +--- src/python/sentryCollector/collect_server.py | 18 ++++++++---------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py index 945ccbc..9c6bbc9 100644 --- a/src/python/sentryCollector/collect_io.py +++ b/src/python/sentryCollector/collect_io.py @@ -320,7 +320,7 @@ class CollectIo(): if curr_io_dump > 0: logging.info(f"ebpf io_dump info : {disk_name}, {stage}, {io_type}, {curr_io_dump}") IO_GLOBAL_DATA[disk_name][stage][io_type].insert(0, [curr_lat, curr_io_dump, curr_io_length, curr_iops]) - logging.debug(f"ebpf collect data : {IO_GLOBAL_DATA}") + elapsed_time = time.time() - start_time sleep_time = self.period_time - elapsed_time if sleep_time < 0: @@ -432,8 +432,6 @@ class CollectIo(): continue self.append_period_lat(disk_name, stage_list) - logging.debug(f"no-lock collect data : {IO_GLOBAL_DATA}") - elapsed_time = time.time() - start_time sleep_time = self.period_time - elapsed_time if sleep_time < 0: diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py index 11d1af0..ad3ac0e 100644 --- a/src/python/sentryCollector/collect_server.py +++ b/src/python/sentryCollector/collect_server.py @@ -64,7 +64,7 @@ class CollectServer(): self.io_global_data = IO_GLOBAL_DATA if len(IO_CONFIG_DATA) == 0: - 
logging.error("the collect thread is not started, the data is invalid. ") + logging.error("the collect thread is not started, the data is invalid.") return json.dumps(result_rev) period_time = IO_CONFIG_DATA[0] @@ -75,7 +75,7 @@ class CollectServer(): stage_list = json.loads(data_struct['stage']) if (period < period_time) or (period > period_time * max_save) or (period % period_time): - logging.error("is_iocollect_valid: period time: %d is invalid", period) + logging.error("is_iocollect_valid: period time is invalid, user period: %d, config period_time: %d", period, period_time) return json.dumps(result_rev) for disk_name, stage_info in self.io_global_data.items(): @@ -96,7 +96,7 @@ class CollectServer(): self.io_global_data = IO_GLOBAL_DATA if len(IO_CONFIG_DATA) == 0: - logging.error("the collect thread is not started, the data is invalid. ") + logging.error("the collect thread is not started, the data is invalid.") return json.dumps(result_rev) period_time = IO_CONFIG_DATA[0] max_save = IO_CONFIG_DATA[1] @@ -107,11 +107,11 @@ class CollectServer(): iotype_list = json.loads(data_struct['iotype']) if (period < period_time) or (period > period_time * max_save) or (period % period_time): - logging.error("get_io_data: period time: %d is invalid", period) + logging.error("get_io_data: period time is invalid, user period: %d, config period_time: %d", period, period_time) return json.dumps(result_rev) collect_index = period // period_time - 1 - logging.debug("period: %d, collect_index: %d", period, collect_index) + logging.debug("user period: %d, config period_time: %d, collect_index: %d", period, period_time, collect_index) for disk_name, stage_info in self.io_global_data.items(): if disk_name not in disk_list: @@ -124,7 +124,7 @@ class CollectServer(): for iotype_name, iotype_info in iotype_info.items(): if iotype_name not in iotype_list: continue - if len(iotype_info) < collect_index: + if len(iotype_info) - 1 < collect_index: continue result_rev[disk_name][stage_name][iotype_name] = iotype_info[collect_index] @@ -250,10 +250,8 @@ class CollectServer(): except socket.error: logging.error("server fd create failed") server_fd = None - return server_fd - def server_loop(self): """main loop""" logging.info("collect listen thread start") @@ -277,8 +275,8 @@ class CollectServer(): self.server_recv(server_fd) else: continue - except socket.error: - pass + except Exception: + logging.error('collect listen exception : %s', traceback.format_exc()) def stop_thread(self): self.stop_event.set() -- Gitee From f99a526381fe6e5c8b2608308d4618438be13dd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> Date: Wed, 16 Oct 2024 11:50:46 +0800 Subject: [PATCH 54/76] fix ai_block_io root cause bug --- src/python/sentryPlugins/ai_block_io/detector.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py index 5b21714..ed8b64a 100644 --- a/src/python/sentryPlugins/ai_block_io/detector.py +++ b/src/python/sentryPlugins/ai_block_io/detector.py @@ -101,12 +101,12 @@ class DiskDetector: if len(diagnosis_info["bio"]) == 0: return False, None, None, None elif len(diagnosis_info["rq_driver"]) != 0: - root_cause = "[Root Cause:disk slow]" + root_cause = "[Root Cause: disk slow]" elif len(diagnosis_info["io_stage"]) != 0: - stage = diagnosis_info["io_stage"][0][1].get_stage_name() - root_cause = f"[Root Cause:io stage slow, stage: {stage}]" + stage = 
diagnosis_info["io_stage"][0][1].stage_name + root_cause = f"[Root Cause: io stage slow, stage: {stage}]" if root_cause is None: - root_cause = "[Root Cause:high io pressure]" + root_cause = "[Root Cause: high io pressure]" return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause def __repr__(self): -- Gitee From a3602256970acf4b0950c0ea2670af6da3923b7a Mon Sep 17 00:00:00 2001 From: zhangnan Date: Wed, 16 Oct 2024 10:58:24 +0800 Subject: [PATCH 55/76] ebpf fix kyqin miss rq_driver data --- src/c/ebpf_collector/ebpf_collector.bpf.c | 139 +++++++++++++++++++++- 1 file changed, 133 insertions(+), 6 deletions(-) diff --git a/src/c/ebpf_collector/ebpf_collector.bpf.c b/src/c/ebpf_collector/ebpf_collector.bpf.c index 870a677..a7ea51b 100644 --- a/src/c/ebpf_collector/ebpf_collector.bpf.c +++ b/src/c/ebpf_collector/ebpf_collector.bpf.c @@ -120,7 +120,6 @@ struct bpf_map_def SEC("maps") tag_res_2 = { .max_entries = MAX_IO_TIME, }; - struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; @@ -132,6 +131,133 @@ struct blk_mq_alloc_data { struct blk_mq_hw_ctx *hctx; }; +struct request_kylin { + struct request_queue *q; + struct blk_mq_ctx *mq_ctx; + + int cpu; + unsigned int cmd_flags; /* op and common flags */ + req_flags_t rq_flags; + + int internal_tag; + + /* the following two fields are internal, NEVER access directly */ + unsigned int __data_len; /* total data len */ + int tag; + sector_t __sector; /* sector cursor */ + + struct bio *bio; + struct bio *biotail; + + struct list_head queuelist; + + /* + * The hash is used inside the scheduler, and killed once the + * request reaches the dispatch list. The ipi_list is only used + * to queue the request for softirq completion, which is long + * after the request has been unhashed (and even removed from + * the dispatch list). + */ + union { + struct hlist_node hash; /* merge hash */ + struct list_head ipi_list; + }; + + struct hlist_node front_hash; /* front merge hash */ + + /* + * The rb_node is only used inside the io scheduler, requests + * are pruned when moved to the dispatch queue. So let the + * completion_data share space with the rb_node. + */ + union { + struct rb_node rb_node; /* sort/lookup */ + struct bio_vec special_vec; + void *completion_data; + int error_count; /* for legacy drivers, don't use */ + }; + + /* + * Three pointers are available for the IO schedulers, if they need + * more they have to dynamically allocate it. Flush requests are + * never put on the IO scheduler. So let the flush fields share + * space with the elevator data. + */ + union { + struct { + struct io_cq *icq; + void *priv[2]; + } elv; + + struct { + unsigned int seq; + struct list_head list; + rq_end_io_fn *saved_end_io; + } flush; + }; + + struct gendisk *rq_disk; + struct hd_struct *part; + /* Time that I/O was submitted to the kernel. */ + u64 start_time_ns; + /* Time that I/O was submitted to the device. */ + u64 io_start_time_ns; + +#ifdef CONFIG_BLK_WBT + unsigned short wbt_flags; +#endif +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + unsigned short throtl_size; +#endif + + /* + * Number of scatter-gather DMA addr+len pairs after + * physical address coalescing is performed. 
+ */ + unsigned short nr_phys_segments; + +#if defined(CONFIG_BLK_DEV_INTEGRITY) + unsigned short nr_integrity_segments; +#endif + + unsigned short write_hint; + unsigned short ioprio; + + void *special; /* opaque pointer available for LLD use */ + + unsigned int extra_len; /* length of alignment and padding */ + + enum mq_rq_state state; + refcount_t ref; + + unsigned int timeout; + + /* access through blk_rq_set_deadline, blk_rq_deadline */ + unsigned long __deadline; + + struct list_head timeout_list; + + union { + struct __call_single_data csd; + u64 fifo_time; + }; + + /* + * completion callback. + */ + rq_end_io_fn *end_io; + void *end_io_data; + + /* for bidi */ + struct request_kylin *next_rq; + +#ifdef CONFIG_BLK_CGROUP + struct request_list *rl; /* rl this rq is alloced from */ +#endif + KABI_RESERVE(1); + KABI_RESERVE(2); +}; + static __always_inline void blk_fill_rwbs(char *rwbs, unsigned int op) { switch (op & REQ_OP_MASK) { @@ -710,7 +836,7 @@ u32 find_matching_wbt_5_keys(int major, int minor) { SEC("kprobe/blk_mq_start_request") int kprobe_blk_mq_start_request(struct pt_regs *regs) { - struct request *rq = (struct request *)PT_REGS_PARM1(regs); + struct request_kylin *rq = (struct request_kylin *)PT_REGS_PARM1(regs); struct gendisk *curr_rq_disk = _(rq->rq_disk); int major = _(curr_rq_disk->major); int first_minor = _(curr_rq_disk->first_minor); @@ -734,15 +860,16 @@ int kprobe_blk_mq_start_request(struct pt_regs *regs) } } } - init_io_counter(&zero, major, first_minor); counterp = bpf_map_lookup_elem(&blk_map, &rq); - if (counterp || major == 0) + if (counterp || major == 0) { return 0; + } long err = bpf_map_update_elem(&blk_map, &rq, &zero, BPF_NOEXIST); - if (err) + if (err) { return 0; + } u64 curr_start_range = zero.start_time / THRESHOLD; @@ -789,7 +916,7 @@ int kprobe_blk_mq_start_request(struct pt_regs *regs) SEC("kprobe/blk_mq_free_request") int kprobe_blk_mq_free_request(struct pt_regs *regs) { - struct request *rq = (struct request *)PT_REGS_PARM1(regs); + struct request_kylin *rq = (struct request_kylin *)PT_REGS_PARM1(regs); struct gendisk *curr_rq_disk = _(rq->rq_disk); int major = _(curr_rq_disk->major); int first_minor = _(curr_rq_disk->first_minor); -- Gitee From b385323dfa6a6b8ef4b983bbb53699e79b1181ec Mon Sep 17 00:00:00 2001 From: jinsaihang Date: Wed, 16 Oct 2024 14:51:24 +0800 Subject: [PATCH 56/76] optimize log printing Signed-off-by: jinsaihang --- src/python/syssentry/alarm.py | 53 ++++++++++++++++--------------- src/python/syssentry/load_mods.py | 15 +++++---- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py index c0b0e8a..b02aa84 100644 --- a/src/python/syssentry/alarm.py +++ b/src/python/syssentry/alarm.py @@ -76,16 +76,26 @@ def update_alarm_list(alarm_info: Xalarm): finally: alarm_list_lock.release() -def check_alarm_id_if_number(alarm_id): - if isinstance(alarm_id, int): - return True - else: +def validate_alarm_id(alarm_id): + if alarm_id is None: + return False + try: + alarm_id = int(alarm_id) + if MIN_ALARM_ID <= alarm_id <= MAX_ALARM_ID: + return True + else: + return False + except ValueError: return False -def check_alarm_clear_time_if_positive_integer(alarm_clear_time): - if isinstance(alarm_clear_time, int) and alarm_clear_time > 0: - return True - else: +def validate_alarm_clear_time(alarm_clear_time): + try: + alarm_clear_time = int(alarm_clear_time) + if alarm_clear_time > 0 and alarm_clear_time <= sys.maxsize: + return True + else: + return False + except 
ValueError: return False def alarm_register(): @@ -93,34 +103,25 @@ def alarm_register(): # 初始化告警ID映射字典、告警老化时间字典 for task_type in TasksMap.tasks_dict: for task_name in TasksMap.tasks_dict[task_type]: - logging.info(f"alarm_register: {task_name} is registered") task = TasksMap.tasks_dict[task_type][task_name] - alarm_id = task.alarm_id - if not check_alarm_id_if_number(alarm_id): - logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") + if not validate_alarm_id(task.alarm_id): + logging.warning(f"Invalid alarm_id {task.alarm_id}: ignore {task_name} alarm") continue - if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID: - logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") + if not validate_alarm_clear_time(task.alarm_clear_time): + logging.warning(f"Invalid alarm_clear_time {task.alarm_clear_time}: ignore {task_name} alarm") continue + task.alarm_id = int(task.alarm_id) + task.alarm_clear_time = int(task.alarm_clear_time) + alarm_id = task.alarm_id alarm_clear_time = task.alarm_clear_time - if not check_alarm_clear_time_if_positive_integer(alarm_clear_time): - logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") - continue - try: - alarm_clear_time = int(alarm_clear_time) - if alarm_clear_time <= 0: - raise ValueError("Not a positive integer") - if alarm_clear_time > sys.maxsize: - raise ValueError("Exceeds maximum value for int") - except (ValueError, OverflowError, TypeError) as e: - logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") - continue + alarm_list_dict[alarm_id] = [] task_alarm_id_dict[task_name] = alarm_id if alarm_id not in alarm_id_clear_time_dict: alarm_id_clear_time_dict[alarm_id] = alarm_clear_time else: alarm_id_clear_time_dict[alarm_id] = max(alarm_clear_time, alarm_id_clear_time_dict[alarm_id]) + logging.info(f"alarm_register: {task_name} is registered") # 注册告警回调 id_filter = [True] * 128 clientId = xalarm_register(update_alarm_list, id_filter) diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py index f74f165..78db446 100644 --- a/src/python/syssentry/load_mods.py +++ b/src/python/syssentry/load_mods.py @@ -198,15 +198,16 @@ def parse_mod_conf(mod_name, mod_conf): task.load_enabled = is_enabled try: - task.alarm_id = int(mod_conf.get(CONF_TASK, CONF_ALARM_ID)) - task.alarm_clear_time = int(mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME)) - if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID): - raise ValueError("Invalid alarm_id") - except ValueError: task.alarm_id = mod_conf.get(CONF_TASK, CONF_ALARM_ID) - task.alarm_clear_time = mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME) except configparser.NoOptionError: - logging.warning("Unset alarm_clear_time, use 15s as default") + task.alarm_id = None + logging.warning(f"{mod_name} alarm_id not set, alarm_id is None") + + if task.alarm_id is not None: + try: + task.alarm_clear_time = mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME) + except configparser.NoOptionError: + logging.warning(f"{mod_name} not set alarm_clear_time, use 15s as default") if CONF_ONSTART in mod_conf.options(CONF_TASK): is_onstart = (mod_conf.get(CONF_TASK, CONF_ONSTART) == 'yes') -- Gitee From ee7d96cfc5c7fff68804a3c5b7b0369d25aecbd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> Date: Wed, 16 Oct 2024 17:20:01 +0800 Subject: [PATCH 57/76] enrich alert info about kernel stack --- src/python/sentryPlugins/ai_block_io/detector.py | 6 ++++-- 1 file changed, 4 
insertions(+), 2 deletions(-) diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py index ed8b64a..8536f7a 100644 --- a/src/python/sentryPlugins/ai_block_io/detector.py +++ b/src/python/sentryPlugins/ai_block_io/detector.py @@ -103,8 +103,10 @@ class DiskDetector: elif len(diagnosis_info["rq_driver"]) != 0: root_cause = "[Root Cause: disk slow]" elif len(diagnosis_info["io_stage"]) != 0: - stage = diagnosis_info["io_stage"][0][1].stage_name - root_cause = f"[Root Cause: io stage slow, stage: {stage}]" + stage_list = [] + for io_stage in diagnosis_info["io_stage"]: + stage_list.append(io_stage[0].stage_name) + root_cause = f"[Root Cause: io stage slow, stage: {stage_list}]" if root_cause is None: root_cause = "[Root Cause: high io pressure]" return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause -- Gitee From 9f8af0608628c45b39012e6e6134fb3fa4505368 Mon Sep 17 00:00:00 2001 From: zhangnan Date: Wed, 16 Oct 2024 22:08:42 +0800 Subject: [PATCH 58/76] ebpf fix collect iodump --- src/c/ebpf_collector/ebpf_collector.c | 27 +++++++++++------------- src/python/sentryCollector/collect_io.py | 2 +- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/src/c/ebpf_collector/ebpf_collector.c b/src/c/ebpf_collector/ebpf_collector.c index 0885a5f..af452c8 100644 --- a/src/c/ebpf_collector/ebpf_collector.c +++ b/src/c/ebpf_collector/ebpf_collector.c @@ -119,23 +119,19 @@ char* find_device_name(dev_t dev) { static void update_io_dump(struct bpf_map *map_res, int *io_dump, int *map_size, char *stage) { struct time_range_io_count time_count; - u32 io_dump_key = 0, io_dump_next_key = 0; + u32 io_dump_key = 0; struct sysinfo info; sysinfo(&info); - - while (bpf_map_get_next_key(map_res, &io_dump_key, &io_dump_next_key) == 0) { - int err = bpf_map_lookup_elem(map_res, &io_dump_next_key, &time_count); - if (err < 0) { - fprintf(stderr, "failed to lookup %s io dump: %d\n", stage, err); - break; - } - if (io_dump_key == io_dump_next_key) { - break; + int count_time = 150; + u32 curr_time = info.uptime; + while (count_time >= 0) { + io_dump_key = curr_time - count_time; + int err = bpf_map_lookup_elem(map_res, &io_dump_key, &time_count); + if (err < 0) { + count_time -= 1; + continue; } - - io_dump_key = io_dump_next_key; - - if ((info.uptime - io_dump_key) >= 2) { + if ((curr_time - io_dump_key) >= 2) { int isempty = 1; for (int key = 0; key < map_size; key++) { if (time_count.count[key] > 0) { @@ -143,10 +139,11 @@ static void update_io_dump(struct bpf_map *map_res, int *io_dump, int *map_size, isempty = 0; } } - if (isempty || (info.uptime - io_dump_key) > IO_DUMP_THRESHOLD) { + if (isempty || (curr_time - io_dump_key) > IO_DUMP_THRESHOLD) { bpf_map_delete_elem(map_res, &io_dump_key); } } + count_time -= 1; } } diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py index 9c6bbc9..8a82eab 100644 --- a/src/python/sentryCollector/collect_io.py +++ b/src/python/sentryCollector/collect_io.py @@ -387,7 +387,7 @@ class CollectIo(): curr_io_dump_count: int, prev_io_dump_count: int ) -> Union[int, float]: - io_dump_count = curr_io_dump_count - prev_io_dump_count + io_dump_count = curr_io_dump_count if io_dump_count <= 0: return 0 value = io_dump_count -- Gitee From fd92046b916bc8ef86da88d51006ed32cb6f4766 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> Date: Mon, 21 Oct 2024 14:18:20 +0800 Subject: [PATCH 59/76] ai_block_io exit when config section is missing
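
The config sections below used to fall back to defaults with a warning; this
patch makes a missing section fatal. A minimal sketch of the fail-fast pattern
the diff applies to each section (the require_section() wrapper is
illustrative only and not part of this patch; Report.report_pass and the
message format are taken from the diff):

    import logging

    def require_section(con, section_name):
        """Return a section's items, or report, log and exit if it is absent."""
        if not con.has_section(section_name):
            msg = f"not found {section_name} section. exiting..."
            Report.report_pass(msg)   # notify the alarm framework before exiting
            logging.critical(msg)
            exit(1)
        return dict(con.items(section_name))

In the patch itself the check is inlined per section rather than factored out.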
--- .../ai_block_io/config_parser.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py index 7b0cd29..447eccd 100644 --- a/src/python/sentryPlugins/ai_block_io/config_parser.py +++ b/src/python/sentryPlugins/ai_block_io/config_parser.py @@ -401,11 +401,9 @@ class ConfigParser: self._read_stage(items_common) self._read_iotype(items_common) else: - self._conf["common"]["stage"] = ALL_STAGE_LIST - self._conf["common"]["iotype"] = ALL_IOTPYE_LIST - logging.warning( - "common section parameter not found, it will be set to default value." - ) + Report.report_pass("not found common section. exiting...") + logging.critical("not found common section. exiting...") + exit(1) if con.has_section("algorithm"): items_algorithm = dict(con.items("algorithm")) @@ -413,9 +411,9 @@ class ConfigParser: self._read_train_update_duration(items_algorithm) self._read_algorithm_type_and_parameter(items_algorithm) else: - logging.warning( - "algorithm section parameter not found, it will be set to default value." - ) + Report.report_pass("not found algorithm section. exiting...") + logging.critical("not found algorithm section. exiting...") + exit(1) if con.has_section("sliding_window"): items_sliding_window = dict(con.items("sliding_window")) @@ -423,9 +421,9 @@ class ConfigParser: self._read_window_size(items_sliding_window) self._read_window_minimum_threshold(items_sliding_window) else: - logging.warning( - "sliding_window section parameter not found, it will be set to default value." - ) + Report.report_pass("not found sliding_window section. exiting...") + logging.critical("not found sliding_window section. exiting...") + exit(1) if con.has_section("latency_sata_ssd"): items_latency_sata_ssd = dict(con.items("latency_sata_ssd")) @@ -444,9 +442,10 @@ class ConfigParser: gt=0, ) else: - logging.warning( - "latency_sata_ssd section parameter not found, it will be set to default value." - ) + Report.report_pass("not found latency_sata_ssd section. exiting...") + logging.critical("not found latency_sata_ssd section. exiting...") + exit(1) + if con.has_section("latency_nvme_ssd"): items_latency_nvme_ssd = dict(con.items("latency_nvme_ssd")) self._conf["latency_nvme_ssd"]["read_tot_lim"] = self._get_config_value( @@ -464,9 +463,10 @@ class ConfigParser: gt=0, ) else: - logging.warning( - "latency_nvme_ssd section parameter not found, it will be set to default value." - ) + Report.report_pass("not found latency_nvme_ssd section. exiting...") + logging.critical("not found latency_nvme_ssd section. exiting...") + exit(1) + if con.has_section("latency_sata_hdd"): items_latency_sata_hdd = dict(con.items("latency_sata_hdd")) self._conf["latency_sata_hdd"]["read_tot_lim"] = self._get_config_value( @@ -484,9 +484,9 @@ class ConfigParser: gt=0, ) else: - logging.warning( - "latency_sata_hdd section parameter not found, it will be set to default value." - ) + Report.report_pass("not found latency_sata_hdd section. exiting...") + logging.critical("not found latency_sata_hdd section. 
exiting...") + exit(1) self.__print_all_config_value() -- Gitee From f8e0b86b462283db37d98f26b25d554d8fc8cef9 Mon Sep 17 00:00:00 2001 From: caixiaomeng Date: Mon, 21 Oct 2024 11:57:37 +0800 Subject: [PATCH 60/76] fix xalarm non-uniform log formatting --- src/python/xalarm/sentry_notify.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py index 5838473..1e3fa76 100644 --- a/src/python/xalarm/sentry_notify.py +++ b/src/python/xalarm/sentry_notify.py @@ -2,6 +2,7 @@ import os import sys import time import socket +import logging from struct import error as StructParseError from .xalarm_api import alarm_stu2bin, Xalarm @@ -27,21 +28,21 @@ ALARM_SOCKET_PERMISSION = 0o700 def check_params(alarm_id, alarm_level, alarm_type, puc_paras) -> bool: if not os.path.exists(DIR_XALARM): - sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed\n") + logging.error(f"check_params: {DIR_XALARM} not exist, failed") return False if not os.path.exists(PATH_REPORT_ALARM): - sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed\n") + logging.error(f"check_params: {PATH_REPORT_ALARM} not exist, failed") return False if (alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID or alarm_level < MINOR_ALM or alarm_level > CRITICAL_ALM or alarm_type < ALARM_TYPE_OCCUR or alarm_type > ALARM_TYPE_RECOVER): - sys.stderr.write("check_params: alarm info invalid\n") + logging.error("check_params: alarm info invalid") return False if len(puc_paras) >= MAX_PUC_PARAS_LEN: - sys.stderr.write(f"check_params: alarm msg should be less than {MAX_PUC_PARAS_LEN}\n") + logging.error(f"check_params: alarm msg should be less than {MAX_PUC_PARAS_LEN}") return False return True @@ -61,7 +62,7 @@ def xalarm_report(alarm_id, alarm_level, alarm_type, puc_paras) -> bool: sock.sendto(alarm_stu2bin(alarm_info), PATH_REPORT_ALARM) except (FileNotFoundError, StructParseError, socket.error, OSError, UnicodeError) as e: - sys.stderr.write(f"check_params: error occurs when sending msg.{e}\n") + logging.error(f"error occurs when sending msg.") return False finally: sock.close() -- Gitee From 6a93b4e0abe66ba8d2ba53616e43df605e9ffb71 Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Mon, 21 Oct 2024 17:30:39 +0800 Subject: [PATCH 61/76] update collect plugin period max --- src/python/sentryCollector/collect_plugin.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py index 53dddec..9495d8b 100644 --- a/src/python/sentryCollector/collect_plugin.py +++ b/src/python/sentryCollector/collect_plugin.py @@ -45,6 +45,9 @@ LIMIT_IOTYPE_LIST_LEN = 4 LIMIT_PERIOD_MIN_LEN = 1 LIMIT_PERIOD_MAX_LEN = 300 +# max_save +LIMIT_MAX_SAVE_LEN = 300 + # interface protocol class ClientProtocol(): IS_IOCOLLECT_VALID = 0 @@ -189,7 +192,7 @@ def inter_is_iocollect_valid(period, disk_list=None, stage=None): if not period or not isinstance(period, int): result['ret'] = ResultMessage.RESULT_NOT_PARAM return result - if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN: + if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN * LIMIT_MAX_SAVE_LEN: result['ret'] = ResultMessage.RESULT_INVALID_LENGTH return result @@ -246,7 +249,7 @@ def inter_get_io_data(period, disk_list, stage, iotype): if not isinstance(period, int): result['ret'] = ResultMessage.RESULT_NOT_PARAM return result - if period < LIMIT_PERIOD_MIN_LEN or period > 
LIMIT_PERIOD_MAX_LEN: + if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN * LIMIT_MAX_SAVE_LEN: result['ret'] = ResultMessage.RESULT_INVALID_LENGTH return result -- Gitee From 241ffcc81f208438ccfd70d4af5a168938fb0b27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> Date: Mon, 21 Oct 2024 17:31:32 +0800 Subject: [PATCH 62/76] fix frequency param check bug --- .../sentryPlugins/ai_block_io/config_parser.py | 13 +++++++++++-- .../sentryPlugins/ai_block_io/data_access.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py index 447eccd..274a31e 100644 --- a/src/python/sentryPlugins/ai_block_io/config_parser.py +++ b/src/python/sentryPlugins/ai_block_io/config_parser.py @@ -16,6 +16,7 @@ import logging from .alarm_report import Report from .threshold import ThresholdType from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level +from .data_access import check_detect_frequency_is_valid LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" @@ -165,9 +166,17 @@ class ConfigParser: "slow_io_detect_frequency", int, self.DEFAULT_CONF["common"]["slow_io_detect_frequency"], - gt=0, - le=300, + gt=0 ) + frequency = self._conf["common"]["slow_io_detect_frequency"] + ret = check_detect_frequency_is_valid(frequency) + if ret is None: + log = f"slow io detect frequency: {frequency} is invalid, "\ + f"check whether the value range is too large or is not an "\ + f"integer multiple of period_time. exiting..." + Report.report_pass(log) + logging.critical(log) + exit(1) def _read_disks_to_detect(self, items_common: dict): disks_to_detection = items_common.get("disk") diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py index 1bc5ed8..e4869d5 100644 --- a/src/python/sentryPlugins/ai_block_io/data_access.py +++ b/src/python/sentryPlugins/ai_block_io/data_access.py @@ -53,6 +53,20 @@ def check_collect_valid(period): return None +def check_detect_frequency_is_valid(period): + data_raw = is_iocollect_valid(period) + if data_raw["ret"] == 0: + try: + data = json.loads(data_raw["message"]) + except Exception as e: + return None + if not data: + return None + return [k for k in data.keys()] + else: + return None + + def _get_raw_data(period, disk_list): return get_io_data( period, -- Gitee From 3e2779ddc27878016a7fe5edaed3b9d821a804e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> Date: Tue, 22 Oct 2024 16:37:52 +0800 Subject: [PATCH 63/76] ai_block_io support iodump --- config/plugins/ai_block_io.ini | 6 +- .../sentryPlugins/ai_block_io/ai_block_io.py | 75 ++++++++++++------- .../ai_block_io/config_parser.py | 30 ++++++++ .../ai_block_io/sliding_window.py | 4 +- 4 files changed, 84 insertions(+), 31 deletions(-) diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini index 422cfa3..040237d 100644 --- a/config/plugins/ai_block_io.ini +++ b/config/plugins/ai_block_io.ini @@ -29,4 +29,8 @@ write_tot_lim=500 [latency_sata_hdd] read_tot_lim=50000 -write_tot_lim=50000 \ No newline at end of file +write_tot_lim=50000 + +[iodump] +read_iodump_lim=0 +write_iodump_lim=0 \ No newline at end of file diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py index 4eecd43..f25e6d5 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py @@ -15,7 +15,7 @@ import logging from collections import defaultdict from .detector import Detector, DiskDetector -from .threshold import ThresholdFactory +from .threshold import ThresholdFactory, ThresholdType from .sliding_window import SlidingWindowFactory from .utils import get_data_queue_size_and_update_size from .config_parser import ConfigParser @@ -91,9 +91,8 @@ class SlowIODetection: continue for stage in stages: for iotype in iotypes: - self._detector_name_list[disk].append( - MetricName(disk, disk_type, stage, iotype, "latency") - ) + self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "latency")) + self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "io_dump")) if disks: logging.warning( "disks: %s not in available disk list, so they will be ignored.", @@ -123,31 +122,51 @@ class SlowIODetection: for disk, metric_name_list in self._detector_name_list.items(): disk_detector = DiskDetector(disk) for metric_name in metric_name_list: - threshold = ThresholdFactory().get_threshold( - threshold_type, - boxplot_parameter=self._config_parser.boxplot_parameter, - n_sigma_paramter=self._config_parser.n_sigma_parameter, - data_queue_size=data_queue_size, - data_queue_update_size=update_size, - ) - abs_threshold = self._config_parser.get_tot_lim( - metric_name.disk_type, metric_name.io_access_type_name - ) - if abs_threshold is None: - logging.warning( - "disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.", - disk, - metric_name.disk_type, - metric_name.io_access_type_name, + + if metric_name.metric_name == 'latency': + threshold = ThresholdFactory().get_threshold( + threshold_type, + boxplot_parameter=self._config_parser.boxplot_parameter, + n_sigma_paramter=self._config_parser.n_sigma_parameter, + data_queue_size=data_queue_size, + data_queue_update_size=update_size, ) - sliding_window = SlidingWindowFactory().get_sliding_window( - sliding_window_type, - queue_length=window_size, - threshold=window_threshold, - abs_threshold=abs_threshold, - ) - detector = Detector(metric_name, threshold, sliding_window) - disk_detector.add_detector(detector) + abs_threshold = self._config_parser.get_tot_lim( + metric_name.disk_type, metric_name.io_access_type_name + ) + if abs_threshold is None: + logging.warning( + "disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.", + disk, + metric_name.disk_type, + metric_name.io_access_type_name, + ) + sliding_window = SlidingWindowFactory().get_sliding_window( + sliding_window_type, + queue_length=window_size, + threshold=window_threshold, + abs_threshold=abs_threshold, + ) + detector = Detector(metric_name, threshold, sliding_window) + disk_detector.add_detector(detector) + continue + + elif metric_name.metric_name == 'io_dump': + threshold = ThresholdFactory().get_threshold(ThresholdType.AbsoluteThreshold) + abs_threshold = None + if metric_name.io_access_type_name == 'read': + abs_threshold = self._config_parser.read_iodump_lim + elif metric_name.io_access_type_name == 'write': + abs_threshold = self._config_parser.write_iodump_lim + sliding_window = SlidingWindowFactory().get_sliding_window( + sliding_window_type, + queue_length=window_size, + threshold=window_threshold + ) + detector = Detector(metric_name, threshold, sliding_window) + threshold.set_threshold(abs_threshold) + disk_detector.add_detector(detector) + logging.info(f"disk: 
[{disk}] add detector:\n [{disk_detector}]") self._disk_detectors[disk] = disk_detector diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py index 274a31e..1117939 100644 --- a/src/python/sentryPlugins/ai_block_io/config_parser.py +++ b/src/python/sentryPlugins/ai_block_io/config_parser.py @@ -72,6 +72,7 @@ class ConfigParser: "latency_sata_ssd": {"read_tot_lim": 50000, "write_tot_lim": 50000}, "latency_nvme_ssd": {"read_tot_lim": 500, "write_tot_lim": 500}, "latency_sata_hdd": {"read_tot_lim": 50000, "write_tot_lim": 50000}, + "iodump": {"read_iodump_lim": 0, "write_iodump_lim": 0} } def __init__(self, config_file_name): @@ -497,6 +498,27 @@ class ConfigParser: logging.critical("not found latency_sata_hdd section. exiting...") exit(1) + if con.has_section("iodump"): + items_iodump = dict(con.items("iodump")) + self._conf["iodump"]["read_iodump_lim"] = self._get_config_value( + items_iodump, + "read_iodump_lim", + int, + self.DEFAULT_CONF["iodump"]["read_iodump_lim"], + ge=0 + ) + self._conf["iodump"]["write_iodump_lim"] = self._get_config_value( + items_iodump, + "write_iodump_lim", + int, + self.DEFAULT_CONF["iodump"]["write_iodump_lim"], + ge=0 + ) + else: + Report.report_pass("not found iodump section. exiting...") + logging.critical("not found iodump section. exiting...") + exit(1) + self.__print_all_config_value() def __repr__(self) -> str: @@ -587,3 +609,11 @@ class ConfigParser: @property def n_sigma_parameter(self): return self._conf["algorithm"]["n_sigma_parameter"] + + @property + def read_iodump_lim(self): + return self._conf["iodump"]["read_iodump_lim"] + + @property + def write_iodump_lim(self): + return self._conf["iodump"]["write_iodump_lim"] \ No newline at end of file diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py index d7c402a..cebe41f 100644 --- a/src/python/sentryPlugins/ai_block_io/sliding_window.py +++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py @@ -35,8 +35,8 @@ class SlidingWindow: self._io_data_queue_abnormal_tag.pop(0) self._io_data_queue.append(data) tag = False - if ((self._ai_threshold is not None and data >= self._ai_threshold) or - (self._abs_threshold is not None and data >= self._abs_threshold)): + if ((self._ai_threshold is not None and data > self._ai_threshold) or + (self._abs_threshold is not None and data > self._abs_threshold)): tag = True self._io_data_queue_abnormal_tag.append(tag) return tag -- Gitee From 8eae489ebaaac91b8ea8e903915febc817f74592 Mon Sep 17 00:00:00 2001 From: jinsaihang Date: Tue, 22 Oct 2024 20:28:59 +0800 Subject: [PATCH 64/76] fix get_alarm error Signed-off-by: jinsaihang --- src/python/syssentry/alarm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py index b02aa84..ba581b4 100644 --- a/src/python/syssentry/alarm.py +++ b/src/python/syssentry/alarm.py @@ -139,8 +139,6 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di return [] alarm_id = task_alarm_id_dict[task_name] clear_time = alarm_id_clear_time_dict[alarm_id] - if clear_time < int(time_range): - return [] if alarm_id not in alarm_list_dict: logging.debug("alarm_id does not exist") return [] @@ -154,6 +152,9 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > time_range: stop_index = i break 
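# (illustrative note, not a patch line) the added check below introduces a
# second cut-off besides time_range: an alarm whose age exceeds its
# alarm_id's configured clear_time is truncated even when it still falls
# inside the user-supplied time_range, i.e. an alarm is kept only while
#   timestamp - xalarm_gettime(alarm) / MILLISECONDS_UNIT_SECONDS
# stays within min(time_range, clear_time).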
+ if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > clear_time: + stop_index = i + break if stop_index >= 0: alarm_list = alarm_list[:stop_index] logging.debug(f"get_alarm_result: final alarm_list of {alarm_id} has {len(alarm_list)} elements") -- Gitee From 519586d0417bed06d202fb8bd5d8d157c302bf3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> Date: Thu, 24 Oct 2024 09:39:16 +0800 Subject: [PATCH 65/76] ai_block_io support absolute threshold lower limit --- config/plugins/ai_block_io.ini | 19 +- .../sentryPlugins/ai_block_io/ai_block_io.py | 36 ++-- .../sentryPlugins/ai_block_io/alarm_report.py | 18 +- .../ai_block_io/config_parser.py | 168 ++++++++++++------ .../sentryPlugins/ai_block_io/detector.py | 92 ++++++---- .../ai_block_io/sliding_window.py | 21 ++- 6 files changed, 222 insertions(+), 132 deletions(-) diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini index 040237d..d0b1e74 100644 --- a/config/plugins/ai_block_io.ini +++ b/config/plugins/ai_block_io.ini @@ -2,9 +2,9 @@ level=info [common] -slow_io_detect_frequency=1 +period_time=1 disk=default -stage=bio +stage=default iotype=read,write [algorithm] @@ -12,22 +12,25 @@ train_data_duration=24 train_update_duration=2 algorithm_type=boxplot boxplot_parameter=1.5 -n_sigma_parameter=3 - -[sliding_window] -sliding_window_type=not_continuous -window_size=30 -window_minimum_threshold=6 +win_type=not_continuous +win_size=30 +win_threshold=6 [latency_sata_ssd] +read_avg_lim=10000 +write_avg_lim=10000 read_tot_lim=50000 write_tot_lim=50000 [latency_nvme_ssd] +read_avg_lim=300 +write_avg_lim=300 read_tot_lim=500 write_tot_lim=500 [latency_sata_hdd] +read_avg_lim=15000 +write_avg_lim=15000 read_tot_lim=50000 write_tot_lim=50000 diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py index f25e6d5..74f246a 100644 --- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py @@ -49,7 +49,7 @@ class SlowIODetection: def __init_detector_name_list(self): self._disk_list = check_collect_valid( - self._config_parser.slow_io_detect_frequency + self._config_parser.period_time ) if self._disk_list is None: Report.report_pass( @@ -109,7 +109,7 @@ class SlowIODetection: train_data_duration, train_update_duration = ( self._config_parser.get_train_data_duration_and_train_update_duration() ) - slow_io_detection_frequency = self._config_parser.slow_io_detect_frequency + slow_io_detection_frequency = self._config_parser.period_time threshold_type = self._config_parser.algorithm_type data_queue_size, update_size = get_data_queue_size_and_update_size( train_data_duration, train_update_duration, slow_io_detection_frequency @@ -131,10 +131,13 @@ class SlowIODetection: data_queue_size=data_queue_size, data_queue_update_size=update_size, ) - abs_threshold = self._config_parser.get_tot_lim( + tot_lim = self._config_parser.get_tot_lim( metric_name.disk_type, metric_name.io_access_type_name ) - if abs_threshold is None: + avg_lim = self._config_parser.get_avg_lim( + metric_name.disk_type, metric_name.io_access_type_name + ) + if tot_lim is None: logging.warning( "disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.", disk, @@ -145,7 +148,8 @@ class SlowIODetection: sliding_window_type, queue_length=window_size, threshold=window_threshold, - abs_threshold=abs_threshold, + abs_threshold=tot_lim, + avg_lim=avg_lim ) detector = 
Detector(metric_name, threshold, sliding_window) disk_detector.add_detector(detector) @@ -176,7 +180,7 @@ class SlowIODetection: # Step1:获取IO数据 io_data_dict_with_disk_name = get_io_data_from_collect_plug( - self._config_parser.slow_io_detect_frequency, self._disk_list + self._config_parser.period_time, self._disk_list ) logging.debug(f"step1. Get io data: {str(io_data_dict_with_disk_name)}") if io_data_dict_with_disk_name is None: @@ -197,25 +201,21 @@ class SlowIODetection: # Step3:慢IO事件上报 logging.debug("step3. Report slow io event to sysSentry.") for slow_io_event in slow_io_event_list: - metric_name: MetricName = slow_io_event[1] - window_info = slow_io_event[2] - root_cause = slow_io_event[3] alarm_content = { - "driver_name": f"{metric_name.disk_name}", - "reason": root_cause, - "block_stack": f"{metric_name.stage_name}", - "io_type": f"{metric_name.io_access_type_name}", + "driver_name": slow_io_event[1], + "reason": slow_io_event[2], + "block_stack": slow_io_event[3], + "io_type": slow_io_event[4], "alarm_source": "ai_block_io", - "alarm_type": "latency", - "details": f"disk type: {metric_name.disk_type}, current window: {window_info[1]}, " - f"ai threshold: {window_info[2]}, abs threshold: {window_info[3]}.", + "alarm_type": slow_io_event[5], + "details": slow_io_event[6], } Xalarm.major(alarm_content) - logging.warning(alarm_content) + logging.warning("[SLOW IO] " + str(alarm_content)) # Step4:等待检测时间 logging.debug("step4. Wait to start next slow io event detection loop.") - time.sleep(self._config_parser.slow_io_detect_frequency) + time.sleep(self._config_parser.period_time) def main(): diff --git a/src/python/sentryPlugins/ai_block_io/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py index 92bd6e3..61bb145 100644 --- a/src/python/sentryPlugins/ai_block_io/alarm_report.py +++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py @@ -30,17 +30,17 @@ class Report: @staticmethod def report_pass(info: str): report_result(Report.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info})) - logging.info(f'Report {Report.TASK_NAME} PASS: {info}') + logging.debug(f'Report {Report.TASK_NAME} PASS: {info}') @staticmethod def report_fail(info: str): report_result(Report.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info})) - logging.info(f'Report {Report.TASK_NAME} FAIL: {info}') + logging.debug(f'Report {Report.TASK_NAME} FAIL: {info}') @staticmethod def report_skip(info: str): report_result(Report.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info})) - logging.info(f'Report {Report.TASK_NAME} SKIP: {info}') + logging.debug(f'Report {Report.TASK_NAME} SKIP: {info}') class Xalarm: @@ -50,31 +50,31 @@ class Xalarm: def minor(info: dict): info_str = json.dumps(info) xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_OCCUR, info_str) - logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}") + logging.debug(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}") @staticmethod def major(info: dict): info_str = json.dumps(info) xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_OCCUR, info_str) - logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}") + logging.debug(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}") @staticmethod def critical(info: dict): info_str = json.dumps(info) xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_OCCUR, info_str) - logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}") + logging.debug(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}") def minor_recover(info: dict): info_str = json.dumps(info) 
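# (sketch, assumed caller-side usage) occur and recover reports pair up
# under the same Xalarm.ALARM_ID; only the type flag differs, e.g.:
#   Xalarm.minor({"alarm_source": "ai_block_io", "driver_name": "sda"})
#   ... fault clears ...
#   Xalarm.minor_recover({"alarm_source": "ai_block_io", "driver_name": "sda"})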
xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_RECOVER, info_str) - logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}") + logging.debug(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}") def major_recover(info: dict): info_str = json.dumps(info) xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_RECOVER, info_str) - logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}") + logging.debug(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}") def critical_recover(info: dict): info_str = json.dumps(info) xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_RECOVER, info_str) - logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}") + logging.debug(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}") diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py index 1117939..91ec5c6 100644 --- a/src/python/sentryPlugins/ai_block_io/config_parser.py +++ b/src/python/sentryPlugins/ai_block_io/config_parser.py @@ -52,7 +52,7 @@ class ConfigParser: DEFAULT_CONF = { "log": {"level": "info"}, "common": { - "slow_io_detect_frequency": 1, + "period_time": 1, "disk": None, "stage": "throtl,wbt,gettag,plug,deadline,hctx,requeue,rq_driver,bio", "iotype": "read,write", @@ -63,16 +63,32 @@ class ConfigParser: "algorithm_type": get_threshold_type_enum("boxplot"), "boxplot_parameter": 1.5, "n_sigma_parameter": 3.0, + "win_type": get_sliding_window_type_enum("not_continuous"), + "win_size": 30, + "win_threshold": 6, }, - "sliding_window": { - "sliding_window_type": get_sliding_window_type_enum("not_continuous"), - "window_size": 30, - "window_minimum_threshold": 6, + "latency_sata_ssd": { + "read_avg_lim": 10000, + "write_avg_lim": 10000, + "read_tot_lim": 50000, + "write_tot_lim": 50000 }, - "latency_sata_ssd": {"read_tot_lim": 50000, "write_tot_lim": 50000}, - "latency_nvme_ssd": {"read_tot_lim": 500, "write_tot_lim": 500}, - "latency_sata_hdd": {"read_tot_lim": 50000, "write_tot_lim": 50000}, - "iodump": {"read_iodump_lim": 0, "write_iodump_lim": 0} + "latency_nvme_ssd": { + "read_avg_lim": 300, + "write_avg_lim": 300, + "read_tot_lim": 500, + "write_tot_lim": 500 + }, + "latency_sata_hdd": { + "read_avg_lim": 15000, + "write_avg_lim": 15000, + "read_tot_lim": 50000, + "write_tot_lim": 50000 + }, + "iodump": { + "read_iodump_lim": 0, + "write_iodump_lim": 0 + } } def __init__(self, config_file_name): @@ -161,18 +177,18 @@ class ConfigParser: return value - def _read_slow_io_detect_frequency(self, items_common: dict): - self._conf["common"]["slow_io_detect_frequency"] = self._get_config_value( + def _read_period_time(self, items_common: dict): + self._conf["common"]["period_time"] = self._get_config_value( items_common, - "slow_io_detect_frequency", + "period_time", int, - self.DEFAULT_CONF["common"]["slow_io_detect_frequency"], + self.DEFAULT_CONF["common"]["period_time"], gt=0 ) - frequency = self._conf["common"]["slow_io_detect_frequency"] + frequency = self._conf["common"]["period_time"] ret = check_detect_frequency_is_valid(frequency) if ret is None: - log = f"slow io detect frequency: {frequency} is valid, "\ + log = f"period_time: {frequency} is valid, "\ f"Check whether the value range is too large or is not an "\ f"integer multiple of period_time.. exiting..." 
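# (illustrative, not a patch line) ret is None when the collector refuses
# the requested period, e.g. a period_time that is not an integer multiple
# of the collector's own sampling period, or one beyond the collector's
# retention window; both cases land on this report-and-exit path.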
Report.report_pass(log) @@ -316,50 +332,41 @@ class ConfigParser: self._conf["common"]["iotype"] = dup_iotype_list def _read_sliding_window_type(self, items_sliding_window: dict): - sliding_window_type = items_sliding_window.get("sliding_window_type") + sliding_window_type = items_sliding_window.get("win_type") if sliding_window_type is not None: - self._conf["sliding_window"]["sliding_window_type"] = ( + self._conf["algorithm"]["win_type"] = ( get_sliding_window_type_enum(sliding_window_type) ) - if self._conf["sliding_window"]["sliding_window_type"] is None: + if self._conf["algorithm"]["win_type"] is None: logging.critical( - "the sliding_window_type: %s you set is invalid. ai_block_io plug will exit.", + "the win_type: %s you set is invalid. ai_block_io plug will exit.", sliding_window_type, ) Report.report_pass( - f"the sliding_window_type: {sliding_window_type} you set is invalid. ai_block_io plug will exit." + f"the win_type: {sliding_window_type} you set is invalid. ai_block_io plug will exit." ) exit(1) def _read_window_size(self, items_sliding_window: dict): - self._conf["sliding_window"]["window_size"] = self._get_config_value( + self._conf["algorithm"]["win_size"] = self._get_config_value( items_sliding_window, - "window_size", + "win_size", int, - self.DEFAULT_CONF["sliding_window"]["window_size"], + self.DEFAULT_CONF["algorithm"]["win_size"], gt=0, - le=3600, + le=300, ) def _read_window_minimum_threshold(self, items_sliding_window: dict): - default_window_minimum_threshold = self.DEFAULT_CONF["sliding_window"][ - "window_minimum_threshold" - ] - if ( - default_window_minimum_threshold - > self._conf["sliding_window"]["window_size"] - ): - default_window_minimum_threshold = ( - self._conf["sliding_window"]["window_size"] / 2 - ) - self._conf["sliding_window"]["window_minimum_threshold"] = ( + default_window_minimum_threshold = self.DEFAULT_CONF["algorithm"]["win_threshold"] + self._conf["algorithm"]["win_threshold"] = ( self._get_config_value( items_sliding_window, - "window_minimum_threshold", + "win_threshold", int, default_window_minimum_threshold, gt=0, - le=self._conf["sliding_window"]["window_size"], + le=self._conf["algorithm"]["win_size"], ) ) @@ -406,7 +413,7 @@ class ConfigParser: if con.has_section("common"): items_common = dict(con.items("common")) - self._read_slow_io_detect_frequency(items_common) + self._read_period_time(items_common) self._read_disks_to_detect(items_common) self._read_stage(items_common) self._read_iotype(items_common) @@ -420,20 +427,9 @@ class ConfigParser: self._read_train_data_duration(items_algorithm) self._read_train_update_duration(items_algorithm) self._read_algorithm_type_and_parameter(items_algorithm) - else: - Report.report_pass("not found algorithm section. exiting...") - logging.critical("not found algorithm section. exiting...") - exit(1) - - if con.has_section("sliding_window"): - items_sliding_window = dict(con.items("sliding_window")) - - self._read_window_size(items_sliding_window) - self._read_window_minimum_threshold(items_sliding_window) - else: - Report.report_pass("not found sliding_window section. exiting...") - logging.critical("not found sliding_window section. 
exiting...") - exit(1) + self._read_sliding_window_type(items_algorithm) + self._read_window_size(items_algorithm) + self._read_window_minimum_threshold(items_algorithm) if con.has_section("latency_sata_ssd"): items_latency_sata_ssd = dict(con.items("latency_sata_ssd")) @@ -451,6 +447,20 @@ class ConfigParser: self.DEFAULT_CONF["latency_sata_ssd"]["write_tot_lim"], gt=0, ) + self._conf["latency_sata_ssd"]["read_avg_lim"] = self._get_config_value( + items_latency_sata_ssd, + "read_avg_lim", + int, + self.DEFAULT_CONF["latency_sata_ssd"]["read_avg_lim"], + gt=0 + ) + self._conf["latency_sata_ssd"]["write_avg_lim"] = self._get_config_value( + items_latency_sata_ssd, + "write_avg_lim", + int, + self.DEFAULT_CONF["latency_sata_ssd"]["write_avg_lim"], + gt=0 + ) else: Report.report_pass("not found latency_sata_ssd section. exiting...") logging.critical("not found latency_sata_ssd section. exiting...") @@ -472,6 +482,20 @@ class ConfigParser: self.DEFAULT_CONF["latency_nvme_ssd"]["write_tot_lim"], gt=0, ) + self._conf["latency_nvme_ssd"]["read_avg_lim"] = self._get_config_value( + items_latency_nvme_ssd, + "read_avg_lim", + int, + self.DEFAULT_CONF["latency_nvme_ssd"]["read_avg_lim"], + gt=0 + ) + self._conf["latency_nvme_ssd"]["write_avg_lim"] = self._get_config_value( + items_latency_nvme_ssd, + "write_avg_lim", + int, + self.DEFAULT_CONF["latency_nvme_ssd"]["write_avg_lim"], + gt=0 + ) else: Report.report_pass("not found latency_nvme_ssd section. exiting...") logging.critical("not found latency_nvme_ssd section. exiting...") @@ -493,6 +517,20 @@ class ConfigParser: self.DEFAULT_CONF["latency_sata_hdd"]["write_tot_lim"], gt=0, ) + self._conf["latency_sata_hdd"]["read_avg_lim"] = self._get_config_value( + items_latency_sata_hdd, + "read_avg_lim", + int, + self.DEFAULT_CONF["latency_sata_hdd"]["read_avg_lim"], + gt=0 + ) + self._conf["latency_sata_hdd"]["write_avg_lim"] = self._get_config_value( + items_latency_sata_hdd, + "write_avg_lim", + int, + self.DEFAULT_CONF["latency_sata_hdd"]["write_avg_lim"], + gt=0 + ) else: Report.report_pass("not found latency_sata_hdd section. exiting...") logging.critical("not found latency_sata_hdd section. 
exiting...") @@ -542,6 +580,18 @@ class ConfigParser: else: return None + def get_avg_lim(self, disk_type, io_type): + if io_type == "read": + return self._conf.get( + f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {} + ).get("read_avg_lim", None) + elif io_type == "write": + return self._conf.get( + f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {} + ).get("write_avg_lim", None) + else: + return None + def get_train_data_duration_and_train_update_duration(self): return ( self._conf["algorithm"]["train_data_duration"], @@ -550,13 +600,13 @@ class ConfigParser: def get_window_size_and_window_minimum_threshold(self): return ( - self._conf["sliding_window"]["window_size"], - self._conf["sliding_window"]["window_minimum_threshold"], + self._conf["algorithm"]["win_size"], + self._conf["algorithm"]["win_threshold"], ) @property - def slow_io_detect_frequency(self): - return self._conf["common"]["slow_io_detect_frequency"] + def period_time(self): + return self._conf["common"]["period_time"] @property def algorithm_type(self): @@ -564,7 +614,7 @@ class ConfigParser: @property def sliding_window_type(self): - return self._conf["sliding_window"]["sliding_window_type"] + return self._conf["algorithm"]["win_type"] @property def train_data_duration(self): @@ -576,11 +626,11 @@ class ConfigParser: @property def window_size(self): - return self._conf["sliding_window"]["window_size"] + return self._conf["algorithm"]["win_size"] @property def window_minimum_threshold(self): - return self._conf["sliding_window"]["window_minimum_threshold"] + return self._conf["algorithm"]["win_threshold"] @property def absolute_threshold(self): diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py index 8536f7a..e3a0952 100644 --- a/src/python/sentryPlugins/ai_block_io/detector.py +++ b/src/python/sentryPlugins/ai_block_io/detector.py @@ -28,9 +28,13 @@ class Detector: self._threshold.attach_observer(self._slidingWindow) self._count = None - def get_metric_name(self): + @property + def metric_name(self): return self._metric_name + def get_sliding_window_data(self): + return self._slidingWindow.get_data() + def is_slow_io_event(self, io_data_dict_with_disk_name: dict): if self._count is None: self._count = datetime.now() @@ -38,22 +42,27 @@ class Detector: now_time = datetime.now() time_diff = (now_time - self._count).total_seconds() if time_diff >= 60: - logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") + logging.info(f"({self._metric_name}) 's latest ai threshold is: {self._threshold.get_threshold()}.") self._count = None logging.debug(f'enter Detector: {self}') metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) if metric_value is None: logging.debug('not found metric value, so return None.') - return (False, False), None, None, None + return (False, False), None, None, None, None logging.debug(f'input metric value: {str(metric_value)}') self._threshold.push_latest_data_to_queue(metric_value) detection_result = self._slidingWindow.is_slow_io_event(metric_value) # 检测到慢周期,由Detector负责打印info级别日志 if detection_result[0][1]: - logging.info(f'[abnormal period happen]: disk info: {self._metric_name}, window: {detection_result[1]}, ' - f'current value: {metric_value}, ai threshold: {detection_result[2]}, ' - f'absolute threshold: {detection_result[3]}') + logging.info(f'[abnormal_period]: disk: {self._metric_name.disk_name}, ' + f'stage: 
{self._metric_name.stage_name}, ' + f'iotype: {self._metric_name.io_access_type_name}, ' + f'metric: {self._metric_name.metric_name}, ' + f'current value: {metric_value}, ' + f'ai threshold: {detection_result[2]}, ' + f'absolute threshold upper limit: {detection_result[3]}, ' + f'lower limit: {detection_result[4]}') else: logging.debug(f'Detection result: {str(detection_result)}') logging.debug(f'exit Detector: {self}') @@ -75,41 +84,60 @@ class DiskDetector: def add_detector(self, detector: Detector): self._detector_list.append(detector) + def get_detector_list_window(self): + latency_wins = {"read": {}, "write": {}} + iodump_wins = {"read": {}, "write": {}} + for detector in self._detector_list: + if detector.metric_name.metric_name == 'latency': + latency_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data() + elif detector.metric_name.metric_name == 'io_dump': + iodump_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data() + return latency_wins, iodump_wins + def is_slow_io_event(self, io_data_dict_with_disk_name: dict): - """ - 根因诊断逻辑:只有bio阶段发生异常,才认为发生了慢IO事件,即bio阶段异常是慢IO事件的必要条件 - 情况一:bio异常,rq_driver也异常,则慢盘 - 情况二:bio异常,rq_driver无异常,且有内核IO栈任意阶段异常,则IO栈异常 - 情况三:bio异常,rq_driver无异常,且无内核IO栈任意阶段异常,则IO压力大 - 情况四:bio异常,则UNKNOWN - """ - diagnosis_info = {"bio": [], "rq_driver": [], "io_stage": []} + diagnosis_info = {"bio": [], "rq_driver": [], "kernel_stack": []} for detector in self._detector_list: # result返回内容:(是否检测到慢IO,是否检测到慢周期)、窗口、ai阈值、绝对阈值 # 示例: (False, False), self._io_data_queue, self._ai_threshold, self._abs_threshold result = detector.is_slow_io_event(io_data_dict_with_disk_name) if result[0][0]: - if detector.get_metric_name().stage_name == "bio": - diagnosis_info["bio"].append((detector.get_metric_name(), result)) - elif detector.get_metric_name().stage_name == "rq_driver": - diagnosis_info["rq_driver"].append((detector.get_metric_name(), result)) + if detector.metric_name.stage_name == "bio": + diagnosis_info["bio"].append(detector.metric_name) + elif detector.metric_name.stage_name == "rq_driver": + diagnosis_info["rq_driver"].append(detector.metric_name) else: - diagnosis_info["io_stage"].append((detector.get_metric_name(), result)) + diagnosis_info["kernel_stack"].append(detector.metric_name) - # 返回内容:(1)是否检测到慢IO事件、(2)MetricName、(3)滑动窗口及阈值、(4)慢IO事件根因 - root_cause = None if len(diagnosis_info["bio"]) == 0: - return False, None, None, None - elif len(diagnosis_info["rq_driver"]) != 0: - root_cause = "[Root Cause: disk slow]" - elif len(diagnosis_info["io_stage"]) != 0: - stage_list = [] - for io_stage in diagnosis_info["io_stage"]: - stage_list.append(io_stage[0].stage_name) - root_cause = f"[Root Cause: io stage slow, stage: {stage_list}]" - if root_cause is None: - root_cause = "[Root Cause: high io pressure]" - return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause + return False, None, None, None, None, None, None + + driver_name = self._disk_name + reason = "unknown" + block_stack = set() + io_type = set() + alarm_type = set() + + for key, value in diagnosis_info.items(): + for metric_name in value: + block_stack.add(metric_name.stage_name) + io_type.add(metric_name.io_access_type_name) + alarm_type.add(metric_name.metric_name) + + latency_wins, iodump_wins = self.get_detector_list_window() + details = f"latency: {latency_wins}, iodump: {iodump_wins}" + + io_press = {"throtl", "wbt", "iocost", "bfq"} + driver_slow = 
{"rq_driver"} + kernel_slow = {"gettag", "plug", "deadline", "hctx", "requeue"} + + if not io_press.isdisjoint(block_stack): + reason = "io_press" + elif not driver_slow.isdisjoint(block_stack): + reason = "driver_slow" + elif not kernel_slow.isdisjoint(block_stack): + reason = "kernel_slow" + + return True, driver_name, reason, str(block_stack), str(io_type), str(alarm_type), details def __repr__(self): msg = f'disk: {self._disk_name}, ' diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py index cebe41f..4083c43 100644 --- a/src/python/sentryPlugins/ai_block_io/sliding_window.py +++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py @@ -21,11 +21,12 @@ class SlidingWindowType(Enum): class SlidingWindow: - def __init__(self, queue_length: int, threshold: int, abs_threshold: int = None): + def __init__(self, queue_length: int, threshold: int, abs_threshold: int = None, avg_lim: int = None): self._queue_length = queue_length self._queue_threshold = threshold self._ai_threshold = None self._abs_threshold = abs_threshold + self._avg_lim = avg_lim self._io_data_queue = [] self._io_data_queue_abnormal_tag = [] @@ -35,8 +36,13 @@ class SlidingWindow: self._io_data_queue_abnormal_tag.pop(0) self._io_data_queue.append(data) tag = False - if ((self._ai_threshold is not None and data > self._ai_threshold) or - (self._abs_threshold is not None and data > self._abs_threshold)): + if self._avg_lim is not None and data < self._avg_lim: + tag = False + self._io_data_queue_abnormal_tag.append(tag) + return tag + if self._ai_threshold is not None and data > self._ai_threshold: + tag = True + if self._abs_threshold is not None and data > self._abs_threshold: tag = True self._io_data_queue_abnormal_tag.append(tag) return tag @@ -52,6 +58,9 @@ class SlidingWindow: def is_slow_io_event(self, data): return False, None, None, None + def get_data(self): + return self._io_data_queue + def __repr__(self): return "[SlidingWindow]" @@ -64,7 +73,7 @@ class NotContinuousSlidingWindow(SlidingWindow): is_slow_io_event = False if self._io_data_queue_abnormal_tag.count(True) >= self._queue_threshold: is_slow_io_event = True - return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold + return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim def __repr__(self): return f"[NotContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]" @@ -85,7 +94,7 @@ class ContinuousSlidingWindow(SlidingWindow): break else: consecutive_count = 0 - return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold + return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim def __repr__(self): return f"[ContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]" @@ -100,7 +109,7 @@ class MedianSlidingWindow(SlidingWindow): median = np.median(self._io_data_queue) if median >= self._ai_threshold: is_slow_io_event = True - return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold + return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim def __repr__(self): return f"[MedianSlidingWindow, window size: {self._queue_length}]" -- Gitee From 
12b84adbdc2c494110f5a6c2fd2f1c182165652f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> Date: Fri, 25 Oct 2024 15:34:25 +0800 Subject: [PATCH 66/76] ai_block_io fix some config parameters parse bug --- .../sentryPlugins/ai_block_io/ai_block_io.py | 70 +++++---- .../ai_block_io/config_parser.py | 135 ++++++++++++++---- .../sentryPlugins/ai_block_io/data_access.py | 14 ++ .../sentryPlugins/ai_block_io/detector.py | 16 ++- .../ai_block_io/sliding_window.py | 2 +- .../sentryPlugins/ai_block_io/threshold.py | 14 +- src/python/sentryPlugins/ai_block_io/utils.py | 2 - 7 files changed, 180 insertions(+), 73 deletions(-) diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py index 74f246a..14f740d 100644 --- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py @@ -23,6 +23,7 @@ from .data_access import ( get_io_data_from_collect_plug, check_collect_valid, get_disk_type, + check_disk_is_available ) from .io_data import MetricName from .alarm_report import Xalarm, Report @@ -31,14 +32,14 @@ CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini" def sig_handler(signum, frame): - logging.info("receive signal: %d", signum) Report.report_pass(f"receive signal: {signum}, exiting...") + logging.info("Finished ai_block_io plugin running.") exit(signum) class SlowIODetection: _config_parser = None - _disk_list = None + _disk_list = [] _detector_name_list = defaultdict(list) _disk_detectors = {} @@ -48,32 +49,30 @@ class SlowIODetection: self.__init_detector() def __init_detector_name_list(self): - self._disk_list = check_collect_valid( - self._config_parser.period_time - ) - if self._disk_list is None: - Report.report_pass( - "get available disk error, please check if the collector plug is enable. exiting..." - ) - logging.critical("get available disk error, please check if the collector plug is enable. exiting...") - exit(1) - - logging.info(f"ai_block_io plug has found disks: {self._disk_list}") disks: list = self._config_parser.disks_to_detection stages: list = self._config_parser.stage iotypes: list = self._config_parser.iotype - # 情况1:None,则启用所有磁盘检测 - # 情况2:is not None and len = 0,则不启动任何磁盘检测 - # 情况3:len != 0,则取交集 + if disks is None: - logging.warning( - "you not specify any disk or use default, so ai_block_io will enable all available disk." - ) - for disk in self._disk_list: - if disks is not None: - if disk not in disks: - continue - disks.remove(disk) + logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.") + all_available_disk_list = check_collect_valid(self._config_parser.period_time) + if all_available_disk_list is None: + Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...") + logging.critical("get available disk error, please check if the collector plug is enable. exiting...") + exit(1) + if len(all_available_disk_list) == 0: + Report.report_pass("not found available disk. exiting...") + logging.critical("not found available disk. 
exiting...") + exit(1) + disks = all_available_disk_list + logging.info(f"available disk list is follow: {disks}.") + + for disk in disks: + tmp_disk = [disk] + ret = check_disk_is_available(self._config_parser.period_time, tmp_disk) + if not ret: + logging.warning(f"disk: {disk} is not available, it will be ignored.") + continue disk_type_result = get_disk_type(disk) if disk_type_result["ret"] == 0 and disk_type_result["message"] in ( @@ -89,20 +88,15 @@ class SlowIODetection: disk_type_result, ) continue + self._disk_list.append(disk) for stage in stages: for iotype in iotypes: self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "latency")) self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "io_dump")) - if disks: - logging.warning( - "disks: %s not in available disk list, so they will be ignored.", - disks, - ) + if not self._detector_name_list: + Report.report_pass("the disks to detection is empty, ai_block_io will exit.") logging.critical("the disks to detection is empty, ai_block_io will exit.") - Report.report_pass( - "the disks to detection is empty, ai_block_io will exit." - ) exit(1) def __init_detector(self): @@ -202,16 +196,20 @@ class SlowIODetection: logging.debug("step3. Report slow io event to sysSentry.") for slow_io_event in slow_io_event_list: alarm_content = { + "alarm_source": "ai_block_io", "driver_name": slow_io_event[1], + "io_type": slow_io_event[4], "reason": slow_io_event[2], "block_stack": slow_io_event[3], - "io_type": slow_io_event[4], - "alarm_source": "ai_block_io", "alarm_type": slow_io_event[5], - "details": slow_io_event[6], + "details": slow_io_event[6] } Xalarm.major(alarm_content) - logging.warning("[SLOW IO] " + str(alarm_content)) + tmp_alarm_content = alarm_content.copy() + del tmp_alarm_content["details"] + logging.warning("[SLOW IO] " + str(tmp_alarm_content)) + logging.warning(f"latency: " + str(alarm_content.get("details").get("latency"))) + logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump"))) # Step4:等待检测时间 logging.debug("step4. Wait to start next slow io event detection loop.") diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py index 91ec5c6..3049db2 100644 --- a/src/python/sentryPlugins/ai_block_io/config_parser.py +++ b/src/python/sentryPlugins/ai_block_io/config_parser.py @@ -105,21 +105,26 @@ class ConfigParser: ge=None, lt=None, le=None, + section=None ): + if section is not None: + print_key = section + "." + key + else: + print_key = key value = config_items.get(key) if value is None: logging.warning( "config of %s not found, the default value %s will be used.", - key, + print_key, default_value, ) value = default_value if not value: logging.critical( - "the value of %s is empty, ai_block_io plug will exit.", key + "the value of %s is empty, ai_block_io plug will exit.", print_key ) Report.report_pass( - f"the value of {key} is empty, ai_block_io plug will exit." + f"the value of {print_key} is empty, ai_block_io plug will exit." ) exit(1) try: @@ -127,51 +132,51 @@ class ConfigParser: except ValueError: logging.critical( "the value of %s is not a valid %s, ai_block_io plug will exit.", - key, + print_key, value_type, ) Report.report_pass( - f"the value of {key} is not a valid {value_type}, ai_block_io plug will exit." + f"the value of {print_key} is not a valid {value_type}, ai_block_io plug will exit." 
) exit(1) if gt is not None and value <= gt: logging.critical( "the value of %s is not greater than %s, ai_block_io plug will exit.", - key, + print_key, gt, ) Report.report_pass( - f"the value of {key} is not greater than {gt}, ai_block_io plug will exit." + f"the value of {print_key} is not greater than {gt}, ai_block_io plug will exit." ) exit(1) if ge is not None and value < ge: logging.critical( "the value of %s is not greater than or equal to %s, ai_block_io plug will exit.", - key, + print_key, ge, ) Report.report_pass( - f"the value of {key} is not greater than or equal to {ge}, ai_block_io plug will exit." + f"the value of {print_key} is not greater than or equal to {ge}, ai_block_io plug will exit." ) exit(1) if lt is not None and value >= lt: logging.critical( "the value of %s is not less than %s, ai_block_io plug will exit.", - key, + print_key, lt, ) Report.report_pass( - f"the value of {key} is not less than {lt}, ai_block_io plug will exit." + f"the value of {print_key} is not less than {lt}, ai_block_io plug will exit." ) exit(1) if le is not None and value > le: logging.critical( "the value of %s is not less than or equal to %s, ai_block_io plug will exit.", - key, + print_key, le, ) Report.report_pass( - f"the value of {key} is not less than or equal to {le}, ai_block_io plug will exit." + f"the value of {print_key} is not less than or equal to {le}, ai_block_io plug will exit." ) exit(1) @@ -188,7 +193,7 @@ class ConfigParser: frequency = self._conf["common"]["period_time"] ret = check_detect_frequency_is_valid(frequency) if ret is None: - log = f"period_time: {frequency} is valid, "\ + log = f"period_time: {frequency} is invalid, "\ f"Check whether the value range is too large or is not an "\ f"integer multiple of period_time.. exiting..." 
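# (sketch of the existing convention, no new behavior) fatal config errors
# in this file always take the same three steps: Report.report_pass(log) so
# sysSentry records the task result, logging.critical(log) for the local
# log, then exit(1).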
Report.report_pass(log) @@ -202,6 +207,7 @@ class ConfigParser: self._conf["common"]["disk"] = None return disks_to_detection = disks_to_detection.strip() + disks_to_detection = disks_to_detection.lower() if not disks_to_detection: logging.critical("the value of disk is empty, ai_block_io plug will exit.") Report.report_pass( @@ -213,7 +219,18 @@ class ConfigParser: if len(disk_list) == 1 and disk_list[0] == "default": self._conf["common"]["disk"] = None return - self._conf["common"]["disk"] = disk_list + if len(disk_list) > 10: + ten_disk_list = disk_list[0:10] + other_disk_list = disk_list[10:] + logging.warning(f"disk only support maximum is 10, disks: {ten_disk_list} will be retained, other: {other_disk_list} will be ignored.") + else: + ten_disk_list = disk_list + set_ten_disk_list = set(ten_disk_list) + if len(ten_disk_list) > len(set_ten_disk_list): + tmp = ten_disk_list + ten_disk_list = list(set_ten_disk_list) + logging.warning(f"disk exist duplicate, it will be deduplicate, before: {tmp}, after: {ten_disk_list}") + self._conf["common"]["disk"] = ten_disk_list def _read_train_data_duration(self, items_algorithm: dict): self._conf["algorithm"]["train_data_duration"] = self._get_config_value( @@ -244,10 +261,12 @@ class ConfigParser: def _read_algorithm_type_and_parameter(self, items_algorithm: dict): algorithm_type = items_algorithm.get("algorithm_type") - if algorithm_type is not None: - self._conf["algorithm"]["algorithm_type"] = get_threshold_type_enum( - algorithm_type - ) + if algorithm_type is None: + default_algorithm_type = self._conf["algorithm"]["algorithm_type"] + logging.warning(f"algorithm_type not found, it will be set default: {default_algorithm_type}") + else: + self._conf["algorithm"]["algorithm_type"] = get_threshold_type_enum(algorithm_type) + if self._conf["algorithm"]["algorithm_type"] is None: logging.critical( "the algorithm_type: %s you set is invalid. ai_block_io plug will exit.", @@ -257,6 +276,7 @@ class ConfigParser: f"the algorithm_type: {algorithm_type} you set is invalid. ai_block_io plug will exit." 
             )
             exit(1)
+
+        elif self._conf["algorithm"]["algorithm_type"] == ThresholdType.NSigmaThreshold:
             self._conf["algorithm"]["n_sigma_parameter"] = self._get_config_value(
                 items_algorithm,
@@ -279,9 +299,14 @@ class ConfigParser:
         )
 
     def _read_stage(self, items_algorithm: dict):
-        stage_str = items_algorithm.get(
-            "stage", self.DEFAULT_CONF["common"]["stage"]
-        ).strip()
+        stage_str = items_algorithm.get("stage")
+        if stage_str is None:
+            stage_str = self.DEFAULT_CONF["common"]["stage"]
+            logging.warning(f"stage not found, it will be set default: {stage_str}")
+        else:
+            stage_str = stage_str.strip()
+
+        stage_str = stage_str.lower()
         stage_list = stage_str.split(",")
         stage_list = [stage.strip() for stage in stage_list]
         if len(stage_list) == 1 and stage_list[0] == "":
@@ -307,9 +332,14 @@ class ConfigParser:
         self._conf["common"]["stage"] = dup_stage_list
 
     def _read_iotype(self, items_algorithm: dict):
-        iotype_str = items_algorithm.get(
-            "iotype", self.DEFAULT_CONF["common"]["iotype"]
-        ).strip()
+        iotype_str = items_algorithm.get("iotype")
+        if iotype_str is None:
+            iotype_str = self.DEFAULT_CONF["common"]["iotype"]
+            logging.warning(f"iotype not found, it will be set default: {iotype_str}")
+        else:
+            iotype_str = iotype_str.strip()
+
+        iotype_str = iotype_str.lower()
         iotype_list = iotype_str.split(",")
         iotype_list = [iotype.strip() for iotype in iotype_list]
         if len(iotype_list) == 1 and iotype_list[0] == "":
@@ -333,6 +363,13 @@ class ConfigParser:
 
     def _read_sliding_window_type(self, items_sliding_window: dict):
         sliding_window_type = items_sliding_window.get("win_type")
+
+        if sliding_window_type is None:
+            default_sliding_window_type = self._conf["algorithm"]["win_type"]
+            logging.warning(f"win_type not found, it will be set default: {default_sliding_window_type}")
+            return
+
+        sliding_window_type = sliding_window_type.strip()
         if sliding_window_type is not None:
             self._conf["algorithm"]["win_type"] = (
                 get_sliding_window_type_enum(sliding_window_type)
             )
@@ -439,6 +476,7 @@ class ConfigParser:
                 int,
                 self.DEFAULT_CONF["latency_sata_ssd"]["read_tot_lim"],
                 gt=0,
+                section="latency_sata_ssd"
             )
             self._conf["latency_sata_ssd"]["write_tot_lim"] = self._get_config_value(
                 items_latency_sata_ssd,
                 "write_tot_lim",
                 int,
                 self.DEFAULT_CONF["latency_sata_ssd"]["write_tot_lim"],
                 gt=0,
+                section="latency_sata_ssd"
             )
             self._conf["latency_sata_ssd"]["read_avg_lim"] = self._get_config_value(
                 items_latency_sata_ssd,
                 "read_avg_lim",
                 int,
                 self.DEFAULT_CONF["latency_sata_ssd"]["read_avg_lim"],
-                gt=0
+                gt=0,
+                section="latency_sata_ssd"
             )
             self._conf["latency_sata_ssd"]["write_avg_lim"] = self._get_config_value(
                 items_latency_sata_ssd,
                 "write_avg_lim",
                 int,
                 self.DEFAULT_CONF["latency_sata_ssd"]["write_avg_lim"],
-                gt=0
+                gt=0,
+                section="latency_sata_ssd"
             )
+            if self._conf["latency_sata_ssd"]["read_avg_lim"] >= self._conf["latency_sata_ssd"]["read_tot_lim"]:
+                Report.report_pass("latency_sata_ssd.read_avg_lim must < latency_sata_ssd.read_tot_lim . exiting...")
+                logging.critical("latency_sata_ssd.read_avg_lim must < latency_sata_ssd.read_tot_lim . exiting...")
+                exit(1)
+            if self._conf["latency_sata_ssd"]["write_avg_lim"] >= self._conf["latency_sata_ssd"]["write_tot_lim"]:
+                Report.report_pass("latency_sata_ssd.write_avg_lim must < latency_sata_ssd.write_tot_lim . exiting...")
+                logging.critical("latency_sata_ssd.write_avg_lim must < latency_sata_ssd.write_tot_lim . exiting...")
+                exit(1)
         else:
             Report.report_pass("not found latency_sata_ssd section.
exiting...") logging.critical("not found latency_sata_ssd section. exiting...") @@ -474,6 +523,7 @@ class ConfigParser: int, self.DEFAULT_CONF["latency_nvme_ssd"]["read_tot_lim"], gt=0, + section="latency_nvme_ssd" ) self._conf["latency_nvme_ssd"]["write_tot_lim"] = self._get_config_value( items_latency_nvme_ssd, @@ -481,21 +531,32 @@ class ConfigParser: int, self.DEFAULT_CONF["latency_nvme_ssd"]["write_tot_lim"], gt=0, + section="latency_nvme_ssd" ) self._conf["latency_nvme_ssd"]["read_avg_lim"] = self._get_config_value( items_latency_nvme_ssd, "read_avg_lim", int, self.DEFAULT_CONF["latency_nvme_ssd"]["read_avg_lim"], - gt=0 + gt=0, + section="latency_nvme_ssd" ) self._conf["latency_nvme_ssd"]["write_avg_lim"] = self._get_config_value( items_latency_nvme_ssd, "write_avg_lim", int, self.DEFAULT_CONF["latency_nvme_ssd"]["write_avg_lim"], - gt=0 + gt=0, + section="latency_nvme_ssd" ) + if self._conf["latency_nvme_ssd"]["read_avg_lim"] >= self._conf["latency_nvme_ssd"]["read_tot_lim"]: + Report.report_pass("latency_nvme_ssd.read_avg_lim must < latency_nvme_ssd.read_tot_lim . exiting...") + logging.critical("latency_nvme_ssd.read_avg_lim must < latency_nvme_ssd.read_tot_lim . exiting...") + exit(1) + if self._conf["latency_nvme_ssd"]["write_avg_lim"] >= self._conf["latency_nvme_ssd"]["write_tot_lim"]: + Report.report_pass("latency_nvme_ssd.write_avg_lim must < latency_nvme_ssd.write_tot_lim . exiting...") + logging.critical("latency_nvme_ssd.write_avg_lim must < latency_nvme_ssd.write_tot_lim . exiting...") + exit(1) else: Report.report_pass("not found latency_nvme_ssd section. exiting...") logging.critical("not found latency_nvme_ssd section. exiting...") @@ -509,6 +570,7 @@ class ConfigParser: int, self.DEFAULT_CONF["latency_sata_hdd"]["read_tot_lim"], gt=0, + section="latency_sata_hdd" ) self._conf["latency_sata_hdd"]["write_tot_lim"] = self._get_config_value( items_latency_sata_hdd, @@ -516,21 +578,32 @@ class ConfigParser: int, self.DEFAULT_CONF["latency_sata_hdd"]["write_tot_lim"], gt=0, + section="latency_sata_hdd" ) self._conf["latency_sata_hdd"]["read_avg_lim"] = self._get_config_value( items_latency_sata_hdd, "read_avg_lim", int, self.DEFAULT_CONF["latency_sata_hdd"]["read_avg_lim"], - gt=0 + gt=0, + section="latency_sata_hdd" ) self._conf["latency_sata_hdd"]["write_avg_lim"] = self._get_config_value( items_latency_sata_hdd, "write_avg_lim", int, self.DEFAULT_CONF["latency_sata_hdd"]["write_avg_lim"], - gt=0 + gt=0, + section="latency_sata_hdd" ) + if self._conf["latency_sata_hdd"]["read_avg_lim"] >= self._conf["latency_sata_hdd"]["read_tot_lim"]: + Report.report_pass("latency_sata_hdd.read_avg_lim must < latency_sata_hdd.read_tot_lim . exiting...") + logging.critical("latency_sata_hdd.read_avg_lim must < latency_sata_hdd.read_tot_lim . exiting...") + exit(1) + if self._conf["latency_sata_hdd"]["write_avg_lim"] >= self._conf["latency_sata_hdd"]["write_tot_lim"]: + Report.report_pass("latency_sata_hdd.write_avg_lim must < latency_sata_hdd.write_tot_lim . exiting...") + logging.critical("latency_sata_hdd.write_avg_lim must < latency_sata_hdd.write_tot_lim . exiting...") + exit(1) else: Report.report_pass("not found latency_sata_hdd section. exiting...") logging.critical("not found latency_sata_hdd section. 
exiting...") diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py index e4869d5..2f2d607 100644 --- a/src/python/sentryPlugins/ai_block_io/data_access.py +++ b/src/python/sentryPlugins/ai_block_io/data_access.py @@ -67,6 +67,20 @@ def check_detect_frequency_is_valid(period): return None +def check_disk_is_available(period_time, disk): + data_raw = is_iocollect_valid(period_time, disk) + if data_raw["ret"] == 0: + try: + data = json.loads(data_raw["message"]) + except Exception as e: + return False + if not data: + return False + return True + else: + return False + + def _get_raw_data(period, disk_list): return get_io_data( period, diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py index e3a0952..496e032 100644 --- a/src/python/sentryPlugins/ai_block_io/detector.py +++ b/src/python/sentryPlugins/ai_block_io/detector.py @@ -75,6 +75,18 @@ class Detector: f' sliding_window_type: {self._slidingWindow}') +def set_to_str(parameter: set): + ret = "" + parameter = list(parameter) + length = len(parameter) + for i in range(length): + if i == 0: + ret += parameter[i] + else: + ret += "," + parameter[i] + return ret + + class DiskDetector: def __init__(self, disk_name: str): @@ -124,7 +136,7 @@ class DiskDetector: alarm_type.add(metric_name.metric_name) latency_wins, iodump_wins = self.get_detector_list_window() - details = f"latency: {latency_wins}, iodump: {iodump_wins}" + details = {"latency": latency_wins, "iodump": iodump_wins} io_press = {"throtl", "wbt", "iocost", "bfq"} driver_slow = {"rq_driver"} @@ -137,7 +149,7 @@ class DiskDetector: elif not kernel_slow.isdisjoint(block_stack): reason = "kernel_slow" - return True, driver_name, reason, str(block_stack), str(io_type), str(alarm_type), details + return True, driver_name, reason, set_to_str(block_stack), set_to_str(io_type), set_to_str(alarm_type), details def __repr__(self): msg = f'disk: {self._disk_name}, ' diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py index 4083c43..ff3fa3b 100644 --- a/src/python/sentryPlugins/ai_block_io/sliding_window.py +++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py @@ -107,7 +107,7 @@ class MedianSlidingWindow(SlidingWindow): if len(self._io_data_queue) < self._queue_length or (self._ai_threshold is None and self._abs_threshold is None): is_slow_io_event = False median = np.median(self._io_data_queue) - if median >= self._ai_threshold: + if (self._ai_threshold is not None and median > self._ai_threshold) or (self._abs_threshold is not None and median > self._abs_threshold): is_slow_io_event = True return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim diff --git a/src/python/sentryPlugins/ai_block_io/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py index 600d041..e202bb8 100644 --- a/src/python/sentryPlugins/ai_block_io/threshold.py +++ b/src/python/sentryPlugins/ai_block_io/threshold.py @@ -65,9 +65,12 @@ class Threshold: def __repr__(self): return "Threshold" + def __str__(self): + return "Threshold" + class AbsoluteThreshold(Threshold): - def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000): + def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs): super().__init__(data_queue_size, data_queue_update_size) def push_latest_data_to_queue(self, 
data): @@ -76,6 +79,9 @@ class AbsoluteThreshold(Threshold): def __repr__(self): return "[AbsoluteThreshold]" + def __str__(self): + return "absolute" + class BoxplotThreshold(Threshold): def __init__(self, boxplot_parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs): @@ -112,6 +118,9 @@ class BoxplotThreshold(Threshold): def __repr__(self): return f"[BoxplotThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]" + def __str__(self): + return "boxplot" + class NSigmaThreshold(Threshold): def __init__(self, n_sigma_parameter: float = 3.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs): @@ -147,6 +156,9 @@ class NSigmaThreshold(Threshold): def __repr__(self): return f"[NSigmaThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]" + def __str__(self): + return "n_sigma" + class ThresholdType(Enum): AbsoluteThreshold = 0 diff --git a/src/python/sentryPlugins/ai_block_io/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py index d6f4067..7d2390b 100644 --- a/src/python/sentryPlugins/ai_block_io/utils.py +++ b/src/python/sentryPlugins/ai_block_io/utils.py @@ -19,8 +19,6 @@ from .io_data import MetricName, IOData def get_threshold_type_enum(algorithm_type: str): - if algorithm_type.lower() == "absolute": - return ThresholdType.AbsoluteThreshold if algorithm_type.lower() == "boxplot": return ThresholdType.BoxplotThreshold if algorithm_type.lower() == "n_sigma": -- Gitee From 50b82207a1fec0fd5833a5e87058b049085e81c4 Mon Sep 17 00:00:00 2001 From: jinsaihang Date: Sat, 26 Oct 2024 07:18:16 +0000 Subject: [PATCH 67/76] fix alarm_info newline break error Signed-off-by: jinsaihang --- src/python/syssentry/alarm.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py index ba581b4..37042d7 100644 --- a/src/python/syssentry/alarm.py +++ b/src/python/syssentry/alarm.py @@ -180,7 +180,30 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di if 'details' in alarm_info: alarm_info.pop('details', None) alarm.pop('msg1', None) + + # dump each {key,value} of details in one line + if 'details' in alarm_info and isinstance(alarm_info['details'], dict): + for key in alarm_info['details']: + alarm_info['details'][key] = json.dumps(alarm_info['details'][key], indent=None) + alarm['alarm_info'] = alarm_info + alarm_list = [alarm for alarm in alarm_list if 'alarm_source' in alarm['alarm_info'] and alarm['alarm_info']['alarm_source'] == task_name] + + alarm_level_mapping = { + 1: 'MINOR_ALM', + 2: 'MAJOR_ALM', + 3: 'CRITICAL_ALM' + } + + alarm_type_mapping = { + 1: 'ALARM_TYPE_OCCUR', + 2: 'ALARM_TYPE_RECOVER' + } + + for alarm in alarm_list: + alarm['alarm_level'] = alarm_level_mapping.get(alarm['alarm_level'], 'UNKNOWN_LEVEL') + alarm['alarm_type'] = alarm_type_mapping.get(alarm['alarm_type'], 'UNKNOWN_TYPE') return alarm_list + finally: alarm_list_lock.release() -- Gitee From bd2948fcbc39d9a92f598af0fdf0620c78ac770a Mon Sep 17 00:00:00 2001 From: jinsaihang Date: Mon, 28 Oct 2024 09:22:53 +0800 Subject: [PATCH 68/76] get_alarm -d abnomal display Signed-off-by: jinsaihang --- src/python/syssentry/alarm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py index 37042d7..e07867b 100644 --- a/src/python/syssentry/alarm.py 
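A minimal sketch of what the alarm_level/alarm_type mappings introduced in [PATCH 67/76] above do to one record (the input dict is illustrative; the mapping tables mirror the patch):

    alarm = {"alarm_level": 2, "alarm_type": 1}
    alarm_level_mapping = {1: 'MINOR_ALM', 2: 'MAJOR_ALM', 3: 'CRITICAL_ALM'}
    alarm_type_mapping = {1: 'ALARM_TYPE_OCCUR', 2: 'ALARM_TYPE_RECOVER'}
    alarm['alarm_level'] = alarm_level_mapping.get(alarm['alarm_level'], 'UNKNOWN_LEVEL')
    alarm['alarm_type'] = alarm_type_mapping.get(alarm['alarm_type'], 'UNKNOWN_TYPE')
    # alarm -> {'alarm_level': 'MAJOR_ALM', 'alarm_type': 'ALARM_TYPE_OCCUR'}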
From bd2948fcbc39d9a92f598af0fdf0620c78ac770a Mon Sep 17 00:00:00 2001
From: jinsaihang
Date: Mon, 28 Oct 2024 09:22:53 +0800
Subject: [PATCH 68/76] get_alarm -d abnormal display

Signed-off-by: jinsaihang
---
 src/python/syssentry/alarm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
index 37042d7..e07867b 100644
--- a/src/python/syssentry/alarm.py
+++ b/src/python/syssentry/alarm.py
@@ -184,7 +184,7 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
             # dump each {key,value} of details in one line
             if 'details' in alarm_info and isinstance(alarm_info['details'], dict):
                 for key in alarm_info['details']:
-                    alarm_info['details'][key] = json.dumps(alarm_info['details'][key], indent=None)
+                    alarm_info['details'][key] = str(alarm_info['details'][key])
 
             alarm['alarm_info'] = alarm_info
         alarm_list = [alarm for alarm in alarm_list if 'alarm_source' in alarm['alarm_info'] and alarm['alarm_info']['alarm_source'] == task_name]
-- Gitee

From ccfca2a5619e5c59aadbaecd67804eae92b56a76 Mon Sep 17 00:00:00 2001
From: zhangnan
Date: Mon, 28 Oct 2024 17:32:49 +0800
Subject: [PATCH 69/76] modify logrotate rule

---
 config/logrotate | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/config/logrotate b/config/logrotate
index f54e7b3..3dc77f5 100644
--- a/config/logrotate
+++ b/config/logrotate
@@ -1,8 +1,9 @@
 /var/log/sysSentry/*.log {
-    nocompress
+    compress
     missingok
     notifempty
     copytruncate
     rotate 2
     size +4096k
+    hourly
 }
-- Gitee

From c5f2a83ae5289778065a72258c08f4d659d34b53 Mon Sep 17 00:00:00 2001
From: jinsaihang
Date: Thu, 31 Oct 2024 16:00:50 +0800
Subject: [PATCH 70/76] fix excessive CPU usage

Signed-off-by: jinsaihang
---
 src/python/xalarm/xalarm_transfer.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py
index 75807e0..d135095 100644
--- a/src/python/xalarm/xalarm_transfer.py
+++ b/src/python/xalarm/xalarm_transfer.py
@@ -62,7 +62,6 @@ def cleanup_closed_connections(server_sock, epoll, fd_to_socket):
             to_remove.append(fileno)
 
     for fileno in to_remove:
-        epoll.unregister(fileno)
         fd_to_socket[fileno].close()
         del fd_to_socket[fileno]
         logging.info(f"cleaned up connection {fileno} for client lost connection.")
@@ -97,7 +96,6 @@ def wait_for_connection(server_sock, epoll, fd_to_socket, thread_should_stop):
                     logging.info(f"connection reach max num of {MAX_CONNECTION_NUM}, closed current connection!")
                     connection.close()
                     continue
-                epoll.register(connection.fileno(), select.EPOLLOUT)
                 fd_to_socket[connection.fileno()] = connection
         except socket.error as e:
             logging.debug(f"socket error, reason is {e}")
@@ -122,7 +120,6 @@ def transmit_alarm(server_sock, epoll, fd_to_socket, bin_data):
         except (BrokenPipeError, ConnectionResetError):
             to_remove.append(fileno)
     for fileno in to_remove:
-        epoll.unregister(fileno)
         fd_to_socket[fileno].close()
         del fd_to_socket[fileno]
         logging.info(f"cleaned up connection {fileno} for client lost connection.")
-- Gitee
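The three deletions above share one cause: every accepted client was registered with EPOLLOUT, and a healthy socket is almost always write-ready, so each poll returned immediately and the loop spun at full speed. A minimal sketch of the effect (assumes Linux; select.epoll is not available elsewhere):

import select
import socket

a, b = socket.socketpair()
epoll = select.epoll()

# A connected socket is nearly always write-ready, so polling on EPOLLOUT
# returns instantly on every call -- a busy loop if done repeatedly.
epoll.register(a.fileno(), select.EPOLLOUT)
print(epoll.poll(timeout=1))   # returns at once: [(fd, EPOLLOUT)]

# Waiting for EPOLLIN instead blocks until there is actual data to read.
epoll.modify(a.fileno(), select.EPOLLIN)
print(epoll.poll(timeout=1))   # [] after the full second: nothing to read

epoll.close()
a.close()
b.close()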
From 87879d697eb18022369c5da27894d97e5af2a0a0 Mon Sep 17 00:00:00 2001
From: gaoruoshu
Date: Tue, 5 Nov 2024 10:31:10 +0800
Subject: [PATCH 71/76] change avg_block_io config

---
 config/plugins/avg_block_io.ini                 | 8 ++++----
 src/python/sentryPlugins/avg_block_io/config.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini
index 5c4b9b0..3b4ee33 100644
--- a/config/plugins/avg_block_io.ini
+++ b/config/plugins/avg_block_io.ini
@@ -12,12 +12,12 @@ win_size=30
 win_threshold=6
 
 [latency_nvme_ssd]
-read_avg_lim=300
-write_avg_lim=300
+read_avg_lim=10000
+write_avg_lim=10000
 read_avg_time=3
 write_avg_time=3
-read_tot_lim=500
-write_tot_lim=500
+read_tot_lim=50000
+write_tot_lim=50000
 
 [latency_sata_ssd]
 read_avg_lim=10000
diff --git a/src/python/sentryPlugins/avg_block_io/config.py b/src/python/sentryPlugins/avg_block_io/config.py
index c8f45ce..c1e8ab1 100644
--- a/src/python/sentryPlugins/avg_block_io/config.py
+++ b/src/python/sentryPlugins/avg_block_io/config.py
@@ -42,12 +42,12 @@ DEFAULT_PARAM = {
         CONF_ALGO_SIZE: 30,
         CONF_ALGO_THRE: 6
     }, 'latency_nvme_ssd': {
-        'read_avg_lim': 300,
-        'write_avg_lim': 300,
+        'read_avg_lim': 10000,
+        'write_avg_lim': 10000,
         'read_avg_time': 3,
         'write_avg_time': 3,
-        'read_tot_lim': 500,
-        'write_tot_lim': 500,
+        'read_tot_lim': 50000,
+        'write_tot_lim': 50000,
     }, 'latency_sata_ssd' : {
         'read_avg_lim': 10000,
         'write_avg_lim': 10000,
-- Gitee

From a48e20ebeaa23a4e79cba5f8eb43476e2daaa91e Mon Sep 17 00:00:00 2001
From: znzjugod
Date: Tue, 5 Nov 2024 11:47:56 +0800
Subject: [PATCH 72/76] update nvme config

---
 config/plugins/ai_block_io.ini                        | 8 ++++----
 src/python/sentryPlugins/ai_block_io/config_parser.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini
index d0b1e74..69f44ba 100644
--- a/config/plugins/ai_block_io.ini
+++ b/config/plugins/ai_block_io.ini
@@ -23,10 +23,10 @@ read_tot_lim=50000
 write_tot_lim=50000
 
 [latency_nvme_ssd]
-read_avg_lim=300
-write_avg_lim=300
-read_tot_lim=500
-write_tot_lim=500
+read_avg_lim=10000
+write_avg_lim=10000
+read_tot_lim=50000
+write_tot_lim=50000
 
 [latency_sata_hdd]
 read_avg_lim=15000
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
index 3049db2..1bbb609 100644
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -74,10 +74,10 @@ class ConfigParser:
             "write_tot_lim": 50000
         },
         "latency_nvme_ssd": {
-            "read_avg_lim": 300,
-            "write_avg_lim": 300,
-            "read_tot_lim": 500,
-            "write_tot_lim": 500
+            "read_avg_lim": 10000,
+            "write_avg_lim": 10000,
+            "read_tot_lim": 50000,
+            "write_tot_lim": 50000
         },
         "latency_sata_hdd": {
             "read_avg_lim": 15000,
-- Gitee
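Both patches keep the shipped ini files and the in-code fallback tables in lockstep. A rough sketch of how such a section is typically resolved with configparser -- the helper and the path here are illustrative, not the plugins' actual parsing code:

import configparser

# Fallback table mirroring the shipped nvme defaults above (illustrative).
NVME_DEFAULTS = {"read_avg_lim": 10000, "write_avg_lim": 10000,
                 "read_tot_lim": 50000, "write_tot_lim": 50000}

def load_latency_limits(path, section="latency_nvme_ssd"):
    cp = configparser.ConfigParser()
    cp.read(path)                        # a missing file leaves the parser empty
    limits = {}
    for key, default in NVME_DEFAULTS.items():
        try:
            limits[key] = cp.getint(section, key)
        except (configparser.Error, ValueError):
            limits[key] = default        # absent or malformed -> shipped default
    return limits

print(load_latency_limits("/etc/sysSentry/plugins/ai_block_io.ini"))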
From 95d9f505e3cf5189e16e9374f70d0fc12534dcef Mon Sep 17 00:00:00 2001
From: Youzhi He
Date: Fri, 15 Nov 2024 17:34:35 +0800
Subject: [PATCH 73/76] fix absolute threshold not being used when ai threshold
 updates

---
 .../ai_block_io/sliding_window.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py
index ff3fa3b..a13033f 100644
--- a/src/python/sentryPlugins/ai_block_io/sliding_window.py
+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py
@@ -30,20 +30,21 @@ class SlidingWindow:
         self._io_data_queue = []
         self._io_data_queue_abnormal_tag = []
 
+    def is_abnormal(self, data):
+        if self._avg_lim is not None and data < self._avg_lim:
+            return False
+        if self._ai_threshold is not None and data > self._ai_threshold:
+            return True
+        if self._abs_threshold is not None and data > self._abs_threshold:
+            return True
+        return False
+
     def push(self, data: float):
         if len(self._io_data_queue) == self._queue_length:
             self._io_data_queue.pop(0)
             self._io_data_queue_abnormal_tag.pop(0)
         self._io_data_queue.append(data)
-        tag = False
-        if self._avg_lim is not None and data < self._avg_lim:
-            tag = False
-            self._io_data_queue_abnormal_tag.append(tag)
-            return tag
-        if self._ai_threshold is not None and data > self._ai_threshold:
-            tag = True
-        if self._abs_threshold is not None and data > self._abs_threshold:
-            tag = True
+        tag = self.is_abnormal(data)
         self._io_data_queue_abnormal_tag.append(tag)
         return tag
 
@@ -53,7 +54,7 @@ class SlidingWindow:
         self._ai_threshold = threshold
         self._io_data_queue_abnormal_tag.clear()
         for data in self._io_data_queue:
-            self._io_data_queue_abnormal_tag.append(data >= self._ai_threshold)
+            self._io_data_queue_abnormal_tag.append(self.is_abnormal(data))
 
     def is_slow_io_event(self, data):
         return False, None, None, None
-- Gitee
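The gist of the refactor above: update_threshold() used to re-tag queued data against the AI threshold alone, so a freshly trained AI threshold silently disabled the absolute limit and the avg_lim floor. Folding the checks into is_abnormal() keeps the precedence in one place (the explicit trailing return False avoids an implicit None). A standalone sketch of that precedence, with assumed parameter names:

def is_abnormal(data, avg_lim=None, ai_threshold=None, abs_threshold=None):
    # avg_lim is a floor: values below it are never flagged
    if avg_lim is not None and data < avg_lim:
        return False
    # both ceilings are always consulted, even right after an AI retrain
    if ai_threshold is not None and data > ai_threshold:
        return True
    if abs_threshold is not None and data > abs_threshold:
        return True
    return False

print(is_abnormal(120, avg_lim=50, ai_threshold=200, abs_threshold=100))  # True: over the absolute limit
print(is_abnormal(40, avg_lim=50, ai_threshold=200, abs_threshold=100))   # False: below the floor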
From 68ee5ee107120af9619f8229d5b6e107ac94a7ad Mon Sep 17 00:00:00 2001
From: zhuofeng
Date: Fri, 13 Dec 2024 14:21:32 +0800
Subject: [PATCH 74/76] add debug log and modify disk type for ebpf

---
 src/c/ebpf_collector/ebpf_collector.c    | 201 ++++++++++++++++++++---
 src/python/sentryCollector/collect_io.py |   2 +
 2 files changed, 179 insertions(+), 24 deletions(-)

diff --git a/src/c/ebpf_collector/ebpf_collector.c b/src/c/ebpf_collector/ebpf_collector.c
index af452c8..54a3a9e 100644
--- a/src/c/ebpf_collector/ebpf_collector.c
+++ b/src/c/ebpf_collector/ebpf_collector.c
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <stdarg.h>
 #include
 #include
 #include
@@ -36,11 +37,26 @@
 #define TAG_RES_2 (map_fd[13])
 
 #define BPF_FILE "/usr/lib/ebpf_collector.bpf.o"
 
+#define MAX_LINE_LENGTH 1024
+#define MAX_SECTION_NAME_LENGTH 256
+#define CONFIG_FILE "/etc/sysSentry/collector.conf"
+
 typedef struct {
     int major;
     int minor;
 } DeviceInfo;
 
+typedef enum {
+    LOG_LEVEL_NONE,
+    LOG_LEVEL_DEBUG,
+    LOG_LEVEL_INFO,
+    LOG_LEVEL_WARNING,
+    LOG_LEVEL_ERROR
+} LogLevel;
+
+LogLevel currentLogLevel = LOG_LEVEL_INFO;
+void logMessage(LogLevel level, const char *format, ...);
+
 static volatile bool exiting;
 
 const char argp_program_doc[] =
@@ -152,24 +168,28 @@ static int print_map_res(struct bpf_map *map_res, char *stage, int *map_size, in
     struct stage_data counter;
     int key = 0;
 
+    logMessage(LOG_LEVEL_DEBUG, "print_map_res map_size: %d\n", map_size);
     for (key = 0; key < map_size; key++) {
         int err = bpf_map_lookup_elem(map_res, &key, &counter);
         if (err < 0) {
-            fprintf(stderr, "failed to lookup %s map_res: %d\n", stage, err);
+            logMessage(LOG_LEVEL_ERROR, "failed to lookup %s map_res: %d\n", stage, err);
             return -1;
         }
 
         size_t length = strlen(counter.io_type);
         char io_type;
         if (length > 0) {
+            logMessage(LOG_LEVEL_DEBUG, "io_type has a value.\n");
            io_type = counter.io_type[0];
         } else {
+            logMessage(LOG_LEVEL_DEBUG, "io_type has no value.\n");
            io_type = NULL;
         }
 
         int major = counter.major;
         int first_minor = counter.first_minor;
         dev_t dev = makedev(major, first_minor);
         char *device_name = find_device_name(dev);
+        logMessage(LOG_LEVEL_DEBUG, "device_name: %s\n", device_name);
         if (device_name && io_type) {
             printf("%-7s %10llu %10llu %d %c %s\n",
                 stage,
@@ -195,7 +215,7 @@ int init_map(int *map_fd, const char *map_name, int *map_size, DeviceInfo *devic
         init_data.major = devices[i].major;
         init_data.first_minor = devices[i].minor;
         if (bpf_map_update_elem(map_fd, &i, &init_data, BPF_ANY) != 0) {
-            printf("Failed to initialize map %s at index %d\n", map_name, i);
+            logMessage(LOG_LEVEL_ERROR, "Failed to initialize map %s at index %d\n", map_name, i);
             return 1;
         }
     }
@@ -203,6 +223,101 @@ int init_map(int *map_fd, const char *map_name, int *map_size, DeviceInfo *devic
     return 0;
 }
 
+char *read_config_value(const char *file, const char *section, const char *key) {
+    FILE *fp = fopen(file, "r");
+    if (fp == NULL) {
+        logMessage(LOG_LEVEL_ERROR, "Failed to open config file.\n");
+        return NULL;
+    }
+
+    char line[MAX_LINE_LENGTH];
+    char current_section[MAX_SECTION_NAME_LENGTH] = {0};
+    char *value = NULL;
+
+    while (fgets(line, sizeof(line), fp) != NULL) {
+        line[strcspn(line, "\n")] = 0;
+
+        if (line[0] == '\0' || line[0] == ';' || line[0] == '#') {
+            continue;
+        }
+
+        if (line[0] == '[') {
+            sscanf(line, "[%255[^]]", current_section);
+            continue;
+        }
+
+        if (strcmp(current_section, section) == 0) {
+            char *delimiter = "=";
+            char *token = strtok(line, delimiter);
+            if (token != NULL) {
+                if (strcmp(token, key) == 0) {
+                    token = strtok(NULL, delimiter);
+                    if (token != NULL) {
+                        value = strdup(token);
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    fclose(fp);
+    return value;
+}
+
+void setLogLevel(const char *levelStr) {
+    if (strcmp(levelStr, "info") == 0) {
+        currentLogLevel = LOG_LEVEL_INFO;
+    }
+    else if (strcmp(levelStr, "warning") == 0) {
+        currentLogLevel = LOG_LEVEL_WARNING;
+    }
+    else if (strcmp(levelStr, "error") == 0) {
+        currentLogLevel = LOG_LEVEL_ERROR;
+    }
+    else if (strcmp(levelStr, "debug") == 0) {
+        currentLogLevel = LOG_LEVEL_DEBUG;
+    }
+    else if (strcmp(levelStr, "none") == 0) {
+        currentLogLevel = LOG_LEVEL_NONE;
+    }
+    else {
+        logMessage(LOG_LEVEL_ERROR, "unknown log level: %s\n", levelStr);
+    }
+}
+
+void logMessage(LogLevel level, const char *format, ...) {
+    if (level >= currentLogLevel) {
+        va_list args;
+        va_start(args, format);
+        vprintf(format, args);
+        va_end(args);
+    }
+}
+
+int check_for_device(const char *device_name) {
+    char path[256];
+    snprintf(path, sizeof(path), "/sys/block/%s", device_name);
+
+    DIR *dir = opendir(path);
+    if (dir == NULL) {
+        return 0;
+    }
+
+    struct dirent *entry;
+    while ((entry = readdir(dir)) != NULL) {
+        struct stat statbuf;
+        if (stat(path, &statbuf) == 0) {
+            if (S_ISDIR(statbuf.st_mode)) {
+                closedir(dir);
+                return 1;
+            }
+        }
+    }
+    closedir(dir);
+    return 0;
+}
+
 int main(int argc, char **argv) {
     struct partitions *partitions = NULL;
     const struct partition *partition;
@@ -222,13 +337,29 @@ int main(int argc, char **argv) {
     struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
     setrlimit(RLIMIT_MEMLOCK, &r);
 
+    char *level = read_config_value(CONFIG_FILE, "log", "level");
+    if (level != NULL) {
+        if (level[strlen(level) - 1] == '\r') {
+            size_t len = strlen(level);
+            level[len - 1] = '\0';
+        }
+        setLogLevel(level);
+        free(level);
+    } else {
+        logMessage(LOG_LEVEL_INFO, "the log level is incorrectly configured. the default log level is info.\n");
+    }
+    logMessage(LOG_LEVEL_DEBUG, "finish config parse.\n");
+
     err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
-    if (err)
+    if (err) {
+        logMessage(LOG_LEVEL_ERROR, "argp parse failed.\n");
         return err;
+    }
 
     snprintf(filename, sizeof(filename), BPF_FILE);
 
-    if (load_bpf_file(filename)) {
+    if (load_bpf_file(filename)) {
+        logMessage(LOG_LEVEL_ERROR, "load_bpf_file failed.\n");
         return 1;
     }
 
@@ -236,39 +367,52 @@ int main(int argc, char **argv) {
 
     dir = opendir("/dev");
     if (dir == NULL) {
-        printf("Failed to open /dev directory");
+        logMessage(LOG_LEVEL_ERROR, "Failed to open /dev directory.\n");
         return EXIT_FAILURE;
     }
 
     while ((entry = readdir(dir)) != NULL) {
-        if (entry->d_type == DT_BLK) {
-            snprintf(path, sizeof(path), "/dev/%s", entry->d_name);
-            struct stat statbuf;
-            if (lstat(path, &statbuf) == 0) {
-                if (S_ISBLK(statbuf.st_mode)) {
-                    devices[device_count].major = major(statbuf.st_rdev);
-                    devices[device_count].minor = minor(statbuf.st_rdev);
-                    device_count++;
-                    if (device_count >= MAP_SIZE) {
-                        break;
-                    }
-                }
-            }
+        if (entry->d_type != DT_BLK) {
+            continue;
+        }
+        snprintf(path, sizeof(path), "/dev/%s", entry->d_name);
+        struct stat statbuf;
+        if (lstat(path, &statbuf) != 0 || !S_ISBLK(statbuf.st_mode)) {
+            continue;
+        }
+        if (!strncmp(entry->d_name, "dm-", 3) || !strncmp(entry->d_name, "loop", 4) ||
+            !strncmp(entry->d_name, "md", 2)) {
+            continue;
+        }
+        if (!check_for_device(entry->d_name)) {
+            continue;
+        }
+
+        devices[device_count].major = major(statbuf.st_rdev);
+        devices[device_count].minor = minor(statbuf.st_rdev);
+        device_count++;
+        if (device_count >= MAP_SIZE) {
+            logMessage(LOG_LEVEL_DEBUG, "device_count more than MAP_SIZE.\n");
+            break;
         }
     }
     closedir(dir);
 
     if (init_map(BLK_RES, "blk_res_map", device_count, devices) != 0) {
+        logMessage(LOG_LEVEL_ERROR, "blk_res_map failed.\n");
         return 1;
     }
     if (init_map(BIO_RES, "blo_res_map", device_count, devices) != 0) {
+        logMessage(LOG_LEVEL_ERROR, "blo_res_map failed.\n");
         return 1;
     }
     if (init_map(WBT_RES, "wbt_res_map", device_count, devices) != 0) {
+        logMessage(LOG_LEVEL_ERROR, "wbt_res_map failed.\n");
         return 1;
     }
     if (init_map(TAG_RES, "tag_res_map", device_count, devices) != 0) {
+        logMessage(LOG_LEVEL_ERROR, "tag_res_map failed.\n");
         return 1;
     }
@@ -279,31 +423,40 @@ int main(int argc, char **argv) {
         int io_dump_blk[MAP_SIZE] = {0};
         update_io_dump(BLK_RES_2, io_dump_blk, device_count,"rq_driver");
         err = print_map_res(BLK_RES, "rq_driver", device_count, io_dump_blk);
-        if (err)
+        if (err) {
+            logMessage(LOG_LEVEL_ERROR, "print_map_res rq_driver error.\n");
             break;
+        }
 
         int io_dump_bio[MAP_SIZE] = {0};
         update_io_dump(BIO_RES_2, io_dump_bio, device_count,"bio");
         err = print_map_res(BIO_RES, "bio", device_count, io_dump_bio);
-        if (err)
+        if (err) {
+            logMessage(LOG_LEVEL_ERROR, "print_map_res bio error.\n");
             break;
+        }
 
         int io_dump_tag[MAP_SIZE] = {0};
         update_io_dump(TAG_RES_2, io_dump_tag, device_count,"gettag");
         err = print_map_res(TAG_RES, "gettag", device_count, io_dump_tag);
-        if (err)
+        if (err) {
+            logMessage(LOG_LEVEL_ERROR, "print_map_res gettag error.\n");
             break;
+        }
 
         int io_dump_wbt[MAP_SIZE] = {0};
         update_io_dump(WBT_RES_2, io_dump_wbt, device_count,"wbt");
         err = print_map_res(WBT_RES, "wbt", device_count, io_dump_wbt);
-        if (err)
+        if (err) {
+            logMessage(LOG_LEVEL_ERROR, "print_map_res wbt error.\n");
             break;
+        }
 
-        if (exiting)
+        if (exiting) {
+            logMessage(LOG_LEVEL_DEBUG, "exit program.\n");
             break;
+        }
     }
 
     return -err;
 }
-
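For readers skimming the C above, a rough Python equivalent of the read_config_value() helper -- illustrative only, not shipped by the patch. Like the C version, the first key= match inside [section] wins, and ';'/'#' lines are comments:

def read_config_value(path, section, key):
    # First `key=value` hit inside `[section]` wins; comment lines are skipped.
    current_section = None
    try:
        with open(path) as fp:
            for raw in fp:
                line = raw.strip()
                if not line or line[0] in ";#":
                    continue
                if line.startswith("["):
                    current_section = line[1:line.find("]")]
                    continue
                if current_section == section and "=" in line:
                    k, _, v = line.partition("=")
                    if k == key:
                        return v
    except OSError:
        return None   # mirrors the C helper returning NULL on fopen failure
    return None

print(read_config_value("/etc/sysSentry/collector.conf", "log", "level"))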
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
index 8a82eab..a7e86cb 100644
--- a/src/python/sentryCollector/collect_io.py
+++ b/src/python/sentryCollector/collect_io.py
@@ -269,6 +269,8 @@ class CollectIo():
             if len(data_list) != 6:
                 continue
             stage, finish_count, latency, io_dump, io_type ,disk_name = data_list
+            if stage not in EBPF_STAGE_LIST:
+                continue
             if disk_name not in self.window_value:
                 continue
             io_type = self.get_ebpf_io_type(io_type)
-- Gitee
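The new guard above drops ebpf records whose stage the collector does not track. A standalone sketch of the same line validation -- the stage names are inferred from what ebpf_collector.c prints, and the helper itself is illustrative, not the plugin's actual code:

# Stage names inferred from the stages ebpf_collector.c prints above.
EBPF_STAGE_LIST = ("rq_driver", "bio", "gettag", "wbt")

def parse_ebpf_line(line):
    data_list = line.split()
    if len(data_list) != 6:
        return None                      # malformed record
    stage, finish_count, latency, io_dump, io_type, disk_name = data_list
    if stage not in EBPF_STAGE_LIST:
        return None                      # unknown stage: the new guard's case
    return stage, int(finish_count), int(latency), int(io_dump), io_type, disk_name

print(parse_ebpf_line("bio 12 3456 0 R sda"))        # parsed tuple
print(parse_ebpf_line("mystery 1 2 3 R sda"))        # None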
From e066c611fc3b6e39c75d674293c5561bf2948769 Mon Sep 17 00:00:00 2001
From: shixuantong
Date: Mon, 16 Dec 2024 20:49:47 +0800
Subject: [PATCH 75/76] remove cpu_sentry

---
 src/python/setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/python/setup.py b/src/python/setup.py
index 9e26a10..d332e36 100644
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -29,7 +29,6 @@ setup(
     ],
     entry_points={
         'console_scripts': [
-            'cpu_sentry=syssentry.cpu_sentry:main',
             'syssentry=syssentry.syssentry:main',
             'xalarmd=xalarm.xalarm_daemon:alarm_process_create',
             'sentryCollector=sentryCollector.collectd:main',
-- Gitee

From 89de01ac558cdc907f8cad07a34bc8bfa6d04c90 Mon Sep 17 00:00:00 2001
From: jinsaihang
Date: Tue, 17 Dec 2024 11:36:11 +0800
Subject: [PATCH 76/76] unify avg_block_io log and ai_block_io log

Signed-off-by: jinsaihang
---
 src/python/sentryPlugins/ai_block_io/ai_block_io.py   | 5 +++++
 src/python/sentryPlugins/ai_block_io/detector.py      | 8 +++-----
 src/python/sentryPlugins/avg_block_io/stage_window.py | 2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index 14f740d..8075f5f 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -208,6 +208,11 @@ class SlowIODetection:
                 tmp_alarm_content = alarm_content.copy()
                 del tmp_alarm_content["details"]
                 logging.warning("[SLOW IO] " + str(tmp_alarm_content))
+                logging.warning(f'[SLOW IO] disk: {str(tmp_alarm_content.get("driver_name"))}, '
+                                f'stage: {str(tmp_alarm_content.get("driver_name"))}, '
+                                f'iotype: {str(tmp_alarm_content.get("io_type"))}, '
+                                f'type: {str(tmp_alarm_content.get("alarm_type"))}, '
+                                f'reason: {str(tmp_alarm_content.get("reason"))}')
                 logging.warning(f"latency: " + str(alarm_content.get("details").get("latency")))
                 logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump")))
 
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
index 496e032..27fb7f7 100644
--- a/src/python/sentryPlugins/ai_block_io/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -58,11 +58,9 @@ class Detector:
             logging.info(f'[abnormal_period]: disk: {self._metric_name.disk_name}, '
                          f'stage: {self._metric_name.stage_name}, '
                          f'iotype: {self._metric_name.io_access_type_name}, '
-                         f'metric: {self._metric_name.metric_name}, '
-                         f'current value: {metric_value}, '
-                         f'ai threshold: {detection_result[2]}, '
-                         f'absolute threshold upper limit: {detection_result[3]}, '
-                         f'lower limit: {detection_result[4]}')
+                         f'type: {self._metric_name.metric_name}, '
+                         f'ai_threshold: {round(detection_result[2], 3)}, '
+                         f'curr_val: {metric_value}')
         else:
             logging.debug(f'Detection result: {str(detection_result)}')
         logging.debug(f'exit Detector: {self}')
diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py
index 5113782..587bd49 100644
--- a/src/python/sentryPlugins/avg_block_io/stage_window.py
+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py
@@ -28,7 +28,7 @@ class AbnormalWindowBase:
             self.abnormal_window.append(False)
 
     def is_abnormal_window(self):
-        return sum(self.abnormal_window) > self.window_threshold
+        return sum(self.abnormal_window) >= self.window_threshold
 
     def window_data_to_string(self):
         return ",".join(str(x) for x in self.window_data)
-- Gitee
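The final '>' to '>=' change removes an off-by-one in the window vote: with win_size=30 and win_threshold=6, the defaults shipped in avg_block_io.ini earlier in this series, a window holding exactly six abnormal slots previously failed to trigger. A quick check:

window_threshold = 6                            # win_threshold from the shipped config
abnormal_window = [True] * 6 + [False] * 24     # win_size=30, exactly 6 hits

print(sum(abnormal_window) > window_threshold)    # False: the old check missed this case
print(sum(abnormal_window) >= window_threshold)   # True: the threshold now triggers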