diff --git a/fix-ai_block_io-some-issues.patch b/fix-ai_block_io-some-issues.patch new file mode 100644 index 0000000000000000000000000000000000000000..d80cbe8266f3a78d0038b8cc689a4098f7412d9e --- /dev/null +++ b/fix-ai_block_io-some-issues.patch @@ -0,0 +1,832 @@ +From 35ba8fe8e241c5e3508c5dadc82a777065a5cc4d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> +Date: Mon, 30 Sep 2024 00:15:29 +0800 +Subject: [PATCH] fix ai_block_io some issues + +--- + ..._slow_io_detection.ini => ai_block_io.ini} | 6 +- + config/tasks/ai_block_io.mod | 5 + + .../tasks/ai_threshold_slow_io_detection.mod | 5 - + ...ow_io_detection.py => test_ai_block_io.py} | 0 + .../README.md | 0 + .../__init__.py | 0 + .../ai_block_io.py} | 57 ++-- + .../alarm_report.py | 2 +- + .../ai_block_io/config_parser.py | 256 ++++++++++++++++++ + .../data_access.py | 3 + + .../detector.py | 17 +- + .../io_data.py | 0 + .../sliding_window.py | 0 + .../threshold.py | 13 +- + .../utils.py | 15 +- + .../config_parser.py | 141 ---------- + src/python/setup.py | 2 +- + 17 files changed, 336 insertions(+), 186 deletions(-) + rename config/plugins/{ai_threshold_slow_io_detection.ini => ai_block_io.ini} (66%) + create mode 100644 config/tasks/ai_block_io.mod + delete mode 100644 config/tasks/ai_threshold_slow_io_detection.mod + rename selftest/test/{test_ai_threshold_slow_io_detection.py => test_ai_block_io.py} (100%) + rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/README.md (100%) + rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/__init__.py (100%) + rename src/python/sentryPlugins/{ai_threshold_slow_io_detection/slow_io_detection.py => ai_block_io/ai_block_io.py} (66%) + rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/alarm_report.py (98%) + create mode 100644 src/python/sentryPlugins/ai_block_io/config_parser.py + rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/data_access.py (99%) + rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/detector.py (77%) + rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/io_data.py (100%) + rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/sliding_window.py (100%) + rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/threshold.py (92%) + rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/utils.py (86%) + delete mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py + +diff --git a/config/plugins/ai_threshold_slow_io_detection.ini b/config/plugins/ai_block_io.ini +similarity index 66% +rename from config/plugins/ai_threshold_slow_io_detection.ini +rename to config/plugins/ai_block_io.ini +index 44eb928..01ce266 100644 +--- a/config/plugins/ai_threshold_slow_io_detection.ini ++++ b/config/plugins/ai_block_io.ini +@@ -4,9 +4,9 @@ slow_io_detect_frequency=1 + log_level=info + + [algorithm] +-train_data_duration=0.1 +-train_update_duration=0.02 +-algorithm_type=n_sigma ++train_data_duration=24 ++train_update_duration=2 ++algorithm_type=boxplot + boxplot_parameter=1.5 + n_sigma_parameter=3 + +diff --git a/config/tasks/ai_block_io.mod b/config/tasks/ai_block_io.mod +new file mode 100644 +index 0000000..1971d7d +--- /dev/null ++++ b/config/tasks/ai_block_io.mod +@@ -0,0 +1,5 @@ ++[common] ++enabled=yes ++task_start=/usr/bin/python3 /usr/bin/ai_block_io ++task_stop=pkill -f /usr/bin/ai_block_io ++type=oneshot +\ No newline at end of file +diff --git a/config/tasks/ai_threshold_slow_io_detection.mod b/config/tasks/ai_threshold_slow_io_detection.mod +deleted file mode 100644 +index 2729f72..0000000 +--- a/config/tasks/ai_threshold_slow_io_detection.mod ++++ /dev/null +@@ -1,5 +0,0 @@ +-[common] +-enabled=yes +-task_start=/usr/bin/python3 /usr/bin/ai_threshold_slow_io_detection +-task_stop=pkill -f /usr/bin/ai_threshold_slow_io_detection +-type=oneshot +\ No newline at end of file +diff --git a/selftest/test/test_ai_threshold_slow_io_detection.py b/selftest/test/test_ai_block_io.py +similarity index 100% +rename from selftest/test/test_ai_threshold_slow_io_detection.py +rename to selftest/test/test_ai_block_io.py +diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md b/src/python/sentryPlugins/ai_block_io/README.md +similarity index 100% +rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md +rename to src/python/sentryPlugins/ai_block_io/README.md +diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py b/src/python/sentryPlugins/ai_block_io/__init__.py +similarity index 100% +rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py +rename to src/python/sentryPlugins/ai_block_io/__init__.py +diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py +similarity index 66% +rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py +rename to src/python/sentryPlugins/ai_block_io/ai_block_io.py +index 43cf770..31b8a97 100644 +--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py ++++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py +@@ -23,7 +23,7 @@ from .data_access import get_io_data_from_collect_plug, check_collect_valid + from .io_data import MetricName + from .alarm_report import AlarmReport + +-CONFIG_FILE = "/etc/sysSentry/plugins/ai_threshold_slow_io_detection.ini" ++CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini" + + + def sig_handler(signum, frame): +@@ -40,34 +40,48 @@ class SlowIODetection: + + def __init__(self, config_parser: ConfigParser): + self._config_parser = config_parser +- self.__set_log_format() + self.__init_detector_name_list() + self.__init_detector() + +- def __set_log_format(self): +- log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" +- log_level = get_log_level(self._config_parser.get_log_level()) +- logging.basicConfig(level=log_level, format=log_format) +- + def __init_detector_name_list(self): + self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency()) +- for disk in self._disk_list: +- self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) +- self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) ++ disks_to_detection: list = self._config_parser.get_disks_to_detection() ++ # 情况1:None,则启用所有磁盘检测 ++ # 情况2:is not None and len = 0,则不启动任何磁盘检测 ++ # 情况3:len != 0,则取交集 ++ if disks_to_detection is None: ++ for disk in self._disk_list: ++ self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) ++ self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) ++ elif len(disks_to_detection) == 0: ++ logging.warning('please attention: conf file not specify any disk to detection, ' ++ 'so it will not start ai block io.') ++ else: ++ disks_name_to_detection = [] ++ for disk_name_to_detection in disks_to_detection: ++ disks_name_to_detection.append(disk_name_to_detection.get_disk_name()) ++ disk_intersection = [disk for disk in self._disk_list if disk in disks_name_to_detection] ++ for disk in disk_intersection: ++ self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) ++ self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) ++ logging.info(f'start to detection follow disk and it\'s metric: {self._detector_name_list}') + + def __init_detector(self): + train_data_duration, train_update_duration = (self._config_parser. + get_train_data_duration_and_train_update_duration()) + slow_io_detection_frequency = self._config_parser.get_slow_io_detect_frequency() +- threshold_type = get_threshold_type_enum(self._config_parser.get_algorithm_type()) ++ threshold_type = self._config_parser.get_algorithm_type() + data_queue_size, update_size = get_data_queue_size_and_update_size(train_data_duration, + train_update_duration, + slow_io_detection_frequency) +- sliding_window_type = get_sliding_window_type_enum(self._config_parser.get_sliding_window_type()) ++ sliding_window_type = self._config_parser.get_sliding_window_type() + window_size, window_threshold = self._config_parser.get_window_size_and_window_minimum_threshold() + + for detector_name in self._detector_name_list: +- threshold = ThresholdFactory().get_threshold(threshold_type, data_queue_size=data_queue_size, ++ threshold = ThresholdFactory().get_threshold(threshold_type, ++ boxplot_parameter=self._config_parser.get_boxplot_parameter(), ++ n_sigma_paramter=self._config_parser.get_n_sigma_parameter(), ++ data_queue_size=data_queue_size, + data_queue_update_size=update_size) + sliding_window = SlidingWindowFactory().get_sliding_window(sliding_window_type, queue_length=window_size, + threshold=window_threshold) +@@ -89,6 +103,7 @@ class SlowIODetection: + logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}') + if io_data_dict_with_disk_name is None: + continue ++ + # Step2:慢IO检测 + logging.debug('step2. Start to detection slow io event.') + slow_io_event_list = [] +@@ -103,13 +118,14 @@ class SlowIODetection: + for slow_io_event in slow_io_event_list: + metric_name: MetricName = slow_io_event[0] + result = slow_io_event[1] +- AlarmReport.report_major_alm(f"disk {metric_name.get_disk_name()} has slow io event." +- f"stage: {metric_name.get_metric_name()}," +- f"type: {metric_name.get_io_access_type_name()}," +- f"metric: {metric_name.get_metric_name()}," +- f"current window: {result[1]}," +- f"threshold: {result[2]}") +- logging.error(f"slow io event happen: {str(slow_io_event)}") ++ alarm_content = (f"disk {metric_name.get_disk_name()} has slow io event. " ++ f"stage is: {metric_name.get_stage_name()}, " ++ f"io access type is: {metric_name.get_io_access_type_name()}, " ++ f"metric is: {metric_name.get_metric_name()}, " ++ f"current window is: {result[1]}, " ++ f"threshold is: {result[2]}") ++ AlarmReport.report_major_alm(alarm_content) ++ logging.warning(alarm_content) + + # Step4:等待检测时间 + logging.debug('step4. Wait to start next slow io event detection loop.') +@@ -120,6 +136,7 @@ def main(): + # Step1:注册消息处理函数 + signal.signal(signal.SIGINT, sig_handler) + signal.signal(signal.SIGTERM, sig_handler) ++ + # Step2:断点恢复 + # todo: + +diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py +similarity index 98% +rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py +rename to src/python/sentryPlugins/ai_block_io/alarm_report.py +index 3f4f34e..230c8cd 100644 +--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py ++++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py +@@ -15,7 +15,7 @@ import json + + + class AlarmReport: +- TASK_NAME = "SLOW_IO_DETECTION" ++ TASK_NAME = "ai_block_io" + + @staticmethod + def report_pass(info: str): +diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py +new file mode 100644 +index 0000000..632391d +--- /dev/null ++++ b/src/python/sentryPlugins/ai_block_io/config_parser.py +@@ -0,0 +1,256 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. ++ ++import configparser ++import json ++import logging ++ ++from .io_data import MetricName ++from .threshold import ThresholdType ++from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level ++ ++LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" ++ ++ ++def init_log_format(log_level: str): ++ logging.basicConfig(level=get_log_level(log_level), format=LOG_FORMAT) ++ ++ ++class ConfigParser: ++ DEFAULT_ABSOLUTE_THRESHOLD = 40 ++ DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1 ++ DEFAULT_LOG_LEVEL = 'info' ++ ++ DEFAULT_ALGORITHM_TYPE = 'boxplot' ++ DEFAULT_TRAIN_DATA_DURATION = 24 ++ DEFAULT_TRAIN_UPDATE_DURATION = 2 ++ DEFAULT_BOXPLOT_PARAMETER = 1.5 ++ DEFAULT_N_SIGMA_PARAMETER = 3 ++ ++ DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous' ++ DEFAULT_WINDOW_SIZE = 30 ++ DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6 ++ ++ def __init__(self, config_file_name): ++ self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD ++ self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY ++ self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL ++ self.__disks_to_detection: list = [] ++ ++ self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE ++ self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION ++ self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION ++ self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER ++ self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER ++ ++ self.__sliding_window_type = ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE ++ self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE ++ self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD ++ ++ self.__config_file_name = config_file_name ++ ++ def __read_absolute_threshold(self, items_common: dict): ++ try: ++ self.__absolute_threshold = float(items_common.get('absolute_threshold', ++ ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD)) ++ if self.__absolute_threshold <= 0: ++ logging.warning( ++ f'the_absolute_threshold: {self.__absolute_threshold} you set is invalid, use default value: {ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD}.') ++ self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD ++ except ValueError: ++ self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD ++ logging.warning( ++ f'the_absolute_threshold type conversion has error, use default value: {self.__absolute_threshold}.') ++ ++ def __read__slow_io_detect_frequency(self, items_common: dict): ++ try: ++ self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency', ++ ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY)) ++ if self.__slow_io_detect_frequency < 1 or self.__slow_io_detect_frequency > 10: ++ logging.warning( ++ f'the slow_io_detect_frequency: {self.__slow_io_detect_frequency} you set is invalid, use default value: {ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY}.') ++ self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY ++ except ValueError: ++ self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY ++ logging.warning(f'slow_io_detect_frequency type conversion has error, use default value: {self.__slow_io_detect_frequency}.') ++ ++ def __read__disks_to_detect(self, items_common: dict): ++ disks_to_detection = items_common.get('disks_to_detect') ++ if disks_to_detection is None: ++ logging.warning(f'config of disks_to_detect not found, the default value be used.') ++ self.__disks_to_detection = None ++ return ++ try: ++ disks_to_detection_list = json.loads(disks_to_detection) ++ for disk_to_detection in disks_to_detection_list: ++ disk_name = disk_to_detection.get('disk_name', None) ++ stage_name = disk_to_detection.get('stage_name', None) ++ io_access_type_name = disk_to_detection.get('io_access_type_name', None) ++ metric_name = disk_to_detection.get('metric_name', None) ++ if not (disk_name is None or stage_name is None or io_access_type_name is None or metric_name is None): ++ metric_name_object = MetricName(disk_name, stage_name, io_access_type_name, metric_name) ++ self.__disks_to_detection.append(metric_name_object) ++ else: ++ logging.warning(f'config of disks_to_detect\'s some part has some error: {disk_to_detection}, it will be ignored.') ++ except json.decoder.JSONDecodeError as e: ++ logging.warning(f'config of disks_to_detect is error: {e}, it will be ignored and default value be used.') ++ self.__disks_to_detection = None ++ ++ def __read__train_data_duration(self, items_algorithm: dict): ++ try: ++ self.__train_data_duration = float(items_algorithm.get('train_data_duration', ++ ConfigParser.DEFAULT_TRAIN_DATA_DURATION)) ++ if self.__train_data_duration <= 0 or self.__train_data_duration > 720: ++ logging.warning( ++ f'the train_data_duration: {self.__train_data_duration} you set is invalid, use default value: {ConfigParser.DEFAULT_TRAIN_DATA_DURATION}.') ++ self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION ++ except ValueError: ++ self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION ++ logging.warning(f'the train_data_duration type conversion has error, use default value: {self.__train_data_duration}.') ++ ++ def __read__train_update_duration(self, items_algorithm: dict): ++ default_train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION ++ if default_train_update_duration > self.__train_data_duration: ++ default_train_update_duration = self.__train_data_duration / 2 ++ ++ try: ++ self.__train_update_duration = float(items_algorithm.get('train_update_duration', ++ ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION)) ++ if self.__train_update_duration <= 0 or self.__train_update_duration > self.__train_data_duration: ++ logging.warning( ++ f'the train_update_duration: {self.__train_update_duration} you set is invalid, use default value: {default_train_update_duration}.') ++ self.__train_update_duration = default_train_update_duration ++ except ValueError: ++ self.__train_update_duration = default_train_update_duration ++ logging.warning(f'the train_update_duration type conversion has error, use default value: {self.__train_update_duration}.') ++ ++ def __read__algorithm_type_and_parameter(self, items_algorithm: dict): ++ algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE) ++ self.__algorithm_type = get_threshold_type_enum(algorithm_type) ++ ++ if self.__algorithm_type == ThresholdType.NSigmaThreshold: ++ try: ++ self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter', ++ ConfigParser.DEFAULT_N_SIGMA_PARAMETER)) ++ if self.__n_sigma_parameter <= 0 or self.__n_sigma_parameter > 10: ++ logging.warning( ++ f'the n_sigma_parameter: {self.__n_sigma_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_N_SIGMA_PARAMETER}.') ++ self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER ++ except ValueError: ++ self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER ++ logging.warning(f'the n_sigma_parameter type conversion has error, use default value: {self.__n_sigma_parameter}.') ++ elif self.__algorithm_type == ThresholdType.BoxplotThreshold: ++ try: ++ self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter', ++ ConfigParser.DEFAULT_BOXPLOT_PARAMETER)) ++ if self.__boxplot_parameter <= 0 or self.__boxplot_parameter > 10: ++ logging.warning( ++ f'the boxplot_parameter: {self.__boxplot_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_BOXPLOT_PARAMETER}.') ++ self.__n_sigma_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER ++ except ValueError: ++ self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER ++ logging.warning(f'the boxplot_parameter type conversion has error, use default value: {self.__boxplot_parameter}.') ++ ++ def __read__window_size(self, items_sliding_window: dict): ++ try: ++ self.__window_size = int(items_sliding_window.get('window_size', ++ ConfigParser.DEFAULT_WINDOW_SIZE)) ++ if self.__window_size < 1 or self.__window_size > 3600: ++ logging.warning( ++ f'the window_size: {self.__window_size} you set is invalid, use default value: {ConfigParser.DEFAULT_WINDOW_SIZE}.') ++ self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE ++ except ValueError: ++ self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE ++ logging.warning(f'window_size type conversion has error, use default value: {self.__window_size}.') ++ ++ def __read__window_minimum_threshold(self, items_sliding_window: dict): ++ default_window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD ++ if default_window_minimum_threshold > self.__window_size: ++ default_window_minimum_threshold = self.__window_size / 2 ++ try: ++ self.__window_minimum_threshold = ( ++ int(items_sliding_window.get('window_minimum_threshold', ++ ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD))) ++ if self.__window_minimum_threshold < 1 or self.__window_minimum_threshold > self.__window_size: ++ logging.warning( ++ f'the window_minimum_threshold: {self.__window_minimum_threshold} you set is invalid, use default value: {default_window_minimum_threshold}.') ++ self.__window_minimum_threshold = default_window_minimum_threshold ++ except ValueError: ++ self.__window_minimum_threshold = default_window_minimum_threshold ++ logging.warning(f'window_minimum_threshold type conversion has error, use default value: {self.__window_minimum_threshold}.') ++ ++ def read_config_from_file(self): ++ con = configparser.ConfigParser() ++ con.read(self.__config_file_name, encoding='utf-8') ++ ++ if con.has_section('common'): ++ items_common = dict(con.items('common')) ++ self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL) ++ init_log_format(self.__log_level) ++ self.__read_absolute_threshold(items_common) ++ self.__read__slow_io_detect_frequency(items_common) ++ self.__read__disks_to_detect(items_common) ++ else: ++ init_log_format(self.__log_level) ++ logging.warning("common section parameter not found, it will be set to default value.") ++ ++ if con.has_section('algorithm'): ++ items_algorithm = dict(con.items('algorithm')) ++ self.__read__train_data_duration(items_algorithm) ++ self.__read__train_update_duration(items_algorithm) ++ self.__read__algorithm_type_and_parameter(items_algorithm) ++ else: ++ logging.warning("algorithm section parameter not found, it will be set to default value.") ++ ++ if con.has_section('sliding_window'): ++ items_sliding_window = dict(con.items('sliding_window')) ++ sliding_window_type = items_sliding_window.get('sliding_window_type', ++ ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE) ++ self.__sliding_window_type = get_sliding_window_type_enum(sliding_window_type) ++ self.__read__window_size(items_sliding_window) ++ self.__read__window_minimum_threshold(items_sliding_window) ++ else: ++ logging.warning("sliding_window section parameter not found, it will be set to default value.") ++ ++ self.__print_all_config_value() ++ ++ def __print_all_config_value(self): ++ pass ++ ++ def get_slow_io_detect_frequency(self): ++ return self.__slow_io_detect_frequency ++ ++ def get_algorithm_type(self): ++ return self.__algorithm_type ++ ++ def get_sliding_window_type(self): ++ return self.__sliding_window_type ++ ++ def get_train_data_duration_and_train_update_duration(self): ++ return self.__train_data_duration, self.__train_update_duration ++ ++ def get_window_size_and_window_minimum_threshold(self): ++ return self.__window_size, self.__window_minimum_threshold ++ ++ def get_absolute_threshold(self): ++ return self.__absolute_threshold ++ ++ def get_log_level(self): ++ return self.__log_level ++ ++ def get_disks_to_detection(self): ++ return self.__disks_to_detection ++ ++ def get_boxplot_parameter(self): ++ return self.__boxplot_parameter ++ ++ def get_n_sigma_parameter(self): ++ return self.__n_sigma_parameter +diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py +similarity index 99% +rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py +rename to src/python/sentryPlugins/ai_block_io/data_access.py +index d9f3460..01c5315 100644 +--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py ++++ b/src/python/sentryPlugins/ai_block_io/data_access.py +@@ -17,6 +17,8 @@ from sentryCollector.collect_plugin import ( + get_io_data, + is_iocollect_valid, + ) ++ ++ + from .io_data import IOStageData, IOData + + COLLECT_STAGES = [ +@@ -32,6 +34,7 @@ COLLECT_STAGES = [ + "iocost", + ] + ++ + def check_collect_valid(period): + data_raw = is_iocollect_valid(period) + if data_raw["ret"] == 0: +diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py +similarity index 77% +rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py +rename to src/python/sentryPlugins/ai_block_io/detector.py +index eda9825..bcf62cb 100644 +--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py ++++ b/src/python/sentryPlugins/ai_block_io/detector.py +@@ -26,19 +26,26 @@ class Detector: + self._threshold = threshold + self._slidingWindow = sliding_window + self._threshold.attach_observer(self._slidingWindow) ++ self._count = 0 + + def get_metric_name(self): + return self._metric_name + + def is_slow_io_event(self, io_data_dict_with_disk_name: dict): +- logging.debug(f'Enter Detector: {self}') ++ self._count += 1 ++ if self._count % 15 == 0: ++ self._count = 0 ++ logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") ++ logging.debug(f'enter Detector: {self}') + metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) +- if metric_value > 1e-6: +- logging.debug(f'Input metric value: {str(metric_value)}') +- self._threshold.push_latest_data_to_queue(metric_value) ++ if metric_value is None: ++ logging.debug('not found metric value, so return None.') ++ return False, None, None ++ logging.debug(f'input metric value: {str(metric_value)}') ++ self._threshold.push_latest_data_to_queue(metric_value) + detection_result = self._slidingWindow.is_slow_io_event(metric_value) + logging.debug(f'Detection result: {str(detection_result)}') +- logging.debug(f'Exit Detector: {self}') ++ logging.debug(f'exit Detector: {self}') + return detection_result + + def __repr__(self): +diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py b/src/python/sentryPlugins/ai_block_io/io_data.py +similarity index 100% +rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py +rename to src/python/sentryPlugins/ai_block_io/io_data.py +diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py +similarity index 100% +rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py +rename to src/python/sentryPlugins/ai_block_io/sliding_window.py +diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py +similarity index 92% +rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py +rename to src/python/sentryPlugins/ai_block_io/threshold.py +index 9e1ca7b..ff85d85 100644 +--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py ++++ b/src/python/sentryPlugins/ai_block_io/threshold.py +@@ -79,9 +79,9 @@ class AbsoluteThreshold(Threshold): + + + class BoxplotThreshold(Threshold): +- def __init__(self, parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000): ++ def __init__(self, boxplot_parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs): + super().__init__(data_queue_size, data_queue_update_size) +- self.parameter = parameter ++ self.parameter = boxplot_parameter + + def _update_threshold(self): + data = list(self.data_queue.queue) +@@ -94,6 +94,8 @@ class BoxplotThreshold(Threshold): + self.notify_observer() + + def push_latest_data_to_queue(self, data): ++ if data < 1e-6: ++ return + try: + self.data_queue.put(data, block=False) + except queue.Full: +@@ -111,9 +113,9 @@ class BoxplotThreshold(Threshold): + + + class NSigmaThreshold(Threshold): +- def __init__(self, parameter: float = 2.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000): ++ def __init__(self, n_sigma_parameter: float = 3.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs): + super().__init__(data_queue_size, data_queue_update_size) +- self.parameter = parameter ++ self.parameter = n_sigma_parameter + + def _update_threshold(self): + data = list(self.data_queue.queue) +@@ -125,6 +127,8 @@ class NSigmaThreshold(Threshold): + self.notify_observer() + + def push_latest_data_to_queue(self, data): ++ if data < 1e-6: ++ return + try: + self.data_queue.put(data, block=False) + except queue.Full: +@@ -157,4 +161,3 @@ class ThresholdFactory: + return NSigmaThreshold(*args, **kwargs) + else: + raise ValueError(f"Invalid threshold type: {threshold_type}") +- +diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py +similarity index 86% +rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py +rename to src/python/sentryPlugins/ai_block_io/utils.py +index f66e5ed..8dbba06 100644 +--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py ++++ b/src/python/sentryPlugins/ai_block_io/utils.py +@@ -8,13 +8,16 @@ + # IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR + # PURPOSE. + # See the Mulan PSL v2 for more details. ++ + import logging + from dataclasses import asdict + ++ + from .threshold import ThresholdType + from .sliding_window import SlidingWindowType + from .io_data import MetricName, IOData + ++ + def get_threshold_type_enum(algorithm_type: str): + if algorithm_type.lower() == 'absolute': + return ThresholdType.AbsoluteThreshold +@@ -22,7 +25,7 @@ def get_threshold_type_enum(algorithm_type: str): + return ThresholdType.BoxplotThreshold + if algorithm_type.lower() == 'n_sigma': + return ThresholdType.NSigmaThreshold +- logging.info('not found correct algorithm type, use default: boxplot.') ++ logging.warning(f"the algorithm type: {algorithm_type} you set is invalid, use default value: boxplot") + return ThresholdType.BoxplotThreshold + + +@@ -33,7 +36,7 @@ def get_sliding_window_type_enum(sliding_window_type: str): + return SlidingWindowType.ContinuousSlidingWindow + if sliding_window_type.lower() == 'median': + return SlidingWindowType.MedianSlidingWindow +- logging.info('not found correct sliding window type, use default: not_continuous.') ++ logging.warning(f"the sliding window type: {sliding_window_type} you set is invalid, use default value: not_continuous") + return SlidingWindowType.NotContinuousSlidingWindow + + +@@ -62,6 +65,8 @@ def get_log_level(log_level: str): + return logging.INFO + elif log_level.lower() == 'warning': + return logging.WARNING +- elif log_level.lower() == 'fatal': +- return logging.FATAL +- return None ++ elif log_level.lower() == 'error': ++ return logging.ERROR ++ elif log_level.lower() == 'critical': ++ return logging.CRITICAL ++ return logging.INFO +diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py +deleted file mode 100644 +index cd4e6f1..0000000 +--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py ++++ /dev/null +@@ -1,141 +0,0 @@ +-# coding: utf-8 +-# Copyright (c) 2024 Huawei Technologies Co., Ltd. +-# sysSentry is licensed under the Mulan PSL v2. +-# You can use this software according to the terms and conditions of the Mulan PSL v2. +-# You may obtain a copy of Mulan PSL v2 at: +-# http://license.coscl.org.cn/MulanPSL2 +-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +-# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +-# PURPOSE. +-# See the Mulan PSL v2 for more details. +- +-import configparser +-import logging +- +- +-class ConfigParser: +- +- DEFAULT_ABSOLUTE_THRESHOLD = 40 +- DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1 +- DEFAULT_LOG_LEVEL = 'info' +- DEFAULT_TRAIN_DATA_DURATION = 24 +- DEFAULT_TRAIN_UPDATE_DURATION = 2 +- DEFAULT_ALGORITHM_TYPE = 'boxplot' +- DEFAULT_N_SIGMA_PARAMETER = 3 +- DEFAULT_BOXPLOT_PARAMETER = 1.5 +- DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous' +- DEFAULT_WINDOW_SIZE = 30 +- DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6 +- +- def __init__(self, config_file_name): +- self.__boxplot_parameter = None +- self.__window_minimum_threshold = None +- self.__window_size = None +- self.__sliding_window_type = None +- self.__n_sigma_parameter = None +- self.__algorithm_type = None +- self.__train_update_duration = None +- self.__log_level = None +- self.__slow_io_detect_frequency = None +- self.__absolute_threshold = None +- self.__train_data_duration = None +- self.__config_file_name = config_file_name +- +- def read_config_from_file(self): +- +- con = configparser.ConfigParser() +- con.read(self.__config_file_name, encoding='utf-8') +- +- items_common = dict(con.items('common')) +- items_algorithm = dict(con.items('algorithm')) +- items_sliding_window = dict(con.items('sliding_window')) +- +- try: +- self.__absolute_threshold = int(items_common.get('absolute_threshold', +- ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD)) +- except ValueError: +- self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD +- logging.warning('absolute threshold type conversion has error, use default value.') +- +- try: +- self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency', +- ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY)) +- except ValueError: +- self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY +- logging.warning('slow_io_detect_frequency type conversion has error, use default value.') +- +- self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL) +- +- try: +- self.__train_data_duration = float(items_algorithm.get('train_data_duration', +- ConfigParser.DEFAULT_TRAIN_DATA_DURATION)) +- except ValueError: +- self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION +- logging.warning('train_data_duration type conversion has error, use default value.') +- +- try: +- self.__train_update_duration = float(items_algorithm.get('train_update_duration', +- ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION)) +- except ValueError: +- self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION +- logging.warning('train_update_duration type conversion has error, use default value.') +- +- try: +- self.__algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE) +- except ValueError: +- self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE +- logging.warning('algorithmType type conversion has error, use default value.') +- +- if self.__algorithm_type == 'n_sigma': +- try: +- self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter', +- ConfigParser.DEFAULT_N_SIGMA_PARAMETER)) +- except ValueError: +- self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER +- logging.warning('n_sigma_parameter type conversion has error, use default value.') +- elif self.__algorithm_type == 'boxplot': +- try: +- self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter', +- ConfigParser.DEFAULT_BOXPLOT_PARAMETER)) +- except ValueError: +- self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER +- logging.warning('boxplot_parameter type conversion has error, use default value.') +- +- self.__sliding_window_type = items_sliding_window.get('sliding_window_type', +- ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE) +- +- try: +- self.__window_size = int(items_sliding_window.get('window_size', +- ConfigParser.DEFAULT_WINDOW_SIZE)) +- except ValueError: +- self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE +- logging.warning('window_size type conversion has error, use default value.') +- +- try: +- self.__window_minimum_threshold = ( +- int(items_sliding_window.get('window_minimum_threshold', +- ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD))) +- except ValueError: +- self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD +- logging.warning('window_minimum_threshold type conversion has error, use default value.') +- +- def get_slow_io_detect_frequency(self): +- return self.__slow_io_detect_frequency +- +- def get_algorithm_type(self): +- return self.__algorithm_type +- +- def get_sliding_window_type(self): +- return self.__sliding_window_type +- +- def get_train_data_duration_and_train_update_duration(self): +- return self.__train_data_duration, self.__train_update_duration +- +- def get_window_size_and_window_minimum_threshold(self): +- return self.__window_size, self.__window_minimum_threshold +- +- def get_absolute_threshold(self): +- return self.__absolute_threshold +- +- def get_log_level(self): +- return self.__log_level +diff --git a/src/python/setup.py b/src/python/setup.py +index dac6481..9e26a10 100644 +--- a/src/python/setup.py ++++ b/src/python/setup.py +@@ -34,7 +34,7 @@ setup( + 'xalarmd=xalarm.xalarm_daemon:alarm_process_create', + 'sentryCollector=sentryCollector.collectd:main', + 'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main', +- 'ai_threshold_slow_io_detection=sentryPlugins.ai_threshold_slow_io_detection.slow_io_detection:main' ++ 'ai_block_io=sentryPlugins.ai_block_io.ai_block_io:main' + ] + }, + ) +-- +2.23.0 + diff --git a/sysSentry.spec b/sysSentry.spec index 7f3e8332f78444ac556b37339ae35c0aee7013e1..1bd859e6a00afc17862c79e06c3546663248bfd2 100644 --- a/sysSentry.spec +++ b/sysSentry.spec @@ -4,7 +4,7 @@ Summary: System Inspection Framework Name: sysSentry Version: 1.0.2 -Release: 18 +Release: 19 License: Mulan PSL v2 Group: System Environment/Daemons Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz @@ -30,6 +30,7 @@ Patch17: optimize-the-handing-of-cat-cli-error-msg-in-cpu_sentry.patch Patch18: over-threshold-should-be-warn-level-log-in-cat-cli.patch Patch19: fix-bug-step-2-about-collect-module-and-avg-block-io.patch Patch20: add-log-level-and-change-log-format.patch +Patch21: fix-ai_block_io-some-issues.patch BuildRequires: cmake gcc-c++ BuildRequires: python3 python3-setuptools @@ -73,12 +74,12 @@ Requires: sysSentry = %{version}-%{release} %description -n avg_block_io This package provides Supports slow I/O detection based on EBPF -%package -n ai_threshold_slow_io_detection +%package -n ai_block_io Summary: Supports slow I/O detection Requires: python3-numpy Requires: sysSentry = %{version}-%{release} -%description -n ai_threshold_slow_io_detection +%description -n ai_block_io This package provides Supports slow I/O detection based on AI %prep @@ -139,9 +140,9 @@ chrpath -d %{buildroot}%{_libdir}/libcpu_patrol.so install config/tasks/avg_block_io.mod %{buildroot}/etc/sysSentry/tasks/ install config/plugins/avg_block_io.ini %{buildroot}/etc/sysSentry/plugins/avg_block_io.ini -# ai_threshold_slow_io_detection -install config/tasks/ai_threshold_slow_io_detection.mod %{buildroot}/etc/sysSentry/tasks/ -install config/plugins/ai_threshold_slow_io_detection.ini %{buildroot}/etc/sysSentry/plugins/ai_threshold_slow_io_detection.ini +# ai_block_io +install config/tasks/ai_block_io.mod %{buildroot}/etc/sysSentry/tasks/ +install config/plugins/ai_block_io.ini %{buildroot}/etc/sysSentry/plugins/ai_block_io.ini pushd src/python python3 setup.py install -O1 --root=$RPM_BUILD_ROOT --record=SENTRY_FILES @@ -176,7 +177,7 @@ rm -rf %{buildroot} %attr(0550,root,root) %{python3_sitelib}/syssentry %attr(0550,root,root) %{python3_sitelib}/sentryCollector %attr(0550,root,root) %{python3_sitelib}/sentryPlugins/avg_block_io -%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_threshold_slow_io_detection +%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_block_io # sysSentry %attr(0500,root,root) %{_bindir}/sentryctl @@ -208,10 +209,10 @@ rm -rf %{buildroot} %exclude %{_bindir}/avg_block_io %exclude %{python3_sitelib}/sentryPlugins/* -# ai_threshold_slow_io_detection -%exclude %{_sysconfdir}/sysSentry/tasks/ai_threshold_slow_io_detection.mod -%exclude %{_sysconfdir}/sysSentry/plugins/ai_threshold_slow_io_detection.ini -%exclude %{_bindir}/ai_threshold_slow_io_detection +# ai_block_io +%exclude %{_sysconfdir}/sysSentry/tasks/ai_block_io.mod +%exclude %{_sysconfdir}/sysSentry/plugins/ai_block_io.ini +%exclude %{_bindir}/ai_block_io %exclude %{python3_sitelib}/sentryPlugins/* # sentryCollector @@ -241,13 +242,19 @@ rm -rf %{buildroot} %attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/avg_block_io.ini %attr(0550,root,root) %{python3_sitelib}/sentryPlugins/avg_block_io -%files -n ai_threshold_slow_io_detection -%attr(0500,root,root) %{_bindir}/ai_threshold_slow_io_detection -%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/ai_threshold_slow_io_detection.mod -%attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/ai_threshold_slow_io_detection.ini -%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_threshold_slow_io_detection +%files -n ai_block_io +%attr(0500,root,root) %{_bindir}/ai_block_io +%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/ai_block_io.mod +%attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/ai_block_io.ini +%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_block_io %changelog +* Mon Sep 30 2024 heyouzhi - 1.0.2-19 +- Type:bugfix +- CVE:NA +- SUG:NA +- DESC:fix ai_block_io some issues + * Fri Sep 27 2024 zhuofeng - 1.0.2-18 - Type:bugfix - CVE:NA