diff --git a/sysSentry-1.0.2/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/sysSentry-1.0.2/src/python/sentryPlugins/ai_block_io/ai_block_io.py index 2672f1de4069b1215157d10460760a8866126774..dd661a1947a8d915a836d9ac0ea35bd2a583e58e 100644 --- a/sysSentry-1.0.2/src/python/sentryPlugins/ai_block_io/ai_block_io.py +++ b/sysSentry-1.0.2/src/python/sentryPlugins/ai_block_io/ai_block_io.py @@ -15,7 +15,7 @@ import logging from collections import defaultdict from .detector import Detector, DiskDetector -from .threshold import ThresholdFactory, AbsoluteThreshold +from .threshold import ThresholdFactory from .sliding_window import SlidingWindowFactory from .utils import get_data_queue_size_and_update_size from .config_parser import ConfigParser @@ -178,14 +178,17 @@ class SlowIODetection: logging.debug("step3. Report slow io event to sysSentry.") for slow_io_event in slow_io_event_list: metric_name: MetricName = slow_io_event[1] + window_info = slow_io_event[2] + root_cause = slow_io_event[3] alarm_content = { "driver_name": f"{metric_name.disk_name}", - "reason": "disk_slow", + "reason": root_cause, "block_stack": f"{metric_name.stage_name}", "io_type": f"{metric_name.io_access_type_name}", "alarm_source": "ai_block_io", "alarm_type": "latency", - "details": f"disk type: {metric_name.disk_type}, ai threshold: {slow_io_event[3]}, abs threshold: {slow_io_event[4]},current window: {slow_io_event[2]}.", + "details": f"disk type: {metric_name.disk_type}, current window: {window_info[1]}, " + f"ai threshold: {window_info[2]}, abs threshold: {window_info[3]}.", } Xalarm.major(alarm_content) logging.warning(alarm_content) diff --git a/sysSentry-1.0.2/src/python/sentryPlugins/ai_block_io/detector.py b/sysSentry-1.0.2/src/python/sentryPlugins/ai_block_io/detector.py index 4ef70497f5612cbf6ed902dab896d62a9b8c4261..87bd1dd8eaeae3de3252937897f958b031f52fc2 100644 --- a/sysSentry-1.0.2/src/python/sentryPlugins/ai_block_io/detector.py +++ 
def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
    """Run every detector of this disk and diagnose the root cause of slow IO.

    An abnormal ``bio`` stage is the necessary condition for a slow IO
    event. Given that ``bio`` is abnormal, the root cause is chosen as:

    1. ``rq_driver`` is abnormal as well           -> disk slow
    2. ``rq_driver`` is fine, another stage is not -> io stage slow
    3. no other stage is abnormal                  -> high io pressure

    Args:
        io_data_dict_with_disk_name: collected IO data, handed unchanged to
            each detector's ``is_slow_io_event``.

    Returns:
        A 4-tuple ``(is_slow_io_event, metric_name, window_info, root_cause)``.
        ``window_info`` is the first abnormal bio detector's raw result:
        ``((slow_io, abnormal_period), window, ai_threshold, abs_threshold)``.
        When no bio detector reports slow IO the tuple is
        ``(False, None, None, None)``.
    """
    diagnosis_info = {"bio": [], "rq_driver": [], "io_stage": []}
    for detector in self._detector_list:
        # Every detector is evaluated, even after a hit, so that each one
        # sees the data of the current period.
        result = detector.is_slow_io_event(io_data_dict_with_disk_name)
        if not result[0][0]:
            continue
        metric_name = detector.get_metric_name()
        stage_name = metric_name.stage_name
        # Any stage other than bio / rq_driver is grouped under "io_stage".
        bucket = stage_name if stage_name in ("bio", "rq_driver") else "io_stage"
        diagnosis_info[bucket].append((metric_name, result))

    if not diagnosis_info["bio"]:
        return False, None, None, None

    if diagnosis_info["rq_driver"]:
        root_cause = "[Root Cause:disk slow]"
    elif diagnosis_info["io_stage"]:
        # Entries are (metric_name, result) pairs: the stage name lives on
        # the metric name (index 0), not on the result tuple (index 1).
        stage = diagnosis_info["io_stage"][0][0].stage_name
        root_cause = f"[Root Cause:io stage slow, stage: {stage}]"
    else:
        root_cause = "[Root Cause:high io pressure]"

    bio_metric_name, bio_result = diagnosis_info["bio"][0]
    return True, bio_metric_name, bio_result, root_cause
class NotContinuousSlidingWindow(SlidingWindow):
    """Window that reports slow IO when enough samples in it are tagged abnormal."""

    def is_slow_io_event(self, data):
        """Push ``data`` into the window and evaluate it.

        Returns:
            ``((is_slow_io_event, is_abnormal_period), window, ai_threshold,
            abs_threshold)``. ``is_abnormal_period`` is the tag ``push``
            returned for ``data``. ``is_slow_io_event`` is True when at least
            ``_queue_threshold`` samples in the window are tagged abnormal.
        """
        is_abnormal_period = super().push(data)
        is_slow_io_event = False
        window_full = len(self._io_data_queue) >= self._queue_length
        has_threshold = self._ai_threshold is not None or self._abs_threshold is not None
        # Only judge a full window that has at least one threshold available;
        # otherwise the verdict stays False.
        if window_full and has_threshold:
            abnormal_count = self._io_data_queue_abnormal_tag.count(True)
            is_slow_io_event = abnormal_count >= self._queue_threshold
        return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold

    def __repr__(self):
        return f"[NotContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]"


class ContinuousSlidingWindow(SlidingWindow):
    """Window that reports slow IO on a long enough run of consecutive abnormal samples."""

    def is_slow_io_event(self, data):
        """Push ``data`` into the window and evaluate it.

        Returns:
            ``((is_slow_io_event, is_abnormal_period), window, ai_threshold,
            abs_threshold)``. ``is_slow_io_event`` is True when the window
            holds at least ``_queue_threshold`` consecutive abnormal samples.
        """
        is_abnormal_period = super().push(data)
        is_slow_io_event = False
        window_full = len(self._io_data_queue) >= self._queue_length
        has_threshold = self._ai_threshold is not None or self._abs_threshold is not None
        # Only judge a full window that has at least one threshold available.
        if window_full and has_threshold:
            consecutive_count = 0
            for tag in self._io_data_queue_abnormal_tag:
                if tag:
                    consecutive_count += 1
                    if consecutive_count >= self._queue_threshold:
                        is_slow_io_event = True
                        break
                else:
                    # A normal sample breaks the run.
                    consecutive_count = 0
        return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold

    def __repr__(self):
        return f"[ContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]"


class MedianSlidingWindow(SlidingWindow):
    """Window that reports slow IO when the median of its samples reaches a threshold."""

    def is_slow_io_event(self, data):
        """Push ``data`` into the window and evaluate it.

        Returns:
            ``((is_slow_io_event, is_abnormal_period), window, ai_threshold,
            abs_threshold)``. ``is_slow_io_event`` is True when the window
            median is >= the AI threshold or >= the absolute threshold;
            a threshold that is ``None`` is skipped.
        """
        is_abnormal_period = super().push(data)
        is_slow_io_event = False
        window_full = len(self._io_data_queue) >= self._queue_length
        has_threshold = self._ai_threshold is not None or self._abs_threshold is not None
        # Only judge a full window that has at least one threshold available.
        if window_full and has_threshold:
            median = np.median(self._io_data_queue)
            # Skip thresholds that are None: comparing a number with None
            # raises TypeError.
            is_slow_io_event = any(
                threshold is not None and median >= threshold
                for threshold in (self._ai_threshold, self._abs_threshold)
            )
        return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold

    def __repr__(self):
        return f"[MedianSlidingWindow, window size: {self._queue_length}]"