diff --git a/add-get_disk_type-and-fix-some-bugs.patch b/add-get_disk_type-and-fix-some-bugs.patch new file mode 100644 index 0000000000000000000000000000000000000000..b5e59e8960bb7514dd66e78fb81365dc29518a7f --- /dev/null +++ b/add-get_disk_type-and-fix-some-bugs.patch @@ -0,0 +1,176 @@ +From c2ffc679eddda5d78362612d89a9319d268da7e3 Mon Sep 17 00:00:00 2001 +From: zhuofeng +Date: Thu, 10 Oct 2024 20:17:34 +0800 +Subject: [PATCH] add get_disk_type and fix some bugs + +--- + service/sentryCollector.service | 2 +- + src/python/sentryCollector/collect_io.py | 16 ++++- + src/python/sentryCollector/collect_plugin.py | 68 +++++++++++++++++++- + 3 files changed, 81 insertions(+), 5 deletions(-) + +diff --git a/service/sentryCollector.service b/service/sentryCollector.service +index 4ee07d5..e09ddb3 100644 +--- a/service/sentryCollector.service ++++ b/service/sentryCollector.service +@@ -1,5 +1,5 @@ + [Unit] +-Description = Collection module added for sysSentry and kernel lock-free collection ++Description = Collection module added for sysSentry + + [Service] + ExecStart=/usr/bin/python3 /usr/bin/sentryCollector +diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py +index 8780648..6699a90 100644 +--- a/src/python/sentryCollector/collect_io.py ++++ b/src/python/sentryCollector/collect_io.py +@@ -116,7 +116,7 @@ class CollectIo(): + return 0 + if finish <= 0 or lat_time <= 0: + return 0 +- value = lat_time / finish / 1000 / 1000 ++ value = lat_time / finish / 1000 + if value.is_integer(): + return int(value) + else: +@@ -124,11 +124,17 @@ class CollectIo(): + + def get_io_length(self, curr_stage_value, last_stage_value, category): + try: +- finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) ++ lat_time = (int(curr_stage_value[category * 3 + IoStatus.LATENCY]) - int(last_stage_value[category * 3 + IoStatus.LATENCY])) + except ValueError as e: + logging.error("get_io_length convert to int failed, %s", e) + return 0 +- value = finish / self.period_time / 1000 / 1000 ++ if lat_time <= 0: ++ return 0 ++ # ns convert us ++ lat_time = lat_time / 1000 ++ # s convert us ++ period_time = self.period_time * 1000 * 1000 ++ value = lat_time / period_time + if value.is_integer(): + return int(value) + else: +@@ -141,6 +147,8 @@ class CollectIo(): + with open(io_dump_file, 'r') as file: + for line in file: + count += line.count('.op=' + Io_Category[category]) ++ if count > 0: ++ logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}") + except FileNotFoundError: + logging.error("The file %s does not exist.", io_dump_file) + return count +@@ -223,6 +231,8 @@ class CollectIo(): + if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: + continue + self.append_period_lat(disk_name, stage_list) ++ ++ logging.debug(f"no-lock collect data : {IO_GLOBAL_DATA}") + + elapsed_time = time.time() - start_time + sleep_time = self.period_time - elapsed_time +diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py +index 3e2cf4c..31bf11b 100644 +--- a/src/python/sentryCollector/collect_plugin.py ++++ b/src/python/sentryCollector/collect_plugin.py +@@ -16,6 +16,7 @@ import json + import socket + import logging + import re ++import os + + COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" + +@@ -58,6 +59,8 @@ class ResultMessage(): + RESULT_EXCEED_LIMIT = 4 # the parameter length exceeds the limit. + RESULT_PARSE_FAILED = 5 # parse failed + RESULT_INVALID_CHAR = 6 # invalid char ++ RESULT_DISK_NOEXIST = 7 # disk is not exist ++ RESULT_DISK_TYPE_MISMATCH= 8 # disk type mismatch + + Result_Messages = { + ResultMessage.RESULT_SUCCEED: "Succeed", +@@ -66,9 +69,15 @@ Result_Messages = { + ResultMessage.RESULT_INVALID_LENGTH: "Invalid parameter length", + ResultMessage.RESULT_EXCEED_LIMIT: "The parameter length exceeds the limit", + ResultMessage.RESULT_PARSE_FAILED: "Parse failed", +- ResultMessage.RESULT_INVALID_CHAR: "Invalid char" ++ ResultMessage.RESULT_INVALID_CHAR: "Invalid char", ++ ResultMessage.RESULT_DISK_NOEXIST: "Disk is not exist", ++ ResultMessage.RESULT_DISK_TYPE_MISMATCH: "Disk type mismatch" + } + ++class DiskType(): ++ TYPE_NVME_SSD = 0 ++ TYPE_SATA_SSD = 1 ++ TYPE_SATA_HDD = 2 + + def client_send_and_recv(request_data, data_str_len, protocol): + """client socket send and recv message""" +@@ -273,3 +282,60 @@ def inter_get_io_data(period, disk_list, stage, iotype): + result['message'] = result_message + return result + ++def get_disk_type(disk): ++ result = {} ++ result['ret'] = ResultMessage.RESULT_UNKNOWN ++ result['message'] = "" ++ if not disk: ++ logging.error("param is invalid") ++ result['ret'] = ResultMessage.RESULT_NOT_PARAM ++ return result ++ if len(disk) <= 0 or len(disk) > LIMIT_DISK_CHAR_LEN: ++ logging.error("invalid disk length") ++ result['ret'] = ResultMessage.RESULT_INVALID_LENGTH ++ return result ++ pattern = r'^[a-zA-Z0-9_-]+$' ++ if not re.match(pattern, disk): ++ logging.error("%s is invalid char", disk) ++ result['ret'] = ResultMessage.RESULT_INVALID_CHAR ++ return result ++ ++ base_path = '/sys/block' ++ all_disk = [] ++ for disk_name in os.listdir(base_path): ++ all_disk.append(disk_name) ++ ++ if disk not in all_disk: ++ logging.error("disk %s is not exist", disk) ++ result['ret'] = ResultMessage.RESULT_DISK_NOEXIST ++ return result ++ ++ if disk[0:4] == "nvme": ++ result['message'] = str(DiskType.TYPE_NVME_SSD) ++ elif disk[0:2] == "sd": ++ disk_file = '/sys/block/{}/queue/rotational'.format(disk) ++ try: ++ with open(disk_file, 'r') as file: ++ num = int(file.read()) ++ if num == 1: ++ result['message'] = str(DiskType.TYPE_SATA_SSD) ++ elif num == 0: ++ result['message'] = str(DiskType.TYPE_SATA_HDD) ++ else: ++ logging.error("disk %s is not support, num = %d", disk, num) ++ result['ret'] = ResultMessage.RESULT_DISK_TYPE_MISMATCH ++ return result ++ except FileNotFoundError: ++ logging.error("The disk_file [%s] does not exist", disk_file) ++ result['ret'] = ResultMessage.RESULT_DISK_NOEXIST ++ return result ++ except Exception as e: ++ logging.error("open disk_file %s happen an error: %s", disk_file, e) ++ return result ++ else: ++ logging.error("disk %s is not support", disk) ++ result['ret'] = ResultMessage.RESULT_DISK_TYPE_MISMATCH ++ return result ++ ++ result['ret'] = ResultMessage.RESULT_SUCCEED ++ return result +\ No newline at end of file +-- +2.33.0 + diff --git a/add-log-for-improving-maintainability.patch b/add-log-for-improving-maintainability.patch new file mode 100644 index 0000000000000000000000000000000000000000..a9a45273d0a95d8b5be940fbd070387f4a96f1fe --- /dev/null +++ b/add-log-for-improving-maintainability.patch @@ -0,0 +1,251 @@ +From a8418093bb37482da7ccaac0c950f2ed8d0ba2fa Mon Sep 17 00:00:00 2001 +From: gaoruoshu +Date: Thu, 10 Oct 2024 15:07:29 +0800 +Subject: [PATCH] add log for improving maintainability + +--- + .../avg_block_io/avg_block_io.py | 4 +- + .../sentryPlugins/avg_block_io/module_conn.py | 57 ++++++++++------- + .../avg_block_io/stage_window.py | 8 +++ + .../sentryPlugins/avg_block_io/utils.py | 63 +++++++++++++++++-- + 4 files changed, 103 insertions(+), 29 deletions(-) + +diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py +index 26a60c5..cf2ded3 100644 +--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py ++++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py +@@ -194,11 +194,11 @@ def init_io_win(io_dic, config, common_param): + + if avg_lim_value and avg_time_value and tot_lim_value: + io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value) +- logging.debug("Successfully create {}-{}-{} latency window".format(disk_name, stage_name, rw)) ++ logging.debug("Successfully create {}-{}-{}-latency window".format(disk_name, stage_name, rw)) + + if iodump_lim_value is not None: + io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value) +- logging.debug("Successfully create {}-{}-{} iodump window".format(disk_name, stage_name, rw)) ++ logging.debug("Successfully create {}-{}-{}-iodump window".format(disk_name, stage_name, rw)) + return io_data, io_avg_value + + +diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py +index 2fc5a83..40b3fcc 100644 +--- a/src/python/sentryPlugins/avg_block_io/module_conn.py ++++ b/src/python/sentryPlugins/avg_block_io/module_conn.py +@@ -13,7 +13,7 @@ import logging + import sys + import time + +-from .utils import is_abnormal ++from .utils import is_abnormal, get_win_data, log_slow_win + from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages + from syssentry.result import ResultLevel, report_result + from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR +@@ -66,36 +66,51 @@ def report_alarm_fail(alarm_info): + + def process_report_data(disk_name, rw, io_data): + """check abnormal window and report to xalarm""" +- if not is_abnormal((disk_name, 'bio', rw), io_data): ++ abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data) ++ if not abnormal: + return + +- msg = {"alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw} ++ msg = { ++ "alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw, ++ "reason": "unknown", "block_stack": "bio", "alarm_type": abnormal_list, ++ "details": get_win_data(disk_name, rw, io_data) ++ } + ++ # io press + ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq'] + for stage_name in ctrl_stage: +- if is_abnormal((disk_name, stage_name, rw), io_data): +- msg["reason"] = "IO press slow" +- msg["block_stack"] = f"bio,{stage_name}" +- logging.warning("{} - {} report IO press slow".format(disk_name, rw)) +- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) +- return +- +- if is_abnormal((disk_name, 'rq_driver', rw), io_data): ++ abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data) ++ if not abnormal: ++ continue ++ msg["reason"] = "IO press" ++ msg["block_stack"] = f"bio,{stage_name}" ++ msg["alarm_type"] = abnormal_list ++ log_slow_win(msg, "IO press") ++ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) ++ return ++ ++ # driver slow ++ abnormal, abnormal_list = is_abnormal((disk_name, 'rq_driver', rw), io_data) ++ if abnormal: + msg["reason"] = "driver slow" + msg["block_stack"] = "bio,rq_driver" +- logging.warning("{} - {} report driver slow".format(disk_name, rw)) ++ msg["alarm_type"] = abnormal_list ++ log_slow_win(msg, "driver slow") + xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) + return + ++ # kernel slow + kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue'] + for stage_name in kernel_stage: +- if is_abnormal((disk_name, stage_name, rw), io_data): +- msg["reason"] = "kernel slow" +- msg["block_stack"] = f"bio,{stage_name}" +- logging.warning("{} - {} report kernel slow".format(disk_name, rw)) +- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) +- return +- msg["reason"] = "unknown" +- msg["block_stack"] = "bio" +- logging.warning("{} - {} report UNKNOWN slow".format(disk_name, rw)) ++ abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data) ++ if not abnormal: ++ continue ++ msg["reason"] = "kernel slow" ++ msg["block_stack"] = f"bio,{stage_name}" ++ msg["alarm_type"] = abnormal_list ++ log_slow_win(msg, "kernel slow") ++ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) ++ return ++ ++ log_slow_win(msg, "unknown") + xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) +diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py +index 9b0ce79..5113782 100644 +--- a/src/python/sentryPlugins/avg_block_io/stage_window.py ++++ b/src/python/sentryPlugins/avg_block_io/stage_window.py +@@ -14,6 +14,11 @@ class AbnormalWindowBase: + self.window_size = window_size + self.window_threshold = window_threshold + self.abnormal_window = [False] * window_size ++ self.window_data = [-1] * window_size ++ ++ def append_new_data(self, ab_res): ++ self.window_data.pop(0) ++ self.window_data.append(ab_res) + + def append_new_period(self, ab_res, avg_val=0): + self.abnormal_window.pop(0) +@@ -25,6 +30,9 @@ class AbnormalWindowBase: + def is_abnormal_window(self): + return sum(self.abnormal_window) > self.window_threshold + ++ def window_data_to_string(self): ++ return ",".join(str(x) for x in self.window_data) ++ + + class IoWindow(AbnormalWindowBase): + def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40): +diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py +index 2de9a46..3b7f027 100644 +--- a/src/python/sentryPlugins/avg_block_io/utils.py ++++ b/src/python/sentryPlugins/avg_block_io/utils.py +@@ -65,15 +65,32 @@ def set_nested_value(data, keys, value): + return True + + ++def get_win_data(disk_name, rw, io_data): ++ """get latency and iodump win data""" ++ latency = '' ++ iodump = '' ++ for stage_name in io_data[disk_name]: ++ if 'latency' in io_data[disk_name][stage_name][rw]: ++ latency_list = io_data[disk_name][stage_name][rw]['latency'].window_data_to_string() ++ latency += f'{stage_name}: [{latency_list}], ' ++ if 'iodump' in io_data[disk_name][stage_name][rw]: ++ iodump_list = io_data[disk_name][stage_name][rw]['iodump'].window_data_to_string() ++ iodump += f'{stage_name}: [{iodump_list}], ' ++ return {"latency": latency[:-2], "iodump": iodump[:-2]} ++ ++ + def is_abnormal(io_key, io_data): + """check if latency and iodump win abnormal""" ++ abnormal_list = '' + for key in ['latency', 'iodump']: + all_keys = get_nested_value(io_data, io_key) + if all_keys and key in all_keys: + win = get_nested_value(io_data, io_key + (key,)) + if win and win.is_abnormal_window(): +- return True +- return False ++ abnormal_list += key + ', ' ++ if not abnormal_list: ++ return False, abnormal_list ++ return True, abnormal_list[:-2] + + + def update_io_avg(old_avg, period_value, win_size): +@@ -87,8 +104,8 @@ def update_io_avg(old_avg, period_value, win_size): + return [new_avg_value, new_avg_count] + + +-def update_io_data(old_avg, period_value, win_size, io_data, io_key): +- """update data of latency and iodump window""" ++def update_io_period(old_avg, period_value, io_data, io_key): ++ """update period of latency and iodump window""" + all_wins = get_nested_value(io_data, io_key) + if all_wins and "latency" in all_wins: + io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE]) +@@ -96,20 +113,54 @@ def update_io_data(old_avg, period_value, win_size, io_data, io_key): + io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1]) + + ++def update_io_data(period_value, io_data, io_key): ++ """update data of latency and iodump window""" ++ all_wins = get_nested_value(io_data, io_key) ++ if all_wins and "latency" in all_wins: ++ io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_data(period_value[0]) ++ if all_wins and "iodump" in all_wins: ++ io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_data(period_value[1]) ++ ++ ++def log_abnormal_period(old_avg, period_value, io_data, io_key): ++ """record log of abnormal period""" ++ all_wins = get_nested_value(io_data, io_key) ++ if all_wins and "latency" in all_wins: ++ if all_wins["latency"].is_abnormal_period(period_value[0], old_avg[AVG_VALUE]): ++ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, " ++ f"type: latency, avg: {round(old_avg[AVG_VALUE], 3)}, curr_val: {period_value[0]}") ++ if all_wins and "iodump" in all_wins: ++ if all_wins["iodump"].is_abnormal_period(period_value[1]): ++ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, " ++ f"type: iodump, curr_val: {period_value[1]}") ++ ++ ++def log_slow_win(msg, reason): ++ """record log of slow win""" ++ logging.warning(f"[SLOW IO] disk: {msg['driver_name']}, stage: {msg['block_stack']}, " ++ f"iotype: {msg['io_type']}, type: {msg['alarm_type']}, reason: {reason}") ++ logging.info(f"latency: {msg['details']['latency']}") ++ logging.info(f"iodump: {msg['details']['iodump']}") ++ ++ + def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data): + """update avg and check abonrmal, return true if win_size full""" + period_value = get_nested_value(data, io_key) + old_avg = get_nested_value(io_avg_value, io_key) + + # 更新avg数据 ++ update_io_data(period_value, io_data, io_key) + if old_avg[AVG_COUNT] < win_size: + set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size)) + return False + ++ # 打印异常周期数据 ++ log_abnormal_period(old_avg, period_value, io_data, io_key) ++ + # 更新win数据 -- 判断异常周期 +- update_io_data(old_avg, period_value, win_size, io_data, io_key) ++ update_io_period(old_avg, period_value, io_data, io_key) + all_wins = get_nested_value(io_data, io_key) +- if all_wins and 'latency' not in all_wins: ++ if not all_wins or 'latency' not in all_wins: + return True + period = get_nested_value(io_data, io_key + ("latency",)) + if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]): +-- +2.27.0 + diff --git a/sysSentry.spec b/sysSentry.spec index f37ba628cf1cf33ea0a1fe98e1d8d3b7491cb58e..8f6907f65eac4f07c230aacc3d328be78e93c21e 100644 --- a/sysSentry.spec +++ b/sysSentry.spec @@ -4,7 +4,7 @@ Summary: System Inspection Framework Name: sysSentry Version: 1.0.2 -Release: 29 +Release: 30 License: Mulan PSL v2 Group: System Environment/Daemons Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz @@ -42,6 +42,8 @@ Patch29: change-alarm-length.patch Patch30: add-detail-time.patch Patch31: xalarm-add-alarm-msg-length-to-8192.patch Patch32: ai_block_io-adapt-alarm-module.patch +Patch33: add-log-for-improving-maintainability.patch +Patch34: add-get_disk_type-and-fix-some-bugs.patch BuildRequires: cmake gcc-c++ BuildRequires: python3 python3-setuptools @@ -286,6 +288,13 @@ rm -rf %{buildroot} %attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_block_io %changelog +* Thu Oct 10 2024 zhuofeng - 1.0.2-30 +- Type:bugfix +- CVE:NA +- SUG:NA +- DESC:add get_disk_type and fix some bugs + add log for improving maintainability + * Thu Oct 10 2024 heyouzhi - 1.0.2-29 - Type:requirement - CVE:NA