diff --git a/sysSentry-1.0.2/src/python/syssentry/alarm.py b/sysSentry-1.0.2/src/python/syssentry/alarm.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bbf68ef5d837cbd9a0e310d1e67c03f1b41c627
--- /dev/null
+++ b/sysSentry-1.0.2/src/python/syssentry/alarm.py
@@ -0,0 +1,166 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+"""
+use for report alarm
+"""
+import threading
+from typing import Dict, List
+from datetime import datetime
+import time
+import logging
+import json
+
+# from xalarm import xalarm_register, Xalarm
+from xalarm.register_xalarm import xalarm_register
+from xalarm.xalarm_api import Xalarm
+from xalarm.register_xalarm import *
+
+from .global_values import InspectTask
+from .task_map import TasksMap
+
+# Alarm id lookup: key is the plugin (task) name, value is its alarm id (int).
+task_alarm_id_dict: Dict[str, int] = {}
+
+# Alarm ageing time: key is the alarm id, value is the ageing time in seconds (int).
+alarm_id_clear_time_dict: Dict[int, int] = {}
+
+# Alarm events: key is the alarm id, value is the list of alarm events for that id.
+alarm_list_dict: Dict[int, List[Xalarm]] = {}
+# Lock protecting the alarm event lists.
+alarm_list_lock = threading.Lock()
+
+id_filter = []
+id_base = 1001
+
+TIME_UNIT_MILLISECONDS = 1000
+
+
+def xalarm_gettime(alarm_info: Xalarm) -> int:
+    """Return the alarm timestamp in whole milliseconds (0 if alarm_info is falsy)."""
+    if not alarm_info:
+        return 0
+    stamp = alarm_info.timetamp
+    # floor division keeps the result an int, as annotated; "/" yields a float
+    return stamp.tv_sec * TIME_UNIT_MILLISECONDS + stamp.tv_usec // TIME_UNIT_MILLISECONDS
+
+
+def update_alarm_list(alarm_info: Xalarm):
+    """Store a newly reported alarm and drop stale alarms with the same id.
+
+    Passed to xalarm_register() as the alarm callback. Alarms whose id is not
+    a key of alarm_list_dict are ignored with a warning.
+    """
+    alarm_id = alarm_info.alarm_id
+    timestamp = xalarm_gettime(alarm_info)
+    with alarm_list_lock:
+        if alarm_id not in alarm_list_dict:
+            logging.warning(f"update_alarm_list: alarm_id {alarm_id} not found in alarm_list_dict")
+            return
+        alarm_list = alarm_list_dict[alarm_id]
+        logging.debug(f"alarm_list_dict中的内容为: {alarm_list_dict}")
+        # new alarm is inserted into list head
+        alarm_list.insert(0, alarm_info)
+        # xalarm_gettime() is in milliseconds but the clear time is in seconds:
+        # convert before comparing, otherwise alarms expire 1000 times too early
+        clear_time_ms = alarm_id_clear_time_dict[alarm_id] * TIME_UNIT_MILLISECONDS
+        for index, alarm in enumerate(alarm_list):
+            if timestamp - xalarm_gettime(alarm) > clear_time_ms:
+                # cut the list at the first alarm older than the threshold
+                alarm_list_dict[alarm_id] = alarm_list[:index]
+                break
+
+
+def alarm_register():
+    """Fill the alarm lookup tables from the loaded tasks and register the callback.
+
+    For every task in TasksMap.tasks_dict: remember its alarm id, start an empty
+    alarm list for that id and keep the largest alarm_clear_time seen for the id.
+    """
+    logging.info("alarm_register: enter")
+    for task_type in TasksMap.tasks_dict:
+        for task_name, task in TasksMap.tasks_dict[task_type].items():
+            logging.info(f"alarm_register: {task_name} is registered")
+            alarm_id = task.alarm_id
+            alarm_list_dict[alarm_id] = []
+            task_alarm_id_dict[task_name] = alarm_id
+            # tasks may share an alarm id: keep the longest clear time
+            known_clear_time = alarm_id_clear_time_dict.get(alarm_id, task.alarm_clear_time)
+            alarm_id_clear_time_dict[alarm_id] = max(task.alarm_clear_time, known_clear_time)
+    # A local list: the module-level id_filter is left untouched. All 128 slots
+    # are enabled; alarms with unknown ids are discarded in update_alarm_list().
+    alarm_id_filter = [True] * 128
+    ret = xalarm_register(update_alarm_list, alarm_id_filter)
+    if ret < 0:
+        logging.error('register注册失败')
+        return
+    logging.info(f'ret是注册结果:{ret}')
+    logging.info('register 注册成功')
+
+
+def _xalarm_to_dict(alarm_info: Xalarm, detailed: bool) -> dict:
+    """Convert one alarm into a plain dict with its msg1 payload decoded from JSON."""
+    message = alarm_info.msg1
+    # presumably bytes when received; alarm_stu2bin() in xalarm_api treats msg1 as str, so accept both
+    if isinstance(message, bytes):
+        message = message.decode('utf-8', errors='replace')
+    message = message.rstrip('\x00')
+    try:
+        payload = json.loads(message)
+    except json.JSONDecodeError:
+        # one malformed message must not fail the whole query: return it raw
+        logging.warning(f"get_alarm_result: msg1 of alarm {alarm_info.alarm_id} is not valid json")
+        payload = message
+    if not detailed and isinstance(payload, dict):
+        payload.pop('details', None)
+    return {
+        'alarm_id': alarm_info.alarm_id,
+        'alarm_type': alarm_info.alarm_type,
+        'alarm_level': alarm_info.alarm_level,
+        'timetamp': xalarm_gettime(alarm_info),
+        'alarm_info': payload,
+    }
+
+
+def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Dict]:
+    """Return the alarms stored for task_name that fall within the last time_range seconds.
+
+    Args:
+        task_name: plugin name, looked up in task_alarm_id_dict.
+        time_range: look-back window in seconds; any value int() accepts.
+        detailed: when false, 'details' is removed from each decoded alarm payload.
+
+    Returns:
+        A list of dicts with the keys 'alarm_id', 'alarm_type', 'alarm_level',
+        'timetamp' (milliseconds) and 'alarm_info'. It is empty when the task or
+        its alarm id is unknown, or when time_range is not a number.
+    """
+    try:
+        range_ms = int(time_range) * TIME_UNIT_MILLISECONDS
+    except (TypeError, ValueError):
+        logging.error(f"get_alarm_result: invalid time_range {time_range!r}")
+        return []
+    with alarm_list_lock:
+        alarm_id = task_alarm_id_dict.get(task_name)
+        if alarm_id is None or alarm_id not in alarm_list_dict:
+            return []
+        alarm_list = alarm_list_dict[alarm_id]
+        logging.info(f"get_alarm_result: alarm_list of {alarm_id} has {len(alarm_list)} elements")
+        # "now" must be in milliseconds like xalarm_gettime(): with a seconds-based
+        # value the difference is always negative and nothing is ever filtered out
+        now_ms = int(datetime.now().timestamp() * TIME_UNIT_MILLISECONDS)
+        recent_alarms = []
+        for alarm in alarm_list:
+            if now_ms - xalarm_gettime(alarm) > range_ms:
+                break
+            recent_alarms.append(alarm)
+        logging.info(f"get_alarm_result: final alarm_list of {alarm_id} has {len(recent_alarms)} elements")
+        return [_xalarm_to_dict(alarm, detailed) for alarm in recent_alarms]
diff --git
a/sysSentry-1.0.2/src/python/syssentry/callbacks.py b/sysSentry-1.0.2/src/python/syssentry/callbacks.py index b38b381ed477197d005876e04b6231133247cd01..534bba25142a53c45c1c5f022770442907cf9b84 100644 --- a/sysSentry-1.0.2/src/python/syssentry/callbacks.py +++ b/sysSentry-1.0.2/src/python/syssentry/callbacks.py @@ -18,6 +18,7 @@ import logging from .task_map import TasksMap, ONESHOT_TYPE, PERIOD_TYPE from .mod_status import EXITED_STATUS, RUNNING_STATUS, WAITING_STATUS, set_runtime_status +from .alarm import get_alarm_result def task_get_status(mod_name): @@ -41,6 +42,18 @@ def task_get_result(mod_name): return "success", task.get_result() +def task_get_alarm(data): + """get alarms reported for a task; data holds task_name, time_range and detailed""" + task_name = data['task_name'] + time_range = data['time_range'] + detailed = data['detailed'] + task = TasksMap.get_task_by_name(task_name) + if not task: + return "failed", f"cannot find task by name {task_name}" + if not task.load_enabled: + return "failed", f"mod {task_name} is not enabled" + + return "success", get_alarm_result(task_name, time_range, detailed) def task_stop(mod_name): """stop by mod name""" diff --git a/sysSentry-1.0.2/src/python/syssentry/cpu_sentry.py b/sysSentry-1.0.2/src/python/syssentry/cpu_sentry.py index 2f18d144a5a245aee406bda174b853abd748f83d..72925eb2f9b36a4a8b6c21f2182151847c036d0b 100644 --- a/sysSentry-1.0.2/src/python/syssentry/cpu_sentry.py +++ b/sysSentry-1.0.2/src/python/syssentry/cpu_sentry.py @@ -26,8 +26,6 @@ CPU_SENTRY_PARAM_CONFIG = "/etc/sysSentry/plugins/cpu_sentry.ini" # Inspection commands running at the bottom layer LOW_LEVEL_INSPECT_CMD = "cat-cli" -# max length of msg in details -DETAILS_LOG_MSG_MAX_LEN = 255 class CpuSentry: """ @@ -96,10 +94,22 @@ class CpuSentry: self.send_result["details"]["msg"] = "cpu_sentry task is killed!"
return + if "ERROR" in stdout: + self.send_result["result"] = ResultLevel.FAIL + self.send_result["details"]["code"] = 1004 + + # Remove ANSI escape sequences + error_info = stdout.split("\n")[0] + if error_info.startswith("\u001b"): + ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' + error_info = re.sub(ansi_escape, '', error_info) + + self.send_result["details"]["msg"] = error_info + return + out_split = stdout.split("\n") - isolated_cores_number = -1 + isolated_cores_number = 0 found_fault_cores_list = [] - error_msg_list = [] for out_line_i in out_split: if "handle_patrol_result: Found fault cores" in out_line_i: cores_number_tmp = out_line_i.split("Found fault cores:")[1] @@ -111,25 +121,9 @@ class CpuSentry: elif out_line_i.startswith(''): self.send_result["details"]["isolated_cpu_list"] = out_line_i.split(':')[1] break - elif "ERROR" in out_line_i: - logging.error("[cat-cli error] - %s\n", out_line_i) - error_msg_list.append(out_line_i) found_fault_cores_number = len(set(found_fault_cores_list)) - if isolated_cores_number == -1: - self.send_result["result"] = ResultLevel.FAIL - self.send_result["details"]["code"] = 1004 - - send_error_msg = "" - # Remove ANSI escape sequences - for error_info in error_msg_list: - if error_info.startswith("\u001b"): - ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' - error_info = re.sub(ansi_escape, '', error_info) - if len(send_error_msg) + len(error_info) < DETAILS_LOG_MSG_MAX_LEN: - send_error_msg += ";" + error_info - self.send_result["details"]["msg"] = send_error_msg - elif found_fault_cores_number == 0: + if found_fault_cores_number == 0: self.send_result["details"]["code"] = 0 self.send_result["result"] = ResultLevel.PASS elif 0 in found_fault_cores_list: diff --git a/sysSentry-1.0.2/src/python/syssentry/global_values.py b/sysSentry-1.0.2/src/python/syssentry/global_values.py index 483d5447d97b36b1af61c1a2214bef68349f46aa..7a020104f96f2c5dc1a8eb943f4175cb59128a0a 100644 --- 
a/sysSentry-1.0.2/src/python/syssentry/global_values.py +++ b/sysSentry-1.0.2/src/python/syssentry/global_values.py @@ -76,6 +76,9 @@ class InspectTask: self.env_file = "" # start mode self.conflict = "up" + # alarm id + self.alarm_id = -1 + self.alarm_clear_time = 1800 def start(self): """ diff --git a/sysSentry-1.0.2/src/python/syssentry/load_mods.py b/sysSentry-1.0.2/src/python/syssentry/load_mods.py index 48d7e66e4674a862653ef89144dce328ff2ac820..f7b5aa9fdea1fa78fd20a8047f410e53200e0f03 100644 --- a/sysSentry-1.0.2/src/python/syssentry/load_mods.py +++ b/sysSentry-1.0.2/src/python/syssentry/load_mods.py @@ -41,6 +41,8 @@ CONF_TASK_RESTART = 'task_restart' CONF_ONSTART = 'onstart' CONF_ENV_FILE = 'env_file' CONF_CONFLICT = 'conflict' +CONF_ALARM_ID = 'alarm_id' +CONF_ALARM_CLEAR_TIME = 'alarm_clear_time' MOD_FILE_SUFFIX = '.mod' MOD_SUFFIX_LEN = 4 @@ -194,6 +196,13 @@ def parse_mod_conf(mod_name, mod_conf): task.heartbeat_interval = heartbeat_interval task.load_enabled = is_enabled + try: + task.alarm_id = int(mod_conf.get(CONF_TASK, CONF_ALARM_ID)) + task.alarm_clear_time = int(mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME)) + except: + task.alarm_id = 1001 + task.alarm_clear_time = 15 + if CONF_ONSTART in mod_conf.options(CONF_TASK): is_onstart = (mod_conf.get(CONF_TASK, CONF_ONSTART) == 'yes') if task_type == PERIOD_CONF: diff --git a/sysSentry-1.0.2/src/python/syssentry/sentryctl b/sysSentry-1.0.2/src/python/syssentry/sentryctl index e94491f110f12b692ed3d6461935e1f730063046..42fd7ef83b2373912689877856645e6a393f2857 100644 --- a/sysSentry-1.0.2/src/python/syssentry/sentryctl +++ b/sysSentry-1.0.2/src/python/syssentry/sentryctl @@ -25,6 +25,7 @@ MAX_PARAM_LENGTH = 256 RESULT_MSG_DATA_LEN = 4 CTL_MSG_LEN_LEN = 3 +DEFAULT_ALARM_TIME_RANGE = 10 def status_output_format(res_data): """format output""" @@ -57,6 +58,8 @@ def res_output_handle(res_struct, req_type): status_output_format(res_struct['data']) elif req_type == 'get_result': 
result_output_format(res_struct['data']) + elif req_type == 'get_alarm': + result_output_format(res_struct['data']) elif res_struct['ret'] == "failed": print(res_struct['data']) @@ -75,6 +78,7 @@ def client_send_and_recv(request_data, data_str_len): print("sentryctl: client creat socket error") return None + # connect to syssentry try: client_socket.connect(CTL_SOCKET_PATH) except OSError: @@ -82,6 +86,7 @@ def client_send_and_recv(request_data, data_str_len): print("sentryctl: client connect error") return None + # msg: CTL{len}{data} req_data_len = len(request_data) request_msg = "CTL" + str(req_data_len).zfill(3) + request_data @@ -94,8 +99,8 @@ def client_send_and_recv(request_data, data_str_len): print("sentryctl: client communicate error") return None + # res: RES{len}{data} res_magic = res_data[:3] - if res_magic != "RES": print("res msg format error") return None @@ -128,6 +133,10 @@ if __name__ == '__main__': parser_status.add_argument('task_name') parser_get_result = subparsers.add_parser('get_result', help='get task result') parser_get_result.add_argument('task_name') + parser_get_alarm = subparsers.add_parser('get_alarm', help='get task alarm') + parser_get_alarm.add_argument('task_name') + parser_get_alarm.add_argument('-s', '--time_range', type=str, default=DEFAULT_ALARM_TIME_RANGE, help='指定时间范围') + parser_get_alarm.add_argument('-d', '--detailed', action='store_true', help='打印详细信息') parser_list = subparsers.add_parser('list', help='show all loaded task mod') client_args = parser.parse_args() @@ -142,6 +151,15 @@ if __name__ == '__main__': req_msg_struct = {"type": "get_status", "data": client_args.task_name} elif client_args.cmd_type == 'get_result': req_msg_struct = {"type": "get_result", "data": client_args.task_name} + elif client_args.cmd_type == 'get_alarm': + req_msg_struct = { + "type": "get_alarm", + "data": { + 'task_name': client_args.task_name, + 'time_range': client_args.time_range, + 'detailed': client_args.detailed, + } + } elif 
client_args.cmd_type == 'reload': req_msg_struct = {"type": "reload", "data": client_args.task_name} else: diff --git a/sysSentry-1.0.2/src/python/syssentry/syssentry.py b/sysSentry-1.0.2/src/python/syssentry/syssentry.py index 776971f4e46fc05cc6bd8c9ad5856071169fd22e..204cf8c7e6e654c8f59dd8ccbe87ff491ea3e5d0 100644 --- a/sysSentry-1.0.2/src/python/syssentry/syssentry.py +++ b/sysSentry-1.0.2/src/python/syssentry/syssentry.py @@ -28,7 +28,7 @@ from .sentry_config import SentryConfig from .task_map import TasksMap from .global_values import SENTRY_RUN_DIR, CTL_SOCKET_PATH, SENTRY_RUN_DIR_PERM from .cron_process import period_tasks_handle -from .callbacks import mod_list_show, task_start, task_get_status, task_stop, task_get_result +from .callbacks import mod_list_show, task_start, task_get_status, task_stop, task_get_result, task_get_alarm from .mod_status import get_task_by_pid, set_runtime_status from .load_mods import load_tasks, reload_single_mod from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create, @@ -36,6 +36,7 @@ from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create, from .result import RESULT_MSG_HEAD_LEN, RESULT_MSG_MAGIC_LEN, RESULT_MAGIC from .result import RESULT_LEVEL_ERR_MSG_DICT, ResultLevel from .utils import get_current_time_string +from .alarm import alarm_register CPU_EXIST = True @@ -62,6 +63,7 @@ type_func = { 'stop': task_stop, 'get_status': task_get_status, 'get_result': task_get_result, + 'get_alarm': task_get_alarm, 'reload': reload_single_mod } @@ -107,11 +109,12 @@ def msg_data_process(msg_data): return "Invaild cmd type" cmd_param = data_struct['data'] - logging.debug("msg_data_process cmd_type:%s cmd_param:%s", cmd_type, cmd_param) + logging.info("msg_data_process cmd_type:%s cmd_param:%s", cmd_type, str(cmd_param)) if cmd_type in type_func: ret, res_data = type_func[cmd_type](cmd_param) else: ret, res_data = type_func_void[cmd_type]() + logging.info("msg_data_process res_data:%s",str(res_data)) res_msg_struct 
= {"ret": ret, "data": res_data} res_msg = json.dumps(res_msg_struct) @@ -256,7 +259,7 @@ def server_recv(server_socket: socket.socket): logging.debug("res head %s", res_head) res_msg = res_head + res_data - logging.debug("res msg %s", res_msg) + logging.info("res msg %s", res_msg) try: client_socket.send(res_msg.encode()) @@ -581,10 +584,10 @@ def main(): _ = SentryConfig.init_param() TasksMap.init_task_map() load_tasks() + alarm_register() main_loop() except Exception: logging.error('%s', traceback.format_exc()) finally: - release_pidfile() - + release_pidfile() \ No newline at end of file diff --git a/sysSentry-1.0.2/src/python/syssentry/task_map.py b/sysSentry-1.0.2/src/python/syssentry/task_map.py index 70aa19d878b199468e25fc248e3752f149cfa7ac..6f9637783e0f1ca907ea978b62c7ecd7a8125f68 100644 --- a/sysSentry-1.0.2/src/python/syssentry/task_map.py +++ b/sysSentry-1.0.2/src/python/syssentry/task_map.py @@ -13,6 +13,9 @@ tasks map class and initialize function. """ import logging +from typing import Dict + +# from .global_values import InspectTask ONESHOT_TYPE = "ONESHOT" PERIOD_TYPE = "PERIOD" @@ -22,7 +25,7 @@ TASKS_MAP = None class TasksMap: """task map class""" - tasks_dict = {} + tasks_dict: Dict[str, Dict] = {} @classmethod def init_task_map(cls): @@ -64,4 +67,4 @@ class TasksMap: res = cls.tasks_dict.get(task_type).get(task_name) logging.debug("getting task by name: %s", res) break - return res + return res \ No newline at end of file diff --git a/sysSentry-1.0.2/src/python/xalarm/register_xalarm.py b/sysSentry-1.0.2/src/python/xalarm/register_xalarm.py index d45dd356322f2972ec87c11c2611f79e85d468b3..20bb01414fb69bbd847e65a5f8f101105532638e 100644 --- a/sysSentry-1.0.2/src/python/xalarm/register_xalarm.py +++ b/sysSentry-1.0.2/src/python/xalarm/register_xalarm.py @@ -88,7 +88,7 @@ class AlarmRegister: try: data = self.socket.recv(ALARM_REPORT_LEN) if not data: - sys.stderr.write("connection closed by xalarmd, maybe connections reach max num or service 
stopped.\n") + sys.stderr.write("connection closed by xalarmd.service, maybe connections reach max num or service stopped.\n") self.thread_should_stop = True break if len(data) != ALARM_REPORT_LEN: @@ -100,7 +100,7 @@ class AlarmRegister: except (BlockingIOError) as e: time.sleep(0.1) except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError): - sys.stderr.write("Connection closed by the server.\n") + sys.stderr.write("connection closed by the server.\n") self.thread_should_stop = True except (ValueError, StructParseError, InterruptedError) as e: sys.stderr.write(f"{e}\n") @@ -119,7 +119,7 @@ class AlarmRegister: def xalarm_register(callback: callable, id_filter: list[bool]) -> int: global ALARM_REGISTER_INFO - + logging.info('1234445566777888') if ALARM_REGISTER_INFO is not None: sys.stderr.write("xalarm_register: alarm has registered\n") return -1 @@ -185,4 +185,3 @@ def xalarm_getdesc(alarm_info: Xalarm) -> str: if not alarm_info: return None return alarm_info.msg1 - diff --git a/sysSentry-1.0.2/src/python/xalarm/xalarm_api.py b/sysSentry-1.0.2/src/python/xalarm/xalarm_api.py index 99eabf54584c74ee6e8006598a5963ccb73a7a5a..4804f1c0ae0a2065dcc3c4e58684b25ee10568f4 100644 --- a/sysSentry-1.0.2/src/python/xalarm/xalarm_api.py +++ b/sysSentry-1.0.2/src/python/xalarm/xalarm_api.py @@ -128,3 +128,4 @@ def alarm_stu2bin(alarm_info: Xalarm): alarm_info.timetamp.tv_sec, alarm_info.timetamp.tv_usec, alarm_info.msg1.encode('utf-8')) + diff --git a/sysSentry-1.0.2/src/python/xalarm/xalarm_server.py b/sysSentry-1.0.2/src/python/xalarm/xalarm_server.py index fcaf3937e200c37ca10a2f65832304cec79cf0fc..b4c65f0054debcc8c2308e9346b4843122faeece 100644 --- a/sysSentry-1.0.2/src/python/xalarm/xalarm_server.py +++ b/sysSentry-1.0.2/src/python/xalarm/xalarm_server.py @@ -80,7 +80,7 @@ def server_loop(alarm_config): len(data)) continue alarm_info = alarm_bin2stu(data) - logging.debug("server bin2stu msg") + logging.info("server bin2stu msg") if not check_filter(alarm_info, 
alarm_config): continue transmit_alarm(alarm_sock, epoll, fd_to_socket, data)