From fa691788e13df1b0666839c606446f56951c0ef0 Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 29 Aug 2025 18:03:54 +0800 Subject: [PATCH 01/62] add test . --- Makefile | 16 +- config/plugins/bmc_block_io.ini | 11 + config/tasks/bmc_block_io.mod | 7 + src/sentryPlugins/bmc_block_io/CMakeLists.txt | 19 + src/sentryPlugins/bmc_block_io/build.sh | 26 ++ .../bmc_block_io/include/cbmcblockio.h | 70 +++ .../bmc_block_io/include/common.h | 45 ++ .../bmc_block_io/include/configure.h | 25 ++ .../bmc_block_io/include/logger.h | 91 ++++ .../bmc_block_io/src/cbmcblockio.cpp | 415 ++++++++++++++++++ src/sentryPlugins/bmc_block_io/src/common.cpp | 202 +++++++++ src/sentryPlugins/bmc_block_io/src/logger.cpp | 165 +++++++ src/sentryPlugins/bmc_block_io/src/main.cpp | 126 ++++++ 13 files changed, 1216 insertions(+), 2 deletions(-) create mode 100644 config/plugins/bmc_block_io.ini create mode 100644 config/tasks/bmc_block_io.mod create mode 100644 src/sentryPlugins/bmc_block_io/CMakeLists.txt create mode 100644 src/sentryPlugins/bmc_block_io/build.sh create mode 100644 src/sentryPlugins/bmc_block_io/include/cbmcblockio.h create mode 100644 src/sentryPlugins/bmc_block_io/include/common.h create mode 100644 src/sentryPlugins/bmc_block_io/include/configure.h create mode 100644 src/sentryPlugins/bmc_block_io/include/logger.h create mode 100644 src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp create mode 100644 src/sentryPlugins/bmc_block_io/src/common.cpp create mode 100644 src/sentryPlugins/bmc_block_io/src/logger.cpp create mode 100644 src/sentryPlugins/bmc_block_io/src/main.cpp diff --git a/Makefile b/Makefile index aeb9950..4b99d8c 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,7 @@ PYTHON_VERSION := $(shell $(PYBIN) --version 2>&1 | awk '{print $$2}' | cut -d ' PKGVER := syssentry-$(VERSION)-py$(PYTHON_VERSION) PKGVEREGG := syssentry-$(VERSION)-py$(PYTHON_VERSION).egg-info -all: lib ebpf hbm_online_repair sentry_msg_monitor +all: lib ebpf hbm_online_repair sentry_msg_monitor bmc_block_io lib:libxalarm log @@ -50,6 +50,9 @@ hbm_online_repair: sentry_msg_monitor: lib cd $(CURSRCDIR)/sentryPlugins/sentry_msg_monitor/ && make +bmc_block_io: + cd $(CURSRCDIR)/sentryPlugins/bmc_block_io/ && sh build.sh + install: all dirs isentry dirs: @@ -131,6 +134,11 @@ isentry: install -m 600 $(CURCONFIGDIR)/env/sentry_msg_monitor.env $(ETCDIR)/sysconfig/ install -m 600 $(CURCONFIGDIR)/tasks/sentry_msg_monitor.mod $(ETCDIR)/sysSentry/tasks/ + # bmc_block_io + install -m 550 $(CURSRCDIR)/sentryPlugins/bmc_block_io/output/bmc_block_io $(BINDIR) + install -m 600 $(CURCONFIGDIR)/plugins/bmc_block_io.ini $(ETCDIR)/sysSentry/plugins/ + install -m 600 $(CURCONFIGDIR)/tasks/bmc_block_io.mod $(ETCDIR)/sysSentry/tasks/ + # pysentry_notify install -m 550 src/libsentry/python/pySentryNotify/sentry_notify.py $(PYDIR)/xalarm @@ -161,7 +169,10 @@ hbm_clean: smm_clean: cd $(CURSRCDIR)/sentryPlugins/sentry_msg_monitor && make clean -clean: ebpf_clean hbm_clean smm_clean +bmc_clean: + cd $(CURSRCDIR)/sentryPlugins/bmc_block_io && sh build.sh clean + +clean: ebpf_clean hbm_clean smm_clean bmc_clean rm -rf $(CURLIBDIR)/build rm -rf $(CURSRCDIR)/build rm -rf $(CURSRCDIR)/libsentry/c/log/build @@ -175,6 +186,7 @@ uninstall: rm -rf $(BINDIR)/sentryCollector rm -rf $(BINDIR)/hbm_online_repair rm -rf $(BINDIR)/sentry_msg_monitor + rm -rf $(BINDIR)/bmc_block_io rm -rf $(BINDIR)/ebpf_collector rm -rf $(LIBINSTALLDIR)/libxalarm.so rm -rf $(INCLUDEDIR)/xalarm diff --git a/config/plugins/bmc_block_io.ini b/config/plugins/bmc_block_io.ini new file 
mode 100644 index 0000000..3b22618 --- /dev/null +++ b/config/plugins/bmc_block_io.ini @@ -0,0 +1,11 @@ +# log level, accepts debug, info, warning, error or critical +log_level=info + +# polling cycle, unit: seconds, range: [60, 3600] +patrol_second=5 + +# ipmitool login username +bmc_username=Administrator + +# ipmitool login passwd +bmc_passwd=Admin@9000 \ No newline at end of file diff --git a/config/tasks/bmc_block_io.mod b/config/tasks/bmc_block_io.mod new file mode 100644 index 0000000..9518c5f --- /dev/null +++ b/config/tasks/bmc_block_io.mod @@ -0,0 +1,7 @@ +[common] +enabled=yes +task_start=/usr/bin/bmc_block_io +task_stop=kill $pid +type=oneshot +alarm_id=1002 +alarm_clear_time=5 \ No newline at end of file diff --git a/src/sentryPlugins/bmc_block_io/CMakeLists.txt b/src/sentryPlugins/bmc_block_io/CMakeLists.txt new file mode 100644 index 0000000..818a0a8 --- /dev/null +++ b/src/sentryPlugins/bmc_block_io/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required (VERSION 3.12) +project(bmc_block_io) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/output) + +include_directories( + ${CMAKE_SOURCE_DIR}/include +) + +set(SOURCE src/main.cpp + src/logger.cpp + src/common.cpp + src/cbmcblockio.cpp) + +add_executable(bmc_block_io ${SOURCE}) +target_link_libraries(bmc_block_io PRIVATE xalarm pthread json-c) diff --git a/src/sentryPlugins/bmc_block_io/build.sh b/src/sentryPlugins/bmc_block_io/build.sh new file mode 100644 index 0000000..8db398c --- /dev/null +++ b/src/sentryPlugins/bmc_block_io/build.sh @@ -0,0 +1,26 @@ +#!/bin/sh +echo "----------build begin------------" +echo "---------------------------------" + +BUILD_DIR=build + +if [ "$1" = "clean" ]; then + if [ -d "$BUILD_DIR" ]; then + echo "----------clean begin------------" + cd "$BUILD_DIR" && make clean + echo "----------clean end--------------" + else + echo "Build directory does not exist. Nothing to clean." + fi + exit 0 +fi + +[ ! -d $BUILD_DIR ] && mkdir -p $BUILD_DIR +cd $BUILD_DIR + +cmake .. +make || exit "$?" + +echo "------- build end -----------" +echo "-----------------------------" +exit 0 \ No newline at end of file diff --git a/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h b/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h new file mode 100644 index 0000000..cb0b514 --- /dev/null +++ b/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * bmc_block_io is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. 
+ * Author: hewanhan@h-partners.com + */ + +#ifndef _BMC_BLOCK_IO_H_ +#define _BMC_BLOCK_IO_H_ + +#include +#include +#include +#include +#include +#include +#include + +namespace BMCBlockIoPlu { + +struct ResponseHeader { + uint16_t totalEvents; + uint8_t eventCount; + bool valid; +}; + +struct IPMIEvent { + uint32_t alarmTypeCode; + uint32_t timestamp; + uint8_t severity; + uint8_t subjectType; + uint8_t deviceId; + bool valid; +}; + +class CBMCBlockIo { +public: + CBMCBlockIo(); + ~CBMCBlockIo(); + void Start(); + void Stop(); + void SetPatrolInterval(int seconds); + void SetUserName(std::string userName); + void SetPassWd(std::string passWd); + bool IsRunning(); +private: + void SentryWorker(); + void GetBMCIp(); + void ReportAlarm(const IPMIEvent& event); + void ReportResult(int resultLevel, const std::string& msg); + int QueryEvents(); + std::string BuildIPMICommand(uint16_t startIndex); + std::vector ExecuteIPMICommand(const std::string& cmd); + ResponseHeader ParseResponseHeader(const std::vector& hexBytes); + IPMIEvent ParseSingleEvent(const std::vector& hexBytes, size_t startPos); + void ProcessEvents(const std::vector& hexBytes, uint8_t eventCount); + +private: + std::atomic m_running; + std::thread m_worker; + std::mutex m_mutex; + std::condition_variable m_cv; + std::string m_userName; + std::string m_passWd; + std::string m_bmcIp; + int m_patrolSeconds; +}; +} +#endif + diff --git a/src/sentryPlugins/bmc_block_io/include/common.h b/src/sentryPlugins/bmc_block_io/include/common.h new file mode 100644 index 0000000..2bd0980 --- /dev/null +++ b/src/sentryPlugins/bmc_block_io/include/common.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * bmc_block_io is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * Author: hewanhan@h-partners.com + */ + +#ifndef _BMCPLU_COMMON_H_ +#define _BMCPLU_COMMON_H_ + +#include +#include +#include +#include +#include "configure.h" +#include "logger.h" + +#define BMCPLU_FAILED (-1) +#define BMCPLU_SUCCESS (0) + +struct PluConfig { + BMCBlockIoPlu::Logger::Level logLevel; + int patrolSeconds; + std::string userName; + std::string passWd; +}; + +struct ConfigItem { + bool required; + bool found; + std::function processor; +}; + +namespace BMCBlockIoPlu { + +std::string Trim(const std::string& str); +bool IsValidNumber(const std::string& str, int& num); +int ParseConfig(const std::string& path, PluConfig& config); +std::string ExtractFileName(const std::string& path); +int ExecCommand(const std::string cmd, std::vector& result); +std::string ByteToHex(uint8_t byte); +std::vector SplitString(const std::string& str, const std::string& split); +} + +#endif diff --git a/src/sentryPlugins/bmc_block_io/include/configure.h b/src/sentryPlugins/bmc_block_io/include/configure.h new file mode 100644 index 0000000..65a56ce --- /dev/null +++ b/src/sentryPlugins/bmc_block_io/include/configure.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * bmc_block_io is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. 
+ * Author: hewanhan@h-partners.com + */ + +#ifndef _BMCPLU_CONFIGURE_H_ +#define _BMCPLU_CONFIGURE_H_ + +#include + +namespace BMCBlockIoPlu { + +const std::string BMCPLU_CONFIG_PATH = "/etc/sysSentry/plugins/bmc_block_io.ini"; +const std::string BMCPLU_LOG_PATH = "/var/log/sysSentry/bmc_block_io.log"; +const std::string BMCPLU_DEFAULT_USERNAME = "Administrator"; +const std::string BMCPLU_DEFAULT_PASSWD = "Admin@9000"; +const int BMCPLU_PATROL_MIN = 1; +const int BMCPLU_PATROL_MAX = 3600; +const int BMCPLU_PATROL_DEFAULT = 600; +const int BMCPLU_CONFIG_CHECK_CYCLE = 10; // seconds +const int BMCPLU_DEFAULT_SLEEP_CYCLE = 3; // seconds +} +#endif diff --git a/src/sentryPlugins/bmc_block_io/include/logger.h b/src/sentryPlugins/bmc_block_io/include/logger.h new file mode 100644 index 0000000..0627993 --- /dev/null +++ b/src/sentryPlugins/bmc_block_io/include/logger.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * bmc_block_io is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * Author: hewanhan@h-partners.com + */ + +#ifndef __BMCPLU_LOGGER_H__ +#define __BMCPLU_LOGGER_H__ + +#include +#include +#include +#include +#include +#include + +namespace BMCBlockIoPlu { + +class Logger { +public: + enum class Level { + Debug, + Info, + Warning, + Error, + Critical + }; + static Logger& GetInstance(); + Logger(const Logger&) = delete; + Logger& operator=(const Logger&) = delete; + bool Initialize(const std::string& logPath, Level level = Level::Info); + void SetLevel(Level level); + Level GetLevel() const; + void WriteLog(Level level, const char* file, int line, const std::string& message); + std::string LevelToString(Level level) const; +private: + Logger() = default; + void OpenLogFile(); + void CheckFileState(); + void ReopenLogFile(); + std::string GetTimeStamp() const; + std::string Format(Level level, const char* file, int line, const std::string& message) const; + +private: + std::ofstream m_logFile; + std::string m_logPath; + Level m_level = Level::Info; + mutable std::mutex m_writeMutex; + std::time_t m_checkTime = 0; + ino_t m_inode = 0; + dev_t m_device = 0; + off_t m_fileSize = 0; + bool m_fileOpen = false; +}; + +class LogStream { +public: + LogStream(Logger::Level level, const char* file, int line) + : m_level(level), m_file(file), m_line(line) + {} + ~LogStream() + { + Logger::GetInstance().WriteLog(m_level, m_file, m_line, m_stream.str()); + } + template + LogStream& operator<<(const T& value) + { + m_stream << value; + return *this; + } + LogStream& operator<<(std::ostream& (*manip)(std::ostream&)) // std::endl, std::flush... 
+ { + m_stream << manip; + return *this; + } + +private: + Logger::Level m_level; + const char* m_file; + int m_line; + std::ostringstream m_stream; +}; + +#define BMC_LOG_DEBUG BMCBlockIoPlu::LogStream(BMCBlockIoPlu::Logger::Level::Debug, __FILE__, __LINE__) +#define BMC_LOG_INFO BMCBlockIoPlu::LogStream(BMCBlockIoPlu::Logger::Level::Info, __FILE__, __LINE__) +#define BMC_LOG_WARNING BMCBlockIoPlu::LogStream(BMCBlockIoPlu::Logger::Level::Warning, __FILE__, __LINE__) +#define BMC_LOG_ERROR BMCBlockIoPlu::LogStream(BMCBlockIoPlu::Logger::Level::Error, __FILE__, __LINE__) +#define BMC_LOG_CRITICAL BMCBlockIoPlu::LogStream(BMCBlockIoPlu::Logger::Level::Critical, __FILE__, __LINE__) +} +#endif \ No newline at end of file diff --git a/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp b/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp new file mode 100644 index 0000000..9263a2d --- /dev/null +++ b/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp @@ -0,0 +1,415 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * bmc_block_io is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * Author: hewanhan@h-partners.com + */ + +#include "cbmcblockio.h" +#include +#include +#include +#include +#include +#include +extern "C" { +#include +} +#include "common.h" +#include "configure.h" +#include "logger.h" + +namespace BMCBlockIoPlu { + +const int BMC_ALARM_ID = 1002; +const int RESP_HEADER_SIZE = 7; +const int EVENT_SIZE = 15; +const uint32_t ALARM_OCCUR_CODE = 0x02000039; +const uint32_t ALARM_CLEAR_CODE = 0x0200003A; +const std::string BMC_TASK_NAME = "bmc_block_io"; +const std::string GET_BMCIP_CMD = "ipmitool lan print"; +const std::string IPMI_KEY_IP_ADDR = "IP Address"; +const std::string MSG_BMCIP_EMPTY = "ipmitool get bmc ip failed."; +const std::string MSG_BMC_QUERY_FAIL = "ipmitool query failed."; +const std::string MSG_EXIT_SUCCESS = "receive exit signal, task completed."; +const std::string JSON_KEY_MSG = "msg"; +const std::string JSON_KEY_ALARM_SOURCE = "alarm_source"; +const std::string JSON_KEY_DRIVER_NAME = "driver_name"; +const std::string JSON_KEY_IO_TYPE = "io_type"; +const std::string JSON_KEY_REASON = "reason"; +const std::string JSON_KEY_BLOCK_STACK = "block_stack"; +const std::string JSON_KEY_DETAILS = "details"; + +CBMCBlockIo::CBMCBlockIo() : + m_running(false), + m_patrolSeconds(BMCPLU_PATROL_DEFAULT), + m_userName(BMCPLU_DEFAULT_USERNAME), + m_passWd(BMCPLU_DEFAULT_PASSWD), + m_bmcIp("") +{ +} + +CBMCBlockIo::~CBMCBlockIo() +{ +} + +void CBMCBlockIo::Start() +{ + if (m_running) { + return; + } + + GetBMCIp(); + if (m_bmcIp.empty()) { + BMC_LOG_ERROR << "BMC Ip is empty."; + ReportResult(RESULT_LEVEL_FAIL, MSG_BMCIP_EMPTY); + return; + } + m_running = true; + m_worker = std::thread(&CBMCBlockIo::SentryWorker, this); + BMC_LOG_INFO << "BMC block io Start."; +} + +void CBMCBlockIo::Stop() +{ + { + std::lock_guard lock(m_mutex); + m_running = false; + } + m_cv.notify_all(); + + if (m_worker.joinable()) { + m_worker.join(); + } + BMC_LOG_INFO <<"BMC block io Stop."; +} + +void CBMCBlockIo::SetPatrolInterval(int seconds) +{ + m_patrolSeconds = seconds; +} + +void CBMCBlockIo::SetUserName(std::string userName) +{ + m_userName = userName; +} + +void CBMCBlockIo::SetPassWd(std::string passWd) +{ + m_passWd = passWd; +} + +bool CBMCBlockIo::IsRunning() +{ + return m_running; +} + +void CBMCBlockIo::SentryWorker() +{ + int ret = BMCPLU_SUCCESS; + while (m_running) { + std::unique_lock 
<std::mutex> lock(m_mutex); + m_cv.wait_for(lock, std::chrono::seconds(m_patrolSeconds), [this] { + return !m_running; + }); + + if (!m_running) { + break; + } + ret = QueryEvents(); + if (ret != BMCPLU_SUCCESS) { + break; + } + } + + if (ret == BMCPLU_SUCCESS) { + ReportResult(RESULT_LEVEL_PASS, MSG_EXIT_SUCCESS); + } else { + ReportResult(RESULT_LEVEL_FAIL, MSG_BMC_QUERY_FAIL); + } + m_running = false; + BMC_LOG_INFO << "BMC block io SentryWorker exit."; + return; +} + +void CBMCBlockIo::GetBMCIp() +{ + std::vector<std::string> result; + if (ExecCommand(GET_BMCIP_CMD, result)) { + return; + } + for (const auto& iter: result) { + if (iter.find(IPMI_KEY_IP_ADDR) != std::string::npos) { + size_t eq_pos = iter.find(':'); + if (eq_pos != std::string::npos) { + std::string key = Trim(iter.substr(0, eq_pos)); + std::string value = Trim(iter.substr(eq_pos + 1)); + if (key == IPMI_KEY_IP_ADDR) { + m_bmcIp = value; + return; + } + } + } + } + return; +} + +/***** ipmi protocol *****/ +/* +Request byte(s) meaning + 1-3 manufacturer ID, default 0xDB 0x07 0x00 + 4 sub-command, default 0x40 + 5 request type, default 0x00 + 6-7 start index of the events to query; in some cases more events exist than + a single response can return, so this value is adjusted to query page by page + 8 event severity bitmap, bit0-normal, bit1-minor, bit2-major, bit3-critical; slow-disk events only support normal + 9 subject type, 0x02 for disks +Response byte(s) meaning + 1 completion code, not shown on the terminal when the call succeeds + 2-4 manufacturer ID, echoes the request + 5-6 total number of events + 7 number of events contained in this response + 8 padding byte, default 0 + 9-12 alarm type code, 0x02000039 means alarm raised, 0x0200003A means alarm cleared + 13-16 Linux timestamp of the event + 17 event severity, 0-normal, 1-minor, 2-major, 3-critical + 18 subject type, echoes the request + 19 device index (out-of-band numbering) + 20-23 padding bytes, default 0 + N+1-N+15 repeat bytes 9-23 above for the next event +The manufacturer ID is fixed; all other multi-byte fields are little-endian, eg: +ipmitool -I lanplus -H x.x.x.x -U x -P x -C 17 raw 0x30 0x94 0xDB 0x07 0x00 0x40 0x00 0x00 0x00 0x01 0x02 +db 07 00 03 00 03 00 39 00 00 02 2f ab 91 68 00 02 04 00 00 00 00 +39 00 00 02 2e ab 91 68 00 02 02 00 00 00 00 39 00 00 02 2e ab 91 +68 00 02 01 00 00 00 00 + */ +int CBMCBlockIo::QueryEvents() +{ + uint16_t currentIndex = 0; + int ret = BMCPLU_FAILED; + + while (true) { + std::string cmd = BuildIPMICommand(currentIndex); + std::vector<std::string> hexBytes = ExecuteIPMICommand(cmd); + if (hexBytes.empty()) { + break; + } + + ResponseHeader header = ParseResponseHeader(hexBytes); + if (!header.valid) { + break; + } + + size_t expectedSize = RESP_HEADER_SIZE + header.eventCount * EVENT_SIZE; + if (hexBytes.size() < expectedSize) { + BMC_LOG_ERROR << "Response size too small. 
Expected: " << expectedSize + << ", Actual: " << hexBytes.size(); + break; + } + + BMC_LOG_DEBUG << "Total events: " << header.totalEvents + << ", returned: " << static_cast(header.eventCount) + << ", current index: " << currentIndex; + if (header.eventCount == 0) { + ret = BMCPLU_SUCCESS; + break; + } + + ProcessEvents(hexBytes, header.eventCount); + currentIndex += header.eventCount; + + if (currentIndex >= header.totalEvents) { + ret = BMCPLU_SUCCESS; + break; + } + } + return ret; +} + +std::string CBMCBlockIo::BuildIPMICommand(uint16_t startIndex) +{ + uint8_t indexHigh = static_cast((startIndex >> 8) & 0xff); + uint8_t indexLow = static_cast(startIndex & 0xff); + std::ostringstream cmdStream; + cmdStream << "ipmitool -I lanplus -H " << m_bmcIp + << " -U " << m_userName + << " -P " << m_passWd + << " -C 17 raw 0x30 0x94 0xDB 0x07 0x00 0x40 0x00" + << " " << ByteToHex(indexLow) + << " " << ByteToHex(indexHigh) + << " 0x01 0x02"; + return cmdStream.str(); +} + +std::vector CBMCBlockIo::ExecuteIPMICommand(const std::string& cmd) +{ + BMC_LOG_DEBUG << "IPMI event query command: " << cmd; + + std::vector cmdOut; + if (ExecCommand(cmd, cmdOut)) { + BMC_LOG_ERROR << "IPMI command execute failed."; + return {}; + } + + std::ostringstream responseStream; + for (size_t i = 0; i < cmdOut.size(); ++i) { + std::string line = cmdOut[i]; + BMC_LOG_DEBUG << "Execute IPMI event response: " << line; + line.erase(std::remove(line.begin(), line.end(), '\r'), line.end()); + line.erase(std::remove(line.begin(), line.end(), '\n'), line.end()); + if (i > 0 && !line.empty()) { + responseStream << ' '; + } + responseStream << line; + } + return SplitString(responseStream.str(), " "); +} + +ResponseHeader CBMCBlockIo::ParseResponseHeader(const std::vector& hexBytes) +{ + ResponseHeader header = {0, 0, false}; + + if (hexBytes.size() < RESP_HEADER_SIZE) { + BMC_LOG_ERROR << "Invalid response length: " << hexBytes.size(); + return header; + } + + if (hexBytes[0] != "db" || hexBytes[1] != "07" || hexBytes[2] != "00") { + BMC_LOG_ERROR << "Unexpected manufacturer ID: " + << hexBytes[0] << " " << hexBytes[1] << " " << hexBytes[2]; + return header; + } + + char* endPtr = nullptr; + unsigned long totalLow = std::strtoul(hexBytes[3].c_str(), &endPtr, 16); + if (endPtr == hexBytes[3].c_str() || *endPtr != '\0' || totalLow > 0xff) { + BMC_LOG_ERROR << "Invalid totalLow byte: " << hexBytes[3]; + return header; + } + + unsigned long totalHigh = std::strtoul(hexBytes[4].c_str(), &endPtr, 16); + if (endPtr == hexBytes[4].c_str() || *endPtr != '\0' || totalHigh > 0xff) { + BMC_LOG_ERROR << "Invalid totalHigh byte: " << hexBytes[4]; + return header; + } + + header.totalEvents = static_cast(totalLow) | (static_cast(totalHigh) << 8); + unsigned long count = std::strtoul(hexBytes[5].c_str(), &endPtr, 16); + if (endPtr == hexBytes[5].c_str() || *endPtr != '\0' || count > 0xff) { + BMC_LOG_ERROR << "Invalid event count byte: " << hexBytes[5]; + return header; + } + + header.eventCount = static_cast(count); + header.valid = true; + return header; +} + +IPMIEvent CBMCBlockIo::ParseSingleEvent(const std::vector& hexBytes, size_t startPos) +{ + IPMIEvent event = {0, 0, 0, 0, 0, false}; + char* endPtr = nullptr; + + for (int i = 0; i < 4; ++i) { + unsigned long byte = std::strtoul(hexBytes[startPos + i].c_str(), &endPtr, 16); + if (endPtr == hexBytes[startPos + i].c_str() || *endPtr != '\0' || byte > 0xff) { + BMC_LOG_ERROR << "Invalid alarm type byte at pos " << startPos + i + << ": " << hexBytes[startPos + i]; + return event; + } + 
event.alarmTypeCode |= (static_cast(byte) << (i * 8)); + } + + for (int i = 0; i < 4; ++i) { + unsigned long byte = std::strtoul(hexBytes[startPos + 4 + i].c_str(), &endPtr, 16); + if (endPtr == hexBytes[startPos + 4 + i].c_str() || *endPtr != '\0' || byte > 0xff) { + BMC_LOG_ERROR << "Invalid timestamp byte at pos " << startPos + 4 + i + << ": " << hexBytes[startPos + 4 + i]; + return event; + } + event.timestamp |= (static_cast(byte) << (i * 8)); + } + + unsigned long severity = std::strtoul(hexBytes[startPos + 8].c_str(), &endPtr, 16); + if (endPtr == hexBytes[startPos + 8].c_str() || *endPtr != '\0' || severity > 0xff) { + BMC_LOG_ERROR << "Invalid severity byte: " << hexBytes[startPos + 8]; + return event; + } + event.severity = static_cast(severity); + + unsigned long subjectType = std::strtoul(hexBytes[startPos + 9].c_str(), &endPtr, 16); + if (endPtr == hexBytes[startPos + 9].c_str() || *endPtr != '\0' || subjectType > 0xff) { + BMC_LOG_ERROR << "Invalid subject type byte: " << hexBytes[startPos + 9]; + return event; + } + event.subjectType = static_cast(subjectType); + + unsigned long deviceId = std::strtoul(hexBytes[startPos + 10].c_str(), &endPtr, 16); + if (endPtr == hexBytes[startPos + 10].c_str() || *endPtr != '\0' || deviceId > 0xff) { + BMC_LOG_ERROR << "Invalid device ID byte: " << hexBytes[startPos + 10]; + return event; + } + event.deviceId = static_cast(deviceId); + + event.valid = true; + return event; +} + +void CBMCBlockIo::ProcessEvents(const std::vector& hexBytes, uint8_t eventCount) +{ + for (int i = 0; i < eventCount; ++i) { + size_t startPos = RESP_HEADER_SIZE + i * EVENT_SIZE; + + IPMIEvent event = ParseSingleEvent(hexBytes, startPos); + if (!event.valid) { + continue; + } + + ReportAlarm(event); + } + return; +} + +void CBMCBlockIo::ReportAlarm(const IPMIEvent& event) +{ + uint8_t ucAlarmLevel = MINOR_ALM; + uint8_t ucAlarmType = 0; + if (event.alarmTypeCode == ALARM_OCCUR_CODE) { + ucAlarmType = ALARM_TYPE_OCCUR; + } else if (event.alarmTypeCode == ALARM_CLEAR_CODE) { + ucAlarmType = ALARM_TYPE_RECOVER; + } else { + BMC_LOG_ERROR << "Skipping unknown alarm type: 0x" + << std::hex << event.alarmTypeCode; + return; + } + json_object* jObject = json_object_new_object(); + json_object_object_add(jObject, JSON_KEY_ALARM_SOURCE.c_str(), json_object_new_string(BMC_TASK_NAME.c_str())); + json_object_object_add(jObject, JSON_KEY_DRIVER_NAME.c_str(), json_object_new_string(std::to_string(event.deviceId).c_str())); + //json_object_object_add(jObject, JSON_KEY_IO_TYPE.c_str(), json_object_new_string("null")); + //json_object_object_add(jObject, JSON_KEY_REASON.c_str(), json_object_new_string("null")); + //json_object_object_add(jObject, JSON_KEY_BLOCK_STACK.c_str(), json_object_new_string("null")); + //json_object_object_add(jObject, JSON_KEY_DETAILS.c_str(), json_object_new_string("null")); + const char *jData = json_object_to_json_string(jObject); + int ret = xalarm_Report(BMC_ALARM_ID, ucAlarmLevel, ucAlarmType, const_cast(jData)); + if (ret != RETURE_CODE_SUCCESS) { + BMC_LOG_ERROR << "Failed to xalarm_Report, ret: " << ret; + } + json_object_put(jObject); + return; +} + +void CBMCBlockIo::ReportResult(int resultLevel, const std::string& msg) +{ + RESULT_LEVEL level = static_cast(resultLevel); + json_object* jObject = json_object_new_object(); + json_object_object_add(jObject, JSON_KEY_MSG.c_str(), json_object_new_string(msg.c_str())); + const char *jData = json_object_to_json_string(jObject); + int ret = report_result(BMC_TASK_NAME.c_str(), level, 
const_cast(jData)); + if (ret != RETURE_CODE_SUCCESS) { + BMC_LOG_ERROR << "Failed to report_result, ret: " << ret; + } + json_object_put(jObject); + return; +} +}; diff --git a/src/sentryPlugins/bmc_block_io/src/common.cpp b/src/sentryPlugins/bmc_block_io/src/common.cpp new file mode 100644 index 0000000..d603d7a --- /dev/null +++ b/src/sentryPlugins/bmc_block_io/src/common.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * bmc_block_io is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * Author: hewanhan@h-partners.com + */ + +#include "common.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace BMCBlockIoPlu { + +std::string Trim(const std::string& str) +{ + size_t first = str.find_first_not_of(" \t\n\r\v\f"); + if (std::string::npos == first) { + return ""; + } + size_t last = str.find_last_not_of(" \t\n\r\v\f"); + return str.substr(first, (last - first + 1)); +} + +bool IsValidNumber(const std::string& str, int& num) +{ + if (str.empty()) { + return false; + } + for (const auto& iter : str) { + if (!std::isdigit(iter)) { + return false; + } + } + std::istringstream iss(str); + if (!(iss >> num)) { + return false; + } + return true; +} + +int ParseConfig(const std::string& path, PluConfig& config) +{ + std::ifstream file(path); + if (!file.is_open()) { + BMC_LOG_ERROR << "Failed to open config file: " << path; + return BMCPLU_FAILED; + } + + std::unordered_map configMap; + configMap["log_level"] = {true, false, [&](const std::string& value) { + if (value == "debug") { + config.logLevel = Logger::Level::Debug; + } else if (value == "info") { + config.logLevel = Logger::Level::Info; + } else if (value == "warning") { + config.logLevel = Logger::Level::Warning; + } else if (value == "error") { + config.logLevel = Logger::Level::Error; + } else if (value == "critical") { + config.logLevel = Logger::Level::Critical; + } else { + BMC_LOG_ERROR << "Invalid log_level value."; + return false; + } + return true; + }}; + + configMap["patrol_second"] = {true, false, [&](const std::string& value) { + int num = 0; + if (!IsValidNumber(value, num) || !(num >= BMCPLU_PATROL_MIN && num <= BMCPLU_PATROL_MAX)) { + BMC_LOG_ERROR << "Invalid patrol_second value."; + return false; + } + config.patrolSeconds = num; + return true; + }}; + + configMap["bmc_username"] = {true, false, [&](const std::string& value) { + config.userName = value; + return true; + }}; + + configMap["bmc_passwd"] = {true, false, [&](const std::string& value) { + config.passWd = value; + return true; + }}; + + std::string line; + while (std::getline(file, line)) { + line = Trim(line); + if (line.empty() || line[0] == '#') { + continue; + } + + size_t eqPos = line.find('='); + if (eqPos == std::string::npos || eqPos == 0) { + BMC_LOG_ERROR << "Config file format invalid."; + return BMCPLU_FAILED; + } + + std::string key = Trim(line.substr(0, eqPos)); + std::string value = Trim(line.substr(eqPos + 1)); + if (value.empty()) { + BMC_LOG_ERROR << "Config key: " << key << " cannot empty."; + return BMCPLU_FAILED; + } + + auto iter = configMap.find(key); + if (iter == configMap.end()) { + BMC_LOG_ERROR << "Config error, unknown key : " << key; + return BMCPLU_FAILED; + } + + if (!iter->second.processor(value)) { + return BMCPLU_FAILED; + } + iter->second.found = true; + } + + for (const auto& iter : configMap) { + if (iter.second.required && !iter.second.found) { + 
BMC_LOG_ERROR << "Config error, missing required key : " << iter.first; + return BMCPLU_FAILED; + } + } + return BMCPLU_SUCCESS; +} + +std::string ExtractFileName(const std::string& path) +{ + size_t lastSlashPos = path.find_last_of('/'); + if (lastSlashPos == std::string::npos) { + return path; + } else { + return path.substr(lastSlashPos + 1); + } +} + +int ExecCommand(const std::string cmd, std::vector& result) +{ + FILE* pipe = popen(cmd.c_str(), "r"); + if (!pipe) { + BMC_LOG_ERROR << "Cmd: " << cmd << ", popen failed."; + return BMCPLU_FAILED; + } + + char buffer[512]; + result.clear(); + while (fgets(buffer, sizeof(buffer), pipe)) { + result.push_back(buffer); + } + + int status = pclose(pipe); + if (status == -1) { + BMC_LOG_ERROR << "Cmd: " << cmd << ", pclose failed."; + return BMCPLU_FAILED; + } else { + int exitCode = WEXITSTATUS(status); + if (exitCode != 0) { + BMC_LOG_ERROR << "Cmd: " << cmd << ", exit failed, err code: " << exitCode; + return BMCPLU_FAILED; + } + } + return BMCPLU_SUCCESS; +} + +std::string ByteToHex(uint8_t byte) +{ + std::ostringstream oss; + const int hexLen = 2; + oss << std::hex << std::setfill('0') << std::setw(hexLen) << static_cast(byte); + return "0x" + oss.str(); +} + +std::vector SplitString(const std::string& str, const std::string& split) +{ + std::vector result; + if (split.empty()) { + result.push_back(str); + return result; + } + + size_t pos = 0; + while (true) { + size_t split_pos = str.find(split, pos); + if (split_pos != std::string::npos) { + result.push_back(str.substr(pos, split_pos - pos)); + pos = split_pos + split.size(); + } else { + result.push_back(str.substr(pos)); + break; + } + } + return result; +} +} diff --git a/src/sentryPlugins/bmc_block_io/src/logger.cpp b/src/sentryPlugins/bmc_block_io/src/logger.cpp new file mode 100644 index 0000000..f0c84b1 --- /dev/null +++ b/src/sentryPlugins/bmc_block_io/src/logger.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * bmc_block_io is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. 
+ * Author: hewanhan@h-partners.com + */ + +#include "logger.h" +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +namespace BMCBlockIoPlu { + +Logger& Logger::GetInstance() +{ + static Logger instance; + return instance; +} + +bool Logger::Initialize(const std::string& logPath, Level level) +{ + m_logPath = logPath; + m_level = level; + OpenLogFile(); + return m_fileOpen; +} + +void Logger::SetLevel(Level level) +{ + m_level = level; +} + +Logger::Level Logger::GetLevel() const +{ + return m_level; +} + +void Logger::OpenLogFile() +{ + m_logFile.open(m_logPath, std::ios::out | std::ios::app); + if (!m_logFile.is_open()) { + std::cerr << "Failed to open log file: " << m_logPath + << ", error: " << strerror(errno) << std::endl; + m_fileOpen = false; + return; + } + + struct stat fileStat; + if (stat(m_logPath.c_str(), &fileStat) == 0) { + m_inode = fileStat.st_ino; + m_device = fileStat.st_dev; + m_fileSize = fileStat.st_size; + } + m_checkTime = std::time(nullptr); + + m_fileOpen = true; + return; +} + +void Logger::CheckFileState() +{ + const int timeInterval = 30; // second + std::time_t timeNow = std::time(nullptr); + if (timeNow - m_checkTime < timeInterval) { + return; + } + + struct stat fileStat; + if (stat(m_logPath.c_str(), &fileStat) != 0) { + if (errno == ENOENT) { // file deleted + std::lock_guard lock(m_writeMutex); + ReopenLogFile(); + } + std::cerr << "Failed to get file state: " << m_logPath + << ", error: " << strerror(errno) << std::endl; + return; + } + + bool needReopen = false; + if (fileStat.st_ino != m_inode) { + needReopen = true; + } else if (fileStat.st_dev != m_device) { + needReopen = true; + } else if (fileStat.st_size < m_fileSize) { + needReopen = true; + } + + if (needReopen) { + ReopenLogFile(); + } else { + m_fileSize = fileStat.st_size; + } + + m_checkTime = timeNow; +} + +void Logger::ReopenLogFile() +{ + if (m_logFile.is_open()) { + m_logFile.close(); + } + OpenLogFile(); + return; +} + +void Logger::WriteLog(Level level, const char* file, int line, const std::string& message) +{ + if (level < GetLevel() || message.empty()) { + return; + } + + CheckFileState(); + std::lock_guard lock(m_writeMutex); + if (m_fileOpen && m_logFile.good()) { + m_logFile << Format(level, file, line, message) << std::endl; + m_logFile.flush(); + } else { + std::cerr << Format(level, file, line, message) << std::endl; + } +} + +std::string Logger::LevelToString(Level level) const +{ + switch (level) { + case Level::Debug: return std::string("DEBUG"); + case Level::Info: return std::string("INFO"); + case Level::Warning: return std::string("WARNING"); + case Level::Error: return std::string("ERROR"); + case Level::Critical: return std::string("CRITICAL"); + default: return std::string("UNKNOWN"); + } + return std::string("UNKNOWN"); +} + +std::string Logger::GetTimeStamp() const +{ + auto now = std::chrono::system_clock::now(); + auto nowTimer = std::chrono::system_clock::to_time_t(now); + auto milliseconds = std::chrono::duration_cast(now.time_since_epoch()) % 1000; + + struct tm nowTm; + localtime_r(&nowTimer, &nowTm); + + std::ostringstream oss; + const int millisecLen = 3; + oss << std::put_time(&nowTm, "%Y-%m-%d %H:%M:%S"); + oss << '.' 
<< std::setfill('0') << std::setw(millisecLen) << milliseconds.count(); + + return oss.str(); +} + +std::string Logger::Format(Level level, const char* file, int line, const std::string & message) const +{ + std::ostringstream oss; + oss << GetTimeStamp() << " - " << LevelToString(level) << " - [" + << ExtractFileName(file) << ":" << line << "]" << " - " << message; + return oss.str(); +} +} diff --git a/src/sentryPlugins/bmc_block_io/src/main.cpp b/src/sentryPlugins/bmc_block_io/src/main.cpp new file mode 100644 index 0000000..8aeb3d0 --- /dev/null +++ b/src/sentryPlugins/bmc_block_io/src/main.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * bmc_block_io is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * Author: hewanhan@h-partners.com + */ + +#include +#include +#include +#include +#include +#include +#include +#include "cbmcblockio.h" +#include "configure.h" +#include "logger.h" +#include "common.h" + +std::atomic g_exit(false); + +void HandleSignal(int sig) +{ + if (sig == SIGTERM || sig == SIGINT) { + g_exit = true; + BMC_LOG_INFO << "Receive signal SIGTERM or SIGINT, exit."; + } + return; +} + +void SetSignalHandler() +{ + struct sigaction sa; + sigemptyset(&sa.sa_mask); + sigaddset(&sa.sa_mask, SIGTERM); // block SIGTERM + sigaddset(&sa.sa_mask, SIGINT); // block SIGINT + sa.sa_handler = HandleSignal; + sa.sa_flags = SA_RESTART; + + if (sigaction(SIGTERM, &sa, nullptr) == -1) { + BMC_LOG_ERROR << "Failed to setup signal with SIGTERM:" << strerror(errno); + } + + if (sigaction(SIGINT, &sa, nullptr) == -1) { + BMC_LOG_ERROR << "Failed to setup signal with SIGINT:" << strerror(errno); + } +} + +int main(int argc, char* argv[]) +{ + if (!BMCBlockIoPlu::Logger::GetInstance().Initialize(BMCBlockIoPlu::BMCPLU_LOG_PATH)) { + std::cerr << "Failed to initialize logger." 
<< std::endl; + } + SetSignalHandler(); + + BMCBlockIoPlu::CBMCBlockIo blockIo; + PluConfig config; + if (BMCBlockIoPlu::ParseConfig(BMCBlockIoPlu::BMCPLU_CONFIG_PATH, config)) { + BMC_LOG_ERROR << "Parse config failed, use default configuration."; + } else { + BMCBlockIoPlu::Logger::GetInstance().SetLevel(config.logLevel); + blockIo.SetPatrolInterval(config.patrolSeconds); + blockIo.SetUserName(config.userName); + blockIo.SetPassWd(config.passWd); + } + + std::thread configMonitor([&] { + time_t lastModTime = 0; + struct stat st; + if (stat(BMCBlockIoPlu::BMCPLU_CONFIG_PATH.c_str(), &st) == 0) { + lastModTime = st.st_mtime; + } + + while (!g_exit) { + std::this_thread::sleep_for(std::chrono::seconds(BMCBlockIoPlu::BMCPLU_CONFIG_CHECK_CYCLE)); + if (g_exit) { + break; + } + + struct stat st_; + if (stat(BMCBlockIoPlu::BMCPLU_CONFIG_PATH.c_str(), &st_) != 0) { + continue; + } + if (st_.st_mtime != lastModTime) { + lastModTime = st_.st_mtime; + PluConfig newConfig; + if (BMCBlockIoPlu::ParseConfig(BMCBlockIoPlu::BMCPLU_CONFIG_PATH, newConfig) == BMCPLU_SUCCESS) { + if (newConfig.logLevel != config.logLevel) { + config.logLevel = newConfig.logLevel; + BMC_LOG_INFO << "Log level update to " + << BMCBlockIoPlu::Logger::GetInstance().LevelToString(config.logLevel); + BMCBlockIoPlu::Logger::GetInstance().SetLevel(config.logLevel); + } + if (newConfig.patrolSeconds != config.patrolSeconds) { + config.patrolSeconds = newConfig.patrolSeconds; + BMC_LOG_INFO << "Patrol interval update to " << config.patrolSeconds; + blockIo.SetPatrolInterval(config.patrolSeconds); + } + if (newConfig.userName != config.userName) { + config.userName = newConfig.userName; + BMC_LOG_INFO << "BMC userName update to " << config.userName; + blockIo.SetUserName(config.userName); + } + if (newConfig.passWd != config.passWd) { + config.passWd = newConfig.passWd; + BMC_LOG_INFO << "BMC passWd update"; + blockIo.SetPassWd(config.passWd); + } + } + } + } + }); + blockIo.Start(); + while (!g_exit) { + std::this_thread::sleep_for(std::chrono::seconds(BMCBlockIoPlu::BMCPLU_DEFAULT_SLEEP_CYCLE)); + if (!blockIo.IsRunning()) { + g_exit = true; + break; + } + } + blockIo.Stop(); + if (configMonitor.joinable()) { + configMonitor.join(); + } + return 0; +} -- Gitee From 9e431c144e684d8583c246d26ba0703555bd7b12 Mon Sep 17 00:00:00 2001 From: hewanhan Date: Mon, 1 Sep 2025 10:12:27 +0800 Subject: [PATCH 02/62] add test . 
--- Makefile | 2 +- src/sentryPlugins/bmc_block_io/CMakeLists.txt | 4 ++++ src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 4b99d8c..bd3bcb9 100644 --- a/Makefile +++ b/Makefile @@ -50,7 +50,7 @@ hbm_online_repair: sentry_msg_monitor: lib cd $(CURSRCDIR)/sentryPlugins/sentry_msg_monitor/ && make -bmc_block_io: +bmc_block_io: lib cd $(CURSRCDIR)/sentryPlugins/bmc_block_io/ && sh build.sh install: all dirs isentry diff --git a/src/sentryPlugins/bmc_block_io/CMakeLists.txt b/src/sentryPlugins/bmc_block_io/CMakeLists.txt index 818a0a8..2234dee 100644 --- a/src/sentryPlugins/bmc_block_io/CMakeLists.txt +++ b/src/sentryPlugins/bmc_block_io/CMakeLists.txt @@ -8,6 +8,10 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/output) include_directories( ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/../../libs/libxalarm +) +link_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../../libs/build/libxalarm ) set(SOURCE src/main.cpp diff --git a/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp b/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp index 9263a2d..cb52364 100644 --- a/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp +++ b/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp @@ -13,7 +13,7 @@ #include #include extern "C" { -#include +#include "register_xalarm.h" } #include "common.h" #include "configure.h" -- Gitee From 289c3ce41a0b6b613073089451a885fa88a1f781 Mon Sep 17 00:00:00 2001 From: hewh Date: Mon, 8 Sep 2025 20:51:36 +0800 Subject: [PATCH 03/62] add test . --- src/sentryPlugins/bmc_block_io/include/common.h | 2 +- src/sentryPlugins/bmc_block_io/src/common.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentryPlugins/bmc_block_io/include/common.h b/src/sentryPlugins/bmc_block_io/include/common.h index 2bd0980..9304dee 100644 --- a/src/sentryPlugins/bmc_block_io/include/common.h +++ b/src/sentryPlugins/bmc_block_io/include/common.h @@ -37,7 +37,7 @@ std::string Trim(const std::string& str); bool IsValidNumber(const std::string& str, int& num); int ParseConfig(const std::string& path, PluConfig& config); std::string ExtractFileName(const std::string& path); -int ExecCommand(const std::string cmd, std::vector& result); +int ExecCommand(const std::string& cmd, std::vector& result); std::string ByteToHex(uint8_t byte); std::vector SplitString(const std::string& str, const std::string& split); } diff --git a/src/sentryPlugins/bmc_block_io/src/common.cpp b/src/sentryPlugins/bmc_block_io/src/common.cpp index d603d7a..f03db17 100644 --- a/src/sentryPlugins/bmc_block_io/src/common.cpp +++ b/src/sentryPlugins/bmc_block_io/src/common.cpp @@ -142,7 +142,7 @@ std::string ExtractFileName(const std::string& path) } } -int ExecCommand(const std::string cmd, std::vector& result) +int ExecCommand(const std::string& cmd, std::vector& result) { FILE* pipe = popen(cmd.c_str(), "r"); if (!pipe) { -- Gitee From da61e94d136e85653bbcebb8532ed0dd6f6eb0a7 Mon Sep 17 00:00:00 2001 From: hewh Date: Wed, 10 Sep 2025 18:13:43 +0800 Subject: [PATCH 04/62] add test . 
--- .../avg_block_io/avg_block_io.py | 4 ++- .../avg_block_io/extra_logger.py | 30 +++++++++++++++++++ src/sentryPlugins/avg_block_io/utils.py | 2 ++ 3 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 src/sentryPlugins/avg_block_io/extra_logger.py diff --git a/src/sentryPlugins/avg_block_io/avg_block_io.py b/src/sentryPlugins/avg_block_io/avg_block_io.py index 899d517..5e6b603 100644 --- a/src/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/sentryPlugins/avg_block_io/avg_block_io.py @@ -17,9 +17,10 @@ from .config import read_config_log, read_config_common, read_config_algorithm, from .stage_window import IoWindow, IoDumpWindow from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name, check_disk_list_validation from .utils import update_avg_and_check_abnormal +from .extra_logger import init_extra_logger CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" - +AVG_EXTRA_LOG_PATH = "/var/log/sysSentry/avg_extra.log" def init_io_win(io_dic, config, common_param): """initialize windows of latency, iodump, and dict of avg_value""" @@ -152,6 +153,7 @@ def main(): log_level = read_config_log(CONFIG_FILE) log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" logging.basicConfig(level=log_level, format=log_format) + init_extra_logger(AVG_EXTRA_LOG_PATH, log_level, log_format) # 初始化配置读取 config = configparser.ConfigParser(comment_prefixes=('#', ';')) diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py new file mode 100644 index 0000000..83902d1 --- /dev/null +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -0,0 +1,30 @@ +import logging +import os + +extra_logger = None + +def init_extra_logger(log_path, log_level, log_format): + global extra_logger + try: + if not os.path.exists(log_path): + fd = os.open(log_path, os.O_CREAT | os.O_WRONLY, 0o600) + os.close(fd) + logger_name = f"extra_logger_{log_path}" + logger = logging.getLogger(logger_name) + logger.propagate = False + logger.setLevel(log_level) + + file_handler = logging.FileHandler(log_path) + file_handler.setLevel(log_level) + + formatter = logging.Formatter(log_format) + file_handler.setFormatter(formatter) + + logger.addHandler(file_handler) + extra_logger = logger + except Exception as e: + logging.error(f"Failed to create extra logger for {log_path}: {e}") + extra_logger = logging.getLogger() # Fallback to default logger + +def extra_log_slow(msg): + extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {msg['io_type']}, type: {msg['alarm_type']}") diff --git a/src/sentryPlugins/avg_block_io/utils.py b/src/sentryPlugins/avg_block_io/utils.py index d5f8bb4..c7c9985 100644 --- a/src/sentryPlugins/avg_block_io/utils.py +++ b/src/sentryPlugins/avg_block_io/utils.py @@ -9,6 +9,7 @@ # PURPOSE. # See the Mulan PSL v2 for more details. 
import logging +from .logger_setup import extra_log_slow AVG_VALUE = 0 AVG_COUNT = 1 @@ -109,6 +110,7 @@ def log_slow_win(msg, reason): """record log of slow win""" logging.warning(f"[SLOW IO] disk: {msg['driver_name']}, stage: {msg['block_stack']}, " f"iotype: {msg['io_type']}, type: {msg['alarm_type']}, reason: {reason}") + extra_log_slow(msg) logging.info(f"latency: {msg['details']['latency']}") logging.info(f"iodump: {msg['details']['iodump']}") -- Gitee From 21988a8ad84717fe5f6a2fa7093c1d63aeba6e35 Mon Sep 17 00:00:00 2001 From: hewh Date: Wed, 10 Sep 2025 18:18:26 +0800 Subject: [PATCH 05/62] add test . --- src/sentryPlugins/avg_block_io/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentryPlugins/avg_block_io/utils.py b/src/sentryPlugins/avg_block_io/utils.py index c7c9985..74e5bb4 100644 --- a/src/sentryPlugins/avg_block_io/utils.py +++ b/src/sentryPlugins/avg_block_io/utils.py @@ -9,7 +9,7 @@ # PURPOSE. # See the Mulan PSL v2 for more details. import logging -from .logger_setup import extra_log_slow +from .extra_logger import extra_log_slow AVG_VALUE = 0 AVG_COUNT = 1 -- Gitee From 28cd344339757ff98ceb1527f39d464c45dc8fed Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 12 Sep 2025 09:31:08 +0800 Subject: [PATCH 06/62] add test . --- .../avg_block_io/avg_block_io.py | 3 +- .../avg_block_io/extra_logger.py | 97 ++++++++++++++++++- src/sentryPlugins/avg_block_io/utils.py | 5 +- 3 files changed, 101 insertions(+), 4 deletions(-) diff --git a/src/sentryPlugins/avg_block_io/avg_block_io.py b/src/sentryPlugins/avg_block_io/avg_block_io.py index 5e6b603..9f6af99 100644 --- a/src/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/sentryPlugins/avg_block_io/avg_block_io.py @@ -20,7 +20,8 @@ from .utils import update_avg_and_check_abnormal from .extra_logger import init_extra_logger CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" -AVG_EXTRA_LOG_PATH = "/var/log/sysSentry/avg_extra.log" +AVG_EXTRA_LOG_PATH = "/var/log/sysSentry/avg_block_io_extra.log" + def init_io_win(io_dic, config, common_param): """initialize windows of latency, iodump, and dict of avg_value""" diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py index 83902d1..efcd5d8 100644 --- a/src/sentryPlugins/avg_block_io/extra_logger.py +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -1,8 +1,20 @@ +# coding: utf-8 +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. 
import logging import os +import re extra_logger = None + def init_extra_logger(log_path, log_level, log_format): global extra_logger try: @@ -26,5 +38,88 @@ def init_extra_logger(log_path, log_level, log_format): logging.error(f"Failed to create extra logger for {log_path}: {e}") extra_logger = logging.getLogger() # Fallback to default logger -def extra_log_slow(msg): + +def extra_latency_log(msg): extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {msg['io_type']}, type: {msg['alarm_type']}") + + # Parse the latency string from msg + latency_str = msg['details']['latency'] + pattern = r'(\w+):\s*\[([\d,]+)\]' + matches = re.findall(pattern, latency_str) + latency_data = {} + for match in matches: + key = match[0] + values = list(map(int, match[1].split(','))) + latency_data[key] = values + + # Define stage groups + groups = { + 'B->Q': ['throtl', 'wbt', 'iocost'], + 'Q->G': ['gettag', 'requeue'], + 'G->I': ['plug'], + 'I->D': ['deadline', 'bfq', 'hctx'], + 'D->C': ['rq_driver'] + } + + # Calculate statistics for each group + group_stats = {} + for group_name, stages in groups.items(): + all_values = [] + for stage in stages: + if stage in latency_data: + all_values.extend(latency_data[stage]) + if all_values: + min_val = min(all_values) + max_val = max(all_values) + avg_val = sum(all_values) / len(all_values) + else: + min_val = 0 + max_val = 0 + avg_val = 0 + # Convert to ms + min_val_ms = min_val / 1000.0 + max_val_ms = max_val / 1000.0 + avg_val_ms = avg_val / 1000.0 + group_stats[group_name] = { + 'min': min_val_ms, + 'max': max_val_ms, + 'avg': avg_val_ms + } + + # Calculate total latency (B->C) + total_avg = 0 + total_min = 0 + total_max = 0 + for group_name in groups: + total_avg += group_stats[group_name]['avg'] + total_min += group_stats[group_name]['min'] + total_max += group_stats[group_name]['max'] + group_stats['B->C'] = { + 'min': total_min, + 'max': total_max, + 'avg': total_avg + } + + # Calculate PCT for each group (except B->C) + for group_name in groups: + if total_avg > 0: + pct = (group_stats[group_name]['avg'] / total_avg) * 100 + else: + pct = 0 + group_stats[group_name]['pct'] = pct + group_stats['B->C']['pct'] = 100.0 + + # Output table + stage_order = ['B->Q', 'Q->G', 'G->I', 'I->D', 'D->C', 'B->C'] + # Format header + header = "STAGE MIN(ms) MAX(ms) AVG(ms) PCT" + extra_logger.warning(header) + for stage in stage_order: + s = group_stats[stage] + min_str = f"{s['min']:.3f}" + max_str = f"{s['max']:.3f}" + avg_str = f"{s['avg']:.3f}" + pct_str = f"{s['pct']:.2f}%" + # Format each line with aligned columns + line = f"{stage} {min_str:>7} {max_str:>7} {avg_str:>7} {pct_str:>6}" + extra_logger.warning(line) diff --git a/src/sentryPlugins/avg_block_io/utils.py b/src/sentryPlugins/avg_block_io/utils.py index 74e5bb4..0337cdb 100644 --- a/src/sentryPlugins/avg_block_io/utils.py +++ b/src/sentryPlugins/avg_block_io/utils.py @@ -9,7 +9,7 @@ # PURPOSE. # See the Mulan PSL v2 for more details. 
import logging -from .extra_logger import extra_log_slow +from .extra_logger import extra_latency_log AVG_VALUE = 0 AVG_COUNT = 1 @@ -110,7 +110,8 @@ def log_slow_win(msg, reason): """record log of slow win""" logging.warning(f"[SLOW IO] disk: {msg['driver_name']}, stage: {msg['block_stack']}, " f"iotype: {msg['io_type']}, type: {msg['alarm_type']}, reason: {reason}") - extra_log_slow(msg) + if "latency" in str(msg.get('alarm_type', '')): + extra_latency_log(msg) logging.info(f"latency: {msg['details']['latency']}") logging.info(f"iodump: {msg['details']['iodump']}") -- Gitee From 501a21977165a7e41b4492f3b096dded4670c6b8 Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 12 Sep 2025 11:06:35 +0800 Subject: [PATCH 07/62] add test . --- .../avg_block_io/extra_logger.py | 37 +++++++++++++------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py index efcd5d8..7a34bab 100644 --- a/src/sentryPlugins/avg_block_io/extra_logger.py +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -44,12 +44,12 @@ def extra_latency_log(msg): # Parse the latency string from msg latency_str = msg['details']['latency'] - pattern = r'(\w+):\s*\[([\d,]+)\]' + pattern = r'(\w+):\s*\[([0-9.,]+)\]' matches = re.findall(pattern, latency_str) latency_data = {} for match in matches: key = match[0] - values = list(map(int, match[1].split(','))) + values = list(map(float, match[1].split(','))) latency_data[key] = values # Define stage groups @@ -111,15 +111,28 @@ def extra_latency_log(msg): # Output table stage_order = ['B->Q', 'Q->G', 'G->I', 'I->D', 'D->C', 'B->C'] - # Format header - header = "STAGE MIN(ms) MAX(ms) AVG(ms) PCT" + STAGE_WIDTH = 7 + NUM_WIDTH = 12 + PCT_WIDTH = 8 + + header = f"{'Stage':<{STAGE_WIDTH}} {'Min(ms)':>{NUM_WIDTH}} {'Max(ms)':>{NUM_WIDTH}} {'Avg(ms)':>{NUM_WIDTH}} {'PCT':>{PCT_WIDTH}}" extra_logger.warning(header) + for stage in stage_order: - s = group_stats[stage] - min_str = f"{s['min']:.3f}" - max_str = f"{s['max']:.3f}" - avg_str = f"{s['avg']:.3f}" - pct_str = f"{s['pct']:.2f}%" - # Format each line with aligned columns - line = f"{stage} {min_str:>7} {max_str:>7} {avg_str:>7} {pct_str:>6}" - extra_logger.warning(line) + try: + s = group_stats[stage] + min_str = f"{s['min']:.3f}" + max_str = f"{s['max']:.3f}" + avg_str = f"{s['avg']:.3f}" + pct_str = f"{s['pct']:.2f}%" + + line = ( + f"{stage:<{STAGE_WIDTH}} " + f"{min_str:>{NUM_WIDTH}} " + f"{max_str:>{NUM_WIDTH}} " + f"{avg_str:>{NUM_WIDTH}} " + f"{pct_str:>{PCT_WIDTH}}" + ) + extra_logger.warning(line) + except KeyError: + return -- Gitee From b2e43076aade53e5e3bb48be30bd7e7443ba835a Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 12 Sep 2025 11:37:50 +0800 Subject: [PATCH 08/62] add test . 
--- .../avg_block_io/extra_logger.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py index 7a34bab..c2bf63f 100644 --- a/src/sentryPlugins/avg_block_io/extra_logger.py +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -111,12 +111,17 @@ def extra_latency_log(msg): # Output table stage_order = ['B->Q', 'Q->G', 'G->I', 'I->D', 'D->C', 'B->C'] - STAGE_WIDTH = 7 - NUM_WIDTH = 12 - PCT_WIDTH = 8 - - header = f"{'Stage':<{STAGE_WIDTH}} {'Min(ms)':>{NUM_WIDTH}} {'Max(ms)':>{NUM_WIDTH}} {'Avg(ms)':>{NUM_WIDTH}} {'PCT':>{PCT_WIDTH}}" - extra_logger.warning(header) + stage_width = 7 + num_width = 12 + pct_width = 8 + + extra_logger.warning( + f"{'Stage':<{stage_width}} " + f"{'Min(ms)':>{num_width}} " + f"{'Max(ms)':>{num_width}} " + f"{'Avg(ms)':>{num_width}} " + f"{'PCT':>{pct_width}}" + ) for stage in stage_order: try: @@ -126,13 +131,12 @@ def extra_latency_log(msg): avg_str = f"{s['avg']:.3f}" pct_str = f"{s['pct']:.2f}%" - line = ( - f"{stage:<{STAGE_WIDTH}} " - f"{min_str:>{NUM_WIDTH}} " - f"{max_str:>{NUM_WIDTH}} " - f"{avg_str:>{NUM_WIDTH}} " - f"{pct_str:>{PCT_WIDTH}}" + extra_logger.warning( + f"{stage:<{stage_width}} " + f"{s['min']:.3f:>{num_width}} " + f"{s['max']:.3f:>{num_width}} " + f"{s['avg']:.3f:>{num_width}} " + f"{s['pct']:.2f}%:>{pct_width}" ) - extra_logger.warning(line) except KeyError: return -- Gitee From 24d486a53e737380e502e47b61227e8aa5128927 Mon Sep 17 00:00:00 2001 From: hewh Date: Mon, 15 Sep 2025 11:30:52 +0800 Subject: [PATCH 09/62] add test . --- .../pySentryCollector/collect_plugin.py | 4 +- src/sentryPlugins/ai_block_io/ai_block_io.py | 4 +- .../ai_block_io/config_parser.py | 3 + src/sentryPlugins/ai_block_io/extra_logger.py | 151 ++++++++++++++++++ .../ai_block_io/sliding_window.py | 7 +- .../avg_block_io/extra_logger.py | 22 ++- src/sentryPlugins/avg_block_io/utils.py | 5 +- .../bmc_block_io/src/cbmcblockio.cpp | 14 +- 8 files changed, 187 insertions(+), 23 deletions(-) create mode 100644 src/sentryPlugins/ai_block_io/extra_logger.py diff --git a/src/libsentry/python/pySentryCollector/collect_plugin.py b/src/libsentry/python/pySentryCollector/collect_plugin.py index 9495d8b..3395f89 100644 --- a/src/libsentry/python/pySentryCollector/collect_plugin.py +++ b/src/libsentry/python/pySentryCollector/collect_plugin.py @@ -326,9 +326,9 @@ def get_disk_type(disk): try: with open(disk_file, 'r') as file: num = int(file.read()) - if num == 1: + if num == 0: result['message'] = str(DiskType.TYPE_SATA_SSD) - elif num == 0: + elif num == 1: result['message'] = str(DiskType.TYPE_SATA_HDD) else: logging.error("disk %s is not support, num = %d", disk, num) diff --git a/src/sentryPlugins/ai_block_io/ai_block_io.py b/src/sentryPlugins/ai_block_io/ai_block_io.py index abf264d..0905c0d 100644 --- a/src/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/sentryPlugins/ai_block_io/ai_block_io.py @@ -27,6 +27,7 @@ from .data_access import ( ) from .io_data import MetricName from .alarm_report import Xalarm, Report +from .extra_logger import extra_slow_log CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini" @@ -209,12 +210,13 @@ class SlowIODetection: del tmp_alarm_content["details"] logging.warning("[SLOW IO] " + str(tmp_alarm_content)) logging.warning(f'[SLOW IO] disk: {str(tmp_alarm_content.get("driver_name"))}, ' - f'stage: {str(tmp_alarm_content.get("driver_name"))}, ' + f'stage: {str(tmp_alarm_content.get("block_stack"))}, ' f'iotype: 
{str(tmp_alarm_content.get("io_type"))}, ' f'type: {str(tmp_alarm_content.get("alarm_type"))}, ' f'reason: {str(tmp_alarm_content.get("reason"))}') logging.warning(f"latency: " + str(alarm_content.get("details").get("latency"))) logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump"))) + extra_slow_log(alarm_content) # Step4:等待检测时间 logging.debug("step4. Wait to start next slow io event detection loop.") diff --git a/src/sentryPlugins/ai_block_io/config_parser.py b/src/sentryPlugins/ai_block_io/config_parser.py index 612fe9f..a918c8b 100644 --- a/src/sentryPlugins/ai_block_io/config_parser.py +++ b/src/sentryPlugins/ai_block_io/config_parser.py @@ -17,9 +17,11 @@ from .alarm_report import Report from .threshold import ThresholdType from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level from .data_access import check_detect_frequency_is_valid +from .extra_logger import init_extra_logger LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" +AI_EXTRA_LOG_PATH = "/var/log/sysSentry/ai_block_io_extra.log" ALL_STAGE_LIST = [ "throtl", @@ -52,6 +54,7 @@ def init_log_format(log_level: str): logging.warning( "the log_level: %s you set is invalid, use default value: info.", log_level ) + init_extra_logger(AI_EXTRA_LOG_PATH, get_log_level(log_level.lower()), LOG_FORMAT) class ConfigParser: diff --git a/src/sentryPlugins/ai_block_io/extra_logger.py b/src/sentryPlugins/ai_block_io/extra_logger.py new file mode 100644 index 0000000..fdced27 --- /dev/null +++ b/src/sentryPlugins/ai_block_io/extra_logger.py @@ -0,0 +1,151 @@ +# coding: utf-8 +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. 
+import logging +import os +import re + +extra_logger = None + + +def init_extra_logger(log_path, log_level, log_format): + global extra_logger + try: + if not os.path.exists(log_path): + fd = os.open(log_path, os.O_CREAT | os.O_WRONLY, 0o600) + os.close(fd) + logger_name = f"extra_logger_{log_path}" + logger = logging.getLogger(logger_name) + logger.propagate = False + logger.setLevel(log_level) + + file_handler = logging.FileHandler(log_path) + file_handler.setLevel(log_level) + + formatter = logging.Formatter(log_format) + file_handler.setFormatter(formatter) + + logger.addHandler(file_handler) + extra_logger = logger + except Exception as e: + logging.error(f"Failed to create extra logger for {log_path}: {e}") + extra_logger = logging.getLogger() # Fallback to default logger + +def extra_slow_log(msg): + if "latency" in str(msg.get('alarm_type', '')): + extra_latency_log(msg) + return + if "iodump" in str(msg.get('alarm_type', '')): + return + +def extra_latency_log(msg): + io_types = [iot.strip() for iot in re.split(r',+', msg['io_type'])] + + # Define stage groups + groups = { + 'B->Q': ['throtl', 'wbt', 'iocost'], + 'Q->G': ['gettag', 'requeue'], + 'G->I': ['plug'], + 'I->D': ['deadline', 'bfq', 'hctx'], + 'D->C': ['rq_driver'] + } + + for io_type in io_types: + extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {io_type}") + + # Parse the latency string from msg + latency_str = msg['details']['latency'].get(io_type, {}) + pattern = r'(\w+):\s*\[([0-9.,]+)\]' + matches = re.findall(pattern, latency_str) + latency_data = {} + for match in matches: + key = match[0] + values = list(map(float, match[1].split(','))) + latency_data[key] = values + + # Calculate statistics for each group + group_stats = {} + for group_name, stages in groups.items(): + all_values = [] + for stage in stages: + if stage in latency_data: + all_values.extend(latency_data[stage]) + if all_values: + min_val = min(all_values) + max_val = max(all_values) + avg_val = sum(all_values) / len(all_values) + else: + min_val = 0 + max_val = 0 + avg_val = 0 + # Convert to ms + min_val_ms = min_val / 1000.0 + max_val_ms = max_val / 1000.0 + avg_val_ms = avg_val / 1000.0 + group_stats[group_name] = { + 'min': min_val_ms, + 'max': max_val_ms, + 'avg': avg_val_ms + } + + # Calculate total latency (B->C) + total_avg = 0 + total_min = 0 + total_max = 0 + for group_name in groups: + total_avg += group_stats[group_name]['avg'] + total_min += group_stats[group_name]['min'] + total_max += group_stats[group_name]['max'] + group_stats['B->C'] = { + 'min': total_min, + 'max': total_max, + 'avg': total_avg + } + + # Calculate PCT for each group (except B->C) + for group_name in groups: + if total_avg > 0: + pct = (group_stats[group_name]['avg'] / total_avg) * 100 + else: + pct = 0 + group_stats[group_name]['pct'] = pct + group_stats['B->C']['pct'] = 100.0 + + # Output table + stage_order = ['B->Q', 'Q->G', 'G->I', 'I->D', 'D->C', 'B->C'] + stage_width = 7 + num_width = 12 + pct_width = 8 + + extra_logger.warning( + f"{'Stage':<{stage_width}} " + f"{'Min(ms)':>{num_width}} " + f"{'Max(ms)':>{num_width}} " + f"{'Avg(ms)':>{num_width}} " + f"{'PCT':>{pct_width}}" + ) + + for stage in stage_order: + try: + s = group_stats[stage] + min_str = f"{s['min']:>.3f}" + max_str = f"{s['max']:>.3f}" + avg_str = f"{s['avg']:>.3f}" + pct_str = f"{s['pct']:.2f}%" + + extra_logger.warning( + f"{stage:<{stage_width}} " + f"{min_str:>{num_width}} " + f"{max_str:>{num_width}} " + f"{avg_str:>{num_width}} " + f"{pct_str:>{pct_width}}" + ) + 
except KeyError: + return diff --git a/src/sentryPlugins/ai_block_io/sliding_window.py b/src/sentryPlugins/ai_block_io/sliding_window.py index a13033f..b174d94 100644 --- a/src/sentryPlugins/ai_block_io/sliding_window.py +++ b/src/sentryPlugins/ai_block_io/sliding_window.py @@ -33,10 +33,13 @@ class SlidingWindow: def is_abnormal(self, data): if self._avg_lim is not None and data < self._avg_lim: return False - if self._ai_threshold is not None and data > self._ai_threshold: - return True + if self._avg_lim is not None and self._ai_threshold is not None: + threshold = max(self._avg_lim, self._ai_threshold) + if data > threshold: + return True if self._abs_threshold is not None and data > self._abs_threshold: return True + return False def push(self, data: float): if len(self._io_data_queue) == self._queue_length: diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py index c2bf63f..d7f0183 100644 --- a/src/sentryPlugins/avg_block_io/extra_logger.py +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -38,9 +38,15 @@ def init_extra_logger(log_path, log_level, log_format): logging.error(f"Failed to create extra logger for {log_path}: {e}") extra_logger = logging.getLogger() # Fallback to default logger +def extra_slow_log(msg): + if "latency" in str(msg.get('alarm_type', '')): + extra_latency_log(msg) + return + if "iodump" in str(msg.get('alarm_type', '')): + return def extra_latency_log(msg): - extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {msg['io_type']}, type: {msg['alarm_type']}") + extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {msg['io_type']}") # Parse the latency string from msg latency_str = msg['details']['latency'] @@ -126,17 +132,17 @@ def extra_latency_log(msg): for stage in stage_order: try: s = group_stats[stage] - min_str = f"{s['min']:.3f}" - max_str = f"{s['max']:.3f}" - avg_str = f"{s['avg']:.3f}" + min_str = f"{s['min']:>.3f}" + max_str = f"{s['max']:>.3f}" + avg_str = f"{s['avg']:>.3f}" pct_str = f"{s['pct']:.2f}%" extra_logger.warning( f"{stage:<{stage_width}} " - f"{s['min']:.3f:>{num_width}} " - f"{s['max']:.3f:>{num_width}} " - f"{s['avg']:.3f:>{num_width}} " - f"{s['pct']:.2f}%:>{pct_width}" + f"{min_str:>{num_width}} " + f"{max_str:>{num_width}} " + f"{avg_str:>{num_width}} " + f"{pct_str:>{pct_width}}" ) except KeyError: return diff --git a/src/sentryPlugins/avg_block_io/utils.py b/src/sentryPlugins/avg_block_io/utils.py index 0337cdb..27c1f84 100644 --- a/src/sentryPlugins/avg_block_io/utils.py +++ b/src/sentryPlugins/avg_block_io/utils.py @@ -9,7 +9,7 @@ # PURPOSE. # See the Mulan PSL v2 for more details. 
import logging -from .extra_logger import extra_latency_log +from .extra_logger import extra_slow_log AVG_VALUE = 0 AVG_COUNT = 1 @@ -110,10 +110,9 @@ def log_slow_win(msg, reason): """record log of slow win""" logging.warning(f"[SLOW IO] disk: {msg['driver_name']}, stage: {msg['block_stack']}, " f"iotype: {msg['io_type']}, type: {msg['alarm_type']}, reason: {reason}") - if "latency" in str(msg.get('alarm_type', '')): - extra_latency_log(msg) logging.info(f"latency: {msg['details']['latency']}") logging.info(f"iodump: {msg['details']['iodump']}") + extra_slow_log(msg) def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data): diff --git a/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp b/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp index cb52364..327ee78 100644 --- a/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp +++ b/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp @@ -200,13 +200,6 @@ int CBMCBlockIo::QueryEvents() break; } - size_t expectedSize = RESP_HEADER_SIZE + header.eventCount * EVENT_SIZE; - if (hexBytes.size() < expectedSize) { - BMC_LOG_ERROR << "Response size too small. Expected: " << expectedSize - << ", Actual: " << hexBytes.size(); - break; - } - BMC_LOG_DEBUG << "Total events: " << header.totalEvents << ", returned: " << static_cast(header.eventCount) << ", current index: " << currentIndex; @@ -215,6 +208,13 @@ int CBMCBlockIo::QueryEvents() break; } + size_t expectedSize = RESP_HEADER_SIZE + header.eventCount * EVENT_SIZE; + if (hexBytes.size() != expectedSize) { + BMC_LOG_ERROR << "Response size invalid. Expected: " << expectedSize + << ", Actual: " << hexBytes.size(); + break; + } + ProcessEvents(hexBytes, header.eventCount); currentIndex += header.eventCount; -- Gitee From eda0be6915a6daef29cca55041a7ae725ce815e7 Mon Sep 17 00:00:00 2001 From: hewh Date: Mon, 15 Sep 2025 14:46:17 +0800 Subject: [PATCH 10/62] add test . 
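The alarm details already carry the per-stage latency samples as a dict keyed by io_type and stage, so the regex that re-parsed a string form is dropped and the values are consumed directly. As a rough Python sketch of the grouping step that remains (group names and the /1000.0 scaling follow extra_logger.py; the helper name is made up):

    GROUPS = {
        'B->Q': ['throtl', 'wbt', 'iocost'],
        'Q->G': ['gettag', 'requeue'],
        'G->I': ['plug'],
        'I->D': ['deadline', 'bfq', 'hctx'],
        'D->C': ['rq_driver'],
    }

    def group_latency_stats(latency_by_stage):
        # latency_by_stage: e.g. {'rq_driver': [1200.0, 950.0], ...}
        stats = {}
        for name, stages in GROUPS.items():
            values = [v for s in stages for v in latency_by_stage.get(s, [])]
            if values:
                stats[name] = {'min': min(values) / 1000.0,
                               'max': max(values) / 1000.0,
                               'avg': sum(values) / len(values) / 1000.0}
            else:
                stats[name] = {'min': 0.0, 'max': 0.0, 'avg': 0.0}
        return stats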
--- src/sentryPlugins/ai_block_io/extra_logger.py | 16 +++++----------- src/sentryPlugins/avg_block_io/extra_logger.py | 2 ++ 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/extra_logger.py b/src/sentryPlugins/ai_block_io/extra_logger.py index fdced27..1684119 100644 --- a/src/sentryPlugins/ai_block_io/extra_logger.py +++ b/src/sentryPlugins/ai_block_io/extra_logger.py @@ -38,6 +38,7 @@ def init_extra_logger(log_path, log_level, log_format): logging.error(f"Failed to create extra logger for {log_path}: {e}") extra_logger = logging.getLogger() # Fallback to default logger + def extra_slow_log(msg): if "latency" in str(msg.get('alarm_type', '')): extra_latency_log(msg) @@ -45,6 +46,7 @@ def extra_slow_log(msg): if "iodump" in str(msg.get('alarm_type', '')): return + def extra_latency_log(msg): io_types = [iot.strip() for iot in re.split(r',+', msg['io_type'])] @@ -60,23 +62,15 @@ def extra_latency_log(msg): for io_type in io_types: extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {io_type}") - # Parse the latency string from msg - latency_str = msg['details']['latency'].get(io_type, {}) - pattern = r'(\w+):\s*\[([0-9.,]+)\]' - matches = re.findall(pattern, latency_str) - latency_data = {} - for match in matches: - key = match[0] - values = list(map(float, match[1].split(','))) - latency_data[key] = values + latency_data_dict = msg['details']['latency'].get(io_type, {}) # Calculate statistics for each group group_stats = {} for group_name, stages in groups.items(): all_values = [] for stage in stages: - if stage in latency_data: - all_values.extend(latency_data[stage]) + if stage in latency_data_dict: + all_values.extend(latency_data_dict[stage]) if all_values: min_val = min(all_values) max_val = max(all_values) diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py index d7f0183..cd050f3 100644 --- a/src/sentryPlugins/avg_block_io/extra_logger.py +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -38,6 +38,7 @@ def init_extra_logger(log_path, log_level, log_format): logging.error(f"Failed to create extra logger for {log_path}: {e}") extra_logger = logging.getLogger() # Fallback to default logger + def extra_slow_log(msg): if "latency" in str(msg.get('alarm_type', '')): extra_latency_log(msg) @@ -45,6 +46,7 @@ def extra_slow_log(msg): if "iodump" in str(msg.get('alarm_type', '')): return + def extra_latency_log(msg): extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {msg['io_type']}") -- Gitee From f91d7fa88704c0006938be99a7ca99ee90183014 Mon Sep 17 00:00:00 2001 From: hewh Date: Mon, 15 Sep 2025 15:31:40 +0800 Subject: [PATCH 11/62] add test . 
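The sliding-window tests now pass two extra limits besides queue length and threshold, matching the reworked is_abnormal() earlier in this series. A minimal sketch of the intended check (the class name, parameter names and the order of the two extra arguments are assumptions, not the real SlidingWindow API):

    class WindowSketch:
        def __init__(self, queue_length, threshold, abs_threshold=None, avg_lim=None):
            self.queue_length = queue_length
            self.threshold = threshold
            self.abs_threshold = abs_threshold
            self.avg_lim = avg_lim
            self.ai_threshold = None      # supplied later by the threshold object

        def is_abnormal(self, value):
            if self.avg_lim is not None and value < self.avg_lim:
                return False              # below the floor: never abnormal
            if self.avg_lim is not None and self.ai_threshold is not None:
                if value > max(self.avg_lim, self.ai_threshold):
                    return True           # AI threshold, clamped by the floor
            if self.abs_threshold is not None and value > self.abs_threshold:
                return True               # hard absolute limit
            return False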
--- selftest/test/test_ai_threshold_slow_io_detection.py | 6 +++--- src/sentryPlugins/ai_block_io/extra_logger.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/selftest/test/test_ai_threshold_slow_io_detection.py b/selftest/test/test_ai_threshold_slow_io_detection.py index c762c82..a89ce9a 100644 --- a/selftest/test/test_ai_threshold_slow_io_detection.py +++ b/selftest/test/test_ai_threshold_slow_io_detection.py @@ -91,7 +91,7 @@ class Test(unittest.TestCase): self.assertEqual(_get_n_sigma_threshold(data_list, 3), n_sigma_threshold) def test_not_continuous_sliding_window(self): - not_continuous = NotContinuousSlidingWindow(5, 3) + not_continuous = NotContinuousSlidingWindow(5, 3, 15, 40) boxplot_threshold = BoxplotThreshold(1.5, 10, 8) boxplot_threshold.attach_observer(not_continuous) data_list1 = [19, 20, 20, 20, 20, 20, 22, 24, 23, 20] @@ -114,7 +114,7 @@ class Test(unittest.TestCase): self.assertEqual(25.625, boxplot_threshold.get_threshold()) def test_continuous_sliding_window(self): - continuous = ContinuousSlidingWindow(5, 3) + continuous = ContinuousSlidingWindow(5, 3, 15, 40) boxplot_threshold = BoxplotThreshold(1.5, 10, 8) boxplot_threshold.attach_observer(continuous) data_list = [19, 20, 20, 20, 20, 20, 22, 24, 23, 20] @@ -131,7 +131,7 @@ class Test(unittest.TestCase): self.assertTrue(continuous.is_slow_io_event(25)[0][0]) def test_median_sliding_window(self): - median = MedianSlidingWindow(5, 3) + median = MedianSlidingWindow(5, 3, 15, 40) absolute_threshold = AbsoluteThreshold(10, 8) absolute_threshold.attach_observer(median) absolute_threshold.set_threshold(24.5) diff --git a/src/sentryPlugins/ai_block_io/extra_logger.py b/src/sentryPlugins/ai_block_io/extra_logger.py index 1684119..1148139 100644 --- a/src/sentryPlugins/ai_block_io/extra_logger.py +++ b/src/sentryPlugins/ai_block_io/extra_logger.py @@ -62,7 +62,7 @@ def extra_latency_log(msg): for io_type in io_types: extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {io_type}") - latency_data_dict = msg['details']['latency'].get(io_type, {}) + latency_data_dict = msg['details']['latency'].get(io_type, {}) # Calculate statistics for each group group_stats = {} -- Gitee From bdcecdb23b3362771f54f67cc88ab263e5478c1f Mon Sep 17 00:00:00 2001 From: hewh Date: Mon, 15 Sep 2025 15:44:57 +0800 Subject: [PATCH 12/62] add test . 
--- selftest/test/test_ai_threshold_slow_io_detection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/selftest/test/test_ai_threshold_slow_io_detection.py b/selftest/test/test_ai_threshold_slow_io_detection.py index a89ce9a..a14d01c 100644 --- a/selftest/test/test_ai_threshold_slow_io_detection.py +++ b/selftest/test/test_ai_threshold_slow_io_detection.py @@ -91,7 +91,7 @@ class Test(unittest.TestCase): self.assertEqual(_get_n_sigma_threshold(data_list, 3), n_sigma_threshold) def test_not_continuous_sliding_window(self): - not_continuous = NotContinuousSlidingWindow(5, 3, 15, 40) + not_continuous = NotContinuousSlidingWindow(5, 3, 40, 15) boxplot_threshold = BoxplotThreshold(1.5, 10, 8) boxplot_threshold.attach_observer(not_continuous) data_list1 = [19, 20, 20, 20, 20, 20, 22, 24, 23, 20] @@ -114,7 +114,7 @@ class Test(unittest.TestCase): self.assertEqual(25.625, boxplot_threshold.get_threshold()) def test_continuous_sliding_window(self): - continuous = ContinuousSlidingWindow(5, 3, 15, 40) + continuous = ContinuousSlidingWindow(5, 3, 40, 15) boxplot_threshold = BoxplotThreshold(1.5, 10, 8) boxplot_threshold.attach_observer(continuous) data_list = [19, 20, 20, 20, 20, 20, 22, 24, 23, 20] @@ -131,7 +131,7 @@ class Test(unittest.TestCase): self.assertTrue(continuous.is_slow_io_event(25)[0][0]) def test_median_sliding_window(self): - median = MedianSlidingWindow(5, 3, 15, 40) + median = MedianSlidingWindow(5, 3, 40, 15) absolute_threshold = AbsoluteThreshold(10, 8) absolute_threshold.attach_observer(median) absolute_threshold.set_threshold(24.5) -- Gitee From 9b01061ff559b13ab531c357c94e9cf13e031103 Mon Sep 17 00:00:00 2001 From: hewh Date: Tue, 16 Sep 2025 10:48:16 +0800 Subject: [PATCH 13/62] add test . --- config/plugins/bmc_block_io.ini | 8 +----- .../bmc_block_io/include/cbmcblockio.h | 4 --- .../bmc_block_io/include/common.h | 2 -- .../bmc_block_io/include/configure.h | 6 ++-- .../bmc_block_io/src/cbmcblockio.cpp | 28 +++++-------------- src/sentryPlugins/bmc_block_io/src/common.cpp | 14 ++++++---- src/sentryPlugins/bmc_block_io/src/main.cpp | 12 -------- 7 files changed, 19 insertions(+), 55 deletions(-) diff --git a/config/plugins/bmc_block_io.ini b/config/plugins/bmc_block_io.ini index 3b22618..41a39ca 100644 --- a/config/plugins/bmc_block_io.ini +++ b/config/plugins/bmc_block_io.ini @@ -2,10 +2,4 @@ log_level=info # polling cycle, unit: seconds, range: [60, 3600] -patrol_second=5 - -# ipmitool login username -bmc_username=Administrator - -# ipmitool login passwd -bmc_passwd=Admin@9000 \ No newline at end of file +patrol_second=60 \ No newline at end of file diff --git a/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h b/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h index cb0b514..488b700 100644 --- a/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h +++ b/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h @@ -40,8 +40,6 @@ public: void Start(); void Stop(); void SetPatrolInterval(int seconds); - void SetUserName(std::string userName); - void SetPassWd(std::string passWd); bool IsRunning(); private: void SentryWorker(); @@ -60,8 +58,6 @@ private: std::thread m_worker; std::mutex m_mutex; std::condition_variable m_cv; - std::string m_userName; - std::string m_passWd; std::string m_bmcIp; int m_patrolSeconds; }; diff --git a/src/sentryPlugins/bmc_block_io/include/common.h b/src/sentryPlugins/bmc_block_io/include/common.h index 9304dee..e23f4b9 100644 --- a/src/sentryPlugins/bmc_block_io/include/common.h +++ 
b/src/sentryPlugins/bmc_block_io/include/common.h @@ -21,8 +21,6 @@ struct PluConfig { BMCBlockIoPlu::Logger::Level logLevel; int patrolSeconds; - std::string userName; - std::string passWd; }; struct ConfigItem { diff --git a/src/sentryPlugins/bmc_block_io/include/configure.h b/src/sentryPlugins/bmc_block_io/include/configure.h index 65a56ce..f253fb4 100644 --- a/src/sentryPlugins/bmc_block_io/include/configure.h +++ b/src/sentryPlugins/bmc_block_io/include/configure.h @@ -14,11 +14,9 @@ namespace BMCBlockIoPlu { const std::string BMCPLU_CONFIG_PATH = "/etc/sysSentry/plugins/bmc_block_io.ini"; const std::string BMCPLU_LOG_PATH = "/var/log/sysSentry/bmc_block_io.log"; -const std::string BMCPLU_DEFAULT_USERNAME = "Administrator"; -const std::string BMCPLU_DEFAULT_PASSWD = "Admin@9000"; -const int BMCPLU_PATROL_MIN = 1; +const int BMCPLU_PATROL_MIN = 60; const int BMCPLU_PATROL_MAX = 3600; -const int BMCPLU_PATROL_DEFAULT = 600; +const int BMCPLU_PATROL_DEFAULT = 300; const int BMCPLU_CONFIG_CHECK_CYCLE = 10; // seconds const int BMCPLU_DEFAULT_SLEEP_CYCLE = 3; // seconds } diff --git a/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp b/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp index 327ee78..9711cbb 100644 --- a/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp +++ b/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp @@ -43,8 +43,6 @@ const std::string JSON_KEY_DETAILS = "details"; CBMCBlockIo::CBMCBlockIo() : m_running(false), m_patrolSeconds(BMCPLU_PATROL_DEFAULT), - m_userName(BMCPLU_DEFAULT_USERNAME), - m_passWd(BMCPLU_DEFAULT_PASSWD), m_bmcIp("") { } @@ -89,16 +87,6 @@ void CBMCBlockIo::SetPatrolInterval(int seconds) m_patrolSeconds = seconds; } -void CBMCBlockIo::SetUserName(std::string userName) -{ - m_userName = userName; -} - -void CBMCBlockIo::SetPassWd(std::string passWd) -{ - m_passWd = passWd; -} - bool CBMCBlockIo::IsRunning() { return m_running; @@ -178,7 +166,7 @@ void CBMCBlockIo::GetBMCIp() 20-23 占位字节,默认0 N+1-N+15重复上面9-23中的内容,表示下一个事件 厂商ID固定,其他所有多字节对象均为小端序, eg: -ipmitool -I lanplus -H x.x.x.x -U x -P x -C 17 raw 0x30 0x94 0xDB 0x07 0x00 0x40 0x00 0x00 0x00 0x01 0x02 +ipmitool raw 0x30 0x94 0xDB 0x07 0x00 0x40 0x00 0x00 0x00 0x01 0x02 db 07 00 03 00 03 00 39 00 00 02 2f ab 91 68 00 02 04 00 00 00 00 39 00 00 02 2e ab 91 68 00 02 02 00 00 00 00 39 00 00 02 2e ab 91 68 00 02 01 00 00 00 00 @@ -192,6 +180,7 @@ int CBMCBlockIo::QueryEvents() std::string cmd = BuildIPMICommand(currentIndex); std::vector hexBytes = ExecuteIPMICommand(cmd); if (hexBytes.empty()) { + ret = BMCPLU_SUCCESS; break; } @@ -231,10 +220,7 @@ std::string CBMCBlockIo::BuildIPMICommand(uint16_t startIndex) uint8_t indexHigh = static_cast((startIndex >> 8) & 0xff); uint8_t indexLow = static_cast(startIndex & 0xff); std::ostringstream cmdStream; - cmdStream << "ipmitool -I lanplus -H " << m_bmcIp - << " -U " << m_userName - << " -P " << m_passWd - << " -C 17 raw 0x30 0x94 0xDB 0x07 0x00 0x40 0x00" + cmdStream << "ipmitool raw 0x30 0x94 0xDB 0x07 0x00 0x40 0x00" << " " << ByteToHex(indexLow) << " " << ByteToHex(indexHigh) << " 0x01 0x02"; @@ -386,10 +372,10 @@ void CBMCBlockIo::ReportAlarm(const IPMIEvent& event) json_object* jObject = json_object_new_object(); json_object_object_add(jObject, JSON_KEY_ALARM_SOURCE.c_str(), json_object_new_string(BMC_TASK_NAME.c_str())); json_object_object_add(jObject, JSON_KEY_DRIVER_NAME.c_str(), json_object_new_string(std::to_string(event.deviceId).c_str())); - //json_object_object_add(jObject, JSON_KEY_IO_TYPE.c_str(), json_object_new_string("null")); - 
//json_object_object_add(jObject, JSON_KEY_REASON.c_str(), json_object_new_string("null")); - //json_object_object_add(jObject, JSON_KEY_BLOCK_STACK.c_str(), json_object_new_string("null")); - //json_object_object_add(jObject, JSON_KEY_DETAILS.c_str(), json_object_new_string("null")); + json_object_object_add(jObject, JSON_KEY_IO_TYPE.c_str(), json_object_new_string("read,write")); + json_object_object_add(jObject, JSON_KEY_REASON.c_str(), json_object_new_string("driver slow")); + json_object_object_add(jObject, JSON_KEY_BLOCK_STACK.c_str(), json_object_new_string("rq_driver")); + json_object_object_add(jObject, JSON_KEY_DETAILS.c_str(), json_object_new_string("{}}")); const char *jData = json_object_to_json_string(jObject); int ret = xalarm_Report(BMC_ALARM_ID, ucAlarmLevel, ucAlarmType, const_cast(jData)); if (ret != RETURE_CODE_SUCCESS) { diff --git a/src/sentryPlugins/bmc_block_io/src/common.cpp b/src/sentryPlugins/bmc_block_io/src/common.cpp index f03db17..6d98f83 100644 --- a/src/sentryPlugins/bmc_block_io/src/common.cpp +++ b/src/sentryPlugins/bmc_block_io/src/common.cpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace BMCBlockIoPlu { @@ -189,13 +190,16 @@ std::vector SplitString(const std::string& str, const std::string& size_t pos = 0; while (true) { size_t split_pos = str.find(split, pos); - if (split_pos != std::string::npos) { - result.push_back(str.substr(pos, split_pos - pos)); - pos = split_pos + split.size(); - } else { - result.push_back(str.substr(pos)); + std::string substring = str.substr(pos, split_pos - pos); + + if (!substring.empty()) { + result.push_back(substring); + } + + if (split_pos == std::string::npos) { break; } + pos = split_pos + split.size(); } return result; } diff --git a/src/sentryPlugins/bmc_block_io/src/main.cpp b/src/sentryPlugins/bmc_block_io/src/main.cpp index 8aeb3d0..29773b0 100644 --- a/src/sentryPlugins/bmc_block_io/src/main.cpp +++ b/src/sentryPlugins/bmc_block_io/src/main.cpp @@ -60,8 +60,6 @@ int main(int argc, char* argv[]) } else { BMCBlockIoPlu::Logger::GetInstance().SetLevel(config.logLevel); blockIo.SetPatrolInterval(config.patrolSeconds); - blockIo.SetUserName(config.userName); - blockIo.SetPassWd(config.passWd); } std::thread configMonitor([&] { @@ -96,16 +94,6 @@ int main(int argc, char* argv[]) BMC_LOG_INFO << "Patrol interval update to " << config.patrolSeconds; blockIo.SetPatrolInterval(config.patrolSeconds); } - if (newConfig.userName != config.userName) { - config.userName = newConfig.userName; - BMC_LOG_INFO << "BMC userName update to " << config.userName; - blockIo.SetUserName(config.userName); - } - if (newConfig.passWd != config.passWd) { - config.passWd = newConfig.passWd; - BMC_LOG_INFO << "BMC passWd update"; - blockIo.SetPassWd(config.passWd); - } } } } -- Gitee From 765e41fa3b02dec9a04cd6f21f113d0b3109dbfc Mon Sep 17 00:00:00 2001 From: hewh Date: Tue, 16 Sep 2025 11:06:40 +0800 Subject: [PATCH 14/62] add test . 
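With the query now issued in-band through a plain `ipmitool raw` call, the BMC username and password options have no consumer left, so their parsing is removed here. For reference, assembling the request reduces to roughly the following Python sketch (the raw bytes are copied from the comment in cbmcblockio.cpp; the start index is split little-endian, low byte first; the function name is illustrative):

    def build_ipmi_command(start_index: int) -> str:
        low = start_index & 0xFF
        high = (start_index >> 8) & 0xFF
        return ("ipmitool raw 0x30 0x94 0xDB 0x07 0x00 0x40 0x00 "
                f"0x{low:02x} 0x{high:02x} 0x01 0x02")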
--- src/sentryPlugins/bmc_block_io/src/common.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/sentryPlugins/bmc_block_io/src/common.cpp b/src/sentryPlugins/bmc_block_io/src/common.cpp index 6d98f83..cd01b8a 100644 --- a/src/sentryPlugins/bmc_block_io/src/common.cpp +++ b/src/sentryPlugins/bmc_block_io/src/common.cpp @@ -82,16 +82,6 @@ int ParseConfig(const std::string& path, PluConfig& config) return true; }}; - configMap["bmc_username"] = {true, false, [&](const std::string& value) { - config.userName = value; - return true; - }}; - - configMap["bmc_passwd"] = {true, false, [&](const std::string& value) { - config.passWd = value; - return true; - }}; - std::string line; while (std::getline(file, line)) { line = Trim(line); -- Gitee From df576b7a333217a6d849e293b89d2ddc2ae8af23 Mon Sep 17 00:00:00 2001 From: hewanhan Date: Tue, 16 Sep 2025 19:51:28 +0800 Subject: [PATCH 15/62] add test . --- Makefile | 10 +++++----- src/libs/libxalarm/register_xalarm.c | 8 ++------ src/libs/pyxalarm/register_xalarm.py | 2 +- src/services/xalarm/xalarm_daemon.py | 2 +- src/services/xalarm/xalarm_server.py | 4 ++-- 5 files changed, 11 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index bd3bcb9..6009ca2 100644 --- a/Makefile +++ b/Makefile @@ -76,7 +76,7 @@ isentry: install -d -m 700 $(LOGSAVEDIR)/sysSentry install -d -m 700 $(VARLIB)/logrotate-syssentry install -d -m 700 $(PYDIR)/syssentry - install -d -m 700 $(PYDIR)/xalarm + install -d -m 755 $(PYDIR)/xalarm install -d -m 700 $(PYDIR)/sentryCollector install -d -m 700 $(PYDIR)/$(PKGVEREGG) install -d -m 700 $(ETCDIR)/sysconfig @@ -97,7 +97,7 @@ isentry: ## 安装python源代码文件到相应的目录 install -m 550 src/build/usr/lib/$(PYNAME)/site-packages/services/syssentry/* $(PYDIR)/syssentry - install -m 550 src/build/usr/lib/$(PYNAME)/site-packages/services/xalarm/* $(PYDIR)/xalarm + install -m 555 src/build/usr/lib/$(PYNAME)/site-packages/services/xalarm/* $(PYDIR)/xalarm install -m 550 src/build/usr/lib/$(PYNAME)/site-packages/services/sentryCollector/* $(PYDIR)/sentryCollector install -m 550 src/build/usr/lib/$(PYNAME)/site-packages/$(PKGVEREGG)/* $(PYDIR)/$(PKGVEREGG) @@ -146,14 +146,14 @@ isentry: install -m 550 src/libsentry/python/pySentryCollector/collect_plugin.py $(PYDIR)/sentryCollector # libxalarm - install -m 550 $(CURLIBDIR)/build/libxalarm/libxalarm.so $(LIBINSTALLDIR) + install -m 555 $(CURLIBDIR)/build/libxalarm/libxalarm.so $(LIBINSTALLDIR) # libxalarm-devel - install -d -m 700 $(INCLUDEDIR)/xalarm + install -d -m 755 $(INCLUDEDIR)/xalarm install -m 644 $(CURLIBDIR)/libxalarm/register_xalarm.h $(INCLUDEDIR)/xalarm/ # pyxalarm - install -m 550 src/libs/pyxalarm/register_xalarm.py $(PYDIR)/xalarm + install -m 555 src/libs/pyxalarm/register_xalarm.py $(PYDIR)/xalarm # log utils install -d -m 700 $(INCLUDEDIR)/libsentry diff --git a/src/libs/libxalarm/register_xalarm.c b/src/libs/libxalarm/register_xalarm.c index 1ddd0ae..4204fce 100644 --- a/src/libs/libxalarm/register_xalarm.c +++ b/src/libs/libxalarm/register_xalarm.c @@ -31,8 +31,8 @@ #define DIR_XALARM "/var/run/xalarm" #define PATH_REG_ALARM "/var/run/xalarm/alarm" #define PATH_REPORT_ALARM "/var/run/xalarm/report" -#define ALARM_DIR_PERMISSION 0750 -#define ALARM_SOCKET_PERMISSION 0600 +#define ALARM_DIR_PERMISSION 0755 +#define ALARM_SOCKET_PERMISSION 0666 #define TIME_UNIT_MILLISECONDS 1000 #define MAX_PARAS_LEN 8191 @@ -121,10 +121,6 @@ static int create_unix_socket(const char *path) printf("create_unix_socket: connect alarm_addr failed, ret: %d\n", ret); goto 
release_socket; } - if (chmod(path, ALARM_SOCKET_PERMISSION) < 0) { - printf("chmod %s failed: %s\n", path, strerror(errno)); - goto release_socket; - } return fd; diff --git a/src/libs/pyxalarm/register_xalarm.py b/src/libs/pyxalarm/register_xalarm.py index 7416478..91debf7 100644 --- a/src/libs/pyxalarm/register_xalarm.py +++ b/src/libs/pyxalarm/register_xalarm.py @@ -16,7 +16,7 @@ MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) DIR_XALARM = "/var/run/xalarm" PATH_REG_ALARM = "/var/run/xalarm/alarm" PATH_REPORT_ALARM = "/var/run/xalarm/report" -ALARM_DIR_PERMISSION = 0o0750 +ALARM_DIR_PERMISSION = 0o0755 ALARM_REG_SOCK_PERMISSION = 0o0700 ALARM_SOCKET_PERMISSION = 0o0700 TIME_UNIT_MILLISECONDS = 1000 diff --git a/src/services/xalarm/xalarm_daemon.py b/src/services/xalarm/xalarm_daemon.py index 3ab211c..14a41cd 100644 --- a/src/services/xalarm/xalarm_daemon.py +++ b/src/services/xalarm/xalarm_daemon.py @@ -25,7 +25,7 @@ from .xalarm_config import config_init, get_log_level from .xalarm_server import server_loop, SOCK_FILE ALARM_DIR = "/var/run/xalarm" -ALARM_DIR_PERMISSION = 0o750 +ALARM_DIR_PERMISSION = 0o755 ALARM_LOGFILE = '/var/log/sysSentry/xalarm.log' XALARMD_PID_FILE = "/var/run/xalarm/xalarmd.pid" PID_FILE_FLOCK = None diff --git a/src/services/xalarm/xalarm_server.py b/src/services/xalarm/xalarm_server.py index 3ed0a24..0ada5ca 100644 --- a/src/services/xalarm/xalarm_server.py +++ b/src/services/xalarm/xalarm_server.py @@ -35,8 +35,8 @@ ALARM_DIR = "/var/run/xalarm" USER_RECV_SOCK = "/var/run/xalarm/alarm" SOCK_FILE = "/var/run/xalarm/report" ALARM_REPORT_LEN = 8216 -ALARM_DIR_PERMISSION = 0o750 -SOCKET_FILE_PERMISSON = 0o600 +ALARM_DIR_PERMISSION = 0o755 +SOCKET_FILE_PERMISSON = 0o666 PERMISION_MASK = 0o777 PEROID_CHECK_TIME = 3 ALARM_LISTEN_QUEUE_LEN = 5 -- Gitee From 9adfaa97245020f835f8309a33cf8295e09f3575 Mon Sep 17 00:00:00 2001 From: hewh Date: Wed, 17 Sep 2025 17:36:29 +0800 Subject: [PATCH 16/62] add test . 
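The plugin now remembers which device IDs raised an alarm during the previous patrol and, after a successful query round, reports a recover event for every ID that has disappeared. The bookkeeping is essentially a set difference, sketched here in Python (function and argument names are illustrative):

    def clear_vanished_alarms(last_ids, current_ids, report_clear):
        """Call report_clear(device_id) for each device that alarmed in the
        previous patrol but is absent from the current one."""
        for device_id in set(last_ids) - set(current_ids):
            report_clear(device_id)
        return set(current_ids)   # becomes last_ids for the next patrol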
--- .../bmc_block_io/include/cbmcblockio.h | 3 ++ .../bmc_block_io/src/cbmcblockio.cpp | 32 ++++++++++++++----- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h b/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h index 488b700..a9829fe 100644 --- a/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h +++ b/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h @@ -14,6 +14,7 @@ #include #include #include +#include #include namespace BMCBlockIoPlu { @@ -59,6 +60,8 @@ private: std::mutex m_mutex; std::condition_variable m_cv; std::string m_bmcIp; + std::set m_lastDeviceIds; + std::set m_currentDeviceIds; int m_patrolSeconds; }; } diff --git a/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp b/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp index 9711cbb..72cc819 100644 --- a/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp +++ b/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp @@ -42,8 +42,7 @@ const std::string JSON_KEY_DETAILS = "details"; CBMCBlockIo::CBMCBlockIo() : m_running(false), - m_patrolSeconds(BMCPLU_PATROL_DEFAULT), - m_bmcIp("") + m_patrolSeconds(BMCPLU_PATROL_DEFAULT) { } @@ -174,26 +173,26 @@ db 07 00 03 00 03 00 39 00 00 02 2f ab 91 68 00 02 04 00 00 00 00 int CBMCBlockIo::QueryEvents() { uint16_t currentIndex = 0; - int ret = BMCPLU_FAILED; + int ret = BMCPLU_SUCCESS; + m_currentDeviceIds.clear(); while (true) { std::string cmd = BuildIPMICommand(currentIndex); std::vector hexBytes = ExecuteIPMICommand(cmd); if (hexBytes.empty()) { - ret = BMCPLU_SUCCESS; break; } ResponseHeader header = ParseResponseHeader(hexBytes); if (!header.valid) { + ret = BMCPLU_FAILED; break; } - BMC_LOG_DEBUG << "Total events: " << header.totalEvents + BMC_LOG_INFO << "Total events: " << header.totalEvents << ", returned: " << static_cast(header.eventCount) << ", current index: " << currentIndex; if (header.eventCount == 0) { - ret = BMCPLU_SUCCESS; break; } @@ -201,6 +200,7 @@ int CBMCBlockIo::QueryEvents() if (hexBytes.size() != expectedSize) { BMC_LOG_ERROR << "Response size invalid. 
Expected: " << expectedSize << ", Actual: " << hexBytes.size(); + ret = BMCPLU_FAILED; break; } @@ -208,10 +208,20 @@ int CBMCBlockIo::QueryEvents() currentIndex += header.eventCount; if (currentIndex >= header.totalEvents) { - ret = BMCPLU_SUCCESS; break; } } + + if (ret == BMCPLU_SUCCESS) { + for (const auto& id : m_lastDeviceIds) { + if (m_currentDeviceIds.find(id) == m_currentDeviceIds.end()) { + uint32_t timeNow = static_cast(std::chrono::system_clock::to_time_t(std::chrono::system_clock::now())); + IPMIEvent clearEvent = {ALARM_CLEAR_CODE, timeNow, 0, 0x02, id, true}; + ReportAlarm(clearEvent); + } + } + m_lastDeviceIds = m_currentDeviceIds; + } return ret; } @@ -362,13 +372,19 @@ void CBMCBlockIo::ReportAlarm(const IPMIEvent& event) uint8_t ucAlarmType = 0; if (event.alarmTypeCode == ALARM_OCCUR_CODE) { ucAlarmType = ALARM_TYPE_OCCUR; + m_currentDeviceIds.insert(event.deviceId); } else if (event.alarmTypeCode == ALARM_CLEAR_CODE) { ucAlarmType = ALARM_TYPE_RECOVER; } else { - BMC_LOG_ERROR << "Skipping unknown alarm type: 0x" + BMC_LOG_DEBUG << "Skipping unknown alarm type: 0x" << std::hex << event.alarmTypeCode; return; } + + BMC_LOG_INFO << "Report alarm, type: " << static_cast(ucAlarmType); + BMC_LOG_INFO << "level: " << static_cast(ucAlarmLevel); + BMC_LOG_INFO << "deviceId: " << static_cast(event.deviceId); + BMC_LOG_INFO << "timestamp: " << event.timestamp; json_object* jObject = json_object_new_object(); json_object_object_add(jObject, JSON_KEY_ALARM_SOURCE.c_str(), json_object_new_string(BMC_TASK_NAME.c_str())); json_object_object_add(jObject, JSON_KEY_DRIVER_NAME.c_str(), json_object_new_string(std::to_string(event.deviceId).c_str())); -- Gitee From 5992155009dc08056479f1c785bf237f1565b9d2 Mon Sep 17 00:00:00 2001 From: hewh Date: Mon, 22 Sep 2025 17:09:40 +0800 Subject: [PATCH 17/62] add test . --- src/sentryPlugins/bmc_block_io/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/sentryPlugins/bmc_block_io/CMakeLists.txt b/src/sentryPlugins/bmc_block_io/CMakeLists.txt index 2234dee..70ed354 100644 --- a/src/sentryPlugins/bmc_block_io/CMakeLists.txt +++ b/src/sentryPlugins/bmc_block_io/CMakeLists.txt @@ -6,6 +6,9 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/output) +set(CMAKE_SKIP_BUILD_RPATH TRUE) +set(CMAKE_SKIP_INSTALL_RPATH TRUE) + include_directories( ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/../../libs/libxalarm -- Gitee From 779a65006b4f02ccc82d2ab59f39dcfb3e14cbe3 Mon Sep 17 00:00:00 2001 From: hewh Date: Tue, 23 Sep 2025 15:59:59 +0800 Subject: [PATCH 18/62] add test . 
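ai_block_io and avg_block_io get dedicated window settings for iodump detection, falling back to the latency window values when the new keys are missing. A condensed configparser sketch of that fallback; the shipped code additionally range-checks the values and raises an alarm on invalid input:

    import configparser

    def read_algorithm_windows(cfg: configparser.ConfigParser):
        win_size = cfg.getint('algorithm', 'win_size', fallback=30)
        win_threshold = cfg.getint('algorithm', 'win_threshold', fallback=6)
        # iodump windows default to the latency windows when unset
        win_size_iodump = cfg.getint('algorithm', 'win_size_iodump',
                                     fallback=win_size)
        win_threshold_iodump = cfg.getint('algorithm', 'win_threshold_iodump',
                                          fallback=win_threshold)
        return win_size, win_threshold, win_size_iodump, win_threshold_iodump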
--- src/sentryPlugins/ai_block_io/ai_block_io.py | 7 ++-- .../ai_block_io/config_parser.py | 34 +++++++++++++++++++ .../avg_block_io/avg_block_io.py | 4 +-- src/sentryPlugins/avg_block_io/config.py | 24 ++++++++++++- 4 files changed, 64 insertions(+), 5 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/ai_block_io.py b/src/sentryPlugins/ai_block_io/ai_block_io.py index 0905c0d..c3fd140 100644 --- a/src/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/sentryPlugins/ai_block_io/ai_block_io.py @@ -113,6 +113,9 @@ class SlowIODetection: window_size, window_threshold = ( self._config_parser.get_window_size_and_window_minimum_threshold() ) + window_size_iodump, window_threshold_iodump = ( + self._config_parser.get_window_size_and_window_threshold_iodump() + ) for disk, metric_name_list in self._detector_name_list.items(): disk_detector = DiskDetector(disk) @@ -159,8 +162,8 @@ class SlowIODetection: abs_threshold = self._config_parser.write_iodump_lim sliding_window = SlidingWindowFactory().get_sliding_window( sliding_window_type, - queue_length=window_size, - threshold=window_threshold + queue_length=window_size_iodump, + threshold=window_threshold_iodump ) detector = Detector(metric_name, threshold, sliding_window) threshold.set_threshold(abs_threshold) diff --git a/src/sentryPlugins/ai_block_io/config_parser.py b/src/sentryPlugins/ai_block_io/config_parser.py index a918c8b..4024e44 100644 --- a/src/sentryPlugins/ai_block_io/config_parser.py +++ b/src/sentryPlugins/ai_block_io/config_parser.py @@ -75,6 +75,8 @@ class ConfigParser: "win_type": get_sliding_window_type_enum("not_continuous"), "win_size": 30, "win_threshold": 6, + "win_size_iodump": 30, + "win_threshold_iodump": 6, }, "latency_sata_ssd": { "read_avg_lim": 10000, @@ -438,6 +440,30 @@ class ConfigParser: ) ) + def _read_window_size_iodump(self, items_sliding_window: dict): + default_window_size_iodump = self._conf["algorithm"]["win_size"] + self._conf["algorithm"]["win_size_iodump"] = self._get_config_value( + items_sliding_window, + "win_size_iodump", + int, + default_window_size_iodump, + gt=0, + le=300, + ) + + def _read_window_threshold_iodump(self, items_sliding_window: dict): + default_window_threshold_iodump = self._conf["algorithm"]["win_threshold"] + self._conf["algorithm"]["win_threshold_iodump"] = ( + self._get_config_value( + items_sliding_window, + "win_threshold_iodump", + int, + default_window_threshold_iodump, + gt=0, + le=self._conf["algorithm"]["win_size_iodump"], + ) + ) + def read_config_from_file(self): if not os.path.exists(self._config_file_name): init_log_format(self._conf["log"]["level"]) @@ -498,6 +524,8 @@ class ConfigParser: self._read_sliding_window_type(items_algorithm) self._read_window_size(items_algorithm) self._read_window_minimum_threshold(items_algorithm) + self._read_window_size_iodump(items_algorithm) + self._read_window_threshold_iodump(items_algorithm) if con.has_section("latency_sata_ssd"): items_latency_sata_ssd = dict(con.items("latency_sata_ssd")) @@ -708,6 +736,12 @@ class ConfigParser: self._conf["algorithm"]["win_threshold"], ) + def get_window_size_and_window_threshold_iodump(self): + return ( + self._conf["algorithm"]["win_size_iodump"], + self._conf["algorithm"]["win_threshold_iodump"], + ) + @property def period_time(self): return self._conf["common"]["period_time"] diff --git a/src/sentryPlugins/avg_block_io/avg_block_io.py b/src/sentryPlugins/avg_block_io/avg_block_io.py index 9f6af99..f1e0e24 100644 --- a/src/sentryPlugins/avg_block_io/avg_block_io.py +++ 
b/src/sentryPlugins/avg_block_io/avg_block_io.py @@ -58,7 +58,7 @@ def init_io_win(io_dic, config, common_param): logging.debug("Successfully create {}-{}-{}-latency window".format(disk_name, stage_name, rw)) if iodump_lim_value is not None: - io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value) + io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size_iodump"], window_threshold=io_dic["win_threshold_iodump"], abnormal_time=iodump_lim_value) logging.debug("Successfully create {}-{}-{}-iodump window".format(disk_name, stage_name, rw)) return io_data, io_avg_value @@ -178,7 +178,7 @@ def main(): # 初始化窗口 -- config读取,对应is_iocollect_valid返回的结果 # step1. 解析公共配置 --- algorithm - io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config) + io_dic["win_size"], io_dic["win_threshold"], io_dic["win_size_iodump"], io_dic["win_threshold_iodump"] = read_config_algorithm(config) # step2. 解析公共配置 --- latency_xxx common_param = read_config_latency(config) diff --git a/src/sentryPlugins/avg_block_io/config.py b/src/sentryPlugins/avg_block_io/config.py index c1e8ab1..60ec9bc 100644 --- a/src/sentryPlugins/avg_block_io/config.py +++ b/src/sentryPlugins/avg_block_io/config.py @@ -25,6 +25,8 @@ CONF_COMMON_PER_TIME = 'period_time' CONF_ALGO = 'algorithm' CONF_ALGO_SIZE = 'win_size' CONF_ALGO_THRE = 'win_threshold' +CONF_ALGO_SIZE_IODUMP = 'win_size_iodump' +CONF_ALGO_THRE_IODUMP = 'win_threshold_iodump' CONF_LATENCY = 'latency_{}' CONF_IODUMP = 'iodump' @@ -171,7 +173,27 @@ def read_config_algorithm(config): win_threshold = DEFAULT_PARAM[CONF_ALGO]['win_threshold'] logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_THRE}, use {win_threshold} as default") - return win_size, win_threshold + try: + win_size_iodump = int(config.get(CONF_ALGO, CONF_ALGO_SIZE_IODUMP)) + if not (1 <= win_size_iodump <= 300): + raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE_IODUMP}") + except ValueError: + report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE_IODUMP} config") + except configparser.NoOptionError: + win_size_iodump = win_size + logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_SIZE_IODUMP}, use {win_size_iodump} as default") + + try: + win_threshold_iodump = int(config.get(CONF_ALGO, CONF_ALGO_THRE_IODUMP)) + if win_threshold_iodump < 1 or win_threshold_iodump > 300 or win_threshold_iodump > win_size_iodump: + raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE_IODUMP}") + except ValueError: + report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE_IODUMP} config") + except configparser.NoOptionError: + win_threshold_iodump = win_threshold + logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_THRE_IODUMP}, use {win_threshold_iodump} as default") + + return win_size, win_threshold, win_size_iodump, win_threshold_iodump def read_config_latency(config): -- Gitee From d358a66a18dc7e891554dba086676dc9d08eda0d Mon Sep 17 00:00:00 2001 From: hewh Date: Tue, 23 Sep 2025 17:19:04 +0800 Subject: [PATCH 19/62] add test . 
--- config/plugins/ai_block_io.ini | 2 ++ config/plugins/avg_block_io.ini | 2 ++ 2 files changed, 4 insertions(+) diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini index 53ac486..284c412 100644 --- a/config/plugins/ai_block_io.ini +++ b/config/plugins/ai_block_io.ini @@ -15,6 +15,8 @@ boxplot_parameter=1.5 win_type=not_continuous win_size=30 win_threshold=6 +win_size_iodump=30 +win_threshold_iodump=6 [latency_sata_ssd] read_avg_lim=10000 diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini index 3b4ee33..57a0ba6 100644 --- a/config/plugins/avg_block_io.ini +++ b/config/plugins/avg_block_io.ini @@ -10,6 +10,8 @@ period_time=1 [algorithm] win_size=30 win_threshold=6 +win_size_iodump=30 +win_threshold_iodump=6 [latency_nvme_ssd] read_avg_lim=10000 -- Gitee From 83cf034e615f7bd6fe2a7a995dc0a1c74b855f87 Mon Sep 17 00:00:00 2001 From: hewh Date: Wed, 24 Sep 2025 11:23:00 +0800 Subject: [PATCH 20/62] add test . --- .../bmc_block_io/include/cbmcblockio.h | 1 + .../bmc_block_io/include/common.h | 2 + .../bmc_block_io/include/configure.h | 3 ++ .../bmc_block_io/src/cbmcblockio.cpp | 19 ++++++++- src/sentryPlugins/bmc_block_io/src/common.cpp | 41 +++++++++++++++++++ src/sentryPlugins/bmc_block_io/src/logger.cpp | 14 +------ src/sentryPlugins/bmc_block_io/src/main.cpp | 22 ++++++++-- 7 files changed, 84 insertions(+), 18 deletions(-) diff --git a/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h b/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h index a9829fe..a579c4a 100644 --- a/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h +++ b/src/sentryPlugins/bmc_block_io/include/cbmcblockio.h @@ -63,6 +63,7 @@ private: std::set m_lastDeviceIds; std::set m_currentDeviceIds; int m_patrolSeconds; + int m_alarmId; }; } #endif diff --git a/src/sentryPlugins/bmc_block_io/include/common.h b/src/sentryPlugins/bmc_block_io/include/common.h index e23f4b9..20fa8b1 100644 --- a/src/sentryPlugins/bmc_block_io/include/common.h +++ b/src/sentryPlugins/bmc_block_io/include/common.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "configure.h" #include "logger.h" @@ -34,6 +35,7 @@ namespace BMCBlockIoPlu { std::string Trim(const std::string& str); bool IsValidNumber(const std::string& str, int& num); int ParseConfig(const std::string& path, PluConfig& config); +std::map> parseModConfig(const std::string& path); std::string ExtractFileName(const std::string& path); int ExecCommand(const std::string& cmd, std::vector& result); std::string ByteToHex(uint8_t byte); diff --git a/src/sentryPlugins/bmc_block_io/include/configure.h b/src/sentryPlugins/bmc_block_io/include/configure.h index f253fb4..380b6b8 100644 --- a/src/sentryPlugins/bmc_block_io/include/configure.h +++ b/src/sentryPlugins/bmc_block_io/include/configure.h @@ -14,10 +14,13 @@ namespace BMCBlockIoPlu { const std::string BMCPLU_CONFIG_PATH = "/etc/sysSentry/plugins/bmc_block_io.ini"; const std::string BMCPLU_LOG_PATH = "/var/log/sysSentry/bmc_block_io.log"; +const std::string BMCPLU_MOD_CONFIG = "/etc/sysSentry/tasks/bmc_block_io.mod"; const int BMCPLU_PATROL_MIN = 60; const int BMCPLU_PATROL_MAX = 3600; const int BMCPLU_PATROL_DEFAULT = 300; const int BMCPLU_CONFIG_CHECK_CYCLE = 10; // seconds const int BMCPLU_DEFAULT_SLEEP_CYCLE = 3; // seconds +const int BMCPLU_LOGFILE_CHECK_CYCLE = 30; // second +const int BMCPLU_DEFAULT_ALARM_ID = 1002; } #endif diff --git a/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp b/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp index 
72cc819..e2867d9 100644 --- a/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp +++ b/src/sentryPlugins/bmc_block_io/src/cbmcblockio.cpp @@ -12,6 +12,7 @@ #include #include #include +#include extern "C" { #include "register_xalarm.h" } @@ -21,7 +22,6 @@ extern "C" { namespace BMCBlockIoPlu { -const int BMC_ALARM_ID = 1002; const int RESP_HEADER_SIZE = 7; const int EVENT_SIZE = 15; const uint32_t ALARM_OCCUR_CODE = 0x02000039; @@ -39,11 +39,26 @@ const std::string JSON_KEY_IO_TYPE = "io_type"; const std::string JSON_KEY_REASON = "reason"; const std::string JSON_KEY_BLOCK_STACK = "block_stack"; const std::string JSON_KEY_DETAILS = "details"; +const std::string MOD_SECTION_COMMON = "common"; +const std::string MOD_COMMON_ALARM_ID = "alarm_id"; CBMCBlockIo::CBMCBlockIo() : m_running(false), m_patrolSeconds(BMCPLU_PATROL_DEFAULT) { + std::map> modConfig = parseModConfig(BMCPLU_MOD_CONFIG); + if (modConfig.find(MOD_SECTION_COMMON) != modConfig.end()) { + auto commonSection = modConfig[MOD_SECTION_COMMON]; + if (commonSection.find(MOD_COMMON_ALARM_ID) != commonSection.end()) { + int alarmId = 0; + if (IsValidNumber(commonSection[MOD_COMMON_ALARM_ID], alarmId) && alarmId > 0) { + m_alarmId = alarmId; + } else { + m_alarmId = BMCPLU_DEFAULT_ALARM_ID; + BMC_LOG_WARNING << "Invalid alarm_id in mod config, use default alarm id: " << BMCPLU_DEFAULT_ALARM_ID; + } + } + } } CBMCBlockIo::~CBMCBlockIo() @@ -393,7 +408,7 @@ void CBMCBlockIo::ReportAlarm(const IPMIEvent& event) json_object_object_add(jObject, JSON_KEY_BLOCK_STACK.c_str(), json_object_new_string("rq_driver")); json_object_object_add(jObject, JSON_KEY_DETAILS.c_str(), json_object_new_string("{}}")); const char *jData = json_object_to_json_string(jObject); - int ret = xalarm_Report(BMC_ALARM_ID, ucAlarmLevel, ucAlarmType, const_cast(jData)); + int ret = xalarm_Report(m_alarmId, ucAlarmLevel, ucAlarmType, const_cast(jData)); if (ret != RETURE_CODE_SUCCESS) { BMC_LOG_ERROR << "Failed to xalarm_Report, ret: " << ret; } diff --git a/src/sentryPlugins/bmc_block_io/src/common.cpp b/src/sentryPlugins/bmc_block_io/src/common.cpp index cd01b8a..94a9b92 100644 --- a/src/sentryPlugins/bmc_block_io/src/common.cpp +++ b/src/sentryPlugins/bmc_block_io/src/common.cpp @@ -123,6 +123,47 @@ int ParseConfig(const std::string& path, PluConfig& config) return BMCPLU_SUCCESS; } +std::map> parseModConfig(const std::string& path) +{ + std::map> result; + + std::ifstream file(path); + if (!file.is_open()) { + BMC_LOG_ERROR << "Failed to open mod file: " << path; + return result; + } + + std::string line; + std::string currentSection; + while (std::getline(file, line)) { + line = Trim(line); + if (line.empty() || line[0] == '#') { + continue; + } + + // check for section + if (line[0] == '[' && line[line.length() - 1] == ']') { + currentSection = Trim(line.substr(1, line.length() - 2)); + if (!currentSection.empty()) { + result[currentSection] = std::map(); + } + continue; + } + + // check for key=value + size_t eqPos = line.find('='); + if (eqPos != std::string::npos && !currentSection.empty()) { + std::string key = Trim(line.substr(0, eqPos)); + std::string value = Trim(line.substr(eqPos + 1)); + if (!key.empty()) { + result[currentSection][key] = value; + } + } + } + + return result; +} + std::string ExtractFileName(const std::string& path) { size_t lastSlashPos = path.find_last_of('/'); diff --git a/src/sentryPlugins/bmc_block_io/src/logger.cpp b/src/sentryPlugins/bmc_block_io/src/logger.cpp index f0c84b1..f80e46c 100644 --- 
a/src/sentryPlugins/bmc_block_io/src/logger.cpp +++ b/src/sentryPlugins/bmc_block_io/src/logger.cpp @@ -65,9 +65,8 @@ void Logger::OpenLogFile() void Logger::CheckFileState() { - const int timeInterval = 30; // second std::time_t timeNow = std::time(nullptr); - if (timeNow - m_checkTime < timeInterval) { + if (timeNow - m_checkTime < BMCPLU_LOGFILE_CHECK_CYCLE) { return; } @@ -82,16 +81,7 @@ void Logger::CheckFileState() return; } - bool needReopen = false; - if (fileStat.st_ino != m_inode) { - needReopen = true; - } else if (fileStat.st_dev != m_device) { - needReopen = true; - } else if (fileStat.st_size < m_fileSize) { - needReopen = true; - } - - if (needReopen) { + if (fileStat.st_ino != m_inode || fileStat.st_dev != m_device || fileStat.st_size < m_fileSize) { ReopenLogFile(); } else { m_fileSize = fileStat.st_size; diff --git a/src/sentryPlugins/bmc_block_io/src/main.cpp b/src/sentryPlugins/bmc_block_io/src/main.cpp index 29773b0..136bf13 100644 --- a/src/sentryPlugins/bmc_block_io/src/main.cpp +++ b/src/sentryPlugins/bmc_block_io/src/main.cpp @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include #include @@ -18,12 +20,15 @@ #include "common.h" std::atomic g_exit(false); +std::mutex g_mutex; +std::condition_variable g_cv; void HandleSignal(int sig) { if (sig == SIGTERM || sig == SIGINT) { g_exit = true; BMC_LOG_INFO << "Receive signal SIGTERM or SIGINT, exit."; + g_cv.notify_all(); } return; } @@ -70,9 +75,13 @@ int main(int argc, char* argv[]) } while (!g_exit) { - std::this_thread::sleep_for(std::chrono::seconds(BMCBlockIoPlu::BMCPLU_CONFIG_CHECK_CYCLE)); - if (g_exit) { - break; + { + std::unique_lock lock(g_mutex); + g_cv.wait_for(lock, std::chrono::seconds(BMCBlockIoPlu::BMCPLU_CONFIG_CHECK_CYCLE), + [] { return g_exit.load(); }); + if (g_exit) { + break; + } } struct stat st_; @@ -100,9 +109,14 @@ int main(int argc, char* argv[]) }); blockIo.Start(); while (!g_exit) { - std::this_thread::sleep_for(std::chrono::seconds(BMCBlockIoPlu::BMCPLU_DEFAULT_SLEEP_CYCLE)); + { + std::unique_lock lock(g_mutex); + g_cv.wait_for(lock, std::chrono::seconds(BMCBlockIoPlu::BMCPLU_DEFAULT_SLEEP_CYCLE), + [] { return g_exit.load(); }); + } if (!blockIo.IsRunning()) { g_exit = true; + g_cv.notify_all(); break; } } -- Gitee From 5cfa5135074de280971c235b230ed29dce58b5af Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 26 Sep 2025 16:28:36 +0800 Subject: [PATCH 21/62] add test . 
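The collector reads one iodump threshold per disk type from collector.conf and pushes it to the blk_io_hierarchy threshold node of every collected stage. Stripped of the config plumbing and the disk-type lookup, the write path looks roughly like this (path template taken from collect_io.py; the helper name and the trimmed error handling are simplifications):

    import logging

    THRESHOLD_PATH = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/threshold'

    def write_io_threshold(disk_name, stage_list, threshold):
        for stage in stage_list:
            path = THRESHOLD_PATH.format(disk_name, stage)
            try:
                with open(path, 'w') as f:
                    f.write(str(threshold))
            except FileNotFoundError:
                logging.error("The file %s does not exist.", path)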
--- config/collector.conf | 3 ++ .../sentryCollector/collect_config.py | 34 ++++++++++++++++++ src/services/sentryCollector/collect_io.py | 36 +++++++++++++++++++ 3 files changed, 73 insertions(+) diff --git a/config/collector.conf b/config/collector.conf index 56b0ed1..8913530 100644 --- a/config/collector.conf +++ b/config/collector.conf @@ -5,6 +5,9 @@ modules=io period_time=1 max_save=10 disk=default +nvme_ssd_threshold=1000 +sata_ssd_threshold=1000 +sata_hdd_threshold=1000 [log] level=info \ No newline at end of file diff --git a/src/services/sentryCollector/collect_config.py b/src/services/sentryCollector/collect_config.py index 7ca9898..25b42eb 100644 --- a/src/services/sentryCollector/collect_config.py +++ b/src/services/sentryCollector/collect_config.py @@ -31,6 +31,10 @@ CONF_IO_DISK = 'disk' CONF_IO_PERIOD_TIME_DEFAULT = 1 CONF_IO_MAX_SAVE_DEFAULT = 10 CONF_IO_DISK_DEFAULT = "default" +CONF_IO_NVME_SSD = "nvme_ssd_threshold" +CONF_IO_SATA_SSD = "sata_ssd_threshold" +CONF_IO_SATA_HDD = "sata_hdd_threshold" +CONF_IO_THRESHOLD_DEFAULT = 1000 # log CONF_LOG = 'log' @@ -144,5 +148,35 @@ class CollectConfig: logging.debug("config get_io_config: %s", result_io_config) return result_io_config + def get_io_threshold(self): + result_io_threshold = {} + io_map_value = self.load_module_config(CONF_IO) + # nvme ssd threshold + nvme_ssd_threshold = io_map_value.get(CONF_IO_NVME_SSD) + if nvme_ssd_threshold and nvme_ssd_threshold.isdigit() and int(nvme_ssd_threshold) >= CONF_IO_THRESHOLD_DEFAULT: + result_io_threshold[CONF_IO_NVME_SSD] = int(nvme_ssd_threshold) + else: + logging.warning("module_name = %s section, field = %s is incorrect, use default %d", + CONF_IO, CONF_IO_NVME_SSD, CONF_IO_THRESHOLD_DEFAULT) + result_io_threshold[CONF_IO_NVME_SSD] = CONF_IO_THRESHOLD_DEFAULT + # sata ssd threshold + sata_ssd_threshold = io_map_value.get(CONF_IO_SATA_SSD) + if sata_ssd_threshold and sata_ssd_threshold.isdigit() and int(sata_ssd_threshold) >= CONF_IO_THRESHOLD_DEFAULT: + result_io_threshold[CONF_IO_SATA_SSD] = int(sata_ssd_threshold) + else: + logging.warning("module_name = %s section, field = %s is incorrect, use default %d", + CONF_IO, CONF_IO_SATA_SSD, CONF_IO_THRESHOLD_DEFAULT) + result_io_threshold[CONF_IO_SATA_SSD] = CONF_IO_THRESHOLD_DEFAULT + # sata hdd threshold + sata_hdd_threshold = io_map_value.get(CONF_IO_SATA_HDD) + if sata_hdd_threshold and sata_hdd_threshold.isdigit() and int(sata_hdd_threshold) >= CONF_IO_THRESHOLD_DEFAULT: + result_io_threshold[CONF_IO_SATA_HDD] = int(sata_hdd_threshold) + else: + logging.warning("module_name = %s section, field = %s is incorrect, use default %d", + CONF_IO, CONF_IO_SATA_HDD, CONF_IO_THRESHOLD_DEFAULT) + result_io_threshold[CONF_IO_SATA_HDD] = CONF_IO_THRESHOLD_DEFAULT + logging.debug("config get_io_threshold: %s", result_io_threshold) + return result_io_threshold + def get_common_config(self): return {key.lower(): value for key, value in self.config['common'].items()} diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 6db28ec..76ef1f8 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -20,6 +20,8 @@ import subprocess from typing import Union from .collect_config import CollectConfig +from .collect_config import CONF_IO_NVME_SSD, CONF_IO_SATA_SSD, CONF_IO_SATA_HDD, CONF_IO_THRESHOLD_DEFAULT +from .collect_plugin import get_disk_type, DiskType Io_Category = ["read", "write", "flush", "discard"] IO_GLOBAL_DATA = {} @@ -39,6 +41,10 @@ 
class CollectIo(): def __init__(self, module_config): io_config = module_config.get_io_config() + self.io_threshold_config[CONF_IO_NVME_SSD] = CONF_IO_THRESHOLD_DEFAULT + self.io_threshold_config[CONF_IO_SATA_SSD] = CONF_IO_THRESHOLD_DEFAULT + self.io_threshold_config[CONF_IO_SATA_HDD] = CONF_IO_THRESHOLD_DEFAULT + self.module_config = module_config self.period_time = io_config['period_time'] self.max_save = io_config['max_save'] @@ -148,6 +154,35 @@ class CollectIo(): else: return round(value, 1) + def update_io_threshold(self, disk_name, stage_list): + temp_io_threshold = self.module_config.get_io_threshold() + disk_type_result = get_disk_type(disk_name) + if disk_type_result["ret"] == 0 and disk_type_result["message"] in ( + '0', + '1', + '2', + ): + disk_type = int(disk_type_result["message"]) + if disk_type == DiskType.NVME_SSD and temp_io_threshold[CONF_IO_NVME_SSD] != self.io_threshold_config[CONF_IO_NVME_SSD]: + io_threshold = self.io_threshold_config[CONF_IO_NVME_SSD] = temp_io_threshold[CONF_IO_NVME_SSD] + elif disk_type == DiskType.SATA_SSD and temp_io_threshold[CONF_IO_SATA_SSD] != self.io_threshold_config[CONF_IO_SATA_SSD]: + io_threshold = self.io_threshold_config[CONF_IO_SATA_SSD] = temp_io_threshold[CONF_IO_SATA_SSD] + elif disk_type == DiskType.SATA_HDD and temp_io_threshold[CONF_IO_SATA_HDD] != self.io_threshold_config[CONF_IO_SATA_HDD]: + io_threshold = self.io_threshold_config[CONF_IO_SATA_HDD] = temp_io_threshold[CONF_IO_SATA_HDD] + else: + return + + for stage in stage_list: + io_threshold_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/threshold'.format(disk_name, stage) + try: + with open(io_threshold_file, 'w') as file: + file.write(str(io_threshold)) + logging.debug("set %s io_dump_threshold to %d", io_threshold_file, io_threshold) + except FileNotFoundError: + logging.error("The file %s does not exist.", io_threshold_file) + except Exception as e: + logging.error("An error occurred3: %s", e) + def get_io_dump(self, disk_name, stage, category): io_dump_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/io_dump'.format(disk_name, stage) count = 0 @@ -439,6 +474,7 @@ class CollectIo(): for disk_name, stage_list in self.disk_map_stage.items(): if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: continue + self.update_io_threshold(disk_name, stage_list) self.append_period_lat(disk_name, stage_list) elapsed_time = time.time() - start_time -- Gitee From 70218355648bb92aa31715e1259ece44ce1136cb Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 26 Sep 2025 16:47:16 +0800 Subject: [PATCH 22/62] add test . 
--- src/services/sentryCollector/collect_io.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 76ef1f8..57a0a0f 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -41,9 +41,11 @@ class CollectIo(): def __init__(self, module_config): io_config = module_config.get_io_config() - self.io_threshold_config[CONF_IO_NVME_SSD] = CONF_IO_THRESHOLD_DEFAULT - self.io_threshold_config[CONF_IO_SATA_SSD] = CONF_IO_THRESHOLD_DEFAULT - self.io_threshold_config[CONF_IO_SATA_HDD] = CONF_IO_THRESHOLD_DEFAULT + self.io_threshold_config = { + CONF_IO_NVME_SSD: CONF_IO_THRESHOLD_DEFAULT, + CONF_IO_SATA_SSD: CONF_IO_THRESHOLD_DEFAULT, + CONF_IO_SATA_HDD: CONF_IO_THRESHOLD_DEFAULT + } self.module_config = module_config self.period_time = io_config['period_time'] -- Gitee From 64edcdd620077a6fc4b8a771c1e73c69d359a2eb Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 26 Sep 2025 16:51:47 +0800 Subject: [PATCH 23/62] add test . --- src/services/sentryCollector/collect_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 57a0a0f..2a2945a 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -179,7 +179,7 @@ class CollectIo(): try: with open(io_threshold_file, 'w') as file: file.write(str(io_threshold)) - logging.debug("set %s io_dump_threshold to %d", io_threshold_file, io_threshold) + logging.info("set %s io_dump_threshold to %d", io_threshold_file, io_threshold) except FileNotFoundError: logging.error("The file %s does not exist.", io_threshold_file) except Exception as e: -- Gitee From 8d2eb1f586a9e365bc9b8a0f7750e048a72cd5f7 Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 26 Sep 2025 17:15:53 +0800 Subject: [PATCH 24/62] add test . 
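The comparisons switch to the TYPE_* constant names that DiskType actually defines. The option handling around them amounts to a small validator, sketched here in Python (the 1000 default comes from collect_config.py; the lower bound of 1 matches the validation adjusted in the follow-up patch; the helper name is made up):

    import logging

    CONF_IO_THRESHOLD_DEFAULT = 1000

    def read_threshold(io_section: dict, key: str) -> int:
        value = io_section.get(key)
        if value and value.isdigit() and int(value) >= 1:
            return int(value)
        logging.warning("field = %s is incorrect, use default %d",
                        key, CONF_IO_THRESHOLD_DEFAULT)
        return CONF_IO_THRESHOLD_DEFAULT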
--- src/services/sentryCollector/collect_io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 2a2945a..a06d919 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -165,11 +165,11 @@ class CollectIo(): '2', ): disk_type = int(disk_type_result["message"]) - if disk_type == DiskType.NVME_SSD and temp_io_threshold[CONF_IO_NVME_SSD] != self.io_threshold_config[CONF_IO_NVME_SSD]: + if disk_type == DiskType.TYPE_NVME_SSD and temp_io_threshold[CONF_IO_NVME_SSD] != self.io_threshold_config[CONF_IO_NVME_SSD]: io_threshold = self.io_threshold_config[CONF_IO_NVME_SSD] = temp_io_threshold[CONF_IO_NVME_SSD] - elif disk_type == DiskType.SATA_SSD and temp_io_threshold[CONF_IO_SATA_SSD] != self.io_threshold_config[CONF_IO_SATA_SSD]: + elif disk_type == DiskType.TYPE_SATA_SSD and temp_io_threshold[CONF_IO_SATA_SSD] != self.io_threshold_config[CONF_IO_SATA_SSD]: io_threshold = self.io_threshold_config[CONF_IO_SATA_SSD] = temp_io_threshold[CONF_IO_SATA_SSD] - elif disk_type == DiskType.SATA_HDD and temp_io_threshold[CONF_IO_SATA_HDD] != self.io_threshold_config[CONF_IO_SATA_HDD]: + elif disk_type == DiskType.TYPE_SATA_HDD and temp_io_threshold[CONF_IO_SATA_HDD] != self.io_threshold_config[CONF_IO_SATA_HDD]: io_threshold = self.io_threshold_config[CONF_IO_SATA_HDD] = temp_io_threshold[CONF_IO_SATA_HDD] else: return -- Gitee From 7950a95533a15f79a116097441ced472692100b8 Mon Sep 17 00:00:00 2001 From: hewh Date: Sun, 28 Sep 2025 10:51:38 +0800 Subject: [PATCH 25/62] add test . --- src/services/sentryCollector/collect_config.py | 6 +++--- src/services/sentryCollector/collect_io.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/services/sentryCollector/collect_config.py b/src/services/sentryCollector/collect_config.py index 25b42eb..5793fa3 100644 --- a/src/services/sentryCollector/collect_config.py +++ b/src/services/sentryCollector/collect_config.py @@ -153,7 +153,7 @@ class CollectConfig: io_map_value = self.load_module_config(CONF_IO) # nvme ssd threshold nvme_ssd_threshold = io_map_value.get(CONF_IO_NVME_SSD) - if nvme_ssd_threshold and nvme_ssd_threshold.isdigit() and int(nvme_ssd_threshold) >= CONF_IO_THRESHOLD_DEFAULT: + if nvme_ssd_threshold and nvme_ssd_threshold.isdigit() and int(nvme_ssd_threshold) >= 1: result_io_threshold[CONF_IO_NVME_SSD] = int(nvme_ssd_threshold) else: logging.warning("module_name = %s section, field = %s is incorrect, use default %d", @@ -161,7 +161,7 @@ class CollectConfig: result_io_threshold[CONF_IO_NVME_SSD] = CONF_IO_THRESHOLD_DEFAULT # sata ssd threshold sata_ssd_threshold = io_map_value.get(CONF_IO_SATA_SSD) - if sata_ssd_threshold and sata_ssd_threshold.isdigit() and int(sata_ssd_threshold) >= CONF_IO_THRESHOLD_DEFAULT: + if sata_ssd_threshold and sata_ssd_threshold.isdigit() and int(sata_ssd_threshold) >= 1: result_io_threshold[CONF_IO_SATA_SSD] = int(sata_ssd_threshold) else: logging.warning("module_name = %s section, field = %s is incorrect, use default %d", @@ -169,7 +169,7 @@ class CollectConfig: result_io_threshold[CONF_IO_SATA_SSD] = CONF_IO_THRESHOLD_DEFAULT # sata hdd threshold sata_hdd_threshold = io_map_value.get(CONF_IO_SATA_HDD) - if sata_hdd_threshold and sata_hdd_threshold.isdigit() and int(sata_hdd_threshold) >= CONF_IO_THRESHOLD_DEFAULT: + if sata_hdd_threshold and sata_hdd_threshold.isdigit() and int(sata_hdd_threshold) >= 1: 
result_io_threshold[CONF_IO_SATA_HDD] = int(sata_hdd_threshold) else: logging.warning("module_name = %s section, field = %s is incorrect, use default %d", diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index a06d919..0b7e491 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -46,7 +46,6 @@ class CollectIo(): CONF_IO_SATA_SSD: CONF_IO_THRESHOLD_DEFAULT, CONF_IO_SATA_HDD: CONF_IO_THRESHOLD_DEFAULT } - self.module_config = module_config self.period_time = io_config['period_time'] self.max_save = io_config['max_save'] @@ -157,7 +156,8 @@ class CollectIo(): return round(value, 1) def update_io_threshold(self, disk_name, stage_list): - temp_io_threshold = self.module_config.get_io_threshold() + temp_config = CollectConfig() + temp_io_threshold = temp_config.get_io_threshold() disk_type_result = get_disk_type(disk_name) if disk_type_result["ret"] == 0 and disk_type_result["message"] in ( '0', -- Gitee From c48b8094b0f26df722234a3e32a1ee8e20580ddf Mon Sep 17 00:00:00 2001 From: hewh Date: Sun, 28 Sep 2025 11:57:07 +0800 Subject: [PATCH 26/62] add test . --- src/services/sentryCollector/collect_io.py | 41 ++++++++++++---------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 0b7e491..5951e1c 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -159,31 +159,36 @@ class CollectIo(): temp_config = CollectConfig() temp_io_threshold = temp_config.get_io_threshold() disk_type_result = get_disk_type(disk_name) - if disk_type_result["ret"] == 0 and disk_type_result["message"] in ( - '0', - '1', - '2', - ): + if disk_type_result["ret"] == 0 and disk_type_result["message"] in ('0', '1', '2'): disk_type = int(disk_type_result["message"]) - if disk_type == DiskType.TYPE_NVME_SSD and temp_io_threshold[CONF_IO_NVME_SSD] != self.io_threshold_config[CONF_IO_NVME_SSD]: - io_threshold = self.io_threshold_config[CONF_IO_NVME_SSD] = temp_io_threshold[CONF_IO_NVME_SSD] - elif disk_type == DiskType.TYPE_SATA_SSD and temp_io_threshold[CONF_IO_SATA_SSD] != self.io_threshold_config[CONF_IO_SATA_SSD]: - io_threshold = self.io_threshold_config[CONF_IO_SATA_SSD] = temp_io_threshold[CONF_IO_SATA_SSD] - elif disk_type == DiskType.TYPE_SATA_HDD and temp_io_threshold[CONF_IO_SATA_HDD] != self.io_threshold_config[CONF_IO_SATA_HDD]: - io_threshold = self.io_threshold_config[CONF_IO_SATA_HDD] = temp_io_threshold[CONF_IO_SATA_HDD] - else: - return - for stage in stage_list: io_threshold_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/threshold'.format(disk_name, stage) try: - with open(io_threshold_file, 'w') as file: - file.write(str(io_threshold)) - logging.info("set %s io_dump_threshold to %d", io_threshold_file, io_threshold) + with open(io_threshold_file, 'r') as file: + current_threshold = file.read().strip() except FileNotFoundError: logging.error("The file %s does not exist.", io_threshold_file) + continue except Exception as e: - logging.error("An error occurred3: %s", e) + logging.error("An error occurred while reading: %s", e) + continue + + if disk_type == DiskType.TYPE_NVME_SSD: + config_threshold = str(temp_io_threshold[CONF_IO_NVME_SSD]) + elif disk_type == DiskType.TYPE_SATA_SSD: + config_threshold = str(temp_io_threshold[CONF_IO_SATA_SSD]) + elif disk_type == DiskType.TYPE_SATA_HDD: + config_threshold = 
str(temp_io_threshold[CONF_IO_SATA_HDD]) + else: + continue + + if current_threshold != config_threshold: + try: + with open(io_threshold_file, 'w') as file: + file.write(config_threshold) + logging.info("update %s io_dump_threshold from %s to %s", io_threshold_file, current_threshold, config_threshold) + except Exception as e: + logging.error("An error occurred while writing: %s", e) def get_io_dump(self, disk_name, stage, category): io_dump_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/io_dump'.format(disk_name, stage) -- Gitee From 0635bf21c7908dea44b410c626bdb5991ab794c4 Mon Sep 17 00:00:00 2001 From: hewh Date: Mon, 29 Sep 2025 10:16:22 +0800 Subject: [PATCH 27/62] add test . --- config/plugins/ai_block_io.ini | 1 - config/plugins/avg_block_io.ini | 1 - src/sentryPlugins/ai_block_io/ai_block_io.py | 7 ++---- .../ai_block_io/config_parser.py | 22 ++----------------- .../avg_block_io/avg_block_io.py | 9 +++++--- src/sentryPlugins/avg_block_io/config.py | 18 ++++----------- .../avg_block_io/extra_logger.py | 21 ++++++++++++++---- .../avg_block_io/stage_window.py | 5 +++++ src/sentryPlugins/avg_block_io/utils.py | 9 +++++++- 9 files changed, 44 insertions(+), 49 deletions(-) diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini index 284c412..5f55ac7 100644 --- a/config/plugins/ai_block_io.ini +++ b/config/plugins/ai_block_io.ini @@ -15,7 +15,6 @@ boxplot_parameter=1.5 win_type=not_continuous win_size=30 win_threshold=6 -win_size_iodump=30 win_threshold_iodump=6 [latency_sata_ssd] diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini index 57a0ba6..061839c 100644 --- a/config/plugins/avg_block_io.ini +++ b/config/plugins/avg_block_io.ini @@ -10,7 +10,6 @@ period_time=1 [algorithm] win_size=30 win_threshold=6 -win_size_iodump=30 win_threshold_iodump=6 [latency_nvme_ssd] diff --git a/src/sentryPlugins/ai_block_io/ai_block_io.py b/src/sentryPlugins/ai_block_io/ai_block_io.py index c3fd140..fa0bbf7 100644 --- a/src/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/sentryPlugins/ai_block_io/ai_block_io.py @@ -110,12 +110,9 @@ class SlowIODetection: train_data_duration, train_update_duration, slow_io_detection_frequency ) sliding_window_type = self._config_parser.sliding_window_type - window_size, window_threshold = ( + window_size, window_threshold, window_threshold_iodump = ( self._config_parser.get_window_size_and_window_minimum_threshold() ) - window_size_iodump, window_threshold_iodump = ( - self._config_parser.get_window_size_and_window_threshold_iodump() - ) for disk, metric_name_list in self._detector_name_list.items(): disk_detector = DiskDetector(disk) @@ -162,7 +159,7 @@ class SlowIODetection: abs_threshold = self._config_parser.write_iodump_lim sliding_window = SlidingWindowFactory().get_sliding_window( sliding_window_type, - queue_length=window_size_iodump, + queue_length=window_size, threshold=window_threshold_iodump ) detector = Detector(metric_name, threshold, sliding_window) diff --git a/src/sentryPlugins/ai_block_io/config_parser.py b/src/sentryPlugins/ai_block_io/config_parser.py index 4024e44..6f93580 100644 --- a/src/sentryPlugins/ai_block_io/config_parser.py +++ b/src/sentryPlugins/ai_block_io/config_parser.py @@ -75,7 +75,6 @@ class ConfigParser: "win_type": get_sliding_window_type_enum("not_continuous"), "win_size": 30, "win_threshold": 6, - "win_size_iodump": 30, "win_threshold_iodump": 6, }, "latency_sata_ssd": { @@ -440,19 +439,8 @@ class ConfigParser: ) ) - def _read_window_size_iodump(self, 
items_sliding_window: dict): - default_window_size_iodump = self._conf["algorithm"]["win_size"] - self._conf["algorithm"]["win_size_iodump"] = self._get_config_value( - items_sliding_window, - "win_size_iodump", - int, - default_window_size_iodump, - gt=0, - le=300, - ) - def _read_window_threshold_iodump(self, items_sliding_window: dict): - default_window_threshold_iodump = self._conf["algorithm"]["win_threshold"] + default_window_threshold_iodump = self.DEFAULT_CONF["algorithm"]["win_threshold_iodump"] self._conf["algorithm"]["win_threshold_iodump"] = ( self._get_config_value( items_sliding_window, @@ -460,7 +448,7 @@ class ConfigParser: int, default_window_threshold_iodump, gt=0, - le=self._conf["algorithm"]["win_size_iodump"], + le=self._conf["algorithm"]["win_size"], ) ) @@ -524,7 +512,6 @@ class ConfigParser: self._read_sliding_window_type(items_algorithm) self._read_window_size(items_algorithm) self._read_window_minimum_threshold(items_algorithm) - self._read_window_size_iodump(items_algorithm) self._read_window_threshold_iodump(items_algorithm) if con.has_section("latency_sata_ssd"): @@ -734,11 +721,6 @@ class ConfigParser: return ( self._conf["algorithm"]["win_size"], self._conf["algorithm"]["win_threshold"], - ) - - def get_window_size_and_window_threshold_iodump(self): - return ( - self._conf["algorithm"]["win_size_iodump"], self._conf["algorithm"]["win_threshold_iodump"], ) diff --git a/src/sentryPlugins/avg_block_io/avg_block_io.py b/src/sentryPlugins/avg_block_io/avg_block_io.py index f1e0e24..c23c724 100644 --- a/src/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/sentryPlugins/avg_block_io/avg_block_io.py @@ -14,7 +14,7 @@ import configparser import time from .config import read_config_log, read_config_common, read_config_algorithm, read_config_latency, read_config_iodump, read_config_stage -from .stage_window import IoWindow, IoDumpWindow +from .stage_window import IoWindow, IoDumpWindow,IopsWindow from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name, check_disk_list_validation from .utils import update_avg_and_check_abnormal from .extra_logger import init_extra_logger @@ -58,8 +58,11 @@ def init_io_win(io_dic, config, common_param): logging.debug("Successfully create {}-{}-{}-latency window".format(disk_name, stage_name, rw)) if iodump_lim_value is not None: - io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size_iodump"], window_threshold=io_dic["win_threshold_iodump"], abnormal_time=iodump_lim_value) + io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold_iodump"], abnormal_time=iodump_lim_value) logging.debug("Successfully create {}-{}-{}-iodump window".format(disk_name, stage_name, rw)) + + io_data[disk_name][stage_name][rw]["iops"] = IopsWindow(window_size=io_dic["win_size"]) + logging.debug("Successfully create {}-{}-{}-iops window".format(disk_name, stage_name, rw)) return io_data, io_avg_value @@ -178,7 +181,7 @@ def main(): # 初始化窗口 -- config读取,对应is_iocollect_valid返回的结果 # step1. 解析公共配置 --- algorithm - io_dic["win_size"], io_dic["win_threshold"], io_dic["win_size_iodump"], io_dic["win_threshold_iodump"] = read_config_algorithm(config) + io_dic["win_size"], io_dic["win_threshold"], io_dic["win_threshold_iodump"] = read_config_algorithm(config) # step2. 
解析公共配置 --- latency_xxx common_param = read_config_latency(config) diff --git a/src/sentryPlugins/avg_block_io/config.py b/src/sentryPlugins/avg_block_io/config.py index 60ec9bc..2cc00fb 100644 --- a/src/sentryPlugins/avg_block_io/config.py +++ b/src/sentryPlugins/avg_block_io/config.py @@ -25,7 +25,6 @@ CONF_COMMON_PER_TIME = 'period_time' CONF_ALGO = 'algorithm' CONF_ALGO_SIZE = 'win_size' CONF_ALGO_THRE = 'win_threshold' -CONF_ALGO_SIZE_IODUMP = 'win_size_iodump' CONF_ALGO_THRE_IODUMP = 'win_threshold_iodump' CONF_LATENCY = 'latency_{}' @@ -42,7 +41,8 @@ DEFAULT_PARAM = { CONF_COMMON_PER_TIME: 1 }, CONF_ALGO: { CONF_ALGO_SIZE: 30, - CONF_ALGO_THRE: 6 + CONF_ALGO_THRE: 6, + CONF_ALGO_THRE_IODUMP: 6 }, 'latency_nvme_ssd': { 'read_avg_lim': 10000, 'write_avg_lim': 10000, @@ -173,19 +173,9 @@ def read_config_algorithm(config): win_threshold = DEFAULT_PARAM[CONF_ALGO]['win_threshold'] logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_THRE}, use {win_threshold} as default") - try: - win_size_iodump = int(config.get(CONF_ALGO, CONF_ALGO_SIZE_IODUMP)) - if not (1 <= win_size_iodump <= 300): - raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE_IODUMP}") - except ValueError: - report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE_IODUMP} config") - except configparser.NoOptionError: - win_size_iodump = win_size - logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_SIZE_IODUMP}, use {win_size_iodump} as default") - try: win_threshold_iodump = int(config.get(CONF_ALGO, CONF_ALGO_THRE_IODUMP)) - if win_threshold_iodump < 1 or win_threshold_iodump > 300 or win_threshold_iodump > win_size_iodump: + if win_threshold_iodump < 1 or win_threshold_iodump > 300 or win_threshold_iodump > win_size: raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE_IODUMP}") except ValueError: report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE_IODUMP} config") @@ -193,7 +183,7 @@ def read_config_algorithm(config): win_threshold_iodump = win_threshold logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_THRE_IODUMP}, use {win_threshold_iodump} as default") - return win_size, win_threshold, win_size_iodump, win_threshold_iodump + return win_size, win_threshold, win_threshold_iodump def read_config_latency(config): diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py index cd050f3..d99aded 100644 --- a/src/sentryPlugins/avg_block_io/extra_logger.py +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -48,14 +48,27 @@ def extra_slow_log(msg): def extra_latency_log(msg): - extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {msg['io_type']}") + pattern = r'(\w+):\s*\[([0-9.,]+)\]' + + # Parse the latency string from msg + iops_avg = 0 + iops_str = msg['details']['iops'] + iops_matches = re.findall(pattern, iops_str) + iops_data = {} + for match in iops_matches: + key = match[0] + values = list(map(float, match[1].split(','))) + iops_data[key] = values + if 'rq_driver' in iops_data and iops_data['rq_driver']: + iops_avg = sum(iops_data['rq_driver']) / len(iops_data['rq_driver']) + + extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {msg['io_type']}, iops: {int(iops_avg)}") # Parse the latency string from msg latency_str = msg['details']['latency'] - pattern = r'(\w+):\s*\[([0-9.,]+)\]' - matches = re.findall(pattern, latency_str) + latency_matches = re.findall(pattern, latency_str) latency_data = {} - for match in matches: + for match in latency_matches: key = match[0] values = list(map(float, match[1].split(','))) latency_data[key] = 
values diff --git a/src/sentryPlugins/avg_block_io/stage_window.py b/src/sentryPlugins/avg_block_io/stage_window.py index 587bd49..433cd7c 100644 --- a/src/sentryPlugins/avg_block_io/stage_window.py +++ b/src/sentryPlugins/avg_block_io/stage_window.py @@ -53,3 +53,8 @@ class IoDumpWindow(AbnormalWindowBase): def is_abnormal_period(self, value, avg_val=0): return value > self.abnormal_time + + +class IopsWindow(AbnormalWindowBase): + def is_abnormal_period(self, value, avg_val=10): + return false; diff --git a/src/sentryPlugins/avg_block_io/utils.py b/src/sentryPlugins/avg_block_io/utils.py index 27c1f84..2800da5 100644 --- a/src/sentryPlugins/avg_block_io/utils.py +++ b/src/sentryPlugins/avg_block_io/utils.py @@ -40,6 +40,7 @@ def get_win_data(disk_name, rw, io_data): """get latency and iodump win data""" latency = '' iodump = '' + iops = '' for stage_name in io_data[disk_name]: if 'latency' in io_data[disk_name][stage_name][rw]: latency_list = io_data[disk_name][stage_name][rw]['latency'].window_data_to_string() @@ -47,7 +48,10 @@ def get_win_data(disk_name, rw, io_data): if 'iodump' in io_data[disk_name][stage_name][rw]: iodump_list = io_data[disk_name][stage_name][rw]['iodump'].window_data_to_string() iodump += f'{stage_name}: [{iodump_list}], ' - return {"latency": latency[:-2], "iodump": iodump[:-2]} + if 'iops' in io_data[disk_name][stage_name][rw]: + iops_list = io_data[disk_name][stage_name][rw]['iops'].window_data_to_string() + iops += f'{stage_name}: [{iops_list}], ' + return {"latency": latency[:-2], "iodump": iodump[:-2], "iops": iops[:-2]} def is_abnormal(io_key, io_data): @@ -91,6 +95,8 @@ def update_io_data(period_value, io_data, io_key): io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_data(period_value[0]) if all_wins and "iodump" in all_wins: io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_data(period_value[1]) + if all_wins and "iops" in all_wins: + io_data[io_key[0]][io_key[1]][io_key[2]]["iops"].append_new_data(period_value[2]) def log_abnormal_period(old_avg, period_value, io_data, io_key): @@ -112,6 +118,7 @@ def log_slow_win(msg, reason): f"iotype: {msg['io_type']}, type: {msg['alarm_type']}, reason: {reason}") logging.info(f"latency: {msg['details']['latency']}") logging.info(f"iodump: {msg['details']['iodump']}") + logging.info(f"iops: {msg['details']['iops']}") extra_slow_log(msg) -- Gitee From 706bb3be8b0bc38fb91747d403f57ede3f7af9c7 Mon Sep 17 00:00:00 2001 From: hewh Date: Tue, 30 Sep 2025 15:45:45 +0800 Subject: [PATCH 28/62] add test . 
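Track an iops metric next to latency and io_dump so the slow-IO log can report the rq_driver iops average, and regroup the kernel stages (requeue is now counted in I->D). Sketch of the average computed in extra_latency_log(), with made-up window values:

    # Made-up window values; mirrors the rq_driver average in extra_latency_log(),
    # with an extra empty-list guard.
    iops_wins = {"read": {"rq_driver": [120.0, 130.0, 125.0],
                          "bio": [121.0, 131.0, 126.0]}}

    io_type = "read"
    iops_avg = 0
    iops_data_dict = iops_wins.get(io_type, {})
    if iops_data_dict.get("rq_driver"):
        iops_avg = sum(iops_data_dict["rq_driver"]) / len(iops_data_dict["rq_driver"])
    print(f"iops: {int(iops_avg)}")     # -> iops: 125
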
--- src/sentryPlugins/ai_block_io/ai_block_io.py | 12 ++++++++ src/sentryPlugins/ai_block_io/data_access.py | 1 + src/sentryPlugins/ai_block_io/detector.py | 9 ++++-- src/sentryPlugins/ai_block_io/extra_logger.py | 29 +++++++++++-------- src/sentryPlugins/ai_block_io/io_data.py | 1 + .../ai_block_io/sliding_window.py | 6 ++-- .../avg_block_io/extra_logger.py | 6 ++-- 7 files changed, 42 insertions(+), 22 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/ai_block_io.py b/src/sentryPlugins/ai_block_io/ai_block_io.py index fa0bbf7..04fdd27 100644 --- a/src/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/sentryPlugins/ai_block_io/ai_block_io.py @@ -94,6 +94,7 @@ class SlowIODetection: for iotype in iotypes: self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "latency")) self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "io_dump")) + self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "iops")) if not self._detector_name_list: Report.report_pass("the disks to detection is empty, ai_block_io will exit.") @@ -166,6 +167,16 @@ class SlowIODetection: threshold.set_threshold(abs_threshold) disk_detector.add_detector(detector) + elif metric_name.metric_name == 'iops': + threshold = ThresholdFactory().get_threshold(ThresholdType.AbsoluteThreshold) + sliding_window = SlidingWindowFactory().get_sliding_window( + sliding_window_type, + queue_length=window_size, + threshold=window_threshold + ) + detector = Detector(metric_name, threshold, sliding_window) + disk_detector.add_detector(detector) + logging.info(f"disk: [{disk}] add detector:\n [{disk_detector}]") self._disk_detectors[disk] = disk_detector @@ -216,6 +227,7 @@ class SlowIODetection: f'reason: {str(tmp_alarm_content.get("reason"))}') logging.warning(f"latency: " + str(alarm_content.get("details").get("latency"))) logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump"))) + logging.warning(f"iops: " + str(alarm_content.get("details").get("iops"))) extra_slow_log(alarm_content) # Step4:等待检测时间 diff --git a/src/sentryPlugins/ai_block_io/data_access.py b/src/sentryPlugins/ai_block_io/data_access.py index 2f2d607..845c76a 100644 --- a/src/sentryPlugins/ai_block_io/data_access.py +++ b/src/sentryPlugins/ai_block_io/data_access.py @@ -33,6 +33,7 @@ COLLECT_STAGES = [ "rq_driver", "bio", "iocost", + "deadline", ] diff --git a/src/sentryPlugins/ai_block_io/detector.py b/src/sentryPlugins/ai_block_io/detector.py index 2688cb1..ebcf4c9 100644 --- a/src/sentryPlugins/ai_block_io/detector.py +++ b/src/sentryPlugins/ai_block_io/detector.py @@ -98,12 +98,15 @@ class DiskDetector: def get_detector_list_window(self): latency_wins = {"read": {}, "write": {}} iodump_wins = {"read": {}, "write": {}} + iops_wins = {"read": {}, "write": {}} for detector in self._detector_list: if detector.metric_name.metric_name == 'latency': latency_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data() elif detector.metric_name.metric_name == 'io_dump': iodump_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data() - return latency_wins, iodump_wins + elif detector.metric_name.metric_name == 'iops': + iops_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data() + return latency_wins, iodump_wins, iops_wins def is_slow_io_event(self, io_data_dict_with_disk_name: dict): diagnosis_info = 
{"bio": [], "rq_driver": [], "kernel_stack": []} @@ -134,8 +137,8 @@ class DiskDetector: io_type.add(metric_name.io_access_type_name) alarm_type.add(metric_name.metric_name) - latency_wins, iodump_wins = self.get_detector_list_window() - details = {"latency": latency_wins, "iodump": iodump_wins} + latency_wins, iodump_wins, iops_wins = self.get_detector_list_window() + details = {"latency": latency_wins, "iodump": iodump_wins, "iops": iops_wins} io_press = {"throtl", "wbt", "iocost", "bfq"} driver_slow = {"rq_driver"} diff --git a/src/sentryPlugins/ai_block_io/extra_logger.py b/src/sentryPlugins/ai_block_io/extra_logger.py index 1148139..55e8526 100644 --- a/src/sentryPlugins/ai_block_io/extra_logger.py +++ b/src/sentryPlugins/ai_block_io/extra_logger.py @@ -50,21 +50,26 @@ def extra_slow_log(msg): def extra_latency_log(msg): io_types = [iot.strip() for iot in re.split(r',+', msg['io_type'])] - # Define stage groups - groups = { - 'B->Q': ['throtl', 'wbt', 'iocost'], - 'Q->G': ['gettag', 'requeue'], - 'G->I': ['plug'], - 'I->D': ['deadline', 'bfq', 'hctx'], - 'D->C': ['rq_driver'] - } - + # Calculate iops average for io_type in io_types: - extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {io_type}") - - latency_data_dict = msg['details']['latency'].get(io_type, {}) + iops_avg = 0 + iops_data_dict = msg['details']['iops'].get(io_type, {}) + if 'rq_driver' in iops_data_dict: + iops_avg = sum(iops_data_dict['rq_driver']) / len(iops_data_dict['rq_driver']) + + extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {io_type}, iops: {int(iops_avg)}") + + # Define stage groups + groups = { + 'B->Q': ['throtl', 'wbt', 'iocost'], + 'Q->G': ['gettag'], + 'G->I': ['plug'], + 'I->D': ['deadline', 'bfq', 'hctx', 'requeue'], + 'D->C': ['rq_driver'] + } # Calculate statistics for each group + latency_data_dict = msg['details']['latency'].get(io_type, {}) group_stats = {} for group_name, stages in groups.items(): all_values = [] diff --git a/src/sentryPlugins/ai_block_io/io_data.py b/src/sentryPlugins/ai_block_io/io_data.py index 6042911..eea45da 100644 --- a/src/sentryPlugins/ai_block_io/io_data.py +++ b/src/sentryPlugins/ai_block_io/io_data.py @@ -42,6 +42,7 @@ class IOData: requeue: IOStageData = field(default_factory=lambda: IOStageData()) rq_driver: IOStageData = field(default_factory=lambda: IOStageData()) bio: IOStageData = field(default_factory=lambda: IOStageData()) + deadline: IOStageData = field(default_factory=lambda: IOStageData()) time_stamp: float = field(default_factory=lambda: datetime.now().timestamp()) diff --git a/src/sentryPlugins/ai_block_io/sliding_window.py b/src/sentryPlugins/ai_block_io/sliding_window.py index b174d94..fe9b8c9 100644 --- a/src/sentryPlugins/ai_block_io/sliding_window.py +++ b/src/sentryPlugins/ai_block_io/sliding_window.py @@ -33,10 +33,8 @@ class SlidingWindow: def is_abnormal(self, data): if self._avg_lim is not None and data < self._avg_lim: return False - if self._avg_lim is not None and self._ai_threshold is not None: - threshold = max(self._avg_lim, self._ai_threshold) - if data > threshold: - return True + if self._ai_threshold is not None and data > self._ai_threshold: + return True if self._abs_threshold is not None and data > self._abs_threshold: return True return False diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py index d99aded..86c9820 100644 --- a/src/sentryPlugins/avg_block_io/extra_logger.py +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -50,7 
+50,7 @@ def extra_slow_log(msg): def extra_latency_log(msg): pattern = r'(\w+):\s*\[([0-9.,]+)\]' - # Parse the latency string from msg + # Parse the iops string from msg iops_avg = 0 iops_str = msg['details']['iops'] iops_matches = re.findall(pattern, iops_str) @@ -76,9 +76,9 @@ def extra_latency_log(msg): # Define stage groups groups = { 'B->Q': ['throtl', 'wbt', 'iocost'], - 'Q->G': ['gettag', 'requeue'], + 'Q->G': ['gettag'], 'G->I': ['plug'], - 'I->D': ['deadline', 'bfq', 'hctx'], + 'I->D': ['deadline', 'bfq', 'hctx', 'requeue'], 'D->C': ['rq_driver'] } -- Gitee From 69894ff92ee15123edeb820b3af8af3a56d386c4 Mon Sep 17 00:00:00 2001 From: hewh Date: Tue, 30 Sep 2025 17:25:35 +0800 Subject: [PATCH 29/62] add test . --- src/sentryPlugins/avg_block_io/utils.py | 2 +- src/services/sentryCollector/collect_io.py | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/sentryPlugins/avg_block_io/utils.py b/src/sentryPlugins/avg_block_io/utils.py index 2800da5..a59aa4d 100644 --- a/src/sentryPlugins/avg_block_io/utils.py +++ b/src/sentryPlugins/avg_block_io/utils.py @@ -96,7 +96,7 @@ def update_io_data(period_value, io_data, io_key): if all_wins and "iodump" in all_wins: io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_data(period_value[1]) if all_wins and "iops" in all_wins: - io_data[io_key[0]][io_key[1]][io_key[2]]["iops"].append_new_data(period_value[2]) + io_data[io_key[0]][io_key[1]][io_key[2]]["iops"].append_new_data(period_value[3]) def log_abnormal_period(old_avg, period_value, io_data, io_key): diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 5951e1c..456cdfa 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -17,6 +17,7 @@ import time import logging import threading import subprocess +import re from typing import Union from .collect_config import CollectConfig @@ -196,7 +197,20 @@ class CollectIo(): try: with open(io_dump_file, 'r') as file: for line in file: - count += line.count('.op=' + Io_Category[category].upper()) + if line.count('.op=' + Io_Category[category].upper()) > 0: + pattern = re.compile(r'(?P<task_name>\w+)-(?P<pid>\d+)\s+' + r'\w+\s+' + r'stage\s+(?P<stage>\w+)\s+' + r'(?P<ptr>[0-9a-fA-F]{16})\s+' + r'.*started\s+(?P<start_time_ns>\d+)\s+ns\s+age') + match = pattern.match(line) + if match: + parsed = match.groupdict() + for k, v in parsed.items(): + logging.info(f"io_dump info : {k} = {v}") + else: + logging.info(f"io_dump parse err, info : {line.strip()}") + count += 1 if count > 0: logging.info(f"io_dump info : {disk_name}, {stage}, {Io_Category[category]}, {count}") except FileNotFoundError: -- Gitee From 8e91c94e3ae7b1a8f7fe6304f5003f32a5f1d9cf Mon Sep 17 00:00:00 2001 From: hewh Date: Thu, 9 Oct 2025 09:42:46 +0800 Subject: [PATCH 30/62] add test .
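Hoist the io_dump pattern out of the per-line loop and end it with 'ns ago' so dump lines actually match (the previous 'ns age' did not). Standalone sketch of the named groups; the sample line is an assumed shape inferred from the pattern, not taken from a real io_dump file:

    import re

    # Same named groups as collect_io.get_io_dump(); the sample line is assumed.
    pattern = re.compile(
        r'(?P<task_name>\w+)-(?P<pid>\d+)\s+'
        r'\w+\s+'
        r'stage\s+(?P<stage>\w+)\s+'
        r'(?P<ptr>[0-9a-fA-F]{16})\s+'
        r'.*started\s+(?P<start_time_ns>\d+)\s+ns\s+ago'
    )

    line = "kworker-1234 W stage rq_driver 00000000deadbeef .op=WRITE started 1500000000 ns ago"
    match = pattern.match(line)
    if match:
        print(match.groupdict()["start_time_ns"])   # prints 1500000000
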
--- src/sentryPlugins/avg_block_io/config.py | 2 +- src/sentryPlugins/avg_block_io/stage_window.py | 2 +- src/services/sentryCollector/collect_io.py | 13 ++++++++----- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/sentryPlugins/avg_block_io/config.py b/src/sentryPlugins/avg_block_io/config.py index 2cc00fb..c7fca64 100644 --- a/src/sentryPlugins/avg_block_io/config.py +++ b/src/sentryPlugins/avg_block_io/config.py @@ -180,7 +180,7 @@ def read_config_algorithm(config): except ValueError: report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE_IODUMP} config") except configparser.NoOptionError: - win_threshold_iodump = win_threshold + win_threshold_iodump = DEFAULT_PARAM[CONF_ALGO][CONF_ALGO_THRE_IODUMP] logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_THRE_IODUMP}, use {win_threshold_iodump} as default") return win_size, win_threshold, win_threshold_iodump diff --git a/src/sentryPlugins/avg_block_io/stage_window.py b/src/sentryPlugins/avg_block_io/stage_window.py index 433cd7c..71aba31 100644 --- a/src/sentryPlugins/avg_block_io/stage_window.py +++ b/src/sentryPlugins/avg_block_io/stage_window.py @@ -57,4 +57,4 @@ class IoDumpWindow(AbnormalWindowBase): class IopsWindow(AbnormalWindowBase): def is_abnormal_period(self, value, avg_val=10): - return false; + return False diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 456cdfa..25cae7d 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -194,15 +194,18 @@ class CollectIo(): def get_io_dump(self, disk_name, stage, category): io_dump_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/io_dump'.format(disk_name, stage) count = 0 + pattern = re.compile( + r'(?P<task_name>\w+)-(?P<pid>\d+)\s+' + r'\w+\s+' + r'stage\s+(?P<stage>\w+)\s+' + r'(?P<ptr>[0-9a-fA-F]{16})\s+' + r'.*started\s+(?P<start_time_ns>\d+)\s+ns\s+ago' + ) + try: with open(io_dump_file, 'r') as file: for line in file: if line.count('.op=' + Io_Category[category].upper()) > 0: - pattern = re.compile(r'(?P<task_name>\w+)-(?P<pid>\d+)\s+' - r'\w+\s+' - r'stage\s+(?P<stage>\w+)\s+' - r'(?P<ptr>[0-9a-fA-F]{16})\s+' - r'.*started\s+(?P<start_time_ns>\d+)\s+ns\s+age') match = pattern.match(line) if match: parsed = match.groupdict() -- Gitee From c94c1593106a41f27ebe28fc8ec68a61c8b262df Mon Sep 17 00:00:00 2001 From: hewh Date: Thu, 9 Oct 2025 16:55:49 +0800 Subject: [PATCH 31/62] add test .
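Expose the parsed io_dump records over the existing collector socket: a new GET_IO_DUMP_DATA protocol id, a shared inter_get_io_common() helper on the client side, and an IO_DUMP_DATA store mirroring IO_GLOBAL_DATA on the server side. Hypothetical client usage (disk/stage/iotype values are examples and need a running sentryCollector):

    import json
    from sentryCollector.collect_plugin import ResultMessage, get_io_dump_data

    # Example arguments only.
    res = get_io_dump_data(1, ["sda"], ["rq_driver"], ["read", "write"])
    if res["ret"] == ResultMessage.RESULT_SUCCEED:
        # message is a JSON string: {disk: {stage: {iotype: [...]}}}
        dump = json.loads(res["message"])
        print(dump.get("sda", {}))
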
--- .../pySentryCollector/collect_plugin.py | 38 +++++++++++-------- src/services/sentryCollector/collect_io.py | 4 ++ .../sentryCollector/collect_server.py | 19 +++++++--- 3 files changed, 41 insertions(+), 20 deletions(-) diff --git a/src/libsentry/python/pySentryCollector/collect_plugin.py b/src/libsentry/python/pySentryCollector/collect_plugin.py index 3395f89..f000cf8 100644 --- a/src/libsentry/python/pySentryCollector/collect_plugin.py +++ b/src/libsentry/python/pySentryCollector/collect_plugin.py @@ -52,6 +52,7 @@ LIMIT_MAX_SAVE_LEN = 300 class ClientProtocol(): IS_IOCOLLECT_VALID = 0 GET_IO_DATA = 1 + GET_IO_DUMP_DATA = 2 PRO_END = 3 class ResultMessage(): @@ -234,14 +235,7 @@ def inter_is_iocollect_valid(period, disk_list=None, stage=None): result['message'] = result_message return result -def get_io_data(period, disk_list, stage, iotype): - result = inter_get_io_data(period, disk_list, stage, iotype) - error_code = result['ret'] - if error_code != ResultMessage.RESULT_SUCCEED: - result['message'] = Result_Messages[error_code] - return result - -def inter_get_io_data(period, disk_list, stage, iotype): +def inter_get_io_common(period, disk_list, stage, iotype, protocol): result = {} result['ret'] = ResultMessage.RESULT_UNKNOWN result['message'] = "" @@ -269,21 +263,21 @@ def inter_get_io_data(period, disk_list, stage, iotype): return result req_msg_struct = { - 'disk_list': json.dumps(disk_list), - 'period': period, - 'stage': json.dumps(stage), - 'iotype' : json.dumps(iotype) - } + 'disk_list': json.dumps(disk_list), + 'period': period, + 'stage': json.dumps(stage), + 'iotype': json.dumps(iotype) + } request_message = json.dumps(req_msg_struct) - result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.GET_IO_DATA) + result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, protocol) if not result_message: logging.error("collect_plugin: client_send_and_recv failed") return result try: json.loads(result_message) except json.JSONDecodeError: - logging.error("get_io_data: json decode error") + logging.error("get_io_common: json decode error") result['ret'] = ResultMessage.RESULT_PARSE_FAILED return result @@ -291,6 +285,20 @@ def inter_get_io_data(period, disk_list, stage, iotype): result['message'] = result_message return result +def get_io_data(period, disk_list, stage, iotype): + result = inter_get_io_common(period, disk_list, stage, iotype, ClientProtocol.GET_IO_DATA) + error_code = result['ret'] + if error_code != ResultMessage.RESULT_SUCCEED: + result['message'] = Result_Messages[error_code] + return result + +def get_io_dump_data(period, disk_list, stage, iotype): + result = inter_get_io_common(period, disk_list, stage, iotype, ClientProtocol.GET_IO_DUMP_DATA) + error_code = result['ret'] + if error_code != ResultMessage.RESULT_SUCCEED: + result['message'] = Result_Messages[error_code] + return result + def get_disk_type(disk): result = {} result['ret'] = ResultMessage.RESULT_UNKNOWN diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 25cae7d..a5a5e0a 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -27,6 +27,7 @@ from .collect_plugin import get_disk_type, DiskType Io_Category = ["read", "write", "flush", "discard"] IO_GLOBAL_DATA = {} IO_CONFIG_DATA = [] +IO_DUMP_DATA = {} EBPF_GLOBAL_DATA = [] EBPF_PROCESS = None EBPF_STAGE_LIST = ["wbt", "rq_driver", "bio", "gettag"] @@ -210,6 +211,7 @@ class CollectIo(): if match: 
parsed = match.groupdict() for k, v in parsed.items(): + IO_DUMP_DATA[disk_name][stage][Io_Category[category]].insert(0, {k: v}) logging.info(f"io_dump info : {k} = {v}") else: logging.info(f"io_dump parse err, info : {line.strip()}") @@ -485,8 +487,10 @@ class CollectIo(): for stage in stage_list: self.window_value[disk_name][stage] = [] IO_GLOBAL_DATA[disk_name][stage] = {} + IO_DUMP_DATA[disk_name][stage] = {} for category in Io_Category: IO_GLOBAL_DATA[disk_name][stage][category] = [] + IO_DUMP_DATA[disk_name][stage][category] = [] while True: start_time = time.time() diff --git a/src/services/sentryCollector/collect_server.py b/src/services/sentryCollector/collect_server.py index ad3ac0e..d3d5064 100644 --- a/src/services/sentryCollector/collect_server.py +++ b/src/services/sentryCollector/collect_server.py @@ -24,7 +24,7 @@ import select import threading import time -from .collect_io import IO_GLOBAL_DATA, IO_CONFIG_DATA +from .collect_io import IO_GLOBAL_DATA, IO_CONFIG_DATA, IO_DUMP_DATA from .collect_config import CollectConfig SENTRY_RUN_DIR = "/var/run/sysSentry" @@ -48,6 +48,7 @@ RES_MAGIC = "RES" class ServerProtocol(): IS_IOCOLLECT_VALID = 0 GET_IO_DATA = 1 + GET_IO_DUMP_DATA = 2 PRO_END = 3 class CollectServer(): @@ -91,9 +92,8 @@ class CollectServer(): return json.dumps(result_rev) - def get_io_data(self, data_struct): + def get_io_common(self, data_struct, data_source): result_rev = {} - self.io_global_data = IO_GLOBAL_DATA if len(IO_CONFIG_DATA) == 0: logging.error("the collect thread is not started, the data is invalid.") @@ -107,13 +107,13 @@ class CollectServer(): iotype_list = json.loads(data_struct['iotype']) if (period < period_time) or (period > period_time * max_save) or (period % period_time): - logging.error("get_io_data: period time is invalid, user period: %d, config period_time: %d", period, period_time) + logging.error("get_io_common: period time is invalid, user period: %d, config period_time: %d", period, period_time) return json.dumps(result_rev) collect_index = period // period_time - 1 logging.debug("user period: %d, config period_time: %d, collect_index: %d", period, period_time, collect_index) - for disk_name, stage_info in self.io_global_data.items(): + for disk_name, stage_info in data_source.items(): if disk_name not in disk_list: continue result_rev[disk_name] = {} @@ -130,6 +130,13 @@ class CollectServer(): return json.dumps(result_rev) + def get_io_data(self, data_struct): + self.io_global_data = IO_GLOBAL_DATA + return get_io_common(data_struct, self.io_global_data) + + def get_io_dump_data(self, data_struct): + return get_io_common(data_struct, IO_DUMP_DATA) + def msg_data_process(self, msg_data, protocal_id): """message data process""" logging.debug("msg_data %s", msg_data) @@ -144,6 +151,8 @@ class CollectServer(): res_msg = self.is_iocollect_valid(data_struct) elif protocal_id == ServerProtocol.GET_IO_DATA: res_msg = self.get_io_data(data_struct) + elif protocal_id == ServerProtocol.GET_IO_DUMP_DATA: + res_msg = self.get_io_dump_data(data_struct) return res_msg -- Gitee From 5bd7681d9a2ffe7d5c62de5b454779e1526365d9 Mon Sep 17 00:00:00 2001 From: hewh Date: Thu, 9 Oct 2025 20:18:52 +0800 Subject: [PATCH 32/62] add test . 
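Keep IO_DUMP_DATA in step with IO_GLOBAL_DATA: one entry per collection period per disk/stage/category, newest first, trimmed to max_save, with an empty list recorded for periods where nothing was dumped. Standalone sketch of that bounded, newest-first history (values made up):

    # Values made up; mirrors the pop()/insert(0, ...) pattern used for
    # IO_DUMP_DATA[disk][stage][category].
    max_save = 3
    history = []

    for records in [["msg-a"], [], ["msg-b"], ["msg-c"]]:
        if len(history) >= max_save:
            history.pop()           # drop the oldest period (tail)
        history.insert(0, records)  # newest period goes to the front

    print(history)                  # [['msg-c'], ['msg-b'], []]
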
--- src/services/sentryCollector/collect_io.py | 10 +++++++++- src/services/sentryCollector/collect_server.py | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index a5a5e0a..f68a4d8 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -104,6 +104,8 @@ class CollectIo(): # read=0, write=1, flush=2, discard=3 if (len(IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]])) >= self.max_save: IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]].pop() + if (len(IO_DUMP_DATA[disk_name][stage][Io_Category[index]])) >= self.max_save: + IO_DUMP_DATA[disk_name][stage][Io_Category[index]].pop() curr_lat = self.get_latency_value(curr_stage_value, last_stage_value, index) curr_iops = self.get_iops(curr_stage_value, last_stage_value, index) @@ -111,6 +113,8 @@ class CollectIo(): curr_io_dump = self.get_io_dump(disk_name, stage, index) IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]].insert(0, [curr_lat, curr_io_dump, curr_io_length, curr_iops]) + if (curr_io_dump == 0) : + IO_DUMP_DATA[disk_name][stage][Io_Category[index]].insert(0, []) def get_iops(self, curr_stage_value, last_stage_value, category): try: @@ -195,6 +199,7 @@ class CollectIo(): def get_io_dump(self, disk_name, stage, category): io_dump_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/io_dump'.format(disk_name, stage) count = 0 + io_dump_msg = [] pattern = re.compile( r'(?P\w+)-(?P\d+)\s+' r'\w+\s+' @@ -210,13 +215,14 @@ class CollectIo(): match = pattern.match(line) if match: parsed = match.groupdict() + io_dump_msg.append(parsed) for k, v in parsed.items(): - IO_DUMP_DATA[disk_name][stage][Io_Category[category]].insert(0, {k: v}) logging.info(f"io_dump info : {k} = {v}") else: logging.info(f"io_dump parse err, info : {line.strip()}") count += 1 if count > 0: + IO_DUMP_DATA[disk_name][stage][Io_Category[category]].insert(0, io_dump_msg) logging.info(f"io_dump info : {disk_name}, {stage}, {Io_Category[category]}, {count}") except FileNotFoundError: logging.error("The file %s does not exist.", io_dump_file) @@ -272,6 +278,7 @@ class CollectIo(): self.disk_map_stage[disk_name] = stage_list self.window_value[disk_name] = {} IO_GLOBAL_DATA[disk_name] = {} + IO_DUMP_DATA[disk_name] = {} return len(IO_GLOBAL_DATA) != 0 @@ -480,6 +487,7 @@ class CollectIo(): def main_loop(self): global IO_GLOBAL_DATA + global IO_DUMP_DATA logging.info("collect io thread start") if self.is_kernel_avaliable() and len(self.disk_map_stage) != 0: diff --git a/src/services/sentryCollector/collect_server.py b/src/services/sentryCollector/collect_server.py index d3d5064..b2936ce 100644 --- a/src/services/sentryCollector/collect_server.py +++ b/src/services/sentryCollector/collect_server.py @@ -132,10 +132,10 @@ class CollectServer(): def get_io_data(self, data_struct): self.io_global_data = IO_GLOBAL_DATA - return get_io_common(data_struct, self.io_global_data) + return self.get_io_common(data_struct, self.io_global_data) def get_io_dump_data(self, data_struct): - return get_io_common(data_struct, IO_DUMP_DATA) + return self.get_io_common(data_struct, IO_DUMP_DATA) def msg_data_process(self, msg_data, protocal_id): """message data process""" -- Gitee From 1f4e0a58a39d59e5f598e7c1c8e6d8e6cf9d388c Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 10 Oct 2025 09:58:50 +0800 Subject: [PATCH 33/62] add test . 
--- src/services/sentryCollector/collect_io.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index f68a4d8..4ed7cf1 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -33,6 +33,9 @@ EBPF_PROCESS = None EBPF_STAGE_LIST = ["wbt", "rq_driver", "bio", "gettag"] EBPF_SUPPORT_VERSION = ["6.6.0"] +#iodump msg limit +IO_DUMP_MSG_LIMIT = 20 + class IoStatus(): TOTAL = 0 FINISH = 1 @@ -215,9 +218,10 @@ class CollectIo(): match = pattern.match(line) if match: parsed = match.groupdict() - io_dump_msg.append(parsed) + if count < IO_DUMP_MSG_LIMIT: + io_dump_msg.append(parsed) for k, v in parsed.items(): - logging.info(f"io_dump info : {k} = {v}") + logging.debug(f"io_dump info : {k} = {v}") else: logging.info(f"io_dump parse err, info : {line.strip()}") count += 1 -- Gitee From b7d1cbbfe30565dc3c8cec2a2bfe073e3d3709c3 Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 10 Oct 2025 16:46:40 +0800 Subject: [PATCH 34/62] add test . --- .../python/pySentryCollector/collect_plugin.py | 6 +++--- src/sentryPlugins/avg_block_io/avg_block_io.py | 13 ++++++++++--- src/sentryPlugins/avg_block_io/module_conn.py | 9 +++++++-- src/sentryPlugins/avg_block_io/stage_window.py | 18 ++++++++++++++++++ src/sentryPlugins/avg_block_io/utils.py | 16 +++++++++++++++- src/services/sentryCollector/collect_io.py | 2 +- src/services/sentryCollector/collect_server.py | 8 ++++---- 7 files changed, 58 insertions(+), 14 deletions(-) diff --git a/src/libsentry/python/pySentryCollector/collect_plugin.py b/src/libsentry/python/pySentryCollector/collect_plugin.py index f000cf8..4e7f6bb 100644 --- a/src/libsentry/python/pySentryCollector/collect_plugin.py +++ b/src/libsentry/python/pySentryCollector/collect_plugin.py @@ -52,7 +52,7 @@ LIMIT_MAX_SAVE_LEN = 300 class ClientProtocol(): IS_IOCOLLECT_VALID = 0 GET_IO_DATA = 1 - GET_IO_DUMP_DATA = 2 + GET_IODUMP_MSG = 2 PRO_END = 3 class ResultMessage(): @@ -292,8 +292,8 @@ def get_io_data(period, disk_list, stage, iotype): result['message'] = Result_Messages[error_code] return result -def get_io_dump_data(period, disk_list, stage, iotype): - result = inter_get_io_common(period, disk_list, stage, iotype, ClientProtocol.GET_IO_DUMP_DATA) +def get_iodump_msg(period, disk_list, stage, iotype): + result = inter_get_io_common(period, disk_list, stage, iotype, ClientProtocol.GET_IODUMP_MSG) error_code = result['ret'] if error_code != ResultMessage.RESULT_SUCCEED: result['message'] = Result_Messages[error_code] diff --git a/src/sentryPlugins/avg_block_io/avg_block_io.py b/src/sentryPlugins/avg_block_io/avg_block_io.py index c23c724..292804a 100644 --- a/src/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/sentryPlugins/avg_block_io/avg_block_io.py @@ -14,9 +14,9 @@ import configparser import time from .config import read_config_log, read_config_common, read_config_algorithm, read_config_latency, read_config_iodump, read_config_stage -from .stage_window import IoWindow, IoDumpWindow,IopsWindow -from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name, check_disk_list_validation -from .utils import update_avg_and_check_abnormal +from .stage_window import IoWindow, IoDumpWindow,IopsWindow,IodumpMsgWindow +from .module_conn import avg_is_iocollect_valid, avg_get_io_data, avg_get_iodump_msg, report_alarm_fail, process_report_data, sig_handler, 
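Carry the io_dump records end to end: the collector serves them as GET_IODUMP_MSG (get_io_dump_data is renamed get_iodump_msg), avg_block_io fetches them each period via avg_get_iodump_msg() and keeps the last win_size periods per stage in an IodumpMsgWindow. Standalone sketch of the window behaviour; the class name and records below are illustrative:

    # Illustrative copy of the fixed-length window added in stage_window.py.
    class IodumpMsgWindowSketch:
        def __init__(self, window_size=4):
            self.window_data = [[] for _ in range(window_size)]

        def append_new_data(self, msg):
            self.window_data.pop(0)       # drop the oldest period
            self.window_data.append(msg)  # append the newest period

    win = IodumpMsgWindowSketch(window_size=4)
    win.append_new_data([{"task_name": "kworker", "pid": "123"}])
    win.append_new_data([])               # period with no dumped IO
    print(win.window_data)                # last 4 periods, oldest first
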
get_disk_type_by_name, check_disk_list_validation +from .utils import update_avg_and_check_abnormal, update_avg_iodump_msg from .extra_logger import init_extra_logger CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" @@ -63,6 +63,9 @@ def init_io_win(io_dic, config, common_param): io_data[disk_name][stage_name][rw]["iops"] = IopsWindow(window_size=io_dic["win_size"]) logging.debug("Successfully create {}-{}-{}-iops window".format(disk_name, stage_name, rw)) + + io_data[disk_name][stage_name][rw]["iodump_msg"] = IodumpMsgWindow(window_size=io_dic["win_size"]) + logging.debug("Successfully create {}-{}-{}-iodump_msg window".format(disk_name, stage_name, rw)) return io_data, io_avg_value @@ -129,6 +132,9 @@ def main_loop(io_dic, io_data, io_avg_value): logging.error(f"{curr_period_data['msg']}") continue + # 获取iodump的详细信息 + is_success, iodump_msg = avg_get_iodump_msg(io_dic) + # 处理周期数据 reach_size = False for disk_name in disk_list: @@ -137,6 +143,7 @@ def main_loop(io_dic, io_data, io_avg_value): if disk_name in curr_period_data and stage_name in curr_period_data[disk_name] and rw in curr_period_data[disk_name][stage_name]: io_key = (disk_name, stage_name, rw) reach_size = update_avg_and_check_abnormal(curr_period_data, io_key, win_size, io_avg_value, io_data) + update_avg_iodump_msg(iodump_msg, is_success, io_key, io_data) # win_size不满时不进行告警判断 if not reach_size: diff --git a/src/sentryPlugins/avg_block_io/module_conn.py b/src/sentryPlugins/avg_block_io/module_conn.py index bc10802..06af8b4 100644 --- a/src/sentryPlugins/avg_block_io/module_conn.py +++ b/src/sentryPlugins/avg_block_io/module_conn.py @@ -13,7 +13,7 @@ import logging import sys from .utils import is_abnormal, get_win_data, log_slow_win -from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages, get_disk_type, Disk_Type +from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, get_iodump_msg, Result_Messages, get_disk_type, Disk_Type from syssentry.result import ResultLevel, report_result from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR @@ -33,6 +33,12 @@ def avg_get_io_data(io_dic): res = get_io_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"]) return check_result_validation(res, 'get io data') +def avg_get_iodump_msg(io_dic): + """avg_get_iodump_msg from sentryCollector""" + logging.debug(f"send to sentryCollector avg_get_iodump_msg: period={io_dic['period_time']}, " + f"disk={io_dic['disk_list']}, stage={io_dic['stage_list']}, iotype={io_dic['iotype_list']}") + res = get_iodump_msg(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"]) + return check_result_validation(res, 'get io dump data') def avg_is_iocollect_valid(io_dic, config_disk, config_stage): """is_iocollect_valid from sentryCollector""" @@ -59,7 +65,6 @@ def check_result_validation(res, reason): return True, json_data - def report_alarm_fail(alarm_info): """report result to xalarmd""" report_result(TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": alarm_info})) diff --git a/src/sentryPlugins/avg_block_io/stage_window.py b/src/sentryPlugins/avg_block_io/stage_window.py index 71aba31..b076b20 100644 --- a/src/sentryPlugins/avg_block_io/stage_window.py +++ b/src/sentryPlugins/avg_block_io/stage_window.py @@ -58,3 +58,21 @@ class IoDumpWindow(AbnormalWindowBase): class IopsWindow(AbnormalWindowBase): def is_abnormal_period(self, value, avg_val=10): return False + +class IodumpMsgWindow: + def __init__(self, 
window_size=10): + self.window_size = window_size + self.window_data = [[] for _ in range(window_size)] + + def append_new_data(self, msg): + self.window_data.pop(0) + self.window_data.append(msg) + + def window_data_to_string(self): + str_list = [] + for period in self.window_data: + if period: + str_list.append("{" + "; ".join(f"{k}: {v}" for k, v in period.items()) + "}") + else: + str_list.append("{}") + return ", ".join(str_list) \ No newline at end of file diff --git a/src/sentryPlugins/avg_block_io/utils.py b/src/sentryPlugins/avg_block_io/utils.py index a59aa4d..fd9da98 100644 --- a/src/sentryPlugins/avg_block_io/utils.py +++ b/src/sentryPlugins/avg_block_io/utils.py @@ -51,7 +51,10 @@ def get_win_data(disk_name, rw, io_data): if 'iops' in io_data[disk_name][stage_name][rw]: iops_list = io_data[disk_name][stage_name][rw]['iops'].window_data_to_string() iops += f'{stage_name}: [{iops_list}], ' - return {"latency": latency[:-2], "iodump": iodump[:-2], "iops": iops[:-2]} + if 'iodump_msg' in io_data[disk_name][stage_name][rw]: + iodump_msg_list = io_data[disk_name][stage_name][rw]['iodump_msg'].window_data_to_string() + iodump_msg += f'{stage_name}: [{iodump_msg_list}], ' + return {"latency": latency[:-2], "iodump": iodump[:-2], "iops": iops[:-2], "iodump_msg": iodump_msg[:-2]} def is_abnormal(io_key, io_data): @@ -146,3 +149,14 @@ def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data) return True set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size)) return True + +def update_avg_iodump_msg(iodump_msg, is_success, io_key, io_data): + """update iodump data to io_data""" + all_wins = get_nested_value(io_data, io_key) + if all_wins and "iodump_msg" in all_wins: + if not is_success: + io_data[io_key[0]][io_key[1]][io_key[2]]["iodump_msg"].append_new_data([]) + else: + period_value = get_nested_value(iodump_msg, io_key) + io_data[io_key[0]][io_key[1]][io_key[2]]["iodump_msg"].append_new_data(period_value) + diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 4ed7cf1..6cefbc4 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -34,7 +34,7 @@ EBPF_STAGE_LIST = ["wbt", "rq_driver", "bio", "gettag"] EBPF_SUPPORT_VERSION = ["6.6.0"] #iodump msg limit -IO_DUMP_MSG_LIMIT = 20 +IO_DUMP_MSG_LIMIT = 15 class IoStatus(): TOTAL = 0 diff --git a/src/services/sentryCollector/collect_server.py b/src/services/sentryCollector/collect_server.py index b2936ce..87c31bd 100644 --- a/src/services/sentryCollector/collect_server.py +++ b/src/services/sentryCollector/collect_server.py @@ -48,7 +48,7 @@ RES_MAGIC = "RES" class ServerProtocol(): IS_IOCOLLECT_VALID = 0 GET_IO_DATA = 1 - GET_IO_DUMP_DATA = 2 + GET_IODUMP_MSG = 2 PRO_END = 3 class CollectServer(): @@ -134,7 +134,7 @@ class CollectServer(): self.io_global_data = IO_GLOBAL_DATA return self.get_io_common(data_struct, self.io_global_data) - def get_io_dump_data(self, data_struct): + def get_iodump_msg(self, data_struct): return self.get_io_common(data_struct, IO_DUMP_DATA) def msg_data_process(self, msg_data, protocal_id): @@ -151,8 +151,8 @@ class CollectServer(): res_msg = self.is_iocollect_valid(data_struct) elif protocal_id == ServerProtocol.GET_IO_DATA: res_msg = self.get_io_data(data_struct) - elif protocal_id == ServerProtocol.GET_IO_DUMP_DATA: - res_msg = self.get_io_dump_data(data_struct) + elif protocal_id == ServerProtocol.GET_IODUMP_MSG: + res_msg = 
self.get_iodump_msg(data_struct) return res_msg -- Gitee From dc35a25f51385b926852d3bff32074cdb6195d19 Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 10 Oct 2025 20:37:45 +0800 Subject: [PATCH 35/62] add test . --- src/sentryPlugins/avg_block_io/stage_window.py | 14 +++++++++++--- src/sentryPlugins/avg_block_io/utils.py | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/sentryPlugins/avg_block_io/stage_window.py b/src/sentryPlugins/avg_block_io/stage_window.py index b076b20..7184f7a 100644 --- a/src/sentryPlugins/avg_block_io/stage_window.py +++ b/src/sentryPlugins/avg_block_io/stage_window.py @@ -71,8 +71,16 @@ class IodumpMsgWindow: def window_data_to_string(self): str_list = [] for period in self.window_data: - if period: - str_list.append("{" + "; ".join(f"{k}: {v}" for k, v in period.items()) + "}") + if not period: + str_list.append("[]") else: - str_list.append("{}") + dict_strs = [] + for item in period: + if not item: + dict_strs.append("{}") + else: + item_str = ", ".join(f"{k}: {v}" for k, v in item.items()) + dict_strs.append("{" + item_str + "}") + period_str = "[" + "; ".join(dict_strs) + "]" + str_list.append(period_str) return ", ".join(str_list) \ No newline at end of file diff --git a/src/sentryPlugins/avg_block_io/utils.py b/src/sentryPlugins/avg_block_io/utils.py index fd9da98..484d243 100644 --- a/src/sentryPlugins/avg_block_io/utils.py +++ b/src/sentryPlugins/avg_block_io/utils.py @@ -41,6 +41,7 @@ def get_win_data(disk_name, rw, io_data): latency = '' iodump = '' iops = '' + iodump_msg = '' for stage_name in io_data[disk_name]: if 'latency' in io_data[disk_name][stage_name][rw]: latency_list = io_data[disk_name][stage_name][rw]['latency'].window_data_to_string() -- Gitee From 719f7f980aa60716a47bb44d5a9bfac5ea934080 Mon Sep 17 00:00:00 2001 From: hewh Date: Sat, 11 Oct 2025 10:37:39 +0800 Subject: [PATCH 36/62] add test . 
--- src/sentryPlugins/avg_block_io/stage_window.py | 10 ++-------- src/sentryPlugins/avg_block_io/utils.py | 2 +- src/services/sentryCollector/collect_io.py | 7 ++++--- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/sentryPlugins/avg_block_io/stage_window.py b/src/sentryPlugins/avg_block_io/stage_window.py index 7184f7a..ca69e0c 100644 --- a/src/sentryPlugins/avg_block_io/stage_window.py +++ b/src/sentryPlugins/avg_block_io/stage_window.py @@ -74,13 +74,7 @@ class IodumpMsgWindow: if not period: str_list.append("[]") else: - dict_strs = [] - for item in period: - if not item: - dict_strs.append("{}") - else: - item_str = ", ".join(f"{k}: {v}" for k, v in item.items()) - dict_strs.append("{" + item_str + "}") - period_str = "[" + "; ".join(dict_strs) + "]" + item_values = [f'"{item}"' for item in period] + period_str = "[" + ", ".join(item_values) + "]" str_list.append(period_str) return ", ".join(str_list) \ No newline at end of file diff --git a/src/sentryPlugins/avg_block_io/utils.py b/src/sentryPlugins/avg_block_io/utils.py index 484d243..261a7fe 100644 --- a/src/sentryPlugins/avg_block_io/utils.py +++ b/src/sentryPlugins/avg_block_io/utils.py @@ -52,7 +52,7 @@ def get_win_data(disk_name, rw, io_data): if 'iops' in io_data[disk_name][stage_name][rw]: iops_list = io_data[disk_name][stage_name][rw]['iops'].window_data_to_string() iops += f'{stage_name}: [{iops_list}], ' - if 'iodump_msg' in io_data[disk_name][stage_name][rw]: + if 'iodump_msg' in io_data[disk_name][stage_name][rw] and stage_name == 'bio': iodump_msg_list = io_data[disk_name][stage_name][rw]['iodump_msg'].window_data_to_string() iodump_msg += f'{stage_name}: [{iodump_msg_list}], ' return {"latency": latency[:-2], "iodump": iodump[:-2], "iops": iops[:-2], "iodump_msg": iodump_msg[:-2]} diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 6cefbc4..302ae29 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -34,7 +34,7 @@ EBPF_STAGE_LIST = ["wbt", "rq_driver", "bio", "gettag"] EBPF_SUPPORT_VERSION = ["6.6.0"] #iodump msg limit -IO_DUMP_MSG_LIMIT = 15 +IO_DUMP_MSG_LIMIT = 10 class IoStatus(): TOTAL = 0 @@ -218,10 +218,11 @@ class CollectIo(): match = pattern.match(line) if match: parsed = match.groupdict() - if count < IO_DUMP_MSG_LIMIT: - io_dump_msg.append(parsed) for k, v in parsed.items(): logging.debug(f"io_dump info : {k} = {v}") + parsed["start_time_ns"] = str(int(parsed["start_time_ns"]) // 1000000) + if count < IO_DUMP_MSG_LIMIT: + io_dump_msg.append(parsed) else: logging.info(f"io_dump parse err, info : {line.strip()}") count += 1 -- Gitee From 0ce0300bd808bb5f4e4db3abcacd2851a19a4f47 Mon Sep 17 00:00:00 2001 From: hewh Date: Sat, 11 Oct 2025 10:57:46 +0800 Subject: [PATCH 37/62] add test . 
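Store each dumped IO as a compact record of the fields the alarm path needs, with the age converted from ns to ms. Sketch with a made-up parsed dict (same keys as the io_dump regex groups):

    # Made-up parsed dict, same keys as the io_dump regex groups.
    parsed = {"task_name": "kworker", "pid": "1234", "stage": "rq_driver",
              "ptr": "00000000deadbeef", "start_time_ns": "1500000000"}

    values = [
        parsed["task_name"],
        parsed["pid"],
        parsed["stage"],
        parsed["ptr"],
        str(int(parsed["start_time_ns"]) // 1000000),  # ns -> ms
    ]
    print(values)   # ['kworker', '1234', 'rq_driver', '00000000deadbeef', '1500']
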
--- src/services/sentryCollector/collect_io.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 302ae29..2dd84bd 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -217,12 +217,16 @@ class CollectIo(): if line.count('.op=' + Io_Category[category].upper()) > 0: match = pattern.match(line) if match: - parsed = match.groupdict() - for k, v in parsed.items(): - logging.debug(f"io_dump info : {k} = {v}") - parsed["start_time_ns"] = str(int(parsed["start_time_ns"]) // 1000000) if count < IO_DUMP_MSG_LIMIT: - io_dump_msg.append(parsed) + parsed = match.groupdict() + values = [ + parsed["task_name"], + parsed["pid"], + parsed["stage"], + parsed["ptr"], + str(int(parsed["start_time_ns"]) // 1000000) + ] + io_dump_msg.append(values) else: logging.info(f"io_dump parse err, info : {line.strip()}") count += 1 -- Gitee From 5233e3c9365d83c1decef66eaf434b7ac27b438e Mon Sep 17 00:00:00 2001 From: hewh Date: Sat, 11 Oct 2025 11:48:36 +0800 Subject: [PATCH 38/62] add test . --- src/services/sentryCollector/collect_io.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 2dd84bd..b70206e 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -226,7 +226,8 @@ class CollectIo(): parsed["ptr"], str(int(parsed["start_time_ns"]) // 1000000) ] - io_dump_msg.append(values) + value_str = ",".join(values) + io_dump_msg.append(value_str) else: logging.info(f"io_dump parse err, info : {line.strip()}") count += 1 -- Gitee From 7ddd2e76d8769f18f303a6b3ff2e93ec4e4befb1 Mon Sep 17 00:00:00 2001 From: hewh Date: Sat, 11 Oct 2025 15:53:49 +0800 Subject: [PATCH 39/62] add test . 
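For reference, the decision the reworked SlidingWindow.is_abnormal makes, sketched as a standalone function with invented thresholds: values under avg_lim are ignored, and a value is abnormal once it exceeds either the AI threshold or the absolute threshold; the previous max(avg_lim, ai_threshold) coupling is dropped.

    avg_lim, ai_threshold, abs_threshold = 5.0, 40.0, 500.0   # invented values

    def is_abnormal(data):
        if avg_lim is not None and data < avg_lim:
            return False                      # too small to matter
        if ai_threshold is not None and data > ai_threshold:
            return True                       # above the learned threshold
        if abs_threshold is not None and data > abs_threshold:
            return True                       # above the hard ceiling
        return False

    print([is_abnormal(v) for v in (3.0, 30.0, 60.0)])   # [False, False, True]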
--- src/sentryPlugins/ai_block_io/sliding_window.py | 6 ++---- src/sentryPlugins/avg_block_io/module_conn.py | 4 ++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/sliding_window.py b/src/sentryPlugins/ai_block_io/sliding_window.py index b174d94..fe9b8c9 100644 --- a/src/sentryPlugins/ai_block_io/sliding_window.py +++ b/src/sentryPlugins/ai_block_io/sliding_window.py @@ -33,10 +33,8 @@ class SlidingWindow: def is_abnormal(self, data): if self._avg_lim is not None and data < self._avg_lim: return False - if self._avg_lim is not None and self._ai_threshold is not None: - threshold = max(self._avg_lim, self._ai_threshold) - if data > threshold: - return True + if self._ai_threshold is not None and data > self._ai_threshold: + return True if self._abs_threshold is not None and data > self._abs_threshold: return True return False diff --git a/src/sentryPlugins/avg_block_io/module_conn.py b/src/sentryPlugins/avg_block_io/module_conn.py index 06af8b4..5a64304 100644 --- a/src/sentryPlugins/avg_block_io/module_conn.py +++ b/src/sentryPlugins/avg_block_io/module_conn.py @@ -94,6 +94,7 @@ def process_report_data(disk_name, rw, io_data): msg["block_stack"] = f"bio,{stage_name}" msg["alarm_type"] = abnormal_list log_slow_win(msg, "IO press") + del msg["details"]["iodump_msg"] # 极端场景下iodump_msg可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) return @@ -104,6 +105,7 @@ def process_report_data(disk_name, rw, io_data): msg["block_stack"] = "bio,rq_driver" msg["alarm_type"] = abnormal_list log_slow_win(msg, "driver slow") + del msg["details"]["iodump_msg"] # 极端场景下iodump_msg可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) return @@ -117,10 +119,12 @@ def process_report_data(disk_name, rw, io_data): msg["block_stack"] = f"bio,{stage_name}" msg["alarm_type"] = abnormal_list log_slow_win(msg, "kernel slow") + del msg["details"]["iodump_msg"] # 极端场景下iodump_msg可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) return log_slow_win(msg, "unknown") + del msg["details"]["iodump_msg"] # 极端场景下iodump_msg可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) -- Gitee From bc8abd681e789b2212770a2db43c85d72cb6a775 Mon Sep 17 00:00:00 2001 From: hewh Date: Mon, 13 Oct 2025 10:30:24 +0800 Subject: [PATCH 40/62] add test . 
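The new extra_iodump_log attributes each dumped request to a latency segment by inverting the stage-group table into a stack-to-stage lookup. A compact equivalent of that inversion (the dict comprehension is just a restatement of the nested loops in the patch; lookups of stacks outside the table fall back to 'Unknown'):

    STAGE_GROUPS = {
        'B->Q': ['throtl', 'wbt', 'iocost'],
        'Q->G': ['gettag'],
        'G->I': ['plug'],
        'I->D': ['deadline', 'bfq', 'hctx', 'requeue'],
        'D->C': ['rq_driver'],
    }
    stack_to_stage = {stack: stage
                      for stage, stacks in STAGE_GROUPS.items()
                      for stack in stacks}
    print(stack_to_stage.get('rq_driver', 'Unknown'))   # D->C
    print(stack_to_stage.get('bio', 'Unknown'))         # Unknown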
--- .../pySentryCollector/collect_plugin.py | 4 ++ .../avg_block_io/extra_logger.py | 45 ++++++++++++++++++- src/sentryPlugins/avg_block_io/module_conn.py | 3 ++ .../avg_block_io/stage_window.py | 1 + src/sentryPlugins/avg_block_io/utils.py | 1 + src/services/sentryCollector/collect_io.py | 2 +- 6 files changed, 54 insertions(+), 2 deletions(-) diff --git a/src/libsentry/python/pySentryCollector/collect_plugin.py b/src/libsentry/python/pySentryCollector/collect_plugin.py index 4e7f6bb..9e4d16c 100644 --- a/src/libsentry/python/pySentryCollector/collect_plugin.py +++ b/src/libsentry/python/pySentryCollector/collect_plugin.py @@ -235,6 +235,7 @@ def inter_is_iocollect_valid(period, disk_list=None, stage=None): result['message'] = result_message return result + def inter_get_io_common(period, disk_list, stage, iotype, protocol): result = {} result['ret'] = ResultMessage.RESULT_UNKNOWN @@ -285,6 +286,7 @@ def inter_get_io_common(period, disk_list, stage, iotype, protocol): result['message'] = result_message return result + def get_io_data(period, disk_list, stage, iotype): result = inter_get_io_common(period, disk_list, stage, iotype, ClientProtocol.GET_IO_DATA) error_code = result['ret'] @@ -292,6 +294,7 @@ def get_io_data(period, disk_list, stage, iotype): result['message'] = Result_Messages[error_code] return result + def get_iodump_msg(period, disk_list, stage, iotype): result = inter_get_io_common(period, disk_list, stage, iotype, ClientProtocol.GET_IODUMP_MSG) error_code = result['ret'] @@ -299,6 +302,7 @@ def get_iodump_msg(period, disk_list, stage, iotype): result['message'] = Result_Messages[error_code] return result + def get_disk_type(disk): result = {} result['ret'] = ResultMessage.RESULT_UNKNOWN diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py index 86c9820..8031876 100644 --- a/src/sentryPlugins/avg_block_io/extra_logger.py +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -11,6 +11,7 @@ import logging import os import re +import ast extra_logger = None @@ -44,6 +45,7 @@ def extra_slow_log(msg): extra_latency_log(msg) return if "iodump" in str(msg.get('alarm_type', '')): + extra_iodump_log(msg) return @@ -62,7 +64,7 @@ def extra_latency_log(msg): if 'rq_driver' in iops_data and iops_data['rq_driver']: iops_avg = sum(iops_data['rq_driver']) / len(iops_data['rq_driver']) - extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {msg['io_type']}, iops: {int(iops_avg)}") + extra_logger.warning(f"[SLOW IO] alarm_type: latency, disk: {msg['driver_name']}, iotype: {msg['io_type']}, iops: {int(iops_avg)}") # Parse the latency string from msg latency_str = msg['details']['latency'] @@ -161,3 +163,44 @@ def extra_latency_log(msg): ) except KeyError: return + + +def extra_iodump_log(msg): + extra_logger.warning(f"[SLOW IO] alarm_type: iodump, disk: {msg['driver_name']}, iotype: {msg['io_type']}") + iodump_str = msg['details']['iodump_msg'] + + groups = { + 'B->Q': ['throtl', 'wbt', 'iocost'], + 'Q->G': ['gettag'], + 'G->I': ['plug'], + 'I->D': ['deadline', 'bfq', 'hctx', 'requeue'], + 'D->C': ['rq_driver'] + } + + try: + list_str = iodump_str.split('bio: ', 1)[-1].strip() + bio_data = ast.literal_eval(list_str) + except Exception as e: + extra_logger.error(f"Failed to parse iodump data: {e}") + return + + stack_to_stage = {} + for stage, stacks in groups.items(): + for stack in stacks: + stack_to_stage[stack] = stage + + last_bio_record = {} + for window in bio_data: + for entry in window: + parts = 
entry.split(',') + task_name, pid, io_stack, bio_ptr, start_ago = parts + stage = stack_to_stage.get(io_stack, 'Unknown') + last_bio_record[bio_ptr] = (task_name, pid, io_stack, stage, bio_ptr, start_ago) + + header = f"{'TASK_NAME':<10} {'PID':>8} {'IO_STACK':<12} {'STAGE':<8} {'BIO_PTR':<20} {'START_AGO(ms)':>10}" + extra_logger.warning(header) + + for bio_ptr in last_bio_record: + task_name, pid, io_stack, stage, bio_ptr, start_ago = last_bio_record[bio_ptr] + line = f"{task_name:<10} {pid:>8} {io_stack:<12} {stage:<8} {bio_ptr:<20} {start_ago:>10}" + extra_logger.warning(line) \ No newline at end of file diff --git a/src/sentryPlugins/avg_block_io/module_conn.py b/src/sentryPlugins/avg_block_io/module_conn.py index 5a64304..5feea7b 100644 --- a/src/sentryPlugins/avg_block_io/module_conn.py +++ b/src/sentryPlugins/avg_block_io/module_conn.py @@ -33,6 +33,7 @@ def avg_get_io_data(io_dic): res = get_io_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"]) return check_result_validation(res, 'get io data') + def avg_get_iodump_msg(io_dic): """avg_get_iodump_msg from sentryCollector""" logging.debug(f"send to sentryCollector avg_get_iodump_msg: period={io_dic['period_time']}, " @@ -40,6 +41,7 @@ def avg_get_iodump_msg(io_dic): res = get_iodump_msg(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"]) return check_result_validation(res, 'get io dump data') + def avg_is_iocollect_valid(io_dic, config_disk, config_stage): """is_iocollect_valid from sentryCollector""" logging.debug(f"send to sentryCollector is_iocollect_valid: period={io_dic['period_time']}, " @@ -65,6 +67,7 @@ def check_result_validation(res, reason): return True, json_data + def report_alarm_fail(alarm_info): """report result to xalarmd""" report_result(TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": alarm_info})) diff --git a/src/sentryPlugins/avg_block_io/stage_window.py b/src/sentryPlugins/avg_block_io/stage_window.py index ca69e0c..3b4840d 100644 --- a/src/sentryPlugins/avg_block_io/stage_window.py +++ b/src/sentryPlugins/avg_block_io/stage_window.py @@ -59,6 +59,7 @@ class IopsWindow(AbnormalWindowBase): def is_abnormal_period(self, value, avg_val=10): return False + class IodumpMsgWindow: def __init__(self, window_size=10): self.window_size = window_size diff --git a/src/sentryPlugins/avg_block_io/utils.py b/src/sentryPlugins/avg_block_io/utils.py index 261a7fe..5a8bd10 100644 --- a/src/sentryPlugins/avg_block_io/utils.py +++ b/src/sentryPlugins/avg_block_io/utils.py @@ -151,6 +151,7 @@ def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data) set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size)) return True + def update_avg_iodump_msg(iodump_msg, is_success, io_key, io_data): """update iodump data to io_data""" all_wins = get_nested_value(io_data, io_key) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index b70206e..f675233 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -116,7 +116,7 @@ class CollectIo(): curr_io_dump = self.get_io_dump(disk_name, stage, index) IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]].insert(0, [curr_lat, curr_io_dump, curr_io_length, curr_iops]) - if (curr_io_dump == 0) : + if curr_io_dump == 0: IO_DUMP_DATA[disk_name][stage][Io_Category[index]].insert(0, []) def get_iops(self, curr_stage_value, last_stage_value, category): -- Gitee From 
f75cf4886f0c93b959c5196d696c51f926b7552d Mon Sep 17 00:00:00 2001 From: hewh Date: Tue, 14 Oct 2025 15:28:40 +0800 Subject: [PATCH 41/62] add test . --- src/sentryPlugins/ai_block_io/ai_block_io.py | 15 ++++++ src/sentryPlugins/ai_block_io/data_access.py | 36 ++++++++++++++ .../avg_block_io/extra_logger.py | 47 ++++++++----------- src/sentryPlugins/avg_block_io/utils.py | 22 +++++---- src/services/sentryCollector/collect_io.py | 6 +-- .../sentryCollector/collect_server.py | 14 +++--- 6 files changed, 92 insertions(+), 48 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/ai_block_io.py b/src/sentryPlugins/ai_block_io/ai_block_io.py index 04fdd27..b77285f 100644 --- a/src/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/sentryPlugins/ai_block_io/ai_block_io.py @@ -21,6 +21,7 @@ from .utils import get_data_queue_size_and_update_size from .config_parser import ConfigParser from .data_access import ( get_io_data_from_collect_plug, + get_iodump_data_from_collect_plug, check_collect_valid, get_disk_type, check_disk_is_available @@ -95,6 +96,7 @@ class SlowIODetection: self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "latency")) self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "io_dump")) self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "iops")) + self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "iodump_data")) if not self._detector_name_list: Report.report_pass("the disks to detection is empty, ai_block_io will exit.") @@ -177,6 +179,16 @@ class SlowIODetection: detector = Detector(metric_name, threshold, sliding_window) disk_detector.add_detector(detector) + elif metric_name.metric_name == 'iodump_data': + threshold = ThresholdFactory().get_threshold(ThresholdType.AbsoluteThreshold) + sliding_window = SlidingWindowFactory().get_sliding_window( + sliding_window_type, + queue_length=window_size, + threshold=window_threshold + ) + detector = Detector(metric_name, threshold, sliding_window) + disk_detector.add_detector(detector) + logging.info(f"disk: [{disk}] add detector:\n [{disk_detector}]") self._disk_detectors[disk] = disk_detector @@ -188,6 +200,9 @@ class SlowIODetection: io_data_dict_with_disk_name = get_io_data_from_collect_plug( self._config_parser.period_time, self._disk_list ) + iodump_data_dict_with_disk_name = get_iodump_data_from_collect_plug( + self._config_parser.period_time, self._disk_list + ) logging.debug(f"step1. 
Get io data: {str(io_data_dict_with_disk_name)}") if io_data_dict_with_disk_name is None: Report.report_pass( diff --git a/src/sentryPlugins/ai_block_io/data_access.py b/src/sentryPlugins/ai_block_io/data_access.py index 845c76a..85abcad 100644 --- a/src/sentryPlugins/ai_block_io/data_access.py +++ b/src/sentryPlugins/ai_block_io/data_access.py @@ -15,6 +15,7 @@ import logging from sentryCollector.collect_plugin import ( Result_Messages, get_io_data, + get_iodump_data, is_iocollect_valid, get_disk_type ) @@ -126,3 +127,38 @@ def get_io_data_from_collect_plug(period, disk_list): return ret logging.warning(f'get io data failed with message: {data_raw["message"]}') return None + + +def _get_raw_iodump_data(period, disk_list): + return get_iodump_data( + period, + disk_list, + COLLECT_STAGES, + ["read", "write", "flush", "discard"], + ) + + +def get_iodump_data_from_collect_plug(period, disk_list): + data_raw = _get_raw_iodump_data(period, disk_list) + if data_raw["ret"] == 0: + ret = {} + try: + data = json.loads(data_raw["message"]) + except json.decoder.JSONDecodeError as e: + logging.warning(f"get iodump data failed, {e}") + return None + + for disk in data: + disk_data = data[disk] + disk_ret = IOData() + for k, v in disk_data.items(): + try: + getattr(disk_ret, k) + setattr(disk_ret, k, _get_io_stage_data(v)) + except AttributeError: + logging.debug(f"no attr {k}") + continue + ret[disk] = disk_ret + return ret + logging.warning(f'get io data failed with message: {data_raw["message"]}') + return None \ No newline at end of file diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py index 8031876..a7a9929 100644 --- a/src/sentryPlugins/avg_block_io/extra_logger.py +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -16,6 +16,16 @@ import ast extra_logger = None +STAGE_GROUPS = { + 'B->Q': ['throtl', 'wbt', 'iocost'], + 'Q->G': ['gettag'], + 'G->I': ['plug'], + 'I->D': ['deadline', 'bfq', 'hctx', 'requeue'], + 'D->C': ['rq_driver'] +} + +PATTERN = re.compile(r'(\w+):\s*\[([0-9.,]+)\]') + def init_extra_logger(log_path, log_level, log_format): global extra_logger try: @@ -50,12 +60,10 @@ def extra_slow_log(msg): def extra_latency_log(msg): - pattern = r'(\w+):\s*\[([0-9.,]+)\]' - # Parse the iops string from msg iops_avg = 0 iops_str = msg['details']['iops'] - iops_matches = re.findall(pattern, iops_str) + iops_matches = re.findall(PATTERN, iops_str) iops_data = {} for match in iops_matches: key = match[0] @@ -68,25 +76,16 @@ def extra_latency_log(msg): # Parse the latency string from msg latency_str = msg['details']['latency'] - latency_matches = re.findall(pattern, latency_str) + latency_matches = re.findall(PATTERN, latency_str) latency_data = {} for match in latency_matches: key = match[0] values = list(map(float, match[1].split(','))) latency_data[key] = values - # Define stage groups - groups = { - 'B->Q': ['throtl', 'wbt', 'iocost'], - 'Q->G': ['gettag'], - 'G->I': ['plug'], - 'I->D': ['deadline', 'bfq', 'hctx', 'requeue'], - 'D->C': ['rq_driver'] - } - # Calculate statistics for each group group_stats = {} - for group_name, stages in groups.items(): + for group_name, stages in STAGE_GROUPS.items(): all_values = [] for stage in stages: if stage in latency_data: @@ -113,7 +112,7 @@ def extra_latency_log(msg): total_avg = 0 total_min = 0 total_max = 0 - for group_name in groups: + for group_name in STAGE_GROUPS: total_avg += group_stats[group_name]['avg'] total_min += group_stats[group_name]['min'] total_max += 
group_stats[group_name]['max'] @@ -124,7 +123,7 @@ def extra_latency_log(msg): } # Calculate PCT for each group (except B->C) - for group_name in groups: + for group_name in STAGE_GROUPS: if total_avg > 0: pct = (group_stats[group_name]['avg'] / total_avg) * 100 else: @@ -167,25 +166,17 @@ def extra_latency_log(msg): def extra_iodump_log(msg): extra_logger.warning(f"[SLOW IO] alarm_type: iodump, disk: {msg['driver_name']}, iotype: {msg['io_type']}") - iodump_str = msg['details']['iodump_msg'] - - groups = { - 'B->Q': ['throtl', 'wbt', 'iocost'], - 'Q->G': ['gettag'], - 'G->I': ['plug'], - 'I->D': ['deadline', 'bfq', 'hctx', 'requeue'], - 'D->C': ['rq_driver'] - } + iodump_str = msg['details']['iodump_data'] try: - list_str = iodump_str.split('bio: ', 1)[-1].strip() - bio_data = ast.literal_eval(list_str) + iodump_data = ast.literal_eval(iodump_str) + bio_data = iodump_data['bio'] except Exception as e: extra_logger.error(f"Failed to parse iodump data: {e}") return stack_to_stage = {} - for stage, stacks in groups.items(): + for stage, stacks in STAGE_GROUPS.items(): for stack in stacks: stack_to_stage[stack] = stage diff --git a/src/sentryPlugins/avg_block_io/utils.py b/src/sentryPlugins/avg_block_io/utils.py index 5a8bd10..4c3977c 100644 --- a/src/sentryPlugins/avg_block_io/utils.py +++ b/src/sentryPlugins/avg_block_io/utils.py @@ -41,7 +41,7 @@ def get_win_data(disk_name, rw, io_data): latency = '' iodump = '' iops = '' - iodump_msg = '' + iodump_data = '' for stage_name in io_data[disk_name]: if 'latency' in io_data[disk_name][stage_name][rw]: latency_list = io_data[disk_name][stage_name][rw]['latency'].window_data_to_string() @@ -52,10 +52,12 @@ def get_win_data(disk_name, rw, io_data): if 'iops' in io_data[disk_name][stage_name][rw]: iops_list = io_data[disk_name][stage_name][rw]['iops'].window_data_to_string() iops += f'{stage_name}: [{iops_list}], ' - if 'iodump_msg' in io_data[disk_name][stage_name][rw] and stage_name == 'bio': - iodump_msg_list = io_data[disk_name][stage_name][rw]['iodump_msg'].window_data_to_string() - iodump_msg += f'{stage_name}: [{iodump_msg_list}], ' - return {"latency": latency[:-2], "iodump": iodump[:-2], "iops": iops[:-2], "iodump_msg": iodump_msg[:-2]} + if 'iodump_data' in io_data[disk_name][stage_name][rw]: + iodump_data_list = io_data[disk_name][stage_name][rw]['iodump_data'].window_data_to_string() + iodump_data += f'{stage_name}: [{iodump_data_list}], ' + if iodump_data: + iodump_data = '{' + iodump_data[:-2] + '}' + return {"latency": latency[:-2], "iodump": iodump[:-2], "iops": iops[:-2], "iodump_data": iodump_data} def is_abnormal(io_key, io_data): @@ -152,13 +154,13 @@ def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data) return True -def update_avg_iodump_msg(iodump_msg, is_success, io_key, io_data): +def update_avg_iodump_data(iodump_data, is_success, io_key, io_data): """update iodump data to io_data""" all_wins = get_nested_value(io_data, io_key) - if all_wins and "iodump_msg" in all_wins: + if all_wins and "iodump_data" in all_wins: if not is_success: - io_data[io_key[0]][io_key[1]][io_key[2]]["iodump_msg"].append_new_data([]) + io_data[io_key[0]][io_key[1]][io_key[2]]["iodump_data"].append_new_data([]) else: - period_value = get_nested_value(iodump_msg, io_key) - io_data[io_key[0]][io_key[1]][io_key[2]]["iodump_msg"].append_new_data(period_value) + period_value = get_nested_value(iodump_data, io_key) + io_data[io_key[0]][io_key[1]][io_key[2]]["iodump_data"].append_new_data(period_value) diff --git 
a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index f675233..3726062 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -33,8 +33,8 @@ EBPF_PROCESS = None EBPF_STAGE_LIST = ["wbt", "rq_driver", "bio", "gettag"] EBPF_SUPPORT_VERSION = ["6.6.0"] -#iodump msg limit -IO_DUMP_MSG_LIMIT = 10 +#iodump data limit +IO_DUMP_DATA_LIMIT = 10 class IoStatus(): TOTAL = 0 @@ -217,7 +217,7 @@ class CollectIo(): if line.count('.op=' + Io_Category[category].upper()) > 0: match = pattern.match(line) if match: - if count < IO_DUMP_MSG_LIMIT: + if count < IO_DUMP_DATA_LIMIT: parsed = match.groupdict() values = [ parsed["task_name"], diff --git a/src/services/sentryCollector/collect_server.py b/src/services/sentryCollector/collect_server.py index 87c31bd..5ed1e65 100644 --- a/src/services/sentryCollector/collect_server.py +++ b/src/services/sentryCollector/collect_server.py @@ -48,7 +48,7 @@ RES_MAGIC = "RES" class ServerProtocol(): IS_IOCOLLECT_VALID = 0 GET_IO_DATA = 1 - GET_IODUMP_MSG = 2 + GET_IODUMP_DATA = 2 PRO_END = 3 class CollectServer(): @@ -121,12 +121,12 @@ class CollectServer(): if len(stage_list) > 0 and stage_name not in stage_list: continue result_rev[disk_name][stage_name] = {} - for iotype_name, iotype_info in iotype_info.items(): + for iotype_name, iotype_data in iotype_info.items(): if iotype_name not in iotype_list: continue - if len(iotype_info) - 1 < collect_index: + if len(iotype_data) - 1 < collect_index: continue - result_rev[disk_name][stage_name][iotype_name] = iotype_info[collect_index] + result_rev[disk_name][stage_name][iotype_name] = iotype_data[collect_index] return json.dumps(result_rev) @@ -134,7 +134,7 @@ class CollectServer(): self.io_global_data = IO_GLOBAL_DATA return self.get_io_common(data_struct, self.io_global_data) - def get_iodump_msg(self, data_struct): + def get_iodump_data(self, data_struct): return self.get_io_common(data_struct, IO_DUMP_DATA) def msg_data_process(self, msg_data, protocal_id): @@ -151,8 +151,8 @@ class CollectServer(): res_msg = self.is_iocollect_valid(data_struct) elif protocal_id == ServerProtocol.GET_IO_DATA: res_msg = self.get_io_data(data_struct) - elif protocal_id == ServerProtocol.GET_IODUMP_MSG: - res_msg = self.get_iodump_msg(data_struct) + elif protocal_id == ServerProtocol.GET_IODUMP_DATA: + res_msg = self.get_iodump_data(data_struct) return res_msg -- Gitee From 198deb1b8329d4cb486b592c8201b5e26de4b5b2 Mon Sep 17 00:00:00 2001 From: hewh Date: Tue, 14 Oct 2025 15:46:53 +0800 Subject: [PATCH 42/62] add test . 
--- src/sentryPlugins/avg_block_io/avg_block_io.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/sentryPlugins/avg_block_io/avg_block_io.py b/src/sentryPlugins/avg_block_io/avg_block_io.py index 292804a..df86138 100644 --- a/src/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/sentryPlugins/avg_block_io/avg_block_io.py @@ -15,8 +15,8 @@ import time from .config import read_config_log, read_config_common, read_config_algorithm, read_config_latency, read_config_iodump, read_config_stage from .stage_window import IoWindow, IoDumpWindow,IopsWindow,IodumpMsgWindow -from .module_conn import avg_is_iocollect_valid, avg_get_io_data, avg_get_iodump_msg, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name, check_disk_list_validation -from .utils import update_avg_and_check_abnormal, update_avg_iodump_msg +from .module_conn import avg_is_iocollect_valid, avg_get_io_data, avg_get_iodump_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name, check_disk_list_validation +from .utils import update_avg_and_check_abnormal, update_avg_iodump_data from .extra_logger import init_extra_logger CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" @@ -64,8 +64,8 @@ def init_io_win(io_dic, config, common_param): io_data[disk_name][stage_name][rw]["iops"] = IopsWindow(window_size=io_dic["win_size"]) logging.debug("Successfully create {}-{}-{}-iops window".format(disk_name, stage_name, rw)) - io_data[disk_name][stage_name][rw]["iodump_msg"] = IodumpMsgWindow(window_size=io_dic["win_size"]) - logging.debug("Successfully create {}-{}-{}-iodump_msg window".format(disk_name, stage_name, rw)) + io_data[disk_name][stage_name][rw]["iodump_data"] = IodumpMsgWindow(window_size=io_dic["win_size"]) + logging.debug("Successfully create {}-{}-{}-iodump_data window".format(disk_name, stage_name, rw)) return io_data, io_avg_value @@ -133,7 +133,7 @@ def main_loop(io_dic, io_data, io_avg_value): continue # 获取iodump的详细信息 - is_success, iodump_msg = avg_get_iodump_msg(io_dic) + is_success, iodump_data = avg_get_iodump_data(io_dic) # 处理周期数据 reach_size = False @@ -143,7 +143,7 @@ def main_loop(io_dic, io_data, io_avg_value): if disk_name in curr_period_data and stage_name in curr_period_data[disk_name] and rw in curr_period_data[disk_name][stage_name]: io_key = (disk_name, stage_name, rw) reach_size = update_avg_and_check_abnormal(curr_period_data, io_key, win_size, io_avg_value, io_data) - update_avg_iodump_msg(iodump_msg, is_success, io_key, io_data) + update_avg_iodump_data(iodump_data, is_success, io_key, io_data) # win_size不满时不进行告警判断 if not reach_size: -- Gitee From 4e3fab44a7ebe93f42b922b128c7e9c8c3d02c5e Mon Sep 17 00:00:00 2001 From: hewh Date: Tue, 14 Oct 2025 15:49:20 +0800 Subject: [PATCH 43/62] add test . 
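With window_data_to_string reduced to str(self.window_data), the iodump detail string becomes a valid Python literal once the stage name is quoted and the whole thing is wrapped in braces (that quoting lands in the next patch of this series), which is what lets the extra logger recover it with ast.literal_eval. A round-trip sketch with invented window contents:

    import ast

    # invented window contents for the bio stage: one populated period, one empty
    bio_window = [["kworker,1234,rq_driver,ffff8881000a2d00,2500"], []]

    iodump_data = '"bio": ' + str(bio_window) + ', '
    iodump_data = '{' + iodump_data[:-2] + '}'          # as get_win_data assembles it

    parsed = ast.literal_eval(iodump_data)              # as the extra logger reads it back
    print(parsed['bio'][0][0].split(',')[2])            # rq_driver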
--- .../python/pySentryCollector/collect_plugin.py | 6 +++--- src/sentryPlugins/ai_block_io/extra_logger.py | 2 +- src/sentryPlugins/avg_block_io/module_conn.py | 18 +++++++++--------- src/sentryPlugins/avg_block_io/stage_window.py | 10 +--------- 4 files changed, 14 insertions(+), 22 deletions(-) diff --git a/src/libsentry/python/pySentryCollector/collect_plugin.py b/src/libsentry/python/pySentryCollector/collect_plugin.py index 9e4d16c..e1befe7 100644 --- a/src/libsentry/python/pySentryCollector/collect_plugin.py +++ b/src/libsentry/python/pySentryCollector/collect_plugin.py @@ -52,7 +52,7 @@ LIMIT_MAX_SAVE_LEN = 300 class ClientProtocol(): IS_IOCOLLECT_VALID = 0 GET_IO_DATA = 1 - GET_IODUMP_MSG = 2 + GET_IODUMP_DATA = 2 PRO_END = 3 class ResultMessage(): @@ -295,8 +295,8 @@ def get_io_data(period, disk_list, stage, iotype): return result -def get_iodump_msg(period, disk_list, stage, iotype): - result = inter_get_io_common(period, disk_list, stage, iotype, ClientProtocol.GET_IODUMP_MSG) +def get_iodump_data(period, disk_list, stage, iotype): + result = inter_get_io_common(period, disk_list, stage, iotype, ClientProtocol.GET_IODUMP_DATA) error_code = result['ret'] if error_code != ResultMessage.RESULT_SUCCEED: result['message'] = Result_Messages[error_code] diff --git a/src/sentryPlugins/ai_block_io/extra_logger.py b/src/sentryPlugins/ai_block_io/extra_logger.py index 55e8526..287f759 100644 --- a/src/sentryPlugins/ai_block_io/extra_logger.py +++ b/src/sentryPlugins/ai_block_io/extra_logger.py @@ -57,7 +57,7 @@ def extra_latency_log(msg): if 'rq_driver' in iops_data_dict: iops_avg = sum(iops_data_dict['rq_driver']) / len(iops_data_dict['rq_driver']) - extra_logger.warning(f"[SLOW IO] disk: {msg['driver_name']}, iotype: {io_type}, iops: {int(iops_avg)}") + extra_logger.warning(f"[SLOW IO] alarm_type: latency, disk: {msg['driver_name']}, iotype: {io_type}, iops: {int(iops_avg)}") # Define stage groups groups = { diff --git a/src/sentryPlugins/avg_block_io/module_conn.py b/src/sentryPlugins/avg_block_io/module_conn.py index 5feea7b..5e4a3d0 100644 --- a/src/sentryPlugins/avg_block_io/module_conn.py +++ b/src/sentryPlugins/avg_block_io/module_conn.py @@ -13,7 +13,7 @@ import logging import sys from .utils import is_abnormal, get_win_data, log_slow_win -from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, get_iodump_msg, Result_Messages, get_disk_type, Disk_Type +from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, get_iodump_data, Result_Messages, get_disk_type, Disk_Type from syssentry.result import ResultLevel, report_result from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR @@ -34,11 +34,11 @@ def avg_get_io_data(io_dic): return check_result_validation(res, 'get io data') -def avg_get_iodump_msg(io_dic): - """avg_get_iodump_msg from sentryCollector""" - logging.debug(f"send to sentryCollector avg_get_iodump_msg: period={io_dic['period_time']}, " +def avg_get_iodump_data(io_dic): + """avg_get_iodump_data from sentryCollector""" + logging.debug(f"send to sentryCollector avg_get_iodump_data: period={io_dic['period_time']}, " f"disk={io_dic['disk_list']}, stage={io_dic['stage_list']}, iotype={io_dic['iotype_list']}") - res = get_iodump_msg(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"]) + res = get_iodump_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"]) return check_result_validation(res, 'get io dump data') @@ -97,7 +97,7 @@ def 
process_report_data(disk_name, rw, io_data): msg["block_stack"] = f"bio,{stage_name}" msg["alarm_type"] = abnormal_list log_slow_win(msg, "IO press") - del msg["details"]["iodump_msg"] # 极端场景下iodump_msg可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 + del msg["details"]["iodump_data"] # 极端场景下iodump_data可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) return @@ -108,7 +108,7 @@ def process_report_data(disk_name, rw, io_data): msg["block_stack"] = "bio,rq_driver" msg["alarm_type"] = abnormal_list log_slow_win(msg, "driver slow") - del msg["details"]["iodump_msg"] # 极端场景下iodump_msg可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 + del msg["details"]["iodump_data"] # 极端场景下iodump_data可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) return @@ -122,12 +122,12 @@ def process_report_data(disk_name, rw, io_data): msg["block_stack"] = f"bio,{stage_name}" msg["alarm_type"] = abnormal_list log_slow_win(msg, "kernel slow") - del msg["details"]["iodump_msg"] # 极端场景下iodump_msg可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 + del msg["details"]["iodump_data"] # 极端场景下iodump_data可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) return log_slow_win(msg, "unknown") - del msg["details"]["iodump_msg"] # 极端场景下iodump_msg可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 + del msg["details"]["iodump_data"] # 极端场景下iodump_data可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) diff --git a/src/sentryPlugins/avg_block_io/stage_window.py b/src/sentryPlugins/avg_block_io/stage_window.py index 3b4840d..29fa6e1 100644 --- a/src/sentryPlugins/avg_block_io/stage_window.py +++ b/src/sentryPlugins/avg_block_io/stage_window.py @@ -70,12 +70,4 @@ class IodumpMsgWindow: self.window_data.append(msg) def window_data_to_string(self): - str_list = [] - for period in self.window_data: - if not period: - str_list.append("[]") - else: - item_values = [f'"{item}"' for item in period] - period_str = "[" + ", ".join(item_values) + "]" - str_list.append(period_str) - return ", ".join(str_list) \ No newline at end of file + return str(self.window_data) \ No newline at end of file -- Gitee From c77ab84d7a7de469204487de0fa8ed068c9f84d2 Mon Sep 17 00:00:00 2001 From: hewh Date: Tue, 14 Oct 2025 17:02:27 +0800 Subject: [PATCH 44/62] add test . --- src/sentryPlugins/avg_block_io/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sentryPlugins/avg_block_io/utils.py b/src/sentryPlugins/avg_block_io/utils.py index 4c3977c..d9af7fe 100644 --- a/src/sentryPlugins/avg_block_io/utils.py +++ b/src/sentryPlugins/avg_block_io/utils.py @@ -54,9 +54,9 @@ def get_win_data(disk_name, rw, io_data): iops += f'{stage_name}: [{iops_list}], ' if 'iodump_data' in io_data[disk_name][stage_name][rw]: iodump_data_list = io_data[disk_name][stage_name][rw]['iodump_data'].window_data_to_string() - iodump_data += f'{stage_name}: [{iodump_data_list}], ' - if iodump_data: - iodump_data = '{' + iodump_data[:-2] + '}' + iodump_data += f'"{stage_name}": {iodump_data_list}, ' + if iodump_data: + iodump_data = '{' + iodump_data[:-2] + '}' return {"latency": latency[:-2], "iodump": iodump[:-2], "iops": iops[:-2], "iodump_data": iodump_data} -- Gitee From a7ce7ea15071844c71a59cb798564ff1b40dda5a Mon Sep 17 00:00:00 2001 From: hewh Date: Wed, 15 Oct 2025 19:43:19 +0800 Subject: [PATCH 45/62] add test . 
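The io_dump pattern is now compiled once in __init__, and its named capture groups have to line up with the groupdict() keys the parser reads (task_name, pid, stage, ptr, start_time_ns). A self-contained restatement of that parsing; the sample line below is invented and only shaped so the pattern accepts it, since the exact text under /sys/kernel/debug/block/<disk>/blk_io_hierarchy/<stage>/io_dump may differ:

    import re

    IODUMP_PATTERN = re.compile(
        r'(?P<task_name>\w+)-(?P<pid>\d+)\s+'
        r'\w+\s+'
        r'stage\s+(?P<stage>\w+)\s+'
        r'(?P<ptr>[0-9a-fA-F]{16})\s+'
        r'.*started\s+(?P<start_time_ns>\d+)\s+ns\s+ago'
    )

    sample = "kworker-1234 W stage rq_driver ffff8881000a2d00 .op=WRITE started 2500000000 ns ago"
    match = IODUMP_PATTERN.match(sample)
    if match:
        parsed = match.groupdict()
        print(parsed["task_name"], parsed["stage"], int(parsed["start_time_ns"]) // 1000000)
        # kworker rq_driver 2500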
--- src/sentryPlugins/ai_block_io/ai_block_io.py | 22 ++++----- src/sentryPlugins/ai_block_io/data_access.py | 14 ++++-- src/sentryPlugins/ai_block_io/detector.py | 45 ++++++++++++++++++- src/sentryPlugins/ai_block_io/extra_logger.py | 2 +- src/sentryPlugins/ai_block_io/io_data.py | 29 +++++++++++- .../ai_block_io/sliding_window.py | 18 ++++++++ src/sentryPlugins/ai_block_io/utils.py | 12 ++++- .../avg_block_io/extra_logger.py | 3 +- src/services/sentryCollector/collect_io.py | 20 ++++----- 9 files changed, 132 insertions(+), 33 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/ai_block_io.py b/src/sentryPlugins/ai_block_io/ai_block_io.py index b77285f..e26a880 100644 --- a/src/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/sentryPlugins/ai_block_io/ai_block_io.py @@ -14,9 +14,9 @@ import signal import logging from collections import defaultdict -from .detector import Detector, DiskDetector +from .detector import Detector, DiskDetector, DataDetector from .threshold import ThresholdFactory, ThresholdType -from .sliding_window import SlidingWindowFactory +from .sliding_window import SlidingWindowFactory, DataWindow from .utils import get_data_queue_size_and_update_size from .config_parser import ConfigParser from .data_access import ( @@ -180,14 +180,9 @@ class SlowIODetection: disk_detector.add_detector(detector) elif metric_name.metric_name == 'iodump_data': - threshold = ThresholdFactory().get_threshold(ThresholdType.AbsoluteThreshold) - sliding_window = SlidingWindowFactory().get_sliding_window( - sliding_window_type, - queue_length=window_size, - threshold=window_threshold - ) - detector = Detector(metric_name, threshold, sliding_window) - disk_detector.add_detector(detector) + data_window = DataWindow(window_size) + data_detector = DataDetector(metric_name, data_window) + disk_detector.add_data_detector(data_detector) logging.info(f"disk: [{disk}] add detector:\n [{disk_detector}]") self._disk_detectors[disk] = disk_detector @@ -214,9 +209,12 @@ class SlowIODetection: logging.debug("step2. Start to detection slow io event.") slow_io_event_list = [] for disk, disk_detector in self._disk_detectors.items(): + disk_detector.push_data_to_data_detectors(iodump_data_dict_with_disk_name) result = disk_detector.is_slow_io_event(io_data_dict_with_disk_name) if result[0]: + result[6]["iodump_data"] = disk_detector.get_data_detector_list_window() slow_io_event_list.append(result) + logging.debug("step2. 
End to detection slow io event.") # Step3:慢IO事件上报 @@ -231,8 +229,9 @@ class SlowIODetection: "alarm_type": slow_io_event[5], "details": slow_io_event[6] } - Xalarm.major(alarm_content) tmp_alarm_content = alarm_content.copy() + del tmp_alarm_content["details"]["iodump_data"] # 极端场景下iodump_data可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 + Xalarm.major(tmp_alarm_content) del tmp_alarm_content["details"] logging.warning("[SLOW IO] " + str(tmp_alarm_content)) logging.warning(f'[SLOW IO] disk: {str(tmp_alarm_content.get("driver_name"))}, ' @@ -243,6 +242,7 @@ class SlowIODetection: logging.warning(f"latency: " + str(alarm_content.get("details").get("latency"))) logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump"))) logging.warning(f"iops: " + str(alarm_content.get("details").get("iops"))) + logging.warning(f"iodump_data: " + str(alarm_content.get("details").get("iodump_data"))) extra_slow_log(alarm_content) # Step4:等待检测时间 diff --git a/src/sentryPlugins/ai_block_io/data_access.py b/src/sentryPlugins/ai_block_io/data_access.py index 85abcad..9be6b49 100644 --- a/src/sentryPlugins/ai_block_io/data_access.py +++ b/src/sentryPlugins/ai_block_io/data_access.py @@ -21,7 +21,7 @@ from sentryCollector.collect_plugin import ( ) -from .io_data import IOStageData, IOData +from .io_data import IOStageData, IOData, IOStageDumpData, IODumpData COLLECT_STAGES = [ "throtl", @@ -137,6 +137,12 @@ def _get_raw_iodump_data(period, disk_list): ["read", "write", "flush", "discard"], ) +def _get_iodump_stage_data(data): + io_stage_data = IOStageDumpData() + for data_type in ("read", "write", "flush", "discard"): + if data_type in data: + getattr(io_stage_data, data_type).strings = data[data_type] + return io_stage_data def get_iodump_data_from_collect_plug(period, disk_list): data_raw = _get_raw_iodump_data(period, disk_list) @@ -150,15 +156,15 @@ def get_iodump_data_from_collect_plug(period, disk_list): for disk in data: disk_data = data[disk] - disk_ret = IOData() + disk_ret = IODumpData() for k, v in disk_data.items(): try: getattr(disk_ret, k) - setattr(disk_ret, k, _get_io_stage_data(v)) + setattr(disk_ret, k, _get_iodump_stage_data(v)) except AttributeError: logging.debug(f"no attr {k}") continue ret[disk] = disk_ret return ret - logging.warning(f'get io data failed with message: {data_raw["message"]}') + logging.warning(f'get iodump data failed with message: {data_raw["message"]}') return None \ No newline at end of file diff --git a/src/sentryPlugins/ai_block_io/detector.py b/src/sentryPlugins/ai_block_io/detector.py index ebcf4c9..57dfb68 100644 --- a/src/sentryPlugins/ai_block_io/detector.py +++ b/src/sentryPlugins/ai_block_io/detector.py @@ -13,8 +13,8 @@ from datetime import datetime from .io_data import MetricName from .threshold import Threshold -from .sliding_window import SlidingWindow -from .utils import get_metric_value_from_io_data_dict_by_metric_name +from .sliding_window import SlidingWindow, DataWindow +from .utils import get_metric_value_from_io_data_dict_by_metric_name, get_metric_value_from_iodump_data_dict class Detector: @@ -74,6 +74,33 @@ class Detector: f' sliding_window_type: {self._slidingWindow}') +class DataDetector: + + def __init__(self, metric_name: MetricName, data_window: DataWindow): + self._metric_name = metric_name + self._data_window = data_window + + def metric_name(self): + return self._metric_name + + def get_data_window_data(self): + return self._data_window.get_data() + + def push_data(self, iodump_data_dict_with_disk_name: dict): + logging.debug(f'enter 
Detector: {self}') + metric_value = get_metric_value_from_iodump_data_dict(iodump_data_dict_with_disk_name, self._metric_name) + if metric_value is None: + logging.debug('not found metric value, so return None.') + return False + logging.debug(f'input metric value: {str(metric_value)}') + self._data_window.push(metric_value) + return True + + def __repr__(self): + return (f'disk_name: {self._metric_name.disk_name}, stage_name: {self._metric_name.stage_name},' + f' io_type_name: {self._metric_name.io_access_type_name},' + f' metric_name: {self._metric_name.metric_name}') + def set_to_str(parameter: set): ret = "" parameter = list(parameter) @@ -91,10 +118,14 @@ class DiskDetector: def __init__(self, disk_name: str): self._disk_name = disk_name self._detector_list = [] + self._data_detector_list = [] def add_detector(self, detector: Detector): self._detector_list.append(detector) + def add_data_detector(self, data_detector: DataDetector): + self._data_detector_list.append(data_detector) + def get_detector_list_window(self): latency_wins = {"read": {}, "write": {}} iodump_wins = {"read": {}, "write": {}} @@ -108,6 +139,12 @@ class DiskDetector: iops_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data() return latency_wins, iodump_wins, iops_wins + def get_data_detector_list_window(self): + iodump_data_wins = {"read": {}, "write": {}} + for data_detector in self._data_detector_list: + iodump_data_wins[data_detector.metric_name.io_access_type_name][data_detector.metric_name.stage_name] = data_detector.get_data_window_data() + return iodump_data_wins + def is_slow_io_event(self, io_data_dict_with_disk_name: dict): diagnosis_info = {"bio": [], "rq_driver": [], "kernel_stack": []} for detector in self._detector_list: @@ -153,6 +190,10 @@ class DiskDetector: return True, driver_name, reason, set_to_str(block_stack), set_to_str(io_type), set_to_str(alarm_type), details + def push_data_to_data_detectors(self, iodump_data_dict_with_disk_name: dict): + for data_detector in self._data_detector_list: + data_detector.push_data(iodump_data_dict_with_disk_name) + def __repr__(self): msg = f'disk: {self._disk_name}, ' for detector in self._detector_list: diff --git a/src/sentryPlugins/ai_block_io/extra_logger.py b/src/sentryPlugins/ai_block_io/extra_logger.py index 287f759..29ff9e9 100644 --- a/src/sentryPlugins/ai_block_io/extra_logger.py +++ b/src/sentryPlugins/ai_block_io/extra_logger.py @@ -57,7 +57,7 @@ def extra_latency_log(msg): if 'rq_driver' in iops_data_dict: iops_avg = sum(iops_data_dict['rq_driver']) / len(iops_data_dict['rq_driver']) - extra_logger.warning(f"[SLOW IO] alarm_type: latency, disk: {msg['driver_name']}, iotype: {io_type}, iops: {int(iops_avg)}") + extra_logger.warning(f"[SLOW IO] latency, disk:{msg['driver_name']}, iotype:{io_type}, iops:{int(iops_avg)}") # Define stage groups groups = { diff --git a/src/sentryPlugins/ai_block_io/io_data.py b/src/sentryPlugins/ai_block_io/io_data.py index eea45da..1f3061a 100644 --- a/src/sentryPlugins/ai_block_io/io_data.py +++ b/src/sentryPlugins/ai_block_io/io_data.py @@ -11,7 +11,7 @@ from dataclasses import dataclass, field from datetime import datetime -from typing import Optional +from typing import Optional, List @dataclass @@ -46,6 +46,33 @@ class IOData: time_stamp: float = field(default_factory=lambda: datetime.now().timestamp()) +@dataclass +class IoDumpListData: + iodump_data: List[str] = field(default_factory=list) + +@dataclass +class IOStageDumpData: + read: 
IoDumpListData = field(default_factory=lambda: IoDumpListData()) + write: IoDumpListData = field(default_factory=lambda: IoDumpListData()) + flush: IoDumpListData = field(default_factory=lambda: IoDumpListData()) + discard: IoDumpListData = field(default_factory=lambda: IoDumpListData()) + +@dataclass +class IODumpData: + throtl: IOStageDumpData = field(default_factory=lambda: IOStageDumpData()) + wbt: IOStageDumpData = field(default_factory=lambda: IOStageDumpData()) + gettag: IOStageDumpData = field(default_factory=lambda: IOStageDumpData()) + iocost: IOStageDumpData = field(default_factory=lambda: IOStageDumpData()) + plug: IOStageDumpData = field(default_factory=lambda: IOStageDumpData()) + bfq: IOStageDumpData = field(default_factory=lambda: IOStageDumpData()) + hctx: IOStageDumpData = field(default_factory=lambda: IOStageDumpData()) + requeue: IOStageDumpData = field(default_factory=lambda: IOStageDumpData()) + rq_driver: IOStageDumpData = field(default_factory=lambda: IOStageDumpData()) + bio: IOStageDumpData = field(default_factory=lambda: IOStageDumpData()) + deadline: IOStageDumpData = field(default_factory=lambda: IOStageDumpData()) + time_stamp: float = field(default_factory=lambda: datetime.now().timestamp()) + + @dataclass(frozen=True) class MetricName: disk_name: str diff --git a/src/sentryPlugins/ai_block_io/sliding_window.py b/src/sentryPlugins/ai_block_io/sliding_window.py index fe9b8c9..9a97de4 100644 --- a/src/sentryPlugins/ai_block_io/sliding_window.py +++ b/src/sentryPlugins/ai_block_io/sliding_window.py @@ -10,6 +10,7 @@ # See the Mulan PSL v2 for more details. from enum import Enum, unique +from typing import Any import numpy as np @@ -128,3 +129,20 @@ class SlidingWindowFactory: return MedianSlidingWindow(*args, **kwargs) else: return NotContinuousSlidingWindow(*args, **kwargs) + + +class DataWindow: + def __init__(self, window_size: int): + self._window_size = window_size + self._data_queue = [] + + def push(self, data: Any): + if len(self._data_queue) == self._window_size: + self._data_queue.pop(0) + self._data_queue.append(data) + + def window_data_to_string(self): + return ', '.join([str(data) for data in self._data_queue]) + + def __repr__(self): + return f"[SingleDataWindow, window size: {self._window_size}]" \ No newline at end of file diff --git a/src/sentryPlugins/ai_block_io/utils.py b/src/sentryPlugins/ai_block_io/utils.py index 7d2390b..8970b6f 100644 --- a/src/sentryPlugins/ai_block_io/utils.py +++ b/src/sentryPlugins/ai_block_io/utils.py @@ -15,7 +15,7 @@ from dataclasses import asdict from .threshold import ThresholdType from .sliding_window import SlidingWindowType -from .io_data import MetricName, IOData +from .io_data import MetricName, IOData, IODumpData def get_threshold_type_enum(algorithm_type: str): @@ -49,6 +49,16 @@ def get_metric_value_from_io_data_dict_by_metric_name( return None +def get_metric_value_from_iodump_data_dict(io_dump_data_dict: dict, metric_name: MetricName): + try: + io_dump_data: IODumpData = io_dump_data_dict[metric_name.disk_name] + io_stage_dump_data = asdict(io_dump_data)[metric_name.stage_name] + base_data = io_stage_dump_data[metric_name.io_access_type_name] + metric_value = base_data[metric_name.metric_name] + return metric_value + except KeyError: + return None + def get_data_queue_size_and_update_size( training_data_duration: float, train_update_duration: float, diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py index a7a9929..3abeba4 100644 --- 
a/src/sentryPlugins/avg_block_io/extra_logger.py +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -26,6 +26,7 @@ STAGE_GROUPS = { PATTERN = re.compile(r'(\w+):\s*\[([0-9.,]+)\]') + def init_extra_logger(log_path, log_level, log_format): global extra_logger try: @@ -165,7 +166,7 @@ def extra_latency_log(msg): def extra_iodump_log(msg): - extra_logger.warning(f"[SLOW IO] alarm_type: iodump, disk: {msg['driver_name']}, iotype: {msg['io_type']}") + extra_logger.warning(f"[SLOW IO] iodump, disk:{msg['driver_name']}, iotype:{msg['io_type']}") iodump_str = msg['details']['iodump_data'] try: diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 3726062..dcd8756 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -46,11 +46,6 @@ class CollectIo(): def __init__(self, module_config): io_config = module_config.get_io_config() - self.io_threshold_config = { - CONF_IO_NVME_SSD: CONF_IO_THRESHOLD_DEFAULT, - CONF_IO_SATA_SSD: CONF_IO_THRESHOLD_DEFAULT, - CONF_IO_SATA_HDD: CONF_IO_THRESHOLD_DEFAULT - } self.period_time = io_config['period_time'] self.max_save = io_config['max_save'] @@ -69,6 +64,13 @@ class CollectIo(): self.disk_list = disk_str.strip().split(',') self.stop_event = threading.Event() + self.iodump_pattern = re.compile( + r'(?P\w+)-(?P\d+)\s+' + r'\w+\s+' + r'stage\s+(?P\w+)\s+' + r'(?P[0-9a-fA-F]{16})\s+' + r'.*started\s+(?P\d+)\s+ns\s+ago' + ) IO_CONFIG_DATA.append(self.period_time) IO_CONFIG_DATA.append(self.max_save) @@ -203,13 +205,7 @@ class CollectIo(): io_dump_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/io_dump'.format(disk_name, stage) count = 0 io_dump_msg = [] - pattern = re.compile( - r'(?P\w+)-(?P\d+)\s+' - r'\w+\s+' - r'stage\s+(?P\w+)\s+' - r'(?P[0-9a-fA-F]{16})\s+' - r'.*started\s+(?P\d+)\s+ns\s+ago' - ) + pattern = self.iodump_pattern try: with open(io_dump_file, 'r') as file: -- Gitee From 71c52167498857375b4dca842c38a0507a64cc6f Mon Sep 17 00:00:00 2001 From: hewh Date: Wed, 15 Oct 2025 19:53:51 +0800 Subject: [PATCH 46/62] add test . 
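For context on the one-line change below: DataWindow is just a FIFO of per-period iodump records capped at window_size. Restated here with the accessor returning self._data_queue (the rename this patch reaches for and the next patch completes), plus an invented usage:

    class DataWindow:
        def __init__(self, window_size):
            self._window_size = window_size
            self._data_queue = []

        def push(self, data):
            if len(self._data_queue) == self._window_size:
                self._data_queue.pop(0)      # drop the oldest period
            self._data_queue.append(data)

        def get_data(self):
            return self._data_queue

    w = DataWindow(3)
    for period in (["a"], [], ["b", "c"], ["d"]):
        w.push(period)
    print(w.get_data())   # [[], ['b', 'c'], ['d']]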
--- src/sentryPlugins/ai_block_io/sliding_window.py | 4 ++-- src/sentryPlugins/ai_block_io/utils.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/sliding_window.py b/src/sentryPlugins/ai_block_io/sliding_window.py index 9a97de4..cbd0139 100644 --- a/src/sentryPlugins/ai_block_io/sliding_window.py +++ b/src/sentryPlugins/ai_block_io/sliding_window.py @@ -141,8 +141,8 @@ class DataWindow: self._data_queue.pop(0) self._data_queue.append(data) - def window_data_to_string(self): - return ', '.join([str(data) for data in self._data_queue]) + def get_data(self): + return self._io_data_queue def __repr__(self): return f"[SingleDataWindow, window size: {self._window_size}]" \ No newline at end of file diff --git a/src/sentryPlugins/ai_block_io/utils.py b/src/sentryPlugins/ai_block_io/utils.py index 8970b6f..ce3eea3 100644 --- a/src/sentryPlugins/ai_block_io/utils.py +++ b/src/sentryPlugins/ai_block_io/utils.py @@ -52,8 +52,8 @@ def get_metric_value_from_io_data_dict_by_metric_name( def get_metric_value_from_iodump_data_dict(io_dump_data_dict: dict, metric_name: MetricName): try: io_dump_data: IODumpData = io_dump_data_dict[metric_name.disk_name] - io_stage_dump_data = asdict(io_dump_data)[metric_name.stage_name] - base_data = io_stage_dump_data[metric_name.io_access_type_name] + io_dump_stage_data = asdict(io_dump_data)[metric_name.stage_name] + base_data = io_dump_stage_data[metric_name.io_access_type_name] metric_value = base_data[metric_name.metric_name] return metric_value except KeyError: -- Gitee From 3c897fcbff6fb97d7b2546d053160129d0f3ee4f Mon Sep 17 00:00:00 2001 From: hewh Date: Wed, 15 Oct 2025 19:55:26 +0800 Subject: [PATCH 47/62] add test . --- src/sentryPlugins/ai_block_io/sliding_window.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentryPlugins/ai_block_io/sliding_window.py b/src/sentryPlugins/ai_block_io/sliding_window.py index cbd0139..303142b 100644 --- a/src/sentryPlugins/ai_block_io/sliding_window.py +++ b/src/sentryPlugins/ai_block_io/sliding_window.py @@ -142,7 +142,7 @@ class DataWindow: self._data_queue.append(data) def get_data(self): - return self._io_data_queue + return self._data_queue def __repr__(self): return f"[SingleDataWindow, window size: {self._window_size}]" \ No newline at end of file -- Gitee From 112e5f7abb161dbe0fcdb2d6e12a77ce1c969dc3 Mon Sep 17 00:00:00 2001 From: hewh Date: Wed, 15 Oct 2025 20:26:59 +0800 Subject: [PATCH 48/62] add test . 
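A sketch, with invented values, of the per-disk structure the filtered iodump_data detectors now assemble in get_data_detector_list_window(), which is what gets attached to details["iodump_data"] when an alarm fires: one capped window of per-period record lists for each (io type, stage) pair.

    iodump_data_wins = {
        "read": {
            "bio":       [["fio,4321,bio,ffff8881000a2e00,1800"], []],
            "rq_driver": [[], []],
        },
        "write": {
            "bio":       [[], []],
        },
    }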
--- src/sentryPlugins/ai_block_io/ai_block_io.py | 1 + src/sentryPlugins/ai_block_io/detector.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/sentryPlugins/ai_block_io/ai_block_io.py b/src/sentryPlugins/ai_block_io/ai_block_io.py index e26a880..4e7151b 100644 --- a/src/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/sentryPlugins/ai_block_io/ai_block_io.py @@ -212,6 +212,7 @@ class SlowIODetection: disk_detector.push_data_to_data_detectors(iodump_data_dict_with_disk_name) result = disk_detector.is_slow_io_event(io_data_dict_with_disk_name) if result[0]: + # 产生告警时获取iodump的详细数据 result[6]["iodump_data"] = disk_detector.get_data_detector_list_window() slow_io_event_list.append(result) diff --git a/src/sentryPlugins/ai_block_io/detector.py b/src/sentryPlugins/ai_block_io/detector.py index 57dfb68..f2f15dd 100644 --- a/src/sentryPlugins/ai_block_io/detector.py +++ b/src/sentryPlugins/ai_block_io/detector.py @@ -80,6 +80,7 @@ class DataDetector: self._metric_name = metric_name self._data_window = data_window + @property def metric_name(self): return self._metric_name @@ -142,7 +143,8 @@ class DiskDetector: def get_data_detector_list_window(self): iodump_data_wins = {"read": {}, "write": {}} for data_detector in self._data_detector_list: - iodump_data_wins[data_detector.metric_name.io_access_type_name][data_detector.metric_name.stage_name] = data_detector.get_data_window_data() + if data_detector.metric_name.metric_name == 'iodump_data': + iodump_data_wins[data_detector.metric_name.io_access_type_name][data_detector.metric_name.stage_name] = data_detector.get_data_window_data() return iodump_data_wins def is_slow_io_event(self, io_data_dict_with_disk_name: dict): @@ -198,4 +200,6 @@ class DiskDetector: msg = f'disk: {self._disk_name}, ' for detector in self._detector_list: msg += f'\n detector: [{detector}]' + for data_detector in self._data_detector_list: + msg += f'\n data_detector: [{data_detector}]' return msg -- Gitee From 7bdee1b595790f65df21518bb5b8cb21ab93b46d Mon Sep 17 00:00:00 2001 From: hewh Date: Thu, 16 Oct 2025 09:17:27 +0800 Subject: [PATCH 49/62] add test . --- src/sentryPlugins/ai_block_io/data_access.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentryPlugins/ai_block_io/data_access.py b/src/sentryPlugins/ai_block_io/data_access.py index 9be6b49..7d1c347 100644 --- a/src/sentryPlugins/ai_block_io/data_access.py +++ b/src/sentryPlugins/ai_block_io/data_access.py @@ -141,7 +141,7 @@ def _get_iodump_stage_data(data): io_stage_data = IOStageDumpData() for data_type in ("read", "write", "flush", "discard"): if data_type in data: - getattr(io_stage_data, data_type).strings = data[data_type] + getattr(io_stage_data, data_type).iodump_data = data[data_type] return io_stage_data def get_iodump_data_from_collect_plug(period, disk_list): -- Gitee From 47f6e1c5f38be7b5cbdce492e4c896de57360af6 Mon Sep 17 00:00:00 2001 From: hewh Date: Thu, 16 Oct 2025 11:58:41 +0800 Subject: [PATCH 50/62] add test . 
--- src/sentryPlugins/ai_block_io/ai_block_io.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/ai_block_io.py b/src/sentryPlugins/ai_block_io/ai_block_io.py index 4e7151b..1e481a1 100644 --- a/src/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/sentryPlugins/ai_block_io/ai_block_io.py @@ -230,21 +230,18 @@ class SlowIODetection: "alarm_type": slow_io_event[5], "details": slow_io_event[6] } - tmp_alarm_content = alarm_content.copy() - del tmp_alarm_content["details"]["iodump_data"] # 极端场景下iodump_data可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 - Xalarm.major(tmp_alarm_content) - del tmp_alarm_content["details"] - logging.warning("[SLOW IO] " + str(tmp_alarm_content)) - logging.warning(f'[SLOW IO] disk: {str(tmp_alarm_content.get("driver_name"))}, ' - f'stage: {str(tmp_alarm_content.get("block_stack"))}, ' - f'iotype: {str(tmp_alarm_content.get("io_type"))}, ' - f'type: {str(tmp_alarm_content.get("alarm_type"))}, ' - f'reason: {str(tmp_alarm_content.get("reason"))}') + extra_slow_log(alarm_content) + del alarm_content["details"]["iodump_data"] # 极端场景下iodump_data可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 + Xalarm.major(alarm_content) + logging.warning(f'[SLOW IO] disk: {str(alarm_content.get("driver_name"))}, ' + f'stage: {str(alarm_content.get("block_stack"))}, ' + f'iotype: {str(alarm_content.get("io_type"))}, ' + f'type: {str(alarm_content.get("alarm_type"))}, ' + f'reason: {str(alarm_content.get("reason"))}') logging.warning(f"latency: " + str(alarm_content.get("details").get("latency"))) logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump"))) logging.warning(f"iops: " + str(alarm_content.get("details").get("iops"))) logging.warning(f"iodump_data: " + str(alarm_content.get("details").get("iodump_data"))) - extra_slow_log(alarm_content) # Step4:等待检测时间 logging.debug("step4. Wait to start next slow io event detection loop.") -- Gitee From 811039e568a638a8a982e8df21ad5fd5ce00bb3d Mon Sep 17 00:00:00 2001 From: hewh Date: Thu, 16 Oct 2025 12:01:47 +0800 Subject: [PATCH 51/62] add test . --- src/sentryPlugins/ai_block_io/ai_block_io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/ai_block_io.py b/src/sentryPlugins/ai_block_io/ai_block_io.py index 1e481a1..0ddba5a 100644 --- a/src/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/sentryPlugins/ai_block_io/ai_block_io.py @@ -230,9 +230,6 @@ class SlowIODetection: "alarm_type": slow_io_event[5], "details": slow_io_event[6] } - extra_slow_log(alarm_content) - del alarm_content["details"]["iodump_data"] # 极端场景下iodump_data可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 - Xalarm.major(alarm_content) logging.warning(f'[SLOW IO] disk: {str(alarm_content.get("driver_name"))}, ' f'stage: {str(alarm_content.get("block_stack"))}, ' f'iotype: {str(alarm_content.get("io_type"))}, ' @@ -242,6 +239,9 @@ class SlowIODetection: logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump"))) logging.warning(f"iops: " + str(alarm_content.get("details").get("iops"))) logging.warning(f"iodump_data: " + str(alarm_content.get("details").get("iodump_data"))) + extra_slow_log(alarm_content) + del alarm_content["details"]["iodump_data"] # 极端场景下iodump_data可能过大,导致发送失败,所以只在日志中打印,不发送到告警模块 + Xalarm.major(alarm_content) # Step4:等待检测时间 logging.debug("step4. 
Wait to start next slow io event detection loop.") -- Gitee From 91b216b309f62ed1d648fc8637cffc7349f8e331 Mon Sep 17 00:00:00 2001 From: hewh Date: Thu, 16 Oct 2025 14:34:02 +0800 Subject: [PATCH 52/62] add test . --- src/sentryPlugins/ai_block_io/extra_logger.py | 60 +++++++++++++++---- .../avg_block_io/extra_logger.py | 1 + 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/extra_logger.py b/src/sentryPlugins/ai_block_io/extra_logger.py index 29ff9e9..fff9915 100644 --- a/src/sentryPlugins/ai_block_io/extra_logger.py +++ b/src/sentryPlugins/ai_block_io/extra_logger.py @@ -15,6 +15,15 @@ import re extra_logger = None +# Define stage groups +STAGE_GROUPS = { + 'B->Q': ['throtl', 'wbt', 'iocost'], + 'Q->G': ['gettag'], + 'G->I': ['plug'], + 'I->D': ['deadline', 'bfq', 'hctx', 'requeue'], + 'D->C': ['rq_driver'] +} + def init_extra_logger(log_path, log_level, log_format): global extra_logger try: @@ -44,6 +53,7 @@ def extra_slow_log(msg): extra_latency_log(msg) return if "iodump" in str(msg.get('alarm_type', '')): + extra_iodump_log(msg) return @@ -59,19 +69,10 @@ def extra_latency_log(msg): extra_logger.warning(f"[SLOW IO] latency, disk:{msg['driver_name']}, iotype:{io_type}, iops:{int(iops_avg)}") - # Define stage groups - groups = { - 'B->Q': ['throtl', 'wbt', 'iocost'], - 'Q->G': ['gettag'], - 'G->I': ['plug'], - 'I->D': ['deadline', 'bfq', 'hctx', 'requeue'], - 'D->C': ['rq_driver'] - } - # Calculate statistics for each group latency_data_dict = msg['details']['latency'].get(io_type, {}) group_stats = {} - for group_name, stages in groups.items(): + for group_name, stages in STAGE_GROUPS.items(): all_values = [] for stage in stages: if stage in latency_data_dict: @@ -98,7 +99,7 @@ def extra_latency_log(msg): total_avg = 0 total_min = 0 total_max = 0 - for group_name in groups: + for group_name in STAGE_GROUPS: total_avg += group_stats[group_name]['avg'] total_min += group_stats[group_name]['min'] total_max += group_stats[group_name]['max'] @@ -109,7 +110,7 @@ def extra_latency_log(msg): } # Calculate PCT for each group (except B->C) - for group_name in groups: + for group_name in STAGE_GROUPS: if total_avg > 0: pct = (group_stats[group_name]['avg'] / total_avg) * 100 else: @@ -148,3 +149,38 @@ def extra_latency_log(msg): ) except KeyError: return + + +def extra_iodump_log(msg): + io_types = [iot.strip() for iot in re.split(r',+', msg['io_type'])] + + for io_type in io_types: + extra_logger.warning(f"[SLOW IO] iodump, disk:{msg['driver_name']}, iotype:{io_type}") + iodump_data = msg['details']['iodump_data'] + + try: + bio_data = iodump_data['bio'].get(io_type, {}) + except Exception as e: + extra_logger.error(f"Failed to parse iodump data: {e}") + return + + stack_to_stage = {} + for stage, stacks in STAGE_GROUPS.items(): + for stack in stacks: + stack_to_stage[stack] = stage + + last_bio_record = {} + for window in bio_data: + for entry in window: + parts = entry.split(',') + task_name, pid, io_stack, bio_ptr, start_ago = parts + stage = stack_to_stage.get(io_stack, 'Unknown') + last_bio_record[bio_ptr] = (task_name, pid, io_stack, stage, bio_ptr, start_ago) + + header = f"{'TASK_NAME':<10} {'PID':>8} {'IO_STACK':<12} {'STAGE':<8} {'BIO_PTR':<20} {'START_AGO(ms)':>10}" + extra_logger.warning(header) + + for bio_ptr in last_bio_record: + task_name, pid, io_stack, stage, bio_ptr, start_ago = last_bio_record[bio_ptr] + line = f"{task_name:<10} {pid:>8} {io_stack:<12} {stage:<8} {bio_ptr:<20} {start_ago:>10}" + extra_logger.warning(line) \ No 
newline at end of file diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py index 3abeba4..14a3927 100644 --- a/src/sentryPlugins/avg_block_io/extra_logger.py +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -16,6 +16,7 @@ import ast extra_logger = None +# Define stage groups STAGE_GROUPS = { 'B->Q': ['throtl', 'wbt', 'iocost'], 'Q->G': ['gettag'], -- Gitee From 6756b9e7a666bee353685ec41e10fef413155bbf Mon Sep 17 00:00:00 2001 From: hewh Date: Thu, 16 Oct 2025 14:48:22 +0800 Subject: [PATCH 53/62] add test . --- src/sentryPlugins/ai_block_io/ai_block_io.py | 1 - src/sentryPlugins/ai_block_io/extra_logger.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/ai_block_io.py b/src/sentryPlugins/ai_block_io/ai_block_io.py index 0ddba5a..5fcee22 100644 --- a/src/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/sentryPlugins/ai_block_io/ai_block_io.py @@ -238,7 +238,6 @@ class SlowIODetection: logging.warning(f"latency: " + str(alarm_content.get("details").get("latency"))) logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump"))) logging.warning(f"iops: " + str(alarm_content.get("details").get("iops"))) - logging.warning(f"iodump_data: " + str(alarm_content.get("details").get("iodump_data"))) extra_slow_log(alarm_content) del alarm_content["details"]["iodump_data"] # in extreme cases iodump_data can be too large and cause the send to fail, so it is only printed in the log and not sent to the alarm module Xalarm.major(alarm_content) diff --git a/src/sentryPlugins/ai_block_io/extra_logger.py b/src/sentryPlugins/ai_block_io/extra_logger.py index fff9915..f94549d 100644 --- a/src/sentryPlugins/ai_block_io/extra_logger.py +++ b/src/sentryPlugins/ai_block_io/extra_logger.py @@ -52,7 +52,7 @@ def extra_slow_log(msg): if "latency" in str(msg.get('alarm_type', '')): extra_latency_log(msg) return - if "iodump" in str(msg.get('alarm_type', '')): + if "io_dump" in str(msg.get('alarm_type', '')): extra_iodump_log(msg) return @@ -156,10 +156,10 @@ def extra_iodump_log(msg): for io_type in io_types: extra_logger.warning(f"[SLOW IO] iodump, disk:{msg['driver_name']}, iotype:{io_type}") - iodump_data = msg['details']['iodump_data'] + iodump_data = msg['details']['iodump_data'].get(io_type, {}) try: - bio_data = iodump_data['bio'].get(io_type, {}) + bio_data = iodump_data['bio'] except Exception as e: extra_logger.error(f"Failed to parse iodump data: {e}") return -- Gitee From dbff37f67b945345ae58b38c898f256890353df1 Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 17 Oct 2025 10:03:37 +0800 Subject: [PATCH 54/62] add test .
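A style-only pass over the iodump-related modules: __repr__ moves up next to the constructor, the blank lines PEP 8 expects between top-level definitions are added, over-long statements are wrapped, the relative import in module_conn.py moves after the absolute imports, and update_io_threshold / get_io_common are marked @staticmethod. The class layout convention is roughly the one sketched below; ExampleDetector is a made-up class for illustration, not part of the plugin.

    class ExampleDetector:
        def __init__(self, name):
            self._name = name

        def __repr__(self):
            # dunder methods sit directly under the constructor
            return f"ExampleDetector(name={self._name})"

        @property
        def name(self):
            # read-only accessors come next
            return self._name

        def push(self, value):
            # behaviour methods follow after the accessor block
            return value

    print(repr(ExampleDetector("sda")))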
--- src/sentryPlugins/ai_block_io/data_access.py | 2 ++ src/sentryPlugins/ai_block_io/detector.py | 31 ++++++++++--------- src/sentryPlugins/ai_block_io/extra_logger.py | 1 + src/sentryPlugins/ai_block_io/io_data.py | 2 ++ .../ai_block_io/sliding_window.py | 8 ++--- .../avg_block_io/avg_block_io.py | 3 +- .../avg_block_io/extra_logger.py | 3 +- src/sentryPlugins/avg_block_io/module_conn.py | 2 +- src/services/sentryCollector/collect_io.py | 4 ++- .../sentryCollector/collect_server.py | 4 ++- 10 files changed, 37 insertions(+), 23 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/data_access.py b/src/sentryPlugins/ai_block_io/data_access.py index 7d1c347..f1c2bc2 100644 --- a/src/sentryPlugins/ai_block_io/data_access.py +++ b/src/sentryPlugins/ai_block_io/data_access.py @@ -137,6 +137,7 @@ def _get_raw_iodump_data(period, disk_list): ["read", "write", "flush", "discard"], ) + def _get_iodump_stage_data(data): io_stage_data = IOStageDumpData() for data_type in ("read", "write", "flush", "discard"): @@ -144,6 +145,7 @@ def _get_iodump_stage_data(data): getattr(io_stage_data, data_type).iodump_data = data[data_type] return io_stage_data + def get_iodump_data_from_collect_plug(period, disk_list): data_raw = _get_raw_iodump_data(period, disk_list) if data_raw["ret"] == 0: diff --git a/src/sentryPlugins/ai_block_io/detector.py b/src/sentryPlugins/ai_block_io/detector.py index f2f15dd..6c0a03f 100644 --- a/src/sentryPlugins/ai_block_io/detector.py +++ b/src/sentryPlugins/ai_block_io/detector.py @@ -80,6 +80,11 @@ class DataDetector: self._metric_name = metric_name self._data_window = data_window + def __repr__(self): + return (f'disk_name: {self._metric_name.disk_name}, stage_name: {self._metric_name.stage_name},' + f' io_type_name: {self._metric_name.io_access_type_name},' + f' metric_name: {self._metric_name.metric_name}') + @property def metric_name(self): return self._metric_name @@ -97,10 +102,6 @@ class DataDetector: self._data_window.push(metric_value) return True - def __repr__(self): - return (f'disk_name: {self._metric_name.disk_name}, stage_name: {self._metric_name.stage_name},' - f' io_type_name: {self._metric_name.io_access_type_name},' - f' metric_name: {self._metric_name.metric_name}') def set_to_str(parameter: set): ret = "" @@ -121,6 +122,14 @@ class DiskDetector: self._detector_list = [] self._data_detector_list = [] + def __repr__(self): + msg = f'disk: {self._disk_name}, ' + for detector in self._detector_list: + msg += f'\n detector: [{detector}]' + for data_detector in self._data_detector_list: + msg += f'\n data_detector: [{data_detector}]' + return msg + def add_detector(self, detector: Detector): self._detector_list.append(detector) @@ -137,14 +146,16 @@ class DiskDetector: elif detector.metric_name.metric_name == 'io_dump': iodump_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data() elif detector.metric_name.metric_name == 'iops': - iops_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data() + iops_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] =\ + detector.get_sliding_window_data() return latency_wins, iodump_wins, iops_wins def get_data_detector_list_window(self): iodump_data_wins = {"read": {}, "write": {}} for data_detector in self._data_detector_list: if data_detector.metric_name.metric_name == 'iodump_data': - 
iodump_data_wins[data_detector.metric_name.io_access_type_name][data_detector.metric_name.stage_name] = data_detector.get_data_window_data() + iodump_data_wins[data_detector.metric_name.io_access_type_name][data_detector.metric_name.stage_name] =\ + data_detector.get_data_window_data() return iodump_data_wins def is_slow_io_event(self, io_data_dict_with_disk_name: dict): @@ -195,11 +206,3 @@ class DiskDetector: def push_data_to_data_detectors(self, iodump_data_dict_with_disk_name: dict): for data_detector in self._data_detector_list: data_detector.push_data(iodump_data_dict_with_disk_name) - - def __repr__(self): - msg = f'disk: {self._disk_name}, ' - for detector in self._detector_list: - msg += f'\n detector: [{detector}]' - for data_detector in self._data_detector_list: - msg += f'\n data_detector: [{data_detector}]' - return msg diff --git a/src/sentryPlugins/ai_block_io/extra_logger.py b/src/sentryPlugins/ai_block_io/extra_logger.py index f94549d..dda0680 100644 --- a/src/sentryPlugins/ai_block_io/extra_logger.py +++ b/src/sentryPlugins/ai_block_io/extra_logger.py @@ -24,6 +24,7 @@ STAGE_GROUPS = { 'D->C': ['rq_driver'] } + def init_extra_logger(log_path, log_level, log_format): global extra_logger try: diff --git a/src/sentryPlugins/ai_block_io/io_data.py b/src/sentryPlugins/ai_block_io/io_data.py index 1f3061a..023e7b1 100644 --- a/src/sentryPlugins/ai_block_io/io_data.py +++ b/src/sentryPlugins/ai_block_io/io_data.py @@ -50,6 +50,7 @@ class IOData: class IoDumpListData: iodump_data: List[str] = field(default_factory=list) + @dataclass class IOStageDumpData: read: IoDumpListData = field(default_factory=lambda: IoDumpListData()) @@ -57,6 +58,7 @@ class IOStageDumpData: flush: IoDumpListData = field(default_factory=lambda: IoDumpListData()) discard: IoDumpListData = field(default_factory=lambda: IoDumpListData()) + @dataclass class IODumpData: throtl: IOStageDumpData = field(default_factory=lambda: IOStageDumpData()) diff --git a/src/sentryPlugins/ai_block_io/sliding_window.py b/src/sentryPlugins/ai_block_io/sliding_window.py index 303142b..6881baa 100644 --- a/src/sentryPlugins/ai_block_io/sliding_window.py +++ b/src/sentryPlugins/ai_block_io/sliding_window.py @@ -136,13 +136,13 @@ class DataWindow: self._window_size = window_size self._data_queue = [] + def __repr__(self): + return f"[SingleDataWindow, window size: {self._window_size}]" + def push(self, data: Any): if len(self._data_queue) == self._window_size: self._data_queue.pop(0) self._data_queue.append(data) def get_data(self): - return self._data_queue - - def __repr__(self): - return f"[SingleDataWindow, window size: {self._window_size}]" \ No newline at end of file + return self._data_queue \ No newline at end of file diff --git a/src/sentryPlugins/avg_block_io/avg_block_io.py b/src/sentryPlugins/avg_block_io/avg_block_io.py index df86138..fcf6be7 100644 --- a/src/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/sentryPlugins/avg_block_io/avg_block_io.py @@ -58,7 +58,8 @@ def init_io_win(io_dic, config, common_param): logging.debug("Successfully create {}-{}-{}-latency window".format(disk_name, stage_name, rw)) if iodump_lim_value is not None: - io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold_iodump"], abnormal_time=iodump_lim_value) + io_data[disk_name][stage_name][rw]["iodump"] =\ + IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold_iodump"], abnormal_time=iodump_lim_value) logging.debug("Successfully 
create {}-{}-{}-iodump window".format(disk_name, stage_name, rw)) io_data[disk_name][stage_name][rw]["iops"] = IopsWindow(window_size=io_dic["win_size"]) diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py index 14a3927..d6625c3 100644 --- a/src/sentryPlugins/avg_block_io/extra_logger.py +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -74,7 +74,8 @@ def extra_latency_log(msg): if 'rq_driver' in iops_data and iops_data['rq_driver']: iops_avg = sum(iops_data['rq_driver']) / len(iops_data['rq_driver']) - extra_logger.warning(f"[SLOW IO] alarm_type: latency, disk: {msg['driver_name']}, iotype: {msg['io_type']}, iops: {int(iops_avg)}") + extra_logger.warning(f"[SLOW IO] alarm_type: latency, disk: {msg['driver_name']}, " + f"iotype: {msg['io_type']}, iops: {int(iops_avg)}") # Parse the latency string from msg latency_str = msg['details']['latency'] diff --git a/src/sentryPlugins/avg_block_io/module_conn.py b/src/sentryPlugins/avg_block_io/module_conn.py index 5e4a3d0..7bb0c93 100644 --- a/src/sentryPlugins/avg_block_io/module_conn.py +++ b/src/sentryPlugins/avg_block_io/module_conn.py @@ -12,10 +12,10 @@ import json import logging import sys -from .utils import is_abnormal, get_win_data, log_slow_win from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, get_iodump_data, Result_Messages, get_disk_type, Disk_Type from syssentry.result import ResultLevel, report_result from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR +from .utils import is_abnormal, get_win_data, log_slow_win TASK_NAME = "avg_block_io" diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index dcd8756..559a187 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -166,6 +166,7 @@ class CollectIo(): else: return round(value, 1) + @staticmethod def update_io_threshold(self, disk_name, stage_list): temp_config = CollectConfig() temp_io_threshold = temp_config.get_io_threshold() @@ -197,7 +198,8 @@ class CollectIo(): try: with open(io_threshold_file, 'w') as file: file.write(config_threshold) - logging.info("update %s io_dump_threshold from %s to %s", io_threshold_file, current_threshold, config_threshold) + logging.info("update %s io_dump_threshold from %s to %s", + io_threshold_file, current_threshold, config_threshold) except Exception as e: logging.error("An error occurred while writing: %s", e) diff --git a/src/services/sentryCollector/collect_server.py b/src/services/sentryCollector/collect_server.py index 5ed1e65..1b58f9f 100644 --- a/src/services/sentryCollector/collect_server.py +++ b/src/services/sentryCollector/collect_server.py @@ -92,6 +92,7 @@ class CollectServer(): return json.dumps(result_rev) + @staticmethod def get_io_common(self, data_struct, data_source): result_rev = {} @@ -107,7 +108,8 @@ class CollectServer(): iotype_list = json.loads(data_struct['iotype']) if (period < period_time) or (period > period_time * max_save) or (period % period_time): - logging.error("get_io_common: period time is invalid, user period: %d, config period_time: %d", period, period_time) + logging.error("get_io_common: period time is invalid, user period: %d, config period_time: %d", + period, period_time) return json.dumps(result_rev) collect_index = period // period_time - 1 -- Gitee From 8cb534a2f49e8007a8d6d864ccb3207476dd134b Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 17 Oct 2025 10:51:51 +0800 Subject: [PATCH 55/62] 
add test . --- .../ai_block_io/config_parser.py | 28 +++---- src/sentryPlugins/ai_block_io/utils.py | 3 +- .../avg_block_io/avg_block_io.py | 3 +- src/services/sentryCollector/collect_io.py | 74 +++++++++---------- .../sentryCollector/collect_server.py | 66 ++++++++--------- 5 files changed, 88 insertions(+), 86 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/config_parser.py b/src/sentryPlugins/ai_block_io/config_parser.py index 6f93580..af00702 100644 --- a/src/sentryPlugins/ai_block_io/config_parser.py +++ b/src/sentryPlugins/ai_block_io/config_parser.py @@ -439,19 +439,6 @@ class ConfigParser: ) ) - def _read_window_threshold_iodump(self, items_sliding_window: dict): - default_window_threshold_iodump = self.DEFAULT_CONF["algorithm"]["win_threshold_iodump"] - self._conf["algorithm"]["win_threshold_iodump"] = ( - self._get_config_value( - items_sliding_window, - "win_threshold_iodump", - int, - default_window_threshold_iodump, - gt=0, - le=self._conf["algorithm"]["win_size"], - ) - ) - def read_config_from_file(self): if not os.path.exists(self._config_file_name): init_log_format(self._conf["log"]["level"]) @@ -786,4 +773,17 @@ class ConfigParser: @property def write_iodump_lim(self): - return self._conf["iodump"]["write_iodump_lim"] \ No newline at end of file + return self._conf["iodump"]["write_iodump_lim"] + + def _read_window_threshold_iodump(self, items_sliding_window: dict): + default_window_threshold_iodump = self.DEFAULT_CONF["algorithm"]["win_threshold_iodump"] + self._conf["algorithm"]["win_threshold_iodump"] = ( + self._get_config_value( + items_sliding_window, + "win_threshold_iodump", + int, + default_window_threshold_iodump, + gt=0, + le=self._conf["algorithm"]["win_size"], + ) + ) \ No newline at end of file diff --git a/src/sentryPlugins/ai_block_io/utils.py b/src/sentryPlugins/ai_block_io/utils.py index ce3eea3..919cf9b 100644 --- a/src/sentryPlugins/ai_block_io/utils.py +++ b/src/sentryPlugins/ai_block_io/utils.py @@ -58,7 +58,8 @@ def get_metric_value_from_iodump_data_dict(io_dump_data_dict: dict, metric_name: return metric_value except KeyError: return None - + + def get_data_queue_size_and_update_size( training_data_duration: float, train_update_duration: float, diff --git a/src/sentryPlugins/avg_block_io/avg_block_io.py b/src/sentryPlugins/avg_block_io/avg_block_io.py index fcf6be7..302d8e0 100644 --- a/src/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/sentryPlugins/avg_block_io/avg_block_io.py @@ -59,7 +59,8 @@ def init_io_win(io_dic, config, common_param): if iodump_lim_value is not None: io_data[disk_name][stage_name][rw]["iodump"] =\ - IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold_iodump"], abnormal_time=iodump_lim_value) + IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold_iodump"],\ + abnormal_time=iodump_lim_value) logging.debug("Successfully create {}-{}-{}-iodump window".format(disk_name, stage_name, rw)) io_data[disk_name][stage_name][rw]["iops"] = IopsWindow(window_size=io_dic["win_size"]) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 559a187..d33d883 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -75,6 +75,43 @@ class CollectIo(): IO_CONFIG_DATA.append(self.period_time) IO_CONFIG_DATA.append(self.max_save) + @staticmethod + def update_io_threshold(self, disk_name, stage_list): + temp_config = CollectConfig() + temp_io_threshold = 
temp_config.get_io_threshold() + disk_type_result = get_disk_type(disk_name) + if disk_type_result["ret"] == 0 and disk_type_result["message"] in ('0', '1', '2'): + disk_type = int(disk_type_result["message"]) + for stage in stage_list: + io_threshold_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/threshold'.format(disk_name, stage) + try: + with open(io_threshold_file, 'r') as file: + current_threshold = file.read().strip() + except FileNotFoundError: + logging.error("The file %s does not exist.", io_threshold_file) + continue + except Exception as e: + logging.error("An error occurred while reading: %s", e) + continue + + if disk_type == DiskType.TYPE_NVME_SSD: + config_threshold = str(temp_io_threshold[CONF_IO_NVME_SSD]) + elif disk_type == DiskType.TYPE_SATA_SSD: + config_threshold = str(temp_io_threshold[CONF_IO_SATA_SSD]) + elif disk_type == DiskType.TYPE_SATA_HDD: + config_threshold = str(temp_io_threshold[CONF_IO_SATA_HDD]) + else: + continue + + if current_threshold != config_threshold: + try: + with open(io_threshold_file, 'w') as file: + file.write(config_threshold) + logging.info("update %s io_dump_threshold from %s to %s", + io_threshold_file, current_threshold, config_threshold) + except Exception as e: + logging.error("An error occurred while writing: %s", e) + def get_blk_io_hierarchy(self, disk_name, stage_list): stats_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/stats'.format(disk_name) try: @@ -166,43 +203,6 @@ class CollectIo(): else: return round(value, 1) - @staticmethod - def update_io_threshold(self, disk_name, stage_list): - temp_config = CollectConfig() - temp_io_threshold = temp_config.get_io_threshold() - disk_type_result = get_disk_type(disk_name) - if disk_type_result["ret"] == 0 and disk_type_result["message"] in ('0', '1', '2'): - disk_type = int(disk_type_result["message"]) - for stage in stage_list: - io_threshold_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/threshold'.format(disk_name, stage) - try: - with open(io_threshold_file, 'r') as file: - current_threshold = file.read().strip() - except FileNotFoundError: - logging.error("The file %s does not exist.", io_threshold_file) - continue - except Exception as e: - logging.error("An error occurred while reading: %s", e) - continue - - if disk_type == DiskType.TYPE_NVME_SSD: - config_threshold = str(temp_io_threshold[CONF_IO_NVME_SSD]) - elif disk_type == DiskType.TYPE_SATA_SSD: - config_threshold = str(temp_io_threshold[CONF_IO_SATA_SSD]) - elif disk_type == DiskType.TYPE_SATA_HDD: - config_threshold = str(temp_io_threshold[CONF_IO_SATA_HDD]) - else: - continue - - if current_threshold != config_threshold: - try: - with open(io_threshold_file, 'w') as file: - file.write(config_threshold) - logging.info("update %s io_dump_threshold from %s to %s", - io_threshold_file, current_threshold, config_threshold) - except Exception as e: - logging.error("An error occurred while writing: %s", e) - def get_io_dump(self, disk_name, stage, category): io_dump_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/io_dump'.format(disk_name, stage) count = 0 diff --git a/src/services/sentryCollector/collect_server.py b/src/services/sentryCollector/collect_server.py index 1b58f9f..0fd5ba2 100644 --- a/src/services/sentryCollector/collect_server.py +++ b/src/services/sentryCollector/collect_server.py @@ -59,39 +59,6 @@ class CollectServer(): self.stop_event = threading.Event() - def is_iocollect_valid(self, data_struct): - - result_rev = {} - self.io_global_data = IO_GLOBAL_DATA - - if 
len(IO_CONFIG_DATA) == 0: - logging.error("the collect thread is not started, the data is invalid.") - return json.dumps(result_rev) - - period_time = IO_CONFIG_DATA[0] - max_save = IO_CONFIG_DATA[1] - - disk_list = json.loads(data_struct['disk_list']) - period = int(data_struct['period']) - stage_list = json.loads(data_struct['stage']) - - if (period < period_time) or (period > period_time * max_save) or (period % period_time): - logging.error("is_iocollect_valid: period time is invalid, user period: %d, config period_time: %d", period, period_time) - return json.dumps(result_rev) - - for disk_name, stage_info in self.io_global_data.items(): - if len(disk_list) > 0 and disk_name not in disk_list: - continue - result_rev[disk_name] = [] - if len(stage_list) == 0: - result_rev[disk_name] = list(stage_info.keys()) - continue - for stage_name, stage_data in stage_info.items(): - if stage_name in stage_list: - result_rev[disk_name].append(stage_name) - - return json.dumps(result_rev) - @staticmethod def get_io_common(self, data_struct, data_source): result_rev = {} @@ -132,6 +99,39 @@ class CollectServer(): return json.dumps(result_rev) + def is_iocollect_valid(self, data_struct): + + result_rev = {} + self.io_global_data = IO_GLOBAL_DATA + + if len(IO_CONFIG_DATA) == 0: + logging.error("the collect thread is not started, the data is invalid.") + return json.dumps(result_rev) + + period_time = IO_CONFIG_DATA[0] + max_save = IO_CONFIG_DATA[1] + + disk_list = json.loads(data_struct['disk_list']) + period = int(data_struct['period']) + stage_list = json.loads(data_struct['stage']) + + if (period < period_time) or (period > period_time * max_save) or (period % period_time): + logging.error("is_iocollect_valid: period time is invalid, user period: %d, config period_time: %d", period, period_time) + return json.dumps(result_rev) + + for disk_name, stage_info in self.io_global_data.items(): + if len(disk_list) > 0 and disk_name not in disk_list: + continue + result_rev[disk_name] = [] + if len(stage_list) == 0: + result_rev[disk_name] = list(stage_info.keys()) + continue + for stage_name, stage_data in stage_info.items(): + if stage_name in stage_list: + result_rev[disk_name].append(stage_name) + + return json.dumps(result_rev) + def get_io_data(self, data_struct): self.io_global_data = IO_GLOBAL_DATA return self.get_io_common(data_struct, self.io_global_data) -- Gitee From 074764f1c344d0e60f352bc41aedb54d53c56413 Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 17 Oct 2025 15:20:21 +0800 Subject: [PATCH 56/62] add test . 
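Create the IO_DUMP_DATA slots next to IO_GLOBAL_DATA so every (disk, stage, category) entry exists before the collector starts filling it. The nesting mirrors the existing table, roughly as in the sketch below; the stage and category values shown are placeholders, not the real EBPF_STAGE_LIST / Io_Category contents.

    STAGES = ["wbt", "rq_driver"]            # placeholder stage names
    CATEGORIES = ["read", "write", "flush", "discard"]

    def init_dump_table(disk_names):
        # one empty window list per (disk, stage, category)
        return {
            disk: {stage: {cat: [] for cat in CATEGORIES} for stage in STAGES}
            for disk in disk_names
        }

    table = init_dump_table(["sda", "vdb"])
    print(table["sda"]["wbt"])   # {'read': [], 'write': [], 'flush': [], 'discard': []}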
--- src/services/sentryCollector/collect_io.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index d33d883..524ae3b 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -302,13 +302,16 @@ class CollectIo(): self.disk_map_stage[disk_name] = EBPF_STAGE_LIST self.window_value[disk_name] = {} IO_GLOBAL_DATA[disk_name] = {} + IO_DUMP_DATA[disk_name] = {} for disk_name, stage_list in self.disk_map_stage.items(): for stage in stage_list: self.window_value[disk_name][stage] = {} IO_GLOBAL_DATA[disk_name][stage] = {} + IO_DUMP_DATA[disk_name][stage] = {} for category in Io_Category: IO_GLOBAL_DATA[disk_name][stage][category] = [] + IO_DUMP_DATA[disk_name][stage][category] = [] self.window_value[disk_name][stage][category] = [[0,0,0], [0,0,0]] return major_version in EBPF_SUPPORT_VERSION and os.path.exists('/usr/bin/ebpf_collector') and len(IO_GLOBAL_DATA) != 0 -- Gitee From 496418433e40a9c2cd97819607e3a1a48d944f8d Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 17 Oct 2025 15:32:30 +0800 Subject: [PATCH 57/62] add test . --- src/services/sentryCollector/collect_io.py | 2 +- src/services/sentryCollector/collect_server.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 524ae3b..e751d65 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -76,7 +76,7 @@ class CollectIo(): IO_CONFIG_DATA.append(self.max_save) @staticmethod - def update_io_threshold(self, disk_name, stage_list): + def update_io_threshold(disk_name, stage_list): temp_config = CollectConfig() temp_io_threshold = temp_config.get_io_threshold() disk_type_result = get_disk_type(disk_name) diff --git a/src/services/sentryCollector/collect_server.py b/src/services/sentryCollector/collect_server.py index 0fd5ba2..b045d4c 100644 --- a/src/services/sentryCollector/collect_server.py +++ b/src/services/sentryCollector/collect_server.py @@ -60,7 +60,7 @@ class CollectServer(): self.stop_event = threading.Event() @staticmethod - def get_io_common(self, data_struct, data_source): + def get_io_common(data_struct, data_source): result_rev = {} if len(IO_CONFIG_DATA) == 0: -- Gitee From 7cd3bf1d0ef1e6ea994213986449e41e79f07fd3 Mon Sep 17 00:00:00 2001 From: hewh Date: Fri, 17 Oct 2025 15:49:32 +0800 Subject: [PATCH 58/62] add test . 
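Bound the IO_DUMP_DATA windows the same way IO_GLOBAL_DATA is bounded: once a category list reaches max_save entries the oldest one is dropped, and a fresh entry is inserted at the front each collection period. A minimal sketch of that bounding behaviour follows; push_sample and win are made-up names, the collector works on IO_DUMP_DATA[disk][stage][io_type] directly.

    def push_sample(window, sample, max_save):
        # drop the oldest (tail) entry once the window is full
        if len(window) >= max_save:
            window.pop()
        # the newest entry always goes to the front
        window.insert(0, sample)

    win = []
    for i in range(5):
        push_sample(win, i, max_save=3)
    print(win)   # [4, 3, 2]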
--- src/services/sentryCollector/collect_io.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index e751d65..494ed54 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -390,6 +390,8 @@ class CollectIo(): return if (len(IO_GLOBAL_DATA[disk_name][stage][io_type])) >= self.max_save: IO_GLOBAL_DATA[disk_name][stage][io_type].pop() + if (len(IO_DUMP_DATA[disk_name][stage][io_type])) >= self.max_save: + IO_DUMP_DATA[disk_name][stage][io_type].pop() curr_finish_count, curr_latency, curr_io_dump_count = self.window_value[disk_name][stage][io_type][-1] prev_finish_count, prev_latency, prev_io_dump_count = self.window_value[disk_name][stage][io_type][-2] self.window_value[disk_name][stage][io_type].pop(0) @@ -401,6 +403,7 @@ class CollectIo(): if curr_io_dump > 0: logging.info(f"ebpf io_dump info : {disk_name}, {stage}, {io_type}, {curr_io_dump}") IO_GLOBAL_DATA[disk_name][stage][io_type].insert(0, [curr_lat, curr_io_dump, curr_io_length, curr_iops]) + IO_DUMP_DATA[disk_name][stage][io_type].insert(0, []) elapsed_time = time.time() - start_time sleep_time = self.period_time - elapsed_time -- Gitee From 9e868c98421dc75f7b44b22a74b3c749827ac630 Mon Sep 17 00:00:00 2001 From: hewh Date: Thu, 23 Oct 2025 10:42:45 +0800 Subject: [PATCH 59/62] add test . --- src/sentryPlugins/ai_block_io/ai_block_io.py | 6 ++--- .../ai_block_io/config_parser.py | 14 +++++----- .../avg_block_io/avg_block_io.py | 7 +++-- src/sentryPlugins/avg_block_io/config.py | 20 +++++++------- src/services/sentryCollector/collect_io.py | 26 +++++++++---------- 5 files changed, 37 insertions(+), 36 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/ai_block_io.py b/src/sentryPlugins/ai_block_io/ai_block_io.py index 5fcee22..2973d52 100644 --- a/src/sentryPlugins/ai_block_io/ai_block_io.py +++ b/src/sentryPlugins/ai_block_io/ai_block_io.py @@ -113,7 +113,7 @@ class SlowIODetection: train_data_duration, train_update_duration, slow_io_detection_frequency ) sliding_window_type = self._config_parser.sliding_window_type - window_size, window_threshold, window_threshold_iodump = ( + window_size, window_threshold_latency, window_threshold_iodump = ( self._config_parser.get_window_size_and_window_minimum_threshold() ) @@ -145,7 +145,7 @@ class SlowIODetection: sliding_window = SlidingWindowFactory().get_sliding_window( sliding_window_type, queue_length=window_size, - threshold=window_threshold, + threshold=window_threshold_latency, abs_threshold=tot_lim, avg_lim=avg_lim ) @@ -174,7 +174,7 @@ class SlowIODetection: sliding_window = SlidingWindowFactory().get_sliding_window( sliding_window_type, queue_length=window_size, - threshold=window_threshold + threshold=window_threshold_latency ) detector = Detector(metric_name, threshold, sliding_window) disk_detector.add_detector(detector) diff --git a/src/sentryPlugins/ai_block_io/config_parser.py b/src/sentryPlugins/ai_block_io/config_parser.py index af00702..b457e14 100644 --- a/src/sentryPlugins/ai_block_io/config_parser.py +++ b/src/sentryPlugins/ai_block_io/config_parser.py @@ -74,8 +74,8 @@ class ConfigParser: "n_sigma_parameter": 3.0, "win_type": get_sliding_window_type_enum("not_continuous"), "win_size": 30, - "win_threshold": 6, - "win_threshold_iodump": 6, + "win_threshold_latency": 6, + "win_threshold_iodump": 3, }, "latency_sata_ssd": { "read_avg_lim": 10000, @@ -427,11 +427,11 @@ class ConfigParser: ) def 
_read_window_minimum_threshold(self, items_sliding_window: dict): - default_window_minimum_threshold = self.DEFAULT_CONF["algorithm"]["win_threshold"] - self._conf["algorithm"]["win_threshold"] = ( + default_window_minimum_threshold = self.DEFAULT_CONF["algorithm"]["win_threshold_latency"] + self._conf["algorithm"]["win_threshold_latency"] = ( self._get_config_value( items_sliding_window, - "win_threshold", + "win_threshold_latency", int, default_window_minimum_threshold, gt=0, @@ -707,7 +707,7 @@ class ConfigParser: def get_window_size_and_window_minimum_threshold(self): return ( self._conf["algorithm"]["win_size"], - self._conf["algorithm"]["win_threshold"], + self._conf["algorithm"]["win_threshold_latency"], self._conf["algorithm"]["win_threshold_iodump"], ) @@ -737,7 +737,7 @@ class ConfigParser: @property def window_minimum_threshold(self): - return self._conf["algorithm"]["win_threshold"] + return self._conf["algorithm"]["win_threshold_latency"] @property def absolute_threshold(self): diff --git a/src/sentryPlugins/avg_block_io/avg_block_io.py b/src/sentryPlugins/avg_block_io/avg_block_io.py index 302d8e0..ef19b7b 100644 --- a/src/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/sentryPlugins/avg_block_io/avg_block_io.py @@ -54,7 +54,10 @@ def init_io_win(io_dic, config, common_param): iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get("iodump", {}).get(iodump_lim_key)) if avg_lim_value and avg_time_value and tot_lim_value: - io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value) + io_data[disk_name][stage_name][rw]["latency"] = \ + IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold_latency"], \ + abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, \ + abnormal_time=tot_lim_value) logging.debug("Successfully create {}-{}-{}-latency window".format(disk_name, stage_name, rw)) if iodump_lim_value is not None: @@ -190,7 +193,7 @@ def main(): # 初始化窗口 -- config读取,对应is_iocollect_valid返回的结果 # step1. 解析公共配置 --- algorithm - io_dic["win_size"], io_dic["win_threshold"], io_dic["win_threshold_iodump"] = read_config_algorithm(config) + io_dic["win_size"], io_dic["win_threshold_latency"], io_dic["win_threshold_iodump"] = read_config_algorithm(config) # step2. 
解析公共配置 --- latency_xxx common_param = read_config_latency(config) diff --git a/src/sentryPlugins/avg_block_io/config.py b/src/sentryPlugins/avg_block_io/config.py index c7fca64..79bd21a 100644 --- a/src/sentryPlugins/avg_block_io/config.py +++ b/src/sentryPlugins/avg_block_io/config.py @@ -24,7 +24,7 @@ CONF_COMMON_PER_TIME = 'period_time' CONF_ALGO = 'algorithm' CONF_ALGO_SIZE = 'win_size' -CONF_ALGO_THRE = 'win_threshold' +CONF_ALGO_THRE_LATENCY = 'win_threshold_latency' CONF_ALGO_THRE_IODUMP = 'win_threshold_iodump' CONF_LATENCY = 'latency_{}' @@ -41,8 +41,8 @@ DEFAULT_PARAM = { CONF_COMMON_PER_TIME: 1 }, CONF_ALGO: { CONF_ALGO_SIZE: 30, - CONF_ALGO_THRE: 6, - CONF_ALGO_THRE_IODUMP: 6 + CONF_ALGO_THRE_LATENCY: 6, + CONF_ALGO_THRE_IODUMP: 3 }, 'latency_nvme_ssd': { 'read_avg_lim': 10000, 'write_avg_lim': 10000, @@ -164,14 +164,14 @@ def read_config_algorithm(config): logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_SIZE}, use {win_size} as default") try: - win_threshold = int(config.get(CONF_ALGO, CONF_ALGO_THRE)) - if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size: - raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE}") + win_threshold_latency = int(config.get(CONF_ALGO, CONF_ALGO_THRE_LATENCY)) + if win_threshold_latency < 1 or win_threshold_latency > 300 or win_threshold_latency > win_size: + raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE_LATENCY}") except ValueError: - report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE} config") + report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE_LATENCY} config") except configparser.NoOptionError: - win_threshold = DEFAULT_PARAM[CONF_ALGO]['win_threshold'] - logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_THRE}, use {win_threshold} as default") + win_threshold_latency = DEFAULT_PARAM[CONF_ALGO]['win_threshold_latency'] + logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_THRE_LATENCY}, use {win_threshold_latency} as default") try: win_threshold_iodump = int(config.get(CONF_ALGO, CONF_ALGO_THRE_IODUMP)) @@ -183,7 +183,7 @@ def read_config_algorithm(config): win_threshold_iodump = DEFAULT_PARAM[CONF_ALGO][CONF_ALGO_THRE_IODUMP] logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_THRE_IODUMP}, use {win_threshold_iodump} as default") - return win_size, win_threshold, win_threshold_iodump + return win_size, win_threshold_latency, win_threshold_iodump def read_config_latency(config): diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 494ed54..405cd84 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -57,6 +57,7 @@ class CollectIo(): self.ebpf_base_path = 'ebpf_collector' self.loop_all = False + self.io_threshold_config = module_config.get_io_threshold_config() if disk_str == "default": self.loop_all = True @@ -65,7 +66,7 @@ class CollectIo(): self.stop_event = threading.Event() self.iodump_pattern = re.compile( - r'(?P\w+)-(?P\d+)\s+' + r'(?P[^-]+)-(?P\d+)\s+' r'\w+\s+' r'stage\s+(?P\w+)\s+' r'(?P[0-9a-fA-F]{16})\s+' @@ -75,13 +76,19 @@ class CollectIo(): IO_CONFIG_DATA.append(self.period_time) IO_CONFIG_DATA.append(self.max_save) - @staticmethod - def update_io_threshold(disk_name, stage_list): - temp_config = CollectConfig() - temp_io_threshold = temp_config.get_io_threshold() + def update_io_threshold(self, disk_name, stage_list): disk_type_result = get_disk_type(disk_name) if disk_type_result["ret"] == 0 and disk_type_result["message"] in ('0', '1', '2'): disk_type = int(disk_type_result["message"]) + if 
disk_type == DiskType.TYPE_NVME_SSD: + config_threshold = str(self.io_threshold_config[CONF_IO_NVME_SSD]) + elif disk_type == DiskType.TYPE_SATA_SSD: + config_threshold = str(self.io_threshold_config[CONF_IO_SATA_SSD]) + elif disk_type == DiskType.TYPE_SATA_HDD: + config_threshold = str(self.io_threshold_config[CONF_IO_SATA_HDD]) + else: + return + for stage in stage_list: io_threshold_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/threshold'.format(disk_name, stage) try: @@ -94,15 +101,6 @@ class CollectIo(): logging.error("An error occurred while reading: %s", e) continue - if disk_type == DiskType.TYPE_NVME_SSD: - config_threshold = str(temp_io_threshold[CONF_IO_NVME_SSD]) - elif disk_type == DiskType.TYPE_SATA_SSD: - config_threshold = str(temp_io_threshold[CONF_IO_SATA_SSD]) - elif disk_type == DiskType.TYPE_SATA_HDD: - config_threshold = str(temp_io_threshold[CONF_IO_SATA_HDD]) - else: - continue - if current_threshold != config_threshold: try: with open(io_threshold_file, 'w') as file: -- Gitee From c2deec67dd413d54f5c63bd9b0cabd8dc6ec95b2 Mon Sep 17 00:00:00 2001 From: hewh Date: Thu, 23 Oct 2025 16:47:47 +0800 Subject: [PATCH 60/62] add test . --- config/plugins/ai_block_io.ini | 4 ++-- config/plugins/avg_block_io.ini | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini index 5f55ac7..ca926ac 100644 --- a/config/plugins/ai_block_io.ini +++ b/config/plugins/ai_block_io.ini @@ -14,8 +14,8 @@ algorithm_type=boxplot boxplot_parameter=1.5 win_type=not_continuous win_size=30 -win_threshold=6 -win_threshold_iodump=6 +win_threshold_latency=6 +win_threshold_iodump=3 [latency_sata_ssd] read_avg_lim=10000 diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini index 061839c..52c2ab9 100644 --- a/config/plugins/avg_block_io.ini +++ b/config/plugins/avg_block_io.ini @@ -9,8 +9,8 @@ period_time=1 [algorithm] win_size=30 -win_threshold=6 -win_threshold_iodump=6 +win_threshold_latency=6 +win_threshold_iodump=3 [latency_nvme_ssd] read_avg_lim=10000 -- Gitee From e0908d759755fe9a1e63403e3c3fca7459f3bf1a Mon Sep 17 00:00:00 2001 From: hewh Date: Thu, 23 Oct 2025 20:47:52 +0800 Subject: [PATCH 61/62] add test . --- src/services/sentryCollector/collect_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 405cd84..35eb38e 100644 --- a/src/services/sentryCollector/collect_io.py +++ b/src/services/sentryCollector/collect_io.py @@ -57,7 +57,7 @@ class CollectIo(): self.ebpf_base_path = 'ebpf_collector' self.loop_all = False - self.io_threshold_config = module_config.get_io_threshold_config() + self.io_threshold_config = module_config.get_io_threshold() if disk_str == "default": self.loop_all = True -- Gitee From 03461ff6495c76088b623af0574160efb0cd2662 Mon Sep 17 00:00:00 2001 From: hewh Date: Thu, 30 Oct 2025 11:36:47 +0800 Subject: [PATCH 62/62] add test . 
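Tidy the iodump extra logging in both plugins and the collector: extra_slow_log no longer returns early, so an alarm whose type matches both latency and iodump gets both views; bio entries whose io_stack is not in the stage map are skipped instead of being labelled Unknown; the TASK_NAME column widens to 18 characters; and collect_io calls update_io_threshold once at initialisation rather than on every polling loop. The filtering and formatting rule is roughly the following; STAGE_GROUPS matches the table added in PATCH 52, while the sample records are fabricated for illustration.

    STAGE_GROUPS = {
        'B->Q': ['throtl', 'wbt', 'iocost'],
        'Q->G': ['gettag'],
        'G->I': ['plug'],
        'I->D': ['deadline', 'bfq', 'hctx', 'requeue'],
        'D->C': ['rq_driver'],
    }
    # invert the table: io_stack -> stage group
    stack_to_stage = {stack: stage for stage, stacks in STAGE_GROUPS.items() for stack in stacks}

    records = [
        ("kworker/u8:1", "1234", "rq_driver", "ffff9a0000000000", "850"),
        ("fio", "4321", "unknown_x", "ffff9a0000000001", "10"),   # unknown stack, dropped
    ]
    print(f"{'TASK_NAME':<18} {'PID':>8} {'IO_STACK':<12} {'STAGE':<8} {'START_AGO(ms)':>10}")
    for task, pid, stack, ptr, ago in records:
        if stack in stack_to_stage:
            print(f"{task:<18} {pid:>8} {stack:<12} {stack_to_stage[stack]:<8} {ago:>10}")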
--- src/sentryPlugins/ai_block_io/extra_logger.py | 11 +++++------ src/sentryPlugins/avg_block_io/extra_logger.py | 11 +++++------ src/services/sentryCollector/collect_io.py | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/sentryPlugins/ai_block_io/extra_logger.py b/src/sentryPlugins/ai_block_io/extra_logger.py index dda0680..cfd1929 100644 --- a/src/sentryPlugins/ai_block_io/extra_logger.py +++ b/src/sentryPlugins/ai_block_io/extra_logger.py @@ -52,10 +52,8 @@ def init_extra_logger(log_path, log_level, log_format): def extra_slow_log(msg): if "latency" in str(msg.get('alarm_type', '')): extra_latency_log(msg) - return if "io_dump" in str(msg.get('alarm_type', '')): extra_iodump_log(msg) - return def extra_latency_log(msg): @@ -175,13 +173,14 @@ def extra_iodump_log(msg): for entry in window: parts = entry.split(',') task_name, pid, io_stack, bio_ptr, start_ago = parts - stage = stack_to_stage.get(io_stack, 'Unknown') - last_bio_record[bio_ptr] = (task_name, pid, io_stack, stage, bio_ptr, start_ago) + if io_stack in stack_to_stage: + stage = stack_to_stage[io_stack] + last_bio_record[bio_ptr] = (task_name, pid, io_stack, stage, bio_ptr, start_ago) - header = f"{'TASK_NAME':<10} {'PID':>8} {'IO_STACK':<12} {'STAGE':<8} {'BIO_PTR':<20} {'START_AGO(ms)':>10}" + header = f"{'TASK_NAME':<18} {'PID':>8} {'IO_STACK':<12} {'STAGE':<8} {'BIO_PTR':<20} {'START_AGO(ms)':>10}" extra_logger.warning(header) for bio_ptr in last_bio_record: task_name, pid, io_stack, stage, bio_ptr, start_ago = last_bio_record[bio_ptr] - line = f"{task_name:<10} {pid:>8} {io_stack:<12} {stage:<8} {bio_ptr:<20} {start_ago:>10}" + line = f"{task_name:<18} {pid:>8} {io_stack:<12} {stage:<8} {bio_ptr:<20} {start_ago:>10}" extra_logger.warning(line) \ No newline at end of file diff --git a/src/sentryPlugins/avg_block_io/extra_logger.py b/src/sentryPlugins/avg_block_io/extra_logger.py index d6625c3..ac86306 100644 --- a/src/sentryPlugins/avg_block_io/extra_logger.py +++ b/src/sentryPlugins/avg_block_io/extra_logger.py @@ -55,10 +55,8 @@ def init_extra_logger(log_path, log_level, log_format): def extra_slow_log(msg): if "latency" in str(msg.get('alarm_type', '')): extra_latency_log(msg) - return if "iodump" in str(msg.get('alarm_type', '')): extra_iodump_log(msg) - return def extra_latency_log(msg): @@ -188,13 +186,14 @@ def extra_iodump_log(msg): for entry in window: parts = entry.split(',') task_name, pid, io_stack, bio_ptr, start_ago = parts - stage = stack_to_stage.get(io_stack, 'Unknown') - last_bio_record[bio_ptr] = (task_name, pid, io_stack, stage, bio_ptr, start_ago) + if io_stack in stack_to_stage: + stage = stack_to_stage[io_stack] + last_bio_record[bio_ptr] = (task_name, pid, io_stack, stage, bio_ptr, start_ago) - header = f"{'TASK_NAME':<10} {'PID':>8} {'IO_STACK':<12} {'STAGE':<8} {'BIO_PTR':<20} {'START_AGO(ms)':>10}" + header = f"{'TASK_NAME':<18} {'PID':>8} {'IO_STACK':<12} {'STAGE':<8} {'BIO_PTR':<20} {'START_AGO(ms)':>10}" extra_logger.warning(header) for bio_ptr in last_bio_record: task_name, pid, io_stack, stage, bio_ptr, start_ago = last_bio_record[bio_ptr] - line = f"{task_name:<10} {pid:>8} {io_stack:<12} {stage:<8} {bio_ptr:<20} {start_ago:>10}" + line = f"{task_name:<18} {pid:>8} {io_stack:<12} {stage:<8} {bio_ptr:<20} {start_ago:>10}" extra_logger.warning(line) \ No newline at end of file diff --git a/src/services/sentryCollector/collect_io.py b/src/services/sentryCollector/collect_io.py index 35eb38e..612ee69 100644 --- a/src/services/sentryCollector/collect_io.py +++ 
b/src/services/sentryCollector/collect_io.py @@ -511,6 +511,7 @@ class CollectIo(): for category in Io_Category: IO_GLOBAL_DATA[disk_name][stage][category] = [] IO_DUMP_DATA[disk_name][stage][category] = [] + self.update_io_threshold(disk_name, stage_list) while True: start_time = time.time() @@ -522,7 +523,6 @@ class CollectIo(): for disk_name, stage_list in self.disk_map_stage.items(): if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: continue - self.update_io_threshold(disk_name, stage_list) self.append_period_lat(disk_name, stage_list) elapsed_time = time.time() - start_time -- Gitee