From f1612b1b026444f1b6a93c526f062dce3c726def Mon Sep 17 00:00:00 2001
From: luckky
Date: Tue, 15 Oct 2024 12:03:00 +0000
Subject: [PATCH] add hbm_online_repair

Signed-off-by: luckky
---
 add-hbm-online-repair.patch | 4090 +++++++++++++++++++++++++++++++++++
 sysSentry.spec              |   41 +-
 2 files changed, 4130 insertions(+), 1 deletion(-)
 create mode 100644 add-hbm-online-repair.patch

diff --git a/add-hbm-online-repair.patch b/add-hbm-online-repair.patch
new file mode 100644
index 0000000..3804a75
--- /dev/null
+++ b/add-hbm-online-repair.patch
@@ -0,0 +1,4090 @@
+From 844e1ad845c58a98fc803b754c0e44ad071bb707 Mon Sep 17 00:00:00 2001
+From: luckky
+Date: Tue, 22 Oct 2024 18:09:30 +0800
+Subject: [PATCH] add hbm online repair
+
+---
+ build/build.sh                                |  16 +-
+ config/collector.conf                         |   7 +
+ config/plugins/avg_block_io.ini               |  21 +
+ config/tasks/avg_block_io.mod                 |   5 +
+ config/tasks/hbm_online_repair.mod            |   9 +
+ service/sentryCollector.service               |  12 +
+ .../plugin/cpu_patrol/cpu_patrol_result.c     |   4 +-
+ .../plugin/cpu_patrol/cpu_patrol_result.h     |   4 +-
+ src/c/hbm_online_repair/.gitignore            |   6 +
+ src/c/hbm_online_repair/Makefile              |  25 +
+ src/c/hbm_online_repair/hbm_online_repair.c   | 144 ++++
+ src/c/hbm_online_repair/hbm_online_repair.env |   2 +
+ src/c/hbm_online_repair/logger.h              |  31 +
+ .../non-standard-hbm-repair.c                 | 799 ++++++++++++++++++
+ .../non-standard-hbm-repair.h                 |  89 ++
+ src/c/hbm_online_repair/ras-events.c          | 534 ++++++++++++
+ src/c/hbm_online_repair/ras-events.h          |  28 +
+ .../ras-non-standard-handler.c                |  81 ++
+ .../ras-non-standard-handler.h                |  25 +
+ src/python/.gitignore                         |   1 +
+ src/python/sentryCollector/__init__.py        |   0
+ src/python/sentryCollector/__main__.py        |  17 +
+ src/python/sentryCollector/collect_config.py  | 118 +++
+ src/python/sentryCollector/collect_io.py      | 243 ++++++
+ src/python/sentryCollector/collect_plugin.py  | 276 ++++++
+ src/python/sentryCollector/collect_server.py  | 285 +++++++
+ src/python/sentryCollector/collectd.py        |  97 +++
+ src/python/sentryPlugins/__init__.py          |   0
+ .../sentryPlugins/avg_block_io/__init__.py    |   0
+ .../avg_block_io/avg_block_io.py              | 257 ++++++
+ .../sentryPlugins/avg_block_io/module_conn.py |  86 ++
+ .../avg_block_io/stage_window.py              |  47 ++
+ .../sentryPlugins/avg_block_io/utils.py       |  86 ++
+ src/python/setup.py                           |   4 +-
+ src/python/syssentry/bmc_alarm.py             | 159 ++++
+ src/python/syssentry/callbacks.py             |   2 +-
+ src/python/syssentry/cpu_alarm.py             |   1 +
+ src/python/syssentry/cpu_sentry.py            |  36 +-
+ src/python/syssentry/cron_process.py          |   2 +-
+ src/python/syssentry/syssentry.py             |  79 +-
+ src/python/xalarm/xalarm_api.py               |   2 +-
+ 41 files changed, 3592 insertions(+), 48 deletions(-)
+ create mode 100644 config/collector.conf
+ create mode 100644 config/plugins/avg_block_io.ini
+ create mode 100644 config/tasks/avg_block_io.mod
+ create mode 100644 config/tasks/hbm_online_repair.mod
+ create mode 100644 service/sentryCollector.service
+ create mode 100644 src/c/hbm_online_repair/.gitignore
+ create mode 100644 src/c/hbm_online_repair/Makefile
+ create mode 100644 src/c/hbm_online_repair/hbm_online_repair.c
+ create mode 100644 src/c/hbm_online_repair/hbm_online_repair.env
+ create mode 100644 src/c/hbm_online_repair/logger.h
+ create mode 100644 src/c/hbm_online_repair/non-standard-hbm-repair.c
+ create mode 100644 src/c/hbm_online_repair/non-standard-hbm-repair.h
+ create mode 100644 src/c/hbm_online_repair/ras-events.c
+ create mode 100644 src/c/hbm_online_repair/ras-events.h
+ create mode 100644 src/c/hbm_online_repair/ras-non-standard-handler.c
+ create
mode 100644 src/c/hbm_online_repair/ras-non-standard-handler.h + create mode 100644 src/python/.gitignore + create mode 100644 src/python/sentryCollector/__init__.py + create mode 100644 src/python/sentryCollector/__main__.py + create mode 100644 src/python/sentryCollector/collect_config.py + create mode 100644 src/python/sentryCollector/collect_io.py + create mode 100644 src/python/sentryCollector/collect_plugin.py + create mode 100644 src/python/sentryCollector/collect_server.py + create mode 100644 src/python/sentryCollector/collectd.py + create mode 100644 src/python/sentryPlugins/__init__.py + create mode 100644 src/python/sentryPlugins/avg_block_io/__init__.py + create mode 100644 src/python/sentryPlugins/avg_block_io/avg_block_io.py + create mode 100644 src/python/sentryPlugins/avg_block_io/module_conn.py + create mode 100644 src/python/sentryPlugins/avg_block_io/stage_window.py + create mode 100644 src/python/sentryPlugins/avg_block_io/utils.py + create mode 100644 src/python/syssentry/bmc_alarm.py + +diff --git a/build/build.sh b/build/build.sh +index 17af8a0..e2442e6 100644 +--- a/build/build.sh ++++ b/build/build.sh +@@ -43,23 +43,23 @@ function install() + } + + [ "$1" == "-b" ] && { +- INTALL_DIR=$2 ++ INSTALL_DIR=$2 + [ -z $2 ] && { +- INTALL_DIR=/usr/lib64 +- mkdir -p ${INTALL_DIR} ++ INSTALL_DIR=/usr/lib64 ++ mkdir -p ${INSTALL_DIR} + } + +- build ${INTALL_DIR} ++ build ${INSTALL_DIR} + exit 0 + } + + [ "$1" == "-i" ] && { +- INTALL_DIR=$2 ++ INSTALL_DIR=$2 + [ -z $2 ] && { +- INTALL_DIR=/usr/lib64 +- mkdir -p ${INTALL_DIR} ++ INSTALL_DIR=/usr/lib64 ++ mkdir -p ${INSTALL_DIR} + } +- install ${INTALL_DIR} ++ install ${INSTALL_DIR} + exit 0 + } + +diff --git a/config/collector.conf b/config/collector.conf +new file mode 100644 +index 0000000..9baa086 +--- /dev/null ++++ b/config/collector.conf +@@ -0,0 +1,7 @@ ++[common] ++modules=io ++ ++[io] ++period_time=1 ++max_save=10 ++disk=default +\ No newline at end of file +diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini +new file mode 100644 +index 0000000..bc33dde +--- /dev/null ++++ b/config/plugins/avg_block_io.ini +@@ -0,0 +1,21 @@ ++[common] ++disk=default ++stage=default ++iotype=read,write ++period_time=1 ++ ++[algorithm] ++win_size=30 ++win_threshold=6 ++ ++[latency] ++read_avg_lim=10 ++write_avg_lim=10 ++read_avg_time=3 ++write_avg_time=3 ++read_tot_lim=50 ++write_tot_lim=50 ++ ++[iodump] ++read_iodump_lim=0 ++write_iodump_lim=0 +diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod +new file mode 100644 +index 0000000..75c7299 +--- /dev/null ++++ b/config/tasks/avg_block_io.mod +@@ -0,0 +1,5 @@ ++[common] ++enabled=yes ++task_start=/usr/bin/avg_block_io ++task_stop=pkill avg_block_io ++type=oneshot +\ No newline at end of file +diff --git a/config/tasks/hbm_online_repair.mod b/config/tasks/hbm_online_repair.mod +new file mode 100644 +index 0000000..77dd73e +--- /dev/null ++++ b/config/tasks/hbm_online_repair.mod +@@ -0,0 +1,9 @@ ++[common] ++enabled=yes ++task_start=/usr/bin/hbm_online_repair ++task_stop=kill $pid ++type=period ++interval=180 ++onstart=yes ++env_file=/etc/sysconfig/hbm_online_repair.env ++conflict=up +\ No newline at end of file +diff --git a/service/sentryCollector.service b/service/sentryCollector.service +new file mode 100644 +index 0000000..2e50d7a +--- /dev/null ++++ b/service/sentryCollector.service +@@ -0,0 +1,12 @@ ++[Unit] ++Description = Collection module added for sysSentry and kernel lock-free collection ++ ++[Service] 
++ExecStart=/usr/bin/sentryCollector
++ExecStop=/bin/kill $MAINPID
++KillMode=process
++Restart=on-failure
++RestartSec=10s
++
++[Install]
++WantedBy = multi-user.target
+diff --git a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c
+index 8e31312..9f8d80c 100644
+--- a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c
++++ b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c
+@@ -22,8 +22,8 @@ static cat_return_t insert_core_to_list(core_list_st *core_list, int coreid)
+         CAT_LOG_W("Core %d is a special core and cannot be isolated", coreid);
+         return CAT_OK;
+     }
+-    if (coreid < 0) {
+-        CAT_LOG_W("Inner error, coreid is a negative number");
++    if ((core_list->current_nums == MAX_ISOLATE_CORES_PER_PATROL) || (coreid < 0)) {
++        CAT_LOG_E("Insert error, core id(%d)", coreid);
+         return CAT_ERR;
+     }
+ 
+diff --git a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.h b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.h
+index 9722ec9..92dcdc3 100644
+--- a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.h
++++ b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.h
+@@ -30,9 +30,9 @@ typedef enum {
+ #define CAT_LOG_W(...) CAT_LOG("WARN", __VA_ARGS__)
+ #define CAT_LOG_E(...) CAT_LOG("ERROR", __VA_ARGS__)
+ 
+-#define MAX_CPU_CORES 4096
++#define MAX_ISOLATE_CORES_PER_PATROL 64 // Maximum number of faulty cores that can be isolated in one patrol; detecting more than two faulty cores in a single patrol is very unlikely
+ typedef struct {
+-    unsigned int order_list[MAX_CPU_CORES];
++    unsigned int order_list[MAX_ISOLATE_CORES_PER_PATROL];
+     unsigned short current_nums;
+ } core_list_st;
+ 
+diff --git a/src/c/hbm_online_repair/.gitignore b/src/c/hbm_online_repair/.gitignore
+new file mode 100644
+index 0000000..a577882
+--- /dev/null
++++ b/src/c/hbm_online_repair/.gitignore
+@@ -0,0 +1,6 @@
++*.o
++*.c~
++*.h~
++hbm_online_repair
++
++.vscode/
+diff --git a/src/c/hbm_online_repair/Makefile b/src/c/hbm_online_repair/Makefile
+new file mode 100644
+index 0000000..16ebcd8
+--- /dev/null
++++ b/src/c/hbm_online_repair/Makefile
+@@ -0,0 +1,25 @@
++CC = gcc
++
++CFLAGS = -Wall -O3
++
++LDFLAGS = -ltraceevent
++
++SRC = $(wildcard *.c)
++HDR = $(wildcard *.h)
++
++OBJ = $(SRC:.c=.o)
++
++TARGET = hbm_online_repair
++
++all: $(TARGET)
++
++$(TARGET): $(OBJ)
++	$(CC) $(OBJ) -o $@ $(LDFLAGS)
++
++%.o: %.c $(HDR)
++	$(CC) $(CFLAGS) -c $< -o $@
++
++clean:
++	rm -f $(OBJ) $(TARGET)
++
++.PHONY: all clean
+diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c
+new file mode 100644
+index 0000000..3ace206
+--- /dev/null
++++ b/src/c/hbm_online_repair/hbm_online_repair.c
+@@ -0,0 +1,144 @@
++#include
++#include
++#include
++#include
++#include
++
++#include "logger.h"
++#include "ras-events.h"
++#include "non-standard-hbm-repair.h"
++
++#define DEFAULT_LOG_LEVEL LOG_INFO
++#define DEFAULT_PAGE_ISOLATION_THRESHOLD 128
++
++int global_level_setting;
++int page_isolation_threshold;
++
++int string2int(const char* str, int* value)
++{
++    if (!str) {
++        return -1;
++    }
++    char *endptr;
++    errno = 0;
++    long val = strtol(str, &endptr, 10);
++    if (errno != 0 || *endptr != '\0') {
++        return -1;
++    }
++    *value = (int)val;
++    if (val != (long)*value) {
++        return -1;
++    }
++    return 0;
++}
++
++int execute_command(const char *command)
++{
++    FILE *fp;
++    char buffer[128] = {0};
++    int ret;
++    fp = popen(command, "r");
++    if (!fp) {
++        log(LOG_ERROR, "popen failed\n");
++        return -1;
++    }
++
++    fgets(buffer, sizeof(buffer), fp);
++    log(LOG_DEBUG, "output of command is: 
%s\n", buffer); ++ ++ ret = pclose(fp); ++ if (ret < 0) { ++ log(LOG_ERROR, "pclose failed\n"); ++ return -1; ++ } ++ ++ if (!WIFEXITED(ret)) { ++ log(LOG_ERROR, "command did not terminate normally\n"); ++ return -1; ++ } ++ ++ ret = WEXITSTATUS(ret); ++ log(LOG_DEBUG, "command exited with status: %d\n", ret); ++ return ret; ++} ++ ++int load_required_driver(void) ++{ ++ int ret; ++ ret = execute_command("modprobe hisi_mem_ras 2>&1"); ++ if (ret < 0) { ++ log(LOG_ERROR, "load repair driver failed\n"); ++ return ret; ++ } ++ ret = execute_command("modprobe page_eject 2>&1"); ++ if (ret < 0) { ++ log(LOG_ERROR, "load page driver failed\n"); ++ return ret; ++ } ++ log(LOG_INFO, "load required driver success\n"); ++ return ret; ++} ++ ++void hbm_param_init(void) ++{ ++ int ret; ++ char *env; ++ ++ env = getenv("HBM_ONLINE_REPAIR_LOG_LEVEL"); ++ ret = string2int(env, &global_level_setting); ++ if (ret < 0) { ++ global_level_setting = DEFAULT_LOG_LEVEL; ++ log(LOG_WARNING, "Get log level from config failed, set the default value %d\n", DEFAULT_LOG_LEVEL); ++ } else { ++ log(LOG_INFO, "log level: %d\n", global_level_setting); ++ } ++ ++ env = getenv("PAGE_ISOLATION_THRESHOLD"); ++ ret = string2int(env, &page_isolation_threshold); ++ if (ret < 0) { ++ page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD; ++ log(LOG_WARNING, "Get page_isolation_threshold from config failed, set the default value %d\n", DEFAULT_PAGE_ISOLATION_THRESHOLD); ++ } else { ++ log(LOG_INFO, "page_isolation_threshold: %d\n", page_isolation_threshold); ++ } ++} ++ ++ ++int main(int argc, char *argv[]) ++{ ++ int ret; ++ ++ hbm_param_init(); ++ ++ ret = load_required_driver(); ++ if (ret < 0) { ++ log(LOG_DEBUG, "load required driver failed\n"); ++ return ret; ++ } ++ ++ struct ras_events *ras = init_trace_instance(); ++ if (!ras) ++ return -1; ++ ++ ret = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 1); ++ if (ret < 0) { ++ log(LOG_WARNING, "unable to enable ras non_standard_event.\n"); ++ free(ras); ++ return -1; ++ } ++ ++ ret = init_all_flash(); ++ if (ret < 0) { ++ log(LOG_ERROR, "flash writer init failed\n"); ++ } ++ ++ handle_ras_events(ras); ++ ++ ret = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 0); ++ if (ret < 0) { ++ log(LOG_WARNING, "unable to disable ras non_standard_event.\n"); ++ } ++ ++ free(ras); ++ return ret; ++} +diff --git a/src/c/hbm_online_repair/hbm_online_repair.env b/src/c/hbm_online_repair/hbm_online_repair.env +new file mode 100644 +index 0000000..de56079 +--- /dev/null ++++ b/src/c/hbm_online_repair/hbm_online_repair.env +@@ -0,0 +1,2 @@ ++HBM_ONLINE_REPAIR_LOG_LEVEL=1 ++PAGE_ISOLATION_THRESHOLD=128 +diff --git a/src/c/hbm_online_repair/logger.h b/src/c/hbm_online_repair/logger.h +new file mode 100644 +index 0000000..ddfa932 +--- /dev/null ++++ b/src/c/hbm_online_repair/logger.h +@@ -0,0 +1,31 @@ ++#ifndef __LOGGER_H ++#define __LOGGER_H ++ ++#define TOOL_NAME "hbm_online_repair" ++ ++#define LOG_DEBUG 0 ++#define LOG_INFO 1 ++#define LOG_WARNING 2 ++#define LOG_ERROR 3 ++ ++extern int global_level_setting; ++ ++#define log_prefix(level) \ ++ (level == LOG_DEBUG ? "DEBUG" : \ ++ level == LOG_INFO ? "INFO" : \ ++ level == LOG_WARNING ? "WARNING" : \ ++ level == LOG_ERROR ? "ERROR" : \ ++ "UNKNOWN_LEVEL") ++ ++#define log_fd(level) \ ++ (level == LOG_ERROR ? stderr : stdout) ++ ++#define log(level, fmt, args...) 
do {\ ++ if (level >= global_level_setting) {\ ++ fprintf(log_fd(level), "[%s] %s: ", log_prefix(level), TOOL_NAME);\ ++ fprintf(log_fd(level), fmt, ##args);\ ++ fflush(log_fd(level));\ ++ }\ ++} while (0) ++ ++#endif +diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c +new file mode 100644 +index 0000000..b175e14 +--- /dev/null ++++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c +@@ -0,0 +1,799 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "logger.h" ++#include "non-standard-hbm-repair.h" ++ ++extern int page_isolation_threshold; ++size_t total_size = 0; ++struct hisi_common_error_section { ++ uint32_t val_bits; ++ uint8_t version; ++ uint8_t soc_id; ++ uint8_t socket_id; ++ uint8_t totem_id; ++ uint8_t nimbus_id; ++ uint8_t subsystem_id; ++ uint8_t module_id; ++ uint8_t submodule_id; ++ uint8_t core_id; ++ uint8_t port_id; ++ uint16_t err_type; ++ struct { ++ uint8_t function; ++ uint8_t device; ++ uint16_t segment; ++ uint8_t bus; ++ uint8_t reserved[3]; ++ } pcie_info; ++ uint8_t err_severity; ++ uint8_t reserved[3]; ++ uint32_t reg_array_size; ++ uint32_t reg_array[]; ++}; ++ ++struct fault_addr_info { ++ uint32_t processer_id; ++ uint32_t die_id; ++ uint32_t stack_id; ++ uint32_t sid; ++ uint32_t channel_id; ++ uint32_t bankgroup_id; ++ uint32_t bank_id; ++ uint32_t row_id; ++ uint32_t column_id; ++ uint32_t error_type; ++ uint32_t repair_type; ++ uint32_t reserved; ++ uint32_t crc8; ++}; ++ ++typedef struct { ++ const char *VariableName; ++ const char *VendorGuid; ++ uint32_t DataSize; ++ uint8_t *Data; ++ uint32_t Attributes; ++} efi_variable_t; ++ ++char* flash_names[FLASH_ENTRY_NUM] = { ++ "repair0000", ++ "repair0001", ++ "repair0100", ++ "repair0101", ++ "repair0200", ++ "repair0201", ++ "repair0300", ++ "repair0301", ++}; ++char *flash_guids[FLASH_ENTRY_NUM] = { ++ "CD2FF4D9-D937-4e1d-B810-A1A568C37C01", ++ "DD92CC91-43E6-4c69-A42A-B08F72FCB157", ++ "4A8E0D1E-4CFA-47b2-9359-DA3A0006878B", ++ "733F9979-4ED4-478d-BD6A-E4D0F0390FDB", ++ "9BFBBA1F-5A93-4d36-AD47-D3C2D714D914", ++ "A0920D6F-78B8-4c09-9F61-7CEC845F116C", ++ "0049CE5E-8C18-414c-BDC1-A87E60CEEFD7", ++ "6AED17B4-50C7-4a40-A5A7-48AF55DD8EAC" ++}; ++ ++static int get_guid_index(uint32_t socket_id, uint32_t error_type) { ++ if (2 * socket_id + error_type >= FLASH_ENTRY_NUM) ++ return -1; ++ return 2 * socket_id + error_type; ++} ++ ++static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned long long fault_addr) ++{ ++ info_struct->processer_id = fault_addr & FAULT_ADDR_PROCESSOR_ID_MASK; ++ fault_addr >>= FAULT_ADDR_PROCESSOR_ID_LEN; ++ info_struct->die_id = fault_addr & FAULT_ADDR_DIE_ID_MASK; ++ fault_addr >>= FAULT_ADDR_DIE_ID_LEN; ++ info_struct->stack_id = fault_addr & FAULT_ADDR_STACK_ID_MASK; ++ fault_addr >>= FAULT_ADDR_STACK_ID_LEN; ++ info_struct->sid = fault_addr & FAULT_ADDR_SID_MASK; ++ fault_addr >>= FAULT_ADDR_SID_LEN; ++ info_struct->channel_id = fault_addr & FAULT_ADDR_CHANNEL_ID_MASK; ++ fault_addr >>= FAULT_ADDR_CHANNEL_ID_LEN; ++ info_struct->bankgroup_id = fault_addr & FAULT_ADDR_BANKGROUP_ID_MASK; ++ fault_addr >>= FAULT_ADDR_BANKGROUP_ID_LEN; ++ info_struct->bank_id = fault_addr & FAULT_ADDR_BANK_ID_MASK; ++ fault_addr >>= FAULT_ADDR_BANK_ID_LEN; ++ info_struct->row_id = fault_addr & FAULT_ADDR_ROW_ID_MASK; ++ fault_addr >>= FAULT_ADDR_ROW_ID_LEN; ++ info_struct->column_id = fault_addr & 
FAULT_ADDR_COLUMN_ID_MASK; ++ fault_addr >>= FAULT_ADDR_CHANNEL_ID_LEN; ++ info_struct->error_type = fault_addr & FAULT_ADDR_ERROR_TYPE_MASK; ++ fault_addr >>= FAULT_ADDR_ERROR_TYPE_LEN; ++ info_struct->repair_type = fault_addr & FAULT_ADDR_REPAIR_TYPE_MASK; ++ fault_addr >>= FAULT_ADDR_REPAIR_TYPE_LEN; ++ info_struct->reserved = fault_addr & FAULT_ADDR_RESERVED_MASK; ++ fault_addr >>= FAULT_ADDR_RESERVED_LEN; ++ info_struct->crc8 = (uint32_t)fault_addr; ++} ++ ++static bool variable_existed(char *name, char *guid) ++{ ++ char filename[PATH_MAX]; ++ int fd; ++ ++ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); ++ ++ // open var file ++ fd = open(filename, O_RDONLY); ++ if (fd < 0) { ++ log(LOG_WARNING, "open file %s failed\n", filename); ++ return false; ++ } ++ close(fd); ++ return true; ++} ++ ++static uint32_t read_variable_attribute(char *name, char *guid) { ++ char filename[PATH_MAX]; ++ int fd; ++ size_t readsize; ++ uint32_t attribute = (uint32_t)-1; ++ ++ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); ++ ++ // open var file ++ fd = open(filename, O_RDONLY); ++ if (fd < 0) { ++ log(LOG_ERROR, "open %s failed\n", filename); ++ return attribute; ++ } ++ ++ // read attributes from first 4 bytes ++ readsize = read(fd, &attribute, sizeof(uint32_t)); ++ if (readsize != sizeof(uint32_t)) { ++ log(LOG_ERROR, "read attribute of %s failed\n", filename); ++ } ++ ++ close(fd); ++ return attribute; ++} ++ ++static int efivarfs_set_mutable(char *name, char *guid, bool mutable) ++{ ++ unsigned long orig_attrs, new_attrs; ++ char filename[PATH_MAX]; ++ int fd; ++ ++ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); ++ ++ fd = open(filename, O_RDONLY); ++ if (fd < 0) { ++ log(LOG_ERROR, "open %s failed\n", filename); ++ goto err; ++ } ++ ++ if (ioctl(fd, FS_IOC_GETFLAGS, &orig_attrs) == -1) { ++ log(LOG_ERROR, "ioctl FS_IOC_GETFLAGS failed\n"); ++ goto err; ++ } ++ ++ if (mutable) ++ new_attrs = orig_attrs & ~(unsigned long)FS_IMMUTABLE_FL; ++ else ++ new_attrs = orig_attrs | FS_IMMUTABLE_FL; ++ ++ if (new_attrs == orig_attrs) { ++ close(fd); ++ return 0; ++ } ++ ++ if (ioctl(fd, FS_IOC_SETFLAGS, &new_attrs) == -1) { ++ log(LOG_ERROR, "ioctl FS_IOC_SETFLAGS failed\n"); ++ goto err; ++ } ++ close(fd); ++ return 0; ++err: ++ if (fd >= 0) ++ close(fd); ++ return -1; ++} ++ ++static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute) { ++ int fd, mode; ++ size_t writesize; ++ void *buffer; ++ unsigned long total; ++ char filename[PATH_MAX]; ++ ++ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); ++ ++ // prepare attributes(size 4 bytes) and data ++ total = size + sizeof(uint32_t); ++ buffer = malloc(total); ++ if (buffer == NULL) { ++ log(LOG_ERROR, "malloc data for %s failed\n", filename); ++ goto err; ++ } ++ memcpy(buffer, &attribute, sizeof(uint32_t)); ++ memcpy(buffer + sizeof(uint32_t), value, size); ++ ++ // change attr ++ if (efivarfs_set_mutable(name, guid, 1) != 0) { ++ log(LOG_ERROR, "set mutable for %s failed\n", filename); ++ goto err; ++ } ++ ++ mode = O_WRONLY; ++ if (attribute & EFI_VARIABLE_APPEND_WRITE) ++ mode |= O_APPEND; ++ else ++ mode |= O_CREAT; ++ ++ // open var file ++ fd = open(filename, mode, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); ++ if (fd < 0) { ++ log(LOG_ERROR, "open %s failed\n", filename); ++ goto err; ++ } ++ ++ // write to var file ++ writesize = write(fd, buffer, total); ++ if (writesize != total) { ++ log(LOG_ERROR, "write %s 
failed\n", filename); ++ goto err; ++ } ++ ++ close(fd); ++ free(buffer); ++ if (efivarfs_set_mutable(name, guid, 0) != 0) { ++ log(LOG_ERROR, "set immutable for %s failed\n", filename); ++ } ++ return 0; ++err: ++ if (fd >= 0) ++ close(fd); ++ if (buffer) ++ free(buffer); ++ if (efivarfs_set_mutable(name, guid, 0) != 0) { ++ log(LOG_ERROR, "set immutable for %s failed\n", filename); ++ } ++ return -1; ++} ++ ++static int append_variable(char *name, char *guid, void *data, unsigned long size) { ++ // prepare append attribute ++ uint32_t attribute = read_variable_attribute(name, guid); ++ if (attribute == (uint32_t)-1) { ++ log(LOG_ERROR, "read %s-%s attribute failed\n", name, guid); ++ return -1; ++ } ++ attribute |= EFI_VARIABLE_APPEND_WRITE; ++ ++ return write_variable(name, guid, data, size, attribute); ++} ++ ++static size_t get_var_size(char *name, char *guid) { ++ char filename[PATH_MAX]; ++ int fd; ++ struct stat stat; ++ ++ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); ++ ++ // open var file ++ fd = open(filename, O_RDONLY); ++ if (fd < 0) { ++ log(LOG_WARNING, "open %s failed\n", filename); ++ goto err; ++ } ++ // read stat ++ if (fstat(fd, &stat) != 0) { ++ log(LOG_WARNING, "fstat %s failed\n", filename); ++ goto err; ++ } ++ close(fd); ++ return stat.st_size; ++err: ++ if (fd >= 0) ++ close(fd); ++ return (size_t)-1; ++} ++ ++int init_all_flash() { ++ for (int i = 0; i < FLASH_ENTRY_NUM; i++) { ++ // check existed entry ++ if (variable_existed(flash_names[i], flash_guids[i])) { ++ total_size += get_var_size(flash_names[i], flash_guids[i]); ++ continue; ++ } ++ // create new entry ++ uint32_t attribute = EFI_VARIABLE_NON_VOLATILE | ++ EFI_VARIABLE_BOOTSERVICE_ACCESS | ++ EFI_VARIABLE_RUNTIME_ACCESS; ++ char *data = ""; ++ unsigned long size = 1; ++ int ret = write_variable(flash_names[i], flash_guids[i], data, size, attribute); ++ if (ret) { ++ log(LOG_ERROR, "init %s-%s failed, fault info storage funtion not enabled\n", flash_names[i], flash_guids[i]); ++ return -1; ++ } ++ total_size += sizeof(uint32_t) + 1; ++ } ++ // check total entry size ++ log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n", ++ total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE); ++ if (total_size > MAX_VAR_SIZE) { ++ log(LOG_ERROR, "fault info storage reach threshold, cannot save new record\n"); ++ } ++ return 0; ++} ++ ++static int write_fault_info_to_flash(const struct hisi_common_error_section *err) { ++ int ret, guid_index; ++ uint32_t reg_size; ++ uint64_t fault_addr; ++ ++ // check flash usage threshold ++ if (total_size + sizeof(uint64_t) > MAX_VAR_SIZE) { ++ log(LOG_WARNING, "fault info storage reach threshold, cannot save new record into flash\n"); ++ return -1; ++ } ++ ++ // parse physical addr ++ reg_size = err->reg_array_size / sizeof(uint32_t); ++ fault_addr = err->reg_array[reg_size - 1]; ++ fault_addr <<= TYPE_UINT32_WIDTH; ++ fault_addr += err->reg_array[reg_size - 2]; ++ ++ // get guid ++ struct fault_addr_info info_struct; ++ parse_fault_addr_info(&info_struct, fault_addr); ++ guid_index = get_guid_index(info_struct.processer_id, info_struct.error_type); ++ if (guid_index < 0) { ++ log(LOG_ERROR, "invalid fault info\n"); ++ return -1; ++ } ++ // record physical addr in flash ++ ret = append_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t)); ++ if (ret < 0) { ++ log(LOG_ERROR, "append to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]); ++ return -1; ++ } ++ total_size += 
sizeof(uint64_t); ++ log(LOG_INFO, "write hbm fault info to flash success\n"); ++ return 0; ++} ++ ++static int write_file(char *path, const char *name, unsigned long long value) ++{ ++ char fname[MAX_PATH]; ++ char buf[20]; ++ int ret; ++ int fd; ++ ++ snprintf(fname, MAX_PATH, "%s/%s", path, name); ++ ++ fd = open(fname, O_WRONLY); ++ if (fd < 0) { ++ log(LOG_WARNING, "HBM ACLS: Cannot to open '%s': %s\n", ++ fname, strerror(errno)); ++ return -errno; ++ } ++ ++ snprintf(buf, sizeof(buf), "0x%llx\n", value); ++ ret = write(fd, buf, strlen(buf)); ++ if (ret <= 0) ++ log(LOG_WARNING, "HBM ACLS: Failed to set %s (0x%llx): %s\n", ++ fname, value, strerror(errno)); ++ ++ close(fd); ++ return ret > 0 ? 0 : -errno; ++} ++ ++static int get_hardware_corrupted_size() ++{ ++ FILE *fp; ++ char line[256]; ++ int hardware_corrupted_size = -1; ++ char *key = "HardwareCorrupted:"; ++ ++ fp = fopen("/proc/meminfo", "r"); ++ if (fp == NULL) { ++ log(LOG_ERROR, "Failed to open /proc/meminfo\n"); ++ return -1; ++ } ++ ++ while (fgets(line, sizeof(line), fp) != NULL) { ++ char *pos; ++ if ((pos = strstr(line, key)) != NULL) { ++ sscanf(pos, "HardwareCorrupted: %5d kB\n", &hardware_corrupted_size); ++ break; ++ } ++ } ++ ++ fclose(fp); ++ return hardware_corrupted_size; ++} ++ ++static uint8_t get_repair_result_code(int ret) ++{ ++ if (ret == -ENOSPC) { ++ return REPAIR_FAILED_NO_RESOURCE; ++ } else if (ret == -EIO) { ++ return REPAIR_FAILED_OTHER_REASON; ++ } else if (ret == -ENXIO || ret == -EINVAL) { ++ return REPAIR_FAILED_INVALID_PARAM; ++ } ++ return REPAIR_FAILED_OTHER_REASON; ++} ++ ++static int notice_BMC(const struct hisi_common_error_section *err, uint8_t repair_result_code) ++{ ++ int sockfd; ++ struct sockaddr_un addr; ++ char bmc_msg[sizeof(BMC_REPORT_FORMAT)] = {0}; ++ uint8_t repair_type_code, isolation_type_code; ++ uint32_t repair_type; ++ unsigned long long fault_addr; ++ ++ sockfd = socket(AF_UNIX, SOCK_STREAM, 0); ++ if (sockfd < 0) { ++ log(LOG_ERROR, "Failed to create BMC notice socket\n"); ++ return -1; ++ } ++ ++ memset(&addr, 0, sizeof(struct sockaddr_un)); ++ addr.sun_family = AF_UNIX; ++ strncpy(addr.sun_path, BMC_SOCKET_PATH, sizeof(addr.sun_path) - 1); ++ if (connect(sockfd, (struct sockaddr *)&addr, sizeof(struct sockaddr_un)) < 0) { ++ log(LOG_ERROR, "Failed to connect BMC notice socket\n"); ++ close(sockfd); ++ return -1; ++ } ++ ++ /* assemble bmc specific msg */ ++ repair_type_code = 0; ++ isolation_type_code = 0; ++ repair_type = err->reg_array[HBM_REPAIR_REQ_TYPE]; ++ if (repair_type & HBM_CE_ACLS) { ++ repair_type_code = 0; ++ isolation_type_code = SINGLE_ADDR_FAULT; ++ } else if (repair_type & HBM_PSUE_ACLS) { ++ repair_type_code = 1; ++ isolation_type_code = SINGLE_ADDR_FAULT; ++ } else if (repair_type & HBM_CE_SPPR) { ++ repair_type_code = 2; ++ isolation_type_code = ROW_FAULT; ++ } else if (repair_type & HBM_PSUE_SPPR) { ++ repair_type_code = 3; ++ isolation_type_code = ROW_FAULT; ++ } ++ ++ const uint32_t reg_size = err->reg_array_size / sizeof(uint32_t); ++ ++ fault_addr = err->reg_array[reg_size - 1]; ++ fault_addr <<= TYPE_UINT32_WIDTH; ++ fault_addr += err->reg_array[reg_size - 2]; ++ ++ log(LOG_DEBUG, "Get the fault addr is %llu\n", fault_addr); ++ ++ struct fault_addr_info info_struct; ++ parse_fault_addr_info(&info_struct, fault_addr); ++ ++ log(LOG_DEBUG, "info_struct.processer_id is %u\n", info_struct.processer_id); ++ log(LOG_DEBUG, "info_struct.die_id is %u\n", info_struct.die_id); ++ log(LOG_DEBUG, "info_struct.stack_id is %u\n", info_struct.stack_id); ++ 
log(LOG_DEBUG, "info_struct.sid is %u\n", info_struct.sid); ++ log(LOG_DEBUG, "info_struct.channel_id is %u\n", info_struct.channel_id); ++ log(LOG_DEBUG, "info_struct.bankgroup_id is %u\n", info_struct.bankgroup_id); ++ log(LOG_DEBUG, "info_struct.bank_id is %u\n", info_struct.bank_id); ++ log(LOG_DEBUG, "info_struct.row_id is %u\n", info_struct.row_id); ++ log(LOG_DEBUG, "info_struct.column_id is %u\n", info_struct.column_id); ++ log(LOG_DEBUG, "info_struct.error_type is %u\n", info_struct.error_type); ++ log(LOG_DEBUG, "info_struct.repair_type is %u\n", info_struct.repair_type); ++ log(LOG_DEBUG, "info_struct.reserved is %u\n", info_struct.reserved); ++ log(LOG_DEBUG, "info_struct.crc8 is %u\n", info_struct.crc8); ++ ++ snprintf(bmc_msg, sizeof(BMC_REPORT_FORMAT), BMC_REPORT_FORMAT, ++ repair_type_code, ++ repair_result_code, ++ isolation_type_code, ++ info_struct.processer_id, ++ info_struct.die_id, ++ info_struct.stack_id, ++ info_struct.sid, ++ info_struct.channel_id, ++ info_struct.bankgroup_id, ++ info_struct.bank_id, ++ info_struct.row_id, ++ info_struct.column_id ++ ); ++ ++ log(LOG_DEBUG, "Send msg to sysSentry, bmc msg is %s\n", bmc_msg); ++ ++ if (write(sockfd, bmc_msg, strlen(bmc_msg)) <= 0) { ++ log(LOG_ERROR, "Failed to send data to BMC notice socket\n"); ++ close(sockfd); ++ return -1; ++ } ++ ++ close(sockfd); ++ return 0; ++} ++ ++static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err) ++{ ++ unsigned long long paddr; ++ int ret; ++ bool is_acls = err->reg_array[HBM_REPAIR_REQ_TYPE] & (HBM_CE_ACLS | HBM_PSUE_ACLS); ++ int required_isolate_size = (is_acls ? HBM_ACLS_ADDR_NUM : HBM_SPPR_ADDR_NUM) * DEFAULT_PAGE_SIZE_KB; ++ int hardware_corrupted_size = get_hardware_corrupted_size(); ++ if (hardware_corrupted_size < 0) { ++ log(LOG_ERROR, "Page isolate failed: Get hardware_corrupted_size failed"); ++ notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); ++ return -1; ++ } ++ if ((required_isolate_size + hardware_corrupted_size) > page_isolation_threshold) { ++ log(LOG_INFO, "Page isolate failed: the isolation resource is not enough\n"); ++ notice_BMC(err, ISOLATE_FAILED_OVER_THRESHOLD); ++ return -1; ++ } ++ if (is_acls) { ++ /* ACLS */ ++ paddr = err->reg_array[HBM_ADDH]; ++ paddr <<= TYPE_UINT32_WIDTH; ++ paddr += err->reg_array[HBM_ADDL]; ++ ++ ret = write_file("/sys/kernel/page_eject", "offline_page", paddr); ++ if (ret < 0) { ++ notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); ++ log(LOG_WARNING, "HBM: ACLS offline failed, address is 0x%llx \n", paddr); ++ return ret; ++ } ++ } else { ++ /* SPPR */ ++ bool all_success = true; ++ uint32_t i; ++ for (i = 0; i < HBM_SPPR_ADDR_NUM; i++) { ++ paddr = err->reg_array[2 * i + HBM_ADDH]; ++ paddr <<= TYPE_UINT32_WIDTH; ++ paddr += err->reg_array[2 * i + HBM_ADDL]; ++ ret = write_file("/sys/kernel/page_eject", "offline_page", paddr); ++ if (ret < 0) { ++ all_success = false; ++ log(LOG_WARNING, "HBM: SPPR offline failed, address is 0x%llx \n", paddr); ++ continue; ++ } ++ } ++ if (!all_success) { ++ notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); ++ ret = -1; ++ } ++ } ++ return ret < 0 ? ret : 0; ++} ++ ++static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) ++{ ++ int ret; ++ if (repair_ret < 0) { ++ log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? 
"ACLS" : "SPPR", paddr); ++ /* not much we can do about errors here */ ++ (void)write_file("/sys/kernel/page_eject", "remove_page", paddr); ++ return get_repair_result_code(repair_ret); ++ } ++ ++ ret = write_file("/sys/kernel/page_eject", "online_page", paddr); ++ if (ret < 0) { ++ log(LOG_WARNING, "HBM %s: Page (0x%llx) online failed\n",is_acls ? "ACLS" : "SPPR", paddr); ++ return ONLINE_PAGE_FAILED; ++ } else { ++ log(LOG_INFO, "HBM %s: Page (0x%llx) repair and online success\n",is_acls ? "ACLS" : "SPPR", paddr); ++ return ISOLATE_REPAIR_ONLINE_SUCCESS; ++ } ++} ++ ++static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path) ++{ ++ unsigned long long paddr; ++ int ret; ++ uint8_t repair_result_code; ++ bool is_acls; ++ ++ /* Both ACLS and SPPR only repair the first address */ ++ paddr = err->reg_array[HBM_ADDH]; ++ paddr <<= TYPE_UINT32_WIDTH; ++ paddr += err->reg_array[HBM_ADDL]; ++ ++ is_acls = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_CE_ACLS || ++ err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_PSUE_ACLS; ++ ++ ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr); ++ if (ret < 0) { ++ notice_BMC(err, get_repair_result_code(ret)); ++ log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); ++ return ret; ++ } ++ ++ ret = write_file(path, is_acls ? "acls_repair" : "sppr_repair", paddr); ++ ++ if (is_acls) { ++ /* ACLS */ ++ repair_result_code = hbmc_hbm_after_repair(is_acls, ret, paddr); ++ notice_BMC(err, repair_result_code); ++ return ret; ++ } else { ++ /* SPPR */ ++ bool all_online_success = true; ++ uint32_t i; ++ for (i = 0; i < HBM_SPPR_ADDR_NUM; i++) { ++ paddr = err->reg_array[2 * i + HBM_ADDH]; ++ paddr <<= TYPE_UINT32_WIDTH; ++ paddr += err->reg_array[2 * i + HBM_ADDL]; ++ ++ repair_result_code = hbmc_hbm_after_repair(is_acls, ret, paddr); ++ if (repair_result_code != ISOLATE_REPAIR_ONLINE_SUCCESS) { ++ all_online_success = false; ++ } ++ } ++ if (ret < 0) { ++ notice_BMC(err, get_repair_result_code(ret)); ++ return ret; ++ } else if (all_online_success) { ++ notice_BMC(err, ISOLATE_REPAIR_ONLINE_SUCCESS); ++ return 0; ++ } else { ++ notice_BMC(err, ONLINE_PAGE_FAILED); ++ return ret; ++ } ++ } ++ /* The final return code is not necessary */ ++ return ret < 0 ? 
ret : 0; ++} ++ ++static int hbmc_get_memory_type(char *path) ++{ ++ int type = HBM_UNKNOWN; ++ char fname[MAX_PATH]; ++ char buf[128]; ++ FILE *file; ++ ++ snprintf(fname, MAX_PATH, "%s/%s", path, "memory_type"); ++ file = fopen(fname, "r"); ++ if (!file) { ++ log(LOG_WARNING, "HBM: Cannot to open '%s': %s\n", ++ fname, strerror(errno)); ++ return -errno; ++ } ++ ++ if (!fgets(buf, sizeof(buf), file)) { ++ log(LOG_WARNING, "HBM: Failed to read %s\n", fname); ++ goto err; ++ } ++ ++ /* Remove the last '\n' */ ++ buf[strlen(buf) - 1] = 0; ++ ++ if (strcmp(buf, "HBM") == 0) ++ type = HBM_HBM_MEMORY; ++ else if (strcmp(buf, "DDR") == 0) ++ type = HBM_DDR_MEMORY; ++ ++err: ++ fclose(file); ++ return type; ++} ++ ++static void hbm_repair_handler(const struct hisi_common_error_section *err) ++{ ++ log(LOG_DEBUG, "Received ACLS/SPPR flat mode repair request, try to repair\n"); ++ char *sys_dev_path = "/sys/devices/platform"; ++ char path[MAX_PATH]; ++ struct dirent *dent; ++ DIR *dir; ++ int ret; ++ bool find_device = false, find_hbm_mem = false; ++ ++ ret = hbmc_hbm_page_isolate(err); ++ if (ret < 0) { ++ return; ++ } ++ ++ dir = opendir(sys_dev_path); ++ if (!dir) { ++ log(LOG_WARNING, "Can't read '%s': %s\n", ++ sys_dev_path, strerror(errno)); ++ notice_BMC(err, REPAIR_FAILED_OTHER_REASON); ++ return; ++ } ++ ++ while ((dent = readdir(dir))) { ++ if (!strstr(dent->d_name, HBM_MEM_RAS_NAME)) ++ continue; ++ find_device = true; ++ ++ snprintf(path, MAX_PATH, "%s/%s", sys_dev_path, dent->d_name); ++ ++ if (hbmc_get_memory_type(path) == HBM_HBM_MEMORY) { ++ find_hbm_mem = true; ++ ret = hbmc_hbm_repair(err, path); ++ if (ret != -ENXIO) ++ break; ++ } ++ } ++ if (!find_device) { ++ log(LOG_ERROR, "Repair driver is not loaded, skip error, error_type is %u\n", ++ err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); ++ notice_BMC(err, REPAIR_FAILED_OTHER_REASON); ++ } else if (!find_hbm_mem) { ++ log(LOG_ERROR, "No HBM device memory type found, skip error, error_type is %u\n", ++ err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); ++ notice_BMC(err, REPAIR_FAILED_OTHER_REASON); ++ } ++ ++ closedir(dir); ++} ++ ++static bool hbm_repair_validate(const struct hisi_common_error_section *err) ++{ ++ if (!((err->val_bits & BIT(COMMON_VALID_MODULE_ID)) && ++ (err->val_bits & BIT(COMMON_VALID_SUBMODULE_ID)) && ++ (err->val_bits & BIT(COMMON_VALID_REG_ARRAY_SIZE)) ++ )) { ++ log(LOG_DEBUG, "Err val_bits validate failed, val_bits is %u\n", err->val_bits); ++ return false; ++ } ++ log(LOG_DEBUG, "err->module_id: %u\n", err->module_id); ++ log(LOG_DEBUG, "err->submodule_id: %u\n", err->submodule_id); ++ log(LOG_DEBUG, "err->val_bits: 0x%x\n", err->val_bits); ++ log(LOG_DEBUG, "err->reg_array_size: %u\n", err->reg_array_size); ++ ++ if (err->module_id != HBMC_MODULE_ID || ++ err->submodule_id != HBMC_SUBMOD_HBM_REPAIR) { ++ log(LOG_DEBUG, "err module_id or sub_module id doesn't not match\n"); ++ return false; ++ } ++ ++ uint32_t hbm_repair_reg_type = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK; ++ bool is_acls_valid = (hbm_repair_reg_type & (HBM_CE_ACLS | HBM_PSUE_ACLS)) && ++ (err->reg_array_size == HBM_ACLS_ARRAY_SIZE); ++ bool is_sppr_valid = (hbm_repair_reg_type & (HBM_CE_SPPR | HBM_PSUE_SPPR)) && ++ (err->reg_array_size == HBM_SPPR_ARRAY_SIZE); ++ bool is_cache_mode = (hbm_repair_reg_type & HBM_CACHE_MODE) && ++ (err->reg_array_size == HBM_CACHE_ARRAY_SIZE); ++ ++ if (!(is_acls_valid || is_sppr_valid || is_cache_mode)) { ++ log(LOG_DEBUG, "err type (%u) is unknown or address array length (%u) is 
invalid\n", ++ hbm_repair_reg_type, err->reg_array_size); ++ return false; ++ } ++ ++ log(LOG_INFO, "Received ACLS/SPPR repair request\n"); ++ return true; ++} ++ ++static bool hbm_flat_mode_validate(const struct hisi_common_error_section *err) ++{ ++ uint32_t hbm_repair_reg_type = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK; ++ return !(hbm_repair_reg_type & HBM_CACHE_MODE); ++} ++ ++int decode_hisi_common_section(struct ras_non_standard_event *event) ++{ ++ const struct hisi_common_error_section *err = (struct hisi_common_error_section *)event->error; ++ ++ if (hbm_repair_validate(err)) { ++ write_fault_info_to_flash(err); ++ if (hbm_flat_mode_validate(err)) { ++ hbm_repair_handler(err); ++ } ++ } ++ ++ return 0; ++} +diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.h b/src/c/hbm_online_repair/non-standard-hbm-repair.h +new file mode 100644 +index 0000000..7e8e448 +--- /dev/null ++++ b/src/c/hbm_online_repair/non-standard-hbm-repair.h +@@ -0,0 +1,89 @@ ++#ifndef __NON_STANDARD_HBM_REPAIR ++#define __NON_STANDARD_HBM_REPAIR ++ ++#include "ras-non-standard-handler.h" ++ ++#define DEFAULT_PAGE_SIZE_KB 4 ++#define HBM_MEM_RAS_NAME "HISI0521" ++#define HBM_UNKNOWN 0 ++#define HBM_HBM_MEMORY 1 ++#define HBM_DDR_MEMORY 2 ++ ++#define TYPE_UINT32_WIDTH 32 ++#define HBM_REPAIR_REQ_TYPE 0 ++#define HBM_CE_ACLS BIT(0) ++#define HBM_PSUE_ACLS BIT(1) ++#define HBM_CE_SPPR BIT(2) ++#define HBM_PSUE_SPPR BIT(3) ++#define HBM_CACHE_MODE (BIT(4) | BIT(5) | BIT(6) | BIT(7)) ++#define HBM_ERROR_MASK 0b11111111 ++#define HBM_ADDL 1 ++#define HBM_ADDH 2 ++#define HBM_ERROR_TYPE_SIZE 4 ++#define HBM_ADDR_SIZE 8 ++#define HBM_ACLS_ADDR_NUM 1 ++#define HBM_SPPR_ADDR_NUM 16 ++#define HBM_ACLS_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE * HBM_ACLS_ADDR_NUM + HBM_ADDR_SIZE) ++#define HBM_SPPR_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE * HBM_SPPR_ADDR_NUM + HBM_ADDR_SIZE) ++#define HBM_CACHE_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE) ++#define HBMC_MODULE_ID 0x28 ++#define HBMC_SUBMOD_HBM_REPAIR 6 ++#define COMMON_VALID_MODULE_ID 5 ++#define COMMON_VALID_SUBMODULE_ID 6 ++#define COMMON_VALID_REG_ARRAY_SIZE 12 ++ ++#define BMC_SOCKET_PATH "/var/run/sysSentry/bmc.sock" ++#define BMC_REPORT_FORMAT "REP00%02x%02x%02x0000000000000000%02x%02x%02x00%02x00%02x%02x%02x%08x%08x0000000000" ++ ++#define ISOLATE_FAILED_OVER_THRESHOLD 0b10000001 ++#define ISOLATE_FAILED_OTHER_REASON 0b10000010 ++#define REPAIR_FAILED_NO_RESOURCE 0b10010100 ++#define REPAIR_FAILED_INVALID_PARAM 0b10011000 ++#define REPAIR_FAILED_OTHER_REASON 0b10011100 ++#define ONLINE_PAGE_FAILED 0b10100000 ++#define ISOLATE_REPAIR_ONLINE_SUCCESS 0b00000000 ++ ++#define ROW_FAULT 1 ++#define SINGLE_ADDR_FAULT 6 ++ ++#define FAULT_ADDR_PROCESSOR_ID_LEN 2 ++#define FAULT_ADDR_DIE_ID_LEN 1 ++#define FAULT_ADDR_STACK_ID_LEN 3 ++#define FAULT_ADDR_SID_LEN 3 ++#define FAULT_ADDR_CHANNEL_ID_LEN 8 ++#define FAULT_ADDR_BANKGROUP_ID_LEN 3 ++#define FAULT_ADDR_BANK_ID_LEN 3 ++#define FAULT_ADDR_ROW_ID_LEN 17 ++#define FAULT_ADDR_COLUMN_ID_LEN 10 ++#define FAULT_ADDR_ERROR_TYPE_LEN 2 ++#define FAULT_ADDR_REPAIR_TYPE_LEN 2 ++#define FAULT_ADDR_RESERVED_LEN 2 ++#define FAULT_ADDR_CRC8_LEN 8 ++ ++#define FAULT_ADDR_PROCESSOR_ID_MASK ((1 << FAULT_ADDR_PROCESSOR_ID_LEN ) - 1) ++#define FAULT_ADDR_DIE_ID_MASK ((1 << FAULT_ADDR_DIE_ID_LEN ) - 1) ++#define FAULT_ADDR_STACK_ID_MASK ((1 << FAULT_ADDR_STACK_ID_LEN ) - 1) ++#define FAULT_ADDR_SID_MASK ((1 << FAULT_ADDR_SID_LEN ) - 1) ++#define FAULT_ADDR_CHANNEL_ID_MASK ((1 << 
FAULT_ADDR_CHANNEL_ID_LEN ) - 1) ++#define FAULT_ADDR_BANKGROUP_ID_MASK ((1 << FAULT_ADDR_BANKGROUP_ID_LEN ) - 1) ++#define FAULT_ADDR_BANK_ID_MASK ((1 << FAULT_ADDR_BANK_ID_LEN ) - 1) ++#define FAULT_ADDR_ROW_ID_MASK ((1 << FAULT_ADDR_ROW_ID_LEN ) - 1) ++#define FAULT_ADDR_COLUMN_ID_MASK ((1 << FAULT_ADDR_COLUMN_ID_LEN ) - 1) ++#define FAULT_ADDR_ERROR_TYPE_MASK ((1 << FAULT_ADDR_ERROR_TYPE_LEN ) - 1) ++#define FAULT_ADDR_REPAIR_TYPE_MASK ((1 << FAULT_ADDR_REPAIR_TYPE_LEN ) - 1) ++#define FAULT_ADDR_RESERVED_MASK ((1 << FAULT_ADDR_RESERVED_LEN ) - 1) ++#define FAULT_ADDR_CRC8_MASK ((1 << FAULT_ADDR_CRC8_LEN ) - 1) ++ ++#define EFI_VARIABLE_NON_VOLATILE 0x1 ++#define EFI_VARIABLE_BOOTSERVICE_ACCESS 0x2 ++#define EFI_VARIABLE_RUNTIME_ACCESS 0x4 ++#define EFI_VARIABLE_APPEND_WRITE 0x40 ++ ++#define EFIVARFS_PATH "/sys/firmware/efi/efivars" ++#define MAX_VAR_SIZE (128 * 1024) ++#define FLASH_ENTRY_NUM 8 ++#define KB_SIZE 1024 ++ ++extern int init_all_flash(); ++ ++#endif +diff --git a/src/c/hbm_online_repair/ras-events.c b/src/c/hbm_online_repair/ras-events.c +new file mode 100644 +index 0000000..0b12329 +--- /dev/null ++++ b/src/c/hbm_online_repair/ras-events.c +@@ -0,0 +1,534 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include "ras-non-standard-handler.h" ++#include "logger.h" ++ ++/* ++ * Polling time, if read() doesn't block. Currently, trace_pipe_raw never ++ * blocks on read(). So, we need to sleep for a while, to avoid spending ++ * too much CPU cycles. A fix for it is expected for 3.10. ++ */ ++#define POLLING_TIME 3 ++ ++/* Test for a little-endian machine */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ #define ENDIAN KBUFFER_ENDIAN_LITTLE ++#else ++ #define ENDIAN KBUFFER_ENDIAN_BIG ++#endif ++ ++static int get_debugfs_dir(char *debugfs_dir, size_t len) ++{ ++ FILE *fp; ++ char line[MAX_PATH + 1 + 256]; ++ ++ fp = fopen("/proc/mounts","r"); ++ if (!fp) { ++ log(LOG_INFO, "Can't open /proc/mounts"); ++ return errno; ++ } ++ ++ do { ++ char *p, *type, *dir; ++ if (!fgets(line, sizeof(line), fp)) ++ break; ++ ++ p = strtok(line, " \t"); ++ if (!p) ++ break; ++ ++ dir = strtok(NULL, " \t"); ++ if (!dir) ++ break; ++ ++ type = strtok(NULL, " \t"); ++ if (!type) ++ break; ++ ++ if (!strcmp(type, "debugfs")) { ++ fclose(fp); ++ strncpy(debugfs_dir, dir, len - 1); ++ debugfs_dir[len - 1] = '\0'; ++ return 0; ++ } ++ } while(1); ++ ++ fclose(fp); ++ log(LOG_INFO, "Can't find debugfs\n"); ++ return ENOENT; ++} ++ ++ ++static int open_trace(char *trace_dir, char *name, int flags) ++{ ++ int ret; ++ char fname[MAX_PATH + 1]; ++ ++ strcpy(fname, trace_dir); ++ strcat(fname, "/"); ++ strcat(fname, name); ++ ++ ret = open(fname, flags); ++ if (ret < 0) ++ log(LOG_WARNING, "open_trace() failed, fname=%s ret=%d errno=%d\n", fname, ret, errno); ++ ++ return ret; ++} ++ ++static int create_trace_instance(char *trace_instance_dir) ++{ ++ char fname[MAX_PATH + 1]; ++ int rc; ++ ++ get_debugfs_dir(fname, sizeof(fname)); ++ strcat(fname, "/tracing/instances/"TOOL_NAME); ++ rc = mkdir(fname, S_IRWXU); ++ if (rc < 0 && errno != EEXIST) { ++ log(LOG_INFO, "Unable to create " TOOL_NAME " instance at %s\n", fname); ++ return -1; ++ } ++ strcpy(trace_instance_dir, fname); ++ return 0; ++} ++ ++struct ras_events *init_trace_instance(void) ++{ ++ struct ras_events *ras = calloc(1, sizeof(*ras)); ++ if (!ras) { ++ log(LOG_ERROR, "Can't allocate memory for ras 
struct\n"); ++ return NULL; ++ } ++ int rc = create_trace_instance(ras->tracing); ++ if (rc < 0) { ++ free(ras); ++ return NULL; ++ } ++ return ras; ++} ++ ++/* ++ * Tracing enable/disable code ++ */ ++int toggle_ras_event(char *trace_dir, char *group, char *event, int enable) ++{ ++ int fd, rc; ++ char fname[MAX_PATH + 1]; ++ ++ snprintf(fname, sizeof(fname), "%s%s:%s\n", ++ enable ? "" : "!", ++ group, event); ++ ++ /* Enable RAS events */ ++ fd = open_trace(trace_dir, "set_event", O_RDWR | O_APPEND); ++ if (fd < 0) { ++ log(LOG_WARNING, "Can't open set_event\n"); ++ rc = -errno; ++ goto err; ++ } ++ ++ rc = write(fd, fname, strlen(fname)); ++ close(fd); ++ if (rc <= 0) { ++ log(LOG_WARNING, "Can't write to set_event\n"); ++ rc = -EIO; ++ goto err; ++ } ++ ++ log(LOG_INFO, "%s:%s event %s\n", ++ group, event, ++ enable ? "enabled" : "disabled"); ++ return 0; ++err: ++ log(LOG_ERROR, "Can't %s %s:%s tracing\n", ++ enable ? "enable" : "disable", group, event); ++ return rc; ++} ++ ++static int parse_header_page(struct ras_events *ras, struct tep_handle *pevent) ++{ ++ int fd, len, page_size = DEFAULT_PAGE_SIZE; ++ char buf[page_size]; ++ ++ fd = open_trace(ras->tracing, "events/header_page", O_RDONLY); ++ if (fd < 0) { ++ log(LOG_WARNING, "Open event header page failed\n"); ++ return -1; ++ } ++ ++ len = read(fd, buf, page_size); ++ close(fd); ++ if (len <= 0) { ++ log(LOG_WARNING, "Read event header page failed\n"); ++ return -1; ++ } ++ ++ if (tep_parse_header_page(pevent, buf, len, sizeof(long))) { ++ log(LOG_WARNING, "Parse event header page failed\n"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static void parse_ras_data(struct pcpu_data *pdata, struct kbuffer *kbuf, ++ void *data, unsigned long long time_stamp) ++{ ++ struct tep_record record; ++ struct trace_seq s; ++ ++ record.ts = time_stamp; ++ record.size = kbuffer_event_size(kbuf); ++ record.data = data; ++ record.offset = kbuffer_curr_offset(kbuf); ++ record.cpu = pdata->cpu; ++ ++ /* note offset is just offset in subbuffer */ ++ record.missed_events = kbuffer_missed_events(kbuf); ++ record.record_size = kbuffer_curr_size(kbuf); ++ ++ trace_seq_init(&s); ++ tep_print_event(pdata->ras->pevent, &s, &record, "%s-%s-%d-%s", ++ TEP_PRINT_NAME, TEP_PRINT_COMM, TEP_PRINT_TIME, TEP_PRINT_INFO); ++ trace_seq_do_printf(&s); ++ fflush(stdout); ++ trace_seq_destroy(&s); ++} ++ ++static int get_num_cpus() ++{ ++ return sysconf(_SC_NPROCESSORS_ONLN); ++} ++ ++static int set_buffer_percent(struct ras_events *ras, int percent) ++{ ++ int res = 0; ++ int fd; ++ ++ fd = open_trace(ras->tracing, "buffer_percent", O_WRONLY); ++ if (fd >= 0) { ++ char buf[16]; ++ ssize_t size; ++ snprintf(buf, sizeof(buf), "%d", percent); ++ size = write(fd, buf, strlen(buf)); ++ if (size <= 0) { ++ log(LOG_WARNING, "can't write to buffer_percent\n"); ++ res = -1; ++ } ++ close(fd); ++ } else { ++ log(LOG_WARNING, "Can't open buffer_percent\n"); ++ res = -1; ++ } ++ ++ return res; ++} ++ ++static int read_ras_event_all_cpus(struct pcpu_data *pdata, ++ unsigned n_cpus) ++{ ++ ssize_t size; ++ unsigned long long time_stamp; ++ void *data; ++ int ready, i, count_nready; ++ struct kbuffer *kbuf; ++ void *page; ++ struct pollfd fds[n_cpus + 1]; ++ struct signalfd_siginfo fdsiginfo; ++ sigset_t mask; ++ int warnonce[n_cpus]; ++ char pipe_raw[PATH_MAX]; ++ ++ memset(&warnonce, 0, sizeof(warnonce)); ++ ++ page = malloc(pdata[0].ras->page_size); ++ if (!page) { ++ log(LOG_ERROR, "Can't allocate page\n"); ++ return -ENOMEM; ++ } ++ ++ kbuf = kbuffer_alloc(KBUFFER_LSIZE_8, 
ENDIAN); ++ if (!kbuf) { ++ log(LOG_ERROR, "Can't allocate kbuf\n"); ++ free(page); ++ return -ENOMEM; ++ } ++ ++ /* Fix for poll() on the per_cpu trace_pipe and trace_pipe_raw blocks ++ * indefinitely with the default buffer_percent in the kernel trace system, ++ * which is introduced by the following change in the kernel. ++ * https://lore.kernel.org/all/20221020231427.41be3f26@gandalf.local.home/T/#u. ++ * Set buffer_percent to 0 so that poll() will return immediately ++ * when the trace data is available in the ras per_cpu trace pipe_raw ++ */ ++ if (set_buffer_percent(pdata[0].ras, 0)) ++ log(LOG_WARNING, "Set buffer_percent failed\n"); ++ ++ for (i = 0; i < (n_cpus + 1); i++) ++ fds[i].fd = -1; ++ ++ for (i = 0; i < n_cpus; i++) { ++ fds[i].events = POLLIN; ++ ++ snprintf(pipe_raw, sizeof(pipe_raw), ++ "per_cpu/cpu%d/trace_pipe_raw", i); ++ ++ fds[i].fd = open_trace(pdata[0].ras->tracing, pipe_raw, O_RDONLY); ++ if (fds[i].fd < 0) { ++ log(LOG_ERROR, "Can't open trace_pipe_raw\n"); ++ goto error; ++ } ++ } ++ ++ sigemptyset(&mask); ++ sigaddset(&mask, SIGINT); ++ sigaddset(&mask, SIGTERM); ++ sigaddset(&mask, SIGHUP); ++ sigaddset(&mask, SIGQUIT); ++ if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1) ++ log(LOG_WARNING, "sigprocmask\n"); ++ fds[n_cpus].events = POLLIN; ++ fds[n_cpus].fd = signalfd(-1, &mask, 0); ++ if (fds[n_cpus].fd < 0) { ++ log(LOG_WARNING, "signalfd\n"); ++ goto error; ++ } ++ ++ log(LOG_INFO, "Listening to events for cpus 0 to %u\n", n_cpus - 1); ++ ++ do { ++ ready = poll(fds, (n_cpus + 1), -1); ++ if (ready < 0) { ++ log(LOG_WARNING, "poll\n"); ++ } ++ ++ /* check for the signal */ ++ if (fds[n_cpus].revents & POLLIN) { ++ size = read(fds[n_cpus].fd, &fdsiginfo, ++ sizeof(struct signalfd_siginfo)); ++ if (size != sizeof(struct signalfd_siginfo)) { ++ log(LOG_WARNING, "signalfd read\n"); ++ continue; ++ } ++ ++ if (fdsiginfo.ssi_signo == SIGINT || ++ fdsiginfo.ssi_signo == SIGTERM || ++ fdsiginfo.ssi_signo == SIGHUP || ++ fdsiginfo.ssi_signo == SIGQUIT) { ++ log(LOG_INFO, "Recevied signal=%d\n", ++ fdsiginfo.ssi_signo); ++ goto error; ++ } else { ++ log(LOG_INFO, ++ "Received unexpected signal=%d\n", ++ fdsiginfo.ssi_signo); ++ continue; ++ } ++ } ++ ++ count_nready = 0; ++ for (i = 0; i < n_cpus; i++) { ++ if (fds[i].revents & POLLERR) { ++ if (!warnonce[i]) { ++ log(LOG_INFO, ++ "Error on CPU %i\n", i); ++ warnonce[i]++; ++ } ++ continue; ++ } ++ if (!(fds[i].revents & POLLIN)) { ++ count_nready++; ++ continue; ++ } ++ size = read(fds[i].fd, page, pdata[i].ras->page_size); ++ if (size < 0) { ++ log(LOG_WARNING, "read\n"); ++ goto error; ++ } else if (size > 0) { ++ log(LOG_DEBUG, "cpu %d receive %ld bytes data\n", i, size); ++ kbuffer_load_subbuffer(kbuf, page); ++ ++ while ((data = kbuffer_read_event(kbuf, &time_stamp))) { ++ if (kbuffer_curr_size(kbuf) < 0) { ++ log(LOG_ERROR, "invalid kbuf data, discard\n"); ++ break; ++ } ++ ++ log(LOG_DEBUG, "parse_ras_data\n"); ++ parse_ras_data(&pdata[i], ++ kbuf, data, time_stamp); ++ ++ /* increment to read next event */ ++ log(LOG_DEBUG, "kbuffer_next_event\n"); ++ kbuffer_next_event(kbuf, NULL); ++ } ++ } else { ++ count_nready++; ++ } ++ } ++ ++ /* ++ * If count_nready == n_cpus, there is no cpu fd in POLLIN state, ++ * so we need to break the cycle ++ */ ++ if (count_nready == n_cpus) { ++ log(LOG_ERROR, "no cpu fd in POLLIN state, stop running\n"); ++ break; ++ } ++ } while (1); ++ ++error: ++ kbuffer_free(kbuf); ++ free(page); ++ sigprocmask(SIG_UNBLOCK, &mask, NULL); ++ ++ for (i = 0; i < (n_cpus + 1); i++) { ++ 
if (fds[i].fd > 0) ++ close(fds[i].fd); ++ } ++ ++ return -1; ++} ++ ++static int init_header_page(struct ras_events *ras, struct tep_handle *pevent) ++{ ++ int rc; ++ ++ rc = parse_header_page(ras, pevent); ++ if (rc) { ++ log(LOG_ERROR, "cannot read trace header_page: %d\n", rc); ++ return rc; ++ } ++ return 0; ++} ++ ++static int init_event_format(struct ras_events *ras, struct tep_handle *pevent, ++ char *group, char *event) ++{ ++ char *page, fname[MAX_PATH + 1]; ++ int fd, size, rc, page_size = DEFAULT_PAGE_SIZE; ++ ++ // read one page from format ++ snprintf(fname, sizeof(fname), "events/%s/%s/format", group, event); ++ fd = open_trace(ras->tracing, fname, O_RDONLY); ++ if (fd < 0) { ++ log(LOG_ERROR, ++ "Can't get %s:%s traces. Perhaps this feature is not supported on your system.\n", ++ group, event); ++ return errno; ++ } ++ ++ log(LOG_INFO, "page_size: %d\n", page_size); ++ ras->page_size = page_size; ++ page = malloc(page_size); ++ if (!page) { ++ log(LOG_ERROR, "Can't allocate page to read %s:%s format\n", ++ group, event); ++ rc = errno; ++ close(fd); ++ return rc; ++ } ++ ++ size = read(fd, page, page_size); ++ close(fd); ++ if (size < 0) { ++ log(LOG_ERROR, "Can't read format\n"); ++ free(page); ++ return size; ++ } ++ ++ // parse event format ++ rc = tep_parse_event(pevent, page, size, group); ++ if (rc) { ++ log(LOG_ERROR, "Can't parse event %s:%s\n", group, event); ++ free(page); ++ return EINVAL; ++ } ++ return 0; ++} ++ ++static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent, ++ char *group, char *event, ++ tep_event_handler_func func) ++{ ++ int rc; ++ ++ rc = init_event_format(ras, pevent, group, event); ++ if (rc) { ++ log(LOG_ERROR, "init_event_format for %s:%s failed\n", group, event); ++ return rc; ++ } ++ ++ /* Registers the special event handlers */ ++ rc = tep_register_event_handler(pevent, -1, group, event, func, ras); ++ if (rc < 0) { ++ log(LOG_ERROR, "Can't register event handler for %s:%s\n", ++ group, event); ++ return EINVAL; ++ } ++ ++ return 0; ++} ++ ++int handle_ras_events(struct ras_events *ras) ++{ ++ int rc, i; ++ unsigned cpus; ++ struct tep_handle *pevent = NULL; ++ struct pcpu_data *data = NULL; ++ ++ pevent = tep_alloc(); ++ if (!pevent) { ++ log(LOG_ERROR, "Can't allocate pevent\n"); ++ rc = errno; ++ goto err; ++ } ++ ras->pevent = pevent; ++ ++ rc = init_header_page(ras, pevent); ++ if (rc) { ++ log(LOG_ERROR, "init_header_page failed\n"); ++ goto err; ++ } ++ ++ rc = add_event_handler(ras, pevent, "ras", "non_standard_event", ++ ras_non_standard_event_handler); ++ if (rc) { ++ log(LOG_ERROR, "Can't get traces from %s:%s\n", ++ "ras", "non_standard_event"); ++ goto err; ++ } ++ log(LOG_INFO, "add_event_handler done\n"); ++ ++ cpus = get_num_cpus(); ++ data = calloc(sizeof(*data), cpus); ++ if (!data) ++ goto err; ++ ++ for (i = 0; i < cpus; i++) { ++ data[i].ras = ras; ++ data[i].cpu = i; ++ } ++ rc = read_ras_event_all_cpus(data, cpus); ++ ++err: ++ if (data) ++ free(data); ++ if (pevent) ++ tep_free(pevent); ++ return rc; ++} +diff --git a/src/c/hbm_online_repair/ras-events.h b/src/c/hbm_online_repair/ras-events.h +new file mode 100644 +index 0000000..4218d93 +--- /dev/null ++++ b/src/c/hbm_online_repair/ras-events.h +@@ -0,0 +1,28 @@ ++#ifndef __RAS_EVENTS_H ++#define __RAS_EVENTS_H ++ ++#include ++#include ++ ++#define MAX_PATH 1024 ++ ++#define DEFAULT_PAGE_SIZE 4096 ++ ++struct ras_events { ++ char tracing[MAX_PATH + 1]; ++ struct tep_handle *pevent; ++ int page_size; ++}; ++ ++struct pcpu_data { ++ struct 
tep_handle *pevent; ++ struct ras_events *ras; ++ int cpu; ++}; ++ ++/* Function prototypes */ ++int toggle_ras_event(char *trace_dir, char *group, char *event, int enable); ++int handle_ras_events(struct ras_events *ras); ++struct ras_events *init_trace_instance(void); ++ ++#endif +diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.c b/src/c/hbm_online_repair/ras-non-standard-handler.c +new file mode 100644 +index 0000000..1d1fd04 +--- /dev/null ++++ b/src/c/hbm_online_repair/ras-non-standard-handler.c +@@ -0,0 +1,81 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "ras-non-standard-handler.h" ++#include "logger.h" ++ ++static char *uuid_le(const char *uu) ++{ ++ static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; ++ if (!uu) { ++ log(LOG_ERROR, "uuid_le failed: uu is empty"); ++ return uuid; ++ } ++ size_t uu_len = strlen(uu); ++ if (uu_len < SECTION_TYPE_UUID_LEN) { ++ log(LOG_ERROR, "uuid_le failed: uu is too short"); ++ return uuid; ++ } ++ ++ char *p = uuid; ++ int i; ++ static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; ++ ++ for (i = 0; i < 16; i++) { ++ p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); ++ switch (i) { ++ case 3: ++ case 5: ++ case 7: ++ case 9: ++ *p++ = '-'; ++ break; ++ } ++ } ++ ++ *p = 0; ++ ++ return uuid; ++} ++ ++int ras_non_standard_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len; ++ unsigned long long val; ++ struct ras_non_standard_event ev; ++ ++ ev.sec_type = tep_get_field_raw(s, event, "sec_type", ++ record, &len, 1); ++ if(!ev.sec_type) { ++ log(LOG_WARNING, "get event section type failed"); ++ return -1; ++ } ++ ++ trace_seq_printf(s, "\n"); ++ trace_seq_printf(s, "sec_type: %s\n", uuid_le(ev.sec_type)); ++ ++ if (tep_get_field_val(s, event, "len", record, &val, 1) < 0) { ++ log(LOG_WARNING, "tep get field val failed"); ++ return -1; ++ } ++ ++ ev.length = val; ++ trace_seq_printf(s, "length: %d\n", ev.length); ++ ++ ev.error = tep_get_field_raw(s, event, "buf", record, &len, 1); ++ if(!ev.error || ev.length != len) { ++ log(LOG_WARNING, "get event error failed"); ++ return -1; ++ } ++ ++ if (strcmp(uuid_le(ev.sec_type), HISI_COMMON_SECTION_TYPE_UUID) == 0) { ++ decode_hisi_common_section(&ev); ++ } ++ ++ return 0; ++} +diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.h b/src/c/hbm_online_repair/ras-non-standard-handler.h +new file mode 100644 +index 0000000..0272dc1 +--- /dev/null ++++ b/src/c/hbm_online_repair/ras-non-standard-handler.h +@@ -0,0 +1,25 @@ ++#ifndef __RAS_NON_STANDARD_HANDLER_H ++#define __RAS_NON_STANDARD_HANDLER_H ++ ++#include ++#include "ras-events.h" ++ ++#define BIT(nr) (1UL << (nr)) ++ ++#define SECTION_TYPE_UUID_LEN 16 ++#define HISI_COMMON_SECTION_TYPE_UUID "c8b328a8-9917-4af6-9a13-2e08ab2e7586" ++ ++struct ras_non_standard_event { ++ char timestamp[64]; ++ const char *sec_type; ++ const uint8_t *error; ++ uint32_t length; ++}; ++ ++int ras_non_standard_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); ++ ++int decode_hisi_common_section(struct ras_non_standard_event *event); ++ ++#endif +diff --git a/src/python/.gitignore b/src/python/.gitignore +new file mode 100644 +index 0000000..58200d4 +--- /dev/null ++++ b/src/python/.gitignore +@@ -0,0 +1 @@ ++__pycache__/ +diff --git a/src/python/sentryCollector/__init__.py b/src/python/sentryCollector/__init__.py +new file mode 100644 +index 
0000000..e69de29 +diff --git a/src/python/sentryCollector/__main__.py b/src/python/sentryCollector/__main__.py +new file mode 100644 +index 0000000..9c2ae50 +--- /dev/null ++++ b/src/python/sentryCollector/__main__.py +@@ -0,0 +1,17 @@ ++# coding: utf-8 ++# Copyright (c) 2023 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. ++ ++""" ++main ++""" ++from collectd import collectd ++ ++collectd.main() +diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py +new file mode 100644 +index 0000000..b6cc75c +--- /dev/null ++++ b/src/python/sentryCollector/collect_config.py +@@ -0,0 +1,118 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. ++ ++""" ++Read and save collector.conf value. ++""" ++import configparser ++import logging ++import os ++import re ++ ++ ++COLLECT_CONF_PATH = "/etc/sysSentry/collector.conf" ++ ++CONF_COMMON = 'common' ++CONF_MODULES = 'modules' ++ ++# io ++CONF_IO = 'io' ++CONF_IO_PERIOD_TIME = 'period_time' ++CONF_IO_MAX_SAVE = 'max_save' ++CONF_IO_DISK = 'disk' ++CONF_IO_PERIOD_TIME_DEFAULT = 1 ++CONF_IO_MAX_SAVE_DEFAULT = 10 ++CONF_IO_DISK_DEFAULT = "default" ++ ++class CollectConfig: ++ def __init__(self, filename=COLLECT_CONF_PATH): ++ ++ self.filename = filename ++ self.modules = [] ++ self.module_count = 0 ++ self.load_config() ++ ++ def load_config(self): ++ if not os.path.exists(self.filename): ++ logging.error("%s is not exists", self.filename) ++ return ++ ++ try: ++ self.config = configparser.ConfigParser() ++ self.config.read(self.filename) ++ except configparser.Error: ++ logging.error("collectd configure file read failed") ++ return ++ ++ try: ++ common_config = self.config[CONF_COMMON] ++ modules_str = common_config[CONF_MODULES] ++ # remove space ++ modules_list = modules_str.replace(" ", "").split(',') ++ except KeyError as e: ++ logging.error("read config data failed, %s", e) ++ return ++ ++ pattern = r'^[a-zA-Z0-9-_]+$' ++ for module_name in modules_list: ++ if not re.match(pattern, module_name): ++ logging.warning("module_name: %s is invalid", module_name) ++ continue ++ if not self.config.has_section(module_name): ++ logging.warning("module_name: %s config is incorrect", module_name) ++ continue ++ self.modules.append(module_name) ++ ++ def load_module_config(self, module_name): ++ module_name = module_name.strip().lower() ++ if module_name in self.modules and self.config.has_section(module_name): ++ return {key.lower(): value for key, value in self.config[module_name].items()} ++ else: ++ raise ValueError(f"Module '{module_name}' not found in 
configuration") ++ ++ def get_io_config(self): ++ result_io_config = {} ++ io_map_value = self.load_module_config(CONF_IO) ++ # period_time ++ period_time = io_map_value.get(CONF_IO_PERIOD_TIME) ++ if period_time and period_time.isdigit() and int(period_time) >= 1 and int(period_time) <= 300: ++ result_io_config[CONF_IO_PERIOD_TIME] = int(period_time) ++ else: ++ logging.warning("module_name = %s section, field = %s is incorrect, use default %d", ++ CONF_IO, CONF_IO_PERIOD_TIME, CONF_IO_PERIOD_TIME_DEFAULT) ++ result_io_config[CONF_IO_PERIOD_TIME] = CONF_IO_PERIOD_TIME_DEFAULT ++ # max_save ++ max_save = io_map_value.get(CONF_IO_MAX_SAVE) ++ if max_save and max_save.isdigit() and int(max_save) >= 1 and int(max_save) <= 300: ++ result_io_config[CONF_IO_MAX_SAVE] = int(max_save) ++ else: ++ logging.warning("module_name = %s section, field = %s is incorrect, use default %d", ++ CONF_IO, CONF_IO_MAX_SAVE, CONF_IO_MAX_SAVE_DEFAULT) ++ result_io_config[CONF_IO_MAX_SAVE] = CONF_IO_MAX_SAVE_DEFAULT ++ # disk ++ disk = io_map_value.get(CONF_IO_DISK) ++ if disk: ++ disk_str = disk.replace(" ", "") ++ pattern = r'^[a-zA-Z0-9-_,]+$' ++ if not re.match(pattern, disk_str): ++ logging.warning("module_name = %s section, field = %s is incorrect, use default %s", ++ CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT) ++ disk_str = CONF_IO_DISK_DEFAULT ++ result_io_config[CONF_IO_DISK] = disk_str ++ else: ++ logging.warning("module_name = %s section, field = %s is incorrect, use default %s", ++ CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT) ++ result_io_config[CONF_IO_DISK] = CONF_IO_DISK_DEFAULT ++ logging.info("config get_io_config: %s", result_io_config) ++ return result_io_config ++ ++ def get_common_config(self): ++ return {key.lower(): value for key, value in self.config['common'].items()} +diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py +new file mode 100644 +index 0000000..104b734 +--- /dev/null ++++ b/src/python/sentryCollector/collect_io.py +@@ -0,0 +1,243 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. 
++ ++""" ++collect module ++""" ++import os ++import time ++import logging ++import threading ++ ++from .collect_config import CollectConfig ++ ++Io_Category = ["read", "write", "flush", "discard"] ++IO_GLOBAL_DATA = {} ++IO_CONFIG_DATA = [] ++ ++class IoStatus(): ++ TOTAL = 0 ++ FINISH = 1 ++ LATENCY = 2 ++ ++class CollectIo(): ++ ++ def __init__(self, module_config): ++ ++ io_config = module_config.get_io_config() ++ ++ self.period_time = io_config['period_time'] ++ self.max_save = io_config['max_save'] ++ disk_str = io_config['disk'] ++ ++ self.disk_map_stage = {} ++ self.window_value = {} ++ ++ self.loop_all = False ++ ++ if disk_str == "default": ++ self.loop_all = True ++ else: ++ self.disk_list = disk_str.strip().split(',') ++ ++ self.stop_event = threading.Event() ++ ++ IO_CONFIG_DATA.append(self.period_time) ++ IO_CONFIG_DATA.append(self.max_save) ++ ++ def get_blk_io_hierarchy(self, disk_name, stage_list): ++ stats_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/stats'.format(disk_name) ++ try: ++ with open(stats_file, 'r') as file: ++ lines = file.read() ++ except FileNotFoundError: ++ logging.error("The file %s does not exist", stats_file) ++ return -1 ++ except Exception as e: ++ logging.error("An error occurred3: %s", e) ++ return -1 ++ ++ curr_value = lines.strip().split('\n') ++ ++ for stage_val in curr_value: ++ stage = stage_val.split(' ')[0] ++ if (len(self.window_value[disk_name][stage])) >= 2: ++ self.window_value[disk_name][stage].pop(0) ++ ++ curr_stage_value = stage_val.split(' ')[1:-1] ++ self.window_value[disk_name][stage].append(curr_stage_value) ++ return 0 ++ ++ def append_period_lat(self, disk_name, stage_list): ++ for stage in stage_list: ++ if len(self.window_value[disk_name][stage]) < 2: ++ return ++ curr_stage_value = self.window_value[disk_name][stage][-1] ++ last_stage_value = self.window_value[disk_name][stage][-2] ++ ++ for index in range(len(Io_Category)): ++ # read=0, write=1, flush=2, discard=3 ++ if (len(IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]])) >= self.max_save: ++ IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]].pop() ++ ++ curr_lat = self.get_latency_value(curr_stage_value, last_stage_value, index) ++ curr_iops = self.get_iops(curr_stage_value, last_stage_value, index) ++ curr_io_length = self.get_io_length(curr_stage_value, last_stage_value, index) ++ curr_io_dump = self.get_io_dump(disk_name, stage, index) ++ ++ IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]].insert(0, [curr_lat, curr_io_dump, curr_io_length, curr_iops]) ++ ++ def get_iops(self, curr_stage_value, last_stage_value, category): ++ try: ++ finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) ++ except ValueError as e: ++ logging.error("get_iops convert to int failed, %s", e) ++ return 0 ++ value = finish / self.period_time ++ if value.is_integer(): ++ return int(value) ++ else: ++ return round(value, 1) ++ ++ def get_latency_value(self, curr_stage_value, last_stage_value, category): ++ try: ++ finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) ++ lat_time = (int(curr_stage_value[category * 3 + IoStatus.LATENCY]) - int(last_stage_value[category * 3 + IoStatus.LATENCY])) ++ except ValueError as e: ++ logging.error("get_latency_value convert to int failed, %s", e) ++ return 0 ++ if finish <= 0 or lat_time <= 0: ++ return 0 ++ value = lat_time / finish / 1000 / 1000 ++ if value.is_integer(): ++ return int(value) ++ else: ++ 
return round(value, 1) ++ ++ def get_io_length(self, curr_stage_value, last_stage_value, category): ++ try: ++ finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) ++ except ValueError as e: ++ logging.error("get_io_length convert to int failed, %s", e) ++ return 0 ++ value = finish / self.period_time / 1000 / 1000 ++ if value.is_integer(): ++ return int(value) ++ else: ++ return round(value, 1) ++ ++ def get_io_dump(self, disk_name, stage, category): ++ io_dump_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/io_dump'.format(disk_name, stage) ++ count = 0 ++ try: ++ with open(io_dump_file, 'r') as file: ++ for line in file: ++ count += line.count('.op=' + Io_Category[category]) ++ except FileNotFoundError: ++ logging.error("The file %s does not exist.", io_dump_file) ++ return count ++ except Exception as e: ++ logging.error("An error occurred1: %s", e) ++ return count ++ return count ++ ++ def extract_first_column(self, file_path): ++ column_names = [] ++ try: ++ with open(file_path, 'r') as file: ++ for line in file: ++ parts = line.strip().split() ++ if parts: ++ column_names.append(parts[0]) ++ except FileNotFoundError: ++ logging.error("The file %s does not exist.", file_path) ++ except Exception as e: ++ logging.error("An error occurred2: %s", e) ++ return column_names ++ ++ def task_loop(self): ++ if self.stop_event.is_set(): ++ logging.info("collect io thread exit") ++ return ++ ++ for disk_name, stage_list in self.disk_map_stage.items(): ++ if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: ++ continue ++ self.append_period_lat(disk_name, stage_list) ++ ++ threading.Timer(self.period_time, self.task_loop).start() ++ ++ def is_kernel_avaliable(self): ++ base_path = '/sys/kernel/debug/block' ++ for disk_name in os.listdir(base_path): ++ if not self.loop_all and disk_name not in self.disk_list: ++ continue ++ ++ disk_path = os.path.join(base_path, disk_name) ++ blk_io_hierarchy_path = os.path.join(disk_path, 'blk_io_hierarchy') ++ ++ if not os.path.exists(blk_io_hierarchy_path): ++ logging.error("no blk_io_hierarchy directory found in %s, skipping.", disk_name) ++ continue ++ ++ for file_name in os.listdir(blk_io_hierarchy_path): ++ file_path = os.path.join(blk_io_hierarchy_path, file_name) ++ ++ if file_name == 'stats': ++ stage_list = self.extract_first_column(file_path) ++ self.disk_map_stage[disk_name] = stage_list ++ self.window_value[disk_name] = {} ++ IO_GLOBAL_DATA[disk_name] = {} ++ ++ return len(IO_GLOBAL_DATA) != 0 ++ ++ def main_loop(self): ++ logging.info("collect io thread start") ++ ++ if not self.is_kernel_avaliable() or len(self.disk_map_stage) == 0: ++ logging.warning("no disks meet the requirements. 
collect io thread exits") ++ return ++ ++ for disk_name, stage_list in self.disk_map_stage.items(): ++ for stage in stage_list: ++ self.window_value[disk_name][stage] = [] ++ IO_GLOBAL_DATA[disk_name][stage] = {} ++ for category in Io_Category: ++ IO_GLOBAL_DATA[disk_name][stage][category] = [] ++ ++ while True: ++ start_time = time.time() ++ ++ if self.stop_event.is_set(): ++ logging.debug("collect io thread exit") ++ return ++ ++ for disk_name, stage_list in self.disk_map_stage.items(): ++ if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: ++ continue ++ self.append_period_lat(disk_name, stage_list) ++ ++ elapsed_time = time.time() - start_time ++ sleep_time = self.period_time - elapsed_time ++ if sleep_time < 0: ++ continue ++ while sleep_time > 1: ++ if self.stop_event.is_set(): ++ logging.debug("collect io thread exit") ++ return ++ time.sleep(1) ++ sleep_time -= 1 ++ time.sleep(sleep_time) ++ ++ # set stop event, notify thread exit ++ def stop_thread(self): ++ logging.debug("collect io thread is preparing to exit") ++ self.stop_event.set() +diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py +new file mode 100644 +index 0000000..49ce0a8 +--- /dev/null ++++ b/src/python/sentryCollector/collect_plugin.py +@@ -0,0 +1,276 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. ++ ++""" ++collcet plugin ++""" ++import json ++import socket ++import logging ++import re ++ ++COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" ++ ++# data length param ++CLT_MSG_HEAD_LEN = 9 #3+2+4 ++CLT_MSG_PRO_LEN = 2 ++CLT_MSG_MAGIC_LEN = 3 ++CLT_MSG_LEN_LEN = 4 ++ ++CLT_MAGIC = "CLT" ++RES_MAGIC = "RES" ++ ++# disk limit ++LIMIT_DISK_CHAR_LEN = 32 ++LIMIT_DISK_LIST_LEN = 10 ++ ++# stage limit ++LIMIT_STAGE_CHAR_LEN = 20 ++LIMIT_STAGE_LIST_LEN = 15 ++ ++#iotype limit ++LIMIT_IOTYPE_CHAR_LEN = 7 ++LIMIT_IOTYPE_LIST_LEN = 4 ++ ++#period limit ++LIMIT_PERIOD_MIN_LEN = 1 ++LIMIT_PERIOD_MAX_LEN = 300 ++ ++# interface protocol ++class ClientProtocol(): ++ IS_IOCOLLECT_VALID = 0 ++ GET_IO_DATA = 1 ++ PRO_END = 3 ++ ++class ResultMessage(): ++ RESULT_SUCCEED = 0 ++ RESULT_UNKNOWN = 1 # unknown error ++ RESULT_NOT_PARAM = 2 # the parameter does not exist or the type does not match. ++ RESULT_INVALID_LENGTH = 3 # invalid parameter length. ++ RESULT_EXCEED_LIMIT = 4 # the parameter length exceeds the limit. 
++ RESULT_PARSE_FAILED = 5 # parse failed ++ RESULT_INVALID_CHAR = 6 # invalid char ++ ++Result_Messages = { ++ ResultMessage.RESULT_SUCCEED: "Succeed", ++ ResultMessage.RESULT_UNKNOWN: "Unknown error", ++ ResultMessage.RESULT_NOT_PARAM: "The parameter does not exist or the type does not match", ++ ResultMessage.RESULT_INVALID_LENGTH: "Invalid parameter length", ++ ResultMessage.RESULT_EXCEED_LIMIT: "The parameter length exceeds the limit", ++ ResultMessage.RESULT_PARSE_FAILED: "Parse failed", ++ ResultMessage.RESULT_INVALID_CHAR: "Invalid char" ++} ++ ++ ++def client_send_and_recv(request_data, data_str_len, protocol): ++ """client socket send and recv message""" ++ try: ++ client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) ++ except socket.error: ++ print("collect_plugin: client creat socket error") ++ return None ++ ++ try: ++ client_socket.connect(COLLECT_SOCKET_PATH) ++ except OSError: ++ client_socket.close() ++ print("collect_plugin: client connect error") ++ return None ++ ++ req_data_len = len(request_data) ++ request_msg = CLT_MAGIC + str(protocol).zfill(CLT_MSG_PRO_LEN) + str(req_data_len).zfill(CLT_MSG_LEN_LEN) + request_data ++ ++ try: ++ client_socket.send(request_msg.encode()) ++ res_data = client_socket.recv(len(RES_MAGIC) + CLT_MSG_PRO_LEN + data_str_len) ++ res_data = res_data.decode() ++ except (OSError, UnicodeError): ++ client_socket.close() ++ print("collect_plugin: client communicate error") ++ return None ++ ++ res_magic = res_data[:CLT_MSG_MAGIC_LEN] ++ if res_magic != "RES": ++ print("res msg format error") ++ return None ++ ++ protocol_str = res_data[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN] ++ try: ++ protocol_id = int(protocol_str) ++ except ValueError: ++ print("recv msg protocol id is invalid %s", protocol_str) ++ return None ++ ++ if protocol_id >= ClientProtocol.PRO_END: ++ print("protocol id is invalid") ++ return None ++ ++ try: ++ res_data_len = int(res_data[CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN:]) ++ res_msg_data = client_socket.recv(res_data_len) ++ res_msg_data = res_msg_data.decode() ++ return res_msg_data ++ except (OSError, ValueError, UnicodeError): ++ print("collect_plugin: client recv res msg error") ++ finally: ++ client_socket.close() ++ ++ return None ++ ++def validate_parameters(param, len_limit, char_limit): ++ ret = ResultMessage.RESULT_SUCCEED ++ if not param: ++ print("parm is invalid") ++ ret = ResultMessage.RESULT_NOT_PARAM ++ return [False, ret] ++ ++ if not isinstance(param, list): ++ print(f"{param} is not list type.") ++ ret = ResultMessage.RESULT_NOT_PARAM ++ return [False, ret] ++ ++ if len(param) <= 0: ++ print(f"{param} length is 0.") ++ ret = ResultMessage.RESULT_INVALID_LENGTH ++ return [False, ret] ++ ++ if len(param) > len_limit: ++ print(f"{param} length more than {len_limit}") ++ ret = ResultMessage.RESULT_EXCEED_LIMIT ++ return [False, ret] ++ ++ pattern = r'^[a-zA-Z0-9_-]+$' ++ for info in param: ++ if len(info) > char_limit: ++ print(f"{info} length more than {char_limit}") ++ ret = ResultMessage.RESULT_EXCEED_LIMIT ++ return [False, ret] ++ if not re.match(pattern, info): ++ print(f"{info} is invalid char") ++ ret = ResultMessage.RESULT_INVALID_CHAR ++ return [False, ret] ++ ++ return [True, ret] ++ ++def is_iocollect_valid(period, disk_list=None, stage=None): ++ result = inter_is_iocollect_valid(period, disk_list, stage) ++ error_code = result['ret'] ++ if error_code != ResultMessage.RESULT_SUCCEED: ++ result['message'] = Result_Messages[error_code] ++ return result ++ ++def 
inter_is_iocollect_valid(period, disk_list=None, stage=None): ++ result = {} ++ result['ret'] = ResultMessage.RESULT_UNKNOWN ++ result['message'] = "" ++ ++ if not period or not isinstance(period, int): ++ result['ret'] = ResultMessage.RESULT_NOT_PARAM ++ return result ++ if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN: ++ result['ret'] = ResultMessage.RESULT_INVALID_LENGTH ++ return result ++ ++ if not disk_list: ++ disk_list = [] ++ else: ++ res = validate_parameters(disk_list, LIMIT_DISK_LIST_LEN, LIMIT_DISK_CHAR_LEN) ++ if not res[0]: ++ result['ret'] = res[1] ++ return result ++ ++ if not stage: ++ stage = [] ++ else: ++ res = validate_parameters(stage, LIMIT_STAGE_LIST_LEN, LIMIT_STAGE_CHAR_LEN) ++ if not res[0]: ++ result['ret'] = res[1] ++ return result ++ ++ req_msg_struct = { ++ 'disk_list': json.dumps(disk_list), ++ 'period': period, ++ 'stage': json.dumps(stage) ++ } ++ request_message = json.dumps(req_msg_struct) ++ result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.IS_IOCOLLECT_VALID) ++ if not result_message: ++ print("collect_plugin: client_send_and_recv failed") ++ return result ++ ++ try: ++ json.loads(result_message) ++ except json.JSONDecodeError: ++ print("is_iocollect_valid: json decode error") ++ result['ret'] = ResultMessage.RESULT_PARSE_FAILED ++ return result ++ ++ result['ret'] = ResultMessage.RESULT_SUCCEED ++ result['message'] = result_message ++ return result ++ ++def get_io_data(period, disk_list, stage, iotype): ++ result = inter_get_io_data(period, disk_list, stage, iotype) ++ error_code = result['ret'] ++ if error_code != ResultMessage.RESULT_SUCCEED: ++ result['message'] = Result_Messages[error_code] ++ return result ++ ++def inter_get_io_data(period, disk_list, stage, iotype): ++ result = {} ++ result['ret'] = ResultMessage.RESULT_UNKNOWN ++ result['message'] = "" ++ ++ if not isinstance(period, int): ++ result['ret'] = ResultMessage.RESULT_NOT_PARAM ++ return result ++ if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN: ++ result['ret'] = ResultMessage.RESULT_INVALID_LENGTH ++ return result ++ ++ res = validate_parameters(disk_list, LIMIT_DISK_LIST_LEN, LIMIT_DISK_CHAR_LEN) ++ if not res[0]: ++ result['ret'] = res[1] ++ return result ++ ++ res = validate_parameters(stage, LIMIT_STAGE_LIST_LEN, LIMIT_STAGE_CHAR_LEN) ++ if not res[0]: ++ result['ret'] = res[1] ++ return result ++ ++ res = validate_parameters(iotype, LIMIT_IOTYPE_LIST_LEN, LIMIT_IOTYPE_CHAR_LEN) ++ if not res[0]: ++ result['ret'] = res[1] ++ return result ++ ++ req_msg_struct = { ++ 'disk_list': json.dumps(disk_list), ++ 'period': period, ++ 'stage': json.dumps(stage), ++ 'iotype' : json.dumps(iotype) ++ } ++ ++ request_message = json.dumps(req_msg_struct) ++ result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.GET_IO_DATA) ++ if not result_message: ++ print("collect_plugin: client_send_and_recv failed") ++ return result ++ try: ++ json.loads(result_message) ++ except json.JSONDecodeError: ++ print("get_io_data: json decode error") ++ result['ret'] = ResultMessage.RESULT_PARSE_FAILED ++ return result ++ ++ result['ret'] = ResultMessage.RESULT_SUCCEED ++ result['message'] = result_message ++ return result ++ +diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py +new file mode 100644 +index 0000000..bab4e56 +--- /dev/null ++++ b/src/python/sentryCollector/collect_server.py +@@ -0,0 +1,285 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei 
Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. ++ ++""" ++listen module ++""" ++import sys ++import signal ++import traceback ++import socket ++import os ++import json ++import logging ++import fcntl ++import select ++import threading ++import time ++ ++from .collect_io import IO_GLOBAL_DATA, IO_CONFIG_DATA ++from .collect_config import CollectConfig ++ ++SENTRY_RUN_DIR = "/var/run/sysSentry" ++COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" ++ ++# socket param ++CLT_LISTEN_QUEUE_LEN = 5 ++SERVER_EPOLL_TIMEOUT = 0.3 ++ ++# data length param ++CLT_MSG_HEAD_LEN = 9 #3+2+4 ++CLT_MSG_PRO_LEN = 2 ++CLT_MSG_MAGIC_LEN = 3 ++CLT_MSG_LEN_LEN = 4 ++ ++# data flag param ++CLT_MAGIC = "CLT" ++RES_MAGIC = "RES" ++ ++# interface protocol ++class ServerProtocol(): ++ IS_IOCOLLECT_VALID = 0 ++ GET_IO_DATA = 1 ++ PRO_END = 3 ++ ++class CollectServer(): ++ ++ def __init__(self): ++ ++ self.io_global_data = {} ++ ++ self.stop_event = threading.Event() ++ ++ def is_iocollect_valid(self, data_struct): ++ ++ result_rev = {} ++ self.io_global_data = IO_GLOBAL_DATA ++ ++ if len(IO_CONFIG_DATA) == 0: ++ logging.error("the collect thread is not started, the data is invalid. ") ++ return json.dumps(result_rev) ++ ++ period_time = IO_CONFIG_DATA[0] ++ max_save = IO_CONFIG_DATA[1] ++ ++ disk_list = json.loads(data_struct['disk_list']) ++ period = int(data_struct['period']) ++ stage_list = json.loads(data_struct['stage']) ++ ++ if (period < period_time) or (period > period_time * max_save) or (period % period_time): ++ logging.error("is_iocollect_valid: period time: %d is invalid", period) ++ return json.dumps(result_rev) ++ ++ for disk_name, stage_info in self.io_global_data.items(): ++ if len(disk_list) > 0 and disk_name not in disk_list: ++ continue ++ result_rev[disk_name] = [] ++ if len(stage_list) == 0: ++ result_rev[disk_name] = list(stage_info.keys()) ++ continue ++ for stage_name, stage_data in stage_info.items(): ++ if stage_name in stage_list: ++ result_rev[disk_name].append(stage_name) ++ ++ return json.dumps(result_rev) ++ ++ def get_io_data(self, data_struct): ++ result_rev = {} ++ self.io_global_data = IO_GLOBAL_DATA ++ ++ if len(IO_CONFIG_DATA) == 0: ++ logging.error("the collect thread is not started, the data is invalid. 
") ++ return json.dumps(result_rev) ++ period_time = IO_CONFIG_DATA[0] ++ max_save = IO_CONFIG_DATA[1] ++ ++ period = int(data_struct['period']) ++ disk_list = json.loads(data_struct['disk_list']) ++ stage_list = json.loads(data_struct['stage']) ++ iotype_list = json.loads(data_struct['iotype']) ++ ++ if (period < period_time) or (period > period_time * max_save) or (period % period_time): ++ logging.error("get_io_data: period time: %d is invalid", period) ++ return json.dumps(result_rev) ++ ++ collect_index = period // period_time - 1 ++ logging.debug("period: %d, collect_index: %d", period, collect_index) ++ ++ for disk_name, stage_info in self.io_global_data.items(): ++ if disk_name not in disk_list: ++ continue ++ result_rev[disk_name] = {} ++ for stage_name, iotype_info in stage_info.items(): ++ if len(stage_list) > 0 and stage_name not in stage_list: ++ continue ++ result_rev[disk_name][stage_name] = {} ++ for iotype_name, iotype_info in iotype_info.items(): ++ if iotype_name not in iotype_list: ++ continue ++ if len(iotype_info) < collect_index: ++ continue ++ result_rev[disk_name][stage_name][iotype_name] = iotype_info[collect_index] ++ ++ return json.dumps(result_rev) ++ ++ def msg_data_process(self, msg_data, protocal_id): ++ """message data process""" ++ logging.debug("msg_data %s", msg_data) ++ protocol_name = msg_data[0] ++ try: ++ data_struct = json.loads(msg_data) ++ except json.JSONDecodeError: ++ logging.error("msg data process: json decode error") ++ return "Request message decode failed" ++ ++ if protocal_id == ServerProtocol.IS_IOCOLLECT_VALID: ++ res_msg = self.is_iocollect_valid(data_struct) ++ elif protocal_id == ServerProtocol.GET_IO_DATA: ++ res_msg = self.get_io_data(data_struct) ++ ++ return res_msg ++ ++ def msg_head_process(self, msg_head): ++ """message head process""" ++ ctl_magic = msg_head[:CLT_MSG_MAGIC_LEN] ++ if ctl_magic != CLT_MAGIC: ++ logging.error("recv msg head magic invalid") ++ return None ++ ++ protocol_str = msg_head[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN] ++ try: ++ protocol_id = int(protocol_str) ++ except ValueError: ++ logging.error("recv msg protocol id is invalid") ++ return None ++ ++ data_len_str = msg_head[CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN:CLT_MSG_HEAD_LEN] ++ try: ++ data_len = int(data_len_str) ++ except ValueError: ++ logging.error("recv msg data len is invalid %s", data_len_str) ++ return None ++ ++ return [protocol_id, data_len] ++ ++ def server_recv(self, server_socket: socket.socket): ++ """server receive""" ++ try: ++ client_socket, _ = server_socket.accept() ++ logging.debug("server_fd listen ok") ++ except socket.error: ++ logging.error("server accept failed, %s", socket.error) ++ return ++ ++ try: ++ msg_head = client_socket.recv(CLT_MSG_HEAD_LEN) ++ logging.debug("recv msg head: %s", msg_head.decode()) ++ head_info = self.msg_head_process(msg_head.decode()) ++ except (OSError, UnicodeError): ++ client_socket.close() ++ logging.error("server recv HEAD failed") ++ return ++ ++ protocol_id = head_info[0] ++ data_len = head_info[1] ++ logging.debug("msg protocol id: %d, data length: %d", protocol_id, data_len) ++ if protocol_id >= ServerProtocol.PRO_END: ++ client_socket.close() ++ logging.error("protocol id is invalid") ++ return ++ ++ if data_len < 0: ++ client_socket.close() ++ logging.error("msg head parse failed") ++ return ++ ++ try: ++ msg_data = client_socket.recv(data_len) ++ msg_data_decode = msg_data.decode() ++ logging.debug("msg data %s", msg_data_decode) ++ except (OSError, UnicodeError): ++ 
client_socket.close() ++ logging.error("server recv MSG failed") ++ return ++ ++ res_data = self.msg_data_process(msg_data_decode, protocol_id) ++ logging.debug("res data %s", res_data) ++ ++ # server send ++ res_head = RES_MAGIC ++ res_head += str(protocol_id).zfill(CLT_MSG_PRO_LEN) ++ res_data_len = str(len(res_data)).zfill(CLT_MSG_LEN_LEN) ++ res_head += res_data_len ++ logging.debug("res head %s", res_head) ++ ++ res_msg = res_head + res_data ++ logging.debug("res msg %s", res_msg) ++ ++ try: ++ client_socket.send(res_msg.encode()) ++ except OSError: ++ logging.error("server recv failed") ++ finally: ++ client_socket.close() ++ return ++ ++ def server_fd_create(self): ++ """create server fd""" ++ if not os.path.exists(SENTRY_RUN_DIR): ++ logging.error("%s not exist, failed", SENTRY_RUN_DIR) ++ return None ++ ++ try: ++ server_fd = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) ++ server_fd.setblocking(False) ++ if os.path.exists(COLLECT_SOCKET_PATH): ++ os.remove(COLLECT_SOCKET_PATH) ++ ++ server_fd.bind(COLLECT_SOCKET_PATH) ++ os.chmod(COLLECT_SOCKET_PATH, 0o600) ++ server_fd.listen(CLT_LISTEN_QUEUE_LEN) ++ logging.debug("%s bind and listen", COLLECT_SOCKET_PATH) ++ except socket.error: ++ logging.error("server fd create failed") ++ server_fd = None ++ ++ return server_fd ++ ++ ++ def server_loop(self): ++ """main loop""" ++ logging.info("collect listen thread start") ++ server_fd = self.server_fd_create() ++ if not server_fd: ++ return ++ ++ epoll_fd = select.epoll() ++ epoll_fd.register(server_fd.fileno(), select.EPOLLIN) ++ ++ logging.debug("start server_loop loop") ++ while True: ++ if self.stop_event.is_set(): ++ logging.debug("collect listen thread exit") ++ server_fd = None ++ return ++ try: ++ events_list = epoll_fd.poll(SERVER_EPOLL_TIMEOUT) ++ for event_fd, _ in events_list: ++ if event_fd == server_fd.fileno(): ++ self.server_recv(server_fd) ++ else: ++ continue ++ except socket.error: ++ pass ++ ++ def stop_thread(self): ++ logging.debug("collect listen thread is preparing to exit") ++ self.stop_event.set() +diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py +new file mode 100644 +index 0000000..3a836df +--- /dev/null ++++ b/src/python/sentryCollector/collectd.py +@@ -0,0 +1,97 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. ++ ++""" ++main loop for collect. 
++""" ++import sys ++import signal ++import traceback ++import socket ++import os ++import json ++import logging ++import fcntl ++import select ++ ++import threading ++ ++from .collect_io import CollectIo ++from .collect_server import CollectServer ++from .collect_config import CollectConfig ++ ++SENTRY_RUN_DIR = "/var/run/sysSentry" ++COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" ++SENTRY_RUN_DIR_PERM = 0o750 ++ ++COLLECT_LOG_FILE = "/var/log/sysSentry/collector.log" ++Thread_List = [] ++Module_Map_Class = {"io" : CollectIo} ++ ++def remove_sock_file(): ++ try: ++ os.unlink(COLLECT_SOCKET_PATH) ++ except FileNotFoundError: ++ pass ++ ++def sig_handler(signum, _f): ++ if signum not in (signal.SIGINT, signal.SIGTERM): ++ return ++ for i in range(len(Thread_List)): ++ Thread_List[i][0].stop_thread() ++ ++ remove_sock_file() ++ ++def main(): ++ """main ++ """ ++ if not os.path.exists(SENTRY_RUN_DIR): ++ os.mkdir(SENTRY_RUN_DIR) ++ os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM) ++ ++ logging.basicConfig(filename=COLLECT_LOG_FILE, level=logging.INFO) ++ os.chmod(COLLECT_LOG_FILE, 0o600) ++ ++ try: ++ signal.signal(signal.SIGINT, sig_handler) ++ signal.signal(signal.SIGTERM, sig_handler) ++ ++ logging.info("finish main parse_args") ++ ++ module_config = CollectConfig() ++ module_list = module_config.modules ++ ++ # listen thread ++ cs = CollectServer() ++ listen_thread = threading.Thread(target=cs.server_loop) ++ listen_thread.start() ++ Thread_List.append([cs, listen_thread]) ++ ++ # collect thread ++ for info in module_list: ++ class_name = Module_Map_Class.get(info) ++ if not class_name: ++ logging.info("%s correspond to class is not exists", info) ++ continue ++ cn = class_name(module_config) ++ collect_thread = threading.Thread(target=cn.main_loop) ++ collect_thread.start() ++ Thread_List.append([cn, collect_thread]) ++ ++ for i in range(len(Thread_List)): ++ Thread_List[i][1].join() ++ ++ except Exception: ++ logging.error('%s', traceback.format_exc()) ++ finally: ++ pass ++ ++ logging.info("All threads have finished. Main thread is exiting.") +\ No newline at end of file +diff --git a/src/python/sentryPlugins/__init__.py b/src/python/sentryPlugins/__init__.py +new file mode 100644 +index 0000000..e69de29 +diff --git a/src/python/sentryPlugins/avg_block_io/__init__.py b/src/python/sentryPlugins/avg_block_io/__init__.py +new file mode 100644 +index 0000000..e69de29 +diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py +new file mode 100644 +index 0000000..ff2071d +--- /dev/null ++++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py +@@ -0,0 +1,257 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. 
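The collectd entry point above treats collection modules generically: every name listed in [common] modules that has a class registered in Module_Map_Class is instantiated with the parsed CollectConfig and driven in its own thread, and sig_handler shuts it down through stop_thread(). A rough sketch of the interface a hypothetical extra module would have to provide ("foo" and CollectFoo are made-up names used only for illustration):

    import threading

    class CollectFoo:
        def __init__(self, module_config):
            self.stop_event = threading.Event()

        def main_loop(self):
            # periodic collection work; poll self.stop_event to exit promptly
            ...

        def stop_thread(self):
            self.stop_event.set()

    # registration: Module_Map_Class = {"io": CollectIo, "foo": CollectFoo}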
++import logging ++import signal ++import configparser ++import time ++ ++from .stage_window import IoWindow, IoDumpWindow ++from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler ++from .utils import update_avg_and_check_abnormal ++ ++CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" ++ ++def log_invalid_keys(not_in_list, keys_name, config_list, default_list): ++ """print invalid log""" ++ if config_list and default_list: ++ logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list)) ++ elif config_list == ["default"]: ++ logging.warning("Default {} use {}".format(keys_name, default_list)) ++ ++ ++def read_config_common(config): ++ """read config file, get [common] section value""" ++ try: ++ common_sec = config['common'] ++ except configparser.NoSectionError: ++ report_alarm_fail("Cannot find common section in config file") ++ ++ try: ++ period_time = int(common_sec.get("period_time", 1)) ++ if not (1 <= period_time <= 300): ++ raise ValueError("Invalid period_time") ++ except ValueError: ++ period_time = 1 ++ logging.warning("Invalid period_time, set to 1s") ++ ++ disk = common_sec.get('disk').split(",") if common_sec.get('disk') not in [None, 'default'] else [] ++ stage = common_sec.get('stage').split(",") if common_sec.get('stage') not in [None, 'default'] else [] ++ ++ if len(disk) > 10: ++ logging.warning("Too many disks, record only max 10 disks") ++ disk = disk[:10] ++ ++ iotype = common_sec.get('iotype', 'read,write').split(",") ++ iotype_list = [rw.lower() for rw in iotype if rw.lower() in ['read', 'write', 'flush', 'discard']] ++ err_iotype = [rw for rw in iotype if rw.lower() not in ['read', 'write', 'flush', 'discard']] ++ ++ if err_iotype: ++ logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list)) ++ ++ return period_time, disk, stage, iotype_list ++ ++ ++def read_config_algorithm(config): ++ """read config file, get [algorithm] section value""" ++ if not config.has_section("algorithm"): ++ report_alarm_fail("Cannot find algorithm section in config file") ++ ++ try: ++ win_size = int(config.get("algorithm", "win_size")) ++ if not (1 <= win_size <= 300): ++ raise ValueError("Invalid win_size") ++ win_threshold = int(config.get("algorithm", "win_threshold")) ++ if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size: ++ raise ValueError("Invalid win_threshold") ++ except ValueError: ++ report_alarm_fail("Invalid win_threshold or win_size") ++ ++ return win_size, win_threshold ++ ++ ++def read_config_lat_iodump(io_dic, config): ++ """read config file, get [latency] [iodump] section value""" ++ common_param = {} ++ for io_type in io_dic["iotype_list"]: ++ common_param[io_type] = {} ++ ++ latency_keys = { ++ "avg_lim": "{}_avg_lim".format(io_type), ++ "avg_time": "{}_avg_time".format(io_type), ++ "tot_lim": "{}_tot_lim".format(io_type), ++ } ++ iodump_key = "{}_iodump_lim".format(io_type) ++ ++ for key_suffix, key_template in latency_keys.items(): ++ if key_template in config["latency"] and config["latency"][key_template].isdecimal(): ++ common_param[io_type][key_template] = int(config["latency"][key_template]) ++ ++ if iodump_key in config["iodump"] and config["iodump"][iodump_key].isdecimal(): ++ common_param[io_type][iodump_key] = int(config["iodump"][iodump_key]) ++ ++ return common_param ++ ++ ++def read_config_stage(config, stage, iotype_list): ++ """read config file, get [STAGE_NAME] section 
value""" ++ res = {} ++ if not stage in config: ++ return res ++ ++ for key in config[stage]: ++ if config[stage][key].isdecimal(): ++ res[key] = int(config[stage][key]) ++ ++ return res ++ ++ ++def init_io_win(io_dic, config, common_param): ++ """initialize windows of latency, iodump, and dict of avg_value""" ++ iotype_list = io_dic["iotype_list"] ++ io_data = {} ++ io_avg_value = {} ++ for disk_name in io_dic["disk_list"]: ++ io_data[disk_name] = {} ++ io_avg_value[disk_name] = {} ++ for stage_name in io_dic["stage_list"]: ++ io_data[disk_name][stage_name] = {} ++ io_avg_value[disk_name][stage_name] = {} ++ # step3. 解析stage配置 ++ curr_stage_param = read_config_stage(config, stage_name, iotype_list) ++ for rw in iotype_list: ++ io_data[disk_name][stage_name][rw] = {} ++ io_avg_value[disk_name][stage_name][rw] = [0, 0] ++ ++ # 对每个rw创建latency和iodump窗口 ++ avg_lim_key = "{}_avg_lim".format(rw) ++ avg_time_key = "{}_avg_time".format(rw) ++ tot_lim_key = "{}_tot_lim".format(rw) ++ iodump_lim_key = "{}_iodump_lim".format(rw) ++ ++ # 获取值,优先从 curr_stage_param 获取,如果不存在,则从 common_param 获取 ++ avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(rw, {}).get(avg_lim_key)) ++ avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(rw, {}).get(avg_time_key)) ++ tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(rw, {}).get(tot_lim_key)) ++ iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get(rw, {}).get(iodump_lim_key)) ++ ++ if avg_lim_value and avg_time_value and tot_lim_value: ++ io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value) ++ ++ if iodump_lim_value is not None: ++ io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value) ++ return io_data, io_avg_value ++ ++ ++def get_valid_disk_stage_list(io_dic, config_disk, config_stage): ++ """get disk_list and stage_list by sentryCollector""" ++ json_data = avg_is_iocollect_valid(io_dic, config_disk, config_stage) ++ ++ all_disk_set = json_data.keys() ++ all_stage_set = set() ++ for disk_stage_list in json_data.values(): ++ all_stage_set.update(disk_stage_list) ++ ++ disk_list = [key for key in config_disk if key in all_disk_set] ++ not_in_disk_list = [key for key in config_disk if key not in all_disk_set] ++ ++ stage_list = [key for key in config_stage if key in all_stage_set] ++ not_in_stage_list = [key for key in config_stage if key not in all_stage_set] ++ ++ if not config_disk: ++ disk_list = [key for key in all_disk_set] ++ ++ if not config_stage: ++ stage_list = [key for key in all_stage_set] ++ ++ if config_disk and not disk_list: ++ logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk)) ++ disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage) ++ ++ if config_stage and not stage_list: ++ logging.warning("Cannot get valid stage by stage={}, set to default".format(config_stage)) ++ disk_list, stage_list = get_valid_disk_stage_list(io_dic, config_disk, []) ++ ++ if not stage_list or not disk_list: ++ report_alarm_fail("Cannot get valid disk name or stage name.") ++ ++ log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list) ++ log_invalid_keys(not_in_stage_list, 'stage', config_stage, stage_list) ++ ++ return disk_list, stage_list ++ ++ ++def 
main_loop(io_dic, io_data, io_avg_value): ++ """main loop of avg_block_io""" ++ period_time = io_dic["period_time"] ++ disk_list = io_dic["disk_list"] ++ stage_list = io_dic["stage_list"] ++ iotype_list = io_dic["iotype_list"] ++ win_size = io_dic["win_size"] ++ # 开始循环 ++ while True: ++ # 等待x秒 ++ time.sleep(period_time) ++ ++ # 采集模块对接,获取周期数据 ++ curr_period_data = avg_get_io_data(io_dic) ++ ++ # 处理周期数据 ++ reach_size = False ++ for disk_name in disk_list: ++ for stage_name in stage_list: ++ for rw in iotype_list: ++ if disk_name in curr_period_data and stage_name in curr_period_data[disk_name] and rw in curr_period_data[disk_name][stage_name]: ++ io_key = (disk_name, stage_name, rw) ++ reach_size = update_avg_and_check_abnormal(curr_period_data, io_key, win_size, io_avg_value, io_data) ++ ++ # win_size不满时不进行告警判断 ++ if not reach_size: ++ continue ++ ++ # 判断异常窗口、异常场景 ++ for disk_name in disk_list: ++ for rw in iotype_list: ++ process_report_data(disk_name, rw, io_data) ++ ++ ++def main(): ++ """main func""" ++ # 注册停止信号-2/-15 ++ signal.signal(signal.SIGINT, sig_handler) ++ signal.signal(signal.SIGTERM, sig_handler) ++ ++ # 初始化配置读取 ++ config = configparser.ConfigParser(comment_prefixes=('#', ';')) ++ try: ++ config.read(CONFIG_FILE) ++ except configparser.Error: ++ report_alarm_fail("Failed to read config file") ++ ++ io_dic = {} ++ ++ # 读取配置文件 -- common段 ++ io_dic["period_time"], disk, stage, io_dic["iotype_list"] = read_config_common(config) ++ ++ # 采集模块对接,is_iocollect_valid() ++ io_dic["disk_list"], io_dic["stage_list"] = get_valid_disk_stage_list(io_dic, disk, stage) ++ ++ if "bio" not in io_dic["stage_list"]: ++ report_alarm_fail("Cannot run avg_block_io without bio stage") ++ ++ # 初始化窗口 -- config读取,对应is_iocollect_valid返回的结果 ++ # step1. 解析公共配置 --- algorithm ++ io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config) ++ ++ # step2. 循环创建窗口 ++ common_param = read_config_lat_iodump(io_dic, config) ++ io_data, io_avg_value = init_io_win(io_dic, config, common_param) ++ ++ main_loop(io_dic, io_data, io_avg_value) +diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py +new file mode 100644 +index 0000000..0da4208 +--- /dev/null ++++ b/src/python/sentryPlugins/avg_block_io/module_conn.py +@@ -0,0 +1,86 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. 
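One detail of init_io_win above worth noting: a per-stage section overrides the per-iotype defaults read from [latency] and [iodump]. A stand-alone illustration of that lookup order (the section name and the numbers are made up):

    # what read_config_lat_iodump() might return for iotype "read"
    common_param = {"read": {"read_avg_lim": 10, "read_avg_time": 3, "read_tot_lim": 50}}
    # what read_config_stage() might return for a hypothetical [rq_driver] section
    curr_stage_param = {"read_avg_lim": 20}

    # the same expression init_io_win uses: stage value first, common value as fallback
    avg_lim_value = curr_stage_param.get("read_avg_lim",
                                         common_param.get("read", {}).get("read_avg_lim"))
    print(avg_lim_value)  # 20 for this stage; 10 for any stage without an override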
++import json ++import logging ++import sys ++import time ++ ++from .utils import is_abnormal ++from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages ++from syssentry.result import ResultLevel, report_result ++ ++ ++TASK_NAME = "avg_block_io" ++ ++def sig_handler(signum, _f): ++ """stop avg_block_io""" ++ report_result(TASK_NAME, ResultLevel.PASS, json.dumps({})) ++ logging.info("Finished avg_block_io plugin running.") ++ sys.exit(0) ++ ++def avg_get_io_data(io_dic): ++ """get_io_data from sentryCollector""" ++ res = get_io_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"]) ++ return check_result_validation(res, 'get io data') ++ ++ ++def avg_is_iocollect_valid(io_dic, config_disk, config_stage): ++ """is_iocollect_valid from sentryCollector""" ++ res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage) ++ return check_result_validation(res, 'check config validation') ++ ++ ++def check_result_validation(res, reason): ++ """check validation of result from sentryCollector""" ++ if not 'ret' in res or not 'message' in res: ++ err_msg = "Failed to {}: Cannot connect to sentryCollector.".format(reason) ++ report_alarm_fail(err_msg) ++ if res['ret'] != 0: ++ err_msg = "Failed to {}: {}".format(reason, Result_Messages[res['ret']]) ++ report_alarm_fail(err_msg) ++ ++ try: ++ json_data = json.loads(res['message']) ++ except json.JSONDecodeError: ++ err_msg = "Failed to {}: invalid return message".format(reason) ++ report_alarm_fail(err_msg) ++ ++ return json_data ++ ++ ++def report_alarm_fail(alarm_info): ++ """report result to xalarmd""" ++ report_result(TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": alarm_info})) ++ logging.error(alarm_info) ++ sys.exit(1) ++ ++ ++def process_report_data(disk_name, rw, io_data): ++ """check abnormal window and report to xalarm""" ++ if not is_abnormal((disk_name, 'bio', rw), io_data): ++ return ++ ++ ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq'] ++ for stage_name in ctrl_stage: ++ if is_abnormal((disk_name, stage_name, rw), io_data): ++ logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw)) ++ return ++ ++ if is_abnormal((disk_name, 'rq_driver', rw), io_data): ++ logging.warning("{} - {} - {} report driver".format(time.ctime(), disk_name, rw)) ++ return ++ ++ kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue'] ++ for stage_name in kernel_stage: ++ if is_abnormal((disk_name, stage_name, rw), io_data): ++ logging.warning("{} - {} - {} report kernel".format(time.ctime(), disk_name, rw)) ++ return ++ logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw)) +diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py +new file mode 100644 +index 0000000..9b0ce79 +--- /dev/null ++++ b/src/python/sentryPlugins/avg_block_io/stage_window.py +@@ -0,0 +1,47 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. 
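check_result_validation above only accepts a reply that carries both 'ret' and 'message', with ret equal to 0 and message holding JSON text; anything else ends in report_alarm_fail(). A self-contained illustration of the accepted shape (the disk and stage names are example values):

    import json

    res = {"ret": 0, "message": json.dumps({"sda": ["bio", "rq_driver"]})}
    assert "ret" in res and "message" in res and res["ret"] == 0
    print(json.loads(res["message"]))   # {'sda': ['bio', 'rq_driver']}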
++ ++class AbnormalWindowBase: ++ def __init__(self, window_size=10, window_threshold=7): ++ self.window_size = window_size ++ self.window_threshold = window_threshold ++ self.abnormal_window = [False] * window_size ++ ++ def append_new_period(self, ab_res, avg_val=0): ++ self.abnormal_window.pop(0) ++ if self.is_abnormal_period(ab_res, avg_val): ++ self.abnormal_window.append(True) ++ else: ++ self.abnormal_window.append(False) ++ ++ def is_abnormal_window(self): ++ return sum(self.abnormal_window) > self.window_threshold ++ ++ ++class IoWindow(AbnormalWindowBase): ++ def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40): ++ super().__init__(window_size, window_threshold) ++ self.abnormal_multiple = abnormal_multiple ++ self.abnormal_multiple_lim = abnormal_multiple_lim ++ self.abnormal_time = abnormal_time ++ ++ def is_abnormal_period(self, value, avg_val): ++ return (value > avg_val * self.abnormal_multiple and value > self.abnormal_multiple_lim) or \ ++ (value > self.abnormal_time) ++ ++ ++class IoDumpWindow(AbnormalWindowBase): ++ def __init__(self, window_size=10, window_threshold=7, abnormal_time=40): ++ super().__init__(window_size, window_threshold) ++ self.abnormal_time = abnormal_time ++ ++ def is_abnormal_period(self, value, avg_val=0): ++ return value > self.abnormal_time +diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py +new file mode 100644 +index 0000000..54ed080 +--- /dev/null ++++ b/src/python/sentryPlugins/avg_block_io/utils.py +@@ -0,0 +1,86 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. 
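The window classes above flag a problem only once enough periods inside the sliding window are individually abnormal. A usage sketch with made-up thresholds, assuming IoWindow is imported from the installed plugin package:

    from sentryPlugins.avg_block_io.stage_window import IoWindow

    win = IoWindow(window_size=30, window_threshold=6,
                   abnormal_multiple=3, abnormal_multiple_lim=10, abnormal_time=50)
    for _ in range(7):                # seven periods at 60 ms latency against a 5 ms average
        win.append_new_period(60, 5)  # 60 > 5 * 3 and 60 > 10, so each period counts as abnormal
    print(win.is_abnormal_window())   # True: 7 abnormal periods exceed the threshold of 6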
++AVG_VALUE = 0 ++AVG_COUNT = 1 ++ ++ ++def get_nested_value(data, keys): ++ """get data from nested dict""" ++ for key in keys: ++ if key in data: ++ data = data[key] ++ else: ++ return None ++ return data ++ ++ ++def set_nested_value(data, keys, value): ++ """set data to nested dict""" ++ for key in keys[:-1]: ++ if key in data: ++ data = data[key] ++ else: ++ return False ++ data[keys[-1]] = value ++ return True ++ ++ ++def is_abnormal(io_key, io_data): ++ """check if latency and iodump win abnormal""" ++ for key in ['latency', 'iodump']: ++ all_keys = get_nested_value(io_data, io_key) ++ if all_keys and key in all_keys: ++ win = get_nested_value(io_data, io_key + (key,)) ++ if win and win.is_abnormal_window(): ++ return True ++ return False ++ ++ ++def update_io_avg(old_avg, period_value, win_size): ++ """update average of latency window""" ++ if old_avg[AVG_COUNT] < win_size: ++ new_avg_count = old_avg[AVG_COUNT] + 1 ++ new_avg_value = (old_avg[AVG_VALUE] * old_avg[AVG_COUNT] + period_value[0]) / new_avg_count ++ else: ++ new_avg_count = old_avg[AVG_COUNT] ++ new_avg_value = (old_avg[AVG_VALUE] * (old_avg[AVG_COUNT] - 1) + period_value[0]) / new_avg_count ++ return [new_avg_value, new_avg_count] ++ ++ ++def update_io_data(old_avg, period_value, win_size, io_data, io_key): ++ """update data of latency and iodump window""" ++ all_wins = get_nested_value(io_data, io_key) ++ if all_wins and "latency" in all_wins: ++ io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE]) ++ if all_wins and "iodump" in all_wins: ++ io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1]) ++ ++ ++def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data): ++ """update avg and check abonrmal, return true if win_size full""" ++ period_value = get_nested_value(data, io_key) ++ old_avg = get_nested_value(io_avg_value, io_key) ++ ++ # 更新avg数据 ++ if old_avg[AVG_COUNT] < win_size: ++ set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size)) ++ return False ++ ++ # 更新win数据 -- 判断异常周期 ++ update_io_data(old_avg, period_value, win_size, io_data, io_key) ++ all_wins = get_nested_value(io_data, io_key) ++ if all_wins and 'latency' not in all_wins: ++ return True ++ period = get_nested_value(io_data, io_key + ("latency",)) ++ if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]): ++ return True ++ set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size)) ++ return True +diff --git a/src/python/setup.py b/src/python/setup.py +index f96a96e..c28c691 100644 +--- a/src/python/setup.py ++++ b/src/python/setup.py +@@ -31,7 +31,9 @@ setup( + 'console_scripts': [ + 'cpu_sentry=syssentry.cpu_sentry:main', + 'syssentry=syssentry.syssentry:main', +- 'xalarmd=xalarm.xalarm_daemon:alarm_process_create' ++ 'xalarmd=xalarm.xalarm_daemon:alarm_process_create', ++ 'sentryCollector=sentryCollector.collectd:main', ++ 'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main' + ] + }, + ) +diff --git a/src/python/syssentry/bmc_alarm.py b/src/python/syssentry/bmc_alarm.py +new file mode 100644 +index 0000000..5956538 +--- /dev/null ++++ b/src/python/syssentry/bmc_alarm.py +@@ -0,0 +1,159 @@ ++import logging ++import socket ++from enum import Enum ++ ++from .utils import execute_command ++ ++HEX_CHAR_LEN = 2 ++SOCKET_RECEIVE_LEN = 128 ++BMC_DATA_HEAD = "REP" ++BMC_REPORT_TYPE_BIT = 0 ++HBMC_REPAIR_TYPE_BIT = 1 ++HBMC_REPAIR_RESULT_BIT = 2 
++HBMC_ISOLATION_TYPE_BIT = 3 ++HBMC_SEND_HEAD_LEN = 4 # "ipmtool", "raw", "0x30", "0x92" ++HBMC_SEND_ROW_BIT = 26 + HBMC_SEND_HEAD_LEN ++HBMC_SEND_COL_BIT = 30 + HBMC_SEND_HEAD_LEN ++HBMC_REPAIR_TYPE_OFFSET = 7 ++ ++HBMC_SEND_SUCCESS_CODE = "db 07 00" ++ ++ ++class ReportType(Enum): ++ HBMC_REPAIR_BMC = 0x00 ++ ++ ++class HBMCRepairType(Enum): ++ CE_ACLS = 7 ++ PS_UCE_ACLS = 8 ++ CE_SPPR = 9 ++ PS_UCE_SPPR = 10 ++ ++ ++class HBMCRepairResultType(Enum): ++ ISOLATE_FAILED_OVER_THRESHOLD = 0b10000001 ++ ISOLATE_FAILED_OTHER_REASON = 0b10000010 ++ REPAIR_FAILED_NO_RESOURCE = 0b10010100 ++ REPAIR_FAILED_INVALID_PARAM = 0b10011000 ++ REPAIR_FAILED_OTHER_REASON = 0b10011100 ++ ONLINE_PAGE_FAILED = 0b10100000 ++ ISOLATE_REPAIR_ONLINE_SUCCESS = 0b00000000 ++ ++ ++class HBMCIsolationType(Enum): ++ ROW_FAULT = 1 ++ SINGLE_ADDR_FAULT = 6 ++ ++ ++def find_value_is_in_enum(value: int, enum: Enum): ++ for item in enum: ++ if value == item.value: ++ return True ++ return False ++ ++ ++def convert_hex_char_to_int(data, bit): ++ if len(data) < (bit+1)*HEX_CHAR_LEN: ++ logging.error(f"Data {data} len is too short, current convert bit is {bit}") ++ char = data[bit*HEX_CHAR_LEN:(bit+1)*HEX_CHAR_LEN] ++ try: ++ value = int(char, 16) ++ except ValueError: ++ logging.error(f"Cannot convert char [{char}] to int") ++ raise ValueError ++ return value ++ ++ ++def reverse_byte(data): ++ return data[3], data[2], data[1], data[0] ++ ++ ++def parse_hbmc_report(data: str): ++ logging.debug(f"bmc receive raw data is {data}") ++ repair_type = convert_hex_char_to_int(data, HBMC_REPAIR_TYPE_BIT) ++ repair_type += HBMC_REPAIR_TYPE_OFFSET ++ if not find_value_is_in_enum(repair_type, HBMCRepairType): ++ logging.warning(f"HBMC msg repair type ({repair_type}) is unknown") ++ raise ValueError ++ ++ repair_result = convert_hex_char_to_int(data, HBMC_REPAIR_RESULT_BIT) ++ if not find_value_is_in_enum(repair_result, HBMCRepairResultType): ++ logging.warning(f"HBMC msg repair result ({repair_result}) is unknown") ++ raise ValueError ++ ++ isolation_type = convert_hex_char_to_int(data, HBMC_ISOLATION_TYPE_BIT) ++ if not find_value_is_in_enum(isolation_type, HBMCIsolationType): ++ logging.warning(f"HBMC msg isolation type ({isolation_type}) is unknown") ++ raise ValueError ++ ++ cmd_list = [ ++ "ipmitool", ++ "raw", ++ "0x30", # Netfn ++ "0x92", # cmd ++ "0xdb", ++ "0x07", ++ "0x00", ++ "0x65", # sub command ++ "0x01", # SystemId ++ "0x00", # LocalSystemId ++ "{:#04X}".format(repair_type), ++ "{:#04X}".format(repair_result), ++ "{:#04X}".format(isolation_type), ++ ] ++ # send the remain data directly ++ data = data[(HBMC_ISOLATION_TYPE_BIT + 1) * HEX_CHAR_LEN:] ++ other_info_str = [] ++ for i in range(len(data) // 2): ++ other_info_str.append("{:#04X}".format(convert_hex_char_to_int(data, i))) ++ cmd_list.extend(other_info_str) ++ ++ cmd_list[HBMC_SEND_ROW_BIT:HBMC_SEND_ROW_BIT + 4] = reverse_byte(cmd_list[HBMC_SEND_ROW_BIT:HBMC_SEND_ROW_BIT + 4]) ++ cmd_list[HBMC_SEND_COL_BIT:HBMC_SEND_COL_BIT + 4] = reverse_byte(cmd_list[HBMC_SEND_COL_BIT:HBMC_SEND_COL_BIT + 4]) ++ ++ logging.info(f"Send bmc alarm command is {cmd_list}") ++ ++ ret = execute_command(cmd_list) ++ if HBMC_SEND_SUCCESS_CODE not in ret: ++ logging.warning(f"Send bmc alarm failed, error code is {ret}") ++ raise ValueError ++ logging.debug("Send bmc alarm success") ++ ++ ++PARSE_REPORT_MSG_FUNC_DICT = { ++ ReportType.HBMC_REPAIR_BMC.value: parse_hbmc_report, ++} ++ ++ ++def bmc_recv(server_socket: socket.socket): ++ logging.debug("Get hbm socket connection request") ++ try: 
++ client_socket, _ = server_socket.accept() ++ logging.debug("cpu alarm fd listen ok") ++ ++ data = client_socket.recv(SOCKET_RECEIVE_LEN) ++ data = data.decode() ++ ++ data_head = data[0:len(BMC_DATA_HEAD)] ++ if data_head != BMC_DATA_HEAD: ++ logging.warning(f"The head of the msg is incorrect, head is {data_head}") ++ raise ValueError ++ ++ # remove the data head ++ data = data[len(BMC_DATA_HEAD):] ++ logging.info(f"Remove head data is {data}") ++ ++ report_type = convert_hex_char_to_int(data, BMC_REPORT_TYPE_BIT) ++ if report_type not in PARSE_REPORT_MSG_FUNC_DICT.keys(): ++ logging.warning(f"The type of the msg ({report_type}) is unknown") ++ raise ValueError ++ ++ PARSE_REPORT_MSG_FUNC_DICT[report_type](data) ++ ++ except socket.error: ++ logging.error("socket error") ++ return ++ except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError): ++ logging.error("server recv bmc msg failed!") ++ client_socket.close() ++ return +diff --git a/src/python/syssentry/callbacks.py b/src/python/syssentry/callbacks.py +index d0d0719..b38b381 100644 +--- a/src/python/syssentry/callbacks.py ++++ b/src/python/syssentry/callbacks.py +@@ -53,7 +53,7 @@ def task_stop(mod_name): + return "failed", "mod is not enabled" + logging.info("%s stop", mod_name) + if task.runtime_status == EXITED_STATUS: +- return "success", "task already stoped" ++ return "success", "task already stopped" + if task.runtime_status == WAITING_STATUS: + set_runtime_status(task.name, EXITED_STATUS) + return "success", "" +diff --git a/src/python/syssentry/cpu_alarm.py b/src/python/syssentry/cpu_alarm.py +index 0b1642b..1fce462 100644 +--- a/src/python/syssentry/cpu_alarm.py ++++ b/src/python/syssentry/cpu_alarm.py +@@ -249,3 +249,4 @@ def cpu_alarm_recv(server_socket: socket.socket): + + upload_bmc(_type, module, command, event_type, socket_id, core_id) + ++ +diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py +index 2f18d14..72925eb 100644 +--- a/src/python/syssentry/cpu_sentry.py ++++ b/src/python/syssentry/cpu_sentry.py +@@ -26,8 +26,6 @@ CPU_SENTRY_PARAM_CONFIG = "/etc/sysSentry/plugins/cpu_sentry.ini" + # Inspection commands running at the bottom layer + LOW_LEVEL_INSPECT_CMD = "cat-cli" + +-# max length of msg in details +-DETAILS_LOG_MSG_MAX_LEN = 255 + + class CpuSentry: + """ +@@ -96,10 +94,22 @@ class CpuSentry: + self.send_result["details"]["msg"] = "cpu_sentry task is killed!" 
+ return + ++ if "ERROR" in stdout: ++ self.send_result["result"] = ResultLevel.FAIL ++ self.send_result["details"]["code"] = 1004 ++ ++ # Remove ANSI escape sequences ++ error_info = stdout.split("\n")[0] ++ if error_info.startswith("\u001b"): ++ ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' ++ error_info = re.sub(ansi_escape, '', error_info) ++ ++ self.send_result["details"]["msg"] = error_info ++ return ++ + out_split = stdout.split("\n") +- isolated_cores_number = -1 ++ isolated_cores_number = 0 + found_fault_cores_list = [] +- error_msg_list = [] + for out_line_i in out_split: + if "handle_patrol_result: Found fault cores" in out_line_i: + cores_number_tmp = out_line_i.split("Found fault cores:")[1] +@@ -111,25 +121,9 @@ class CpuSentry: + elif out_line_i.startswith(''): + self.send_result["details"]["isolated_cpu_list"] = out_line_i.split(':')[1] + break +- elif "ERROR" in out_line_i: +- logging.error("[cat-cli error] - %s\n", out_line_i) +- error_msg_list.append(out_line_i) + + found_fault_cores_number = len(set(found_fault_cores_list)) +- if isolated_cores_number == -1: +- self.send_result["result"] = ResultLevel.FAIL +- self.send_result["details"]["code"] = 1004 +- +- send_error_msg = "" +- # Remove ANSI escape sequences +- for error_info in error_msg_list: +- if error_info.startswith("\u001b"): +- ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' +- error_info = re.sub(ansi_escape, '', error_info) +- if len(send_error_msg) + len(error_info) < DETAILS_LOG_MSG_MAX_LEN: +- send_error_msg += ";" + error_info +- self.send_result["details"]["msg"] = send_error_msg +- elif found_fault_cores_number == 0: ++ if found_fault_cores_number == 0: + self.send_result["details"]["code"] = 0 + self.send_result["result"] = ResultLevel.PASS + elif 0 in found_fault_cores_list: +diff --git a/src/python/syssentry/cron_process.py b/src/python/syssentry/cron_process.py +index 50780b3..f161f1f 100644 +--- a/src/python/syssentry/cron_process.py ++++ b/src/python/syssentry/cron_process.py +@@ -21,7 +21,7 @@ import subprocess + from .utils import get_current_time_string + from .result import ResultLevel, RESULT_LEVEL_ERR_MSG_DICT + from .global_values import InspectTask +-from .task_map import TasksMap, PERIOD_TYPE, ONESHOT_TYPE ++from .task_map import TasksMap, PERIOD_TYPE + from .mod_status import set_runtime_status, WAITING_STATUS, RUNNING_STATUS, \ + FAILED_STATUS, EXITED_STATUS + +diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py +index 776971f..debff4e 100644 +--- a/src/python/syssentry/syssentry.py ++++ b/src/python/syssentry/syssentry.py +@@ -44,6 +44,12 @@ try: + except ImportError: + CPU_EXIST = False + ++BMC_EXIST = True ++try: ++ from .bmc_alarm import bmc_recv ++except ImportError: ++ BMC_EXIST = False ++ + + INSPECTOR = None + +@@ -83,6 +89,9 @@ RESULT_SOCKET_PATH = "/var/run/sysSentry/result.sock" + + CPU_ALARM_SOCKET_PATH = "/var/run/sysSentry/report.sock" + ++BMC_SOCKET_PATH = "/var/run/sysSentry/bmc.sock" ++ ++fd_list = [] + + def msg_data_process(msg_data): + """message data process""" +@@ -325,6 +334,41 @@ def cpu_alarm_fd_create(): + + return cpu_alarm_fd + ++def bmc_fd_create(): ++ """create bmc fd""" ++ if not os.path.exists(SENTRY_RUN_DIR): ++ logging.debug("%s not exist", SENTRY_RUN_DIR) ++ return None ++ ++ try: ++ bmc_fd = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) ++ except socket.error: ++ logging.error("bmc fd create failed") ++ return None ++ ++ bmc_fd.setblocking(False) ++ if os.path.exists(BMC_SOCKET_PATH): ++ 
os.remove(BMC_SOCKET_PATH) ++ ++ try: ++ bmc_fd.bind(BMC_SOCKET_PATH) ++ except OSError: ++ logging.error("bmc fd bind failed") ++ bmc_fd.close() ++ return None ++ ++ os.chmod(BMC_SOCKET_PATH, 0o600) ++ try: ++ bmc_fd.listen(5) ++ except OSError: ++ logging.error("bmc fd listen failed") ++ bmc_fd.close() ++ return None ++ ++ logging.debug("%s bind and listen", BMC_SOCKET_PATH) ++ ++ return bmc_fd ++ + + def server_result_recv(server_socket: socket.socket): + """server result receive""" +@@ -398,35 +442,47 @@ def server_result_fd_create(): + return server_result_fd + + ++def close_all_fd(): ++ for fd in fd_list: ++ fd.close() ++ ++ + def main_loop(): + """main loop""" ++ + server_fd = server_fd_create() + if not server_fd: ++ close_all_fd() + return ++ fd_list.append(server_fd) + + server_result_fd = server_result_fd_create() + if not server_result_fd: +- server_fd.close() ++ close_all_fd() + return ++ fd_list.append(server_result_fd) + + heartbeat_fd = heartbeat_fd_create() + if not heartbeat_fd: +- server_fd.close() +- server_result_fd.close() ++ close_all_fd() + return ++ fd_list.append(heartbeat_fd) + + cpu_alarm_fd = cpu_alarm_fd_create() + if not cpu_alarm_fd: +- server_fd.close() +- heartbeat_fd.close() +- server_result_fd.close() ++ close_all_fd() + return ++ fd_list.append(cpu_alarm_fd) ++ ++ bmc_fd = bmc_fd_create() ++ if not bmc_fd: ++ close_all_fd() ++ return ++ fd_list.append(bmc_fd) + + epoll_fd = select.epoll() +- epoll_fd.register(server_fd.fileno(), select.EPOLLIN) +- epoll_fd.register(server_result_fd.fileno(), select.EPOLLIN) +- epoll_fd.register(heartbeat_fd.fileno(), select.EPOLLIN) +- epoll_fd.register(cpu_alarm_fd.fileno(), select.EPOLLIN) ++ for fd in fd_list: ++ epoll_fd.register(fd.fileno(), select.EPOLLIN) + + logging.debug("start main loop") + # onstart_tasks_handle() +@@ -449,6 +505,8 @@ def main_loop(): + heartbeat_recv(heartbeat_fd) + elif CPU_EXIST and event_fd == cpu_alarm_fd.fileno(): + cpu_alarm_recv(cpu_alarm_fd) ++ elif BMC_EXIST and event_fd == bmc_fd.fileno(): ++ bmc_recv(bmc_fd) + else: + continue + +@@ -587,4 +645,3 @@ def main(): + logging.error('%s', traceback.format_exc()) + finally: + release_pidfile() +- +diff --git a/src/python/xalarm/xalarm_api.py b/src/python/xalarm/xalarm_api.py +index 94d7638..a2cdb25 100644 +--- a/src/python/xalarm/xalarm_api.py ++++ b/src/python/xalarm/xalarm_api.py +@@ -98,7 +98,7 @@ class Xalarm: + """msg1 setter + """ + if len(msg) > 512: +- raise ValueError("msg1 length must below 255") ++ raise ValueError("msg1 length must below 512") + self._msg1 = msg + + +-- +2.27.0 + diff --git a/sysSentry.spec b/sysSentry.spec index 1497a74..3d21a4b 100644 --- a/sysSentry.spec +++ b/sysSentry.spec @@ -4,7 +4,7 @@ Summary: System Inspection Framework Name: sysSentry Version: 1.0.2 -Release: 15 +Release: 16 License: Mulan PSL v2 Group: System Environment/Daemons Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz @@ -26,6 +26,7 @@ Patch13: optimize-the-handing-of-cat-cli-error-msg-in-cpu_sentry.patch Patch14: over-threshold-should-be-warn-level-log-in-cat-cli.patch Patch15: add-separator-to-err-info.patch Patch16: remove-threshold-max-cpu-cores.patch +Patch17: add-hbm-online-repair.patch BuildRequires: cmake gcc-c++ BuildRequires: python3 python3-setuptools @@ -62,6 +63,16 @@ Recommends: ipmitool %description -n cpu_sentry This package provides CPU fault detection +%package -n hbm_online_repair +Summary: hbm_online_repair for the sysSentry +Provides: hbm_online_repair = %{version} 
+BuildRequires: libtraceevent-devel +Requires: libtraceevent ipmitool +Requires: sysSentry = %{version}-%{release} + +%description -n hbm_online_repair +This package provides hbm_online_repair for the sysSentry. + %prep %autosetup -n %{name}-%{version} -p1 @@ -81,6 +92,11 @@ make popd popd +# hbm_online_repair +pushd src/c/hbm_online_repair +make +popd + %install # sysSentry mkdir -p %{buildroot}%{_bindir} @@ -109,6 +125,12 @@ install config/plugins/cpu_sentry.ini %{buildroot}/etc/sysSentry/plugins/cpu_sen install src/c/catcli/catlib/build/cat-cli %{buildroot}%{_bindir}/cat-cli install src/c/catcli/catlib/build/plugin/cpu_patrol/libcpu_patrol.so %{buildroot}%{_libdir} +# hbm_online_repair +mkdir -p %{buildroot}/etc/sysconfig/ +install config/tasks/hbm_online_repair.mod %{buildroot}/etc/sysSentry/tasks/ +install src/c/hbm_online_repair/hbm_online_repair %{buildroot}%{_bindir} +install src/c/hbm_online_repair/hbm_online_repair.env %{buildroot}/etc/sysconfig/hbm_online_repair.env + chrpath -d %{buildroot}%{_bindir}/cat-cli chrpath -d %{buildroot}%{_libdir}/libcpu_patrol.so @@ -166,6 +188,11 @@ rm -rf %{buildroot} %exclude %{python3_sitelib}/syssentry/cpu_* %exclude %{python3_sitelib}/syssentry/*/cpu_* +# hbm repair module +%exclude %{_sysconfdir}/sysSentry/tasks/hbm_online_repair.mod +%exclude %{python3_sitelib}/syssentry/bmc_* +%exclude %{python3_sitelib}/syssentry/*/bmc_* + %files -n libxalarm %attr(0550,root,root) %{_libdir}/libxalarm.so @@ -182,7 +209,19 @@ rm -rf %{buildroot} %attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/cpu_sentry.ini %attr(0550,root,root) %{python3_sitelib}/syssentry/cpu_* +%files -n hbm_online_repair +%attr(0550,root,root) %{_bindir}/hbm_online_repair +%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysconfig/hbm_online_repair.env +%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/hbm_online_repair.mod +%attr(0550,root,root) %{python3_sitelib}/syssentry/bmc_alarm.py + %changelog +* Mon Oct 21 2024 luckky - 1.0.2-16 +- Type:requirement +- CVE:NA +- SUG:NA +- DESC:add hbm_online_repair + * Wed Sep 25 2024 shixuantong - 1.0.2-15 - Type:bugfix - CVE:NA -- Gitee
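
A note for reviewers on the byte layout assumed by parse_hbmc_report() above: each byte of the BMC payload is addressed by index through convert_hex_char_to_int(), and the 4-byte row/column addresses are byte-swapped with reverse_byte() before being handed to ipmitool. The standalone sketch below restates those two helpers under the assumption that HEX_CHAR_LEN is 2 (two hex characters per byte); that constant and the sample payload are not part of this hunk, so treat the values as illustrative only. Unlike the patched helper, this version raises immediately when the payload is too short instead of logging and falling through.

# Standalone illustration of the hex-string helpers used by parse_hbmc_report().
# Assumption: HEX_CHAR_LEN == 2, i.e. each payload byte is encoded as two hex
# characters with no separators. The payload below is invented.

HEX_CHAR_LEN = 2  # assumed value; the real constant lives earlier in bmc_alarm.py


def convert_hex_char_to_int(data: str, bit: int) -> int:
    """Return byte number `bit` of a hex string as an int."""
    if len(data) < (bit + 1) * HEX_CHAR_LEN:
        raise ValueError(f"data too short for byte index {bit}")
    return int(data[bit * HEX_CHAR_LEN:(bit + 1) * HEX_CHAR_LEN], 16)


def reverse_byte(chunk):
    """Reverse a 4-item sequence, e.g. to swap a little-endian row/column address."""
    return chunk[3], chunk[2], chunk[1], chunk[0]


if __name__ == "__main__":
    payload = "02000106aabbccdd"                            # invented example
    print(convert_hex_char_to_int(payload, 0))              # 2
    print(convert_hex_char_to_int(payload, 3))              # 6
    print(reverse_byte(["0xAA", "0xBB", "0xCC", "0xDD"]))   # ('0xDD', '0xCC', '0xBB', '0xAA')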
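
The listener added by bmc_fd_create() and bmc_recv() expects a SOCK_STREAM client on /var/run/sysSentry/bmc.sock that sends BMC_DATA_HEAD followed by a hex-encoded repair report. BMC_DATA_HEAD and the exact report layout are defined earlier in bmc_alarm.py and are not visible in this hunk, so the header and payload in the sketch below are placeholders rather than working values; it only shows how a local test client would talk to the socket.

# Hypothetical test client for the bmc.sock listener added in syssentry.py and
# bmc_alarm.py. The header string and payload bytes are placeholders; substitute
# the real BMC_DATA_HEAD and a report that parse_hbmc_report() accepts.
import socket

BMC_SOCKET_PATH = "/var/run/sysSentry/bmc.sock"   # path used by bmc_fd_create()
DATA_HEAD = "BMC"                                 # placeholder, not the real value
PAYLOAD_HEX = "00" * 40                           # placeholder report body


def send_fake_report():
    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
        sock.connect(BMC_SOCKET_PATH)             # the daemon must be running (root-owned socket)
        sock.sendall((DATA_HEAD + PAYLOAD_HEX).encode())


if __name__ == "__main__":
    send_fake_report()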
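
The main_loop() rework above replaces the per-socket cleanup ladder with a shared fd_list, a single close_all_fd() bail-out path, and one registration loop over the epoll instance. The minimal sketch below shows the same shape in isolation with invented socket paths; it is not sysSentry code, just the pattern the refactor follows.

# Minimal sketch of the fd_list/epoll pattern used by the reworked main_loop():
# create each listener, close everything created so far if one fails, register
# the survivors once, then dispatch on fileno(). Socket paths are invented.
import os
import select
import socket

fd_list = []


def close_all_fd():
    for fd in fd_list:
        fd.close()


def make_listener(path):
    try:
        fd = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        if os.path.exists(path):
            os.remove(path)
        fd.bind(path)
        fd.listen(5)
        return fd
    except OSError:
        return None


def main_loop():
    for path in ("/tmp/demo_a.sock", "/tmp/demo_b.sock"):   # invented paths
        fd = make_listener(path)
        if not fd:
            close_all_fd()
            return
        fd_list.append(fd)

    epoll_fd = select.epoll()
    for fd in fd_list:
        epoll_fd.register(fd.fileno(), select.EPOLLIN)

    while True:
        for event_fd, _ in epoll_fd.poll(1):
            for fd in fd_list:
                if event_fd == fd.fileno():
                    conn, _ = fd.accept()
                    conn.close()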