From 82d55ae511cdad4efe012206ba8b308296fbc591 Mon Sep 17 00:00:00 2001 From: kail20gakki Date: Sun, 16 Feb 2025 19:46:56 +0800 Subject: [PATCH] add log --- test/profiler/test_npu_profiler.py | 2 +- .../profiler/analysis/_profiling_parser.py | 4 + .../analysis/prof_common_func/_log.py | 114 ++++++++++++++++++ .../prof_common_func/_task_manager.py | 2 +- .../analysis/prof_parse/_cann_file_parser.py | 25 +++- .../prof_parse/_fwk_cann_relation_parser.py | 18 ++- .../analysis/prof_parse/_fwk_file_parser.py | 21 +++- .../analysis/prof_view/_base_parser.py | 2 +- .../prof_view/_communication_parser.py | 16 ++- .../analysis/prof_view/_integrate_parser.py | 9 +- .../analysis/prof_view/_kernel_view_parser.py | 18 ++- .../prof_view/_memory_prepare_parser.py | 10 +- .../prof_view/_memory_timeline_parser.py | 7 +- .../analysis/prof_view/_memory_view_parser.py | 9 +- .../prof_view/_operator_view_parser.py | 9 +- .../analysis/prof_view/_stack_view_parser.py | 9 +- .../prof_view/_trace_step_time_parser.py | 9 +- .../analysis/prof_view/_trace_view_parser.py | 9 +- .../prof_view/cann_parse/_cann_analyze.py | 8 +- .../prof_view/cann_parse/_cann_export.py | 5 + .../prepare_parse/_fwk_pre_parser.py | 14 ++- .../prepare_parse/_relation_parser.py | 10 +- .../prof_db_parse/_communication_db_parser.py | 13 +- .../prof_view/prof_db_parse/_db_parser.py | 12 +- .../prof_db_parse/_fwk_api_db_parser.py | 8 +- .../prof_db_parse/_gc_record_db_parser.py | 10 +- .../prof_db_parse/_memory_db_parser.py | 9 +- .../prof_db_parse/_step_info_db_parser.py | 5 +- .../_trace_step_time_db_parser.py | 7 +- torch_npu/profiler/profiler_interface.py | 4 +- 30 files changed, 309 insertions(+), 89 deletions(-) create mode 100644 torch_npu/profiler/analysis/prof_common_func/_log.py diff --git a/test/profiler/test_npu_profiler.py b/test/profiler/test_npu_profiler.py index 03a1a82848..54f3526477 100644 --- a/test/profiler/test_npu_profiler.py +++ b/test/profiler/test_npu_profiler.py @@ -295,7 +295,7 @@ class 
TestNpuProfiler(TestCase): prof.stop() result_dir = os.path.join(self.results_work_path, "profiling_data") torch_npu.profiler.profiler.analyse(result_dir) - work_names = os.listdir(result_dir) + work_names = [p for p in os.listdir(result_dir) if p.endswith("ascend_pt")] os.environ["ASCEND_WORK_PATH"] = "" # only one device valid_work_name = len(work_names) == 1 and work_names[0].endswith("ascend_pt") diff --git a/torch_npu/profiler/analysis/_profiling_parser.py b/torch_npu/profiler/analysis/_profiling_parser.py index 1bfcac8fbe..db63d97fcd 100644 --- a/torch_npu/profiler/analysis/_profiling_parser.py +++ b/torch_npu/profiler/analysis/_profiling_parser.py @@ -6,6 +6,7 @@ from .prof_common_func._constant import Constant, print_info_msg, print_error_ms from .prof_common_func._cann_package_manager import CannPackageManager from .prof_common_func._path_manager import ProfilerPathManager from .prof_common_func._task_manager import ConcurrentTasksManager +from .prof_common_func._log import ProfilerLogger from .prof_config._parser_config import ParserConfig from .prof_parse._cann_file_parser import CANNFileParser from ._profiler_config import ProfilerConfig @@ -25,6 +26,8 @@ class ProfilingParser: self._output_path = os.path.join(profiler_path, Constant.OUTPUT_DIR) PathManager.remove_path_safety(self._output_path) PathManager.make_dir_safety(self._output_path) + ProfilerLogger.init(self._profiler_path, "ProfilingParser") + self.logger = ProfilerLogger.get_instance() @staticmethod def simplify_data(profiler_path: str, simplify_flag: bool): @@ -91,6 +94,7 @@ class ProfilingParser: self.run_parser() except Exception as err: print_error_msg(f"Failed to parsing profiling data. 
{err}") + self.logger.error("Failed to parsing profiling data, error: %s", str(err), exc_info=True) if self._analysis_type == Constant.TENSORBOARD_TRACE_HANDLER: self.simplify_data(self._profiler_path, ProfilerConfig().data_simplification) end_time = datetime.utcnow() diff --git a/torch_npu/profiler/analysis/prof_common_func/_log.py b/torch_npu/profiler/analysis/prof_common_func/_log.py new file mode 100644 index 0000000000..15ba7a80f9 --- /dev/null +++ b/torch_npu/profiler/analysis/prof_common_func/_log.py @@ -0,0 +1,114 @@ +import os +import logging +from logging.handlers import RotatingFileHandler +from datetime import datetime, timezone +from typing import Optional + +from torch_npu.utils._path_manager import PathManager + + +class ProfilerLogger: + """ + Profiler Logger class for managing log operations. + + This class provides a centralized logging mechanism for the profiler, + writing logs to file with rotation support. + + Attributes: + LOG_FORMAT: The format string for log messages + DATE_FORMAT: The format string for timestamps in log messages + DEFAULT_LOGGER_NAME: Default name for the logger instance + DEFAULT_LOG_DIR: Default directory name for log files + MAX_BYTES: Maximum size of each log file + BACKUP_COUNT: Number of backup files to keep + """ + + LOG_FORMAT = "[%(asctime)s] [%(levelname)s] [%(name)s:%(lineno)d] %(message)s" + DATE_FORMAT = "%Y-%m-%d-%H:%M:%S" + DEFAULT_LOGGER_NAME = "AscendProfiler" + DEFAULT_LOG_LEVEL = logging.INFO + DEFAULT_LOG_DIR = "logs" + # 10MB per file + MAX_BYTES = 10 * 1024 * 1024 + # Keep 3 backup files + BACKUP_COUNT = 3 + # logger instance + _instance = None + + @classmethod + def get_instance(cls) -> logging.Logger: + """Get the singleton logger instance.""" + if cls._instance is None: + raise RuntimeError("Logger not initialized. 
Call init first.") + return cls._instance + + @classmethod + def init(cls, output_dir: str, custom_name: Optional[str] = None) -> None: + """ + Initialize the logger with a rotating file handler. + + Args: + output_dir (str): Directory where log files will be stored + + Raises: + RuntimeError: If logger initialization fails + """ + if cls._instance is not None: + return + + # Create logs directory + log_dir = os.path.join(output_dir, cls.DEFAULT_LOG_DIR) + PathManager.make_dir_safety(log_dir) + + # Create logger + logger = logging.getLogger(cls.DEFAULT_LOGGER_NAME) + logger.setLevel(cls.DEFAULT_LOG_LEVEL) + logger.propagate = False + + # Create formatters + formatter = logging.Formatter(fmt=cls.LOG_FORMAT, datefmt=cls.DATE_FORMAT) + + # Add rotating file handler + timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d%H%M%S") + log_file = os.path.join( + log_dir, + ( + f"profiler_{timestamp}_{os.getpid()}_{custom_name}.log" + if custom_name + else f"profiler_{timestamp}_{os.getpid()}.log" + ), + ) + file_handler = RotatingFileHandler( + filename=log_file, + maxBytes=cls.MAX_BYTES, + backupCount=cls.BACKUP_COUNT, + encoding="utf-8", + ) + file_handler.setFormatter(formatter) + file_handler.setLevel(cls.DEFAULT_LOG_LEVEL) + logger.addHandler(file_handler) + + cls._instance = logger + logger.info("Profiler logger initialized at: %s", log_file) + + @classmethod + def set_level(cls, level: int) -> None: + """ + Set the logging level for the logger and all of its handlers. 
+ + Args: + level (int): Logging level (e.g., logging.DEBUG, logging.INFO) + """ + logger = cls.get_instance() + logger.setLevel(level) + for handler in logger.handlers: + handler.setLevel(level) + + @classmethod + def destroy(cls) -> None: + """Close and cleanup the logger.""" + if cls._instance: + for handler in cls._instance.handlers[:]: + handler.close() + cls._instance.removeHandler(handler) + cls._instance = None diff --git a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py index 7de29238af..b52763aa4e 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py @@ -156,7 +156,7 @@ class ConcurrentTasksManager: def finalize(self): for task_info in self.task_infos.values(): if task_info.status != TaskStatus.Succeed: - print_error_msg("Task %s has not run successfully." % task_info.task.name) + print_error_msg(f"Task [{task_info.task.__class__.__name__}] run failed.") self.__stop_task(task_info) if self.progress_bar: diff --git a/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py b/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py index 21a9831a60..b70b3e049d 100644 --- a/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py @@ -12,6 +12,7 @@ from ..prof_common_func._constant import Constant, print_warn_msg from ..prof_common_func._constant import convert_us2ns from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_common_func._file_manager import FileManager +from ..prof_common_func._log import ProfilerLogger __all__ = [] @@ -66,9 +67,12 @@ class CANNFileParser: } def __init__(self, profiler_path: str): + self._profiler_path = profiler_path self._cann_path = ProfilerPathManager.get_cann_path(profiler_path) self._file_dict = {} self._file_dispatch() + ProfilerLogger.init(profiler_path, 
"CANNFileParser") + self.logger = ProfilerLogger.get_instance() @classmethod def _json_load(cls, data: str) -> list: @@ -98,6 +102,7 @@ class CANNFileParser: @classmethod def combine_acl_to_npu(cls, timeline_data: list) -> dict: + logger = ProfilerLogger.get_instance() flow_dict, event_dict = {}, {} for data in timeline_data: if data.get("cat") == cls.HOST_TO_DEVICE and data.get("ph") == cls.START_FLOW: @@ -112,6 +117,13 @@ class CANNFileParser: ts = data.get("ts") unique_id = f"{pid}-{tid}-{ts}" event_dict[unique_id] = data + + if not flow_dict: + logger.error("There is no HostToDevice flow events in msprof timeline.") + + if not event_dict: + logger.error("There is no kernel events in msprof timeline.") + acl_to_npu_dict = {} for flow in flow_dict.values(): start_event = flow.get("start") @@ -123,8 +135,11 @@ class CANNFileParser: unique_id = f"{pid}-{tid}-{ts}" kernel_event = event_dict.get(unique_id) if not kernel_event: + logger.warning("The kernel event of unique_id(pid: %s, tid: %s, ts: %s) is not exist in msprof timeline.", + pid, tid, ts) continue acl_to_npu_dict.setdefault(convert_us2ns(start_event.get("ts", 0)), []).append(EventBean(kernel_event)) + return acl_to_npu_dict def get_timeline_all_data(self) -> list: @@ -133,6 +148,9 @@ class CANNFileParser: for msprof_file in msprof_file_list: data = self._json_load(FileManager.file_read_all(msprof_file, "rt")) timeline_data.extend(data) + + if not timeline_data: + self.logger.error("Get timeline all data failed, the timeline data is empty.") return timeline_data def get_analyze_communication_data(self, file_type: Enum) -> dict: @@ -153,21 +171,24 @@ class CANNFileParser: def get_localtime_diff(self) -> float: localtime_diff = 0 if not self._cann_path: + self.logger.error("Get localtime diff failed, the CANN path is not exist.") return localtime_diff start_info_path = ProfilerPathManager.get_start_info_path(self._cann_path) if not start_info_path: + self.logger.error("Get localtime diff failed, the start 
info path is not exist.") return localtime_diff try: info_json = ast.literal_eval(FileManager.file_read_all(start_info_path, "rt")) localtime_diff = convert_us2ns(info_json.get(Constant.CANN_BEGIN_TIME, 0)) - int( info_json.get(Constant.CANN_BEGIN_MONOTONIC, 0)) - except Exception: - print_warn_msg("Failed to get CANN localtime diff.") + except Exception as e: + self.logger.error("Failed to get CANN localtime diff, error: %s", str(e), exc_info=True) return localtime_diff def del_summary_and_timeline_data(self): device_path = ProfilerPathManager.get_device_path(self._cann_path) if not device_path: + self.logger.error("Delete summary and timeline data failed, the device path is not exist.") return summary_path = os.path.join(device_path, "summary") timeline_path = os.path.join(device_path, "timeline") diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py index d3ecdb3a2e..f39698fb0a 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py @@ -2,6 +2,7 @@ from ._fwk_file_parser import FwkFileParser from ..prof_bean._torch_op_node import TorchOpNode from ..prof_common_func._constant import Constant, print_error_msg from ._cann_file_parser import CANNFileParser +from ..prof_common_func._log import ProfilerLogger __all__ = [] @@ -9,6 +10,8 @@ __all__ = [] class FwkCANNRelationParser: def __init__(self, profiler_path: str): self._profiler_path = profiler_path + ProfilerLogger.init(self._profiler_path, "FwkCANNRelationParser") + self.logger = ProfilerLogger.get_instance() @classmethod def combine_kernel_dict(cls, acl_to_npu_dict: dict, dequeue_data_list: list): @@ -45,28 +48,31 @@ class FwkCANNRelationParser: def get_kernel_dict(self) -> dict: acl_to_npu_dict = CANNFileParser(self._profiler_path).get_acl_to_npu_data() if not acl_to_npu_dict: + print_error_msg("Failed to get acl to 
npu flow events.") return acl_to_npu_dict dequeue_data_list = FwkFileParser(self._profiler_path).get_dequeue_data() return self.combine_kernel_dict(acl_to_npu_dict, dequeue_data_list) def get_step_range(self, root_node: TorchOpNode, kernel_dict: dict): if not kernel_dict: - print_error_msg("Failed to get acl to npu flow events.") + self.logger.error("Get step range failed, the kernel dict is empty.") return [] - step_node_list = [] - for level1_node in root_node.child_node_list: - if level1_node.is_profiler_step(): - step_node_list.append(level1_node) + # Get ProfilerStep#x node + step_node_list = [node for node in root_node.child_node_list if node.is_profiler_step()] if not step_node_list: + self.logger.error("Get step range failed, the step node list is empty.") return [] + + # Gather flow events start time in each step node if not FwkFileParser(self._profiler_path).has_task_queue_data(): acl_start_time_list = sorted(list(kernel_dict.keys())) self._update_step_node_info(step_node_list, acl_start_time_list) + # Get step range on device by flow events step_range = [] for step_node in step_node_list: step_id = step_node.event.name.split("#")[-1] if not step_node.corr_id_total: - print_error_msg("Some step lost the correlation id information.") + self.logger.error("There is no flow events in %s range.", step_node.event.name) return [] corr_id_list = sorted(step_node.corr_id_total) min_index, max_index = 0, len(corr_id_list) - 1 diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py index dbc6c63208..f8010197a8 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py @@ -4,13 +4,14 @@ from collections import defaultdict from ..prof_bean._torch_op_bean import TorchOpBean from ..prof_common_func._binary_decoder import BinaryDecoder -from ..prof_common_func._constant import Constant, DbConstant, contact_2num, 
print_warn_msg +from ..prof_common_func._constant import Constant, contact_2num from ..prof_common_func._file_manager import FileManager from ..prof_common_func._file_tag import FileTag from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_common_func._tlv_decoder import TLVDecoder from ..prof_common_func._trace_event_manager import TraceEventManager from ..prof_common_func._tree_builder import TreeBuilder +from ..prof_common_func._log import ProfilerLogger from ..prof_config._fwk_file_parser_config import FwkFileParserConfig from ._python_trace_parser import PythonTraceParser @@ -23,6 +24,8 @@ class FwkFileParser: self._profiler_path = profiler_path self._file_list = {} self._file_dispatch() + ProfilerLogger.init(self._profiler_path, "FwkFileParser") + self.logger = ProfilerLogger.get_instance() def get_file_data_by_tag(self, file_tag: int) -> list: file_path = self._file_list.get(file_tag) @@ -41,6 +44,7 @@ class FwkFileParser: enqueue_data_list = [] op_mark_data = self.get_file_data_by_tag(FileTag.OP_MARK) if not op_mark_data: + self.logger.error("Get enqueue data failed, the op mark data is empty.") return enqueue_data_list op_mark_data.sort(key=lambda x: x.time_ns) tid_op_dict = defaultdict(lambda: defaultdict(list)) @@ -52,7 +56,8 @@ class FwkFileParser: continue start_op_list = tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - print_warn_msg("Enquque data match failed") + self.logger.warning("Enquque data match failed, the tid: %d, origin_name: %s is not exist.", + op_mark.tid, op_mark.origin_name) continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns @@ -65,6 +70,7 @@ class FwkFileParser: dequeue_data_list = [] op_mark_data = self.get_file_data_by_tag(FileTag.OP_MARK) if not op_mark_data: + self.logger.error("Get dequeue data failed, the op mark data is empty.") return dequeue_data_list op_mark_data.sort(key=lambda x: x.time_ns) tid_op_dict = defaultdict(lambda: defaultdict(list)) @@ 
-76,7 +82,8 @@ class FwkFileParser: continue start_op_list = tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - print_warn_msg("Dequque data match failed") + self.logger.warning("Dequque data match failed, the tid: %d, origin_name: %s is not exist.", + op_mark.tid, op_mark.origin_name) continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns @@ -103,7 +110,8 @@ class FwkFileParser: if op_mark.is_enqueue_end: start_op_list = enqueue_tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - print_warn_msg("Enquque data match failed") + self.logger.warning("Enquque data match failed, the tid: %d, origin_name: %s is not exist.", + op_mark.tid, op_mark.origin_name) continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns @@ -114,7 +122,8 @@ class FwkFileParser: if op_mark.is_dequeue_end: start_op_list = dequeue_tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - print_warn_msg("Dequque data match failed") + self.logger.warning("Dequque data match failed, the tid: %d, origin_name: %s is not exist.", + op_mark.tid, op_mark.origin_name) continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns @@ -126,6 +135,7 @@ class FwkFileParser: def get_torch_op_tree_node(self, only_fwk: bool = False) -> list: torch_op_list = self.get_file_data_by_tag(FileTag.TORCH_OP) if not torch_op_list: + self.logger.error("Get torch op tree node failed, the torch op data is empty.") return [] enqueue_data_list = [] if not only_fwk: @@ -136,6 +146,7 @@ class FwkFileParser: def get_fwk_trace_data(self): torch_op_data = self.get_file_data_by_tag(FileTag.TORCH_OP) if not torch_op_data: + self.logger.error("Get fwk trace data failed, the torch op data is empty.") return [] enqueue_data_list, dequeue_data_list = self.get_task_queue_data() pid = torch_op_data[0].pid diff --git a/torch_npu/profiler/analysis/prof_view/_base_parser.py 
b/torch_npu/profiler/analysis/prof_view/_base_parser.py index cb270a54f1..26dc595cd7 100644 --- a/torch_npu/profiler/analysis/prof_view/_base_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_base_parser.py @@ -30,7 +30,7 @@ class BaseParser(ConcurrentTask, ABC): self._output_path = None deps, mode = self._init_param(name) super(BaseParser, self).__init__(name, deps, mode) - + def _init_param(self, name: str) -> any: self._profiler_path = self._param_dict.get("profiler_path") self._output_path = self._param_dict.get("output_path") diff --git a/torch_npu/profiler/analysis/prof_view/_communication_parser.py b/torch_npu/profiler/analysis/prof_view/_communication_parser.py index e1751abfab..fff6d265d6 100644 --- a/torch_npu/profiler/analysis/prof_view/_communication_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_communication_parser.py @@ -3,11 +3,12 @@ from collections import defaultdict from ._base_parser import BaseParser from ..prof_bean._torch_op_node import TorchOpNode -from ..prof_common_func._constant import Constant, print_error_msg, print_warn_msg +from ..prof_common_func._constant import Constant, print_warn_msg from ..prof_common_func._file_manager import FileManager from ..prof_parse._cann_file_parser import CANNFileParser from ..prof_parse._cann_file_parser import CANNDataEnum from ..prof_common_func._constant import convert_us2ns +from ..prof_common_func._log import ProfilerLogger from ..prof_parse._fwk_cann_relation_parser import FwkCANNRelationParser __all__ = [] @@ -45,6 +46,8 @@ class CommunicationParser(BaseParser): self._root_node = TorchOpNode() self._kernel_dict = {} self.step_list = [] + ProfilerLogger.init(self._profiler_path, "CommunicationParser") + self.logger = ProfilerLogger.get_instance() @staticmethod def combine_size_distribution(op_dict: dict, total_dict: dict): @@ -63,8 +66,8 @@ class CommunicationParser(BaseParser): try: self._init_step_list(deps_data) self.generate_view() - except Exception: - print_error_msg("Failed to 
generate communication.json or communication_matrix.json.") + except Exception as e: + self.logger.error("Failed to generate communication.json or communication_matrix.json, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None @@ -272,7 +275,10 @@ class CommunicationParser(BaseParser): def _init_step_list(self, deps_data: dict): torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) if torch_op_node: - self.step_list = FwkCANNRelationParser(self._profiler_path).get_step_range(torch_op_node[0], deps_data.get( - Constant.RELATION_PARSER, {})) + kernels_dict = deps_data.get(Constant.RELATION_PARSER, {}) + if not kernels_dict: + self.logger.error("Init step list failed, the kernel dict is empty.") + self.step_list = FwkCANNRelationParser(self._profiler_path).get_step_range(torch_op_node[0], kernels_dict) + if not self.step_list: self.step_list = [{"step_id": None, "start_ts": 0, "end_ts": float('inf'), "comm_ops": {}}] diff --git a/torch_npu/profiler/analysis/prof_view/_integrate_parser.py b/torch_npu/profiler/analysis/prof_view/_integrate_parser.py index 01c6a00538..b1344bc0a2 100644 --- a/torch_npu/profiler/analysis/prof_view/_integrate_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_integrate_parser.py @@ -1,6 +1,7 @@ from ._base_parser import BaseParser -from ..prof_common_func._constant import Constant, print_error_msg +from ..prof_common_func._constant import Constant from ..prof_common_func._file_manager import FileManager +from ..prof_common_func._log import ProfilerLogger from ..prof_parse._cann_file_parser import CANNFileParser, CANNDataEnum from .._profiler_config import ProfilerConfig @@ -21,13 +22,15 @@ class IntegrateParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) + ProfilerLogger.init(self._profiler_path, "IntegrateParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: 
ProfilerConfig().load_info(self._profiler_path) self.generate_view() - except Exception: - print_error_msg("Failed to generate data_preprocess.csv or l2_cache.csv.") + except Exception as e: + self.logger.error("Failed to generate data_preprocess.csv or l2_cache.csv, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py index f35f7c6e6d..b06d7d3d72 100644 --- a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py @@ -1,7 +1,8 @@ from ._base_parser import BaseParser -from ..prof_common_func._constant import Constant, print_error_msg, convert_ns2us_str +from ..prof_common_func._constant import Constant, convert_ns2us_str from ..prof_common_func._csv_headers import CsvHeaders from ..prof_common_func._file_manager import FileManager +from ..prof_common_func._log import ProfilerLogger from ..prof_bean._op_summary_bean import OpSummaryBean from ..prof_parse._cann_file_parser import CANNFileParser, CANNDataEnum from ..prof_parse._fwk_cann_relation_parser import FwkCANNRelationParser @@ -16,6 +17,8 @@ class KernelViewParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self.step_range = [] + ProfilerLogger.init(self._profiler_path, "KernelViewParser") + self.logger = ProfilerLogger.get_instance() @classmethod def _project_map_for_headers(cls, input_headers: list): @@ -36,8 +39,8 @@ class KernelViewParser(BaseParser): ProfilerConfig().load_info(self._profiler_path) self._init_step_range(deps_data) self.generate_view() - except Exception: - print_error_msg("Failed to generate kernel_details.csv.") + except Exception as e: + self.logger.error("Failed to generate kernel_details.csv, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None @@ -68,8 
+71,13 @@ class KernelViewParser(BaseParser): def _init_step_range(self, deps_data: dict): torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) if torch_op_node: - step_range = FwkCANNRelationParser(self._profiler_path).get_step_range(torch_op_node[0], deps_data.get( - Constant.RELATION_PARSER, {})) + kernel_dict = deps_data.get(Constant.RELATION_PARSER, {}) + if not kernel_dict: + self.logger.error("Kernel view get step range failed, the kernel dict is empty.") + return + step_range = FwkCANNRelationParser(self._profiler_path).get_step_range(torch_op_node[0], kernel_dict) + if not step_range: + self.logger.error("Kernel view get step range failed, the step range is empty.") for step_data in step_range: step_id = step_data.get(Constant.STEP_ID) step_start = convert_ns2us_str(step_data.get(Constant.START_TS, 0)) diff --git a/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py index 52437738fd..1af90d759f 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py @@ -16,7 +16,6 @@ from collections import defaultdict from warnings import warn from math import ceil -import os from ._base_parser import BaseParser from ..prof_common_func._file_tag import FileTag @@ -24,8 +23,9 @@ from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_parse._fwk_file_parser import FwkFileParser from ..prof_bean._memory_use_bean import MemoryUseBean from ..prof_bean._op_mark_bean import OpMarkBean -from ..prof_common_func._constant import Constant, print_error_msg, print_warn_msg +from ..prof_common_func._constant import Constant, print_warn_msg from ..prof_common_func._constant import convert_ns2us_float, convert_ns2us_str +from ..prof_common_func._log import ProfilerLogger from .._profiler_config import ProfilerConfig __all__ = [] @@ -46,6 +46,8 @@ class MemoryPrepareParser(BaseParser): 
self._enqueue_record_dict = {} # {corrid: enqueue} self._dequeue_pids = set() self._dequeue_tids = set() + ProfilerLogger.init(self._profiler_path, "MemoryPrepareParser") + self.logger = ProfilerLogger.get_instance() @staticmethod def _find_torch_ops_by_binary_search(ts: int, torch_ops: list): @@ -63,8 +65,8 @@ class MemoryPrepareParser(BaseParser): try: self._torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) self.generate_view() - except Exception: - print_error_msg("Failed to generate pytorch memory data.") + except Exception as e: + self.logger.error("Failed to generate pytorch memory data, error: %s", str(e), exc_info=True) return Constant.FAIL, {} if self._incomplete_num > 0: print_warn_msg(f"{self._incomplete_num} memory record(s) are incomplete.") diff --git a/torch_npu/profiler/analysis/prof_view/_memory_timeline_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_timeline_parser.py index 79ba8768ed..986ec5935a 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_timeline_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_timeline_parser.py @@ -15,6 +15,7 @@ from torch.profiler._utils import traverse_dfs from ._base_parser import BaseParser from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_common_func._file_manager import FileManager +from ..prof_common_func._log import ProfilerLogger from ..prof_common_func._constant import Constant, print_warn_msg, print_error_msg from ..prof_parse._event_tree_parser import ( EventTree, @@ -1074,6 +1075,8 @@ class MemoryTimelineParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self._device = self._param_dict.get("device") + ProfilerLogger.init(self._profiler_path, "MemoryTimelineParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: @@ -1087,7 +1090,7 @@ class MemoryTimelineParser(BaseParser): mem_timeline.export_memory_timeline_json_raw(self._output_path, self._device) 
else: mem_timeline.export_memory_timeline_json(self._output_path, self._device) - except Exception: - print_error_msg(f"Failed to generate {self._output_path}.") + except Exception as e: + self.logger.error("Failed to generate %s, error: %s", self._output_path, str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None \ No newline at end of file diff --git a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py index ca70813380..fa834e543b 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py @@ -7,11 +7,12 @@ from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_parse._fwk_file_parser import FwkFileParser from ..prof_common_func._file_manager import FileManager from ..prof_common_func._constant import convert_ns2us_str -from ..prof_common_func._constant import Constant, print_error_msg +from ..prof_common_func._constant import Constant from ..prof_bean._npu_mem_bean import NpuMemoryBean from ..prof_bean._ge_op_memory_bean import GeOpMemoryBean from ..prof_bean._ge_memory_record_bean import GeMemoryRecordBean from ..prof_parse._cann_file_parser import CANNFileParser, CANNDataEnum +from ..prof_common_func._log import ProfilerLogger __all__ = [] @@ -33,6 +34,8 @@ class MemoryViewParser(BaseParser): self.ge_record_list = [] self.memory_data = [] self.component_list = [] + ProfilerLogger.init(self._profiler_path, "MemoryViewParser") + self.logger = ProfilerLogger.get_instance() @staticmethod def _get_data_from_file(file_set: set, file_type_bean: any, bean_list: bool = False) -> list: @@ -69,8 +72,8 @@ class MemoryViewParser(BaseParser): self.memory_data = deps_data.get(Constant.MEMORY_PREPARE, {}).get("memory_data", {}).get(Constant.Text, []) self.pta_record_list = deps_data.get(Constant.MEMORY_PREPARE, {}).get("pta_record_list", []) self.generate_view() - except 
Exception: - print_error_msg("Failed to generate operator_memory.csv or memory_record.csv.") + except Exception as e: + self.logger.error("Failed to generate operator_memory.csv or memory_record.csv, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py b/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py index 35d3e3edc2..f87e8dc8b8 100644 --- a/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py @@ -1,10 +1,11 @@ from ._base_parser import BaseParser -from ..prof_common_func._constant import Constant, print_error_msg +from ..prof_common_func._constant import Constant from ..prof_common_func._file_manager import FileManager from ..prof_common_func._constant import convert_ns2us_float from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_common_func._tree_builder import TreeBuilder +from ..prof_common_func._log import ProfilerLogger from ..prof_parse._fwk_file_parser import FwkFileParser __all__ = [] @@ -21,14 +22,16 @@ class OperatorViewParser(BaseParser): self._torch_op_node = [] self._root_node = None self._kernel_dict = {} + ProfilerLogger.init(self._profiler_path, "OperatorViewParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: self._torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) self._kernel_dict = deps_data.get(Constant.RELATION_PARSER, {}) self.generate_view() - except Exception: - print_error_msg("Failed to generate operator_details.csv.") + except Exception as e: + self.logger.error("Failed to generate operator_details.csv, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py b/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py index 6601f56b0a..2f793a8af8 
100644 --- a/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py @@ -3,11 +3,12 @@ import os from ..prof_common_func._constant import convert_ns2us_float from ._base_parser import BaseParser from ..prof_bean._torch_op_node import TorchOpNode -from ..prof_common_func._constant import Constant, print_error_msg +from ..prof_common_func._constant import Constant from ..prof_common_func._constant import print_warn_msg from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_common_func._tree_builder import TreeBuilder from ..prof_common_func._file_manager import FileManager +from ..prof_common_func._log import ProfilerLogger from ..prof_parse._fwk_cann_relation_parser import FwkCANNRelationParser from ..prof_parse._fwk_file_parser import FwkFileParser from ....utils._path_manager import PathManager @@ -22,13 +23,15 @@ class StackViewParser(BaseParser): self._root_node = None self._kernel_dict = {} self._metric = param_dict.get("metric") + ProfilerLogger.init(self._profiler_path, "StackViewParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: self._torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) self.generate_view() - except Exception: - print_error_msg("Failed to export stack.") + except Exception as e: + self.logger.error("Failed to export stack, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py index 8cb1df91e3..b9b7af71ad 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py @@ -1,8 +1,9 @@ from enum import Enum from ._base_parser import BaseParser -from ..prof_common_func._constant import Constant, print_error_msg +from 
..prof_common_func._constant import Constant from ..prof_common_func._file_manager import FileManager from ..prof_common_func._constant import convert_ns2us_float +from ..prof_common_func._log import ProfilerLogger from ..prof_parse._cann_file_parser import CANNFileParser from ..prof_parse._fwk_cann_relation_parser import FwkCANNRelationParser from ..prof_parse._fwk_file_parser import FwkFileParser @@ -30,6 +31,8 @@ class TraceStepTimeParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self.step_range = [] + ProfilerLogger.init(self._profiler_path, "TraceStepTimeParser") + self.logger = ProfilerLogger.get_instance() @classmethod def is_float_num(cls, num): @@ -131,8 +134,8 @@ class TraceStepTimeParser(BaseParser): try: self._init_step_range(deps_data) self.generate_view() - except Exception: - print_error_msg("Failed to generate step_trace_time.csv.") + except Exception as e: + self.logger.error("Failed to generate step_trace_time.csv, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py index 1c4d252fb8..f90100e869 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py @@ -1,11 +1,12 @@ import os from ._base_parser import BaseParser -from ..prof_common_func._constant import Constant, print_error_msg +from ..prof_common_func._constant import Constant from ..prof_common_func._file_manager import FileManager from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_common_func._trace_event_manager import TraceEventManager from ..prof_common_func._tree_builder import TreeBuilder +from ..prof_common_func._log import ProfilerLogger from ..prof_parse._fwk_cann_relation_parser import FwkCANNRelationParser from .._profiler_config import 
ProfilerConfig from ..prof_parse._cann_file_parser import CANNFileParser @@ -26,6 +27,8 @@ class TraceViewParser(BaseParser): self._trace_data = [] self._torch_op_node = [] self._root_node = None + ProfilerLogger.init(self._profiler_path, "TraceViewParser") + self.logger = ProfilerLogger.get_instance() @staticmethod def _prune_trace_by_level(json_data: list) -> list: @@ -51,8 +54,8 @@ class TraceViewParser(BaseParser): self._root_node = torch_op_node[0] self._torch_op_node = torch_op_node[1:] self.generate_view() - except Exception: - print_error_msg("Failed to generate trace_view.json.") + except Exception as e: + self.logger.error("Failed to generate trace_view.json, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py index cf8f05bfca..fb5cc72494 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py @@ -20,7 +20,8 @@ import subprocess from torch_npu.utils._error_code import ErrCode, prof_error from ...prof_common_func._constant import print_warn_msg, Constant, print_error_msg from ...prof_common_func._path_manager import ProfilerPathManager -from .._base_parser import BaseParser +from ...prof_common_func._log import ProfilerLogger +from ...prof_view._base_parser import BaseParser from ..._profiler_config import ProfilerConfig __all__ = [] @@ -33,6 +34,8 @@ class CANNAnalyzeParser(BaseParser): super().__init__(name, param_dict) self._cann_path = ProfilerPathManager.get_cann_path(self._profiler_path) self.msprof_path = shutil.which("msprof") + ProfilerLogger.init(self._profiler_path, "CANNAnalyzeParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: @@ -56,7 +59,8 @@ class CANNAnalyzeParser(BaseParser): if completed_analysis.returncode != 
self.COMMAND_SUCCESS: print_warn_msg("Failed to analyze CANN TEXT Profiling data.") - except Exception: + except Exception as e: print_error_msg("Failed to analyze CANN Profiling data.") + self.logger.error("Failed to analyze CANN Profiling data, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py index db6fb176a9..cff2628575 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py @@ -25,6 +25,8 @@ from ...prof_common_func._constant import Constant, print_warn_msg, print_error_ from ...prof_common_func._path_manager import ProfilerPathManager from .._base_parser import BaseParser from ..._profiler_config import ProfilerConfig +from ...prof_common_func._log import ProfilerLogger + __all__ = [] @@ -39,6 +41,8 @@ class CANNExportParser(BaseParser): super().__init__(name, param_dict) self._cann_path = ProfilerPathManager.get_cann_path(self._profiler_path) self.msprof_path = shutil.which("msprof") + ProfilerLogger.init(self._profiler_path, "CANNExportParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: @@ -67,6 +71,7 @@ class CANNExportParser(BaseParser): except Exception as err: print_error_msg(f"Failed to export CANN Profiling data. 
Error msg: {err}") + self.logger.error("Failed to export CANN Profiling data, error: %s", str(err), exc_info=True) return Constant.FAIL, None end_time = datetime.utcnow() print_info_msg(f"CANN profiling data parsed in a total time of {end_time - start_time}") diff --git a/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py b/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py index 2d3e3bbd16..6cc6f23516 100644 --- a/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py @@ -15,8 +15,9 @@ import os -from ...prof_common_func._constant import print_error_msg, Constant +from ...prof_common_func._constant import Constant from ...prof_common_func._file_manager import FileManager +from ...prof_common_func._log import ProfilerLogger from ...prof_parse._fwk_file_parser import FwkFileParser from .._base_parser import BaseParser @@ -27,6 +28,8 @@ class TracePreParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) + ProfilerLogger.init(self._profiler_path, "TracePreParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: @@ -34,7 +37,8 @@ class TracePreParser(BaseParser): trace_file_path = os.path.join(self._output_path, Constant.TRACE_VIEW_TEMP) if os.path.isdir( self._output_path) else self._output_path FileManager.create_prepare_trace_json_by_path(trace_file_path, fwk_trace_data) - except Exception: + except Exception as e: + self.logger.error("Failed to create prepare trace json, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None @@ -43,11 +47,13 @@ class TreeBuildParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) + ProfilerLogger.init(self._profiler_path, "TracePreParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: 
torch_op_node = FwkFileParser(self._profiler_path).get_torch_op_tree_node() - except Exception: - print_error_msg("Failed to build torch op tree.") + except Exception as e: + self.logger.error("Failed to build torch op tree, error: %s", str(e), exc_info=True) return Constant.FAIL, [] return Constant.SUCCESS, torch_op_node diff --git a/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py b/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py index 8932df64fa..e6eb02ddb8 100644 --- a/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from ...prof_common_func._constant import Constant, print_error_msg +from ...prof_common_func._constant import Constant +from ...prof_common_func._log import ProfilerLogger from ...prof_parse._fwk_cann_relation_parser import FwkCANNRelationParser from .._base_parser import BaseParser @@ -23,11 +23,13 @@ __all__ = [] class RelationParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) + ProfilerLogger.init(self._profiler_path, "RelationParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: kernel_dict = FwkCANNRelationParser(self._profiler_path).get_kernel_dict() - except Exception: - print_error_msg("Failed to get acl to npu flow dict.") + except Exception as e: + self.logger.error("Failed to get acl to npu flow dict, error: %s", str(e), exc_info=True) return Constant.FAIL, {} return Constant.SUCCESS, kernel_dict diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py index a77d1defbc..c827d32d1b 100644 --- 
a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py @@ -18,8 +18,9 @@ from enum import Enum from ...prof_parse._cann_file_parser import CANNDataEnum, CANNFileParser from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager -from ...prof_common_func._constant import convert_us2ns, print_error_msg, print_warn_msg +from ...prof_common_func._constant import convert_us2ns from ...prof_common_func._db_manager import DbManager +from ...prof_common_func._log import ProfilerLogger from .._communication_parser import CommunicationParser __all__ = [] @@ -74,13 +75,15 @@ class CommunicationDbParser(CommunicationParser): self.cann_comm_db_curs = None self.analysis_db_conn = None self.analysis_db_curs = None + ProfilerLogger.init(self._profiler_path, "CommunicationDbParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: self._init_step_list(deps_data) self.generate_view() - except Exception: - print_error_msg("Failed to generate communication table.") + except Exception as e: + self.logger.error("Failed to generate communication table, error: %s", str(e), exc_info=True) DbManager.destroy_db_connect(self.cann_comm_db_conn, self.cann_comm_db_curs) DbManager.destroy_db_connect(self.analysis_db_conn, self.analysis_db_curs) return Constant.FAIL, None @@ -110,7 +113,7 @@ class CommunicationDbParser(CommunicationParser): band_width_data, matrix_data, time_data = [], [], [] conn, curs = DbManager.create_connect_db(db_path) if not (conn and curs): - print_warn_msg(f"Failed to connect to db file: {db_path}") + self.logger.warning("Failed to connect to db file: %s", db_path) return band_width_data, matrix_data, time_data self.cann_comm_db_conn = conn self.cann_comm_db_curs = curs @@ -219,7 +222,7 @@ class CommunicationDbParser(CommunicationParser): db_path = os.path.join(output_path, DbConstant.DB_ANALYSIS) conn, 
curs = DbManager.create_connect_db(db_path) if not (conn and curs): - print_warn_msg(f"Failed to connect to db file: {db_path}") + self.logger.warning("Failed to connect to db file: %s", db_path) return self.analysis_db_conn = conn self.analysis_db_curs = curs diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py index 297c3a878f..3e333e891f 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py @@ -2,13 +2,13 @@ import os import re import shutil import json - from ...prof_common_func._utils import collect_env_vars from ...prof_common_func._path_manager import ProfilerPathManager from ...prof_common_func._file_manager import FileManager -from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager, print_error_msg, print_warn_msg +from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager, print_warn_msg from ...prof_common_func._db_manager import DbManager from ...prof_common_func._host_info import get_host_info +from ...prof_common_func._log import ProfilerLogger from .._base_parser import BaseParser from ..._profiler_config import ProfilerConfig @@ -22,6 +22,8 @@ class DbParser(BaseParser): self._ascend_db_path = os.path.join(self._output_path, DbConstant.DB_ASCEND_PYTORCH_PROFILER) self._conn = None self._cur = None + ProfilerLogger.init(self._profiler_path, "DbParser") + self.logger = ProfilerLogger.get_instance() def run(self, depth_data: dict): try: @@ -37,8 +39,8 @@ class DbParser(BaseParser): self.save_env_vars_info_to_db() self.save_profiler_metadata_to_db() DbManager.destroy_db_connect(self._conn, self._cur) - except RuntimeError: - print_error_msg("Failed to generate ascend_pytorch_profiler db file.") + except RuntimeError as e: + self.logger.error("Failed to generate ascend_pytorch_profiler db file, error: %s", str(e), 
exc_info=True) DbManager.destroy_db_connect(self._conn, self._cur) return Constant.FAIL, "" return Constant.SUCCESS, self._ascend_db_path @@ -90,7 +92,7 @@ class DbParser(BaseParser): try: profiler_metadata = json.loads(profiler_metadata) except json.JSONDecodeError as e: - print_warn_msg(f"profiler_metadata.json parse failed. {e}") + self.logger.warning("profiler_metadata.json parse failed. %s", str(e)) return data = [ [str(key), json.dumps(value)] for key, value in profiler_metadata.items() diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py index eede0fa6a4..6572f4a472 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py @@ -1,9 +1,7 @@ -import os - from enum import Enum from ...prof_common_func._db_manager import DbManager from ...prof_common_func._id_manager import Str2IdManager, ConnectionIdManager, CallChainIdManager -from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager, print_error_msg +from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager from .._base_parser import BaseParser from ...prof_parse._fwk_file_parser import FwkFileParser @@ -71,8 +69,8 @@ class FwkApiDbParser(BaseParser): fwk_api_data = FwkFileParser(self._profiler_path).get_fwk_api() self.get_api_data_for_db(fwk_api_data) self.save_api_data_to_db() - except Exception: - print_error_msg("Failed to generate framework api table.") + except Exception as e: + logging.error("Failed to generate framework api table, error: %s", str(e), exc_info=True) DbManager.destroy_db_connect(self._conn, self._cur) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_gc_record_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_gc_record_db_parser.py 
index f264f39baa..76a67b41bf 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_gc_record_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_gc_record_db_parser.py @@ -12,9 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from ...prof_common_func._db_manager import DbManager -from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager, print_error_msg +from ...prof_common_func._log import ProfilerLogger +from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager from ...prof_parse._fwk_file_parser import FwkFileParser from .._base_parser import BaseParser @@ -29,6 +29,8 @@ class GCRecordDbParser(BaseParser): self._cur = None self._db_path = "" self._gc_record_data = [] + ProfilerLogger.init(self._profiler_path, "GCRecordDbParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: @@ -36,8 +38,8 @@ class GCRecordDbParser(BaseParser): self.init_db_connect() self._gc_record_data = FwkFileParser(self._profiler_path).get_gc_record_db_data() self.save_gc_record_data_to_db() - except Exception: - print_error_msg("Failed to generate gc record table.") + except Exception as e: + self.logger.error("Failed to generate gc record table, error: %s", str(e), exc_info=True) DbManager.destroy_db_connect(self._conn, self._cur) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py index 30e9d93c0a..09ca81a73d 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py @@ -8,7 +8,8 @@ from ...prof_common_func._db_manager import DbManager from 
...prof_common_func._id_manager import Str2IdManager from ...prof_common_func._path_manager import ProfilerPathManager from ...prof_parse._cann_file_parser import CANNFileParser, CANNDataEnum -from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager, print_error_msg +from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager +from ...prof_common_func._log import ProfilerLogger from .._base_parser import BaseParser __all__ = [] @@ -65,6 +66,8 @@ class MemoryDbParser(BaseParser): self._pta_record_list = [] self._ge_record_list = [] self._record_list = [] + ProfilerLogger.init(self._profiler_path, "MemoryDbParser") + self.logger = ProfilerLogger.get_instance() @staticmethod def _combine_record(last_record, cur_record): @@ -86,8 +89,8 @@ class MemoryDbParser(BaseParser): self._pta_memory_bean_list = deps_data.get(Constant.MEMORY_PREPARE, {}).get("pta_record_list", []) self.init_pta_memory_data() self.save_memory_data_to_db() - except Exception: - print_error_msg("Failed to generate memory_record table or op_memory table.") + except Exception as e: + self.logger.error("Failed to generate memory_record table or op_memory table, error: %s", str(e), exc_info=True) DbManager.destroy_db_connect(self._conn, self._cur) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py index fb8d6c980c..fc871036a3 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import logging from .._base_parser import BaseParser from ...prof_bean._torch_op_node import TorchOpNode @@ -36,8 +37,8 @@ class StepInfoDbParser(BaseParser): self._db_path = deps_data.get(Constant.DB_PARSER, "") torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) step_range = self.get_step_range(torch_op_node[0] if torch_op_node else None) - except Exception: - print_error_msg("Failed to get step info from db.") + except Exception as e: + logging.error("Failed to get step info from db, error: %s", str(e), exc_info=True) DbManager.destroy_db_connect(self.db_conn, self.db_curs) return Constant.FAIL, [] return Constant.SUCCESS, step_range diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py index 96eb06f802..6b7e1bd37e 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py @@ -14,9 +14,10 @@ # limitations under the License. 
import os +import logging from enum import Enum from .._base_parser import BaseParser -from ...prof_common_func._constant import Constant, print_error_msg, print_warn_msg +from ...prof_common_func._constant import Constant, print_warn_msg from ...prof_common_func._constant import DbConstant, TableColumnsManager from ...prof_common_func._db_manager import DbManager from ...prof_common_func._constant import convert_ns2us_float @@ -84,8 +85,8 @@ class TraceStepTimeDbParser(BaseParser): self._init_step_range(deps_data) self._init_task_info_from_db() self.generate_view() - except Exception: - print_error_msg("Failed to generate step_trace_time table.") + except Exception as e: + logging.error("Failed to generate step_trace_time table, error: %s", str(e), exc_info=True) DbManager.destroy_db_connect(self.task_db_con, self.task_db_curs) DbManager.destroy_db_connect(self.analysis_db_con, self.analysis_db_curs) return Constant.FAIL, None diff --git a/torch_npu/profiler/profiler_interface.py b/torch_npu/profiler/profiler_interface.py index 3b1127ed0d..56107b57ac 100644 --- a/torch_npu/profiler/profiler_interface.py +++ b/torch_npu/profiler/profiler_interface.py @@ -134,8 +134,8 @@ class _ProfInterface: def analyse(self, analysis_type: str = Constant.TENSORBOARD_TRACE_HANDLER, output_path: str = None, **kwargs): try: NpuProfiler.analyse(self.prof_path, analysis_type, output_path, **kwargs) - except Exception: - print_warn_msg("Profiling data parsing failed.") + except Exception as e: + print_warn_msg(f"Profiling data parsing failed, error: {e}") def check_gc_detect_enable(self): return ProfilerActivity.CPU in self.activities and self.experimental_config.with_gc -- Gitee