diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index 560d939b345e169a84dd6a06f58749115e93333b..32f206fca9aa98a3d039039225ab459c53ba8ae6 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -113,9 +113,10 @@ class Const: RUN_UT = "run_ut" GRAD_PROBE = "grad_probe" STRUCTURE = "structure" + EXCEPTION_DUMP = "exception_dump" DUMP_PRECISION_HIGH = "high" DUMP_PRECISION_LOW = "low" - TASK_LIST = [TENSOR, STATISTICS, OVERFLOW_CHECK, FREE_BENCHMARK, RUN_UT, GRAD_PROBE, STRUCTURE] + TASK_LIST = [TENSOR, STATISTICS, OVERFLOW_CHECK, FREE_BENCHMARK, RUN_UT, GRAD_PROBE, STRUCTURE, EXCEPTION_DUMP] DUMP_DATA_COLLECTION_LIST = [STATISTICS, TENSOR, STRUCTURE] DUMP_DATA_MODE_LIST = [ALL, INPUT, OUTPUT, FORWARD, BACKWARD] DUMP_PRECISION_LIST = [DUMP_PRECISION_LOW, DUMP_PRECISION_HIGH] diff --git a/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py index eca71a4755f21625601f8beaeef2ebdd83af21ff..6473f8341d37227dd18616d2d2e239f10057c660 100644 --- a/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py +++ b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py @@ -43,6 +43,7 @@ from msprobe.mindspore.task_handler_factory import TaskHandlerFactory try: from mindspore._c_expression import _dump_start, _dump_stop, _dump_step, _set_init_iter, _dump_set_dynamic + import mindspore as ms except ImportError: enable_dynamic_kbyk_dump = False else: @@ -165,7 +166,8 @@ class PrecisionDebugger(BasePrecisionDebugger): instance.service.stop() else: Runtime.is_running = False - if enable_dynamic_kbyk_dump and instance.config.level_ori == Const.LEVEL_L2: + if enable_dynamic_kbyk_dump: + ms.runtime.synchronize() _dump_stop() if cls._is_kernel_dump() and _msprobe_c: _msprobe_c._PrecisionDebugger().stop() diff --git 
a/debug/accuracy_tools/msprobe/mindspore/exception_dump/__init__.py b/debug/accuracy_tools/msprobe/mindspore/exception_dump/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/msprobe/mindspore/exception_dump/exception_dump_tool_factory.py b/debug/accuracy_tools/msprobe/mindspore/exception_dump/exception_dump_tool_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..0eb7ed47879b26dfc7ae373c21d56254b7dd6d21 --- /dev/null +++ b/debug/accuracy_tools/msprobe/mindspore/exception_dump/exception_dump_tool_factory.py @@ -0,0 +1,51 @@ +# Copyright (c) 2025, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class ExceptionDumpToolFactory:
    """Select the exception-dump tool for a debugger configuration.

    ``tools`` maps a level to a per-execution-mode table of tool classes.
    A ``None`` entry marks a level/mode combination that has no
    exception-dump tool registered.
    """

    tools = {
        Const.CELL: {
            Const.GRAPH_KBYK_MODE: None,
            Const.GRAPH_GE_MODE: None,
            Const.PYNATIVE_MODE: None
        },
        Const.API: {
            Const.GRAPH_KBYK_MODE: None,
            Const.GRAPH_GE_MODE: None,
            Const.PYNATIVE_MODE: None
        },
        Const.KERNEL: {
            Const.GRAPH_KBYK_MODE: KernelGraphExceptionDump,
            Const.GRAPH_GE_MODE: None,
            Const.PYNATIVE_MODE: KernelGraphExceptionDump
        }
    }

    @staticmethod
    def create(config: DebuggerConfig):
        """Instantiate the tool registered for the configured level and mode.

        Args:
            config: Debugger configuration. ``config.level`` and
                ``config.execution_mode`` drive the lookup; the whole object
                is handed to the tool's constructor.

        Returns:
            A one-element tuple holding the constructed tool.

        Raises:
            Exception: ``config.level`` has no entry in ``tools``.
            ValueError: no tool is registered for ``config.execution_mode``
                at that level.
        """
        tools_by_mode = ExceptionDumpToolFactory.tools.get(config.level)
        if not tools_by_mode:
            raise Exception("Valid level is needed.")

        tool_cls = tools_by_mode.get(config.execution_mode)
        if not tool_cls:
            # Name the task that actually failed and carry the same text in
            # the exception, so callers that only see the ValueError still
            # learn which level/mode pair was rejected.
            message = (f"Exception dump is not supported in {config.execution_mode} mode "
                       f"when level is {config.level}.")
            logger.error(message)
            raise ValueError(message)
        return (tool_cls(config),)
class KernelGraphExceptionDump:
    """Build the dump-configuration JSON used by the exception-dump task.

    ``__init__`` assembles the settings in ``self.dump_json``; ``handle``
    writes them to disk and exports the file path through the
    ``MINDSPORE_DUMP_CONFIG`` environment variable.
    """

    # Top-level section names of the generated JSON document.
    COMMON_SETTINGS = "common_dump_settings"
    E2E_SETTINGS = "e2e_dump_settings"

    def __init__(self, config: "DebuggerConfig"):
        """Assemble ``self.dump_json`` from the debugger configuration.

        Args:
            config: Debugger configuration. The attributes read here are
                ``async_dump``, ``stat_cal_mode``,
                ``device_stat_precision_mode``, ``dump_path``, ``step`` and
                ``rank``.
        """
        e2e_settings = {
            "enable": not config.async_dump,
            "trans_flag": True,
        }
        # The statistics options are only emitted when both are configured.
        if config.stat_cal_mode and config.device_stat_precision_mode:
            e2e_settings["stat_calc_mode"] = config.stat_cal_mode
            e2e_settings["device_stat_precision_mode"] = config.device_stat_precision_mode

        # "iteration" is always "all" below, so a configured step list is
        # ignored; tell the user. Truthiness also tolerates ``step`` being None.
        if config.step:
            logger.warning("Step would change to all in this task.")

        # An empty (or missing) rank list falls back to devices 0-7.
        support_device = config.rank if config.rank else list(range(8))

        self.dump_json = {
            self.COMMON_SETTINGS: {
                "dump_mode": 0,
                "path": config.dump_path,
                "net_name": "Net",
                "iteration": "all",
                "saved_data": "tensor",
                "input_output": 0,
                "kernels": [],
                "support_device": support_device,
                # NOTE(review): presumably MindSpore's exception-dump mode;
                # confirm against the MindSpore dump configuration docs.
                "op_debug_mode": 4,
                "file_format": "npy",
            },
            self.E2E_SETTINGS: e2e_settings,
        }

    def handle(self):
        """Write the configuration file and point MindSpore at it.

        Creates the configured dump directory, saves ``self.dump_json`` as
        ``kernel_graph_exception_check.json`` inside it, and sets the
        ``MINDSPORE_DUMP_CONFIG`` environment variable to that file's path.
        """
        dump_dir = self.dump_json[self.COMMON_SETTINGS]["path"]
        create_directory(dump_dir)
        json_path = os.path.join(dump_dir, "kernel_graph_exception_check.json")
        save_json(json_path, self.dump_json, indent=4)
        logger.info(json_path + " has been created.")
        os.environ["MINDSPORE_DUMP_CONFIG"] = json_path
Const.GRAPH_KBYK_MODE: None, + Const.GRAPH_KBYK_MODE: KernelGraphOverflowCheck, Const.GRAPH_GE_MODE: KernelGraphOverflowCheck, Const.PYNATIVE_MODE: None } diff --git a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py index 10b74ea22b02d0668d0b3b17a569c5e1a67c1dd8..cad37cebe8b5de39e3954da0eea2edd26b79223e 100644 --- a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py +++ b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py @@ -18,6 +18,7 @@ from msprobe.mindspore.debugger.debugger_config import DebuggerConfig from msprobe.mindspore.dump.dump_tool_factory import DumpToolFactory from msprobe.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory from msprobe.mindspore.free_benchmark.self_check_tool_factory import SelfCheckToolFactory +from msprobe.mindspore.exception_dump.exception_dump_tool_factory import ExceptionDumpToolFactory class TaskHandlerFactory: @@ -25,7 +26,8 @@ class TaskHandlerFactory: Const.TENSOR: DumpToolFactory, Const.STATISTICS: DumpToolFactory, Const.OVERFLOW_CHECK: OverflowCheckToolFactory, - Const.FREE_BENCHMARK: SelfCheckToolFactory + Const.FREE_BENCHMARK: SelfCheckToolFactory, + Const.EXCEPTION_DUMP: ExceptionDumpToolFactory } @staticmethod