From 18053172e81dceb1e3d00dac12576a657cd891c2 Mon Sep 17 00:00:00 2001
From: maning202007
Date: Sat, 29 Jul 2023 11:18:21 +0800
Subject: [PATCH] DumpAnalyser support statistic file

---
 mindinsight/debugger/api/dump_analyzer.py | 271 ++++++++++++++++++++++
 mindinsight/debugger/api/statistic.py     |  61 ++++++
 2 files changed, 332 insertions(+)
 create mode 100644 mindinsight/debugger/api/statistic.py

diff --git a/mindinsight/debugger/api/dump_analyzer.py b/mindinsight/debugger/api/dump_analyzer.py
index 0c7551ad..8cedf631 100644
--- a/mindinsight/debugger/api/dump_analyzer.py
+++ b/mindinsight/debugger/api/dump_analyzer.py
@@ -15,12 +15,17 @@
 """Debugger python API."""
 import os.path
 from typing import Iterable
+import csv
+import stat
+from pathlib import Path
+import numpy as np
 
 import mindinsight
 from mindinsight.debugger.api.conditions import WatchpointHit, HitDetail, WatchpointHandle, WatchpointHitImpl
 from mindinsight.debugger.api.debugger_engine import DebuggerEngine
 from mindinsight.debugger.api.debugger_tensor import DebuggerTensor, DebuggerTensorImpl
 from mindinsight.debugger.api.node import Node, NodeImpl, NodeUniqueId
+from mindinsight.debugger.api.statistic import TensorStatistic, SummaryStatistic
 from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError
 from mindinsight.debugger.common.log import LOGGER as log
 from mindinsight.debugger.common.log import setup_logger
@@ -312,6 +317,272 @@ class DumpAnalyzer:
             tensors.extend(node.get_output_tensors(slots=slots, iterations=iterations))
         return tensors
 
+    def select_tensor_statistics(
+            self,
+            iterations=None,
+            ranks=None):
+        """
+        Select tensor statistics.
+
+        Select the statistics dumped in the statistic files of the dump directory
+        according to the specified filter condition, see the parameters for detail.
+
+        Args:
+            iterations (Union[int, list[int], None], optional): The iteration(s) to select. ``None`` means all dumped
+                iterations will be selected. Default: ``None``.
+            ranks (Union[int, list[int], None], optional): The rank(s) to select. ``None`` means all ranks
+                will be selected. Default: ``None``.
+
+        Returns:
+            dict, the matched statistics. Each statistic is one row of a dumped statistic file,
+            as a dict keyed by the column title. The format is:
+            {rank_id: {
+                iteration_id: [statistic_row, ...],
+                ...
+              }
+              ...
+            }
+
+        Examples:
+            >>> from mindinsight.debugger import DumpAnalyzer
+            >>> my_run = DumpAnalyzer(dump_dir="/path/to/your/dump_dir_with_dump_data")
+            >>> statistics = my_run.select_tensor_statistics(ranks=[0])
+        """
+        ranks = self._get_iterable_ranks(ranks)
+        dumped_iterations = self.get_iterations(ranks)
+        iterations = parse_param_to_iterable_obj(iterations, 'iterations', dumped_iterations)
+
+        tensor_statistics = {}
+        for rank_id in ranks:
+            # Only the requested iterations that were really dumped for this rank are read.
+            tensor_statistics[rank_id] = {
+                iteration_id: self._get_statistic(rank_id=rank_id, iteration_id=iteration_id)
+                for iteration_id in self.get_iterations(ranks=[rank_id])
+                if iteration_id in iterations
+            }
+        return tensor_statistics
+
+    def _get_statistic(self, rank_id, iteration_id):
+        """
+        Get the statistics dumped for the corresponding rank and iteration.
+
+        Returns:
+            list[dict], the rows of every ``statistic.csv`` found in the ``iteration_id`` directory of
+            each sub-directory of the net directory of the rank, each row being a dict keyed by the
+            column title. The list is empty when no data or no statistic file is available.
+        """
+        tensor_statistics = []
+        if not self._data_loader.has_data:
+            return tensor_statistics
+        net_name = self._data_loader.get_net_name()
+        net_dir = Path(os.path.join(self._dump_dir, 'rank_' + str(rank_id), net_name)).absolute()
+        if not net_dir.is_dir():
+            log.warning("The dump directory %s does not exist, no statistic is loaded.", net_dir)
+            return tensor_statistics
+        for graph_dir in net_dir.iterdir():
+            statistic_file_path = graph_dir / str(iteration_id) / "statistic.csv"
+            # A step directory without statistic file is skipped instead of failing the whole query.
+            if not statistic_file_path.is_file():
+                continue
+            # newline='' lets the csv module handle the line endings itself.
+            with open(statistic_file_path, 'r', newline='') as f:
+                # DictReader consumes the title line and yields one dict per data row.
+                tensor_statistics.extend(csv.DictReader(f))
+        return tensor_statistics
+
+    def compute_statistic(self, debugger_tensors):
+        """
+        Compute the statistic of the given tensors.
+
+        Args:
+            debugger_tensors (Iterable[DebuggerTensor]): The tensors to compute the statistic for.
+
+        Returns:
+            dict, the computed TensorStatistic objects grouped by rank and iteration. The format is
+            ``{rank_id: {iteration_id: [TensorStatistic, ...]}}``.
+        """
+        statistics = {}
+        for tensor in debugger_tensors:
+            static_in_rank = statistics.setdefault(tensor.rank, {})
+            static_in_iter = static_in_rank.setdefault(tensor.iteration, [])
+            static_in_iter.append(self._compute_statistic(tensor))
+        return statistics
+
+    def _compute_statistic(self, debugger_tensor):
+        """
+        Compute the tensor statistic for one tensor.
+
+        Returns:
+            TensorStatistic, the statistic of the tensor. Only the fields identifying the tensor
+            are filled when the tensor value is not available.
+        """
+        statistic = TensorStatistic()
+        # The text after the last '/' of the node name is used as op name, and the part of it
+        # before the first '-' as op type.
+        short_name = debugger_tensor.node.name.split('/')[-1]
+        statistic.op_name = short_name
+        statistic.op_type = short_name.split('-')[0]
+        statistic.io = 'output'
+        statistic.slot = debugger_tensor.slot
+        tensor_value = debugger_tensor.value()
+        if tensor_value is None:
+            return statistic
+        statistic.data_size = tensor_value.nbytes
+        statistic.data_type = tensor_value.dtype
+        statistic.shape = tensor_value.shape
+        statistic.min_value = np.amin(tensor_value)
+        statistic.max_value = np.amax(tensor_value)
+        statistic.avg_value = tensor_value.mean()
+        statistic.count = tensor_value.size
+        # NOTE(review): negative/positive "zero" count is assumed to follow the statistic file
+        # dumped by MindSpore, i.e. the number of elements below/above zero -- please confirm.
+        statistic.negative_zero_count = np.count_nonzero(tensor_value < 0)
+        statistic.positive_zero_count = np.count_nonzero(tensor_value > 0)
+        statistic.zero_count = np.count_nonzero(tensor_value == 0)
+        statistic.negative_inf_count = np.count_nonzero(np.isneginf(tensor_value))
+        statistic.positive_inf_count = np.count_nonzero(np.isposinf(tensor_value))
+        statistic.nan_count = np.count_nonzero(np.isnan(tensor_value))
+        return statistic
+
+    def summary_statistics(self, statistics):
+        """
+        Summarize the statistics of different ranks and iterations for each tensor.
+
+        Args:
+            statistics (dict): The statistics grouped by rank and iteration. Each statistic is either a
+                TensorStatistic or a dict row read from a statistic file.
+
+        Returns:
+            dict, the SummaryStatistic of each tensor, keyed by the tensor name ``op_name:io:slot``.
+        """
+        summary_statistics = {}
+        for rank_id, statistics_in_rank in statistics.items():
+            log.warning("process statistics in rank, rank_id is: %s", rank_id)
+            for iteration_id, statistics_in_iteration in statistics_in_rank.items():
+                log.warning("process statistics in iteration, iteration_id is: %s", iteration_id)
+                for statistic in statistics_in_iteration:
+                    if isinstance(statistic, TensorStatistic):
+                        self._put_tensor_statistic_to_summarystatistics(statistic, summary_statistics)
+                    else:
+                        self._put_dict_statistic_to_summarystatistics(statistic, summary_statistics)
+        return summary_statistics
+
+    def _put_dict_statistic_to_summarystatistics(self, statistic, summary_statistics):
+        """Put dict_statistic to summarized statistics, used for Statistic of dict type from statistic file."""
+        self._accumulate_summary(
+            summary_statistics,
+            op_type=statistic.get('Op Type', 'unkown'),
+            op_name=statistic.get('Op Name', 'unkown'),
+            io=statistic.get('IO', 'output'),
+            slot=statistic.get('Slot', 0),
+            # Values read from a csv file are str, so the counts are converted before being compared.
+            inf_count=int(statistic.get('Positive Inf Count', 0)) + int(statistic.get('Negative Inf Count', 0)),
+            nan_count=int(statistic.get('NaN Count', 0)))
+
+    def _put_tensor_statistic_to_summarystatistics(self, statistic, summary_statistics):
+        """Put tensor_statistic to summarized statistics, used for TensorStatistic from tensor file."""
+        self._accumulate_summary(
+            summary_statistics,
+            op_type=statistic.op_type,
+            op_name=statistic.op_name,
+            io=statistic.io,
+            slot=statistic.slot,
+            inf_count=statistic.positive_inf_count + statistic.negative_inf_count,
+            nan_count=statistic.nan_count)
+
+    @staticmethod
+    def _accumulate_summary(summary_statistics, op_type, op_name, io, slot, inf_count, nan_count):
+        """Count one dumped iteration of the tensor ``op_name:io:slot`` into its SummaryStatistic."""
+        tensor_name = op_name + ":" + io + ":" + str(slot)
+        summary_statistic = summary_statistics.setdefault(tensor_name, SummaryStatistic())
+        if not summary_statistic.op_type:
+            summary_statistic.op_type = op_type
+        if not summary_statistic.op_name:
+            summary_statistic.op_name = op_name
+        if not summary_statistic.tensor_name:
+            summary_statistic.tensor_name = tensor_name
+        summary_statistic.total_dump_iterations += 1
+        if inf_count > 0:
+            summary_statistic.inf_iterations += 1
+        if nan_count > 0:
+            summary_statistic.nan_iterations += 1
+        # An iteration can be counted as both inf and nan, but it is normal only when it is neither.
+        if inf_count == 0 and nan_count == 0:
+            summary_statistic.normal_iterations += 1
+
+    def export_statistics(self, tensor_statistics, out_path="./"):
+        """
+        Export the tensor statistics to the out_path.
+
+        Args:
+            tensor_statistics (dict): The statistics to export. When its keys are int, it is exported as
+                statistics grouped by rank and iteration; when its keys are str, as summarized statistics.
+            out_path (str): The directory to export to, which is created when missing. Default: ``"./"``.
+        """
+        if not tensor_statistics:
+            log.warning("The given tensor_statistics is empty.")
+            return
+        os.makedirs(out_path, exist_ok=True)
+        # The type of the first key tells the two supported data structures apart.
+        first_key = next(iter(tensor_statistics))
+        if isinstance(first_key, int):
+            for rank_id, statistics_in_rank in tensor_statistics.items():
+                for iteration_id, statistics_in_iteration in statistics_in_rank.items():
+                    self._export_statistics_in_one_iteration(statistics_in_iteration, rank_id, iteration_id, out_path)
+        elif isinstance(first_key, str):
+            self._export_summary_statistics(tensor_statistics, out_path)
+        else:
+            log.warning("Invalid tensor_statistics data structure.")
+
+    def _export_summary_statistics(self, tensor_statistics, out_path):
+        """Export the summarized statistics to the file ``statistics_summary.csv`` in out_path."""
+        statistic_file_path = os.path.join(out_path, "statistics_summary.csv")
+        # The file is created with read and write permission for its owner only.
+        flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
+        modes = stat.S_IWUSR | stat.S_IRUSR
+        # Every column is named after the SummaryStatistic attribute it is read from.
+        statistic_header = ["op_type", "op_name", "tensor_name", "inf_iterations", "nan_iterations",
+                            "total_dump_iterations", "normal_iterations"]
+        with os.fdopen(os.open(statistic_file_path, flags, modes), 'w', newline='') as f:
+            csv_writer = csv.writer(f)
+            csv_writer.writerow(statistic_header)
+            for statistic in tensor_statistics.values():
+                csv_writer.writerow([getattr(statistic, column) for column in statistic_header])
+        log.info("export summarised statistics to file: %s", statistic_file_path)
+
+    def _export_statistics_in_one_iteration(self, tensor_statistics_in_one_iteration, rank_id, iteration_id, out_path):
+        """
+        Export tensor_statistics in one iteration to ``statistics.csv`` in ``out_path/rank_id/iteration_id``.
+
+        Each statistic is either a TensorStatistic or a dict row keyed by the column title.
+        """
+        iteration_path = os.path.join(out_path, str(rank_id), str(iteration_id))
+        os.makedirs(iteration_path, exist_ok=True)
+        statistic_file_path = os.path.join(iteration_path, "statistics.csv")
+        # The file is created with read and write permission for its owner only.
+        flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
+        modes = stat.S_IWUSR | stat.S_IRUSR
+        # Pairs of the column title and the TensorStatistic attribute holding the value of the column.
+        columns = (
+            ("Op Type", "op_type"), ("Op Name", "op_name"), ("Task ID", "task_id"), ("Stream ID", "stream_id"),
+            ("Timestamp", "time_stamp"), ("IO", "io"), ("Slot", "slot"), ("Data Size", "data_size"),
+            ("Data Type", "data_type"), ("Shape", "shape"), ("Max Value", "max_value"), ("Min Value", "min_value"),
+            ("Avg Value", "avg_value"), ("Count", "count"), ("Negative Zero Count", "negative_zero_count"),
+            ("Positive Zero Count", "positive_zero_count"), ("NaN Count", "nan_count"),
+            ("Negative Inf Count", "negative_inf_count"), ("Positive Inf Count", "positive_inf_count"),
+            ("Zero Count", "zero_count"))
+        with os.fdopen(os.open(statistic_file_path, flags, modes), 'w', newline='') as f:
+            csv_writer = csv.writer(f)
+            csv_writer.writerow([title for title, _ in columns])
+            for statistic in tensor_statistics_in_one_iteration:
+                if isinstance(statistic, dict):
+                    # Rows loaded from a statistic file are written back by column title.
+                    statistic_line = [statistic.get(title, '') for title, _ in columns]
+                else:
+                    statistic_line = [getattr(statistic, attribute) for _, attribute in columns]
+                csv_writer.writerow(statistic_line)
+        log.info("export statistics to file: %s", statistic_file_path)
+
     def get_iterations(self, ranks=None) -> Iterable[int]:
         """
         Get available iterations which have data dumped in this run.
diff --git a/mindinsight/debugger/api/statistic.py b/mindinsight/debugger/api/statistic.py
new file mode 100644
index 00000000..098250db
--- /dev/null
+++ b/mindinsight/debugger/api/statistic.py
@@ -0,0 +1,61 @@
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorStatistic and SummaryStatistic for DebuggerTensor."""
+
+
+class TensorStatistic:
+    """The statistic of one tensor."""
+
+    def __init__(self, op_type='', op_name='', task_id=0, stream_id=0, time_stamp=0,
+                 io='', slot=0, data_size=0, data_type='', shape=(),
+                 max_value=float('-inf'), min_value=float('inf'), avg_value=0, count=0,
+                 negative_zero_count=0, positive_zero_count=0, nan_count=0,
+                 negative_inf_count=0, positive_inf_count=0, zero_count=0):
+        self.op_type = op_type
+        self.op_name = op_name
+        self.task_id = task_id
+        self.stream_id = stream_id
+        self.time_stamp = time_stamp
+        self.io = io
+        self.slot = slot
+        self.data_size = data_size
+        self.data_type = data_type
+        self.shape = shape
+        self.max_value = max_value
+        self.min_value = min_value
+        self.avg_value = avg_value
+        self.count = count
+        self.negative_zero_count = negative_zero_count
+        self.positive_zero_count = positive_zero_count
+        self.nan_count = nan_count
+        self.negative_inf_count = negative_inf_count
+        self.positive_inf_count = positive_inf_count
+        self.zero_count = zero_count
+
+
+class SummaryStatistic:
+    """The summary of the statistics of one tensor over its dumped iterations."""
+
+    def __init__(self, op_type='', op_name='', tensor_name='', inf_iterations=0, nan_iterations=0,
+                 overflow_iterations=0, total_dump_iterations=0, normal_iterations=0):
+        self.op_type = op_type
+        self.op_name = op_name
+        self.tensor_name = tensor_name
+        self.inf_iterations = inf_iterations
+        self.nan_iterations = nan_iterations
+        # NOTE(review): overflow_iterations is only stored here; nothing in this patch updates or exports it.
+        self.overflow_iterations = overflow_iterations
+        self.total_dump_iterations = total_dump_iterations
+        self.normal_iterations = normal_iterations
-- 
Gitee