diff --git a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py index 8c0cceebed44f21aef291ed7526da7db4eb5a4ea..cd0ec79a9d8cd0e73b33effd13565346a71c607a 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py @@ -49,15 +49,12 @@ for version in torch_without_guard_version_list: if not IS_GPU and not torch_without_guard_version: from torch_npu.utils.device_guard import torch_device_guard as torch_npu_device_guard -device = collections.namedtuple('device', ['type', 'index']) - class Const: """ Class for const """ MODEL_TYPE = ['.onnx', '.pb', '.om'] - DIM_PATTERN = r"^(-?[0-9]+)(,-?[0-9]+)*" SEMICOLON = ";" COLON = ":" EQUAL = "=" @@ -332,7 +329,7 @@ def check_file_size(input_file, max_size): file_size = os.path.getsize(input_file) except OSError as os_error: print_error_log('Failed to open "%s". %s' % (input_file, str(os_error))) - raise CompareException(CompareException.INVALID_FILE_ERROR) + raise CompareException(CompareException.INVALID_FILE_ERROR) from os_error if file_size > max_size: print_error_log('The size (%d) of %s exceeds (%d) bytes, tools not support.' % (file_size, input_file, max_size)) @@ -390,7 +387,7 @@ def create_directory(dir_path): except OSError as ex: print_error_log( 'Failed to create {}.Please check the path permission or disk space .{}'.format(dir_path, str(ex))) - raise CompareException(CompareException.INVALID_PATH_ERROR) + raise CompareException(CompareException.INVALID_PATH_ERROR) from ex def execute_command(cmd): @@ -561,11 +558,11 @@ def check_input_file_valid(input_path, max_file_size=MAX_JSON_FILE_SIZE): if not os.access(input_path, os.R_OK): raise PermissionError('Input file %s is not readable!' % input_path) - check_path_pattern_valid(input_path) - if not check_path_length_valid(input_path): raise ValueError("The real path or file_name of input is too long.") + check_path_pattern_valid(input_path) + if os.path.getsize(input_path) > max_file_size: raise ValueError(f'The file is too large, exceeds {max_file_size // 1024 ** 2}MB') diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index 1b0a39a31c7f591de602f34dbd322dde22a161c8..84aa576553bb8ec0829e73ca5964e4ea872e3c9d 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -22,6 +22,8 @@ def compare_bool_tensor(cpu_output, npu_output): if cpu_shape != npu_shape: return CompareConst.NA, False, "" error_nums = (cpu_output != npu_output).sum() + if cpu_output.size == 0: + return CompareConst.NAN, False, "There is not cpu calculation result." error_rate = float(error_nums / cpu_output.size) return error_rate, error_rate == 0, "" @@ -157,11 +159,11 @@ def flatten_compare_result(result): def compare_core(bench_out, npu_out, alg): msg = "" if not isinstance(bench_out, type(npu_out)): - return [(CompareConst.NA, "bench and npu output type is different.")], False, CompareConst.NA, CompareConst.NA, CompareConst.NA + return [(CompareConst.NA, "bench and npu output type is different.")], False, [CompareConst.NA], [CompareConst.NA], [CompareConst.NA] if isinstance(bench_out, (list, tuple)): compare_result, test_success, bench_dtype, npu_dtype, shape = [], True, [], [], [] if len(bench_out) != len(npu_out): - return [(CompareConst.NA, "bench and npu output structure is different")], False, CompareConst.NA, CompareConst.NA, CompareConst.NA + return [(CompareConst.NA, "bench and npu output structure is different")], False, [CompareConst.NA], [CompareConst.NA], [CompareConst.NA] for b_out_i, n_out_i in zip(bench_out, npu_out): compare_result_i, test_success_i, bench_dtype_i, npu_dtype_i, shape_i = compare_core(b_out_i, n_out_i, alg) compare_result.append(compare_result_i) @@ -173,34 +175,35 @@ def compare_core(bench_out, npu_out, alg): b_keys, n_keys = set(bench_out.keys()), set(npu_out.keys()) if b_keys != n_keys: compare_result, test_success, bench_dtype, npu_dtype, shape = [(CompareConst.NA, "bench and npu output dict keys are different")], False, \ - CompareConst.NA, CompareConst.NA, CompareConst.NA - compare_result, test_success, bench_dtype, npu_dtype, shape = compare_core(list(bench_out.values()), list(npu_out.values()), alg) + [CompareConst.NA], [CompareConst.NA], [CompareConst.NA] + else: + compare_result, test_success, bench_dtype, npu_dtype, shape = compare_core(list(bench_out.values()), list(npu_out.values()), alg) elif isinstance(bench_out, torch.Tensor): copy_bench_out = bench_out.detach().clone() copy_npu_out = npu_out.detach().clone() - bench_dtype = str(copy_bench_out.dtype) - npu_dtype = str(copy_npu_out.dtype) - shape = tuple(npu_out.shape) + bench_dtype = [str(copy_bench_out.dtype)] + npu_dtype = [str(copy_npu_out.dtype)] + shape = [tuple(npu_out.shape)] if copy_npu_out.dtype == torch.bfloat16: copy_bench_out = copy_bench_out.to(torch.float32) copy_npu_out = copy_npu_out.to(torch.float32) compare_result, test_success, msg = compare_torch_tensor(copy_bench_out.numpy(), copy_npu_out.cpu().numpy(), alg) elif isinstance(bench_out, (bool, int, float, str)): compare_result, test_success, msg = compare_builtin_type(bench_out, npu_out) - bench_dtype = str(type(bench_out)) - npu_dtype = str(type(npu_out)) - shape = str(type(npu_out)) + bench_dtype = [str(type(bench_out))] + npu_dtype = [str(type(npu_out))] + shape = [str(type(npu_out))] elif bench_out is None: compare_result, test_success, msg = CompareConst.NA, True, "output is None" - bench_dtype = CompareConst.NA - npu_dtype = CompareConst.NA - shape = CompareConst.NA + bench_dtype = [CompareConst.NA] + npu_dtype = [CompareConst.NA] + shape = [CompareConst.NA] else: compare_result, test_success, msg = CompareConst.NA, True, "Unexpected output type \ in compare_core: {}".format(type(bench_out)) - bench_dtype = CompareConst.NA - npu_dtype = CompareConst.NA - shape = CompareConst.NA + bench_dtype = [CompareConst.NA] + npu_dtype = [CompareConst.NA] + shape = [CompareConst.NA] if isinstance(compare_result, list): compare_result = flatten_compare_result(compare_result) else: diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py index 0c8829b629c2bf4d44655bc9e480fc3c460f52b9..68c22cff2648e6ce14081ec75dac607574ff630a 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py @@ -4,7 +4,7 @@ from rich.table import Table from rich.console import Console from api_accuracy_checker.compare.algorithm import compare_core, cosine_sim, cosine_standard, get_max_rel_err, get_max_abs_err, \ compare_builtin_type, get_rel_err_ratio_thousandth, get_rel_err_ratio_ten_thousandth -from api_accuracy_checker.common.utils import get_json_contents, print_info_log, write_csv +from api_accuracy_checker.common.utils import get_json_contents, print_info_log, print_error_log, write_csv, CompareException from api_accuracy_checker.compare.compare_utils import CompareConst from api_accuracy_checker.common.config import msCheckerConfig @@ -144,7 +144,7 @@ class Comparator: else: is_bwd_success, bwd_compare_alg_results = self._compare_core_wrapper(bench_grad, npu_grad) else: - is_bwd_success, bwd_compare_alg_results = CompareConst.NA, None + is_bwd_success, bwd_compare_alg_results = False, None self.record_results(api_name, is_fwd_success, is_bwd_success, fwd_compare_alg_results, bwd_compare_alg_results) if is_fwd_success and is_bwd_success: self.test_result_cnt['success_num'] += 1 @@ -181,13 +181,17 @@ class Comparator: detailed_result_total = detailed_result test_success_total = test_success_total or max_abs_error_success # dtype加到所有指标的前面, 是否pass放到所有指标的后面 - for i, detailed_tuple in enumerate(detailed_result_total): - detailed_result = list(detailed_tuple) - detailed_result.insert(0, bench_dtype_total[i]) - detailed_result.insert(1, npu_dtype_total[i]) - detailed_result.insert(2, shape_total[i]) - detailed_result.append(str(test_success_total)) - detailed_result_total[i] = tuple(detailed_result) + try: + for i, detailed_tuple in enumerate(detailed_result_total): + detailed_result = list(detailed_tuple) + detailed_result.insert(0, bench_dtype_total[i]) + detailed_result.insert(1, npu_dtype_total[i]) + detailed_result.insert(2, shape_total[i]) + detailed_result.append(str(test_success_total)) + detailed_result_total[i] = tuple(detailed_result) + except IndexError as error: + print_error_log(f"There is index error.\n{str(error)}") + raise CompareException(CompareException.INVALID_DATA_ERROR) from error return test_success_total, detailed_result_total @staticmethod diff --git a/debug/accuracy_tools/api_accuracy_checker/dump/info_dump.py b/debug/accuracy_tools/api_accuracy_checker/dump/info_dump.py index 05354226f3346a2344cd96ed526e39b84f1b495e..a8071f422d6577e4c5c64a39f4d26a4cac3a3978 100644 --- a/debug/accuracy_tools/api_accuracy_checker/dump/info_dump.py +++ b/debug/accuracy_tools/api_accuracy_checker/dump/info_dump.py @@ -46,7 +46,7 @@ def write_json(file_path, data, indent=None): f.write(',\n') f.write(json.dumps(data, indent=indent)[1:-1] + '\n}') except Exception as e: - raise ValueError(f"Json save failed:{e}") + raise ValueError(f"Json save failed:{e}") from e finally: fcntl.flock(f, fcntl.LOCK_UN) lock.release() diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py index 534343b97098e733c48881b616e1a073a3900714..81460820d061b1b2660f473d708d372acbf75aea 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py @@ -141,9 +141,9 @@ def _run_overflow_check(): check_file_suffix(forward_file, FileCheckConst.JSON_SUFFIX) try: torch.npu.set_device(npu_device) - except Exception: + except Exception as error: print_error_log(f"Set NPU device id failed. device id is: {args.device_id}") - raise NotImplementedError + raise NotImplementedError from error run_overflow_check(forward_file, backward_file) diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 5f9e27216c71d1f1667972ff94294f12406b5c11..126fb26c93e4d6641c7ee84f027e7909949d696c 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -246,9 +246,9 @@ def _run_ut(): npu_device = "npu:" + str(args.device_id) try: torch.npu.set_device(npu_device) - except Exception: + except Exception as error: print_error_log(f"Set NPU device id failed. device id is: {args.device_id}") - raise NotImplementedError + raise NotImplementedError from error forward_file = os.path.realpath(args.forward_input_file) backward_file = os.path.realpath(args.backward_input_file) check_file_suffix(forward_file, FileCheckConst.JSON_SUFFIX)