diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad39472ed6b82729c9286e3b4ad3e54b08a47e45..d38f3d95dccc682493c9d4a78de9f8d387e09c0a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -246,6 +246,7 @@ add_subdirectory(${TORCHNPU_ROOT}/core)
 add_subdirectory(${TORCHNPU_ROOT}/framework)
 add_subdirectory(${TORCHNPU_ROOT}/flopcount)
 add_subdirectory(${TORCHNPU_ROOT}/logging)
+add_subdirectory(${TORCHNPU_ROOT}/custom_dtype)
 
 if (NOT DEFINED BUILD_LIBTORCH)
     add_subdirectory(${TORCHNPU_ROOT}/distributed)
@@ -272,10 +273,10 @@ if (DEFINED BUILD_TENSORPIPE)
 endif()
 
 if (DEFINED BUILD_LIBTORCH)
-    set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${FLOP_SRCS} ${FRAMEWORK_SRCS} ${LOGGING_SRCS} ${NPU_CPP_LIBS_SRCS})
+    set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${FLOP_SRCS} ${CUS_DTYPE_SRCS} ${FRAMEWORK_SRCS} ${LOGGING_SRCS} ${NPU_CPP_LIBS_SRCS})
 else()
     # Compile code with pybind11
-    set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${LOGGING_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${UTILS_SRCS} ${SAN_SRCS})
+    set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${CUS_DTYPE_SRCS} ${LOGGING_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${UTILS_SRCS} ${SAN_SRCS})
 endif()
 
 add_library(${PLUGIN_NAME} SHARED ${CPP_SRCS})
diff --git a/codegen/gen_backend_stubs.py b/codegen/gen_backend_stubs.py
index bdb6c48a13a73aed84172c65409c5e65f42201fd..248704d49282c0d2a54410cc092c5abeb61d3846 100644
--- a/codegen/gen_backend_stubs.py
+++ b/codegen/gen_backend_stubs.py
@@ -395,6 +395,8 @@ def gen_dispatcher_registrations(
     ns_helper = NamespaceHelper(namespace_str="at")
     native_func_header = """\
 #include "torch_npu/csrc/core/npu/NPURecovery.h"
+#include "torch_npu/csrc/core/npu/NpuVariables.h"
+#include "torch_npu/csrc/core/npu/NPUException.h"
 #ifndef BUILD_LIBTORCH
 #include "torch_npu/csrc/profiler/utils.h"
 #endif
diff --git a/codegen/utils.py b/codegen/utils.py
index 187f02fc9dea81f99c8c4c624840273ec3f0f3f4..1df2bfcd01e2b50bed0ec1e26a80c79959f7f777 100644
--- a/codegen/utils.py
+++ b/codegen/utils.py
@@ -401,6 +401,7 @@ const DeviceGuard device_guard(device_or_default(device));"""
         device_guard = f"const OptionalDeviceGuard device_guard(device_of({device_of}));"
 
     op_key = str(f.func.name)
+    is_ascend910_xx_version = "c10_npu::IsAscend910_xxVersion()"
     if enable_opplugin():
         if op_key in GLOBAL_STRUCTURED_OP_INFO_CACHE:
             impl_name = f"op_plugin::{GLOBAL_STRUCTURED_OP_INFO_CACHE[op_key]}"
@@ -472,6 +473,11 @@ if (C10_UNLIKELY(at_npu::native::env::CheckOpHookEnable())) {{
 if (({force_aclnn} || at_npu::native::env::CheckJitDisable()){tensor_check_str}) {{
     return {op_api_impl_name}({args_exprs_str});
 }} else {{
+    if ({is_ascend910_xx_version}) {{
+        TORCH_CHECK(false,
+            "Ascend910_xx series only supports aclnn operators, and the current operator {impl_name} does not support internal formats.",
+            PTA_ERROR(ErrCode::NOT_SUPPORT));
+    }}
     return {impl_name}({args_exprs_str});
 }}
 """
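Note: for each op that still has a non-aclnn fallback, the template above expands into a C++ guard ahead of the fallback call. A minimal Python sketch of the string the generator assembles (the impl_name and args_exprs_str values below are illustrative placeholders, not a real op):

    impl_name = "acl_op::some_op"           # illustrative placeholder
    args_exprs_str = "self, other"          # illustrative placeholder
    is_ascend910_xx_version = "c10_npu::IsAscend910_xxVersion()"

    guard = f"""
    if ({is_ascend910_xx_version}) {{
        TORCH_CHECK(false,
            "Ascend910_xx series only supports aclnn operators, and the current operator {impl_name} does not support internal formats.",
            PTA_ERROR(ErrCode::NOT_SUPPORT));
    }}
    return {impl_name}({args_exprs_str});
    """
    print(guard)  # the guard is spliced into the generated dispatcher body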
"npu_grouped_matmul", "npu_moe_finalize_routing", @@ -2552,6 +2554,9 @@ "npu_scatter_nd_update_", "npu_stride_copy", "npu_gemma_rms_norm", + "npu_dynamic_mx_quant", + "npu_grouped_dynamic_mx_quant", + "npu_dtype_cast", "npu_swiglu", "npu_gelu", "npu_gelu_backward", diff --git a/test/npu/test_tensors.py b/test/npu/test_tensors.py index 237d6a1aee31c3013fd8515992bffd4a16362dfe..044a7f7d363d93f3bd9cd6bf6dd4d439f83e14ac 100644 --- a/test/npu/test_tensors.py +++ b/test/npu/test_tensors.py @@ -1,4 +1,5 @@ from copy import deepcopy +import unittest import numpy as np import torch import torch_npu @@ -22,6 +23,16 @@ types = [ ] +def skipIfUnsupport910_xx(): + def skip_dec(func): + def wrapper(self): + if "Ascend910_xx" not in torch_npu.npu.get_device_name(): + return unittest.SkipTest("Device 910_xx condition not satisfied") + return func(self) + return wrapper + return skip_dec + + def get_npu_type(type_name): if isinstance(type_name, type): type_name = '{}.{}'.format(type_name.__module__, type_name.__name__) @@ -383,5 +394,16 @@ class TestViewOps(TestCase): self.assertEqual(tensor.view(3, -1).size(), target) +class TestTensorDtype(TestCase): + @skipIfUnsupport910_xx() + def test_fp8(self): + tensor1 = torch.randn([2, 2], dtype=torch.float32).npu() + tensor2 = torch.randn([2, 2], dtype=torch.float32).npu() + tensor_f8e5m2 = tensor1.to(torch.float8_e5m2) + tensor_f8e4m3fn = tensor2.to(torch.float8_e4m3fn) + self.assertEqual(tensor_f8e5m2.dtype, torch.float8_e5m2) + self.assertEqual(tensor_f8e4m3fn.dtype, torch.float8_e4m3fn) + + if __name__ == "__main__": run_tests() diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 65908c94c1e77b76a5a9b53aa490eaeba20cec5e..1e5b2151c35c8aa44c3827ff4030b622f2b89ab0 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2588,6 +2588,9 @@ "torch_npu.npu_dynamic_quant_asymmetric": { "signature": "(input_dummy, smooth_scales=None, group_index=None, dst_type=torch.int8)" }, + "torch_npu.npu_dynamic_mx_quant": { + "signature": "(*args, **kwargs)" + }, "torch_npu.npu_group_quant": { "signature": "(x, scale, group_index, offset=None, dst_dtype=None)" }, @@ -2595,7 +2598,7 @@ "signature": "(*args, **kwargs)" }, "torch_npu.npu_format_cast": { - "signature": "(self, acl_format)" + "signature": "(self, acl_format, customize_dtype=None)" }, "torch_npu.npu_format_cast_": { "signature": "(*args, **kwargs)" @@ -2835,16 +2838,16 @@ "signature": "(int[] size, *, ScalarType? dtype=None, Device? device=None) -> Tensor" }, "func: npu_format_cast": { - "signature": "(Tensor self, int acl_format) -> Tensor" + "signature": "(Tensor self, int acl_format, int? customize_dtype=None) -> Tensor" }, "func: npu_format_cast_": { - "signature": "(Tensor(a!) self, Tensor src) -> Tensor(a!)" + "signature": "(Tensor(a!) self, Tensor src, int? customize_dtype=None) -> Tensor(a!)" }, "func: npu_format_cast_.acl_format": { - "signature": "(Tensor(a!) self, int acl_format) -> Tensor(a!)" + "signature": "(Tensor(a!) self, int acl_format, int? customize_dtype=None) -> Tensor(a!)" }, "func: npu_format_cast.Tensor": { - "signature": "(Tensor self, Tensor dst) -> Tensor" + "signature": "(Tensor self, Tensor dst, int? 
diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json
index 65908c94c1e77b76a5a9b53aa490eaeba20cec5e..1e5b2151c35c8aa44c3827ff4030b622f2b89ab0 100644
--- a/test/torch_npu_schema.json
+++ b/test/torch_npu_schema.json
@@ -2588,6 +2588,9 @@
     "torch_npu.npu_dynamic_quant_asymmetric": {
         "signature": "(input_dummy, smooth_scales=None, group_index=None, dst_type=torch.int8)"
     },
+    "torch_npu.npu_dynamic_mx_quant": {
+        "signature": "(*args, **kwargs)"
+    },
     "torch_npu.npu_group_quant": {
         "signature": "(x, scale, group_index, offset=None, dst_dtype=None)"
     },
@@ -2595,7 +2598,7 @@
         "signature": "(*args, **kwargs)"
     },
     "torch_npu.npu_format_cast": {
-        "signature": "(self, acl_format)"
+        "signature": "(self, acl_format, customize_dtype=None)"
     },
     "torch_npu.npu_format_cast_": {
         "signature": "(*args, **kwargs)"
    },
@@ -2835,16 +2838,16 @@
         "signature": "(int[] size, *, ScalarType? dtype=None, Device? device=None) -> Tensor"
     },
     "func: npu_format_cast": {
-        "signature": "(Tensor self, int acl_format) -> Tensor"
+        "signature": "(Tensor self, int acl_format, int? customize_dtype=None) -> Tensor"
     },
     "func: npu_format_cast_": {
-        "signature": "(Tensor(a!) self, Tensor src) -> Tensor(a!)"
+        "signature": "(Tensor(a!) self, Tensor src, int? customize_dtype=None) -> Tensor(a!)"
     },
     "func: npu_format_cast_.acl_format": {
-        "signature": "(Tensor(a!) self, int acl_format) -> Tensor(a!)"
+        "signature": "(Tensor(a!) self, int acl_format, int? customize_dtype=None) -> Tensor(a!)"
     },
     "func: npu_format_cast.Tensor": {
-        "signature": "(Tensor self, Tensor dst) -> Tensor"
+        "signature": "(Tensor self, Tensor dst, int? customize_dtype=None) -> Tensor"
     },
     "func: npu_change_data_ptr": {
         "signature": "(Tensor dst, Tensor src, int index) -> int"
     },
@@ -2864,6 +2867,9 @@
     "func: _npu_format_cast": {
         "signature": "(Tensor self, int acl_format) -> Tensor"
     },
+    "func: _npu_format_cast.aclnn": {
+        "signature": "(Tensor self, int acl_format, int customize_dtype) -> Tensor"
+    },
     "torch_npu_public_env: INF_NAN_MODE_ENABLE": {
         "mode": "std::unordered_map<int, std::string> infNanMode = {{0, \"max\"}, {1, \"inf_nan\"}}"
     },
diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h
index cbcf87b0fc061294c5fb26ace98900db789f8c2a..b9c7346d06cb4a8df03cd6734f6f9e741a3f64ce 100755
--- a/third_party/acl/inc/acl/acl_base.h
+++ b/third_party/acl/inc/acl/acl_base.h
@@ -164,6 +164,14 @@ typedef enum {
     ACL_INT4 = 29,
     ACL_UINT1 = 30,
     ACL_COMPLEX32 = 33,
+    ACL_HIFLOAT8 = 34,
+    ACL_FLOAT8_E5M2 = 35,
+    ACL_FLOAT8_E4M3FN = 36,
+    ACL_FLOAT8_E8M0 = 37,
+    ACL_FLOAT6_E3M2 = 38,
+    ACL_FLOAT6_E2M3 = 39,
+    ACL_FLOAT4_E2M1 = 40,
+    ACL_FLOAT4_E1M2 = 41,
 } aclDataType;
 
 typedef enum {
@@ -182,6 +190,8 @@ typedef enum {
     ACL_FRACTAL_Z_3D = 33,
     ACL_FORMAT_NC = 35,
     ACL_FORMAT_NCL = 47,
+    ACL_FORMAT_FRACTAL_NZ_C0_16 = 50,
+    ACL_FORMAT_FRACTAL_NZ_C0_32 = 51,
 } aclFormat;
 
 typedef enum {
diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp
index 05ef7980b701e5b54e4a2be9a74d34dc6a4ed7b7..3e4a8e5fbc0cfef4b42b85f916cd6c2999bf6aab 100644
--- a/torch_npu/csrc/InitNpuBindings.cpp
+++ b/torch_npu/csrc/InitNpuBindings.cpp
@@ -15,6 +15,7 @@
 #include "torch_npu/csrc/flopcount/Init.h"
 #include "torch_npu/csrc/logging/Init.h"
 #include "torch_npu/csrc/npu/Module.h"
+#include "torch_npu/csrc/custom_dtype/Init.h"
 #include "torch_npu/csrc/npu/Stress_detect.h"
 #include "torch_npu/csrc/utils/TensorType.h"
 #include "torch_npu/csrc/utils/AutocastMode.h"
@@ -167,6 +168,7 @@ PyObject* initModule()
     AddPyMethodDefs(methods, torch_npu::autocast::autocast_mode_functions());
     AddPyMethodDefs(methods, torch_npu::flopcount::flops_count_functions());
     AddPyMethodDefs(methods, torch_npu::logging::logging_functions());
+    AddPyMethodDefs(methods, c10_npu::custom_dtype_functions());
     static struct PyModuleDef torchnpu_module = {
         PyModuleDef_HEAD_INIT,
         "torch_npu._C",
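Note: with the schema change above, the extended format-cast entry point takes an optional customized dtype alongside the ACL format id. A usage sketch (assumptions: an Ascend910_xx-class device with the aclnnNpuFormatCast path available, format id 50 being ACL_FORMAT_FRACTAL_NZ_C0_16 from the acl_base.h hunk, and the DType constants coming from the custom_dtype bindings added later in this diff):

    import torch
    import torch_npu

    x = torch.randn(16, 16).npu()
    # Cast to the new NZ layout while reinterpreting storage as hifloat8.
    y = torch_npu.npu_format_cast(x, 50, customize_dtype=torch_npu._C._cd.DType.hifloat8)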
diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
index 0c4000e52458682f8fa8ede2cde47b6421637d5d..f7ead13ccca965313b84e54adfcb6a6a95d0acf9 100644
--- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
+++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
@@ -1,16 +1,130 @@
 #include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/framework/utils/NpuStorageOffsetGuard.h"
+#include "torch_npu/csrc/framework/StorageDescHelper.h"
 #include "torch_npu/csrc/aten/common/FormatCastHelper.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "torch_npu/csrc/core/NPUBridge.h"
 #include "torch_npu/csrc/core/NPUStorageImpl.h"
+#include "torch_npu/csrc/core/npu/NpuVariables.h"
 #include "torch_npu/csrc/aten/CustomFunctions.h"
+#include "torch_npu/csrc/custom_dtype/Init.h"
+#include "third_party/op-plugin/op_plugin/utils/op_api_common.h"
 
 namespace at_npu {
 namespace native {
 
 using tensor_list = std::vector<at::Tensor>;
+using GetFormatFunc = int (*)(const aclTensor *, const int, const int, int64_t **, uint64_t *, int *);
+
+std::tuple<bool, int, at::SmallVector<int64_t, 8>> MaybeUseAclnnNpuFormatCast(const at::Tensor& src,
+    int64_t acl_format, c10::optional<int64_t> customize_dtype)
+{
+    const static auto GetFormatFuncAddr = GetOpApiFuncAddr("aclnnNpuFormatCastCalculateSizeAndFormat");
+    const static auto FormatCastFuncAddr = GetOpApiFuncAddr("aclnnNpuFormatCast");
+
+    const static bool aclnnNpuFormatCastExist =
+        (GetFormatFuncAddr == nullptr || FormatCastFuncAddr == nullptr) ? false : true;
+
+    GetFormatFunc GetFormat = reinterpret_cast<GetFormatFunc>(GetFormatFuncAddr);
+    int64_t *dstStorageShape = nullptr;
+    uint64_t dstShapeSize = 0;
+    int dstFormat = 0;
+    at::SmallVector<int64_t, 8> outputShape = {};
+    aclDataType customizeAcltype = (customize_dtype.has_value()) ?
+        c10_npu::GetAclDataType(customize_dtype.value()) :
+        at_npu::native::OpPreparation::convert_to_acl_data_type(src.scalar_type());
+
+    if (c10_npu::IsAscend910_xxVersion()) {
+        if (aclnnNpuFormatCastExist) {
+            auto api_ret = GetFormat(ConvertType(src), acl_format, customizeAcltype, &dstStorageShape,
+                                     &dstShapeSize, &dstFormat);
+            NPU_CHECK_ERROR(api_ret, "aclnnNpuFormatCastCalculateSizeAndFormat");
+            for (uint64_t i = 0; i < dstShapeSize; i++) {
+                outputShape.push_back(dstStorageShape[i]);
+            }
+            delete[] dstStorageShape;
+            return std::make_tuple(true, dstFormat, outputShape);
+        }
+        TORCH_CHECK(false,
+            "aclnnNpuFormatCast does not exist, and the Ascend910_xx series only supports aclnn operators.",
+            PTA_ERROR(ErrCode::NOT_SUPPORT));
+    }
+    if (at_npu::native::env::CheckJitDisable()) {
+        if (aclnnNpuFormatCastExist) {
+            auto api_ret = GetFormat(ConvertType(src), acl_format, customizeAcltype, &dstStorageShape,
+                                     &dstShapeSize, &dstFormat);
+            if (api_ret != 0) {
+                if (customize_dtype.has_value()) {
+                    NPU_CHECK_ERROR(api_ret, "aclnnNpuFormatCastCalculateSizeAndFormat");
+                }
+                return std::make_tuple(false, dstFormat, outputShape);
+            }
+            for (uint64_t i = 0; i < dstShapeSize; i++) {
+                outputShape.push_back(dstStorageShape[i]);
+            }
+            delete[] dstStorageShape;
+            return std::make_tuple(true, dstFormat, outputShape);
+        } else {
+            if (C10_UNLIKELY(customize_dtype.has_value())) {
+                TORCH_CHECK(false,
+                    "customize_dtype is not supported while aclnnNpuFormatCast does not exist.",
+                    PTA_ERROR(ErrCode::NOT_SUPPORT));
+            }
+            return std::make_tuple(false, dstFormat, outputShape);
+        }
+    } else {
+        if (C10_UNLIKELY(customize_dtype.has_value())) {
+            TORCH_CHECK(false,
+                "customize_dtype is not supported while jit_compile=True.",
+                PTA_ERROR(ErrCode::NOT_SUPPORT));
+        }
+        return std::make_tuple(false, dstFormat, outputShape);
+    }
+}
+
+at::Tensor create_tensor_with_format_and_shape(c10::IntArrayRef baseSizes,
+    c10::IntArrayRef storageSizes, const caffe2::TypeMeta dtype, int64_t acl_format)
+{
+    c10::Allocator *allocator = c10_npu::NPUCachingAllocator::get();
+    int64_t nelements = 1;
+    for (const auto& num : storageSizes) {
+        nelements *= num;
+    }
+    int64_t size_bytes = nelements * dtype.itemsize();
+    c10::intrusive_ptr<c10::StorageImpl> storage_impl = torch_npu::make_npu_storage_impl(
+        c10::StorageImpl::use_byte_size_t(),
+        c10::SymInt(size_bytes),
+        allocator,
+        true);
+    auto tensor = at::detail::make_tensor<torch_npu::NPUTensorImpl>(storage_impl, dtype);
+
+    if (baseSizes.size() != 1 || baseSizes[0] != 0) {
+        tensor.unsafeGetTensorImpl()->set_sizes_contiguous(baseSizes);
+    }
+    tensor.unsafeGetTensorImpl()->empty_tensor_restride(c10::MemoryFormat::Contiguous);
+    StorageDescHelper::SetDesc(tensor, baseSizes, storageSizes, tensor.strides(), static_cast<aclFormat>(acl_format));
+    return tensor;
+}
+at::Tensor format_cast_impl_out_npu_aclnn(const at::Tensor& src,
+    int64_t acl_format, c10::IntArrayRef storageSizes)
+{
+    auto src_new = src.contiguous();
+    auto src_new_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src_new)->npu_desc_;
+
+    at::Tensor dst = create_tensor_with_format_and_shape(
+        src_new_desc.base_sizes_, storageSizes, src.dtype(), acl_format);
+
+    // calculate the output result of the NPU
+    EXEC_NPU_CMD(aclnnNpuFormatCast, src_new, dst);
+
+    // format cast only changes the physical layout of the base tensor; the
+    // view tensor's metadata remains unchanged
+    dst.set_(dst.storage(), src_new.storage_offset(), src_new.sizes(), src_new.strides());
+    return dst;
+}
 
 at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src)
 {
@@ -36,7 +150,8 @@ at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src)
 }
 
 // convert src from src_format to dst_format, write the result into dst(self)
-at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Tensor& src)
+at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Tensor& src,
+    c10::optional<int64_t> customize_dtype)
 {
     torch_npu::utils::torch_check_npu(self);
     torch_npu::utils::torch_check_npu(src);
@@ -47,6 +162,13 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Tensor& src,
         return self;
     }
 
+    auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, dst_desc.npu_format_, customize_dtype);
+    if (useAclnn == true) {
+        at::Tensor dst = format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape);
+        self.set_(dst.storage(), dst.storage_offset(), dst.sizes(), dst.strides());
+        return self;
+    }
+
     // calculate the output result of the NPU
     format_cast_impl_out_npu(self, src);
 
@@ -59,16 +181,6 @@ at::Tensor npu_format_cast_impl(
     int64_t acl_format)
 {
     auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_;
-    if (src_desc.npu_format_ == acl_format) {
-        ASCEND_LOGD("no need to do format cast");
-        return src;
-    }
-    if (FormatHelper::IsBaseFormatType(src) &&
-        FormatHelper::IsBaseFormatType(static_cast<aclFormat>(acl_format))) {
-        FormatCastHelper::format_cast_as_base_format(src, static_cast<aclFormat>(acl_format));
-        return src;
-    }
-
     at::Tensor dst = OpPreparation::ApplyTensorWithFormat(
         src_desc.base_sizes_, src.options(), acl_format);
@@ -84,18 +196,20 @@ at::Tensor npu_format_cast_impl(
 // conver self to dst'format, write the result into new result tensor
 at::Tensor NPUNativeFunctions::npu_format_cast(
     const at::Tensor& self,
-    const at::Tensor& dst)
+    const at::Tensor& dst,
+    c10::optional<int64_t> customize_dtype)
 {
     torch_npu::utils::torch_check_npu(dst);
     auto dst_desc = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_;
     int64_t dst_format = dst_desc.npu_format_;
-    return custom_ops::npu_format_cast(self, dst_format);
+    return custom_ops::npu_format_cast(self, dst_format, customize_dtype);
 }
 
 // conver self to acl_format, write the result into self
 at::Tensor& NPUNativeFunctions::npu_format_cast_(
     at::Tensor& self,
-    int64_t acl_format)
+    int64_t acl_format,
+    c10::optional<int64_t> customize_dtype)
 {
     torch_npu::utils::torch_check_npu(self);
     auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_;
@@ -108,6 +222,13 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(
         return self;
     }
 
+    auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, customize_dtype);
+    if (useAclnn == true) {
+        at::Tensor dst = format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape);
+        self.set_(dst.storage(), dst.storage_offset(), dst.sizes(), dst.strides());
+        return self;
+    }
+
     at::Tensor dst = OpPreparation::ApplyTensorWithFormat(
         src_desc.base_sizes_, self.options(), acl_format);
@@ -130,16 +251,54 @@ int64_t NPUNativeFunctions::get_npu_format(const at::Tensor& self)
 at::Tensor NPUNativeFunctions::_npu_format_cast(const at::Tensor& self, int64_t acl_format)
 {
-    return npu_format_cast_impl(self, acl_format);
+    auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_;
+    if (src_desc.npu_format_ == acl_format) {
+        ASCEND_LOGD("no need to do format cast");
+        return self;
+    }
+    if (FormatHelper::IsBaseFormatType(self) &&
+        FormatHelper::IsBaseFormatType(static_cast<aclFormat>(acl_format))) {
+        FormatCastHelper::format_cast_as_base_format(self, static_cast<aclFormat>(acl_format));
+        return self;
+    }
+    auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, c10::nullopt);
+    if (useAclnn == false) {
+        return npu_format_cast_impl(self, acl_format);
+    }
+    return format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape);
+}
+
+at::Tensor NPUNativeFunctions::_npu_format_cast(const at::Tensor& self, int64_t acl_format,
+    int64_t customize_dtype)
+{
+    auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_;
+    if (src_desc.npu_format_ == acl_format) {
+        ASCEND_LOGD("no need to do format cast");
+        return self;
+    }
+    if (FormatHelper::IsBaseFormatType(self) &&
+        FormatHelper::IsBaseFormatType(static_cast<aclFormat>(acl_format))) {
+        FormatCastHelper::format_cast_as_base_format(self, static_cast<aclFormat>(acl_format));
+        return self;
+    }
+    auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, customize_dtype);
+    if (useAclnn == false) {
+        return npu_format_cast_impl(self, acl_format);
+    }
+    return format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape);
 }
 
-at::Tensor NPUNativeFunctions::npu_format_cast(const at::Tensor& self, int64_t acl_format)
+at::Tensor NPUNativeFunctions::npu_format_cast(const at::Tensor& self, int64_t acl_format,
+    c10::optional<int64_t> customize_dtype)
 {
     torch_npu::utils::torch_check_npu(self);
     if (NPUNativeFunctions::get_npu_format(self) == acl_format) {
         ASCEND_LOGD("no need to do format cast");
         return self;
     }
+    if (customize_dtype.has_value()) {
+        return custom_ops::_npu_format_cast(self, acl_format, customize_dtype.value());
+    }
     return custom_ops::_npu_format_cast(self, acl_format);
 }
diff --git a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp
index 775d95cbfa597a61fcf71eca04008d8c21fd4e83..685f907653a96e2f36e6ee5c9ea4dc6344618cef 100644
--- a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp
+++ b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp
@@ -10,13 +10,34 @@
 namespace at_npu {
 namespace native {
 
+#define AT_DISPATCH_CASE_ALL_TYPES_AND5(                                  \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \
+    AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__)                               \
+    AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                            \
+    AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                            \
+    AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)                            \
+    AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)                            \
+    AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)
+
+
+#define AT_DISPATCH_ALL_TYPES_AND5(                                                   \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, TYPE, NAME, ...) \
+    AT_DISPATCH_SWITCH(                                                               \
+        TYPE,                                                                         \
+        NAME,                                                                         \
+        AT_DISPATCH_CASE_ALL_TYPES_AND5(                                              \
+            SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, __VA_ARGS__))
+
+
 c10::Scalar NPUNativeFunctions::_local_scalar_dense(const at::Tensor& self)
 {
     c10::Scalar r;
-    AT_DISPATCH_ALL_TYPES_AND3(
+    AT_DISPATCH_ALL_TYPES_AND5(
         at::ScalarType::Half,
         at::ScalarType::Bool,
         at::ScalarType::BFloat16,
+        at::ScalarType::Float8_e5m2,
+        at::ScalarType::Float8_e4m3fn,
         self.scalar_type(),
         "_local_scalar_dense_npu",
         [&] {
diff --git a/torch_npu/csrc/aten/common/ToKernelNpu.cpp b/torch_npu/csrc/aten/common/ToKernelNpu.cpp
index 96e67ff5bb07dd483d0f208daf299990d02aa1ce..3d2be6452d079cf9f4aa4f797b606c98c3cb1730 100644
--- a/torch_npu/csrc/aten/common/ToKernelNpu.cpp
+++ b/torch_npu/csrc/aten/common/ToKernelNpu.cpp
@@ -161,7 +161,7 @@ at::Tensor NPUNativeFunctions::to(
             "dtype cast repalce with float.");
     }
     dtype = (dtype == at::ScalarType::Double) ? at::ScalarType::Float : dtype;
-    return custom_ops::npu_dtype_cast(self, dtype);
+    return custom_ops::_npu_dtype_cast(self, dtype);
 }
 
 at::Tensor NPUNativeFunctions::to(
diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml
index 95bb740db159bef654fb063934f68344c1bf257e..b186df765181599eca85294f3343033c711f8a32 100644
--- a/torch_npu/csrc/aten/npu_native_functions.yaml
+++ b/torch_npu/csrc/aten/npu_native_functions.yaml
@@ -62,12 +62,12 @@ custom:
   - func: npu_change_data_ptr(Tensor dst, Tensor src, int index) -> int
     device_check: NoCheck
   - func: get_npu_format(Tensor self) -> int
-  - func: npu_format_cast.Tensor(Tensor self, Tensor dst) -> Tensor
+  - func: npu_format_cast.Tensor(Tensor self, Tensor dst, int? customize_dtype=None) -> Tensor
     device_check: NoCheck
     exposed: True
-  - func: npu_format_cast_.acl_format(Tensor(a!) self, int acl_format) -> Tensor(a!)
+  - func: npu_format_cast_.acl_format(Tensor(a!) self, int acl_format, int? customize_dtype=None) -> Tensor(a!)
    exposed: True
-  - func: npu_format_cast_(Tensor(a!) self, Tensor src) -> Tensor(a!)
+  - func: npu_format_cast_(Tensor(a!) self, Tensor src, int? customize_dtype=None) -> Tensor(a!)
     device_check: NoCheck
     exposed: True
   - func: empty_with_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, int? base_addr_aligned_kb=None) -> Tensor
@@ -82,9 +82,10 @@ custom:
   - func: copy_memory_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
     device_check: NoCheck
   - func: get_storage_size(Tensor self) -> int
-  - func: npu_format_cast(Tensor self, int acl_format) -> Tensor
+  - func: npu_format_cast(Tensor self, int acl_format, int? customize_dtype=None) -> Tensor
     exposed: True
   - func: _npu_format_cast(Tensor self, int acl_format) -> Tensor
+  - func: _npu_format_cast.aclnn(Tensor self, int acl_format, int customize_dtype) -> Tensor
   - func: empty_with_swapped_memory(int[] size, *, ScalarType? dtype=None, Device? device=None) -> Tensor
    dispatch:
      CompositeExplicitAutograd: empty_with_swapped_memory
diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
index 74afc22031e5f4ff3cb9464d8ed3b49d0de6bb37..e1e86a89071458e0de9d7959c7f023766574b5c5 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
@@ -726,6 +726,7 @@ BlockState::BlockState(Block *block)
 
 SegmentState::SegmentState(Block *head)
 {
+    TORCH_INTERNAL_ASSERT(head != nullptr, PTA_ERROR(ErrCode::PTR));
     TORCH_INTERNAL_ASSERT(head->prev == nullptr && head->pool != nullptr);
     is_small = head->pool->is_small;
@@ -882,7 +883,7 @@ size_t CachingAllocatorConfig::parseExpandableSegments(const std::vector<std::string>& config)
     auto remaining = size - candidate->size;
     auto new_candidate = candidate->next;
+    if (C10_UNLIKELY(new_candidate == nullptr)) {
+        return nullptr;
+    }
     if (!map_block(new_candidate, std::min(remaining, candidate->next->size), ctx)) {
         return nullptr;
     }
@@ -2442,7 +2446,11 @@ private:
 {
     bool freed_memory = false;
     for (const auto &name : FreeNPUMemoryCallbacksRegistry()->Keys()) {
-        freed_memory |= FreeNPUMemoryCallbacksRegistry()->Create(name)->Execute();
+        auto callback = FreeNPUMemoryCallbacksRegistry()->Create(name);
+        if (callback != nullptr) {
+            freed_memory |= callback->Execute();
+        } else {
+            TORCH_CHECK(false, "free memory callback returned nullptr", PTA_ERROR(ErrCode::PTR));
+        }
     }
     return freed_memory;
 }
diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp
index 034726549b38840474c96166ce2deb6019f754c6..a91b1d3cacb23d2a1b9e750774c833f97c940bdd 100644
--- a/torch_npu/csrc/core/npu/NPUException.cpp
+++ b/torch_npu/csrc/core/npu/NPUException.cpp
@@ -91,7 +91,7 @@ MemUceInfo memUceInfo;
 
 std::mutex memUceInfoMutex;
 
-void set_mem_uce_info(MemUceInfo info)
+void set_mem_uce_info(MemUceInfo& info)
 {
     std::lock_guard<std::mutex> lock(memUceInfoMutex);
     memUceInfo = info;
diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h
index 94e38a5edbd42b70addb22a5c94443301cd378e3..88a77ab810ef327be254ea143d5d46606b772cd5 100644
--- a/torch_npu/csrc/core/npu/NPUException.h
+++ b/torch_npu/csrc/core/npu/NPUException.h
@@ -259,7 +259,7 @@ bool checkUceErrAndRepair(bool check_error, std::string& err_msg);
 
 void record_mem_hbm_ecc_error();
 
-void set_mem_uce_info(MemUceInfo info);
+void set_mem_uce_info(MemUceInfo& info);
 
 MemUceInfo get_mem_uce_info();
diff --git a/torch_npu/csrc/core/npu/NPUMacros.h b/torch_npu/csrc/core/npu/NPUMacros.h
index 3223c4f325b3de69b8e5cdc783954d84033b37b4..960dcb97b6e52bffc37582250ffd99b1f7ac08a6 100644
--- a/torch_npu/csrc/core/npu/NPUMacros.h
+++ b/torch_npu/csrc/core/npu/NPUMacros.h
@@ -29,6 +29,6 @@
 
 #define TORCH_NPU_API C10_NPU_API
 
-#define C10_COMPILE_TIME_MAX_NPUS 16
+#define C10_COMPILE_TIME_MAX_NPUS 32
 // A maximum of 8 P2P links can be created on a NPU device
 #define C10_P2P_ACCESS_MAX_NPUS 8
diff --git a/torch_npu/csrc/core/npu/NpuVariables.cpp b/torch_npu/csrc/core/npu/NpuVariables.cpp
index 3fedb9d387ef61702a7414912b5572a8e187e7cd..9aaca59b54520df2294b6d69adbbb54dbe08d582 100644
--- a/torch_npu/csrc/core/npu/NpuVariables.cpp
+++ b/torch_npu/csrc/core/npu/NpuVariables.cpp
@@ -41,27 +41,35 @@ static std::map<std::string, SocVersion> socVersionMap = {
 void SetSocVersion(const char* const socVersion)
 {
-    if (socVersion == nullptr ||
-        g_curSocVersion != SocVersion::UnsupportedSocVersion) {
-        return;
-    }
+    if (socVersion == nullptr ||
+        g_curSocVersion != SocVersion::UnsupportedSocVersion) {
+        return;
+    }
-    SocVersion curSocVersion = SocVersion::UnsupportedSocVersion;
+    SocVersion curSocVersion = SocVersion::UnsupportedSocVersion;
+    std::string inputVersion = socVersion;
+    std::string ascend910xxVersion = "Ascend910_xx";
 
-    auto const& iter = socVersionMap.find(socVersion);
-    if (iter != socVersionMap.end()) {
-        curSocVersion = iter->second;
-    } else {
-        std::string unsupported_soc(socVersion);
-        std::replace(std::begin(unsupported_soc), std::end(unsupported_soc), '_', ' ');
-        AT_ERROR("Unsupported soc version: ", unsupported_soc);
-    }
+    auto const& iter = socVersionMap.find(socVersion);
+    if (iter != socVersionMap.end()) {
+        curSocVersion = iter->second;
+    } else if ((inputVersion.compare(0, ascend910xxVersion.size(), ascend910xxVersion) == 0)) {
+        curSocVersion = SocVersion::Ascend910_xx;
+    } else {
+        std::string unsupported_soc(socVersion);
+        std::replace(std::begin(unsupported_soc), std::end(unsupported_soc), '_', ' ');
+        AT_ERROR("Unsupported soc version: ", unsupported_soc);
+    }
 
-    g_curSocVersion = curSocVersion;
+    g_curSocVersion = curSocVersion;
 }
 
 const SocVersion& GetSocVersion()
 {
+    if (g_curSocVersion == SocVersion::UnsupportedSocVersion) {
+        auto soc_name = c10_npu::acl::AclGetSocName();
+        SetSocVersion(soc_name);
+    }
     return g_curSocVersion;
 }
@@ -95,5 +103,10 @@ bool IsBF16Supported()
 {
     return GetSocVersion() >= SocVersion::Ascend910B1;
 }
+
+bool IsAscend910_xxVersion()
+{
+    return GetSocVersion() == SocVersion::Ascend910_xx;
+}
 } // namespace c10_npu
diff --git a/torch_npu/csrc/core/npu/NpuVariables.h b/torch_npu/csrc/core/npu/NpuVariables.h
index 3119a645153322225f9d0d9ea19dfa3b1ef9ab9f..6481369f581b35d04273109de4e729d37f8eccf3 100644
--- a/torch_npu/csrc/core/npu/NpuVariables.h
+++ b/torch_npu/csrc/core/npu/NpuVariables.h
@@ -30,7 +30,8 @@ enum class SocVersion {
     Ascend910_9381,
     Ascend910_9382,
     Ascend910_9372,
-    Ascend910_9362
+    Ascend910_9362,
+    Ascend910_xx = 260
 };
 
 void SetSocVersion(const char* const socVersion);
@@ -40,6 +41,8 @@ const SocVersion& GetSocVersion();
 bool IsSupportInfNan();
 
 bool IsBF16Supported();
+
+bool IsAscend910_xxVersion();
 } // namespace c10_npu
 
 #endif
diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
index b59e9c85c96e2998273953d3d068a3465bd0efde..54190a681d3364ed1c74d8e94afacc8cfc7244ce 100644
--- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp
+++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
@@ -174,6 +174,7 @@ aclError AclrtSetStreamFailureMode(aclrtStream stream, uint64_t mode) {
     if (stream == nullptr) { // default stream
         return ACL_ERROR_INVALID_PARAM;
     }
+
     typedef aclError(*aclrtSetStreamFailureModeFunc)(aclrtStream, uint64_t);
     static aclrtSetStreamFailureModeFunc func = (aclrtSetStreamFailureModeFunc)GET_FUNC(aclrtSetStreamFailureMode);
     if (func == nullptr) {
@@ -844,7 +845,8 @@ bool IsCaptureSupported()
     static bool have_load_func = false;
     static bool default_support_capture = ((GetSocVersion() >= SocVersion::Ascend910B1) &&
         (GetSocVersion() < SocVersion::Ascend310B1)) ||
-        (GetSocVersion() >= SocVersion::Ascend910_9391);
+        ((GetSocVersion() >= SocVersion::Ascend910_9391) &&
+        (GetSocVersion() < SocVersion::Ascend910_xx));
     if (default_support_capture && !have_load_func) {
         have_load_func = true;
         typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *);
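Note: unlike the purely table-driven SoC detection used for older parts, the Ascend910_xx gate above is a name-prefix match plus lazy initialization in GetSocVersion(). A sketch of the equivalent resolution logic in Python (illustrative only; the table below is an abbreviated stand-in for socVersionMap, and this helper is not part of the torch_npu API):

    SOC_VERSION_TABLE = {"Ascend910B1": 240}     # abbreviated stand-in table

    def resolve_soc(soc_name: str) -> int:
        # Exact table lookup first, then prefix match for the new series.
        if soc_name in SOC_VERSION_TABLE:
            return SOC_VERSION_TABLE[soc_name]
        if soc_name.startswith("Ascend910_xx"):
            return 260                           # SocVersion::Ascend910_xx
        raise ValueError(f"Unsupported soc version: {soc_name}")

The same prefix test is what the new unittest skip decorator relies on via torch_npu.npu.get_device_name().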
diff --git a/torch_npu/csrc/core/npu/register/OptionRegister.cpp b/torch_npu/csrc/core/npu/register/OptionRegister.cpp
index 8f7f17a0114a517ef7f5ef4b201b1bf749274210..69acfc234f254695ffe6733f81c4e9dfdead5bcf 100644
--- a/torch_npu/csrc/core/npu/register/OptionRegister.cpp
+++ b/torch_npu/csrc/core/npu/register/OptionRegister.cpp
@@ -4,6 +4,7 @@
 #include "torch_npu/csrc/core/npu/register/OptionRegister.h"
 #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h"
 #include "torch_npu/csrc/core/npu/npu_log.h"
+#include "torch_npu/csrc/core/npu/NpuVariables.h"
 
 namespace c10_npu {
 namespace option {
@@ -84,6 +85,18 @@ OptionInterfaceBuilder::OptionInterfaceBuilder(const std::string &name, ::std::unique_ptr<OptionInterface> &ptr)
 
 void SetOption(const std::string &key, const std::string &val)
 {
+    if (c10_npu::IsAscend910_xxVersion()) {
+        if (key == "jitCompile" && val == "enable") {
+            TORCH_NPU_WARN_ONCE("Ascend910_xx series only supports jit_compile=False, ",
+                "the requested value True is invalid and has been reverted to False.");
+            return register_options::OptionRegister::GetInstance()->Set(key, "disable");
+        }
+        if (key == "ALLOW_INTERNAL_FORMAT" && val == "enable") {
+            TORCH_NPU_WARN_ONCE("Ascend910_xx series only supports allow_internal_format=False, ",
+                "the requested value True is invalid and has been reverted to False.");
+            return register_options::OptionRegister::GetInstance()->Set(key, "disable");
+        }
+    }
     register_options::OptionRegister::GetInstance()->Set(key, val);
 }
diff --git a/torch_npu/csrc/custom_dtype/CMakeLists.txt b/torch_npu/csrc/custom_dtype/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7d3d7c0e5379a0c23354a45a6dbd12c0bffea0ac
--- /dev/null
+++ b/torch_npu/csrc/custom_dtype/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB _CUS_DTYPE_SRCS *.cpp)
+
+LIST(APPEND CUS_DTYPE_SRCS ${_CUS_DTYPE_SRCS})
+
+# Pass to parent
+set(CUS_DTYPE_SRCS ${CUS_DTYPE_SRCS} PARENT_SCOPE)
diff --git a/torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp b/torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2293ba94dd063eba4023415405a230b2baa7ae6d
--- /dev/null
+++ b/torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp
@@ -0,0 +1,43 @@
+#include "torch_npu/csrc/custom_dtype/extension.h"
+#include "op_plugin/AclOpsInterface.h"
+#include "op_plugin/OpApiInterface.h"
+#include "op_plugin/utils/op_api_common.h"
+
+
+namespace c10_npu {
+
+at::Tensor cast_to_fp8(const at::Tensor &input, int otype)
+{
+    auto output = at::empty_like(input, c10_npu::GetATenDType(otype));
+
+    if (input.numel() == 0) {
+        return output;
+    }
+
+    aclDataType out_acltype = c10_npu::GetAclDataType(otype);
+    TensorWrapper out_wrapper = {output, out_acltype};
+    EXEC_NPU_CMD(aclnnCast, input, out_acltype, out_wrapper);
+
+    return output;
+}
+
+void cast_to_fp8_noalloc(const at::Tensor &input, at::Tensor output, int otype)
+{
+    aclDataType out_acltype = c10_npu::GetAclDataType(otype);
+    TensorWrapper out_wrapper = {output, out_acltype};
+    EXEC_NPU_CMD(aclnnCast, input, out_acltype, out_wrapper);
+    return;
+}
+
+at::Tensor cast_from_fp8(const at::Tensor &input, int itype, int otype)
+{
+    aclDataType input_acltype = c10_npu::GetAclDataType(itype);
+    aclDataType out_acltype = c10_npu::GetAclDataType(otype);
+    auto output = at::empty_like(input, c10_npu::GetATenDType(otype));
+    TensorWrapper input_wrapper = {input, input_acltype};
+    TensorWrapper out_wrapper = {output, out_acltype};
+    EXEC_NPU_CMD(aclnnCast, input_wrapper, out_acltype, out_wrapper);
+
+    return output;
+}
+}
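Note: the three cast helpers above become Python-callable once Init.cpp (next) registers them on the _cd submodule. A usage sketch, assuming the extension has been initialized through the module's normal startup path and the device/CANN stack supports fp8:

    import torch
    import torch_npu

    cd = torch_npu._C._cd
    x = torch.randn(4, 4).npu()
    y = cd.cast_to_fp8(x, cd.DType.float8_e4m3fn)                      # fp32 -> fp8
    z = cd.cast_from_fp8(y, cd.DType.float8_e4m3fn, cd.DType.float32)  # fp8 -> fp32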
diff --git a/torch_npu/csrc/custom_dtype/Init.cpp b/torch_npu/csrc/custom_dtype/Init.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..90644aa1e3e14b2a9aef00e9272ceca995f6e9c5
--- /dev/null
+++ b/torch_npu/csrc/custom_dtype/Init.cpp
@@ -0,0 +1,163 @@
+#include "torch_npu/csrc/custom_dtype/Init.h"
+#ifndef BUILD_LIBTORCH
+#include <torch/csrc/utils/pybind.h>
+#include <torch/csrc/utils/object_ptr.h>
+#endif
+#include "torch_npu/csrc/custom_dtype/extension.h"
+
+
+namespace c10_npu {
+struct DTypeConstants {
+    static const int float32_value;
+    static const int float16_value;
+    static const int int8_value;
+    static const int int32_value;
+    static const int uint8_value;
+    static const int int16_value;
+    static const int uint16_value;
+    static const int uint32_value;
+    static const int int64_value;
+    static const int uint64_value;
+    static const int float64_value;
+    static const int bool_value;
+    static const int string_value;
+    static const int complex64_value;
+    static const int complex128_value;
+    static const int bfloat16_value;
+    static const int int4_value;
+    static const int uint1_value;
+    static const int complex32_value;
+    static const int hifloat8_value;
+    static const int float8_e5m2_value;
+    static const int float8_e4m3fn_value;
+    static const int float8_e8m0_value;
+    static const int float6_e3m2_value;
+    static const int float6_e2m3_value;
+    static const int float4_e2m1_value;
+    static const int float4_e1m2_value;
+};
+
+const int DTypeConstants::float32_value = static_cast<int>(DType::FLOAT);
+const int DTypeConstants::float16_value = static_cast<int>(DType::FLOAT16);
+const int DTypeConstants::int8_value = static_cast<int>(DType::INT8);
+const int DTypeConstants::int32_value = static_cast<int>(DType::INT32);
+const int DTypeConstants::uint8_value = static_cast<int>(DType::UINT8);
+const int DTypeConstants::int16_value = static_cast<int>(DType::INT16);
+const int DTypeConstants::uint16_value = static_cast<int>(DType::UINT16);
+const int DTypeConstants::uint32_value = static_cast<int>(DType::UINT32);
+const int DTypeConstants::int64_value = static_cast<int>(DType::INT64);
+const int DTypeConstants::uint64_value = static_cast<int>(DType::UINT64);
+const int DTypeConstants::float64_value = static_cast<int>(DType::DOUBLE);
+const int DTypeConstants::bool_value = static_cast<int>(DType::BOOL);
+const int DTypeConstants::string_value = static_cast<int>(DType::STRING);
+const int DTypeConstants::complex64_value = static_cast<int>(DType::COMPLEX64);
+const int DTypeConstants::complex128_value = static_cast<int>(DType::COMPLEX128);
+const int DTypeConstants::bfloat16_value = static_cast<int>(DType::BF16);
+const int DTypeConstants::int4_value = static_cast<int>(DType::INT4);
+const int DTypeConstants::uint1_value = static_cast<int>(DType::UINT1);
+const int DTypeConstants::complex32_value = static_cast<int>(DType::COMPLEX32);
+const int DTypeConstants::hifloat8_value = static_cast<int>(DType::HIFLOAT8);
+const int DTypeConstants::float8_e5m2_value = static_cast<int>(DType::FLOAT8_E5M2);
+const int DTypeConstants::float8_e4m3fn_value = static_cast<int>(DType::FLOAT8_E4M3FN);
+const int DTypeConstants::float8_e8m0_value = static_cast<int>(DType::FLOAT8_E8M0);
+const int DTypeConstants::float6_e3m2_value = static_cast<int>(DType::FLOAT6_E3M2);
+const int DTypeConstants::float6_e2m3_value = static_cast<int>(DType::FLOAT6_E2M3);
+const int DTypeConstants::float4_e2m1_value = static_cast<int>(DType::FLOAT4_E2M1);
+const int DTypeConstants::float4_e1m2_value = static_cast<int>(DType::FLOAT4_E1M2);
+
+#ifndef BUILD_LIBTORCH
+PyObject* cd_initExtension(PyObject*, PyObject *)
+{
+    auto torch_npu_C_module = THPObjectPtr(PyImport_ImportModule("torch_npu._C"));
+    if (!torch_npu_C_module) {
+        return nullptr;
+    }
+    auto torch_npu_C_m = py::handle(torch_npu_C_module).cast<py::module>();
+    auto m = torch_npu_C_m.def_submodule("_cd", "_cd bindings");
+    py::class_<DTypeConstants>(m, "DType")
+        .def_readonly_static("float32", &DTypeConstants::float32_value)
+        .def_readonly_static("float16", &DTypeConstants::float16_value)
+        .def_readonly_static("int8", &DTypeConstants::int8_value)
+        .def_readonly_static("int32", &DTypeConstants::int32_value)
+        .def_readonly_static("uint8", &DTypeConstants::uint8_value)
+        .def_readonly_static("int16", &DTypeConstants::int16_value)
+        .def_readonly_static("uint16", &DTypeConstants::uint16_value)
+        .def_readonly_static("uint32", &DTypeConstants::uint32_value)
+        .def_readonly_static("int64", &DTypeConstants::int64_value)
+        .def_readonly_static("uint64", &DTypeConstants::uint64_value)
+        .def_readonly_static("float64", &DTypeConstants::float64_value)
+        .def_readonly_static("bool", &DTypeConstants::bool_value)
+        .def_readonly_static("string", &DTypeConstants::string_value)
+        .def_readonly_static("complex64", &DTypeConstants::complex64_value)
+        .def_readonly_static("complex128", &DTypeConstants::complex128_value)
+        .def_readonly_static("bfloat16", &DTypeConstants::bfloat16_value)
+        .def_readonly_static("int4", &DTypeConstants::int4_value)
+        .def_readonly_static("uint1", &DTypeConstants::uint1_value)
+        .def_readonly_static("complex32", &DTypeConstants::complex32_value)
+        .def_readonly_static("hifloat8", &DTypeConstants::hifloat8_value)
+        .def_readonly_static("float8_e5m2", &DTypeConstants::float8_e5m2_value)
+        .def_readonly_static("float8_e4m3fn", &DTypeConstants::float8_e4m3fn_value)
+        .def_readonly_static("float8_e8m0", &DTypeConstants::float8_e8m0_value)
+        .def_readonly_static("float6_e3m2", &DTypeConstants::float6_e3m2_value)
+        .def_readonly_static("float6_e2m3", &DTypeConstants::float6_e2m3_value)
+        .def_readonly_static("float4_e2m1", &DTypeConstants::float4_e2m1_value)
+        .def_readonly_static("float4_e1m2", &DTypeConstants::float4_e1m2_value);
+
+    m.def("cast_to_fp8", &cast_to_fp8, "Cast to FP8", py::call_guard<py::gil_scoped_release>());
+    m.def("cast_to_fp8_noalloc", &cast_to_fp8_noalloc, "Cast to FP8",
+          py::call_guard<py::gil_scoped_release>());
+    m.def("cast_from_fp8", &cast_from_fp8, "Cast from FP8", py::call_guard<py::gil_scoped_release>());
+
+    Py_RETURN_NONE;
+}
+
+static PyMethodDef NPUCustomDtypeMethods[] = { // NOLINT
+    {"_cd_init", cd_initExtension, METH_NOARGS, nullptr},
+    {nullptr, nullptr, 0, nullptr}
+};
+#endif
+
+const std::string CustomDataTypeToString(int64_t dType)
+{
+    const std::map<DType, std::string>
+        TYPE_TO_STRING_MAP = {
+            {DType::FLOAT, "torch_npu.float32"},
+            {DType::FLOAT16, "torch_npu.float16"},
+            {DType::INT8, "torch_npu.int8"},
+            {DType::INT32, "torch_npu.int32"},
+            {DType::UINT8, "torch_npu.uint8"},
+            {DType::INT16, "torch_npu.int16"},
+            {DType::UINT16, "torch_npu.uint16"},
+            {DType::UINT32, "torch_npu.uint32"},
+            {DType::INT64, "torch_npu.int64"},
+            {DType::UINT64, "torch_npu.uint64"},
+            {DType::DOUBLE, "torch_npu.float64"},
+            {DType::BOOL, "torch_npu.bool"},
+            {DType::STRING, "torch_npu.string"},
+            {DType::COMPLEX64, "torch_npu.complex64"},
+            {DType::COMPLEX128, "torch_npu.complex128"},
+            {DType::BF16, "torch_npu.bfloat16"},
+            {DType::INT4, "torch_npu.int4"},
+            {DType::UINT1, "torch_npu.uint1"},
+            {DType::COMPLEX32, "torch_npu.complex32"},
+            {DType::HIFLOAT8, "torch_npu.hifloat8"},
+            {DType::FLOAT8_E5M2, "torch_npu.float8_e5m2"},
+            {DType::FLOAT8_E4M3FN, "torch_npu.float8_e4m3fn"},
+            {DType::FLOAT8_E8M0, "torch_npu.float8_e8m0"},
+            {DType::FLOAT6_E3M2, "torch_npu.float6_e3m2"},
+            {DType::FLOAT6_E2M3, "torch_npu.float6_e2m3"},
+            {DType::FLOAT4_E2M1, "torch_npu.float4_e2m1"},
+            {DType::FLOAT4_E1M2, "torch_npu.float4_e1m2"}};
+
+    const auto iter = TYPE_TO_STRING_MAP.find(static_cast<DType>(dType));
+    return iter != TYPE_TO_STRING_MAP.end() ? iter->second : "Unknown dtype";
+}
+
+#ifndef BUILD_LIBTORCH
+PyMethodDef* custom_dtype_functions()
+{
+    return NPUCustomDtypeMethods;
+}
+#endif
+}
diff --git a/torch_npu/csrc/custom_dtype/Init.h b/torch_npu/csrc/custom_dtype/Init.h
new file mode 100644
index 0000000000000000000000000000000000000000..23235a002749d5ea278c353d2ab97e62c047c2c8
--- /dev/null
+++ b/torch_npu/csrc/custom_dtype/Init.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include <string>
+#ifndef BUILD_LIBTORCH
+#include <Python.h>
+#endif
+#include "torch_npu/csrc/core/npu/NPUMacros.h"
+#include "torch_npu/csrc/core/npu/NPUException.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
+#include "third_party/acl/inc/acl/acl_base.h"
+
+namespace c10_npu {
+const int g_toAclOffset = 256;
+
+#define ENUM_OFFSET(new_name, old_name) new_name = static_cast<int>(old_name) + g_toAclOffset,
+
+#ifndef BUILD_LIBTORCH
+TORCH_NPU_API PyMethodDef* custom_dtype_functions();
+#endif
+
+enum class DType {
+    UNDEFINED = -1,
+    ENUM_OFFSET(FLOAT, ACL_FLOAT)
+    ENUM_OFFSET(FLOAT16, ACL_FLOAT16)
+    ENUM_OFFSET(INT8, ACL_INT8)
+    ENUM_OFFSET(INT32, ACL_INT32)
+    ENUM_OFFSET(UINT8, ACL_UINT8)
+    ENUM_OFFSET(INT16, ACL_INT16)
+    ENUM_OFFSET(UINT16, ACL_UINT16)
+    ENUM_OFFSET(UINT32, ACL_UINT32)
+    ENUM_OFFSET(INT64, ACL_INT64)
+    ENUM_OFFSET(UINT64, ACL_UINT64)
+    ENUM_OFFSET(DOUBLE, ACL_DOUBLE)
+    ENUM_OFFSET(BOOL, ACL_BOOL)
+    ENUM_OFFSET(STRING, ACL_STRING)
+    ENUM_OFFSET(COMPLEX64, ACL_COMPLEX64)
+    ENUM_OFFSET(COMPLEX128, ACL_COMPLEX128)
+    ENUM_OFFSET(BF16, ACL_BF16)
+    ENUM_OFFSET(INT4, ACL_INT4)
+    ENUM_OFFSET(UINT1, ACL_UINT1)
+    ENUM_OFFSET(COMPLEX32, ACL_COMPLEX32)
+    ENUM_OFFSET(HIFLOAT8, ACL_HIFLOAT8)
+    ENUM_OFFSET(FLOAT8_E5M2, ACL_FLOAT8_E5M2)
+    ENUM_OFFSET(FLOAT8_E4M3FN, ACL_FLOAT8_E4M3FN)
+    ENUM_OFFSET(FLOAT8_E8M0, ACL_FLOAT8_E8M0)
+    ENUM_OFFSET(FLOAT6_E3M2, ACL_FLOAT6_E3M2)
+    ENUM_OFFSET(FLOAT6_E2M3, ACL_FLOAT6_E2M3)
+    ENUM_OFFSET(FLOAT4_E2M1, ACL_FLOAT4_E2M1)
+    ENUM_OFFSET(FLOAT4_E1M2, ACL_FLOAT4_E1M2)
+};
+
+inline bool IsCustomDType(int64_t t)
+{
+    return t >= g_toAclOffset;
+}
+
+// Both c10_npu::DType and ScalarType are supported
+inline aclDataType GetAclDataType(int64_t t)
+{
+    if (t >= g_toAclOffset) {
+        return static_cast<aclDataType>(t - g_toAclOffset);
+    }
+    return at_npu::native::OpPreparation::convert_to_acl_data_type(
+        static_cast<at::ScalarType>(t));
+}
+
+inline aclDataType GetAclDataType(DType t)
+{
+    return static_cast<aclDataType>(static_cast<int>(t) - g_toAclOffset);
+}
+
+inline at::ScalarType GetATenDType(int64_t t)
+{
+    aclDataType aclType = GetAclDataType(t);
+    return at_npu::native::OpPreparation::convert_to_scalar_type(aclType);
+}
+
+const std::string CustomDataTypeToString(int64_t dType);
+
+} // namespace c10_npu
diff --git a/torch_npu/csrc/custom_dtype/extension.h b/torch_npu/csrc/custom_dtype/extension.h
new file mode 100644
index 0000000000000000000000000000000000000000..91ef1df8a51cdf6929c630a99370020eb7ed59a8
--- /dev/null
+++ b/torch_npu/csrc/custom_dtype/extension.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include "torch_npu/csrc/custom_dtype/Init.h"
+
+namespace c10_npu {
+at::Tensor cast_to_fp8(const at::Tensor &input, int otype);
+
+void cast_to_fp8_noalloc(const at::Tensor &input, at::Tensor output, int otype);
+
+at::Tensor cast_from_fp8(const at::Tensor &input, int itype, int otype);
+}
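Note: the ENUM_OFFSET scheme above keeps the Python-facing DType ids disjoint from both aclDataType and at::ScalarType by shifting every ACL value up by g_toAclOffset. The round trip is plain arithmetic, restated here as a sketch:

    G_TO_ACL_OFFSET = 256      # g_toAclOffset in Init.h
    ACL_FLOAT8_E5M2 = 35       # from acl_base.h earlier in this diff

    dtype_id = ACL_FLOAT8_E5M2 + G_TO_ACL_OFFSET        # 291 == DType.float8_e5m2
    assert dtype_id >= G_TO_ACL_OFFSET                  # IsCustomDType(dtype_id)
    assert dtype_id - G_TO_ACL_OFFSET == ACL_FLOAT8_E5M2  # GetAclDataType round trip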
diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
index 3d91f787bfdfb13871886738f7d3c1cbb100caa9..4b2d28025d6ea104f373a82129c35a377de86e71 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
@@ -3634,7 +3634,7 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::allreduce(
     [&](std::vector<c10_npu::NPUStream>& hcclStreams, c10::intrusive_ptr<ProcessGroupHCCL::WorkHCCL>&) {
         if (tensors[0].scalar_type() == at::kBool || tensors[0].scalar_type() == at::kByte) {
             c10_npu::NPUStreamGuard guard(hcclStreams[0]);
-            tensors_cp[0] = at_npu::native::custom_ops::npu_dtype_cast(tensors[0], at::kInt);
+            tensors_cp[0] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[0], at::kInt);
         }
     },
     [&](std::vector<c10_npu::NPUStream>& hcclStreams, c10::intrusive_ptr<ProcessGroupHCCL::WorkHCCL>&) {
@@ -3812,7 +3812,7 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::allreduce_coalesced(
         for (const auto i : c10::irange(tensors.size())) {
             if (tensors[i].scalar_type() == at::kBool || tensors[i].scalar_type() == at::kByte) {
                 c10_npu::NPUStreamGuard guard(hcclStreams[0]);
-                tensors_cp[i] = at_npu::native::custom_ops::npu_dtype_cast(tensors[i], at::kInt);
+                tensors_cp[i] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[i], at::kInt);
             }
         }
     },
@@ -3876,7 +3876,7 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::reduce(
     [&](std::vector<c10_npu::NPUStream>& hcclStreams, c10::intrusive_ptr<ProcessGroupHCCL::WorkHCCL>&) {
         if (tensors[0].scalar_type() == at::kBool || tensors[0].scalar_type() == at::kByte) {
             c10_npu::NPUStreamGuard guard(hcclStreams[0]);
-            tensors_cp[0] = at_npu::native::custom_ops::npu_dtype_cast(tensors[0], at::kInt);
+            tensors_cp[0] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[0], at::kInt);
         }
     },
     [&](std::vector<c10_npu::NPUStream>& hcclStreams, c10::intrusive_ptr<ProcessGroupHCCL::WorkHCCL>&) {
@@ -3936,11 +3936,11 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::_reduce_oop(
     [&](std::vector<c10_npu::NPUStream>& hcclStreams, c10::intrusive_ptr<ProcessGroupHCCL::WorkHCCL>&) {
         if (inputTensors[0].scalar_type() == at::kBool || inputTensors[0].scalar_type() == at::kByte) {
             c10_npu::NPUStreamGuard guard(hcclStreams[0]);
-            inputTensors[0] = at_npu::native::custom_ops::npu_dtype_cast(inputTensors[0], at::kInt);
+            inputTensors[0] = at_npu::native::custom_ops::_npu_dtype_cast(inputTensors[0], at::kInt);
         }
         if (outputTensors[0].scalar_type() == at::kBool || outputTensors[0].scalar_type() == at::kByte) {
             c10_npu::NPUStreamGuard guard(hcclStreams[0]);
-            outputTensors[0] = at_npu::native::custom_ops::npu_dtype_cast(outputTensors[0], at::kInt);
+            outputTensors[0] = at_npu::native::custom_ops::_npu_dtype_cast(outputTensors[0], at::kInt);
         }
     },
     [&](std::vector<c10_npu::NPUStream>& hcclStreams, c10::intrusive_ptr<ProcessGroupHCCL::WorkHCCL>&) {
@@ -3975,14 +3975,14 @@ at::Tensor ProcessGroupHCCL::byte_alignment(at::Tensor& tensors) const
     if (num_add != 0) {
         bool transflag = false;
         if (inter_tensors.scalar_type() == at::ScalarType::Bool) {
-            inter_tensors = at_npu::native::custom_ops::npu_dtype_cast(inter_tensors, at::ScalarType::Int);
+            inter_tensors = at_npu::native::custom_ops::_npu_dtype_cast(inter_tensors, at::ScalarType::Int);
             transflag = true;
         }
 
         inter_tensors = op_plugin::constant_pad_nd(inter_tensors, {0, num_add}, 0);
 
         if (transflag) {
-            inter_tensors = at_npu::native::custom_ops::npu_dtype_cast(inter_tensors, at::ScalarType::Bool);
+            inter_tensors = at_npu::native::custom_ops::_npu_dtype_cast(inter_tensors, at::ScalarType::Bool);
         }
     }
     return inter_tensors;
diff --git a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp
index 655082b56feb313d99c441c690bbef46a12c6aa1..edebbba53f3dacfa01a87c601db16ccc9c93cf6b 100644
--- a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp
+++ b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp
@@ -423,6 +423,9 @@ void TensorPipeAgent::startImpl()
         priority = opts_.transports->size() - 1 - (iter - opts_.transports->begin());
     }
     std::unique_ptr<TransportRegistration> reg = TensorPipeTransportRegistry()->Create(key);
+    if (reg == nullptr || reg->transport == nullptr) {
+        TORCH_CHECK(false, "TensorPipeTransport registry returned nullptr", DIST_ERROR(ErrCode::PTR));
+    }
     if (!reg->transport->isViable()) {
         continue;
     }
diff --git a/torch_npu/csrc/framework/FormatHelper.cpp b/torch_npu/csrc/framework/FormatHelper.cpp
index 6a92fe5af4d8039b3d0ff9c50e49d1fd5fa30a00..9bd270b8fd231cb39a8bc9b98c8680b88a66e6a2 100644
--- a/torch_npu/csrc/framework/FormatHelper.cpp
+++ b/torch_npu/csrc/framework/FormatHelper.cpp
@@ -52,6 +52,10 @@ std::unordered_map<aclFormat, FormatHelper::FormatInfo> FormatHelper::Initialize
     {ACL_FORMAT_NDC1HWC0, (FormatInfo){ACL_FORMAT_NDC1HWC0, ACL_FORMAT_NCDHW, InferShapeOfNDC1HWC0, "NDC1HWC0", true}},
     {ACL_FRACTAL_Z_3D, (FormatInfo){ACL_FRACTAL_Z_3D, ACL_FORMAT_NCDHW, InferShapeOfFZ3D, "FRACTAL_Z_3D", true}},
+    {ACL_FORMAT_FRACTAL_NZ_C0_16,
+     (FormatInfo){ACL_FORMAT_FRACTAL_NZ_C0_16, ACL_FORMAT_ND, nullptr, "FRACTAL_NZ_C0_16", true}},
+    {ACL_FORMAT_FRACTAL_NZ_C0_32,
+     (FormatInfo){ACL_FORMAT_FRACTAL_NZ_C0_32, ACL_FORMAT_ND, nullptr, "FRACTAL_NZ_C0_32", true}},
 };
};
diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp
index 6b98651c51dba728c9062a47d777650ae7ac93a6..80af05f94b321cac1e928484e18027c1e5cc836b 100644
--- a/torch_npu/csrc/framework/OpCommand.cpp
+++ b/torch_npu/csrc/framework/OpCommand.cpp
@@ -24,7 +24,9 @@ static std::unordered_map<at::ScalarType, std::pair<double, double>> floating_limits_map{
     {at::ScalarType::Double, {std::numeric_limits<double>::max(), std::numeric_limits<double>::min()}},
     {at::ScalarType::Float, {std::numeric_limits<float>::max(), std::numeric_limits<float>::min()}},
     {at::ScalarType::BFloat16, {std::numeric_limits<float>::max(), std::numeric_limits<float>::min()}},
-    {at::ScalarType::Half, {65504, -65504}}};
+    {at::ScalarType::Half, {65504, -65504}},
+    {at::ScalarType::Float8_e5m2, {57344, -57344}},
+    {at::ScalarType::Float8_e4m3fn, {448, -448}}};
 static std::unordered_map<at::ScalarType, std::pair<int64_t, int64_t>> integral_limits_map{
     {at::ScalarType::Long, {std::numeric_limits<int64_t>::max(), std::numeric_limits<int64_t>::min()}},
     {at::ScalarType::Int, {std::numeric_limits<int32_t>::max(), std::numeric_limits<int32_t>::min()}},
@@ -274,7 +276,7 @@ OpCommand& OpCommand::AddTensorInput(at::Tensor &tensor, at::ScalarType forceScaleType)
 {
     std::tuple<aclTensorDesc *, aclDataBuffer *> res;
     if (commonType.has_value() && commonType.value() != tensor.scalar_type()) {
-        tensor = custom_ops::npu_dtype_cast(tensor, commonType.value());
+        tensor = custom_ops::_npu_dtype_cast(tensor, commonType.value());
     }
     // as for dim=0, the dtype of tensor can not be `uint16` because of `TBE`
     if (torch_npu::NPUBridge::GetNpuStorageImplDesc(tensor).storage_sizes_.empty()) {
@@ -331,7 +333,7 @@ OpCommand& OpCommand::AddScalarInput(const c10::Scalar& input, at::ScalarType type)
 OpCommand& OpCommand::AddOutput(at::Tensor &output, const string &realType)
 {
     if (resultTypeDefined == false && commonType.has_value() && commonType.value() != output.scalar_type()) {
-        output = custom_ops::npu_dtype_cast(output, commonType.value());
+        output = custom_ops::_npu_dtype_cast(output, commonType.value());
     }
     auto res = OpCmdHelper::CovertToAclOutput(output, realType);
     aclCmd->AddOutput(std::get<0>(res), std::get<1>(res));
diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp
index f1b9064b6de7cfa89da5e861ed6e4237eda1d39b..aac0ba181419f19e0e2e7dd742e67472f1fdeee3 100644
--- a/torch_npu/csrc/framework/OpParamMaker.cpp
+++ b/torch_npu/csrc/framework/OpParamMaker.cpp
@@ -574,6 +574,7 @@ void *NewFunc(int caption, int &size)
 void DeleteFunc(void *ptr)
 {
     free(ptr);
+    ptr = nullptr;
 }
 
 using Func = int (*)(c10_npu::queue::QueueParas *, aclrtStream);
diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp
index fecbb86f1f0faad206c6b1ec6e77c2e26e15b042..6f52465d1abeed9f4dd7efe6d6d8c56c1fa5e0d6 100644
--- a/torch_npu/csrc/framework/StorageDescHelper.cpp
+++ b/torch_npu/csrc/framework/StorageDescHelper.cpp
@@ -97,6 +97,13 @@ void StorageDescHelper::SetDesc(at::Tensor &dst, const c10::IntArrayRef &size, c
     torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_ = SetDesc(dst.dtype(), size, strides, format);
 }
 
+void StorageDescHelper::SetDesc(at::Tensor &dst, const c10::IntArrayRef &base_size,
+    const c10::IntArrayRef &storage_size, const c10::IntArrayRef &strides, aclFormat format)
+{
+    torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_ =
+        SetDesc(dst.dtype(), base_size, storage_size, strides, format);
+}
+
 bool StorageDescHelper::CheckDescInit(const c10::Storage &storage)
 {
     return torch_npu::NPUBridge::GetNpuStorageImpl(storage.unsafeGetStorageImpl())->npu_desc_.origin_format_ !=
@@ -254,6 +261,22 @@ torch_npu::NPUStorageDesc StorageDescHelper::SetDesc(const caffe2::TypeMeta &dty
     return npu_desc;
 }
 
+torch_npu::NPUStorageDesc StorageDescHelper::SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& base_size,
+    const c10::IntArrayRef& storage_size, const c10::IntArrayRef& strides, aclFormat format)
+{
+    struct torch_npu::NPUStorageDesc npu_desc;
+    npu_desc.data_type_ = dtype;
+    npu_desc.base_sizes_ = base_size;
+    npu_desc.base_strides_ = strides;
+    aclFormat baseFormat;
+    aclFormat npuFormat;
+    std::tie(baseFormat, npuFormat) = InferFormat::GuessFormatUnit(base_size, format);
+    npu_desc.storage_sizes_ = storage_size;
+    npu_desc.origin_format_ = baseFormat;
+    npu_desc.npu_format_ = npuFormat;
+    return npu_desc;
+}
+
 int64_t StorageDescHelper::GetMemorySize(const torch_npu::NPUStorageDesc &dst)
 {
     const auto &physical_size = FormatHelper::GetStorageSizes(dst);
diff --git a/torch_npu/csrc/framework/StorageDescHelper.h b/torch_npu/csrc/framework/StorageDescHelper.h
index 5c16ee74e2dc97165d97cd1d3d15857948b7457f..37b8933c1a67a4e687de68220c7e31097211a2e5 100644
--- a/torch_npu/csrc/framework/StorageDescHelper.h
+++ b/torch_npu/csrc/framework/StorageDescHelper.h
@@ -35,6 +35,8 @@ public:
     static void SetDesc(at::Tensor &dst, const c10::IntArrayRef& size, const c10::IntArrayRef& strides);
     static void SetDesc(at::Tensor &dst, const c10::IntArrayRef &size, const c10::IntArrayRef &strides,
                         aclFormat format);
+    static void SetDesc(at::Tensor &dst, const c10::IntArrayRef &base_size,
+                        const c10::IntArrayRef &storage_size, const c10::IntArrayRef &strides, aclFormat format);
     static bool CheckDescInit(const c10::Storage &storage);
 
     // For Serialization to Get and Set NpuStorageDesc
@@ -63,6 +65,8 @@ private:
                                                const c10::IntArrayRef& strides);
     static torch_npu::NPUStorageDesc SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& size,
                                              const c10::IntArrayRef& strides, aclFormat format);
+    static torch_npu::NPUStorageDesc SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& base_size,
+                                             const c10::IntArrayRef& storage_size, const c10::IntArrayRef& strides, aclFormat format);
 };
 
 } // namespace native
diff --git a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp
index c2abf7f4b2ae45e57a603f91ea3480e9519e4b1f..ee90387910967e7113f0153b0a8aea3099c0cb50 100644
--- a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp
@@ -70,6 +70,14 @@ private:
             ResetDataPtr(src, self,
                 static_cast<at::BFloat16 *>(src.storage().data_ptr().get()));
             return true;
+        case at::ScalarType::Float8_e5m2:
+            ResetDataPtr(src, self,
+                static_cast<at::Float8_e5m2 *>(src.storage().data_ptr().get()));
+            return true;
+        case at::ScalarType::Float8_e4m3fn:
+            ResetDataPtr(src, self,
+                static_cast<at::Float8_e4m3fn *>(src.storage().data_ptr().get()));
+            return true;
         default:
             // Turn to conducting d2dCopyAsync for other dtypes.
             return false;
diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp
index c2bb14ca66b8f807972e3f4dbf6f92b0a013788f..453a7082da2b271aa2e5adc35ae34b7d9b6b756e 100644
--- a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp
+++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp
@@ -52,8 +52,8 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(ENUM_PAIR_FUNC)
     _(at::ScalarType::Bits4x2, ACL_DT_UNDEFINED)      \
     _(at::ScalarType::Bits8, ACL_DT_UNDEFINED)        \
     _(at::ScalarType::Bits16, ACL_DT_UNDEFINED)       \
-    _(at::ScalarType::Float8_e5m2, ACL_DT_UNDEFINED)  \
-    _(at::ScalarType::Float8_e4m3fn, ACL_DT_UNDEFINED) \
+    _(at::ScalarType::Float8_e5m2, ACL_FLOAT8_E5M2)   \
+    _(at::ScalarType::Float8_e4m3fn, ACL_FLOAT8_E4M3FN) \
     _(at::ScalarType::Undefined, ACL_DT_UNDEFINED)    \
     _(at::ScalarType::NumOptions, ACL_DT_UNDEFINED)
 
@@ -74,6 +74,37 @@ AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(ENUM_PAIR_FUNC)
 static std::map<std::string, aclDataType> STRING_SCALAR_TYPE_TO_ACL_TYPE_MAP = {
     {"uint16", ACL_UINT16}, {"uint8", ACL_UINT8}, {"uint64", ACL_UINT64}, {"string", ACL_STRING}};
 
+// at::ScalarType::UInt16/UInt32/UInt64 will be supported after v2.1.0
+static std::unordered_map<aclDataType, at::ScalarType>
+    ACL_TYPE_TO_SCALAR_TYPE_MAP = {{ACL_DT_UNDEFINED, at::ScalarType::Undefined},
+                                   {ACL_FLOAT, at::ScalarType::Float},
+                                   {ACL_FLOAT16, at::ScalarType::Half},
+                                   {ACL_INT8, at::ScalarType::Char},
+                                   {ACL_INT32, at::ScalarType::Int},
+                                   {ACL_UINT8, at::ScalarType::Byte},
+                                   {ACL_INT16, at::ScalarType::Short},
+                                   {ACL_UINT16, at::ScalarType::Undefined},
+                                   {ACL_UINT32, at::ScalarType::Undefined},
+                                   {ACL_INT64, at::ScalarType::Long},
+                                   {ACL_UINT64, at::ScalarType::Undefined},
+                                   {ACL_DOUBLE, at::ScalarType::Double},
+                                   {ACL_BOOL, at::ScalarType::Bool},
+                                   {ACL_STRING, at::ScalarType::Undefined},
+                                   {ACL_COMPLEX64, at::ScalarType::ComplexFloat},
+                                   {ACL_COMPLEX128, at::ScalarType::ComplexDouble},
+                                   {ACL_BF16, at::ScalarType::BFloat16},
+                                   {ACL_INT4, at::ScalarType::Undefined},
+                                   {ACL_UINT1, at::ScalarType::Undefined},
+                                   {ACL_COMPLEX32, at::ScalarType::ComplexHalf},
+                                   {ACL_HIFLOAT8, at::ScalarType::Byte},
+                                   {ACL_FLOAT8_E5M2, at::ScalarType::Float8_e5m2},
+                                   {ACL_FLOAT8_E4M3FN, at::ScalarType::Float8_e4m3fn},
+                                   {ACL_FLOAT8_E8M0, at::ScalarType::Byte},
+                                   {ACL_FLOAT6_E3M2, at::ScalarType::Byte},
+                                   {ACL_FLOAT6_E2M3, at::ScalarType::Byte},
+                                   {ACL_FLOAT4_E2M1, at::ScalarType::Byte},
+                                   {ACL_FLOAT4_E1M2, at::ScalarType::Byte}};
+
 aclError AclrtMemcpyAsyncParamCheck(
     void *dst, size_t destMax, const void *src, size_t count, aclrtMemcpyKind kind, aclrtStream stream)
 {
@@ -291,5 +322,17 @@ int8_t CalcuOpUtil::GetCubeMathType(bool allowHf32)
     return iter->second;
 }
 
+at::ScalarType CalcuOpUtil::ConvertToScalarType(const aclDataType data_type)
+{
+    auto iter = ACL_TYPE_TO_SCALAR_TYPE_MAP.find(data_type);
+    if (iter == ACL_TYPE_TO_SCALAR_TYPE_MAP.end()) {
+        TORCH_CHECK(false,
+            std::string("aclDataType:") + std::to_string(data_type) + " is not supported",
+            OPS_ERROR(ErrCode::NOT_SUPPORT))
+    }
+
+    return iter->second;
+}
+
 } // namespace native
 } // namespace at_npu
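Note: the fp8 scalar limits wired into OpCommand's tables earlier in this diff (448 for float8_e4m3fn, 57344 for float8_e5m2) match PyTorch's own dtype metadata and can be checked directly:

    import torch

    print(torch.finfo(torch.float8_e4m3fn).max)  # 448.0
    print(torch.finfo(torch.float8_e5m2).max)    # 57344.0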
diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.h b/torch_npu/csrc/framework/utils/CalcuOpUtil.h
index b06ab06f9053fe7c46207789a95984eaa34610af..9a4a8024435cf77db1a1aba49e22cf73b580062f 100644
--- a/torch_npu/csrc/framework/utils/CalcuOpUtil.h
+++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.h
@@ -86,6 +86,7 @@ public:
     static int64_t GetTensorNpuFormat(const at::Tensor &tensor);
     static c10::SmallVector<int64_t, N> ConvertIntArrayRefToSmallVector(c10::IntArrayRef intArray);
     static int8_t GetCubeMathType(bool allowHf32);
+    static at::ScalarType ConvertToScalarType(const aclDataType data_type);
 };
 
 } // namespace native
diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp
index 20f357c654b94a8c618ab339f68a68eeed8b67b6..530d359df23006ef4e099c7da9d39578632357ef 100644
--- a/torch_npu/csrc/framework/utils/OpPreparation.cpp
+++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp
@@ -102,6 +102,11 @@ aclDataType OpPreparation::convert_to_acl_data_type(const at::ScalarType &data_type)
     return CalcuOpUtil::ConvertToAclDataType(data_type, realDataType);
 }
 
+at::ScalarType OpPreparation::convert_to_scalar_type(const aclDataType data_type)
+{
+    return CalcuOpUtil::ConvertToScalarType(data_type);
+}
+
 at::Tensor OpPreparation::copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type)
 {
     return CalcuOpUtil::CopyScalarToDevice(cpu_scalar, scalar_data_type);
diff --git a/torch_npu/csrc/framework/utils/OpPreparation.h b/torch_npu/csrc/framework/utils/OpPreparation.h
index 74ac30389872e4c0c8cb7da7a1ae3d7c2d4e075c..e87a91011218a4aa55b3f5187523af97ba1226f6 100644
--- a/torch_npu/csrc/framework/utils/OpPreparation.h
+++ b/torch_npu/csrc/framework/utils/OpPreparation.h
@@ -22,6 +22,7 @@ public:
     // From CalcuOpUtil part
     static aclDataType convert_to_acl_data_type(const at::ScalarType &data_type);
     static aclDataType convert_to_acl_data_type(const at::ScalarType &data_type, const std::string &realDataType);
+    static at::ScalarType convert_to_scalar_type(const aclDataType data_type);
     static at::Tensor copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type);
     static at::Tensor copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type,
                                             const c10::Device device);
diff --git a/torch_npu/csrc/npu/DataParallelComm.cpp b/torch_npu/csrc/npu/DataParallelComm.cpp
index db0d3efabefc96ca39c8bcaad354ed07b159bd38..c744e1e1baf961dbfa42de031c4c371c9be22672 100644
--- a/torch_npu/csrc/npu/DataParallelComm.cpp
+++ b/torch_npu/csrc/npu/DataParallelComm.cpp
@@ -137,7 +137,7 @@ void check_inputs(TensorList inputs, TensorList outputs, int input_multiplier, int output_multiplier)
 {
     // need to check len(inputs) == len(outputs)
     size_t len = inputs.size();
 
-    if (len <= 0) {
+    if (len == 0) {
         throw std::runtime_error("input sequence can't be empty" + PTA_ERROR(ErrCode::PARAM));
     }
diff --git a/torch_npu/onnx/wrapper_onnx_ops.py b/torch_npu/onnx/wrapper_onnx_ops.py
index a261d1785911b154eae6ee0a18e82b236de76921..68113cb5effad5f857c41dccfab7d8d280842b05 100644
--- a/torch_npu/onnx/wrapper_onnx_ops.py
+++ b/torch_npu/onnx/wrapper_onnx_ops.py
@@ -244,8 +244,8 @@ class _NPUFormatCastOP(torch.autograd.Function):
         return torch.ops.npu.npu_format_cast(*args, **kwargs)
 
     @staticmethod
-    def symbolic(g, self: Tensor, acl_format: int):
-        return g.op("npu::NPUFormatCast", self, acl_format_i=acl_format)
+    def symbolic(g, self: Tensor, acl_format: int, customize_dtype: int = None):
g.op("npu::NPUFormatCast", self, acl_format_i=acl_format, customize_dtype_i=customize_dtype) class _NPUSoftmaxCrossEntropyWithLogitsOP(torch.autograd.Function): @@ -1097,8 +1097,8 @@ def _wrapper_npu_deformable_conv2d(inputs, weight, offset, bias, kernel_size, st padding, dilation, groups, deformable_groups, modulated) -def _wrapper_npu_format_cast(self, acl_format): - return _NPUFormatCastOP.apply(self, acl_format) +def _wrapper_npu_format_cast(self, acl_format, customize_dtype=None): + return _NPUFormatCastOP.apply(self, acl_format, customize_dtype) def _wrapper_npu_softmax_cross_entropy_with_logits(self, labels):