diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad39472ed6b82729c9286e3b4ad3e54b08a47e45..d38f3d95dccc682493c9d4a78de9f8d387e09c0a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -246,6 +246,7 @@ add_subdirectory(${TORCHNPU_ROOT}/core)
 add_subdirectory(${TORCHNPU_ROOT}/framework)
 add_subdirectory(${TORCHNPU_ROOT}/flopcount)
 add_subdirectory(${TORCHNPU_ROOT}/logging)
+add_subdirectory(${TORCHNPU_ROOT}/custom_dtype)
 
 if (NOT DEFINED BUILD_LIBTORCH)
     add_subdirectory(${TORCHNPU_ROOT}/distributed)
@@ -272,10 +273,10 @@ if (DEFINED BUILD_TENSORPIPE)
 endif()
 
 if (DEFINED BUILD_LIBTORCH)
-    set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${FLOP_SRCS} ${FRAMEWORK_SRCS} ${LOGGING_SRCS} ${NPU_CPP_LIBS_SRCS})
+    set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${FLOP_SRCS} ${CUS_DTYPE_SRCS} ${FRAMEWORK_SRCS} ${LOGGING_SRCS} ${NPU_CPP_LIBS_SRCS})
 else()
     # Compile code with pybind11
-    set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${LOGGING_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${UTILS_SRCS} ${SAN_SRCS})
+    set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${CUS_DTYPE_SRCS} ${LOGGING_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${UTILS_SRCS} ${SAN_SRCS})
 endif()
 
 add_library(${PLUGIN_NAME} SHARED ${CPP_SRCS})
diff --git a/codegen/gen_backend_stubs.py b/codegen/gen_backend_stubs.py
index bdb6c48a13a73aed84172c65409c5e65f42201fd..248704d49282c0d2a54410cc092c5abeb61d3846 100644
--- a/codegen/gen_backend_stubs.py
+++ b/codegen/gen_backend_stubs.py
@@ -395,6 +395,8 @@ def gen_dispatcher_registrations(
     ns_helper = NamespaceHelper(namespace_str="at")
     native_func_header = """\
 #include "torch_npu/csrc/core/npu/NPURecovery.h"
+#include "torch_npu/csrc/core/npu/NpuVariables.h"
+#include "torch_npu/csrc/core/npu/NPUException.h"
 #ifndef BUILD_LIBTORCH
 #include "torch_npu/csrc/profiler/utils.h"
 #endif
diff --git a/codegen/utils.py b/codegen/utils.py
index 187f02fc9dea81f99c8c4c624840273ec3f0f3f4..1df2bfcd01e2b50bed0ec1e26a80c79959f7f777 100644
--- a/codegen/utils.py
+++ b/codegen/utils.py
@@ -401,6 +401,7 @@ const DeviceGuard device_guard(device_or_default(device));"""
         device_guard = f"const OptionalDeviceGuard device_guard(device_of({device_of}));"
 
     op_key = str(f.func.name)
+    is_ascend910_xx_version = "c10_npu::IsAscend910_xxVersion()"
     if enable_opplugin():
         if op_key in GLOBAL_STRUCTURED_OP_INFO_CACHE:
             impl_name = f"op_plugin::{GLOBAL_STRUCTURED_OP_INFO_CACHE[op_key]}"
@@ -472,6 +473,11 @@ if (C10_UNLIKELY(at_npu::native::env::CheckOpHookEnable())) {{
 if (({force_aclnn} || at_npu::native::env::CheckJitDisable()){tensor_check_str}) {{
     return {op_api_impl_name}({args_exprs_str});
 }} else {{
+    if ({is_ascend910_xx_version}) {{
+        TORCH_CHECK(false,
+            "Ascend910_xx series only supports aclnn operators, and the current operator {impl_name} does not support internal formats.",
+            PTA_ERROR(ErrCode::NOT_SUPPORT));
+    }}
     return {impl_name}({args_exprs_str});
 }}
 """
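Note: for each op that still has a non-aclnn fallback, the template above expands into a C++ guard ahead of the fallback call. A minimal Python sketch of the string the generator assembles (the impl_name and args_exprs_str values below are illustrative placeholders, not a real op):

    impl_name = "acl_op::some_op"           # illustrative placeholder
    args_exprs_str = "self, other"          # illustrative placeholder
    is_ascend910_xx_version = "c10_npu::IsAscend910_xxVersion()"

    guard = f"""
    if ({is_ascend910_xx_version}) {{
        TORCH_CHECK(false,
            "Ascend910_xx series only supports aclnn operators, and the current operator {impl_name} does not support internal formats.",
            PTA_ERROR(ErrCode::NOT_SUPPORT));
    }}
    return {impl_name}({args_exprs_str});
    """
    print(guard)  # the guard is spliced into the generated dispatcher body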
"npu_grouped_matmul", "npu_moe_finalize_routing", @@ -2552,6 +2554,9 @@ "npu_scatter_nd_update_", "npu_stride_copy", "npu_gemma_rms_norm", + "npu_dynamic_mx_quant", + "npu_grouped_dynamic_mx_quant", + "npu_dtype_cast", "npu_swiglu", "npu_gelu", "npu_gelu_backward", diff --git a/test/npu/test_tensors.py b/test/npu/test_tensors.py index 237d6a1aee31c3013fd8515992bffd4a16362dfe..044a7f7d363d93f3bd9cd6bf6dd4d439f83e14ac 100644 --- a/test/npu/test_tensors.py +++ b/test/npu/test_tensors.py @@ -1,4 +1,5 @@ from copy import deepcopy +import unittest import numpy as np import torch import torch_npu @@ -22,6 +23,16 @@ types = [ ] +def skipIfUnsupport910_xx(): + def skip_dec(func): + def wrapper(self): + if "Ascend910_xx" not in torch_npu.npu.get_device_name(): + return unittest.SkipTest("Device 910_xx condition not satisfied") + return func(self) + return wrapper + return skip_dec + + def get_npu_type(type_name): if isinstance(type_name, type): type_name = '{}.{}'.format(type_name.__module__, type_name.__name__) @@ -383,5 +394,16 @@ class TestViewOps(TestCase): self.assertEqual(tensor.view(3, -1).size(), target) +class TestTensorDtype(TestCase): + @skipIfUnsupport910_xx() + def test_fp8(self): + tensor1 = torch.randn([2, 2], dtype=torch.float32).npu() + tensor2 = torch.randn([2, 2], dtype=torch.float32).npu() + tensor_f8e5m2 = tensor1.to(torch.float8_e5m2) + tensor_f8e4m3fn = tensor2.to(torch.float8_e4m3fn) + self.assertEqual(tensor_f8e5m2.dtype, torch.float8_e5m2) + self.assertEqual(tensor_f8e4m3fn.dtype, torch.float8_e4m3fn) + + if __name__ == "__main__": run_tests() diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 65908c94c1e77b76a5a9b53aa490eaeba20cec5e..1e5b2151c35c8aa44c3827ff4030b622f2b89ab0 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2588,6 +2588,9 @@ "torch_npu.npu_dynamic_quant_asymmetric": { "signature": "(input_dummy, smooth_scales=None, group_index=None, dst_type=torch.int8)" }, + "torch_npu.npu_dynamic_mx_quant": { + "signature": "(*args, **kwargs)" + }, "torch_npu.npu_group_quant": { "signature": "(x, scale, group_index, offset=None, dst_dtype=None)" }, @@ -2595,7 +2598,7 @@ "signature": "(*args, **kwargs)" }, "torch_npu.npu_format_cast": { - "signature": "(self, acl_format)" + "signature": "(self, acl_format, customize_dtype=None)" }, "torch_npu.npu_format_cast_": { "signature": "(*args, **kwargs)" @@ -2835,16 +2838,16 @@ "signature": "(int[] size, *, ScalarType? dtype=None, Device? device=None) -> Tensor" }, "func: npu_format_cast": { - "signature": "(Tensor self, int acl_format) -> Tensor" + "signature": "(Tensor self, int acl_format, int? customize_dtype=None) -> Tensor" }, "func: npu_format_cast_": { - "signature": "(Tensor(a!) self, Tensor src) -> Tensor(a!)" + "signature": "(Tensor(a!) self, Tensor src, int? customize_dtype=None) -> Tensor(a!)" }, "func: npu_format_cast_.acl_format": { - "signature": "(Tensor(a!) self, int acl_format) -> Tensor(a!)" + "signature": "(Tensor(a!) self, int acl_format, int? customize_dtype=None) -> Tensor(a!)" }, "func: npu_format_cast.Tensor": { - "signature": "(Tensor self, Tensor dst) -> Tensor" + "signature": "(Tensor self, Tensor dst, int? 
diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json
index 65908c94c1e77b76a5a9b53aa490eaeba20cec5e..1e5b2151c35c8aa44c3827ff4030b622f2b89ab0 100644
--- a/test/torch_npu_schema.json
+++ b/test/torch_npu_schema.json
@@ -2588,6 +2588,9 @@
     "torch_npu.npu_dynamic_quant_asymmetric": {
         "signature": "(input_dummy, smooth_scales=None, group_index=None, dst_type=torch.int8)"
     },
+    "torch_npu.npu_dynamic_mx_quant": {
+        "signature": "(*args, **kwargs)"
+    },
     "torch_npu.npu_group_quant": {
         "signature": "(x, scale, group_index, offset=None, dst_dtype=None)"
     },
@@ -2595,7 +2598,7 @@
         "signature": "(*args, **kwargs)"
     },
     "torch_npu.npu_format_cast": {
-        "signature": "(self, acl_format)"
+        "signature": "(self, acl_format, customize_dtype=None)"
     },
     "torch_npu.npu_format_cast_": {
         "signature": "(*args, **kwargs)"
    },
@@ -2835,16 +2838,16 @@
         "signature": "(int[] size, *, ScalarType? dtype=None, Device? device=None) -> Tensor"
     },
     "func: npu_format_cast": {
-        "signature": "(Tensor self, int acl_format) -> Tensor"
+        "signature": "(Tensor self, int acl_format, int? customize_dtype=None) -> Tensor"
     },
     "func: npu_format_cast_": {
-        "signature": "(Tensor(a!) self, Tensor src) -> Tensor(a!)"
+        "signature": "(Tensor(a!) self, Tensor src, int? customize_dtype=None) -> Tensor(a!)"
     },
     "func: npu_format_cast_.acl_format": {
-        "signature": "(Tensor(a!) self, int acl_format) -> Tensor(a!)"
+        "signature": "(Tensor(a!) self, int acl_format, int? customize_dtype=None) -> Tensor(a!)"
     },
     "func: npu_format_cast.Tensor": {
-        "signature": "(Tensor self, Tensor dst) -> Tensor"
+        "signature": "(Tensor self, Tensor dst, int? customize_dtype=None) -> Tensor"
     },
     "func: npu_change_data_ptr": {
         "signature": "(Tensor dst, Tensor src, int index) -> int"
     },
@@ -2864,6 +2867,9 @@
     "func: _npu_format_cast": {
         "signature": "(Tensor self, int acl_format) -> Tensor"
     },
+    "func: _npu_format_cast.aclnn": {
+        "signature": "(Tensor self, int acl_format, int customize_dtype) -> Tensor"
+    },
     "torch_npu_public_env: INF_NAN_MODE_ENABLE": {
         "mode": "std::unordered_map<int, std::string> infNanMode = {{0, \"max\"}, {1, \"inf_nan\"}}"
     },
diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h
index cbcf87b0fc061294c5fb26ace98900db789f8c2a..b9c7346d06cb4a8df03cd6734f6f9e741a3f64ce 100755
--- a/third_party/acl/inc/acl/acl_base.h
+++ b/third_party/acl/inc/acl/acl_base.h
@@ -164,6 +164,14 @@ typedef enum {
     ACL_INT4 = 29,
     ACL_UINT1 = 30,
     ACL_COMPLEX32 = 33,
+    ACL_HIFLOAT8 = 34,
+    ACL_FLOAT8_E5M2 = 35,
+    ACL_FLOAT8_E4M3FN = 36,
+    ACL_FLOAT8_E8M0 = 37,
+    ACL_FLOAT6_E3M2 = 38,
+    ACL_FLOAT6_E2M3 = 39,
+    ACL_FLOAT4_E2M1 = 40,
+    ACL_FLOAT4_E1M2 = 41,
 } aclDataType;
 
 typedef enum {
@@ -182,6 +190,8 @@ typedef enum {
     ACL_FRACTAL_Z_3D = 33,
     ACL_FORMAT_NC = 35,
     ACL_FORMAT_NCL = 47,
+    ACL_FORMAT_FRACTAL_NZ_C0_16 = 50,
+    ACL_FORMAT_FRACTAL_NZ_C0_32 = 51,
 } aclFormat;
 
 typedef enum {
diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp
index 05ef7980b701e5b54e4a2be9a74d34dc6a4ed7b7..3e4a8e5fbc0cfef4b42b85f916cd6c2999bf6aab 100644
--- a/torch_npu/csrc/InitNpuBindings.cpp
+++ b/torch_npu/csrc/InitNpuBindings.cpp
@@ -15,6 +15,7 @@
 #include "torch_npu/csrc/flopcount/Init.h"
 #include "torch_npu/csrc/logging/Init.h"
 #include "torch_npu/csrc/npu/Module.h"
+#include "torch_npu/csrc/custom_dtype/Init.h"
 #include "torch_npu/csrc/npu/Stress_detect.h"
 #include "torch_npu/csrc/utils/TensorType.h"
 #include "torch_npu/csrc/utils/AutocastMode.h"
@@ -167,6 +168,7 @@ PyObject* initModule()
     AddPyMethodDefs(methods, torch_npu::autocast::autocast_mode_functions());
     AddPyMethodDefs(methods, torch_npu::flopcount::flops_count_functions());
     AddPyMethodDefs(methods, torch_npu::logging::logging_functions());
+    AddPyMethodDefs(methods, c10_npu::custom_dtype_functions());
     static struct PyModuleDef torchnpu_module = {
         PyModuleDef_HEAD_INIT,
         "torch_npu._C",
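Note: with the schema change above, the extended format-cast entry point takes an optional customized dtype alongside the ACL format id. A usage sketch (assumptions: an Ascend910_xx-class device with the aclnnNpuFormatCast path available, format id 50 being ACL_FORMAT_FRACTAL_NZ_C0_16 from the acl_base.h hunk, and the DType constants coming from the custom_dtype bindings added later in this diff):

    import torch
    import torch_npu

    x = torch.randn(16, 16).npu()
    # Cast to the new NZ layout while reinterpreting storage as hifloat8.
    y = torch_npu.npu_format_cast(x, 50, customize_dtype=torch_npu._C._cd.DType.hifloat8)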
diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
index 0c4000e52458682f8fa8ede2cde47b6421637d5d..f7ead13ccca965313b84e54adfcb6a6a95d0acf9 100644
--- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
+++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
@@ -1,16 +1,130 @@
 #include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/framework/utils/NpuStorageOffsetGuard.h"
+#include "torch_npu/csrc/framework/StorageDescHelper.h"
 #include "torch_npu/csrc/aten/common/FormatCastHelper.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "torch_npu/csrc/core/NPUBridge.h"
 #include "torch_npu/csrc/core/NPUStorageImpl.h"
+#include "torch_npu/csrc/core/npu/NpuVariables.h"
 #include "torch_npu/csrc/aten/CustomFunctions.h"
+#include "torch_npu/csrc/custom_dtype/Init.h"
+#include "third_party/op-plugin/op_plugin/utils/op_api_common.h"
 
 namespace at_npu {
 namespace native {
 
 using tensor_list = std::vector<at::Tensor>;
+using GetFormatFunc = int (*)(const aclTensor *, const int, const int, int64_t **, uint64_t *, int *);
+
+std::tuple<bool, int, at::SmallVector<int64_t, 8>> MaybeUseAclnnNpuFormatCast(const at::Tensor& src,
+    int64_t acl_format, c10::optional<int64_t> customize_dtype)
+{
+    const static auto GetFormatFuncAddr = GetOpApiFuncAddr("aclnnNpuFormatCastCalculateSizeAndFormat");
+    const static auto FormatCastFuncAddr = GetOpApiFuncAddr("aclnnNpuFormatCast");
+
+    const static bool aclnnNpuFormatCastExist =
+        (GetFormatFuncAddr == nullptr || FormatCastFuncAddr == nullptr) ? false : true;
+
+    GetFormatFunc GetFormat = reinterpret_cast<GetFormatFunc>(GetFormatFuncAddr);
+    int64_t *dstStorageShape = nullptr;
+    uint64_t dstShapeSize = 0;
+    int dstFormat = 0;
+    at::SmallVector<int64_t, 8> outputShape = {};
+    aclDataType customizeAcltype = (customize_dtype.has_value()) ?
+        c10_npu::GetAclDataType(customize_dtype.value()) :
+        at_npu::native::OpPreparation::convert_to_acl_data_type(src.scalar_type());
+
+    if (c10_npu::IsAscend910_xxVersion()) {
+        if (aclnnNpuFormatCastExist) {
+            auto api_ret = GetFormat(ConvertType(src), acl_format, customizeAcltype, &dstStorageShape,
+                                     &dstShapeSize, &dstFormat);
+            NPU_CHECK_ERROR(api_ret, "aclnnNpuFormatCastCalculateSizeAndFormat");
+            for (uint64_t i = 0; i < dstShapeSize; i++) {
+                outputShape.push_back(dstStorageShape[i]);
+            }
+            delete[] dstStorageShape;
+            return std::make_tuple(true, dstFormat, outputShape);
+        }
+        TORCH_CHECK(false,
+            "aclnnNpuFormatCast does not exist, and the Ascend910_xx series only supports aclnn operators.",
+            PTA_ERROR(ErrCode::NOT_SUPPORT));
+    }
+    if (at_npu::native::env::CheckJitDisable()) {
+        if (aclnnNpuFormatCastExist) {
+            auto api_ret = GetFormat(ConvertType(src), acl_format, customizeAcltype, &dstStorageShape,
+                                     &dstShapeSize, &dstFormat);
+            if (api_ret != 0) {
+                if (customize_dtype.has_value()) {
+                    NPU_CHECK_ERROR(api_ret, "aclnnNpuFormatCastCalculateSizeAndFormat");
+                }
+                return std::make_tuple(false, dstFormat, outputShape);
+            }
+            for (uint64_t i = 0; i < dstShapeSize; i++) {
+                outputShape.push_back(dstStorageShape[i]);
+            }
+            delete[] dstStorageShape;
+            return std::make_tuple(true, dstFormat, outputShape);
+        } else {
+            if (C10_UNLIKELY(customize_dtype.has_value())) {
+                TORCH_CHECK(false,
+                    "customize_dtype is not supported while aclnnNpuFormatCast does not exist.",
+                    PTA_ERROR(ErrCode::NOT_SUPPORT));
+            }
+            return std::make_tuple(false, dstFormat, outputShape);
+        }
+    } else {
+        if (C10_UNLIKELY(customize_dtype.has_value())) {
+            TORCH_CHECK(false,
+                "customize_dtype is not supported while jit_compile=True.",
+                PTA_ERROR(ErrCode::NOT_SUPPORT));
+        }
+        return std::make_tuple(false, dstFormat, outputShape);
+    }
+}
+
+at::Tensor create_tensor_with_format_and_shape(c10::IntArrayRef baseSizes,
+    c10::IntArrayRef storageSizes, const caffe2::TypeMeta dtype, int64_t acl_format)
+{
+    c10::Allocator *allocator = c10_npu::NPUCachingAllocator::get();
+    int64_t nelements = 1;
+    for (const auto& num : storageSizes) {
+        nelements *= num;
+    }
+    int64_t size_bytes = nelements * dtype.itemsize();
+    c10::intrusive_ptr<c10::StorageImpl> storage_impl = torch_npu::make_npu_storage_impl(
+        c10::StorageImpl::use_byte_size_t(),
+        c10::SymInt(size_bytes),
+        allocator,
+        true);
+    auto tensor = at::detail::make_tensor<torch_npu::NPUTensorImpl>(storage_impl, dtype);
+
+    if (baseSizes.size() != 1 || baseSizes[0] != 0) {
+        tensor.unsafeGetTensorImpl()->set_sizes_contiguous(baseSizes);
+    }
+    tensor.unsafeGetTensorImpl()->empty_tensor_restride(c10::MemoryFormat::Contiguous);
+    StorageDescHelper::SetDesc(tensor, baseSizes, storageSizes, tensor.strides(), static_cast<aclFormat>(acl_format));
+    return tensor;
+}
+at::Tensor format_cast_impl_out_npu_aclnn(const at::Tensor& src,
+    int64_t acl_format, c10::IntArrayRef storageSizes)
+{
+    auto src_new = src.contiguous();
+    auto src_new_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src_new)->npu_desc_;
+
+    at::Tensor dst = create_tensor_with_format_and_shape(
+        src_new_desc.base_sizes_, storageSizes, src.dtype(), acl_format);
+
+    // calculate the output result of the NPU
+    EXEC_NPU_CMD(aclnnNpuFormatCast, src_new, dst);
+
+    // format cast only changes the physical layout of the base tensor; the
+    // view tensor's metadata remains unchanged
+    dst.set_(dst.storage(), src_new.storage_offset(), src_new.sizes(), src_new.strides());
+    return dst;
+}
 
 at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src)
 {
@@ -36,7 +150,8 @@ at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src)
 }
 
 // convert src from src_format to dst_format, write the result into dst(self)
-at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Tensor& src)
+at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Tensor& src,
+    c10::optional<int64_t> customize_dtype)
 {
     torch_npu::utils::torch_check_npu(self);
     torch_npu::utils::torch_check_npu(src);
@@ -47,6 +162,13 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Tensor& src,
         return self;
     }
 
+    auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, dst_desc.npu_format_, customize_dtype);
+    if (useAclnn == true) {
+        at::Tensor dst = format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape);
+        self.set_(dst.storage(), dst.storage_offset(), dst.sizes(), dst.strides());
+        return self;
+    }
+
     // calculate the output result of the NPU
     format_cast_impl_out_npu(self, src);
 
@@ -59,16 +181,6 @@ at::Tensor npu_format_cast_impl(
     int64_t acl_format)
 {
     auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_;
-    if (src_desc.npu_format_ == acl_format) {
-        ASCEND_LOGD("no need to do format cast");
-        return src;
-    }
-    if (FormatHelper::IsBaseFormatType(src) &&
-        FormatHelper::IsBaseFormatType(static_cast<aclFormat>(acl_format))) {
-        FormatCastHelper::format_cast_as_base_format(src, static_cast<aclFormat>(acl_format));
-        return src;
-    }
-
     at::Tensor dst = OpPreparation::ApplyTensorWithFormat(
         src_desc.base_sizes_, src.options(), acl_format);
@@ -84,18 +196,20 @@ at::Tensor npu_format_cast_impl(
 // conver self to dst'format, write the result into new result tensor
 at::Tensor NPUNativeFunctions::npu_format_cast(
     const at::Tensor& self,
-    const at::Tensor& dst)
+    const at::Tensor& dst,
+    c10::optional<int64_t> customize_dtype)
 {
     torch_npu::utils::torch_check_npu(dst);
     auto dst_desc = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_;
     int64_t dst_format = dst_desc.npu_format_;
-    return custom_ops::npu_format_cast(self, dst_format);
+    return custom_ops::npu_format_cast(self, dst_format, customize_dtype);
 }
 
 // conver self to acl_format, write the result into self
 at::Tensor& NPUNativeFunctions::npu_format_cast_(
     at::Tensor& self,
-    int64_t acl_format)
+    int64_t acl_format,
+    c10::optional<int64_t> customize_dtype)
 {
     torch_npu::utils::torch_check_npu(self);
     auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_;
@@ -108,6 +222,13 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(
         return self;
     }
 
+    auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, customize_dtype);
+    if (useAclnn == true) {
+        at::Tensor dst = format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape);
+        self.set_(dst.storage(), dst.storage_offset(), dst.sizes(), dst.strides());
+        return self;
+    }
+
     at::Tensor dst = OpPreparation::ApplyTensorWithFormat(
         src_desc.base_sizes_, self.options(), acl_format);
@@ -130,16 +251,54 @@ int64_t NPUNativeFunctions::get_npu_format(const at::Tensor& self)
 at::Tensor NPUNativeFunctions::_npu_format_cast(const at::Tensor& self, int64_t acl_format)
 {
-    return npu_format_cast_impl(self, acl_format);
+    auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_;
+    if (src_desc.npu_format_ == acl_format) {
+        ASCEND_LOGD("no need to do format cast");
+        return self;
+    }
+    if (FormatHelper::IsBaseFormatType(self) &&
+        FormatHelper::IsBaseFormatType(static_cast<aclFormat>(acl_format))) {
+        FormatCastHelper::format_cast_as_base_format(self, static_cast<aclFormat>(acl_format));
+        return self;
+    }
+    auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, c10::nullopt);
+    if (useAclnn == false) {
+        return npu_format_cast_impl(self, acl_format);
+    }
+    return format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape);
+}
+
+at::Tensor NPUNativeFunctions::_npu_format_cast(const at::Tensor& self, int64_t acl_format,
+    int64_t customize_dtype)
+{
+    auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_;
+    if (src_desc.npu_format_ == acl_format) {
+        ASCEND_LOGD("no need to do format cast");
+        return self;
+    }
+    if (FormatHelper::IsBaseFormatType(self) &&
+        FormatHelper::IsBaseFormatType(static_cast<aclFormat>(acl_format))) {
+        FormatCastHelper::format_cast_as_base_format(self, static_cast<aclFormat>(acl_format));
+        return self;
+    }
+    auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, customize_dtype);
+    if (useAclnn == false) {
+        return npu_format_cast_impl(self, acl_format);
+    }
+    return format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape);
 }
 
-at::Tensor NPUNativeFunctions::npu_format_cast(const at::Tensor& self, int64_t acl_format)
+at::Tensor NPUNativeFunctions::npu_format_cast(const at::Tensor& self, int64_t acl_format,
+    c10::optional<int64_t> customize_dtype)
 {
     torch_npu::utils::torch_check_npu(self);
     if (NPUNativeFunctions::get_npu_format(self) == acl_format) {
         ASCEND_LOGD("no need to do format cast");
         return self;
     }
+    if (customize_dtype.has_value()) {
+        return custom_ops::_npu_format_cast(self, acl_format, customize_dtype.value());
+    }
     return custom_ops::_npu_format_cast(self, acl_format);
 }
diff --git a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp
index 775d95cbfa597a61fcf71eca04008d8c21fd4e83..685f907653a96e2f36e6ee5c9ea4dc6344618cef 100644
--- a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp
+++ b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp
@@ -10,13 +10,34 @@
 namespace at_npu {
 namespace native {
 
+#define AT_DISPATCH_CASE_ALL_TYPES_AND5(                                  \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \
+    AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__)                               \
+    AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                            \
+    AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                            \
+    AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)                            \
+    AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)                            \
+    AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)
+
+
+#define AT_DISPATCH_ALL_TYPES_AND5(                                                   \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, TYPE, NAME, ...) \
+    AT_DISPATCH_SWITCH(                                                               \
+        TYPE,                                                                         \
+        NAME,                                                                         \
+        AT_DISPATCH_CASE_ALL_TYPES_AND5(                                              \
+            SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, __VA_ARGS__))
+
+
 c10::Scalar NPUNativeFunctions::_local_scalar_dense(const at::Tensor& self)
 {
     c10::Scalar r;
-    AT_DISPATCH_ALL_TYPES_AND3(
+    AT_DISPATCH_ALL_TYPES_AND5(
         at::ScalarType::Half,
         at::ScalarType::Bool,
         at::ScalarType::BFloat16,
+        at::ScalarType::Float8_e5m2,
+        at::ScalarType::Float8_e4m3fn,
         self.scalar_type(),
         "_local_scalar_dense_npu",
         [&] {
diff --git a/torch_npu/csrc/aten/common/ToKernelNpu.cpp b/torch_npu/csrc/aten/common/ToKernelNpu.cpp
index 96e67ff5bb07dd483d0f208daf299990d02aa1ce..3d2be6452d079cf9f4aa4f797b606c98c3cb1730 100644
--- a/torch_npu/csrc/aten/common/ToKernelNpu.cpp
+++ b/torch_npu/csrc/aten/common/ToKernelNpu.cpp
@@ -161,7 +161,7 @@ at::Tensor NPUNativeFunctions::to(
             "dtype cast repalce with float.");
     }
     dtype = (dtype == at::ScalarType::Double) ? at::ScalarType::Float : dtype;
-    return custom_ops::npu_dtype_cast(self, dtype);
+    return custom_ops::_npu_dtype_cast(self, dtype);
 }
 
 at::Tensor NPUNativeFunctions::to(
diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml
index 95bb740db159bef654fb063934f68344c1bf257e..b186df765181599eca85294f3343033c711f8a32 100644
--- a/torch_npu/csrc/aten/npu_native_functions.yaml
+++ b/torch_npu/csrc/aten/npu_native_functions.yaml
@@ -62,12 +62,12 @@ custom:
   - func: npu_change_data_ptr(Tensor dst, Tensor src, int index) -> int
     device_check: NoCheck
   - func: get_npu_format(Tensor self) -> int
-  - func: npu_format_cast.Tensor(Tensor self, Tensor dst) -> Tensor
+  - func: npu_format_cast.Tensor(Tensor self, Tensor dst, int? customize_dtype=None) -> Tensor
     device_check: NoCheck
     exposed: True
-  - func: npu_format_cast_.acl_format(Tensor(a!) self, int acl_format) -> Tensor(a!)
+  - func: npu_format_cast_.acl_format(Tensor(a!) self, int acl_format, int? customize_dtype=None) -> Tensor(a!)
    exposed: True
-  - func: npu_format_cast_(Tensor(a!) self, Tensor src) -> Tensor(a!)
+  - func: npu_format_cast_(Tensor(a!) self, Tensor src, int? customize_dtype=None) -> Tensor(a!)
     device_check: NoCheck
     exposed: True
   - func: empty_with_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, int? base_addr_aligned_kb=None) -> Tensor
@@ -82,9 +82,10 @@ custom:
   - func: copy_memory_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
     device_check: NoCheck
   - func: get_storage_size(Tensor self) -> int
-  - func: npu_format_cast(Tensor self, int acl_format) -> Tensor
+  - func: npu_format_cast(Tensor self, int acl_format, int? customize_dtype=None) -> Tensor
     exposed: True
   - func: _npu_format_cast(Tensor self, int acl_format) -> Tensor
+  - func: _npu_format_cast.aclnn(Tensor self, int acl_format, int customize_dtype) -> Tensor
   - func: empty_with_swapped_memory(int[] size, *, ScalarType? dtype=None, Device? device=None) -> Tensor
    dispatch:
      CompositeExplicitAutograd: empty_with_swapped_memory
diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
index 74afc22031e5f4ff3cb9464d8ed3b49d0de6bb37..e1e86a89071458e0de9d7959c7f023766574b5c5 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
@@ -726,6 +726,7 @@ BlockState::BlockState(Block *block)
 
 SegmentState::SegmentState(Block *head)
 {
+    TORCH_INTERNAL_ASSERT(head != nullptr, PTA_ERROR(ErrCode::PTR));
     TORCH_INTERNAL_ASSERT(head->prev == nullptr && head->pool != nullptr);
     is_small = head->pool->is_small;
@@ -882,7 +883,7 @@ size_t CachingAllocatorConfig::parseExpandableSegments(const std::vector<std::string>& config)
     auto remaining = size - candidate->size;
     auto new_candidate = candidate->next;
+    if (C10_UNLIKELY(new_candidate == nullptr)) {
+        return nullptr;
+    }
     if (!map_block(new_candidate, std::min(remaining, candidate->next->size), ctx)) {
         return nullptr;
     }
@@ -2442,7 +2446,11 @@ private:
 {
     bool freed_memory = false;
     for (const auto &name : FreeNPUMemoryCallbacksRegistry()->Keys()) {
-        freed_memory |= FreeNPUMemoryCallbacksRegistry()->Create(name)->Execute();
+        auto callback = FreeNPUMemoryCallbacksRegistry()->Create(name);
+        if (callback != nullptr) {
+            freed_memory |= callback->Execute();
+        } else {
+            TORCH_CHECK(false, "free memory callback returned nullptr", PTA_ERROR(ErrCode::PTR));
+        }
     }
     return freed_memory;
 }
diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp
index 034726549b38840474c96166ce2deb6019f754c6..a91b1d3cacb23d2a1b9e750774c833f97c940bdd 100644
--- a/torch_npu/csrc/core/npu/NPUException.cpp
+++ b/torch_npu/csrc/core/npu/NPUException.cpp
@@ -91,7 +91,7 @@ MemUceInfo memUceInfo;
 
 std::mutex memUceInfoMutex;
 
-void set_mem_uce_info(MemUceInfo info)
+void set_mem_uce_info(MemUceInfo& info)
 {
     std::lock_guard<std::mutex> lock(memUceInfoMutex);
     memUceInfo = info;
diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h
index 94e38a5edbd42b70addb22a5c94443301cd378e3..88a77ab810ef327be254ea143d5d46606b772cd5 100644
--- a/torch_npu/csrc/core/npu/NPUException.h
+++ b/torch_npu/csrc/core/npu/NPUException.h
@@ -259,7 +259,7 @@ bool checkUceErrAndRepair(bool check_error, std::string& err_msg);
 
 void record_mem_hbm_ecc_error();
 
-void set_mem_uce_info(MemUceInfo info);
+void set_mem_uce_info(MemUceInfo& info);
 
 MemUceInfo get_mem_uce_info();
diff --git a/torch_npu/csrc/core/npu/NPUMacros.h b/torch_npu/csrc/core/npu/NPUMacros.h
index 3223c4f325b3de69b8e5cdc783954d84033b37b4..960dcb97b6e52bffc37582250ffd99b1f7ac08a6 100644
--- a/torch_npu/csrc/core/npu/NPUMacros.h
+++ b/torch_npu/csrc/core/npu/NPUMacros.h
@@ -29,6 +29,6 @@
 
 #define TORCH_NPU_API C10_NPU_API
 
-#define C10_COMPILE_TIME_MAX_NPUS 16
+#define C10_COMPILE_TIME_MAX_NPUS 32
 // A maximum of 8 P2P links can be created on a NPU device
 #define C10_P2P_ACCESS_MAX_NPUS 8
diff --git a/torch_npu/csrc/core/npu/NpuVariables.cpp b/torch_npu/csrc/core/npu/NpuVariables.cpp
index 3fedb9d387ef61702a7414912b5572a8e187e7cd..9aaca59b54520df2294b6d69adbbb54dbe08d582 100644
--- a/torch_npu/csrc/core/npu/NpuVariables.cpp
+++ b/torch_npu/csrc/core/npu/NpuVariables.cpp
@@ -41,27 +41,35 @@ static std::map<std::string, SocVersion> socVersionMap = {
 void SetSocVersion(const char* const socVersion)
 {
-    if (socVersion == nullptr ||
-        g_curSocVersion != SocVersion::UnsupportedSocVersion) {
-        return;
-    }
+    if (socVersion == nullptr ||
+        g_curSocVersion != SocVersion::UnsupportedSocVersion) {
+        return;
+    }
-    SocVersion curSocVersion = SocVersion::UnsupportedSocVersion;
+    SocVersion curSocVersion = SocVersion::UnsupportedSocVersion;
+    std::string inputVersion = socVersion;
+    std::string ascend910xxVersion = "Ascend910_xx";
 
-    auto const& iter = socVersionMap.find(socVersion);
-    if (iter != socVersionMap.end()) {
-        curSocVersion = iter->second;
-    } else {
-        std::string unsupported_soc(socVersion);
-        std::replace(std::begin(unsupported_soc), std::end(unsupported_soc), '_', ' ');
-        AT_ERROR("Unsupported soc version: ", unsupported_soc);
-    }
+    auto const& iter = socVersionMap.find(socVersion);
+    if (iter != socVersionMap.end()) {
+        curSocVersion = iter->second;
+    } else if ((inputVersion.compare(0, ascend910xxVersion.size(), ascend910xxVersion) == 0)) {
+        curSocVersion = SocVersion::Ascend910_xx;
+    } else {
+        std::string unsupported_soc(socVersion);
+        std::replace(std::begin(unsupported_soc), std::end(unsupported_soc), '_', ' ');
+        AT_ERROR("Unsupported soc version: ", unsupported_soc);
+    }
 
-    g_curSocVersion = curSocVersion;
+    g_curSocVersion = curSocVersion;
 }
 
 const SocVersion& GetSocVersion()
 {
+    if (g_curSocVersion == SocVersion::UnsupportedSocVersion) {
+        auto soc_name = c10_npu::acl::AclGetSocName();
+        SetSocVersion(soc_name);
+    }
     return g_curSocVersion;
 }
@@ -95,5 +103,10 @@ bool IsBF16Supported()
 {
     return GetSocVersion() >= SocVersion::Ascend910B1;
 }
+
+bool IsAscend910_xxVersion()
+{
+    return GetSocVersion() == SocVersion::Ascend910_xx;
+}
 } // namespace c10_npu
diff --git a/torch_npu/csrc/core/npu/NpuVariables.h b/torch_npu/csrc/core/npu/NpuVariables.h
index 3119a645153322225f9d0d9ea19dfa3b1ef9ab9f..6481369f581b35d04273109de4e729d37f8eccf3 100644
--- a/torch_npu/csrc/core/npu/NpuVariables.h
+++ b/torch_npu/csrc/core/npu/NpuVariables.h
@@ -30,7 +30,8 @@ enum class SocVersion {
     Ascend910_9381,
     Ascend910_9382,
     Ascend910_9372,
-    Ascend910_9362
+    Ascend910_9362,
+    Ascend910_xx = 260
 };
 
 void SetSocVersion(const char* const socVersion);
@@ -40,6 +41,8 @@ const SocVersion& GetSocVersion();
 bool IsSupportInfNan();
 
 bool IsBF16Supported();
+
+bool IsAscend910_xxVersion();
 } // namespace c10_npu
 
 #endif
diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
index b59e9c85c96e2998273953d3d068a3465bd0efde..54190a681d3364ed1c74d8e94afacc8cfc7244ce 100644
--- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp
+++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
@@ -174,6 +174,7 @@ aclError AclrtSetStreamFailureMode(aclrtStream stream, uint64_t mode) {
     if (stream == nullptr) { // default stream
         return ACL_ERROR_INVALID_PARAM;
     }
+
     typedef aclError(*aclrtSetStreamFailureModeFunc)(aclrtStream, uint64_t);
     static aclrtSetStreamFailureModeFunc func = (aclrtSetStreamFailureModeFunc)GET_FUNC(aclrtSetStreamFailureMode);
     if (func == nullptr) {
@@ -844,7 +845,8 @@ bool IsCaptureSupported()
     static bool have_load_func = false;
     static bool default_support_capture = ((GetSocVersion() >= SocVersion::Ascend910B1) &&
         (GetSocVersion() < SocVersion::Ascend310B1)) ||
-        (GetSocVersion() >= SocVersion::Ascend910_9391);
+        ((GetSocVersion() >= SocVersion::Ascend910_9391) &&
+        (GetSocVersion() < SocVersion::Ascend910_xx));
     if (default_support_capture && !have_load_func) {
         have_load_func = true;
         typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *);
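Note: unlike the purely table-driven SoC detection used for older parts, the Ascend910_xx gate above is a name-prefix match plus lazy initialization in GetSocVersion(). A sketch of the equivalent resolution logic in Python (illustrative only; the table below is an abbreviated stand-in for socVersionMap, and this helper is not part of the torch_npu API):

    SOC_VERSION_TABLE = {"Ascend910B1": 240}     # abbreviated stand-in table

    def resolve_soc(soc_name: str) -> int:
        # Exact table lookup first, then prefix match for the new series.
        if soc_name in SOC_VERSION_TABLE:
            return SOC_VERSION_TABLE[soc_name]
        if soc_name.startswith("Ascend910_xx"):
            return 260                           # SocVersion::Ascend910_xx
        raise ValueError(f"Unsupported soc version: {soc_name}")

The same prefix test is what the new unittest skip decorator relies on via torch_npu.npu.get_device_name().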
diff --git a/torch_npu/csrc/core/npu/register/OptionRegister.cpp b/torch_npu/csrc/core/npu/register/OptionRegister.cpp
index 8f7f17a0114a517ef7f5ef4b201b1bf749274210..69acfc234f254695ffe6733f81c4e9dfdead5bcf 100644
--- a/torch_npu/csrc/core/npu/register/OptionRegister.cpp
+++ b/torch_npu/csrc/core/npu/register/OptionRegister.cpp
@@ -4,6 +4,7 @@
 #include "torch_npu/csrc/core/npu/register/OptionRegister.h"
 #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h"
 #include "torch_npu/csrc/core/npu/npu_log.h"
+#include "torch_npu/csrc/core/npu/NpuVariables.h"
 
 namespace c10_npu {
 namespace option {
@@ -84,6 +85,18 @@ OptionInterfaceBuilder::OptionInterfaceBuilder(const std::string &name, ::std::unique_ptr<OptionInterface> &ptr)
 
 void SetOption(const std::string &key, const std::string &val)
 {
+    if (c10_npu::IsAscend910_xxVersion()) {
+        if (key == "jitCompile" && val == "enable") {
+            TORCH_NPU_WARN_ONCE("Ascend910_xx series only supports jit_compile=False, ",
+                "the requested value True is invalid and has been reverted to False.");
+            return register_options::OptionRegister::GetInstance()->Set(key, "disable");
+        }
+        if (key == "ALLOW_INTERNAL_FORMAT" && val == "enable") {
+            TORCH_NPU_WARN_ONCE("Ascend910_xx series only supports allow_internal_format=False, ",
+                "the requested value True is invalid and has been reverted to False.");
+            return register_options::OptionRegister::GetInstance()->Set(key, "disable");
+        }
+    }
     register_options::OptionRegister::GetInstance()->Set(key, val);
 }
diff --git a/torch_npu/csrc/custom_dtype/CMakeLists.txt b/torch_npu/csrc/custom_dtype/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7d3d7c0e5379a0c23354a45a6dbd12c0bffea0ac
--- /dev/null
+++ b/torch_npu/csrc/custom_dtype/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB _CUS_DTYPE_SRCS *.cpp)
+
+LIST(APPEND CUS_DTYPE_SRCS ${_CUS_DTYPE_SRCS})
+
+# Pass to parent
+set(CUS_DTYPE_SRCS ${CUS_DTYPE_SRCS} PARENT_SCOPE)
diff --git a/torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp b/torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2293ba94dd063eba4023415405a230b2baa7ae6d
--- /dev/null
+++ b/torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp
@@ -0,0 +1,43 @@
+#include "torch_npu/csrc/custom_dtype/extension.h"
+#include "op_plugin/AclOpsInterface.h"
+#include "op_plugin/OpApiInterface.h"
+#include "op_plugin/utils/op_api_common.h"
+
+
+namespace c10_npu {
+
+at::Tensor cast_to_fp8(const at::Tensor &input, int otype)
+{
+    auto output = at::empty_like(input, c10_npu::GetATenDType(otype));
+
+    if (input.numel() == 0) {
+        return output;
+    }
+
+    aclDataType out_acltype = c10_npu::GetAclDataType(otype);
+    TensorWrapper out_wrapper = {output, out_acltype};
+    EXEC_NPU_CMD(aclnnCast, input, out_acltype, out_wrapper);
+
+    return output;
+}
+
+void cast_to_fp8_noalloc(const at::Tensor &input, at::Tensor output, int otype)
+{
+    aclDataType out_acltype = c10_npu::GetAclDataType(otype);
+    TensorWrapper out_wrapper = {output, out_acltype};
+    EXEC_NPU_CMD(aclnnCast, input, out_acltype, out_wrapper);
+    return;
+}
+
+at::Tensor cast_from_fp8(const at::Tensor &input, int itype, int otype)
+{
+    aclDataType input_acltype = c10_npu::GetAclDataType(itype);
+    aclDataType out_acltype = c10_npu::GetAclDataType(otype);
+    auto output = at::empty_like(input, c10_npu::GetATenDType(otype));
+    TensorWrapper input_wrapper = {input, input_acltype};
+    TensorWrapper out_wrapper = {output, out_acltype};
+    EXEC_NPU_CMD(aclnnCast, input_wrapper, out_acltype, out_wrapper);
+
+    return output;
+}
+}
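Note: the three cast helpers above become Python-callable once Init.cpp (next) registers them on the _cd submodule. A usage sketch, assuming the extension has been initialized through the module's normal startup path and the device/CANN stack supports fp8:

    import torch
    import torch_npu

    cd = torch_npu._C._cd
    x = torch.randn(4, 4).npu()
    y = cd.cast_to_fp8(x, cd.DType.float8_e4m3fn)                      # fp32 -> fp8
    z = cd.cast_from_fp8(y, cd.DType.float8_e4m3fn, cd.DType.float32)  # fp8 -> fp32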
diff --git a/torch_npu/csrc/custom_dtype/Init.cpp b/torch_npu/csrc/custom_dtype/Init.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..90644aa1e3e14b2a9aef00e9272ceca995f6e9c5
--- /dev/null
+++ b/torch_npu/csrc/custom_dtype/Init.cpp
@@ -0,0 +1,163 @@
+#include "torch_npu/csrc/custom_dtype/Init.h"
+#ifndef BUILD_LIBTORCH
+#include <torch/csrc/utils/pybind.h>
+#include <torch/csrc/utils/object_ptr.h>
+#endif
+#include "torch_npu/csrc/custom_dtype/extension.h"
+
+
+namespace c10_npu {
+struct DTypeConstants {
+    static const int float32_value;
+    static const int float16_value;
+    static const int int8_value;
+    static const int int32_value;
+    static const int uint8_value;
+    static const int int16_value;
+    static const int uint16_value;
+    static const int uint32_value;
+    static const int int64_value;
+    static const int uint64_value;
+    static const int float64_value;
+    static const int bool_value;
+    static const int string_value;
+    static const int complex64_value;
+    static const int complex128_value;
+    static const int bfloat16_value;
+    static const int int4_value;
+    static const int uint1_value;
+    static const int complex32_value;
+    static const int hifloat8_value;
+    static const int float8_e5m2_value;
+    static const int float8_e4m3fn_value;
+    static const int float8_e8m0_value;
+    static const int float6_e3m2_value;
+    static const int float6_e2m3_value;
+    static const int float4_e2m1_value;
+    static const int float4_e1m2_value;
+};
+
+const int DTypeConstants::float32_value = static_cast<int>(DType::FLOAT);
+const int DTypeConstants::float16_value = static_cast<int>(DType::FLOAT16);
+const int DTypeConstants::int8_value = static_cast<int>(DType::INT8);
+const int DTypeConstants::int32_value = static_cast<int>(DType::INT32);
+const int DTypeConstants::uint8_value = static_cast<int>(DType::UINT8);
+const int DTypeConstants::int16_value = static_cast<int>(DType::INT16);
+const int DTypeConstants::uint16_value = static_cast<int>(DType::UINT16);
+const int DTypeConstants::uint32_value = static_cast<int>(DType::UINT32);
+const int DTypeConstants::int64_value = static_cast<int>(DType::INT64);
+const int DTypeConstants::uint64_value = static_cast<int>(DType::UINT64);
+const int DTypeConstants::float64_value = static_cast<int>(DType::DOUBLE);
+const int DTypeConstants::bool_value = static_cast<int>(DType::BOOL);
+const int DTypeConstants::string_value = static_cast<int>(DType::STRING);
+const int DTypeConstants::complex64_value = static_cast<int>(DType::COMPLEX64);
+const int DTypeConstants::complex128_value = static_cast<int>(DType::COMPLEX128);
+const int DTypeConstants::bfloat16_value = static_cast<int>(DType::BF16);
+const int DTypeConstants::int4_value = static_cast<int>(DType::INT4);
+const int DTypeConstants::uint1_value = static_cast<int>(DType::UINT1);
+const int DTypeConstants::complex32_value = static_cast<int>(DType::COMPLEX32);
+const int DTypeConstants::hifloat8_value = static_cast<int>(DType::HIFLOAT8);
+const int DTypeConstants::float8_e5m2_value = static_cast<int>(DType::FLOAT8_E5M2);
+const int DTypeConstants::float8_e4m3fn_value = static_cast<int>(DType::FLOAT8_E4M3FN);
+const int DTypeConstants::float8_e8m0_value = static_cast<int>(DType::FLOAT8_E8M0);
+const int DTypeConstants::float6_e3m2_value = static_cast<int>(DType::FLOAT6_E3M2);
+const int DTypeConstants::float6_e2m3_value = static_cast<int>(DType::FLOAT6_E2M3);
+const int DTypeConstants::float4_e2m1_value = static_cast<int>(DType::FLOAT4_E2M1);
+const int DTypeConstants::float4_e1m2_value = static_cast<int>(DType::FLOAT4_E1M2);
+
+#ifndef BUILD_LIBTORCH
+PyObject* cd_initExtension(PyObject*, PyObject *)
+{
+    auto torch_npu_C_module = THPObjectPtr(PyImport_ImportModule("torch_npu._C"));
+    if (!torch_npu_C_module) {
+        return nullptr;
+    }
+    auto torch_npu_C_m = py::handle(torch_npu_C_module).cast<py::module>();
+    auto m = torch_npu_C_m.def_submodule("_cd", "_cd bindings");
+    py::class_<DTypeConstants>(m, "DType")
+        .def_readonly_static("float32", &DTypeConstants::float32_value)
+        .def_readonly_static("float16", &DTypeConstants::float16_value)
+        .def_readonly_static("int8", &DTypeConstants::int8_value)
+        .def_readonly_static("int32", &DTypeConstants::int32_value)
+        .def_readonly_static("uint8", &DTypeConstants::uint8_value)
+        .def_readonly_static("int16", &DTypeConstants::int16_value)
+        .def_readonly_static("uint16", &DTypeConstants::uint16_value)
+        .def_readonly_static("uint32", &DTypeConstants::uint32_value)
+        .def_readonly_static("int64", &DTypeConstants::int64_value)
+        .def_readonly_static("uint64", &DTypeConstants::uint64_value)
+        .def_readonly_static("float64", &DTypeConstants::float64_value)
+        .def_readonly_static("bool", &DTypeConstants::bool_value)
+        .def_readonly_static("string", &DTypeConstants::string_value)
+        .def_readonly_static("complex64", &DTypeConstants::complex64_value)
+        .def_readonly_static("complex128", &DTypeConstants::complex128_value)
+        .def_readonly_static("bfloat16", &DTypeConstants::bfloat16_value)
+        .def_readonly_static("int4", &DTypeConstants::int4_value)
+        .def_readonly_static("uint1", &DTypeConstants::uint1_value)
+        .def_readonly_static("complex32", &DTypeConstants::complex32_value)
+        .def_readonly_static("hifloat8", &DTypeConstants::hifloat8_value)
+        .def_readonly_static("float8_e5m2", &DTypeConstants::float8_e5m2_value)
+        .def_readonly_static("float8_e4m3fn", &DTypeConstants::float8_e4m3fn_value)
+        .def_readonly_static("float8_e8m0", &DTypeConstants::float8_e8m0_value)
+        .def_readonly_static("float6_e3m2", &DTypeConstants::float6_e3m2_value)
+        .def_readonly_static("float6_e2m3", &DTypeConstants::float6_e2m3_value)
+        .def_readonly_static("float4_e2m1", &DTypeConstants::float4_e2m1_value)
+        .def_readonly_static("float4_e1m2", &DTypeConstants::float4_e1m2_value);
+
+    m.def("cast_to_fp8", &cast_to_fp8, "Cast to FP8", py::call_guard<py::gil_scoped_release>());
+    m.def("cast_to_fp8_noalloc", &cast_to_fp8_noalloc, "Cast to FP8",
+          py::call_guard<py::gil_scoped_release>());
+    m.def("cast_from_fp8", &cast_from_fp8, "Cast from FP8", py::call_guard<py::gil_scoped_release>());
+
+    Py_RETURN_NONE;
+}
+
+static PyMethodDef NPUCustomDtypeMethods[] = { // NOLINT
+    {"_cd_init", cd_initExtension, METH_NOARGS, nullptr},
+    {nullptr, nullptr, 0, nullptr}
+};
+#endif
+
+const std::string CustomDataTypeToString(int64_t dType)
+{
+    const std::map<DType, std::string>
+        TYPE_TO_STRING_MAP = {
+            {DType::FLOAT, "torch_npu.float32"},
+            {DType::FLOAT16, "torch_npu.float16"},
+            {DType::INT8, "torch_npu.int8"},
+            {DType::INT32, "torch_npu.int32"},
+            {DType::UINT8, "torch_npu.uint8"},
+            {DType::INT16, "torch_npu.int16"},
+            {DType::UINT16, "torch_npu.uint16"},
+            {DType::UINT32, "torch_npu.uint32"},
+            {DType::INT64, "torch_npu.int64"},
+            {DType::UINT64, "torch_npu.uint64"},
+            {DType::DOUBLE, "torch_npu.float64"},
+            {DType::BOOL, "torch_npu.bool"},
+            {DType::STRING, "torch_npu.string"},
+            {DType::COMPLEX64, "torch_npu.complex64"},
+            {DType::COMPLEX128, "torch_npu.complex128"},
+            {DType::BF16, "torch_npu.bfloat16"},
+            {DType::INT4, "torch_npu.int4"},
+            {DType::UINT1, "torch_npu.uint1"},
+            {DType::COMPLEX32, "torch_npu.complex32"},
+            {DType::HIFLOAT8, "torch_npu.hifloat8"},
+            {DType::FLOAT8_E5M2, "torch_npu.float8_e5m2"},
+            {DType::FLOAT8_E4M3FN, "torch_npu.float8_e4m3fn"},
+            {DType::FLOAT8_E8M0, "torch_npu.float8_e8m0"},
+            {DType::FLOAT6_E3M2, "torch_npu.float6_e3m2"},
+            {DType::FLOAT6_E2M3, "torch_npu.float6_e2m3"},
+            {DType::FLOAT4_E2M1, "torch_npu.float4_e2m1"},
+            {DType::FLOAT4_E1M2, "torch_npu.float4_e1m2"}};
+
+    const auto iter = TYPE_TO_STRING_MAP.find(static_cast<DType>(dType));
+    return iter != TYPE_TO_STRING_MAP.end() ? iter->second : "Unknown dtype";
+}
+
+#ifndef BUILD_LIBTORCH
+PyMethodDef* custom_dtype_functions()
+{
+    return NPUCustomDtypeMethods;
+}
+#endif
+}
diff --git a/torch_npu/csrc/custom_dtype/Init.h b/torch_npu/csrc/custom_dtype/Init.h
new file mode 100644
index 0000000000000000000000000000000000000000..23235a002749d5ea278c353d2ab97e62c047c2c8
--- /dev/null
+++ b/torch_npu/csrc/custom_dtype/Init.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include <string>
+#ifndef BUILD_LIBTORCH
+#include <Python.h>
+#endif
+#include "torch_npu/csrc/core/npu/NPUMacros.h"
+#include "torch_npu/csrc/core/npu/NPUException.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
+#include "third_party/acl/inc/acl/acl_base.h"
+
+namespace c10_npu {
+const int g_toAclOffset = 256;
+
+#define ENUM_OFFSET(new_name, old_name) new_name = static_cast<int>(old_name) + g_toAclOffset,
+
+#ifndef BUILD_LIBTORCH
+TORCH_NPU_API PyMethodDef* custom_dtype_functions();
+#endif
+
+enum class DType {
+    UNDEFINED = -1,
+    ENUM_OFFSET(FLOAT, ACL_FLOAT)
+    ENUM_OFFSET(FLOAT16, ACL_FLOAT16)
+    ENUM_OFFSET(INT8, ACL_INT8)
+    ENUM_OFFSET(INT32, ACL_INT32)
+    ENUM_OFFSET(UINT8, ACL_UINT8)
+    ENUM_OFFSET(INT16, ACL_INT16)
+    ENUM_OFFSET(UINT16, ACL_UINT16)
+    ENUM_OFFSET(UINT32, ACL_UINT32)
+    ENUM_OFFSET(INT64, ACL_INT64)
+    ENUM_OFFSET(UINT64, ACL_UINT64)
+    ENUM_OFFSET(DOUBLE, ACL_DOUBLE)
+    ENUM_OFFSET(BOOL, ACL_BOOL)
+    ENUM_OFFSET(STRING, ACL_STRING)
+    ENUM_OFFSET(COMPLEX64, ACL_COMPLEX64)
+    ENUM_OFFSET(COMPLEX128, ACL_COMPLEX128)
+    ENUM_OFFSET(BF16, ACL_BF16)
+    ENUM_OFFSET(INT4, ACL_INT4)
+    ENUM_OFFSET(UINT1, ACL_UINT1)
+    ENUM_OFFSET(COMPLEX32, ACL_COMPLEX32)
+    ENUM_OFFSET(HIFLOAT8, ACL_HIFLOAT8)
+    ENUM_OFFSET(FLOAT8_E5M2, ACL_FLOAT8_E5M2)
+    ENUM_OFFSET(FLOAT8_E4M3FN, ACL_FLOAT8_E4M3FN)
+    ENUM_OFFSET(FLOAT8_E8M0, ACL_FLOAT8_E8M0)
+    ENUM_OFFSET(FLOAT6_E3M2, ACL_FLOAT6_E3M2)
+    ENUM_OFFSET(FLOAT6_E2M3, ACL_FLOAT6_E2M3)
+    ENUM_OFFSET(FLOAT4_E2M1, ACL_FLOAT4_E2M1)
+    ENUM_OFFSET(FLOAT4_E1M2, ACL_FLOAT4_E1M2)
+};
+
+inline bool IsCustomDType(int64_t t)
+{
+    return t >= g_toAclOffset;
+}
+
+// Both c10_npu::DType and ScalarType are supported
+inline aclDataType GetAclDataType(int64_t t)
+{
+    if (t >= g_toAclOffset) {
+        return static_cast<aclDataType>(t - g_toAclOffset);
+    }
+    return at_npu::native::OpPreparation::convert_to_acl_data_type(
+        static_cast<at::ScalarType>(t));
+}
+
+inline aclDataType GetAclDataType(DType t)
+{
+    return static_cast<aclDataType>(static_cast<int>(t) - g_toAclOffset);
+}
+
+inline at::ScalarType GetATenDType(int64_t t)
+{
+    aclDataType aclType = GetAclDataType(t);
+    return at_npu::native::OpPreparation::convert_to_scalar_type(aclType);
+}
+
+const std::string CustomDataTypeToString(int64_t dType);
+
+} // namespace c10_npu
diff --git a/torch_npu/csrc/custom_dtype/extension.h b/torch_npu/csrc/custom_dtype/extension.h
new file mode 100644
index 0000000000000000000000000000000000000000..91ef1df8a51cdf6929c630a99370020eb7ed59a8
--- /dev/null
+++ b/torch_npu/csrc/custom_dtype/extension.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include "torch_npu/csrc/custom_dtype/Init.h"
+
+namespace c10_npu {
+at::Tensor cast_to_fp8(const at::Tensor &input, int otype);
+
+void cast_to_fp8_noalloc(const at::Tensor &input, at::Tensor output, int otype);
+
+at::Tensor cast_from_fp8(const at::Tensor &input, int itype, int otype);
+}
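Note: the ENUM_OFFSET scheme above keeps the Python-facing DType ids disjoint from both aclDataType and at::ScalarType by shifting every ACL value up by g_toAclOffset. The round trip is plain arithmetic, restated here as a sketch:

    G_TO_ACL_OFFSET = 256      # g_toAclOffset in Init.h
    ACL_FLOAT8_E5M2 = 35       # from acl_base.h earlier in this diff

    dtype_id = ACL_FLOAT8_E5M2 + G_TO_ACL_OFFSET        # 291 == DType.float8_e5m2
    assert dtype_id >= G_TO_ACL_OFFSET                  # IsCustomDType(dtype_id)
    assert dtype_id - G_TO_ACL_OFFSET == ACL_FLOAT8_E5M2  # GetAclDataType round trip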
diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
index 3d91f787bfdfb13871886738f7d3c1cbb100caa9..4b2d28025d6ea104f373a82129c35a377de86e71 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
@@ -3634,7 +3634,7 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::allreduce(
     [&](std::vector<c10_npu::NPUStream>& hcclStreams, c10::intrusive_ptr<ProcessGroupHCCL::WorkHCCL>&) {
         if (tensors[0].scalar_type() == at::kBool || tensors[0].scalar_type() == at::kByte) {
             c10_npu::NPUStreamGuard guard(hcclStreams[0]);
-            tensors_cp[0] = at_npu::native::custom_ops::npu_dtype_cast(tensors[0], at::kInt);
+            tensors_cp[0] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[0], at::kInt);
         }
     },
     [&](std::vector<c10_npu::NPUStream>& hcclStreams, c10::intrusive_ptr<ProcessGroupHCCL::WorkHCCL>&) {
@@ -3812,7 +3812,7 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::allreduce_coalesced(
         for (const auto i : c10::irange(tensors.size())) {
             if (tensors[i].scalar_type() == at::kBool || tensors[i].scalar_type() == at::kByte) {
                 c10_npu::NPUStreamGuard guard(hcclStreams[0]);
-                tensors_cp[i] = at_npu::native::custom_ops::npu_dtype_cast(tensors[i], at::kInt);
+                tensors_cp[i] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[i], at::kInt);
             }
         }
     },
@@ -3876,7 +3876,7 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::reduce(
     [&](std::vector<c10_npu::NPUStream>& hcclStreams, c10::intrusive_ptr<ProcessGroupHCCL::WorkHCCL>&) {
         if (tensors[0].scalar_type() == at::kBool || tensors[0].scalar_type() == at::kByte) {
             c10_npu::NPUStreamGuard guard(hcclStreams[0]);
-            tensors_cp[0] = at_npu::native::custom_ops::npu_dtype_cast(tensors[0], at::kInt);
+            tensors_cp[0] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[0], at::kInt);
         }
     },
     [&](std::vector<c10_npu::NPUStream>& hcclStreams, c10::intrusive_ptr<ProcessGroupHCCL::WorkHCCL>&) {
@@ -3936,11 +3936,11 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::_reduce_oop(
     [&](std::vector<c10_npu::NPUStream>& hcclStreams, c10::intrusive_ptr<ProcessGroupHCCL::WorkHCCL>&) {
         if (inputTensors[0].scalar_type() == at::kBool || inputTensors[0].scalar_type() == at::kByte) {
             c10_npu::NPUStreamGuard guard(hcclStreams[0]);
-            inputTensors[0] = at_npu::native::custom_ops::npu_dtype_cast(inputTensors[0], at::kInt);
+            inputTensors[0] = at_npu::native::custom_ops::_npu_dtype_cast(inputTensors[0], at::kInt);
         }
         if (outputTensors[0].scalar_type() == at::kBool || outputTensors[0].scalar_type() == at::kByte) {
             c10_npu::NPUStreamGuard guard(hcclStreams[0]);
-            outputTensors[0] = at_npu::native::custom_ops::npu_dtype_cast(outputTensors[0], at::kInt);
+            outputTensors[0] = at_npu::native::custom_ops::_npu_dtype_cast(outputTensors[0], at::kInt);
         }
     },
     [&](std::vector<c10_npu::NPUStream>& hcclStreams, c10::intrusive_ptr<ProcessGroupHCCL::WorkHCCL>&) {
@@ -3975,14 +3975,14 @@ at::Tensor ProcessGroupHCCL::byte_alignment(at::Tensor& tensors) const
     if (num_add != 0) {
         bool transflag = false;
         if (inter_tensors.scalar_type() == at::ScalarType::Bool) {
-            inter_tensors = at_npu::native::custom_ops::npu_dtype_cast(inter_tensors, at::ScalarType::Int);
+            inter_tensors = at_npu::native::custom_ops::_npu_dtype_cast(inter_tensors, at::ScalarType::Int);
             transflag = true;
         }
 
         inter_tensors = op_plugin::constant_pad_nd(inter_tensors, {0, num_add}, 0);
 
         if (transflag) {
-            inter_tensors = at_npu::native::custom_ops::npu_dtype_cast(inter_tensors, at::ScalarType::Bool);
+            inter_tensors = at_npu::native::custom_ops::_npu_dtype_cast(inter_tensors, at::ScalarType::Bool);
         }
     }
     return inter_tensors;
diff --git a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp
index 655082b56feb313d99c441c690bbef46a12c6aa1..edebbba53f3dacfa01a87c601db16ccc9c93cf6b 100644
--- a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp
+++ b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp
@@ -423,6 +423,9 @@ void TensorPipeAgent::startImpl()
         priority = opts_.transports->size() - 1 - (iter - opts_.transports->begin());
     }
     std::unique_ptr<TransportRegistration> reg = TensorPipeTransportRegistry()->Create(key);
+    if (reg == nullptr || reg->transport == nullptr) {
+        TORCH_CHECK(false, "TensorPipeTransport registry returned nullptr", DIST_ERROR(ErrCode::PTR));
+    }
     if (!reg->transport->isViable()) {
         continue;
     }
diff --git a/torch_npu/csrc/framework/FormatHelper.cpp b/torch_npu/csrc/framework/FormatHelper.cpp
index 6a92fe5af4d8039b3d0ff9c50e49d1fd5fa30a00..9bd270b8fd231cb39a8bc9b98c8680b88a66e6a2 100644
--- a/torch_npu/csrc/framework/FormatHelper.cpp
+++ b/torch_npu/csrc/framework/FormatHelper.cpp
@@ -52,6 +52,10 @@ std::unordered_map<aclFormat, FormatHelper::FormatInfo> FormatHelper::Initialize
     {ACL_FORMAT_NDC1HWC0, (FormatInfo){ACL_FORMAT_NDC1HWC0, ACL_FORMAT_NCDHW, InferShapeOfNDC1HWC0, "NDC1HWC0", true}},
     {ACL_FRACTAL_Z_3D, (FormatInfo){ACL_FRACTAL_Z_3D, ACL_FORMAT_NCDHW, InferShapeOfFZ3D, "FRACTAL_Z_3D", true}},
+    {ACL_FORMAT_FRACTAL_NZ_C0_16,
+     (FormatInfo){ACL_FORMAT_FRACTAL_NZ_C0_16, ACL_FORMAT_ND, nullptr, "FRACTAL_NZ_C0_16", true}},
+    {ACL_FORMAT_FRACTAL_NZ_C0_32,
+     (FormatInfo){ACL_FORMAT_FRACTAL_NZ_C0_32, ACL_FORMAT_ND, nullptr, "FRACTAL_NZ_C0_32", true}},
 };
};
diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp
index 6b98651c51dba728c9062a47d777650ae7ac93a6..80af05f94b321cac1e928484e18027c1e5cc836b 100644
--- a/torch_npu/csrc/framework/OpCommand.cpp
+++ b/torch_npu/csrc/framework/OpCommand.cpp
@@ -24,7 +24,9 @@ static std::unordered_map<at::ScalarType, std::pair<double, double>> floating_limits_map{
     {at::ScalarType::Double, {std::numeric_limits<double>::max(), std::numeric_limits<double>::min()}},
     {at::ScalarType::Float, {std::numeric_limits<float>::max(), std::numeric_limits<float>::min()}},
     {at::ScalarType::BFloat16, {std::numeric_limits<float>::max(), std::numeric_limits<float>::min()}},
-    {at::ScalarType::Half, {65504, -65504}}};
+    {at::ScalarType::Half, {65504, -65504}},
+    {at::ScalarType::Float8_e5m2, {57344, -57344}},
+    {at::ScalarType::Float8_e4m3fn, {448, -448}}};
 static std::unordered_map<at::ScalarType, std::pair<int64_t, int64_t>> integral_limits_map{
     {at::ScalarType::Long, {std::numeric_limits<int64_t>::max(), std::numeric_limits<int64_t>::min()}},
     {at::ScalarType::Int, {std::numeric_limits<int32_t>::max(), std::numeric_limits<int32_t>::min()}},
@@ -274,7 +276,7 @@ OpCommand& OpCommand::AddTensorInput(at::Tensor &tensor, at::ScalarType forceScaleType)
 {
     std::tuple<aclTensorDesc *, aclDataBuffer *> res;
     if (commonType.has_value() && commonType.value() != tensor.scalar_type()) {
-        tensor = custom_ops::npu_dtype_cast(tensor, commonType.value());
+        tensor = custom_ops::_npu_dtype_cast(tensor, commonType.value());
     }
     // as for dim=0, the dtype of tensor can not be `uint16` because of `TBE`
     if (torch_npu::NPUBridge::GetNpuStorageImplDesc(tensor).storage_sizes_.empty()) {
@@ -331,7 +333,7 @@ OpCommand& OpCommand::AddScalarInput(const c10::Scalar& input, at::ScalarType type)
 OpCommand& OpCommand::AddOutput(at::Tensor &output, const string &realType)
 {
     if (resultTypeDefined == false && commonType.has_value() && commonType.value() != output.scalar_type()) {
-        output = custom_ops::npu_dtype_cast(output, commonType.value());
+        output = custom_ops::_npu_dtype_cast(output, commonType.value());
     }
     auto res = OpCmdHelper::CovertToAclOutput(output, realType);
     aclCmd->AddOutput(std::get<0>(res), std::get<1>(res));
diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp
index f1b9064b6de7cfa89da5e861ed6e4237eda1d39b..aac0ba181419f19e0e2e7dd742e67472f1fdeee3 100644
--- a/torch_npu/csrc/framework/OpParamMaker.cpp
+++ b/torch_npu/csrc/framework/OpParamMaker.cpp
@@ -574,6 +574,7 @@ void *NewFunc(int caption, int &size)
 void DeleteFunc(void *ptr)
 {
     free(ptr);
+    ptr = nullptr;
 }
 
 using Func = int (*)(c10_npu::queue::QueueParas *, aclrtStream);
diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp
index fecbb86f1f0faad206c6b1ec6e77c2e26e15b042..6f52465d1abeed9f4dd7efe6d6d8c56c1fa5e0d6 100644
--- a/torch_npu/csrc/framework/StorageDescHelper.cpp
+++ b/torch_npu/csrc/framework/StorageDescHelper.cpp
@@ -97,6 +97,13 @@ void StorageDescHelper::SetDesc(at::Tensor &dst, const c10::IntArrayRef &size, c
     torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_ = SetDesc(dst.dtype(), size, strides, format);
 }
 
+void StorageDescHelper::SetDesc(at::Tensor &dst, const c10::IntArrayRef &base_size,
+    const c10::IntArrayRef &storage_size, const c10::IntArrayRef &strides, aclFormat format)
+{
+    torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_ =
+        SetDesc(dst.dtype(), base_size, storage_size, strides, format);
+}
+
 bool StorageDescHelper::CheckDescInit(const c10::Storage &storage)
 {
     return torch_npu::NPUBridge::GetNpuStorageImpl(storage.unsafeGetStorageImpl())->npu_desc_.origin_format_ !=
@@ -254,6 +261,22 @@ torch_npu::NPUStorageDesc StorageDescHelper::SetDesc(const caffe2::TypeMeta &dty
     return npu_desc;
 }
 
+torch_npu::NPUStorageDesc StorageDescHelper::SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& base_size,
+    const c10::IntArrayRef& storage_size, const c10::IntArrayRef& strides, aclFormat format)
+{
+    struct torch_npu::NPUStorageDesc npu_desc;
+    npu_desc.data_type_ = dtype;
+    npu_desc.base_sizes_ = base_size;
+    npu_desc.base_strides_ = strides;
+    aclFormat baseFormat;
+    aclFormat npuFormat;
+    std::tie(baseFormat, npuFormat) = InferFormat::GuessFormatUnit(base_size, format);
+    npu_desc.storage_sizes_ = storage_size;
+    npu_desc.origin_format_ = baseFormat;
+    npu_desc.npu_format_ = npuFormat;
+    return npu_desc;
+}
+
 int64_t StorageDescHelper::GetMemorySize(const torch_npu::NPUStorageDesc &dst)
 {
     const auto &physical_size = FormatHelper::GetStorageSizes(dst);
diff --git a/torch_npu/csrc/framework/StorageDescHelper.h b/torch_npu/csrc/framework/StorageDescHelper.h
index 5c16ee74e2dc97165d97cd1d3d15857948b7457f..37b8933c1a67a4e687de68220c7e31097211a2e5 100644
--- a/torch_npu/csrc/framework/StorageDescHelper.h
+++ b/torch_npu/csrc/framework/StorageDescHelper.h
@@ -35,6 +35,8 @@ public:
     static void SetDesc(at::Tensor &dst, const c10::IntArrayRef& size, const c10::IntArrayRef& strides);
     static void SetDesc(at::Tensor &dst, const c10::IntArrayRef &size, const c10::IntArrayRef &strides,
                         aclFormat format);
+    static void SetDesc(at::Tensor &dst, const c10::IntArrayRef &base_size,
+                        const c10::IntArrayRef &storage_size, const c10::IntArrayRef &strides, aclFormat format);
     static bool CheckDescInit(const c10::Storage &storage);
 
     // For Serialization to Get and Set NpuStorageDesc
@@ -63,6 +65,8 @@ private:
                                                const c10::IntArrayRef& strides);
     static torch_npu::NPUStorageDesc SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& size,
                                              const c10::IntArrayRef& strides, aclFormat format);
+    static torch_npu::NPUStorageDesc SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& base_size,
+                                             const c10::IntArrayRef& storage_size, const c10::IntArrayRef& strides, aclFormat format);
 };
 
 } // namespace native
diff --git a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp
index c2abf7f4b2ae45e57a603f91ea3480e9519e4b1f..ee90387910967e7113f0153b0a8aea3099c0cb50 100644
--- a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp
@@ -70,6 +70,14 @@ private:
             ResetDataPtr(src, self,
                 static_cast<at::BFloat16 *>(src.storage().data_ptr().get()));
             return true;
+        case at::ScalarType::Float8_e5m2:
+            ResetDataPtr(src, self,
+                static_cast<at::Float8_e5m2 *>(src.storage().data_ptr().get()));
+            return true;
+        case at::ScalarType::Float8_e4m3fn:
+            ResetDataPtr(src, self,
+                static_cast<at::Float8_e4m3fn *>(src.storage().data_ptr().get()));
+            return true;
         default:
             // Turn to conducting d2dCopyAsync for other dtypes.
             return false;
diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp
index c2bb14ca66b8f807972e3f4dbf6f92b0a013788f..453a7082da2b271aa2e5adc35ae34b7d9b6b756e 100644
--- a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp
+++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp
@@ -52,8 +52,8 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(ENUM_PAIR_FUNC)
     _(at::ScalarType::Bits4x2, ACL_DT_UNDEFINED)      \
     _(at::ScalarType::Bits8, ACL_DT_UNDEFINED)        \
     _(at::ScalarType::Bits16, ACL_DT_UNDEFINED)       \
-    _(at::ScalarType::Float8_e5m2, ACL_DT_UNDEFINED)  \
-    _(at::ScalarType::Float8_e4m3fn, ACL_DT_UNDEFINED) \
+    _(at::ScalarType::Float8_e5m2, ACL_FLOAT8_E5M2)   \
+    _(at::ScalarType::Float8_e4m3fn, ACL_FLOAT8_E4M3FN) \
     _(at::ScalarType::Undefined, ACL_DT_UNDEFINED)    \
     _(at::ScalarType::NumOptions, ACL_DT_UNDEFINED)
 
@@ -74,6 +74,37 @@ AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(ENUM_PAIR_FUNC)
 static std::map<std::string, aclDataType> STRING_SCALAR_TYPE_TO_ACL_TYPE_MAP = {
     {"uint16", ACL_UINT16}, {"uint8", ACL_UINT8}, {"uint64", ACL_UINT64}, {"string", ACL_STRING}};
 
+// at::ScalarType::UInt16/UInt32/UInt64 will be supported after v2.1.0
+static std::unordered_map<aclDataType, at::ScalarType>
+    ACL_TYPE_TO_SCALAR_TYPE_MAP = {{ACL_DT_UNDEFINED, at::ScalarType::Undefined},
+                                   {ACL_FLOAT, at::ScalarType::Float},
+                                   {ACL_FLOAT16, at::ScalarType::Half},
+                                   {ACL_INT8, at::ScalarType::Char},
+                                   {ACL_INT32, at::ScalarType::Int},
+                                   {ACL_UINT8, at::ScalarType::Byte},
+                                   {ACL_INT16, at::ScalarType::Short},
+                                   {ACL_UINT16, at::ScalarType::Undefined},
+                                   {ACL_UINT32, at::ScalarType::Undefined},
+                                   {ACL_INT64, at::ScalarType::Long},
+                                   {ACL_UINT64, at::ScalarType::Undefined},
+                                   {ACL_DOUBLE, at::ScalarType::Double},
+                                   {ACL_BOOL, at::ScalarType::Bool},
+                                   {ACL_STRING, at::ScalarType::Undefined},
+                                   {ACL_COMPLEX64, at::ScalarType::ComplexFloat},
+                                   {ACL_COMPLEX128, at::ScalarType::ComplexDouble},
+                                   {ACL_BF16, at::ScalarType::BFloat16},
+                                   {ACL_INT4, at::ScalarType::Undefined},
+                                   {ACL_UINT1, at::ScalarType::Undefined},
+                                   {ACL_COMPLEX32, at::ScalarType::ComplexHalf},
+                                   {ACL_HIFLOAT8, at::ScalarType::Byte},
+                                   {ACL_FLOAT8_E5M2, at::ScalarType::Float8_e5m2},
+                                   {ACL_FLOAT8_E4M3FN, at::ScalarType::Float8_e4m3fn},
+                                   {ACL_FLOAT8_E8M0, at::ScalarType::Byte},
+                                   {ACL_FLOAT6_E3M2, at::ScalarType::Byte},
+                                   {ACL_FLOAT6_E2M3, at::ScalarType::Byte},
+                                   {ACL_FLOAT4_E2M1, at::ScalarType::Byte},
+                                   {ACL_FLOAT4_E1M2, at::ScalarType::Byte}};
+
 aclError AclrtMemcpyAsyncParamCheck(
     void *dst, size_t destMax, const void *src, size_t count, aclrtMemcpyKind kind, aclrtStream stream)
 {
@@ -291,5 +322,17 @@ int8_t CalcuOpUtil::GetCubeMathType(bool allowHf32)
     return iter->second;
 }
 
+at::ScalarType CalcuOpUtil::ConvertToScalarType(const aclDataType data_type)
+{
+    auto iter = ACL_TYPE_TO_SCALAR_TYPE_MAP.find(data_type);
+    if (iter == ACL_TYPE_TO_SCALAR_TYPE_MAP.end()) {
+        TORCH_CHECK(false,
+            std::string("aclDataType:") + std::to_string(data_type) + " is not supported",
+            OPS_ERROR(ErrCode::NOT_SUPPORT))
+    }
+
+    return iter->second;
+}
+
 } // namespace native
 } // namespace at_npu
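Note: the fp8 scalar limits wired into OpCommand's tables earlier in this diff (448 for float8_e4m3fn, 57344 for float8_e5m2) match PyTorch's own dtype metadata and can be checked directly:

    import torch

    print(torch.finfo(torch.float8_e4m3fn).max)  # 448.0
    print(torch.finfo(torch.float8_e5m2).max)    # 57344.0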
diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.h b/torch_npu/csrc/framework/utils/CalcuOpUtil.h
index b06ab06f9053fe7c46207789a95984eaa34610af..9a4a8024435cf77db1a1aba49e22cf73b580062f 100644
--- a/torch_npu/csrc/framework/utils/CalcuOpUtil.h
+++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.h
@@ -86,6 +86,7 @@ public:
     static int64_t GetTensorNpuFormat(const at::Tensor &tensor);
     static c10::SmallVector<int64_t, N> ConvertIntArrayRefToSmallVector(c10::IntArrayRef intArray);
     static int8_t GetCubeMathType(bool allowHf32);
+    static at::ScalarType ConvertToScalarType(const aclDataType data_type);
 };
 
 } // namespace native
diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp
index 20f357c654b94a8c618ab339f68a68eeed8b67b6..530d359df23006ef4e099c7da9d39578632357ef 100644
--- a/torch_npu/csrc/framework/utils/OpPreparation.cpp
+++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp
@@ -102,6 +102,11 @@ aclDataType OpPreparation::convert_to_acl_data_type(const at::ScalarType &data_type)
     return CalcuOpUtil::ConvertToAclDataType(data_type, realDataType);
 }
 
+at::ScalarType OpPreparation::convert_to_scalar_type(const aclDataType data_type)
+{
+    return CalcuOpUtil::ConvertToScalarType(data_type);
+}
+
 at::Tensor OpPreparation::copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type)
 {
     return CalcuOpUtil::CopyScalarToDevice(cpu_scalar, scalar_data_type);
diff --git a/torch_npu/csrc/framework/utils/OpPreparation.h b/torch_npu/csrc/framework/utils/OpPreparation.h
index 74ac30389872e4c0c8cb7da7a1ae3d7c2d4e075c..e87a91011218a4aa55b3f5187523af97ba1226f6 100644
--- a/torch_npu/csrc/framework/utils/OpPreparation.h
+++ b/torch_npu/csrc/framework/utils/OpPreparation.h
@@ -22,6 +22,7 @@ public:
     // From CalcuOpUtil part
     static aclDataType convert_to_acl_data_type(const at::ScalarType &data_type);
     static aclDataType convert_to_acl_data_type(const at::ScalarType &data_type, const std::string &realDataType);
+    static at::ScalarType convert_to_scalar_type(const aclDataType data_type);
     static at::Tensor copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type);
     static at::Tensor copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type,
                                             const c10::Device device);
diff --git a/torch_npu/csrc/npu/DataParallelComm.cpp b/torch_npu/csrc/npu/DataParallelComm.cpp
index db0d3efabefc96ca39c8bcaad354ed07b159bd38..c744e1e1baf961dbfa42de031c4c371c9be22672 100644
--- a/torch_npu/csrc/npu/DataParallelComm.cpp
+++ b/torch_npu/csrc/npu/DataParallelComm.cpp
@@ -137,7 +137,7 @@ void check_inputs(TensorList inputs, TensorList outputs, int input_multiplier, int output_multiplier)
 {
     // need to check len(inputs) == len(outputs)
     size_t len = inputs.size();
 
-    if (len <= 0) {
+    if (len == 0) {
         throw std::runtime_error("input sequence can't be empty" + PTA_ERROR(ErrCode::PARAM));
     }
diff --git a/torch_npu/onnx/wrapper_onnx_ops.py b/torch_npu/onnx/wrapper_onnx_ops.py
index a261d1785911b154eae6ee0a18e82b236de76921..68113cb5effad5f857c41dccfab7d8d280842b05 100644
--- a/torch_npu/onnx/wrapper_onnx_ops.py
+++ b/torch_npu/onnx/wrapper_onnx_ops.py
@@ -244,8 +244,8 @@ class _NPUFormatCastOP(torch.autograd.Function):
         return torch.ops.npu.npu_format_cast(*args, **kwargs)
 
     @staticmethod
-    def symbolic(g, self: Tensor, acl_format: int):
-        return g.op("npu::NPUFormatCast", self, acl_format_i=acl_format)
+    def symbolic(g, self: Tensor, acl_format: int, customize_dtype: int = None):
g.op("npu::NPUFormatCast", self, acl_format_i=acl_format, customize_dtype_i=customize_dtype) class _NPUSoftmaxCrossEntropyWithLogitsOP(torch.autograd.Function): @@ -1097,8 +1097,8 @@ def _wrapper_npu_deformable_conv2d(inputs, weight, offset, bias, kernel_size, st padding, dilation, groups, deformable_groups, modulated) -def _wrapper_npu_format_cast(self, acl_format): - return _NPUFormatCastOP.apply(self, acl_format) +def _wrapper_npu_format_cast(self, acl_format, customize_dtype=None): + return _NPUFormatCastOP.apply(self, acl_format, customize_dtype) def _wrapper_npu_softmax_cross_entropy_with_logits(self, labels):