diff --git a/CMakeLists.txt b/CMakeLists.txt index 113c17f7a69f97d7dc8d1af053b922f0feb83576..9d506ad2dc6d0482dd316b1d56d33277401bb6a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -234,6 +234,7 @@ if (NOT DEFINED BUILD_LIBTORCH) set(FLOP_SRCS) set(NPU_SRCS) set(PROF_SRCS) + set(IPC_SRCS) set(UTILS_SRCS) set(SAN_SRCS) endif() @@ -247,11 +248,13 @@ add_subdirectory(${TORCHNPU_ROOT}/core) add_subdirectory(${TORCHNPU_ROOT}/framework) add_subdirectory(${TORCHNPU_ROOT}/flopcount) add_subdirectory(${TORCHNPU_ROOT}/logging) +add_subdirectory(${TORCHNPU_ROOT}/custom_dtype) if (NOT DEFINED BUILD_LIBTORCH) add_subdirectory(${TORCHNPU_ROOT}/distributed) add_subdirectory(${TORCHNPU_ROOT}/npu) add_subdirectory(${TORCHNPU_ROOT}/profiler) + add_subdirectory(${TORCHNPU_ROOT}/ipc) add_subdirectory(${TORCHNPU_ROOT}/utils) add_subdirectory(${TORCHNPU_ROOT}/sanitizer) endif() @@ -273,10 +276,10 @@ if (DEFINED BUILD_TENSORPIPE) endif() if (DEFINED BUILD_LIBTORCH) - set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${FLOP_SRCS} ${FRAMEWORK_SRCS} ${LOGGING_SRCS} ${NPU_CPP_LIBS_SRCS}) + set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${FLOP_SRCS} ${CUS_DTYPE_SRCS} ${FRAMEWORK_SRCS} ${LOGGING_SRCS} ${NPU_CPP_LIBS_SRCS}) else() # Compile code with pybind11 - set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${LOGGING_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${UTILS_SRCS} ${SAN_SRCS}) + set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${CUS_DTYPE_SRCS} ${LOGGING_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${IPC_SRCS} ${UTILS_SRCS} ${SAN_SRCS}) endif() add_library(${PLUGIN_NAME} SHARED ${CPP_SRCS}) diff --git a/README.zh.md b/README.zh.md index 1fe8456923835b42bdd1ea9cf294c4e4628e5271..b1748aad80a742dd554505b6c191400d6fdac326 100644 --- a/README.zh.md +++ b/README.zh.md @@ -253,7 +253,7 @@ AscendPyTorch版本分支的维护阶段如下: ## 安全声明 -[Ascend Extension for PyTorch插件 安全声明](https://gitee.com/ascend/pytorch/blob/master/SECURITYNOTE.md) +[Ascend Extension for PyTorch插件 安全声明](./SECURITYNOTE.md) ## 参考文档 diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index 3a4ddb3c1ffcb92dd9dbb9dcdd3b0c78a3f602d6..2dfff1b2b34a839c3ef6fb6edfda9be2ec61ec24 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -1,9 +1,5 @@ # Ascend Extension for PyTorch插件 安全声明 -## 漏洞风险提示 - -PyTorch 2.6.0以下版本存在CVE-2025-32434漏洞,该漏洞因torch/serialization.py组件兼容性处理导致潜在的远程代码执行(RCE)风险。 torch_npu已参考[LINK](https://github.com/pytorch/pytorch/pull/145020)进行修复。 - ## 系统安全加固 建议用户在系统中配置开启ASLR(级别2 ),又称**全随机地址空间布局随机化**,可参考以下方式进行配置: @@ -229,3 +225,90 @@ PyTorch提供分布式训练能力,支持在单机和多机场景下进行训 | 版本 | 所有版本 | 所有版本 | | 特殊场景 | 无 | 无 | | 备注 | 该通信过程由开源软件PyTorch控制,配置为PyTorch原生设置,可参考[PyTorch文档](https://pytorch.org/docs/stable/distributed.html#launch-utility)。源端口由操作系统自动分配,分配范围由操作系统的配置决定,例如ubuntu:采用/proc/sys/net/ipv4/ipv4_local_port_range文件指定,可通过cat /proc/sys/net/ipv4/ipv4_local_port_range或sysctl net.ipv4.ip_local_port_range查看 | 该通信过程由CANN中HCCL组件控制,torch_npu不进行控制,端口范围可参考[《环境变量参考》](https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/apiref/envvar/envref_07_0001.html)的“执行相关 > 集合通信与分布式训练 > 集合通信相关配置>HCCL_IF_BASE_PORT” | + +## 漏洞机制说明 + +Ascend Extension for PyTorch 社区非常重视社区版本的安全性,专门设置了漏洞管理专员负责处理漏洞相关的事务,同时为了构建更安全的AI全流程工具链,我们也欢迎您一起参与。 + +### 漏洞处理流程 + +对于每一个安全漏洞,Ascend Extension for PyTorch 社区会安排人员进行跟踪和处理,漏洞处理的端到端流程如下图所示。 + +![漏洞处理流程](./figures/cve.png) + +下面将重点解释漏洞上报、漏洞评估、漏洞披露的流程。 + +### 漏洞上报 + +您可以通过提交issue的方式联系 Ascend Extension for PyTorch 社区团队,我们将会第一时间安排安全漏洞专项人员向您联系。 
+注意,为了确保安全性,请不要在issue中描述涉及安全隐私的具体信息。 + +#### 上报响应 + +1. Ascend Extension for PyTorch 社区会在3个工作日内确认、分析、上报安全漏洞问题,同时启动安全处理流程。 +2. Ascend Extension for PyTorch 安全团队在确认安全漏洞问题后,会对问题进行分发和跟进。 +3. 在安全漏洞问题从分类、确定、修复和发布的过程中,我们会及时更新报告。 + +### 漏洞评估 + +业界普遍使用 CVSS 标准评估漏洞的严重性,Ascend Extension for PyTorch 在使用 CVSS v3.1 进行漏洞评估时,需要设定漏洞攻击场景,基于在该攻击场景下的实际影响进行评估。漏洞严重等级评估是指针对漏洞利用难易程度,以及利用后对机密性、完整性、可用性的影响进行评估,并生成一个评分值。 + +#### 漏洞评估标准 + +Ascend Extension for PyTorch 通过以下向量评估一个漏洞的严重等级: + +- 攻击向量(AV):表示攻击的“远程性”以及如何利用此漏洞。 +- 攻击复杂性(AC):讲述攻击执行的难度以及成功进行攻击需要哪些因素。 +- 用户交互(UI):确定攻击是否需要用户参与。 +- 所需的权限(PR):记录成功进行攻击所需的用户身份验证级别。 +- 范围(S):确定攻击者是否可以影响具有不同权限级别的组件。 +- 机密性(C):衡量信息泄露给非授权方后导致的影响程度。 +- 完整性(I):衡量信息被篡改后导致的影响程度。 +- 可用性(A):衡量用户在需要访问数据或服务时受影响的程度。 + +#### 评估原则 + +- 评估漏洞的严重等级,不是评估风险。 +- 评估时必须基于攻击场景,且保证在该场景下,攻击者成功攻击后能对系统造成机密性、完整性、可用性影响。 +- 当安全漏洞有多个攻击场景时,应以造成最大的影响,即 CVSS 评分最高的攻击场景为依据。 +- 被嵌入调用的库存在漏洞,要根据该库在产品中的使用方式,确定漏洞的攻击场景后进行评估。 +- 安全缺陷不能被触发或不影响 CIA(机密性、完整性、可用性),CVSS 评分为 0 分。 + +#### 评估步骤 + +评估漏洞严重等级时,可根据下述步骤进行操作: + +1. 设定可能的攻击场景,基于攻击场景评分。 +2. 确定漏洞组件(Vulnerable Component)和受影响组件(Impact Component)。 + +3. 选择基础指标的值。 + + - 可利用指标(攻击向量、攻击复杂度、所需权限、用户交互、范围)根据漏洞组件选择指标值。 + + - 影响指标(机密性、完整性、可用性)要么反映对漏洞组件的影响,要么反映对受影响组件影响,以结果最严重的为准。 + +#### 严重等级划分 + +| **严重等级(Severity Rating)** | **CVSS评分(Score)** | **漏洞修复时长** | +| ------------------------------- | --------------------- | ---------------- | +| 致命(Critical) | 9.0~10.0 | 7天 | +| 高(High) | 7.0~8.9 | 14天 | +| 中(Medium) | 4.0~6.9 | 30天 | +| 低(Low) | 0.1~3.9 | 30天 | + +### 漏洞披露 + +安全漏洞修复后 Ascend Extension for PyTorch 社区会发布安全公告 (SA)以及安全说明(SN) ,安全公告内容包括该漏洞的技术细节、类型、上报人、CVE ID 以及受到该漏洞影响的版本和修复版本等信息。 +为了保护 Ascend Extension for PyTorch 用户的安全,在进行调查、修复和发布安全公告之前, Ascend Extension for PyTorch 社区不会公开披露、讨论或确认 Ascend Extension for PyTorch 产品的安全问题。 + +### 附录 + +#### 安全公告(SA) + +目前在维护版本,无安全漏洞 + +#### 安全说明(SN) + +涉及第三方的开源组件部分漏洞说明: + +PyTorch 2.6.0以下版本存在CVE-2025-32434漏洞,该漏洞因torch/serialization.py组件兼容性处理导致潜在的远程代码执行(RCE)风险。 torch_npu已参考[LINK](https://github.com/pytorch/pytorch/pull/145020)进行修复。 \ No newline at end of file diff --git a/codegen/gen_backend_stubs.py b/codegen/gen_backend_stubs.py index c60f38b8d4896c2d646c162cb5ad4c28df7bb1a6..439863e039ed2362b6f1f1cabb5bad57690c67cd 100644 --- a/codegen/gen_backend_stubs.py +++ b/codegen/gen_backend_stubs.py @@ -402,6 +402,8 @@ def gen_dispatcher_registrations( ns_helper = NamespaceHelper(namespace_str="at") native_func_header = """\ #include "torch_npu/csrc/core/npu/NPURecovery.h" +#include "torch_npu/csrc/core/npu/NpuVariables.h" +#include "torch_npu/csrc/core/npu/NPUException.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/profiler/utils.h" #endif diff --git a/codegen/utils.py b/codegen/utils.py index 8e0713982eae0f63145d56ed0db59d6c15885fa7..fa5903ee34ed3904b8c887576c2f7cd701d7da10 100644 --- a/codegen/utils.py +++ b/codegen/utils.py @@ -418,6 +418,7 @@ const DeviceGuard device_guard(device_or_default(device));""" device_guard = f"const OptionalDeviceGuard device_guard(device_of({device_of}));" op_key = str(f.func.name) + is_aclnn_only = "c10_npu::IsAclnnOnly()" if enable_opplugin(): if op_key in GLOBAL_STRUCTURED_OP_INFO_CACHE: impl_name = f"op_plugin::{GLOBAL_STRUCTURED_OP_INFO_CACHE[op_key]}" @@ -436,6 +437,11 @@ const DeviceGuard device_guard(device_or_default(device));""" if (({force_aclnn} || at_npu::native::env::CheckJitDisable()){tensor_check_str}) {{ return {op_api_impl_name}({args_exprs_str}); }} else {{ + if ({is_aclnn_only}) {{ + TORCH_CHECK(false, + "Current device only support aclnn operator, and current operator {impl_name} do not support internal 
format.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + }} return {impl_name}({args_exprs_str}); }} """ diff --git a/env.sh b/env.sh index ff54b797d211caad86b37132a8fdc101157c1388..96fa71d80f4f94d140314654b82bfe8fa0f469c2 100644 --- a/env.sh +++ b/env.sh @@ -1,3 +1,4 @@ +#!/bin/bash # 配置CANN相关环境变量 CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info' diff --git a/figures/cve.png b/figures/cve.png new file mode 100644 index 0000000000000000000000000000000000000000..095d0f7ba20c416165a57aa8870c0325bbcb8af0 Binary files /dev/null and b/figures/cve.png differ diff --git a/test/allocator/test_pluggable_allocator_extensions.py b/test/allocator/test_pluggable_allocator_extensions.py index 99cc499a93c457b0c6732dd3de015c76a280c695..a05fe8538a776b4e79ae9f0a19c86b7090d175c3 100644 --- a/test/allocator/test_pluggable_allocator_extensions.py +++ b/test/allocator/test_pluggable_allocator_extensions.py @@ -2,6 +2,7 @@ import os import sys import shutil import subprocess +import ctypes import torch import torch.utils.cpp_extension @@ -27,6 +28,7 @@ def build_stub(base_dir): class TestPluggableAllocator(TestCase): module = None + new_alloc = None build_directory = "allocator/build" @classmethod @@ -59,9 +61,9 @@ class TestPluggableAllocator(TestCase): def test_pluggable_allocator(self): os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so') # Load the allocator - new_alloc = torch_npu.npu.memory.NPUPluggableAllocator(os_path, 'my_malloc', 'my_free') + TestPluggableAllocator.new_alloc = torch_npu.npu.memory.NPUPluggableAllocator(os_path, 'my_malloc', 'my_free') # Swap the current allocator - torch_npu.npu.memory.change_current_allocator(new_alloc) + torch_npu.npu.memory.change_current_allocator(TestPluggableAllocator.new_alloc) # This will allocate memory in the device using the new allocator self.assertFalse(self.module.check_custom_allocator_used()) npu_tensor = torch.zeros(10, device='npu') @@ -69,6 +71,23 @@ class TestPluggableAllocator(TestCase): self.assertRtolEqual(npu_tensor.cpu().numpy(), cpu_tensor.numpy()) self.assertTrue(self.module.check_custom_allocator_used()) + def test_set_get_device_stats_fn(self): + os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so') + myallocator = ctypes.CDLL(os_path) + get_device_stats_fn = ctypes.cast(getattr(myallocator, "my_get_device_stats"), ctypes.c_void_p).value + + TestPluggableAllocator.new_alloc.allocator().set_get_device_stats_fn(get_device_stats_fn) + self.assertEqual(torch.npu.memory_stats_as_nested_dict()["num_alloc_retries"], 0) + + def test_set_reset_peak_status_fn(self): + os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so') + myallocator = ctypes.CDLL(os_path) + reset_peak_status_fn = ctypes.cast(getattr(myallocator, "my_reset_peak_status"), ctypes.c_void_p).value + + TestPluggableAllocator.new_alloc.allocator().set_reset_peak_status_fn(reset_peak_status_fn) + torch.npu.reset_peak_memory_stats() + self.assertEqual(torch.npu.max_memory_allocated(), 0) + def test_pluggable_allocator_after_init(self): os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so') # Do an initial memory allocator diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 04ae192e478e93ab28a92b38b6be116531f0e1dd..fe31a00e3a594adc0f11d0f21173c2ceaa92d2b0 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -583,7 +583,9 @@ 
"ForkingPickler", "Union", "check_serializing_named_tensor", - "register_after_fork" + "register_after_fork", + "reduce_tensor", + "reduce_storage" ], "torch.multiprocessing.spawn": [ "Optional" @@ -2820,16 +2822,19 @@ "npu_cross_entropy_loss", "npu_format_cast_", "npu_fusion_attention", + "npu_fusion_attention_v2", "npu_get_float_status", "npu_nms_rotated", "npu_random_choice_with_mask", "npu_rms_norm", "npu_add_rms_norm_cast", "npu_fused_infer_attention_score", + "npu_fused_infer_attention_v2", "npu_mla_prolog", "npu_mla_prolog_v2", "npu_convert_weight_to_int4pack", "npu_ffn", + "npu_fused_matmul", "npu_geglu", "npu_grouped_matmul", "npu_quant_matmul", @@ -2848,6 +2853,7 @@ "npu_trans_quant_param", "npu_stride_add", "npu_sort_v2", + "npu_dtype_cast", "npu_gelu", "npu_gelu_backward", "npu_all_gather_base_mm", diff --git a/test/contrib/test_transfer_to_npu.py b/test/contrib/test_transfer_to_npu.py index b2f84413e1c57e3859be1495137fc4dabf73b130..073119a878e5266af0a2be6210bdcc39c0a874e5 100644 --- a/test/contrib/test_transfer_to_npu.py +++ b/test/contrib/test_transfer_to_npu.py @@ -12,6 +12,27 @@ from torch_npu.contrib import transfer_to_npu class TestTransferToNpu(TestCase): + def test_generator(self): + g0 = torch.Generator() + self.assertTrue(isinstance(g0, torch.Generator)) + self.assertEqual(g0.device.type, 'cpu') + + g1 = torch.Generator('cuda') + self.assertTrue(isinstance(g1, torch.Generator)) + self.assertEqual(g1.device.type, 'npu') + + g2 = torch.Generator(torch.device('cuda')) + self.assertTrue(isinstance(g2, torch.Generator)) + self.assertEqual(g2.device.type, 'npu') + + g3 = torch.Generator(device='cuda') + self.assertTrue(isinstance(g3, torch.Generator)) + self.assertEqual(g3.device.type, 'npu') + + g4 = torch.Generator(device=torch.device('cuda')) + self.assertTrue(isinstance(g4, torch.Generator)) + self.assertEqual(g4.device.type, 'npu') + def test_wrap_isinstance(self): # check builtins isinstance grammar self.assertTrue(isinstance(1, int)) diff --git a/test/cpp_extensions/extension.cpp b/test/cpp_extensions/extension.cpp index 62c0591e06e359b574f50665545df965bbe39372..f13200aca1da40b966e008a1b43f297ca2f0cd5e 100644 --- a/test/cpp_extensions/extension.cpp +++ b/test/cpp_extensions/extension.cpp @@ -45,6 +45,17 @@ bool check_from_blob() return dtype_same && num_same && pos1_same && pos2_same && pos3_same && sub_same; } +bool check_from_blob_delete() +{ + int isgone = 0; + { + auto data = torch::tensor({1.0, 2.0, 3.0}, torch::kFloat).to(at::Device("npu:0")); + auto res = at_npu::native::from_blob(data.data_ptr(), data.sizes(), [&](void*) { isgone++; }); + } + bool is_deleted = (isgone == 1); + return is_deleted; +} + bool check_from_blob_strides() { auto data = torch::tensor({1, 2, 3, 4, 5, 6, 7, 8, 9}, torch::kInt32).to(at::Device("npu:0")); @@ -95,5 +106,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("check_storage_sizes", &check_storage_sizes, "check_storage_sizes"); m.def("check_from_blob", &check_from_blob, "check_from_blob"); m.def("check_from_blob_strides", &check_from_blob_strides, "check_from_blob_strides"); + m.def("check_from_blob_delete", &check_from_blob_delete, "check_from_blob_delete"); m.def("blocking_ops", &blocking_ops, "blocking_ops"); } diff --git a/test/cpp_extensions/pluggable_allocator_extensions.cpp b/test/cpp_extensions/pluggable_allocator_extensions.cpp index 3ed2606b021ba7796ed6e94ad11f41625a88d169..6bb80e59dd5c4911d79fcb50cadc69b6f6babdbb 100644 --- a/test/cpp_extensions/pluggable_allocator_extensions.cpp +++ 
b/test/cpp_extensions/pluggable_allocator_extensions.cpp @@ -4,8 +4,10 @@ #include "third_party/acl/inc/acl/acl_base.h" #include "third_party/acl/inc/acl/acl_rt.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" extern "C" { +using c10_npu::NPUCachingAllocator::DeviceStats; static bool useflag = false; void* my_malloc(ssize_t size, int device, aclrtStream stream) @@ -27,6 +29,17 @@ bool check_custom_allocator_used() { return useflag; } + +DeviceStats my_get_device_stats(int device) +{ + DeviceStats stats; + return stats; +} + +void my_reset_peak_status(int device) +{ + std::cout<<"resetPeakStatus success!"< None: def set_device(): torch_npu.npu.set_device(0) multiprocessing.set_start_method("spawn", force=True) - jobs = [multiprocessing.Process(target=_worker, args=(i,)) for i in range(70)] + jobs = [multiprocessing.Process(target=_worker, args=(i,)) for i in range(100)] for p in jobs: p.start() @@ -20,4 +20,5 @@ def set_device(): p.join() -set_device() +if __name__ == "__main__": + set_device() diff --git a/test/npu/test_aclgraph_update.py b/test/npu/test_aclgraph_update.py index 644579b9f1e6875854a35ad426adf9dd6272adde..18dbb79c5cb4691e6a9629e81777cd1b345777c3 100644 --- a/test/npu/test_aclgraph_update.py +++ b/test/npu/test_aclgraph_update.py @@ -122,6 +122,53 @@ class TestAclgraphUpdate(TestCase): g.replay() self.assertEqual(output.cpu(), res_src[0].cpu()) self.assertEqual(softmax_lse.cpu(), res_src[1].cpu()) + + @SupportedDevices(['Ascend910B']) + def test_npu_fused_infer_attention_v2(self): + torch.npu.set_device(0) + length = [29] + length_new = [100] + scale = 1 / 0.0078125 + query = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + key = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + value = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + res_src = torch_npu.npu_fused_infer_attention_v2( + query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, + next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length_new) + g = torch.npu.NPUGraph() + event = torch.npu.ExternalEvent() + update_stream = torch.npu.Stream() + handle = None + output = None + softmax_lse = None + + workspace = torch_npu._npu_fused_infer_attention_v2_get_max_workspace( + query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, + next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length) + + with torch.npu.graph(g): + stream = torch.npu.current_stream() + output = torch.empty(1, 32, 1, 128, dtype=torch.float16, device="npu") + softmax_lse = torch.empty(1, dtype=torch.float16, device="npu") + event.wait(stream) + event.reset(stream) + torch.npu.graph_task_group_begin(stream) + torch_npu.npu_fused_infer_attention_v2.out( + query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, workspace=workspace, + next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length, out=[output, softmax_lse]) + handle = torch.npu.graph_task_group_end(stream) + + with torch.npu.stream(update_stream): + torch.npu.graph_task_update_begin(update_stream, handle) + torch_npu.npu_fused_infer_attention_v2.out( + query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, workspace=workspace, + next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length_new, out=[output, softmax_lse]) + torch.npu.graph_task_update_end(update_stream) + event.record(update_stream) + + g.replay() + 
self.assertEqual(output.cpu(), res_src[0].cpu()) + self.assertEqual(softmax_lse.cpu(), res_src[1].cpu()) if __name__ == "__main__": run_tests() diff --git a/test/npu/test_fault_mode.py b/test/npu/test_fault_mode.py index 0d52d5d11c6a55f679676dd6d50ea3e5bf601e99..6877fced8f5c4cf0831c663ccb5e81527db3e6f1 100644 --- a/test/npu/test_fault_mode.py +++ b/test/npu/test_fault_mode.py @@ -6,6 +6,9 @@ from torch.testing._internal.common_utils import TestCase, run_tests from torch.utils.checkpoint import checkpoint import torch.distributed as dist import torch.nn as nn + +from torch_npu.testing.common_utils import SupportedDevices + os.environ["ASCEND_LAUNCH_BLOCKING"] = '0' import torch_npu @@ -156,6 +159,7 @@ class TestMode(TestCase): with self.assertRaisesRegex(RuntimeError, "Invalid device argument"): torch.npu.reset_max_memory_allocated(device="npu:8") + @SupportedDevices(['Ascend910B']) def test_aclrtSetDevice(self): path = os.path.join(os.path.dirname(__file__), '_fault_mode_cases/error_set_device.py') process = subprocess.Popen(["python", f"{path}"], shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) diff --git a/test/npu/test_npu_format.py b/test/npu/test_npu_format.py new file mode 100644 index 0000000000000000000000000000000000000000..2bc1c067ff4496896e816493c36529074bbfb2a8 --- /dev/null +++ b/test/npu/test_npu_format.py @@ -0,0 +1,49 @@ +import torch +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests + + +class TestNPUFormat(TestCase): + + def test_enum_values(self): + """test the enumeration value""" + self.assertEqual(torch_npu.Format.NCHW.value, 0) + self.assertEqual(torch_npu.Format.NHWC.value, 1) + + def test_npu_format_cast(self): + """test npu_format_cast""" + tensor = torch.ones(2, 2).npu() + + out1 = torch_npu.npu_format_cast(tensor, 0) + fmt1 = torch_npu.get_npu_format(out1) + self.assertEqual(fmt1, torch_npu.Format.NCHW) + + out2 = torch_npu.npu_format_cast(tensor, torch_npu.Format.NHWC) + fmt2 = torch_npu.get_npu_format(out2) + self.assertEqual(fmt2, torch_npu.Format.NHWC) + + def test_npu_format_cast_(self): + """test npu_format_cast_""" + x1 = torch.ones(2, 2).npu() + x2 = torch.ones(2, 2).npu() + + torch_npu.npu_format_cast_(x1, 0) + fmt1 = torch_npu.get_npu_format(x1) + self.assertEqual(fmt1, torch_npu.Format.NCHW) + + torch_npu.npu_format_cast_(x2, torch_npu.Format.NHWC) + fmt2 = torch_npu.get_npu_format(x2) + self.assertEqual(fmt2, torch_npu.Format.NHWC) + + def test_get_npu_format(self): + """test get_npu_format""" + x1 = torch.ones(2, 2).npu() + torch_npu.npu_format_cast_(x1, 0) + + fmt1 = torch_npu.get_npu_format(x1) + self.assertEqual(fmt1, torch_npu.Format.NCHW) + self.assertEqual(fmt1, 0) + + +if __name__ == "__main__": + run_tests() diff --git a/test/npu/test_public_bindings.py b/test/npu/test_public_bindings.py index 17bbe73886fe58c42b4df227e6e48390060da3c5..51c288ecda6d314bc7c44bb2f13afa9f70ef57f7 100644 --- a/test/npu/test_public_bindings.py +++ b/test/npu/test_public_bindings.py @@ -545,7 +545,7 @@ class TestPublicBindings(TestCase): "torch_npu.dynamo.torchair._ge_concrete_graph.ge_converter.custom.npu_dequant_bias", "torch_npu.utils.collect_hccl_info", "torch_npu.op_plugin.meta._meta_registrations", - + "torch_npu.op_plugin.atb._atb_meta_registrations", } # No new entries should be added to this list. 
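
The new `test/npu/test_npu_format.py` above exercises the `torch_npu.Format` enum and the `npu_format_cast` / `get_npu_format` helpers wired up by this patch (via `_apply_npu_format_patch`). As a quick orientation for reviewers, here is a minimal standalone usage sketch distilled from that test; it assumes `torch_npu` is installed and an NPU device is available, and it is not part of the diff itself:

```python
import torch
import torch_npu  # provides the Format enum and the npu_format_cast helpers

x = torch.ones(2, 2).npu()

# In-place cast to NCHW; the enum member and its raw integer value (0) compare equal.
torch_npu.npu_format_cast_(x, torch_npu.Format.NCHW)
fmt = torch_npu.get_npu_format(x)
assert fmt == torch_npu.Format.NCHW and fmt == 0

# Out-of-place cast returns a new tensor in the requested format.
y = torch_npu.npu_format_cast(x, torch_npu.Format.NHWC)
assert torch_npu.get_npu_format(y) == torch_npu.Format.NHWC
```
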
diff --git a/test/npu/test_save_async.py b/test/npu/test_save_async.py new file mode 100644 index 0000000000000000000000000000000000000000..2c446df9f572702c94df3265e170404a0d0d1121 --- /dev/null +++ b/test/npu/test_save_async.py @@ -0,0 +1,119 @@ +import os +import time +import copy + +import torch +import torch.nn as nn +import torch.optim as optim + +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.utils._path_manager import PathManager + + +class TestAsyncSave(TestCase): + test_save_path = os.path.join( + os.path.realpath(os.path.dirname(__file__)), "test_save_async") + + @classmethod + def setUpClass(cls): + PathManager.make_dir_safety(TestAsyncSave.test_save_path) + + @classmethod + def tearDownClass(cls): + PathManager.remove_path_safety(TestAsyncSave.test_save_path) + + def wait_for_save_completion(self, file_path, timeout_sec=60, poll_interval_sec=0.5): + start_time = time.time() + + while time.time() - start_time < timeout_sec: + if os.path.exists(file_path): + current_size = os.path.getsize(file_path) + time.sleep(poll_interval_sec) + new_size = os.path.getsize(file_path) + + if current_size == new_size: + return True + else: + time.sleep(poll_interval_sec) + + return False + + def test_save_async_tensor(self): + save_tensor = torch.rand(1024, dtype=torch.float32).npu() + async_save_path = os.path.join(TestAsyncSave.test_save_path, "async_save_tensor.pt") + torch_npu.utils.save_async(save_tensor, async_save_path) + + if self.wait_for_save_completion(async_save_path): + tensor_async = torch.load(async_save_path) + self.assertEqual(tensor_async, save_tensor) + else: + self.assertTrue(False, f"{async_save_path} is not exist!") + + def test_save_async(self): + loss1 = [1.6099495, 1.6099086, 1.6098710] + loss2 = [] + model_list = [] + checkpoint_list = [] + model_origin = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 20), + nn.ReLU(), + nn.Linear(20, 5), + nn.ReLU() + ) + + input_data = torch.ones(6400, 100).npu() + labels = torch.arange(5).repeat(1280).npu() + + criterion = nn.CrossEntropyLoss() + model = model_origin.npu() + optimerizer = optim.SGD(model.parameters(), lr=0.1) + for step in range(3): + outputs = model(input_data) + loss = criterion(outputs, labels) + + optimerizer.zero_grad() + loss.backward() + + optimerizer.step() + + loss2.append(loss) + checkpoint = { + "model": model.state_dict(), + "optimizer": optimerizer.state_dict() + } + checkpoint_list.append(copy.deepcopy(checkpoint)) + model_list.append(copy.deepcopy(model)) + checkpoint_async_path = os.path.join(TestAsyncSave.test_save_path, f"checkpoint_async_{step}.path") + model_async_path = os.path.join(TestAsyncSave.test_save_path, f"model_async_{step}.path") + torch_npu.utils.save_async(checkpoint, checkpoint_async_path, model=model) + torch_npu.utils.save_async(model, model_async_path, model=model) + + for i in range(3): + self.assertEqual(loss1[i], loss2[i].item()) + checkpoint_async_path = os.path.join(TestAsyncSave.test_save_path, f"checkpoint_async_{i}.path") + if self.wait_for_save_completion(checkpoint_async_path): + checkpoint_async = torch.load(checkpoint_async_path) + self.assertEqual(checkpoint_list[i], checkpoint_async, prec=2e-3) + else: + self.assertTrue(False, f"{checkpoint_async_path} is not exist!") + model_async_path = os.path.join(TestAsyncSave.test_save_path, f"model_async_{i}.path") + if self.wait_for_save_completion(model_async_path): + model_async = torch.load(model_async_path) + else: + self.assertTrue(False, 
f"{model_async_path} is not exist!") + state_dict_sync = model_list[i].state_dict() + state_dict_async = model_async.state_dict() + + key_sync = sorted(state_dict_sync.keys()) + key_async = sorted(state_dict_async.keys()) + + self.assertEqual(key_sync, key_async) + for key in key_async: + self.assertEqual(state_dict_async[key], state_dict_sync[key], prec=2e-3) + +if __name__ == '__main__': + torch.npu.set_device(0) + run_tests() diff --git a/test/npu/test_torch_npu.py b/test/npu/test_torch_npu.py index 0e2c96e1bd7dd4709b73c1ff8f9418f839f254dc..29709ef991175785012ecc7aab547d3dae82f15a 100644 --- a/test/npu/test_torch_npu.py +++ b/test/npu/test_torch_npu.py @@ -78,6 +78,12 @@ class TorchNPUDeviceTestCase(TestCase): torch_npu.npu.synchronize() after_free_memory, after_total_memory = torch_npu.npu.mem_get_info(0) self.assertEqual(before_total_memory, after_total_memory) + + @unittest.skip("CANN doesn't support now.") + def test_set_device_res_limit(self): + ans_dict = {'cube_core_num': 12, 'vector_core_num': 24} + torch.npu.set_device_limit(torch.npu.current_device(), 12, 24) + self.assertEqual(ans_dict, torch.npu.get_device_limit(torch.npu.current_device())) class TorchNPUMemoryApiTestCase(TestCase): def test_npu_memory_stats(self): diff --git a/test/npu/test_unsupport_api.py b/test/npu/test_unsupport_api.py index 8883f3eb06e54a587d3e960b08fa262fdaef6494..54af07e0b21e3b5d86f3c01c4ae1cb06734a4a04 100644 --- a/test/npu/test_unsupport_api.py +++ b/test/npu/test_unsupport_api.py @@ -67,16 +67,6 @@ class TestPtaUnsupportApi(TestCase): coalesce_tensor = sparse_tensor.coalesce().npu() coalesce_tensor.ccol_indices() - def test_Tensor_is_shared_runtimeerror(self): - with self.assertRaisesRegex(RuntimeError, r"(.*) is not supported in npu."): - input_tensor = torch.tensor([1, 2, 3]).npu() - input_tensor.is_shared() - - def test_Tensor_share_memory__runtimeerror(self): - with self.assertRaisesRegex(RuntimeError, r"(.*) is not supported in npu."): - input_tensor = torch.tensor([1, 2, 3]).npu() - input_tensor.share_memory_() - def test_Module_share_memory_runtimeerror(self): with self.assertRaisesRegex(RuntimeError, r"(.*) is not supported in npu."): model = SimpleModel().npu() diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index b42929fd6dfe02a69659a18f89c1a0cc387928c3..4be850975ba7e71fcd92089d7ad36d68efd38a68 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -1028,9 +1028,6 @@ "torch_npu.npu.check_uce_in_memory": { "signature": "(device_id)" }, - "torch_npu.npu.get_uce_addr": { - "signature": "()" - }, "torch_npu.npu.clear_npu_overflow_flag": { "signature": "()" }, @@ -2550,7 +2547,7 @@ "signature": "(*args, **kwargs)" }, "torch_npu.npu_format_cast": { - "signature": "(self, acl_format)" + "signature": "(self, acl_format, customize_dtype=None)" }, "torch_npu.npu_format_cast_": { "signature": "(*args, **kwargs)" @@ -2768,6 +2765,9 @@ "torch_npu.utils.set_thread_affinity": { "signature": "(core_range: List[int] = None)" }, + "torch_npu.utils.reset_thread_affinity": { + "signature": "()" + }, "torch_npu.dynamo.torchair.scope.npu_stream_switch": { "signature": "(stream_tag: str, stream_priority: int = 0)" }, @@ -2789,6 +2789,9 @@ "torch_npu.distributed.all_gather_into_tensor_uneven": { "signature": "(output, input, output_split_sizes=None, group=None, async_op=False)" }, + "torch_npu.multiprocessing.reductions.rebuild_npu_tensor": { + "signature": "(tensor_cls, tensor_size, tensor_stride, tensor_offset, storage_cls, dtype, storage_device, storage_handle, 
storage_size_bytes, storage_offset_bytes, requires_grad, ref_counter_handle, ref_counter_offset, event_handle, event_sync_required)" + }, "func: unsafe_empty_with_format": { "signature": "(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, bool keep_format=False) -> Tensor" }, @@ -2811,16 +2814,16 @@ "signature": "(int[] size, *, ScalarType? dtype=None, Device? device=None) -> Tensor" }, "func: npu_format_cast": { - "signature": "(Tensor self, int acl_format) -> Tensor" + "signature": "(Tensor self, int acl_format, int? customize_dtype=None) -> Tensor" }, "func: npu_format_cast_": { - "signature": "(Tensor(a!) self, Tensor src) -> Tensor(a!)" + "signature": "(Tensor(a!) self, Tensor src, int? customize_dtype=None) -> Tensor(a!)" }, "func: npu_format_cast_.acl_format": { - "signature": "(Tensor(a!) self, int acl_format) -> Tensor(a!)" + "signature": "(Tensor(a!) self, int acl_format, int? customize_dtype=None) -> Tensor(a!)" }, "func: npu_format_cast.Tensor": { - "signature": "(Tensor self, Tensor dst) -> Tensor" + "signature": "(Tensor self, Tensor dst, int? customize_dtype=None) -> Tensor" }, "func: npu_change_data_ptr": { "signature": "(Tensor dst, Tensor src, int index) -> int" @@ -2840,6 +2843,9 @@ "func: _npu_format_cast": { "signature": "(Tensor self, int acl_format) -> Tensor" }, + "func: _npu_format_cast.aclnn": { + "signature": "(Tensor self, int acl_format, int customize_dtype) -> Tensor" + }, "torch_c_func: torch_npu::init_npu(const c10::DeviceIndex device_index = 0)": { "signature": "(const c10::DeviceIndex device_index = 0) -> void", "file": "torch_npu/csrc/libs/init_npu.h" diff --git a/third_party/acl/inc/acl/acl.h b/third_party/acl/inc/acl/acl.h index 95abdb6368eaee10dd19e50d292554c708c84be3..a31b673d070888e47052e084704213c0268e8457 100755 --- a/third_party/acl/inc/acl/acl.h +++ b/third_party/acl/inc/acl/acl.h @@ -25,6 +25,7 @@ extern "C" { #define ACL_PATCH_VERSION 0 #define ACL_PKG_VERSION_MAX_SIZE 128 #define ACL_PKG_VERSION_PARTS_MAX_SIZE 64 +#define ACL_IPC_HANDLE_SIZE 65 /** * @ingroup AscendCL diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h index b8ef9dbd34075370416a049efff05be7b4c110df..7d592db6ed09d4e53ad5df29773b0935fcfc5515 100755 --- a/third_party/acl/inc/acl/acl_base.h +++ b/third_party/acl/inc/acl/acl_base.h @@ -48,6 +48,7 @@ extern "C" { typedef void *aclrtStream; typedef void *aclrtEvent; typedef void *aclrtContext; +typedef void *aclrtNotify; typedef int aclError; typedef uint16_t aclFloat16; typedef struct aclDataBuffer aclDataBuffer; @@ -183,6 +184,8 @@ typedef enum { ACL_FRACTAL_Z_3D = 33, ACL_FORMAT_NC = 35, ACL_FORMAT_NCL = 47, + ACL_FORMAT_FRACTAL_NZ_C0_16 = 50, + ACL_FORMAT_FRACTAL_NZ_C0_32 = 51, } aclFormat; typedef enum { diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index 98b520ba4ac73a4b5072d98fd436edde37b51655..ecc36f38128bd746bc9f9cb5064e6f47f9bc5b6a 100755 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -181,6 +181,11 @@ typedef enum aclrtLastErrLevel { ACL_RT_THREAD_LEVEL = 0, } aclrtLastErrLevel; +typedef enum { + ACL_RT_DEV_RES_CUBE_CORE = 0, + ACL_RT_DEV_RES_VECTOR_CORE, +} aclrtDevResModelType; + typedef void* aclrtDrvMemHandle; typedef void (*aclrtCallback)(void *userData); @@ -1541,6 +1546,37 @@ ACL_FUNC_VISIBILITY aclError aclrtPeekAtLastError(aclrtLastErrLevel level); */ ACL_FUNC_VISIBILITY aclError aclrtGetLastError(aclrtLastErrLevel level); +/** + * @ingroup 
AscendCL + * @brief Get the value of the current device's limited resources + * @param [in] deviceId the device id + * @param [in] type resources type + * @param [out] value resources limit value + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtGetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t* value); + +/** + * @ingroup AscendCL + * @brief Set the value of the current device's limited resources + * @param [in] deviceId the device id + * @param [in] type resource type + * @param [in] value resource limit value + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtSetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t value); + +/** + * @ingroup AscendCL + * @brief Reset the value of the current device's limited resources + * @param [in] deviceId the device id + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtResetDeviceResLimit(int32_t deviceId); + #ifdef __cplusplus } #endif diff --git a/third_party/acl/libs/acl.cpp b/third_party/acl/libs/acl.cpp index 4f24e6bf043cba7c53c7015e597f5c6e82164bd6..9bb32581dd7ea6ca7d1b5fe01c7896dfb7d84764 100644 --- a/third_party/acl/libs/acl.cpp +++ b/third_party/acl/libs/acl.cpp @@ -18,6 +18,9 @@ aclError aclmdlSetDump(const char *configPath){return 0;} aclError aclmdlInitDump(){return 0;} aclError aclmdlFinalizeDump(){return 0;} aclError aclrtDeviceTaskAbort(int32_t deviceId, uint32_t timeout){return 0;} +aclError aclrtGetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t* value){return 0;} +aclError aclrtSetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t value){return 0;} +aclError aclrtResetDeviceResLimit(int32_t deviceId){return 0;} // Stream aclError aclrtCreateStream(aclrtStream *stream) { return 0; } diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index 023914a348285ad17c459b077cdd03c4593637ea..216ef7a83847e424ee1b0679b351d188452a2981 100644 --- a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -212,6 +212,8 @@ inline void HcclCommConfigInit(HcclCommConfig *config) config->hcclRdmaTrafficClass = HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET; config->hcclRdmaServiceLevel = HCCL_COMM_SERVICE_LEVEL_CONFIG_NOT_SET; config->hcclOpExpansionMode = HCCL_COMM_DEFAULT_OP_EXPANSION_MODE; + config->hcclWorldRankID = 0; + config->hcclJobID = 0; } /** diff --git a/third_party/hccl/inc/hccl/hccl_types.h b/third_party/hccl/inc/hccl/hccl_types.h index 40631676c1bdc9bb44256b083e647e99e8f6fc8f..9a02c61c0414a96af23bf2468ab96482512240fa 100644 --- a/third_party/hccl/inc/hccl/hccl_types.h +++ b/third_party/hccl/inc/hccl/hccl_types.h @@ -15,7 +15,7 @@ extern "C" { const uint32_t HCCL_COMM_CONFIG_INFO_BYTES = 24; const uint32_t HCCL_COMM_CONFIG_MAGIC_WORD = 0xf0f0f0f0; -const uint32_t HCCL_COMM_CONFIG_VERSION = 5; +const uint32_t HCCL_COMM_CONFIG_VERSION = 6; const uint32_t HCCL_COMM_DEFAULT_BUFFSIZE = 200; // 200MB buffer size const uint32_t HCCL_COMM_DEFAULT_DETERMINISTIC = 0; // Disable deterministic calculations const uint32_t COMM_NAME_MAX_LENGTH = 128; @@ -132,6 +132,8 @@ typedef struct HcclCommConfigDef { uint32_t hcclOpExpansionMode; uint32_t hcclRdmaTrafficClass; uint32_t hcclRdmaServiceLevel; + uint32_t hcclWorldRankID; + uint64_t hcclJobID; } HcclCommConfig; typedef enum { diff --git 
a/third_party/op-plugin b/third_party/op-plugin index 680dea4984135de69dc1ee031e08942c4049fa72..c94178b515bd4c1cc88f6598a72c7d019fa10b7a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 680dea4984135de69dc1ee031e08942c4049fa72 +Subproject commit c94178b515bd4c1cc88f6598a72c7d019fa10b7a diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index fe8e22ad3a347ab8116eb61ff44450ef1fb07f91..9818eff91d926398e6bc2a733d044efe21629477 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit fe8e22ad3a347ab8116eb61ff44450ef1fb07f91 +Subproject commit 9818eff91d926398e6bc2a733d044efe21629477 diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index d84f72b37a3dd1069eb06d1b43a64ff4604d9902..10e15a15224e216ee116060b6838936b4a8d2712 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -75,6 +75,7 @@ from torch_npu.utils import _apply_module_patch, _add_tensor_methods, _add_colle _apply_npu_show_warning from torch_npu.utils._dynamo_device import _dynamo_register_interface_for_device from torch_npu.npu._stream_check import apply_sanitizer_patch +from torch_npu.npu._format import _apply_npu_format_patch import torch_npu.utils.custom_ops import torch_npu.distributed.rpc import torch_npu.op_plugin @@ -83,6 +84,7 @@ from torch_npu.distributed.rpc.backend_registry import _rpc_backend_registry from torch_npu.utils import _cann_package_check, _add_intercept_methods from torch_npu.utils import _register_ops_under_dtensor_rules from torch_npu.utils.exposed_api import public_npu_functions +from torch_npu.multiprocessing.reductions import _add_reductions_methods from torch_npu.npu.utils import _erase_stream as erase_stream from torch_npu.utils._error_code import ErrCode, pta_error, _except_handler from torch_npu.asd.asd import _asd_patch @@ -113,6 +115,7 @@ for name in dir(torch.ops.npu): __all__.append(name) setattr(torch, name, _wrap_torch_error_func(getattr(torch.ops.npu, name))) + all_monkey_patches = [ ["nn.functional", npu_functional], ["nn", npu_modules], @@ -171,6 +174,8 @@ def _apply_class_patches(): add_perf_dump_patch() _apply_distributed_methods_patch() _apply_mstx_patch() + _add_reductions_methods() + _apply_npu_format_patch() def _apply_distributed_methods_patch(): @@ -193,6 +198,7 @@ torch._register_device_module('npu', torch_npu.npu) unsupported_dtype = [torch.quint8, torch.quint4x2, torch.quint2x4, torch.qint32, torch.qint8] torch.utils.generate_methods_for_privateuse1_backend(for_tensor=True, for_module=True, for_storage=True, unsupported_dtype=unsupported_dtype) +torch.nn.parameter.UninitializedTensorMixin._allowed_methods.append(torch.Tensor.npu) # Apply monkey-patches. 
_apply_patches(all_monkey_patches) diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index 41e84feded80fa570ee6639c29c614e098e35233..47486142336c06040d66792eb2f399285f85635b 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -1,5 +1,5 @@ import os -from functools import wraps +from functools import wraps, partial import logging import time import warnings @@ -16,8 +16,6 @@ from ._silent_fault_data import SilentFaultData, SilentFaultDataV2 __all__ = [] -original_matmul = torch.matmul -original_tensor_matmul = torch.Tensor.matmul loggerSilent = logging.getLogger("torch_npu.silent_check") @@ -314,17 +312,11 @@ class _MatmulSilentCheck: self.checksum_result = None self.checksum_state = None self.checksum_state_thread_running = False - self.checksum_state_thread = threading.Thread( - target=self._tcp_comm_checksum_state, - daemon=True - ) + self.checksum_state_thread = None # Use another thread to receive the statistic value and detect SDC self.check_thread_running = False - self.check_thread = threading.Thread( - target=self._async_detect, - daemon=True - ) - self.lock = threading.Lock() + self.check_thread = None + self._lock = None self.queue_len = 1024 self.statistic_cpu_value = None self.name_list = ["" for _ in range(self.queue_len)] @@ -409,7 +401,13 @@ class _MatmulSilentCheck: def get_grad_sample_interval(self): return self.filter_interval - + + @property + def lock(self): + if self._lock is None: + self._lock = threading.Lock() + return self._lock + def init_stream(self): if self.statistic_cpu_value is None: self.statistic_value = torch.tensor(0., device=f"npu:{torch_npu.npu.current_device()}") @@ -431,7 +429,8 @@ class _MatmulSilentCheck: def register_module_hook(self, module, name): self.check_stat[name + "_backward"] = {'avg': 0, 'pre_val': 0, 'step': 0, 'none_zero_step': 0} - self.hook_dict[name + "_backward"] = module.register_full_backward_hook(lambda module, grad_input, grad_output, n=name + "_backward": self.module_hook(module, grad_input, grad_output, n)) + hook = partial(self.module_hook, name=name + "_backward") + self.hook_dict[name + "_backward"] = module.register_full_backward_hook(hook) self.registered_modules.append(name) def module_hook(self, module, grad_input, grad_output, name): @@ -472,6 +471,8 @@ class _MatmulSilentCheck: if hasattr(torch, "npu") and torch.npu.is_initialized() and torch.distributed.is_initialized(): break time.sleep(10) + if not self.check_thread_running: + return local_rank = os.getenv("LOCAL_RANK", "-1") if local_rank.isdigit(): torch.npu.set_device(int(local_rank)) @@ -481,7 +482,7 @@ class _MatmulSilentCheck: val = self.statistic_cpu_value[self.head_index].item() name = self.name_list[self.head_index] while val != -1 and name != "": - loggerSilent.debug(f"[silent data] name:{name}, val: {val}, pre_val: {self.check_stat[name]['pre_val']}, avg: {self.check_stat[name]['avg']}, step: {self.check_stat[name]['step']}, none_zero_step: {self.check_stat[name]['none_zero_step']}") + loggerSilent.debug(f"[silent data] name:{name}, val: {val}, pre_val: {self.check_stat[name]['pre_val']}, avg: {self.check_stat[name]['avg']}, bp time: {self.check_stat[name]['step']}, none_zero_step: {self.check_stat[name]['none_zero_step']}") result, self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'] = self._silent_check( val, self.check_stat[name]['pre_val'], self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'], self.upper_thresh1, self.upper_thresh2 @@ -604,21 +605,21 @@ class _MatmulSilentCheck: def 
_generate_event_log(self, new_abnormal): info_str = f"[Event][{new_abnormal['time_str']}] [Rank {new_abnormal['rank']}]: A grad-norm spike may happen, " info_str = info_str + f"param name {new_abnormal['name']}, abnormal value {new_abnormal['val']}, previous value {new_abnormal['pre_val']}, " - info_str = info_str + f"history avg {new_abnormal['avg']}, step {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." + info_str = info_str + f"history avg {new_abnormal['avg']}, bp time {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." loggerSilent.info(info_str) if self.store is not None and self.rank is not None and self.rank != 0: current_log = self.store.get(f"rank_{self.rank}_info_log").decode() self.store.set(f"rank_{self.rank}_info_log", current_log + "\n" + info_str if current_log != "" else info_str) def _generate_warning_log(self, counting_abnormal_pos, new_abnormal): - warning_str = f"[Warning][{new_abnormal['time_str']}] [Rank {new_abnormal['rank']}]: Training instability happens, feature detection detects abnormal results!" + warning_str = f"[Warning][{new_abnormal['time_str']}] [Rank {new_abnormal['rank']}]: feature detection detects abnormal results!" index = 0 for pos in reversed(counting_abnormal_pos): warning_str = warning_str + "\n" + f"Grad-norm spike: index {index}, time {self.history_abnormal_list[pos]['time_str']}, param name {self.history_abnormal_list[pos]['name']}, abnormal value {self.history_abnormal_list[pos]['val']}, previous value {self.history_abnormal_list[pos]['pre_val']}, " - warning_str = warning_str + f"history avg {self.history_abnormal_list[pos]['avg']}, step {self.history_abnormal_list[pos]['step']}, normal count {self.history_abnormal_list[pos]['none_zero_step']}." + warning_str = warning_str + f"history avg {self.history_abnormal_list[pos]['avg']}, bp time {self.history_abnormal_list[pos]['step']}, normal count {self.history_abnormal_list[pos]['none_zero_step']}." index += 1 warning_str = warning_str + "\n" + f"Grad-norm spike: index {index}, time {new_abnormal['time_str']}, param name {new_abnormal['name']}, abnormal value {new_abnormal['val']}, previous value {new_abnormal['pre_val']}, " - warning_str = warning_str + f"history avg {new_abnormal['avg']}, step {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." + warning_str = warning_str + f"history avg {new_abnormal['avg']}, bp time {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." 
loggerSilent.warning(warning_str) if self.store is not None and self.rank is not None and self.rank != 0: current_log = self.store.get(f"rank_{self.rank}_warn_log").decode() @@ -636,6 +637,8 @@ class _MatmulSilentCheck: if hasattr(torch, "npu") and torch.npu.is_initialized() and torch.distributed.is_initialized() and self.store is not None: break time.sleep(10) + if not self.checksum_state_thread_running: + return local_rank = os.getenv("LOCAL_RANK", "-1") self.rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() @@ -673,7 +676,7 @@ class _MatmulSilentCheck: if global_state: now_time = time.time() if last_checksum_time is None or abs(now_time - last_checksum_time) > self.checksum_cooldown * 60: - loggerSilent.info(f'[Info] Rank {self.rank}: Training instability happened, checksum is on.') + loggerSilent.info(f'[Info] Rank {self.rank}: feature detection detects abnormal results, checksum is on.') last_checksum_time = now_time if self.checksum_result is None: self.checksum_result = torch.tensor(False, dtype=torch.bool, device='npu') @@ -697,14 +700,44 @@ class _MatmulSilentCheck: time.sleep(10) + def __getstate__(self): + self._cleanup() + state = self.__dict__.copy() + state['_lock'] = None + state['store'] = None + return state + + def __setstate(self, state): + self.__dict__.update(state) + self.store = None + + def _startup(self): + if not self.check_thread_running: + self.check_thread_running = True + self.check_thread = threading.Thread( + target=self._async_detect, + daemon=True + ) + self.check_thread.start() + + if not self.checksum_state_thread_running: + self.checksum_state_thread_running = True + self.checksum_state_thread = threading.Thread( + target=self._tcp_comm_checksum_state, + daemon=True + ) + self.checksum_state_thread.start() + def _cleanup(self): if self.check_thread_running: self.check_thread_running = False self.check_thread.join() + self.check_thread = None if self.checksum_state_thread_running: self.checksum_state_thread_running = False self.checksum_state_thread.join() + self.checksum_state_thread = None matmul_check = _MatmulSilentCheck() @@ -747,15 +780,10 @@ def _matmul_silent_check_decorator(func): matmul_check.init_module_info(id(self), self.training) self.matmul_check_outer = True - if not matmul_check.check_thread_running: - matmul_check.check_thread_running = True - matmul_check.check_thread.start() - - # 2 for checksum - if not matmul_check.checksum_state_thread_running: - matmul_check.checksum_state_thread_running = True - matmul_check.checksum_state_thread.start() + matmul_check._startup() if matmul_check.with_checksum and not matmul_check.matmul_trigger: + original_matmul = torch.matmul + original_tensor_matmul = torch.Tensor.matmul torch_npu.asd.checksum.matmul = original_matmul torch.matmul = _trigger_matmul_decorator(original_matmul) torch.Tensor.matmul = _trigger_tensor_matmul_decorator(original_tensor_matmul) diff --git a/torch_npu/asd/checksum.py b/torch_npu/asd/checksum.py index cc6832f39832ac1be197a6cd5b5e393d52aef4dc..a9576675cb071d5d71e03843773026d62a6a8345 100644 --- a/torch_npu/asd/checksum.py +++ b/torch_npu/asd/checksum.py @@ -41,5 +41,10 @@ def _matmul_checksum(a, b, c): error_total = (c_ele_round_error_accum).to(torch.float) error = torch.abs(c_sum - c1_trans) - flag = (error - error_total) > 1e-20 - return torch.any(flag) + flag = (error - 5 * error_total) > 5 * 1e-20 + any_flag = torch.any(flag) + if any_flag: + matmul(a, b, out=c) + c_mean2 = torch.mean(torch.abs(c), dim=-1) + return torch.any(c_mean != 
c_mean2) + return any_flag diff --git a/torch_npu/contrib/function/roll.py b/torch_npu/contrib/function/roll.py index 97037c8d0754427942742c28de5ba0f6568ee3ac..550064e693c5ce5fc1110ef7f2b666ceb739a531 100644 --- a/torch_npu/contrib/function/roll.py +++ b/torch_npu/contrib/function/roll.py @@ -30,7 +30,7 @@ _roll_with_index_select = _RollWithIndexSelect.apply def _get_roll_index(H, W, shifts, device='cpu'): index = torch.arange(0, H * W).reshape(H, W) index_fp = torch.roll(index, shifts=shifts, dims=(0, 1)).reshape(-1).long() - index_bp_dict = {i:idx for idx, i in enumerate(index_fp.numpy().tolist())} + index_bp_dict = {i: idx for idx, i in enumerate(index_fp.numpy().tolist())} index_bp_list = [index_bp_dict[i] for i in range(H * W)] index_bp = torch.LongTensor(index_bp_list) return [index_fp.to(device), index_bp.to(device)] diff --git a/torch_npu/contrib/transfer_to_npu.py b/torch_npu/contrib/transfer_to_npu.py index 5a8a355d1a5aac1bc102ded8893bfd9b23b8457e..e7e6014c4f824a5d3d98d5fb1a73b27748957520 100644 --- a/torch_npu/contrib/transfer_to_npu.py +++ b/torch_npu/contrib/transfer_to_npu.py @@ -28,7 +28,7 @@ torch_fn_white_list = ['logspace', 'randint', 'hann_window', 'rand', 'full_like' 'eye', '_sparse_csr_tensor_unsafe', 'empty', '_sparse_coo_tensor_unsafe', 'blackman_window', 'zeros_like', 'range', 'sparse_csr_tensor', 'randn_like', 'from_file', '_cudnn_init_dropout_state', '_empty_affine_quantized', 'linspace', 'hamming_window', - 'empty_quantized', '_pin_memory', 'autocast', 'load', "Generator", 'set_default_device'] + 'empty_quantized', '_pin_memory', 'autocast', 'load', 'set_default_device'] torch_tensor_fn_white_list = ['new_empty', 'new_empty_strided', 'new_full', 'new_ones', 'new_tensor', 'new_zeros', 'to', 'pin_memory'] torch_module_fn_white_list = ['to', 'to_empty'] @@ -45,6 +45,14 @@ cur_path = os.path.dirname(os.path.realpath(__file__)) config_path = os.path.join(cur_path, 'apis_config.json') +class _GeneratorProxy(torch.Generator): + + def __new__(cls, device='cpu'): + device = _replace_cuda_to_npu_in_list([device], None)[0] + instance = super().__new__(cls, device) + return instance + + def _get_function_from_string(attribute_string): try: module_path, _, attr_name = attribute_string.rpartition('.') @@ -331,6 +339,7 @@ def _init(): # torch.* _device_wrapper(torch, torch_fn_white_list) torch.UntypedStorage.__new__ = _wrapper_cuda(torch.UntypedStorage.__new__) + torch.Generator = _GeneratorProxy # torch.Tensor.* _device_wrapper(torch.Tensor, torch_tensor_fn_white_list) @@ -350,6 +359,8 @@ def _init(): _wrapper_cuda(torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel.__init__) _del_nccl_device_backend_map() torch.distributed.device_mesh.init_device_mesh = _wrapper_cuda(torch.distributed.device_mesh.init_device_mesh) + torch.distributed.distributed_c10d._new_group_with_tag = _wrapper_hccl( + torch.distributed.distributed_c10d._new_group_with_tag) # CUDAGraph torch.cuda.CUDAGraph = torch.npu.NPUGraph diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index c8084af923b506b9fd08466c21dfcaa73b68b79e..672b5289f5c8580ef21add7c8fafd824bd92bc86 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -15,7 +15,9 @@ #include "torch_npu/csrc/profiler/init.h" #include "torch_npu/csrc/flopcount/Init.h" #include "torch_npu/csrc/logging/Init.h" +#include "torch_npu/csrc/ipc/StorageSharing.h" #include "torch_npu/csrc/npu/Module.h" +#include "torch_npu/csrc/custom_dtype/Init.h" #include 
"torch_npu/csrc/npu/Stress_detect.h" #include "torch_npu/csrc/utils/TensorType.h" #include "torch_npu/csrc/utils/AutocastMode.h" @@ -168,6 +170,8 @@ PyObject* initModule() AddPyMethodDefs(methods, torch_npu::autocast::autocast_mode_functions()); AddPyMethodDefs(methods, torch_npu::flopcount::flops_count_functions()); AddPyMethodDefs(methods, torch_npu::logging::logging_functions()); + AddPyMethodDefs(methods, torch_npu::reductions::reductions_functions()); + AddPyMethodDefs(methods, c10_npu::custom_dtype_functions()); static struct PyModuleDef torchnpu_module = { PyModuleDef_HEAD_INIT, "torch_npu._C", diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp index 0c4000e52458682f8fa8ede2cde47b6421637d5d..2c35eaf44c791e9351d132a8d5db1944a41b0bcd 100644 --- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp @@ -1,16 +1,131 @@ #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/framework/utils/OpAdapter.h" #include "torch_npu/csrc/framework/utils/NpuStorageOffsetGuard.h" +#include "torch_npu/csrc/framework/StorageDescHelper.h" #include "torch_npu/csrc/aten/common/FormatCastHelper.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" +#include "torch_npu/csrc/core/npu/NpuVariables.h" #include "torch_npu/csrc/aten/CustomFunctions.h" +#include "torch_npu/csrc/custom_dtype/Init.h" +#include "third_party/op-plugin/op_plugin/utils/op_api_common.h" namespace at_npu { namespace native { using tensor_list = std::vector; +using GetFormatFunc = int (*)(const aclTensor *, const int, const int, int64_t **, uint64_t *, int *); + +std::tuple> MaybeUseAclnnNpuFormatCast(const at::Tensor& src, + int64_t acl_format, c10::optional customize_dtype) +{ + const static auto GetFormatFuncAddr = GetOpApiFuncAddr("aclnnNpuFormatCastCalculateSizeAndFormat"); + const static auto FormatCastFuncAddr = GetOpApiFuncAddr("aclnnNpuFormatCast"); + + const static bool aclnnNpuFormatCastExist = + (GetFormatFuncAddr == nullptr || FormatCastFuncAddr == nullptr) ? false : true; + + GetFormatFunc GetFormat = reinterpret_cast(GetFormatFuncAddr); + int64_t *dstStorageShape = nullptr; + uint64_t dstShapeSize = 0; + int dstFormat; + at::SmallVector outputShape = {}; + aclDataType customizeAcltype = (customize_dtype.has_value()) ? 
+ c10_npu::GetAclDataType(customize_dtype.value()) : + at_npu::native::OpPreparation::convert_to_acl_data_type(src.scalar_type()); + + if (c10_npu::IsAclnnOnly()) { + if (aclnnNpuFormatCastExist) { + auto api_ret = GetFormat(ConvertType(src), acl_format, customizeAcltype, &dstStorageShape, + &dstShapeSize, &dstFormat); + NPU_CHECK_ERROR(api_ret, "aclnnNpuFormatCastCalculateSizeAndFormat"); + for (uint64_t i = 0; i < dstShapeSize; i++) { + outputShape.push_back(dstStorageShape[i]); + } + delete[] dstStorageShape; + return std::make_tuple(true, dstFormat, outputShape); + } + TORCH_CHECK(false, + "aclnnNpuFormatCast does not exist, Current device only support aclnn operators.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + } + if (at_npu::native::env::CheckJitDisable()) { + if (aclnnNpuFormatCastExist) { + auto api_ret = GetFormat(ConvertType(src), acl_format, customizeAcltype, &dstStorageShape, + &dstShapeSize, &dstFormat); + if (api_ret != 0) { + if (customize_dtype.has_value()) { + NPU_CHECK_ERROR(api_ret, "aclnnNpuFormatCastCalculateSizeAndFormat"); + } + return std::make_tuple(false, dstFormat, outputShape); + } + for (uint64_t i = 0; i < dstShapeSize; i++) { + outputShape.push_back(dstStorageShape[i]); + } + delete[] dstStorageShape; + return std::make_tuple(true, dstFormat, outputShape); + } else { + if (C10_UNLIKELY(customize_dtype.has_value())) { + TORCH_CHECK(false, + "customize_dtype is not supported while aclnnNpuFormatCast does not exist.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + } + return std::make_tuple(false, dstFormat, outputShape); + } + } else { + if (C10_UNLIKELY(customize_dtype.has_value())) { + TORCH_CHECK(false, + "customize_dtype is not supported while jit_compile=True.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + } + return std::make_tuple(false, dstFormat, outputShape); + } +} + +at::Tensor create_tensor_with_format_and_shape(c10::IntArrayRef baseSizes, + c10::IntArrayRef storageSizes, + const caffe2::TypeMeta dtype, int64_t acl_format) +{ + c10::Allocator *allocator = c10_npu::NPUCachingAllocator::get(); + int64_t nelements = 1; + for (const auto& num : storageSizes) { + nelements *= num; + } + int64_t size_bytes = nelements * dtype.itemsize(); + c10::intrusive_ptr storage_impl = torch_npu::make_npu_storage_impl( + c10::StorageImpl::use_byte_size_t(), + c10::SymInt(size_bytes), + allocator->allocate(size_bytes), + allocator, + true); + auto tensor = at::detail::make_tensor(storage_impl, dtype); + + if (baseSizes.size() != 1 || baseSizes[0] != 0) { + tensor.unsafeGetTensorImpl()->set_sizes_contiguous(baseSizes); + } + tensor.unsafeGetTensorImpl()->empty_tensor_restride(c10::MemoryFormat::Contiguous); + StorageDescHelper::SetDesc(tensor, baseSizes, storageSizes, tensor.strides(), static_cast(acl_format)); + return tensor; +} + +at::Tensor format_cast_impl_out_npu_aclnn(const at::Tensor& src, + int64_t acl_format, c10::IntArrayRef storageSizes) +{ + auto src_new = src.contiguous(); + auto src_new_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src_new)->npu_desc_; + + at::Tensor dst = create_tensor_with_format_and_shape( + src_new.sizes(), storageSizes, src.dtype(), acl_format); + + // calculate the output result of the NPU + EXEC_NPU_CMD(aclnnNpuFormatCast, src_new, dst); + + // format cast only change physical layout of base tensor and view tensor's + // metadata remain unchanged + dst.set_(dst.storage(), src_new.storage_offset(), src_new.sizes(), src_new.strides()); + return dst; +} at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src) { @@ -36,7 +151,8 @@ 
at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src) } // convert src from src_format to dst_format, write the result into dst(self) -at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Tensor& src) +at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Tensor& src, + c10::optional customize_dtype) { torch_npu::utils::torch_check_npu(self); torch_npu::utils::torch_check_npu(src); @@ -47,6 +163,13 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Ten return self; } + auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, dst_desc.npu_format_, customize_dtype); + if (useAclnn == true) { + at::Tensor dst = format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape); + self.set_(dst.storage(), dst.storage_offset(), dst.sizes(), dst.strides()); + return self; + } + // calculate the output result of the NPU format_cast_impl_out_npu(self, src); @@ -59,16 +182,6 @@ at::Tensor npu_format_cast_impl( int64_t acl_format) { auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; - if (src_desc.npu_format_ == acl_format) { - ASCEND_LOGD("no need to do format cast"); - return src; - } - if (FormatHelper::IsBaseFormatType(src) && - FormatHelper::IsBaseFormatType(static_cast(acl_format))) { - FormatCastHelper::format_cast_as_base_format(src, static_cast(acl_format)); - return src; - } - at::Tensor dst = OpPreparation::ApplyTensorWithFormat( src_desc.base_sizes_, src.options(), acl_format); @@ -84,18 +197,20 @@ at::Tensor npu_format_cast_impl( // conver self to dst'format, write the result into new result tensor at::Tensor NPUNativeFunctions::npu_format_cast( const at::Tensor& self, - const at::Tensor& dst) + const at::Tensor& dst, + c10::optional customize_dtype) { torch_npu::utils::torch_check_npu(dst); auto dst_desc = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_; int64_t dst_format = dst_desc.npu_format_; - return custom_ops::npu_format_cast(self, dst_format); + return custom_ops::npu_format_cast(self, dst_format, customize_dtype); } // conver self to acl_format, write the result into self at::Tensor& NPUNativeFunctions::npu_format_cast_( at::Tensor& self, - int64_t acl_format) + int64_t acl_format, + c10::optional customize_dtype) { torch_npu::utils::torch_check_npu(self); auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_; @@ -108,6 +223,13 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_( return self; } + auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, customize_dtype); + if (useAclnn == true) { + at::Tensor dst = format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape); + self.set_(dst.storage(), dst.storage_offset(), dst.sizes(), dst.strides()); + return self; + } + at::Tensor dst = OpPreparation::ApplyTensorWithFormat( src_desc.base_sizes_, self.options(), acl_format); @@ -130,16 +252,54 @@ int64_t NPUNativeFunctions::get_npu_format(const at::Tensor& self) at::Tensor NPUNativeFunctions::_npu_format_cast(const at::Tensor& self, int64_t acl_format) { - return npu_format_cast_impl(self, acl_format); + auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_; + if (src_desc.npu_format_ == acl_format) { + ASCEND_LOGD("no need to do format cast"); + return self; + } + if (FormatHelper::IsBaseFormatType(self) && + FormatHelper::IsBaseFormatType(static_cast(acl_format))) { + FormatCastHelper::format_cast_as_base_format(self, static_cast(acl_format)); + return self; + } + 
auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, c10::nullopt); + if (useAclnn == false) { + return npu_format_cast_impl(self, acl_format); + } + return format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape); +} + +at::Tensor NPUNativeFunctions::_npu_format_cast(const at::Tensor& self, int64_t acl_format, + int64_t customize_dtype) +{ + auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_; + if (src_desc.npu_format_ == acl_format) { + ASCEND_LOGD("no need to do format cast"); + return self; + } + if (FormatHelper::IsBaseFormatType(self) && + FormatHelper::IsBaseFormatType(static_cast(acl_format))) { + FormatCastHelper::format_cast_as_base_format(self, static_cast(acl_format)); + return self; + } + auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, customize_dtype); + if (useAclnn == false) { + return npu_format_cast_impl(self, acl_format); + } + return format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape); } -at::Tensor NPUNativeFunctions::npu_format_cast(const at::Tensor& self, int64_t acl_format) +at::Tensor NPUNativeFunctions::npu_format_cast(const at::Tensor& self, int64_t acl_format, + c10::optional customize_dtype) { torch_npu::utils::torch_check_npu(self); if (NPUNativeFunctions::get_npu_format(self) == acl_format) { ASCEND_LOGD("no need to do format cast"); return self; } + if (customize_dtype.has_value()) { + return custom_ops::_npu_format_cast(self, acl_format, customize_dtype.value()); + } return custom_ops::_npu_format_cast(self, acl_format); } diff --git a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp index 775d95cbfa597a61fcf71eca04008d8c21fd4e83..685f907653a96e2f36e6ee5c9ea4dc6344618cef 100644 --- a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp +++ b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp @@ -10,13 +10,34 @@ namespace at_npu { namespace native { +#define AT_DISPATCH_CASE_ALL_TYPES_AND5( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \ + AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__) + + +#define AT_DISPATCH_ALL_TYPES_AND5( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND5( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, __VA_ARGS__)) + + c10::Scalar NPUNativeFunctions::_local_scalar_dense(const at::Tensor& self) { c10::Scalar r; - AT_DISPATCH_ALL_TYPES_AND3( + AT_DISPATCH_ALL_TYPES_AND5( at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, + at::ScalarType::Float8_e5m2, + at::ScalarType::Float8_e4m3fn, self.scalar_type(), "_local_scalar_dense_npu", [&] { diff --git a/torch_npu/csrc/aten/common/ToKernelNpu.cpp b/torch_npu/csrc/aten/common/ToKernelNpu.cpp index 3b7a2245571eeadb800a93be4b1c105460a6b33a..143d40bf5e24f4c8ca5607fe158a53524672aa58 100644 --- a/torch_npu/csrc/aten/common/ToKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/ToKernelNpu.cpp @@ -162,11 +162,11 @@ at::Tensor NPUNativeFunctions::to( } if (dtype == at::ScalarType::Double) { TORCH_NPU_WARN_ONCE( - "Warning: Device do not support double dtype now, " - "dtype cast repalce with float."); + "Device does not support double dtype now, " + "dtype cast replaced with float."); } dtype = (dtype == at::ScalarType::Double) ? at::ScalarType::Float : dtype; - return custom_ops::npu_dtype_cast(self, dtype); + return custom_ops::_npu_dtype_cast(self, dtype); } at::Tensor NPUNativeFunctions::to( diff --git a/torch_npu/csrc/aten/common/from_blob.cpp b/torch_npu/csrc/aten/common/from_blob.cpp index 08f2e63fd20ad2fc219853ea86c42a185d6a9284..1363d69459a00888c0d2ae215f28afd4d47923ba 100644 --- a/torch_npu/csrc/aten/common/from_blob.cpp +++ b/torch_npu/csrc/aten/common/from_blob.cpp @@ -36,7 +36,12 @@ at::Tensor TensorMaker::make_tensor() std::size_t size_bytes = computeStorageSize(); - c10::DataPtr data_ptr{data_, *device_}; + c10::DataPtr data_ptr{}; + if (deleter_) { + data_ptr = c10::InefficientStdFunctionContext::makeDataPtr(data_, std::move(deleter_), *device_); + } else { + data_ptr = c10::DataPtr(data_, *device_); + } c10::intrusive_ptr storage_impl = torch_npu::make_npu_storage_impl( c10::StorageImpl::use_byte_size_t(), @@ -86,6 +91,54 @@ std::size_t TensorMaker::computeStorageSize() const noexcept return storage_size; } +at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + std::function deleter, + const at::TensorOptions& options, + const c10::optional target_device) +{ + return for_blob(data, sizes) + .deleter(std::move(deleter)) + .options(options) + .target_device(target_device) + .make_tensor(); +} + +at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + at::IntArrayRef strides, + int64_t storage_offset, + const std::function& deleter, + const at::TensorOptions& options, + const c10::optional target_device) +{ + return for_blob(data, sizes) + .strides(strides) + .storage_offset(storage_offset) + .deleter(deleter) + .options(options) + .target_device(target_device) + .make_tensor(); +} + +at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + at::IntArrayRef strides, + const std::function& deleter, + const at::TensorOptions& options, + const c10::optional target_device) +{ + return for_blob(data, sizes) + .strides(strides) + .deleter(deleter) + .options(options) + .target_device(target_device) + .make_tensor(); +} + at::Tensor from_blob( void* data, at::IntArrayRef sizes, diff --git a/torch_npu/csrc/aten/common/from_blob.h b/torch_npu/csrc/aten/common/from_blob.h index f0d6bbd12700ec295d322762febe80070286bb43..0669d2fdca08965e9797918b35d83b185ef1272e 100644 --- a/torch_npu/csrc/aten/common/from_blob.h +++ b/torch_npu/csrc/aten/common/from_blob.h @@ -41,6 +41,12
@@ public: return *this; } + TensorMaker& deleter(std::function value) noexcept + { + deleter_ = std::move(value); + + return *this; + } at::Tensor make_tensor(); private: @@ -58,6 +64,7 @@ private: c10::optional device_{}; at::TensorOptions opts_{}; c10::Allocator* allocator_{}; + std::function deleter_{}; }; inline TensorMaker for_blob(void* data, at::IntArrayRef sizes) noexcept @@ -65,6 +72,30 @@ inline TensorMaker for_blob(void* data, at::IntArrayRef sizes) noexcept return TensorMaker{data, sizes}; } +TORCH_NPU_API at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + std::function deleter, + const at::TensorOptions& options = {}, + const c10::optional target_device = c10::nullopt); + +TORCH_NPU_API at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + at::IntArrayRef strides, + int64_t storage_offset, + const std::function& deleter, + const at::TensorOptions& options = {}, + const c10::optional target_device = c10::nullopt); + +TORCH_NPU_API at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + at::IntArrayRef strides, + const std::function& deleter, + const at::TensorOptions& options = {}, + const c10::optional target_device = c10::nullopt); + TORCH_NPU_API at::Tensor from_blob( void* data, at::IntArrayRef sizes, diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml index 95bb740db159bef654fb063934f68344c1bf257e..b186df765181599eca85294f3343033c711f8a32 100644 --- a/torch_npu/csrc/aten/npu_native_functions.yaml +++ b/torch_npu/csrc/aten/npu_native_functions.yaml @@ -62,12 +62,12 @@ custom: - func: npu_change_data_ptr(Tensor dst, Tensor src, int index) -> int device_check: NoCheck - func: get_npu_format(Tensor self) -> int - - func: npu_format_cast.Tensor(Tensor self, Tensor dst) -> Tensor + - func: npu_format_cast.Tensor(Tensor self, Tensor dst, int? customize_dtype=None) -> Tensor device_check: NoCheck exposed: True - - func: npu_format_cast_.acl_format(Tensor(a!) self, int acl_format) -> Tensor(a!) + - func: npu_format_cast_.acl_format(Tensor(a!) self, int acl_format, int? customize_dtype=None) -> Tensor(a!) exposed: True - - func: npu_format_cast_(Tensor(a!) self, Tensor src) -> Tensor(a!) + - func: npu_format_cast_(Tensor(a!) self, Tensor src, int? customize_dtype=None) -> Tensor(a!) device_check: NoCheck exposed: True - func: empty_with_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, int? base_addr_aligned_kb=None) -> Tensor @@ -82,9 +82,10 @@ custom: - func: copy_memory_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) device_check: NoCheck - func: get_storage_size(Tensor self) -> int - - func: npu_format_cast(Tensor self, int acl_format) -> Tensor + - func: npu_format_cast(Tensor self, int acl_format, int? customize_dtype=None) -> Tensor exposed: True - func: _npu_format_cast(Tensor self, int acl_format) -> Tensor + - func: _npu_format_cast.aclnn(Tensor self, int acl_format, int customize_dtype) -> Tensor - func: empty_with_swapped_memory(int[] size, *, ScalarType? dtype=None, Device? 
device=None) -> Tensor dispatch: CompositeExplicitAutograd: empty_with_swapped_memory diff --git a/torch_npu/csrc/core/NPUStorageImpl.h b/torch_npu/csrc/core/NPUStorageImpl.h index bb8d5062dd1968730abbea9e720e31d14449789b..5bbaa0428c1d7cb9aa4b78601ebdfb714318e03e 100644 --- a/torch_npu/csrc/core/NPUStorageImpl.h +++ b/torch_npu/csrc/core/NPUStorageImpl.h @@ -24,7 +24,7 @@ public: aclFormat origin_format_ = ACL_FORMAT_UNDEFINED; aclFormat npu_format_ = ACL_FORMAT_ND; // used to make CANN GE tensor from storagImpl - caffe2::TypeMeta data_type_; + caffe2::TypeMeta data_type_ = caffe2::TypeMeta::Make(); }; struct NPUStorageImpl : public c10::StorageImpl { diff --git a/torch_npu/csrc/core/npu/CachingHostAllocator.cpp b/torch_npu/csrc/core/npu/CachingHostAllocator.cpp index f03bfdb05b05dd3754747cd0a13d2af411fafe6e..ca093bb83726a1d217a1f6f9bf764ad798155a8e 100644 --- a/torch_npu/csrc/core/npu/CachingHostAllocator.cpp +++ b/torch_npu/csrc/core/npu/CachingHostAllocator.cpp @@ -1,4 +1,5 @@ #include +#include #include "torch_npu/csrc/core/npu/npu_log.h" #include #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -133,14 +134,16 @@ struct HostAllocator { c10_npu::SetCurrentDevice(); } + // Round up the allocation to the nearest power of two to improve reuse. + size_t roundSize = c10::llvm::PowerOf2Ceil(size); // allocate a new block if no cached allocation is found - err = aclrtMallocHost(ptr, size); + err = aclrtMallocHost(ptr, roundSize); if (err != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); return err; } - blocks.insert({*ptr, Block(size, *ptr, true)}); + blocks.insert({*ptr, Block(roundSize, *ptr, true)}); return ACL_ERROR_NONE; } diff --git a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp index bcb89e6c88354a1a290b320c0d9003417b0339f7..c5f6f913d44632c09ebea3978b0f1bdbccbaf82a 100644 --- a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp +++ b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp @@ -93,8 +93,10 @@ void GetExclusiveAffinityCPU() offset = find_offset->second; } c10_npu::CoreIdRange cpu_range = parseAffinityCPU(affinity_cpu); - int length = (cpu_range.end - cpu_range.start + 1) / same_num; - c10_npu::CoreIdRange exclusiveAffinityCpu = {cpu_range.start + offset * length, (cpu_range.start + length - 1) + offset * length}; + unsigned int length = (cpu_range.end - cpu_range.start + 1) / static_cast(same_num); + c10_npu::CoreIdRange exclusiveAffinityCpu = { + cpu_range.start + static_cast(offset) * length, + (cpu_range.start + length - 1) + static_cast(offset) * length}; offsetMap[affinity_cpu] = offset + 1; CardIdAffinityCPU[card_id] = exclusiveAffinityCpu; } diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index a331439d9f22f4516d4e41ade170d7c0ee6eb584..6c2d35fd951aaae50b0293c0437db458a3896874 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -9,11 +9,16 @@ #include #include #include +#include namespace c10_npu { static thread_local ThreadType local_thread = ThreadType::MAIN_THREAD; +static pthread_t main_thread; +static bool start_main_thread_bind = false; +static std::mutex core_map_mutex; + using ThreadCoreMap = std::unordered_map; static uint32_t cpu_affinity_mode; @@ -28,8 +33,7 @@ const std::unordered_map threadTypeToNameMap = { {ACL_THREAD, "acl_thread"}, {RELEASE_THREAD, "release_thread"}, {WATCHDOG_THREAD, "hccl_watchdog_t"}, - {OTHER_THREAD, "other_thread"}, - 
{USER_THREAD, "user_thread"}}; + {OTHER_THREAD, "other_thread"}}; CoreIdRange getCPUDefaultRange(c10::DeviceIndex device_id) { @@ -147,7 +151,7 @@ void printCoreRanges(const uint32_t mode, const std::vector &ranges oss << "Mode: " << mode << ". Core range for each device ID: "; for (size_t i = 0; i < ranges.size(); ++i) { - oss << "Device " << i << ": [" << ranges[i].start << "," << ranges[i].end << "]"; + oss << "Device " << i << ": [" << ranges[i].start << ", " << ranges[i].end << "]"; if (i != ranges.size() - 1) { oss << "; "; } else { @@ -194,18 +198,18 @@ void SetThreadType(ThreadType type) return; } if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { - ASCEND_LOGW("Set thread name of %s failed!", threadTypeToNameMap.at(type).c_str()); + ASCEND_LOGW("Set thread name to %s failed!", threadTypeToNameMap.at(type).c_str()); } } std::string getAffinityMapAsString(c10::DeviceIndex device_id, const ThreadCoreMap &threadCoreMap) { std::ostringstream oss; - for (auto local_thread : threadTypeList) { - oss << threadTypeToNameMap.at(local_thread) << " : [" - << threadCoreMap.at(local_thread).start << "," - << threadCoreMap.at(local_thread).end << "]"; - if (local_thread != OTHER_THREAD) { + for (auto thread_type : threadTypeList) { + oss << threadTypeToNameMap.at(thread_type) << ": [" + << threadCoreMap.at(thread_type).start << ", " + << threadCoreMap.at(thread_type).end << "]"; + if (thread_type != OTHER_THREAD) { oss << "; "; } else { oss << "."; @@ -222,16 +226,16 @@ ThreadCoreMap getCpuAffinityMap(c10::DeviceIndex device_id, const std::vector lock(core_map_mutex); if (device_thread_core_maps.find(device_id) == device_thread_core_maps.end()) { device_thread_core_maps.emplace(device_id, getCpuAffinityMap(device_id, device_ranges)); } - core_range = device_thread_core_maps.at(device_id).at(local_thread); + core_range = device_thread_core_maps.at(device_id).at(type); } + return core_range; +} - cpu_set_t mask; - CPU_ZERO(&mask); - for (auto i = core_range.start; i <= core_range.end; i++) { - CPU_SET(i, &mask); +void SetThreadAffinity(c10::DeviceIndex device_id) +{ + if (!needToSetThreadAffinity() || local_thread == ThreadType::USER_THREAD) { + return; } - if (!pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask)) { + + CoreIdRange core_range = getCoreRange(device_id, local_thread); + if (setThreadAffinityImpl(pthread_self(), core_range)) { ASCEND_LOGD("Device %d set %s affinity to %d-%d success.", device_id, threadTypeToNameMap.at(local_thread).c_str(), core_range.start, core_range.end); } else { @@ -280,7 +299,10 @@ void SetThreadAffinity(ThreadType type) int device_index; NPU_CHECK_ERROR_WITHOUT_UCE(GetDevice(&device_index)); c10::DeviceIndex device = static_cast(device_index); - SetThreadType(type); + local_thread = type; + if (local_thread == ThreadType::MAIN_THREAD) { + start_main_thread_bind = true; + } SetThreadAffinity(device); } @@ -289,20 +311,55 @@ void SetThreadAffinity(int core_start, int core_end) if (!needToSetThreadAffinity()) { return; } + static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); - core_start = std::min(core_start, core_nums); - core_end = std::min(core_end, core_nums); + CoreIdRange core_range; + core_range.start = static_cast(std::min(core_start, core_nums)); + core_range.end = static_cast(std::min(core_end, core_nums)); local_thread = ThreadType::USER_THREAD; - cpu_set_t mask; - CPU_ZERO(&mask); - for (auto i = core_start; i <= core_end; i++) { - CPU_SET(i, &mask); - } - if (!pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask)) { - 
ASCEND_LOGD("Set %s affinity to %d-%d success.", threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); + if (setThreadAffinityImpl(pthread_self(), core_range)) { + ASCEND_LOGD("Set thread affinity to user-defined range %d-%d success.", core_range.start, core_range.end); } else { - ASCEND_LOGE("Set %s affinity to %d-%d failed.", threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); + ASCEND_LOGE("Set thread affinity to user-defined range %d-%d failed.", core_range.start, core_range.end); + } +} + +void SetMainThread() +{ + main_thread = pthread_self(); +} + +bool NeedMainThreadBind() +{ + return start_main_thread_bind && (local_thread == ThreadType::MAIN_THREAD); +} + +void StartMainThreadBind(c10::DeviceIndex device_id) +{ + if (!needToSetThreadAffinity() || local_thread == ThreadType::USER_THREAD) { + return; + } + + static thread_local bool seted = false; + if (!seted) { + seted = true; + if (syscall(SYS_gettid) != getpid()) { + start_main_thread_bind = true; + + SetThreadAffinity(device_id); + + CoreIdRange core_range = getCoreRange(device_id, ThreadType::MAIN_THREAD); + if (setThreadAffinityImpl(main_thread, core_range)) { + ASCEND_LOGD("Device %d set %s affinity to %d-%d success.", + device_id, threadTypeToNameMap.at(ThreadType::MAIN_THREAD).c_str(), + core_range.start, core_range.end); + } else { + ASCEND_LOGE("Device %d set %s affinity to %d-%d failed.", + device_id, threadTypeToNameMap.at(ThreadType::MAIN_THREAD).c_str(), + core_range.start, core_range.end); + } + } } } diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index 0ec3c4d995de635118e450c3553489facb1dc1a2..e850a47b67f3484ffafb56f0b4cc67b0eea0c0ee 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -20,9 +20,12 @@ enum ThreadType { }; void SetThreadType(ThreadType type); - void SetThreadAffinity(c10::DeviceIndex device); void SetThreadAffinity(ThreadType type); void SetThreadAffinity(int core_start, int core_end); +void SetMainThread(); +bool NeedMainThreadBind(); +void StartMainThreadBind(c10::DeviceIndex device_id); + } // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index c8575d5bc4fdfdbd1270bb4f3f16291b82b691d1..1691427e4602ce281a79344d3a5af0b4b8db29ec 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -24,6 +24,7 @@ #include "torch_npu/csrc/core/npu/GetCANNInfo.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/NPUEvent.h" +#include "torch_npu/csrc/core/npu/NPUIPCPidManager.h" #include "torch_npu/csrc/profiler/npu_profiler.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" @@ -100,6 +101,12 @@ const std::string kMinCannVersion = "8.1.RC1"; // minimum cann version wh const std::string kMinDriverVersion = "25.0.RC1"; // minimum driver version which supports 1g mem 25.0.RC1 const std::string kCannModule = "CANN"; // cann module name +static char SHAREABLE_HANDLE_VERSION = 1; +enum ShareableHandleType : char { + SHAREABLE_NPU_MALLOC = 'c', + SHAREABLE_NPU_EXPANDABLE_SEGMENT = 'e' +}; + using StatTypes = std::array(StatType::NUM_TYPES)>; void update_stat(Stat &stat, int64_t amount) @@ -355,7 +362,10 @@ bevhavior for allocator tensors that need to be used cross-process. 
*/ struct ExpandableSegment { - ExpandableSegment(int device, aclrtStream stream, size_t size) + ExpandableSegment( + int device, + std::optional stream, + size_t size) : device_(device), stream_(stream), max_handles_(0), @@ -376,7 +386,7 @@ struct ExpandableSegment { auto default_stream = c10_npu::getDefaultNPUStream().stream(false); if (kSmallBuffer == segment_size_) { max_handles_ = numSegments(kSmallPoolVirAddrSize); - } else if (default_stream != stream) { + } else if (default_stream != *stream) { max_handles_ = numSegments(kLargePoolVirAddrSize); } } @@ -416,17 +426,17 @@ struct ExpandableSegment { for (auto j : c10::irange(begin, i)) { auto h = handles_.at(j).value(); handles_.at(j) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h.handle)); } trimHandles(); return rangeFromHandles(begin, begin); } NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); - handles_.at(i) = handle; + handles_.at(i) = Handle{handle, std::nullopt}; } for (auto i : c10::irange(begin, end)) { NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem((char *)ptr_ + i * segment_size_, segment_size_, 0, - handles_.at(i).value(), 0, getHcclComm())); + handles_.at(i).value().handle, 0, getHcclComm())); } ASCEND_LOGD("NPUCachingAllocator map: segment_size=%zu", segment_size_); return rangeFromHandles(begin, end); @@ -446,6 +456,59 @@ struct ExpandableSegment { return rangeFromHandles(begin, end); } + // Set up IPC sharing for range. + // Returns the (larger) range that was actually shared. + // Serializes data to std::ostream that can be passed to the + // other process, and then restored as an expandable segment + // via ExpandableSegment::fromShared(istream); + SegmentRange share(SegmentRange range, std::ostream& buf) + { + auto begin = segmentLeft(range.ptr); + auto end = segmentRight(range.ptr + range.size); + ShareHeader header{segment_size_, end - begin}; + buf.write((const char*)&header, sizeof(ShareHeader)); + for (auto i :
c10::irange(handles_.size())) { HCCL_CHECK_ERROR(at_npu::hccl::HcclCommActivateCommMemoryFace(hcclComm_->getHcclComm(), - (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value(), 0)); + (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value().handle, 0)); } } @@ -476,6 +539,15 @@ struct ExpandableSegment { } private: + void mapAndSetAccess(size_t begin, size_t end) + { + for (auto i : c10::irange(begin, end)) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem((char *)ptr_ + i * segment_size_, segment_size_, 0, + handles_.at(i).value().handle, 0, getHcclComm())); + } + ASCEND_LOGD("NPUCachingAllocator mapAndSetAccess: segment_size=%zu", segment_size_); + } + void unmapHandles(size_t begin, size_t end) { // note: unlike aclrtFree, MemUnmap and MemRelease do @@ -485,18 +557,23 @@ private: // cannot call c10::npu::stream_synchronize because // it might grab the GIL which can lead to a deadlock // Locking order must be GIL -> Allocator Lock - NPU_CHECK_ERROR(aclrtSynchronizeStream(stream_)); + if (stream_) { + NPU_CHECK_ERROR(aclrtSynchronizeStream(*stream_)); + } else { + c10_npu::NPUGuard device_guard(device_); + c10_npu::npuSynchronizeDevice(true); + } #ifndef BUILD_LIBTORCH const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { - trigger->traceNpuStreamSynchronization(reinterpret_cast(stream_)); + trigger->traceNpuStreamSynchronization(reinterpret_cast(*stream_)); } #endif for (auto i : c10::irange(begin, end)) { - aclrtDrvMemHandle h = handles_.at(i).value(); + Handle h = handles_.at(i).value(); handles_.at(i) = c10::nullopt; NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h.handle)); } ASCEND_LOGD("NPUCachingAllocator unmap: segment_size=%zu", segment_size_); trimHandles(); @@ -553,11 +630,19 @@ private: } int device_; - aclrtStream stream_; + std::optional stream_; void *ptr_{}; size_t max_handles_; size_t segment_size_; - std::vector> handles_; + struct Handle { + aclrtDrvMemHandle handle; + std::optional shareableHandle; + }; + struct ShareHeader { + size_t segment_size; + size_t num_handles; + }; + std::vector> handles_; std::shared_ptr hcclComm_; }; @@ -726,6 +811,7 @@ BlockState::BlockState(Block *block) SegmentState::SegmentState(Block *head) { + TORCH_INTERNAL_ASSERT(head != nullptr, PTA_ERROR(ErrCode::PTR)); TORCH_INTERNAL_ASSERT(head->prev == nullptr && head->pool != nullptr); is_small = head->pool->is_small; @@ -882,7 +968,7 @@ size_t CachingAllocatorConfig::parseExpandableSegments(const std::vector& lock_; }; +struct handle_str { + char data[ACL_IPC_HANDLE_SIZE]; +}; + +// handle for ptr +ska::flat_hash_map ipc_handle_map; + class DeviceCachingAllocator { private: // lock around all operations @@ -1542,6 +1635,40 @@ public: return basePtr; } + ShareableHandle shareIpcHandle(Block* block) + { + std::lock_guard lock(mutex); + std::ostringstream ss; + ss.put(SHAREABLE_HANDLE_VERSION); + ptrdiff_t offset = 0; + if (!block->expandable_segment_) { + ss.put(SHAREABLE_NPU_MALLOC); + size_t base_size; + void* base_ptr = getBaseAllocation(block, &base_size); + offset = (char*)block->ptr - (char*)base_ptr; + + handle_str handle; + auto it = ipc_handle_map.find(base_ptr); + if (it == ipc_handle_map.end()) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemGetExportKey( + base_ptr, base_size, handle.data, ACL_IPC_HANDLE_SIZE, 0)); + int32_t* pids = 
nullptr; + size_t pid_num = torch_npu::ipc::getPids(&pids); + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemSetImportPid(handle.data, pids, pid_num)); + ipc_handle_map[base_ptr] = handle; + } else { + handle = it->second; + } + ss.write((char*)&handle, ACL_IPC_HANDLE_SIZE); + } else { + ss.put(SHAREABLE_NPU_EXPANDABLE_SEGMENT); + auto full_range = block->expandable_segment_->share( + SegmentRange(block->ptr, block->size), ss); + offset = (char*)block->ptr - (char*)full_range.ptr; + } + return ShareableHandle{offset, ss.str()}; + } + void recordStream(Block *block, c10_npu::NPUStream stream) { std::lock_guard lock(mutex); @@ -2220,6 +2347,9 @@ private: // map_block will map some of unmapped and merge with free auto remaining = size - candidate->size; auto new_candidate = candidate->next; + if (C10_UNLIKELY(new_candidate == nullptr)) { + return nullptr; + } if (!map_block(new_candidate, std::min(remaining, candidate->next->size), ctx)) { return nullptr; } @@ -2443,7 +2573,11 @@ private: { bool freed_memory = false; for (const auto &name : FreeNPUMemoryCallbacksRegistry()->Keys()) { - freed_memory |= FreeNPUMemoryCallbacksRegistry()->Create(name)->Execute(); + if (FreeNPUMemoryCallbacksRegistry()->Create(name) != nullptr) { + freed_memory |= FreeNPUMemoryCallbacksRegistry()->Create(name)->Execute(); + } else { + TORCH_CHECK(false, "free memory callback get nullptr", PTA_ERROR(ErrCode::PTR)); + } } return freed_memory; } @@ -2486,7 +2620,7 @@ private: // Repeat GC until we reach reclaim > target size. bool block_freed = true; - while (gc_reclaimed < target_size && block_freed == true && freeable_block_count > 0) { + while (gc_reclaimed < target_size && block_freed && freeable_block_count > 0) { // Free blocks exceeding this age threshold first. double age_threshold = total_age / freeable_block_count; // Stop iteration if we can no longer free a block. @@ -2678,6 +2812,12 @@ private: record_trace(TraceEntry::SEGMENT_FREE, int64_t(block->ptr), block->size, block->stream, block->device, context ? context : block->context_when_segment_allocated); + auto it = ipc_handle_map.find(block->ptr); + if (it != ipc_handle_map.end()) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemClose(it->second.data)); + ipc_handle_map.erase(it); + } + aclrtFree((void *)block->ptr); total_allocated_memory -= block->size; @@ -3152,6 +3292,15 @@ public: return device_allocator[block->device]->getBaseAllocation(block, outSize); } + ShareableHandle shareIpcHandle(void* ptr) override + { + Block* block = get_allocated_block(ptr); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr); + } + return device_allocator[block->device]->shareIpcHandle(block); + } + void recordStream(const c10::DataPtr &ptr, c10_npu::NPUStream stream) override { // Empty tensor's storage().data() might be a null ptr. 
As there is no @@ -3402,6 +3551,109 @@ public: this->free(ptr); } + std::mutex IpcMutex; + struct MemHandleCacheEntry { + MemHandleCacheEntry( + c10::DeviceIndex device, + std::string& handle, + const DeviceCachingAllocator& allocator) + : device_(device) + { + int type = SHAREABLE_NPU_MALLOC; + std::istringstream ss(handle); + if (handle.size() != ACL_IPC_HANDLE_SIZE) { + auto version = ss.get(); + TORCH_CHECK( + version <= SHAREABLE_HANDLE_VERSION, + "received sharable handle from a future version of torch that this version does not know how to handle", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + type = ss.get(); + } + // otherwise this is coming from an old pytorch where it has to be a raw + // SHAREABLE_NPU_MALLOC + if (type == SHAREABLE_NPU_MALLOC) { + handle_str handle_r; + ss.read(handle_r.data, ACL_IPC_HANDLE_SIZE); + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemImportByKey(&npu_ipc_ptr_, handle_r.data, 0)); + handle_s.assign(handle_r.data, ACL_IPC_HANDLE_SIZE); + } else if (type == SHAREABLE_NPU_EXPANDABLE_SEGMENT) { + expandable_segment_ = + ExpandableSegment::fromShared(device, ss) + .release(); + } else { + TORCH_INTERNAL_ASSERT( + false, "Unexpected or illformed shareable handle type"); + } + } + // this struct expects that clear is explicitly called to + // free resources, because we only want this code running when + // the shared pointer to this entry is destructed, not during + // deinitialization when npu may already have been shutdown. + // This replicates the previous behavior of this map when it + // stored raw npu_ipc_ptr_ handles. + void clear() + { + if (npu_ipc_ptr_) { + c10_npu::NPUGuard device_guard(device_); + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemClose(handle_s.c_str())); + npu_ipc_ptr_ = nullptr; + } + if (expandable_segment_) { + delete expandable_segment_; + expandable_segment_ = nullptr; + } + } + void* ptr() + { + if (npu_ipc_ptr_) { + return npu_ipc_ptr_; + } else { + return expandable_segment_->ptr(); + } + } + c10::DeviceIndex device_; + ExpandableSegment* expandable_segment_{nullptr}; + void* npu_ipc_ptr_{nullptr}; // nullptr if expandable_segment_ is not null + std::weak_ptr wp_; + std::string handle_s; + }; + ska::flat_hash_map ipcMemHandle_to_devptr; + + std::shared_ptr getIpcDevPtr(std::string handle) override + { + std::lock_guard lock(IpcMutex); + + auto iter = ipcMemHandle_to_devptr.find(handle); + if (iter != ipcMemHandle_to_devptr.end()) { + auto devptr = iter->second.wp_.lock(); + TORCH_INTERNAL_ASSERT(devptr, "entry in cache has missing shared_ptr"); + return devptr; + } + int curr_device = 0; + NPU_CHECK_ERROR(c10_npu::GetDevice(&curr_device)); + auto inserted = ipcMemHandle_to_devptr.insert( + iter, + {handle, + MemHandleCacheEntry( + static_cast(curr_device), handle, *device_allocator[curr_device])}); + auto sp = std::shared_ptr( + inserted->second.ptr(), [handle, this](void* ptr) { + std::unique_lock deleter_lock(IpcMutex); + + auto it = ipcMemHandle_to_devptr.find(handle); + TORCH_INTERNAL_ASSERT(it != ipcMemHandle_to_devptr.end()); + auto entry = std::move(it->second); + ipcMemHandle_to_devptr.erase(it); + + // ExpandableSegment synchronizes on destruction in unmapHandles, so + // we need to release the lock first to minimize the performance hit. 
+ deleter_lock.unlock(); + entry.clear(); + }); + inserted->second.wp_ = sp; + return sp; + } + void FreeDeviceCachedMemory(int device) override { device_allocator[device]->emptyCache(device, true); diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index c33f51fbc895f989609cd6ef0953678d7b0e1cdf..c7082c89044158360f39373593e2deabb658b776 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -23,8 +23,8 @@ C10_NPU_API std::mutex* getFreeMutex(); // block inside of already allocated area. class FreeMemoryCallback { public: - virtual ~FreeMemoryCallback(){}; - virtual bool Execute() = 0; + virtual ~FreeMemoryCallback(){}; + virtual bool Execute() = 0; }; C10_DECLARE_REGISTRY(FreeNPUMemoryCallbacksRegistry, FreeMemoryCallback); @@ -188,6 +188,11 @@ using OutOfMemoryObserver = std::function; +struct ShareableHandle { + ptrdiff_t offset; + std::string handle; +}; + class NPUAllocator : public c10::Allocator { public: virtual c10::DataPtr allocate_with_aligned(size_t size, size_t aligned) const = 0; @@ -227,6 +232,8 @@ public: " does not yet support checkPoolLiveAllocations. " "If you need it, please file an issue describing your use case.", PTA_ERROR(ErrCode::NOT_SUPPORT)); } + virtual ShareableHandle shareIpcHandle(void* ptr) = 0; + virtual std::shared_ptr getIpcDevPtr(std::string handle) = 0; virtual bool isHistoryEnabled() { TORCH_CHECK( @@ -376,6 +383,16 @@ inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) return get()->releasePool(device, mempool_id); } +inline std::shared_ptr getIpcDevPtr(std::string handle) +{ + return get()->getIpcDevPtr(handle); +} + +inline ShareableHandle shareIpcHandle(void* ptr) +{ + return get()->shareIpcHandle(ptr); +} + inline void FreeDeviceCachedMemory(int device) { return get()->FreeDeviceCachedMemory(device); diff --git a/torch_npu/csrc/core/npu/NPUEventManager.h b/torch_npu/csrc/core/npu/NPUEventManager.h index c01491aa033752413dd880329445a9eb2d8556e2..ac7f0176e0f52daf9f88fdd39bdb2f5b0d546f5b 100644 --- a/torch_npu/csrc/core/npu/NPUEventManager.h +++ b/torch_npu/csrc/core/npu/NPUEventManager.h @@ -22,7 +22,7 @@ public: void DecreaseUnrecordedCount(aclrtEvent event); bool IsEventRecorded(aclrtEvent event); void ClearUnrecordedCount(); - ~NPUEventManager() {} + ~NPUEventManager() {} private: void run(aclrtEvent event); diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 9c667f1fdb120c30696c49c1a911702332ffb54b..290daec7f11529eb1e744325ab335b84194f7db9 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -47,13 +47,14 @@ void warn_(const ::c10::Warning& warning) std::string formatErrorCode(SubModule submodule, ErrCode errorCode) { + if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) { + return ""; + } std::ostringstream oss; int deviceIndex = -1; c10_npu::GetDevice(&deviceIndex); auto rank_id = c10_npu::option::OptionsManager::GetRankId(); - if (!(c10_npu::option::OptionsManager::ShouldPrintLessError())) { oss << "\n[ERROR] " << getCurrentTimestamp() << " (PID:" << getpid() << ", Device:" << deviceIndex << ", RankID:" << rank_id << ") "; - } oss << "ERR" << std::setw(2) << std::setfill('0') << static_cast(submodule); oss << std::setw(3) << std::setfill('0') << static_cast(errorCode); oss << " " << submoduleMap[submodule] << " " << errCodeMap[errorCode]; @@ -92,7 +93,7 @@ MemUceInfo memUceInfo; std::mutex 
memUceInfoMutex; -void set_mem_uce_info(MemUceInfo info) +void set_mem_uce_info(MemUceInfo& info) { std::lock_guard lock(memUceInfoMutex); memUceInfo = info; @@ -132,10 +133,12 @@ const std::string c10_npu_check_error_message(std::string& errmsg) std::regex ws_regex("[\\s\\t\\n\\r]+"); content = std::regex_replace(content, ws_regex, " "); - if (!content.empty() && content.front() == ' ') + if (!content.empty() && content.front() == ' ') { content.erase(0, 1); - if (!content.empty() && content.back() == ' ') + } + if (!content.empty() && content.back() == ' ') { content.pop_back(); + } return content; } @@ -147,10 +150,10 @@ const std::string c10_npu_check_error_message(std::string& errmsg) const char *c10_npu_get_error_message() { auto errmsg = c10_npu::acl::AclGetErrMsg(); - if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { + if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) { std::string log(errmsg); std::string errmsg_ = c10_npu::c10_npu_check_error_message(log); - thread_local std::string processedErrMsg = errmsg_; + thread_local std::string processedErrMsg = "CANN error: " + errmsg_; c10_npu::setRepoErrMsg(processedErrMsg.c_str()); return processedErrMsg.c_str(); } else { @@ -172,7 +175,7 @@ bool checkUceErrAndRepair(bool check_error, std::string& err_msg) int device = 0; auto err = c10_npu::GetDevice(&device); if (err != ACL_ERROR_NONE) { - err_msg = "ERROR happend in GetDevice."; + err_msg = "ERROR happened in GetDevice."; if (check_error) { TORCH_CHECK(false, err_msg, PTA_ERROR(ErrCode::ACL)); } else { diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index a82f8f15688c9da828cc977954527869c99708d3..1d34ae2050ded401c5508ccaa57a89ad481e2a74 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -151,7 +151,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) " that driver and firmware packages do not match."); \ return true; \ }(); \ - } else if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { \ + } else if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) { \ std::ostringstream oss; \ oss << " NPU function error: " \ << (device_error_msg.empty() ? getErrorFunction(#err_code, ##__VA_ARGS__) : device_error_msg) \ @@ -166,8 +166,20 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) false, \ (device_error_msg.empty() ? "" : device_error_msg), \ c10_npu::c10_npu_get_error_message()); \ + } else if (error_code == ACL_ERROR_RT_DEVICE_TASK_ABORT) { \ + TORCH_CHECK( \ + false, \ + __func__, \ + ":", \ + __FILE__, \ + ":", \ + __LINE__, \ + " NPU function error: ", (device_error_msg.empty() ? 
\ + " FORCE STOP" : device_error_msg), \ + ", error code is ", error_code, \ + PTA_ERROR(ErrCode::ACL)); \ } else { \ - TORCH_CHECK( \ + TORCH_CHECK( \ false, \ __func__, \ ":", \ @@ -195,7 +207,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) static c10_npu::acl::AclErrorCode err_map; \ if ((Error) != ACL_ERROR_NONE) { \ CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ - if (c10_npu::option::OptionsManager::ShouldPrintLessError()) \ + if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) \ { \ std::ostringstream oss; \ oss << " OPS function error: " << getErrorFunction(#err_code, ##__VA_ARGS__) \ @@ -260,7 +272,7 @@ bool checkUceErrAndRepair(bool check_error, std::string& err_msg); void record_mem_hbm_ecc_error(); -void set_mem_uce_info(MemUceInfo info); +void set_mem_uce_info(MemUceInfo& info); MemUceInfo get_mem_uce_info(); diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index cca42ed288f29bfac5b6c644ae42b2af80366356..0ceb84847bfd235be562af9a8e742da9d34eac30 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -5,6 +5,7 @@ #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" +#include "third_party/acl/inc/acl/acl_rt.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" #endif @@ -46,7 +47,6 @@ aclError GetDevice(int32_t *device) { if (targetDeviceIndex >= 0) { *device = targetDeviceIndex; - NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(targetDeviceIndex)); return ACL_ERROR_NONE; } @@ -60,13 +60,8 @@ aclError GetDevice(int32_t *device) } if (err == ACL_ERROR_NONE) { local_device = *device; - } else if (err == ACL_ERROR_RT_CONTEXT_NULL && aclrtSetDevice(0) == ACL_ERROR_NONE) { + } else if (err == ACL_ERROR_RT_CONTEXT_NULL) { *device = 0; - local_device = 0; - std::lock_guard lock(mtx); - if (used_devices.find(local_device) == used_devices.end()) { - NPU_CHECK_ERROR_WITHOUT_UCE(aclrtGetCurrentContext(&used_devices[local_device])); - } return ACL_ERROR_NONE; } return err; @@ -103,7 +98,10 @@ aclError SetDevice(c10::DeviceIndex device) if (local_device == device) { return ACL_ERROR_NONE; } - c10_npu::SetThreadAffinity(device); + + if (c10_npu::NeedMainThreadBind()) { + c10_npu::SetThreadAffinity(device); + } aclError err = aclrtSetDevice(device); if (err == ACL_ERROR_NONE) { @@ -116,6 +114,17 @@ aclError SetDevice(c10::DeviceIndex device) return err; } +aclError MaybeSetDevice(c10::DeviceIndex device) +{ + if (isDeviceCtxActive(device)) { + ASCEND_LOGI("MaybeSetDevice: NPU device %d has not been initialized! 
We will set targetDeviceIndex.", device); + NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(device)); + } else { + targetDeviceIndex = device; + } + return ACL_ERROR_NONE; +} + aclError ResetUsedDevices() { std::lock_guard lock(mtx); @@ -137,7 +146,7 @@ aclError DestroyUsedStreams() for (const auto it : used_devices) { NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(it.first)); NPUStream stream = getCurrentNPUStream(it.first); - aclError acl_ret = acl::AclrtDestroyStreamForce(stream); + aclError acl_ret = acl::AclrtDestroyStreamForce(stream.stream(false)); if (acl_ret != ACL_ERROR_NONE) { return acl_ret; } @@ -290,4 +299,42 @@ void stream_synchronize(aclrtStream stream) NPU_CHECK_ERROR(aclrtSynchronizeStream(stream)); } +aclError SetDeviceResLimit(int32_t device, int32_t type, uint32_t value) +{ + std::lock_guard lock(mtx); + if (used_devices.find(device) == used_devices.end()) { + TORCH_CHECK(false, "NPU device ", device, " has not been initialized! Can not get device resource limit"); + } + TORCH_CHECK(device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE)); + c10_npu::acl::aclrtDevResModelType restype = static_cast(type); + aclError err = c10_npu::acl::AclrtSetDeviceResLimit(device, restype, value); + NPU_CHECK_ERROR(err); + return err; +} + +uint32_t GetDeviceResLimit(int32_t device, int32_t type) +{ + std::lock_guard lock(mtx); + if (used_devices.find(device) == used_devices.end()) { + TORCH_CHECK(false, "NPU device ", device, " has not been initialized! Can not get device resource limit"); + } + TORCH_CHECK(device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE)); + c10_npu::acl::aclrtDevResModelType restype = static_cast(type); + uint32_t value; + NPU_CHECK_ERROR(c10_npu::acl::AclrtGetDeviceResLimit(device, restype, &value)); + return value; +} + +aclError ResetDeviceResLimit(int32_t device) +{ + std::lock_guard lock(mtx); + if (used_devices.find(device) == used_devices.end()) { + TORCH_CHECK(false, "NPU device ", device, " has not been initialized! Can not reset device resource limit"); + } + TORCH_CHECK(device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE)); + aclError err = c10_npu::acl::AclrtResetDeviceResLimit(device); + NPU_CHECK_ERROR(err); + return err; +} + } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUFunctions.h b/torch_npu/csrc/core/npu/NPUFunctions.h index 948998459799d300bf76f7cd3367ed7c7a392e3a..e162f8fe8f0ca3b00af68b6bf57efc0b9c19078f 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.h +++ b/torch_npu/csrc/core/npu/NPUFunctions.h @@ -48,6 +48,8 @@ aclError GetDeviceWithoutSet(int32_t *device); */ C10_NPU_API aclError SetDevice(c10::DeviceIndex device); +C10_NPU_API aclError MaybeSetDevice(c10::DeviceIndex device); + /** * @ingroup torch_npu * @brief reset all device id by ACL interface: aclrtResetDevice. 
@@ -79,6 +81,12 @@ void SetTargetDevice(); int GetLocalDevice(); +aclError SetDeviceResLimit(int32_t device, int32_t type, uint32_t value); + +C10_NPU_API uint32_t GetDeviceResLimit(int32_t deviceId, int32_t type); + +aclError ResetDeviceResLimit(int32_t deviceId); + enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR }; // it's used to store npu synchronization state diff --git a/torch_npu/csrc/core/npu/NPUGraph.cpp b/torch_npu/csrc/core/npu/NPUGraph.cpp index fd060dcb086f911f0e2b345a18ba9672ded2c34c..a00448bd1cf547cb2f641eb47ff940c902341610 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.cpp +++ b/torch_npu/csrc/core/npu/NPUGraph.cpp @@ -1,4 +1,3 @@ -#ifndef BUILD_LIBTORCH #include "torch_npu/csrc/core/npu/NPUGraph.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" @@ -10,7 +9,6 @@ #include #include -#include namespace c10_npu { @@ -261,4 +259,3 @@ NPUGraph::~NPUGraph() } } // namespace c10_npu -#endif diff --git a/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp b/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp new file mode 100644 index 0000000000000000000000000000000000000000..393b4706c60decfb6171dfb50d8670d92f74b102 --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp @@ -0,0 +1,36 @@ +#include "torch_npu/csrc/core/npu/NPUIPCPidManager.h" +namespace torch_npu { +namespace ipc { + +int32_t* pids = nullptr; +size_t pid_num = 0; +size_t capacity = 0; + +void addPid(int pid) +{ + const size_t requiredCapacity = pid_num + 1; + + if (requiredCapacity > capacity) { + size_t newCapacity = capacity + 10; + + int32_t* newArray = new int32_t[newCapacity]; + for (int i = 0; i < pid_num; ++i) { + newArray[i] = pids[i]; + } + + delete[] pids; + pids = newArray; + capacity = newCapacity; + } + + pids[pid_num++] = static_cast(pid); +} + +size_t getPids(int32_t** ret_pids) +{ + *ret_pids = pids; + return pid_num; +} + +} // namespace ipc +} // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUIPCPidManager.h b/torch_npu/csrc/core/npu/NPUIPCPidManager.h new file mode 100644 index 0000000000000000000000000000000000000000..f27cd240d15723f743fbcefe7204c81588ca60b3 --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUIPCPidManager.h @@ -0,0 +1,12 @@ +#pragma once +#include +#include + +namespace torch_npu { +namespace ipc { + +void addPid(int pid); +size_t getPids(int32_t** pids); + +} // namespace ipc +} // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUMacros.h b/torch_npu/csrc/core/npu/NPUMacros.h index 3223c4f325b3de69b8e5cdc783954d84033b37b4..960dcb97b6e52bffc37582250ffd99b1f7ac08a6 100644 --- a/torch_npu/csrc/core/npu/NPUMacros.h +++ b/torch_npu/csrc/core/npu/NPUMacros.h @@ -29,6 +29,6 @@ #define TORCH_NPU_API C10_NPU_API -#define C10_COMPILE_TIME_MAX_NPUS 16 +#define C10_COMPILE_TIME_MAX_NPUS 32 // A maximum of 8 P2P links can be created on a NPU device #define C10_P2P_ACCESS_MAX_NPUS 8 diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 3eee4167a9d0c26130fa1ebb436965c218588456..579514ab37390f36aa208e7711c6fcec131a9f98 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -249,7 +249,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) // occur. 
#ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { gilState = PyEval_SaveThread(); } #endif @@ -290,11 +290,11 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) error_msg = c10_npu::c10_npu_get_error_message(); } runtime_error = throwError + ", " + error_msg + PTA_ERROR(ErrCode::ACL); - error_msg = throwError + " happend."; + error_msg = throwError + " happened."; } if (current_status == RepoStatus::CAN_EXIT) { - error_msg = "Inner error happend with CAN_EXIT status, detail: " + repo_error; + error_msg = "Inner error happened with CAN_EXIT status, detail: " + repo_error; } if (current_status == RepoStatus::ERROR_EXIT) { @@ -314,12 +314,12 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) repo_error + ".\n" + "Since the operator is called asynchronously, the stacktrace may be inaccurate. " "If you want to get the accurate stacktrace, " - "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + + "please set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " "resulting in performance degradation. " "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." + PTA_ERROR(ErrCode::ACL) + ".\n" + acl_error; - error_msg = "Inner error happend, detail: " + repo_error; + error_msg = "Inner error happened, detail: " + repo_error; } #ifndef BUILD_LIBTORCH @@ -330,7 +330,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) #endif if (!error_msg.empty()) { - ASCEND_LOGE(error_msg); + ASCEND_LOGE("%s", error_msg.c_str()); } if (check_error && !runtime_error.empty()) { throw std::runtime_error(runtime_error); @@ -470,7 +470,7 @@ void Repository::Enqueue(void *cur_paras) ThrowDeviceError(current_status, cur_paras); if (current_status == RepoStatus::CAN_EXIT) { - ASCEND_LOGE("Inner error happend with CAN_EXIT status, detail: %s", repo_error.c_str()); + ASCEND_LOGE("Inner error happened with CAN_EXIT status, detail: %s", repo_error.c_str()); } if (current_status == RepoStatus::ERROR_EXIT) { @@ -490,7 +490,7 @@ void Repository::Enqueue(void *cur_paras) repo_error + ".\n" + "Since the operator is called asynchronously, the stacktrace may be inaccurate. " "If you want to get the accurate stacktrace, " - "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + + "please set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " "resulting in performance degradation. " "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." 
+ @@ -523,7 +523,7 @@ void Repository::Enqueue(void *cur_paras) uint64_t u = 1; SetWriteWorking(true); - while (ret == false && (GetStatus() == RUN || GetStatus() == INIT)) { + while (!ret && (GetStatus() == RUN || GetStatus() == INIT)) { ret = WriteQueue(cur_paras); if (ret == false) { SetWriteWorking(false); @@ -531,7 +531,7 @@ void Repository::Enqueue(void *cur_paras) if (IsFullQueue()) { #ifndef BUILD_LIBTORCH // double check the current thread hold a Gil lock - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { Py_BEGIN_ALLOW_THREADS s = eventfd_read(efd_write, &u); Py_END_ALLOW_THREADS } else { @@ -707,6 +707,8 @@ bool Repository::CheckInit() const void StartConsume(Repository *repo, c10::DeviceIndex device_id) { SetThreadType(ThreadType::ACL_THREAD); + SetThreadAffinity(device_id); + aclError ret = c10_npu::SetDevice(device_id); if (ret != 0) { C10_NPU_SHOW_ERR_MSG(); diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 7910946197df1e53d0dd29e15f1b922e2c420962..31ade0aa41d9710c78ae591678d7962f3af954a1 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -229,6 +229,8 @@ static void initNPUStreamsOnce() { // Inits default and secondary streams (once, globally) c10::DeviceIndex device_index = current_device(); + // makesure on real devcie + SetTargetDevice(); if (!initialize_flag[device_index]) { std::lock_guard lock(mtx[device_index]); if (!initialize_flag[device_index]) { @@ -259,7 +261,7 @@ static uint32_t get_idx(std::atomic& counter) { auto raw_idx = counter++; static int StreamsPerPool = GetStreamsPerPool(); - return raw_idx % StreamsPerPool; + return raw_idx % static_cast(StreamsPerPool); } static uint32_t get_sync_launch_stream_idx(std::atomic& counter) diff --git a/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp index 084f1df577a58ba352c76361211f5741df8ab4ef..39d19b0b628e1a191aab9ff4182a7e5bd1f6c657 100644 --- a/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp @@ -47,7 +47,7 @@ void* registerSvmMem(void* ptr, size_t size) void* mallocHostSwapMemory(size_t size) { if (!initialized) { - kAlignSize = sysconf(_SC_PAGESIZE); + kAlignSize = static_cast(sysconf(_SC_PAGESIZE)); initialized = true; } size = (size + kAlignSize - 1) & ~(kAlignSize - 1); diff --git a/torch_npu/csrc/core/npu/NpuVariables.cpp b/torch_npu/csrc/core/npu/NpuVariables.cpp index 4e0fce02fb311d239c95a913939e41b554333b02..24a2a8da62cf6ecb23684a51be0dadf01b671b04 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.cpp +++ b/torch_npu/csrc/core/npu/NpuVariables.cpp @@ -39,28 +39,33 @@ static std::map socVersionMap = { {"Ascend910_9372", SocVersion::Ascend910_9372}, {"Ascend910_9362", SocVersion::Ascend910_9362}}; -void SetSocVersion(const char* const socVersion) { - if (socVersion == nullptr || - g_curSocVersion != SocVersion::UnsupportedSocVersion) { - return; - } +void SetSocVersion(const char* const socVersion) +{ + if (socVersion == nullptr || + g_curSocVersion != SocVersion::UnsupportedSocVersion) { + return; + } - SocVersion curSocVersion = SocVersion::UnsupportedSocVersion; + SocVersion curSocVersion = SocVersion::UnsupportedSocVersion; - auto const& iter = socVersionMap.find(socVersion); - if (iter != socVersionMap.end()) { - curSocVersion = iter->second; - } else { - std::string unsupported_soc(socVersion); - std::replace(std::begin(unsupported_soc), std::end(unsupported_soc), '_', ' '); 
- AT_ERROR("Unsupported soc version: ", unsupported_soc); - } + auto const& iter = socVersionMap.find(socVersion); + if (iter != socVersionMap.end()) { + curSocVersion = iter->second; + } else { + std::string unsupported_soc(socVersion); + std::replace(std::begin(unsupported_soc), std::end(unsupported_soc), '_', ' '); + AT_ERROR("Unsupported soc version: ", unsupported_soc); + } - g_curSocVersion = curSocVersion; + g_curSocVersion = curSocVersion; } const SocVersion& GetSocVersion() { + if (g_curSocVersion == SocVersion::UnsupportedSocVersion) { + auto soc_name = c10_npu::acl::AclGetSocName(); + SetSocVersion(soc_name); + } return g_curSocVersion; } @@ -94,5 +99,10 @@ bool IsBF16Supported() { return GetSocVersion() >= SocVersion::Ascend910B1; } + +bool IsAclnnOnly() +{ + return false; +} } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NpuVariables.h b/torch_npu/csrc/core/npu/NpuVariables.h index 3119a645153322225f9d0d9ea19dfa3b1ef9ab9f..6a3a8cdfd7e9b8a59fcb5712ab81146dac5be875 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.h +++ b/torch_npu/csrc/core/npu/NpuVariables.h @@ -40,6 +40,8 @@ const SocVersion& GetSocVersion(); bool IsSupportInfNan(); bool IsBF16Supported(); + +bool IsAclnnOnly(); } // namespace c10_npu #endif diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp index 951b508240a3999a5c2773702103419112951ac5..7cfcb2a483ebc424cdde19c7d969c6da4375ae63 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp +++ b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp @@ -53,7 +53,8 @@ void NPUGuardImpl::setDevice(c10::Device d) const void NPUGuardImpl::uncheckedSetDevice(c10::Device d) const noexcept { - NPU_CHECK_WARN(c10_npu::SetDevice(d.index())); + c10_npu::StartMainThreadBind(d.index()); + NPU_CHECK_WARN(c10_npu::MaybeSetDevice(d.index())); } c10::Stream NPUGuardImpl::getStream(c10::Device d) const noexcept diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 3616d2e4568c1677baaa4688919787580baad870..b97a8d4c39bbc2cbabc6d951e48d4b012e72b745 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -82,6 +82,17 @@ LOAD_FUNCTION(aclmdlRICaptureTaskUpdateBegin) LOAD_FUNCTION(aclmdlRICaptureTaskUpdateEnd) LOAD_FUNCTION(aclrtHostRegister) LOAD_FUNCTION(aclrtHostUnregister) +LOAD_FUNCTION(aclrtIpcMemGetExportKey) +LOAD_FUNCTION(aclrtIpcMemSetImportPid) +LOAD_FUNCTION(aclrtIpcMemImportByKey) +LOAD_FUNCTION(aclrtIpcMemClose) +LOAD_FUNCTION(aclrtMemExportToShareableHandle) +LOAD_FUNCTION(aclrtMemSetPidToShareableHandle) +LOAD_FUNCTION(aclrtMemImportFromShareableHandle) +LOAD_FUNCTION(aclrtDeviceGetBareTgid) +LOAD_FUNCTION(aclrtGetDeviceResLimit) +LOAD_FUNCTION(aclrtSetDeviceResLimit) +LOAD_FUNCTION(aclrtResetDeviceResLimit) aclprofStepInfoPtr init_stepinfo() { @@ -175,6 +186,7 @@ aclError AclrtSetStreamFailureMode(aclrtStream stream, uint64_t mode) { if (stream == nullptr) { // default stream return ACL_ERROR_INVALID_PARAM; } + typedef aclError(*aclrtSetStreamFailureModeFunc)(aclrtStream, uint64_t); static aclrtSetStreamFailureModeFunc func = (aclrtSetStreamFailureModeFunc)GET_FUNC(aclrtSetStreamFailureMode); if (func == nullptr) { @@ -411,7 +423,7 @@ aclError AclrtSynchronizeStreamWithTimeout(aclrtStream stream) { } TORCH_CHECK(func_backup, "Failed to find function", "aclrtSynchronizeStreamWithTimeout and aclrtSynchronizeStream", PROF_ERROR(ErrCode::NOT_FOUND)); return func_backup(stream); 
- } + } } aclError AclrtDestroyStreamForce(aclrtStream stream) { @@ -845,7 +857,7 @@ bool IsCaptureSupported() static bool have_load_func = false; static bool default_support_capture = ((GetSocVersion() >= SocVersion::Ascend910B1) && (GetSocVersion() < SocVersion::Ascend310B1)) || - (GetSocVersion() >= SocVersion::Ascend910_9391); + ((GetSocVersion() >= SocVersion::Ascend910_9391)); if (default_support_capture && !have_load_func) { have_load_func = true; typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *); @@ -928,5 +940,138 @@ aclError AclrtHostUnregister(void *ptr) return func(ptr); } +aclError AclrtIpcMemGetExportKey(void *devPtr, size_t size, char *key, size_t len, uint64_t flag) +{ + typedef aclError (*AclrtIpcMemGetExportKey)(void *, size_t, char *, size_t, uint64_t); + static AclrtIpcMemGetExportKey func = nullptr; + if (func == nullptr) { + func = (AclrtIpcMemGetExportKey) GET_FUNC(aclrtIpcMemGetExportKey); + } + + TORCH_CHECK(func, "Failed to find function aclrtIpcMemGetExportKey", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(devPtr, size, key, len, flag); +} + +aclError AclrtIpcMemSetImportPid(const char *key, int32_t *pid, size_t num) +{ + typedef aclError (*AclrtIpcMemSetImportPid)(const char *, int32_t *, size_t); + static AclrtIpcMemSetImportPid func = nullptr; + if (func == nullptr) { + func = (AclrtIpcMemSetImportPid) GET_FUNC(aclrtIpcMemSetImportPid); + } + + TORCH_CHECK(func, "Failed to find function aclrtIpcMemSetImportPid", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(key, pid, num); +} + +aclError AclrtIpcMemImportByKey(void **devPtr, const char *key, uint64_t flag) +{ + typedef aclError (*AclrtIpcMemImportByKey)(void **, const char *, uint64_t); + static AclrtIpcMemImportByKey func = nullptr; + if (func == nullptr) { + func = (AclrtIpcMemImportByKey) GET_FUNC(aclrtIpcMemImportByKey); + } + + TORCH_CHECK(func, "Failed to find function aclrtIpcMemImportByKey", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(devPtr, key, flag); +} + +aclError AclrtIpcMemClose(const char *key) +{ + typedef aclError (*AclrtIpcMemClose)(const char *); + static AclrtIpcMemClose func = nullptr; + if (func == nullptr) { + func = (AclrtIpcMemClose) GET_FUNC(aclrtIpcMemClose); + } + + TORCH_CHECK(func, "Failed to find function aclrtIpcMemClose", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(key); +} + +aclError AclrtMemExportToShareableHandle(aclrtDrvMemHandle handle, aclrtMemHandleType handleType, + uint64_t flags, uint64_t *shareableHandle) +{ + typedef aclError (*AclrtMemExportToShareableHandle)(aclrtDrvMemHandle, aclrtMemHandleType, uint64_t, uint64_t *); + static AclrtMemExportToShareableHandle func = nullptr; + if (func == nullptr) { + func = (AclrtMemExportToShareableHandle) GET_FUNC(aclrtMemExportToShareableHandle); + } + + TORCH_CHECK(func, "Failed to find function aclrtMemExportToShareableHandle", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(handle, handleType, flags, shareableHandle); +} + +aclError AclrtMemSetPidToShareableHandle(uint64_t shareableHandle, int32_t *pid, size_t pidNum) +{ + typedef aclError (*AclrtMemSetPidToShareableHandle)(uint64_t, int32_t *, size_t); + static AclrtMemSetPidToShareableHandle func = nullptr; + if (func == nullptr) { + func = (AclrtMemSetPidToShareableHandle) GET_FUNC(aclrtMemSetPidToShareableHandle); + } + + TORCH_CHECK(func, "Failed to find function aclrtMemSetPidToShareableHandle", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(shareableHandle, pid, pidNum); +} + +aclError 
AclrtMemImportFromShareableHandle(uint64_t shareableHandle, int32_t deviceId, aclrtDrvMemHandle *handle) +{ + typedef aclError (*AclrtMemImportFromShareableHandle)(uint64_t, int32_t, aclrtDrvMemHandle *); + static AclrtMemImportFromShareableHandle func = nullptr; + if (func == nullptr) { + func = (AclrtMemImportFromShareableHandle) GET_FUNC(aclrtMemImportFromShareableHandle); + } + + TORCH_CHECK(func, "Failed to find function aclrtMemImportFromShareableHandle", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(shareableHandle, deviceId, handle); +} + +aclError AclrtDeviceGetBareTgid(int32_t *pid) +{ + typedef aclError (*AclrtDeviceGetBareTgid)(int32_t *); + static AclrtDeviceGetBareTgid func = nullptr; + if (func == nullptr) { + func = (AclrtDeviceGetBareTgid) GET_FUNC(aclrtDeviceGetBareTgid); + } + + TORCH_CHECK(func, "Failed to find function aclrtDeviceGetBareTgid", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(pid); +} + +aclError AclrtGetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t* value) +{ + typedef aclError (*AclrtGetDeviceResLimit)(int32_t, aclrtDevResModelType, uint32_t*); + static AclrtGetDeviceResLimit func = nullptr; + if (func == nullptr) { + func = (AclrtGetDeviceResLimit) GET_FUNC(aclrtGetDeviceResLimit); + } + + TORCH_CHECK(func, "Failed to find function aclrtGetDeviceResLimit", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(deviceId, type, value); +} + +aclError AclrtSetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t value) +{ + typedef aclError (*AclrtSetDeviceResLimit)(int32_t, aclrtDevResModelType, uint32_t); + static AclrtSetDeviceResLimit func = nullptr; + if (func == nullptr) { + func = (AclrtSetDeviceResLimit) GET_FUNC(aclrtSetDeviceResLimit); + } + + TORCH_CHECK(func, "Failed to find function aclrtSetDeviceResLimit", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(deviceId, type, value); +} + +aclError AclrtResetDeviceResLimit(int32_t deviceId) +{ + typedef aclError (*AclrtResetDeviceResLimit)(int32_t); + static AclrtResetDeviceResLimit func = nullptr; + if (func == nullptr) { + func = (AclrtResetDeviceResLimit) GET_FUNC(aclrtResetDeviceResLimit); + } + + TORCH_CHECK(func, "Failed to find function aclrtResetDeviceResLimit", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(deviceId); +} + } // namespace acl } // namespace c10 diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 3eebe56d358a189f170860edbb9dc9b32265d4ff..3b6d47cf4a260c1dfa282dd96b12bb4ea8f232e0 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -32,6 +32,12 @@ enum aclrtStreamStatus { }; using aclrtStreamStatus = enum aclrtStreamStatus; +enum aclrtDevResModelType { + ACL_RT_DEV_RES_CUBE_CORE = 0, + ACL_RT_DEV_RES_VECTOR_CORE = 1, +}; +using aclrtDevResModelType = enum aclrtDevResModelType; + /** aclprofStepInfo is provide by acl, it used to be store dispatch op info. 
*/ @@ -228,5 +234,28 @@ aclError AclrtHostRegister(void *ptr, uint64_t size, aclrtHostRegisterType type, */ aclError AclrtHostUnregister(void *ptr); +aclError AclrtIpcMemGetExportKey(void *devPtr, size_t size, char *key, size_t len, uint64_t flag); + +aclError AclrtIpcMemSetImportPid(const char *key, int32_t *pid, size_t num); + +aclError AclrtIpcMemImportByKey(void **devPtr, const char *key, uint64_t flag); + +aclError AclrtIpcMemClose(const char *key); + +aclError AclrtMemExportToShareableHandle(aclrtDrvMemHandle handle, aclrtMemHandleType handleType, + uint64_t flags, uint64_t *shareableHandle); + +aclError AclrtMemSetPidToShareableHandle(uint64_t shareableHandle, int32_t *pid, size_t pidNum); + +aclError AclrtMemImportFromShareableHandle(uint64_t shareableHandle, int32_t deviceId, aclrtDrvMemHandle *handle); + +aclError AclrtDeviceGetBareTgid(int32_t *pid); + +aclError AclrtGetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t* value); + +aclError AclrtSetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t value); + +aclError AclrtResetDeviceResLimit(int32_t deviceId); + } // namespace acl } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/register/OptionRegister.cpp b/torch_npu/csrc/core/npu/register/OptionRegister.cpp index 8f7f17a0114a517ef7f5ef4b201b1bf749274210..9e46d36a6f0e74deb33502b64b143c4f7622f86d 100644 --- a/torch_npu/csrc/core/npu/register/OptionRegister.cpp +++ b/torch_npu/csrc/core/npu/register/OptionRegister.cpp @@ -4,6 +4,7 @@ #include "torch_npu/csrc/core/npu/register/OptionRegister.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/npu_log.h" +#include "torch_npu/csrc/core/npu/NpuVariables.h" namespace c10_npu { namespace option { @@ -84,6 +85,18 @@ OptionInterfaceBuilder::OptionInterfaceBuilder(const std::string &name, ::std::u void SetOption(const std::string &key, const std::string &val) { + if (c10_npu::IsAclnnOnly()) { + if (key == "jitCompile" && val == "enable") { + TORCH_NPU_WARN_ONCE("Current device only support jit_compile=False, ", + "the requested value True is invalid and has been reverted to False."); + return register_options::OptionRegister::GetInstance()->Set(key, "disable"); + } + if (key == "ALLOW_INTERNAL_FORMAT" && val == "enable") { + TORCH_NPU_WARN_ONCE("Current device only support allow_internal_format=False, ", + "the requested value True is invalid and has been reverted to False."); + return register_options::OptionRegister::GetInstance()->Set(key, "disable"); + } + } register_options::OptionRegister::GetInstance()->Set(key, val); } diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 5dcf4e88aa742236ff3f48b6e020f7a2a485e51b..9fc3bac5a89dd5336a777ef797e7defb5c01f9df 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -470,7 +470,7 @@ uint32_t OptionsManager::GetP2PBufferSize() const static uint32_t buf_size = []() -> uint32_t { char* buf_val = std::getenv("P2P_HCCL_BUFFSIZE"); // Default 0M - int64_t buf_size = (buf_val != nullptr) ? strtol(buf_val, nullptr, 10) : 0; + int64_t buf_size = (buf_val != nullptr) ? strtol(buf_val, nullptr, 10) : 20; TORCH_CHECK(buf_size >= 0, "P2P_HCCL_BUFFSIZE cannot be negative.", PTA_ERROR(ErrCode::VALUE)); return static_cast(buf_size); }(); @@ -485,6 +485,7 @@ uint32_t OptionsManager::GetAclOpInitMode() int64_t acl_op_init_mode = (buf_val != nullptr) ? 
strtol(buf_val, nullptr, 10) : 0; std::unordered_map aclOpInitMode = getAclOpInitMode(); if (aclOpInitMode.find(acl_op_init_mode) == aclOpInitMode.end()) { + acl_op_init_mode = 0; TORCH_NPU_WARN_ONCE("Get env ACL_OP_INIT_MODE not in [0, 1, 2], so reset it to the default value 0."); } return static_cast(acl_op_init_mode); @@ -622,7 +623,7 @@ bool OptionsManager::IsOomSnapshotEnable() return (envFlag != 0); } -bool OptionsManager::ShouldPrintLessError() +bool OptionsManager::IsCompactErrorOutput() { static bool should_print = []() -> bool { int32_t disabled_error = OptionsManager::GetBoolTypeOption("TORCH_NPU_COMPACT_ERROR_OUTPUT"); diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index 5be33e06daae47716164f4ad7299afabd8c3426c..73f5dbcb81f9fc268d8ef9122407e66b976dad08 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -133,7 +133,7 @@ public: static std::string GetOomSnapshotDumpPath(); static bool IsOomSnapshotEnable(); static bool ShouldPrintWarning(); - static bool ShouldPrintLessError(); + static bool IsCompactErrorOutput(); private: static int GetBoolTypeOption(const char* env_str, int defaultVal = 0); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index de1010347fb5ffa785dc7f0b1e44acb8e5fb5d7f..4b6707b8495b9a89c500e06f160b42950e9ae6fb 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -189,6 +189,8 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) lazy_fn_.clear(); + SetMainThread(); + init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); @@ -275,8 +277,8 @@ void NpuSysCtrl::RegisterLazyFn(const option::OptionCallBack& call_, const std:: lazy_fn_.emplace_back(std::make_pair(call_, in)); } -void NpuSysCtrl::RegisterReleaseFn(ReleaseFn release_fn, - ReleasePriority priority) { +void NpuSysCtrl::RegisterReleaseFn(ReleaseFn release_fn, ReleasePriority priority) +{ const auto& iter = this->release_fn_.find(priority); if (iter != release_fn_.end()) { release_fn_[priority].emplace_back(release_fn); diff --git a/torch_npu/csrc/custom_dtype/CMakeLists.txt b/torch_npu/csrc/custom_dtype/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d3d7c0e5379a0c23354a45a6dbd12c0bffea0ac --- /dev/null +++ b/torch_npu/csrc/custom_dtype/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB _CUS_DTYPE_SRCS *.cpp) + +LIST(APPEND CUS_DTYPE_SRCS ${_CUS_DTYPE_SRCS}) + +# Pass to parent +set(CUS_DTYPE_SRCS ${CUS_DTYPE_SRCS} PARENT_SCOPE) diff --git a/torch_npu/csrc/custom_dtype/Init.cpp b/torch_npu/csrc/custom_dtype/Init.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a88344ce5e969bc484b164ce2c28c41004394a52 --- /dev/null +++ b/torch_npu/csrc/custom_dtype/Init.cpp @@ -0,0 +1,125 @@ +#include "torch_npu/csrc/custom_dtype/Init.h" +#ifndef BUILD_LIBTORCH +#include +#include +#endif + + +namespace c10_npu { +struct DTypeConstants { + static const int float32_value; + static const int float16_value; + static const int int8_value; + static const int int32_value; + static const int uint8_value; + static const int int16_value; + static const int uint16_value; + static const int uint32_value; + static const int int64_value; + static const int uint64_value; + static const int float64_value; + static const int bool_value; + static const int string_value; + static const int 
complex64_value; + static const int complex128_value; + static const int bfloat16_value; + static const int int4_value; + static const int uint1_value; + static const int complex32_value; +}; + +const int DTypeConstants::float32_value = static_cast(DType::FLOAT); +const int DTypeConstants::float16_value = static_cast(DType::FLOAT16); +const int DTypeConstants::int8_value = static_cast(DType::INT8); +const int DTypeConstants::int32_value = static_cast(DType::INT32); +const int DTypeConstants::uint8_value = static_cast(DType::UINT8); +const int DTypeConstants::int16_value = static_cast(DType::INT16); +const int DTypeConstants::uint16_value = static_cast(DType::UINT16); +const int DTypeConstants::uint32_value = static_cast(DType::UINT32); +const int DTypeConstants::int64_value = static_cast(DType::INT64); +const int DTypeConstants::uint64_value = static_cast(DType::UINT64); +const int DTypeConstants::float64_value = static_cast(DType::DOUBLE); +const int DTypeConstants::bool_value = static_cast(DType::BOOL); +const int DTypeConstants::string_value = static_cast(DType::STRING); +const int DTypeConstants::complex64_value = static_cast(DType::COMPLEX64); +const int DTypeConstants::complex128_value = static_cast(DType::COMPLEX128); +const int DTypeConstants::bfloat16_value = static_cast(DType::BF16); +const int DTypeConstants::int4_value = static_cast(DType::INT4); +const int DTypeConstants::uint1_value = static_cast(DType::UINT1); +const int DTypeConstants::complex32_value = static_cast(DType::COMPLEX32); + +#ifndef BUILD_LIBTORCH +PyObject* cd_initExtension(PyObject*, PyObject *) +{ + auto torch_npu_C_module = THPObjectPtr(PyImport_ImportModule("torch_npu._C")); + if (!torch_npu_C_module) { + return nullptr; + } + auto torch_npu_C_m = py::handle(torch_npu_C_module).cast(); + auto m = torch_npu_C_m.def_submodule("_cd", "_cd bindings"); + + py::class_(m, "DType") + .def_readonly_static("float32", &DTypeConstants::float32_value) + .def_readonly_static("float16", &DTypeConstants::float16_value) + .def_readonly_static("int8", &DTypeConstants::int8_value) + .def_readonly_static("int32", &DTypeConstants::int32_value) + .def_readonly_static("uint8", &DTypeConstants::uint8_value) + .def_readonly_static("int16", &DTypeConstants::int16_value) + .def_readonly_static("uint16", &DTypeConstants::uint16_value) + .def_readonly_static("uint32", &DTypeConstants::uint32_value) + .def_readonly_static("int64", &DTypeConstants::int64_value) + .def_readonly_static("uint64", &DTypeConstants::uint64_value) + .def_readonly_static("float64", &DTypeConstants::float64_value) + .def_readonly_static("bool", &DTypeConstants::bool_value) + .def_readonly_static("string", &DTypeConstants::string_value) + .def_readonly_static("complex64", &DTypeConstants::complex64_value) + .def_readonly_static("complex128", &DTypeConstants::complex128_value) + .def_readonly_static("bfloat16", &DTypeConstants::bfloat16_value) + .def_readonly_static("int4", &DTypeConstants::int4_value) + .def_readonly_static("uint1", &DTypeConstants::uint1_value) + .def_readonly_static("complex32", &DTypeConstants::complex32_value); + + Py_RETURN_NONE; +} + +static PyMethodDef NPUCustomDtypeMethods[] = { // NOLINT + {"_cd_init", cd_initExtension, METH_NOARGS, nullptr}, + {nullptr, nullptr, 0, nullptr} +}; +#endif + +const std::string CustomDataTypeToString(int64_t dType) +{ + const std::map + TYPE_TO_STRING_MAP = { + {DType::FLOAT, "torch_npu.float32"}, + {DType::FLOAT16, "torch_npu.float16"}, + {DType::INT8, "torch_npu.int8"}, + {DType::INT32, "torch_npu.int32"}, 
+ {DType::UINT8, "torch_npu.uint8"}, + {DType::INT16, "torch_npu.int16"}, + {DType::UINT16, "torch_npu.uint16"}, + {DType::UINT32, "torch_npu.uint32"}, + {DType::INT64, "torch_npu.int64"}, + {DType::UINT64, "torch_npu.uint64"}, + {DType::DOUBLE, "torch_npu.float64"}, + {DType::BOOL, "torch_npu.bool"}, + {DType::STRING, "torch_npu.string"}, + {DType::COMPLEX64, "torch_npu.complex64"}, + {DType::COMPLEX128, "torch_npu.complex128"}, + {DType::BF16, "torch_npu.bfloat16"}, + {DType::INT4, "torch_npu.int4"}, + {DType::UINT1, "torch_npu.uint1"}, + {DType::COMPLEX32, "torch_npu.complex32"}}; + + const auto iter = TYPE_TO_STRING_MAP.find(static_cast(dType)); + return iter != TYPE_TO_STRING_MAP.end() ? iter->second : "Unknown dtype"; +} + +#ifndef BUILD_LIBTORCH +PyMethodDef* custom_dtype_functions() +{ + return NPUCustomDtypeMethods; +} +#endif +} diff --git a/torch_npu/csrc/custom_dtype/Init.h b/torch_npu/csrc/custom_dtype/Init.h new file mode 100644 index 0000000000000000000000000000000000000000..867e07ae3fe0671f4f8ddcd9fcda323a1bb6a5c9 --- /dev/null +++ b/torch_npu/csrc/custom_dtype/Init.h @@ -0,0 +1,75 @@ +#pragma once + +#include +#ifndef BUILD_LIBTORCH +#include +#endif +#include "torch_npu/csrc/core/npu/NPUMacros.h" +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" +#include "third_party/acl/inc/acl/acl_base.h" + +namespace c10_npu { +const int g_toAclOffset = 256; + +#define ENUM_OFFSET(new_name, old_name) new_name = static_cast(old_name) + g_toAclOffset, + +#ifndef BUILD_LIBTORCH +TORCH_NPU_API PyMethodDef* custom_dtype_functions(); +#endif + +enum class DType { + UNDEFINED = -1, + ENUM_OFFSET(FLOAT, ACL_FLOAT) + ENUM_OFFSET(FLOAT16, ACL_FLOAT16) + ENUM_OFFSET(INT8, ACL_INT8) + ENUM_OFFSET(INT32, ACL_INT32) + ENUM_OFFSET(UINT8, ACL_UINT8) + ENUM_OFFSET(INT16, ACL_INT16) + ENUM_OFFSET(UINT16, ACL_UINT16) + ENUM_OFFSET(UINT32, ACL_UINT32) + ENUM_OFFSET(INT64, ACL_INT64) + ENUM_OFFSET(UINT64, ACL_UINT64) + ENUM_OFFSET(DOUBLE, ACL_DOUBLE) + ENUM_OFFSET(BOOL, ACL_BOOL) + ENUM_OFFSET(STRING, ACL_STRING) + ENUM_OFFSET(COMPLEX64, ACL_COMPLEX64) + ENUM_OFFSET(COMPLEX128, ACL_COMPLEX128) + ENUM_OFFSET(BF16, ACL_BF16) + ENUM_OFFSET(INT4, ACL_INT4) + ENUM_OFFSET(UINT1, ACL_UINT1) + ENUM_OFFSET(COMPLEX32, ACL_COMPLEX32) +}; + +inline bool IsCustomDType(int64_t t) +{ + if (t >= g_toAclOffset) { + return true; + } + return false; +} + +// Both c10_npu::DType and ScalarType are supported +inline aclDataType GetAclDataType(int64_t t) +{ + if (t >= g_toAclOffset) { + return static_cast(t - g_toAclOffset); + } + return at_npu::native::OpPreparation::convert_to_acl_data_type( + static_cast(t)); +} + +inline aclDataType GetAclDataType(DType t) +{ + return static_cast(static_cast(t) - g_toAclOffset); +} + +inline at::ScalarType GetATenDType(int64_t t) +{ + aclDataType aclType = GetAclDataType(t); + return at_npu::native::OpPreparation::convert_to_scalar_type(aclType); +} + +const std::string CustomDataTypeToString(int64_t dType); + +} // namespace c10_npu diff --git a/torch_npu/csrc/distributed/HCCLUtils.hpp b/torch_npu/csrc/distributed/HCCLUtils.hpp index e9ad7bbd6af6a6d44dab6224888fe34b9d7526b7..1033d8de97f5d0d5eaa099041fd7089616fa3589 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.hpp +++ b/torch_npu/csrc/distributed/HCCLUtils.hpp @@ -17,7 +17,7 @@ auto Error = err_code; \ if ((Error) != HCCL_SUCCESS) { \ CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ - if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { \ + if 
(c10_npu::option::OptionsManager::IsCompactErrorOutput()) { \ std::ostringstream oss; \ oss << " HCCL function error: " << getErrorFunction(#err_code, ##__VA_ARGS__) \ << ", error code is " << Error << " " \ diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 86574926794476595f71ceed89eae6ea20a561af..c61e2b4ffacb80dc615e2721da8bd5cfe79af33c 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -576,7 +576,7 @@ Example:: Default settings return everything - i.e. contains HCCL comm dumps and collective traces. )"); - Py_RETURN_TRUE; + Py_RETURN_TRUE; } // c10d methods on torch._C diff --git a/torch_npu/csrc/distributed/ParallelTcpServer.cpp b/torch_npu/csrc/distributed/ParallelTcpServer.cpp index 38899ea5a8d8eb3efdf25afcd0494db1603f8238..72e7ebf9a096c630e2029ac6e1abffe20cf5a465 100644 --- a/torch_npu/csrc/distributed/ParallelTcpServer.cpp +++ b/torch_npu/csrc/distributed/ParallelTcpServer.cpp @@ -16,11 +16,13 @@ #include #include #include +#include #include #include #include #include #include "c10/util/Logging.h" +#include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "ParallelTcpServer.hpp" namespace c10d { @@ -315,6 +317,11 @@ int ParallelTcpServer::CreateLocalSocket(const std::string &localSocketPath) noe return -1; } + if (!at_npu::native::NpuUtils::setFilePermissions(sockFd, S_IRUSR | S_IWUSR | S_IRGRP)) { + close(sockFd); + return -1; + } + ret = listen(sockFd, MAX_EVENT_COUNT); if (ret != 0) { LOG(ERROR) << "listen local socket fd failed " << errno << " : " << strerror(errno); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index b6d189608b7d424566901370aac5067862c2d96e..d39604b299ec82036e816b53fcd24d6bb0968803 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -19,8 +19,12 @@ #include #include #include +#include +#include #include +#include + #include "op_plugin/OpInterface.h" #include "third_party/acl/inc/acl/acl.h" #include "third_party/acl/inc/acl/acl_base.h" @@ -62,6 +66,7 @@ constexpr const char* P2P_DEVICE_KEY = "_p2p"; using hcclUs = std::chrono::steady_clock::time_point; constexpr int32_t MAX_GROUP_NAME_LEN = 128; +constexpr int32_t NSLB_JOBID_OFFSET = 32; // HCCL ReduceOp mapping std::map hcclOp = { @@ -926,6 +931,9 @@ ProcessGroupHCCL::ProcessGroupHCCL( const char* blockingWait = getenv(HCCL_BLOCKING_WAIT); logPrefix_ = createLogPrefix(); + if (options_->global_ranks_in_group.empty()) { + numRanks_ = size_; + } dumpOnException_ = c10d::getCvarBool(TORCH_HCCL_DUMP_ON_TIMEOUT, false); heartbeat_ = 1ULL; monitorThreadEnabled_.store(c10d::getCvarBool(TORCH_HCCL_ENABLE_MONITORING, false)); @@ -941,6 +949,24 @@ ProcessGroupHCCL::ProcessGroupHCCL( c10d::PrefixStore *prefixStore = dynamic_cast(store_.get()); globalStore_ = prefixStore ? 
prefixStore->getUnderlyingNonPrefixStore() : store_; + c10::intrusive_ptr getTcpStore = store_; + while (getTcpStore) { + c10d::PrefixStore *asPrefixStore = dynamic_cast(getTcpStore.get()); + c10d::TCPStore *tcpStore = dynamic_cast(getTcpStore.get()); + if (tcpStore) { + if (!(tcpStore->getHost().empty())) { + tcpMasterAddr = tcpStore->getHost(); + tcpMasterPort = tcpStore->getPort(); + break; + } + } + if (asPrefixStore) { + getTcpStore = asPrefixStore->getUnderlyingStore(); + } else { + break; + } + } + try { if (blockingWait != nullptr) { auto val = std::stoi(blockingWait); @@ -1172,6 +1198,7 @@ void ProcessGroupHCCL::abortAndClearHcclComm(c10::optional abortRea abortCommsFromMap(devHCCLCommMap_, rank_, abortReason); devHCCLCommMap_.clear(); devHCCLCommNameMap_.clear(); + p2pSendRecvKeys_.clear(); hcclCommCounter_ = 0; return; } @@ -1214,6 +1241,7 @@ ProcessGroupHCCL::~ProcessGroupHCCL() } } devHCCLCommMap_.clear(); + p2pSendRecvKeys_.clear(); } ASCEND_LOGI("process group destroyed, group id is %s.", options_->group_id.c_str()); logger->info("process group destroyed, group id is %s.", options_->group_id.c_str()); @@ -1707,6 +1735,7 @@ void ProcessGroupHCCL::workCleanupLoop() try { if (needSetDevice) { c10::DeviceIndex device = static_cast(work.devices_[0].index()); + c10_npu::SetThreadAffinity(device); NPU_CHECK_ERROR(c10_npu::SetDevice(device)); deviceId_ = static_cast(work.devices_[0].index()); needSetDevice = false; @@ -2049,30 +2078,34 @@ bool ProcessGroupHCCL::recordHcclStatus(const std::string path, bool end, bool e } fileName << "torch_hccl_status-" << std::to_string(global_rank) << "_" << master_addr << "_" << std::to_string(deviceId_) << "_"; fileName << std::to_string(numRanks_) << "_" << std::to_string(pid) << "_" << std::to_string(duration) << ".log"; - std::string isMaster = "false"; + bool isMaster = false; if (global_rank == 0) { - isMaster = "true"; + isMaster = true; } std::string out_file_path = c10::str(path, "/", fileName.str()); checkAndMakePath(path.c_str(), "Open shared directory failed. 
Please check whether input path is valid."); createFile(out_file_path.c_str()); - outfile.open(out_file_path.c_str(), std::ios::trunc); - outfile << "{\"last_comm_op\":["; - bool first_op = true; + using json = nlohmann::json; + json result; + std::list last_comm_ops; for (auto info = StatusOutput_.begin(); info != StatusOutput_.end(); info++) { - if (first_op) { - outfile << "{"; - } else { - outfile << ", {"; - } - outfile << "\"seq\":" << info->second.seq << ", \"op_type\":\"" << info->second.opType; - outfile << "\", \"pg_id\":\"" << info->second.pgId << "\", \"comm_ids\":\"" << info->second.commIds; - outfile << "\", \"status\":\""<< info->second.status << "\"}"; - first_op = false; - } - outfile << "], \"is_master\":" << isMaster; - outfile << ", \"exception_message\":\"" << exceptionMessage_; - outfile << "\", \"global_pg_end_time\":" << end_duration << "}" << std::endl; + json comm_op; + comm_op["seq"] = info->second.seq; + comm_op["op_type"] = info->second.opType; + comm_op["pg_id"] = info->second.pgId; + comm_op["comm_ids"] = info->second.commIds; + comm_op["status"] = info->second.status; + last_comm_ops.emplace_back(comm_op); + } + if (!last_comm_ops.empty()) { + result["last_comm_op"] = last_comm_ops; + } + result["is_master"] = isMaster; + result["exception_message"] = exceptionMessage_; + result["global_pg_end_time"] = end_duration; + std::string result_str = result.dump(); + outfile.open(out_file_path.c_str(), std::ios::trunc); + outfile << result_str << std::endl; outfile.close(); return true; } @@ -2141,6 +2174,30 @@ std::vector>& ProcessGroupHCCL::getHCCLComm( return createHCCLComm(devicesKey, devices, commType, commConfig, p2pRank); } +void ProcessGroupHCCL::setNSLBCommConfig(HcclCommConfig** commConfig) +{ + const char* envPtr = std::getenv("RANK"); + if (envPtr == nullptr) { + ASCEND_LOGI("Failed to get env info for NSLB-DP."); + return; + } + uint32_t worldRankID = std::stoi(std::string(envPtr)); + options_->hccl_config["hccl_world_rank_id"] = worldRankID; + uint32_t masterPort = tcpMasterPort; + struct sockaddr_in sa; + std::string master_addr = tcpMasterAddr; + inet_pton(AF_INET, std::string(master_addr).c_str(), &(sa.sin_addr)); + uint32_t masterIp = ntohl(sa.sin_addr.s_addr); + uint64_t jobID = masterPort; + jobID = (jobID << NSLB_JOBID_OFFSET); + jobID += masterIp; + options_->hccl_config["hccl_job_id"] = jobID; + if ((*commConfig) != nullptr) { + (*commConfig)->hcclWorldRankID = worldRankID; + (*commConfig)->hcclJobID = jobID; + } +} + void ProcessGroupHCCL::createHCCLComm( const std::string& devicesKey, const std::vector& devices, @@ -2165,6 +2222,10 @@ void ProcessGroupHCCL::createHCCLComm( HcclCommConfig config; + if (options_->global_ranks_in_group.empty()) { + setNSLBCommConfig(&commConfig); + } + npuGuard.set_index(devices[i].index()); switch (commType) { case HcclCommType::DEFAULT: @@ -2295,6 +2356,9 @@ bool ProcessGroupHCCL::createHCCLCommEx( return false; } hcclComms[i] = subComm; + if (commType == HcclCommType::P2P) { + hcclComms[i]->p2pPeer = getP2pPeer(); + } // Creates the HCCL streams streamVal.push_back(getNPUStreamByCurrentType(devices[i].index())); } @@ -2397,6 +2461,14 @@ std::vector>& ProcessGroupHCCL::createHCCLComm( // Move the HCCL resource to cache devHCCLCommMap_.emplace(devicesKey, std::move(hcclComms)); + if (commType == HcclCommType::P2P) { + auto iter = p2pSendRecvKeys_.find(rank_); + if (iter == p2pSendRecvKeys_.end()) { + p2pSendRecvKeys_.emplace(rank_, std::vector{devicesKey}); + } else { + iter->second.push_back(devicesKey); + } 
+ } return devHCCLCommMap_[devicesKey]; } @@ -2407,7 +2479,13 @@ int64_t ProcessGroupHCCL::getStreamId(bool p2p, int peer) std::vector devices = {at::Device(c10::DeviceType::PrivateUse1, device)}; auto key = getKeyFromDevices(devices); if (p2p && hcclCommInitRootInfoConfigExist() && c10_npu::option::OptionsManager::GetP2PBufferSize() != 0) { - TORCH_CHECK(peer >= 0, "In p2p scenarios, the passed 'dst rank id' is error.", DIST_ERROR(ErrCode::PARAM)); + TORCH_CHECK( + peer >= 0, + "In p2p scenarios, the passed 'dst rank id' : ", + peer, + " is error, ", + "expected value >= 0.", + DIST_ERROR(ErrCode::PARAM)); key = getKeySendRecv(rank_, peer); } if ((hcclStreams_.count(key) == 0) || hcclStreams_[key].empty()) { @@ -2735,7 +2813,7 @@ void ProcessGroupHCCL::resumeHcclComm(int device_id) { at::Device device = at::Device(c10::DeviceType::PrivateUse1, device_id); std::vector devices = {device}; - const auto key = getKeyFromDevices(devices); + auto key = getKeyFromDevices(devices); { std::lock_guard lock(mutex_); @@ -2747,6 +2825,19 @@ void ProcessGroupHCCL::resumeHcclComm(int device_id) HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm)); } } + if (p2pSendRecvKeys_.find(rank_) != p2pSendRecvKeys_.end()) { + auto p2pKeys = p2pSendRecvKeys_[rank_]; + for (const auto& p2pKey : p2pKeys) { + if (devHCCLCommMap_.find(p2pKey) != devHCCLCommMap_.end()) { + // Reuse the cached communicator if there is one. + auto& hcclComms = devHCCLCommMap_[p2pKey]; + for (const auto& hcclComm : hcclComms) { + auto comm = hcclComm->getHcclComm(); + HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm)); + } + } + } + } } ASCEND_LOGI("resumeHcclComm success, group id is %s.", options_->group_id.c_str()); } @@ -3087,6 +3178,22 @@ HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() } } + if (options_->hccl_config.find("hccl_world_rank_id") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["hccl_world_rank_id"])) { + config.hcclOpExpansionMode = std::get(options_->hccl_config["hccl_world_rank_id"]); + } else { + TORCH_CHECK(false, "Value type of hccl_world_rank_id should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + + if (options_->hccl_config.find("hccl_job_id") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["hccl_job_id"])) { + config.hcclOpExpansionMode = std::get(options_->hccl_config["hccl_job_id"]); + } else { + TORCH_CHECK(false, "Value type of hccl_job_id should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + return config; } @@ -3684,7 +3791,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce( [&](std::vector& hcclStreams, c10::intrusive_ptr&) { if (tensors[0].scalar_type() == at::kBool || tensors[0].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - tensors_cp[0] = at_npu::native::custom_ops::npu_dtype_cast(tensors[0], at::kInt); + tensors_cp[0] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[0], at::kInt); } }, [&](std::vector& hcclStreams, c10::intrusive_ptr&) { @@ -3854,7 +3961,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce_coalesced( for (const auto i : c10::irange(tensors.size())) { if (tensors[i].scalar_type() == at::kBool || tensors[i].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - tensors_cp[i] = at_npu::native::custom_ops::npu_dtype_cast(tensors[i], at::kInt); + tensors_cp[i] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[i], at::kInt); } } }, @@ -3913,7 +4020,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce( [&](std::vector& 
hcclStreams, c10::intrusive_ptr&) { if (tensors[0].scalar_type() == at::kBool || tensors[0].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - tensors_cp[0] = at_npu::native::custom_ops::npu_dtype_cast(tensors[0], at::kInt); + tensors_cp[0] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[0], at::kInt); } }, [&](std::vector& hcclStreams, c10::intrusive_ptr&) { @@ -3973,11 +4080,11 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_oop( [&](std::vector& hcclStreams, c10::intrusive_ptr&) { if (inputTensors[0].scalar_type() == at::kBool || inputTensors[0].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - inputTensors[0] = at_npu::native::custom_ops::npu_dtype_cast(inputTensors[0], at::kInt); + inputTensors[0] = at_npu::native::custom_ops::_npu_dtype_cast(inputTensors[0], at::kInt); } if (outputTensors[0].scalar_type() == at::kBool || outputTensors[0].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - outputTensors[0] = at_npu::native::custom_ops::npu_dtype_cast(outputTensors[0], at::kInt); + outputTensors[0] = at_npu::native::custom_ops::_npu_dtype_cast(outputTensors[0], at::kInt); } }, [&](std::vector& hcclStreams, c10::intrusive_ptr&) { @@ -4012,14 +4119,14 @@ at::Tensor ProcessGroupHCCL::byte_alignment(at::Tensor& tensors) const if (num_add != 0) { bool transflag = false; if (inter_tensors.scalar_type() == at::ScalarType::Bool) { - inter_tensors = at_npu::native::custom_ops::npu_dtype_cast(inter_tensors, at::ScalarType::Int); + inter_tensors = at_npu::native::custom_ops::_npu_dtype_cast(inter_tensors, at::ScalarType::Int); transflag = true; } inter_tensors = op_plugin::constant_pad_nd(inter_tensors, {0, num_add}, 0); if (transflag) { - inter_tensors = at_npu::native::custom_ops::npu_dtype_cast(inter_tensors, at::ScalarType::Bool); + inter_tensors = at_npu::native::custom_ops::_npu_dtype_cast(inter_tensors, at::ScalarType::Bool); } } return inter_tensors; diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 26a88874ab24c9f8134217f07d9064534036aa56..0c638ef579957aa94b1466545e94867842efa448 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -384,7 +384,7 @@ public: return c10::make_intrusive(_is_high_priority_stream); } - std::unordered_map> hccl_config; + std::unordered_map> hccl_config; std::chrono::milliseconds opTimeout; // Schedule HCCL operations on high priority CUDA streams @@ -571,6 +571,8 @@ public: void resumeHcclComm(int device_id); + void setNSLBCommConfig(HcclCommConfig** commConfig); + bool setCommWorkingDevNic( const HcclComm& comm, int nranks, @@ -746,6 +748,8 @@ protected: // // Note that the order of the device for the tensor list matters. std::unordered_map>> devHCCLCommMap_; + + std::unordered_map> p2pSendRecvKeys_; std::unordered_map devHCCLCommNameMap_; @@ -960,6 +964,10 @@ protected: std::string pg_desc_; + std::string tcpMasterAddr; + + uint32_t tcpMasterPort; + private: // Helper that encapsulates work shared across all collective communication // primitives. 
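For context on the NSLB-DP fields added to ProcessGroupHCCL above: setNSLBCommConfig derives a 64-bit job ID by placing the TCP store's master port in the upper 32 bits (NSLB_JOBID_OFFSET) and the IPv4 master address, converted to host byte order, in the lower 32 bits. The snippet below is a minimal standalone sketch of that packing, assuming an IPv4 master address; the helper name make_nslb_job_id and the example endpoint are illustrative only and are not part of the patch.

```cpp
// Standalone sketch of the NSLB-DP job-ID packing (illustrative only).
#include <arpa/inet.h>
#include <netinet/in.h>
#include <cstdint>
#include <iostream>
#include <string>

constexpr int kNslbJobIdOffset = 32;  // mirrors NSLB_JOBID_OFFSET in the patch

// Hypothetical helper: combines the rendezvous endpoint (port, IPv4 address)
// into a single 64-bit identifier, as setNSLBCommConfig does.
uint64_t make_nslb_job_id(const std::string& master_addr, uint32_t master_port)
{
    struct sockaddr_in sa {};
    if (inet_pton(AF_INET, master_addr.c_str(), &sa.sin_addr) != 1) {
        return 0;  // not a dotted-quad IPv4 address; real code would need extra handling
    }
    uint32_t master_ip = ntohl(sa.sin_addr.s_addr);  // host byte order
    uint64_t job_id = static_cast<uint64_t>(master_port) << kNslbJobIdOffset;  // port -> high 32 bits
    job_id += master_ip;                                                       // address -> low 32 bits
    return job_id;
}

int main()
{
    // Example endpoint; in the patch these values come from the underlying TCPStore.
    std::cout << std::hex << make_nslb_job_id("10.0.0.1", 29500) << std::endl;
    return 0;
}
```

Because the (address, port) pair identifies the rendezvous endpoint shared by every rank of a job, the packed value is the same on all ranks and can therefore serve as the hccl_job_id handed to HCCL.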
diff --git a/torch_npu/csrc/distributed/StoreMessagePacker.cpp b/torch_npu/csrc/distributed/StoreMessagePacker.cpp index 0ff08c8d95f08a43032cba4895fb7021749c98f7..61f0388e662327fc6031014637880a01d17597af 100644 --- a/torch_npu/csrc/distributed/StoreMessagePacker.cpp +++ b/torch_npu/csrc/distributed/StoreMessagePacker.cpp @@ -86,6 +86,7 @@ int64_t StoreMessagePacker::Unpack(const std::vector &buffer, StoreMess } auto ptr = buffer.data(); + auto ptr_end = ptr + buffer.size(); auto totalSize = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); @@ -97,22 +98,26 @@ int64_t StoreMessagePacker::Unpack(const std::vector &buffer, StoreMess auto keyCount = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); - message.keys.reserve(keyCount); for (auto i = 0UL; i < keyCount; i++) { auto keySize = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); message.keys.emplace_back(reinterpret_cast(ptr), keySize); ptr += keySize; + if (ptr > ptr_end) { + break; + } } auto valueCount = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); - message.values.reserve(valueCount); for (auto i = 0UL; i < valueCount; i++) { auto valueSize = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); message.values.emplace_back(ptr, ptr + valueSize); ptr += valueSize; + if (ptr > ptr_end) { + break; + } } return static_cast(totalSize); diff --git a/torch_npu/csrc/distributed/TraceUtils.h b/torch_npu/csrc/distributed/TraceUtils.h index 9d4f9d9d523d14b2dc2421cedae95e268d43e236..f6140a3d06f6a81d1ee208f04f5945a2f4e820a8 100644 --- a/torch_npu/csrc/distributed/TraceUtils.h +++ b/torch_npu/csrc/distributed/TraceUtils.h @@ -711,71 +711,71 @@ DEFINE_CONSTANT(started_state, "started") if (includeCollectives) { std::list entries; for (auto& e : dump_entries()) { - json j; - if (onlyActive && e.time_discovered_completed_.has_value()) { - continue; - } - j[record_id_key_str] = int64_t(e.id_); - j[pg_id_key_str] = int64_t(e.pg_id_); - j[pg_name_key_str] = e.pg_name_; - j[collective_seq_id_key_str] = int64_t(e.collective_seq_id_); - j[p2p_seq_id_key_str] = int64_t(e.p2p_seq_id_); - j[op_id_key_str] = int64_t(e.op_id_); - j[profiling_name_key_str] = e.profiling_name_; - j[time_created_key_str] = int64_t(e.time_created_); - if (e.duration_) { - j[duration_key_str] = *e.duration_; - } - auto it = e.sizes_.begin(); - auto read_sizes = [&](const c10::SmallVector& dims) { - auto sizes = std::list>(); - for (auto dim : dims) { - auto arg_sizes = std::list(); - for (auto i : c10::irange(dim)) { - (void)i; - arg_sizes.push_back(*it++); + json j; + if (onlyActive && e.time_discovered_completed_.has_value()) { + continue; } - sizes.push_back(arg_sizes); + j[record_id_key_str] = int64_t(e.id_); + j[pg_id_key_str] = int64_t(e.pg_id_); + j[pg_name_key_str] = e.pg_name_; + j[collective_seq_id_key_str] = int64_t(e.collective_seq_id_); + j[p2p_seq_id_key_str] = int64_t(e.p2p_seq_id_); + j[op_id_key_str] = int64_t(e.op_id_); + j[profiling_name_key_str] = e.profiling_name_; + j[time_created_key_str] = int64_t(e.time_created_); + if (e.duration_) { + j[duration_key_str] = *e.duration_; } - return sizes; - }; - j[input_sizes_key_str] = read_sizes(e.input_dims_); - std::vector input_dtypes_strs; - input_dtypes_strs.reserve(e.input_dtypes_.size()); - for (const auto& input_dtype : e.input_dtypes_) { - input_dtypes_strs.emplace_back(c10::toString(input_dtype)); - } - j[input_dtypes_key_str] = input_dtypes_strs; - j[output_sizes_key_str] = read_sizes(e.output_dims_); - std::vector output_dtypes_strs; - output_dtypes_strs.reserve(e.output_dtypes_.size()); - for (const auto& 
output_dtype : e.output_dtypes_) { - output_dtypes_strs.emplace_back(c10::toString(output_dtype)); - } - j[output_dtypes_key_str] = output_dtypes_strs; - if (e.time_discovered_completed_.has_value()) { - j[state_key_str] = completed_state_str; - } else if (e.time_discovered_started_.has_value()) { - j[state_key_str] = started_state_str; - } else { - j[state_key_str] = scheduled_state_str; - } - j[time_discovered_started_key_str] = - e.time_discovered_started_.has_value() - ? int64_t(*e.time_discovered_started_) - : 0; - j[time_discovered_completed_key_str] = - e.time_discovered_completed_.has_value() - ? int64_t(*e.time_discovered_completed_) - : 0; - j[retired_key_str] = e.retired_; - j[timeout_key_str] = e.timeout_ms_; - j[is_p2p_key_str] = e.isP2P_; - entries.emplace_back(j); + auto it = e.sizes_.begin(); + auto read_sizes = [&](const c10::SmallVector& dims) { + auto sizes = std::list>(); + for (auto dim : dims) { + auto arg_sizes = std::list(); + for (auto i : c10::irange(dim)) { + (void)i; + arg_sizes.push_back(*it++); + } + sizes.push_back(arg_sizes); + } + return sizes; + }; + j[input_sizes_key_str] = read_sizes(e.input_dims_); + std::vector input_dtypes_strs; + input_dtypes_strs.reserve(e.input_dtypes_.size()); + for (const auto& input_dtype : e.input_dtypes_) { + input_dtypes_strs.emplace_back(c10::toString(input_dtype)); + } + j[input_dtypes_key_str] = input_dtypes_strs; + j[output_sizes_key_str] = read_sizes(e.output_dims_); + std::vector output_dtypes_strs; + output_dtypes_strs.reserve(e.output_dtypes_.size()); + for (const auto& output_dtype : e.output_dtypes_) { + output_dtypes_strs.emplace_back(c10::toString(output_dtype)); + } + j[output_dtypes_key_str] = output_dtypes_strs; + if (e.time_discovered_completed_.has_value()) { + j[state_key_str] = completed_state_str; + } else if (e.time_discovered_started_.has_value()) { + j[state_key_str] = started_state_str; + } else { + j[state_key_str] = scheduled_state_str; + } + j[time_discovered_started_key_str] = + e.time_discovered_started_.has_value() + ? int64_t(*e.time_discovered_started_) + : 0; + j[time_discovered_completed_key_str] = + e.time_discovered_completed_.has_value() + ? 
int64_t(*e.time_discovered_completed_) + : 0; + j[retired_key_str] = e.retired_; + j[timeout_key_str] = e.timeout_ms_; + j[is_p2p_key_str] = e.isP2P_; + entries.emplace_back(j); } if (!entries.empty()) { - result[entries_key_str] = entries; + result[entries_key_str] = entries; } } diff --git a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp index 1b85e7fce6438a9a4bb657a0ea1124c62be066c5..319de4ae93d3704562c0cfb74a2418d7afff3ea7 100644 --- a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -424,6 +424,9 @@ void TensorPipeAgent::startImpl() priority = opts_.transports->size() - 1 - (iter - opts_.transports->begin()); } std::unique_ptr reg = TensorPipeTransportRegistry()->Create(key); + if (reg == nullptr || reg->transport == nullptr) { + TORCH_CHECK(false, "TensorPipeTransport get nullptr", DIST_ERROR(ErrCode::PTR)); + } if (!reg->transport->isViable()) { continue; } diff --git a/torch_npu/csrc/framework/FormatHelper.cpp b/torch_npu/csrc/framework/FormatHelper.cpp index 6a92fe5af4d8039b3d0ff9c50e49d1fd5fa30a00..9bd270b8fd231cb39a8bc9b98c8680b88a66e6a2 100644 --- a/torch_npu/csrc/framework/FormatHelper.cpp +++ b/torch_npu/csrc/framework/FormatHelper.cpp @@ -52,6 +52,10 @@ std::unordered_map FormatHelper::Initialize {ACL_FORMAT_NDC1HWC0, (FormatInfo){ACL_FORMAT_NDC1HWC0, ACL_FORMAT_NCDHW, InferShapeOfNDC1HWC0, "NDC1HWC0", true}}, {ACL_FRACTAL_Z_3D, (FormatInfo){ACL_FRACTAL_Z_3D, ACL_FORMAT_NCDHW, InferShapeOfFZ3D, "FRACTAL_Z_3D", true}}, + {ACL_FORMAT_FRACTAL_NZ_C0_16, + (FormatInfo){ACL_FORMAT_FRACTAL_NZ_C0_16, ACL_FORMAT_ND, nullptr, "FRACTAL_NZ_C0_16", true}}, + {ACL_FORMAT_FRACTAL_NZ_C0_32, + (FormatInfo){ACL_FORMAT_FRACTAL_NZ_C0_32, ACL_FORMAT_ND, nullptr, "FRACTAL_NZ_C0_32", true}}, }; }; diff --git a/torch_npu/csrc/framework/LazyInitAclops.cpp b/torch_npu/csrc/framework/LazyInitAclops.cpp index 8d12df0a312afb289cf6164c6d613a49a3fb0caa..5f51f9f0a5cfd43afceb87cf6f4552ea61d4e50c 100644 --- a/torch_npu/csrc/framework/LazyInitAclops.cpp +++ b/torch_npu/csrc/framework/LazyInitAclops.cpp @@ -4,7 +4,6 @@ #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" -#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -158,8 +157,6 @@ void SetPrecisionMode() void LazyInitAclopsCore() { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - #ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; if (PyGILState_Check()) { @@ -175,8 +172,6 @@ void LazyInitAclopsCore() PyEval_RestoreThread(gilState); } #endif - - c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void LazyInitAclops() @@ -198,14 +193,10 @@ void LazyInitAclops() void InitAclopsCore() { - SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - SetPrecisionMode(); MakeCompileCacheDirAndSetOption(); GetAndSetDefaultJitCompileByAcl(); SetHF32DefaultValue(); - - SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void InitAclops() diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index 6b98651c51dba728c9062a47d777650ae7ac93a6..80af05f94b321cac1e928484e18027c1e5cc836b 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -24,7 +24,9 @@ static std::unordered_map> 
floating_limits_m {at::ScalarType::Double, {std::numeric_limits::max(), std::numeric_limits::min()}}, {at::ScalarType::Float, {std::numeric_limits::max(), std::numeric_limits::min()}}, {at::ScalarType::BFloat16, {std::numeric_limits::max(), std::numeric_limits::min()}}, - {at::ScalarType::Half, {65504, -65504}}}; + {at::ScalarType::Half, {65504, -65504}}, + {at::ScalarType::Float8_e5m2, {57345, -57345}}, + {at::ScalarType::Float8_e4m3fn, {449, -449}}}; static std::unordered_map> integral_limits_map{ {at::ScalarType::Long, {std::numeric_limits::max(), std::numeric_limits::min()}}, {at::ScalarType::Int, {std::numeric_limits::max(), std::numeric_limits::min()}}, @@ -274,7 +276,7 @@ OpCommand& OpCommand::AddTensorInput(at::Tensor &tensor, at::ScalarType forceSca { std::tuple res; if (commonType.has_value() && commonType.value() != tensor.scalar_type()) { - tensor = custom_ops::npu_dtype_cast(tensor, commonType.value()); + tensor = custom_ops::_npu_dtype_cast(tensor, commonType.value()); } // as for dim=0, the dtype of tensor can not be `uint16` because of `TBE` if (torch_npu::NPUBridge::GetNpuStorageImplDesc(tensor).storage_sizes_.empty()) { @@ -331,7 +333,7 @@ OpCommand& OpCommand::AddScalarInput(const c10::Scalar& input, at::ScalarType ty OpCommand& OpCommand::AddOutput(at::Tensor &output, const string &realType) { if (resultTypeDefined == false && commonType.has_value() && commonType.value() != output.scalar_type()) { - output = custom_ops::npu_dtype_cast(output, commonType.value()); + output = custom_ops::_npu_dtype_cast(output, commonType.value()); } auto res = OpCmdHelper::CovertToAclOutput(output, realType); aclCmd->AddOutput(std::get<0>(res), std::get<1>(res)); diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index ce8b9065146b362aca3e35a6531994d21a13cea0..0db71b00a109eadcbfacc134d79614ddff98b6e9 100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -575,6 +575,7 @@ void *NewFunc(int caption, int &size) void DeleteFunc(void *ptr) { free(ptr); + ptr = nullptr; } using Func = int (*)(c10_npu::queue::QueueParas *, aclrtStream); diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp index 6a23a5e4b9094896dc11efa83fed5deb413792eb..08a2d603b6cd96acbe4cd0a05435b15fae9d275c 100644 --- a/torch_npu/csrc/framework/StorageDescHelper.cpp +++ b/torch_npu/csrc/framework/StorageDescHelper.cpp @@ -62,9 +62,13 @@ void StorageDescHelper::UpdateDesc(torch_npu::NPUStorageDesc &npuDesc, const c10 } } npuDesc.base_strides_ = new_stride; - // 更新物理内存信息 - npuDesc.storage_sizes_ = FormatHelper::GetStorageSizes(npuDesc); + int NCDHW_OR_NDHWC_DIM = 5; + if ((npuDesc.npu_format_ == ACL_FORMAT_NCDHW || npuDesc.npu_format_ == ACL_FORMAT_NDHWC) && new_size.size() < NCDHW_OR_NDHWC_DIM) { + npuDesc.storage_sizes_ = new_size; + } else { + npuDesc.storage_sizes_ = FormatHelper::GetStorageSizes(npuDesc); + } if (new_data_numel > new_shape_numel) { // Refresh format to base format only when flattening storage data npuDesc.storage_sizes_ = new_size; @@ -98,6 +102,13 @@ void StorageDescHelper::SetDesc(at::Tensor &dst, const c10::IntArrayRef& size, c torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_ = SetDesc(dst.dtype(), size, strides, format); } +void StorageDescHelper::SetDesc(at::Tensor &dst, const c10::IntArrayRef &base_size, + const c10::IntArrayRef &storage_size, const c10::IntArrayRef &strides, aclFormat format) +{ + 
torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_ = + SetDesc(dst.dtype(), base_size, storage_size, strides, format); +} + bool StorageDescHelper::CheckDescInit(const c10::Storage &storage) { return torch_npu::NPUBridge::GetNpuStorageImpl(storage.unsafeGetStorageImpl())->npu_desc_.origin_format_ != @@ -255,6 +266,22 @@ torch_npu::NPUStorageDesc StorageDescHelper::SetDesc(const caffe2::TypeMeta &dty return npu_desc; } +torch_npu::NPUStorageDesc StorageDescHelper::SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& base_size, + const c10::IntArrayRef& storage_size, const c10::IntArrayRef& strides, aclFormat format) +{ + struct torch_npu::NPUStorageDesc npu_desc; + npu_desc.data_type_ = dtype; + npu_desc.base_sizes_ = base_size; + npu_desc.base_strides_ = strides; + aclFormat baseFormat; + aclFormat npuFormat; + std::tie(baseFormat, npuFormat) = InferFormat::GuessFormatUnit(base_size, format); + npu_desc.storage_sizes_ = storage_size; + npu_desc.origin_format_ = baseFormat; + npu_desc.npu_format_ = npuFormat; + return npu_desc; +} + int64_t StorageDescHelper::GetMemorySize(const torch_npu::NPUStorageDesc &dst) { const auto &physical_size = FormatHelper::GetStorageSizes(dst); diff --git a/torch_npu/csrc/framework/StorageDescHelper.h b/torch_npu/csrc/framework/StorageDescHelper.h index 6497ee1a8825551acc85d71de0a2582db771c548..f3b35067e0f5fcf8f9f46f9886bd2286b48e887b 100644 --- a/torch_npu/csrc/framework/StorageDescHelper.h +++ b/torch_npu/csrc/framework/StorageDescHelper.h @@ -35,6 +35,8 @@ public: static void SetDesc(at::Tensor &dst, const c10::IntArrayRef& size, const c10::IntArrayRef& strides); static void SetDesc(at::Tensor &dst, const c10::IntArrayRef& size, const c10::IntArrayRef& strides, aclFormat format); + static void SetDesc(at::Tensor &dst, const c10::IntArrayRef &base_size, + const c10::IntArrayRef &storage_size, const c10::IntArrayRef &strides, aclFormat format); static bool CheckDescInit(const c10::Storage &storage); // For Serialization to Get and Set NpuStorageDesc @@ -63,6 +65,8 @@ private: const c10::IntArrayRef& strides); static torch_npu::NPUStorageDesc SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& size, const c10::IntArrayRef& strides, aclFormat format); + static torch_npu::NPUStorageDesc SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& base_size, + const c10::IntArrayRef& storage_size, const c10::IntArrayRef& strides, aclFormat format); }; } // namespace native diff --git a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp index c2abf7f4b2ae45e57a603f91ea3480e9519e4b1f..ee90387910967e7113f0153b0a8aea3099c0cb50 100644 --- a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp @@ -70,6 +70,14 @@ private: ResetDataPtr(src, self, static_cast(src.storage().data_ptr().get())); return true; + case at::ScalarType::Float8_e5m2: + ResetDataPtr(src, self, + static_cast(src.storage().data_ptr().get())); + return true; + case at::ScalarType::Float8_e4m3fn: + ResetDataPtr(src, self, + static_cast(src.storage().data_ptr().get())); + return true; default: // Turn to conducting d2dCopyAsync for other dtypes. 
return false; diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index ad1f76c5e57105f546893c6ae6cd916688372839..f1bab9565f504e4d62640f432be6c0346f718a9e 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -1,6 +1,5 @@ #include #include "torch_npu/csrc/core/npu/NPUException.h" -#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "third_party/acl/inc/acl/acl_mdl.h" #include "torch_npu/csrc/framework/utils/ForceJitCompileList.h" @@ -47,9 +46,23 @@ REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string &val) { aclmdlSetDump(val.c_str()); }) -static bool acl_op_has_init = false; +bool CheckJitDisableInner() +{ + auto val = c10_npu::option::GetOption("jitCompile"); + if (val.has_value()) { + if (val.value() == ("disable")) { + return true; + } + if (val.value() == ("enable")) { + return false; + } + } + if (c10_npu::GetSocVersion() >= c10_npu::SocVersion::Ascend910B1) { + return true; + } + return false; +} -REGISTER_OPTION_BOOL_FUNCTION(CheckJitDisableInner, jitCompile, "enable", "disable") REGISTER_OPTION_CACHE(bool, isJitDisable, CheckJitDisableInner) REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); @@ -60,14 +73,7 @@ REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { "Jit compile set is disabled! If you want to set, ", "please change the environment variable ACL_OP_INIT_MODE to 0 or 1.", PTA_ERROR(ErrCode::NOT_SUPPORT)); - if (!acl_op_has_init) { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - } NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); - if (!acl_op_has_init) { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); - acl_op_has_init = true; - } } SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? 
true : false); }) diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp index c1bf40b0ee7a351643f730b5153b52e5bcc96ef2..97a07a9025deda50040bfe4047cbbede07620532 100644 --- a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp +++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp @@ -52,8 +52,8 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(ENUM_PAIR_FUNC) _(at::ScalarType::Bits4x2, ACL_DT_UNDEFINED) \ _(at::ScalarType::Bits8, ACL_DT_UNDEFINED) \ _(at::ScalarType::Bits16, ACL_DT_UNDEFINED) \ - _(at::ScalarType::Float8_e5m2, ACL_DT_UNDEFINED) \ - _(at::ScalarType::Float8_e4m3fn, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Float8_e5m2, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Float8_e4m3fn, ACL_DT_UNDEFINED) \ _(at::ScalarType::Float8_e5m2fnuz, ACL_DT_UNDEFINED) \ _(at::ScalarType::Float8_e4m3fnuz, ACL_DT_UNDEFINED) \ _(at::ScalarType::UInt16, ACL_UINT16) \ @@ -86,6 +86,28 @@ AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(ENUM_PAIR_FUNC) static std::map STRING_SCALAR_TYPE_TO_ACL_TYPE_MAP = { {"uint16", ACL_UINT16}, {"uint8", ACL_UINT8}, {"uint64", ACL_UINT64}, {"string", ACL_STRING}}; +static std::unordered_map + ACL_TYPE_TO_SCALAR_TYPE_MAP = {{ACL_DT_UNDEFINED, at::ScalarType::Undefined}, + {ACL_FLOAT, at::ScalarType::Float}, + {ACL_FLOAT16, at::ScalarType::Half}, + {ACL_INT8, at::ScalarType::Char}, + {ACL_INT32, at::ScalarType::Int}, + {ACL_UINT8, at::ScalarType::Byte}, + {ACL_INT16, at::ScalarType::Short}, + {ACL_UINT16, at::ScalarType::UInt16}, + {ACL_UINT32, at::ScalarType::UInt32}, + {ACL_INT64, at::ScalarType::Long}, + {ACL_UINT64, at::ScalarType::UInt64}, + {ACL_DOUBLE, at::ScalarType::Double}, + {ACL_BOOL, at::ScalarType::Bool}, + {ACL_STRING, at::ScalarType::Undefined}, + {ACL_COMPLEX64, at::ScalarType::ComplexFloat}, + {ACL_COMPLEX128, at::ScalarType::ComplexDouble}, + {ACL_BF16, at::ScalarType::BFloat16}, + {ACL_INT4, at::ScalarType::Undefined}, + {ACL_UINT1, at::ScalarType::Undefined}, + {ACL_COMPLEX32, at::ScalarType::ComplexHalf}}; + aclError AclrtMemcpyAsyncParamCheck(void *dst, size_t destMax, const void *src, size_t count, aclrtMemcpyKind kind, aclrtStream stream) { @@ -297,5 +319,17 @@ int8_t CalcuOpUtil::GetCubeMathType(bool allowHf32) return iter->second; } +at::ScalarType CalcuOpUtil::ConvertToScalarType(const aclDataType data_type) +{ + auto iter = ACL_TYPE_TO_SCALAR_TYPE_MAP.find(data_type); + if (iter == ACL_TYPE_TO_SCALAR_TYPE_MAP.end()) { + TORCH_CHECK(false, + std::string("aclDataType:") + std::to_string(data_type) + " has not been supported", + OPS_ERROR(ErrCode::NOT_SUPPORT)) + } + + return iter->second; +} + } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.h b/torch_npu/csrc/framework/utils/CalcuOpUtil.h index 481c1756a4d484396e166f9ff869310ee7592652..5ee41e7d64b4988d356dcb69a9b06e332277e0f7 100644 --- a/torch_npu/csrc/framework/utils/CalcuOpUtil.h +++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.h @@ -36,14 +36,23 @@ using std::vector; #define ASCEND_ALWAYS_INLINE inline #endif -#define ACL_REQUIRE_OK_OP(expr, opstr) \ - do { \ - if (ASCEND_UNLIKELY((expr) != 0)) { \ - std::cout << (opstr) << std::endl; \ - TORCH_CHECK((expr) == 0, __func__, ":", __FILE__, ":", __LINE__, \ - " NPU error,NPU error code is:", expr, "\n", \ - c10_npu::acl::AclGetErrMsg(), OPS_ERROR(ErrCode::INTERNAL)); \ - } \ +#define ACL_REQUIRE_OK_OP(expr, opstr) \ + do { \ + if (ASCEND_UNLIKELY((expr) != 0)) { \ + std::cout << (opstr) << std::endl; \ + if 
(c10_npu::option::OptionsManager::IsCompactErrorOutput()) { \ + std::ostringstream oss; \ + oss << " NPU error,NPU error code is:" << (expr) << "\n" \ + << OPS_ERROR(ErrCode::INTERNAL); \ + std::string err_msg=oss.str(); \ + ASCEND_LOGE("%s", err_msg.c_str()); \ + TORCH_CHECK((expr) == 0, c10_npu::c10_npu_get_error_message()); \ + } else { \ + TORCH_CHECK((expr) == 0, __func__, ":", __FILE__, ":", __LINE__, \ + " NPU error,NPU error code is:", expr, "\n", \ + c10_npu::acl::AclGetErrMsg(), OPS_ERROR(ErrCode::INTERNAL)); \ + } \ + } \ } while (0) using StorageAndOffsetMemSizePair = std::pair; @@ -81,6 +90,7 @@ public: static int64_t GetTensorNpuFormat(const at::Tensor &tensor); static c10::SmallVector ConvertIntArrayRefToSmallVector(c10::IntArrayRef intArray); static int8_t GetCubeMathType(bool allowHf32); + static at::ScalarType ConvertToScalarType(const aclDataType data_type); }; } // namespace native diff --git a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp index a1b2b00aa12b2b2e96f75c0cf7ff4c718e653f01..1626499a80b2870fb4cad1c8c726595e003212ff 100644 --- a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp +++ b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp @@ -18,7 +18,6 @@ namespace at_npu { namespace native { - void ForceAclnn::RegisterOp(const std::string &list) { if (list.empty()) { diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp index 47d4fac21a2d2f740027e2f09b0a906ad00932e7..d9815f233580545e328236708ae64e92ffec9ebf 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.cpp +++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" @@ -267,6 +268,15 @@ void NpuUtils::check_1d(const at::Tensor &t, const char *arg, const char *fn) OPS_ERROR(ErrCode::PARAM)); } +bool NpuUtils::setFilePermissions(int fd, mode_t mode) +{ + if (fchmod(fd, mode) == -1) { + ASCEND_LOGI("Failed to set permissions."); + return false; + } + return true; +} + #ifndef BUILD_LIBTORCH void NpuUtils::ProfReportMarkDataToNpuProfiler(uint32_t category, const std::string &data, uint64_t correlation_id) diff --git a/torch_npu/csrc/framework/utils/NpuUtils.h b/torch_npu/csrc/framework/utils/NpuUtils.h index a891f0d2b4ccf8d35852cc522495930de61920d2..a85dbe1b19988fd20ae7e90a0f2a6fad1d4e776e 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.h +++ b/torch_npu/csrc/framework/utils/NpuUtils.h @@ -46,6 +46,7 @@ public: static bool check_5d_5d_match(const at::Tensor &tensor); static bool IsOomError(aclError ret, int index); static void check_1d(const at::Tensor &t, const char *arg, const char *fn); + static bool setFilePermissions(int fd, mode_t mode); #ifndef BUILD_LIBTORCH static void ProfReportMarkDataToNpuProfiler(uint32_t category, const std::string &data, uint64_t correlation_id = 0); diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp index 069dc4719f050ef5579aa2976eb3931c59027b3d..c48c25786bae7f33753d3d828795d37595178a2b 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.cpp +++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp @@ -96,6 +96,11 @@ aclDataType OpPreparation::convert_to_acl_data_type(const at::ScalarType &data_t return CalcuOpUtil::ConvertToAclDataType(data_type, realDataType); } +at::ScalarType OpPreparation::convert_to_scalar_type(const aclDataType data_type) +{ + return 
CalcuOpUtil::ConvertToScalarType(data_type); +} + at::Tensor OpPreparation::copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type) { return CalcuOpUtil::CopyScalarToDevice(cpu_scalar, scalar_data_type); diff --git a/torch_npu/csrc/framework/utils/OpPreparation.h b/torch_npu/csrc/framework/utils/OpPreparation.h index 74ac30389872e4c0c8cb7da7a1ae3d7c2d4e075c..e87a91011218a4aa55b3f5187523af97ba1226f6 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.h +++ b/torch_npu/csrc/framework/utils/OpPreparation.h @@ -22,6 +22,7 @@ public: // From CalcuOpUtil part static aclDataType convert_to_acl_data_type(const at::ScalarType &data_type); static aclDataType convert_to_acl_data_type(const at::ScalarType &data_type, const std::string &realDataType); + static at::ScalarType convert_to_scalar_type(const aclDataType data_type); static at::Tensor copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type); static at::Tensor copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type, const c10::Device device); diff --git a/torch_npu/csrc/ipc/CMakeLists.txt b/torch_npu/csrc/ipc/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c70da051f6f729c639eeb418daf0d154e6dc239 --- /dev/null +++ b/torch_npu/csrc/ipc/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB _IPC_SRCS *.cpp) + +LIST(APPEND IPC_SRCS ${_IPC_SRCS}) + +# Pass to parent +set(IPC_SRCS ${IPC_SRCS} PARENT_SCOPE) \ No newline at end of file diff --git a/torch_npu/csrc/ipc/NPUIPCTypes.cpp b/torch_npu/csrc/ipc/NPUIPCTypes.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1ff8458c8760e756c719083a0a2eaf89933148fb --- /dev/null +++ b/torch_npu/csrc/ipc/NPUIPCTypes.cpp @@ -0,0 +1,254 @@ +#include +#include +#include +#include +#include +#include +#include "torch_npu/csrc/core/npu/NPUGuard.h" +#include "torch_npu/csrc/ipc/NPUIPCTypes.h" + +#include "third_party/acl/inc/acl/acl_base.h" +#include "third_party/acl/inc/acl/acl_rt.h" + +namespace torch_npu { +namespace ipc { + +namespace { + +void warnProducerTerminatedBeforeSharedTensorsReleased() +{ + static bool warned = false; + if (!warned) { + LOG(WARNING) + << "Producer process has been terminated before all shared NPU tensors released. 
See Note [Sharing NPU tensors]"; + warned = true; + } +} + +struct NpuIPCGlobalEntities { + // This class is used as a singleton (see npu_ipc_global_entities) + // This variable is used to track its lifetime to avoid accessing it + // after it was destroyed which would lead to segmentation faults + // Note that a trvial type is used which doesn't suffer from construction + // and destruction order issues + static bool alive; + + std::mutex ref_counters_mutex_; + std::atomic sync_events_used_{0}; + std::map> + ref_counters_files_; + std::shared_ptr next_available_ref_counters_file_; + NpuIPCSentDataLimbo NpuIPCSentDataLimbo_; + + NpuIPCGlobalEntities() + { + alive = true; + } + + ~NpuIPCGlobalEntities() + { + NpuIPCSentDataLimbo_.collect(); + safe_clean_current_file(); + if (next_available_ref_counters_file_) { + warnProducerTerminatedBeforeSharedTensorsReleased(); + } + alive = false; + } + + void safe_clean_current_file() + { + std::lock_guard lock(ref_counters_mutex_); + if (next_available_ref_counters_file_ && + next_available_ref_counters_file_->offsets_in_use() == 0) { + ref_counters_files_.erase(next_available_ref_counters_file_->handle()); + next_available_ref_counters_file_.reset(); + } + } +}; + +bool NpuIPCGlobalEntities::alive = false; +NpuIPCGlobalEntities npu_ipc_global_entities; + +NpuIPCSentDataLimbo::~NpuIPCSentDataLimbo() +{ + collect(); + if (size() > 0) { + warnProducerTerminatedBeforeSharedTensorsReleased(); + } +} + +bool NpuIPCSentDataLimbo::collect() +{ + bool freed_memory = false; + std::vector> reset_blocks; + { + // Begin critical section to modify shared blocks + std::lock_guard lock(limbo_mutex_); + std::vector> kept_blocks; + for (auto& sd : shared_blocks_) { + if (sd->counter_value() > 0) { + kept_blocks.push_back(std::move(sd)); + } else { + freed_memory = true; + reset_blocks.push_back(std::move(sd)); + } + } + shared_blocks_ = std::move(kept_blocks); + } + // Need to reset blocks out of the critical section here, otherwise it + // deadlocks. + for (auto& sd : reset_blocks) { + sd.reset(); + } + return freed_memory; +} + +void NpuIPCSentDataLimbo::add(std::unique_ptr shared_block) +{ + std::lock_guard lock(limbo_mutex_); + static bool warned = false; + if (shared_blocks_.size() > NPU_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO && + !warned) { + LOG(WARNING) + << "Producer process tried to deallocate over " + << NPU_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO + << " memory blocks referred by consumer processes. Deallocation might be significantly slowed down. 
" + << "We assume it will never going to be the case."; + warned = true; + } + shared_blocks_.push_back(std::move(shared_block)); +} + +uint64_t NpuIPCSentDataLimbo::size() +{ + std::lock_guard lock(limbo_mutex_); + return shared_blocks_.size(); +} + +void NpuIPCSentDataDelete(void* ptr) +{ + std::unique_ptr sent_data( + static_cast(ptr)); + if (!NpuIPCGlobalEntities::alive) { + return; + } + if (sent_data->counter_value() > 0) { + npu_ipc_global_entities.NpuIPCSentDataLimbo_.add(std::move(sent_data)); + } + npu_ipc_global_entities.NpuIPCSentDataLimbo_.collect(); +} + +void ReturnRefCounter(const std::string& handle, uint64_t offset /* unused */) +{ + if (!NpuIPCGlobalEntities::alive) { + return; + } + std::lock_guard lock( + npu_ipc_global_entities.ref_counters_mutex_); + auto& map = npu_ipc_global_entities.ref_counters_files_; + auto it = map.find(handle); + if (it != map.end()) { + it->second->return_offset(offset); + if (it->second->offsets_in_use() == 0 && !it->second->have_offsets()) { + map.erase(handle); + } + } +} + +} // namespace + +NpuIPCSentData::NpuIPCSentData( + std::string handle, + uint64_t offset, + uint64_t* counter_ptr, + at::Device device) + : handle_(std::move(handle)), + offset_(offset), + counter_ptr_(counter_ptr), + device_(device) +{ + if (npu_ipc_global_entities.sync_events_used_.load() < + NPU_IPC_MAXIMUM_EVENTS_TO_USE) { + // NPU does not suppurt event_sync in IPC now. + } else { + auto stream = c10_npu::getCurrentNPUStream(device.index()); + c10_npu::stream_synchronize(stream); + event_ = nullptr; + event_sync_required_ = false; + } +} + +NpuIPCSentData::~NpuIPCSentData() +{ + ReturnRefCounter(handle_, offset_); + try { + if (event_sync_required_) { + // NPU does not suppurt event_sync in IPC now. + } + } catch (...) { /* No throw */ + } +} + +uint64_t NpuIPCSentData::counter_value() +{ + return *counter_ptr_; +} + +at::DataPtr GetNewRefCountedSentData(void* data, at::Device device) +{ + { + std::lock_guard lock( + npu_ipc_global_entities.ref_counters_mutex_); + if (!npu_ipc_global_entities.next_available_ref_counters_file_) { + std::string ref_counter_handle = at::NewProcessWideShmHandle(); + + int flags = + at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_EXCLUSIVE; + at::DataPtr sptr = at::RefcountedMapAllocator::makeDataPtr( + ref_counter_handle.c_str(), + flags, + sizeof(int64_t) * NPU_IPC_REF_COUNTER_FILE_SIZE, + nullptr); + auto rc = std::make_shared( + ref_counter_handle, NPU_IPC_REF_COUNTER_FILE_SIZE, std::move(sptr)); + npu_ipc_global_entities.ref_counters_files_[ref_counter_handle] = rc; + npu_ipc_global_entities.next_available_ref_counters_file_ = rc; + } + } + npu_ipc_global_entities.next_available_ref_counters_file_->set_counter(1); + auto sent_data = new NpuIPCSentData( + npu_ipc_global_entities.next_available_ref_counters_file_->handle(), + npu_ipc_global_entities.next_available_ref_counters_file_->get_offset(), + npu_ipc_global_entities.next_available_ref_counters_file_->counter_ptr(), + device); + + npu_ipc_global_entities.next_available_ref_counters_file_->rotate_offset(); + if (!npu_ipc_global_entities.next_available_ref_counters_file_ + ->have_offsets()) { + npu_ipc_global_entities.next_available_ref_counters_file_.reset(); + } + return at::DataPtr(data, sent_data, NpuIPCSentDataDelete, device); +} + +bool NpuIPCCollect() +{ + if (!NpuIPCGlobalEntities::alive) { + return true; + } + bool freed_memory = npu_ipc_global_entities.NpuIPCSentDataLimbo_.collect(); + if (npu_ipc_global_entities.NpuIPCSentDataLimbo_.size() == 0) { + 
npu_ipc_global_entities.safe_clean_current_file(); + } + return freed_memory; +} + +} // namespace ipc +} // namespace torch_npu + +namespace c10_npu { +namespace NPUCachingAllocator { + +REGISTER_FREE_MEMORY_CALLBACK("npu_ipc_collect", NpuIPCCollectCallback); + +} // namespace NPUCachingAllocator +} // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/ipc/NPUIPCTypes.h b/torch_npu/csrc/ipc/NPUIPCTypes.h new file mode 100644 index 0000000000000000000000000000000000000000..5156af2da429aae306f886e7366cd46a82376667 --- /dev/null +++ b/torch_npu/csrc/ipc/NPUIPCTypes.h @@ -0,0 +1,150 @@ +#pragma once +#include + +#include "torch_npu/csrc/core/npu/NPUMacros.h" +#include "torch_npu/csrc/core/npu/NPUFunctions.h" +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" + +namespace torch_npu { +namespace ipc { + +TORCH_NPU_API bool NpuIPCCollect(); + +struct NpuIPCReceivedData final { + NpuIPCReceivedData() = default; + explicit NpuIPCReceivedData(std::shared_ptr shared_ptr) + : shared_ptr_(std::move(shared_ptr)) {} + std::shared_ptr shared_ptr_; +}; + +struct NpuIPCSentData final { + std::string handle_; + uint64_t offset_; + uint64_t* counter_ptr_; // Reference counter shared memory block + at::DataPtr original_ptr_; // Original mem allocation + char* event_; // Sync event + bool event_sync_required_; + at::Device device_; + + NpuIPCSentData( + std::string handle, + uint64_t offset, + uint64_t* counter_ptr, + at::Device device); + ~NpuIPCSentData(); + + uint64_t counter_value(); + std::string handle() + { + return handle_; + } + uint64_t offset() + { + return offset_; + } + void set_original_ptr(at::DataPtr data_ptr) + { + original_ptr_ = std::move(data_ptr); + } +}; + +TORCH_NPU_API at::DataPtr GetNewRefCountedSentData( + void* data, + at::Device device); + +namespace { + +inline constexpr int64_t NPU_IPC_REF_COUNTER_FILE_SIZE = 10000; +inline constexpr int64_t NPU_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO = 1000; +inline constexpr int64_t NPU_IPC_MAXIMUM_EVENTS_TO_USE = 0; + +// All to be deleted data blocks with non zero reference counter goes there +struct NpuIPCSentDataLimbo final { + ~NpuIPCSentDataLimbo(); + bool collect(); + void add(std::unique_ptr shared_block); + uint64_t size(); + +private: + std::vector> shared_blocks_; + std::mutex limbo_mutex_; +}; + +struct NpuIPCRefCountersFile final { + NpuIPCRefCountersFile( + std::string handle, + uint64_t size, + at::DataPtr data_ptr) + : size_(size), + handle_(std::move(handle)), + refcounted_shared_mem_(std::move(data_ptr)) {} + + uint64_t* counter_ptr() + { + return static_cast(refcounted_shared_mem_.get()) + next_offset_; + } + + void set_counter(uint64_t value) + { + *counter_ptr() = value; + } + + bool have_offsets() + { + return next_offset_ < size_; + } + + bool offsets_in_use() + { + return used_slots_; + } + + uint64_t get_offset() + { + return next_offset_; + } + + void rotate_offset() + { + next_offset_++; + used_slots_++; + } + + void return_offset(uint64_t offset /* unused */) + { + used_slots_--; + } + + std::string handle() + { + return handle_; + } + +private: + uint64_t next_offset_{0}; + uint64_t size_; + uint64_t used_slots_{0}; + std::string handle_; + at::DataPtr refcounted_shared_mem_; +}; + +} // namespace +} // namespace ipc +} // namespace torch_npu + +namespace c10_npu { +namespace NPUCachingAllocator { +namespace { + +class NpuIPCCollectCallback : public FreeMemoryCallback { +public: + bool Execute() override + { + return 
torch_npu::ipc::NpuIPCCollect();
+    }
+};
+
+} // namespace
+} // namespace NPUCachingAllocator
+} // namespace c10_npu
\ No newline at end of file
diff --git a/torch_npu/csrc/ipc/StorageSharing.cpp b/torch_npu/csrc/ipc/StorageSharing.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..18fdd4c5e0722bcde2133239e3ccf9c0f9ad6ba0
--- /dev/null
+++ b/torch_npu/csrc/ipc/StorageSharing.cpp
@@ -0,0 +1,309 @@
+#ifndef BUILD_LIBTORCH
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "torch_npu/csrc/core/NPUStorageImpl.h"
+#include "torch_npu/csrc/core/NPUBridge.h"
+#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h"
+#include "torch_npu/csrc/core/npu/NPUGuard.h"
+#include "torch_npu/csrc/core/NPUStorageImpl.h"
+#include "torch_npu/csrc/framework/FormatHelper.h"
+
+#include "torch_npu/csrc/ipc/NPUIPCTypes.h"
+#include "torch_npu/csrc/ipc/StorageSharing.h"
+
+#include "third_party/acl/inc/acl/acl_base.h"
+#include "third_party/acl/inc/acl/acl_rt.h"
+
+namespace torch_npu {
+namespace reductions {
+
+static PyObject* THNPStorage_shareNpu(PyObject* self, PyObject* args)
+{
+    HANDLE_TH_ERRORS
+    const auto& storage = THPStorage_Unpack(args);
+    TORCH_CHECK(
+        storage.device_type() == at::DeviceType::PrivateUse1,
+        "_share_npu_: only available on NPU.", PTA_ERROR(ErrCode::PARAM));
+    c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl();
+
+    auto npu_storage_impl = static_cast<torch_npu::NPUStorageImpl*>(storage.unsafeGetStorageImpl());
+    auto format = npu_storage_impl->npu_desc_.npu_format_;
+    TORCH_CHECK(at_npu::native::FormatHelper::IsBaseFormatType(format),
+        "Trying to share a storage without base format",
+        PTA_ERROR(ErrCode::TYPE));
+
+    if (storage_impl->received_cuda()) {
+        AT_ERROR(
+            "Attempted to send NPU tensor received from another process; this is not currently supported. Consider cloning before sending.");
+    }
+
+    at::DeviceGuard device_guard(storage.device());
+    THPObjectPtr tuple(PyTuple_New(8));
+    THPObjectPtr device(THPUtils_packInt32(storage.device().index()));
+    THPObjectPtr _handle(Py_None);
+    Py_INCREF(Py_None);
+    THPObjectPtr size_bytes(THPUtils_packUInt64(storage.nbytes()));
+    THPObjectPtr _offset_bytes(THPUtils_packInt32(0));
+    THPObjectPtr _ref_counter(Py_None);
+    Py_INCREF(Py_None);
+    THPObjectPtr _ref_counter_offset(THPUtils_packInt32(0));
+    THPObjectPtr _event_handle(Py_None);
+    Py_INCREF(Py_None);
+    THPObjectPtr _event_sync_required(Py_None);
+    Py_INCREF(Py_None);
+    if (storage.data()) {
+        auto shandle = c10_npu::NPUCachingAllocator::shareIpcHandle(storage.mutable_data());
+        _handle = PyBytes_FromStringAndSize(
+            shandle.handle.c_str(), (Py_ssize_t)shandle.handle.size());
+        _offset_bytes = PyLong_FromSsize_t((Py_ssize_t)shandle.offset);
+
+        at::DataPtr sent_data_ptr = torch_npu::ipc::GetNewRefCountedSentData(
+            storage.mutable_data(), storage.device());
+        auto old_data_ptr = storage.set_data_ptr(std::move(sent_data_ptr));
+        auto sent_data =
+            static_cast<torch_npu::ipc::NpuIPCSentData*>(storage.data_ptr().get_context());
+        sent_data->set_original_ptr(std::move(old_data_ptr));
+        _ref_counter = PyBytes_FromString((sent_data->handle()).c_str());
+        _ref_counter_offset = THPUtils_packUInt64(sent_data->offset());
+
+        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+        aclrtNotify ipc_event_handle;
+
+        if (sent_data->event_sync_required_) {
+            // NPU does not support event_sync in IPC now.
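            // NPU_IPC_MAXIMUM_EVENTS_TO_USE is 0, so NpuIPCSentData's constructor always takes the
            // stream-synchronize fallback and leaves event_sync_required_ false; this branch is
            // currently dead, and the event handle packed below appears to exist only to keep the
            // 8-item tuple layout aligned with the CUDA sharing protocol.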
+ } + + _event_handle = PyBytes_FromStringAndSize( + (char*)&ipc_event_handle, sizeof(aclrtNotify)); + _event_sync_required = PyBool_FromLong(sent_data->event_sync_required_); + } + + if (!tuple || !device || !_handle || !size_bytes || !_offset_bytes || + !_event_handle) { + return nullptr; + } + PyTuple_SET_ITEM(tuple.get(), 0, device.release()); + PyTuple_SET_ITEM(tuple.get(), 1, _handle.release()); + // Size(in bytes) of the real storage, note this is not the size of basePtr + // memory block. + PyTuple_SET_ITEM(tuple.get(), 2, size_bytes.release()); + // Offset(in bytes) of the real storage in the basePtr memory block. + // NB: this offset MUST be in bytes instead of numel, since we use + // (storage_handle, offset) + // as key in shared_cache(multiprocessing/reduction.py). + // Offset in numel cannot uniquely represent a storage. + PyTuple_SET_ITEM(tuple.get(), 3, _offset_bytes.release()); + PyTuple_SET_ITEM(tuple.get(), 4, _ref_counter.release()); + PyTuple_SET_ITEM(tuple.get(), 5, _ref_counter_offset.release()); + PyTuple_SET_ITEM(tuple.get(), 6, _event_handle.release()); + PyTuple_SET_ITEM(tuple.get(), 7, _event_sync_required.release()); + return tuple.release(); + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPStorage_releaseIPCCounter(PyObject* _unused, PyObject* args) +{ + HANDLE_TH_ERRORS + TORCH_CHECK(PyTuple_GET_SIZE(args) == 2, "tuple of 2 items expected", PTA_ERROR(ErrCode::PARAM)); + + PyObject* _ref_counter = PyTuple_GET_ITEM(args, 0); + PyObject* _ref_counter_offset = PyTuple_GET_ITEM(args, 1); + if (!(PyBytes_Check(_ref_counter) && THPUtils_checkLong(_ref_counter_offset))) { + THPUtils_invalidArguments( + args, + nullptr, + "_release_ipc_counter in NPU mode", + 1, + "(bytes _ref_counter, int _ref_counter_offset)"); + return nullptr; + } + std::string ref_counter_handle = PyBytes_AS_STRING(_ref_counter); + ptrdiff_t ref_counter_offset = + (ptrdiff_t)THPUtils_unpackLong(_ref_counter_offset); + // We don't want to break existing code, so resource deletion is best + // effort basis. Exception expected if producer process terminated + // before consumer released data. 
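    // Re-open the producer's ref-counter file (ALLOCATOR_MAPPED_NOCREATE, so it must already
    // exist) and decrement the slot at ref_counter_offset; if the producer exited and removed
    // the file first, makeDataPtr throws c10::Error and the decrement is skipped.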
+ int flags = at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_NOCREATE; + try { + auto sptr = at::RefcountedMapAllocator::makeDataPtr( + ref_counter_handle.c_str(), + flags, + sizeof(int64_t) * torch_npu::ipc::NPU_IPC_REF_COUNTER_FILE_SIZE, + nullptr); + *(static_cast(sptr.get()) + ref_counter_offset) -= 1; + } catch (c10::Error& err) { + // Already warned inside of producer process + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static std::string THNPStorage_bytesAsHandleString(PyObject* handle) +{ + HANDLE_TH_ERRORS + char* buffer = nullptr; + Py_ssize_t handle_size = 0; + if (PyBytes_AsStringAndSize(handle, &buffer, &handle_size) == -1) { + TORCH_CHECK(handle_size == ACL_IPC_HANDLE_SIZE, "incorrect handle", PTA_ERROR(ErrCode::PARAM)); + } + return std::string(buffer, handle_size); + END_HANDLE_TH_ERRORS_RET("") +} + +static PyObject* THNPStorage_newSharedNpu(PyObject* _unused, PyObject* args) +{ + HANDLE_TH_ERRORS + TORCH_CHECK(PyTuple_GET_SIZE(args) == 8, "tuple of 8 items expected", PTA_ERROR(ErrCode::PARAM)); + PyObject* _device = PyTuple_GET_ITEM(args, 0); + PyObject* _handle = PyTuple_GET_ITEM(args, 1); + PyObject* _size_bytes = PyTuple_GET_ITEM(args, 2); + PyObject* _offset_bytes = PyTuple_GET_ITEM(args, 3); + PyObject* _ref_counter = PyTuple_GET_ITEM(args, 4); + PyObject* _ref_counter_offset = PyTuple_GET_ITEM(args, 5); + PyObject* _event_handle = PyTuple_GET_ITEM(args, 6); + PyObject* _event_sync_required = PyTuple_GET_ITEM(args, 7); + if (!(THPUtils_checkLong(_device) && THPUtils_checkLong(_size_bytes) && + PyBytes_Check(_handle) && PyBytes_Check(_ref_counter) && + PyBytes_Check(_event_handle) && THPUtils_checkLong(_offset_bytes) && + THPUtils_checkLong(_ref_counter_offset) && + PyBool_Check(_event_sync_required))) { + THPUtils_invalidArguments( + args, + nullptr, + "_new_shared in NPU mode", + 1, + "(int device, bytes handle, int storage_size_bytes, int storage_offset_bytes, bytes _ref_counter, int _ref_counter_offset, bytes event_handle, bool event_sync_required)"); + return nullptr; + } + + size_t storage_size = + (size_t)THPUtils_unpackLong(_size_bytes) / sizeof(uint8_t); + ptrdiff_t storage_offset_bytes = + (ptrdiff_t)THPUtils_unpackLong(_offset_bytes); + + const auto device = c10::checked_convert( + THPUtils_unpackLong(_device), "c10::DeviceIndex"); + c10_npu::NPUGuard device_guard(device); + + if (PyObject_IsTrue(_event_sync_required)) { + // TO BE DONE + } + + std::string s_handle = THNPStorage_bytesAsHandleString(_handle); + if (s_handle.empty()) { + return nullptr; + } + std::shared_ptr basePtr = + c10_npu::NPUCachingAllocator::getIpcDevPtr(s_handle); + + // Offset the basePtr to reconstruct the real storage + // devPtr = basePtr + storage_offset + void* devPtr = basePtr.get(); + devPtr = (char*)devPtr + storage_offset_bytes; + + std::string ref_counter_handle = PyBytes_AS_STRING(_ref_counter); + ptrdiff_t ref_counter_offset = + (ptrdiff_t)THPUtils_unpackLong(_ref_counter_offset); + + struct IpcDeleterContext { + std::string ref_counter_handle; + ptrdiff_t ref_counter_offset; + int64_t device; + torch_npu::ipc::NpuIPCReceivedData received_data; + }; + + auto ctx = std::make_unique(); + ctx->ref_counter_handle = std::move(ref_counter_handle); + ctx->ref_counter_offset = ref_counter_offset; + ctx->device = device; + ctx->received_data.shared_ptr_ = std::move(basePtr); + + auto cur_device = c10_npu::current_device(); + c10::DataPtr data_ptr( + devPtr, + ctx.release(), + +[](void* ctx_) { + std::unique_ptr ctx( + static_cast(ctx_)); + + 
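            // Consumer-side deleter: drop the reference to the mapped IPC base allocation, then
            // synchronize the consumer's current stream so queued work on the shared buffer has
            // finished before the producer's ref-counter slot is decremented below.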
ctx->received_data.shared_ptr_.reset(); + + try { + c10_npu::stream_synchronize( + c10_npu::getCurrentNPUStream(ctx->device)); + } catch (c10::Error& err) { + // Already warned inside of producer process + } + + int flags = + at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_NOCREATE; + try { + auto sptr = at::RefcountedMapAllocator::makeDataPtr( + ctx->ref_counter_handle.c_str(), + flags, + sizeof(int64_t) * torch_npu::ipc::NPU_IPC_REF_COUNTER_FILE_SIZE, + nullptr); + *(static_cast(sptr.get()) + ctx->ref_counter_offset) -= 1; + } catch (c10::Error& err) { + // Already warned inside of producer process + } + }, + at::Device(at::DeviceType::PrivateUse1, cur_device)); + + c10::intrusive_ptr base = c10::make_intrusive( + c10::StorageImpl::use_byte_size_t(), + storage_size, + std::move(data_ptr), + nullptr, + false); + + base->set_resizable(false); + base->set_received_cuda(true); + + return THPStorage_NewWithStorage( + THPStorageClass, + std::move(base), + c10::impl::PyInterpreterStatus::TAGGED_BY_US); + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPStorage_isShared(PyObject* self, PyObject* arg) +{ + const auto& storage = THPStorage_Unpack(self); + if (storage.device_type() == at::kPrivateUse1) { + Py_RETURN_TRUE; + } + if (at::MapAllocator::fromDataPtr(storage.data_ptr()) || + THManagedMapAllocator::fromDataPtr(storage.data_ptr())) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } +} + +static struct PyMethodDef TorchReductionsMethods[] = { + {"_share_npu_", THNPStorage_shareNpu, METH_O, nullptr}, + {"_release_ipc_counter_npu", THNPStorage_releaseIPCCounter, METH_VARARGS, nullptr}, + {"_new_shared_npu", THNPStorage_newSharedNpu, METH_VARARGS, nullptr}, + {"_is_shared", THNPStorage_isShared, METH_O, nullptr}, + {nullptr, nullptr, 0, nullptr}, +}; + +PyMethodDef* reductions_functions() +{ + return TorchReductionsMethods; +} + +} // namespace reductions +} // namespace torch_npu + +#endif \ No newline at end of file diff --git a/torch_npu/csrc/ipc/StorageSharing.h b/torch_npu/csrc/ipc/StorageSharing.h new file mode 100644 index 0000000000000000000000000000000000000000..a38e0c0ad68248ecf542a65e5d3f5bc14cff5903 --- /dev/null +++ b/torch_npu/csrc/ipc/StorageSharing.h @@ -0,0 +1,15 @@ +#ifndef BUILD_LIBTORCH +#pragma once + +#include +#include "torch_npu/csrc/core/npu/NPUMacros.h" + +namespace torch_npu { +namespace reductions { + +TORCH_NPU_API PyMethodDef* reductions_functions(); + +} // namespace reductions +} // namespace torch_npu + +#endif \ No newline at end of file diff --git a/torch_npu/csrc/logging/Logger.cpp b/torch_npu/csrc/logging/Logger.cpp index eaab8bc004e588f27ad9f8f022dc3c6d72ff7611..a527b4b4f2aa0491d0f2d3f74030ec8ba48be81e 100644 --- a/torch_npu/csrc/logging/Logger.cpp +++ b/torch_npu/csrc/logging/Logger.cpp @@ -8,6 +8,8 @@ #include "torch_npu/csrc/core/npu/register/OptionsManager.h" namespace npu_logging { +static const int BASE_PRINT_LIMIT = 1024; +static const int LONG_PRINT_LIMIT = 4096; static std::unordered_map LoggingLevelNames = { {LoggingLevel::DEBUG, "DEBUG"}, @@ -37,9 +39,8 @@ std::string Logger::getQName() return qname_; } -void Logger::log(LoggingLevel level, const char* format, va_list args) +void Logger::log(LoggingLevel level, const int log_buffer_size, const char* format, va_list args) { - const int log_buffer_size = 1024; char buffer[log_buffer_size] = {0}; int ret = vsnprintf(buffer, log_buffer_size, format, args); @@ -75,7 +76,7 @@ void Logger::debug(const char* format, ...) 
} va_list args; va_start(args, format); - log(LoggingLevel::DEBUG, format, args); + log(LoggingLevel::DEBUG, BASE_PRINT_LIMIT, format, args); va_end(args); } @@ -86,7 +87,7 @@ void Logger::info(const char* format, ...) } va_list args; va_start(args, format); - log(LoggingLevel::INFO, format, args); + log(LoggingLevel::INFO, BASE_PRINT_LIMIT, format, args); va_end(args); } @@ -97,7 +98,7 @@ void Logger::warn(const char* format, ...) } va_list args; va_start(args, format); - log(LoggingLevel::WARNING, format, args); + log(LoggingLevel::WARNING, BASE_PRINT_LIMIT, format, args); va_end(args); } @@ -108,7 +109,7 @@ void Logger::error(const char* format, ...) } va_list args; va_start(args, format); - log(LoggingLevel::ERROR, format, args); + log(LoggingLevel::ERROR, BASE_PRINT_LIMIT, format, args); va_end(args); } @@ -119,7 +120,62 @@ void Logger::critical(const char* format, ...) } va_list args; va_start(args, format); - log(LoggingLevel::CRITICAL, format, args); + log(LoggingLevel::CRITICAL, BASE_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_debug(const char* format, ...) +{ + if (allow_level_ > LoggingLevel::DEBUG) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::DEBUG, LONG_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_info(const char* format, ...) +{ + if (allow_level_ > LoggingLevel::INFO) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::INFO, LONG_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_warn(const char* format, ...) +{ + if (allow_level_ > LoggingLevel::WARNING) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::WARNING, LONG_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_error(const char* format, ...) +{ + if (allow_level_ > LoggingLevel::ERROR) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::ERROR, LONG_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_critical(const char* format, ...) 
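// The long_* variants mirror debug()/info()/warn()/error()/critical() but format into the
// 4096-byte LONG_PRINT_LIMIT buffer rather than the 1024-byte BASE_PRINT_LIMIT, so longer
// messages are not truncated by vsnprintf.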
+{ + if (allow_level_ > LoggingLevel::CRITICAL) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::CRITICAL, LONG_PRINT_LIMIT, format, args); va_end(args); } diff --git a/torch_npu/csrc/logging/Logger.h b/torch_npu/csrc/logging/Logger.h index 1734a7c7bebbf574860c4675bee52ec039ce3d16..7e76af5013e564cad671323e0010493e1f96d5b1 100644 --- a/torch_npu/csrc/logging/Logger.h +++ b/torch_npu/csrc/logging/Logger.h @@ -29,9 +29,14 @@ public: void warn(const char* format, ...); void error(const char* format, ...); void critical(const char* format, ...); + void long_debug(const char* format, ...); + void long_info(const char* format, ...); + void long_warn(const char* format, ...); + void long_error(const char* format, ...); + void long_critical(const char* format, ...); private: - void log(LoggingLevel level, const char* format, va_list args); + void log(LoggingLevel level, const int log_buffer_size, const char* format, va_list args); LoggingLevel allow_level_ = LoggingLevel::WARNING; std::string name_; diff --git a/torch_npu/csrc/npu/DataParallelComm.cpp b/torch_npu/csrc/npu/DataParallelComm.cpp index db0d3efabefc96ca39c8bcaad354ed07b159bd38..c744e1e1baf961dbfa42de031c4c371c9be22672 100644 --- a/torch_npu/csrc/npu/DataParallelComm.cpp +++ b/torch_npu/csrc/npu/DataParallelComm.cpp @@ -137,7 +137,7 @@ void check_inputs(TensorList inputs, TensorList outputs, int input_multiplier, i { // need to check len(inputs) == len(outputs) size_t len = inputs.size(); - if (len <= 0) { + if (len == 0) { throw std::runtime_error("input sequence can't be empty" + PTA_ERROR(ErrCode::PARAM)); } diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index df89f0a8a803473204d20677258d140e19c8b5ca..040e4754678597ae89ba1776919184cca6d058a6 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -27,6 +27,8 @@ #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/NPUQueue.h" #include "torch_npu/csrc/core/npu/NPUAffinityController.h" +#include "torch_npu/csrc/core/npu/NPUPeerToPeerAccess.h" +#include "torch_npu/csrc/core/npu/NPUIPCPidManager.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -275,6 +277,24 @@ void RegisterNpuPluggableAllocator(PyObject* module) std::function func = reinterpret_cast(func_ptr); self.set_erase_stream_fn(func); + }) + .def( + "set_get_device_stats_fn", + [](torch::npu::NPUPluggableAllocator::NPUPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType=c10_npu::NPUCachingAllocator::DeviceStats(int); + std::function func = + reinterpret_cast(func_ptr); + self.set_get_device_stats_fn(func); + }) + .def( + "set_reset_peak_status_fn", + [](torch::npu::NPUPluggableAllocator::NPUPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType = void(int); + std::function func = + reinterpret_cast(func_ptr); + self.set_reset_peak_status_fn(func); }); m.def( @@ -879,7 +899,13 @@ PyObject *THNPModule_is_jit_compile_false_wrap(PyObject *self, PyObject *noargs) if (option_value.has_value() && (option_value.value() == "disable")) { Py_RETURN_TRUE; } else { - Py_RETURN_FALSE; + static const std::string jit_compile_init_option_name = "jitCompileInit"; + auto init_option_value = c10_npu::option::GetOption(jit_compile_init_option_name); + if (init_option_value.has_value() && (init_option_value.value() == "disable")) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } } END_HANDLE_TH_ERRORS } @@ 
-1213,7 +1239,8 @@ PyObject* THNPModule_npuCachingAllocator_raw_alloc(PyObject *_unused, PyObject * END_HANDLE_TH_ERRORS } -PyObject* THNPModule_npuCachingAllocator_raw_delete(PyObject *_unused, PyObject *obj) { +PyObject* THNPModule_npuCachingAllocator_raw_delete(PyObject *_unused, PyObject *obj) +{ HANDLE_TH_ERRORS void* mem_ptr = PyLong_AsVoidPtr(obj); c10_npu::NPUCachingAllocator::raw_delete(mem_ptr); @@ -1265,7 +1292,8 @@ PyObject* THNPModule_npuUnlockMutex(PyObject *module, PyObject *noargs) Py_RETURN_NONE; } -PyObject* THNPModule_initDump(PyObject* _unused, PyObject* noargs) { +PyObject* THNPModule_initDump(PyObject* _unused, PyObject* noargs) +{ HANDLE_TH_ERRORS pybind11::gil_scoped_release no_gil; NPU_CHECK_ERROR_WITHOUT_UCE(aclmdlInitDump()); @@ -1553,6 +1581,15 @@ PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* args) } else { c10_npu::SetThreadAffinity(core_start, core_end); } + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THNPModule_npu_reset_thread_affinity(PyObject* self, PyObject* noargs) +{ + HANDLE_TH_ERRORS + c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); Py_RETURN_NONE; END_HANDLE_TH_ERRORS } @@ -1626,6 +1663,87 @@ static PyObject* THNPModule_is_gte_cann_version(PyObject* self, PyObject *args) END_HANDLE_TH_ERRORS } +static PyObject* THNPModule_add_ipc_pid(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + int pid; + if (!PyArg_ParseTuple(args, "i", &pid)) { + throw torch::TypeError("Pybind failed to parse parameters." + PTA_ERROR(ErrCode::TYPE)); + } + torch_npu::ipc::addPid(pid); + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPModule_get_ipc_pid(PyObject* self, PyObject *noargs) +{ + HANDLE_TH_ERRORS + int32_t pid; + NPU_CHECK_ERROR(c10_npu::acl::AclrtDeviceGetBareTgid(&pid)); + return THPUtils_packInt32(pid); + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPModule_add_p2p_access(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + int src_dev; + int dst_dev; + if (!PyArg_ParseTuple(args, "ii", &src_dev, &dst_dev)) { + throw torch::TypeError("Pybind failed to parse parameters." + PTA_ERROR(ErrCode::TYPE)); + } + bool warning_flag = false; + at_npu::native::NpuP2pCtrl::get_instance().get_p2p_access(src_dev, dst_dev, warning_flag); + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPModule_set_device_res_limit(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + PyObject* device = nullptr; + PyObject* type = nullptr; + PyObject* value = nullptr; + + if (!PyArg_ParseTuple(args, "OOO", &device, &type, &value)) { + throw torch::TypeError("Pybind failed to parse parameters." + + PTA_ERROR(ErrCode::TYPE)); + } + int32_t device_ = THPUtils_unpackLong(device); + int32_t type_ = THPUtils_unpackLong(type); + uint32_t value_ = static_cast(THPUtils_unpackUInt32(value)); + c10_npu::SetDeviceResLimit(device_, type_, value_); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPModule_get_device_res_limit(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + PyObject* device = nullptr; + PyObject* type = nullptr; + + if (!PyArg_ParseTuple(args, "OO", &device, &type)) { + throw torch::TypeError("Pybind failed to parse parameters." 
+ + PTA_ERROR(ErrCode::TYPE)); + } + int32_t device_ = THPUtils_unpackLong(device); + int32_t type_ = THPUtils_unpackLong(type); + uint32_t value = c10_npu::GetDeviceResLimit(device_, type_); + return PyLong_FromUnsignedLong(value); + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPModule_reset_device_res_limit(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + int32_t device = THPUtils_unpackLong(args); + c10_npu::ResetDeviceResLimit(device); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} static struct PyMethodDef THNPModule_methods[] = { {"_npu_init", (PyCFunction)THNPModule_initExtension, METH_NOARGS, nullptr}, @@ -1681,12 +1799,19 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_set_module_train_state", (PyCFunction)THNPModule_npu_set_module_train_state, METH_O, nullptr}, {"_get_silent_check_version", (PyCFunction)THNPModule_npu_get_silent_check_version, METH_NOARGS, nullptr}, {"_npu_set_thread_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_VARARGS, nullptr}, + {"_npu_reset_thread_affinity", (PyCFunction)THNPModule_npu_reset_thread_affinity, METH_NOARGS, nullptr}, {"_npu_set_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_set_fft_plan_cache_max_size, METH_VARARGS, nullptr}, {"_npu_get_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_max_size, METH_NOARGS, nullptr}, {"_npu_get_fft_plan_cache_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_size, METH_NOARGS, nullptr}, {"_npu_clear_fft_plan_cache", (PyCFunction)THNPModule_npu_clear_fft_plan_cache, METH_NOARGS, nullptr}, {"_get_cann_version", (PyCFunction)THNPModule_get_cann_version, METH_O, nullptr}, {"_is_gte_cann_version", (PyCFunction)THNPModule_is_gte_cann_version, METH_VARARGS, nullptr}, + {"_add_ipc_pid", (PyCFunction)THNPModule_add_ipc_pid, METH_VARARGS, nullptr}, + {"_get_ipc_pid", (PyCFunction)THNPModule_get_ipc_pid, METH_NOARGS, nullptr}, + {"_add_p2p_access", (PyCFunction)THNPModule_add_p2p_access, METH_VARARGS, nullptr}, + {"_npu_get_device_res_limit", (PyCFunction)THNPModule_get_device_res_limit, METH_VARARGS, nullptr}, + {"_npu_set_device_res_limit", (PyCFunction)THNPModule_set_device_res_limit, METH_VARARGS, nullptr}, + {"_npu_reset_device_res_limit", (PyCFunction)THNPModule_reset_device_res_limit, METH_O, nullptr}, {nullptr}}; TORCH_NPU_API PyMethodDef* THNPModule_get_methods() diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp index e8e0fd3eeffaebecc6d11e73de73e49f13af7668..14ea0ce7e73dbe0b18c255b8678c3a23ad44c5bc 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp @@ -74,6 +74,18 @@ void NPUPluggableAllocator::set_erase_stream_fn( erase_stream_fn_ = std::move(erase_stream_fn); } +void NPUPluggableAllocator::set_get_device_stats_fn( + std::function get_device_stats_fn) +{ + get_device_stats_fn_ = std::move(get_device_stats_fn); +} + +void NPUPluggableAllocator::set_reset_peak_status_fn( + std::function reset_peak_status_fn) +{ + reset_peak_status_fn_ = std::move(reset_peak_status_fn); +} + void* NPUPluggableAllocator::malloc( size_t size, int device, @@ -212,8 +224,11 @@ void NPUPluggableAllocator::eraseStream( c10_npu::NPUCachingAllocator::DeviceStats NPUPluggableAllocator::getDeviceStats(int device) { - TORCH_NPU_WARN("NPUPluggableAllocator does not yet support getDeviceStats. 
" - "If you need it, please file an issue describing your use case."); + if (get_device_stats_fn_) { + return get_device_stats_fn_(device); + } else { + TORCH_NPU_WARN("get_device_stats_fn_ is not define, please set by set_get_device_stats_fn"); + } } void NPUPluggableAllocator::resetAccumulatedStats(int device) @@ -224,8 +239,11 @@ void NPUPluggableAllocator::resetAccumulatedStats(int device) void NPUPluggableAllocator::resetPeakStats(int device) { - TORCH_NPU_WARN("NPUPluggableAllocator does not yet support resetPeakStats. " - "If you need it, please file an issue describing your use case."); + if (reset_peak_status_fn_) { + reset_peak_status_fn_(device); + } else { + TORCH_NPU_WARN("reset_peak_status_fn_ is not define, please set by set_reset_peak_status_fn"); + } } c10_npu::NPUCachingAllocator::SnapshotInfo NPUPluggableAllocator::snapshot() @@ -282,6 +300,24 @@ void NPUPluggableAllocator::copy_data(void* dest, const void* src, std::size_t c { default_copy_data(dest, src, count); } + +std::shared_ptr NPUPluggableAllocator::getIpcDevPtr(std::string handle) +{ + TORCH_NPU_WARN( + "NPUPluggableAllocator does not yet support getIpcDevPtr. " + "If you need it, please file an issue describing your use case."); + auto sp = std::shared_ptr(); + return sp; +} + +c10_npu::NPUCachingAllocator::ShareableHandle NPUPluggableAllocator::shareIpcHandle(void* ptr) +{ + TORCH_NPU_WARN( + "NPUPluggableAllocator does not yet support shareIPcHandle. " + "If you need it, please file an issue describing your use case."); + return c10_npu::NPUCachingAllocator::ShareableHandle{0, nullptr}; +} + void NPUPluggableAllocator::recordHistory( bool enabled, c10_npu::NPUCachingAllocator::CreateContextFn context_recorder, diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h index 3a71319f3c7c4f79bd208206f1543947e64b9b1e..a3691d48eefbaf3743f5ce29a304a0dab3560151 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.h +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h @@ -45,6 +45,8 @@ struct NPUPluggableAllocator std::function record_stream_fn); void set_erase_stream_fn( std::function erase_stream_fn); + void set_get_device_stats_fn(std::function get_device_stats_fn); + void set_reset_peak_status_fn(std::function reset_peak_status_fn); void* malloc(size_t size, int device, aclrtStream stream); c10::DataPtr allocate(size_t size) override; @@ -81,6 +83,8 @@ struct NPUPluggableAllocator void FreeDeviceCachedMemory(int device) override; std::string name() override; void copy_data(void* dest, const void* src, std::size_t count) const final; + std::shared_ptr getIpcDevPtr(std::string handle) override; + c10_npu::NPUCachingAllocator::ShareableHandle shareIpcHandle(void*) override; void recordHistory( bool enabled, c10_npu::NPUCachingAllocator::CreateContextFn context_recorder, @@ -108,6 +112,8 @@ protected: std::function base_alloc_fn_; std::function record_stream_fn_; std::function erase_stream_fn_; + std::function get_device_stats_fn_; + std::function reset_peak_status_fn_; std::mutex allocator_mutex_; // We do the bookeeping here in order to simplify custom allocators std::unordered_map allocation_metadata_; diff --git a/torch_npu/csrc/npu/memory_snapshot.cpp b/torch_npu/csrc/npu/memory_snapshot.cpp index 47fbf4de6cf5916a4713f9dde961e80fc89c8f74..cc893243a76fc8dd05d60b13e78fe429d7435dcf 100644 --- a/torch_npu/csrc/npu/memory_snapshot.cpp +++ b/torch_npu/csrc/npu/memory_snapshot.cpp @@ -16,7 +16,11 @@ namespace torch_npu { std::shared_ptr gather() { +#if defined(__x86_64__) return 
torch::CapturedTraceback::gather(true, true, false); +#else + return torch_npu::CapturedTraceback::gather(true, true, false); +#endif } std::shared_ptr gather_with_cpp() diff --git a/torch_npu/csrc/profiler/profiler_python.cpp b/torch_npu/csrc/profiler/profiler_python.cpp index 45ccf8f1b28155e984ad5d17f6e00acb4f18771c..571fb57bbce2c18e24ac6cd26aaedcd656968bd1 100644 --- a/torch_npu/csrc/profiler/profiler_python.cpp +++ b/torch_npu/csrc/profiler/profiler_python.cpp @@ -36,19 +36,6 @@ using TensorMetadata = torch_npu::toolkit::profiler::TensorMetadata; using ModuleParam = torch_npu::toolkit::profiler::ModuleParam; using OptimizerParam = torch_npu::toolkit::profiler::OptimizerParam; -std::string trimPrefix(std::string s) -{ - static std::vector prefixes = py::module::import("torch.profiler.python_tracer") - .attr("_prefix_regex")().cast>(); - for (const auto& p : prefixes) { - if (s.compare(0, p.size(), p) == 0) { - s.erase(0, p.size()); - return s; - } - } - return s; -} - std::vector getInterpreterThreads(PyInterpreterState* interpreter) { pybind11::gil_scoped_acquire gil; @@ -240,6 +227,7 @@ private: void reportTraceData(); void reportHashData(); void reportParamData(); + std::string trimPrefix(std::string s); private: std::atomic active_{false}; @@ -248,6 +236,7 @@ private: std::deque thread_local_results_; PyObject* module_call_code_{nullptr}; PyObject* optimizer_call_code_{nullptr}; + std::vector func_name_prefixes_; std::unordered_map py_call_cache_; std::unordered_map pyc_call_cache_; std::unordered_map module_info_cache_; @@ -277,6 +266,9 @@ PythonTracer::PythonTracer() : active_(false) .attr("_optimizer_step_code") .attr("__code__") .ptr(); + func_name_prefixes_ = py::module::import("torch.profiler.python_tracer") + .attr("_prefix_regex")() + .cast>(); } void PythonTracer::start(size_t max_threads) @@ -383,6 +375,17 @@ void PythonTracer::clear() interpreter_ = nullptr; } +std::string PythonTracer::trimPrefix(std::string s) +{ + for (const auto& p : func_name_prefixes_) { + if (s.compare(0, p.size(), p) == 0) { + s.erase(0, p.size()); + return s; + } + } + return s; +} + void PythonTracer::reportTraceData() { if (events_.size() > 0) { @@ -402,7 +405,7 @@ void PythonTracer::reportHashData() hash_data.resize(py_call_cache_.size() + pyc_call_cache_.size() + module_info_cache_.size() + 1); size_t idx = 0; for (auto& item : py_call_cache_) { - hash_data[idx++] = std::make_pair(item.first, trimPrefix(item.second.get_name())); + hash_data[idx++] = std::make_pair(item.first, trimPrefix(std::move(item.second.get_name()))); } for (auto& item : pyc_call_cache_) { hash_data[idx++] = std::make_pair(item.first, std::string(item.second.str())); diff --git a/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h b/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h index 8d2629a2a6f859f227313cd84742db563b4c8859..b8065251c54c08dab47b63ba825b439be7fa4a5a 100644 --- a/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h +++ b/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h @@ -344,8 +344,8 @@ struct MemoryData : BaseReportData { uint64_t thread_id{ 0 }; uint64_t process_id{ 0 }; MemoryData(int64_t ptr, int64_t time_ns, int64_t alloc_size, int64_t total_allocated, int64_t total_reserved, - int64_t total_active, int64_t stream_ptr, int8_t device_type, int8_t device_index, uint8_t data_type, - uint8_t component_type, uint8_t allocator_type, uint64_t thread_id, uint64_t process_id) + int64_t total_active, int64_t stream_ptr, int8_t device_type, int8_t device_index, uint8_t component_type, + uint8_t 
data_type, uint8_t allocator_type, uint64_t thread_id, uint64_t process_id) : BaseReportData(0, "torch.memory_usage"), ptr(ptr), time_ns(time_ns), diff --git a/torch_npu/csrc/utils/TensorType.cpp b/torch_npu/csrc/utils/TensorType.cpp index aeb6fd8b832e96d39fc8b8cd0724ddbdcf9125b9..e6998f57eaad6ac9368366847593d66c8d56658e 100644 --- a/torch_npu/csrc/utils/TensorType.cpp +++ b/torch_npu/csrc/utils/TensorType.cpp @@ -6,7 +6,6 @@ namespace torch_npu { namespace utils { - using namespace at; using namespace torch::autograd; @@ -15,14 +14,13 @@ std::vector> all_declared_types_npu() std::vector> ret; // can't easily iterate over enum classes, does not support BFloat16 now std::vector backends = { c10::Backend::PrivateUse1 }; - std::vector scalar_types = { - ScalarType::Byte, ScalarType::Char, ScalarType::Double, ScalarType::Float, - ScalarType::Int, ScalarType::Long, ScalarType::Short, ScalarType::Half, - ScalarType::Bool, ScalarType::BFloat16 - }; - - for (auto& backend : backends) { - for (auto& scalar_type : scalar_types) { + std::vector scalar_types = { ScalarType::Byte, ScalarType::Char, ScalarType::Double, + ScalarType::Float, ScalarType::Int, ScalarType::Long, + ScalarType::Short, ScalarType::Half, ScalarType::Bool, + ScalarType::BFloat16 }; + + for (auto &backend : backends) { + for (auto &scalar_type : scalar_types) { ret.emplace_back(std::make_pair(backend, scalar_type)); } } @@ -32,8 +30,8 @@ std::vector> all_declared_types_npu() struct PyTensorType { PyTypeObject py_type; - THPDtype* dtype; - THPLayout* layout; + THPDtype *dtype; + THPLayout *layout; bool is_npu; char name[64]; int backend; @@ -57,73 +55,67 @@ struct PyTensorType { static_assert(std::is_standard_layout::value, "PyTensorType must be standard layout"); -static void py_bind_tensor_types(const std::vector& tensor_types); +static void py_bind_tensor_types(const std::vector &tensor_types); -static PyObject* Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) +static PyObject *Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) { HANDLE_TH_ERRORS - auto& tensor_type = *((PyTensorType*)type); + auto &tensor_type = *((PyTensorType *)type); if (tensor_type.is_npu) { - TORCH_NPU_WARN_ONCE( - "Warning: The torch.npu.*DtypeTensor constructors are no longer recommended. " + TORCH_NPU_WARN_ONCE("Warning: The torch.npu.*DtypeTensor constructors are no longer recommended. " "It's best to use methods such as torch.tensor(data, dtype=*, device='npu') " "to create tensors."); } - TORCH_CHECK_TYPE( - !tensor_type.is_npu || c10_npu::device_count() != 0, - "type ", - tensor_type.name, + TORCH_CHECK_TYPE(!tensor_type.is_npu || c10_npu::device_count() != 0, "type ", tensor_type.name, " not available. 
Torch not compiled with npu enabled.", PTA_ERROR(ErrCode::TYPE)) torch_npu::utils::npu_lazy_init(); - return THPVariable_Wrap(torch::utils::legacy_tensor_ctor(tensor_type.get_dispatch_key(), - tensor_type.get_scalar_type(), - args, - kwargs)); + return THPVariable_Wrap( + torch::utils::legacy_tensor_ctor(tensor_type.get_dispatch_key(), tensor_type.get_scalar_type(), args, kwargs)); END_HANDLE_TH_ERRORS } -static PyObject* Tensor_instancecheck(PyObject* _self, PyObject* arg) +static PyObject *Tensor_instancecheck(PyObject *_self, PyObject *arg) { - HANDLE_TH_ERRORS - auto self = (PyTensorType*)_self; - if (THPVariable_Check(arg)) { - const auto& var = THPVariable_Unpack(arg); - - if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() && - var.scalar_type() == static_cast(self->scalar_type)) { - Py_RETURN_TRUE; + HANDLE_TH_ERRORS + auto self = (PyTensorType *)_self; + if (THPVariable_Check(arg)) { + const auto &var = THPVariable_Unpack(arg); + + if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() && + var.scalar_type() == static_cast(self->scalar_type)) { + Py_RETURN_TRUE; + } } - } - Py_RETURN_FALSE; - END_HANDLE_TH_ERRORS + Py_RETURN_FALSE; + END_HANDLE_TH_ERRORS } -PyObject* Tensor_dtype(PyTensorType* self, void *unused) +PyObject *Tensor_dtype(PyTensorType *self, void *unused) { - return torch::autograd::utils::wrap(self->dtype); + return torch::autograd::utils::wrap(self->dtype); } -PyObject* Tensor_layout(PyTensorType* self, void *unused) +PyObject *Tensor_layout(PyTensorType *self, void *unused) { - return torch::autograd::utils::wrap(self->layout); + return torch::autograd::utils::wrap(self->layout); } -PyObject* Tensor_is_npu(PyTensorType* self, void *unused) +PyObject *Tensor_is_npu(PyTensorType *self, void *unused) { - if (self->is_npu) { - Py_RETURN_TRUE; - } else { - Py_RETURN_FALSE; - } + if (self->is_npu) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } } -PyObject* Tensor_is_sparse(PyTensorType *self, void *unused) +PyObject *Tensor_is_sparse(PyTensorType *self, void *unused) { - if (self->layout->layout == at::Layout::Strided) { - Py_RETURN_FALSE; - } else { - Py_RETURN_TRUE; - } + if (self->layout->layout == at::Layout::Strided) { + Py_RETURN_FALSE; + } else { + Py_RETURN_TRUE; + } } static struct PyMethodDef metaclass_methods[] = { @@ -131,7 +123,7 @@ static struct PyMethodDef metaclass_methods[] = { {nullptr} }; -using getter = PyObject* (*)(PyObject *, void *); +using getter = PyObject *(*)(PyObject *, void *); static struct PyGetSetDef metaclass_properties[] = { {"dtype", (getter)Tensor_dtype, nullptr, nullptr, nullptr}, @@ -142,46 +134,44 @@ static struct PyGetSetDef metaclass_properties[] = { }; static PyTypeObject metaclass = { - PyVarObject_HEAD_INIT(nullptr, 0) - "torch.tensortype", /* tp_name */ - sizeof(PyTypeObject) /* tp_basicsize */ + PyVarObject_HEAD_INIT(nullptr, 0) "torch.tensortype", /* tp_name */ + sizeof(PyTypeObject) /* tp_basicsize */ }; -static void py_initialize_metaclass(PyTypeObject& metaclass) +static void py_initialize_metaclass(PyTypeObject &metaclass) { - metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; - metaclass.tp_methods = metaclass_methods; - metaclass.tp_getset = metaclass_properties; - metaclass.tp_base = &PyType_Type; - if (PyType_Ready(&metaclass) < 0) { - throw python_error(); - } + metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; + metaclass.tp_methods = metaclass_methods; + metaclass.tp_getset = metaclass_properties; + metaclass.tp_base = &PyType_Type; + if 
(PyType_Ready(&metaclass) < 0) { + throw python_error(); + } } static PyTypeObject tensor_type_prototype = { - PyVarObject_HEAD_INIT(&metaclass, 0) - nullptr, /* tp_name */ - sizeof(PyTensorType) /* tp_basicsize */ + PyVarObject_HEAD_INIT(&metaclass, 0) nullptr, /* tp_name */ + sizeof(PyTensorType) /* tp_basicsize */ }; -static void py_initialize_tensor_type(PyTypeObject& type, const char* name, PyObject* tp_dict) +static void py_initialize_tensor_type(PyTypeObject &type, const char *name, PyObject *tp_dict) { - // NOTE: we don't use the typical static declaration of PyTypeObject because - // we need to initialize as many types as there are VariableType instances. - // We copy the basic object fields from a prototype definition and initialize - // the remaining fields below. - memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); - // Subclassing from torch.Tensor isn't supported. - // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. - type.tp_flags = Py_TPFLAGS_DEFAULT; - type.tp_name = name; - type.tp_new = Tensor_new; - if (PyType_Ready(&type) < 0) { - throw python_error(); - } - if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { - throw python_error(); - } + // NOTE: we don't use the typical static declaration of PyTypeObject because + // we need to initialize as many types as there are VariableType instances. + // We copy the basic object fields from a prototype definition and initialize + // the remaining fields below. + memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); + // Subclassing from torch.Tensor isn't supported. + // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. + type.tp_flags = Py_TPFLAGS_DEFAULT; + type.tp_name = name; + type.tp_new = Tensor_new; + if (PyType_Ready(&type) < 0) { + throw python_error(); + } + if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { + throw python_error(); + } } static std::string get_module(Backend backend) @@ -204,103 +194,103 @@ static std::string get_module(Backend backend) static std::string get_name(Backend backend, ScalarType scalarType) { - std::ostringstream ss; - ss << get_module(backend) << "." << toString(scalarType) << "Tensor"; - return ss.str(); + std::ostringstream ss; + ss << get_module(backend) << "." 
<< toString(scalarType) << "Tensor"; + return ss.str(); } -static void set_type(PyTensorType& type_obj, Backend backend, ScalarType scalarType) +static void set_type(PyTensorType &type_obj, Backend backend, ScalarType scalarType) { - // This field is lazily initialized from backend and scalar_type - type_obj.backend = static_cast(backend); - type_obj.scalar_type = static_cast(scalarType); - type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); - type_obj.dtype = torch::getTHPDtype(scalarType); - type_obj.is_npu = (backend == c10::Backend::PrivateUse1); + // This field is lazily initialized from backend and scalar_type + type_obj.backend = static_cast(backend); + type_obj.scalar_type = static_cast(scalarType); + type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); + type_obj.dtype = torch::getTHPDtype(scalarType); + type_obj.is_npu = (backend == c10::Backend::PrivateUse1); } -static void set_name(PyTensorType& type_obj, const std::string& name) +static void set_name(PyTensorType &type_obj, const std::string &name) { - size_t n = sizeof(type_obj.name); - strncpy(type_obj.name, name.c_str(), n); - type_obj.name[n - 1] = '\0'; + size_t n = sizeof(type_obj.name); + strncpy(type_obj.name, name.c_str(), n); + type_obj.name[n - 1] = '\0'; } static THPObjectPtr get_tensor_dict() { - auto torch = THPObjectPtr(PyImport_ImportModule("torch")); - if (!torch) { - throw python_error(); - } - - auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); - if (!tensor_class) { - throw python_error(); - } - - auto tensor_type = (PyTypeObject*)tensor_class.get(); - TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); - - auto res = THPObjectPtr(PyDict_New()); - if (!res) { - throw python_error(); - } - - if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { - throw python_error(); - } - if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { - throw python_error(); - } - - return res; + auto torch = THPObjectPtr(PyImport_ImportModule("torch")); + if (!torch) { + throw python_error(); + } + + auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); + if (!tensor_class) { + throw python_error(); + } + + auto tensor_type = (PyTypeObject *)tensor_class.get(); + TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); + + auto res = THPObjectPtr(PyDict_New()); + if (!res) { + throw python_error(); + } + + if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { + throw python_error(); + } + if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { + throw python_error(); + } + + return res; } static std::vector tensor_types; -static void initialize_npu_aten_types(std::vector& tensor_types) +static void initialize_npu_aten_types(std::vector &tensor_types) { - // only initialize npu types - auto declared_types = all_declared_types_npu(); - tensor_types.resize(declared_types.size()); - - for (size_t i = 0, end = declared_types.size(); i != end; i++) { - auto& tensor_type = tensor_types[i]; - Backend backend = declared_types[i].first; - ScalarType scalar_type = declared_types[i].second; - set_type(tensor_type, backend, scalar_type); - set_name(tensor_type, get_name(backend, scalar_type)); - } + // only initialize npu types + auto declared_types = all_declared_types_npu(); + tensor_types.resize(declared_types.size()); + + for (size_t i = 0, end = declared_types.size(); i != end; i++) { + auto &tensor_type = tensor_types[i]; + Backend 
backend = declared_types[i].first; + ScalarType scalar_type = declared_types[i].second; + set_type(tensor_type, backend, scalar_type); + set_name(tensor_type, get_name(backend, scalar_type)); + } } void _initialize_python_bindings() { - // Initialize the at::Type* pointers, name, and properties of the PyTensorType - // vector. After this call, the vector must not be resized. - initialize_npu_aten_types(tensor_types); - - // Initialize the Python metaclass for the torch.FloatTensor, etc. types. - // The metaclass handles __instancecheck__ checks and binds the dtype property - // on the type objects. - py_initialize_metaclass(metaclass); - - // Get the tp_dict of the Variable class. We copy function definitions - // onto each Tensor type object so that they can be accessed via e.g. - // `torch.npu.FloatTensor.add`. - auto tensor_dict = get_tensor_dict(); - - // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. - for (auto& tensor_type : tensor_types) { - py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); - } - - // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor - // is added to the `torch_npu` module as `FloatTensor`. Also add all the type - // objects to the set torch_npu._tensor_classes. - py_bind_tensor_types(tensor_types); + // Initialize the at::Type* pointers, name, and properties of the PyTensorType + // vector. After this call, the vector must not be resized. + initialize_npu_aten_types(tensor_types); + + // Initialize the Python metaclass for the torch.FloatTensor, etc. types. + // The metaclass handles __instancecheck__ checks and binds the dtype property + // on the type objects. + py_initialize_metaclass(metaclass); + + // Get the tp_dict of the Variable class. We copy function definitions + // onto each Tensor type object so that they can be accessed via e.g. + // `torch.npu.FloatTensor.add`. + auto tensor_dict = get_tensor_dict(); + + // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. + for (auto &tensor_type : tensor_types) { + py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); + } + + // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor + // is added to the `torch_npu` module as `FloatTensor`. Also add all the type + // objects to the set torch_npu._tensor_classes. + py_bind_tensor_types(tensor_types); } -static void py_bind_tensor_types(const std::vector& tensor_types) +static void py_bind_tensor_types(const std::vector &tensor_types) { auto torch_module = THPObjectPtr(PyImport_ImportModule("torch")); if (!torch_module) { @@ -312,7 +302,7 @@ static void py_bind_tensor_types(const std::vector& tensor_types) throw python_error(); } - for (auto& tensor_type : tensor_types) { + for (auto &tensor_type : tensor_types) { auto name = std::string(tensor_type.name); auto idx = name.rfind('.'); auto type_name = name.substr(idx + 1); @@ -323,7 +313,7 @@ static void py_bind_tensor_types(const std::vector& tensor_types) throw python_error(); } - PyObject* type_obj = (PyObject*)&tensor_type; + PyObject *type_obj = (PyObject *)&tensor_type; Py_INCREF(type_obj); if (PyModule_AddObject(module_obj.get(), type_name.c_str(), type_obj) < 0) { throw python_error(); @@ -335,12 +325,12 @@ static void py_bind_tensor_types(const std::vector& tensor_types) } // Callback for python part. 
Used for additional initialization of python classes -static PyObject* THPModule_initExtension(PyObject *_unused, PyObject *noargs) +static PyObject *THPModule_initExtension(PyObject *_unused, PyObject *noargs) { - HANDLE_TH_ERRORS - _initialize_python_bindings(); - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS + HANDLE_TH_ERRORS + _initialize_python_bindings(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS } // autograd methods on torch._C @@ -349,9 +339,9 @@ static PyMethodDef TorchNpuExtensionMethods[] = { {nullptr, nullptr, 0, nullptr} }; -PyMethodDef* npu_extension_functions() +PyMethodDef *npu_extension_functions() { - return TorchNpuExtensionMethods; + return TorchNpuExtensionMethods; } } } diff --git a/torch_npu/multiprocessing/reductions.py b/torch_npu/multiprocessing/reductions.py new file mode 100644 index 0000000000000000000000000000000000000000..cc40949f7933337eaf6a441b688d1e941849ffa2 --- /dev/null +++ b/torch_npu/multiprocessing/reductions.py @@ -0,0 +1,178 @@ +__all__ = ["rebuild_npu_tensor"] + +import multiprocessing +import torch +from torch.multiprocessing.reductions import ( + shared_cache, + rebuild_storage_filename, + rebuild_storage_empty, + rebuild_storage_fd, + StorageWeakRef, + fd_id, + rebuild_tensor, + storage_from_cache, +) + +import torch_npu + + +def rebuild_npu_tensor( + tensor_cls, + tensor_size, + tensor_stride, + tensor_offset, + storage_cls, + dtype, + storage_device, + storage_handle, + storage_size_bytes, + storage_offset_bytes, + requires_grad, + ref_counter_handle, + ref_counter_offset, + event_handle, + event_sync_required, +): + # If storage_handle is None, storage points to nullptr. + if storage_handle is None or storage_size_bytes == 0: + storage = storage_cls(0, dtype=dtype, device=storage_device, _internal=True) + else: + storage = storage_from_cache( + storage_cls, (storage_handle, storage_offset_bytes) + ) + if storage is None: + torch_npu.npu._lazy_init() + storage = storage_cls._new_shared_npu( + storage_device, + storage_handle, + storage_size_bytes, + storage_offset_bytes, + ref_counter_handle, + ref_counter_offset, + event_handle, + event_sync_required, + ) + shared_cache[(storage_handle, storage_offset_bytes)] = StorageWeakRef( + storage + ) + else: + # We already ref counting this Storage, but producer needs new ref-counters to be released. + storage_cls._release_ipc_counter_npu( + ref_counter_handle, ref_counter_offset, device=storage_device + ) + + _storage = ( + storage + if isinstance(storage, torch.UntypedStorage) + else storage._untyped_storage + ) + + t = torch._utils._rebuild_tensor( + torch.storage.TypedStorage(wrap_storage=_storage, dtype=dtype, _internal=True), + tensor_offset, + tensor_size, + tensor_stride, + ) + + if tensor_cls == torch.nn.parameter.Parameter: + # It is crucial for integer tensors to receive + # the requires_grad=False as an argument in the constructor + t = torch.nn.parameter.Parameter(t, requires_grad=requires_grad) + else: + t.requires_grad = requires_grad + + return t + + +def _npu_reduce_tensor(tensor): + storage = tensor._typed_storage() + + if tensor.requires_grad and not tensor.is_leaf: + raise RuntimeError( + "Cowardly refusing to serialize non-leaf tensor which requires_grad, " + "since autograd does not support crossing process boundaries. " + "If you just want to transfer the data, call detach() on the tensor " + "before serializing (e.g., putting it on the queue)." 
+ ) + + torch._namedtensor_internals.check_serializing_named_tensor(tensor) + torch.utils.hooks.warn_if_has_hooks(tensor) + + if storage._untyped_storage.device.type == "npu": + ( + device, + handle, + storage_size_bytes, + storage_offset_bytes, + ref_counter_handle, + ref_counter_offset, + event_handle, + event_sync_required, + ) = storage._share_npu_() + tensor_offset = tensor.storage_offset() + shared_cache[handle] = StorageWeakRef(storage) + return ( + rebuild_npu_tensor, + ( + type(tensor), + tensor.size(), + tensor.stride(), + tensor_offset, # tensor offset in its storage + type(storage), + tensor.dtype, + device, + handle, # identifier which NPU allocation is the storage in. + storage_size_bytes, # size(in bytes) of the storage + storage_offset_bytes, # offset(in bytes) of the storage in the NPU allocation + tensor.requires_grad, + ref_counter_handle, + ref_counter_offset, + event_handle, + event_sync_required, + ), + ) + + # _backward_hooks purposely omitted here, see Note [Don't serialize hooks] + metadata = ( + tensor.storage_offset(), + tensor.size(), + tensor.stride(), + tensor.requires_grad, + ) + return (rebuild_tensor, (type(tensor), storage, metadata)) + + +def _npu_reduce_storage(storage): + from torch.multiprocessing import get_sharing_strategy + + if storage.is_npu: + raise RuntimeError( + "Cannot pickle NPU storage; try pickling a NPU tensor instead" + ) + elif get_sharing_strategy() == "file_system": + metadata = storage._share_filename_cpu_() + cache_key = metadata[1] + rebuild = rebuild_storage_filename + if isinstance(storage, torch.TypedStorage): + metadata += (storage.dtype,) + storage._shared_incref() + elif storage.size() == 0: + # This is special cased because Empty tensors + # (with size 0) cannot be mmapped. + return (rebuild_storage_empty, (type(storage),)) + else: + fd, size = storage._share_fd_cpu_() + df = multiprocessing.reduction.DupFd(fd) + cache_key = fd_id(fd) + metadata = (df, size) + rebuild = rebuild_storage_fd # type: ignore[assignment] + + shared_cache[cache_key] = StorageWeakRef(storage) + return (rebuild, (type(storage),) + metadata) + + +def _add_reductions_methods(): + torch.multiprocessing.reductions.reduce_tensor = _npu_reduce_tensor + torch.multiprocessing.reductions.reduce_storage = _npu_reduce_storage + + torch.multiprocessing.reductions.init_reductions() \ No newline at end of file diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index ba883da8fc25a809a10eaad2c1e292db538e46b0..182859d8a5aefc290a702d41fdd36cc33631c72c 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -98,7 +98,6 @@ __all__ = [ "stop_device", "restart_device", "check_uce_in_memory", - "get_uce_addr", "config", "matmul", "conv", @@ -115,7 +114,9 @@ __all__ = [ "graph_task_group_begin", "graph_task_group_end", "graph_task_update_begin", - "graph_task_update_end" + "graph_task_update_end", + "set_device_limit", + "get_device_limit" ] from typing import Tuple, Union, List, cast, Optional @@ -135,7 +136,7 @@ from .utils import (synchronize, can_device_access_peer, set_device, current_dev device, device_of, StreamContext, stream, set_stream, current_stream, default_stream, set_sync_debug_mode, get_sync_debug_mode, init_dump, current_blas_handle, is_bf16_supported, utilization, finalize_dump, set_dump, get_npu_overflow_flag, clear_npu_overflow_flag, mem_get_info, - check_uce_in_memory, stress_detect, get_uce_addr) + check_uce_in_memory, stress_detect, _get_uce_addr) from ._recovery import restart_device, stop_device from .streams import 
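Editor's note on the new torch_npu/multiprocessing/reductions.py module above: it mirrors PyTorch's CUDA IPC reducer so NPU tensors can cross process boundaries through shared storage handles. A minimal usage sketch follows; it assumes torch_npu installs _add_reductions_methods() during import (not shown in this hunk), and the device, shape and queue usage are illustrative.

```python
import torch
import torch.multiprocessing as mp
import torch_npu


def consumer(q):
    t = q.get()                      # rebuilt in the child via rebuild_npu_tensor
    print(t.device, t.sum().item())


if __name__ == "__main__":
    ctx = mp.get_context("spawn")    # spawn keeps the device context sane, as with CUDA sharing
    q = ctx.Queue()
    x = torch.ones(4, device="npu")
    q.put(x)                         # pickled through _npu_reduce_tensor -> storage._share_npu_()
    p = ctx.Process(target=consumer, args=(q,))
    p.start()
    p.join()
```

As with the CUDA reducer it imitates, the shared cache and ref-counter handles keep the producer's storage alive, so the consumer rebuilds the tensor from the same device allocation rather than copying it.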
Stream, Event, SyncLaunchStream, ExternalEvent from .mstx import mstx diff --git a/torch_npu/npu/_format.py b/torch_npu/npu/_format.py new file mode 100644 index 0000000000000000000000000000000000000000..beb65e076f74f5537daf4bcc76a58eaae4fdedbd --- /dev/null +++ b/torch_npu/npu/_format.py @@ -0,0 +1,38 @@ +from enum import IntEnum + +import torch +import torch_npu + + +class Format(IntEnum): + """NPU storage format enumeration class""" + UNDEFINED = -1 + NCHW = 0 + NHWC = 1 + ND = 2 + NC1HWC0 = 3 + FRACTAL_Z = 4 + NC1HWC0_C04 = 12 + HWCN = 16 + NDHWC = 27 + FRACTAL_NZ = 29 + NCDHW = 30 + NDC1HWC0 = 32 + FRACTAL_Z_3D = 33 + NC = 35 + NCL = 47 + + def __str__(self): + return self.name + + +def _apply_npu_format_patch(): + orig_get_format = torch_npu.get_npu_format + + def patched_get_format(tensor): + """get the Format type of tensor""" + format_int = orig_get_format(tensor) + return Format(format_int) + + torch_npu.get_npu_format = patched_get_format + torch_npu.Format = Format diff --git a/torch_npu/npu/npu_config.py b/torch_npu/npu/npu_config.py index 2233f7841c4b8866ee18b8f22c289562121779d2..5ca745339f0066eb0583b0a1b02cc48bbe4dbaee 100644 --- a/torch_npu/npu/npu_config.py +++ b/torch_npu/npu/npu_config.py @@ -6,12 +6,14 @@ import torch_npu import torch_npu._C from torch_npu.utils._path_manager import PathManager from torch_npu.utils._error_code import ErrCode, pta_error, prof_error +from .utils import _get_device_index # this file is used to enhance the npu frontend API by set_option or other. __all__ = ["set_option", "set_aoe", "set_compile_mode", "set_mm_bmm_format_nd", "get_mm_bmm_format_nd", - "is_jit_compile_false", "finalize_dump", "init_dump", "set_dump"] + "is_jit_compile_false", "finalize_dump", "init_dump", "set_dump", + "set_device_limit", "get_device_limit"] _option_map = {"ACL_PRECISION_MODE": ["allow_fp32_to_fp16", "must_keep_origin_dtype"], "ACL_OP_SELECT_IMPL_MODE": ["high_performance", "high_precision"], @@ -170,3 +172,42 @@ class _allowHF32Conv: hf32_value = torch_npu._C._npu_getOption("ALLOW_CONV_HF32") return (hf32_value is None) or (hf32_value.decode() == "") or (hf32_value.decode() == "enable") return None + + +class _call_once_class: + def __init__(self, func): + self.func = func + self.called = False + self.result = None + + def __call__(self, *args, **kwargs): + if self.called: + raise RuntimeError(f"Function '{self.func.__name__}' has already been called, \ + You can only set this interface once.") + + self.called = True + self.result = self.func(*args, **kwargs) + return self.result + + +@_call_once_class +def set_device_limit(device, cube_num=-1, vector_num=-1): + from torch_npu.npu import device_count + device_id = _get_device_index(device, optional=True) + if device_id < 0 or device_id >= device_count(): + raise AssertionError("Invalid device id" + pta_error(ErrCode.VALUE)) + torch_npu.npu._lazy_init() + if cube_num != -1: + torch_npu._C._npu_set_device_res_limit(device_id, 0, cube_num) + if vector_num != -1: + torch_npu._C._npu_set_device_res_limit(device_id, 1, vector_num) + + +def get_device_limit(device): + from torch_npu.npu import device_count + device_id = _get_device_index(device, optional=True) + if device_id < 0 or device_id >= device_count(): + raise AssertionError("Invalid device id" + pta_error(ErrCode.VALUE)) + torch_npu.npu._lazy_init() + return {"cube_core_num": torch_npu._C._npu_get_device_res_limit(device_id, 0), \ + "vector_core_num": torch_npu._C._npu_get_device_res_limit(device_id, 1)} \ No newline at end of file diff --git 
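The npu_config.py hunk above adds a per-device core-resource limit API (set_device_limit / get_device_limit) guarded by the _call_once_class decorator. A small usage sketch, with an assumed device "npu:0" and made-up core counts:

```python
import torch_npu

# set_device_limit is wrapped by _call_once_class, so it can be called only once
# per process; a second call raises RuntimeError.
torch_npu.npu.set_device_limit("npu:0", cube_num=8, vector_num=16)

print(torch_npu.npu.get_device_limit("npu:0"))
# e.g. {'cube_core_num': 8, 'vector_core_num': 16}
```

Both helpers call torch_npu.npu._lazy_init() themselves, so per the hunk they can be used before the first tensor is allocated on the device.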
a/torch_npu/npu/utils.py b/torch_npu/npu/utils.py index 069848345b994b93ff6610716eae20455184793b..697504d52d7f82d4d68182d990493bd2856a56c1 100644 --- a/torch_npu/npu/utils.py +++ b/torch_npu/npu/utils.py @@ -17,7 +17,7 @@ __all__ = ["synchronize", "device_count", "can_device_access_peer", "set_device" "stream", "set_stream", "current_stream", "default_stream", "set_sync_debug_mode", "get_sync_debug_mode", "init_dump", "set_dump", "finalize_dump", "is_support_inf_nan", "is_bf16_supported", "get_npu_overflow_flag", "npu_check_overflow", "clear_npu_overflow_flag", "current_blas_handle", - "check_uce_in_memory", "stress_detect", "get_cann_version", "get_uce_addr"] + "check_uce_in_memory", "stress_detect", "get_cann_version"] def get_cann_version(module="CANN"): @@ -446,7 +446,7 @@ def check_uce_in_memory(device_id): return torch_npu._C._npu_check_uce_in_memory(device_id) -def get_uce_addr(): +def _get_uce_addr(): torch_npu.npu._lazy_init() return torch_npu._C._npu_get_uce_addr() diff --git a/torch_npu/onnx/wrapper_onnx_ops.py b/torch_npu/onnx/wrapper_onnx_ops.py index bc97473537d65cc6ab5df816da4192df5dc0edb8..16ae07087e5bd17121ab4519a929e7c564270ab7 100644 --- a/torch_npu/onnx/wrapper_onnx_ops.py +++ b/torch_npu/onnx/wrapper_onnx_ops.py @@ -255,8 +255,8 @@ class _NPUFormatCastOP(torch.autograd.Function): return torch.ops.npu.npu_format_cast(*args, **kwargs) @staticmethod - def symbolic(g, self: Tensor, acl_format: int): - return g.op("npu::NPUFormatCast", self, acl_format_i=acl_format) + def symbolic(g, self: Tensor, acl_format: int, customize_dtype: int = None): + return g.op("npu::NPUFormatCast", self, acl_format_i=acl_format, customize_dtype_i=customize_dtype) class _NPUSoftmaxCrossEntropyWithLogitsOP(torch.autograd.Function): @@ -1042,8 +1042,8 @@ def _wrapper_npu_deformable_conv2d(inputs, weight, offset, bias, kernel_size, st padding, dilation, groups, deformable_groups, modulated) -def _wrapper_npu_format_cast(self, acl_format): - return _NPUFormatCastOP.apply(self, acl_format) +def _wrapper_npu_format_cast(self, acl_format, customize_dtype=None): + return _NPUFormatCastOP.apply(self, acl_format, customize_dtype) def _wrapper_npu_softmax_cross_entropy_with_logits(self, labels): diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py index ab8a5abfe6ab53e8d3f1181c9c02cf0dd41a684b..5da94ae76334e5978f4da11c2313733369b52949 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py @@ -198,8 +198,8 @@ class ConfigContext: op_attr = json_data.get('PROFILE_OP_ATTR', 'false') op_attr = self.BOOL_MAP.get(op_attr.lower(), False) gc_detect_threshold = json_data.get('PROFILE_GC_DETECT_THRESHOLD', None) - if isinstance(gc_detect_threshold, str) and gc_detect_threshold != "None": - gc_detect_threshold = float(gc_detect_threshold) + if isinstance(gc_detect_threshold, str): + gc_detect_threshold = None if gc_detect_threshold == "None" else float(gc_detect_threshold) data_simplification = json_data.get('PROFILE_DATA_SIMPLIFICATION', 'true') data_simplification = self.BOOL_MAP.get(data_simplification.lower(), True) record_op_args = False diff --git a/torch_npu/profiler/_non_intrusive_profile.py b/torch_npu/profiler/_non_intrusive_profile.py index a60303adec0a88258dff4d7e2b3785cd20ecfc7b..c4ce45223b6f16f19aa768f18852f0b7da82e591 100644 --- a/torch_npu/profiler/_non_intrusive_profile.py +++ 
b/torch_npu/profiler/_non_intrusive_profile.py @@ -8,7 +8,7 @@ from ..utils._path_manager import PathManager from ._dynamic_profiler._dynamic_profiler_utils import DynamicProfilerUtils from .dynamic_profile import init as dp_init from .dynamic_profile import step as dp_step -from .analysis.prof_common_func._constant import print_error_msg +from .analysis.prof_common_func._constant import print_error_msg, print_warn_msg __all__ = [ @@ -59,11 +59,19 @@ class _NonIntrusiveProfile: @staticmethod def init(): prof_config_path = os.getenv("PROF_CONFIG_PATH", "") - dyno_enable_flag = os.getenv("KINETO_USE_DAEMON", 0) + kine_to_value = os.getenv("KINETO_USE_DAEMON") + msmonitor_value = os.getenv("MSMONITOR_USE_DAEMON") + + if kine_to_value is not None: + print_warn_msg( + "Environment variable 'KINETO_USE_DAEMON' will be deprecated. " + "Please use 'MSMONITOR_USE_DAEMON' instead." + ) + dyno_enable_flag = msmonitor_value or kine_to_value or 0 try: dyno_enable_flag = int(dyno_enable_flag) except ValueError: - print_error_msg("Environment variable KINETO_USE_DAEMON value not valid, will be set to 0 !") + print_error_msg("Environment variable 'MSMONITOR_USE_DAEMON' value not valid, will be set to 0 !") dyno_enable_flag = 0 if not prof_config_path and dyno_enable_flag != 1: return diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index 56809c9b7f65be2479f7dd1e9d63e068940c1eab..1a62c54d6f6af19bed262b7b5765d192e3fd3c0d 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -1,5 +1,5 @@ import os -from datetime import datetime +import time from typing import Union from torch_npu.utils._error_code import ErrCode, prof_error @@ -217,20 +217,23 @@ class Constant(object): def print_info_msg(message: str): - time_str = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - print(f"{time_str} [INFO] [{os.getpid()}] profiler.py: {message}") + current_time = time.localtime() + time_str = time.strftime("[%Y-%m-%d %H:%M:%S]", current_time) + print(f"{time_str} [INFO] [{os.getpid()}] profiler.py: {message}", flush=True) def print_warn_msg(message: str): if not _should_print_warning(): return - time_str = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - print(f"{time_str} [WARNING] [{os.getpid()}] profiler.py: {message}") + current_time = time.localtime() + time_str = time.strftime("[%Y-%m-%d %H:%M:%S]", current_time) + print(f"{time_str} [WARNING] [{os.getpid()}] profiler.py: {message}", flush=True) def print_error_msg(message: str): - time_str = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - print(f"{time_str} [ERROR] [{os.getpid()}] profiler.py: {message}") + current_time = time.localtime() + time_str = time.strftime("[%Y-%m-%d %H:%M:%S]", current_time) + print(f"{time_str} [ERROR] [{os.getpid()}] profiler.py: {message}", flush=True) def convert_ns2us_float(ns) -> float: diff --git a/torch_npu/profiler/analysis/prof_common_func/_log.py b/torch_npu/profiler/analysis/prof_common_func/_log.py index 15ba7a80f9d10ed74e1e26a4a5be4ab9190b7ef0..0fecde48c41b465cf04eff26282a02911655c032 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_log.py +++ b/torch_npu/profiler/analysis/prof_common_func/_log.py @@ -34,6 +34,7 @@ class ProfilerLogger: BACKUP_COUNT = 3 # logger instance _instance = None + _pid = None @classmethod def get_instance(cls) -> logging.Logger: @@ -54,14 +55,17 @@ class ProfilerLogger: RuntimeError: If logger initialization fails """ if 
cls._instance is not None: - return + if cls._pid == os.getpid(): + return # Create logs directory log_dir = os.path.join(output_dir, cls.DEFAULT_LOG_DIR) PathManager.make_dir_safety(log_dir) # Create logger - logger = logging.getLogger(cls.DEFAULT_LOGGER_NAME) + logger = logging.getLogger( + f"{cls.DEFAULT_LOGGER_NAME}_{custom_name}" if custom_name else cls.DEFAULT_LOGGER_NAME + ) logger.setLevel(cls.DEFAULT_LOG_LEVEL) logger.propagate = False @@ -89,6 +93,7 @@ class ProfilerLogger: logger.addHandler(file_handler) cls._instance = logger + cls._pid = os.getpid() logger.info("Profiler logger initialized at: %s", log_file) @classmethod @@ -106,9 +111,13 @@ class ProfilerLogger: @classmethod def destroy(cls) -> None: - """Close and cleanup the logger.""" + """ + Close and cleanup the logger. + To avoid the deadlock problem caused by directly calling close on handler in multi-process scenarios, + when child process updates instance, the parent process instance obtained by fork does not call this method. + """ if cls._instance: for handler in cls._instance.handlers[:]: - handler.close() cls._instance.removeHandler(handler) + handler.close() cls._instance = None diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py index b5d3797c6fc2e625840ec07a8357690d94186e51..ba29da446eb5c43b8b93ce4d8bea4b9f245da487 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py @@ -74,7 +74,7 @@ class FwkCANNRelationParser: step_id = step_node.event.name.split("#")[-1] if not step_node.corr_id_total: self.logger.error("There is no flow events in %s range.", step_node.event.name) - return [] + continue corr_id_list = sorted(step_node.corr_id_total) min_index, max_index = 0, len(corr_id_list) - 1 min_kernel_list, max_kernel_list = [], [] diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py index aa00324c97c25909dbca2f4efb8a3f97533b216f..b8216a6995895eba8b907d03d90997f6d49c58a9 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py @@ -152,11 +152,14 @@ class FwkFileParser: def get_fwk_trace_data(self): torch_op_data = self.get_file_data_by_tag(FileTag.TORCH_OP) - if not torch_op_data: - self.logger.error("Get fwk trace data failed, the torch op data is empty.") - return [] enqueue_data_list, dequeue_data_list = self.get_task_queue_data() - pid = torch_op_data[0].pid + if torch_op_data: + pid = torch_op_data[0].pid + elif enqueue_data_list or dequeue_data_list: + pid = enqueue_data_list[0].pid if enqueue_data_list else dequeue_data_list[0].pid + else: + self.logger.error("Get fwk trace data failed, framework data is empty.") + return [] tid_dict = {} fwk_x_event_list = [None] * ( len(torch_op_data) + len(enqueue_data_list) * 2 + len(dequeue_data_list) * 2) @@ -247,9 +250,15 @@ class FwkFileParser: def get_fwk_api(self) -> dict: torch_op_data = self.get_file_data_by_tag(FileTag.TORCH_OP) - if not torch_op_data: + enqueue_data_list, dequeue_data_list = self.get_task_queue_data() + if torch_op_data: + pid = torch_op_data[0].pid + elif enqueue_data_list or dequeue_data_list: + pid = enqueue_data_list[0].pid if enqueue_data_list else dequeue_data_list[0].pid + else: + self.logger.error("Get fwk api data failed, framework data is empty.") return {} - pid = 
torch_op_data[0].pid + torch_op_apis = [] fwd_bwd_dict = {} torch_op_idx = 0 @@ -272,13 +281,13 @@ class FwkFileParser: connection_ids = [] task_enqueues = [] task_dequeues = [] - enqueue_data_list, dequeue_data_list = self.get_task_queue_data() correlation_id_name_dict = {} for dequeue_data in dequeue_data_list: task_dequeues.append( [dequeue_data.ts, dequeue_data.ts + dequeue_data.dur, contact_2num(pid, dequeue_data.tid), dequeue_data.corr_id, dequeue_data.name]) correlation_id_name_dict[dequeue_data.corr_id] = dequeue_data.origin_name + torch_tids.add(dequeue_data.tid) for enqueue_data in enqueue_data_list: name = enqueue_data.name if enqueue_data.corr_id in correlation_id_name_dict: @@ -288,6 +297,7 @@ class FwkFileParser: [enqueue_data.ts, enqueue_data.ts + enqueue_data.dur, contact_2num(pid, enqueue_data.tid), enqueue_data.corr_id, name]) connection_ids.append(enqueue_data.corr_id) + torch_tids.add(enqueue_data.tid) start_connection_id = max(connection_ids) + 1 if connection_ids else 0 self.update_fwd_bwd_connection_id(fwd_bwd_dict, torch_op_apis, start_connection_id) diff --git a/torch_npu/profiler/analysis/prof_view/_communication_parser.py b/torch_npu/profiler/analysis/prof_view/_communication_parser.py index fff6d265d6ceb5198681e78956b6268efc732cb9..e07f68b785b31eb509602a99a12760fad476a5f3 100644 --- a/torch_npu/profiler/analysis/prof_view/_communication_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_communication_parser.py @@ -46,8 +46,6 @@ class CommunicationParser(BaseParser): self._root_node = TorchOpNode() self._kernel_dict = {} self.step_list = [] - ProfilerLogger.init(self._profiler_path, "CommunicationParser") - self.logger = ProfilerLogger.get_instance() @staticmethod def combine_size_distribution(op_dict: dict, total_dict: dict): @@ -63,6 +61,8 @@ class CommunicationParser(BaseParser): return round(dividend / divisor, 4) def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "CommunicationParser") + self.logger = ProfilerLogger.get_instance() try: self._init_step_list(deps_data) self.generate_view() diff --git a/torch_npu/profiler/analysis/prof_view/_integrate_parser.py b/torch_npu/profiler/analysis/prof_view/_integrate_parser.py index b6c545420c3bb961640c7ef25dc54e8050fad6ae..28472a241177ed4f8f13c7b090e02a98db1113c2 100644 --- a/torch_npu/profiler/analysis/prof_view/_integrate_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_integrate_parser.py @@ -26,10 +26,10 @@ class IntegrateParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - ProfilerLogger.init(self._profiler_path, "IntegrateParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "IntegrateParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) self.generate_view() diff --git a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py index 30ffd8be8ba46e0b8cc5ac1300c4eba389211eaa..ded9a612c6cfd98a7076fb749457e0c3da9aa44c 100644 --- a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py @@ -17,8 +17,6 @@ class KernelViewParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self.step_range = [] - ProfilerLogger.init(self._profiler_path, "KernelViewParser") - self.logger = ProfilerLogger.get_instance() @classmethod def 
_project_map_for_headers(cls, input_headers: list): @@ -35,6 +33,8 @@ class KernelViewParser(BaseParser): return output_headers def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "KernelViewParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) self._init_step_range(deps_data) diff --git a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py index 04ef7c0e90da3b1ee494785ea540d22e0a07052a..47255efd09dbdca635e4888fd575f311fbcff5ef 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py @@ -34,8 +34,6 @@ class MemoryViewParser(BaseParser): self.ge_record_list = [] self.memory_data = [] self.component_list = [] - ProfilerLogger.init(self._profiler_path, "MemoryViewParser") - self.logger = ProfilerLogger.get_instance() @staticmethod def _get_data_from_file(file_set: set, file_type_bean: any, bean_list: bool = False) -> list: @@ -73,6 +71,8 @@ class MemoryViewParser(BaseParser): return [cur_record_list, pta_ge_record_list] def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "MemoryViewParser") + self.logger = ProfilerLogger.get_instance() try: self.memory_data = deps_data.get(Constant.MEMORY_PREPARE, {}).get("memory_data", {}).get(Constant.Text, []) self.pta_record_list = deps_data.get(Constant.MEMORY_PREPARE, {}).get("pta_record_list", []) @@ -109,7 +109,8 @@ class MemoryViewParser(BaseParser): if ge_record.time_ns >= pta_record.time_ns: self.size_record_list.extend(self._combine_record(last_ge_record, pta_record)) pta_ptr += 1 - last_pta_record = pta_record + if hasattr(pta_record, 'component_type') and pta_record.component_type != Constant.WORKSPACE_TYPE: + last_pta_record = pta_record else: self.size_record_list.extend(self._combine_record(last_pta_record, ge_record)) ge_ptr += 1 diff --git a/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py b/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py index f87e8dc8b85e7f35097afd2666194f7cd0311b68..7c10e9d4bf45c2881fb8bd04ae3c2b1124f578c5 100644 --- a/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py @@ -22,10 +22,10 @@ class OperatorViewParser(BaseParser): self._torch_op_node = [] self._root_node = None self._kernel_dict = {} - ProfilerLogger.init(self._profiler_path, "OperatorViewParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "OperatorViewParser") + self.logger = ProfilerLogger.get_instance() try: self._torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) self._kernel_dict = deps_data.get(Constant.RELATION_PARSER, {}) diff --git a/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py b/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py index 2f793a8af8b611559613799a004531224c366590..b4a85271d99034e55936d682e9b4748f6251cf11 100644 --- a/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py @@ -23,10 +23,10 @@ class StackViewParser(BaseParser): self._root_node = None self._kernel_dict = {} self._metric = param_dict.get("metric") - ProfilerLogger.init(self._profiler_path, "StackViewParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, 
"StackViewParser") + self.logger = ProfilerLogger.get_instance() try: self._torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) self.generate_view() diff --git a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py index 744e2cd8a6e1a42b9e9e813f5cb27c51cd34ce61..46093bec4e8e2cbe50af5590be96f37ad9ac574f 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py @@ -51,8 +51,6 @@ class TraceStepTimeParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self.step_range = [] - ProfilerLogger.init(self._profiler_path, "TraceStepTimeParser") - self.logger = ProfilerLogger.get_instance() @classmethod def is_float_num(cls, num): @@ -165,6 +163,8 @@ class TraceStepTimeParser(BaseParser): FileManager.create_csv_file(output_path, print_time, file_name, self.title) def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "TraceStepTimeParser") + self.logger = ProfilerLogger.get_instance() try: self._init_step_range(deps_data) self.generate_view() diff --git a/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py index f90100e869fd4c4ea92661dd2183b8fd20808412..c5e572e1bcfeba5ecaa4c4e6db93b47c896392eb 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py @@ -27,8 +27,6 @@ class TraceViewParser(BaseParser): self._trace_data = [] self._torch_op_node = [] self._root_node = None - ProfilerLogger.init(self._profiler_path, "TraceViewParser") - self.logger = ProfilerLogger.get_instance() @staticmethod def _prune_trace_by_level(json_data: list) -> list: @@ -47,6 +45,8 @@ class TraceViewParser(BaseParser): return result def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "TraceViewParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py index 8ef2072be611814bb0a604685b957745d8d221fa..da8037f982bbc2ba77f18a3aa5928565bf45a28e 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py @@ -34,10 +34,10 @@ class CANNAnalyzeParser(BaseParser): super().__init__(name, param_dict) self._cann_path = ProfilerPathManager.get_cann_path(self._profiler_path) self.msprof_path = shutil.which("msprof") - ProfilerLogger.init(self._profiler_path, "CANNAnalyzeParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "CANNAnalyzeParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) if not os.path.isdir(self._cann_path): diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py index 6a703d0b954ecca3a58621cd940b23f7726dc27c..7228525fae6d03a8d41a2f50b6ca9094fee8070b 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py @@ -41,10 +41,10 @@ class CANNExportParser(BaseParser): 
super().__init__(name, param_dict) self._cann_path = ProfilerPathManager.get_cann_path(self._profiler_path) self.msprof_path = shutil.which("msprof") - ProfilerLogger.init(self._profiler_path, "CANNExportParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "CANNExportParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) if not os.path.isdir(self._cann_path): diff --git a/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py b/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py index 6cc6f235165107299886fb4cf936e927dbd687b4..939e06cf748ba4a011a9a33b4ded585fe04f3310 100644 --- a/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py @@ -28,10 +28,10 @@ class TracePreParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - ProfilerLogger.init(self._profiler_path, "TracePreParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "TracePreParser") + self.logger = ProfilerLogger.get_instance() try: fwk_trace_data = FwkFileParser(self._profiler_path).get_fwk_trace_data() trace_file_path = os.path.join(self._output_path, Constant.TRACE_VIEW_TEMP) if os.path.isdir( diff --git a/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py b/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py index e6eb02ddb81d7ffce69d4e2d60899beb62012c61..5e8a941de2873cf071baa412a50d964978fce539 100644 --- a/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py @@ -23,10 +23,10 @@ __all__ = [] class RelationParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - ProfilerLogger.init(self._profiler_path, "RelationParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "RelationParser") + self.logger = ProfilerLogger.get_instance() try: kernel_dict = FwkCANNRelationParser(self._profiler_path).get_kernel_dict() except Exception as e: diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py index 64de6315f246b49a93a2a72a7b1614aa1f630c3a..34a5fc27f856530c83cb66ba93a63afe367aa746 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py @@ -65,6 +65,8 @@ class MemoryDbParser(BaseParser): @staticmethod def _combine_record(last_record, cur_record): + if cur_record[MemoryRecordTableRow.COMPONENT.value] == Str2IdManager().get_id_from_str(Constant.WORKSPACE): + return [cur_record] pta_ge_record_list = cur_record[:] pta_ge_record_list[MemoryRecordTableRow.COMPONENT.value] = Str2IdManager().get_id_from_str(Constant.PTA_GE) if last_record: @@ -179,9 +181,16 @@ class MemoryDbParser(BaseParser): if not self._pta_memory_bean_list: return for memory_bean in self._pta_memory_bean_list: + if memory_bean.component_type == Constant.WORKSPACE_TYPE: + self._pta_record_list.append([Str2IdManager().get_id_from_str(Constant.WORKSPACE), memory_bean.time_ns, + memory_bean.total_allocated_for_db, 
memory_bean.total_reserved_for_db, + memory_bean.total_active_for_db, memory_bean.stream_ptr, + memory_bean.device_index]) + continue self._pta_record_list.append([Str2IdManager().get_id_from_str(Constant.PTA), memory_bean.time_ns, memory_bean.total_allocated_for_db, memory_bean.total_reserved_for_db, - memory_bean.total_active_for_db, memory_bean.stream_ptr, memory_bean.device_index]) + memory_bean.total_active_for_db, memory_bean.stream_ptr, + memory_bean.device_index]) def get_pta_ge_record_list(self): """ @@ -203,7 +212,9 @@ class MemoryDbParser(BaseParser): if ge_record[1] >= pta_record[1]: self._record_list.extend(self._combine_record(last_ge_record, pta_record)) pta_ptr += 1 - last_pta_record = pta_record + if pta_record[MemoryRecordTableRow.COMPONENT.value] != \ + Str2IdManager().get_id_from_str(Constant.WORKSPACE): + last_pta_record = pta_record else: self._record_list.extend(self._combine_record(last_pta_record, ge_record)) ge_ptr += 1 diff --git a/torch_npu/profiler/dynamic_profile.py b/torch_npu/profiler/dynamic_profile.py index 99bfd76f7263e115b2839e07a4bc8948b8969f64..313fe7d388f078fed4a63ffd668f531136a132a7 100644 --- a/torch_npu/profiler/dynamic_profile.py +++ b/torch_npu/profiler/dynamic_profile.py @@ -3,6 +3,7 @@ import json import atexit import time +from ..npu import mstx, current_stream from .profiler import tensorboard_trace_handler, profile from .scheduler import Schedule as schedule @@ -38,6 +39,7 @@ class _DynamicProfile: self._step_record_time = None self._step_time = 0 self._min_poll_interval = 1 + self._step_mstx_range_id = 0 def init(self): if self.repeat_init: @@ -78,6 +80,9 @@ class _DynamicProfile: self._step_time = max(self._min_poll_interval, int(time.time() - self._step_record_time)) self._dynamic_monitor.modify_step_time(self._step_time) if self.prof: + if self._step_mstx_range_id: + mstx.range_end(self._step_mstx_range_id) + self._step_mstx_range_id = mstx.range_start(f"step {self.cur_step}", current_stream()) self.prof.step() self.step_num -= 1 if 0 == self.step_num: @@ -138,7 +143,9 @@ class _DynamicProfile: with_modules=self.cfg_ctx.with_modules, experimental_config=self.cfg_ctx.experimental_config ) + self.prof._set_step_num_offset_for_dynamic_prof(self.cur_step) self.prof.start() + self._step_mstx_range_id = mstx.range_start(f"step {self.cur_step}", current_stream()) for key, value in self.cfg_ctx.meta_data().items(): self.prof.add_metadata_json(str(key), json.dumps(value)) DynamicProfilerUtils.out_log("Start Dynamic Profiler at {} step.".format( diff --git a/torch_npu/profiler/profiler.py b/torch_npu/profiler/profiler.py index 409013114a8302ad7a7130387ad17352479968f6..d45ad41693385c69b674196da9b92b7e17e49fac 100644 --- a/torch_npu/profiler/profiler.py +++ b/torch_npu/profiler/profiler.py @@ -229,6 +229,7 @@ class profile(_KinetoProfile): self.on_trace_ready = on_trace_ready self.step_num = 0 self.current_action = self.schedule(self.step_num) + self._step_num_offset = 0 self.step_rec_fn: Optional[prof.record_function] = None if use_cuda is not None: print_warn_msg("This is npu environment, use_cuda is invalid") @@ -249,6 +250,10 @@ class profile(_KinetoProfile): if self.stopped == False: self.stop() + @no_exception_func() + def _set_step_num_offset_for_dynamic_prof(self, step: int): + self._step_num_offset = step + @no_exception_func() def start(self): self.stopped = False @@ -256,7 +261,7 @@ class profile(_KinetoProfile): ProfPathCreator().init(export_only_mode=True) self.action_controller.transit_action(ProfilerAction.NONE, self.current_action) 
if self.record_steps: - self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num)) + self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num + self._step_num_offset)) self.step_rec_fn.__enter__() @no_exception_func() @@ -278,7 +283,7 @@ class profile(_KinetoProfile): self.current_action = self.schedule(self.step_num) self.action_controller.transit_action(prev_action, self.current_action) if self.record_steps: - self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num)) + self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num + self._step_num_offset)) self.step_rec_fn.__enter__() diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py index e8fbd923b343f387d2be38aa776237a48c2d0ef9..0cb93e9951d1cc59a4eaa6ae977aaf7e3d4e0333 100644 --- a/torch_npu/utils/__init__.py +++ b/torch_npu/utils/__init__.py @@ -1,12 +1,12 @@ __all__ = ["npu_combine_tensors", "get_part_combined_tensor", "is_combined_tensor_valid", "FlopsCounter", - "set_thread_affinity"] + "set_thread_affinity", "reset_thread_affinity", "save_async"] from torch_npu import _C from ._module import _apply_module_patch from .tensor_methods import _add_tensor_methods from .storage import _add_storage_methods from .combine_tensors import npu_combine_tensors, get_part_combined_tensor, is_combined_tensor_valid -from .serialization import _add_serialization_methods +from .serialization import _add_serialization_methods, save_async from .npu_intercept import _cann_package_check, _add_intercept_methods from .dtensor import _register_ops_under_dtensor_rules from .collect_env import _add_collect_env_methods @@ -18,6 +18,7 @@ from .utils import _print_error_log, _print_warn_log, _print_info_log, _apply_np from ._step import add_perf_dump_patch from .flops_count import _FlopsCounter as FlopsCounter from .affinity import _set_thread_affinity as set_thread_affinity +from .affinity import _reset_thread_affinity as reset_thread_affinity # init flopcount diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py index fbe408739fd35e2c9dff7f2635a04af7ba2e60d6..313736c3a176de58c81ea2fad222475c78b99a5a 100644 --- a/torch_npu/utils/_module.py +++ b/torch_npu/utils/_module.py @@ -30,8 +30,6 @@ from torch_npu.utils.syncbatchnorm import SyncBatchNorm as sync_batch_norm from torch_npu.utils._error_code import ErrCode, pta_error origin_mpdl_iter_init = _MultiProcessingDataLoaderIter.__init__ -origin_worker_loop = worker._worker_loop -origin_pin_memory_loop = pin_memory._pin_memory_loop CONV3D_SUPPORT_FP32_SOC_PREFIX = ["Ascend910B", "Ascend910_93"] @@ -370,17 +368,9 @@ def _mpdl_iter_init(self, *args, **kwargs): torch_npu.npu.synchronize() except Exception as e: print(e) - origin_mpdl_iter_init(self, *args, **kwargs) - - -def _npu_worker_loop(*args, **kwargs): torch_npu._C._npu_set_thread_affinity(-1, -1) - origin_worker_loop(*args, **kwargs) - - -def _npu_pin_memory_loop(*args, **kwargs): - torch_npu._C._npu_set_thread_affinity(-1, -1) - origin_pin_memory_loop(*args, **kwargs) + origin_mpdl_iter_init(self, *args, **kwargs) + torch_npu._C._npu_reset_thread_affinity() def _parallel_apply( @@ -533,5 +523,3 @@ def _apply_module_patch(): torch.nn.parallel.DataParallel.parallel_apply = npu_parallel_apply torch.nn.parallel.data_parallel = npu_data_parallel torch.utils.data.dataloader._MultiProcessingDataLoaderIter.__init__ = _mpdl_iter_init - torch.utils.data._utils.worker._worker_loop = _npu_worker_loop - torch.utils.data._utils.pin_memory._pin_memory_loop = 
_npu_pin_memory_loop diff --git a/torch_npu/utils/affinity.py b/torch_npu/utils/affinity.py index 7728736baa19712879a2d8edc58ac33cdfc6c069..37973f5bc79bc81af684a286603bb75e2c734332 100644 --- a/torch_npu/utils/affinity.py +++ b/torch_npu/utils/affinity.py @@ -14,4 +14,8 @@ def _set_thread_affinity(core_range: List[int] = None): raise ValueError("Core range should be nonnegative." + pta_error(ErrCode.PARAM)) torch_npu._C._npu_set_thread_affinity(core_range[0], core_range[1]) else: - raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM)) \ No newline at end of file + raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM)) + + +def _reset_thread_affinity(): + torch_npu._C._npu_reset_thread_affinity() \ No newline at end of file diff --git a/torch_npu/utils/collect_env.py b/torch_npu/utils/collect_env.py index 3f279bf3cc36512a8f8a0fcbbb748262f857998e..8ffed93212ccdc61027c5af7d2682fa8bb8e0358 100644 --- a/torch_npu/utils/collect_env.py +++ b/torch_npu/utils/collect_env.py @@ -87,14 +87,8 @@ def get_cann_version(): def get_torch_npu_version(): torch_npu_version_str = 'N/A' - torch_npu_root = get_torch_npu_install_path() - version_path = os.path.join(torch_npu_root, "torch_npu", "version.py") - check_directory_path_readable(version_path) - with open(version_path, "r") as f: - for line in f: - if line.find("__version__") != -1: - torch_npu_version_str = line.strip().split("=")[-1] - break + if TORCH_NPU_AVAILABLE: + torch_npu_version_str = torch_npu.__version__ return torch_npu_version_str diff --git a/torch_npu/utils/serialization.py b/torch_npu/utils/serialization.py index cb56f8cb79072537733e62f951b78073a1e2af3e..d08752f4704c6c798ddec16a5f8ac8e36ee03340 100644 --- a/torch_npu/utils/serialization.py +++ b/torch_npu/utils/serialization.py @@ -3,22 +3,25 @@ import io import sys import pickle import tarfile +import threading from typing import Dict, Any, Optional import torch from torch.serialization import _check_dill_version, _open_file_like, _is_zipfile, \ _open_zipfile_reader, _is_torchscript_zip, _weights_only_unpickler, \ _legacy_load, _load, FILE_LIKE, MAP_LOCATION, DEFAULT_PROTOCOL, \ - normalize_storage_type, location_tag, _serialization_tls, _check_seekable, closing, _should_read_directly + normalize_storage_type, location_tag, _serialization_tls, _check_seekable, closing, _should_read_directly, \ + _open_zipfile_writer import torch_npu from torch_npu.utils._error_code import ErrCode, pta_error from .utils import _should_print_warning +__all__ = ["load", "save", "save_async"] + ALWAYS_WARN_LEGACY_SERIALIZATION = False RE_MAP_CPU = False - -__all__ = ["load", "save"] +save_async_stream_map = {} def _get_always_warn_legacy_serialization(): @@ -349,6 +352,146 @@ def save( return torch.serialization.save(obj, f, pickle_module, pickle_protocol, True, _disable_byteorder_record) +def save_async( + obj: object, + f, + pickle_module: Any = pickle, + pickle_protocol: int = DEFAULT_PROTOCOL, + _use_new_zipfile_serialization: bool = True, + _disable_byteorder_record: bool = False, + model: torch.nn.Module = None +) -> None: + if _use_new_zipfile_serialization is False: + raise RuntimeError("Error: torch_npu.save_async with \"_use_new_zipfile_serialization = False\"\ + is not recommended for npu tensor, which may bring unexpected errors and hopefully \ + set \"_use_new_zipfile_serialization = True\"", + "if it is necessary to use this, please convert the npu tensor to cpu tensor for saving" + + 
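Editor's note: with the affinity.py change above, thread pinning becomes reversible via the newly exported reset_thread_affinity. A minimal sketch (core indices are illustrative; per _set_thread_affinity the list must hold exactly two nonnegative values):

```python
from torch_npu.utils import set_thread_affinity, reset_thread_affinity

set_thread_affinity([0, 7])   # pin the calling thread to CPU cores 0-7
# ... host-side, latency-sensitive work ...
reset_thread_affinity()       # undo the pinning via the new _npu_reset_thread_affinity binding
```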
pta_error(ErrCode.PARAM)) + + _check_dill_version(pickle_module) + save_args = (obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization, _disable_byteorder_record) + + device = torch.npu.current_device() + save_thread = threading.Thread(target=_save_data_thread, args=(save_args, device, model)) + save_thread.start() + + +def _save_data_thread(save_args, + device, + model: torch.nn.Module = None): + global save_async_stream_map + torch.npu.set_device(device) + + def hook_fn(*args): + torch.npu.current_stream().wait_stream(save_async_stream_map.get(device)) + + if device not in save_async_stream_map: + save_async_stream = torch.npu.Stream() + save_async_stream_map[device] = save_async_stream + if isinstance(model, torch.nn.Module): + model.register_full_backward_hook(hook_fn) + else: + save_async_stream = save_async_stream_map[device] + + obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization, _disable_byteorder_record = save_args + with torch.npu.stream(save_async_stream): + data_value, serialized_storages = _save(obj, pickle_module, pickle_protocol) + storage_value = [] + for key in sorted(serialized_storages.keys()): + name = f'data/{key}' + storage = serialized_storages.get(key) + # given that we copy things around anyway, we might use storage.cpu() + # this means to that to get tensors serialized, you need to implement + # .cpu() on the underlying Storage + if storage.device.type != 'cpu': + storage = storage.cpu() + # Now that it is on the CPU we can directly copy it into the zip file + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + num_bytes = storage_tensor.size().numel() * storage_tensor.element_size() + else: + num_bytes = storage.nbytes() + storage_value.append((name, storage, num_bytes)) + + with _open_zipfile_writer(f) as opened_zipfile: + opened_zipfile.write_record('data.pkl', data_value, len(data_value)) + + for name, storage, num_bytes in storage_value: + opened_zipfile.write_record(name, storage.data_ptr(), num_bytes) + + +def _save(obj, pickle_module, pickle_protocol): + serialized_storages = {} + id_map: Dict[int, str] = {} + + # Since loading storages that view the same data with different dtypes is + # not supported, we need to keep track of the dtype associated with each + # storage data_ptr and throw an error if the dtype is ever different. + storage_dtypes: Dict[int, torch.dtype] = {} + + def persistent_id(obj): + if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): + + if isinstance(obj, torch.storage.TypedStorage): + storage = obj._untyped_storage + storage_dtype = obj.dtype + storage_type_str = obj._pickle_storage_type() + storage_type = getattr(torch, storage_type_str) + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + storage_numel = storage_tensor.size().numel() * storage_tensor.element_size() // obj._element_size() + else: + storage_numel = obj._size() + + else: + storage = obj + storage_dtype = torch.uint8 + storage_type = normalize_storage_type(type(obj)) + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + storage_numel = storage_tensor.size().numel() * storage_tensor.element_size() + else: + storage_numel = storage.nbytes() + + # If storage is allocated, ensure that any other saved storages + # pointing to the same data all have the same dtype. 
If storage is + # not allocated, don't perform this check + if storage.data_ptr() != 0: + if storage.data_ptr() in storage_dtypes: + if storage_dtype != storage_dtypes[storage.data_ptr()]: + raise RuntimeError( + 'Cannot save multiple tensors or storages that ' + 'view the same data as different types' + pta_error(ErrCode.VALUE)) + else: + storage_dtypes[storage.data_ptr()] = storage_dtype + + storage_key = id_map.setdefault(storage._cdata, str(len(id_map))) + location = location_tag(storage) + serialized_storages[storage_key] = storage + + return ('storage', + storage_type, + storage_key, + location, + storage_numel) + + return None + + # Write the pickle data for `obj` + data_buf = io.BytesIO() + pickler = pickle_module.Pickler(data_buf, protocol=pickle_protocol) + pickler.persistent_id = persistent_id + if isinstance(obj, torch.nn.Module): + hook_handle = obj._backward_hooks.copy() + obj._backward_hooks.clear() + pickler.dump(obj) + obj._backward_hooks.update(hook_handle) + else: + pickler.dump(obj) + data_value = data_buf.getvalue() + return data_value, serialized_storages + + def _add_serialization_methods(): torch.save = save torch.load = load diff --git a/torch_npu/utils/storage.py b/torch_npu/utils/storage.py index 9304f141bf2fc5475f1594d6aff44aee99fbc289..85a2a402a37c11f81e77155ab9c792c56c80a061 100644 --- a/torch_npu/utils/storage.py +++ b/torch_npu/utils/storage.py @@ -1,4 +1,7 @@ +__all__ = [] + import copy +from typing import Union import torch from torch.storage import _warn_typed_storage_removal @@ -49,6 +52,37 @@ def _deepcopy(self, memo): return self._new_wrapped_storage(copy.deepcopy(self._untyped_storage, memo)) +def _share_npu_(self, *args, **kwargs): + return torch_npu._C._share_npu_(self, *args, **kwargs) + + +def _typed_storage_share_npu_(self, *args, **kwargs): + return self._untyped_storage._share_npu_(*args, **kwargs) + + +def _new_shared_npu(*args, **kwargs): + return torch_npu._C._new_shared_npu(*args, **kwargs) + + +def _typed_storage_new_shared_npu(*args, **kwargs): + return torch.UntypedStorage._new_shared_npu(*args, **kwargs) + + +def _release_ipc_counter_npu(*args, **kwargs): + return torch_npu._C._release_ipc_counter_npu(*args, **kwargs) + + +def _typed_storage_release_ipc_counter_npu(*args, device: Union[str, torch.device] = "npu", **kwargs): + return torch.UntypedStorage._release_ipc_counter_npu(*args, **kwargs) + + def _add_storage_methods(): torch.storage.UntypedStorage.cpu = _cpu torch.storage.TypedStorage._deepcopy = _deepcopy + + setattr(torch.UntypedStorage, "_share_npu_", _share_npu_) + setattr(torch.UntypedStorage, "_new_shared_npu", _new_shared_npu) + setattr(torch.UntypedStorage, "_release_ipc_counter_npu", _release_ipc_counter_npu) + setattr(torch.TypedStorage, "_share_npu_", _typed_storage_share_npu_) + setattr(torch.TypedStorage, "_new_shared_npu", _typed_storage_new_shared_npu) + setattr(torch.TypedStorage, "_release_ipc_counter_npu", _typed_storage_release_ipc_counter_npu) \ No newline at end of file diff --git a/torch_npu/utils/unsupport_api.py b/torch_npu/utils/unsupport_api.py index 61ba27b3a239f000e7d93437add90e65d62884b8..5626e940b6a690e7a74815095c8d51a3fd08dabd 100644 --- a/torch_npu/utils/unsupport_api.py +++ b/torch_npu/utils/unsupport_api.py @@ -6,8 +6,6 @@ value: parent_module(object) """ unsupported_Tensor_api = { - "is_shared": torch.Tensor, - "share_memory_": torch.Tensor } unsupported_nn_api = {
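Editor's note on save_async above: serialization of NPU tensors is moved onto a dedicated stream in a background thread, with an optional backward hook when a module is passed. A usage sketch; the module, file name and the decision to pass model= are illustrative, and callers presumably should not mutate the saved tensors until the background save finishes.

```python
import torch
import torch_npu
from torch_npu.utils import save_async

model = torch.nn.Linear(16, 16).npu()

# The checkpoint is written from a worker thread on a side NPU stream; passing
# `model` registers a full-backward hook so later backward passes wait on that stream.
save_async(model.state_dict(), "checkpoint.pt", model=model)
```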