diff --git a/CMakeLists.txt b/CMakeLists.txt index 113c17f7a69f97d7dc8d1af053b922f0feb83576..9d506ad2dc6d0482dd316b1d56d33277401bb6a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -234,6 +234,7 @@ if (NOT DEFINED BUILD_LIBTORCH) set(FLOP_SRCS) set(NPU_SRCS) set(PROF_SRCS) + set(IPC_SRCS) set(UTILS_SRCS) set(SAN_SRCS) endif() @@ -247,11 +248,13 @@ add_subdirectory(${TORCHNPU_ROOT}/core) add_subdirectory(${TORCHNPU_ROOT}/framework) add_subdirectory(${TORCHNPU_ROOT}/flopcount) add_subdirectory(${TORCHNPU_ROOT}/logging) +add_subdirectory(${TORCHNPU_ROOT}/custom_dtype) if (NOT DEFINED BUILD_LIBTORCH) add_subdirectory(${TORCHNPU_ROOT}/distributed) add_subdirectory(${TORCHNPU_ROOT}/npu) add_subdirectory(${TORCHNPU_ROOT}/profiler) + add_subdirectory(${TORCHNPU_ROOT}/ipc) add_subdirectory(${TORCHNPU_ROOT}/utils) add_subdirectory(${TORCHNPU_ROOT}/sanitizer) endif() @@ -273,10 +276,10 @@ if (DEFINED BUILD_TENSORPIPE) endif() if (DEFINED BUILD_LIBTORCH) - set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${FLOP_SRCS} ${FRAMEWORK_SRCS} ${LOGGING_SRCS} ${NPU_CPP_LIBS_SRCS}) + set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${FLOP_SRCS} ${CUS_DTYPE_SRCS} ${FRAMEWORK_SRCS} ${LOGGING_SRCS} ${NPU_CPP_LIBS_SRCS}) else() # Compile code with pybind11 - set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${LOGGING_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${UTILS_SRCS} ${SAN_SRCS}) + set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${CUS_DTYPE_SRCS} ${LOGGING_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${IPC_SRCS} ${UTILS_SRCS} ${SAN_SRCS}) endif() add_library(${PLUGIN_NAME} SHARED ${CPP_SRCS}) diff --git a/README.zh.md b/README.zh.md index 1fe8456923835b42bdd1ea9cf294c4e4628e5271..b1748aad80a742dd554505b6c191400d6fdac326 100644 --- a/README.zh.md +++ b/README.zh.md @@ -253,7 +253,7 @@ AscendPyTorch版本分支的维护阶段如下: ## 安全声明 -[Ascend Extension for PyTorch插件 安全声明](https://gitee.com/ascend/pytorch/blob/master/SECURITYNOTE.md) +[Ascend Extension for PyTorch插件 安全声明](./SECURITYNOTE.md) ## 参考文档 diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index 3a4ddb3c1ffcb92dd9dbb9dcdd3b0c78a3f602d6..2dfff1b2b34a839c3ef6fb6edfda9be2ec61ec24 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -1,9 +1,5 @@ # Ascend Extension for PyTorch插件 安全声明 -## 漏洞风险提示 - -PyTorch 2.6.0以下版本存在CVE-2025-32434漏洞,该漏洞因torch/serialization.py组件兼容性处理导致潜在的远程代码执行(RCE)风险。 torch_npu已参考[LINK](https://github.com/pytorch/pytorch/pull/145020)进行修复。 - ## 系统安全加固 建议用户在系统中配置开启ASLR(级别2 ),又称**全随机地址空间布局随机化**,可参考以下方式进行配置: @@ -229,3 +225,90 @@ PyTorch提供分布式训练能力,支持在单机和多机场景下进行训 | 版本 | 所有版本 | 所有版本 | | 特殊场景 | 无 | 无 | | 备注 | 该通信过程由开源软件PyTorch控制,配置为PyTorch原生设置,可参考[PyTorch文档](https://pytorch.org/docs/stable/distributed.html#launch-utility)。源端口由操作系统自动分配,分配范围由操作系统的配置决定,例如ubuntu:采用/proc/sys/net/ipv4/ipv4_local_port_range文件指定,可通过cat /proc/sys/net/ipv4/ipv4_local_port_range或sysctl net.ipv4.ip_local_port_range查看 | 该通信过程由CANN中HCCL组件控制,torch_npu不进行控制,端口范围可参考[《环境变量参考》](https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/apiref/envvar/envref_07_0001.html)的“执行相关 > 集合通信与分布式训练 > 集合通信相关配置>HCCL_IF_BASE_PORT” | + +## 漏洞机制说明 + +Ascend Extension for PyTorch 社区非常重视社区版本的安全性,专门设置了漏洞管理专员负责处理漏洞相关的事务,同时为了构建更安全的AI全流程工具链,我们也欢迎您一起参与。 + +### 漏洞处理流程 + +对于每一个安全漏洞,Ascend Extension for PyTorch 社区会安排人员进行跟踪和处理,漏洞处理的端到端流程如下图所示。 + +![漏洞处理流程](./figures/cve.png) + +下面将重点解释漏洞上报、漏洞评估、漏洞披露的流程。 + +### 漏洞上报 + +您可以通过提交issue的方式联系 Ascend Extension for PyTorch 社区团队,我们将会第一时间安排安全漏洞专项人员向您联系。 
+注意,为了确保安全性,请不要在issue中描述涉及安全隐私的具体信息。 + +#### 上报响应 + +1. Ascend Extension for PyTorch 社区会在3个工作日内确认、分析、上报安全漏洞问题,同时启动安全处理流程。 +2. Ascend Extension for PyTorch 安全团队在确认安全漏洞问题后,会对问题进行分发和跟进。 +3. 在安全漏洞问题从分类、确定、修复和发布的过程中,我们会及时更新报告。 + +### 漏洞评估 + +业界普遍使用 CVSS 标准评估漏洞的严重性,Ascend Extension for PyTorch 在使用 CVSS v3.1 进行漏洞评估时,需要设定漏洞攻击场景,基于在该攻击场景下的实际影响进行评估。漏洞严重等级评估是指针对漏洞利用难易程度,以及利用后对机密性、完整性、可用性的影响进行评估,并生成一个评分值。 + +#### 漏洞评估标准 + +Ascend Extension for PyTorch 通过以下向量评估一个漏洞的严重等级: + +- 攻击向量(AV):表示攻击的“远程性”以及如何利用此漏洞。 +- 攻击复杂性(AC):讲述攻击执行的难度以及成功进行攻击需要哪些因素。 +- 用户交互(UI):确定攻击是否需要用户参与。 +- 所需的权限(PR):记录成功进行攻击所需的用户身份验证级别。 +- 范围(S):确定攻击者是否可以影响具有不同权限级别的组件。 +- 机密性(C):衡量信息泄露给非授权方后导致的影响程度。 +- 完整性(I):衡量信息被篡改后导致的影响程度。 +- 可用性(A):衡量用户在需要访问数据或服务时受影响的程度。 + +#### 评估原则 + +- 评估漏洞的严重等级,不是评估风险。 +- 评估时必须基于攻击场景,且保证在该场景下,攻击者成功攻击后能对系统造成机密性、完整性、可用性影响。 +- 当安全漏洞有多个攻击场景时,应以造成最大的影响,即 CVSS 评分最高的攻击场景为依据。 +- 被嵌入调用的库存在漏洞,要根据该库在产品中的使用方式,确定漏洞的攻击场景后进行评估。 +- 安全缺陷不能被触发或不影响 CIA(机密性、完整性、可用性),CVSS 评分为 0 分。 + +#### 评估步骤 + +评估漏洞严重等级时,可根据下述步骤进行操作: + +1. 设定可能的攻击场景,基于攻击场景评分。 +2. 确定漏洞组件(Vulnerable Component)和受影响组件(Impact Component)。 + +3. 选择基础指标的值。 + + - 可利用指标(攻击向量、攻击复杂度、所需权限、用户交互、范围)根据漏洞组件选择指标值。 + + - 影响指标(机密性、完整性、可用性)要么反映对漏洞组件的影响,要么反映对受影响组件影响,以结果最严重的为准。 + +#### 严重等级划分 + +| **严重等级(Severity Rating)** | **CVSS评分(Score)** | **漏洞修复时长** | +| ------------------------------- | --------------------- | ---------------- | +| 致命(Critical) | 9.0~10.0 | 7天 | +| 高(High) | 7.0~8.9 | 14天 | +| 中(Medium) | 4.0~6.9 | 30天 | +| 低(Low) | 0.1~3.9 | 30天 | + +### 漏洞披露 + +安全漏洞修复后 Ascend Extension for PyTorch 社区会发布安全公告 (SA)以及安全说明(SN) ,安全公告内容包括该漏洞的技术细节、类型、上报人、CVE ID 以及受到该漏洞影响的版本和修复版本等信息。 +为了保护 Ascend Extension for PyTorch 用户的安全,在进行调查、修复和发布安全公告之前, Ascend Extension for PyTorch 社区不会公开披露、讨论或确认 Ascend Extension for PyTorch 产品的安全问题。 + +### 附录 + +#### 安全公告(SA) + +目前在维护版本,无安全漏洞 + +#### 安全说明(SN) + +涉及第三方的开源组件部分漏洞说明: + +PyTorch 2.6.0以下版本存在CVE-2025-32434漏洞,该漏洞因torch/serialization.py组件兼容性处理导致潜在的远程代码执行(RCE)风险。 torch_npu已参考[LINK](https://github.com/pytorch/pytorch/pull/145020)进行修复。 \ No newline at end of file diff --git a/codegen/gen_backend_stubs.py b/codegen/gen_backend_stubs.py index c60f38b8d4896c2d646c162cb5ad4c28df7bb1a6..439863e039ed2362b6f1f1cabb5bad57690c67cd 100644 --- a/codegen/gen_backend_stubs.py +++ b/codegen/gen_backend_stubs.py @@ -402,6 +402,8 @@ def gen_dispatcher_registrations( ns_helper = NamespaceHelper(namespace_str="at") native_func_header = """\ #include "torch_npu/csrc/core/npu/NPURecovery.h" +#include "torch_npu/csrc/core/npu/NpuVariables.h" +#include "torch_npu/csrc/core/npu/NPUException.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/profiler/utils.h" #endif diff --git a/codegen/utils.py b/codegen/utils.py index 8e0713982eae0f63145d56ed0db59d6c15885fa7..fa5903ee34ed3904b8c887576c2f7cd701d7da10 100644 --- a/codegen/utils.py +++ b/codegen/utils.py @@ -418,6 +418,7 @@ const DeviceGuard device_guard(device_or_default(device));""" device_guard = f"const OptionalDeviceGuard device_guard(device_of({device_of}));" op_key = str(f.func.name) + is_aclnn_only = "c10_npu::IsAclnnOnly()" if enable_opplugin(): if op_key in GLOBAL_STRUCTURED_OP_INFO_CACHE: impl_name = f"op_plugin::{GLOBAL_STRUCTURED_OP_INFO_CACHE[op_key]}" @@ -436,6 +437,11 @@ const DeviceGuard device_guard(device_or_default(device));""" if (({force_aclnn} || at_npu::native::env::CheckJitDisable()){tensor_check_str}) {{ return {op_api_impl_name}({args_exprs_str}); }} else {{ + if ({is_aclnn_only}) {{ + TORCH_CHECK(false, + "Current device only support aclnn operator, and current operator {impl_name} do not support internal 
format.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + }} return {impl_name}({args_exprs_str}); }} """ diff --git a/env.sh b/env.sh index ff54b797d211caad86b37132a8fdc101157c1388..96fa71d80f4f94d140314654b82bfe8fa0f469c2 100644 --- a/env.sh +++ b/env.sh @@ -1,3 +1,4 @@ +#!/bin/bash # 配置CANN相关环境变量 CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info' diff --git a/figures/cve.png b/figures/cve.png new file mode 100644 index 0000000000000000000000000000000000000000..095d0f7ba20c416165a57aa8870c0325bbcb8af0 Binary files /dev/null and b/figures/cve.png differ diff --git a/test/allocator/test_pluggable_allocator_extensions.py b/test/allocator/test_pluggable_allocator_extensions.py index 99cc499a93c457b0c6732dd3de015c76a280c695..a05fe8538a776b4e79ae9f0a19c86b7090d175c3 100644 --- a/test/allocator/test_pluggable_allocator_extensions.py +++ b/test/allocator/test_pluggable_allocator_extensions.py @@ -2,6 +2,7 @@ import os import sys import shutil import subprocess +import ctypes import torch import torch.utils.cpp_extension @@ -27,6 +28,7 @@ def build_stub(base_dir): class TestPluggableAllocator(TestCase): module = None + new_alloc = None build_directory = "allocator/build" @classmethod @@ -59,9 +61,9 @@ class TestPluggableAllocator(TestCase): def test_pluggable_allocator(self): os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so') # Load the allocator - new_alloc = torch_npu.npu.memory.NPUPluggableAllocator(os_path, 'my_malloc', 'my_free') + TestPluggableAllocator.new_alloc = torch_npu.npu.memory.NPUPluggableAllocator(os_path, 'my_malloc', 'my_free') # Swap the current allocator - torch_npu.npu.memory.change_current_allocator(new_alloc) + torch_npu.npu.memory.change_current_allocator(TestPluggableAllocator.new_alloc) # This will allocate memory in the device using the new allocator self.assertFalse(self.module.check_custom_allocator_used()) npu_tensor = torch.zeros(10, device='npu') @@ -69,6 +71,23 @@ class TestPluggableAllocator(TestCase): self.assertRtolEqual(npu_tensor.cpu().numpy(), cpu_tensor.numpy()) self.assertTrue(self.module.check_custom_allocator_used()) + def test_set_get_device_stats_fn(self): + os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so') + myallocator = ctypes.CDLL(os_path) + get_device_stats_fn = ctypes.cast(getattr(myallocator, "my_get_device_stats"), ctypes.c_void_p).value + + TestPluggableAllocator.new_alloc.allocator().set_get_device_stats_fn(get_device_stats_fn) + self.assertEqual(torch.npu.memory_stats_as_nested_dict()["num_alloc_retries"], 0) + + def test_set_reset_peak_status_fn(self): + os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so') + myallocator = ctypes.CDLL(os_path) + reset_peak_status_fn = ctypes.cast(getattr(myallocator, "my_reset_peak_status"), ctypes.c_void_p).value + + TestPluggableAllocator.new_alloc.allocator().set_reset_peak_status_fn(reset_peak_status_fn) + torch.npu.reset_peak_memory_stats() + self.assertEqual(torch.npu.max_memory_allocated(), 0) + def test_pluggable_allocator_after_init(self): os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so') # Do an initial memory allocator diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 04ae192e478e93ab28a92b38b6be116531f0e1dd..fe31a00e3a594adc0f11d0f21173c2ceaa92d2b0 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -583,7 +583,9 @@ 
"ForkingPickler", "Union", "check_serializing_named_tensor", - "register_after_fork" + "register_after_fork", + "reduce_tensor", + "reduce_storage" ], "torch.multiprocessing.spawn": [ "Optional" @@ -2820,16 +2822,19 @@ "npu_cross_entropy_loss", "npu_format_cast_", "npu_fusion_attention", + "npu_fusion_attention_v2", "npu_get_float_status", "npu_nms_rotated", "npu_random_choice_with_mask", "npu_rms_norm", "npu_add_rms_norm_cast", "npu_fused_infer_attention_score", + "npu_fused_infer_attention_v2", "npu_mla_prolog", "npu_mla_prolog_v2", "npu_convert_weight_to_int4pack", "npu_ffn", + "npu_fused_matmul", "npu_geglu", "npu_grouped_matmul", "npu_quant_matmul", @@ -2848,6 +2853,7 @@ "npu_trans_quant_param", "npu_stride_add", "npu_sort_v2", + "npu_dtype_cast", "npu_gelu", "npu_gelu_backward", "npu_all_gather_base_mm", diff --git a/test/contrib/test_transfer_to_npu.py b/test/contrib/test_transfer_to_npu.py index b2f84413e1c57e3859be1495137fc4dabf73b130..073119a878e5266af0a2be6210bdcc39c0a874e5 100644 --- a/test/contrib/test_transfer_to_npu.py +++ b/test/contrib/test_transfer_to_npu.py @@ -12,6 +12,27 @@ from torch_npu.contrib import transfer_to_npu class TestTransferToNpu(TestCase): + def test_generator(self): + g0 = torch.Generator() + self.assertTrue(isinstance(g0, torch.Generator)) + self.assertEqual(g0.device.type, 'cpu') + + g1 = torch.Generator('cuda') + self.assertTrue(isinstance(g1, torch.Generator)) + self.assertEqual(g1.device.type, 'npu') + + g2 = torch.Generator(torch.device('cuda')) + self.assertTrue(isinstance(g2, torch.Generator)) + self.assertEqual(g2.device.type, 'npu') + + g3 = torch.Generator(device='cuda') + self.assertTrue(isinstance(g3, torch.Generator)) + self.assertEqual(g3.device.type, 'npu') + + g4 = torch.Generator(device=torch.device('cuda')) + self.assertTrue(isinstance(g4, torch.Generator)) + self.assertEqual(g4.device.type, 'npu') + def test_wrap_isinstance(self): # check builtins isinstance grammar self.assertTrue(isinstance(1, int)) diff --git a/test/cpp_extensions/extension.cpp b/test/cpp_extensions/extension.cpp index 62c0591e06e359b574f50665545df965bbe39372..f13200aca1da40b966e008a1b43f297ca2f0cd5e 100644 --- a/test/cpp_extensions/extension.cpp +++ b/test/cpp_extensions/extension.cpp @@ -45,6 +45,17 @@ bool check_from_blob() return dtype_same && num_same && pos1_same && pos2_same && pos3_same && sub_same; } +bool check_from_blob_delete() +{ + int isgone = 0; + { + auto data = torch::tensor({1.0, 2.0, 3.0}, torch::kFloat).to(at::Device("npu:0")); + auto res = at_npu::native::from_blob(data.data_ptr(), data.sizes(), [&](void*) { isgone++; }); + } + bool is_deleted = (isgone == 1); + return is_deleted; +} + bool check_from_blob_strides() { auto data = torch::tensor({1, 2, 3, 4, 5, 6, 7, 8, 9}, torch::kInt32).to(at::Device("npu:0")); @@ -95,5 +106,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("check_storage_sizes", &check_storage_sizes, "check_storage_sizes"); m.def("check_from_blob", &check_from_blob, "check_from_blob"); m.def("check_from_blob_strides", &check_from_blob_strides, "check_from_blob_strides"); + m.def("check_from_blob_delete", &check_from_blob_delete, "check_from_blob_delete"); m.def("blocking_ops", &blocking_ops, "blocking_ops"); } diff --git a/test/cpp_extensions/pluggable_allocator_extensions.cpp b/test/cpp_extensions/pluggable_allocator_extensions.cpp index 3ed2606b021ba7796ed6e94ad11f41625a88d169..6bb80e59dd5c4911d79fcb50cadc69b6f6babdbb 100644 --- a/test/cpp_extensions/pluggable_allocator_extensions.cpp +++ 
b/test/cpp_extensions/pluggable_allocator_extensions.cpp @@ -4,8 +4,10 @@ #include "third_party/acl/inc/acl/acl_base.h" #include "third_party/acl/inc/acl/acl_rt.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" extern "C" { +using c10_npu::NPUCachingAllocator::DeviceStats; static bool useflag = false; void* my_malloc(ssize_t size, int device, aclrtStream stream) @@ -27,6 +29,17 @@ bool check_custom_allocator_used() { return useflag; } + +DeviceStats my_get_device_stats(int device) +{ + DeviceStats stats; + return stats; +} + +void my_reset_peak_status(int device) +{ + std::cout<<"resetPeakStatus success!"< None: def set_device(): torch_npu.npu.set_device(0) multiprocessing.set_start_method("spawn", force=True) - jobs = [multiprocessing.Process(target=_worker, args=(i,)) for i in range(70)] + jobs = [multiprocessing.Process(target=_worker, args=(i,)) for i in range(100)] for p in jobs: p.start() @@ -20,4 +20,5 @@ def set_device(): p.join() -set_device() +if __name__ == "__main__": + set_device() diff --git a/test/npu/test_aclgraph_update.py b/test/npu/test_aclgraph_update.py index 644579b9f1e6875854a35ad426adf9dd6272adde..18dbb79c5cb4691e6a9629e81777cd1b345777c3 100644 --- a/test/npu/test_aclgraph_update.py +++ b/test/npu/test_aclgraph_update.py @@ -122,6 +122,53 @@ class TestAclgraphUpdate(TestCase): g.replay() self.assertEqual(output.cpu(), res_src[0].cpu()) self.assertEqual(softmax_lse.cpu(), res_src[1].cpu()) + + @SupportedDevices(['Ascend910B']) + def test_npu_fused_infer_attention_v2(self): + torch.npu.set_device(0) + length = [29] + length_new = [100] + scale = 1 / 0.0078125 + query = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + key = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + value = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + res_src = torch_npu.npu_fused_infer_attention_v2( + query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, + next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length_new) + g = torch.npu.NPUGraph() + event = torch.npu.ExternalEvent() + update_stream = torch.npu.Stream() + handle = None + output = None + softmax_lse = None + + workspace = torch_npu._npu_fused_infer_attention_v2_get_max_workspace( + query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, + next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length) + + with torch.npu.graph(g): + stream = torch.npu.current_stream() + output = torch.empty(1, 32, 1, 128, dtype=torch.float16, device="npu") + softmax_lse = torch.empty(1, dtype=torch.float16, device="npu") + event.wait(stream) + event.reset(stream) + torch.npu.graph_task_group_begin(stream) + torch_npu.npu_fused_infer_attention_v2.out( + query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, workspace=workspace, + next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length, out=[output, softmax_lse]) + handle = torch.npu.graph_task_group_end(stream) + + with torch.npu.stream(update_stream): + torch.npu.graph_task_update_begin(update_stream, handle) + torch_npu.npu_fused_infer_attention_v2.out( + query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, workspace=workspace, + next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length_new, out=[output, softmax_lse]) + torch.npu.graph_task_update_end(update_stream) + event.record(update_stream) + + g.replay() + 
self.assertEqual(output.cpu(), res_src[0].cpu()) + self.assertEqual(softmax_lse.cpu(), res_src[1].cpu()) if __name__ == "__main__": run_tests() diff --git a/test/npu/test_fault_mode.py b/test/npu/test_fault_mode.py index 0d52d5d11c6a55f679676dd6d50ea3e5bf601e99..6877fced8f5c4cf0831c663ccb5e81527db3e6f1 100644 --- a/test/npu/test_fault_mode.py +++ b/test/npu/test_fault_mode.py @@ -6,6 +6,9 @@ from torch.testing._internal.common_utils import TestCase, run_tests from torch.utils.checkpoint import checkpoint import torch.distributed as dist import torch.nn as nn + +from torch_npu.testing.common_utils import SupportedDevices + os.environ["ASCEND_LAUNCH_BLOCKING"] = '0' import torch_npu @@ -156,6 +159,7 @@ class TestMode(TestCase): with self.assertRaisesRegex(RuntimeError, "Invalid device argument"): torch.npu.reset_max_memory_allocated(device="npu:8") + @SupportedDevices(['Ascend910B']) def test_aclrtSetDevice(self): path = os.path.join(os.path.dirname(__file__), '_fault_mode_cases/error_set_device.py') process = subprocess.Popen(["python", f"{path}"], shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) diff --git a/test/npu/test_npu_format.py b/test/npu/test_npu_format.py new file mode 100644 index 0000000000000000000000000000000000000000..2bc1c067ff4496896e816493c36529074bbfb2a8 --- /dev/null +++ b/test/npu/test_npu_format.py @@ -0,0 +1,49 @@ +import torch +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests + + +class TestNPUFormat(TestCase): + + def test_enum_values(self): + """test the enumeration value""" + self.assertEqual(torch_npu.Format.NCHW.value, 0) + self.assertEqual(torch_npu.Format.NHWC.value, 1) + + def test_npu_format_cast(self): + """test npu_format_cast""" + tensor = torch.ones(2, 2).npu() + + out1 = torch_npu.npu_format_cast(tensor, 0) + fmt1 = torch_npu.get_npu_format(out1) + self.assertEqual(fmt1, torch_npu.Format.NCHW) + + out2 = torch_npu.npu_format_cast(tensor, torch_npu.Format.NHWC) + fmt2 = torch_npu.get_npu_format(out2) + self.assertEqual(fmt2, torch_npu.Format.NHWC) + + def test_npu_format_cast_(self): + """test npu_format_cast_""" + x1 = torch.ones(2, 2).npu() + x2 = torch.ones(2, 2).npu() + + torch_npu.npu_format_cast_(x1, 0) + fmt1 = torch_npu.get_npu_format(x1) + self.assertEqual(fmt1, torch_npu.Format.NCHW) + + torch_npu.npu_format_cast_(x2, torch_npu.Format.NHWC) + fmt2 = torch_npu.get_npu_format(x2) + self.assertEqual(fmt2, torch_npu.Format.NHWC) + + def test_get_npu_format(self): + """test get_npu_format""" + x1 = torch.ones(2, 2).npu() + torch_npu.npu_format_cast_(x1, 0) + + fmt1 = torch_npu.get_npu_format(x1) + self.assertEqual(fmt1, torch_npu.Format.NCHW) + self.assertEqual(fmt1, 0) + + +if __name__ == "__main__": + run_tests() diff --git a/test/npu/test_public_bindings.py b/test/npu/test_public_bindings.py index 17bbe73886fe58c42b4df227e6e48390060da3c5..51c288ecda6d314bc7c44bb2f13afa9f70ef57f7 100644 --- a/test/npu/test_public_bindings.py +++ b/test/npu/test_public_bindings.py @@ -545,7 +545,7 @@ class TestPublicBindings(TestCase): "torch_npu.dynamo.torchair._ge_concrete_graph.ge_converter.custom.npu_dequant_bias", "torch_npu.utils.collect_hccl_info", "torch_npu.op_plugin.meta._meta_registrations", - + "torch_npu.op_plugin.atb._atb_meta_registrations", } # No new entries should be added to this list. 
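
The new `test/npu/test_npu_format.py` above exercises the `torch_npu.Format` enum and the `npu_format_cast` / `get_npu_format` helpers wired up by this patch (via `_apply_npu_format_patch`). As a quick orientation for reviewers, here is a minimal standalone usage sketch distilled from that test; it assumes `torch_npu` is installed and an NPU device is available, and it is not part of the diff itself:

```python
import torch
import torch_npu  # provides the Format enum and the npu_format_cast helpers

x = torch.ones(2, 2).npu()

# In-place cast to NCHW; the enum member and its raw integer value (0) compare equal.
torch_npu.npu_format_cast_(x, torch_npu.Format.NCHW)
fmt = torch_npu.get_npu_format(x)
assert fmt == torch_npu.Format.NCHW and fmt == 0

# Out-of-place cast returns a new tensor in the requested format.
y = torch_npu.npu_format_cast(x, torch_npu.Format.NHWC)
assert torch_npu.get_npu_format(y) == torch_npu.Format.NHWC
```
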
diff --git a/test/npu/test_save_async.py b/test/npu/test_save_async.py new file mode 100644 index 0000000000000000000000000000000000000000..2c446df9f572702c94df3265e170404a0d0d1121 --- /dev/null +++ b/test/npu/test_save_async.py @@ -0,0 +1,119 @@ +import os +import time +import copy + +import torch +import torch.nn as nn +import torch.optim as optim + +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.utils._path_manager import PathManager + + +class TestAsyncSave(TestCase): + test_save_path = os.path.join( + os.path.realpath(os.path.dirname(__file__)), "test_save_async") + + @classmethod + def setUpClass(cls): + PathManager.make_dir_safety(TestAsyncSave.test_save_path) + + @classmethod + def tearDownClass(cls): + PathManager.remove_path_safety(TestAsyncSave.test_save_path) + + def wait_for_save_completion(self, file_path, timeout_sec=60, poll_interval_sec=0.5): + start_time = time.time() + + while time.time() - start_time < timeout_sec: + if os.path.exists(file_path): + current_size = os.path.getsize(file_path) + time.sleep(poll_interval_sec) + new_size = os.path.getsize(file_path) + + if current_size == new_size: + return True + else: + time.sleep(poll_interval_sec) + + return False + + def test_save_async_tensor(self): + save_tensor = torch.rand(1024, dtype=torch.float32).npu() + async_save_path = os.path.join(TestAsyncSave.test_save_path, "async_save_tensor.pt") + torch_npu.utils.save_async(save_tensor, async_save_path) + + if self.wait_for_save_completion(async_save_path): + tensor_async = torch.load(async_save_path) + self.assertEqual(tensor_async, save_tensor) + else: + self.assertTrue(False, f"{async_save_path} is not exist!") + + def test_save_async(self): + loss1 = [1.6099495, 1.6099086, 1.6098710] + loss2 = [] + model_list = [] + checkpoint_list = [] + model_origin = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 20), + nn.ReLU(), + nn.Linear(20, 5), + nn.ReLU() + ) + + input_data = torch.ones(6400, 100).npu() + labels = torch.arange(5).repeat(1280).npu() + + criterion = nn.CrossEntropyLoss() + model = model_origin.npu() + optimerizer = optim.SGD(model.parameters(), lr=0.1) + for step in range(3): + outputs = model(input_data) + loss = criterion(outputs, labels) + + optimerizer.zero_grad() + loss.backward() + + optimerizer.step() + + loss2.append(loss) + checkpoint = { + "model": model.state_dict(), + "optimizer": optimerizer.state_dict() + } + checkpoint_list.append(copy.deepcopy(checkpoint)) + model_list.append(copy.deepcopy(model)) + checkpoint_async_path = os.path.join(TestAsyncSave.test_save_path, f"checkpoint_async_{step}.path") + model_async_path = os.path.join(TestAsyncSave.test_save_path, f"model_async_{step}.path") + torch_npu.utils.save_async(checkpoint, checkpoint_async_path, model=model) + torch_npu.utils.save_async(model, model_async_path, model=model) + + for i in range(3): + self.assertEqual(loss1[i], loss2[i].item()) + checkpoint_async_path = os.path.join(TestAsyncSave.test_save_path, f"checkpoint_async_{i}.path") + if self.wait_for_save_completion(checkpoint_async_path): + checkpoint_async = torch.load(checkpoint_async_path) + self.assertEqual(checkpoint_list[i], checkpoint_async, prec=2e-3) + else: + self.assertTrue(False, f"{checkpoint_async_path} is not exist!") + model_async_path = os.path.join(TestAsyncSave.test_save_path, f"model_async_{i}.path") + if self.wait_for_save_completion(model_async_path): + model_async = torch.load(model_async_path) + else: + self.assertTrue(False, 
f"{model_async_path} is not exist!") + state_dict_sync = model_list[i].state_dict() + state_dict_async = model_async.state_dict() + + key_sync = sorted(state_dict_sync.keys()) + key_async = sorted(state_dict_async.keys()) + + self.assertEqual(key_sync, key_async) + for key in key_async: + self.assertEqual(state_dict_async[key], state_dict_sync[key], prec=2e-3) + +if __name__ == '__main__': + torch.npu.set_device(0) + run_tests() diff --git a/test/npu/test_torch_npu.py b/test/npu/test_torch_npu.py index 0e2c96e1bd7dd4709b73c1ff8f9418f839f254dc..29709ef991175785012ecc7aab547d3dae82f15a 100644 --- a/test/npu/test_torch_npu.py +++ b/test/npu/test_torch_npu.py @@ -78,6 +78,12 @@ class TorchNPUDeviceTestCase(TestCase): torch_npu.npu.synchronize() after_free_memory, after_total_memory = torch_npu.npu.mem_get_info(0) self.assertEqual(before_total_memory, after_total_memory) + + @unittest.skip("CANN doesn't support now.") + def test_set_device_res_limit(self): + ans_dict = {'cube_core_num': 12, 'vector_core_num': 24} + torch.npu.set_device_limit(torch.npu.current_device(), 12, 24) + self.assertEqual(ans_dict, torch.npu.get_device_limit(torch.npu.current_device())) class TorchNPUMemoryApiTestCase(TestCase): def test_npu_memory_stats(self): diff --git a/test/npu/test_unsupport_api.py b/test/npu/test_unsupport_api.py index 8883f3eb06e54a587d3e960b08fa262fdaef6494..54af07e0b21e3b5d86f3c01c4ae1cb06734a4a04 100644 --- a/test/npu/test_unsupport_api.py +++ b/test/npu/test_unsupport_api.py @@ -67,16 +67,6 @@ class TestPtaUnsupportApi(TestCase): coalesce_tensor = sparse_tensor.coalesce().npu() coalesce_tensor.ccol_indices() - def test_Tensor_is_shared_runtimeerror(self): - with self.assertRaisesRegex(RuntimeError, r"(.*) is not supported in npu."): - input_tensor = torch.tensor([1, 2, 3]).npu() - input_tensor.is_shared() - - def test_Tensor_share_memory__runtimeerror(self): - with self.assertRaisesRegex(RuntimeError, r"(.*) is not supported in npu."): - input_tensor = torch.tensor([1, 2, 3]).npu() - input_tensor.share_memory_() - def test_Module_share_memory_runtimeerror(self): with self.assertRaisesRegex(RuntimeError, r"(.*) is not supported in npu."): model = SimpleModel().npu() diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index b42929fd6dfe02a69659a18f89c1a0cc387928c3..4be850975ba7e71fcd92089d7ad36d68efd38a68 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -1028,9 +1028,6 @@ "torch_npu.npu.check_uce_in_memory": { "signature": "(device_id)" }, - "torch_npu.npu.get_uce_addr": { - "signature": "()" - }, "torch_npu.npu.clear_npu_overflow_flag": { "signature": "()" }, @@ -2550,7 +2547,7 @@ "signature": "(*args, **kwargs)" }, "torch_npu.npu_format_cast": { - "signature": "(self, acl_format)" + "signature": "(self, acl_format, customize_dtype=None)" }, "torch_npu.npu_format_cast_": { "signature": "(*args, **kwargs)" @@ -2768,6 +2765,9 @@ "torch_npu.utils.set_thread_affinity": { "signature": "(core_range: List[int] = None)" }, + "torch_npu.utils.reset_thread_affinity": { + "signature": "()" + }, "torch_npu.dynamo.torchair.scope.npu_stream_switch": { "signature": "(stream_tag: str, stream_priority: int = 0)" }, @@ -2789,6 +2789,9 @@ "torch_npu.distributed.all_gather_into_tensor_uneven": { "signature": "(output, input, output_split_sizes=None, group=None, async_op=False)" }, + "torch_npu.multiprocessing.reductions.rebuild_npu_tensor": { + "signature": "(tensor_cls, tensor_size, tensor_stride, tensor_offset, storage_cls, dtype, storage_device, storage_handle, 
storage_size_bytes, storage_offset_bytes, requires_grad, ref_counter_handle, ref_counter_offset, event_handle, event_sync_required)" + }, "func: unsafe_empty_with_format": { "signature": "(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, bool keep_format=False) -> Tensor" }, @@ -2811,16 +2814,16 @@ "signature": "(int[] size, *, ScalarType? dtype=None, Device? device=None) -> Tensor" }, "func: npu_format_cast": { - "signature": "(Tensor self, int acl_format) -> Tensor" + "signature": "(Tensor self, int acl_format, int? customize_dtype=None) -> Tensor" }, "func: npu_format_cast_": { - "signature": "(Tensor(a!) self, Tensor src) -> Tensor(a!)" + "signature": "(Tensor(a!) self, Tensor src, int? customize_dtype=None) -> Tensor(a!)" }, "func: npu_format_cast_.acl_format": { - "signature": "(Tensor(a!) self, int acl_format) -> Tensor(a!)" + "signature": "(Tensor(a!) self, int acl_format, int? customize_dtype=None) -> Tensor(a!)" }, "func: npu_format_cast.Tensor": { - "signature": "(Tensor self, Tensor dst) -> Tensor" + "signature": "(Tensor self, Tensor dst, int? customize_dtype=None) -> Tensor" }, "func: npu_change_data_ptr": { "signature": "(Tensor dst, Tensor src, int index) -> int" @@ -2840,6 +2843,9 @@ "func: _npu_format_cast": { "signature": "(Tensor self, int acl_format) -> Tensor" }, + "func: _npu_format_cast.aclnn": { + "signature": "(Tensor self, int acl_format, int customize_dtype) -> Tensor" + }, "torch_c_func: torch_npu::init_npu(const c10::DeviceIndex device_index = 0)": { "signature": "(const c10::DeviceIndex device_index = 0) -> void", "file": "torch_npu/csrc/libs/init_npu.h" diff --git a/third_party/acl/inc/acl/acl.h b/third_party/acl/inc/acl/acl.h index 95abdb6368eaee10dd19e50d292554c708c84be3..a31b673d070888e47052e084704213c0268e8457 100755 --- a/third_party/acl/inc/acl/acl.h +++ b/third_party/acl/inc/acl/acl.h @@ -25,6 +25,7 @@ extern "C" { #define ACL_PATCH_VERSION 0 #define ACL_PKG_VERSION_MAX_SIZE 128 #define ACL_PKG_VERSION_PARTS_MAX_SIZE 64 +#define ACL_IPC_HANDLE_SIZE 65 /** * @ingroup AscendCL diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h index b8ef9dbd34075370416a049efff05be7b4c110df..7d592db6ed09d4e53ad5df29773b0935fcfc5515 100755 --- a/third_party/acl/inc/acl/acl_base.h +++ b/third_party/acl/inc/acl/acl_base.h @@ -48,6 +48,7 @@ extern "C" { typedef void *aclrtStream; typedef void *aclrtEvent; typedef void *aclrtContext; +typedef void *aclrtNotify; typedef int aclError; typedef uint16_t aclFloat16; typedef struct aclDataBuffer aclDataBuffer; @@ -183,6 +184,8 @@ typedef enum { ACL_FRACTAL_Z_3D = 33, ACL_FORMAT_NC = 35, ACL_FORMAT_NCL = 47, + ACL_FORMAT_FRACTAL_NZ_C0_16 = 50, + ACL_FORMAT_FRACTAL_NZ_C0_32 = 51, } aclFormat; typedef enum { diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index 98b520ba4ac73a4b5072d98fd436edde37b51655..ecc36f38128bd746bc9f9cb5064e6f47f9bc5b6a 100755 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -181,6 +181,11 @@ typedef enum aclrtLastErrLevel { ACL_RT_THREAD_LEVEL = 0, } aclrtLastErrLevel; +typedef enum { + ACL_RT_DEV_RES_CUBE_CORE = 0, + ACL_RT_DEV_RES_VECTOR_CORE, +} aclrtDevResModelType; + typedef void* aclrtDrvMemHandle; typedef void (*aclrtCallback)(void *userData); @@ -1541,6 +1546,37 @@ ACL_FUNC_VISIBILITY aclError aclrtPeekAtLastError(aclrtLastErrLevel level); */ ACL_FUNC_VISIBILITY aclError aclrtGetLastError(aclrtLastErrLevel level); +/** + * @ingroup 
AscendCL + * @brief Get the value of the current device's limited resources + * @param [in] deviceId the device id + * @param [in] type resources type + * @param [out] value resources limit value + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtGetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t* value); + +/** + * @ingroup AscendCL + * @brief Set the value of the current device's limited resources + * @param [in] deviceId the device id + * @param [in] type resource type + * @param [in] value resource limit value + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtSetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t value); + +/** + * @ingroup AscendCL + * @brief Reset the value of the current device's limited resources + * @param [in] deviceId the device id + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtResetDeviceResLimit(int32_t deviceId); + #ifdef __cplusplus } #endif diff --git a/third_party/acl/libs/acl.cpp b/third_party/acl/libs/acl.cpp index 4f24e6bf043cba7c53c7015e597f5c6e82164bd6..9bb32581dd7ea6ca7d1b5fe01c7896dfb7d84764 100644 --- a/third_party/acl/libs/acl.cpp +++ b/third_party/acl/libs/acl.cpp @@ -18,6 +18,9 @@ aclError aclmdlSetDump(const char *configPath){return 0;} aclError aclmdlInitDump(){return 0;} aclError aclmdlFinalizeDump(){return 0;} aclError aclrtDeviceTaskAbort(int32_t deviceId, uint32_t timeout){return 0;} +aclError aclrtGetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t* value){return 0;} +aclError aclrtSetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t value){return 0;} +aclError aclrtResetDeviceResLimit(int32_t deviceId){return 0;} // Stream aclError aclrtCreateStream(aclrtStream *stream) { return 0; } diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index 023914a348285ad17c459b077cdd03c4593637ea..216ef7a83847e424ee1b0679b351d188452a2981 100644 --- a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -212,6 +212,8 @@ inline void HcclCommConfigInit(HcclCommConfig *config) config->hcclRdmaTrafficClass = HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET; config->hcclRdmaServiceLevel = HCCL_COMM_SERVICE_LEVEL_CONFIG_NOT_SET; config->hcclOpExpansionMode = HCCL_COMM_DEFAULT_OP_EXPANSION_MODE; + config->hcclWorldRankID = 0; + config->hcclJobID = 0; } /** diff --git a/third_party/hccl/inc/hccl/hccl_types.h b/third_party/hccl/inc/hccl/hccl_types.h index 40631676c1bdc9bb44256b083e647e99e8f6fc8f..9a02c61c0414a96af23bf2468ab96482512240fa 100644 --- a/third_party/hccl/inc/hccl/hccl_types.h +++ b/third_party/hccl/inc/hccl/hccl_types.h @@ -15,7 +15,7 @@ extern "C" { const uint32_t HCCL_COMM_CONFIG_INFO_BYTES = 24; const uint32_t HCCL_COMM_CONFIG_MAGIC_WORD = 0xf0f0f0f0; -const uint32_t HCCL_COMM_CONFIG_VERSION = 5; +const uint32_t HCCL_COMM_CONFIG_VERSION = 6; const uint32_t HCCL_COMM_DEFAULT_BUFFSIZE = 200; // 200MB buffer size const uint32_t HCCL_COMM_DEFAULT_DETERMINISTIC = 0; // Disable deterministic calculations const uint32_t COMM_NAME_MAX_LENGTH = 128; @@ -132,6 +132,8 @@ typedef struct HcclCommConfigDef { uint32_t hcclOpExpansionMode; uint32_t hcclRdmaTrafficClass; uint32_t hcclRdmaServiceLevel; + uint32_t hcclWorldRankID; + uint64_t hcclJobID; } HcclCommConfig; typedef enum { diff --git 
a/third_party/op-plugin b/third_party/op-plugin index 680dea4984135de69dc1ee031e08942c4049fa72..c94178b515bd4c1cc88f6598a72c7d019fa10b7a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 680dea4984135de69dc1ee031e08942c4049fa72 +Subproject commit c94178b515bd4c1cc88f6598a72c7d019fa10b7a diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index fe8e22ad3a347ab8116eb61ff44450ef1fb07f91..9818eff91d926398e6bc2a733d044efe21629477 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit fe8e22ad3a347ab8116eb61ff44450ef1fb07f91 +Subproject commit 9818eff91d926398e6bc2a733d044efe21629477 diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index d84f72b37a3dd1069eb06d1b43a64ff4604d9902..10e15a15224e216ee116060b6838936b4a8d2712 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -75,6 +75,7 @@ from torch_npu.utils import _apply_module_patch, _add_tensor_methods, _add_colle _apply_npu_show_warning from torch_npu.utils._dynamo_device import _dynamo_register_interface_for_device from torch_npu.npu._stream_check import apply_sanitizer_patch +from torch_npu.npu._format import _apply_npu_format_patch import torch_npu.utils.custom_ops import torch_npu.distributed.rpc import torch_npu.op_plugin @@ -83,6 +84,7 @@ from torch_npu.distributed.rpc.backend_registry import _rpc_backend_registry from torch_npu.utils import _cann_package_check, _add_intercept_methods from torch_npu.utils import _register_ops_under_dtensor_rules from torch_npu.utils.exposed_api import public_npu_functions +from torch_npu.multiprocessing.reductions import _add_reductions_methods from torch_npu.npu.utils import _erase_stream as erase_stream from torch_npu.utils._error_code import ErrCode, pta_error, _except_handler from torch_npu.asd.asd import _asd_patch @@ -113,6 +115,7 @@ for name in dir(torch.ops.npu): __all__.append(name) setattr(torch, name, _wrap_torch_error_func(getattr(torch.ops.npu, name))) + all_monkey_patches = [ ["nn.functional", npu_functional], ["nn", npu_modules], @@ -171,6 +174,8 @@ def _apply_class_patches(): add_perf_dump_patch() _apply_distributed_methods_patch() _apply_mstx_patch() + _add_reductions_methods() + _apply_npu_format_patch() def _apply_distributed_methods_patch(): @@ -193,6 +198,7 @@ torch._register_device_module('npu', torch_npu.npu) unsupported_dtype = [torch.quint8, torch.quint4x2, torch.quint2x4, torch.qint32, torch.qint8] torch.utils.generate_methods_for_privateuse1_backend(for_tensor=True, for_module=True, for_storage=True, unsupported_dtype=unsupported_dtype) +torch.nn.parameter.UninitializedTensorMixin._allowed_methods.append(torch.Tensor.npu) # Apply monkey-patches. 
_apply_patches(all_monkey_patches) diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index 41e84feded80fa570ee6639c29c614e098e35233..47486142336c06040d66792eb2f399285f85635b 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -1,5 +1,5 @@ import os -from functools import wraps +from functools import wraps, partial import logging import time import warnings @@ -16,8 +16,6 @@ from ._silent_fault_data import SilentFaultData, SilentFaultDataV2 __all__ = [] -original_matmul = torch.matmul -original_tensor_matmul = torch.Tensor.matmul loggerSilent = logging.getLogger("torch_npu.silent_check") @@ -314,17 +312,11 @@ class _MatmulSilentCheck: self.checksum_result = None self.checksum_state = None self.checksum_state_thread_running = False - self.checksum_state_thread = threading.Thread( - target=self._tcp_comm_checksum_state, - daemon=True - ) + self.checksum_state_thread = None # Use another thread to receive the statistic value and detect SDC self.check_thread_running = False - self.check_thread = threading.Thread( - target=self._async_detect, - daemon=True - ) - self.lock = threading.Lock() + self.check_thread = None + self._lock = None self.queue_len = 1024 self.statistic_cpu_value = None self.name_list = ["" for _ in range(self.queue_len)] @@ -409,7 +401,13 @@ class _MatmulSilentCheck: def get_grad_sample_interval(self): return self.filter_interval - + + @property + def lock(self): + if self._lock is None: + self._lock = threading.Lock() + return self._lock + def init_stream(self): if self.statistic_cpu_value is None: self.statistic_value = torch.tensor(0., device=f"npu:{torch_npu.npu.current_device()}") @@ -431,7 +429,8 @@ class _MatmulSilentCheck: def register_module_hook(self, module, name): self.check_stat[name + "_backward"] = {'avg': 0, 'pre_val': 0, 'step': 0, 'none_zero_step': 0} - self.hook_dict[name + "_backward"] = module.register_full_backward_hook(lambda module, grad_input, grad_output, n=name + "_backward": self.module_hook(module, grad_input, grad_output, n)) + hook = partial(self.module_hook, name=name + "_backward") + self.hook_dict[name + "_backward"] = module.register_full_backward_hook(hook) self.registered_modules.append(name) def module_hook(self, module, grad_input, grad_output, name): @@ -472,6 +471,8 @@ class _MatmulSilentCheck: if hasattr(torch, "npu") and torch.npu.is_initialized() and torch.distributed.is_initialized(): break time.sleep(10) + if not self.check_thread_running: + return local_rank = os.getenv("LOCAL_RANK", "-1") if local_rank.isdigit(): torch.npu.set_device(int(local_rank)) @@ -481,7 +482,7 @@ class _MatmulSilentCheck: val = self.statistic_cpu_value[self.head_index].item() name = self.name_list[self.head_index] while val != -1 and name != "": - loggerSilent.debug(f"[silent data] name:{name}, val: {val}, pre_val: {self.check_stat[name]['pre_val']}, avg: {self.check_stat[name]['avg']}, step: {self.check_stat[name]['step']}, none_zero_step: {self.check_stat[name]['none_zero_step']}") + loggerSilent.debug(f"[silent data] name:{name}, val: {val}, pre_val: {self.check_stat[name]['pre_val']}, avg: {self.check_stat[name]['avg']}, bp time: {self.check_stat[name]['step']}, none_zero_step: {self.check_stat[name]['none_zero_step']}") result, self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'] = self._silent_check( val, self.check_stat[name]['pre_val'], self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'], self.upper_thresh1, self.upper_thresh2 @@ -604,21 +605,21 @@ class _MatmulSilentCheck: def 
_generate_event_log(self, new_abnormal): info_str = f"[Event][{new_abnormal['time_str']}] [Rank {new_abnormal['rank']}]: A grad-norm spike may happen, " info_str = info_str + f"param name {new_abnormal['name']}, abnormal value {new_abnormal['val']}, previous value {new_abnormal['pre_val']}, " - info_str = info_str + f"history avg {new_abnormal['avg']}, step {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." + info_str = info_str + f"history avg {new_abnormal['avg']}, bp time {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." loggerSilent.info(info_str) if self.store is not None and self.rank is not None and self.rank != 0: current_log = self.store.get(f"rank_{self.rank}_info_log").decode() self.store.set(f"rank_{self.rank}_info_log", current_log + "\n" + info_str if current_log != "" else info_str) def _generate_warning_log(self, counting_abnormal_pos, new_abnormal): - warning_str = f"[Warning][{new_abnormal['time_str']}] [Rank {new_abnormal['rank']}]: Training instability happens, feature detection detects abnormal results!" + warning_str = f"[Warning][{new_abnormal['time_str']}] [Rank {new_abnormal['rank']}]: feature detection detects abnormal results!" index = 0 for pos in reversed(counting_abnormal_pos): warning_str = warning_str + "\n" + f"Grad-norm spike: index {index}, time {self.history_abnormal_list[pos]['time_str']}, param name {self.history_abnormal_list[pos]['name']}, abnormal value {self.history_abnormal_list[pos]['val']}, previous value {self.history_abnormal_list[pos]['pre_val']}, " - warning_str = warning_str + f"history avg {self.history_abnormal_list[pos]['avg']}, step {self.history_abnormal_list[pos]['step']}, normal count {self.history_abnormal_list[pos]['none_zero_step']}." + warning_str = warning_str + f"history avg {self.history_abnormal_list[pos]['avg']}, bp time {self.history_abnormal_list[pos]['step']}, normal count {self.history_abnormal_list[pos]['none_zero_step']}." index += 1 warning_str = warning_str + "\n" + f"Grad-norm spike: index {index}, time {new_abnormal['time_str']}, param name {new_abnormal['name']}, abnormal value {new_abnormal['val']}, previous value {new_abnormal['pre_val']}, " - warning_str = warning_str + f"history avg {new_abnormal['avg']}, step {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." + warning_str = warning_str + f"history avg {new_abnormal['avg']}, bp time {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." 
loggerSilent.warning(warning_str) if self.store is not None and self.rank is not None and self.rank != 0: current_log = self.store.get(f"rank_{self.rank}_warn_log").decode() @@ -636,6 +637,8 @@ class _MatmulSilentCheck: if hasattr(torch, "npu") and torch.npu.is_initialized() and torch.distributed.is_initialized() and self.store is not None: break time.sleep(10) + if not self.checksum_state_thread_running: + return local_rank = os.getenv("LOCAL_RANK", "-1") self.rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() @@ -673,7 +676,7 @@ class _MatmulSilentCheck: if global_state: now_time = time.time() if last_checksum_time is None or abs(now_time - last_checksum_time) > self.checksum_cooldown * 60: - loggerSilent.info(f'[Info] Rank {self.rank}: Training instability happened, checksum is on.') + loggerSilent.info(f'[Info] Rank {self.rank}: feature detection detects abnormal results, checksum is on.') last_checksum_time = now_time if self.checksum_result is None: self.checksum_result = torch.tensor(False, dtype=torch.bool, device='npu') @@ -697,14 +700,44 @@ class _MatmulSilentCheck: time.sleep(10) + def __getstate__(self): + self._cleanup() + state = self.__dict__.copy() + state['_lock'] = None + state['store'] = None + return state + + def __setstate(self, state): + self.__dict__.update(state) + self.store = None + + def _startup(self): + if not self.check_thread_running: + self.check_thread_running = True + self.check_thread = threading.Thread( + target=self._async_detect, + daemon=True + ) + self.check_thread.start() + + if not self.checksum_state_thread_running: + self.checksum_state_thread_running = True + self.checksum_state_thread = threading.Thread( + target=self._tcp_comm_checksum_state, + daemon=True + ) + self.checksum_state_thread.start() + def _cleanup(self): if self.check_thread_running: self.check_thread_running = False self.check_thread.join() + self.check_thread = None if self.checksum_state_thread_running: self.checksum_state_thread_running = False self.checksum_state_thread.join() + self.checksum_state_thread = None matmul_check = _MatmulSilentCheck() @@ -747,15 +780,10 @@ def _matmul_silent_check_decorator(func): matmul_check.init_module_info(id(self), self.training) self.matmul_check_outer = True - if not matmul_check.check_thread_running: - matmul_check.check_thread_running = True - matmul_check.check_thread.start() - - # 2 for checksum - if not matmul_check.checksum_state_thread_running: - matmul_check.checksum_state_thread_running = True - matmul_check.checksum_state_thread.start() + matmul_check._startup() if matmul_check.with_checksum and not matmul_check.matmul_trigger: + original_matmul = torch.matmul + original_tensor_matmul = torch.Tensor.matmul torch_npu.asd.checksum.matmul = original_matmul torch.matmul = _trigger_matmul_decorator(original_matmul) torch.Tensor.matmul = _trigger_tensor_matmul_decorator(original_tensor_matmul) diff --git a/torch_npu/asd/checksum.py b/torch_npu/asd/checksum.py index cc6832f39832ac1be197a6cd5b5e393d52aef4dc..a9576675cb071d5d71e03843773026d62a6a8345 100644 --- a/torch_npu/asd/checksum.py +++ b/torch_npu/asd/checksum.py @@ -41,5 +41,10 @@ def _matmul_checksum(a, b, c): error_total = (c_ele_round_error_accum).to(torch.float) error = torch.abs(c_sum - c1_trans) - flag = (error - error_total) > 1e-20 - return torch.any(flag) + flag = (error - 5 * error_total) > 5 * 1e-20 + any_flag = torch.any(flag) + if any_flag: + matmul(a, b, out=c) + c_mean2 = torch.mean(torch.abs(c), dim=-1) + return torch.any(c_mean != 
c_mean2) + return any_flag diff --git a/torch_npu/contrib/function/roll.py b/torch_npu/contrib/function/roll.py index 97037c8d0754427942742c28de5ba0f6568ee3ac..550064e693c5ce5fc1110ef7f2b666ceb739a531 100644 --- a/torch_npu/contrib/function/roll.py +++ b/torch_npu/contrib/function/roll.py @@ -30,7 +30,7 @@ _roll_with_index_select = _RollWithIndexSelect.apply def _get_roll_index(H, W, shifts, device='cpu'): index = torch.arange(0, H * W).reshape(H, W) index_fp = torch.roll(index, shifts=shifts, dims=(0, 1)).reshape(-1).long() - index_bp_dict = {i:idx for idx, i in enumerate(index_fp.numpy().tolist())} + index_bp_dict = {i: idx for idx, i in enumerate(index_fp.numpy().tolist())} index_bp_list = [index_bp_dict[i] for i in range(H * W)] index_bp = torch.LongTensor(index_bp_list) return [index_fp.to(device), index_bp.to(device)] diff --git a/torch_npu/contrib/transfer_to_npu.py b/torch_npu/contrib/transfer_to_npu.py index 5a8a355d1a5aac1bc102ded8893bfd9b23b8457e..e7e6014c4f824a5d3d98d5fb1a73b27748957520 100644 --- a/torch_npu/contrib/transfer_to_npu.py +++ b/torch_npu/contrib/transfer_to_npu.py @@ -28,7 +28,7 @@ torch_fn_white_list = ['logspace', 'randint', 'hann_window', 'rand', 'full_like' 'eye', '_sparse_csr_tensor_unsafe', 'empty', '_sparse_coo_tensor_unsafe', 'blackman_window', 'zeros_like', 'range', 'sparse_csr_tensor', 'randn_like', 'from_file', '_cudnn_init_dropout_state', '_empty_affine_quantized', 'linspace', 'hamming_window', - 'empty_quantized', '_pin_memory', 'autocast', 'load', "Generator", 'set_default_device'] + 'empty_quantized', '_pin_memory', 'autocast', 'load', 'set_default_device'] torch_tensor_fn_white_list = ['new_empty', 'new_empty_strided', 'new_full', 'new_ones', 'new_tensor', 'new_zeros', 'to', 'pin_memory'] torch_module_fn_white_list = ['to', 'to_empty'] @@ -45,6 +45,14 @@ cur_path = os.path.dirname(os.path.realpath(__file__)) config_path = os.path.join(cur_path, 'apis_config.json') +class _GeneratorProxy(torch.Generator): + + def __new__(cls, device='cpu'): + device = _replace_cuda_to_npu_in_list([device], None)[0] + instance = super().__new__(cls, device) + return instance + + def _get_function_from_string(attribute_string): try: module_path, _, attr_name = attribute_string.rpartition('.') @@ -331,6 +339,7 @@ def _init(): # torch.* _device_wrapper(torch, torch_fn_white_list) torch.UntypedStorage.__new__ = _wrapper_cuda(torch.UntypedStorage.__new__) + torch.Generator = _GeneratorProxy # torch.Tensor.* _device_wrapper(torch.Tensor, torch_tensor_fn_white_list) @@ -350,6 +359,8 @@ def _init(): _wrapper_cuda(torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel.__init__) _del_nccl_device_backend_map() torch.distributed.device_mesh.init_device_mesh = _wrapper_cuda(torch.distributed.device_mesh.init_device_mesh) + torch.distributed.distributed_c10d._new_group_with_tag = _wrapper_hccl( + torch.distributed.distributed_c10d._new_group_with_tag) # CUDAGraph torch.cuda.CUDAGraph = torch.npu.NPUGraph diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index c8084af923b506b9fd08466c21dfcaa73b68b79e..672b5289f5c8580ef21add7c8fafd824bd92bc86 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -15,7 +15,9 @@ #include "torch_npu/csrc/profiler/init.h" #include "torch_npu/csrc/flopcount/Init.h" #include "torch_npu/csrc/logging/Init.h" +#include "torch_npu/csrc/ipc/StorageSharing.h" #include "torch_npu/csrc/npu/Module.h" +#include "torch_npu/csrc/custom_dtype/Init.h" #include 
"torch_npu/csrc/npu/Stress_detect.h" #include "torch_npu/csrc/utils/TensorType.h" #include "torch_npu/csrc/utils/AutocastMode.h" @@ -168,6 +170,8 @@ PyObject* initModule() AddPyMethodDefs(methods, torch_npu::autocast::autocast_mode_functions()); AddPyMethodDefs(methods, torch_npu::flopcount::flops_count_functions()); AddPyMethodDefs(methods, torch_npu::logging::logging_functions()); + AddPyMethodDefs(methods, torch_npu::reductions::reductions_functions()); + AddPyMethodDefs(methods, c10_npu::custom_dtype_functions()); static struct PyModuleDef torchnpu_module = { PyModuleDef_HEAD_INIT, "torch_npu._C", diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp index 0c4000e52458682f8fa8ede2cde47b6421637d5d..2c35eaf44c791e9351d132a8d5db1944a41b0bcd 100644 --- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp @@ -1,16 +1,131 @@ #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/framework/utils/OpAdapter.h" #include "torch_npu/csrc/framework/utils/NpuStorageOffsetGuard.h" +#include "torch_npu/csrc/framework/StorageDescHelper.h" #include "torch_npu/csrc/aten/common/FormatCastHelper.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" +#include "torch_npu/csrc/core/npu/NpuVariables.h" #include "torch_npu/csrc/aten/CustomFunctions.h" +#include "torch_npu/csrc/custom_dtype/Init.h" +#include "third_party/op-plugin/op_plugin/utils/op_api_common.h" namespace at_npu { namespace native { using tensor_list = std::vector; +using GetFormatFunc = int (*)(const aclTensor *, const int, const int, int64_t **, uint64_t *, int *); + +std::tuple> MaybeUseAclnnNpuFormatCast(const at::Tensor& src, + int64_t acl_format, c10::optional customize_dtype) +{ + const static auto GetFormatFuncAddr = GetOpApiFuncAddr("aclnnNpuFormatCastCalculateSizeAndFormat"); + const static auto FormatCastFuncAddr = GetOpApiFuncAddr("aclnnNpuFormatCast"); + + const static bool aclnnNpuFormatCastExist = + (GetFormatFuncAddr == nullptr || FormatCastFuncAddr == nullptr) ? false : true; + + GetFormatFunc GetFormat = reinterpret_cast(GetFormatFuncAddr); + int64_t *dstStorageShape = nullptr; + uint64_t dstShapeSize = 0; + int dstFormat; + at::SmallVector outputShape = {}; + aclDataType customizeAcltype = (customize_dtype.has_value()) ? 
+ c10_npu::GetAclDataType(customize_dtype.value()) : + at_npu::native::OpPreparation::convert_to_acl_data_type(src.scalar_type()); + + if (c10_npu::IsAclnnOnly()) { + if (aclnnNpuFormatCastExist) { + auto api_ret = GetFormat(ConvertType(src), acl_format, customizeAcltype, &dstStorageShape, + &dstShapeSize, &dstFormat); + NPU_CHECK_ERROR(api_ret, "aclnnNpuFormatCastCalculateSizeAndFormat"); + for (uint64_t i = 0; i < dstShapeSize; i++) { + outputShape.push_back(dstStorageShape[i]); + } + delete[] dstStorageShape; + return std::make_tuple(true, dstFormat, outputShape); + } + TORCH_CHECK(false, + "aclnnNpuFormatCast does not exist, Current device only support aclnn operators.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + } + if (at_npu::native::env::CheckJitDisable()) { + if (aclnnNpuFormatCastExist) { + auto api_ret = GetFormat(ConvertType(src), acl_format, customizeAcltype, &dstStorageShape, + &dstShapeSize, &dstFormat); + if (api_ret != 0) { + if (customize_dtype.has_value()) { + NPU_CHECK_ERROR(api_ret, "aclnnNpuFormatCastCalculateSizeAndFormat"); + } + return std::make_tuple(false, dstFormat, outputShape); + } + for (uint64_t i = 0; i < dstShapeSize; i++) { + outputShape.push_back(dstStorageShape[i]); + } + delete[] dstStorageShape; + return std::make_tuple(true, dstFormat, outputShape); + } else { + if (C10_UNLIKELY(customize_dtype.has_value())) { + TORCH_CHECK(false, + "customize_dtype is not supported while aclnnNpuFormatCast does not exist.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + } + return std::make_tuple(false, dstFormat, outputShape); + } + } else { + if (C10_UNLIKELY(customize_dtype.has_value())) { + TORCH_CHECK(false, + "customize_dtype is not supported while jit_compile=True.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + } + return std::make_tuple(false, dstFormat, outputShape); + } +} + +at::Tensor create_tensor_with_format_and_shape(c10::IntArrayRef baseSizes, + c10::IntArrayRef storageSizes, + const caffe2::TypeMeta dtype, int64_t acl_format) +{ + c10::Allocator *allocator = c10_npu::NPUCachingAllocator::get(); + int64_t nelements = 1; + for (const auto& num : storageSizes) { + nelements *= num; + } + int64_t size_bytes = nelements * dtype.itemsize(); + c10::intrusive_ptr storage_impl = torch_npu::make_npu_storage_impl( + c10::StorageImpl::use_byte_size_t(), + c10::SymInt(size_bytes), + allocator->allocate(size_bytes), + allocator, + true); + auto tensor = at::detail::make_tensor(storage_impl, dtype); + + if (baseSizes.size() != 1 || baseSizes[0] != 0) { + tensor.unsafeGetTensorImpl()->set_sizes_contiguous(baseSizes); + } + tensor.unsafeGetTensorImpl()->empty_tensor_restride(c10::MemoryFormat::Contiguous); + StorageDescHelper::SetDesc(tensor, baseSizes, storageSizes, tensor.strides(), static_cast(acl_format)); + return tensor; +} + +at::Tensor format_cast_impl_out_npu_aclnn(const at::Tensor& src, + int64_t acl_format, c10::IntArrayRef storageSizes) +{ + auto src_new = src.contiguous(); + auto src_new_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src_new)->npu_desc_; + + at::Tensor dst = create_tensor_with_format_and_shape( + src_new.sizes(), storageSizes, src.dtype(), acl_format); + + // calculate the output result of the NPU + EXEC_NPU_CMD(aclnnNpuFormatCast, src_new, dst); + + // format cast only change physical layout of base tensor and view tensor's + // metadata remain unchanged + dst.set_(dst.storage(), src_new.storage_offset(), src_new.sizes(), src_new.strides()); + return dst; +} at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src) { @@ -36,7 +151,8 @@ 
at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src) } // convert src from src_format to dst_format, write the result into dst(self) -at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Tensor& src) +at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Tensor& src, + c10::optional customize_dtype) { torch_npu::utils::torch_check_npu(self); torch_npu::utils::torch_check_npu(src); @@ -47,6 +163,13 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Ten return self; } + auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, dst_desc.npu_format_, customize_dtype); + if (useAclnn == true) { + at::Tensor dst = format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape); + self.set_(dst.storage(), dst.storage_offset(), dst.sizes(), dst.strides()); + return self; + } + // calculate the output result of the NPU format_cast_impl_out_npu(self, src); @@ -59,16 +182,6 @@ at::Tensor npu_format_cast_impl( int64_t acl_format) { auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; - if (src_desc.npu_format_ == acl_format) { - ASCEND_LOGD("no need to do format cast"); - return src; - } - if (FormatHelper::IsBaseFormatType(src) && - FormatHelper::IsBaseFormatType(static_cast(acl_format))) { - FormatCastHelper::format_cast_as_base_format(src, static_cast(acl_format)); - return src; - } - at::Tensor dst = OpPreparation::ApplyTensorWithFormat( src_desc.base_sizes_, src.options(), acl_format); @@ -84,18 +197,20 @@ at::Tensor npu_format_cast_impl( // conver self to dst'format, write the result into new result tensor at::Tensor NPUNativeFunctions::npu_format_cast( const at::Tensor& self, - const at::Tensor& dst) + const at::Tensor& dst, + c10::optional customize_dtype) { torch_npu::utils::torch_check_npu(dst); auto dst_desc = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_; int64_t dst_format = dst_desc.npu_format_; - return custom_ops::npu_format_cast(self, dst_format); + return custom_ops::npu_format_cast(self, dst_format, customize_dtype); } // conver self to acl_format, write the result into self at::Tensor& NPUNativeFunctions::npu_format_cast_( at::Tensor& self, - int64_t acl_format) + int64_t acl_format, + c10::optional customize_dtype) { torch_npu::utils::torch_check_npu(self); auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_; @@ -108,6 +223,13 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_( return self; } + auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, customize_dtype); + if (useAclnn == true) { + at::Tensor dst = format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape); + self.set_(dst.storage(), dst.storage_offset(), dst.sizes(), dst.strides()); + return self; + } + at::Tensor dst = OpPreparation::ApplyTensorWithFormat( src_desc.base_sizes_, self.options(), acl_format); @@ -130,16 +252,54 @@ int64_t NPUNativeFunctions::get_npu_format(const at::Tensor& self) at::Tensor NPUNativeFunctions::_npu_format_cast(const at::Tensor& self, int64_t acl_format) { - return npu_format_cast_impl(self, acl_format); + auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_; + if (src_desc.npu_format_ == acl_format) { + ASCEND_LOGD("no need to do format cast"); + return self; + } + if (FormatHelper::IsBaseFormatType(self) && + FormatHelper::IsBaseFormatType(static_cast(acl_format))) { + FormatCastHelper::format_cast_as_base_format(self, static_cast(acl_format)); + return self; + } + 
auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, c10::nullopt); + if (useAclnn == false) { + return npu_format_cast_impl(self, acl_format); + } + return format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape); +} + +at::Tensor NPUNativeFunctions::_npu_format_cast(const at::Tensor& self, int64_t acl_format, + int64_t customize_dtype) +{ + auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_; + if (src_desc.npu_format_ == acl_format) { + ASCEND_LOGD("no need to do format cast"); + return self; + } + if (FormatHelper::IsBaseFormatType(self) && + FormatHelper::IsBaseFormatType(static_cast(acl_format))) { + FormatCastHelper::format_cast_as_base_format(self, static_cast(acl_format)); + return self; + } + auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, customize_dtype); + if (useAclnn == false) { + return npu_format_cast_impl(self, acl_format); + } + return format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape); } -at::Tensor NPUNativeFunctions::npu_format_cast(const at::Tensor& self, int64_t acl_format) +at::Tensor NPUNativeFunctions::npu_format_cast(const at::Tensor& self, int64_t acl_format, + c10::optional customize_dtype) { torch_npu::utils::torch_check_npu(self); if (NPUNativeFunctions::get_npu_format(self) == acl_format) { ASCEND_LOGD("no need to do format cast"); return self; } + if (customize_dtype.has_value()) { + return custom_ops::_npu_format_cast(self, acl_format, customize_dtype.value()); + } return custom_ops::_npu_format_cast(self, acl_format); } diff --git a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp index 775d95cbfa597a61fcf71eca04008d8c21fd4e83..685f907653a96e2f36e6ee5c9ea4dc6344618cef 100644 --- a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp +++ b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp @@ -10,13 +10,34 @@ namespace at_npu { namespace native { +#define AT_DISPATCH_CASE_ALL_TYPES_AND5( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \ + AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__) + + +#define AT_DISPATCH_ALL_TYPES_AND5( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND5( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, __VA_ARGS__)) + + c10::Scalar NPUNativeFunctions::_local_scalar_dense(const at::Tensor& self) { c10::Scalar r; - AT_DISPATCH_ALL_TYPES_AND3( + AT_DISPATCH_ALL_TYPES_AND5( at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, + at::ScalarType::Float8_e5m2, + at::ScalarType::Float8_e4m3fn, self.scalar_type(), "_local_scalar_dense_npu", [&] { diff --git a/torch_npu/csrc/aten/common/ToKernelNpu.cpp b/torch_npu/csrc/aten/common/ToKernelNpu.cpp index 3b7a2245571eeadb800a93be4b1c105460a6b33a..143d40bf5e24f4c8ca5607fe158a53524672aa58 100644 --- a/torch_npu/csrc/aten/common/ToKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/ToKernelNpu.cpp @@ -162,11 +162,11 @@ at::Tensor NPUNativeFunctions::to( } if (dtype == at::ScalarType::Double) { TORCH_NPU_WARN_ONCE( - "Warning: Device do not support double dtype now, " - "dtype cast repalce with float."); + "Device does not support double dtype now, " + "dtype cast replaced with float."); } dtype = (dtype == at::ScalarType::Double) ? at::ScalarType::Float : dtype; - return custom_ops::npu_dtype_cast(self, dtype); + return custom_ops::_npu_dtype_cast(self, dtype); } at::Tensor NPUNativeFunctions::to( diff --git a/torch_npu/csrc/aten/common/from_blob.cpp b/torch_npu/csrc/aten/common/from_blob.cpp index 08f2e63fd20ad2fc219853ea86c42a185d6a9284..1363d69459a00888c0d2ae215f28afd4d47923ba 100644 --- a/torch_npu/csrc/aten/common/from_blob.cpp +++ b/torch_npu/csrc/aten/common/from_blob.cpp @@ -36,7 +36,12 @@ at::Tensor TensorMaker::make_tensor() std::size_t size_bytes = computeStorageSize(); - c10::DataPtr data_ptr{data_, *device_}; + c10::DataPtr data_ptr{}; + if (deleter_) { + data_ptr = c10::InefficientStdFunctionContext::makeDataPtr(data_, std::move(deleter_), *device_); + } else { + data_ptr = c10::DataPtr(data_, *device_); + } c10::intrusive_ptr storage_impl = torch_npu::make_npu_storage_impl( c10::StorageImpl::use_byte_size_t(), @@ -86,6 +91,54 @@ std::size_t TensorMaker::computeStorageSize() const noexcept return storage_size; } +at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + std::function deleter, + const at::TensorOptions& options, + const c10::optional target_device) +{ + return for_blob(data, sizes) + .deleter(std::move(deleter)) + .options(options) + .target_device(target_device) + .make_tensor(); +} + +at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + at::IntArrayRef strides, + int64_t storage_offset, + const std::function& deleter, + const at::TensorOptions& options, + const c10::optional target_device) +{ + return for_blob(data, sizes) + .strides(strides) + .storage_offset(storage_offset) + .deleter(deleter) + .options(options) + .target_device(target_device) + .make_tensor(); +} + +at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + at::IntArrayRef strides, + const std::function& deleter, + const at::TensorOptions& options, + const c10::optional target_device) +{ + return for_blob(data, sizes) + .strides(strides) + .deleter(deleter) + .options(options) + .target_device(target_device) + .make_tensor(); +} + at::Tensor from_blob( void* data, at::IntArrayRef sizes, diff --git a/torch_npu/csrc/aten/common/from_blob.h b/torch_npu/csrc/aten/common/from_blob.h index f0d6bbd12700ec295d322762febe80070286bb43..0669d2fdca08965e9797918b35d83b185ef1272e 100644 --- a/torch_npu/csrc/aten/common/from_blob.h +++ b/torch_npu/csrc/aten/common/from_blob.h @@ -41,6 +41,12
@@ public: return *this; } + TensorMaker& deleter(std::function value) noexcept + { + deleter_ = std::move(value); + + return *this; + } at::Tensor make_tensor(); private: @@ -58,6 +64,7 @@ private: c10::optional device_{}; at::TensorOptions opts_{}; c10::Allocator* allocator_{}; + std::function deleter_{}; }; inline TensorMaker for_blob(void* data, at::IntArrayRef sizes) noexcept @@ -65,6 +72,30 @@ inline TensorMaker for_blob(void* data, at::IntArrayRef sizes) noexcept return TensorMaker{data, sizes}; } +TORCH_NPU_API at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + std::function deleter, + const at::TensorOptions& options = {}, + const c10::optional target_device = c10::nullopt); + +TORCH_NPU_API at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + at::IntArrayRef strides, + int64_t storage_offset, + const std::function& deleter, + const at::TensorOptions& options = {}, + const c10::optional target_device = c10::nullopt); + +TORCH_NPU_API at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + at::IntArrayRef strides, + const std::function& deleter, + const at::TensorOptions& options = {}, + const c10::optional target_device = c10::nullopt); + TORCH_NPU_API at::Tensor from_blob( void* data, at::IntArrayRef sizes, diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml index 95bb740db159bef654fb063934f68344c1bf257e..b186df765181599eca85294f3343033c711f8a32 100644 --- a/torch_npu/csrc/aten/npu_native_functions.yaml +++ b/torch_npu/csrc/aten/npu_native_functions.yaml @@ -62,12 +62,12 @@ custom: - func: npu_change_data_ptr(Tensor dst, Tensor src, int index) -> int device_check: NoCheck - func: get_npu_format(Tensor self) -> int - - func: npu_format_cast.Tensor(Tensor self, Tensor dst) -> Tensor + - func: npu_format_cast.Tensor(Tensor self, Tensor dst, int? customize_dtype=None) -> Tensor device_check: NoCheck exposed: True - - func: npu_format_cast_.acl_format(Tensor(a!) self, int acl_format) -> Tensor(a!) + - func: npu_format_cast_.acl_format(Tensor(a!) self, int acl_format, int? customize_dtype=None) -> Tensor(a!) exposed: True - - func: npu_format_cast_(Tensor(a!) self, Tensor src) -> Tensor(a!) + - func: npu_format_cast_(Tensor(a!) self, Tensor src, int? customize_dtype=None) -> Tensor(a!) device_check: NoCheck exposed: True - func: empty_with_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, int? base_addr_aligned_kb=None) -> Tensor @@ -82,9 +82,10 @@ custom: - func: copy_memory_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) device_check: NoCheck - func: get_storage_size(Tensor self) -> int - - func: npu_format_cast(Tensor self, int acl_format) -> Tensor + - func: npu_format_cast(Tensor self, int acl_format, int? customize_dtype=None) -> Tensor exposed: True - func: _npu_format_cast(Tensor self, int acl_format) -> Tensor + - func: _npu_format_cast.aclnn(Tensor self, int acl_format, int customize_dtype) -> Tensor - func: empty_with_swapped_memory(int[] size, *, ScalarType? dtype=None, Device? 
device=None) -> Tensor dispatch: CompositeExplicitAutograd: empty_with_swapped_memory diff --git a/torch_npu/csrc/core/NPUStorageImpl.h b/torch_npu/csrc/core/NPUStorageImpl.h index bb8d5062dd1968730abbea9e720e31d14449789b..5bbaa0428c1d7cb9aa4b78601ebdfb714318e03e 100644 --- a/torch_npu/csrc/core/NPUStorageImpl.h +++ b/torch_npu/csrc/core/NPUStorageImpl.h @@ -24,7 +24,7 @@ public: aclFormat origin_format_ = ACL_FORMAT_UNDEFINED; aclFormat npu_format_ = ACL_FORMAT_ND; // used to make CANN GE tensor from storagImpl - caffe2::TypeMeta data_type_; + caffe2::TypeMeta data_type_ = caffe2::TypeMeta::Make(); }; struct NPUStorageImpl : public c10::StorageImpl { diff --git a/torch_npu/csrc/core/npu/CachingHostAllocator.cpp b/torch_npu/csrc/core/npu/CachingHostAllocator.cpp index f03bfdb05b05dd3754747cd0a13d2af411fafe6e..ca093bb83726a1d217a1f6f9bf764ad798155a8e 100644 --- a/torch_npu/csrc/core/npu/CachingHostAllocator.cpp +++ b/torch_npu/csrc/core/npu/CachingHostAllocator.cpp @@ -1,4 +1,5 @@ #include +#include #include "torch_npu/csrc/core/npu/npu_log.h" #include #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -133,14 +134,16 @@ struct HostAllocator { c10_npu::SetCurrentDevice(); } + // Round up the allocation to the nearest power of two to improve reuse. + size_t roundSize = c10::llvm::PowerOf2Ceil(size); // allocate a new block if no cached allocation is found - err = aclrtMallocHost(ptr, size); + err = aclrtMallocHost(ptr, roundSize); if (err != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); return err; } - blocks.insert({*ptr, Block(size, *ptr, true)}); + blocks.insert({*ptr, Block(roundSize, *ptr, true)}); return ACL_ERROR_NONE; } diff --git a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp index bcb89e6c88354a1a290b320c0d9003417b0339f7..c5f6f913d44632c09ebea3978b0f1bdbccbaf82a 100644 --- a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp +++ b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp @@ -93,8 +93,10 @@ void GetExclusiveAffinityCPU() offset = find_offset->second; } c10_npu::CoreIdRange cpu_range = parseAffinityCPU(affinity_cpu); - int length = (cpu_range.end - cpu_range.start + 1) / same_num; - c10_npu::CoreIdRange exclusiveAffinityCpu = {cpu_range.start + offset * length, (cpu_range.start + length - 1) + offset * length}; + unsigned int length = (cpu_range.end - cpu_range.start + 1) / static_cast(same_num); + c10_npu::CoreIdRange exclusiveAffinityCpu = { + cpu_range.start + static_cast(offset) * length, + (cpu_range.start + length - 1) + static_cast(offset) * length}; offsetMap[affinity_cpu] = offset + 1; CardIdAffinityCPU[card_id] = exclusiveAffinityCpu; } diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index a331439d9f22f4516d4e41ade170d7c0ee6eb584..6c2d35fd951aaae50b0293c0437db458a3896874 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -9,11 +9,16 @@ #include #include #include +#include namespace c10_npu { static thread_local ThreadType local_thread = ThreadType::MAIN_THREAD; +static pthread_t main_thread; +static bool start_main_thread_bind = false; +static std::mutex core_map_mutex; + using ThreadCoreMap = std::unordered_map; static uint32_t cpu_affinity_mode; @@ -28,8 +33,7 @@ const std::unordered_map threadTypeToNameMap = { {ACL_THREAD, "acl_thread"}, {RELEASE_THREAD, "release_thread"}, {WATCHDOG_THREAD, "hccl_watchdog_t"}, - {OTHER_THREAD, "other_thread"}, - 
{USER_THREAD, "user_thread"}}; + {OTHER_THREAD, "other_thread"}}; CoreIdRange getCPUDefaultRange(c10::DeviceIndex device_id) { @@ -147,7 +151,7 @@ void printCoreRanges(const uint32_t mode, const std::vector &ranges oss << "Mode: " << mode << ". Core range for each device ID: "; for (size_t i = 0; i < ranges.size(); ++i) { - oss << "Device " << i << ": [" << ranges[i].start << "," << ranges[i].end << "]"; + oss << "Device " << i << ": [" << ranges[i].start << ", " << ranges[i].end << "]"; if (i != ranges.size() - 1) { oss << "; "; } else { @@ -194,18 +198,18 @@ void SetThreadType(ThreadType type) return; } if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { - ASCEND_LOGW("Set thread name of %s failed!", threadTypeToNameMap.at(type).c_str()); + ASCEND_LOGW("Set thread name to %s failed!", threadTypeToNameMap.at(type).c_str()); } } std::string getAffinityMapAsString(c10::DeviceIndex device_id, const ThreadCoreMap &threadCoreMap) { std::ostringstream oss; - for (auto local_thread : threadTypeList) { - oss << threadTypeToNameMap.at(local_thread) << " : [" - << threadCoreMap.at(local_thread).start << "," - << threadCoreMap.at(local_thread).end << "]"; - if (local_thread != OTHER_THREAD) { + for (auto thread_type : threadTypeList) { + oss << threadTypeToNameMap.at(thread_type) << ": [" + << threadCoreMap.at(thread_type).start << ", " + << threadCoreMap.at(thread_type).end << "]"; + if (thread_type != OTHER_THREAD) { oss << "; "; } else { oss << "."; @@ -222,16 +226,16 @@ ThreadCoreMap getCpuAffinityMap(c10::DeviceIndex device_id, const std::vector lock(core_map_mutex); if (device_thread_core_maps.find(device_id) == device_thread_core_maps.end()) { device_thread_core_maps.emplace(device_id, getCpuAffinityMap(device_id, device_ranges)); } - core_range = device_thread_core_maps.at(device_id).at(local_thread); + core_range = device_thread_core_maps.at(device_id).at(type); } + return core_range; +} - cpu_set_t mask; - CPU_ZERO(&mask); - for (auto i = core_range.start; i <= core_range.end; i++) { - CPU_SET(i, &mask); +void SetThreadAffinity(c10::DeviceIndex device_id) +{ + if (!needToSetThreadAffinity() || local_thread == ThreadType::USER_THREAD) { + return; } - if (!pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask)) { + + CoreIdRange core_range = getCoreRange(device_id, local_thread); + if (setThreadAffinityImpl(pthread_self(), core_range)) { ASCEND_LOGD("Device %d set %s affinity to %d-%d success.", device_id, threadTypeToNameMap.at(local_thread).c_str(), core_range.start, core_range.end); } else { @@ -280,7 +299,10 @@ void SetThreadAffinity(ThreadType type) int device_index; NPU_CHECK_ERROR_WITHOUT_UCE(GetDevice(&device_index)); c10::DeviceIndex device = static_cast(device_index); - SetThreadType(type); + local_thread = type; + if (local_thread == ThreadType::MAIN_THREAD) { + start_main_thread_bind = true; + } SetThreadAffinity(device); } @@ -289,20 +311,55 @@ void SetThreadAffinity(int core_start, int core_end) if (!needToSetThreadAffinity()) { return; } + static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); - core_start = std::min(core_start, core_nums); - core_end = std::min(core_end, core_nums); + CoreIdRange core_range; + core_range.start = static_cast(std::min(core_start, core_nums)); + core_range.end = static_cast(std::min(core_end, core_nums)); local_thread = ThreadType::USER_THREAD; - cpu_set_t mask; - CPU_ZERO(&mask); - for (auto i = core_start; i <= core_end; i++) { - CPU_SET(i, &mask); - } - if (!pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask)) { - 
ASCEND_LOGD("Set %s affinity to %d-%d success.", threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); + if (setThreadAffinityImpl(pthread_self(), core_range)) { + ASCEND_LOGD("Set thread affinity to user-defined range %d-%d success.", core_range.start, core_range.end); } else { - ASCEND_LOGE("Set %s affinity to %d-%d failed.", threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); + ASCEND_LOGE("Set thread affinity to user-defined range %d-%d failed.", core_range.start, core_range.end); + } +} + +void SetMainThread() +{ + main_thread = pthread_self(); +} + +bool NeedMainThreadBind() +{ + return start_main_thread_bind && (local_thread == ThreadType::MAIN_THREAD); +} + +void StartMainThreadBind(c10::DeviceIndex device_id) +{ + if (!needToSetThreadAffinity() || local_thread == ThreadType::USER_THREAD) { + return; + } + + static thread_local bool seted = false; + if (!seted) { + seted = true; + if (syscall(SYS_gettid) != getpid()) { + start_main_thread_bind = true; + + SetThreadAffinity(device_id); + + CoreIdRange core_range = getCoreRange(device_id, ThreadType::MAIN_THREAD); + if (setThreadAffinityImpl(main_thread, core_range)) { + ASCEND_LOGD("Device %d set %s affinity to %d-%d success.", + device_id, threadTypeToNameMap.at(ThreadType::MAIN_THREAD).c_str(), + core_range.start, core_range.end); + } else { + ASCEND_LOGE("Device %d set %s affinity to %d-%d failed.", + device_id, threadTypeToNameMap.at(ThreadType::MAIN_THREAD).c_str(), + core_range.start, core_range.end); + } + } } } diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index 0ec3c4d995de635118e450c3553489facb1dc1a2..e850a47b67f3484ffafb56f0b4cc67b0eea0c0ee 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -20,9 +20,12 @@ enum ThreadType { }; void SetThreadType(ThreadType type); - void SetThreadAffinity(c10::DeviceIndex device); void SetThreadAffinity(ThreadType type); void SetThreadAffinity(int core_start, int core_end); +void SetMainThread(); +bool NeedMainThreadBind(); +void StartMainThreadBind(c10::DeviceIndex device_id); + } // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index c8575d5bc4fdfdbd1270bb4f3f16291b82b691d1..1691427e4602ce281a79344d3a5af0b4b8db29ec 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -24,6 +24,7 @@ #include "torch_npu/csrc/core/npu/GetCANNInfo.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/NPUEvent.h" +#include "torch_npu/csrc/core/npu/NPUIPCPidManager.h" #include "torch_npu/csrc/profiler/npu_profiler.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" @@ -100,6 +101,12 @@ const std::string kMinCannVersion = "8.1.RC1"; // minimum cann version wh const std::string kMinDriverVersion = "25.0.RC1"; // minimum driver version which supports 1g mem 25.0.RC1 const std::string kCannModule = "CANN"; // cann module name +static char SHAREABLE_HANDLE_VERSION = 1; +enum ShareableHandleType : char { + SHAREABLE_NPU_MALLOC = 'c', + SHAREABLE_NPU_EXPANDABLE_SEGMENT = 'e' +}; + using StatTypes = std::array(StatType::NUM_TYPES)>; void update_stat(Stat &stat, int64_t amount) @@ -355,7 +362,10 @@ bevhavior for allocator tensors that need to be used cross-process. 
*/ struct ExpandableSegment { - ExpandableSegment(int device, aclrtStream stream, size_t size) + ExpandableSegment( + int device, + std::optional stream, + size_t size) : device_(device), stream_(stream), max_handles_(0), @@ -376,7 +386,7 @@ struct ExpandableSegment { auto default_stream = c10_npu::getDefaultNPUStream().stream(false); if (kSmallBuffer == segment_size_) { max_handles_ = numSegments(kSmallPoolVirAddrSize); - } else if (default_stream != stream) { + } else if (default_stream != *stream) { max_handles_ = numSegments(kLargePoolVirAddrSize); } } @@ -416,17 +426,17 @@ struct ExpandableSegment { for (auto j : c10::irange(begin, i)) { auto h = handles_.at(j).value(); handles_.at(j) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h.handle)); } trimHandles(); return rangeFromHandles(begin, begin); } NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); - handles_.at(i) = handle; + handles_.at(i) = Handle{handle, std::nullopt}; } for (auto i : c10::irange(begin, end)) { NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem((char *)ptr_ + i * segment_size_, segment_size_, 0, - handles_.at(i).value(), 0, getHcclComm())); + handles_.at(i).value().handle, 0, getHcclComm())); } ASCEND_LOGD("NPUCachingAllocator map: segment_size=%zu", segment_size_); return rangeFromHandles(begin, end); @@ -446,6 +456,59 @@ struct ExpandableSegment { return rangeFromHandles(begin, end); } + // Set up IPC sharing for range. + // Returns the (larger) range that was actually shared. + // Serializes data to std::ostream that can be passed to the + // other process, and then restored as an expandable segment + // via ExpandableSegment::fromShared(istream); + SegmentRange share(SegmentRange range, std::ostream& buf) + { + auto begin = segmentLeft(range.ptr); + auto end = segmentRight(range.ptr + range.size); + ShareHeader header{segment_size_, end - begin}; + buf.write((const char*)&header, sizeof(ShareHeader)); + for (auto i :
c10::irange(handles_.size())) { HCCL_CHECK_ERROR(at_npu::hccl::HcclCommActivateCommMemoryFace(hcclComm_->getHcclComm(), - (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value(), 0)); + (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value().handle, 0)); } } @@ -476,6 +539,15 @@ struct ExpandableSegment { } private: + void mapAndSetAccess(size_t begin, size_t end) + { + for (auto i : c10::irange(begin, end)) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem((char *)ptr_ + i * segment_size_, segment_size_, 0, + handles_.at(i).value().handle, 0, getHcclComm())); + } + ASCEND_LOGD("NPUCachingAllocator mapAndSetAccess: segment_size=%zu", segment_size_); + } + void unmapHandles(size_t begin, size_t end) { // note: unlike aclrtFree, MemUnmap and MemRelease do @@ -485,18 +557,23 @@ private: // cannot call c10::npu::stream_synchronize because // it might grab the GIL which can lead to a deadlock // Locking order must be GIL -> Allocator Lock - NPU_CHECK_ERROR(aclrtSynchronizeStream(stream_)); + if (stream_) { + NPU_CHECK_ERROR(aclrtSynchronizeStream(*stream_)); + } else { + c10_npu::NPUGuard device_guard(device_); + c10_npu::npuSynchronizeDevice(true); + } #ifndef BUILD_LIBTORCH const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { - trigger->traceNpuStreamSynchronization(reinterpret_cast(stream_)); + trigger->traceNpuStreamSynchronization(reinterpret_cast(*stream_)); } #endif for (auto i : c10::irange(begin, end)) { - aclrtDrvMemHandle h = handles_.at(i).value(); + Handle h = handles_.at(i).value(); handles_.at(i) = c10::nullopt; NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h.handle)); } ASCEND_LOGD("NPUCachingAllocator unmap: segment_size=%zu", segment_size_); trimHandles(); @@ -553,11 +630,19 @@ private: } int device_; - aclrtStream stream_; + std::optional stream_; void *ptr_{}; size_t max_handles_; size_t segment_size_; - std::vector> handles_; + struct Handle { + aclrtDrvMemHandle handle; + std::optional shareableHandle; + }; + struct ShareHeader { + size_t segment_size; + size_t num_handles; + }; + std::vector> handles_; std::shared_ptr hcclComm_; }; @@ -726,6 +811,7 @@ BlockState::BlockState(Block *block) SegmentState::SegmentState(Block *head) { + TORCH_INTERNAL_ASSERT(head != nullptr, PTA_ERROR(ErrCode::PTR)); TORCH_INTERNAL_ASSERT(head->prev == nullptr && head->pool != nullptr); is_small = head->pool->is_small; @@ -882,7 +968,7 @@ size_t CachingAllocatorConfig::parseExpandableSegments(const std::vector& lock_; }; +struct handle_str { + char data[ACL_IPC_HANDLE_SIZE]; +}; + +// handle for ptr +ska::flat_hash_map ipc_handle_map; + class DeviceCachingAllocator { private: // lock around all operations @@ -1542,6 +1635,40 @@ public: return basePtr; } + ShareableHandle shareIpcHandle(Block* block) + { + std::lock_guard lock(mutex); + std::ostringstream ss; + ss.put(SHAREABLE_HANDLE_VERSION); + ptrdiff_t offset = 0; + if (!block->expandable_segment_) { + ss.put(SHAREABLE_NPU_MALLOC); + size_t base_size; + void* base_ptr = getBaseAllocation(block, &base_size); + offset = (char*)block->ptr - (char*)base_ptr; + + handle_str handle; + auto it = ipc_handle_map.find(base_ptr); + if (it == ipc_handle_map.end()) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemGetExportKey( + base_ptr, base_size, handle.data, ACL_IPC_HANDLE_SIZE, 0)); + int32_t* pids = 
nullptr; + size_t pid_num = torch_npu::ipc::getPids(&pids); + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemSetImportPid(handle.data, pids, pid_num)); + ipc_handle_map[base_ptr] = handle; + } else { + handle = it->second; + } + ss.write((char*)&handle, ACL_IPC_HANDLE_SIZE); + } else { + ss.put(SHAREABLE_NPU_EXPANDABLE_SEGMENT); + auto full_range = block->expandable_segment_->share( + SegmentRange(block->ptr, block->size), ss); + offset = (char*)block->ptr - (char*)full_range.ptr; + } + return ShareableHandle{offset, ss.str()}; + } + void recordStream(Block *block, c10_npu::NPUStream stream) { std::lock_guard lock(mutex); @@ -2220,6 +2347,9 @@ private: // map_block will map some of unmapped and merge with free auto remaining = size - candidate->size; auto new_candidate = candidate->next; + if (C10_UNLIKELY(new_candidate == nullptr)) { + return nullptr; + } if (!map_block(new_candidate, std::min(remaining, candidate->next->size), ctx)) { return nullptr; } @@ -2443,7 +2573,11 @@ private: { bool freed_memory = false; for (const auto &name : FreeNPUMemoryCallbacksRegistry()->Keys()) { - freed_memory |= FreeNPUMemoryCallbacksRegistry()->Create(name)->Execute(); + if (FreeNPUMemoryCallbacksRegistry()->Create(name) != nullptr) { + freed_memory |= FreeNPUMemoryCallbacksRegistry()->Create(name)->Execute(); + } else { + TORCH_CHECK(false, "free memory callback get nullptr", PTA_ERROR(ErrCode::PTR)); + } } return freed_memory; } @@ -2486,7 +2620,7 @@ private: // Repeat GC until we reach reclaim > target size. bool block_freed = true; - while (gc_reclaimed < target_size && block_freed == true && freeable_block_count > 0) { + while (gc_reclaimed < target_size && block_freed && freeable_block_count > 0) { // Free blocks exceeding this age threshold first. double age_threshold = total_age / freeable_block_count; // Stop iteration if we can no longer free a block. @@ -2678,6 +2812,12 @@ private: record_trace(TraceEntry::SEGMENT_FREE, int64_t(block->ptr), block->size, block->stream, block->device, context ? context : block->context_when_segment_allocated); + auto it = ipc_handle_map.find(block->ptr); + if (it != ipc_handle_map.end()) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemClose(it->second.data)); + ipc_handle_map.erase(it); + } + aclrtFree((void *)block->ptr); total_allocated_memory -= block->size; @@ -3152,6 +3292,15 @@ public: return device_allocator[block->device]->getBaseAllocation(block, outSize); } + ShareableHandle shareIpcHandle(void* ptr) override + { + Block* block = get_allocated_block(ptr); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr); + } + return device_allocator[block->device]->shareIpcHandle(block); + } + void recordStream(const c10::DataPtr &ptr, c10_npu::NPUStream stream) override { // Empty tensor's storage().data() might be a null ptr. 
As there is no @@ -3402,6 +3551,109 @@ public: this->free(ptr); } + std::mutex IpcMutex; + struct MemHandleCacheEntry { + MemHandleCacheEntry( + c10::DeviceIndex device, + std::string& handle, + const DeviceCachingAllocator& allocator) + : device_(device) + { + int type = SHAREABLE_NPU_MALLOC; + std::istringstream ss(handle); + if (handle.size() != ACL_IPC_HANDLE_SIZE) { + auto version = ss.get(); + TORCH_CHECK( + version <= SHAREABLE_HANDLE_VERSION, + "received sharable handle from a future version of torch that this version does not know how to handle", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + type = ss.get(); + } + // otherwise this is coming from an old pytorch where it has to be a raw + // SHAREABLE_NPU_MALLOC + if (type == SHAREABLE_NPU_MALLOC) { + handle_str handle_r; + ss.read(handle_r.data, ACL_IPC_HANDLE_SIZE); + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemImportByKey(&npu_ipc_ptr_, handle_r.data, 0)); + handle_s.assign(handle_r.data, ACL_IPC_HANDLE_SIZE); + } else if (type == SHAREABLE_NPU_EXPANDABLE_SEGMENT) { + expandable_segment_ = + ExpandableSegment::fromShared(device, ss) + .release(); + } else { + TORCH_INTERNAL_ASSERT( + false, "Unexpected or illformed shareable handle type"); + } + } + // this struct expects that clear is explicitly called to + // free resources, because we only want this code running when + // the shared pointer to this entry is destructed, not during + // deinitialization when npu may already have been shutdown. + // This replicates the previous behavior of this map when it + // stored raw npu_ipc_ptr_ handles. + void clear() + { + if (npu_ipc_ptr_) { + c10_npu::NPUGuard device_guard(device_); + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemClose(handle_s.c_str())); + npu_ipc_ptr_ = nullptr; + } + if (expandable_segment_) { + delete expandable_segment_; + expandable_segment_ = nullptr; + } + } + void* ptr() + { + if (npu_ipc_ptr_) { + return npu_ipc_ptr_; + } else { + return expandable_segment_->ptr(); + } + } + c10::DeviceIndex device_; + ExpandableSegment* expandable_segment_{nullptr}; + void* npu_ipc_ptr_{nullptr}; // nullptr if expandable_segment_ is not null + std::weak_ptr wp_; + std::string handle_s; + }; + ska::flat_hash_map ipcMemHandle_to_devptr; + + std::shared_ptr getIpcDevPtr(std::string handle) override + { + std::lock_guard lock(IpcMutex); + + auto iter = ipcMemHandle_to_devptr.find(handle); + if (iter != ipcMemHandle_to_devptr.end()) { + auto devptr = iter->second.wp_.lock(); + TORCH_INTERNAL_ASSERT(devptr, "entry in cache has missing shared_ptr"); + return devptr; + } + int curr_device = 0; + NPU_CHECK_ERROR(c10_npu::GetDevice(&curr_device)); + auto inserted = ipcMemHandle_to_devptr.insert( + iter, + {handle, + MemHandleCacheEntry( + static_cast(curr_device), handle, *device_allocator[curr_device])}); + auto sp = std::shared_ptr( + inserted->second.ptr(), [handle, this](void* ptr) { + std::unique_lock deleter_lock(IpcMutex); + + auto it = ipcMemHandle_to_devptr.find(handle); + TORCH_INTERNAL_ASSERT(it != ipcMemHandle_to_devptr.end()); + auto entry = std::move(it->second); + ipcMemHandle_to_devptr.erase(it); + + // ExpandableSegment synchronizes on destruction in unmapHandles, so + // we need to release the lock first to minimize the performance hit. 
+ deleter_lock.unlock(); + entry.clear(); + }); + inserted->second.wp_ = sp; + return sp; + } + void FreeDeviceCachedMemory(int device) override { device_allocator[device]->emptyCache(device, true); diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index c33f51fbc895f989609cd6ef0953678d7b0e1cdf..c7082c89044158360f39373593e2deabb658b776 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -23,8 +23,8 @@ C10_NPU_API std::mutex* getFreeMutex(); // block inside of already allocated area. class FreeMemoryCallback { public: - virtual ~FreeMemoryCallback(){}; - virtual bool Execute() = 0; + virtual ~FreeMemoryCallback(){}; + virtual bool Execute() = 0; }; C10_DECLARE_REGISTRY(FreeNPUMemoryCallbacksRegistry, FreeMemoryCallback); @@ -188,6 +188,11 @@ using OutOfMemoryObserver = std::function; +struct ShareableHandle { + ptrdiff_t offset; + std::string handle; +}; + class NPUAllocator : public c10::Allocator { public: virtual c10::DataPtr allocate_with_aligned(size_t size, size_t aligned) const = 0; @@ -227,6 +232,8 @@ public: " does not yet support checkPoolLiveAllocations. " "If you need it, please file an issue describing your use case.", PTA_ERROR(ErrCode::NOT_SUPPORT)); } + virtual ShareableHandle shareIpcHandle(void* ptr) = 0; + virtual std::shared_ptr getIpcDevPtr(std::string handle) = 0; virtual bool isHistoryEnabled() { TORCH_CHECK( @@ -376,6 +383,16 @@ inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) return get()->releasePool(device, mempool_id); } +inline std::shared_ptr getIpcDevPtr(std::string handle) +{ + return get()->getIpcDevPtr(handle); +} + +inline ShareableHandle shareIpcHandle(void* ptr) +{ + return get()->shareIpcHandle(ptr); +} + inline void FreeDeviceCachedMemory(int device) { return get()->FreeDeviceCachedMemory(device); diff --git a/torch_npu/csrc/core/npu/NPUEventManager.h b/torch_npu/csrc/core/npu/NPUEventManager.h index c01491aa033752413dd880329445a9eb2d8556e2..ac7f0176e0f52daf9f88fdd39bdb2f5b0d546f5b 100644 --- a/torch_npu/csrc/core/npu/NPUEventManager.h +++ b/torch_npu/csrc/core/npu/NPUEventManager.h @@ -22,7 +22,7 @@ public: void DecreaseUnrecordedCount(aclrtEvent event); bool IsEventRecorded(aclrtEvent event); void ClearUnrecordedCount(); - ~NPUEventManager() {} + ~NPUEventManager() {} private: void run(aclrtEvent event); diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 9c667f1fdb120c30696c49c1a911702332ffb54b..290daec7f11529eb1e744325ab335b84194f7db9 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -47,13 +47,14 @@ void warn_(const ::c10::Warning& warning) std::string formatErrorCode(SubModule submodule, ErrCode errorCode) { + if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) { + return ""; + } std::ostringstream oss; int deviceIndex = -1; c10_npu::GetDevice(&deviceIndex); auto rank_id = c10_npu::option::OptionsManager::GetRankId(); - if (!(c10_npu::option::OptionsManager::ShouldPrintLessError())) { oss << "\n[ERROR] " << getCurrentTimestamp() << " (PID:" << getpid() << ", Device:" << deviceIndex << ", RankID:" << rank_id << ") "; - } oss << "ERR" << std::setw(2) << std::setfill('0') << static_cast(submodule); oss << std::setw(3) << std::setfill('0') << static_cast(errorCode); oss << " " << submoduleMap[submodule] << " " << errCodeMap[errorCode]; @@ -92,7 +93,7 @@ MemUceInfo memUceInfo; std::mutex 
memUceInfoMutex; -void set_mem_uce_info(MemUceInfo info) +void set_mem_uce_info(MemUceInfo& info) { std::lock_guard lock(memUceInfoMutex); memUceInfo = info; @@ -132,10 +133,12 @@ const std::string c10_npu_check_error_message(std::string& errmsg) std::regex ws_regex("[\\s\\t\\n\\r]+"); content = std::regex_replace(content, ws_regex, " "); - if (!content.empty() && content.front() == ' ') + if (!content.empty() && content.front() == ' ') { content.erase(0, 1); - if (!content.empty() && content.back() == ' ') + } + if (!content.empty() && content.back() == ' ') { content.pop_back(); + } return content; } @@ -147,10 +150,10 @@ const std::string c10_npu_check_error_message(std::string& errmsg) const char *c10_npu_get_error_message() { auto errmsg = c10_npu::acl::AclGetErrMsg(); - if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { + if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) { std::string log(errmsg); std::string errmsg_ = c10_npu::c10_npu_check_error_message(log); - thread_local std::string processedErrMsg = errmsg_; + thread_local std::string processedErrMsg = "CANN error: " + errmsg_; c10_npu::setRepoErrMsg(processedErrMsg.c_str()); return processedErrMsg.c_str(); } else { @@ -172,7 +175,7 @@ bool checkUceErrAndRepair(bool check_error, std::string& err_msg) int device = 0; auto err = c10_npu::GetDevice(&device); if (err != ACL_ERROR_NONE) { - err_msg = "ERROR happend in GetDevice."; + err_msg = "ERROR happened in GetDevice."; if (check_error) { TORCH_CHECK(false, err_msg, PTA_ERROR(ErrCode::ACL)); } else { diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index a82f8f15688c9da828cc977954527869c99708d3..1d34ae2050ded401c5508ccaa57a89ad481e2a74 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -151,7 +151,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) " that driver and firmware packages do not match."); \ return true; \ }(); \ - } else if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { \ + } else if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) { \ std::ostringstream oss; \ oss << " NPU function error: " \ << (device_error_msg.empty() ? getErrorFunction(#err_code, ##__VA_ARGS__) : device_error_msg) \ @@ -166,8 +166,20 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) false, \ (device_error_msg.empty() ? "" : device_error_msg), \ c10_npu::c10_npu_get_error_message()); \ + } else if (error_code == ACL_ERROR_RT_DEVICE_TASK_ABORT) { \ + TORCH_CHECK( \ + false, \ + __func__, \ + ":", \ + __FILE__, \ + ":", \ + __LINE__, \ + " NPU function error: ", (device_error_msg.empty() ? 
\ + " FORCE STOP" : device_error_msg), \ + ", error code is ", error_code, \ + PTA_ERROR(ErrCode::ACL)); \ } else { \ - TORCH_CHECK( \ + TORCH_CHECK( \ false, \ __func__, \ ":", \ @@ -195,7 +207,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) static c10_npu::acl::AclErrorCode err_map; \ if ((Error) != ACL_ERROR_NONE) { \ CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ - if (c10_npu::option::OptionsManager::ShouldPrintLessError()) \ + if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) \ { \ std::ostringstream oss; \ oss << " OPS function error: " << getErrorFunction(#err_code, ##__VA_ARGS__) \ @@ -260,7 +272,7 @@ bool checkUceErrAndRepair(bool check_error, std::string& err_msg); void record_mem_hbm_ecc_error(); -void set_mem_uce_info(MemUceInfo info); +void set_mem_uce_info(MemUceInfo& info); MemUceInfo get_mem_uce_info(); diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index cca42ed288f29bfac5b6c644ae42b2af80366356..0ceb84847bfd235be562af9a8e742da9d34eac30 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -5,6 +5,7 @@ #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" +#include "third_party/acl/inc/acl/acl_rt.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" #endif @@ -46,7 +47,6 @@ aclError GetDevice(int32_t *device) { if (targetDeviceIndex >= 0) { *device = targetDeviceIndex; - NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(targetDeviceIndex)); return ACL_ERROR_NONE; } @@ -60,13 +60,8 @@ aclError GetDevice(int32_t *device) } if (err == ACL_ERROR_NONE) { local_device = *device; - } else if (err == ACL_ERROR_RT_CONTEXT_NULL && aclrtSetDevice(0) == ACL_ERROR_NONE) { + } else if (err == ACL_ERROR_RT_CONTEXT_NULL) { *device = 0; - local_device = 0; - std::lock_guard lock(mtx); - if (used_devices.find(local_device) == used_devices.end()) { - NPU_CHECK_ERROR_WITHOUT_UCE(aclrtGetCurrentContext(&used_devices[local_device])); - } return ACL_ERROR_NONE; } return err; @@ -103,7 +98,10 @@ aclError SetDevice(c10::DeviceIndex device) if (local_device == device) { return ACL_ERROR_NONE; } - c10_npu::SetThreadAffinity(device); + + if (c10_npu::NeedMainThreadBind()) { + c10_npu::SetThreadAffinity(device); + } aclError err = aclrtSetDevice(device); if (err == ACL_ERROR_NONE) { @@ -116,6 +114,17 @@ aclError SetDevice(c10::DeviceIndex device) return err; } +aclError MaybeSetDevice(c10::DeviceIndex device) +{ + if (isDeviceCtxActive(device)) { + ASCEND_LOGI("MaybeSetDevice: NPU device %d has not been initialized! 
We will set targetDeviceIndex.", device); + NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(device)); + } else { + targetDeviceIndex = device; + } + return ACL_ERROR_NONE; +} + aclError ResetUsedDevices() { std::lock_guard lock(mtx); @@ -137,7 +146,7 @@ aclError DestroyUsedStreams() for (const auto it : used_devices) { NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(it.first)); NPUStream stream = getCurrentNPUStream(it.first); - aclError acl_ret = acl::AclrtDestroyStreamForce(stream); + aclError acl_ret = acl::AclrtDestroyStreamForce(stream.stream(false)); if (acl_ret != ACL_ERROR_NONE) { return acl_ret; } @@ -290,4 +299,42 @@ void stream_synchronize(aclrtStream stream) NPU_CHECK_ERROR(aclrtSynchronizeStream(stream)); } +aclError SetDeviceResLimit(int32_t device, int32_t type, uint32_t value) +{ + std::lock_guard lock(mtx); + if (used_devices.find(device) == used_devices.end()) { + TORCH_CHECK(false, "NPU device ", device, " has not been initialized! Can not get device resource limit"); + } + TORCH_CHECK(device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE)); + c10_npu::acl::aclrtDevResModelType restype = static_cast(type); + aclError err = c10_npu::acl::AclrtSetDeviceResLimit(device, restype, value); + NPU_CHECK_ERROR(err); + return err; +} + +uint32_t GetDeviceResLimit(int32_t device, int32_t type) +{ + std::lock_guard lock(mtx); + if (used_devices.find(device) == used_devices.end()) { + TORCH_CHECK(false, "NPU device ", device, " has not been initialized! Can not get device resource limit"); + } + TORCH_CHECK(device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE)); + c10_npu::acl::aclrtDevResModelType restype = static_cast(type); + uint32_t value; + NPU_CHECK_ERROR(c10_npu::acl::AclrtGetDeviceResLimit(device, restype, &value)); + return value; +} + +aclError ResetDeviceResLimit(int32_t device) +{ + std::lock_guard lock(mtx); + if (used_devices.find(device) == used_devices.end()) { + TORCH_CHECK(false, "NPU device ", device, " has not been initialized! Can not reset device resource limit"); + } + TORCH_CHECK(device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE)); + aclError err = c10_npu::acl::AclrtResetDeviceResLimit(device); + NPU_CHECK_ERROR(err); + return err; +} + } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUFunctions.h b/torch_npu/csrc/core/npu/NPUFunctions.h index 948998459799d300bf76f7cd3367ed7c7a392e3a..e162f8fe8f0ca3b00af68b6bf57efc0b9c19078f 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.h +++ b/torch_npu/csrc/core/npu/NPUFunctions.h @@ -48,6 +48,8 @@ aclError GetDeviceWithoutSet(int32_t *device); */ C10_NPU_API aclError SetDevice(c10::DeviceIndex device); +C10_NPU_API aclError MaybeSetDevice(c10::DeviceIndex device); + /** * @ingroup torch_npu * @brief reset all device id by ACL interface: aclrtResetDevice. 
@@ -79,6 +81,12 @@ void SetTargetDevice(); int GetLocalDevice(); +aclError SetDeviceResLimit(int32_t device, int32_t type, uint32_t value); + +C10_NPU_API uint32_t GetDeviceResLimit(int32_t deviceId, int32_t type); + +aclError ResetDeviceResLimit(int32_t deviceId); + enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR }; // it's used to store npu synchronization state diff --git a/torch_npu/csrc/core/npu/NPUGraph.cpp b/torch_npu/csrc/core/npu/NPUGraph.cpp index fd060dcb086f911f0e2b345a18ba9672ded2c34c..a00448bd1cf547cb2f641eb47ff940c902341610 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.cpp +++ b/torch_npu/csrc/core/npu/NPUGraph.cpp @@ -1,4 +1,3 @@ -#ifndef BUILD_LIBTORCH #include "torch_npu/csrc/core/npu/NPUGraph.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" @@ -10,7 +9,6 @@ #include #include -#include namespace c10_npu { @@ -261,4 +259,3 @@ NPUGraph::~NPUGraph() } } // namespace c10_npu -#endif diff --git a/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp b/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp new file mode 100644 index 0000000000000000000000000000000000000000..393b4706c60decfb6171dfb50d8670d92f74b102 --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp @@ -0,0 +1,36 @@ +#include "torch_npu/csrc/core/npu/NPUIPCPidManager.h" +namespace torch_npu { +namespace ipc { + +int32_t* pids = nullptr; +size_t pid_num = 0; +size_t capacity = 0; + +void addPid(int pid) +{ + const size_t requiredCapacity = pid_num + 1; + + if (requiredCapacity > capacity) { + size_t newCapacity = capacity + 10; + + int32_t* newArray = new int32_t[newCapacity]; + for (int i = 0; i < pid_num; ++i) { + newArray[i] = pids[i]; + } + + delete[] pids; + pids = newArray; + capacity = newCapacity; + } + + pids[pid_num++] = static_cast(pid); +} + +size_t getPids(int32_t** ret_pids) +{ + *ret_pids = pids; + return pid_num; +} + +} // namespace ipc +} // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUIPCPidManager.h b/torch_npu/csrc/core/npu/NPUIPCPidManager.h new file mode 100644 index 0000000000000000000000000000000000000000..f27cd240d15723f743fbcefe7204c81588ca60b3 --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUIPCPidManager.h @@ -0,0 +1,12 @@ +#pragma once +#include +#include + +namespace torch_npu { +namespace ipc { + +void addPid(int pid); +size_t getPids(int32_t** pids); + +} // namespace ipc +} // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUMacros.h b/torch_npu/csrc/core/npu/NPUMacros.h index 3223c4f325b3de69b8e5cdc783954d84033b37b4..960dcb97b6e52bffc37582250ffd99b1f7ac08a6 100644 --- a/torch_npu/csrc/core/npu/NPUMacros.h +++ b/torch_npu/csrc/core/npu/NPUMacros.h @@ -29,6 +29,6 @@ #define TORCH_NPU_API C10_NPU_API -#define C10_COMPILE_TIME_MAX_NPUS 16 +#define C10_COMPILE_TIME_MAX_NPUS 32 // A maximum of 8 P2P links can be created on a NPU device #define C10_P2P_ACCESS_MAX_NPUS 8 diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 3eee4167a9d0c26130fa1ebb436965c218588456..579514ab37390f36aa208e7711c6fcec131a9f98 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -249,7 +249,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) // occur. 
#ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { gilState = PyEval_SaveThread(); } #endif @@ -290,11 +290,11 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) error_msg = c10_npu::c10_npu_get_error_message(); } runtime_error = throwError + ", " + error_msg + PTA_ERROR(ErrCode::ACL); - error_msg = throwError + " happend."; + error_msg = throwError + " happened."; } if (current_status == RepoStatus::CAN_EXIT) { - error_msg = "Inner error happend with CAN_EXIT status, detail: " + repo_error; + error_msg = "Inner error happened with CAN_EXIT status, detail: " + repo_error; } if (current_status == RepoStatus::ERROR_EXIT) { @@ -314,12 +314,12 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) repo_error + ".\n" + "Since the operator is called asynchronously, the stacktrace may be inaccurate. " "If you want to get the accurate stacktrace, " - "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + + "please set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " "resulting in performance degradation. " "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." + PTA_ERROR(ErrCode::ACL) + ".\n" + acl_error; - error_msg = "Inner error happend, detail: " + repo_error; + error_msg = "Inner error happened, detail: " + repo_error; } #ifndef BUILD_LIBTORCH @@ -330,7 +330,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) #endif if (!error_msg.empty()) { - ASCEND_LOGE(error_msg); + ASCEND_LOGE("%s", error_msg.c_str()); } if (check_error && !runtime_error.empty()) { throw std::runtime_error(runtime_error); @@ -470,7 +470,7 @@ void Repository::Enqueue(void *cur_paras) ThrowDeviceError(current_status, cur_paras); if (current_status == RepoStatus::CAN_EXIT) { - ASCEND_LOGE("Inner error happend with CAN_EXIT status, detail: %s", repo_error.c_str()); + ASCEND_LOGE("Inner error happened with CAN_EXIT status, detail: %s", repo_error.c_str()); } if (current_status == RepoStatus::ERROR_EXIT) { @@ -490,7 +490,7 @@ void Repository::Enqueue(void *cur_paras) repo_error + ".\n" + "Since the operator is called asynchronously, the stacktrace may be inaccurate. " "If you want to get the accurate stacktrace, " - "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + + "please set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " "resulting in performance degradation. " "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." 
+ @@ -523,7 +523,7 @@ void Repository::Enqueue(void *cur_paras) uint64_t u = 1; SetWriteWorking(true); - while (ret == false && (GetStatus() == RUN || GetStatus() == INIT)) { + while (!ret && (GetStatus() == RUN || GetStatus() == INIT)) { ret = WriteQueue(cur_paras); if (ret == false) { SetWriteWorking(false); @@ -531,7 +531,7 @@ void Repository::Enqueue(void *cur_paras) if (IsFullQueue()) { #ifndef BUILD_LIBTORCH // double check the current thread hold a Gil lock - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { Py_BEGIN_ALLOW_THREADS s = eventfd_read(efd_write, &u); Py_END_ALLOW_THREADS } else { @@ -707,6 +707,8 @@ bool Repository::CheckInit() const void StartConsume(Repository *repo, c10::DeviceIndex device_id) { SetThreadType(ThreadType::ACL_THREAD); + SetThreadAffinity(device_id); + aclError ret = c10_npu::SetDevice(device_id); if (ret != 0) { C10_NPU_SHOW_ERR_MSG(); diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 7910946197df1e53d0dd29e15f1b922e2c420962..31ade0aa41d9710c78ae591678d7962f3af954a1 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -229,6 +229,8 @@ static void initNPUStreamsOnce() { // Inits default and secondary streams (once, globally) c10::DeviceIndex device_index = current_device(); + // makesure on real devcie + SetTargetDevice(); if (!initialize_flag[device_index]) { std::lock_guard lock(mtx[device_index]); if (!initialize_flag[device_index]) { @@ -259,7 +261,7 @@ static uint32_t get_idx(std::atomic& counter) { auto raw_idx = counter++; static int StreamsPerPool = GetStreamsPerPool(); - return raw_idx % StreamsPerPool; + return raw_idx % static_cast(StreamsPerPool); } static uint32_t get_sync_launch_stream_idx(std::atomic& counter) diff --git a/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp index 084f1df577a58ba352c76361211f5741df8ab4ef..39d19b0b628e1a191aab9ff4182a7e5bd1f6c657 100644 --- a/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp @@ -47,7 +47,7 @@ void* registerSvmMem(void* ptr, size_t size) void* mallocHostSwapMemory(size_t size) { if (!initialized) { - kAlignSize = sysconf(_SC_PAGESIZE); + kAlignSize = static_cast(sysconf(_SC_PAGESIZE)); initialized = true; } size = (size + kAlignSize - 1) & ~(kAlignSize - 1); diff --git a/torch_npu/csrc/core/npu/NpuVariables.cpp b/torch_npu/csrc/core/npu/NpuVariables.cpp index 4e0fce02fb311d239c95a913939e41b554333b02..24a2a8da62cf6ecb23684a51be0dadf01b671b04 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.cpp +++ b/torch_npu/csrc/core/npu/NpuVariables.cpp @@ -39,28 +39,33 @@ static std::map socVersionMap = { {"Ascend910_9372", SocVersion::Ascend910_9372}, {"Ascend910_9362", SocVersion::Ascend910_9362}}; -void SetSocVersion(const char* const socVersion) { - if (socVersion == nullptr || - g_curSocVersion != SocVersion::UnsupportedSocVersion) { - return; - } +void SetSocVersion(const char* const socVersion) +{ + if (socVersion == nullptr || + g_curSocVersion != SocVersion::UnsupportedSocVersion) { + return; + } - SocVersion curSocVersion = SocVersion::UnsupportedSocVersion; + SocVersion curSocVersion = SocVersion::UnsupportedSocVersion; - auto const& iter = socVersionMap.find(socVersion); - if (iter != socVersionMap.end()) { - curSocVersion = iter->second; - } else { - std::string unsupported_soc(socVersion); - std::replace(std::begin(unsupported_soc), std::end(unsupported_soc), '_', ' '); 
- AT_ERROR("Unsupported soc version: ", unsupported_soc); - } + auto const& iter = socVersionMap.find(socVersion); + if (iter != socVersionMap.end()) { + curSocVersion = iter->second; + } else { + std::string unsupported_soc(socVersion); + std::replace(std::begin(unsupported_soc), std::end(unsupported_soc), '_', ' '); + AT_ERROR("Unsupported soc version: ", unsupported_soc); + } - g_curSocVersion = curSocVersion; + g_curSocVersion = curSocVersion; } const SocVersion& GetSocVersion() { + if (g_curSocVersion == SocVersion::UnsupportedSocVersion) { + auto soc_name = c10_npu::acl::AclGetSocName(); + SetSocVersion(soc_name); + } return g_curSocVersion; } @@ -94,5 +99,10 @@ bool IsBF16Supported() { return GetSocVersion() >= SocVersion::Ascend910B1; } + +bool IsAclnnOnly() +{ + return false; +} } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NpuVariables.h b/torch_npu/csrc/core/npu/NpuVariables.h index 3119a645153322225f9d0d9ea19dfa3b1ef9ab9f..6a3a8cdfd7e9b8a59fcb5712ab81146dac5be875 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.h +++ b/torch_npu/csrc/core/npu/NpuVariables.h @@ -40,6 +40,8 @@ const SocVersion& GetSocVersion(); bool IsSupportInfNan(); bool IsBF16Supported(); + +bool IsAclnnOnly(); } // namespace c10_npu #endif diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp index 951b508240a3999a5c2773702103419112951ac5..7cfcb2a483ebc424cdde19c7d969c6da4375ae63 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp +++ b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp @@ -53,7 +53,8 @@ void NPUGuardImpl::setDevice(c10::Device d) const void NPUGuardImpl::uncheckedSetDevice(c10::Device d) const noexcept { - NPU_CHECK_WARN(c10_npu::SetDevice(d.index())); + c10_npu::StartMainThreadBind(d.index()); + NPU_CHECK_WARN(c10_npu::MaybeSetDevice(d.index())); } c10::Stream NPUGuardImpl::getStream(c10::Device d) const noexcept diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 3616d2e4568c1677baaa4688919787580baad870..b97a8d4c39bbc2cbabc6d951e48d4b012e72b745 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -82,6 +82,17 @@ LOAD_FUNCTION(aclmdlRICaptureTaskUpdateBegin) LOAD_FUNCTION(aclmdlRICaptureTaskUpdateEnd) LOAD_FUNCTION(aclrtHostRegister) LOAD_FUNCTION(aclrtHostUnregister) +LOAD_FUNCTION(aclrtIpcMemGetExportKey) +LOAD_FUNCTION(aclrtIpcMemSetImportPid) +LOAD_FUNCTION(aclrtIpcMemImportByKey) +LOAD_FUNCTION(aclrtIpcMemClose) +LOAD_FUNCTION(aclrtMemExportToShareableHandle) +LOAD_FUNCTION(aclrtMemSetPidToShareableHandle) +LOAD_FUNCTION(aclrtMemImportFromShareableHandle) +LOAD_FUNCTION(aclrtDeviceGetBareTgid) +LOAD_FUNCTION(aclrtGetDeviceResLimit) +LOAD_FUNCTION(aclrtSetDeviceResLimit) +LOAD_FUNCTION(aclrtResetDeviceResLimit) aclprofStepInfoPtr init_stepinfo() { @@ -175,6 +186,7 @@ aclError AclrtSetStreamFailureMode(aclrtStream stream, uint64_t mode) { if (stream == nullptr) { // default stream return ACL_ERROR_INVALID_PARAM; } + typedef aclError(*aclrtSetStreamFailureModeFunc)(aclrtStream, uint64_t); static aclrtSetStreamFailureModeFunc func = (aclrtSetStreamFailureModeFunc)GET_FUNC(aclrtSetStreamFailureMode); if (func == nullptr) { @@ -411,7 +423,7 @@ aclError AclrtSynchronizeStreamWithTimeout(aclrtStream stream) { } TORCH_CHECK(func_backup, "Failed to find function", "aclrtSynchronizeStreamWithTimeout and aclrtSynchronizeStream", PROF_ERROR(ErrCode::NOT_FOUND)); return func_backup(stream); 
- } + } } aclError AclrtDestroyStreamForce(aclrtStream stream) { @@ -845,7 +857,7 @@ bool IsCaptureSupported() static bool have_load_func = false; static bool default_support_capture = ((GetSocVersion() >= SocVersion::Ascend910B1) && (GetSocVersion() < SocVersion::Ascend310B1)) || - (GetSocVersion() >= SocVersion::Ascend910_9391); + ((GetSocVersion() >= SocVersion::Ascend910_9391)); if (default_support_capture && !have_load_func) { have_load_func = true; typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *); @@ -928,5 +940,138 @@ aclError AclrtHostUnregister(void *ptr) return func(ptr); } +aclError AclrtIpcMemGetExportKey(void *devPtr, size_t size, char *key, size_t len, uint64_t flag) +{ + typedef aclError (*AclrtIpcMemGetExportKey)(void *, size_t, char *, size_t, uint64_t); + static AclrtIpcMemGetExportKey func = nullptr; + if (func == nullptr) { + func = (AclrtIpcMemGetExportKey) GET_FUNC(aclrtIpcMemGetExportKey); + } + + TORCH_CHECK(func, "Failed to find function aclrtIpcMemGetExportKey", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(devPtr, size, key, len, flag); +} + +aclError AclrtIpcMemSetImportPid(const char *key, int32_t *pid, size_t num) +{ + typedef aclError (*AclrtIpcMemSetImportPid)(const char *, int32_t *, size_t); + static AclrtIpcMemSetImportPid func = nullptr; + if (func == nullptr) { + func = (AclrtIpcMemSetImportPid) GET_FUNC(aclrtIpcMemSetImportPid); + } + + TORCH_CHECK(func, "Failed to find function aclrtIpcMemSetImportPid", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(key, pid, num); +} + +aclError AclrtIpcMemImportByKey(void **devPtr, const char *key, uint64_t flag) +{ + typedef aclError (*AclrtIpcMemImportByKey)(void **, const char *, uint64_t); + static AclrtIpcMemImportByKey func = nullptr; + if (func == nullptr) { + func = (AclrtIpcMemImportByKey) GET_FUNC(aclrtIpcMemImportByKey); + } + + TORCH_CHECK(func, "Failed to find function aclrtIpcMemImportByKey", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(devPtr, key, flag); +} + +aclError AclrtIpcMemClose(const char *key) +{ + typedef aclError (*AclrtIpcMemClose)(const char *); + static AclrtIpcMemClose func = nullptr; + if (func == nullptr) { + func = (AclrtIpcMemClose) GET_FUNC(aclrtIpcMemClose); + } + + TORCH_CHECK(func, "Failed to find function aclrtIpcMemClose", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(key); +} + +aclError AclrtMemExportToShareableHandle(aclrtDrvMemHandle handle, aclrtMemHandleType handleType, + uint64_t flags, uint64_t *shareableHandle) +{ + typedef aclError (*AclrtMemExportToShareableHandle)(aclrtDrvMemHandle, aclrtMemHandleType, uint64_t, uint64_t *); + static AclrtMemExportToShareableHandle func = nullptr; + if (func == nullptr) { + func = (AclrtMemExportToShareableHandle) GET_FUNC(aclrtMemExportToShareableHandle); + } + + TORCH_CHECK(func, "Failed to find function aclrtMemExportToShareableHandle", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(handle, handleType, flags, shareableHandle); +} + +aclError AclrtMemSetPidToShareableHandle(uint64_t shareableHandle, int32_t *pid, size_t pidNum) +{ + typedef aclError (*AclrtMemSetPidToShareableHandle)(uint64_t, int32_t *, size_t); + static AclrtMemSetPidToShareableHandle func = nullptr; + if (func == nullptr) { + func = (AclrtMemSetPidToShareableHandle) GET_FUNC(aclrtMemSetPidToShareableHandle); + } + + TORCH_CHECK(func, "Failed to find function aclrtMemSetPidToShareableHandle", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(shareableHandle, pid, pidNum); +} + +aclError 
AclrtMemImportFromShareableHandle(uint64_t shareableHandle, int32_t deviceId, aclrtDrvMemHandle *handle) +{ + typedef aclError (*AclrtMemImportFromShareableHandle)(uint64_t, int32_t, aclrtDrvMemHandle *); + static AclrtMemImportFromShareableHandle func = nullptr; + if (func == nullptr) { + func = (AclrtMemImportFromShareableHandle) GET_FUNC(aclrtMemImportFromShareableHandle); + } + + TORCH_CHECK(func, "Failed to find function aclrtMemImportFromShareableHandle", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(shareableHandle, deviceId, handle); +} + +aclError AclrtDeviceGetBareTgid(int32_t *pid) +{ + typedef aclError (*AclrtDeviceGetBareTgid)(int32_t *); + static AclrtDeviceGetBareTgid func = nullptr; + if (func == nullptr) { + func = (AclrtDeviceGetBareTgid) GET_FUNC(aclrtDeviceGetBareTgid); + } + + TORCH_CHECK(func, "Failed to find function aclrtDeviceGetBareTgid", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(pid); +} + +aclError AclrtGetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t* value) +{ + typedef aclError (*AclrtGetDeviceResLimit)(int32_t, aclrtDevResModelType, uint32_t*); + static AclrtGetDeviceResLimit func = nullptr; + if (func == nullptr) { + func = (AclrtGetDeviceResLimit) GET_FUNC(aclrtGetDeviceResLimit); + } + + TORCH_CHECK(func, "Failed to find function aclrtGetDeviceResLimit", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(deviceId, type, value); +} + +aclError AclrtSetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t value) +{ + typedef aclError (*AclrtSetDeviceResLimit)(int32_t, aclrtDevResModelType, uint32_t); + static AclrtSetDeviceResLimit func = nullptr; + if (func == nullptr) { + func = (AclrtSetDeviceResLimit) GET_FUNC(aclrtSetDeviceResLimit); + } + + TORCH_CHECK(func, "Failed to find function aclrtSetDeviceResLimit", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(deviceId, type, value); +} + +aclError AclrtResetDeviceResLimit(int32_t deviceId) +{ + typedef aclError (*AclrtResetDeviceResLimit)(int32_t); + static AclrtResetDeviceResLimit func = nullptr; + if (func == nullptr) { + func = (AclrtResetDeviceResLimit) GET_FUNC(aclrtResetDeviceResLimit); + } + + TORCH_CHECK(func, "Failed to find function aclrtResetDeviceResLimit", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(deviceId); +} + } // namespace acl } // namespace c10 diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 3eebe56d358a189f170860edbb9dc9b32265d4ff..3b6d47cf4a260c1dfa282dd96b12bb4ea8f232e0 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -32,6 +32,12 @@ enum aclrtStreamStatus { }; using aclrtStreamStatus = enum aclrtStreamStatus; +enum aclrtDevResModelType { + ACL_RT_DEV_RES_CUBE_CORE = 0, + ACL_RT_DEV_RES_VECTOR_CORE = 1, +}; +using aclrtDevResModelType = enum aclrtDevResModelType; + /** aclprofStepInfo is provide by acl, it used to be store dispatch op info. 
*/ @@ -228,5 +234,28 @@ aclError AclrtHostRegister(void *ptr, uint64_t size, aclrtHostRegisterType type, */ aclError AclrtHostUnregister(void *ptr); +aclError AclrtIpcMemGetExportKey(void *devPtr, size_t size, char *key, size_t len, uint64_t flag); + +aclError AclrtIpcMemSetImportPid(const char *key, int32_t *pid, size_t num); + +aclError AclrtIpcMemImportByKey(void **devPtr, const char *key, uint64_t flag); + +aclError AclrtIpcMemClose(const char *key); + +aclError AclrtMemExportToShareableHandle(aclrtDrvMemHandle handle, aclrtMemHandleType handleType, + uint64_t flags, uint64_t *shareableHandle); + +aclError AclrtMemSetPidToShareableHandle(uint64_t shareableHandle, int32_t *pid, size_t pidNum); + +aclError AclrtMemImportFromShareableHandle(uint64_t shareableHandle, int32_t deviceId, aclrtDrvMemHandle *handle); + +aclError AclrtDeviceGetBareTgid(int32_t *pid); + +aclError AclrtGetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t* value); + +aclError AclrtSetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t value); + +aclError AclrtResetDeviceResLimit(int32_t deviceId); + } // namespace acl } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/register/OptionRegister.cpp b/torch_npu/csrc/core/npu/register/OptionRegister.cpp index 8f7f17a0114a517ef7f5ef4b201b1bf749274210..9e46d36a6f0e74deb33502b64b143c4f7622f86d 100644 --- a/torch_npu/csrc/core/npu/register/OptionRegister.cpp +++ b/torch_npu/csrc/core/npu/register/OptionRegister.cpp @@ -4,6 +4,7 @@ #include "torch_npu/csrc/core/npu/register/OptionRegister.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/npu_log.h" +#include "torch_npu/csrc/core/npu/NpuVariables.h" namespace c10_npu { namespace option { @@ -84,6 +85,18 @@ OptionInterfaceBuilder::OptionInterfaceBuilder(const std::string &name, ::std::u void SetOption(const std::string &key, const std::string &val) { + if (c10_npu::IsAclnnOnly()) { + if (key == "jitCompile" && val == "enable") { + TORCH_NPU_WARN_ONCE("Current device only support jit_compile=False, ", + "the requested value True is invalid and has been reverted to False."); + return register_options::OptionRegister::GetInstance()->Set(key, "disable"); + } + if (key == "ALLOW_INTERNAL_FORMAT" && val == "enable") { + TORCH_NPU_WARN_ONCE("Current device only support allow_internal_format=False, ", + "the requested value True is invalid and has been reverted to False."); + return register_options::OptionRegister::GetInstance()->Set(key, "disable"); + } + } register_options::OptionRegister::GetInstance()->Set(key, val); } diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 5dcf4e88aa742236ff3f48b6e020f7a2a485e51b..9fc3bac5a89dd5336a777ef797e7defb5c01f9df 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -470,7 +470,7 @@ uint32_t OptionsManager::GetP2PBufferSize() const static uint32_t buf_size = []() -> uint32_t { char* buf_val = std::getenv("P2P_HCCL_BUFFSIZE"); // Default 0M - int64_t buf_size = (buf_val != nullptr) ? strtol(buf_val, nullptr, 10) : 0; + int64_t buf_size = (buf_val != nullptr) ? strtol(buf_val, nullptr, 10) : 20; TORCH_CHECK(buf_size >= 0, "P2P_HCCL_BUFFSIZE cannot be negative.", PTA_ERROR(ErrCode::VALUE)); return static_cast(buf_size); }(); @@ -485,6 +485,7 @@ uint32_t OptionsManager::GetAclOpInitMode() int64_t acl_op_init_mode = (buf_val != nullptr) ? 
strtol(buf_val, nullptr, 10) : 0; std::unordered_map aclOpInitMode = getAclOpInitMode(); if (aclOpInitMode.find(acl_op_init_mode) == aclOpInitMode.end()) { + acl_op_init_mode = 0; TORCH_NPU_WARN_ONCE("Get env ACL_OP_INIT_MODE not in [0, 1, 2], so reset it to the default value 0."); } return static_cast(acl_op_init_mode); @@ -622,7 +623,7 @@ bool OptionsManager::IsOomSnapshotEnable() return (envFlag != 0); } -bool OptionsManager::ShouldPrintLessError() +bool OptionsManager::IsCompactErrorOutput() { static bool should_print = []() -> bool { int32_t disabled_error = OptionsManager::GetBoolTypeOption("TORCH_NPU_COMPACT_ERROR_OUTPUT"); diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index 5be33e06daae47716164f4ad7299afabd8c3426c..73f5dbcb81f9fc268d8ef9122407e66b976dad08 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -133,7 +133,7 @@ public: static std::string GetOomSnapshotDumpPath(); static bool IsOomSnapshotEnable(); static bool ShouldPrintWarning(); - static bool ShouldPrintLessError(); + static bool IsCompactErrorOutput(); private: static int GetBoolTypeOption(const char* env_str, int defaultVal = 0); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index de1010347fb5ffa785dc7f0b1e44acb8e5fb5d7f..4b6707b8495b9a89c500e06f160b42950e9ae6fb 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -189,6 +189,8 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) lazy_fn_.clear(); + SetMainThread(); + init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); @@ -275,8 +277,8 @@ void NpuSysCtrl::RegisterLazyFn(const option::OptionCallBack& call_, const std:: lazy_fn_.emplace_back(std::make_pair(call_, in)); } -void NpuSysCtrl::RegisterReleaseFn(ReleaseFn release_fn, - ReleasePriority priority) { +void NpuSysCtrl::RegisterReleaseFn(ReleaseFn release_fn, ReleasePriority priority) +{ const auto& iter = this->release_fn_.find(priority); if (iter != release_fn_.end()) { release_fn_[priority].emplace_back(release_fn); diff --git a/torch_npu/csrc/custom_dtype/CMakeLists.txt b/torch_npu/csrc/custom_dtype/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d3d7c0e5379a0c23354a45a6dbd12c0bffea0ac --- /dev/null +++ b/torch_npu/csrc/custom_dtype/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB _CUS_DTYPE_SRCS *.cpp) + +LIST(APPEND CUS_DTYPE_SRCS ${_CUS_DTYPE_SRCS}) + +# Pass to parent +set(CUS_DTYPE_SRCS ${CUS_DTYPE_SRCS} PARENT_SCOPE) diff --git a/torch_npu/csrc/custom_dtype/Init.cpp b/torch_npu/csrc/custom_dtype/Init.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a88344ce5e969bc484b164ce2c28c41004394a52 --- /dev/null +++ b/torch_npu/csrc/custom_dtype/Init.cpp @@ -0,0 +1,125 @@ +#include "torch_npu/csrc/custom_dtype/Init.h" +#ifndef BUILD_LIBTORCH +#include +#include +#endif + + +namespace c10_npu { +struct DTypeConstants { + static const int float32_value; + static const int float16_value; + static const int int8_value; + static const int int32_value; + static const int uint8_value; + static const int int16_value; + static const int uint16_value; + static const int uint32_value; + static const int int64_value; + static const int uint64_value; + static const int float64_value; + static const int bool_value; + static const int string_value; + static const int 
complex64_value; + static const int complex128_value; + static const int bfloat16_value; + static const int int4_value; + static const int uint1_value; + static const int complex32_value; +}; + +const int DTypeConstants::float32_value = static_cast(DType::FLOAT); +const int DTypeConstants::float16_value = static_cast(DType::FLOAT16); +const int DTypeConstants::int8_value = static_cast(DType::INT8); +const int DTypeConstants::int32_value = static_cast(DType::INT32); +const int DTypeConstants::uint8_value = static_cast(DType::UINT8); +const int DTypeConstants::int16_value = static_cast(DType::INT16); +const int DTypeConstants::uint16_value = static_cast(DType::UINT16); +const int DTypeConstants::uint32_value = static_cast(DType::UINT32); +const int DTypeConstants::int64_value = static_cast(DType::INT64); +const int DTypeConstants::uint64_value = static_cast(DType::UINT64); +const int DTypeConstants::float64_value = static_cast(DType::DOUBLE); +const int DTypeConstants::bool_value = static_cast(DType::BOOL); +const int DTypeConstants::string_value = static_cast(DType::STRING); +const int DTypeConstants::complex64_value = static_cast(DType::COMPLEX64); +const int DTypeConstants::complex128_value = static_cast(DType::COMPLEX128); +const int DTypeConstants::bfloat16_value = static_cast(DType::BF16); +const int DTypeConstants::int4_value = static_cast(DType::INT4); +const int DTypeConstants::uint1_value = static_cast(DType::UINT1); +const int DTypeConstants::complex32_value = static_cast(DType::COMPLEX32); + +#ifndef BUILD_LIBTORCH +PyObject* cd_initExtension(PyObject*, PyObject *) +{ + auto torch_npu_C_module = THPObjectPtr(PyImport_ImportModule("torch_npu._C")); + if (!torch_npu_C_module) { + return nullptr; + } + auto torch_npu_C_m = py::handle(torch_npu_C_module).cast(); + auto m = torch_npu_C_m.def_submodule("_cd", "_cd bindings"); + + py::class_(m, "DType") + .def_readonly_static("float32", &DTypeConstants::float32_value) + .def_readonly_static("float16", &DTypeConstants::float16_value) + .def_readonly_static("int8", &DTypeConstants::int8_value) + .def_readonly_static("int32", &DTypeConstants::int32_value) + .def_readonly_static("uint8", &DTypeConstants::uint8_value) + .def_readonly_static("int16", &DTypeConstants::int16_value) + .def_readonly_static("uint16", &DTypeConstants::uint16_value) + .def_readonly_static("uint32", &DTypeConstants::uint32_value) + .def_readonly_static("int64", &DTypeConstants::int64_value) + .def_readonly_static("uint64", &DTypeConstants::uint64_value) + .def_readonly_static("float64", &DTypeConstants::float64_value) + .def_readonly_static("bool", &DTypeConstants::bool_value) + .def_readonly_static("string", &DTypeConstants::string_value) + .def_readonly_static("complex64", &DTypeConstants::complex64_value) + .def_readonly_static("complex128", &DTypeConstants::complex128_value) + .def_readonly_static("bfloat16", &DTypeConstants::bfloat16_value) + .def_readonly_static("int4", &DTypeConstants::int4_value) + .def_readonly_static("uint1", &DTypeConstants::uint1_value) + .def_readonly_static("complex32", &DTypeConstants::complex32_value); + + Py_RETURN_NONE; +} + +static PyMethodDef NPUCustomDtypeMethods[] = { // NOLINT + {"_cd_init", cd_initExtension, METH_NOARGS, nullptr}, + {nullptr, nullptr, 0, nullptr} +}; +#endif + +const std::string CustomDataTypeToString(int64_t dType) +{ + const std::map + TYPE_TO_STRING_MAP = { + {DType::FLOAT, "torch_npu.float32"}, + {DType::FLOAT16, "torch_npu.float16"}, + {DType::INT8, "torch_npu.int8"}, + {DType::INT32, "torch_npu.int32"}, 
+ {DType::UINT8, "torch_npu.uint8"}, + {DType::INT16, "torch_npu.int16"}, + {DType::UINT16, "torch_npu.uint16"}, + {DType::UINT32, "torch_npu.uint32"}, + {DType::INT64, "torch_npu.int64"}, + {DType::UINT64, "torch_npu.uint64"}, + {DType::DOUBLE, "torch_npu.float64"}, + {DType::BOOL, "torch_npu.bool"}, + {DType::STRING, "torch_npu.string"}, + {DType::COMPLEX64, "torch_npu.complex64"}, + {DType::COMPLEX128, "torch_npu.complex128"}, + {DType::BF16, "torch_npu.bfloat16"}, + {DType::INT4, "torch_npu.int4"}, + {DType::UINT1, "torch_npu.uint1"}, + {DType::COMPLEX32, "torch_npu.complex32"}}; + + const auto iter = TYPE_TO_STRING_MAP.find(static_cast(dType)); + return iter != TYPE_TO_STRING_MAP.end() ? iter->second : "Unknown dtype"; +} + +#ifndef BUILD_LIBTORCH +PyMethodDef* custom_dtype_functions() +{ + return NPUCustomDtypeMethods; +} +#endif +} diff --git a/torch_npu/csrc/custom_dtype/Init.h b/torch_npu/csrc/custom_dtype/Init.h new file mode 100644 index 0000000000000000000000000000000000000000..867e07ae3fe0671f4f8ddcd9fcda323a1bb6a5c9 --- /dev/null +++ b/torch_npu/csrc/custom_dtype/Init.h @@ -0,0 +1,75 @@ +#pragma once + +#include +#ifndef BUILD_LIBTORCH +#include +#endif +#include "torch_npu/csrc/core/npu/NPUMacros.h" +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" +#include "third_party/acl/inc/acl/acl_base.h" + +namespace c10_npu { +const int g_toAclOffset = 256; + +#define ENUM_OFFSET(new_name, old_name) new_name = static_cast(old_name) + g_toAclOffset, + +#ifndef BUILD_LIBTORCH +TORCH_NPU_API PyMethodDef* custom_dtype_functions(); +#endif + +enum class DType { + UNDEFINED = -1, + ENUM_OFFSET(FLOAT, ACL_FLOAT) + ENUM_OFFSET(FLOAT16, ACL_FLOAT16) + ENUM_OFFSET(INT8, ACL_INT8) + ENUM_OFFSET(INT32, ACL_INT32) + ENUM_OFFSET(UINT8, ACL_UINT8) + ENUM_OFFSET(INT16, ACL_INT16) + ENUM_OFFSET(UINT16, ACL_UINT16) + ENUM_OFFSET(UINT32, ACL_UINT32) + ENUM_OFFSET(INT64, ACL_INT64) + ENUM_OFFSET(UINT64, ACL_UINT64) + ENUM_OFFSET(DOUBLE, ACL_DOUBLE) + ENUM_OFFSET(BOOL, ACL_BOOL) + ENUM_OFFSET(STRING, ACL_STRING) + ENUM_OFFSET(COMPLEX64, ACL_COMPLEX64) + ENUM_OFFSET(COMPLEX128, ACL_COMPLEX128) + ENUM_OFFSET(BF16, ACL_BF16) + ENUM_OFFSET(INT4, ACL_INT4) + ENUM_OFFSET(UINT1, ACL_UINT1) + ENUM_OFFSET(COMPLEX32, ACL_COMPLEX32) +}; + +inline bool IsCustomDType(int64_t t) +{ + if (t >= g_toAclOffset) { + return true; + } + return false; +} + +// Both c10_npu::DType and ScalarType are supported +inline aclDataType GetAclDataType(int64_t t) +{ + if (t >= g_toAclOffset) { + return static_cast(t - g_toAclOffset); + } + return at_npu::native::OpPreparation::convert_to_acl_data_type( + static_cast(t)); +} + +inline aclDataType GetAclDataType(DType t) +{ + return static_cast(static_cast(t) - g_toAclOffset); +} + +inline at::ScalarType GetATenDType(int64_t t) +{ + aclDataType aclType = GetAclDataType(t); + return at_npu::native::OpPreparation::convert_to_scalar_type(aclType); +} + +const std::string CustomDataTypeToString(int64_t dType); + +} // namespace c10_npu diff --git a/torch_npu/csrc/distributed/HCCLUtils.hpp b/torch_npu/csrc/distributed/HCCLUtils.hpp index e9ad7bbd6af6a6d44dab6224888fe34b9d7526b7..1033d8de97f5d0d5eaa099041fd7089616fa3589 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.hpp +++ b/torch_npu/csrc/distributed/HCCLUtils.hpp @@ -17,7 +17,7 @@ auto Error = err_code; \ if ((Error) != HCCL_SUCCESS) { \ CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ - if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { \ + if 
(c10_npu::option::OptionsManager::IsCompactErrorOutput()) { \ std::ostringstream oss; \ oss << " HCCL function error: " << getErrorFunction(#err_code, ##__VA_ARGS__) \ << ", error code is " << Error << " " \ diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 86574926794476595f71ceed89eae6ea20a561af..c61e2b4ffacb80dc615e2721da8bd5cfe79af33c 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -576,7 +576,7 @@ Example:: Default settings return everything - i.e. contains HCCL comm dumps and collective traces. )"); - Py_RETURN_TRUE; + Py_RETURN_TRUE; } // c10d methods on torch._C diff --git a/torch_npu/csrc/distributed/ParallelTcpServer.cpp b/torch_npu/csrc/distributed/ParallelTcpServer.cpp index 38899ea5a8d8eb3efdf25afcd0494db1603f8238..72e7ebf9a096c630e2029ac6e1abffe20cf5a465 100644 --- a/torch_npu/csrc/distributed/ParallelTcpServer.cpp +++ b/torch_npu/csrc/distributed/ParallelTcpServer.cpp @@ -16,11 +16,13 @@ #include #include #include +#include #include #include #include #include #include "c10/util/Logging.h" +#include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "ParallelTcpServer.hpp" namespace c10d { @@ -315,6 +317,11 @@ int ParallelTcpServer::CreateLocalSocket(const std::string &localSocketPath) noe return -1; } + if (!at_npu::native::NpuUtils::setFilePermissions(sockFd, S_IRUSR | S_IWUSR | S_IRGRP)) { + close(sockFd); + return -1; + } + ret = listen(sockFd, MAX_EVENT_COUNT); if (ret != 0) { LOG(ERROR) << "listen local socket fd failed " << errno << " : " << strerror(errno); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index b6d189608b7d424566901370aac5067862c2d96e..d39604b299ec82036e816b53fcd24d6bb0968803 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -19,8 +19,12 @@ #include #include #include +#include +#include #include +#include + #include "op_plugin/OpInterface.h" #include "third_party/acl/inc/acl/acl.h" #include "third_party/acl/inc/acl/acl_base.h" @@ -62,6 +66,7 @@ constexpr const char* P2P_DEVICE_KEY = "_p2p"; using hcclUs = std::chrono::steady_clock::time_point; constexpr int32_t MAX_GROUP_NAME_LEN = 128; +constexpr int32_t NSLB_JOBID_OFFSET = 32; // HCCL ReduceOp mapping std::map hcclOp = { @@ -926,6 +931,9 @@ ProcessGroupHCCL::ProcessGroupHCCL( const char* blockingWait = getenv(HCCL_BLOCKING_WAIT); logPrefix_ = createLogPrefix(); + if (options_->global_ranks_in_group.empty()) { + numRanks_ = size_; + } dumpOnException_ = c10d::getCvarBool(TORCH_HCCL_DUMP_ON_TIMEOUT, false); heartbeat_ = 1ULL; monitorThreadEnabled_.store(c10d::getCvarBool(TORCH_HCCL_ENABLE_MONITORING, false)); @@ -941,6 +949,24 @@ ProcessGroupHCCL::ProcessGroupHCCL( c10d::PrefixStore *prefixStore = dynamic_cast(store_.get()); globalStore_ = prefixStore ? 
prefixStore->getUnderlyingNonPrefixStore() : store_; + c10::intrusive_ptr getTcpStore = store_; + while (getTcpStore) { + c10d::PrefixStore *asPrefixStore = dynamic_cast(getTcpStore.get()); + c10d::TCPStore *tcpStore = dynamic_cast(getTcpStore.get()); + if (tcpStore) { + if (!(tcpStore->getHost().empty())) { + tcpMasterAddr = tcpStore->getHost(); + tcpMasterPort = tcpStore->getPort(); + break; + } + } + if (asPrefixStore) { + getTcpStore = asPrefixStore->getUnderlyingStore(); + } else { + break; + } + } + try { if (blockingWait != nullptr) { auto val = std::stoi(blockingWait); @@ -1172,6 +1198,7 @@ void ProcessGroupHCCL::abortAndClearHcclComm(c10::optional abortRea abortCommsFromMap(devHCCLCommMap_, rank_, abortReason); devHCCLCommMap_.clear(); devHCCLCommNameMap_.clear(); + p2pSendRecvKeys_.clear(); hcclCommCounter_ = 0; return; } @@ -1214,6 +1241,7 @@ ProcessGroupHCCL::~ProcessGroupHCCL() } } devHCCLCommMap_.clear(); + p2pSendRecvKeys_.clear(); } ASCEND_LOGI("process group destroyed, group id is %s.", options_->group_id.c_str()); logger->info("process group destroyed, group id is %s.", options_->group_id.c_str()); @@ -1707,6 +1735,7 @@ void ProcessGroupHCCL::workCleanupLoop() try { if (needSetDevice) { c10::DeviceIndex device = static_cast(work.devices_[0].index()); + c10_npu::SetThreadAffinity(device); NPU_CHECK_ERROR(c10_npu::SetDevice(device)); deviceId_ = static_cast(work.devices_[0].index()); needSetDevice = false; @@ -2049,30 +2078,34 @@ bool ProcessGroupHCCL::recordHcclStatus(const std::string path, bool end, bool e } fileName << "torch_hccl_status-" << std::to_string(global_rank) << "_" << master_addr << "_" << std::to_string(deviceId_) << "_"; fileName << std::to_string(numRanks_) << "_" << std::to_string(pid) << "_" << std::to_string(duration) << ".log"; - std::string isMaster = "false"; + bool isMaster = false; if (global_rank == 0) { - isMaster = "true"; + isMaster = true; } std::string out_file_path = c10::str(path, "/", fileName.str()); checkAndMakePath(path.c_str(), "Open shared directory failed. 
Please check whether input path is valid."); createFile(out_file_path.c_str()); - outfile.open(out_file_path.c_str(), std::ios::trunc); - outfile << "{\"last_comm_op\":["; - bool first_op = true; + using json = nlohmann::json; + json result; + std::list last_comm_ops; for (auto info = StatusOutput_.begin(); info != StatusOutput_.end(); info++) { - if (first_op) { - outfile << "{"; - } else { - outfile << ", {"; - } - outfile << "\"seq\":" << info->second.seq << ", \"op_type\":\"" << info->second.opType; - outfile << "\", \"pg_id\":\"" << info->second.pgId << "\", \"comm_ids\":\"" << info->second.commIds; - outfile << "\", \"status\":\""<< info->second.status << "\"}"; - first_op = false; - } - outfile << "], \"is_master\":" << isMaster; - outfile << ", \"exception_message\":\"" << exceptionMessage_; - outfile << "\", \"global_pg_end_time\":" << end_duration << "}" << std::endl; + json comm_op; + comm_op["seq"] = info->second.seq; + comm_op["op_type"] = info->second.opType; + comm_op["pg_id"] = info->second.pgId; + comm_op["comm_ids"] = info->second.commIds; + comm_op["status"] = info->second.status; + last_comm_ops.emplace_back(comm_op); + } + if (!last_comm_ops.empty()) { + result["last_comm_op"] = last_comm_ops; + } + result["is_master"] = isMaster; + result["exception_message"] = exceptionMessage_; + result["global_pg_end_time"] = end_duration; + std::string result_str = result.dump(); + outfile.open(out_file_path.c_str(), std::ios::trunc); + outfile << result_str << std::endl; outfile.close(); return true; } @@ -2141,6 +2174,30 @@ std::vector>& ProcessGroupHCCL::getHCCLComm( return createHCCLComm(devicesKey, devices, commType, commConfig, p2pRank); } +void ProcessGroupHCCL::setNSLBCommConfig(HcclCommConfig** commConfig) +{ + const char* envPtr = std::getenv("RANK"); + if (envPtr == nullptr) { + ASCEND_LOGI("Failed to get env info for NSLB-DP."); + return; + } + uint32_t worldRankID = std::stoi(std::string(envPtr)); + options_->hccl_config["hccl_world_rank_id"] = worldRankID; + uint32_t masterPort = tcpMasterPort; + struct sockaddr_in sa; + std::string master_addr = tcpMasterAddr; + inet_pton(AF_INET, std::string(master_addr).c_str(), &(sa.sin_addr)); + uint32_t masterIp = ntohl(sa.sin_addr.s_addr); + uint64_t jobID = masterPort; + jobID = (jobID << NSLB_JOBID_OFFSET); + jobID += masterIp; + options_->hccl_config["hccl_job_id"] = jobID; + if ((*commConfig) != nullptr) { + (*commConfig)->hcclWorldRankID = worldRankID; + (*commConfig)->hcclJobID = jobID; + } +} + void ProcessGroupHCCL::createHCCLComm( const std::string& devicesKey, const std::vector& devices, @@ -2165,6 +2222,10 @@ void ProcessGroupHCCL::createHCCLComm( HcclCommConfig config; + if (options_->global_ranks_in_group.empty()) { + setNSLBCommConfig(&commConfig); + } + npuGuard.set_index(devices[i].index()); switch (commType) { case HcclCommType::DEFAULT: @@ -2295,6 +2356,9 @@ bool ProcessGroupHCCL::createHCCLCommEx( return false; } hcclComms[i] = subComm; + if (commType == HcclCommType::P2P) { + hcclComms[i]->p2pPeer = getP2pPeer(); + } // Creates the HCCL streams streamVal.push_back(getNPUStreamByCurrentType(devices[i].index())); } @@ -2397,6 +2461,14 @@ std::vector>& ProcessGroupHCCL::createHCCLComm( // Move the HCCL resource to cache devHCCLCommMap_.emplace(devicesKey, std::move(hcclComms)); + if (commType == HcclCommType::P2P) { + auto iter = p2pSendRecvKeys_.find(rank_); + if (iter == p2pSendRecvKeys_.end()) { + p2pSendRecvKeys_.emplace(rank_, std::vector{devicesKey}); + } else { + iter->second.push_back(devicesKey); + } 
+ } return devHCCLCommMap_[devicesKey]; } @@ -2407,7 +2479,13 @@ int64_t ProcessGroupHCCL::getStreamId(bool p2p, int peer) std::vector devices = {at::Device(c10::DeviceType::PrivateUse1, device)}; auto key = getKeyFromDevices(devices); if (p2p && hcclCommInitRootInfoConfigExist() && c10_npu::option::OptionsManager::GetP2PBufferSize() != 0) { - TORCH_CHECK(peer >= 0, "In p2p scenarios, the passed 'dst rank id' is error.", DIST_ERROR(ErrCode::PARAM)); + TORCH_CHECK( + peer >= 0, + "In p2p scenarios, the passed 'dst rank id' : ", + peer, + " is error, ", + "expected value >= 0.", + DIST_ERROR(ErrCode::PARAM)); key = getKeySendRecv(rank_, peer); } if ((hcclStreams_.count(key) == 0) || hcclStreams_[key].empty()) { @@ -2735,7 +2813,7 @@ void ProcessGroupHCCL::resumeHcclComm(int device_id) { at::Device device = at::Device(c10::DeviceType::PrivateUse1, device_id); std::vector devices = {device}; - const auto key = getKeyFromDevices(devices); + auto key = getKeyFromDevices(devices); { std::lock_guard lock(mutex_); @@ -2747,6 +2825,19 @@ void ProcessGroupHCCL::resumeHcclComm(int device_id) HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm)); } } + if (p2pSendRecvKeys_.find(rank_) != p2pSendRecvKeys_.end()) { + auto p2pKeys = p2pSendRecvKeys_[rank_]; + for (const auto& p2pKey : p2pKeys) { + if (devHCCLCommMap_.find(p2pKey) != devHCCLCommMap_.end()) { + // Reuse the cached communicator if there is one. + auto& hcclComms = devHCCLCommMap_[p2pKey]; + for (const auto& hcclComm : hcclComms) { + auto comm = hcclComm->getHcclComm(); + HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm)); + } + } + } + } } ASCEND_LOGI("resumeHcclComm success, group id is %s.", options_->group_id.c_str()); } @@ -3087,6 +3178,22 @@ HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() } } + if (options_->hccl_config.find("hccl_world_rank_id") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["hccl_world_rank_id"])) { + config.hcclOpExpansionMode = std::get(options_->hccl_config["hccl_world_rank_id"]); + } else { + TORCH_CHECK(false, "Value type of hccl_world_rank_id should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + + if (options_->hccl_config.find("hccl_job_id") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["hccl_job_id"])) { + config.hcclOpExpansionMode = std::get(options_->hccl_config["hccl_job_id"]); + } else { + TORCH_CHECK(false, "Value type of hccl_job_id should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + return config; } @@ -3684,7 +3791,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce( [&](std::vector& hcclStreams, c10::intrusive_ptr&) { if (tensors[0].scalar_type() == at::kBool || tensors[0].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - tensors_cp[0] = at_npu::native::custom_ops::npu_dtype_cast(tensors[0], at::kInt); + tensors_cp[0] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[0], at::kInt); } }, [&](std::vector& hcclStreams, c10::intrusive_ptr&) { @@ -3854,7 +3961,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce_coalesced( for (const auto i : c10::irange(tensors.size())) { if (tensors[i].scalar_type() == at::kBool || tensors[i].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - tensors_cp[i] = at_npu::native::custom_ops::npu_dtype_cast(tensors[i], at::kInt); + tensors_cp[i] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[i], at::kInt); } } }, @@ -3913,7 +4020,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce( [&](std::vector& 
hcclStreams, c10::intrusive_ptr&) { if (tensors[0].scalar_type() == at::kBool || tensors[0].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - tensors_cp[0] = at_npu::native::custom_ops::npu_dtype_cast(tensors[0], at::kInt); + tensors_cp[0] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[0], at::kInt); } }, [&](std::vector& hcclStreams, c10::intrusive_ptr&) { @@ -3973,11 +4080,11 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_oop( [&](std::vector& hcclStreams, c10::intrusive_ptr&) { if (inputTensors[0].scalar_type() == at::kBool || inputTensors[0].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - inputTensors[0] = at_npu::native::custom_ops::npu_dtype_cast(inputTensors[0], at::kInt); + inputTensors[0] = at_npu::native::custom_ops::_npu_dtype_cast(inputTensors[0], at::kInt); } if (outputTensors[0].scalar_type() == at::kBool || outputTensors[0].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - outputTensors[0] = at_npu::native::custom_ops::npu_dtype_cast(outputTensors[0], at::kInt); + outputTensors[0] = at_npu::native::custom_ops::_npu_dtype_cast(outputTensors[0], at::kInt); } }, [&](std::vector& hcclStreams, c10::intrusive_ptr&) { @@ -4012,14 +4119,14 @@ at::Tensor ProcessGroupHCCL::byte_alignment(at::Tensor& tensors) const if (num_add != 0) { bool transflag = false; if (inter_tensors.scalar_type() == at::ScalarType::Bool) { - inter_tensors = at_npu::native::custom_ops::npu_dtype_cast(inter_tensors, at::ScalarType::Int); + inter_tensors = at_npu::native::custom_ops::_npu_dtype_cast(inter_tensors, at::ScalarType::Int); transflag = true; } inter_tensors = op_plugin::constant_pad_nd(inter_tensors, {0, num_add}, 0); if (transflag) { - inter_tensors = at_npu::native::custom_ops::npu_dtype_cast(inter_tensors, at::ScalarType::Bool); + inter_tensors = at_npu::native::custom_ops::_npu_dtype_cast(inter_tensors, at::ScalarType::Bool); } } return inter_tensors; diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 26a88874ab24c9f8134217f07d9064534036aa56..0c638ef579957aa94b1466545e94867842efa448 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -384,7 +384,7 @@ public: return c10::make_intrusive(_is_high_priority_stream); } - std::unordered_map> hccl_config; + std::unordered_map> hccl_config; std::chrono::milliseconds opTimeout; // Schedule HCCL operations on high priority CUDA streams @@ -571,6 +571,8 @@ public: void resumeHcclComm(int device_id); + void setNSLBCommConfig(HcclCommConfig** commConfig); + bool setCommWorkingDevNic( const HcclComm& comm, int nranks, @@ -746,6 +748,8 @@ protected: // // Note that the order of the device for the tensor list matters. std::unordered_map>> devHCCLCommMap_; + + std::unordered_map> p2pSendRecvKeys_; std::unordered_map devHCCLCommNameMap_; @@ -960,6 +964,10 @@ protected: std::string pg_desc_; + std::string tcpMasterAddr; + + uint32_t tcpMasterPort; + private: // Helper that encapsulates work shared across all collective communication // primitives. 
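For context on the NSLB-DP fields added to ProcessGroupHCCL above: setNSLBCommConfig derives a 64-bit job ID by placing the TCP store's master port in the upper 32 bits (NSLB_JOBID_OFFSET) and the IPv4 master address, converted to host byte order, in the lower 32 bits. The snippet below is a minimal standalone sketch of that packing, assuming an IPv4 master address; the helper name make_nslb_job_id and the example endpoint are illustrative only and are not part of the patch.

```cpp
// Standalone sketch of the NSLB-DP job-ID packing (illustrative only).
#include <arpa/inet.h>
#include <netinet/in.h>
#include <cstdint>
#include <iostream>
#include <string>

constexpr int kNslbJobIdOffset = 32;  // mirrors NSLB_JOBID_OFFSET in the patch

// Hypothetical helper: combines the rendezvous endpoint (port, IPv4 address)
// into a single 64-bit identifier, as setNSLBCommConfig does.
uint64_t make_nslb_job_id(const std::string& master_addr, uint32_t master_port)
{
    struct sockaddr_in sa {};
    if (inet_pton(AF_INET, master_addr.c_str(), &sa.sin_addr) != 1) {
        return 0;  // not a dotted-quad IPv4 address; real code would need extra handling
    }
    uint32_t master_ip = ntohl(sa.sin_addr.s_addr);  // host byte order
    uint64_t job_id = static_cast<uint64_t>(master_port) << kNslbJobIdOffset;  // port -> high 32 bits
    job_id += master_ip;                                                       // address -> low 32 bits
    return job_id;
}

int main()
{
    // Example endpoint; in the patch these values come from the underlying TCPStore.
    std::cout << std::hex << make_nslb_job_id("10.0.0.1", 29500) << std::endl;
    return 0;
}
```

Because the (address, port) pair identifies the rendezvous endpoint shared by every rank of a job, the packed value is the same on all ranks and can therefore serve as the hccl_job_id handed to HCCL.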
diff --git a/torch_npu/csrc/distributed/StoreMessagePacker.cpp b/torch_npu/csrc/distributed/StoreMessagePacker.cpp index 0ff08c8d95f08a43032cba4895fb7021749c98f7..61f0388e662327fc6031014637880a01d17597af 100644 --- a/torch_npu/csrc/distributed/StoreMessagePacker.cpp +++ b/torch_npu/csrc/distributed/StoreMessagePacker.cpp @@ -86,6 +86,7 @@ int64_t StoreMessagePacker::Unpack(const std::vector &buffer, StoreMess } auto ptr = buffer.data(); + auto ptr_end = ptr + buffer.size(); auto totalSize = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); @@ -97,22 +98,26 @@ int64_t StoreMessagePacker::Unpack(const std::vector &buffer, StoreMess auto keyCount = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); - message.keys.reserve(keyCount); for (auto i = 0UL; i < keyCount; i++) { auto keySize = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); message.keys.emplace_back(reinterpret_cast(ptr), keySize); ptr += keySize; + if (ptr > ptr_end) { + break; + } } auto valueCount = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); - message.values.reserve(valueCount); for (auto i = 0UL; i < valueCount; i++) { auto valueSize = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); message.values.emplace_back(ptr, ptr + valueSize); ptr += valueSize; + if (ptr > ptr_end) { + break; + } } return static_cast(totalSize); diff --git a/torch_npu/csrc/distributed/TraceUtils.h b/torch_npu/csrc/distributed/TraceUtils.h index 9d4f9d9d523d14b2dc2421cedae95e268d43e236..f6140a3d06f6a81d1ee208f04f5945a2f4e820a8 100644 --- a/torch_npu/csrc/distributed/TraceUtils.h +++ b/torch_npu/csrc/distributed/TraceUtils.h @@ -711,71 +711,71 @@ DEFINE_CONSTANT(started_state, "started") if (includeCollectives) { std::list entries; for (auto& e : dump_entries()) { - json j; - if (onlyActive && e.time_discovered_completed_.has_value()) { - continue; - } - j[record_id_key_str] = int64_t(e.id_); - j[pg_id_key_str] = int64_t(e.pg_id_); - j[pg_name_key_str] = e.pg_name_; - j[collective_seq_id_key_str] = int64_t(e.collective_seq_id_); - j[p2p_seq_id_key_str] = int64_t(e.p2p_seq_id_); - j[op_id_key_str] = int64_t(e.op_id_); - j[profiling_name_key_str] = e.profiling_name_; - j[time_created_key_str] = int64_t(e.time_created_); - if (e.duration_) { - j[duration_key_str] = *e.duration_; - } - auto it = e.sizes_.begin(); - auto read_sizes = [&](const c10::SmallVector& dims) { - auto sizes = std::list>(); - for (auto dim : dims) { - auto arg_sizes = std::list(); - for (auto i : c10::irange(dim)) { - (void)i; - arg_sizes.push_back(*it++); + json j; + if (onlyActive && e.time_discovered_completed_.has_value()) { + continue; } - sizes.push_back(arg_sizes); + j[record_id_key_str] = int64_t(e.id_); + j[pg_id_key_str] = int64_t(e.pg_id_); + j[pg_name_key_str] = e.pg_name_; + j[collective_seq_id_key_str] = int64_t(e.collective_seq_id_); + j[p2p_seq_id_key_str] = int64_t(e.p2p_seq_id_); + j[op_id_key_str] = int64_t(e.op_id_); + j[profiling_name_key_str] = e.profiling_name_; + j[time_created_key_str] = int64_t(e.time_created_); + if (e.duration_) { + j[duration_key_str] = *e.duration_; } - return sizes; - }; - j[input_sizes_key_str] = read_sizes(e.input_dims_); - std::vector input_dtypes_strs; - input_dtypes_strs.reserve(e.input_dtypes_.size()); - for (const auto& input_dtype : e.input_dtypes_) { - input_dtypes_strs.emplace_back(c10::toString(input_dtype)); - } - j[input_dtypes_key_str] = input_dtypes_strs; - j[output_sizes_key_str] = read_sizes(e.output_dims_); - std::vector output_dtypes_strs; - output_dtypes_strs.reserve(e.output_dtypes_.size()); - for (const auto& 
output_dtype : e.output_dtypes_) { - output_dtypes_strs.emplace_back(c10::toString(output_dtype)); - } - j[output_dtypes_key_str] = output_dtypes_strs; - if (e.time_discovered_completed_.has_value()) { - j[state_key_str] = completed_state_str; - } else if (e.time_discovered_started_.has_value()) { - j[state_key_str] = started_state_str; - } else { - j[state_key_str] = scheduled_state_str; - } - j[time_discovered_started_key_str] = - e.time_discovered_started_.has_value() - ? int64_t(*e.time_discovered_started_) - : 0; - j[time_discovered_completed_key_str] = - e.time_discovered_completed_.has_value() - ? int64_t(*e.time_discovered_completed_) - : 0; - j[retired_key_str] = e.retired_; - j[timeout_key_str] = e.timeout_ms_; - j[is_p2p_key_str] = e.isP2P_; - entries.emplace_back(j); + auto it = e.sizes_.begin(); + auto read_sizes = [&](const c10::SmallVector& dims) { + auto sizes = std::list>(); + for (auto dim : dims) { + auto arg_sizes = std::list(); + for (auto i : c10::irange(dim)) { + (void)i; + arg_sizes.push_back(*it++); + } + sizes.push_back(arg_sizes); + } + return sizes; + }; + j[input_sizes_key_str] = read_sizes(e.input_dims_); + std::vector input_dtypes_strs; + input_dtypes_strs.reserve(e.input_dtypes_.size()); + for (const auto& input_dtype : e.input_dtypes_) { + input_dtypes_strs.emplace_back(c10::toString(input_dtype)); + } + j[input_dtypes_key_str] = input_dtypes_strs; + j[output_sizes_key_str] = read_sizes(e.output_dims_); + std::vector output_dtypes_strs; + output_dtypes_strs.reserve(e.output_dtypes_.size()); + for (const auto& output_dtype : e.output_dtypes_) { + output_dtypes_strs.emplace_back(c10::toString(output_dtype)); + } + j[output_dtypes_key_str] = output_dtypes_strs; + if (e.time_discovered_completed_.has_value()) { + j[state_key_str] = completed_state_str; + } else if (e.time_discovered_started_.has_value()) { + j[state_key_str] = started_state_str; + } else { + j[state_key_str] = scheduled_state_str; + } + j[time_discovered_started_key_str] = + e.time_discovered_started_.has_value() + ? int64_t(*e.time_discovered_started_) + : 0; + j[time_discovered_completed_key_str] = + e.time_discovered_completed_.has_value() + ? 
int64_t(*e.time_discovered_completed_) + : 0; + j[retired_key_str] = e.retired_; + j[timeout_key_str] = e.timeout_ms_; + j[is_p2p_key_str] = e.isP2P_; + entries.emplace_back(j); } if (!entries.empty()) { - result[entries_key_str] = entries; + result[entries_key_str] = entries; } } diff --git a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp index 1b85e7fce6438a9a4bb657a0ea1124c62be066c5..319de4ae93d3704562c0cfb74a2418d7afff3ea7 100644 --- a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -424,6 +424,9 @@ void TensorPipeAgent::startImpl() priority = opts_.transports->size() - 1 - (iter - opts_.transports->begin()); } std::unique_ptr reg = TensorPipeTransportRegistry()->Create(key); + if (reg == nullptr || reg->transport == nullptr) { + TORCH_CHECK(false, "TensorPipeTransport get nullptr", DIST_ERROR(ErrCode::PTR)); + } if (!reg->transport->isViable()) { continue; } diff --git a/torch_npu/csrc/framework/FormatHelper.cpp b/torch_npu/csrc/framework/FormatHelper.cpp index 6a92fe5af4d8039b3d0ff9c50e49d1fd5fa30a00..9bd270b8fd231cb39a8bc9b98c8680b88a66e6a2 100644 --- a/torch_npu/csrc/framework/FormatHelper.cpp +++ b/torch_npu/csrc/framework/FormatHelper.cpp @@ -52,6 +52,10 @@ std::unordered_map FormatHelper::Initialize {ACL_FORMAT_NDC1HWC0, (FormatInfo){ACL_FORMAT_NDC1HWC0, ACL_FORMAT_NCDHW, InferShapeOfNDC1HWC0, "NDC1HWC0", true}}, {ACL_FRACTAL_Z_3D, (FormatInfo){ACL_FRACTAL_Z_3D, ACL_FORMAT_NCDHW, InferShapeOfFZ3D, "FRACTAL_Z_3D", true}}, + {ACL_FORMAT_FRACTAL_NZ_C0_16, + (FormatInfo){ACL_FORMAT_FRACTAL_NZ_C0_16, ACL_FORMAT_ND, nullptr, "FRACTAL_NZ_C0_16", true}}, + {ACL_FORMAT_FRACTAL_NZ_C0_32, + (FormatInfo){ACL_FORMAT_FRACTAL_NZ_C0_32, ACL_FORMAT_ND, nullptr, "FRACTAL_NZ_C0_32", true}}, }; }; diff --git a/torch_npu/csrc/framework/LazyInitAclops.cpp b/torch_npu/csrc/framework/LazyInitAclops.cpp index 8d12df0a312afb289cf6164c6d613a49a3fb0caa..5f51f9f0a5cfd43afceb87cf6f4552ea61d4e50c 100644 --- a/torch_npu/csrc/framework/LazyInitAclops.cpp +++ b/torch_npu/csrc/framework/LazyInitAclops.cpp @@ -4,7 +4,6 @@ #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" -#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -158,8 +157,6 @@ void SetPrecisionMode() void LazyInitAclopsCore() { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - #ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; if (PyGILState_Check()) { @@ -175,8 +172,6 @@ void LazyInitAclopsCore() PyEval_RestoreThread(gilState); } #endif - - c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void LazyInitAclops() @@ -198,14 +193,10 @@ void LazyInitAclops() void InitAclopsCore() { - SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - SetPrecisionMode(); MakeCompileCacheDirAndSetOption(); GetAndSetDefaultJitCompileByAcl(); SetHF32DefaultValue(); - - SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void InitAclops() diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index 6b98651c51dba728c9062a47d777650ae7ac93a6..80af05f94b321cac1e928484e18027c1e5cc836b 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -24,7 +24,9 @@ static std::unordered_map> 
floating_limits_m {at::ScalarType::Double, {std::numeric_limits::max(), std::numeric_limits::min()}}, {at::ScalarType::Float, {std::numeric_limits::max(), std::numeric_limits::min()}}, {at::ScalarType::BFloat16, {std::numeric_limits::max(), std::numeric_limits::min()}}, - {at::ScalarType::Half, {65504, -65504}}}; + {at::ScalarType::Half, {65504, -65504}}, + {at::ScalarType::Float8_e5m2, {57345, -57345}}, + {at::ScalarType::Float8_e4m3fn, {449, -449}}}; static std::unordered_map> integral_limits_map{ {at::ScalarType::Long, {std::numeric_limits::max(), std::numeric_limits::min()}}, {at::ScalarType::Int, {std::numeric_limits::max(), std::numeric_limits::min()}}, @@ -274,7 +276,7 @@ OpCommand& OpCommand::AddTensorInput(at::Tensor &tensor, at::ScalarType forceSca { std::tuple res; if (commonType.has_value() && commonType.value() != tensor.scalar_type()) { - tensor = custom_ops::npu_dtype_cast(tensor, commonType.value()); + tensor = custom_ops::_npu_dtype_cast(tensor, commonType.value()); } // as for dim=0, the dtype of tensor can not be `uint16` because of `TBE` if (torch_npu::NPUBridge::GetNpuStorageImplDesc(tensor).storage_sizes_.empty()) { @@ -331,7 +333,7 @@ OpCommand& OpCommand::AddScalarInput(const c10::Scalar& input, at::ScalarType ty OpCommand& OpCommand::AddOutput(at::Tensor &output, const string &realType) { if (resultTypeDefined == false && commonType.has_value() && commonType.value() != output.scalar_type()) { - output = custom_ops::npu_dtype_cast(output, commonType.value()); + output = custom_ops::_npu_dtype_cast(output, commonType.value()); } auto res = OpCmdHelper::CovertToAclOutput(output, realType); aclCmd->AddOutput(std::get<0>(res), std::get<1>(res)); diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index ce8b9065146b362aca3e35a6531994d21a13cea0..0db71b00a109eadcbfacc134d79614ddff98b6e9 100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -575,6 +575,7 @@ void *NewFunc(int caption, int &size) void DeleteFunc(void *ptr) { free(ptr); + ptr = nullptr; } using Func = int (*)(c10_npu::queue::QueueParas *, aclrtStream); diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp index 6a23a5e4b9094896dc11efa83fed5deb413792eb..08a2d603b6cd96acbe4cd0a05435b15fae9d275c 100644 --- a/torch_npu/csrc/framework/StorageDescHelper.cpp +++ b/torch_npu/csrc/framework/StorageDescHelper.cpp @@ -62,9 +62,13 @@ void StorageDescHelper::UpdateDesc(torch_npu::NPUStorageDesc &npuDesc, const c10 } } npuDesc.base_strides_ = new_stride; - // 更新物理内存信息 - npuDesc.storage_sizes_ = FormatHelper::GetStorageSizes(npuDesc); + int NCDHW_OR_NDHWC_DIM = 5; + if ((npuDesc.npu_format_ == ACL_FORMAT_NCDHW || npuDesc.npu_format_ == ACL_FORMAT_NDHWC) && new_size.size() < NCDHW_OR_NDHWC_DIM) { + npuDesc.storage_sizes_ = new_size; + } else { + npuDesc.storage_sizes_ = FormatHelper::GetStorageSizes(npuDesc); + } if (new_data_numel > new_shape_numel) { // Refresh format to base format only when flattening storage data npuDesc.storage_sizes_ = new_size; @@ -98,6 +102,13 @@ void StorageDescHelper::SetDesc(at::Tensor &dst, const c10::IntArrayRef& size, c torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_ = SetDesc(dst.dtype(), size, strides, format); } +void StorageDescHelper::SetDesc(at::Tensor &dst, const c10::IntArrayRef &base_size, + const c10::IntArrayRef &storage_size, const c10::IntArrayRef &strides, aclFormat format) +{ + 
torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_ = + SetDesc(dst.dtype(), base_size, storage_size, strides, format); +} + bool StorageDescHelper::CheckDescInit(const c10::Storage &storage) { return torch_npu::NPUBridge::GetNpuStorageImpl(storage.unsafeGetStorageImpl())->npu_desc_.origin_format_ != @@ -255,6 +266,22 @@ torch_npu::NPUStorageDesc StorageDescHelper::SetDesc(const caffe2::TypeMeta &dty return npu_desc; } +torch_npu::NPUStorageDesc StorageDescHelper::SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& base_size, + const c10::IntArrayRef& storage_size, const c10::IntArrayRef& strides, aclFormat format) +{ + struct torch_npu::NPUStorageDesc npu_desc; + npu_desc.data_type_ = dtype; + npu_desc.base_sizes_ = base_size; + npu_desc.base_strides_ = strides; + aclFormat baseFormat; + aclFormat npuFormat; + std::tie(baseFormat, npuFormat) = InferFormat::GuessFormatUnit(base_size, format); + npu_desc.storage_sizes_ = storage_size; + npu_desc.origin_format_ = baseFormat; + npu_desc.npu_format_ = npuFormat; + return npu_desc; +} + int64_t StorageDescHelper::GetMemorySize(const torch_npu::NPUStorageDesc &dst) { const auto &physical_size = FormatHelper::GetStorageSizes(dst); diff --git a/torch_npu/csrc/framework/StorageDescHelper.h b/torch_npu/csrc/framework/StorageDescHelper.h index 6497ee1a8825551acc85d71de0a2582db771c548..f3b35067e0f5fcf8f9f46f9886bd2286b48e887b 100644 --- a/torch_npu/csrc/framework/StorageDescHelper.h +++ b/torch_npu/csrc/framework/StorageDescHelper.h @@ -35,6 +35,8 @@ public: static void SetDesc(at::Tensor &dst, const c10::IntArrayRef& size, const c10::IntArrayRef& strides); static void SetDesc(at::Tensor &dst, const c10::IntArrayRef& size, const c10::IntArrayRef& strides, aclFormat format); + static void SetDesc(at::Tensor &dst, const c10::IntArrayRef &base_size, + const c10::IntArrayRef &storage_size, const c10::IntArrayRef &strides, aclFormat format); static bool CheckDescInit(const c10::Storage &storage); // For Serialization to Get and Set NpuStorageDesc @@ -63,6 +65,8 @@ private: const c10::IntArrayRef& strides); static torch_npu::NPUStorageDesc SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& size, const c10::IntArrayRef& strides, aclFormat format); + static torch_npu::NPUStorageDesc SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& base_size, + const c10::IntArrayRef& storage_size, const c10::IntArrayRef& strides, aclFormat format); }; } // namespace native diff --git a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp index c2abf7f4b2ae45e57a603f91ea3480e9519e4b1f..ee90387910967e7113f0153b0a8aea3099c0cb50 100644 --- a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp @@ -70,6 +70,14 @@ private: ResetDataPtr(src, self, static_cast(src.storage().data_ptr().get())); return true; + case at::ScalarType::Float8_e5m2: + ResetDataPtr(src, self, + static_cast(src.storage().data_ptr().get())); + return true; + case at::ScalarType::Float8_e4m3fn: + ResetDataPtr(src, self, + static_cast(src.storage().data_ptr().get())); + return true; default: // Turn to conducting d2dCopyAsync for other dtypes. 
return false; diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index ad1f76c5e57105f546893c6ae6cd916688372839..f1bab9565f504e4d62640f432be6c0346f718a9e 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -1,6 +1,5 @@ #include #include "torch_npu/csrc/core/npu/NPUException.h" -#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "third_party/acl/inc/acl/acl_mdl.h" #include "torch_npu/csrc/framework/utils/ForceJitCompileList.h" @@ -47,9 +46,23 @@ REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string &val) { aclmdlSetDump(val.c_str()); }) -static bool acl_op_has_init = false; +bool CheckJitDisableInner() +{ + auto val = c10_npu::option::GetOption("jitCompile"); + if (val.has_value()) { + if (val.value() == ("disable")) { + return true; + } + if (val.value() == ("enable")) { + return false; + } + } + if (c10_npu::GetSocVersion() >= c10_npu::SocVersion::Ascend910B1) { + return true; + } + return false; +} -REGISTER_OPTION_BOOL_FUNCTION(CheckJitDisableInner, jitCompile, "enable", "disable") REGISTER_OPTION_CACHE(bool, isJitDisable, CheckJitDisableInner) REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); @@ -60,14 +73,7 @@ REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { "Jit compile set is disabled! If you want to set, ", "please change the environment variable ACL_OP_INIT_MODE to 0 or 1.", PTA_ERROR(ErrCode::NOT_SUPPORT)); - if (!acl_op_has_init) { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - } NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); - if (!acl_op_has_init) { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); - acl_op_has_init = true; - } } SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? 
true : false); }) diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp index c1bf40b0ee7a351643f730b5153b52e5bcc96ef2..97a07a9025deda50040bfe4047cbbede07620532 100644 --- a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp +++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp @@ -52,8 +52,8 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(ENUM_PAIR_FUNC) _(at::ScalarType::Bits4x2, ACL_DT_UNDEFINED) \ _(at::ScalarType::Bits8, ACL_DT_UNDEFINED) \ _(at::ScalarType::Bits16, ACL_DT_UNDEFINED) \ - _(at::ScalarType::Float8_e5m2, ACL_DT_UNDEFINED) \ - _(at::ScalarType::Float8_e4m3fn, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Float8_e5m2, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Float8_e4m3fn, ACL_DT_UNDEFINED) \ _(at::ScalarType::Float8_e5m2fnuz, ACL_DT_UNDEFINED) \ _(at::ScalarType::Float8_e4m3fnuz, ACL_DT_UNDEFINED) \ _(at::ScalarType::UInt16, ACL_UINT16) \ @@ -86,6 +86,28 @@ AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(ENUM_PAIR_FUNC) static std::map STRING_SCALAR_TYPE_TO_ACL_TYPE_MAP = { {"uint16", ACL_UINT16}, {"uint8", ACL_UINT8}, {"uint64", ACL_UINT64}, {"string", ACL_STRING}}; +static std::unordered_map + ACL_TYPE_TO_SCALAR_TYPE_MAP = {{ACL_DT_UNDEFINED, at::ScalarType::Undefined}, + {ACL_FLOAT, at::ScalarType::Float}, + {ACL_FLOAT16, at::ScalarType::Half}, + {ACL_INT8, at::ScalarType::Char}, + {ACL_INT32, at::ScalarType::Int}, + {ACL_UINT8, at::ScalarType::Byte}, + {ACL_INT16, at::ScalarType::Short}, + {ACL_UINT16, at::ScalarType::UInt16}, + {ACL_UINT32, at::ScalarType::UInt32}, + {ACL_INT64, at::ScalarType::Long}, + {ACL_UINT64, at::ScalarType::UInt64}, + {ACL_DOUBLE, at::ScalarType::Double}, + {ACL_BOOL, at::ScalarType::Bool}, + {ACL_STRING, at::ScalarType::Undefined}, + {ACL_COMPLEX64, at::ScalarType::ComplexFloat}, + {ACL_COMPLEX128, at::ScalarType::ComplexDouble}, + {ACL_BF16, at::ScalarType::BFloat16}, + {ACL_INT4, at::ScalarType::Undefined}, + {ACL_UINT1, at::ScalarType::Undefined}, + {ACL_COMPLEX32, at::ScalarType::ComplexHalf}}; + aclError AclrtMemcpyAsyncParamCheck(void *dst, size_t destMax, const void *src, size_t count, aclrtMemcpyKind kind, aclrtStream stream) { @@ -297,5 +319,17 @@ int8_t CalcuOpUtil::GetCubeMathType(bool allowHf32) return iter->second; } +at::ScalarType CalcuOpUtil::ConvertToScalarType(const aclDataType data_type) +{ + auto iter = ACL_TYPE_TO_SCALAR_TYPE_MAP.find(data_type); + if (iter == ACL_TYPE_TO_SCALAR_TYPE_MAP.end()) { + TORCH_CHECK(false, + std::string("aclDataType:") + std::to_string(data_type) + " has not been supported", + OPS_ERROR(ErrCode::NOT_SUPPORT)) + } + + return iter->second; +} + } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.h b/torch_npu/csrc/framework/utils/CalcuOpUtil.h index 481c1756a4d484396e166f9ff869310ee7592652..5ee41e7d64b4988d356dcb69a9b06e332277e0f7 100644 --- a/torch_npu/csrc/framework/utils/CalcuOpUtil.h +++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.h @@ -36,14 +36,23 @@ using std::vector; #define ASCEND_ALWAYS_INLINE inline #endif -#define ACL_REQUIRE_OK_OP(expr, opstr) \ - do { \ - if (ASCEND_UNLIKELY((expr) != 0)) { \ - std::cout << (opstr) << std::endl; \ - TORCH_CHECK((expr) == 0, __func__, ":", __FILE__, ":", __LINE__, \ - " NPU error,NPU error code is:", expr, "\n", \ - c10_npu::acl::AclGetErrMsg(), OPS_ERROR(ErrCode::INTERNAL)); \ - } \ +#define ACL_REQUIRE_OK_OP(expr, opstr) \ + do { \ + if (ASCEND_UNLIKELY((expr) != 0)) { \ + std::cout << (opstr) << std::endl; \ + if 
(c10_npu::option::OptionsManager::IsCompactErrorOutput()) { \ + std::ostringstream oss; \ + oss << " NPU error,NPU error code is:" << (expr) << "\n" \ + << OPS_ERROR(ErrCode::INTERNAL); \ + std::string err_msg=oss.str(); \ + ASCEND_LOGE("%s", err_msg.c_str()); \ + TORCH_CHECK((expr) == 0, c10_npu::c10_npu_get_error_message()); \ + } else { \ + TORCH_CHECK((expr) == 0, __func__, ":", __FILE__, ":", __LINE__, \ + " NPU error,NPU error code is:", expr, "\n", \ + c10_npu::acl::AclGetErrMsg(), OPS_ERROR(ErrCode::INTERNAL)); \ + } \ + } \ } while (0) using StorageAndOffsetMemSizePair = std::pair; @@ -81,6 +90,7 @@ public: static int64_t GetTensorNpuFormat(const at::Tensor &tensor); static c10::SmallVector ConvertIntArrayRefToSmallVector(c10::IntArrayRef intArray); static int8_t GetCubeMathType(bool allowHf32); + static at::ScalarType ConvertToScalarType(const aclDataType data_type); }; } // namespace native diff --git a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp index a1b2b00aa12b2b2e96f75c0cf7ff4c718e653f01..1626499a80b2870fb4cad1c8c726595e003212ff 100644 --- a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp +++ b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp @@ -18,7 +18,6 @@ namespace at_npu { namespace native { - void ForceAclnn::RegisterOp(const std::string &list) { if (list.empty()) { diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp index 47d4fac21a2d2f740027e2f09b0a906ad00932e7..d9815f233580545e328236708ae64e92ffec9ebf 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.cpp +++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" @@ -267,6 +268,15 @@ void NpuUtils::check_1d(const at::Tensor &t, const char *arg, const char *fn) OPS_ERROR(ErrCode::PARAM)); } +bool NpuUtils::setFilePermissions(int fd, mode_t mode) +{ + if (fchmod(fd, mode) == -1) { + ASCEND_LOGI("Failed to set permissions."); + return false; + } + return true; +} + #ifndef BUILD_LIBTORCH void NpuUtils::ProfReportMarkDataToNpuProfiler(uint32_t category, const std::string &data, uint64_t correlation_id) diff --git a/torch_npu/csrc/framework/utils/NpuUtils.h b/torch_npu/csrc/framework/utils/NpuUtils.h index a891f0d2b4ccf8d35852cc522495930de61920d2..a85dbe1b19988fd20ae7e90a0f2a6fad1d4e776e 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.h +++ b/torch_npu/csrc/framework/utils/NpuUtils.h @@ -46,6 +46,7 @@ public: static bool check_5d_5d_match(const at::Tensor &tensor); static bool IsOomError(aclError ret, int index); static void check_1d(const at::Tensor &t, const char *arg, const char *fn); + static bool setFilePermissions(int fd, mode_t mode); #ifndef BUILD_LIBTORCH static void ProfReportMarkDataToNpuProfiler(uint32_t category, const std::string &data, uint64_t correlation_id = 0); diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp index 069dc4719f050ef5579aa2976eb3931c59027b3d..c48c25786bae7f33753d3d828795d37595178a2b 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.cpp +++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp @@ -96,6 +96,11 @@ aclDataType OpPreparation::convert_to_acl_data_type(const at::ScalarType &data_t return CalcuOpUtil::ConvertToAclDataType(data_type, realDataType); } +at::ScalarType OpPreparation::convert_to_scalar_type(const aclDataType data_type) +{ + return 
CalcuOpUtil::ConvertToScalarType(data_type); +} + at::Tensor OpPreparation::copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type) { return CalcuOpUtil::CopyScalarToDevice(cpu_scalar, scalar_data_type); diff --git a/torch_npu/csrc/framework/utils/OpPreparation.h b/torch_npu/csrc/framework/utils/OpPreparation.h index 74ac30389872e4c0c8cb7da7a1ae3d7c2d4e075c..e87a91011218a4aa55b3f5187523af97ba1226f6 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.h +++ b/torch_npu/csrc/framework/utils/OpPreparation.h @@ -22,6 +22,7 @@ public: // From CalcuOpUtil part static aclDataType convert_to_acl_data_type(const at::ScalarType &data_type); static aclDataType convert_to_acl_data_type(const at::ScalarType &data_type, const std::string &realDataType); + static at::ScalarType convert_to_scalar_type(const aclDataType data_type); static at::Tensor copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type); static at::Tensor copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type, const c10::Device device); diff --git a/torch_npu/csrc/ipc/CMakeLists.txt b/torch_npu/csrc/ipc/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c70da051f6f729c639eeb418daf0d154e6dc239 --- /dev/null +++ b/torch_npu/csrc/ipc/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB _IPC_SRCS *.cpp) + +LIST(APPEND IPC_SRCS ${_IPC_SRCS}) + +# Pass to parent +set(IPC_SRCS ${IPC_SRCS} PARENT_SCOPE) \ No newline at end of file diff --git a/torch_npu/csrc/ipc/NPUIPCTypes.cpp b/torch_npu/csrc/ipc/NPUIPCTypes.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1ff8458c8760e756c719083a0a2eaf89933148fb --- /dev/null +++ b/torch_npu/csrc/ipc/NPUIPCTypes.cpp @@ -0,0 +1,254 @@ +#include +#include +#include +#include +#include +#include +#include "torch_npu/csrc/core/npu/NPUGuard.h" +#include "torch_npu/csrc/ipc/NPUIPCTypes.h" + +#include "third_party/acl/inc/acl/acl_base.h" +#include "third_party/acl/inc/acl/acl_rt.h" + +namespace torch_npu { +namespace ipc { + +namespace { + +void warnProducerTerminatedBeforeSharedTensorsReleased() +{ + static bool warned = false; + if (!warned) { + LOG(WARNING) + << "Producer process has been terminated before all shared NPU tensors released. 
See Note [Sharing NPU tensors]"; + warned = true; + } +} + +struct NpuIPCGlobalEntities { + // This class is used as a singleton (see npu_ipc_global_entities) + // This variable is used to track its lifetime to avoid accessing it + // after it was destroyed which would lead to segmentation faults + // Note that a trvial type is used which doesn't suffer from construction + // and destruction order issues + static bool alive; + + std::mutex ref_counters_mutex_; + std::atomic sync_events_used_{0}; + std::map> + ref_counters_files_; + std::shared_ptr next_available_ref_counters_file_; + NpuIPCSentDataLimbo NpuIPCSentDataLimbo_; + + NpuIPCGlobalEntities() + { + alive = true; + } + + ~NpuIPCGlobalEntities() + { + NpuIPCSentDataLimbo_.collect(); + safe_clean_current_file(); + if (next_available_ref_counters_file_) { + warnProducerTerminatedBeforeSharedTensorsReleased(); + } + alive = false; + } + + void safe_clean_current_file() + { + std::lock_guard lock(ref_counters_mutex_); + if (next_available_ref_counters_file_ && + next_available_ref_counters_file_->offsets_in_use() == 0) { + ref_counters_files_.erase(next_available_ref_counters_file_->handle()); + next_available_ref_counters_file_.reset(); + } + } +}; + +bool NpuIPCGlobalEntities::alive = false; +NpuIPCGlobalEntities npu_ipc_global_entities; + +NpuIPCSentDataLimbo::~NpuIPCSentDataLimbo() +{ + collect(); + if (size() > 0) { + warnProducerTerminatedBeforeSharedTensorsReleased(); + } +} + +bool NpuIPCSentDataLimbo::collect() +{ + bool freed_memory = false; + std::vector> reset_blocks; + { + // Begin critical section to modify shared blocks + std::lock_guard lock(limbo_mutex_); + std::vector> kept_blocks; + for (auto& sd : shared_blocks_) { + if (sd->counter_value() > 0) { + kept_blocks.push_back(std::move(sd)); + } else { + freed_memory = true; + reset_blocks.push_back(std::move(sd)); + } + } + shared_blocks_ = std::move(kept_blocks); + } + // Need to reset blocks out of the critical section here, otherwise it + // deadlocks. + for (auto& sd : reset_blocks) { + sd.reset(); + } + return freed_memory; +} + +void NpuIPCSentDataLimbo::add(std::unique_ptr shared_block) +{ + std::lock_guard lock(limbo_mutex_); + static bool warned = false; + if (shared_blocks_.size() > NPU_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO && + !warned) { + LOG(WARNING) + << "Producer process tried to deallocate over " + << NPU_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO + << " memory blocks referred by consumer processes. Deallocation might be significantly slowed down. 
" + << "We assume it will never going to be the case."; + warned = true; + } + shared_blocks_.push_back(std::move(shared_block)); +} + +uint64_t NpuIPCSentDataLimbo::size() +{ + std::lock_guard lock(limbo_mutex_); + return shared_blocks_.size(); +} + +void NpuIPCSentDataDelete(void* ptr) +{ + std::unique_ptr sent_data( + static_cast(ptr)); + if (!NpuIPCGlobalEntities::alive) { + return; + } + if (sent_data->counter_value() > 0) { + npu_ipc_global_entities.NpuIPCSentDataLimbo_.add(std::move(sent_data)); + } + npu_ipc_global_entities.NpuIPCSentDataLimbo_.collect(); +} + +void ReturnRefCounter(const std::string& handle, uint64_t offset /* unused */) +{ + if (!NpuIPCGlobalEntities::alive) { + return; + } + std::lock_guard lock( + npu_ipc_global_entities.ref_counters_mutex_); + auto& map = npu_ipc_global_entities.ref_counters_files_; + auto it = map.find(handle); + if (it != map.end()) { + it->second->return_offset(offset); + if (it->second->offsets_in_use() == 0 && !it->second->have_offsets()) { + map.erase(handle); + } + } +} + +} // namespace + +NpuIPCSentData::NpuIPCSentData( + std::string handle, + uint64_t offset, + uint64_t* counter_ptr, + at::Device device) + : handle_(std::move(handle)), + offset_(offset), + counter_ptr_(counter_ptr), + device_(device) +{ + if (npu_ipc_global_entities.sync_events_used_.load() < + NPU_IPC_MAXIMUM_EVENTS_TO_USE) { + // NPU does not suppurt event_sync in IPC now. + } else { + auto stream = c10_npu::getCurrentNPUStream(device.index()); + c10_npu::stream_synchronize(stream); + event_ = nullptr; + event_sync_required_ = false; + } +} + +NpuIPCSentData::~NpuIPCSentData() +{ + ReturnRefCounter(handle_, offset_); + try { + if (event_sync_required_) { + // NPU does not suppurt event_sync in IPC now. + } + } catch (...) { /* No throw */ + } +} + +uint64_t NpuIPCSentData::counter_value() +{ + return *counter_ptr_; +} + +at::DataPtr GetNewRefCountedSentData(void* data, at::Device device) +{ + { + std::lock_guard lock( + npu_ipc_global_entities.ref_counters_mutex_); + if (!npu_ipc_global_entities.next_available_ref_counters_file_) { + std::string ref_counter_handle = at::NewProcessWideShmHandle(); + + int flags = + at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_EXCLUSIVE; + at::DataPtr sptr = at::RefcountedMapAllocator::makeDataPtr( + ref_counter_handle.c_str(), + flags, + sizeof(int64_t) * NPU_IPC_REF_COUNTER_FILE_SIZE, + nullptr); + auto rc = std::make_shared( + ref_counter_handle, NPU_IPC_REF_COUNTER_FILE_SIZE, std::move(sptr)); + npu_ipc_global_entities.ref_counters_files_[ref_counter_handle] = rc; + npu_ipc_global_entities.next_available_ref_counters_file_ = rc; + } + } + npu_ipc_global_entities.next_available_ref_counters_file_->set_counter(1); + auto sent_data = new NpuIPCSentData( + npu_ipc_global_entities.next_available_ref_counters_file_->handle(), + npu_ipc_global_entities.next_available_ref_counters_file_->get_offset(), + npu_ipc_global_entities.next_available_ref_counters_file_->counter_ptr(), + device); + + npu_ipc_global_entities.next_available_ref_counters_file_->rotate_offset(); + if (!npu_ipc_global_entities.next_available_ref_counters_file_ + ->have_offsets()) { + npu_ipc_global_entities.next_available_ref_counters_file_.reset(); + } + return at::DataPtr(data, sent_data, NpuIPCSentDataDelete, device); +} + +bool NpuIPCCollect() +{ + if (!NpuIPCGlobalEntities::alive) { + return true; + } + bool freed_memory = npu_ipc_global_entities.NpuIPCSentDataLimbo_.collect(); + if (npu_ipc_global_entities.NpuIPCSentDataLimbo_.size() == 0) { + 
npu_ipc_global_entities.safe_clean_current_file(); + } + return freed_memory; +} + +} // namespace ipc +} // namespace torch_npu + +namespace c10_npu { +namespace NPUCachingAllocator { + +REGISTER_FREE_MEMORY_CALLBACK("npu_ipc_collect", NpuIPCCollectCallback); + +} // namespace NPUCachingAllocator +} // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/ipc/NPUIPCTypes.h b/torch_npu/csrc/ipc/NPUIPCTypes.h new file mode 100644 index 0000000000000000000000000000000000000000..5156af2da429aae306f886e7366cd46a82376667 --- /dev/null +++ b/torch_npu/csrc/ipc/NPUIPCTypes.h @@ -0,0 +1,150 @@ +#pragma once +#include + +#include "torch_npu/csrc/core/npu/NPUMacros.h" +#include "torch_npu/csrc/core/npu/NPUFunctions.h" +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" + +namespace torch_npu { +namespace ipc { + +TORCH_NPU_API bool NpuIPCCollect(); + +struct NpuIPCReceivedData final { + NpuIPCReceivedData() = default; + explicit NpuIPCReceivedData(std::shared_ptr shared_ptr) + : shared_ptr_(std::move(shared_ptr)) {} + std::shared_ptr shared_ptr_; +}; + +struct NpuIPCSentData final { + std::string handle_; + uint64_t offset_; + uint64_t* counter_ptr_; // Reference counter shared memory block + at::DataPtr original_ptr_; // Original mem allocation + char* event_; // Sync event + bool event_sync_required_; + at::Device device_; + + NpuIPCSentData( + std::string handle, + uint64_t offset, + uint64_t* counter_ptr, + at::Device device); + ~NpuIPCSentData(); + + uint64_t counter_value(); + std::string handle() + { + return handle_; + } + uint64_t offset() + { + return offset_; + } + void set_original_ptr(at::DataPtr data_ptr) + { + original_ptr_ = std::move(data_ptr); + } +}; + +TORCH_NPU_API at::DataPtr GetNewRefCountedSentData( + void* data, + at::Device device); + +namespace { + +inline constexpr int64_t NPU_IPC_REF_COUNTER_FILE_SIZE = 10000; +inline constexpr int64_t NPU_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO = 1000; +inline constexpr int64_t NPU_IPC_MAXIMUM_EVENTS_TO_USE = 0; + +// All to be deleted data blocks with non zero reference counter goes there +struct NpuIPCSentDataLimbo final { + ~NpuIPCSentDataLimbo(); + bool collect(); + void add(std::unique_ptr shared_block); + uint64_t size(); + +private: + std::vector> shared_blocks_; + std::mutex limbo_mutex_; +}; + +struct NpuIPCRefCountersFile final { + NpuIPCRefCountersFile( + std::string handle, + uint64_t size, + at::DataPtr data_ptr) + : size_(size), + handle_(std::move(handle)), + refcounted_shared_mem_(std::move(data_ptr)) {} + + uint64_t* counter_ptr() + { + return static_cast(refcounted_shared_mem_.get()) + next_offset_; + } + + void set_counter(uint64_t value) + { + *counter_ptr() = value; + } + + bool have_offsets() + { + return next_offset_ < size_; + } + + bool offsets_in_use() + { + return used_slots_; + } + + uint64_t get_offset() + { + return next_offset_; + } + + void rotate_offset() + { + next_offset_++; + used_slots_++; + } + + void return_offset(uint64_t offset /* unused */) + { + used_slots_--; + } + + std::string handle() + { + return handle_; + } + +private: + uint64_t next_offset_{0}; + uint64_t size_; + uint64_t used_slots_{0}; + std::string handle_; + at::DataPtr refcounted_shared_mem_; +}; + +} // namespace +} // namespace ipc +} // namespace torch_npu + +namespace c10_npu { +namespace NPUCachingAllocator { +namespace { + +class NpuIPCCollectCallback : public FreeMemoryCallback { +public: + bool Execute() override + { + return 
torch_npu::ipc::NpuIPCCollect();
+    }
+};
+
+} // namespace
+} // namespace NPUCachingAllocator
+} // namespace c10_npu
\ No newline at end of file
diff --git a/torch_npu/csrc/ipc/StorageSharing.cpp b/torch_npu/csrc/ipc/StorageSharing.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..18fdd4c5e0722bcde2133239e3ccf9c0f9ad6ba0
--- /dev/null
+++ b/torch_npu/csrc/ipc/StorageSharing.cpp
@@ -0,0 +1,309 @@
+#ifndef BUILD_LIBTORCH
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "torch_npu/csrc/core/NPUStorageImpl.h"
+#include "torch_npu/csrc/core/NPUBridge.h"
+#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h"
+#include "torch_npu/csrc/core/npu/NPUGuard.h"
+#include "torch_npu/csrc/core/NPUStorageImpl.h"
+#include "torch_npu/csrc/framework/FormatHelper.h"
+
+#include "torch_npu/csrc/ipc/NPUIPCTypes.h"
+#include "torch_npu/csrc/ipc/StorageSharing.h"
+
+#include "third_party/acl/inc/acl/acl_base.h"
+#include "third_party/acl/inc/acl/acl_rt.h"
+
+namespace torch_npu {
+namespace reductions {
+
+static PyObject* THNPStorage_shareNpu(PyObject* self, PyObject* args)
+{
+    HANDLE_TH_ERRORS
+    const auto& storage = THPStorage_Unpack(args);
+    TORCH_CHECK(
+        storage.device_type() == at::DeviceType::PrivateUse1,
+        "_share_npu_: only available on NPU.", PTA_ERROR(ErrCode::PARAM));
+    c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl();
+
+    auto npu_storage_impl = static_cast<torch_npu::NPUStorageImpl*>(storage.unsafeGetStorageImpl());
+    auto format = npu_storage_impl->npu_desc_.npu_format_;
+    TORCH_CHECK(at_npu::native::FormatHelper::IsBaseFormatType(format),
+        "Trying to share a storage without base format",
+        PTA_ERROR(ErrCode::TYPE));
+
+    if (storage_impl->received_cuda()) {
+        AT_ERROR(
+            "Attempted to send NPU tensor received from another process; this is not currently supported. Consider cloning before sending.");
+    }
+
+    at::DeviceGuard device_guard(storage.device());
+    THPObjectPtr tuple(PyTuple_New(8));
+    THPObjectPtr device(THPUtils_packInt32(storage.device().index()));
+    THPObjectPtr _handle(Py_None);
+    Py_INCREF(Py_None);
+    THPObjectPtr size_bytes(THPUtils_packUInt64(storage.nbytes()));
+    THPObjectPtr _offset_bytes(THPUtils_packInt32(0));
+    THPObjectPtr _ref_counter(Py_None);
+    Py_INCREF(Py_None);
+    THPObjectPtr _ref_counter_offset(THPUtils_packInt32(0));
+    THPObjectPtr _event_handle(Py_None);
+    Py_INCREF(Py_None);
+    THPObjectPtr _event_sync_required(Py_None);
+    Py_INCREF(Py_None);
+    if (storage.data()) {
+        auto shandle = c10_npu::NPUCachingAllocator::shareIpcHandle(storage.mutable_data());
+        _handle = PyBytes_FromStringAndSize(
+            shandle.handle.c_str(), (Py_ssize_t)shandle.handle.size());
+        _offset_bytes = PyLong_FromSsize_t((Py_ssize_t)shandle.offset);
+
+        at::DataPtr sent_data_ptr = torch_npu::ipc::GetNewRefCountedSentData(
+            storage.mutable_data(), storage.device());
+        auto old_data_ptr = storage.set_data_ptr(std::move(sent_data_ptr));
+        auto sent_data =
+            static_cast<torch_npu::ipc::NpuIPCSentData*>(storage.data_ptr().get_context());
+        sent_data->set_original_ptr(std::move(old_data_ptr));
+        _ref_counter = PyBytes_FromString((sent_data->handle()).c_str());
+        _ref_counter_offset = THPUtils_packUInt64(sent_data->offset());
+
+        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+        aclrtNotify ipc_event_handle;
+
+        if (sent_data->event_sync_required_) {
+            // NPU does not support event_sync in IPC now.
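            // NPU_IPC_MAXIMUM_EVENTS_TO_USE is 0, so NpuIPCSentData's constructor always takes the
            // stream-synchronize fallback and leaves event_sync_required_ false; this branch is
            // currently dead, and the event handle packed below appears to exist only to keep the
            // 8-item tuple layout aligned with the CUDA sharing protocol.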
+ } + + _event_handle = PyBytes_FromStringAndSize( + (char*)&ipc_event_handle, sizeof(aclrtNotify)); + _event_sync_required = PyBool_FromLong(sent_data->event_sync_required_); + } + + if (!tuple || !device || !_handle || !size_bytes || !_offset_bytes || + !_event_handle) { + return nullptr; + } + PyTuple_SET_ITEM(tuple.get(), 0, device.release()); + PyTuple_SET_ITEM(tuple.get(), 1, _handle.release()); + // Size(in bytes) of the real storage, note this is not the size of basePtr + // memory block. + PyTuple_SET_ITEM(tuple.get(), 2, size_bytes.release()); + // Offset(in bytes) of the real storage in the basePtr memory block. + // NB: this offset MUST be in bytes instead of numel, since we use + // (storage_handle, offset) + // as key in shared_cache(multiprocessing/reduction.py). + // Offset in numel cannot uniquely represent a storage. + PyTuple_SET_ITEM(tuple.get(), 3, _offset_bytes.release()); + PyTuple_SET_ITEM(tuple.get(), 4, _ref_counter.release()); + PyTuple_SET_ITEM(tuple.get(), 5, _ref_counter_offset.release()); + PyTuple_SET_ITEM(tuple.get(), 6, _event_handle.release()); + PyTuple_SET_ITEM(tuple.get(), 7, _event_sync_required.release()); + return tuple.release(); + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPStorage_releaseIPCCounter(PyObject* _unused, PyObject* args) +{ + HANDLE_TH_ERRORS + TORCH_CHECK(PyTuple_GET_SIZE(args) == 2, "tuple of 2 items expected", PTA_ERROR(ErrCode::PARAM)); + + PyObject* _ref_counter = PyTuple_GET_ITEM(args, 0); + PyObject* _ref_counter_offset = PyTuple_GET_ITEM(args, 1); + if (!(PyBytes_Check(_ref_counter) && THPUtils_checkLong(_ref_counter_offset))) { + THPUtils_invalidArguments( + args, + nullptr, + "_release_ipc_counter in NPU mode", + 1, + "(bytes _ref_counter, int _ref_counter_offset)"); + return nullptr; + } + std::string ref_counter_handle = PyBytes_AS_STRING(_ref_counter); + ptrdiff_t ref_counter_offset = + (ptrdiff_t)THPUtils_unpackLong(_ref_counter_offset); + // We don't want to break existing code, so resource deletion is best + // effort basis. Exception expected if producer process terminated + // before consumer released data. 
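    // Re-open the producer's ref-counter file (ALLOCATOR_MAPPED_NOCREATE, so it must already
    // exist) and decrement the slot at ref_counter_offset; if the producer exited and removed
    // the file first, makeDataPtr throws c10::Error and the decrement is skipped.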
+ int flags = at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_NOCREATE; + try { + auto sptr = at::RefcountedMapAllocator::makeDataPtr( + ref_counter_handle.c_str(), + flags, + sizeof(int64_t) * torch_npu::ipc::NPU_IPC_REF_COUNTER_FILE_SIZE, + nullptr); + *(static_cast(sptr.get()) + ref_counter_offset) -= 1; + } catch (c10::Error& err) { + // Already warned inside of producer process + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static std::string THNPStorage_bytesAsHandleString(PyObject* handle) +{ + HANDLE_TH_ERRORS + char* buffer = nullptr; + Py_ssize_t handle_size = 0; + if (PyBytes_AsStringAndSize(handle, &buffer, &handle_size) == -1) { + TORCH_CHECK(handle_size == ACL_IPC_HANDLE_SIZE, "incorrect handle", PTA_ERROR(ErrCode::PARAM)); + } + return std::string(buffer, handle_size); + END_HANDLE_TH_ERRORS_RET("") +} + +static PyObject* THNPStorage_newSharedNpu(PyObject* _unused, PyObject* args) +{ + HANDLE_TH_ERRORS + TORCH_CHECK(PyTuple_GET_SIZE(args) == 8, "tuple of 8 items expected", PTA_ERROR(ErrCode::PARAM)); + PyObject* _device = PyTuple_GET_ITEM(args, 0); + PyObject* _handle = PyTuple_GET_ITEM(args, 1); + PyObject* _size_bytes = PyTuple_GET_ITEM(args, 2); + PyObject* _offset_bytes = PyTuple_GET_ITEM(args, 3); + PyObject* _ref_counter = PyTuple_GET_ITEM(args, 4); + PyObject* _ref_counter_offset = PyTuple_GET_ITEM(args, 5); + PyObject* _event_handle = PyTuple_GET_ITEM(args, 6); + PyObject* _event_sync_required = PyTuple_GET_ITEM(args, 7); + if (!(THPUtils_checkLong(_device) && THPUtils_checkLong(_size_bytes) && + PyBytes_Check(_handle) && PyBytes_Check(_ref_counter) && + PyBytes_Check(_event_handle) && THPUtils_checkLong(_offset_bytes) && + THPUtils_checkLong(_ref_counter_offset) && + PyBool_Check(_event_sync_required))) { + THPUtils_invalidArguments( + args, + nullptr, + "_new_shared in NPU mode", + 1, + "(int device, bytes handle, int storage_size_bytes, int storage_offset_bytes, bytes _ref_counter, int _ref_counter_offset, bytes event_handle, bool event_sync_required)"); + return nullptr; + } + + size_t storage_size = + (size_t)THPUtils_unpackLong(_size_bytes) / sizeof(uint8_t); + ptrdiff_t storage_offset_bytes = + (ptrdiff_t)THPUtils_unpackLong(_offset_bytes); + + const auto device = c10::checked_convert( + THPUtils_unpackLong(_device), "c10::DeviceIndex"); + c10_npu::NPUGuard device_guard(device); + + if (PyObject_IsTrue(_event_sync_required)) { + // TO BE DONE + } + + std::string s_handle = THNPStorage_bytesAsHandleString(_handle); + if (s_handle.empty()) { + return nullptr; + } + std::shared_ptr basePtr = + c10_npu::NPUCachingAllocator::getIpcDevPtr(s_handle); + + // Offset the basePtr to reconstruct the real storage + // devPtr = basePtr + storage_offset + void* devPtr = basePtr.get(); + devPtr = (char*)devPtr + storage_offset_bytes; + + std::string ref_counter_handle = PyBytes_AS_STRING(_ref_counter); + ptrdiff_t ref_counter_offset = + (ptrdiff_t)THPUtils_unpackLong(_ref_counter_offset); + + struct IpcDeleterContext { + std::string ref_counter_handle; + ptrdiff_t ref_counter_offset; + int64_t device; + torch_npu::ipc::NpuIPCReceivedData received_data; + }; + + auto ctx = std::make_unique(); + ctx->ref_counter_handle = std::move(ref_counter_handle); + ctx->ref_counter_offset = ref_counter_offset; + ctx->device = device; + ctx->received_data.shared_ptr_ = std::move(basePtr); + + auto cur_device = c10_npu::current_device(); + c10::DataPtr data_ptr( + devPtr, + ctx.release(), + +[](void* ctx_) { + std::unique_ptr ctx( + static_cast(ctx_)); + + 
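            // Consumer-side deleter: drop the reference to the mapped IPC base allocation, then
            // synchronize the consumer's current stream so queued work on the shared buffer has
            // finished before the producer's ref-counter slot is decremented below.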
ctx->received_data.shared_ptr_.reset(); + + try { + c10_npu::stream_synchronize( + c10_npu::getCurrentNPUStream(ctx->device)); + } catch (c10::Error& err) { + // Already warned inside of producer process + } + + int flags = + at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_NOCREATE; + try { + auto sptr = at::RefcountedMapAllocator::makeDataPtr( + ctx->ref_counter_handle.c_str(), + flags, + sizeof(int64_t) * torch_npu::ipc::NPU_IPC_REF_COUNTER_FILE_SIZE, + nullptr); + *(static_cast(sptr.get()) + ctx->ref_counter_offset) -= 1; + } catch (c10::Error& err) { + // Already warned inside of producer process + } + }, + at::Device(at::DeviceType::PrivateUse1, cur_device)); + + c10::intrusive_ptr base = c10::make_intrusive( + c10::StorageImpl::use_byte_size_t(), + storage_size, + std::move(data_ptr), + nullptr, + false); + + base->set_resizable(false); + base->set_received_cuda(true); + + return THPStorage_NewWithStorage( + THPStorageClass, + std::move(base), + c10::impl::PyInterpreterStatus::TAGGED_BY_US); + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPStorage_isShared(PyObject* self, PyObject* arg) +{ + const auto& storage = THPStorage_Unpack(self); + if (storage.device_type() == at::kPrivateUse1) { + Py_RETURN_TRUE; + } + if (at::MapAllocator::fromDataPtr(storage.data_ptr()) || + THManagedMapAllocator::fromDataPtr(storage.data_ptr())) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } +} + +static struct PyMethodDef TorchReductionsMethods[] = { + {"_share_npu_", THNPStorage_shareNpu, METH_O, nullptr}, + {"_release_ipc_counter_npu", THNPStorage_releaseIPCCounter, METH_VARARGS, nullptr}, + {"_new_shared_npu", THNPStorage_newSharedNpu, METH_VARARGS, nullptr}, + {"_is_shared", THNPStorage_isShared, METH_O, nullptr}, + {nullptr, nullptr, 0, nullptr}, +}; + +PyMethodDef* reductions_functions() +{ + return TorchReductionsMethods; +} + +} // namespace reductions +} // namespace torch_npu + +#endif \ No newline at end of file diff --git a/torch_npu/csrc/ipc/StorageSharing.h b/torch_npu/csrc/ipc/StorageSharing.h new file mode 100644 index 0000000000000000000000000000000000000000..a38e0c0ad68248ecf542a65e5d3f5bc14cff5903 --- /dev/null +++ b/torch_npu/csrc/ipc/StorageSharing.h @@ -0,0 +1,15 @@ +#ifndef BUILD_LIBTORCH +#pragma once + +#include +#include "torch_npu/csrc/core/npu/NPUMacros.h" + +namespace torch_npu { +namespace reductions { + +TORCH_NPU_API PyMethodDef* reductions_functions(); + +} // namespace reductions +} // namespace torch_npu + +#endif \ No newline at end of file diff --git a/torch_npu/csrc/logging/Logger.cpp b/torch_npu/csrc/logging/Logger.cpp index eaab8bc004e588f27ad9f8f022dc3c6d72ff7611..a527b4b4f2aa0491d0f2d3f74030ec8ba48be81e 100644 --- a/torch_npu/csrc/logging/Logger.cpp +++ b/torch_npu/csrc/logging/Logger.cpp @@ -8,6 +8,8 @@ #include "torch_npu/csrc/core/npu/register/OptionsManager.h" namespace npu_logging { +static const int BASE_PRINT_LIMIT = 1024; +static const int LONG_PRINT_LIMIT = 4096; static std::unordered_map LoggingLevelNames = { {LoggingLevel::DEBUG, "DEBUG"}, @@ -37,9 +39,8 @@ std::string Logger::getQName() return qname_; } -void Logger::log(LoggingLevel level, const char* format, va_list args) +void Logger::log(LoggingLevel level, const int log_buffer_size, const char* format, va_list args) { - const int log_buffer_size = 1024; char buffer[log_buffer_size] = {0}; int ret = vsnprintf(buffer, log_buffer_size, format, args); @@ -75,7 +76,7 @@ void Logger::debug(const char* format, ...) 
} va_list args; va_start(args, format); - log(LoggingLevel::DEBUG, format, args); + log(LoggingLevel::DEBUG, BASE_PRINT_LIMIT, format, args); va_end(args); } @@ -86,7 +87,7 @@ void Logger::info(const char* format, ...) } va_list args; va_start(args, format); - log(LoggingLevel::INFO, format, args); + log(LoggingLevel::INFO, BASE_PRINT_LIMIT, format, args); va_end(args); } @@ -97,7 +98,7 @@ void Logger::warn(const char* format, ...) } va_list args; va_start(args, format); - log(LoggingLevel::WARNING, format, args); + log(LoggingLevel::WARNING, BASE_PRINT_LIMIT, format, args); va_end(args); } @@ -108,7 +109,7 @@ void Logger::error(const char* format, ...) } va_list args; va_start(args, format); - log(LoggingLevel::ERROR, format, args); + log(LoggingLevel::ERROR, BASE_PRINT_LIMIT, format, args); va_end(args); } @@ -119,7 +120,62 @@ void Logger::critical(const char* format, ...) } va_list args; va_start(args, format); - log(LoggingLevel::CRITICAL, format, args); + log(LoggingLevel::CRITICAL, BASE_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_debug(const char* format, ...) +{ + if (allow_level_ > LoggingLevel::DEBUG) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::DEBUG, LONG_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_info(const char* format, ...) +{ + if (allow_level_ > LoggingLevel::INFO) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::INFO, LONG_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_warn(const char* format, ...) +{ + if (allow_level_ > LoggingLevel::WARNING) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::WARNING, LONG_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_error(const char* format, ...) +{ + if (allow_level_ > LoggingLevel::ERROR) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::ERROR, LONG_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_critical(const char* format, ...) 
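// The long_* variants mirror debug()/info()/warn()/error()/critical() but format into the
// 4096-byte LONG_PRINT_LIMIT buffer rather than the 1024-byte BASE_PRINT_LIMIT, so longer
// messages are not truncated by vsnprintf.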
+{ + if (allow_level_ > LoggingLevel::CRITICAL) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::CRITICAL, LONG_PRINT_LIMIT, format, args); va_end(args); } diff --git a/torch_npu/csrc/logging/Logger.h b/torch_npu/csrc/logging/Logger.h index 1734a7c7bebbf574860c4675bee52ec039ce3d16..7e76af5013e564cad671323e0010493e1f96d5b1 100644 --- a/torch_npu/csrc/logging/Logger.h +++ b/torch_npu/csrc/logging/Logger.h @@ -29,9 +29,14 @@ public: void warn(const char* format, ...); void error(const char* format, ...); void critical(const char* format, ...); + void long_debug(const char* format, ...); + void long_info(const char* format, ...); + void long_warn(const char* format, ...); + void long_error(const char* format, ...); + void long_critical(const char* format, ...); private: - void log(LoggingLevel level, const char* format, va_list args); + void log(LoggingLevel level, const int log_buffer_size, const char* format, va_list args); LoggingLevel allow_level_ = LoggingLevel::WARNING; std::string name_; diff --git a/torch_npu/csrc/npu/DataParallelComm.cpp b/torch_npu/csrc/npu/DataParallelComm.cpp index db0d3efabefc96ca39c8bcaad354ed07b159bd38..c744e1e1baf961dbfa42de031c4c371c9be22672 100644 --- a/torch_npu/csrc/npu/DataParallelComm.cpp +++ b/torch_npu/csrc/npu/DataParallelComm.cpp @@ -137,7 +137,7 @@ void check_inputs(TensorList inputs, TensorList outputs, int input_multiplier, i { // need to check len(inputs) == len(outputs) size_t len = inputs.size(); - if (len <= 0) { + if (len == 0) { throw std::runtime_error("input sequence can't be empty" + PTA_ERROR(ErrCode::PARAM)); } diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index df89f0a8a803473204d20677258d140e19c8b5ca..040e4754678597ae89ba1776919184cca6d058a6 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -27,6 +27,8 @@ #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/NPUQueue.h" #include "torch_npu/csrc/core/npu/NPUAffinityController.h" +#include "torch_npu/csrc/core/npu/NPUPeerToPeerAccess.h" +#include "torch_npu/csrc/core/npu/NPUIPCPidManager.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -275,6 +277,24 @@ void RegisterNpuPluggableAllocator(PyObject* module) std::function func = reinterpret_cast(func_ptr); self.set_erase_stream_fn(func); + }) + .def( + "set_get_device_stats_fn", + [](torch::npu::NPUPluggableAllocator::NPUPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType=c10_npu::NPUCachingAllocator::DeviceStats(int); + std::function func = + reinterpret_cast(func_ptr); + self.set_get_device_stats_fn(func); + }) + .def( + "set_reset_peak_status_fn", + [](torch::npu::NPUPluggableAllocator::NPUPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType = void(int); + std::function func = + reinterpret_cast(func_ptr); + self.set_reset_peak_status_fn(func); }); m.def( @@ -879,7 +899,13 @@ PyObject *THNPModule_is_jit_compile_false_wrap(PyObject *self, PyObject *noargs) if (option_value.has_value() && (option_value.value() == "disable")) { Py_RETURN_TRUE; } else { - Py_RETURN_FALSE; + static const std::string jit_compile_init_option_name = "jitCompileInit"; + auto init_option_value = c10_npu::option::GetOption(jit_compile_init_option_name); + if (init_option_value.has_value() && (init_option_value.value() == "disable")) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } } END_HANDLE_TH_ERRORS } @@ 
-1213,7 +1239,8 @@ PyObject* THNPModule_npuCachingAllocator_raw_alloc(PyObject *_unused, PyObject * END_HANDLE_TH_ERRORS } -PyObject* THNPModule_npuCachingAllocator_raw_delete(PyObject *_unused, PyObject *obj) { +PyObject* THNPModule_npuCachingAllocator_raw_delete(PyObject *_unused, PyObject *obj) +{ HANDLE_TH_ERRORS void* mem_ptr = PyLong_AsVoidPtr(obj); c10_npu::NPUCachingAllocator::raw_delete(mem_ptr); @@ -1265,7 +1292,8 @@ PyObject* THNPModule_npuUnlockMutex(PyObject *module, PyObject *noargs) Py_RETURN_NONE; } -PyObject* THNPModule_initDump(PyObject* _unused, PyObject* noargs) { +PyObject* THNPModule_initDump(PyObject* _unused, PyObject* noargs) +{ HANDLE_TH_ERRORS pybind11::gil_scoped_release no_gil; NPU_CHECK_ERROR_WITHOUT_UCE(aclmdlInitDump()); @@ -1553,6 +1581,15 @@ PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* args) } else { c10_npu::SetThreadAffinity(core_start, core_end); } + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THNPModule_npu_reset_thread_affinity(PyObject* self, PyObject* noargs) +{ + HANDLE_TH_ERRORS + c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); Py_RETURN_NONE; END_HANDLE_TH_ERRORS } @@ -1626,6 +1663,87 @@ static PyObject* THNPModule_is_gte_cann_version(PyObject* self, PyObject *args) END_HANDLE_TH_ERRORS } +static PyObject* THNPModule_add_ipc_pid(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + int pid; + if (!PyArg_ParseTuple(args, "i", &pid)) { + throw torch::TypeError("Pybind failed to parse parameters." + PTA_ERROR(ErrCode::TYPE)); + } + torch_npu::ipc::addPid(pid); + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPModule_get_ipc_pid(PyObject* self, PyObject *noargs) +{ + HANDLE_TH_ERRORS + int32_t pid; + NPU_CHECK_ERROR(c10_npu::acl::AclrtDeviceGetBareTgid(&pid)); + return THPUtils_packInt32(pid); + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPModule_add_p2p_access(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + int src_dev; + int dst_dev; + if (!PyArg_ParseTuple(args, "ii", &src_dev, &dst_dev)) { + throw torch::TypeError("Pybind failed to parse parameters." + PTA_ERROR(ErrCode::TYPE)); + } + bool warning_flag = false; + at_npu::native::NpuP2pCtrl::get_instance().get_p2p_access(src_dev, dst_dev, warning_flag); + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPModule_set_device_res_limit(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + PyObject* device = nullptr; + PyObject* type = nullptr; + PyObject* value = nullptr; + + if (!PyArg_ParseTuple(args, "OOO", &device, &type, &value)) { + throw torch::TypeError("Pybind failed to parse parameters." + + PTA_ERROR(ErrCode::TYPE)); + } + int32_t device_ = THPUtils_unpackLong(device); + int32_t type_ = THPUtils_unpackLong(type); + uint32_t value_ = static_cast(THPUtils_unpackUInt32(value)); + c10_npu::SetDeviceResLimit(device_, type_, value_); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPModule_get_device_res_limit(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + PyObject* device = nullptr; + PyObject* type = nullptr; + + if (!PyArg_ParseTuple(args, "OO", &device, &type)) { + throw torch::TypeError("Pybind failed to parse parameters." 
+ + PTA_ERROR(ErrCode::TYPE)); + } + int32_t device_ = THPUtils_unpackLong(device); + int32_t type_ = THPUtils_unpackLong(type); + uint32_t value = c10_npu::GetDeviceResLimit(device_, type_); + return PyLong_FromUnsignedLong(value); + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPModule_reset_device_res_limit(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + int32_t device = THPUtils_unpackLong(args); + c10_npu::ResetDeviceResLimit(device); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} static struct PyMethodDef THNPModule_methods[] = { {"_npu_init", (PyCFunction)THNPModule_initExtension, METH_NOARGS, nullptr}, @@ -1681,12 +1799,19 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_set_module_train_state", (PyCFunction)THNPModule_npu_set_module_train_state, METH_O, nullptr}, {"_get_silent_check_version", (PyCFunction)THNPModule_npu_get_silent_check_version, METH_NOARGS, nullptr}, {"_npu_set_thread_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_VARARGS, nullptr}, + {"_npu_reset_thread_affinity", (PyCFunction)THNPModule_npu_reset_thread_affinity, METH_NOARGS, nullptr}, {"_npu_set_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_set_fft_plan_cache_max_size, METH_VARARGS, nullptr}, {"_npu_get_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_max_size, METH_NOARGS, nullptr}, {"_npu_get_fft_plan_cache_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_size, METH_NOARGS, nullptr}, {"_npu_clear_fft_plan_cache", (PyCFunction)THNPModule_npu_clear_fft_plan_cache, METH_NOARGS, nullptr}, {"_get_cann_version", (PyCFunction)THNPModule_get_cann_version, METH_O, nullptr}, {"_is_gte_cann_version", (PyCFunction)THNPModule_is_gte_cann_version, METH_VARARGS, nullptr}, + {"_add_ipc_pid", (PyCFunction)THNPModule_add_ipc_pid, METH_VARARGS, nullptr}, + {"_get_ipc_pid", (PyCFunction)THNPModule_get_ipc_pid, METH_NOARGS, nullptr}, + {"_add_p2p_access", (PyCFunction)THNPModule_add_p2p_access, METH_VARARGS, nullptr}, + {"_npu_get_device_res_limit", (PyCFunction)THNPModule_get_device_res_limit, METH_VARARGS, nullptr}, + {"_npu_set_device_res_limit", (PyCFunction)THNPModule_set_device_res_limit, METH_VARARGS, nullptr}, + {"_npu_reset_device_res_limit", (PyCFunction)THNPModule_reset_device_res_limit, METH_O, nullptr}, {nullptr}}; TORCH_NPU_API PyMethodDef* THNPModule_get_methods() diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp index e8e0fd3eeffaebecc6d11e73de73e49f13af7668..14ea0ce7e73dbe0b18c255b8678c3a23ad44c5bc 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp @@ -74,6 +74,18 @@ void NPUPluggableAllocator::set_erase_stream_fn( erase_stream_fn_ = std::move(erase_stream_fn); } +void NPUPluggableAllocator::set_get_device_stats_fn( + std::function get_device_stats_fn) +{ + get_device_stats_fn_ = std::move(get_device_stats_fn); +} + +void NPUPluggableAllocator::set_reset_peak_status_fn( + std::function reset_peak_status_fn) +{ + reset_peak_status_fn_ = std::move(reset_peak_status_fn); +} + void* NPUPluggableAllocator::malloc( size_t size, int device, @@ -212,8 +224,11 @@ void NPUPluggableAllocator::eraseStream( c10_npu::NPUCachingAllocator::DeviceStats NPUPluggableAllocator::getDeviceStats(int device) { - TORCH_NPU_WARN("NPUPluggableAllocator does not yet support getDeviceStats. 
" - "If you need it, please file an issue describing your use case."); + if (get_device_stats_fn_) { + return get_device_stats_fn_(device); + } else { + TORCH_NPU_WARN("get_device_stats_fn_ is not define, please set by set_get_device_stats_fn"); + } } void NPUPluggableAllocator::resetAccumulatedStats(int device) @@ -224,8 +239,11 @@ void NPUPluggableAllocator::resetAccumulatedStats(int device) void NPUPluggableAllocator::resetPeakStats(int device) { - TORCH_NPU_WARN("NPUPluggableAllocator does not yet support resetPeakStats. " - "If you need it, please file an issue describing your use case."); + if (reset_peak_status_fn_) { + reset_peak_status_fn_(device); + } else { + TORCH_NPU_WARN("reset_peak_status_fn_ is not define, please set by set_reset_peak_status_fn"); + } } c10_npu::NPUCachingAllocator::SnapshotInfo NPUPluggableAllocator::snapshot() @@ -282,6 +300,24 @@ void NPUPluggableAllocator::copy_data(void* dest, const void* src, std::size_t c { default_copy_data(dest, src, count); } + +std::shared_ptr NPUPluggableAllocator::getIpcDevPtr(std::string handle) +{ + TORCH_NPU_WARN( + "NPUPluggableAllocator does not yet support getIpcDevPtr. " + "If you need it, please file an issue describing your use case."); + auto sp = std::shared_ptr(); + return sp; +} + +c10_npu::NPUCachingAllocator::ShareableHandle NPUPluggableAllocator::shareIpcHandle(void* ptr) +{ + TORCH_NPU_WARN( + "NPUPluggableAllocator does not yet support shareIPcHandle. " + "If you need it, please file an issue describing your use case."); + return c10_npu::NPUCachingAllocator::ShareableHandle{0, nullptr}; +} + void NPUPluggableAllocator::recordHistory( bool enabled, c10_npu::NPUCachingAllocator::CreateContextFn context_recorder, diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h index 3a71319f3c7c4f79bd208206f1543947e64b9b1e..a3691d48eefbaf3743f5ce29a304a0dab3560151 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.h +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h @@ -45,6 +45,8 @@ struct NPUPluggableAllocator std::function record_stream_fn); void set_erase_stream_fn( std::function erase_stream_fn); + void set_get_device_stats_fn(std::function get_device_stats_fn); + void set_reset_peak_status_fn(std::function reset_peak_status_fn); void* malloc(size_t size, int device, aclrtStream stream); c10::DataPtr allocate(size_t size) override; @@ -81,6 +83,8 @@ struct NPUPluggableAllocator void FreeDeviceCachedMemory(int device) override; std::string name() override; void copy_data(void* dest, const void* src, std::size_t count) const final; + std::shared_ptr getIpcDevPtr(std::string handle) override; + c10_npu::NPUCachingAllocator::ShareableHandle shareIpcHandle(void*) override; void recordHistory( bool enabled, c10_npu::NPUCachingAllocator::CreateContextFn context_recorder, @@ -108,6 +112,8 @@ protected: std::function base_alloc_fn_; std::function record_stream_fn_; std::function erase_stream_fn_; + std::function get_device_stats_fn_; + std::function reset_peak_status_fn_; std::mutex allocator_mutex_; // We do the bookeeping here in order to simplify custom allocators std::unordered_map allocation_metadata_; diff --git a/torch_npu/csrc/npu/memory_snapshot.cpp b/torch_npu/csrc/npu/memory_snapshot.cpp index 47fbf4de6cf5916a4713f9dde961e80fc89c8f74..cc893243a76fc8dd05d60b13e78fe429d7435dcf 100644 --- a/torch_npu/csrc/npu/memory_snapshot.cpp +++ b/torch_npu/csrc/npu/memory_snapshot.cpp @@ -16,7 +16,11 @@ namespace torch_npu { std::shared_ptr gather() { +#if defined(__x86_64__) return 
torch::CapturedTraceback::gather(true, true, false); +#else + return torch_npu::CapturedTraceback::gather(true, true, false); +#endif } std::shared_ptr gather_with_cpp() diff --git a/torch_npu/csrc/profiler/profiler_python.cpp b/torch_npu/csrc/profiler/profiler_python.cpp index 45ccf8f1b28155e984ad5d17f6e00acb4f18771c..571fb57bbce2c18e24ac6cd26aaedcd656968bd1 100644 --- a/torch_npu/csrc/profiler/profiler_python.cpp +++ b/torch_npu/csrc/profiler/profiler_python.cpp @@ -36,19 +36,6 @@ using TensorMetadata = torch_npu::toolkit::profiler::TensorMetadata; using ModuleParam = torch_npu::toolkit::profiler::ModuleParam; using OptimizerParam = torch_npu::toolkit::profiler::OptimizerParam; -std::string trimPrefix(std::string s) -{ - static std::vector prefixes = py::module::import("torch.profiler.python_tracer") - .attr("_prefix_regex")().cast>(); - for (const auto& p : prefixes) { - if (s.compare(0, p.size(), p) == 0) { - s.erase(0, p.size()); - return s; - } - } - return s; -} - std::vector getInterpreterThreads(PyInterpreterState* interpreter) { pybind11::gil_scoped_acquire gil; @@ -240,6 +227,7 @@ private: void reportTraceData(); void reportHashData(); void reportParamData(); + std::string trimPrefix(std::string s); private: std::atomic active_{false}; @@ -248,6 +236,7 @@ private: std::deque thread_local_results_; PyObject* module_call_code_{nullptr}; PyObject* optimizer_call_code_{nullptr}; + std::vector func_name_prefixes_; std::unordered_map py_call_cache_; std::unordered_map pyc_call_cache_; std::unordered_map module_info_cache_; @@ -277,6 +266,9 @@ PythonTracer::PythonTracer() : active_(false) .attr("_optimizer_step_code") .attr("__code__") .ptr(); + func_name_prefixes_ = py::module::import("torch.profiler.python_tracer") + .attr("_prefix_regex")() + .cast>(); } void PythonTracer::start(size_t max_threads) @@ -383,6 +375,17 @@ void PythonTracer::clear() interpreter_ = nullptr; } +std::string PythonTracer::trimPrefix(std::string s) +{ + for (const auto& p : func_name_prefixes_) { + if (s.compare(0, p.size(), p) == 0) { + s.erase(0, p.size()); + return s; + } + } + return s; +} + void PythonTracer::reportTraceData() { if (events_.size() > 0) { @@ -402,7 +405,7 @@ void PythonTracer::reportHashData() hash_data.resize(py_call_cache_.size() + pyc_call_cache_.size() + module_info_cache_.size() + 1); size_t idx = 0; for (auto& item : py_call_cache_) { - hash_data[idx++] = std::make_pair(item.first, trimPrefix(item.second.get_name())); + hash_data[idx++] = std::make_pair(item.first, trimPrefix(std::move(item.second.get_name()))); } for (auto& item : pyc_call_cache_) { hash_data[idx++] = std::make_pair(item.first, std::string(item.second.str())); diff --git a/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h b/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h index 8d2629a2a6f859f227313cd84742db563b4c8859..b8065251c54c08dab47b63ba825b439be7fa4a5a 100644 --- a/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h +++ b/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h @@ -344,8 +344,8 @@ struct MemoryData : BaseReportData { uint64_t thread_id{ 0 }; uint64_t process_id{ 0 }; MemoryData(int64_t ptr, int64_t time_ns, int64_t alloc_size, int64_t total_allocated, int64_t total_reserved, - int64_t total_active, int64_t stream_ptr, int8_t device_type, int8_t device_index, uint8_t data_type, - uint8_t component_type, uint8_t allocator_type, uint64_t thread_id, uint64_t process_id) + int64_t total_active, int64_t stream_ptr, int8_t device_type, int8_t device_index, uint8_t component_type, + uint8_t 
data_type, uint8_t allocator_type, uint64_t thread_id, uint64_t process_id) : BaseReportData(0, "torch.memory_usage"), ptr(ptr), time_ns(time_ns), diff --git a/torch_npu/csrc/utils/TensorType.cpp b/torch_npu/csrc/utils/TensorType.cpp index aeb6fd8b832e96d39fc8b8cd0724ddbdcf9125b9..e6998f57eaad6ac9368366847593d66c8d56658e 100644 --- a/torch_npu/csrc/utils/TensorType.cpp +++ b/torch_npu/csrc/utils/TensorType.cpp @@ -6,7 +6,6 @@ namespace torch_npu { namespace utils { - using namespace at; using namespace torch::autograd; @@ -15,14 +14,13 @@ std::vector> all_declared_types_npu() std::vector> ret; // can't easily iterate over enum classes, does not support BFloat16 now std::vector backends = { c10::Backend::PrivateUse1 }; - std::vector scalar_types = { - ScalarType::Byte, ScalarType::Char, ScalarType::Double, ScalarType::Float, - ScalarType::Int, ScalarType::Long, ScalarType::Short, ScalarType::Half, - ScalarType::Bool, ScalarType::BFloat16 - }; - - for (auto& backend : backends) { - for (auto& scalar_type : scalar_types) { + std::vector scalar_types = { ScalarType::Byte, ScalarType::Char, ScalarType::Double, + ScalarType::Float, ScalarType::Int, ScalarType::Long, + ScalarType::Short, ScalarType::Half, ScalarType::Bool, + ScalarType::BFloat16 }; + + for (auto &backend : backends) { + for (auto &scalar_type : scalar_types) { ret.emplace_back(std::make_pair(backend, scalar_type)); } } @@ -32,8 +30,8 @@ std::vector> all_declared_types_npu() struct PyTensorType { PyTypeObject py_type; - THPDtype* dtype; - THPLayout* layout; + THPDtype *dtype; + THPLayout *layout; bool is_npu; char name[64]; int backend; @@ -57,73 +55,67 @@ struct PyTensorType { static_assert(std::is_standard_layout::value, "PyTensorType must be standard layout"); -static void py_bind_tensor_types(const std::vector& tensor_types); +static void py_bind_tensor_types(const std::vector &tensor_types); -static PyObject* Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) +static PyObject *Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) { HANDLE_TH_ERRORS - auto& tensor_type = *((PyTensorType*)type); + auto &tensor_type = *((PyTensorType *)type); if (tensor_type.is_npu) { - TORCH_NPU_WARN_ONCE( - "Warning: The torch.npu.*DtypeTensor constructors are no longer recommended. " + TORCH_NPU_WARN_ONCE("Warning: The torch.npu.*DtypeTensor constructors are no longer recommended. " "It's best to use methods such as torch.tensor(data, dtype=*, device='npu') " "to create tensors."); } - TORCH_CHECK_TYPE( - !tensor_type.is_npu || c10_npu::device_count() != 0, - "type ", - tensor_type.name, + TORCH_CHECK_TYPE(!tensor_type.is_npu || c10_npu::device_count() != 0, "type ", tensor_type.name, " not available. 
Torch not compiled with npu enabled.", PTA_ERROR(ErrCode::TYPE)) torch_npu::utils::npu_lazy_init(); - return THPVariable_Wrap(torch::utils::legacy_tensor_ctor(tensor_type.get_dispatch_key(), - tensor_type.get_scalar_type(), - args, - kwargs)); + return THPVariable_Wrap( + torch::utils::legacy_tensor_ctor(tensor_type.get_dispatch_key(), tensor_type.get_scalar_type(), args, kwargs)); END_HANDLE_TH_ERRORS } -static PyObject* Tensor_instancecheck(PyObject* _self, PyObject* arg) +static PyObject *Tensor_instancecheck(PyObject *_self, PyObject *arg) { - HANDLE_TH_ERRORS - auto self = (PyTensorType*)_self; - if (THPVariable_Check(arg)) { - const auto& var = THPVariable_Unpack(arg); - - if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() && - var.scalar_type() == static_cast(self->scalar_type)) { - Py_RETURN_TRUE; + HANDLE_TH_ERRORS + auto self = (PyTensorType *)_self; + if (THPVariable_Check(arg)) { + const auto &var = THPVariable_Unpack(arg); + + if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() && + var.scalar_type() == static_cast(self->scalar_type)) { + Py_RETURN_TRUE; + } } - } - Py_RETURN_FALSE; - END_HANDLE_TH_ERRORS + Py_RETURN_FALSE; + END_HANDLE_TH_ERRORS } -PyObject* Tensor_dtype(PyTensorType* self, void *unused) +PyObject *Tensor_dtype(PyTensorType *self, void *unused) { - return torch::autograd::utils::wrap(self->dtype); + return torch::autograd::utils::wrap(self->dtype); } -PyObject* Tensor_layout(PyTensorType* self, void *unused) +PyObject *Tensor_layout(PyTensorType *self, void *unused) { - return torch::autograd::utils::wrap(self->layout); + return torch::autograd::utils::wrap(self->layout); } -PyObject* Tensor_is_npu(PyTensorType* self, void *unused) +PyObject *Tensor_is_npu(PyTensorType *self, void *unused) { - if (self->is_npu) { - Py_RETURN_TRUE; - } else { - Py_RETURN_FALSE; - } + if (self->is_npu) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } } -PyObject* Tensor_is_sparse(PyTensorType *self, void *unused) +PyObject *Tensor_is_sparse(PyTensorType *self, void *unused) { - if (self->layout->layout == at::Layout::Strided) { - Py_RETURN_FALSE; - } else { - Py_RETURN_TRUE; - } + if (self->layout->layout == at::Layout::Strided) { + Py_RETURN_FALSE; + } else { + Py_RETURN_TRUE; + } } static struct PyMethodDef metaclass_methods[] = { @@ -131,7 +123,7 @@ static struct PyMethodDef metaclass_methods[] = { {nullptr} }; -using getter = PyObject* (*)(PyObject *, void *); +using getter = PyObject *(*)(PyObject *, void *); static struct PyGetSetDef metaclass_properties[] = { {"dtype", (getter)Tensor_dtype, nullptr, nullptr, nullptr}, @@ -142,46 +134,44 @@ static struct PyGetSetDef metaclass_properties[] = { }; static PyTypeObject metaclass = { - PyVarObject_HEAD_INIT(nullptr, 0) - "torch.tensortype", /* tp_name */ - sizeof(PyTypeObject) /* tp_basicsize */ + PyVarObject_HEAD_INIT(nullptr, 0) "torch.tensortype", /* tp_name */ + sizeof(PyTypeObject) /* tp_basicsize */ }; -static void py_initialize_metaclass(PyTypeObject& metaclass) +static void py_initialize_metaclass(PyTypeObject &metaclass) { - metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; - metaclass.tp_methods = metaclass_methods; - metaclass.tp_getset = metaclass_properties; - metaclass.tp_base = &PyType_Type; - if (PyType_Ready(&metaclass) < 0) { - throw python_error(); - } + metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; + metaclass.tp_methods = metaclass_methods; + metaclass.tp_getset = metaclass_properties; + metaclass.tp_base = &PyType_Type; + if 
(PyType_Ready(&metaclass) < 0) { + throw python_error(); + } } static PyTypeObject tensor_type_prototype = { - PyVarObject_HEAD_INIT(&metaclass, 0) - nullptr, /* tp_name */ - sizeof(PyTensorType) /* tp_basicsize */ + PyVarObject_HEAD_INIT(&metaclass, 0) nullptr, /* tp_name */ + sizeof(PyTensorType) /* tp_basicsize */ }; -static void py_initialize_tensor_type(PyTypeObject& type, const char* name, PyObject* tp_dict) +static void py_initialize_tensor_type(PyTypeObject &type, const char *name, PyObject *tp_dict) { - // NOTE: we don't use the typical static declaration of PyTypeObject because - // we need to initialize as many types as there are VariableType instances. - // We copy the basic object fields from a prototype definition and initialize - // the remaining fields below. - memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); - // Subclassing from torch.Tensor isn't supported. - // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. - type.tp_flags = Py_TPFLAGS_DEFAULT; - type.tp_name = name; - type.tp_new = Tensor_new; - if (PyType_Ready(&type) < 0) { - throw python_error(); - } - if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { - throw python_error(); - } + // NOTE: we don't use the typical static declaration of PyTypeObject because + // we need to initialize as many types as there are VariableType instances. + // We copy the basic object fields from a prototype definition and initialize + // the remaining fields below. + memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); + // Subclassing from torch.Tensor isn't supported. + // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. + type.tp_flags = Py_TPFLAGS_DEFAULT; + type.tp_name = name; + type.tp_new = Tensor_new; + if (PyType_Ready(&type) < 0) { + throw python_error(); + } + if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { + throw python_error(); + } } static std::string get_module(Backend backend) @@ -204,103 +194,103 @@ static std::string get_module(Backend backend) static std::string get_name(Backend backend, ScalarType scalarType) { - std::ostringstream ss; - ss << get_module(backend) << "." << toString(scalarType) << "Tensor"; - return ss.str(); + std::ostringstream ss; + ss << get_module(backend) << "." 
<< toString(scalarType) << "Tensor"; + return ss.str(); } -static void set_type(PyTensorType& type_obj, Backend backend, ScalarType scalarType) +static void set_type(PyTensorType &type_obj, Backend backend, ScalarType scalarType) { - // This field is lazily initialized from backend and scalar_type - type_obj.backend = static_cast(backend); - type_obj.scalar_type = static_cast(scalarType); - type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); - type_obj.dtype = torch::getTHPDtype(scalarType); - type_obj.is_npu = (backend == c10::Backend::PrivateUse1); + // This field is lazily initialized from backend and scalar_type + type_obj.backend = static_cast(backend); + type_obj.scalar_type = static_cast(scalarType); + type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); + type_obj.dtype = torch::getTHPDtype(scalarType); + type_obj.is_npu = (backend == c10::Backend::PrivateUse1); } -static void set_name(PyTensorType& type_obj, const std::string& name) +static void set_name(PyTensorType &type_obj, const std::string &name) { - size_t n = sizeof(type_obj.name); - strncpy(type_obj.name, name.c_str(), n); - type_obj.name[n - 1] = '\0'; + size_t n = sizeof(type_obj.name); + strncpy(type_obj.name, name.c_str(), n); + type_obj.name[n - 1] = '\0'; } static THPObjectPtr get_tensor_dict() { - auto torch = THPObjectPtr(PyImport_ImportModule("torch")); - if (!torch) { - throw python_error(); - } - - auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); - if (!tensor_class) { - throw python_error(); - } - - auto tensor_type = (PyTypeObject*)tensor_class.get(); - TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); - - auto res = THPObjectPtr(PyDict_New()); - if (!res) { - throw python_error(); - } - - if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { - throw python_error(); - } - if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { - throw python_error(); - } - - return res; + auto torch = THPObjectPtr(PyImport_ImportModule("torch")); + if (!torch) { + throw python_error(); + } + + auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); + if (!tensor_class) { + throw python_error(); + } + + auto tensor_type = (PyTypeObject *)tensor_class.get(); + TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); + + auto res = THPObjectPtr(PyDict_New()); + if (!res) { + throw python_error(); + } + + if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { + throw python_error(); + } + if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { + throw python_error(); + } + + return res; } static std::vector tensor_types; -static void initialize_npu_aten_types(std::vector& tensor_types) +static void initialize_npu_aten_types(std::vector &tensor_types) { - // only initialize npu types - auto declared_types = all_declared_types_npu(); - tensor_types.resize(declared_types.size()); - - for (size_t i = 0, end = declared_types.size(); i != end; i++) { - auto& tensor_type = tensor_types[i]; - Backend backend = declared_types[i].first; - ScalarType scalar_type = declared_types[i].second; - set_type(tensor_type, backend, scalar_type); - set_name(tensor_type, get_name(backend, scalar_type)); - } + // only initialize npu types + auto declared_types = all_declared_types_npu(); + tensor_types.resize(declared_types.size()); + + for (size_t i = 0, end = declared_types.size(); i != end; i++) { + auto &tensor_type = tensor_types[i]; + Backend 
backend = declared_types[i].first; + ScalarType scalar_type = declared_types[i].second; + set_type(tensor_type, backend, scalar_type); + set_name(tensor_type, get_name(backend, scalar_type)); + } } void _initialize_python_bindings() { - // Initialize the at::Type* pointers, name, and properties of the PyTensorType - // vector. After this call, the vector must not be resized. - initialize_npu_aten_types(tensor_types); - - // Initialize the Python metaclass for the torch.FloatTensor, etc. types. - // The metaclass handles __instancecheck__ checks and binds the dtype property - // on the type objects. - py_initialize_metaclass(metaclass); - - // Get the tp_dict of the Variable class. We copy function definitions - // onto each Tensor type object so that they can be accessed via e.g. - // `torch.npu.FloatTensor.add`. - auto tensor_dict = get_tensor_dict(); - - // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. - for (auto& tensor_type : tensor_types) { - py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); - } - - // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor - // is added to the `torch_npu` module as `FloatTensor`. Also add all the type - // objects to the set torch_npu._tensor_classes. - py_bind_tensor_types(tensor_types); + // Initialize the at::Type* pointers, name, and properties of the PyTensorType + // vector. After this call, the vector must not be resized. + initialize_npu_aten_types(tensor_types); + + // Initialize the Python metaclass for the torch.FloatTensor, etc. types. + // The metaclass handles __instancecheck__ checks and binds the dtype property + // on the type objects. + py_initialize_metaclass(metaclass); + + // Get the tp_dict of the Variable class. We copy function definitions + // onto each Tensor type object so that they can be accessed via e.g. + // `torch.npu.FloatTensor.add`. + auto tensor_dict = get_tensor_dict(); + + // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. + for (auto &tensor_type : tensor_types) { + py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); + } + + // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor + // is added to the `torch_npu` module as `FloatTensor`. Also add all the type + // objects to the set torch_npu._tensor_classes. + py_bind_tensor_types(tensor_types); } -static void py_bind_tensor_types(const std::vector& tensor_types) +static void py_bind_tensor_types(const std::vector &tensor_types) { auto torch_module = THPObjectPtr(PyImport_ImportModule("torch")); if (!torch_module) { @@ -312,7 +302,7 @@ static void py_bind_tensor_types(const std::vector& tensor_types) throw python_error(); } - for (auto& tensor_type : tensor_types) { + for (auto &tensor_type : tensor_types) { auto name = std::string(tensor_type.name); auto idx = name.rfind('.'); auto type_name = name.substr(idx + 1); @@ -323,7 +313,7 @@ static void py_bind_tensor_types(const std::vector& tensor_types) throw python_error(); } - PyObject* type_obj = (PyObject*)&tensor_type; + PyObject *type_obj = (PyObject *)&tensor_type; Py_INCREF(type_obj); if (PyModule_AddObject(module_obj.get(), type_name.c_str(), type_obj) < 0) { throw python_error(); @@ -335,12 +325,12 @@ static void py_bind_tensor_types(const std::vector& tensor_types) } // Callback for python part. 
Used for additional initialization of python classes -static PyObject* THPModule_initExtension(PyObject *_unused, PyObject *noargs) +static PyObject *THPModule_initExtension(PyObject *_unused, PyObject *noargs) { - HANDLE_TH_ERRORS - _initialize_python_bindings(); - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS + HANDLE_TH_ERRORS + _initialize_python_bindings(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS } // autograd methods on torch._C @@ -349,9 +339,9 @@ static PyMethodDef TorchNpuExtensionMethods[] = { {nullptr, nullptr, 0, nullptr} }; -PyMethodDef* npu_extension_functions() +PyMethodDef *npu_extension_functions() { - return TorchNpuExtensionMethods; + return TorchNpuExtensionMethods; } } } diff --git a/torch_npu/multiprocessing/reductions.py b/torch_npu/multiprocessing/reductions.py new file mode 100644 index 0000000000000000000000000000000000000000..cc40949f7933337eaf6a441b688d1e941849ffa2 --- /dev/null +++ b/torch_npu/multiprocessing/reductions.py @@ -0,0 +1,178 @@ +__all__ = ["rebuild_npu_tensor"] + +import multiprocessing +import torch +from torch.multiprocessing.reductions import ( + shared_cache, + rebuild_storage_filename, + rebuild_storage_empty, + rebuild_storage_fd, + StorageWeakRef, + fd_id, + rebuild_tensor, + storage_from_cache, +) + +import torch_npu + + +def rebuild_npu_tensor( + tensor_cls, + tensor_size, + tensor_stride, + tensor_offset, + storage_cls, + dtype, + storage_device, + storage_handle, + storage_size_bytes, + storage_offset_bytes, + requires_grad, + ref_counter_handle, + ref_counter_offset, + event_handle, + event_sync_required, +): + # If storage_handle is None, storage points to nullptr. + if storage_handle is None or storage_size_bytes == 0: + storage = storage_cls(0, dtype=dtype, device=storage_device, _internal=True) + else: + storage = storage_from_cache( + storage_cls, (storage_handle, storage_offset_bytes) + ) + if storage is None: + torch_npu.npu._lazy_init() + storage = storage_cls._new_shared_npu( + storage_device, + storage_handle, + storage_size_bytes, + storage_offset_bytes, + ref_counter_handle, + ref_counter_offset, + event_handle, + event_sync_required, + ) + shared_cache[(storage_handle, storage_offset_bytes)] = StorageWeakRef( + storage + ) + else: + # We already ref counting this Storage, but producer needs new ref-counters to be released. + storage_cls._release_ipc_counter_npu( + ref_counter_handle, ref_counter_offset, device=storage_device + ) + + _storage = ( + storage + if isinstance(storage, torch.UntypedStorage) + else storage._untyped_storage + ) + + t = torch._utils._rebuild_tensor( + torch.storage.TypedStorage(wrap_storage=_storage, dtype=dtype, _internal=True), + tensor_offset, + tensor_size, + tensor_stride, + ) + + if tensor_cls == torch.nn.parameter.Parameter: + # It is crucial for integer tensors to receive + # the requires_grad=False as an argument in the constructor + t = torch.nn.parameter.Parameter(t, requires_grad=requires_grad) + else: + t.requires_grad = requires_grad + + return t + + +def _npu_reduce_tensor(tensor): + storage = tensor._typed_storage() + + if tensor.requires_grad and not tensor.is_leaf: + raise RuntimeError( + "Cowardly refusing to serialize non-leaf tensor which requires_grad, " + "since autograd does not support crossing process boundaries. " + "If you just want to transfer the data, call detach() on the tensor " + "before serializing (e.g., putting it on the queue)." 
+ ) + + torch._namedtensor_internals.check_serializing_named_tensor(tensor) + torch.utils.hooks.warn_if_has_hooks(tensor) + + if storage._untyped_storage.device.type == "npu": + ( + device, + handle, + storage_size_bytes, + storage_offset_bytes, + ref_counter_handle, + ref_counter_offset, + event_handle, + event_sync_required, + ) = storage._share_npu_() + tensor_offset = tensor.storage_offset() + shared_cache[handle] = StorageWeakRef(storage) + return ( + rebuild_npu_tensor, + ( + type(tensor), + tensor.size(), + tensor.stride(), + tensor_offset, # tensor offset in its storage + type(storage), + tensor.dtype, + device, + handle, # identifier which NPU allocation is the storage in. + storage_size_bytes, # size(in bytes) of the storage + storage_offset_bytes, # offset(in bytes) of the storage in the NPU allocation + tensor.requires_grad, + ref_counter_handle, + ref_counter_offset, + event_handle, + event_sync_required, + ), + ) + + # _backward_hooks purposely omitted here, see Note [Don't serialize hooks] + metadata = ( + tensor.storage_offset(), + tensor.size(), + tensor.stride(), + tensor.requires_grad, + ) + return (rebuild_tensor, (type(tensor), storage, metadata)) + + +def _npu_reduce_storage(storage): + from torch.multiprocessing import get_sharing_strategy + + if storage.is_npu: + raise RuntimeError( + "Cannot pickle NPU storage; try pickling a NPU tensor instead" + ) + elif get_sharing_strategy() == "file_system": + metadata = storage._share_filename_cpu_() + cache_key = metadata[1] + rebuild = rebuild_storage_filename + if isinstance(storage, torch.TypedStorage): + metadata += (storage.dtype,) + storage._shared_incref() + elif storage.size() == 0: + # This is special cased because Empty tensors + # (with size 0) cannot be mmapped. + return (rebuild_storage_empty, (type(storage),)) + else: + fd, size = storage._share_fd_cpu_() + df = multiprocessing.reduction.DupFd(fd) + cache_key = fd_id(fd) + metadata = (df, size) + rebuild = rebuild_storage_fd # type: ignore[assignment] + + shared_cache[cache_key] = StorageWeakRef(storage) + return (rebuild, (type(storage),) + metadata) + + +def _add_reductions_methods(): + torch.multiprocessing.reductions.reduce_tensor = _npu_reduce_tensor + torch.multiprocessing.reductions.reduce_storage = _npu_reduce_storage + + torch.multiprocessing.reductions.init_reductions() \ No newline at end of file diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index ba883da8fc25a809a10eaad2c1e292db538e46b0..182859d8a5aefc290a702d41fdd36cc33631c72c 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -98,7 +98,6 @@ __all__ = [ "stop_device", "restart_device", "check_uce_in_memory", - "get_uce_addr", "config", "matmul", "conv", @@ -115,7 +114,9 @@ __all__ = [ "graph_task_group_begin", "graph_task_group_end", "graph_task_update_begin", - "graph_task_update_end" + "graph_task_update_end", + "set_device_limit", + "get_device_limit" ] from typing import Tuple, Union, List, cast, Optional @@ -135,7 +136,7 @@ from .utils import (synchronize, can_device_access_peer, set_device, current_dev device, device_of, StreamContext, stream, set_stream, current_stream, default_stream, set_sync_debug_mode, get_sync_debug_mode, init_dump, current_blas_handle, is_bf16_supported, utilization, finalize_dump, set_dump, get_npu_overflow_flag, clear_npu_overflow_flag, mem_get_info, - check_uce_in_memory, stress_detect, get_uce_addr) + check_uce_in_memory, stress_detect, _get_uce_addr) from ._recovery import restart_device, stop_device from .streams import 
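Editor's note on the new torch_npu/multiprocessing/reductions.py module above: it mirrors PyTorch's CUDA IPC reducer so NPU tensors can cross process boundaries through shared storage handles. A minimal usage sketch follows; it assumes torch_npu installs _add_reductions_methods() during import (not shown in this hunk), and the device, shape and queue usage are illustrative.

```python
import torch
import torch.multiprocessing as mp
import torch_npu


def consumer(q):
    t = q.get()                      # rebuilt in the child via rebuild_npu_tensor
    print(t.device, t.sum().item())


if __name__ == "__main__":
    ctx = mp.get_context("spawn")    # spawn keeps the device context sane, as with CUDA sharing
    q = ctx.Queue()
    x = torch.ones(4, device="npu")
    q.put(x)                         # pickled through _npu_reduce_tensor -> storage._share_npu_()
    p = ctx.Process(target=consumer, args=(q,))
    p.start()
    p.join()
```

As with the CUDA reducer it imitates, the shared cache and ref-counter handles keep the producer's storage alive, so the consumer rebuilds the tensor from the same device allocation rather than copying it.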
Stream, Event, SyncLaunchStream, ExternalEvent from .mstx import mstx diff --git a/torch_npu/npu/_format.py b/torch_npu/npu/_format.py new file mode 100644 index 0000000000000000000000000000000000000000..beb65e076f74f5537daf4bcc76a58eaae4fdedbd --- /dev/null +++ b/torch_npu/npu/_format.py @@ -0,0 +1,38 @@ +from enum import IntEnum + +import torch +import torch_npu + + +class Format(IntEnum): + """NPU storage format enumeration class""" + UNDEFINED = -1 + NCHW = 0 + NHWC = 1 + ND = 2 + NC1HWC0 = 3 + FRACTAL_Z = 4 + NC1HWC0_C04 = 12 + HWCN = 16 + NDHWC = 27 + FRACTAL_NZ = 29 + NCDHW = 30 + NDC1HWC0 = 32 + FRACTAL_Z_3D = 33 + NC = 35 + NCL = 47 + + def __str__(self): + return self.name + + +def _apply_npu_format_patch(): + orig_get_format = torch_npu.get_npu_format + + def patched_get_format(tensor): + """get the Format type of tensor""" + format_int = orig_get_format(tensor) + return Format(format_int) + + torch_npu.get_npu_format = patched_get_format + torch_npu.Format = Format diff --git a/torch_npu/npu/npu_config.py b/torch_npu/npu/npu_config.py index 2233f7841c4b8866ee18b8f22c289562121779d2..5ca745339f0066eb0583b0a1b02cc48bbe4dbaee 100644 --- a/torch_npu/npu/npu_config.py +++ b/torch_npu/npu/npu_config.py @@ -6,12 +6,14 @@ import torch_npu import torch_npu._C from torch_npu.utils._path_manager import PathManager from torch_npu.utils._error_code import ErrCode, pta_error, prof_error +from .utils import _get_device_index # this file is used to enhance the npu frontend API by set_option or other. __all__ = ["set_option", "set_aoe", "set_compile_mode", "set_mm_bmm_format_nd", "get_mm_bmm_format_nd", - "is_jit_compile_false", "finalize_dump", "init_dump", "set_dump"] + "is_jit_compile_false", "finalize_dump", "init_dump", "set_dump", + "set_device_limit", "get_device_limit"] _option_map = {"ACL_PRECISION_MODE": ["allow_fp32_to_fp16", "must_keep_origin_dtype"], "ACL_OP_SELECT_IMPL_MODE": ["high_performance", "high_precision"], @@ -170,3 +172,42 @@ class _allowHF32Conv: hf32_value = torch_npu._C._npu_getOption("ALLOW_CONV_HF32") return (hf32_value is None) or (hf32_value.decode() == "") or (hf32_value.decode() == "enable") return None + + +class _call_once_class: + def __init__(self, func): + self.func = func + self.called = False + self.result = None + + def __call__(self, *args, **kwargs): + if self.called: + raise RuntimeError(f"Function '{self.func.__name__}' has already been called, \ + You can only set this interface once.") + + self.called = True + self.result = self.func(*args, **kwargs) + return self.result + + +@_call_once_class +def set_device_limit(device, cube_num=-1, vector_num=-1): + from torch_npu.npu import device_count + device_id = _get_device_index(device, optional=True) + if device_id < 0 or device_id >= device_count(): + raise AssertionError("Invalid device id" + pta_error(ErrCode.VALUE)) + torch_npu.npu._lazy_init() + if cube_num != -1: + torch_npu._C._npu_set_device_res_limit(device_id, 0, cube_num) + if vector_num != -1: + torch_npu._C._npu_set_device_res_limit(device_id, 1, vector_num) + + +def get_device_limit(device): + from torch_npu.npu import device_count + device_id = _get_device_index(device, optional=True) + if device_id < 0 or device_id >= device_count(): + raise AssertionError("Invalid device id" + pta_error(ErrCode.VALUE)) + torch_npu.npu._lazy_init() + return {"cube_core_num": torch_npu._C._npu_get_device_res_limit(device_id, 0), \ + "vector_core_num": torch_npu._C._npu_get_device_res_limit(device_id, 1)} \ No newline at end of file diff --git 
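The npu_config.py hunk above adds a per-device core-resource limit API (set_device_limit / get_device_limit) guarded by the _call_once_class decorator. A small usage sketch, with an assumed device "npu:0" and made-up core counts:

```python
import torch_npu

# set_device_limit is wrapped by _call_once_class, so it can be called only once
# per process; a second call raises RuntimeError.
torch_npu.npu.set_device_limit("npu:0", cube_num=8, vector_num=16)

print(torch_npu.npu.get_device_limit("npu:0"))
# e.g. {'cube_core_num': 8, 'vector_core_num': 16}
```

Both helpers call torch_npu.npu._lazy_init() themselves, so per the hunk they can be used before the first tensor is allocated on the device.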
a/torch_npu/npu/utils.py b/torch_npu/npu/utils.py index 069848345b994b93ff6610716eae20455184793b..697504d52d7f82d4d68182d990493bd2856a56c1 100644 --- a/torch_npu/npu/utils.py +++ b/torch_npu/npu/utils.py @@ -17,7 +17,7 @@ __all__ = ["synchronize", "device_count", "can_device_access_peer", "set_device" "stream", "set_stream", "current_stream", "default_stream", "set_sync_debug_mode", "get_sync_debug_mode", "init_dump", "set_dump", "finalize_dump", "is_support_inf_nan", "is_bf16_supported", "get_npu_overflow_flag", "npu_check_overflow", "clear_npu_overflow_flag", "current_blas_handle", - "check_uce_in_memory", "stress_detect", "get_cann_version", "get_uce_addr"] + "check_uce_in_memory", "stress_detect", "get_cann_version"] def get_cann_version(module="CANN"): @@ -446,7 +446,7 @@ def check_uce_in_memory(device_id): return torch_npu._C._npu_check_uce_in_memory(device_id) -def get_uce_addr(): +def _get_uce_addr(): torch_npu.npu._lazy_init() return torch_npu._C._npu_get_uce_addr() diff --git a/torch_npu/onnx/wrapper_onnx_ops.py b/torch_npu/onnx/wrapper_onnx_ops.py index bc97473537d65cc6ab5df816da4192df5dc0edb8..16ae07087e5bd17121ab4519a929e7c564270ab7 100644 --- a/torch_npu/onnx/wrapper_onnx_ops.py +++ b/torch_npu/onnx/wrapper_onnx_ops.py @@ -255,8 +255,8 @@ class _NPUFormatCastOP(torch.autograd.Function): return torch.ops.npu.npu_format_cast(*args, **kwargs) @staticmethod - def symbolic(g, self: Tensor, acl_format: int): - return g.op("npu::NPUFormatCast", self, acl_format_i=acl_format) + def symbolic(g, self: Tensor, acl_format: int, customize_dtype: int = None): + return g.op("npu::NPUFormatCast", self, acl_format_i=acl_format, customize_dtype_i=customize_dtype) class _NPUSoftmaxCrossEntropyWithLogitsOP(torch.autograd.Function): @@ -1042,8 +1042,8 @@ def _wrapper_npu_deformable_conv2d(inputs, weight, offset, bias, kernel_size, st padding, dilation, groups, deformable_groups, modulated) -def _wrapper_npu_format_cast(self, acl_format): - return _NPUFormatCastOP.apply(self, acl_format) +def _wrapper_npu_format_cast(self, acl_format, customize_dtype=None): + return _NPUFormatCastOP.apply(self, acl_format, customize_dtype) def _wrapper_npu_softmax_cross_entropy_with_logits(self, labels): diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py index ab8a5abfe6ab53e8d3f1181c9c02cf0dd41a684b..5da94ae76334e5978f4da11c2313733369b52949 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py @@ -198,8 +198,8 @@ class ConfigContext: op_attr = json_data.get('PROFILE_OP_ATTR', 'false') op_attr = self.BOOL_MAP.get(op_attr.lower(), False) gc_detect_threshold = json_data.get('PROFILE_GC_DETECT_THRESHOLD', None) - if isinstance(gc_detect_threshold, str) and gc_detect_threshold != "None": - gc_detect_threshold = float(gc_detect_threshold) + if isinstance(gc_detect_threshold, str): + gc_detect_threshold = None if gc_detect_threshold == "None" else float(gc_detect_threshold) data_simplification = json_data.get('PROFILE_DATA_SIMPLIFICATION', 'true') data_simplification = self.BOOL_MAP.get(data_simplification.lower(), True) record_op_args = False diff --git a/torch_npu/profiler/_non_intrusive_profile.py b/torch_npu/profiler/_non_intrusive_profile.py index a60303adec0a88258dff4d7e2b3785cd20ecfc7b..c4ce45223b6f16f19aa768f18852f0b7da82e591 100644 --- a/torch_npu/profiler/_non_intrusive_profile.py +++ 
b/torch_npu/profiler/_non_intrusive_profile.py @@ -8,7 +8,7 @@ from ..utils._path_manager import PathManager from ._dynamic_profiler._dynamic_profiler_utils import DynamicProfilerUtils from .dynamic_profile import init as dp_init from .dynamic_profile import step as dp_step -from .analysis.prof_common_func._constant import print_error_msg +from .analysis.prof_common_func._constant import print_error_msg, print_warn_msg __all__ = [ @@ -59,11 +59,19 @@ class _NonIntrusiveProfile: @staticmethod def init(): prof_config_path = os.getenv("PROF_CONFIG_PATH", "") - dyno_enable_flag = os.getenv("KINETO_USE_DAEMON", 0) + kine_to_value = os.getenv("KINETO_USE_DAEMON") + msmonitor_value = os.getenv("MSMONITOR_USE_DAEMON") + + if kine_to_value is not None: + print_warn_msg( + "Environment variable 'KINETO_USE_DAEMON' will be deprecated. " + "Please use 'MSMONITOR_USE_DAEMON' instead." + ) + dyno_enable_flag = msmonitor_value or kine_to_value or 0 try: dyno_enable_flag = int(dyno_enable_flag) except ValueError: - print_error_msg("Environment variable KINETO_USE_DAEMON value not valid, will be set to 0 !") + print_error_msg("Environment variable 'MSMONITOR_USE_DAEMON' value not valid, will be set to 0 !") dyno_enable_flag = 0 if not prof_config_path and dyno_enable_flag != 1: return diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index 56809c9b7f65be2479f7dd1e9d63e068940c1eab..1a62c54d6f6af19bed262b7b5765d192e3fd3c0d 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -1,5 +1,5 @@ import os -from datetime import datetime +import time from typing import Union from torch_npu.utils._error_code import ErrCode, prof_error @@ -217,20 +217,23 @@ class Constant(object): def print_info_msg(message: str): - time_str = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - print(f"{time_str} [INFO] [{os.getpid()}] profiler.py: {message}") + current_time = time.localtime() + time_str = time.strftime("[%Y-%m-%d %H:%M:%S]", current_time) + print(f"{time_str} [INFO] [{os.getpid()}] profiler.py: {message}", flush=True) def print_warn_msg(message: str): if not _should_print_warning(): return - time_str = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - print(f"{time_str} [WARNING] [{os.getpid()}] profiler.py: {message}") + current_time = time.localtime() + time_str = time.strftime("[%Y-%m-%d %H:%M:%S]", current_time) + print(f"{time_str} [WARNING] [{os.getpid()}] profiler.py: {message}", flush=True) def print_error_msg(message: str): - time_str = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - print(f"{time_str} [ERROR] [{os.getpid()}] profiler.py: {message}") + current_time = time.localtime() + time_str = time.strftime("[%Y-%m-%d %H:%M:%S]", current_time) + print(f"{time_str} [ERROR] [{os.getpid()}] profiler.py: {message}", flush=True) def convert_ns2us_float(ns) -> float: diff --git a/torch_npu/profiler/analysis/prof_common_func/_log.py b/torch_npu/profiler/analysis/prof_common_func/_log.py index 15ba7a80f9d10ed74e1e26a4a5be4ab9190b7ef0..0fecde48c41b465cf04eff26282a02911655c032 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_log.py +++ b/torch_npu/profiler/analysis/prof_common_func/_log.py @@ -34,6 +34,7 @@ class ProfilerLogger: BACKUP_COUNT = 3 # logger instance _instance = None + _pid = None @classmethod def get_instance(cls) -> logging.Logger: @@ -54,14 +55,17 @@ class ProfilerLogger: RuntimeError: If logger initialization fails """ if 
cls._instance is not None: - return + if cls._pid == os.getpid(): + return # Create logs directory log_dir = os.path.join(output_dir, cls.DEFAULT_LOG_DIR) PathManager.make_dir_safety(log_dir) # Create logger - logger = logging.getLogger(cls.DEFAULT_LOGGER_NAME) + logger = logging.getLogger( + f"{cls.DEFAULT_LOGGER_NAME}_{custom_name}" if custom_name else cls.DEFAULT_LOGGER_NAME + ) logger.setLevel(cls.DEFAULT_LOG_LEVEL) logger.propagate = False @@ -89,6 +93,7 @@ class ProfilerLogger: logger.addHandler(file_handler) cls._instance = logger + cls._pid = os.getpid() logger.info("Profiler logger initialized at: %s", log_file) @classmethod @@ -106,9 +111,13 @@ class ProfilerLogger: @classmethod def destroy(cls) -> None: - """Close and cleanup the logger.""" + """ + Close and cleanup the logger. + To avoid the deadlock problem caused by directly calling close on handler in multi-process scenarios, + when child process updates instance, the parent process instance obtained by fork does not call this method. + """ if cls._instance: for handler in cls._instance.handlers[:]: - handler.close() cls._instance.removeHandler(handler) + handler.close() cls._instance = None diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py index b5d3797c6fc2e625840ec07a8357690d94186e51..ba29da446eb5c43b8b93ce4d8bea4b9f245da487 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py @@ -74,7 +74,7 @@ class FwkCANNRelationParser: step_id = step_node.event.name.split("#")[-1] if not step_node.corr_id_total: self.logger.error("There is no flow events in %s range.", step_node.event.name) - return [] + continue corr_id_list = sorted(step_node.corr_id_total) min_index, max_index = 0, len(corr_id_list) - 1 min_kernel_list, max_kernel_list = [], [] diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py index aa00324c97c25909dbca2f4efb8a3f97533b216f..b8216a6995895eba8b907d03d90997f6d49c58a9 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py @@ -152,11 +152,14 @@ class FwkFileParser: def get_fwk_trace_data(self): torch_op_data = self.get_file_data_by_tag(FileTag.TORCH_OP) - if not torch_op_data: - self.logger.error("Get fwk trace data failed, the torch op data is empty.") - return [] enqueue_data_list, dequeue_data_list = self.get_task_queue_data() - pid = torch_op_data[0].pid + if torch_op_data: + pid = torch_op_data[0].pid + elif enqueue_data_list or dequeue_data_list: + pid = enqueue_data_list[0].pid if enqueue_data_list else dequeue_data_list[0].pid + else: + self.logger.error("Get fwk trace data failed, framework data is empty.") + return [] tid_dict = {} fwk_x_event_list = [None] * ( len(torch_op_data) + len(enqueue_data_list) * 2 + len(dequeue_data_list) * 2) @@ -247,9 +250,15 @@ class FwkFileParser: def get_fwk_api(self) -> dict: torch_op_data = self.get_file_data_by_tag(FileTag.TORCH_OP) - if not torch_op_data: + enqueue_data_list, dequeue_data_list = self.get_task_queue_data() + if torch_op_data: + pid = torch_op_data[0].pid + elif enqueue_data_list or dequeue_data_list: + pid = enqueue_data_list[0].pid if enqueue_data_list else dequeue_data_list[0].pid + else: + self.logger.error("Get fwk api data failed, framework data is empty.") return {} - pid = 
torch_op_data[0].pid + torch_op_apis = [] fwd_bwd_dict = {} torch_op_idx = 0 @@ -272,13 +281,13 @@ class FwkFileParser: connection_ids = [] task_enqueues = [] task_dequeues = [] - enqueue_data_list, dequeue_data_list = self.get_task_queue_data() correlation_id_name_dict = {} for dequeue_data in dequeue_data_list: task_dequeues.append( [dequeue_data.ts, dequeue_data.ts + dequeue_data.dur, contact_2num(pid, dequeue_data.tid), dequeue_data.corr_id, dequeue_data.name]) correlation_id_name_dict[dequeue_data.corr_id] = dequeue_data.origin_name + torch_tids.add(dequeue_data.tid) for enqueue_data in enqueue_data_list: name = enqueue_data.name if enqueue_data.corr_id in correlation_id_name_dict: @@ -288,6 +297,7 @@ class FwkFileParser: [enqueue_data.ts, enqueue_data.ts + enqueue_data.dur, contact_2num(pid, enqueue_data.tid), enqueue_data.corr_id, name]) connection_ids.append(enqueue_data.corr_id) + torch_tids.add(enqueue_data.tid) start_connection_id = max(connection_ids) + 1 if connection_ids else 0 self.update_fwd_bwd_connection_id(fwd_bwd_dict, torch_op_apis, start_connection_id) diff --git a/torch_npu/profiler/analysis/prof_view/_communication_parser.py b/torch_npu/profiler/analysis/prof_view/_communication_parser.py index fff6d265d6ceb5198681e78956b6268efc732cb9..e07f68b785b31eb509602a99a12760fad476a5f3 100644 --- a/torch_npu/profiler/analysis/prof_view/_communication_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_communication_parser.py @@ -46,8 +46,6 @@ class CommunicationParser(BaseParser): self._root_node = TorchOpNode() self._kernel_dict = {} self.step_list = [] - ProfilerLogger.init(self._profiler_path, "CommunicationParser") - self.logger = ProfilerLogger.get_instance() @staticmethod def combine_size_distribution(op_dict: dict, total_dict: dict): @@ -63,6 +61,8 @@ class CommunicationParser(BaseParser): return round(dividend / divisor, 4) def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "CommunicationParser") + self.logger = ProfilerLogger.get_instance() try: self._init_step_list(deps_data) self.generate_view() diff --git a/torch_npu/profiler/analysis/prof_view/_integrate_parser.py b/torch_npu/profiler/analysis/prof_view/_integrate_parser.py index b6c545420c3bb961640c7ef25dc54e8050fad6ae..28472a241177ed4f8f13c7b090e02a98db1113c2 100644 --- a/torch_npu/profiler/analysis/prof_view/_integrate_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_integrate_parser.py @@ -26,10 +26,10 @@ class IntegrateParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - ProfilerLogger.init(self._profiler_path, "IntegrateParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "IntegrateParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) self.generate_view() diff --git a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py index 30ffd8be8ba46e0b8cc5ac1300c4eba389211eaa..ded9a612c6cfd98a7076fb749457e0c3da9aa44c 100644 --- a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py @@ -17,8 +17,6 @@ class KernelViewParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self.step_range = [] - ProfilerLogger.init(self._profiler_path, "KernelViewParser") - self.logger = ProfilerLogger.get_instance() @classmethod def 
_project_map_for_headers(cls, input_headers: list): @@ -35,6 +33,8 @@ class KernelViewParser(BaseParser): return output_headers def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "KernelViewParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) self._init_step_range(deps_data) diff --git a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py index 04ef7c0e90da3b1ee494785ea540d22e0a07052a..47255efd09dbdca635e4888fd575f311fbcff5ef 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py @@ -34,8 +34,6 @@ class MemoryViewParser(BaseParser): self.ge_record_list = [] self.memory_data = [] self.component_list = [] - ProfilerLogger.init(self._profiler_path, "MemoryViewParser") - self.logger = ProfilerLogger.get_instance() @staticmethod def _get_data_from_file(file_set: set, file_type_bean: any, bean_list: bool = False) -> list: @@ -73,6 +71,8 @@ class MemoryViewParser(BaseParser): return [cur_record_list, pta_ge_record_list] def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "MemoryViewParser") + self.logger = ProfilerLogger.get_instance() try: self.memory_data = deps_data.get(Constant.MEMORY_PREPARE, {}).get("memory_data", {}).get(Constant.Text, []) self.pta_record_list = deps_data.get(Constant.MEMORY_PREPARE, {}).get("pta_record_list", []) @@ -109,7 +109,8 @@ class MemoryViewParser(BaseParser): if ge_record.time_ns >= pta_record.time_ns: self.size_record_list.extend(self._combine_record(last_ge_record, pta_record)) pta_ptr += 1 - last_pta_record = pta_record + if hasattr(pta_record, 'component_type') and pta_record.component_type != Constant.WORKSPACE_TYPE: + last_pta_record = pta_record else: self.size_record_list.extend(self._combine_record(last_pta_record, ge_record)) ge_ptr += 1 diff --git a/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py b/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py index f87e8dc8b85e7f35097afd2666194f7cd0311b68..7c10e9d4bf45c2881fb8bd04ae3c2b1124f578c5 100644 --- a/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py @@ -22,10 +22,10 @@ class OperatorViewParser(BaseParser): self._torch_op_node = [] self._root_node = None self._kernel_dict = {} - ProfilerLogger.init(self._profiler_path, "OperatorViewParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "OperatorViewParser") + self.logger = ProfilerLogger.get_instance() try: self._torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) self._kernel_dict = deps_data.get(Constant.RELATION_PARSER, {}) diff --git a/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py b/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py index 2f793a8af8b611559613799a004531224c366590..b4a85271d99034e55936d682e9b4748f6251cf11 100644 --- a/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py @@ -23,10 +23,10 @@ class StackViewParser(BaseParser): self._root_node = None self._kernel_dict = {} self._metric = param_dict.get("metric") - ProfilerLogger.init(self._profiler_path, "StackViewParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, 
"StackViewParser") + self.logger = ProfilerLogger.get_instance() try: self._torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) self.generate_view() diff --git a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py index 744e2cd8a6e1a42b9e9e813f5cb27c51cd34ce61..46093bec4e8e2cbe50af5590be96f37ad9ac574f 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py @@ -51,8 +51,6 @@ class TraceStepTimeParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self.step_range = [] - ProfilerLogger.init(self._profiler_path, "TraceStepTimeParser") - self.logger = ProfilerLogger.get_instance() @classmethod def is_float_num(cls, num): @@ -165,6 +163,8 @@ class TraceStepTimeParser(BaseParser): FileManager.create_csv_file(output_path, print_time, file_name, self.title) def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "TraceStepTimeParser") + self.logger = ProfilerLogger.get_instance() try: self._init_step_range(deps_data) self.generate_view() diff --git a/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py index f90100e869fd4c4ea92661dd2183b8fd20808412..c5e572e1bcfeba5ecaa4c4e6db93b47c896392eb 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py @@ -27,8 +27,6 @@ class TraceViewParser(BaseParser): self._trace_data = [] self._torch_op_node = [] self._root_node = None - ProfilerLogger.init(self._profiler_path, "TraceViewParser") - self.logger = ProfilerLogger.get_instance() @staticmethod def _prune_trace_by_level(json_data: list) -> list: @@ -47,6 +45,8 @@ class TraceViewParser(BaseParser): return result def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "TraceViewParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py index 8ef2072be611814bb0a604685b957745d8d221fa..da8037f982bbc2ba77f18a3aa5928565bf45a28e 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py @@ -34,10 +34,10 @@ class CANNAnalyzeParser(BaseParser): super().__init__(name, param_dict) self._cann_path = ProfilerPathManager.get_cann_path(self._profiler_path) self.msprof_path = shutil.which("msprof") - ProfilerLogger.init(self._profiler_path, "CANNAnalyzeParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "CANNAnalyzeParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) if not os.path.isdir(self._cann_path): diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py index 6a703d0b954ecca3a58621cd940b23f7726dc27c..7228525fae6d03a8d41a2f50b6ca9094fee8070b 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py @@ -41,10 +41,10 @@ class CANNExportParser(BaseParser): 
super().__init__(name, param_dict) self._cann_path = ProfilerPathManager.get_cann_path(self._profiler_path) self.msprof_path = shutil.which("msprof") - ProfilerLogger.init(self._profiler_path, "CANNExportParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "CANNExportParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) if not os.path.isdir(self._cann_path): diff --git a/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py b/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py index 6cc6f235165107299886fb4cf936e927dbd687b4..939e06cf748ba4a011a9a33b4ded585fe04f3310 100644 --- a/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py @@ -28,10 +28,10 @@ class TracePreParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - ProfilerLogger.init(self._profiler_path, "TracePreParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "TracePreParser") + self.logger = ProfilerLogger.get_instance() try: fwk_trace_data = FwkFileParser(self._profiler_path).get_fwk_trace_data() trace_file_path = os.path.join(self._output_path, Constant.TRACE_VIEW_TEMP) if os.path.isdir( diff --git a/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py b/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py index e6eb02ddb81d7ffce69d4e2d60899beb62012c61..5e8a941de2873cf071baa412a50d964978fce539 100644 --- a/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py @@ -23,10 +23,10 @@ __all__ = [] class RelationParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - ProfilerLogger.init(self._profiler_path, "RelationParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "RelationParser") + self.logger = ProfilerLogger.get_instance() try: kernel_dict = FwkCANNRelationParser(self._profiler_path).get_kernel_dict() except Exception as e: diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py index 64de6315f246b49a93a2a72a7b1614aa1f630c3a..34a5fc27f856530c83cb66ba93a63afe367aa746 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py @@ -65,6 +65,8 @@ class MemoryDbParser(BaseParser): @staticmethod def _combine_record(last_record, cur_record): + if cur_record[MemoryRecordTableRow.COMPONENT.value] == Str2IdManager().get_id_from_str(Constant.WORKSPACE): + return [cur_record] pta_ge_record_list = cur_record[:] pta_ge_record_list[MemoryRecordTableRow.COMPONENT.value] = Str2IdManager().get_id_from_str(Constant.PTA_GE) if last_record: @@ -179,9 +181,16 @@ class MemoryDbParser(BaseParser): if not self._pta_memory_bean_list: return for memory_bean in self._pta_memory_bean_list: + if memory_bean.component_type == Constant.WORKSPACE_TYPE: + self._pta_record_list.append([Str2IdManager().get_id_from_str(Constant.WORKSPACE), memory_bean.time_ns, + memory_bean.total_allocated_for_db, 
memory_bean.total_reserved_for_db, + memory_bean.total_active_for_db, memory_bean.stream_ptr, + memory_bean.device_index]) + continue self._pta_record_list.append([Str2IdManager().get_id_from_str(Constant.PTA), memory_bean.time_ns, memory_bean.total_allocated_for_db, memory_bean.total_reserved_for_db, - memory_bean.total_active_for_db, memory_bean.stream_ptr, memory_bean.device_index]) + memory_bean.total_active_for_db, memory_bean.stream_ptr, + memory_bean.device_index]) def get_pta_ge_record_list(self): """ @@ -203,7 +212,9 @@ class MemoryDbParser(BaseParser): if ge_record[1] >= pta_record[1]: self._record_list.extend(self._combine_record(last_ge_record, pta_record)) pta_ptr += 1 - last_pta_record = pta_record + if pta_record[MemoryRecordTableRow.COMPONENT.value] != \ + Str2IdManager().get_id_from_str(Constant.WORKSPACE): + last_pta_record = pta_record else: self._record_list.extend(self._combine_record(last_pta_record, ge_record)) ge_ptr += 1 diff --git a/torch_npu/profiler/dynamic_profile.py b/torch_npu/profiler/dynamic_profile.py index 99bfd76f7263e115b2839e07a4bc8948b8969f64..313fe7d388f078fed4a63ffd668f531136a132a7 100644 --- a/torch_npu/profiler/dynamic_profile.py +++ b/torch_npu/profiler/dynamic_profile.py @@ -3,6 +3,7 @@ import json import atexit import time +from ..npu import mstx, current_stream from .profiler import tensorboard_trace_handler, profile from .scheduler import Schedule as schedule @@ -38,6 +39,7 @@ class _DynamicProfile: self._step_record_time = None self._step_time = 0 self._min_poll_interval = 1 + self._step_mstx_range_id = 0 def init(self): if self.repeat_init: @@ -78,6 +80,9 @@ class _DynamicProfile: self._step_time = max(self._min_poll_interval, int(time.time() - self._step_record_time)) self._dynamic_monitor.modify_step_time(self._step_time) if self.prof: + if self._step_mstx_range_id: + mstx.range_end(self._step_mstx_range_id) + self._step_mstx_range_id = mstx.range_start(f"step {self.cur_step}", current_stream()) self.prof.step() self.step_num -= 1 if 0 == self.step_num: @@ -138,7 +143,9 @@ class _DynamicProfile: with_modules=self.cfg_ctx.with_modules, experimental_config=self.cfg_ctx.experimental_config ) + self.prof._set_step_num_offset_for_dynamic_prof(self.cur_step) self.prof.start() + self._step_mstx_range_id = mstx.range_start(f"step {self.cur_step}", current_stream()) for key, value in self.cfg_ctx.meta_data().items(): self.prof.add_metadata_json(str(key), json.dumps(value)) DynamicProfilerUtils.out_log("Start Dynamic Profiler at {} step.".format( diff --git a/torch_npu/profiler/profiler.py b/torch_npu/profiler/profiler.py index 409013114a8302ad7a7130387ad17352479968f6..d45ad41693385c69b674196da9b92b7e17e49fac 100644 --- a/torch_npu/profiler/profiler.py +++ b/torch_npu/profiler/profiler.py @@ -229,6 +229,7 @@ class profile(_KinetoProfile): self.on_trace_ready = on_trace_ready self.step_num = 0 self.current_action = self.schedule(self.step_num) + self._step_num_offset = 0 self.step_rec_fn: Optional[prof.record_function] = None if use_cuda is not None: print_warn_msg("This is npu environment, use_cuda is invalid") @@ -249,6 +250,10 @@ class profile(_KinetoProfile): if self.stopped == False: self.stop() + @no_exception_func() + def _set_step_num_offset_for_dynamic_prof(self, step: int): + self._step_num_offset = step + @no_exception_func() def start(self): self.stopped = False @@ -256,7 +261,7 @@ class profile(_KinetoProfile): ProfPathCreator().init(export_only_mode=True) self.action_controller.transit_action(ProfilerAction.NONE, self.current_action) 
if self.record_steps: - self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num)) + self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num + self._step_num_offset)) self.step_rec_fn.__enter__() @no_exception_func() @@ -278,7 +283,7 @@ class profile(_KinetoProfile): self.current_action = self.schedule(self.step_num) self.action_controller.transit_action(prev_action, self.current_action) if self.record_steps: - self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num)) + self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num + self._step_num_offset)) self.step_rec_fn.__enter__() diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py index e8fbd923b343f387d2be38aa776237a48c2d0ef9..0cb93e9951d1cc59a4eaa6ae977aaf7e3d4e0333 100644 --- a/torch_npu/utils/__init__.py +++ b/torch_npu/utils/__init__.py @@ -1,12 +1,12 @@ __all__ = ["npu_combine_tensors", "get_part_combined_tensor", "is_combined_tensor_valid", "FlopsCounter", - "set_thread_affinity"] + "set_thread_affinity", "reset_thread_affinity", "save_async"] from torch_npu import _C from ._module import _apply_module_patch from .tensor_methods import _add_tensor_methods from .storage import _add_storage_methods from .combine_tensors import npu_combine_tensors, get_part_combined_tensor, is_combined_tensor_valid -from .serialization import _add_serialization_methods +from .serialization import _add_serialization_methods, save_async from .npu_intercept import _cann_package_check, _add_intercept_methods from .dtensor import _register_ops_under_dtensor_rules from .collect_env import _add_collect_env_methods @@ -18,6 +18,7 @@ from .utils import _print_error_log, _print_warn_log, _print_info_log, _apply_np from ._step import add_perf_dump_patch from .flops_count import _FlopsCounter as FlopsCounter from .affinity import _set_thread_affinity as set_thread_affinity +from .affinity import _reset_thread_affinity as reset_thread_affinity # init flopcount diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py index fbe408739fd35e2c9dff7f2635a04af7ba2e60d6..313736c3a176de58c81ea2fad222475c78b99a5a 100644 --- a/torch_npu/utils/_module.py +++ b/torch_npu/utils/_module.py @@ -30,8 +30,6 @@ from torch_npu.utils.syncbatchnorm import SyncBatchNorm as sync_batch_norm from torch_npu.utils._error_code import ErrCode, pta_error origin_mpdl_iter_init = _MultiProcessingDataLoaderIter.__init__ -origin_worker_loop = worker._worker_loop -origin_pin_memory_loop = pin_memory._pin_memory_loop CONV3D_SUPPORT_FP32_SOC_PREFIX = ["Ascend910B", "Ascend910_93"] @@ -370,17 +368,9 @@ def _mpdl_iter_init(self, *args, **kwargs): torch_npu.npu.synchronize() except Exception as e: print(e) - origin_mpdl_iter_init(self, *args, **kwargs) - - -def _npu_worker_loop(*args, **kwargs): torch_npu._C._npu_set_thread_affinity(-1, -1) - origin_worker_loop(*args, **kwargs) - - -def _npu_pin_memory_loop(*args, **kwargs): - torch_npu._C._npu_set_thread_affinity(-1, -1) - origin_pin_memory_loop(*args, **kwargs) + origin_mpdl_iter_init(self, *args, **kwargs) + torch_npu._C._npu_reset_thread_affinity() def _parallel_apply( @@ -533,5 +523,3 @@ def _apply_module_patch(): torch.nn.parallel.DataParallel.parallel_apply = npu_parallel_apply torch.nn.parallel.data_parallel = npu_data_parallel torch.utils.data.dataloader._MultiProcessingDataLoaderIter.__init__ = _mpdl_iter_init - torch.utils.data._utils.worker._worker_loop = _npu_worker_loop - torch.utils.data._utils.pin_memory._pin_memory_loop = 
_npu_pin_memory_loop diff --git a/torch_npu/utils/affinity.py b/torch_npu/utils/affinity.py index 7728736baa19712879a2d8edc58ac33cdfc6c069..37973f5bc79bc81af684a286603bb75e2c734332 100644 --- a/torch_npu/utils/affinity.py +++ b/torch_npu/utils/affinity.py @@ -14,4 +14,8 @@ def _set_thread_affinity(core_range: List[int] = None): raise ValueError("Core range should be nonnegative." + pta_error(ErrCode.PARAM)) torch_npu._C._npu_set_thread_affinity(core_range[0], core_range[1]) else: - raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM)) \ No newline at end of file + raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM)) + + +def _reset_thread_affinity(): + torch_npu._C._npu_reset_thread_affinity() \ No newline at end of file diff --git a/torch_npu/utils/collect_env.py b/torch_npu/utils/collect_env.py index 3f279bf3cc36512a8f8a0fcbbb748262f857998e..8ffed93212ccdc61027c5af7d2682fa8bb8e0358 100644 --- a/torch_npu/utils/collect_env.py +++ b/torch_npu/utils/collect_env.py @@ -87,14 +87,8 @@ def get_cann_version(): def get_torch_npu_version(): torch_npu_version_str = 'N/A' - torch_npu_root = get_torch_npu_install_path() - version_path = os.path.join(torch_npu_root, "torch_npu", "version.py") - check_directory_path_readable(version_path) - with open(version_path, "r") as f: - for line in f: - if line.find("__version__") != -1: - torch_npu_version_str = line.strip().split("=")[-1] - break + if TORCH_NPU_AVAILABLE: + torch_npu_version_str = torch_npu.__version__ return torch_npu_version_str diff --git a/torch_npu/utils/serialization.py b/torch_npu/utils/serialization.py index cb56f8cb79072537733e62f951b78073a1e2af3e..d08752f4704c6c798ddec16a5f8ac8e36ee03340 100644 --- a/torch_npu/utils/serialization.py +++ b/torch_npu/utils/serialization.py @@ -3,22 +3,25 @@ import io import sys import pickle import tarfile +import threading from typing import Dict, Any, Optional import torch from torch.serialization import _check_dill_version, _open_file_like, _is_zipfile, \ _open_zipfile_reader, _is_torchscript_zip, _weights_only_unpickler, \ _legacy_load, _load, FILE_LIKE, MAP_LOCATION, DEFAULT_PROTOCOL, \ - normalize_storage_type, location_tag, _serialization_tls, _check_seekable, closing, _should_read_directly + normalize_storage_type, location_tag, _serialization_tls, _check_seekable, closing, _should_read_directly, \ + _open_zipfile_writer import torch_npu from torch_npu.utils._error_code import ErrCode, pta_error from .utils import _should_print_warning +__all__ = ["load", "save", "save_async"] + ALWAYS_WARN_LEGACY_SERIALIZATION = False RE_MAP_CPU = False - -__all__ = ["load", "save"] +save_async_stream_map = {} def _get_always_warn_legacy_serialization(): @@ -349,6 +352,146 @@ def save( return torch.serialization.save(obj, f, pickle_module, pickle_protocol, True, _disable_byteorder_record) +def save_async( + obj: object, + f, + pickle_module: Any = pickle, + pickle_protocol: int = DEFAULT_PROTOCOL, + _use_new_zipfile_serialization: bool = True, + _disable_byteorder_record: bool = False, + model: torch.nn.Module = None +) -> None: + if _use_new_zipfile_serialization is False: + raise RuntimeError("Error: torch_npu.save_async with \"_use_new_zipfile_serialization = False\"\ + is not recommended for npu tensor, which may bring unexpected errors and hopefully \ + set \"_use_new_zipfile_serialization = True\"", + "if it is necessary to use this, please convert the npu tensor to cpu tensor for saving" + + 
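Editor's note: with the affinity.py change above, thread pinning becomes reversible via the newly exported reset_thread_affinity. A minimal sketch (core indices are illustrative; per _set_thread_affinity the list must hold exactly two nonnegative values):

```python
from torch_npu.utils import set_thread_affinity, reset_thread_affinity

set_thread_affinity([0, 7])   # pin the calling thread to CPU cores 0-7
# ... host-side, latency-sensitive work ...
reset_thread_affinity()       # undo the pinning via the new _npu_reset_thread_affinity binding
```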
pta_error(ErrCode.PARAM)) + + _check_dill_version(pickle_module) + save_args = (obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization, _disable_byteorder_record) + + device = torch.npu.current_device() + save_thread = threading.Thread(target=_save_data_thread, args=(save_args, device, model)) + save_thread.start() + + +def _save_data_thread(save_args, + device, + model: torch.nn.Module = None): + global save_async_stream_map + torch.npu.set_device(device) + + def hook_fn(*args): + torch.npu.current_stream().wait_stream(save_async_stream_map.get(device)) + + if device not in save_async_stream_map: + save_async_stream = torch.npu.Stream() + save_async_stream_map[device] = save_async_stream + if isinstance(model, torch.nn.Module): + model.register_full_backward_hook(hook_fn) + else: + save_async_stream = save_async_stream_map[device] + + obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization, _disable_byteorder_record = save_args + with torch.npu.stream(save_async_stream): + data_value, serialized_storages = _save(obj, pickle_module, pickle_protocol) + storage_value = [] + for key in sorted(serialized_storages.keys()): + name = f'data/{key}' + storage = serialized_storages.get(key) + # given that we copy things around anyway, we might use storage.cpu() + # this means to that to get tensors serialized, you need to implement + # .cpu() on the underlying Storage + if storage.device.type != 'cpu': + storage = storage.cpu() + # Now that it is on the CPU we can directly copy it into the zip file + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + num_bytes = storage_tensor.size().numel() * storage_tensor.element_size() + else: + num_bytes = storage.nbytes() + storage_value.append((name, storage, num_bytes)) + + with _open_zipfile_writer(f) as opened_zipfile: + opened_zipfile.write_record('data.pkl', data_value, len(data_value)) + + for name, storage, num_bytes in storage_value: + opened_zipfile.write_record(name, storage.data_ptr(), num_bytes) + + +def _save(obj, pickle_module, pickle_protocol): + serialized_storages = {} + id_map: Dict[int, str] = {} + + # Since loading storages that view the same data with different dtypes is + # not supported, we need to keep track of the dtype associated with each + # storage data_ptr and throw an error if the dtype is ever different. + storage_dtypes: Dict[int, torch.dtype] = {} + + def persistent_id(obj): + if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): + + if isinstance(obj, torch.storage.TypedStorage): + storage = obj._untyped_storage + storage_dtype = obj.dtype + storage_type_str = obj._pickle_storage_type() + storage_type = getattr(torch, storage_type_str) + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + storage_numel = storage_tensor.size().numel() * storage_tensor.element_size() // obj._element_size() + else: + storage_numel = obj._size() + + else: + storage = obj + storage_dtype = torch.uint8 + storage_type = normalize_storage_type(type(obj)) + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + storage_numel = storage_tensor.size().numel() * storage_tensor.element_size() + else: + storage_numel = storage.nbytes() + + # If storage is allocated, ensure that any other saved storages + # pointing to the same data all have the same dtype. 
If storage is + # not allocated, don't perform this check + if storage.data_ptr() != 0: + if storage.data_ptr() in storage_dtypes: + if storage_dtype != storage_dtypes[storage.data_ptr()]: + raise RuntimeError( + 'Cannot save multiple tensors or storages that ' + 'view the same data as different types' + pta_error(ErrCode.VALUE)) + else: + storage_dtypes[storage.data_ptr()] = storage_dtype + + storage_key = id_map.setdefault(storage._cdata, str(len(id_map))) + location = location_tag(storage) + serialized_storages[storage_key] = storage + + return ('storage', + storage_type, + storage_key, + location, + storage_numel) + + return None + + # Write the pickle data for `obj` + data_buf = io.BytesIO() + pickler = pickle_module.Pickler(data_buf, protocol=pickle_protocol) + pickler.persistent_id = persistent_id + if isinstance(obj, torch.nn.Module): + hook_handle = obj._backward_hooks.copy() + obj._backward_hooks.clear() + pickler.dump(obj) + obj._backward_hooks.update(hook_handle) + else: + pickler.dump(obj) + data_value = data_buf.getvalue() + return data_value, serialized_storages + + def _add_serialization_methods(): torch.save = save torch.load = load diff --git a/torch_npu/utils/storage.py b/torch_npu/utils/storage.py index 9304f141bf2fc5475f1594d6aff44aee99fbc289..85a2a402a37c11f81e77155ab9c792c56c80a061 100644 --- a/torch_npu/utils/storage.py +++ b/torch_npu/utils/storage.py @@ -1,4 +1,7 @@ +__all__ = [] + import copy +from typing import Union import torch from torch.storage import _warn_typed_storage_removal @@ -49,6 +52,37 @@ def _deepcopy(self, memo): return self._new_wrapped_storage(copy.deepcopy(self._untyped_storage, memo)) +def _share_npu_(self, *args, **kwargs): + return torch_npu._C._share_npu_(self, *args, **kwargs) + + +def _typed_storage_share_npu_(self, *args, **kwargs): + return self._untyped_storage._share_npu_(*args, **kwargs) + + +def _new_shared_npu(*args, **kwargs): + return torch_npu._C._new_shared_npu(*args, **kwargs) + + +def _typed_storage_new_shared_npu(*args, **kwargs): + return torch.UntypedStorage._new_shared_npu(*args, **kwargs) + + +def _release_ipc_counter_npu(*args, **kwargs): + return torch_npu._C._release_ipc_counter_npu(*args, **kwargs) + + +def _typed_storage_release_ipc_counter_npu(*args, device: Union[str, torch.device] = "npu", **kwargs): + return torch.UntypedStorage._release_ipc_counter_npu(*args, **kwargs) + + def _add_storage_methods(): torch.storage.UntypedStorage.cpu = _cpu torch.storage.TypedStorage._deepcopy = _deepcopy + + setattr(torch.UntypedStorage, "_share_npu_", _share_npu_) + setattr(torch.UntypedStorage, "_new_shared_npu", _new_shared_npu) + setattr(torch.UntypedStorage, "_release_ipc_counter_npu", _release_ipc_counter_npu) + setattr(torch.TypedStorage, "_share_npu_", _typed_storage_share_npu_) + setattr(torch.TypedStorage, "_new_shared_npu", _typed_storage_new_shared_npu) + setattr(torch.TypedStorage, "_release_ipc_counter_npu", _typed_storage_release_ipc_counter_npu) \ No newline at end of file diff --git a/torch_npu/utils/unsupport_api.py b/torch_npu/utils/unsupport_api.py index 61ba27b3a239f000e7d93437add90e65d62884b8..5626e940b6a690e7a74815095c8d51a3fd08dabd 100644 --- a/torch_npu/utils/unsupport_api.py +++ b/torch_npu/utils/unsupport_api.py @@ -6,8 +6,6 @@ value: parent_module(object) """ unsupported_Tensor_api = { - "is_shared": torch.Tensor, - "share_memory_": torch.Tensor } unsupported_nn_api = {
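Editor's note on save_async above: serialization of NPU tensors is moved onto a dedicated stream in a background thread, with an optional backward hook when a module is passed. A usage sketch; the module, file name and the decision to pass model= are illustrative, and callers presumably should not mutate the saved tensors until the background save finishes.

```python
import torch
import torch_npu
from torch_npu.utils import save_async

model = torch.nn.Linear(16, 16).npu()

# The checkpoint is written from a worker thread on a side NPU stream; passing
# `model` registers a full-backward hook so later backward passes wait on that stream.
save_async(model.state_dict(), "checkpoint.pt", model=model)
```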