diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 8b16502c8146b6b4fd58b8fc979a7e0c3afb2800..cc631b4748c0509b1753f2cfcd6c456ca5459b88 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1058,6 +1058,9 @@ class DeviceCachingAllocator { key.size = (key.size < CachingAllocatorConfig::max_split_size()) ? CachingAllocatorConfig::max_split_size() : key.size; auto it = pool.blocks.lower_bound(&key); + + c10_npu::npuSynchronizeDevice(true); + if (it == pool.blocks.end() || (*it)->stream != p.stream()) { // No single block is large enough; free multiple oversize blocks, starting with the largest if (it == pool.blocks.begin()){