diff --git a/torch_npu/profiler/analysis/npu_profiler.py b/torch_npu/profiler/analysis/npu_profiler.py
index 8632108d9f1be8017e5987f240c32fd4c7df1826..f7c1acbc73683bc5da2e7159eb282bd0d5842ff1 100644
--- a/torch_npu/profiler/analysis/npu_profiler.py
+++ b/torch_npu/profiler/analysis/npu_profiler.py
@@ -25,9 +25,20 @@ from ...utils.path_manager import PathManager
 
 
 class NpuProfiler:
-
     @classmethod
     def analyse(cls, input_path: str, analysis_type: str = Constant.TENSORBOARD_TRACE_HANDLER, output_path: str = None,
+                 **kwargs):
+        """ Run _analyse in a spawned child process and wait for it to exit. Parsing forks workers for performance,
+            but forking from a multi-threaded process may deadlock, so a clean process becomes the parent for parsing.
+        """
+        mp = multiprocessing.get_context("spawn")
+        p = mp.Process(target=NpuProfiler._analyse, args=(input_path, analysis_type, output_path),
+                                    kwargs=kwargs)
+        p.start()
+        p.join()  # NOTE(review): p.exitcode is never checked, so a failure in the child is not raised here
+
+    @classmethod
+    def _analyse(cls, input_path: str, analysis_type: str = Constant.TENSORBOARD_TRACE_HANDLER, output_path: str = None,
                 **kwargs):
         input_path = ProfilerPathManager.get_realpath(input_path)
         cls._check_input_path(input_path)