diff --git a/torch_npu/profiler/analysis/npu_profiler.py b/torch_npu/profiler/analysis/npu_profiler.py
index f6b136be763c8088fcebe71253fbecec0d0373c3..f99b734151c3a7115dd1b1b9a43f4e8afecc59e0 100644
--- a/torch_npu/profiler/analysis/npu_profiler.py
+++ b/torch_npu/profiler/analysis/npu_profiler.py
@@ -24,9 +24,20 @@ from ...utils.path_manager import PathManager
 
 
 class NpuProfiler:
-
     @classmethod
     def analyse(cls, input_path: str, analysis_type: str = Constant.TENSORBOARD_TRACE_HANDLER, output_path: str = None,
+                 **kwargs):
+        """ Multi-process parsing uses fork to generate child processes for better performance, but forking from a
+            multi-threaded process may cause deadlock, so spawn a clean process to act as the common parent for parsing.
+        """
+        mp = multiprocessing.get_context("spawn")
+        p = mp.Process(target=NpuProfiler._analyse, args=(input_path, analysis_type, output_path),
+                                    kwargs=kwargs)
+        p.start()
+        p.join()
+
+    @classmethod
+    def _analyse(cls, input_path: str, analysis_type: str = Constant.TENSORBOARD_TRACE_HANDLER, output_path: str = None,
                 **kwargs):
         input_path = ProfilerPathManager.get_realpath(input_path)
         cls._check_input_path(input_path)