From d11548f7b7b658fc022227a8539f9f67e0d6207f Mon Sep 17 00:00:00 2001 From: jzh Date: Thu, 4 Sep 2025 17:50:58 +0800 Subject: [PATCH 1/3] [pytorch][bugfix] icsl for chmod640 --- mindspeed_llm/tasks/checkpoint/saver.py | 3 ++- mindspeed_llm/training/training.py | 28 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/mindspeed_llm/tasks/checkpoint/saver.py b/mindspeed_llm/tasks/checkpoint/saver.py index f2a10a105..69b9b9aff 100644 --- a/mindspeed_llm/tasks/checkpoint/saver.py +++ b/mindspeed_llm/tasks/checkpoint/saver.py @@ -492,7 +492,7 @@ def set_model_rm_head(model_mg, msg, md, **kwargs): if model_mg.has_rm_head_bias(**kwargs): model_mg.set_rm_head_bias(**kwargs, data=rm_head_bias) - +from mindspeed_llm.training.training import update_save_checkpoint_chmod def save_model(model_mg, md, **kwargs): margs = model_mg.get_args() args_cmd = model_mg.get_args_cmd() @@ -532,6 +532,7 @@ def save_model(model_mg, md, **kwargs): save_checkpoint(md.iteration, vp_models, None, None, 0) elif args_cmd.save_model_type == "hf": save_huggingface(args_cmd, model_mg) + update_save_checkpoint_chmod(args_cmd.save_dir) def save_huggingface(args, model): diff --git a/mindspeed_llm/training/training.py b/mindspeed_llm/training/training.py index abf010f3e..9f8d6f9f3 100644 --- a/mindspeed_llm/training/training.py +++ b/mindspeed_llm/training/training.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os import gc import sys import json @@ -64,6 +65,29 @@ from mindspeed_llm.tasks.posttrain.lora.utils import is_enable_lora _TRAIN_START_TIME = time.time() +def update_save_checkpoint_chmod(save_path, permission=0o640): + if not os.path.exists(save_path): + return False + + if not os.path.isdir(save_path): + logging.warning(f"warning:not a directory: {save_path}") + return False + + for root, dirs, files in os.walk(save_path): + for file in files: + file_path = os.path.join(root, file) + + try: + os.chmod(file_path, permission) + except PermissionError: + logging.warning(f"permission error: {file_path}") + except Exception as e: + logging.warning(f"failed to change permission: {file_path}: {e}") + + print(f"finish permission set for files in {save_path}") + return True + + def model_provider_func_wrapper(model_provider_func): @wraps(model_provider_func) def wrapper(*args, **kwargs): @@ -647,6 +671,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context=None) + update_save_checkpoint_chmod(config.save) print_datetime('exiting program after receiving SIGTERM.') exit = True break @@ -657,6 +682,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context=None) + update_save_checkpoint_chmod(config.save) saved_checkpoint = True # Exiting based on duration @@ -674,6 +700,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context=None) + update_save_checkpoint_chmod(config.save) print_datetime('exiting program after {} minutes'.format(train_time)) exit = True break @@ -685,6 +712,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context=None) + update_save_checkpoint_chmod(config.save) 
torch.distributed.barrier() print_datetime('exiting program at iteration {}'.format(iteration)) exit = True -- Gitee From 19b587841b3553eb19db0998b22ae54449c79040 Mon Sep 17 00:00:00 2001 From: jzh Date: Thu, 4 Sep 2025 20:03:47 +0800 Subject: [PATCH 2/3] [pytorch][bugfix] icsl for nltk.load --- SECURITYNOTE.md | 7 +++--- mindspeed_llm/tasks/checkpoint/saver.py | 3 ++- mindspeed_llm/training/training.py | 29 ++++++++++--------------- preprocess_data.py | 1 + 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index 87061aae5..b50b5611e 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -40,13 +40,13 @@ ## 数据安全声明 1. MindSpeed-LLM会在megatron中的checkpointing模块中保存模型文件,其中部分模型文件使用了风险模块pickle,可能存在数据风险。 - +2. 程序运行过程中,会通过nltk.load从用户指定的路径中加载语料库,需要保证网络安全,确保下载的语料包来源可信。 ## 运行安全声明 1. 建议用户结合运行资源状况编写对应训练脚本。若训练脚本与资源状况不匹配,如数据集加载内存大小超出内存容量限制、训练脚本在本地生成数据超过磁盘空间大小等情况,可能引发错误并导致进程意外退出。 2. MindSpeed-LLM内部用到了pytorch,可能会因为版本不匹配导致运行错误,具体可参考pytorch[安全声明](https://gitee.com/ascend/pytorch#%E5%AE%89%E5%85%A8%E5%A3%B0%E6%98%8E)。 -3. 出于安全考虑torch.load应该采用了weights_only=True的配置,因为影响业务正常运行,保持配置为False,并提供告警声明。 +3. 本软件使用pytorch的torch.load做模型加载,代码中存在该接口使用场景配置参数weights_only=True,对于pytorch版本<=2.5.1时,存在反序列化漏洞CVE-2025-32434,请用户保障所加载权重的安全性,避免恶意模型加载使执行机/设备遭到攻击。 4. HumanEval使用了subprocess.run,存在安全风险,为了不影响功能正常使用,做了一些安全校验规避,请用户根据需要自行构建黑名单,完善安全问题。 5. 
因为安全问题,需要设置trust_remote_code=False,无法远程加载Transformer官方仓库未支持的开源模型,如需要,请手动配置--trust-remote-code参量。 @@ -79,4 +79,5 @@ MindSpeed-LLM 暂时未发布wheel包,无正式对外公开接口,所有功 ### 特殊场景 | 场景 | 使用方法 | 端口 | 可能的风险 | | ------------------------------------- | ------------------------------------------------ | ---------- | ---------- | -| 使用MindSpeed-LLM进行训练任务时,新增32个随机端口和1个master-port端口(该端口与torch_npu的master-post端口一致) | 使用pytorch分布式训练拉起任一任务 | [1024,65520]内 | 网络配置错误可能引发端口冲突或连接问题,影响训练效率。 | \ No newline at end of file +| 使用MindSpeed-LLM进行训练任务时,新增32个随机端口和1个master-port端口(该端口与torch_npu的master-post端口一致) | 使用pytorch分布式训练拉起任一任务 | [1024,65520]内 | 网络配置错误可能引发端口冲突或连接问题,影响训练效率。 | +| 用户通过nltk.download下载语料库 | 用户在代码内部使用nltk.download来实现语料库的下载 | 随机端口 | 文件来源若不可信,在文件加载时可能存在反序列化漏洞,导致文件被篡改。 | \ No newline at end of file diff --git a/mindspeed_llm/tasks/checkpoint/saver.py b/mindspeed_llm/tasks/checkpoint/saver.py index 69b9b9aff..4e60b2a15 100644 --- a/mindspeed_llm/tasks/checkpoint/saver.py +++ b/mindspeed_llm/tasks/checkpoint/saver.py @@ -23,6 +23,7 @@ from megatron.core import mpu import megatron.core.tensor_parallel.layers as tpl from megatron.training.checkpointing import save_checkpoint +from mindspeed_llm.training.training import update_save_checkpoint_chmod from .models import get_megatron_model logger.basicConfig(format="") @@ -492,7 +493,7 @@ def set_model_rm_head(model_mg, msg, md, **kwargs): if model_mg.has_rm_head_bias(**kwargs): model_mg.set_rm_head_bias(**kwargs, data=rm_head_bias) -from mindspeed_llm.training.training import update_save_checkpoint_chmod + def save_model(model_mg, md, **kwargs): margs = model_mg.get_args() args_cmd = model_mg.get_args_cmd() diff --git a/mindspeed_llm/training/training.py b/mindspeed_llm/training/training.py index 9f8d6f9f3..328b1318e 100644 --- a/mindspeed_llm/training/training.py +++ b/mindspeed_llm/training/training.py @@ -66,26 +66,19 @@ _TRAIN_START_TIME = time.time() def update_save_checkpoint_chmod(save_path, permission=0o640): - if not 
os.path.exists(save_path): - return False - - if not os.path.isdir(save_path): - logging.warning(f"warning:not a directory: {save_path}") - return False - - for root, dirs, files in os.walk(save_path): - for file in files: - file_path = os.path.join(root, file) - - try: - os.chmod(file_path, permission) - except PermissionError: - logging.warning(f"permission error: {file_path}") - except Exception as e: - logging.warning(f"failed to change permission: {file_path}: {e}") + if os.path.exists(save_path) and os.path.isdir(save_path): + for root, _, files in os.walk(save_path): + for file in files: + file_path = os.path.join(root, file) + + try: + os.chmod(file_path, permission) + except PermissionError: + logging.warning(f"permission error: {file_path}") + except Exception as e: + logging.warning(f"failed to change permission: {file_path}: {e}") print(f"finish permission set for files in {save_path}") - return True def model_provider_func_wrapper(model_provider_func): diff --git a/preprocess_data.py b/preprocess_data.py index 8e8a709cb..cdd18eeb4 100644 --- a/preprocess_data.py +++ b/preprocess_data.py @@ -77,6 +77,7 @@ def build_splitter(args): if not nltk: logger.error("NLTK is not available to split sentences.") raise Exception("nltk is not available") + logger.warning("Warning: nltk.load() uses pickle. 
Ensure the source of the corpus is trusted.") splitter = nltk.load("tokenizers/punkt/english.pickle") if args.keep_newlines: # this prevents punkt from eating newlines after sentences -- Gitee From 4049365448f4388702ecbb4c8b160c076b4f3990 Mon Sep 17 00:00:00 2001 From: jzh Date: Wed, 10 Sep 2025 18:18:23 +0800 Subject: [PATCH 3/3] [pytorch][bugfix]remove invalid ut test --- tests/ut/checkpoint/test_checkpoint.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/ut/checkpoint/test_checkpoint.py b/tests/ut/checkpoint/test_checkpoint.py index 9d973ce00..cf2f8d5be 100644 --- a/tests/ut/checkpoint/test_checkpoint.py +++ b/tests/ut/checkpoint/test_checkpoint.py @@ -50,15 +50,6 @@ class TestCheckpoint(object): base_dir = '/data/ci/deepseek2/hf_base/deepseek2_mla-tp_hf_base' save_dir = os.path.join(self.test_config['test_deepseek2_mcore2hf_tp1pp4ep8'][0]['save-dir'], 'mg2hf') assert weight_compare(base_dir, save_dir, suffix="safetensors", use_md5=True) - shutil.rmtree(save_dir) - - def test_qwen2_moe_hf2mcore_tp2pp1ep2(self): - os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" - exit_code = run_cmd(["python3", CKPT_PYPATH] + self.test_config_cmd['test_qwen2_moe_hf2mcore_tp2pp1ep2']) - assert exit_code == 0 - base_dir = "/data/ci/qwen2_moe/mg_base/qwen2_moe_l2_t2p1e2_base" - save_dir = self.test_config['test_qwen2_moe_hf2mcore_tp2pp1ep2'][0]['save-dir'] - assert weight_compare(base_dir, save_dir) shutil.rmtree(save_dir) def test_llama2_hf2mcore_orm_pp2vpp2(self): -- Gitee