From d11548f7b7b658fc022227a8539f9f67e0d6207f Mon Sep 17 00:00:00 2001 From: jzh Date: Thu, 4 Sep 2025 17:50:58 +0800 Subject: [PATCH 1/3] [pytorch][bugfix] icsl for chmod640 --- mindspeed_llm/tasks/checkpoint/saver.py | 3 ++- mindspeed_llm/training/training.py | 28 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/mindspeed_llm/tasks/checkpoint/saver.py b/mindspeed_llm/tasks/checkpoint/saver.py index f2a10a105..69b9b9aff 100644 --- a/mindspeed_llm/tasks/checkpoint/saver.py +++ b/mindspeed_llm/tasks/checkpoint/saver.py @@ -492,7 +492,7 @@ def set_model_rm_head(model_mg, msg, md, **kwargs): if model_mg.has_rm_head_bias(**kwargs): model_mg.set_rm_head_bias(**kwargs, data=rm_head_bias) - +from mindspeed_llm.training.training import update_save_checkpoint_chmod def save_model(model_mg, md, **kwargs): margs = model_mg.get_args() args_cmd = model_mg.get_args_cmd() @@ -532,6 +532,7 @@ def save_model(model_mg, md, **kwargs): save_checkpoint(md.iteration, vp_models, None, None, 0) elif args_cmd.save_model_type == "hf": save_huggingface(args_cmd, model_mg) + update_save_checkpoint_chmod(args_cmd.save_dir) def save_huggingface(args, model): diff --git a/mindspeed_llm/training/training.py b/mindspeed_llm/training/training.py index abf010f3e..9f8d6f9f3 100644 --- a/mindspeed_llm/training/training.py +++ b/mindspeed_llm/training/training.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os import gc import sys import json @@ -64,6 +65,29 @@ from mindspeed_llm.tasks.posttrain.lora.utils import is_enable_lora _TRAIN_START_TIME = time.time() +def update_save_checkpoint_chmod(save_path, permission=0o640): + if not os.path.exists(save_path): + return False + + if not os.path.isdir(save_path): + logging.warning(f"warning:not a directory: {save_path}") + return False + + for root, dirs, files in os.walk(save_path): + for file in files: + file_path = os.path.join(root, file) + + try: + os.chmod(file_path, permission) + except PermissionError: + logging.warning(f"permission error: {file_path}") + except Exception as e: + logging.warning(f"failed to change permission: {file_path}: {e}") + + print(f"finish permission set for files in {save_path}") + return True + + def model_provider_func_wrapper(model_provider_func): @wraps(model_provider_func) def wrapper(*args, **kwargs): @@ -647,6 +671,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context=None) + update_save_checkpoint_chmod(config.save) print_datetime('exiting program after receiving SIGTERM.') exit = True break @@ -657,6 +682,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context=None) + update_save_checkpoint_chmod(config.save) saved_checkpoint = True # Exiting based on duration @@ -674,6 +700,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context=None) + update_save_checkpoint_chmod(config.save) print_datetime('exiting program after {} minutes'.format(train_time)) exit = True break @@ -685,6 +712,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context=None) + update_save_checkpoint_chmod(config.save) 
torch.distributed.barrier() print_datetime('exiting program at iteration {}'.format(iteration)) exit = True -- Gitee From 19b587841b3553eb19db0998b22ae54449c79040 Mon Sep 17 00:00:00 2001 From: jzh Date: Thu, 4 Sep 2025 20:03:47 +0800 Subject: [PATCH 2/3] [pytorch][bugfix] icsl for nltk.load --- SECURITYNOTE.md | 7 +++--- mindspeed_llm/tasks/checkpoint/saver.py | 3 ++- mindspeed_llm/training/training.py | 29 ++++++++++--------------- preprocess_data.py | 1 + 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index 87061aae5..b50b5611e 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -40,13 +40,13 @@ ## 数据安全声明 1. MindSpeed-LLM会在megatron中的checkpointing模块中保存模型文件,其中部分模型文件使用了风险模块pickle,可能存在数据风险。 - +2. 程序运行过程中,会通过nltk.load从用户指定的路径中加载语料库,需要保证网络安全,确保下载的语料包来源可信。 ## 运行安全声明 1. 建议用户结合运行资源状况编写对应训练脚本。若训练脚本与资源状况不匹配,如数据集加载内存大小超出内存容量限制、训练脚本在本地生成数据超过磁盘空间大小等情况,可能引发错误并导致进程意外退出。 2. MindSpeed-LLM内部用到了pytorch,可能会因为版本不匹配导致运行错误,具体可参考pytorch[安全声明](https://gitee.com/ascend/pytorch#%E5%AE%89%E5%85%A8%E5%A3%B0%E6%98%8E)。 -3. 出于安全考虑torch.load应该采用了weights_only=True的配置,因为影响业务正常运行,保持配置为False,并提供告警声明。 +3. 本软件使用pytorch的torch.load做模型加载,代码中存在该接口使用场景配置参数weights_only=True,对于pytorch版本<=2.5.1时,存在反序列化漏洞CVE-2025-32434,请用户保障所加载权重的安全性,避免恶意模型加载使执行机/设备遭到攻击。 4. HumanEval使用了subprocess.run,存在安全风险,为了不影响功能正常使用,做了一些安全校验规避,请用户根据需要自行构建黑名单,完善安全问题。 5. 
因为安全问题,需要设置trust_remote_code=False,无法远程加载Transformer官方仓库未支持的开源模型,如需要,请手动配置--trust-remote-code参量。 @@ -79,4 +79,5 @@ MindSpeed-LLM 暂时未发布wheel包,无正式对外公开接口,所有功 ### 特殊场景 | 场景 | 使用方法 | 端口 | 可能的风险 | | ------------------------------------- | ------------------------------------------------ | ---------- | ---------- | -| 使用MindSpeed-LLM进行训练任务时,新增32个随机端口和1个master-port端口(该端口与torch_npu的master-post端口一致) | 使用pytorch分布式训练拉起任一任务 | [1024,65520]内 | 网络配置错误可能引发端口冲突或连接问题,影响训练效率。 | \ No newline at end of file +| 使用MindSpeed-LLM进行训练任务时,新增32个随机端口和1个master-port端口(该端口与torch_npu的master-post端口一致) | 使用pytorch分布式训练拉起任一任务 | [1024,65520]内 | 网络配置错误可能引发端口冲突或连接问题,影响训练效率。 | +| 用户通过nltk.download下载语料库 | 用户在代码内部使用nltk.download来实现语料库的下载 | 随机端口 | 文件来源若不可信,在文件加载时可能存在反序列化漏洞,导致文件被篡改。 | \ No newline at end of file diff --git a/mindspeed_llm/tasks/checkpoint/saver.py b/mindspeed_llm/tasks/checkpoint/saver.py index 69b9b9aff..4e60b2a15 100644 --- a/mindspeed_llm/tasks/checkpoint/saver.py +++ b/mindspeed_llm/tasks/checkpoint/saver.py @@ -23,6 +23,7 @@ from megatron.core import mpu import megatron.core.tensor_parallel.layers as tpl from megatron.training.checkpointing import save_checkpoint +from mindspeed_llm.training.training import update_save_checkpoint_chmod from .models import get_megatron_model logger.basicConfig(format="") @@ -492,7 +493,7 @@ def set_model_rm_head(model_mg, msg, md, **kwargs): if model_mg.has_rm_head_bias(**kwargs): model_mg.set_rm_head_bias(**kwargs, data=rm_head_bias) -from mindspeed_llm.training.training import update_save_checkpoint_chmod + def save_model(model_mg, md, **kwargs): margs = model_mg.get_args() args_cmd = model_mg.get_args_cmd() diff --git a/mindspeed_llm/training/training.py b/mindspeed_llm/training/training.py index 9f8d6f9f3..328b1318e 100644 --- a/mindspeed_llm/training/training.py +++ b/mindspeed_llm/training/training.py @@ -66,26 +66,19 @@ _TRAIN_START_TIME = time.time() def update_save_checkpoint_chmod(save_path, permission=0o640): - if not 
os.path.exists(save_path): - return False - - if not os.path.isdir(save_path): - logging.warning(f"warning:not a directory: {save_path}") - return False - - for root, dirs, files in os.walk(save_path): - for file in files: - file_path = os.path.join(root, file) - - try: - os.chmod(file_path, permission) - except PermissionError: - logging.warning(f"permission error: {file_path}") - except Exception as e: - logging.warning(f"failed to change permission: {file_path}: {e}") + if os.path.exists(save_path) and os.path.isdir(save_path): + for root, _, files in os.walk(save_path): + for file in files: + file_path = os.path.join(root, file) + + try: + os.chmod(file_path, permission) + except PermissionError: + logging.warning(f"permission error: {file_path}") + except Exception as e: + logging.warning(f"failed to change permission: {file_path}: {e}") print(f"finish permission set for files in {save_path}") - return True def model_provider_func_wrapper(model_provider_func): diff --git a/preprocess_data.py b/preprocess_data.py index 8e8a709cb..cdd18eeb4 100644 --- a/preprocess_data.py +++ b/preprocess_data.py @@ -77,6 +77,7 @@ def build_splitter(args): if not nltk: logger.error("NLTK is not available to split sentences.") raise Exception("nltk is not available") + logger.warning("Warning: nltk.load() uses pickle. 
Ensure the source of the corpus is trusted.") splitter = nltk.load("tokenizers/punkt/english.pickle") if args.keep_newlines: # this prevents punkt from eating newlines after sentences -- Gitee From 4049365448f4388702ecbb4c8b160c076b4f3990 Mon Sep 17 00:00:00 2001 From: jzh Date: Wed, 10 Sep 2025 18:18:23 +0800 Subject: [PATCH 3/3] [pytorch][bugfix]remove invalid ut test --- tests/ut/checkpoint/test_checkpoint.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/ut/checkpoint/test_checkpoint.py b/tests/ut/checkpoint/test_checkpoint.py index 9d973ce00..cf2f8d5be 100644 --- a/tests/ut/checkpoint/test_checkpoint.py +++ b/tests/ut/checkpoint/test_checkpoint.py @@ -50,15 +50,6 @@ class TestCheckpoint(object): base_dir = '/data/ci/deepseek2/hf_base/deepseek2_mla-tp_hf_base' save_dir = os.path.join(self.test_config['test_deepseek2_mcore2hf_tp1pp4ep8'][0]['save-dir'], 'mg2hf') assert weight_compare(base_dir, save_dir, suffix="safetensors", use_md5=True) - shutil.rmtree(save_dir) - - def test_qwen2_moe_hf2mcore_tp2pp1ep2(self): - os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" - exit_code = run_cmd(["python3", CKPT_PYPATH] + self.test_config_cmd['test_qwen2_moe_hf2mcore_tp2pp1ep2']) - assert exit_code == 0 - base_dir = "/data/ci/qwen2_moe/mg_base/qwen2_moe_l2_t2p1e2_base" - save_dir = self.test_config['test_qwen2_moe_hf2mcore_tp2pp1ep2'][0]['save-dir'] - assert weight_compare(base_dir, save_dir) shutil.rmtree(save_dir) def test_llama2_hf2mcore_orm_pp2vpp2(self): -- Gitee