diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index 87061aae52e6cca24b6c6cad58d16f4b4fe83184..b50b5611ef5d0a990f507406359521685e7fabda 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -40,13 +40,13 @@ ## 数据安全声明 1. MindSpeed-LLM会在megatron中的checkpointing模块中保存模型文件,其中部分模型文件使用了风险模块pickle,可能存在数据风险。 - +2. 程序运行过程中,会通过nltk.load从用户指定的路径中加载语料库,需要保证网络安全,确保下载的语料包来源可信。 ## 运行安全声明 1. 建议用户结合运行资源状况编写对应训练脚本。若训练脚本与资源状况不匹配,如数据集加载内存大小超出内存容量限制、训练脚本在本地生成数据超过磁盘空间大小等情况,可能引发错误并导致进程意外退出。 2. MindSpeed-LLM内部用到了pytorch,可能会因为版本不匹配导致运行错误,具体可参考pytorch[安全声明](https://gitee.com/ascend/pytorch#%E5%AE%89%E5%85%A8%E5%A3%B0%E6%98%8E)。 -3. 出于安全考虑torch.load应该采用了weights_only=True的配置,因为影响业务正常运行,保持配置为False,并提供告警声明。 +3. 本软件使用pytorch的torch.load做模型加载,代码中存在该接口配置参数weights_only=True的使用场景;当pytorch版本<=2.5.1时,即使配置weights_only=True,仍存在反序列化漏洞CVE-2025-32434,请用户保障所加载权重的安全性,避免因加载恶意模型使执行机/设备遭到攻击。 4. HumanEval使用了subprocess.run,存在安全风险,为了不影响功能正常使用,做了一些安全校验规避,请用户根据需要自行构建黑名单,完善安全问题。 5. 因为安全问题,需要设置trust_remote_code=False,无法远程加载Transformer官方仓库未支持的开源模型,如需要,请手动配置--trust-remote-code参量。 @@ -79,4 +79,5 @@ MindSpeed-LLM 暂时未发布wheel包,无正式对外公开接口,所有功 ### 特殊场景 | 场景 | 使用方法 | 端口 | 可能的风险 | | ------------------------------------- | ------------------------------------------------ | ---------- | ---------- | -| 使用MindSpeed-LLM进行训练任务时,新增32个随机端口和1个master-port端口(该端口与torch_npu的master-post端口一致) | 使用pytorch分布式训练拉起任一任务 | [1024,65520]内 | 网络配置错误可能引发端口冲突或连接问题,影响训练效率。 | \ No newline at end of file +| 使用MindSpeed-LLM进行训练任务时,新增32个随机端口和1个master-port端口(该端口与torch_npu的master-port端口一致) | 使用pytorch分布式训练拉起任一任务 | [1024,65520]内 | 网络配置错误可能引发端口冲突或连接问题,影响训练效率。 | +| 用户通过nltk.download下载语料库 | 用户在代码内部使用nltk.download来实现语料库的下载 | 随机端口 | 文件来源若不可信,在文件加载时可能存在反序列化漏洞,导致文件被篡改。 | \ No newline at end of file diff --git a/mindspeed_llm/tasks/checkpoint/saver.py b/mindspeed_llm/tasks/checkpoint/saver.py index f2a10a1059b2a618af7accc2dda44b41669b768a..4e60b2a15aa1545db5e3836a38dfe04b7f530f39 100644 --- a/mindspeed_llm/tasks/checkpoint/saver.py +++ b/mindspeed_llm/tasks/checkpoint/saver.py @@ -23,6 +23,7 @@ 
from megatron.core import mpu import megatron.core.tensor_parallel.layers as tpl from megatron.training.checkpointing import save_checkpoint +from mindspeed_llm.training.training import update_save_checkpoint_chmod from .models import get_megatron_model logger.basicConfig(format="") @@ -532,6 +533,7 @@ def save_model(model_mg, md, **kwargs): save_checkpoint(md.iteration, vp_models, None, None, 0) elif args_cmd.save_model_type == "hf": save_huggingface(args_cmd, model_mg) + update_save_checkpoint_chmod(args_cmd.save_dir) def save_huggingface(args, model): diff --git a/mindspeed_llm/training/training.py b/mindspeed_llm/training/training.py index abf010f3e0134b408d5a7731840fb33e22e995aa..328b1318e77c7e6e717bf4b9192f17b718a1fbda 100644 --- a/mindspeed_llm/training/training.py +++ b/mindspeed_llm/training/training.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import gc import sys import json @@ -64,6 +65,22 @@ from mindspeed_llm.tasks.posttrain.lora.utils import is_enable_lora _TRAIN_START_TIME = time.time() +def update_save_checkpoint_chmod(save_path, permission=0o640): + if os.path.exists(save_path) and os.path.isdir(save_path): + for root, _, files in os.walk(save_path): + for file in files: + file_path = os.path.join(root, file) + + try: + os.chmod(file_path, permission) + except PermissionError: + logging.warning(f"permission error: {file_path}") + except Exception as e: + logging.warning(f"failed to change permission: {file_path}: {e}") + + print(f"finish permission set for files in {save_path}") + + def model_provider_func_wrapper(model_provider_func): @wraps(model_provider_func) def wrapper(*args, **kwargs): @@ -647,6 +664,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context=None) + update_save_checkpoint_chmod(config.save) print_datetime('exiting program after receiving 
SIGTERM.') exit = True break @@ -657,6 +675,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context=None) + update_save_checkpoint_chmod(config.save) saved_checkpoint = True # Exiting based on duration @@ -674,6 +693,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context=None) + update_save_checkpoint_chmod(config.save) print_datetime('exiting program after {} minutes'.format(train_time)) exit = True break @@ -685,6 +705,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context=None) + update_save_checkpoint_chmod(config.save) torch.distributed.barrier() print_datetime('exiting program at iteration {}'.format(iteration)) exit = True diff --git a/preprocess_data.py b/preprocess_data.py index 8e8a709cbdd4d80f7458254f2a68ac86b2cb3a75..cdd18eeb4c48c2cd6f4801e4dc9fd9c349f9f72c 100644 --- a/preprocess_data.py +++ b/preprocess_data.py @@ -77,6 +77,7 @@ def build_splitter(args): if not nltk: logger.error("NLTK is not available to split sentences.") raise Exception("nltk is not available") + logger.warning("Warning: nltk.load() uses pickle. 
Ensure the source of the corpus is trusted.") splitter = nltk.load("tokenizers/punkt/english.pickle") if args.keep_newlines: # this prevents punkt from eating newlines after sentences diff --git a/tests/ut/checkpoint/test_checkpoint.py b/tests/ut/checkpoint/test_checkpoint.py index 9d973ce00c5d8e5129f82745b690ac0f1717e40b..cf2f8d5be33199bead01ab25ce478537086a32bd 100644 --- a/tests/ut/checkpoint/test_checkpoint.py +++ b/tests/ut/checkpoint/test_checkpoint.py @@ -50,15 +50,6 @@ class TestCheckpoint(object): base_dir = '/data/ci/deepseek2/hf_base/deepseek2_mla-tp_hf_base' save_dir = os.path.join(self.test_config['test_deepseek2_mcore2hf_tp1pp4ep8'][0]['save-dir'], 'mg2hf') assert weight_compare(base_dir, save_dir, suffix="safetensors", use_md5=True) - shutil.rmtree(save_dir) - - def test_qwen2_moe_hf2mcore_tp2pp1ep2(self): - os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" - exit_code = run_cmd(["python3", CKPT_PYPATH] + self.test_config_cmd['test_qwen2_moe_hf2mcore_tp2pp1ep2']) - assert exit_code == 0 - base_dir = "/data/ci/qwen2_moe/mg_base/qwen2_moe_l2_t2p1e2_base" - save_dir = self.test_config['test_qwen2_moe_hf2mcore_tp2pp1ep2'][0]['save-dir'] - assert weight_compare(base_dir, save_dir) shutil.rmtree(save_dir) def test_llama2_hf2mcore_orm_pp2vpp2(self):