From a8f5dff502053a45e1320e3c023a759f595ebd93 Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Mon, 18 Aug 2025 19:52:00 +0800 Subject: [PATCH 1/8] [bugfix]add parameter path security verification --- mindspeed_mm/configs/config.py | 197 +++++++++++++++++++++++++++++++-- 1 file changed, 189 insertions(+), 8 deletions(-) diff --git a/mindspeed_mm/configs/config.py b/mindspeed_mm/configs/config.py index 6b4f915f..c1417c71 100644 --- a/mindspeed_mm/configs/config.py +++ b/mindspeed_mm/configs/config.py @@ -1,15 +1,17 @@ import os import json +from functools import wraps from mindspeed_mm.utils.utils import get_dtype class ConfigReader: - """ + """ read_config read json file dict processed by MMconfig and convert to class attributes, besides, read_config support to convert dict for specific purposes. """ + def __init__(self, config_dict: dict) -> None: for k, v in config_dict.items(): if k == "dtype": @@ -18,7 +20,7 @@ class ConfigReader: self.__dict__[k] = ConfigReader(v) else: self.__dict__[k] = v - + def to_dict(self) -> dict: ret = {} for k, v in self.__dict__.items(): @@ -27,7 +29,7 @@ class ConfigReader: else: ret[k] = v return ret - + def __repr__(self) -> str: for k, v in self.__dict__.items(): if isinstance(v, self.__class__): @@ -58,25 +60,28 @@ class ConfigReader: class MMConfig: - """ - MMconfig + """ + MMconfig input: a dict of json path """ + def __init__(self, json_files: dict) -> None: for json_name, json_path in json_files.items(): if os.path.exists(json_path): real_path = os.path.realpath(json_path) config_dict = self.read_json(real_path) setattr(self, json_name, ConfigReader(config_dict)) - + else: + raise Exception("{} don't exist".format(json_name)) + @staticmethod def read_json(json_path): with open(json_path, mode="r") as f: json_file = f.read() config_dict = json.loads(json_file) return config_dict - - + + def _add_mm_args(parser): group = parser.add_argument_group(title="multimodel") group.add_argument("--mm-data", type=str, default="") @@ -90,9 +95,185 @@ def mm_extra_args_provider(parser): return parser +def merge_mm_args_decorator(merge_mm_args): + called = False + + @wraps(merge_mm_args) + def wrapper(args): + merge_mm_args(args) + nonlocal called + if not called: + args_external_path_checker(args) + called = True + return wrapper + + +@merge_mm_args_decorator def merge_mm_args(args): if not hasattr(args, "mm"): setattr(args, "mm", object) json_files = {"model": args.mm_model, "data": args.mm_data, "tool": args.mm_tool} args.mm = MMConfig(json_files) + +def args_external_path_checker(args): + """ + Verify the security of all file path parameters in 3 code repositories:mindspeed-mm,mindspeed,megatron + and 3 json file:mm_data.json,mm_model.json,mm_tool.json + """ + # args from mindspeed_mm + mindspeed_mm_params = ['load_base_model', "mm_data", "mm_tool", "mm_model"] + for param in mindspeed_mm_params: + if hasattr(args, param) and getattr(args, param): + file_legality_checker(getattr(args, param), param) + + # args from mindspeed + mindspeed_param = ['auto_tuning_work_dir', "profile_save_path", "tokenizer_name_or_path", "additional_config", + "layerzero_config", "prof_file"] + for param in mindspeed_param: + if hasattr(args, param) and getattr(args, param): + file_legality_checker(getattr(args, param), param) + + # args from megatron + megatron_param = ["tensorboard_dir", "save", "load", "pretrained_checkpoint", "data_cache_path", "merge_file", + "s3_cache_path", "ict_load", "bert_load", "titles_data_path", "evidence_data_path", + "block_data_path", "embedding_path", "yaml_cfg"] + for param in megatron_param: + if hasattr(args, param) and getattr(args, param): + file_legality_checker(getattr(args, param), param) + + # These parameters may have the following format:weight path weight path + megatron_special_params = ["data_path", "train_data_path", "valid_data_path", "test_data_path"] + for param in megatron_special_params: + if hasattr(args, param) and getattr(args, param): + file_list = split_param(param) + for path in file_list: + file_legality_checker(path, param) + + # arge from MM_ModeL + MM_model_params = ["text_encoder.from_pretrained", "text_enceder.template_file_path", "text_encoder.ckpt_path", + "image_encoder.vision_encoder.ckpt_path", "image_encoder.vision_projector.ckpt_path", + "ae.from_pretrained", "ae.from_pretrained_3dvae_ckpt", "as.i2v_processor.processor_path", + "ae.i2v_processor.image_eneoder", "dpo.histgram_path", + "tokenizer.from_pretrained", "tokenizer.template_file_path", "predictor.from_pretrained", + "discriminator.perceptual_from_pretrained", "save_path", "prompt", "video", "image", + "image_path", "file_path", "image_processer_path", "from_pretrained", + "conditional_pixel_values_path", "ckpt_path", "eval_config.dataset.basic_parm.data_path", + "eval_config.dataset.basic_parm.data_folder", "eval_config.dataset.extra_param.prompt_file", + "eval_config.dataset.extra_param.augmented_prompt_file", "eval_config.eval_result_path", + "eval_config.image_path", "eval_config.long_eval_config", + "result_output_path", "dataset_path", "evaluation_dataset"] + if args.mm.model: + for param in MM_model_params: + values = get_ConfigReader_value(args.mm.model, param) + for value in values: + if value: + file_legality_checker(value, param) + + # args from MM_Data + MM_data_params = ["dataset_param.basic_parameters.data_path", "dataset_param.basic_parameters.data_folder", + "dataset_param.basic_parameters.dataset_dir", "dataset_param.basic_parameters.dataset", + "dataset_param.basic_parameters.cache_dir", "dataset_param.tokenizer_config.from_pretrained", + "dataset_param.tokenizer_config.template_file_path", "dataset_param.processor_path", + "dataset_param.preprocess_parameters.model_name_or_path", + "dataset_param.preprocess_parameters.processor_name_or_path", + "dataset_param.video_folder", "dataloader_param.collate_param.processor_name_or_path"] + if args.mm.data: + for param in MM_data_params: + values = get_ConfigReader_value(args.mm.data, param) + for value in values: + if value: + file_legality_checker(value, param) + + # args from MM_Tool + MM_tool_params = ["profile.static_param.save_path", "profile.static_param.dynamic_param", + "memory_profile.save_path", "sorafeature.save_path"] + if args.mm.tool: + for param in MM_tool_params: + values = get_ConfigReader_value(args.mm.tool, param) + for value in values: + if value: + file_legality_checker(value, param) + + +def file_legality_checker(file_path, param_name, base_dir=None): + """ + Perform soft link and path traversal checks on file path + """ + if not base_dir: + base_dir = os.getcwd() + + # check file exist + try: + if not os.path.exists(file_path): + return False + except OSError: + return False + + # check symbolic link + from mindspeed_mm.utils.security_utils.validate_path import normalize_path + try: + norm_path, is_link = normalize_path(file_path) + if is_link: + print( + "WARNING: [{}] {} is a symbolic link.It's normalize path is {}".format(param_name, file_path, + norm_path)) + return False + except OSError: + return False + + # check path crossing + try: + # get absolute file path + norm_path = os.path.realpath(file_path) + # get absolute base dir path + base_directory = os.path.abspath(base_dir) + if not norm_path.startswith(base_directory): + print("WARNING: [{}] {} attempts to traverse to an disallowed directory".format(param_name, file_path)) + return False + except OSError: + return False + + return True + + +def split_param(param): + """ + Segment some special parameters in megatron + """ + + def is_number(s): + if isinstance(s, str): + s = s.strip() + try: + float(s) + return True + except (ValueError, TypeError): + return False + + param_list = param.split(" ") + if len(param_list) == 1: + return param_list + else: + if is_number(param_list[0]): + return [param_list[2 * i] for i in range(len(param_list) // 2)] + else: + return param_list + + +def get_ConfigReader_value(config, param): + objs = [config.to_dict()] + for key in param.split("."): + new_objs = [] + for obj in objs: + if key in obj: + if isinstance(obj[key], list): + new_objs.extend(obj[key]) + else: + new_objs.append(obj[key]) + if new_objs: + objs = new_objs + else: + return None + + return objs -- Gitee From 97fd966f5ee0cb07b8fa921782c0f94be9baf5d1 Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Mon, 18 Aug 2025 20:45:29 +0800 Subject: [PATCH 2/8] [bugfix]add parameter path security verification --- mindspeed_mm/configs/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mindspeed_mm/configs/config.py b/mindspeed_mm/configs/config.py index c1417c71..9ddeae68 100644 --- a/mindspeed_mm/configs/config.py +++ b/mindspeed_mm/configs/config.py @@ -95,12 +95,12 @@ def mm_extra_args_provider(parser): return parser -def merge_mm_args_decorator(merge_mm_args): +def merge_mm_args_decorator(func): called = False - @wraps(merge_mm_args) + @wraps(func) def wrapper(args): - merge_mm_args(args) + func(args) nonlocal called if not called: args_external_path_checker(args) -- Gitee From 31182f4af51f2d2da19d936ddf50f8f916008616 Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Tue, 19 Aug 2025 10:02:51 +0800 Subject: [PATCH 3/8] [bugfix]add parameter path security verification --- mindspeed_mm/configs/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mindspeed_mm/configs/config.py b/mindspeed_mm/configs/config.py index 9ddeae68..3fd3a721 100644 --- a/mindspeed_mm/configs/config.py +++ b/mindspeed_mm/configs/config.py @@ -267,6 +267,8 @@ def get_ConfigReader_value(config, param): new_objs = [] for obj in objs: if key in obj: + if not obj[key]: + continue if isinstance(obj[key], list): new_objs.extend(obj[key]) else: -- Gitee From 8ce536f4a312e315bb446bdb748b86c3f9d0d8f3 Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Tue, 19 Aug 2025 10:51:15 +0800 Subject: [PATCH 4/8] [bugfix]add parameter path security verification --- mindspeed_mm/configs/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspeed_mm/configs/config.py b/mindspeed_mm/configs/config.py index 3fd3a721..55c168e5 100644 --- a/mindspeed_mm/configs/config.py +++ b/mindspeed_mm/configs/config.py @@ -276,6 +276,6 @@ def get_ConfigReader_value(config, param): if new_objs: objs = new_objs else: - return None + return [] return objs -- Gitee From eaf1a63c6fb4743900e3b6387771a987ad2f26ce Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Tue, 19 Aug 2025 11:20:11 +0800 Subject: [PATCH 5/8] [bugfix]add parameter path security verification --- mindspeed_mm/configs/config.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mindspeed_mm/configs/config.py b/mindspeed_mm/configs/config.py index 55c168e5..0ec3c6c9 100644 --- a/mindspeed_mm/configs/config.py +++ b/mindspeed_mm/configs/config.py @@ -103,7 +103,11 @@ def merge_mm_args_decorator(func): func(args) nonlocal called if not called: - args_external_path_checker(args) + try: + args_external_path_checker(args) + except Exception as e: + print(e) + raise e called = True return wrapper -- Gitee From 289b3ec373f0cc13ec636a774dc679f12acc653b Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Tue, 19 Aug 2025 11:38:37 +0800 Subject: [PATCH 6/8] [bugfix]add parameter path security verification --- mindspeed_mm/configs/config.py | 38 ++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/mindspeed_mm/configs/config.py b/mindspeed_mm/configs/config.py index 0ec3c6c9..80740db2 100644 --- a/mindspeed_mm/configs/config.py +++ b/mindspeed_mm/configs/config.py @@ -103,11 +103,7 @@ def merge_mm_args_decorator(func): func(args) nonlocal called if not called: - try: - args_external_path_checker(args) - except Exception as e: - print(e) - raise e + args_external_path_checker(args) called = True return wrapper @@ -267,19 +263,25 @@ def split_param(param): def get_ConfigReader_value(config, param): objs = [config.to_dict()] - for key in param.split("."): - new_objs = [] - for obj in objs: - if key in obj: - if not obj[key]: + try: + for key in param.split("."): + new_objs = [] + for obj in objs: + if not obj: continue - if isinstance(obj[key], list): - new_objs.extend(obj[key]) - else: - new_objs.append(obj[key]) - if new_objs: - objs = new_objs - else: - return [] + if key in obj: + if not obj[key]: + continue + if isinstance(obj[key], list): + new_objs.extend(obj[key]) + else: + new_objs.append(obj[key]) + if new_objs: + objs = new_objs + else: + return [] + except Exception as e: + print(param) + print(e) return objs -- Gitee From 9e33dc316884467dfd3d761e37c7cda9d53a24ea Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Tue, 19 Aug 2025 14:26:01 +0800 Subject: [PATCH 7/8] [bugfix]add parameter path security verification --- mindspeed_mm/configs/config.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mindspeed_mm/configs/config.py b/mindspeed_mm/configs/config.py index 80740db2..1a39bc36 100644 --- a/mindspeed_mm/configs/config.py +++ b/mindspeed_mm/configs/config.py @@ -103,7 +103,10 @@ def merge_mm_args_decorator(func): func(args) nonlocal called if not called: - args_external_path_checker(args) + try: + args_external_path_checker(args) + except Exception as e: + print(e) called = True return wrapper @@ -283,5 +286,6 @@ def get_ConfigReader_value(config, param): except Exception as e: print(param) print(e) + return [] return objs -- Gitee From 9195fd3fb85ac6c1c7ae499a6a6dd686d93cb7cf Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Tue, 19 Aug 2025 15:03:00 +0800 Subject: [PATCH 8/8] [bugfix]add parameter path security verification --- mindspeed_mm/configs/config.py | 40 ++++++++++++++-------------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/mindspeed_mm/configs/config.py b/mindspeed_mm/configs/config.py index 1a39bc36..cb8f30da 100644 --- a/mindspeed_mm/configs/config.py +++ b/mindspeed_mm/configs/config.py @@ -103,10 +103,7 @@ def merge_mm_args_decorator(func): func(args) nonlocal called if not called: - try: - args_external_path_checker(args) - except Exception as e: - print(e) + args_external_path_checker(args) called = True return wrapper @@ -266,26 +263,21 @@ def split_param(param): def get_ConfigReader_value(config, param): objs = [config.to_dict()] - try: - for key in param.split("."): - new_objs = [] - for obj in objs: - if not obj: + for key in param.split("."): + new_objs = [] + for obj in objs: + if not obj: + continue + if key in obj: + if not obj[key]: continue - if key in obj: - if not obj[key]: - continue - if isinstance(obj[key], list): - new_objs.extend(obj[key]) - else: - new_objs.append(obj[key]) - if new_objs: - objs = new_objs - else: - return [] - except Exception as e: - print(param) - print(e) - return [] + if isinstance(obj[key], list): + new_objs.extend(obj[key]) + else: + new_objs.append(obj[key]) + if new_objs: + objs = new_objs + else: + return [] return objs -- Gitee