diff --git a/codecheck_toolkits/pyproject.toml b/codecheck_toolkits/pyproject.toml index bfd35cafc290569641f84be000eca2df68e8dbb5..16ce966af552e4669c43cc8a5b4555ae8df90b42 100644 --- a/codecheck_toolkits/pyproject.toml +++ b/codecheck_toolkits/pyproject.toml @@ -69,7 +69,6 @@ line-length = 80 "vllm_mindspore/__init__.py" = ["E402"] "vllm_mindspore/entrypoints/__main__.py" = ["F401", "E402"] "tests/**/*.py" = ["F401", "E402"] -"tests/st/python/cases_parallel/*.py" = ["F401", "E402"] "setup.py" = ["ERA001"] "vllm_mindspore/collocation/collocator.py" = ["ERA001"] diff --git a/tests/st/python/cases_parallel/__init__.py b/tests/st/python/chunk_prefill/__init__.py similarity index 100% rename from tests/st/python/cases_parallel/__init__.py rename to tests/st/python/chunk_prefill/__init__.py diff --git a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_chunk_prefill.py b/tests/st/python/chunk_prefill/test_vllm_mf_qwen_7b_chunk_prefill.py similarity index 83% rename from tests/st/python/cases_parallel/vllm_mf_qwen_7b_chunk_prefill.py rename to tests/st/python/chunk_prefill/test_vllm_mf_qwen_7b_chunk_prefill.py index ffb30ce93e28fac171f914978d95b670a45c02d1..d9f9da9116817594cedf79ec726d0dbeb50495bb 100644 --- a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_chunk_prefill.py +++ b/tests/st/python/chunk_prefill/test_vllm_mf_qwen_7b_chunk_prefill.py @@ -19,14 +19,10 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -45,9 +41,15 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_mf_qwen_7b_chunk_prefill(): """ - test case qwen_7b_chunk_prefill + Test Summary: + test case qwen_7b_chunk_prefill + Expected Result: + Running successfully, the request result meets expectations. + Model Info: + Qwen2.5-7B-Instruct """ import vllm_mindspore from vllm import LLM, SamplingParams @@ -83,15 +85,14 @@ def test_mf_qwen_7b_chunk_prefill(): sampling_params = SamplingParams(temperature=0.0, max_tokens=32, top_k=1) # Create an LLM. - llm = LLM( - model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", - max_model_len=8192, - max_num_seqs=16, - max_num_batched_tokens=32, - block_size=32, - gpu_memory_utilization=0.9, - tensor_parallel_size=2, - enable_chunked_prefill=True) + llm = LLM(model=MODEL_PATH["Qwen2.5-7B-Instruct"], + max_model_len=8192, + max_num_seqs=16, + max_num_batched_tokens=32, + block_size=32, + gpu_memory_utilization=0.9, + tensor_parallel_size=2, + enable_chunked_prefill=True) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. 
for batch_data in batch_datas: diff --git a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_chunk_prefill_v1.py b/tests/st/python/chunk_prefill/test_vllm_mf_qwen_7b_chunk_prefill_v1.py similarity index 84% rename from tests/st/python/cases_parallel/vllm_mf_qwen_7b_chunk_prefill_v1.py rename to tests/st/python/chunk_prefill/test_vllm_mf_qwen_7b_chunk_prefill_v1.py index 5d80b0a0bfd82f571d54b2d780d0eb4dc02e8ae2..cbd2c56de88482ad36cf0e9d6bf2805217d92896 100644 --- a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_chunk_prefill_v1.py +++ b/tests/st/python/chunk_prefill/test_vllm_mf_qwen_7b_chunk_prefill_v1.py @@ -19,14 +19,10 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -44,9 +40,15 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_mf_qwen_7b_chunk_prefill(): """ - test case qwen_7b_chunk_prefill + Test Summary: + test case qwen_7b_chunk_prefill in v1 mode + Expected Result: + Running successfully, the request result meets expectations. + Model Info: + Qwen2.5-7B-Instruct """ import vllm_mindspore from vllm import LLM, SamplingParams @@ -82,14 +84,13 @@ def test_mf_qwen_7b_chunk_prefill(): sampling_params = SamplingParams(temperature=0.0, max_tokens=32, top_k=1) # Create an LLM. - llm = LLM( - model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", - max_model_len=8192, - max_num_seqs=16, - max_num_batched_tokens=32, - block_size=32, - gpu_memory_utilization=0.85, - tensor_parallel_size=2) + llm = LLM(model=MODEL_PATH["Qwen2.5-7B-Instruct"], + max_model_len=8192, + max_num_seqs=16, + max_num_batched_tokens=32, + block_size=32, + gpu_memory_utilization=0.85, + tensor_parallel_size=2) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. 
for batch_data in batch_datas: diff --git a/tests/st/python/config/predict_deepseek_r1_671b.yaml b/tests/st/python/config/predict_deepseek_r1_671b.yaml deleted file mode 100644 index a4d055709ff091ca68316d8f9a0604d17cf108ca..0000000000000000000000000000000000000000 --- a/tests/st/python/config/predict_deepseek_r1_671b.yaml +++ /dev/null @@ -1,117 +0,0 @@ -seed: 0 -output_dir: './output' # path to save checkpoint/strategy -run_mode: 'predict' -use_parallel: True - -load_checkpoint: "/path/to/deepseekr1/model_ckpt" -load_ckpt_format: "safetensors" -auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model - -# trainer config -trainer: - type: CausalLanguageModelingTrainer - model_name: 'DeepSeekR1' - -# default parallel of device num = 32 for Atlas 800T A2 -parallel_config: - model_parallel: 4 - pipeline_stage: 1 - expert_parallel: 1 - vocab_emb_dp: False - -# mindspore context init config -context: - mode: 0 # 0--Graph Mode; 1--Pynative Mode - max_device_memory: "61GB" - device_id: 0 - affinity_cpu_list: None - -# parallel context config -parallel: - parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend - full_batch: False - strategy_ckpt_save_file: "./ckpt_strategy.ckpt" - -# model config -model: - model_config: - type: DeepseekV3Config - auto_register: deepseek3_config.DeepseekV3Config - batch_size: 1 # add for incre predict - seq_length: 4096 - hidden_size: 7168 - num_layers: 4 - num_heads: 128 - max_position_embeddings: 163840 - intermediate_size: 18432 - kv_lora_rank: 512 - q_lora_rank: 1536 - qk_rope_head_dim: 64 - v_head_dim: 128 - qk_nope_head_dim: 128 - vocab_size: 129280 - multiple_of: 256 - rms_norm_eps: 1.0e-6 - bos_token_id: 0 - eos_token_id: 1 - pad_token_id: 1 - ignore_token_id: -100 - compute_dtype: "bfloat16" - layernorm_compute_type: "bfloat16" - softmax_compute_type: "bfloat16" - rotary_dtype: "bfloat16" - router_dense_type: "bfloat16" - param_init_type: "bfloat16" - scaling_factor: - beta_fast: 32.0 - beta_slow: 1.0 - factor: 40.0 - mscale: 1.0 - mscale_all_dim: 1.0 - original_max_position_embeddings: 4096 - use_past: True - extend_method: "YARN" - use_flash_attention: True - block_size: 16 - num_blocks: 512 - offset: 0 - checkpoint_name_or_path: "" - repetition_penalty: 1 - max_decode_length: 1024 - top_k: 1 - top_p: 1 - theta: 10000.0 - do_sample: False - is_dynamic: True - qkv_concat: False - ffn_concat: True - auto_map: - AutoConfig: deepseek3_config.DeepseekV3Config - AutoModel: deepseek3.DeepseekV3ForCausalLM - arch: - type: DeepseekV3ForCausalLM - auto_register: deepseek3.DeepseekV3ForCausalLM - -moe_config: - expert_num: 256 - num_experts_chosen: 8 - routing_policy: "TopkRouterV2" - shared_expert_num: 1 - routed_scaling_factor: 2.5 - first_k_dense_replace: 3 - moe_intermediate_size: 2048 - topk_group: 4 - n_group: 8 - -processor: - return_tensors: ms - tokenizer: - unk_token: '' - bos_token: '<|begin▁of▁sentence|>' - eos_token: '<|end▁of▁sentence|>' - pad_token: '<|end▁of▁sentence|>' - type: LlamaTokenizerFast - vocab_file: '/path/to/deepseekr1/tokenizer.json' - tokenizer_file: '/path/to/deepseekr1/tokenizer.json' - chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + 
message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" - type: LlamaProcessor diff --git a/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml b/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml deleted file mode 100644 index 239adc121d4f975d91a195b6c4dfba2144e1cde2..0000000000000000000000000000000000000000 --- a/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml +++ /dev/null @@ -1,122 +0,0 @@ -seed: 0 -output_dir: './output' # path to save checkpoint/strategy -run_mode: 'predict' -use_parallel: True - -load_checkpoint: "/path/to/deepseekr1/model_w8a8_ckpt" -load_ckpt_format: "safetensors" -auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model - -# trainer config -trainer: - type: CausalLanguageModelingTrainer - model_name: 'DeepSeekR1-W8A8' - -# default parallel of device num = 16 for Atlas 800T A2 -parallel_config: - data_parallel: 1 - model_parallel: 16 - pipeline_stage: 1 - expert_parallel: 1 - vocab_emb_dp: False - -# mindspore context init config -context: - mode: 0 # 0--Graph Mode; 1--Pynative Mode - max_device_memory: "58GB" - device_id: 0 - affinity_cpu_list: None - -# parallel context config -parallel: - parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend - full_batch: False - strategy_ckpt_save_file: "./ckpt_strategy.ckpt" - -# model config -model: - model_config: - type: DeepseekV3Config - auto_register: deepseek3_config.DeepseekV3Config - batch_size: 1 # add for incre predict - seq_length: 4096 - hidden_size: 7168 - num_layers: 4 - num_heads: 128 - max_position_embeddings: 163840 - intermediate_size: 18432 - kv_lora_rank: 512 - q_lora_rank: 1536 - qk_rope_head_dim: 64 - v_head_dim: 128 - qk_nope_head_dim: 128 - vocab_size: 129280 - multiple_of: 256 
- rms_norm_eps: 1.0e-6 - bos_token_id: 0 - eos_token_id: 1 - pad_token_id: 1 - ignore_token_id: -100 - compute_dtype: "bfloat16" - layernorm_compute_type: "bfloat16" - softmax_compute_type: "bfloat16" - rotary_dtype: "bfloat16" - router_dense_type: "bfloat16" - param_init_type: "bfloat16" - scaling_factor: - beta_fast: 32.0 - beta_slow: 1.0 - factor: 40.0 - mscale: 1.0 - mscale_all_dim: 1.0 - original_max_position_embeddings: 4096 - use_past: True - extend_method: "YARN" - use_flash_attention: True - block_size: 16 - num_blocks: 512 - offset: 0 - checkpoint_name_or_path: "" - repetition_penalty: 1 - max_decode_length: 1024 - top_k: 1 - top_p: 1 - theta: 10000.0 - do_sample: False - is_dynamic: True - qkv_concat: False - ffn_concat: True - quantization_config: - quant_method: 'ptq' - weight_dtype: 'int8' - activation_dtype: 'int8' - auto_map: - AutoConfig: deepseek3_config.DeepseekV3Config - AutoModel: deepseek3.DeepseekV3ForCausalLM - arch: - type: DeepseekV3ForCausalLM - auto_register: deepseek3.DeepseekV3ForCausalLM - -moe_config: - expert_num: 256 - num_experts_chosen: 8 - routing_policy: "TopkRouterV2" - shared_expert_num: 1 - routed_scaling_factor: 2.5 - first_k_dense_replace: 3 - moe_intermediate_size: 2048 - topk_group: 4 - n_group: 8 - -processor: - return_tensors: ms - tokenizer: - unk_token: '' - bos_token: '<|begin▁of▁sentence|>' - eos_token: '<|end▁of▁sentence|>' - pad_token: '<|end▁of▁sentence|>' - type: LlamaTokenizerFast - vocab_file: '/path/to/deepseekr1/tokenizer.json' - tokenizer_file: '/path/to/deepseekr1/tokenizer.json' - chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + 
'<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" - type: LlamaProcessor diff --git a/tests/st/python/config/predict_deepseek_r1_671b_w8a8_osl.yaml b/tests/st/python/config/predict_deepseek_r1_671b_w8a8_osl.yaml deleted file mode 100644 index e8d310471e5665d6ae063d698a4da0042f61f0e8..0000000000000000000000000000000000000000 --- a/tests/st/python/config/predict_deepseek_r1_671b_w8a8_osl.yaml +++ /dev/null @@ -1,125 +0,0 @@ -seed: 0 -output_dir: './output' # path to save checkpoint/strategy -run_mode: 'predict' -use_parallel: True - -load_checkpoint: "/path/to/deepseekr1/model_w8a8_osl_ckpt" -load_ckpt_format: "safetensors" -auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model - -# trainer config -trainer: - type: CausalLanguageModelingTrainer - model_name: 'DeepSeekR1-W8A8' - -# default parallel of device num = 16 for Atlas 800T A2 -parallel_config: - model_parallel: 16 - pipeline_stage: 1 - expert_parallel: 1 - vocab_emb_dp: False - -# mindspore context init config -context: - mode: 0 # 0--Graph Mode; 1--Pynative Mode - max_device_memory: "61GB" - device_id: 0 - affinity_cpu_list: None - -kernel_launch_group: - thread_num: 4 - kernel_group_num: 16 - -# parallel context config -parallel: - parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend - full_batch: False - strategy_ckpt_save_file: "./ckpt_strategy.ckpt" - -# model config -model: - model_config: - type: DeepseekV3Config - auto_register: deepseek3_config.DeepseekV3Config - batch_size: 1 # add for incre predict - seq_length: 4096 - hidden_size: 7168 - num_layers: 4 - num_heads: 128 - max_position_embeddings: 163840 - intermediate_size: 18432 - kv_lora_rank: 512 - q_lora_rank: 1536 - qk_rope_head_dim: 64 - v_head_dim: 128 - qk_nope_head_dim: 128 - vocab_size: 129280 - multiple_of: 256 - rms_norm_eps: 1.0e-6 - bos_token_id: 0 - eos_token_id: 1 - pad_token_id: 1 - ignore_token_id: -100 - compute_dtype: "bfloat16" - layernorm_compute_type: "bfloat16" - softmax_compute_type: "bfloat16" - rotary_dtype: "bfloat16" - router_dense_type: "bfloat16" - param_init_type: "bfloat16" - scaling_factor: - beta_fast: 32.0 - beta_slow: 1.0 - factor: 40.0 - mscale: 1.0 - mscale_all_dim: 1.0 - original_max_position_embeddings: 4096 - use_past: True - extend_method: "YARN" - use_flash_attention: True - block_size: 16 - num_blocks: 512 - offset: 0 - checkpoint_name_or_path: "" - repetition_penalty: 1 - max_decode_length: 1024 - top_k: 1 - top_p: 1 - theta: 10000.0 - do_sample: False - is_dynamic: True - qkv_concat: True - ffn_concat: True - quantization_config: - quant_method: 'osl' - weight_dtype: 'int8' - activation_dtype: 'int8' - auto_map: - AutoConfig: deepseek3_config.DeepseekV3Config - AutoModel: deepseek3.DeepseekV3ForCausalLM - arch: - type: DeepseekV3ForCausalLM - auto_register: deepseek3.DeepseekV3ForCausalLM - -moe_config: - expert_num: 256 - num_experts_chosen: 8 - routing_policy: "TopkRouterV2" - shared_expert_num: 1 - routed_scaling_factor: 2.5 - first_k_dense_replace: 3 - moe_intermediate_size: 2048 - topk_group: 4 - n_group: 8 - -processor: - return_tensors: ms - tokenizer: - unk_token: '' - bos_token: '<|begin▁of▁sentence|>' - eos_token: '<|end▁of▁sentence|>' - pad_token: '<|end▁of▁sentence|>' - type: LlamaTokenizerFast - vocab_file: '/path/to/deepseekr1/tokenizer.json' - tokenizer_file: 
'/path/to/deepseekr1/tokenizer.json' - chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" - type: LlamaProcessor diff --git a/tests/st/python/config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml b/tests/st/python/config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml deleted file mode 100644 index f8984e0fde2c898cb54663180ede3495f3608e78..0000000000000000000000000000000000000000 --- a/tests/st/python/config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml +++ /dev/null @@ -1,125 +0,0 @@ -seed: 0 -output_dir: './output' # path to save checkpoint/strategy -run_mode: 'predict' -use_parallel: True - -load_checkpoint: "/path/to/deepseekr1/model_w8a8_smoothquant_ckpt" -load_ckpt_format: "safetensors" -auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model - -# trainer config -trainer: - type: CausalLanguageModelingTrainer - model_name: 'DeepSeekR1-W8A8' - -# default parallel of device num = 16 for Atlas 800T A2 -parallel_config: - model_parallel: 16 - pipeline_stage: 1 - expert_parallel: 1 - vocab_emb_dp: False - -# mindspore context init config -context: - mode: 0 # 0--Graph Mode; 1--Pynative Mode - max_device_memory: "61GB" - device_id: 0 - affinity_cpu_list: None - -kernel_launch_group: - thread_num: 4 - kernel_group_num: 16 - -# parallel context config -parallel: - parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend - full_batch: 
False - strategy_ckpt_save_file: "./ckpt_strategy.ckpt" - -# model config -model: - model_config: - type: DeepseekV3Config - auto_register: deepseek3_config.DeepseekV3Config - batch_size: 1 # add for incre predict - seq_length: 4096 - hidden_size: 7168 - num_layers: 4 - num_heads: 128 - max_position_embeddings: 163840 - intermediate_size: 18432 - kv_lora_rank: 512 - q_lora_rank: 1536 - qk_rope_head_dim: 64 - v_head_dim: 128 - qk_nope_head_dim: 128 - vocab_size: 129280 - multiple_of: 256 - rms_norm_eps: 1.0e-6 - bos_token_id: 0 - eos_token_id: 1 - pad_token_id: 1 - ignore_token_id: -100 - compute_dtype: "bfloat16" - layernorm_compute_type: "bfloat16" - softmax_compute_type: "bfloat16" - rotary_dtype: "bfloat16" - router_dense_type: "bfloat16" - param_init_type: "bfloat16" - scaling_factor: - beta_fast: 32.0 - beta_slow: 1.0 - factor: 40.0 - mscale: 1.0 - mscale_all_dim: 1.0 - original_max_position_embeddings: 4096 - use_past: True - extend_method: "YARN" - use_flash_attention: True - block_size: 16 - num_blocks: 512 - offset: 0 - checkpoint_name_or_path: "" - repetition_penalty: 1 - max_decode_length: 1024 - top_k: 1 - top_p: 1 - theta: 10000.0 - do_sample: False - is_dynamic: True - qkv_concat: True - ffn_concat: True - quantization_config: - quant_method: 'smoothquant' - weight_dtype: 'int8' - activation_dtype: 'int8' - auto_map: - AutoConfig: deepseek3_config.DeepseekV3Config - AutoModel: deepseek3.DeepseekV3ForCausalLM - arch: - type: DeepseekV3ForCausalLM - auto_register: deepseek3.DeepseekV3ForCausalLM - -moe_config: - expert_num: 256 - num_experts_chosen: 8 - routing_policy: "TopkRouterV2" - shared_expert_num: 1 - routed_scaling_factor: 2.5 - first_k_dense_replace: 3 - moe_intermediate_size: 2048 - topk_group: 4 - n_group: 8 - -processor: - return_tensors: ms - tokenizer: - unk_token: '' - bos_token: '<|begin▁of▁sentence|>' - eos_token: '<|end▁of▁sentence|>' - pad_token: '<|end▁of▁sentence|>' - type: LlamaTokenizerFast - vocab_file: '/path/to/deepseekr1/tokenizer.json' - tokenizer_file: '/path/to/deepseekr1/tokenizer.json' - chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool 
%}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" - type: LlamaProcessor diff --git a/tests/st/python/config/predict_qwen2_5_7b_instruct.yaml b/tests/st/python/config/predict_qwen2_5_7b_instruct.yaml deleted file mode 100644 index 821e33f5d713b248adc4ff68ee942793b78e5315..0000000000000000000000000000000000000000 --- a/tests/st/python/config/predict_qwen2_5_7b_instruct.yaml +++ /dev/null @@ -1,126 +0,0 @@ -seed: 0 -output_dir: './output' # path to save checkpoint/strategy -load_checkpoint: '' -src_strategy_path_or_dir: '' -auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model -only_save_strategy: False -resume_training: False -use_parallel: False -run_mode: 'predict' - -# trainer config -trainer: - type: CausalLanguageModelingTrainer - model_name: 'qwen2_5_7b' - -# runner config -runner_config: - epochs: 5 - batch_size: 1 - sink_mode: True - sink_size: 2 -runner_wrapper: - type: MFTrainOneStepCell - scale_sense: - type: DynamicLossScaleUpdateCell - loss_scale_value: 65536 - scale_factor: 2 - scale_window: 1000 - use_clip_grad: True - -# default parallel of device num = 8 for Atlas 800T A2 -parallel_config: - data_parallel: 1 - model_parallel: 1 - pipeline_stage: 1 - micro_batch_num: 1 - vocab_emb_dp: False - gradient_aggregation_group: 4 -# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. -micro_batch_interleave_num: 1 - -model: - model_config: - type: LlamaConfig - batch_size: 1 - seq_length: 32768 - hidden_size: 3584 - num_layers: 28 - num_heads: 28 - n_kv_heads: 4 - vocab_size: 152064 - intermediate_size: 18944 - max_position_embeddings: 32768 - qkv_has_bias: True - rms_norm_eps: 1.0e-6 - theta: 1000000.0 - emb_dropout_prob: 0.0 - eos_token_id: [151645,151643] - pad_token_id: 151643 - bos_token_id: 151643 - compute_dtype: "bfloat16" - layernorm_compute_type: "float32" - softmax_compute_type: "float32" - rotary_dtype: "bfloat16" - param_init_type: "bfloat16" - use_past: True - use_flash_attention: True - block_size: 32 - num_blocks: 1024 - use_past_shard: False - offset: 0 - checkpoint_name_or_path: "" - repetition_penalty: 1.05 - max_decode_length: 512 - top_k: 20 - top_p: 0.8 - temperature: 0.7 - do_sample: True - is_dynamic: True - qkv_concat: True - auto_map: - AutoTokenizer: [qwen2_5_tokenizer.Qwen2Tokenizer, null] - - arch: - type: LlamaForCausalLM - -processor: - return_tensors: ms - tokenizer: - model_max_length: 131072 - bos_token: null - eos_token: "<|im_end|>" - unk_token: null - pad_token: "<|endoftext|>" - vocab_file: "/path/to/vocab.json" - merges_file: "/path/to/merges.txt" - chat_template: "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' 
}}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n" - type: Qwen2Tokenizer - type: Qwen2Processor - -# mindspore context init config -context: - mode: 0 #0--Graph Mode; 1--Pynative Mode - device_target: "Ascend" - ascend_config: - precision_mode: "must_keep_origin_dtype" - max_call_depth: 10000 - max_device_memory: "59GB" - save_graphs: False - save_graphs_path: "./graph" - device_id: 0 - -# parallel context config -parallel: - parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel - gradients_mean: False - enable_alltoall: False - full_batch: True - search_mode: "sharding_propagation" - enable_parallel_optimizer: False - strategy_ckpt_config: - save_file: "./ckpt_strategy.ckpt" - only_trainable_params: False - parallel_optimizer_config: - gradient_accumulation_shard: False - parallel_optimizer_threshold: 64 diff --git a/tests/st/python/distributed/__init__.py b/tests/st/python/distributed/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/test_mcore_mix_parallel.py b/tests/st/python/distributed/test_mcore_mix_parallel.py similarity index 80% rename from tests/st/python/test_mcore_mix_parallel.py rename to tests/st/python/distributed/test_mcore_mix_parallel.py index 148f0878612b6a405f9d37d209411a2ce7b816e2..9f101e7085c8f016f1196428e0f4b74c8474a977 100644 --- a/tests/st/python/test_mcore_mix_parallel.py +++ b/tests/st/python/distributed/test_mcore_mix_parallel.py @@ -21,14 +21,10 @@ from unittest.mock import patch import os from multiprocessing import Process, Queue -from tests.st.python.utils.cases_parallel 
import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() env_vars = { @@ -47,13 +43,13 @@ env_vars = { "LCAL_COMM_ID": "127.0.0.1:10068" } -ds_model_path = "/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8" +ds_model_path = MODEL_PATH["DeepSeek-R1-W8A8"] common_ds_prompt = ("You are a helpful assistant.<|User|>将文本分类为中性、" "负面或正面。 \n文本:我认为这次假期还可以。 \n情感:" "<|Assistant|>\n") common_ds_expect_result = 'ugs611ాలు' -qwen_model_path = "/home/workspace/mindspore_dataset/weight/Qwen3-30B-A3B" +qwen_model_path = MODEL_PATH["Qwen3-30B-A3B"] common_qwen_prompt = common_ds_prompt common_qwen_expect_result = '\n好的' @@ -136,10 +132,10 @@ def exec_model_with_dp(dp_size, exit_code = 0 for proc in procs: - proc.join(timeout=300) + proc.join(timeout=600) if proc.exitcode is None: print(f"Killing process {proc.pid} that " - f"didn't stop within 3 minutes.") + f"didn't stop within 10 minutes.") proc.kill() exit_code = 1 elif proc.exitcode: @@ -193,7 +189,13 @@ def exec_model_without_dp(tp_size, @pytest.mark.allcards def test_vllm_qwen3_moe_30b_dp4_tp2_ep4(): """ - test case qwen3_moe_30B with DP4TP2EP4 + Test Summary: + test case qwen3_moe_30B with DP4TP2EP4 + Expected Result: + Running successfully, the first three tokens in the return result + meet expectations. + Model Info: + Qwen3-30B-A3B """ import vllm_mindspore @@ -213,7 +215,13 @@ def test_vllm_qwen3_moe_30b_dp4_tp2_ep4(): @pytest.mark.allcards def test_deepseek_r1_dp4_tp2_ep4(): """ - test case deepseek r1 w8a8 dp4 tp2 ep4 + Test Summary: + test case deepseek r1 w8a8 dp4 tp2 ep4 + Expected Result: + Running successfully, the first three tokens in the return result + meet expectations. + Model Info: + DeepSeek-R1-W8A8 """ import vllm_mindspore @@ -234,7 +242,13 @@ @patch.dict(os.environ, env_vars) def test_deepseek_r1_dp8_tp1_ep8(): """ - test case deepseek r1 w8a8 Dp8 tp1 ep8 + Test Summary: + test case deepseek r1 w8a8 Dp8 tp1 ep8 + Expected Result: + Running successfully, the first three tokens in the return result + meet expectations. + Model Info: + DeepSeek-R1-W8A8 """ import vllm_mindspore @@ -254,7 +268,13 @@ def test_deepseek_r1_dp8_tp1_ep8(): @pytest.mark.allcards def test_deepseek_r1_dp2_tp4_ep1(): """ - test case deepseek r1 w8a8 dp2 tp4 ep1 + Test Summary: + test case deepseek r1 w8a8 dp2 tp4 ep1 + Expected Result: + Running successfully, the first three tokens in the return result + meet expectations. + Model Info: + DeepSeek-R1-W8A8 """ import vllm_mindspore @@ -275,7 +295,13 @@ def test_deepseek_r1_dp2_tp4_ep1(): ) def test_deepseek_r1_dp4_tp2_ep8(): """ - test case deepseek r1 w8a8 dp4 tp2 ep8 + Test Summary: + test case deepseek r1 w8a8 dp4 tp2 ep8 + Expected Result: + Running successfully, the first three tokens in the return result + meet expectations. + Model Info: + DeepSeek-R1-W8A8 """ import vllm_mindspore @@ -295,7 +321,13 @@ def test_deepseek_r1_dp4_tp2_ep8(): @pytest.mark.allcards def test_deepseek_r1_dp8_tp1_ep1(): """ - test case deepseek r1 w8a8 dp8 tp1 ep1 + Test Summary: + test case deepseek r1 w8a8 dp8 tp1 ep1 + Expected Result: + Running successfully, the first three tokens in the return result + meet expectations. 
+ Model Info: + DeepSeek-R1-W8A8 """ import vllm_mindspore @@ -315,7 +347,13 @@ def test_deepseek_r1_dp8_tp1_ep1(): @pytest.mark.allcards def test_deepseek_r1_dp8_tp1_ep4(): """ - test case deepseek r1 w8a8 dp8 tp1 ep4 + Test Summary: + test case deepseek r1 w8a8 dp8 tp1 ep4 + Expected Result: + Running successfully, the first three tokens in the return result + meet expectations. + Model Info: + DeepSeek-R1-W8A8 """ import vllm_mindspore @@ -335,7 +373,13 @@ def test_deepseek_r1_dp8_tp1_ep4(): @pytest.mark.allcards def test_deepseek_r1_tp8_ep8(): """ - test case deepseek r1 w8a8 tp8 ep8 + Test Summary: + test case deepseek r1 w8a8 tp8 ep8 + Expected Result: + Running successfully, the first three tokens in the return result + meet expectations. + Model Info: + DeepSeek-R1-W8A8 """ import vllm_mindspore @@ -354,7 +398,13 @@ def test_deepseek_r1_tp8_ep8(): @pytest.mark.allcards def test_deepseek_r1_tp8_ep4(): """ - test case deepseek r1 w8a8 tp8 ep4 + Test Summary: + test case deepseek r1 w8a8 tp8 ep4 + Expected Result: + Running successfully, the first three tokens in the return result + meet expectations. + Model Info: + DeepSeek-R1-W8A8 """ import vllm_mindspore @@ -373,7 +423,13 @@ def test_deepseek_r1_tp8_ep4(): @pytest.mark.allcards def test_vllm_native_qwen3_moe_30b_dp4_tp2_ep4(): """ - test case qwen3_moe_30B with DP4TP2EP4 + Test Summary: + test case qwen3_moe_30B with DP4TP2EP4 + Expected Result: + Running successfully, the first three tokens in the return result + meet expectations. + Model Info: + Qwen3-30B-A3B """ import vllm_mindspore @@ -393,7 +449,13 @@ def test_vllm_native_qwen3_moe_30b_dp4_tp2_ep4(): @pytest.mark.allcards def test_vllm_native_qwen3_moe_30b_tp8_ep4(): """ - test case qwen3_moe_30B with TP8EP4 + Test Summary: + test case qwen3_moe_30B with TP8EP4 + Expected Result: + Running successfully, the first three tokens in the return result + meet expectations. 
+ Model Info: + Qwen3-30B-A3B """ import vllm_mindspore diff --git a/tests/st/python/cases_parallel/shm_broadcast.py b/tests/st/python/distributed/test_shm_broadcast.py similarity index 88% rename from tests/st/python/cases_parallel/shm_broadcast.py rename to tests/st/python/distributed/test_shm_broadcast.py index c82688c244703ea13026e4fc15602aa2351684fd..302c3c8c4ac4e44895fc2ca4edd1634f9623b778 100644 --- a/tests/st/python/cases_parallel/shm_broadcast.py +++ b/tests/st/python/distributed/test_shm_broadcast.py @@ -36,18 +36,27 @@ import torch.distributed as dist from vllm.distributed.device_communicators.shm_broadcast import MessageQueue from vllm.utils import get_ip, get_open_port, get_distributed_init_method -from tests.st.python.test_ds_online import env_vars -from tests.st.python.utils.cases_parallel import cleanup_subprocesses from tests.st.python.utils.env_var_manager import EnvVarManager - - -def teardown_function(): - cleanup_subprocesses() - +from tests.st.python.utils.common_utils import (teardown_function, + setup_function) env_manager = EnvVarManager() env_manager.setup_mindformers_environment() +env_vars = { + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0", + "HCCL_IF_BASE_PORT": "60000", + "LCAL_COMM_ID": "127.0.0.1:10068" +} + def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]: np.random.seed(seed) @@ -152,5 +161,6 @@ def worker_fn(): @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_shm_broadcast(): distributed_run(worker_fn, 4) diff --git a/tests/st/python/kernels/__init__.py b/tests/st/python/kernels/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/cell/test_attention.py b/tests/st/python/kernels/test_attention.py similarity index 99% rename from tests/st/python/cell/test_attention.py rename to tests/st/python/kernels/test_attention.py index 9216757ce039c687c298eb613b391edd2deaf62d..92c4c6aa59958e2d94d8acd21db1058149371ebf 100644 --- a/tests/st/python/cell/test_attention.py +++ b/tests/st/python/kernels/test_attention.py @@ -30,6 +30,9 @@ from vllm_mindspore.model_executor.layers.quantization.attention import ( from vllm.config import CacheConfig from vllm.attention.backends.abstract import AttentionType +from tests.st.python.utils.common_utils import (teardown_function, + setup_function) + @pytest.fixture(params=[mstype.float16, mstype.bfloat16], ids=["float16", "bfloat16"]) diff --git a/tests/st/python/lora/__init__.py b/tests/st/python/lora/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/cases_parallel/multilora_inference.py b/tests/st/python/lora/test_multilora_inference.py similarity index 86% rename from tests/st/python/cases_parallel/multilora_inference.py rename to tests/st/python/lora/test_multilora_inference.py index 0a1e3776b970776308bb8cdb25867ea92404eb6f..23755fa0d88c2960421eac9e996d787348de157f 100644 --- a/tests/st/python/cases_parallel/multilora_inference.py +++ b/tests/st/python/lora/test_multilora_inference.py @@ -23,7 +23,8 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from 
tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager import vllm_mindspore # noqa: F401 @@ -32,11 +33,6 @@ from typing import Optional from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams from vllm.lora.request import LoRARequest - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -99,22 +95,22 @@ def initialize_engine() -> LLMEngine: # numbers will cause higher memory usage. If you know that all LoRAs will # use the same rank, it is recommended to set this as low as possible. # max_cpu_loras: controls the size of the CPU LoRA cache. - engine_args = EngineArgs( - model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", - enable_lora=True, - max_loras=1, - max_lora_rank=64, - max_cpu_loras=2, - max_num_seqs=256, - max_model_len=256, - max_num_batched_tokens=400) + engine_args = EngineArgs(model=MODEL_PATH["Qwen2.5-7B-Instruct"], + enable_lora=True, + max_loras=1, + max_lora_rank=64, + max_cpu_loras=2, + max_num_seqs=256, + max_model_len=256, + max_num_batched_tokens=400) return LLMEngine.from_engine_args(engine_args) @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_multilora_inference(): """test function that sets up and runs the prompt processing.""" engine = initialize_engine() - lora_path = "/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Lora-Law" + lora_path = MODEL_PATH["Qwen2.5-7B-Lora-Law"] test_prompts = create_test_prompts(lora_path) process_requests(engine, test_prompts) diff --git a/tests/st/python/models/__init__.py b/tests/st/python/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/models/deepseek/__init__.py b/tests/st/python/models/deepseek/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/models/deepseek/offline/__init__.py b/tests/st/python/models/deepseek/offline/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/cases_parallel/vllm_deepseek_bf16_part.py b/tests/st/python/models/deepseek/offline/test_vllm_deepseek_bf16_part.py similarity index 86% rename from tests/st/python/cases_parallel/vllm_deepseek_bf16_part.py rename to tests/st/python/models/deepseek/offline/test_vllm_deepseek_bf16_part.py index d113591690a8c0254b13cd6e64c581299db1873c..1f6c0994301668acb91cd736789e68cec01add3b 100644 --- a/tests/st/python/cases_parallel/vllm_deepseek_bf16_part.py +++ b/tests/st/python/models/deepseek/offline/test_vllm_deepseek_bf16_part.py @@ -18,14 +18,10 @@ import pytest from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -44,6 +40,7 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_deepseek_r1_bf16(): """ test case deepseek r1 bf16 @@ -61,12 +58,11 @@ def test_deepseek_r1_bf16(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, 
top_k=1) # Create an LLM. - llm = LLM( - model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16", - trust_remote_code=True, - gpu_memory_utilization=0.9, - tensor_parallel_size=2, - max_model_len=4096) + llm = LLM(model=MODEL_PATH["DeepSeek-R1-bf16"], + trust_remote_code=True, + gpu_memory_utilization=0.9, + tensor_parallel_size=2, + max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/cases_parallel/vllm_deepseek_bf16_part_v1.py b/tests/st/python/models/deepseek/offline/test_vllm_deepseek_bf16_part_v1.py similarity index 86% rename from tests/st/python/cases_parallel/vllm_deepseek_bf16_part_v1.py rename to tests/st/python/models/deepseek/offline/test_vllm_deepseek_bf16_part_v1.py index 649f1b7a0a3e394b177606f13a273eb20e797d67..3743079f2b84b526ddfe1b3416992acdc535eca1 100644 --- a/tests/st/python/cases_parallel/vllm_deepseek_bf16_part_v1.py +++ b/tests/st/python/models/deepseek/offline/test_vllm_deepseek_bf16_part_v1.py @@ -18,14 +18,10 @@ import pytest from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -43,6 +39,7 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_deepseek_r1_bf16(): """ test case deepseek r1 bf16 @@ -59,12 +56,11 @@ def test_deepseek_r1_bf16(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM( - model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16", - trust_remote_code=True, - gpu_memory_utilization=0.9, - tensor_parallel_size=2, - max_model_len=33 * 1024) + llm = LLM(model=MODEL_PATH["DeepSeek-R1-bf16"], + trust_remote_code=True, + gpu_memory_utilization=0.9, + tensor_parallel_size=2, + max_model_len=33 * 1024) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/cases_parallel/vllm_deepseek_part.py b/tests/st/python/models/deepseek/offline/test_vllm_deepseek_part.py similarity index 93% rename from tests/st/python/cases_parallel/vllm_deepseek_part.py rename to tests/st/python/models/deepseek/offline/test_vllm_deepseek_part.py index 8ed5ce6f62851ac9a8609b7bf5cea46b6025ce4a..8f5f9532492dd21d21a838bc643492583f5f744c 100644 --- a/tests/st/python/cases_parallel/vllm_deepseek_part.py +++ b/tests/st/python/models/deepseek/offline/test_vllm_deepseek_part.py @@ -18,14 +18,10 @@ import pytest from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -44,6 +40,7 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_deepseek_r1(): """ test case deepseek r1 w8a8 @@ -62,7 +59,7 @@ def test_deepseek_r1(): # Create an LLM. llm = LLM( - model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", + model=MODEL_PATH["DeepSeek-R1-W8A8"], trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=2, @@ -83,6 +80,7 @@ def test_deepseek_r1(): @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_deepseek_mtp(): """ test case deepseek mtp with main model of r1-w8a8 @@ -100,7 +98,7 @@ def test_deepseek_mtp(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-MTP", + llm = LLM(model=MODEL_PATH["DeepSeek-R1-MTP"], trust_remote_code=True, gpu_memory_utilization=0.7, tensor_parallel_size=2, diff --git a/tests/st/python/cases_parallel/vllm_deepseek_part_v1.py b/tests/st/python/models/deepseek/offline/test_vllm_deepseek_part_v1.py similarity index 92% rename from tests/st/python/cases_parallel/vllm_deepseek_part_v1.py rename to tests/st/python/models/deepseek/offline/test_vllm_deepseek_part_v1.py index 06cacc3556a31c11a809e442e1f288e6512a509f..d101e26bba20da6fcb5a211d26782ce942c2fe38 100644 --- a/tests/st/python/cases_parallel/vllm_deepseek_part_v1.py +++ b/tests/st/python/models/deepseek/offline/test_vllm_deepseek_part_v1.py @@ -19,14 +19,10 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -44,6 +40,7 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_deepseek_r1(): """ test case deepseek r1 w8a8 @@ -62,7 +59,7 @@ def test_deepseek_r1(): # Create an LLM. 
llm = LLM( - model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", + model=MODEL_PATH["DeepSeek-R1-W8A8"], trust_remote_code=True, gpu_memory_utilization=0.8, # Reduce gpu_memory_utilization because new memory will be allocated diff --git a/tests/st/python/models/deepseek/online/__init__.py b/tests/st/python/models/deepseek/online/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/test_ds_online.py b/tests/st/python/models/deepseek/online/test_ds_online.py similarity index 94% rename from tests/st/python/test_ds_online.py rename to tests/st/python/models/deepseek/online/test_ds_online.py index ed342ffc3cada9a7b8353d69c0db72b48a02989d..148ebfcc4531c2009db9400317ae6d42f5ec36a8 100644 --- a/tests/st/python/test_ds_online.py +++ b/tests/st/python/models/deepseek/online/test_ds_online.py @@ -27,6 +27,8 @@ import signal import time from tests.st.python.utils.env_var_manager import EnvVarManager +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) env_manager = EnvVarManager() env_manager.setup_mindformers_environment() @@ -44,7 +46,7 @@ env_vars = { "LCAL_COMM_ID": "127.0.0.1:10068" } -DS_R1_W8A8_MODEL = "/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8" +DS_R1_W8A8_MODEL = MODEL_PATH["DeepSeek-R1-W8A8"] def execute_shell_command(command): @@ -197,6 +199,15 @@ def set_request(model_path, master_ip="127.0.0.1", port="8000"): @pytest.mark.platform_arm_ascend910b_training @pytest.mark.allcards def test_deepseek_r1_dp4_tp2_ep4_online(): + """ + Test Summary: + test deepseek r1 with dp4 tp2 ep4 + Expected Result: + Start online service successfully and send request, the first three + tokens in the return result meet expectations. + Model Info: + DeepSeek-R1-W8A8 + """ import vllm_mindspore from vllm.utils import get_open_port # noqa: E402 diff --git a/tests/st/python/models/llama3/__init__.py b/tests/st/python/models/llama3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/cases_parallel/vllm_llama3.py b/tests/st/python/models/llama3/test_vllm_llama3.py similarity index 85% rename from tests/st/python/cases_parallel/vllm_llama3.py rename to tests/st/python/models/llama3/test_vllm_llama3.py index 8355cbd81f3337dcabc2937101eea9210a876a7d..32cb01cedf778894f12b1ed61754008496924a23 100644 --- a/tests/st/python/cases_parallel/vllm_llama3.py +++ b/tests/st/python/models/llama3/test_vllm_llama3.py @@ -21,7 +21,8 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager import vllm_mindspore @@ -45,6 +46,9 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_onecard def test_vllm_llama3_8b(): """ test case llama3.1 8B @@ -62,11 +66,10 @@ def test_vllm_llama3_8b(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. 
- llm = LLM( - model="/home/workspace/mindspore_dataset/weight/Llama-3.1-8B-Instruct", - gpu_memory_utilization=0.9, - tensor_parallel_size=1, - max_model_len=4096) + llm = LLM(model=MODEL_PATH["Llama-3.1-8B-Instruct"], + gpu_memory_utilization=0.9, + tensor_parallel_size=1, + max_model_len=4096) # Generate texts from the prompts. # The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. diff --git a/tests/st/python/models/qwen2_5/__init__.py b/tests/st/python/models/qwen2_5/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/cases_parallel/vllm_mf_qwen_7b.py b/tests/st/python/models/qwen2_5/test_vllm_mf_qwen_7b.py similarity index 88% rename from tests/st/python/cases_parallel/vllm_mf_qwen_7b.py rename to tests/st/python/models/qwen2_5/test_vllm_mf_qwen_7b.py index 2ee512fda70fee0e5fa09c5407a276a881a515e8..fd75e28e63dfb57b4dba8d8ed4a1c1096d0626c1 100644 --- a/tests/st/python/cases_parallel/vllm_mf_qwen_7b.py +++ b/tests/st/python/models/qwen2_5/test_vllm_mf_qwen_7b.py @@ -19,14 +19,10 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -45,6 +41,7 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_mf_qwen(): """ test case qwen2.5 7B @@ -62,10 +59,9 @@ def test_mf_qwen(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM( - model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", - gpu_memory_utilization=0.9, - tensor_parallel_size=2) + llm = LLM(model=MODEL_PATH["Qwen2.5-7B-Instruct"], + gpu_memory_utilization=0.9, + tensor_parallel_size=2) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_v1.py b/tests/st/python/models/qwen2_5/test_vllm_mf_qwen_7b_v1.py similarity index 88% rename from tests/st/python/cases_parallel/vllm_mf_qwen_7b_v1.py rename to tests/st/python/models/qwen2_5/test_vllm_mf_qwen_7b_v1.py index 70bca2447d61144c1ae8d3042ba69129f447dd0c..6c517174893141abee4bca57e16b950caeb5f615 100644 --- a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_v1.py +++ b/tests/st/python/models/qwen2_5/test_vllm_mf_qwen_7b_v1.py @@ -18,14 +18,10 @@ import pytest from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -43,6 +39,7 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_mf_qwen(): """ test case qwen2.5 7B @@ -60,10 +57,9 @@ def test_mf_qwen(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. 
- llm = LLM( - model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", - gpu_memory_utilization=0.9, - tensor_parallel_size=2) + llm = LLM(model=MODEL_PATH["Qwen2.5-7B-Instruct"], + gpu_memory_utilization=0.9, + tensor_parallel_size=2) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) @@ -97,11 +93,10 @@ def test_mf_qwen_batch(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM( - model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", - block_size=32, - gpu_memory_utilization=0.9, - tensor_parallel_size=2) + llm = LLM(model=MODEL_PATH["Qwen2.5-7B-Instruct"], + block_size=32, + gpu_memory_utilization=0.9, + tensor_parallel_size=2) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/cases_parallel/vllm_qwen_7b.py b/tests/st/python/models/qwen2_5/test_vllm_qwen_7b.py similarity index 88% rename from tests/st/python/cases_parallel/vllm_qwen_7b.py rename to tests/st/python/models/qwen2_5/test_vllm_qwen_7b.py index 8a6e9301a2f8e06566cbb6ef1bf0c026d64bbc0f..712bd4236da2e5b312ea75d745fb6e12c5881cf4 100644 --- a/tests/st/python/cases_parallel/vllm_qwen_7b.py +++ b/tests/st/python/models/qwen2_5/test_vllm_qwen_7b.py @@ -18,14 +18,10 @@ import pytest from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -44,6 +40,7 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_vllm_qwen(): """ test case qwen2.5 7B @@ -61,10 +58,9 @@ def test_vllm_qwen(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM( - model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", - gpu_memory_utilization=0.9, - tensor_parallel_size=2) + llm = LLM(model=MODEL_PATH["Qwen2.5-7B-Instruct"], + gpu_memory_utilization=0.9, + tensor_parallel_size=2) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/cases_parallel/vllm_qwen_7b_v1.py b/tests/st/python/models/qwen2_5/test_vllm_qwen_7b_v1.py similarity index 87% rename from tests/st/python/cases_parallel/vllm_qwen_7b_v1.py rename to tests/st/python/models/qwen2_5/test_vllm_qwen_7b_v1.py index 35c3e0b7002758caabeaf304061c0aa4afb6ffef..b9d2132d2d2285628c5a23408d237cef8ff19aea 100644 --- a/tests/st/python/cases_parallel/vllm_qwen_7b_v1.py +++ b/tests/st/python/models/qwen2_5/test_vllm_qwen_7b_v1.py @@ -18,14 +18,10 @@ import pytest from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -59,11 +55,10 @@ def run_vllm_qwen(enforce_eager=False): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM( - model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", - gpu_memory_utilization=0.9, - enforce_eager=enforce_eager, - tensor_parallel_size=2) + llm = LLM(model=MODEL_PATH["Qwen2.5-7B-Instruct"], + gpu_memory_utilization=0.9, + enforce_eager=enforce_eager, + tensor_parallel_size=2) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) @@ -77,6 +72,7 @@ def run_vllm_qwen(enforce_eager=False): @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_vllm_qwen(): """ test case qwen2.5 7B @@ -86,6 +82,7 @@ def test_vllm_qwen(): @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_qwen_enforce_eager(): """ Test qwen2.5 7B using ENFORCE_EAGER. 
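For reference, the relocated offline cases all share one skeleton, sketched below. This is a minimal illustration only: the test name, the trimmed env_vars dict and the prompt are hypothetical placeholders, while the common_utils imports, the MODEL_PATH lookup, the os.environ patch and the level mark mirror the surrounding hunks.

import os
import pytest
from unittest.mock import patch

# setup_function/teardown_function are module-level pytest hooks re-exported
# by common_utils; importing them is enough for pytest to pick them up.
from tests.st.python.utils.common_utils import (teardown_function,
                                                setup_function, MODEL_PATH)
from tests.st.python.utils.env_var_manager import EnvVarManager

env_manager = EnvVarManager()
env_manager.setup_mindformers_environment()

# Hypothetical, trimmed env set; real cases define the full dict they need.
env_vars = {"VLLM_USE_V1": "1"}


@patch.dict(os.environ, env_vars)
@pytest.mark.level0
def test_example_case():
    """
    Test Summary:
        hypothetical example of the shared case skeleton
    Expected Result:
        Running successfully, the request result meets expectations.
    Model Info:
        Qwen2.5-7B-Instruct
    """
    # vllm_mindspore is imported before vllm, matching the cases above.
    import vllm_mindspore
    from vllm import LLM, SamplingParams

    prompts = ["Hello, my name is"]  # placeholder prompt
    sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1)
    llm = LLM(model=MODEL_PATH["Qwen2.5-7B-Instruct"],
              gpu_memory_utilization=0.9,
              tensor_parallel_size=2)
    outputs = llm.generate(prompts, sampling_params)
    assert len(outputs) == len(prompts)

A multi-card case that should run through the concurrent runner is additionally listed in tests/st/python/register_parallel_tests.json (further down in this patch) with its test node id and card_num; per the new collection logic such a case carries exactly one level mark and no env_onecard, allcards or env_single mark.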
diff --git a/tests/st/python/models/qwen2_5_vl/__init__.py b/tests/st/python/models/qwen2_5_vl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/images/1080p.jpeg b/tests/st/python/models/qwen2_5_vl/images/1080p.jpeg similarity index 100% rename from tests/st/python/images/1080p.jpeg rename to tests/st/python/models/qwen2_5_vl/images/1080p.jpeg diff --git a/tests/st/python/cases_parallel/similarity.py b/tests/st/python/models/qwen2_5_vl/similarity.py similarity index 100% rename from tests/st/python/cases_parallel/similarity.py rename to tests/st/python/models/qwen2_5_vl/similarity.py diff --git a/tests/st/python/cases_parallel/vllm_qwen2_5_vl_7b_v1.py b/tests/st/python/models/qwen2_5_vl/test_vllm_qwen2_5_vl_7b_v1.py similarity index 93% rename from tests/st/python/cases_parallel/vllm_qwen2_5_vl_7b_v1.py rename to tests/st/python/models/qwen2_5_vl/test_vllm_qwen2_5_vl_7b_v1.py index dc36d54881316ae2c66aae0fc763d8115e86d82b..3fdfdacf28b3f56b161097b34c23cc6773595a9d 100644 --- a/tests/st/python/cases_parallel/vllm_qwen2_5_vl_7b_v1.py +++ b/tests/st/python/models/qwen2_5_vl/test_vllm_qwen2_5_vl_7b_v1.py @@ -23,15 +23,11 @@ import os import cv2 import numpy as np from PIL import Image -from tests.st.python.utils.cases_parallel import cleanup_subprocesses from tests.st.python.utils.env_var_manager import EnvVarManager -from tests.st.python.cases_parallel.similarity import compare_distance +from tests.st.python.models.qwen2_5_vl.similarity import compare_distance from transformers import AutoProcessor - - -def teardown_function(): - cleanup_subprocesses() - +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) env_manager = EnvVarManager() env_manager.setup_mindformers_environment() @@ -60,11 +56,13 @@ PROMPT_TEMPLATE_2 = ( "<|im_start|>assistant\n") video_path = "/home/workspace/mindspore_dataset/video_file/korean_eating.mp4" -model_path = "/home/workspace/mindspore_dataset/weight/Qwen2.5-VL-7B-Instruct" +model_path = MODEL_PATH["Qwen2.5-VL-7B-Instruct"] def pil_image() -> Image.Image: - image_path = "images/1080p.jpeg" + current_file_path = os.path.abspath(__file__) + current_dir = os.path.dirname(current_file_path) + image_path = os.path.join(current_dir, "images/1080p.jpeg") return Image.open(image_path) @@ -176,6 +174,7 @@ def prepare_text(processor: AutoProcessor, prompt: str): @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_qwen2_5_vl_7b_v1(): """ test case qwen2.5 vl 7B @@ -187,6 +186,9 @@ def test_qwen2_5_vl_7b_v1(): @patch.dict(os.environ, env_vars) +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_onecard def test_qwen2_5_vl_7b_v1_enforce_eager(): """ test case qwen2.5 vl 7B with eager mode @@ -198,6 +200,9 @@ def test_qwen2_5_vl_7b_v1_enforce_eager(): @patch.dict(os.environ, env_vars) +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_onecard def test_qwen2_5_vl_7b_v1_video_infer(): import vllm_mindspore from vllm import LLM, SamplingParams diff --git a/tests/st/python/models/qwen3/__init__.py b/tests/st/python/models/qwen3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/cases_parallel/vllm_mf_qwen3_8b.py b/tests/st/python/models/qwen3/test_vllm_mf_qwen3_8b.py similarity index 92% rename from tests/st/python/cases_parallel/vllm_mf_qwen3_8b.py rename to 
tests/st/python/models/qwen3/test_vllm_mf_qwen3_8b.py index 7324d1c15b5e6dc300bf5576fe5069f81491e3c9..feceda728760ee4164f7dbcd94fe0f3179df4a2c 100644 --- a/tests/st/python/cases_parallel/vllm_mf_qwen3_8b.py +++ b/tests/st/python/models/qwen3/test_vllm_mf_qwen3_8b.py @@ -19,14 +19,10 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -60,7 +56,7 @@ def run_mf_qwen3_networt(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen3-8B", + llm = LLM(model=MODEL_PATH["Qwen3-8B"], gpu_memory_utilization=0.9, tensor_parallel_size=2) # Generate texts from the prompts. @@ -77,12 +73,14 @@ def run_mf_qwen3_networt(): @patch.dict(os.environ, {**env_vars, "VLLM_USE_V1": "0"}) +@pytest.mark.level0 def test_mf_qwen3_v0(): """Test qwen3 8B using V0 LLMEngine.""" run_mf_qwen3_networt() @patch.dict(os.environ, {**env_vars, "VLLM_USE_V1": "1"}) +@pytest.mark.level0 def test_mf_qwen3_v1(): """Test qwen3 8B using V0 LLMEngine.""" run_mf_qwen3_networt() @@ -92,6 +90,7 @@ def test_mf_qwen3_v1(): **env_vars, "VLLM_USE_V1": "1", "MS_ENABLE_INTERNAL_BOOST": "off" }) +@pytest.mark.level0 def test_mf_qwen3_v1_310p(): """Test qwen3 8B using V1 LLMEngine in 310p.""" run_mf_qwen3_networt() diff --git a/tests/st/python/cases_parallel/vllm_qwen3.py b/tests/st/python/models/qwen3/test_vllm_qwen3.py similarity index 79% rename from tests/st/python/cases_parallel/vllm_qwen3.py rename to tests/st/python/models/qwen3/test_vllm_qwen3.py index f2eb008c1b65a7b3a8a5a260bf8ea25056f1d170..d27862ac37334bb3825cfc78f0a2962027d82528 100644 --- a/tests/st/python/cases_parallel/vllm_qwen3.py +++ b/tests/st/python/models/qwen3/test_vllm_qwen3.py @@ -21,14 +21,10 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -62,7 +58,7 @@ def run_vllm_qwen3_8b(enforce_eager=False): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen3-8B", + llm = LLM(model=MODEL_PATH["Qwen3-8B"], gpu_memory_utilization=0.9, tensor_parallel_size=1, enforce_eager=enforce_eager, @@ -82,9 +78,17 @@ def run_vllm_qwen3_8b(enforce_eager=False): @patch.dict(os.environ, env_vars) +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_onecard def test_vllm_qwen3_0_6b(): """ - test case qwen3 0.6B + Test Summary: + test case qwen3 0.6B + Expected Result: + Running successfully, the request result meets expectations. + Model Info: + Qwen3-0.6B """ import vllm_mindspore from vllm import LLM, SamplingParams @@ -100,7 +104,7 @@ def test_vllm_qwen3_0_6b(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM.
- llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen3-0.6B", + llm = LLM(model=MODEL_PATH["Qwen3-0.6B"], gpu_memory_utilization=0.9, tensor_parallel_size=1, max_model_len=4096) @@ -119,18 +123,34 @@ def test_vllm_qwen3_0_6b(): @patch.dict(os.environ, env_vars) +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_onecard def test_vllm_qwen3_8b(): """ - test case qwen3 8B + Test Summary: + Test qwen3 8B with graph mode. + Expected Result: + Running successfully, the request result meets expectations + Model Info: + Qwen3-8B """ import vllm_mindspore run_vllm_qwen3_8b() @patch.dict(os.environ, env_vars) +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_onecard def test_qwen3_enforce_eager(): """ - Test qwen3 8B using ENFORCE_EAGER. + Test Summary: + Test qwen3 8B using enforce_eager. + Expected Result: + Running successfully, the request result meets expectations + Model Info: + Qwen3-8B """ import vllm_mindspore run_vllm_qwen3_8b(enforce_eager=True) diff --git a/tests/st/python/models/qwen3_moe/__init__.py b/tests/st/python/models/qwen3_moe/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/cases_parallel/vllm_qwen3_moe.py b/tests/st/python/models/qwen3_moe/test_vllm_qwen3_moe.py similarity index 92% rename from tests/st/python/cases_parallel/vllm_qwen3_moe.py rename to tests/st/python/models/qwen3_moe/test_vllm_qwen3_moe.py index ec2d57377a7ba9351b8aa3199fbc440f7b680162..fa7c13c9d97d7145acb066a85a9f84e8e142032a 100644 --- a/tests/st/python/cases_parallel/vllm_qwen3_moe.py +++ b/tests/st/python/models/qwen3_moe/test_vllm_qwen3_moe.py @@ -21,14 +21,10 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -64,7 +60,7 @@ def run_vllm_qwen3_30b_a3b(enforce_eager=False): # Create an LLM. 
llm = LLM( - model="/home/workspace/mindspore_dataset/weight/Qwen3-30B-A3B", + model=MODEL_PATH["Qwen3-30B-A3B"], gpu_memory_utilization=0.9, tensor_parallel_size=2, max_model_len=4096, @@ -85,6 +81,7 @@ def run_vllm_qwen3_30b_a3b(enforce_eager=False): @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_vllm_qwen3_30b_a3b(): """ test case qwen3-30B-A3B @@ -94,6 +91,7 @@ def test_vllm_qwen3_30b_a3b(): @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_vllm_qwen3_30b_a3b_eager(): """ test case qwen3-30B-A3B eager mode diff --git a/tests/st/python/models/telechat2/__init__.py b/tests/st/python/models/telechat2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/cases_parallel/vllm_mf_telechat2_7b.py b/tests/st/python/models/telechat2/test_vllm_mf_telechat2_7b.py similarity index 89% rename from tests/st/python/cases_parallel/vllm_mf_telechat2_7b.py rename to tests/st/python/models/telechat2/test_vllm_mf_telechat2_7b.py index 0a38a51b9e952e00e62decf6043a1efe56f57423..d782b6cc81a36a9acaf0d07d6b7e266e3c0450ad 100644 --- a/tests/st/python/cases_parallel/vllm_mf_telechat2_7b.py +++ b/tests/st/python/models/telechat2/test_vllm_mf_telechat2_7b.py @@ -19,14 +19,10 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -57,7 +53,7 @@ def run_mf_telechat2_7b_network(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10) # Create an LLM. 
- llm = LLM(model="/home/workspace/mindspore_dataset/weight/telechat2_7b", + llm = LLM(model=MODEL_PATH["telechat2_7b"], gpu_memory_utilization=0.9, trust_remote_code=True, tensor_parallel_size=1) @@ -75,6 +71,9 @@ def run_mf_telechat2_7b_network(): @patch.dict(os.environ, env_vars) +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_onecard def test_mf_telechat2_7b(): """Test telechat2_7b.""" run_mf_telechat2_7b_network() diff --git a/tests/st/python/mss/__init__.py b/tests/st/python/mss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_cp_pc_mss.py b/tests/st/python/mss/test_vllm_mf_qwen_7b_cp_pc_mss.py similarity index 83% rename from tests/st/python/cases_parallel/vllm_mf_qwen_7b_cp_pc_mss.py rename to tests/st/python/mss/test_vllm_mf_qwen_7b_cp_pc_mss.py index 72a7c7aaa70641200bae1d07779c74d4046861e8..048609be6f5518fb926f84046b2b2e490bfc9786 100644 --- a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_cp_pc_mss.py +++ b/tests/st/python/mss/test_vllm_mf_qwen_7b_cp_pc_mss.py @@ -19,14 +19,10 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -45,9 +41,12 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level4 def test_mf_qwen_7b_cp_pc_mss(): """ test case mf_qwen_7b_cp_pc_mss + accuracy error happens occasionally, adjust the level to + level 4 until it is fixed. """ import vllm_mindspore from vllm import LLM, SamplingParams @@ -81,17 +80,16 @@ def test_mf_qwen_7b_cp_pc_mss(): sampling_params = SamplingParams(temperature=0.0, max_tokens=32, top_k=1) # Create an LLM. - llm = LLM( - model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", - max_model_len=8192, - max_num_seqs=16, - max_num_batched_tokens=32, - block_size=32, - gpu_memory_utilization=0.9, - tensor_parallel_size=2, - enable_chunked_prefill=True, - enable_prefix_caching=True, - num_scheduler_steps=8) + llm = LLM(model=MODEL_PATH["Qwen2.5-7B-Instruct"], + max_model_len=8192, + max_num_seqs=16, + max_num_batched_tokens=32, + block_size=32, + gpu_memory_utilization=0.9, + tensor_parallel_size=2, + enable_chunked_prefill=True, + enable_prefix_caching=True, + num_scheduler_steps=8) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. 
for _ in range(3): diff --git a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_mss.py b/tests/st/python/mss/test_vllm_mf_qwen_7b_mss.py similarity index 83% rename from tests/st/python/cases_parallel/vllm_mf_qwen_7b_mss.py rename to tests/st/python/mss/test_vllm_mf_qwen_7b_mss.py index 30e5a49a5b8b26f789479d6246bf036d6e1b8086..5b1954ffee8aadcc497220b7ea5a7670bfc92d81 100644 --- a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_mss.py +++ b/tests/st/python/mss/test_vllm_mf_qwen_7b_mss.py @@ -19,14 +19,10 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -45,6 +41,7 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_mf_qwen_7b_mss(): """ test case qwen_7b_mss @@ -61,14 +58,13 @@ def test_mf_qwen_7b_mss(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM( - model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", - max_model_len=8192, - max_num_batched_tokens=8192, - block_size=32, - gpu_memory_utilization=0.9, - num_scheduler_steps=8, - tensor_parallel_size=2) + llm = LLM(model=MODEL_PATH["Qwen2.5-7B-Instruct"], + max_model_len=8192, + max_num_batched_tokens=8192, + block_size=32, + gpu_memory_utilization=0.9, + num_scheduler_steps=8, + tensor_parallel_size=2) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/ops/__init__.py b/tests/st/python/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/test_custom_advstepflash.py b/tests/st/python/ops/test_custom_advstepflash.py similarity index 97% rename from tests/st/python/test_custom_advstepflash.py rename to tests/st/python/ops/test_custom_advstepflash.py index cd779f3192f8dfa1b24892d7f8e32fb4db25f228..70ab1000e0b11dee903057cd0e17212d75bfc6ab 100644 --- a/tests/st/python/test_custom_advstepflash.py +++ b/tests/st/python/ops/test_custom_advstepflash.py @@ -22,11 +22,8 @@ import pytest from vllm_mindspore import _custom_ops as custom_ops import torch -from tests.st.python.utils.cases_parallel import cleanup_subprocesses - - -def teardown_function(): - cleanup_subprocesses() +from tests.st.python.utils.common_utils import (teardown_function, + setup_function) def benchmark_advance_step_op(sampled_token_ids, input_tokens, input_positions, diff --git a/tests/st/python/prefix_caching/__init__.py b/tests/st/python/prefix_caching/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_prefix_caching.py b/tests/st/python/prefix_caching/test_vllm_mf_qwen_7b_prefix_caching.py similarity index 87% rename from tests/st/python/cases_parallel/vllm_mf_qwen_7b_prefix_caching.py rename to tests/st/python/prefix_caching/test_vllm_mf_qwen_7b_prefix_caching.py index 8ac1e28cf3336a4adcf1c2db6d9cfbeedc554a21..43667b75ed85abecf9feb94851a1778067463425 100644 --- a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_prefix_caching.py +++ b/tests/st/python/prefix_caching/test_vllm_mf_qwen_7b_prefix_caching.py @@ -18,14 +18,10 @@ import pytest from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() env_vars = { @@ -42,6 +38,7 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_mf_qwen_7b_prefix_caching(): """ test case qwen_7b_prefix_caching @@ -64,13 +61,12 @@ def test_mf_qwen_7b_prefix_caching(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM( - model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", - max_model_len=8192, - block_size=16, - enable_prefix_caching=True, - gpu_memory_utilization=0.9, - tensor_parallel_size=2) + llm = LLM(model=MODEL_PATH["Qwen2.5-7B-Instruct"], + max_model_len=8192, + block_size=16, + enable_prefix_caching=True, + gpu_memory_utilization=0.9, + tensor_parallel_size=2) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_prefix_caching_v1.py b/tests/st/python/prefix_caching/test_vllm_mf_qwen_7b_prefix_caching_v1.py similarity index 90% rename from tests/st/python/cases_parallel/vllm_mf_qwen_7b_prefix_caching_v1.py rename to tests/st/python/prefix_caching/test_vllm_mf_qwen_7b_prefix_caching_v1.py index 6fb0c6084b228f2b8a97f768b05b54c8a86f1925..ca119cc17de041835c132ea453d1a691ff8f499c 100644 --- a/tests/st/python/cases_parallel/vllm_mf_qwen_7b_prefix_caching_v1.py +++ b/tests/st/python/prefix_caching/test_vllm_mf_qwen_7b_prefix_caching_v1.py @@ -18,14 +18,10 @@ import pytest from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() env_vars = { @@ -41,6 +37,7 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_mf_qwen_7b_prefix_caching(): """ test case qwen_7b_prefix_caching @@ -63,11 +60,10 @@ def test_mf_qwen_7b_prefix_caching(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM( - model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", - max_model_len=8192, - block_size=16, - tensor_parallel_size=2) + llm = LLM(model=MODEL_PATH["Qwen2.5-7B-Instruct"], + max_model_len=8192, + block_size=16, + tensor_parallel_size=2) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/quantization/__init__.py b/tests/st/python/quantization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/cell/test_smooth_quant.py b/tests/st/python/quantization/test_smooth_quant.py similarity index 98% rename from tests/st/python/cell/test_smooth_quant.py rename to tests/st/python/quantization/test_smooth_quant.py index 4be2a5f7e5a7cda1cf71684ea460d794a2e597bd..50cd623ace308c72b6cf3760fe101fa3afaf0a11 100644 --- a/tests/st/python/cell/test_smooth_quant.py +++ b/tests/st/python/quantization/test_smooth_quant.py @@ -25,6 +25,9 @@ from mindspore.common.initializer import initializer from vllm_mindspore.model_executor.layers.quantization.smooth_quant_modelslim \ import A8W8DYNLinearMethod, A8W8LinearMethod, SmoothQuantModelSlimConfig +from tests.st.python.utils.common_utils import (teardown_function, + setup_function) + @pytest.fixture(params=[mstype.float16, mstype.bfloat16], ids=["float16", "bfloat16"]) diff --git a/tests/st/python/cases_parallel/vllm_deepseek_osl.py b/tests/st/python/quantization/test_vllm_deepseek_osl.py similarity index 81% rename from tests/st/python/cases_parallel/vllm_deepseek_osl.py rename to tests/st/python/quantization/test_vllm_deepseek_osl.py index 68c54fd79d8c1b1e15831ec79a590cd7cf0e28d4..8aa3864b3963a68df5af689c0a4b9e5ebb17eee4 100644 --- a/tests/st/python/cases_parallel/vllm_deepseek_osl.py +++ b/tests/st/python/quantization/test_vllm_deepseek_osl.py @@ -24,14 +24,10 @@ import pytest from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -50,9 +46,13 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level4 def test_deepseek_r1(): """ test case deepseek r1 w8a8 + + Mcore currently does not support the following test cases, + adjust the level to level 4 until it is re supported """ import vllm_mindspore from vllm import LLM, SamplingParams @@ -66,12 +66,11 @@ def test_deepseek_r1(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM( - model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-osl", - trust_remote_code=True, - gpu_memory_utilization=0.9, - tensor_parallel_size=2, - max_model_len=4096) + llm = LLM(model=MODEL_PATH["DeepSeek-R1-W8A8-osl"], + trust_remote_code=True, + gpu_memory_utilization=0.9, + tensor_parallel_size=2, + max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/cases_parallel/vllm_deepseek_smoothquant.py b/tests/st/python/quantization/test_vllm_deepseek_smoothquant.py similarity index 81% rename from tests/st/python/cases_parallel/vllm_deepseek_smoothquant.py rename to tests/st/python/quantization/test_vllm_deepseek_smoothquant.py index 748b43373edd7760a76ad5cfb44ee1cb11b9aaa1..e307cb2f30cf057da9958cd3e98058406f814cc5 100644 --- a/tests/st/python/cases_parallel/vllm_deepseek_smoothquant.py +++ b/tests/st/python/quantization/test_vllm_deepseek_smoothquant.py @@ -22,14 +22,10 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -48,9 +44,13 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level4 def test_deepseek_r1(): """ test case deepseek r1 w8a8 + + Mcore currently does not support the following test cases, + adjust the level to level 4 until it is re supported """ import vllm_mindspore from vllm import LLM, SamplingParams @@ -64,13 +64,11 @@ def test_deepseek_r1(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM( - model= - "/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-smoothquant-newconfig", - trust_remote_code=True, - gpu_memory_utilization=0.9, - tensor_parallel_size=2, - max_model_len=4096) + llm = LLM(model=MODEL_PATH["DeepSeek-R1-W8A8-smoothquant-newconfig"], + trust_remote_code=True, + gpu_memory_utilization=0.9, + tensor_parallel_size=2, + max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/cases_parallel/vllm_deepseek_smoothquant_mss.py b/tests/st/python/quantization/test_vllm_deepseek_smoothquant_mss.py similarity index 79% rename from tests/st/python/cases_parallel/vllm_deepseek_smoothquant_mss.py rename to tests/st/python/quantization/test_vllm_deepseek_smoothquant_mss.py index 3733060910bd671e29109add2da1c05b83b07661..45588b869822c256f2f1e40dd60b0bfbbb8c1fb3 100644 --- a/tests/st/python/cases_parallel/vllm_deepseek_smoothquant_mss.py +++ b/tests/st/python/quantization/test_vllm_deepseek_smoothquant_mss.py @@ -22,14 +22,10 @@ from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function, MODEL_PATH) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -48,9 +44,13 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level4 def test_deepseek_r1_mss(): """ test case deepseek r1 w8a8 mss + + Mcore currently does not support the following test cases, + adjust the level to level 4 until it is re supported """ import vllm_mindspore from vllm import LLM, SamplingParams @@ -64,14 +64,12 @@ def test_deepseek_r1_mss(): sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. 
- llm = LLM( - model= - "/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-smoothquant-newconfig", - trust_remote_code=True, - gpu_memory_utilization=0.9, - tensor_parallel_size=2, - num_scheduler_steps=8, - max_model_len=4096) + llm = LLM(model=MODEL_PATH["DeepSeek-R1-W8A8-smoothquant-newconfig"], + trust_remote_code=True, + gpu_memory_utilization=0.9, + tensor_parallel_size=2, + num_scheduler_steps=8, + max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/register_parallel_tests.json b/tests/st/python/register_parallel_tests.json new file mode 100644 index 0000000000000000000000000000000000000000..6307a4b396a72f369cf673bd576409225c076518 --- /dev/null +++ b/tests/st/python/register_parallel_tests.json @@ -0,0 +1,118 @@ +{ + "registered_910b_tests": [ + { + "test_node_id": "models/qwen2_5_vl/test_vllm_qwen2_5_vl_7b_v1.py::test_qwen2_5_vl_7b_v1", + "card_num": 2 + }, + { + "test_node_id": "models/qwen3_moe/test_vllm_qwen3_moe.py::test_vllm_qwen3_30b_a3b", + "card_num": 2 + }, + { + "test_node_id": "models/qwen3_moe/test_vllm_qwen3_moe.py::test_vllm_qwen3_30b_a3b_eager", + "card_num": 2 + }, + { + "test_node_id": "lora/test_multilora_inference.py::test_multilora_inference", + "card_num": 2 + }, + { + "test_node_id": "models/qwen2_5/test_vllm_qwen_7b_v1.py::test_vllm_qwen", + "card_num": 2 + }, + { + "test_node_id": "models/qwen2_5/test_vllm_qwen_7b_v1.py::test_qwen_enforce_eager", + "card_num": 2 + }, + { + "test_node_id": "models/deepseek/offline/test_vllm_deepseek_part.py::test_deepseek_r1", + "card_num": 2 + }, + { + "test_node_id": "models/deepseek/offline/test_vllm_deepseek_part.py::test_deepseek_mtp", + "card_num": 2 + }, + { + "test_node_id": "models/qwen3/test_vllm_mf_qwen3_8b.py::test_mf_qwen3_v0", + "card_num": 2 + }, + { + "test_node_id": "models/qwen3/test_vllm_mf_qwen3_8b.py::test_mf_qwen3_v1", + "card_num": 2 + }, + { + "test_node_id": "models/qwen2_5/test_vllm_qwen_7b.py::test_vllm_qwen", + "card_num": 2 + }, + { + "test_node_id": "models/deepseek/offline/test_vllm_deepseek_bf16_part.py::test_deepseek_r1_bf16", + "card_num": 2 + }, + { + "test_node_id": "models/deepseek/offline/test_vllm_deepseek_bf16_part_v1.py::test_deepseek_r1_bf16", + "card_num": 2 + }, + { + "test_node_id": "models/deepseek/offline/test_vllm_deepseek_part_v1.py::test_deepseek_r1", + "card_num": 2 + }, + { + "test_node_id": "models/qwen2_5/test_vllm_mf_qwen_7b.py::test_mf_qwen", + "card_num": 2 + }, + { + "test_node_id": "chunk_prefill/test_vllm_mf_qwen_7b_chunk_prefill.py::test_mf_qwen_7b_chunk_prefill", + "card_num": 2 + }, + { + "test_node_id": "chunk_prefill/test_vllm_mf_qwen_7b_chunk_prefill_v1.py::test_mf_qwen_7b_chunk_prefill", + "card_num": 2 + }, + { + "test_node_id": "sampling/test_vllm_sampling.py::test_vllm_sampling_n_logprobs", + "card_num": 2 + }, + { + "test_node_id": "mss/test_vllm_mf_qwen_7b_mss.py::test_mf_qwen_7b_mss", + "card_num": 2 + }, + { + "test_node_id": "prefix_caching/test_vllm_mf_qwen_7b_prefix_caching.py::test_mf_qwen_7b_prefix_caching", + "card_num": 2 + }, + { + "test_node_id": "prefix_caching/test_vllm_mf_qwen_7b_prefix_caching_v1.py::test_mf_qwen_7b_prefix_caching", + "card_num": 2 + }, + { + "test_node_id": "models/qwen2_5/test_vllm_mf_qwen_7b_v1.py::test_mf_qwen", + "card_num": 2 + }, + { + "test_node_id": "mss/test_vllm_mf_qwen_7b_cp_pc_mss.py::test_mf_qwen_7b_cp_pc_mss", 
+ "card_num": 2 + }, + { + "test_node_id": "quantization/test_vllm_deepseek_osl.py::test_deepseek_r1", + "card_num": 2 + }, + { + "test_node_id": "quantization/test_vllm_deepseek_smoothquant.py::test_deepseek_r1", + "card_num": 2 + }, + { + "test_node_id": "quantization/test_vllm_deepseek_smoothquant_mss.py::test_deepseek_r1_mss", + "card_num": 2 + }, + { + "test_node_id": "distributed/test_shm_broadcast.py::test_shm_broadcast", + "card_num": 4 + } + ], + "registered_310p_tests": [ + { + "test_node_id": "models/qwen3/test_vllm_mf_qwen3_8b.py::test_mf_qwen3_v1_310p", + "card_num": 2 + } + ] +} \ No newline at end of file diff --git a/tests/st/python/sampling/__init__.py b/tests/st/python/sampling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/test_sampler.py b/tests/st/python/sampling/test_sampler.py similarity index 99% rename from tests/st/python/test_sampler.py rename to tests/st/python/sampling/test_sampler.py index 4a6c4795d36252454ba1d209b226363545e23752..9d65a9dab47bf6727f0caebd9a323aae1a55797c 100644 --- a/tests/st/python/test_sampler.py +++ b/tests/st/python/sampling/test_sampler.py @@ -40,12 +40,8 @@ from vllm.model_executor.utils import set_random_seed from vllm.utils import Counter, is_pin_memory_available from vllm.sequence import SamplingParams from vllm_mindspore.sequence import SequenceData, SequenceGroupMetadata - -from tests.st.python.utils.cases_parallel import cleanup_subprocesses - - -def teardown_function(): - cleanup_subprocesses() +from tests.st.python.utils.common_utils import (teardown_function, + setup_function) class MockLogitsSampler(Sampler): diff --git a/tests/st/python/test_sampler_v1.py b/tests/st/python/sampling/test_sampler_v1.py similarity index 99% rename from tests/st/python/test_sampler_v1.py rename to tests/st/python/sampling/test_sampler_v1.py index 126629273f4ae1470f7ecf95a9ee8312c32f1623..51821889c9546b5117a1be254a78b6bed49a5c22 100644 --- a/tests/st/python/test_sampler_v1.py +++ b/tests/st/python/sampling/test_sampler_v1.py @@ -33,12 +33,8 @@ from vllm.utils import make_tensor_with_pad from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.sampler import Sampler -from tests.st.python.utils.cases_parallel import cleanup_subprocesses - - -def teardown_function(): - cleanup_subprocesses() - +from tests.st.python.utils.common_utils import (teardown_function, + setup_function) VOCAB_SIZE = 1024 NUM_OUTPUT_TOKENS = 20 diff --git a/tests/st/python/cases_parallel/vllm_sampling.py b/tests/st/python/sampling/test_vllm_sampling.py similarity index 94% rename from tests/st/python/cases_parallel/vllm_sampling.py rename to tests/st/python/sampling/test_vllm_sampling.py index d9578d54c7753db8ac82ef3f00be62dc46754360..8be1f78409622e7ec30adddf7bf13525d890f02d 100644 --- a/tests/st/python/cases_parallel/vllm_sampling.py +++ b/tests/st/python/sampling/test_vllm_sampling.py @@ -18,14 +18,10 @@ import pytest from unittest.mock import patch import os -from tests.st.python.utils.cases_parallel import cleanup_subprocesses +from tests.st.python.utils.common_utils import (teardown_function, + setup_function) from tests.st.python.utils.env_var_manager import EnvVarManager - -def teardown_function(): - cleanup_subprocesses() - - env_manager = EnvVarManager() env_manager.setup_mindformers_environment() # def env @@ -44,6 +40,7 @@ env_vars = { @patch.dict(os.environ, env_vars) +@pytest.mark.level0 def test_vllm_sampling_n_logprobs(): """ parameter n and 
logprobs test case diff --git a/tests/st/python/test_cases_parallel.py b/tests/st/python/test_cases_parallel.py index 87ee11cad68b309edddae4ded763063051ab2dd7..43d9cbe0ecdf5b0ca62e8267f92ab86664b06325 100644 --- a/tests/st/python/test_cases_parallel.py +++ b/tests/st/python/test_cases_parallel.py @@ -16,16 +16,262 @@ """test cases parallel""" import os +import json +import pytest +import importlib + from multiprocessing.pool import Pool +from .utils.common_utils import logger +from tests.st.python.utils.common_utils import (teardown_function, + setup_function) -import pytest +level_marks = ("level0", "level1", "level2", "level3", "level4") + +card_marks = ("env_onecard", "allcards", "env_single") + +platform_marks = ("platform_arm_ascend910b_training", "platform_ascend310p") + +PLATFORM_MAP = { + '910B': "platform_arm_ascend910b_training", + '310P': "platform_ascend310p" +} + +HAS_TESTS_REGISTERED = False + +registered_910b_tests = [] +registered_310p_tests = [] + + +def register_tests_by_platform(register_cases, register_list): + """ + Register function for specific platform + """ + for test_case in register_cases: + """ + card_num: number of occupied cards. + test_node_id: string in {test_file_path}::{test_function_name} format. + """ + card_num = test_case.get("card_num") + test_node_id = test_case.get("test_node_id") + if card_num is not None and test_node_id is not None: + register_list.append((card_num, test_node_id)) + else: + logger.warning("Invalid test case entry: %s", test_case) + + +def load_rejistered_tests_from_json(json_file): + """ + Register the tests to registered_910b_tests and registered_310p_tests. + """ + current_file_path = os.path.abspath(__file__) + current_dir = os.path.dirname(current_file_path) + register_json_path = os.path.join(current_dir, json_file) + with open(register_json_path) as f: + tests_cases = json.load(f) + + register_tests_by_platform(tests_cases.get("registered_910b_tests"), + register_list=registered_910b_tests) + register_tests_by_platform(tests_cases.get("registered_310p_tests"), + register_list=registered_310p_tests) + + +def tasks_resource_alloc(tasks: list[tuple[int]]) -> list[tuple[str]]: + """ + Allocate devices, lccl base port, hccl base port to tasks + according to device requirement of each task. + + For example: + [(2, "cases_parallel/vllm_task.py::test_1", "test_1.log")] + ==> [("export ASCEND_RT_VISIBLE_DEVICES=0,1 && + export LCAL_COMM_ID=127.0.0.1:10068 && " + "export HCCL_IF_BASE_PORT=61000 && " + "pytest -s -v cases_parallel/vllm_task.py::test_1 > test_1.log", + "test_1.log")] + + Args: + tasks (list[tuple[int]]): list of tasks. Each task contain 3 elements. + 1. device_req (int): Num of device requirements, + which will occur device_req devices, + device_req ports for lccl, + device_req ports for hccl. + 2. case_desc (str): The case description, + such as "path_to_case/case.py::target_case". + 3. log_file (str): The logging file path. + + Returns: + list[tuple[str]]: Append resource environment to the task commands. 
+ """ + device_limit = 8 + device_base = 0 + lccl_base_port = 20068 + hccl_base_port = 51000 + + out_tasks: list[tuple[str]] = [] + for task in tasks: + assert len(task) == 3 + resource_req, task_case, log_file = task + if not isinstance(resource_req, int): + raise TypeError( + "First argument of task should be a int or str, but got %s!", + str(type(resource_req))) + + device_str = ",".join( + [str(d) for d in range(device_base, device_base + resource_req)]) + lccl_str = f"127.0.0.1:{lccl_base_port}" + + commands = [ + f"export ASCEND_RT_VISIBLE_DEVICES={device_str}", + f"export LCAL_COMM_ID={lccl_str}", + f"export HCCL_IF_BASE_PORT={hccl_base_port}" + ] + + device_base += resource_req + lccl_base_port += resource_req + hccl_base_port += resource_req + + commands.append(f"pytest -s -v {task_case} > {log_file}") + out_tasks.append((" && ".join(commands), log_file)) + + if device_base > device_limit: + raise ValueError( + "Total require device %d exceeding resource limits %d !", + device_base, device_limit) + + return out_tasks + + +def generate_group_contents(tests_info, capacity=8): + ''' + Group and combine the registered tests according to the given rule, + which prioritizes those occupied more cards. Strive to maximize the + utilization of device capacity. + ''' + # Sort by the number of occupied devices in descending order. + tests_info_sorted = sorted(tests_info, key=lambda x: x[0], reverse=True) + groups = [] # The total number of cards occupied by each group. + group_contents = [] # Store test information for each group. + + for info in tests_info_sorted: + num = info[0] + # Check if there are any existing groups that can accommodate the + # current number. + found = False + for i in range(len(groups)): + if groups[i] + num <= capacity: + groups[i] += num + group_contents[i].append(info) + found = True + break + # If no feasible group is found, create a new group. + if not found: + groups.append(num) + group_contents.append([info]) + + return group_contents + + +def get_module_pytest_marks(module_path, function_name): + """Obtain the pytestmark of the test module.""" + current_file_path = os.path.abspath(__file__) + current_dir = os.path.dirname(current_file_path) + module_file_path = os.path.join(current_dir, *module_path.split('/')) + if not os.path.exists(module_file_path): + raise ImportError("module file %s does not exist.", module_file_path) + + spec = importlib.util.spec_from_file_location(module_path, + module_file_path) + module = importlib.util.module_from_spec(spec) + try: + spec.loader.exec_module(module) + except Exception as e: + raise ImportError("Loading module %s failed: %s", module_path, + str(e)) from e + + func = getattr(module, function_name, None) + if func is None: + raise AttributeError("Function %s not found in module %s.", + function_name, module_path) + + if hasattr(func, 'pytestmark'): + return [mark.name for mark in func.pytestmark] + else: + return [] + + +def collection_cases_by_level(test_register): + ''' + According to the level of registered tests, divide them into + corresponding maps. 
+ ''' + tests_info = { + f"{level_marks[0]}": [], + f"{level_marks[1]}": [], + f"{level_marks[2]}": [], + f"{level_marks[3]}": [], + f"{level_marks[4]}": [] + } + + for case in test_register: + module_path, test_function_name = case[1].split("::") + marks = get_module_pytest_marks(module_path, test_function_name) + level_mark = [mark for mark in marks if mark in level_marks] + card_mark = [mark for mark in marks if mark in card_marks] + + if len(card_mark) > 0: + raise ValueError( + "If the case has specified 'env_single', 'env_onecard' " + "or 'allcards', there is no need to register and execute " + "concurrently") + elif len(level_mark) > 1: + raise ValueError( + "Each test case can only specify a unique level, " + "but %s got %s.", case[1], str(len(level_mark))) + elif len(level_mark) == 1: + tests_info[level_mark[0]].append(case) + else: + raise ValueError( + "Case '%s' lacks necessary level mark, " + "please specify", case[1]) + return tests_info + + +def generate_parallel_cases(test_register, platform): + """Generate composite concurrent tests content for all registered tests.""" + tests_info = collection_cases_by_level(test_register) + + for level in level_marks: + generate_cases_with_level(tests_info[level], platform, level=level) + + +def generate_cases_with_level(tests_info, platform, level="level0"): + """ + Generate composite concurrent tests content based on the specified + test level. + """ + if len(tests_info) == 0: + return + + group_contents = generate_group_contents(tests_info, capacity=8) + + for i, per_group in enumerate(group_contents): + print(f"iter: {i}. per_group: {per_group}\n") + test_content = "" + test_content += ( + f"@pytest.mark.{level}\n" + f"@pytest.mark.{PLATFORM_MAP[platform]}\n" + f"@pytest.mark.env_single\n" + f"def test_cases_parallel_{platform}_{level}_part{i}():\n" + f" cases = [\n") -from tests.st.python.utils.cases_parallel import (cleanup_subprocesses, - tasks_resource_alloc) + for case in per_group: + node_id = case[1] + log_name = node_id.split('/')[-1].replace(".py::", '_') + '.log' + test_content += f" ({case[0]}, '{node_id}', '{log_name}'),\n" + test_content += (" ]\n" + " run_tasks(cases)\n\n\n") -def teardown_function(): - cleanup_subprocesses() + exec(test_content, globals()) def run_command(command_info): @@ -51,247 +297,18 @@ def run_tasks(cases): check_results(commands, results) -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend910b_training -@pytest.mark.env_single -def test_cases_parallel_part0(): +def load_and_generate_tests(): """ - Feature: test cases parallel. - Description: test cases parallel. - Expectation: Pass. + Load and generate tests form register_parallel_tests.json """ - cases = [ - (2, "cases_parallel/vllm_mf_qwen_7b.py::test_mf_qwen", - "vllm_mf_qwen_7b_test_mf_qwen.log"), - (2, "cases_parallel/vllm_mf_qwen_7b_chunk_prefill.py" - "::test_mf_qwen_7b_chunk_prefill", - "vllm_mf_qwen_7b_chunk_prefill_test_mf_qwen_7b_chunk_prefill.log"), - (2, "cases_parallel/vllm_mf_qwen_7b_chunk_prefill_v1.py" - "::test_mf_qwen_7b_chunk_prefill", - "vllm_mf_qwen_7b_chunk_prefill_v1_test_mf_qwen_7b_chunk_prefill.log"), - (2, "cases_parallel/vllm_sampling.py::test_vllm_sampling_n_logprobs", - "vllm_sampling_n_logprobs_v1.log") - ] - run_tasks(cases) - - -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend910b_training -@pytest.mark.env_single -def test_cases_parallel_part1(): - """ - Feature: test cases parallel. - Description: test cases parallel. - Expectation: Pass. 
- """ - cases = [ - (2, "cases_parallel/vllm_mf_qwen_7b_mss.py::test_mf_qwen_7b_mss", - "vllm_mf_qwen_7b_mss_test_mf_qwen_7b_mss.log"), - (2, "cases_parallel/vllm_mf_qwen_7b_prefix_caching.py" - "::test_mf_qwen_7b_prefix_caching", - "vllm_mf_qwen_7b_prefix_caching_test_mf_qwen_7b_prefix_caching.log"), - (2, "cases_parallel/vllm_mf_qwen_7b_prefix_caching_v1.py" - "::test_mf_qwen_7b_prefix_caching", - "vllm_mf_qwen_7b_prefix_caching_v1_test_mf_qwen_7b_prefix_caching.log" - ), - (2, "cases_parallel/vllm_mf_qwen_7b_v1.py::test_mf_qwen", - "vllm_mf_qwen_7b_v1_test_mf_qwen.log") - ] - run_tasks(cases) - - -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend910b_training -@pytest.mark.env_single -def test_cases_parallel_part2(): - """ - Feature: test cases parallel. - Description: test cases parallel. - Expectation: Pass. - """ - cases = [(2, "cases_parallel/vllm_qwen_7b.py::test_vllm_qwen", - "vllm_qwen_7b_test_vllm_qwen.log"), - (2, "cases_parallel/vllm_qwen_7b_v1.py::test_vllm_qwen", - "vllm_qwen_7b_v1_test_vllm_qwen.log"), - (4, "cases_parallel/shm_broadcast.py::test_shm_broadcast", - "shm_broadcast_test_shm_broadcast.log")] - run_tasks(cases) - - -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend910b_training -@pytest.mark.env_single -def test_cases_parallel_part3(): - """ - Feature: test cases parallel. - Description: test cases parallel. - Expectation: Pass. - """ - cases = [ - (2, "cases_parallel/vllm_deepseek_bf16_part.py::test_deepseek_r1_bf16", - "vllm_deepseek_bf16_part_test_deepseek_r1_bf16.log"), - (2, - "cases_parallel/vllm_deepseek_bf16_part_v1.py::test_deepseek_r1_bf16", - "vllm_deepseek_bf16_part_v1_test_deepseek_r1_bf16.log"), - (2, "cases_parallel/vllm_deepseek_part.py::test_deepseek_r1", - "vllm_deepseek_part_test_deepseek_r1.log"), - (2, "cases_parallel/vllm_deepseek_part_v1.py::test_deepseek_r1", - "vllm_deepseek_part_v1_test_deepseek_r1.log"), - ] - run_tasks(cases) - - -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend910b_training -@pytest.mark.env_single -def test_cases_parallel_part4(): - """ - Feature: test cases parallel. - Description: test cases parallel. - Expectation: Pass. - """ - cases = [ - (2, "cases_parallel/vllm_mf_qwen3_8b.py::test_mf_qwen3_v0", - "vllm_mf_qwen3_8b_test_mf_qwen3.log"), - (2, "cases_parallel/vllm_mf_qwen3_8b.py::test_mf_qwen3_v1", - "vllm_mf_qwen3_8b_v1_test_mf_qwen3.log"), - (1, "cases_parallel/vllm_mf_telechat2_7b.py::test_mf_telechat2_7b", - "vllm_mf_telechat2_7b_test_mf_telechat2_7b.log"), - (1, "cases_parallel/vllm_qwen3.py::test_vllm_qwen3_8b", - "vllm_qwen3_test_vllm_qwen3_8b.log"), - (1, "cases_parallel/vllm_qwen3.py::test_vllm_qwen3_0_6b", - "vllm_qwen3_test_vllm_qwen3_0_6b.log"), - (1, "cases_parallel/vllm_llama3.py::test_vllm_llama3_8b", - "vllm_llama3_8b_test_vllm_llama3.log") - ] - run_tasks(cases) - - -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend910b_training -@pytest.mark.env_single -def test_cases_parallel_part5(): - """ - Feature: test cases parallel. - Description: test cases parallel. - Expectation: Pass. 
- """ - cases = [ - (2, "cases_parallel/multilora_inference.py::test_multilora_inference", - "multilora_inference_test_multilora_inference.log"), - (2, "cases_parallel/vllm_qwen_7b_v1.py::test_qwen_enforce_eager", - "vllm_qwen_7b_v1_test_qwen_enforce_eager.log"), - (2, "cases_parallel/vllm_deepseek_part.py::test_deepseek_mtp", - "vllm_deepseek_part_test_deepseek_mtp.log"), - (1, "cases_parallel/vllm_qwen3.py::test_qwen3_enforce_eager", - "vllm_qwen3_test_qwen3_enforce_eager.log"), - ] - run_tasks(cases) - - -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend910b_training -@pytest.mark.env_single -def test_cases_parallel_part6(): - """ - Feature: test cases parallel. - Description: test cases parallel. - Expectation: Pass. - """ - cases = [ - (2, "cases_parallel/vllm_qwen3_moe.py::test_vllm_qwen3_30b_a3b", - "test_vllm_qwen3_30b_a3b.log"), - (2, "cases_parallel/vllm_qwen3_moe.py::test_vllm_qwen3_30b_a3b_eager", - "test_vllm_qwen3_30b_a3b_eager.log"), - ] - run_tasks(cases) - - -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend910b_training -@pytest.mark.env_single -def test_cases_parallel_part7(): - """ - Feature: test cases parallel. - Description: test cases parallel. - Expectation: Pass. - """ - cases = [ - (2, "cases_parallel/vllm_qwen2_5_vl_7b_v1.py::test_qwen2_5_vl_7b_v1", - "vllm_qwen2_5_vl_7b_v1.log"), - (1, "cases_parallel/vllm_qwen2_5_vl_7b_v1.py" - "::test_qwen2_5_vl_7b_v1_enforce_eager", - "vllm_qwen2_5_vl_7b_v1_enforce_eager.log"), - (1, "cases_parallel/vllm_qwen2_5_vl_7b_v1.py" - "::test_qwen2_5_vl_7b_v1_video_infer", - "vllm_qwen2_5_vl_7b_v1_video_infer.log"), - ] - run_tasks(cases) - - -@pytest.mark.level4 -@pytest.mark.platform_arm_ascend910b_training -@pytest.mark.env_single -def test_cases_parallel_level4_part0(): - """ - Feature: test cases parallel. - Description: - vllm_mf_qwen_7b_cp_pc_mss.py::test_mf_qwen_7b_cp_pc_mss: - accuracy error happens occasionally - Expectation: Pass. 
- """ - cases = [(2, "cases_parallel/vllm_mf_qwen_7b_cp_pc_mss.py" - "::test_mf_qwen_7b_cp_pc_mss", - "vllm_mf_qwen_7b_cp_pc_mss_test_mf_qwen_7b_cp_pc_mss.log")] - run_tasks(cases) + global HAS_TESTS_REGISTERED + if not HAS_TESTS_REGISTERED: + load_rejistered_tests_from_json("register_parallel_tests.json") - -@pytest.mark.level4 -@pytest.mark.platform_arm_ascend910b_training -@pytest.mark.env_single -def test_cases_parallel_level4_mcore1(): - """ - Mcore currently does not support the following test cases, - adjust the level to level 4 until it is re supported - """ - cases = [ - (2, "cases_parallel/vllm_deepseek_osl.py::test_deepseek_r1", - "vllm_deepseek_osl_test_deepseek_r1.log"), - (2, "cases_parallel/vllm_deepseek_smoothquant.py::test_deepseek_r1", - "vllm_deepseek_smoothquant_test_deepseek_r1.log"), - (2, "cases_parallel/vllm_deepseek_smoothquant_mss.py" - "::test_deepseek_r1_mss", - "vllm_deepseek_smoothquant_mss_test_deepseek_r1_mss.log") - ] - run_tasks(cases) - - -@pytest.mark.level4 -@pytest.mark.platform_arm_ascend910b_training -@pytest.mark.env_single -def test_cases_parallel_level4_mcore2(): - """ - Mcore currently does not support the following test cases, - adjust the level to level 4 until it is re supported - """ - cases = [ - (2, "cases_parallel/vllm_deepseek_a8w4.py::test_deepseek_r1_a8w4", - "vllm_deepseek_a8w4_test_deepseek_r1_a8w4.log"), - ] - run_tasks(cases) + # Dynamically generate test cases + generate_parallel_cases(registered_910b_tests, platform="910B") + generate_parallel_cases(registered_310p_tests, platform="310P") + HAS_TESTS_REGISTERED = True -@pytest.mark.level0 -@pytest.mark.platform_ascend310p -@pytest.mark.env_single -def test_cases_parallel_310p_part0(): - """ - Feature: test cases parallel in 310p. - Description: test cases parallel. - Expectation: Pass. - """ - cases = [ - (2, "cases_parallel/vllm_mf_qwen3_8b.py::test_mf_qwen3_v1_310p", - "vllm_mf_qwen3_8b_v1_310p_test_mf_qwen3.log"), - ] - run_tasks(cases) +load_and_generate_tests() diff --git a/tests/st/python/tool_use/__init__.py b/tests/st/python/tool_use/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/st/python/test_deepseekv3_tool_parser.py b/tests/st/python/tool_use/test_deepseekv3_tool_parser.py similarity index 98% rename from tests/st/python/test_deepseekv3_tool_parser.py rename to tests/st/python/tool_use/test_deepseekv3_tool_parser.py index c641fed558b603f6e202a94175dbd23747866868..729b753282f463344672e6bf43598bc48f0ea3ec 100644 --- a/tests/st/python/test_deepseekv3_tool_parser.py +++ b/tests/st/python/tool_use/test_deepseekv3_tool_parser.py @@ -31,6 +31,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager +from tests.st.python.utils.common_utils import (teardown_function, + setup_function) + class StreamingToolReconstructor: diff --git a/tests/st/python/utils/cases_parallel.py b/tests/st/python/utils/cases_parallel.py deleted file mode 100644 index 29216fb4eebb7c49c79e8035851aa4053c9131e7..0000000000000000000000000000000000000000 --- a/tests/st/python/utils/cases_parallel.py +++ /dev/null @@ -1,103 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Copyright 2025 Huawei Technologies Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import contextlib -import os -import signal -import psutil - - -def tasks_resource_alloc(tasks: list[tuple[int]]) -> list[tuple[str]]: - """ - Allocate devices, lccl base port, hccl base port to tasks - according to device requirement of each task. - - For example: - [(2, "cases_parallel/vllm_task.py::test_1", "test_1.log")] - ==> [("export ASCEND_RT_VISIBLE_DEVICES=0,1 && - export LCAL_COMM_ID=127.0.0.1:10068 && " - "export HCCL_IF_BASE_PORT=61000 && " - "pytest -s -v cases_parallel/vllm_task.py::test_1 > test_1.log", - "test_1.log")] - - Args: - tasks (list[tuple[int]]): list of tasks. Each task contain 3 elements. - 1. device_req (int): Num of device requirements, - which will occur device_req devices, - device_req ports for lccl, - device_req ports for hccl. - 2. case_desc (str): The case description, - such as "path_to_case/case.py::target_case". - 3. log_file (str): The logging file path. - - Returns: - list[tuple[str]]: Append resource environment to the task commands. - """ - device_limit = 8 - device_base = 0 - lccl_base_port = 10068 - hccl_base_port = 61000 - - out_tasks: list[tuple[str]] = [] - for task in tasks: - assert len(task) == 3 - resource_req, task_case, log_file = task - if not isinstance(resource_req, int): - raise TypeError( - "First argument of task should be a int or str, but got %s!", - str(type(resource_req))) - - device_str = ",".join( - [str(d) for d in range(device_base, device_base + resource_req)]) - lccl_str = f"127.0.0.1:{lccl_base_port}" - ''' - env_var = os.environ.copy() - env_var.update({ - "ASCEND_RT_VISIBLE_DEVICES": f"{device_str}", - "LCAL_COMM_ID": f"{lccl_str}", - "HCCL_IF_BASE_PORT": f"{hccl_base_port}" - }) - - out_tasks.append((env_var, task_case, log_file)) - - ''' - commands = [ - f"export ASCEND_RT_VISIBLE_DEVICES={device_str}", - f"export LCAL_COMM_ID={lccl_str}", - f"export HCCL_IF_BASE_PORT={hccl_base_port}" - ] - - device_base += resource_req - lccl_base_port += resource_req - hccl_base_port += resource_req - - commands.append(f"pytest -s -v {task_case} > {log_file}") - out_tasks.append((" && ".join(commands), log_file)) - - if device_base > device_limit: - raise ValueError( - "Total require device %d exceeding resource limits %d !", - device_base, device_limit) - - return out_tasks - - -def cleanup_subprocesses() -> None: - """Cleanup all subprocesses raise by main test process.""" - cur_proc = psutil.Process(os.getpid()) - children = cur_proc.children(recursive=True) - for child in children: - with contextlib.suppress(ProcessLookupError): - os.killpg(child.pid, signal.SIGKILL) diff --git a/tests/st/python/utils/common_utils.py b/tests/st/python/utils/common_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3bcc2b40940d8403e90618188a1a466584f3b98e --- /dev/null +++ b/tests/st/python/utils/common_utils.py @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Copyright 2025 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +common utils +''' + +import contextlib +import logging +import os +import yaml +import signal +import psutil +import subprocess +import random + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') + +MODEL_PATH = { + "Llama-3.1-8B-Instruct": + "/home/workspace/mindspore_dataset/weight/Llama-3.1-8B-Instruct", + "telechat2_7b": + "/home/workspace/mindspore_dataset/weight/telechat2_7b", + "Qwen2.5-7B-Instruct": + "/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + "Qwen2.5-7B-Lora-Law": + "/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Lora-Law", + "Qwen2.5-VL-7B-Instruct": + "/home/workspace/mindspore_dataset/weight/Qwen2.5-VL-7B-Instruct", + "Qwen3-0.6B": + "/home/workspace/mindspore_dataset/weight/Qwen3-0.6B", + "Qwen3-8B": + "/home/workspace/mindspore_dataset/weight/Qwen3-8B", + "Qwen3-30B-A3B": + "/home/workspace/mindspore_dataset/weight/Qwen3-30B-A3B", + "DeepSeek-R1-bf16": + "/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16", + "DeepSeek-R1-W8A8": + "/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", + "DeepSeek-R1-MTP": + "/home/workspace/mindspore_dataset/weight/DeepSeek-R1-MTP", + "DeepSeek-R1-W8A8-osl": + "/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-osl", + "DeepSeek-R1-W8A8-smoothquant-newconfig": + "/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-" + "smoothquant-newconfig" +} +''' +HAS_MODEL_PATH_REGISTERED = False + + +def register_model_path_from_yaml(yaml_file): + """ + Register the model path to MODEL_PATH dict. + """ + global HAS_MODEL_PATH_REGISTERED + if not HAS_MODEL_PATH_REGISTERED: + current_file_path = os.path.abspath(__file__) + current_dir = os.path.dirname(current_file_path) + module_info_yaml_path = os.path.join(current_dir, yaml_file) + with open(module_info_yaml_path) as f: + models = yaml.safe_load(f) + + MODEL_PATH.update({ + model_name: + f"/home/workspace/mindspore_dataset/weight/{model_name}" + for model_name in models + }) + HAS_MODEL_PATH_REGISTERED = True + + +register_model_path_from_yaml("model_info.yaml") +''' + + +def setup_function(): + """pytest will call the setup_function before case executes.""" + result = subprocess.run( + ["npu-smi", "info"], + capture_output=True, + text=True, + check=True, + ) + logger.warning(result.stdout) + + # Specify device through environment variables to avoid the problem of + # delayed resource release in single card cases. + device_id = os.environ.pop("DEVICE_ID", None) + if device_id is not None: + import mindspore as ms + ms.set_device("Ascend", 0) + logger.warning("This case is assigned to device:%s", str(device_id)) + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = device_id + + # Randomly specify LCCL and HCCL ports for cases without specified port, + # mainly in single card concurrent scenarios, to avoid port conflicts. 
+ lccl_port = os.getenv("LCAL_COMM_ID", None)
+ if not lccl_port:
+ lccl_port = random.randint(61000, 65535)
+ os.environ["LCAL_COMM_ID"] = f"127.0.0.1:{lccl_port}"
+
+ hccl_port = os.getenv("HCCL_IF_BASE_PORT", None)
+ if not hccl_port:
+ hccl_port = random.randint(61000, 65535)
+ os.environ["HCCL_IF_BASE_PORT"] = str(hccl_port)
+
+
+def cleanup_subprocesses(pid=None) -> None:
+ """Clean up all subprocesses spawned by the main test process."""
+ pid = pid if pid else os.getpid()
+ cur_proc = psutil.Process(pid)
+ children = cur_proc.children(recursive=True)
+ for child in children:
+ with contextlib.suppress(ProcessLookupError):
+ os.killpg(child.pid, signal.SIGKILL)
+
+
+def teardown_function():
+ """pytest will call the teardown_function after the case function completes."""
+ cleanup_subprocesses()
diff --git a/tests/st/python/utils/model_info.yaml b/tests/st/python/utils/model_info.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..89bacd20aeffc65e39efa1b8a8a1b2b6423d65da
--- /dev/null
+++ b/tests/st/python/utils/model_info.yaml
@@ -0,0 +1,58 @@
+# All registered model weights can be obtained from
+# https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/
+# They will be mapped to /home/workspace/mindspore_dataset/weight/ in the CI environment
+
+Llama-3.1-8B-Instruct:
+ description: "Llama-3.1-8B, HF default configuration, sourced from https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct"
+ archive_addr: "https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/Llama-3.1-8B-Instruct"
+
+telechat2_7b:
+ description: "TeleChat2 7B, HF default configuration, sourced from https://huggingface.co/Tele-AI/TeleChat2-7B, then converted from the bin format to safetensors"
+ archive_addr: "https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/telechat2_7b"
+
+Qwen2.5-7B-Instruct:
+ description: "Qwen2.5-7B, HF default configuration, sourced from https://huggingface.co/Qwen/Qwen2.5-7B-Instruct"
+ archive_addr: "https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct"
+
+Qwen2.5-7B-Lora-Law:
+ description: "Qwen2.5-7B-Lora-Law, HF default configuration, sourced from https://huggingface.co/qingpingwan/Qwen2.5-7B-Lora-Law"
+ archive_addr: "https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/Qwen2.5-7B-Lora-Law"
+
+Qwen2.5-VL-7B-Instruct:
+ description: "Qwen2.5-VL-7B-Instruct, HF default configuration, sourced from https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct"
+ archive_addr: "https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/Qwen2.5-VL-7B-Instruct"
+
+Qwen3-0.6B:
+ description: "Qwen3-0.6B, HF default configuration, sourced from https://huggingface.co/Qwen/Qwen3-0.6B"
+ archive_addr: "https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/Qwen3-0.6B"
+
+Qwen3-8B:
+ description: "Qwen3-8B, HF default configuration, sourced from https://huggingface.co/Qwen/Qwen3-8B"
+ archive_addr: "https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/Qwen3-8B"
+
+Qwen3-30B-A3B:
+ description: "Qwen3-30B-A3B, HF default configuration, sourced from https://huggingface.co/Qwen/Qwen3-30B-A3B"
+ archive_addr: "https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/Qwen3-30B-A3B"
+
+DeepSeek-R1-bf16:
+ description: "DeepSeek-R1-bf16, HF default configuration"
+ archive_addr: "https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16"
+
+DeepSeek-R1-W8A8:
+ description: "DeepSeek-R1-W8A8, modify the 'num_hidden_layers' to 4 
based on the default quantization configuration"
+ archive_addr: "https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8"
+
+DeepSeek-R1-MTP:
+ description: "DeepSeek-R1 with MTP. Perform the following modifications based on the default quantization configuration:
+ 1. Modify the 'num_hidden_layers' to 4
+ 2. Delete the content after layer.4 in quant_model_weight_w8a8_dynamic.index.json, and change the layer.61 corresponding to MTP in model.safetensors.index.json to layer.4
+ 3. Delete the content of layers 0-60 in model.safetensors.index.json"
+ archive_addr: "https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/DeepSeek-R1-MTP"
+
+DeepSeek-R1-W8A8-osl:
+ description: "DeepSeek-R1 with osl quantization"
+ archive_addr: "https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-osl"
+
+DeepSeek-R1-W8A8-smoothquant-newconfig:
+ description: "DeepSeek-R1 with the smoothquant new config"
+ archive_addr: "https://tools.mindspore.cn/dataset/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-smoothquant-newconfig"
\ No newline at end of file
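
Note: the disabled register_model_path_from_yaml helper in tests/st/python/utils/common_utils.py hints at how model_info.yaml is meant to be consumed: every top-level key in the YAML resolves to a weight directory under /home/workspace/mindspore_dataset/weight/ in the CI environment. Below is a minimal, self-contained sketch of that lookup; it is illustrative only, not part of this patch, and the resolve_weight_path name is hypothetical.

# Illustrative sketch (not part of this patch): mirrors the disabled
# register_model_path_from_yaml helper in common_utils.py.
import os
import yaml

WEIGHT_ROOT = "/home/workspace/mindspore_dataset/weight"


def resolve_weight_path(model_name: str,
                        yaml_file: str = "model_info.yaml") -> str:
    """Map a model name registered in model_info.yaml to its CI weight dir."""
    yaml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             yaml_file)
    with open(yaml_path) as f:
        # Top-level keys of model_info.yaml are the registered model names.
        models = yaml.safe_load(f)
    if model_name not in models:
        raise KeyError(f"{model_name} is not registered in {yaml_file}")
    return os.path.join(WEIGHT_ROOT, model_name)


# Example: resolve_weight_path("Qwen2.5-7B-Instruct")
# -> "/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct"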