Qwen2.5vl 3b 推理报错

一、问题现象（附报错日志上下文）：
按照 inference_qwen2_5_vl_7b.json和inference_qwen2_5_vl_7b.sh对脚本进行了修改，用于3b模型的推理
inference_qwen2_5_vl_3b.json
```
{
    "pipeline_class": "Qwen2VlPipeline",
    "img_context_token_id": 151655,
    "image_encoder": {
        "vision_encoder": {
            "model_id": "qwen2vit",
            "num_layers": 32,
            "hidden_size": 1280,
            "ffn_hidden_size": 3420,
            "gated_linear_unit": true,
            "llm_hidden_size": 1280,
            "num_attention_heads": 16,
            "hidden_dropout": 0.0,
            "attention_dropout": 0.0,
            "in_channels": 3,
            "patch_size": 14,
            "spatial_merge_size": 2,
            "temporal_patch_size": 2,
            "layernorm_epsilon": 1e-06,
            "normalization": "RMSNorm",
            "fp16": false,
            "bf16": true,
            "params_dtype": "bf16",
            "activation_func": "silu",
            "freeze": true,
            "use_fused_rotary_pos_emb": true,
            "post_layer_norm": false,
            "pipeline_num_layers": [32],
            "intermediate_size": 3420,
            "tokens_per_second": 2,
            "window_attn_size": 112,
            "fullatt_block_indexes": [
                7,
                15,
                23,
                31
            ]
        },
        "vision_projector": {
            "model_id": "lnmlp",
            "num_layers": 1,
            "num_attention_heads": 1,
            "gated_linear_unit": false,
            "bias_activation_fusion": false,
            "add_bias_linear": true,
            "input_size": 1280,
            "hidden_size": 2048,
            "ffn_hidden_size": 5120,
            "activation_func": "gelu",
            "bf16": true,
            "params_dtype": "bf16",
            "freeze": true,
            "layernorm_epsilon": 1e-06,
            "normalization": "RMSNorm"
        }
    },
    "text_decoder": {
        "model_id": "qwen2_5_lm",
        "num_layers": 36,
        "pipeline_num_layers": [36],
        "hidden_size": 2048,
        "ffn_hidden_size": 11008,
        "num_attention_heads": 16,
        "max_position_embeddings": 128000,
        "vocab_size": 151936,
        "rope_theta": 1000000.0,
        "untie_embeddings_and_output_weights": true,
        "disable_bias_linear": true,
        "attention_dropout": 0.0,
        "init_method_std": 0.01,
        "hidden_dropout": 0.0,
        "position_embedding_type": "mrope",
        "normalization": "RMSNorm",
        "activation_func": "silu",
        "use_fused_rotary_pos_emb": true,
        "attention_softmax_in_fp32": true,
        "params_dtype": "bf16",
        "bf16": true,
        "parallel_output": false,
        "group_query_attention": true,
        "num_query_groups": 2,
        "mrope_section": [16, 24, 24],
        "rope_scaling": null,
        "gated_linear_unit": true,
        "layernorm_epsilon": 1e-06,
        "add_bias_linear":false,
        "add_qkv_bias": true,
        "sequence_parallel": false,
        "tokenizer_type": "PretrainedFromHF",
        "is_encoder_decoder": false
    },
    "text_encoder": null,
    "video_encoder": null,
    "dtype": "bf16",
    "device": "npu",
    "tokenizer": {
        "hub_backend": "hf",
        "autotokenizer_name": "AutoTokenizer",
        "from_pretrained": "/root/autodl-tmp/ckpt/hf_path/Qwen2.5-VL-3B-Instruct"
    },
    "generation_config": {
        "bos_token_id": 151643,
        "do_sample": true,
        "kv_cache":true,
        "output_attentions": false,
        "output_hidden_states": false,
        "max_length": 20,
        "min_length": 0,
        "min_new_tokens": null,
        "constraints": null,
        "prompt_lookup_num_tokens": null,
        "guidance_scale": null,
        "bad_words_ids": null,
        "begin_suppress_tokens": null,
        "diversity_penalty": 0.0,
        "early_stopping": false,
        "encoder_no_repeat_ngram_size": 0,
        "encoder_repetition_penalty": 1.0,
        "epsilon_cutoff": 0.0,
        "eta_cutoff": 0.0,
        "exponential_decay_length_penalty": null,
        "forced_bos_token_id": null,
        "forced_decoder_ids": null,
        "forced_eos_token_id": null,
        "length_penalty": 1.0,
        "low_memory": null,
        "max_time": null,
        "no_repeat_ngram_size": 0,
        "num_assistant_tokens": 5,
        "num_assistant_tokens_schedule": "heuristic",
        "num_beam_groups": 1,
        "num_return_groups": 1,
        "num_return_sequences": 1,
        "output_scores": false,
        "output_logits": null,
        "penalty_alpha": null,
        "remove_invalid_values": false,
        "repetition_penalty": 1.05,
        "return_dict_in_generate": false,
        "sequence_bias": null,
        "suppress_tokens": null,
        "typical_p": 1.0,
        "force_words_ids": null,
        "num_beams": 1,
        "renormalize_logits": false,
        "use_cache": true,
        "eos_token_id": [
            151645,
            151643
        ],
        "max_new_tokens": 256,
        "pad_token_id": 151643,
        "vision_start_token_id": 151652,
        "image_token_id": 151655,
        "video_token_id": 151656,
        "temperature": 0.1,
        "top_k": 1,
        "top_p": 0.001,
        "dola_layers": null,
        "cache_implementation": null,
        "cache_config": null,
        "return_legacy_cache": null,
        "min_p": null,
        "token_healing": false,
        "watermarking_config": null,
        "decoder_start_token_id": null,
        "max_matching_ngram_size": null,
        "stop_strings": null
    },
    "min_pixels": 1003520,
    "max_pixels": 12845056,
    "image_path": "examples/qwen2vl/demo.jpeg",
    "prompts": "Describe this image and keep it within 100 words."
}

```

inference_qwen2_5_vl_3b.sh
```
#!/bin/bash
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# 该变量只用于规避megatron对其校验，对npu无效
export CUDA_DEVICE_MAX_CONNECTIONS=1
export ASCEND_SLOG_PRINT_TO_STDOUT=0
export ASCEND_GLOBAL_LOG_LEVEL=3
export TASK_QUEUE_ENABLE=2
export COMBINED_ENABLE=1
export CPU_AFFINITY_CONF=1
export HCCL_CONNECT_TIMEOUT=1200
export NPU_ASD_ENABLE=0
export ACLNN_CACHE_LIMIT=100000
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True

NPUS_PER_NODE=1
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))

MM_MODEL="./examples/qwen2.5vl/inference_qwen2_5_vl_3b.json"
LOAD_PATH="ckpt/mm_path/Qwen2.5-VL-3B-Instruct"

TP=1
PP=1
CP=1
MBS=1
GRAD_ACC_STEP=1
DP=$(($WORLD_SIZE/$TP/$PP/$CP))
GBS=$(($MBS*$GRAD_ACC_STEP*$DP))

DISTRIBUTED_ARGS="
    --nproc_per_node $NPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

GPT_ARGS="
    --use-mcore-models \
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --micro-batch-size ${MBS} \
    --global-batch-size ${GBS} \
    --tokenizer-type NullTokenizer \
    --vocab-size 152064 \
    --seq-length 1024 \
    --make-vocab-size-divisible-by 1 \
    --normalization RMSNorm \
    --use-fused-rmsnorm \
    --swiglu \
    --use-fused-swiglu \
    --seed 42 \
    --bf16 \
    --load $LOAD_PATH \
    --variable-seq-lengths \
    --use-flash-attn \
    --no-load-optim \
    --no-load-rng
"

MM_ARGS="
    --mm-model $MM_MODEL
"

OUTPUT_ARGS="
    --log-interval 1 \
    --save-interval 10000 \
    --eval-interval 10000 \
    --eval-iters 5000 \
"

torchrun $DISTRIBUTED_ARGS inference_vlm.py \
    $GPT_ARGS \
    $MM_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl

```
bash examples/qwen2.5vl/inference_qwen2_5_vl_3b.sh

报错信息：

```

building VLMModel ...
 > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 4065787904
 loading release checkpoint from ckpt/mm_path/Qwen2.5-VL-3B-Instruct
could not find arguments in the checkpoint ...
Traceback (most recent call last):
  File "/root/MindSpeed-MM/inference_vlm.py", line 29, in <module>
    main()
  File "/root/MindSpeed-MM/inference_vlm.py", line 25, in main
    vlm_pipeline_dict[inference_config.pipeline_class](inference_config)()
  File "/root/MindSpeed-MM/mindspeed_mm/tasks/inference/pipeline/qwen2vl_pipeline.py", line 23, in __init__
    self.model = ParallelWrapper(model_provider)
  File "/root/MindSpeed-MM/mindspeed_mm/tasks/inference/pipeline/parallel_wrapper.py", line 38, in __init__
    load_checkpoint(model, None, None, 'load')
  File "/root/MindSpeed-MM/megatron/training/checkpointing.py", line 839, in load_checkpoint
    model[0].load_state_dict(state_dict['model'], strict=strict)
  File "/root/miniconda3/envs/myenv1/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2152, in load_state_dict
    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for VLMModel:
        Missing key(s) in state_dict: "text_decoder.output_layer.weight".
[ERROR] 2025-05-22-14:55:10 (PID:8278, Device:0, RankID:-1) ERR99999 UNKNOWN application exception
[2025-05-22 14:55:17,038] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 8278) of binary: /root/miniconda3/envs/myenv1/bin/python3.10
Traceback (most recent call last):
  File "/root/miniconda3/envs/myenv1/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/root/miniconda3/envs/myenv1/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/root/miniconda3/envs/myenv1/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
    run(args)
  File "/root/miniconda3/envs/myenv1/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
    elastic_launch(
  File "/root/miniconda3/envs/myenv1/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/root/miniconda3/envs/myenv1/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
inference_vlm.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-05-22_14:55:17
  host      : autodl-container-c86d4c9b52-22e5566c
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 8278)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
```

Ascend/MindSpeed-MM
暂停

内容风险标识

评论 (5)

Ascend/MindSpeed-MM暂停 .gitee-modal { width: 500px !important; }

内容风险标识