diff --git a/docs/features/dummy_optimizer.md b/docs/features/dummy_optimizer.md
index 396c1f82d5c9b34de15e2dee3cedc2d5ef35c8a3..62c8da4466ddedba36e96b6fad10e0bc25b8d868 100644
--- a/docs/features/dummy_optimizer.md
+++ b/docs/features/dummy_optimizer.md
@@ -11,7 +11,7 @@
 
 ## Usage
 
-1. Import the patch module in the model entry script (already supported for InternVL);
+1. Import the patch module in the model entry script (already supported for InternVL/Qwen2VL);
 
 ```python
 from mindspeed_mm.patchs import dummy_optimizer_patch
diff --git a/examples/qwen2vl/README.md b/examples/qwen2vl/README.md
index 6f4ed3d6a981cf603c35e8eaab2865d4b5676bd2..cc65b7ad2c966742e8d0b94f967ac00eabcbf05d 100644
--- a/examples/qwen2vl/README.md
+++ b/examples/qwen2vl/README.md
@@ -139,8 +139,8 @@ model_config = MODEL_CONFIG_DICT[model_size]
 
 #PP parameters: 7B
 pp_size = 4
-vit_pipeline_num_layers = [32, 0, 0, 0] # LLM layers placed on each device; the sum must equal llm_num_layers and match pipeline_num_layers in model.json
-llm_pipeline_num_layers = [1, 6, 11, 10] # ViT layers placed on each device; the sum must equal vit_num_layers and match pipeline_num_layers in model.json
+vit_pipeline_num_layers = [32, 0, 0, 0] # ViT layers placed on each device; the sum must equal vit_num_layers and match pipeline_num_layers in model.json
+llm_pipeline_num_layers = [1, 6, 11, 10] # LLM layers placed on each device; the sum must equal llm_num_layers and match pipeline_num_layers in model.json
 tp_size = 1
 ```
 
@@ -320,6 +320,10 @@ NNODES=1
 NODE_RANK=0
 WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES))
 ```
+Note: when PP is enabled, the `pipeline_num_layer` settings of `vision_encoder` and `text_decoder` in `model.json` control their respective PP partitioning. The pipeline must place all `vision_encoder` layers before any `text_decoder` layers.
+For example, the 7B defaults `[32,0,0,0]` and `[1,6,11,10]` mean that, within the PP domain, the first device holds the 32 `vision_encoder` layers followed by 1 `text_decoder` layer, the second device holds the next 6 `text_decoder` layers, the third the next 11, and the fourth the final 10. `text_decoder` layers must not be placed while `vision_encoder` layers remain unplaced (e.g. `[30,2,0,0]` with `[1,6,11,10]` is an invalid configuration).
+
+Also note that if every parameter on a device is frozen, that device has no gradients (e.g. the PP configuration `[30,2,0,0]` with `[0,7,11,10]` when `vision_encoder` is frozen). In that case, add `--enable-dummy-optimizer` to `GPT_ARGS` in `finetune_qwen2vl_7b.sh`; see the [dummy_optimizer feature documentation](https://gitee.com/ascend/MindSpeed-MM/blob/master/docs/features/dummy_optimizer.md).
 
 #### 3. Launch fine-tuning
 
diff --git a/mindspeed_mm/models/vision/vision_model.py b/mindspeed_mm/models/vision/vision_model.py
index fa695cbd6ad9dfe900c97945e79c738851910080..ef7e3944a033fa3a035b7ab5b0d5ec968e63bd19 100644
--- a/mindspeed_mm/models/vision/vision_model.py
+++ b/mindspeed_mm/models/vision/vision_model.py
@@ -51,7 +51,8 @@ class VisionModel(MultiModalModule):
         self.post_process = post_process
         self.add_encoder = config.vision_encoder is not None
         self.add_projector = config.vision_projector is not None and self.post_process
-
+        self.projector = None  # with PP, only the last stage has a projector; default to None so freeze logic is unaffected
+        self.encoder = None  # likewise, stages that do not hold the encoder must see None
         if self.add_encoder:
             self.encoder = VISION_ENCODER_MAPPINGS[config.vision_encoder.model_id](
                 config=config.vision_encoder,
diff --git a/pretrain_qwen2vl.py b/pretrain_qwen2vl.py
index 7efa8efedffae2ed2f98d2e916cbf701e872542c..8a43a6f04476e0a6748b4d9da1210d172facb4ef 100644
--- a/pretrain_qwen2vl.py
+++ b/pretrain_qwen2vl.py
@@ -15,6 +15,7 @@ from mindspeed_mm.data.data_utils.utils import build_iterations
 from mindspeed_mm.models.qwen2vl_model import Qwen2VLModel
 from mindspeed_mm.training import pretrain
 from mindspeed_mm.utils.transformer_model_config import get_model_config
+from mindspeed_mm.patchs import dummy_optimizer_patch  # noqa
 
 
 def model_provider(pre_process=True, post_process=True):
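
Reviewer note, appended after the patch: the README hunk above encodes two rules (all `vision_encoder` layers must be placed before any `text_decoder` layer, and a PP stage whose parameters are all frozen produces no gradients and needs `--enable-dummy-optimizer`). Below is a minimal, self-contained sketch of those checks. `validate_pp_split` and `stages_needing_dummy_optimizer` are hypothetical helpers written for illustration, not functions in MindSpeed-MM; the layer totals default to the 7B example from the README (32 ViT layers, 28 LLM layers, since 1 + 6 + 11 + 10 = 28).

```python
# Illustrative sketch only: these helpers are hypothetical, not MindSpeed-MM APIs.

def validate_pp_split(vit_layers, llm_layers, vit_total=32, llm_total=28):
    """Check one (vit_pipeline_num_layers, llm_pipeline_num_layers) pair."""
    if len(vit_layers) != len(llm_layers):
        raise ValueError("both lists need one entry per PP stage")
    if sum(vit_layers) != vit_total:
        raise ValueError(f"vit layers must sum to {vit_total}")
    if sum(llm_layers) != llm_total:
        raise ValueError(f"llm layers must sum to {llm_total}")
    vit_remaining = vit_total
    for stage, (n_vit, n_llm) in enumerate(zip(vit_layers, llm_layers)):
        vit_remaining -= n_vit
        # text_decoder layers may only appear once every vision_encoder
        # layer has been placed (possibly earlier on this same stage).
        if n_llm > 0 and vit_remaining > 0:
            raise ValueError(
                f"stage {stage} places text_decoder layers while "
                f"{vit_remaining} vision_encoder layers are still unplaced"
            )


def stages_needing_dummy_optimizer(vit_layers, llm_layers, vit_frozen=True):
    """Stages holding only frozen parameters produce no gradients."""
    return [
        stage
        for stage, (n_vit, n_llm) in enumerate(zip(vit_layers, llm_layers))
        if n_llm == 0 and (n_vit == 0 or vit_frozen)
    ]


validate_pp_split([32, 0, 0, 0], [1, 6, 11, 10])  # OK: the 7B default
try:
    validate_pp_split([30, 2, 0, 0], [1, 6, 11, 10])  # invalid per the README
except ValueError as err:
    print(f"rejected: {err}")

# Prints [0]: stage 0 holds only frozen ViT layers, the frozen-vision_encoder
# [30,2,0,0] / [0,7,11,10] case that requires --enable-dummy-optimizer.
print(stages_needing_dummy_optimizer([30, 2, 0, 0], [0, 7, 11, 10]))
```

Running the sketch accepts the default `[32,0,0,0]`/`[1,6,11,10]` split, rejects `[30,2,0,0]`/`[1,6,11,10]`, and flags stage 0 of the frozen-ViT `[30,2,0,0]`/`[0,7,11,10]` case as needing the dummy optimizer, matching the three configurations discussed in the README note.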