diff --git a/README.md b/README.md index 077012f43cc77d4feb831a0034b16cb6694f71b0..0478d67827a691f53b87ab9c42b6f4d956879d69 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ MindSpeed-LLM,原仓名ModelLink,作为昇腾大模型训练框架,旨在为华为 [昇腾芯片](https://www.hiascend.com/) 提供端到端的大语言模型训练方案, 包含分布式预训练、分布式指令微调、分布式偏好对齐以及对应的开发工具链。 +***注：原包名 modellink 更改为 mindspeed_llm*** + --- ## MindSpeed-LLM 大模型训练框架功能特性概览 diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index df4e14cc669a71db53ab2a0ecd319251dd134d9a..0fe4492fbce7a14ef4902868b9d3381551a2f572 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -51,15 +51,15 @@ | 类型 | 开源代码地址 | 文件名 | 公网IP地址/公网URL地址/域名/邮箱地址 | 用途说明 | | ---- | ------------ | ----------------------------------------------------------- | ------------------------------------------------------------ | ------------ | -| 自研 | 不涉及 | modellink/model/language_model.py:85 | https://github.com/kingoflolz/mesh-transformer-jax/ | 详情地址 | +| 自研 | 不涉及 | mindspeed_llm/model/language_model.py:85 | https://github.com/kingoflolz/mesh-transformer-jax/ | 详情地址 | | 自研 | 涉及 | tests/test_tools/dist_test.py:6 | https://github.com/microsoft/DeepSpeed/blob/master/tests/unit/common.py | 源代码地址 | | 自研 | 涉及 | tests/pipeline/conftest.py:6 | https://github.com/microsoft/DeepSpeed/blob/master/tests/conftest.py | 源代码地址 | | 自研 | 涉及 | tests/ut/conftest.py:6 | https://github.com/microsoft/DeepSpeed/blob/master/tests/conftest.py | 源代码地址 | | 自研 | 不涉及 | examples/mcore/gemma/data_convert_gemma_pretrain.sh:5 | https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered/resolve/main/wikipedia-cn-20230720-filtered.json?download=true | 数据下载地址 | -| 自研 | 不涉及 | modellink/core/transformer/moe/moe_utils.py:135 | https://arxiv.org/abs/2101.03961 | 论文地址 | -| 自研 | 涉及 | modellink/tasks/data/collator.py:4 | https://github.com/OpenAccess-AI-Collective/axolotl/blob/main/src/axolotl/monkeypatch/utils.py | 源代码地址 | -| 自研 | 涉及 | modellink/core/distributed/distributed_data_parallel.py:126 | https://github.com/NVIDIA/TransformerEngine/pull/719 | 源代码地址 | -| 自研 | 不涉及 | modellink/core/datasets/gpt_dataset.py:159, 219 | https://gitee.com/ascend/MindSpeed-LLM/wikis/megatron%20data%20helpers%E5%8F%AF%E8%83%BD%E5%BC%95%E5%85%A5%E7%9A%84%E9%97%AE%E9%A2%98 | 详情地址 | +| 自研 | 不涉及 | mindspeed_llm/core/transformer/moe/moe_utils.py:135 | https://arxiv.org/abs/2101.03961 | 论文地址 | +| 自研 | 涉及 | mindspeed_llm/tasks/data/collator.py:4 | https://github.com/OpenAccess-AI-Collective/axolotl/blob/main/src/axolotl/monkeypatch/utils.py | 源代码地址 | +| 自研 | 涉及 | mindspeed_llm/core/distributed/distributed_data_parallel.py:126 | https://github.com/NVIDIA/TransformerEngine/pull/719 | 源代码地址 | +| 自研 | 不涉及 | mindspeed_llm/core/datasets/gpt_dataset.py:159, 219 | https://gitee.com/ascend/MindSpeed-LLM/wikis/megatron%20data%20helpers%E5%8F%AF%E8%83%BD%E5%BC%95%E5%85%A5%E7%9A%84%E9%97%AE%E9%A2%98 | 详情地址 | ## 公开接口声明 MindSpeed-LLM 暂时未发布wheel包,无正式对外公开接口,所有功能均通过shell脚本调用。5个入口脚本分别为[pretrain_gpt.py](https://gitee.com/ascend/MindSpeed-LLM/blob/master/pretrain_gpt.py), [inference.py](https://gitee.com/ascend/MindSpeed-LLM/blob/master/inference.py), [evaluation.py](https://gitee.com/ascend/MindSpeed-LLM/blob/master/evaluation.py), [preprocess_data.py](https://gitee.com/ascend/MindSpeed-LLM/blob/master/preprocess_data.py) 和 [convert_ckpt.py](https://gitee.com/ascend/MindSpeed-LLM/blob/master/convert_ckpt.py)。 diff --git a/convert_ckpt.py b/convert_ckpt.py index 0c05e446029d52cdb1a75f4b48a390bba5d4e168..2eaa06897a23010fdbd809c0872dc481a58cae75 100644 --- a/convert_ckpt.py +++ b/convert_ckpt.py @@ 
-20,10 +20,10 @@ import os import sys from functools import wraps import torch.multiprocessing as mp -from modellink import megatron_adaptor +from mindspeed_llm import megatron_adaptor from pretrain_gpt import model_provider -MODULE_ROOT = "modellink.tasks.checkpoint" +MODULE_ROOT = "mindspeed_llm.tasks.checkpoint" def load_plugin(plugin_type, name): diff --git a/docs/DEVELOP_GUIDE.md b/docs/DEVELOP_GUIDE.md index b8634b5f5e837c0e42f90a1fbd582109fb44c8da..69ffa3c6884c6f19ec433c5b92959cd7a26ac0ad 100644 --- a/docs/DEVELOP_GUIDE.md +++ b/docs/DEVELOP_GUIDE.md @@ -22,7 +22,7 @@ pip install -r requirements.txt pip install -e . git clone https://gitee.com/ascend/MindSpeed-LLM.git -cp -r MindSpeed-LLM/modellink Megatron-LM +cp -r MindSpeed-LLM/mindspeed_llm Megatron-LM ``` ### 3. 快速上手 diff --git a/docs/USER_GUIDE.md b/docs/USER_GUIDE.md index 2c19c964d7978880f77c8ea02451ccf8562e765b..e8386c42a883106e8fce99aad2f75a5720a023df 100644 --- a/docs/USER_GUIDE.md +++ b/docs/USER_GUIDE.md @@ -178,7 +178,7 @@ python convert_ckpt.py \ 【--model-type-hf】 -huggingface模型类别,默认为llama2,目前支持的模型见 [model_cfg.json](https://gitee.com/ascend/MindSpeed-LLM/blob/master/modellink/tasks/checkpoint/model_cfg.json) +huggingface模型类别,默认为llama2,目前支持的模型见 [model_cfg.json](https://gitee.com/ascend/MindSpeed-LLM/blob/master/mindspeed_llm/tasks/checkpoint/model_cfg.json) 【--tokenizer-model】 diff --git a/docs/features/alpaca_dataset.md b/docs/features/alpaca_dataset.md index f28edea33bdab7a3147d73830d616fad8b182f72..2a94ed29fb3479a1706ef3a60f87f1287d91544c 100644 --- a/docs/features/alpaca_dataset.md +++ b/docs/features/alpaca_dataset.md @@ -81,7 +81,7 @@ Alpaca风格示例: 【--prompt-type】 -用于指定模型模板,能够让base模型微调后能具备更好的对话能力。`prompt-type`的可选项可以在[templates](../../modellink/tasks/preprocess/templates.py)文件内查看。 +用于指定模型模板,能够让base模型微调后能具备更好的对话能力。`prompt-type`的可选项可以在[templates](../../mindspeed_llm/tasks/preprocess/templates.py)文件内查看。 【--handler-name】 diff --git a/docs/features/mc2.md b/docs/features/mc2.md index 6b3569947c47777c9add4a9143ede08d98e789ae..0824c773344a1737215ab49dc91f4d599f1b8e4b 100644 --- a/docs/features/mc2.md +++ b/docs/features/mc2.md @@ -5,7 +5,7 @@ 若在非指定版本中尝试采用该配置,可能触发系统级的异常行为,包括但不限于运行时错误。 -MindSpeed-LLM 中 mc2 功能默认关闭,如需开启MC2,需将 `modellink/training/arguments.py` 文件下,`validate_args_decorator`函数中的`args.use_mc2 = False`语句注释掉。 +MindSpeed-LLM 中 mc2 功能默认关闭,如需开启MC2,需将 `mindspeed_llm/training/arguments.py` 文件下,`validate_args_decorator`函数中的`args.use_mc2 = False`语句注释掉。 ## 问题分析 @@ -21,7 +21,7 @@ MC2通过融合算子的方式将matmul计算和集合通信操作进行融合 在python脚本侧将原本串行的matmul和all_gather/reduce_scatter操作通过MC2融合算子接口进行了融合。 -具体实现参见[link](../../modellink/core/tensor_parallel/ascend_turbo/mc2_linears_seq_parallel.py)。 +具体实现参见[link](../../mindspeed_llm/core/tensor_parallel/ascend_turbo/mc2_linears_seq_parallel.py)。 MC2算子接口参见[link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/apiref/apilist/ptaoplist_000449.html)。 diff --git a/docs/features/pairwise_dataset.md b/docs/features/pairwise_dataset.md index a412aa1897c9f17cfea9c153108d76a91e73e156..e59f394bf9810a4467085e4fd82c09d0603f897b 100644 --- a/docs/features/pairwise_dataset.md +++ b/docs/features/pairwise_dataset.md @@ -56,7 +56,7 @@ python ./preprocess_data.py \ 【--prompt-type】 -用于指定模型模板,能够让base模型微调后能具备更好的对话能力。`prompt-type`的可选项可以在[templates](../../modellink/tasks/preprocess/templates.py)文件内查看。 +用于指定模型模板,能够让base模型微调后能具备更好的对话能力。`prompt-type`的可选项可以在[templates](../../mindspeed_llm/tasks/preprocess/templates.py)文件内查看。 【--handler-name】 diff --git a/evaluation.py b/evaluation.py index 
6fd660ece162fc0a6ffdf8ce8620aabbe27f4bd0..382f7522d837e72ae25112774a4234ed1ecd443e 100644 --- a/evaluation.py +++ b/evaluation.py @@ -23,7 +23,7 @@ from typing import Union from torch import distributed as dist from transformers import AutoTokenizer -from modellink import megatron_adaptor +from mindspeed_llm import megatron_adaptor from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec, \ get_gpt_layer_local_spec from megatron.core.transformer.spec_utils import import_module @@ -33,16 +33,16 @@ from megatron.legacy.model import GPTModel from megatron.training.arguments import core_transformer_config_from_args from megatron.training.yaml_arguments import core_transformer_config_from_yaml -from modellink.tasks.inference.module import GPTModelInfer, MegatronModuleForCausalLM -from modellink.tasks.evaluation.utils import add_text_generate_args -from modellink.tasks.evaluation.eval_api.chat import Chat -from modellink.tasks.evaluation.eval_impl.boolq_eval import BoolqEval -from modellink.tasks.evaluation.eval_impl.gsm8k_eval import Gsm8kEval -from modellink.tasks.evaluation.eval_impl.mmlu_eval import MmluEval -from modellink.tasks.evaluation.eval_impl.ceval_exam import CEvalExam -from modellink.tasks.evaluation.eval_impl.bbh_eval import BBHEval -from modellink.tasks.evaluation.eval_impl.agi_eval import AGIEvalExam -from modellink.tasks.evaluation.eval_impl.human_eval import HumanEval +from mindspeed_llm.tasks.inference.module import GPTModelInfer, MegatronModuleForCausalLM +from mindspeed_llm.tasks.evaluation.utils import add_text_generate_args +from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat +from mindspeed_llm.tasks.evaluation.eval_impl.boolq_eval import BoolqEval +from mindspeed_llm.tasks.evaluation.eval_impl.gsm8k_eval import Gsm8kEval +from mindspeed_llm.tasks.evaluation.eval_impl.mmlu_eval import MmluEval +from mindspeed_llm.tasks.evaluation.eval_impl.ceval_exam import CEvalExam +from mindspeed_llm.tasks.evaluation.eval_impl.bbh_eval import BBHEval +from mindspeed_llm.tasks.evaluation.eval_impl.agi_eval import AGIEvalExam +from mindspeed_llm.tasks.evaluation.eval_impl.human_eval import HumanEval sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) logging.getLogger().setLevel(logging.INFO) diff --git a/examples/mcore/deepseek2/ckpt_convert_deepseek2_hf2mcore.sh b/examples/mcore/deepseek2/ckpt_convert_deepseek2_hf2mcore.sh index 4f1db493d90fb4a28e92e435226aef3d205928ea..e12d9144e795d14f482c2550c63964f54f474409 100644 --- a/examples/mcore/deepseek2/ckpt_convert_deepseek2_hf2mcore.sh +++ b/examples/mcore/deepseek2/ckpt_convert_deepseek2_hf2mcore.sh @@ -16,4 +16,4 @@ python convert_ckpt.py \ --load-dir ./model_from_hf/deepseek2-hf/ \ --save-dir ./model_weights/deepseek2-mcore/ \ --tokenizer-model ./model_from_hf/deepseek2-hf/ - --spec modellink.tasks.models.spec.deepseek_spec layer_spec + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec diff --git a/examples/mcore/deepseek2/ckpt_convert_deepseek2_mcore2hf.sh b/examples/mcore/deepseek2/ckpt_convert_deepseek2_mcore2hf.sh index 7011bb3d0a7ee10616aa21e24eb804a20d610d96..0b6b1becf2aa45896ae122bede791ed2468b01cd 100644 --- a/examples/mcore/deepseek2/ckpt_convert_deepseek2_mcore2hf.sh +++ b/examples/mcore/deepseek2/ckpt_convert_deepseek2_mcore2hf.sh @@ -15,4 +15,4 @@ python convert_ckpt.py \ --target-expert-parallel-size 1 \ --load-dir ./model_weights/deepseek-mcore/ \ --save-dir ./model_from_hf/deepseek2-hf/ \ - --spec 
modellink.tasks.models.spec.deepseek_spec layer_spec \ No newline at end of file + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ No newline at end of file diff --git a/examples/mcore/deepseek2/pretrain_deepseek2_100b_8k_C_ptd.sh b/examples/mcore/deepseek2/pretrain_deepseek2_100b_8k_C_ptd.sh index 7ca836418595d90995af4c9c76d1eade1fa54fcb..213166ac4013944eaf989264fef1401577cf7337 100644 --- a/examples/mcore/deepseek2/pretrain_deepseek2_100b_8k_C_ptd.sh +++ b/examples/mcore/deepseek2/pretrain_deepseek2_100b_8k_C_ptd.sh @@ -29,7 +29,7 @@ DISTRIBUTED_ARGS=" " MLA_ARGS=" - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --reuse-fp32-param \ --multi-head-latent-attention \ --qk-rope-head-dim 64 \ diff --git a/examples/mcore/deepseek2/pretrain_deepseek2_100b_8k_ptd.sh b/examples/mcore/deepseek2/pretrain_deepseek2_100b_8k_ptd.sh index 2a634658d8c672e80e3c39f015cce2a112a74ed8..a829452f6843d92b018b9b9395c606c14552e223 100644 --- a/examples/mcore/deepseek2/pretrain_deepseek2_100b_8k_ptd.sh +++ b/examples/mcore/deepseek2/pretrain_deepseek2_100b_8k_ptd.sh @@ -31,7 +31,7 @@ DISTRIBUTED_ARGS=" " MLA_ARGS=" - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --multi-head-latent-attention \ --qk-rope-head-dim 64 \ --qk-nope-head-dim 128 \ diff --git a/examples/mcore/deepseek2/pretrain_deepseek2_236b_8k_ptd.sh b/examples/mcore/deepseek2/pretrain_deepseek2_236b_8k_ptd.sh index a3eb05616114a60f82fb7033ee133c06b4380f37..fa648a8cfa66265b2366a53db526972eabd00fb8 100644 --- a/examples/mcore/deepseek2/pretrain_deepseek2_236b_8k_ptd.sh +++ b/examples/mcore/deepseek2/pretrain_deepseek2_236b_8k_ptd.sh @@ -74,7 +74,7 @@ ROPE_ARGS=" " GPT_ARGS=" - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --num-layers-per-virtual-pipeline-stage 2 \ --recompute-granularity full \ --recompute-method uniform \ diff --git a/examples/mcore/deepseek2/pretrain_deepseek2_60b_128k_ptd.sh b/examples/mcore/deepseek2/pretrain_deepseek2_60b_128k_ptd.sh index ad2838e597ae5791bbe8f5f14bccee50d2fbe7e7..c7873c80d7b3027001a1dfda9142fc0355377544 100644 --- a/examples/mcore/deepseek2/pretrain_deepseek2_60b_128k_ptd.sh +++ b/examples/mcore/deepseek2/pretrain_deepseek2_60b_128k_ptd.sh @@ -74,7 +74,7 @@ ROPE_ARGS=" " GPT_ARGS=" - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --recompute-granularity full \ --recompute-method uniform \ --recompute-num-layers 1 \ diff --git a/examples/mcore/deepseek2/pretrain_deepseek2_60b_8k_ptd.sh b/examples/mcore/deepseek2/pretrain_deepseek2_60b_8k_ptd.sh index 2a59c26d11f87a6b732f2ff4fc3bab084541b194..c9683312de03052887ebaea4163aa44e27ee0f94 100644 --- a/examples/mcore/deepseek2/pretrain_deepseek2_60b_8k_ptd.sh +++ b/examples/mcore/deepseek2/pretrain_deepseek2_60b_8k_ptd.sh @@ -29,7 +29,7 @@ DISTRIBUTED_ARGS=" " MLA_ARGS=" - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --multi-head-latent-attention \ --qk-rope-head-dim 64 \ --qk-nope-head-dim 128 \ diff --git a/examples/mcore/deepseek2/pretrain_deepseek2_ptd_8p.sh b/examples/mcore/deepseek2/pretrain_deepseek2_ptd_8p.sh index fa33f14e98f67adc9c649962ec754cb57c25629e..e8f530f57ab2e004b79485a2f9bc18a59e81be84 100644 --- 
a/examples/mcore/deepseek2/pretrain_deepseek2_ptd_8p.sh +++ b/examples/mcore/deepseek2/pretrain_deepseek2_ptd_8p.sh @@ -27,7 +27,7 @@ DISTRIBUTED_ARGS=" " MLA_ARGS=" - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --multi-head-latent-attention \ --qk-rope-head-dim 64 \ --qk-nope-head-dim 128 \ diff --git a/examples/mcore/deepseek2_coder/pretrain_deepseek2_ptd_8p.sh b/examples/mcore/deepseek2_coder/pretrain_deepseek2_ptd_8p.sh index fa33f14e98f67adc9c649962ec754cb57c25629e..e8f530f57ab2e004b79485a2f9bc18a59e81be84 100644 --- a/examples/mcore/deepseek2_coder/pretrain_deepseek2_ptd_8p.sh +++ b/examples/mcore/deepseek2_coder/pretrain_deepseek2_ptd_8p.sh @@ -27,7 +27,7 @@ DISTRIBUTED_ARGS=" " MLA_ARGS=" - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --multi-head-latent-attention \ --qk-rope-head-dim 64 \ --qk-nope-head-dim 128 \ diff --git a/examples/mcore/deepseek2_lite/convert_ckpt_deepseek2_lite_hf2mcore.sh b/examples/mcore/deepseek2_lite/convert_ckpt_deepseek2_lite_hf2mcore.sh index 4f9515818e28dbae770494b05114a455866fb03a..406b452d1194e566d2a7919dc2eec813c557d95f 100644 --- a/examples/mcore/deepseek2_lite/convert_ckpt_deepseek2_lite_hf2mcore.sh +++ b/examples/mcore/deepseek2_lite/convert_ckpt_deepseek2_lite_hf2mcore.sh @@ -14,7 +14,7 @@ python convert_ckpt.py \ --target-tensor-parallel-size 1 \ --target-pipeline-parallel-size 1 \ --target-expert-parallel-size 8 \ - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --load-dir ./model_from_hf/deepseek_v2_lite/ \ --save-dir ./model_weights/deepseek2_lite_mcore/ \ --tokenizer-model ./model_from_hf/deepseek_v2_lite/ diff --git a/examples/mcore/deepseek2_lite/convert_ckpt_deepseek2_lite_mcore2hf.sh b/examples/mcore/deepseek2_lite/convert_ckpt_deepseek2_lite_mcore2hf.sh index f09f648120a3a16b5580f9b8962d1a058501305c..1723790c3b381a9037d16e263afda4387785ade6 100644 --- a/examples/mcore/deepseek2_lite/convert_ckpt_deepseek2_lite_mcore2hf.sh +++ b/examples/mcore/deepseek2_lite/convert_ckpt_deepseek2_lite_mcore2hf.sh @@ -12,6 +12,6 @@ python convert_ckpt.py \ --target-tensor-parallel-size 1 \ --target-pipeline-parallel-size 1 \ --target-expert-parallel-size 1 \ - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --load-dir ./model_weights/deepseek2_lite_mcore/ \ --save-dir ./model/deepseek2_lite/ diff --git a/examples/mcore/deepseek2_lite/evaluate_deepseek2_lite_16b_ptd.sh b/examples/mcore/deepseek2_lite/evaluate_deepseek2_lite_16b_ptd.sh index 1f240b62c7e08a3fe9f1b26f085c219657be70df..d532e21a32015bea8be39090b154aaf85774be78 100644 --- a/examples/mcore/deepseek2_lite/evaluate_deepseek2_lite_16b_ptd.sh +++ b/examples/mcore/deepseek2_lite/evaluate_deepseek2_lite_16b_ptd.sh @@ -26,7 +26,7 @@ DISTRIBUTED_ARGS=" " python -m torch.distributed.launch $DISTRIBUTED_ARGS evaluation.py \ - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --task-data-path $DATA_PATH \ --task $TASK\ --load "${CHECKPOINT}" \ diff --git a/examples/mcore/deepseek2_lite/generate_deepseek2_lite_16b_ptd.sh b/examples/mcore/deepseek2_lite/generate_deepseek2_lite_16b_ptd.sh index e79f9f78fde71b8eb73b88ecc9a0b49eac26b0cf..8f86c68281129ffae9d5b79ec3fc245c8879d473 100644 --- 
a/examples/mcore/deepseek2_lite/generate_deepseek2_lite_16b_ptd.sh +++ b/examples/mcore/deepseek2_lite/generate_deepseek2_lite_16b_ptd.sh @@ -31,7 +31,7 @@ DISTRIBUTED_ARGS=" " python -m torch.distributed.launch $DISTRIBUTED_ARGS inference.py \ - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --load "${CHECKPOINT}" \ --task chat \ --max-new-tokens 256 \ diff --git a/examples/mcore/deepseek2_lite/pretrain_deepseek2_lite_16b_ptd_16p.sh b/examples/mcore/deepseek2_lite/pretrain_deepseek2_lite_16b_ptd_16p.sh index 6a6ce8a36ca895abf381aba9a55afac27ece77d8..7b003fb0d84b434be5668a0ff0ed8542310b08f0 100644 --- a/examples/mcore/deepseek2_lite/pretrain_deepseek2_lite_16b_ptd_16p.sh +++ b/examples/mcore/deepseek2_lite/pretrain_deepseek2_lite_16b_ptd_16p.sh @@ -27,7 +27,7 @@ DISTRIBUTED_ARGS=" " MLA_ARGS=" - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --multi-head-latent-attention \ --qk-rope-head-dim 64 \ --qk-nope-head-dim 128 \ diff --git a/examples/mcore/deepseek2_lite/pretrain_deepseek2_lite_16b_ptd_8p.sh b/examples/mcore/deepseek2_lite/pretrain_deepseek2_lite_16b_ptd_8p.sh index 766ef830b8c42633ddf75525244a49d3bc6309fc..30c750f4ec14ddc812a8f4b0fcee679f1e9f28e5 100644 --- a/examples/mcore/deepseek2_lite/pretrain_deepseek2_lite_16b_ptd_8p.sh +++ b/examples/mcore/deepseek2_lite/pretrain_deepseek2_lite_16b_ptd_8p.sh @@ -27,7 +27,7 @@ DISTRIBUTED_ARGS=" " MLA_ARGS=" - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --multi-head-latent-attention \ --qk-rope-head-dim 64 \ --qk-nope-head-dim 128 \ diff --git a/examples/mcore/deepseek2_lite/tune_deepseek2_lite_16b_full_ptd.sh b/examples/mcore/deepseek2_lite/tune_deepseek2_lite_16b_full_ptd.sh index bd4a57ab708c6f8128800aa8906ea93f1688eec7..2b2fad1e658b78927c6458f91aae821312df2597 100644 --- a/examples/mcore/deepseek2_lite/tune_deepseek2_lite_16b_full_ptd.sh +++ b/examples/mcore/deepseek2_lite/tune_deepseek2_lite_16b_full_ptd.sh @@ -27,7 +27,7 @@ DISTRIBUTED_ARGS=" " MLA_ARGS=" - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --multi-head-latent-attention \ --qk-rope-head-dim 64 \ --qk-nope-head-dim 128 \ diff --git a/examples/mcore/deepseek2_lite/tune_deepseek2_lite_16b_full_ptd_16p.sh b/examples/mcore/deepseek2_lite/tune_deepseek2_lite_16b_full_ptd_16p.sh index 26c871495499e090f62d195b42018febfcbd8fe5..808a4453895f3a68b6b90799807eb7a83e49c8c9 100644 --- a/examples/mcore/deepseek2_lite/tune_deepseek2_lite_16b_full_ptd_16p.sh +++ b/examples/mcore/deepseek2_lite/tune_deepseek2_lite_16b_full_ptd_16p.sh @@ -27,7 +27,7 @@ DISTRIBUTED_ARGS=" " MLA_ARGS=" - --spec modellink.tasks.models.spec.deepseek_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \ --multi-head-latent-attention \ --qk-rope-head-dim 64 \ --qk-nope-head-dim 128 \ diff --git a/examples/mcore/minicpm3/ckpt_convert_minicpm3_hf2mcore.sh b/examples/mcore/minicpm3/ckpt_convert_minicpm3_hf2mcore.sh index 518617b4594f53ccdc888c8aa0cbee534f572c93..b5b9d7a7ca63ab51a475adc0892360d4c9d7fb47 100644 --- a/examples/mcore/minicpm3/ckpt_convert_minicpm3_hf2mcore.sh +++ b/examples/mcore/minicpm3/ckpt_convert_minicpm3_hf2mcore.sh @@ -13,5 +13,5 @@ python convert_ckpt.py \ --target-pipeline-parallel-size 2 \ --load-dir ./model_from_hf/MiniCPM3-4B-hf/ \ 
--save-dir ./model_weights/MiniCPM3-4B_mcore-tp1pp2 \ - --spec modellink.tasks.models.spec.minicpm_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.minicpm_spec layer_spec \ --tokenizer-model ./model_from_hf/MiniCPM3-4B-hf/tokenizer.model diff --git a/examples/mcore/minicpm3/ckpt_convert_minicpm3_mcore2hf.sh b/examples/mcore/minicpm3/ckpt_convert_minicpm3_mcore2hf.sh index c49d836772384ff0b93a8ffa973c659d0125ddf7..d5d54791164a38f8c4265d61ecd8aa4edeea4715 100644 --- a/examples/mcore/minicpm3/ckpt_convert_minicpm3_mcore2hf.sh +++ b/examples/mcore/minicpm3/ckpt_convert_minicpm3_mcore2hf.sh @@ -13,5 +13,5 @@ python convert_ckpt.py \ --target-pipeline-parallel-size 1 \ --load-dir ./model_weights/MiniCPM3-4B_mcore-tp1pp2 \ --save-dir ./model_from_hf/MiniCPM3-4B-hf/ \ - --spec modellink.tasks.models.spec.minicpm_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.minicpm_spec layer_spec \ --tokenizer-model ./model_from_hf/MiniCPM3-4B-hf/tokenizer.model diff --git a/examples/mcore/minicpm3/evaluate_minicpm3_4b.sh b/examples/mcore/minicpm3/evaluate_minicpm3_4b.sh index 598c7f93357b1d0523cf3b65047f5ad03b6d0bb0..44d54d06486a13c0406886cc21699fbdcc15dcef 100644 --- a/examples/mcore/minicpm3/evaluate_minicpm3_4b.sh +++ b/examples/mcore/minicpm3/evaluate_minicpm3_4b.sh @@ -29,7 +29,7 @@ torchrun $DISTRIBUTED_ARGS evaluation.py \ --use-flash-attn \ --multi-head-latent-attention \ --prompt-type minicpm3 \ - --spec modellink.tasks.models.spec.minicpm_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.minicpm_spec layer_spec \ --qk-rope-head-dim 32 \ --qk-nope-head-dim 64 \ --q-lora-rank 768 \ diff --git a/examples/mcore/minicpm3/generate_minicpm3_4b.sh b/examples/mcore/minicpm3/generate_minicpm3_4b.sh index 009d8511acfdda57f8b4c8150ab7e5a0ad94b5f8..dd7d688d1e26121b72614ec0adfeb5b5455ec001 100644 --- a/examples/mcore/minicpm3/generate_minicpm3_4b.sh +++ b/examples/mcore/minicpm3/generate_minicpm3_4b.sh @@ -25,7 +25,7 @@ torchrun $DISTRIBUTED_ARGS inference.py \ --pipeline-model-parallel-size 2 \ --use-mcore-models \ --multi-head-latent-attention \ - --spec modellink.tasks.models.spec.minicpm_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.minicpm_spec layer_spec \ --qk-rope-head-dim 32 \ --qk-nope-head-dim 64 \ --q-lora-rank 768 \ diff --git a/examples/mcore/minicpm3/pretrain_minicpm3_4b_32k_ptd.sh b/examples/mcore/minicpm3/pretrain_minicpm3_4b_32k_ptd.sh index 12b2af3f638a2db61e0221da4a0e225ae5e08698..8da61ecc730eb12ebc3647c33e59ae1c00f76f67 100644 --- a/examples/mcore/minicpm3/pretrain_minicpm3_4b_32k_ptd.sh +++ b/examples/mcore/minicpm3/pretrain_minicpm3_4b_32k_ptd.sh @@ -107,7 +107,7 @@ GPT_ARGS=" --no-gradient-accumulation-fusion \ --norm-epsilon 1e-5 \ --tokenizer-not-use-fast \ - --spec modellink.tasks.models.spec.minicpm_spec layer_spec \ + --spec mindspeed_llm.tasks.models.spec.minicpm_spec layer_spec \ --no-load-optim \ --no-load-rng \ --bf16 diff --git a/inference.py b/inference.py index d6016cd18bc523a33b4c1e392ad697aaa3d8bb78..c91cdb4ecc4d46c9cb04dab616d6471dceaf405f 100644 --- a/inference.py +++ b/inference.py @@ -15,7 +15,7 @@ # limitations under the License. 
from typing import Union -from modellink import megatron_adaptor +from mindspeed_llm import megatron_adaptor from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec, \ get_gpt_layer_local_spec from megatron.core.transformer.spec_utils import import_module @@ -25,8 +25,8 @@ from megatron.training.initialize import initialize_megatron from megatron.training.arguments import core_transformer_config_from_args from megatron.training.yaml_arguments import core_transformer_config_from_yaml -from modellink.tasks.inference.infer_base import task_factory, add_text_generate_args -from modellink.tasks.inference.module import GPTModelInfer, MegatronModuleForCausalLM +from mindspeed_llm.tasks.inference.infer_base import task_factory, add_text_generate_args +from mindspeed_llm.tasks.inference.module import GPTModelInfer, MegatronModuleForCausalLM def model_provider(pre_process=True, post_process=True) -> Union[GPTModelInfer, GPTModel]: diff --git a/modellink/__init__.py b/mindspeed_llm/__init__.py similarity index 92% rename from modellink/__init__.py rename to mindspeed_llm/__init__.py index 9de5ba61dc947d781e5c3a36b0b7242af76011e2..7873e7315ee37ef2c1c610c7f7da16201743ba14 100644 --- a/modellink/__init__.py +++ b/mindspeed_llm/__init__.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from modellink.tasks import megatron_adaptor +from mindspeed_llm.tasks import megatron_adaptor diff --git a/modellink/core/__init__.py b/mindspeed_llm/core/__init__.py similarity index 100% rename from modellink/core/__init__.py rename to mindspeed_llm/core/__init__.py diff --git a/modellink/core/datasets/__init__.py b/mindspeed_llm/core/datasets/__init__.py similarity index 100% rename from modellink/core/datasets/__init__.py rename to mindspeed_llm/core/datasets/__init__.py diff --git a/modellink/core/datasets/blended_megatron_dataset_builder.py b/mindspeed_llm/core/datasets/blended_megatron_dataset_builder.py similarity index 100% rename from modellink/core/datasets/blended_megatron_dataset_builder.py rename to mindspeed_llm/core/datasets/blended_megatron_dataset_builder.py diff --git a/modellink/core/datasets/gpt_dataset.py b/mindspeed_llm/core/datasets/gpt_dataset.py similarity index 99% rename from modellink/core/datasets/gpt_dataset.py rename to mindspeed_llm/core/datasets/gpt_dataset.py index 21086743f0ea81ac437f29bec6b0bd6084f4c520..7fe141d647a15e278a2fd9ec6db8112b56020a27 100644 --- a/modellink/core/datasets/gpt_dataset.py +++ b/mindspeed_llm/core/datasets/gpt_dataset.py @@ -12,7 +12,7 @@ from megatron.core.datasets.utils import Split, log_single_rank from megatron.core.datasets.gpt_dataset import (_build_document_index, _build_shuffle_index ) -from modellink.tasks.error_utils import GPTDatasetSampleIndexError +from mindspeed_llm.tasks.error_utils import GPTDatasetSampleIndexError from .blended_megatron_dataset_builder import need_to_build_dataset logger = logging.getLogger(__name__) diff --git a/modellink/core/datasets/indexed_dataset.py b/mindspeed_llm/core/datasets/indexed_dataset.py similarity index 100% rename from modellink/core/datasets/indexed_dataset.py rename to mindspeed_llm/core/datasets/indexed_dataset.py diff --git a/modellink/core/distributed/__init__.py b/mindspeed_llm/core/distributed/__init__.py similarity index 100% rename from modellink/core/distributed/__init__.py rename to mindspeed_llm/core/distributed/__init__.py diff --git 
a/modellink/core/distributed/distributed_data_parallel.py b/mindspeed_llm/core/distributed/distributed_data_parallel.py similarity index 100% rename from modellink/core/distributed/distributed_data_parallel.py rename to mindspeed_llm/core/distributed/distributed_data_parallel.py diff --git a/modellink/core/distributed/param_and_grad_buffer.py b/mindspeed_llm/core/distributed/param_and_grad_buffer.py similarity index 100% rename from modellink/core/distributed/param_and_grad_buffer.py rename to mindspeed_llm/core/distributed/param_and_grad_buffer.py diff --git a/modellink/core/models/__init__.py b/mindspeed_llm/core/models/__init__.py similarity index 100% rename from modellink/core/models/__init__.py rename to mindspeed_llm/core/models/__init__.py diff --git a/modellink/core/models/common/__init__.py b/mindspeed_llm/core/models/common/__init__.py similarity index 100% rename from modellink/core/models/common/__init__.py rename to mindspeed_llm/core/models/common/__init__.py diff --git a/modellink/core/models/common/embeddings/__init__.py b/mindspeed_llm/core/models/common/embeddings/__init__.py similarity index 100% rename from modellink/core/models/common/embeddings/__init__.py rename to mindspeed_llm/core/models/common/embeddings/__init__.py diff --git a/modellink/core/models/common/embeddings/rotary_pos_embedding.py b/mindspeed_llm/core/models/common/embeddings/rotary_pos_embedding.py similarity index 100% rename from modellink/core/models/common/embeddings/rotary_pos_embedding.py rename to mindspeed_llm/core/models/common/embeddings/rotary_pos_embedding.py diff --git a/modellink/core/models/gpt/__init__.py b/mindspeed_llm/core/models/gpt/__init__.py similarity index 100% rename from modellink/core/models/gpt/__init__.py rename to mindspeed_llm/core/models/gpt/__init__.py diff --git a/modellink/core/models/gpt/gpt_layer_specs.py b/mindspeed_llm/core/models/gpt/gpt_layer_specs.py similarity index 96% rename from modellink/core/models/gpt/gpt_layer_specs.py rename to mindspeed_llm/core/models/gpt/gpt_layer_specs.py index 39e1e290e418fbb6bc718a7d829d0e1919fe5985..2b044b3222dcd7f6bb517212a97df5a961e80206 100644 --- a/modellink/core/models/gpt/gpt_layer_specs.py +++ b/mindspeed_llm/core/models/gpt/gpt_layer_specs.py @@ -19,7 +19,7 @@ from functools import wraps from megatron.core.transformer.moe.moe_layer import MoELayer from megatron.training import get_args -from modellink.core.transformer.custom_layers.transformer_engine import PTNorm +from mindspeed_llm.core.transformer.custom_layers.transformer_engine import PTNorm def get_gpt_layer_local_spec_wrapper(fn): diff --git a/modellink/core/models/gpt/gpt_model.py b/mindspeed_llm/core/models/gpt/gpt_model.py similarity index 98% rename from modellink/core/models/gpt/gpt_model.py rename to mindspeed_llm/core/models/gpt/gpt_model.py index 0a7f4e49305a927fe5ed169f1f79610c09d27cbb..9a633dd0e800b81289d47c52e93130c15a658944 100644 --- a/modellink/core/models/gpt/gpt_model.py +++ b/mindspeed_llm/core/models/gpt/gpt_model.py @@ -21,7 +21,7 @@ from megatron.core.packed_seq_params import PackedSeqParams from megatron.training import get_args -from modellink.core.tensor_parallel.layers import SegmentedColumnParallelLinear +from mindspeed_llm.core.tensor_parallel.layers import SegmentedColumnParallelLinear def gpt_model_init_wrapper(fn): diff --git a/modellink/core/optimizer/__init__.py b/mindspeed_llm/core/optimizer/__init__.py similarity index 100% rename from modellink/core/optimizer/__init__.py rename to mindspeed_llm/core/optimizer/__init__.py diff 
--git a/modellink/core/optimizer/clip_grads.py b/mindspeed_llm/core/optimizer/clip_grads.py similarity index 100% rename from modellink/core/optimizer/clip_grads.py rename to mindspeed_llm/core/optimizer/clip_grads.py diff --git a/modellink/core/optimizer/distrib_optimizer.py b/mindspeed_llm/core/optimizer/distrib_optimizer.py similarity index 100% rename from modellink/core/optimizer/distrib_optimizer.py rename to mindspeed_llm/core/optimizer/distrib_optimizer.py diff --git a/modellink/core/parallel_state.py b/mindspeed_llm/core/parallel_state.py similarity index 100% rename from modellink/core/parallel_state.py rename to mindspeed_llm/core/parallel_state.py diff --git a/modellink/core/pipeline_parallel/__init__.py b/mindspeed_llm/core/pipeline_parallel/__init__.py similarity index 100% rename from modellink/core/pipeline_parallel/__init__.py rename to mindspeed_llm/core/pipeline_parallel/__init__.py diff --git a/modellink/core/pipeline_parallel/p2p_communication.py b/mindspeed_llm/core/pipeline_parallel/p2p_communication.py similarity index 100% rename from modellink/core/pipeline_parallel/p2p_communication.py rename to mindspeed_llm/core/pipeline_parallel/p2p_communication.py diff --git a/modellink/core/pipeline_parallel/schedules.py b/mindspeed_llm/core/pipeline_parallel/schedules.py similarity index 100% rename from modellink/core/pipeline_parallel/schedules.py rename to mindspeed_llm/core/pipeline_parallel/schedules.py diff --git a/modellink/core/tensor_parallel/__init__.py b/mindspeed_llm/core/tensor_parallel/__init__.py similarity index 100% rename from modellink/core/tensor_parallel/__init__.py rename to mindspeed_llm/core/tensor_parallel/__init__.py diff --git a/modellink/core/tensor_parallel/ascend_turbo/__init__.py b/mindspeed_llm/core/tensor_parallel/ascend_turbo/__init__.py similarity index 100% rename from modellink/core/tensor_parallel/ascend_turbo/__init__.py rename to mindspeed_llm/core/tensor_parallel/ascend_turbo/__init__.py diff --git a/modellink/core/tensor_parallel/ascend_turbo/ascend_turbo_cfg.py b/mindspeed_llm/core/tensor_parallel/ascend_turbo/ascend_turbo_cfg.py similarity index 100% rename from modellink/core/tensor_parallel/ascend_turbo/ascend_turbo_cfg.py rename to mindspeed_llm/core/tensor_parallel/ascend_turbo/ascend_turbo_cfg.py diff --git a/modellink/core/tensor_parallel/ascend_turbo/initialize.py b/mindspeed_llm/core/tensor_parallel/ascend_turbo/initialize.py similarity index 100% rename from modellink/core/tensor_parallel/ascend_turbo/initialize.py rename to mindspeed_llm/core/tensor_parallel/ascend_turbo/initialize.py diff --git a/modellink/core/tensor_parallel/ascend_turbo/mc2_linears_seq_parallel.py b/mindspeed_llm/core/tensor_parallel/ascend_turbo/mc2_linears_seq_parallel.py similarity index 100% rename from modellink/core/tensor_parallel/ascend_turbo/mc2_linears_seq_parallel.py rename to mindspeed_llm/core/tensor_parallel/ascend_turbo/mc2_linears_seq_parallel.py diff --git a/modellink/core/tensor_parallel/layers.py b/mindspeed_llm/core/tensor_parallel/layers.py similarity index 100% rename from modellink/core/tensor_parallel/layers.py rename to mindspeed_llm/core/tensor_parallel/layers.py diff --git a/modellink/core/transformer/__init__.py b/mindspeed_llm/core/transformer/__init__.py similarity index 100% rename from modellink/core/transformer/__init__.py rename to mindspeed_llm/core/transformer/__init__.py diff --git a/modellink/core/transformer/attention.py b/mindspeed_llm/core/transformer/attention.py similarity index 100% rename from 
modellink/core/transformer/attention.py rename to mindspeed_llm/core/transformer/attention.py diff --git a/modellink/core/transformer/custom_layers/__init__.py b/mindspeed_llm/core/transformer/custom_layers/__init__.py similarity index 100% rename from modellink/core/transformer/custom_layers/__init__.py rename to mindspeed_llm/core/transformer/custom_layers/__init__.py diff --git a/modellink/core/transformer/custom_layers/transformer_engine.py b/mindspeed_llm/core/transformer/custom_layers/transformer_engine.py similarity index 100% rename from modellink/core/transformer/custom_layers/transformer_engine.py rename to mindspeed_llm/core/transformer/custom_layers/transformer_engine.py diff --git a/modellink/core/transformer/dot_product_attention.py b/mindspeed_llm/core/transformer/dot_product_attention.py similarity index 99% rename from modellink/core/transformer/dot_product_attention.py rename to mindspeed_llm/core/transformer/dot_product_attention.py index b8b2dfce612eeea61e3e3696ff6bb76435bdaabf..1ff153d2045a0c99443c3e4354c79d52029d73cd 100644 --- a/modellink/core/transformer/dot_product_attention.py +++ b/mindspeed_llm/core/transformer/dot_product_attention.py @@ -24,8 +24,8 @@ from mindspeed.core.context_parallel.utils import get_scheduling_info from mindspeed.model.transformer import get_attention_mask from mindspeed.utils import get_actual_seq_len -from modellink.core.models.common.embeddings.rotary_pos_embedding import yarn_get_mscale -from modellink.tasks.models.common.alibi import Alibi +from mindspeed_llm.core.models.common.embeddings.rotary_pos_embedding import yarn_get_mscale +from mindspeed_llm.tasks.models.common.alibi import Alibi try: from einops import rearrange diff --git a/modellink/core/transformer/mlp.py b/mindspeed_llm/core/transformer/mlp.py similarity index 100% rename from modellink/core/transformer/mlp.py rename to mindspeed_llm/core/transformer/mlp.py diff --git a/modellink/core/transformer/moe/__init__.py b/mindspeed_llm/core/transformer/moe/__init__.py similarity index 100% rename from modellink/core/transformer/moe/__init__.py rename to mindspeed_llm/core/transformer/moe/__init__.py diff --git a/modellink/core/transformer/moe/moe_layer.py b/mindspeed_llm/core/transformer/moe/moe_layer.py similarity index 100% rename from modellink/core/transformer/moe/moe_layer.py rename to mindspeed_llm/core/transformer/moe/moe_layer.py diff --git a/modellink/core/transformer/moe/moe_utils.py b/mindspeed_llm/core/transformer/moe/moe_utils.py similarity index 100% rename from modellink/core/transformer/moe/moe_utils.py rename to mindspeed_llm/core/transformer/moe/moe_utils.py diff --git a/modellink/core/transformer/moe/router.py b/mindspeed_llm/core/transformer/moe/router.py similarity index 99% rename from modellink/core/transformer/moe/router.py rename to mindspeed_llm/core/transformer/moe/router.py index 4c3e9966346b5ec47c2ef309ffb73e15492a2ccb..1845e161210eb0e8f2bfbe4e59fad9575e01dd24 100644 --- a/modellink/core/transformer/moe/router.py +++ b/mindspeed_llm/core/transformer/moe/router.py @@ -22,7 +22,7 @@ from megatron.training import get_args from megatron.core.transformer.moe.moe_utils import MoEAuxLossAutoScaler, save_to_aux_losses_tracker from megatron.core import parallel_state from .moe_utils import topk_softmax_with_capacity, switch_load_balancing_loss_func -from modellink.tasks.models.common.pai_megatron import pai_megatron_aux_loss +from mindspeed_llm.tasks.models.common.pai_megatron import pai_megatron_aux_loss def group_limited_greedy_topKgating(self, logits: 
torch.Tensor): args = get_args() diff --git a/modellink/core/transformer/multi_head_latent_attention.py b/mindspeed_llm/core/transformer/multi_head_latent_attention.py similarity index 100% rename from modellink/core/transformer/multi_head_latent_attention.py rename to mindspeed_llm/core/transformer/multi_head_latent_attention.py diff --git a/modellink/core/transformer/transformer_block.py b/mindspeed_llm/core/transformer/transformer_block.py similarity index 100% rename from modellink/core/transformer/transformer_block.py rename to mindspeed_llm/core/transformer/transformer_block.py diff --git a/modellink/core/transformer/transformer_layer.py b/mindspeed_llm/core/transformer/transformer_layer.py similarity index 100% rename from modellink/core/transformer/transformer_layer.py rename to mindspeed_llm/core/transformer/transformer_layer.py diff --git a/modellink/inference/__init__.py b/mindspeed_llm/inference/__init__.py similarity index 100% rename from modellink/inference/__init__.py rename to mindspeed_llm/inference/__init__.py diff --git a/modellink/inference/text_generation/__init__.py b/mindspeed_llm/inference/text_generation/__init__.py similarity index 100% rename from modellink/inference/text_generation/__init__.py rename to mindspeed_llm/inference/text_generation/__init__.py diff --git a/modellink/inference/text_generation/forward_step.py b/mindspeed_llm/inference/text_generation/forward_step.py similarity index 100% rename from modellink/inference/text_generation/forward_step.py rename to mindspeed_llm/inference/text_generation/forward_step.py diff --git a/modellink/inference/text_generation/generation.py b/mindspeed_llm/inference/text_generation/generation.py similarity index 100% rename from modellink/inference/text_generation/generation.py rename to mindspeed_llm/inference/text_generation/generation.py diff --git a/modellink/inference/text_generation/tokenization.py b/mindspeed_llm/inference/text_generation/tokenization.py similarity index 98% rename from modellink/inference/text_generation/tokenization.py rename to mindspeed_llm/inference/text_generation/tokenization.py index e8d7324a35cc9833bdcf4803ddc7e9f487bd20e8..13f5918885872357c671e26feeee4eba8b2e6f3e 100644 --- a/modellink/inference/text_generation/tokenization.py +++ b/mindspeed_llm/inference/text_generation/tokenization.py @@ -18,7 +18,7 @@ import torch from megatron.training import get_args from megatron.inference.text_generation.communication import broadcast_int_list, broadcast_tensor -from modellink.tasks.preprocess.templates import Template, get_model_template +from mindspeed_llm.tasks.preprocess.templates import Template, get_model_template def _encode_no_template(tokenizer, prompts): diff --git a/modellink/legacy/__init__.py b/mindspeed_llm/legacy/__init__.py similarity index 100% rename from modellink/legacy/__init__.py rename to mindspeed_llm/legacy/__init__.py diff --git a/modellink/legacy/data/__init__.py b/mindspeed_llm/legacy/data/__init__.py similarity index 100% rename from modellink/legacy/data/__init__.py rename to mindspeed_llm/legacy/data/__init__.py diff --git a/modellink/legacy/data/data_samplers.py b/mindspeed_llm/legacy/data/data_samplers.py similarity index 97% rename from modellink/legacy/data/data_samplers.py rename to mindspeed_llm/legacy/data/data_samplers.py index 526ff94c993388a338d48930ac78dee25ed5731b..9265f7b82a294b1ae24065757e74fcb65480fe25 100644 --- a/modellink/legacy/data/data_samplers.py +++ b/mindspeed_llm/legacy/data/data_samplers.py @@ -23,7 +23,7 @@ from transformers import 
DataCollatorForSeq2Seq from megatron.training import get_args, get_tokenizer from megatron.core import parallel_state from megatron.legacy.data.data_samplers import MegatronPretrainingSampler, MegatronPretrainingRandomSampler -from modellink.tasks.data.collator import PairwiseDataCollatorWithPadding +from mindspeed_llm.tasks.data.collator import PairwiseDataCollatorWithPadding def build_pretraining_data_loader(dataset, consumed_samples): diff --git a/modellink/legacy/model/__init__.py b/mindspeed_llm/legacy/model/__init__.py similarity index 100% rename from modellink/legacy/model/__init__.py rename to mindspeed_llm/legacy/model/__init__.py diff --git a/modellink/legacy/model/gpt_model.py b/mindspeed_llm/legacy/model/gpt_model.py similarity index 98% rename from modellink/legacy/model/gpt_model.py rename to mindspeed_llm/legacy/model/gpt_model.py index 9fdbfc37de3a7ae333048266647fc7da6b67cedd..2921a56e11698f45ea76a8e59211c233623d6f61 100644 --- a/modellink/legacy/model/gpt_model.py +++ b/mindspeed_llm/legacy/model/gpt_model.py @@ -18,7 +18,7 @@ from megatron.legacy.model.module import MegatronModule from megatron.legacy.model.enums import AttnMaskType from megatron.legacy.model.language_model import get_language_model, parallel_lm_logits from megatron.core import tensor_parallel -from modellink.tasks.inference import MegatronModuleForCausalLM +from mindspeed_llm.tasks.inference import MegatronModuleForCausalLM def post_language_model_processing(lm_output, labels, logit_weights, diff --git a/modellink/legacy/model/language_model.py b/mindspeed_llm/legacy/model/language_model.py similarity index 100% rename from modellink/legacy/model/language_model.py rename to mindspeed_llm/legacy/model/language_model.py diff --git a/modellink/legacy/model/rms_norm.py b/mindspeed_llm/legacy/model/rms_norm.py similarity index 100% rename from modellink/legacy/model/rms_norm.py rename to mindspeed_llm/legacy/model/rms_norm.py diff --git a/modellink/legacy/model/transformer.py b/mindspeed_llm/legacy/model/transformer.py similarity index 99% rename from modellink/legacy/model/transformer.py rename to mindspeed_llm/legacy/model/transformer.py index 65c0bc69cc63048edb964b351911fd57fd3ba1d2..02e712c1a7904ba9a9bf83f44b34c164b6e194c8 100644 --- a/modellink/legacy/model/transformer.py +++ b/mindspeed_llm/legacy/model/transformer.py @@ -42,11 +42,11 @@ from megatron.legacy.model.fused_bias_gelu import bias_gelu_impl from mindspeed.core.tensor_parallel.random import CheckpointWithoutOutput from mindspeed.model.transformer import get_attention_mask -from modellink.core.transformer.mlp import should_recompute_activation -from modellink.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb -from modellink.tasks.error_utils import ensure_valid -from modellink.tasks.models.common.alibi import Alibi -from modellink.tasks.finetune.lora.utils import is_enable_lora +from mindspeed_llm.core.transformer.mlp import should_recompute_activation +from mindspeed_llm.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from mindspeed_llm.tasks.error_utils import ensure_valid +from mindspeed_llm.tasks.models.common.alibi import Alibi +from mindspeed_llm.tasks.finetune.lora.utils import is_enable_lora def state_dict_for_save_checkpoint(state_dict): diff --git a/modellink/tasks/__init__.py b/mindspeed_llm/tasks/__init__.py similarity index 100% rename from modellink/tasks/__init__.py rename to mindspeed_llm/tasks/__init__.py diff --git a/modellink/tasks/checkpoint/__init__.py 
b/mindspeed_llm/tasks/checkpoint/__init__.py similarity index 100% rename from modellink/tasks/checkpoint/__init__.py rename to mindspeed_llm/tasks/checkpoint/__init__.py diff --git a/modellink/tasks/checkpoint/loader_hf.py b/mindspeed_llm/tasks/checkpoint/loader_hf.py similarity index 100% rename from modellink/tasks/checkpoint/loader_hf.py rename to mindspeed_llm/tasks/checkpoint/loader_hf.py diff --git a/modellink/tasks/checkpoint/loader_mg.py b/mindspeed_llm/tasks/checkpoint/loader_mg.py similarity index 100% rename from modellink/tasks/checkpoint/loader_mg.py rename to mindspeed_llm/tasks/checkpoint/loader_mg.py diff --git a/modellink/tasks/checkpoint/models.py b/mindspeed_llm/tasks/checkpoint/models.py similarity index 97% rename from modellink/tasks/checkpoint/models.py rename to mindspeed_llm/tasks/checkpoint/models.py index 8970d71afd56f0afe33af73dd5c0ff0238564d9f..9c954929ed312e9032477352a885ddb441ca9258 100644 --- a/modellink/tasks/checkpoint/models.py +++ b/mindspeed_llm/tasks/checkpoint/models.py @@ -1,1198 +1,1198 @@ -import abc -import os -import sys -import re -import json -from types import SimpleNamespace -import logging as logger -from pathlib import Path -from collections import OrderedDict -from tqdm import tqdm -import torch -from transformers import AutoModelForCausalLM -from megatron.core import mpu -from megatron.training.arguments import validate_args -from megatron.legacy.model import module -from megatron.core.enums import ModelType -from megatron.training.checkpointing import load_args_from_checkpoint -from megatron.training.global_vars import set_args -from megatron.training.checkpointing import load_checkpoint -from megatron.core import tensor_parallel -from modellink.training.utils import parse_args -from modellink.training import model_provider_func_wrapper -from modellink.training.checkpointing import load_checkpoint_wrapper - -logger.basicConfig(format="") -logger.getLogger().setLevel(logger.INFO) - -load_checkpoint = load_checkpoint_wrapper(load_checkpoint) - - -def tensor_info(tensor): - shape = tensor.shape - mean_val = tensor.mean().item() - min_val = tensor.min().item() - max_val = tensor.max().item() - return f"shape: {shape} mean_val: {mean_val} min_val: {min_val} max_val: {max_val}" - - -class ModelBase(abc.ABC): - def __init__(self, args_cmd=None): - self.args_cmd = args_cmd - self.args = None - self.args_megatron_checkpoint = None - self.module = None - self.module_mapping = None - self.model_cfg = self.read_model_cfg(args_cmd) - self.__register_functions() - self.kwargs_idx = OrderedDict({ - "vp_rank": 0, - "ep_rank": 0, - "tp_rank": 0, - "layer_idx": 0, - "expert_idx": 0 - }) - - def update_kwargs_idx(self, **kwargs): - for key in self.kwargs_idx: - if key in kwargs: - self.kwargs_idx[key] = kwargs[key] - else: - self.kwargs_idx[key] = 0 - - def __register_functions(self): - self.get_module_mapping() - - def _get_obj(self, value, **kwargs): - pattern = r'(\w+)(?:\[(\w+)\])?' 
- matches = re.findall(pattern, value) - self.update_kwargs_idx(**kwargs) - obj = self.get_model_item(**kwargs) - for attr, attr_ident in matches: - if hasattr(obj, attr): - obj = getattr(obj, attr) - else: - return None - if attr_ident: - if attr_ident in self.kwargs_idx: - attr_idx = self.kwargs_idx[attr_ident] - obj = obj[attr_idx] - else: - raise AssertionError(f"check {self.__class__.__name__}.module_mapping **{attr_ident}**.") - return obj - - def _get_dst_obj(self, value, **kwargs): - if kwargs.get("layer_idx") is None: - kwargs["layer_idx"] = kwargs.get("dst_layer_idx") - - return _get_obj(self, value, **kwargs) - - def _get_src_obj(self, value, **kwargs): - if kwargs.get("layer_idx") is None: - kwargs["layer_idx"] = kwargs.get("src_layer_idx") - - return _get_obj(self, value, **kwargs) - - def _func_generator_get_module(value): - def func(self, **kwargs): - return _get_src_obj(self, value, **kwargs) - return func - - def _func_generator_get_weight(value): - def func(self, **kwargs): - return _get_src_obj(self, value, **kwargs).weight.data - return func - - def _func_generator_get_bias(value): - def func(self, **kwargs): - return _get_src_obj(self, value, **kwargs).bias.data - return func - - def _func_generator_set_weight(value): - def func(self, **kwargs): - return _get_dst_obj(self, value, **kwargs).weight.data.copy_(kwargs.get('data')) - return func - - def _func_generator_set_module(value): - def func(self, **kwargs): - return _get_dst_obj(self, value, **kwargs).data.copy_(kwargs.get('data')) - return func - - def _func_generator_set_bias(value): - def func(self, **kwargs): - return _get_dst_obj(self, value, **kwargs).bias.data.copy_(kwargs.get('data')) - return func - - def _func_generator_has_module(value): - def func(self, **kwargs): - obj = _get_src_obj(self, value, **kwargs) - return True if obj else False - return func - - def _func_generator_has_bias(value): - def func(self, **kwargs): - bias = getattr(_get_src_obj(self, value, **kwargs), 'bias', None) - return bias is not None - return func - - if self.module_mapping: - for key, value in self.module_mapping.items(): - setattr(self, "get_" + key + "_module", _func_generator_get_module(value).__get__(self, ModelBase)) - setattr(self, "set_" + key + "_module", _func_generator_set_module(value).__get__(self, ModelBase)) - setattr(self, "get_" + key + "_weight", _func_generator_get_weight(value).__get__(self, ModelBase)) - setattr(self, "get_" + key + "_bias", _func_generator_get_bias(value).__get__(self, ModelBase)) - setattr(self, "set_" + key + "_weight", _func_generator_set_weight(value).__get__(self, ModelBase)) - setattr(self, "set_" + key + "_bias", _func_generator_set_bias(value).__get__(self, ModelBase)) - setattr(self, "has_" + key + "_module", _func_generator_has_module(value).__get__(self, ModelBase)) - setattr(self, "has_" + key + "_bias", _func_generator_has_bias(value).__get__(self, ModelBase)) - - def update_module(self, src_model): - self.set_preprocess_state(src_model) - self.set_postprocess_state(src_model) - if not (hasattr(self.args, "noop_layers") and self.args.noop_layers): - for layer_idx in tqdm(range(self.args.num_layers), "set layer states"): - self.set_layer_state(src_model, layer_idx) - return - - # Do ckpt conversion when noop layer is configured. 
- # For example, hf_layer = [0, 1], add noop layer [1, 3], then mg_layers = [0(0), 1(noop), 2(1), 3(noop)] - hf_num_layers = self.args.num_layers - len(self.args.noop_layers) - mg_layer_list = [i for i in range(hf_num_layers)] - for i in self.args.noop_layers: - # insert noop layer - mg_layer_list.insert(i, -1) - for dst_layer_idx, src_layer_idx in enumerate(mg_layer_list): - if not self.is_noop_layer(src_layer_idx): - self.set_layer_state_base(src_model, src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx) - - def set_preprocess_state(self, src_model): - """Set embedding params.""" - embeddings_weight = src_model.get_embedding_word_embeddings_weight() - if embeddings_weight.size(0) > self.get_embedding_word_embeddings_weight().size(0): - logger.info(f"Source embedding size: {embeddings_weight.size()} " - f"Target embedding size: {self.get_embedding_word_embeddings_weight().size()}") - embeddings_weight = embeddings_weight[:self.get_embedding_word_embeddings_weight().size(0), :] - self.set_embedding_word_embeddings_weight(data=embeddings_weight) - if src_model.has_embedding_word_embeddings_norm_module(): - embd_norm_weight = src_model.get_embedding_word_embeddings_norm_weight() - embd_norm_bias = src_model.get_embedding_word_embeddings_norm_bias() - self.set_embedding_word_embeddings_norm_weight(data=embd_norm_weight) - self.set_embedding_word_embeddings_norm_bias(data=embd_norm_bias) - - def set_postprocess_state(self, src_model): - final_layernorm_weight = src_model.get_final_layernorm_weight() - self.set_final_layernorm_weight(data=final_layernorm_weight) - if self.args.untie_embeddings_and_output_weights: - output_layer_weight = src_model.get_output_layer_weight() - if output_layer_weight.size(0) > self.get_output_layer_weight().size(0): - logger.info(f"Source output layer weight size: {output_layer_weight.size()} " - f"Target output layer weight size: {self.get_output_layer_weight().size()}") - output_layer_weight = output_layer_weight[:self.get_output_layer_weight().size(0), :] - self.set_output_layer_weight(data=output_layer_weight) - if self.has_final_layernorm_bias(): - final_layernorm_bias = src_model.get_final_layernorm_bias() - self.set_final_layernorm_bias(data=final_layernorm_bias) - - def set_layer_state(self, src_model, layer_idx): - """ - For source layer index == destination layer index. - """ - self.set_layer_state_base(src_model, layer_idx, layer_idx) - - @staticmethod - def is_noop_layer(src_layer_idx): - return src_layer_idx == -1 - - def set_layer_state_base(self, src_model, src_layer_idx, dst_layer_idx): - """ - We update megatron module by transferring layers in src_layer_idx into those in dst_layer_idx . 
- """ - kwargs = {'src_layer_idx': src_layer_idx, "dst_layer_idx": dst_layer_idx} - self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) - self.set_mlp_state(src_model, **kwargs) - input_layernorm_weight = src_model.get_layers_input_layernorm_weight(layer_idx=src_layer_idx) - self.set_layers_input_layernorm_weight(layer_idx=dst_layer_idx, data=input_layernorm_weight) - - if self.args.post_norm: - post_attn_layernorm_weight = src_model.get_layers_self_attention_post_attention_layernorm_weight( - layer_idx=src_layer_idx) - self.set_layers_self_attention_post_attention_layernorm_weight(layer_idx=dst_layer_idx, - data=post_attn_layernorm_weight) - else: - pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight( - layer_idx=src_layer_idx) - self.set_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=dst_layer_idx, - data=pre_mlp_layernorm_weight) - - if self.has_layers_input_layernorm_bias(layer_idx=src_layer_idx): - input_layernorm_bias = src_model.get_layers_input_layernorm_bias(layer_idx=src_layer_idx) - self.set_layers_input_layernorm_bias(layer_idx=dst_layer_idx, data=input_layernorm_bias) - - if self.has_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=src_layer_idx): - pre_mlp_layernorm_bias = src_model.get_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=src_layer_idx) - self.set_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=dst_layer_idx, data=pre_mlp_layernorm_bias) - - def set_attn_state(self, src_layer_idx, dst_layer_idx, src_model): - """Set self-attention params.""" - if getattr(src_model.get_args(), "qk_layernorm", False): - if getattr(src_model.get_args(), "q_lora_rank", None): - q_layernorm = src_model.get_layers_self_attention_q_layernorm_weight(layer_idx=src_layer_idx) - self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) - k_layernorm = src_model.get_layers_self_attention_k_layernorm_weight(layer_idx=src_layer_idx) - self.set_layers_self_attention_k_layernorm_weight(layer_idx=dst_layer_idx, data=k_layernorm) - - if getattr(src_model.get_args(), "multi_head_latent_attention", False): - if getattr(src_model.get_args(), "q_lora_rank", None): - linear_qb = src_model.get_layers_self_attention_linear_qb_weight(layer_idx=src_layer_idx) - self.set_layers_self_attention_linear_qb_weight(layer_idx=dst_layer_idx, data=linear_qb) - linear_kvb = src_model.get_layers_self_attention_linear_kvb_weight(layer_idx=src_layer_idx) - self.set_layers_self_attention_linear_kvb_weight(layer_idx=dst_layer_idx, data=linear_kvb) - - qkv_weight = src_model.get_layers_self_attention_linear_qkv_weight(layer_idx=src_layer_idx) - proj_weight = src_model.get_layers_self_attention_linear_proj_weight(layer_idx=src_layer_idx) - self.set_layers_self_attention_linear_qkv_weight(layer_idx=dst_layer_idx, data=qkv_weight) - self.set_layers_self_attention_linear_proj_weight(layer_idx=dst_layer_idx, data=proj_weight) - if self.args.add_qkv_bias: - qkv_bias = src_model.get_layers_self_attention_linear_qkv_bias(layer_idx=src_layer_idx) - self.set_layers_self_attention_linear_qkv_bias(layer_idx=dst_layer_idx, data=qkv_bias) - if self.args.add_dense_bias: - proj_bias = src_model.get_layers_self_attention_linear_proj_bias(layer_idx=src_layer_idx) - self.set_layers_self_attention_linear_proj_bias(layer_idx=dst_layer_idx, data=proj_bias) - - def _set_mlp_state(self, src_model, **kwargs): - """Set MLP params.""" - fc1_weight = src_model.get_layers_mlp_linear_fc1_weight(**kwargs) - 
fc2_weight = src_model.get_layers_mlp_linear_fc2_weight(**kwargs) - self.set_layers_mlp_linear_fc1_weight(data=fc1_weight, **kwargs) - self.set_layers_mlp_linear_fc2_weight(data=fc2_weight, **kwargs) - if src_model.has_layers_mlp_linear_fc1_bias(**kwargs): - fc1_bias = src_model.get_layers_mlp_linear_fc1_bias(**kwargs) - self.set_layers_mlp_linear_fc1_bias(data=fc1_bias, **kwargs) - if src_model.has_layers_mlp_linear_fc2_bias(**kwargs): - fc2_bias = src_model.get_layers_mlp_linear_fc2_bias(**kwargs) - self.set_layers_mlp_linear_fc2_bias(data=fc2_bias, **kwargs) - if self.args.post_norm: - pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight(**kwargs) - post_mlp_layernorm_weight = src_model.get_layers_self_attention_post_mlp_layernorm_weight(**kwargs) - self.set_layers_self_attention_pre_mlp_layernorm_weight(data=pre_mlp_layernorm_weight, **kwargs) - self.set_layers_self_attention_post_mlp_layernorm_weight(data=post_mlp_layernorm_weight, **kwargs) - - def _set_mlp_experts_state(self, src_model, **kwargs): - """Set MLP experts params.""" - fc1_weight = src_model.get_layers_mlp_experts_linear_fc1_weight(**kwargs) - fc2_weight = src_model.get_layers_mlp_experts_linear_fc2_weight(**kwargs) - self.set_layers_mlp_experts_linear_fc1_weight(data=fc1_weight, **kwargs) - self.set_layers_mlp_experts_linear_fc2_weight(data=fc2_weight, **kwargs) - - def _set_mlp_shared_experts_state(self, src_model, **kwargs): - """Set MLP shared experts params.""" - fc1_weight = src_model.get_layers_mlp_shared_experts_linear_fc1_weight(**kwargs) - fc2_weight = src_model.get_layers_mlp_shared_experts_linear_fc2_weight(**kwargs) - self.set_layers_mlp_shared_experts_linear_fc1_weight(data=fc1_weight, **kwargs) - self.set_layers_mlp_shared_experts_linear_fc2_weight(data=fc2_weight, **kwargs) - - def _set_moe_grouped_gemm_state(self, src_model, **kwargs): - """Set MOE grouped gemm params.""" - weight1 = src_model.get_layers_mlp_experts_weight1_module(**kwargs) - weight2 = src_model.get_layers_mlp_experts_weight2_module(**kwargs) - self.set_layers_mlp_experts_weight1_module(data=weight1, **kwargs) - self.set_layers_mlp_experts_weight2_module(data=weight2, **kwargs) - - def set_mlp_state(self, src_model, **kwargs): - args = src_model.get_args() - num_experts = getattr(args, 'num_experts', None) or getattr(args, 'num_local_experts', None) - first_k_dense_replace = self.get_first_k_dense_replace() - moe_layer_freq = self.get_moe_layer_freq() - shared_expert_gate = getattr(args, 'shared_expert_gate', False) - dst_layer_idx = kwargs["dst_layer_idx"] - if dst_layer_idx >= first_k_dense_replace and dst_layer_idx % moe_layer_freq == 0: - router_weight = src_model.get_layers_mlp_router_weight(**kwargs) - self.set_layers_mlp_router_weight(**kwargs, data=router_weight) - if shared_expert_gate: - shared_expert_gate_weight = src_model.get_layers_mlp_shared_expert_gate_weight(**kwargs) - self.set_layers_mlp_shared_expert_gate_weight(**kwargs, data=shared_expert_gate_weight) - if getattr(self.args, "n_shared_experts", None) is not None: - self._set_mlp_shared_experts_state(src_model, **kwargs) - if args.moe_grouped_gemm: - self._set_moe_grouped_gemm_state(src_model, **kwargs) - else: - for expert_idx in range(num_experts): - kwargs['expert_idx'] = expert_idx - self._set_mlp_experts_state(src_model, **kwargs) - else: - self._set_mlp_state(src_model, **kwargs) - - def get_args(self): - return self.args - - def get_args_cmd(self): - return self.args_cmd - - def get_metadata(self): - return self.md - - def 
get_modules_count(self): - return len(self.module) - - def get_first_k_dense_replace(self): - if getattr(self.args, "first_k_dense_replace", None) is None: - num_experts = (getattr(self.args, 'num_experts', None) or - getattr(self.args, 'num_local_experts', None)) - if num_experts is None: - return self.args.num_layers - else: - return 0 - else: - return self.args.first_k_dense_replace - - def get_moe_layer_freq(self): - if getattr(self.args, "moe_layer_freq", None) is None: - return 1 - else: - return self.args.moe_layer_freq - - @staticmethod - def read_model_cfg(args_cmd): - def merge_configs(base_config, specific_config): - merged_config = base_config.copy() - for key, value in specific_config.items(): - if isinstance(value, dict) and key in merged_config: - merged_config[key] = merge_configs(merged_config[key], value) - else: - merged_config[key] = value - return merged_config - - if args_cmd.ckpt_cfg_path == "configs/checkpoint/model_cfg.json": - current_directory = os.path.dirname(os.path.abspath(__file__)) - cfg_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(current_directory))), "configs/checkpoint/model_cfg.json") - else: - cfg_dir = args_cmd.ckpt_cfg_path - with open(cfg_dir, 'r') as file: - config = json.load(file) - final_configs = {} - - for model_name, model_config in config["model_mappings"].items(): - if "__base__" in model_config: - base_model_name = model_config["__base__"] - base_config = config["model_mappings"][base_model_name] - specific_config = model_config.copy() - specific_config.pop("__base__", None) - final_config = merge_configs(base_config, specific_config) - else: - final_config = model_config - final_configs[model_name] = final_config - - return final_configs - - @abc.abstractmethod - def get_module_mapping(self): - pass - - @abc.abstractmethod - def get_model_item(self, **kwargs): - pass - - -class HuggingfaceModel(ModelBase): - def __init__(self, args_cmd): - super(HuggingfaceModel, self).__init__(args_cmd) - self.initialize_args() - self.layers_self_attention_linear_qkv_caches = {"layer_idx": -1, "weight": None, "bias": None} - - def initialize_args(self): - # Read huggingface args. 
- if self.args_cmd.save_model_type == 'hf': - cfg_dir = self.args_cmd.save_dir - else: - cfg_dir = self.args_cmd.load_dir - llama_args_path = os.path.join(cfg_dir, "config.json") - with open(llama_args_path) as f: - self.args = json.load(f) - - config_key_mapping = self.model_cfg.get(self.args_cmd.model_type_hf).get('config_hf_key_mapping') - config_value = self.model_cfg.get(self.args_cmd.model_type_hf).get('config_set_value') - for key_target in config_key_mapping: - key_hf = config_key_mapping[key_target] - if self.args.get(key_hf, None) is not None: - self.args[key_target] = self.args[key_hf] - else: - logger.warning(f"{key_target} was not found in the config file.") - for key_target in config_value: - self.args[key_target] = config_value[key_target] - - if ( - "num_key_value_heads" in self.args and - self.args["num_attention_heads"] != self.args["num_key_value_heads"] - ): - if self.args["num_attention_heads"] == 1: - raise AssertionError("Number of attention heads should be greater than 1!") - self.args['group_query_attention'] = True - - self.args['untie_embeddings_and_output_weights'] = not self.args.get("tie_word_embeddings", False) - self.args = SimpleNamespace(**self.args) - self.args.add_qkv_bias = self.args_cmd.add_qkv_bias - self.args.add_dense_bias = self.args_cmd.add_dense_bias - self.args.post_norm = self.args_cmd.post_norm - - def get_modules_from_pretrained(self, device_map="cpu", trust_remote_code=True): - # Load Huggingface model. - if self.args_cmd.save_model_type == "hf": - load_dir = self.args_cmd.save_dir - else: - load_dir = self.args_cmd.load_dir - self.module = [AutoModelForCausalLM.from_pretrained(load_dir, device_map=device_map, trust_remote_code=trust_remote_code, local_files_only=True)] - if hasattr(self.args, "torch_dtype") and self.args.torch_dtype in ["float16", "bfloat16"]: - self.module[0] = self.module[0].to(eval(f'torch.{self.args.torch_dtype}')) - - def get_module_mapping(self): - self.module_mapping = self.model_cfg.get(self.args_cmd.model_type_hf).get('model_hf_key_mapping') - - def __get_layers_self_attention_linear_qkv_module(self, layer_idx=0): - if self.layers_self_attention_linear_qkv_caches["layer_idx"] == layer_idx: - return - self.layers_self_attention_linear_qkv_caches["layer_idx"] = layer_idx - # Reshape loaded weights. 
- nh = self.args.num_attention_heads - ng = (self.args.num_key_value_heads if self.args.group_query_attention else self.args.num_attention_heads) - dim = self.args.kv_channels if hasattr(self.args, "kv_channels") else self.args.hidden_size // self.args.num_attention_heads - if not nh % ng == 0: - raise ValueError("nh % ng should equal 0") - - def qkv_concatenate_weight(qkv): - return torch.cat([ - qkv[0].reshape((ng, dim * nh // ng, -1)), - qkv[1].reshape((ng, dim, -1)), - qkv[2].reshape((ng, dim, -1)), - ], dim=1).reshape((-1, self.args.hidden_size)) - - def qkv_concatenate_bias(qkv): - return torch.cat([ - qkv[0].reshape((ng, dim * nh // ng)), - qkv[1].reshape((ng, dim)), - qkv[2].reshape((ng, dim)), - ], dim=1).reshape((-1)) - - qkv_type = self.args.qkv_type - if qkv_type == "unpack": - q_proj = self.get_layers_self_attention_linear_q_proj_module(layer_idx=layer_idx) - k_proj = self.get_layers_self_attention_linear_k_proj_module(layer_idx=layer_idx) - v_proj = self.get_layers_self_attention_linear_v_proj_module(layer_idx=layer_idx) - query_key_value_weight = [q_proj.weight, k_proj.weight, v_proj.weight] - query_key_value_bias = [q_proj.bias, k_proj.bias, v_proj.bias] - self.layers_self_attention_linear_qkv_caches["weight"] = (qkv_concatenate_weight(query_key_value_weight)) - if self.args_cmd.add_qkv_bias: - self.layers_self_attention_linear_qkv_caches["bias"] = (qkv_concatenate_bias(query_key_value_bias)) - elif qkv_type == "pack_mla": - q_proj = self.get_layers_self_attention_linear_q_proj_module(layer_idx=layer_idx) - kv_proj = self.get_layers_self_attention_linear_kv_proj_module(layer_idx=layer_idx) - query_key_value_weight = [q_proj.weight.reshape((-1, self.args.hidden_size)), - kv_proj.weight.reshape((-1, self.args.hidden_size))] - self.layers_self_attention_linear_qkv_caches["weight"] = (torch.cat(query_key_value_weight, dim=0)) - if self.args_cmd.add_qkv_bias: - query_key_value_bias = [q_proj.bias, kv_proj.bias] - self.layers_self_attention_linear_qkv_caches["bias"] = (qkv_concatenate_bias(query_key_value_bias)) - elif qkv_type == "pack_gqa": - qkv_pack = self.get_layers_self_attention_linear_qkv_pack_module(layer_idx=layer_idx) - qkv_pack_weight = qkv_pack.weight - full_q = dim * nh - end_k = full_q + ng * dim - q_weight = qkv_pack_weight[:full_q, :] - k_weight = qkv_pack_weight[full_q:end_k, :] - v_weight = qkv_pack_weight[end_k:, :] - query_key_value_weight = [q_weight, k_weight, v_weight] - self.layers_self_attention_linear_qkv_caches["weight"] = (qkv_concatenate_weight(query_key_value_weight)) - if self.args_cmd.add_qkv_bias: - qkv_pack_bias = qkv_pack.bias - q_bias = qkv_pack_bias[:full_q] - k_bias = qkv_pack_bias[full_q:end_k] - v_bias = qkv_pack_bias[end_k:] - query_key_value_bias = [q_bias, k_bias, v_bias] - self.layers_self_attention_linear_qkv_caches["bias"] = (qkv_concatenate_bias(query_key_value_bias)) - elif qkv_type == "pack_self": - qkv_pack = self.get_layers_self_attention_linear_qkv_pack_module(layer_idx=layer_idx) - qkv_pack_weight = qkv_pack.weight - self.layers_self_attention_linear_qkv_caches["weight"] = qkv_pack_weight - if self.args_cmd.add_qkv_bias: - qkv_pack_bias = qkv_pack.bias - full_q = dim * nh - end_k = full_q + ng * dim - q_bias = qkv_pack_bias[:full_q, :] - k_bias = qkv_pack_bias[full_q:end_k, :] - v_bias = qkv_pack_bias[end_k:, :] - query_key_value_bias = [q_bias, k_bias, v_bias] - self.layers_self_attention_linear_qkv_caches["bias"] = (qkv_concatenate_bias(query_key_value_bias)) - else: - raise ValueError(f"Unsupported types. 
{qkv_type}") - - def has_layers_mlp_linear_fc1_bias(self, **kwargs): - return False - - def get_layers_mlp_linear_fc1_weight(self, **kwargs): - fc_type = self.args.fc_type - if fc_type == "h_to_4h": - return self.get_layers_mlp_linear_fc1_module(**kwargs).weight - elif fc_type == "gate_up_down": - gate_proj = self.get_layers_mlp_gate_proj_weight(**kwargs) - up_proj = self.get_layers_mlp_up_proj_weight(**kwargs) - return torch.cat([gate_proj, up_proj], dim=0) - else: - raise ValueError(f"Unsupported fc_type {fc_type}") - - def get_layers_self_attention_linear_qkv_weight(self, layer_idx): - self.__get_layers_self_attention_linear_qkv_module(layer_idx=layer_idx) - return self.layers_self_attention_linear_qkv_caches["weight"] - - def get_layers_self_attention_linear_qkv_bias(self, layer_idx): - self.__get_layers_self_attention_linear_qkv_module(layer_idx=layer_idx) - return self.layers_self_attention_linear_qkv_caches["bias"] - - def set_layers_mlp_linear_fc1_weight(self, data=None, **kwargs): - gate_proj, up_proj = torch.chunk(data, 2, dim=0) - self.set_layers_mlp_gate_proj_weight(data=gate_proj, **kwargs) - self.set_layers_mlp_up_proj_weight(data=up_proj, **kwargs) - - def set_layers_mlp_experts_linear_fc1_weight(self, data=None, **kwargs): - gate_proj, up_proj = torch.chunk(data, 2, dim=0) - self.set_layers_mlp_experts_gate_proj_weight(data=gate_proj, **kwargs) - self.set_layers_mlp_experts_up_proj_weight(data=up_proj, **kwargs) - - def set_layers_mlp_shared_experts_linear_fc1_weight(self, data=None, **kwargs): - gate_proj, up_proj = torch.chunk(data, 2, dim=0) - self.set_layers_mlp_shared_experts_gate_proj_weight(data=gate_proj, **kwargs) - self.set_layers_mlp_shared_experts_up_proj_weight(data=up_proj, **kwargs) - - def set_layers_mlp_experts_weight1_module(self, data=None, **kwargs): - args = self.get_args() - num_experts = getattr(args, 'num_experts', None) or getattr(args, 'num_local_experts', None) - experts_linear_fc1_list = torch.chunk(data.view(-1), num_experts) - for expert_idx in range(num_experts): - kwargs['expert_idx'] = expert_idx - fc1_weight = experts_linear_fc1_list[expert_idx].view(args.hidden_size, -1).t() - self.set_layers_mlp_experts_linear_fc1_weight(data=fc1_weight, **kwargs) - - def set_layers_mlp_experts_weight2_module(self, data=None, **kwargs): - args = self.get_args() - num_experts = getattr(args, 'num_experts', None) or getattr(args, 'num_local_experts', None) - experts_linear_fc2_list = torch.chunk(data.view(-1), num_experts) - for expert_idx in range(num_experts): - kwargs['expert_idx'] = expert_idx - fc2_weight = experts_linear_fc2_list[expert_idx].view(-1, args.hidden_size).t() - self.set_layers_mlp_experts_linear_fc2_weight(data=fc2_weight, **kwargs) - - def get_layers_mlp_experts_linear_fc1_weight(self, **kwargs): - fc_type = self.args.fc_type - if fc_type == "h_to_4h": - return self.get_layers_mlp_experts_linear_fc1_module(**kwargs).weight - elif fc_type == "gate_up_down": - gate_proj = self.get_layers_mlp_experts_gate_proj_weight(**kwargs) - up_proj = self.get_layers_mlp_experts_up_proj_weight(**kwargs) - return torch.cat([gate_proj, up_proj], dim=0) - else: - raise ValueError(f"Unsupported fc_type {fc_type}") - - def get_layers_mlp_shared_experts_linear_fc1_weight(self, **kwargs): - fc_type = self.args.fc_type - if fc_type == "h_to_4h": - return self.get_layers_mlp_experts_linear_fc1_module(**kwargs).weight - elif fc_type == "gate_up_down": - gate_proj = self.get_layers_mlp_shared_experts_gate_proj_weight(**kwargs) - up_proj = 
self.get_layers_mlp_shared_experts_up_proj_weight(**kwargs) - return torch.cat([gate_proj, up_proj], dim=0) - else: - raise ValueError(f"Unsupported fc_type {fc_type}") - - def get_layers_mlp_experts_weight1_module(self, **kwargs): - args = self.get_args() - num_experts = getattr(args, 'num_experts', None) or getattr(args, 'num_local_experts', None) - experts_linear_fc1_list = [] - for expert_idx in range(num_experts): - kwargs['expert_idx'] = expert_idx - fc1_weight = self.get_layers_mlp_experts_linear_fc1_weight(**kwargs) - experts_linear_fc1_list.append(fc1_weight.t().view(-1)) - return torch.cat(experts_linear_fc1_list).view(args.hidden_size, -1) - - def get_layers_mlp_experts_weight2_module(self, **kwargs): - args = self.get_args() - num_experts = getattr(args, 'num_experts', None) or getattr(args, 'num_local_experts', None) - experts_linear_fc2_list = [] - for expert_idx in range(num_experts): - kwargs['expert_idx'] = expert_idx - fc2_weight = self.get_layers_mlp_experts_linear_fc2_weight(**kwargs) - experts_linear_fc2_list.append(fc2_weight.t().view(-1)) - return torch.cat(experts_linear_fc2_list).view(-1, args.hidden_size) - - def set_layers_self_attention_linear_qkv_weight(self, layer_idx=0, data=None): - def qkv_split_weight(query_key_value): - qkv_weight = query_key_value.reshape( - ng, - repeats + 2, - query_key_value.shape[0] // ng // (repeats + 2), - query_key_value.shape[1], - ) - hidden_size = qkv_weight.shape[-1] - qw = qkv_weight[:, :repeats, ...].reshape(-1, hidden_size) - kw = qkv_weight[:, repeats: repeats + 1, ...].reshape(-1, hidden_size) - vw = qkv_weight[:, repeats + 1:, ...].reshape(-1, hidden_size) - return qw, kw, vw - - nh = self.args.num_attention_heads - ng = (self.args.num_key_value_heads if self.args.group_query_attention else self.args.num_attention_heads) - if not nh % ng == 0: - raise ValueError("nh % ng should equal 0") - repeats = nh // ng - - qkv_type = self.args.qkv_type - if qkv_type == "unpack": - q_weight, k_weight, v_weight = qkv_split_weight(data) - self.set_layers_self_attention_linear_q_proj_weight(layer_idx=layer_idx, data=q_weight) - self.set_layers_self_attention_linear_k_proj_weight(layer_idx=layer_idx, data=k_weight) - self.set_layers_self_attention_linear_v_proj_weight(layer_idx=layer_idx, data=v_weight) - elif qkv_type == "pack_gqa": - qw, k_weight, v_weight = qkv_split_weight(data) - qkv = torch.cat((qw, k_weight, v_weight), dim=0) - self.set_layers_self_attention_linear_qkv_pack_weight(layer_idx=layer_idx, data=qkv) - elif qkv_type == "pack_mla": - if self.args.q_lora_rank is None: - q_head_dim = self.args.qk_nope_head_dim + self.args.qk_rope_head_dim - q_proj = data[:self.args.num_attention_heads * q_head_dim, :] - kv_proj = data[self.args.num_attention_heads * q_head_dim:, :] - else: - q_proj = data[:self.args.q_lora_rank, :] - kv_proj = data[self.args.q_lora_rank:, :] - self.set_layers_self_attention_linear_q_proj_weight(layer_idx=layer_idx, data=q_proj) - self.set_layers_self_attention_linear_kv_proj_weight(layer_idx=layer_idx, data=kv_proj) - elif qkv_type == "pack_self": - self.set_layers_self_attention_linear_qkv_pack_weight(layer_idx=layer_idx, data=data) - else: - raise ValueError(f"Unsupported types. 
{qkv_type}") - - def set_layers_self_attention_linear_qkv_bias(self, layer_idx, data=None): - def qkv_split_bias(query_key_value): - bias_weight = query_key_value.reshape( - ng, repeats + 2, query_key_value.shape[0] // ng // (repeats + 2) - ) - qw = bias_weight[:, :repeats, ...].reshape(-1) - kw = bias_weight[:, repeats: repeats + 1, ...].reshape(-1) - vw = bias_weight[:, repeats + 1:, ...].reshape(-1) - return qw, kw, vw - - nh = self.args.num_attention_heads - ng = (self.args.num_key_value_heads if self.args.group_query_attention else self.args.num_attention_heads) - if not nh % ng == 0: - raise ValueError("nh % ng should equal 0") - repeats = nh // ng - - qkv_type = self.args.qkv_type - if qkv_type == "unpack": - if self.args_cmd.add_qkv_bias: - q_bias, k_bias, v_bias = qkv_split_bias(data) - self.set_layers_self_attention_linear_q_proj_bias(layer_idx=layer_idx, data=q_bias) - self.set_layers_self_attention_linear_k_proj_bias(layer_idx=layer_idx, data=k_bias) - self.set_layers_self_attention_linear_v_proj_bias(layer_idx=layer_idx, data=v_bias) - elif qkv_type == "pack_gqa": - if self.args_cmd.add_qkv_bias: - q_bias, k_bias, v_bias = qkv_split_bias(data) - qkv_bias = torch.cat((q_bias, k_bias, v_bias), dim=0) - self.set_layers_self_attention_linear_qkv_pack_bias(layer_idx=layer_idx, data=qkv_bias) - else: - raise ValueError(f"Unsupported types. {qkv_type}") - - def get_model_item(self, **kwargs): - return self.module[0] - - -class MegatronModel(ModelBase): - def __init__(self, model_provider, args_cmd, md=None): - super(MegatronModel, self).__init__(args_cmd) - self.model_provider = model_provider_func_wrapper(model_provider) - self.md = md - self.pp_stage_cache = [] - - def initialize_megatron_args(self, hf_args=None, queue=None, loader_megatron=False, saver_megatron=False): - sys.argv = self.get_sys_argv() - self.args = parse_args() - - self.update_megatron_args_from_megatron_checkpoint(loader_megatron) - self.update_megatron_args_from_cmd_config(loader_megatron) - self.update_megatron_args_from_huggingface_config(hf_args) - - # Arguments do sanity checks on the world size, but we don't care, - # so trick it into thinking we are plenty of processes. - self.args.world_size = self.args.tensor_model_parallel_size * self.args.pipeline_model_parallel_size - self.update_megatron_args_from_loader_margs() - self.args = validate_args(self.args) - self.check_for_args(queue, saver_megatron) - - self.args.model_type = ModelType.encoder_or_decoder - # Suppress warning about torch.distributed not being initialized. 
- module.MegatronModule.embedding_warning_printed = True - set_args(self.args) - self.set_megatron_parallel_state(saver_megatron) - - def update_megatron_args_from_loader_margs(self): - if self.md and hasattr(self.md, 'checkpoint_args'): - # These are arguments that we are either changing, or cause problems for validation if they are set - args_to_keep = [ - 'tensor_model_parallel_size', 'pipeline_model_parallel_size', 'world_size', 'params_dtype', - 'num_layers_per_virtual_pipeline_stage', 'virtual_pipeline_model_parallel_size', - 'bias_gelu_fusion', 'bias_dropout_fusion', 'sequence_parallel', 'async_tensor_model_parallel_allreduce', - 'no_load_optim', 'no_load_rng', 'no_save_optim', 'no_save_rng', 'vocab_file', 'tokenizer_model', - 'save_interval', 'save', 'perform_initialization', 'use_cpu_initialization', 'recompute_granularity', - 'recompute_num_layers', 'recompute_method', 'encoder_num_layers', 'encoder_seq_length', - 'distribute_saved_activations', 'train_iters', 'lr_decay_iters', 'lr_warmup_iters', - 'lr_warmup_fraction', 'start_weight_decay', 'end_weight_decay', 'make_vocab_size_divisible_by', - 'masked_softmax_fusion', 'num_layer_list', 'lora_target_modules', 'expert_model_parallel_size', 'use_mcore_models' - ] - - for arg, value in vars(self.md.checkpoint_args).items(): - if arg in args_to_keep: - continue - if not hasattr(self.args, arg): - logger.warning(f"Checkpoint had argument {arg} but new arguments does not have this.") - continue - if getattr(self.args, arg) != value: - logger.warning( - f"Overwriting default {arg} value {getattr(self.args, arg)} with value from checkpoint {value}." - ) - setattr(self.args, arg, value) - - if hasattr(self.md, 'consumed_train_samples'): - self.args.consumed_train_samples = self.md.consumed_train_samples - self.args.consumed_valid_samples = self.md.consumed_valid_samples - logger.info(f"Setting consumed_train_samples to {self.args.consumed_train_samples} " - f"and consumed_valid_samples to {self.args.consumed_valid_samples}") - else: - logger.warning("consumed_train_samples not provided.") - - def update_megatron_args_from_huggingface_config(self, hf_args): - if hf_args is None: - return - try: - self.args.seq_length = getattr(hf_args, "max_position_embeddings", 4096) - self.args.global_batch_size = 1024 - self.args.max_position_embeddings = self.args.seq_length - self.args.norm_epsilon = getattr(hf_args, "norm_epsilon", 1e-6) - self.args.iteration = 1 # '0', 'release' don't work - self.args.hidden_size = hf_args.hidden_size - self.args.num_attention_heads = hf_args.num_attention_heads - - self.args.num_layers = hf_args.num_layers - if self.args.noop_layers is not None: - self.args.num_layers = hf_args.num_layers + len(self.args.noop_layers) - logger.info(f"[INFO] When using noop_layer, origin layers from huggingface is {hf_args.num_layers}, " - f"add noop layer {len(self.args.noop_layers)}, so megatron_ckpt has {self.args.num_layers}") - - self.args.add_position_embedding = hf_args.add_position_embedding - self.args.use_rotary_position_embeddings = hf_args.use_rotary_position_embeddings - self.args.swiglu = hf_args.swiglu - self.args.tokenizer_type = hf_args.tokenizer_type - self.args.normalization = hf_args.normalization - self.args.add_bias_linear = hf_args.add_bias_linear - self.args.untie_embeddings_and_output_weights = not getattr(hf_args, "tie_word_embeddings", False) - self.args.vocab_size = hf_args.vocab_size - self.args.padded_vocab_size = hf_args.vocab_size - self.args.llama = hf_args - self.args.ffn_hidden_size = 
hf_args.intermediate_size - self.args.gradient_accumulation_fusion = hf_args.gradient_accumulation_fusion - self.args.kv_channels = hf_args.kv_channels if hasattr(hf_args, "kv_channels") else None - self.args.moe_grouped_gemm = hf_args.moe_grouped_gemm - self.args.spec = hf_args.spec - self.args.num_experts = getattr(hf_args, "num_experts", None) - self.args.n_shared_experts = getattr(hf_args, "n_shared_experts", None) - self.args.shared_expert_gate = getattr(hf_args, "shared_expert_gate", None) - self.args.qk_layernorm = getattr(hf_args, "qk_layernorm", False) - self.args.moe_intermediate_size = getattr(hf_args, "moe_intermediate_size", None) - self.args.first_k_dense_replace = getattr(hf_args, "first_k_dense_replace", None) - self.args.moe_layer_freq = getattr(hf_args, "moe_layer_freq", None) - self.args.multi_head_latent_attention = getattr(hf_args, "multi_head_latent_attention", False) - self.args.shared_expert_intermediate_size = getattr(hf_args, "shared_expert_intermediate_size", None) - if self.args.shared_expert_intermediate_size is not None and self.args.n_shared_experts is None: - self.args.n_shared_experts = self.args.shared_expert_intermediate_size // self.args.moe_intermediate_size - if self.args.multi_head_latent_attention: - self.args.qk_rope_head_dim = getattr(hf_args, "qk_rope_head_dim", None) - self.args.qk_nope_head_dim = getattr(hf_args, "qk_nope_head_dim", None) - self.args.q_lora_rank = getattr(hf_args, "q_lora_rank", None) - self.args.kv_lora_rank = getattr(hf_args, "kv_lora_rank", None) - self.args.v_head_dim = getattr(hf_args, "v_head_dim", None) - - if self.args.add_dense_bias: - self.args.skip_bias_add = False - - if ( - hasattr(hf_args, "num_key_value_heads") and - hf_args.num_attention_heads != hf_args.num_key_value_heads - ): - if hf_args.num_attention_heads == 1: - raise AssertionError("Number of attention heads should be greater than 1!") - self.args.group_query_attention = True - self.args.num_query_groups = hf_args.num_key_value_heads - if hasattr(hf_args, 'num_local_experts'): - self.args.num_experts = hf_args.num_local_experts - except Exception as e: - logger.info(e) - raise AssertionError("You may got an incomplete config, please check hf config.json") - - - def update_megatron_args_from_megatron_checkpoint(self, loader_megatron): - if not loader_megatron: - return - set_args(self.args) - self.args, self.args_megatron_checkpoint = load_args_from_checkpoint(self.args) - - def update_megatron_args_from_cmd_config(self, loader_megatron): - self.args.w_pack = self.args_cmd.w_pack - self.args.add_qkv_bias = self.args_cmd.add_qkv_bias - self.args.add_dense_bias = self.args_cmd.add_dense_bias - self.args.post_norm = self.args_cmd.post_norm - self.args.tokenizer_model = getattr(self.args_cmd, 'tokenizer_model', None) - self.args.make_vocab_size_divisible_by = getattr(self.args_cmd, 'make_vocab_size_divisible_by', None) - if self.args_cmd.params_dtype == 'bf16': - self.args.bf16 = True - elif self.args_cmd.params_dtype == 'fp16': - self.args.fp16 = True - if self.args_cmd.add_dense_bias: - self.args.skip_bias_add = False - self.args.use_mcore_models = self.args_cmd.use_mcore_models - - if loader_megatron: - self.args.lora_target_modules = self.args_cmd.lora_target_modules - self.args.lora_load = self.args_cmd.lora_load - self.args.lora_r = self.args_cmd.lora_r - self.args.lora_alpha = self.args_cmd.lora_alpha - # Determine how to make our models. 
-        if not self.args_cmd.model_type == 'GPT':
-            raise ValueError("Llama-2 is a GPT model.")
-
-        if self.md and self.args_cmd.num_layer_list:
-            self.args.num_layer_list = self.args_cmd.num_layer_list
-
-        if self.args_cmd.noop_layers:
-            self.args.noop_layers = self.args_cmd.noop_layers.split(',')
-            self.args.noop_layers = [int(i) for i in self.args.noop_layers]
-
-        # gradient_accumulation_fusion should be False during checkpoint conversion
-        self.args.gradient_accumulation_fusion = False
-
-    def set_padded_vocab_size(self, padded_vocab_size):
-        self.args.padded_vocab_size = padded_vocab_size
-
-    def set_megatron_parallel_state(self, saver_megatron):
-        if saver_megatron:
-            self.set_tensor_model_parallel_world_size(self.args_cmd.target_tensor_parallel_size)
-            self.set_expert_model_parallel_world_size(self.args_cmd.target_expert_parallel_size)
-            self.set_pipeline_model_parallel_world_size(self.args_cmd.target_pipeline_parallel_size)
-            if self.args_cmd.num_layers_per_virtual_pipeline_stage:
-                vp_size = (self.args.num_layers //
-                           self.args_cmd.target_pipeline_parallel_size //
-                           self.args_cmd.num_layers_per_virtual_pipeline_stage)
-                self.set_virtual_pipeline_model_parallel_world_size(vp_size)
-        else:
-            self.set_tensor_model_parallel_world_size(self.args.tensor_model_parallel_size)
-            self.set_pipeline_model_parallel_world_size(self.args.pipeline_model_parallel_size)
-            self.set_virtual_pipeline_model_parallel_world_size(self.args.virtual_pipeline_model_parallel_size)
-
-        # Get first pipe stage.
-        self.set_tensor_model_parallel_rank(0)
-        self.set_pipeline_model_parallel_rank(0)
-
-    def get_modules_from_config(self, pp_stage_cache_flag=False):
-        self.__get_modules(pp_stage_cache_flag=pp_stage_cache_flag)
-
-    def get_modules_from_pretrained(self, pp_stage_cache_flag=False):
-        self.__get_modules(from_pretrained=True, pp_stage_cache_flag=pp_stage_cache_flag)
-
-    def __get_modules(self, from_pretrained=False, pp_stage_cache_flag=False):
-        if self.args.num_experts:
-            tensor_parallel.model_parallel_cuda_manual_seed(123)
-        # Initialize the dictionary for the parallel mode of the model
-        pp_rank = self.get_pipeline_model_parallel_rank()
-        if pp_stage_cache_flag and pp_rank < len(self.pp_stage_cache):
-            self.module = self.pp_stage_cache[pp_rank]
-            return
-
-        virtual_pipeline_model_parallel_size = self.args.virtual_pipeline_model_parallel_size
-        if virtual_pipeline_model_parallel_size is None:
-            virtual_pipeline_model_parallel_size = 1
-
-        models = [
-            [
-                [
-                    None for _ in range(self.args.tensor_model_parallel_size)
-                ]
-                for _ in range(self.args.expert_model_parallel_size)
-            ]
-            for _ in range(virtual_pipeline_model_parallel_size)
-        ]
-
-        for ep_rank in range(self.args.expert_model_parallel_size):
-            if self.args.expert_model_parallel_size > 1:
-                self.set_expert_model_parallel_rank(ep_rank)
-            for tp_rank in range(self.args.tensor_model_parallel_size):
-                self.set_tensor_model_parallel_rank(tp_rank)
-                if self.args.virtual_pipeline_model_parallel_size is not None:
-                    model_ = []
-                    for vp_rank in range(self.args.virtual_pipeline_model_parallel_size):
-                        self.set_virtual_pipeline_model_parallel_rank(vp_rank)
-                        # Set pre_process and post_process only after virtual rank is set.
- pre_process = mpu.is_pipeline_first_stage() - post_process = mpu.is_pipeline_last_stage() - expert_parallel_size = mpu.get_expert_model_parallel_world_size() - this_model = self.model_provider( - pre_process=pre_process, - post_process=post_process - ).to(self.args.params_dtype) - model_.append(this_model) - else: - pre_process = mpu.is_pipeline_first_stage() - post_process = mpu.is_pipeline_last_stage() - model_ = [self.model_provider(pre_process, post_process).to(self.args.params_dtype)] - self.args.consumed_train_samples = 0 - self.args.consumed_valid_samples = 0 - if from_pretrained: - load_checkpoint(model_, None, None) - for vp_rank in range(virtual_pipeline_model_parallel_size): - models[vp_rank][ep_rank][tp_rank] = model_[vp_rank] - if self.args.lora_target_modules and from_pretrained: - if virtual_pipeline_model_parallel_size > 1: - raise AssertionError("Virtual pipeline and LoRA weight merging " - "are not supported simultaneously") - models[vp_rank][ep_rank][tp_rank].merge_and_unload() - - self.module = models - - if pp_stage_cache_flag: - self.pp_stage_cache.append(models) - - def check_for_args(self, queue, saver_megatron): - if saver_megatron: - return - check_args_list = { - 'tensor_model_parallel_size': None, 'pipeline_model_parallel_size': None, 'num_layers': None, - 'hidden_size': None, 'seq_length': None, 'num_attention_heads': None, 'max_position_embeddings': None, - 'position_embedding_type': None, 'tokenizer_type': None, 'iteration': 1, 'bert_binary_head': None, - 'disable_bias_linear': False, 'params_dtype': None, 'swiglu': False - } - # if hasattr(self.args, 'add_bias_linear'): - # check_args_list['disable_bias_linear'] = self.args.add_bias_linear - - def check_for_arg(arg_name, default=None): - if getattr(self.args, arg_name, None) is None: - if default is not None: - setattr(self.args, arg_name, default) - elif queue is not None: - logger.error(f"Checkpoint does not specify the argument {arg_name}. 
Exiting.") - logger.info(f"Arguments: {self.args}") - queue.put("exit") - exit(1) - - for check_arg in check_args_list: - check_for_arg(check_arg, check_args_list[check_arg]) - - def get_sys_argv(self): - sys_argv = [ - 'script.py', - '--no-masked-softmax-fusion', - '--no-bias-gelu-fusion', - '--no-bias-dropout-fusion', - '--no-async-tensor-model-parallel-allreduce', - '--use-cpu-initialization', - '--micro-batch-size', '1', - '--no-load-optim', - '--no-load-rng', - '--no-save-optim', - '--no-save-rng', - '--no-initialization', - '--save-interval', '1', - '--mock-data', # To pass the "blend data checks" in arguments.py - '--load', self.args_cmd.load_dir, - '--finetune' - ] - - if hasattr(self.args_cmd, 'add_bias_linear') and not self.args_cmd.add_bias_linear: - sys_argv.append('--disable-bias-linear') - - if self.args_cmd.use_mcore_models: - sys_argv.append('--use-mcore-models') - - if self.model_cfg.get(self.args_cmd.model_type_hf).get('config_set_value').get('embed_layernorm', False): - sys_argv.append('--embed-layernorm') - - if self.md is None: - return sys_argv - - sys_argv.extend([ - '--num-layers', str(self.md.num_layers), - '--hidden-size', str(self.md.hidden_size), - '--seq-length', str(self.md.seq_length), - '--num-attention-heads', str(self.md.num_attention_heads), - '--max-position-embeddings', str(self.md.max_position_embeddings), - '--position-embedding-type', str(self.md.position_embedding_type), - '--tokenizer-type', str(self.md.tokenizer_type), - '--tensor-model-parallel-size', str(self.args_cmd.target_tensor_parallel_size), - '--pipeline-model-parallel-size', str(self.args_cmd.target_pipeline_parallel_size), - '--expert-model-parallel-size', str(self.args_cmd.target_expert_parallel_size), - '--save', self.args_cmd.save_dir - ]) - - if self.args_cmd.num_layers_per_virtual_pipeline_stage: - sys_argv.extend(['--num-layers-per-virtual-pipeline-stage', - str(self.args_cmd.num_layers_per_virtual_pipeline_stage)]) - - num_experts = getattr(self.md.checkpoint_args, 'num_experts', None) - if self.args_cmd.target_tensor_parallel_size > 1 and num_experts is not None and num_experts > 1: - sys_argv.append('--sequence-parallel') - - if self.md.make_vocab_size_divisible_by is not None: - sys_argv.extend(['--make-vocab-size-divisible-by', str(self.md.make_vocab_size_divisible_by)]) - if self.md.params_dtype == torch.float16: - sys_argv.append('--fp16') - elif self.md.params_dtype == torch.bfloat16: - sys_argv.append('--bf16') - - if self.md.output_layer: - sys_argv.append('--untie-embeddings-and-output-weights') - if not self.md.linear_bias: - sys_argv.append('--disable-bias-linear') - - if self.md.model_type == 'BERT' and not self.md.bert_binary_head: - sys_argv.append('--bert-no-binary-head') - - return sys_argv - - def get_model_item(self, **kwargs): - self.update_kwargs_idx(**kwargs) - _module = self.module - for key in self.kwargs_idx: - if "rank" in key: - _module = _module[self.kwargs_idx[key]] - return _module - - @staticmethod - def set_tensor_model_parallel_world_size(tensor_model_parallel_size): - mpu.set_tensor_model_parallel_world_size(tensor_model_parallel_size) - - @staticmethod - def set_expert_model_parallel_world_size(expert_model_parallel_size): - mpu.set_expert_model_parallel_world_size(expert_model_parallel_size) - - @staticmethod - def set_pipeline_model_parallel_world_size(pipeline_model_parallel_size): - mpu.set_pipeline_model_parallel_world_size(pipeline_model_parallel_size) - - @staticmethod - def 
set_virtual_pipeline_model_parallel_world_size(virtual_pipeline_model_parallel_size): - mpu.set_virtual_pipeline_model_parallel_world_size(virtual_pipeline_model_parallel_size) - - @staticmethod - def set_tensor_model_parallel_rank(tensor_model_parallel_rank): - mpu.set_tensor_model_parallel_rank(tensor_model_parallel_rank) - - @staticmethod - def set_pipeline_model_parallel_rank(pipeline_model_parallel_rank): - mpu.set_pipeline_model_parallel_rank(pipeline_model_parallel_rank) - - @staticmethod - def set_expert_model_parallel_rank(pipeline_model_parallel_rank): - mpu.set_expert_model_parallel_rank(pipeline_model_parallel_rank) - - @staticmethod - def set_virtual_pipeline_model_parallel_rank(pipeline_model_parallel_rank): - mpu.set_virtual_pipeline_model_parallel_rank(pipeline_model_parallel_rank) - - @staticmethod - def get_pipeline_model_parallel_rank(): - return mpu.get_pipeline_model_parallel_rank() - - -class MegatronLegacyModel(MegatronModel): - def __init__(self, model_provider, args_cmd, md=None): - super(MegatronLegacyModel, self).__init__(model_provider, args_cmd, md) - - def get_module_mapping(self): - module_layer = "language_model.encoder.layers[layer_idx]." - self.module_mapping = { - "embedding": "language_model.embedding", - "embedding_word_embeddings": "language_model.embedding.word_embeddings", - "embedding_word_embeddings_norm": "language_model.embedding.word_embeddings.norm", - "embedding_position_embeddings": "language_model.embedding.position_embeddings", - "model": "module", - "layers_input_layernorm": module_layer + "input_norm", - "layers": "language_model.encoder.layers", - "layers_self_attention_linear_proj": module_layer + "self_attention.dense", - "layers_self_attention_linear_qkv": module_layer + "self_attention.query_key_value", - "layers_self_attention_post_attention_layernorm": module_layer + "post_attention_norm", - "layers_self_attention_pre_mlp_layernorm": module_layer + "post_attention_norm", - "layers_mlp_linear_fc1": module_layer + "mlp.dense_h_to_4h", - "layers_mlp_linear_fc2": module_layer + "mlp.dense_4h_to_h", - "layers_self_attention_post_mlp_layernorm": module_layer + "post_mlp_layernorm", - "final_layernorm": "language_model.encoder.final_norm", - "output_layer": "language_model.output_layer", - "word_embeddings": "word_embeddings" - } - - -class MegatronMCoreModel(MegatronModel): - def __init__(self, model_provider, args_cmd, md=None): - super(MegatronMCoreModel, self).__init__(model_provider, args_cmd, md) - - def get_module_mapping(self): - module_layer = "decoder.layers[layer_idx]." 
-        self.module_mapping = {
-            "embedding": "embedding",
-            "embedding_word_embeddings": "embedding.word_embeddings",
-            "embedding_word_embeddings_norm": "embedding.word_embeddings.norm",
-            "embedding_position_embeddings": "embedding.position_embeddings",
-            "model": "module",
-            "layers_input_layernorm": module_layer + "input_layernorm",
-            "layers": "decoder.layers",
-            "layers_self_attention_linear_proj": module_layer + "self_attention.linear_proj",
-            "layers_self_attention_linear_qkv": module_layer + "self_attention.linear_qkv",
-            "layers_self_attention_q_layernorm": module_layer + "self_attention.q_layernorm",
-            "layers_self_attention_k_layernorm": module_layer + "self_attention.k_layernorm",
-            "layers_self_attention_post_attention_layernorm": module_layer + "post_attn_norm",
-            "layers_self_attention_pre_mlp_layernorm": module_layer + "pre_mlp_layernorm",
-            "layers_mlp_linear_fc1": module_layer + "mlp.linear_fc1",
-            "layers_mlp_linear_fc2": module_layer + "mlp.linear_fc2",
-            "layers_self_attention_post_mlp_layernorm": module_layer + "post_mlp_layernorm",
-            "final_layernorm": "decoder.final_layernorm",
-            "output_layer": "output_layer"
-        }
-
-        config_value = self.model_cfg.get(self.args_cmd.model_type_hf).get('config_set_value')
-
-        self.module_mapping["layers_mlp_router"] = module_layer + "mlp.router"
-        self.module_mapping[
-            "layers_mlp_experts_linear_fc1"] = module_layer + "mlp.experts.local_experts[expert_idx].linear_fc1"
-        self.module_mapping[
-            "layers_mlp_experts_linear_fc2"] = module_layer + "mlp.experts.local_experts[expert_idx].linear_fc2"
-
-        # MLP
-        self.module_mapping["layers_self_attention_linear_qb"] = module_layer + "self_attention.linear_qb"
-        self.module_mapping["layers_self_attention_linear_kvb"] = module_layer + "self_attention.linear_kvb"
-
-        # shared experts
-        self.module_mapping[
-            "layers_mlp_shared_experts_linear_fc1"] = module_layer + "mlp.shared_experts.linear_fc1"
-        self.module_mapping[
-            "layers_mlp_shared_experts_linear_fc2"] = module_layer + "mlp.shared_experts.linear_fc2"
-
-        # shared experts gate
-        if config_value.get('shared_expert_gate', False):
-            self.module_mapping["layers_mlp_shared_expert_gate"] = module_layer + "mlp.shared_expert_gate"
-
-        # moe grouped gemm
-        self.module_mapping[
-            "layers_mlp_experts_weight1"] = module_layer + "mlp.experts.weight1"
-        self.module_mapping[
-            "layers_mlp_experts_weight2"] = module_layer + "mlp.experts.weight2"
-
-
-def get_megatron_model(model_provider, args_cmd, md=None):
-    if args_cmd.use_mcore_models:
-        return MegatronMCoreModel(model_provider, args_cmd=args_cmd, md=md)
-    else:
-        return MegatronLegacyModel(model_provider, args_cmd=args_cmd, md=md)
-
-
-def get_huggingface_model(args_cmd):
-    return HuggingfaceModel(args_cmd)
+import abc
+import os
+import sys
+import re
+import json
+from types import SimpleNamespace
+import logging as logger
+from pathlib import Path
+from collections import OrderedDict
+from tqdm import tqdm
+import torch
+from transformers import AutoModelForCausalLM
+from megatron.core import mpu
+from megatron.training.arguments import validate_args
+from megatron.legacy.model import module
+from megatron.core.enums import ModelType
+from megatron.training.checkpointing import load_args_from_checkpoint
+from megatron.training.global_vars import set_args
+from megatron.training.checkpointing import load_checkpoint
+from megatron.core import tensor_parallel
+from mindspeed_llm.training.utils import parse_args
+from mindspeed_llm.training import model_provider_func_wrapper
+from mindspeed_llm.training.checkpointing import load_checkpoint_wrapper
+
+logger.basicConfig(format="")
+logger.getLogger().setLevel(logger.INFO)
+
+load_checkpoint = load_checkpoint_wrapper(load_checkpoint)
+
+
+def tensor_info(tensor):
+    shape = tensor.shape
+    mean_val = tensor.mean().item()
+    min_val = tensor.min().item()
+    max_val = tensor.max().item()
+    return f"shape: {shape} mean_val: {mean_val} min_val: {min_val} max_val: {max_val}"
+
+
+class ModelBase(abc.ABC):
+    def __init__(self, args_cmd=None):
+        self.args_cmd = args_cmd
+        self.args = None
+        self.args_megatron_checkpoint = None
+        self.module = None
+        self.module_mapping = None
+        self.model_cfg = self.read_model_cfg(args_cmd)
+        self.__register_functions()
+        self.kwargs_idx = OrderedDict({
+            "vp_rank": 0,
+            "ep_rank": 0,
+            "tp_rank": 0,
+            "layer_idx": 0,
+            "expert_idx": 0
+        })
+
+    def update_kwargs_idx(self, **kwargs):
+        for key in self.kwargs_idx:
+            if key in kwargs:
+                self.kwargs_idx[key] = kwargs[key]
+            else:
+                self.kwargs_idx[key] = 0
+
+    def __register_functions(self):
+        self.get_module_mapping()
+
+        def _get_obj(self, value, **kwargs):
+            pattern = r'(\w+)(?:\[(\w+)\])?'
+            matches = re.findall(pattern, value)
+            self.update_kwargs_idx(**kwargs)
+            obj = self.get_model_item(**kwargs)
+            for attr, attr_ident in matches:
+                if hasattr(obj, attr):
+                    obj = getattr(obj, attr)
+                else:
+                    return None
+                if attr_ident:
+                    if attr_ident in self.kwargs_idx:
+                        attr_idx = self.kwargs_idx[attr_ident]
+                        obj = obj[attr_idx]
+                    else:
+                        raise AssertionError(f"check {self.__class__.__name__}.module_mapping **{attr_ident}**.")
+            return obj
+
+        def _get_dst_obj(self, value, **kwargs):
+            if kwargs.get("layer_idx") is None:
+                kwargs["layer_idx"] = kwargs.get("dst_layer_idx")
+
+            return _get_obj(self, value, **kwargs)
+
+        def _get_src_obj(self, value, **kwargs):
+            if kwargs.get("layer_idx") is None:
+                kwargs["layer_idx"] = kwargs.get("src_layer_idx")
+
+            return _get_obj(self, value, **kwargs)
+
+        def _func_generator_get_module(value):
+            def func(self, **kwargs):
+                return _get_src_obj(self, value, **kwargs)
+            return func
+
+        def _func_generator_get_weight(value):
+            def func(self, **kwargs):
+                return _get_src_obj(self, value, **kwargs).weight.data
+            return func
+
+        def _func_generator_get_bias(value):
+            def func(self, **kwargs):
+                return _get_src_obj(self, value, **kwargs).bias.data
+            return func
+
+        def _func_generator_set_weight(value):
+            def func(self, **kwargs):
+                return _get_dst_obj(self, value, **kwargs).weight.data.copy_(kwargs.get('data'))
+            return func
+
+        def _func_generator_set_module(value):
+            def func(self, **kwargs):
+                return _get_dst_obj(self, value, **kwargs).data.copy_(kwargs.get('data'))
+            return func
+
+        def _func_generator_set_bias(value):
+            def func(self, **kwargs):
+                return _get_dst_obj(self, value, **kwargs).bias.data.copy_(kwargs.get('data'))
+            return func
+
+        def _func_generator_has_module(value):
+            def func(self, **kwargs):
+                obj = _get_src_obj(self, value, **kwargs)
+                return True if obj else False
+            return func
+
+        def _func_generator_has_bias(value):
+            def func(self, **kwargs):
+                bias = getattr(_get_src_obj(self, value, **kwargs), 'bias', None)
+                return bias is not None
+            return func
+
+        if self.module_mapping:
+            for key, value in self.module_mapping.items():
+                setattr(self, "get_" + key + "_module", _func_generator_get_module(value).__get__(self, ModelBase))
+                setattr(self, "set_" + key + "_module", _func_generator_set_module(value).__get__(self, ModelBase))
+                setattr(self, "get_" + key + "_weight", _func_generator_get_weight(value).__get__(self, ModelBase))
+                setattr(self, "get_" + key + "_bias", _func_generator_get_bias(value).__get__(self, ModelBase))
+                setattr(self, "set_" + key + "_weight", _func_generator_set_weight(value).__get__(self, ModelBase))
+                setattr(self, "set_" + key + "_bias", _func_generator_set_bias(value).__get__(self, ModelBase))
+                setattr(self, "has_" + key + "_module", _func_generator_has_module(value).__get__(self, ModelBase))
+                setattr(self, "has_" + key + "_bias", _func_generator_has_bias(value).__get__(self, ModelBase))
+
+    def update_module(self, src_model):
+        self.set_preprocess_state(src_model)
+        self.set_postprocess_state(src_model)
+        if not (hasattr(self.args, "noop_layers") and self.args.noop_layers):
+            for layer_idx in tqdm(range(self.args.num_layers), "set layer states"):
+                self.set_layer_state(src_model, layer_idx)
+            return
+
+        # Do ckpt conversion when noop layer is configured.
+        # For example, hf_layer = [0, 1], add noop layer [1, 3], then mg_layers = [0(0), 1(noop), 2(1), 3(noop)]
+        hf_num_layers = self.args.num_layers - len(self.args.noop_layers)
+        mg_layer_list = [i for i in range(hf_num_layers)]
+        for i in self.args.noop_layers:
+            # insert noop layer
+            mg_layer_list.insert(i, -1)
+        for dst_layer_idx, src_layer_idx in enumerate(mg_layer_list):
+            if not self.is_noop_layer(src_layer_idx):
+                self.set_layer_state_base(src_model, src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx)
+
+    def set_preprocess_state(self, src_model):
+        """Set embedding params."""
+        embeddings_weight = src_model.get_embedding_word_embeddings_weight()
+        if embeddings_weight.size(0) > self.get_embedding_word_embeddings_weight().size(0):
+            logger.info(f"Source embedding size: {embeddings_weight.size()} "
+                        f"Target embedding size: {self.get_embedding_word_embeddings_weight().size()}")
+            embeddings_weight = embeddings_weight[:self.get_embedding_word_embeddings_weight().size(0), :]
+        self.set_embedding_word_embeddings_weight(data=embeddings_weight)
+        if src_model.has_embedding_word_embeddings_norm_module():
+            embd_norm_weight = src_model.get_embedding_word_embeddings_norm_weight()
+            embd_norm_bias = src_model.get_embedding_word_embeddings_norm_bias()
+            self.set_embedding_word_embeddings_norm_weight(data=embd_norm_weight)
+            self.set_embedding_word_embeddings_norm_bias(data=embd_norm_bias)
+
+    def set_postprocess_state(self, src_model):
+        final_layernorm_weight = src_model.get_final_layernorm_weight()
+        self.set_final_layernorm_weight(data=final_layernorm_weight)
+        if self.args.untie_embeddings_and_output_weights:
+            output_layer_weight = src_model.get_output_layer_weight()
+            if output_layer_weight.size(0) > self.get_output_layer_weight().size(0):
+                logger.info(f"Source output layer weight size: {output_layer_weight.size()} "
+                            f"Target output layer weight size: {self.get_output_layer_weight().size()}")
+                output_layer_weight = output_layer_weight[:self.get_output_layer_weight().size(0), :]
+            self.set_output_layer_weight(data=output_layer_weight)
+        if self.has_final_layernorm_bias():
+            final_layernorm_bias = src_model.get_final_layernorm_bias()
+            self.set_final_layernorm_bias(data=final_layernorm_bias)
+
+    def set_layer_state(self, src_model, layer_idx):
+        """
+        For source layer index == destination layer index.
+ """ + self.set_layer_state_base(src_model, layer_idx, layer_idx) + + @staticmethod + def is_noop_layer(src_layer_idx): + return src_layer_idx == -1 + + def set_layer_state_base(self, src_model, src_layer_idx, dst_layer_idx): + """ + We update megatron module by transferring layers in src_layer_idx into those in dst_layer_idx . + """ + kwargs = {'src_layer_idx': src_layer_idx, "dst_layer_idx": dst_layer_idx} + self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) + self.set_mlp_state(src_model, **kwargs) + input_layernorm_weight = src_model.get_layers_input_layernorm_weight(layer_idx=src_layer_idx) + self.set_layers_input_layernorm_weight(layer_idx=dst_layer_idx, data=input_layernorm_weight) + + if self.args.post_norm: + post_attn_layernorm_weight = src_model.get_layers_self_attention_post_attention_layernorm_weight( + layer_idx=src_layer_idx) + self.set_layers_self_attention_post_attention_layernorm_weight(layer_idx=dst_layer_idx, + data=post_attn_layernorm_weight) + else: + pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight( + layer_idx=src_layer_idx) + self.set_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=dst_layer_idx, + data=pre_mlp_layernorm_weight) + + if self.has_layers_input_layernorm_bias(layer_idx=src_layer_idx): + input_layernorm_bias = src_model.get_layers_input_layernorm_bias(layer_idx=src_layer_idx) + self.set_layers_input_layernorm_bias(layer_idx=dst_layer_idx, data=input_layernorm_bias) + + if self.has_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=src_layer_idx): + pre_mlp_layernorm_bias = src_model.get_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=src_layer_idx) + self.set_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=dst_layer_idx, data=pre_mlp_layernorm_bias) + + def set_attn_state(self, src_layer_idx, dst_layer_idx, src_model): + """Set self-attention params.""" + if getattr(src_model.get_args(), "qk_layernorm", False): + if getattr(src_model.get_args(), "q_lora_rank", None): + q_layernorm = src_model.get_layers_self_attention_q_layernorm_weight(layer_idx=src_layer_idx) + self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) + k_layernorm = src_model.get_layers_self_attention_k_layernorm_weight(layer_idx=src_layer_idx) + self.set_layers_self_attention_k_layernorm_weight(layer_idx=dst_layer_idx, data=k_layernorm) + + if getattr(src_model.get_args(), "multi_head_latent_attention", False): + if getattr(src_model.get_args(), "q_lora_rank", None): + linear_qb = src_model.get_layers_self_attention_linear_qb_weight(layer_idx=src_layer_idx) + self.set_layers_self_attention_linear_qb_weight(layer_idx=dst_layer_idx, data=linear_qb) + linear_kvb = src_model.get_layers_self_attention_linear_kvb_weight(layer_idx=src_layer_idx) + self.set_layers_self_attention_linear_kvb_weight(layer_idx=dst_layer_idx, data=linear_kvb) + + qkv_weight = src_model.get_layers_self_attention_linear_qkv_weight(layer_idx=src_layer_idx) + proj_weight = src_model.get_layers_self_attention_linear_proj_weight(layer_idx=src_layer_idx) + self.set_layers_self_attention_linear_qkv_weight(layer_idx=dst_layer_idx, data=qkv_weight) + self.set_layers_self_attention_linear_proj_weight(layer_idx=dst_layer_idx, data=proj_weight) + if self.args.add_qkv_bias: + qkv_bias = src_model.get_layers_self_attention_linear_qkv_bias(layer_idx=src_layer_idx) + self.set_layers_self_attention_linear_qkv_bias(layer_idx=dst_layer_idx, data=qkv_bias) + if 
self.args.add_dense_bias: + proj_bias = src_model.get_layers_self_attention_linear_proj_bias(layer_idx=src_layer_idx) + self.set_layers_self_attention_linear_proj_bias(layer_idx=dst_layer_idx, data=proj_bias) + + def _set_mlp_state(self, src_model, **kwargs): + """Set MLP params.""" + fc1_weight = src_model.get_layers_mlp_linear_fc1_weight(**kwargs) + fc2_weight = src_model.get_layers_mlp_linear_fc2_weight(**kwargs) + self.set_layers_mlp_linear_fc1_weight(data=fc1_weight, **kwargs) + self.set_layers_mlp_linear_fc2_weight(data=fc2_weight, **kwargs) + if src_model.has_layers_mlp_linear_fc1_bias(**kwargs): + fc1_bias = src_model.get_layers_mlp_linear_fc1_bias(**kwargs) + self.set_layers_mlp_linear_fc1_bias(data=fc1_bias, **kwargs) + if src_model.has_layers_mlp_linear_fc2_bias(**kwargs): + fc2_bias = src_model.get_layers_mlp_linear_fc2_bias(**kwargs) + self.set_layers_mlp_linear_fc2_bias(data=fc2_bias, **kwargs) + if self.args.post_norm: + pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight(**kwargs) + post_mlp_layernorm_weight = src_model.get_layers_self_attention_post_mlp_layernorm_weight(**kwargs) + self.set_layers_self_attention_pre_mlp_layernorm_weight(data=pre_mlp_layernorm_weight, **kwargs) + self.set_layers_self_attention_post_mlp_layernorm_weight(data=post_mlp_layernorm_weight, **kwargs) + + def _set_mlp_experts_state(self, src_model, **kwargs): + """Set MLP experts params.""" + fc1_weight = src_model.get_layers_mlp_experts_linear_fc1_weight(**kwargs) + fc2_weight = src_model.get_layers_mlp_experts_linear_fc2_weight(**kwargs) + self.set_layers_mlp_experts_linear_fc1_weight(data=fc1_weight, **kwargs) + self.set_layers_mlp_experts_linear_fc2_weight(data=fc2_weight, **kwargs) + + def _set_mlp_shared_experts_state(self, src_model, **kwargs): + """Set MLP shared experts params.""" + fc1_weight = src_model.get_layers_mlp_shared_experts_linear_fc1_weight(**kwargs) + fc2_weight = src_model.get_layers_mlp_shared_experts_linear_fc2_weight(**kwargs) + self.set_layers_mlp_shared_experts_linear_fc1_weight(data=fc1_weight, **kwargs) + self.set_layers_mlp_shared_experts_linear_fc2_weight(data=fc2_weight, **kwargs) + + def _set_moe_grouped_gemm_state(self, src_model, **kwargs): + """Set MOE grouped gemm params.""" + weight1 = src_model.get_layers_mlp_experts_weight1_module(**kwargs) + weight2 = src_model.get_layers_mlp_experts_weight2_module(**kwargs) + self.set_layers_mlp_experts_weight1_module(data=weight1, **kwargs) + self.set_layers_mlp_experts_weight2_module(data=weight2, **kwargs) + + def set_mlp_state(self, src_model, **kwargs): + args = src_model.get_args() + num_experts = getattr(args, 'num_experts', None) or getattr(args, 'num_local_experts', None) + first_k_dense_replace = self.get_first_k_dense_replace() + moe_layer_freq = self.get_moe_layer_freq() + shared_expert_gate = getattr(args, 'shared_expert_gate', False) + dst_layer_idx = kwargs["dst_layer_idx"] + if dst_layer_idx >= first_k_dense_replace and dst_layer_idx % moe_layer_freq == 0: + router_weight = src_model.get_layers_mlp_router_weight(**kwargs) + self.set_layers_mlp_router_weight(**kwargs, data=router_weight) + if shared_expert_gate: + shared_expert_gate_weight = src_model.get_layers_mlp_shared_expert_gate_weight(**kwargs) + self.set_layers_mlp_shared_expert_gate_weight(**kwargs, data=shared_expert_gate_weight) + if getattr(self.args, "n_shared_experts", None) is not None: + self._set_mlp_shared_experts_state(src_model, **kwargs) + if args.moe_grouped_gemm: + 
self._set_moe_grouped_gemm_state(src_model, **kwargs) + else: + for expert_idx in range(num_experts): + kwargs['expert_idx'] = expert_idx + self._set_mlp_experts_state(src_model, **kwargs) + else: + self._set_mlp_state(src_model, **kwargs) + + def get_args(self): + return self.args + + def get_args_cmd(self): + return self.args_cmd + + def get_metadata(self): + return self.md + + def get_modules_count(self): + return len(self.module) + + def get_first_k_dense_replace(self): + if getattr(self.args, "first_k_dense_replace", None) is None: + num_experts = (getattr(self.args, 'num_experts', None) or + getattr(self.args, 'num_local_experts', None)) + if num_experts is None: + return self.args.num_layers + else: + return 0 + else: + return self.args.first_k_dense_replace + + def get_moe_layer_freq(self): + if getattr(self.args, "moe_layer_freq", None) is None: + return 1 + else: + return self.args.moe_layer_freq + + @staticmethod + def read_model_cfg(args_cmd): + def merge_configs(base_config, specific_config): + merged_config = base_config.copy() + for key, value in specific_config.items(): + if isinstance(value, dict) and key in merged_config: + merged_config[key] = merge_configs(merged_config[key], value) + else: + merged_config[key] = value + return merged_config + + if args_cmd.ckpt_cfg_path == "configs/checkpoint/model_cfg.json": + current_directory = os.path.dirname(os.path.abspath(__file__)) + cfg_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(current_directory))), "configs/checkpoint/model_cfg.json") + else: + cfg_dir = args_cmd.ckpt_cfg_path + with open(cfg_dir, 'r') as file: + config = json.load(file) + final_configs = {} + + for model_name, model_config in config["model_mappings"].items(): + if "__base__" in model_config: + base_model_name = model_config["__base__"] + base_config = config["model_mappings"][base_model_name] + specific_config = model_config.copy() + specific_config.pop("__base__", None) + final_config = merge_configs(base_config, specific_config) + else: + final_config = model_config + final_configs[model_name] = final_config + + return final_configs + + @abc.abstractmethod + def get_module_mapping(self): + pass + + @abc.abstractmethod + def get_model_item(self, **kwargs): + pass + + +class HuggingfaceModel(ModelBase): + def __init__(self, args_cmd): + super(HuggingfaceModel, self).__init__(args_cmd) + self.initialize_args() + self.layers_self_attention_linear_qkv_caches = {"layer_idx": -1, "weight": None, "bias": None} + + def initialize_args(self): + # Read huggingface args. 
+ if self.args_cmd.save_model_type == 'hf': + cfg_dir = self.args_cmd.save_dir + else: + cfg_dir = self.args_cmd.load_dir + llama_args_path = os.path.join(cfg_dir, "config.json") + with open(llama_args_path) as f: + self.args = json.load(f) + + config_key_mapping = self.model_cfg.get(self.args_cmd.model_type_hf).get('config_hf_key_mapping') + config_value = self.model_cfg.get(self.args_cmd.model_type_hf).get('config_set_value') + for key_target in config_key_mapping: + key_hf = config_key_mapping[key_target] + if self.args.get(key_hf, None) is not None: + self.args[key_target] = self.args[key_hf] + else: + logger.warning(f"{key_target} was not found in the config file.") + for key_target in config_value: + self.args[key_target] = config_value[key_target] + + if ( + "num_key_value_heads" in self.args and + self.args["num_attention_heads"] != self.args["num_key_value_heads"] + ): + if self.args["num_attention_heads"] == 1: + raise AssertionError("Number of attention heads should be greater than 1!") + self.args['group_query_attention'] = True + + self.args['untie_embeddings_and_output_weights'] = not self.args.get("tie_word_embeddings", False) + self.args = SimpleNamespace(**self.args) + self.args.add_qkv_bias = self.args_cmd.add_qkv_bias + self.args.add_dense_bias = self.args_cmd.add_dense_bias + self.args.post_norm = self.args_cmd.post_norm + + def get_modules_from_pretrained(self, device_map="cpu", trust_remote_code=True): + # Load Huggingface model. + if self.args_cmd.save_model_type == "hf": + load_dir = self.args_cmd.save_dir + else: + load_dir = self.args_cmd.load_dir + self.module = [AutoModelForCausalLM.from_pretrained(load_dir, device_map=device_map, trust_remote_code=trust_remote_code, local_files_only=True)] + if hasattr(self.args, "torch_dtype") and self.args.torch_dtype in ["float16", "bfloat16"]: + self.module[0] = self.module[0].to(eval(f'torch.{self.args.torch_dtype}')) + + def get_module_mapping(self): + self.module_mapping = self.model_cfg.get(self.args_cmd.model_type_hf).get('model_hf_key_mapping') + + def __get_layers_self_attention_linear_qkv_module(self, layer_idx=0): + if self.layers_self_attention_linear_qkv_caches["layer_idx"] == layer_idx: + return + self.layers_self_attention_linear_qkv_caches["layer_idx"] = layer_idx + # Reshape loaded weights. 
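+        # nh: number of attention heads, ng: number of key/value groups
+        # (ng == nh when grouped-query attention is not used), dim: per-head size.
+        # Q, K and V are regrouped per KV group and concatenated into the packed
+        # qkv layout expected on the Megatron side.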
+ nh = self.args.num_attention_heads + ng = (self.args.num_key_value_heads if self.args.group_query_attention else self.args.num_attention_heads) + dim = self.args.kv_channels if hasattr(self.args, "kv_channels") else self.args.hidden_size // self.args.num_attention_heads + if not nh % ng == 0: + raise ValueError("nh % ng should equal 0") + + def qkv_concatenate_weight(qkv): + return torch.cat([ + qkv[0].reshape((ng, dim * nh // ng, -1)), + qkv[1].reshape((ng, dim, -1)), + qkv[2].reshape((ng, dim, -1)), + ], dim=1).reshape((-1, self.args.hidden_size)) + + def qkv_concatenate_bias(qkv): + return torch.cat([ + qkv[0].reshape((ng, dim * nh // ng)), + qkv[1].reshape((ng, dim)), + qkv[2].reshape((ng, dim)), + ], dim=1).reshape((-1)) + + qkv_type = self.args.qkv_type + if qkv_type == "unpack": + q_proj = self.get_layers_self_attention_linear_q_proj_module(layer_idx=layer_idx) + k_proj = self.get_layers_self_attention_linear_k_proj_module(layer_idx=layer_idx) + v_proj = self.get_layers_self_attention_linear_v_proj_module(layer_idx=layer_idx) + query_key_value_weight = [q_proj.weight, k_proj.weight, v_proj.weight] + query_key_value_bias = [q_proj.bias, k_proj.bias, v_proj.bias] + self.layers_self_attention_linear_qkv_caches["weight"] = (qkv_concatenate_weight(query_key_value_weight)) + if self.args_cmd.add_qkv_bias: + self.layers_self_attention_linear_qkv_caches["bias"] = (qkv_concatenate_bias(query_key_value_bias)) + elif qkv_type == "pack_mla": + q_proj = self.get_layers_self_attention_linear_q_proj_module(layer_idx=layer_idx) + kv_proj = self.get_layers_self_attention_linear_kv_proj_module(layer_idx=layer_idx) + query_key_value_weight = [q_proj.weight.reshape((-1, self.args.hidden_size)), + kv_proj.weight.reshape((-1, self.args.hidden_size))] + self.layers_self_attention_linear_qkv_caches["weight"] = (torch.cat(query_key_value_weight, dim=0)) + if self.args_cmd.add_qkv_bias: + query_key_value_bias = [q_proj.bias, kv_proj.bias] + self.layers_self_attention_linear_qkv_caches["bias"] = (qkv_concatenate_bias(query_key_value_bias)) + elif qkv_type == "pack_gqa": + qkv_pack = self.get_layers_self_attention_linear_qkv_pack_module(layer_idx=layer_idx) + qkv_pack_weight = qkv_pack.weight + full_q = dim * nh + end_k = full_q + ng * dim + q_weight = qkv_pack_weight[:full_q, :] + k_weight = qkv_pack_weight[full_q:end_k, :] + v_weight = qkv_pack_weight[end_k:, :] + query_key_value_weight = [q_weight, k_weight, v_weight] + self.layers_self_attention_linear_qkv_caches["weight"] = (qkv_concatenate_weight(query_key_value_weight)) + if self.args_cmd.add_qkv_bias: + qkv_pack_bias = qkv_pack.bias + q_bias = qkv_pack_bias[:full_q] + k_bias = qkv_pack_bias[full_q:end_k] + v_bias = qkv_pack_bias[end_k:] + query_key_value_bias = [q_bias, k_bias, v_bias] + self.layers_self_attention_linear_qkv_caches["bias"] = (qkv_concatenate_bias(query_key_value_bias)) + elif qkv_type == "pack_self": + qkv_pack = self.get_layers_self_attention_linear_qkv_pack_module(layer_idx=layer_idx) + qkv_pack_weight = qkv_pack.weight + self.layers_self_attention_linear_qkv_caches["weight"] = qkv_pack_weight + if self.args_cmd.add_qkv_bias: + qkv_pack_bias = qkv_pack.bias + full_q = dim * nh + end_k = full_q + ng * dim + q_bias = qkv_pack_bias[:full_q, :] + k_bias = qkv_pack_bias[full_q:end_k, :] + v_bias = qkv_pack_bias[end_k:, :] + query_key_value_bias = [q_bias, k_bias, v_bias] + self.layers_self_attention_linear_qkv_caches["bias"] = (qkv_concatenate_bias(query_key_value_bias)) + else: + raise ValueError(f"Unsupported types. 
{qkv_type}") + + def has_layers_mlp_linear_fc1_bias(self, **kwargs): + return False + + def get_layers_mlp_linear_fc1_weight(self, **kwargs): + fc_type = self.args.fc_type + if fc_type == "h_to_4h": + return self.get_layers_mlp_linear_fc1_module(**kwargs).weight + elif fc_type == "gate_up_down": + gate_proj = self.get_layers_mlp_gate_proj_weight(**kwargs) + up_proj = self.get_layers_mlp_up_proj_weight(**kwargs) + return torch.cat([gate_proj, up_proj], dim=0) + else: + raise ValueError(f"Unsupported fc_type {fc_type}") + + def get_layers_self_attention_linear_qkv_weight(self, layer_idx): + self.__get_layers_self_attention_linear_qkv_module(layer_idx=layer_idx) + return self.layers_self_attention_linear_qkv_caches["weight"] + + def get_layers_self_attention_linear_qkv_bias(self, layer_idx): + self.__get_layers_self_attention_linear_qkv_module(layer_idx=layer_idx) + return self.layers_self_attention_linear_qkv_caches["bias"] + + def set_layers_mlp_linear_fc1_weight(self, data=None, **kwargs): + gate_proj, up_proj = torch.chunk(data, 2, dim=0) + self.set_layers_mlp_gate_proj_weight(data=gate_proj, **kwargs) + self.set_layers_mlp_up_proj_weight(data=up_proj, **kwargs) + + def set_layers_mlp_experts_linear_fc1_weight(self, data=None, **kwargs): + gate_proj, up_proj = torch.chunk(data, 2, dim=0) + self.set_layers_mlp_experts_gate_proj_weight(data=gate_proj, **kwargs) + self.set_layers_mlp_experts_up_proj_weight(data=up_proj, **kwargs) + + def set_layers_mlp_shared_experts_linear_fc1_weight(self, data=None, **kwargs): + gate_proj, up_proj = torch.chunk(data, 2, dim=0) + self.set_layers_mlp_shared_experts_gate_proj_weight(data=gate_proj, **kwargs) + self.set_layers_mlp_shared_experts_up_proj_weight(data=up_proj, **kwargs) + + def set_layers_mlp_experts_weight1_module(self, data=None, **kwargs): + args = self.get_args() + num_experts = getattr(args, 'num_experts', None) or getattr(args, 'num_local_experts', None) + experts_linear_fc1_list = torch.chunk(data.view(-1), num_experts) + for expert_idx in range(num_experts): + kwargs['expert_idx'] = expert_idx + fc1_weight = experts_linear_fc1_list[expert_idx].view(args.hidden_size, -1).t() + self.set_layers_mlp_experts_linear_fc1_weight(data=fc1_weight, **kwargs) + + def set_layers_mlp_experts_weight2_module(self, data=None, **kwargs): + args = self.get_args() + num_experts = getattr(args, 'num_experts', None) or getattr(args, 'num_local_experts', None) + experts_linear_fc2_list = torch.chunk(data.view(-1), num_experts) + for expert_idx in range(num_experts): + kwargs['expert_idx'] = expert_idx + fc2_weight = experts_linear_fc2_list[expert_idx].view(-1, args.hidden_size).t() + self.set_layers_mlp_experts_linear_fc2_weight(data=fc2_weight, **kwargs) + + def get_layers_mlp_experts_linear_fc1_weight(self, **kwargs): + fc_type = self.args.fc_type + if fc_type == "h_to_4h": + return self.get_layers_mlp_experts_linear_fc1_module(**kwargs).weight + elif fc_type == "gate_up_down": + gate_proj = self.get_layers_mlp_experts_gate_proj_weight(**kwargs) + up_proj = self.get_layers_mlp_experts_up_proj_weight(**kwargs) + return torch.cat([gate_proj, up_proj], dim=0) + else: + raise ValueError(f"Unsupported fc_type {fc_type}") + + def get_layers_mlp_shared_experts_linear_fc1_weight(self, **kwargs): + fc_type = self.args.fc_type + if fc_type == "h_to_4h": + return self.get_layers_mlp_experts_linear_fc1_module(**kwargs).weight + elif fc_type == "gate_up_down": + gate_proj = self.get_layers_mlp_shared_experts_gate_proj_weight(**kwargs) + up_proj = 
self.get_layers_mlp_shared_experts_up_proj_weight(**kwargs) + return torch.cat([gate_proj, up_proj], dim=0) + else: + raise ValueError(f"Unsupported fc_type {fc_type}") + + def get_layers_mlp_experts_weight1_module(self, **kwargs): + args = self.get_args() + num_experts = getattr(args, 'num_experts', None) or getattr(args, 'num_local_experts', None) + experts_linear_fc1_list = [] + for expert_idx in range(num_experts): + kwargs['expert_idx'] = expert_idx + fc1_weight = self.get_layers_mlp_experts_linear_fc1_weight(**kwargs) + experts_linear_fc1_list.append(fc1_weight.t().view(-1)) + return torch.cat(experts_linear_fc1_list).view(args.hidden_size, -1) + + def get_layers_mlp_experts_weight2_module(self, **kwargs): + args = self.get_args() + num_experts = getattr(args, 'num_experts', None) or getattr(args, 'num_local_experts', None) + experts_linear_fc2_list = [] + for expert_idx in range(num_experts): + kwargs['expert_idx'] = expert_idx + fc2_weight = self.get_layers_mlp_experts_linear_fc2_weight(**kwargs) + experts_linear_fc2_list.append(fc2_weight.t().view(-1)) + return torch.cat(experts_linear_fc2_list).view(-1, args.hidden_size) + + def set_layers_self_attention_linear_qkv_weight(self, layer_idx=0, data=None): + def qkv_split_weight(query_key_value): + qkv_weight = query_key_value.reshape( + ng, + repeats + 2, + query_key_value.shape[0] // ng // (repeats + 2), + query_key_value.shape[1], + ) + hidden_size = qkv_weight.shape[-1] + qw = qkv_weight[:, :repeats, ...].reshape(-1, hidden_size) + kw = qkv_weight[:, repeats: repeats + 1, ...].reshape(-1, hidden_size) + vw = qkv_weight[:, repeats + 1:, ...].reshape(-1, hidden_size) + return qw, kw, vw + + nh = self.args.num_attention_heads + ng = (self.args.num_key_value_heads if self.args.group_query_attention else self.args.num_attention_heads) + if not nh % ng == 0: + raise ValueError("nh % ng should equal 0") + repeats = nh // ng + + qkv_type = self.args.qkv_type + if qkv_type == "unpack": + q_weight, k_weight, v_weight = qkv_split_weight(data) + self.set_layers_self_attention_linear_q_proj_weight(layer_idx=layer_idx, data=q_weight) + self.set_layers_self_attention_linear_k_proj_weight(layer_idx=layer_idx, data=k_weight) + self.set_layers_self_attention_linear_v_proj_weight(layer_idx=layer_idx, data=v_weight) + elif qkv_type == "pack_gqa": + qw, k_weight, v_weight = qkv_split_weight(data) + qkv = torch.cat((qw, k_weight, v_weight), dim=0) + self.set_layers_self_attention_linear_qkv_pack_weight(layer_idx=layer_idx, data=qkv) + elif qkv_type == "pack_mla": + if self.args.q_lora_rank is None: + q_head_dim = self.args.qk_nope_head_dim + self.args.qk_rope_head_dim + q_proj = data[:self.args.num_attention_heads * q_head_dim, :] + kv_proj = data[self.args.num_attention_heads * q_head_dim:, :] + else: + q_proj = data[:self.args.q_lora_rank, :] + kv_proj = data[self.args.q_lora_rank:, :] + self.set_layers_self_attention_linear_q_proj_weight(layer_idx=layer_idx, data=q_proj) + self.set_layers_self_attention_linear_kv_proj_weight(layer_idx=layer_idx, data=kv_proj) + elif qkv_type == "pack_self": + self.set_layers_self_attention_linear_qkv_pack_weight(layer_idx=layer_idx, data=data) + else: + raise ValueError(f"Unsupported types. 
{qkv_type}") + + def set_layers_self_attention_linear_qkv_bias(self, layer_idx, data=None): + def qkv_split_bias(query_key_value): + bias_weight = query_key_value.reshape( + ng, repeats + 2, query_key_value.shape[0] // ng // (repeats + 2) + ) + qw = bias_weight[:, :repeats, ...].reshape(-1) + kw = bias_weight[:, repeats: repeats + 1, ...].reshape(-1) + vw = bias_weight[:, repeats + 1:, ...].reshape(-1) + return qw, kw, vw + + nh = self.args.num_attention_heads + ng = (self.args.num_key_value_heads if self.args.group_query_attention else self.args.num_attention_heads) + if not nh % ng == 0: + raise ValueError("nh % ng should equal 0") + repeats = nh // ng + + qkv_type = self.args.qkv_type + if qkv_type == "unpack": + if self.args_cmd.add_qkv_bias: + q_bias, k_bias, v_bias = qkv_split_bias(data) + self.set_layers_self_attention_linear_q_proj_bias(layer_idx=layer_idx, data=q_bias) + self.set_layers_self_attention_linear_k_proj_bias(layer_idx=layer_idx, data=k_bias) + self.set_layers_self_attention_linear_v_proj_bias(layer_idx=layer_idx, data=v_bias) + elif qkv_type == "pack_gqa": + if self.args_cmd.add_qkv_bias: + q_bias, k_bias, v_bias = qkv_split_bias(data) + qkv_bias = torch.cat((q_bias, k_bias, v_bias), dim=0) + self.set_layers_self_attention_linear_qkv_pack_bias(layer_idx=layer_idx, data=qkv_bias) + else: + raise ValueError(f"Unsupported types. {qkv_type}") + + def get_model_item(self, **kwargs): + return self.module[0] + + +class MegatronModel(ModelBase): + def __init__(self, model_provider, args_cmd, md=None): + super(MegatronModel, self).__init__(args_cmd) + self.model_provider = model_provider_func_wrapper(model_provider) + self.md = md + self.pp_stage_cache = [] + + def initialize_megatron_args(self, hf_args=None, queue=None, loader_megatron=False, saver_megatron=False): + sys.argv = self.get_sys_argv() + self.args = parse_args() + + self.update_megatron_args_from_megatron_checkpoint(loader_megatron) + self.update_megatron_args_from_cmd_config(loader_megatron) + self.update_megatron_args_from_huggingface_config(hf_args) + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes. + self.args.world_size = self.args.tensor_model_parallel_size * self.args.pipeline_model_parallel_size + self.update_megatron_args_from_loader_margs() + self.args = validate_args(self.args) + self.check_for_args(queue, saver_megatron) + + self.args.model_type = ModelType.encoder_or_decoder + # Suppress warning about torch.distributed not being initialized. 
+        module.MegatronModule.embedding_warning_printed = True
+        set_args(self.args)
+        self.set_megatron_parallel_state(saver_megatron)
+
+    def update_megatron_args_from_loader_margs(self):
+        if self.md and hasattr(self.md, 'checkpoint_args'):
+            # These are arguments that we are either changing, or that cause problems for validation if they are set.
+            args_to_keep = [
+                'tensor_model_parallel_size', 'pipeline_model_parallel_size', 'world_size', 'params_dtype',
+                'num_layers_per_virtual_pipeline_stage', 'virtual_pipeline_model_parallel_size',
+                'bias_gelu_fusion', 'bias_dropout_fusion', 'sequence_parallel', 'async_tensor_model_parallel_allreduce',
+                'no_load_optim', 'no_load_rng', 'no_save_optim', 'no_save_rng', 'vocab_file', 'tokenizer_model',
+                'save_interval', 'save', 'perform_initialization', 'use_cpu_initialization', 'recompute_granularity',
+                'recompute_num_layers', 'recompute_method', 'encoder_num_layers', 'encoder_seq_length',
+                'distribute_saved_activations', 'train_iters', 'lr_decay_iters', 'lr_warmup_iters',
+                'lr_warmup_fraction', 'start_weight_decay', 'end_weight_decay', 'make_vocab_size_divisible_by',
+                'masked_softmax_fusion', 'num_layer_list', 'lora_target_modules', 'expert_model_parallel_size', 'use_mcore_models'
+            ]
+
+            for arg, value in vars(self.md.checkpoint_args).items():
+                if arg in args_to_keep:
+                    continue
+                if not hasattr(self.args, arg):
+                    logger.warning(f"Checkpoint had argument {arg} but the new arguments do not have it.")
+                    continue
+                if getattr(self.args, arg) != value:
+                    logger.warning(
+                        f"Overwriting default {arg} value {getattr(self.args, arg)} with value from checkpoint {value}."
+                    )
+                    setattr(self.args, arg, value)
+
+        if hasattr(self.md, 'consumed_train_samples'):
+            self.args.consumed_train_samples = self.md.consumed_train_samples
+            self.args.consumed_valid_samples = self.md.consumed_valid_samples
+            logger.info(f"Setting consumed_train_samples to {self.args.consumed_train_samples} "
+                        f"and consumed_valid_samples to {self.args.consumed_valid_samples}")
+        else:
+            logger.warning("consumed_train_samples not provided.")
+
+    def update_megatron_args_from_huggingface_config(self, hf_args):
+        if hf_args is None:
+            return
+        try:
+            self.args.seq_length = getattr(hf_args, "max_position_embeddings", 4096)
+            self.args.global_batch_size = 1024
+            self.args.max_position_embeddings = self.args.seq_length
+            self.args.norm_epsilon = getattr(hf_args, "norm_epsilon", 1e-6)
+            self.args.iteration = 1  # '0', 'release' don't work
+            self.args.hidden_size = hf_args.hidden_size
+            self.args.num_attention_heads = hf_args.num_attention_heads
+
+            self.args.num_layers = hf_args.num_layers
+            if self.args.noop_layers is not None:
+                self.args.num_layers = hf_args.num_layers + len(self.args.noop_layers)
+                logger.info(f"When using noop_layers, the original number of layers from huggingface is {hf_args.num_layers}; "
+                            f"{len(self.args.noop_layers)} noop layers are added, so the megatron checkpoint has {self.args.num_layers} layers.")
+
+            self.args.add_position_embedding = hf_args.add_position_embedding
+            self.args.use_rotary_position_embeddings = hf_args.use_rotary_position_embeddings
+            self.args.swiglu = hf_args.swiglu
+            self.args.tokenizer_type = hf_args.tokenizer_type
+            self.args.normalization = hf_args.normalization
+            self.args.add_bias_linear = hf_args.add_bias_linear
+            self.args.untie_embeddings_and_output_weights = not getattr(hf_args, "tie_word_embeddings", False)
+            self.args.vocab_size = hf_args.vocab_size
+            self.args.padded_vocab_size = hf_args.vocab_size
+            self.args.llama = hf_args
+            self.args.ffn_hidden_size = hf_args.intermediate_size
+            self.args.gradient_accumulation_fusion = hf_args.gradient_accumulation_fusion
+            self.args.kv_channels = hf_args.kv_channels if hasattr(hf_args, "kv_channels") else None
+            self.args.moe_grouped_gemm = hf_args.moe_grouped_gemm
+            self.args.spec = hf_args.spec
+            self.args.num_experts = getattr(hf_args, "num_experts", None)
+            self.args.n_shared_experts = getattr(hf_args, "n_shared_experts", None)
+            self.args.shared_expert_gate = getattr(hf_args, "shared_expert_gate", None)
+            self.args.qk_layernorm = getattr(hf_args, "qk_layernorm", False)
+            self.args.moe_intermediate_size = getattr(hf_args, "moe_intermediate_size", None)
+            self.args.first_k_dense_replace = getattr(hf_args, "first_k_dense_replace", None)
+            self.args.moe_layer_freq = getattr(hf_args, "moe_layer_freq", None)
+            self.args.multi_head_latent_attention = getattr(hf_args, "multi_head_latent_attention", False)
+            self.args.shared_expert_intermediate_size = getattr(hf_args, "shared_expert_intermediate_size", None)
+            if self.args.shared_expert_intermediate_size is not None and self.args.n_shared_experts is None:
+                self.args.n_shared_experts = self.args.shared_expert_intermediate_size // self.args.moe_intermediate_size
+            if self.args.multi_head_latent_attention:
+                self.args.qk_rope_head_dim = getattr(hf_args, "qk_rope_head_dim", None)
+                self.args.qk_nope_head_dim = getattr(hf_args, "qk_nope_head_dim", None)
+                self.args.q_lora_rank = getattr(hf_args, "q_lora_rank", None)
+                self.args.kv_lora_rank = getattr(hf_args, "kv_lora_rank", None)
+                self.args.v_head_dim = getattr(hf_args, "v_head_dim", None)
+
+            if self.args.add_dense_bias:
+                self.args.skip_bias_add = False
+
+            if (
+                hasattr(hf_args, "num_key_value_heads") and
+                hf_args.num_attention_heads != hf_args.num_key_value_heads
+            ):
+                if hf_args.num_attention_heads == 1:
+                    raise AssertionError("Number of attention heads should be greater than 1!")
+                self.args.group_query_attention = True
+                self.args.num_query_groups = hf_args.num_key_value_heads
+            if hasattr(hf_args, 'num_local_experts'):
+                self.args.num_experts = hf_args.num_local_experts
+        except Exception as e:
+            logger.info(e)
+            raise AssertionError("The HF config may be incomplete; please check config.json.")
+
+
+    def update_megatron_args_from_megatron_checkpoint(self, loader_megatron):
+        if not loader_megatron:
+            return
+        set_args(self.args)
+        self.args, self.args_megatron_checkpoint = load_args_from_checkpoint(self.args)
+
+    def update_megatron_args_from_cmd_config(self, loader_megatron):
+        self.args.w_pack = self.args_cmd.w_pack
+        self.args.add_qkv_bias = self.args_cmd.add_qkv_bias
+        self.args.add_dense_bias = self.args_cmd.add_dense_bias
+        self.args.post_norm = self.args_cmd.post_norm
+        self.args.tokenizer_model = getattr(self.args_cmd, 'tokenizer_model', None)
+        self.args.make_vocab_size_divisible_by = getattr(self.args_cmd, 'make_vocab_size_divisible_by', None)
+        if self.args_cmd.params_dtype == 'bf16':
+            self.args.bf16 = True
+        elif self.args_cmd.params_dtype == 'fp16':
+            self.args.fp16 = True
+        if self.args_cmd.add_dense_bias:
+            self.args.skip_bias_add = False
+        self.args.use_mcore_models = self.args_cmd.use_mcore_models
+
+        if loader_megatron:
+            self.args.lora_target_modules = self.args_cmd.lora_target_modules
+            self.args.lora_load = self.args_cmd.lora_load
+            self.args.lora_r = self.args_cmd.lora_r
+            self.args.lora_alpha = self.args_cmd.lora_alpha
+        # Determine how to make our models.
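+        # Only the GPT model family is handled by this converter; Llama-style
+        # checkpoints are expected to be converted with model_type 'GPT'.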
+        if not self.args_cmd.model_type == 'GPT':
+            raise ValueError("Unsupported model_type: only 'GPT' is supported (Llama-style models are GPT models).")
+
+        if self.md and self.args_cmd.num_layer_list:
+            self.args.num_layer_list = self.args_cmd.num_layer_list
+
+        if self.args_cmd.noop_layers:
+            self.args.noop_layers = self.args_cmd.noop_layers.split(',')
+            self.args.noop_layers = [int(i) for i in self.args.noop_layers]
+
+        # gradient_accumulation_fusion should be False during checkpoint conversion.
+        self.args.gradient_accumulation_fusion = False
+
+    def set_padded_vocab_size(self, padded_vocab_size):
+        self.args.padded_vocab_size = padded_vocab_size
+
+    def set_megatron_parallel_state(self, saver_megatron):
+        if saver_megatron:
+            self.set_tensor_model_parallel_world_size(self.args_cmd.target_tensor_parallel_size)
+            self.set_expert_model_parallel_world_size(self.args_cmd.target_expert_parallel_size)
+            self.set_pipeline_model_parallel_world_size(self.args_cmd.target_pipeline_parallel_size)
+            if self.args_cmd.num_layers_per_virtual_pipeline_stage:
+                vp_size = (self.args.num_layers //
+                           self.args_cmd.target_pipeline_parallel_size //
+                           self.args_cmd.num_layers_per_virtual_pipeline_stage)
+                self.set_virtual_pipeline_model_parallel_world_size(vp_size)
+        else:
+            self.set_tensor_model_parallel_world_size(self.args.tensor_model_parallel_size)
+            self.set_pipeline_model_parallel_world_size(self.args.pipeline_model_parallel_size)
+            self.set_virtual_pipeline_model_parallel_world_size(self.args.virtual_pipeline_model_parallel_size)
+
+        # Get first pipe stage.
+        self.set_tensor_model_parallel_rank(0)
+        self.set_pipeline_model_parallel_rank(0)
+
+    def get_modules_from_config(self, pp_stage_cache_flag=False):
+        self.__get_modules(pp_stage_cache_flag=pp_stage_cache_flag)
+
+    def get_modules_from_pretrained(self, pp_stage_cache_flag=False):
+        self.__get_modules(from_pretrained=True, pp_stage_cache_flag=pp_stage_cache_flag)
+
+    def __get_modules(self, from_pretrained=False, pp_stage_cache_flag=False):
+        if self.args.num_experts:
+            tensor_parallel.model_parallel_cuda_manual_seed(123)
+        # Initialize the dictionary for the parallel mode of the model
+        pp_rank = self.get_pipeline_model_parallel_rank()
+        if pp_stage_cache_flag and pp_rank < len(self.pp_stage_cache):
+            self.module = self.pp_stage_cache[pp_rank]
+            return
+
+        virtual_pipeline_model_parallel_size = self.args.virtual_pipeline_model_parallel_size
+        if virtual_pipeline_model_parallel_size is None:
+            virtual_pipeline_model_parallel_size = 1
+
+        models = [
+            [
+                [
+                    None for _ in range(self.args.tensor_model_parallel_size)
+                ]
+                for _ in range(self.args.expert_model_parallel_size)
+            ]
+            for _ in range(virtual_pipeline_model_parallel_size)
+        ]
+
+        for ep_rank in range(self.args.expert_model_parallel_size):
+            if self.args.expert_model_parallel_size > 1:
+                self.set_expert_model_parallel_rank(ep_rank)
+            for tp_rank in range(self.args.tensor_model_parallel_size):
+                self.set_tensor_model_parallel_rank(tp_rank)
+                if self.args.virtual_pipeline_model_parallel_size is not None:
+                    model_ = []
+                    for vp_rank in range(self.args.virtual_pipeline_model_parallel_size):
+                        self.set_virtual_pipeline_model_parallel_rank(vp_rank)
+                        # Set pre_process and post_process only after virtual rank is set.
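+                        # The first pipeline stage owns the word embeddings and the last
+                        # stage owns the output layer, so both flags depend on the rank set above.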
+ pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + expert_parallel_size = mpu.get_expert_model_parallel_world_size() + this_model = self.model_provider( + pre_process=pre_process, + post_process=post_process + ).to(self.args.params_dtype) + model_.append(this_model) + else: + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + model_ = [self.model_provider(pre_process, post_process).to(self.args.params_dtype)] + self.args.consumed_train_samples = 0 + self.args.consumed_valid_samples = 0 + if from_pretrained: + load_checkpoint(model_, None, None) + for vp_rank in range(virtual_pipeline_model_parallel_size): + models[vp_rank][ep_rank][tp_rank] = model_[vp_rank] + if self.args.lora_target_modules and from_pretrained: + if virtual_pipeline_model_parallel_size > 1: + raise AssertionError("Virtual pipeline and LoRA weight merging " + "are not supported simultaneously") + models[vp_rank][ep_rank][tp_rank].merge_and_unload() + + self.module = models + + if pp_stage_cache_flag: + self.pp_stage_cache.append(models) + + def check_for_args(self, queue, saver_megatron): + if saver_megatron: + return + check_args_list = { + 'tensor_model_parallel_size': None, 'pipeline_model_parallel_size': None, 'num_layers': None, + 'hidden_size': None, 'seq_length': None, 'num_attention_heads': None, 'max_position_embeddings': None, + 'position_embedding_type': None, 'tokenizer_type': None, 'iteration': 1, 'bert_binary_head': None, + 'disable_bias_linear': False, 'params_dtype': None, 'swiglu': False + } + # if hasattr(self.args, 'add_bias_linear'): + # check_args_list['disable_bias_linear'] = self.args.add_bias_linear + + def check_for_arg(arg_name, default=None): + if getattr(self.args, arg_name, None) is None: + if default is not None: + setattr(self.args, arg_name, default) + elif queue is not None: + logger.error(f"Checkpoint does not specify the argument {arg_name}. 
Exiting.") + logger.info(f"Arguments: {self.args}") + queue.put("exit") + exit(1) + + for check_arg in check_args_list: + check_for_arg(check_arg, check_args_list[check_arg]) + + def get_sys_argv(self): + sys_argv = [ + 'script.py', + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--no-initialization', + '--save-interval', '1', + '--mock-data', # To pass the "blend data checks" in arguments.py + '--load', self.args_cmd.load_dir, + '--finetune' + ] + + if hasattr(self.args_cmd, 'add_bias_linear') and not self.args_cmd.add_bias_linear: + sys_argv.append('--disable-bias-linear') + + if self.args_cmd.use_mcore_models: + sys_argv.append('--use-mcore-models') + + if self.model_cfg.get(self.args_cmd.model_type_hf).get('config_set_value').get('embed_layernorm', False): + sys_argv.append('--embed-layernorm') + + if self.md is None: + return sys_argv + + sys_argv.extend([ + '--num-layers', str(self.md.num_layers), + '--hidden-size', str(self.md.hidden_size), + '--seq-length', str(self.md.seq_length), + '--num-attention-heads', str(self.md.num_attention_heads), + '--max-position-embeddings', str(self.md.max_position_embeddings), + '--position-embedding-type', str(self.md.position_embedding_type), + '--tokenizer-type', str(self.md.tokenizer_type), + '--tensor-model-parallel-size', str(self.args_cmd.target_tensor_parallel_size), + '--pipeline-model-parallel-size', str(self.args_cmd.target_pipeline_parallel_size), + '--expert-model-parallel-size', str(self.args_cmd.target_expert_parallel_size), + '--save', self.args_cmd.save_dir + ]) + + if self.args_cmd.num_layers_per_virtual_pipeline_stage: + sys_argv.extend(['--num-layers-per-virtual-pipeline-stage', + str(self.args_cmd.num_layers_per_virtual_pipeline_stage)]) + + num_experts = getattr(self.md.checkpoint_args, 'num_experts', None) + if self.args_cmd.target_tensor_parallel_size > 1 and num_experts is not None and num_experts > 1: + sys_argv.append('--sequence-parallel') + + if self.md.make_vocab_size_divisible_by is not None: + sys_argv.extend(['--make-vocab-size-divisible-by', str(self.md.make_vocab_size_divisible_by)]) + if self.md.params_dtype == torch.float16: + sys_argv.append('--fp16') + elif self.md.params_dtype == torch.bfloat16: + sys_argv.append('--bf16') + + if self.md.output_layer: + sys_argv.append('--untie-embeddings-and-output-weights') + if not self.md.linear_bias: + sys_argv.append('--disable-bias-linear') + + if self.md.model_type == 'BERT' and not self.md.bert_binary_head: + sys_argv.append('--bert-no-binary-head') + + return sys_argv + + def get_model_item(self, **kwargs): + self.update_kwargs_idx(**kwargs) + _module = self.module + for key in self.kwargs_idx: + if "rank" in key: + _module = _module[self.kwargs_idx[key]] + return _module + + @staticmethod + def set_tensor_model_parallel_world_size(tensor_model_parallel_size): + mpu.set_tensor_model_parallel_world_size(tensor_model_parallel_size) + + @staticmethod + def set_expert_model_parallel_world_size(expert_model_parallel_size): + mpu.set_expert_model_parallel_world_size(expert_model_parallel_size) + + @staticmethod + def set_pipeline_model_parallel_world_size(pipeline_model_parallel_size): + mpu.set_pipeline_model_parallel_world_size(pipeline_model_parallel_size) + + @staticmethod + def 
set_virtual_pipeline_model_parallel_world_size(virtual_pipeline_model_parallel_size): + mpu.set_virtual_pipeline_model_parallel_world_size(virtual_pipeline_model_parallel_size) + + @staticmethod + def set_tensor_model_parallel_rank(tensor_model_parallel_rank): + mpu.set_tensor_model_parallel_rank(tensor_model_parallel_rank) + + @staticmethod + def set_pipeline_model_parallel_rank(pipeline_model_parallel_rank): + mpu.set_pipeline_model_parallel_rank(pipeline_model_parallel_rank) + + @staticmethod + def set_expert_model_parallel_rank(pipeline_model_parallel_rank): + mpu.set_expert_model_parallel_rank(pipeline_model_parallel_rank) + + @staticmethod + def set_virtual_pipeline_model_parallel_rank(pipeline_model_parallel_rank): + mpu.set_virtual_pipeline_model_parallel_rank(pipeline_model_parallel_rank) + + @staticmethod + def get_pipeline_model_parallel_rank(): + return mpu.get_pipeline_model_parallel_rank() + + +class MegatronLegacyModel(MegatronModel): + def __init__(self, model_provider, args_cmd, md=None): + super(MegatronLegacyModel, self).__init__(model_provider, args_cmd, md) + + def get_module_mapping(self): + module_layer = "language_model.encoder.layers[layer_idx]." + self.module_mapping = { + "embedding": "language_model.embedding", + "embedding_word_embeddings": "language_model.embedding.word_embeddings", + "embedding_word_embeddings_norm": "language_model.embedding.word_embeddings.norm", + "embedding_position_embeddings": "language_model.embedding.position_embeddings", + "model": "module", + "layers_input_layernorm": module_layer + "input_norm", + "layers": "language_model.encoder.layers", + "layers_self_attention_linear_proj": module_layer + "self_attention.dense", + "layers_self_attention_linear_qkv": module_layer + "self_attention.query_key_value", + "layers_self_attention_post_attention_layernorm": module_layer + "post_attention_norm", + "layers_self_attention_pre_mlp_layernorm": module_layer + "post_attention_norm", + "layers_mlp_linear_fc1": module_layer + "mlp.dense_h_to_4h", + "layers_mlp_linear_fc2": module_layer + "mlp.dense_4h_to_h", + "layers_self_attention_post_mlp_layernorm": module_layer + "post_mlp_layernorm", + "final_layernorm": "language_model.encoder.final_norm", + "output_layer": "language_model.output_layer", + "word_embeddings": "word_embeddings" + } + + +class MegatronMCoreModel(MegatronModel): + def __init__(self, model_provider, args_cmd, md=None): + super(MegatronMCoreModel, self).__init__(model_provider, args_cmd, md) + + def get_module_mapping(self): + module_layer = "decoder.layers[layer_idx]." 
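+        # Generic accessor names are mapped to attribute paths inside the mcore
+        # GPTModel; index placeholders such as [layer_idx] and [expert_idx] are
+        # expected to be resolved by the generated getter/setter helpers.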
+ self.module_mapping = { + "embedding": "embedding", + "embedding_word_embeddings": "embedding.word_embeddings", + "embedding_word_embeddings_norm": "embedding.word_embeddings.norm", + "embedding_position_embeddings": "embedding.position_embeddings", + "model": "module", + "layers_input_layernorm": module_layer + "input_layernorm", + "layers": "decoder.layers", + "layers_self_attention_linear_proj": module_layer + "self_attention.linear_proj", + "layers_self_attention_linear_qkv": module_layer + "self_attention.linear_qkv", + "layers_self_attention_q_layernorm": module_layer + "self_attention.q_layernorm", + "layers_self_attention_k_layernorm": module_layer + "self_attention.k_layernorm", + "layers_self_attention_post_attention_layernorm": module_layer + "post_attn_norm", + "layers_self_attention_pre_mlp_layernorm": module_layer + "pre_mlp_layernorm", + "layers_mlp_linear_fc1": module_layer + "mlp.linear_fc1", + "layers_mlp_linear_fc2": module_layer + "mlp.linear_fc2", + "layers_self_attention_post_mlp_layernorm": module_layer + "post_mlp_layernorm", + "final_layernorm": "decoder.final_layernorm", + "output_layer": "output_layer" + } + + config_value = self.model_cfg.get(self.args_cmd.model_type_hf).get('config_set_value') + + self.module_mapping["layers_mlp_router"] = module_layer + "mlp.router" + self.module_mapping[ + "layers_mlp_experts_linear_fc1"] = module_layer + "mlp.experts.local_experts[expert_idx].linear_fc1" + self.module_mapping[ + "layers_mlp_experts_linear_fc2"] = module_layer + "mlp.experts.local_experts[expert_idx].linear_fc2" + + # MLP + self.module_mapping["layers_self_attention_linear_qb"] = module_layer + "self_attention.linear_qb" + self.module_mapping["layers_self_attention_linear_kvb"] = module_layer + "self_attention.linear_kvb" + + # shared experts + self.module_mapping[ + "layers_mlp_shared_experts_linear_fc1"] = module_layer + "mlp.shared_experts.linear_fc1" + self.module_mapping[ + "layers_mlp_shared_experts_linear_fc2"] = module_layer + "mlp.shared_experts.linear_fc2" + + # shared experts gate + if config_value.get('shared_expert_gate', False): + self.module_mapping["layers_mlp_shared_expert_gate"] = module_layer + "mlp.shared_expert_gate" + + # moe grouped gemm + self.module_mapping[ + "layers_mlp_experts_weight1"] = module_layer + "mlp.experts.weight1" + self.module_mapping[ + "layers_mlp_experts_weight2"] = module_layer + "mlp.experts.weight2" + + +def get_megatron_model(model_provider, args_cmd, md=None): + if args_cmd.use_mcore_models: + return MegatronMCoreModel(model_provider, args_cmd=args_cmd, md=md) + else: + return MegatronLegacyModel(model_provider, args_cmd=args_cmd, md=md) + + +def get_huggingface_model(args_cmd): + return HuggingfaceModel(args_cmd) diff --git a/modellink/tasks/checkpoint/saver.py b/mindspeed_llm/tasks/checkpoint/saver.py similarity index 100% rename from modellink/tasks/checkpoint/saver.py rename to mindspeed_llm/tasks/checkpoint/saver.py diff --git a/modellink/tasks/data/__init__.py b/mindspeed_llm/tasks/data/__init__.py similarity index 100% rename from modellink/tasks/data/__init__.py rename to mindspeed_llm/tasks/data/__init__.py diff --git a/modellink/tasks/data/collator.py b/mindspeed_llm/tasks/data/collator.py similarity index 100% rename from modellink/tasks/data/collator.py rename to mindspeed_llm/tasks/data/collator.py diff --git a/modellink/tasks/error_utils.py b/mindspeed_llm/tasks/error_utils.py similarity index 100% rename from modellink/tasks/error_utils.py rename to mindspeed_llm/tasks/error_utils.py diff --git 
a/modellink/tasks/evaluation/__init__.py b/mindspeed_llm/tasks/evaluation/__init__.py similarity index 100% rename from modellink/tasks/evaluation/__init__.py rename to mindspeed_llm/tasks/evaluation/__init__.py diff --git a/modellink/tasks/evaluation/eval_api/__init__.py b/mindspeed_llm/tasks/evaluation/eval_api/__init__.py similarity index 100% rename from modellink/tasks/evaluation/eval_api/__init__.py rename to mindspeed_llm/tasks/evaluation/eval_api/__init__.py diff --git a/modellink/tasks/evaluation/eval_api/chat.py b/mindspeed_llm/tasks/evaluation/eval_api/chat.py similarity index 100% rename from modellink/tasks/evaluation/eval_api/chat.py rename to mindspeed_llm/tasks/evaluation/eval_api/chat.py diff --git a/modellink/tasks/evaluation/eval_api/dataset_eval.py b/mindspeed_llm/tasks/evaluation/eval_api/dataset_eval.py similarity index 100% rename from modellink/tasks/evaluation/eval_api/dataset_eval.py rename to mindspeed_llm/tasks/evaluation/eval_api/dataset_eval.py diff --git a/modellink/tasks/evaluation/eval_impl/__init__.py b/mindspeed_llm/tasks/evaluation/eval_impl/__init__.py similarity index 100% rename from modellink/tasks/evaluation/eval_impl/__init__.py rename to mindspeed_llm/tasks/evaluation/eval_impl/__init__.py diff --git a/modellink/tasks/evaluation/eval_impl/agi_eval.py b/mindspeed_llm/tasks/evaluation/eval_impl/agi_eval.py similarity index 100% rename from modellink/tasks/evaluation/eval_impl/agi_eval.py rename to mindspeed_llm/tasks/evaluation/eval_impl/agi_eval.py diff --git a/modellink/tasks/evaluation/eval_impl/bbh_eval.py b/mindspeed_llm/tasks/evaluation/eval_impl/bbh_eval.py similarity index 99% rename from modellink/tasks/evaluation/eval_impl/bbh_eval.py rename to mindspeed_llm/tasks/evaluation/eval_impl/bbh_eval.py index 1bba99d1ffd94e10618ee9da98f13d87d86416fc..1168957d422c9d2fe5ebbe8a7fcb9c2c9d892afd 100644 --- a/modellink/tasks/evaluation/eval_impl/bbh_eval.py +++ b/mindspeed_llm/tasks/evaluation/eval_impl/bbh_eval.py @@ -23,7 +23,7 @@ import tqdm import pandas as pd from torch import distributed as dist from megatron.training import get_args -from modellink.tasks.preprocess.templates import Role +from mindspeed_llm.tasks.preprocess.templates import Role from .template import BBH_TEMPLATE_DIR, get_eval_template from ..eval_api.dataset_eval import DatasetEval from ..eval_api.chat import Chat diff --git a/modellink/tasks/evaluation/eval_impl/boolq_eval.py b/mindspeed_llm/tasks/evaluation/eval_impl/boolq_eval.py similarity index 100% rename from modellink/tasks/evaluation/eval_impl/boolq_eval.py rename to mindspeed_llm/tasks/evaluation/eval_impl/boolq_eval.py diff --git a/modellink/tasks/evaluation/eval_impl/ceval_exam.py b/mindspeed_llm/tasks/evaluation/eval_impl/ceval_exam.py similarity index 100% rename from modellink/tasks/evaluation/eval_impl/ceval_exam.py rename to mindspeed_llm/tasks/evaluation/eval_impl/ceval_exam.py diff --git a/modellink/tasks/evaluation/eval_impl/fewshot_template/AGI_fewshot.json b/mindspeed_llm/tasks/evaluation/eval_impl/fewshot_template/AGI_fewshot.json similarity index 100% rename from modellink/tasks/evaluation/eval_impl/fewshot_template/AGI_fewshot.json rename to mindspeed_llm/tasks/evaluation/eval_impl/fewshot_template/AGI_fewshot.json diff --git a/modellink/tasks/evaluation/eval_impl/fewshot_template/bbh_template.json b/mindspeed_llm/tasks/evaluation/eval_impl/fewshot_template/bbh_template.json similarity index 100% rename from modellink/tasks/evaluation/eval_impl/fewshot_template/bbh_template.json rename to 
mindspeed_llm/tasks/evaluation/eval_impl/fewshot_template/bbh_template.json diff --git a/modellink/tasks/evaluation/eval_impl/fewshot_template/ceval_5shot_template.json b/mindspeed_llm/tasks/evaluation/eval_impl/fewshot_template/ceval_5shot_template.json similarity index 100% rename from modellink/tasks/evaluation/eval_impl/fewshot_template/ceval_5shot_template.json rename to mindspeed_llm/tasks/evaluation/eval_impl/fewshot_template/ceval_5shot_template.json diff --git a/modellink/tasks/evaluation/eval_impl/fewshot_template/gsm8k_3shot_template.json b/mindspeed_llm/tasks/evaluation/eval_impl/fewshot_template/gsm8k_3shot_template.json similarity index 100% rename from modellink/tasks/evaluation/eval_impl/fewshot_template/gsm8k_3shot_template.json rename to mindspeed_llm/tasks/evaluation/eval_impl/fewshot_template/gsm8k_3shot_template.json diff --git a/modellink/tasks/evaluation/eval_impl/fewshot_template/mmlu_5shot_template.json b/mindspeed_llm/tasks/evaluation/eval_impl/fewshot_template/mmlu_5shot_template.json similarity index 100% rename from modellink/tasks/evaluation/eval_impl/fewshot_template/mmlu_5shot_template.json rename to mindspeed_llm/tasks/evaluation/eval_impl/fewshot_template/mmlu_5shot_template.json diff --git a/modellink/tasks/evaluation/eval_impl/gsm8k_eval.py b/mindspeed_llm/tasks/evaluation/eval_impl/gsm8k_eval.py similarity index 100% rename from modellink/tasks/evaluation/eval_impl/gsm8k_eval.py rename to mindspeed_llm/tasks/evaluation/eval_impl/gsm8k_eval.py diff --git a/modellink/tasks/evaluation/eval_impl/human_eval.py b/mindspeed_llm/tasks/evaluation/eval_impl/human_eval.py similarity index 100% rename from modellink/tasks/evaluation/eval_impl/human_eval.py rename to mindspeed_llm/tasks/evaluation/eval_impl/human_eval.py diff --git a/modellink/tasks/evaluation/eval_impl/mmlu_eval.py b/mindspeed_llm/tasks/evaluation/eval_impl/mmlu_eval.py similarity index 100% rename from modellink/tasks/evaluation/eval_impl/mmlu_eval.py rename to mindspeed_llm/tasks/evaluation/eval_impl/mmlu_eval.py diff --git a/modellink/tasks/evaluation/eval_impl/template.py b/mindspeed_llm/tasks/evaluation/eval_impl/template.py similarity index 98% rename from modellink/tasks/evaluation/eval_impl/template.py rename to mindspeed_llm/tasks/evaluation/eval_impl/template.py index 8f3c40a99a4b5759461c78a42afae1e018f36b85..d93242ab08a90a08b6e5643c8b7fc9efd0394c4e 100644 --- a/modellink/tasks/evaluation/eval_impl/template.py +++ b/mindspeed_llm/tasks/evaluation/eval_impl/template.py @@ -16,7 +16,7 @@ import os from pathlib import Path from dataclasses import dataclass -from modellink.tasks.preprocess.templates import Role +from mindspeed_llm.tasks.preprocess.templates import Role cur_file_dir = Path(__file__).absolute().parent diff --git a/modellink/tasks/evaluation/utils.py b/mindspeed_llm/tasks/evaluation/utils.py similarity index 100% rename from modellink/tasks/evaluation/utils.py rename to mindspeed_llm/tasks/evaluation/utils.py diff --git a/modellink/tasks/finetune/__init__.py b/mindspeed_llm/tasks/finetune/__init__.py similarity index 100% rename from modellink/tasks/finetune/__init__.py rename to mindspeed_llm/tasks/finetune/__init__.py diff --git a/modellink/tasks/finetune/lora/__init__.py b/mindspeed_llm/tasks/finetune/lora/__init__.py similarity index 100% rename from modellink/tasks/finetune/lora/__init__.py rename to mindspeed_llm/tasks/finetune/lora/__init__.py diff --git a/modellink/tasks/finetune/lora/cc_lora_forward.py b/mindspeed_llm/tasks/finetune/lora/cc_lora_forward.py similarity 
index 100% rename from modellink/tasks/finetune/lora/cc_lora_forward.py rename to mindspeed_llm/tasks/finetune/lora/cc_lora_forward.py diff --git a/modellink/tasks/finetune/lora/lora_moe.py b/mindspeed_llm/tasks/finetune/lora/lora_moe.py similarity index 100% rename from modellink/tasks/finetune/lora/lora_moe.py rename to mindspeed_llm/tasks/finetune/lora/lora_moe.py diff --git a/modellink/tasks/finetune/lora/utils.py b/mindspeed_llm/tasks/finetune/lora/utils.py similarity index 100% rename from modellink/tasks/finetune/lora/utils.py rename to mindspeed_llm/tasks/finetune/lora/utils.py diff --git a/modellink/tasks/inference/__init__.py b/mindspeed_llm/tasks/inference/__init__.py similarity index 100% rename from modellink/tasks/inference/__init__.py rename to mindspeed_llm/tasks/inference/__init__.py diff --git a/modellink/tasks/inference/infer_base.py b/mindspeed_llm/tasks/inference/infer_base.py similarity index 100% rename from modellink/tasks/inference/infer_base.py rename to mindspeed_llm/tasks/inference/infer_base.py diff --git a/modellink/tasks/inference/module.py b/mindspeed_llm/tasks/inference/module.py similarity index 100% rename from modellink/tasks/inference/module.py rename to mindspeed_llm/tasks/inference/module.py diff --git a/modellink/tasks/megatron_adaptor.py b/mindspeed_llm/tasks/megatron_adaptor.py similarity index 98% rename from modellink/tasks/megatron_adaptor.py rename to mindspeed_llm/tasks/megatron_adaptor.py index 156d37bf21cb65ddcff6958f8c010a4e43969515..1d8db9aaba7fca168473cb33f7dd4dc6670a37a0 100644 --- a/modellink/tasks/megatron_adaptor.py +++ b/mindspeed_llm/tasks/megatron_adaptor.py @@ -62,7 +62,7 @@ class MegatronAdaptation: if cls._args is not None: return cls._args - from modellink.training.arguments import process_args + from mindspeed_llm.training.arguments import process_args parser = argparse.ArgumentParser(description='MindSpeed-LLM Arguments', allow_abbrev=False) _args, _ = process_args(parser).parse_known_args() return _args @@ -342,7 +342,7 @@ class CoreAdaptation(MegatronAdaptationABC): _batched_p2p_ops) # dpo relative, we need to change the recv/send shape when using PP, then deal with it by ourselves. 
- from modellink.tasks.post_train.utils import get_tensor_shapes_decorator + from mindspeed_llm.tasks.post_train.utils import get_tensor_shapes_decorator MegatronAdaptation.register('megatron.core.pipeline_parallel.schedules.get_tensor_shapes', get_tensor_shapes_decorator) @@ -424,11 +424,11 @@ class CoreAdaptation(MegatronAdaptationABC): finalize_wrapper) def patch_utils(self): - from modellink.training.utils import unwrap_model_wrapper + from mindspeed_llm.training.utils import unwrap_model_wrapper MegatronAdaptation.register('megatron.training.checkpointing.unwrap_model', unwrap_model_wrapper) MegatronAdaptation.register('megatron.training.training.unwrap_model', unwrap_model_wrapper) - from modellink.training.utils import generate_adaptive_cp_mask_list_by_user, generate_adaptive_cp_grid_mask_by_user + from mindspeed_llm.training.utils import generate_adaptive_cp_mask_list_by_user, generate_adaptive_cp_grid_mask_by_user MegatronAdaptation.register('mindspeed.core.context_parallel.utils.generate_adaptive_cp_mask_list_by_user', generate_adaptive_cp_mask_list_by_user) MegatronAdaptation.register('mindspeed.core.context_parallel.utils.generate_adaptive_cp_grid_mask_by_user', @@ -439,7 +439,7 @@ class CoreAdaptation(MegatronAdaptationABC): def has_recomputation_or_swap(args): return (args.swap_attention or args.recompute_in_advance) if has_recomputation_or_swap(args): - from modellink.core.tensor_parallel.layers import linear_forward_main_grad_wrapper, linear_backward_main_grad_wrapper + from mindspeed_llm.core.tensor_parallel.layers import linear_forward_main_grad_wrapper, linear_backward_main_grad_wrapper MegatronAdaptation.register('megatron.core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication.forward', linear_forward_main_grad_wrapper) MegatronAdaptation.register('megatron.core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication.backward', @@ -468,7 +468,7 @@ class LegacyAdaptation(MegatronAdaptationABC): def patch_log_handler(self): from megatron.training.log_handler import CustomHandler - from modellink.training.utils import emit + from mindspeed_llm.training.utils import emit CustomHandler.emit = emit def patch_high_availability_feature(self): @@ -619,9 +619,9 @@ class LegacyAdaptation(MegatronAdaptationABC): MegatronAdaptation.register('megatron.inference.text_generation.forward_step._with_pipelining_forward_step', _with_pipelining_forward_step) def patch_miscellaneous(self): - from modellink.training.utils import print_args_wrapper - from modellink.training.arguments import validate_args_decorator - from modellink.training.arguments import core_transformer_config_from_args_wrapper + from mindspeed_llm.training.utils import print_args_wrapper + from mindspeed_llm.training.arguments import validate_args_decorator + from mindspeed_llm.training.arguments import core_transformer_config_from_args_wrapper from ..training.checkpointing import _load_base_checkpoint_wrapper from ..training.tokenizer import build_tokenizer from ..training.arguments import parse_args_decorator diff --git a/modellink/tasks/models/__init__.py b/mindspeed_llm/tasks/models/__init__.py similarity index 100% rename from modellink/tasks/models/__init__.py rename to mindspeed_llm/tasks/models/__init__.py diff --git a/modellink/tasks/models/common/__init__.py b/mindspeed_llm/tasks/models/common/__init__.py similarity index 100% rename from modellink/tasks/models/common/__init__.py rename to mindspeed_llm/tasks/models/common/__init__.py diff --git 
a/modellink/tasks/models/common/alibi.py b/mindspeed_llm/tasks/models/common/alibi.py similarity index 100% rename from modellink/tasks/models/common/alibi.py rename to mindspeed_llm/tasks/models/common/alibi.py diff --git a/modellink/tasks/models/common/pai_megatron.py b/mindspeed_llm/tasks/models/common/pai_megatron.py similarity index 100% rename from modellink/tasks/models/common/pai_megatron.py rename to mindspeed_llm/tasks/models/common/pai_megatron.py diff --git a/modellink/tasks/models/mask_generator.py b/mindspeed_llm/tasks/models/mask_generator.py similarity index 100% rename from modellink/tasks/models/mask_generator.py rename to mindspeed_llm/tasks/models/mask_generator.py diff --git a/modellink/tasks/models/spec/__init__.py b/mindspeed_llm/tasks/models/spec/__init__.py similarity index 100% rename from modellink/tasks/models/spec/__init__.py rename to mindspeed_llm/tasks/models/spec/__init__.py diff --git a/modellink/tasks/models/spec/deepseek_spec.py b/mindspeed_llm/tasks/models/spec/deepseek_spec.py similarity index 87% rename from modellink/tasks/models/spec/deepseek_spec.py rename to mindspeed_llm/tasks/models/spec/deepseek_spec.py index 76aebe465ffa6f1c7fbaee21ad143a7ec3c9b887..f5304311c2816908a2cee872085a219fe969d610 100644 --- a/modellink/tasks/models/spec/deepseek_spec.py +++ b/mindspeed_llm/tasks/models/spec/deepseek_spec.py @@ -4,9 +4,9 @@ from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinea from megatron.core.transformer import ModuleSpec, TransformerLayer, TransformerLayerSubmodules from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityOp -from modellink.core.transformer.multi_head_latent_attention import MLASelfAttentionSubmodules, MultiHeadLatentAttention -from modellink.tasks.models.transformer.mla_dot_product_attention import MlaDotProductAttention -from modellink.core import PTNorm +from mindspeed_llm.core.transformer.multi_head_latent_attention import MLASelfAttentionSubmodules, MultiHeadLatentAttention +from mindspeed_llm.tasks.models.transformer.mla_dot_product_attention import MlaDotProductAttention +from mindspeed_llm.core import PTNorm """ MultiHeadLatent Layer Specification, which is mainly for Deepseek. diff --git a/modellink/tasks/models/spec/minicpm_spec.py b/mindspeed_llm/tasks/models/spec/minicpm_spec.py similarity index 87% rename from modellink/tasks/models/spec/minicpm_spec.py rename to mindspeed_llm/tasks/models/spec/minicpm_spec.py index 109752e63b1ab35fe9803eb3172881d781e5079f..91e81444f1ae2db8a51f984e44b9f160bd699681 100644 --- a/modellink/tasks/models/spec/minicpm_spec.py +++ b/mindspeed_llm/tasks/models/spec/minicpm_spec.py @@ -5,9 +5,9 @@ from megatron.core.transformer import ModuleSpec, TransformerLayer, TransformerL from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityOp -from modellink.core.transformer.multi_head_latent_attention import MLASelfAttentionSubmodules, MultiHeadLatentAttention -from modellink.tasks.models.transformer.mla_dot_product_attention import MlaDotProductAttention -from modellink.core import PTNorm +from mindspeed_llm.core.transformer.multi_head_latent_attention import MLASelfAttentionSubmodules, MultiHeadLatentAttention +from mindspeed_llm.tasks.models.transformer.mla_dot_product_attention import MlaDotProductAttention +from mindspeed_llm.core import PTNorm """ Layer Specification for MiniCPM. 
diff --git a/modellink/tasks/models/transformer/__init__.py b/mindspeed_llm/tasks/models/transformer/__init__.py similarity index 100% rename from modellink/tasks/models/transformer/__init__.py rename to mindspeed_llm/tasks/models/transformer/__init__.py diff --git a/modellink/tasks/models/transformer/fast_mlp.py b/mindspeed_llm/tasks/models/transformer/fast_mlp.py similarity index 98% rename from modellink/tasks/models/transformer/fast_mlp.py rename to mindspeed_llm/tasks/models/transformer/fast_mlp.py index c50c8d417021e51dfa4f7bfb48ba93fc46eda60b..41b41f677266258c4883e94079be6dd92e69a4a9 100644 --- a/modellink/tasks/models/transformer/fast_mlp.py +++ b/mindspeed_llm/tasks/models/transformer/fast_mlp.py @@ -4,7 +4,7 @@ try: import fused_weight_gradient_mlp_cuda except ImportError: fused_weight_gradient_mlp_cuda = None -from modellink.tasks.finetune.lora.cc_lora_forward import get_tensor_model_parallel_group, \ +from mindspeed_llm.tasks.finetune.lora.cc_lora_forward import get_tensor_model_parallel_group, \ _gather_along_first_dim_async, _reduce_scatter_along_first_dim_async, get_tensor_model_parallel_world_size diff --git a/modellink/tasks/models/transformer/mla_dot_product_attention.py b/mindspeed_llm/tasks/models/transformer/mla_dot_product_attention.py similarity index 94% rename from modellink/tasks/models/transformer/mla_dot_product_attention.py rename to mindspeed_llm/tasks/models/transformer/mla_dot_product_attention.py index 1a2795561e889ea5e791aa86743e27af86c2b6ed..23396c5eb9dddd0870a2b6735587476e29615e77 100644 --- a/modellink/tasks/models/transformer/mla_dot_product_attention.py +++ b/mindspeed_llm/tasks/models/transformer/mla_dot_product_attention.py @@ -3,7 +3,7 @@ from megatron.training import get_args from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.dot_product_attention import DotProductAttention -from modellink.core.models.common.embeddings.rotary_pos_embedding import yarn_get_mscale +from mindspeed_llm.core.models.common.embeddings.rotary_pos_embedding import yarn_get_mscale class MlaDotProductAttention(DotProductAttention): diff --git a/modellink/tasks/post_train/__init__.py b/mindspeed_llm/tasks/post_train/__init__.py similarity index 100% rename from modellink/tasks/post_train/__init__.py rename to mindspeed_llm/tasks/post_train/__init__.py diff --git a/modellink/tasks/post_train/base/__init__.py b/mindspeed_llm/tasks/post_train/base/__init__.py similarity index 100% rename from modellink/tasks/post_train/base/__init__.py rename to mindspeed_llm/tasks/post_train/base/__init__.py diff --git a/modellink/tasks/post_train/base/base_trainer.py b/mindspeed_llm/tasks/post_train/base/base_trainer.py similarity index 97% rename from modellink/tasks/post_train/base/base_trainer.py rename to mindspeed_llm/tasks/post_train/base/base_trainer.py index 36efd1f5a3eb1d1b8487dc34c6b017ae04a40f36..ed73961dacdfb6b9b98952d7aa599922b8b91ba8 100644 --- a/modellink/tasks/post_train/base/base_trainer.py +++ b/mindspeed_llm/tasks/post_train/base/base_trainer.py @@ -20,10 +20,10 @@ from megatron.core.models.gpt.gpt_layer_specs import ( ) from megatron.core.models.gpt import GPTModel from megatron.training.checkpointing import save_checkpoint -from modellink.training import build_train_args -from modellink.training import train -from modellink.training.initialize import set_jit_fusion_options -from modellink.tasks.post_train.utils import train_valid_test_datasets_provider +from 
mindspeed_llm.training import build_train_args +from mindspeed_llm.training import train +from mindspeed_llm.training.initialize import set_jit_fusion_options +from mindspeed_llm.tasks.post_train.utils import train_valid_test_datasets_provider _TRAIN_START_TIME = time.time() diff --git a/modellink/tasks/post_train/dpo/__init__.py b/mindspeed_llm/tasks/post_train/dpo/__init__.py similarity index 100% rename from modellink/tasks/post_train/dpo/__init__.py rename to mindspeed_llm/tasks/post_train/dpo/__init__.py diff --git a/modellink/tasks/post_train/dpo/dpo_model.py b/mindspeed_llm/tasks/post_train/dpo/dpo_model.py similarity index 98% rename from modellink/tasks/post_train/dpo/dpo_model.py rename to mindspeed_llm/tasks/post_train/dpo/dpo_model.py index 9c6dc5474ae1523b352160416700c26a5435aef4..830c6846b656af45e9411da9b2f2a18500924a8e 100644 --- a/modellink/tasks/post_train/dpo/dpo_model.py +++ b/mindspeed_llm/tasks/post_train/dpo/dpo_model.py @@ -4,7 +4,7 @@ import torch from megatron.training import get_args from megatron.core import mpu from megatron.core.pipeline_parallel.schedules import get_attr_wrapped_model -from modellink.tasks.post_train.utils import get_attr_from_wrapped_model +from mindspeed_llm.tasks.post_train.utils import get_attr_from_wrapped_model class HyperModelABC(abc.ABC): diff --git a/modellink/tasks/post_train/dpo/dpo_trainer.py b/mindspeed_llm/tasks/post_train/dpo/dpo_trainer.py similarity index 98% rename from modellink/tasks/post_train/dpo/dpo_trainer.py rename to mindspeed_llm/tasks/post_train/dpo/dpo_trainer.py index 1301c9f88d560ea46273fb306be0eb49d3d7a690..dfaf094d8c16fd2a93cf562cb794b6b5639084d1 100644 --- a/modellink/tasks/post_train/dpo/dpo_trainer.py +++ b/mindspeed_llm/tasks/post_train/dpo/dpo_trainer.py @@ -10,10 +10,10 @@ from megatron.core.enums import ModelType from megatron.training.checkpointing import load_checkpoint from megatron.training.utils import average_losses_across_data_parallel_group from megatron.training.global_vars import set_args -from modellink.tasks.post_train.base import BaseTrainer -from modellink.tasks.post_train.dpo.dpo_model import DPOModel -from modellink.training.utils import get_tune_attention_mask, get_finetune_data_on_this_tp_rank -from modellink.training.utils import get_batch_on_this_cp_rank, generate_actual_seq_len +from mindspeed_llm.tasks.post_train.base import BaseTrainer +from mindspeed_llm.tasks.post_train.dpo.dpo_model import DPOModel +from mindspeed_llm.training.utils import get_tune_attention_mask, get_finetune_data_on_this_tp_rank +from mindspeed_llm.training.utils import get_batch_on_this_cp_rank, generate_actual_seq_len class DPOTrainer(BaseTrainer): diff --git a/modellink/tasks/post_train/dpo/simpo_trainer.py b/mindspeed_llm/tasks/post_train/dpo/simpo_trainer.py similarity index 98% rename from modellink/tasks/post_train/dpo/simpo_trainer.py rename to mindspeed_llm/tasks/post_train/dpo/simpo_trainer.py index ee64704531a54508678055c8b87bbb9a7a884b82..2068c13cbc389165fc7832156524692a95e15ec5 100644 --- a/modellink/tasks/post_train/dpo/simpo_trainer.py +++ b/mindspeed_llm/tasks/post_train/dpo/simpo_trainer.py @@ -7,8 +7,8 @@ import torch.nn.functional as F from megatron.training import get_args from megatron.core import mpu from megatron.training.utils import average_losses_across_data_parallel_group -from modellink.tasks.post_train.base import BaseTrainer -from modellink.tasks.post_train.dpo import DPOTrainer +from mindspeed_llm.tasks.post_train.base import BaseTrainer +from mindspeed_llm.tasks.post_train.dpo 
 class SimPOTrainer(BaseTrainer):
diff --git a/modellink/tasks/post_train/launcher.py b/mindspeed_llm/tasks/post_train/launcher.py
similarity index 85%
rename from modellink/tasks/post_train/launcher.py
rename to mindspeed_llm/tasks/post_train/launcher.py
index 13f38366d1a9d1e3f6ba17b8cad5f0d0eddefd46..050ce86b9d00d6878a91f7fa1563ec2c3b104a6c 100644
--- a/modellink/tasks/post_train/launcher.py
+++ b/mindspeed_llm/tasks/post_train/launcher.py
@@ -1,10 +1,10 @@
 # Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved.
 from megatron.training import get_args
 from megatron.training.initialize import initialize_megatron
-from modellink.tasks.post_train.sft import SFTTrainer
-from modellink.tasks.post_train.dpo import DPOTrainer
-from modellink.tasks.post_train.rm import RMTrainer
-from modellink.tasks.post_train.dpo import SimPOTrainer
+from mindspeed_llm.tasks.post_train.sft import SFTTrainer
+from mindspeed_llm.tasks.post_train.dpo import DPOTrainer
+from mindspeed_llm.tasks.post_train.rm import RMTrainer
+from mindspeed_llm.tasks.post_train.dpo import SimPOTrainer


 def get_trainer(stage):
diff --git a/modellink/tasks/post_train/rm/__init__.py b/mindspeed_llm/tasks/post_train/rm/__init__.py
similarity index 100%
rename from modellink/tasks/post_train/rm/__init__.py
rename to mindspeed_llm/tasks/post_train/rm/__init__.py
diff --git a/modellink/tasks/post_train/rm/rm_model.py b/mindspeed_llm/tasks/post_train/rm/rm_model.py
similarity index 100%
rename from modellink/tasks/post_train/rm/rm_model.py
rename to mindspeed_llm/tasks/post_train/rm/rm_model.py
diff --git a/modellink/tasks/post_train/rm/rm_trainer.py b/mindspeed_llm/tasks/post_train/rm/rm_trainer.py
similarity index 96%
rename from modellink/tasks/post_train/rm/rm_trainer.py
rename to mindspeed_llm/tasks/post_train/rm/rm_trainer.py
index f1f06739bbbf7b07259d621d5492807b2a620f40..7221f289f3536dfe022b7e06c0bd3c4198c78490 100644
--- a/modellink/tasks/post_train/rm/rm_trainer.py
+++ b/mindspeed_llm/tasks/post_train/rm/rm_trainer.py
@@ -13,9 +13,9 @@ from megatron.training.yaml_arguments import core_transformer_config_from_yaml
 from megatron.core.transformer.spec_utils import import_module
 from megatron.training.utils import average_losses_across_data_parallel_group
 from megatron.core.models.gpt import GPTModel
-from modellink.tasks.post_train.base import BaseTrainer
-from modellink.training.utils import get_tune_attention_mask, get_finetune_data_on_this_tp_rank
-from modellink.tasks.post_train.rm.rm_model import GPTRewardModel
+from mindspeed_llm.tasks.post_train.base import BaseTrainer
+from mindspeed_llm.training.utils import get_tune_attention_mask, get_finetune_data_on_this_tp_rank
+from mindspeed_llm.tasks.post_train.rm.rm_model import GPTRewardModel


 class RMTrainer(BaseTrainer):
diff --git a/modellink/tasks/post_train/sft/__init__.py b/mindspeed_llm/tasks/post_train/sft/__init__.py
similarity index 100%
rename from modellink/tasks/post_train/sft/__init__.py
rename to mindspeed_llm/tasks/post_train/sft/__init__.py
diff --git a/modellink/tasks/post_train/sft/sft_trainer.py b/mindspeed_llm/tasks/post_train/sft/sft_trainer.py
similarity index 96%
rename from modellink/tasks/post_train/sft/sft_trainer.py
rename to mindspeed_llm/tasks/post_train/sft/sft_trainer.py
index 43128ed42cd2caeab71588d5908d7e0d56b832e2..0afca30c9dd214ef0ebc45f01cc0834bb1645e2a 100644
--- a/modellink/tasks/post_train/sft/sft_trainer.py
+++ b/mindspeed_llm/tasks/post_train/sft/sft_trainer.py
@@ -10,8 +10,8 @@ from megatron.training.utils import (
     average_losses_across_data_parallel_group
 )
 from megatron.training import get_timers
-from modellink.training.utils import get_tune_attention_mask, get_finetune_data_on_this_tp_rank, generate_actual_seq_len
-from modellink.tasks.post_train.base import BaseTrainer
+from mindspeed_llm.training.utils import get_tune_attention_mask, get_finetune_data_on_this_tp_rank, generate_actual_seq_len
+from mindspeed_llm.tasks.post_train.base import BaseTrainer


 class SFTTrainer(BaseTrainer):
diff --git a/modellink/tasks/post_train/utils.py b/mindspeed_llm/tasks/post_train/utils.py
similarity index 96%
rename from modellink/tasks/post_train/utils.py
rename to mindspeed_llm/tasks/post_train/utils.py
index 5a56207e1137c61e9ce4d5dcae20e925c9564ba1..e1942d9f1b59ab1c4484d71bfe2c209736e267cf 100644
--- a/modellink/tasks/post_train/utils.py
+++ b/mindspeed_llm/tasks/post_train/utils.py
@@ -8,7 +8,7 @@ from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegat
 from megatron.core.datasets.gpt_dataset import GPTDatasetConfig
 from megatron.core.datasets.gpt_dataset import MockGPTDataset, GPTDataset
 from megatron.core.datasets.utils import get_blend_from_list
-from modellink.tasks.preprocess.decoder_packed_mtf_dataset import build_train_valid_test_datasets as build_instruction_dataset
+from mindspeed_llm.tasks.preprocess.decoder_packed_mtf_dataset import build_train_valid_test_datasets as build_instruction_dataset


 def is_dataset_built_on_rank():
diff --git a/modellink/tasks/preprocess/__init__.py b/mindspeed_llm/tasks/preprocess/__init__.py
similarity index 100%
rename from modellink/tasks/preprocess/__init__.py
rename to mindspeed_llm/tasks/preprocess/__init__.py
diff --git a/modellink/tasks/preprocess/data_handler.py b/mindspeed_llm/tasks/preprocess/data_handler.py
similarity index 99%
rename from modellink/tasks/preprocess/data_handler.py
rename to mindspeed_llm/tasks/preprocess/data_handler.py
index cabbd5f5589525d2f6fb307e49970fceb62ae775..8a988fbb9c603d88d7fc88489f36a12fd124f00c 100644
--- a/modellink/tasks/preprocess/data_handler.py
+++ b/mindspeed_llm/tasks/preprocess/data_handler.py
@@ -26,7 +26,7 @@ from datasets import load_dataset
 from megatron.core.datasets import indexed_dataset

-from modellink.tasks.preprocess.templates import Prompter, AlpacaTemplate, get_model_template
+from mindspeed_llm.tasks.preprocess.templates import Prompter, AlpacaTemplate, get_model_template
 from .utils import get_dataset_list, get_handler_dataset_attr, load_single_dataset, merge_dataset, align_dataset
 from .utils import greedy_knapsack
diff --git a/modellink/tasks/preprocess/decoder_packed_mtf_dataset.py b/mindspeed_llm/tasks/preprocess/decoder_packed_mtf_dataset.py
similarity index 98%
rename from modellink/tasks/preprocess/decoder_packed_mtf_dataset.py
rename to mindspeed_llm/tasks/preprocess/decoder_packed_mtf_dataset.py
index 4b978c803fe0cec5c333741ffd4ce3c9e214fa64..c4339410a6f90f09db8551e5920579f48b31aa32 100644
--- a/modellink/tasks/preprocess/decoder_packed_mtf_dataset.py
+++ b/mindspeed_llm/tasks/preprocess/decoder_packed_mtf_dataset.py
@@ -25,9 +25,9 @@ import torch
 from megatron.training import print_rank_0, get_args
 from megatron.core import parallel_state
 from megatron.legacy.data.dataset_utils import get_train_valid_test_split_
-from modellink.training.tokenizer import build_tokenizer
-from modellink.tasks.error_utils import check_equal
-from modellink.tasks.preprocess.mtf_dataset import MTFDataset, get_packed_indexed_dataset
+from mindspeed_llm.training.tokenizer import build_tokenizer
+from mindspeed_llm.tasks.error_utils import check_equal
+from mindspeed_llm.tasks.preprocess.mtf_dataset import MTFDataset, get_packed_indexed_dataset


 logger = logging.getLogger(__name__)
diff --git a/modellink/tasks/preprocess/formatter.py b/mindspeed_llm/tasks/preprocess/formatter.py
similarity index 100%
rename from modellink/tasks/preprocess/formatter.py
rename to mindspeed_llm/tasks/preprocess/formatter.py
diff --git a/modellink/tasks/preprocess/mtf_dataset.py b/mindspeed_llm/tasks/preprocess/mtf_dataset.py
similarity index 97%
rename from modellink/tasks/preprocess/mtf_dataset.py
rename to mindspeed_llm/tasks/preprocess/mtf_dataset.py
index da3ed95a4c209507b2981d5e3ae4b35ee3874a6f..73df15e31f57ae51fa7d881b6ac29fa9faf6beb8 100644
--- a/modellink/tasks/preprocess/mtf_dataset.py
+++ b/mindspeed_llm/tasks/preprocess/mtf_dataset.py
@@ -22,7 +22,7 @@ import numpy as np
 import torch

 from megatron.core.datasets.indexed_dataset import IndexedDataset
-from modellink.tasks.error_utils import ensure_valid
+from mindspeed_llm.tasks.error_utils import ensure_valid


 class MTFDataset(torch.utils.data.Dataset):
diff --git a/modellink/tasks/preprocess/parser.py b/mindspeed_llm/tasks/preprocess/parser.py
similarity index 100%
rename from modellink/tasks/preprocess/parser.py
rename to mindspeed_llm/tasks/preprocess/parser.py
diff --git a/modellink/tasks/preprocess/templates.py b/mindspeed_llm/tasks/preprocess/templates.py
similarity index 100%
rename from modellink/tasks/preprocess/templates.py
rename to mindspeed_llm/tasks/preprocess/templates.py
diff --git a/modellink/tasks/preprocess/utils.py b/mindspeed_llm/tasks/preprocess/utils.py
similarity index 99%
rename from modellink/tasks/preprocess/utils.py
rename to mindspeed_llm/tasks/preprocess/utils.py
index f8a20208d7c6c1936fc7087c0402143fe4f47ed9..dc19b39d24df19b0cf1ddc60cd85c1b8336df8c0 100644
--- a/modellink/tasks/preprocess/utils.py
+++ b/mindspeed_llm/tasks/preprocess/utils.py
@@ -21,8 +21,8 @@ from functools import partial
 from typing import Any, Dict, List, Sequence

 from datasets import load_dataset, concatenate_datasets, interleave_datasets
-from modellink.tasks.preprocess.templates import Role
-from modellink.tasks.preprocess.parser import InstructionDatasetAttr
+from mindspeed_llm.tasks.preprocess.templates import Role
+from mindspeed_llm.tasks.preprocess.parser import InstructionDatasetAttr


 logging.basicConfig(level=logging.INFO)
diff --git a/modellink/tasks/trainer/launcher.py b/mindspeed_llm/tasks/trainer/launcher.py
similarity index 77%
rename from modellink/tasks/trainer/launcher.py
rename to mindspeed_llm/tasks/trainer/launcher.py
index 8034d2215b7614fcbb0ec29eeb806c64d16f55c4..5500480b35f9cdfd89b0d6bde1f0c892e56bcbdb 100644
--- a/modellink/tasks/trainer/launcher.py
+++ b/mindspeed_llm/tasks/trainer/launcher.py
@@ -1,10 +1,10 @@
 # Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved.
 from megatron.training import get_args
 from megatron.training.initialize import initialize_megatron
-from modellink.tasks.trainer.base import BaseTrainer
-from modellink.tasks.rl.dpo import DPOTrainer
-from modellink.tasks.rl.rm import RMTrainer
-from modellink.tasks.rl.simpo import SimPOTrainer
+from mindspeed_llm.tasks.trainer.base import BaseTrainer
+from mindspeed_llm.tasks.rl.dpo import DPOTrainer
+from mindspeed_llm.tasks.rl.rm import RMTrainer
+from mindspeed_llm.tasks.rl.simpo import SimPOTrainer


 class AutoTrainer:
diff --git a/modellink/training/__init__.py b/mindspeed_llm/training/__init__.py
similarity index 100%
rename from modellink/training/__init__.py
rename to mindspeed_llm/training/__init__.py
diff --git a/modellink/training/arguments.py b/mindspeed_llm/training/arguments.py
similarity index 99%
rename from modellink/training/arguments.py
rename to mindspeed_llm/training/arguments.py
index 9a9fcf24b7d6336bad0b46e7c16e2d4af80604bc..874a6f0014d2dfb91e952ca859a6c7e5ae0b305d 100644
--- a/modellink/training/arguments.py
+++ b/mindspeed_llm/training/arguments.py
@@ -17,7 +17,7 @@ import os
 import argparse
 from pathlib import Path
 from functools import wraps
-from modellink.training.utils import print_rank0_by_args
+from mindspeed_llm.training.utils import print_rank0_by_args

 cur_file_dir = Path(__file__).absolute().parent
@@ -1093,7 +1093,7 @@ def validate_args_decorator(megatron_validate_args):
         _validate_noop_layer(args)
         _add_dummy_args(args)

-        from modellink.training.utils import print_args
+        from mindspeed_llm.training.utils import print_args
         print_args('MindSpeed-LLM Arguments', args)

         return args
diff --git a/modellink/training/checkpointing.py b/mindspeed_llm/training/checkpointing.py
similarity index 96%
rename from modellink/training/checkpointing.py
rename to mindspeed_llm/training/checkpointing.py
index 90ee6cb9a603b193ca636e1ac708e58183cc20e7..016f6e162cc8ab2954c0f5a80c51ad384cba193e 100644
--- a/modellink/training/checkpointing.py
+++ b/mindspeed_llm/training/checkpointing.py
@@ -18,8 +18,8 @@ from functools import wraps
 from megatron.training import get_args
 from megatron.training.utils import print_rank_0
 from megatron.training.checkpointing import _load_base_checkpoint
-from modellink.tasks.finetune.lora.utils import is_enable_lora, merge_dicts, modify_keys_with_dict
-from modellink.tasks.post_train.utils import load_checkpoint_loosely
+from mindspeed_llm.tasks.finetune.lora.utils import is_enable_lora, merge_dicts, modify_keys_with_dict
+from mindspeed_llm.tasks.post_train.utils import load_checkpoint_loosely


 def _load_base_checkpoint_wrapper(fn):
diff --git a/modellink/training/initialize.py b/mindspeed_llm/training/initialize.py
similarity index 95%
rename from modellink/training/initialize.py
rename to mindspeed_llm/training/initialize.py
index 5327945a6b048af66cae38c632a5fc30ab4b02c2..51100ccb4ff65f48842a71654dbd369c9e7a40f1 100644
--- a/modellink/training/initialize.py
+++ b/mindspeed_llm/training/initialize.py
@@ -29,10 +29,10 @@ from megatron.training.initialize import (
     _warmup_jit_function
 )

-from modellink.training.arguments import parse_args_decorator
-from modellink.core.tensor_parallel.ascend_turbo.initialize import initialize_cfg_from_args
-from modellink.tasks.error_utils import ensure_valid
-from modellink.training.utils import seed_all
+from mindspeed_llm.training.arguments import parse_args_decorator
+from mindspeed_llm.core.tensor_parallel.ascend_turbo.initialize import initialize_cfg_from_args
+from mindspeed_llm.tasks.error_utils import ensure_valid
+from mindspeed_llm.training.utils import seed_all


 def _compile_dependencies():
diff --git a/modellink/training/tokenizer/__init__.py b/mindspeed_llm/training/tokenizer/__init__.py
similarity index 100%
rename from modellink/training/tokenizer/__init__.py
rename to mindspeed_llm/training/tokenizer/__init__.py
diff --git a/modellink/training/tokenizer/tokenizer.py b/mindspeed_llm/training/tokenizer/tokenizer.py
similarity index 98%
rename from modellink/training/tokenizer/tokenizer.py
rename to mindspeed_llm/training/tokenizer/tokenizer.py
index a6a367a7021ca176f1f1aaeb27dab504fd5feabb..d360555576fa3266cf4fcaea1023f504eae7063a 100644
--- a/modellink/training/tokenizer/tokenizer.py
+++ b/mindspeed_llm/training/tokenizer/tokenizer.py
@@ -20,7 +20,7 @@ from transformers import AutoTokenizer, PreTrainedTokenizerBase
 from megatron.training.tokenizer import build_tokenizer as megatron_build_tokenizer
 from megatron.training.tokenizer.tokenizer import _vocab_size_with_padding
 from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
-from modellink.tasks.preprocess.templates import fix_model_tokenizer
+from mindspeed_llm.tasks.preprocess.templates import fix_model_tokenizer


 def build_tokenizer(args):
diff --git a/modellink/training/training.py b/mindspeed_llm/training/training.py
similarity index 98%
rename from modellink/training/training.py
rename to mindspeed_llm/training/training.py
index e71e5ecaf59d9bb055570537d9167233db015273..7ecfdb21fb798cd917678f0340bf4e9b10e2d6c8 100644
--- a/modellink/training/training.py
+++ b/mindspeed_llm/training/training.py
@@ -54,8 +54,8 @@ from megatron.training.utils import (
 )
 from megatron.core.distributed import DistributedDataParallel as DDP
 from megatron.core.distributed import finalize_model_grads
-from modellink.training.initialize import set_jit_fusion_options
-from modellink.tasks.finetune.lora.utils import is_enable_lora
+from mindspeed_llm.training.initialize import set_jit_fusion_options
+from mindspeed_llm.tasks.finetune.lora.utils import is_enable_lora


 def model_provider_func_wrapper(model_provider_func):
@@ -64,7 +64,7 @@ def model_provider_func_wrapper(model_provider_func):
         model = model_provider_func(*args, **kwargs)
         args = get_args()
         if args.use_fused_mlp:
-            from modellink.tasks.models.transformer.fast_mlp import ParallelSwigluMLPForward
+            from mindspeed_llm.tasks.models.transformer.fast_mlp import ParallelSwigluMLPForward
             from megatron.legacy.model.transformer import ParallelMLP
             from megatron.core.transformer.mlp import MLP
             ParallelMLP.forward = ParallelSwigluMLPForward
@@ -79,12 +79,12 @@ def model_provider_func_wrapper(model_provider_func):
             setattr(peft.tuners.lora.LoraLayer, 'unmerge', peft.tuners.lora.Linear.unmerge)
             setattr(peft.tuners.lora.LoraLayer, 'get_delta_weight', peft.tuners.lora.Linear.get_delta_weight)
             from peft.tuners.lora import tp_layer
-            from modellink.tasks.finetune.lora.lora_moe import LoraParallelLinearMoE
+            from mindspeed_llm.tasks.finetune.lora.lora_moe import LoraParallelLinearMoE
             tp_layer.LoraParallelLinear = LoraParallelLinearMoE

             if hasattr(args, 'lora_fusion') and args.lora_fusion:
                 from peft.tuners.lora.tp_layer import LoraParallelLinear
-                from modellink.tasks.finetune.lora.cc_lora_forward import CCLoraParallelLinearForward
+                from mindspeed_llm.tasks.finetune.lora.cc_lora_forward import CCLoraParallelLinearForward
                 LoraParallelLinear.forward = CCLoraParallelLinearForward

         config = core_transformer_config_from_args(args)
diff --git a/modellink/training/utils.py b/mindspeed_llm/training/utils.py
similarity index 100%
rename from modellink/training/utils.py
rename to mindspeed_llm/training/utils.py
diff --git a/posttrain_gpt.py b/posttrain_gpt.py
index d1234f9daf93a3b3135961ae269a39d5c1b904ff..7264d9f0a2705032dd32f7f24f65da32563199a6 100644
--- a/posttrain_gpt.py
+++ b/posttrain_gpt.py
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved.
-from modellink import megatron_adaptor
-from modellink.tasks.post_train.launcher import AutoTrainer
+from mindspeed_llm import megatron_adaptor
+from mindspeed_llm.tasks.post_train.launcher import AutoTrainer


 def launch():
diff --git a/preprocess_data.py b/preprocess_data.py
index 553da1c1302bb2c71c171faec83f90d9ae412859..6e9ba256c367214bbab7045ddea8a0b435a42cbf 100644
--- a/preprocess_data.py
+++ b/preprocess_data.py
@@ -33,8 +33,8 @@ except ImportError:
     sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))


-from modellink.training.tokenizer import build_tokenizer
-from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
+from mindspeed_llm.training.tokenizer import build_tokenizer
+from mindspeed_llm.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
 from megatron.core.datasets.indexed_dataset import (
     IndexedDatasetBuilder,
     IndexedDataset,
diff --git a/pretrain_gpt.py b/pretrain_gpt.py
index e9c13fb6189bda967cf9ff937afd1c3ff5b9e9b9..e77f7821fd6bff8aadfc1fbe418259b5e175c891 100644
--- a/pretrain_gpt.py
+++ b/pretrain_gpt.py
@@ -6,7 +6,7 @@ from functools import partial
 from typing import Union

 import torch
-from modellink import megatron_adaptor
+from mindspeed_llm import megatron_adaptor
 from megatron.training import get_args
 from megatron.training import print_rank_0
 from megatron.training import get_timers
@@ -19,7 +19,7 @@ from megatron.core.datasets.gpt_dataset import MockGPTDataset, GPTDataset
 from megatron.core.datasets.utils import get_blend_from_list
 import megatron.legacy.model
 from megatron.core.models.gpt import GPTModel
-from modellink.training import pretrain
+from mindspeed_llm.training import pretrain
 from megatron.core.transformer.spec_utils import import_module
 from megatron.training.utils import (
     get_batch_on_this_cp_rank,
@@ -32,7 +32,7 @@ from megatron.core.models.gpt.gpt_layer_specs import (
     get_gpt_layer_local_spec,
     get_gpt_layer_with_transformer_engine_spec,
 )
-from modellink.training.utils import generate_actual_seq_len
+from mindspeed_llm.training.utils import generate_actual_seq_len


 def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]:
diff --git a/setup.py b/setup.py
index de345fdfa02181bc2c95f68e1e6dd0a223749070..8f722de0861698f86fcf7c0bc6b77625979d8dd4 100644
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@ __version__ = '0.0.1'
 __author__ = 'Ascend'
 __long_description__ = 'MindSpeed-LLM for LLMs of Ascend'
 __keywords__ = 'Ascend, langauge, deep learning, NLP'
-__package_name__ = 'modellink'
+__package_name__ = 'mindspeed_llm'
 __contact_names__ = 'Ascend'
@@ -56,8 +56,8 @@ try:
     ASCEND_TOOLKIT_HOME = os.environ.get("ASCEND_TOOLKIT_HOME")

     op_files = []
-    op_files += glob.glob("modellink/te/ops/csrc/*.cpp")
-    op_files += glob.glob("modellink/te/ops/csrc/cann/*.cpp")
+    op_files += glob.glob("mindspeed_llm/te/ops/csrc/*.cpp")
+    op_files += glob.glob("mindspeed_llm/te/ops/csrc/cann/*.cpp")
     ext_ops = cpp_extension.NpuExtension(
         name="ascendspeed_te_ops",
         sources=op_files,
@@ -78,7 +78,7 @@ except Exception:
     print('Can not find env : ASCEND_TOOLKIT_HOME or ATB_HOME_PATH, ops setup failed')

 setuptools.setup(
-    package_data={'modellink':['modellink/data/Makefile']},
+    package_data={'mindspeed_llm':['mindspeed_llm/data/Makefile']},
     name=__package_name__,
     # Versions should comply with PEP440.
     version=__version__,
diff --git a/tests/pipeline/baichuan2-13B/test_ckpt_hf2mg.py b/tests/pipeline/baichuan2-13B/test_ckpt_hf2mg.py
index c5de70a0276197027438e286a7fab0c40e6d0c02..de02863578de21c96fda6c80777f88e32b2e559e 100644
--- a/tests/pipeline/baichuan2-13B/test_ckpt_hf2mg.py
+++ b/tests/pipeline/baichuan2-13B/test_ckpt_hf2mg.py
@@ -9,7 +9,7 @@ import pytest
 import torch
 import torch.distributed as dist

-import modellink
+import mindspeed_llm
 from convert_ckpt import main
 from tests.test_tools.dist_test import DistributedTest
 from tests.test_tools.utils import (build_args, create_testconfig, run_cmd,
diff --git a/tests/pipeline/deepseek/deepseek2_tp1_pp1_mcore_moe.sh b/tests/pipeline/deepseek/deepseek2_tp1_pp1_mcore_moe.sh
index 091e6b65d07615483f3a4f44125457ec5dd5546e..c763efbd05848918a718521f6b900c1df92f19be 100644
--- a/tests/pipeline/deepseek/deepseek2_tp1_pp1_mcore_moe.sh
+++ b/tests/pipeline/deepseek/deepseek2_tp1_pp1_mcore_moe.sh
@@ -29,7 +29,7 @@ DISTRIBUTED_ARGS="
 "

 MLA_ARGS="
-    --spec modellink.tasks.models.spec.deepseek_spec layer_spec \
+    --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \
     --multi-head-latent-attention \
     --qk-rope-head-dim 64 \
     --qk-nope-head-dim 128 \
diff --git a/tests/st/shell_scripts/deepseek_v2_mcore_tp1_pp1_ep8.sh b/tests/st/shell_scripts/deepseek_v2_mcore_tp1_pp1_ep8.sh
index b4041927b50256dd5a703cfe2c9bf187832a6179..65c15f26cd207e1e3b60e80df390881971ef9ea8 100644
--- a/tests/st/shell_scripts/deepseek_v2_mcore_tp1_pp1_ep8.sh
+++ b/tests/st/shell_scripts/deepseek_v2_mcore_tp1_pp1_ep8.sh
@@ -29,7 +29,7 @@ DISTRIBUTED_ARGS="
 "

 MLA_ARGS="
-    --spec modellink.tasks.models.spec.deepseek_spec layer_spec \
+    --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \
     --multi-head-latent-attention \
     --qk-rope-head-dim 64 \
     --qk-nope-head-dim 128 \
diff --git a/tests/test_tools/test_ci_st.py b/tests/test_tools/test_ci_st.py
index 39d101c4049bbcf5f4b0da8ca15b9c9bc72e8c49..9a58493b2401b0dd76b396d5d8205382855cae04 100644
--- a/tests/test_tools/test_ci_st.py
+++ b/tests/test_tools/test_ci_st.py
@@ -1,5 +1,5 @@
 import pytest
-from modellink import megatron_adaptor
+from mindspeed_llm import megatron_adaptor
 from tests.test_tools.acquire_json import transfer_logs_as_json, read_json

 MEMO_INFO = "memo info"
diff --git a/tests/test_tools/utils.py b/tests/test_tools/utils.py
index 69a1d31b8431ee5cfc62294518e94fb075a39c39..2a0b39036ba5f8f2a2e3cb7174aa58dbcf6c1000 100644
--- a/tests/test_tools/utils.py
+++ b/tests/test_tools/utils.py
@@ -15,7 +15,7 @@ import torch
 import torch_npu
 import megatron.core.parallel_state as mpu
 from mindspeed.core.parallel_state import initialize_model_parallel, initialize_model_parallel_wrapper
-from modellink.core.parallel_state import initialize_model_parallel_decorator
+from mindspeed_llm.core.parallel_state import initialize_model_parallel_decorator

 def judge_expression(expression):
     if not expression:
diff --git a/tests/ut/checkpoint/test_checkpoint.json b/tests/ut/checkpoint/test_checkpoint.json
index bda2cd3e9d63fbf1508c67c1ffa90639df8d30fb..8fcd9d96484714c1648055053494674b17e38fc3 100644
--- a/tests/ut/checkpoint/test_checkpoint.json
+++ b/tests/ut/checkpoint/test_checkpoint.json
@@ -53,7 +53,7 @@
                 "model-type-hf": "deepseek2",
                 "params-dtype": "bf16",
"tokenizer-model":"/data/ci/deepseek2/hf/deepseek2_hf", - "spec":"modellink.tasks.models.spec.deepseek_spec layer_spec" + "spec":"mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec" } } ], @@ -73,7 +73,7 @@ "moe-grouped-gemm": null, "model-type-hf": "deepseek2", "params-dtype": "bf16", - "spec":"modellink.tasks.models.spec.deepseek_spec layer_spec" + "spec":"mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec" } } ], @@ -93,7 +93,7 @@ "model-type-hf": "deepseek2-lite", "params-dtype": "bf16", "tokenizer-model":"/data/ci/deepseek2_lite/hf/deepseek2_lite_hf", - "spec":"modellink.tasks.models.spec.deepseek_spec layer_spec" + "spec":"mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec" } } ], @@ -112,7 +112,7 @@ "use-mcore-models": null, "model-type-hf": "deepseek2-lite", "params-dtype": "bf16", - "spec":"modellink.tasks.models.spec.deepseek_spec layer_spec" + "spec":"mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec" } } ], diff --git a/tests/ut/checkpoint/test_checkpoint.py b/tests/ut/checkpoint/test_checkpoint.py index 57f4b383ff01c4bf5517fbcf8700f5842585557b..81717147af039e1666e4014742bbe1b7e876bd0c 100644 --- a/tests/ut/checkpoint/test_checkpoint.py +++ b/tests/ut/checkpoint/test_checkpoint.py @@ -22,7 +22,7 @@ import logging import re import math import pytest -from modellink import megatron_adaptor +from mindspeed_llm import megatron_adaptor from tests.test_tools.utils import create_testconfig, weight_compare, run_cmd diff --git a/tests/ut/checkpoint/test_convert_ckpt_from_huggingface.py b/tests/ut/checkpoint/test_convert_ckpt_from_huggingface.py index 14dd4ce284d84b3a87c01df5accc2b5f372ad164..0df685fb6455bf3f35d69e46a95b96bc8b570e62 100644 --- a/tests/ut/checkpoint/test_convert_ckpt_from_huggingface.py +++ b/tests/ut/checkpoint/test_convert_ckpt_from_huggingface.py @@ -5,7 +5,7 @@ import subprocess from pathlib import Path import torch -from modellink import megatron_adaptor +from mindspeed_llm import megatron_adaptor from tests.test_tools.utils import judge_expression from tests.test_tools.utils import weight_compare diff --git a/tests/ut/checkpoint/test_convert_ckpt_from_megatron.py b/tests/ut/checkpoint/test_convert_ckpt_from_megatron.py index 258a3bdd5be742d6963273a6e7b7c891471c7d1a..2ad09c7998eb9dbd6dd7e660a91db4a86ef3dd10 100644 --- a/tests/ut/checkpoint/test_convert_ckpt_from_megatron.py +++ b/tests/ut/checkpoint/test_convert_ckpt_from_megatron.py @@ -5,7 +5,7 @@ import subprocess from pathlib import Path import torch -from modellink import megatron_adaptor +from mindspeed_llm import megatron_adaptor from tests.test_tools.utils import judge_expression diff --git a/tests/ut/checkpoint/test_convert_ckpt_to_huggingface.py b/tests/ut/checkpoint/test_convert_ckpt_to_huggingface.py index 6e90c9e2f65e35716c3a11667127039b64a014d9..b8df41222593c8cbd0d752e94116369989c75a60 100644 --- a/tests/ut/checkpoint/test_convert_ckpt_to_huggingface.py +++ b/tests/ut/checkpoint/test_convert_ckpt_to_huggingface.py @@ -7,7 +7,7 @@ import numpy as np import torch from transformers import AutoModelForCausalLM -from modellink import megatron_adaptor +from mindspeed_llm import megatron_adaptor from tests.test_tools.utils import judge_expression diff --git a/tests/ut/dist_algo/context_parallel/test_adaptive_context_parallel.py b/tests/ut/dist_algo/context_parallel/test_adaptive_context_parallel.py index 29091835622897369d74707d6adee313003a0f88..c47b74cde5ba4c7d0dee33ed529d2bffa3a54de6 100644 --- a/tests/ut/dist_algo/context_parallel/test_adaptive_context_parallel.py +++ 
@@ -5,8 +5,8 @@ import torch_npu
 import numpy as np
 import torch.distributed as dist

-# To activate modellink.patches.__init__
-from modellink import megatron_adaptor
+# To activate mindspeed_llm.patches.__init__
+from mindspeed_llm import megatron_adaptor
 from megatron.training.global_vars import set_args
 from megatron.training.arguments import parse_args
 from mindspeed.core.context_parallel.adaptive_context_parallel import adaptive_attn_context_parallel
@@ -27,7 +27,7 @@ from mindspeed.core.context_parallel.utils import (set_scheduling_info,
 from tests.test_tools.dist_test import DistributedTest
 from tests.test_tools.utils import initialize_model_parallel, initialize_model_parallel_decorator

-from modellink.training.utils import seed_all
+from mindspeed_llm.training.utils import seed_all


 DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
diff --git a/tests/ut/dist_algo/context_parallel/test_hybrid_context_parallel.py b/tests/ut/dist_algo/context_parallel/test_hybrid_context_parallel.py
index 01963a755e9c7fb49a58705393a56a250568a23a..b64c498a69bdf491e4be9bbbbeaa9e83c316bc35 100644
--- a/tests/ut/dist_algo/context_parallel/test_hybrid_context_parallel.py
+++ b/tests/ut/dist_algo/context_parallel/test_hybrid_context_parallel.py
@@ -4,8 +4,8 @@ import torch
 import torch_npu
 import torch.distributed as dist

-# To activate modellink.patches.__init__
-from modellink import megatron_adaptor
+# To activate mindspeed_llm.patches.__init__
+from mindspeed_llm import megatron_adaptor
 from megatron.training.global_vars import set_args
 from megatron.training.arguments import parse_args
 from megatron.legacy.model.transformer import FlashSelfAttention
@@ -20,7 +20,7 @@ from mindspeed.model.transformer import set_attention_mask
 from tests.test_tools.dist_test import DistributedTest
 from tests.test_tools.utils import initialize_model_parallel, initialize_model_parallel_decorator

-from modellink.training.utils import seed_all
+from mindspeed_llm.training.utils import seed_all


 def get_data_on_this_cp_rank(data, r_size, u_size, cp_rank, dim=0):
diff --git a/tests/ut/dist_algo/context_parallel/test_ringattn_context_parallel.py b/tests/ut/dist_algo/context_parallel/test_ringattn_context_parallel.py
index ae277f5391b118f9e04a5f42611e4e2f6fd3900a..44a57743191f28bba666881412fa93a24721039c 100644
--- a/tests/ut/dist_algo/context_parallel/test_ringattn_context_parallel.py
+++ b/tests/ut/dist_algo/context_parallel/test_ringattn_context_parallel.py
@@ -4,16 +4,16 @@ import torch
 import torch_npu
 import torch.distributed as dist

-# To activate modellink.patches.__init__
-from modellink import megatron_adaptor
+# To activate mindspeed_llm.patches.__init__
+from mindspeed_llm import megatron_adaptor
 from megatron.training.global_vars import set_args
 from megatron.training.arguments import parse_args
 from mindspeed.model.transformer import get_attention_mask
-from modellink.training.utils import seed_all
+from mindspeed_llm.training.utils import seed_all

 from tests.test_tools.dist_test import DistributedTest
 from tests.test_tools.utils import initialize_model_parallel, initialize_model_parallel_decorator
-from modellink.core.transformer.dot_product_attention import do_ring_context_parallel
+from mindspeed_llm.core.transformer.dot_product_attention import do_ring_context_parallel


 def get_data_on_this_cp_rank(data, cp_size, cp_rank, dim=0):
diff --git a/tests/ut/dist_algo/context_parallel/test_ulysses_context_parallel.py b/tests/ut/dist_algo/context_parallel/test_ulysses_context_parallel.py
index 3de7a9fba53320754e7876e73fb60dc1006d0cfd..1eddd6a1adba72770cb3933d3f4c047f29ce6390 100644
--- a/tests/ut/dist_algo/context_parallel/test_ulysses_context_parallel.py
+++ b/tests/ut/dist_algo/context_parallel/test_ulysses_context_parallel.py
@@ -4,14 +4,14 @@ import torch
 import torch_npu
 import torch.distributed as dist

-# To activate modellink.patches.__init__
-from modellink import megatron_adaptor
+# To activate mindspeed_llm.patches.__init__
+from mindspeed_llm import megatron_adaptor
 import megatron.core.parallel_state as ps
 from megatron.training.global_vars import set_args
 from megatron.training.arguments import parse_args
 from mindspeed.core.context_parallel.ulysses_context_parallel import UlyssesContextAttention
-from modellink.training.utils import seed_all
+from mindspeed_llm.training.utils import seed_all

 from tests.test_tools.dist_test import DistributedTest
 from tests.test_tools.utils import initialize_model_parallel, initialize_model_parallel_decorator
diff --git a/tests/ut/model_module/embeddings/test_rotary_pos_embedding.py b/tests/ut/model_module/embeddings/test_rotary_pos_embedding.py
index aad93d26944365a5498b6cb2ea833eb5eda0e9d1..6605397ffcfb42e6b7b440e246b8bbba64ccaef2 100644
--- a/tests/ut/model_module/embeddings/test_rotary_pos_embedding.py
+++ b/tests/ut/model_module/embeddings/test_rotary_pos_embedding.py
@@ -17,7 +17,7 @@ from types import SimpleNamespace
 from pathlib import Path
 import pytest
 import torch
-from modellink import megatron_adaptor
+from mindspeed_llm import megatron_adaptor
 from tests.test_tools.dist_test import create_testconfig
 from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding

@@ -35,8 +35,8 @@ class TestRotaryPosEmbedding:
             test_name_space.rotary_base = request.getfixturevalue("rotary_base")
             return test_name_space
         # set up name space function
-        import modellink
-        setattr(modellink.core.models.common.embeddings.rotary_pos_embedding, "get_args", get_test_namespace)
+        import mindspeed_llm
+        setattr(mindspeed_llm.core.models.common.embeddings.rotary_pos_embedding, "get_args", get_test_namespace)

     @pytest.mark.parametrize("rotary_param, chatglm, rotary_base, seq, expected", test_config["test_rotary_pos_embedding"])
     def test_rotary_pos_embedding(self, mock_dependency, rotary_param, chatglm, rotary_base, seq, expected):
diff --git a/tests/ut/model_module/transformer/test_attention.py b/tests/ut/model_module/transformer/test_attention.py
index fddd8b745f1e612f6d12ac6a612c1234616eb576..ea3efa5a2406a0ba8fff6aa7555a7c556d9a3556 100644
--- a/tests/ut/model_module/transformer/test_attention.py
+++ b/tests/ut/model_module/transformer/test_attention.py
@@ -4,8 +4,8 @@ import torch
 import torch_npu
 import torch.distributed as dist

-# To activate modellink.patches.__init__
-from modellink import megatron_adaptor
+# To activate mindspeed_llm.patches.__init__
+from mindspeed_llm import megatron_adaptor
 from megatron.training.global_vars import set_args
 from megatron.training.arguments import parse_args
 from megatron.legacy.model.transformer import FlashSelfAttention
@@ -19,8 +19,8 @@ from mindspeed.model.transformer import set_attention_mask
 from tests.test_tools.dist_test import DistributedTest
 from tests.test_tools.utils import initialize_model_parallel, initialize_model_parallel_decorator

-from modellink.tasks.models.common.alibi import Alibi
-from modellink.training.utils import seed_all
+from mindspeed_llm.tasks.models.common.alibi import Alibi
+from mindspeed_llm.training.utils import seed_all


 def get_data_on_this_cp_rank(data, r_size, u_size, cp_rank, dim=0):
diff --git a/tests/ut/process_data/test_preprocess_data.py b/tests/ut/process_data/test_preprocess_data.py
index a37acf5e9a9f4e6b83db4ea390bb892318bb641a..e351d4bf6c62193c0405d6fbd73fbad336f36f8b 100644
--- a/tests/ut/process_data/test_preprocess_data.py
+++ b/tests/ut/process_data/test_preprocess_data.py
@@ -2,7 +2,7 @@ import os
 from pathlib import Path
 import pytest
 import pandas as pd
-from modellink import megatron_adaptor
+from mindspeed_llm import megatron_adaptor
 from tests.test_tools.utils import build_args, create_testconfig, compare_file_md5_same

 from preprocess_data import main
diff --git a/tests/ut/process_data/test_process_instruction_data_lf.py b/tests/ut/process_data/test_process_instruction_data_lf.py
index 703b7e147d520ba9f4cbe430f86f1c8de81220bb..6120aecc7e44659ebeb9883754bd941c0fcf8c0a 100644
--- a/tests/ut/process_data/test_process_instruction_data_lf.py
+++ b/tests/ut/process_data/test_process_instruction_data_lf.py
@@ -1,7 +1,7 @@
 import os
 from pathlib import Path
 import pytest
-from modellink import megatron_adaptor
+from mindspeed_llm import megatron_adaptor
 from tests.test_tools.utils import build_args, create_testconfig, compare_file_md5_same

 from preprocess_data import main
diff --git a/tests/ut/process_data/test_process_instruction_pack_data.py b/tests/ut/process_data/test_process_instruction_pack_data.py
index 52da846f59ee4a772ccd88d567ed1018af8dc6ef..a5f7cf1cc225e25fa090fc68573372a636a3e1b4 100644
--- a/tests/ut/process_data/test_process_instruction_pack_data.py
+++ b/tests/ut/process_data/test_process_instruction_pack_data.py
@@ -1,10 +1,10 @@
 import sys
 import os
 import math
-from modellink import megatron_adaptor
+from mindspeed_llm import megatron_adaptor

-from modellink.training.tokenizer import build_tokenizer
-from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
+from mindspeed_llm.training.tokenizer import build_tokenizer
+from mindspeed_llm.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
 from preprocess_data import get_args, build_splitter

 from tests.test_tools.utils import judge_expression
diff --git a/tests/ut/process_data/test_process_pairwise_data_lf.py b/tests/ut/process_data/test_process_pairwise_data_lf.py
index 3fd779bbcbcb71a5cd47e250a412de3b3d4cd2d2..95afcf9c03c21551d198d3f4e45476275a880e98 100644
--- a/tests/ut/process_data/test_process_pairwise_data_lf.py
+++ b/tests/ut/process_data/test_process_pairwise_data_lf.py
@@ -2,7 +2,7 @@ import os
 from pathlib import Path
 import pytest

-from modellink import megatron_adaptor
+from mindspeed_llm import megatron_adaptor
 from tests.test_tools.utils import build_args, create_testconfig, compare_file_md5_same

 from preprocess_data import main