From 1d54912a98e86a184c0f7be0b79096602018ef43 Mon Sep 17 00:00:00 2001
From: Huang <1023668958@qq.com>
Date: Thu, 19 Dec 2024 09:24:14 +0000
Subject: [PATCH] 【OpenMind】Adapt internlm2-base-7b model scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/models/evaluation.md                     |   2 +-
 docs/models/pretrain.md                       |   9 ++
 .../evaluate_internlm2_base_7b_ptd.sh         |  61 +++++++++++
 .../generate_internlm2_base_7b_ptd.sh         |  60 +++++++++++
 .../pretrain_internlm2_base_7b_ptd.sh         | 101 ++++++++++++++++++
 5 files changed, 232 insertions(+), 1 deletion(-)
 create mode 100644 examples/mcore/internlm2/evaluate_internlm2_base_7b_ptd.sh
 create mode 100644 examples/mcore/internlm2/generate_internlm2_base_7b_ptd.sh
 create mode 100644 examples/mcore/internlm2/pretrain_internlm2_base_7b_ptd.sh

diff --git a/docs/models/evaluation.md b/docs/models/evaluation.md
index e1dfaee35..f1c186c5f 100644
--- a/docs/models/evaluation.md
+++ b/docs/models/evaluation.md
@@ -51,7 +51,7 @@ MindSpeed-LLM 已支持的大模型评估数据统计如下:
 | Yi1.5-9B | MMLU | 69.2% | [69.5%](https://huggingface.co/01-ai/Yi-1.5-9B/tree/main) | Yi1.5-34B | MMLU | 76.9% | [77.1%](https://huggingface.co/01-ai/Yi-1.5-34B/tree/main) |
 | CodeQWen2.5-7B | Human. | 66.5% | [61.6%](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-7B) | Qwen2.5-Math-7B |MMLU-STEM| 65.1% | [67.8%](https://github.com/QwenLM/Qwen2.5-Math/tree/main/) |
 | Qwen2.5-Math-72B |MMLU-STEM| 83.7% | [82.8%](https://github.com/QwenLM/Qwen2.5-Math/tree/main/) | MiniCPM3-4B | MMLU | 63.7% | 64.6% |
-| Phi3.5-Mini | MMLU | 64.39% | 64.34% |
+| Phi3.5-Mini | MMLU | 64.39% | 64.34% | InternLM2-base-7B | MMLU | 63.1% | [64.0%](https://arxiv.org/pdf/2403.17297.pdf) |
 
 具体的权重转换功能命令介绍见 [MindSpeed-LLM 大模型使用指南分布式评估](../USER_GUIDE.md/#大模型分布式评估)章节.
 
diff --git a/docs/models/pretrain.md b/docs/models/pretrain.md
index 6775196d8..e43d7e53e 100644
--- a/docs/models/pretrain.md
+++ b/docs/models/pretrain.md
@@ -174,6 +174,15 @@
       <td>【GTS】</td>
       <td>【Test】</td>
     </tr>
+    <tr>
+      <td>InternLM2-base</td>
+      <td>7B</td>
+      <td>4K</td>
+      <td>Mcore</td>
+      <td>1x8</td>
+      <td>【昇腾】</td>
+      <td>【Test】</td>
+    </tr>
     <tr>
       <td>InternLM2.5</td>
       <td>1.8B</td>
diff --git a/examples/mcore/internlm2/evaluate_internlm2_base_7b_ptd.sh b/examples/mcore/internlm2/evaluate_internlm2_base_7b_ptd.sh
new file mode 100644
index 000000000..a98497199
--- /dev/null
+++ b/examples/mcore/internlm2/evaluate_internlm2_base_7b_ptd.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# distributed config
+NPUS_PER_NODE=1
+MASTER_ADDR=localhost
+MASTER_PORT=6001
+NNODES=1
+NODE_RANK=0
+
+TOKENIZER_PATH="your tokenizer directory path"
+CHECKPOINT="your converted model checkpoint directory path"
+DATA_PATH="./data/mmlu/test"
+TASK="mmlu"
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT"
+
+torchrun ${DISTRIBUTED_ARGS} evaluation.py \
+    --no-chat-template \
+    --task-data-path ${DATA_PATH} \
+    --task ${TASK} \
+    --use-mcore-models \
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 32 \
+    --hidden-size 4096 \
+    --ffn-hidden-size 14336 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --position-embedding-type rope \
+    --norm-epsilon 1e-5 \
+    --rotary-base 1000000 \
+    --seq-length 32768 \
+    --max-new-tokens 1 \
+    --micro-batch-size 1 \
+    --num-attention-heads 32 \
+    --max-position-embeddings 32768 \
+    --padded-vocab-size 92544 \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --swiglu \
+    --load ${CHECKPOINT} \
+    --disable-bias-linear \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --tokenizer-not-use-fast \
+    --normalization RMSNorm \
+    --exit-on-missing-checkpoint \
+    --no-load-rng \
+    --no-load-optim \
+    --untie-embeddings-and-output-weights \
+    --no-masked-softmax-fusion \
+    --no-gradient-accumulation-fusion \
+    --make-vocab-size-divisible-by 1 \
+    --seed 42 \
+    | tee logs/evaluate_internlm2_base_7b_mcore_${TASK}.log
\ No newline at end of file
diff --git a/examples/mcore/internlm2/generate_internlm2_base_7b_ptd.sh b/examples/mcore/internlm2/generate_internlm2_base_7b_ptd.sh
new file mode 100644
index 000000000..c377df89c
--- /dev/null
+++ b/examples/mcore/internlm2/generate_internlm2_base_7b_ptd.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+
+# The number of parameters is not aligned
+export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
+export HCCL_CONNECT_TIMEOUT=1200
+export COMBINED_ENABLE=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# please fill these path configurations
+CHECKPOINT="your model directory path"
+TOKENIZER_PATH="your tokenizer directory path"
+TOKENIZER_MODEL="your tokenizer.model file path"
+
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6001
+NNODES=1
+NODE_RANK=0
+NPUS_PER_NODE=1
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS inference.py \
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 32 \
+    --hidden-size 4096 \
+    --ffn-hidden-size 14336 \
+    --position-embedding-type rope \
+    --use-rotary-position-embeddings \
+    --rotary-base 1000000 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --seq-length 8192 \
+    --max-new-tokens 256 \
+    --micro-batch-size 1 \
+    --num-attention-heads 32 \
+    --max-position-embeddings 32768 \
+    --padded-vocab-size 92544 \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --swiglu \
+    --load "${CHECKPOINT}" \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path "${TOKENIZER_PATH}" \
+    --tokenizer-model "${TOKENIZER_MODEL}" \
+    --tokenizer-not-use-fast \
+    --bf16 \
+    --normalization RMSNorm \
+    --norm-epsilon 1e-5 \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --attention-softmax-in-fp32 \
+    --exit-on-missing-checkpoint \
+    --seed 42 \
+    --make-vocab-size-divisible-by 1 \
+    --use-mcore-models \
+    | tee logs/generate_internlm2_base_7b.log
\ No newline at end of file
diff --git a/examples/mcore/internlm2/pretrain_internlm2_base_7b_ptd.sh b/examples/mcore/internlm2/pretrain_internlm2_base_7b_ptd.sh
new file mode 100644
index 000000000..72882cd7b
--- /dev/null
+++ b/examples/mcore/internlm2/pretrain_internlm2_base_7b_ptd.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export HCCL_CONNECT_TIMEOUT=1200
+
+GPUS_PER_NODE=2
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_MODEL="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path, the parameters TP and PP should be the same as the converted model"
+TP=1
+PP=2
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --use-mcore-models \
+    --sequence-parallel \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --num-layers 32 \
+    --hidden-size 4096 \
+    --ffn-hidden-size 14336 \
+    --num-attention-heads 32 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_MODEL} \
+    --seq-length 256 \
+    --max-position-embeddings 256 \
+    --micro-batch-size 1 \
+    --global-batch-size 1 \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 92544 \
+    --lr 1e-6 \
+    --lr-decay-style cosine \
+    --rotary-base 1000000 \
+    --train-iters 1 \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --init-method-std 0.01 \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --min-lr 1.25e-7 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --initial-loss-scale 4096 \
+    --no-gradient-accumulation-fusion \
+    --use-flash-attn \
+    --use-fused-rotary-pos-emb \
+    --use-rotary-position-embeddings \
+    --use-fused-swiglu \
+    --use-fused-rmsnorm \
+    --position-embedding-type rope \
+    --normalization RMSNorm \
+    --norm-epsilon 1e-5 \
+    --swiglu \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --no-load-optim \
+    --no-load-rng \
+    --seed 42 \
+    --bf16
+"
+    # --use-mc2 \
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 2000 \
+    --eval-interval 2000 \
+    --eval-iters 0 \
+"
+
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --load ${CKPT_LOAD_DIR} \
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_internlm2_base_7b_micro.log
\ No newline at end of file
-- 
Gitee
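
For reference, a minimal way to exercise the new evaluation script after applying the patch from the MindSpeed-LLM repository root might look like the sketch below. The patch file name, checkpoint directory, and tokenizer directory are illustrative assumptions rather than values taken from this change, and the MMLU data is assumed to already be prepared under ./data/mmlu/test as the script expects.

    # Illustrative usage only; paths and file names below are assumptions, not part of the patch.
    cd MindSpeed-LLM                                   # assumed repository root
    git apply 0001-OpenMind-internlm2-base-7b.patch    # hypothetical patch file name
    mkdir -p logs                                      # the scripts tee their output into logs/

    # Replace the placeholder paths in the script, e.g. with sed (target directories are hypothetical):
    sed -i 's#your tokenizer directory path#/data/internlm2-base-7b-hf#' \
        examples/mcore/internlm2/evaluate_internlm2_base_7b_ptd.sh
    sed -i 's#your converted model checkpoint directory path#/data/internlm2-base-7b-mcore-tp1pp1#' \
        examples/mcore/internlm2/evaluate_internlm2_base_7b_ptd.sh

    # Run the MMLU evaluation on a single NPU, as configured in the script.
    bash examples/mcore/internlm2/evaluate_internlm2_base_7b_ptd.sh

The generate and pretrain scripts can be prepared the same way by filling in their own placeholder paths before launching them.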