diff --git a/examples/mindspore/chatglm3/pretrain_chatglm3_6B_32k.sh b/examples/mindspore/chatglm3/pretrain_chatglm3_6B_32k.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fba4236040d2547ae43eab30dbf37ca2ca90bd5c
--- /dev/null
+++ b/examples/mindspore/chatglm3/pretrain_chatglm3_6B_32k.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+NPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6001
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$((NPUS_PER_NODE*$NNODES))
+
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+
+TP=1
+PP=1
+CP=8
+MBS=1
+GBS=32
+SEQ_LEN=32768
+CP_ALGO=ulysses_cp_algo
+
+DISTRIBUTED_ARGS="
+    --master_addr $MASTER_ADDR \
+    --node_rank $NODE_RANK \
+    --worker_num $WORLD_SIZE \
+    --local_worker_num $NPUS_PER_NODE \
+    --master_port $MASTER_PORT \
+    --log_dir=msrun_log \
+    --join=False \
+    --cluster_time_out=300 \
+    --bind_core=True \
+"
+
+GPT_ARGS="
+    --use-mcore-models \
+    --transformer-impl local \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --sequence-parallel \
+    --num-layers 28 \
+    --hidden-size 4096 \
+    --ffn-hidden-size 13696 \
+    --num-attention-heads 32 \
+    --seq-length ${SEQ_LEN} \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --context-parallel-algo ${CP_ALGO} \
+    --context-parallel-size ${CP} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --padded-vocab-size 65024 \
+    --make-vocab-size-divisible-by 1 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --disable-bias-linear \
+    --add-qkv-bias \
+    --position-embedding-type rope \
+    --no-rope-fusion \
+    --use-distributed-optimizer \
+    --use-glm-rope \
+    --rotary-percent 0.5 \
+    --use-flash-attn \
+    --use-fused-rmsnorm \
+    --use-fused-swiglu \
+    --normalization RMSNorm \
+    --swiglu \
+    --no-create-attention-mask-in-dataloader \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --lr 1e-6 \
+    --train-iters 2000 \
+    --lr-decay-style cosine \
+    --untie-embeddings-and-output-weights \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1e-8 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --initial-loss-scale 4096 \
+    --adam-beta2 0.95 \
+    --no-gradient-accumulation-fusion \
+    --no-load-optim \
+    --no-load-rng \
+    --fp16 \
+    --kv-head-repeat-before-uly-alltoall \
+    --use-cp-send-recv-overlap \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 949,50,1 \
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 2000 \
+    --eval-interval 2000 \
+    --eval-iters 10 \
+    --save $CKPT_SAVE_DIR \
+    --load $CKPT_LOAD_DIR \
+"
+
+msrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --ai-framework mindspore \
+    | tee logs/ms_train_mcore_chatglm3_6B_32K.log
\ No newline at end of file
diff --git a/examples/mindspore/chatglm3/pretrain_chatglm3_6B_64k.sh b/examples/mindspore/chatglm3/pretrain_chatglm3_6B_64k.sh
new file mode 100644
index 0000000000000000000000000000000000000000..691b40b4ec77fc978780908045c6e42883d2e36a
--- /dev/null
+++ b/examples/mindspore/chatglm3/pretrain_chatglm3_6B_64k.sh
@@ -0,0 +1,137 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Please enter the IP addresses of your servers.
+IPs=('IP1' 'IP2')
+LOCAL_HOST=`hostname -I|awk -F " " '{print$1}'`
+echo LOCAL_HOST $LOCAL_HOST
+
+NPUS_PER_NODE=8
+MASTER_ADDR=${IPs[0]}
+MASTER_PORT=6010
+NNODES=${#IPs[@]}
+NODE_RANK=""
+WORLD_SIZE=$((NPUS_PER_NODE*$NNODES))
+
+for i in "${!IPs[@]}";
+do
+    if [ "$LOCAL_HOST" == "${IPs[$i]}" ];
+    then
+        echo "Node Rank : ${i}"
+        NODE_RANK=$i
+        break
+    fi
+done
+
+
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+
+TP=1
+PP=1
+CP=16
+MBS=1
+GBS=64
+SEQ_LEN=65536
+CP_ALGO=hybrid_cp_algo
+
+DISTRIBUTED_ARGS="
+    --master_addr $MASTER_ADDR \
+    --node_rank $NODE_RANK \
+    --worker_num $WORLD_SIZE \
+    --local_worker_num $NPUS_PER_NODE \
+    --master_port $MASTER_PORT \
+    --log_dir=msrun_log \
+    --join=False \
+    --cluster_time_out=300 \
+    --bind_core=True \
+"
+
+
+GPT_ARGS="
+    --use-mcore-models \
+    --transformer-impl local \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --sequence-parallel \
+    --num-layers 28 \
+    --hidden-size 4096 \
+    --ffn-hidden-size 13696 \
+    --num-attention-heads 32 \
+    --ulysses-degree-in-cp 8 \
+    --seq-length ${SEQ_LEN} \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --context-parallel-algo ${CP_ALGO} \
+    --context-parallel-size ${CP} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --padded-vocab-size 65024 \
+    --make-vocab-size-divisible-by 1 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --disable-bias-linear \
+    --add-qkv-bias \
+    --position-embedding-type rope \
+    --no-rope-fusion \
+    --use-distributed-optimizer \
+    --use-glm-rope \
+    --rotary-percent 0.5 \
+    --use-flash-attn \
+    --use-fused-rmsnorm \
+    --use-fused-swiglu \
+    --normalization RMSNorm \
+    --swiglu \
+    --no-create-attention-mask-in-dataloader \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --lr 1e-6 \
+    --train-iters 2000 \
+    --lr-decay-style cosine \
+    --untie-embeddings-and-output-weights \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1e-8 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --initial-loss-scale 512 \
+    --adam-beta2 0.95 \
+    --no-gradient-accumulation-fusion \
+    --no-load-optim \
+    --no-load-rng \
+    --fp16 \
+    --num-workers 1 \
+    --kv-head-repeat-before-uly-alltoall \
+    --no-shared-storage \
+    --use-cp-send-recv-overlap \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 949,50,1
+"
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 2000 \
+    --eval-interval 2000 \
+    --eval-iters 10 \
+    --save $CKPT_SAVE_DIR \
+    --load $CKPT_LOAD_DIR \
+"
+
+
+msrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --ai-framework mindspore \
+    | tee logs/ms_train_mcore_chatglm3_6B_64K.log
\ No newline at end of file
diff --git a/examples/mindspore/chatglm3/pretrain_chatglm3_6B_8k.sh b/examples/mindspore/chatglm3/pretrain_chatglm3_6B_8k.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2550b2a71541e5f79af4aa0233b09f3ebc0643c1
--- /dev/null
+++ b/examples/mindspore/chatglm3/pretrain_chatglm3_6B_8k.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+NPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6021
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$((NPUS_PER_NODE*$NNODES))
+
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+
+TP=1
+PP=2
+MBS=1
+GBS=128
+SEQ_LEN=8192
+
+DISTRIBUTED_ARGS="
+    --master_addr $MASTER_ADDR \
+    --node_rank $NODE_RANK \
+    --worker_num $WORLD_SIZE \
+    --local_worker_num $NPUS_PER_NODE \
+    --master_port $MASTER_PORT \
+    --log_dir=msrun_log \
+    --join=False \
+    --cluster_time_out=300 \
+    --bind_core=True \
+"
+
+
+GPT_ARGS="
+    --use-mcore-models \
+    --transformer-impl local \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --sequence-parallel \
+    --num-layers 28 \
+    --hidden-size 4096 \
+    --ffn-hidden-size 13696 \
+    --num-attention-heads 32 \
+    --seq-length ${SEQ_LEN} \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --padded-vocab-size 65024 \
+    --make-vocab-size-divisible-by 1 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --disable-bias-linear \
+    --add-qkv-bias \
+    --position-embedding-type rope \
+    --no-rope-fusion \
+    --use-glm-rope \
+    --rotary-percent 0.5 \
+    --normalization RMSNorm \
+    --use-fused-rmsnorm \
+    --swiglu \
+    --use-fused-swiglu \
+    --use-flash-attn \
+    --use-distributed-optimizer \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --lr 1e-6 \
+    --train-iters 2000 \
+    --lr-decay-style cosine \
+    --untie-embeddings-and-output-weights \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1e-8 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --initial-loss-scale 4096 \
+    --adam-beta2 0.95 \
+    --no-gradient-accumulation-fusion \
+    --no-load-optim \
+    --no-load-rng \
+    --fp16 \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 2000 \
+    --eval-interval 2000 \
+    --eval-iters 10 \
+    --save $CKPT_SAVE_DIR \
+    --load $CKPT_LOAD_DIR \
+"
+
+msrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --ai-framework mindspore \
+    | tee logs/ms_train_mcore_chatglm3_6B_8K.log
\ No newline at end of file