diff --git a/examples/mindspore/codellama/pretrain_codellama_34b_ms_16p.sh b/examples/mindspore/codellama/pretrain_codellama_34b_ms_16p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3c045ab38c6a9894bb4fed10aa0ae9b9d274c742
--- /dev/null
+++ b/examples/mindspore/codellama/pretrain_codellama_34b_ms_16p.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NPU_ASD_ENABLE=0
+
+NPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6001
+NNODES=2
+NODE_RANK=0
+WORLD_SIZE=$((NPUS_PER_NODE*$NNODES))
+
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_MODEL="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+TP=8
+PP=2
+
+DISTRIBUTED_ARGS="
+    --master_addr $MASTER_ADDR \
+    --node_rank $NODE_RANK \
+    --worker_num $WORLD_SIZE \
+    --local_worker_num $NPUS_PER_NODE \
+    --master_port $MASTER_PORT \
+    --log_dir=msrun_log \
+    --join=False \
+    --cluster_time_out=300 \
+    --bind_core=True \
+"
+
+
+GPT_ARGS="
+    --use-mcore-models \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --sequence-parallel \
+    --num-layers 48 \
+    --hidden-size 8192 \
+    --ffn-hidden-size 22016 \
+    --num-attention-heads 64 \
+    --tokenizer-type Llama2Tokenizer \
+    --tokenizer-model ${TOKENIZER_MODEL} \
+    --seq-length 4096 \
+    --max-position-embeddings 16384 \
+    --micro-batch-size 2 \
+    --global-batch-size 1024 \
+    --make-vocab-size-divisible-by 1 \
+    --lr 1.0e-7 \
+    --train-iters 2000 \
+    --lr-decay-style cosine \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --position-embedding-type rope \
+    --normalization RMSNorm \
+    --use-fused-rmsnorm \
+    --use-fused-rotary-pos-emb \
+    --use-rotary-position-embeddings \
+    --use-fused-swiglu \
+    --swiglu \
+    --use-flash-attn \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1.0e-8 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.999 \
+    --initial-loss-scale 8188.0 \
+    --no-gradient-accumulation-fusion \
+    --load ${CKPT_LOAD_DIR} \
+    --no-load-optim \
+    --no-load-rng \
+    --rotary-base 1000000 \
+    --vocab-size 32000 \
+    --bf16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 2000 \
+    --eval-interval 2000 \
+    --eval-iters 0 \
+"
+
+msrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --ai-framework mindspore \
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_codellama_34b.log
\ No newline at end of file
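
Usage sketch (reviewer note, not part of the patch): the script hard-codes MASTER_ADDR=localhost, NNODES=2 and NODE_RANK=0, so a two-node (16-NPU) run needs those values edited per node before launch, and the logs/ directory created for the tee target. Assuming the master node is reachable at 192.0.2.1 (placeholder address) and both nodes run from the repository root:

    # node 0 (master): point MASTER_ADDR at the master's IP, then launch
    sed -i 's/^MASTER_ADDR=localhost/MASTER_ADDR=192.0.2.1/' examples/mindspore/codellama/pretrain_codellama_34b_ms_16p.sh
    mkdir -p logs
    bash examples/mindspore/codellama/pretrain_codellama_34b_ms_16p.sh

    # node 1 (worker): same MASTER_ADDR, but NODE_RANK must become 1
    sed -i -e 's/^MASTER_ADDR=localhost/MASTER_ADDR=192.0.2.1/' \
           -e 's/^NODE_RANK=0/NODE_RANK=1/' examples/mindspore/codellama/pretrain_codellama_34b_ms_16p.sh
    mkdir -p logs
    bash examples/mindspore/codellama/pretrain_codellama_34b_ms_16p.sh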