From bebbdf7f74a75b1da3ee247fe1351d395936683f Mon Sep 17 00:00:00 2001
From: duyangkai
Date: Mon, 23 Jun 2025 10:02:42 +0800
Subject: [PATCH 1/2] add st for overlap

---
Review notes (commentary after the three-dash line; not part of the commit):
  * Adds a single-node, 8-process torchrun launch script for pretrain_gpt.py
    using the dualpipev schedule together with the MoE forward/backward
    overlap flag.
  * The *_ARGS strings are expanded unquoted in the torchrun call, so each
    one word-splits into separate command-line arguments.
  * The torchrun command ends with a trailing backslash followed by an empty
    line; that empty line is what terminates the command. Deleting it would
    pass "set +x" to torchrun as extra arguments.
  * WORLD_SIZE, CKPT_DIR and CP are assigned but never referenced or
    exported in this script.
  * "set +x" has no matching "set -x" in this script; presumably tracing is
    enabled by the sourced env_npu.sh - to be confirmed.

 .../feature_tests/dualpipev_overlap.sh        | 116 ++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 tests_extend/system_tests/feature_tests/dualpipev_overlap.sh

diff --git a/tests_extend/system_tests/feature_tests/dualpipev_overlap.sh b/tests_extend/system_tests/feature_tests/dualpipev_overlap.sh
new file mode 100644
index 00000000..05024424
--- /dev/null
+++ b/tests_extend/system_tests/feature_tests/dualpipev_overlap.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+source "tests_extend/system_tests/env_npu.sh"
+export STREAMS_PER_DEVICE=32
+
+NPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6001
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+CKPT_DIR=./ckpt_llama
+DATA_PATH="/home/dataset/llama2/alpaca_text_document"
+TOKENIZER_MODEL="/home/dataset/model/llama-2-7b-hf/tokenizer.model"
+
+TP=2
+PP=2
+CP=1
+EP=2
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+MOE_ARGS="
+    --expert-model-parallel-size ${EP} \
+    --moe-token-dispatcher-type alltoall \
+    --moe-fb-overlap \
+    --moe-zero-memory level0 \
+    --moe-grouped-gemm \
+    --n-shared-experts 1 \
+    --num-experts 16 \
+    --moe-router-topk 4 \
+    --moe-aux-loss-coeff 0.02 \
+"
+
+GPT_ARGS="
+    --transformer-impl local \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --schedules-method dualpipev \
+    --use-flash-attn \
+    --use-fused-rotary-pos-emb \
+    --use-fused-rmsnorm \
+    --sequence-parallel \
+    --use-distributed-optimizer \
+    --overlap-grad-reduce \
+    --num-layers 8 \
+    --noop-layers 7 \
+    --manual-gc \
+    --manual-gc-interval 50 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --train-iters 10000 \
+    --hidden-size 8192 \
+    --num-attention-heads 64 \
+    --ffn-hidden-size 4352 \
+    --make-vocab-size-divisible-by 128 \
+    --vocab-size 126464 \
+    --micro-batch-size 1 \
+    --global-batch-size 32 \
+    --tokenizer-type Llama2Tokenizer \
+    --tokenizer-model ${TOKENIZER_MODEL} \
+    --disable-bias-linear \
+    --lr-decay-style linear \
+    --lr-warmup-iters 1500 \
+    --short-seq-prob 0.0 \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --untie-embeddings-and-output-weights \
+    --init-method-std 0.006 \
+    --position-embedding-type rope \
+    --normalization RMSNorm \
+    --swiglu \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --no-gradient-accumulation-fusion \
+    --bf16 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --lr 2.0e-4 \
+    --min-lr 2.0e-5 \
+    --weight-decay 0.1 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --rotary-base 100000 \
+    --norm-epsilon 1.0e-5 \
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 995,5,0
+"
+
+OUTPUT_ARGS="
+    --log-throughput \
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 10000 \
+    --eval-iters 10 \
+"
+
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $MOE_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+
+set +x
-- 
Gitee


From 6017434fc23b263a31e6389d144b7fec432df8f9 Mon Sep 17 00:00:00 2001
From: yangkai
Date: Thu, 26 Jun 2025 11:49:08 +0000
Subject: [PATCH 2/2] fix st

---
Review notes (commentary after the three-dash line; not part of the commit):
  * Introduces ETP=1 and passes it to the launcher as the expert tensor
    parallel size.
  * Removes the overlap-grad-reduce flag from GPT_ARGS.
  * Context lines below assume the file produced by patch 1/2 of this series.

 tests_extend/system_tests/feature_tests/dualpipev_overlap.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests_extend/system_tests/feature_tests/dualpipev_overlap.sh b/tests_extend/system_tests/feature_tests/dualpipev_overlap.sh
index 05024424..5ae4a72e 100644
--- a/tests_extend/system_tests/feature_tests/dualpipev_overlap.sh
+++ b/tests_extend/system_tests/feature_tests/dualpipev_overlap.sh
@@ -19,6 +19,7 @@ TP=2
 PP=2
 CP=1
 EP=2
+ETP=1
 
 DISTRIBUTED_ARGS="
     --nproc_per_node $NPUS_PER_NODE \
@@ -30,6 +31,7 @@ DISTRIBUTED_ARGS="
 
 MOE_ARGS="
     --expert-model-parallel-size ${EP} \
+    --expert-tensor-parallel-size ${ETP} \
     --moe-token-dispatcher-type alltoall \
     --moe-fb-overlap \
     --moe-zero-memory level0 \
@@ -50,7 +52,6 @@ GPT_ARGS="
     --use-fused-rmsnorm \
     --sequence-parallel \
     --use-distributed-optimizer \
-    --overlap-grad-reduce \
     --num-layers 8 \
     --noop-layers 7 \
     --manual-gc \
-- 
Gitee