diff --git a/examples/mindspore/chatglm3/pretrain_chatglm3_6B_32k.sh b/examples/mindspore/chatglm3/pretrain_chatglm3_6B_32k.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fba4236040d2547ae43eab30dbf37ca2ca90bd5c
--- /dev/null
+++ b/examples/mindspore/chatglm3/pretrain_chatglm3_6B_32k.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+NPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6001
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$((NPUS_PER_NODE*$NNODES))
+
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+
+TP=1
+PP=1
+CP=8
+MBS=1
+GBS=32
+SEQ_LEN=32768
+CP_ALGO=ulysses_cp_algo
+
+DISTRIBUTED_ARGS="
+    --master_addr $MASTER_ADDR \
+    --node_rank $NODE_RANK \
+    --worker_num $WORLD_SIZE \
+    --local_worker_num $NPUS_PER_NODE \
+    --master_port $MASTER_PORT \
+    --log_dir=msrun_log \
+    --join=False \
+    --cluster_time_out=300 \
+    --bind_core=True \
+"
+
+GPT_ARGS="
+    --use-mcore-models \
+    --transformer-impl local \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --sequence-parallel \
+    --num-layers 28 \
+    --hidden-size 4096 \
+    --ffn-hidden-size 13696 \
+    --num-attention-heads 32 \
+    --seq-length ${SEQ_LEN} \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --context-parallel-algo ${CP_ALGO} \
+    --context-parallel-size ${CP} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --padded-vocab-size 65024 \
+    --make-vocab-size-divisible-by 1 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --disable-bias-linear \
+    --add-qkv-bias \
+    --position-embedding-type rope \
+    --no-rope-fusion \
+    --use-distributed-optimizer \
+    --use-glm-rope \
+    --rotary-percent 0.5 \
+    --use-flash-attn \
+    --use-fused-rmsnorm \
+    --use-fused-swiglu \
+    --normalization RMSNorm \
+    --swiglu \
+    --no-create-attention-mask-in-dataloader \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --lr 1e-6 \
+    --train-iters 2000 \
+    --lr-decay-style cosine \
+    --untie-embeddings-and-output-weights \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1e-8 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --initial-loss-scale 4096 \
+    --adam-beta2 0.95 \
+    --no-gradient-accumulation-fusion \
+    --no-load-optim \
+    --no-load-rng \
+    --fp16 \
+    --kv-head-repeat-before-uly-alltoall \
+    --use-cp-send-recv-overlap \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 949,50,1 \
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 2000 \
+    --eval-interval 2000 \
+    --eval-iters 10 \
+    --save $CKPT_SAVE_DIR \
+    --load $CKPT_LOAD_DIR \
+"
+
+msrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --ai-framework mindspore \
+    | tee logs/ms_train_mcore_chatglm3_6B_32K.log
\ No newline at end of file
diff --git a/examples/mindspore/chatglm3/pretrain_chatglm3_6B_64k.sh b/examples/mindspore/chatglm3/pretrain_chatglm3_6B_64k.sh
new file mode 100644
index 0000000000000000000000000000000000000000..691b40b4ec77fc978780908045c6e42883d2e36a
--- /dev/null
+++ b/examples/mindspore/chatglm3/pretrain_chatglm3_6B_64k.sh
@@ -0,0 +1,137 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Please enter the IP addresses of your servers.
+IPs=('IP1' 'IP2')
+LOCAL_HOST=`hostname -I|awk -F " " '{print$1}'`
+echo LOCAL_HOST $LOCAL_HOST
+
+NPUS_PER_NODE=8
+MASTER_ADDR=${IPs[0]}
+MASTER_PORT=6010
+NNODES=${#IPs[@]}
+NODE_RANK=""
+WORLD_SIZE=$((NPUS_PER_NODE*$NNODES))
+
+for i in "${!IPs[@]}";
+do
+    if [ "$LOCAL_HOST" == "${IPs[$i]}" ];
+    then
+        echo "Node Rank : ${i}"
+        NODE_RANK=$i
+        break
+    fi
+done
+
+
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+
+TP=1
+PP=1
+CP=16
+MBS=1
+GBS=64
+SEQ_LEN=65536
+CP_ALGO=hybrid_cp_algo
+
+DISTRIBUTED_ARGS="
+    --master_addr $MASTER_ADDR \
+    --node_rank $NODE_RANK \
+    --worker_num $WORLD_SIZE \
+    --local_worker_num $NPUS_PER_NODE \
+    --master_port $MASTER_PORT \
+    --log_dir=msrun_log \
+    --join=False \
+    --cluster_time_out=300 \
+    --bind_core=True \
+"
+
+
+GPT_ARGS="
+    --use-mcore-models \
+    --transformer-impl local \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --sequence-parallel \
+    --num-layers 28 \
+    --hidden-size 4096 \
+    --ffn-hidden-size 13696 \
+    --num-attention-heads 32 \
+    --ulysses-degree-in-cp 8 \
+    --seq-length ${SEQ_LEN} \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --context-parallel-algo ${CP_ALGO} \
+    --context-parallel-size ${CP} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --padded-vocab-size 65024 \
+    --make-vocab-size-divisible-by 1 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --disable-bias-linear \
+    --add-qkv-bias \
+    --position-embedding-type rope \
+    --no-rope-fusion \
+    --use-distributed-optimizer \
+    --use-glm-rope \
+    --rotary-percent 0.5 \
+    --use-flash-attn \
+    --use-fused-rmsnorm \
+    --use-fused-swiglu \
+    --normalization RMSNorm \
+    --swiglu \
+    --no-create-attention-mask-in-dataloader \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --lr 1e-6 \
+    --train-iters 2000 \
+    --lr-decay-style cosine \
+    --untie-embeddings-and-output-weights \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1e-8 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --initial-loss-scale 512 \
+    --adam-beta2 0.95 \
+    --no-gradient-accumulation-fusion \
+    --no-load-optim \
+    --no-load-rng \
+    --fp16 \
+    --num-workers 1 \
+    --kv-head-repeat-before-uly-alltoall \
+    --no-shared-storage \
+    --use-cp-send-recv-overlap \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 949,50,1
+"
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 2000 \
+    --eval-interval 2000 \
+    --eval-iters 10 \
+    --save $CKPT_SAVE_DIR \
+    --load $CKPT_LOAD_DIR \
+"
+
+
+msrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --ai-framework mindspore \
+    | tee logs/ms_train_mcore_chatglm3_6B_64K.log
\ No newline at end of file
diff --git a/examples/mindspore/chatglm3/pretrain_chatglm3_6B_8k.sh b/examples/mindspore/chatglm3/pretrain_chatglm3_6B_8k.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2550b2a71541e5f79af4aa0233b09f3ebc0643c1
--- /dev/null
+++ b/examples/mindspore/chatglm3/pretrain_chatglm3_6B_8k.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+NPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6021
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$((NPUS_PER_NODE*$NNODES))
+
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+
+TP=1
+PP=2
+MBS=1
+GBS=128
+SEQ_LEN=8192
+
+DISTRIBUTED_ARGS="
+    --master_addr $MASTER_ADDR \
+    --node_rank $NODE_RANK \
+    --worker_num $WORLD_SIZE \
+    --local_worker_num $NPUS_PER_NODE \
+    --master_port $MASTER_PORT \
+    --log_dir=msrun_log \
+    --join=False \
+    --cluster_time_out=300 \
+    --bind_core=True \
+"
+
+
+GPT_ARGS="
+    --use-mcore-models \
+    --transformer-impl local \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --sequence-parallel \
+    --num-layers 28 \
+    --hidden-size 4096 \
+    --ffn-hidden-size 13696 \
+    --num-attention-heads 32 \
+    --seq-length ${SEQ_LEN} \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --padded-vocab-size 65024 \
+    --make-vocab-size-divisible-by 1 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --disable-bias-linear \
+    --add-qkv-bias \
+    --position-embedding-type rope \
+    --no-rope-fusion \
+    --use-glm-rope \
+    --rotary-percent 0.5 \
+    --normalization RMSNorm \
+    --use-fused-rmsnorm \
+    --swiglu \
+    --use-fused-swiglu \
+    --use-flash-attn \
+    --use-distributed-optimizer \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --lr 1e-6 \
+    --train-iters 2000 \
+    --lr-decay-style cosine \
+    --untie-embeddings-and-output-weights \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1e-8 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --initial-loss-scale 4096 \
+    --adam-beta2 0.95 \
+    --no-gradient-accumulation-fusion \
+    --no-load-optim \
+    --no-load-rng \
+    --fp16 \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 2000 \
+    --eval-interval 2000 \
+    --eval-iters 10 \
+    --save $CKPT_SAVE_DIR \
+    --load $CKPT_LOAD_DIR \
+"
+
+msrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --ai-framework mindspore \
+    | tee logs/ms_train_mcore_chatglm3_6B_8K.log
\ No newline at end of file