diff --git a/examples/llama_task/README.md b/examples/llama_task/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..aaedcda4432d4ab8964c6d73221b3ba165ba7c38
--- /dev/null
+++ b/examples/llama_task/README.md
@@ -0,0 +1,62 @@
+# Contents
+- [Contents](#contents)
+- [Evaluation and Tasks](#evaluation-and-tasks)
+    - [Datasets](#datasets)
+    - [LLaMA Evaluation](#llama-evaluation)
+    - [Zeroshot Task](#zeroshot-task)
+
+# Evaluation and Tasks
+
+## Datasets
+First, download the evaluation datasets for the [BoolQ](https://storage.googleapis.com/boolq/dev.jsonl), PIQA ([1](https://yonatanbisk.com/piqa/data/valid.jsonl), [2](https://yonatanbisk.com/piqa/data/valid-labels.lst)), and [HellaSwag](https://github.com/rowanz/hellaswag/tree/master/data/hellaswag_val.jsonl) tasks.
+
+## LLaMA Evaluation
+
+We include zero-shot example scripts that evaluate LLaMA accuracy on [BoolQ](https://storage.googleapis.com/boolq/dev.jsonl), PIQA ([1](https://yonatanbisk.com/piqa/data/valid.jsonl), [2](https://yonatanbisk.com/piqa/data/valid-labels.lst)), and [HellaSwag](https://github.com/rowanz/hellaswag/tree/master/data/hellaswag_val.jsonl).
+
+For example, you can use the following command to run the BoolQ zero-shot task on a LLaMA-7B model.
+<pre>
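+# Optional: download the BoolQ dev set first if you have not already (URL from
+# the Datasets section above). The target directory here is only an example;
+# any path works, as long as VALID_DATA below points to the downloaded file.
+# wget -P ./boolq/ https://storage.googleapis.com/boolq/dev.jsonl
+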
+WORLD_SIZE=8
+
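+# Launches a single node with 8 processes; with --tensor-model-parallel-size 8
+# and --pipeline-model-parallel-size 1 below, all 8 devices hold one model replica.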
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+TASK="BoolQ"
+VALID_DATA=<boolq dev data path>.jsonl
+CHECKPOINT=<llama 7b checkpoint path>
+
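+# The model shape flags below must match the LLaMA-7B checkpoint
+# (32 layers, hidden size 4096, 32 attention heads, FFN hidden size 11008).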
+python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
+               --task $TASK \
+               --valid-data $VALID_DATA \
+               --tokenizer-type PretrainedFromHF \
+               --tokenizer-name-or-path ./dataset/llama/  \
+               --tokenizer-not-use-fast \
+               --load $CHECKPOINT \
+               --tensor-model-parallel-size 8 \
+               --pipeline-model-parallel-size 1 \
+               --num-layers 32 \
+               --hidden-size 4096 \
+               --ffn-hidden-size 11008 \
+               --num-attention-heads 32 \
+               --micro-batch-size 8 \
+               --seq-length 2048 \
+               --max-position-embeddings 2048 \
+               --log-interval 1 \
+               --layernorm-epsilon 1e-6 \
+               --fp16 \
+               --no-load-optim \
+               --no-load-rng
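+
+# To evaluate PIQA or HellaSwag instead, point VALID_DATA at the corresponding
+# dataset files and change TASK; check ./tasks/main.py for the exact task names it accepts.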
+
+ +## Zeroshot Task + + +The following table shows the NPU and [LLama Paper](https://arxiv.org/abs/2302.13971) accuracy achieved by the Zeroshot task of the Llama model. + +| Model Size | BoolQ | PIQA | HellaSwag | +| :---: | :---: | :---: | :---: | +| 7B | 74.7% \| 76.5% | 78.6% \| 79.8% | 73.9% \| 79.8% | +| 13B | 79.5% \| 78.1% | 80.4% \| 80.1% | 77.3% \| 80.1% | +| 33B | 83.1% \| 83.1% | 81.7% \| 82.3% | 83.0% \| 82.3% | +| 65B | 85.5% \| 85.3% | 81.2% \| 82.8% | 82.3% \| 82.8% | diff --git a/examples/llama_task/evaluate_llama_zeroshot_7b.sh b/examples/llama_task/evaluate_llama_zeroshot_7b.sh index 3b1f880d373ae252426d79cfe9f047b0b7aff90c..27b4897d148cc3b0e4bd753d34b495e9c5789e12 100644 --- a/examples/llama_task/evaluate_llama_zeroshot_7b.sh +++ b/examples/llama_task/evaluate_llama_zeroshot_7b.sh @@ -29,7 +29,6 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ --ffn-hidden-size 11008 \ --num-attention-heads 32 \ --micro-batch-size 8 \ - --global-batch-size 16 \ --seq-length 2048 \ --max-position-embeddings 2048 \ --log-interval 1 \ diff --git a/tasks/zeroshot/evaluate_llama.py b/tasks/zeroshot/evaluate_llama.py index fd1d78af831ff6ed62f6cf269a084d973f41ceee..bd3b10599cf763520f4ccc687b37465c9c5c6838 100644 --- a/tasks/zeroshot/evaluate_llama.py +++ b/tasks/zeroshot/evaluate_llama.py @@ -20,7 +20,7 @@ import math import torch from ascendspeed import get_args -from ascendspeed import print_rank_0, is_last_rank +from ascendspeed import print_rank_0, is_last_rank, print_rank_last from ascendspeed import get_tokenizer from ascendspeed import mpu from ascendspeed.checkpointing import load_checkpoint @@ -215,7 +215,8 @@ def custom_forwrad(batch, model): def evaluate(data_loader, model, eval_metric): """Evaluation.""" args = get_args() - + micro_batch_size = args.orig_micro_batch_size if hasattr(args, "orig_micro_batch_size") \ + else args.micro_batch_Size # Turn on evaluation mode which disables dropout. model.eval() total_output = 0.0 @@ -223,8 +224,8 @@ def evaluate(data_loader, model, eval_metric): # For all the batches in the dataset. for iteration, batch in enumerate(data_loader): if iteration % args.log_interval == 0: - print_rank_0(f'> working on iteration: {iteration}') - print_rank_0(f"> corrcet percent: {total_output / max(1, (iteration * args.micro_batch_size))}") + print_rank_last(f'> working on iteration: {iteration}') + print_rank_last(f"> corrcet percent: {total_output / max(1, (iteration * micro_batch_size))}") # Forward evaluation. if eval_metric: output = forward_step(batch, model, eval_metric) @@ -315,6 +316,9 @@ def main(): dataset = build_dataset(args.task) dataloader = build_data_loader(dataset, args.micro_batch_size, args.num_workers, drop_last=False) + if hasattr(dataset, "class_num"): + args.orig_micro_batch_size = args.micro_batch_size + args.micro_batch_size *= dataset.class_num # Run evaluation. evaluate_and_print_results(args.task, dataloader, model, eval_metric)