diff --git a/examples/llama_task/README.md b/examples/llama_task/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..aaedcda4432d4ab8964c6d73221b3ba165ba7c38
--- /dev/null
+++ b/examples/llama_task/README.md
@@ -0,0 +1,62 @@
+# Contents
+- [Contents](#contents)
+- [Evaluation and Tasks](#evaluation-and-tasks)
+    - [Datasets](#datasets)
+    - [LLaMA Evaluation](#llama-evaluation)
+    - [Zeroshot Task](#zeroshot-task)
+
+# Evaluation and Tasks
+
+## Datasets
+First, download the evaluation datasets for the [BoolQ](https://storage.googleapis.com/boolq/dev.jsonl), PIQA ([1](https://yonatanbisk.com/piqa/data/valid.jsonl), [2](https://yonatanbisk.com/piqa/data/valid-labels.lst)), and [HellaSwag](https://github.com/rowanz/hellaswag/tree/master/data/hellaswag_val.jsonl) tasks.
+
+## LLaMA Evaluation
+
+We include zero-shot example scripts that evaluate LLaMA accuracy on [BoolQ](https://storage.googleapis.com/boolq/dev.jsonl), PIQA ([1](https://yonatanbisk.com/piqa/data/valid.jsonl), [2](https://yonatanbisk.com/piqa/data/valid-labels.lst)), and [HellaSwag](https://github.com/rowanz/hellaswag/tree/master/data/hellaswag_val.jsonl).
+
+For example, you can use the following command to run the BoolQ zero-shot task on a LLaMA-7B model.
+<pre>
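+# Optional: download the BoolQ dev set first if you have not already (URL from
+# the Datasets section above). The target directory here is only an example;
+# any path works, as long as VALID_DATA below points to the downloaded file.
+# wget -P ./boolq/ https://storage.googleapis.com/boolq/dev.jsonl
+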
+WORLD_SIZE=8
+
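+# Launches a single node with 8 processes; with --tensor-model-parallel-size 8
+# and --pipeline-model-parallel-size 1 below, all 8 devices hold one model replica.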
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+TASK="BoolQ"
+VALID_DATA=<boolq dev data path>.jsonl
+CHECKPOINT=<llama 7b checkpoint path>
+
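+# The model shape flags below must match the LLaMA-7B checkpoint
+# (32 layers, hidden size 4096, 32 attention heads, FFN hidden size 11008).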
+python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
+               --task $TASK \
+               --valid-data $VALID_DATA \
+               --tokenizer-type PretrainedFromHF \
+               --tokenizer-name-or-path ./dataset/llama/  \
+               --tokenizer-not-use-fast \
+               --load $CHECKPOINT \
+               --tensor-model-parallel-size 8 \
+               --pipeline-model-parallel-size 1 \
+               --num-layers 32 \
+               --hidden-size 4096 \
+               --ffn-hidden-size 11008 \
+               --num-attention-heads 32 \
+               --micro-batch-size 8 \
+               --seq-length 2048 \
+               --max-position-embeddings 2048 \
+               --log-interval 1 \
+               --layernorm-epsilon 1e-6 \
+               --fp16 \
+               --no-load-optim \
+               --no-load-rng
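+
+# To evaluate PIQA or HellaSwag instead, point VALID_DATA at the corresponding
+# dataset files and change TASK; check ./tasks/main.py for the exact task names it accepts.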
+
+ +## Zeroshot Task + + +The following table shows the NPU and [LLama Paper](https://arxiv.org/abs/2302.13971) accuracy achieved by the Zeroshot task of the Llama model. + +| Model Size | BoolQ | PIQA | HellaSwag | +| :---: | :---: | :---: | :---: | +| 7B | 74.7% \| 76.5% | 78.6% \| 79.8% | 73.9% \| 79.8% | +| 13B | 79.5% \| 78.1% | 80.4% \| 80.1% | 77.3% \| 80.1% | +| 33B | 83.1% \| 83.1% | 81.7% \| 82.3% | 83.0% \| 82.3% | +| 65B | 85.5% \| 85.3% | 81.2% \| 82.8% | 82.3% \| 82.8% | diff --git a/examples/llama_task/evaluate_llama_zeroshot_7b.sh b/examples/llama_task/evaluate_llama_zeroshot_7b.sh index 3b1f880d373ae252426d79cfe9f047b0b7aff90c..27b4897d148cc3b0e4bd753d34b495e9c5789e12 100644 --- a/examples/llama_task/evaluate_llama_zeroshot_7b.sh +++ b/examples/llama_task/evaluate_llama_zeroshot_7b.sh @@ -29,7 +29,6 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ --ffn-hidden-size 11008 \ --num-attention-heads 32 \ --micro-batch-size 8 \ - --global-batch-size 16 \ --seq-length 2048 \ --max-position-embeddings 2048 \ --log-interval 1 \ diff --git a/tasks/zeroshot/evaluate_llama.py b/tasks/zeroshot/evaluate_llama.py index fd1d78af831ff6ed62f6cf269a084d973f41ceee..bd3b10599cf763520f4ccc687b37465c9c5c6838 100644 --- a/tasks/zeroshot/evaluate_llama.py +++ b/tasks/zeroshot/evaluate_llama.py @@ -20,7 +20,7 @@ import math import torch from ascendspeed import get_args -from ascendspeed import print_rank_0, is_last_rank +from ascendspeed import print_rank_0, is_last_rank, print_rank_last from ascendspeed import get_tokenizer from ascendspeed import mpu from ascendspeed.checkpointing import load_checkpoint @@ -215,7 +215,8 @@ def custom_forwrad(batch, model): def evaluate(data_loader, model, eval_metric): """Evaluation.""" args = get_args() - + micro_batch_size = args.orig_micro_batch_size if hasattr(args, "orig_micro_batch_size") \ + else args.micro_batch_Size # Turn on evaluation mode which disables dropout. model.eval() total_output = 0.0 @@ -223,8 +224,8 @@ def evaluate(data_loader, model, eval_metric): # For all the batches in the dataset. for iteration, batch in enumerate(data_loader): if iteration % args.log_interval == 0: - print_rank_0(f'> working on iteration: {iteration}') - print_rank_0(f"> corrcet percent: {total_output / max(1, (iteration * args.micro_batch_size))}") + print_rank_last(f'> working on iteration: {iteration}') + print_rank_last(f"> corrcet percent: {total_output / max(1, (iteration * micro_batch_size))}") # Forward evaluation. if eval_metric: output = forward_step(batch, model, eval_metric) @@ -315,6 +316,9 @@ def main(): dataset = build_dataset(args.task) dataloader = build_data_loader(dataset, args.micro_batch_size, args.num_workers, drop_last=False) + if hasattr(dataset, "class_num"): + args.orig_micro_batch_size = args.micro_batch_size + args.micro_batch_size *= dataset.class_num # Run evaluation. evaluate_and_print_results(args.task, dataloader, model, eval_metric)