From f782a9f6d92df69409fdfbc3bf11a7ac6c55aaf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Sat, 15 Mar 2025 15:02:34 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8E=BB=E6=8E=89npu=20profiling=E9=87=87?= =?UTF-8?q?=E9=9B=86=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit requirements.txt添加ninja,加快mmcv编译安装 --- model_examples/BEVDet/BEVDet.patch | 402 ++++++++++++++++++++++++- model_examples/BEVDet/requirements.txt | 1 + 2 files changed, 390 insertions(+), 13 deletions(-) diff --git a/model_examples/BEVDet/BEVDet.patch b/model_examples/BEVDet/BEVDet.patch index 280e31d2..b0a57fb6 100644 --- a/model_examples/BEVDet/BEVDet.patch +++ b/model_examples/BEVDet/BEVDet.patch @@ -124,19 +124,6 @@ index 4d97026..be10ecd 100644 model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner -diff --git a/mmdet3d/core/hook/__init__.py b/mmdet3d/core/hook/__init__.py -index 0b85670..168b207 100644 ---- a/mmdet3d/core/hook/__init__.py -+++ b/mmdet3d/core/hook/__init__.py -@@ -3,6 +3,7 @@ from .ema import MEGVIIEMAHook - from .utils import is_parallel - from .sequentialcontrol import SequentialControlHook - from .syncbncontrol import SyncbnControlHook -+from .profiler_hook_npu import ProfilerHookNPU - - __all__ = ['MEGVIIEMAHook', 'is_parallel', 'SequentialControlHook', -- 'SyncbnControlHook'] -+ 'SyncbnControlHook', 'ProfilerHookNPU'] diff --git a/mmdet3d/datasets/pipelines/loading.py b/mmdet3d/datasets/pipelines/loading.py index b9357ff..3f23888 100644 --- a/mmdet3d/datasets/pipelines/loading.py @@ -608,6 +595,395 @@ index bda8bfe..9015b9e 100644 scores = scores.permute(0, 2, 3, 1) # (B, N, K, M) +diff --git a/test/env_npu.sh b/test/env_npu.sh +new file mode 100644 +index 0000000..5c16419 +--- /dev/null ++++ b/test/env_npu.sh +@@ -0,0 +1,51 @@ ++#!/bin/bash ++CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info' ++ ++if [ -f $CANN_INSTALL_PATH_CONF ]; then ++ CANN_INSTALL_PATH=$(cat
$CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2) ++else ++ CANN_INSTALL_PATH="/usr/local/Ascend" ++fi ++ ++if [ -d ${CANN_INSTALL_PATH}/ascend-toolkit/latest ]; then ++ source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh ++else ++ source ${CANN_INSTALL_PATH}/nnae/set_env.sh ++fi ++ ++ ++#Print host logs to stdout, 0-off/1-on ++export ASCEND_SLOG_PRINT_TO_STDOUT=0 ++#Default log level, 0-debug/1-info/2-warning/3-error ++export ASCEND_GLOBAL_LOG_LEVEL=3 ++#Event log switch, 0-off/1-on ++export ASCEND_GLOBAL_EVENT_ENABLE=0 ++#Whether to enable the task queue, 0-off/1-on ++export TASK_QUEUE_ENABLE=2 ++#Whether to enable PTCopy, 0-off/1-on ++export PTCOPY_ENABLE=1 ++#Whether to enable the combined flag, 0-off/1-on ++export COMBINED_ENABLE=1 ++#Whether special scenarios need recompilation; no need to modify ++export DYNAMIC_OP="ADD#MUL" ++#HCCL whitelist switch, 1-off/0-on ++export HCCL_WHITELIST_DISABLE=1 ++export HCCL_IF_IP=$(hostname -I |awk '{print $1}') ++ ++#Enable CPU core binding ++export CPU_AFFINITY_CONF=1 ++ ++export OMP_NUM_THREADS=16 ++export MKL_NUM_THREADS=16 ++ ++#Set the device-side log level to error ++msnpureport -g error -d 0 ++msnpureport -g error -d 1 ++msnpureport -g error -d 2 ++msnpureport -g error -d 3 ++msnpureport -g error -d 4 ++msnpureport -g error -d 5 ++msnpureport -g error -d 6 ++msnpureport -g error -d 7 ++#Disable device-side event logs ++msnpureport -e disable +diff --git a/test/train_1p.sh b/test/train_1p.sh +new file mode 100644 +index 0000000..b726985 +--- /dev/null ++++ b/test/train_1p.sh +@@ -0,0 +1,167 @@ ++#!/bin/bash ++ ++#Current path ++cur_path=`pwd` ++# NPU device id used for training ++device_id=0 ++ ++#Collective communication parameters ++export RANK_SIZE=1 ++export JOB_ID=10087 ++RANK_ID_START=0 ++ ++performance=0 ++ ++#Basic parameters ++batch_size=1 ++#Number of training epochs ++max_epochs=24 ++ ++ ++# Help message ++if [[ $1 == --help || $1 == -h ]];then ++ echo "usage:./train_1p.sh " ++ echo " " ++ echo "parameter explain: ++ --py_config train config ++ --performance switch to performance mode when != 0 ++ --work_dir set output dir for training ++ -h/--help show help message ++ " ++ exit 1 ++fi ++ ++#Argument parsing ++for para in $* ++do ++ if [[ $para == --py_config* ]];then ++
py_config=`echo ${para#*=}` ++ elif [[ $para == --performance* ]];then ++ performance=`echo ${para#*=}` ++ elif [[ $para == --work_dir* ]];then ++ work_dir=`echo ${para#*=}` ++ fi ++done ++ ++if (($performance!=0)); then ++ max_epochs=1 ++fi ++ ++#Check that py_config was passed ++if [[ $py_config == "" ]];then ++ echo "[Error] para \"py_config\" must be config" ++ exit 1 ++fi ++ ++#Config name ++config_name=`echo $py_config | awk -F "/" '{print $NF}' | awk -F "." '{print $1}'` ++#Network name ++Network=$config_name ++ ++# Check whether a device id is specified: dynamically allocated (ASCEND_DEVICE_ID) or manually set (device_id) ++if [ $ASCEND_DEVICE_ID ];then ++ echo "device id is ${ASCEND_DEVICE_ID}" ++elif [ ${device_id} ];then ++ export ASCEND_DEVICE_ID=${device_id} ++ echo "device id is ${ASCEND_DEVICE_ID}" ++else ++ echo "[Error] device id must be config" ++ exit 1 ++fi ++ ++if [[ $work_dir == "" ]];then ++ work_dir="output/train_1p/$config_name" ++else ++ work_dir="${work_dir}/train_1p/$config_name" ++fi ++ ++test_path_dir=$cur_path ++device_id=$ASCEND_DEVICE_ID ++export ASCEND_RT_VISIBLE_DEVICES=$ASCEND_DEVICE_ID ++if [ !
-d ${test_path_dir}/output ];then ++ mkdir ${test_path_dir}/output ++fi ++if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then ++ rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} ++ mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt ++else ++ mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt ++fi ++ ++ ++#Training start time ++start_time=$(date +%s) ++# Source the environment variables when not running on the platform ++check_etp_flag=`env | grep etp_running_flag` ++etp_flag=`echo ${check_etp_flag#*=}` ++if [ x"${etp_flag}" != x"true" ];then ++ source ${test_path_dir}/test/env_npu.sh ++fi ++ ++ ++#Set environment variables ++echo "Device ID: $ASCEND_DEVICE_ID" ++export RANK_ID=$RANK_ID ++export WORLD_SIZE=1 ++ ++bash ./tools/dist_train.sh ${py_config} ${WORLD_SIZE} \ ++--work-dir ${work_dir} \ ++--cfg-options runner.max_epochs=$max_epochs ++ ++ ++#Training end time ++end_time=$(date +%s) ++e2e_time=$(( $end_time - $start_time )) ++ ++log_file=`find ${work_dir} -regex ".*\.log" | sort -r | head -n 1` ++ ++#Print results ++echo "------------------ Final result ------------------" ++#Output performance FPS ++FPS=`grep -a 'Epoch ' ${log_file}|awk -F " time: " '!/Epoch \[1\]\[1/ {print $NF}'|awk -F " " '{print $1}' | awk '{ sum += $0; n++} END { if (n > 0) print sum / n;}'` ++FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${FPS}'}'` ++#Print ++echo "Final Performance images/sec : $FPS" ++echo "E2E Training Duration sec : $e2e_time" ++ ++#Performance monitoring result summary ++#Training case info ++DeviceType=`uname -m` ++CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'map' ++ ++##Get performance data ++#Throughput ++ActualFPS=${FPS} ++#Training time per iteration ++TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` ++echo "TrainingTime for step(ms) : $TrainingTime" ++ ++#Extract the loss values from ${log_file} into train_${CaseName}_loss.txt ++grep "Epoch " ${log_file}|awk -F "loss: " '!/Epoch \[1\]\[1/ {print $NF}' | awk -F "," '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt ++ ++#Loss of the last iteration ++ActualLoss=`awk 'END {print}'
${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` ++ ++#Write the key information to ${CaseName}.log ++echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "BatchSize = ${batch_size}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++ ++# Accuracy evaluation, skipped in performance mode ++if (($performance==0));then ++ eval_log_file=`echo ${test_path_dir}/output/$ASCEND_DEVICE_ID/eval_${CaseName}.log` ++ chmod +x ./tools/dist_test.sh ++ nohup ./tools/dist_test.sh ${py_config} ${work_dir}/epoch_${max_epochs}_ema.pth 8 --eval mAP >> ${eval_log_file} 2>&1 & ++ wait ++ #Output training accuracy ++ train_accuracy=`grep -a 'mAP: ' ${eval_log_file}|awk 'END {print}'|awk -F " " '{print $NF}'` ++ #Print ++ echo "Final Train Accuracy : ${train_accuracy}" ++ echo "mAP = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++fi +diff --git a/test/train_8p.sh b/test/train_8p.sh +new file mode 100644 +index 0000000..9597828 +--- /dev/null ++++ b/test/train_8p.sh +@@ -0,0 +1,153 @@ ++#!/bin/bash ++ ++#Current path ++cur_path=`pwd` ++ ++#Collective communication parameters ++export RANK_SIZE=8 ++export JOB_ID=10087 ++RANK_ID_START=0 ++ ++performance=0 ++ ++#Basic parameters ++batch_size=8 ++#Number of training epochs ++max_epochs=24 ++ ++# Help message ++if [[ $1 == --help || $1 == -h ]];then ++ echo "usage:./train_8p.sh " ++ echo " " ++ echo "parameter explain: ++
--py_config train config ++ --performance switch to performance mode when != 0 ++ --work_dir set output dir for training ++ -h/--help show help message ++ " ++ exit 1 ++fi ++ ++#Argument parsing ++for para in $* ++do ++ if [[ $para == --py_config* ]];then ++ py_config=`echo ${para#*=}` ++ elif [[ $para == --performance* ]];then ++ performance=`echo ${para#*=}` ++ elif [[ $para == --work_dir* ]];then ++ work_dir=`echo ${para#*=}` ++ fi ++done ++ ++if (($performance!=0)); then ++ max_epochs=1 ++fi ++ ++#Check that py_config was passed ++if [[ $py_config == "" ]];then ++ echo "[Error] para \"py_config\" must be config" ++ exit 1 ++fi ++ ++#Config name ++config_name=`echo $py_config | awk -F "/" '{print $NF}' | awk -F "." '{print $1}'` ++#Network name, same as the config name ++Network=$config_name ++ ++if [[ $work_dir == "" ]];then ++ work_dir="output/train_8p/$config_name" ++else ++ work_dir="${work_dir}/train_8p/$config_name" ++fi ++ ++test_path_dir=$cur_path ++ASCEND_DEVICE_ID=0 ++ ++if [ ! -d ${test_path_dir}/output ];then ++ mkdir ${test_path_dir}/output ++fi ++if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then ++ rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} ++ mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt ++else ++ mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt ++fi ++ ++ ++#Training start time ++start_time=$(date +%s) ++# Source the environment variables when not running on the platform ++check_etp_flag=`env | grep etp_running_flag` ++etp_flag=`echo ${check_etp_flag#*=}` ++if [ x"${etp_flag}" != x"true" ];then ++ source ${test_path_dir}/test/env_npu.sh ++fi ++ ++ ++#Set environment variables ++echo "Device ID: $ASCEND_DEVICE_ID" ++export RANK_ID=$RANK_ID ++export WORLD_SIZE=8 ++ ++bash ./tools/dist_train.sh ${py_config} ${WORLD_SIZE} \ ++--work-dir ${work_dir} \ ++--cfg-options runner.max_epochs=$max_epochs ++ ++ ++#Training end time ++end_time=$(date +%s) ++e2e_time=$(( $end_time - $start_time )) ++ ++log_file=`find ${work_dir} -regex ".*\.log" | sort -r | head -n 1` ++ ++#Print results ++echo "------------------ Final result ------------------" ++#Output performance FPS ++FPS=`grep -a 'Epoch '
${log_file}|awk -F " time: " '!/Epoch \[1\]\[1/ {print $NF}'|awk -F " " '{print $1}' | awk '{ sum += $0; n++} END { if (n > 0) print sum / n;}'` ++FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*8/'${FPS}'}'` ++#Print ++echo "Final Performance images/sec : $FPS" ++echo "E2E Training Duration sec : $e2e_time" ++ ++#Performance monitoring result summary ++#Training case info ++DeviceType=`uname -m` ++CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'map' ++ ++##Get performance data ++#Throughput ++ActualFPS=${FPS} ++#Training time per iteration ++TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000*8/'${FPS}'}'` ++echo "TrainingTime for step(ms) : $TrainingTime" ++ ++#Extract the loss values from ${log_file} into train_${CaseName}_loss.txt ++grep "Epoch " ${log_file}|awk -F "loss: " '!/Epoch \[1\]\[1/ {print $NF}' | awk -F "," '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt ++ ++#Loss of the last iteration ++ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` ++ ++#Write the key information to ${CaseName}.log ++echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "BatchSize = ${batch_size}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++ ++# Accuracy evaluation, skipped in performance mode ++if (($performance==0));then ++ eval_log_file=`echo ${test_path_dir}/output/$ASCEND_DEVICE_ID/eval_${CaseName}.log` ++ chmod +x
./tools/dist_test.sh ++ nohup ./tools/dist_test.sh ${py_config} ${work_dir}/epoch_${max_epochs}_ema.pth 8 --eval mAP >> ${eval_log_file} 2>&1 & ++ wait ++ #Output training accuracy ++ train_accuracy=`grep -a 'mAP: ' ${eval_log_file}|awk 'END {print}'|awk -F " " '{print $NF}'` ++ #Print ++ echo "Final Train Accuracy : ${train_accuracy}" ++ echo "mAP = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log ++fi diff --git a/tests/test_utils/test_box3d.py b/tests/test_utils/test_box3d.py index 69d8b31..5149884 100644 --- a/tests/test_utils/test_box3d.py diff --git a/model_examples/BEVDet/requirements.txt b/model_examples/BEVDet/requirements.txt index eb5a4463..bb2413fb 100644 --- a/model_examples/BEVDet/requirements.txt +++ b/model_examples/BEVDet/requirements.txt @@ -23,3 +23,4 @@ absl-py yapf mmdet==2.28.2 mmsegmentation==0.30.0 +ninja \ No newline at end of file -- Gitee