From 3484aab6b3f6da67d8718230fee504713a1103ca Mon Sep 17 00:00:00 2001
From: "hongliang.yuan"
Date: Wed, 26 Mar 2025 15:03:03 +0800
Subject: [PATCH] change crlf to lf

---
 .../vqmivc/pytorch/Dataset/README.md | 2 +-
 .../vqmivc/pytorch/convert_example.py | 244 +-
 .../vqmivc/pytorch/mi_estimators.py | 402 +--
 .../vqmivc/pytorch/requirements.txt | 16 +-
 .../speech_synthesis/vqmivc/pytorch/train.py | 822 ++---
 .../vqmivc/pytorch/vocoder/README.md | 2 +-
 .../waveglow/pytorch/tacotron2/loss_scaler.py | 262 +-
 .../configs/bevformer/bevformer_base.py | 514 +--
 .../configs/bevformer/bevformer_small.py | 536 +--
 .../configs/bevformer/bevformer_tiny.py | 540 +--
 .../bevformer_fp16/bevformer_tiny_fp16.py | 544 +--
 .../mmdet3d_plugin/bevformer/__init__.py | 12 +-
 .../mmdet3d_plugin/bevformer/apis/test.py | 326 +-
 .../bevformer/dense_heads/bevformer_head.py | 1046 +++---
 .../bevformer/detectors/__init__.py | 2 +-
 .../bevformer/detectors/bevformer.py | 586 ++--
 .../bevformer/detectors/bevformer_fp16.py | 178 +-
 .../bevformer/hooks/custom_hooks.py | 28 +-
 .../bevformer/modules/__init__.py | 12 +-
 .../modules/custom_base_transformer_layer.py | 520 +--
 .../bevformer/modules/decoder.py | 690 ++--
 .../bevformer/modules/encoder.py | 806 ++---
 .../multi_scale_deformable_attn_function.py | 326 +-
 .../modules/spatial_cross_attention.py | 798 ++---
 .../modules/temporal_self_attention.py | 544 +--
 .../bevformer/modules/transformer.py | 578 +--
 .../bevformer/runner/epoch_based_runner.py | 192 +-
 .../datasets/nuscenes_mono_dataset.py | 1552 ++++----
 .../datasets/pipelines/formating.py | 76 +-
 .../datasets/samplers/sampler.py | 14 +-
 .../tools/analysis_tools/get_params.py | 20 +-
 .../pytorch/tools/analysis_tools/visual.py | 954 ++---
 .../bevformer/pytorch/tools/fp16/train.py | 544 +--
 .../pytorch/det3d/core/bbox/box_np_ops.py | 1606 ++++-----
 .../pytorch/det3d/core/bbox/geometry.py | 914 ++---
 .../pytorch/det3d/core/sampler/preprocess.py | 1952 +++++------
 .../det3d/ops/point_cloud/point_cloud_ops.py | 404 +--
 .../centerpoint/pytorch/numba/docs/make.bat | 484 +--
 .../centerpoint/pytorch/tools/create_data.py | 70 +-
 cv/detection/ssd/tensorflow/ci/prepare.sh | 68 +-
 .../arcface/pytorch/arcface.py | 282 +-
 .../arcface/pytorch/eval_LFW.py | 130 +-
 .../arcface/pytorch/nets/arcface.py | 180 +-
 .../arcface/pytorch/nets/arcface_training.py | 92 +-
 .../arcface/pytorch/nets/iresnet.py | 366 +-
 .../arcface/pytorch/nets/mobilefacenet.py | 262 +-
 .../arcface/pytorch/nets/mobilenet.py | 172 +-
 .../arcface/pytorch/predict.py | 86 +-
 .../arcface/pytorch/summary.py | 56 +-
 .../arcface/pytorch/txt_annotation.py | 54 +-
 .../arcface/pytorch/utils/callback.py | 170 +-
 .../arcface/pytorch/utils/dataloader.py | 210 +-
 .../arcface/pytorch/utils/utils_fit.py | 252 +-
 .../arcface/pytorch/utils/utils_metrics.py | 314 +-
 .../cosface/pytorch/dataset.py | 92 +-
 cv/face_recognition/cosface/pytorch/layer.py | 214 +-
 .../cosface/pytorch/lfw_eval.py | 234 +-
 cv/face_recognition/cosface/pytorch/main.py | 416 +--
 .../cosface/pytorch/txt_annotation.py | 78 +-
 .../facenet/pytorch/eval_LFW.py | 118 +-
 .../facenet/pytorch/facenet.py | 222 +-
 .../facenet/pytorch/nets/facenet.py | 206 +-
 .../facenet/pytorch/nets/facenet_training.py | 178 +-
 .../pytorch/nets/inception_resnetv1.py | 464 +--
 .../facenet/pytorch/nets/mobilenet.py | 144 +-
 .../facenet/pytorch/predict.py | 48 +-
 .../facenet/pytorch/requirements.txt | 18 +-
 .../facenet/pytorch/summary.py | 58 +-
 .../facenet/pytorch/txt_annotation.py | 50 +-
 .../facenet/pytorch/utils/utils_fit.py | 280 +-
 .../facenet/pytorch/utils/utils_metrics.py | 314 +-
 .../deep_sort/pytorch/evaluate.py | 30 +-
 .../deep_sort/pytorch/model.py | 208 +-
 .../deep_sort/pytorch/test.py | 152 +-
 .../src/lib/tracking_utils/kalman_filter.py | 538 +--
 .../alphapose/datasets/coco_wholebody.py | 322 +-
 .../alphapose/datasets/coco_wholebody_det.py | 228 +-
 .../alphapose/datasets/halpe_136_det.py | 228 +-
 .../alphapose/datasets/halpe_26_det.py | 216 +-
 .../pytorch/alphapose/models/hardnet.py | 1140 +++---
 .../256x192_res50_lr1e-3_2x-regression.yaml | 138 +-
 .../alphapose/pytorch/detector/effdet_cfg.py | 16 +-
 .../detector/tracker/utils/kalman_filter.py | 456 +--
 .../alphapose/pytorch/detector/tracker_cfg.py | 16 +-
 .../alphapose/pytorch/detector/yolo_cfg.py | 18 +-
 .../detector/yolov5/models/utils/__init__.py | 74 +-
 .../pytorch/detector/yolov5/utils/__init__.py | 74 +-
 .../pytorch/scripts/trainval/demo_api.py | 730 ++--
 .../pytorch/trackers/ReidModels/osnet_ain.py | 1078 +++---
 .../psroi_pooling/src/psroi_pooling_cuda.c | 136 +-
 .../alphapose/pytorch/trackers/tracker_cfg.py | 20 +-
 .../trackers/tracking/utils/kalman_filter.py | 456 +--
 .../alphapose/pytorch/trackers/utils/io.py | 222 +-
 .../pytorch/trackers/utils/kalman_filter.py | 556 +--
 .../alphapose/pytorch/trackers/utils/log.py | 36 +-
 .../pytorch/trackers/utils/parse_config.py | 74 +-
 .../alphapose/pytorch/trackers/utils/timer.py | 90 +-
 .../alphapose/pytorch/trackers/utils/utils.py | 1500 ++++----
 .../infer/mxbase/process/src/OpenPose.cpp | 698 ++--
 .../infer/mxbase/process/src/OpenPose.h | 132 +-
 .../src/PostProcess/OpenPoseMindsporePost.cpp | 1182 +++---
 .../src/PostProcess/OpenPoseMindsporePost.h | 162 +-
 .../infer/mxbase/proto/CMakeLists.txt | 96 +-
 .../mxbase/proto/mxpiOpenposeProto.proto | 64 +-
 .../infer/sdk/plugins/CMakeLists.txt | 50 +-
 .../sdk/plugins/MxpiOpenposePostProcess.cpp | 1612 ++++-----
 .../sdk/plugins/MxpiOpenposePostProcess.h | 378 +-
 .../mindspore/infer/sdk/proto/CMakeLists.txt | 94 +-
 .../infer/sdk/proto/mxpiOpenposeProto.proto | 64 +-
 .../ttvsr/pytorch/configs/TTVSR_reds4.py | 226 +-
 .../models/backbones/sr_backbones/ttvsrnet.py | 1430 ++++----
 .../pytorch/mmedit/models/restorers/ttvsr.py | 410 +--
 .../llava-1.5/pytorch/train.sh | 108 +-
 .../preprocess_stc_finetune.py | 96 +-
 .../pytorch/base/model/fp16/loss_scaler.py | 480 +--
 .../Yi-VL-6B/pytorch/yi_vl_6b_full_sft.yaml | 90 +-
 .../Yi-VL-6B/pytorch/yi_vl_6b_lora_sft.yaml | 84 +-
 .../chatglm-6b/pytorch/ptuning/README_bi.md | 100 +-
 .../chatglm-6b/pytorch/ptuning/deepspeed.json | 50 +-
 .../ptuning/deepspeed_no_cpu_offload.json | 42 +-
 nlp/llm/chatglm-6b/pytorch/ptuning/hostfile | 2 +-
 nlp/llm/phi-3/pytorch/FineTrainingScript.py | 146 +-
 .../pytorch/models/configuration_phi3.py | 454 +--
 nlp/llm/phi-3/pytorch/models/modeling_phi3.py | 3120 ++++++++---------
 .../olive-lab/data/data_sample_travel.jsonl | 576 +--
 .../pytorch/qwen2_5-7b_full_sft.yaml | 78 +-
 .../pytorch/qwen2_5-7b_lora_sft.yaml | 78 +-
 .../pytorch/generate_training_data.py | 218 +-
 .../pytorch/dlrm/config/criteo_kaggle.json | 92 +-
 .../dlrm/config/criteo_kaggle_tiny.json | 92 +-
 .../pytorch/dlrm/config/mlperf_10m.limit.json | 92 +-
 .../pytorch/dlrm/config/official_config.json | 94 +-
 .../dlrm/pytorch/dlrm/deprecated_model.py | 592 ++--
 .../dlrm/pytorch/dlrm/nn/__init__.py | 4 +-
 .../dlrm/pytorch/dlrm/nn/functional.py | 56 +-
 .../dlrm/nn/modules/buckle_embedding.py | 86 +-
 .../dlrm/pytorch/dlrm/nn/modules/gather.py | 124 +-
 .../dlrm/pytorch/dlrm/utils/metrics.py | 200 +-
 .../dlrm/pytorch/scripts/split_data.py | 116 +-
.../dlrm/pytorch/setup.py | 82 +- .../dlrm/pytorch/src/gather_gpu.cu | 342 +- .../pytorch/tests/buckle_embedding_test.py | 114 +- .../dlrm/pytorch/tests/dataset_test.py | 342 +- .../dlrm/pytorch/tests/dist_model_test.py | 176 +- .../dlrm/pytorch/tests/metrics_test.py | 82 +- .../dlrm/pytorch/tests/model_test.py | 222 +- toolbox/Fairseq/patch/bart.py | 78 +- toolbox/Fairseq/patch/roberta.py | 80 +- toolbox/MoE-LLaVA/moellava/vis/vis1.py | 338 +- toolbox/MoE-LLaVA/moellava/vis/vis2.py | 244 +- toolbox/MoE-LLaVA/moellava/vis/vis3.py | 362 +- .../pcdet/datasets/custom/custom_dataset.py | 566 +-- 152 files changed, 25297 insertions(+), 25297 deletions(-) diff --git a/audio/speech_synthesis/vqmivc/pytorch/Dataset/README.md b/audio/speech_synthesis/vqmivc/pytorch/Dataset/README.md index c8b06fe8f..d5f854d81 100644 --- a/audio/speech_synthesis/vqmivc/pytorch/Dataset/README.md +++ b/audio/speech_synthesis/vqmivc/pytorch/Dataset/README.md @@ -1 +1 @@ -Put VCTK corpus under this directory. +Put VCTK corpus under this directory. diff --git a/audio/speech_synthesis/vqmivc/pytorch/convert_example.py b/audio/speech_synthesis/vqmivc/pytorch/convert_example.py index c72efdfa3..764d63a86 100644 --- a/audio/speech_synthesis/vqmivc/pytorch/convert_example.py +++ b/audio/speech_synthesis/vqmivc/pytorch/convert_example.py @@ -1,122 +1,122 @@ - -import torch -import numpy as np - - -import soundfile as sf - -from model_encoder import Encoder, Encoder_lf0 -from model_decoder import Decoder_ac -from model_encoder import SpeakerEncoder as Encoder_spk -import os - -import subprocess -from spectrogram import logmelspectrogram -import kaldiio - -import resampy -import pyworld as pw - -import argparse - - -def extract_logmel(wav_path, mean, std, sr=16000): - # wav, fs = librosa.load(wav_path, sr=sr) - wav, fs = sf.read(wav_path) - if fs != sr: - wav = resampy.resample(wav, fs, sr, axis=0) - fs = sr - #wav, _ = librosa.effects.trim(wav, top_db=15) - # duration = len(wav)/fs - assert fs == 16000 - peak = np.abs(wav).max() - if peak > 1.0: - wav /= peak - mel = logmelspectrogram( - x=wav, - fs=fs, - n_mels=80, - n_fft=400, - n_shift=160, - win_length=400, - window='hann', - fmin=80, - fmax=7600, - ) - - mel = (mel - mean) / (std + 1e-8) - tlen = mel.shape[0] - frame_period = 160/fs*1000 - f0, timeaxis = pw.dio(wav.astype('float64'), fs, frame_period=frame_period) - f0 = pw.stonemask(wav.astype('float64'), f0, timeaxis, fs) - f0 = f0[:tlen].reshape(-1).astype('float32') - nonzeros_indices = np.nonzero(f0) - lf0 = f0.copy() - lf0[nonzeros_indices] = np.log(f0[nonzeros_indices]) # for f0(Hz), lf0 > 0 when f0 != 0 - mean, std = np.mean(lf0[nonzeros_indices]), np.std(lf0[nonzeros_indices]) - lf0[nonzeros_indices] = (lf0[nonzeros_indices] - mean) / (std + 1e-8) - return mel, lf0 - - -def convert(args): - src_wav_path = args.source_wav - ref_wav_path = args.reference_wav - - out_dir = args.converted_wav_path - os.makedirs(out_dir, exist_ok=True) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - encoder = Encoder(in_channels=80, channels=512, n_embeddings=512, z_dim=64, c_dim=256) - encoder_lf0 = Encoder_lf0() - encoder_spk = Encoder_spk() - decoder = Decoder_ac(dim_neck=64) - encoder.to(device) - encoder_lf0.to(device) - encoder_spk.to(device) - decoder.to(device) - - checkpoint_path = args.model_path - checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage) - encoder.load_state_dict(checkpoint["encoder"]) - encoder_spk.load_state_dict(checkpoint["encoder_spk"]) - 
decoder.load_state_dict(checkpoint["decoder"]) - - encoder.eval() - encoder_spk.eval() - decoder.eval() - - mel_stats = np.load('./mel_stats/stats.npy') - mean = mel_stats[0] - std = mel_stats[1] - feat_writer = kaldiio.WriteHelper("ark,scp:{o}.ark,{o}.scp".format(o=str(out_dir)+'/feats.1')) - src_mel, src_lf0 = extract_logmel(src_wav_path, mean, std) - ref_mel, _ = extract_logmel(ref_wav_path, mean, std) - src_mel = torch.FloatTensor(src_mel.T).unsqueeze(0).to(device) - src_lf0 = torch.FloatTensor(src_lf0).unsqueeze(0).to(device) - ref_mel = torch.FloatTensor(ref_mel.T).unsqueeze(0).to(device) - out_filename = os.path.basename(src_wav_path).split('.')[0] - with torch.no_grad(): - z, _, _, _ = encoder.encode(src_mel) - lf0_embs = encoder_lf0(src_lf0) - spk_emb = encoder_spk(ref_mel) - output = decoder(z, lf0_embs, spk_emb) - - feat_writer[out_filename+'_converted'] = output.squeeze(0).cpu().numpy() - feat_writer[out_filename+'_source'] = src_mel.squeeze(0).cpu().numpy().T - feat_writer[out_filename+'_reference'] = ref_mel.squeeze(0).cpu().numpy().T - - feat_writer.close() - print('synthesize waveform...') - cmd = ['parallel-wavegan-decode', '--checkpoint', \ - './vocoder/checkpoint-3000000steps.pkl', \ - '--feats-scp', f'{str(out_dir)}/feats.1.scp', '--outdir', str(out_dir)] - subprocess.call(cmd) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--source_wav', '-s', type=str, required=True) - parser.add_argument('--reference_wav', '-r', type=str, required=True) - parser.add_argument('--converted_wav_path', '-c', type=str, default='converted') - parser.add_argument('--model_path', '-m', type=str, required=True) - args = parser.parse_args() - convert(args) + +import torch +import numpy as np + + +import soundfile as sf + +from model_encoder import Encoder, Encoder_lf0 +from model_decoder import Decoder_ac +from model_encoder import SpeakerEncoder as Encoder_spk +import os + +import subprocess +from spectrogram import logmelspectrogram +import kaldiio + +import resampy +import pyworld as pw + +import argparse + + +def extract_logmel(wav_path, mean, std, sr=16000): + # wav, fs = librosa.load(wav_path, sr=sr) + wav, fs = sf.read(wav_path) + if fs != sr: + wav = resampy.resample(wav, fs, sr, axis=0) + fs = sr + #wav, _ = librosa.effects.trim(wav, top_db=15) + # duration = len(wav)/fs + assert fs == 16000 + peak = np.abs(wav).max() + if peak > 1.0: + wav /= peak + mel = logmelspectrogram( + x=wav, + fs=fs, + n_mels=80, + n_fft=400, + n_shift=160, + win_length=400, + window='hann', + fmin=80, + fmax=7600, + ) + + mel = (mel - mean) / (std + 1e-8) + tlen = mel.shape[0] + frame_period = 160/fs*1000 + f0, timeaxis = pw.dio(wav.astype('float64'), fs, frame_period=frame_period) + f0 = pw.stonemask(wav.astype('float64'), f0, timeaxis, fs) + f0 = f0[:tlen].reshape(-1).astype('float32') + nonzeros_indices = np.nonzero(f0) + lf0 = f0.copy() + lf0[nonzeros_indices] = np.log(f0[nonzeros_indices]) # for f0(Hz), lf0 > 0 when f0 != 0 + mean, std = np.mean(lf0[nonzeros_indices]), np.std(lf0[nonzeros_indices]) + lf0[nonzeros_indices] = (lf0[nonzeros_indices] - mean) / (std + 1e-8) + return mel, lf0 + + +def convert(args): + src_wav_path = args.source_wav + ref_wav_path = args.reference_wav + + out_dir = args.converted_wav_path + os.makedirs(out_dir, exist_ok=True) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + encoder = Encoder(in_channels=80, channels=512, n_embeddings=512, z_dim=64, c_dim=256) + encoder_lf0 = Encoder_lf0() + encoder_spk = 
Encoder_spk() + decoder = Decoder_ac(dim_neck=64) + encoder.to(device) + encoder_lf0.to(device) + encoder_spk.to(device) + decoder.to(device) + + checkpoint_path = args.model_path + checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage) + encoder.load_state_dict(checkpoint["encoder"]) + encoder_spk.load_state_dict(checkpoint["encoder_spk"]) + decoder.load_state_dict(checkpoint["decoder"]) + + encoder.eval() + encoder_spk.eval() + decoder.eval() + + mel_stats = np.load('./mel_stats/stats.npy') + mean = mel_stats[0] + std = mel_stats[1] + feat_writer = kaldiio.WriteHelper("ark,scp:{o}.ark,{o}.scp".format(o=str(out_dir)+'/feats.1')) + src_mel, src_lf0 = extract_logmel(src_wav_path, mean, std) + ref_mel, _ = extract_logmel(ref_wav_path, mean, std) + src_mel = torch.FloatTensor(src_mel.T).unsqueeze(0).to(device) + src_lf0 = torch.FloatTensor(src_lf0).unsqueeze(0).to(device) + ref_mel = torch.FloatTensor(ref_mel.T).unsqueeze(0).to(device) + out_filename = os.path.basename(src_wav_path).split('.')[0] + with torch.no_grad(): + z, _, _, _ = encoder.encode(src_mel) + lf0_embs = encoder_lf0(src_lf0) + spk_emb = encoder_spk(ref_mel) + output = decoder(z, lf0_embs, spk_emb) + + feat_writer[out_filename+'_converted'] = output.squeeze(0).cpu().numpy() + feat_writer[out_filename+'_source'] = src_mel.squeeze(0).cpu().numpy().T + feat_writer[out_filename+'_reference'] = ref_mel.squeeze(0).cpu().numpy().T + + feat_writer.close() + print('synthesize waveform...') + cmd = ['parallel-wavegan-decode', '--checkpoint', \ + './vocoder/checkpoint-3000000steps.pkl', \ + '--feats-scp', f'{str(out_dir)}/feats.1.scp', '--outdir', str(out_dir)] + subprocess.call(cmd) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--source_wav', '-s', type=str, required=True) + parser.add_argument('--reference_wav', '-r', type=str, required=True) + parser.add_argument('--converted_wav_path', '-c', type=str, default='converted') + parser.add_argument('--model_path', '-m', type=str, required=True) + args = parser.parse_args() + convert(args) diff --git a/audio/speech_synthesis/vqmivc/pytorch/mi_estimators.py b/audio/speech_synthesis/vqmivc/pytorch/mi_estimators.py index c1c54b449..27e081557 100644 --- a/audio/speech_synthesis/vqmivc/pytorch/mi_estimators.py +++ b/audio/speech_synthesis/vqmivc/pytorch/mi_estimators.py @@ -1,201 +1,201 @@ -''' -Modified from: https://github.com/Linear95/CLUB -''' - -import torch -import torch.nn as nn - -class CLUB(nn.Module): # CLUB: Mutual Information Contrastive Learning Upper Bound - ''' - This class provides the CLUB estimation to I(X,Y) - Method: - mi_est() : provides the estimation with input samples - loglikeli() : provides the log-likelihood of the approximation q(Y|X) with input samples - Arguments: - x_dim, y_dim : the dimensions of samples from X, Y respectively - hidden_size : the dimension of the hidden layer of the approximation network q(Y|X) - x_samples, y_samples : samples from X and Y, having shape [sample_size, x_dim/y_dim] - ''' - def __init__(self, x_dim, y_dim, hidden_size): - super(CLUB, self).__init__() - # p_mu outputs mean of q(Y|X) - self.p_mu = nn.Sequential(nn.Linear(x_dim, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, y_dim)) - # p_logvar outputs log of variance of q(Y|X) - self.p_logvar = nn.Sequential(nn.Linear(x_dim, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, y_dim), - nn.Tanh()) - # self.p_logvar = nn.Sequential(nn.Linear(x_dim, hidden_size//2), - # nn.ReLU(), - # 
nn.Linear(hidden_size//2, y_dim)) - - def get_mu_logvar(self, x_samples): - mu = self.p_mu(x_samples) - logvar = self.p_logvar(x_samples) - return mu, logvar - - def mi_est(self, x_samples, y_samples): - mu, logvar = self.get_mu_logvar(x_samples) - - # log of conditional probability of positive sample pairs - positive = - (mu - y_samples)**2 /2./logvar.exp() - - prediction_1 = mu.unsqueeze(1) # shape [nsample,1,dim] - y_samples_1 = y_samples.unsqueeze(0) # shape [1,nsample,dim] - - # log of conditional probability of negative sample pairs - negative = - ((y_samples_1 - prediction_1)**2).mean(dim=1)/2./logvar.exp() - - return (positive.sum(dim = -1) - negative.sum(dim = -1)).mean() - - def loglikeli(self, x_samples, y_samples): # unnormalized loglikelihood - mu, logvar = self.get_mu_logvar(x_samples) - return (-(mu - y_samples)**2 /logvar.exp()-logvar).sum(dim=1).mean(dim=0) - - - -class CLUBSample(nn.Module): # Sampled version of the CLUB estimator - def __init__(self, x_dim, y_dim, hidden_size): - super(CLUBSample, self).__init__() - self.p_mu = nn.Sequential(nn.Linear(x_dim, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, y_dim)) - - self.p_logvar = nn.Sequential(nn.Linear(x_dim, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, y_dim), - nn.Tanh()) - - def get_mu_logvar(self, x_samples): - mu = self.p_mu(x_samples) - logvar = self.p_logvar(x_samples) - return mu, logvar - - - def loglikeli(self, x_samples, y_samples): - mu, logvar = self.get_mu_logvar(x_samples) - return (-(mu - y_samples)**2 /logvar.exp()-logvar).sum(dim=1).mean(dim=0) - - - def mi_est(self, x_samples, y_samples): - mu, logvar = self.get_mu_logvar(x_samples) - - sample_size = x_samples.shape[0] - #random_index = torch.randint(sample_size, (sample_size,)).long() - random_index = torch.randperm(sample_size).long() - - positive = - (mu - y_samples)**2 / logvar.exp() - negative = - (mu - y_samples[random_index])**2 / logvar.exp() - upper_bound = (positive.sum(dim = -1) - negative.sum(dim = -1)).mean() - return upper_bound/2. 
- - -class CLUBSample_reshape(nn.Module): # Sampled version of the CLUB estimator - def __init__(self, x_dim, y_dim, hidden_size): - super(CLUBSample_reshape, self).__init__() - self.p_mu = nn.Sequential(nn.Linear(x_dim, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, y_dim)) - - self.p_logvar = nn.Sequential(nn.Linear(x_dim, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, y_dim), - nn.Tanh()) - - def get_mu_logvar(self, x_samples): - mu = self.p_mu(x_samples) - logvar = self.p_logvar(x_samples) - return mu, logvar - - - def loglikeli(self, x_samples, y_samples): - mu, logvar = self.get_mu_logvar(x_samples) - mu = mu.reshape(-1, mu.shape[-1]) # (bs, y_dim) -> (bs, 1, y_dim) -> (bs, T, y_dim) -> (bs*T, y_dim) - logvar = logvar.reshape(-1, logvar.shape[-1]) - y_samples = y_samples.reshape(-1, y_samples.shape[-1]) # (bs, T, y_dim) -> (bs*T, y_dim) - return (-(mu - y_samples)**2 /logvar.exp()-logvar).sum(dim=1).mean(dim=0) - - - def mi_est(self, x_samples, y_samples): - mu, logvar = self.get_mu_logvar(x_samples) - sample_size = mu.shape[0] - random_index = torch.randperm(sample_size).long() - y_shuffle = y_samples[random_index] - mu = mu.reshape(-1, mu.shape[-1]) # (bs, y_dim) -> (bs, 1, y_dim) -> (bs, T, y_dim) -> (bs*T, y_dim) - logvar = logvar.reshape(-1, logvar.shape[-1]) - y_samples = y_samples.reshape(-1, y_samples.shape[-1]) # (bs, T, y_dim) -> (bs*T, y_dim) - y_shuffle = y_shuffle.reshape(-1, y_shuffle.shape[-1]) # (bs, T, y_dim) -> (bs*T, y_dim) - - positive = - (mu - y_samples)**2 / logvar.exp() - negative = - (mu - y_shuffle)**2 / logvar.exp() - upper_bound = (positive.sum(dim = -1) - negative.sum(dim = -1)).mean() - return upper_bound/2. 
- - -class CLUBSample_group(nn.Module): # Sampled version of the CLUB estimator - def __init__(self, x_dim, y_dim, hidden_size): - super(CLUBSample_group, self).__init__() - self.p_mu = nn.Sequential(nn.Linear(x_dim, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, y_dim)) - - self.p_logvar = nn.Sequential(nn.Linear(x_dim, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, hidden_size//2), - nn.ReLU(), - nn.Linear(hidden_size//2, y_dim), - nn.Tanh()) - - def get_mu_logvar(self, x_samples): - mu = self.p_mu(x_samples) - logvar = self.p_logvar(x_samples) - return mu, logvar - - - def loglikeli(self, x_samples, y_samples): # unnormalized loglikelihood - mu, logvar = self.get_mu_logvar(x_samples) # mu/logvar: (bs, y_dim) - mu = mu.unsqueeze(1).expand(-1, y_samples.shape[1], -1).reshape(-1, mu.shape[-1]) # (bs, y_dim) -> (bs, 1, y_dim) -> (bs, T, y_dim) -> (bs*T, y_dim) - logvar = logvar.unsqueeze(1).expand(-1, y_samples.shape[1], -1).reshape(-1, logvar.shape[-1]) - y_samples = y_samples.reshape(-1, y_samples.shape[-1]) # (bs, T, y_dim) -> (bs*T, y_dim) - return (-(mu - y_samples)**2 /logvar.exp()-logvar).sum(dim=1).mean(dim=0) / 2 - - def mi_est(self, x_samples, y_samples): # x_samples: (bs, x_dim); y_samples: (bs, T, y_dim) - mu, logvar = self.get_mu_logvar(x_samples) - - sample_size = x_samples.shape[0] - #random_index = torch.randint(sample_size, (sample_size,)).long() - random_index = torch.randperm(sample_size).long() - - # log of conditional probability of positive sample pairs - mu_exp1 = mu.unsqueeze(1).expand(-1, y_samples.shape[1], -1) # (bs, y_dim) -> (bs, T, y_dim) - # logvar_exp1 = logvar.unqueeze(1).expand(-1, y_samples.shape[1], -1).reshape(-1, logvar.shape[-1]) - positive = - ((mu_exp1 - y_samples)**2).mean(dim=1) / logvar.exp() # mean along T - negative = - ((mu_exp1 - y_samples[random_index])**2).mean(dim=1) / logvar.exp() # mean along T - - return (positive.sum(dim = -1) - negative.sum(dim = -1)).mean() / 2 - - +''' +Modified from: https://github.com/Linear95/CLUB +''' + +import torch +import torch.nn as nn + +class CLUB(nn.Module): # CLUB: Mutual Information Contrastive Learning Upper Bound + ''' + This class provides the CLUB estimation to I(X,Y) + Method: + mi_est() : provides the estimation with input samples + loglikeli() : provides the log-likelihood of the approximation q(Y|X) with input samples + Arguments: + x_dim, y_dim : the dimensions of samples from X, Y respectively + hidden_size : the dimension of the hidden layer of the approximation network q(Y|X) + x_samples, y_samples : samples from X and Y, having shape [sample_size, x_dim/y_dim] + ''' + def __init__(self, x_dim, y_dim, hidden_size): + super(CLUB, self).__init__() + # p_mu outputs mean of q(Y|X) + self.p_mu = nn.Sequential(nn.Linear(x_dim, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, y_dim)) + # p_logvar outputs log of variance of q(Y|X) + self.p_logvar = nn.Sequential(nn.Linear(x_dim, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, y_dim), + nn.Tanh()) + # self.p_logvar = nn.Sequential(nn.Linear(x_dim, hidden_size//2), + # nn.ReLU(), + # nn.Linear(hidden_size//2, y_dim)) + + def get_mu_logvar(self, x_samples): + mu = self.p_mu(x_samples) + logvar = self.p_logvar(x_samples) + return mu, logvar + + def mi_est(self, x_samples, y_samples): + mu, logvar = self.get_mu_logvar(x_samples) + + # log of 
conditional probability of positive sample pairs + positive = - (mu - y_samples)**2 /2./logvar.exp() + + prediction_1 = mu.unsqueeze(1) # shape [nsample,1,dim] + y_samples_1 = y_samples.unsqueeze(0) # shape [1,nsample,dim] + + # log of conditional probability of negative sample pairs + negative = - ((y_samples_1 - prediction_1)**2).mean(dim=1)/2./logvar.exp() + + return (positive.sum(dim = -1) - negative.sum(dim = -1)).mean() + + def loglikeli(self, x_samples, y_samples): # unnormalized loglikelihood + mu, logvar = self.get_mu_logvar(x_samples) + return (-(mu - y_samples)**2 /logvar.exp()-logvar).sum(dim=1).mean(dim=0) + + + +class CLUBSample(nn.Module): # Sampled version of the CLUB estimator + def __init__(self, x_dim, y_dim, hidden_size): + super(CLUBSample, self).__init__() + self.p_mu = nn.Sequential(nn.Linear(x_dim, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, y_dim)) + + self.p_logvar = nn.Sequential(nn.Linear(x_dim, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, y_dim), + nn.Tanh()) + + def get_mu_logvar(self, x_samples): + mu = self.p_mu(x_samples) + logvar = self.p_logvar(x_samples) + return mu, logvar + + + def loglikeli(self, x_samples, y_samples): + mu, logvar = self.get_mu_logvar(x_samples) + return (-(mu - y_samples)**2 /logvar.exp()-logvar).sum(dim=1).mean(dim=0) + + + def mi_est(self, x_samples, y_samples): + mu, logvar = self.get_mu_logvar(x_samples) + + sample_size = x_samples.shape[0] + #random_index = torch.randint(sample_size, (sample_size,)).long() + random_index = torch.randperm(sample_size).long() + + positive = - (mu - y_samples)**2 / logvar.exp() + negative = - (mu - y_samples[random_index])**2 / logvar.exp() + upper_bound = (positive.sum(dim = -1) - negative.sum(dim = -1)).mean() + return upper_bound/2. 
+ + +class CLUBSample_reshape(nn.Module): # Sampled version of the CLUB estimator + def __init__(self, x_dim, y_dim, hidden_size): + super(CLUBSample_reshape, self).__init__() + self.p_mu = nn.Sequential(nn.Linear(x_dim, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, y_dim)) + + self.p_logvar = nn.Sequential(nn.Linear(x_dim, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, y_dim), + nn.Tanh()) + + def get_mu_logvar(self, x_samples): + mu = self.p_mu(x_samples) + logvar = self.p_logvar(x_samples) + return mu, logvar + + + def loglikeli(self, x_samples, y_samples): + mu, logvar = self.get_mu_logvar(x_samples) + mu = mu.reshape(-1, mu.shape[-1]) # (bs, y_dim) -> (bs, 1, y_dim) -> (bs, T, y_dim) -> (bs*T, y_dim) + logvar = logvar.reshape(-1, logvar.shape[-1]) + y_samples = y_samples.reshape(-1, y_samples.shape[-1]) # (bs, T, y_dim) -> (bs*T, y_dim) + return (-(mu - y_samples)**2 /logvar.exp()-logvar).sum(dim=1).mean(dim=0) + + + def mi_est(self, x_samples, y_samples): + mu, logvar = self.get_mu_logvar(x_samples) + sample_size = mu.shape[0] + random_index = torch.randperm(sample_size).long() + y_shuffle = y_samples[random_index] + mu = mu.reshape(-1, mu.shape[-1]) # (bs, y_dim) -> (bs, 1, y_dim) -> (bs, T, y_dim) -> (bs*T, y_dim) + logvar = logvar.reshape(-1, logvar.shape[-1]) + y_samples = y_samples.reshape(-1, y_samples.shape[-1]) # (bs, T, y_dim) -> (bs*T, y_dim) + y_shuffle = y_shuffle.reshape(-1, y_shuffle.shape[-1]) # (bs, T, y_dim) -> (bs*T, y_dim) + + positive = - (mu - y_samples)**2 / logvar.exp() + negative = - (mu - y_shuffle)**2 / logvar.exp() + upper_bound = (positive.sum(dim = -1) - negative.sum(dim = -1)).mean() + return upper_bound/2. 
+ + +class CLUBSample_group(nn.Module): # Sampled version of the CLUB estimator + def __init__(self, x_dim, y_dim, hidden_size): + super(CLUBSample_group, self).__init__() + self.p_mu = nn.Sequential(nn.Linear(x_dim, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, y_dim)) + + self.p_logvar = nn.Sequential(nn.Linear(x_dim, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, hidden_size//2), + nn.ReLU(), + nn.Linear(hidden_size//2, y_dim), + nn.Tanh()) + + def get_mu_logvar(self, x_samples): + mu = self.p_mu(x_samples) + logvar = self.p_logvar(x_samples) + return mu, logvar + + + def loglikeli(self, x_samples, y_samples): # unnormalized loglikelihood + mu, logvar = self.get_mu_logvar(x_samples) # mu/logvar: (bs, y_dim) + mu = mu.unsqueeze(1).expand(-1, y_samples.shape[1], -1).reshape(-1, mu.shape[-1]) # (bs, y_dim) -> (bs, 1, y_dim) -> (bs, T, y_dim) -> (bs*T, y_dim) + logvar = logvar.unsqueeze(1).expand(-1, y_samples.shape[1], -1).reshape(-1, logvar.shape[-1]) + y_samples = y_samples.reshape(-1, y_samples.shape[-1]) # (bs, T, y_dim) -> (bs*T, y_dim) + return (-(mu - y_samples)**2 /logvar.exp()-logvar).sum(dim=1).mean(dim=0) / 2 + + def mi_est(self, x_samples, y_samples): # x_samples: (bs, x_dim); y_samples: (bs, T, y_dim) + mu, logvar = self.get_mu_logvar(x_samples) + + sample_size = x_samples.shape[0] + #random_index = torch.randint(sample_size, (sample_size,)).long() + random_index = torch.randperm(sample_size).long() + + # log of conditional probability of positive sample pairs + mu_exp1 = mu.unsqueeze(1).expand(-1, y_samples.shape[1], -1) # (bs, y_dim) -> (bs, T, y_dim) + # logvar_exp1 = logvar.unqueeze(1).expand(-1, y_samples.shape[1], -1).reshape(-1, logvar.shape[-1]) + positive = - ((mu_exp1 - y_samples)**2).mean(dim=1) / logvar.exp() # mean along T + negative = - ((mu_exp1 - y_samples[random_index])**2).mean(dim=1) / logvar.exp() # mean along T + + return (positive.sum(dim = -1) - negative.sum(dim = -1)).mean() / 2 + + diff --git a/audio/speech_synthesis/vqmivc/pytorch/requirements.txt b/audio/speech_synthesis/vqmivc/pytorch/requirements.txt index 2aa8fa84f..255738152 100644 --- a/audio/speech_synthesis/vqmivc/pytorch/requirements.txt +++ b/audio/speech_synthesis/vqmivc/pytorch/requirements.txt @@ -1,8 +1,8 @@ -pyworld==0.3.0 -librosa==0.8.0 -soundfile==0.10.3.post1 -tqdm -hydra-core==1.0.0 -torch==1.3.1 -numpy==1.19.1 -kaldiio==2.15.1 +pyworld==0.3.0 +librosa==0.8.0 +soundfile==0.10.3.post1 +tqdm +hydra-core==1.0.0 +torch==1.3.1 +numpy==1.19.1 +kaldiio==2.15.1 diff --git a/audio/speech_synthesis/vqmivc/pytorch/train.py b/audio/speech_synthesis/vqmivc/pytorch/train.py index adfd67e47..5fab7eca4 100644 --- a/audio/speech_synthesis/vqmivc/pytorch/train.py +++ b/audio/speech_synthesis/vqmivc/pytorch/train.py @@ -1,411 +1,411 @@ -import hydra -from hydra import utils -from itertools import chain -from pathlib import Path -import numpy as np - -import torch -import torch.optim as optim -from torch.utils.data import DataLoader - - -from dataset import CPCDataset_sameSeq as CPCDataset -from scheduler import WarmupScheduler -from model_encoder import Encoder, CPCLoss_sameSeq, Encoder_lf0 -from model_decoder import Decoder_ac -from model_encoder import SpeakerEncoder as Encoder_spk -from mi_estimators import CLUBSample_group, CLUBSample_reshape - -import apex.amp as amp -import os -import time - 
-torch.manual_seed(137) -np.random.seed(137) - -def save_checkpoint(encoder, encoder_lf0, cpc, encoder_spk, \ - cs_mi_net, ps_mi_net, cp_mi_net, decoder, \ - optimizer, optimizer_cs_mi_net, optimizer_ps_mi_net, optimizer_cp_mi_net, scheduler, amp, epoch, checkpoint_dir, cfg): - if cfg.use_amp: - amp_state_dict = amp.state_dict() - else: - amp_state_dict = None - checkpoint_state = { - "encoder": encoder.state_dict(), - "encoder_lf0": encoder_lf0.state_dict(), - "cpc": cpc.state_dict(), - "encoder_spk": encoder_spk.state_dict(), - "ps_mi_net": ps_mi_net.state_dict(), - "cp_mi_net": cp_mi_net.state_dict(), - "cs_mi_net": cs_mi_net.state_dict(), - "decoder": decoder.state_dict(), - "optimizer": optimizer.state_dict(), - "optimizer_cs_mi_net": optimizer_cs_mi_net.state_dict(), - "optimizer_ps_mi_net": optimizer_ps_mi_net.state_dict(), - "optimizer_cp_mi_net": optimizer_cp_mi_net.state_dict(), - "scheduler": scheduler.state_dict(), - "amp": amp_state_dict, - "epoch": epoch - } - checkpoint_dir.mkdir(exist_ok=True, parents=True) - checkpoint_path = checkpoint_dir / "model.ckpt-{}.pt".format(epoch) - torch.save(checkpoint_state, checkpoint_path) - print("Saved checkpoint: {}".format(checkpoint_path.stem)) - - - -def mi_first_forward(mels, lf0, encoder, encoder_lf0, encoder_spk, cs_mi_net, optimizer_cs_mi_net, - ps_mi_net, optimizer_ps_mi_net, cp_mi_net, optimizer_cp_mi_net, cfg): - optimizer_cs_mi_net.zero_grad() - optimizer_ps_mi_net.zero_grad() - optimizer_cp_mi_net.zero_grad() - z, _, _, _, _ = encoder(mels) - z = z.detach() - lf0_embs = encoder_lf0(lf0).detach() - spk_embs = encoder_spk(mels).detach() - if cfg.use_CSMI: - lld_cs_loss = -cs_mi_net.loglikeli(spk_embs, z) - if cfg.use_amp: - with amp.scale_loss(lld_cs_loss, optimizer_cs_mi_net) as sl: - sl.backward() - else: - lld_cs_loss.backward() - optimizer_cs_mi_net.step() - else: - lld_cs_loss = torch.tensor(0.) - - if cfg.use_CPMI: - lld_cp_loss = -cp_mi_net.loglikeli(lf0_embs.unsqueeze(1).reshape(lf0_embs.shape[0],-1,2,lf0_embs.shape[-1]).mean(2), z) - if cfg.use_amp: - with amp.scale_loss(lld_cp_loss, optimizer_cp_mi_net) as slll: - slll.backward() - else: - lld_cp_loss.backward() - torch.nn.utils.clip_grad_norm_(cp_mi_net.parameters(), 1) - optimizer_cp_mi_net.step() - else: - lld_cp_loss = torch.tensor(0.) - - if cfg.use_PSMI: - lld_ps_loss = -ps_mi_net.loglikeli(spk_embs, lf0_embs) - if cfg.use_amp: - with amp.scale_loss(lld_ps_loss, optimizer_ps_mi_net) as sll: - sll.backward() - else: - lld_ps_loss.backward() - optimizer_ps_mi_net.step() - else: - lld_ps_loss = torch.tensor(0.) 
- - return optimizer_cs_mi_net, lld_cs_loss, optimizer_ps_mi_net, lld_ps_loss, optimizer_cp_mi_net, lld_cp_loss - - -def mi_second_forward(mels, lf0, encoder, encoder_lf0, cpc, encoder_spk, cs_mi_net, ps_mi_net, cp_mi_net, decoder, cfg, optimizer, scheduler): - optimizer.zero_grad() - z, c, _, vq_loss, perplexity = encoder(mels) - cpc_loss, accuracy = cpc(z, c) - spk_embs = encoder_spk(mels) - lf0_embs = encoder_lf0(lf0) - recon_loss, pred_mels = decoder(z, lf0_embs, spk_embs, mels.transpose(1,2)) - - loss = recon_loss + cpc_loss + vq_loss - - if cfg.use_CSMI: - mi_cs_loss = cfg.mi_weight*cs_mi_net.mi_est(spk_embs, z) - else: - mi_cs_loss = torch.tensor(0.).to(loss.device) - - if cfg.use_CPMI: - mi_cp_loss = cfg.mi_weight*cp_mi_net.mi_est(lf0_embs.unsqueeze(1).reshape(lf0_embs.shape[0],-1,2,lf0_embs.shape[-1]).mean(2), z) - else: - mi_cp_loss = torch.tensor(0.).to(loss.device) - - if cfg.use_PSMI: - mi_ps_loss = cfg.mi_weight*ps_mi_net.mi_est(spk_embs, lf0_embs) - else: - mi_ps_loss = torch.tensor(0.).to(loss.device) - - loss = loss + mi_cs_loss + mi_ps_loss + mi_cp_loss - - if cfg.use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - - optimizer.step() - return optimizer, recon_loss, vq_loss, cpc_loss, accuracy, perplexity, mi_cs_loss, mi_ps_loss, mi_cp_loss - - -def calculate_eval_loss(mels, lf0, \ - encoder, encoder_lf0, cpc, \ - encoder_spk, cs_mi_net, ps_mi_net, \ - cp_mi_net, decoder, cfg): - with torch.no_grad(): - z, c, z_beforeVQ, vq_loss, perplexity = encoder(mels) - c = c - lf0_embs = encoder_lf0(lf0) - spk_embs = encoder_spk(mels) - - if cfg.use_CSMI: - lld_cs_loss = -cs_mi_net.loglikeli(spk_embs, z) - mi_cs_loss = cfg.mi_weight*cs_mi_net.mi_est(spk_embs, z) - else: - lld_cs_loss = torch.tensor(0.) - mi_cs_loss = torch.tensor(0.) - - # z, c, z_beforeVQ, vq_loss, perplexity = encoder(mels) - cpc_loss, accuracy = cpc(z, c) - recon_loss, pred_mels = decoder(z, lf0_embs, spk_embs, mels.transpose(1,2)) - - if cfg.use_CPMI: - mi_cp_loss = cfg.mi_weight*cp_mi_net.mi_est(lf0_embs.unsqueeze(1).reshape(lf0_embs.shape[0],-1,2,lf0_embs.shape[-1]).mean(2), z) - lld_cp_loss = -cp_mi_net.loglikeli(lf0_embs.unsqueeze(1).reshape(lf0_embs.shape[0],-1,2,lf0_embs.shape[-1]).mean(2), z) - else: - mi_cp_loss = torch.tensor(0.) - lld_cp_loss = torch.tensor(0.) - - if cfg.use_PSMI: - mi_ps_loss = cfg.mi_weight*ps_mi_net.mi_est(spk_embs, lf0_embs) - lld_ps_loss = -ps_mi_net.loglikeli(spk_embs, lf0_embs) - else: - mi_ps_loss = torch.tensor(0.) - lld_ps_loss = torch.tensor(0.) 
- - return recon_loss, vq_loss, cpc_loss, accuracy, perplexity, mi_cs_loss, lld_cs_loss, mi_ps_loss, lld_ps_loss, mi_cp_loss, lld_cp_loss - - -def to_eval(all_models): - for m in all_models: - m.eval() - - -def to_train(all_models): - for m in all_models: - m.train() - - -def eval_model(epoch, checkpoint_dir, device, valid_dataloader, encoder, encoder_lf0, cpc, encoder_spk, cs_mi_net, ps_mi_net, cp_mi_net, decoder, cfg): - stime = time.time() - average_cpc_loss = average_vq_loss = average_perplexity = average_recon_loss = 0 - average_accuracies = np.zeros(cfg.training.n_prediction_steps) - average_lld_cs_loss = average_mi_cs_loss = average_lld_ps_loss = average_mi_ps_loss = average_lld_cp_loss = average_mi_cp_loss = 0 - all_models = [encoder, encoder_lf0, cpc, encoder_spk, cs_mi_net, ps_mi_net, cp_mi_net, decoder] - to_eval(all_models) - for i, (mels, lf0, speakers) in enumerate(valid_dataloader, 1): - lf0 = lf0.to(device) - mels = mels.to(device) # (bs, 80, 128) - recon_loss, vq_loss, cpc_loss, accuracy, perplexity, mi_cs_loss, lld_cs_loss, mi_ps_loss, lld_ps_loss, mi_cp_loss, lld_cp_loss = \ - calculate_eval_loss(mels, lf0, \ - encoder, encoder_lf0, cpc, \ - encoder_spk, cs_mi_net, ps_mi_net, \ - cp_mi_net, decoder, cfg) - - average_recon_loss += (recon_loss.item() - average_recon_loss) / i - average_cpc_loss += (cpc_loss.item() - average_cpc_loss) / i - average_vq_loss += (vq_loss.item() - average_vq_loss) / i - average_perplexity += (perplexity.item() - average_perplexity) / i - average_accuracies += (np.array(accuracy) - average_accuracies) / i - average_lld_cs_loss += (lld_cs_loss.item() - average_lld_cs_loss) / i - average_mi_cs_loss += (mi_cs_loss.item() - average_mi_cs_loss) / i - average_lld_ps_loss += (lld_ps_loss.item() - average_lld_ps_loss) / i - average_mi_ps_loss += (mi_ps_loss.item() - average_mi_ps_loss) / i - average_lld_cp_loss += (lld_cp_loss.item() - average_lld_cp_loss) / i - average_mi_cp_loss += (mi_cp_loss.item() - average_mi_cp_loss) / i - - - ctime = time.time() - print("Eval | epoch:{}, recon loss:{:.3f}, cpc loss:{:.3f}, vq loss:{:.3f}, perpexlity:{:.3f}, lld cs loss:{:.3f}, mi cs loss:{:.3E}, lld ps loss:{:.3f}, mi ps loss:{:.3f}, lld cp loss:{:.3f}, mi cp loss:{:.3f}, used time:{:.3f}s" - .format(epoch, average_recon_loss, average_cpc_loss, average_vq_loss, average_perplexity, average_lld_cs_loss, average_mi_cs_loss, average_lld_ps_loss, average_mi_ps_loss, average_lld_cp_loss, average_mi_cp_loss, ctime-stime)) - print(100 * average_accuracies) - results_txt = open(f'{str(checkpoint_dir)}/results.txt', 'a') - results_txt.write("Eval | epoch:{}, recon loss:{:.3f}, cpc loss:{:.3f}, vq loss:{:.3f}, perpexlity:{:.3f}, lld cs loss:{:.3f}, mi cs loss:{:.3E}, lld ps loss:{:.3f}, mi ps loss:{:.3f}, lld cp loss:{:.3f}, mi cp loss:{:.3f}" - .format(epoch, average_recon_loss, average_cpc_loss, average_vq_loss, average_perplexity, average_lld_cs_loss, average_mi_cs_loss, average_lld_ps_loss, average_mi_ps_loss, average_lld_cp_loss, average_mi_cp_loss)+'\n') - results_txt.write(' '.join([str(cpc_acc) for cpc_acc in average_accuracies])+'\n') - results_txt.close() - - to_train(all_models) - - -@hydra.main(config_path="config/train.yaml") -def train_model(cfg): - cfg.checkpoint_dir = f'{cfg.checkpoint_dir}/useCSMI{cfg.use_CSMI}_useCPMI{cfg.use_CPMI}_usePSMI{cfg.use_PSMI}_useAmp{cfg.use_amp}' - if cfg.encoder_lf0_type == 'no_emb': # default - dim_lf0 = 1 - else: - dim_lf0 = 64 - - checkpoint_dir = Path(utils.to_absolute_path(cfg.checkpoint_dir)) - 
checkpoint_dir.mkdir(exist_ok=True, parents=True) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - # define model - encoder = Encoder(**cfg.model.encoder) - encoder_lf0 = Encoder_lf0(cfg.encoder_lf0_type) - cpc = CPCLoss_sameSeq(**cfg.model.cpc) - encoder_spk = Encoder_spk() - cs_mi_net = CLUBSample_group(256, cfg.model.encoder.z_dim, 512) - ps_mi_net = CLUBSample_group(256, dim_lf0, 512) - cp_mi_net = CLUBSample_reshape(dim_lf0, cfg.model.encoder.z_dim, 512) - decoder = Decoder_ac(dim_neck=cfg.model.encoder.z_dim, dim_lf0=dim_lf0, use_l1_loss=True) - - encoder.to(device) - cpc.to(device) - encoder_lf0.to(device) - encoder_spk.to(device) - cs_mi_net.to(device) - ps_mi_net.to(device) - cp_mi_net.to(device) - decoder.to(device) - - optimizer = optim.Adam( - chain(encoder.parameters(), encoder_lf0.parameters(), cpc.parameters(), encoder_spk.parameters(), decoder.parameters()), - lr=cfg.training.scheduler.initial_lr) - optimizer_cs_mi_net = optim.Adam(cs_mi_net.parameters(), lr=cfg.mi_lr) - optimizer_ps_mi_net = optim.Adam(ps_mi_net.parameters(), lr=cfg.mi_lr) - optimizer_cp_mi_net = optim.Adam(cp_mi_net.parameters(), lr=cfg.mi_lr) - # TODO: use_amp is set default to True to speed up training; no-amp -> more stable training? => need to be verified - if cfg.use_amp: - [encoder, encoder_lf0, cpc, encoder_spk, decoder], optimizer = amp.initialize([encoder, encoder_lf0, cpc, encoder_spk, decoder], optimizer, opt_level='O1') - [cs_mi_net], optimizer_cs_mi_net = amp.initialize([cs_mi_net], optimizer_cs_mi_net, opt_level='O1') - [ps_mi_net], optimizer_ps_mi_net = amp.initialize([ps_mi_net], optimizer_ps_mi_net, opt_level='O1') - [cp_mi_net], optimizer_cp_mi_net = amp.initialize([cp_mi_net], optimizer_cp_mi_net, opt_level='O1') - - root_path = Path(utils.to_absolute_path("data")) - dataset = CPCDataset( - root=root_path, - n_sample_frames=cfg.training.sample_frames, # 128 - mode='train') - valid_dataset = CPCDataset( - root=root_path, - n_sample_frames=cfg.training.sample_frames, # 128 - mode='valid') - - warmup_epochs = 2000 // (len(dataset)//cfg.training.batch_size) - print('warmup_epochs:', warmup_epochs) - scheduler = WarmupScheduler( - optimizer, - warmup_epochs=warmup_epochs, - initial_lr=cfg.training.scheduler.initial_lr, - max_lr=cfg.training.scheduler.max_lr, - milestones=cfg.training.scheduler.milestones, - gamma=cfg.training.scheduler.gamma) - - dataloader = DataLoader( - dataset, - batch_size=cfg.training.batch_size, # 256 - shuffle=True, - num_workers=cfg.training.n_workers, - pin_memory=True, - drop_last=False) - valid_dataloader = DataLoader( - valid_dataset, - batch_size=cfg.training.batch_size, # 256 - shuffle=False, - num_workers=cfg.training.n_workers, - pin_memory=True, - drop_last=False) - - if cfg.resume: - print("Resume checkpoint from: {}:".format(cfg.resume)) - resume_path = utils.to_absolute_path(cfg.resume) - checkpoint = torch.load(resume_path, map_location=lambda storage, loc: storage) - encoder.load_state_dict(checkpoint["encoder"]) - encoder_lf0.load_state_dict(checkpoint["encoder_lf0"]) - cpc.load_state_dict(checkpoint["cpc"]) - encoder_spk.load_state_dict(checkpoint["encoder_spk"]) - cs_mi_net.load_state_dict(checkpoint["cs_mi_net"]) - ps_mi_net.load_state_dict(checkpoint["ps_mi_net"]) - if cfg.use_CPMI: - cp_mi_net.load_state_dict(checkpoint["cp_mi_net"]) - decoder.load_state_dict(checkpoint["decoder"]) - optimizer.load_state_dict(checkpoint["optimizer"]) - optimizer_cs_mi_net.load_state_dict(checkpoint["optimizer_cs_mi_net"]) - 
optimizer_ps_mi_net.load_state_dict(checkpoint["optimizer_ps_mi_net"]) - optimizer_cp_mi_net.load_state_dict(checkpoint["optimizer_cp_mi_net"]) - if cfg.use_amp: - amp.load_state_dict(checkpoint["amp"]) - scheduler.load_state_dict(checkpoint["scheduler"]) - start_epoch = checkpoint["epoch"] - else: - start_epoch = 1 - - if os.path.exists(f'{str(checkpoint_dir)}/results.txt'): - wmode = 'a' - else: - wmode = 'w' - results_txt = open(f'{str(checkpoint_dir)}/results.txt', wmode) - results_txt.write('save training info...\n') - results_txt.close() - - global_step = 0 - stime = time.time() - for epoch in range(start_epoch, cfg.training.n_epochs + 1): - average_cpc_loss = average_vq_loss = average_perplexity = average_recon_loss = 0 - average_accuracies = np.zeros(cfg.training.n_prediction_steps) - average_lld_cs_loss = average_mi_cs_loss = average_lld_ps_loss = average_mi_ps_loss = average_lld_cp_loss = average_mi_cp_loss = 0 - - for i, (mels, lf0, speakers) in enumerate(dataloader, 1): - lf0 = lf0.to(device) - mels = mels.to(device) # (bs, 80, 128) - if cfg.use_CSMI or cfg.use_CPMI or cfg.use_PSMI: - for j in range(cfg.mi_iters): - optimizer_cs_mi_net, lld_cs_loss, optimizer_ps_mi_net, lld_ps_loss, optimizer_cp_mi_net, lld_cp_loss = mi_first_forward(mels, lf0, encoder, encoder_lf0, encoder_spk, cs_mi_net, optimizer_cs_mi_net, \ - ps_mi_net, optimizer_ps_mi_net, cp_mi_net, optimizer_cp_mi_net, cfg) - else: - lld_cs_loss = torch.tensor(0.) - lld_ps_loss = torch.tensor(0.) - lld_cp_loss = torch.tensor(0.) - - optimizer, recon_loss, vq_loss, cpc_loss, accuracy, perplexity, mi_cs_loss, mi_ps_loss, mi_cp_loss = mi_second_forward(mels, lf0, \ - encoder, encoder_lf0, cpc, \ - encoder_spk, cs_mi_net, ps_mi_net, \ - cp_mi_net, decoder, cfg, \ - optimizer, scheduler) - - average_recon_loss += (recon_loss.item() - average_recon_loss) / i - average_cpc_loss += (cpc_loss.item() - average_cpc_loss) / i - average_vq_loss += (vq_loss.item() - average_vq_loss) / i - average_perplexity += (perplexity.item() - average_perplexity) / i - average_accuracies += (np.array(accuracy) - average_accuracies) / i - average_lld_cs_loss += (lld_cs_loss.item() - average_lld_cs_loss) / i - average_mi_cs_loss += (mi_cs_loss.item() - average_mi_cs_loss) / i - average_lld_ps_loss += (lld_ps_loss.item() - average_lld_ps_loss) / i - average_mi_ps_loss += (mi_ps_loss.item() - average_mi_ps_loss) / i - average_lld_cp_loss += (lld_cp_loss.item() - average_lld_cp_loss) / i - average_mi_cp_loss += (mi_cp_loss.item() - average_mi_cp_loss) / i - - - ctime = time.time() - print("epoch:{}, global step:{}, recon loss:{:.3f}, cpc loss:{:.3f}, vq loss:{:.3f}, perpexlity:{:.3f}, lld cs loss:{:.3f}, mi cs loss:{:.3E}, lld ps loss:{:.3f}, mi ps loss:{:.3f}, lld cp loss:{:.3f}, mi cp loss:{:.3f}, used time:{:.3f}s" - .format(epoch, global_step, average_recon_loss, average_cpc_loss, average_vq_loss, average_perplexity, average_lld_cs_loss, average_mi_cs_loss, average_lld_ps_loss, average_mi_ps_loss, average_lld_cp_loss, average_mi_cp_loss, ctime-stime)) - print(100 * average_accuracies) - stime = time.time() - global_step += 1 - # scheduler.step() - - results_txt = open(f'{str(checkpoint_dir)}/results.txt', 'a') - results_txt.write("epoch:{}, global step:{}, recon loss:{:.3f}, cpc loss:{:.3f}, vq loss:{:.3f}, perpexlity:{:.3f}, lld cs loss:{:.3f}, mi cs loss:{:.3E}, lld ps loss:{:.3f}, mi ps loss:{:.3f}, lld cp loss:{:.3f}, mi cp loss:{:.3f}" - .format(epoch, global_step, average_recon_loss, average_cpc_loss, average_vq_loss, average_perplexity, 
average_lld_cs_loss, average_mi_cs_loss, average_lld_ps_loss, average_mi_ps_loss, average_lld_cp_loss, average_mi_cp_loss)+'\n') - results_txt.write(' '.join([str(cpc_acc) for cpc_acc in average_accuracies])+'\n') - results_txt.close() - scheduler.step() - - - if epoch % cfg.training.log_interval == 0 and epoch != start_epoch: - eval_model(epoch, checkpoint_dir, device, valid_dataloader, encoder, encoder_lf0, cpc, encoder_spk, cs_mi_net, ps_mi_net, cp_mi_net, decoder, cfg) - - ctime = time.time() - print("epoch:{}, global step:{}, recon loss:{:.3f}, cpc loss:{:.3f}, vq loss:{:.3f}, perpexlity:{:.3f}, lld cs loss:{:.3f}, mi cs loss:{:.3E}, lld ps loss:{:.3f}, mi ps loss:{:.3f}, lld cp loss:{:.3f}, mi cp loss:{:.3f}, used time:{:.3f}s" - .format(epoch, global_step, average_recon_loss, average_cpc_loss, average_vq_loss, average_perplexity, average_lld_cs_loss, average_mi_cs_loss, average_lld_ps_loss, average_mi_ps_loss, average_lld_cp_loss, average_mi_cp_loss, ctime-stime)) - print(100 * average_accuracies) - stime = time.time() - - if epoch % cfg.training.checkpoint_interval == 0 and epoch != start_epoch: - save_checkpoint(encoder, encoder_lf0, cpc, encoder_spk, \ - cs_mi_net, ps_mi_net, cp_mi_net, decoder, \ - optimizer, optimizer_cs_mi_net, optimizer_ps_mi_net, optimizer_cp_mi_net, scheduler, amp, epoch, checkpoint_dir, cfg) - - -if __name__ == "__main__": - train_model() +import hydra +from hydra import utils +from itertools import chain +from pathlib import Path +import numpy as np + +import torch +import torch.optim as optim +from torch.utils.data import DataLoader + + +from dataset import CPCDataset_sameSeq as CPCDataset +from scheduler import WarmupScheduler +from model_encoder import Encoder, CPCLoss_sameSeq, Encoder_lf0 +from model_decoder import Decoder_ac +from model_encoder import SpeakerEncoder as Encoder_spk +from mi_estimators import CLUBSample_group, CLUBSample_reshape + +import apex.amp as amp +import os +import time + +torch.manual_seed(137) +np.random.seed(137) + +def save_checkpoint(encoder, encoder_lf0, cpc, encoder_spk, \ + cs_mi_net, ps_mi_net, cp_mi_net, decoder, \ + optimizer, optimizer_cs_mi_net, optimizer_ps_mi_net, optimizer_cp_mi_net, scheduler, amp, epoch, checkpoint_dir, cfg): + if cfg.use_amp: + amp_state_dict = amp.state_dict() + else: + amp_state_dict = None + checkpoint_state = { + "encoder": encoder.state_dict(), + "encoder_lf0": encoder_lf0.state_dict(), + "cpc": cpc.state_dict(), + "encoder_spk": encoder_spk.state_dict(), + "ps_mi_net": ps_mi_net.state_dict(), + "cp_mi_net": cp_mi_net.state_dict(), + "cs_mi_net": cs_mi_net.state_dict(), + "decoder": decoder.state_dict(), + "optimizer": optimizer.state_dict(), + "optimizer_cs_mi_net": optimizer_cs_mi_net.state_dict(), + "optimizer_ps_mi_net": optimizer_ps_mi_net.state_dict(), + "optimizer_cp_mi_net": optimizer_cp_mi_net.state_dict(), + "scheduler": scheduler.state_dict(), + "amp": amp_state_dict, + "epoch": epoch + } + checkpoint_dir.mkdir(exist_ok=True, parents=True) + checkpoint_path = checkpoint_dir / "model.ckpt-{}.pt".format(epoch) + torch.save(checkpoint_state, checkpoint_path) + print("Saved checkpoint: {}".format(checkpoint_path.stem)) + + + +def mi_first_forward(mels, lf0, encoder, encoder_lf0, encoder_spk, cs_mi_net, optimizer_cs_mi_net, + ps_mi_net, optimizer_ps_mi_net, cp_mi_net, optimizer_cp_mi_net, cfg): + optimizer_cs_mi_net.zero_grad() + optimizer_ps_mi_net.zero_grad() + optimizer_cp_mi_net.zero_grad() + z, _, _, _, _ = encoder(mels) + z = z.detach() + lf0_embs = encoder_lf0(lf0).detach() 
+ spk_embs = encoder_spk(mels).detach() + if cfg.use_CSMI: + lld_cs_loss = -cs_mi_net.loglikeli(spk_embs, z) + if cfg.use_amp: + with amp.scale_loss(lld_cs_loss, optimizer_cs_mi_net) as sl: + sl.backward() + else: + lld_cs_loss.backward() + optimizer_cs_mi_net.step() + else: + lld_cs_loss = torch.tensor(0.) + + if cfg.use_CPMI: + lld_cp_loss = -cp_mi_net.loglikeli(lf0_embs.unsqueeze(1).reshape(lf0_embs.shape[0],-1,2,lf0_embs.shape[-1]).mean(2), z) + if cfg.use_amp: + with amp.scale_loss(lld_cp_loss, optimizer_cp_mi_net) as slll: + slll.backward() + else: + lld_cp_loss.backward() + torch.nn.utils.clip_grad_norm_(cp_mi_net.parameters(), 1) + optimizer_cp_mi_net.step() + else: + lld_cp_loss = torch.tensor(0.) + + if cfg.use_PSMI: + lld_ps_loss = -ps_mi_net.loglikeli(spk_embs, lf0_embs) + if cfg.use_amp: + with amp.scale_loss(lld_ps_loss, optimizer_ps_mi_net) as sll: + sll.backward() + else: + lld_ps_loss.backward() + optimizer_ps_mi_net.step() + else: + lld_ps_loss = torch.tensor(0.) + + return optimizer_cs_mi_net, lld_cs_loss, optimizer_ps_mi_net, lld_ps_loss, optimizer_cp_mi_net, lld_cp_loss + + +def mi_second_forward(mels, lf0, encoder, encoder_lf0, cpc, encoder_spk, cs_mi_net, ps_mi_net, cp_mi_net, decoder, cfg, optimizer, scheduler): + optimizer.zero_grad() + z, c, _, vq_loss, perplexity = encoder(mels) + cpc_loss, accuracy = cpc(z, c) + spk_embs = encoder_spk(mels) + lf0_embs = encoder_lf0(lf0) + recon_loss, pred_mels = decoder(z, lf0_embs, spk_embs, mels.transpose(1,2)) + + loss = recon_loss + cpc_loss + vq_loss + + if cfg.use_CSMI: + mi_cs_loss = cfg.mi_weight*cs_mi_net.mi_est(spk_embs, z) + else: + mi_cs_loss = torch.tensor(0.).to(loss.device) + + if cfg.use_CPMI: + mi_cp_loss = cfg.mi_weight*cp_mi_net.mi_est(lf0_embs.unsqueeze(1).reshape(lf0_embs.shape[0],-1,2,lf0_embs.shape[-1]).mean(2), z) + else: + mi_cp_loss = torch.tensor(0.).to(loss.device) + + if cfg.use_PSMI: + mi_ps_loss = cfg.mi_weight*ps_mi_net.mi_est(spk_embs, lf0_embs) + else: + mi_ps_loss = torch.tensor(0.).to(loss.device) + + loss = loss + mi_cs_loss + mi_ps_loss + mi_cp_loss + + if cfg.use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + + optimizer.step() + return optimizer, recon_loss, vq_loss, cpc_loss, accuracy, perplexity, mi_cs_loss, mi_ps_loss, mi_cp_loss + + +def calculate_eval_loss(mels, lf0, \ + encoder, encoder_lf0, cpc, \ + encoder_spk, cs_mi_net, ps_mi_net, \ + cp_mi_net, decoder, cfg): + with torch.no_grad(): + z, c, z_beforeVQ, vq_loss, perplexity = encoder(mels) + c = c + lf0_embs = encoder_lf0(lf0) + spk_embs = encoder_spk(mels) + + if cfg.use_CSMI: + lld_cs_loss = -cs_mi_net.loglikeli(spk_embs, z) + mi_cs_loss = cfg.mi_weight*cs_mi_net.mi_est(spk_embs, z) + else: + lld_cs_loss = torch.tensor(0.) + mi_cs_loss = torch.tensor(0.) + + # z, c, z_beforeVQ, vq_loss, perplexity = encoder(mels) + cpc_loss, accuracy = cpc(z, c) + recon_loss, pred_mels = decoder(z, lf0_embs, spk_embs, mels.transpose(1,2)) + + if cfg.use_CPMI: + mi_cp_loss = cfg.mi_weight*cp_mi_net.mi_est(lf0_embs.unsqueeze(1).reshape(lf0_embs.shape[0],-1,2,lf0_embs.shape[-1]).mean(2), z) + lld_cp_loss = -cp_mi_net.loglikeli(lf0_embs.unsqueeze(1).reshape(lf0_embs.shape[0],-1,2,lf0_embs.shape[-1]).mean(2), z) + else: + mi_cp_loss = torch.tensor(0.) + lld_cp_loss = torch.tensor(0.) + + if cfg.use_PSMI: + mi_ps_loss = cfg.mi_weight*ps_mi_net.mi_est(spk_embs, lf0_embs) + lld_ps_loss = -ps_mi_net.loglikeli(spk_embs, lf0_embs) + else: + mi_ps_loss = torch.tensor(0.) 
+ lld_ps_loss = torch.tensor(0.) + + return recon_loss, vq_loss, cpc_loss, accuracy, perplexity, mi_cs_loss, lld_cs_loss, mi_ps_loss, lld_ps_loss, mi_cp_loss, lld_cp_loss + + +def to_eval(all_models): + for m in all_models: + m.eval() + + +def to_train(all_models): + for m in all_models: + m.train() + + +def eval_model(epoch, checkpoint_dir, device, valid_dataloader, encoder, encoder_lf0, cpc, encoder_spk, cs_mi_net, ps_mi_net, cp_mi_net, decoder, cfg): + stime = time.time() + average_cpc_loss = average_vq_loss = average_perplexity = average_recon_loss = 0 + average_accuracies = np.zeros(cfg.training.n_prediction_steps) + average_lld_cs_loss = average_mi_cs_loss = average_lld_ps_loss = average_mi_ps_loss = average_lld_cp_loss = average_mi_cp_loss = 0 + all_models = [encoder, encoder_lf0, cpc, encoder_spk, cs_mi_net, ps_mi_net, cp_mi_net, decoder] + to_eval(all_models) + for i, (mels, lf0, speakers) in enumerate(valid_dataloader, 1): + lf0 = lf0.to(device) + mels = mels.to(device) # (bs, 80, 128) + recon_loss, vq_loss, cpc_loss, accuracy, perplexity, mi_cs_loss, lld_cs_loss, mi_ps_loss, lld_ps_loss, mi_cp_loss, lld_cp_loss = \ + calculate_eval_loss(mels, lf0, \ + encoder, encoder_lf0, cpc, \ + encoder_spk, cs_mi_net, ps_mi_net, \ + cp_mi_net, decoder, cfg) + + average_recon_loss += (recon_loss.item() - average_recon_loss) / i + average_cpc_loss += (cpc_loss.item() - average_cpc_loss) / i + average_vq_loss += (vq_loss.item() - average_vq_loss) / i + average_perplexity += (perplexity.item() - average_perplexity) / i + average_accuracies += (np.array(accuracy) - average_accuracies) / i + average_lld_cs_loss += (lld_cs_loss.item() - average_lld_cs_loss) / i + average_mi_cs_loss += (mi_cs_loss.item() - average_mi_cs_loss) / i + average_lld_ps_loss += (lld_ps_loss.item() - average_lld_ps_loss) / i + average_mi_ps_loss += (mi_ps_loss.item() - average_mi_ps_loss) / i + average_lld_cp_loss += (lld_cp_loss.item() - average_lld_cp_loss) / i + average_mi_cp_loss += (mi_cp_loss.item() - average_mi_cp_loss) / i + + + ctime = time.time() + print("Eval | epoch:{}, recon loss:{:.3f}, cpc loss:{:.3f}, vq loss:{:.3f}, perpexlity:{:.3f}, lld cs loss:{:.3f}, mi cs loss:{:.3E}, lld ps loss:{:.3f}, mi ps loss:{:.3f}, lld cp loss:{:.3f}, mi cp loss:{:.3f}, used time:{:.3f}s" + .format(epoch, average_recon_loss, average_cpc_loss, average_vq_loss, average_perplexity, average_lld_cs_loss, average_mi_cs_loss, average_lld_ps_loss, average_mi_ps_loss, average_lld_cp_loss, average_mi_cp_loss, ctime-stime)) + print(100 * average_accuracies) + results_txt = open(f'{str(checkpoint_dir)}/results.txt', 'a') + results_txt.write("Eval | epoch:{}, recon loss:{:.3f}, cpc loss:{:.3f}, vq loss:{:.3f}, perpexlity:{:.3f}, lld cs loss:{:.3f}, mi cs loss:{:.3E}, lld ps loss:{:.3f}, mi ps loss:{:.3f}, lld cp loss:{:.3f}, mi cp loss:{:.3f}" + .format(epoch, average_recon_loss, average_cpc_loss, average_vq_loss, average_perplexity, average_lld_cs_loss, average_mi_cs_loss, average_lld_ps_loss, average_mi_ps_loss, average_lld_cp_loss, average_mi_cp_loss)+'\n') + results_txt.write(' '.join([str(cpc_acc) for cpc_acc in average_accuracies])+'\n') + results_txt.close() + + to_train(all_models) + + +@hydra.main(config_path="config/train.yaml") +def train_model(cfg): + cfg.checkpoint_dir = f'{cfg.checkpoint_dir}/useCSMI{cfg.use_CSMI}_useCPMI{cfg.use_CPMI}_usePSMI{cfg.use_PSMI}_useAmp{cfg.use_amp}' + if cfg.encoder_lf0_type == 'no_emb': # default + dim_lf0 = 1 + else: + dim_lf0 = 64 + + checkpoint_dir = 
Path(utils.to_absolute_path(cfg.checkpoint_dir)) + checkpoint_dir.mkdir(exist_ok=True, parents=True) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # define model + encoder = Encoder(**cfg.model.encoder) + encoder_lf0 = Encoder_lf0(cfg.encoder_lf0_type) + cpc = CPCLoss_sameSeq(**cfg.model.cpc) + encoder_spk = Encoder_spk() + cs_mi_net = CLUBSample_group(256, cfg.model.encoder.z_dim, 512) + ps_mi_net = CLUBSample_group(256, dim_lf0, 512) + cp_mi_net = CLUBSample_reshape(dim_lf0, cfg.model.encoder.z_dim, 512) + decoder = Decoder_ac(dim_neck=cfg.model.encoder.z_dim, dim_lf0=dim_lf0, use_l1_loss=True) + + encoder.to(device) + cpc.to(device) + encoder_lf0.to(device) + encoder_spk.to(device) + cs_mi_net.to(device) + ps_mi_net.to(device) + cp_mi_net.to(device) + decoder.to(device) + + optimizer = optim.Adam( + chain(encoder.parameters(), encoder_lf0.parameters(), cpc.parameters(), encoder_spk.parameters(), decoder.parameters()), + lr=cfg.training.scheduler.initial_lr) + optimizer_cs_mi_net = optim.Adam(cs_mi_net.parameters(), lr=cfg.mi_lr) + optimizer_ps_mi_net = optim.Adam(ps_mi_net.parameters(), lr=cfg.mi_lr) + optimizer_cp_mi_net = optim.Adam(cp_mi_net.parameters(), lr=cfg.mi_lr) + # TODO: use_amp is set default to True to speed up training; no-amp -> more stable training? => need to be verified + if cfg.use_amp: + [encoder, encoder_lf0, cpc, encoder_spk, decoder], optimizer = amp.initialize([encoder, encoder_lf0, cpc, encoder_spk, decoder], optimizer, opt_level='O1') + [cs_mi_net], optimizer_cs_mi_net = amp.initialize([cs_mi_net], optimizer_cs_mi_net, opt_level='O1') + [ps_mi_net], optimizer_ps_mi_net = amp.initialize([ps_mi_net], optimizer_ps_mi_net, opt_level='O1') + [cp_mi_net], optimizer_cp_mi_net = amp.initialize([cp_mi_net], optimizer_cp_mi_net, opt_level='O1') + + root_path = Path(utils.to_absolute_path("data")) + dataset = CPCDataset( + root=root_path, + n_sample_frames=cfg.training.sample_frames, # 128 + mode='train') + valid_dataset = CPCDataset( + root=root_path, + n_sample_frames=cfg.training.sample_frames, # 128 + mode='valid') + + warmup_epochs = 2000 // (len(dataset)//cfg.training.batch_size) + print('warmup_epochs:', warmup_epochs) + scheduler = WarmupScheduler( + optimizer, + warmup_epochs=warmup_epochs, + initial_lr=cfg.training.scheduler.initial_lr, + max_lr=cfg.training.scheduler.max_lr, + milestones=cfg.training.scheduler.milestones, + gamma=cfg.training.scheduler.gamma) + + dataloader = DataLoader( + dataset, + batch_size=cfg.training.batch_size, # 256 + shuffle=True, + num_workers=cfg.training.n_workers, + pin_memory=True, + drop_last=False) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=cfg.training.batch_size, # 256 + shuffle=False, + num_workers=cfg.training.n_workers, + pin_memory=True, + drop_last=False) + + if cfg.resume: + print("Resume checkpoint from: {}:".format(cfg.resume)) + resume_path = utils.to_absolute_path(cfg.resume) + checkpoint = torch.load(resume_path, map_location=lambda storage, loc: storage) + encoder.load_state_dict(checkpoint["encoder"]) + encoder_lf0.load_state_dict(checkpoint["encoder_lf0"]) + cpc.load_state_dict(checkpoint["cpc"]) + encoder_spk.load_state_dict(checkpoint["encoder_spk"]) + cs_mi_net.load_state_dict(checkpoint["cs_mi_net"]) + ps_mi_net.load_state_dict(checkpoint["ps_mi_net"]) + if cfg.use_CPMI: + cp_mi_net.load_state_dict(checkpoint["cp_mi_net"]) + decoder.load_state_dict(checkpoint["decoder"]) + optimizer.load_state_dict(checkpoint["optimizer"]) + 
optimizer_cs_mi_net.load_state_dict(checkpoint["optimizer_cs_mi_net"]) + optimizer_ps_mi_net.load_state_dict(checkpoint["optimizer_ps_mi_net"]) + optimizer_cp_mi_net.load_state_dict(checkpoint["optimizer_cp_mi_net"]) + if cfg.use_amp: + amp.load_state_dict(checkpoint["amp"]) + scheduler.load_state_dict(checkpoint["scheduler"]) + start_epoch = checkpoint["epoch"] + else: + start_epoch = 1 + + if os.path.exists(f'{str(checkpoint_dir)}/results.txt'): + wmode = 'a' + else: + wmode = 'w' + results_txt = open(f'{str(checkpoint_dir)}/results.txt', wmode) + results_txt.write('save training info...\n') + results_txt.close() + + global_step = 0 + stime = time.time() + for epoch in range(start_epoch, cfg.training.n_epochs + 1): + average_cpc_loss = average_vq_loss = average_perplexity = average_recon_loss = 0 + average_accuracies = np.zeros(cfg.training.n_prediction_steps) + average_lld_cs_loss = average_mi_cs_loss = average_lld_ps_loss = average_mi_ps_loss = average_lld_cp_loss = average_mi_cp_loss = 0 + + for i, (mels, lf0, speakers) in enumerate(dataloader, 1): + lf0 = lf0.to(device) + mels = mels.to(device) # (bs, 80, 128) + if cfg.use_CSMI or cfg.use_CPMI or cfg.use_PSMI: + for j in range(cfg.mi_iters): + optimizer_cs_mi_net, lld_cs_loss, optimizer_ps_mi_net, lld_ps_loss, optimizer_cp_mi_net, lld_cp_loss = mi_first_forward(mels, lf0, encoder, encoder_lf0, encoder_spk, cs_mi_net, optimizer_cs_mi_net, \ + ps_mi_net, optimizer_ps_mi_net, cp_mi_net, optimizer_cp_mi_net, cfg) + else: + lld_cs_loss = torch.tensor(0.) + lld_ps_loss = torch.tensor(0.) + lld_cp_loss = torch.tensor(0.) + + optimizer, recon_loss, vq_loss, cpc_loss, accuracy, perplexity, mi_cs_loss, mi_ps_loss, mi_cp_loss = mi_second_forward(mels, lf0, \ + encoder, encoder_lf0, cpc, \ + encoder_spk, cs_mi_net, ps_mi_net, \ + cp_mi_net, decoder, cfg, \ + optimizer, scheduler) + + average_recon_loss += (recon_loss.item() - average_recon_loss) / i + average_cpc_loss += (cpc_loss.item() - average_cpc_loss) / i + average_vq_loss += (vq_loss.item() - average_vq_loss) / i + average_perplexity += (perplexity.item() - average_perplexity) / i + average_accuracies += (np.array(accuracy) - average_accuracies) / i + average_lld_cs_loss += (lld_cs_loss.item() - average_lld_cs_loss) / i + average_mi_cs_loss += (mi_cs_loss.item() - average_mi_cs_loss) / i + average_lld_ps_loss += (lld_ps_loss.item() - average_lld_ps_loss) / i + average_mi_ps_loss += (mi_ps_loss.item() - average_mi_ps_loss) / i + average_lld_cp_loss += (lld_cp_loss.item() - average_lld_cp_loss) / i + average_mi_cp_loss += (mi_cp_loss.item() - average_mi_cp_loss) / i + + + ctime = time.time() + print("epoch:{}, global step:{}, recon loss:{:.3f}, cpc loss:{:.3f}, vq loss:{:.3f}, perpexlity:{:.3f}, lld cs loss:{:.3f}, mi cs loss:{:.3E}, lld ps loss:{:.3f}, mi ps loss:{:.3f}, lld cp loss:{:.3f}, mi cp loss:{:.3f}, used time:{:.3f}s" + .format(epoch, global_step, average_recon_loss, average_cpc_loss, average_vq_loss, average_perplexity, average_lld_cs_loss, average_mi_cs_loss, average_lld_ps_loss, average_mi_ps_loss, average_lld_cp_loss, average_mi_cp_loss, ctime-stime)) + print(100 * average_accuracies) + stime = time.time() + global_step += 1 + # scheduler.step() + + results_txt = open(f'{str(checkpoint_dir)}/results.txt', 'a') + results_txt.write("epoch:{}, global step:{}, recon loss:{:.3f}, cpc loss:{:.3f}, vq loss:{:.3f}, perpexlity:{:.3f}, lld cs loss:{:.3f}, mi cs loss:{:.3E}, lld ps loss:{:.3f}, mi ps loss:{:.3f}, lld cp loss:{:.3f}, mi cp loss:{:.3f}" + .format(epoch, global_step, 
average_recon_loss, average_cpc_loss, average_vq_loss, average_perplexity, average_lld_cs_loss, average_mi_cs_loss, average_lld_ps_loss, average_mi_ps_loss, average_lld_cp_loss, average_mi_cp_loss)+'\n') + results_txt.write(' '.join([str(cpc_acc) for cpc_acc in average_accuracies])+'\n') + results_txt.close() + scheduler.step() + + + if epoch % cfg.training.log_interval == 0 and epoch != start_epoch: + eval_model(epoch, checkpoint_dir, device, valid_dataloader, encoder, encoder_lf0, cpc, encoder_spk, cs_mi_net, ps_mi_net, cp_mi_net, decoder, cfg) + + ctime = time.time() + print("epoch:{}, global step:{}, recon loss:{:.3f}, cpc loss:{:.3f}, vq loss:{:.3f}, perpexlity:{:.3f}, lld cs loss:{:.3f}, mi cs loss:{:.3E}, lld ps loss:{:.3f}, mi ps loss:{:.3f}, lld cp loss:{:.3f}, mi cp loss:{:.3f}, used time:{:.3f}s" + .format(epoch, global_step, average_recon_loss, average_cpc_loss, average_vq_loss, average_perplexity, average_lld_cs_loss, average_mi_cs_loss, average_lld_ps_loss, average_mi_ps_loss, average_lld_cp_loss, average_mi_cp_loss, ctime-stime)) + print(100 * average_accuracies) + stime = time.time() + + if epoch % cfg.training.checkpoint_interval == 0 and epoch != start_epoch: + save_checkpoint(encoder, encoder_lf0, cpc, encoder_spk, \ + cs_mi_net, ps_mi_net, cp_mi_net, decoder, \ + optimizer, optimizer_cs_mi_net, optimizer_ps_mi_net, optimizer_cp_mi_net, scheduler, amp, epoch, checkpoint_dir, cfg) + + +if __name__ == "__main__": + train_model() diff --git a/audio/speech_synthesis/vqmivc/pytorch/vocoder/README.md b/audio/speech_synthesis/vqmivc/pytorch/vocoder/README.md index 3cb239e40..4b74bf3b8 100644 --- a/audio/speech_synthesis/vqmivc/pytorch/vocoder/README.md +++ b/audio/speech_synthesis/vqmivc/pytorch/vocoder/README.md @@ -1 +1 @@ -Put PWG vocoder under this directory. +Put PWG vocoder under this directory. 
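
The two helper passes in the training script above implement a CLUB-style mutual-information penalty: mi_first_forward fits the variational estimators (their loglikeli terms) on detached embeddings so only the estimator networks are updated, and mi_second_forward then adds the cfg.mi_weight-scaled mi_est terms to the reconstruction/CPC/VQ objective before stepping the main optimizer. The following is a minimal, self-contained sketch of that two-pass pattern only; ToyCLUB, spk_enc, content_enc, the 80/16/4 dimensions, and the 0.01 weight are illustrative stand-ins and are not the repo's CLUBSample_group, Encoder_spk, or encoder classes.

import torch
import torch.nn as nn

class ToyCLUB(nn.Module):
    # Minimal CLUB-style estimator exposing the two methods the loop above relies on.
    def __init__(self, x_dim, y_dim, hidden=64):
        super().__init__()
        self.mu = nn.Sequential(nn.Linear(x_dim, hidden), nn.ReLU(), nn.Linear(hidden, y_dim))

    def loglikeli(self, x, y):
        # Negative squared error stands in for the Gaussian log-likelihood log q(y|x).
        return -((y - self.mu(x)) ** 2).sum(-1).mean()

    def mi_est(self, x, y):
        # CLUB upper bound: score of matched pairs minus score of shuffled (marginal) pairs.
        pos = -((y - self.mu(x)) ** 2).sum(-1)
        neg = -((y[torch.randperm(y.size(0))] - self.mu(x)) ** 2).sum(-1)
        return (pos - neg).mean()

spk_enc = nn.Linear(80, 16)        # stand-in for the speaker encoder
content_enc = nn.Linear(80, 4)     # stand-in for the content encoder producing z
club = ToyCLUB(16, 4)
opt_main = torch.optim.Adam(list(spk_enc.parameters()) + list(content_enc.parameters()), lr=1e-3)
opt_club = torch.optim.Adam(club.parameters(), lr=3e-4)

mels = torch.randn(8, 80)          # toy batch of mel frames

# Pass 1 (cf. mi_first_forward): fit q(z|spk) on detached embeddings so only the
# estimator network receives gradients.
opt_club.zero_grad()
lld_loss = -club.loglikeli(spk_enc(mels).detach(), content_enc(mels).detach())
lld_loss.backward()
opt_club.step()

# Pass 2 (cf. mi_second_forward): penalize the estimated MI inside the main objective.
opt_main.zero_grad()
spk, z = spk_enc(mels), content_enc(mels)
task_loss = z.pow(2).mean()                      # placeholder for recon_loss + cpc_loss + vq_loss
loss = task_loss + 0.01 * club.mi_est(spk, z)    # 0.01 plays the role of cfg.mi_weight
loss.backward()
opt_main.step()

Keeping the estimator update on detached embeddings is what allows the second pass to treat mi_est as a (loose) upper bound to minimize without the two objectives fighting over the same gradients within a single step.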
diff --git a/audio/speech_synthesis/waveglow/pytorch/tacotron2/loss_scaler.py b/audio/speech_synthesis/waveglow/pytorch/tacotron2/loss_scaler.py index 0662a60e2..88cc9cf29 100644 --- a/audio/speech_synthesis/waveglow/pytorch/tacotron2/loss_scaler.py +++ b/audio/speech_synthesis/waveglow/pytorch/tacotron2/loss_scaler.py @@ -1,131 +1,131 @@ -import torch - -class LossScaler: - - def __init__(self, scale=1): - self.cur_scale = scale - - # `params` is a list / generator of torch.Variable - def has_overflow(self, params): - return False - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - return False - - # `overflow` is boolean indicating whether we overflowed in gradient - def update_scale(self, overflow): - pass - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - return tuple(self.loss_scale * g for g in grad_in) - - def backward(self, loss): - scaled_loss = loss*self.loss_scale - scaled_loss.backward() - -class DynamicLossScaler: - - def __init__(self, - init_scale=2**32, - scale_factor=2., - scale_window=1000): - self.cur_scale = init_scale - self.cur_iter = 0 - self.last_overflow_iter = -1 - self.scale_factor = scale_factor - self.scale_window = scale_window - - # `params` is a list / generator of torch.Variable - def has_overflow(self, params): -# return False - for p in params: - if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): - return True - - return False - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - cpu_sum = float(x.float().sum()) - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - # `overflow` is boolean indicating whether we overflowed in gradient - def update_scale(self, overflow): - if overflow: - #self.cur_scale /= self.scale_factor - self.cur_scale = max(self.cur_scale/self.scale_factor, 1) - self.last_overflow_iter = self.cur_iter - else: - if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: - self.cur_scale *= self.scale_factor -# self.cur_scale = 1 - self.cur_iter += 1 - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - return tuple(self.loss_scale * g for g in grad_in) - - def backward(self, loss): - scaled_loss = loss*self.loss_scale - scaled_loss.backward() - -############################################################## -# Example usage below here -- assuming it's in a separate file -############################################################## -if __name__ == "__main__": - import torch - from torch.autograd import Variable - from dynamic_loss_scaler import DynamicLossScaler - - # N is batch size; D_in is input dimension; - # H is hidden dimension; D_out is output dimension. - N, D_in, H, D_out = 64, 1000, 100, 10 - - # Create random Tensors to hold inputs and outputs, and wrap them in Variables. 
- x = Variable(torch.randn(N, D_in), requires_grad=False) - y = Variable(torch.randn(N, D_out), requires_grad=False) - - w1 = Variable(torch.randn(D_in, H), requires_grad=True) - w2 = Variable(torch.randn(H, D_out), requires_grad=True) - parameters = [w1, w2] - - learning_rate = 1e-6 - optimizer = torch.optim.SGD(parameters, lr=learning_rate) - loss_scaler = DynamicLossScaler() - - for t in range(500): - y_pred = x.mm(w1).clamp(min=0).mm(w2) - loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale - print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) - print('Iter {} scaled loss: {}'.format(t, loss.data[0])) - print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) - - # Run backprop - optimizer.zero_grad() - loss.backward() - - # Check for overflow - has_overflow = DynamicLossScaler.has_overflow(parameters) - - # If no overflow, unscale grad and update as usual - if not has_overflow: - for param in parameters: - param.grad.data.mul_(1. / loss_scaler.loss_scale) - optimizer.step() - # Otherwise, don't do anything -- ie, skip iteration - else: - print('OVERFLOW!') - - # Update loss scale for next iteration - loss_scaler.update_scale(has_overflow) - +import torch + +class LossScaler: + + def __init__(self, scale=1): + self.cur_scale = scale + + # `params` is a list / generator of torch.Variable + def has_overflow(self, params): + return False + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + return False + + # `overflow` is boolean indicating whether we overflowed in gradient + def update_scale(self, overflow): + pass + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss): + scaled_loss = loss*self.loss_scale + scaled_loss.backward() + +class DynamicLossScaler: + + def __init__(self, + init_scale=2**32, + scale_factor=2., + scale_window=1000): + self.cur_scale = init_scale + self.cur_iter = 0 + self.last_overflow_iter = -1 + self.scale_factor = scale_factor + self.scale_window = scale_window + + # `params` is a list / generator of torch.Variable + def has_overflow(self, params): +# return False + for p in params: + if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): + return True + + return False + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + cpu_sum = float(x.float().sum()) + if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: + return True + return False + + # `overflow` is boolean indicating whether we overflowed in gradient + def update_scale(self, overflow): + if overflow: + #self.cur_scale /= self.scale_factor + self.cur_scale = max(self.cur_scale/self.scale_factor, 1) + self.last_overflow_iter = self.cur_iter + else: + if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: + self.cur_scale *= self.scale_factor +# self.cur_scale = 1 + self.cur_iter += 1 + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss): + scaled_loss = loss*self.loss_scale + scaled_loss.backward() + +############################################################## +# Example usage below here -- assuming it's in a separate file +############################################################## +if __name__ == "__main__": + import torch + from torch.autograd import Variable + from dynamic_loss_scaler import 
DynamicLossScaler + + # N is batch size; D_in is input dimension; + # H is hidden dimension; D_out is output dimension. + N, D_in, H, D_out = 64, 1000, 100, 10 + + # Create random Tensors to hold inputs and outputs, and wrap them in Variables. + x = Variable(torch.randn(N, D_in), requires_grad=False) + y = Variable(torch.randn(N, D_out), requires_grad=False) + + w1 = Variable(torch.randn(D_in, H), requires_grad=True) + w2 = Variable(torch.randn(H, D_out), requires_grad=True) + parameters = [w1, w2] + + learning_rate = 1e-6 + optimizer = torch.optim.SGD(parameters, lr=learning_rate) + loss_scaler = DynamicLossScaler() + + for t in range(500): + y_pred = x.mm(w1).clamp(min=0).mm(w2) + loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale + print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) + print('Iter {} scaled loss: {}'.format(t, loss.data[0])) + print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) + + # Run backprop + optimizer.zero_grad() + loss.backward() + + # Check for overflow + has_overflow = DynamicLossScaler.has_overflow(parameters) + + # If no overflow, unscale grad and update as usual + if not has_overflow: + for param in parameters: + param.grad.data.mul_(1. / loss_scaler.loss_scale) + optimizer.step() + # Otherwise, don't do anything -- ie, skip iteration + else: + print('OVERFLOW!') + + # Update loss scale for next iteration + loss_scaler.update_scale(has_overflow) + diff --git a/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer/bevformer_base.py b/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer/bevformer_base.py index beac9b274..028e9e5c6 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer/bevformer_base.py +++ b/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer/bevformer_base.py @@ -1,257 +1,257 @@ -_base_ = [ - '../datasets/custom_nus-3d.py', - '../_base_/default_runtime.py' -] -# -plugin = True -plugin_dir = 'projects/mmdet3d_plugin/' - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.2, 0.2, 8] - - - -img_norm_cfg = dict( - mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=False, - use_camera=True, - use_radar=False, - use_map=False, - use_external=True) - -_dim_ = 256 -_pos_dim_ = _dim_//2 -_ffn_dim_ = _dim_*2 -_num_levels_ = 4 -bev_h_ = 200 -bev_w_ = 200 -queue_length = 4 # each sequence contains `queue_length` frames. 
- -model = dict( - type='BEVFormer', - use_grid_mask=True, - video_test_mode=True, - img_backbone=dict( - type='ResNet', - depth=101, - num_stages=4, - out_indices=(1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN2d', requires_grad=False), - norm_eval=True, - style='caffe', - dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # original DCNv2 will print log when perform load_state_dict - stage_with_dcn=(False, False, True, True)), - img_neck=dict( - type='FPN', - in_channels=[512, 1024, 2048], - out_channels=_dim_, - start_level=0, - add_extra_convs='on_output', - num_outs=4, - relu_before_extra_convs=True), - pts_bbox_head=dict( - type='BEVFormerHead', - bev_h=bev_h_, - bev_w=bev_w_, - num_query=900, - num_classes=10, - in_channels=_dim_, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - transformer=dict( - type='PerceptionTransformer', - rotate_prev_bev=True, - use_shift=True, - use_can_bus=True, - embed_dims=_dim_, - encoder=dict( - type='BEVFormerEncoder', - num_layers=6, - pc_range=point_cloud_range, - num_points_in_pillar=4, - return_intermediate=False, - transformerlayers=dict( - type='BEVFormerLayer', - attn_cfgs=[ - dict( - type='TemporalSelfAttention', - embed_dims=_dim_, - num_levels=1), - dict( - type='SpatialCrossAttention', - pc_range=point_cloud_range, - deformable_attention=dict( - type='MSDeformableAttention3D', - embed_dims=_dim_, - num_points=8, - num_levels=_num_levels_), - embed_dims=_dim_, - ) - ], - feedforward_channels=_ffn_dim_, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm'))), - decoder=dict( - type='DetectionTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='MultiheadAttention', - embed_dims=_dim_, - num_heads=8, - dropout=0.1), - dict( - type='CustomMSDeformableAttention', - embed_dims=_dim_, - num_levels=1), - ], - - feedforward_channels=_ffn_dim_, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='LearnedPositionalEncoding', - num_feats=_pos_dim_, - row_num_embed=bev_h_, - col_num_embed=bev_w_, - ), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', loss_weight=0.0)), - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
- pc_range=point_cloud_range)))) - -dataset_type = 'CustomNuScenesDataset' -data_root = 'data/nuscenes/' -file_client_args = dict(backend='disk') - - -train_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='PhotoMetricDistortionMultiViewImage'), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) -] - -test_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1600, 900), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='CustomCollect3D', keys=['img']) - ]) -] - -data = dict( - samples_per_gpu=1, - workers_per_gpu=4, - train=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - bev_size=(bev_h_, bev_w_), - queue_length=queue_length, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. - box_type_3d='LiDAR'), - val=dict(type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', - pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), - classes=class_names, modality=input_modality, samples_per_gpu=1), - # test=dict(type=dataset_type, - # data_root=data_root, - # ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', - # pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), - # classes=class_names, modality=input_modality), - shuffler_sampler=dict(type='DistributedGroupSampler'), - nonshuffler_sampler=dict(type='DistributedSampler') -) - -optimizer = dict( - type='AdamW', - lr=2e-5, - paramwise_cfg=dict( - custom_keys={ - 'img_backbone': dict(lr_mult=0.1), - }), - weight_decay=0.01) - -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - min_lr_ratio=1e-3) -total_epochs = 2 -evaluation = dict(interval=1, pipeline=test_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) -load_from = 'ckpts/bevformer_r101_dcn_24ep.pth' -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook'), - dict(type='TensorboardLoggerHook') - ]) - -checkpoint_config = dict(interval=1) +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + + + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 4 +bev_h_ = 200 +bev_w_ = 200 +queue_length = 4 # each sequence contains `queue_length` frames. + +model = dict( + type='BEVFormer', + use_grid_mask=True, + video_test_mode=True, + img_backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN2d', requires_grad=False), + norm_eval=True, + style='caffe', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # original DCNv2 will print log when perform load_state_dict + stage_with_dcn=(False, False, True, True)), + img_neck=dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=4, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
+ pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1), + # test=dict(type=dataset_type, + # data_root=data_root, + # ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + # pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + # classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-5, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 2 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) +load_from = 'ckpts/bevformer_r101_dcn_24ep.pth' +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +checkpoint_config = dict(interval=1) diff --git a/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer/bevformer_small.py b/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer/bevformer_small.py index 2701914eb..6856e7cd4 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer/bevformer_small.py +++ b/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer/bevformer_small.py @@ -1,268 +1,268 @@ -# BEvFormer-small consumes at lease 10500M GPU memory -# compared to bevformer_base, bevformer_small has -# smaller BEV: 200*200 -> 150*150 -# less encoder layers: 6 -> 3 
-# smaller input size: 1600*900 -> (1600*900)*0.8 -# multi-scale feautres -> single scale features (C5) -# with_cp of backbone = True - -_base_ = [ - '../datasets/custom_nus-3d.py', - '../_base_/default_runtime.py' -] -# -plugin = True -plugin_dir = 'projects/mmdet3d_plugin/' - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.2, 0.2, 8] - - -img_norm_cfg = dict( - mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=False, - use_camera=True, - use_radar=False, - use_map=False, - use_external=True) - -_dim_ = 256 -_pos_dim_ = _dim_//2 -_ffn_dim_ = _dim_*2 -_num_levels_ = 1 -bev_h_ = 150 -bev_w_ = 150 -queue_length = 3 # each sequence contains `queue_length` frames. - -model = dict( - type='BEVFormer', - use_grid_mask=True, - video_test_mode=True, - img_backbone=dict( - type='ResNet', - depth=101, - num_stages=4, - out_indices=(3,), - frozen_stages=1, - norm_cfg=dict(type='BN2d', requires_grad=False), - norm_eval=True, - style='caffe', - with_cp=True, # using checkpoint to save GPU memory - dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # original DCNv2 will print log when perform load_state_dict - stage_with_dcn=(False, False, True, True)), - img_neck=dict( - type='FPN', - in_channels=[2048], - out_channels=_dim_, - start_level=0, - add_extra_convs='on_output', - num_outs=_num_levels_, - relu_before_extra_convs=True), - pts_bbox_head=dict( - type='BEVFormerHead', - bev_h=bev_h_, - bev_w=bev_w_, - num_query=900, - num_classes=10, - in_channels=_dim_, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - transformer=dict( - type='PerceptionTransformer', - rotate_prev_bev=True, - use_shift=True, - use_can_bus=True, - embed_dims=_dim_, - encoder=dict( - type='BEVFormerEncoder', - num_layers=3, - pc_range=point_cloud_range, - num_points_in_pillar=4, - return_intermediate=False, - transformerlayers=dict( - type='BEVFormerLayer', - attn_cfgs=[ - dict( - type='TemporalSelfAttention', - embed_dims=_dim_, - num_levels=1), - dict( - type='SpatialCrossAttention', - pc_range=point_cloud_range, - deformable_attention=dict( - type='MSDeformableAttention3D', - embed_dims=_dim_, - num_points=8, - num_levels=_num_levels_), - embed_dims=_dim_, - ) - ], - feedforward_channels=_ffn_dim_, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm'))), - decoder=dict( - type='DetectionTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='MultiheadAttention', - embed_dims=_dim_, - num_heads=8, - dropout=0.1), - dict( - type='CustomMSDeformableAttention', - embed_dims=_dim_, - num_levels=1), - ], - - feedforward_channels=_ffn_dim_, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='LearnedPositionalEncoding', - num_feats=_pos_dim_, - row_num_embed=bev_h_, - col_num_embed=bev_w_, - ), - 
loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', loss_weight=0.0)), - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. - pc_range=point_cloud_range)))) - -dataset_type = 'CustomNuScenesDataset' -data_root = 'data/nuscenes/' -file_client_args = dict(backend='disk') - - -train_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='PhotoMetricDistortionMultiViewImage'), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='RandomScaleImageMultiViewImage', scales=[0.8]), - dict(type='PadMultiViewImage', size_divisor=32), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) -] - -test_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - # dict(type='PadMultiViewImage', size_divisor=32), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1600, 900), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict(type='RandomScaleImageMultiViewImage', scales=[0.8]), - dict(type='PadMultiViewImage', size_divisor=32), - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='CustomCollect3D', keys=['img']) - ]) -] - -data = dict( - samples_per_gpu=1, - workers_per_gpu=4, - train=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - bev_size=(bev_h_, bev_w_), - queue_length=queue_length, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
- box_type_3d='LiDAR'), - val=dict(type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', - pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), - classes=class_names, modality=input_modality, samples_per_gpu=1), - test=dict(type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', - pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), - classes=class_names, modality=input_modality), - shuffler_sampler=dict(type='DistributedGroupSampler'), - nonshuffler_sampler=dict(type='DistributedSampler') -) - -optimizer = dict( - type='AdamW', - lr=2e-4, - paramwise_cfg=dict( - custom_keys={ - 'img_backbone': dict(lr_mult=0.1), - }), - weight_decay=0.01) - -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - min_lr_ratio=1e-3) -total_epochs = 24 -evaluation = dict(interval=1, pipeline=test_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) -load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth' -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook'), - dict(type='TensorboardLoggerHook') - ]) - -checkpoint_config = dict(interval=1) +# BEvFormer-small consumes at lease 10500M GPU memory +# compared to bevformer_base, bevformer_small has +# smaller BEV: 200*200 -> 150*150 +# less encoder layers: 6 -> 3 +# smaller input size: 1600*900 -> (1600*900)*0.8 +# multi-scale feautres -> single scale features (C5) +# with_cp of backbone = True + +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +bev_h_ = 150 +bev_w_ = 150 +queue_length = 3 # each sequence contains `queue_length` frames. 
+ +model = dict( + type='BEVFormer', + use_grid_mask=True, + video_test_mode=True, + img_backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN2d', requires_grad=False), + norm_eval=True, + style='caffe', + with_cp=True, # using checkpoint to save GPU memory + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # original DCNv2 will print log when perform load_state_dict + stage_with_dcn=(False, False, True, True)), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=3, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
+ pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.8]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + # dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.8]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) +load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth' +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +checkpoint_config = dict(interval=1) diff --git a/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer/bevformer_tiny.py b/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer/bevformer_tiny.py index 78858ee9a..f56d1b2f4 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer/bevformer_tiny.py +++ b/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer/bevformer_tiny.py @@ -1,270 +1,270 @@ -# BEvFormer-tiny consumes at lease 6700M GPU memory -# compared to bevformer_base, bevformer_tiny has -# smaller backbone: R101-DCN -> R50 -# smaller BEV: 200*200 -> 50*50 -# less encoder layers: 6 -> 3 -# smaller input size: 1600*900 -> 800*450 -# multi-scale feautres -> single scale features (C5) - - -_base_ = [ - '../datasets/custom_nus-3d.py', - '../_base_/default_runtime.py' -] -# -plugin = True -plugin_dir = 'projects/mmdet3d_plugin/' - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.2, 0.2, 8] - - - - -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) - -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=False, - use_camera=True, - use_radar=False, - use_map=False, - use_external=True) - -_dim_ = 256 -_pos_dim_ = _dim_//2 -_ffn_dim_ = _dim_*2 -_num_levels_ = 1 -bev_h_ = 50 -bev_w_ = 50 -queue_length = 3 # each sequence contains `queue_length` frames. 
- -model = dict( - type='BEVFormer', - use_grid_mask=True, - video_test_mode=True, - pretrained=dict(img='torchvision://resnet50'), - img_backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(3,), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - style='pytorch'), - img_neck=dict( - type='FPN', - in_channels=[2048], - out_channels=_dim_, - start_level=0, - add_extra_convs='on_output', - num_outs=_num_levels_, - relu_before_extra_convs=True), - pts_bbox_head=dict( - type='BEVFormerHead', - bev_h=bev_h_, - bev_w=bev_w_, - num_query=900, - num_classes=10, - in_channels=_dim_, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - transformer=dict( - type='PerceptionTransformer', - rotate_prev_bev=True, - use_shift=True, - use_can_bus=True, - embed_dims=_dim_, - encoder=dict( - type='BEVFormerEncoder', - num_layers=3, - pc_range=point_cloud_range, - num_points_in_pillar=4, - return_intermediate=False, - transformerlayers=dict( - type='BEVFormerLayer', - attn_cfgs=[ - dict( - type='TemporalSelfAttention', - embed_dims=_dim_, - num_levels=1), - dict( - type='SpatialCrossAttention', - pc_range=point_cloud_range, - deformable_attention=dict( - type='MSDeformableAttention3D', - embed_dims=_dim_, - num_points=8, - num_levels=_num_levels_), - embed_dims=_dim_, - ) - ], - feedforward_channels=_ffn_dim_, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm'))), - decoder=dict( - type='DetectionTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='MultiheadAttention', - embed_dims=_dim_, - num_heads=8, - dropout=0.1), - dict( - type='CustomMSDeformableAttention', - embed_dims=_dim_, - num_levels=1), - ], - - feedforward_channels=_ffn_dim_, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='LearnedPositionalEncoding', - num_feats=_pos_dim_, - row_num_embed=bev_h_, - col_num_embed=bev_w_, - ), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', loss_weight=0.0)), - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
- pc_range=point_cloud_range)))) - -dataset_type = 'CustomNuScenesDataset' -data_root = 'data/nuscenes/' -file_client_args = dict(backend='disk') - - -train_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='PhotoMetricDistortionMultiViewImage'), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), - dict(type='PadMultiViewImage', size_divisor=32), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) -] - -test_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - - dict( - type='MultiScaleFlipAug3D', - img_scale=(1600, 900), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), - dict(type='PadMultiViewImage', size_divisor=32), - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='CustomCollect3D', keys=['img']) - ]) -] - -data = dict( - samples_per_gpu=1, - workers_per_gpu=4, - train=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - bev_size=(bev_h_, bev_w_), - queue_length=queue_length, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
- box_type_3d='LiDAR'), - val=dict(type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', - pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), - classes=class_names, modality=input_modality, samples_per_gpu=1), - test=dict(type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', - pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), - classes=class_names, modality=input_modality), - shuffler_sampler=dict(type='DistributedGroupSampler'), - nonshuffler_sampler=dict(type='DistributedSampler') -) - -optimizer = dict( - type='AdamW', - lr=2e-4, - paramwise_cfg=dict( - custom_keys={ - 'img_backbone': dict(lr_mult=0.1), - }), - weight_decay=0.01) - -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - min_lr_ratio=1e-3) -total_epochs = 24 -evaluation = dict(interval=1, pipeline=test_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) - -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook'), - dict(type='TensorboardLoggerHook') - ]) - -checkpoint_config = dict(interval=1) +# BEvFormer-tiny consumes at lease 6700M GPU memory +# compared to bevformer_base, bevformer_tiny has +# smaller backbone: R101-DCN -> R50 +# smaller BEV: 200*200 -> 50*50 +# less encoder layers: 6 -> 3 +# smaller input size: 1600*900 -> 800*450 +# multi-scale feautres -> single scale features (C5) + + +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +bev_h_ = 50 +bev_w_ = 50 +queue_length = 3 # each sequence contains `queue_length` frames. 
+ +model = dict( + type='BEVFormer', + use_grid_mask=True, + video_test_mode=True, + pretrained=dict(img='torchvision://resnet50'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=3, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
+ pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +checkpoint_config = dict(interval=1) diff --git a/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer_fp16/bevformer_tiny_fp16.py b/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer_fp16/bevformer_tiny_fp16.py index 0294890b8..fe6beae5c 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer_fp16/bevformer_tiny_fp16.py +++ b/cv/3d_detection/bevformer/pytorch/projects/configs/bevformer_fp16/bevformer_tiny_fp16.py @@ -1,272 +1,272 @@ -# BEvFormer-tiny consumes at lease 6700M GPU memory -# compared to bevformer_base, bevformer_tiny has -# smaller backbone: R101-DCN -> R50 -# smaller BEV: 200*200 -> 50*50 -# less encoder layers: 6 -> 3 -# smaller input size: 1600*900 -> 800*450 -# multi-scale feautres -> single scale features (C5) - - -_base_ = [ - '../datasets/custom_nus-3d.py', - '../_base_/default_runtime.py' -] -# -plugin = True -plugin_dir = 'projects/mmdet3d_plugin/' - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.2, 0.2, 8] - - - - -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) - -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=False, - use_camera=True, - use_radar=False, - use_map=False, - use_external=True) - -_dim_ = 256 -_pos_dim_ = _dim_//2 -_ffn_dim_ = _dim_*2 -_num_levels_ = 1 -bev_h_ = 50 -bev_w_ = 50 -queue_length = 3 # each sequence contains `queue_length` frames. 
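The optimizer and lr_config above pair AdamW (base lr 2e-4, backbone scaled by lr_mult=0.1) with a linear warmup followed by cosine annealing. A rough reconstruction of the resulting per-iteration learning rate, assuming mmcv's usual reading of warmup_ratio and min_lr_ratio (start at lr * warmup_ratio, ramp to lr over warmup_iters, then cosine-decay towards lr * min_lr_ratio); this helper is illustrative only and is not the hook mmcv actually runs:

import math

def approx_lr(it, max_iters, base_lr=2e-4, warmup_iters=500,
              warmup_ratio=1.0 / 3, min_lr_ratio=1e-3):
    # Illustrative CosineAnnealing + linear warmup; max_iters is a
    # placeholder for the total number of training iterations.
    min_lr = base_lr * min_lr_ratio
    if it < warmup_iters:
        k = it / warmup_iters
        return base_lr * (warmup_ratio + (1.0 - warmup_ratio) * k)
    progress = (it - warmup_iters) / max(1, max_iters - warmup_iters)
    return min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos(math.pi * progress))

# Backbone parameter groups would additionally multiply the result by lr_mult = 0.1.
print(approx_lr(0, 84000), approx_lr(500, 84000), approx_lr(84000, 84000))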
- -model = dict( - type='BEVFormer_fp16', - use_grid_mask=True, - video_test_mode=True, - pretrained=dict(img='torchvision://resnet50'), - img_backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(3,), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - style='pytorch'), - img_neck=dict( - type='FPN', - in_channels=[2048], - out_channels=_dim_, - start_level=0, - add_extra_convs='on_output', - num_outs=_num_levels_, - relu_before_extra_convs=True), - pts_bbox_head=dict( - type='BEVFormerHead', - bev_h=bev_h_, - bev_w=bev_w_, - num_query=900, - num_classes=10, - in_channels=_dim_, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - transformer=dict( - type='PerceptionTransformer', - rotate_prev_bev=True, - use_shift=True, - use_can_bus=True, - embed_dims=_dim_, - encoder=dict( - type='BEVFormerEncoder', - num_layers=3, - pc_range=point_cloud_range, - num_points_in_pillar=4, - return_intermediate=False, - transformerlayers=dict( - type='BEVFormerLayer', - attn_cfgs=[ - dict( - type='TemporalSelfAttention', - embed_dims=_dim_, - num_levels=1), - dict( - type='SpatialCrossAttention', - pc_range=point_cloud_range, - deformable_attention=dict( - type='MSDeformableAttention3D', - embed_dims=_dim_, - num_points=8, - num_levels=_num_levels_), - embed_dims=_dim_, - ) - ], - feedforward_channels=_ffn_dim_, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm'))), - decoder=dict( - type='DetectionTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='MultiheadAttention', - embed_dims=_dim_, - num_heads=8, - dropout=0.1), - dict( - type='CustomMSDeformableAttention', - embed_dims=_dim_, - num_levels=1), - ], - - feedforward_channels=_ffn_dim_, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='LearnedPositionalEncoding', - num_feats=_pos_dim_, - row_num_embed=bev_h_, - col_num_embed=bev_w_, - ), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', loss_weight=0.0)), - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
- pc_range=point_cloud_range)))) - -dataset_type = 'CustomNuScenesDataset' -data_root = 'data/nuscenes/' -file_client_args = dict(backend='disk') - - -train_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='PhotoMetricDistortionMultiViewImage'), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), - dict(type='PadMultiViewImage', size_divisor=32), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) -] - -test_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - - dict( - type='MultiScaleFlipAug3D', - img_scale=(1600, 900), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), - dict(type='PadMultiViewImage', size_divisor=32), - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='CustomCollect3D', keys=['img']) - ]) -] - -data = dict( - samples_per_gpu=2, - workers_per_gpu=8, - train=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - bev_size=(bev_h_, bev_w_), - queue_length=queue_length, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. - box_type_3d='LiDAR'), - val=dict(type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', - pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), - classes=class_names, modality=input_modality, samples_per_gpu=1), - test=dict(type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', - pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), - classes=class_names, modality=input_modality), - shuffler_sampler=dict(type='DistributedGroupSampler'), - nonshuffler_sampler=dict(type='DistributedSampler') -) - -optimizer = dict( - type='AdamW', - lr=2.8e-4, - paramwise_cfg=dict( - custom_keys={ - 'img_backbone': dict(lr_mult=0.1), - }), - weight_decay=0.01) - -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - min_lr_ratio=1e-3) -total_epochs = 24 -evaluation = dict(interval=1, pipeline=test_pipeline) - -runner = dict(type='EpochBasedRunner_video', max_epochs=total_epochs) -load_from = 'ckpts/bevformer_tiny_fp16_epoch_24.pth' -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook'), - dict(type='TensorboardLoggerHook') - ]) - -fp16 = dict(loss_scale=512.) 
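fp16 = dict(loss_scale=512.) turns on mixed-precision training with a static loss scale. mmcv's Fp16OptimizerHook implements this internally; the snippet below only sketches the mechanism (fp32 master-weight bookkeeping omitted), with the grad-clipping value taken from optimizer_config above:

import torch

def fp16_step(model, loss, optimizer, loss_scale=512.0, max_norm=35):
    # Scale the loss up before backward so small fp16 gradients are not
    # flushed to zero, then un-scale gradients before clipping and stepping.
    optimizer.zero_grad()
    (loss * loss_scale).backward()
    for p in model.parameters():
        if p.grad is not None:
            p.grad.div_(loss_scale)
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
    optimizer.step()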
-checkpoint_config = dict(interval=1) -custom_hooks = [dict(type='TransferWeight',priority='LOWEST')] +# BEvFormer-tiny consumes at lease 6700M GPU memory +# compared to bevformer_base, bevformer_tiny has +# smaller backbone: R101-DCN -> R50 +# smaller BEV: 200*200 -> 50*50 +# less encoder layers: 6 -> 3 +# smaller input size: 1600*900 -> 800*450 +# multi-scale feautres -> single scale features (C5) + + +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +bev_h_ = 50 +bev_w_ = 50 +queue_length = 3 # each sequence contains `queue_length` frames. + +model = dict( + type='BEVFormer_fp16', + use_grid_mask=True, + video_test_mode=True, + pretrained=dict(img='torchvision://resnet50'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=3, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + 
type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=8, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2.8e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner_video', max_epochs=total_epochs) +load_from = 'ckpts/bevformer_tiny_fp16_epoch_24.pth' +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +fp16 = dict(loss_scale=512.) +checkpoint_config = dict(interval=1) +custom_hooks = [dict(type='TransferWeight',priority='LOWEST')] diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/__init__.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/__init__.py index 98d6e7e00..27df18f84 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/__init__.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/__init__.py @@ -1,6 +1,6 @@ - -from .dense_heads import * -from .detectors import * -from .modules import * -from .runner import * -from .hooks import * + +from .dense_heads import * +from .detectors import * +from .modules import * +from .runner import * +from .hooks import * diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/apis/test.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/apis/test.py index cd507e4c8..03acb14af 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/apis/test.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/apis/test.py @@ -1,164 +1,164 @@ -# --------------------------------------------- -# Copyright (c) OpenMMLab. All rights reserved. -# --------------------------------------------- -# Modified by Zhiqi Li -# --------------------------------------------- -import os.path as osp -import pickle -import shutil -import tempfile -import time - -import mmcv -import torch -import torch.distributed as dist -from mmcv.image import tensor2imgs -from mmcv.runner import get_dist_info - -from mmdet.core import encode_mask_results - - -import mmcv -import numpy as np -import pycocotools.mask as mask_util - -def custom_encode_mask_results(mask_results): - """Encode bitmap mask to RLE code. Semantic Masks only - Args: - mask_results (list | tuple[list]): bitmap mask results. - In mask scoring rcnn, mask_results is a tuple of (segm_results, - segm_cls_score). - Returns: - list | tuple: RLE encoded mask. 
- """ - cls_segms = mask_results - num_classes = len(cls_segms) - encoded_mask_results = [] - for i in range(len(cls_segms)): - encoded_mask_results.append( - mask_util.encode( - np.array( - cls_segms[i][:, :, np.newaxis], order='F', - dtype='uint8'))[0]) # encoded with RLE - return [encoded_mask_results] - -def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): - """Test model with multiple gpus. - This method tests model with multiple gpus and collects the results - under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' - it encodes results to gpu tensors and use gpu communication for results - collection. On cpu mode it saves the results on different gpus to 'tmpdir' - and collects them by the rank 0 worker. - Args: - model (nn.Module): Model to be tested. - data_loader (nn.Dataloader): Pytorch data loader. - tmpdir (str): Path of directory to save the temporary results from - different gpus under cpu mode. - gpu_collect (bool): Option to use either gpu or cpu to collect results. - Returns: - list: The prediction results. - """ - model.eval() - bbox_results = [] - mask_results = [] - dataset = data_loader.dataset - rank, world_size = get_dist_info() - if rank == 0: - prog_bar = mmcv.ProgressBar(len(dataset)) - time.sleep(2) # This line can prevent deadlock problem in some cases. - have_mask = False - for i, data in enumerate(data_loader): - with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) - # encode mask results - if isinstance(result, dict): - if 'bbox_results' in result.keys(): - bbox_result = result['bbox_results'] - batch_size = len(result['bbox_results']) - bbox_results.extend(bbox_result) - if 'mask_results' in result.keys() and result['mask_results'] is not None: - mask_result = custom_encode_mask_results(result['mask_results']) - mask_results.extend(mask_result) - have_mask = True - else: - batch_size = len(result) - bbox_results.extend(result) - - #if isinstance(result[0], tuple): - # assert False, 'this code is for instance segmentation, which our code will not utilize.' 
- # result = [(bbox_results, encode_mask_results(mask_results)) - # for bbox_results, mask_results in result] - if rank == 0: - - for _ in range(batch_size * world_size): - prog_bar.update() - - # collect results from all ranks - if gpu_collect: - bbox_results = collect_results_gpu(bbox_results, len(dataset)) - if have_mask: - mask_results = collect_results_gpu(mask_results, len(dataset)) - else: - mask_results = None - else: - bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) - tmpdir = tmpdir+'_mask' if tmpdir is not None else None - if have_mask: - mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) - else: - mask_results = None - - if mask_results is None: - return bbox_results - return {'bbox_results': bbox_results, 'mask_results': mask_results} - - -def collect_results_cpu(result_part, size, tmpdir=None): - rank, world_size = get_dist_info() - # create a tmp dir if it is not specified - if tmpdir is None: - MAX_LEN = 512 - # 32 is whitespace - dir_tensor = torch.full((MAX_LEN, ), - 32, - dtype=torch.uint8, - device='cuda') - if rank == 0: - mmcv.mkdir_or_exist('.dist_test') - tmpdir = tempfile.mkdtemp(dir='.dist_test') - tmpdir = torch.tensor( - bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') - dir_tensor[:len(tmpdir)] = tmpdir - dist.broadcast(dir_tensor, 0) - tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() - else: - mmcv.mkdir_or_exist(tmpdir) - # dump the part result to the dir - mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) - dist.barrier() - # collect all parts - if rank != 0: - return None - else: - # load results of all parts from tmp dir - part_list = [] - for i in range(world_size): - part_file = osp.join(tmpdir, f'part_{i}.pkl') - part_list.append(mmcv.load(part_file)) - # sort the results - ordered_results = [] - ''' - bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample, - ''' - #for res in zip(*part_list): - for res in part_list: - ordered_results.extend(list(res)) - # the dataloader may pad some samples - ordered_results = ordered_results[:size] - # remove tmp dir - shutil.rmtree(tmpdir) - return ordered_results - - -def collect_results_gpu(result_part, size): +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import os.path as osp +import pickle +import shutil +import tempfile +import time + +import mmcv +import torch +import torch.distributed as dist +from mmcv.image import tensor2imgs +from mmcv.runner import get_dist_info + +from mmdet.core import encode_mask_results + + +import mmcv +import numpy as np +import pycocotools.mask as mask_util + +def custom_encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. Semantic Masks only + Args: + mask_results (list | tuple[list]): bitmap mask results. + In mask scoring rcnn, mask_results is a tuple of (segm_results, + segm_cls_score). + Returns: + list | tuple: RLE encoded mask. + """ + cls_segms = mask_results + num_classes = len(cls_segms) + encoded_mask_results = [] + for i in range(len(cls_segms)): + encoded_mask_results.append( + mask_util.encode( + np.array( + cls_segms[i][:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + return [encoded_mask_results] + +def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. 
+ This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + Returns: + list: The prediction results. + """ + model.eval() + bbox_results = [] + mask_results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. + have_mask = False + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + # encode mask results + if isinstance(result, dict): + if 'bbox_results' in result.keys(): + bbox_result = result['bbox_results'] + batch_size = len(result['bbox_results']) + bbox_results.extend(bbox_result) + if 'mask_results' in result.keys() and result['mask_results'] is not None: + mask_result = custom_encode_mask_results(result['mask_results']) + mask_results.extend(mask_result) + have_mask = True + else: + batch_size = len(result) + bbox_results.extend(result) + + #if isinstance(result[0], tuple): + # assert False, 'this code is for instance segmentation, which our code will not utilize.' + # result = [(bbox_results, encode_mask_results(mask_results)) + # for bbox_results, mask_results in result] + if rank == 0: + + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + bbox_results = collect_results_gpu(bbox_results, len(dataset)) + if have_mask: + mask_results = collect_results_gpu(mask_results, len(dataset)) + else: + mask_results = None + else: + bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) + tmpdir = tmpdir+'_mask' if tmpdir is not None else None + if have_mask: + mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) + else: + mask_results = None + + if mask_results is None: + return bbox_results + return {'bbox_results': bbox_results, 'mask_results': mask_results} + + +def collect_results_cpu(result_part, size, tmpdir=None): + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + ''' + bacause we change the sample of the 
evaluation stage to make sure that each gpu will handle continuous sample, + ''' + #for res in zip(*part_list): + for res in part_list: + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): collect_results_cpu(result_part, size) \ No newline at end of file diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py index 91d38d141..15691fd90 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py @@ -1,523 +1,523 @@ -# --------------------------------------------- -# Copyright (c) OpenMMLab. All rights reserved. -# --------------------------------------------- -# Modified by Zhiqi Li -# --------------------------------------------- - -import copy -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import Linear, bias_init_with_prob -from mmcv.utils import TORCH_VERSION, digit_version - -from mmdet.core import (multi_apply, multi_apply, reduce_mean) -from mmdet.models.utils.transformer import inverse_sigmoid -from mmdet.models import HEADS -from mmdet.models.dense_heads import DETRHead -from mmdet3d.core.bbox.coders import build_bbox_coder -from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox -from mmcv.cnn.bricks.transformer import build_positional_encoding -from mmcv.runner import force_fp32, auto_fp16 -from projects.mmdet3d_plugin.models.utils.bricks import run_time -import numpy as np -import mmcv -import cv2 as cv -from projects.mmdet3d_plugin.models.utils.visual import save_tensor - - -@HEADS.register_module() -class BEVFormerHead(DETRHead): - """Head of Detr3D. - Args: - with_box_refine (bool): Whether to refine the reference points - in the decoder. Defaults to False. - as_two_stage (bool) : Whether to generate the proposal from - the outputs of encoder. - transformer (obj:`ConfigDict`): ConfigDict is used for building - the Encoder and Decoder. - bev_h, bev_w (int): spatial shape of BEV queries. 
- """ - - def __init__(self, - *args, - with_box_refine=False, - as_two_stage=False, - transformer=None, - bbox_coder=None, - num_cls_fcs=2, - code_weights=None, - bev_h=30, - bev_w=30, - **kwargs): - - self.bev_h = bev_h - self.bev_w = bev_w - self.fp16_enabled = False - - self.with_box_refine = with_box_refine - self.as_two_stage = as_two_stage - if self.as_two_stage: - transformer['as_two_stage'] = self.as_two_stage - if 'code_size' in kwargs: - self.code_size = kwargs['code_size'] - else: - self.code_size = 10 - if code_weights is not None: - self.code_weights = code_weights - else: - self.code_weights = [1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] - - self.bbox_coder = build_bbox_coder(bbox_coder) - self.pc_range = self.bbox_coder.pc_range - self.real_w = self.pc_range[3] - self.pc_range[0] - self.real_h = self.pc_range[4] - self.pc_range[1] - self.num_cls_fcs = num_cls_fcs - 1 - super(BEVFormerHead, self).__init__( - *args, transformer=transformer, **kwargs) - self.code_weights = nn.Parameter(torch.tensor( - self.code_weights, requires_grad=False), requires_grad=False) - - def _init_layers(self): - """Initialize classification branch and regression branch of head.""" - cls_branch = [] - for _ in range(self.num_reg_fcs): - cls_branch.append(Linear(self.embed_dims, self.embed_dims)) - cls_branch.append(nn.LayerNorm(self.embed_dims)) - cls_branch.append(nn.ReLU(inplace=True)) - cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) - fc_cls = nn.Sequential(*cls_branch) - - reg_branch = [] - for _ in range(self.num_reg_fcs): - reg_branch.append(Linear(self.embed_dims, self.embed_dims)) - reg_branch.append(nn.ReLU()) - reg_branch.append(Linear(self.embed_dims, self.code_size)) - reg_branch = nn.Sequential(*reg_branch) - - def _get_clones(module, N): - return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) - - # last reg_branch is used to generate proposal from - # encode feature map when as_two_stage is True. - num_pred = (self.transformer.decoder.num_layers + 1) if \ - self.as_two_stage else self.transformer.decoder.num_layers - - if self.with_box_refine: - self.cls_branches = _get_clones(fc_cls, num_pred) - self.reg_branches = _get_clones(reg_branch, num_pred) - else: - self.cls_branches = nn.ModuleList( - [fc_cls for _ in range(num_pred)]) - self.reg_branches = nn.ModuleList( - [reg_branch for _ in range(num_pred)]) - - if not self.as_two_stage: - self.bev_embedding = nn.Embedding( - self.bev_h * self.bev_w, self.embed_dims) - self.query_embedding = nn.Embedding(self.num_query, - self.embed_dims * 2) - - def init_weights(self): - """Initialize weights of the DeformDETR head.""" - self.transformer.init_weights() - if self.loss_cls.use_sigmoid: - bias_init = bias_init_with_prob(0.01) - for m in self.cls_branches: - nn.init.constant_(m[-1].bias, bias_init) - - @auto_fp16(apply_to=('mlvl_feats')) - def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False): - """Forward function. - Args: - mlvl_feats (tuple[Tensor]): Features from the upstream - network, each is a 5D-tensor with shape - (B, N, C, H, W). - prev_bev: previous bev featues - only_bev: only compute BEV features with encoder. - Returns: - all_cls_scores (Tensor): Outputs from the classification head, \ - shape [nb_dec, bs, num_query, cls_out_channels]. Note \ - cls_out_channels should includes background. - all_bbox_preds (Tensor): Sigmoid outputs from the regression \ - head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ - Shape [nb_dec, bs, num_query, 9]. 
- """ - - bs, num_cam, _, _, _ = mlvl_feats[0].shape - dtype = mlvl_feats[0].dtype - object_query_embeds = self.query_embedding.weight.to(dtype) - bev_queries = self.bev_embedding.weight.to(dtype) - - bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), - device=bev_queries.device).to(dtype) - bev_pos = self.positional_encoding(bev_mask).to(dtype) - - if only_bev: # only use encoder to obtain BEV features, TODO: refine the workaround - return self.transformer.get_bev_features( - mlvl_feats, - bev_queries, - self.bev_h, - self.bev_w, - grid_length=(self.real_h / self.bev_h, - self.real_w / self.bev_w), - bev_pos=bev_pos, - img_metas=img_metas, - prev_bev=prev_bev, - ) - else: - outputs = self.transformer( - mlvl_feats, - bev_queries, - object_query_embeds, - self.bev_h, - self.bev_w, - grid_length=(self.real_h / self.bev_h, - self.real_w / self.bev_w), - bev_pos=bev_pos, - reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 - cls_branches=self.cls_branches if self.as_two_stage else None, - img_metas=img_metas, - prev_bev=prev_bev - ) - - bev_embed, hs, init_reference, inter_references = outputs - hs = hs.permute(0, 2, 1, 3) - outputs_classes = [] - outputs_coords = [] - for lvl in range(hs.shape[0]): - if lvl == 0: - reference = init_reference - else: - reference = inter_references[lvl - 1] - reference = inverse_sigmoid(reference) - outputs_class = self.cls_branches[lvl](hs[lvl]) - tmp = self.reg_branches[lvl](hs[lvl]) - - # TODO: check the shape of reference - assert reference.shape[-1] == 3 - tmp[..., 0:2] += reference[..., 0:2] - tmp[..., 0:2] = tmp[..., 0:2].sigmoid() - tmp[..., 4:5] += reference[..., 2:3] - tmp[..., 4:5] = tmp[..., 4:5].sigmoid() - tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - - self.pc_range[0]) + self.pc_range[0]) - tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - - self.pc_range[1]) + self.pc_range[1]) - tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] - - self.pc_range[2]) + self.pc_range[2]) - - # TODO: check if using sigmoid - outputs_coord = tmp - outputs_classes.append(outputs_class) - outputs_coords.append(outputs_coord) - - outputs_classes = torch.stack(outputs_classes) - outputs_coords = torch.stack(outputs_coords) - - outs = { - 'bev_embed': bev_embed, - 'all_cls_scores': outputs_classes, - 'all_bbox_preds': outputs_coords, - 'enc_cls_scores': None, - 'enc_bbox_preds': None, - } - - return outs - - def _get_target_single(self, - cls_score, - bbox_pred, - gt_labels, - gt_bboxes, - gt_bboxes_ignore=None): - """"Compute regression and classification targets for one image. - Outputs from a single decoder layer of a single feature level are used. - Args: - cls_score (Tensor): Box score logits from a single decoder layer - for one image. Shape [num_query, cls_out_channels]. - bbox_pred (Tensor): Sigmoid outputs from a single decoder layer - for one image, with normalized coordinate (cx, cy, w, h) and - shape [num_query, 4]. - gt_bboxes (Tensor): Ground truth bboxes for one image with - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels (Tensor): Ground truth class indices for one image - with shape (num_gts, ). - gt_bboxes_ignore (Tensor, optional): Bounding boxes - which can be ignored. Default None. - Returns: - tuple[Tensor]: a tuple containing the following for one image. - - labels (Tensor): Labels of each image. - - label_weights (Tensor]): Label weights of each image. - - bbox_targets (Tensor): BBox targets of each image. - - bbox_weights (Tensor): BBox weights of each image. 
- - pos_inds (Tensor): Sampled positive indices for each image. - - neg_inds (Tensor): Sampled negative indices for each image. - """ - - num_bboxes = bbox_pred.size(0) - # assigner and sampler - gt_c = gt_bboxes.shape[-1] - - assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, - gt_labels, gt_bboxes_ignore) - - sampling_result = self.sampler.sample(assign_result, bbox_pred, - gt_bboxes) - pos_inds = sampling_result.pos_inds - neg_inds = sampling_result.neg_inds - - # label targets - labels = gt_bboxes.new_full((num_bboxes,), - self.num_classes, - dtype=torch.long) - labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] - label_weights = gt_bboxes.new_ones(num_bboxes) - - # bbox targets - bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c] - bbox_weights = torch.zeros_like(bbox_pred) - bbox_weights[pos_inds] = 1.0 - - # DETR - bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes - return (labels, label_weights, bbox_targets, bbox_weights, - pos_inds, neg_inds) - - def get_targets(self, - cls_scores_list, - bbox_preds_list, - gt_bboxes_list, - gt_labels_list, - gt_bboxes_ignore_list=None): - """"Compute regression and classification targets for a batch image. - Outputs from a single decoder layer of a single feature level are used. - Args: - cls_scores_list (list[Tensor]): Box score logits from a single - decoder layer for each image with shape [num_query, - cls_out_channels]. - bbox_preds_list (list[Tensor]): Sigmoid outputs from a single - decoder layer for each image, with normalized coordinate - (cx, cy, w, h) and shape [num_query, 4]. - gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_bboxes_ignore_list (list[Tensor], optional): Bounding - boxes which can be ignored for each image. Default None. - Returns: - tuple: a tuple containing the following targets. - - labels_list (list[Tensor]): Labels for all images. - - label_weights_list (list[Tensor]): Label weights for all \ - images. - - bbox_targets_list (list[Tensor]): BBox targets for all \ - images. - - bbox_weights_list (list[Tensor]): BBox weights for all \ - images. - - num_total_pos (int): Number of positive samples in all \ - images. - - num_total_neg (int): Number of negative samples in all \ - images. - """ - assert gt_bboxes_ignore_list is None, \ - 'Only supports for gt_bboxes_ignore setting to None.' - num_imgs = len(cls_scores_list) - gt_bboxes_ignore_list = [ - gt_bboxes_ignore_list for _ in range(num_imgs) - ] - - (labels_list, label_weights_list, bbox_targets_list, - bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( - self._get_target_single, cls_scores_list, bbox_preds_list, - gt_labels_list, gt_bboxes_list, gt_bboxes_ignore_list) - num_total_pos = sum((inds.numel() for inds in pos_inds_list)) - num_total_neg = sum((inds.numel() for inds in neg_inds_list)) - return (labels_list, label_weights_list, bbox_targets_list, - bbox_weights_list, num_total_pos, num_total_neg) - - def loss_single(self, - cls_scores, - bbox_preds, - gt_bboxes_list, - gt_labels_list, - gt_bboxes_ignore_list=None): - """"Loss function for outputs from a single decoder layer of a single - feature level. - Args: - cls_scores (Tensor): Box score logits from a single decoder layer - for all images. Shape [bs, num_query, cls_out_channels]. 
- bbox_preds (Tensor): Sigmoid outputs from a single decoder layer - for all images, with normalized coordinate (cx, cy, w, h) and - shape [bs, num_query, 4]. - gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_bboxes_ignore_list (list[Tensor], optional): Bounding - boxes which can be ignored for each image. Default None. - Returns: - dict[str, Tensor]: A dictionary of loss components for outputs from - a single decoder layer. - """ - num_imgs = cls_scores.size(0) - cls_scores_list = [cls_scores[i] for i in range(num_imgs)] - bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] - cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, - gt_bboxes_list, gt_labels_list, - gt_bboxes_ignore_list) - (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, - num_total_pos, num_total_neg) = cls_reg_targets - labels = torch.cat(labels_list, 0) - label_weights = torch.cat(label_weights_list, 0) - bbox_targets = torch.cat(bbox_targets_list, 0) - bbox_weights = torch.cat(bbox_weights_list, 0) - - # classification loss - cls_scores = cls_scores.reshape(-1, self.cls_out_channels) - # construct weighted avg_factor to match with the official DETR repo - cls_avg_factor = num_total_pos * 1.0 + \ - num_total_neg * self.bg_cls_weight - if self.sync_cls_avg_factor: - cls_avg_factor = reduce_mean( - cls_scores.new_tensor([cls_avg_factor])) - - cls_avg_factor = max(cls_avg_factor, 1) - loss_cls = self.loss_cls( - cls_scores, labels, label_weights, avg_factor=cls_avg_factor) - - # Compute the average number of gt boxes accross all gpus, for - # normalization purposes - num_total_pos = loss_cls.new_tensor([num_total_pos]) - num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() - - # regression L1 loss - bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) - normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) - isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) - bbox_weights = bbox_weights * self.code_weights - - loss_bbox = self.loss_bbox( - bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan, - :10], bbox_weights[isnotnan, :10], - avg_factor=num_total_pos) - if digit_version(TORCH_VERSION) >= digit_version('1.8'): - loss_cls = torch.nan_to_num(loss_cls) - loss_bbox = torch.nan_to_num(loss_bbox) - return loss_cls, loss_bbox - - @force_fp32(apply_to=('preds_dicts')) - def loss(self, - gt_bboxes_list, - gt_labels_list, - preds_dicts, - gt_bboxes_ignore=None, - img_metas=None): - """"Loss function. - Args: - - gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - preds_dicts: - all_cls_scores (Tensor): Classification score of all - decoder layers, has shape - [nb_dec, bs, num_query, cls_out_channels]. - all_bbox_preds (Tensor): Sigmoid regression - outputs of all decode layers. Each is a 4D-tensor with - normalized coordinate format (cx, cy, w, h) and shape - [nb_dec, bs, num_query, 4]. - enc_cls_scores (Tensor): Classification scores of - points on encode feature map , has shape - (N, h*w, num_classes). Only be passed when as_two_stage is - True, otherwise is None. 
- enc_bbox_preds (Tensor): Regression results of each points - on the encode feature map, has shape (N, h*w, 4). Only be - passed when as_two_stage is True, otherwise is None. - gt_bboxes_ignore (list[Tensor], optional): Bounding boxes - which can be ignored for each image. Default None. - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - assert gt_bboxes_ignore is None, \ - f'{self.__class__.__name__} only supports ' \ - f'for gt_bboxes_ignore setting to None.' - - all_cls_scores = preds_dicts['all_cls_scores'] - all_bbox_preds = preds_dicts['all_bbox_preds'] - enc_cls_scores = preds_dicts['enc_cls_scores'] - enc_bbox_preds = preds_dicts['enc_bbox_preds'] - - num_dec_layers = len(all_cls_scores) - device = gt_labels_list[0].device - - gt_bboxes_list = [torch.cat( - (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), - dim=1).to(device) for gt_bboxes in gt_bboxes_list] - - all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] - all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] - all_gt_bboxes_ignore_list = [ - gt_bboxes_ignore for _ in range(num_dec_layers) - ] - - losses_cls, losses_bbox = multi_apply( - self.loss_single, all_cls_scores, all_bbox_preds, - all_gt_bboxes_list, all_gt_labels_list, - all_gt_bboxes_ignore_list) - - loss_dict = dict() - # loss of proposal generated from encode feature map. - if enc_cls_scores is not None: - binary_labels_list = [ - torch.zeros_like(gt_labels_list[i]) - for i in range(len(all_gt_labels_list)) - ] - enc_loss_cls, enc_losses_bbox = \ - self.loss_single(enc_cls_scores, enc_bbox_preds, - gt_bboxes_list, binary_labels_list, gt_bboxes_ignore) - loss_dict['enc_loss_cls'] = enc_loss_cls - loss_dict['enc_loss_bbox'] = enc_losses_bbox - - # loss from the last decoder layer - loss_dict['loss_cls'] = losses_cls[-1] - loss_dict['loss_bbox'] = losses_bbox[-1] - - # loss from other decoder layers - num_dec_layer = 0 - for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], - losses_bbox[:-1]): - loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i - loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i - num_dec_layer += 1 - return loss_dict - - @force_fp32(apply_to=('preds_dicts')) - def get_bboxes(self, preds_dicts, img_metas, rescale=False): - """Generate bboxes from bbox head predictions. - Args: - preds_dicts (tuple[list[dict]]): Prediction results. - img_metas (list[dict]): Point cloud and image's meta info. - Returns: - list[dict]: Decoded bbox, scores and labels after nms. - """ - - preds_dicts = self.bbox_coder.decode(preds_dicts) - - num_samples = len(preds_dicts) - ret_list = [] - for i in range(num_samples): - preds = preds_dicts[i] - bboxes = preds['bboxes'] - - bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 - - code_size = bboxes.shape[-1] - bboxes = img_metas[i]['box_type_3d'](bboxes, code_size) - scores = preds['scores'] - labels = preds['labels'] - - ret_list.append([bboxes, scores, labels]) - - return ret_list +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Linear, bias_init_with_prob +from mmcv.utils import TORCH_VERSION, digit_version + +from mmdet.core import (multi_apply, multi_apply, reduce_mean) +from mmdet.models.utils.transformer import inverse_sigmoid +from mmdet.models import HEADS +from mmdet.models.dense_heads import DETRHead +from mmdet3d.core.bbox.coders import build_bbox_coder +from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +from mmcv.cnn.bricks.transformer import build_positional_encoding +from mmcv.runner import force_fp32, auto_fp16 +from projects.mmdet3d_plugin.models.utils.bricks import run_time +import numpy as np +import mmcv +import cv2 as cv +from projects.mmdet3d_plugin.models.utils.visual import save_tensor + + +@HEADS.register_module() +class BEVFormerHead(DETRHead): + """Head of Detr3D. + Args: + with_box_refine (bool): Whether to refine the reference points + in the decoder. Defaults to False. + as_two_stage (bool) : Whether to generate the proposal from + the outputs of encoder. + transformer (obj:`ConfigDict`): ConfigDict is used for building + the Encoder and Decoder. + bev_h, bev_w (int): spatial shape of BEV queries. + """ + + def __init__(self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=None, + bev_h=30, + bev_w=30, + **kwargs): + + self.bev_h = bev_h + self.bev_w = bev_w + self.fp16_enabled = False + + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + if 'code_size' in kwargs: + self.code_size = kwargs['code_size'] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - self.pc_range[1] + self.num_cls_fcs = num_cls_fcs - 1 + super(BEVFormerHead, self).__init__( + *args, transformer=transformer, **kwargs) + self.code_weights = nn.Parameter(torch.tensor( + self.code_weights, requires_grad=False), requires_grad=False) + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + cls_branch = [] + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + fc_cls = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. 
+ num_pred = (self.transformer.decoder.num_layers + 1) if \ + self.as_two_stage else self.transformer.decoder.num_layers + + if self.with_box_refine: + self.cls_branches = _get_clones(fc_cls, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + else: + self.cls_branches = nn.ModuleList( + [fc_cls for _ in range(num_pred)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(num_pred)]) + + if not self.as_two_stage: + self.bev_embedding = nn.Embedding( + self.bev_h * self.bev_w, self.embed_dims) + self.query_embedding = nn.Embedding(self.num_query, + self.embed_dims * 2) + + def init_weights(self): + """Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + + @auto_fp16(apply_to=('mlvl_feats')) + def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False): + """Forward function. + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 5D-tensor with shape + (B, N, C, H, W). + prev_bev: previous bev featues + only_bev: only compute BEV features with encoder. + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + """ + + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + object_query_embeds = self.query_embedding.weight.to(dtype) + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + + if only_bev: # only use encoder to obtain BEV features, TODO: refine the workaround + return self.transformer.get_bev_features( + mlvl_feats, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + img_metas=img_metas, + prev_bev=prev_bev, + ) + else: + outputs = self.transformer( + mlvl_feats, + bev_queries, + object_query_embeds, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None, + img_metas=img_metas, + prev_bev=prev_bev + ) + + bev_embed, hs, init_reference, inter_references = outputs + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + + # TODO: check the shape of reference + assert reference.shape[-1] == 3 + tmp[..., 0:2] += reference[..., 0:2] + tmp[..., 0:2] = tmp[..., 0:2].sigmoid() + tmp[..., 4:5] += reference[..., 2:3] + tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] - + self.pc_range[2]) + 
self.pc_range[2]) + + # TODO: check if using sigmoid + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + + outs = { + 'bev_embed': bev_embed, + 'all_cls_scores': outputs_classes, + 'all_bbox_preds': outputs_coords, + 'enc_cls_scores': None, + 'enc_bbox_preds': None, + } + + return outs + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_labels, + gt_bboxes, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_c = gt_bboxes.shape[-1] + + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, gt_bboxes_ignore) + + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + return (labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds) + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. 
+ Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_labels_list, gt_bboxes_list, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + + def loss_single(self, + cls_scores, + bbox_preds, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.code_weights + + loss_bbox = self.loss_bbox( + bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan, + :10], bbox_weights[isnotnan, :10], + avg_factor=num_total_pos) + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + return loss_cls, loss_bbox + + @force_fp32(apply_to=('preds_dicts')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + preds_dicts, + gt_bboxes_ignore=None, + img_metas=None): + """"Loss function. + Args: + + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. + enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' 
+ + all_cls_scores = preds_dicts['all_cls_scores'] + all_bbox_preds = preds_dicts['all_bbox_preds'] + enc_cls_scores = preds_dicts['enc_cls_scores'] + enc_bbox_preds = preds_dicts['enc_bbox_preds'] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + + gt_bboxes_list = [torch.cat( + (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), + dim=1).to(device) for gt_bboxes in gt_bboxes_list] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + losses_cls, losses_bbox = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds, + all_gt_bboxes_list, all_gt_labels_list, + all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + binary_labels_list = [ + torch.zeros_like(gt_labels_list[i]) + for i in range(len(all_gt_labels_list)) + ] + enc_loss_cls, enc_losses_bbox = \ + self.loss_single(enc_cls_scores, enc_bbox_preds, + gt_bboxes_list, binary_labels_list, gt_bboxes_ignore) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], + losses_bbox[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + num_dec_layer += 1 + return loss_dict + + @force_fp32(apply_to=('preds_dicts')) + def get_bboxes(self, preds_dicts, img_metas, rescale=False): + """Generate bboxes from bbox head predictions. + Args: + preds_dicts (tuple[list[dict]]): Prediction results. + img_metas (list[dict]): Point cloud and image's meta info. + Returns: + list[dict]: Decoded bbox, scores and labels after nms. 
+ """ + + preds_dicts = self.bbox_coder.decode(preds_dicts) + + num_samples = len(preds_dicts) + ret_list = [] + for i in range(num_samples): + preds = preds_dicts[i] + bboxes = preds['bboxes'] + + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + + code_size = bboxes.shape[-1] + bboxes = img_metas[i]['box_type_3d'](bboxes, code_size) + scores = preds['scores'] + labels = preds['labels'] + + ret_list.append([bboxes, scores, labels]) + + return ret_list diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/detectors/__init__.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/detectors/__init__.py index 4c39fd341..78146f064 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/detectors/__init__.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/detectors/__init__.py @@ -1,2 +1,2 @@ -from .bevformer import BEVFormer +from .bevformer import BEVFormer from .bevformer_fp16 import BEVFormer_fp16 \ No newline at end of file diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py index 6901178a5..249fc9e27 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py @@ -1,293 +1,293 @@ -# --------------------------------------------- -# Copyright (c) OpenMMLab. All rights reserved. -# --------------------------------------------- -# Modified by Zhiqi Li -# --------------------------------------------- - -#from tkinter.messagebox import NO -import torch -from mmcv.runner import force_fp32, auto_fp16 -from mmdet.models import DETECTORS -from mmdet3d.core import bbox3d2result -from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector -from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask -import time -import copy -import numpy as np -import mmdet3d -from projects.mmdet3d_plugin.models.utils.bricks import run_time - - -@DETECTORS.register_module() -class BEVFormer(MVXTwoStageDetector): - """BEVFormer. - Args: - video_test_mode (bool): Decide whether to use temporal information during inference. 
- """ - - def __init__(self, - use_grid_mask=False, - pts_voxel_layer=None, - pts_voxel_encoder=None, - pts_middle_encoder=None, - pts_fusion_layer=None, - img_backbone=None, - pts_backbone=None, - img_neck=None, - pts_neck=None, - pts_bbox_head=None, - img_roi_head=None, - img_rpn_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - video_test_mode=False - ): - - super(BEVFormer, - self).__init__(pts_voxel_layer, pts_voxel_encoder, - pts_middle_encoder, pts_fusion_layer, - img_backbone, pts_backbone, img_neck, pts_neck, - pts_bbox_head, img_roi_head, img_rpn_head, - train_cfg, test_cfg, pretrained) - self.grid_mask = GridMask( - True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) - self.use_grid_mask = use_grid_mask - self.fp16_enabled = False - - # temporal - self.video_test_mode = video_test_mode - self.prev_frame_info = { - 'prev_bev': None, - 'scene_token': None, - 'prev_pos': 0, - 'prev_angle': 0, - } - - - def extract_img_feat(self, img, img_metas, len_queue=None): - """Extract features of images.""" - B = img.size(0) - if img is not None: - - # input_shape = img.shape[-2:] - # # update real input shape of each single img - # for img_meta in img_metas: - # img_meta.update(input_shape=input_shape) - - if img.dim() == 5 and img.size(0) == 1: - img.squeeze_() - elif img.dim() == 5 and img.size(0) > 1: - B, N, C, H, W = img.size() - img = img.reshape(B * N, C, H, W) - if self.use_grid_mask: - img = self.grid_mask(img) - - img_feats = self.img_backbone(img) - if isinstance(img_feats, dict): - img_feats = list(img_feats.values()) - else: - return None - if self.with_img_neck: - img_feats = self.img_neck(img_feats) - - img_feats_reshaped = [] - for img_feat in img_feats: - BN, C, H, W = img_feat.size() - if len_queue is not None: - img_feats_reshaped.append(img_feat.view(int(B/len_queue), len_queue, int(BN / B), C, H, W)) - else: - img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W)) - return img_feats_reshaped - - @auto_fp16(apply_to=('img')) - def extract_feat(self, img, img_metas=None, len_queue=None): - """Extract features from images and points.""" - - img_feats = self.extract_img_feat(img, img_metas, len_queue=len_queue) - - return img_feats - - - def forward_pts_train(self, - pts_feats, - gt_bboxes_3d, - gt_labels_3d, - img_metas, - gt_bboxes_ignore=None, - prev_bev=None): - """Forward function' - Args: - pts_feats (list[torch.Tensor]): Features of point cloud branch - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - boxes for each sample. - gt_labels_3d (list[torch.Tensor]): Ground truth labels for - boxes of each sampole - img_metas (list[dict]): Meta information of samples. - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - boxes to be ignored. Defaults to None. - prev_bev (torch.Tensor, optional): BEV features of previous frame. - Returns: - dict: Losses of each branch. - """ - - outs = self.pts_bbox_head( - pts_feats, img_metas, prev_bev) - loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] - losses = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas) - return losses - - def forward_dummy(self, img): - dummy_metas = None - return self.forward_test(img=img, img_metas=[[dummy_metas]]) - - def forward(self, return_loss=True, **kwargs): - """Calls either forward_train or forward_test depending on whether - return_loss=True. - Note this setting will change the expected inputs. When - `return_loss=True`, img and img_metas are single-nested (i.e. 
- torch.Tensor and list[dict]), and when `resturn_loss=False`, img and - img_metas should be double nested (i.e. list[torch.Tensor], - list[list[dict]]), with the outer list indicating test time - augmentations. - """ - if return_loss: - return self.forward_train(**kwargs) - else: - return self.forward_test(**kwargs) - - def obtain_history_bev(self, imgs_queue, img_metas_list): - """Obtain history BEV features iteratively. To save GPU memory, gradients are not calculated. - """ - self.eval() - - with torch.no_grad(): - prev_bev = None - bs, len_queue, num_cams, C, H, W = imgs_queue.shape - imgs_queue = imgs_queue.reshape(bs*len_queue, num_cams, C, H, W) - img_feats_list = self.extract_feat(img=imgs_queue, len_queue=len_queue) - for i in range(len_queue): - img_metas = [each[i] for each in img_metas_list] - if not img_metas[0]['prev_bev_exists']: - prev_bev = None - # img_feats = self.extract_feat(img=img, img_metas=img_metas) - img_feats = [each_scale[:, i] for each_scale in img_feats_list] - prev_bev = self.pts_bbox_head( - img_feats, img_metas, prev_bev, only_bev=True) - self.train() - return prev_bev - - @auto_fp16(apply_to=('img', 'points')) - def forward_train(self, - points=None, - img_metas=None, - gt_bboxes_3d=None, - gt_labels_3d=None, - gt_labels=None, - gt_bboxes=None, - img=None, - proposals=None, - gt_bboxes_ignore=None, - img_depth=None, - img_mask=None, - ): - """Forward training function. - Args: - points (list[torch.Tensor], optional): Points of each sample. - Defaults to None. - img_metas (list[dict], optional): Meta information of each sample. - Defaults to None. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): - Ground truth 3D boxes. Defaults to None. - gt_labels_3d (list[torch.Tensor], optional): Ground truth labels - of 3D boxes. Defaults to None. - gt_labels (list[torch.Tensor], optional): Ground truth labels - of 2D boxes in images. Defaults to None. - gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in - images. Defaults to None. - img (torch.Tensor optional): Images of each sample with shape - (N, C, H, W). Defaults to None. - proposals ([list[torch.Tensor], optional): Predicted proposals - used for training Fast RCNN. Defaults to None. - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - 2D boxes in images to be ignored. Defaults to None. - Returns: - dict: Losses of different branches. - """ - - len_queue = img.size(1) - prev_img = img[:, :-1, ...] - img = img[:, -1, ...] 
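# A minimal sketch of the temporal-queue split performed just above: the first
# len_queue - 1 frames are used only to build prev_bev, while the last frame is
# the one trained on. Batch size, queue length, camera count and image size are
# assumed toy values for illustration only.
import torch

B, len_queue, num_cams, C, H, W = 1, 3, 6, 3, 32, 32   # assumed toy shapes
img = torch.zeros(B, len_queue, num_cams, C, H, W)

prev_img = img[:, :-1, ...]    # history frames -> obtain_history_bev
cur_img = img[:, -1, ...]      # current frame -> gradient-carrying forward pass
print(prev_img.shape)          # torch.Size([1, 2, 6, 3, 32, 32])
print(cur_img.shape)           # torch.Size([1, 6, 3, 32, 32])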
- - prev_img_metas = copy.deepcopy(img_metas) - prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) - - img_metas = [each[len_queue-1] for each in img_metas] - if not img_metas[0]['prev_bev_exists']: - prev_bev = None - img_feats = self.extract_feat(img=img, img_metas=img_metas) - losses = dict() - losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, - gt_labels_3d, img_metas, - gt_bboxes_ignore, prev_bev) - - losses.update(losses_pts) - return losses - - def forward_test(self, img_metas, img=None, **kwargs): - for var, name in [(img_metas, 'img_metas')]: - if not isinstance(var, list): - raise TypeError('{} must be a list, but got {}'.format( - name, type(var))) - img = [img] if img is None else img - - if img_metas[0][0]['scene_token'] != self.prev_frame_info['scene_token']: - # the first sample of each scene is truncated - self.prev_frame_info['prev_bev'] = None - # update idx - self.prev_frame_info['scene_token'] = img_metas[0][0]['scene_token'] - - # do not use temporal information - if not self.video_test_mode: - self.prev_frame_info['prev_bev'] = None - - # Get the delta of ego position and angle between two timestamps. - tmp_pos = copy.deepcopy(img_metas[0][0]['can_bus'][:3]) - tmp_angle = copy.deepcopy(img_metas[0][0]['can_bus'][-1]) - if self.prev_frame_info['prev_bev'] is not None: - img_metas[0][0]['can_bus'][:3] -= self.prev_frame_info['prev_pos'] - img_metas[0][0]['can_bus'][-1] -= self.prev_frame_info['prev_angle'] - else: - img_metas[0][0]['can_bus'][-1] = 0 - img_metas[0][0]['can_bus'][:3] = 0 - - new_prev_bev, bbox_results = self.simple_test( - img_metas[0], img[0], prev_bev=self.prev_frame_info['prev_bev'], **kwargs) - # During inference, we save the BEV features and ego motion of each timestamp. - self.prev_frame_info['prev_pos'] = tmp_pos - self.prev_frame_info['prev_angle'] = tmp_angle - self.prev_frame_info['prev_bev'] = new_prev_bev - return bbox_results - - def simple_test_pts(self, x, img_metas, prev_bev=None, rescale=False): - """Test function""" - outs = self.pts_bbox_head(x, img_metas, prev_bev=prev_bev) - - bbox_list = self.pts_bbox_head.get_bboxes( - outs, img_metas, rescale=rescale) - bbox_results = [ - bbox3d2result(bboxes, scores, labels) - for bboxes, scores, labels in bbox_list - ] - return outs['bev_embed'], bbox_results - - def simple_test(self, img_metas, img=None, prev_bev=None, rescale=False): - """Test function without augmentaiton.""" - img_feats = self.extract_feat(img=img, img_metas=img_metas) - - bbox_list = [dict() for i in range(len(img_metas))] - new_prev_bev, bbox_pts = self.simple_test_pts( - img_feats, img_metas, prev_bev, rescale=rescale) - for result_dict, pts_bbox in zip(bbox_list, bbox_pts): - result_dict['pts_bbox'] = pts_bbox - return new_prev_bev, bbox_list +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +#from tkinter.messagebox import NO +import torch +from mmcv.runner import force_fp32, auto_fp16 +from mmdet.models import DETECTORS +from mmdet3d.core import bbox3d2result +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector +from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask +import time +import copy +import numpy as np +import mmdet3d +from projects.mmdet3d_plugin.models.utils.bricks import run_time + + +@DETECTORS.register_module() +class BEVFormer(MVXTwoStageDetector): + """BEVFormer. 
+ Args: + video_test_mode (bool): Decide whether to use temporal information during inference. + """ + + def __init__(self, + use_grid_mask=False, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + video_test_mode=False + ): + + super(BEVFormer, + self).__init__(pts_voxel_layer, pts_voxel_encoder, + pts_middle_encoder, pts_fusion_layer, + img_backbone, pts_backbone, img_neck, pts_neck, + pts_bbox_head, img_roi_head, img_rpn_head, + train_cfg, test_cfg, pretrained) + self.grid_mask = GridMask( + True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) + self.use_grid_mask = use_grid_mask + self.fp16_enabled = False + + # temporal + self.video_test_mode = video_test_mode + self.prev_frame_info = { + 'prev_bev': None, + 'scene_token': None, + 'prev_pos': 0, + 'prev_angle': 0, + } + + + def extract_img_feat(self, img, img_metas, len_queue=None): + """Extract features of images.""" + B = img.size(0) + if img is not None: + + # input_shape = img.shape[-2:] + # # update real input shape of each single img + # for img_meta in img_metas: + # img_meta.update(input_shape=input_shape) + + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.reshape(B * N, C, H, W) + if self.use_grid_mask: + img = self.grid_mask(img) + + img_feats = self.img_backbone(img) + if isinstance(img_feats, dict): + img_feats = list(img_feats.values()) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + img_feats_reshaped = [] + for img_feat in img_feats: + BN, C, H, W = img_feat.size() + if len_queue is not None: + img_feats_reshaped.append(img_feat.view(int(B/len_queue), len_queue, int(BN / B), C, H, W)) + else: + img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W)) + return img_feats_reshaped + + @auto_fp16(apply_to=('img')) + def extract_feat(self, img, img_metas=None, len_queue=None): + """Extract features from images and points.""" + + img_feats = self.extract_img_feat(img, img_metas, len_queue=len_queue) + + return img_feats + + + def forward_pts_train(self, + pts_feats, + gt_bboxes_3d, + gt_labels_3d, + img_metas, + gt_bboxes_ignore=None, + prev_bev=None): + """Forward function' + Args: + pts_feats (list[torch.Tensor]): Features of point cloud branch + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + img_metas (list[dict]): Meta information of samples. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + prev_bev (torch.Tensor, optional): BEV features of previous frame. + Returns: + dict: Losses of each branch. + """ + + outs = self.pts_bbox_head( + pts_feats, img_metas, prev_bev) + loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] + losses = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas) + return losses + + def forward_dummy(self, img): + dummy_metas = None + return self.forward_test(img=img, img_metas=[[dummy_metas]]) + + def forward(self, return_loss=True, **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + Note this setting will change the expected inputs. 
When + `return_loss=True`, img and img_metas are single-nested (i.e. + torch.Tensor and list[dict]), and when `resturn_loss=False`, img and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def obtain_history_bev(self, imgs_queue, img_metas_list): + """Obtain history BEV features iteratively. To save GPU memory, gradients are not calculated. + """ + self.eval() + + with torch.no_grad(): + prev_bev = None + bs, len_queue, num_cams, C, H, W = imgs_queue.shape + imgs_queue = imgs_queue.reshape(bs*len_queue, num_cams, C, H, W) + img_feats_list = self.extract_feat(img=imgs_queue, len_queue=len_queue) + for i in range(len_queue): + img_metas = [each[i] for each in img_metas_list] + if not img_metas[0]['prev_bev_exists']: + prev_bev = None + # img_feats = self.extract_feat(img=img, img_metas=img_metas) + img_feats = [each_scale[:, i] for each_scale in img_feats_list] + prev_bev = self.pts_bbox_head( + img_feats, img_metas, prev_bev, only_bev=True) + self.train() + return prev_bev + + @auto_fp16(apply_to=('img', 'points')) + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + img_depth=None, + img_mask=None, + ): + """Forward training function. + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + Returns: + dict: Losses of different branches. + """ + + len_queue = img.size(1) + prev_img = img[:, :-1, ...] + img = img[:, -1, ...] 
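# A minimal sketch of the eval()/no_grad()/train() pattern that obtain_history_bev
# (called just below) relies on, shown on a toy module; the module and shapes here
# are assumptions for illustration only.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 4), nn.BatchNorm1d(4))

model.eval()                    # freeze BatchNorm statistics while replaying history
with torch.no_grad():           # history features carry no gradients
    prev_feat = model(torch.randn(2, 4))
model.train()                   # restore training behaviour for the current frame

print(prev_feat.requires_grad)  # False: cheap to cache and feed back as prev_bev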
+ + prev_img_metas = copy.deepcopy(img_metas) + prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) + + img_metas = [each[len_queue-1] for each in img_metas] + if not img_metas[0]['prev_bev_exists']: + prev_bev = None + img_feats = self.extract_feat(img=img, img_metas=img_metas) + losses = dict() + losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, + gt_labels_3d, img_metas, + gt_bboxes_ignore, prev_bev) + + losses.update(losses_pts) + return losses + + def forward_test(self, img_metas, img=None, **kwargs): + for var, name in [(img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + img = [img] if img is None else img + + if img_metas[0][0]['scene_token'] != self.prev_frame_info['scene_token']: + # the first sample of each scene is truncated + self.prev_frame_info['prev_bev'] = None + # update idx + self.prev_frame_info['scene_token'] = img_metas[0][0]['scene_token'] + + # do not use temporal information + if not self.video_test_mode: + self.prev_frame_info['prev_bev'] = None + + # Get the delta of ego position and angle between two timestamps. + tmp_pos = copy.deepcopy(img_metas[0][0]['can_bus'][:3]) + tmp_angle = copy.deepcopy(img_metas[0][0]['can_bus'][-1]) + if self.prev_frame_info['prev_bev'] is not None: + img_metas[0][0]['can_bus'][:3] -= self.prev_frame_info['prev_pos'] + img_metas[0][0]['can_bus'][-1] -= self.prev_frame_info['prev_angle'] + else: + img_metas[0][0]['can_bus'][-1] = 0 + img_metas[0][0]['can_bus'][:3] = 0 + + new_prev_bev, bbox_results = self.simple_test( + img_metas[0], img[0], prev_bev=self.prev_frame_info['prev_bev'], **kwargs) + # During inference, we save the BEV features and ego motion of each timestamp. + self.prev_frame_info['prev_pos'] = tmp_pos + self.prev_frame_info['prev_angle'] = tmp_angle + self.prev_frame_info['prev_bev'] = new_prev_bev + return bbox_results + + def simple_test_pts(self, x, img_metas, prev_bev=None, rescale=False): + """Test function""" + outs = self.pts_bbox_head(x, img_metas, prev_bev=prev_bev) + + bbox_list = self.pts_bbox_head.get_bboxes( + outs, img_metas, rescale=rescale) + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return outs['bev_embed'], bbox_results + + def simple_test(self, img_metas, img=None, prev_bev=None, rescale=False): + """Test function without augmentaiton.""" + img_feats = self.extract_feat(img=img, img_metas=img_metas) + + bbox_list = [dict() for i in range(len(img_metas))] + new_prev_bev, bbox_pts = self.simple_test_pts( + img_feats, img_metas, prev_bev, rescale=rescale) + for result_dict, pts_bbox in zip(bbox_list, bbox_pts): + result_dict['pts_bbox'] = pts_bbox + return new_prev_bev, bbox_list diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py index 370393706..5fb4ff87a 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py @@ -1,89 +1,89 @@ -# --------------------------------------------- -# Copyright (c) OpenMMLab. All rights reserved. 
-# --------------------------------------------- -# Modified by Zhiqi Li -# --------------------------------------------- - -#from tkinter.messagebox import NO -import torch -from mmcv.runner import force_fp32, auto_fp16 -from mmdet.models import DETECTORS -from mmdet3d.core import bbox3d2result -from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector -from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask -from projects.mmdet3d_plugin.bevformer.detectors.bevformer import BEVFormer -import time -import copy -import numpy as np -import mmdet3d -from projects.mmdet3d_plugin.models.utils.bricks import run_time - - -@DETECTORS.register_module() -class BEVFormer_fp16(BEVFormer): - """ - The default version BEVFormer currently can not support FP16. - We provide this version to resolve this issue. - """ - - @auto_fp16(apply_to=('img', 'prev_bev', 'points')) - def forward_train(self, - points=None, - img_metas=None, - gt_bboxes_3d=None, - gt_labels_3d=None, - gt_labels=None, - gt_bboxes=None, - img=None, - proposals=None, - gt_bboxes_ignore=None, - img_depth=None, - img_mask=None, - prev_bev=None, - ): - """Forward training function. - Args: - points (list[torch.Tensor], optional): Points of each sample. - Defaults to None. - img_metas (list[dict], optional): Meta information of each sample. - Defaults to None. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): - Ground truth 3D boxes. Defaults to None. - gt_labels_3d (list[torch.Tensor], optional): Ground truth labels - of 3D boxes. Defaults to None. - gt_labels (list[torch.Tensor], optional): Ground truth labels - of 2D boxes in images. Defaults to None. - gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in - images. Defaults to None. - img (torch.Tensor optional): Images of each sample with shape - (N, C, H, W). Defaults to None. - proposals ([list[torch.Tensor], optional): Predicted proposals - used for training Fast RCNN. Defaults to None. - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - 2D boxes in images to be ignored. Defaults to None. - Returns: - dict: Losses of different branches. - """ - - img_feats = self.extract_feat(img=img, img_metas=img_metas) - - losses = dict() - losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, - gt_labels_3d, img_metas, - gt_bboxes_ignore, prev_bev=prev_bev) - losses.update(losses_pts) - return losses - - - def val_step(self, data, optimizer): - """ - In BEVFormer_fp16, we use this `val_step` function to inference the `prev_pev`. - This is not the standard function of `val_step`. - """ - - img = data['img'] - img_metas = data['img_metas'] - img_feats = self.extract_feat(img=img, img_metas=img_metas) - prev_bev = data.get('prev_bev', None) - prev_bev = self.pts_bbox_head(img_feats, img_metas, prev_bev=prev_bev, only_bev=True) - return prev_bev +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +#from tkinter.messagebox import NO +import torch +from mmcv.runner import force_fp32, auto_fp16 +from mmdet.models import DETECTORS +from mmdet3d.core import bbox3d2result +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector +from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask +from projects.mmdet3d_plugin.bevformer.detectors.bevformer import BEVFormer +import time +import copy +import numpy as np +import mmdet3d +from projects.mmdet3d_plugin.models.utils.bricks import run_time + + +@DETECTORS.register_module() +class BEVFormer_fp16(BEVFormer): + """ + The default version BEVFormer currently can not support FP16. + We provide this version to resolve this issue. + """ + + @auto_fp16(apply_to=('img', 'prev_bev', 'points')) + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + img_depth=None, + img_mask=None, + prev_bev=None, + ): + """Forward training function. + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + Returns: + dict: Losses of different branches. + """ + + img_feats = self.extract_feat(img=img, img_metas=img_metas) + + losses = dict() + losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, + gt_labels_3d, img_metas, + gt_bboxes_ignore, prev_bev=prev_bev) + losses.update(losses_pts) + return losses + + + def val_step(self, data, optimizer): + """ + In BEVFormer_fp16, we use this `val_step` function to inference the `prev_pev`. + This is not the standard function of `val_step`. 
+ """ + + img = data['img'] + img_metas = data['img_metas'] + img_feats = self.extract_feat(img=img, img_metas=img_metas) + prev_bev = data.get('prev_bev', None) + prev_bev = self.pts_bbox_head(img_feats, img_metas, prev_bev=prev_bev, only_bev=True) + return prev_bev diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py index 091738a09..a98ad1cb8 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py @@ -1,14 +1,14 @@ -from mmcv.runner.hooks.hook import HOOKS, Hook -from projects.mmdet3d_plugin.models.utils import run_time - - -@HOOKS.register_module() -class TransferWeight(Hook): - - def __init__(self, every_n_inters=1): - self.every_n_inters=every_n_inters - - def after_train_iter(self, runner): - if self.every_n_inner_iters(runner, self.every_n_inters): - runner.eval_model.load_state_dict(runner.model.state_dict()) - +from mmcv.runner.hooks.hook import HOOKS, Hook +from projects.mmdet3d_plugin.models.utils import run_time + + +@HOOKS.register_module() +class TransferWeight(Hook): + + def __init__(self, every_n_inters=1): + self.every_n_inters=every_n_inters + + def after_train_iter(self, runner): + if self.every_n_inner_iters(runner, self.every_n_inters): + runner.eval_model.load_state_dict(runner.model.state_dict()) + diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/__init__.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/__init__.py index 1bb5e04c7..555cadaf2 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/__init__.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/__init__.py @@ -1,6 +1,6 @@ -from .transformer import PerceptionTransformer -from .spatial_cross_attention import SpatialCrossAttention, MSDeformableAttention3D -from .temporal_self_attention import TemporalSelfAttention -from .encoder import BEVFormerEncoder, BEVFormerLayer -from .decoder import DetectionTransformerDecoder - +from .transformer import PerceptionTransformer +from .spatial_cross_attention import SpatialCrossAttention, MSDeformableAttention3D +from .temporal_self_attention import TemporalSelfAttention +from .encoder import BEVFormerEncoder, BEVFormerLayer +from .decoder import DetectionTransformerDecoder + diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py index a5d994cda..b7b088f39 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py @@ -1,260 +1,260 @@ -# --------------------------------------------- -# Copyright (c) OpenMMLab. All rights reserved. 
-# --------------------------------------------- -# Modified by Zhiqi Li -# --------------------------------------------- - -import copy -import warnings - -import torch -import torch.nn as nn - -from mmcv import ConfigDict, deprecated_api_warning -from mmcv.cnn import Linear, build_activation_layer, build_norm_layer -from mmcv.runner.base_module import BaseModule, ModuleList, Sequential - -from mmcv.cnn.bricks.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, - TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) - -# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file -try: - from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 - warnings.warn( - ImportWarning( - '``MultiScaleDeformableAttention`` has been moved to ' - '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 - '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 - 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 - )) -except ImportError: - warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' - '``mmcv.ops.multi_scale_deform_attn``, ' - 'You should install ``mmcv-full`` if you need this module. ') -from mmcv.cnn.bricks.transformer import build_feedforward_network, build_attention - - -@TRANSFORMER_LAYER.register_module() -class MyCustomBaseTransformerLayer(BaseModule): - """Base `TransformerLayer` for vision transformer. - It can be built from `mmcv.ConfigDict` and support more flexible - customization, for example, using any number of `FFN or LN ` and - use different kinds of `attention` by specifying a list of `ConfigDict` - named `attn_cfgs`. It is worth mentioning that it supports `prenorm` - when you specifying `norm` as the first element of `operation_order`. - More details about the `prenorm`: `On Layer Normalization in the - Transformer Architecture `_ . - Args: - attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): - Configs for `self_attention` or `cross_attention` modules, - The order of the configs in the list should be consistent with - corresponding attentions in operation_order. - If it is a dict, all of the attention modules in operation_order - will be built with this config. Default: None. - ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): - Configs for FFN, The order of the configs in the list should be - consistent with corresponding ffn in operation_order. - If it is a dict, all of the attention modules in operation_order - will be built with this config. - operation_order (tuple[str]): The execution order of operation - in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). - Support `prenorm` when you specifying first element as `norm`. - Default:None. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='LN'). - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - batch_first (bool): Key, Query and Value are shape - of (batch, n, embed_dim) - or (n, batch, embed_dim). Default to False. 
- """ - - def __init__(self, - attn_cfgs=None, - ffn_cfgs=dict( - type='FFN', - embed_dims=256, - feedforward_channels=1024, - num_fcs=2, - ffn_drop=0., - act_cfg=dict(type='ReLU', inplace=True), - ), - operation_order=None, - norm_cfg=dict(type='LN'), - init_cfg=None, - batch_first=True, - **kwargs): - - deprecated_args = dict( - feedforward_channels='feedforward_channels', - ffn_dropout='ffn_drop', - ffn_num_fcs='num_fcs') - for ori_name, new_name in deprecated_args.items(): - if ori_name in kwargs: - warnings.warn( - f'The arguments `{ori_name}` in BaseTransformerLayer ' - f'has been deprecated, now you should set `{new_name}` ' - f'and other FFN related arguments ' - f'to a dict named `ffn_cfgs`. ') - ffn_cfgs[new_name] = kwargs[ori_name] - - super(MyCustomBaseTransformerLayer, self).__init__(init_cfg) - - self.batch_first = batch_first - - assert set(operation_order) & set( - ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ - set(operation_order), f'The operation_order of' \ - f' {self.__class__.__name__} should ' \ - f'contains all four operation type ' \ - f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" - - num_attn = operation_order.count('self_attn') + operation_order.count( - 'cross_attn') - if isinstance(attn_cfgs, dict): - attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] - else: - assert num_attn == len(attn_cfgs), f'The length ' \ - f'of attn_cfg {num_attn} is ' \ - f'not consistent with the number of attention' \ - f'in operation_order {operation_order}.' - - self.num_attn = num_attn - self.operation_order = operation_order - self.norm_cfg = norm_cfg - self.pre_norm = operation_order[0] == 'norm' - self.attentions = ModuleList() - - index = 0 - for operation_name in operation_order: - if operation_name in ['self_attn', 'cross_attn']: - if 'batch_first' in attn_cfgs[index]: - assert self.batch_first == attn_cfgs[index]['batch_first'] - else: - attn_cfgs[index]['batch_first'] = self.batch_first - attention = build_attention(attn_cfgs[index]) - # Some custom attentions used as `self_attn` - # or `cross_attn` can have different behavior. - attention.operation_name = operation_name - self.attentions.append(attention) - index += 1 - - self.embed_dims = self.attentions[0].embed_dims - - self.ffns = ModuleList() - num_ffns = operation_order.count('ffn') - if isinstance(ffn_cfgs, dict): - ffn_cfgs = ConfigDict(ffn_cfgs) - if isinstance(ffn_cfgs, dict): - ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] - assert len(ffn_cfgs) == num_ffns - for ffn_index in range(num_ffns): - if 'embed_dims' not in ffn_cfgs[ffn_index]: - ffn_cfgs['embed_dims'] = self.embed_dims - else: - assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims - - self.ffns.append( - build_feedforward_network(ffn_cfgs[ffn_index])) - - self.norms = ModuleList() - num_norms = operation_order.count('norm') - for _ in range(num_norms): - self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) - - def forward(self, - query, - key=None, - value=None, - query_pos=None, - key_pos=None, - attn_masks=None, - query_key_padding_mask=None, - key_padding_mask=None, - **kwargs): - """Forward function for `TransformerDecoderLayer`. - **kwargs contains some specific arguments of attentions. - Args: - query (Tensor): The input query with shape - [num_queries, bs, embed_dims] if - self.batch_first is False, else - [bs, num_queries embed_dims]. - key (Tensor): The key tensor with shape [num_keys, bs, - embed_dims] if self.batch_first is False, else - [bs, num_keys, embed_dims] . 
- value (Tensor): The value tensor with same shape as `key`. - query_pos (Tensor): The positional encoding for `query`. - Default: None. - key_pos (Tensor): The positional encoding for `key`. - Default: None. - attn_masks (List[Tensor] | None): 2D Tensor used in - calculation of corresponding attention. The length of - it should equal to the number of `attention` in - `operation_order`. Default: None. - query_key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_queries]. Only used in `self_attn` layer. - Defaults to None. - key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_keys]. Default: None. - Returns: - Tensor: forwarded results with shape [num_queries, bs, embed_dims]. - """ - - norm_index = 0 - attn_index = 0 - ffn_index = 0 - identity = query - if attn_masks is None: - attn_masks = [None for _ in range(self.num_attn)] - elif isinstance(attn_masks, torch.Tensor): - attn_masks = [ - copy.deepcopy(attn_masks) for _ in range(self.num_attn) - ] - warnings.warn(f'Use same attn_mask in all attentions in ' - f'{self.__class__.__name__} ') - else: - assert len(attn_masks) == self.num_attn, f'The length of ' \ - f'attn_masks {len(attn_masks)} must be equal ' \ - f'to the number of attention in ' \ - f'operation_order {self.num_attn}' - - for layer in self.operation_order: - if layer == 'self_attn': - temp_key = temp_value = query - query = self.attentions[attn_index]( - query, - temp_key, - temp_value, - identity if self.pre_norm else None, - query_pos=query_pos, - key_pos=query_pos, - attn_mask=attn_masks[attn_index], - key_padding_mask=query_key_padding_mask, - **kwargs) - attn_index += 1 - identity = query - - elif layer == 'norm': - query = self.norms[norm_index](query) - norm_index += 1 - - elif layer == 'cross_attn': - query = self.attentions[attn_index]( - query, - key, - value, - identity if self.pre_norm else None, - query_pos=query_pos, - key_pos=key_pos, - attn_mask=attn_masks[attn_index], - key_padding_mask=key_padding_mask, - **kwargs) - attn_index += 1 - identity = query - - elif layer == 'ffn': - query = self.ffns[ffn_index]( - query, identity if self.pre_norm else None) - ffn_index += 1 - - return query +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import copy +import warnings + +import torch +import torch.nn as nn + +from mmcv import ConfigDict, deprecated_api_warning +from mmcv.cnn import Linear, build_activation_layer, build_norm_layer +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + +from mmcv.cnn.bricks.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, + TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) + +# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file +try: + from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 + warnings.warn( + ImportWarning( + '``MultiScaleDeformableAttention`` has been moved to ' + '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 + '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 + 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 + )) +except ImportError: + warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' + '``mmcv.ops.multi_scale_deform_attn``, ' + 'You should install ``mmcv-full`` if you need this module. ') +from mmcv.cnn.bricks.transformer import build_feedforward_network, build_attention + + +@TRANSFORMER_LAYER.register_module() +class MyCustomBaseTransformerLayer(BaseModule): + """Base `TransformerLayer` for vision transformer. + It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. 
+ """ + + def __init__(self, + attn_cfgs=None, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=True, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. ') + ffn_cfgs[new_name] = kwargs[ori_name] + + super(MyCustomBaseTransformerLayer, self).__init__(init_cfg) + + self.batch_first = batch_first + + assert set(operation_order) & set( + ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ + set(operation_order), f'The operation_order of' \ + f' {self.__class__.__name__} should ' \ + f'contains all four operation type ' \ + f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" + + num_attn = operation_order.count('self_attn') + operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' + + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. + attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index])) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + **kwargs contains some specific arguments of attentions. + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . 
+ value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. + """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/decoder.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/decoder.py index 33024f86a..d022eb714 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/decoder.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/decoder.py @@ -1,345 +1,345 @@ -# --------------------------------------------- -# Copyright (c) OpenMMLab. All rights reserved. 
-# --------------------------------------------- -# Modified by Zhiqi Li -# --------------------------------------------- - -from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch -import mmcv -import cv2 as cv -import copy -import warnings -from matplotlib import pyplot as plt -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import xavier_init, constant_init -from mmcv.cnn.bricks.registry import (ATTENTION, - TRANSFORMER_LAYER_SEQUENCE) -from mmcv.cnn.bricks.transformer import TransformerLayerSequence -import math -from mmcv.runner.base_module import BaseModule, ModuleList, Sequential -from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, - to_2tuple) - -from mmcv.utils import ext_loader -from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ - MultiScaleDeformableAttnFunction_fp16 - -ext_module = ext_loader.load_ext( - '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) - - -def inverse_sigmoid(x, eps=1e-5): - """Inverse function of sigmoid. - Args: - x (Tensor): The tensor to do the - inverse. - eps (float): EPS avoid numerical - overflow. Defaults 1e-5. - Returns: - Tensor: The x has passed the inverse - function of sigmoid, has same - shape with input. - """ - x = x.clamp(min=0, max=1) - x1 = x.clamp(min=eps) - x2 = (1 - x).clamp(min=eps) - return torch.log(x1 / x2) - - -@TRANSFORMER_LAYER_SEQUENCE.register_module() -class DetectionTransformerDecoder(TransformerLayerSequence): - """Implements the decoder in DETR3D transformer. - Args: - return_intermediate (bool): Whether to return intermediate outputs. - coder_norm_cfg (dict): Config of last normalization layer. Default: - `LN`. - """ - - def __init__(self, *args, return_intermediate=False, **kwargs): - super(DetectionTransformerDecoder, self).__init__(*args, **kwargs) - self.return_intermediate = return_intermediate - self.fp16_enabled = False - - def forward(self, - query, - *args, - reference_points=None, - reg_branches=None, - key_padding_mask=None, - **kwargs): - """Forward function for `Detr3DTransformerDecoder`. - Args: - query (Tensor): Input query with shape - `(num_query, bs, embed_dims)`. - reference_points (Tensor): The reference - points of offset. has shape - (bs, num_query, 4) when as_two_stage, - otherwise has shape ((bs, num_query, 2). - reg_branch: (obj:`nn.ModuleList`): Used for - refining the regression results. Only would - be passed when with_box_refine is True, - otherwise would be passed a `None`. - Returns: - Tensor: Results with shape [1, num_query, bs, embed_dims] when - return_intermediate is `False`, otherwise it has shape - [num_layers, num_query, bs, embed_dims]. 
- """ - output = query - intermediate = [] - intermediate_reference_points = [] - for lid, layer in enumerate(self.layers): - - reference_points_input = reference_points[..., :2].unsqueeze( - 2) # BS NUM_QUERY NUM_LEVEL 2 - output = layer( - output, - *args, - reference_points=reference_points_input, - key_padding_mask=key_padding_mask, - **kwargs) - output = output.permute(1, 0, 2) - - if reg_branches is not None: - tmp = reg_branches[lid](output) - - assert reference_points.shape[-1] == 3 - - new_reference_points = torch.zeros_like(reference_points) - new_reference_points[..., :2] = tmp[ - ..., :2] + inverse_sigmoid(reference_points[..., :2]) - new_reference_points[..., 2:3] = tmp[ - ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) - - new_reference_points = new_reference_points.sigmoid() - - reference_points = new_reference_points.detach() - - output = output.permute(1, 0, 2) - if self.return_intermediate: - intermediate.append(output) - intermediate_reference_points.append(reference_points) - - if self.return_intermediate: - return torch.stack(intermediate), torch.stack( - intermediate_reference_points) - - return output, reference_points - - -@ATTENTION.register_module() -class CustomMSDeformableAttention(BaseModule): - """An attention module used in Deformable-Detr. - - `Deformable DETR: Deformable Transformers for End-to-End Object Detection. - `_. - - Args: - embed_dims (int): The embedding dimension of Attention. - Default: 256. - num_heads (int): Parallel attention heads. Default: 64. - num_levels (int): The number of feature map used in - Attention. Default: 4. - num_points (int): The number of sampling points for - each query in each head. Default: 4. - im2col_step (int): The step used in image_to_column. - Default: 64. - dropout (float): A Dropout layer on `inp_identity`. - Default: 0.1. - batch_first (bool): Key, Query and Value are shape of - (batch, n, embed_dim) - or (n, batch, embed_dim). Default to False. - norm_cfg (dict): Config dict for normalization layer. - Default: None. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. 
- """ - - def __init__(self, - embed_dims=256, - num_heads=8, - num_levels=4, - num_points=4, - im2col_step=64, - dropout=0.1, - batch_first=False, - norm_cfg=None, - init_cfg=None): - super().__init__(init_cfg) - if embed_dims % num_heads != 0: - raise ValueError(f'embed_dims must be divisible by num_heads, ' - f'but got {embed_dims} and {num_heads}') - dim_per_head = embed_dims // num_heads - self.norm_cfg = norm_cfg - self.dropout = nn.Dropout(dropout) - self.batch_first = batch_first - self.fp16_enabled = False - - # you'd better set dim_per_head to a power of 2 - # which is more efficient in the CUDA implementation - def _is_power_of_2(n): - if (not isinstance(n, int)) or (n < 0): - raise ValueError( - 'invalid input for _is_power_of_2: {} (type: {})'.format( - n, type(n))) - return (n & (n - 1) == 0) and n != 0 - - if not _is_power_of_2(dim_per_head): - warnings.warn( - "You'd better set embed_dims in " - 'MultiScaleDeformAttention to make ' - 'the dimension of each attention head a power of 2 ' - 'which is more efficient in our CUDA implementation.') - - self.im2col_step = im2col_step - self.embed_dims = embed_dims - self.num_levels = num_levels - self.num_heads = num_heads - self.num_points = num_points - self.sampling_offsets = nn.Linear( - embed_dims, num_heads * num_levels * num_points * 2) - self.attention_weights = nn.Linear(embed_dims, - num_heads * num_levels * num_points) - self.value_proj = nn.Linear(embed_dims, embed_dims) - self.output_proj = nn.Linear(embed_dims, embed_dims) - self.init_weights() - - def init_weights(self): - """Default initialization for Parameters of Module.""" - constant_init(self.sampling_offsets, 0.) - thetas = torch.arange( - self.num_heads, - dtype=torch.float32) * (2.0 * math.pi / self.num_heads) - grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) - grid_init = (grid_init / - grid_init.abs().max(-1, keepdim=True)[0]).view( - self.num_heads, 1, 1, - 2).repeat(1, self.num_levels, self.num_points, 1) - for i in range(self.num_points): - grid_init[:, :, i, :] *= i + 1 - - self.sampling_offsets.bias.data = grid_init.view(-1) - constant_init(self.attention_weights, val=0., bias=0.) - xavier_init(self.value_proj, distribution='uniform', bias=0.) - xavier_init(self.output_proj, distribution='uniform', bias=0.) - self._is_init = True - - @deprecated_api_warning({'residual': 'identity'}, - cls_name='MultiScaleDeformableAttention') - def forward(self, - query, - key=None, - value=None, - identity=None, - query_pos=None, - key_padding_mask=None, - reference_points=None, - spatial_shapes=None, - level_start_index=None, - flag='decoder', - **kwargs): - """Forward Function of MultiScaleDeformAttention. - - Args: - query (Tensor): Query of Transformer with shape - (num_query, bs, embed_dims). - key (Tensor): The key tensor with shape - `(num_key, bs, embed_dims)`. - value (Tensor): The value tensor with shape - `(num_key, bs, embed_dims)`. - identity (Tensor): The tensor used for addition, with the - same shape as `query`. Default None. If None, - `query` will be used. - query_pos (Tensor): The positional encoding for `query`. - Default: None. - key_pos (Tensor): The positional encoding for `key`. Default - None. - reference_points (Tensor): The normalized reference - points with shape (bs, num_query, num_levels, 2), - all elements is range in [0, 1], top-left (0,0), - bottom-right (1, 1), including padding area. - or (N, Length_{query}, num_levels, 4), add - additional two dimensions is (w, h) to - form reference boxes. 
- key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_key]. - spatial_shapes (Tensor): Spatial shape of features in - different levels. With shape (num_levels, 2), - last dimension represents (h, w). - level_start_index (Tensor): The start index of each level. - A tensor has shape ``(num_levels, )`` and can be represented - as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. - - Returns: - Tensor: forwarded results with shape [num_query, bs, embed_dims]. - """ - - if value is None: - value = query - - if identity is None: - identity = query - if query_pos is not None: - query = query + query_pos - if not self.batch_first: - # change to (bs, num_query ,embed_dims) - query = query.permute(1, 0, 2) - value = value.permute(1, 0, 2) - - bs, num_query, _ = query.shape - bs, num_value, _ = value.shape - assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value - - value = self.value_proj(value) - if key_padding_mask is not None: - value = value.masked_fill(key_padding_mask[..., None], 0.0) - value = value.view(bs, num_value, self.num_heads, -1) - - sampling_offsets = self.sampling_offsets(query).view( - bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) - attention_weights = self.attention_weights(query).view( - bs, num_query, self.num_heads, self.num_levels * self.num_points) - attention_weights = attention_weights.softmax(-1) - - attention_weights = attention_weights.view(bs, num_query, - self.num_heads, - self.num_levels, - self.num_points) - if reference_points.shape[-1] == 2: - offset_normalizer = torch.stack( - [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) - sampling_locations = reference_points[:, :, None, :, None, :] \ - + sampling_offsets \ - / offset_normalizer[None, None, None, :, None, :] - elif reference_points.shape[-1] == 4: - sampling_locations = reference_points[:, :, None, :, None, :2] \ - + sampling_offsets / self.num_points \ - * reference_points[:, :, None, :, None, 2:] \ - * 0.5 - else: - raise ValueError( - f'Last dim of reference_points must be' - f' 2 or 4, but get {reference_points.shape[-1]} instead.') - if torch.cuda.is_available() and value.is_cuda: - - # using fp16 deformable attention is unstable because it performs many sum operations - if value.dtype == torch.float16: - MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 - else: - MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 - output = MultiScaleDeformableAttnFunction.apply( - value, spatial_shapes, level_start_index, sampling_locations, - attention_weights, self.im2col_step) - else: - output = multi_scale_deformable_attn_pytorch( - value, spatial_shapes, sampling_locations, attention_weights) - - output = self.output_proj(output) - - if not self.batch_first: - # (num_query, bs ,embed_dims) - output = output.permute(1, 0, 2) - - return self.dropout(output) + identity +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import mmcv +import cv2 as cv +import copy +import warnings +from matplotlib import pyplot as plt +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +import math +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +from mmcv.utils import ext_loader +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ + MultiScaleDeformableAttnFunction_fp16 + +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetectionTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(DetectionTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + *args, + reference_points=None, + reg_branches=None, + key_padding_mask=None, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
+ """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + + reference_points_input = reference_points[..., :2].unsqueeze( + 2) # BS NUM_QUERY NUM_LEVEL 2 + output = layer( + output, + *args, + reference_points=reference_points_input, + key_padding_mask=key_padding_mask, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + + assert reference_points.shape[-1] == 3 + + new_reference_points = torch.zeros_like(reference_points) + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points[..., :2]) + new_reference_points[..., 2:3] = tmp[ + ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) + + new_reference_points = new_reference_points.sigmoid() + + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@ATTENTION.register_module() +class CustomMSDeformableAttention(BaseModule): + """An attention module used in Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + im2col_step=64, + dropout=0.1, + batch_first=False, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + flag='decoder', + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. 
+ key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + # using fp16 deformable attention is unstable because it performs many sum operations + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + output = self.output_proj(output) + + if not self.batch_first: + # (num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/encoder.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/encoder.py index b1ee30065..6f417834e 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/encoder.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/encoder.py @@ -1,403 +1,403 @@ - -# --------------------------------------------- -# Copyright (c) OpenMMLab. All rights reserved. 
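# Illustrative sketch (assumed sizes and hypothetical names): the
# CustomMSDeformableAttention.forward in the decoder diff above turns
# normalized 2-D reference points plus learned offsets into per-head,
# per-level, per-point sampling locations; only that broadcasting step is
# reproduced here, standalone, so the indexing is easy to follow.
import torch

bs, num_query, num_heads, num_levels, num_points = 2, 4, 8, 4, 4
reference_points = torch.rand(bs, num_query, num_levels, 2)            # normalized to [0, 1]
sampling_offsets = torch.randn(bs, num_query, num_heads, num_levels, num_points, 2)
spatial_shapes = torch.tensor([[64, 64], [32, 32], [16, 16], [8, 8]])  # (h, w) per level

# Offsets are predicted in pixels of each level, so they are divided by the
# per-level (w, h) before being added to the [0, 1] reference points.
offset_normalizer = torch.stack(
    [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)              # (num_levels, 2) as (w, h)
sampling_locations = reference_points[:, :, None, :, None, :] \
    + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
print(sampling_locations.shape)  # torch.Size([2, 4, 8, 4, 4, 2])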
-# --------------------------------------------- -# Modified by Zhiqi Li -# --------------------------------------------- - -from projects.mmdet3d_plugin.models.utils.bricks import run_time -from projects.mmdet3d_plugin.models.utils.visual import save_tensor -from .custom_base_transformer_layer import MyCustomBaseTransformerLayer -import copy -import warnings -from mmcv.cnn.bricks.registry import (ATTENTION, - TRANSFORMER_LAYER, - TRANSFORMER_LAYER_SEQUENCE) -from mmcv.cnn.bricks.transformer import TransformerLayerSequence -from mmcv.runner import force_fp32, auto_fp16 -import numpy as np -import torch -import cv2 as cv -import mmcv -from mmcv.utils import TORCH_VERSION, digit_version -from mmcv.utils import ext_loader -ext_module = ext_loader.load_ext( - '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) - - -@TRANSFORMER_LAYER_SEQUENCE.register_module() -class BEVFormerEncoder(TransformerLayerSequence): - - """ - Attention with both self and cross - Implements the decoder in DETR transformer. - Args: - return_intermediate (bool): Whether to return intermediate outputs. - coder_norm_cfg (dict): Config of last normalization layer. Default: - `LN`. - """ - - def __init__(self, *args, pc_range=None, num_points_in_pillar=4, return_intermediate=False, dataset_type='nuscenes', - **kwargs): - - super(BEVFormerEncoder, self).__init__(*args, **kwargs) - self.return_intermediate = return_intermediate - - self.num_points_in_pillar = num_points_in_pillar - self.pc_range = pc_range - self.fp16_enabled = False - - @staticmethod - def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d', bs=1, device='cuda', dtype=torch.float): - """Get the reference points used in SCA and TSA. - Args: - H, W: spatial shape of bev. - Z: hight of pillar. - D: sample D points uniformly from each pillar. - device (obj:`device`): The device where - reference_points should be. - Returns: - Tensor: reference points used in decoder, has \ - shape (bs, num_keys, num_levels, 2). - """ - - # reference points in 3D space, used in spatial cross-attention (SCA) - if dim == '3d': - zs = torch.linspace(0.5, Z - 0.5, num_points_in_pillar, dtype=dtype, - device=device).view(-1, 1, 1).expand(num_points_in_pillar, H, W) / Z - xs = torch.linspace(0.5, W - 0.5, W, dtype=dtype, - device=device).view(1, 1, W).expand(num_points_in_pillar, H, W) / W - ys = torch.linspace(0.5, H - 0.5, H, dtype=dtype, - device=device).view(1, H, 1).expand(num_points_in_pillar, H, W) / H - ref_3d = torch.stack((xs, ys, zs), -1) - ref_3d = ref_3d.permute(0, 3, 1, 2).flatten(2).permute(0, 2, 1) - ref_3d = ref_3d[None].repeat(bs, 1, 1, 1) - return ref_3d - - # reference points on 2D bev plane, used in temporal self-attention (TSA). - elif dim == '2d': - ref_y, ref_x = torch.meshgrid( - torch.linspace( - 0.5, H - 0.5, H, dtype=dtype, device=device), - torch.linspace( - 0.5, W - 0.5, W, dtype=dtype, device=device) - ) - ref_y = ref_y.reshape(-1)[None] / H - ref_x = ref_x.reshape(-1)[None] / W - ref_2d = torch.stack((ref_x, ref_y), -1) - ref_2d = ref_2d.repeat(bs, 1, 1).unsqueeze(2) - return ref_2d - - # This function must use fp32!!! 
- @force_fp32(apply_to=('reference_points', 'img_metas')) - def point_sampling(self, reference_points, pc_range, img_metas): - - lidar2img = [] - for img_meta in img_metas: - lidar2img.append(img_meta['lidar2img']) - lidar2img = np.asarray(lidar2img) - lidar2img = reference_points.new_tensor(lidar2img) # (B, N, 4, 4) - reference_points = reference_points.clone() - - reference_points[..., 0:1] = reference_points[..., 0:1] * \ - (pc_range[3] - pc_range[0]) + pc_range[0] - reference_points[..., 1:2] = reference_points[..., 1:2] * \ - (pc_range[4] - pc_range[1]) + pc_range[1] - reference_points[..., 2:3] = reference_points[..., 2:3] * \ - (pc_range[5] - pc_range[2]) + pc_range[2] - - reference_points = torch.cat( - (reference_points, torch.ones_like(reference_points[..., :1])), -1) - - reference_points = reference_points.permute(1, 0, 2, 3) - D, B, num_query = reference_points.size()[:3] - num_cam = lidar2img.size(1) - - reference_points = reference_points.view( - D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1) - - lidar2img = lidar2img.view( - 1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1) - - reference_points_cam = torch.matmul(lidar2img.to(torch.float32), - reference_points.to(torch.float32)).squeeze(-1) - eps = 1e-5 - - bev_mask = (reference_points_cam[..., 2:3] > eps) - reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum( - reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps) - - reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1] - reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0] - - bev_mask = (bev_mask & (reference_points_cam[..., 1:2] > 0.0) - & (reference_points_cam[..., 1:2] < 1.0) - & (reference_points_cam[..., 0:1] < 1.0) - & (reference_points_cam[..., 0:1] > 0.0)) - if digit_version(TORCH_VERSION) >= digit_version('1.8'): - bev_mask = torch.nan_to_num(bev_mask) - else: - bev_mask = bev_mask.new_tensor( - np.nan_to_num(bev_mask.cpu().numpy())) - - reference_points_cam = reference_points_cam.permute(2, 1, 3, 0, 4) - bev_mask = bev_mask.permute(2, 1, 3, 0, 4).squeeze(-1) - - return reference_points_cam, bev_mask - - @auto_fp16() - def forward(self, - bev_query, - key, - value, - *args, - bev_h=None, - bev_w=None, - bev_pos=None, - spatial_shapes=None, - level_start_index=None, - valid_ratios=None, - prev_bev=None, - shift=0., - **kwargs): - """Forward function for `TransformerDecoder`. - Args: - bev_query (Tensor): Input BEV query with shape - `(num_query, bs, embed_dims)`. - key & value (Tensor): Input multi-cameta features with shape - (num_cam, num_value, bs, embed_dims) - reference_points (Tensor): The reference - points of offset. has shape - (bs, num_query, 4) when as_two_stage, - otherwise has shape ((bs, num_query, 2). - valid_ratios (Tensor): The radios of valid - points on the feature map, has shape - (bs, num_levels, 2) - Returns: - Tensor: Results with shape [1, num_query, bs, embed_dims] when - return_intermediate is `False`, otherwise it has shape - [num_layers, num_query, bs, embed_dims]. 
- """ - - output = bev_query - intermediate = [] - - ref_3d = self.get_reference_points( - bev_h, bev_w, self.pc_range[5]-self.pc_range[2], self.num_points_in_pillar, dim='3d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) - ref_2d = self.get_reference_points( - bev_h, bev_w, dim='2d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) - - reference_points_cam, bev_mask = self.point_sampling( - ref_3d, self.pc_range, kwargs['img_metas']) - - # bug: this code should be 'shift_ref_2d = ref_2d.clone()', we keep this bug for reproducing our results in paper. - shift_ref_2d = ref_2d # .clone() - shift_ref_2d += shift[:, None, None, :] - - # (num_query, bs, embed_dims) -> (bs, num_query, embed_dims) - bev_query = bev_query.permute(1, 0, 2) - bev_pos = bev_pos.permute(1, 0, 2) - bs, len_bev, num_bev_level, _ = ref_2d.shape - if prev_bev is not None: - prev_bev = prev_bev.permute(1, 0, 2) - prev_bev = torch.stack( - [prev_bev, bev_query], 1).reshape(bs*2, len_bev, -1) - hybird_ref_2d = torch.stack([shift_ref_2d, ref_2d], 1).reshape( - bs*2, len_bev, num_bev_level, 2) - else: - hybird_ref_2d = torch.stack([ref_2d, ref_2d], 1).reshape( - bs*2, len_bev, num_bev_level, 2) - - for lid, layer in enumerate(self.layers): - output = layer( - bev_query, - key, - value, - *args, - bev_pos=bev_pos, - ref_2d=hybird_ref_2d, - ref_3d=ref_3d, - bev_h=bev_h, - bev_w=bev_w, - spatial_shapes=spatial_shapes, - level_start_index=level_start_index, - reference_points_cam=reference_points_cam, - bev_mask=bev_mask, - prev_bev=prev_bev, - **kwargs) - - bev_query = output - if self.return_intermediate: - intermediate.append(output) - - if self.return_intermediate: - return torch.stack(intermediate) - - return output - - -@TRANSFORMER_LAYER.register_module() -class BEVFormerLayer(MyCustomBaseTransformerLayer): - """Implements decoder layer in DETR transformer. - Args: - attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): - Configs for self_attention or cross_attention, the order - should be consistent with it in `operation_order`. If it is - a dict, it would be expand to the number of attention in - `operation_order`. - feedforward_channels (int): The hidden dimension for FFNs. - ffn_dropout (float): Probability of an element to be zeroed - in ffn. Default 0.0. - operation_order (tuple[str]): The execution order of operation - in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). - Default:None - act_cfg (dict): The activation config for FFNs. Default: `LN` - norm_cfg (dict): Config dict for normalization layer. - Default: `LN`. - ffn_num_fcs (int): The number of fully-connected layers in FFNs. - Default:2. 
- """ - - def __init__(self, - attn_cfgs, - feedforward_channels, - ffn_dropout=0.0, - operation_order=None, - act_cfg=dict(type='ReLU', inplace=True), - norm_cfg=dict(type='LN'), - ffn_num_fcs=2, - **kwargs): - super(BEVFormerLayer, self).__init__( - attn_cfgs=attn_cfgs, - feedforward_channels=feedforward_channels, - ffn_dropout=ffn_dropout, - operation_order=operation_order, - act_cfg=act_cfg, - norm_cfg=norm_cfg, - ffn_num_fcs=ffn_num_fcs, - **kwargs) - self.fp16_enabled = False - assert len(operation_order) == 6 - assert set(operation_order) == set( - ['self_attn', 'norm', 'cross_attn', 'ffn']) - - def forward(self, - query, - key=None, - value=None, - bev_pos=None, - query_pos=None, - key_pos=None, - attn_masks=None, - query_key_padding_mask=None, - key_padding_mask=None, - ref_2d=None, - ref_3d=None, - bev_h=None, - bev_w=None, - reference_points_cam=None, - mask=None, - spatial_shapes=None, - level_start_index=None, - prev_bev=None, - **kwargs): - """Forward function for `TransformerDecoderLayer`. - - **kwargs contains some specific arguments of attentions. - - Args: - query (Tensor): The input query with shape - [num_queries, bs, embed_dims] if - self.batch_first is False, else - [bs, num_queries embed_dims]. - key (Tensor): The key tensor with shape [num_keys, bs, - embed_dims] if self.batch_first is False, else - [bs, num_keys, embed_dims] . - value (Tensor): The value tensor with same shape as `key`. - query_pos (Tensor): The positional encoding for `query`. - Default: None. - key_pos (Tensor): The positional encoding for `key`. - Default: None. - attn_masks (List[Tensor] | None): 2D Tensor used in - calculation of corresponding attention. The length of - it should equal to the number of `attention` in - `operation_order`. Default: None. - query_key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_queries]. Only used in `self_attn` layer. - Defaults to None. - key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_keys]. Default: None. - - Returns: - Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
- """ - - norm_index = 0 - attn_index = 0 - ffn_index = 0 - identity = query - if attn_masks is None: - attn_masks = [None for _ in range(self.num_attn)] - elif isinstance(attn_masks, torch.Tensor): - attn_masks = [ - copy.deepcopy(attn_masks) for _ in range(self.num_attn) - ] - warnings.warn(f'Use same attn_mask in all attentions in ' - f'{self.__class__.__name__} ') - else: - assert len(attn_masks) == self.num_attn, f'The length of ' \ - f'attn_masks {len(attn_masks)} must be equal ' \ - f'to the number of attention in ' \ - f'operation_order {self.num_attn}' - - for layer in self.operation_order: - # temporal self attention - if layer == 'self_attn': - - query = self.attentions[attn_index]( - query, - prev_bev, - prev_bev, - identity if self.pre_norm else None, - query_pos=bev_pos, - key_pos=bev_pos, - attn_mask=attn_masks[attn_index], - key_padding_mask=query_key_padding_mask, - reference_points=ref_2d, - spatial_shapes=torch.tensor( - [[bev_h, bev_w]], device=query.device), - level_start_index=torch.tensor([0], device=query.device), - **kwargs) - attn_index += 1 - identity = query - - elif layer == 'norm': - query = self.norms[norm_index](query) - norm_index += 1 - - # spaital cross attention - elif layer == 'cross_attn': - query = self.attentions[attn_index]( - query, - key, - value, - identity if self.pre_norm else None, - query_pos=query_pos, - key_pos=key_pos, - reference_points=ref_3d, - reference_points_cam=reference_points_cam, - mask=mask, - attn_mask=attn_masks[attn_index], - key_padding_mask=key_padding_mask, - spatial_shapes=spatial_shapes, - level_start_index=level_start_index, - **kwargs) - attn_index += 1 - identity = query - - elif layer == 'ffn': - query = self.ffns[ffn_index]( - query, identity if self.pre_norm else None) - ffn_index += 1 - - return query + +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from .custom_base_transformer_layer import MyCustomBaseTransformerLayer +import copy +import warnings +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +from mmcv.runner import force_fp32, auto_fp16 +import numpy as np +import torch +import cv2 as cv +import mmcv +from mmcv.utils import TORCH_VERSION, digit_version +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class BEVFormerEncoder(TransformerLayerSequence): + + """ + Attention with both self and cross + Implements the decoder in DETR transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. 
+ """ + + def __init__(self, *args, pc_range=None, num_points_in_pillar=4, return_intermediate=False, dataset_type='nuscenes', + **kwargs): + + super(BEVFormerEncoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + + self.num_points_in_pillar = num_points_in_pillar + self.pc_range = pc_range + self.fp16_enabled = False + + @staticmethod + def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d', bs=1, device='cuda', dtype=torch.float): + """Get the reference points used in SCA and TSA. + Args: + H, W: spatial shape of bev. + Z: hight of pillar. + D: sample D points uniformly from each pillar. + device (obj:`device`): The device where + reference_points should be. + Returns: + Tensor: reference points used in decoder, has \ + shape (bs, num_keys, num_levels, 2). + """ + + # reference points in 3D space, used in spatial cross-attention (SCA) + if dim == '3d': + zs = torch.linspace(0.5, Z - 0.5, num_points_in_pillar, dtype=dtype, + device=device).view(-1, 1, 1).expand(num_points_in_pillar, H, W) / Z + xs = torch.linspace(0.5, W - 0.5, W, dtype=dtype, + device=device).view(1, 1, W).expand(num_points_in_pillar, H, W) / W + ys = torch.linspace(0.5, H - 0.5, H, dtype=dtype, + device=device).view(1, H, 1).expand(num_points_in_pillar, H, W) / H + ref_3d = torch.stack((xs, ys, zs), -1) + ref_3d = ref_3d.permute(0, 3, 1, 2).flatten(2).permute(0, 2, 1) + ref_3d = ref_3d[None].repeat(bs, 1, 1, 1) + return ref_3d + + # reference points on 2D bev plane, used in temporal self-attention (TSA). + elif dim == '2d': + ref_y, ref_x = torch.meshgrid( + torch.linspace( + 0.5, H - 0.5, H, dtype=dtype, device=device), + torch.linspace( + 0.5, W - 0.5, W, dtype=dtype, device=device) + ) + ref_y = ref_y.reshape(-1)[None] / H + ref_x = ref_x.reshape(-1)[None] / W + ref_2d = torch.stack((ref_x, ref_y), -1) + ref_2d = ref_2d.repeat(bs, 1, 1).unsqueeze(2) + return ref_2d + + # This function must use fp32!!! 
+ @force_fp32(apply_to=('reference_points', 'img_metas')) + def point_sampling(self, reference_points, pc_range, img_metas): + + lidar2img = [] + for img_meta in img_metas: + lidar2img.append(img_meta['lidar2img']) + lidar2img = np.asarray(lidar2img) + lidar2img = reference_points.new_tensor(lidar2img) # (B, N, 4, 4) + reference_points = reference_points.clone() + + reference_points[..., 0:1] = reference_points[..., 0:1] * \ + (pc_range[3] - pc_range[0]) + pc_range[0] + reference_points[..., 1:2] = reference_points[..., 1:2] * \ + (pc_range[4] - pc_range[1]) + pc_range[1] + reference_points[..., 2:3] = reference_points[..., 2:3] * \ + (pc_range[5] - pc_range[2]) + pc_range[2] + + reference_points = torch.cat( + (reference_points, torch.ones_like(reference_points[..., :1])), -1) + + reference_points = reference_points.permute(1, 0, 2, 3) + D, B, num_query = reference_points.size()[:3] + num_cam = lidar2img.size(1) + + reference_points = reference_points.view( + D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1) + + lidar2img = lidar2img.view( + 1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1) + + reference_points_cam = torch.matmul(lidar2img.to(torch.float32), + reference_points.to(torch.float32)).squeeze(-1) + eps = 1e-5 + + bev_mask = (reference_points_cam[..., 2:3] > eps) + reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum( + reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps) + + reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1] + reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0] + + bev_mask = (bev_mask & (reference_points_cam[..., 1:2] > 0.0) + & (reference_points_cam[..., 1:2] < 1.0) + & (reference_points_cam[..., 0:1] < 1.0) + & (reference_points_cam[..., 0:1] > 0.0)) + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + bev_mask = torch.nan_to_num(bev_mask) + else: + bev_mask = bev_mask.new_tensor( + np.nan_to_num(bev_mask.cpu().numpy())) + + reference_points_cam = reference_points_cam.permute(2, 1, 3, 0, 4) + bev_mask = bev_mask.permute(2, 1, 3, 0, 4).squeeze(-1) + + return reference_points_cam, bev_mask + + @auto_fp16() + def forward(self, + bev_query, + key, + value, + *args, + bev_h=None, + bev_w=None, + bev_pos=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + prev_bev=None, + shift=0., + **kwargs): + """Forward function for `TransformerDecoder`. + Args: + bev_query (Tensor): Input BEV query with shape + `(num_query, bs, embed_dims)`. + key & value (Tensor): Input multi-cameta features with shape + (num_cam, num_value, bs, embed_dims) + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
+ """ + + output = bev_query + intermediate = [] + + ref_3d = self.get_reference_points( + bev_h, bev_w, self.pc_range[5]-self.pc_range[2], self.num_points_in_pillar, dim='3d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + ref_2d = self.get_reference_points( + bev_h, bev_w, dim='2d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + + reference_points_cam, bev_mask = self.point_sampling( + ref_3d, self.pc_range, kwargs['img_metas']) + + # bug: this code should be 'shift_ref_2d = ref_2d.clone()', we keep this bug for reproducing our results in paper. + shift_ref_2d = ref_2d # .clone() + shift_ref_2d += shift[:, None, None, :] + + # (num_query, bs, embed_dims) -> (bs, num_query, embed_dims) + bev_query = bev_query.permute(1, 0, 2) + bev_pos = bev_pos.permute(1, 0, 2) + bs, len_bev, num_bev_level, _ = ref_2d.shape + if prev_bev is not None: + prev_bev = prev_bev.permute(1, 0, 2) + prev_bev = torch.stack( + [prev_bev, bev_query], 1).reshape(bs*2, len_bev, -1) + hybird_ref_2d = torch.stack([shift_ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + else: + hybird_ref_2d = torch.stack([ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + + for lid, layer in enumerate(self.layers): + output = layer( + bev_query, + key, + value, + *args, + bev_pos=bev_pos, + ref_2d=hybird_ref_2d, + ref_3d=ref_3d, + bev_h=bev_h, + bev_w=bev_w, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + reference_points_cam=reference_points_cam, + bev_mask=bev_mask, + prev_bev=prev_bev, + **kwargs) + + bev_query = output + if self.return_intermediate: + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output + + +@TRANSFORMER_LAYER.register_module() +class BEVFormerLayer(MyCustomBaseTransformerLayer): + """Implements decoder layer in DETR transformer. + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default:None + act_cfg (dict): The activation config for FFNs. Default: `LN` + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default:2. 
+ """ + + def __init__(self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + ffn_num_fcs=2, + **kwargs): + super(BEVFormerLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs) + self.fp16_enabled = False + assert len(operation_order) == 6 + assert set(operation_order) == set( + ['self_attn', 'norm', 'cross_attn', 'ffn']) + + def forward(self, + query, + key=None, + value=None, + bev_pos=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + ref_2d=None, + ref_3d=None, + bev_h=None, + bev_w=None, + reference_points_cam=None, + mask=None, + spatial_shapes=None, + level_start_index=None, + prev_bev=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + + **kwargs contains some specific arguments of attentions. + + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
+ """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + # temporal self attention + if layer == 'self_attn': + + query = self.attentions[attn_index]( + query, + prev_bev, + prev_bev, + identity if self.pre_norm else None, + query_pos=bev_pos, + key_pos=bev_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + reference_points=ref_2d, + spatial_shapes=torch.tensor( + [[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + # spaital cross attention + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + reference_points=ref_3d, + reference_points_cam=reference_points_cam, + mask=mask, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py index 77b0f319c..59f917343 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py @@ -1,163 +1,163 @@ -# --------------------------------------------- -# Copyright (c) OpenMMLab. All rights reserved. -# --------------------------------------------- -# Modified by Zhiqi Li -# --------------------------------------------- - -import torch -from torch.cuda.amp import custom_bwd, custom_fwd -from torch.autograd.function import Function, once_differentiable -from mmcv.utils import ext_loader -ext_module = ext_loader.load_ext( - '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) - - -class MultiScaleDeformableAttnFunction_fp16(Function): - - @staticmethod - @custom_fwd(cast_inputs=torch.float16) - def forward(ctx, value, value_spatial_shapes, value_level_start_index, - sampling_locations, attention_weights, im2col_step): - """GPU version of multi-scale deformable attention. - - Args: - value (Tensor): The value has shape - (bs, num_keys, mum_heads, embed_dims//num_heads) - value_spatial_shapes (Tensor): Spatial shape of - each feature map, has shape (num_levels, 2), - last dimension 2 represent (h, w) - sampling_locations (Tensor): The location of sampling points, - has shape - (bs ,num_queries, num_heads, num_levels, num_points, 2), - the last dimension 2 represent (x, y). 
- attention_weights (Tensor): The weight of sampling points used - when calculate the attention, has shape - (bs ,num_queries, num_heads, num_levels, num_points), - im2col_step (Tensor): The step used in image to column. - - Returns: - Tensor: has shape (bs, num_queries, embed_dims) - """ - ctx.im2col_step = im2col_step - output = ext_module.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - im2col_step=ctx.im2col_step) - ctx.save_for_backward(value, value_spatial_shapes, - value_level_start_index, sampling_locations, - attention_weights) - return output - - @staticmethod - @once_differentiable - @custom_bwd - def backward(ctx, grad_output): - """GPU version of backward function. - - Args: - grad_output (Tensor): Gradient - of output tensor of forward. - - Returns: - Tuple[Tensor]: Gradient - of input tensors in forward. - """ - value, value_spatial_shapes, value_level_start_index, \ - sampling_locations, attention_weights = ctx.saved_tensors - grad_value = torch.zeros_like(value) - grad_sampling_loc = torch.zeros_like(sampling_locations) - grad_attn_weight = torch.zeros_like(attention_weights) - - ext_module.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output.contiguous(), - grad_value, - grad_sampling_loc, - grad_attn_weight, - im2col_step=ctx.im2col_step) - - return grad_value, None, None, \ - grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttnFunction_fp32(Function): - - @staticmethod - @custom_fwd(cast_inputs=torch.float32) - def forward(ctx, value, value_spatial_shapes, value_level_start_index, - sampling_locations, attention_weights, im2col_step): - """GPU version of multi-scale deformable attention. - - Args: - value (Tensor): The value has shape - (bs, num_keys, mum_heads, embed_dims//num_heads) - value_spatial_shapes (Tensor): Spatial shape of - each feature map, has shape (num_levels, 2), - last dimension 2 represent (h, w) - sampling_locations (Tensor): The location of sampling points, - has shape - (bs ,num_queries, num_heads, num_levels, num_points, 2), - the last dimension 2 represent (x, y). - attention_weights (Tensor): The weight of sampling points used - when calculate the attention, has shape - (bs ,num_queries, num_heads, num_levels, num_points), - im2col_step (Tensor): The step used in image to column. - - Returns: - Tensor: has shape (bs, num_queries, embed_dims) - """ - - ctx.im2col_step = im2col_step - output = ext_module.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - im2col_step=ctx.im2col_step) - ctx.save_for_backward(value, value_spatial_shapes, - value_level_start_index, sampling_locations, - attention_weights) - return output - - @staticmethod - @once_differentiable - @custom_bwd - def backward(ctx, grad_output): - """GPU version of backward function. - - Args: - grad_output (Tensor): Gradient - of output tensor of forward. - - Returns: - Tuple[Tensor]: Gradient - of input tensors in forward. 
- """ - value, value_spatial_shapes, value_level_start_index, \ - sampling_locations, attention_weights = ctx.saved_tensors - grad_value = torch.zeros_like(value) - grad_sampling_loc = torch.zeros_like(sampling_locations) - grad_attn_weight = torch.zeros_like(attention_weights) - - ext_module.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output.contiguous(), - grad_value, - grad_sampling_loc, - grad_attn_weight, - im2col_step=ctx.im2col_step) - - return grad_value, None, None, \ - grad_sampling_loc, grad_attn_weight, None +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import torch +from torch.cuda.amp import custom_bwd, custom_fwd +from torch.autograd.function import Function, once_differentiable +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +class MultiScaleDeformableAttnFunction_fp16(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float16) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. 
+ """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None + + +class MultiScaleDeformableAttnFunction_fp32(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float32) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. + """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py index 100d94fef..8de4009b0 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py @@ -1,399 +1,399 @@ - -# --------------------------------------------- -# Copyright (c) OpenMMLab. All rights reserved. 
-# --------------------------------------------- -# Modified by Zhiqi Li -# --------------------------------------------- - -from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch -import warnings -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import xavier_init, constant_init -from mmcv.cnn.bricks.registry import (ATTENTION, - TRANSFORMER_LAYER, - TRANSFORMER_LAYER_SEQUENCE) -from mmcv.cnn.bricks.transformer import build_attention -import math -from mmcv.runner import force_fp32, auto_fp16 - -from mmcv.runner.base_module import BaseModule, ModuleList, Sequential - -from mmcv.utils import ext_loader -from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ - MultiScaleDeformableAttnFunction_fp16 -from projects.mmdet3d_plugin.models.utils.bricks import run_time -ext_module = ext_loader.load_ext( - '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) - - -@ATTENTION.register_module() -class SpatialCrossAttention(BaseModule): - """An attention module used in BEVFormer. - Args: - embed_dims (int): The embedding dimension of Attention. - Default: 256. - num_cams (int): The number of cameras - dropout (float): A Dropout layer on `inp_residual`. - Default: 0.. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - deformable_attention: (dict): The config for the deformable attention used in SCA. - """ - - def __init__(self, - embed_dims=256, - num_cams=6, - pc_range=None, - dropout=0.1, - init_cfg=None, - batch_first=False, - deformable_attention=dict( - type='MSDeformableAttention3D', - embed_dims=256, - num_levels=4), - **kwargs - ): - super(SpatialCrossAttention, self).__init__(init_cfg) - - self.init_cfg = init_cfg - self.dropout = nn.Dropout(dropout) - self.pc_range = pc_range - self.fp16_enabled = False - self.deformable_attention = build_attention(deformable_attention) - self.embed_dims = embed_dims - self.num_cams = num_cams - self.output_proj = nn.Linear(embed_dims, embed_dims) - self.batch_first = batch_first - self.init_weight() - - def init_weight(self): - """Default initialization for Parameters of Module.""" - xavier_init(self.output_proj, distribution='uniform', bias=0.) - - @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam')) - def forward(self, - query, - key, - value, - residual=None, - query_pos=None, - key_padding_mask=None, - reference_points=None, - spatial_shapes=None, - reference_points_cam=None, - bev_mask=None, - level_start_index=None, - flag='encoder', - **kwargs): - """Forward Function of Detr3DCrossAtten. - Args: - query (Tensor): Query of Transformer with shape - (num_query, bs, embed_dims). - key (Tensor): The key tensor with shape - `(num_key, bs, embed_dims)`. - value (Tensor): The value tensor with shape - `(num_key, bs, embed_dims)`. (B, N, C, H, W) - residual (Tensor): The tensor used for addition, with the - same shape as `x`. Default None. If None, `x` will be used. - query_pos (Tensor): The positional encoding for `query`. - Default: None. - key_pos (Tensor): The positional encoding for `key`. Default - None. - reference_points (Tensor): The normalized reference - points with shape (bs, num_query, 4), - all elements is range in [0, 1], top-left (0,0), - bottom-right (1, 1), including padding area. - or (N, Length_{query}, num_levels, 4), add - additional two dimensions is (w, h) to - form reference boxes. - key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_key]. 
- spatial_shapes (Tensor): Spatial shape of features in - different level. With shape (num_levels, 2), - last dimension represent (h, w). - level_start_index (Tensor): The start index of each level. - A tensor has shape (num_levels) and can be represented - as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. - Returns: - Tensor: forwarded results with shape [num_query, bs, embed_dims]. - """ - - if key is None: - key = query - if value is None: - value = key - - if residual is None: - inp_residual = query - slots = torch.zeros_like(query) - if query_pos is not None: - query = query + query_pos - - bs, num_query, _ = query.size() - - D = reference_points_cam.size(3) - indexes = [] - for i, mask_per_img in enumerate(bev_mask): - index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1) - indexes.append(index_query_per_img) - max_len = max([len(each) for each in indexes]) - - # each camera only interacts with its corresponding BEV queries. This step can greatly save GPU memory. - queries_rebatch = query.new_zeros( - [bs, self.num_cams, max_len, self.embed_dims]) - reference_points_rebatch = reference_points_cam.new_zeros( - [bs, self.num_cams, max_len, D, 2]) - - for j in range(bs): - for i, reference_points_per_img in enumerate(reference_points_cam): - index_query_per_img = indexes[i] - queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img] - reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img] - - num_cams, l, bs, embed_dims = key.shape - - key = key.permute(2, 0, 1, 3).reshape( - bs * self.num_cams, l, self.embed_dims) - value = value.permute(2, 0, 1, 3).reshape( - bs * self.num_cams, l, self.embed_dims) - - queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value, - reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes, - level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims) - for j in range(bs): - for i, index_query_per_img in enumerate(indexes): - slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)] - - count = bev_mask.sum(-1) > 0 - count = count.permute(1, 2, 0).sum(-1) - count = torch.clamp(count, min=1.0) - slots = slots / count[..., None] - slots = self.output_proj(slots) - - return self.dropout(slots) + inp_residual - - -@ATTENTION.register_module() -class MSDeformableAttention3D(BaseModule): - """An attention module used in BEVFormer based on Deformable-Detr. - `Deformable DETR: Deformable Transformers for End-to-End Object Detection. - `_. - Args: - embed_dims (int): The embedding dimension of Attention. - Default: 256. - num_heads (int): Parallel attention heads. Default: 64. - num_levels (int): The number of feature map used in - Attention. Default: 4. - num_points (int): The number of sampling points for - each query in each head. Default: 4. - im2col_step (int): The step used in image_to_column. - Default: 64. - dropout (float): A Dropout layer on `inp_identity`. - Default: 0.1. - batch_first (bool): Key, Query and Value are shape of - (batch, n, embed_dim) - or (n, batch, embed_dim). Default to False. - norm_cfg (dict): Config dict for normalization layer. - Default: None. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. 
- """ - - def __init__(self, - embed_dims=256, - num_heads=8, - num_levels=4, - num_points=8, - im2col_step=64, - dropout=0.1, - batch_first=True, - norm_cfg=None, - init_cfg=None): - super().__init__(init_cfg) - if embed_dims % num_heads != 0: - raise ValueError(f'embed_dims must be divisible by num_heads, ' - f'but got {embed_dims} and {num_heads}') - dim_per_head = embed_dims // num_heads - self.norm_cfg = norm_cfg - self.batch_first = batch_first - self.output_proj = None - self.fp16_enabled = False - - # you'd better set dim_per_head to a power of 2 - # which is more efficient in the CUDA implementation - def _is_power_of_2(n): - if (not isinstance(n, int)) or (n < 0): - raise ValueError( - 'invalid input for _is_power_of_2: {} (type: {})'.format( - n, type(n))) - return (n & (n - 1) == 0) and n != 0 - - if not _is_power_of_2(dim_per_head): - warnings.warn( - "You'd better set embed_dims in " - 'MultiScaleDeformAttention to make ' - 'the dimension of each attention head a power of 2 ' - 'which is more efficient in our CUDA implementation.') - - self.im2col_step = im2col_step - self.embed_dims = embed_dims - self.num_levels = num_levels - self.num_heads = num_heads - self.num_points = num_points - self.sampling_offsets = nn.Linear( - embed_dims, num_heads * num_levels * num_points * 2) - self.attention_weights = nn.Linear(embed_dims, - num_heads * num_levels * num_points) - self.value_proj = nn.Linear(embed_dims, embed_dims) - - self.init_weights() - - def init_weights(self): - """Default initialization for Parameters of Module.""" - constant_init(self.sampling_offsets, 0.) - thetas = torch.arange( - self.num_heads, - dtype=torch.float32) * (2.0 * math.pi / self.num_heads) - grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) - grid_init = (grid_init / - grid_init.abs().max(-1, keepdim=True)[0]).view( - self.num_heads, 1, 1, - 2).repeat(1, self.num_levels, self.num_points, 1) - for i in range(self.num_points): - grid_init[:, :, i, :] *= i + 1 - - self.sampling_offsets.bias.data = grid_init.view(-1) - constant_init(self.attention_weights, val=0., bias=0.) - xavier_init(self.value_proj, distribution='uniform', bias=0.) - xavier_init(self.output_proj, distribution='uniform', bias=0.) - self._is_init = True - - def forward(self, - query, - key=None, - value=None, - identity=None, - query_pos=None, - key_padding_mask=None, - reference_points=None, - spatial_shapes=None, - level_start_index=None, - **kwargs): - """Forward Function of MultiScaleDeformAttention. - Args: - query (Tensor): Query of Transformer with shape - ( bs, num_query, embed_dims). - key (Tensor): The key tensor with shape - `(bs, num_key, embed_dims)`. - value (Tensor): The value tensor with shape - `(bs, num_key, embed_dims)`. - identity (Tensor): The tensor used for addition, with the - same shape as `query`. Default None. If None, - `query` will be used. - query_pos (Tensor): The positional encoding for `query`. - Default: None. - key_pos (Tensor): The positional encoding for `key`. Default - None. - reference_points (Tensor): The normalized reference - points with shape (bs, num_query, num_levels, 2), - all elements is range in [0, 1], top-left (0,0), - bottom-right (1, 1), including padding area. - or (N, Length_{query}, num_levels, 4), add - additional two dimensions is (w, h) to - form reference boxes. - key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_key]. - spatial_shapes (Tensor): Spatial shape of features in - different levels. 
With shape (num_levels, 2), - last dimension represents (h, w). - level_start_index (Tensor): The start index of each level. - A tensor has shape ``(num_levels, )`` and can be represented - as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. - Returns: - Tensor: forwarded results with shape [num_query, bs, embed_dims]. - """ - - if value is None: - value = query - if identity is None: - identity = query - if query_pos is not None: - query = query + query_pos - - if not self.batch_first: - # change to (bs, num_query ,embed_dims) - query = query.permute(1, 0, 2) - value = value.permute(1, 0, 2) - - bs, num_query, _ = query.shape - bs, num_value, _ = value.shape - assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value - - value = self.value_proj(value) - if key_padding_mask is not None: - value = value.masked_fill(key_padding_mask[..., None], 0.0) - value = value.view(bs, num_value, self.num_heads, -1) - sampling_offsets = self.sampling_offsets(query).view( - bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) - attention_weights = self.attention_weights(query).view( - bs, num_query, self.num_heads, self.num_levels * self.num_points) - - attention_weights = attention_weights.softmax(-1) - - attention_weights = attention_weights.view(bs, num_query, - self.num_heads, - self.num_levels, - self.num_points) - - if reference_points.shape[-1] == 2: - """ - For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights. - After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image. - For each referent point, we sample `num_points` sampling points. - For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points. - """ - offset_normalizer = torch.stack( - [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) - - bs, num_query, num_Z_anchors, xy = reference_points.shape - reference_points = reference_points[:, :, None, None, None, :, :] - sampling_offsets = sampling_offsets / \ - offset_normalizer[None, None, None, :, None, :] - bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape - sampling_offsets = sampling_offsets.view( - bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy) - sampling_locations = reference_points + sampling_offsets - bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape - assert num_all_points == num_points * num_Z_anchors - - sampling_locations = sampling_locations.view( - bs, num_query, num_heads, num_levels, num_all_points, xy) - - elif reference_points.shape[-1] == 4: - assert False - else: - raise ValueError( - f'Last dim of reference_points must be' - f' 2 or 4, but get {reference_points.shape[-1]} instead.') - - # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 - # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points - # - - if torch.cuda.is_available() and value.is_cuda: - if value.dtype == torch.float16: - MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 - else: - MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 - output = MultiScaleDeformableAttnFunction.apply( - value, spatial_shapes, level_start_index, sampling_locations, - attention_weights, self.im2col_step) - else: - output = multi_scale_deformable_attn_pytorch( - value, spatial_shapes, sampling_locations, attention_weights) - if not self.batch_first: - output = output.permute(1, 0, 2) - - return output 
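For orientation, a minimal mmcv-style config sketch showing how `SpatialCrossAttention` wraps `MSDeformableAttention3D` (instantiated through `build_attention`); the values mirror the constructor defaults in the file below, and the `pc_range` list is only a placeholder.

spatial_cross_attention_cfg = dict(
    type='SpatialCrossAttention',
    embed_dims=256,
    num_cams=6,
    pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],  # placeholder point-cloud range
    dropout=0.1,
    deformable_attention=dict(
        type='MSDeformableAttention3D',
        embed_dims=256,
        num_levels=4,
        num_points=8))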
+ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import warnings +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import build_attention +import math +from mmcv.runner import force_fp32, auto_fp16 + +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + +from mmcv.utils import ext_loader +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ + MultiScaleDeformableAttnFunction_fp16 +from projects.mmdet3d_plugin.models.utils.bricks import run_time +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@ATTENTION.register_module() +class SpatialCrossAttention(BaseModule): + """An attention module used in BEVFormer. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_cams (int): The number of cameras + dropout (float): A Dropout layer on `inp_residual`. + Default: 0.. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + deformable_attention: (dict): The config for the deformable attention used in SCA. + """ + + def __init__(self, + embed_dims=256, + num_cams=6, + pc_range=None, + dropout=0.1, + init_cfg=None, + batch_first=False, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=256, + num_levels=4), + **kwargs + ): + super(SpatialCrossAttention, self).__init__(init_cfg) + + self.init_cfg = init_cfg + self.dropout = nn.Dropout(dropout) + self.pc_range = pc_range + self.fp16_enabled = False + self.deformable_attention = build_attention(deformable_attention) + self.embed_dims = embed_dims + self.num_cams = num_cams + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.batch_first = batch_first + self.init_weight() + + def init_weight(self): + """Default initialization for Parameters of Module.""" + xavier_init(self.output_proj, distribution='uniform', bias=0.) + + @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam')) + def forward(self, + query, + key, + value, + residual=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + reference_points_cam=None, + bev_mask=None, + level_start_index=None, + flag='encoder', + **kwargs): + """Forward Function of Detr3DCrossAtten. + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. (B, N, C, H, W) + residual (Tensor): The tensor used for addition, with the + same shape as `x`. Default None. If None, `x` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, 4), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. 
+ or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different level. With shape (num_levels, 2), + last dimension represent (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if key is None: + key = query + if value is None: + value = key + + if residual is None: + inp_residual = query + slots = torch.zeros_like(query) + if query_pos is not None: + query = query + query_pos + + bs, num_query, _ = query.size() + + D = reference_points_cam.size(3) + indexes = [] + for i, mask_per_img in enumerate(bev_mask): + index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1) + indexes.append(index_query_per_img) + max_len = max([len(each) for each in indexes]) + + # each camera only interacts with its corresponding BEV queries. This step can greatly save GPU memory. + queries_rebatch = query.new_zeros( + [bs, self.num_cams, max_len, self.embed_dims]) + reference_points_rebatch = reference_points_cam.new_zeros( + [bs, self.num_cams, max_len, D, 2]) + + for j in range(bs): + for i, reference_points_per_img in enumerate(reference_points_cam): + index_query_per_img = indexes[i] + queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img] + reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img] + + num_cams, l, bs, embed_dims = key.shape + + key = key.permute(2, 0, 1, 3).reshape( + bs * self.num_cams, l, self.embed_dims) + value = value.permute(2, 0, 1, 3).reshape( + bs * self.num_cams, l, self.embed_dims) + + queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value, + reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes, + level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims) + for j in range(bs): + for i, index_query_per_img in enumerate(indexes): + slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)] + + count = bev_mask.sum(-1) > 0 + count = count.permute(1, 2, 0).sum(-1) + count = torch.clamp(count, min=1.0) + slots = slots / count[..., None] + slots = self.output_proj(slots) + + return self.dropout(slots) + inp_residual + + +@ATTENTION.register_module() +class MSDeformableAttention3D(BaseModule): + """An attention module used in BEVFormer based on Deformable-Detr. + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. 
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=8, + im2col_step=64, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.batch_first = batch_first + self.output_proj = None + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + **kwargs): + """Forward Function of MultiScaleDeformAttention. + Args: + query (Tensor): Query of Transformer with shape + ( bs, num_query, embed_dims). + key (Tensor): The key tensor with shape + `(bs, num_key, embed_dims)`. + value (Tensor): The value tensor with shape + `(bs, num_key, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. 
+ spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + + if reference_points.shape[-1] == 2: + """ + For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights. + After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image. + For each referent point, we sample `num_points` sampling points. + For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points. 
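+            For example, with 8 predicted offsets per (head, level) and
+            ``num_Z_anchors=4``, the offsets are reshaped to ``(..., 2, 4, 2)``,
+            added to the 4 projected reference points, and flattened back to
+            8 sampling locations per (head, level).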
+ """ + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + + bs, num_query, num_Z_anchors, xy = reference_points.shape + reference_points = reference_points[:, :, None, None, None, :, :] + sampling_offsets = sampling_offsets / \ + offset_normalizer[None, None, None, :, None, :] + bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape + sampling_offsets = sampling_offsets.view( + bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy) + sampling_locations = reference_points + sampling_offsets + bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape + assert num_all_points == num_points * num_Z_anchors + + sampling_locations = sampling_locations.view( + bs, num_query, num_heads, num_levels, num_all_points, xy) + + elif reference_points.shape[-1] == 4: + assert False + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + + # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 + # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points + # + + if torch.cuda.is_available() and value.is_cuda: + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + if not self.batch_first: + output = output.permute(1, 0, 2) + + return output diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py index 78fb9f529..e80b22783 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py @@ -1,272 +1,272 @@ -# --------------------------------------------- -# Copyright (c) OpenMMLab. All rights reserved. -# --------------------------------------------- -# Modified by Zhiqi Li -# --------------------------------------------- - -from projects.mmdet3d_plugin.models.utils.bricks import run_time -from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32 -from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch -import warnings -import torch -import torch.nn as nn -from mmcv.cnn import xavier_init, constant_init -from mmcv.cnn.bricks.registry import ATTENTION -import math -from mmcv.runner.base_module import BaseModule, ModuleList, Sequential -from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, - to_2tuple) - -from mmcv.utils import ext_loader -ext_module = ext_loader.load_ext( - '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) - - -@ATTENTION.register_module() -class TemporalSelfAttention(BaseModule): - """An attention module used in BEVFormer based on Deformable-Detr. - - `Deformable DETR: Deformable Transformers for End-to-End Object Detection. - `_. - - Args: - embed_dims (int): The embedding dimension of Attention. - Default: 256. 
- num_heads (int): Parallel attention heads. Default: 64. - num_levels (int): The number of feature map used in - Attention. Default: 4. - num_points (int): The number of sampling points for - each query in each head. Default: 4. - im2col_step (int): The step used in image_to_column. - Default: 64. - dropout (float): A Dropout layer on `inp_identity`. - Default: 0.1. - batch_first (bool): Key, Query and Value are shape of - (batch, n, embed_dim) - or (n, batch, embed_dim). Default to True. - norm_cfg (dict): Config dict for normalization layer. - Default: None. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - num_bev_queue (int): In this version, we only use one history BEV and one currenct BEV. - the length of BEV queue is 2. - """ - - def __init__(self, - embed_dims=256, - num_heads=8, - num_levels=4, - num_points=4, - num_bev_queue=2, - im2col_step=64, - dropout=0.1, - batch_first=True, - norm_cfg=None, - init_cfg=None): - - super().__init__(init_cfg) - if embed_dims % num_heads != 0: - raise ValueError(f'embed_dims must be divisible by num_heads, ' - f'but got {embed_dims} and {num_heads}') - dim_per_head = embed_dims // num_heads - self.norm_cfg = norm_cfg - self.dropout = nn.Dropout(dropout) - self.batch_first = batch_first - self.fp16_enabled = False - - # you'd better set dim_per_head to a power of 2 - # which is more efficient in the CUDA implementation - def _is_power_of_2(n): - if (not isinstance(n, int)) or (n < 0): - raise ValueError( - 'invalid input for _is_power_of_2: {} (type: {})'.format( - n, type(n))) - return (n & (n - 1) == 0) and n != 0 - - if not _is_power_of_2(dim_per_head): - warnings.warn( - "You'd better set embed_dims in " - 'MultiScaleDeformAttention to make ' - 'the dimension of each attention head a power of 2 ' - 'which is more efficient in our CUDA implementation.') - - self.im2col_step = im2col_step - self.embed_dims = embed_dims - self.num_levels = num_levels - self.num_heads = num_heads - self.num_points = num_points - self.num_bev_queue = num_bev_queue - self.sampling_offsets = nn.Linear( - embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points * 2) - self.attention_weights = nn.Linear(embed_dims*self.num_bev_queue, - num_bev_queue*num_heads * num_levels * num_points) - self.value_proj = nn.Linear(embed_dims, embed_dims) - self.output_proj = nn.Linear(embed_dims, embed_dims) - self.init_weights() - - def init_weights(self): - """Default initialization for Parameters of Module.""" - constant_init(self.sampling_offsets, 0.) - thetas = torch.arange( - self.num_heads, - dtype=torch.float32) * (2.0 * math.pi / self.num_heads) - grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) - grid_init = (grid_init / - grid_init.abs().max(-1, keepdim=True)[0]).view( - self.num_heads, 1, 1, - 2).repeat(1, self.num_levels*self.num_bev_queue, self.num_points, 1) - - for i in range(self.num_points): - grid_init[:, :, i, :] *= i + 1 - - self.sampling_offsets.bias.data = grid_init.view(-1) - constant_init(self.attention_weights, val=0., bias=0.) - xavier_init(self.value_proj, distribution='uniform', bias=0.) - xavier_init(self.output_proj, distribution='uniform', bias=0.) - self._is_init = True - - def forward(self, - query, - key=None, - value=None, - identity=None, - query_pos=None, - key_padding_mask=None, - reference_points=None, - spatial_shapes=None, - level_start_index=None, - flag='decoder', - - **kwargs): - """Forward Function of MultiScaleDeformAttention. 
- - Args: - query (Tensor): Query of Transformer with shape - (num_query, bs, embed_dims). - key (Tensor): The key tensor with shape - `(num_key, bs, embed_dims)`. - value (Tensor): The value tensor with shape - `(num_key, bs, embed_dims)`. - identity (Tensor): The tensor used for addition, with the - same shape as `query`. Default None. If None, - `query` will be used. - query_pos (Tensor): The positional encoding for `query`. - Default: None. - key_pos (Tensor): The positional encoding for `key`. Default - None. - reference_points (Tensor): The normalized reference - points with shape (bs, num_query, num_levels, 2), - all elements is range in [0, 1], top-left (0,0), - bottom-right (1, 1), including padding area. - or (N, Length_{query}, num_levels, 4), add - additional two dimensions is (w, h) to - form reference boxes. - key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_key]. - spatial_shapes (Tensor): Spatial shape of features in - different levels. With shape (num_levels, 2), - last dimension represents (h, w). - level_start_index (Tensor): The start index of each level. - A tensor has shape ``(num_levels, )`` and can be represented - as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. - - Returns: - Tensor: forwarded results with shape [num_query, bs, embed_dims]. - """ - - if value is None: - assert self.batch_first - bs, len_bev, c = query.shape - value = torch.stack([query, query], 1).reshape(bs*2, len_bev, c) - - # value = torch.cat([query, query], 0) - - if identity is None: - identity = query - if query_pos is not None: - query = query + query_pos - if not self.batch_first: - # change to (bs, num_query ,embed_dims) - query = query.permute(1, 0, 2) - value = value.permute(1, 0, 2) - bs, num_query, embed_dims = query.shape - _, num_value, _ = value.shape - assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value - assert self.num_bev_queue == 2 - - query = torch.cat([value[:bs], query], -1) - value = self.value_proj(value) - - if key_padding_mask is not None: - value = value.masked_fill(key_padding_mask[..., None], 0.0) - - value = value.reshape(bs*self.num_bev_queue, - num_value, self.num_heads, -1) - - sampling_offsets = self.sampling_offsets(query) - sampling_offsets = sampling_offsets.view( - bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels, self.num_points, 2) - attention_weights = self.attention_weights(query).view( - bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels * self.num_points) - attention_weights = attention_weights.softmax(-1) - - attention_weights = attention_weights.view(bs, num_query, - self.num_heads, - self.num_bev_queue, - self.num_levels, - self.num_points) - - attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5)\ - .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points).contiguous() - sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6)\ - .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points, 2) - - if reference_points.shape[-1] == 2: - offset_normalizer = torch.stack( - [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) - sampling_locations = reference_points[:, :, None, :, None, :] \ - + sampling_offsets \ - / offset_normalizer[None, None, None, :, None, :] - - elif reference_points.shape[-1] == 4: - sampling_locations = reference_points[:, :, None, :, None, :2] \ - + sampling_offsets / self.num_points \ - * reference_points[:, :, None, :, None, 2:] \ - * 0.5 - else: - raise ValueError( - f'Last 
dim of reference_points must be' - f' 2 or 4, but get {reference_points.shape[-1]} instead.') - if torch.cuda.is_available() and value.is_cuda: - - # using fp16 deformable attention is unstable because it performs many sum operations - if value.dtype == torch.float16: - MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 - else: - MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 - output = MultiScaleDeformableAttnFunction.apply( - value, spatial_shapes, level_start_index, sampling_locations, - attention_weights, self.im2col_step) - else: - - output = multi_scale_deformable_attn_pytorch( - value, spatial_shapes, sampling_locations, attention_weights) - - # output shape (bs*num_bev_queue, num_query, embed_dims) - # (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue) - output = output.permute(1, 2, 0) - - # fuse history value and current value - # (num_query, embed_dims, bs*num_bev_queue)-> (num_query, embed_dims, bs, num_bev_queue) - output = output.view(num_query, embed_dims, bs, self.num_bev_queue) - output = output.mean(-1) - - # (num_query, embed_dims, bs)-> (bs, num_query, embed_dims) - output = output.permute(2, 0, 1) - - output = self.output_proj(output) - - if not self.batch_first: - output = output.permute(1, 0, 2) - - return self.dropout(output) + identity +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32 +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import warnings +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import ATTENTION +import math +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@ATTENTION.register_module() +class TemporalSelfAttention(BaseModule): + """An attention module used in BEVFormer based on Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to True. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + num_bev_queue (int): In this version, we only use one history BEV and one currenct BEV. + the length of BEV queue is 2. 
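+        The ``value`` tensor holds the BEV queue (history BEV and current BEV);
+        when it is not provided, the current BEV queries are stacked twice as a
+        fallback. The attention outputs of the two queue entries are fused by
+        averaging over the queue dimension.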
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + num_bev_queue=2, + im2col_step=64, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.num_bev_queue = num_bev_queue + self.sampling_offsets = nn.Linear( + embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims*self.num_bev_queue, + num_bev_queue*num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels*self.num_bev_queue, self.num_points, 1) + + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + flag='decoder', + + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. 
+ or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + assert self.batch_first + bs, len_bev, c = query.shape + value = torch.stack([query, query], 1).reshape(bs*2, len_bev, c) + + # value = torch.cat([query, query], 0) + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + bs, num_query, embed_dims = query.shape + _, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + assert self.num_bev_queue == 2 + + query = torch.cat([value[:bs], query], -1) + value = self.value_proj(value) + + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + + value = value.reshape(bs*self.num_bev_queue, + num_value, self.num_heads, -1) + + sampling_offsets = self.sampling_offsets(query) + sampling_offsets = sampling_offsets.view( + bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_bev_queue, + self.num_levels, + self.num_points) + + attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5)\ + .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points).contiguous() + sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6)\ + .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points, 2) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + # using fp16 deformable attention is unstable because it performs many sum operations + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + # output shape 
(bs*num_bev_queue, num_query, embed_dims) + # (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue) + output = output.permute(1, 2, 0) + + # fuse history value and current value + # (num_query, embed_dims, bs*num_bev_queue)-> (num_query, embed_dims, bs, num_bev_queue) + output = output.view(num_query, embed_dims, bs, self.num_bev_queue) + output = output.mean(-1) + + # (num_query, embed_dims, bs)-> (bs, num_query, embed_dims) + output = output.permute(2, 0, 1) + + output = self.output_proj(output) + + if not self.batch_first: + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/transformer.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/transformer.py index b740fccf5..1f8612762 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/transformer.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/modules/transformer.py @@ -1,289 +1,289 @@ -# --------------------------------------------- -# Copyright (c) OpenMMLab. All rights reserved. -# --------------------------------------------- -# Modified by Zhiqi Li -# --------------------------------------------- - -import numpy as np -import torch -import torch.nn as nn -from mmcv.cnn import xavier_init -from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence -from mmcv.runner.base_module import BaseModule - -from mmdet.models.utils.builder import TRANSFORMER -from torch.nn.init import normal_ -from projects.mmdet3d_plugin.models.utils.visual import save_tensor -from mmcv.runner.base_module import BaseModule -from torchvision.transforms.functional import rotate -from .temporal_self_attention import TemporalSelfAttention -from .spatial_cross_attention import MSDeformableAttention3D -from .decoder import CustomMSDeformableAttention -from projects.mmdet3d_plugin.models.utils.bricks import run_time -from mmcv.runner import force_fp32, auto_fp16 - - -@TRANSFORMER.register_module() -class PerceptionTransformer(BaseModule): - """Implements the Detr3D transformer. - Args: - as_two_stage (bool): Generate query from encoder features. - Default: False. - num_feature_levels (int): Number of feature maps from FPN: - Default: 4. - two_stage_num_proposals (int): Number of proposals when set - `as_two_stage` as True. Default: 300. 
- """ - - def __init__(self, - num_feature_levels=4, - num_cams=6, - two_stage_num_proposals=300, - encoder=None, - decoder=None, - embed_dims=256, - rotate_prev_bev=True, - use_shift=True, - use_can_bus=True, - can_bus_norm=True, - use_cams_embeds=True, - rotate_center=[100, 100], - **kwargs): - super(PerceptionTransformer, self).__init__(**kwargs) - self.encoder = build_transformer_layer_sequence(encoder) - self.decoder = build_transformer_layer_sequence(decoder) - self.embed_dims = embed_dims - self.num_feature_levels = num_feature_levels - self.num_cams = num_cams - self.fp16_enabled = False - - self.rotate_prev_bev = rotate_prev_bev - self.use_shift = use_shift - self.use_can_bus = use_can_bus - self.can_bus_norm = can_bus_norm - self.use_cams_embeds = use_cams_embeds - - self.two_stage_num_proposals = two_stage_num_proposals - self.init_layers() - self.rotate_center = rotate_center - - def init_layers(self): - """Initialize layers of the Detr3DTransformer.""" - self.level_embeds = nn.Parameter(torch.Tensor( - self.num_feature_levels, self.embed_dims)) - self.cams_embeds = nn.Parameter( - torch.Tensor(self.num_cams, self.embed_dims)) - self.reference_points = nn.Linear(self.embed_dims, 3) - self.can_bus_mlp = nn.Sequential( - nn.Linear(18, self.embed_dims // 2), - nn.ReLU(inplace=True), - nn.Linear(self.embed_dims // 2, self.embed_dims), - nn.ReLU(inplace=True), - ) - if self.can_bus_norm: - self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims)) - - def init_weights(self): - """Initialize the transformer weights.""" - for p in self.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - for m in self.modules(): - if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \ - or isinstance(m, CustomMSDeformableAttention): - try: - m.init_weight() - except AttributeError: - m.init_weights() - normal_(self.level_embeds) - normal_(self.cams_embeds) - xavier_init(self.reference_points, distribution='uniform', bias=0.) - xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.) - - @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) - def get_bev_features( - self, - mlvl_feats, - bev_queries, - bev_h, - bev_w, - grid_length=[0.512, 0.512], - bev_pos=None, - prev_bev=None, - **kwargs): - """ - obtain bev features. 
- """ - - bs = mlvl_feats[0].size(0) - bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) - bev_pos = bev_pos.flatten(2).permute(2, 0, 1) - - # obtain rotation angle and shift with ego motion - delta_x = np.array([each['can_bus'][0] - for each in kwargs['img_metas']]) - delta_y = np.array([each['can_bus'][1] - for each in kwargs['img_metas']]) - ego_angle = np.array( - [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']]) - grid_length_y = grid_length[0] - grid_length_x = grid_length[1] - translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2) - translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180 - bev_angle = ego_angle - translation_angle - shift_y = translation_length * \ - np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h - shift_x = translation_length * \ - np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w - shift_y = shift_y * self.use_shift - shift_x = shift_x * self.use_shift - shift = bev_queries.new_tensor( - [shift_x, shift_y]).permute(1, 0) # xy, bs -> bs, xy - - if prev_bev is not None: - if prev_bev.shape[1] == bev_h * bev_w: - prev_bev = prev_bev.permute(1, 0, 2) - if self.rotate_prev_bev: - for i in range(bs): - # num_prev_bev = prev_bev.size(1) - rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] - tmp_prev_bev = prev_bev[:, i].reshape( - bev_h, bev_w, -1).permute(2, 0, 1) - tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, - center=self.rotate_center) - tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( - bev_h * bev_w, 1, -1) - prev_bev[:, i] = tmp_prev_bev[:, 0] - - # add can bus signals - can_bus = bev_queries.new_tensor( - [each['can_bus'] for each in kwargs['img_metas']]) # [:, :] - can_bus = self.can_bus_mlp(can_bus)[None, :, :] - bev_queries = bev_queries + can_bus * self.use_can_bus - - feat_flatten = [] - spatial_shapes = [] - for lvl, feat in enumerate(mlvl_feats): - bs, num_cam, c, h, w = feat.shape - spatial_shape = (h, w) - feat = feat.flatten(3).permute(1, 0, 3, 2) - if self.use_cams_embeds: - feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) - feat = feat + self.level_embeds[None, - None, lvl:lvl + 1, :].to(feat.dtype) - spatial_shapes.append(spatial_shape) - feat_flatten.append(feat) - - feat_flatten = torch.cat(feat_flatten, 2) - spatial_shapes = torch.as_tensor( - spatial_shapes, dtype=torch.long, device=bev_pos.device) - level_start_index = torch.cat((spatial_shapes.new_zeros( - (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) - - feat_flatten = feat_flatten.permute( - 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) - - bev_embed = self.encoder( - bev_queries, - feat_flatten, - feat_flatten, - bev_h=bev_h, - bev_w=bev_w, - bev_pos=bev_pos, - spatial_shapes=spatial_shapes, - level_start_index=level_start_index, - prev_bev=prev_bev, - shift=shift, - **kwargs - ) - - return bev_embed - - @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos')) - def forward(self, - mlvl_feats, - bev_queries, - object_query_embed, - bev_h, - bev_w, - grid_length=[0.512, 0.512], - bev_pos=None, - reg_branches=None, - cls_branches=None, - prev_bev=None, - **kwargs): - """Forward function for `Detr3DTransformer`. - Args: - mlvl_feats (list(Tensor)): Input queries from - different level. Each element has shape - [bs, num_cams, embed_dims, h, w]. - bev_queries (Tensor): (bev_h*bev_w, c) - bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) - object_query_embed (Tensor): The query embedding for decoder, - with shape [num_query, c]. 
- reg_branches (obj:`nn.ModuleList`): Regression heads for - feature maps from each decoder layer. Only would - be passed when `with_box_refine` is True. Default to None. - Returns: - tuple[Tensor]: results of decoder containing the following tensor. - - bev_embed: BEV features - - inter_states: Outputs from decoder. If - return_intermediate_dec is True output has shape \ - (num_dec_layers, bs, num_query, embed_dims), else has \ - shape (1, bs, num_query, embed_dims). - - init_reference_out: The initial value of reference \ - points, has shape (bs, num_queries, 4). - - inter_references_out: The internal value of reference \ - points in decoder, has shape \ - (num_dec_layers, bs,num_query, embed_dims) - - enc_outputs_class: The classification score of \ - proposals generated from \ - encoder's feature maps, has shape \ - (batch, h*w, num_classes). \ - Only would be returned when `as_two_stage` is True, \ - otherwise None. - - enc_outputs_coord_unact: The regression results \ - generated from encoder's feature maps., has shape \ - (batch, h*w, 4). Only would \ - be returned when `as_two_stage` is True, \ - otherwise None. - """ - - bev_embed = self.get_bev_features( - mlvl_feats, - bev_queries, - bev_h, - bev_w, - grid_length=grid_length, - bev_pos=bev_pos, - prev_bev=prev_bev, - **kwargs) # bev_embed shape: bs, bev_h*bev_w, embed_dims - - bs = mlvl_feats[0].size(0) - query_pos, query = torch.split( - object_query_embed, self.embed_dims, dim=1) - query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) - query = query.unsqueeze(0).expand(bs, -1, -1) - reference_points = self.reference_points(query_pos) - reference_points = reference_points.sigmoid() - init_reference_out = reference_points - - query = query.permute(1, 0, 2) - query_pos = query_pos.permute(1, 0, 2) - bev_embed = bev_embed.permute(1, 0, 2) - - inter_states, inter_references = self.decoder( - query=query, - key=None, - value=bev_embed, - query_pos=query_pos, - reference_points=reference_points, - reg_branches=reg_branches, - cls_branches=cls_branches, - spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device), - level_start_index=torch.tensor([0], device=query.device), - **kwargs) - - inter_references_out = inter_references - - return bev_embed, inter_states, init_reference_out, inter_references_out +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init +from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence +from mmcv.runner.base_module import BaseModule + +from mmdet.models.utils.builder import TRANSFORMER +from torch.nn.init import normal_ +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from mmcv.runner.base_module import BaseModule +from torchvision.transforms.functional import rotate +from .temporal_self_attention import TemporalSelfAttention +from .spatial_cross_attention import MSDeformableAttention3D +from .decoder import CustomMSDeformableAttention +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from mmcv.runner import force_fp32, auto_fp16 + + +@TRANSFORMER.register_module() +class PerceptionTransformer(BaseModule): + """Implements the Detr3D transformer. + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. 
+ num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. + """ + + def __init__(self, + num_feature_levels=4, + num_cams=6, + two_stage_num_proposals=300, + encoder=None, + decoder=None, + embed_dims=256, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + can_bus_norm=True, + use_cams_embeds=True, + rotate_center=[100, 100], + **kwargs): + super(PerceptionTransformer, self).__init__(**kwargs) + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = embed_dims + self.num_feature_levels = num_feature_levels + self.num_cams = num_cams + self.fp16_enabled = False + + self.rotate_prev_bev = rotate_prev_bev + self.use_shift = use_shift + self.use_can_bus = use_can_bus + self.can_bus_norm = can_bus_norm + self.use_cams_embeds = use_cams_embeds + + self.two_stage_num_proposals = two_stage_num_proposals + self.init_layers() + self.rotate_center = rotate_center + + def init_layers(self): + """Initialize layers of the Detr3DTransformer.""" + self.level_embeds = nn.Parameter(torch.Tensor( + self.num_feature_levels, self.embed_dims)) + self.cams_embeds = nn.Parameter( + torch.Tensor(self.num_cams, self.embed_dims)) + self.reference_points = nn.Linear(self.embed_dims, 3) + self.can_bus_mlp = nn.Sequential( + nn.Linear(18, self.embed_dims // 2), + nn.ReLU(inplace=True), + nn.Linear(self.embed_dims // 2, self.embed_dims), + nn.ReLU(inplace=True), + ) + if self.can_bus_norm: + self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims)) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \ + or isinstance(m, CustomMSDeformableAttention): + try: + m.init_weight() + except AttributeError: + m.init_weights() + normal_(self.level_embeds) + normal_(self.cams_embeds) + xavier_init(self.reference_points, distribution='uniform', bias=0.) + xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.) + + @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) + def get_bev_features( + self, + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + prev_bev=None, + **kwargs): + """ + obtain bev features. 
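+        Expected shapes (as used by ``forward``; not checked here):
+            - mlvl_feats: list of (bs, num_cams, embed_dims, h, w) tensors.
+            - bev_queries: (bev_h * bev_w, embed_dims).
+            - bev_pos: (bs, embed_dims, bev_h, bev_w).
+            - prev_bev (optional): (bs, bev_h * bev_w, embed_dims).
+
+        Returns the encoded BEV features with shape
+        (bs, bev_h * bev_w, embed_dims).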
+ """ + + bs = mlvl_feats[0].size(0) + bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) + bev_pos = bev_pos.flatten(2).permute(2, 0, 1) + + # obtain rotation angle and shift with ego motion + delta_x = np.array([each['can_bus'][0] + for each in kwargs['img_metas']]) + delta_y = np.array([each['can_bus'][1] + for each in kwargs['img_metas']]) + ego_angle = np.array( + [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']]) + grid_length_y = grid_length[0] + grid_length_x = grid_length[1] + translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2) + translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180 + bev_angle = ego_angle - translation_angle + shift_y = translation_length * \ + np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h + shift_x = translation_length * \ + np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w + shift_y = shift_y * self.use_shift + shift_x = shift_x * self.use_shift + shift = bev_queries.new_tensor( + [shift_x, shift_y]).permute(1, 0) # xy, bs -> bs, xy + + if prev_bev is not None: + if prev_bev.shape[1] == bev_h * bev_w: + prev_bev = prev_bev.permute(1, 0, 2) + if self.rotate_prev_bev: + for i in range(bs): + # num_prev_bev = prev_bev.size(1) + rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] + tmp_prev_bev = prev_bev[:, i].reshape( + bev_h, bev_w, -1).permute(2, 0, 1) + tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, + center=self.rotate_center) + tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( + bev_h * bev_w, 1, -1) + prev_bev[:, i] = tmp_prev_bev[:, 0] + + # add can bus signals + can_bus = bev_queries.new_tensor( + [each['can_bus'] for each in kwargs['img_metas']]) # [:, :] + can_bus = self.can_bus_mlp(can_bus)[None, :, :] + bev_queries = bev_queries + can_bus * self.use_can_bus + + feat_flatten = [] + spatial_shapes = [] + for lvl, feat in enumerate(mlvl_feats): + bs, num_cam, c, h, w = feat.shape + spatial_shape = (h, w) + feat = feat.flatten(3).permute(1, 0, 3, 2) + if self.use_cams_embeds: + feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) + feat = feat + self.level_embeds[None, + None, lvl:lvl + 1, :].to(feat.dtype) + spatial_shapes.append(spatial_shape) + feat_flatten.append(feat) + + feat_flatten = torch.cat(feat_flatten, 2) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=bev_pos.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + feat_flatten = feat_flatten.permute( + 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) + + bev_embed = self.encoder( + bev_queries, + feat_flatten, + feat_flatten, + bev_h=bev_h, + bev_w=bev_w, + bev_pos=bev_pos, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + prev_bev=prev_bev, + shift=shift, + **kwargs + ) + + return bev_embed + + @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos')) + def forward(self, + mlvl_feats, + bev_queries, + object_query_embed, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + reg_branches=None, + cls_branches=None, + prev_bev=None, + **kwargs): + """Forward function for `Detr3DTransformer`. + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, num_cams, embed_dims, h, w]. + bev_queries (Tensor): (bev_h*bev_w, c) + bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) + object_query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. 
+ reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when `with_box_refine` is True. Default to None. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - bev_embed: BEV features + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. + """ + + bev_embed = self.get_bev_features( + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=grid_length, + bev_pos=bev_pos, + prev_bev=prev_bev, + **kwargs) # bev_embed shape: bs, bev_h*bev_w, embed_dims + + bs = mlvl_feats[0].size(0) + query_pos, query = torch.split( + object_query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos) + reference_points = reference_points.sigmoid() + init_reference_out = reference_points + + query = query.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + bev_embed = bev_embed.permute(1, 0, 2) + + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=bev_embed, + query_pos=query_pos, + reference_points=reference_points, + reg_branches=reg_branches, + cls_branches=cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + + inter_references_out = inter_references + + return bev_embed, inter_states, init_reference_out, inter_references_out diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/runner/epoch_based_runner.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/runner/epoch_based_runner.py index bd3fb34b0..ad176abc1 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/runner/epoch_based_runner.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/bevformer/runner/epoch_based_runner.py @@ -1,97 +1,97 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-# --------------------------------------------- -# Modified by Zhiqi Li -# --------------------------------------------- - -import os.path as osp -import torch -import mmcv -from mmcv.runner.base_runner import BaseRunner -from mmcv.runner.epoch_based_runner import EpochBasedRunner -from mmcv.runner.builder import RUNNERS -from mmcv.runner.checkpoint import save_checkpoint -from mmcv.runner.utils import get_host_info -from pprint import pprint -from mmcv.parallel.data_container import DataContainer - - -@RUNNERS.register_module() -class EpochBasedRunner_video(EpochBasedRunner): - - ''' - # basic logic - - input_sequence = [a, b, c] # given a sequence of samples - - prev_bev = None - for each in input_sequcene[:-1] - prev_bev = eval_model(each, prev_bev)) # inference only. - - model(input_sequcene[-1], prev_bev) # train the last sample. - ''' - - def __init__(self, - model, - eval_model=None, - batch_processor=None, - optimizer=None, - work_dir=None, - logger=None, - meta=None, - keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'], - max_iters=None, - max_epochs=None): - super().__init__(model, - batch_processor, - optimizer, - work_dir, - logger, - meta, - max_iters, - max_epochs) - keys.append('img_metas') - self.keys = keys - self.eval_model = eval_model - self.eval_model.eval() - - def run_iter(self, data_batch, train_mode, **kwargs): - if self.batch_processor is not None: - assert False - # outputs = self.batch_processor( - # self.model, data_batch, train_mode=train_mode, **kwargs) - elif train_mode: - - num_samples = data_batch['img'].data[0].size(1) - data_list = [] - prev_bev = None - for i in range(num_samples): - data = {} - for key in self.keys: - if key not in ['img_metas', 'img', 'points']: - data[key] = data_batch[key] - else: - if key == 'img': - data['img'] = DataContainer(data=[data_batch['img'].data[0][:, i]], cpu_only=data_batch['img'].cpu_only, stack=True) - elif key == 'img_metas': - data['img_metas'] = DataContainer(data=[[each[i] for each in data_batch['img_metas'].data[0]]], cpu_only=data_batch['img_metas'].cpu_only) - else: - assert False - data_list.append(data) - with torch.no_grad(): - for i in range(num_samples-1): - if data_list[i]['img_metas'].data[0][0]['prev_bev_exists']: - data_list[i]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) - prev_bev = self.eval_model.val_step(data_list[i], self.optimizer, **kwargs) - if data_list[-1]['img_metas'].data[0][0]['prev_bev_exists']: - data_list[-1]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) - outputs = self.model.train_step(data_list[-1], self.optimizer, **kwargs) - else: - assert False - # outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) - - if not isinstance(outputs, dict): - raise TypeError('"batch_processor()" or "model.train_step()"' - 'and "model.val_step()" must return a dict') - if 'log_vars' in outputs: - self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import os.path as osp +import torch +import mmcv +from mmcv.runner.base_runner import BaseRunner +from mmcv.runner.epoch_based_runner import EpochBasedRunner +from mmcv.runner.builder import RUNNERS +from mmcv.runner.checkpoint import save_checkpoint +from mmcv.runner.utils import get_host_info +from pprint import pprint +from mmcv.parallel.data_container import DataContainer + + +@RUNNERS.register_module() +class EpochBasedRunner_video(EpochBasedRunner): + + ''' + # basic logic + + input_sequence = [a, b, c] # given a sequence of samples + + prev_bev = None + for each in input_sequcene[:-1] + prev_bev = eval_model(each, prev_bev)) # inference only. + + model(input_sequcene[-1], prev_bev) # train the last sample. + ''' + + def __init__(self, + model, + eval_model=None, + batch_processor=None, + optimizer=None, + work_dir=None, + logger=None, + meta=None, + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'], + max_iters=None, + max_epochs=None): + super().__init__(model, + batch_processor, + optimizer, + work_dir, + logger, + meta, + max_iters, + max_epochs) + keys.append('img_metas') + self.keys = keys + self.eval_model = eval_model + self.eval_model.eval() + + def run_iter(self, data_batch, train_mode, **kwargs): + if self.batch_processor is not None: + assert False + # outputs = self.batch_processor( + # self.model, data_batch, train_mode=train_mode, **kwargs) + elif train_mode: + + num_samples = data_batch['img'].data[0].size(1) + data_list = [] + prev_bev = None + for i in range(num_samples): + data = {} + for key in self.keys: + if key not in ['img_metas', 'img', 'points']: + data[key] = data_batch[key] + else: + if key == 'img': + data['img'] = DataContainer(data=[data_batch['img'].data[0][:, i]], cpu_only=data_batch['img'].cpu_only, stack=True) + elif key == 'img_metas': + data['img_metas'] = DataContainer(data=[[each[i] for each in data_batch['img_metas'].data[0]]], cpu_only=data_batch['img_metas'].cpu_only) + else: + assert False + data_list.append(data) + with torch.no_grad(): + for i in range(num_samples-1): + if data_list[i]['img_metas'].data[0][0]['prev_bev_exists']: + data_list[i]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) + prev_bev = self.eval_model.val_step(data_list[i], self.optimizer, **kwargs) + if data_list[-1]['img_metas'].data[0][0]['prev_bev_exists']: + data_list[-1]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) + outputs = self.model.train_step(data_list[-1], self.optimizer, **kwargs) + else: + assert False + # outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) + + if not isinstance(outputs, dict): + raise TypeError('"batch_processor()" or "model.train_step()"' + 'and "model.val_step()" must return a dict') + if 'log_vars' in outputs: + self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) self.outputs = outputs \ No newline at end of file diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/datasets/nuscenes_mono_dataset.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/datasets/nuscenes_mono_dataset.py index b036b87b8..0c76b9998 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/datasets/nuscenes_mono_dataset.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/datasets/nuscenes_mono_dataset.py @@ -1,777 +1,777 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import copy -import mmcv -import numpy as np -import pyquaternion -import tempfile -import torch -import warnings -from nuscenes.utils.data_classes import Box as NuScenesBox -from os import path as osp - -from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr -from mmdet.datasets import DATASETS, CocoDataset -from mmdet3d.core import show_multi_modality_result -from mmdet3d.core.bbox import CameraInstance3DBoxes, get_box_type -from mmdet3d.datasets.pipelines import Compose -from mmdet3d.datasets.utils import extract_result_dict, get_loading_pipeline - - -@DATASETS.register_module() -class CustomNuScenesMonoDataset(CocoDataset): - r"""Monocular 3D detection on NuScenes Dataset. - This class serves as the API for experiments on the NuScenes Dataset. - Please refer to `NuScenes Dataset `_ - for data downloading. - Args: - ann_file (str): Path of annotation file. - data_root (str): Path of dataset root. - load_interval (int, optional): Interval of loading the dataset. It is - used to uniformly sample the dataset. Defaults to 1. - with_velocity (bool, optional): Whether include velocity prediction - into the experiments. Defaults to True. - modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - box_type_3d (str, optional): Type of 3D box of this dataset. - Based on the `box_type_3d`, the dataset will encapsulate the box - to its original format then converted them to `box_type_3d`. - Defaults to 'Camera' in this class. Available options includes. - - 'LiDAR': Box in LiDAR coordinates. - - 'Depth': Box in depth coordinates, usually for indoor dataset. - - 'Camera': Box in camera coordinates. - eval_version (str, optional): Configuration version of evaluation. - Defaults to 'detection_cvpr_2019'. - use_valid_flag (bool): Whether to use `use_valid_flag` key in the info - file as mask to filter gt_boxes and gt_names. Defaults to False. - version (str, optional): Dataset version. Defaults to 'v1.0-trainval'. 
- """ - CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', - 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', - 'barrier') - DefaultAttribute = { - 'car': 'vehicle.parked', - 'pedestrian': 'pedestrian.moving', - 'trailer': 'vehicle.parked', - 'truck': 'vehicle.parked', - 'bus': 'vehicle.moving', - 'motorcycle': 'cycle.without_rider', - 'construction_vehicle': 'vehicle.parked', - 'bicycle': 'cycle.without_rider', - 'barrier': '', - 'traffic_cone': '', - } - # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa - ErrNameMapping = { - 'trans_err': 'mATE', - 'scale_err': 'mASE', - 'orient_err': 'mAOE', - 'vel_err': 'mAVE', - 'attr_err': 'mAAE' - } - - def __init__(self, - data_root, - load_interval=1, - with_velocity=True, - modality=None, - box_type_3d='Camera', - eval_version='detection_cvpr_2019', - use_valid_flag=False, - overlap_test=False, - version='v1.0-trainval', - **kwargs): - super().__init__(**kwargs) - # overlap_test = True - self.data_root = data_root - self.overlap_test = overlap_test - self.load_interval = load_interval - self.with_velocity = with_velocity - self.modality = modality - self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) - self.eval_version = eval_version - self.use_valid_flag = use_valid_flag - self.bbox_code_size = 9 - self.version = version - if self.eval_version is not None: - from nuscenes.eval.detection.config import config_factory - self.eval_detection_configs = config_factory(self.eval_version) - if self.modality is None: - self.modality = dict( - use_camera=True, - use_lidar=False, - use_radar=False, - use_map=False, - use_external=False) - - def pre_pipeline(self, results): - """Initialization before data preparation. - Args: - results (dict): Dict before data preprocessing. - - img_fields (list): Image fields. - - bbox3d_fields (list): 3D bounding boxes fields. - - pts_mask_fields (list): Mask fields of points. - - pts_seg_fields (list): Mask fields of point segments. - - bbox_fields (list): Fields of bounding boxes. - - mask_fields (list): Fields of masks. - - seg_fields (list): Segment fields. - - box_type_3d (str): 3D box type. - - box_mode_3d (str): 3D box mode. - """ - results['img_prefix'] = '' # self.img_prefix - # print('img_prefix', self.img_prefix) - results['seg_prefix'] = self.seg_prefix - results['proposal_file'] = self.proposal_file - results['img_fields'] = [] - results['bbox3d_fields'] = [] - results['pts_mask_fields'] = [] - results['pts_seg_fields'] = [] - results['bbox_fields'] = [] - results['mask_fields'] = [] - results['seg_fields'] = [] - results['box_type_3d'] = self.box_type_3d - results['box_mode_3d'] = self.box_mode_3d - - def _parse_ann_info(self, img_info, ann_info): - """Parse bbox annotation. - Args: - img_info (list[dict]): Image info. - ann_info (list[dict]): Annotation info of an image. 
- Returns: - dict: A dict containing the following keys: bboxes, labels, \ - gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, \ - depths, bboxes_ignore, masks, seg_map - """ - gt_bboxes = [] - gt_labels = [] - attr_labels = [] - gt_bboxes_ignore = [] - gt_masks_ann = [] - gt_bboxes_cam3d = [] - centers2d = [] - depths = [] - for i, ann in enumerate(ann_info): - if ann.get('ignore', False): - continue - x1, y1, w, h = ann['bbox'] - inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) - inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) - if inter_w * inter_h == 0: - continue - if ann['area'] <= 0 or w < 1 or h < 1: - continue - if ann['category_id'] not in self.cat_ids: - continue - bbox = [x1, y1, x1 + w, y1 + h] - if ann.get('iscrowd', False): - gt_bboxes_ignore.append(bbox) - else: - gt_bboxes.append(bbox) - gt_labels.append(self.cat2label[ann['category_id']]) - attr_labels.append(ann['attribute_id']) - gt_masks_ann.append(ann.get('segmentation', None)) - # 3D annotations in camera coordinates - bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(1, -1) - velo_cam3d = np.array(ann['velo_cam3d']).reshape(1, 2) - nan_mask = np.isnan(velo_cam3d[:, 0]) - velo_cam3d[nan_mask] = [0.0, 0.0] - bbox_cam3d = np.concatenate([bbox_cam3d, velo_cam3d], axis=-1) - gt_bboxes_cam3d.append(bbox_cam3d.squeeze()) - # 2.5D annotations in camera coordinates - center2d = ann['center2d'][:2] - depth = ann['center2d'][2] - centers2d.append(center2d) - depths.append(depth) - - if gt_bboxes: - gt_bboxes = np.array(gt_bboxes, dtype=np.float32) - gt_labels = np.array(gt_labels, dtype=np.int64) - attr_labels = np.array(attr_labels, dtype=np.int64) - else: - gt_bboxes = np.zeros((0, 4), dtype=np.float32) - gt_labels = np.array([], dtype=np.int64) - attr_labels = np.array([], dtype=np.int64) - - if gt_bboxes_cam3d: - gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32) - centers2d = np.array(centers2d, dtype=np.float32) - depths = np.array(depths, dtype=np.float32) - else: - gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size), - dtype=np.float32) - centers2d = np.zeros((0, 2), dtype=np.float32) - depths = np.zeros((0), dtype=np.float32) - - gt_bboxes_cam3d = CameraInstance3DBoxes( - gt_bboxes_cam3d, - box_dim=gt_bboxes_cam3d.shape[-1], - origin=(0.5, 0.5, 0.5)) - gt_labels_3d = copy.deepcopy(gt_labels) - - if gt_bboxes_ignore: - gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) - else: - gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) - - seg_map = img_info['filename'].replace('jpg', 'png') - - ann = dict( - bboxes=gt_bboxes, - labels=gt_labels, - gt_bboxes_3d=gt_bboxes_cam3d, - gt_labels_3d=gt_labels_3d, - attr_labels=attr_labels, - centers2d=centers2d, - depths=depths, - bboxes_ignore=gt_bboxes_ignore, - masks=gt_masks_ann, - seg_map=seg_map) - - return ann - - def get_attr_name(self, attr_idx, label_name): - """Get attribute from predicted index. - This is a workaround to predict attribute when the predicted velocity - is not reliable. We map the predicted attribute index to the one - in the attribute set. If it is consistent with the category, we will - keep it. Otherwise, we will use the default attribute. - Args: - attr_idx (int): Attribute index. - label_name (str): Predicted category name. - Returns: - str: Predicted attribute name. 
- """ - # TODO: Simplify the variable name - AttrMapping_rev2 = [ - 'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving', - 'pedestrian.standing', 'pedestrian.sitting_lying_down', - 'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None' - ] - if label_name == 'car' or label_name == 'bus' \ - or label_name == 'truck' or label_name == 'trailer' \ - or label_name == 'construction_vehicle': - if AttrMapping_rev2[attr_idx] == 'vehicle.moving' or \ - AttrMapping_rev2[attr_idx] == 'vehicle.parked' or \ - AttrMapping_rev2[attr_idx] == 'vehicle.stopped': - return AttrMapping_rev2[attr_idx] - else: - return CustomNuScenesMonoDataset.DefaultAttribute[label_name] - elif label_name == 'pedestrian': - if AttrMapping_rev2[attr_idx] == 'pedestrian.moving' or \ - AttrMapping_rev2[attr_idx] == 'pedestrian.standing' or \ - AttrMapping_rev2[attr_idx] == \ - 'pedestrian.sitting_lying_down': - return AttrMapping_rev2[attr_idx] - else: - return CustomNuScenesMonoDataset.DefaultAttribute[label_name] - elif label_name == 'bicycle' or label_name == 'motorcycle': - if AttrMapping_rev2[attr_idx] == 'cycle.with_rider' or \ - AttrMapping_rev2[attr_idx] == 'cycle.without_rider': - return AttrMapping_rev2[attr_idx] - else: - return CustomNuScenesMonoDataset.DefaultAttribute[label_name] - else: - return CustomNuScenesMonoDataset.DefaultAttribute[label_name] - - def _format_bbox(self, results, jsonfile_prefix=None): - """Convert the results to the standard format. - Args: - results (list[dict]): Testing results of the dataset. - jsonfile_prefix (str): The prefix of the output jsonfile. - You can specify the output directory/filename by - modifying the jsonfile_prefix. Default: None. - Returns: - str: Path of the output json file. - """ - nusc_annos = {} - mapped_class_names = self.CLASSES - - print('Start to convert detection format...') - - CAM_NUM = 6 - - for sample_id, det in enumerate(mmcv.track_iter_progress(results)): - - if sample_id % CAM_NUM == 0: - boxes_per_frame = [] - attrs_per_frame = [] - - # need to merge results from images of the same sample - annos = [] - boxes, attrs = output_to_nusc_box(det) - sample_token = self.data_infos[sample_id]['token'] - boxes, attrs = cam_nusc_box_to_global(self.data_infos[sample_id], - boxes, attrs, - mapped_class_names, - self.eval_detection_configs, - self.eval_version) - - boxes_per_frame.extend(boxes) - attrs_per_frame.extend(attrs) - # Remove redundant predictions caused by overlap of images - if (sample_id + 1) % CAM_NUM != 0: - continue - boxes = global_nusc_box_to_cam( - self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame, - mapped_class_names, self.eval_detection_configs, - self.eval_version) - cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes) - # box nms 3d over 6 images in a frame - # TODO: move this global setting into config - nms_cfg = dict( - use_rotate_nms=True, - nms_across_levels=False, - nms_pre=4096, - nms_thr=0.05, - score_thr=0.01, - min_bbox_size=0, - max_per_frame=500) - from mmcv import Config - nms_cfg = Config(nms_cfg) - cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev) - boxes3d = cam_boxes3d.tensor - # generate attr scores from attr labels - attrs = labels.new_tensor([attr for attr in attrs_per_frame]) - boxes3d, scores, labels, attrs = box3d_multiclass_nms( - boxes3d, - cam_boxes3d_for_nms, - scores, - nms_cfg.score_thr, - nms_cfg.max_per_frame, - nms_cfg, - mlvl_attr_scores=attrs) - cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9) - det = bbox3d2result(cam_boxes3d, scores, labels, attrs) - boxes, attrs = 
output_to_nusc_box(det) - boxes, attrs = cam_nusc_box_to_global( - self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs, - mapped_class_names, self.eval_detection_configs, - self.eval_version) - - for i, box in enumerate(boxes): - name = mapped_class_names[box.label] - attr = self.get_attr_name(attrs[i], name) - nusc_anno = dict( - sample_token=sample_token, - translation=box.center.tolist(), - size=box.wlh.tolist(), - rotation=box.orientation.elements.tolist(), - velocity=box.velocity[:2].tolist(), - detection_name=name, - detection_score=box.score, - attribute_name=attr) - annos.append(nusc_anno) - # other views results of the same frame should be concatenated - if sample_token in nusc_annos: - nusc_annos[sample_token].extend(annos) - else: - nusc_annos[sample_token] = annos - - nusc_submissions = { - 'meta': self.modality, - 'results': nusc_annos, - } - - mmcv.mkdir_or_exist(jsonfile_prefix) - res_path = osp.join(jsonfile_prefix, 'results_nusc.json') - print('Results writes to', res_path) - mmcv.dump(nusc_submissions, res_path) - return res_path - - def _evaluate_single(self, - result_path, - logger=None, - metric='bbox', - result_name='img_bbox'): - """Evaluation for a single model in nuScenes protocol. - Args: - result_path (str): Path of the result file. - logger (logging.Logger | str | None): Logger used for printing - related information during evaluation. Default: None. - metric (str): Metric name used for evaluation. Default: 'bbox'. - result_name (str): Result name in the metric prefix. - Default: 'img_bbox'. - Returns: - dict: Dictionary of evaluation details. - """ - from nuscenes import NuScenes - #from nuscenes.eval.detection.evaluate import NuScenesEval - from .nuscnes_eval import NuScenesEval_custom - output_dir = osp.join(*osp.split(result_path)[:-1]) - self.nusc = NuScenes( - version=self.version, dataroot=self.data_root, verbose=False) - eval_set_map = { - 'v1.0-mini': 'mini_val', - 'v1.0-trainval': 'val', - } - # nusc_eval = NuScenesEval( - # nusc, - # config=self.eval_detection_configs, - # result_path=result_path, - # eval_set=eval_set_map[self.version], - # output_dir=output_dir, - # verbose=False) - self.nusc_eval = NuScenesEval_custom( - self.nusc, - config=self.eval_detection_configs, - result_path=result_path, - eval_set=eval_set_map[self.version], - output_dir=output_dir, - verbose=True, - overlap_test=self.overlap_test, - data_infos=self.data_infos - ) - - self.nusc_eval.main(render_curves=True) - - # record metrics - metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) - detail = dict() - metric_prefix = f'{result_name}_NuScenes' - for name in self.CLASSES: - for k, v in metrics['label_aps'][name].items(): - val = float('{:.4f}'.format(v)) - detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val - for k, v in metrics['label_tp_errors'][name].items(): - val = float('{:.4f}'.format(v)) - detail['{}/{}_{}'.format(metric_prefix, name, k)] = val - for k, v in metrics['tp_errors'].items(): - val = float('{:.4f}'.format(v)) - detail['{}/{}'.format(metric_prefix, - self.ErrNameMapping[k])] = val - - detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] - detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] - return detail - - def format_results(self, results, jsonfile_prefix=None, **kwargs): - """Format the results to json (standard format for COCO evaluation). - Args: - results (list[tuple | numpy.ndarray]): Testing results of the - dataset. - jsonfile_prefix (str | None): The prefix of json files. 
It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - Returns: - tuple: (result_files, tmp_dir), result_files is a dict containing \ - the json filepaths, tmp_dir is the temporal directory created \ - for saving json files when jsonfile_prefix is not specified. - """ - assert isinstance(results, list), 'results must be a list' - assert len(results) == len(self), ( - 'The length of results is not equal to the dataset len: {} != {}'. - format(len(results), len(self))) - - if jsonfile_prefix is None: - tmp_dir = tempfile.TemporaryDirectory() - jsonfile_prefix = osp.join(tmp_dir.name, 'results') - else: - tmp_dir = None - - # currently the output prediction results could be in two formats - # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) - # 2. list of dict('pts_bbox' or 'img_bbox': - # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) - # this is a workaround to enable evaluation of both formats on nuScenes - # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 - if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): - result_files = self._format_bbox(results, jsonfile_prefix) - else: - # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict - result_files = dict() - for name in results[0]: - # not evaluate 2D predictions on nuScenes - if '2d' in name: - continue - print(f'\nFormating bboxes of {name}') - results_ = [out[name] for out in results] - tmp_file_ = osp.join(jsonfile_prefix, name) - result_files.update( - {name: self._format_bbox(results_, tmp_file_)}) - - return result_files, tmp_dir - - def evaluate(self, - results, - metric='bbox', - logger=None, - jsonfile_prefix=None, - result_names=['img_bbox'], - show=False, - out_dir=None, - pipeline=None): - """Evaluation in nuScenes protocol. - Args: - results (list[dict]): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. - logger (logging.Logger | str | None): Logger used for printing - related information during evaluation. Default: None. - jsonfile_prefix (str | None): The prefix of json files. It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - show (bool): Whether to visualize. - Default: False. - out_dir (str): Path to save the visualization results. - Default: None. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - Returns: - dict[str, float]: Results of each evaluation metric. - """ - - result_files, tmp_dir = self.format_results(results, jsonfile_prefix) - - if isinstance(result_files, dict): - results_dict = dict() - for name in result_names: - print('Evaluating bboxes of {}'.format(name)) - ret_dict = self._evaluate_single(result_files[name]) - results_dict.update(ret_dict) - elif isinstance(result_files, str): - results_dict = self._evaluate_single(result_files) - - if tmp_dir is not None: - tmp_dir.cleanup() - - if show: - self.show(results, out_dir, pipeline=pipeline) - return results_dict - - def _extract_data(self, index, pipeline, key, load_annos=False): - """Load data using input pipeline and extract data according to key. - Args: - index (int): Index for accessing the target data. - pipeline (:obj:`Compose`): Composed data loading pipeline. - key (str | list[str]): One single or a list of data key. - load_annos (bool): Whether to load data annotations. - If True, need to set self.test_mode as False before loading. 
- Returns: - np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]: - A single or a list of loaded data. - """ - assert pipeline is not None, 'data loading pipeline is not provided' - img_info = self.data_infos[index] - input_dict = dict(img_info=img_info) - - if load_annos: - ann_info = self.get_ann_info(index) - input_dict.update(dict(ann_info=ann_info)) - - self.pre_pipeline(input_dict) - example = pipeline(input_dict) - - # extract data items according to keys - if isinstance(key, str): - data = extract_result_dict(example, key) - else: - data = [extract_result_dict(example, k) for k in key] - - return data - - def _get_pipeline(self, pipeline): - """Get data loading pipeline in self.show/evaluate function. - Args: - pipeline (list[dict] | None): Input pipeline. If None is given, \ - get from self.pipeline. - """ - if pipeline is None: - if not hasattr(self, 'pipeline') or self.pipeline is None: - warnings.warn( - 'Use default pipeline for data loading, this may cause ' - 'errors when data is on ceph') - return self._build_default_pipeline() - loading_pipeline = get_loading_pipeline(self.pipeline.transforms) - return Compose(loading_pipeline) - return Compose(pipeline) - - def _build_default_pipeline(self): - """Build the default pipeline for this dataset.""" - pipeline = [ - dict(type='LoadImageFromFileMono3D'), - dict( - type='DefaultFormatBundle3D', - class_names=self.CLASSES, - with_label=False), - dict(type='Collect3D', keys=['img']) - ] - return Compose(pipeline) - - def show(self, results, out_dir, show=True, pipeline=None): - """Results visualization. - Args: - results (list[dict]): List of bounding boxes results. - out_dir (str): Output directory of visualization result. - show (bool): Visualize the results online. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - """ - assert out_dir is not None, 'Expect out_dir, got none.' - pipeline = self._get_pipeline(pipeline) - for i, result in enumerate(results): - if 'img_bbox' in result.keys(): - result = result['img_bbox'] - data_info = self.data_infos[i] - img_path = data_info['file_name'] - file_name = osp.split(img_path)[-1].split('.')[0] - img, img_metas = self._extract_data(i, pipeline, - ['img', 'img_metas']) - # need to transpose channel to first dim - img = img.numpy().transpose(1, 2, 0) - gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'] - pred_bboxes = result['boxes_3d'] - show_multi_modality_result( - img, - gt_bboxes, - pred_bboxes, - img_metas['cam2img'], - out_dir, - file_name, - box_mode='camera', - show=show) - - -def output_to_nusc_box(detection): - """Convert the output to the box class in the nuScenes. - Args: - detection (dict): Detection results. - - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. - - scores_3d (torch.Tensor): Detection scores. - - labels_3d (torch.Tensor): Predicted box labels. - - attrs_3d (torch.Tensor, optional): Predicted attributes. - Returns: - list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. 
- """ - box3d = detection['boxes_3d'] - scores = detection['scores_3d'].numpy() - labels = detection['labels_3d'].numpy() - attrs = None - if 'attrs_3d' in detection: - attrs = detection['attrs_3d'].numpy() - - box_gravity_center = box3d.gravity_center.numpy() - box_dims = box3d.dims.numpy() - box_yaw = box3d.yaw.numpy() - - # convert the dim/rot to nuscbox convention - box_dims[:, [0, 1, 2]] = box_dims[:, [2, 0, 1]] - box_yaw = -box_yaw - - box_list = [] - for i in range(len(box3d)): - q1 = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) - q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2) - quat = q2 * q1 - velocity = (box3d.tensor[i, 7], 0.0, box3d.tensor[i, 8]) - box = NuScenesBox( - box_gravity_center[i], - box_dims[i], - quat, - label=labels[i], - score=scores[i], - velocity=velocity) - box_list.append(box) - return box_list, attrs - - -def cam_nusc_box_to_global(info, - boxes, - attrs, - classes, - eval_configs, - eval_version='detection_cvpr_2019'): - """Convert the box from camera to global coordinate. - Args: - info (dict): Info for a specific sample data, including the - calibration information. - boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. - classes (list[str]): Mapped classes in the evaluation. - eval_configs (object): Evaluation configuration object. - eval_version (str): Evaluation version. - Default: 'detection_cvpr_2019' - Returns: - list: List of standard NuScenesBoxes in the global - coordinate. - """ - box_list = [] - attr_list = [] - for (box, attr) in zip(boxes, attrs): - # Move box to ego vehicle coord system - box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation'])) - box.translate(np.array(info['cam2ego_translation'])) - # filter det in ego. - cls_range_map = eval_configs.class_range - radius = np.linalg.norm(box.center[:2], 2) - det_range = cls_range_map[classes[box.label]] - if radius > det_range: - continue - # Move box to global coord system - box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) - box.translate(np.array(info['ego2global_translation'])) - box_list.append(box) - attr_list.append(attr) - return box_list, attr_list - - -def global_nusc_box_to_cam(info, - boxes, - classes, - eval_configs, - eval_version='detection_cvpr_2019'): - """Convert the box from global to camera coordinate. - Args: - info (dict): Info for a specific sample data, including the - calibration information. - boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. - classes (list[str]): Mapped classes in the evaluation. - eval_configs (object): Evaluation configuration object. - eval_version (str): Evaluation version. - Default: 'detection_cvpr_2019' - Returns: - list: List of standard NuScenesBoxes in the global - coordinate. - """ - box_list = [] - for box in boxes: - # Move box to ego vehicle coord system - box.translate(-np.array(info['ego2global_translation'])) - box.rotate( - pyquaternion.Quaternion(info['ego2global_rotation']).inverse) - # filter det in ego. - cls_range_map = eval_configs.class_range - radius = np.linalg.norm(box.center[:2], 2) - det_range = cls_range_map[classes[box.label]] - if radius > det_range: - continue - # Move box to camera coord system - box.translate(-np.array(info['cam2ego_translation'])) - box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']).inverse) - box_list.append(box) - return box_list - - -def nusc_box_to_cam_box3d(boxes): - """Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`. 
- Args: - boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. - Returns: - tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor): \ - Converted 3D bounding boxes, scores and labels. - """ - locs = torch.Tensor([b.center for b in boxes]).view(-1, 3) - dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3) - rots = torch.Tensor([b.orientation.yaw_pitch_roll[0] - for b in boxes]).view(-1, 1) - velocity = torch.Tensor([b.velocity[:2] for b in boxes]).view(-1, 2) - - # convert nusbox to cambox convention - dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]] - rots = -rots - - boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda() - cam_boxes3d = CameraInstance3DBoxes( - boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5)) - scores = torch.Tensor([b.score for b in boxes]).cuda() - labels = torch.LongTensor([b.label for b in boxes]).cuda() - nms_scores = scores.new_zeros(scores.shape[0], 10 + 1) - indices = labels.new_tensor(list(range(scores.shape[0]))) - nms_scores[indices, labels] = scores +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import mmcv +import numpy as np +import pyquaternion +import tempfile +import torch +import warnings +from nuscenes.utils.data_classes import Box as NuScenesBox +from os import path as osp + +from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr +from mmdet.datasets import DATASETS, CocoDataset +from mmdet3d.core import show_multi_modality_result +from mmdet3d.core.bbox import CameraInstance3DBoxes, get_box_type +from mmdet3d.datasets.pipelines import Compose +from mmdet3d.datasets.utils import extract_result_dict, get_loading_pipeline + + +@DATASETS.register_module() +class CustomNuScenesMonoDataset(CocoDataset): + r"""Monocular 3D detection on NuScenes Dataset. + This class serves as the API for experiments on the NuScenes Dataset. + Please refer to `NuScenes Dataset `_ + for data downloading. + Args: + ann_file (str): Path of annotation file. + data_root (str): Path of dataset root. + load_interval (int, optional): Interval of loading the dataset. It is + used to uniformly sample the dataset. Defaults to 1. + with_velocity (bool, optional): Whether include velocity prediction + into the experiments. Defaults to True. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'Camera' in this class. Available options includes. + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + eval_version (str, optional): Configuration version of evaluation. + Defaults to 'detection_cvpr_2019'. + use_valid_flag (bool): Whether to use `use_valid_flag` key in the info + file as mask to filter gt_boxes and gt_names. Defaults to False. + version (str, optional): Dataset version. Defaults to 'v1.0-trainval'. 
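+        overlap_test (bool, optional): Passed through to the custom
+            nuScenes evaluator (``NuScenesEval_custom``) during evaluation.
+            Defaults to False.
+
+    Example (illustrative config; paths and pipeline are placeholders)::
+
+        train_dataset = dict(
+            type='CustomNuScenesMonoDataset',
+            data_root='data/nuscenes/',
+            ann_file='data/nuscenes/nuscenes_infos_train_mono3d.coco.json',
+            pipeline=[...],
+            box_type_3d='Camera',
+            version='v1.0-trainval')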
+ """ + CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + DefaultAttribute = { + 'car': 'vehicle.parked', + 'pedestrian': 'pedestrian.moving', + 'trailer': 'vehicle.parked', + 'truck': 'vehicle.parked', + 'bus': 'vehicle.moving', + 'motorcycle': 'cycle.without_rider', + 'construction_vehicle': 'vehicle.parked', + 'bicycle': 'cycle.without_rider', + 'barrier': '', + 'traffic_cone': '', + } + # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa + ErrNameMapping = { + 'trans_err': 'mATE', + 'scale_err': 'mASE', + 'orient_err': 'mAOE', + 'vel_err': 'mAVE', + 'attr_err': 'mAAE' + } + + def __init__(self, + data_root, + load_interval=1, + with_velocity=True, + modality=None, + box_type_3d='Camera', + eval_version='detection_cvpr_2019', + use_valid_flag=False, + overlap_test=False, + version='v1.0-trainval', + **kwargs): + super().__init__(**kwargs) + # overlap_test = True + self.data_root = data_root + self.overlap_test = overlap_test + self.load_interval = load_interval + self.with_velocity = with_velocity + self.modality = modality + self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) + self.eval_version = eval_version + self.use_valid_flag = use_valid_flag + self.bbox_code_size = 9 + self.version = version + if self.eval_version is not None: + from nuscenes.eval.detection.config import config_factory + self.eval_detection_configs = config_factory(self.eval_version) + if self.modality is None: + self.modality = dict( + use_camera=True, + use_lidar=False, + use_radar=False, + use_map=False, + use_external=False) + + def pre_pipeline(self, results): + """Initialization before data preparation. + Args: + results (dict): Dict before data preprocessing. + - img_fields (list): Image fields. + - bbox3d_fields (list): 3D bounding boxes fields. + - pts_mask_fields (list): Mask fields of points. + - pts_seg_fields (list): Mask fields of point segments. + - bbox_fields (list): Fields of bounding boxes. + - mask_fields (list): Fields of masks. + - seg_fields (list): Segment fields. + - box_type_3d (str): 3D box type. + - box_mode_3d (str): 3D box mode. + """ + results['img_prefix'] = '' # self.img_prefix + # print('img_prefix', self.img_prefix) + results['seg_prefix'] = self.seg_prefix + results['proposal_file'] = self.proposal_file + results['img_fields'] = [] + results['bbox3d_fields'] = [] + results['pts_mask_fields'] = [] + results['pts_seg_fields'] = [] + results['bbox_fields'] = [] + results['mask_fields'] = [] + results['seg_fields'] = [] + results['box_type_3d'] = self.box_type_3d + results['box_mode_3d'] = self.box_mode_3d + + def _parse_ann_info(self, img_info, ann_info): + """Parse bbox annotation. + Args: + img_info (list[dict]): Image info. + ann_info (list[dict]): Annotation info of an image. 
+ Returns: + dict: A dict containing the following keys: bboxes, labels, \ + gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, \ + depths, bboxes_ignore, masks, seg_map + """ + gt_bboxes = [] + gt_labels = [] + attr_labels = [] + gt_bboxes_ignore = [] + gt_masks_ann = [] + gt_bboxes_cam3d = [] + centers2d = [] + depths = [] + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + if ann.get('iscrowd', False): + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_labels.append(self.cat2label[ann['category_id']]) + attr_labels.append(ann['attribute_id']) + gt_masks_ann.append(ann.get('segmentation', None)) + # 3D annotations in camera coordinates + bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(1, -1) + velo_cam3d = np.array(ann['velo_cam3d']).reshape(1, 2) + nan_mask = np.isnan(velo_cam3d[:, 0]) + velo_cam3d[nan_mask] = [0.0, 0.0] + bbox_cam3d = np.concatenate([bbox_cam3d, velo_cam3d], axis=-1) + gt_bboxes_cam3d.append(bbox_cam3d.squeeze()) + # 2.5D annotations in camera coordinates + center2d = ann['center2d'][:2] + depth = ann['center2d'][2] + centers2d.append(center2d) + depths.append(depth) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + attr_labels = np.array(attr_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + attr_labels = np.array([], dtype=np.int64) + + if gt_bboxes_cam3d: + gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32) + centers2d = np.array(centers2d, dtype=np.float32) + depths = np.array(depths, dtype=np.float32) + else: + gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size), + dtype=np.float32) + centers2d = np.zeros((0, 2), dtype=np.float32) + depths = np.zeros((0), dtype=np.float32) + + gt_bboxes_cam3d = CameraInstance3DBoxes( + gt_bboxes_cam3d, + box_dim=gt_bboxes_cam3d.shape[-1], + origin=(0.5, 0.5, 0.5)) + gt_labels_3d = copy.deepcopy(gt_labels) + + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + seg_map = img_info['filename'].replace('jpg', 'png') + + ann = dict( + bboxes=gt_bboxes, + labels=gt_labels, + gt_bboxes_3d=gt_bboxes_cam3d, + gt_labels_3d=gt_labels_3d, + attr_labels=attr_labels, + centers2d=centers2d, + depths=depths, + bboxes_ignore=gt_bboxes_ignore, + masks=gt_masks_ann, + seg_map=seg_map) + + return ann + + def get_attr_name(self, attr_idx, label_name): + """Get attribute from predicted index. + This is a workaround to predict attribute when the predicted velocity + is not reliable. We map the predicted attribute index to the one + in the attribute set. If it is consistent with the category, we will + keep it. Otherwise, we will use the default attribute. + Args: + attr_idx (int): Attribute index. + label_name (str): Predicted category name. + Returns: + str: Predicted attribute name. 
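+            For example, an ``attr_idx`` that maps to ``'vehicle.moving'``
+            with ``label_name='car'`` is kept as ``'vehicle.moving'``, while
+            an index that maps to ``'cycle.with_rider'`` with
+            ``label_name='car'`` falls back to the default
+            ``'vehicle.parked'``.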
+ """ + # TODO: Simplify the variable name + AttrMapping_rev2 = [ + 'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving', + 'pedestrian.standing', 'pedestrian.sitting_lying_down', + 'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None' + ] + if label_name == 'car' or label_name == 'bus' \ + or label_name == 'truck' or label_name == 'trailer' \ + or label_name == 'construction_vehicle': + if AttrMapping_rev2[attr_idx] == 'vehicle.moving' or \ + AttrMapping_rev2[attr_idx] == 'vehicle.parked' or \ + AttrMapping_rev2[attr_idx] == 'vehicle.stopped': + return AttrMapping_rev2[attr_idx] + else: + return CustomNuScenesMonoDataset.DefaultAttribute[label_name] + elif label_name == 'pedestrian': + if AttrMapping_rev2[attr_idx] == 'pedestrian.moving' or \ + AttrMapping_rev2[attr_idx] == 'pedestrian.standing' or \ + AttrMapping_rev2[attr_idx] == \ + 'pedestrian.sitting_lying_down': + return AttrMapping_rev2[attr_idx] + else: + return CustomNuScenesMonoDataset.DefaultAttribute[label_name] + elif label_name == 'bicycle' or label_name == 'motorcycle': + if AttrMapping_rev2[attr_idx] == 'cycle.with_rider' or \ + AttrMapping_rev2[attr_idx] == 'cycle.without_rider': + return AttrMapping_rev2[attr_idx] + else: + return CustomNuScenesMonoDataset.DefaultAttribute[label_name] + else: + return CustomNuScenesMonoDataset.DefaultAttribute[label_name] + + def _format_bbox(self, results, jsonfile_prefix=None): + """Convert the results to the standard format. + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + Returns: + str: Path of the output json file. + """ + nusc_annos = {} + mapped_class_names = self.CLASSES + + print('Start to convert detection format...') + + CAM_NUM = 6 + + for sample_id, det in enumerate(mmcv.track_iter_progress(results)): + + if sample_id % CAM_NUM == 0: + boxes_per_frame = [] + attrs_per_frame = [] + + # need to merge results from images of the same sample + annos = [] + boxes, attrs = output_to_nusc_box(det) + sample_token = self.data_infos[sample_id]['token'] + boxes, attrs = cam_nusc_box_to_global(self.data_infos[sample_id], + boxes, attrs, + mapped_class_names, + self.eval_detection_configs, + self.eval_version) + + boxes_per_frame.extend(boxes) + attrs_per_frame.extend(attrs) + # Remove redundant predictions caused by overlap of images + if (sample_id + 1) % CAM_NUM != 0: + continue + boxes = global_nusc_box_to_cam( + self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame, + mapped_class_names, self.eval_detection_configs, + self.eval_version) + cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes) + # box nms 3d over 6 images in a frame + # TODO: move this global setting into config + nms_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.05, + score_thr=0.01, + min_bbox_size=0, + max_per_frame=500) + from mmcv import Config + nms_cfg = Config(nms_cfg) + cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev) + boxes3d = cam_boxes3d.tensor + # generate attr scores from attr labels + attrs = labels.new_tensor([attr for attr in attrs_per_frame]) + boxes3d, scores, labels, attrs = box3d_multiclass_nms( + boxes3d, + cam_boxes3d_for_nms, + scores, + nms_cfg.score_thr, + nms_cfg.max_per_frame, + nms_cfg, + mlvl_attr_scores=attrs) + cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9) + det = bbox3d2result(cam_boxes3d, scores, labels, attrs) + boxes, attrs = 
output_to_nusc_box(det) + boxes, attrs = cam_nusc_box_to_global( + self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs, + mapped_class_names, self.eval_detection_configs, + self.eval_version) + + for i, box in enumerate(boxes): + name = mapped_class_names[box.label] + attr = self.get_attr_name(attrs[i], name) + nusc_anno = dict( + sample_token=sample_token, + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + velocity=box.velocity[:2].tolist(), + detection_name=name, + detection_score=box.score, + attribute_name=attr) + annos.append(nusc_anno) + # other views results of the same frame should be concatenated + if sample_token in nusc_annos: + nusc_annos[sample_token].extend(annos) + else: + nusc_annos[sample_token] = annos + + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + } + + mmcv.mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + mmcv.dump(nusc_submissions, res_path) + return res_path + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + result_name='img_bbox'): + """Evaluation for a single model in nuScenes protocol. + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + metric (str): Metric name used for evaluation. Default: 'bbox'. + result_name (str): Result name in the metric prefix. + Default: 'img_bbox'. + Returns: + dict: Dictionary of evaluation details. + """ + from nuscenes import NuScenes + #from nuscenes.eval.detection.evaluate import NuScenesEval + from .nuscnes_eval import NuScenesEval_custom + output_dir = osp.join(*osp.split(result_path)[:-1]) + self.nusc = NuScenes( + version=self.version, dataroot=self.data_root, verbose=False) + eval_set_map = { + 'v1.0-mini': 'mini_val', + 'v1.0-trainval': 'val', + } + # nusc_eval = NuScenesEval( + # nusc, + # config=self.eval_detection_configs, + # result_path=result_path, + # eval_set=eval_set_map[self.version], + # output_dir=output_dir, + # verbose=False) + self.nusc_eval = NuScenesEval_custom( + self.nusc, + config=self.eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=True, + overlap_test=self.overlap_test, + data_infos=self.data_infos + ) + + self.nusc_eval.main(render_curves=True) + + # record metrics + metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) + detail = dict() + metric_prefix = f'{result_name}_NuScenes' + for name in self.CLASSES: + for k, v in metrics['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix, + self.ErrNameMapping[k])] = val + + detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] + return detail + + def format_results(self, results, jsonfile_prefix=None, **kwargs): + """Format the results to json (standard format for COCO evaluation). + Args: + results (list[tuple | numpy.ndarray]): Testing results of the + dataset. + jsonfile_prefix (str | None): The prefix of json files. 
It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + Returns: + tuple: (result_files, tmp_dir), result_files is a dict containing \ + the json filepaths, tmp_dir is the temporal directory created \ + for saving json files when jsonfile_prefix is not specified. + """ + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + # currently the output prediction results could be in two formats + # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) + # 2. list of dict('pts_bbox' or 'img_bbox': + # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) + # this is a workaround to enable evaluation of both formats on nuScenes + # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 + if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict + result_files = dict() + for name in results[0]: + # not evaluate 2D predictions on nuScenes + if '2d' in name: + continue + print(f'\nFormating bboxes of {name}') + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + + return result_files, tmp_dir + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + result_names=['img_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + show (bool): Whether to visualize. + Default: False. + out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + Returns: + dict[str, float]: Results of each evaluation metric. + """ + + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print('Evaluating bboxes of {}'.format(name)) + ret_dict = self._evaluate_single(result_files[name]) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files) + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show: + self.show(results, out_dir, pipeline=pipeline) + return results_dict + + def _extract_data(self, index, pipeline, key, load_annos=False): + """Load data using input pipeline and extract data according to key. + Args: + index (int): Index for accessing the target data. + pipeline (:obj:`Compose`): Composed data loading pipeline. + key (str | list[str]): One single or a list of data key. + load_annos (bool): Whether to load data annotations. + If True, need to set self.test_mode as False before loading. 
+ Returns: + np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]: + A single or a list of loaded data. + """ + assert pipeline is not None, 'data loading pipeline is not provided' + img_info = self.data_infos[index] + input_dict = dict(img_info=img_info) + + if load_annos: + ann_info = self.get_ann_info(index) + input_dict.update(dict(ann_info=ann_info)) + + self.pre_pipeline(input_dict) + example = pipeline(input_dict) + + # extract data items according to keys + if isinstance(key, str): + data = extract_result_dict(example, key) + else: + data = [extract_result_dict(example, k) for k in key] + + return data + + def _get_pipeline(self, pipeline): + """Get data loading pipeline in self.show/evaluate function. + Args: + pipeline (list[dict] | None): Input pipeline. If None is given, \ + get from self.pipeline. + """ + if pipeline is None: + if not hasattr(self, 'pipeline') or self.pipeline is None: + warnings.warn( + 'Use default pipeline for data loading, this may cause ' + 'errors when data is on ceph') + return self._build_default_pipeline() + loading_pipeline = get_loading_pipeline(self.pipeline.transforms) + return Compose(loading_pipeline) + return Compose(pipeline) + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='DefaultFormatBundle3D', + class_names=self.CLASSES, + with_label=False), + dict(type='Collect3D', keys=['img']) + ] + return Compose(pipeline) + + def show(self, results, out_dir, show=True, pipeline=None): + """Results visualization. + Args: + results (list[dict]): List of bounding boxes results. + out_dir (str): Output directory of visualization result. + show (bool): Visualize the results online. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + """ + assert out_dir is not None, 'Expect out_dir, got none.' + pipeline = self._get_pipeline(pipeline) + for i, result in enumerate(results): + if 'img_bbox' in result.keys(): + result = result['img_bbox'] + data_info = self.data_infos[i] + img_path = data_info['file_name'] + file_name = osp.split(img_path)[-1].split('.')[0] + img, img_metas = self._extract_data(i, pipeline, + ['img', 'img_metas']) + # need to transpose channel to first dim + img = img.numpy().transpose(1, 2, 0) + gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'] + pred_bboxes = result['boxes_3d'] + show_multi_modality_result( + img, + gt_bboxes, + pred_bboxes, + img_metas['cam2img'], + out_dir, + file_name, + box_mode='camera', + show=show) + + +def output_to_nusc_box(detection): + """Convert the output to the box class in the nuScenes. + Args: + detection (dict): Detection results. + - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. + - scores_3d (torch.Tensor): Detection scores. + - labels_3d (torch.Tensor): Predicted box labels. + - attrs_3d (torch.Tensor, optional): Predicted attributes. + Returns: + list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. 
+ """ + box3d = detection['boxes_3d'] + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + attrs = None + if 'attrs_3d' in detection: + attrs = detection['attrs_3d'].numpy() + + box_gravity_center = box3d.gravity_center.numpy() + box_dims = box3d.dims.numpy() + box_yaw = box3d.yaw.numpy() + + # convert the dim/rot to nuscbox convention + box_dims[:, [0, 1, 2]] = box_dims[:, [2, 0, 1]] + box_yaw = -box_yaw + + box_list = [] + for i in range(len(box3d)): + q1 = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) + q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2) + quat = q2 * q1 + velocity = (box3d.tensor[i, 7], 0.0, box3d.tensor[i, 8]) + box = NuScenesBox( + box_gravity_center[i], + box_dims[i], + quat, + label=labels[i], + score=scores[i], + velocity=velocity) + box_list.append(box) + return box_list, attrs + + +def cam_nusc_box_to_global(info, + boxes, + attrs, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + """Convert the box from camera to global coordinate. + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + classes (list[str]): Mapped classes in the evaluation. + eval_configs (object): Evaluation configuration object. + eval_version (str): Evaluation version. + Default: 'detection_cvpr_2019' + Returns: + list: List of standard NuScenesBoxes in the global + coordinate. + """ + box_list = [] + attr_list = [] + for (box, attr) in zip(boxes, attrs): + # Move box to ego vehicle coord system + box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation'])) + box.translate(np.array(info['cam2ego_translation'])) + # filter det in ego. + cls_range_map = eval_configs.class_range + radius = np.linalg.norm(box.center[:2], 2) + det_range = cls_range_map[classes[box.label]] + if radius > det_range: + continue + # Move box to global coord system + box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + attr_list.append(attr) + return box_list, attr_list + + +def global_nusc_box_to_cam(info, + boxes, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + """Convert the box from global to camera coordinate. + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + classes (list[str]): Mapped classes in the evaluation. + eval_configs (object): Evaluation configuration object. + eval_version (str): Evaluation version. + Default: 'detection_cvpr_2019' + Returns: + list: List of standard NuScenesBoxes in the global + coordinate. + """ + box_list = [] + for box in boxes: + # Move box to ego vehicle coord system + box.translate(-np.array(info['ego2global_translation'])) + box.rotate( + pyquaternion.Quaternion(info['ego2global_rotation']).inverse) + # filter det in ego. + cls_range_map = eval_configs.class_range + radius = np.linalg.norm(box.center[:2], 2) + det_range = cls_range_map[classes[box.label]] + if radius > det_range: + continue + # Move box to camera coord system + box.translate(-np.array(info['cam2ego_translation'])) + box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']).inverse) + box_list.append(box) + return box_list + + +def nusc_box_to_cam_box3d(boxes): + """Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`. 
+ Args: + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + Returns: + tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor): \ + Converted 3D bounding boxes, scores and labels. + """ + locs = torch.Tensor([b.center for b in boxes]).view(-1, 3) + dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3) + rots = torch.Tensor([b.orientation.yaw_pitch_roll[0] + for b in boxes]).view(-1, 1) + velocity = torch.Tensor([b.velocity[:2] for b in boxes]).view(-1, 2) + + # convert nusbox to cambox convention + dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]] + rots = -rots + + boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda() + cam_boxes3d = CameraInstance3DBoxes( + boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5)) + scores = torch.Tensor([b.score for b in boxes]).cuda() + labels = torch.LongTensor([b.label for b in boxes]).cuda() + nms_scores = scores.new_zeros(scores.shape[0], 10 + 1) + indices = labels.new_tensor(list(range(scores.shape[0]))) + nms_scores[indices, labels] = scores return cam_boxes3d, nms_scores, labels \ No newline at end of file diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/datasets/pipelines/formating.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/datasets/pipelines/formating.py index 52878523f..d52a15c0a 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/datasets/pipelines/formating.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/datasets/pipelines/formating.py @@ -1,39 +1,39 @@ - -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -from mmcv.parallel import DataContainer as DC - -from mmdet3d.core.bbox import BaseInstance3DBoxes -from mmdet3d.core.points import BasePoints -from mmdet.datasets.builder import PIPELINES -from mmdet.datasets.pipelines import to_tensor -from mmdet3d.datasets.pipelines import DefaultFormatBundle3D - -@PIPELINES.register_module() -class CustomDefaultFormatBundle3D(DefaultFormatBundle3D): - """Default formatting bundle. - It simplifies the pipeline of formatting common fields for voxels, - including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and - "gt_semantic_seg". - These fields are formatted as follows. - - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) - - proposals: (1)to tensor, (2)to DataContainer - - gt_bboxes: (1)to tensor, (2)to DataContainer - - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer - - gt_labels: (1)to tensor, (2)to DataContainer - """ - - def __call__(self, results): - """Call function to transform and format common fields in results. - Args: - results (dict): Result dict contains the data to convert. - Returns: - dict: The result dict contains the data that is formatted with - default bundle. - """ - # Format 3D data - results = super(CustomDefaultFormatBundle3D, self).__call__(results) - results['gt_map_masks'] = DC( - to_tensor(results['gt_map_masks']), stack=True) - + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from mmcv.parallel import DataContainer as DC + +from mmdet3d.core.bbox import BaseInstance3DBoxes +from mmdet3d.core.points import BasePoints +from mmdet.datasets.builder import PIPELINES +from mmdet.datasets.pipelines import to_tensor +from mmdet3d.datasets.pipelines import DefaultFormatBundle3D + +@PIPELINES.register_module() +class CustomDefaultFormatBundle3D(DefaultFormatBundle3D): + """Default formatting bundle. 
+ It simplifies the pipeline of formatting common fields for voxels, + including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and + "gt_semantic_seg". + These fields are formatted as follows. + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + """ + + def __call__(self, results): + """Call function to transform and format common fields in results. + Args: + results (dict): Result dict contains the data to convert. + Returns: + dict: The result dict contains the data that is formatted with + default bundle. + """ + # Format 3D data + results = super(CustomDefaultFormatBundle3D, self).__call__(results) + results['gt_map_masks'] = DC( + to_tensor(results['gt_map_masks']), stack=True) + return results \ No newline at end of file diff --git a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/datasets/samplers/sampler.py b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/datasets/samplers/sampler.py index 1906049c4..30276cff6 100755 --- a/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/datasets/samplers/sampler.py +++ b/cv/3d_detection/bevformer/pytorch/projects/mmdet3d_plugin/datasets/samplers/sampler.py @@ -1,7 +1,7 @@ -from mmcv.utils.registry import Registry, build_from_cfg - -SAMPLER = Registry('sampler') - - -def build_sampler(cfg, default_args): - return build_from_cfg(cfg, SAMPLER, default_args) +from mmcv.utils.registry import Registry, build_from_cfg + +SAMPLER = Registry('sampler') + + +def build_sampler(cfg, default_args): + return build_from_cfg(cfg, SAMPLER, default_args) diff --git a/cv/3d_detection/bevformer/pytorch/tools/analysis_tools/get_params.py b/cv/3d_detection/bevformer/pytorch/tools/analysis_tools/get_params.py index fb697ad80..2818900b2 100755 --- a/cv/3d_detection/bevformer/pytorch/tools/analysis_tools/get_params.py +++ b/cv/3d_detection/bevformer/pytorch/tools/analysis_tools/get_params.py @@ -1,10 +1,10 @@ -import torch -file_path = './ckpts/bevformer_v4.pth' -model = torch.load(file_path, map_location='cpu') -all = 0 -for key in list(model['state_dict'].keys()): - all += model['state_dict'][key].nelement() -print(all) - -# smaller 63374123 -# v4 69140395 +import torch +file_path = './ckpts/bevformer_v4.pth' +model = torch.load(file_path, map_location='cpu') +all = 0 +for key in list(model['state_dict'].keys()): + all += model['state_dict'][key].nelement() +print(all) + +# smaller 63374123 +# v4 69140395 diff --git a/cv/3d_detection/bevformer/pytorch/tools/analysis_tools/visual.py b/cv/3d_detection/bevformer/pytorch/tools/analysis_tools/visual.py index f711b7530..648129897 100755 --- a/cv/3d_detection/bevformer/pytorch/tools/analysis_tools/visual.py +++ b/cv/3d_detection/bevformer/pytorch/tools/analysis_tools/visual.py @@ -1,477 +1,477 @@ -# Based on https://github.com/nutonomy/nuscenes-devkit -# --------------------------------------------- -# Modified by Zhiqi Li -# --------------------------------------------- - -import mmcv -from nuscenes.nuscenes import NuScenes -from PIL import Image -from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix -from typing import Tuple, List, Iterable -import matplotlib.pyplot as plt -import numpy as np -from PIL import Image -from matplotlib import rcParams -from matplotlib.axes import Axes -from pyquaternion import Quaternion -from PIL 
import Image -from matplotlib import rcParams -from matplotlib.axes import Axes -from pyquaternion import Quaternion -from tqdm import tqdm -from nuscenes.utils.data_classes import LidarPointCloud, RadarPointCloud, Box -from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix -from nuscenes.eval.common.data_classes import EvalBoxes, EvalBox -from nuscenes.eval.detection.data_classes import DetectionBox -from nuscenes.eval.detection.utils import category_to_detection_name -from nuscenes.eval.detection.render import visualize_sample - - - - -cams = ['CAM_FRONT', - 'CAM_FRONT_RIGHT', - 'CAM_BACK_RIGHT', - 'CAM_BACK', - 'CAM_BACK_LEFT', - 'CAM_FRONT_LEFT'] - -import numpy as np -import matplotlib.pyplot as plt -from nuscenes.utils.data_classes import LidarPointCloud, RadarPointCloud, Box -from PIL import Image -from matplotlib import rcParams - - -def render_annotation( - anntoken: str, - margin: float = 10, - view: np.ndarray = np.eye(4), - box_vis_level: BoxVisibility = BoxVisibility.ANY, - out_path: str = 'render.png', - extra_info: bool = False) -> None: - """ - Render selected annotation. - :param anntoken: Sample_annotation token. - :param margin: How many meters in each direction to include in LIDAR view. - :param view: LIDAR view point. - :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. - :param out_path: Optional path to save the rendered figure to disk. - :param extra_info: Whether to render extra information below camera view. - """ - ann_record = nusc.get('sample_annotation', anntoken) - sample_record = nusc.get('sample', ann_record['sample_token']) - assert 'LIDAR_TOP' in sample_record['data'].keys(), 'Error: No LIDAR_TOP in data, unable to render.' - - # Figure out which camera the object is fully visible in (this may return nothing). - boxes, cam = [], [] - cams = [key for key in sample_record['data'].keys() if 'CAM' in key] - all_bboxes = [] - select_cams = [] - for cam in cams: - _, boxes, _ = nusc.get_sample_data(sample_record['data'][cam], box_vis_level=box_vis_level, - selected_anntokens=[anntoken]) - if len(boxes) > 0: - all_bboxes.append(boxes) - select_cams.append(cam) - # We found an image that matches. Let's abort. - # assert len(boxes) > 0, 'Error: Could not find image where annotation is visible. ' \ - # 'Try using e.g. BoxVisibility.ANY.' - # assert len(boxes) < 2, 'Error: Found multiple annotations. Something is wrong!' - - num_cam = len(all_bboxes) - - fig, axes = plt.subplots(1, num_cam + 1, figsize=(18, 9)) - select_cams = [sample_record['data'][cam] for cam in select_cams] - print('bbox in cams:', select_cams) - # Plot LIDAR view. - lidar = sample_record['data']['LIDAR_TOP'] - data_path, boxes, camera_intrinsic = nusc.get_sample_data(lidar, selected_anntokens=[anntoken]) - LidarPointCloud.from_file(data_path).render_height(axes[0], view=view) - for box in boxes: - c = np.array(get_color(box.name)) / 255.0 - box.render(axes[0], view=view, colors=(c, c, c)) - corners = view_points(boxes[0].corners(), view, False)[:2, :] - axes[0].set_xlim([np.min(corners[0, :]) - margin, np.max(corners[0, :]) + margin]) - axes[0].set_ylim([np.min(corners[1, :]) - margin, np.max(corners[1, :]) + margin]) - axes[0].axis('off') - axes[0].set_aspect('equal') - - # Plot CAMERA view. 
- for i in range(1, num_cam + 1): - cam = select_cams[i - 1] - data_path, boxes, camera_intrinsic = nusc.get_sample_data(cam, selected_anntokens=[anntoken]) - im = Image.open(data_path) - axes[i].imshow(im) - axes[i].set_title(nusc.get('sample_data', cam)['channel']) - axes[i].axis('off') - axes[i].set_aspect('equal') - for box in boxes: - c = np.array(get_color(box.name)) / 255.0 - box.render(axes[i], view=camera_intrinsic, normalize=True, colors=(c, c, c)) - - # Print extra information about the annotation below the camera view. - axes[i].set_xlim(0, im.size[0]) - axes[i].set_ylim(im.size[1], 0) - - if extra_info: - rcParams['font.family'] = 'monospace' - - w, l, h = ann_record['size'] - category = ann_record['category_name'] - lidar_points = ann_record['num_lidar_pts'] - radar_points = ann_record['num_radar_pts'] - - sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP']) - pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token']) - dist = np.linalg.norm(np.array(pose_record['translation']) - np.array(ann_record['translation'])) - - information = ' \n'.join(['category: {}'.format(category), - '', - '# lidar points: {0:>4}'.format(lidar_points), - '# radar points: {0:>4}'.format(radar_points), - '', - 'distance: {:>7.3f}m'.format(dist), - '', - 'width: {:>7.3f}m'.format(w), - 'length: {:>7.3f}m'.format(l), - 'height: {:>7.3f}m'.format(h)]) - - plt.annotate(information, (0, 0), (0, -20), xycoords='axes fraction', textcoords='offset points', va='top') - - if out_path is not None: - plt.savefig(out_path) - - - -def get_sample_data(sample_data_token: str, - box_vis_level: BoxVisibility = BoxVisibility.ANY, - selected_anntokens=None, - use_flat_vehicle_coordinates: bool = False): - """ - Returns the data path as well as all annotations related to that sample_data. - Note that the boxes are transformed into the current sensor's coordinate frame. - :param sample_data_token: Sample_data token. - :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. - :param selected_anntokens: If provided only return the selected annotation. - :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is - aligned to z-plane in the world. - :return: (data_path, boxes, camera_intrinsic ) - """ - - # Retrieve sensor & pose records - sd_record = nusc.get('sample_data', sample_data_token) - cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) - sensor_record = nusc.get('sensor', cs_record['sensor_token']) - pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) - - data_path = nusc.get_sample_data_path(sample_data_token) - - if sensor_record['modality'] == 'camera': - cam_intrinsic = np.array(cs_record['camera_intrinsic']) - imsize = (sd_record['width'], sd_record['height']) - else: - cam_intrinsic = None - imsize = None - - # Retrieve all sample annotations and map to sensor coordinate system. - if selected_anntokens is not None: - boxes = list(map(nusc.get_box, selected_anntokens)) - else: - boxes = nusc.get_boxes(sample_data_token) - - # Make list of Box objects including coord system transforms. - box_list = [] - for box in boxes: - if use_flat_vehicle_coordinates: - # Move box to ego vehicle coord system parallel to world z plane. 
- yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] - box.translate(-np.array(pose_record['translation'])) - box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) - else: - # Move box to ego vehicle coord system. - box.translate(-np.array(pose_record['translation'])) - box.rotate(Quaternion(pose_record['rotation']).inverse) - - # Move box to sensor coord system. - box.translate(-np.array(cs_record['translation'])) - box.rotate(Quaternion(cs_record['rotation']).inverse) - - if sensor_record['modality'] == 'camera' and not \ - box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): - continue - - box_list.append(box) - - return data_path, box_list, cam_intrinsic - - - -def get_predicted_data(sample_data_token: str, - box_vis_level: BoxVisibility = BoxVisibility.ANY, - selected_anntokens=None, - use_flat_vehicle_coordinates: bool = False, - pred_anns=None - ): - """ - Returns the data path as well as all annotations related to that sample_data. - Note that the boxes are transformed into the current sensor's coordinate frame. - :param sample_data_token: Sample_data token. - :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. - :param selected_anntokens: If provided only return the selected annotation. - :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is - aligned to z-plane in the world. - :return: (data_path, boxes, camera_intrinsic ) - """ - - # Retrieve sensor & pose records - sd_record = nusc.get('sample_data', sample_data_token) - cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) - sensor_record = nusc.get('sensor', cs_record['sensor_token']) - pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) - - data_path = nusc.get_sample_data_path(sample_data_token) - - if sensor_record['modality'] == 'camera': - cam_intrinsic = np.array(cs_record['camera_intrinsic']) - imsize = (sd_record['width'], sd_record['height']) - else: - cam_intrinsic = None - imsize = None - - # Retrieve all sample annotations and map to sensor coordinate system. - # if selected_anntokens is not None: - # boxes = list(map(nusc.get_box, selected_anntokens)) - # else: - # boxes = nusc.get_boxes(sample_data_token) - boxes = pred_anns - # Make list of Box objects including coord system transforms. - box_list = [] - for box in boxes: - if use_flat_vehicle_coordinates: - # Move box to ego vehicle coord system parallel to world z plane. - yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] - box.translate(-np.array(pose_record['translation'])) - box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) - else: - # Move box to ego vehicle coord system. - box.translate(-np.array(pose_record['translation'])) - box.rotate(Quaternion(pose_record['rotation']).inverse) - - # Move box to sensor coord system. 
- box.translate(-np.array(cs_record['translation'])) - box.rotate(Quaternion(cs_record['rotation']).inverse) - - if sensor_record['modality'] == 'camera' and not \ - box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): - continue - box_list.append(box) - - return data_path, box_list, cam_intrinsic - - - - -def lidiar_render(sample_token, data,out_path=None): - bbox_gt_list = [] - bbox_pred_list = [] - anns = nusc.get('sample', sample_token)['anns'] - for ann in anns: - content = nusc.get('sample_annotation', ann) - try: - bbox_gt_list.append(DetectionBox( - sample_token=content['sample_token'], - translation=tuple(content['translation']), - size=tuple(content['size']), - rotation=tuple(content['rotation']), - velocity=nusc.box_velocity(content['token'])[:2], - ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content - else tuple(content['ego_translation']), - num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), - detection_name=category_to_detection_name(content['category_name']), - detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), - attribute_name='')) - except: - pass - - bbox_anns = data['results'][sample_token] - for content in bbox_anns: - bbox_pred_list.append(DetectionBox( - sample_token=content['sample_token'], - translation=tuple(content['translation']), - size=tuple(content['size']), - rotation=tuple(content['rotation']), - velocity=tuple(content['velocity']), - ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content - else tuple(content['ego_translation']), - num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), - detection_name=content['detection_name'], - detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), - attribute_name=content['attribute_name'])) - gt_annotations = EvalBoxes() - pred_annotations = EvalBoxes() - gt_annotations.add_boxes(sample_token, bbox_gt_list) - pred_annotations.add_boxes(sample_token, bbox_pred_list) - print('green is ground truth') - print('blue is the predited result') - visualize_sample(nusc, sample_token, gt_annotations, pred_annotations, savepath=out_path+'_bev') - - -def get_color(category_name: str): - """ - Provides the default colors based on the category names. - This method works for the general nuScenes categories, as well as the nuScenes detection categories. 
- """ - a = ['noise', 'animal', 'human.pedestrian.adult', 'human.pedestrian.child', 'human.pedestrian.construction_worker', - 'human.pedestrian.personal_mobility', 'human.pedestrian.police_officer', 'human.pedestrian.stroller', - 'human.pedestrian.wheelchair', 'movable_object.barrier', 'movable_object.debris', - 'movable_object.pushable_pullable', 'movable_object.trafficcone', 'static_object.bicycle_rack', 'vehicle.bicycle', - 'vehicle.bus.bendy', 'vehicle.bus.rigid', 'vehicle.car', 'vehicle.construction', 'vehicle.emergency.ambulance', - 'vehicle.emergency.police', 'vehicle.motorcycle', 'vehicle.trailer', 'vehicle.truck', 'flat.driveable_surface', - 'flat.other', 'flat.sidewalk', 'flat.terrain', 'static.manmade', 'static.other', 'static.vegetation', - 'vehicle.ego'] - class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' - ] - #print(category_name) - if category_name == 'bicycle': - return nusc.colormap['vehicle.bicycle'] - elif category_name == 'construction_vehicle': - return nusc.colormap['vehicle.construction'] - elif category_name == 'traffic_cone': - return nusc.colormap['movable_object.trafficcone'] - - for key in nusc.colormap.keys(): - if category_name in key: - return nusc.colormap[key] - return [0, 0, 0] - - -def render_sample_data( - sample_toekn: str, - with_anns: bool = True, - box_vis_level: BoxVisibility = BoxVisibility.ANY, - axes_limit: float = 40, - ax=None, - nsweeps: int = 1, - out_path: str = None, - underlay_map: bool = True, - use_flat_vehicle_coordinates: bool = True, - show_lidarseg: bool = False, - show_lidarseg_legend: bool = False, - filter_lidarseg_labels=None, - lidarseg_preds_bin_path: str = None, - verbose: bool = True, - show_panoptic: bool = False, - pred_data=None, - ) -> None: - """ - Render sample data onto axis. - :param sample_data_token: Sample_data token. - :param with_anns: Whether to draw box annotations. - :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. - :param axes_limit: Axes limit for lidar and radar (measured in meters). - :param ax: Axes onto which to render. - :param nsweeps: Number of sweeps for lidar and radar. - :param out_path: Optional path to save the rendered figure to disk. - :param underlay_map: When set to true, lidar data is plotted onto the map. This can be slow. - :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is - aligned to z-plane in the world. Note: Previously this method did not use flat vehicle coordinates, which - can lead to small errors when the vertical axis of the global frame and lidar are not aligned. The new - setting is more correct and rotates the plot by ~90 degrees. - :param show_lidarseg: When set to True, the lidar data is colored with the segmentation labels. When set - to False, the colors of the lidar data represent the distance from the center of the ego vehicle. - :param show_lidarseg_legend: Whether to display the legend for the lidarseg labels in the frame. - :param filter_lidarseg_labels: Only show lidar points which belong to the given list of classes. If None - or the list is empty, all classes will be displayed. - :param lidarseg_preds_bin_path: A path to the .bin file which contains the user's lidar segmentation - predictions for the sample. - :param verbose: Whether to display the image after it is rendered. - :param show_panoptic: When set to True, the lidar data is colored with the panoptic labels. 
When set - to False, the colors of the lidar data represent the distance from the center of the ego vehicle. - If show_lidarseg is True, show_panoptic will be set to False. - """ - lidiar_render(sample_toekn, pred_data, out_path=out_path) - sample = nusc.get('sample', sample_toekn) - # sample = data['results'][sample_token_list[0]][0] - cams = [ - 'CAM_FRONT_LEFT', - 'CAM_FRONT', - 'CAM_FRONT_RIGHT', - 'CAM_BACK_LEFT', - 'CAM_BACK', - 'CAM_BACK_RIGHT', - ] - if ax is None: - _, ax = plt.subplots(4, 3, figsize=(24, 18)) - j = 0 - for ind, cam in enumerate(cams): - sample_data_token = sample['data'][cam] - - sd_record = nusc.get('sample_data', sample_data_token) - sensor_modality = sd_record['sensor_modality'] - - if sensor_modality in ['lidar', 'radar']: - assert False - elif sensor_modality == 'camera': - # Load boxes and image. - boxes = [Box(record['translation'], record['size'], Quaternion(record['rotation']), - name=record['detection_name'], token='predicted') for record in - pred_data['results'][sample_toekn] if record['detection_score'] > 0.2] - - data_path, boxes_pred, camera_intrinsic = get_predicted_data(sample_data_token, - box_vis_level=box_vis_level, pred_anns=boxes) - _, boxes_gt, _ = nusc.get_sample_data(sample_data_token, box_vis_level=box_vis_level) - if ind == 3: - j += 1 - ind = ind % 3 - data = Image.open(data_path) - # mmcv.imwrite(np.array(data)[:,:,::-1], f'{cam}.png') - # Init axes. - - # Show image. - ax[j, ind].imshow(data) - ax[j + 2, ind].imshow(data) - - # Show boxes. - if with_anns: - for box in boxes_pred: - c = np.array(get_color(box.name)) / 255.0 - box.render(ax[j, ind], view=camera_intrinsic, normalize=True, colors=(c, c, c)) - for box in boxes_gt: - c = np.array(get_color(box.name)) / 255.0 - box.render(ax[j + 2, ind], view=camera_intrinsic, normalize=True, colors=(c, c, c)) - - # Limit visible range. 
- ax[j, ind].set_xlim(0, data.size[0]) - ax[j, ind].set_ylim(data.size[1], 0) - ax[j + 2, ind].set_xlim(0, data.size[0]) - ax[j + 2, ind].set_ylim(data.size[1], 0) - - else: - raise ValueError("Error: Unknown sensor modality!") - - ax[j, ind].axis('off') - ax[j, ind].set_title('PRED: {} {labels_type}'.format( - sd_record['channel'], labels_type='(predictions)' if lidarseg_preds_bin_path else '')) - ax[j, ind].set_aspect('equal') - - ax[j + 2, ind].axis('off') - ax[j + 2, ind].set_title('GT:{} {labels_type}'.format( - sd_record['channel'], labels_type='(predictions)' if lidarseg_preds_bin_path else '')) - ax[j + 2, ind].set_aspect('equal') - - if out_path is not None: - plt.savefig(out_path+'_camera', bbox_inches='tight', pad_inches=0, dpi=200) - if verbose: - plt.show() - plt.close() - -if __name__ == '__main__': - nusc = NuScenes(version='v1.0-trainval', dataroot='./data/nuscenes', verbose=True) - # render_annotation('7603b030b42a4b1caa8c443ccc1a7d52') - bevformer_results = mmcv.load('test/bevformer_base/Thu_Jun__9_16_22_37_2022/pts_bbox/results_nusc.json') - sample_token_list = list(bevformer_results['results'].keys()) - for id in range(0, 10): - render_sample_data(sample_token_list[id], pred_data=bevformer_results, out_path=sample_token_list[id]) +# Based on https://github.com/nutonomy/nuscenes-devkit +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import mmcv +from nuscenes.nuscenes import NuScenes +from PIL import Image +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +from typing import Tuple, List, Iterable +import matplotlib.pyplot as plt +import numpy as np +from PIL import Image +from matplotlib import rcParams +from matplotlib.axes import Axes +from pyquaternion import Quaternion +from PIL import Image +from matplotlib import rcParams +from matplotlib.axes import Axes +from pyquaternion import Quaternion +from tqdm import tqdm +from nuscenes.utils.data_classes import LidarPointCloud, RadarPointCloud, Box +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +from nuscenes.eval.common.data_classes import EvalBoxes, EvalBox +from nuscenes.eval.detection.data_classes import DetectionBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.eval.detection.render import visualize_sample + + + + +cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + +import numpy as np +import matplotlib.pyplot as plt +from nuscenes.utils.data_classes import LidarPointCloud, RadarPointCloud, Box +from PIL import Image +from matplotlib import rcParams + + +def render_annotation( + anntoken: str, + margin: float = 10, + view: np.ndarray = np.eye(4), + box_vis_level: BoxVisibility = BoxVisibility.ANY, + out_path: str = 'render.png', + extra_info: bool = False) -> None: + """ + Render selected annotation. + :param anntoken: Sample_annotation token. + :param margin: How many meters in each direction to include in LIDAR view. + :param view: LIDAR view point. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param out_path: Optional path to save the rendered figure to disk. + :param extra_info: Whether to render extra information below camera view. 
+ """ + ann_record = nusc.get('sample_annotation', anntoken) + sample_record = nusc.get('sample', ann_record['sample_token']) + assert 'LIDAR_TOP' in sample_record['data'].keys(), 'Error: No LIDAR_TOP in data, unable to render.' + + # Figure out which camera the object is fully visible in (this may return nothing). + boxes, cam = [], [] + cams = [key for key in sample_record['data'].keys() if 'CAM' in key] + all_bboxes = [] + select_cams = [] + for cam in cams: + _, boxes, _ = nusc.get_sample_data(sample_record['data'][cam], box_vis_level=box_vis_level, + selected_anntokens=[anntoken]) + if len(boxes) > 0: + all_bboxes.append(boxes) + select_cams.append(cam) + # We found an image that matches. Let's abort. + # assert len(boxes) > 0, 'Error: Could not find image where annotation is visible. ' \ + # 'Try using e.g. BoxVisibility.ANY.' + # assert len(boxes) < 2, 'Error: Found multiple annotations. Something is wrong!' + + num_cam = len(all_bboxes) + + fig, axes = plt.subplots(1, num_cam + 1, figsize=(18, 9)) + select_cams = [sample_record['data'][cam] for cam in select_cams] + print('bbox in cams:', select_cams) + # Plot LIDAR view. + lidar = sample_record['data']['LIDAR_TOP'] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(lidar, selected_anntokens=[anntoken]) + LidarPointCloud.from_file(data_path).render_height(axes[0], view=view) + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[0], view=view, colors=(c, c, c)) + corners = view_points(boxes[0].corners(), view, False)[:2, :] + axes[0].set_xlim([np.min(corners[0, :]) - margin, np.max(corners[0, :]) + margin]) + axes[0].set_ylim([np.min(corners[1, :]) - margin, np.max(corners[1, :]) + margin]) + axes[0].axis('off') + axes[0].set_aspect('equal') + + # Plot CAMERA view. + for i in range(1, num_cam + 1): + cam = select_cams[i - 1] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(cam, selected_anntokens=[anntoken]) + im = Image.open(data_path) + axes[i].imshow(im) + axes[i].set_title(nusc.get('sample_data', cam)['channel']) + axes[i].axis('off') + axes[i].set_aspect('equal') + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[i], view=camera_intrinsic, normalize=True, colors=(c, c, c)) + + # Print extra information about the annotation below the camera view. 
+ axes[i].set_xlim(0, im.size[0]) + axes[i].set_ylim(im.size[1], 0) + + if extra_info: + rcParams['font.family'] = 'monospace' + + w, l, h = ann_record['size'] + category = ann_record['category_name'] + lidar_points = ann_record['num_lidar_pts'] + radar_points = ann_record['num_radar_pts'] + + sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP']) + pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token']) + dist = np.linalg.norm(np.array(pose_record['translation']) - np.array(ann_record['translation'])) + + information = ' \n'.join(['category: {}'.format(category), + '', + '# lidar points: {0:>4}'.format(lidar_points), + '# radar points: {0:>4}'.format(radar_points), + '', + 'distance: {:>7.3f}m'.format(dist), + '', + 'width: {:>7.3f}m'.format(w), + 'length: {:>7.3f}m'.format(l), + 'height: {:>7.3f}m'.format(h)]) + + plt.annotate(information, (0, 0), (0, -20), xycoords='axes fraction', textcoords='offset points', va='top') + + if out_path is not None: + plt.savefig(out_path) + + + +def get_sample_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. + :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + if selected_anntokens is not None: + boxes = list(map(nusc.get_box, selected_anntokens)) + else: + boxes = nusc.get_boxes(sample_data_token) + + # Make list of Box objects including coord system transforms. + box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. 
+ box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + + +def get_predicted_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False, + pred_anns=None + ): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. + :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + # if selected_anntokens is not None: + # boxes = list(map(nusc.get_box, selected_anntokens)) + # else: + # boxes = nusc.get_boxes(sample_data_token) + boxes = pred_anns + # Make list of Box objects including coord system transforms. + box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. 
+ box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + + + +def lidiar_render(sample_token, data,out_path=None): + bbox_gt_list = [] + bbox_pred_list = [] + anns = nusc.get('sample', sample_token)['anns'] + for ann in anns: + content = nusc.get('sample_annotation', ann) + try: + bbox_gt_list.append(DetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=nusc.box_velocity(content['token'])[:2], + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=category_to_detection_name(content['category_name']), + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name='')) + except: + pass + + bbox_anns = data['results'][sample_token] + for content in bbox_anns: + bbox_pred_list.append(DetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'])) + gt_annotations = EvalBoxes() + pred_annotations = EvalBoxes() + gt_annotations.add_boxes(sample_token, bbox_gt_list) + pred_annotations.add_boxes(sample_token, bbox_pred_list) + print('green is ground truth') + print('blue is the predited result') + visualize_sample(nusc, sample_token, gt_annotations, pred_annotations, savepath=out_path+'_bev') + + +def get_color(category_name: str): + """ + Provides the default colors based on the category names. + This method works for the general nuScenes categories, as well as the nuScenes detection categories. 
+ """ + a = ['noise', 'animal', 'human.pedestrian.adult', 'human.pedestrian.child', 'human.pedestrian.construction_worker', + 'human.pedestrian.personal_mobility', 'human.pedestrian.police_officer', 'human.pedestrian.stroller', + 'human.pedestrian.wheelchair', 'movable_object.barrier', 'movable_object.debris', + 'movable_object.pushable_pullable', 'movable_object.trafficcone', 'static_object.bicycle_rack', 'vehicle.bicycle', + 'vehicle.bus.bendy', 'vehicle.bus.rigid', 'vehicle.car', 'vehicle.construction', 'vehicle.emergency.ambulance', + 'vehicle.emergency.police', 'vehicle.motorcycle', 'vehicle.trailer', 'vehicle.truck', 'flat.driveable_surface', + 'flat.other', 'flat.sidewalk', 'flat.terrain', 'static.manmade', 'static.other', 'static.vegetation', + 'vehicle.ego'] + class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' + ] + #print(category_name) + if category_name == 'bicycle': + return nusc.colormap['vehicle.bicycle'] + elif category_name == 'construction_vehicle': + return nusc.colormap['vehicle.construction'] + elif category_name == 'traffic_cone': + return nusc.colormap['movable_object.trafficcone'] + + for key in nusc.colormap.keys(): + if category_name in key: + return nusc.colormap[key] + return [0, 0, 0] + + +def render_sample_data( + sample_toekn: str, + with_anns: bool = True, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + axes_limit: float = 40, + ax=None, + nsweeps: int = 1, + out_path: str = None, + underlay_map: bool = True, + use_flat_vehicle_coordinates: bool = True, + show_lidarseg: bool = False, + show_lidarseg_legend: bool = False, + filter_lidarseg_labels=None, + lidarseg_preds_bin_path: str = None, + verbose: bool = True, + show_panoptic: bool = False, + pred_data=None, + ) -> None: + """ + Render sample data onto axis. + :param sample_data_token: Sample_data token. + :param with_anns: Whether to draw box annotations. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param axes_limit: Axes limit for lidar and radar (measured in meters). + :param ax: Axes onto which to render. + :param nsweeps: Number of sweeps for lidar and radar. + :param out_path: Optional path to save the rendered figure to disk. + :param underlay_map: When set to true, lidar data is plotted onto the map. This can be slow. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. Note: Previously this method did not use flat vehicle coordinates, which + can lead to small errors when the vertical axis of the global frame and lidar are not aligned. The new + setting is more correct and rotates the plot by ~90 degrees. + :param show_lidarseg: When set to True, the lidar data is colored with the segmentation labels. When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + :param show_lidarseg_legend: Whether to display the legend for the lidarseg labels in the frame. + :param filter_lidarseg_labels: Only show lidar points which belong to the given list of classes. If None + or the list is empty, all classes will be displayed. + :param lidarseg_preds_bin_path: A path to the .bin file which contains the user's lidar segmentation + predictions for the sample. + :param verbose: Whether to display the image after it is rendered. + :param show_panoptic: When set to True, the lidar data is colored with the panoptic labels. 
When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + If show_lidarseg is True, show_panoptic will be set to False. + """ + lidiar_render(sample_toekn, pred_data, out_path=out_path) + sample = nusc.get('sample', sample_toekn) + # sample = data['results'][sample_token_list[0]][0] + cams = [ + 'CAM_FRONT_LEFT', + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_LEFT', + 'CAM_BACK', + 'CAM_BACK_RIGHT', + ] + if ax is None: + _, ax = plt.subplots(4, 3, figsize=(24, 18)) + j = 0 + for ind, cam in enumerate(cams): + sample_data_token = sample['data'][cam] + + sd_record = nusc.get('sample_data', sample_data_token) + sensor_modality = sd_record['sensor_modality'] + + if sensor_modality in ['lidar', 'radar']: + assert False + elif sensor_modality == 'camera': + # Load boxes and image. + boxes = [Box(record['translation'], record['size'], Quaternion(record['rotation']), + name=record['detection_name'], token='predicted') for record in + pred_data['results'][sample_toekn] if record['detection_score'] > 0.2] + + data_path, boxes_pred, camera_intrinsic = get_predicted_data(sample_data_token, + box_vis_level=box_vis_level, pred_anns=boxes) + _, boxes_gt, _ = nusc.get_sample_data(sample_data_token, box_vis_level=box_vis_level) + if ind == 3: + j += 1 + ind = ind % 3 + data = Image.open(data_path) + # mmcv.imwrite(np.array(data)[:,:,::-1], f'{cam}.png') + # Init axes. + + # Show image. + ax[j, ind].imshow(data) + ax[j + 2, ind].imshow(data) + + # Show boxes. + if with_anns: + for box in boxes_pred: + c = np.array(get_color(box.name)) / 255.0 + box.render(ax[j, ind], view=camera_intrinsic, normalize=True, colors=(c, c, c)) + for box in boxes_gt: + c = np.array(get_color(box.name)) / 255.0 + box.render(ax[j + 2, ind], view=camera_intrinsic, normalize=True, colors=(c, c, c)) + + # Limit visible range. + ax[j, ind].set_xlim(0, data.size[0]) + ax[j, ind].set_ylim(data.size[1], 0) + ax[j + 2, ind].set_xlim(0, data.size[0]) + ax[j + 2, ind].set_ylim(data.size[1], 0) + + else: + raise ValueError("Error: Unknown sensor modality!") + + ax[j, ind].axis('off') + ax[j, ind].set_title('PRED: {} {labels_type}'.format( + sd_record['channel'], labels_type='(predictions)' if lidarseg_preds_bin_path else '')) + ax[j, ind].set_aspect('equal') + + ax[j + 2, ind].axis('off') + ax[j + 2, ind].set_title('GT:{} {labels_type}'.format( + sd_record['channel'], labels_type='(predictions)' if lidarseg_preds_bin_path else '')) + ax[j + 2, ind].set_aspect('equal') + + if out_path is not None: + plt.savefig(out_path+'_camera', bbox_inches='tight', pad_inches=0, dpi=200) + if verbose: + plt.show() + plt.close() + +if __name__ == '__main__': + nusc = NuScenes(version='v1.0-trainval', dataroot='./data/nuscenes', verbose=True) + # render_annotation('7603b030b42a4b1caa8c443ccc1a7d52') + bevformer_results = mmcv.load('test/bevformer_base/Thu_Jun__9_16_22_37_2022/pts_bbox/results_nusc.json') + sample_token_list = list(bevformer_results['results'].keys()) + for id in range(0, 10): + render_sample_data(sample_token_list[id], pred_data=bevformer_results, out_path=sample_token_list[id]) diff --git a/cv/3d_detection/bevformer/pytorch/tools/fp16/train.py b/cv/3d_detection/bevformer/pytorch/tools/fp16/train.py index 322c77587..3a6d9dd0b 100755 --- a/cv/3d_detection/bevformer/pytorch/tools/fp16/train.py +++ b/cv/3d_detection/bevformer/pytorch/tools/fp16/train.py @@ -1,272 +1,272 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
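# --- Editor's note (not part of the patch): the visual.py hunk above keeps only predicted
# nuScenes boxes with detection_score > 0.2 and colours each box by matching its detection
# name against the nuScenes colormap (see get_color). A minimal, self-contained sketch of
# that filter-and-colour step follows; the colormap and records here are hypothetical
# stand-ins for illustration, not the devkit's actual data.
def pick_color(detection_name, colormap):
    # Mirror get_color(): explicit aliases first, then substring match, else black.
    aliases = {
        'bicycle': 'vehicle.bicycle',
        'construction_vehicle': 'vehicle.construction',
        'traffic_cone': 'movable_object.trafficcone',
    }
    if detection_name in aliases:
        return colormap[aliases[detection_name]]
    for key, rgb in colormap.items():
        if detection_name in key:
            return rgb
    return (0, 0, 0)

def filter_predictions(records, score_thr=0.2):
    # Keep only confident detections, as render_sample_data() does before drawing boxes.
    return [r for r in records if r['detection_score'] > score_thr]

if __name__ == '__main__':
    demo_colormap = {'vehicle.car': (255, 158, 0), 'human.pedestrian.adult': (0, 0, 230)}
    demo_records = [
        {'detection_name': 'car', 'detection_score': 0.9},
        {'detection_name': 'pedestrian', 'detection_score': 0.1},
    ]
    kept = filter_predictions(demo_records)
    print([(r['detection_name'], pick_color(r['detection_name'], demo_colormap)) for r in kept])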
-from __future__ import division - -import argparse -import copy -import mmcv -import os -import time -import torch -import warnings -from mmcv import Config, DictAction -from mmcv.runner import get_dist_info, init_dist, wrap_fp16_model -from os import path as osp - -from mmdet import __version__ as mmdet_version -from mmdet3d import __version__ as mmdet3d_version -#from mmdet3d.apis import train_model - -from mmdet3d.datasets import build_dataset -from mmdet3d.models import build_model -from mmdet3d.utils import collect_env, get_root_logger -from mmdet.apis import set_random_seed -from mmseg import __version__ as mmseg_version - -from mmcv.utils import TORCH_VERSION, digit_version - -def parse_args(): - parser = argparse.ArgumentParser(description='Train a detector') - parser.add_argument('config', help='train config file path') - parser.add_argument('--work-dir', help='the dir to save logs and models') - parser.add_argument( - '--resume-from', help='the checkpoint file to resume from') - parser.add_argument( - '--no-validate', - action='store_true', - help='whether not to evaluate the checkpoint during training') - group_gpus = parser.add_mutually_exclusive_group() - group_gpus.add_argument( - '--gpus', - type=int, - help='number of gpus to use ' - '(only applicable to non-distributed training)') - group_gpus.add_argument( - '--gpu-ids', - type=int, - nargs='+', - help='ids of gpus to use ' - '(only applicable to non-distributed training)') - parser.add_argument('--seed', type=int, default=0, help='random seed') - parser.add_argument( - '--deterministic', - action='store_true', - help='whether to set deterministic options for CUDNN backend.') - parser.add_argument( - '--options', - nargs='+', - action=DictAction, - help='override some settings in the used config, the key-value pair ' - 'in xxx=yyy format will be merged into config file (deprecate), ' - 'change to --cfg-options instead.') - parser.add_argument( - '--cfg-options', - nargs='+', - action=DictAction, - help='override some settings in the used config, the key-value pair ' - 'in xxx=yyy format will be merged into config file. If the value to ' - 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' - 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' - 'Note that the quotation marks are necessary and that no white space ' - 'is allowed.') - parser.add_argument( - '--launcher', - choices=['none', 'pytorch', 'slurm', 'mpi'], - default='none', - help='job launcher') - parser.add_argument('--local_rank', type=int, default=0) - parser.add_argument( - '--autoscale-lr', - action='store_true', - help='automatically scale lr with the number of gpus') - args = parser.parse_args() - if 'LOCAL_RANK' not in os.environ: - os.environ['LOCAL_RANK'] = str(args.local_rank) - - if args.options and args.cfg_options: - raise ValueError( - '--options and --cfg-options cannot be both specified, ' - '--options is deprecated in favor of --cfg-options') - if args.options: - warnings.warn('--options is deprecated in favor of --cfg-options') - args.cfg_options = args.options - - return args - - -def main(): - args = parse_args() - - cfg = Config.fromfile(args.config) - if args.cfg_options is not None: - cfg.merge_from_dict(args.cfg_options) - # import modules from string list. 
- if cfg.get('custom_imports', None): - from mmcv.utils import import_modules_from_strings - import_modules_from_strings(**cfg['custom_imports']) - - # import modules from plguin/xx, registry will be updated - if hasattr(cfg, 'plugin'): - if cfg.plugin: - import importlib - if hasattr(cfg, 'plugin_dir'): - plugin_dir = cfg.plugin_dir - _module_dir = os.path.dirname(plugin_dir) - _module_dir = _module_dir.split('/') - _module_path = _module_dir[0] - - for m in _module_dir[1:]: - _module_path = _module_path + '.' + m - print(_module_path) - plg_lib = importlib.import_module(_module_path) - else: - # import dir is the dirpath for the config file - _module_dir = os.path.dirname(args.config) - _module_dir = _module_dir.split('/') - _module_path = _module_dir[0] - for m in _module_dir[1:]: - _module_path = _module_path + '.' + m - print(_module_path) - plg_lib = importlib.import_module(_module_path) - - from projects.mmdet3d_plugin.bevformer.apis import custom_train_model - # set cudnn_benchmark - if cfg.get('cudnn_benchmark', False): - torch.backends.cudnn.benchmark = True - - # work_dir is determined in this priority: CLI > segment in file > filename - if args.work_dir is not None: - # update configs according to CLI args if args.work_dir is not None - cfg.work_dir = args.work_dir - elif cfg.get('work_dir', None) is None: - # use config filename as default work_dir if cfg.work_dir is None - cfg.work_dir = osp.join('./work_dirs', - osp.splitext(osp.basename(args.config))[0]) - #if args.resume_from is not None: - - if args.resume_from is not None and osp.isfile(args.resume_from): - cfg.resume_from = args.resume_from - - if args.gpu_ids is not None: - cfg.gpu_ids = args.gpu_ids - else: - cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) - if digit_version(TORCH_VERSION) != digit_version('1.8.1'): - cfg.optimizer['type'] = 'AdamW' - if args.autoscale_lr: - # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) - cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 - - # init distributed env first, since logger depends on the dist info. - if args.launcher == 'none': - assert False, 'DOT NOT SUPPORT!!!' 
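The plugin block above turns a directory path taken from the config into a dotted module path and imports it so that the custom registries are populated as a side effect. A condensed sketch, assuming cfg.plugin_dir points at a package importable from the working directory (the path below is illustrative):

import importlib
import os

plugin_dir = 'projects/mmdet3d_plugin/'                       # illustrative cfg.plugin_dir value
module_path = os.path.dirname(plugin_dir).replace('/', '.')   # -> 'projects.mmdet3d_plugin'
plg_lib = importlib.import_module(module_path)                # importing registers the plugin modules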
- distributed = False - else: - distributed = True - init_dist(args.launcher, **cfg.dist_params) - # re-set gpu_ids with distributed training mode - _, world_size = get_dist_info() - cfg.gpu_ids = range(world_size) - - # create work_dir - mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) - # dump config - cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) - # init the logger before other steps - timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) - log_file = osp.join(cfg.work_dir, f'{timestamp}.log') - # specify logger name, if we still use 'mmdet', the output info will be - # filtered and won't be saved in the log_file - # TODO: ugly workaround to judge whether we are training det or seg model - if cfg.model.type in ['EncoderDecoder3D']: - logger_name = 'mmseg' - else: - logger_name = 'mmdet' - logger = get_root_logger( - log_file=log_file, log_level=cfg.log_level, name=logger_name) - - # init the meta dict to record some important information such as - # environment info and seed, which will be logged - meta = dict() - # log env info - env_info_dict = collect_env() - env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) - dash_line = '-' * 60 + '\n' - logger.info('Environment info:\n' + dash_line + env_info + '\n' + - dash_line) - meta['env_info'] = env_info - meta['config'] = cfg.pretty_text - - # log some basic info - logger.info(f'Distributed training: {distributed}') - logger.info(f'Config:\n{cfg.pretty_text}') - - # set random seeds - if args.seed is not None: - logger.info(f'Set random seed to {args.seed}, ' - f'deterministic: {args.deterministic}') - set_random_seed(args.seed, deterministic=args.deterministic) - cfg.seed = args.seed - meta['seed'] = args.seed - meta['exp_name'] = osp.basename(args.config) - - model = build_model( - cfg.model, - train_cfg=cfg.get('train_cfg'), - test_cfg=cfg.get('test_cfg')) - model.init_weights() - - eval_model_config = copy.deepcopy(cfg.model) - eval_model = build_model( - eval_model_config, - train_cfg=cfg.get('train_cfg'), - test_cfg=cfg.get('test_cfg')) - - fp16_cfg = cfg.get('fp16', None) - if fp16_cfg is not None: - wrap_fp16_model(eval_model) - - #eval_model.init_weights() - eval_model.load_state_dict(model.state_dict()) - - logger.info(f'Model:\n{model}') - from projects.mmdet3d_plugin.datasets import custom_build_dataset - datasets = [custom_build_dataset(cfg.data.train)] - if len(cfg.workflow) == 2: - val_dataset = copy.deepcopy(cfg.data.val) - # in case we use a dataset wrapper - if 'dataset' in cfg.data.train: - val_dataset.pipeline = cfg.data.train.dataset.pipeline - else: - val_dataset.pipeline = cfg.data.train.pipeline - # set test_mode=False here in deep copied config - # which do not affect AP/AR calculation later - # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa - val_dataset.test_mode = False - datasets.append(custom_build_dataset(val_dataset)) - if cfg.checkpoint_config is not None: - # save mmdet version, config file content and class names in - # checkpoints as meta data - cfg.checkpoint_config.meta = dict( - mmdet_version=mmdet_version, - mmseg_version=mmseg_version, - mmdet3d_version=mmdet3d_version, - config=cfg.pretty_text, - CLASSES=datasets[0].CLASSES, - PALETTE=datasets[0].PALETTE # for segmentors - if hasattr(datasets[0], 'PALETTE') else None) - # add an attribute for visualization convenience - model.CLASSES = datasets[0].CLASSES - custom_train_model( - model, - datasets, - cfg, - eval_model=eval_model, - 
distributed=distributed, - validate=(not args.no_validate), - timestamp=timestamp, - meta=meta) - - -if __name__ == '__main__': - torch.multiprocessing.set_start_method('fork') - main() +# Copyright (c) OpenMMLab. All rights reserved. +from __future__ import division + +import argparse +import copy +import mmcv +import os +import time +import torch +import warnings +from mmcv import Config, DictAction +from mmcv.runner import get_dist_info, init_dist, wrap_fp16_model +from os import path as osp + +from mmdet import __version__ as mmdet_version +from mmdet3d import __version__ as mmdet3d_version +#from mmdet3d.apis import train_model + +from mmdet3d.datasets import build_dataset +from mmdet3d.models import build_model +from mmdet3d.utils import collect_env, get_root_logger +from mmdet.apis import set_random_seed +from mmseg import __version__ as mmseg_version + +from mmcv.utils import TORCH_VERSION, digit_version + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume-from', help='the checkpoint file to resume from') + parser.add_argument( + '--no-validate', + action='store_true', + help='whether not to evaluate the checkpoint during training') + group_gpus = parser.add_mutually_exclusive_group() + group_gpus.add_argument( + '--gpus', + type=int, + help='number of gpus to use ' + '(only applicable to non-distributed training)') + group_gpus.add_argument( + '--gpu-ids', + type=int, + nargs='+', + help='ids of gpus to use ' + '(only applicable to non-distributed training)') + parser.add_argument('--seed', type=int, default=0, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file (deprecate), ' + 'change to --cfg-options instead.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument( + '--autoscale-lr', + action='store_true', + help='automatically scale lr with the number of gpus') + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + if args.options and args.cfg_options: + raise ValueError( + '--options and --cfg-options cannot be both specified, ' + '--options is deprecated in favor of --cfg-options') + if args.options: + warnings.warn('--options is deprecated in favor of --cfg-options') + args.cfg_options = args.options + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # import modules from string list. 
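The parser above relies on argparse's mutually exclusive groups to reject --gpus together with --gpu-ids. A stripped-down sketch of just that part (option names match the parser above):

import argparse

parser = argparse.ArgumentParser(description='Train a detector')
group_gpus = parser.add_mutually_exclusive_group()
group_gpus.add_argument('--gpus', type=int)
group_gpus.add_argument('--gpu-ids', type=int, nargs='+')

print(parser.parse_args(['--gpu-ids', '0', '1']).gpu_ids)   # [0, 1]
# parser.parse_args(['--gpus', '2', '--gpu-ids', '0']) exits with
# "argument --gpu-ids: not allowed with argument --gpus"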
+ if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + + # import modules from plguin/xx, registry will be updated + if hasattr(cfg, 'plugin'): + if cfg.plugin: + import importlib + if hasattr(cfg, 'plugin_dir'): + plugin_dir = cfg.plugin_dir + _module_dir = os.path.dirname(plugin_dir) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + else: + # import dir is the dirpath for the config file + _module_dir = os.path.dirname(args.config) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + from projects.mmdet3d_plugin.bevformer.apis import custom_train_model + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + #if args.resume_from is not None: + + if args.resume_from is not None and osp.isfile(args.resume_from): + cfg.resume_from = args.resume_from + + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids + else: + cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) + if digit_version(TORCH_VERSION) != digit_version('1.8.1'): + cfg.optimizer['type'] = 'AdamW' + if args.autoscale_lr: + # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) + cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + assert False, 'DOT NOT SUPPORT!!!' 
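When --autoscale-lr is set, the learning rate is rescaled with the linear scaling rule (arXiv:1706.02677), treating the config value as tuned for 8 GPUs. A worked example with an illustrative base rate:

base_lr = 2e-4                            # illustrative config value tuned for 8 GPUs
for n_gpus in (1, 4, 8, 16):
    print(n_gpus, base_lr * n_gpus / 8)   # 2.5e-05, 1e-04, 2e-04, 4e-04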
+ distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + # re-set gpu_ids with distributed training mode + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + # dump config + cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + # specify logger name, if we still use 'mmdet', the output info will be + # filtered and won't be saved in the log_file + # TODO: ugly workaround to judge whether we are training det or seg model + if cfg.model.type in ['EncoderDecoder3D']: + logger_name = 'mmseg' + else: + logger_name = 'mmdet' + logger = get_root_logger( + log_file=log_file, log_level=cfg.log_level, name=logger_name) + + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + meta['config'] = cfg.pretty_text + + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + if args.seed is not None: + logger.info(f'Set random seed to {args.seed}, ' + f'deterministic: {args.deterministic}') + set_random_seed(args.seed, deterministic=args.deterministic) + cfg.seed = args.seed + meta['seed'] = args.seed + meta['exp_name'] = osp.basename(args.config) + + model = build_model( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + model.init_weights() + + eval_model_config = copy.deepcopy(cfg.model) + eval_model = build_model( + eval_model_config, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(eval_model) + + #eval_model.init_weights() + eval_model.load_state_dict(model.state_dict()) + + logger.info(f'Model:\n{model}') + from projects.mmdet3d_plugin.datasets import custom_build_dataset + datasets = [custom_build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + # in case we use a dataset wrapper + if 'dataset' in cfg.data.train: + val_dataset.pipeline = cfg.data.train.dataset.pipeline + else: + val_dataset.pipeline = cfg.data.train.pipeline + # set test_mode=False here in deep copied config + # which do not affect AP/AR calculation later + # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa + val_dataset.test_mode = False + datasets.append(custom_build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmdet version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmdet_version=mmdet_version, + mmseg_version=mmseg_version, + mmdet3d_version=mmdet3d_version, + config=cfg.pretty_text, + CLASSES=datasets[0].CLASSES, + PALETTE=datasets[0].PALETTE # for segmentors + if hasattr(datasets[0], 'PALETTE') else None) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + custom_train_model( + model, + datasets, + cfg, + eval_model=eval_model, + 
distributed=distributed, + validate=(not args.no_validate), + timestamp=timestamp, + meta=meta) + + +if __name__ == '__main__': + torch.multiprocessing.set_start_method('fork') + main() diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/core/bbox/box_np_ops.py b/cv/3d_detection/centerpoint/pytorch/det3d/core/bbox/box_np_ops.py index 5d7e5c81a..0d06127f9 100644 --- a/cv/3d_detection/centerpoint/pytorch/det3d/core/bbox/box_np_ops.py +++ b/cv/3d_detection/centerpoint/pytorch/det3d/core/bbox/box_np_ops.py @@ -1,803 +1,803 @@ -from pathlib import Path - -import numba -import numpy as np -from det3d.core.bbox.geometry import ( - points_count_convex_polygon_3d_jit, - points_in_convex_polygon_3d_jit, -) -try: - from spconv.utils import rbbox_intersection, rbbox_iou -except: - print("Import spconv fail, no support for sparse convolution!") - - -def points_count_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0.5)): - rbbox_corners = center_to_corner_box3d( - rbbox[:, :3], rbbox[:, 3:6], rbbox[:, -1], origin=origin, axis=z_axis - ) - surfaces = corner_to_surfaces_3d(rbbox_corners) - return points_count_convex_polygon_3d_jit(points[:, :3], surfaces) - - -def riou_cc(rbboxes, qrbboxes, standup_thresh=0.0): - # less than 50ms when used in second one thread. 10x slower than gpu - boxes_corners = center_to_corner_box2d( - rbboxes[:, :2], rbboxes[:, 2:4], rbboxes[:, 4] - ) - boxes_standup = corner_to_standup_nd(boxes_corners) - qboxes_corners = center_to_corner_box2d( - qrbboxes[:, :2], qrbboxes[:, 2:4], qrbboxes[:, 4] - ) - qboxes_standup = corner_to_standup_nd(qboxes_corners) - # if standup box not overlapped, rbbox not overlapped too. - standup_iou = iou_jit(boxes_standup, qboxes_standup, eps=0.0) - return rbbox_iou(boxes_corners, qboxes_corners, standup_iou, standup_thresh) - - -def rinter_cc(rbboxes, qrbboxes, standup_thresh=0.0): - # less than 50ms when used in second one thread. 10x slower than gpu - boxes_corners = center_to_corner_box2d( - rbboxes[:, :2], rbboxes[:, 2:4], rbboxes[:, 4] - ) - boxes_standup = corner_to_standup_nd(boxes_corners) - qboxes_corners = center_to_corner_box2d( - qrbboxes[:, :2], qrbboxes[:, 2:4], qrbboxes[:, 4] - ) - qboxes_standup = corner_to_standup_nd(qboxes_corners) - # if standup box not overlapped, rbbox not overlapped too. - standup_iou = iou_jit(boxes_standup, qboxes_standup, eps=0.0) - return rbbox_intersection( - boxes_corners, qboxes_corners, standup_iou, standup_thresh - ) - - -def corners_nd(dims, origin=0.5): - """generate relative box corners based on length per dim and - origin point. - - Args: - dims (float array, shape=[N, ndim]): array of length per dim - origin (list or array or float): origin point relate to smallest point. - - Returns: - float array, shape=[N, 2 ** ndim, ndim]: returned corners. - point layout example: (2d) x0y0, x0y1, x1y0, x1y1; - (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 - where x0 < x1, y0 < y1, z0 < z1 - """ - ndim = int(dims.shape[1]) - corners_norm = np.stack( - np.unravel_index(np.arange(2 ** ndim), [2] * ndim), axis=1 - ).astype(dims.dtype) - # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 - # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 - # so need to convert to a format which is convenient to do other computing. - # for 2d boxes, format is clockwise start with minimum point - # for 3d boxes, please draw lines by your hand. 
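For the 2-D case, a quick standalone check of what the np.unravel_index enumeration and the reindexing in corners_nd produce (illustrative snippet, independent of this module):

import numpy as np

corners_norm = np.stack(np.unravel_index(np.arange(4), [2, 2]), axis=1)
print(corners_norm.tolist())                 # [[0, 0], [0, 1], [1, 0], [1, 1]]
print(corners_norm[[0, 1, 3, 2]].tolist())   # [[0, 0], [0, 1], [1, 1], [1, 0]], clockwise from the minimum corner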
- if ndim == 2: - # generate clockwise box corners - corners_norm = corners_norm[[0, 1, 3, 2]] - elif ndim == 3: - corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] - corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) - corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape([1, 2 ** ndim, ndim]) - return corners - - -@numba.njit -def corners_2d_jit(dims, origin=0.5): - ndim = 2 - corners_norm = np.array([[0, 0], [0, 1], [1, 1], [1, 0]], dtype=dims.dtype) - corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) - corners = dims.reshape((-1, 1, ndim)) * corners_norm.reshape((1, 2 ** ndim, ndim)) - return corners - - -@numba.njit -def corners_3d_jit(dims, origin=0.5): - ndim = 3 - corners_norm = np.array( - [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1], - dtype=dims.dtype, - ).reshape((8, 3)) - corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] - corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) - corners = dims.reshape((-1, 1, ndim)) * corners_norm.reshape((1, 2 ** ndim, ndim)) - return corners - - -@numba.njit -def corner_to_standup_nd_jit(boxes_corner): - num_boxes = boxes_corner.shape[0] - ndim = boxes_corner.shape[-1] - result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype) - for i in range(num_boxes): - for j in range(ndim): - result[i, j] = np.min(boxes_corner[i, :, j]) - for j in range(ndim): - result[i, j + ndim] = np.max(boxes_corner[i, :, j]) - return result - - -def corner_to_standup_nd(boxes_corner): - assert len(boxes_corner.shape) == 3 - standup_boxes = [] - standup_boxes.append(np.min(boxes_corner, axis=1)) - standup_boxes.append(np.max(boxes_corner, axis=1)) - return np.concatenate(standup_boxes, -1) - - -def rbbox2d_to_near_bbox(rbboxes): - """convert rotated bbox to nearest 'standing' or 'lying' bbox. 
- Args: - rbboxes: [N, 5(x, y, xdim, ydim, rad)] rotated bboxes - Returns: - bboxes: [N, 4(xmin, ymin, xmax, ymax)] bboxes - """ - rots = rbboxes[..., -1] - rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi)) - cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis] - bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) - bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) - return bboxes - - -def rotation_3d_in_axis(points, angles, axis=0): - # points: [N, point_size, 3] - rot_sin = np.sin(angles) - rot_cos = np.cos(angles) - ones = np.ones_like(rot_cos) - zeros = np.zeros_like(rot_cos) - if axis == 1: - rot_mat_T = np.stack( - [ - [rot_cos, zeros, -rot_sin], - [zeros, ones, zeros], - [rot_sin, zeros, rot_cos], - ] - ) - elif axis == 2 or axis == -1: - rot_mat_T = np.stack( - [ - [rot_cos, -rot_sin, zeros], - [rot_sin, rot_cos, zeros], - [zeros, zeros, ones], - ] - ) - elif axis == 0: - rot_mat_T = np.stack( - [ - [zeros, rot_cos, -rot_sin], - [zeros, rot_sin, rot_cos], - [ones, zeros, zeros], - ] - ) - else: - raise ValueError("axis should in range") - - return np.einsum("aij,jka->aik", points, rot_mat_T) - - -def rotation_points_single_angle(points, angle, axis=0): - # points: [N, 3] - rot_sin = np.sin(angle) - rot_cos = np.cos(angle) - if axis == 1: - rot_mat_T = np.array( - [[rot_cos, 0, -rot_sin], [0, 1, 0], [rot_sin, 0, rot_cos]], - dtype=points.dtype, - ) - elif axis == 2 or axis == -1: - rot_mat_T = np.array( - [[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]], - dtype=points.dtype, - ) - elif axis == 0: - rot_mat_T = np.array( - [[1, 0, 0], [0, rot_cos, -rot_sin], [0, rot_sin, rot_cos]], - dtype=points.dtype, - ) - else: - raise ValueError("axis should in range") - - return points @ rot_mat_T - - -def rotation_2d(points, angles): - """rotation 2d points based on origin point clockwise when angle positive. - - Args: - points (float array, shape=[N, point_size, 2]): points to be rotated. - angles (float array, shape=[N]): rotation angle. - - Returns: - float array: same shape as points - """ - rot_sin = np.sin(angles) - rot_cos = np.cos(angles) - rot_mat_T = np.stack([[rot_cos, -rot_sin], [rot_sin, rot_cos]]) - return np.einsum("aij,jka->aik", points, rot_mat_T) - - -def rotation_box(box_corners, angle): - """rotation 2d points based on origin point clockwise when angle positive. - - Args: - points (float array, shape=[N, point_size, 2]): points to be rotated. - angle (float): rotation angle. - - Returns: - float array: same shape as points - """ - rot_sin = np.sin(angle) - rot_cos = np.cos(angle) - rot_mat_T = np.array( - [[rot_cos, -rot_sin], [rot_sin, rot_cos]], dtype=box_corners.dtype - ) - return box_corners @ rot_mat_T - - -def center_to_corner_box3d(centers, dims, angles=None, origin=(0.5, 0.5, 0.5), axis=2): - """convert kitti locations, dimensions and angles to corners - - Args: - centers (float array, shape=[N, 3]): locations in kitti label file. - dims (float array, shape=[N, 3]): dimensions in kitti label file. - angles (float array, shape=[N]): rotation_y in kitti label file. - origin (list or array or float): origin point relate to smallest point. - use [0.5, 1.0, 0.5] in camera and [0.5, 0.5, 0] in lidar. - axis (int): rotation axis. 1 for camera and 2 for lidar. - Returns: - [type]: [description] - """ - # 'length' in kitti format is in x axis. - # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) - # center in kitti format is [0.5, 1.0, 0.5] in xyz. 
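A small numeric check of the row-vector convention used by rotation_points_single_angle and rotation_3d_in_axis above: points are multiplied on the right by the transposed rotation matrix, so a positive yaw about the z axis rotates points clockwise in the x-y plane (the x axis maps toward -y), matching the "clockwise when angle positive" note in rotation_2d. Standalone snippet for illustration:

import numpy as np

angle = np.pi / 2
c, s = np.cos(angle), np.sin(angle)
rot_mat_T = np.array([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])  # the axis == 2 case above
p = np.array([[1.0, 0.0, 0.0]])
print(np.round(p @ rot_mat_T, 6))   # [[ 0. -1.  0.]]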
- corners = corners_nd(dims, origin=origin) - # corners: [N, 8, 3] - if angles is not None: - corners = rotation_3d_in_axis(corners, angles, axis=axis) - corners += centers.reshape([-1, 1, 3]) - return corners - - -def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): - """convert kitti locations, dimensions and angles to corners. - format: center(xy), dims(xy), angles(clockwise when positive) - - Args: - centers (float array, shape=[N, 2]): locations in kitti label file. - dims (float array, shape=[N, 2]): dimensions in kitti label file. - angles (float array, shape=[N]): rotation_y in kitti label file. - - Returns: - [type]: [description] - """ - # 'length' in kitti format is in x axis. - # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) - # center in kitti format is [0.5, 1.0, 0.5] in xyz. - corners = corners_nd(dims, origin=origin) - # corners: [N, 4, 2] - if angles is not None: - corners = rotation_2d(corners, angles) - corners += centers.reshape([-1, 1, 2]) - return corners - - -@numba.jit(nopython=True) -def box2d_to_corner_jit(boxes): - num_box = boxes.shape[0] - corners_norm = np.zeros((4, 2), dtype=boxes.dtype) - corners_norm[1, 1] = 1.0 - corners_norm[2] = 1.0 - corners_norm[3, 0] = 1.0 - corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) - corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape(1, 4, 2) - rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) - box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) - for i in range(num_box): - rot_sin = np.sin(boxes[i, -1]) - rot_cos = np.cos(boxes[i, -1]) - rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = -rot_sin - rot_mat_T[1, 0] = rot_sin - rot_mat_T[1, 1] = rot_cos - box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] - return box_corners - - -def rbbox3d_to_corners(rbboxes, origin=[0.5, 0.5, 0.5], axis=2): - return center_to_corner_box3d( - rbboxes[..., :3], rbboxes[..., 3:6], rbboxes[..., 6], origin, axis=axis - ) - - -def rbbox3d_to_bev_corners(rbboxes, origin=0.5): - return center_to_corner_box2d( - rbboxes[..., :2], rbboxes[..., 3:5], rbboxes[..., 6], origin - ) - - -def minmax_to_corner_2d(minmax_box): - ndim = minmax_box.shape[-1] // 2 - center = minmax_box[..., :ndim] - dims = minmax_box[..., ndim:] - center - return center_to_corner_box2d(center, dims, origin=0.0) - - -def minmax_to_corner_2d_v2(minmax_box): - # N, 4 -> N 4 2 - return minmax_box[..., [0, 1, 0, 3, 2, 3, 2, 1]].reshape(-1, 4, 2) - - -def minmax_to_corner_3d(minmax_box): - ndim = minmax_box.shape[-1] // 2 - center = minmax_box[..., :ndim] - dims = minmax_box[..., ndim:] - center - return center_to_corner_box3d(center, dims, origin=0.0) - - -def minmax_to_center_2d(minmax_box): - ndim = minmax_box.shape[-1] // 2 - center_min = minmax_box[..., :ndim] - dims = minmax_box[..., ndim:] - center_min - center = center_min + 0.5 * dims - return np.concatenate([center, dims], axis=-1) - - -def center_to_minmax_2d_0_5(centers, dims): - return np.concatenate([centers - dims / 2, centers + dims / 2], axis=-1) - - -def center_to_minmax_2d(centers, dims, origin=0.5): - if origin == 0.5: - return center_to_minmax_2d_0_5(centers, dims) - corners = center_to_corner_box2d(centers, dims, origin=origin) - return corners[:, [0, 2]].reshape([-1, 4]) - - -def limit_period(val, offset=0.5, period=np.pi): - return val - np.floor(val / period + offset) * period - - -def projection_matrix_to_CRT_kitti(proj): - # P = C @ [R|T] - # C is upper triangular matrix, so we need to inverse CR and use QR - # stable for all kitti 
camera projection matrix - CR = proj[0:3, 0:3] - CT = proj[0:3, 3] - RinvCinv = np.linalg.inv(CR) - Rinv, Cinv = np.linalg.qr(RinvCinv) - C = np.linalg.inv(Cinv) - R = np.linalg.inv(Rinv) - T = Cinv @ CT - return C, R, T - - -def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100): - fku = C[0, 0] - fkv = -C[1, 1] - u0v0 = C[0:2, 2] - z_points = np.array([near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis] - b = bbox_image - box_corners = np.array( - [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]], dtype=C.dtype - ) - near_box_corners = (box_corners - u0v0) / np.array( - [fku / near_clip, -fkv / near_clip], dtype=C.dtype - ) - far_box_corners = (box_corners - u0v0) / np.array( - [fku / far_clip, -fkv / far_clip], dtype=C.dtype - ) - ret_xy = np.concatenate([near_box_corners, far_box_corners], axis=0) # [8, 2] - ret_xyz = np.concatenate([ret_xy, z_points], axis=1) - return ret_xyz - - -def get_frustum_v2(bboxes, C, near_clip=0.001, far_clip=100): - fku = C[0, 0] - fkv = -C[1, 1] - u0v0 = C[0:2, 2] - num_box = bboxes.shape[0] - z_points = np.array([near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[ - np.newaxis, :, np.newaxis - ] - z_points = np.tile(z_points, [num_box, 1, 1]) - box_corners = minmax_to_corner_2d_v2(bboxes) - near_box_corners = (box_corners - u0v0) / np.array( - [fku / near_clip, -fkv / near_clip], dtype=C.dtype - ) - far_box_corners = (box_corners - u0v0) / np.array( - [fku / far_clip, -fkv / far_clip], dtype=C.dtype - ) - ret_xy = np.concatenate([near_box_corners, far_box_corners], axis=1) # [8, 2] - ret_xyz = np.concatenate([ret_xy, z_points], axis=-1) - return ret_xyz - - -@numba.njit -def _add_rgb_to_points_kernel(points_2d, image, points_rgb): - num_points = points_2d.shape[0] - image_h, image_w = image.shape[:2] - for i in range(num_points): - img_pos = np.floor(points_2d[i]).astype(np.int32) - if img_pos[0] >= 0 and img_pos[0] < image_w: - if img_pos[1] >= 0 and img_pos[1] < image_h: - points_rgb[i, :] = image[img_pos[1], img_pos[0], :] - # image[img_pos[1], img_pos[0]] = 0 - - -def add_rgb_to_points(points, image, rect, Trv2c, P2, mean_size=[5, 5]): - kernel = np.ones(mean_size, np.float32) / np.prod(mean_size) - # image = cv2.filter2D(image, -1, kernel) - points_cam = lidar_to_camera(points[:, :3], rect, Trv2c) - points_2d = project_to_image(points_cam, P2) - points_rgb = np.zeros([points_cam.shape[0], 3], dtype=points.dtype) - _add_rgb_to_points_kernel(points_2d, image, points_rgb) - return points_rgb - - -def project_to_image(points_3d, proj_mat): - points_shape = list(points_3d.shape) - points_shape[-1] = 1 - points_4 = np.concatenate([points_3d, np.ones(points_shape)], axis=-1) - point_2d = points_4 @ proj_mat.T - point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] - return point_2d_res - - -def camera_to_lidar(points, r_rect, velo2cam): - points_shape = list(points.shape[0:-1]) - if points.shape[-1] == 3: - points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1) - lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T) - return lidar_points[..., :3] - - -def lidar_to_camera(points, r_rect, velo2cam): - points_shape = list(points.shape[:-1]) - if points.shape[-1] == 3: - points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1) - camera_points = points @ (r_rect @ velo2cam).T - return camera_points[..., :3] - - -def box_camera_to_lidar(data, r_rect, velo2cam): - xyz = data[:, 0:3] - l, h, w = data[:, 3:4], data[:, 4:5], data[:, 5:6] - r = data[:, 6:7] - xyz_lidar = camera_to_lidar(xyz, r_rect, 
velo2cam) - return np.concatenate([xyz_lidar, w, l, h, r], axis=1) - - -def box_lidar_to_camera(data, r_rect, velo2cam): - xyz_lidar = data[:, 0:3] - w, l, h = data[:, 3:4], data[:, 4:5], data[:, 5:6] - r = data[:, 6:7] - xyz = lidar_to_camera(xyz_lidar, r_rect, velo2cam) - return np.concatenate([xyz, l, h, w, r], axis=1) - - -def remove_outside_points(points, rect, Trv2c, P2, image_shape): - # 5x faster than remove_outside_points_v1(2ms vs 10ms) - C, R, T = projection_matrix_to_CRT_kitti(P2) - image_bbox = [0, 0, image_shape[1], image_shape[0]] - frustum = get_frustum(image_bbox, C) - frustum -= T - frustum = np.linalg.inv(R) @ frustum.T - frustum = camera_to_lidar(frustum.T, rect, Trv2c) - frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...]) - indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces) - points = points[indices.reshape([-1])] - return points - - -@numba.jit(nopython=True) -def iou_jit(boxes, query_boxes, eps=1.0): - """calculate box iou. note that jit version runs 2x faster than cython in - my machine! - Parameters - ---------- - boxes: (N, 4) ndarray of float - query_boxes: (K, 4) ndarray of float - Returns - ------- - overlaps: (N, K) ndarray of overlap between boxes and query_boxes - """ - N = boxes.shape[0] - K = query_boxes.shape[0] - overlaps = np.zeros((N, K), dtype=boxes.dtype) - for k in range(K): - box_area = (query_boxes[k, 2] - query_boxes[k, 0] + eps) * ( - query_boxes[k, 3] - query_boxes[k, 1] + eps - ) - for n in range(N): - iw = ( - min(boxes[n, 2], query_boxes[k, 2]) - - max(boxes[n, 0], query_boxes[k, 0]) - + eps - ) - if iw > 0: - ih = ( - min(boxes[n, 3], query_boxes[k, 3]) - - max(boxes[n, 1], query_boxes[k, 1]) - + eps - ) - if ih > 0: - ua = ( - (boxes[n, 2] - boxes[n, 0] + eps) - * (boxes[n, 3] - boxes[n, 1] + eps) - + box_area - - iw * ih - ) - overlaps[n, k] = iw * ih / ua - return overlaps - - -@numba.jit(nopython=True) -def iou_3d_jit(boxes, query_boxes, add1=True): - """calculate box iou3d, - ---------- - boxes: (N, 6) ndarray of float - query_boxes: (K, 6) ndarray of float - Returns - ------- - overlaps: (N, K) ndarray of overlap between boxes and query_boxes - """ - N = boxes.shape[0] - K = query_boxes.shape[0] - overlaps = np.zeros((N, K), dtype=boxes.dtype) - if add1: - add1 = 1.0 - else: - add1 = 0.0 - for k in range(K): - box_area = ( - (query_boxes[k, 3] - query_boxes[k, 0] + add1) - * (query_boxes[k, 4] - query_boxes[k, 1] + add1) - * (query_boxes[k, 5] - query_boxes[k, 2] + add1) - ) - for n in range(N): - iw = ( - min(boxes[n, 3], query_boxes[k, 3]) - - max(boxes[n, 0], query_boxes[k, 0]) - + add1 - ) - if iw > 0: - ih = ( - min(boxes[n, 4], query_boxes[k, 4]) - - max(boxes[n, 1], query_boxes[k, 1]) - + add1 - ) - if ih > 0: - il = ( - min(boxes[n, 5], query_boxes[k, 5]) - - max(boxes[n, 2], query_boxes[k, 2]) - + add1 - ) - if il > 0: - ua = float( - (boxes[n, 3] - boxes[n, 0] + add1) - * (boxes[n, 4] - boxes[n, 1] + add1) - * (boxes[n, 5] - boxes[n, 2] + add1) - + box_area - - iw * ih * il - ) - overlaps[n, k] = iw * ih * il / ua - return overlaps - - -@numba.jit(nopython=True) -def iou_nd_jit(boxes, query_boxes, add1=True): - """calculate box iou nd, 2x slower than iou_jit. 
- ---------- - boxes: (N, ndim * 2) ndarray of float - query_boxes: (K, ndim * 2) ndarray of float - Returns - ------- - overlaps: (N, K) ndarray of overlap between boxes and query_boxes - """ - N = boxes.shape[0] - K = query_boxes.shape[0] - ndim = boxes.shape[1] // 2 - overlaps = np.zeros((N, K), dtype=boxes.dtype) - side_lengths = np.zeros((ndim,), dtype=boxes.dtype) - if add1: - add1 = 1.0 - else: - add1 = 0.0 - invalid = False - for k in range(K): - qbox_area = query_boxes[k, ndim] - query_boxes[k, 0] + add1 - for i in range(1, ndim): - qbox_area *= query_boxes[k, ndim + i] - query_boxes[k, i] + add1 - for n in range(N): - invalid = False - for i in range(ndim): - side_length = ( - min(boxes[n, i + ndim], query_boxes[k, i + ndim]) - - max(boxes[n, i], query_boxes[k, i]) - + add1 - ) - if side_length <= 0: - invalid = True - break - side_lengths[i] = side_length - if not invalid: - box_area = boxes[n, ndim] - boxes[n, 0] + add1 - for i in range(1, ndim): - box_area *= boxes[n, ndim + i] - boxes[n, i] + add1 - inter = side_lengths[0] - for i in range(1, ndim): - inter *= side_lengths[i] - # inter = np.prod(side_lengths) - ua = float(box_area + qbox_area - inter) - overlaps[n, k] = inter / ua - - return overlaps - - -def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0.5)): - rbbox_corners = center_to_corner_box3d( - rbbox[:, :3], rbbox[:, 3:6], rbbox[:, -1], origin=origin, axis=z_axis - ) - surfaces = corner_to_surfaces_3d(rbbox_corners) - indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) - return indices - - -def corner_to_surfaces_3d(corners): - """convert 3d box corners from corner function above - to surfaces that normal vectors all direct to internal. - - Args: - corners (float array, [N, 8, 3]): 3d box corners. - Returns: - surfaces (float array, [N, 6, 4, 3]): - """ - # box_corners: [N, 8, 3], must from corner functions in this module - surfaces = np.array( - [ - [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]], - [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]], - [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]], - [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]], - [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]], - [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]], - ] - ).transpose([2, 0, 1, 3]) - return surfaces - - -@numba.jit(nopython=True) -def corner_to_surfaces_3d_jit(corners): - """convert 3d box corners from corner function above - to surfaces that normal vectors all direct to internal. - - Args: - corners (float array, [N, 8, 3]): 3d box corners. - Returns: - surfaces (float array, [N, 6, 4, 3]): - """ - # box_corners: [N, 8, 3], must from corner functions in this module - num_boxes = corners.shape[0] - surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype) - corner_idxes = np.array( - [0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7] - ).reshape(6, 4) - for i in range(num_boxes): - for j in range(6): - for k in range(4): - surfaces[i, j, k] = corners[i, corner_idxes[j, k]] - return surfaces - - -def assign_label_to_voxel(gt_boxes, coors, voxel_size, coors_range): - """assign a 0/1 label to each voxel based on whether - the center of voxel is in gt_box. LIDAR. 
- """ - voxel_size = np.array(voxel_size, dtype=gt_boxes.dtype) - coors_range = np.array(coors_range, dtype=gt_boxes.dtype) - shift = coors_range[:3] - voxel_origins = coors[:, ::-1] * voxel_size + shift - voxel_centers = voxel_origins + voxel_size * 0.5 - gt_box_corners = center_to_corner_box3d( - gt_boxes[:, :3] - voxel_size * 0.5, - gt_boxes[:, 3:6] + voxel_size, - gt_boxes[:, 6], - origin=[0.5, 0.5, 0.5], - axis=2, - ) - gt_surfaces = corner_to_surfaces_3d(gt_box_corners) - ret = points_in_convex_polygon_3d_jit(voxel_centers, gt_surfaces) - return np.any(ret, axis=1).astype(np.int64) - - -def assign_label_to_voxel_v3(gt_boxes, coors, voxel_size, coors_range): - """assign a 0/1 label to each voxel based on whether - the center of voxel is in gt_box. LIDAR. - """ - voxel_size = np.array(voxel_size, dtype=gt_boxes.dtype) - coors_range = np.array(coors_range, dtype=gt_boxes.dtype) - shift = coors_range[:3] - voxel_origins = coors[:, ::-1] * voxel_size + shift - voxel_maxes = voxel_origins + voxel_size - voxel_minmax = np.concatenate([voxel_origins, voxel_maxes], axis=-1) - voxel_corners = minmax_to_corner_3d(voxel_minmax) - gt_box_corners = center_to_corner_box3d( - gt_boxes[:, :3], - gt_boxes[:, 3:6], - gt_boxes[:, 6], - origin=[0.5, 0.5, 0.5], - axis=2, - ) - gt_surfaces = corner_to_surfaces_3d(gt_box_corners) - voxel_corners_flat = voxel_corners.reshape([-1, 3]) - ret = points_in_convex_polygon_3d_jit(voxel_corners_flat, gt_surfaces) - ret = ret.reshape([-1, 8, ret.shape[-1]]) - return ret.any(-1).any(-1).astype(np.int64) - - -def image_box_region_area(img_cumsum, bbox): - """check a 2d voxel is contained by a box. used to filter empty - anchors. - Summed-area table algorithm: - ==> W - ------------------ - | | | - |------A---------B - | | | - | | | - |----- C---------D - Iabcd = ID-IB-IC+IA - Args: - img_cumsum: [M, H, W](yx) cumsumed image. 
- bbox: [N, 4](xyxy) bounding box, - """ - N = bbox.shape[0] - M = img_cumsum.shape[0] - ret = np.zeros([N, M], dtype=img_cumsum.dtype) - ID = img_cumsum[:, bbox[:, 3], bbox[:, 2]] - IA = img_cumsum[:, bbox[:, 1], bbox[:, 0]] - IB = img_cumsum[:, bbox[:, 3], bbox[:, 0]] - IC = img_cumsum[:, bbox[:, 1], bbox[:, 2]] - ret = ID - IB - IC + IA - return ret - - -def get_minimum_bounding_box_bv(points, voxel_size, bound, downsample=8, margin=1.6): - x_vsize = voxel_size[0] - y_vsize = voxel_size[1] - max_x = points[:, 0].max() - max_y = points[:, 1].max() - min_x = points[:, 0].min() - min_y = points[:, 1].min() - max_x = np.floor(max_x / (x_vsize * downsample) + 1) * (x_vsize * downsample) - max_y = np.floor(max_y / (y_vsize * downsample) + 1) * (y_vsize * downsample) - min_x = np.floor(min_x / (x_vsize * downsample)) * (x_vsize * downsample) - min_y = np.floor(min_y / (y_vsize * downsample)) * (y_vsize * downsample) - max_x = np.minimum(max_x + margin, bound[2]) - max_y = np.minimum(max_y + margin, bound[3]) - min_x = np.maximum(min_x - margin, bound[0]) - min_y = np.maximum(min_y - margin, bound[1]) - return np.array([min_x, min_y, max_x, max_y]) - - -def box3d_to_bbox(box3d, rect, Trv2c, P2): - box3d_to_cam = box_lidar_to_camera(box3d, rect, Trv2c) - box_corners = center_to_corner_box3d( - box3d[:, :3], box3d[:, 3:6], box3d[:, 6], [0.5, 1.0, 0.5], axis=1 - ) - box_corners_in_image = project_to_image(box_corners, P2) - # box_corners_in_image: [N, 8, 2] - minxy = np.min(box_corners_in_image, axis=1) - maxxy = np.max(box_corners_in_image, axis=1) - bbox = np.concatenate([minxy, maxxy], axis=1) - return bbox - - -def change_box3d_center_(box3d, src, dst): - dst = np.array(dst, dtype=box3d.dtype) - src = np.array(src, dtype=box3d.dtype) - box3d[..., :3] += box3d[..., 3:6] * (dst - src) +from pathlib import Path + +import numba +import numpy as np +from det3d.core.bbox.geometry import ( + points_count_convex_polygon_3d_jit, + points_in_convex_polygon_3d_jit, +) +try: + from spconv.utils import rbbox_intersection, rbbox_iou +except: + print("Import spconv fail, no support for sparse convolution!") + + +def points_count_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0.5)): + rbbox_corners = center_to_corner_box3d( + rbbox[:, :3], rbbox[:, 3:6], rbbox[:, -1], origin=origin, axis=z_axis + ) + surfaces = corner_to_surfaces_3d(rbbox_corners) + return points_count_convex_polygon_3d_jit(points[:, :3], surfaces) + + +def riou_cc(rbboxes, qrbboxes, standup_thresh=0.0): + # less than 50ms when used in second one thread. 10x slower than gpu + boxes_corners = center_to_corner_box2d( + rbboxes[:, :2], rbboxes[:, 2:4], rbboxes[:, 4] + ) + boxes_standup = corner_to_standup_nd(boxes_corners) + qboxes_corners = center_to_corner_box2d( + qrbboxes[:, :2], qrbboxes[:, 2:4], qrbboxes[:, 4] + ) + qboxes_standup = corner_to_standup_nd(qboxes_corners) + # if standup box not overlapped, rbbox not overlapped too. + standup_iou = iou_jit(boxes_standup, qboxes_standup, eps=0.0) + return rbbox_iou(boxes_corners, qboxes_corners, standup_iou, standup_thresh) + + +def rinter_cc(rbboxes, qrbboxes, standup_thresh=0.0): + # less than 50ms when used in second one thread. 
10x slower than gpu + boxes_corners = center_to_corner_box2d( + rbboxes[:, :2], rbboxes[:, 2:4], rbboxes[:, 4] + ) + boxes_standup = corner_to_standup_nd(boxes_corners) + qboxes_corners = center_to_corner_box2d( + qrbboxes[:, :2], qrbboxes[:, 2:4], qrbboxes[:, 4] + ) + qboxes_standup = corner_to_standup_nd(qboxes_corners) + # if standup box not overlapped, rbbox not overlapped too. + standup_iou = iou_jit(boxes_standup, qboxes_standup, eps=0.0) + return rbbox_intersection( + boxes_corners, qboxes_corners, standup_iou, standup_thresh + ) + + +def corners_nd(dims, origin=0.5): + """generate relative box corners based on length per dim and + origin point. + + Args: + dims (float array, shape=[N, ndim]): array of length per dim + origin (list or array or float): origin point relate to smallest point. + + Returns: + float array, shape=[N, 2 ** ndim, ndim]: returned corners. + point layout example: (2d) x0y0, x0y1, x1y0, x1y1; + (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + where x0 < x1, y0 < y1, z0 < z1 + """ + ndim = int(dims.shape[1]) + corners_norm = np.stack( + np.unravel_index(np.arange(2 ** ndim), [2] * ndim), axis=1 + ).astype(dims.dtype) + # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 + # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + # so need to convert to a format which is convenient to do other computing. + # for 2d boxes, format is clockwise start with minimum point + # for 3d boxes, please draw lines by your hand. + if ndim == 2: + # generate clockwise box corners + corners_norm = corners_norm[[0, 1, 3, 2]] + elif ndim == 3: + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) + corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape([1, 2 ** ndim, ndim]) + return corners + + +@numba.njit +def corners_2d_jit(dims, origin=0.5): + ndim = 2 + corners_norm = np.array([[0, 0], [0, 1], [1, 1], [1, 0]], dtype=dims.dtype) + corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) + corners = dims.reshape((-1, 1, ndim)) * corners_norm.reshape((1, 2 ** ndim, ndim)) + return corners + + +@numba.njit +def corners_3d_jit(dims, origin=0.5): + ndim = 3 + corners_norm = np.array( + [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1], + dtype=dims.dtype, + ).reshape((8, 3)) + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) + corners = dims.reshape((-1, 1, ndim)) * corners_norm.reshape((1, 2 ** ndim, ndim)) + return corners + + +@numba.njit +def corner_to_standup_nd_jit(boxes_corner): + num_boxes = boxes_corner.shape[0] + ndim = boxes_corner.shape[-1] + result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype) + for i in range(num_boxes): + for j in range(ndim): + result[i, j] = np.min(boxes_corner[i, :, j]) + for j in range(ndim): + result[i, j + ndim] = np.max(boxes_corner[i, :, j]) + return result + + +def corner_to_standup_nd(boxes_corner): + assert len(boxes_corner.shape) == 3 + standup_boxes = [] + standup_boxes.append(np.min(boxes_corner, axis=1)) + standup_boxes.append(np.max(boxes_corner, axis=1)) + return np.concatenate(standup_boxes, -1) + + +def rbbox2d_to_near_bbox(rbboxes): + """convert rotated bbox to nearest 'standing' or 'lying' bbox. 
+ Args: + rbboxes: [N, 5(x, y, xdim, ydim, rad)] rotated bboxes + Returns: + bboxes: [N, 4(xmin, ymin, xmax, ymax)] bboxes + """ + rots = rbboxes[..., -1] + rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi)) + cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis] + bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) + bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) + return bboxes + + +def rotation_3d_in_axis(points, angles, axis=0): + # points: [N, point_size, 3] + rot_sin = np.sin(angles) + rot_cos = np.cos(angles) + ones = np.ones_like(rot_cos) + zeros = np.zeros_like(rot_cos) + if axis == 1: + rot_mat_T = np.stack( + [ + [rot_cos, zeros, -rot_sin], + [zeros, ones, zeros], + [rot_sin, zeros, rot_cos], + ] + ) + elif axis == 2 or axis == -1: + rot_mat_T = np.stack( + [ + [rot_cos, -rot_sin, zeros], + [rot_sin, rot_cos, zeros], + [zeros, zeros, ones], + ] + ) + elif axis == 0: + rot_mat_T = np.stack( + [ + [zeros, rot_cos, -rot_sin], + [zeros, rot_sin, rot_cos], + [ones, zeros, zeros], + ] + ) + else: + raise ValueError("axis should in range") + + return np.einsum("aij,jka->aik", points, rot_mat_T) + + +def rotation_points_single_angle(points, angle, axis=0): + # points: [N, 3] + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + if axis == 1: + rot_mat_T = np.array( + [[rot_cos, 0, -rot_sin], [0, 1, 0], [rot_sin, 0, rot_cos]], + dtype=points.dtype, + ) + elif axis == 2 or axis == -1: + rot_mat_T = np.array( + [[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]], + dtype=points.dtype, + ) + elif axis == 0: + rot_mat_T = np.array( + [[1, 0, 0], [0, rot_cos, -rot_sin], [0, rot_sin, rot_cos]], + dtype=points.dtype, + ) + else: + raise ValueError("axis should in range") + + return points @ rot_mat_T + + +def rotation_2d(points, angles): + """rotation 2d points based on origin point clockwise when angle positive. + + Args: + points (float array, shape=[N, point_size, 2]): points to be rotated. + angles (float array, shape=[N]): rotation angle. + + Returns: + float array: same shape as points + """ + rot_sin = np.sin(angles) + rot_cos = np.cos(angles) + rot_mat_T = np.stack([[rot_cos, -rot_sin], [rot_sin, rot_cos]]) + return np.einsum("aij,jka->aik", points, rot_mat_T) + + +def rotation_box(box_corners, angle): + """rotation 2d points based on origin point clockwise when angle positive. + + Args: + points (float array, shape=[N, point_size, 2]): points to be rotated. + angle (float): rotation angle. + + Returns: + float array: same shape as points + """ + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + rot_mat_T = np.array( + [[rot_cos, -rot_sin], [rot_sin, rot_cos]], dtype=box_corners.dtype + ) + return box_corners @ rot_mat_T + + +def center_to_corner_box3d(centers, dims, angles=None, origin=(0.5, 0.5, 0.5), axis=2): + """convert kitti locations, dimensions and angles to corners + + Args: + centers (float array, shape=[N, 3]): locations in kitti label file. + dims (float array, shape=[N, 3]): dimensions in kitti label file. + angles (float array, shape=[N]): rotation_y in kitti label file. + origin (list or array or float): origin point relate to smallest point. + use [0.5, 1.0, 0.5] in camera and [0.5, 0.5, 0] in lidar. + axis (int): rotation axis. 1 for camera and 2 for lidar. + Returns: + [type]: [description] + """ + # 'length' in kitti format is in x axis. + # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. 
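The origin argument fixes where the given center sits inside the box: (0.5, 0.5, 0.5) is the geometric centre, while the lidar convention (0.5, 0.5, 0) places the center on the bottom face so corners extend from z to z + h. A worked example of the unrotated case with illustrative numbers:

import numpy as np

center = np.array([10.0, 2.0, -1.0])
dims = np.array([4.0, 2.0, 1.5])
origin = np.array([0.5, 0.5, 0.0])        # lidar-style bottom-center origin
lo = center - dims * origin               # minimum corner: [8, 1, -1]
hi = center + dims * (1.0 - origin)       # maximum corner: [12, 3, 0.5]
print(lo, hi)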
+ corners = corners_nd(dims, origin=origin) + # corners: [N, 8, 3] + if angles is not None: + corners = rotation_3d_in_axis(corners, angles, axis=axis) + corners += centers.reshape([-1, 1, 3]) + return corners + + +def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): + """convert kitti locations, dimensions and angles to corners. + format: center(xy), dims(xy), angles(clockwise when positive) + + Args: + centers (float array, shape=[N, 2]): locations in kitti label file. + dims (float array, shape=[N, 2]): dimensions in kitti label file. + angles (float array, shape=[N]): rotation_y in kitti label file. + + Returns: + [type]: [description] + """ + # 'length' in kitti format is in x axis. + # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. + corners = corners_nd(dims, origin=origin) + # corners: [N, 4, 2] + if angles is not None: + corners = rotation_2d(corners, angles) + corners += centers.reshape([-1, 1, 2]) + return corners + + +@numba.jit(nopython=True) +def box2d_to_corner_jit(boxes): + num_box = boxes.shape[0] + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape(1, 4, 2) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) + for i in range(num_box): + rot_sin = np.sin(boxes[i, -1]) + rot_cos = np.cos(boxes[i, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] + return box_corners + + +def rbbox3d_to_corners(rbboxes, origin=[0.5, 0.5, 0.5], axis=2): + return center_to_corner_box3d( + rbboxes[..., :3], rbboxes[..., 3:6], rbboxes[..., 6], origin, axis=axis + ) + + +def rbbox3d_to_bev_corners(rbboxes, origin=0.5): + return center_to_corner_box2d( + rbboxes[..., :2], rbboxes[..., 3:5], rbboxes[..., 6], origin + ) + + +def minmax_to_corner_2d(minmax_box): + ndim = minmax_box.shape[-1] // 2 + center = minmax_box[..., :ndim] + dims = minmax_box[..., ndim:] - center + return center_to_corner_box2d(center, dims, origin=0.0) + + +def minmax_to_corner_2d_v2(minmax_box): + # N, 4 -> N 4 2 + return minmax_box[..., [0, 1, 0, 3, 2, 3, 2, 1]].reshape(-1, 4, 2) + + +def minmax_to_corner_3d(minmax_box): + ndim = minmax_box.shape[-1] // 2 + center = minmax_box[..., :ndim] + dims = minmax_box[..., ndim:] - center + return center_to_corner_box3d(center, dims, origin=0.0) + + +def minmax_to_center_2d(minmax_box): + ndim = minmax_box.shape[-1] // 2 + center_min = minmax_box[..., :ndim] + dims = minmax_box[..., ndim:] - center_min + center = center_min + 0.5 * dims + return np.concatenate([center, dims], axis=-1) + + +def center_to_minmax_2d_0_5(centers, dims): + return np.concatenate([centers - dims / 2, centers + dims / 2], axis=-1) + + +def center_to_minmax_2d(centers, dims, origin=0.5): + if origin == 0.5: + return center_to_minmax_2d_0_5(centers, dims) + corners = center_to_corner_box2d(centers, dims, origin=origin) + return corners[:, [0, 2]].reshape([-1, 4]) + + +def limit_period(val, offset=0.5, period=np.pi): + return val - np.floor(val / period + offset) * period + + +def projection_matrix_to_CRT_kitti(proj): + # P = C @ [R|T] + # C is upper triangular matrix, so we need to inverse CR and use QR + # stable for all kitti 
camera projection matrix + CR = proj[0:3, 0:3] + CT = proj[0:3, 3] + RinvCinv = np.linalg.inv(CR) + Rinv, Cinv = np.linalg.qr(RinvCinv) + C = np.linalg.inv(Cinv) + R = np.linalg.inv(Rinv) + T = Cinv @ CT + return C, R, T + + +def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100): + fku = C[0, 0] + fkv = -C[1, 1] + u0v0 = C[0:2, 2] + z_points = np.array([near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis] + b = bbox_image + box_corners = np.array( + [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]], dtype=C.dtype + ) + near_box_corners = (box_corners - u0v0) / np.array( + [fku / near_clip, -fkv / near_clip], dtype=C.dtype + ) + far_box_corners = (box_corners - u0v0) / np.array( + [fku / far_clip, -fkv / far_clip], dtype=C.dtype + ) + ret_xy = np.concatenate([near_box_corners, far_box_corners], axis=0) # [8, 2] + ret_xyz = np.concatenate([ret_xy, z_points], axis=1) + return ret_xyz + + +def get_frustum_v2(bboxes, C, near_clip=0.001, far_clip=100): + fku = C[0, 0] + fkv = -C[1, 1] + u0v0 = C[0:2, 2] + num_box = bboxes.shape[0] + z_points = np.array([near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[ + np.newaxis, :, np.newaxis + ] + z_points = np.tile(z_points, [num_box, 1, 1]) + box_corners = minmax_to_corner_2d_v2(bboxes) + near_box_corners = (box_corners - u0v0) / np.array( + [fku / near_clip, -fkv / near_clip], dtype=C.dtype + ) + far_box_corners = (box_corners - u0v0) / np.array( + [fku / far_clip, -fkv / far_clip], dtype=C.dtype + ) + ret_xy = np.concatenate([near_box_corners, far_box_corners], axis=1) # [8, 2] + ret_xyz = np.concatenate([ret_xy, z_points], axis=-1) + return ret_xyz + + +@numba.njit +def _add_rgb_to_points_kernel(points_2d, image, points_rgb): + num_points = points_2d.shape[0] + image_h, image_w = image.shape[:2] + for i in range(num_points): + img_pos = np.floor(points_2d[i]).astype(np.int32) + if img_pos[0] >= 0 and img_pos[0] < image_w: + if img_pos[1] >= 0 and img_pos[1] < image_h: + points_rgb[i, :] = image[img_pos[1], img_pos[0], :] + # image[img_pos[1], img_pos[0]] = 0 + + +def add_rgb_to_points(points, image, rect, Trv2c, P2, mean_size=[5, 5]): + kernel = np.ones(mean_size, np.float32) / np.prod(mean_size) + # image = cv2.filter2D(image, -1, kernel) + points_cam = lidar_to_camera(points[:, :3], rect, Trv2c) + points_2d = project_to_image(points_cam, P2) + points_rgb = np.zeros([points_cam.shape[0], 3], dtype=points.dtype) + _add_rgb_to_points_kernel(points_2d, image, points_rgb) + return points_rgb + + +def project_to_image(points_3d, proj_mat): + points_shape = list(points_3d.shape) + points_shape[-1] = 1 + points_4 = np.concatenate([points_3d, np.ones(points_shape)], axis=-1) + point_2d = points_4 @ proj_mat.T + point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] + return point_2d_res + + +def camera_to_lidar(points, r_rect, velo2cam): + points_shape = list(points.shape[0:-1]) + if points.shape[-1] == 3: + points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1) + lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T) + return lidar_points[..., :3] + + +def lidar_to_camera(points, r_rect, velo2cam): + points_shape = list(points.shape[:-1]) + if points.shape[-1] == 3: + points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1) + camera_points = points @ (r_rect @ velo2cam).T + return camera_points[..., :3] + + +def box_camera_to_lidar(data, r_rect, velo2cam): + xyz = data[:, 0:3] + l, h, w = data[:, 3:4], data[:, 4:5], data[:, 5:6] + r = data[:, 6:7] + xyz_lidar = camera_to_lidar(xyz, r_rect, 
velo2cam) + return np.concatenate([xyz_lidar, w, l, h, r], axis=1) + + +def box_lidar_to_camera(data, r_rect, velo2cam): + xyz_lidar = data[:, 0:3] + w, l, h = data[:, 3:4], data[:, 4:5], data[:, 5:6] + r = data[:, 6:7] + xyz = lidar_to_camera(xyz_lidar, r_rect, velo2cam) + return np.concatenate([xyz, l, h, w, r], axis=1) + + +def remove_outside_points(points, rect, Trv2c, P2, image_shape): + # 5x faster than remove_outside_points_v1(2ms vs 10ms) + C, R, T = projection_matrix_to_CRT_kitti(P2) + image_bbox = [0, 0, image_shape[1], image_shape[0]] + frustum = get_frustum(image_bbox, C) + frustum -= T + frustum = np.linalg.inv(R) @ frustum.T + frustum = camera_to_lidar(frustum.T, rect, Trv2c) + frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...]) + indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces) + points = points[indices.reshape([-1])] + return points + + +@numba.jit(nopython=True) +def iou_jit(boxes, query_boxes, eps=1.0): + """calculate box iou. note that jit version runs 2x faster than cython in + my machine! + Parameters + ---------- + boxes: (N, 4) ndarray of float + query_boxes: (K, 4) ndarray of float + Returns + ------- + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + box_area = (query_boxes[k, 2] - query_boxes[k, 0] + eps) * ( + query_boxes[k, 3] - query_boxes[k, 1] + eps + ) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) + - max(boxes[n, 0], query_boxes[k, 0]) + + eps + ) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) + - max(boxes[n, 1], query_boxes[k, 1]) + + eps + ) + if ih > 0: + ua = ( + (boxes[n, 2] - boxes[n, 0] + eps) + * (boxes[n, 3] - boxes[n, 1] + eps) + + box_area + - iw * ih + ) + overlaps[n, k] = iw * ih / ua + return overlaps + + +@numba.jit(nopython=True) +def iou_3d_jit(boxes, query_boxes, add1=True): + """calculate box iou3d, + ---------- + boxes: (N, 6) ndarray of float + query_boxes: (K, 6) ndarray of float + Returns + ------- + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + if add1: + add1 = 1.0 + else: + add1 = 0.0 + for k in range(K): + box_area = ( + (query_boxes[k, 3] - query_boxes[k, 0] + add1) + * (query_boxes[k, 4] - query_boxes[k, 1] + add1) + * (query_boxes[k, 5] - query_boxes[k, 2] + add1) + ) + for n in range(N): + iw = ( + min(boxes[n, 3], query_boxes[k, 3]) + - max(boxes[n, 0], query_boxes[k, 0]) + + add1 + ) + if iw > 0: + ih = ( + min(boxes[n, 4], query_boxes[k, 4]) + - max(boxes[n, 1], query_boxes[k, 1]) + + add1 + ) + if ih > 0: + il = ( + min(boxes[n, 5], query_boxes[k, 5]) + - max(boxes[n, 2], query_boxes[k, 2]) + + add1 + ) + if il > 0: + ua = float( + (boxes[n, 3] - boxes[n, 0] + add1) + * (boxes[n, 4] - boxes[n, 1] + add1) + * (boxes[n, 5] - boxes[n, 2] + add1) + + box_area + - iw * ih * il + ) + overlaps[n, k] = iw * ih * il / ua + return overlaps + + +@numba.jit(nopython=True) +def iou_nd_jit(boxes, query_boxes, add1=True): + """calculate box iou nd, 2x slower than iou_jit. 
+ ---------- + boxes: (N, ndim * 2) ndarray of float + query_boxes: (K, ndim * 2) ndarray of float + Returns + ------- + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + N = boxes.shape[0] + K = query_boxes.shape[0] + ndim = boxes.shape[1] // 2 + overlaps = np.zeros((N, K), dtype=boxes.dtype) + side_lengths = np.zeros((ndim,), dtype=boxes.dtype) + if add1: + add1 = 1.0 + else: + add1 = 0.0 + invalid = False + for k in range(K): + qbox_area = query_boxes[k, ndim] - query_boxes[k, 0] + add1 + for i in range(1, ndim): + qbox_area *= query_boxes[k, ndim + i] - query_boxes[k, i] + add1 + for n in range(N): + invalid = False + for i in range(ndim): + side_length = ( + min(boxes[n, i + ndim], query_boxes[k, i + ndim]) + - max(boxes[n, i], query_boxes[k, i]) + + add1 + ) + if side_length <= 0: + invalid = True + break + side_lengths[i] = side_length + if not invalid: + box_area = boxes[n, ndim] - boxes[n, 0] + add1 + for i in range(1, ndim): + box_area *= boxes[n, ndim + i] - boxes[n, i] + add1 + inter = side_lengths[0] + for i in range(1, ndim): + inter *= side_lengths[i] + # inter = np.prod(side_lengths) + ua = float(box_area + qbox_area - inter) + overlaps[n, k] = inter / ua + + return overlaps + + +def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0.5)): + rbbox_corners = center_to_corner_box3d( + rbbox[:, :3], rbbox[:, 3:6], rbbox[:, -1], origin=origin, axis=z_axis + ) + surfaces = corner_to_surfaces_3d(rbbox_corners) + indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) + return indices + + +def corner_to_surfaces_3d(corners): + """convert 3d box corners from corner function above + to surfaces that normal vectors all direct to internal. + + Args: + corners (float array, [N, 8, 3]): 3d box corners. + Returns: + surfaces (float array, [N, 6, 4, 3]): + """ + # box_corners: [N, 8, 3], must from corner functions in this module + surfaces = np.array( + [ + [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]], + [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]], + [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]], + [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]], + [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]], + [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]], + ] + ).transpose([2, 0, 1, 3]) + return surfaces + + +@numba.jit(nopython=True) +def corner_to_surfaces_3d_jit(corners): + """convert 3d box corners from corner function above + to surfaces that normal vectors all direct to internal. + + Args: + corners (float array, [N, 8, 3]): 3d box corners. + Returns: + surfaces (float array, [N, 6, 4, 3]): + """ + # box_corners: [N, 8, 3], must from corner functions in this module + num_boxes = corners.shape[0] + surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype) + corner_idxes = np.array( + [0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7] + ).reshape(6, 4) + for i in range(num_boxes): + for j in range(6): + for k in range(4): + surfaces[i, j, k] = corners[i, corner_idxes[j, k]] + return surfaces + + +def assign_label_to_voxel(gt_boxes, coors, voxel_size, coors_range): + """assign a 0/1 label to each voxel based on whether + the center of voxel is in gt_box. LIDAR. 
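Illustrative sketch (not part of the patch): a typical use of points_in_rbbox, which chains center_to_corner_box3d -> corner_to_surfaces_3d -> points_in_convex_polygon_3d_jit as defined above. The import path is taken from this patch; the box and point values are made up.

import numpy as np
from det3d.core.bbox import box_np_ops

# One rotated LiDAR box as [x, y, z, w, l, h, yaw], default origin (0.5, 0.5, 0.5).
boxes = np.array([[0.0, 0.0, 0.0, 2.0, 4.0, 1.5, np.pi / 6]], dtype=np.float32)
points = np.random.uniform(-5, 5, size=(1000, 4)).astype(np.float32)  # x, y, z, intensity

mask = box_np_ops.points_in_rbbox(points, boxes)   # (num_points, num_boxes) bool
inside = points[mask.any(axis=1)]                  # points falling inside at least one box
print(inside.shape)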
+ """ + voxel_size = np.array(voxel_size, dtype=gt_boxes.dtype) + coors_range = np.array(coors_range, dtype=gt_boxes.dtype) + shift = coors_range[:3] + voxel_origins = coors[:, ::-1] * voxel_size + shift + voxel_centers = voxel_origins + voxel_size * 0.5 + gt_box_corners = center_to_corner_box3d( + gt_boxes[:, :3] - voxel_size * 0.5, + gt_boxes[:, 3:6] + voxel_size, + gt_boxes[:, 6], + origin=[0.5, 0.5, 0.5], + axis=2, + ) + gt_surfaces = corner_to_surfaces_3d(gt_box_corners) + ret = points_in_convex_polygon_3d_jit(voxel_centers, gt_surfaces) + return np.any(ret, axis=1).astype(np.int64) + + +def assign_label_to_voxel_v3(gt_boxes, coors, voxel_size, coors_range): + """assign a 0/1 label to each voxel based on whether + the center of voxel is in gt_box. LIDAR. + """ + voxel_size = np.array(voxel_size, dtype=gt_boxes.dtype) + coors_range = np.array(coors_range, dtype=gt_boxes.dtype) + shift = coors_range[:3] + voxel_origins = coors[:, ::-1] * voxel_size + shift + voxel_maxes = voxel_origins + voxel_size + voxel_minmax = np.concatenate([voxel_origins, voxel_maxes], axis=-1) + voxel_corners = minmax_to_corner_3d(voxel_minmax) + gt_box_corners = center_to_corner_box3d( + gt_boxes[:, :3], + gt_boxes[:, 3:6], + gt_boxes[:, 6], + origin=[0.5, 0.5, 0.5], + axis=2, + ) + gt_surfaces = corner_to_surfaces_3d(gt_box_corners) + voxel_corners_flat = voxel_corners.reshape([-1, 3]) + ret = points_in_convex_polygon_3d_jit(voxel_corners_flat, gt_surfaces) + ret = ret.reshape([-1, 8, ret.shape[-1]]) + return ret.any(-1).any(-1).astype(np.int64) + + +def image_box_region_area(img_cumsum, bbox): + """check a 2d voxel is contained by a box. used to filter empty + anchors. + Summed-area table algorithm: + ==> W + ------------------ + | | | + |------A---------B + | | | + | | | + |----- C---------D + Iabcd = ID-IB-IC+IA + Args: + img_cumsum: [M, H, W](yx) cumsumed image. 
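Illustrative sketch (not part of the patch): the four-corner identity Iabcd = ID - IB - IC + IA from the docstring above, checked with a tiny NumPy integral image. This sketch pads the table with a zero row/column for clarity, whereas image_box_region_area indexes the cumulative sum directly.

import numpy as np

img = np.random.rand(6, 8)                      # H x W image
sat = img.cumsum(axis=0).cumsum(axis=1)         # summed-area table (inclusive)
sat = np.pad(sat, ((1, 0), (1, 0)))             # index 0 now means "empty prefix"

def box_sum(sat, x1, y1, x2, y2):
    # Sum of img[y1:y2, x1:x2] in O(1): I_D - I_B - I_C + I_A.
    return sat[y2, x2] - sat[y2, x1] - sat[y1, x2] + sat[y1, x1]

assert np.isclose(box_sum(sat, 2, 1, 7, 5), img[1:5, 2:7].sum())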
+ bbox: [N, 4](xyxy) bounding box, + """ + N = bbox.shape[0] + M = img_cumsum.shape[0] + ret = np.zeros([N, M], dtype=img_cumsum.dtype) + ID = img_cumsum[:, bbox[:, 3], bbox[:, 2]] + IA = img_cumsum[:, bbox[:, 1], bbox[:, 0]] + IB = img_cumsum[:, bbox[:, 3], bbox[:, 0]] + IC = img_cumsum[:, bbox[:, 1], bbox[:, 2]] + ret = ID - IB - IC + IA + return ret + + +def get_minimum_bounding_box_bv(points, voxel_size, bound, downsample=8, margin=1.6): + x_vsize = voxel_size[0] + y_vsize = voxel_size[1] + max_x = points[:, 0].max() + max_y = points[:, 1].max() + min_x = points[:, 0].min() + min_y = points[:, 1].min() + max_x = np.floor(max_x / (x_vsize * downsample) + 1) * (x_vsize * downsample) + max_y = np.floor(max_y / (y_vsize * downsample) + 1) * (y_vsize * downsample) + min_x = np.floor(min_x / (x_vsize * downsample)) * (x_vsize * downsample) + min_y = np.floor(min_y / (y_vsize * downsample)) * (y_vsize * downsample) + max_x = np.minimum(max_x + margin, bound[2]) + max_y = np.minimum(max_y + margin, bound[3]) + min_x = np.maximum(min_x - margin, bound[0]) + min_y = np.maximum(min_y - margin, bound[1]) + return np.array([min_x, min_y, max_x, max_y]) + + +def box3d_to_bbox(box3d, rect, Trv2c, P2): + box3d_to_cam = box_lidar_to_camera(box3d, rect, Trv2c) + box_corners = center_to_corner_box3d( + box3d[:, :3], box3d[:, 3:6], box3d[:, 6], [0.5, 1.0, 0.5], axis=1 + ) + box_corners_in_image = project_to_image(box_corners, P2) + # box_corners_in_image: [N, 8, 2] + minxy = np.min(box_corners_in_image, axis=1) + maxxy = np.max(box_corners_in_image, axis=1) + bbox = np.concatenate([minxy, maxxy], axis=1) + return bbox + + +def change_box3d_center_(box3d, src, dst): + dst = np.array(dst, dtype=box3d.dtype) + src = np.array(src, dtype=box3d.dtype) + box3d[..., :3] += box3d[..., 3:6] * (dst - src) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/core/bbox/geometry.py b/cv/3d_detection/centerpoint/pytorch/det3d/core/bbox/geometry.py index a62ee7ba2..d4ea9355f 100644 --- a/cv/3d_detection/centerpoint/pytorch/det3d/core/bbox/geometry.py +++ b/cv/3d_detection/centerpoint/pytorch/det3d/core/bbox/geometry.py @@ -1,457 +1,457 @@ -import numba -import numpy as np - - -@numba.njit -def _points_count_convex_polygon_3d_jit( - points, polygon_surfaces, normal_vec, d, num_surfaces=None -): - """count points in 3d convex polygons. - Args: - points: [num_points, 3] array. - polygon_surfaces: [num_polygon, max_num_surfaces, - max_num_points_of_surface, 3] - array. all surfaces' normal vector must direct to internal. - max_num_points_of_surface must at least 3. - num_surfaces: [num_polygon] array. indicate how many surfaces - a polygon contain - Returns: - [num_polygon] array. - """ - max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] - num_points = points.shape[0] - num_polygons = polygon_surfaces.shape[0] - ret = np.full((num_polygons,), num_points, dtype=np.int64) - sign = 0.0 - for i in range(num_points): - for j in range(num_polygons): - for k in range(max_num_surfaces): - if k > num_surfaces[j]: - break - sign = ( - points[i, 0] * normal_vec[j, k, 0] - + points[i, 1] * normal_vec[j, k, 1] - + points[i, 2] * normal_vec[j, k, 2] - + d[j, k] - ) - if sign >= 0: - ret[j] -= 1 - break - return ret - - -def points_count_convex_polygon_3d_jit(points, polygon_surfaces, num_surfaces=None): - """check points is in 3d convex polygons. - Args: - points: [num_points, 3] array. - polygon_surfaces: [num_polygon, max_num_surfaces, - max_num_points_of_surface, 3] - array. 
all surfaces' normal vector must direct to internal. - max_num_points_of_surface must at least 3. - num_surfaces: [num_polygon] array. indicate how many surfaces - a polygon contain - Returns: - [num_polygon] array. - """ - max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] - num_points = points.shape[0] - num_polygons = polygon_surfaces.shape[0] - if num_surfaces is None: - num_surfaces = np.full((num_polygons,), 9999999, dtype=np.int64) - normal_vec, d = surface_equ_3d_jitv2(polygon_surfaces[:, :, :3, :]) - # normal_vec: [num_polygon, max_num_surfaces, 3] - # d: [num_polygon, max_num_surfaces] - return _points_count_convex_polygon_3d_jit( - points, polygon_surfaces, normal_vec, d, num_surfaces - ) - - -@numba.njit -def is_line_segment_intersection_jit(lines1, lines2): - """check if line segments1 and line segments2 have cross point - - Args: - lines1 (float, [N, 2, 2]): [description] - lines2 (float, [M, 2, 2]): [description] - - Returns: - [type]: [description] - """ - - # Return true if line segments AB and CD intersect - N = lines1.shape[0] - M = lines2.shape[0] - ret = np.zeros((N, M), dtype=np.bool_) - for i in range(N): - for j in range(M): - A = lines1[i, 0] - B = lines1[i, 1] - C = lines2[j, 0] - D = lines2[j, 1] - acd = (D[1] - A[1]) * (C[0] - A[0]) > (C[1] - A[1]) * (D[0] - A[0]) - bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0]) - if acd != bcd: - abc = (C[1] - A[1]) * (B[0] - A[0]) > (B[1] - A[1]) * (C[0] - A[0]) - abd = (D[1] - A[1]) * (B[0] - A[0]) > (B[1] - A[1]) * (D[0] - A[0]) - if abc != abd: - ret[i, j] = True - return ret - - -@numba.njit -def line_segment_intersection(line1, line2, intersection): - A = line1[0] - B = line1[1] - C = line2[0] - D = line2[1] - BA0 = B[0] - A[0] - BA1 = B[1] - A[1] - DA0 = D[0] - A[0] - CA0 = C[0] - A[0] - DA1 = D[1] - A[1] - CA1 = C[1] - A[1] - acd = DA1 * CA0 > CA1 * DA0 - bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0]) - if acd != bcd: - abc = CA1 * BA0 > BA1 * CA0 - abd = DA1 * BA0 > BA1 * DA0 - if abc != abd: - DC0 = D[0] - C[0] - DC1 = D[1] - C[1] - ABBA = A[0] * B[1] - B[0] * A[1] - CDDC = C[0] * D[1] - D[0] * C[1] - DH = BA1 * DC0 - BA0 * DC1 - intersection[0] = (ABBA * DC0 - BA0 * CDDC) / DH - intersection[1] = (ABBA * DC1 - BA1 * CDDC) / DH - return True - return False - - -def _ccw(A, B, C): - return (C[..., 1] - A[..., 1]) * (B[..., 0] - A[..., 0]) > ( - B[..., 1] - A[..., 1] - ) * (C[..., 0] - A[..., 0]) - - -def is_line_segment_cross(lines1, lines2): - # 10x slower than jit version with 1000-1000 random lines input. 
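Illustrative sketch (not part of the patch): the segment tests above all reduce to the counter-clockwise (CCW) orientation predicate; AB and CD properly intersect iff C and D lie on opposite sides of AB and A and B lie on opposite sides of CD. A standalone version with made-up endpoints:

def ccw(a, b, c):
    # True if the turn a -> b -> c is counter-clockwise (same predicate as _ccw above).
    return (c[1] - a[1]) * (b[0] - a[0]) > (b[1] - a[1]) * (c[0] - a[0])

def segments_intersect(a, b, c, d):
    return ccw(a, c, d) != ccw(b, c, d) and ccw(a, b, c) != ccw(a, b, d)

print(segments_intersect((0, 0), (2, 2), (0, 2), (2, 0)))   # True: they cross at (1, 1)
print(segments_intersect((0, 0), (1, 1), (2, 2), (3, 3)))   # False: disjoint collinear segments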
- # lines1, [N, 2, 2] - # lines2, [M, 2, 2] - A = lines1[:, 0, :][:, np.newaxis, :] - B = lines1[:, 1, :][:, np.newaxis, :] - C = lines2[:, 0, :][np.newaxis, :, :] - D = lines2[:, 1, :][np.newaxis, :, :] - return np.logical_and( - _ccw(A, C, D) != _ccw(B, C, D), _ccw(A, B, C) != _ccw(A, B, D) - ) - - -@numba.jit(nopython=False) -def surface_equ_3d_jit(polygon_surfaces): - # return [a, b, c], d in ax+by+cz+d=0 - # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3] - surface_v = polygon_surfaces[:, :, :2, :] - polygon_surfaces[:, :, 1:3, :] - # normal_vec: [..., 3] - normal_v = np.cross(surface_v[:, :, 0, :], surface_v[:, :, 1, :]) - # print(normal_vec.shape, points[..., 0, :].shape) - # d = -np.inner(normal_vec, points[..., 0, :]) - d = np.einsum("aij, aij->ai", normal_v, polygon_surfaces[:, :, 0, :]) - return normal_vec, -d - - -@numba.jit(nopython=False) -def points_in_convex_polygon_3d_jit_v1(points, polygon_surfaces, num_surfaces=None): - """check points is in 3d convex polygons. - Args: - points: [num_points, 3] array. - polygon_surfaces: [num_polygon, max_num_surfaces, - max_num_points_of_surface, 3] - array. all surfaces' normal vector must direct to internal. - max_num_points_of_surface must at least 3. - num_surfaces: [num_polygon] array. indicate how many surfaces - a polygon contain - Returns: - [num_points, num_polygon] bool array. - """ - max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] - num_points = points.shape[0] - num_polygons = polygon_surfaces.shape[0] - if num_surfaces is None: - num_surfaces = np.full((num_polygons,), 9999999, dtype=np.int64) - normal_vec, d = surface_equ_3d_jit(polygon_surfaces[:, :, :3, :]) - # normal_vec: [num_polygon, max_num_surfaces, 3] - # d: [num_polygon, max_num_surfaces] - ret = np.ones((num_points, num_polygons), dtype=np.bool_) - sign = 0.0 - for i in range(num_points): - for j in range(num_polygons): - for k in range(max_num_surfaces): - if k > num_surfaces[j]: - break - sign = ( - points[i, 0] * normal_vec[j, k, 0] - + points[i, 1] * normal_vec[j, k, 1] - + points[i, 2] * normal_vec[j, k, 2] - + d[j, k] - ) - if sign >= 0: - ret[i, j] = False - break - return ret - - -def surface_equ_3d(polygon_surfaces): - # return [a, b, c], d in ax+by+cz+d=0 - # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3] - surface_v = polygon_surfaces[:, :, :2, :] - polygon_surfaces[:, :, 1:3, :] - # normal_vec: [..., 3] - normal_v = np.cross(surface_v[:, :, 0, :], surface_v[:, :, 1, :]) - # print(normal_vec.shape, points[..., 0, :].shape) - # d = -np.inner(normal_vec, points[..., 0, :]) - d = np.einsum("aij, aij->ai", normal_v, polygon_surfaces[:, :, 0, :]) - return normal_v, -d - - -def points_in_convex_polygon_3d_jit(points, polygon_surfaces, num_surfaces=None): - """check points is in 3d convex polygons. - Args: - points: [num_points, 3] array. - polygon_surfaces: [num_polygon, max_num_surfaces, - max_num_points_of_surface, 3] - array. all surfaces' normal vector must direct to internal. - max_num_points_of_surface must at least 3. - num_surfaces: [num_polygon] array. indicate how many surfaces - a polygon contain - Returns: - [num_points, num_polygon] bool array. 
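Illustrative sketch (not part of the patch): the *_convex_polygon_3d kernels rest on the same half-space idea behind surface_equ_3d_jit, i.e. a convex volume is the intersection of its face planes, and a point is inside iff it sits on the interior side of every plane. The face list and the sign convention below are chosen for this sketch, not copied from the kernels.

import numpy as np

# Unit cube [0, 1]^3 as six face planes, each given as (inward_normal, point_on_plane).
faces = [
    (np.array([ 1.,  0.,  0.]), np.array([0., 0., 0.])),   # x = 0
    (np.array([-1.,  0.,  0.]), np.array([1., 0., 0.])),   # x = 1
    (np.array([ 0.,  1.,  0.]), np.array([0., 0., 0.])),   # y = 0
    (np.array([ 0., -1.,  0.]), np.array([0., 1., 0.])),   # y = 1
    (np.array([ 0.,  0.,  1.]), np.array([0., 0., 0.])),   # z = 0
    (np.array([ 0.,  0., -1.]), np.array([0., 0., 1.])),   # z = 1
]

def inside(p):
    # On the inward side of every face plane: n . (p - p0) >= 0 for all faces.
    return all(n @ (p - p0) >= 0 for n, p0 in faces)

print(inside(np.array([0.5, 0.5, 0.5])))   # True
print(inside(np.array([1.5, 0.5, 0.5])))   # False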
- """ - max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] - num_points = points.shape[0] - num_polygons = polygon_surfaces.shape[0] - if num_surfaces is None: - num_surfaces = np.full((num_polygons,), 9999999, dtype=np.int64) - normal_vec, d = surface_equ_3d_jitv2(polygon_surfaces[:, :, :3, :]) - # normal_vec: [num_polygon, max_num_surfaces, 3] - # d: [num_polygon, max_num_surfaces] - return _points_in_convex_polygon_3d_jit( - points, polygon_surfaces, normal_vec, d, num_surfaces - ) - - -@numba.njit -def _points_in_convex_polygon_3d_jit( - points, polygon_surfaces, normal_vec, d, num_surfaces=None -): - """check points is in 3d convex polygons. - Args: - points: [num_points, 3] array. - polygon_surfaces: [num_polygon, max_num_surfaces, - max_num_points_of_surface, 3] - array. all surfaces' normal vector must direct to internal. - max_num_points_of_surface must at least 3. - num_surfaces: [num_polygon] array. indicate how many surfaces - a polygon contain - Returns: - [num_points, num_polygon] bool array. - """ - max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] - num_points = points.shape[0] - num_polygons = polygon_surfaces.shape[0] - ret = np.ones((num_points, num_polygons), dtype=np.bool_) - sign = 0.0 - for i in range(num_points): - for j in range(num_polygons): - for k in range(max_num_surfaces): - if k > num_surfaces[j]: - break - sign = ( - points[i, 0] * normal_vec[j, k, 0] - + points[i, 1] * normal_vec[j, k, 1] - + points[i, 2] * normal_vec[j, k, 2] - + d[j, k] - ) - if sign >= 0: - ret[i, j] = False - break - return ret - - -@numba.jit -def points_in_convex_polygon_jit(points, polygon, clockwise=True): - """check points is in 2d convex polygons. True when point in polygon - Args: - points: [num_points, 2] array. - polygon: [num_polygon, num_points_of_polygon, 2] array. - clockwise: bool. indicate polygon is clockwise. - Returns: - [num_points, num_polygon] bool array. - """ - # first convert polygon to directed lines - num_points_of_polygon = polygon.shape[1] - num_points = points.shape[0] - num_polygons = polygon.shape[0] - if clockwise: - vec1 = ( - polygon - - polygon[ - :, - [num_points_of_polygon - 1] + list(range(num_points_of_polygon - 1)), - :, - ] - ) - else: - vec1 = ( - polygon[ - :, - [num_points_of_polygon - 1] + list(range(num_points_of_polygon - 1)), - :, - ] - - polygon - ) - # vec1: [num_polygon, num_points_of_polygon, 2] - ret = np.zeros((num_points, num_polygons), dtype=np.bool_) - success = True - cross = 0.0 - for i in range(num_points): - for j in range(num_polygons): - success = True - for k in range(num_points_of_polygon): - cross = vec1[j, k, 1] * (polygon[j, k, 0] - points[i, 0]) - cross -= vec1[j, k, 0] * (polygon[j, k, 1] - points[i, 1]) - if cross >= 0: - success = False - break - ret[i, j] = success - return ret - - -def points_in_convex_polygon(points, polygon, clockwise=True): - """check points is in convex polygons. may run 2x faster when write in - cython(don't need to calculate all cross-product between edge and point) - Args: - points: [num_points, 2] array. - polygon: [num_polygon, num_points_of_polygon, 2] array. - clockwise: bool. indicate polygon is clockwise. - Returns: - [num_points, num_polygon] bool array. - """ - # first convert polygon to directed lines - num_lines = polygon.shape[1] - polygon_next = polygon[:, [num_lines - 1] + list(range(num_lines - 1)), :] - if clockwise: - vec1 = (polygon - polygon_next)[np.newaxis, ...] - else: - vec1 = (polygon_next - polygon)[np.newaxis, ...] 
- vec2 = polygon[np.newaxis, ...] - points[:, np.newaxis, np.newaxis, :] - # [num_points, num_polygon, num_points_of_polygon, 2] - cross = np.cross(vec1, vec2) - return np.all(cross > 0, axis=2) - - -@numba.njit -def surface_equ_3d_jitv2(surfaces): - # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3] - num_polygon = surfaces.shape[0] - max_num_surfaces = surfaces.shape[1] - normal_vec = np.zeros((num_polygon, max_num_surfaces, 3), dtype=surfaces.dtype) - d = np.zeros((num_polygon, max_num_surfaces), dtype=surfaces.dtype) - sv0 = surfaces[0, 0, 0] - surfaces[0, 0, 1] - sv1 = surfaces[0, 0, 0] - surfaces[0, 0, 1] - for i in range(num_polygon): - for j in range(max_num_surfaces): - sv0[0] = surfaces[i, j, 0, 0] - surfaces[i, j, 1, 0] - sv0[1] = surfaces[i, j, 0, 1] - surfaces[i, j, 1, 1] - sv0[2] = surfaces[i, j, 0, 2] - surfaces[i, j, 1, 2] - sv1[0] = surfaces[i, j, 1, 0] - surfaces[i, j, 2, 0] - sv1[1] = surfaces[i, j, 1, 1] - surfaces[i, j, 2, 1] - sv1[2] = surfaces[i, j, 1, 2] - surfaces[i, j, 2, 2] - normal_vec[i, j, 0] = sv0[1] * sv1[2] - sv0[2] * sv1[1] - normal_vec[i, j, 1] = sv0[2] * sv1[0] - sv0[0] * sv1[2] - normal_vec[i, j, 2] = sv0[0] * sv1[1] - sv0[1] * sv1[0] - - d[i, j] = ( - -surfaces[i, j, 0, 0] * normal_vec[i, j, 0] - - surfaces[i, j, 0, 1] * normal_vec[i, j, 1] - - surfaces[i, j, 0, 2] * normal_vec[i, j, 2] - ) - return normal_vec, d - - -@numba.njit -def _points_in_convex_polygon_3d_jit_v2(points, surfaces): - max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] - num_points = points.shape[0] - num_polygons = polygon_surfaces.shape[0] - ret = np.ones((num_points, num_polygons), dtype=np.bool_) - sign = 0.0 - for i in range(num_points): - for j in range(num_polygons): - for k in range(max_num_surfaces): - if k > num_surfaces[j]: - break - sign = ( - points[i, 0] * normal_vec[j, k, 0] - + points[i, 1] * normal_vec[j, k, 1] - + points[i, 2] * normal_vec[j, k, 2] - + d[j, k] - ) - if sign >= 0: - ret[i, j] = False - break - return ret - - -@numba.njit -def points_in_convex_polygon_3d_jit_v2(points, surfaces, num_surfaces=None): - """check points is in 3d convex polygons. - Args: - points: [num_points, 3] array. - polygon_surfaces: [num_polygon, max_num_surfaces, - max_num_points_of_surface, 3] - array. all surfaces' normal vector must direct to internal. - max_num_points_of_surface must at least 3. - num_surfaces: [num_polygon] array. indicate how many surfaces - a polygon contain - Returns: - [num_points, num_polygon] bool array. 
- """ - num_polygon = surfaces.shape[0] - max_num_surfaces = surfaces.shape[1] - num_points = points.shape[0] - normal_vec = np.zeros((num_polygon, max_num_surfaces, 3), dtype=surfaces.dtype) - d = np.zeros((num_polygon, max_num_surfaces), dtype=surfaces.dtype) - sv0 = surfaces[0, 0, 0] - surfaces[0, 0, 1] - sv1 = surfaces[0, 0, 0] - surfaces[0, 0, 1] - ret = np.ones((num_points, num_polygon), dtype=np.bool_) - for i in range(num_polygon): - for j in range(max_num_surfaces): - sv0[0] = surfaces[i, j, 0, 0] - surfaces[i, j, 1, 0] - sv0[1] = surfaces[i, j, 0, 1] - surfaces[i, j, 1, 1] - sv0[2] = surfaces[i, j, 0, 2] - surfaces[i, j, 1, 2] - sv1[0] = surfaces[i, j, 1, 0] - surfaces[i, j, 2, 0] - sv1[1] = surfaces[i, j, 1, 1] - surfaces[i, j, 2, 1] - sv1[2] = surfaces[i, j, 1, 2] - surfaces[i, j, 2, 2] - normal_vec[i, j, 0] = sv0[1] * sv1[2] - sv0[2] * sv1[1] - normal_vec[i, j, 1] = sv0[2] * sv1[0] - sv0[0] * sv1[2] - normal_vec[i, j, 2] = sv0[0] * sv1[1] - sv0[1] * sv1[0] - - d[i, j] = ( - -surfaces[i, j, 0, 0] * normal_vec[i, j, 0] - - surfaces[i, j, 0, 1] * normal_vec[i, j, 1] - - surfaces[i, j, 0, 2] * normal_vec[i, j, 2] - ) - - sign = 0.0 - for i in range(num_points): - for j in range(num_polygon): - for k in range(max_num_surfaces): - sign = ( - points[i, 0] * normal_vec[j, k, 0] - + points[i, 1] * normal_vec[j, k, 1] - + points[i, 2] * normal_vec[j, k, 2] - + d[j, k] - ) - if sign >= 0: - ret[i, j] = False - break - return ret +import numba +import numpy as np + + +@numba.njit +def _points_count_convex_polygon_3d_jit( + points, polygon_surfaces, normal_vec, d, num_surfaces=None +): + """count points in 3d convex polygons. + Args: + points: [num_points, 3] array. + polygon_surfaces: [num_polygon, max_num_surfaces, + max_num_points_of_surface, 3] + array. all surfaces' normal vector must direct to internal. + max_num_points_of_surface must at least 3. + num_surfaces: [num_polygon] array. indicate how many surfaces + a polygon contain + Returns: + [num_polygon] array. + """ + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + ret = np.full((num_polygons,), num_points, dtype=np.int64) + sign = 0.0 + for i in range(num_points): + for j in range(num_polygons): + for k in range(max_num_surfaces): + if k > num_surfaces[j]: + break + sign = ( + points[i, 0] * normal_vec[j, k, 0] + + points[i, 1] * normal_vec[j, k, 1] + + points[i, 2] * normal_vec[j, k, 2] + + d[j, k] + ) + if sign >= 0: + ret[j] -= 1 + break + return ret + + +def points_count_convex_polygon_3d_jit(points, polygon_surfaces, num_surfaces=None): + """check points is in 3d convex polygons. + Args: + points: [num_points, 3] array. + polygon_surfaces: [num_polygon, max_num_surfaces, + max_num_points_of_surface, 3] + array. all surfaces' normal vector must direct to internal. + max_num_points_of_surface must at least 3. + num_surfaces: [num_polygon] array. indicate how many surfaces + a polygon contain + Returns: + [num_polygon] array. 
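Illustrative note (not part of the patch): counting and membership are two views of the same test, so summing the boolean mask from points_in_convex_polygon_3d_jit over the point axis should reproduce points_count_convex_polygon_3d_jit; both go through surface_equ_3d_jitv2. A hedged consistency check, assuming valid [num_points, 3] and [num_polygon, 6, 4, 3] inputs and the import path from this patch:

import numpy as np
from det3d.core.bbox.geometry import (
    points_count_convex_polygon_3d_jit,
    points_in_convex_polygon_3d_jit,
)

def count_matches_mask(points, surfaces):
    mask = points_in_convex_polygon_3d_jit(points, surfaces)        # [num_points, num_polygon]
    counts = points_count_convex_polygon_3d_jit(points, surfaces)   # [num_polygon]
    return np.array_equal(mask.sum(axis=0), counts)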
+ """ + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + if num_surfaces is None: + num_surfaces = np.full((num_polygons,), 9999999, dtype=np.int64) + normal_vec, d = surface_equ_3d_jitv2(polygon_surfaces[:, :, :3, :]) + # normal_vec: [num_polygon, max_num_surfaces, 3] + # d: [num_polygon, max_num_surfaces] + return _points_count_convex_polygon_3d_jit( + points, polygon_surfaces, normal_vec, d, num_surfaces + ) + + +@numba.njit +def is_line_segment_intersection_jit(lines1, lines2): + """check if line segments1 and line segments2 have cross point + + Args: + lines1 (float, [N, 2, 2]): [description] + lines2 (float, [M, 2, 2]): [description] + + Returns: + [type]: [description] + """ + + # Return true if line segments AB and CD intersect + N = lines1.shape[0] + M = lines2.shape[0] + ret = np.zeros((N, M), dtype=np.bool_) + for i in range(N): + for j in range(M): + A = lines1[i, 0] + B = lines1[i, 1] + C = lines2[j, 0] + D = lines2[j, 1] + acd = (D[1] - A[1]) * (C[0] - A[0]) > (C[1] - A[1]) * (D[0] - A[0]) + bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0]) + if acd != bcd: + abc = (C[1] - A[1]) * (B[0] - A[0]) > (B[1] - A[1]) * (C[0] - A[0]) + abd = (D[1] - A[1]) * (B[0] - A[0]) > (B[1] - A[1]) * (D[0] - A[0]) + if abc != abd: + ret[i, j] = True + return ret + + +@numba.njit +def line_segment_intersection(line1, line2, intersection): + A = line1[0] + B = line1[1] + C = line2[0] + D = line2[1] + BA0 = B[0] - A[0] + BA1 = B[1] - A[1] + DA0 = D[0] - A[0] + CA0 = C[0] - A[0] + DA1 = D[1] - A[1] + CA1 = C[1] - A[1] + acd = DA1 * CA0 > CA1 * DA0 + bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0]) + if acd != bcd: + abc = CA1 * BA0 > BA1 * CA0 + abd = DA1 * BA0 > BA1 * DA0 + if abc != abd: + DC0 = D[0] - C[0] + DC1 = D[1] - C[1] + ABBA = A[0] * B[1] - B[0] * A[1] + CDDC = C[0] * D[1] - D[0] * C[1] + DH = BA1 * DC0 - BA0 * DC1 + intersection[0] = (ABBA * DC0 - BA0 * CDDC) / DH + intersection[1] = (ABBA * DC1 - BA1 * CDDC) / DH + return True + return False + + +def _ccw(A, B, C): + return (C[..., 1] - A[..., 1]) * (B[..., 0] - A[..., 0]) > ( + B[..., 1] - A[..., 1] + ) * (C[..., 0] - A[..., 0]) + + +def is_line_segment_cross(lines1, lines2): + # 10x slower than jit version with 1000-1000 random lines input. + # lines1, [N, 2, 2] + # lines2, [M, 2, 2] + A = lines1[:, 0, :][:, np.newaxis, :] + B = lines1[:, 1, :][:, np.newaxis, :] + C = lines2[:, 0, :][np.newaxis, :, :] + D = lines2[:, 1, :][np.newaxis, :, :] + return np.logical_and( + _ccw(A, C, D) != _ccw(B, C, D), _ccw(A, B, C) != _ccw(A, B, D) + ) + + +@numba.jit(nopython=False) +def surface_equ_3d_jit(polygon_surfaces): + # return [a, b, c], d in ax+by+cz+d=0 + # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3] + surface_v = polygon_surfaces[:, :, :2, :] - polygon_surfaces[:, :, 1:3, :] + # normal_vec: [..., 3] + normal_v = np.cross(surface_v[:, :, 0, :], surface_v[:, :, 1, :]) + # print(normal_vec.shape, points[..., 0, :].shape) + # d = -np.inner(normal_vec, points[..., 0, :]) + d = np.einsum("aij, aij->ai", normal_v, polygon_surfaces[:, :, 0, :]) + return normal_vec, -d + + +@numba.jit(nopython=False) +def points_in_convex_polygon_3d_jit_v1(points, polygon_surfaces, num_surfaces=None): + """check points is in 3d convex polygons. + Args: + points: [num_points, 3] array. + polygon_surfaces: [num_polygon, max_num_surfaces, + max_num_points_of_surface, 3] + array. 
all surfaces' normal vector must direct to internal. + max_num_points_of_surface must at least 3. + num_surfaces: [num_polygon] array. indicate how many surfaces + a polygon contain + Returns: + [num_points, num_polygon] bool array. + """ + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + if num_surfaces is None: + num_surfaces = np.full((num_polygons,), 9999999, dtype=np.int64) + normal_vec, d = surface_equ_3d_jit(polygon_surfaces[:, :, :3, :]) + # normal_vec: [num_polygon, max_num_surfaces, 3] + # d: [num_polygon, max_num_surfaces] + ret = np.ones((num_points, num_polygons), dtype=np.bool_) + sign = 0.0 + for i in range(num_points): + for j in range(num_polygons): + for k in range(max_num_surfaces): + if k > num_surfaces[j]: + break + sign = ( + points[i, 0] * normal_vec[j, k, 0] + + points[i, 1] * normal_vec[j, k, 1] + + points[i, 2] * normal_vec[j, k, 2] + + d[j, k] + ) + if sign >= 0: + ret[i, j] = False + break + return ret + + +def surface_equ_3d(polygon_surfaces): + # return [a, b, c], d in ax+by+cz+d=0 + # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3] + surface_v = polygon_surfaces[:, :, :2, :] - polygon_surfaces[:, :, 1:3, :] + # normal_vec: [..., 3] + normal_v = np.cross(surface_v[:, :, 0, :], surface_v[:, :, 1, :]) + # print(normal_vec.shape, points[..., 0, :].shape) + # d = -np.inner(normal_vec, points[..., 0, :]) + d = np.einsum("aij, aij->ai", normal_v, polygon_surfaces[:, :, 0, :]) + return normal_v, -d + + +def points_in_convex_polygon_3d_jit(points, polygon_surfaces, num_surfaces=None): + """check points is in 3d convex polygons. + Args: + points: [num_points, 3] array. + polygon_surfaces: [num_polygon, max_num_surfaces, + max_num_points_of_surface, 3] + array. all surfaces' normal vector must direct to internal. + max_num_points_of_surface must at least 3. + num_surfaces: [num_polygon] array. indicate how many surfaces + a polygon contain + Returns: + [num_points, num_polygon] bool array. + """ + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + if num_surfaces is None: + num_surfaces = np.full((num_polygons,), 9999999, dtype=np.int64) + normal_vec, d = surface_equ_3d_jitv2(polygon_surfaces[:, :, :3, :]) + # normal_vec: [num_polygon, max_num_surfaces, 3] + # d: [num_polygon, max_num_surfaces] + return _points_in_convex_polygon_3d_jit( + points, polygon_surfaces, normal_vec, d, num_surfaces + ) + + +@numba.njit +def _points_in_convex_polygon_3d_jit( + points, polygon_surfaces, normal_vec, d, num_surfaces=None +): + """check points is in 3d convex polygons. + Args: + points: [num_points, 3] array. + polygon_surfaces: [num_polygon, max_num_surfaces, + max_num_points_of_surface, 3] + array. all surfaces' normal vector must direct to internal. + max_num_points_of_surface must at least 3. + num_surfaces: [num_polygon] array. indicate how many surfaces + a polygon contain + Returns: + [num_points, num_polygon] bool array. 
+ """ + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + ret = np.ones((num_points, num_polygons), dtype=np.bool_) + sign = 0.0 + for i in range(num_points): + for j in range(num_polygons): + for k in range(max_num_surfaces): + if k > num_surfaces[j]: + break + sign = ( + points[i, 0] * normal_vec[j, k, 0] + + points[i, 1] * normal_vec[j, k, 1] + + points[i, 2] * normal_vec[j, k, 2] + + d[j, k] + ) + if sign >= 0: + ret[i, j] = False + break + return ret + + +@numba.jit +def points_in_convex_polygon_jit(points, polygon, clockwise=True): + """check points is in 2d convex polygons. True when point in polygon + Args: + points: [num_points, 2] array. + polygon: [num_polygon, num_points_of_polygon, 2] array. + clockwise: bool. indicate polygon is clockwise. + Returns: + [num_points, num_polygon] bool array. + """ + # first convert polygon to directed lines + num_points_of_polygon = polygon.shape[1] + num_points = points.shape[0] + num_polygons = polygon.shape[0] + if clockwise: + vec1 = ( + polygon + - polygon[ + :, + [num_points_of_polygon - 1] + list(range(num_points_of_polygon - 1)), + :, + ] + ) + else: + vec1 = ( + polygon[ + :, + [num_points_of_polygon - 1] + list(range(num_points_of_polygon - 1)), + :, + ] + - polygon + ) + # vec1: [num_polygon, num_points_of_polygon, 2] + ret = np.zeros((num_points, num_polygons), dtype=np.bool_) + success = True + cross = 0.0 + for i in range(num_points): + for j in range(num_polygons): + success = True + for k in range(num_points_of_polygon): + cross = vec1[j, k, 1] * (polygon[j, k, 0] - points[i, 0]) + cross -= vec1[j, k, 0] * (polygon[j, k, 1] - points[i, 1]) + if cross >= 0: + success = False + break + ret[i, j] = success + return ret + + +def points_in_convex_polygon(points, polygon, clockwise=True): + """check points is in convex polygons. may run 2x faster when write in + cython(don't need to calculate all cross-product between edge and point) + Args: + points: [num_points, 2] array. + polygon: [num_polygon, num_points_of_polygon, 2] array. + clockwise: bool. indicate polygon is clockwise. + Returns: + [num_points, num_polygon] bool array. + """ + # first convert polygon to directed lines + num_lines = polygon.shape[1] + polygon_next = polygon[:, [num_lines - 1] + list(range(num_lines - 1)), :] + if clockwise: + vec1 = (polygon - polygon_next)[np.newaxis, ...] + else: + vec1 = (polygon_next - polygon)[np.newaxis, ...] + vec2 = polygon[np.newaxis, ...] 
- points[:, np.newaxis, np.newaxis, :] + # [num_points, num_polygon, num_points_of_polygon, 2] + cross = np.cross(vec1, vec2) + return np.all(cross > 0, axis=2) + + +@numba.njit +def surface_equ_3d_jitv2(surfaces): + # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3] + num_polygon = surfaces.shape[0] + max_num_surfaces = surfaces.shape[1] + normal_vec = np.zeros((num_polygon, max_num_surfaces, 3), dtype=surfaces.dtype) + d = np.zeros((num_polygon, max_num_surfaces), dtype=surfaces.dtype) + sv0 = surfaces[0, 0, 0] - surfaces[0, 0, 1] + sv1 = surfaces[0, 0, 0] - surfaces[0, 0, 1] + for i in range(num_polygon): + for j in range(max_num_surfaces): + sv0[0] = surfaces[i, j, 0, 0] - surfaces[i, j, 1, 0] + sv0[1] = surfaces[i, j, 0, 1] - surfaces[i, j, 1, 1] + sv0[2] = surfaces[i, j, 0, 2] - surfaces[i, j, 1, 2] + sv1[0] = surfaces[i, j, 1, 0] - surfaces[i, j, 2, 0] + sv1[1] = surfaces[i, j, 1, 1] - surfaces[i, j, 2, 1] + sv1[2] = surfaces[i, j, 1, 2] - surfaces[i, j, 2, 2] + normal_vec[i, j, 0] = sv0[1] * sv1[2] - sv0[2] * sv1[1] + normal_vec[i, j, 1] = sv0[2] * sv1[0] - sv0[0] * sv1[2] + normal_vec[i, j, 2] = sv0[0] * sv1[1] - sv0[1] * sv1[0] + + d[i, j] = ( + -surfaces[i, j, 0, 0] * normal_vec[i, j, 0] + - surfaces[i, j, 0, 1] * normal_vec[i, j, 1] + - surfaces[i, j, 0, 2] * normal_vec[i, j, 2] + ) + return normal_vec, d + + +@numba.njit +def _points_in_convex_polygon_3d_jit_v2(points, surfaces): + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + ret = np.ones((num_points, num_polygons), dtype=np.bool_) + sign = 0.0 + for i in range(num_points): + for j in range(num_polygons): + for k in range(max_num_surfaces): + if k > num_surfaces[j]: + break + sign = ( + points[i, 0] * normal_vec[j, k, 0] + + points[i, 1] * normal_vec[j, k, 1] + + points[i, 2] * normal_vec[j, k, 2] + + d[j, k] + ) + if sign >= 0: + ret[i, j] = False + break + return ret + + +@numba.njit +def points_in_convex_polygon_3d_jit_v2(points, surfaces, num_surfaces=None): + """check points is in 3d convex polygons. + Args: + points: [num_points, 3] array. + polygon_surfaces: [num_polygon, max_num_surfaces, + max_num_points_of_surface, 3] + array. all surfaces' normal vector must direct to internal. + max_num_points_of_surface must at least 3. + num_surfaces: [num_polygon] array. indicate how many surfaces + a polygon contain + Returns: + [num_points, num_polygon] bool array. 
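Illustrative sketch (not part of the patch): calling the vectorized points_in_convex_polygon defined above on a single clockwise square; the import path comes from this patch and the coordinates are made up.

import numpy as np
from det3d.core.bbox.geometry import points_in_convex_polygon

square = np.array([[[0., 0.], [0., 1.], [1., 1.], [1., 0.]]])   # [1, 4, 2], clockwise order
pts = np.array([[0.5, 0.5], [1.5, 0.5]])                        # [2, 2]

print(points_in_convex_polygon(pts, square))   # expected [[ True], [False]]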
+ """ + num_polygon = surfaces.shape[0] + max_num_surfaces = surfaces.shape[1] + num_points = points.shape[0] + normal_vec = np.zeros((num_polygon, max_num_surfaces, 3), dtype=surfaces.dtype) + d = np.zeros((num_polygon, max_num_surfaces), dtype=surfaces.dtype) + sv0 = surfaces[0, 0, 0] - surfaces[0, 0, 1] + sv1 = surfaces[0, 0, 0] - surfaces[0, 0, 1] + ret = np.ones((num_points, num_polygon), dtype=np.bool_) + for i in range(num_polygon): + for j in range(max_num_surfaces): + sv0[0] = surfaces[i, j, 0, 0] - surfaces[i, j, 1, 0] + sv0[1] = surfaces[i, j, 0, 1] - surfaces[i, j, 1, 1] + sv0[2] = surfaces[i, j, 0, 2] - surfaces[i, j, 1, 2] + sv1[0] = surfaces[i, j, 1, 0] - surfaces[i, j, 2, 0] + sv1[1] = surfaces[i, j, 1, 1] - surfaces[i, j, 2, 1] + sv1[2] = surfaces[i, j, 1, 2] - surfaces[i, j, 2, 2] + normal_vec[i, j, 0] = sv0[1] * sv1[2] - sv0[2] * sv1[1] + normal_vec[i, j, 1] = sv0[2] * sv1[0] - sv0[0] * sv1[2] + normal_vec[i, j, 2] = sv0[0] * sv1[1] - sv0[1] * sv1[0] + + d[i, j] = ( + -surfaces[i, j, 0, 0] * normal_vec[i, j, 0] + - surfaces[i, j, 0, 1] * normal_vec[i, j, 1] + - surfaces[i, j, 0, 2] * normal_vec[i, j, 2] + ) + + sign = 0.0 + for i in range(num_points): + for j in range(num_polygon): + for k in range(max_num_surfaces): + sign = ( + points[i, 0] * normal_vec[j, k, 0] + + points[i, 1] * normal_vec[j, k, 1] + + points[i, 2] * normal_vec[j, k, 2] + + d[j, k] + ) + if sign >= 0: + ret[i, j] = False + break + return ret diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/core/sampler/preprocess.py b/cv/3d_detection/centerpoint/pytorch/det3d/core/sampler/preprocess.py index 071c3cdb2..8680e5894 100644 --- a/cv/3d_detection/centerpoint/pytorch/det3d/core/sampler/preprocess.py +++ b/cv/3d_detection/centerpoint/pytorch/det3d/core/sampler/preprocess.py @@ -1,976 +1,976 @@ -import abc -import sys -import time -from collections import OrderedDict -from functools import reduce - -import numba -import numpy as np - -from det3d.core.bbox import box_np_ops -from det3d.core.bbox.geometry import ( - is_line_segment_intersection_jit, - points_in_convex_polygon_3d_jit, - points_in_convex_polygon_jit, -) -import copy - - -class BatchSampler: - def __init__( - self, sampled_list, name=None, epoch=None, shuffle=True, drop_reminder=False - ): - self._sampled_list = sampled_list - self._indices = np.arange(len(sampled_list)) - if shuffle: - np.random.shuffle(self._indices) - self._idx = 0 - self._example_num = len(sampled_list) - self._name = name - self._shuffle = shuffle - self._epoch = epoch - self._epoch_counter = 0 - self._drop_reminder = drop_reminder - - def _sample(self, num): - if self._idx + num >= self._example_num: - ret = self._indices[self._idx :].copy() - self._reset() - else: - ret = self._indices[self._idx : self._idx + num] - self._idx += num - return ret - - def _reset(self): - # if self._name is not None: - # print("reset", self._name) - if self._shuffle: - np.random.shuffle(self._indices) - self._idx = 0 - - def sample(self, num): - indices = self._sample(num) - return [self._sampled_list[i] for i in indices] - # return np.random.choice(self._sampled_list, num) - - -class DataBasePreprocessing: - def __call__(self, db_infos): - return self._preprocess(db_infos) - - @abc.abstractclassmethod - def _preprocess(self, db_infos): - pass - - -class DBFilterByDifficulty(DataBasePreprocessing): - def __init__(self, removed_difficulties, logger=None): - self._removed_difficulties = removed_difficulties - logger.info(f"{removed_difficulties}") - - def _preprocess(self, db_infos): - 
new_db_infos = {} - for key, dinfos in db_infos.items(): - new_db_infos[key] = [ - info - for info in dinfos - if info["difficulty"] not in self._removed_difficulties - ] - return new_db_infos - - -class DBFilterByMinNumPoint(DataBasePreprocessing): - def __init__(self, min_gt_point_dict, logger=None): - self._min_gt_point_dict = min_gt_point_dict - logger.info(f"{min_gt_point_dict}") - - def _preprocess(self, db_infos): - for name, min_num in self._min_gt_point_dict.items(): - if min_num > 0: - filtered_infos = [] - for info in db_infos[name]: - if info["num_points_in_gt"] >= min_num: - filtered_infos.append(info) - db_infos[name] = filtered_infos - return db_infos - - -class DataBasePreprocessor: - def __init__(self, preprocessors): - self._preprocessors = preprocessors - - def __call__(self, db_infos): - for prepor in self._preprocessors: - db_infos = prepor(db_infos) - return db_infos - - -def filter_gt_box_outside_range(gt_boxes, limit_range): - """remove gtbox outside training range. - this function should be applied after other prep functions - Args: - gt_boxes ([type]): [description] - limit_range ([type]): [description] - """ - gt_boxes_bv = box_np_ops.center_to_corner_box2d( - gt_boxes[:, [0, 1]], gt_boxes[:, [3, 3 + 1]], gt_boxes[:, -1] - ) - bounding_box = box_np_ops.minmax_to_corner_2d( - np.asarray(limit_range)[np.newaxis, ...] - ) - ret = points_in_convex_polygon_jit(gt_boxes_bv.reshape(-1, 2), bounding_box) - return np.any(ret.reshape(-1, 4), axis=1) - - -def filter_gt_box_outside_range_by_center(gt_boxes, limit_range): - """remove gtbox outside training range. - this function should be applied after other prep functions - Args: - gt_boxes ([type]): [description] - limit_range ([type]): [description] - """ - gt_box_centers = gt_boxes[:, :2] - bounding_box = box_np_ops.minmax_to_corner_2d( - np.asarray(limit_range)[np.newaxis, ...] 
- ) - ret = points_in_convex_polygon_jit(gt_box_centers, bounding_box) - return ret.reshape(-1) - - -def filter_gt_low_points(gt_boxes, points, num_gt_points, point_num_threshold=2): - points_mask = np.ones([points.shape[0]], np.bool) - gt_boxes_mask = np.ones([gt_boxes.shape[0]], np.bool) - for i, num in enumerate(num_gt_points): - if num <= point_num_threshold: - masks = box_np_ops.points_in_rbbox(points, gt_boxes[i : i + 1]) - masks = masks.reshape([-1]) - points_mask &= np.logical_not(masks) - gt_boxes_mask[i] = False - return gt_boxes[gt_boxes_mask], points[points_mask] - - -def mask_points_in_corners(points, box_corners): - surfaces = box_np_ops.corner_to_surfaces_3d(box_corners) - mask = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) - return mask - - -@numba.njit -def _rotation_matrix_3d_(rot_mat_T, angle, axis): - rot_sin = np.sin(angle) - rot_cos = np.cos(angle) - rot_mat_T[:] = np.eye(3) - if axis == 1: - rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 2] = -rot_sin - rot_mat_T[2, 0] = rot_sin - rot_mat_T[2, 2] = rot_cos - elif axis == 2 or axis == -1: - rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = -rot_sin - rot_mat_T[1, 0] = rot_sin - rot_mat_T[1, 1] = rot_cos - elif axis == 0: - rot_mat_T[1, 1] = rot_cos - rot_mat_T[1, 2] = -rot_sin - rot_mat_T[2, 1] = rot_sin - rot_mat_T[2, 2] = rot_cos - - -@numba.njit -def _rotation_box2d_jit_(corners, angle, rot_mat_T): - rot_sin = np.sin(angle) - rot_cos = np.cos(angle) - rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = -rot_sin - rot_mat_T[1, 0] = rot_sin - rot_mat_T[1, 1] = rot_cos - corners[:] = corners @ rot_mat_T - - -@numba.jit(nopython=True) -def _box_single_to_corner_jit(boxes): - num_box = boxes.shape[0] - corners_norm = np.zeros((4, 2), dtype=boxes.dtype) - corners_norm[1, 1] = 1.0 - corners_norm[2] = 1.0 - corners_norm[3, 0] = 1.0 - corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) - corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape(1, 4, 2) - rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) - box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) - for i in range(num_box): - rot_sin = np.sin(boxes[i, -1]) - rot_cos = np.cos(boxes[i, -1]) - rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = -rot_sin - rot_mat_T[1, 0] = rot_sin - rot_mat_T[1, 1] = rot_cos - box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] - return box_corners - - -@numba.njit -def noise_per_box(boxes, valid_mask, loc_noises, rot_noises): - # boxes: [N, 5] - # valid_mask: [N] - # loc_noises: [N, M, 3] - # rot_noises: [N, M] - num_boxes = boxes.shape[0] - num_tests = loc_noises.shape[1] - box_corners = box_np_ops.box2d_to_corner_jit(boxes) - current_corners = np.zeros((4, 2), dtype=boxes.dtype) - rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) - success_mask = -np.ones((num_boxes,), dtype=np.int64) - # print(valid_mask) - for i in range(num_boxes): - if valid_mask[i]: - for j in range(num_tests): - current_corners[:] = box_corners[i] - current_corners -= boxes[i, :2] - _rotation_box2d_jit_(current_corners, rot_noises[i, j], rot_mat_T) - current_corners += boxes[i, :2] + loc_noises[i, j, :2] - coll_mat = box_collision_test( - current_corners.reshape(1, 4, 2), box_corners - ) - coll_mat[0, i] = False - # print(coll_mat) - if not coll_mat.any(): - success_mask[i] = j - box_corners[i] = current_corners - break - return success_mask - - -@numba.njit -def noise_per_box_group(boxes, valid_mask, loc_noises, rot_noises, group_nums): - # WARNING: this function need boxes to be sorted by group id. 
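Illustrative sketch (not part of the patch): the corners @ rot_mat_T pattern used by _rotation_box2d_jit_ and noise_per_box rotates a box's corners about its own centre before the sampled location offset is added. A standalone NumPy version with a made-up 4 x 2 box:

import numpy as np

def rotate_corners_about_center(corners, center, angle):
    # Same row-vector convention as _rotation_box2d_jit_: (4, 2) corners @ (2, 2) rot_mat_T.
    c, s = np.cos(angle), np.sin(angle)
    rot_mat_T = np.array([[c, -s], [s, c]])
    return (corners - center) @ rot_mat_T + center

corners = np.array([[0., 0.], [0., 2.], [4., 2.], [4., 0.]])
moved = rotate_corners_about_center(corners, corners.mean(axis=0), np.pi / 2)
print(np.round(moved, 3))   # centre and edge lengths are unchanged, only the heading rotates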
- # boxes: [N, 5] - # valid_mask: [N] - # loc_noises: [N, M, 3] - # rot_noises: [N, M] - num_groups = group_nums.shape[0] - num_boxes = boxes.shape[0] - num_tests = loc_noises.shape[1] - box_corners = box_np_ops.box2d_to_corner_jit(boxes) - max_group_num = group_nums.max() - current_corners = np.zeros((max_group_num, 4, 2), dtype=boxes.dtype) - rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) - success_mask = -np.ones((num_boxes,), dtype=np.int64) - # print(valid_mask) - idx = 0 - for num in group_nums: - if valid_mask[idx]: - for j in range(num_tests): - for i in range(num): - current_corners[i] = box_corners[i + idx] - current_corners[i] -= boxes[i + idx, :2] - _rotation_box2d_jit_( - current_corners[i], rot_noises[idx + i, j], rot_mat_T - ) - current_corners[i] += ( - boxes[i + idx, :2] + loc_noises[i + idx, j, :2] - ) - coll_mat = box_collision_test( - current_corners[:num].reshape(num, 4, 2), box_corners - ) - for i in range(num): # remove self-coll - coll_mat[i, idx : idx + num] = False - if not coll_mat.any(): - for i in range(num): - success_mask[i + idx] = j - box_corners[i + idx] = current_corners[i] - break - idx += num - return success_mask - - -@numba.njit -def noise_per_box_group_v2_( - boxes, valid_mask, loc_noises, rot_noises, group_nums, global_rot_noises -): - # WARNING: this function need boxes to be sorted by group id. - # boxes: [N, 5] - # valid_mask: [N] - # loc_noises: [N, M, 3] - # rot_noises: [N, M] - num_boxes = boxes.shape[0] - num_tests = loc_noises.shape[1] - box_corners = box_np_ops.box2d_to_corner_jit(boxes) - max_group_num = group_nums.max() - current_box = np.zeros((1, 5), dtype=boxes.dtype) - current_corners = np.zeros((max_group_num, 4, 2), dtype=boxes.dtype) - dst_pos = np.zeros((max_group_num, 2), dtype=boxes.dtype) - - current_grot = np.zeros((max_group_num,), dtype=boxes.dtype) - dst_grot = np.zeros((max_group_num,), dtype=boxes.dtype) - - rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) - success_mask = -np.ones((num_boxes,), dtype=np.int64) - corners_norm = np.zeros((4, 2), dtype=boxes.dtype) - corners_norm[1, 1] = 1.0 - corners_norm[2] = 1.0 - corners_norm[3, 0] = 1.0 - corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) - corners_norm = corners_norm.reshape(4, 2) - - # print(valid_mask) - idx = 0 - for num in group_nums: - if valid_mask[idx]: - for j in range(num_tests): - for i in range(num): - current_box[0, :] = boxes[i + idx] - current_radius = np.sqrt( - current_box[0, 0] ** 2 + current_box[0, 1] ** 2 - ) - current_grot[i] = np.arctan2(current_box[0, 0], current_box[0, 1]) - dst_grot[i] = current_grot[i] + global_rot_noises[idx + i, j] - dst_pos[i, 0] = current_radius * np.sin(dst_grot[i]) - dst_pos[i, 1] = current_radius * np.cos(dst_grot[i]) - current_box[0, :2] = dst_pos[i] - current_box[0, -1] += dst_grot[i] - current_grot[i] - - rot_sin = np.sin(current_box[0, -1]) - rot_cos = np.cos(current_box[0, -1]) - rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = -rot_sin - rot_mat_T[1, 0] = rot_sin - rot_mat_T[1, 1] = rot_cos - current_corners[i] = ( - current_box[0, 2:4] * corners_norm @ rot_mat_T - + current_box[0, :2] - ) - current_corners[i] -= current_box[0, :2] - - _rotation_box2d_jit_( - current_corners[i], rot_noises[idx + i, j], rot_mat_T - ) - current_corners[i] += ( - current_box[0, :2] + loc_noises[i + idx, j, :2] - ) - coll_mat = box_collision_test( - current_corners[:num].reshape(num, 4, 2), box_corners - ) - for i in range(num): # remove self-coll - coll_mat[i, idx : idx + num] = False - if not coll_mat.any(): - for i in range(num): 
- success_mask[i + idx] = j - box_corners[i + idx] = current_corners[i] - loc_noises[i + idx, j, :2] += dst_pos[i] - boxes[i + idx, :2] - rot_noises[i + idx, j] += dst_grot[i] - current_grot[i] - break - idx += num - return success_mask - - -@numba.njit -def noise_per_box_v2_(boxes, valid_mask, loc_noises, rot_noises, global_rot_noises): - # boxes: [N, 5] - # valid_mask: [N] - # loc_noises: [N, M, 3] - # rot_noises: [N, M] - num_boxes = boxes.shape[0] - num_tests = loc_noises.shape[1] - box_corners = box_np_ops.box2d_to_corner_jit(boxes) - current_corners = np.zeros((4, 2), dtype=boxes.dtype) - current_box = np.zeros((1, 5), dtype=boxes.dtype) - rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) - dst_pos = np.zeros((2,), dtype=boxes.dtype) - success_mask = -np.ones((num_boxes,), dtype=np.int64) - corners_norm = np.zeros((4, 2), dtype=boxes.dtype) - corners_norm[1, 1] = 1.0 - corners_norm[2] = 1.0 - corners_norm[3, 0] = 1.0 - corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) - corners_norm = corners_norm.reshape(4, 2) - for i in range(num_boxes): - if valid_mask[i]: - for j in range(num_tests): - current_box[0, :] = boxes[i] - current_radius = np.sqrt(boxes[i, 0] ** 2 + boxes[i, 1] ** 2) - current_grot = np.arctan2(boxes[i, 0], boxes[i, 1]) - dst_grot = current_grot + global_rot_noises[i, j] - dst_pos[0] = current_radius * np.sin(dst_grot) - dst_pos[1] = current_radius * np.cos(dst_grot) - current_box[0, :2] = dst_pos - current_box[0, -1] += dst_grot - current_grot - - rot_sin = np.sin(current_box[0, -1]) - rot_cos = np.cos(current_box[0, -1]) - rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = -rot_sin - rot_mat_T[1, 0] = rot_sin - rot_mat_T[1, 1] = rot_cos - current_corners[:] = ( - current_box[0, 2:4] * corners_norm @ rot_mat_T + current_box[0, :2] - ) - current_corners -= current_box[0, :2] - _rotation_box2d_jit_(current_corners, rot_noises[i, j], rot_mat_T) - current_corners += current_box[0, :2] + loc_noises[i, j, :2] - coll_mat = box_collision_test( - current_corners.reshape(1, 4, 2), box_corners - ) - coll_mat[0, i] = False - if not coll_mat.any(): - success_mask[i] = j - box_corners[i] = current_corners - loc_noises[i, j, :2] += dst_pos - boxes[i, :2] - rot_noises[i, j] += dst_grot - current_grot - break - return success_mask - - -@numba.njit -def points_transform_( - points, centers, point_masks, loc_transform, rot_transform, valid_mask -): - num_box = centers.shape[0] - num_points = points.shape[0] - rot_mat_T = np.zeros((num_box, 3, 3), dtype=points.dtype) - for i in range(num_box): - _rotation_matrix_3d_(rot_mat_T[i], rot_transform[i], 2) - for i in range(num_points): - for j in range(num_box): - if valid_mask[j]: - if point_masks[i, j] == 1: - points[i, :3] -= centers[j, :3] - points[i : i + 1, :3] = points[i : i + 1, :3] @ rot_mat_T[j] - points[i, :3] += centers[j, :3] - points[i, :3] += loc_transform[j] - break # only apply first box's transform - - -@numba.njit -def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask): - num_box = boxes.shape[0] - for i in range(num_box): - if valid_mask[i]: - boxes[i, :3] += loc_transform[i] - boxes[i, 6] += rot_transform[i] - - -def _select_transform(transform, indices): - result = np.zeros((transform.shape[0], *transform.shape[2:]), dtype=transform.dtype) - for i in range(transform.shape[0]): - if indices[i] != -1: - result[i] = transform[i, indices[i]] - return result - - -@numba.njit -def group_transform_(loc_noise, rot_noise, locs, rots, group_center, valid_mask): - # loc_noise: [N, M, 3], locs: [N, 3] - # rot_noise: [N, 
M] - # group_center: [N, 3] - num_try = loc_noise.shape[1] - r = 0.0 - x = 0.0 - y = 0.0 - rot_center = 0.0 - for i in range(loc_noise.shape[0]): - if valid_mask[i]: - x = locs[i, 0] - group_center[i, 0] - y = locs[i, 1] - group_center[i, 1] - r = np.sqrt(x ** 2 + y ** 2) - # calculate rots related to group center - rot_center = np.arctan2(x, y) - for j in range(num_try): - loc_noise[i, j, 0] += r * ( - np.sin(rot_center + rot_noise[i, j]) - np.sin(rot_center) - ) - loc_noise[i, j, 1] += r * ( - np.cos(rot_center + rot_noise[i, j]) - np.cos(rot_center) - ) - - -@numba.njit -def group_transform_v2_( - loc_noise, rot_noise, locs, rots, group_center, grot_noise, valid_mask -): - # loc_noise: [N, M, 3], locs: [N, 3] - # rot_noise: [N, M] - # group_center: [N, 3] - num_try = loc_noise.shape[1] - r = 0.0 - x = 0.0 - y = 0.0 - rot_center = 0.0 - for i in range(loc_noise.shape[0]): - if valid_mask[i]: - x = locs[i, 0] - group_center[i, 0] - y = locs[i, 1] - group_center[i, 1] - r = np.sqrt(x ** 2 + y ** 2) - # calculate rots related to group center - rot_center = np.arctan2(x, y) - for j in range(num_try): - loc_noise[i, j, 0] += r * ( - np.sin(rot_center + rot_noise[i, j] + grot_noise[i, j]) - - np.sin(rot_center + grot_noise[i, j]) - ) - loc_noise[i, j, 1] += r * ( - np.cos(rot_center + rot_noise[i, j] + grot_noise[i, j]) - - np.cos(rot_center + grot_noise[i, j]) - ) - - -def set_group_noise_same_(loc_noise, rot_noise, group_ids): - gid_to_index_dict = {} - for i, gid in enumerate(group_ids): - if gid not in gid_to_index_dict: - gid_to_index_dict[gid] = i - for i in range(loc_noise.shape[0]): - loc_noise[i] = loc_noise[gid_to_index_dict[group_ids[i]]] - rot_noise[i] = rot_noise[gid_to_index_dict[group_ids[i]]] - - -def set_group_noise_same_v2_(loc_noise, rot_noise, grot_noise, group_ids): - gid_to_index_dict = {} - for i, gid in enumerate(group_ids): - if gid not in gid_to_index_dict: - gid_to_index_dict[gid] = i - for i in range(loc_noise.shape[0]): - loc_noise[i] = loc_noise[gid_to_index_dict[group_ids[i]]] - rot_noise[i] = rot_noise[gid_to_index_dict[group_ids[i]]] - grot_noise[i] = grot_noise[gid_to_index_dict[group_ids[i]]] - - -def get_group_center(locs, group_ids): - num_groups = 0 - group_centers = np.zeros_like(locs) - group_centers_ret = np.zeros_like(locs) - group_id_dict = {} - group_id_num_dict = OrderedDict() - for i, gid in enumerate(group_ids): - if gid >= 0: - if gid in group_id_dict: - group_centers[group_id_dict[gid]] += locs[i] - group_id_num_dict[gid] += 1 - else: - group_id_dict[gid] = num_groups - num_groups += 1 - group_id_num_dict[gid] = 1 - group_centers[group_id_dict[gid]] = locs[i] - for i, gid in enumerate(group_ids): - group_centers_ret[i] = ( - group_centers[group_id_dict[gid]] / group_id_num_dict[gid] - ) - return group_centers_ret, group_id_num_dict - - -def noise_per_object_v3_( - gt_boxes, - points=None, - valid_mask=None, - rotation_perturb=np.pi / 4, - center_noise_std=1.0, - global_random_rot_range=np.pi / 4, - num_try=5, - group_ids=None, -): - """random rotate or remove each groundtrutn independently. - use kitti viewer to test this function points_transform_ - - Args: - gt_boxes: [N, 7], gt box in lidar.points_transform_ - points: [M, 4], point cloud in lidar. 
- """ - num_boxes = gt_boxes.shape[0] - if not isinstance(rotation_perturb, (list, tuple, np.ndarray)): - rotation_perturb = [-rotation_perturb, rotation_perturb] - if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)): - global_random_rot_range = [-global_random_rot_range, global_random_rot_range] - enable_grot = ( - np.abs(global_random_rot_range[0] - global_random_rot_range[1]) >= 1e-3 - ) - if not isinstance(center_noise_std, (list, tuple, np.ndarray)): - center_noise_std = [center_noise_std, center_noise_std, center_noise_std] - if valid_mask is None: - valid_mask = np.ones((num_boxes,), dtype=np.bool_) - center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype) - loc_noises = np.random.normal(scale=center_noise_std, size=[num_boxes, num_try, 3]) - # loc_noises = np.random.uniform( - # -center_noise_std, center_noise_std, size=[num_boxes, num_try, 3]) - rot_noises = np.random.uniform( - rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try] - ) - gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1]) - grot_lowers = global_random_rot_range[0] - gt_grots - grot_uppers = global_random_rot_range[1] - gt_grots - global_rot_noises = np.random.uniform( - grot_lowers[..., np.newaxis], - grot_uppers[..., np.newaxis], - size=[num_boxes, num_try], - ) - if group_ids is not None: - if enable_grot: - set_group_noise_same_v2_( - loc_noises, rot_noises, global_rot_noises, group_ids - ) - else: - set_group_noise_same_(loc_noises, rot_noises, group_ids) - group_centers, group_id_num_dict = get_group_center(gt_boxes[:, :3], group_ids) - if enable_grot: - group_transform_v2_( - loc_noises, - rot_noises, - gt_boxes[:, :3], - gt_boxes[:, 6], - group_centers, - global_rot_noises, - valid_mask, - ) - else: - group_transform_( - loc_noises, - rot_noises, - gt_boxes[:, :3], - gt_boxes[:, 6], - group_centers, - valid_mask, - ) - group_nums = np.array(list(group_id_num_dict.values()), dtype=np.int64) - - origin = [0.5, 0.5, 0.5] - gt_box_corners = box_np_ops.center_to_corner_box3d( - gt_boxes[:, :3], gt_boxes[:, 3:6], gt_boxes[:, 6], origin=origin, axis=2 - ) - if group_ids is not None: - if not enable_grot: - selected_noise = noise_per_box_group( - gt_boxes[:, [0, 1, 3, 4, 6]], - valid_mask, - loc_noises, - rot_noises, - group_nums, - ) - else: - selected_noise = noise_per_box_group_v2_( - gt_boxes[:, [0, 1, 3, 4, 6]], - valid_mask, - loc_noises, - rot_noises, - group_nums, - global_rot_noises, - ) - else: - if not enable_grot: - selected_noise = noise_per_box( - gt_boxes[:, [0, 1, 3, 4, 6]], valid_mask, loc_noises, rot_noises - ) - else: - selected_noise = noise_per_box_v2_( - gt_boxes[:, [0, 1, 3, 4, 6]], - valid_mask, - loc_noises, - rot_noises, - global_rot_noises, - ) - loc_transforms = _select_transform(loc_noises, selected_noise) - rot_transforms = _select_transform(rot_noises, selected_noise) - surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners) - if points is not None: - point_masks = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) - points_transform_( - points, - gt_boxes[:, :3], - point_masks, - loc_transforms, - rot_transforms, - valid_mask, - ) - - box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask) - - -def noise_per_object_v2_( - gt_boxes, - points=None, - valid_mask=None, - rotation_perturb=np.pi / 4, - center_noise_std=1.0, - global_random_rot_range=np.pi / 4, - num_try=100, -): - """random rotate or remove each groundtrutn independently. 
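Illustrative sketch (not part of the patch): how noise_per_object_v3_ is typically driven. It perturbs gt_boxes and points in place, keeping a sampled per-box translation/rotation only when the moved box does not collide with the others; the import path comes from this patch and all values are made up.

import numpy as np
from det3d.core.sampler.preprocess import noise_per_object_v3_

gt_boxes = np.array([[ 5.0, 2.0, -1.0, 1.8, 4.2, 1.6, 0.0],
                     [-8.0, 3.5, -0.9, 0.6, 0.8, 1.7, 1.2]])   # [N, 7]: x, y, z, w, l, h, yaw
points = np.random.uniform(-20, 20, size=(5000, 4))            # [M, 4]: x, y, z, intensity

noise_per_object_v3_(gt_boxes, points,
                     rotation_perturb=np.pi / 12,
                     center_noise_std=[0.25, 0.25, 0.25],
                     global_random_rot_range=0.0,   # keeps the global-rotation branch disabled
                     num_try=50)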
- use kitti viewer to test this function points_transform_ - - Args: - gt_boxes: [N, 7], gt box in lidar.points_transform_ - points: [M, 4], point cloud in lidar. - """ - num_boxes = gt_boxes.shape[0] - if not isinstance(rotation_perturb, (list, tuple, np.ndarray)): - rotation_perturb = [-rotation_perturb, rotation_perturb] - if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)): - global_random_rot_range = [-global_random_rot_range, global_random_rot_range] - - if not isinstance(center_noise_std, (list, tuple, np.ndarray)): - center_noise_std = [center_noise_std, center_noise_std, center_noise_std] - if valid_mask is None: - valid_mask = np.ones((num_boxes,), dtype=np.bool_) - center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype) - loc_noises = np.random.normal(scale=center_noise_std, size=[num_boxes, num_try, 3]) - # loc_noises = np.random.uniform( - # -center_noise_std, center_noise_std, size=[num_boxes, num_try, 3]) - rot_noises = np.random.uniform( - rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try] - ) - gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1]) - grot_lowers = global_random_rot_range[0] - gt_grots - grot_uppers = global_random_rot_range[1] - gt_grots - global_rot_noises = np.random.uniform( - grot_lowers[..., np.newaxis], - grot_uppers[..., np.newaxis], - size=[num_boxes, num_try], - ) - - origin = [0.5, 0.5, 0] - gt_box_corners = box_np_ops.center_to_corner_box3d( - gt_boxes[:, :3], gt_boxes[:, 3:6], gt_boxes[:, 6], origin=origin, axis=2 - ) - if np.abs(global_random_rot_range[0] - global_random_rot_range[1]) < 1e-3: - selected_noise = noise_per_box( - gt_boxes[:, [0, 1, 3, 4, 6]], valid_mask, loc_noises, rot_noises - ) - else: - selected_noise = noise_per_box_v2_( - gt_boxes[:, [0, 1, 3, 4, 6]], - valid_mask, - loc_noises, - rot_noises, - global_rot_noises, - ) - loc_transforms = _select_transform(loc_noises, selected_noise) - rot_transforms = _select_transform(rot_noises, selected_noise) - if points is not None: - surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners) - point_masks = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) - points_transform_( - points, - gt_boxes[:, :3], - point_masks, - loc_transforms, - rot_transforms, - valid_mask, - ) - - box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask) - - -def global_scaling(gt_boxes, points, scale=0.05): - if not isinstance(scale, list): - scale = [-scale, scale] - noise_scale = np.random.uniform(scale[0] + 1, scale[1] + 1) - points[:, :3] *= noise_scale - gt_boxes[:, :6] *= noise_scale - return gt_boxes, points - - -def global_rotation(gt_boxes, points, rotation=np.pi / 4): - if not isinstance(rotation, list): - rotation = [-rotation, rotation] - noise_rotation = np.random.uniform(rotation[0], rotation[1]) - points[:, :3] = box_np_ops.rotation_points_single_angle( - points[:, :3], noise_rotation, axis=2 - ) - gt_boxes[:, :3] = box_np_ops.rotation_points_single_angle( - gt_boxes[:, :3], noise_rotation, axis=2 - ) - if gt_boxes.shape[1] > 7: - gt_boxes[:, 6:8] = box_np_ops.rotation_points_single_angle( - np.hstack([gt_boxes[:, 6:8], np.zeros((gt_boxes.shape[0], 1))]), - noise_rotation, - axis=2, - )[:, :2] - gt_boxes[:, -1] += noise_rotation - return gt_boxes, points - - -def random_flip(gt_boxes, points, probability=0.5): - enable = np.random.choice( - [False, True], replace=False, p=[1 - probability, probability] - ) - if enable: - gt_boxes[:, 1] = -gt_boxes[:, 1] - gt_boxes[:, -1] = -gt_boxes[:, -1] + np.pi - points[:, 1] = 
-points[:, 1] - if gt_boxes.shape[1] > 7: # y axis: x, y, z, w, h, l, vx, vy, r - gt_boxes[:, 7] = -gt_boxes[:, 7] - return gt_boxes, points - -def random_flip_both(gt_boxes, points, probability=0.5, flip_coor=None): - # x flip - enable = np.random.choice( - [False, True], replace=False, p=[1 - probability, probability] - ) - if enable: - gt_boxes[:, 1] = -gt_boxes[:, 1] - gt_boxes[:, -1] = -gt_boxes[:, -1] + np.pi - points[:, 1] = -points[:, 1] - if gt_boxes.shape[1] > 7: # y axis: x, y, z, w, h, l, vx, vy, r - gt_boxes[:, 7] = -gt_boxes[:, 7] - - # y flip - enable = np.random.choice( - [False, True], replace=False, p=[1 - probability, probability] - ) - if enable: - if flip_coor is None: - gt_boxes[:, 0] = -gt_boxes[:, 0] - points[:, 0] = -points[:, 0] - else: - gt_boxes[:, 0] = flip_coor * 2 - gt_boxes[:, 0] - points[:, 0] = flip_coor * 2 - points[:, 0] - - gt_boxes[:, -1] = -gt_boxes[:, -1] + 2*np.pi # TODO: CHECK THIS - - if gt_boxes.shape[1] > 7: # y axis: x, y, z, w, h, l, vx, vy, r - gt_boxes[:, 6] = -gt_boxes[:, 6] - - return gt_boxes, points - - -def global_scaling_v2(gt_boxes, points, min_scale=0.95, max_scale=1.05): - noise_scale = np.random.uniform(min_scale, max_scale) - points[:, :3] *= noise_scale - gt_boxes[:, :-1] *= noise_scale - return gt_boxes, points - - -def global_rotation_v2(gt_boxes, points, min_rad=-np.pi / 4, max_rad=np.pi / 4): - noise_rotation = np.random.uniform(min_rad, max_rad) - points[:, :3] = box_np_ops.rotation_points_single_angle( - points[:, :3], noise_rotation, axis=2 - ) - gt_boxes[:, :3] = box_np_ops.rotation_points_single_angle( - gt_boxes[:, :3], noise_rotation, axis=2 - ) - gt_boxes[:, -1] += noise_rotation - return gt_boxes, points - - -@numba.jit(nopython=True) -def box_collision_test(boxes, qboxes, clockwise=True): - N = boxes.shape[0] - K = qboxes.shape[0] - ret = np.zeros((N, K), dtype=np.bool_) - slices = np.array([1, 2, 3, 0]) - lines_boxes = np.stack( - (boxes, boxes[:, slices, :]), axis=2 - ) # [N, 4, 2(line), 2(xy)] - lines_qboxes = np.stack((qboxes, qboxes[:, slices, :]), axis=2) - # vec = np.zeros((2,), dtype=boxes.dtype) - boxes_standup = box_np_ops.corner_to_standup_nd_jit(boxes) - qboxes_standup = box_np_ops.corner_to_standup_nd_jit(qboxes) - for i in range(N): - for j in range(K): - # calculate standup first - iw = min(boxes_standup[i, 2], qboxes_standup[j, 2]) - max( - boxes_standup[i, 0], qboxes_standup[j, 0] - ) - if iw > 0: - ih = min(boxes_standup[i, 3], qboxes_standup[j, 3]) - max( - boxes_standup[i, 1], qboxes_standup[j, 1] - ) - if ih > 0: - for k in range(4): - for l in range(4): - A = lines_boxes[i, k, 0] - B = lines_boxes[i, k, 1] - C = lines_qboxes[j, l, 0] - D = lines_qboxes[j, l, 1] - acd = (D[1] - A[1]) * (C[0] - A[0]) > (C[1] - A[1]) * ( - D[0] - A[0] - ) - bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * ( - D[0] - B[0] - ) - if acd != bcd: - abc = (C[1] - A[1]) * (B[0] - A[0]) > (B[1] - A[1]) * ( - C[0] - A[0] - ) - abd = (D[1] - A[1]) * (B[0] - A[0]) > (B[1] - A[1]) * ( - D[0] - A[0] - ) - if abc != abd: - ret[i, j] = True # collision. - break - if ret[i, j] is True: - break - if ret[i, j] is False: - # now check complete overlap. 
- # box overlap qbox: - box_overlap_qbox = True - for l in range(4): # point l in qboxes - for k in range(4): # corner k in boxes - vec = boxes[i, k] - boxes[i, (k + 1) % 4] - if clockwise: - vec = -vec - cross = vec[1] * (boxes[i, k, 0] - qboxes[j, l, 0]) - cross -= vec[0] * (boxes[i, k, 1] - qboxes[j, l, 1]) - if cross >= 0: - box_overlap_qbox = False - break - if box_overlap_qbox is False: - break - - if box_overlap_qbox is False: - qbox_overlap_box = True - for l in range(4): # point l in boxes - for k in range(4): # corner k in qboxes - vec = qboxes[j, k] - qboxes[j, (k + 1) % 4] - if clockwise: - vec = -vec - cross = vec[1] * (qboxes[j, k, 0] - boxes[i, l, 0]) - cross -= vec[0] * (qboxes[j, k, 1] - boxes[i, l, 1]) - if cross >= 0: # - qbox_overlap_box = False - break - if qbox_overlap_box is False: - break - if qbox_overlap_box: - ret[i, j] = True # collision. - else: - ret[i, j] = True # collision. - return ret - - -def global_translate_(gt_boxes, points, noise_translate_std): - """ - Apply global translation to gt_boxes and points. - """ - - if not isinstance(noise_translate_std, (list, tuple, np.ndarray)): - noise_translate_std = np.array( - [noise_translate_std, noise_translate_std, noise_translate_std] - ) - if all([e == 0 for e in noise_translate_std]): - return gt_boxes, points - noise_translate = np.array( - [ - np.random.normal(0, noise_translate_std[0], 1), - np.random.normal(0, noise_translate_std[1], 1), - np.random.normal(0, noise_translate_std[0], 1), - ] - ).T - - points[:, :3] += noise_translate - gt_boxes[:, :3] += noise_translate - - return gt_boxes, points - - -if __name__ == "__main__": - bboxes = np.array( - [ - [0.0, 0.0, 0.5, 0.5], - [0.2, 0.2, 0.6, 0.6], - [0.7, 0.7, 0.9, 0.9], - [0.55, 0.55, 0.8, 0.8], - ] - ) - bbox_corners = box_np_ops.minmax_to_corner_2d(bboxes) - print(bbox_corners.shape) - print(box_collision_test(bbox_corners, bbox_corners)) +import abc +import sys +import time +from collections import OrderedDict +from functools import reduce + +import numba +import numpy as np + +from det3d.core.bbox import box_np_ops +from det3d.core.bbox.geometry import ( + is_line_segment_intersection_jit, + points_in_convex_polygon_3d_jit, + points_in_convex_polygon_jit, +) +import copy + + +class BatchSampler: + def __init__( + self, sampled_list, name=None, epoch=None, shuffle=True, drop_reminder=False + ): + self._sampled_list = sampled_list + self._indices = np.arange(len(sampled_list)) + if shuffle: + np.random.shuffle(self._indices) + self._idx = 0 + self._example_num = len(sampled_list) + self._name = name + self._shuffle = shuffle + self._epoch = epoch + self._epoch_counter = 0 + self._drop_reminder = drop_reminder + + def _sample(self, num): + if self._idx + num >= self._example_num: + ret = self._indices[self._idx :].copy() + self._reset() + else: + ret = self._indices[self._idx : self._idx + num] + self._idx += num + return ret + + def _reset(self): + # if self._name is not None: + # print("reset", self._name) + if self._shuffle: + np.random.shuffle(self._indices) + self._idx = 0 + + def sample(self, num): + indices = self._sample(num) + return [self._sampled_list[i] for i in indices] + # return np.random.choice(self._sampled_list, num) + + +class DataBasePreprocessing: + def __call__(self, db_infos): + return self._preprocess(db_infos) + + @abc.abstractclassmethod + def _preprocess(self, db_infos): + pass + + +class DBFilterByDifficulty(DataBasePreprocessing): + def __init__(self, removed_difficulties, logger=None): + self._removed_difficulties = 
removed_difficulties + logger.info(f"{removed_difficulties}") + + def _preprocess(self, db_infos): + new_db_infos = {} + for key, dinfos in db_infos.items(): + new_db_infos[key] = [ + info + for info in dinfos + if info["difficulty"] not in self._removed_difficulties + ] + return new_db_infos + + +class DBFilterByMinNumPoint(DataBasePreprocessing): + def __init__(self, min_gt_point_dict, logger=None): + self._min_gt_point_dict = min_gt_point_dict + logger.info(f"{min_gt_point_dict}") + + def _preprocess(self, db_infos): + for name, min_num in self._min_gt_point_dict.items(): + if min_num > 0: + filtered_infos = [] + for info in db_infos[name]: + if info["num_points_in_gt"] >= min_num: + filtered_infos.append(info) + db_infos[name] = filtered_infos + return db_infos + + +class DataBasePreprocessor: + def __init__(self, preprocessors): + self._preprocessors = preprocessors + + def __call__(self, db_infos): + for prepor in self._preprocessors: + db_infos = prepor(db_infos) + return db_infos + + +def filter_gt_box_outside_range(gt_boxes, limit_range): + """remove gtbox outside training range. + this function should be applied after other prep functions + Args: + gt_boxes ([type]): [description] + limit_range ([type]): [description] + """ + gt_boxes_bv = box_np_ops.center_to_corner_box2d( + gt_boxes[:, [0, 1]], gt_boxes[:, [3, 3 + 1]], gt_boxes[:, -1] + ) + bounding_box = box_np_ops.minmax_to_corner_2d( + np.asarray(limit_range)[np.newaxis, ...] + ) + ret = points_in_convex_polygon_jit(gt_boxes_bv.reshape(-1, 2), bounding_box) + return np.any(ret.reshape(-1, 4), axis=1) + + +def filter_gt_box_outside_range_by_center(gt_boxes, limit_range): + """remove gtbox outside training range. + this function should be applied after other prep functions + Args: + gt_boxes ([type]): [description] + limit_range ([type]): [description] + """ + gt_box_centers = gt_boxes[:, :2] + bounding_box = box_np_ops.minmax_to_corner_2d( + np.asarray(limit_range)[np.newaxis, ...] 
+ ) + ret = points_in_convex_polygon_jit(gt_box_centers, bounding_box) + return ret.reshape(-1) + + +def filter_gt_low_points(gt_boxes, points, num_gt_points, point_num_threshold=2): + points_mask = np.ones([points.shape[0]], np.bool) + gt_boxes_mask = np.ones([gt_boxes.shape[0]], np.bool) + for i, num in enumerate(num_gt_points): + if num <= point_num_threshold: + masks = box_np_ops.points_in_rbbox(points, gt_boxes[i : i + 1]) + masks = masks.reshape([-1]) + points_mask &= np.logical_not(masks) + gt_boxes_mask[i] = False + return gt_boxes[gt_boxes_mask], points[points_mask] + + +def mask_points_in_corners(points, box_corners): + surfaces = box_np_ops.corner_to_surfaces_3d(box_corners) + mask = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) + return mask + + +@numba.njit +def _rotation_matrix_3d_(rot_mat_T, angle, axis): + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + rot_mat_T[:] = np.eye(3) + if axis == 1: + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 2] = -rot_sin + rot_mat_T[2, 0] = rot_sin + rot_mat_T[2, 2] = rot_cos + elif axis == 2 or axis == -1: + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + elif axis == 0: + rot_mat_T[1, 1] = rot_cos + rot_mat_T[1, 2] = -rot_sin + rot_mat_T[2, 1] = rot_sin + rot_mat_T[2, 2] = rot_cos + + +@numba.njit +def _rotation_box2d_jit_(corners, angle, rot_mat_T): + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + corners[:] = corners @ rot_mat_T + + +@numba.jit(nopython=True) +def _box_single_to_corner_jit(boxes): + num_box = boxes.shape[0] + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape(1, 4, 2) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) + for i in range(num_box): + rot_sin = np.sin(boxes[i, -1]) + rot_cos = np.cos(boxes[i, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] + return box_corners + + +@numba.njit +def noise_per_box(boxes, valid_mask, loc_noises, rot_noises): + # boxes: [N, 5] + # valid_mask: [N] + # loc_noises: [N, M, 3] + # rot_noises: [N, M] + num_boxes = boxes.shape[0] + num_tests = loc_noises.shape[1] + box_corners = box_np_ops.box2d_to_corner_jit(boxes) + current_corners = np.zeros((4, 2), dtype=boxes.dtype) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + success_mask = -np.ones((num_boxes,), dtype=np.int64) + # print(valid_mask) + for i in range(num_boxes): + if valid_mask[i]: + for j in range(num_tests): + current_corners[:] = box_corners[i] + current_corners -= boxes[i, :2] + _rotation_box2d_jit_(current_corners, rot_noises[i, j], rot_mat_T) + current_corners += boxes[i, :2] + loc_noises[i, j, :2] + coll_mat = box_collision_test( + current_corners.reshape(1, 4, 2), box_corners + ) + coll_mat[0, i] = False + # print(coll_mat) + if not coll_mat.any(): + success_mask[i] = j + box_corners[i] = current_corners + break + return success_mask + + +@numba.njit +def noise_per_box_group(boxes, valid_mask, loc_noises, rot_noises, group_nums): + # WARNING: this function need boxes to be sorted by group id. 
+ # boxes: [N, 5] + # valid_mask: [N] + # loc_noises: [N, M, 3] + # rot_noises: [N, M] + num_groups = group_nums.shape[0] + num_boxes = boxes.shape[0] + num_tests = loc_noises.shape[1] + box_corners = box_np_ops.box2d_to_corner_jit(boxes) + max_group_num = group_nums.max() + current_corners = np.zeros((max_group_num, 4, 2), dtype=boxes.dtype) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + success_mask = -np.ones((num_boxes,), dtype=np.int64) + # print(valid_mask) + idx = 0 + for num in group_nums: + if valid_mask[idx]: + for j in range(num_tests): + for i in range(num): + current_corners[i] = box_corners[i + idx] + current_corners[i] -= boxes[i + idx, :2] + _rotation_box2d_jit_( + current_corners[i], rot_noises[idx + i, j], rot_mat_T + ) + current_corners[i] += ( + boxes[i + idx, :2] + loc_noises[i + idx, j, :2] + ) + coll_mat = box_collision_test( + current_corners[:num].reshape(num, 4, 2), box_corners + ) + for i in range(num): # remove self-coll + coll_mat[i, idx : idx + num] = False + if not coll_mat.any(): + for i in range(num): + success_mask[i + idx] = j + box_corners[i + idx] = current_corners[i] + break + idx += num + return success_mask + + +@numba.njit +def noise_per_box_group_v2_( + boxes, valid_mask, loc_noises, rot_noises, group_nums, global_rot_noises +): + # WARNING: this function need boxes to be sorted by group id. + # boxes: [N, 5] + # valid_mask: [N] + # loc_noises: [N, M, 3] + # rot_noises: [N, M] + num_boxes = boxes.shape[0] + num_tests = loc_noises.shape[1] + box_corners = box_np_ops.box2d_to_corner_jit(boxes) + max_group_num = group_nums.max() + current_box = np.zeros((1, 5), dtype=boxes.dtype) + current_corners = np.zeros((max_group_num, 4, 2), dtype=boxes.dtype) + dst_pos = np.zeros((max_group_num, 2), dtype=boxes.dtype) + + current_grot = np.zeros((max_group_num,), dtype=boxes.dtype) + dst_grot = np.zeros((max_group_num,), dtype=boxes.dtype) + + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + success_mask = -np.ones((num_boxes,), dtype=np.int64) + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners_norm = corners_norm.reshape(4, 2) + + # print(valid_mask) + idx = 0 + for num in group_nums: + if valid_mask[idx]: + for j in range(num_tests): + for i in range(num): + current_box[0, :] = boxes[i + idx] + current_radius = np.sqrt( + current_box[0, 0] ** 2 + current_box[0, 1] ** 2 + ) + current_grot[i] = np.arctan2(current_box[0, 0], current_box[0, 1]) + dst_grot[i] = current_grot[i] + global_rot_noises[idx + i, j] + dst_pos[i, 0] = current_radius * np.sin(dst_grot[i]) + dst_pos[i, 1] = current_radius * np.cos(dst_grot[i]) + current_box[0, :2] = dst_pos[i] + current_box[0, -1] += dst_grot[i] - current_grot[i] + + rot_sin = np.sin(current_box[0, -1]) + rot_cos = np.cos(current_box[0, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + current_corners[i] = ( + current_box[0, 2:4] * corners_norm @ rot_mat_T + + current_box[0, :2] + ) + current_corners[i] -= current_box[0, :2] + + _rotation_box2d_jit_( + current_corners[i], rot_noises[idx + i, j], rot_mat_T + ) + current_corners[i] += ( + current_box[0, :2] + loc_noises[i + idx, j, :2] + ) + coll_mat = box_collision_test( + current_corners[:num].reshape(num, 4, 2), box_corners + ) + for i in range(num): # remove self-coll + coll_mat[i, idx : idx + num] = False + if not coll_mat.any(): + for i in range(num): 
+ success_mask[i + idx] = j + box_corners[i + idx] = current_corners[i] + loc_noises[i + idx, j, :2] += dst_pos[i] - boxes[i + idx, :2] + rot_noises[i + idx, j] += dst_grot[i] - current_grot[i] + break + idx += num + return success_mask + + +@numba.njit +def noise_per_box_v2_(boxes, valid_mask, loc_noises, rot_noises, global_rot_noises): + # boxes: [N, 5] + # valid_mask: [N] + # loc_noises: [N, M, 3] + # rot_noises: [N, M] + num_boxes = boxes.shape[0] + num_tests = loc_noises.shape[1] + box_corners = box_np_ops.box2d_to_corner_jit(boxes) + current_corners = np.zeros((4, 2), dtype=boxes.dtype) + current_box = np.zeros((1, 5), dtype=boxes.dtype) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + dst_pos = np.zeros((2,), dtype=boxes.dtype) + success_mask = -np.ones((num_boxes,), dtype=np.int64) + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners_norm = corners_norm.reshape(4, 2) + for i in range(num_boxes): + if valid_mask[i]: + for j in range(num_tests): + current_box[0, :] = boxes[i] + current_radius = np.sqrt(boxes[i, 0] ** 2 + boxes[i, 1] ** 2) + current_grot = np.arctan2(boxes[i, 0], boxes[i, 1]) + dst_grot = current_grot + global_rot_noises[i, j] + dst_pos[0] = current_radius * np.sin(dst_grot) + dst_pos[1] = current_radius * np.cos(dst_grot) + current_box[0, :2] = dst_pos + current_box[0, -1] += dst_grot - current_grot + + rot_sin = np.sin(current_box[0, -1]) + rot_cos = np.cos(current_box[0, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + current_corners[:] = ( + current_box[0, 2:4] * corners_norm @ rot_mat_T + current_box[0, :2] + ) + current_corners -= current_box[0, :2] + _rotation_box2d_jit_(current_corners, rot_noises[i, j], rot_mat_T) + current_corners += current_box[0, :2] + loc_noises[i, j, :2] + coll_mat = box_collision_test( + current_corners.reshape(1, 4, 2), box_corners + ) + coll_mat[0, i] = False + if not coll_mat.any(): + success_mask[i] = j + box_corners[i] = current_corners + loc_noises[i, j, :2] += dst_pos - boxes[i, :2] + rot_noises[i, j] += dst_grot - current_grot + break + return success_mask + + +@numba.njit +def points_transform_( + points, centers, point_masks, loc_transform, rot_transform, valid_mask +): + num_box = centers.shape[0] + num_points = points.shape[0] + rot_mat_T = np.zeros((num_box, 3, 3), dtype=points.dtype) + for i in range(num_box): + _rotation_matrix_3d_(rot_mat_T[i], rot_transform[i], 2) + for i in range(num_points): + for j in range(num_box): + if valid_mask[j]: + if point_masks[i, j] == 1: + points[i, :3] -= centers[j, :3] + points[i : i + 1, :3] = points[i : i + 1, :3] @ rot_mat_T[j] + points[i, :3] += centers[j, :3] + points[i, :3] += loc_transform[j] + break # only apply first box's transform + + +@numba.njit +def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask): + num_box = boxes.shape[0] + for i in range(num_box): + if valid_mask[i]: + boxes[i, :3] += loc_transform[i] + boxes[i, 6] += rot_transform[i] + + +def _select_transform(transform, indices): + result = np.zeros((transform.shape[0], *transform.shape[2:]), dtype=transform.dtype) + for i in range(transform.shape[0]): + if indices[i] != -1: + result[i] = transform[i, indices[i]] + return result + + +@numba.njit +def group_transform_(loc_noise, rot_noise, locs, rots, group_center, valid_mask): + # loc_noise: [N, M, 3], locs: [N, 3] + # rot_noise: [N, 
M] + # group_center: [N, 3] + num_try = loc_noise.shape[1] + r = 0.0 + x = 0.0 + y = 0.0 + rot_center = 0.0 + for i in range(loc_noise.shape[0]): + if valid_mask[i]: + x = locs[i, 0] - group_center[i, 0] + y = locs[i, 1] - group_center[i, 1] + r = np.sqrt(x ** 2 + y ** 2) + # calculate rots related to group center + rot_center = np.arctan2(x, y) + for j in range(num_try): + loc_noise[i, j, 0] += r * ( + np.sin(rot_center + rot_noise[i, j]) - np.sin(rot_center) + ) + loc_noise[i, j, 1] += r * ( + np.cos(rot_center + rot_noise[i, j]) - np.cos(rot_center) + ) + + +@numba.njit +def group_transform_v2_( + loc_noise, rot_noise, locs, rots, group_center, grot_noise, valid_mask +): + # loc_noise: [N, M, 3], locs: [N, 3] + # rot_noise: [N, M] + # group_center: [N, 3] + num_try = loc_noise.shape[1] + r = 0.0 + x = 0.0 + y = 0.0 + rot_center = 0.0 + for i in range(loc_noise.shape[0]): + if valid_mask[i]: + x = locs[i, 0] - group_center[i, 0] + y = locs[i, 1] - group_center[i, 1] + r = np.sqrt(x ** 2 + y ** 2) + # calculate rots related to group center + rot_center = np.arctan2(x, y) + for j in range(num_try): + loc_noise[i, j, 0] += r * ( + np.sin(rot_center + rot_noise[i, j] + grot_noise[i, j]) + - np.sin(rot_center + grot_noise[i, j]) + ) + loc_noise[i, j, 1] += r * ( + np.cos(rot_center + rot_noise[i, j] + grot_noise[i, j]) + - np.cos(rot_center + grot_noise[i, j]) + ) + + +def set_group_noise_same_(loc_noise, rot_noise, group_ids): + gid_to_index_dict = {} + for i, gid in enumerate(group_ids): + if gid not in gid_to_index_dict: + gid_to_index_dict[gid] = i + for i in range(loc_noise.shape[0]): + loc_noise[i] = loc_noise[gid_to_index_dict[group_ids[i]]] + rot_noise[i] = rot_noise[gid_to_index_dict[group_ids[i]]] + + +def set_group_noise_same_v2_(loc_noise, rot_noise, grot_noise, group_ids): + gid_to_index_dict = {} + for i, gid in enumerate(group_ids): + if gid not in gid_to_index_dict: + gid_to_index_dict[gid] = i + for i in range(loc_noise.shape[0]): + loc_noise[i] = loc_noise[gid_to_index_dict[group_ids[i]]] + rot_noise[i] = rot_noise[gid_to_index_dict[group_ids[i]]] + grot_noise[i] = grot_noise[gid_to_index_dict[group_ids[i]]] + + +def get_group_center(locs, group_ids): + num_groups = 0 + group_centers = np.zeros_like(locs) + group_centers_ret = np.zeros_like(locs) + group_id_dict = {} + group_id_num_dict = OrderedDict() + for i, gid in enumerate(group_ids): + if gid >= 0: + if gid in group_id_dict: + group_centers[group_id_dict[gid]] += locs[i] + group_id_num_dict[gid] += 1 + else: + group_id_dict[gid] = num_groups + num_groups += 1 + group_id_num_dict[gid] = 1 + group_centers[group_id_dict[gid]] = locs[i] + for i, gid in enumerate(group_ids): + group_centers_ret[i] = ( + group_centers[group_id_dict[gid]] / group_id_num_dict[gid] + ) + return group_centers_ret, group_id_num_dict + + +def noise_per_object_v3_( + gt_boxes, + points=None, + valid_mask=None, + rotation_perturb=np.pi / 4, + center_noise_std=1.0, + global_random_rot_range=np.pi / 4, + num_try=5, + group_ids=None, +): + """random rotate or remove each groundtrutn independently. + use kitti viewer to test this function points_transform_ + + Args: + gt_boxes: [N, 7], gt box in lidar.points_transform_ + points: [M, 4], point cloud in lidar. 
+ """ + num_boxes = gt_boxes.shape[0] + if not isinstance(rotation_perturb, (list, tuple, np.ndarray)): + rotation_perturb = [-rotation_perturb, rotation_perturb] + if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)): + global_random_rot_range = [-global_random_rot_range, global_random_rot_range] + enable_grot = ( + np.abs(global_random_rot_range[0] - global_random_rot_range[1]) >= 1e-3 + ) + if not isinstance(center_noise_std, (list, tuple, np.ndarray)): + center_noise_std = [center_noise_std, center_noise_std, center_noise_std] + if valid_mask is None: + valid_mask = np.ones((num_boxes,), dtype=np.bool_) + center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype) + loc_noises = np.random.normal(scale=center_noise_std, size=[num_boxes, num_try, 3]) + # loc_noises = np.random.uniform( + # -center_noise_std, center_noise_std, size=[num_boxes, num_try, 3]) + rot_noises = np.random.uniform( + rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try] + ) + gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1]) + grot_lowers = global_random_rot_range[0] - gt_grots + grot_uppers = global_random_rot_range[1] - gt_grots + global_rot_noises = np.random.uniform( + grot_lowers[..., np.newaxis], + grot_uppers[..., np.newaxis], + size=[num_boxes, num_try], + ) + if group_ids is not None: + if enable_grot: + set_group_noise_same_v2_( + loc_noises, rot_noises, global_rot_noises, group_ids + ) + else: + set_group_noise_same_(loc_noises, rot_noises, group_ids) + group_centers, group_id_num_dict = get_group_center(gt_boxes[:, :3], group_ids) + if enable_grot: + group_transform_v2_( + loc_noises, + rot_noises, + gt_boxes[:, :3], + gt_boxes[:, 6], + group_centers, + global_rot_noises, + valid_mask, + ) + else: + group_transform_( + loc_noises, + rot_noises, + gt_boxes[:, :3], + gt_boxes[:, 6], + group_centers, + valid_mask, + ) + group_nums = np.array(list(group_id_num_dict.values()), dtype=np.int64) + + origin = [0.5, 0.5, 0.5] + gt_box_corners = box_np_ops.center_to_corner_box3d( + gt_boxes[:, :3], gt_boxes[:, 3:6], gt_boxes[:, 6], origin=origin, axis=2 + ) + if group_ids is not None: + if not enable_grot: + selected_noise = noise_per_box_group( + gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, + loc_noises, + rot_noises, + group_nums, + ) + else: + selected_noise = noise_per_box_group_v2_( + gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, + loc_noises, + rot_noises, + group_nums, + global_rot_noises, + ) + else: + if not enable_grot: + selected_noise = noise_per_box( + gt_boxes[:, [0, 1, 3, 4, 6]], valid_mask, loc_noises, rot_noises + ) + else: + selected_noise = noise_per_box_v2_( + gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, + loc_noises, + rot_noises, + global_rot_noises, + ) + loc_transforms = _select_transform(loc_noises, selected_noise) + rot_transforms = _select_transform(rot_noises, selected_noise) + surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners) + if points is not None: + point_masks = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) + points_transform_( + points, + gt_boxes[:, :3], + point_masks, + loc_transforms, + rot_transforms, + valid_mask, + ) + + box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask) + + +def noise_per_object_v2_( + gt_boxes, + points=None, + valid_mask=None, + rotation_perturb=np.pi / 4, + center_noise_std=1.0, + global_random_rot_range=np.pi / 4, + num_try=100, +): + """random rotate or remove each groundtrutn independently. 
+ use kitti viewer to test this function points_transform_ + + Args: + gt_boxes: [N, 7], gt box in lidar.points_transform_ + points: [M, 4], point cloud in lidar. + """ + num_boxes = gt_boxes.shape[0] + if not isinstance(rotation_perturb, (list, tuple, np.ndarray)): + rotation_perturb = [-rotation_perturb, rotation_perturb] + if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)): + global_random_rot_range = [-global_random_rot_range, global_random_rot_range] + + if not isinstance(center_noise_std, (list, tuple, np.ndarray)): + center_noise_std = [center_noise_std, center_noise_std, center_noise_std] + if valid_mask is None: + valid_mask = np.ones((num_boxes,), dtype=np.bool_) + center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype) + loc_noises = np.random.normal(scale=center_noise_std, size=[num_boxes, num_try, 3]) + # loc_noises = np.random.uniform( + # -center_noise_std, center_noise_std, size=[num_boxes, num_try, 3]) + rot_noises = np.random.uniform( + rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try] + ) + gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1]) + grot_lowers = global_random_rot_range[0] - gt_grots + grot_uppers = global_random_rot_range[1] - gt_grots + global_rot_noises = np.random.uniform( + grot_lowers[..., np.newaxis], + grot_uppers[..., np.newaxis], + size=[num_boxes, num_try], + ) + + origin = [0.5, 0.5, 0] + gt_box_corners = box_np_ops.center_to_corner_box3d( + gt_boxes[:, :3], gt_boxes[:, 3:6], gt_boxes[:, 6], origin=origin, axis=2 + ) + if np.abs(global_random_rot_range[0] - global_random_rot_range[1]) < 1e-3: + selected_noise = noise_per_box( + gt_boxes[:, [0, 1, 3, 4, 6]], valid_mask, loc_noises, rot_noises + ) + else: + selected_noise = noise_per_box_v2_( + gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, + loc_noises, + rot_noises, + global_rot_noises, + ) + loc_transforms = _select_transform(loc_noises, selected_noise) + rot_transforms = _select_transform(rot_noises, selected_noise) + if points is not None: + surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners) + point_masks = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) + points_transform_( + points, + gt_boxes[:, :3], + point_masks, + loc_transforms, + rot_transforms, + valid_mask, + ) + + box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask) + + +def global_scaling(gt_boxes, points, scale=0.05): + if not isinstance(scale, list): + scale = [-scale, scale] + noise_scale = np.random.uniform(scale[0] + 1, scale[1] + 1) + points[:, :3] *= noise_scale + gt_boxes[:, :6] *= noise_scale + return gt_boxes, points + + +def global_rotation(gt_boxes, points, rotation=np.pi / 4): + if not isinstance(rotation, list): + rotation = [-rotation, rotation] + noise_rotation = np.random.uniform(rotation[0], rotation[1]) + points[:, :3] = box_np_ops.rotation_points_single_angle( + points[:, :3], noise_rotation, axis=2 + ) + gt_boxes[:, :3] = box_np_ops.rotation_points_single_angle( + gt_boxes[:, :3], noise_rotation, axis=2 + ) + if gt_boxes.shape[1] > 7: + gt_boxes[:, 6:8] = box_np_ops.rotation_points_single_angle( + np.hstack([gt_boxes[:, 6:8], np.zeros((gt_boxes.shape[0], 1))]), + noise_rotation, + axis=2, + )[:, :2] + gt_boxes[:, -1] += noise_rotation + return gt_boxes, points + + +def random_flip(gt_boxes, points, probability=0.5): + enable = np.random.choice( + [False, True], replace=False, p=[1 - probability, probability] + ) + if enable: + gt_boxes[:, 1] = -gt_boxes[:, 1] + gt_boxes[:, -1] = -gt_boxes[:, -1] + np.pi + points[:, 1] = 
-points[:, 1] + if gt_boxes.shape[1] > 7: # y axis: x, y, z, w, h, l, vx, vy, r + gt_boxes[:, 7] = -gt_boxes[:, 7] + return gt_boxes, points + +def random_flip_both(gt_boxes, points, probability=0.5, flip_coor=None): + # x flip + enable = np.random.choice( + [False, True], replace=False, p=[1 - probability, probability] + ) + if enable: + gt_boxes[:, 1] = -gt_boxes[:, 1] + gt_boxes[:, -1] = -gt_boxes[:, -1] + np.pi + points[:, 1] = -points[:, 1] + if gt_boxes.shape[1] > 7: # y axis: x, y, z, w, h, l, vx, vy, r + gt_boxes[:, 7] = -gt_boxes[:, 7] + + # y flip + enable = np.random.choice( + [False, True], replace=False, p=[1 - probability, probability] + ) + if enable: + if flip_coor is None: + gt_boxes[:, 0] = -gt_boxes[:, 0] + points[:, 0] = -points[:, 0] + else: + gt_boxes[:, 0] = flip_coor * 2 - gt_boxes[:, 0] + points[:, 0] = flip_coor * 2 - points[:, 0] + + gt_boxes[:, -1] = -gt_boxes[:, -1] + 2*np.pi # TODO: CHECK THIS + + if gt_boxes.shape[1] > 7: # y axis: x, y, z, w, h, l, vx, vy, r + gt_boxes[:, 6] = -gt_boxes[:, 6] + + return gt_boxes, points + + +def global_scaling_v2(gt_boxes, points, min_scale=0.95, max_scale=1.05): + noise_scale = np.random.uniform(min_scale, max_scale) + points[:, :3] *= noise_scale + gt_boxes[:, :-1] *= noise_scale + return gt_boxes, points + + +def global_rotation_v2(gt_boxes, points, min_rad=-np.pi / 4, max_rad=np.pi / 4): + noise_rotation = np.random.uniform(min_rad, max_rad) + points[:, :3] = box_np_ops.rotation_points_single_angle( + points[:, :3], noise_rotation, axis=2 + ) + gt_boxes[:, :3] = box_np_ops.rotation_points_single_angle( + gt_boxes[:, :3], noise_rotation, axis=2 + ) + gt_boxes[:, -1] += noise_rotation + return gt_boxes, points + + +@numba.jit(nopython=True) +def box_collision_test(boxes, qboxes, clockwise=True): + N = boxes.shape[0] + K = qboxes.shape[0] + ret = np.zeros((N, K), dtype=np.bool_) + slices = np.array([1, 2, 3, 0]) + lines_boxes = np.stack( + (boxes, boxes[:, slices, :]), axis=2 + ) # [N, 4, 2(line), 2(xy)] + lines_qboxes = np.stack((qboxes, qboxes[:, slices, :]), axis=2) + # vec = np.zeros((2,), dtype=boxes.dtype) + boxes_standup = box_np_ops.corner_to_standup_nd_jit(boxes) + qboxes_standup = box_np_ops.corner_to_standup_nd_jit(qboxes) + for i in range(N): + for j in range(K): + # calculate standup first + iw = min(boxes_standup[i, 2], qboxes_standup[j, 2]) - max( + boxes_standup[i, 0], qboxes_standup[j, 0] + ) + if iw > 0: + ih = min(boxes_standup[i, 3], qboxes_standup[j, 3]) - max( + boxes_standup[i, 1], qboxes_standup[j, 1] + ) + if ih > 0: + for k in range(4): + for l in range(4): + A = lines_boxes[i, k, 0] + B = lines_boxes[i, k, 1] + C = lines_qboxes[j, l, 0] + D = lines_qboxes[j, l, 1] + acd = (D[1] - A[1]) * (C[0] - A[0]) > (C[1] - A[1]) * ( + D[0] - A[0] + ) + bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * ( + D[0] - B[0] + ) + if acd != bcd: + abc = (C[1] - A[1]) * (B[0] - A[0]) > (B[1] - A[1]) * ( + C[0] - A[0] + ) + abd = (D[1] - A[1]) * (B[0] - A[0]) > (B[1] - A[1]) * ( + D[0] - A[0] + ) + if abc != abd: + ret[i, j] = True # collision. + break + if ret[i, j] is True: + break + if ret[i, j] is False: + # now check complete overlap. 
+ # box overlap qbox: + box_overlap_qbox = True + for l in range(4): # point l in qboxes + for k in range(4): # corner k in boxes + vec = boxes[i, k] - boxes[i, (k + 1) % 4] + if clockwise: + vec = -vec + cross = vec[1] * (boxes[i, k, 0] - qboxes[j, l, 0]) + cross -= vec[0] * (boxes[i, k, 1] - qboxes[j, l, 1]) + if cross >= 0: + box_overlap_qbox = False + break + if box_overlap_qbox is False: + break + + if box_overlap_qbox is False: + qbox_overlap_box = True + for l in range(4): # point l in boxes + for k in range(4): # corner k in qboxes + vec = qboxes[j, k] - qboxes[j, (k + 1) % 4] + if clockwise: + vec = -vec + cross = vec[1] * (qboxes[j, k, 0] - boxes[i, l, 0]) + cross -= vec[0] * (qboxes[j, k, 1] - boxes[i, l, 1]) + if cross >= 0: # + qbox_overlap_box = False + break + if qbox_overlap_box is False: + break + if qbox_overlap_box: + ret[i, j] = True # collision. + else: + ret[i, j] = True # collision. + return ret + + +def global_translate_(gt_boxes, points, noise_translate_std): + """ + Apply global translation to gt_boxes and points. + """ + + if not isinstance(noise_translate_std, (list, tuple, np.ndarray)): + noise_translate_std = np.array( + [noise_translate_std, noise_translate_std, noise_translate_std] + ) + if all([e == 0 for e in noise_translate_std]): + return gt_boxes, points + noise_translate = np.array( + [ + np.random.normal(0, noise_translate_std[0], 1), + np.random.normal(0, noise_translate_std[1], 1), + np.random.normal(0, noise_translate_std[0], 1), + ] + ).T + + points[:, :3] += noise_translate + gt_boxes[:, :3] += noise_translate + + return gt_boxes, points + + +if __name__ == "__main__": + bboxes = np.array( + [ + [0.0, 0.0, 0.5, 0.5], + [0.2, 0.2, 0.6, 0.6], + [0.7, 0.7, 0.9, 0.9], + [0.55, 0.55, 0.8, 0.8], + ] + ) + bbox_corners = box_np_ops.minmax_to_corner_2d(bboxes) + print(bbox_corners.shape) + print(box_collision_test(bbox_corners, bbox_corners)) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/ops/point_cloud/point_cloud_ops.py b/cv/3d_detection/centerpoint/pytorch/det3d/ops/point_cloud/point_cloud_ops.py index 358350857..9404cf163 100644 --- a/cv/3d_detection/centerpoint/pytorch/det3d/ops/point_cloud/point_cloud_ops.py +++ b/cv/3d_detection/centerpoint/pytorch/det3d/ops/point_cloud/point_cloud_ops.py @@ -1,202 +1,202 @@ -import time - -import numba -import numpy as np - - -@numba.jit(nopython=True) -def _points_to_voxel_reverse_kernel( - points, - voxel_size, - coors_range, - num_points_per_voxel, - coor_to_voxelidx, - voxels, - coors, - max_points=35, - max_voxels=20000, -): - # put all computations to one loop. 
- # we shouldn't create large array in main jit code, otherwise - # reduce performance - N = points.shape[0] - # ndim = points.shape[1] - 1 - ndim = 3 - ndim_minus_1 = ndim - 1 - grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size - # np.round(grid_size) - # grid_size = np.round(grid_size).astype(np.int64)(np.int32) - grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) - coor = np.zeros(shape=(3,), dtype=np.int32) - voxel_num = 0 - failed = False - for i in range(N): - failed = False - for j in range(ndim): - c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) - if c < 0 or c >= grid_size[j]: - failed = True - break - coor[ndim_minus_1 - j] = c - if failed: - continue - voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] - if voxelidx == -1: - voxelidx = voxel_num - if voxel_num >= max_voxels: - continue - voxel_num += 1 - coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx - coors[voxelidx] = coor - num = num_points_per_voxel[voxelidx] - if num < max_points: - voxels[voxelidx, num] = points[i] - num_points_per_voxel[voxelidx] += 1 - return voxel_num - - -@numba.jit(nopython=True) -def _points_to_voxel_kernel( - points, - voxel_size, - coors_range, - num_points_per_voxel, - coor_to_voxelidx, - voxels, - coors, - max_points=35, - max_voxels=20000, -): - # need mutex if write in cuda, but numba.cuda don't support mutex. - # in addition, pytorch don't support cuda in dataloader(tensorflow support this). - # put all computations to one loop. - # we shouldn't create large array in main jit code, otherwise - # decrease performance - N = points.shape[0] - # ndim = points.shape[1] - 1 - ndim = 3 - grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size - # grid_size = np.round(grid_size).astype(np.int64)(np.int32) - grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) - - lower_bound = coors_range[:3] - upper_bound = coors_range[3:] - coor = np.zeros(shape=(3,), dtype=np.int32) - voxel_num = 0 - failed = False - for i in range(N): - failed = False - for j in range(ndim): - c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) - if c < 0 or c >= grid_size[j]: - failed = True - break - coor[j] = c - if failed: - continue - voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] - if voxelidx == -1: - voxelidx = voxel_num - if voxel_num >= max_voxels: - continue - voxel_num += 1 - coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx - coors[voxelidx] = coor - num = num_points_per_voxel[voxelidx] - if num < max_points: - voxels[voxelidx, num] = points[i] - num_points_per_voxel[voxelidx] += 1 - return voxel_num - - -def points_to_voxel( - points, voxel_size, coors_range, max_points=35, reverse_index=True, max_voxels=20000 -): - """convert kitti points(N, >=3) to voxels. This version calculate - everything in one loop. now it takes only 4.2ms(complete point cloud) - with jit and 3.2ghz cpu.(don't calculate other features) - Note: this function in ubuntu seems faster than windows 10. - - Args: - points: [N, ndim] float tensor. points[:, :3] contain xyz points and - points[:, 3:] contain other information such as reflectivity. - voxel_size: [3] list/tuple or array, float. xyz, indicate voxel size - coors_range: [6] list/tuple or array, float. indicate voxel range. - format: xyzxyz, minmax - max_points: int. indicate maximum points contained in a voxel. - reverse_index: boolean. indicate whether return reversed coordinates. 
- if points has xyz format and reverse_index is True, output - coordinates will be zyx format, but points in features always - xyz format. - max_voxels: int. indicate maximum voxels this function create. - for second, 20000 is a good choice. you should shuffle points - before call this function because max_voxels may drop some points. - - Returns: - voxels: [M, max_points, ndim] float tensor. only contain points. - coordinates: [M, 3] int32 tensor. - num_points_per_voxel: [M] int32 tensor. - """ - if not isinstance(voxel_size, np.ndarray): - voxel_size = np.array(voxel_size, dtype=points.dtype) - if not isinstance(coors_range, np.ndarray): - coors_range = np.array(coors_range, dtype=points.dtype) - voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size - voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist()) - if reverse_index: - voxelmap_shape = voxelmap_shape[::-1] - # don't create large array in jit(nopython=True) code. - num_points_per_voxel = np.zeros(shape=(max_voxels,), dtype=np.int32) - coor_to_voxelidx = -np.ones(shape=voxelmap_shape, dtype=np.int32) - voxels = np.zeros( - shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype - ) - coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32) - if reverse_index: - voxel_num = _points_to_voxel_reverse_kernel( - points, - voxel_size, - coors_range, - num_points_per_voxel, - coor_to_voxelidx, - voxels, - coors, - max_points, - max_voxels, - ) - - else: - voxel_num = _points_to_voxel_kernel( - points, - voxel_size, - coors_range, - num_points_per_voxel, - coor_to_voxelidx, - voxels, - coors, - max_points, - max_voxels, - ) - - coors = coors[:voxel_num] - voxels = voxels[:voxel_num] - num_points_per_voxel = num_points_per_voxel[:voxel_num] - return voxels, coors, num_points_per_voxel - - -@numba.jit(nopython=True) -def bound_points_jit(points, upper_bound, lower_bound): - # to use nopython=True, np.bool is not supported. so you need - # convert result to np.bool after this function. - N = points.shape[0] - ndim = points.shape[1] - keep_indices = np.zeros((N,), dtype=np.int32) - success = 0 - for i in range(N): - success = 1 - for j in range(ndim): - if points[i, j] < lower_bound[j] or points[i, j] >= upper_bound[j]: - success = 0 - break - keep_indices[i] = success - return keep_indices +import time + +import numba +import numpy as np + + +@numba.jit(nopython=True) +def _points_to_voxel_reverse_kernel( + points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points=35, + max_voxels=20000, +): + # put all computations to one loop. 
+ # we shouldn't create large array in main jit code, otherwise + # reduce performance + N = points.shape[0] + # ndim = points.shape[1] - 1 + ndim = 3 + ndim_minus_1 = ndim - 1 + grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size + # np.round(grid_size) + # grid_size = np.round(grid_size).astype(np.int64)(np.int32) + grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) + coor = np.zeros(shape=(3,), dtype=np.int32) + voxel_num = 0 + failed = False + for i in range(N): + failed = False + for j in range(ndim): + c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) + if c < 0 or c >= grid_size[j]: + failed = True + break + coor[ndim_minus_1 - j] = c + if failed: + continue + voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] + if voxelidx == -1: + voxelidx = voxel_num + if voxel_num >= max_voxels: + continue + voxel_num += 1 + coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx + coors[voxelidx] = coor + num = num_points_per_voxel[voxelidx] + if num < max_points: + voxels[voxelidx, num] = points[i] + num_points_per_voxel[voxelidx] += 1 + return voxel_num + + +@numba.jit(nopython=True) +def _points_to_voxel_kernel( + points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points=35, + max_voxels=20000, +): + # need mutex if write in cuda, but numba.cuda don't support mutex. + # in addition, pytorch don't support cuda in dataloader(tensorflow support this). + # put all computations to one loop. + # we shouldn't create large array in main jit code, otherwise + # decrease performance + N = points.shape[0] + # ndim = points.shape[1] - 1 + ndim = 3 + grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size + # grid_size = np.round(grid_size).astype(np.int64)(np.int32) + grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) + + lower_bound = coors_range[:3] + upper_bound = coors_range[3:] + coor = np.zeros(shape=(3,), dtype=np.int32) + voxel_num = 0 + failed = False + for i in range(N): + failed = False + for j in range(ndim): + c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) + if c < 0 or c >= grid_size[j]: + failed = True + break + coor[j] = c + if failed: + continue + voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] + if voxelidx == -1: + voxelidx = voxel_num + if voxel_num >= max_voxels: + continue + voxel_num += 1 + coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx + coors[voxelidx] = coor + num = num_points_per_voxel[voxelidx] + if num < max_points: + voxels[voxelidx, num] = points[i] + num_points_per_voxel[voxelidx] += 1 + return voxel_num + + +def points_to_voxel( + points, voxel_size, coors_range, max_points=35, reverse_index=True, max_voxels=20000 +): + """convert kitti points(N, >=3) to voxels. This version calculate + everything in one loop. now it takes only 4.2ms(complete point cloud) + with jit and 3.2ghz cpu.(don't calculate other features) + Note: this function in ubuntu seems faster than windows 10. + + Args: + points: [N, ndim] float tensor. points[:, :3] contain xyz points and + points[:, 3:] contain other information such as reflectivity. + voxel_size: [3] list/tuple or array, float. xyz, indicate voxel size + coors_range: [6] list/tuple or array, float. indicate voxel range. + format: xyzxyz, minmax + max_points: int. indicate maximum points contained in a voxel. + reverse_index: boolean. indicate whether return reversed coordinates. 
+ if points has xyz format and reverse_index is True, output + coordinates will be zyx format, but points in features always + xyz format. + max_voxels: int. indicate maximum voxels this function create. + for second, 20000 is a good choice. you should shuffle points + before call this function because max_voxels may drop some points. + + Returns: + voxels: [M, max_points, ndim] float tensor. only contain points. + coordinates: [M, 3] int32 tensor. + num_points_per_voxel: [M] int32 tensor. + """ + if not isinstance(voxel_size, np.ndarray): + voxel_size = np.array(voxel_size, dtype=points.dtype) + if not isinstance(coors_range, np.ndarray): + coors_range = np.array(coors_range, dtype=points.dtype) + voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size + voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist()) + if reverse_index: + voxelmap_shape = voxelmap_shape[::-1] + # don't create large array in jit(nopython=True) code. + num_points_per_voxel = np.zeros(shape=(max_voxels,), dtype=np.int32) + coor_to_voxelidx = -np.ones(shape=voxelmap_shape, dtype=np.int32) + voxels = np.zeros( + shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype + ) + coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32) + if reverse_index: + voxel_num = _points_to_voxel_reverse_kernel( + points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points, + max_voxels, + ) + + else: + voxel_num = _points_to_voxel_kernel( + points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points, + max_voxels, + ) + + coors = coors[:voxel_num] + voxels = voxels[:voxel_num] + num_points_per_voxel = num_points_per_voxel[:voxel_num] + return voxels, coors, num_points_per_voxel + + +@numba.jit(nopython=True) +def bound_points_jit(points, upper_bound, lower_bound): + # to use nopython=True, np.bool is not supported. so you need + # convert result to np.bool after this function. + N = points.shape[0] + ndim = points.shape[1] + keep_indices = np.zeros((N,), dtype=np.int32) + success = 0 + for i in range(N): + success = 1 + for j in range(ndim): + if points[i, j] < lower_bound[j] or points[i, j] >= upper_bound[j]: + success = 0 + break + keep_indices[i] = success + return keep_indices diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/make.bat b/cv/3d_detection/centerpoint/pytorch/numba/docs/make.bat index 29b481d88..b08ce1c9d 100644 --- a/cv/3d_detection/centerpoint/pytorch/numba/docs/make.bat +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/make.bat @@ -1,242 +1,242 @@ -@ECHO OFF - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set BUILDDIR=_build -set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source -set I18NSPHINXOPTS=%SPHINXOPTS% source -if NOT "%PAPER%" == "" ( - set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% - set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% -) - -if "%1" == "" goto help - -if "%1" == "help" ( - :help - echo.Please use `make ^` where ^ is one of - echo. html to make standalone HTML files - echo. dirhtml to make HTML files named index.html in directories - echo. singlehtml to make a single large HTML file - echo. pickle to make pickle files - echo. json to make JSON files - echo. htmlhelp to make HTML files and a HTML help project - echo. qthelp to make HTML files and a qthelp project - echo. devhelp to make HTML files and a Devhelp project - echo. 
epub to make an epub - echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter - echo. text to make text files - echo. man to make manual pages - echo. texinfo to make Texinfo files - echo. gettext to make PO message catalogs - echo. changes to make an overview over all changed/added/deprecated items - echo. xml to make Docutils-native XML files - echo. pseudoxml to make pseudoxml-XML files for display purposes - echo. linkcheck to check all external links for integrity - echo. doctest to run all doctests embedded in the documentation if enabled - goto end -) - -if "%1" == "clean" ( - for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i - del /q /s %BUILDDIR%\* - goto end -) - - -%SPHINXBUILD% 2> nul -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "html" ( - %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/html. - goto end -) - -if "%1" == "dirhtml" ( - %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. - goto end -) - -if "%1" == "singlehtml" ( - %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. - goto end -) - -if "%1" == "pickle" ( - %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can process the pickle files. - goto end -) - -if "%1" == "json" ( - %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can process the JSON files. - goto end -) - -if "%1" == "htmlhelp" ( - %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can run HTML Help Workshop with the ^ -.hhp project file in %BUILDDIR%/htmlhelp. - goto end -) - -if "%1" == "qthelp" ( - %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can run "qcollectiongenerator" with the ^ -.qhcp project file in %BUILDDIR%/qthelp, like this: - echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Numba.qhcp - echo.To view the help file: - echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Numba.ghc - goto end -) - -if "%1" == "devhelp" ( - %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. - goto end -) - -if "%1" == "epub" ( - %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The epub file is in %BUILDDIR%/epub. - goto end -) - -if "%1" == "latex" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "latexpdf" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - cd %BUILDDIR%/latex - make all-pdf - cd %BUILDDIR%/.. - echo. - echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
- goto end -) - -if "%1" == "latexpdfja" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - cd %BUILDDIR%/latex - make all-pdf-ja - cd %BUILDDIR%/.. - echo. - echo.Build finished; the PDF files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "text" ( - %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The text files are in %BUILDDIR%/text. - goto end -) - -if "%1" == "man" ( - %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The manual pages are in %BUILDDIR%/man. - goto end -) - -if "%1" == "texinfo" ( - %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. - goto end -) - -if "%1" == "gettext" ( - %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The message catalogs are in %BUILDDIR%/locale. - goto end -) - -if "%1" == "changes" ( - %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes - if errorlevel 1 exit /b 1 - echo. - echo.The overview file is in %BUILDDIR%/changes. - goto end -) - -if "%1" == "linkcheck" ( - %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck - if errorlevel 1 exit /b 1 - echo. - echo.Link check complete; look for any errors in the above output ^ -or in %BUILDDIR%/linkcheck/output.txt. - goto end -) - -if "%1" == "doctest" ( - %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest - if errorlevel 1 exit /b 1 - echo. - echo.Testing of doctests in the sources finished, look at the ^ -results in %BUILDDIR%/doctest/output.txt. - goto end -) - -if "%1" == "xml" ( - %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The XML files are in %BUILDDIR%/xml. - goto end -) - -if "%1" == "pseudoxml" ( - %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. - goto end -) - -:end +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source +set I18NSPHINXOPTS=%SPHINXOPTS% source +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. 
doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Numba.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Numba.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. 
+ goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/cv/3d_detection/centerpoint/pytorch/tools/create_data.py b/cv/3d_detection/centerpoint/pytorch/tools/create_data.py index a0c5670de..a9adcf658 100644 --- a/cv/3d_detection/centerpoint/pytorch/tools/create_data.py +++ b/cv/3d_detection/centerpoint/pytorch/tools/create_data.py @@ -1,35 +1,35 @@ -import copy -from pathlib import Path -import pickle - -import fire, os - -from det3d.datasets.nuscenes import nusc_common as nu_ds -from det3d.datasets.utils.create_gt_database import create_groundtruth_database -from det3d.datasets.waymo import waymo_common as waymo_ds - -def nuscenes_data_prep(root_path, version, nsweeps=10, filter_zero=True, virtual=False): - nu_ds.create_nuscenes_infos(root_path, version=version, nsweeps=nsweeps, filter_zero=filter_zero) - if version == 'v1.0-trainval': - create_groundtruth_database( - "NUSC", - root_path, - Path(root_path) / "infos_train_{:02d}sweeps_withvelo_filter_{}.pkl".format(nsweeps, filter_zero), - nsweeps=nsweeps, - virtual=virtual - ) - -def waymo_data_prep(root_path, split, nsweeps=1): - waymo_ds.create_waymo_infos(root_path, split=split, nsweeps=nsweeps) - if split == 'train': - create_groundtruth_database( - "WAYMO", - root_path, - Path(root_path) / "infos_train_{:02d}sweeps_filter_zero_gt.pkl".format(nsweeps), - used_classes=['VEHICLE', 'CYCLIST', 'PEDESTRIAN'], - nsweeps=nsweeps - ) - - -if __name__ == "__main__": - fire.Fire() +import copy +from pathlib import Path +import pickle + +import fire, os + +from det3d.datasets.nuscenes import nusc_common as nu_ds +from det3d.datasets.utils.create_gt_database import create_groundtruth_database +from det3d.datasets.waymo import waymo_common as waymo_ds + +def nuscenes_data_prep(root_path, version, nsweeps=10, filter_zero=True, virtual=False): + nu_ds.create_nuscenes_infos(root_path, version=version, 
nsweeps=nsweeps, filter_zero=filter_zero) + if version == 'v1.0-trainval': + create_groundtruth_database( + "NUSC", + root_path, + Path(root_path) / "infos_train_{:02d}sweeps_withvelo_filter_{}.pkl".format(nsweeps, filter_zero), + nsweeps=nsweeps, + virtual=virtual + ) + +def waymo_data_prep(root_path, split, nsweeps=1): + waymo_ds.create_waymo_infos(root_path, split=split, nsweeps=nsweeps) + if split == 'train': + create_groundtruth_database( + "WAYMO", + root_path, + Path(root_path) / "infos_train_{:02d}sweeps_filter_zero_gt.pkl".format(nsweeps), + used_classes=['VEHICLE', 'CYCLIST', 'PEDESTRIAN'], + nsweeps=nsweeps + ) + + +if __name__ == "__main__": + fire.Fire() diff --git a/cv/detection/ssd/tensorflow/ci/prepare.sh b/cv/detection/ssd/tensorflow/ci/prepare.sh index 363671f6a..a8fec8183 100644 --- a/cv/detection/ssd/tensorflow/ci/prepare.sh +++ b/cv/detection/ssd/tensorflow/ci/prepare.sh @@ -1,35 +1,35 @@ -#!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -x -## install libGL -ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') -if [[ ${ID} == "ubuntu" ]]; then - apt install -y libgl1-mesa-glx -elif [[ ${ID} == "centos" ]]; then - yum install -y mesa-libGL -else - echo "Not Support Os" -fi - -pip3 install tf_slim -cd dataset -mkdir tfrecords -ln -s /mnt/deepspark/data/datasets/VOC2012_sample ./ -python3 convert_voc_sample_tfrecords.py --dataset_directory=./ --output_directory=tfrecords --train_splits VOC2012_sample --validation_splits VOC2012_sample -cd .. -ln -s /mnt/deepspark/data/checkpoints/ssd-vgg16 ./model +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +## install libGL +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install tf_slim +cd dataset +mkdir tfrecords +ln -s /mnt/deepspark/data/datasets/VOC2012_sample ./ +python3 convert_voc_sample_tfrecords.py --dataset_directory=./ --output_directory=tfrecords --train_splits VOC2012_sample --validation_splits VOC2012_sample +cd .. 
+ln -s /mnt/deepspark/data/checkpoints/ssd-vgg16 ./model python3 train_ssd.py --batch_size 16 \ No newline at end of file diff --git a/cv/face_recognition/arcface/pytorch/arcface.py b/cv/face_recognition/arcface/pytorch/arcface.py index 8991d519a..570069cfc 100755 --- a/cv/face_recognition/arcface/pytorch/arcface.py +++ b/cv/face_recognition/arcface/pytorch/arcface.py @@ -1,141 +1,141 @@ -import matplotlib.pyplot as plt -import numpy as np -import torch -import torch.backends.cudnn as cudnn - -from nets.arcface import Arcface as arcface -from utils.utils import preprocess_input, resize_image, show_config - - -class Arcface(object): - _defaults = { - #--------------------------------------------------------------------------# - # 使用自己训练好的模型进行预测要修改model_path,指向logs文件夹下的权值文件 - # 训练好后logs文件夹下存在多个权值文件,选择验证集损失较低的即可。 - # 验证集损失较低不代表准确度较高,仅代表该权值在验证集上泛化性能较好。 - #--------------------------------------------------------------------------# - "model_path" : "model_data/arcface_mobilefacenet.pth", - #-------------------------------------------# - # 输入图片的大小。 - #-------------------------------------------# - "input_shape" : [112, 112, 3], - #-------------------------------------------# - # 所使用到的主干特征提取网络,与训练的相同 - # mobilefacenet - # mobilenetv1 - # iresnet18 - # iresnet34 - # iresnet50 - # iresnet100 - # iresnet200 - #-------------------------------------------# - "backbone" : "mobilefacenet", - #-------------------------------------------# - # 是否进行不失真的resize - #-------------------------------------------# - "letterbox_image" : True, - #-------------------------------------------# - # 是否使用Cuda - # 没有GPU可以设置成False - #-------------------------------------------# - "cuda" : True, - } - - @classmethod - def get_defaults(cls, n): - if n in cls._defaults: - return cls._defaults[n] - else: - return "Unrecognized attribute name '" + n + "'" - - #---------------------------------------------------# - # 初始化Arcface - #---------------------------------------------------# - def __init__(self, **kwargs): - self.__dict__.update(self._defaults) - for name, value in kwargs.items(): - setattr(self, name, value) - - self.generate() - - show_config(**self._defaults) - - def generate(self): - #---------------------------------------------------# - # 载入模型与权值 - #---------------------------------------------------# - print('Loading weights into state dict...') - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.net = arcface(backbone=self.backbone, mode="predict").eval() - self.net.load_state_dict(torch.load(self.model_path, map_location=device), strict=False) - print('{} model loaded.'.format(self.model_path)) - - if self.cuda: - self.net = torch.nn.DataParallel(self.net) - cudnn.benchmark = True - self.net = self.net.cuda() - - #---------------------------------------------------# - # 检测图片 - #---------------------------------------------------# - def detect_image(self, image_1, image_2): - #---------------------------------------------------# - # 图片预处理,归一化 - #---------------------------------------------------# - with torch.no_grad(): - image_1 = resize_image(image_1, [self.input_shape[1], self.input_shape[0]], letterbox_image=self.letterbox_image) - image_2 = resize_image(image_2, [self.input_shape[1], self.input_shape[0]], letterbox_image=self.letterbox_image) - - photo_1 = torch.from_numpy(np.expand_dims(np.transpose(preprocess_input(np.array(image_1, np.float32)), (2, 0, 1)), 0)) - photo_2 = torch.from_numpy(np.expand_dims(np.transpose(preprocess_input(np.array(image_2, np.float32)), (2, 0, 1)), 0)) 
- - if self.cuda: - photo_1 = photo_1.cuda() - photo_2 = photo_2.cuda() - - #---------------------------------------------------# - # 图片传入网络进行预测 - #---------------------------------------------------# - output1 = self.net(photo_1).cpu().numpy() - output2 = self.net(photo_2).cpu().numpy() - - #---------------------------------------------------# - # 计算二者之间的距离 - #---------------------------------------------------# - l1 = np.linalg.norm(output1 - output2, axis=1) - - plt.subplot(1, 2, 1) - plt.imshow(np.array(image_1)) - - plt.subplot(1, 2, 2) - plt.imshow(np.array(image_2)) - plt.text(-12, -12, 'Distance:%.3f' % l1, ha='center', va= 'bottom',fontsize=11) - plt.show() - return l1 - - def get_FPS(self, image, test_interval): - #---------------------------------------------------# - # 对图片进行不失真的resize - #---------------------------------------------------# - image_data = resize_image(image, [self.input_shape[1], self.input_shape[0]], self.letterbox_image) - #---------------------------------------------------------# - # 归一化+添加上batch_size维度 - #---------------------------------------------------------# - image_data = torch.from_numpy(np.expand_dims(np.transpose(preprocess_input(np.array(image_data, np.float32)), (2, 0, 1)), 0)) - with torch.no_grad(): - #---------------------------------------------------# - # 图片传入网络进行预测 - #---------------------------------------------------# - preds = self.net(image_data).cpu().numpy() - - import time - t1 = time.time() - for _ in range(test_interval): - with torch.no_grad(): - #---------------------------------------------------# - # 图片传入网络进行预测 - #---------------------------------------------------# - preds = self.net(image_data).cpu().numpy() - t2 = time.time() - tact_time = (t2 - t1) / test_interval - return tact_time +import matplotlib.pyplot as plt +import numpy as np +import torch +import torch.backends.cudnn as cudnn + +from nets.arcface import Arcface as arcface +from utils.utils import preprocess_input, resize_image, show_config + + +class Arcface(object): + _defaults = { + #--------------------------------------------------------------------------# + # 使用自己训练好的模型进行预测要修改model_path,指向logs文件夹下的权值文件 + # 训练好后logs文件夹下存在多个权值文件,选择验证集损失较低的即可。 + # 验证集损失较低不代表准确度较高,仅代表该权值在验证集上泛化性能较好。 + #--------------------------------------------------------------------------# + "model_path" : "model_data/arcface_mobilefacenet.pth", + #-------------------------------------------# + # 输入图片的大小。 + #-------------------------------------------# + "input_shape" : [112, 112, 3], + #-------------------------------------------# + # 所使用到的主干特征提取网络,与训练的相同 + # mobilefacenet + # mobilenetv1 + # iresnet18 + # iresnet34 + # iresnet50 + # iresnet100 + # iresnet200 + #-------------------------------------------# + "backbone" : "mobilefacenet", + #-------------------------------------------# + # 是否进行不失真的resize + #-------------------------------------------# + "letterbox_image" : True, + #-------------------------------------------# + # 是否使用Cuda + # 没有GPU可以设置成False + #-------------------------------------------# + "cuda" : True, + } + + @classmethod + def get_defaults(cls, n): + if n in cls._defaults: + return cls._defaults[n] + else: + return "Unrecognized attribute name '" + n + "'" + + #---------------------------------------------------# + # 初始化Arcface + #---------------------------------------------------# + def __init__(self, **kwargs): + self.__dict__.update(self._defaults) + for name, value in kwargs.items(): + setattr(self, name, value) + + self.generate() + + show_config(**self._defaults) 
+ + def generate(self): + #---------------------------------------------------# + # 载入模型与权值 + #---------------------------------------------------# + print('Loading weights into state dict...') + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.net = arcface(backbone=self.backbone, mode="predict").eval() + self.net.load_state_dict(torch.load(self.model_path, map_location=device), strict=False) + print('{} model loaded.'.format(self.model_path)) + + if self.cuda: + self.net = torch.nn.DataParallel(self.net) + cudnn.benchmark = True + self.net = self.net.cuda() + + #---------------------------------------------------# + # 检测图片 + #---------------------------------------------------# + def detect_image(self, image_1, image_2): + #---------------------------------------------------# + # 图片预处理,归一化 + #---------------------------------------------------# + with torch.no_grad(): + image_1 = resize_image(image_1, [self.input_shape[1], self.input_shape[0]], letterbox_image=self.letterbox_image) + image_2 = resize_image(image_2, [self.input_shape[1], self.input_shape[0]], letterbox_image=self.letterbox_image) + + photo_1 = torch.from_numpy(np.expand_dims(np.transpose(preprocess_input(np.array(image_1, np.float32)), (2, 0, 1)), 0)) + photo_2 = torch.from_numpy(np.expand_dims(np.transpose(preprocess_input(np.array(image_2, np.float32)), (2, 0, 1)), 0)) + + if self.cuda: + photo_1 = photo_1.cuda() + photo_2 = photo_2.cuda() + + #---------------------------------------------------# + # 图片传入网络进行预测 + #---------------------------------------------------# + output1 = self.net(photo_1).cpu().numpy() + output2 = self.net(photo_2).cpu().numpy() + + #---------------------------------------------------# + # 计算二者之间的距离 + #---------------------------------------------------# + l1 = np.linalg.norm(output1 - output2, axis=1) + + plt.subplot(1, 2, 1) + plt.imshow(np.array(image_1)) + + plt.subplot(1, 2, 2) + plt.imshow(np.array(image_2)) + plt.text(-12, -12, 'Distance:%.3f' % l1, ha='center', va= 'bottom',fontsize=11) + plt.show() + return l1 + + def get_FPS(self, image, test_interval): + #---------------------------------------------------# + # 对图片进行不失真的resize + #---------------------------------------------------# + image_data = resize_image(image, [self.input_shape[1], self.input_shape[0]], self.letterbox_image) + #---------------------------------------------------------# + # 归一化+添加上batch_size维度 + #---------------------------------------------------------# + image_data = torch.from_numpy(np.expand_dims(np.transpose(preprocess_input(np.array(image_data, np.float32)), (2, 0, 1)), 0)) + with torch.no_grad(): + #---------------------------------------------------# + # 图片传入网络进行预测 + #---------------------------------------------------# + preds = self.net(image_data).cpu().numpy() + + import time + t1 = time.time() + for _ in range(test_interval): + with torch.no_grad(): + #---------------------------------------------------# + # 图片传入网络进行预测 + #---------------------------------------------------# + preds = self.net(image_data).cpu().numpy() + t2 = time.time() + tact_time = (t2 - t1) / test_interval + return tact_time diff --git a/cv/face_recognition/arcface/pytorch/eval_LFW.py b/cv/face_recognition/arcface/pytorch/eval_LFW.py index 53b2a0fc4..38df1f3b0 100755 --- a/cv/face_recognition/arcface/pytorch/eval_LFW.py +++ b/cv/face_recognition/arcface/pytorch/eval_LFW.py @@ -1,65 +1,65 @@ -import torch -import torch.backends.cudnn as cudnn - -from nets.arcface import Arcface -from utils.dataloader 
import LFWDataset -from utils.utils_metrics import test - - -if __name__ == "__main__": - #--------------------------------------# - # 是否使用Cuda - # 没有GPU可以设置成False - #--------------------------------------# - cuda = True - #--------------------------------------# - # 主干特征提取网络的选择 - # mobilefacenet - # mobilenetv1 - # iresnet18 - # iresnet34 - # iresnet50 - # iresnet100 - # iresnet200 - #--------------------------------------# - backbone = "mobilefacenet" - #--------------------------------------# - # 输入图像大小 - #--------------------------------------# - input_shape = [112, 112, 3] - #--------------------------------------# - # 训练好的权值文件 - #--------------------------------------# - model_path = "model_data/arcface_mobilefacenet.pth" - #--------------------------------------# - # LFW评估数据集的文件路径 - # 以及对应的txt文件 - #--------------------------------------# - lfw_dir_path = "lfw" - lfw_pairs_path = "model_data/lfw_pair.txt" - #--------------------------------------# - # 评估的批次大小和记录间隔 - #--------------------------------------# - batch_size = 256 - log_interval = 1 - #--------------------------------------# - # ROC图的保存路径 - #--------------------------------------# - png_save_path = "model_data/roc_test.png" - - test_loader = torch.utils.data.DataLoader( - LFWDataset(dir=lfw_dir_path, pairs_path=lfw_pairs_path, image_size=input_shape), batch_size=batch_size, shuffle=False) - - model = Arcface(backbone=backbone, mode="predict") - - print('Loading weights into state dict...') - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - model.load_state_dict(torch.load(model_path, map_location=device), strict=False) - model = model.eval() - - if cuda: - model = torch.nn.DataParallel(model) - cudnn.benchmark = True - model = model.cuda() - - test(test_loader, model, png_save_path, log_interval, batch_size, cuda) +import torch +import torch.backends.cudnn as cudnn + +from nets.arcface import Arcface +from utils.dataloader import LFWDataset +from utils.utils_metrics import test + + +if __name__ == "__main__": + #--------------------------------------# + # 是否使用Cuda + # 没有GPU可以设置成False + #--------------------------------------# + cuda = True + #--------------------------------------# + # 主干特征提取网络的选择 + # mobilefacenet + # mobilenetv1 + # iresnet18 + # iresnet34 + # iresnet50 + # iresnet100 + # iresnet200 + #--------------------------------------# + backbone = "mobilefacenet" + #--------------------------------------# + # 输入图像大小 + #--------------------------------------# + input_shape = [112, 112, 3] + #--------------------------------------# + # 训练好的权值文件 + #--------------------------------------# + model_path = "model_data/arcface_mobilefacenet.pth" + #--------------------------------------# + # LFW评估数据集的文件路径 + # 以及对应的txt文件 + #--------------------------------------# + lfw_dir_path = "lfw" + lfw_pairs_path = "model_data/lfw_pair.txt" + #--------------------------------------# + # 评估的批次大小和记录间隔 + #--------------------------------------# + batch_size = 256 + log_interval = 1 + #--------------------------------------# + # ROC图的保存路径 + #--------------------------------------# + png_save_path = "model_data/roc_test.png" + + test_loader = torch.utils.data.DataLoader( + LFWDataset(dir=lfw_dir_path, pairs_path=lfw_pairs_path, image_size=input_shape), batch_size=batch_size, shuffle=False) + + model = Arcface(backbone=backbone, mode="predict") + + print('Loading weights into state dict...') + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.load_state_dict(torch.load(model_path, 
map_location=device), strict=False) + model = model.eval() + + if cuda: + model = torch.nn.DataParallel(model) + cudnn.benchmark = True + model = model.cuda() + + test(test_loader, model, png_save_path, log_interval, batch_size, cuda) diff --git a/cv/face_recognition/arcface/pytorch/nets/arcface.py b/cv/face_recognition/arcface/pytorch/nets/arcface.py index c44cb0dcb..6c1af00ea 100755 --- a/cv/face_recognition/arcface/pytorch/nets/arcface.py +++ b/cv/face_recognition/arcface/pytorch/nets/arcface.py @@ -1,90 +1,90 @@ -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn import Module, Parameter - -from nets.iresnet import (iresnet18, iresnet34, iresnet50, iresnet100, - iresnet200) -from nets.mobilefacenet import get_mbf -from nets.mobilenet import get_mobilenet - -class Arcface_Head(Module): - def __init__(self, embedding_size=128, num_classes=10575, s=64., m=0.5): - super(Arcface_Head, self).__init__() - self.s = s - self.m = m - self.weight = Parameter(torch.FloatTensor(num_classes, embedding_size)) - nn.init.xavier_uniform_(self.weight) - - self.cos_m = math.cos(m) - self.sin_m = math.sin(m) - self.th = math.cos(math.pi - m) - self.mm = math.sin(math.pi - m) * m - - def forward(self, input, label): - cosine = F.linear(input, F.normalize(self.weight)) - sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1)) - phi = cosine * self.cos_m - sine * self.sin_m - phi = torch.where(cosine.float() > self.th, phi.float(), cosine.float() - self.mm) - - one_hot = torch.zeros(cosine.size()).type_as(phi).long() - one_hot.scatter_(1, label.view(-1, 1).long(), 1) - output = (one_hot * phi) + ((1.0 - one_hot) * cosine) - output *= self.s - return output - -class Arcface(nn.Module): - def __init__(self, num_classes=None, backbone="mobilefacenet", pretrained=False, mode="train"): - super(Arcface, self).__init__() - if backbone=="mobilefacenet": - embedding_size = 128 - s = 32 - self.arcface = get_mbf(embedding_size=embedding_size, pretrained=pretrained) - - elif backbone=="mobilenetv1": - embedding_size = 512 - s = 64 - self.arcface = get_mobilenet(dropout_keep_prob=0.5, embedding_size=embedding_size, pretrained=pretrained) - - elif backbone=="iresnet18": - embedding_size = 512 - s = 64 - self.arcface = iresnet18(dropout_keep_prob=0.5, embedding_size=embedding_size, pretrained=pretrained) - - elif backbone=="iresnet34": - embedding_size = 512 - s = 64 - self.arcface = iresnet34(dropout_keep_prob=0.5, embedding_size=embedding_size, pretrained=pretrained) - - elif backbone=="iresnet50": - embedding_size = 512 - s = 64 - self.arcface = iresnet50(dropout_keep_prob=0.5, embedding_size=embedding_size, pretrained=pretrained) - - elif backbone=="iresnet100": - embedding_size = 512 - s = 64 - self.arcface = iresnet100(dropout_keep_prob=0.5, embedding_size=embedding_size, pretrained=pretrained) - - elif backbone=="iresnet200": - embedding_size = 512 - s = 64 - self.arcface = iresnet200(dropout_keep_prob=0.5, embedding_size=embedding_size, pretrained=pretrained) - else: - raise ValueError('Unsupported backbone - `{}`, Use mobilefacenet, mobilenetv1.'.format(backbone)) - - self.mode = mode - if mode == "train": - self.head = Arcface_Head(embedding_size=embedding_size, num_classes=num_classes, s=s) - - def forward(self, x, y = None, mode = "predict"): - x = self.arcface(x) - x = x.view(x.size()[0], -1) - x = F.normalize(x) - if mode == "predict": - return x - else: - x = self.head(x, y) - return x +import math + +import torch +import torch.nn as nn +import 
torch.nn.functional as F +from torch.nn import Module, Parameter + +from nets.iresnet import (iresnet18, iresnet34, iresnet50, iresnet100, + iresnet200) +from nets.mobilefacenet import get_mbf +from nets.mobilenet import get_mobilenet + +class Arcface_Head(Module): + def __init__(self, embedding_size=128, num_classes=10575, s=64., m=0.5): + super(Arcface_Head, self).__init__() + self.s = s + self.m = m + self.weight = Parameter(torch.FloatTensor(num_classes, embedding_size)) + nn.init.xavier_uniform_(self.weight) + + self.cos_m = math.cos(m) + self.sin_m = math.sin(m) + self.th = math.cos(math.pi - m) + self.mm = math.sin(math.pi - m) * m + + def forward(self, input, label): + cosine = F.linear(input, F.normalize(self.weight)) + sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1)) + phi = cosine * self.cos_m - sine * self.sin_m + phi = torch.where(cosine.float() > self.th, phi.float(), cosine.float() - self.mm) + + one_hot = torch.zeros(cosine.size()).type_as(phi).long() + one_hot.scatter_(1, label.view(-1, 1).long(), 1) + output = (one_hot * phi) + ((1.0 - one_hot) * cosine) + output *= self.s + return output + +class Arcface(nn.Module): + def __init__(self, num_classes=None, backbone="mobilefacenet", pretrained=False, mode="train"): + super(Arcface, self).__init__() + if backbone=="mobilefacenet": + embedding_size = 128 + s = 32 + self.arcface = get_mbf(embedding_size=embedding_size, pretrained=pretrained) + + elif backbone=="mobilenetv1": + embedding_size = 512 + s = 64 + self.arcface = get_mobilenet(dropout_keep_prob=0.5, embedding_size=embedding_size, pretrained=pretrained) + + elif backbone=="iresnet18": + embedding_size = 512 + s = 64 + self.arcface = iresnet18(dropout_keep_prob=0.5, embedding_size=embedding_size, pretrained=pretrained) + + elif backbone=="iresnet34": + embedding_size = 512 + s = 64 + self.arcface = iresnet34(dropout_keep_prob=0.5, embedding_size=embedding_size, pretrained=pretrained) + + elif backbone=="iresnet50": + embedding_size = 512 + s = 64 + self.arcface = iresnet50(dropout_keep_prob=0.5, embedding_size=embedding_size, pretrained=pretrained) + + elif backbone=="iresnet100": + embedding_size = 512 + s = 64 + self.arcface = iresnet100(dropout_keep_prob=0.5, embedding_size=embedding_size, pretrained=pretrained) + + elif backbone=="iresnet200": + embedding_size = 512 + s = 64 + self.arcface = iresnet200(dropout_keep_prob=0.5, embedding_size=embedding_size, pretrained=pretrained) + else: + raise ValueError('Unsupported backbone - `{}`, Use mobilefacenet, mobilenetv1.'.format(backbone)) + + self.mode = mode + if mode == "train": + self.head = Arcface_Head(embedding_size=embedding_size, num_classes=num_classes, s=s) + + def forward(self, x, y = None, mode = "predict"): + x = self.arcface(x) + x = x.view(x.size()[0], -1) + x = F.normalize(x) + if mode == "predict": + return x + else: + x = self.head(x, y) + return x diff --git a/cv/face_recognition/arcface/pytorch/nets/arcface_training.py b/cv/face_recognition/arcface/pytorch/nets/arcface_training.py index 2c4f688e1..059fa4843 100755 --- a/cv/face_recognition/arcface/pytorch/nets/arcface_training.py +++ b/cv/face_recognition/arcface/pytorch/nets/arcface_training.py @@ -1,46 +1,46 @@ -import math -from functools import partial - - -def get_lr_scheduler(lr_decay_type, lr, min_lr, total_iters, warmup_iters_ratio = 0.1, warmup_lr_ratio = 0.1, no_aug_iter_ratio = 0.3, step_num = 10): - def yolox_warm_cos_lr(lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter, iters): - if iters <= 
warmup_total_iters: - # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start - lr = (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2 - ) + warmup_lr_start - elif iters >= total_iters - no_aug_iter: - lr = min_lr - else: - lr = min_lr + 0.5 * (lr - min_lr) * ( - 1.0 - + math.cos( - math.pi - * (iters - warmup_total_iters) - / (total_iters - warmup_total_iters - no_aug_iter) - ) - ) - return lr - - def step_lr(lr, decay_rate, step_size, iters): - if step_size < 1: - raise ValueError("step_size must above 1.") - n = iters // step_size - out_lr = lr * decay_rate ** n - return out_lr - - if lr_decay_type == "cos": - warmup_total_iters = min(max(warmup_iters_ratio * total_iters, 1), 3) - warmup_lr_start = max(warmup_lr_ratio * lr, 1e-6) - no_aug_iter = min(max(no_aug_iter_ratio * total_iters, 1), 15) - func = partial(yolox_warm_cos_lr ,lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter) - else: - decay_rate = (min_lr / lr) ** (1 / (step_num - 1)) - step_size = total_iters / step_num - func = partial(step_lr, lr, decay_rate, step_size) - - return func - -def set_optimizer_lr(optimizer, lr_scheduler_func, epoch): - lr = lr_scheduler_func(epoch) - for param_group in optimizer.param_groups: - param_group['lr'] = lr +import math +from functools import partial + + +def get_lr_scheduler(lr_decay_type, lr, min_lr, total_iters, warmup_iters_ratio = 0.1, warmup_lr_ratio = 0.1, no_aug_iter_ratio = 0.3, step_num = 10): + def yolox_warm_cos_lr(lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter, iters): + if iters <= warmup_total_iters: + # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start + lr = (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2 + ) + warmup_lr_start + elif iters >= total_iters - no_aug_iter: + lr = min_lr + else: + lr = min_lr + 0.5 * (lr - min_lr) * ( + 1.0 + + math.cos( + math.pi + * (iters - warmup_total_iters) + / (total_iters - warmup_total_iters - no_aug_iter) + ) + ) + return lr + + def step_lr(lr, decay_rate, step_size, iters): + if step_size < 1: + raise ValueError("step_size must above 1.") + n = iters // step_size + out_lr = lr * decay_rate ** n + return out_lr + + if lr_decay_type == "cos": + warmup_total_iters = min(max(warmup_iters_ratio * total_iters, 1), 3) + warmup_lr_start = max(warmup_lr_ratio * lr, 1e-6) + no_aug_iter = min(max(no_aug_iter_ratio * total_iters, 1), 15) + func = partial(yolox_warm_cos_lr ,lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter) + else: + decay_rate = (min_lr / lr) ** (1 / (step_num - 1)) + step_size = total_iters / step_num + func = partial(step_lr, lr, decay_rate, step_size) + + return func + +def set_optimizer_lr(optimizer, lr_scheduler_func, epoch): + lr = lr_scheduler_func(epoch) + for param_group in optimizer.param_groups: + param_group['lr'] = lr diff --git a/cv/face_recognition/arcface/pytorch/nets/iresnet.py b/cv/face_recognition/arcface/pytorch/nets/iresnet.py index 4f818b56e..181d27184 100755 --- a/cv/face_recognition/arcface/pytorch/nets/iresnet.py +++ b/cv/face_recognition/arcface/pytorch/nets/iresnet.py @@ -1,184 +1,184 @@ - -import torch -from torch import nn - -__all__ = ['iresnet18', 'iresnet34', 'iresnet50', 'iresnet100', 'iresnet200'] - - -def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): - return nn.Conv2d(in_planes, - out_planes, - kernel_size=3, - stride=stride, - padding=dilation, - groups=groups, - bias=False, - dilation=dilation) - - -def 
conv1x1(in_planes, out_planes, stride=1): - return nn.Conv2d(in_planes, - out_planes, - kernel_size=1, - stride=stride, - bias=False) - - -class IBasicBlock(nn.Module): - expansion = 1 - def __init__(self, inplanes, planes, stride=1, downsample=None, - groups=1, base_width=64, dilation=1): - super(IBasicBlock, self).__init__() - if groups != 1 or base_width != 64: - raise ValueError('BasicBlock only supports groups=1 and base_width=64') - if dilation > 1: - raise NotImplementedError("Dilation > 1 not supported in BasicBlock") - self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05,) - self.conv1 = conv3x3(inplanes, planes) - self.bn2 = nn.BatchNorm2d(planes, eps=1e-05,) - self.prelu = nn.PReLU(planes) - self.conv2 = conv3x3(planes, planes, stride) - self.bn3 = nn.BatchNorm2d(planes, eps=1e-05,) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - out = self.bn1(x) - out = self.conv1(out) - out = self.bn2(out) - out = self.prelu(out) - out = self.conv2(out) - out = self.bn3(out) - if self.downsample is not None: - identity = self.downsample(x) - out += identity - return out - - -class IResNet(nn.Module): - fc_scale = 7 * 7 - def __init__(self, - block, layers, dropout_keep_prob=0, embedding_size=512, zero_init_residual=False, - groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False): - super(IResNet, self).__init__() - self.fp16 = fp16 - self.inplanes = 64 - self.dilation = 1 - if replace_stride_with_dilation is None: - replace_stride_with_dilation = [False, False, False] - if len(replace_stride_with_dilation) != 3: - raise ValueError("replace_stride_with_dilation should be None " - "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) - self.groups = groups - self.base_width = width_per_group - self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05) - self.prelu = nn.PReLU(self.inplanes) - self.layer1 = self._make_layer(block, 64, layers[0], stride=2) - self.layer2 = self._make_layer(block, - 128, - layers[1], - stride=2, - dilate=replace_stride_with_dilation[0]) - self.layer3 = self._make_layer(block, - 256, - layers[2], - stride=2, - dilate=replace_stride_with_dilation[1]) - self.layer4 = self._make_layer(block, - 512, - layers[3], - stride=2, - dilate=replace_stride_with_dilation[2]) - self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05,) - self.dropout = nn.Dropout(p=dropout_keep_prob, inplace=True) - self.fc = nn.Linear(512 * block.expansion * self.fc_scale, embedding_size) - self.features = nn.BatchNorm1d(embedding_size, eps=1e-05) - nn.init.constant_(self.features.weight, 1.0) - self.features.weight.requires_grad = False - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.normal_(m.weight, 0, 0.1) - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - if zero_init_residual: - for m in self.modules(): - if isinstance(m, IBasicBlock): - nn.init.constant_(m.bn2.weight, 0) - - def _make_layer(self, block, planes, blocks, stride=1, dilate=False): - downsample = None - previous_dilation = self.dilation - if dilate: - self.dilation *= stride - stride = 1 - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - conv1x1(self.inplanes, planes * block.expansion, stride), - nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ), - ) - layers = [] - layers.append( - block(self.inplanes, planes, stride, 
downsample, self.groups, - self.base_width, previous_dilation)) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append( - block(self.inplanes, - planes, - groups=self.groups, - base_width=self.base_width, - dilation=self.dilation)) - - return nn.Sequential(*layers) - - def forward(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.prelu(x) - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - x = self.bn2(x) - x = torch.flatten(x, 1) - x = self.dropout(x) - x = self.fc(x) - x = self.features(x) - return x - - -def _iresnet(arch, block, layers, pretrained, progress, **kwargs): - model = IResNet(block, layers, **kwargs) - if pretrained: - raise ValueError("No pretrained model for iresnet") - return model - - -def iresnet18(pretrained=False, progress=True, **kwargs): - return _iresnet('iresnet18', IBasicBlock, [2, 2, 2, 2], pretrained, - progress, **kwargs) - - -def iresnet34(pretrained=False, progress=True, **kwargs): - return _iresnet('iresnet34', IBasicBlock, [3, 4, 6, 3], pretrained, - progress, **kwargs) - - -def iresnet50(pretrained=False, progress=True, **kwargs): - return _iresnet('iresnet50', IBasicBlock, [3, 4, 14, 3], pretrained, - progress, **kwargs) - - -def iresnet100(pretrained=False, progress=True, **kwargs): - return _iresnet('iresnet100', IBasicBlock, [3, 13, 30, 3], pretrained, - progress, **kwargs) - - -def iresnet200(pretrained=False, progress=True, **kwargs): - return _iresnet('iresnet200', IBasicBlock, [6, 26, 60, 6], pretrained, + +import torch +from torch import nn + +__all__ = ['iresnet18', 'iresnet34', 'iresnet50', 'iresnet100', 'iresnet200'] + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + return nn.Conv2d(in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + return nn.Conv2d(in_planes, + out_planes, + kernel_size=1, + stride=stride, + bias=False) + + +class IBasicBlock(nn.Module): + expansion = 1 + def __init__(self, inplanes, planes, stride=1, downsample=None, + groups=1, base_width=64, dilation=1): + super(IBasicBlock, self).__init__() + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05,) + self.conv1 = conv3x3(inplanes, planes) + self.bn2 = nn.BatchNorm2d(planes, eps=1e-05,) + self.prelu = nn.PReLU(planes) + self.conv2 = conv3x3(planes, planes, stride) + self.bn3 = nn.BatchNorm2d(planes, eps=1e-05,) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + out = self.bn1(x) + out = self.conv1(out) + out = self.bn2(out) + out = self.prelu(out) + out = self.conv2(out) + out = self.bn3(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + return out + + +class IResNet(nn.Module): + fc_scale = 7 * 7 + def __init__(self, + block, layers, dropout_keep_prob=0, embedding_size=512, zero_init_residual=False, + groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False): + super(IResNet, self).__init__() + self.fp16 = fp16 + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be 
None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05) + self.prelu = nn.PReLU(self.inplanes) + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + self.layer2 = self._make_layer(block, + 128, + layers[1], + stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, + 256, + layers[2], + stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, + 512, + layers[3], + stride=2, + dilate=replace_stride_with_dilation[2]) + self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05,) + self.dropout = nn.Dropout(p=dropout_keep_prob, inplace=True) + self.fc = nn.Linear(512 * block.expansion * self.fc_scale, embedding_size) + self.features = nn.BatchNorm1d(embedding_size, eps=1e-05) + nn.init.constant_(self.features.weight, 1.0) + self.features.weight.requires_grad = False + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, 0, 0.1) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + if zero_init_residual: + for m in self.modules(): + if isinstance(m, IBasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ), + ) + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block(self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.prelu(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.bn2(x) + x = torch.flatten(x, 1) + x = self.dropout(x) + x = self.fc(x) + x = self.features(x) + return x + + +def _iresnet(arch, block, layers, pretrained, progress, **kwargs): + model = IResNet(block, layers, **kwargs) + if pretrained: + raise ValueError("No pretrained model for iresnet") + return model + + +def iresnet18(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet18', IBasicBlock, [2, 2, 2, 2], pretrained, + progress, **kwargs) + + +def iresnet34(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet34', IBasicBlock, [3, 4, 6, 3], pretrained, + progress, **kwargs) + + +def iresnet50(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet50', IBasicBlock, [3, 4, 14, 3], pretrained, + progress, **kwargs) + + +def iresnet100(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet100', IBasicBlock, [3, 13, 30, 3], pretrained, + progress, **kwargs) + + +def iresnet200(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet200', IBasicBlock, [6, 26, 60, 6], pretrained, progress, **kwargs) \ No newline at end of file diff --git a/cv/face_recognition/arcface/pytorch/nets/mobilefacenet.py 
b/cv/face_recognition/arcface/pytorch/nets/mobilefacenet.py index c39afb4eb..d60893375 100755 --- a/cv/face_recognition/arcface/pytorch/nets/mobilefacenet.py +++ b/cv/face_recognition/arcface/pytorch/nets/mobilefacenet.py @@ -1,131 +1,131 @@ -from torch import nn -from torch.nn import BatchNorm2d, Conv2d, Module, PReLU, Sequential - -class Flatten(Module): - def forward(self, input): - return input.view(input.size(0), -1) - -class Linear_block(Module): - def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1): - super(Linear_block, self).__init__() - self.conv = Conv2d(in_c, out_channels=out_c, kernel_size=kernel, groups=groups, stride=stride, padding=padding, bias=False) - self.bn = BatchNorm2d(out_c) - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return x - -class Residual_Block(Module): - def __init__(self, in_c, out_c, residual = False, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=1): - super(Residual_Block, self).__init__() - self.conv = Conv_block(in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1)) - self.conv_dw = Conv_block(groups, groups, groups=groups, kernel=kernel, padding=padding, stride=stride) - self.project = Linear_block(groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1)) - self.residual = residual - def forward(self, x): - if self.residual: - short_cut = x - x = self.conv(x) - x = self.conv_dw(x) - x = self.project(x) - if self.residual: - output = short_cut + x - else: - output = x - return output - -class Residual(Module): - def __init__(self, c, num_block, groups, kernel=(3, 3), stride=(1, 1), padding=(1, 1)): - super(Residual, self).__init__() - modules = [] - for _ in range(num_block): - modules.append(Residual_Block(c, c, residual=True, kernel=kernel, padding=padding, stride=stride, groups=groups)) - self.model = Sequential(*modules) - def forward(self, x): - return self.model(x) - -class Conv_block(Module): - def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1): - super(Conv_block, self).__init__() - self.conv = Conv2d(in_c, out_channels=out_c, kernel_size=kernel, groups=groups, stride=stride, padding=padding, bias=False) - self.bn = BatchNorm2d(out_c) - self.prelu = PReLU(out_c) - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.prelu(x) - return x - -class MobileFaceNet(Module): - def __init__(self, embedding_size): - super(MobileFaceNet, self).__init__() - # 112,112,3 -> 56,56,64 - self.conv1 = Conv_block(3, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1)) - - # 56,56,64 -> 56,56,64 - self.conv2_dw = Conv_block(64, 64, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64) - - # 56,56,64 -> 28,28,64 - self.conv_23 = Residual_Block(64, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128) - self.conv_3 = Residual(64, num_block=4, groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1)) - - # 28,28,64 -> 14,14,128 - self.conv_34 = Residual_Block(64, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256) - self.conv_4 = Residual(128, num_block=6, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)) - - # 14,14,128 -> 7,7,128 - self.conv_45 = Residual_Block(128, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512) - self.conv_5 = Residual(128, num_block=2, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)) - - self.sep = nn.Conv2d(128, 512, kernel_size=1, bias=False) - self.sep_bn = nn.BatchNorm2d(512) - self.prelu = nn.PReLU(512) - - self.GDC_dw = nn.Conv2d(512, 512, kernel_size=7, bias=False, 
groups=512) - self.GDC_bn = nn.BatchNorm2d(512) - - self.features = nn.Conv2d(512, embedding_size, kernel_size=1, bias=False) - self.last_bn = nn.BatchNorm2d(embedding_size) - - self._initialize_weights() - - def _initialize_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - if m.bias is not None: - m.bias.data.zero_() - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - elif isinstance(m, nn.Linear): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - if m.bias is not None: - m.bias.data.zero_() - - def forward(self, x): - x = self.conv1(x) - x = self.conv2_dw(x) - x = self.conv_23(x) - x = self.conv_3(x) - x = self.conv_34(x) - x = self.conv_4(x) - x = self.conv_45(x) - x = self.conv_5(x) - - x = self.sep(x) - x = self.sep_bn(x) - x = self.prelu(x) - - x = self.GDC_dw(x) - x = self.GDC_bn(x) - - x = self.features(x) - x = self.last_bn(x) - return x - - -def get_mbf(embedding_size, pretrained): - if pretrained: - raise ValueError("No pretrained model for mobilefacenet") - return MobileFaceNet(embedding_size) +from torch import nn +from torch.nn import BatchNorm2d, Conv2d, Module, PReLU, Sequential + +class Flatten(Module): + def forward(self, input): + return input.view(input.size(0), -1) + +class Linear_block(Module): + def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1): + super(Linear_block, self).__init__() + self.conv = Conv2d(in_c, out_channels=out_c, kernel_size=kernel, groups=groups, stride=stride, padding=padding, bias=False) + self.bn = BatchNorm2d(out_c) + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + +class Residual_Block(Module): + def __init__(self, in_c, out_c, residual = False, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=1): + super(Residual_Block, self).__init__() + self.conv = Conv_block(in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1)) + self.conv_dw = Conv_block(groups, groups, groups=groups, kernel=kernel, padding=padding, stride=stride) + self.project = Linear_block(groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1)) + self.residual = residual + def forward(self, x): + if self.residual: + short_cut = x + x = self.conv(x) + x = self.conv_dw(x) + x = self.project(x) + if self.residual: + output = short_cut + x + else: + output = x + return output + +class Residual(Module): + def __init__(self, c, num_block, groups, kernel=(3, 3), stride=(1, 1), padding=(1, 1)): + super(Residual, self).__init__() + modules = [] + for _ in range(num_block): + modules.append(Residual_Block(c, c, residual=True, kernel=kernel, padding=padding, stride=stride, groups=groups)) + self.model = Sequential(*modules) + def forward(self, x): + return self.model(x) + +class Conv_block(Module): + def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1): + super(Conv_block, self).__init__() + self.conv = Conv2d(in_c, out_channels=out_c, kernel_size=kernel, groups=groups, stride=stride, padding=padding, bias=False) + self.bn = BatchNorm2d(out_c) + self.prelu = PReLU(out_c) + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.prelu(x) + return x + +class MobileFaceNet(Module): + def __init__(self, embedding_size): + super(MobileFaceNet, self).__init__() + # 112,112,3 -> 56,56,64 + self.conv1 = Conv_block(3, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1)) + + # 56,56,64 -> 56,56,64 + self.conv2_dw = 
Conv_block(64, 64, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64) + + # 56,56,64 -> 28,28,64 + self.conv_23 = Residual_Block(64, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128) + self.conv_3 = Residual(64, num_block=4, groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1)) + + # 28,28,64 -> 14,14,128 + self.conv_34 = Residual_Block(64, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256) + self.conv_4 = Residual(128, num_block=6, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)) + + # 14,14,128 -> 7,7,128 + self.conv_45 = Residual_Block(128, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512) + self.conv_5 = Residual(128, num_block=2, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)) + + self.sep = nn.Conv2d(128, 512, kernel_size=1, bias=False) + self.sep_bn = nn.BatchNorm2d(512) + self.prelu = nn.PReLU(512) + + self.GDC_dw = nn.Conv2d(512, 512, kernel_size=7, bias=False, groups=512) + self.GDC_bn = nn.BatchNorm2d(512) + + self.features = nn.Conv2d(512, embedding_size, kernel_size=1, bias=False) + self.last_bn = nn.BatchNorm2d(embedding_size) + + self._initialize_weights() + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x): + x = self.conv1(x) + x = self.conv2_dw(x) + x = self.conv_23(x) + x = self.conv_3(x) + x = self.conv_34(x) + x = self.conv_4(x) + x = self.conv_45(x) + x = self.conv_5(x) + + x = self.sep(x) + x = self.sep_bn(x) + x = self.prelu(x) + + x = self.GDC_dw(x) + x = self.GDC_bn(x) + + x = self.features(x) + x = self.last_bn(x) + return x + + +def get_mbf(embedding_size, pretrained): + if pretrained: + raise ValueError("No pretrained model for mobilefacenet") + return MobileFaceNet(embedding_size) diff --git a/cv/face_recognition/arcface/pytorch/nets/mobilenet.py b/cv/face_recognition/arcface/pytorch/nets/mobilenet.py index b02dca354..5e0a8eec7 100755 --- a/cv/face_recognition/arcface/pytorch/nets/mobilenet.py +++ b/cv/face_recognition/arcface/pytorch/nets/mobilenet.py @@ -1,86 +1,86 @@ -import torch -import torch.nn as nn - - -def conv_bn(inp, oup, stride = 1): - return nn.Sequential( - nn.Conv2d(inp, oup, 3, stride, 1, bias=False), - nn.BatchNorm2d(oup), - nn.ReLU6(inplace=True) - ) - -def conv_dw(inp, oup, stride = 1): - return nn.Sequential( - nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), - nn.BatchNorm2d(inp), - nn.ReLU6(inplace=True), - - nn.Conv2d(inp, oup, 1, 1, 0, bias=False), - nn.BatchNorm2d(oup), - nn.ReLU6(inplace=True), - ) - -class MobileNetV1(nn.Module): - fc_scale = 7 * 7 - def __init__(self, dropout_keep_prob, embedding_size, pretrained): - super(MobileNetV1, self).__init__() - self.stage1 = nn.Sequential( - conv_bn(3, 32, 1), # 3 - conv_dw(32, 64, 1), # 7 - - conv_dw(64, 128, 2), # 11 - conv_dw(128, 128, 1), # 19 - - conv_dw(128, 256, 2), # 27 - conv_dw(256, 256, 1), # 43 - ) - self.stage2 = nn.Sequential( - conv_dw(256, 512, 2), # 43 + 16 = 59 - conv_dw(512, 512, 1), # 59 + 32 = 91 - conv_dw(512, 512, 1), # 91 + 32 = 123 - conv_dw(512, 512, 1), # 123 + 32 = 155 - conv_dw(512, 512, 1), # 155 + 32 = 187 - conv_dw(512, 512, 1), # 187 + 32 = 219 - ) - self.stage3 = 
nn.Sequential( - conv_dw(512, 1024, 2), # 219 +3 2 = 241 - conv_dw(1024, 1024, 1), # 241 + 64 = 301 - ) - - self.sep = nn.Conv2d(1024, 512, kernel_size=1, bias=False) - self.sep_bn = nn.BatchNorm2d(512) - self.prelu = nn.PReLU(512) - - self.bn2 = nn.BatchNorm2d(512, eps=1e-05) - self.dropout = nn.Dropout(p=dropout_keep_prob, inplace=True) - self.linear = nn.Linear(512 * self.fc_scale, embedding_size) - self.features = nn.BatchNorm1d(embedding_size, eps=1e-05) - - if pretrained: - self.load_state_dict(torch.load("model_data/mobilenet_v1_backbone_weights.pth"), strict = False) - else: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.normal_(m.weight, 0, 0.1) - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - def forward(self, x): - x = self.stage1(x) - x = self.stage2(x) - x = self.stage3(x) - - x = self.sep(x) - x = self.sep_bn(x) - x = self.prelu(x) - - x = self.bn2(x) - x = torch.flatten(x, 1) - x = self.dropout(x) - x = self.linear(x) - x = self.features(x) - return x - -def get_mobilenet(dropout_keep_prob, embedding_size, pretrained): - return MobileNetV1(dropout_keep_prob, embedding_size, pretrained) +import torch +import torch.nn as nn + + +def conv_bn(inp, oup, stride = 1): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU6(inplace=True) + ) + +def conv_dw(inp, oup, stride = 1): + return nn.Sequential( + nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm2d(inp), + nn.ReLU6(inplace=True), + + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU6(inplace=True), + ) + +class MobileNetV1(nn.Module): + fc_scale = 7 * 7 + def __init__(self, dropout_keep_prob, embedding_size, pretrained): + super(MobileNetV1, self).__init__() + self.stage1 = nn.Sequential( + conv_bn(3, 32, 1), # 3 + conv_dw(32, 64, 1), # 7 + + conv_dw(64, 128, 2), # 11 + conv_dw(128, 128, 1), # 19 + + conv_dw(128, 256, 2), # 27 + conv_dw(256, 256, 1), # 43 + ) + self.stage2 = nn.Sequential( + conv_dw(256, 512, 2), # 43 + 16 = 59 + conv_dw(512, 512, 1), # 59 + 32 = 91 + conv_dw(512, 512, 1), # 91 + 32 = 123 + conv_dw(512, 512, 1), # 123 + 32 = 155 + conv_dw(512, 512, 1), # 155 + 32 = 187 + conv_dw(512, 512, 1), # 187 + 32 = 219 + ) + self.stage3 = nn.Sequential( + conv_dw(512, 1024, 2), # 219 +3 2 = 241 + conv_dw(1024, 1024, 1), # 241 + 64 = 301 + ) + + self.sep = nn.Conv2d(1024, 512, kernel_size=1, bias=False) + self.sep_bn = nn.BatchNorm2d(512) + self.prelu = nn.PReLU(512) + + self.bn2 = nn.BatchNorm2d(512, eps=1e-05) + self.dropout = nn.Dropout(p=dropout_keep_prob, inplace=True) + self.linear = nn.Linear(512 * self.fc_scale, embedding_size) + self.features = nn.BatchNorm1d(embedding_size, eps=1e-05) + + if pretrained: + self.load_state_dict(torch.load("model_data/mobilenet_v1_backbone_weights.pth"), strict = False) + else: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, 0, 0.1) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.stage1(x) + x = self.stage2(x) + x = self.stage3(x) + + x = self.sep(x) + x = self.sep_bn(x) + x = self.prelu(x) + + x = self.bn2(x) + x = torch.flatten(x, 1) + x = self.dropout(x) + x = self.linear(x) + x = self.features(x) + return x + +def get_mobilenet(dropout_keep_prob, embedding_size, pretrained): + return MobileNetV1(dropout_keep_prob, embedding_size, pretrained) 
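For orientation only (the patch above changes nothing but line endings): the sketch below shows how the backbones in nets/mobilefacenet.py and nets/mobilenet.py are consumed through the Arcface wrapper from nets/arcface.py. It is an illustrative smoke test rather than code from this repository; the import path assumes the working directory is cv/face_recognition/arcface/pytorch, and num_classes=10575 simply mirrors the value used in summary.py.

import torch
from nets.arcface import Arcface

# Assumed values for illustration; adjust to the actual dataset/backbone.
num_classes = 10575
model = Arcface(num_classes=num_classes, backbone="mobilefacenet", mode="train")

images = torch.randn(4, 3, 112, 112)        # input_shape is [112, 112, 3] in this repo
labels = torch.randint(0, num_classes, (4,))

embeddings = model(images, mode="predict")  # L2-normalized features, shape (4, 128) for mobilefacenet
logits = model(images, labels, mode="train")  # ArcFace-margin logits, shape (4, num_classes)
print(embeddings.shape, logits.shape)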
diff --git a/cv/face_recognition/arcface/pytorch/predict.py b/cv/face_recognition/arcface/pytorch/predict.py index be5a6aa07..4c0d614e9 100755 --- a/cv/face_recognition/arcface/pytorch/predict.py +++ b/cv/face_recognition/arcface/pytorch/predict.py @@ -1,44 +1,44 @@ -from PIL import Image - -from arcface import Arcface - -if __name__ == "__main__": - model = Arcface() - - #----------------------------------------------------------------------------------------------------------# - # mode用于指定测试的模式: - # 'predict'表示单张图片预测,如果想对预测过程进行修改,如保存图片,截取对象等,可以先看下方详细的注释 - # 'fps'表示测试fps,使用的图片是img里面的street.jpg,详情查看下方注释。 - #----------------------------------------------------------------------------------------------------------# - mode = "predict" - #-------------------------------------------------------------------------# - # test_interval 用于指定测量fps的时候,图片检测的次数 - # 理论上test_interval越大,fps越准确。 - # fps_test_image fps测试图片 - #-------------------------------------------------------------------------# - test_interval = 100 - fps_test_image = 'img/1_001.jpg' - - if mode == "predict": - while True: - image_1 = input('Input image_1 filename:') - try: - image_1 = Image.open(image_1) - except: - print('Image_1 Open Error! Try again!') - continue - - image_2 = input('Input image_2 filename:') - try: - image_2 = Image.open(image_2) - except: - print('Image_2 Open Error! Try again!') - continue - - probability = model.detect_image(image_1,image_2) - print(probability) - - elif mode == "fps": - img = Image.open(fps_test_image) - tact_time = model.get_FPS(img, test_interval) +from PIL import Image + +from arcface import Arcface + +if __name__ == "__main__": + model = Arcface() + + #----------------------------------------------------------------------------------------------------------# + # mode用于指定测试的模式: + # 'predict'表示单张图片预测,如果想对预测过程进行修改,如保存图片,截取对象等,可以先看下方详细的注释 + # 'fps'表示测试fps,使用的图片是img里面的street.jpg,详情查看下方注释。 + #----------------------------------------------------------------------------------------------------------# + mode = "predict" + #-------------------------------------------------------------------------# + # test_interval 用于指定测量fps的时候,图片检测的次数 + # 理论上test_interval越大,fps越准确。 + # fps_test_image fps测试图片 + #-------------------------------------------------------------------------# + test_interval = 100 + fps_test_image = 'img/1_001.jpg' + + if mode == "predict": + while True: + image_1 = input('Input image_1 filename:') + try: + image_1 = Image.open(image_1) + except: + print('Image_1 Open Error! Try again!') + continue + + image_2 = input('Input image_2 filename:') + try: + image_2 = Image.open(image_2) + except: + print('Image_2 Open Error! 
Try again!') + continue + + probability = model.detect_image(image_1,image_2) + print(probability) + + elif mode == "fps": + img = Image.open(fps_test_image) + tact_time = model.get_FPS(img, test_interval) print(str(tact_time) + ' seconds, ' + str(1/tact_time) + 'FPS, @batch_size 1') \ No newline at end of file diff --git a/cv/face_recognition/arcface/pytorch/summary.py b/cv/face_recognition/arcface/pytorch/summary.py index 3a14dfeb6..0698a87cb 100755 --- a/cv/face_recognition/arcface/pytorch/summary.py +++ b/cv/face_recognition/arcface/pytorch/summary.py @@ -1,29 +1,29 @@ -#--------------------------------------------# -# 该部分代码只用于看网络结构,并非测试代码 -#--------------------------------------------# -import torch -from thop import clever_format, profile -from torchsummary import summary - -from nets.arcface import Arcface - -if __name__ == "__main__": - input_shape = [112, 112] - backbone = 'mobilefacenet' - - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - model = Arcface(num_classes=10575, backbone=backbone, mode="predict").to(device) - summary(model, (3, input_shape[0], input_shape[1])) - - dummy_input = torch.randn(1, 3, input_shape[0], input_shape[1]).to(device) - flops, params = profile(model.to(device), (dummy_input, ), verbose=False) - #--------------------------------------------------------# - # flops * 2是因为profile没有将卷积作为两个operations - # 有些论文将卷积算乘法、加法两个operations。此时乘2 - # 有些论文只考虑乘法的运算次数,忽略加法。此时不乘2 - # 本代码选择乘2,参考YOLOX。 - #--------------------------------------------------------# - flops = flops * 2 - flops, params = clever_format([flops, params], "%.3f") - print('Total GFLOPS: %s' % (flops)) +#--------------------------------------------# +# 该部分代码只用于看网络结构,并非测试代码 +#--------------------------------------------# +import torch +from thop import clever_format, profile +from torchsummary import summary + +from nets.arcface import Arcface + +if __name__ == "__main__": + input_shape = [112, 112] + backbone = 'mobilefacenet' + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model = Arcface(num_classes=10575, backbone=backbone, mode="predict").to(device) + summary(model, (3, input_shape[0], input_shape[1])) + + dummy_input = torch.randn(1, 3, input_shape[0], input_shape[1]).to(device) + flops, params = profile(model.to(device), (dummy_input, ), verbose=False) + #--------------------------------------------------------# + # flops * 2是因为profile没有将卷积作为两个operations + # 有些论文将卷积算乘法、加法两个operations。此时乘2 + # 有些论文只考虑乘法的运算次数,忽略加法。此时不乘2 + # 本代码选择乘2,参考YOLOX。 + #--------------------------------------------------------# + flops = flops * 2 + flops, params = clever_format([flops, params], "%.3f") + print('Total GFLOPS: %s' % (flops)) print('Total params: %s' % (params)) \ No newline at end of file diff --git a/cv/face_recognition/arcface/pytorch/txt_annotation.py b/cv/face_recognition/arcface/pytorch/txt_annotation.py index c79725a9a..8edf96798 100755 --- a/cv/face_recognition/arcface/pytorch/txt_annotation.py +++ b/cv/face_recognition/arcface/pytorch/txt_annotation.py @@ -1,27 +1,27 @@ -# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
- -#------------------------------------------------# -# 进行训练前需要利用这个文件生成cls_train.txt -#------------------------------------------------# -import os - -if __name__ == "__main__": - #---------------------# - # 训练集所在的路径 - #---------------------# - datasets_path = "datasets/datasets" - - types_name = os.listdir(datasets_path) - types_name = sorted(types_name) - - list_file = open('cls_train.txt', 'w') - for cls_id, type_name in enumerate(types_name): - photos_path = os.path.join(datasets_path, type_name) - if not os.path.isdir(photos_path): - continue - photos_name = os.listdir(photos_path) - - for photo_name in photos_name: - list_file.write(str(cls_id) + ";" + '%s'%(os.path.join(os.path.abspath(datasets_path), type_name, photo_name))) - list_file.write('\n') - list_file.close() +# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. + +#------------------------------------------------# +# 进行训练前需要利用这个文件生成cls_train.txt +#------------------------------------------------# +import os + +if __name__ == "__main__": + #---------------------# + # 训练集所在的路径 + #---------------------# + datasets_path = "datasets/datasets" + + types_name = os.listdir(datasets_path) + types_name = sorted(types_name) + + list_file = open('cls_train.txt', 'w') + for cls_id, type_name in enumerate(types_name): + photos_path = os.path.join(datasets_path, type_name) + if not os.path.isdir(photos_path): + continue + photos_name = os.listdir(photos_path) + + for photo_name in photos_name: + list_file.write(str(cls_id) + ";" + '%s'%(os.path.join(os.path.abspath(datasets_path), type_name, photo_name))) + list_file.write('\n') + list_file.close() diff --git a/cv/face_recognition/arcface/pytorch/utils/callback.py b/cv/face_recognition/arcface/pytorch/utils/callback.py index 386f88f75..f6ceb433d 100755 --- a/cv/face_recognition/arcface/pytorch/utils/callback.py +++ b/cv/face_recognition/arcface/pytorch/utils/callback.py @@ -1,85 +1,85 @@ -import datetime -import os - -import torch -import matplotlib -matplotlib.use('Agg') -import scipy.signal -from matplotlib import pyplot as plt -from torch.utils.tensorboard import SummaryWriter - -class LossHistory(): - def __init__(self, log_dir, model, input_shape): - time_str = datetime.datetime.strftime(datetime.datetime.now(),'%Y_%m_%d_%H_%M_%S') - self.log_dir = os.path.join(log_dir, "loss_" + str(time_str)) - self.acc = [] - self.losses = [] - self.val_loss = [] - - os.makedirs(self.log_dir) - self.writer = SummaryWriter(self.log_dir) - dummy_input = torch.randn(2, 3, input_shape[0], input_shape[1]) - self.writer.add_graph(model, dummy_input) - - def append_loss(self, epoch, acc, loss, val_loss): - if not os.path.exists(self.log_dir): - os.makedirs(self.log_dir) - - self.acc.append(acc) - self.losses.append(loss) - self.val_loss.append(val_loss) - - with open(os.path.join(self.log_dir, "epoch_acc.txt"), 'a') as f: - f.write(str(acc)) - f.write("\n") - with open(os.path.join(self.log_dir, "epoch_loss.txt"), 'a') as f: - f.write(str(loss)) - f.write("\n") - with open(os.path.join(self.log_dir, "epoch_val_loss.txt"), 'a') as f: - f.write(str(val_loss)) - f.write("\n") - - self.writer.add_scalar('loss', loss, epoch) - self.writer.add_scalar('val_loss', val_loss, epoch) - self.loss_plot() - - def loss_plot(self): - iters = range(len(self.losses)) - - plt.figure() - plt.plot(iters, self.losses, 'red', linewidth = 2, label='train loss') - plt.plot(iters, self.val_loss, 'coral', linewidth = 2, label='val loss') - try: - if len(self.losses) < 25: - num = 5 - else: - num = 15 - 
plt.plot(iters, scipy.signal.savgol_filter(self.losses, num, 3), 'green', linestyle = '--', linewidth = 2, label='smooth train loss') - plt.plot(iters, scipy.signal.savgol_filter(self.val_loss, num, 3), '#8B4513', linestyle = '--', linewidth = 2, label='smooth val loss') - except: - pass - plt.grid(True) - plt.xlabel('Epoch') - plt.ylabel('Loss') - plt.legend(loc="upper right") - plt.savefig(os.path.join(self.log_dir, "epoch_loss.png")) - plt.cla() - plt.close("all") - - plt.figure() - plt.plot(iters, self.acc, 'red', linewidth = 2, label='lfw acc') - try: - if len(self.losses) < 25: - num = 5 - else: - num = 15 - plt.plot(iters, scipy.signal.savgol_filter(self.acc, num, 3), 'green', linestyle = '--', linewidth = 2, label='smooth lfw acc') - except: - pass - plt.grid(True) - plt.xlabel('Epoch') - plt.ylabel('Lfw Acc') - plt.legend(loc="upper right") - plt.savefig(os.path.join(self.log_dir, "epoch_acc.png")) - plt.cla() - plt.close("all") +import datetime +import os + +import torch +import matplotlib +matplotlib.use('Agg') +import scipy.signal +from matplotlib import pyplot as plt +from torch.utils.tensorboard import SummaryWriter + +class LossHistory(): + def __init__(self, log_dir, model, input_shape): + time_str = datetime.datetime.strftime(datetime.datetime.now(),'%Y_%m_%d_%H_%M_%S') + self.log_dir = os.path.join(log_dir, "loss_" + str(time_str)) + self.acc = [] + self.losses = [] + self.val_loss = [] + + os.makedirs(self.log_dir) + self.writer = SummaryWriter(self.log_dir) + dummy_input = torch.randn(2, 3, input_shape[0], input_shape[1]) + self.writer.add_graph(model, dummy_input) + + def append_loss(self, epoch, acc, loss, val_loss): + if not os.path.exists(self.log_dir): + os.makedirs(self.log_dir) + + self.acc.append(acc) + self.losses.append(loss) + self.val_loss.append(val_loss) + + with open(os.path.join(self.log_dir, "epoch_acc.txt"), 'a') as f: + f.write(str(acc)) + f.write("\n") + with open(os.path.join(self.log_dir, "epoch_loss.txt"), 'a') as f: + f.write(str(loss)) + f.write("\n") + with open(os.path.join(self.log_dir, "epoch_val_loss.txt"), 'a') as f: + f.write(str(val_loss)) + f.write("\n") + + self.writer.add_scalar('loss', loss, epoch) + self.writer.add_scalar('val_loss', val_loss, epoch) + self.loss_plot() + + def loss_plot(self): + iters = range(len(self.losses)) + + plt.figure() + plt.plot(iters, self.losses, 'red', linewidth = 2, label='train loss') + plt.plot(iters, self.val_loss, 'coral', linewidth = 2, label='val loss') + try: + if len(self.losses) < 25: + num = 5 + else: + num = 15 + plt.plot(iters, scipy.signal.savgol_filter(self.losses, num, 3), 'green', linestyle = '--', linewidth = 2, label='smooth train loss') + plt.plot(iters, scipy.signal.savgol_filter(self.val_loss, num, 3), '#8B4513', linestyle = '--', linewidth = 2, label='smooth val loss') + except: + pass + plt.grid(True) + plt.xlabel('Epoch') + plt.ylabel('Loss') + plt.legend(loc="upper right") + plt.savefig(os.path.join(self.log_dir, "epoch_loss.png")) + plt.cla() + plt.close("all") + + plt.figure() + plt.plot(iters, self.acc, 'red', linewidth = 2, label='lfw acc') + try: + if len(self.losses) < 25: + num = 5 + else: + num = 15 + plt.plot(iters, scipy.signal.savgol_filter(self.acc, num, 3), 'green', linestyle = '--', linewidth = 2, label='smooth lfw acc') + except: + pass + plt.grid(True) + plt.xlabel('Epoch') + plt.ylabel('Lfw Acc') + plt.legend(loc="upper right") + plt.savefig(os.path.join(self.log_dir, "epoch_acc.png")) + plt.cla() + plt.close("all") diff --git 
a/cv/face_recognition/arcface/pytorch/utils/dataloader.py b/cv/face_recognition/arcface/pytorch/utils/dataloader.py index 9836fda1a..ef68a9ce9 100755 --- a/cv/face_recognition/arcface/pytorch/utils/dataloader.py +++ b/cv/face_recognition/arcface/pytorch/utils/dataloader.py @@ -1,105 +1,105 @@ -import os - -import numpy as np -import torch -import torch.utils.data as data -import torchvision.datasets as datasets -from PIL import Image - -from .utils import cvtColor, preprocess_input, resize_image - - -class FacenetDataset(data.Dataset): - def __init__(self, input_shape, lines, random): - self.input_shape = input_shape - self.lines = lines - self.random = random - - def __len__(self): - return len(self.lines) - - def rand(self, a=0, b=1): - return np.random.rand()*(b-a) + a - - def __getitem__(self, index): - annotation_path = self.lines[index].split(';')[1].split()[0] - y = int(self.lines[index].split(';')[0]) - - image = cvtColor(Image.open(annotation_path)) - #------------------------------------------# - # 翻转图像 - #------------------------------------------# - if self.rand()<.5 and self.random: - image = image.transpose(Image.FLIP_LEFT_RIGHT) - image = resize_image(image, [self.input_shape[1], self.input_shape[0]], letterbox_image = True) - - image = np.transpose(preprocess_input(np.array(image, dtype='float32')), (2, 0, 1)) - return image, y - -def dataset_collate(batch): - images = [] - targets = [] - for image, y in batch: - images.append(image) - targets.append(y) - images = torch.from_numpy(np.array(images)).type(torch.FloatTensor) - targets = torch.from_numpy(np.array(targets)).long() - return images, targets - -class LFWDataset(datasets.ImageFolder): - def __init__(self, dir, pairs_path, image_size, transform=None): - super(LFWDataset, self).__init__(dir,transform) - self.image_size = image_size - self.pairs_path = pairs_path - self.validation_images = self.get_lfw_paths(dir) - - def read_lfw_pairs(self,pairs_filename): - pairs = [] - with open(pairs_filename, 'r') as f: - for line in f.readlines()[1:]: - pair = line.strip().split() - pairs.append(pair) - return np.array(pairs) - - def get_lfw_paths(self,lfw_dir,file_ext="jpg"): - - pairs = self.read_lfw_pairs(self.pairs_path) - - nrof_skipped_pairs = 0 - path_list = [] - issame_list = [] - - for i in range(len(pairs)): - #for pair in pairs: - pair = pairs[i] - if len(pair) == 3: - path0 = os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[1])+'.'+file_ext) - path1 = os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[2])+'.'+file_ext) - issame = True - elif len(pair) == 4: - path0 = os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[1])+'.'+file_ext) - path1 = os.path.join(lfw_dir, pair[2], pair[2] + '_' + '%04d' % int(pair[3])+'.'+file_ext) - issame = False - if os.path.exists(path0) and os.path.exists(path1): # Only add the pair if both paths exist - path_list.append((path0,path1,issame)) - issame_list.append(issame) - else: - nrof_skipped_pairs += 1 - if nrof_skipped_pairs>0: - print('Skipped %d image pairs' % nrof_skipped_pairs) - - return path_list - - def __getitem__(self, index): - (path_1, path_2, issame) = self.validation_images[index] - image1, image2 = Image.open(path_1), Image.open(path_2) - - image1 = resize_image(image1, [self.image_size[1], self.image_size[0]], letterbox_image = True) - image2 = resize_image(image2, [self.image_size[1], self.image_size[0]], letterbox_image = True) - - image1, image2 = np.transpose(preprocess_input(np.array(image1, np.float32)),[2, 0, 1]), 
np.transpose(preprocess_input(np.array(image2, np.float32)),[2, 0, 1]) - - return image1, image2, issame - - def __len__(self): - return len(self.validation_images) +import os + +import numpy as np +import torch +import torch.utils.data as data +import torchvision.datasets as datasets +from PIL import Image + +from .utils import cvtColor, preprocess_input, resize_image + + +class FacenetDataset(data.Dataset): + def __init__(self, input_shape, lines, random): + self.input_shape = input_shape + self.lines = lines + self.random = random + + def __len__(self): + return len(self.lines) + + def rand(self, a=0, b=1): + return np.random.rand()*(b-a) + a + + def __getitem__(self, index): + annotation_path = self.lines[index].split(';')[1].split()[0] + y = int(self.lines[index].split(';')[0]) + + image = cvtColor(Image.open(annotation_path)) + #------------------------------------------# + # 翻转图像 + #------------------------------------------# + if self.rand()<.5 and self.random: + image = image.transpose(Image.FLIP_LEFT_RIGHT) + image = resize_image(image, [self.input_shape[1], self.input_shape[0]], letterbox_image = True) + + image = np.transpose(preprocess_input(np.array(image, dtype='float32')), (2, 0, 1)) + return image, y + +def dataset_collate(batch): + images = [] + targets = [] + for image, y in batch: + images.append(image) + targets.append(y) + images = torch.from_numpy(np.array(images)).type(torch.FloatTensor) + targets = torch.from_numpy(np.array(targets)).long() + return images, targets + +class LFWDataset(datasets.ImageFolder): + def __init__(self, dir, pairs_path, image_size, transform=None): + super(LFWDataset, self).__init__(dir,transform) + self.image_size = image_size + self.pairs_path = pairs_path + self.validation_images = self.get_lfw_paths(dir) + + def read_lfw_pairs(self,pairs_filename): + pairs = [] + with open(pairs_filename, 'r') as f: + for line in f.readlines()[1:]: + pair = line.strip().split() + pairs.append(pair) + return np.array(pairs) + + def get_lfw_paths(self,lfw_dir,file_ext="jpg"): + + pairs = self.read_lfw_pairs(self.pairs_path) + + nrof_skipped_pairs = 0 + path_list = [] + issame_list = [] + + for i in range(len(pairs)): + #for pair in pairs: + pair = pairs[i] + if len(pair) == 3: + path0 = os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[1])+'.'+file_ext) + path1 = os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[2])+'.'+file_ext) + issame = True + elif len(pair) == 4: + path0 = os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[1])+'.'+file_ext) + path1 = os.path.join(lfw_dir, pair[2], pair[2] + '_' + '%04d' % int(pair[3])+'.'+file_ext) + issame = False + if os.path.exists(path0) and os.path.exists(path1): # Only add the pair if both paths exist + path_list.append((path0,path1,issame)) + issame_list.append(issame) + else: + nrof_skipped_pairs += 1 + if nrof_skipped_pairs>0: + print('Skipped %d image pairs' % nrof_skipped_pairs) + + return path_list + + def __getitem__(self, index): + (path_1, path_2, issame) = self.validation_images[index] + image1, image2 = Image.open(path_1), Image.open(path_2) + + image1 = resize_image(image1, [self.image_size[1], self.image_size[0]], letterbox_image = True) + image2 = resize_image(image2, [self.image_size[1], self.image_size[0]], letterbox_image = True) + + image1, image2 = np.transpose(preprocess_input(np.array(image1, np.float32)),[2, 0, 1]), np.transpose(preprocess_input(np.array(image2, np.float32)),[2, 0, 1]) + + return image1, image2, issame + + def __len__(self): + 
return len(self.validation_images) diff --git a/cv/face_recognition/arcface/pytorch/utils/utils_fit.py b/cv/face_recognition/arcface/pytorch/utils/utils_fit.py index 121153205..34b30c4a2 100755 --- a/cv/face_recognition/arcface/pytorch/utils/utils_fit.py +++ b/cv/face_recognition/arcface/pytorch/utils/utils_fit.py @@ -1,126 +1,126 @@ -import os - -import numpy as np -import torch -import torch.distributed as dist -import torch.nn as nn -import torch.nn.functional as F -from tqdm import tqdm - -from .utils import get_lr -from .utils_metrics import evaluate - - -def fit_one_epoch(model_train, model, loss_history, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, Epoch, cuda, test_loader, lfw_eval_flag, fp16, scaler, save_period, save_dir, local_rank=0): - total_loss = 0 - total_accuracy = 0 - - val_total_loss = 0 - val_total_accuracy = 0 - - if local_rank == 0: - print('Start Train') - pbar = tqdm(total=epoch_step,desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) - model_train.train() - for iteration, batch in enumerate(gen): - if iteration >= epoch_step: - break - images, labels = batch - with torch.no_grad(): - if cuda: - images = images.cuda(local_rank) - labels = labels.cuda(local_rank) - - #----------------------# - # 清零梯度 - #----------------------# - optimizer.zero_grad() - if not fp16: - outputs = model_train(images, labels, mode="train") - loss = nn.NLLLoss()(F.log_softmax(outputs, -1), labels) - - loss.backward() - optimizer.step() - else: - from torch.cuda.amp import autocast - with autocast(): - outputs = model_train(images, labels, mode="train") - loss = nn.NLLLoss()(F.log_softmax(outputs, -1), labels) - #----------------------# - # 反向传播 - #----------------------# - scaler.scale(loss).backward() - scaler.step(optimizer) - scaler.update() - - with torch.no_grad(): - accuracy = torch.mean((torch.argmax(F.softmax(outputs, dim=-1), dim=-1) == labels).type(torch.FloatTensor)) - - total_loss += loss.item() - total_accuracy += accuracy.item() - - if local_rank == 0: - pbar.set_postfix(**{'total_loss': total_loss / (iteration + 1), - 'accuracy' : total_accuracy / (iteration + 1), - 'lr' : get_lr(optimizer)}) - pbar.update(1) - - if local_rank == 0: - pbar.close() - print('Finish Train') - print('Start Validation') - pbar = tqdm(total=epoch_step_val, desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) - model_train.eval() - for iteration, batch in enumerate(gen_val): - if iteration >= epoch_step_val: - break - images, labels = batch - with torch.no_grad(): - if cuda: - images = images.cuda(local_rank) - labels = labels.cuda(local_rank) - - optimizer.zero_grad() - outputs = model_train(images, labels, mode="train") - loss = nn.NLLLoss()(F.log_softmax(outputs, -1), labels) - - accuracy = torch.mean((torch.argmax(F.softmax(outputs, dim=-1), dim=-1) == labels).type(torch.FloatTensor)) - - val_total_loss += loss.item() - val_total_accuracy += accuracy.item() - - if local_rank == 0: - pbar.set_postfix(**{'total_loss': val_total_loss / (iteration + 1), - 'accuracy' : val_total_accuracy / (iteration + 1), - 'lr' : get_lr(optimizer)}) - pbar.update(1) - - if lfw_eval_flag: - print("开始进行LFW数据集的验证。") - labels, distances = [], [] - for _, (data_a, data_p, label) in enumerate(test_loader): - with torch.no_grad(): - data_a, data_p = data_a.type(torch.FloatTensor), data_p.type(torch.FloatTensor) - if cuda: - data_a, data_p = data_a.cuda(local_rank), data_p.cuda(local_rank) - - out_a, out_p = model_train(data_a), model_train(data_p) - dists = 
torch.sqrt(torch.sum((out_a - out_p) ** 2, 1)) - distances.append(dists.data.cpu().numpy()) - labels.append(label.data.cpu().numpy()) - - labels = np.array([sublabel for label in labels for sublabel in label]) - distances = np.array([subdist for dist in distances for subdist in dist]) - _, _, accuracy, _, _, _, _ = evaluate(distances,labels) - - if local_rank == 0: - pbar.close() - print('Finish Validation') - - if lfw_eval_flag: - print('LFW_Accuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy))) - - loss_history.append_loss(epoch, np.mean(accuracy) if lfw_eval_flag else total_accuracy / epoch_step, total_loss / epoch_step, val_total_loss / epoch_step_val) - print('Total Loss: %.4f' % (total_loss / epoch_step)) - if (epoch + 1) % save_period == 0 or epoch + 1 == Epoch: - torch.save(model.state_dict(), os.path.join(save_dir, 'ep%03d-loss%.3f-val_loss%.3f.pth'%((epoch+1), total_loss / epoch_step, val_total_loss / epoch_step_val))) +import os + +import numpy as np +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from tqdm import tqdm + +from .utils import get_lr +from .utils_metrics import evaluate + + +def fit_one_epoch(model_train, model, loss_history, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, Epoch, cuda, test_loader, lfw_eval_flag, fp16, scaler, save_period, save_dir, local_rank=0): + total_loss = 0 + total_accuracy = 0 + + val_total_loss = 0 + val_total_accuracy = 0 + + if local_rank == 0: + print('Start Train') + pbar = tqdm(total=epoch_step,desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) + model_train.train() + for iteration, batch in enumerate(gen): + if iteration >= epoch_step: + break + images, labels = batch + with torch.no_grad(): + if cuda: + images = images.cuda(local_rank) + labels = labels.cuda(local_rank) + + #----------------------# + # 清零梯度 + #----------------------# + optimizer.zero_grad() + if not fp16: + outputs = model_train(images, labels, mode="train") + loss = nn.NLLLoss()(F.log_softmax(outputs, -1), labels) + + loss.backward() + optimizer.step() + else: + from torch.cuda.amp import autocast + with autocast(): + outputs = model_train(images, labels, mode="train") + loss = nn.NLLLoss()(F.log_softmax(outputs, -1), labels) + #----------------------# + # 反向传播 + #----------------------# + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + + with torch.no_grad(): + accuracy = torch.mean((torch.argmax(F.softmax(outputs, dim=-1), dim=-1) == labels).type(torch.FloatTensor)) + + total_loss += loss.item() + total_accuracy += accuracy.item() + + if local_rank == 0: + pbar.set_postfix(**{'total_loss': total_loss / (iteration + 1), + 'accuracy' : total_accuracy / (iteration + 1), + 'lr' : get_lr(optimizer)}) + pbar.update(1) + + if local_rank == 0: + pbar.close() + print('Finish Train') + print('Start Validation') + pbar = tqdm(total=epoch_step_val, desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) + model_train.eval() + for iteration, batch in enumerate(gen_val): + if iteration >= epoch_step_val: + break + images, labels = batch + with torch.no_grad(): + if cuda: + images = images.cuda(local_rank) + labels = labels.cuda(local_rank) + + optimizer.zero_grad() + outputs = model_train(images, labels, mode="train") + loss = nn.NLLLoss()(F.log_softmax(outputs, -1), labels) + + accuracy = torch.mean((torch.argmax(F.softmax(outputs, dim=-1), dim=-1) == labels).type(torch.FloatTensor)) + + val_total_loss += loss.item() + val_total_accuracy += 
accuracy.item() + + if local_rank == 0: + pbar.set_postfix(**{'total_loss': val_total_loss / (iteration + 1), + 'accuracy' : val_total_accuracy / (iteration + 1), + 'lr' : get_lr(optimizer)}) + pbar.update(1) + + if lfw_eval_flag: + print("开始进行LFW数据集的验证。") + labels, distances = [], [] + for _, (data_a, data_p, label) in enumerate(test_loader): + with torch.no_grad(): + data_a, data_p = data_a.type(torch.FloatTensor), data_p.type(torch.FloatTensor) + if cuda: + data_a, data_p = data_a.cuda(local_rank), data_p.cuda(local_rank) + + out_a, out_p = model_train(data_a), model_train(data_p) + dists = torch.sqrt(torch.sum((out_a - out_p) ** 2, 1)) + distances.append(dists.data.cpu().numpy()) + labels.append(label.data.cpu().numpy()) + + labels = np.array([sublabel for label in labels for sublabel in label]) + distances = np.array([subdist for dist in distances for subdist in dist]) + _, _, accuracy, _, _, _, _ = evaluate(distances,labels) + + if local_rank == 0: + pbar.close() + print('Finish Validation') + + if lfw_eval_flag: + print('LFW_Accuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy))) + + loss_history.append_loss(epoch, np.mean(accuracy) if lfw_eval_flag else total_accuracy / epoch_step, total_loss / epoch_step, val_total_loss / epoch_step_val) + print('Total Loss: %.4f' % (total_loss / epoch_step)) + if (epoch + 1) % save_period == 0 or epoch + 1 == Epoch: + torch.save(model.state_dict(), os.path.join(save_dir, 'ep%03d-loss%.3f-val_loss%.3f.pth'%((epoch+1), total_loss / epoch_step, val_total_loss / epoch_step_val))) diff --git a/cv/face_recognition/arcface/pytorch/utils/utils_metrics.py b/cv/face_recognition/arcface/pytorch/utils/utils_metrics.py index 110fc2bc1..7cdfe4804 100755 --- a/cv/face_recognition/arcface/pytorch/utils/utils_metrics.py +++ b/cv/face_recognition/arcface/pytorch/utils/utils_metrics.py @@ -1,157 +1,157 @@ -import numpy as np -import torch -from scipy import interpolate -from sklearn.model_selection import KFold -from tqdm import tqdm - -def evaluate(distances, labels, nrof_folds=10): - # Calculate evaluation metrics - thresholds = np.arange(0, 4, 0.01) - tpr, fpr, accuracy, best_thresholds = calculate_roc(thresholds, distances, - labels, nrof_folds=nrof_folds) - thresholds = np.arange(0, 4, 0.001) - val, val_std, far = calculate_val(thresholds, distances, - labels, 1e-3, nrof_folds=nrof_folds) - return tpr, fpr, accuracy, val, val_std, far, best_thresholds - -def calculate_roc(thresholds, distances, labels, nrof_folds=10): - - nrof_pairs = min(len(labels), len(distances)) - nrof_thresholds = len(thresholds) - k_fold = KFold(n_splits=nrof_folds, shuffle=False) - - tprs = np.zeros((nrof_folds,nrof_thresholds)) - fprs = np.zeros((nrof_folds,nrof_thresholds)) - accuracy = np.zeros((nrof_folds)) - - indices = np.arange(nrof_pairs) - - for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): - - # Find the best threshold for the fold - acc_train = np.zeros((nrof_thresholds)) - for threshold_idx, threshold in enumerate(thresholds): - _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, distances[train_set], labels[train_set]) - - best_threshold_index = np.argmax(acc_train) - for threshold_idx, threshold in enumerate(thresholds): - tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, distances[test_set], labels[test_set]) - _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], distances[test_set], labels[test_set]) - tpr = np.mean(tprs,0) - fpr = np.mean(fprs,0) - return 
tpr, fpr, accuracy, thresholds[best_threshold_index] - -def calculate_accuracy(threshold, dist, actual_issame): - predict_issame = np.less(dist, threshold) - tp = np.sum(np.logical_and(predict_issame, actual_issame)) - fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) - tn = np.sum(np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame))) - fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) - - tpr = 0 if (tp+fn==0) else float(tp) / float(tp+fn) - fpr = 0 if (fp+tn==0) else float(fp) / float(fp+tn) - acc = float(tp+tn)/dist.size - return tpr, fpr, acc - -def calculate_val(thresholds, distances, labels, far_target=1e-3, nrof_folds=10): - nrof_pairs = min(len(labels), len(distances)) - nrof_thresholds = len(thresholds) - k_fold = KFold(n_splits=nrof_folds, shuffle=False) - - val = np.zeros(nrof_folds) - far = np.zeros(nrof_folds) - - indices = np.arange(nrof_pairs) - - for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): - # Find the threshold that gives FAR = far_target - far_train = np.zeros(nrof_thresholds) - for threshold_idx, threshold in enumerate(thresholds): - _, far_train[threshold_idx] = calculate_val_far(threshold, distances[train_set], labels[train_set]) - if np.max(far_train)>=far_target: - f = interpolate.interp1d(far_train, thresholds, kind='slinear') - threshold = f(far_target) - else: - threshold = 0.0 - - val[fold_idx], far[fold_idx] = calculate_val_far(threshold, distances[test_set], labels[test_set]) - - val_mean = np.mean(val) - far_mean = np.mean(far) - val_std = np.std(val) - return val_mean, val_std, far_mean - -def calculate_val_far(threshold, dist, actual_issame): - predict_issame = np.less(dist, threshold) - true_accept = np.sum(np.logical_and(predict_issame, actual_issame)) - false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) - n_same = np.sum(actual_issame) - n_diff = np.sum(np.logical_not(actual_issame)) - if n_diff == 0: - n_diff = 1 - if n_same == 0: - return 0,0 - val = float(true_accept) / float(n_same) - far = float(false_accept) / float(n_diff) - return val, far - -def test(test_loader, model, png_save_path, log_interval, batch_size, cuda): - labels, distances = [], [] - pbar = tqdm(enumerate(test_loader)) - for batch_idx, (data_a, data_p, label) in pbar: - with torch.no_grad(): - #--------------------------------------# - # 加载数据,设置成cuda - #--------------------------------------# - data_a, data_p = data_a.type(torch.FloatTensor), data_p.type(torch.FloatTensor) - if cuda: - data_a, data_p = data_a.cuda(), data_p.cuda() - #--------------------------------------# - # 传入模型预测,获得预测结果 - # 获得预测结果的距离 - #--------------------------------------# - out_a, out_p = model(data_a), model(data_p) - dists = torch.sqrt(torch.sum((out_a - out_p) ** 2, 1)) - - #--------------------------------------# - # 将结果添加进列表中 - #--------------------------------------# - distances.append(dists.data.cpu().numpy()) - labels.append(label.data.cpu().numpy()) - - #--------------------------------------# - # 打印 - #--------------------------------------# - if batch_idx % log_interval == 0: - pbar.set_description('Test Epoch: [{}/{} ({:.0f}%)]'.format( - batch_idx * batch_size, len(test_loader.dataset), - 100. 
* batch_idx / len(test_loader))) - - #--------------------------------------# - # 转换成numpy - #--------------------------------------# - labels = np.array([sublabel for label in labels for sublabel in label]) - distances = np.array([subdist for dist in distances for subdist in dist]) - - tpr, fpr, accuracy, val, val_std, far, best_thresholds = evaluate(distances,labels) - print('Accuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy))) - print('Best_thresholds: %2.5f' % best_thresholds) - print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far)) - plot_roc(fpr, tpr, figure_name = png_save_path) - -def plot_roc(fpr, tpr, figure_name = "roc.png"): - import matplotlib.pyplot as plt - from sklearn.metrics import auc, roc_curve - roc_auc = auc(fpr, tpr) - fig = plt.figure() - lw = 2 - plt.plot(fpr, tpr, color='darkorange', - lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) - plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') - plt.xlim([0.0, 1.0]) - plt.ylim([0.0, 1.05]) - plt.xlabel('False Positive Rate') - plt.ylabel('True Positive Rate') - plt.title('Receiver operating characteristic') - plt.legend(loc="lower right") - fig.savefig(figure_name, dpi=fig.dpi) +import numpy as np +import torch +from scipy import interpolate +from sklearn.model_selection import KFold +from tqdm import tqdm + +def evaluate(distances, labels, nrof_folds=10): + # Calculate evaluation metrics + thresholds = np.arange(0, 4, 0.01) + tpr, fpr, accuracy, best_thresholds = calculate_roc(thresholds, distances, + labels, nrof_folds=nrof_folds) + thresholds = np.arange(0, 4, 0.001) + val, val_std, far = calculate_val(thresholds, distances, + labels, 1e-3, nrof_folds=nrof_folds) + return tpr, fpr, accuracy, val, val_std, far, best_thresholds + +def calculate_roc(thresholds, distances, labels, nrof_folds=10): + + nrof_pairs = min(len(labels), len(distances)) + nrof_thresholds = len(thresholds) + k_fold = KFold(n_splits=nrof_folds, shuffle=False) + + tprs = np.zeros((nrof_folds,nrof_thresholds)) + fprs = np.zeros((nrof_folds,nrof_thresholds)) + accuracy = np.zeros((nrof_folds)) + + indices = np.arange(nrof_pairs) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + + # Find the best threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + for threshold_idx, threshold in enumerate(thresholds): + _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, distances[train_set], labels[train_set]) + + best_threshold_index = np.argmax(acc_train) + for threshold_idx, threshold in enumerate(thresholds): + tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, distances[test_set], labels[test_set]) + _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], distances[test_set], labels[test_set]) + tpr = np.mean(tprs,0) + fpr = np.mean(fprs,0) + return tpr, fpr, accuracy, thresholds[best_threshold_index] + +def calculate_accuracy(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + tp = np.sum(np.logical_and(predict_issame, actual_issame)) + fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) + tn = np.sum(np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame))) + fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) + + tpr = 0 if (tp+fn==0) else float(tp) / float(tp+fn) + fpr = 0 if (fp+tn==0) else float(fp) / float(fp+tn) + acc = float(tp+tn)/dist.size + return tpr, fpr, acc + +def calculate_val(thresholds, 
distances, labels, far_target=1e-3, nrof_folds=10): + nrof_pairs = min(len(labels), len(distances)) + nrof_thresholds = len(thresholds) + k_fold = KFold(n_splits=nrof_folds, shuffle=False) + + val = np.zeros(nrof_folds) + far = np.zeros(nrof_folds) + + indices = np.arange(nrof_pairs) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + # Find the threshold that gives FAR = far_target + far_train = np.zeros(nrof_thresholds) + for threshold_idx, threshold in enumerate(thresholds): + _, far_train[threshold_idx] = calculate_val_far(threshold, distances[train_set], labels[train_set]) + if np.max(far_train)>=far_target: + f = interpolate.interp1d(far_train, thresholds, kind='slinear') + threshold = f(far_target) + else: + threshold = 0.0 + + val[fold_idx], far[fold_idx] = calculate_val_far(threshold, distances[test_set], labels[test_set]) + + val_mean = np.mean(val) + far_mean = np.mean(far) + val_std = np.std(val) + return val_mean, val_std, far_mean + +def calculate_val_far(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + true_accept = np.sum(np.logical_and(predict_issame, actual_issame)) + false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) + n_same = np.sum(actual_issame) + n_diff = np.sum(np.logical_not(actual_issame)) + if n_diff == 0: + n_diff = 1 + if n_same == 0: + return 0,0 + val = float(true_accept) / float(n_same) + far = float(false_accept) / float(n_diff) + return val, far + +def test(test_loader, model, png_save_path, log_interval, batch_size, cuda): + labels, distances = [], [] + pbar = tqdm(enumerate(test_loader)) + for batch_idx, (data_a, data_p, label) in pbar: + with torch.no_grad(): + #--------------------------------------# + # 加载数据,设置成cuda + #--------------------------------------# + data_a, data_p = data_a.type(torch.FloatTensor), data_p.type(torch.FloatTensor) + if cuda: + data_a, data_p = data_a.cuda(), data_p.cuda() + #--------------------------------------# + # 传入模型预测,获得预测结果 + # 获得预测结果的距离 + #--------------------------------------# + out_a, out_p = model(data_a), model(data_p) + dists = torch.sqrt(torch.sum((out_a - out_p) ** 2, 1)) + + #--------------------------------------# + # 将结果添加进列表中 + #--------------------------------------# + distances.append(dists.data.cpu().numpy()) + labels.append(label.data.cpu().numpy()) + + #--------------------------------------# + # 打印 + #--------------------------------------# + if batch_idx % log_interval == 0: + pbar.set_description('Test Epoch: [{}/{} ({:.0f}%)]'.format( + batch_idx * batch_size, len(test_loader.dataset), + 100. 
* batch_idx / len(test_loader))) + + #--------------------------------------# + # 转换成numpy + #--------------------------------------# + labels = np.array([sublabel for label in labels for sublabel in label]) + distances = np.array([subdist for dist in distances for subdist in dist]) + + tpr, fpr, accuracy, val, val_std, far, best_thresholds = evaluate(distances,labels) + print('Accuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy))) + print('Best_thresholds: %2.5f' % best_thresholds) + print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far)) + plot_roc(fpr, tpr, figure_name = png_save_path) + +def plot_roc(fpr, tpr, figure_name = "roc.png"): + import matplotlib.pyplot as plt + from sklearn.metrics import auc, roc_curve + roc_auc = auc(fpr, tpr) + fig = plt.figure() + lw = 2 + plt.plot(fpr, tpr, color='darkorange', + lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) + plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') + plt.xlim([0.0, 1.0]) + plt.ylim([0.0, 1.05]) + plt.xlabel('False Positive Rate') + plt.ylabel('True Positive Rate') + plt.title('Receiver operating characteristic') + plt.legend(loc="lower right") + fig.savefig(figure_name, dpi=fig.dpi) diff --git a/cv/face_recognition/cosface/pytorch/dataset.py b/cv/face_recognition/cosface/pytorch/dataset.py index 7c969c4f3..ff90c97ba 100644 --- a/cv/face_recognition/cosface/pytorch/dataset.py +++ b/cv/face_recognition/cosface/pytorch/dataset.py @@ -1,46 +1,46 @@ -# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. - -import torch.utils.data as data -from PIL import Image, ImageFile -import os - -ImageFile.LOAD_TRUNCATED_IAMGES = True - - -# https://github.com/pytorch/vision/issues/81 - -def PIL_loader(path): - try: - with open(path, 'rb') as f: - return Image.open(f).convert('RGB') - except IOError: - print('Cannot load image ' + path) - - -def default_reader(fileList): - imgList = [] - with open(fileList, 'r') as file: - for line in file.readlines(): - # imgPath, label = line.strip().split(' ') - label, imgPath = line.strip().split(';') - imgList.append((imgPath, int(label))) - return imgList - - -class ImageList(data.Dataset): - def __init__(self, root, fileList, transform=None, list_reader=default_reader, loader=PIL_loader): - self.root = root - self.imgList = list_reader(fileList) - self.transform = transform - self.loader = loader - - def __getitem__(self, index): - imgPath, target = self.imgList[index] - img = self.loader(os.path.join(self.root, imgPath)) - - if self.transform is not None: - img = self.transform(img) - return img, target - - def __len__(self): - return len(self.imgList) +# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
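The cosface dataset.py hunk being rewritten here consumes the same annotation format that arcface's txt_annotation.py (earlier in this patch) writes out: one sample per line, as <class_id>;<image_path>. A minimal sketch of reading such a file back, assuming a cls_train.txt has already been generated; the split on ';' mirrors the default_reader in this hunk:

# hypothetical check of the cls_train.txt annotation format
with open('cls_train.txt') as f:
    samples = [line.strip().split(';', 1) for line in f if line.strip()]
for label, img_path in samples[:3]:
    print(int(label), img_path)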
+ +import torch.utils.data as data +from PIL import Image, ImageFile +import os + +ImageFile.LOAD_TRUNCATED_IAMGES = True + + +# https://github.com/pytorch/vision/issues/81 + +def PIL_loader(path): + try: + with open(path, 'rb') as f: + return Image.open(f).convert('RGB') + except IOError: + print('Cannot load image ' + path) + + +def default_reader(fileList): + imgList = [] + with open(fileList, 'r') as file: + for line in file.readlines(): + # imgPath, label = line.strip().split(' ') + label, imgPath = line.strip().split(';') + imgList.append((imgPath, int(label))) + return imgList + + +class ImageList(data.Dataset): + def __init__(self, root, fileList, transform=None, list_reader=default_reader, loader=PIL_loader): + self.root = root + self.imgList = list_reader(fileList) + self.transform = transform + self.loader = loader + + def __getitem__(self, index): + imgPath, target = self.imgList[index] + img = self.loader(os.path.join(self.root, imgPath)) + + if self.transform is not None: + img = self.transform(img) + return img, target + + def __len__(self): + return len(self.imgList) diff --git a/cv/face_recognition/cosface/pytorch/layer.py b/cv/face_recognition/cosface/pytorch/layer.py index c54431ccc..52e2b57b0 100644 --- a/cv/face_recognition/cosface/pytorch/layer.py +++ b/cv/face_recognition/cosface/pytorch/layer.py @@ -1,107 +1,107 @@ -from __future__ import print_function -from __future__ import division -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn import Parameter -import math - -def cosine_sim(x1, x2, dim=1, eps=1e-8): - ip = torch.mm(x1, x2.t()) - w1 = torch.norm(x1, 2, dim) - w2 = torch.norm(x2, 2, dim) - return ip / torch.ger(w1,w2).clamp(min=eps) - -class MarginCosineProduct(nn.Module): - r"""Implement of large margin cosine distance: : - Args: - in_features: size of each input sample - out_features: size of each output sample - s: norm of input feature - m: margin - """ - - def __init__(self, in_features, out_features, s=30.0, m=0.40): - super(MarginCosineProduct, self).__init__() - self.in_features = in_features - self.out_features = out_features - self.s = s - self.m = m - self.weight = Parameter(torch.Tensor(out_features, in_features)) - nn.init.xavier_uniform_(self.weight) - #stdv = 1. 
/ math.sqrt(self.weight.size(1)) - #self.weight.data.uniform_(-stdv, stdv) - - def forward(self, input, label): - cosine = cosine_sim(input, self.weight) - # cosine = F.linear(F.normalize(input), F.normalize(self.weight)) - # --------------------------- convert label to one-hot --------------------------- - # https://discuss.pytorch.org/t/convert-int-into-one-hot-format/507 - one_hot = torch.zeros_like(cosine) - one_hot.scatter_(1, label.view(-1, 1), 1.0) - # -------------torch.where(out_i = {x_i if condition_i else y_i) ------------- - output = self.s * (cosine - one_hot * self.m) - - return output - - def __repr__(self): - return self.__class__.__name__ + '(' \ - + 'in_features=' + str(self.in_features) \ - + ', out_features=' + str(self.out_features) \ - + ', s=' + str(self.s) \ - + ', m=' + str(self.m) + ')' - - -class AngleLinear(nn.Module): - def __init__(self, in_features, out_features, m=4): - super(AngleLinear, self).__init__() - self.in_features = in_features - self.out_features = out_features - self.m = m - self.base = 1000.0 - self.gamma = 0.12 - self.power = 1 - self.LambdaMin = 5.0 - self.iter = 0 - self.weight = Parameter(torch.Tensor(out_features, in_features)) - nn.init.xavier_uniform_(self.weight) - - # duplication formula - self.mlambda = [ - lambda x: x ** 0, - lambda x: x ** 1, - lambda x: 2 * x ** 2 - 1, - lambda x: 4 * x ** 3 - 3 * x, - lambda x: 8 * x ** 4 - 8 * x ** 2 + 1, - lambda x: 16 * x ** 5 - 20 * x ** 3 + 5 * x - ] - - def forward(self, input, label): - # lambda = max(lambda_min,base*(1+gamma*iteration)^(-power)) - self.iter += 1 - self.lamb = max(self.LambdaMin, self.base * (1 + self.gamma * self.iter) ** (-1 * self.power)) - - # --------------------------- cos(theta) & phi(theta) --------------------------- - cos_theta = F.linear(F.normalize(input), F.normalize(self.weight)) - cos_theta = cos_theta.clamp(-1, 1) - cos_m_theta = self.mlambda[self.m](cos_theta) - theta = cos_theta.data.acos() - k = (self.m * theta / 3.14159265).floor() - phi_theta = ((-1.0) ** k) * cos_m_theta - 2 * k - NormOfFeature = torch.norm(input, 2, 1) - - # --------------------------- convert label to one-hot --------------------------- - one_hot = torch.zeros_like(cos_theta) - one_hot.scatter_(1, label.view(-1, 1), 1) - - # --------------------------- Calculate output --------------------------- - output = (one_hot * (phi_theta - cos_theta) / (1 + self.lamb)) + cos_theta - output *= NormOfFeature.view(-1, 1) - - return output - - def __repr__(self): - return self.__class__.__name__ + '(' \ - + 'in_features=' + str(self.in_features) \ - + ', out_features=' + str(self.out_features) \ - + ', m=' + str(self.m) + ')' +from __future__ import print_function +from __future__ import division +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Parameter +import math + +def cosine_sim(x1, x2, dim=1, eps=1e-8): + ip = torch.mm(x1, x2.t()) + w1 = torch.norm(x1, 2, dim) + w2 = torch.norm(x2, 2, dim) + return ip / torch.ger(w1,w2).clamp(min=eps) + +class MarginCosineProduct(nn.Module): + r"""Implement of large margin cosine distance: : + Args: + in_features: size of each input sample + out_features: size of each output sample + s: norm of input feature + m: margin + """ + + def __init__(self, in_features, out_features, s=30.0, m=0.40): + super(MarginCosineProduct, self).__init__() + self.in_features = in_features + self.out_features = out_features + self.s = s + self.m = m + self.weight = Parameter(torch.Tensor(out_features, in_features)) + 
nn.init.xavier_uniform_(self.weight) + #stdv = 1. / math.sqrt(self.weight.size(1)) + #self.weight.data.uniform_(-stdv, stdv) + + def forward(self, input, label): + cosine = cosine_sim(input, self.weight) + # cosine = F.linear(F.normalize(input), F.normalize(self.weight)) + # --------------------------- convert label to one-hot --------------------------- + # https://discuss.pytorch.org/t/convert-int-into-one-hot-format/507 + one_hot = torch.zeros_like(cosine) + one_hot.scatter_(1, label.view(-1, 1), 1.0) + # -------------torch.where(out_i = {x_i if condition_i else y_i) ------------- + output = self.s * (cosine - one_hot * self.m) + + return output + + def __repr__(self): + return self.__class__.__name__ + '(' \ + + 'in_features=' + str(self.in_features) \ + + ', out_features=' + str(self.out_features) \ + + ', s=' + str(self.s) \ + + ', m=' + str(self.m) + ')' + + +class AngleLinear(nn.Module): + def __init__(self, in_features, out_features, m=4): + super(AngleLinear, self).__init__() + self.in_features = in_features + self.out_features = out_features + self.m = m + self.base = 1000.0 + self.gamma = 0.12 + self.power = 1 + self.LambdaMin = 5.0 + self.iter = 0 + self.weight = Parameter(torch.Tensor(out_features, in_features)) + nn.init.xavier_uniform_(self.weight) + + # duplication formula + self.mlambda = [ + lambda x: x ** 0, + lambda x: x ** 1, + lambda x: 2 * x ** 2 - 1, + lambda x: 4 * x ** 3 - 3 * x, + lambda x: 8 * x ** 4 - 8 * x ** 2 + 1, + lambda x: 16 * x ** 5 - 20 * x ** 3 + 5 * x + ] + + def forward(self, input, label): + # lambda = max(lambda_min,base*(1+gamma*iteration)^(-power)) + self.iter += 1 + self.lamb = max(self.LambdaMin, self.base * (1 + self.gamma * self.iter) ** (-1 * self.power)) + + # --------------------------- cos(theta) & phi(theta) --------------------------- + cos_theta = F.linear(F.normalize(input), F.normalize(self.weight)) + cos_theta = cos_theta.clamp(-1, 1) + cos_m_theta = self.mlambda[self.m](cos_theta) + theta = cos_theta.data.acos() + k = (self.m * theta / 3.14159265).floor() + phi_theta = ((-1.0) ** k) * cos_m_theta - 2 * k + NormOfFeature = torch.norm(input, 2, 1) + + # --------------------------- convert label to one-hot --------------------------- + one_hot = torch.zeros_like(cos_theta) + one_hot.scatter_(1, label.view(-1, 1), 1) + + # --------------------------- Calculate output --------------------------- + output = (one_hot * (phi_theta - cos_theta) / (1 + self.lamb)) + cos_theta + output *= NormOfFeature.view(-1, 1) + + return output + + def __repr__(self): + return self.__class__.__name__ + '(' \ + + 'in_features=' + str(self.in_features) \ + + ', out_features=' + str(self.out_features) \ + + ', m=' + str(self.m) + ')' diff --git a/cv/face_recognition/cosface/pytorch/lfw_eval.py b/cv/face_recognition/cosface/pytorch/lfw_eval.py index 63dc4b7dd..c335ef81a 100644 --- a/cv/face_recognition/cosface/pytorch/lfw_eval.py +++ b/cv/face_recognition/cosface/pytorch/lfw_eval.py @@ -1,117 +1,117 @@ -# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
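The MarginCosineProduct in the layer.py hunk above implements the CosFace logit: for a sample with label y the target logit becomes s * (cos(theta_y) - m), while every other class keeps s * cos(theta_j); AngleLinear below it is the SphereFace-style multiplicative angular-margin variant. A minimal shape-check sketch, assuming torch is installed and the cosface/pytorch directory is on PYTHONPATH (s=30.0 and m=0.40 are the defaults from the code above):

import torch
from layer import MarginCosineProduct

head = MarginCosineProduct(in_features=512, out_features=10, s=30.0, m=0.40)
features = torch.randn(4, 512)        # embeddings from the backbone
labels = torch.randint(0, 10, (4,))   # ground-truth class ids
logits = head(features, labels)       # s * (cosine - one_hot * m)
print(logits.shape)                   # expected: torch.Size([4, 10])

During training these margin-adjusted logits are fed to a softmax cross-entropy-style loss, as the arcface training loop elsewhere in this patch does with NLLLoss over log-softmax.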
- -from PIL import Image -import numpy as np - -from torchvision.transforms import functional as F -import torchvision.transforms as transforms -import torch -from torch.autograd import Variable -import torch.backends.cudnn as cudnn - -cudnn.benchmark = True - -import net - - -def extractDeepFeature(img, model, is_gray): - if is_gray: - transform = transforms.Compose([ - transforms.Grayscale(), - transforms.ToTensor(), # range [0, 255] -> [0.0,1.0] - transforms.Normalize(mean=(0.5,), std=(0.5,)) # range [0.0, 1.0] -> [-1.0,1.0] - ]) - else: - transform = transforms.Compose([ - transforms.ToTensor(), # range [0, 255] -> [0.0,1.0] - transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) # range [0.0, 1.0] -> [-1.0,1.0] - ]) - img, img_ = transform(img), transform(F.hflip(img)) - img, img_ = img.unsqueeze(0).to('cuda'), img_.unsqueeze(0).to('cuda') - ft = torch.cat((model(img), model(img_)), 1)[0].to('cpu') - return ft - - -def KFold(n=6000, n_folds=10): - folds = [] - base = list(range(n)) - for i in range(n_folds): - test = base[int(i * n / n_folds):int((i + 1) * n / n_folds)] - train = list(set(base) - set(test)) - folds.append([train, test]) - return folds - - -def eval_acc(threshold, diff): - y_true = [] - y_predict = [] - for d in diff: - same = 1 if float(d[2]) > threshold else 0 - y_predict.append(same) - y_true.append(int(d[3])) - y_true = np.array(y_true) - y_predict = np.array(y_predict) - accuracy = 1.0 * np.count_nonzero(y_true == y_predict) / len(y_true) - return accuracy - - -def find_best_threshold(thresholds, predicts): - best_threshold = best_acc = 0 - for threshold in thresholds: - accuracy = eval_acc(threshold, predicts) - if accuracy >= best_acc: - best_acc = accuracy - best_threshold = threshold - return best_threshold - - -def eval(model, model_path=None, is_gray=False): - predicts = [] - model.load_state_dict(torch.load(model_path)) - model.eval() - root = 'datasets/lfw/' - with open('datasets/lfw_pair.txt') as f: - pairs_lines = f.readlines()[1:] - - with torch.no_grad(): - for i in range(6000): - p = pairs_lines[i].replace('\n', '').split('\t') - - if 3 == len(p): - sameflag = 1 - name1 = p[0] + '/' + p[0] + '_' + '{:04}.jpg'.format(int(p[1])) - name2 = p[0] + '/' + p[0] + '_' + '{:04}.jpg'.format(int(p[2])) - elif 4 == len(p): - sameflag = 0 - name1 = p[0] + '/' + p[0] + '_' + '{:04}.jpg'.format(int(p[1])) - name2 = p[2] + '/' + p[2] + '_' + '{:04}.jpg'.format(int(p[3])) - else: - raise ValueError("WRONG LINE IN 'pairs.txt! 
") - - with open(root + name1, 'rb') as f: - img1 = Image.open(f).convert('RGB') - with open(root + name2, 'rb') as f: - img2 = Image.open(f).convert('RGB') - f1 = extractDeepFeature(img1, model, is_gray) - f2 = extractDeepFeature(img2, model, is_gray) - - distance = f1.dot(f2) / (f1.norm() * f2.norm() + 1e-5) - predicts.append('{}\t{}\t{}\t{}\n'.format(name1, name2, distance, sameflag)) - - accuracy = [] - thd = [] - folds = KFold(n=6000, n_folds=10) - thresholds = np.arange(-1.0, 1.0, 0.005) - predicts = np.array(list(map(lambda line: line.strip('\n').split(), predicts))) - for idx, (train, test) in enumerate(folds): - best_thresh = find_best_threshold(thresholds, predicts[train]) - accuracy.append(eval_acc(best_thresh, predicts[test])) - thd.append(best_thresh) - print('LFWACC={:.4f} std={:.4f} thd={:.4f}'.format(np.mean(accuracy), np.std(accuracy), np.mean(thd))) - - return np.mean(accuracy), predicts - - -if __name__ == '__main__': - _, result = eval(net.sphere().to('cuda'), model_path='checkpoint/CosFace_24_checkpoint.pth') - np.savetxt("result.txt", result, '%s') +# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. + +from PIL import Image +import numpy as np + +from torchvision.transforms import functional as F +import torchvision.transforms as transforms +import torch +from torch.autograd import Variable +import torch.backends.cudnn as cudnn + +cudnn.benchmark = True + +import net + + +def extractDeepFeature(img, model, is_gray): + if is_gray: + transform = transforms.Compose([ + transforms.Grayscale(), + transforms.ToTensor(), # range [0, 255] -> [0.0,1.0] + transforms.Normalize(mean=(0.5,), std=(0.5,)) # range [0.0, 1.0] -> [-1.0,1.0] + ]) + else: + transform = transforms.Compose([ + transforms.ToTensor(), # range [0, 255] -> [0.0,1.0] + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) # range [0.0, 1.0] -> [-1.0,1.0] + ]) + img, img_ = transform(img), transform(F.hflip(img)) + img, img_ = img.unsqueeze(0).to('cuda'), img_.unsqueeze(0).to('cuda') + ft = torch.cat((model(img), model(img_)), 1)[0].to('cpu') + return ft + + +def KFold(n=6000, n_folds=10): + folds = [] + base = list(range(n)) + for i in range(n_folds): + test = base[int(i * n / n_folds):int((i + 1) * n / n_folds)] + train = list(set(base) - set(test)) + folds.append([train, test]) + return folds + + +def eval_acc(threshold, diff): + y_true = [] + y_predict = [] + for d in diff: + same = 1 if float(d[2]) > threshold else 0 + y_predict.append(same) + y_true.append(int(d[3])) + y_true = np.array(y_true) + y_predict = np.array(y_predict) + accuracy = 1.0 * np.count_nonzero(y_true == y_predict) / len(y_true) + return accuracy + + +def find_best_threshold(thresholds, predicts): + best_threshold = best_acc = 0 + for threshold in thresholds: + accuracy = eval_acc(threshold, predicts) + if accuracy >= best_acc: + best_acc = accuracy + best_threshold = threshold + return best_threshold + + +def eval(model, model_path=None, is_gray=False): + predicts = [] + model.load_state_dict(torch.load(model_path)) + model.eval() + root = 'datasets/lfw/' + with open('datasets/lfw_pair.txt') as f: + pairs_lines = f.readlines()[1:] + + with torch.no_grad(): + for i in range(6000): + p = pairs_lines[i].replace('\n', '').split('\t') + + if 3 == len(p): + sameflag = 1 + name1 = p[0] + '/' + p[0] + '_' + '{:04}.jpg'.format(int(p[1])) + name2 = p[0] + '/' + p[0] + '_' + '{:04}.jpg'.format(int(p[2])) + elif 4 == len(p): + sameflag = 0 + name1 = p[0] + '/' + p[0] + '_' + '{:04}.jpg'.format(int(p[1])) + name2 = p[2] 
+ '/' + p[2] + '_' + '{:04}.jpg'.format(int(p[3])) + else: + raise ValueError("WRONG LINE IN 'pairs.txt! ") + + with open(root + name1, 'rb') as f: + img1 = Image.open(f).convert('RGB') + with open(root + name2, 'rb') as f: + img2 = Image.open(f).convert('RGB') + f1 = extractDeepFeature(img1, model, is_gray) + f2 = extractDeepFeature(img2, model, is_gray) + + distance = f1.dot(f2) / (f1.norm() * f2.norm() + 1e-5) + predicts.append('{}\t{}\t{}\t{}\n'.format(name1, name2, distance, sameflag)) + + accuracy = [] + thd = [] + folds = KFold(n=6000, n_folds=10) + thresholds = np.arange(-1.0, 1.0, 0.005) + predicts = np.array(list(map(lambda line: line.strip('\n').split(), predicts))) + for idx, (train, test) in enumerate(folds): + best_thresh = find_best_threshold(thresholds, predicts[train]) + accuracy.append(eval_acc(best_thresh, predicts[test])) + thd.append(best_thresh) + print('LFWACC={:.4f} std={:.4f} thd={:.4f}'.format(np.mean(accuracy), np.std(accuracy), np.mean(thd))) + + return np.mean(accuracy), predicts + + +if __name__ == '__main__': + _, result = eval(net.sphere().to('cuda'), model_path='checkpoint/CosFace_24_checkpoint.pth') + np.savetxt("result.txt", result, '%s') diff --git a/cv/face_recognition/cosface/pytorch/main.py b/cv/face_recognition/cosface/pytorch/main.py index 877816e18..348b2ab00 100644 --- a/cv/face_recognition/cosface/pytorch/main.py +++ b/cv/face_recognition/cosface/pytorch/main.py @@ -1,208 +1,208 @@ -# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. - -from __future__ import print_function -from __future__ import division -import argparse -import os -import time - -import torch -import torch.utils.data -import torch.optim -import torchvision.transforms as transforms -import torch.backends.cudnn as cudnn - -cudnn.benchmark = True - -import net -from dataset import ImageList -import lfw_eval -import layer - -#os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" - -# Training settings -parser = argparse.ArgumentParser(description='PyTorch CosFace') - -# DATA -parser.add_argument('--root_path', type=str, default='', - help='path to root path of images') -parser.add_argument('--database', type=str, default='WebFace', - help='Which Database for train. (WebFace, VggFace2)') -parser.add_argument('--train_list', type=str, default=None, - help='path to training list') -parser.add_argument('--batch_size', type=int, default=512, - help='input batch size for training (default: 512)') -parser.add_argument('--is_gray', type=bool, default=False, - help='Transform input image to gray or not (default: False)') -# Network -parser.add_argument('--network', type=str, default='sphere20', - help='Which network for train. (sphere20, sphere64, LResNet50E_IR)') -# Classifier -parser.add_argument('--num_class', type=int, default=None, - help='number of people(class)') -parser.add_argument('--classifier_type', type=str, default='MCP', - help='Which classifier for train. 
(MCP, AL, L)') -# LR policy -parser.add_argument('--epochs', type=int, default=30, - help='number of epochs to train (default: 30)') -parser.add_argument('--lr', type=float, default=0.1, - help='learning rate (default: 0.1)') -parser.add_argument('--step_size', type=list, default=None, - help='lr decay step') # [15000, 22000, 26000][80000,120000,140000][100000, 140000, 160000] -parser.add_argument('--momentum', type=float, default=0.9, - help='SGD momentum (default: 0.9)') -parser.add_argument('--weight_decay', type=float, default=5e-4, - metavar='W', help='weight decay (default: 0.0005)') -# Common settings -parser.add_argument('--log_interval', type=int, default=100, - help='how many batches to wait before logging training status') -parser.add_argument('--save_path', type=str, default='checkpoint/', - help='path to save checkpoint') -parser.add_argument('--no_cuda', type=bool, default=False, - help='disables CUDA training') -parser.add_argument('--workers', type=int, default=4, - help='how many workers to load data') -args = parser.parse_args() -args.cuda = not args.no_cuda and torch.cuda.is_available() -device = torch.device("cuda" if args.cuda else "cpu") - -if args.database is 'WebFace': - args.train_list = 'cls_train.txt' - args.num_class = 10575 - args.step_size = [16000, 24000] -elif args.database is 'VggFace2': - args.train_list = 'VGG-Face2-112X96.txt' - args.num_class = 8069 - args.step_size = [80000, 120000, 140000] -else: - raise ValueError("NOT SUPPORT DATABASE! ") - - -def main(): - # --------------------------------------model---------------------------------------- - if args.network is 'sphere20': - model = net.sphere(type=20, is_gray=args.is_gray) - model_eval = net.sphere(type=20, is_gray=args.is_gray) - elif args.network is 'sphere64': - model = net.sphere(type=64, is_gray=args.is_gray) - model_eval = net.sphere(type=64, is_gray=args.is_gray) - elif args.network is 'LResNet50E_IR': - model = net.LResNet50E_IR(is_gray=args.is_gray) - model_eval = net.LResNet50E_IR(is_gray=args.is_gray) - else: - raise ValueError("NOT SUPPORT NETWORK! 
") - - num_gpus = torch.cuda.device_count() - model = torch.nn.DataParallel(model).to(device) - model_eval = model_eval.to(device) - print(model) - if not os.path.exists(args.save_path): - os.makedirs(args.save_path) - model.module.save(args.save_path + 'CosFace_0_checkpoint.pth') - - # 512 is dimension of feature - classifier = { - 'MCP': layer.MarginCosineProduct(512, args.num_class).to(device), - 'AL' : layer.AngleLinear(512, args.num_class).to(device), - 'L' : torch.nn.Linear(512, args.num_class, bias=False).to(device) - }[args.classifier_type] - - # ------------------------------------load image--------------------------------------- - if args.is_gray: - train_transform = transforms.Compose([ - transforms.Grayscale(), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), # range [0, 255] -> [0.0,1.0] - transforms.Normalize(mean=(0.5,), std=(0.5,)) - ]) # gray - else: - train_transform = transforms.Compose([ - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), # range [0, 255] -> [0.0,1.0] - transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) # range [0.0, 1.0] -> [-1.0,1.0] - ]) - train_loader = torch.utils.data.DataLoader( - ImageList(root=args.root_path, fileList=args.train_list, - transform=train_transform), - batch_size=args.batch_size * num_gpus, shuffle=True, - num_workers=args.workers, pin_memory=True, drop_last=True) - - print('length of train Database: ' + str(len(train_loader.dataset))) - print('Number of Identities: ' + str(args.num_class)) - - # --------------------------------loss function and optimizer----------------------------- - criterion = torch.nn.CrossEntropyLoss().to(device) - optimizer = torch.optim.SGD([{'params': model.parameters()}, {'params': classifier.parameters()}], - lr=args.lr, - momentum=args.momentum, - weight_decay=args.weight_decay) - - # ----------------------------------------train---------------------------------------- - # lfw_eval.eval(model_eval, args.save_path + 'CosFace_0_checkpoint.pth') - for epoch in range(1, args.epochs + 1): - train(train_loader, model, classifier, criterion, optimizer, epoch) - model.module.save(args.save_path + 'CosFace_' + str(epoch) + '_checkpoint.pth') - lfw_eval.eval(model_eval, args.save_path + 'CosFace_' + str(epoch) + '_checkpoint.pth', args.is_gray) - print('Finished Training') - - -def train(train_loader, model, classifier, criterion, optimizer, epoch): - model.train() - print_with_time('Epoch {} start training'.format(epoch)) - time_curr = time.time() - loss_display = 0.0 - - for batch_idx, (data, target) in enumerate(train_loader, 1): - iteration = (epoch - 1) * len(train_loader) + batch_idx - adjust_learning_rate(optimizer, iteration, args.step_size) - data, target = data.to(device), target.to(device) - # compute output - output = model(data) - if isinstance(classifier, torch.nn.Linear): - output = classifier(output) - else: - output = classifier(output, target) - loss = criterion(output, target) - loss_display += loss.item() - # compute gradient and do SGD step - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if batch_idx % args.log_interval == 0: - time_used = time.time() - time_curr - loss_display /= args.log_interval - if args.classifier_type is 'MCP': - INFO = ' Margin: {:.4f}, Scale: {:.2f}'.format(classifier.m, classifier.s) - elif args.classifier_type is 'AL': - INFO = ' lambda: {:.4f}'.format(classifier.lamb) - else: - INFO = '' - print_with_time( - 'Train Epoch: {} [{}/{} ({:.0f}%)]{}, Loss: {:.6f}, Elapsed time: {:.4f}s({} iters)'.format( - epoch, 
batch_idx * len(data), len(train_loader.dataset), 100. * batch_idx / len(train_loader), - iteration, loss_display, time_used, args.log_interval) + INFO - ) - time_curr = time.time() - loss_display = 0.0 - - -def print_with_time(string): - print(time.strftime("%Y-%m-%d %H:%M:%S ", time.localtime()) + string) - - -def adjust_learning_rate(optimizer, iteration, step_size): - """Sets the learning rate to the initial LR decayed by 10 each step size""" - if iteration in step_size: - lr = args.lr * (0.1 ** (step_size.index(iteration) + 1)) - print_with_time('Adjust learning rate to {}'.format(lr)) - for param_group in optimizer.param_groups: - param_group['lr'] = lr - else: - pass - - -if __name__ == '__main__': - print(args) - main() +# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. + +from __future__ import print_function +from __future__ import division +import argparse +import os +import time + +import torch +import torch.utils.data +import torch.optim +import torchvision.transforms as transforms +import torch.backends.cudnn as cudnn + +cudnn.benchmark = True + +import net +from dataset import ImageList +import lfw_eval +import layer + +#os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch CosFace') + +# DATA +parser.add_argument('--root_path', type=str, default='', + help='path to root path of images') +parser.add_argument('--database', type=str, default='WebFace', + help='Which Database for train. (WebFace, VggFace2)') +parser.add_argument('--train_list', type=str, default=None, + help='path to training list') +parser.add_argument('--batch_size', type=int, default=512, + help='input batch size for training (default: 512)') +parser.add_argument('--is_gray', type=bool, default=False, + help='Transform input image to gray or not (default: False)') +# Network +parser.add_argument('--network', type=str, default='sphere20', + help='Which network for train. (sphere20, sphere64, LResNet50E_IR)') +# Classifier +parser.add_argument('--num_class', type=int, default=None, + help='number of people(class)') +parser.add_argument('--classifier_type', type=str, default='MCP', + help='Which classifier for train. 
(MCP, AL, L)') +# LR policy +parser.add_argument('--epochs', type=int, default=30, + help='number of epochs to train (default: 30)') +parser.add_argument('--lr', type=float, default=0.1, + help='learning rate (default: 0.1)') +parser.add_argument('--step_size', type=list, default=None, + help='lr decay step') # [15000, 22000, 26000][80000,120000,140000][100000, 140000, 160000] +parser.add_argument('--momentum', type=float, default=0.9, + help='SGD momentum (default: 0.9)') +parser.add_argument('--weight_decay', type=float, default=5e-4, + metavar='W', help='weight decay (default: 0.0005)') +# Common settings +parser.add_argument('--log_interval', type=int, default=100, + help='how many batches to wait before logging training status') +parser.add_argument('--save_path', type=str, default='checkpoint/', + help='path to save checkpoint') +parser.add_argument('--no_cuda', type=bool, default=False, + help='disables CUDA training') +parser.add_argument('--workers', type=int, default=4, + help='how many workers to load data') +args = parser.parse_args() +args.cuda = not args.no_cuda and torch.cuda.is_available() +device = torch.device("cuda" if args.cuda else "cpu") + +if args.database is 'WebFace': + args.train_list = 'cls_train.txt' + args.num_class = 10575 + args.step_size = [16000, 24000] +elif args.database is 'VggFace2': + args.train_list = 'VGG-Face2-112X96.txt' + args.num_class = 8069 + args.step_size = [80000, 120000, 140000] +else: + raise ValueError("NOT SUPPORT DATABASE! ") + + +def main(): + # --------------------------------------model---------------------------------------- + if args.network is 'sphere20': + model = net.sphere(type=20, is_gray=args.is_gray) + model_eval = net.sphere(type=20, is_gray=args.is_gray) + elif args.network is 'sphere64': + model = net.sphere(type=64, is_gray=args.is_gray) + model_eval = net.sphere(type=64, is_gray=args.is_gray) + elif args.network is 'LResNet50E_IR': + model = net.LResNet50E_IR(is_gray=args.is_gray) + model_eval = net.LResNet50E_IR(is_gray=args.is_gray) + else: + raise ValueError("NOT SUPPORT NETWORK! 
") + + num_gpus = torch.cuda.device_count() + model = torch.nn.DataParallel(model).to(device) + model_eval = model_eval.to(device) + print(model) + if not os.path.exists(args.save_path): + os.makedirs(args.save_path) + model.module.save(args.save_path + 'CosFace_0_checkpoint.pth') + + # 512 is dimension of feature + classifier = { + 'MCP': layer.MarginCosineProduct(512, args.num_class).to(device), + 'AL' : layer.AngleLinear(512, args.num_class).to(device), + 'L' : torch.nn.Linear(512, args.num_class, bias=False).to(device) + }[args.classifier_type] + + # ------------------------------------load image--------------------------------------- + if args.is_gray: + train_transform = transforms.Compose([ + transforms.Grayscale(), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), # range [0, 255] -> [0.0,1.0] + transforms.Normalize(mean=(0.5,), std=(0.5,)) + ]) # gray + else: + train_transform = transforms.Compose([ + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), # range [0, 255] -> [0.0,1.0] + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) # range [0.0, 1.0] -> [-1.0,1.0] + ]) + train_loader = torch.utils.data.DataLoader( + ImageList(root=args.root_path, fileList=args.train_list, + transform=train_transform), + batch_size=args.batch_size * num_gpus, shuffle=True, + num_workers=args.workers, pin_memory=True, drop_last=True) + + print('length of train Database: ' + str(len(train_loader.dataset))) + print('Number of Identities: ' + str(args.num_class)) + + # --------------------------------loss function and optimizer----------------------------- + criterion = torch.nn.CrossEntropyLoss().to(device) + optimizer = torch.optim.SGD([{'params': model.parameters()}, {'params': classifier.parameters()}], + lr=args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + + # ----------------------------------------train---------------------------------------- + # lfw_eval.eval(model_eval, args.save_path + 'CosFace_0_checkpoint.pth') + for epoch in range(1, args.epochs + 1): + train(train_loader, model, classifier, criterion, optimizer, epoch) + model.module.save(args.save_path + 'CosFace_' + str(epoch) + '_checkpoint.pth') + lfw_eval.eval(model_eval, args.save_path + 'CosFace_' + str(epoch) + '_checkpoint.pth', args.is_gray) + print('Finished Training') + + +def train(train_loader, model, classifier, criterion, optimizer, epoch): + model.train() + print_with_time('Epoch {} start training'.format(epoch)) + time_curr = time.time() + loss_display = 0.0 + + for batch_idx, (data, target) in enumerate(train_loader, 1): + iteration = (epoch - 1) * len(train_loader) + batch_idx + adjust_learning_rate(optimizer, iteration, args.step_size) + data, target = data.to(device), target.to(device) + # compute output + output = model(data) + if isinstance(classifier, torch.nn.Linear): + output = classifier(output) + else: + output = classifier(output, target) + loss = criterion(output, target) + loss_display += loss.item() + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if batch_idx % args.log_interval == 0: + time_used = time.time() - time_curr + loss_display /= args.log_interval + if args.classifier_type is 'MCP': + INFO = ' Margin: {:.4f}, Scale: {:.2f}'.format(classifier.m, classifier.s) + elif args.classifier_type is 'AL': + INFO = ' lambda: {:.4f}'.format(classifier.lamb) + else: + INFO = '' + print_with_time( + 'Train Epoch: {} [{}/{} ({:.0f}%)]{}, Loss: {:.6f}, Elapsed time: {:.4f}s({} iters)'.format( + epoch, 
batch_idx * len(data), len(train_loader.dataset), 100. * batch_idx / len(train_loader), + iteration, loss_display, time_used, args.log_interval) + INFO + ) + time_curr = time.time() + loss_display = 0.0 + + +def print_with_time(string): + print(time.strftime("%Y-%m-%d %H:%M:%S ", time.localtime()) + string) + + +def adjust_learning_rate(optimizer, iteration, step_size): + """Sets the learning rate to the initial LR decayed by 10 each step size""" + if iteration in step_size: + lr = args.lr * (0.1 ** (step_size.index(iteration) + 1)) + print_with_time('Adjust learning rate to {}'.format(lr)) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + else: + pass + + +if __name__ == '__main__': + print(args) + main() diff --git a/cv/face_recognition/cosface/pytorch/txt_annotation.py b/cv/face_recognition/cosface/pytorch/txt_annotation.py index 63f5ef606..2d81b9a5b 100755 --- a/cv/face_recognition/cosface/pytorch/txt_annotation.py +++ b/cv/face_recognition/cosface/pytorch/txt_annotation.py @@ -1,39 +1,39 @@ -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -#------------------------------------------------# -# 进行训练前需要利用这个文件生成cls_train.txt -#------------------------------------------------# -import os - -if __name__ == "__main__": - #---------------------# - # 训练集所在的路径 - #---------------------# - datasets_path = "datasets/datasets" - - types_name = os.listdir(datasets_path) - types_name = sorted(types_name) - - list_file = open('cls_train.txt', 'w') - for cls_id, type_name in enumerate(types_name): - photos_path = os.path.join(datasets_path, type_name) - if not os.path.isdir(photos_path): - continue - photos_name = os.listdir(photos_path) - - for photo_name in photos_name: - list_file.write(str(cls_id) + ";" + '%s'%(os.path.join(os.path.abspath(datasets_path), type_name, photo_name))) - list_file.write('\n') - list_file.close() +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+#------------------------------------------------# +# 进行训练前需要利用这个文件生成cls_train.txt +#------------------------------------------------# +import os + +if __name__ == "__main__": + #---------------------# + # 训练集所在的路径 + #---------------------# + datasets_path = "datasets/datasets" + + types_name = os.listdir(datasets_path) + types_name = sorted(types_name) + + list_file = open('cls_train.txt', 'w') + for cls_id, type_name in enumerate(types_name): + photos_path = os.path.join(datasets_path, type_name) + if not os.path.isdir(photos_path): + continue + photos_name = os.listdir(photos_path) + + for photo_name in photos_name: + list_file.write(str(cls_id) + ";" + '%s'%(os.path.join(os.path.abspath(datasets_path), type_name, photo_name))) + list_file.write('\n') + list_file.close() diff --git a/cv/face_recognition/facenet/pytorch/eval_LFW.py b/cv/face_recognition/facenet/pytorch/eval_LFW.py index 6ff72dc3f..9bc300254 100644 --- a/cv/face_recognition/facenet/pytorch/eval_LFW.py +++ b/cv/face_recognition/facenet/pytorch/eval_LFW.py @@ -1,59 +1,59 @@ -import torch -import torch.backends.cudnn as cudnn - -from nets.facenet import Facenet -from utils.dataloader import LFWDataset -from utils.utils_metrics import test - -if __name__ == "__main__": - #--------------------------------------# - # 是否使用Cuda - # 没有GPU可以设置成False - #--------------------------------------# - cuda = True - #--------------------------------------# - # 主干特征提取网络的选择 - # mobilenet - # inception_resnetv1 - #--------------------------------------# - backbone = "mobilenet" - #--------------------------------------------------------# - # 输入图像大小,常用设置如[112, 112, 3] - #--------------------------------------------------------# - input_shape = [160, 160, 3] - #--------------------------------------# - # 训练好的权值文件 - #--------------------------------------# - model_path = "model_data/facenet_mobilenet.pth" - #--------------------------------------# - # LFW评估数据集的文件路径 - # 以及对应的txt文件 - #--------------------------------------# - lfw_dir_path = "lfw" - lfw_pairs_path = "model_data/lfw_pair.txt" - #--------------------------------------# - # 评估的批次大小和记录间隔 - #--------------------------------------# - batch_size = 256 - log_interval = 1 - #--------------------------------------# - # ROC图的保存路径 - #--------------------------------------# - png_save_path = "model_data/roc_test.png" - - test_loader = torch.utils.data.DataLoader( - LFWDataset(dir=lfw_dir_path, pairs_path=lfw_pairs_path, image_size=input_shape), batch_size=batch_size, shuffle=False) - - model = Facenet(backbone=backbone, mode="predict") - - print('Loading weights into state dict...') - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - model.load_state_dict(torch.load(model_path, map_location=device), strict=False) - model = model.eval() - - if cuda: - model = torch.nn.DataParallel(model) - cudnn.benchmark = True - model = model.cuda() - - test(test_loader, model, png_save_path, log_interval, batch_size, cuda) +import torch +import torch.backends.cudnn as cudnn + +from nets.facenet import Facenet +from utils.dataloader import LFWDataset +from utils.utils_metrics import test + +if __name__ == "__main__": + #--------------------------------------# + # 是否使用Cuda + # 没有GPU可以设置成False + #--------------------------------------# + cuda = True + #--------------------------------------# + # 主干特征提取网络的选择 + # mobilenet + # inception_resnetv1 + #--------------------------------------# + backbone = "mobilenet" + #--------------------------------------------------------# + # 
输入图像大小,常用设置如[112, 112, 3] + #--------------------------------------------------------# + input_shape = [160, 160, 3] + #--------------------------------------# + # 训练好的权值文件 + #--------------------------------------# + model_path = "model_data/facenet_mobilenet.pth" + #--------------------------------------# + # LFW评估数据集的文件路径 + # 以及对应的txt文件 + #--------------------------------------# + lfw_dir_path = "lfw" + lfw_pairs_path = "model_data/lfw_pair.txt" + #--------------------------------------# + # 评估的批次大小和记录间隔 + #--------------------------------------# + batch_size = 256 + log_interval = 1 + #--------------------------------------# + # ROC图的保存路径 + #--------------------------------------# + png_save_path = "model_data/roc_test.png" + + test_loader = torch.utils.data.DataLoader( + LFWDataset(dir=lfw_dir_path, pairs_path=lfw_pairs_path, image_size=input_shape), batch_size=batch_size, shuffle=False) + + model = Facenet(backbone=backbone, mode="predict") + + print('Loading weights into state dict...') + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.load_state_dict(torch.load(model_path, map_location=device), strict=False) + model = model.eval() + + if cuda: + model = torch.nn.DataParallel(model) + cudnn.benchmark = True + model = model.cuda() + + test(test_loader, model, png_save_path, log_interval, batch_size, cuda) diff --git a/cv/face_recognition/facenet/pytorch/facenet.py b/cv/face_recognition/facenet/pytorch/facenet.py index 0c33f266e..b00d6f973 100644 --- a/cv/face_recognition/facenet/pytorch/facenet.py +++ b/cv/face_recognition/facenet/pytorch/facenet.py @@ -1,111 +1,111 @@ -import matplotlib.pyplot as plt -import numpy as np -import torch -import torch.backends.cudnn as cudnn - -from nets.facenet import Facenet as facenet -from utils.utils import preprocess_input, resize_image, show_config - - -#--------------------------------------------# -# 使用自己训练好的模型预测需要修改2个参数 -# model_path和backbone需要修改! 
-#--------------------------------------------# -class Facenet(object): - _defaults = { - #--------------------------------------------------------------------------# - # 使用自己训练好的模型进行预测要修改model_path,指向logs文件夹下的权值文件 - # 训练好后logs文件夹下存在多个权值文件,选择验证集损失较低的即可。 - # 验证集损失较低不代表准确度较高,仅代表该权值在验证集上泛化性能较好。 - #--------------------------------------------------------------------------# - "model_path" : "model_data/facenet_mobilenet.pth", - #--------------------------------------------------------------------------# - # 输入图片的大小。 - #--------------------------------------------------------------------------# - "input_shape" : [160, 160, 3], - #--------------------------------------------------------------------------# - # 所使用到的主干特征提取网络 - #--------------------------------------------------------------------------# - "backbone" : "mobilenet", - #-------------------------------------------# - # 是否进行不失真的resize - #-------------------------------------------# - "letterbox_image" : True, - #-------------------------------------------# - # 是否使用Cuda - # 没有GPU可以设置成False - #-------------------------------------------# - "cuda" : True, - } - - @classmethod - def get_defaults(cls, n): - if n in cls._defaults: - return cls._defaults[n] - else: - return "Unrecognized attribute name '" + n + "'" - - #---------------------------------------------------# - # 初始化Facenet - #---------------------------------------------------# - def __init__(self, **kwargs): - self.__dict__.update(self._defaults) - for name, value in kwargs.items(): - setattr(self, name, value) - - self.generate() - - show_config(**self._defaults) - - def generate(self): - #---------------------------------------------------# - # 载入模型与权值 - #---------------------------------------------------# - print('Loading weights into state dict...') - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.net = facenet(backbone=self.backbone, mode="predict").eval() - self.net.load_state_dict(torch.load(self.model_path, map_location=device), strict=False) - print('{} model loaded.'.format(self.model_path)) - - if self.cuda: - self.net = torch.nn.DataParallel(self.net) - cudnn.benchmark = True - self.net = self.net.cuda() - - #---------------------------------------------------# - # 检测图片 - #---------------------------------------------------# - def detect_image(self, image_1, image_2): - #---------------------------------------------------# - # 图片预处理,归一化 - #---------------------------------------------------# - with torch.no_grad(): - image_1 = resize_image(image_1, [self.input_shape[1], self.input_shape[0]], letterbox_image=self.letterbox_image) - image_2 = resize_image(image_2, [self.input_shape[1], self.input_shape[0]], letterbox_image=self.letterbox_image) - - photo_1 = torch.from_numpy(np.expand_dims(np.transpose(preprocess_input(np.array(image_1, np.float32)), (2, 0, 1)), 0)) - photo_2 = torch.from_numpy(np.expand_dims(np.transpose(preprocess_input(np.array(image_2, np.float32)), (2, 0, 1)), 0)) - - if self.cuda: - photo_1 = photo_1.cuda() - photo_2 = photo_2.cuda() - - #---------------------------------------------------# - # 图片传入网络进行预测 - #---------------------------------------------------# - output1 = self.net(photo_1).cpu().numpy() - output2 = self.net(photo_2).cpu().numpy() - - #---------------------------------------------------# - # 计算二者之间的距离 - #---------------------------------------------------# - l1 = np.linalg.norm(output1 - output2, axis=1) - - plt.subplot(1, 2, 1) - plt.imshow(np.array(image_1)) - - plt.subplot(1, 2, 2) - 
plt.imshow(np.array(image_2)) - plt.text(-12, -12, 'Distance:%.3f' % l1, ha='center', va= 'bottom',fontsize=11) - plt.show() - return l1 +import matplotlib.pyplot as plt +import numpy as np +import torch +import torch.backends.cudnn as cudnn + +from nets.facenet import Facenet as facenet +from utils.utils import preprocess_input, resize_image, show_config + + +#--------------------------------------------# +# 使用自己训练好的模型预测需要修改2个参数 +# model_path和backbone需要修改! +#--------------------------------------------# +class Facenet(object): + _defaults = { + #--------------------------------------------------------------------------# + # 使用自己训练好的模型进行预测要修改model_path,指向logs文件夹下的权值文件 + # 训练好后logs文件夹下存在多个权值文件,选择验证集损失较低的即可。 + # 验证集损失较低不代表准确度较高,仅代表该权值在验证集上泛化性能较好。 + #--------------------------------------------------------------------------# + "model_path" : "model_data/facenet_mobilenet.pth", + #--------------------------------------------------------------------------# + # 输入图片的大小。 + #--------------------------------------------------------------------------# + "input_shape" : [160, 160, 3], + #--------------------------------------------------------------------------# + # 所使用到的主干特征提取网络 + #--------------------------------------------------------------------------# + "backbone" : "mobilenet", + #-------------------------------------------# + # 是否进行不失真的resize + #-------------------------------------------# + "letterbox_image" : True, + #-------------------------------------------# + # 是否使用Cuda + # 没有GPU可以设置成False + #-------------------------------------------# + "cuda" : True, + } + + @classmethod + def get_defaults(cls, n): + if n in cls._defaults: + return cls._defaults[n] + else: + return "Unrecognized attribute name '" + n + "'" + + #---------------------------------------------------# + # 初始化Facenet + #---------------------------------------------------# + def __init__(self, **kwargs): + self.__dict__.update(self._defaults) + for name, value in kwargs.items(): + setattr(self, name, value) + + self.generate() + + show_config(**self._defaults) + + def generate(self): + #---------------------------------------------------# + # 载入模型与权值 + #---------------------------------------------------# + print('Loading weights into state dict...') + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.net = facenet(backbone=self.backbone, mode="predict").eval() + self.net.load_state_dict(torch.load(self.model_path, map_location=device), strict=False) + print('{} model loaded.'.format(self.model_path)) + + if self.cuda: + self.net = torch.nn.DataParallel(self.net) + cudnn.benchmark = True + self.net = self.net.cuda() + + #---------------------------------------------------# + # 检测图片 + #---------------------------------------------------# + def detect_image(self, image_1, image_2): + #---------------------------------------------------# + # 图片预处理,归一化 + #---------------------------------------------------# + with torch.no_grad(): + image_1 = resize_image(image_1, [self.input_shape[1], self.input_shape[0]], letterbox_image=self.letterbox_image) + image_2 = resize_image(image_2, [self.input_shape[1], self.input_shape[0]], letterbox_image=self.letterbox_image) + + photo_1 = torch.from_numpy(np.expand_dims(np.transpose(preprocess_input(np.array(image_1, np.float32)), (2, 0, 1)), 0)) + photo_2 = torch.from_numpy(np.expand_dims(np.transpose(preprocess_input(np.array(image_2, np.float32)), (2, 0, 1)), 0)) + + if self.cuda: + photo_1 = photo_1.cuda() + photo_2 = photo_2.cuda() + + 
#---------------------------------------------------# + # 图片传入网络进行预测 + #---------------------------------------------------# + output1 = self.net(photo_1).cpu().numpy() + output2 = self.net(photo_2).cpu().numpy() + + #---------------------------------------------------# + # 计算二者之间的距离 + #---------------------------------------------------# + l1 = np.linalg.norm(output1 - output2, axis=1) + + plt.subplot(1, 2, 1) + plt.imshow(np.array(image_1)) + + plt.subplot(1, 2, 2) + plt.imshow(np.array(image_2)) + plt.text(-12, -12, 'Distance:%.3f' % l1, ha='center', va= 'bottom',fontsize=11) + plt.show() + return l1 diff --git a/cv/face_recognition/facenet/pytorch/nets/facenet.py b/cv/face_recognition/facenet/pytorch/nets/facenet.py index e7a6fcd92..98dceb797 100644 --- a/cv/face_recognition/facenet/pytorch/nets/facenet.py +++ b/cv/face_recognition/facenet/pytorch/nets/facenet.py @@ -1,103 +1,103 @@ -import torch.nn as nn -from torch.hub import load_state_dict_from_url -from torch.nn import functional as F - -from nets.inception_resnetv1 import InceptionResnetV1 -from nets.mobilenet import MobileNetV1 - - -class mobilenet(nn.Module): - def __init__(self, pretrained): - super(mobilenet, self).__init__() - self.model = MobileNetV1() - if pretrained: - state_dict = load_state_dict_from_url("https://github.com/bubbliiiing/facenet-pytorch/releases/download/v1.0/backbone_weights_of_mobilenetv1.pth", model_dir="model_data", - progress=True) - self.model.load_state_dict(state_dict) - - del self.model.fc - del self.model.avg - - def forward(self, x): - x = self.model.stage1(x) - x = self.model.stage2(x) - x = self.model.stage3(x) - return x - -class inception_resnet(nn.Module): - def __init__(self, pretrained): - super(inception_resnet, self).__init__() - self.model = InceptionResnetV1() - if pretrained: - state_dict = load_state_dict_from_url("https://github.com/bubbliiiing/facenet-pytorch/releases/download/v1.0/backbone_weights_of_inception_resnetv1.pth", model_dir="model_data", - progress=True) - self.model.load_state_dict(state_dict) - - def forward(self, x): - x = self.model.conv2d_1a(x) - x = self.model.conv2d_2a(x) - x = self.model.conv2d_2b(x) - x = self.model.maxpool_3a(x) - x = self.model.conv2d_3b(x) - x = self.model.conv2d_4a(x) - x = self.model.conv2d_4b(x) - x = self.model.repeat_1(x) - x = self.model.mixed_6a(x) - x = self.model.repeat_2(x) - x = self.model.mixed_7a(x) - x = self.model.repeat_3(x) - x = self.model.block8(x) - return x - -class Facenet(nn.Module): - def __init__(self, backbone="mobilenet", dropout_keep_prob=0.5, embedding_size=128, num_classes=None, mode="train", pretrained=False): - super(Facenet, self).__init__() - if backbone == "mobilenet": - self.backbone = mobilenet(pretrained) - flat_shape = 1024 - elif backbone == "inception_resnetv1": - self.backbone = inception_resnet(pretrained) - flat_shape = 1792 - else: - raise ValueError('Unsupported backbone - `{}`, Use mobilenet, inception_resnetv1.'.format(backbone)) - self.avg = nn.AdaptiveAvgPool2d((1,1)) - self.Dropout = nn.Dropout(1 - dropout_keep_prob) - self.Bottleneck = nn.Linear(flat_shape, embedding_size,bias=False) - self.last_bn = nn.BatchNorm1d(embedding_size, eps=0.001, momentum=0.1, affine=True) - if mode == "train": - self.classifier = nn.Linear(embedding_size, num_classes) - - def forward(self, x, mode = "predict"): - if mode == 'predict': - x = self.backbone(x) - x = self.avg(x) - x = x.view(x.size(0), -1) - x = self.Dropout(x) - x = self.Bottleneck(x) - x = self.last_bn(x) - x = F.normalize(x, p=2, dim=1) - 
return x - x = self.backbone(x) - x = self.avg(x) - x = x.view(x.size(0), -1) - x = self.Dropout(x) - x = self.Bottleneck(x) - before_normalize = self.last_bn(x) - - x = F.normalize(before_normalize, p=2, dim=1) - cls = self.classifier(before_normalize) - return x, cls - - def forward_feature(self, x): - x = self.backbone(x) - x = self.avg(x) - x = x.view(x.size(0), -1) - x = self.Dropout(x) - x = self.Bottleneck(x) - before_normalize = self.last_bn(x) - x = F.normalize(before_normalize, p=2, dim=1) - return before_normalize, x - - def forward_classifier(self, x): - x = self.classifier(x) - return x +import torch.nn as nn +from torch.hub import load_state_dict_from_url +from torch.nn import functional as F + +from nets.inception_resnetv1 import InceptionResnetV1 +from nets.mobilenet import MobileNetV1 + + +class mobilenet(nn.Module): + def __init__(self, pretrained): + super(mobilenet, self).__init__() + self.model = MobileNetV1() + if pretrained: + state_dict = load_state_dict_from_url("https://github.com/bubbliiiing/facenet-pytorch/releases/download/v1.0/backbone_weights_of_mobilenetv1.pth", model_dir="model_data", + progress=True) + self.model.load_state_dict(state_dict) + + del self.model.fc + del self.model.avg + + def forward(self, x): + x = self.model.stage1(x) + x = self.model.stage2(x) + x = self.model.stage3(x) + return x + +class inception_resnet(nn.Module): + def __init__(self, pretrained): + super(inception_resnet, self).__init__() + self.model = InceptionResnetV1() + if pretrained: + state_dict = load_state_dict_from_url("https://github.com/bubbliiiing/facenet-pytorch/releases/download/v1.0/backbone_weights_of_inception_resnetv1.pth", model_dir="model_data", + progress=True) + self.model.load_state_dict(state_dict) + + def forward(self, x): + x = self.model.conv2d_1a(x) + x = self.model.conv2d_2a(x) + x = self.model.conv2d_2b(x) + x = self.model.maxpool_3a(x) + x = self.model.conv2d_3b(x) + x = self.model.conv2d_4a(x) + x = self.model.conv2d_4b(x) + x = self.model.repeat_1(x) + x = self.model.mixed_6a(x) + x = self.model.repeat_2(x) + x = self.model.mixed_7a(x) + x = self.model.repeat_3(x) + x = self.model.block8(x) + return x + +class Facenet(nn.Module): + def __init__(self, backbone="mobilenet", dropout_keep_prob=0.5, embedding_size=128, num_classes=None, mode="train", pretrained=False): + super(Facenet, self).__init__() + if backbone == "mobilenet": + self.backbone = mobilenet(pretrained) + flat_shape = 1024 + elif backbone == "inception_resnetv1": + self.backbone = inception_resnet(pretrained) + flat_shape = 1792 + else: + raise ValueError('Unsupported backbone - `{}`, Use mobilenet, inception_resnetv1.'.format(backbone)) + self.avg = nn.AdaptiveAvgPool2d((1,1)) + self.Dropout = nn.Dropout(1 - dropout_keep_prob) + self.Bottleneck = nn.Linear(flat_shape, embedding_size,bias=False) + self.last_bn = nn.BatchNorm1d(embedding_size, eps=0.001, momentum=0.1, affine=True) + if mode == "train": + self.classifier = nn.Linear(embedding_size, num_classes) + + def forward(self, x, mode = "predict"): + if mode == 'predict': + x = self.backbone(x) + x = self.avg(x) + x = x.view(x.size(0), -1) + x = self.Dropout(x) + x = self.Bottleneck(x) + x = self.last_bn(x) + x = F.normalize(x, p=2, dim=1) + return x + x = self.backbone(x) + x = self.avg(x) + x = x.view(x.size(0), -1) + x = self.Dropout(x) + x = self.Bottleneck(x) + before_normalize = self.last_bn(x) + + x = F.normalize(before_normalize, p=2, dim=1) + cls = self.classifier(before_normalize) + return x, cls + + def 
forward_feature(self, x): + x = self.backbone(x) + x = self.avg(x) + x = x.view(x.size(0), -1) + x = self.Dropout(x) + x = self.Bottleneck(x) + before_normalize = self.last_bn(x) + x = F.normalize(before_normalize, p=2, dim=1) + return before_normalize, x + + def forward_classifier(self, x): + x = self.classifier(x) + return x diff --git a/cv/face_recognition/facenet/pytorch/nets/facenet_training.py b/cv/face_recognition/facenet/pytorch/nets/facenet_training.py index 5d6a3e3d0..d683c9da0 100644 --- a/cv/face_recognition/facenet/pytorch/nets/facenet_training.py +++ b/cv/face_recognition/facenet/pytorch/nets/facenet_training.py @@ -1,89 +1,89 @@ - - -import math -from functools import partial - -import numpy as np -import torch - - -def triplet_loss(alpha = 0.2): - def _triplet_loss(y_pred,Batch_size): - anchor, positive, negative = y_pred[:int(Batch_size)], y_pred[int(Batch_size):int(2*Batch_size)], y_pred[int(2*Batch_size):] - - pos_dist = torch.sqrt(torch.sum(torch.pow(anchor - positive,2), axis=-1)) - neg_dist = torch.sqrt(torch.sum(torch.pow(anchor - negative,2), axis=-1)) - - keep_all = (neg_dist - pos_dist < alpha).cpu().numpy().flatten() - hard_triplets = np.where(keep_all == 1) - - pos_dist = pos_dist[hard_triplets] - neg_dist = neg_dist[hard_triplets] - - basic_loss = pos_dist - neg_dist + alpha - loss = torch.sum(basic_loss) / torch.max(torch.tensor(1), torch.tensor(len(hard_triplets[0]))) - return loss - return _triplet_loss - -def weights_init(net, init_type='normal', init_gain=0.02): - def init_func(m): - classname = m.__class__.__name__ - if hasattr(m, 'weight') and classname.find('Conv') != -1: - if init_type == 'normal': - torch.nn.init.normal_(m.weight.data, 0.0, init_gain) - elif init_type == 'xavier': - torch.nn.init.xavier_normal_(m.weight.data, gain=init_gain) - elif init_type == 'kaiming': - torch.nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in') - elif init_type == 'orthogonal': - torch.nn.init.orthogonal_(m.weight.data, gain=init_gain) - else: - raise NotImplementedError('initialization method [%s] is not implemented' % init_type) - elif classname.find('BatchNorm2d') != -1: - torch.nn.init.normal_(m.weight.data, 1.0, 0.02) - torch.nn.init.constant_(m.bias.data, 0.0) - print('initialize network with %s type' % init_type) - net.apply(init_func) - -def get_lr_scheduler(lr_decay_type, lr, min_lr, total_iters, warmup_iters_ratio = 0.1, warmup_lr_ratio = 0.1, no_aug_iter_ratio = 0.3, step_num = 10): - def yolox_warm_cos_lr(lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter, iters): - if iters <= warmup_total_iters: - # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start - lr = (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2 - ) + warmup_lr_start - elif iters >= total_iters - no_aug_iter: - lr = min_lr - else: - lr = min_lr + 0.5 * (lr - min_lr) * ( - 1.0 - + math.cos( - math.pi - * (iters - warmup_total_iters) - / (total_iters - warmup_total_iters - no_aug_iter) - ) - ) - return lr - - def step_lr(lr, decay_rate, step_size, iters): - if step_size < 1: - raise ValueError("step_size must above 1.") - n = iters // step_size - out_lr = lr * decay_rate ** n - return out_lr - - if lr_decay_type == "cos": - warmup_total_iters = min(max(warmup_iters_ratio * total_iters, 1), 3) - warmup_lr_start = max(warmup_lr_ratio * lr, 1e-6) - no_aug_iter = min(max(no_aug_iter_ratio * total_iters, 1), 15) - func = partial(yolox_warm_cos_lr ,lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter) - 
else: - decay_rate = (min_lr / lr) ** (1 / (step_num - 1)) - step_size = total_iters / step_num - func = partial(step_lr, lr, decay_rate, step_size) - - return func - -def set_optimizer_lr(optimizer, lr_scheduler_func, epoch): - lr = lr_scheduler_func(epoch) - for param_group in optimizer.param_groups: - param_group['lr'] = lr + + +import math +from functools import partial + +import numpy as np +import torch + + +def triplet_loss(alpha = 0.2): + def _triplet_loss(y_pred,Batch_size): + anchor, positive, negative = y_pred[:int(Batch_size)], y_pred[int(Batch_size):int(2*Batch_size)], y_pred[int(2*Batch_size):] + + pos_dist = torch.sqrt(torch.sum(torch.pow(anchor - positive,2), axis=-1)) + neg_dist = torch.sqrt(torch.sum(torch.pow(anchor - negative,2), axis=-1)) + + keep_all = (neg_dist - pos_dist < alpha).cpu().numpy().flatten() + hard_triplets = np.where(keep_all == 1) + + pos_dist = pos_dist[hard_triplets] + neg_dist = neg_dist[hard_triplets] + + basic_loss = pos_dist - neg_dist + alpha + loss = torch.sum(basic_loss) / torch.max(torch.tensor(1), torch.tensor(len(hard_triplets[0]))) + return loss + return _triplet_loss + +def weights_init(net, init_type='normal', init_gain=0.02): + def init_func(m): + classname = m.__class__.__name__ + if hasattr(m, 'weight') and classname.find('Conv') != -1: + if init_type == 'normal': + torch.nn.init.normal_(m.weight.data, 0.0, init_gain) + elif init_type == 'xavier': + torch.nn.init.xavier_normal_(m.weight.data, gain=init_gain) + elif init_type == 'kaiming': + torch.nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in') + elif init_type == 'orthogonal': + torch.nn.init.orthogonal_(m.weight.data, gain=init_gain) + else: + raise NotImplementedError('initialization method [%s] is not implemented' % init_type) + elif classname.find('BatchNorm2d') != -1: + torch.nn.init.normal_(m.weight.data, 1.0, 0.02) + torch.nn.init.constant_(m.bias.data, 0.0) + print('initialize network with %s type' % init_type) + net.apply(init_func) + +def get_lr_scheduler(lr_decay_type, lr, min_lr, total_iters, warmup_iters_ratio = 0.1, warmup_lr_ratio = 0.1, no_aug_iter_ratio = 0.3, step_num = 10): + def yolox_warm_cos_lr(lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter, iters): + if iters <= warmup_total_iters: + # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start + lr = (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2 + ) + warmup_lr_start + elif iters >= total_iters - no_aug_iter: + lr = min_lr + else: + lr = min_lr + 0.5 * (lr - min_lr) * ( + 1.0 + + math.cos( + math.pi + * (iters - warmup_total_iters) + / (total_iters - warmup_total_iters - no_aug_iter) + ) + ) + return lr + + def step_lr(lr, decay_rate, step_size, iters): + if step_size < 1: + raise ValueError("step_size must above 1.") + n = iters // step_size + out_lr = lr * decay_rate ** n + return out_lr + + if lr_decay_type == "cos": + warmup_total_iters = min(max(warmup_iters_ratio * total_iters, 1), 3) + warmup_lr_start = max(warmup_lr_ratio * lr, 1e-6) + no_aug_iter = min(max(no_aug_iter_ratio * total_iters, 1), 15) + func = partial(yolox_warm_cos_lr ,lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter) + else: + decay_rate = (min_lr / lr) ** (1 / (step_num - 1)) + step_size = total_iters / step_num + func = partial(step_lr, lr, decay_rate, step_size) + + return func + +def set_optimizer_lr(optimizer, lr_scheduler_func, epoch): + lr = lr_scheduler_func(epoch) + for param_group in optimizer.param_groups: + 
param_group['lr'] = lr diff --git a/cv/face_recognition/facenet/pytorch/nets/inception_resnetv1.py b/cv/face_recognition/facenet/pytorch/nets/inception_resnetv1.py index 3f029af4a..d7df3d2c8 100644 --- a/cv/face_recognition/facenet/pytorch/nets/inception_resnetv1.py +++ b/cv/face_recognition/facenet/pytorch/nets/inception_resnetv1.py @@ -1,232 +1,232 @@ -import torch -from torch import nn - -class BasicConv2d(nn.Module): - def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0): - super(BasicConv2d,self).__init__() - self.conv = nn.Conv2d( - in_planes, out_planes, - kernel_size=kernel_size, stride=stride, - padding=padding, bias=False - ) # verify bias false - self.bn = nn.BatchNorm2d( - out_planes, - eps=0.001, # value found in tensorflow - momentum=0.1, # default pytorch value - affine=True - ) - self.relu = nn.ReLU(inplace=False) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - return x - -class Block35(nn.Module): - def __init__(self, scale=1.0): - super(Block35,self).__init__() - - self.scale = scale - - self.branch0 = BasicConv2d(256, 32, kernel_size=1, stride=1) - - self.branch1 = nn.Sequential( - BasicConv2d(256, 32, kernel_size=1, stride=1), - BasicConv2d(32, 32, kernel_size=3, stride=1, padding=1) - ) - - self.branch2 = nn.Sequential( - BasicConv2d(256, 32, kernel_size=1, stride=1), - BasicConv2d(32, 32, kernel_size=3, stride=1, padding=1), - BasicConv2d(32, 32, kernel_size=3, stride=1, padding=1) - ) - - self.conv2d = nn.Conv2d(96, 256, kernel_size=1, stride=1) - self.relu = nn.ReLU(inplace=False) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - x2 = self.branch2(x) - out = torch.cat((x0, x1, x2), 1) - out = self.conv2d(out) - out = out * self.scale + x - out = self.relu(out) - return out - - -class Block17(nn.Module): - def __init__(self, scale=1.0): - super(Block17,self).__init__() - - self.scale = scale - - self.branch0 = BasicConv2d(896, 128, kernel_size=1, stride=1) - - self.branch1 = nn.Sequential( - BasicConv2d(896, 128, kernel_size=1, stride=1), - BasicConv2d(128, 128, kernel_size=(1,7), stride=1, padding=(0,3)), - BasicConv2d(128, 128, kernel_size=(7,1), stride=1, padding=(3,0)) - ) - - self.conv2d = nn.Conv2d(256, 896, kernel_size=1, stride=1) - self.relu = nn.ReLU(inplace=False) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - out = torch.cat((x0, x1), 1) - out = self.conv2d(out) - out = out * self.scale + x - out = self.relu(out) - return out - - -class Block8(nn.Module): - def __init__(self, scale=1.0, noReLU=False): - super(Block8,self).__init__() - - self.scale = scale - self.noReLU = noReLU - - self.branch0 = BasicConv2d(1792, 192, kernel_size=1, stride=1) - - self.branch1 = nn.Sequential( - BasicConv2d(1792, 192, kernel_size=1, stride=1), - BasicConv2d(192, 192, kernel_size=(1,3), stride=1, padding=(0,1)), - BasicConv2d(192, 192, kernel_size=(3,1), stride=1, padding=(1,0)) - ) - - self.conv2d = nn.Conv2d(384, 1792, kernel_size=1, stride=1) - if not self.noReLU: - self.relu = nn.ReLU(inplace=False) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - out = torch.cat((x0, x1), 1) - out = self.conv2d(out) - out = out * self.scale + x - if not self.noReLU: - out = self.relu(out) - return out - -class Mixed_6a(nn.Module): - def __init__(self): - super(Mixed_6a,self).__init__() - - self.branch0 = BasicConv2d(256, 384, kernel_size=3, stride=2) - - self.branch1 = nn.Sequential( - BasicConv2d(256, 192, kernel_size=1, stride=1), - BasicConv2d(192, 
192, kernel_size=3, stride=1, padding=1), - BasicConv2d(192, 256, kernel_size=3, stride=2) - ) - - self.branch2 = nn.MaxPool2d(3, stride=2) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - x2 = self.branch2(x) - out = torch.cat((x0, x1, x2), 1) - return out - - -class Mixed_7a(nn.Module): - def __init__(self): - super(Mixed_7a,self).__init__() - - self.branch0 = nn.Sequential( - BasicConv2d(896, 256, kernel_size=1, stride=1), - BasicConv2d(256, 384, kernel_size=3, stride=2) - ) - - self.branch1 = nn.Sequential( - BasicConv2d(896, 256, kernel_size=1, stride=1), - BasicConv2d(256, 256, kernel_size=3, stride=2) - ) - - self.branch2 = nn.Sequential( - BasicConv2d(896, 256, kernel_size=1, stride=1), - BasicConv2d(256, 256, kernel_size=3, stride=1, padding=1), - BasicConv2d(256, 256, kernel_size=3, stride=2) - ) - - self.branch3 = nn.MaxPool2d(3, stride=2) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - x2 = self.branch2(x) - x3 = self.branch3(x) - out = torch.cat((x0, x1, x2, x3), 1) - return out - - -class InceptionResnetV1(nn.Module): - def __init__(self): - super(InceptionResnetV1,self).__init__() - self.conv2d_1a = BasicConv2d(3, 32, kernel_size=3, stride=2) - self.conv2d_2a = BasicConv2d(32, 32, kernel_size=3, stride=1) - self.conv2d_2b = BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1) - self.maxpool_3a = nn.MaxPool2d(3, stride=2) - self.conv2d_3b = BasicConv2d(64, 80, kernel_size=1, stride=1) - self.conv2d_4a = BasicConv2d(80, 192, kernel_size=3, stride=1) - self.conv2d_4b = BasicConv2d(192, 256, kernel_size=3, stride=2) - self.repeat_1 = nn.Sequential( - Block35(scale=0.17), - Block35(scale=0.17), - Block35(scale=0.17), - Block35(scale=0.17), - Block35(scale=0.17), - ) - self.mixed_6a = Mixed_6a() - self.repeat_2 = nn.Sequential( - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - ) - self.mixed_7a = Mixed_7a() - self.repeat_3 = nn.Sequential( - Block8(scale=0.20), - Block8(scale=0.20), - Block8(scale=0.20), - Block8(scale=0.20), - Block8(scale=0.20), - ) - self.block8 = Block8(noReLU=True) - self.avgpool_1a = nn.AdaptiveAvgPool2d(1) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.normal_(m.weight, 0, 0.1) - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - def forward(self, x): - x = self.conv2d_1a(x) - x = self.conv2d_2a(x) - x = self.conv2d_2b(x) - x = self.maxpool_3a(x) - x = self.conv2d_3b(x) - x = self.conv2d_4a(x) - x = self.conv2d_4b(x) - x = self.repeat_1(x) - x = self.mixed_6a(x) - x = self.repeat_2(x) - x = self.mixed_7a(x) - x = self.repeat_3(x) - x = self.block8(x) - x = self.avgpool_1a(x) - return x - +import torch +from torch import nn + +class BasicConv2d(nn.Module): + def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0): + super(BasicConv2d,self).__init__() + self.conv = nn.Conv2d( + in_planes, out_planes, + kernel_size=kernel_size, stride=stride, + padding=padding, bias=False + ) # verify bias false + self.bn = nn.BatchNorm2d( + out_planes, + eps=0.001, # value found in tensorflow + momentum=0.1, # default pytorch value + affine=True + ) + self.relu = nn.ReLU(inplace=False) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + +class Block35(nn.Module): + def __init__(self, scale=1.0): 
+ super(Block35,self).__init__() + + self.scale = scale + + self.branch0 = BasicConv2d(256, 32, kernel_size=1, stride=1) + + self.branch1 = nn.Sequential( + BasicConv2d(256, 32, kernel_size=1, stride=1), + BasicConv2d(32, 32, kernel_size=3, stride=1, padding=1) + ) + + self.branch2 = nn.Sequential( + BasicConv2d(256, 32, kernel_size=1, stride=1), + BasicConv2d(32, 32, kernel_size=3, stride=1, padding=1), + BasicConv2d(32, 32, kernel_size=3, stride=1, padding=1) + ) + + self.conv2d = nn.Conv2d(96, 256, kernel_size=1, stride=1) + self.relu = nn.ReLU(inplace=False) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + x2 = self.branch2(x) + out = torch.cat((x0, x1, x2), 1) + out = self.conv2d(out) + out = out * self.scale + x + out = self.relu(out) + return out + + +class Block17(nn.Module): + def __init__(self, scale=1.0): + super(Block17,self).__init__() + + self.scale = scale + + self.branch0 = BasicConv2d(896, 128, kernel_size=1, stride=1) + + self.branch1 = nn.Sequential( + BasicConv2d(896, 128, kernel_size=1, stride=1), + BasicConv2d(128, 128, kernel_size=(1,7), stride=1, padding=(0,3)), + BasicConv2d(128, 128, kernel_size=(7,1), stride=1, padding=(3,0)) + ) + + self.conv2d = nn.Conv2d(256, 896, kernel_size=1, stride=1) + self.relu = nn.ReLU(inplace=False) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + out = torch.cat((x0, x1), 1) + out = self.conv2d(out) + out = out * self.scale + x + out = self.relu(out) + return out + + +class Block8(nn.Module): + def __init__(self, scale=1.0, noReLU=False): + super(Block8,self).__init__() + + self.scale = scale + self.noReLU = noReLU + + self.branch0 = BasicConv2d(1792, 192, kernel_size=1, stride=1) + + self.branch1 = nn.Sequential( + BasicConv2d(1792, 192, kernel_size=1, stride=1), + BasicConv2d(192, 192, kernel_size=(1,3), stride=1, padding=(0,1)), + BasicConv2d(192, 192, kernel_size=(3,1), stride=1, padding=(1,0)) + ) + + self.conv2d = nn.Conv2d(384, 1792, kernel_size=1, stride=1) + if not self.noReLU: + self.relu = nn.ReLU(inplace=False) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + out = torch.cat((x0, x1), 1) + out = self.conv2d(out) + out = out * self.scale + x + if not self.noReLU: + out = self.relu(out) + return out + +class Mixed_6a(nn.Module): + def __init__(self): + super(Mixed_6a,self).__init__() + + self.branch0 = BasicConv2d(256, 384, kernel_size=3, stride=2) + + self.branch1 = nn.Sequential( + BasicConv2d(256, 192, kernel_size=1, stride=1), + BasicConv2d(192, 192, kernel_size=3, stride=1, padding=1), + BasicConv2d(192, 256, kernel_size=3, stride=2) + ) + + self.branch2 = nn.MaxPool2d(3, stride=2) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + x2 = self.branch2(x) + out = torch.cat((x0, x1, x2), 1) + return out + + +class Mixed_7a(nn.Module): + def __init__(self): + super(Mixed_7a,self).__init__() + + self.branch0 = nn.Sequential( + BasicConv2d(896, 256, kernel_size=1, stride=1), + BasicConv2d(256, 384, kernel_size=3, stride=2) + ) + + self.branch1 = nn.Sequential( + BasicConv2d(896, 256, kernel_size=1, stride=1), + BasicConv2d(256, 256, kernel_size=3, stride=2) + ) + + self.branch2 = nn.Sequential( + BasicConv2d(896, 256, kernel_size=1, stride=1), + BasicConv2d(256, 256, kernel_size=3, stride=1, padding=1), + BasicConv2d(256, 256, kernel_size=3, stride=2) + ) + + self.branch3 = nn.MaxPool2d(3, stride=2) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + x2 = self.branch2(x) + x3 = self.branch3(x) + out = 
torch.cat((x0, x1, x2, x3), 1) + return out + + +class InceptionResnetV1(nn.Module): + def __init__(self): + super(InceptionResnetV1,self).__init__() + self.conv2d_1a = BasicConv2d(3, 32, kernel_size=3, stride=2) + self.conv2d_2a = BasicConv2d(32, 32, kernel_size=3, stride=1) + self.conv2d_2b = BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1) + self.maxpool_3a = nn.MaxPool2d(3, stride=2) + self.conv2d_3b = BasicConv2d(64, 80, kernel_size=1, stride=1) + self.conv2d_4a = BasicConv2d(80, 192, kernel_size=3, stride=1) + self.conv2d_4b = BasicConv2d(192, 256, kernel_size=3, stride=2) + self.repeat_1 = nn.Sequential( + Block35(scale=0.17), + Block35(scale=0.17), + Block35(scale=0.17), + Block35(scale=0.17), + Block35(scale=0.17), + ) + self.mixed_6a = Mixed_6a() + self.repeat_2 = nn.Sequential( + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + ) + self.mixed_7a = Mixed_7a() + self.repeat_3 = nn.Sequential( + Block8(scale=0.20), + Block8(scale=0.20), + Block8(scale=0.20), + Block8(scale=0.20), + Block8(scale=0.20), + ) + self.block8 = Block8(noReLU=True) + self.avgpool_1a = nn.AdaptiveAvgPool2d(1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, 0, 0.1) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.conv2d_1a(x) + x = self.conv2d_2a(x) + x = self.conv2d_2b(x) + x = self.maxpool_3a(x) + x = self.conv2d_3b(x) + x = self.conv2d_4a(x) + x = self.conv2d_4b(x) + x = self.repeat_1(x) + x = self.mixed_6a(x) + x = self.repeat_2(x) + x = self.mixed_7a(x) + x = self.repeat_3(x) + x = self.block8(x) + x = self.avgpool_1a(x) + return x + diff --git a/cv/face_recognition/facenet/pytorch/nets/mobilenet.py b/cv/face_recognition/facenet/pytorch/nets/mobilenet.py index 4b7c455c0..da3141c91 100644 --- a/cv/face_recognition/facenet/pytorch/nets/mobilenet.py +++ b/cv/face_recognition/facenet/pytorch/nets/mobilenet.py @@ -1,72 +1,72 @@ -import torch.nn as nn - - -def conv_bn(inp, oup, stride = 1): - return nn.Sequential( - nn.Conv2d(inp, oup, 3, stride, 1, bias=False), - nn.BatchNorm2d(oup), - nn.ReLU6() - ) - -def conv_dw(inp, oup, stride = 1): - return nn.Sequential( - nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), - nn.BatchNorm2d(inp), - nn.ReLU6(), - - nn.Conv2d(inp, oup, 1, 1, 0, bias=False), - nn.BatchNorm2d(oup), - nn.ReLU6(), - ) - -class MobileNetV1(nn.Module): - def __init__(self): - super(MobileNetV1, self).__init__() - self.stage1 = nn.Sequential( - # 160,160,3 -> 80,80,32 - conv_bn(3, 32, 2), - # 80,80,32 -> 80,80,64 - conv_dw(32, 64, 1), - - # 80,80,64 -> 40,40,128 - conv_dw(64, 128, 2), - conv_dw(128, 128, 1), - - # 40,40,128 -> 20,20,256 - conv_dw(128, 256, 2), - conv_dw(256, 256, 1), - ) - self.stage2 = nn.Sequential( - # 20,20,256 -> 10,10,512 - conv_dw(256, 512, 2), - conv_dw(512, 512, 1), - conv_dw(512, 512, 1), - conv_dw(512, 512, 1), - conv_dw(512, 512, 1), - conv_dw(512, 512, 1), - ) - self.stage3 = nn.Sequential( - # 10,10,512 -> 5,5,1024 - conv_dw(512, 1024, 2), - conv_dw(1024, 1024, 1), - ) - - self.avg = nn.AdaptiveAvgPool2d((1,1)) - self.fc = nn.Linear(1024, 1000) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.normal_(m.weight, 0, 0.1) - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.constant_(m.weight, 1) - 
nn.init.constant_(m.bias, 0) - - def forward(self, x): - x = self.stage1(x) - x = self.stage2(x) - x = self.stage3(x) - x = self.avg(x) - # x = self.model(x) - x = x.view(-1, 1024) - x = self.fc(x) - return x +import torch.nn as nn + + +def conv_bn(inp, oup, stride = 1): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU6() + ) + +def conv_dw(inp, oup, stride = 1): + return nn.Sequential( + nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm2d(inp), + nn.ReLU6(), + + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU6(), + ) + +class MobileNetV1(nn.Module): + def __init__(self): + super(MobileNetV1, self).__init__() + self.stage1 = nn.Sequential( + # 160,160,3 -> 80,80,32 + conv_bn(3, 32, 2), + # 80,80,32 -> 80,80,64 + conv_dw(32, 64, 1), + + # 80,80,64 -> 40,40,128 + conv_dw(64, 128, 2), + conv_dw(128, 128, 1), + + # 40,40,128 -> 20,20,256 + conv_dw(128, 256, 2), + conv_dw(256, 256, 1), + ) + self.stage2 = nn.Sequential( + # 20,20,256 -> 10,10,512 + conv_dw(256, 512, 2), + conv_dw(512, 512, 1), + conv_dw(512, 512, 1), + conv_dw(512, 512, 1), + conv_dw(512, 512, 1), + conv_dw(512, 512, 1), + ) + self.stage3 = nn.Sequential( + # 10,10,512 -> 5,5,1024 + conv_dw(512, 1024, 2), + conv_dw(1024, 1024, 1), + ) + + self.avg = nn.AdaptiveAvgPool2d((1,1)) + self.fc = nn.Linear(1024, 1000) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, 0, 0.1) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.stage1(x) + x = self.stage2(x) + x = self.stage3(x) + x = self.avg(x) + # x = self.model(x) + x = x.view(-1, 1024) + x = self.fc(x) + return x diff --git a/cv/face_recognition/facenet/pytorch/predict.py b/cv/face_recognition/facenet/pytorch/predict.py index 1125a05cd..55dbdfd91 100644 --- a/cv/face_recognition/facenet/pytorch/predict.py +++ b/cv/face_recognition/facenet/pytorch/predict.py @@ -1,24 +1,24 @@ -from PIL import Image - -from facenet import Facenet - -if __name__ == "__main__": - model = Facenet() - - while True: - image_1 = input('Input image_1 filename:') - try: - image_1 = Image.open(image_1) - except: - print('Image_1 Open Error! Try again!') - continue - - image_2 = input('Input image_2 filename:') - try: - image_2 = Image.open(image_2) - except: - print('Image_2 Open Error! Try again!') - continue - - probability = model.detect_image(image_1,image_2) - print(probability) +from PIL import Image + +from facenet import Facenet + +if __name__ == "__main__": + model = Facenet() + + while True: + image_1 = input('Input image_1 filename:') + try: + image_1 = Image.open(image_1) + except: + print('Image_1 Open Error! Try again!') + continue + + image_2 = input('Input image_2 filename:') + try: + image_2 = Image.open(image_2) + except: + print('Image_2 Open Error! 
Try again!') + continue + + probability = model.detect_image(image_1,image_2) + print(probability) diff --git a/cv/face_recognition/facenet/pytorch/requirements.txt b/cv/face_recognition/facenet/pytorch/requirements.txt index 4dd31c8ec..97acecede 100644 --- a/cv/face_recognition/facenet/pytorch/requirements.txt +++ b/cv/face_recognition/facenet/pytorch/requirements.txt @@ -1,9 +1,9 @@ -scipy==1.2.1 -numpy==1.17.0 -matplotlib==3.1.2 -opencv_python==4.1.2.30 - -torchvision==0.4.0 -tqdm==4.60.0 -Pillow==8.2.0 -h5py==2.10.0 +scipy==1.2.1 +numpy==1.17.0 +matplotlib==3.1.2 +opencv_python==4.1.2.30 + +torchvision==0.4.0 +tqdm==4.60.0 +Pillow==8.2.0 +h5py==2.10.0 diff --git a/cv/face_recognition/facenet/pytorch/summary.py b/cv/face_recognition/facenet/pytorch/summary.py index 01b600fdd..c355f5ef2 100644 --- a/cv/face_recognition/facenet/pytorch/summary.py +++ b/cv/face_recognition/facenet/pytorch/summary.py @@ -1,29 +1,29 @@ -#--------------------------------------------# -# 该部分代码只用于看网络结构,并非测试代码 -#--------------------------------------------# -import torch -from thop import clever_format, profile -from torchsummary import summary - -from nets.facenet import Facenet - -if __name__ == "__main__": - input_shape = [160, 160] - backbone = 'mobilenet' - - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - model = Facenet(num_classes = 10575, backbone = backbone).to(device) - summary(model, (3, input_shape[0], input_shape[1])) - - dummy_input = torch.randn(1, 3, input_shape[0], input_shape[1]).to(device) - flops, params = profile(model.to(device), (dummy_input, ), verbose=False) - #--------------------------------------------------------# - # flops * 2是因为profile没有将卷积作为两个operations - # 有些论文将卷积算乘法、加法两个operations。此时乘2 - # 有些论文只考虑乘法的运算次数,忽略加法。此时不乘2 - # 本代码选择乘2,参考YOLOX。 - #--------------------------------------------------------# - flops = flops * 2 - flops, params = clever_format([flops, params], "%.3f") - print('Total GFLOPS: %s' % (flops)) - print('Total params: %s' % (params)) +#--------------------------------------------# +# 该部分代码只用于看网络结构,并非测试代码 +#--------------------------------------------# +import torch +from thop import clever_format, profile +from torchsummary import summary + +from nets.facenet import Facenet + +if __name__ == "__main__": + input_shape = [160, 160] + backbone = 'mobilenet' + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model = Facenet(num_classes = 10575, backbone = backbone).to(device) + summary(model, (3, input_shape[0], input_shape[1])) + + dummy_input = torch.randn(1, 3, input_shape[0], input_shape[1]).to(device) + flops, params = profile(model.to(device), (dummy_input, ), verbose=False) + #--------------------------------------------------------# + # flops * 2是因为profile没有将卷积作为两个operations + # 有些论文将卷积算乘法、加法两个operations。此时乘2 + # 有些论文只考虑乘法的运算次数,忽略加法。此时不乘2 + # 本代码选择乘2,参考YOLOX。 + #--------------------------------------------------------# + flops = flops * 2 + flops, params = clever_format([flops, params], "%.3f") + print('Total GFLOPS: %s' % (flops)) + print('Total params: %s' % (params)) diff --git a/cv/face_recognition/facenet/pytorch/txt_annotation.py b/cv/face_recognition/facenet/pytorch/txt_annotation.py index 68e6ec6bf..2b2743fe4 100644 --- a/cv/face_recognition/facenet/pytorch/txt_annotation.py +++ b/cv/face_recognition/facenet/pytorch/txt_annotation.py @@ -1,25 +1,25 @@ -#------------------------------------------------# -# 进行训练前需要利用这个文件生成cls_train.txt -#------------------------------------------------# -import os - -if 
__name__ == "__main__": - #---------------------# - # 训练集所在的路径 - #---------------------# - datasets_path = "datasets" - - types_name = os.listdir(datasets_path) - types_name = sorted(types_name) - - list_file = open('cls_train.txt', 'w') - for cls_id, type_name in enumerate(types_name): - photos_path = os.path.join(datasets_path, type_name) - if not os.path.isdir(photos_path): - continue - photos_name = os.listdir(photos_path) - - for photo_name in photos_name: - list_file.write(str(cls_id) + ";" + '%s'%(os.path.join(os.path.abspath(datasets_path), type_name, photo_name))) - list_file.write('\n') - list_file.close() +#------------------------------------------------# +# 进行训练前需要利用这个文件生成cls_train.txt +#------------------------------------------------# +import os + +if __name__ == "__main__": + #---------------------# + # 训练集所在的路径 + #---------------------# + datasets_path = "datasets" + + types_name = os.listdir(datasets_path) + types_name = sorted(types_name) + + list_file = open('cls_train.txt', 'w') + for cls_id, type_name in enumerate(types_name): + photos_path = os.path.join(datasets_path, type_name) + if not os.path.isdir(photos_path): + continue + photos_name = os.listdir(photos_path) + + for photo_name in photos_name: + list_file.write(str(cls_id) + ";" + '%s'%(os.path.join(os.path.abspath(datasets_path), type_name, photo_name))) + list_file.write('\n') + list_file.close() diff --git a/cv/face_recognition/facenet/pytorch/utils/utils_fit.py b/cv/face_recognition/facenet/pytorch/utils/utils_fit.py index 3b61c6a54..98032f877 100644 --- a/cv/face_recognition/facenet/pytorch/utils/utils_fit.py +++ b/cv/face_recognition/facenet/pytorch/utils/utils_fit.py @@ -1,140 +1,140 @@ -import os - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from tqdm import tqdm - -from utils.utils import get_lr -from utils.utils_metrics import evaluate - - -def fit_one_epoch(model_train, model, loss_history, loss, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, Epoch, cuda, test_loader, Batch_size, lfw_eval_flag, fp16, scaler, save_period, save_dir, local_rank): - total_triple_loss = 0 - total_CE_loss = 0 - total_accuracy = 0 - - val_total_triple_loss = 0 - val_total_CE_loss = 0 - val_total_accuracy = 0 - - if local_rank == 0: - print('Start Train') - pbar = tqdm(total=epoch_step,desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) - model_train.train() - for iteration, batch in enumerate(gen): - if iteration >= epoch_step: - break - images, labels = batch - with torch.no_grad(): - if cuda: - images = images.cuda(local_rank) - labels = labels.cuda(local_rank) - - optimizer.zero_grad() - if not fp16: - outputs1, outputs2 = model_train(images, "train") - - _triplet_loss = loss(outputs1, Batch_size) - _CE_loss = nn.NLLLoss()(F.log_softmax(outputs2, dim = -1), labels) - _loss = _triplet_loss + _CE_loss - - _loss.backward() - optimizer.step() - else: - from torch.cuda.amp import autocast - with autocast(): - outputs1, outputs2 = model_train(images, "train") - - _triplet_loss = loss(outputs1, Batch_size) - _CE_loss = nn.NLLLoss()(F.log_softmax(outputs2, dim = -1), labels) - _loss = _triplet_loss + _CE_loss - #----------------------# - # 反向传播 - #----------------------# - scaler.scale(_loss).backward() - scaler.step(optimizer) - scaler.update() - - with torch.no_grad(): - accuracy = torch.mean((torch.argmax(F.softmax(outputs2, dim=-1), dim=-1) == labels).type(torch.FloatTensor)) - - total_triple_loss += _triplet_loss.item() - total_CE_loss += 
_CE_loss.item() - total_accuracy += accuracy.item() - - if local_rank == 0: - pbar.set_postfix(**{'total_triple_loss' : total_triple_loss / (iteration + 1), - 'total_CE_loss' : total_CE_loss / (iteration + 1), - 'accuracy' : total_accuracy / (iteration + 1), - 'lr' : get_lr(optimizer)}) - pbar.update(1) - - if local_rank == 0: - pbar.close() - print('Finish Train') - print('Start Validation') - pbar = tqdm(total=epoch_step_val, desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) - model_train.eval() - for iteration, batch in enumerate(gen_val): - if iteration >= epoch_step_val: - break - images, labels = batch - with torch.no_grad(): - if cuda: - images = images.cuda(local_rank) - labels = labels.cuda(local_rank) - - optimizer.zero_grad() - outputs1, outputs2 = model_train(images, "train") - - _triplet_loss = loss(outputs1, Batch_size) - _CE_loss = nn.NLLLoss()(F.log_softmax(outputs2, dim = -1), labels) - _loss = _triplet_loss + _CE_loss - - accuracy = torch.mean((torch.argmax(F.softmax(outputs2, dim=-1), dim=-1) == labels).type(torch.FloatTensor)) - - val_total_triple_loss += _triplet_loss.item() - val_total_CE_loss += _CE_loss.item() - val_total_accuracy += accuracy.item() - - if local_rank == 0: - pbar.set_postfix(**{'val_total_triple_loss' : val_total_triple_loss / (iteration + 1), - 'val_total_CE_loss' : val_total_CE_loss / (iteration + 1), - 'val_accuracy' : val_total_accuracy / (iteration + 1), - 'lr' : get_lr(optimizer)}) - pbar.update(1) - - if lfw_eval_flag: - print("开始进行LFW数据集的验证。") - labels, distances = [], [] - for _, (data_a, data_p, label) in enumerate(test_loader): - with torch.no_grad(): - data_a, data_p = data_a.type(torch.FloatTensor), data_p.type(torch.FloatTensor) - if cuda: - data_a, data_p = data_a.cuda(local_rank), data_p.cuda(local_rank) - out_a, out_p = model_train(data_a), model_train(data_p) - dists = torch.sqrt(torch.sum((out_a - out_p) ** 2, 1)) - distances.append(dists.data.cpu().numpy()) - labels.append(label.data.cpu().numpy()) - - labels = np.array([sublabel for label in labels for sublabel in label]) - distances = np.array([subdist for dist in distances for subdist in dist]) - _, _, accuracy, _, _, _, _ = evaluate(distances,labels) - - if local_rank == 0: - pbar.close() - print('Finish Validation') - - if lfw_eval_flag: - print('LFW_Accuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy))) - - loss_history.append_loss(epoch, np.mean(accuracy) if lfw_eval_flag else total_accuracy / epoch_step, \ - (total_triple_loss + total_CE_loss) / epoch_step, (val_total_triple_loss + val_total_CE_loss) / epoch_step_val) - print('Epoch:' + str(epoch + 1) + '/' + str(Epoch)) - print('Total Loss: %.4f' % ((total_triple_loss + total_CE_loss) / epoch_step)) - if (epoch + 1) % save_period == 0 or epoch + 1 == Epoch: - torch.save(model.state_dict(), os.path.join(save_dir, 'ep%03d-loss%.3f-val_loss%.3f.pth'%((epoch + 1), - (total_triple_loss + total_CE_loss) / epoch_step, - (val_total_triple_loss + val_total_CE_loss) / epoch_step_val))) +import os + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from tqdm import tqdm + +from utils.utils import get_lr +from utils.utils_metrics import evaluate + + +def fit_one_epoch(model_train, model, loss_history, loss, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, Epoch, cuda, test_loader, Batch_size, lfw_eval_flag, fp16, scaler, save_period, save_dir, local_rank): + total_triple_loss = 0 + total_CE_loss = 0 + total_accuracy = 0 + + val_total_triple_loss = 0 + 
val_total_CE_loss = 0 + val_total_accuracy = 0 + + if local_rank == 0: + print('Start Train') + pbar = tqdm(total=epoch_step,desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) + model_train.train() + for iteration, batch in enumerate(gen): + if iteration >= epoch_step: + break + images, labels = batch + with torch.no_grad(): + if cuda: + images = images.cuda(local_rank) + labels = labels.cuda(local_rank) + + optimizer.zero_grad() + if not fp16: + outputs1, outputs2 = model_train(images, "train") + + _triplet_loss = loss(outputs1, Batch_size) + _CE_loss = nn.NLLLoss()(F.log_softmax(outputs2, dim = -1), labels) + _loss = _triplet_loss + _CE_loss + + _loss.backward() + optimizer.step() + else: + from torch.cuda.amp import autocast + with autocast(): + outputs1, outputs2 = model_train(images, "train") + + _triplet_loss = loss(outputs1, Batch_size) + _CE_loss = nn.NLLLoss()(F.log_softmax(outputs2, dim = -1), labels) + _loss = _triplet_loss + _CE_loss + #----------------------# + # 反向传播 + #----------------------# + scaler.scale(_loss).backward() + scaler.step(optimizer) + scaler.update() + + with torch.no_grad(): + accuracy = torch.mean((torch.argmax(F.softmax(outputs2, dim=-1), dim=-1) == labels).type(torch.FloatTensor)) + + total_triple_loss += _triplet_loss.item() + total_CE_loss += _CE_loss.item() + total_accuracy += accuracy.item() + + if local_rank == 0: + pbar.set_postfix(**{'total_triple_loss' : total_triple_loss / (iteration + 1), + 'total_CE_loss' : total_CE_loss / (iteration + 1), + 'accuracy' : total_accuracy / (iteration + 1), + 'lr' : get_lr(optimizer)}) + pbar.update(1) + + if local_rank == 0: + pbar.close() + print('Finish Train') + print('Start Validation') + pbar = tqdm(total=epoch_step_val, desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) + model_train.eval() + for iteration, batch in enumerate(gen_val): + if iteration >= epoch_step_val: + break + images, labels = batch + with torch.no_grad(): + if cuda: + images = images.cuda(local_rank) + labels = labels.cuda(local_rank) + + optimizer.zero_grad() + outputs1, outputs2 = model_train(images, "train") + + _triplet_loss = loss(outputs1, Batch_size) + _CE_loss = nn.NLLLoss()(F.log_softmax(outputs2, dim = -1), labels) + _loss = _triplet_loss + _CE_loss + + accuracy = torch.mean((torch.argmax(F.softmax(outputs2, dim=-1), dim=-1) == labels).type(torch.FloatTensor)) + + val_total_triple_loss += _triplet_loss.item() + val_total_CE_loss += _CE_loss.item() + val_total_accuracy += accuracy.item() + + if local_rank == 0: + pbar.set_postfix(**{'val_total_triple_loss' : val_total_triple_loss / (iteration + 1), + 'val_total_CE_loss' : val_total_CE_loss / (iteration + 1), + 'val_accuracy' : val_total_accuracy / (iteration + 1), + 'lr' : get_lr(optimizer)}) + pbar.update(1) + + if lfw_eval_flag: + print("开始进行LFW数据集的验证。") + labels, distances = [], [] + for _, (data_a, data_p, label) in enumerate(test_loader): + with torch.no_grad(): + data_a, data_p = data_a.type(torch.FloatTensor), data_p.type(torch.FloatTensor) + if cuda: + data_a, data_p = data_a.cuda(local_rank), data_p.cuda(local_rank) + out_a, out_p = model_train(data_a), model_train(data_p) + dists = torch.sqrt(torch.sum((out_a - out_p) ** 2, 1)) + distances.append(dists.data.cpu().numpy()) + labels.append(label.data.cpu().numpy()) + + labels = np.array([sublabel for label in labels for sublabel in label]) + distances = np.array([subdist for dist in distances for subdist in dist]) + _, _, accuracy, _, _, _, _ = evaluate(distances,labels) + + if local_rank == 0: 
+ pbar.close() + print('Finish Validation') + + if lfw_eval_flag: + print('LFW_Accuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy))) + + loss_history.append_loss(epoch, np.mean(accuracy) if lfw_eval_flag else total_accuracy / epoch_step, \ + (total_triple_loss + total_CE_loss) / epoch_step, (val_total_triple_loss + val_total_CE_loss) / epoch_step_val) + print('Epoch:' + str(epoch + 1) + '/' + str(Epoch)) + print('Total Loss: %.4f' % ((total_triple_loss + total_CE_loss) / epoch_step)) + if (epoch + 1) % save_period == 0 or epoch + 1 == Epoch: + torch.save(model.state_dict(), os.path.join(save_dir, 'ep%03d-loss%.3f-val_loss%.3f.pth'%((epoch + 1), + (total_triple_loss + total_CE_loss) / epoch_step, + (val_total_triple_loss + val_total_CE_loss) / epoch_step_val))) diff --git a/cv/face_recognition/facenet/pytorch/utils/utils_metrics.py b/cv/face_recognition/facenet/pytorch/utils/utils_metrics.py index 110fc2bc1..7cdfe4804 100644 --- a/cv/face_recognition/facenet/pytorch/utils/utils_metrics.py +++ b/cv/face_recognition/facenet/pytorch/utils/utils_metrics.py @@ -1,157 +1,157 @@ -import numpy as np -import torch -from scipy import interpolate -from sklearn.model_selection import KFold -from tqdm import tqdm - -def evaluate(distances, labels, nrof_folds=10): - # Calculate evaluation metrics - thresholds = np.arange(0, 4, 0.01) - tpr, fpr, accuracy, best_thresholds = calculate_roc(thresholds, distances, - labels, nrof_folds=nrof_folds) - thresholds = np.arange(0, 4, 0.001) - val, val_std, far = calculate_val(thresholds, distances, - labels, 1e-3, nrof_folds=nrof_folds) - return tpr, fpr, accuracy, val, val_std, far, best_thresholds - -def calculate_roc(thresholds, distances, labels, nrof_folds=10): - - nrof_pairs = min(len(labels), len(distances)) - nrof_thresholds = len(thresholds) - k_fold = KFold(n_splits=nrof_folds, shuffle=False) - - tprs = np.zeros((nrof_folds,nrof_thresholds)) - fprs = np.zeros((nrof_folds,nrof_thresholds)) - accuracy = np.zeros((nrof_folds)) - - indices = np.arange(nrof_pairs) - - for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): - - # Find the best threshold for the fold - acc_train = np.zeros((nrof_thresholds)) - for threshold_idx, threshold in enumerate(thresholds): - _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, distances[train_set], labels[train_set]) - - best_threshold_index = np.argmax(acc_train) - for threshold_idx, threshold in enumerate(thresholds): - tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, distances[test_set], labels[test_set]) - _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], distances[test_set], labels[test_set]) - tpr = np.mean(tprs,0) - fpr = np.mean(fprs,0) - return tpr, fpr, accuracy, thresholds[best_threshold_index] - -def calculate_accuracy(threshold, dist, actual_issame): - predict_issame = np.less(dist, threshold) - tp = np.sum(np.logical_and(predict_issame, actual_issame)) - fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) - tn = np.sum(np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame))) - fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) - - tpr = 0 if (tp+fn==0) else float(tp) / float(tp+fn) - fpr = 0 if (fp+tn==0) else float(fp) / float(fp+tn) - acc = float(tp+tn)/dist.size - return tpr, fpr, acc - -def calculate_val(thresholds, distances, labels, far_target=1e-3, nrof_folds=10): - nrof_pairs = min(len(labels), len(distances)) - 
nrof_thresholds = len(thresholds) - k_fold = KFold(n_splits=nrof_folds, shuffle=False) - - val = np.zeros(nrof_folds) - far = np.zeros(nrof_folds) - - indices = np.arange(nrof_pairs) - - for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): - # Find the threshold that gives FAR = far_target - far_train = np.zeros(nrof_thresholds) - for threshold_idx, threshold in enumerate(thresholds): - _, far_train[threshold_idx] = calculate_val_far(threshold, distances[train_set], labels[train_set]) - if np.max(far_train)>=far_target: - f = interpolate.interp1d(far_train, thresholds, kind='slinear') - threshold = f(far_target) - else: - threshold = 0.0 - - val[fold_idx], far[fold_idx] = calculate_val_far(threshold, distances[test_set], labels[test_set]) - - val_mean = np.mean(val) - far_mean = np.mean(far) - val_std = np.std(val) - return val_mean, val_std, far_mean - -def calculate_val_far(threshold, dist, actual_issame): - predict_issame = np.less(dist, threshold) - true_accept = np.sum(np.logical_and(predict_issame, actual_issame)) - false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) - n_same = np.sum(actual_issame) - n_diff = np.sum(np.logical_not(actual_issame)) - if n_diff == 0: - n_diff = 1 - if n_same == 0: - return 0,0 - val = float(true_accept) / float(n_same) - far = float(false_accept) / float(n_diff) - return val, far - -def test(test_loader, model, png_save_path, log_interval, batch_size, cuda): - labels, distances = [], [] - pbar = tqdm(enumerate(test_loader)) - for batch_idx, (data_a, data_p, label) in pbar: - with torch.no_grad(): - #--------------------------------------# - # 加载数据,设置成cuda - #--------------------------------------# - data_a, data_p = data_a.type(torch.FloatTensor), data_p.type(torch.FloatTensor) - if cuda: - data_a, data_p = data_a.cuda(), data_p.cuda() - #--------------------------------------# - # 传入模型预测,获得预测结果 - # 获得预测结果的距离 - #--------------------------------------# - out_a, out_p = model(data_a), model(data_p) - dists = torch.sqrt(torch.sum((out_a - out_p) ** 2, 1)) - - #--------------------------------------# - # 将结果添加进列表中 - #--------------------------------------# - distances.append(dists.data.cpu().numpy()) - labels.append(label.data.cpu().numpy()) - - #--------------------------------------# - # 打印 - #--------------------------------------# - if batch_idx % log_interval == 0: - pbar.set_description('Test Epoch: [{}/{} ({:.0f}%)]'.format( - batch_idx * batch_size, len(test_loader.dataset), - 100. 
* batch_idx / len(test_loader))) - - #--------------------------------------# - # 转换成numpy - #--------------------------------------# - labels = np.array([sublabel for label in labels for sublabel in label]) - distances = np.array([subdist for dist in distances for subdist in dist]) - - tpr, fpr, accuracy, val, val_std, far, best_thresholds = evaluate(distances,labels) - print('Accuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy))) - print('Best_thresholds: %2.5f' % best_thresholds) - print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far)) - plot_roc(fpr, tpr, figure_name = png_save_path) - -def plot_roc(fpr, tpr, figure_name = "roc.png"): - import matplotlib.pyplot as plt - from sklearn.metrics import auc, roc_curve - roc_auc = auc(fpr, tpr) - fig = plt.figure() - lw = 2 - plt.plot(fpr, tpr, color='darkorange', - lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) - plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') - plt.xlim([0.0, 1.0]) - plt.ylim([0.0, 1.05]) - plt.xlabel('False Positive Rate') - plt.ylabel('True Positive Rate') - plt.title('Receiver operating characteristic') - plt.legend(loc="lower right") - fig.savefig(figure_name, dpi=fig.dpi) +import numpy as np +import torch +from scipy import interpolate +from sklearn.model_selection import KFold +from tqdm import tqdm + +def evaluate(distances, labels, nrof_folds=10): + # Calculate evaluation metrics + thresholds = np.arange(0, 4, 0.01) + tpr, fpr, accuracy, best_thresholds = calculate_roc(thresholds, distances, + labels, nrof_folds=nrof_folds) + thresholds = np.arange(0, 4, 0.001) + val, val_std, far = calculate_val(thresholds, distances, + labels, 1e-3, nrof_folds=nrof_folds) + return tpr, fpr, accuracy, val, val_std, far, best_thresholds + +def calculate_roc(thresholds, distances, labels, nrof_folds=10): + + nrof_pairs = min(len(labels), len(distances)) + nrof_thresholds = len(thresholds) + k_fold = KFold(n_splits=nrof_folds, shuffle=False) + + tprs = np.zeros((nrof_folds,nrof_thresholds)) + fprs = np.zeros((nrof_folds,nrof_thresholds)) + accuracy = np.zeros((nrof_folds)) + + indices = np.arange(nrof_pairs) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + + # Find the best threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + for threshold_idx, threshold in enumerate(thresholds): + _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, distances[train_set], labels[train_set]) + + best_threshold_index = np.argmax(acc_train) + for threshold_idx, threshold in enumerate(thresholds): + tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, distances[test_set], labels[test_set]) + _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], distances[test_set], labels[test_set]) + tpr = np.mean(tprs,0) + fpr = np.mean(fprs,0) + return tpr, fpr, accuracy, thresholds[best_threshold_index] + +def calculate_accuracy(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + tp = np.sum(np.logical_and(predict_issame, actual_issame)) + fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) + tn = np.sum(np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame))) + fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) + + tpr = 0 if (tp+fn==0) else float(tp) / float(tp+fn) + fpr = 0 if (fp+tn==0) else float(fp) / float(fp+tn) + acc = float(tp+tn)/dist.size + return tpr, fpr, acc + +def calculate_val(thresholds, 
distances, labels, far_target=1e-3, nrof_folds=10): + nrof_pairs = min(len(labels), len(distances)) + nrof_thresholds = len(thresholds) + k_fold = KFold(n_splits=nrof_folds, shuffle=False) + + val = np.zeros(nrof_folds) + far = np.zeros(nrof_folds) + + indices = np.arange(nrof_pairs) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + # Find the threshold that gives FAR = far_target + far_train = np.zeros(nrof_thresholds) + for threshold_idx, threshold in enumerate(thresholds): + _, far_train[threshold_idx] = calculate_val_far(threshold, distances[train_set], labels[train_set]) + if np.max(far_train)>=far_target: + f = interpolate.interp1d(far_train, thresholds, kind='slinear') + threshold = f(far_target) + else: + threshold = 0.0 + + val[fold_idx], far[fold_idx] = calculate_val_far(threshold, distances[test_set], labels[test_set]) + + val_mean = np.mean(val) + far_mean = np.mean(far) + val_std = np.std(val) + return val_mean, val_std, far_mean + +def calculate_val_far(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + true_accept = np.sum(np.logical_and(predict_issame, actual_issame)) + false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) + n_same = np.sum(actual_issame) + n_diff = np.sum(np.logical_not(actual_issame)) + if n_diff == 0: + n_diff = 1 + if n_same == 0: + return 0,0 + val = float(true_accept) / float(n_same) + far = float(false_accept) / float(n_diff) + return val, far + +def test(test_loader, model, png_save_path, log_interval, batch_size, cuda): + labels, distances = [], [] + pbar = tqdm(enumerate(test_loader)) + for batch_idx, (data_a, data_p, label) in pbar: + with torch.no_grad(): + #--------------------------------------# + # 加载数据,设置成cuda + #--------------------------------------# + data_a, data_p = data_a.type(torch.FloatTensor), data_p.type(torch.FloatTensor) + if cuda: + data_a, data_p = data_a.cuda(), data_p.cuda() + #--------------------------------------# + # 传入模型预测,获得预测结果 + # 获得预测结果的距离 + #--------------------------------------# + out_a, out_p = model(data_a), model(data_p) + dists = torch.sqrt(torch.sum((out_a - out_p) ** 2, 1)) + + #--------------------------------------# + # 将结果添加进列表中 + #--------------------------------------# + distances.append(dists.data.cpu().numpy()) + labels.append(label.data.cpu().numpy()) + + #--------------------------------------# + # 打印 + #--------------------------------------# + if batch_idx % log_interval == 0: + pbar.set_description('Test Epoch: [{}/{} ({:.0f}%)]'.format( + batch_idx * batch_size, len(test_loader.dataset), + 100. 
* batch_idx / len(test_loader))) + + #--------------------------------------# + # 转换成numpy + #--------------------------------------# + labels = np.array([sublabel for label in labels for sublabel in label]) + distances = np.array([subdist for dist in distances for subdist in dist]) + + tpr, fpr, accuracy, val, val_std, far, best_thresholds = evaluate(distances,labels) + print('Accuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy))) + print('Best_thresholds: %2.5f' % best_thresholds) + print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far)) + plot_roc(fpr, tpr, figure_name = png_save_path) + +def plot_roc(fpr, tpr, figure_name = "roc.png"): + import matplotlib.pyplot as plt + from sklearn.metrics import auc, roc_curve + roc_auc = auc(fpr, tpr) + fig = plt.figure() + lw = 2 + plt.plot(fpr, tpr, color='darkorange', + lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) + plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') + plt.xlim([0.0, 1.0]) + plt.ylim([0.0, 1.05]) + plt.xlabel('False Positive Rate') + plt.ylabel('True Positive Rate') + plt.title('Receiver operating characteristic') + plt.legend(loc="lower right") + fig.savefig(figure_name, dpi=fig.dpi) diff --git a/cv/multi_object_tracking/deep_sort/pytorch/evaluate.py b/cv/multi_object_tracking/deep_sort/pytorch/evaluate.py index 85eaa6fc3..31c40a46e 100644 --- a/cv/multi_object_tracking/deep_sort/pytorch/evaluate.py +++ b/cv/multi_object_tracking/deep_sort/pytorch/evaluate.py @@ -1,15 +1,15 @@ -import torch - -features = torch.load("features.pth") -qf = features["qf"] -ql = features["ql"] -gf = features["gf"] -gl = features["gl"] - -scores = qf.mm(gf.t()) -res = scores.topk(5, dim=1)[1][:,0] -top1correct = gl[res].eq(ql).sum().item() - -print("Acc top1:{:.3f}".format(top1correct/ql.size(0))) - - +import torch + +features = torch.load("features.pth") +qf = features["qf"] +ql = features["ql"] +gf = features["gf"] +gl = features["gl"] + +scores = qf.mm(gf.t()) +res = scores.topk(5, dim=1)[1][:,0] +top1correct = gl[res].eq(ql).sum().item() + +print("Acc top1:{:.3f}".format(top1correct/ql.size(0))) + + diff --git a/cv/multi_object_tracking/deep_sort/pytorch/model.py b/cv/multi_object_tracking/deep_sort/pytorch/model.py index bc92106d9..97e87547c 100644 --- a/cv/multi_object_tracking/deep_sort/pytorch/model.py +++ b/cv/multi_object_tracking/deep_sort/pytorch/model.py @@ -1,104 +1,104 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -class BasicBlock(nn.Module): - def __init__(self, c_in, c_out,is_downsample=False): - super(BasicBlock,self).__init__() - self.is_downsample = is_downsample - if is_downsample: - self.conv1 = nn.Conv2d(c_in, c_out, 3, stride=2, padding=1, bias=False) - else: - self.conv1 = nn.Conv2d(c_in, c_out, 3, stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(c_out) - self.relu = nn.ReLU(True) - self.conv2 = nn.Conv2d(c_out,c_out,3,stride=1,padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(c_out) - if is_downsample: - self.downsample = nn.Sequential( - nn.Conv2d(c_in, c_out, 1, stride=2, bias=False), - nn.BatchNorm2d(c_out) - ) - elif c_in != c_out: - self.downsample = nn.Sequential( - nn.Conv2d(c_in, c_out, 1, stride=1, bias=False), - nn.BatchNorm2d(c_out) - ) - self.is_downsample = True - - def forward(self,x): - y = self.conv1(x) - y = self.bn1(y) - y = self.relu(y) - y = self.conv2(y) - y = self.bn2(y) - if self.is_downsample: - x = self.downsample(x) - return F.relu(x.add(y),True) - -def make_layers(c_in,c_out,repeat_times, is_downsample=False): - 
blocks = [] - for i in range(repeat_times): - if i ==0: - blocks += [BasicBlock(c_in,c_out, is_downsample=is_downsample),] - else: - blocks += [BasicBlock(c_out,c_out),] - return nn.Sequential(*blocks) - -class Net(nn.Module): - def __init__(self, num_classes=751 ,reid=False): - super(Net,self).__init__() - # 3 128 64 - self.conv = nn.Sequential( - nn.Conv2d(3,64,3,stride=1,padding=1), - nn.BatchNorm2d(64), - nn.ReLU(inplace=True), - # nn.Conv2d(32,32,3,stride=1,padding=1), - # nn.BatchNorm2d(32), - # nn.ReLU(inplace=True), - nn.MaxPool2d(3,2,padding=1), - ) - # 32 64 32 - self.layer1 = make_layers(64,64,2,False) - # 32 64 32 - self.layer2 = make_layers(64,128,2,True) - # 64 32 16 - self.layer3 = make_layers(128,256,2,True) - # 128 16 8 - self.layer4 = make_layers(256,512,2,True) - # 256 8 4 - self.avgpool = nn.AvgPool2d((8,4),1) - # 256 1 1 - self.reid = reid - self.classifier = nn.Sequential( - nn.Linear(512, 256), - nn.BatchNorm1d(256), - nn.ReLU(inplace=True), - nn.Dropout(), - nn.Linear(256, num_classes), - ) - - def forward(self, x): - x = self.conv(x) - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - x = self.avgpool(x) - x = x.view(x.size(0),-1) - # B x 128 - if self.reid: - x = x.div(x.norm(p=2,dim=1,keepdim=True)) - return x - # classifier - x = self.classifier(x) - return x - - -if __name__ == '__main__': - net = Net() - x = torch.randn(4,3,128,64) - y = net(x) - import ipdb; ipdb.set_trace() - - +import torch +import torch.nn as nn +import torch.nn.functional as F + +class BasicBlock(nn.Module): + def __init__(self, c_in, c_out,is_downsample=False): + super(BasicBlock,self).__init__() + self.is_downsample = is_downsample + if is_downsample: + self.conv1 = nn.Conv2d(c_in, c_out, 3, stride=2, padding=1, bias=False) + else: + self.conv1 = nn.Conv2d(c_in, c_out, 3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(c_out) + self.relu = nn.ReLU(True) + self.conv2 = nn.Conv2d(c_out,c_out,3,stride=1,padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(c_out) + if is_downsample: + self.downsample = nn.Sequential( + nn.Conv2d(c_in, c_out, 1, stride=2, bias=False), + nn.BatchNorm2d(c_out) + ) + elif c_in != c_out: + self.downsample = nn.Sequential( + nn.Conv2d(c_in, c_out, 1, stride=1, bias=False), + nn.BatchNorm2d(c_out) + ) + self.is_downsample = True + + def forward(self,x): + y = self.conv1(x) + y = self.bn1(y) + y = self.relu(y) + y = self.conv2(y) + y = self.bn2(y) + if self.is_downsample: + x = self.downsample(x) + return F.relu(x.add(y),True) + +def make_layers(c_in,c_out,repeat_times, is_downsample=False): + blocks = [] + for i in range(repeat_times): + if i ==0: + blocks += [BasicBlock(c_in,c_out, is_downsample=is_downsample),] + else: + blocks += [BasicBlock(c_out,c_out),] + return nn.Sequential(*blocks) + +class Net(nn.Module): + def __init__(self, num_classes=751 ,reid=False): + super(Net,self).__init__() + # 3 128 64 + self.conv = nn.Sequential( + nn.Conv2d(3,64,3,stride=1,padding=1), + nn.BatchNorm2d(64), + nn.ReLU(inplace=True), + # nn.Conv2d(32,32,3,stride=1,padding=1), + # nn.BatchNorm2d(32), + # nn.ReLU(inplace=True), + nn.MaxPool2d(3,2,padding=1), + ) + # 32 64 32 + self.layer1 = make_layers(64,64,2,False) + # 32 64 32 + self.layer2 = make_layers(64,128,2,True) + # 64 32 16 + self.layer3 = make_layers(128,256,2,True) + # 128 16 8 + self.layer4 = make_layers(256,512,2,True) + # 256 8 4 + self.avgpool = nn.AvgPool2d((8,4),1) + # 256 1 1 + self.reid = reid + self.classifier = nn.Sequential( + nn.Linear(512, 256), + 
nn.BatchNorm1d(256), + nn.ReLU(inplace=True), + nn.Dropout(), + nn.Linear(256, num_classes), + ) + + def forward(self, x): + x = self.conv(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.avgpool(x) + x = x.view(x.size(0),-1) + # B x 128 + if self.reid: + x = x.div(x.norm(p=2,dim=1,keepdim=True)) + return x + # classifier + x = self.classifier(x) + return x + + +if __name__ == '__main__': + net = Net() + x = torch.randn(4,3,128,64) + y = net(x) + import ipdb; ipdb.set_trace() + + diff --git a/cv/multi_object_tracking/deep_sort/pytorch/test.py b/cv/multi_object_tracking/deep_sort/pytorch/test.py index ecac0ad5c..ebd590336 100644 --- a/cv/multi_object_tracking/deep_sort/pytorch/test.py +++ b/cv/multi_object_tracking/deep_sort/pytorch/test.py @@ -1,77 +1,77 @@ -import torch -import torch.backends.cudnn as cudnn -import torchvision - -import argparse -import os - -from model import Net - -parser = argparse.ArgumentParser(description="Train on market1501") -parser.add_argument("--data-dir",default='data',type=str) -parser.add_argument("--no-cuda",action="store_true") -parser.add_argument("--gpu-id",default=0,type=int) -args = parser.parse_args() - -# device -device = "cuda:{}".format(args.gpu_id) if torch.cuda.is_available() and not args.no_cuda else "cpu" -if torch.cuda.is_available() and not args.no_cuda: - cudnn.benchmark = True - -# data loader -root = args.data_dir -query_dir = os.path.join(root,"query") -gallery_dir = os.path.join(root,"gallery") -transform = torchvision.transforms.Compose([ - torchvision.transforms.Resize((128,64)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) -]) -queryloader = torch.utils.data.DataLoader( - torchvision.datasets.ImageFolder(query_dir, transform=transform), - batch_size=64, shuffle=False -) -galleryloader = torch.utils.data.DataLoader( - torchvision.datasets.ImageFolder(gallery_dir, transform=transform), - batch_size=64, shuffle=False -) - -# net definition -net = Net(reid=True) -assert os.path.isfile("./checkpoint/ckpt.t7"), "Error: no checkpoint file found!" 
-print('Loading from checkpoint/ckpt.t7') -checkpoint = torch.load("./checkpoint/ckpt.t7") -net_dict = checkpoint['net_dict'] -net.load_state_dict(net_dict, strict=False) -net.eval() -net.to(device) - -# compute features -query_features = torch.tensor([]).float() -query_labels = torch.tensor([]).long() -gallery_features = torch.tensor([]).float() -gallery_labels = torch.tensor([]).long() - -with torch.no_grad(): - for idx,(inputs,labels) in enumerate(queryloader): - inputs = inputs.to(device) - features = net(inputs).cpu() - query_features = torch.cat((query_features, features), dim=0) - query_labels = torch.cat((query_labels, labels)) - - for idx,(inputs,labels) in enumerate(galleryloader): - inputs = inputs.to(device) - features = net(inputs).cpu() - gallery_features = torch.cat((gallery_features, features), dim=0) - gallery_labels = torch.cat((gallery_labels, labels)) - -gallery_labels -= 2 - -# save features -features = { - "qf": query_features, - "ql": query_labels, - "gf": gallery_features, - "gl": gallery_labels -} +import torch +import torch.backends.cudnn as cudnn +import torchvision + +import argparse +import os + +from model import Net + +parser = argparse.ArgumentParser(description="Train on market1501") +parser.add_argument("--data-dir",default='data',type=str) +parser.add_argument("--no-cuda",action="store_true") +parser.add_argument("--gpu-id",default=0,type=int) +args = parser.parse_args() + +# device +device = "cuda:{}".format(args.gpu_id) if torch.cuda.is_available() and not args.no_cuda else "cpu" +if torch.cuda.is_available() and not args.no_cuda: + cudnn.benchmark = True + +# data loader +root = args.data_dir +query_dir = os.path.join(root,"query") +gallery_dir = os.path.join(root,"gallery") +transform = torchvision.transforms.Compose([ + torchvision.transforms.Resize((128,64)), + torchvision.transforms.ToTensor(), + torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) +]) +queryloader = torch.utils.data.DataLoader( + torchvision.datasets.ImageFolder(query_dir, transform=transform), + batch_size=64, shuffle=False +) +galleryloader = torch.utils.data.DataLoader( + torchvision.datasets.ImageFolder(gallery_dir, transform=transform), + batch_size=64, shuffle=False +) + +# net definition +net = Net(reid=True) +assert os.path.isfile("./checkpoint/ckpt.t7"), "Error: no checkpoint file found!" 
+print('Loading from checkpoint/ckpt.t7') +checkpoint = torch.load("./checkpoint/ckpt.t7") +net_dict = checkpoint['net_dict'] +net.load_state_dict(net_dict, strict=False) +net.eval() +net.to(device) + +# compute features +query_features = torch.tensor([]).float() +query_labels = torch.tensor([]).long() +gallery_features = torch.tensor([]).float() +gallery_labels = torch.tensor([]).long() + +with torch.no_grad(): + for idx,(inputs,labels) in enumerate(queryloader): + inputs = inputs.to(device) + features = net(inputs).cpu() + query_features = torch.cat((query_features, features), dim=0) + query_labels = torch.cat((query_labels, labels)) + + for idx,(inputs,labels) in enumerate(galleryloader): + inputs = inputs.to(device) + features = net(inputs).cpu() + gallery_features = torch.cat((gallery_features, features), dim=0) + gallery_labels = torch.cat((gallery_labels, labels)) + +gallery_labels -= 2 + +# save features +features = { + "qf": query_features, + "ql": query_labels, + "gf": gallery_features, + "gl": gallery_labels +} torch.save(features,"features.pth") \ No newline at end of file diff --git a/cv/multi_object_tracking/fairmot/pytorch/src/lib/tracking_utils/kalman_filter.py b/cv/multi_object_tracking/fairmot/pytorch/src/lib/tracking_utils/kalman_filter.py index ab7b31e25..b4c4e9854 100644 --- a/cv/multi_object_tracking/fairmot/pytorch/src/lib/tracking_utils/kalman_filter.py +++ b/cv/multi_object_tracking/fairmot/pytorch/src/lib/tracking_utils/kalman_filter.py @@ -1,269 +1,269 @@ -# vim: expandtab:ts=4:sw=4 -import numpy as np -import scipy.linalg - -""" -Table for the 0.95 quantile of the chi-square distribution with N degrees of -freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv -function and used as Mahalanobis gating threshold. -""" -chi2inv95 = { - 1: 3.8415, - 2: 5.9915, - 3: 7.8147, - 4: 9.4877, - 5: 11.070, - 6: 12.592, - 7: 14.067, - 8: 15.507, - 9: 16.919} - - -class KalmanFilter(object): - """ - A simple Kalman filter for tracking bounding boxes in image space. - - The 8-dimensional state space - - x, y, a, h, vx, vy, va, vh - - contains the bounding box center position (x, y), aspect ratio a, height h, - and their respective velocities. - - Object motion follows a constant velocity model. The bounding box location - (x, y, a, h) is taken as direct observation of the state space (linear - observation model). - - """ - - def __init__(self): - ndim, dt = 4, 1. - - # Create Kalman filter model matrices. - self._motion_mat = np.eye(2 * ndim, 2 * ndim) - for i in range(ndim): - self._motion_mat[i, ndim + i] = dt - self._update_mat = np.eye(ndim, 2 * ndim) - - # Motion and observation uncertainty are chosen relative to the current - # state estimate. These weights control the amount of uncertainty in - # the model. This is a bit hacky. - self._std_weight_position = 1. / 20 - self._std_weight_velocity = 1. / 160 - - def initiate(self, measurement): - """Create track from unassociated measurement. - - Parameters - ---------- - measurement : ndarray - Bounding box coordinates (x, y, a, h) with center position (x, y), - aspect ratio a, and height h. - - Returns - ------- - (ndarray, ndarray) - Returns the mean vector (8 dimensional) and covariance matrix (8x8 - dimensional) of the new track. Unobserved velocities are initialized - to 0 mean. 
- - """ - mean_pos = measurement - mean_vel = np.zeros_like(mean_pos) - mean = np.r_[mean_pos, mean_vel] - - std = [ - 2 * self._std_weight_position * measurement[3], - 2 * self._std_weight_position * measurement[3], - 1e-2, - 2 * self._std_weight_position * measurement[3], - 10 * self._std_weight_velocity * measurement[3], - 10 * self._std_weight_velocity * measurement[3], - 1e-5, - 10 * self._std_weight_velocity * measurement[3]] - covariance = np.diag(np.square(std)) - return mean, covariance - - def predict(self, mean, covariance): - """Run Kalman filter prediction step. - - Parameters - ---------- - mean : ndarray - The 8 dimensional mean vector of the object state at the previous - time step. - covariance : ndarray - The 8x8 dimensional covariance matrix of the object state at the - previous time step. - - Returns - ------- - (ndarray, ndarray) - Returns the mean vector and covariance matrix of the predicted - state. Unobserved velocities are initialized to 0 mean. - - """ - std_pos = [ - self._std_weight_position * mean[3], - self._std_weight_position * mean[3], - 1e-2, - self._std_weight_position * mean[3]] - std_vel = [ - self._std_weight_velocity * mean[3], - self._std_weight_velocity * mean[3], - 1e-5, - self._std_weight_velocity * mean[3]] - motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) - - #mean = np.dot(self._motion_mat, mean) - mean = np.dot(mean, self._motion_mat.T) - covariance = np.linalg.multi_dot(( - self._motion_mat, covariance, self._motion_mat.T)) + motion_cov - - return mean, covariance - - def project(self, mean, covariance): - """Project state distribution to measurement space. - - Parameters - ---------- - mean : ndarray - The state's mean vector (8 dimensional array). - covariance : ndarray - The state's covariance matrix (8x8 dimensional). - - Returns - ------- - (ndarray, ndarray) - Returns the projected mean and covariance matrix of the given state - estimate. - - """ - std = [ - self._std_weight_position * mean[3], - self._std_weight_position * mean[3], - 1e-1, - self._std_weight_position * mean[3]] - innovation_cov = np.diag(np.square(std)) - - mean = np.dot(self._update_mat, mean) - covariance = np.linalg.multi_dot(( - self._update_mat, covariance, self._update_mat.T)) - return mean, covariance + innovation_cov - - def multi_predict(self, mean, covariance): - """Run Kalman filter prediction step (Vectorized version). - Parameters - ---------- - mean : ndarray - The Nx8 dimensional mean matrix of the object states at the previous - time step. - covariance : ndarray - The Nx8x8 dimensional covariance matrics of the object states at the - previous time step. - Returns - ------- - (ndarray, ndarray) - Returns the mean vector and covariance matrix of the predicted - state. Unobserved velocities are initialized to 0 mean. 
- """ - std_pos = [ - self._std_weight_position * mean[:, 3], - self._std_weight_position * mean[:, 3], - 1e-2 * np.ones_like(mean[:, 3]), - self._std_weight_position * mean[:, 3]] - std_vel = [ - self._std_weight_velocity * mean[:, 3], - self._std_weight_velocity * mean[:, 3], - 1e-5 * np.ones_like(mean[:, 3]), - self._std_weight_velocity * mean[:, 3]] - sqr = np.square(np.r_[std_pos, std_vel]).T - - motion_cov = [] - for i in range(len(mean)): - motion_cov.append(np.diag(sqr[i])) - motion_cov = np.asarray(motion_cov) - - mean = np.dot(mean, self._motion_mat.T) - left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2)) - covariance = np.dot(left, self._motion_mat.T) + motion_cov - - return mean, covariance - - def update(self, mean, covariance, measurement): - """Run Kalman filter correction step. - - Parameters - ---------- - mean : ndarray - The predicted state's mean vector (8 dimensional). - covariance : ndarray - The state's covariance matrix (8x8 dimensional). - measurement : ndarray - The 4 dimensional measurement vector (x, y, a, h), where (x, y) - is the center position, a the aspect ratio, and h the height of the - bounding box. - - Returns - ------- - (ndarray, ndarray) - Returns the measurement-corrected state distribution. - - """ - projected_mean, projected_cov = self.project(mean, covariance) - - chol_factor, lower = scipy.linalg.cho_factor( - projected_cov, lower=True, check_finite=False) - kalman_gain = scipy.linalg.cho_solve( - (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, - check_finite=False).T - innovation = measurement - projected_mean - - new_mean = mean + np.dot(innovation, kalman_gain.T) - new_covariance = covariance - np.linalg.multi_dot(( - kalman_gain, projected_cov, kalman_gain.T)) - return new_mean, new_covariance - - def gating_distance(self, mean, covariance, measurements, - only_position=False, metric='maha'): - """Compute gating distance between state distribution and measurements. - A suitable distance threshold can be obtained from `chi2inv95`. If - `only_position` is False, the chi-square distribution has 4 degrees of - freedom, otherwise 2. - Parameters - ---------- - mean : ndarray - Mean vector over the state distribution (8 dimensional). - covariance : ndarray - Covariance of the state distribution (8x8 dimensional). - measurements : ndarray - An Nx4 dimensional matrix of N measurements, each in - format (x, y, a, h) where (x, y) is the bounding box center - position, a the aspect ratio, and h the height. - only_position : Optional[bool] - If True, distance computation is done with respect to the bounding - box center position only. - Returns - ------- - ndarray - Returns an array of length N, where the i-th element contains the - squared Mahalanobis distance between (mean, covariance) and - `measurements[i]`. 
- """ - mean, covariance = self.project(mean, covariance) - if only_position: - mean, covariance = mean[:2], covariance[:2, :2] - measurements = measurements[:, :2] - - d = measurements - mean - if metric == 'gaussian': - return np.sum(d * d, axis=1) - elif metric == 'maha': - cholesky_factor = np.linalg.cholesky(covariance) - z = scipy.linalg.solve_triangular( - cholesky_factor, d.T, lower=True, check_finite=False, - overwrite_b=True) - squared_maha = np.sum(z * z, axis=0) - return squared_maha - else: - raise ValueError('invalid distance metric') +# vim: expandtab:ts=4:sw=4 +import numpy as np +import scipy.linalg + +""" +Table for the 0.95 quantile of the chi-square distribution with N degrees of +freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv +function and used as Mahalanobis gating threshold. +""" +chi2inv95 = { + 1: 3.8415, + 2: 5.9915, + 3: 7.8147, + 4: 9.4877, + 5: 11.070, + 6: 12.592, + 7: 14.067, + 8: 15.507, + 9: 16.919} + + +class KalmanFilter(object): + """ + A simple Kalman filter for tracking bounding boxes in image space. + + The 8-dimensional state space + + x, y, a, h, vx, vy, va, vh + + contains the bounding box center position (x, y), aspect ratio a, height h, + and their respective velocities. + + Object motion follows a constant velocity model. The bounding box location + (x, y, a, h) is taken as direct observation of the state space (linear + observation model). + + """ + + def __init__(self): + ndim, dt = 4, 1. + + # Create Kalman filter model matrices. + self._motion_mat = np.eye(2 * ndim, 2 * ndim) + for i in range(ndim): + self._motion_mat[i, ndim + i] = dt + self._update_mat = np.eye(ndim, 2 * ndim) + + # Motion and observation uncertainty are chosen relative to the current + # state estimate. These weights control the amount of uncertainty in + # the model. This is a bit hacky. + self._std_weight_position = 1. / 20 + self._std_weight_velocity = 1. / 160 + + def initiate(self, measurement): + """Create track from unassociated measurement. + + Parameters + ---------- + measurement : ndarray + Bounding box coordinates (x, y, a, h) with center position (x, y), + aspect ratio a, and height h. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector (8 dimensional) and covariance matrix (8x8 + dimensional) of the new track. Unobserved velocities are initialized + to 0 mean. + + """ + mean_pos = measurement + mean_vel = np.zeros_like(mean_pos) + mean = np.r_[mean_pos, mean_vel] + + std = [ + 2 * self._std_weight_position * measurement[3], + 2 * self._std_weight_position * measurement[3], + 1e-2, + 2 * self._std_weight_position * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 1e-5, + 10 * self._std_weight_velocity * measurement[3]] + covariance = np.diag(np.square(std)) + return mean, covariance + + def predict(self, mean, covariance): + """Run Kalman filter prediction step. + + Parameters + ---------- + mean : ndarray + The 8 dimensional mean vector of the object state at the previous + time step. + covariance : ndarray + The 8x8 dimensional covariance matrix of the object state at the + previous time step. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. 
+ + """ + std_pos = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-2, + self._std_weight_position * mean[3]] + std_vel = [ + self._std_weight_velocity * mean[3], + self._std_weight_velocity * mean[3], + 1e-5, + self._std_weight_velocity * mean[3]] + motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) + + #mean = np.dot(self._motion_mat, mean) + mean = np.dot(mean, self._motion_mat.T) + covariance = np.linalg.multi_dot(( + self._motion_mat, covariance, self._motion_mat.T)) + motion_cov + + return mean, covariance + + def project(self, mean, covariance): + """Project state distribution to measurement space. + + Parameters + ---------- + mean : ndarray + The state's mean vector (8 dimensional array). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + + Returns + ------- + (ndarray, ndarray) + Returns the projected mean and covariance matrix of the given state + estimate. + + """ + std = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-1, + self._std_weight_position * mean[3]] + innovation_cov = np.diag(np.square(std)) + + mean = np.dot(self._update_mat, mean) + covariance = np.linalg.multi_dot(( + self._update_mat, covariance, self._update_mat.T)) + return mean, covariance + innovation_cov + + def multi_predict(self, mean, covariance): + """Run Kalman filter prediction step (Vectorized version). + Parameters + ---------- + mean : ndarray + The Nx8 dimensional mean matrix of the object states at the previous + time step. + covariance : ndarray + The Nx8x8 dimensional covariance matrics of the object states at the + previous time step. + Returns + ------- + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. + """ + std_pos = [ + self._std_weight_position * mean[:, 3], + self._std_weight_position * mean[:, 3], + 1e-2 * np.ones_like(mean[:, 3]), + self._std_weight_position * mean[:, 3]] + std_vel = [ + self._std_weight_velocity * mean[:, 3], + self._std_weight_velocity * mean[:, 3], + 1e-5 * np.ones_like(mean[:, 3]), + self._std_weight_velocity * mean[:, 3]] + sqr = np.square(np.r_[std_pos, std_vel]).T + + motion_cov = [] + for i in range(len(mean)): + motion_cov.append(np.diag(sqr[i])) + motion_cov = np.asarray(motion_cov) + + mean = np.dot(mean, self._motion_mat.T) + left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2)) + covariance = np.dot(left, self._motion_mat.T) + motion_cov + + return mean, covariance + + def update(self, mean, covariance, measurement): + """Run Kalman filter correction step. + + Parameters + ---------- + mean : ndarray + The predicted state's mean vector (8 dimensional). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + measurement : ndarray + The 4 dimensional measurement vector (x, y, a, h), where (x, y) + is the center position, a the aspect ratio, and h the height of the + bounding box. + + Returns + ------- + (ndarray, ndarray) + Returns the measurement-corrected state distribution. 
+ + """ + projected_mean, projected_cov = self.project(mean, covariance) + + chol_factor, lower = scipy.linalg.cho_factor( + projected_cov, lower=True, check_finite=False) + kalman_gain = scipy.linalg.cho_solve( + (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, + check_finite=False).T + innovation = measurement - projected_mean + + new_mean = mean + np.dot(innovation, kalman_gain.T) + new_covariance = covariance - np.linalg.multi_dot(( + kalman_gain, projected_cov, kalman_gain.T)) + return new_mean, new_covariance + + def gating_distance(self, mean, covariance, measurements, + only_position=False, metric='maha'): + """Compute gating distance between state distribution and measurements. + A suitable distance threshold can be obtained from `chi2inv95`. If + `only_position` is False, the chi-square distribution has 4 degrees of + freedom, otherwise 2. + Parameters + ---------- + mean : ndarray + Mean vector over the state distribution (8 dimensional). + covariance : ndarray + Covariance of the state distribution (8x8 dimensional). + measurements : ndarray + An Nx4 dimensional matrix of N measurements, each in + format (x, y, a, h) where (x, y) is the bounding box center + position, a the aspect ratio, and h the height. + only_position : Optional[bool] + If True, distance computation is done with respect to the bounding + box center position only. + Returns + ------- + ndarray + Returns an array of length N, where the i-th element contains the + squared Mahalanobis distance between (mean, covariance) and + `measurements[i]`. + """ + mean, covariance = self.project(mean, covariance) + if only_position: + mean, covariance = mean[:2], covariance[:2, :2] + measurements = measurements[:, :2] + + d = measurements - mean + if metric == 'gaussian': + return np.sum(d * d, axis=1) + elif metric == 'maha': + cholesky_factor = np.linalg.cholesky(covariance) + z = scipy.linalg.solve_triangular( + cholesky_factor, d.T, lower=True, check_finite=False, + overwrite_b=True) + squared_maha = np.sum(z * z, axis=0) + return squared_maha + else: + raise ValueError('invalid distance metric') diff --git a/cv/pose/alphapose/pytorch/alphapose/datasets/coco_wholebody.py b/cv/pose/alphapose/pytorch/alphapose/datasets/coco_wholebody.py index 23f391e37..f021d7f89 100755 --- a/cv/pose/alphapose/pytorch/alphapose/datasets/coco_wholebody.py +++ b/cv/pose/alphapose/pytorch/alphapose/datasets/coco_wholebody.py @@ -1,161 +1,161 @@ -# ----------------------------------------------------- -# Copyright (c) Shanghai Jiao Tong University. All rights reserved. -# Written by Haoyi Zhu and Hao-Shu Fang -# ----------------------------------------------------- -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -"""Halpe Full-Body(136 points) Human keypoint dataset.""" -import os - -import numpy as np -from tkinter import _flatten - -from alphapose.models.builder import DATASET -from alphapose.utils.bbox import bbox_clip_xyxy, bbox_xywh_to_xyxy - -from .custom import CustomDataset - - -@DATASET.register_module -class coco_wholebody(CustomDataset): - """ Halpe Full-Body(136 points) Person dataset. - - Parameters - ---------- - train: bool, default is True - If true, will set as training mode. - skip_empty: bool, default is False - Whether skip entire image if no valid label is found. Use `False` if this dataset is - for validation to avoid COCO metric error. - dpg: bool, default is False - If true, will activate `dpg` for data augmentation. 
- """ - CLASSES = ['person'] - EVAL_JOINTS = list(range(133)) - num_joints = 133 - CustomDataset.lower_body_ids = (11, 12, 13, 14, 15, 16, 17, 21-3, 22-3, 23-3, 24-3, 25-3) - """Joint pairs which defines the pairs of joint to be swapped - when the image is flipped horizontally.""" - joint_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16], #17 body keypoints - [20-3, 23-3], [21-3, 24-3], [22-3, 25-3], [26-3, 42-3], [27-3, 41-3], [28-3, 40-3], [29-3, 39-3], [30-3, 38-3], - [31-3, 37-3], [32-3, 36-3], [33-3, 35-3], [43-3, 52-3], [44-3, 51-3], [45-3, 50-3], [46-3, 49-3], [47-3, 48-3], - [62-3, 71-3], [63-3, 70-3], [64-3, 69-3], [65-3, 68-3], [66-3, 73-3], [67-3, 72-3], [57-3, 61-3], [58-3, 60-3], - [74-3, 80-3], [75-3, 79-3], [76-3, 78-3], [87-3, 89-3], [93-3, 91-3], [86-3, 90-3], [85-3, 81-3], [84-3, 82-3], - [94-3, 115-3], [95-3, 116-3], [96-3, 117-3], [97-3, 118-3], [98-3, 119-3], [99-3, 120-3], [100-3, 121-3], - [101-3, 122-3], [102-3, 123-3], [103-3, 124-3], [104-3, 125-3], [105-3, 126-3], [106-3, 127-3], [107-3, 128-3], - [108-3, 129-3], [109-3, 130-3], [110-3, 131-3], [111-3, 132-3], [112-3, 133-3], [113-3, 134-3], [114-3, 135-3]] - - - def _load_jsons(self): - """Load all image paths and labels from JSON annotation files into buffer.""" - items = [] - labels = [] - - _coco = self._lazy_load_ann_file() - - classes = [c['name'] for c in _coco.loadCats(_coco.getCatIds())] - assert classes == self.CLASSES, "Incompatible category names with COCO. " - - self.json_id_to_contiguous = { - v: k for k, v in enumerate(_coco.getCatIds())} - - # iterate through the annotations - image_ids = sorted(_coco.getImgIds()) - for entry in _coco.loadImgs(image_ids): - dirname, filename = entry['coco_url'].split('/')[-2:] - abs_path = os.path.join('/DATA1/Benchmark/coco', dirname, filename) - if not os.path.exists(abs_path): - raise IOError('Image: {} not exists.'.format(abs_path)) - label = self._check_load_keypoints(_coco, entry) - if not label: - continue - for obj in label: - items.append(abs_path) - labels.append(obj) - - return items, labels - - def _check_load_keypoints(self, coco, entry): - """Check and load ground-truth keypoints""" - ann_ids = coco.getAnnIds(imgIds=entry['id'], iscrowd=False) - objs = coco.loadAnns(ann_ids) - # check valid bboxes - valid_objs = [] - width = entry['width'] - height = entry['height'] - - for obj in objs: - #obj['keypoints'].extend([0,0,0, 0,0,0, 0,0,0]) - obj['keypoints'].extend(obj['foot_kpts']) - obj['keypoints'].extend(obj['face_kpts']) - obj['keypoints'].extend(obj['lefthand_kpts']) - obj['keypoints'].extend(obj['righthand_kpts']) - contiguous_cid = self.json_id_to_contiguous[obj['category_id']] - if contiguous_cid >= self.num_class: - # not class of interest - continue - if max(obj['keypoints']) == 0: - continue - # convert from (x, y, w, h) to (xmin, ymin, xmax, ymax) and clip bound - xmin, ymin, xmax, ymax = bbox_clip_xyxy(bbox_xywh_to_xyxy(obj['bbox']), width, height) - # require non-zero box area - #if obj['area'] <= 0 or xmax <= xmin or ymax <= ymin: - if (xmax-xmin)*(ymax-ymin) <= 0 or xmax <= xmin or ymax <= ymin: - continue - if 'num_keypoints' in obj and obj['num_keypoints'] == 0: - continue - # joints 3d: (num_joints, 3, 2); 3 is for x, y, z; 2 is for position, visibility - joints_3d = np.zeros((self.num_joints, 3, 2), dtype=np.float32) - for i in range(self.num_joints): - joints_3d[i, 0, 0] = obj['keypoints'][i * 3 + 0] - joints_3d[i, 1, 0] = obj['keypoints'][i * 3 + 1] - # joints_3d[i, 2, 0] = 0 - if obj['keypoints'][i * 3 
+ 2] >= 0.35: - visible = 1 - else: - visible = 0 - #visible = min(1, visible) - joints_3d[i, :2, 1] = visible - # joints_3d[i, 2, 1] = 0 - - if np.sum(joints_3d[:, 0, 1]) < 1: - # no visible keypoint - continue - - if self._check_centers and self._train: - bbox_center, bbox_area = self._get_box_center_area((xmin, ymin, xmax, ymax)) - kp_center, num_vis = self._get_keypoints_center_count(joints_3d) - ks = np.exp(-2 * np.sum(np.square(bbox_center - kp_center)) / bbox_area) - if (num_vis / 80.0 + 47 / 80.0) > ks: - continue - - valid_objs.append({ - 'bbox': (xmin, ymin, xmax, ymax), - 'width': width, - 'height': height, - 'joints_3d': joints_3d - }) - - if not valid_objs: - if not self._skip_empty: - # dummy invalid labels if no valid objects are found - valid_objs.append({ - 'bbox': np.array([-1, -1, 0, 0]), - 'width': width, - 'height': height, - 'joints_3d': np.zeros((self.num_joints, 2, 2), dtype=np.float32) - }) - return valid_objs - - def _get_box_center_area(self, bbox): - """Get bbox center""" - c = np.array([(bbox[0] + bbox[2]) / 2.0, (bbox[1] + bbox[3]) / 2.0]) - area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0]) - return c, area - - def _get_keypoints_center_count(self, keypoints): - """Get geometric center of all keypoints""" - keypoint_x = np.sum(keypoints[:, 0, 0] * (keypoints[:, 0, 1] > 0)) - keypoint_y = np.sum(keypoints[:, 1, 0] * (keypoints[:, 1, 1] > 0)) - num = float(np.sum(keypoints[:, 0, 1])) - return np.array([keypoint_x / num, keypoint_y / num]), num +# ----------------------------------------------------- +# Copyright (c) Shanghai Jiao Tong University. All rights reserved. +# Written by Haoyi Zhu and Hao-Shu Fang +# ----------------------------------------------------- +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +"""Halpe Full-Body(136 points) Human keypoint dataset.""" +import os + +import numpy as np +from tkinter import _flatten + +from alphapose.models.builder import DATASET +from alphapose.utils.bbox import bbox_clip_xyxy, bbox_xywh_to_xyxy + +from .custom import CustomDataset + + +@DATASET.register_module +class coco_wholebody(CustomDataset): + """ Halpe Full-Body(136 points) Person dataset. + + Parameters + ---------- + train: bool, default is True + If true, will set as training mode. + skip_empty: bool, default is False + Whether skip entire image if no valid label is found. Use `False` if this dataset is + for validation to avoid COCO metric error. + dpg: bool, default is False + If true, will activate `dpg` for data augmentation. 
+ """ + CLASSES = ['person'] + EVAL_JOINTS = list(range(133)) + num_joints = 133 + CustomDataset.lower_body_ids = (11, 12, 13, 14, 15, 16, 17, 21-3, 22-3, 23-3, 24-3, 25-3) + """Joint pairs which defines the pairs of joint to be swapped + when the image is flipped horizontally.""" + joint_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16], #17 body keypoints + [20-3, 23-3], [21-3, 24-3], [22-3, 25-3], [26-3, 42-3], [27-3, 41-3], [28-3, 40-3], [29-3, 39-3], [30-3, 38-3], + [31-3, 37-3], [32-3, 36-3], [33-3, 35-3], [43-3, 52-3], [44-3, 51-3], [45-3, 50-3], [46-3, 49-3], [47-3, 48-3], + [62-3, 71-3], [63-3, 70-3], [64-3, 69-3], [65-3, 68-3], [66-3, 73-3], [67-3, 72-3], [57-3, 61-3], [58-3, 60-3], + [74-3, 80-3], [75-3, 79-3], [76-3, 78-3], [87-3, 89-3], [93-3, 91-3], [86-3, 90-3], [85-3, 81-3], [84-3, 82-3], + [94-3, 115-3], [95-3, 116-3], [96-3, 117-3], [97-3, 118-3], [98-3, 119-3], [99-3, 120-3], [100-3, 121-3], + [101-3, 122-3], [102-3, 123-3], [103-3, 124-3], [104-3, 125-3], [105-3, 126-3], [106-3, 127-3], [107-3, 128-3], + [108-3, 129-3], [109-3, 130-3], [110-3, 131-3], [111-3, 132-3], [112-3, 133-3], [113-3, 134-3], [114-3, 135-3]] + + + def _load_jsons(self): + """Load all image paths and labels from JSON annotation files into buffer.""" + items = [] + labels = [] + + _coco = self._lazy_load_ann_file() + + classes = [c['name'] for c in _coco.loadCats(_coco.getCatIds())] + assert classes == self.CLASSES, "Incompatible category names with COCO. " + + self.json_id_to_contiguous = { + v: k for k, v in enumerate(_coco.getCatIds())} + + # iterate through the annotations + image_ids = sorted(_coco.getImgIds()) + for entry in _coco.loadImgs(image_ids): + dirname, filename = entry['coco_url'].split('/')[-2:] + abs_path = os.path.join('/DATA1/Benchmark/coco', dirname, filename) + if not os.path.exists(abs_path): + raise IOError('Image: {} not exists.'.format(abs_path)) + label = self._check_load_keypoints(_coco, entry) + if not label: + continue + for obj in label: + items.append(abs_path) + labels.append(obj) + + return items, labels + + def _check_load_keypoints(self, coco, entry): + """Check and load ground-truth keypoints""" + ann_ids = coco.getAnnIds(imgIds=entry['id'], iscrowd=False) + objs = coco.loadAnns(ann_ids) + # check valid bboxes + valid_objs = [] + width = entry['width'] + height = entry['height'] + + for obj in objs: + #obj['keypoints'].extend([0,0,0, 0,0,0, 0,0,0]) + obj['keypoints'].extend(obj['foot_kpts']) + obj['keypoints'].extend(obj['face_kpts']) + obj['keypoints'].extend(obj['lefthand_kpts']) + obj['keypoints'].extend(obj['righthand_kpts']) + contiguous_cid = self.json_id_to_contiguous[obj['category_id']] + if contiguous_cid >= self.num_class: + # not class of interest + continue + if max(obj['keypoints']) == 0: + continue + # convert from (x, y, w, h) to (xmin, ymin, xmax, ymax) and clip bound + xmin, ymin, xmax, ymax = bbox_clip_xyxy(bbox_xywh_to_xyxy(obj['bbox']), width, height) + # require non-zero box area + #if obj['area'] <= 0 or xmax <= xmin or ymax <= ymin: + if (xmax-xmin)*(ymax-ymin) <= 0 or xmax <= xmin or ymax <= ymin: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + # joints 3d: (num_joints, 3, 2); 3 is for x, y, z; 2 is for position, visibility + joints_3d = np.zeros((self.num_joints, 3, 2), dtype=np.float32) + for i in range(self.num_joints): + joints_3d[i, 0, 0] = obj['keypoints'][i * 3 + 0] + joints_3d[i, 1, 0] = obj['keypoints'][i * 3 + 1] + # joints_3d[i, 2, 0] = 0 + if obj['keypoints'][i * 3 
+ 2] >= 0.35: + visible = 1 + else: + visible = 0 + #visible = min(1, visible) + joints_3d[i, :2, 1] = visible + # joints_3d[i, 2, 1] = 0 + + if np.sum(joints_3d[:, 0, 1]) < 1: + # no visible keypoint + continue + + if self._check_centers and self._train: + bbox_center, bbox_area = self._get_box_center_area((xmin, ymin, xmax, ymax)) + kp_center, num_vis = self._get_keypoints_center_count(joints_3d) + ks = np.exp(-2 * np.sum(np.square(bbox_center - kp_center)) / bbox_area) + if (num_vis / 80.0 + 47 / 80.0) > ks: + continue + + valid_objs.append({ + 'bbox': (xmin, ymin, xmax, ymax), + 'width': width, + 'height': height, + 'joints_3d': joints_3d + }) + + if not valid_objs: + if not self._skip_empty: + # dummy invalid labels if no valid objects are found + valid_objs.append({ + 'bbox': np.array([-1, -1, 0, 0]), + 'width': width, + 'height': height, + 'joints_3d': np.zeros((self.num_joints, 2, 2), dtype=np.float32) + }) + return valid_objs + + def _get_box_center_area(self, bbox): + """Get bbox center""" + c = np.array([(bbox[0] + bbox[2]) / 2.0, (bbox[1] + bbox[3]) / 2.0]) + area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0]) + return c, area + + def _get_keypoints_center_count(self, keypoints): + """Get geometric center of all keypoints""" + keypoint_x = np.sum(keypoints[:, 0, 0] * (keypoints[:, 0, 1] > 0)) + keypoint_y = np.sum(keypoints[:, 1, 0] * (keypoints[:, 1, 1] > 0)) + num = float(np.sum(keypoints[:, 0, 1])) + return np.array([keypoint_x / num, keypoint_y / num]), num diff --git a/cv/pose/alphapose/pytorch/alphapose/datasets/coco_wholebody_det.py b/cv/pose/alphapose/pytorch/alphapose/datasets/coco_wholebody_det.py index c27ea15b0..dd1d34986 100755 --- a/cv/pose/alphapose/pytorch/alphapose/datasets/coco_wholebody_det.py +++ b/cv/pose/alphapose/pytorch/alphapose/datasets/coco_wholebody_det.py @@ -1,114 +1,114 @@ -# ----------------------------------------------------- -# Copyright (c) Shanghai Jiao Tong University. All rights reserved. -# Written by Haoyi Zhu -# ----------------------------------------------------- -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -"""Haple_136 Human Detection Box dataset.""" -import json -import os - -import cv2 -import torch -import torch.utils.data as data -from tqdm import tqdm - -from alphapose.utils.presets import SimpleTransform -from detector.apis import get_detector -from alphapose.models.builder import DATASET - - -@DATASET.register_module -class coco_wholebody_det(data.Dataset): - """ Halpe_136 human detection box dataset. 
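The coco_wholebody loader above concatenates body, foot, face and hand keypoints into one 133-point list and marks a joint visible only when its score reaches 0.35. A minimal standalone sketch of that conversion follows; the random annotation is made up purely for illustration and is not part of the patched dataset code.

import numpy as np

num_joints = 133
# A flattened COCO-WholeBody style keypoint list: (x, y, score) per joint.
keypoints = np.random.rand(num_joints * 3).tolist()

# joints_3d mirrors the layout used in _check_load_keypoints:
# shape (num_joints, 3, 2) -> axis 1 is x/y/z, axis 2 is (position, visibility).
joints_3d = np.zeros((num_joints, 3, 2), dtype=np.float32)
for i in range(num_joints):
    joints_3d[i, 0, 0] = keypoints[i * 3 + 0]
    joints_3d[i, 1, 0] = keypoints[i * 3 + 1]
    visible = 1 if keypoints[i * 3 + 2] >= 0.35 else 0
    joints_3d[i, :2, 1] = visible

# Objects without a single visible joint are skipped by the loader.
has_visible = np.sum(joints_3d[:, 0, 1]) >= 1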
- - """ - EVAL_JOINTS = list(range(133)) - - def __init__(self, - det_file=None, - opt=None, - **cfg): - - self._cfg = cfg - self._opt = opt - self._preset_cfg = cfg['PRESET'] - self._root = cfg['ROOT'] - self._img_prefix = cfg['IMG_PREFIX'] - if not det_file: - det_file = cfg['DET_FILE'] - self._ann_file = os.path.join(self._root, cfg['ANN']) - - if os.path.exists(det_file): - print("Detection results exist, will use it") - else: - print("Will create detection results to {}".format(det_file)) - self.write_coco_json(det_file) - - assert os.path.exists(det_file), "Error: no detection results found" - with open(det_file, 'r') as fid: - self._det_json = json.load(fid) - - self._input_size = self._preset_cfg['IMAGE_SIZE'] - self._output_size = self._preset_cfg['HEATMAP_SIZE'] - - self._sigma = self._preset_cfg['SIGMA'] - - if self._preset_cfg['TYPE'] == 'simple': - self.transformation = SimpleTransform( - self, scale_factor=0, - input_size=self._input_size, - output_size=self._output_size, - rot=0, sigma=self._sigma, - train=False, add_dpg=False) - - def __getitem__(self, index): - det_res = self._det_json[index] - if not isinstance(det_res['image_id'], int): - img_id, _ = os.path.splitext(os.path.basename(det_res['image_id'])) - img_id = int(img_id) - else: - img_id = det_res['image_id'] - img_path = '/DATA1/Benchmark/coco/val2017/%012d.jpg' % img_id - - # Load image - image = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB) #scipy.misc.imread(img_path, mode='RGB') - - imght, imgwidth = image.shape[1], image.shape[2] - x1, y1, w, h = det_res['bbox'] - bbox = [x1, y1, x1 + w, y1 + h] - inp, bbox = self.transformation.test_transform(image, bbox) - return inp, torch.Tensor(bbox), torch.Tensor([det_res['bbox']]), torch.Tensor([det_res['image_id']]), torch.Tensor([det_res['score']]), torch.Tensor([imght]), torch.Tensor([imgwidth]) - - def __len__(self): - return len(self._det_json) - - def write_coco_json(self, det_file): - from pycocotools.coco import COCO - import pathlib - - _coco = COCO(self._ann_file) - image_ids = sorted(_coco.getImgIds()) - det_model = get_detector(self._opt) - dets = [] - for entry in tqdm(_coco.loadImgs(image_ids)): - abs_path = os.path.join( - '/DATA1/Benchmark/coco', self._img_prefix, entry['file_name']) - det = det_model.detect_one_img(abs_path) - if det: - dets += det - pathlib.Path(os.path.split(det_file)[0]).mkdir(parents=True, exist_ok=True) - json.dump(dets, open(det_file, 'w')) - - @property - def joint_pairs(self): - """Joint pairs which defines the pairs of joint to be swapped - when the image is flipped horizontally.""" - return [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16], #17 body keypoints - [20-3, 23-3], [21-3, 24-3], [22-3, 25-3], [26-3, 42-3], [27-3, 41-3], [28-3, 40-3], [29-3, 39-3], [30-3, 38-3], - [31-3, 37-3], [32-3, 36-3], [33-3, 35-3], [43-3, 52-3], [44-3, 51-3], [45-3, 50-3], [46-3, 49-3], [47-3, 48-3], - [62-3, 71-3], [63-3, 70-3], [64-3, 69-3], [65-3, 68-3], [66-3, 73-3], [67-3, 72-3], [57-3, 61-3], [58-3, 60-3], - [74-3, 80-3], [75-3, 79-3], [76-3, 78-3], [87-3, 89-3], [93-3, 91-3], [86-3, 90-3], [85-3, 81-3], [84-3, 82-3], - [94-3, 115-3], [95-3, 116-3], [96-3, 117-3], [97-3, 118-3], [98-3, 119-3], [99-3, 120-3], [100-3, 121-3], - [101-3, 122-3], [102-3, 123-3], [103-3, 124-3], [104-3, 125-3], [105-3, 126-3], [106-3, 127-3], [107-3, 128-3], - [108-3, 129-3], [109-3, 130-3], [110-3, 131-3], [111-3, 132-3], [112-3, 133-3], [113-3, 134-3], [114-3, 135-3]] +# ----------------------------------------------------- +# 
Copyright (c) Shanghai Jiao Tong University. All rights reserved. +# Written by Haoyi Zhu +# ----------------------------------------------------- +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +"""Haple_136 Human Detection Box dataset.""" +import json +import os + +import cv2 +import torch +import torch.utils.data as data +from tqdm import tqdm + +from alphapose.utils.presets import SimpleTransform +from detector.apis import get_detector +from alphapose.models.builder import DATASET + + +@DATASET.register_module +class coco_wholebody_det(data.Dataset): + """ Halpe_136 human detection box dataset. + + """ + EVAL_JOINTS = list(range(133)) + + def __init__(self, + det_file=None, + opt=None, + **cfg): + + self._cfg = cfg + self._opt = opt + self._preset_cfg = cfg['PRESET'] + self._root = cfg['ROOT'] + self._img_prefix = cfg['IMG_PREFIX'] + if not det_file: + det_file = cfg['DET_FILE'] + self._ann_file = os.path.join(self._root, cfg['ANN']) + + if os.path.exists(det_file): + print("Detection results exist, will use it") + else: + print("Will create detection results to {}".format(det_file)) + self.write_coco_json(det_file) + + assert os.path.exists(det_file), "Error: no detection results found" + with open(det_file, 'r') as fid: + self._det_json = json.load(fid) + + self._input_size = self._preset_cfg['IMAGE_SIZE'] + self._output_size = self._preset_cfg['HEATMAP_SIZE'] + + self._sigma = self._preset_cfg['SIGMA'] + + if self._preset_cfg['TYPE'] == 'simple': + self.transformation = SimpleTransform( + self, scale_factor=0, + input_size=self._input_size, + output_size=self._output_size, + rot=0, sigma=self._sigma, + train=False, add_dpg=False) + + def __getitem__(self, index): + det_res = self._det_json[index] + if not isinstance(det_res['image_id'], int): + img_id, _ = os.path.splitext(os.path.basename(det_res['image_id'])) + img_id = int(img_id) + else: + img_id = det_res['image_id'] + img_path = '/DATA1/Benchmark/coco/val2017/%012d.jpg' % img_id + + # Load image + image = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB) #scipy.misc.imread(img_path, mode='RGB') + + imght, imgwidth = image.shape[1], image.shape[2] + x1, y1, w, h = det_res['bbox'] + bbox = [x1, y1, x1 + w, y1 + h] + inp, bbox = self.transformation.test_transform(image, bbox) + return inp, torch.Tensor(bbox), torch.Tensor([det_res['bbox']]), torch.Tensor([det_res['image_id']]), torch.Tensor([det_res['score']]), torch.Tensor([imght]), torch.Tensor([imgwidth]) + + def __len__(self): + return len(self._det_json) + + def write_coco_json(self, det_file): + from pycocotools.coco import COCO + import pathlib + + _coco = COCO(self._ann_file) + image_ids = sorted(_coco.getImgIds()) + det_model = get_detector(self._opt) + dets = [] + for entry in tqdm(_coco.loadImgs(image_ids)): + abs_path = os.path.join( + '/DATA1/Benchmark/coco', self._img_prefix, entry['file_name']) + det = det_model.detect_one_img(abs_path) + if det: + dets += det + pathlib.Path(os.path.split(det_file)[0]).mkdir(parents=True, exist_ok=True) + json.dump(dets, open(det_file, 'w')) + + @property + def joint_pairs(self): + """Joint pairs which defines the pairs of joint to be swapped + when the image is flipped horizontally.""" + return [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16], #17 body keypoints + [20-3, 23-3], [21-3, 24-3], [22-3, 25-3], [26-3, 42-3], [27-3, 41-3], [28-3, 40-3], [29-3, 39-3], [30-3, 38-3], + [31-3, 37-3], [32-3, 36-3], [33-3, 35-3], [43-3, 52-3], [44-3, 51-3], [45-3, 50-3], 
[46-3, 49-3], [47-3, 48-3], + [62-3, 71-3], [63-3, 70-3], [64-3, 69-3], [65-3, 68-3], [66-3, 73-3], [67-3, 72-3], [57-3, 61-3], [58-3, 60-3], + [74-3, 80-3], [75-3, 79-3], [76-3, 78-3], [87-3, 89-3], [93-3, 91-3], [86-3, 90-3], [85-3, 81-3], [84-3, 82-3], + [94-3, 115-3], [95-3, 116-3], [96-3, 117-3], [97-3, 118-3], [98-3, 119-3], [99-3, 120-3], [100-3, 121-3], + [101-3, 122-3], [102-3, 123-3], [103-3, 124-3], [104-3, 125-3], [105-3, 126-3], [106-3, 127-3], [107-3, 128-3], + [108-3, 129-3], [109-3, 130-3], [110-3, 131-3], [111-3, 132-3], [112-3, 133-3], [113-3, 134-3], [114-3, 135-3]] diff --git a/cv/pose/alphapose/pytorch/alphapose/datasets/halpe_136_det.py b/cv/pose/alphapose/pytorch/alphapose/datasets/halpe_136_det.py index 07ee5b93a..bd871a7ca 100755 --- a/cv/pose/alphapose/pytorch/alphapose/datasets/halpe_136_det.py +++ b/cv/pose/alphapose/pytorch/alphapose/datasets/halpe_136_det.py @@ -1,114 +1,114 @@ -# ----------------------------------------------------- -# Copyright (c) Shanghai Jiao Tong University. All rights reserved. -# Written by Haoyi Zhu -# ----------------------------------------------------- -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -"""Haple_136 Human Detection Box dataset.""" -import json -import os - -import cv2 -import torch -import torch.utils.data as data -from tqdm import tqdm - -from alphapose.utils.presets import SimpleTransform -from detector.apis import get_detector -from alphapose.models.builder import DATASET - - -@DATASET.register_module -class Halpe_136_det(data.Dataset): - """ Halpe_136 human detection box dataset. - - """ - EVAL_JOINTS = list(range(136)) - - def __init__(self, - det_file=None, - opt=None, - **cfg): - - self._cfg = cfg - self._opt = opt - self._preset_cfg = cfg['PRESET'] - self._root = cfg['ROOT'] - self._img_prefix = cfg['IMG_PREFIX'] - if not det_file: - det_file = cfg['DET_FILE'] - self._ann_file = os.path.join(self._root, cfg['ANN']) - - if os.path.exists(det_file): - print("Detection results exist, will use it") - else: - print("Will create detection results to {}".format(det_file)) - self.write_coco_json(det_file) - - assert os.path.exists(det_file), "Error: no detection results found" - with open(det_file, 'r') as fid: - self._det_json = json.load(fid) - - self._input_size = self._preset_cfg['IMAGE_SIZE'] - self._output_size = self._preset_cfg['HEATMAP_SIZE'] - - self._sigma = self._preset_cfg['SIGMA'] - - if self._preset_cfg['TYPE'] == 'simple': - self.transformation = SimpleTransform( - self, scale_factor=0, - input_size=self._input_size, - output_size=self._output_size, - rot=0, sigma=self._sigma, - train=False, add_dpg=False) - - def __getitem__(self, index): - det_res = self._det_json[index] - if not isinstance(det_res['image_id'], int): - img_id, _ = os.path.splitext(os.path.basename(det_res['image_id'])) - img_id = int(img_id) - else: - img_id = det_res['image_id'] - img_path = '/DATA1/Benchmark/coco/val2017/%012d.jpg' % img_id - - # Load image - image = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB) #scipy.misc.imread(img_path, mode='RGB') - - imght, imgwidth = image.shape[1], image.shape[2] - x1, y1, w, h = det_res['bbox'] - bbox = [x1, y1, x1 + w, y1 + h] - inp, bbox = self.transformation.test_transform(image, bbox) - return inp, torch.Tensor(bbox), torch.Tensor([det_res['bbox']]), torch.Tensor([det_res['image_id']]), torch.Tensor([det_res['score']]), torch.Tensor([imght]), torch.Tensor([imgwidth]) - - def __len__(self): - return len(self._det_json) - - def 
write_coco_json(self, det_file): - from pycocotools.coco import COCO - import pathlib - - _coco = COCO(self._ann_file) - image_ids = sorted(_coco.getImgIds()) - det_model = get_detector(self._opt) - dets = [] - for entry in tqdm(_coco.loadImgs(image_ids)): - abs_path = os.path.join( - '/DATA1/Benchmark/coco', self._img_prefix, entry['file_name']) - det = det_model.detect_one_img(abs_path) - if det: - dets += det - pathlib.Path(os.path.split(det_file)[0]).mkdir(parents=True, exist_ok=True) - json.dump(dets, open(det_file, 'w')) - - @property - def joint_pairs(self): - """Joint pairs which defines the pairs of joint to be swapped - when the image is flipped horizontally.""" - return[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16], - [20, 21], [22, 23], [24, 25], [26, 42], [27, 41], [28, 40], [29, 39], [30, 38], - [31, 37], [32, 36], [33, 35], [43, 52], [44, 51], [45, 50],[46, 49], [47, 48], - [62, 71], [63, 70], [64, 69], [65, 68], [66, 73], [67, 72], [57, 61], [58, 60], - [74, 80], [75, 79], [76, 78], [87, 89], [93, 91], [86, 90], [85, 81], [84, 82], - [94, 115], [95, 116], [96, 117], [97, 118], [98, 119], [99, 120], [100, 121], - [101, 122], [102, 123], [103, 124], [104, 125], [105, 126], [106, 127], [107, 128], - [108, 129], [109, 130], [110, 131], [111, 132], [112, 133], [113, 134], [114, 135]] +# ----------------------------------------------------- +# Copyright (c) Shanghai Jiao Tong University. All rights reserved. +# Written by Haoyi Zhu +# ----------------------------------------------------- +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +"""Haple_136 Human Detection Box dataset.""" +import json +import os + +import cv2 +import torch +import torch.utils.data as data +from tqdm import tqdm + +from alphapose.utils.presets import SimpleTransform +from detector.apis import get_detector +from alphapose.models.builder import DATASET + + +@DATASET.register_module +class Halpe_136_det(data.Dataset): + """ Halpe_136 human detection box dataset. 
+ + """ + EVAL_JOINTS = list(range(136)) + + def __init__(self, + det_file=None, + opt=None, + **cfg): + + self._cfg = cfg + self._opt = opt + self._preset_cfg = cfg['PRESET'] + self._root = cfg['ROOT'] + self._img_prefix = cfg['IMG_PREFIX'] + if not det_file: + det_file = cfg['DET_FILE'] + self._ann_file = os.path.join(self._root, cfg['ANN']) + + if os.path.exists(det_file): + print("Detection results exist, will use it") + else: + print("Will create detection results to {}".format(det_file)) + self.write_coco_json(det_file) + + assert os.path.exists(det_file), "Error: no detection results found" + with open(det_file, 'r') as fid: + self._det_json = json.load(fid) + + self._input_size = self._preset_cfg['IMAGE_SIZE'] + self._output_size = self._preset_cfg['HEATMAP_SIZE'] + + self._sigma = self._preset_cfg['SIGMA'] + + if self._preset_cfg['TYPE'] == 'simple': + self.transformation = SimpleTransform( + self, scale_factor=0, + input_size=self._input_size, + output_size=self._output_size, + rot=0, sigma=self._sigma, + train=False, add_dpg=False) + + def __getitem__(self, index): + det_res = self._det_json[index] + if not isinstance(det_res['image_id'], int): + img_id, _ = os.path.splitext(os.path.basename(det_res['image_id'])) + img_id = int(img_id) + else: + img_id = det_res['image_id'] + img_path = '/DATA1/Benchmark/coco/val2017/%012d.jpg' % img_id + + # Load image + image = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB) #scipy.misc.imread(img_path, mode='RGB') + + imght, imgwidth = image.shape[1], image.shape[2] + x1, y1, w, h = det_res['bbox'] + bbox = [x1, y1, x1 + w, y1 + h] + inp, bbox = self.transformation.test_transform(image, bbox) + return inp, torch.Tensor(bbox), torch.Tensor([det_res['bbox']]), torch.Tensor([det_res['image_id']]), torch.Tensor([det_res['score']]), torch.Tensor([imght]), torch.Tensor([imgwidth]) + + def __len__(self): + return len(self._det_json) + + def write_coco_json(self, det_file): + from pycocotools.coco import COCO + import pathlib + + _coco = COCO(self._ann_file) + image_ids = sorted(_coco.getImgIds()) + det_model = get_detector(self._opt) + dets = [] + for entry in tqdm(_coco.loadImgs(image_ids)): + abs_path = os.path.join( + '/DATA1/Benchmark/coco', self._img_prefix, entry['file_name']) + det = det_model.detect_one_img(abs_path) + if det: + dets += det + pathlib.Path(os.path.split(det_file)[0]).mkdir(parents=True, exist_ok=True) + json.dump(dets, open(det_file, 'w')) + + @property + def joint_pairs(self): + """Joint pairs which defines the pairs of joint to be swapped + when the image is flipped horizontally.""" + return[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16], + [20, 21], [22, 23], [24, 25], [26, 42], [27, 41], [28, 40], [29, 39], [30, 38], + [31, 37], [32, 36], [33, 35], [43, 52], [44, 51], [45, 50],[46, 49], [47, 48], + [62, 71], [63, 70], [64, 69], [65, 68], [66, 73], [67, 72], [57, 61], [58, 60], + [74, 80], [75, 79], [76, 78], [87, 89], [93, 91], [86, 90], [85, 81], [84, 82], + [94, 115], [95, 116], [96, 117], [97, 118], [98, 119], [99, 120], [100, 121], + [101, 122], [102, 123], [103, 124], [104, 125], [105, 126], [106, 127], [107, 128], + [108, 129], [109, 130], [110, 131], [111, 132], [112, 133], [113, 134], [114, 135]] diff --git a/cv/pose/alphapose/pytorch/alphapose/datasets/halpe_26_det.py b/cv/pose/alphapose/pytorch/alphapose/datasets/halpe_26_det.py index 04a67c0b2..0c0af8b6c 100755 --- a/cv/pose/alphapose/pytorch/alphapose/datasets/halpe_26_det.py +++ 
b/cv/pose/alphapose/pytorch/alphapose/datasets/halpe_26_det.py @@ -1,108 +1,108 @@ -# ----------------------------------------------------- -# Copyright (c) Shanghai Jiao Tong University. All rights reserved. -# Written by HaoyiZhu -# ----------------------------------------------------- -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -"""Haple_26 Human Detection Box dataset.""" -import json -import os - -import cv2 -import torch -import torch.utils.data as data -from tqdm import tqdm - -from alphapose.utils.presets import SimpleTransform -from detector.apis import get_detector -from alphapose.models.builder import DATASET - - -@DATASET.register_module -class Halpe_26_det(data.Dataset): - """ Halpe_26 human detection box dataset. - - """ - EVAL_JOINTS = list(range(26)) - - def __init__(self, - det_file=None, - opt=None, - **cfg): - - self._cfg = cfg - self._opt = opt - self._preset_cfg = cfg['PRESET'] - self._root = cfg['ROOT'] - self._img_prefix = cfg['IMG_PREFIX'] - if not det_file: - det_file = cfg['DET_FILE'] - self._ann_file = os.path.join(self._root, cfg['ANN']) - - if os.path.exists(det_file): - print("Detection results exist, will use it") - else: - print("Will create detection results to {}".format(det_file)) - self.write_coco_json(det_file) - - assert os.path.exists(det_file), "Error: no detection results found" - with open(det_file, 'r') as fid: - self._det_json = json.load(fid) - - self._input_size = self._preset_cfg['IMAGE_SIZE'] - self._output_size = self._preset_cfg['HEATMAP_SIZE'] - - self._sigma = self._preset_cfg['SIGMA'] - - if self._preset_cfg['TYPE'] == 'simple': - self.transformation = SimpleTransform( - self, scale_factor=0, - input_size=self._input_size, - output_size=self._output_size, - rot=0, sigma=self._sigma, - train=False, add_dpg=False) - - def __getitem__(self, index): - det_res = self._det_json[index] - if not isinstance(det_res['image_id'], int): - img_id, _ = os.path.splitext(os.path.basename(det_res['image_id'])) - img_id = int(img_id) - else: - img_id = det_res['image_id'] - img_path = '/DATA1/Benchmark/coco/val2017/%012d.jpg' % img_id - - # Load image - image = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB) #scipy.misc.imread(img_path, mode='RGB') - - imght, imgwidth = image.shape[1], image.shape[2] - x1, y1, w, h = det_res['bbox'] - bbox = [x1, y1, x1 + w, y1 + h] - inp, bbox = self.transformation.test_transform(image, bbox) - return inp, torch.Tensor(bbox), torch.Tensor([det_res['bbox']]), torch.Tensor([det_res['image_id']]), torch.Tensor([det_res['score']]), torch.Tensor([imght]), torch.Tensor([imgwidth]) - - def __len__(self): - return len(self._det_json) - - def write_coco_json(self, det_file): - from pycocotools.coco import COCO - import pathlib - - _coco = COCO(self._ann_file) - image_ids = sorted(_coco.getImgIds()) - det_model = get_detector(self._opt) - dets = [] - for entry in tqdm(_coco.loadImgs(image_ids)): - abs_path = os.path.join( - '/DATA1/Benchmark/coco', self._img_prefix, entry['file_name']) - det = det_model.detect_one_img(abs_path) - if det: - dets += det - pathlib.Path(os.path.split(det_file)[0]).mkdir(parents=True, exist_ok=True) - json.dump(dets, open(det_file, 'w')) - - @property - def joint_pairs(self): - """Joint pairs which defines the pairs of joint to be swapped - when the image is flipped horizontally.""" - return[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16], - [20, 21], [22, 23], [24, 25]] +# 
----------------------------------------------------- +# Copyright (c) Shanghai Jiao Tong University. All rights reserved. +# Written by HaoyiZhu +# ----------------------------------------------------- +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +"""Haple_26 Human Detection Box dataset.""" +import json +import os + +import cv2 +import torch +import torch.utils.data as data +from tqdm import tqdm + +from alphapose.utils.presets import SimpleTransform +from detector.apis import get_detector +from alphapose.models.builder import DATASET + + +@DATASET.register_module +class Halpe_26_det(data.Dataset): + """ Halpe_26 human detection box dataset. + + """ + EVAL_JOINTS = list(range(26)) + + def __init__(self, + det_file=None, + opt=None, + **cfg): + + self._cfg = cfg + self._opt = opt + self._preset_cfg = cfg['PRESET'] + self._root = cfg['ROOT'] + self._img_prefix = cfg['IMG_PREFIX'] + if not det_file: + det_file = cfg['DET_FILE'] + self._ann_file = os.path.join(self._root, cfg['ANN']) + + if os.path.exists(det_file): + print("Detection results exist, will use it") + else: + print("Will create detection results to {}".format(det_file)) + self.write_coco_json(det_file) + + assert os.path.exists(det_file), "Error: no detection results found" + with open(det_file, 'r') as fid: + self._det_json = json.load(fid) + + self._input_size = self._preset_cfg['IMAGE_SIZE'] + self._output_size = self._preset_cfg['HEATMAP_SIZE'] + + self._sigma = self._preset_cfg['SIGMA'] + + if self._preset_cfg['TYPE'] == 'simple': + self.transformation = SimpleTransform( + self, scale_factor=0, + input_size=self._input_size, + output_size=self._output_size, + rot=0, sigma=self._sigma, + train=False, add_dpg=False) + + def __getitem__(self, index): + det_res = self._det_json[index] + if not isinstance(det_res['image_id'], int): + img_id, _ = os.path.splitext(os.path.basename(det_res['image_id'])) + img_id = int(img_id) + else: + img_id = det_res['image_id'] + img_path = '/DATA1/Benchmark/coco/val2017/%012d.jpg' % img_id + + # Load image + image = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB) #scipy.misc.imread(img_path, mode='RGB') + + imght, imgwidth = image.shape[1], image.shape[2] + x1, y1, w, h = det_res['bbox'] + bbox = [x1, y1, x1 + w, y1 + h] + inp, bbox = self.transformation.test_transform(image, bbox) + return inp, torch.Tensor(bbox), torch.Tensor([det_res['bbox']]), torch.Tensor([det_res['image_id']]), torch.Tensor([det_res['score']]), torch.Tensor([imght]), torch.Tensor([imgwidth]) + + def __len__(self): + return len(self._det_json) + + def write_coco_json(self, det_file): + from pycocotools.coco import COCO + import pathlib + + _coco = COCO(self._ann_file) + image_ids = sorted(_coco.getImgIds()) + det_model = get_detector(self._opt) + dets = [] + for entry in tqdm(_coco.loadImgs(image_ids)): + abs_path = os.path.join( + '/DATA1/Benchmark/coco', self._img_prefix, entry['file_name']) + det = det_model.detect_one_img(abs_path) + if det: + dets += det + pathlib.Path(os.path.split(det_file)[0]).mkdir(parents=True, exist_ok=True) + json.dump(dets, open(det_file, 'w')) + + @property + def joint_pairs(self): + """Joint pairs which defines the pairs of joint to be swapped + when the image is flipped horizontally.""" + return[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16], + [20, 21], [22, 23], [24, 25]] diff --git a/cv/pose/alphapose/pytorch/alphapose/models/hardnet.py b/cv/pose/alphapose/pytorch/alphapose/models/hardnet.py index 
97a269bd6..449a7940f 100755 --- a/cv/pose/alphapose/pytorch/alphapose/models/hardnet.py +++ b/cv/pose/alphapose/pytorch/alphapose/models/hardnet.py @@ -1,570 +1,570 @@ -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import collections -import numpy as np - -import torch -from torch import nn -import torch.nn.functional as F -import torch.nn.init as init - - -from .builder import SPPE -from .layers.Resnet import ResNet -from .layers.SE_Resnet import SEResnet - -BN_MOMENTUM = 0.1 -DEBUG = False - -def conv3x3(in_planes, out_planes, stride=1): - "3x3 convolution with padding" - return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, - padding=1, bias=False) - - -class Identity(nn.Module): - def __init__(self): - super(Identity, self).__init__() - - def forward(self, x): - return x - - -class Flatten(nn.Module): - def __init__(self): - super().__init__() - def forward(self, x): - return x.view(x.data.size(0),-1) - - -class CombConvLayer(nn.Sequential): - def __init__(self, in_channels, out_channels, norm_layer, kernel=1, stride=1, dropout=0.1, bias=False): - super().__init__() - self.add_module('layer1',ConvLayer(in_channels, out_channels, kernel)) - self.add_module('layer2',DWConvLayer(out_channels, out_channels, norm_layer, stride=stride)) - - def forward(self, x): - return super().forward(x) - - -class DWConvLayer(nn.Sequential): - def __init__(self, in_channels, out_channels, norm_layer, stride=1, bias=False): - super().__init__() - out_ch = out_channels - - groups = in_channels - kernel = 3 - if DEBUG: - print(kernel, 'x', kernel, 'x', out_channels, 'x', out_channels, 'DepthWise') - - self.add_module('dwconv', nn.Conv2d(groups, groups, kernel_size=3, - stride=stride, padding=1, groups=groups, bias=bias)) - - self.add_module('norm', norm_layer(groups, momentum=BN_MOMENTUM)) - def forward(self, x): - return super().forward(x) - - -class ConvLayer(nn.Sequential): - def __init__(self, in_channels, out_channels, norm_layer, kernel=3, stride=1, padding=0, bias=False): - super().__init__() - self.out_channels = out_channels - out_ch = out_channels - groups = 1 - if DEBUG: - print(kernel, 'x', kernel, 'x', in_channels, 'x', out_channels) - pad = kernel//2 if padding == 0 else padding - self.add_module('conv', nn.Conv2d(in_channels, out_ch, kernel_size=kernel, - stride=stride, padding=pad, groups=groups, bias=bias)) - self.add_module('norm', norm_layer(out_ch, momentum=BN_MOMENTUM)) - self.add_module('relu', nn.ReLU(True)) - def forward(self, x): - return super().forward(x) - - -class BRLayer(nn.Sequential): - def __init__(self, in_channels, norm_layer): - super().__init__() - - self.add_module('norm', norm_layer(in_channels)) - self.add_module('relu', nn.ReLU(True)) - def forward(self, x): - return super().forward(x) - - -class HarDBlock(nn.Module): - def get_link(self, layer, base_ch, growth_rate, grmul): - if layer == 0: - return base_ch, 0, [] - out_channels = growth_rate - link = [] - for i in range(10): - dv = 2 ** i - if layer % dv == 0: - k = layer - dv - link.append(k) - if i > 0: - out_channels *= grmul - out_channels = int(int(out_channels + 1) / 2) * 2 - in_channels = 0 - for i in link: - ch,_,_ = self.get_link(i, base_ch, growth_rate, grmul) - in_channels += ch - return out_channels, in_channels, link - - def get_out_ch(self): - return self.out_channels - - def __init__(self, in_channels, growth_rate, 
grmul, n_layers, norm_layer, keepBase=False, residual_out=False, dwconv=False): - super().__init__() - self.in_channels = in_channels - self.growth_rate = growth_rate - self.grmul = grmul - self.n_layers = n_layers - self.norm_layer = norm_layer - self.keepBase = keepBase - self.links = [] - layers_ = [] - self.out_channels = 0 - - for i in range(n_layers): - outch, inch, link = self.get_link(i+1, in_channels, growth_rate, grmul) - self.links.append(link) - use_relu = residual_out - if dwconv: - layers_.append(CombConvLayer(inch, outch, norm_layer)) - else: - layers_.append(ConvLayer(inch, outch, norm_layer)) - - if (i % 2 == 0) or (i == n_layers - 1): - self.out_channels += outch - if DEBUG: - print("Blk out =",self.out_channels) - self.layers = nn.ModuleList(layers_) - - def forward(self, x): - layers_ = [x] - for layer in range(len(self.layers)): - link = self.links[layer] - tin = [] - for i in link: - tin.append(layers_[i]) - if len(tin) > 1: - x = torch.cat(tin, 1) - else: - x = tin[0] - out = self.layers[layer](x) - layers_.append(out) - t = len(layers_) - out_ = [] - for i in range(t): - if (i == 0 and self.keepBase) or \ - (i == t-1) or (i%2 == 1): - out_.append(layers_[i]) - out = torch.cat(out_, 1) - return out - - -class HarDBlock_v2(nn.Module): - def get_link(self, layer, base_ch, growth_rate, grmul): - if layer == 0: - return base_ch, 0, [] - out_channels = growth_rate - link = [] - for i in range(10): - dv = 2 ** i - if layer % dv == 0: - k = layer - dv - link.insert(0, k) - if i > 0: - out_channels *= grmul - out_channels = int(int(out_channels + 1) / 2) * 2 - in_channels = 0 - for i in link: - ch,_,_ = self.get_link(i, base_ch, growth_rate, grmul) - in_channels += ch - return out_channels, in_channels, link - - def get_out_ch(self): - return self.out_channels - - def __init__(self, in_channels, growth_rate, grmul, n_layers, norm_layer, dwconv=False): - super().__init__() - self.links = [] - conv_layers_ = [] - bnrelu_layers_ = [] - self.layer_bias = [] - self.out_channels = 0 - self.norm_layer = norm_layer - self.out_partition = collections.defaultdict(list) - - for i in range(n_layers): - outch, inch, link = self.get_link(i+1, in_channels, growth_rate, grmul) - self.links.append(link) - for j in link: - self.out_partition[j].append(outch) - - cur_ch = in_channels - for i in range(n_layers): - accum_out_ch = sum( self.out_partition[i] ) - real_out_ch = self.out_partition[i][0] - conv_layers_.append( nn.Conv2d(cur_ch, accum_out_ch, kernel_size=3, stride=1, padding=1, bias=True) ) - bnrelu_layers_.append( BRLayer(real_out_ch, norm_layer) ) - cur_ch = real_out_ch - if (i % 2 == 0) or (i == n_layers - 1): - self.out_channels += real_out_ch - self.conv_layers = nn.ModuleList(conv_layers_) - self.bnrelu_layers = nn.ModuleList(bnrelu_layers_) - - def transform(self, blk, trt=False): - # Transform weight matrix from a pretrained HarDBlock v1 - in_ch = blk.layers[0][0].weight.shape[1] - for i in range(len(self.conv_layers)): - link = self.links[i].copy() - link_ch = [blk.layers[k-1][0].weight.shape[0] if k > 0 else - blk.layers[0 ][0].weight.shape[1] for k in link] - part = self.out_partition[i] - w_src = blk.layers[i][0].weight - b_src = blk.layers[i][0].bias - - - self.conv_layers[i].weight[0:part[0], :, :,:] = w_src[:, 0:in_ch, :,:] - self.layer_bias.append(b_src) - #if b_src is not None: - # self.layer_bias[i] = b_src.view(1,-1,1,1) - if b_src is not None: - if trt: - self.conv_layers[i].bias[1:part[0]] = b_src[1:] - self.conv_layers[i].bias[0] = b_src[0] - 
self.conv_layers[i].bias[part[0]:] = 0 - self.layer_bias[i] = None - else: - #for pytorch, add bias with standalone tensor is more efficient than within conv.bias - #this is because the amount of non-zero bias is small, - #but if we use conv.bias, the number of bias will be much larger - self.conv_layers[i].bias = None - else: - self.conv_layers[i].bias = None - - - in_ch = part[0] - link_ch.reverse() - link.reverse() - if len(link) > 1: - for j in range(1, len(link) ): - ly = link[j] - part_id = self.out_partition[ly].index(part[0]) - chos = sum( self.out_partition[ly][0:part_id] ) - choe = chos + part[0] - chis = sum( link_ch[0:j] ) - chie = chis + link_ch[j] - self.conv_layers[ly].weight[chos:choe, :,:,:] = w_src[:, chis:chie,:,:] - - #update BatchNorm or remove it if there is no BatchNorm in the v1 block - self.bnrelu_layers[i] = None - if isinstance(blk.layers[i][1], self.norm_layer): - self.bnrelu_layers[i] = nn.Sequential( - blk.layers[i][1], - blk.layers[i][2]) - else: - self.bnrelu_layers[i] = blk.layers[i][1] - - def forward(self, x): - layers_ = [] - outs_ = [] - xin = x - for i in range(len(self.conv_layers)): - link = self.links[i] - part = self.out_partition[i] - - xout = self.conv_layers[i](xin) - layers_.append(xout) - - xin = xout[:,0:part[0],:,:] if len(part) > 1 else xout - if self.layer_bias[i] is not None: - xin += self.layer_bias[i].view(1,-1,1,1) - - if len(link) > 1: - for j in range( len(link) - 1 ): - ly = link[j] - part_id = self.out_partition[ly].index(part[0]) - chs = sum( self.out_partition[ly][0:part_id] ) - che = chs + part[0] - - xin += layers_[ly][:,chs:che,:,:] - - xin = self.bnrelu_layers[i](xin) - - if i%2 == 0 or i == len(self.conv_layers)-1: - outs_.append(xin) - - out = torch.cat(outs_, 1) - return out - - -class HarDNetBase(nn.Module): - def __init__(self, arch, norm_layer, depth_wise=False): - super().__init__() - if arch == 85: - first_ch = [48, 96] - second_kernel = 3 - - ch_list = [ 192, 256, 320, 480, 720] - grmul = 1.7 - gr = [ 24, 24, 28, 36, 48] - n_layers = [ 8, 16, 16, 16, 16] - elif arch == 68: - first_ch = [32, 64] - second_kernel = 3 - - ch_list = [ 128, 256, 320, 640] - grmul = 1.7 - gr = [ 14, 16, 20, 40] - n_layers = [ 8, 16, 16, 16] - else: - print("Error: HarDNet",arch," has no implementation.") - exit() - - blks = len(n_layers) - self.base = nn.ModuleList([]) - - # First Layer: Standard Conv3x3, Stride=2 - self.base.append ( - ConvLayer(in_channels=3, out_channels=first_ch[0], norm_layer=norm_layer, kernel=3, - stride=2, bias=False) ) - - # Second Layer - self.base.append ( ConvLayer(first_ch[0], first_ch[1], norm_layer, kernel=second_kernel) ) - - # Maxpooling or DWConv3x3 downsampling - self.base.append(nn.AvgPool2d(kernel_size=3, stride=2, padding=1)) - - # Build all HarDNet blocks - ch = first_ch[1] - for i in range(blks): - blk = HarDBlock(ch, gr[i], grmul, n_layers[i], norm_layer, dwconv=depth_wise) - ch = blk.get_out_ch() - self.base.append ( blk ) - - if i != blks-1: - self.base.append ( ConvLayer(ch, ch_list[i], norm_layer, kernel=1) ) - ch = ch_list[i] - if i== 0: - self.base.append(nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True)) - elif i != blks-1 and i != 1 and i != 3: - self.base.append(nn.AvgPool2d(kernel_size=2, stride=2)) - - -def fill_fc_weights(layers): - for m in layers.modules(): - if isinstance(m, nn.Conv2d): - if m.weight is not None: - init.kaiming_uniform_(m.weight, nonlinearity='relu') - if m.bias is not None: - nn.init.constant_(m.bias, 0) - - -def weights_init(m): - for key in m.state_dict(): - if 
key.split('.')[-1] == 'weight': - if 'conv' in key: - init.kaiming_uniform_(m.state_dict()[key], nonlinearity='relu') - if 'bn' in key: - m.state_dict()[key][...] = 1 - elif key.split('.')[-1] == 'bias': - m.state_dict()[key][...] = 0 - - -class TransitionUp(nn.Module): - def __init__(self, in_channels, out_channels): - super().__init__() - - def forward(self, x, skip, concat=True): - out = F.interpolate( - x, - size=(skip.size(2), skip.size(3)), - mode="bilinear", - align_corners=True) - if concat: - out = torch.cat([out, skip], 1) - return out - -@SPPE.register_module -class HarDNetPose(nn.Module): - def __init__(self, norm_layer=nn.BatchNorm2d, **cfg): - super(HarDNetPose, self).__init__() - assert cfg['DOWN_RATIO'] in [2, 4, 8, 16] - self.norm_layer = norm_layer - self._preset_cfg = cfg['PRESET'] - self.first_level = int(np.log2(cfg['DOWN_RATIO']))-1 - self.trt = cfg['TRT'] - - self.base = HarDNetBase(cfg['NUM_LAYERS'], norm_layer).base - self.last_pool = nn.AvgPool2d(kernel_size=2, stride=2) - - if cfg['NUM_LAYERS'] == 85: - self.last_proj = ConvLayer(784, 256, norm_layer, kernel=1) - self.last_blk = HarDBlock(768, 80, 1.7, 8, norm_layer) - self.skip_nodes = [1,3,8,13] - self.SC = [32, 32, 0] - gr = [64, 48, 28] - layers = [8, 8, 4] - ch_list2 = [224 + self.SC[0], 160 + self.SC[1], 96 + self.SC[2]] - channels = [96, 214, 458, 784] - self.skip_lv = 3 - scales = [2 ** i for i in range(len(channels[self.first_level:]))] - - elif cfg['NUM_LAYERS'] == 68: - self.last_proj = ConvLayer(654, 192, norm_layer, kernel=1) - self.last_blk = HarDBlock(576, 72, 1.7, 8, norm_layer) - self.skip_nodes = [1,3,8,11] - self.SC = [32, 32, 0 ] - gr = [48, 32, 20] - layers = [8, 8, 4] - ch_list2 = [224+self.SC[0], 96+self.SC[1], 64+self.SC[2]] - channels = [64, 124, 328, 654] - self.skip_lv = 2 - scales = [2 ** i for i in range(len(channels[self.first_level:]))] - - - - self.transUpBlocks = nn.ModuleList([]) - self.denseBlocksUp = nn.ModuleList([]) - self.conv1x1_up = nn.ModuleList([]) - self.avg9x9 = nn.AvgPool2d(kernel_size=(9,9), stride=1, padding=(4,4)) - prev_ch = self.last_blk.get_out_ch() - - for i in range(3): - skip_ch = channels[3-i] - self.transUpBlocks.append(TransitionUp(prev_ch, prev_ch)) - if i < self.skip_lv: - cur_ch = prev_ch + skip_ch - else: - cur_ch = prev_ch - self.conv1x1_up.append(ConvLayer(cur_ch, ch_list2[i], norm_layer, kernel=1)) - cur_ch = ch_list2[i] - cur_ch -= self.SC[i] - cur_ch *= 3 - - blk = HarDBlock(cur_ch, gr[i], 1.7, layers[i], norm_layer) - - self.denseBlocksUp.append(blk) - prev_ch = blk.get_out_ch() - - prev_ch += self.SC[0] + self.SC[1] + self.SC[2] - - weights_init(self.denseBlocksUp) - weights_init(self.conv1x1_up) - weights_init(self.last_blk) - weights_init(self.last_proj) - - out_channel = self._preset_cfg['NUM_JOINTS'] - - ch = max(128, out_channel*4) - self.conv_out = nn.Sequential( - nn.Conv2d(prev_ch, ch, - kernel_size=3, padding=1, bias=True), - nn.ReLU(inplace=True), - nn.Conv2d(ch, out_channel, - kernel_size=cfg['FINAL_CONV_KERNEL'], stride=1, - padding=cfg['FINAL_CONV_KERNEL'] // 2, bias=True)) - fill_fc_weights(self.conv_out) - self.conv_out[-1].bias.data.fill_(-2.19) - - - def v2_transform(self): - print('Transform HarDBlock v2..') - for i in range( len(self.base)): - if isinstance(self.base[i], HarDBlock): - blk = self.base[i] - self.base[i] = HarDBlock_v2(blk.in_channels, blk.growth_rate, blk.grmul, blk.n_layers, blk.norm_layer) - self.base[i].transform(blk, self.trt) - blk = self.last_blk - self.last_blk = HarDBlock_v2(blk.in_channels, 
blk.growth_rate, blk.grmul, blk.n_layers, blk.norm_layer) - self.last_blk.transform(blk, self.trt) - for i in range(3): - blk = self.denseBlocksUp[i] - self.denseBlocksUp[i] = HarDBlock_v2(blk.in_channels, blk.growth_rate, blk.grmul, blk.n_layers, blk.norm_layer) - self.denseBlocksUp[i].transform(blk, self.trt) - - def forward(self, x): - xs = [] - x_sc = [] - - for i in range(len(self.base)): - x = self.base[i](x) - if i in self.skip_nodes: - xs.append(x) - - x = self.last_proj(x) - x = self.last_pool(x) - x2 = self.avg9x9(x) - x3 = x/(x.sum((2,3),keepdim=True) + 0.1) - x = torch.cat([x,x2,x3],1) - x = self.last_blk(x) - - for i in range(3): - skip_x = xs[3-i] - x = self.transUpBlocks[i](x, skip_x, (i<self.skip_lv)) - x = self.conv1x1_up[i](x) - if self.SC[i] > 0: - end = x.shape[1] - x_sc.append( x[:,end-self.SC[i]:,:,:].contiguous() ) - x = x[:,:end-self.SC[i],:,:].contiguous() - x2 = self.avg9x9(x) - x3 = x/(x.sum((2,3),keepdim=True) + 0.1) - x = torch.cat([x,x2,x3],1) - x = self.denseBlocksUp[i](x) - - scs = [x] - for i in range(3): - if self.SC[i] > 0: - scs.insert(0, F.interpolate( - x_sc[i], size=(x.size(2), x.size(3)), - mode="bilinear", align_corners=True) ) - x = torch.cat(scs,1) - x = self.conv_out(x) - return x - - def _initialize(self, pretrained=''): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - nn.init.normal_(m.weight, std=0.001) - for name, _ in m.named_parameters(): - if name in ['bias']: - nn.init.constant_(m.bias, 0) - elif isinstance(m, self.norm_layer): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.ConvTranspose2d): - nn.init.normal_(m.weight, std=0.001) - for name, _ in m.named_parameters(): - if name in ['bias']: - nn.init.constant_(m.bias, 0) - - if os.path.isfile(pretrained): - pretrained_state_dict = torch.load(pretrained) - - need_init_state_dict = {} - for name, m in pretrained_state_dict.items(): - if name.split('.')[0] in self.pretrained_layers \ - or self.pretrained_layers[0] == '*': - need_init_state_dict[name] = m - self.load_state_dict(need_init_state_dict, strict=False) - elif pretrained: - raise ValueError('{} is not exist!'.format(pretrained)) - - -def get_pose_net(cfg, is_train, **kwargs): - model = HarDNetPose(cfg, **kwargs) - if is_train and cfg.MODEL.INIT_WEIGHTS: - model._initialize(cfg.MODEL.INIT_WEIGHTS) - total_params = sum(p.numel() for p in model.parameters()) - print( "Parameters=", total_params ) - return model +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved.
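For reference, the connectivity that HarDBlock.get_link computes above can be tabulated on its own: layer k is fed by layer k - 2**i for every power of two that divides k, and its output width grows by grmul per extra link level, rounded to an even channel count. The helper below re-implements only that rule as an illustration; it is not part of the patched model.

def hard_block_links(n_layers, base_ch, growth_rate, grmul):
    """Return (out_channels, input_links) per layer, following HarDBlock.get_link."""
    def get_link(layer):
        if layer == 0:
            return base_ch, []
        out_channels = growth_rate
        link = []
        for i in range(10):
            dv = 2 ** i
            if layer % dv == 0:
                link.append(layer - dv)
                if i > 0:
                    out_channels *= grmul
        out_channels = int(int(out_channels + 1) / 2) * 2  # round to an even width
        return out_channels, link

    return [get_link(k) for k in range(1, n_layers + 1)]

# Example: an 8-layer block with growth rate 14 and grmul 1.7 (HarDNet-68 style numbers).
for k, (ch, link) in enumerate(hard_block_links(8, 64, 14, 1.7), start=1):
    print(f"layer {k}: {ch} channels, fed by layers {link}")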
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import collections +import numpy as np + +import torch +from torch import nn +import torch.nn.functional as F +import torch.nn.init as init + + +from .builder import SPPE +from .layers.Resnet import ResNet +from .layers.SE_Resnet import SEResnet + +BN_MOMENTUM = 0.1 +DEBUG = False + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class Identity(nn.Module): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +class Flatten(nn.Module): + def __init__(self): + super().__init__() + def forward(self, x): + return x.view(x.data.size(0),-1) + + +class CombConvLayer(nn.Sequential): + def __init__(self, in_channels, out_channels, norm_layer, kernel=1, stride=1, dropout=0.1, bias=False): + super().__init__() + self.add_module('layer1',ConvLayer(in_channels, out_channels, kernel)) + self.add_module('layer2',DWConvLayer(out_channels, out_channels, norm_layer, stride=stride)) + + def forward(self, x): + return super().forward(x) + + +class DWConvLayer(nn.Sequential): + def __init__(self, in_channels, out_channels, norm_layer, stride=1, bias=False): + super().__init__() + out_ch = out_channels + + groups = in_channels + kernel = 3 + if DEBUG: + print(kernel, 'x', kernel, 'x', out_channels, 'x', out_channels, 'DepthWise') + + self.add_module('dwconv', nn.Conv2d(groups, groups, kernel_size=3, + stride=stride, padding=1, groups=groups, bias=bias)) + + self.add_module('norm', norm_layer(groups, momentum=BN_MOMENTUM)) + def forward(self, x): + return super().forward(x) + + +class ConvLayer(nn.Sequential): + def __init__(self, in_channels, out_channels, norm_layer, kernel=3, stride=1, padding=0, bias=False): + super().__init__() + self.out_channels = out_channels + out_ch = out_channels + groups = 1 + if DEBUG: + print(kernel, 'x', kernel, 'x', in_channels, 'x', out_channels) + pad = kernel//2 if padding == 0 else padding + self.add_module('conv', nn.Conv2d(in_channels, out_ch, kernel_size=kernel, + stride=stride, padding=pad, groups=groups, bias=bias)) + self.add_module('norm', norm_layer(out_ch, momentum=BN_MOMENTUM)) + self.add_module('relu', nn.ReLU(True)) + def forward(self, x): + return super().forward(x) + + +class BRLayer(nn.Sequential): + def __init__(self, in_channels, norm_layer): + super().__init__() + + self.add_module('norm', norm_layer(in_channels)) + self.add_module('relu', nn.ReLU(True)) + def forward(self, x): + return super().forward(x) + + +class HarDBlock(nn.Module): + def get_link(self, layer, base_ch, growth_rate, grmul): + if layer == 0: + return base_ch, 0, [] + out_channels = growth_rate + link = [] + for i in range(10): + dv = 2 ** i + if layer % dv == 0: + k = layer - dv + link.append(k) + if i > 0: + out_channels *= grmul + out_channels = int(int(out_channels + 1) / 2) * 2 + in_channels = 0 + for i in link: + ch,_,_ = self.get_link(i, base_ch, growth_rate, grmul) + in_channels += ch + return out_channels, in_channels, link + + def get_out_ch(self): + return self.out_channels + + def __init__(self, in_channels, growth_rate, grmul, n_layers, norm_layer, keepBase=False, residual_out=False, dwconv=False): + super().__init__() + self.in_channels = in_channels + self.growth_rate = growth_rate + self.grmul = grmul + self.n_layers = n_layers + self.norm_layer = norm_layer + self.keepBase 
= keepBase + self.links = [] + layers_ = [] + self.out_channels = 0 + + for i in range(n_layers): + outch, inch, link = self.get_link(i+1, in_channels, growth_rate, grmul) + self.links.append(link) + use_relu = residual_out + if dwconv: + layers_.append(CombConvLayer(inch, outch, norm_layer)) + else: + layers_.append(ConvLayer(inch, outch, norm_layer)) + + if (i % 2 == 0) or (i == n_layers - 1): + self.out_channels += outch + if DEBUG: + print("Blk out =",self.out_channels) + self.layers = nn.ModuleList(layers_) + + def forward(self, x): + layers_ = [x] + for layer in range(len(self.layers)): + link = self.links[layer] + tin = [] + for i in link: + tin.append(layers_[i]) + if len(tin) > 1: + x = torch.cat(tin, 1) + else: + x = tin[0] + out = self.layers[layer](x) + layers_.append(out) + t = len(layers_) + out_ = [] + for i in range(t): + if (i == 0 and self.keepBase) or \ + (i == t-1) or (i%2 == 1): + out_.append(layers_[i]) + out = torch.cat(out_, 1) + return out + + +class HarDBlock_v2(nn.Module): + def get_link(self, layer, base_ch, growth_rate, grmul): + if layer == 0: + return base_ch, 0, [] + out_channels = growth_rate + link = [] + for i in range(10): + dv = 2 ** i + if layer % dv == 0: + k = layer - dv + link.insert(0, k) + if i > 0: + out_channels *= grmul + out_channels = int(int(out_channels + 1) / 2) * 2 + in_channels = 0 + for i in link: + ch,_,_ = self.get_link(i, base_ch, growth_rate, grmul) + in_channels += ch + return out_channels, in_channels, link + + def get_out_ch(self): + return self.out_channels + + def __init__(self, in_channels, growth_rate, grmul, n_layers, norm_layer, dwconv=False): + super().__init__() + self.links = [] + conv_layers_ = [] + bnrelu_layers_ = [] + self.layer_bias = [] + self.out_channels = 0 + self.norm_layer = norm_layer + self.out_partition = collections.defaultdict(list) + + for i in range(n_layers): + outch, inch, link = self.get_link(i+1, in_channels, growth_rate, grmul) + self.links.append(link) + for j in link: + self.out_partition[j].append(outch) + + cur_ch = in_channels + for i in range(n_layers): + accum_out_ch = sum( self.out_partition[i] ) + real_out_ch = self.out_partition[i][0] + conv_layers_.append( nn.Conv2d(cur_ch, accum_out_ch, kernel_size=3, stride=1, padding=1, bias=True) ) + bnrelu_layers_.append( BRLayer(real_out_ch, norm_layer) ) + cur_ch = real_out_ch + if (i % 2 == 0) or (i == n_layers - 1): + self.out_channels += real_out_ch + self.conv_layers = nn.ModuleList(conv_layers_) + self.bnrelu_layers = nn.ModuleList(bnrelu_layers_) + + def transform(self, blk, trt=False): + # Transform weight matrix from a pretrained HarDBlock v1 + in_ch = blk.layers[0][0].weight.shape[1] + for i in range(len(self.conv_layers)): + link = self.links[i].copy() + link_ch = [blk.layers[k-1][0].weight.shape[0] if k > 0 else + blk.layers[0 ][0].weight.shape[1] for k in link] + part = self.out_partition[i] + w_src = blk.layers[i][0].weight + b_src = blk.layers[i][0].bias + + + self.conv_layers[i].weight[0:part[0], :, :,:] = w_src[:, 0:in_ch, :,:] + self.layer_bias.append(b_src) + #if b_src is not None: + # self.layer_bias[i] = b_src.view(1,-1,1,1) + if b_src is not None: + if trt: + self.conv_layers[i].bias[1:part[0]] = b_src[1:] + self.conv_layers[i].bias[0] = b_src[0] + self.conv_layers[i].bias[part[0]:] = 0 + self.layer_bias[i] = None + else: + #for pytorch, add bias with standalone tensor is more efficient than within conv.bias + #this is because the amount of non-zero bias is small, + #but if we use conv.bias, the number of bias will be much 
larger + self.conv_layers[i].bias = None + else: + self.conv_layers[i].bias = None + + + in_ch = part[0] + link_ch.reverse() + link.reverse() + if len(link) > 1: + for j in range(1, len(link) ): + ly = link[j] + part_id = self.out_partition[ly].index(part[0]) + chos = sum( self.out_partition[ly][0:part_id] ) + choe = chos + part[0] + chis = sum( link_ch[0:j] ) + chie = chis + link_ch[j] + self.conv_layers[ly].weight[chos:choe, :,:,:] = w_src[:, chis:chie,:,:] + + #update BatchNorm or remove it if there is no BatchNorm in the v1 block + self.bnrelu_layers[i] = None + if isinstance(blk.layers[i][1], self.norm_layer): + self.bnrelu_layers[i] = nn.Sequential( + blk.layers[i][1], + blk.layers[i][2]) + else: + self.bnrelu_layers[i] = blk.layers[i][1] + + def forward(self, x): + layers_ = [] + outs_ = [] + xin = x + for i in range(len(self.conv_layers)): + link = self.links[i] + part = self.out_partition[i] + + xout = self.conv_layers[i](xin) + layers_.append(xout) + + xin = xout[:,0:part[0],:,:] if len(part) > 1 else xout + if self.layer_bias[i] is not None: + xin += self.layer_bias[i].view(1,-1,1,1) + + if len(link) > 1: + for j in range( len(link) - 1 ): + ly = link[j] + part_id = self.out_partition[ly].index(part[0]) + chs = sum( self.out_partition[ly][0:part_id] ) + che = chs + part[0] + + xin += layers_[ly][:,chs:che,:,:] + + xin = self.bnrelu_layers[i](xin) + + if i%2 == 0 or i == len(self.conv_layers)-1: + outs_.append(xin) + + out = torch.cat(outs_, 1) + return out + + +class HarDNetBase(nn.Module): + def __init__(self, arch, norm_layer, depth_wise=False): + super().__init__() + if arch == 85: + first_ch = [48, 96] + second_kernel = 3 + + ch_list = [ 192, 256, 320, 480, 720] + grmul = 1.7 + gr = [ 24, 24, 28, 36, 48] + n_layers = [ 8, 16, 16, 16, 16] + elif arch == 68: + first_ch = [32, 64] + second_kernel = 3 + + ch_list = [ 128, 256, 320, 640] + grmul = 1.7 + gr = [ 14, 16, 20, 40] + n_layers = [ 8, 16, 16, 16] + else: + print("Error: HarDNet",arch," has no implementation.") + exit() + + blks = len(n_layers) + self.base = nn.ModuleList([]) + + # First Layer: Standard Conv3x3, Stride=2 + self.base.append ( + ConvLayer(in_channels=3, out_channels=first_ch[0], norm_layer=norm_layer, kernel=3, + stride=2, bias=False) ) + + # Second Layer + self.base.append ( ConvLayer(first_ch[0], first_ch[1], norm_layer, kernel=second_kernel) ) + + # Maxpooling or DWConv3x3 downsampling + self.base.append(nn.AvgPool2d(kernel_size=3, stride=2, padding=1)) + + # Build all HarDNet blocks + ch = first_ch[1] + for i in range(blks): + blk = HarDBlock(ch, gr[i], grmul, n_layers[i], norm_layer, dwconv=depth_wise) + ch = blk.get_out_ch() + self.base.append ( blk ) + + if i != blks-1: + self.base.append ( ConvLayer(ch, ch_list[i], norm_layer, kernel=1) ) + ch = ch_list[i] + if i== 0: + self.base.append(nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True)) + elif i != blks-1 and i != 1 and i != 3: + self.base.append(nn.AvgPool2d(kernel_size=2, stride=2)) + + +def fill_fc_weights(layers): + for m in layers.modules(): + if isinstance(m, nn.Conv2d): + if m.weight is not None: + init.kaiming_uniform_(m.weight, nonlinearity='relu') + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + +def weights_init(m): + for key in m.state_dict(): + if key.split('.')[-1] == 'weight': + if 'conv' in key: + init.kaiming_uniform_(m.state_dict()[key], nonlinearity='relu') + if 'bn' in key: + m.state_dict()[key][...] = 1 + elif key.split('.')[-1] == 'bias': + m.state_dict()[key][...] 
= 0 + + +class TransitionUp(nn.Module): + def __init__(self, in_channels, out_channels): + super().__init__() + + def forward(self, x, skip, concat=True): + out = F.interpolate( + x, + size=(skip.size(2), skip.size(3)), + mode="bilinear", + align_corners=True) + if concat: + out = torch.cat([out, skip], 1) + return out + +@SPPE.register_module +class HarDNetPose(nn.Module): + def __init__(self, norm_layer=nn.BatchNorm2d, **cfg): + super(HarDNetPose, self).__init__() + assert cfg['DOWN_RATIO'] in [2, 4, 8, 16] + self.norm_layer = norm_layer + self._preset_cfg = cfg['PRESET'] + self.first_level = int(np.log2(cfg['DOWN_RATIO']))-1 + self.trt = cfg['TRT'] + + self.base = HarDNetBase(cfg['NUM_LAYERS'], norm_layer).base + self.last_pool = nn.AvgPool2d(kernel_size=2, stride=2) + + if cfg['NUM_LAYERS'] == 85: + self.last_proj = ConvLayer(784, 256, norm_layer, kernel=1) + self.last_blk = HarDBlock(768, 80, 1.7, 8, norm_layer) + self.skip_nodes = [1,3,8,13] + self.SC = [32, 32, 0] + gr = [64, 48, 28] + layers = [8, 8, 4] + ch_list2 = [224 + self.SC[0], 160 + self.SC[1], 96 + self.SC[2]] + channels = [96, 214, 458, 784] + self.skip_lv = 3 + scales = [2 ** i for i in range(len(channels[self.first_level:]))] + + elif cfg['NUM_LAYERS'] == 68: + self.last_proj = ConvLayer(654, 192, norm_layer, kernel=1) + self.last_blk = HarDBlock(576, 72, 1.7, 8, norm_layer) + self.skip_nodes = [1,3,8,11] + self.SC = [32, 32, 0 ] + gr = [48, 32, 20] + layers = [8, 8, 4] + ch_list2 = [224+self.SC[0], 96+self.SC[1], 64+self.SC[2]] + channels = [64, 124, 328, 654] + self.skip_lv = 2 + scales = [2 ** i for i in range(len(channels[self.first_level:]))] + + + + self.transUpBlocks = nn.ModuleList([]) + self.denseBlocksUp = nn.ModuleList([]) + self.conv1x1_up = nn.ModuleList([]) + self.avg9x9 = nn.AvgPool2d(kernel_size=(9,9), stride=1, padding=(4,4)) + prev_ch = self.last_blk.get_out_ch() + + for i in range(3): + skip_ch = channels[3-i] + self.transUpBlocks.append(TransitionUp(prev_ch, prev_ch)) + if i < self.skip_lv: + cur_ch = prev_ch + skip_ch + else: + cur_ch = prev_ch + self.conv1x1_up.append(ConvLayer(cur_ch, ch_list2[i], norm_layer, kernel=1)) + cur_ch = ch_list2[i] + cur_ch -= self.SC[i] + cur_ch *= 3 + + blk = HarDBlock(cur_ch, gr[i], 1.7, layers[i], norm_layer) + + self.denseBlocksUp.append(blk) + prev_ch = blk.get_out_ch() + + prev_ch += self.SC[0] + self.SC[1] + self.SC[2] + + weights_init(self.denseBlocksUp) + weights_init(self.conv1x1_up) + weights_init(self.last_blk) + weights_init(self.last_proj) + + out_channel = self._preset_cfg['NUM_JOINTS'] + + ch = max(128, out_channel*4) + self.conv_out = nn.Sequential( + nn.Conv2d(prev_ch, ch, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(ch, out_channel, + kernel_size=cfg['FINAL_CONV_KERNEL'], stride=1, + padding=cfg['FINAL_CONV_KERNEL'] // 2, bias=True)) + fill_fc_weights(self.conv_out) + self.conv_out[-1].bias.data.fill_(-2.19) + + + def v2_transform(self): + print('Transform HarDBlock v2..') + for i in range( len(self.base)): + if isinstance(self.base[i], HarDBlock): + blk = self.base[i] + self.base[i] = HarDBlock_v2(blk.in_channels, blk.growth_rate, blk.grmul, blk.n_layers, blk.norm_layer) + self.base[i].transform(blk, self.trt) + blk = self.last_blk + self.last_blk = HarDBlock_v2(blk.in_channels, blk.growth_rate, blk.grmul, blk.n_layers, blk.norm_layer) + self.last_blk.transform(blk, self.trt) + for i in range(3): + blk = self.denseBlocksUp[i] + self.denseBlocksUp[i] = HarDBlock_v2(blk.in_channels, blk.growth_rate, blk.grmul, 
blk.n_layers, blk.norm_layer) + self.denseBlocksUp[i].transform(blk, self.trt) + + def forward(self, x): + xs = [] + x_sc = [] + + for i in range(len(self.base)): + x = self.base[i](x) + if i in self.skip_nodes: + xs.append(x) + + x = self.last_proj(x) + x = self.last_pool(x) + x2 = self.avg9x9(x) + x3 = x/(x.sum((2,3),keepdim=True) + 0.1) + x = torch.cat([x,x2,x3],1) + x = self.last_blk(x) + + for i in range(3): + skip_x = xs[3-i] + x = self.transUpBlocks[i](x, skip_x, (i < self.skip_lv)) + x = self.conv1x1_up[i](x) + if self.SC[i] > 0: + end = x.shape[1] + x_sc.append( x[:,end-self.SC[i]:,:,:].contiguous() ) + x = x[:,:end-self.SC[i],:,:].contiguous() + x2 = self.avg9x9(x) + x3 = x/(x.sum((2,3),keepdim=True) + 0.1) + x = torch.cat([x,x2,x3],1) + x = self.denseBlocksUp[i](x) + + scs = [x] + for i in range(3): + if self.SC[i] > 0: + scs.insert(0, F.interpolate( + x_sc[i], size=(x.size(2), x.size(3)), + mode="bilinear", align_corners=True) ) + x = torch.cat(scs,1) + x = self.conv_out(x) + return x + + def _initialize(self, pretrained=''): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + elif isinstance(m, self.norm_layer): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + + if os.path.isfile(pretrained): + pretrained_state_dict = torch.load(pretrained) + + need_init_state_dict = {} + for name, m in pretrained_state_dict.items(): + if name.split('.')[0] in self.pretrained_layers \ + or self.pretrained_layers[0] == '*': + need_init_state_dict[name] = m + self.load_state_dict(need_init_state_dict, strict=False) + elif pretrained: + raise ValueError('{} is not exist!'.format(pretrained)) + + +def get_pose_net(cfg, is_train, **kwargs): + model = HarDNetPose(cfg, **kwargs) + if is_train and cfg.MODEL.INIT_WEIGHTS: + model._initialize(cfg.MODEL.INIT_WEIGHTS) + total_params = sum(p.numel() for p in model.parameters()) + print( "Parameters=", total_params ) + return model diff --git a/cv/pose/alphapose/pytorch/configs/halpe_136/resnet/256x192_res50_lr1e-3_2x-regression.yaml b/cv/pose/alphapose/pytorch/configs/halpe_136/resnet/256x192_res50_lr1e-3_2x-regression.yaml index 50bfe56f1..d23293bfb 100755 --- a/cv/pose/alphapose/pytorch/configs/halpe_136/resnet/256x192_res50_lr1e-3_2x-regression.yaml +++ b/cv/pose/alphapose/pytorch/configs/halpe_136/resnet/256x192_res50_lr1e-3_2x-regression.yaml @@ -1,69 +1,69 @@ -DATASET: - TRAIN: - TYPE: 'Halpe_136' - ROOT: '' - IMG_PREFIX: 'train2017' - ANN: '/home/group3/hico-coco.json' - AUG: - FLIP: true - ROT_FACTOR: 45 - SCALE_FACTOR: 0.35 - NUM_JOINTS_HALF_BODY: 8 - PROB_HALF_BODY: 0.3 - VAL: - TYPE: 'Halpe_136' - ROOT: './data/coco/' - IMG_PREFIX: 'val2017' - ANN: 'coco_val_full_finetuned.json' - TEST: - TYPE: 'Halpe_136_det' - ROOT: './data/coco/' - IMG_PREFIX: 'val2017' - DET_FILE: './exp/json/test_det_yolo.json' - ANN: 'coco_val_full_finetuned.json' -DATA_PRESET: - TYPE: 'simple' - LOSS_TYPE: 'L1JointRegression' - SIGMA: 2 - NUM_JOINTS: 136 - IMAGE_SIZE: - - 256 - - 192 - HEATMAP_SIZE: - - 64 - - 48 -MODEL: - TYPE: 'FastPose' - PRETRAINED: '/home/group3/newrepo/AlphaPose/exp/hico-coco-_regression-256x192_res50_lr1e-3_2x-regression-frei.yaml/model_3.pth' - TRY_LOAD: '' - NUM_DECONV_FILTERS: - - 256 - - 256 - - 256 -
NUM_LAYERS: 50 - CONV_DIM: 256 -LOSS: - TYPE: 'L1JointRegression' - NORM_TYPE: 'sigmoid' - OUTPUT_3D: False -DETECTOR: - NAME: 'yolo' - CONFIG: 'detector/yolo/cfg/yolov3-spp.cfg' - WEIGHTS: 'detector/yolo/data/yolov3-spp.weights' - NMS_THRES: 0.6 - CONFIDENCE: 0.05 -TRAIN: - WORLD_SIZE: 4 - BATCH_SIZE: 48 - BEGIN_EPOCH: 5 - END_EPOCH: 270 - OPTIMIZER: 'adam' - LR: 0.001 - LR_FACTOR: 0.1 - LR_STEP: - - 170 - - 200 - DPG_MILESTONE: 210 - DPG_STEP: - - 230 - - 250 +DATASET: + TRAIN: + TYPE: 'Halpe_136' + ROOT: '' + IMG_PREFIX: 'train2017' + ANN: '/home/group3/hico-coco.json' + AUG: + FLIP: true + ROT_FACTOR: 45 + SCALE_FACTOR: 0.35 + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: 0.3 + VAL: + TYPE: 'Halpe_136' + ROOT: './data/coco/' + IMG_PREFIX: 'val2017' + ANN: 'coco_val_full_finetuned.json' + TEST: + TYPE: 'Halpe_136_det' + ROOT: './data/coco/' + IMG_PREFIX: 'val2017' + DET_FILE: './exp/json/test_det_yolo.json' + ANN: 'coco_val_full_finetuned.json' +DATA_PRESET: + TYPE: 'simple' + LOSS_TYPE: 'L1JointRegression' + SIGMA: 2 + NUM_JOINTS: 136 + IMAGE_SIZE: + - 256 + - 192 + HEATMAP_SIZE: + - 64 + - 48 +MODEL: + TYPE: 'FastPose' + PRETRAINED: '/home/group3/newrepo/AlphaPose/exp/hico-coco-_regression-256x192_res50_lr1e-3_2x-regression-frei.yaml/model_3.pth' + TRY_LOAD: '' + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_LAYERS: 50 + CONV_DIM: 256 +LOSS: + TYPE: 'L1JointRegression' + NORM_TYPE: 'sigmoid' + OUTPUT_3D: False +DETECTOR: + NAME: 'yolo' + CONFIG: 'detector/yolo/cfg/yolov3-spp.cfg' + WEIGHTS: 'detector/yolo/data/yolov3-spp.weights' + NMS_THRES: 0.6 + CONFIDENCE: 0.05 +TRAIN: + WORLD_SIZE: 4 + BATCH_SIZE: 48 + BEGIN_EPOCH: 5 + END_EPOCH: 270 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + DPG_MILESTONE: 210 + DPG_STEP: + - 230 + - 250 diff --git a/cv/pose/alphapose/pytorch/detector/effdet_cfg.py b/cv/pose/alphapose/pytorch/detector/effdet_cfg.py index f1ba51215..8f5e4e04d 100755 --- a/cv/pose/alphapose/pytorch/detector/effdet_cfg.py +++ b/cv/pose/alphapose/pytorch/detector/effdet_cfg.py @@ -1,8 +1,8 @@ -from easydict import EasyDict as edict - -cfg = edict() - -cfg.NMS_THRES = 0.6 # 0.6(0.713) 0.5(0.707) -cfg.CONFIDENCE = 0.2 # 0.15 0.1 -cfg.NUM_CLASSES = 80 -cfg.MAX_DETECTIONS = 200 # 100 +from easydict import EasyDict as edict + +cfg = edict() + +cfg.NMS_THRES = 0.6 # 0.6(0.713) 0.5(0.707) +cfg.CONFIDENCE = 0.2 # 0.15 0.1 +cfg.NUM_CLASSES = 80 +cfg.MAX_DETECTIONS = 200 # 100 diff --git a/cv/pose/alphapose/pytorch/detector/tracker/utils/kalman_filter.py b/cv/pose/alphapose/pytorch/detector/tracker/utils/kalman_filter.py index 29706d8f7..5b10e8d24 100755 --- a/cv/pose/alphapose/pytorch/detector/tracker/utils/kalman_filter.py +++ b/cv/pose/alphapose/pytorch/detector/tracker/utils/kalman_filter.py @@ -1,229 +1,229 @@ -# vim: expandtab:ts=4:sw=4 -import numpy as np -import scipy.linalg - - -""" -Table for the 0.95 quantile of the chi-square distribution with N degrees of -freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv -function and used as Mahalanobis gating threshold. -""" -chi2inv95 = { - 1: 3.8415, - 2: 5.9915, - 3: 7.8147, - 4: 9.4877, - 5: 11.070, - 6: 12.592, - 7: 14.067, - 8: 15.507, - 9: 16.919} - - -class KalmanFilter(object): - """ - A simple Kalman filter for tracking bounding boxes in image space. - - The 8-dimensional state space - - x, y, a, h, vx, vy, va, vh - - contains the bounding box center position (x, y), aspect ratio a, height h, - and their respective velocities. 
- - Object motion follows a constant velocity model. The bounding box location - (x, y, a, h) is taken as direct observation of the state space (linear - observation model). - - """ - - def __init__(self): - ndim, dt = 4, 1. - - # Create Kalman filter model matrices. - self._motion_mat = np.eye(2 * ndim, 2 * ndim) - for i in range(ndim): - self._motion_mat[i, ndim + i] = dt - self._update_mat = np.eye(ndim, 2 * ndim) - - # Motion and observation uncertainty are chosen relative to the current - # state estimate. These weights control the amount of uncertainty in - # the model. This is a bit hacky. - self._std_weight_position = 1. / 20 - self._std_weight_velocity = 1. / 160 - - def initiate(self, measurement): - """Create track from unassociated measurement. - - Parameters - ---------- - measurement : ndarray - Bounding box coordinates (x, y, a, h) with center position (x, y), - aspect ratio a, and height h. - - Returns - ------- - (ndarray, ndarray) - Returns the mean vector (8 dimensional) and covariance matrix (8x8 - dimensional) of the new track. Unobserved velocities are initialized - to 0 mean. - - """ - mean_pos = measurement - mean_vel = np.zeros_like(mean_pos) - mean = np.r_[mean_pos, mean_vel] - - std = [ - 2 * self._std_weight_position * measurement[3], - 2 * self._std_weight_position * measurement[3], - 1e-2, - 2 * self._std_weight_position * measurement[3], - 10 * self._std_weight_velocity * measurement[3], - 10 * self._std_weight_velocity * measurement[3], - 1e-5, - 10 * self._std_weight_velocity * measurement[3]] - covariance = np.diag(np.square(std)) - return mean, covariance - - def predict(self, mean, covariance): - """Run Kalman filter prediction step. - - Parameters - ---------- - mean : ndarray - The 8 dimensional mean vector of the object state at the previous - time step. - covariance : ndarray - The 8x8 dimensional covariance matrix of the object state at the - previous time step. - - Returns - ------- - (ndarray, ndarray) - Returns the mean vector and covariance matrix of the predicted - state. Unobserved velocities are initialized to 0 mean. - - """ - std_pos = [ - self._std_weight_position * mean[3], - self._std_weight_position * mean[3], - 1e-2, - self._std_weight_position * mean[3]] - std_vel = [ - self._std_weight_velocity * mean[3], - self._std_weight_velocity * mean[3], - 1e-5, - self._std_weight_velocity * mean[3]] - motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) - - mean = np.dot(self._motion_mat, mean) - covariance = np.linalg.multi_dot(( - self._motion_mat, covariance, self._motion_mat.T)) + motion_cov - - return mean, covariance - - def project(self, mean, covariance): - """Project state distribution to measurement space. - - Parameters - ---------- - mean : ndarray - The state's mean vector (8 dimensional array). - covariance : ndarray - The state's covariance matrix (8x8 dimensional). - - Returns - ------- - (ndarray, ndarray) - Returns the projected mean and covariance matrix of the given state - estimate. - - """ - std = [ - self._std_weight_position * mean[3], - self._std_weight_position * mean[3], - 1e-1, - self._std_weight_position * mean[3]] - innovation_cov = np.diag(np.square(std)) - - mean = np.dot(self._update_mat, mean) - covariance = np.linalg.multi_dot(( - self._update_mat, covariance, self._update_mat.T)) - return mean, covariance + innovation_cov - - def update(self, mean, covariance, measurement): - """Run Kalman filter correction step. 
- - Parameters - ---------- - mean : ndarray - The predicted state's mean vector (8 dimensional). - covariance : ndarray - The state's covariance matrix (8x8 dimensional). - measurement : ndarray - The 4 dimensional measurement vector (x, y, a, h), where (x, y) - is the center position, a the aspect ratio, and h the height of the - bounding box. - - Returns - ------- - (ndarray, ndarray) - Returns the measurement-corrected state distribution. - - """ - projected_mean, projected_cov = self.project(mean, covariance) - - chol_factor, lower = scipy.linalg.cho_factor( - projected_cov, lower=True, check_finite=False) - kalman_gain = scipy.linalg.cho_solve( - (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, - check_finite=False).T - innovation = measurement - projected_mean - - new_mean = mean + np.dot(innovation, kalman_gain.T) - new_covariance = covariance - np.linalg.multi_dot(( - kalman_gain, projected_cov, kalman_gain.T)) - return new_mean, new_covariance - - def gating_distance(self, mean, covariance, measurements, - only_position=False): - """Compute gating distance between state distribution and measurements. - - A suitable distance threshold can be obtained from `chi2inv95`. If - `only_position` is False, the chi-square distribution has 4 degrees of - freedom, otherwise 2. - - Parameters - ---------- - mean : ndarray - Mean vector over the state distribution (8 dimensional). - covariance : ndarray - Covariance of the state distribution (8x8 dimensional). - measurements : ndarray - An Nx4 dimensional matrix of N measurements, each in - format (x, y, a, h) where (x, y) is the bounding box center - position, a the aspect ratio, and h the height. - only_position : Optional[bool] - If True, distance computation is done with respect to the bounding - box center position only. - - Returns - ------- - ndarray - Returns an array of length N, where the i-th element contains the - squared Mahalanobis distance between (mean, covariance) and - `measurements[i]`. - - """ - mean, covariance = self.project(mean, covariance) - if only_position: - mean, covariance = mean[:2], covariance[:2, :2] - measurements = measurements[:, :2] - - cholesky_factor = np.linalg.cholesky(covariance) - d = measurements - mean - z = scipy.linalg.solve_triangular( - cholesky_factor, d.T, lower=True, check_finite=False, - overwrite_b=True) - squared_maha = np.sum(z * z, axis=0) +# vim: expandtab:ts=4:sw=4 +import numpy as np +import scipy.linalg + + +""" +Table for the 0.95 quantile of the chi-square distribution with N degrees of +freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv +function and used as Mahalanobis gating threshold. +""" +chi2inv95 = { + 1: 3.8415, + 2: 5.9915, + 3: 7.8147, + 4: 9.4877, + 5: 11.070, + 6: 12.592, + 7: 14.067, + 8: 15.507, + 9: 16.919} + + +class KalmanFilter(object): + """ + A simple Kalman filter for tracking bounding boxes in image space. + + The 8-dimensional state space + + x, y, a, h, vx, vy, va, vh + + contains the bounding box center position (x, y), aspect ratio a, height h, + and their respective velocities. + + Object motion follows a constant velocity model. The bounding box location + (x, y, a, h) is taken as direct observation of the state space (linear + observation model). + + """ + + def __init__(self): + ndim, dt = 4, 1. + + # Create Kalman filter model matrices. 
+ self._motion_mat = np.eye(2 * ndim, 2 * ndim) + for i in range(ndim): + self._motion_mat[i, ndim + i] = dt + self._update_mat = np.eye(ndim, 2 * ndim) + + # Motion and observation uncertainty are chosen relative to the current + # state estimate. These weights control the amount of uncertainty in + # the model. This is a bit hacky. + self._std_weight_position = 1. / 20 + self._std_weight_velocity = 1. / 160 + + def initiate(self, measurement): + """Create track from unassociated measurement. + + Parameters + ---------- + measurement : ndarray + Bounding box coordinates (x, y, a, h) with center position (x, y), + aspect ratio a, and height h. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector (8 dimensional) and covariance matrix (8x8 + dimensional) of the new track. Unobserved velocities are initialized + to 0 mean. + + """ + mean_pos = measurement + mean_vel = np.zeros_like(mean_pos) + mean = np.r_[mean_pos, mean_vel] + + std = [ + 2 * self._std_weight_position * measurement[3], + 2 * self._std_weight_position * measurement[3], + 1e-2, + 2 * self._std_weight_position * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 1e-5, + 10 * self._std_weight_velocity * measurement[3]] + covariance = np.diag(np.square(std)) + return mean, covariance + + def predict(self, mean, covariance): + """Run Kalman filter prediction step. + + Parameters + ---------- + mean : ndarray + The 8 dimensional mean vector of the object state at the previous + time step. + covariance : ndarray + The 8x8 dimensional covariance matrix of the object state at the + previous time step. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. + + """ + std_pos = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-2, + self._std_weight_position * mean[3]] + std_vel = [ + self._std_weight_velocity * mean[3], + self._std_weight_velocity * mean[3], + 1e-5, + self._std_weight_velocity * mean[3]] + motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) + + mean = np.dot(self._motion_mat, mean) + covariance = np.linalg.multi_dot(( + self._motion_mat, covariance, self._motion_mat.T)) + motion_cov + + return mean, covariance + + def project(self, mean, covariance): + """Project state distribution to measurement space. + + Parameters + ---------- + mean : ndarray + The state's mean vector (8 dimensional array). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + + Returns + ------- + (ndarray, ndarray) + Returns the projected mean and covariance matrix of the given state + estimate. + + """ + std = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-1, + self._std_weight_position * mean[3]] + innovation_cov = np.diag(np.square(std)) + + mean = np.dot(self._update_mat, mean) + covariance = np.linalg.multi_dot(( + self._update_mat, covariance, self._update_mat.T)) + return mean, covariance + innovation_cov + + def update(self, mean, covariance, measurement): + """Run Kalman filter correction step. + + Parameters + ---------- + mean : ndarray + The predicted state's mean vector (8 dimensional). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + measurement : ndarray + The 4 dimensional measurement vector (x, y, a, h), where (x, y) + is the center position, a the aspect ratio, and h the height of the + bounding box. 
+ + Returns + ------- + (ndarray, ndarray) + Returns the measurement-corrected state distribution. + + """ + projected_mean, projected_cov = self.project(mean, covariance) + + chol_factor, lower = scipy.linalg.cho_factor( + projected_cov, lower=True, check_finite=False) + kalman_gain = scipy.linalg.cho_solve( + (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, + check_finite=False).T + innovation = measurement - projected_mean + + new_mean = mean + np.dot(innovation, kalman_gain.T) + new_covariance = covariance - np.linalg.multi_dot(( + kalman_gain, projected_cov, kalman_gain.T)) + return new_mean, new_covariance + + def gating_distance(self, mean, covariance, measurements, + only_position=False): + """Compute gating distance between state distribution and measurements. + + A suitable distance threshold can be obtained from `chi2inv95`. If + `only_position` is False, the chi-square distribution has 4 degrees of + freedom, otherwise 2. + + Parameters + ---------- + mean : ndarray + Mean vector over the state distribution (8 dimensional). + covariance : ndarray + Covariance of the state distribution (8x8 dimensional). + measurements : ndarray + An Nx4 dimensional matrix of N measurements, each in + format (x, y, a, h) where (x, y) is the bounding box center + position, a the aspect ratio, and h the height. + only_position : Optional[bool] + If True, distance computation is done with respect to the bounding + box center position only. + + Returns + ------- + ndarray + Returns an array of length N, where the i-th element contains the + squared Mahalanobis distance between (mean, covariance) and + `measurements[i]`. + + """ + mean, covariance = self.project(mean, covariance) + if only_position: + mean, covariance = mean[:2], covariance[:2, :2] + measurements = measurements[:, :2] + + cholesky_factor = np.linalg.cholesky(covariance) + d = measurements - mean + z = scipy.linalg.solve_triangular( + cholesky_factor, d.T, lower=True, check_finite=False, + overwrite_b=True) + squared_maha = np.sum(z * z, axis=0) return squared_maha \ No newline at end of file diff --git a/cv/pose/alphapose/pytorch/detector/tracker_cfg.py b/cv/pose/alphapose/pytorch/detector/tracker_cfg.py index b33c316df..e8bd55c1a 100755 --- a/cv/pose/alphapose/pytorch/detector/tracker_cfg.py +++ b/cv/pose/alphapose/pytorch/detector/tracker_cfg.py @@ -1,9 +1,9 @@ -from easydict import EasyDict as edict - -cfg = edict() -cfg.CONFIG = 'detector/tracker/cfg/yolov3.cfg' -cfg.WEIGHTS = 'detector/tracker/data/jde.1088x608.uncertainty.pt' -cfg.IMG_SIZE = (1088, 608) -cfg.NMS_THRES = 0.6 -cfg.CONFIDENCE = 0.4 +from easydict import EasyDict as edict + +cfg = edict() +cfg.CONFIG = 'detector/tracker/cfg/yolov3.cfg' +cfg.WEIGHTS = 'detector/tracker/data/jde.1088x608.uncertainty.pt' +cfg.IMG_SIZE = (1088, 608) +cfg.NMS_THRES = 0.6 +cfg.CONFIDENCE = 0.4 cfg.BUFFER_SIZE = 30 # frame buffer \ No newline at end of file diff --git a/cv/pose/alphapose/pytorch/detector/yolo_cfg.py b/cv/pose/alphapose/pytorch/detector/yolo_cfg.py index 8a755c994..f0adf357e 100755 --- a/cv/pose/alphapose/pytorch/detector/yolo_cfg.py +++ b/cv/pose/alphapose/pytorch/detector/yolo_cfg.py @@ -1,9 +1,9 @@ -from easydict import EasyDict as edict - -cfg = edict() -cfg.CONFIG = 'detector/yolo/cfg/yolov3-spp.cfg' -cfg.WEIGHTS = 'detector/yolo/data/yolov3-spp.weights' -cfg.INP_DIM = 608 -cfg.NMS_THRES = 0.6 -cfg.CONFIDENCE = 0.1 -cfg.NUM_CLASSES = 80 +from easydict import EasyDict as edict + +cfg = edict() +cfg.CONFIG = 'detector/yolo/cfg/yolov3-spp.cfg' +cfg.WEIGHTS 
= 'detector/yolo/data/yolov3-spp.weights' +cfg.INP_DIM = 608 +cfg.NMS_THRES = 0.6 +cfg.CONFIDENCE = 0.1 +cfg.NUM_CLASSES = 80 diff --git a/cv/pose/alphapose/pytorch/detector/yolov5/models/utils/__init__.py b/cv/pose/alphapose/pytorch/detector/yolov5/models/utils/__init__.py index 4658ed647..295aebfbc 100755 --- a/cv/pose/alphapose/pytorch/detector/yolov5/models/utils/__init__.py +++ b/cv/pose/alphapose/pytorch/detector/yolov5/models/utils/__init__.py @@ -1,37 +1,37 @@ -# YOLOv5 🚀 by Ultralytics, GPL-3.0 license -""" -utils/initialization -""" - - -def notebook_init(verbose=True): - # Check system software and hardware - print('Checking setup...') - - import os - import shutil - - from utils.general import check_requirements, emojis, is_colab - from utils.torch_utils import select_device # imports - - check_requirements(('psutil', 'IPython')) - import psutil - from IPython import display # to display images and clear console output - - if is_colab(): - shutil.rmtree('/content/sample_data', ignore_errors=True) # remove colab /sample_data directory - - if verbose: - # System info - # gb = 1 / 1000 ** 3 # bytes to GB - gib = 1 / 1024 ** 3 # bytes to GiB - ram = psutil.virtual_memory().total - total, used, free = shutil.disk_usage("/") - display.clear_output() - s = f'({os.cpu_count()} CPUs, {ram * gib:.1f} GB RAM, {(total - free) * gib:.1f}/{total * gib:.1f} GB disk)' - else: - s = '' - - select_device(newline=False) - print(emojis(f'Setup complete ✅ {s}')) - return display +# YOLOv5 🚀 by Ultralytics, GPL-3.0 license +""" +utils/initialization +""" + + +def notebook_init(verbose=True): + # Check system software and hardware + print('Checking setup...') + + import os + import shutil + + from utils.general import check_requirements, emojis, is_colab + from utils.torch_utils import select_device # imports + + check_requirements(('psutil', 'IPython')) + import psutil + from IPython import display # to display images and clear console output + + if is_colab(): + shutil.rmtree('/content/sample_data', ignore_errors=True) # remove colab /sample_data directory + + if verbose: + # System info + # gb = 1 / 1000 ** 3 # bytes to GB + gib = 1 / 1024 ** 3 # bytes to GiB + ram = psutil.virtual_memory().total + total, used, free = shutil.disk_usage("/") + display.clear_output() + s = f'({os.cpu_count()} CPUs, {ram * gib:.1f} GB RAM, {(total - free) * gib:.1f}/{total * gib:.1f} GB disk)' + else: + s = '' + + select_device(newline=False) + print(emojis(f'Setup complete ✅ {s}')) + return display diff --git a/cv/pose/alphapose/pytorch/detector/yolov5/utils/__init__.py b/cv/pose/alphapose/pytorch/detector/yolov5/utils/__init__.py index 4658ed647..295aebfbc 100755 --- a/cv/pose/alphapose/pytorch/detector/yolov5/utils/__init__.py +++ b/cv/pose/alphapose/pytorch/detector/yolov5/utils/__init__.py @@ -1,37 +1,37 @@ -# YOLOv5 🚀 by Ultralytics, GPL-3.0 license -""" -utils/initialization -""" - - -def notebook_init(verbose=True): - # Check system software and hardware - print('Checking setup...') - - import os - import shutil - - from utils.general import check_requirements, emojis, is_colab - from utils.torch_utils import select_device # imports - - check_requirements(('psutil', 'IPython')) - import psutil - from IPython import display # to display images and clear console output - - if is_colab(): - shutil.rmtree('/content/sample_data', ignore_errors=True) # remove colab /sample_data directory - - if verbose: - # System info - # gb = 1 / 1000 ** 3 # bytes to GB - gib = 1 / 1024 ** 3 # bytes to GiB - ram = 
psutil.virtual_memory().total - total, used, free = shutil.disk_usage("/") - display.clear_output() - s = f'({os.cpu_count()} CPUs, {ram * gib:.1f} GB RAM, {(total - free) * gib:.1f}/{total * gib:.1f} GB disk)' - else: - s = '' - - select_device(newline=False) - print(emojis(f'Setup complete ✅ {s}')) - return display +# YOLOv5 🚀 by Ultralytics, GPL-3.0 license +""" +utils/initialization +""" + + +def notebook_init(verbose=True): + # Check system software and hardware + print('Checking setup...') + + import os + import shutil + + from utils.general import check_requirements, emojis, is_colab + from utils.torch_utils import select_device # imports + + check_requirements(('psutil', 'IPython')) + import psutil + from IPython import display # to display images and clear console output + + if is_colab(): + shutil.rmtree('/content/sample_data', ignore_errors=True) # remove colab /sample_data directory + + if verbose: + # System info + # gb = 1 / 1000 ** 3 # bytes to GB + gib = 1 / 1024 ** 3 # bytes to GiB + ram = psutil.virtual_memory().total + total, used, free = shutil.disk_usage("/") + display.clear_output() + s = f'({os.cpu_count()} CPUs, {ram * gib:.1f} GB RAM, {(total - free) * gib:.1f}/{total * gib:.1f} GB disk)' + else: + s = '' + + select_device(newline=False) + print(emojis(f'Setup complete ✅ {s}')) + return display diff --git a/cv/pose/alphapose/pytorch/scripts/trainval/demo_api.py b/cv/pose/alphapose/pytorch/scripts/trainval/demo_api.py index a0e879f7a..5bd8af101 100755 --- a/cv/pose/alphapose/pytorch/scripts/trainval/demo_api.py +++ b/cv/pose/alphapose/pytorch/scripts/trainval/demo_api.py @@ -1,365 +1,365 @@ -# ----------------------------------------------------- -# Copyright (c) Shanghai Jiao Tong University. All rights reserved. -# Written by Haoyi Zhu,Hao-Shu Fang -# ----------------------------------------------------- - -"""Script for single-image demo.""" -import argparse -import torch -import os -import platform -import sys -import math -import time - -import cv2 -import numpy as np - -from alphapose.utils.transforms import get_func_heatmap_to_coord -from alphapose.utils.pPose_nms import pose_nms -from alphapose.utils.presets import SimpleTransform -from alphapose.utils.transforms import flip, flip_heatmap -from alphapose.models import builder -from alphapose.utils.config import update_config -from detector.apis import get_detector -from alphapose.utils.vis import getTime - -"""----------------------------- Demo options -----------------------------""" -parser = argparse.ArgumentParser(description='AlphaPose Single-Image Demo') -parser.add_argument('--cfg', type=str, required=True, - help='experiment configure file name') -parser.add_argument('--checkpoint', type=str, required=True, - help='checkpoint file name') -parser.add_argument('--detector', dest='detector', - help='detector name', default="yolo") -parser.add_argument('--image', dest='inputimg', - help='image-name', default="") -parser.add_argument('--save_img', default=False, action='store_true', - help='save result as image') -parser.add_argument('--vis', default=False, action='store_true', - help='visualize image') -parser.add_argument('--showbox', default=False, action='store_true', - help='visualize human bbox') -parser.add_argument('--profile', default=False, action='store_true', - help='add speed profiling at screen output') -parser.add_argument('--format', type=str, - help='save in the format of cmu or coco or openpose, option: coco/cmu/open') -parser.add_argument('--min_box_area', type=int, default=0, - 
help='min box area to filter out') -parser.add_argument('--eval', dest='eval', default=False, action='store_true', - help='save the result json as coco format, using image index(int) instead of image name(str)') -parser.add_argument('--gpus', type=str, dest='gpus', default="0", - help='choose which cuda device to use by index and input comma to use multi gpus, e.g. 0,1,2,3. (input -1 for cpu only)') -parser.add_argument('--flip', default=False, action='store_true', - help='enable flip testing') -parser.add_argument('--debug', default=False, action='store_true', - help='print detail information') -parser.add_argument('--vis_fast', dest='vis_fast', - help='use fast rendering', action='store_true', default=False) -"""----------------------------- Tracking options -----------------------------""" -parser.add_argument('--pose_flow', dest='pose_flow', - help='track humans in video with PoseFlow', action='store_true', default=False) -parser.add_argument('--pose_track', dest='pose_track', - help='track humans in video with reid', action='store_true', default=False) - -args = parser.parse_args() -cfg = update_config(args.cfg) - -args.gpus = [int(args.gpus[0])] if torch.cuda.device_count() >= 1 else [-1] -args.device = torch.device("cuda:" + str(args.gpus[0]) if args.gpus[0] >= 0 else "cpu") -args.tracking = args.pose_track or args.pose_flow or args.detector=='tracker' - -class DetectionLoader(): - def __init__(self, detector, cfg, opt): - self.cfg = cfg - self.opt = opt - self.device = opt.device - self.detector = detector - - self._input_size = cfg.DATA_PRESET.IMAGE_SIZE - self._output_size = cfg.DATA_PRESET.HEATMAP_SIZE - - self._sigma = cfg.DATA_PRESET.SIGMA - - pose_dataset = builder.retrieve_dataset(self.cfg.DATASET.TRAIN) - if cfg.DATA_PRESET.TYPE == 'simple': - self.transformation = SimpleTransform( - pose_dataset, scale_factor=0, - input_size=self._input_size, - output_size=self._output_size, - rot=0, sigma=self._sigma, - train=False, add_dpg=False, gpu_device=self.device) - - self.image = (None, None, None, None) - self.det = (None, None, None, None, None, None, None) - self.pose = (None, None, None, None, None, None, None) - - def process(self, im_name, image): - # start to pre process images for object detection - self.image_preprocess(im_name, image) - # start to detect human in images - self.image_detection() - # start to post process cropped human image for pose estimation - self.image_postprocess() - return self - - def image_preprocess(self, im_name, image): - # expected image shape like (1,3,h,w) or (3,h,w) - img = self.detector.image_preprocess(image) - if isinstance(img, np.ndarray): - img = torch.from_numpy(img) - # add one dimension at the front for batch if image shape (3,h,w) - if img.dim() == 3: - img = img.unsqueeze(0) - orig_img = image # scipy.misc.imread(im_name_k, mode='RGB') is depreciated - im_dim = orig_img.shape[1], orig_img.shape[0] - - im_name = os.path.basename(im_name) - - with torch.no_grad(): - im_dim = torch.FloatTensor(im_dim).repeat(1, 2) - - self.image = (img, orig_img, im_name, im_dim) - - def image_detection(self): - imgs, orig_imgs, im_names, im_dim_list = self.image - if imgs is None: - self.det = (None, None, None, None, None, None, None) - return - - with torch.no_grad(): - dets = self.detector.images_detection(imgs, im_dim_list) - if isinstance(dets, int) or dets.shape[0] == 0: - self.det = (orig_imgs, im_names, None, None, None, None, None) - return - if isinstance(dets, np.ndarray): - dets = torch.from_numpy(dets) - dets = dets.cpu() - boxes = 
dets[:, 1:5] - scores = dets[:, 5:6] - ids = torch.zeros(scores.shape) - - boxes = boxes[dets[:, 0] == 0] - if isinstance(boxes, int) or boxes.shape[0] == 0: - self.det = (orig_imgs, im_names, None, None, None, None, None) - return - inps = torch.zeros(boxes.size(0), 3, *self._input_size) - cropped_boxes = torch.zeros(boxes.size(0), 4) - - self.det = (orig_imgs, im_names, boxes, scores[dets[:, 0] == 0], ids[dets[:, 0] == 0], inps, cropped_boxes) - - def image_postprocess(self): - with torch.no_grad(): - (orig_img, im_name, boxes, scores, ids, inps, cropped_boxes) = self.det - if orig_img is None: - self.pose = (None, None, None, None, None, None, None) - return - if boxes is None or boxes.nelement() == 0: - self.pose = (None, orig_img, im_name, boxes, scores, ids, None) - return - - for i, box in enumerate(boxes): - inps[i], cropped_box = self.transformation.test_transform(orig_img, box) - cropped_boxes[i] = torch.FloatTensor(cropped_box) - - self.pose = (inps, orig_img, im_name, boxes, scores, ids, cropped_boxes) - - def read(self): - return self.pose - - -class DataWriter(): - def __init__(self, cfg, opt): - self.cfg = cfg - self.opt = opt - - self.eval_joints = list(range(cfg.DATA_PRESET.NUM_JOINTS)) - self.heatmap_to_coord = get_func_heatmap_to_coord(cfg) - self.item = (None, None, None, None, None, None, None) - - def start(self): - # start to read pose estimation results - return self.update() - - def update(self): - norm_type = self.cfg.LOSS.get('NORM_TYPE', None) - hm_size = self.cfg.DATA_PRESET.HEATMAP_SIZE - - # get item - (boxes, scores, ids, hm_data, cropped_boxes, orig_img, im_name) = self.item - if orig_img is None: - return None - # image channel RGB->BGR - orig_img = np.array(orig_img, dtype=np.uint8)[:, :, ::-1] - self.orig_img = orig_img - if boxes is None or len(boxes) == 0: - return None - else: - # location prediction (n, kp, 2) | score prediction (n, kp, 1) - assert hm_data.dim() == 4 - if hm_data.size()[1] == 136: - self.eval_joints = [*range(0,136)] - elif hm_data.size()[1] == 26: - self.eval_joints = [*range(0,26)] - pose_coords = [] - pose_scores = [] - - for i in range(hm_data.shape[0]): - bbox = cropped_boxes[i].tolist() - pose_coord, pose_score = self.heatmap_to_coord(hm_data[i][self.eval_joints], bbox, hm_shape=hm_size, norm_type=norm_type) - pose_coords.append(torch.from_numpy(pose_coord).unsqueeze(0)) - pose_scores.append(torch.from_numpy(pose_score).unsqueeze(0)) - preds_img = torch.cat(pose_coords) - preds_scores = torch.cat(pose_scores) - - boxes, scores, ids, preds_img, preds_scores, pick_ids = \ - pose_nms(boxes, scores, ids, preds_img, preds_scores, self.opt.min_box_area) - - _result = [] - for k in range(len(scores)): - _result.append( - { - 'keypoints':preds_img[k], - 'kp_score':preds_scores[k], - 'proposal_score': torch.mean(preds_scores[k]) + scores[k] + 1.25 * max(preds_scores[k]), - 'idx':ids[k], - 'bbox':[boxes[k][0], boxes[k][1], boxes[k][2]-boxes[k][0],boxes[k][3]-boxes[k][1]] - } - ) - - result = { - 'imgname': im_name, - 'result': _result - } - - if hm_data.size()[1] == 49: - from alphapose.utils.vis import vis_frame_dense as vis_frame - elif self.opt.vis_fast: - from alphapose.utils.vis import vis_frame_fast as vis_frame - else: - from alphapose.utils.vis import vis_frame - self.vis_frame = vis_frame - - return result - - def save(self, boxes, scores, ids, hm_data, cropped_boxes, orig_img, im_name): - self.item = (boxes, scores, ids, hm_data, cropped_boxes, orig_img, im_name) - -class SingleImageAlphaPose(): - def __init__(self, args, cfg): 
- self.args = args - self.cfg = cfg - - # Load pose model - self.pose_model = builder.build_sppe(cfg.MODEL, preset_cfg=cfg.DATA_PRESET) - - print(f'Loading pose model from {args.checkpoint}...') - self.pose_model.load_state_dict(torch.load(args.checkpoint, map_location=args.device)) - self.pose_dataset = builder.retrieve_dataset(cfg.DATASET.TRAIN) - - self.pose_model.to(args.device) - self.pose_model.eval() - - self.det_loader = DetectionLoader(get_detector(self.args), self.cfg, self.args) - - def process(self, im_name, image): - # Init data writer - self.writer = DataWriter(self.cfg, self.args) - - runtime_profile = { - 'dt': [], - 'pt': [], - 'pn': [] - } - pose = None - try: - start_time = getTime() - with torch.no_grad(): - (inps, orig_img, im_name, boxes, scores, ids, cropped_boxes) = self.det_loader.process(im_name, image).read() - if orig_img is None: - raise Exception("no image is given") - if boxes is None or boxes.nelement() == 0: - if self.args.profile: - ckpt_time, det_time = getTime(start_time) - runtime_profile['dt'].append(det_time) - self.writer.save(None, None, None, None, None, orig_img, im_name) - if self.args.profile: - ckpt_time, pose_time = getTime(ckpt_time) - runtime_profile['pt'].append(pose_time) - pose = self.writer.start() - if self.args.profile: - ckpt_time, post_time = getTime(ckpt_time) - runtime_profile['pn'].append(post_time) - else: - if self.args.profile: - ckpt_time, det_time = getTime(start_time) - runtime_profile['dt'].append(det_time) - # Pose Estimation - inps = inps.to(self.args.device) - if self.args.flip: - inps = torch.cat((inps, flip(inps))) - hm = self.pose_model(inps) - if self.args.flip: - hm_flip = flip_heatmap(hm[int(len(hm) / 2):], self.pose_dataset.joint_pairs, shift=True) - hm = (hm[0:int(len(hm) / 2)] + hm_flip) / 2 - if self.args.profile: - ckpt_time, pose_time = getTime(ckpt_time) - runtime_profile['pt'].append(pose_time) - hm = hm.cpu() - self.writer.save(boxes, scores, ids, hm, cropped_boxes, orig_img, im_name) - pose = self.writer.start() - if self.args.profile: - ckpt_time, post_time = getTime(ckpt_time) - runtime_profile['pn'].append(post_time) - - if self.args.profile: - print( - 'det time: {dt:.4f} | pose time: {pt:.4f} | post processing: {pn:.4f}'.format( - dt=np.mean(runtime_profile['dt']), pt=np.mean(runtime_profile['pt']), pn=np.mean(runtime_profile['pn'])) - ) - print('===========================> Finish Model Running.') - except Exception as e: - print(repr(e)) - print('An error as above occurs when processing the images, please check it') - pass - except KeyboardInterrupt: - print('===========================> Finish Model Running.') - - return pose - - def getImg(self): - return self.writer.orig_img - - def vis(self, image, pose): - if pose is not None: - image = self.writer.vis_frame(image, pose, self.writer.opt) - return image - - def writeJson(self, final_result, outputpath, form='coco', for_eval=False): - from alphapose.utils.pPose_nms import write_json - write_json(final_result, outputpath, form=form, for_eval=for_eval) - print("Results have been written to json.") - -def example(): - outputpath = "examples/res/" - if not os.path.exists(outputpath + '/vis'): - os.mkdir(outputpath + '/vis') - - demo = SingleImageAlphaPose(args, cfg) - im_name = args.inputimg # the path to the target image - image = cv2.cvtColor(cv2.imread(im_name), cv2.COLOR_BGR2RGB) - pose = demo.process(im_name, image) - img = demo.getImg() # or you can just use: img = cv2.imread(image) - img = demo.vis(img, pose) # visulize the pose result - 
cv2.imwrite(os.path.join(outputpath, 'vis', os.path.basename(im_name)), img) - - # if you want to vis the img: - # cv2.imshow("AlphaPose Demo", img) - # cv2.waitKey(30) - - # write the result to json: - result = [pose] - demo.writeJson(result, outputpath, form=args.format, for_eval=args.eval) - -if __name__ == "__main__": - example() +# ----------------------------------------------------- +# Copyright (c) Shanghai Jiao Tong University. All rights reserved. +# Written by Haoyi Zhu,Hao-Shu Fang +# ----------------------------------------------------- + +"""Script for single-image demo.""" +import argparse +import torch +import os +import platform +import sys +import math +import time + +import cv2 +import numpy as np + +from alphapose.utils.transforms import get_func_heatmap_to_coord +from alphapose.utils.pPose_nms import pose_nms +from alphapose.utils.presets import SimpleTransform +from alphapose.utils.transforms import flip, flip_heatmap +from alphapose.models import builder +from alphapose.utils.config import update_config +from detector.apis import get_detector +from alphapose.utils.vis import getTime + +"""----------------------------- Demo options -----------------------------""" +parser = argparse.ArgumentParser(description='AlphaPose Single-Image Demo') +parser.add_argument('--cfg', type=str, required=True, + help='experiment configure file name') +parser.add_argument('--checkpoint', type=str, required=True, + help='checkpoint file name') +parser.add_argument('--detector', dest='detector', + help='detector name', default="yolo") +parser.add_argument('--image', dest='inputimg', + help='image-name', default="") +parser.add_argument('--save_img', default=False, action='store_true', + help='save result as image') +parser.add_argument('--vis', default=False, action='store_true', + help='visualize image') +parser.add_argument('--showbox', default=False, action='store_true', + help='visualize human bbox') +parser.add_argument('--profile', default=False, action='store_true', + help='add speed profiling at screen output') +parser.add_argument('--format', type=str, + help='save in the format of cmu or coco or openpose, option: coco/cmu/open') +parser.add_argument('--min_box_area', type=int, default=0, + help='min box area to filter out') +parser.add_argument('--eval', dest='eval', default=False, action='store_true', + help='save the result json as coco format, using image index(int) instead of image name(str)') +parser.add_argument('--gpus', type=str, dest='gpus', default="0", + help='choose which cuda device to use by index and input comma to use multi gpus, e.g. 0,1,2,3. 
(input -1 for cpu only)') +parser.add_argument('--flip', default=False, action='store_true', + help='enable flip testing') +parser.add_argument('--debug', default=False, action='store_true', + help='print detail information') +parser.add_argument('--vis_fast', dest='vis_fast', + help='use fast rendering', action='store_true', default=False) +"""----------------------------- Tracking options -----------------------------""" +parser.add_argument('--pose_flow', dest='pose_flow', + help='track humans in video with PoseFlow', action='store_true', default=False) +parser.add_argument('--pose_track', dest='pose_track', + help='track humans in video with reid', action='store_true', default=False) + +args = parser.parse_args() +cfg = update_config(args.cfg) + +args.gpus = [int(args.gpus[0])] if torch.cuda.device_count() >= 1 else [-1] +args.device = torch.device("cuda:" + str(args.gpus[0]) if args.gpus[0] >= 0 else "cpu") +args.tracking = args.pose_track or args.pose_flow or args.detector=='tracker' + +class DetectionLoader(): + def __init__(self, detector, cfg, opt): + self.cfg = cfg + self.opt = opt + self.device = opt.device + self.detector = detector + + self._input_size = cfg.DATA_PRESET.IMAGE_SIZE + self._output_size = cfg.DATA_PRESET.HEATMAP_SIZE + + self._sigma = cfg.DATA_PRESET.SIGMA + + pose_dataset = builder.retrieve_dataset(self.cfg.DATASET.TRAIN) + if cfg.DATA_PRESET.TYPE == 'simple': + self.transformation = SimpleTransform( + pose_dataset, scale_factor=0, + input_size=self._input_size, + output_size=self._output_size, + rot=0, sigma=self._sigma, + train=False, add_dpg=False, gpu_device=self.device) + + self.image = (None, None, None, None) + self.det = (None, None, None, None, None, None, None) + self.pose = (None, None, None, None, None, None, None) + + def process(self, im_name, image): + # start to pre process images for object detection + self.image_preprocess(im_name, image) + # start to detect human in images + self.image_detection() + # start to post process cropped human image for pose estimation + self.image_postprocess() + return self + + def image_preprocess(self, im_name, image): + # expected image shape like (1,3,h,w) or (3,h,w) + img = self.detector.image_preprocess(image) + if isinstance(img, np.ndarray): + img = torch.from_numpy(img) + # add one dimension at the front for batch if image shape (3,h,w) + if img.dim() == 3: + img = img.unsqueeze(0) + orig_img = image # scipy.misc.imread(im_name_k, mode='RGB') is depreciated + im_dim = orig_img.shape[1], orig_img.shape[0] + + im_name = os.path.basename(im_name) + + with torch.no_grad(): + im_dim = torch.FloatTensor(im_dim).repeat(1, 2) + + self.image = (img, orig_img, im_name, im_dim) + + def image_detection(self): + imgs, orig_imgs, im_names, im_dim_list = self.image + if imgs is None: + self.det = (None, None, None, None, None, None, None) + return + + with torch.no_grad(): + dets = self.detector.images_detection(imgs, im_dim_list) + if isinstance(dets, int) or dets.shape[0] == 0: + self.det = (orig_imgs, im_names, None, None, None, None, None) + return + if isinstance(dets, np.ndarray): + dets = torch.from_numpy(dets) + dets = dets.cpu() + boxes = dets[:, 1:5] + scores = dets[:, 5:6] + ids = torch.zeros(scores.shape) + + boxes = boxes[dets[:, 0] == 0] + if isinstance(boxes, int) or boxes.shape[0] == 0: + self.det = (orig_imgs, im_names, None, None, None, None, None) + return + inps = torch.zeros(boxes.size(0), 3, *self._input_size) + cropped_boxes = torch.zeros(boxes.size(0), 4) + + self.det = (orig_imgs, im_names, 
boxes, scores[dets[:, 0] == 0], ids[dets[:, 0] == 0], inps, cropped_boxes) + + def image_postprocess(self): + with torch.no_grad(): + (orig_img, im_name, boxes, scores, ids, inps, cropped_boxes) = self.det + if orig_img is None: + self.pose = (None, None, None, None, None, None, None) + return + if boxes is None or boxes.nelement() == 0: + self.pose = (None, orig_img, im_name, boxes, scores, ids, None) + return + + for i, box in enumerate(boxes): + inps[i], cropped_box = self.transformation.test_transform(orig_img, box) + cropped_boxes[i] = torch.FloatTensor(cropped_box) + + self.pose = (inps, orig_img, im_name, boxes, scores, ids, cropped_boxes) + + def read(self): + return self.pose + + +class DataWriter(): + def __init__(self, cfg, opt): + self.cfg = cfg + self.opt = opt + + self.eval_joints = list(range(cfg.DATA_PRESET.NUM_JOINTS)) + self.heatmap_to_coord = get_func_heatmap_to_coord(cfg) + self.item = (None, None, None, None, None, None, None) + + def start(self): + # start to read pose estimation results + return self.update() + + def update(self): + norm_type = self.cfg.LOSS.get('NORM_TYPE', None) + hm_size = self.cfg.DATA_PRESET.HEATMAP_SIZE + + # get item + (boxes, scores, ids, hm_data, cropped_boxes, orig_img, im_name) = self.item + if orig_img is None: + return None + # image channel RGB->BGR + orig_img = np.array(orig_img, dtype=np.uint8)[:, :, ::-1] + self.orig_img = orig_img + if boxes is None or len(boxes) == 0: + return None + else: + # location prediction (n, kp, 2) | score prediction (n, kp, 1) + assert hm_data.dim() == 4 + if hm_data.size()[1] == 136: + self.eval_joints = [*range(0,136)] + elif hm_data.size()[1] == 26: + self.eval_joints = [*range(0,26)] + pose_coords = [] + pose_scores = [] + + for i in range(hm_data.shape[0]): + bbox = cropped_boxes[i].tolist() + pose_coord, pose_score = self.heatmap_to_coord(hm_data[i][self.eval_joints], bbox, hm_shape=hm_size, norm_type=norm_type) + pose_coords.append(torch.from_numpy(pose_coord).unsqueeze(0)) + pose_scores.append(torch.from_numpy(pose_score).unsqueeze(0)) + preds_img = torch.cat(pose_coords) + preds_scores = torch.cat(pose_scores) + + boxes, scores, ids, preds_img, preds_scores, pick_ids = \ + pose_nms(boxes, scores, ids, preds_img, preds_scores, self.opt.min_box_area) + + _result = [] + for k in range(len(scores)): + _result.append( + { + 'keypoints':preds_img[k], + 'kp_score':preds_scores[k], + 'proposal_score': torch.mean(preds_scores[k]) + scores[k] + 1.25 * max(preds_scores[k]), + 'idx':ids[k], + 'bbox':[boxes[k][0], boxes[k][1], boxes[k][2]-boxes[k][0],boxes[k][3]-boxes[k][1]] + } + ) + + result = { + 'imgname': im_name, + 'result': _result + } + + if hm_data.size()[1] == 49: + from alphapose.utils.vis import vis_frame_dense as vis_frame + elif self.opt.vis_fast: + from alphapose.utils.vis import vis_frame_fast as vis_frame + else: + from alphapose.utils.vis import vis_frame + self.vis_frame = vis_frame + + return result + + def save(self, boxes, scores, ids, hm_data, cropped_boxes, orig_img, im_name): + self.item = (boxes, scores, ids, hm_data, cropped_boxes, orig_img, im_name) + +class SingleImageAlphaPose(): + def __init__(self, args, cfg): + self.args = args + self.cfg = cfg + + # Load pose model + self.pose_model = builder.build_sppe(cfg.MODEL, preset_cfg=cfg.DATA_PRESET) + + print(f'Loading pose model from {args.checkpoint}...') + self.pose_model.load_state_dict(torch.load(args.checkpoint, map_location=args.device)) + self.pose_dataset = builder.retrieve_dataset(cfg.DATASET.TRAIN) + + 
self.pose_model.to(args.device) + self.pose_model.eval() + + self.det_loader = DetectionLoader(get_detector(self.args), self.cfg, self.args) + + def process(self, im_name, image): + # Init data writer + self.writer = DataWriter(self.cfg, self.args) + + runtime_profile = { + 'dt': [], + 'pt': [], + 'pn': [] + } + pose = None + try: + start_time = getTime() + with torch.no_grad(): + (inps, orig_img, im_name, boxes, scores, ids, cropped_boxes) = self.det_loader.process(im_name, image).read() + if orig_img is None: + raise Exception("no image is given") + if boxes is None or boxes.nelement() == 0: + if self.args.profile: + ckpt_time, det_time = getTime(start_time) + runtime_profile['dt'].append(det_time) + self.writer.save(None, None, None, None, None, orig_img, im_name) + if self.args.profile: + ckpt_time, pose_time = getTime(ckpt_time) + runtime_profile['pt'].append(pose_time) + pose = self.writer.start() + if self.args.profile: + ckpt_time, post_time = getTime(ckpt_time) + runtime_profile['pn'].append(post_time) + else: + if self.args.profile: + ckpt_time, det_time = getTime(start_time) + runtime_profile['dt'].append(det_time) + # Pose Estimation + inps = inps.to(self.args.device) + if self.args.flip: + inps = torch.cat((inps, flip(inps))) + hm = self.pose_model(inps) + if self.args.flip: + hm_flip = flip_heatmap(hm[int(len(hm) / 2):], self.pose_dataset.joint_pairs, shift=True) + hm = (hm[0:int(len(hm) / 2)] + hm_flip) / 2 + if self.args.profile: + ckpt_time, pose_time = getTime(ckpt_time) + runtime_profile['pt'].append(pose_time) + hm = hm.cpu() + self.writer.save(boxes, scores, ids, hm, cropped_boxes, orig_img, im_name) + pose = self.writer.start() + if self.args.profile: + ckpt_time, post_time = getTime(ckpt_time) + runtime_profile['pn'].append(post_time) + + if self.args.profile: + print( + 'det time: {dt:.4f} | pose time: {pt:.4f} | post processing: {pn:.4f}'.format( + dt=np.mean(runtime_profile['dt']), pt=np.mean(runtime_profile['pt']), pn=np.mean(runtime_profile['pn'])) + ) + print('===========================> Finish Model Running.') + except Exception as e: + print(repr(e)) + print('An error as above occurs when processing the images, please check it') + pass + except KeyboardInterrupt: + print('===========================> Finish Model Running.') + + return pose + + def getImg(self): + return self.writer.orig_img + + def vis(self, image, pose): + if pose is not None: + image = self.writer.vis_frame(image, pose, self.writer.opt) + return image + + def writeJson(self, final_result, outputpath, form='coco', for_eval=False): + from alphapose.utils.pPose_nms import write_json + write_json(final_result, outputpath, form=form, for_eval=for_eval) + print("Results have been written to json.") + +def example(): + outputpath = "examples/res/" + if not os.path.exists(outputpath + '/vis'): + os.mkdir(outputpath + '/vis') + + demo = SingleImageAlphaPose(args, cfg) + im_name = args.inputimg # the path to the target image + image = cv2.cvtColor(cv2.imread(im_name), cv2.COLOR_BGR2RGB) + pose = demo.process(im_name, image) + img = demo.getImg() # or you can just use: img = cv2.imread(image) + img = demo.vis(img, pose) # visulize the pose result + cv2.imwrite(os.path.join(outputpath, 'vis', os.path.basename(im_name)), img) + + # if you want to vis the img: + # cv2.imshow("AlphaPose Demo", img) + # cv2.waitKey(30) + + # write the result to json: + result = [pose] + demo.writeJson(result, outputpath, form=args.format, for_eval=args.eval) + +if __name__ == "__main__": + example() diff --git 
a/cv/pose/alphapose/pytorch/trackers/ReidModels/osnet_ain.py b/cv/pose/alphapose/pytorch/trackers/ReidModels/osnet_ain.py index 9981123e4..de2505c3e 100755 --- a/cv/pose/alphapose/pytorch/trackers/ReidModels/osnet_ain.py +++ b/cv/pose/alphapose/pytorch/trackers/ReidModels/osnet_ain.py @@ -1,540 +1,540 @@ -from __future__ import division, absolute_import -import warnings -import torch -from torch import nn -from torch.nn import functional as F - -__all__ = ['osnet_ain_x1_0'] - -pretrained_urls = { - 'osnet_ain_x1_0': - 'https://drive.google.com/uc?id=1-CaioD9NaqbHK_kzSMW8VE4_3KcsRjEo' -} - - -########## -# Basic layers -########## -class ConvLayer(nn.Module): - """Convolution layer (conv + bn + relu).""" - - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - groups=1, - IN=False - ): - super(ConvLayer, self).__init__() - self.conv = nn.Conv2d( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - bias=False, - groups=groups - ) - if IN: - self.bn = nn.InstanceNorm2d(out_channels, affine=True) - else: - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return self.relu(x) - - -class Conv1x1(nn.Module): - """1x1 convolution + bn + relu.""" - - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv1x1, self).__init__() - self.conv = nn.Conv2d( - in_channels, - out_channels, - 1, - stride=stride, - padding=0, - bias=False, - groups=groups - ) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return self.relu(x) - - -class Conv1x1Linear(nn.Module): - """1x1 convolution + bn (w/o non-linearity).""" - - def __init__(self, in_channels, out_channels, stride=1, bn=True): - super(Conv1x1Linear, self).__init__() - self.conv = nn.Conv2d( - in_channels, out_channels, 1, stride=stride, padding=0, bias=False - ) - self.bn = None - if bn: - self.bn = nn.BatchNorm2d(out_channels) - - def forward(self, x): - x = self.conv(x) - if self.bn is not None: - x = self.bn(x) - return x - - -class Conv3x3(nn.Module): - """3x3 convolution + bn + relu.""" - - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv3x3, self).__init__() - self.conv = nn.Conv2d( - in_channels, - out_channels, - 3, - stride=stride, - padding=1, - bias=False, - groups=groups - ) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return self.relu(x) - - -class LightConv3x3(nn.Module): - """Lightweight 3x3 convolution. - 1x1 (linear) + dw 3x3 (nonlinear). 
- """ - - def __init__(self, in_channels, out_channels): - super(LightConv3x3, self).__init__() - self.conv1 = nn.Conv2d( - in_channels, out_channels, 1, stride=1, padding=0, bias=False - ) - self.conv2 = nn.Conv2d( - out_channels, - out_channels, - 3, - stride=1, - padding=1, - bias=False, - groups=out_channels - ) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.bn(x) - return self.relu(x) - - -class LightConvStream(nn.Module): - """Lightweight convolution stream.""" - - def __init__(self, in_channels, out_channels, depth): - super(LightConvStream, self).__init__() - assert depth >= 1, 'depth must be equal to or larger than 1, but got {}'.format( - depth - ) - layers = [] - layers += [LightConv3x3(in_channels, out_channels)] - for i in range(depth - 1): - layers += [LightConv3x3(out_channels, out_channels)] - self.layers = nn.Sequential(*layers) - - def forward(self, x): - return self.layers(x) - - -########## -# Building blocks for omni-scale feature learning -########## -class ChannelGate(nn.Module): - """A mini-network that generates channel-wise gates conditioned on input tensor.""" - - def __init__( - self, - in_channels, - num_gates=None, - return_gates=False, - gate_activation='sigmoid', - reduction=16, - layer_norm=False - ): - super(ChannelGate, self).__init__() - if num_gates is None: - num_gates = in_channels - self.return_gates = return_gates - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc1 = nn.Conv2d( - in_channels, - in_channels // reduction, - kernel_size=1, - bias=True, - padding=0 - ) - self.norm1 = None - if layer_norm: - self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) - self.relu = nn.ReLU() - self.fc2 = nn.Conv2d( - in_channels // reduction, - num_gates, - kernel_size=1, - bias=True, - padding=0 - ) - if gate_activation == 'sigmoid': - self.gate_activation = nn.Sigmoid() - elif gate_activation == 'relu': - self.gate_activation = nn.ReLU() - elif gate_activation == 'linear': - self.gate_activation = None - else: - raise RuntimeError( - "Unknown gate activation: {}".format(gate_activation) - ) - - def forward(self, x): - input = x - x = self.global_avgpool(x) - x = self.fc1(x) - if self.norm1 is not None: - x = self.norm1(x) - x = self.relu(x) - x = self.fc2(x) - if self.gate_activation is not None: - x = self.gate_activation(x) - if self.return_gates: - return x - return input * x - - -class OSBlock(nn.Module): - """Omni-scale feature learning block.""" - - def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): - super(OSBlock, self).__init__() - assert T >= 1 - assert out_channels >= reduction and out_channels % reduction == 0 - mid_channels = out_channels // reduction - - self.conv1 = Conv1x1(in_channels, mid_channels) - self.conv2 = nn.ModuleList() - for t in range(1, T + 1): - self.conv2 += [LightConvStream(mid_channels, mid_channels, t)] - self.gate = ChannelGate(mid_channels) - self.conv3 = Conv1x1Linear(mid_channels, out_channels) - self.downsample = None - if in_channels != out_channels: - self.downsample = Conv1x1Linear(in_channels, out_channels) - - def forward(self, x): - identity = x - x1 = self.conv1(x) - x2 = 0 - for conv2_t in self.conv2: - x2_t = conv2_t(x1) - x2 = x2 + self.gate(x2_t) - x3 = self.conv3(x2) - if self.downsample is not None: - identity = self.downsample(identity) - out = x3 + identity - return F.relu(out) - - -class OSBlockINin(nn.Module): - """Omni-scale feature learning block with instance 
normalization.""" - - def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): - super(OSBlockINin, self).__init__() - assert T >= 1 - assert out_channels >= reduction and out_channels % reduction == 0 - mid_channels = out_channels // reduction - - self.conv1 = Conv1x1(in_channels, mid_channels) - self.conv2 = nn.ModuleList() - for t in range(1, T + 1): - self.conv2 += [LightConvStream(mid_channels, mid_channels, t)] - self.gate = ChannelGate(mid_channels) - self.conv3 = Conv1x1Linear(mid_channels, out_channels, bn=False) - self.downsample = None - if in_channels != out_channels: - self.downsample = Conv1x1Linear(in_channels, out_channels) - self.IN = nn.InstanceNorm2d(out_channels, affine=True) - - def forward(self, x): - identity = x - x1 = self.conv1(x) - x2 = 0 - for conv2_t in self.conv2: - x2_t = conv2_t(x1) - x2 = x2 + self.gate(x2_t) - x3 = self.conv3(x2) - x3 = self.IN(x3) # IN inside residual - if self.downsample is not None: - identity = self.downsample(identity) - out = x3 + identity - return F.relu(out) - - -########## -# Network architecture -########## -class OSNet(nn.Module): - """Omni-Scale Network. - - Reference: - - Zhou et al. Omni-Scale Feature Learning for Person Re-Identification. ICCV, 2019. - - Zhou et al. Learning Generalisable Omni-Scale Representations - for Person Re-Identification. arXiv preprint, 2019. - """ - - def __init__( - self, - num_classes, - blocks, - layers, - channels, - feature_dim=512, - loss='softmax', - conv1_IN=False, - **kwargs - ): - super(OSNet, self).__init__() - num_blocks = len(blocks) - assert num_blocks == len(layers) - assert num_blocks == len(channels) - 1 - self.loss = loss - self.feature_dim = feature_dim - - # convolutional backbone - self.conv1 = ConvLayer( - 3, channels[0], 7, stride=2, padding=3, IN=conv1_IN - ) - self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) - self.conv2 = self._make_layer( - blocks[0], layers[0], channels[0], channels[1] - ) - self.pool2 = nn.Sequential( - Conv1x1(channels[1], channels[1]), nn.AvgPool2d(2, stride=2) - ) - self.conv3 = self._make_layer( - blocks[1], layers[1], channels[1], channels[2] - ) - self.pool3 = nn.Sequential( - Conv1x1(channels[2], channels[2]), nn.AvgPool2d(2, stride=2) - ) - self.conv4 = self._make_layer( - blocks[2], layers[2], channels[2], channels[3] - ) - self.conv5 = Conv1x1(channels[3], channels[3]) - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - # fully connected layer - self.fc = self._construct_fc_layer( - self.feature_dim, channels[3], dropout_p=None - ) - # identity classification layer - self.classifier = nn.Linear(self.feature_dim, num_classes) - - self._init_params() - - def _make_layer(self, blocks, layer, in_channels, out_channels): - layers = [] - layers += [blocks[0](in_channels, out_channels)] - for i in range(1, len(blocks)): - layers += [blocks[i](out_channels, out_channels)] - return nn.Sequential(*layers) - - def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): - if fc_dims is None or fc_dims < 0: - self.feature_dim = input_dim - return None - - if isinstance(fc_dims, int): - fc_dims = [fc_dims] - - layers = [] - for dim in fc_dims: - layers.append(nn.Linear(input_dim, dim)) - layers.append(nn.BatchNorm1d(dim)) - layers.append(nn.ReLU()) - if dropout_p is not None: - layers.append(nn.Dropout(p=dropout_p)) - input_dim = dim - - self.feature_dim = fc_dims[-1] - - return nn.Sequential(*layers) - - def _init_params(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_( - m.weight, 
mode='fan_out', nonlinearity='relu' - ) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - elif isinstance(m, nn.BatchNorm1d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - elif isinstance(m, nn.InstanceNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - - def featuremaps(self, x): - x = self.conv1(x) - x = self.maxpool(x) - x = self.conv2(x) - x = self.pool2(x) - x = self.conv3(x) - x = self.pool3(x) - x = self.conv4(x) - x = self.conv5(x) - return x - - def forward(self, x, return_featuremaps=False): - x = self.featuremaps(x) - if return_featuremaps: - return x - v = self.global_avgpool(x) - v = v.view(v.size(0), -1) - if self.fc is not None: - v = self.fc(v) - if not self.training: - return v - y = self.classifier(v) - if self.loss == 'softmax': - return y - elif self.loss == 'triplet': - return y, v - else: - raise KeyError("Unsupported loss: {}".format(self.loss)) - - -def init_pretrained_weights(model, key=''): - """Initializes model with pretrained weights. - - Layers that don't match with pretrained layers in name or size are kept unchanged. - """ - import os - import errno - import gdown - from collections import OrderedDict - - def _get_torch_home(): - ENV_TORCH_HOME = 'TORCH_HOME' - ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' - DEFAULT_CACHE_DIR = '~/.cache' - torch_home = os.path.expanduser( - os.getenv( - ENV_TORCH_HOME, - os.path.join( - os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'torch' - ) - ) - ) - return torch_home - - torch_home = _get_torch_home() - model_dir = os.path.join(torch_home, 'checkpoints') - try: - os.makedirs(model_dir) - except OSError as e: - if e.errno == errno.EEXIST: - # Directory already exists, ignore. - pass - else: - # Unexpected OSError, re-raise. - raise - filename = key + '_imagenet.pth' - cached_file = os.path.join(model_dir, filename) - - if not os.path.exists(cached_file): - gdown.download(pretrained_urls[key], cached_file, quiet=False) - - state_dict = torch.load(cached_file) - model_dict = model.state_dict() - new_state_dict = OrderedDict() - matched_layers, discarded_layers = [], [] - - for k, v in state_dict.items(): - if k.startswith('module.'): - k = k[7:] # discard module. - - if k in model_dict and model_dict[k].size() == v.size(): - new_state_dict[k] = v - matched_layers.append(k) - else: - discarded_layers.append(k) - - model_dict.update(new_state_dict) - model.load_state_dict(model_dict) - - if len(matched_layers) == 0: - warnings.warn( - 'The pretrained weights from "{}" cannot be loaded, ' - 'please check the key names manually ' - '(** ignored and continue **)'.format(cached_file) - ) - else: - print( - 'Successfully loaded imagenet pretrained weights from "{}"'. - format(cached_file) - ) - if len(discarded_layers) > 0: - print( - '** The following layers are discarded ' - 'due to unmatched keys or layer size: {}'. 
- format(discarded_layers) - ) - - -########## -# Instantiation -########## -def osnet_ain_x1_0( - num_classes=1000, pretrained=True, loss='softmax', **kwargs -): - model = OSNet( - num_classes, - blocks=[ - [OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], - [OSBlockINin, OSBlock] - ], - layers=[2, 2, 2], - channels=[64, 256, 384, 512], - loss=loss, - conv1_IN=True, - **kwargs - ) - if pretrained: - init_pretrained_weights(model, key='osnet_ain_x1_0') +from __future__ import division, absolute_import +import warnings +import torch +from torch import nn +from torch.nn import functional as F + +__all__ = ['osnet_ain_x1_0'] + +pretrained_urls = { + 'osnet_ain_x1_0': + 'https://drive.google.com/uc?id=1-CaioD9NaqbHK_kzSMW8VE4_3KcsRjEo' +} + + +########## +# Basic layers +########## +class ConvLayer(nn.Module): + """Convolution layer (conv + bn + relu).""" + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + groups=1, + IN=False + ): + super(ConvLayer, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=False, + groups=groups + ) + if IN: + self.bn = nn.InstanceNorm2d(out_channels, affine=True) + else: + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) + + +class Conv1x1(nn.Module): + """1x1 convolution + bn + relu.""" + + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv1x1, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + 1, + stride=stride, + padding=0, + bias=False, + groups=groups + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) + + +class Conv1x1Linear(nn.Module): + """1x1 convolution + bn (w/o non-linearity).""" + + def __init__(self, in_channels, out_channels, stride=1, bn=True): + super(Conv1x1Linear, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, 1, stride=stride, padding=0, bias=False + ) + self.bn = None + if bn: + self.bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + return x + + +class Conv3x3(nn.Module): + """3x3 convolution + bn + relu.""" + + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv3x3, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + 3, + stride=stride, + padding=1, + bias=False, + groups=groups + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) + + +class LightConv3x3(nn.Module): + """Lightweight 3x3 convolution. + 1x1 (linear) + dw 3x3 (nonlinear). 
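The factorization in LightConv3x3 (a 1x1 pointwise convolution followed by a depthwise 3x3 convolution, groups=out_channels) is what keeps the omni-scale streams cheap. As a rough parameter count, ignoring BatchNorm and assuming C_in input and C_out output channels:

    standard 3x3 conv:          9 * C_in * C_out
    1x1 pointwise + dw 3x3:     C_in * C_out + 9 * C_out

so for C_in = C_out = C the factored block uses roughly (C + 9) / (9 C) of the parameters of a full 3x3 convolution.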
+ """ + + def __init__(self, in_channels, out_channels): + super(LightConv3x3, self).__init__() + self.conv1 = nn.Conv2d( + in_channels, out_channels, 1, stride=1, padding=0, bias=False + ) + self.conv2 = nn.Conv2d( + out_channels, + out_channels, + 3, + stride=1, + padding=1, + bias=False, + groups=out_channels + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.bn(x) + return self.relu(x) + + +class LightConvStream(nn.Module): + """Lightweight convolution stream.""" + + def __init__(self, in_channels, out_channels, depth): + super(LightConvStream, self).__init__() + assert depth >= 1, 'depth must be equal to or larger than 1, but got {}'.format( + depth + ) + layers = [] + layers += [LightConv3x3(in_channels, out_channels)] + for i in range(depth - 1): + layers += [LightConv3x3(out_channels, out_channels)] + self.layers = nn.Sequential(*layers) + + def forward(self, x): + return self.layers(x) + + +########## +# Building blocks for omni-scale feature learning +########## +class ChannelGate(nn.Module): + """A mini-network that generates channel-wise gates conditioned on input tensor.""" + + def __init__( + self, + in_channels, + num_gates=None, + return_gates=False, + gate_activation='sigmoid', + reduction=16, + layer_norm=False + ): + super(ChannelGate, self).__init__() + if num_gates is None: + num_gates = in_channels + self.return_gates = return_gates + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Conv2d( + in_channels, + in_channels // reduction, + kernel_size=1, + bias=True, + padding=0 + ) + self.norm1 = None + if layer_norm: + self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) + self.relu = nn.ReLU() + self.fc2 = nn.Conv2d( + in_channels // reduction, + num_gates, + kernel_size=1, + bias=True, + padding=0 + ) + if gate_activation == 'sigmoid': + self.gate_activation = nn.Sigmoid() + elif gate_activation == 'relu': + self.gate_activation = nn.ReLU() + elif gate_activation == 'linear': + self.gate_activation = None + else: + raise RuntimeError( + "Unknown gate activation: {}".format(gate_activation) + ) + + def forward(self, x): + input = x + x = self.global_avgpool(x) + x = self.fc1(x) + if self.norm1 is not None: + x = self.norm1(x) + x = self.relu(x) + x = self.fc2(x) + if self.gate_activation is not None: + x = self.gate_activation(x) + if self.return_gates: + return x + return input * x + + +class OSBlock(nn.Module): + """Omni-scale feature learning block.""" + + def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): + super(OSBlock, self).__init__() + assert T >= 1 + assert out_channels >= reduction and out_channels % reduction == 0 + mid_channels = out_channels // reduction + + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2 = nn.ModuleList() + for t in range(1, T + 1): + self.conv2 += [LightConvStream(mid_channels, mid_channels, t)] + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2 = 0 + for conv2_t in self.conv2: + x2_t = conv2_t(x1) + x2 = x2 + self.gate(x2_t) + x3 = self.conv3(x2) + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + return F.relu(out) + + +class OSBlockINin(nn.Module): + """Omni-scale feature learning block with instance 
normalization.""" + + def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): + super(OSBlockINin, self).__init__() + assert T >= 1 + assert out_channels >= reduction and out_channels % reduction == 0 + mid_channels = out_channels // reduction + + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2 = nn.ModuleList() + for t in range(1, T + 1): + self.conv2 += [LightConvStream(mid_channels, mid_channels, t)] + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels, bn=False) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + self.IN = nn.InstanceNorm2d(out_channels, affine=True) + + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2 = 0 + for conv2_t in self.conv2: + x2_t = conv2_t(x1) + x2 = x2 + self.gate(x2_t) + x3 = self.conv3(x2) + x3 = self.IN(x3) # IN inside residual + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + return F.relu(out) + + +########## +# Network architecture +########## +class OSNet(nn.Module): + """Omni-Scale Network. + + Reference: + - Zhou et al. Omni-Scale Feature Learning for Person Re-Identification. ICCV, 2019. + - Zhou et al. Learning Generalisable Omni-Scale Representations + for Person Re-Identification. arXiv preprint, 2019. + """ + + def __init__( + self, + num_classes, + blocks, + layers, + channels, + feature_dim=512, + loss='softmax', + conv1_IN=False, + **kwargs + ): + super(OSNet, self).__init__() + num_blocks = len(blocks) + assert num_blocks == len(layers) + assert num_blocks == len(channels) - 1 + self.loss = loss + self.feature_dim = feature_dim + + # convolutional backbone + self.conv1 = ConvLayer( + 3, channels[0], 7, stride=2, padding=3, IN=conv1_IN + ) + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + self.conv2 = self._make_layer( + blocks[0], layers[0], channels[0], channels[1] + ) + self.pool2 = nn.Sequential( + Conv1x1(channels[1], channels[1]), nn.AvgPool2d(2, stride=2) + ) + self.conv3 = self._make_layer( + blocks[1], layers[1], channels[1], channels[2] + ) + self.pool3 = nn.Sequential( + Conv1x1(channels[2], channels[2]), nn.AvgPool2d(2, stride=2) + ) + self.conv4 = self._make_layer( + blocks[2], layers[2], channels[2], channels[3] + ) + self.conv5 = Conv1x1(channels[3], channels[3]) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + # fully connected layer + self.fc = self._construct_fc_layer( + self.feature_dim, channels[3], dropout_p=None + ) + # identity classification layer + self.classifier = nn.Linear(self.feature_dim, num_classes) + + self._init_params() + + def _make_layer(self, blocks, layer, in_channels, out_channels): + layers = [] + layers += [blocks[0](in_channels, out_channels)] + for i in range(1, len(blocks)): + layers += [blocks[i](out_channels, out_channels)] + return nn.Sequential(*layers) + + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + if fc_dims is None or fc_dims < 0: + self.feature_dim = input_dim + return None + + if isinstance(fc_dims, int): + fc_dims = [fc_dims] + + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU()) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + + self.feature_dim = fc_dims[-1] + + return nn.Sequential(*layers) + + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, 
mode='fan_out', nonlinearity='relu' + ) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + elif isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + elif isinstance(m, nn.InstanceNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def featuremaps(self, x): + x = self.conv1(x) + x = self.maxpool(x) + x = self.conv2(x) + x = self.pool2(x) + x = self.conv3(x) + x = self.pool3(x) + x = self.conv4(x) + x = self.conv5(x) + return x + + def forward(self, x, return_featuremaps=False): + x = self.featuremaps(x) + if return_featuremaps: + return x + v = self.global_avgpool(x) + v = v.view(v.size(0), -1) + if self.fc is not None: + v = self.fc(v) + if not self.training: + return v + y = self.classifier(v) + if self.loss == 'softmax': + return y + elif self.loss == 'triplet': + return y, v + else: + raise KeyError("Unsupported loss: {}".format(self.loss)) + + +def init_pretrained_weights(model, key=''): + """Initializes model with pretrained weights. + + Layers that don't match with pretrained layers in name or size are kept unchanged. + """ + import os + import errno + import gdown + from collections import OrderedDict + + def _get_torch_home(): + ENV_TORCH_HOME = 'TORCH_HOME' + ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' + DEFAULT_CACHE_DIR = '~/.cache' + torch_home = os.path.expanduser( + os.getenv( + ENV_TORCH_HOME, + os.path.join( + os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'torch' + ) + ) + ) + return torch_home + + torch_home = _get_torch_home() + model_dir = os.path.join(torch_home, 'checkpoints') + try: + os.makedirs(model_dir) + except OSError as e: + if e.errno == errno.EEXIST: + # Directory already exists, ignore. + pass + else: + # Unexpected OSError, re-raise. + raise + filename = key + '_imagenet.pth' + cached_file = os.path.join(model_dir, filename) + + if not os.path.exists(cached_file): + gdown.download(pretrained_urls[key], cached_file, quiet=False) + + state_dict = torch.load(cached_file) + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + + for k, v in state_dict.items(): + if k.startswith('module.'): + k = k[7:] # discard module. + + if k in model_dict and model_dict[k].size() == v.size(): + new_state_dict[k] = v + matched_layers.append(k) + else: + discarded_layers.append(k) + + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) + + if len(matched_layers) == 0: + warnings.warn( + 'The pretrained weights from "{}" cannot be loaded, ' + 'please check the key names manually ' + '(** ignored and continue **)'.format(cached_file) + ) + else: + print( + 'Successfully loaded imagenet pretrained weights from "{}"'. + format(cached_file) + ) + if len(discarded_layers) > 0: + print( + '** The following layers are discarded ' + 'due to unmatched keys or layer size: {}'. 
+ format(discarded_layers) + ) + + +########## +# Instantiation +########## +def osnet_ain_x1_0( + num_classes=1000, pretrained=True, loss='softmax', **kwargs +): + model = OSNet( + num_classes, + blocks=[ + [OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], + [OSBlockINin, OSBlock] + ], + layers=[2, 2, 2], + channels=[64, 256, 384, 512], + loss=loss, + conv1_IN=True, + **kwargs + ) + if pretrained: + init_pretrained_weights(model, key='osnet_ain_x1_0') return model \ No newline at end of file diff --git a/cv/pose/alphapose/pytorch/trackers/ReidModels/psroi_pooling/src/psroi_pooling_cuda.c b/cv/pose/alphapose/pytorch/trackers/ReidModels/psroi_pooling/src/psroi_pooling_cuda.c index 43d6dd2fb..fbbb3d212 100755 --- a/cv/pose/alphapose/pytorch/trackers/ReidModels/psroi_pooling/src/psroi_pooling_cuda.c +++ b/cv/pose/alphapose/pytorch/trackers/ReidModels/psroi_pooling/src/psroi_pooling_cuda.c @@ -1,68 +1,68 @@ -#include -#include -#include "cuda/psroi_pooling_kernel.h" - - - -extern THCState* state; - -int psroi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale, int group_size, int output_dim,THCudaTensor *features, THCudaTensor* rois, THCudaTensor* output, THCudaIntTensor* mappingchannel){ - float* data_in = THCudaTensor_data(state, features); - float* rois_in = THCudaTensor_data(state, rois); - float* output_out = THCudaTensor_data(state, output); - int* mappingchannel_out = THCudaIntTensor_data(state, mappingchannel); - //Get # of Rois - int num_rois = THCudaTensor_size(state, rois, 0); - int size_rois = THCudaTensor_size(state, rois, 1); - if (size_rois!=5) - { - return -1; - } - - //Get # of batch_size - int batch_size = THCudaTensor_size(state, features, 0); - - int data_height = THCudaTensor_size(state, features, 2); - int data_width = THCudaTensor_size(state, features, 3); - int num_channels = THCudaTensor_size(state, features, 1); - - cudaStream_t stream = THCState_getCurrentStream(state); - - // call the gpu kernel for psroi_pooling - PSROIPoolForwardLauncher(data_in, spatial_scale, num_rois, data_height, data_width, num_channels, pooled_height, pooled_width,rois_in, group_size, - output_dim, output_out, mappingchannel_out,stream); - return 1; -} - - -int psroi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale, int output_dim, -THCudaTensor* top_grad, THCudaTensor* rois, THCudaTensor* bottom_grad, THCudaIntTensor* mappingchannel) -{ - float *top_grad_flat = THCudaTensor_data(state, top_grad); - float *rois_flat = THCudaTensor_data(state, rois); - - float *bottom_grad_flat = THCudaTensor_data(state, bottom_grad); - int *mappingchannel_flat = THCudaIntTensor_data(state, mappingchannel); - - // Number of ROIs - int num_rois = THCudaTensor_size(state, rois, 0); - int size_rois = THCudaTensor_size(state, rois, 1); - if (size_rois != 5) - { - return -1; - } - // batch size - int batch_size = THCudaTensor_size(state, bottom_grad, 0); - - // data height - int data_height = THCudaTensor_size(state, bottom_grad, 2); - // data width - int data_width = THCudaTensor_size(state, bottom_grad, 3); - // Number of channels - int num_channels = THCudaTensor_size(state, bottom_grad, 1); - - cudaStream_t stream = THCState_getCurrentStream(state); - - PSROIPoolBackwardLauncher(top_grad_flat, mappingchannel_flat, batch_size, num_rois, spatial_scale, num_channels, data_height, data_width, pooled_width, pooled_height, output_dim, bottom_grad_flat, rois_flat, stream); - return 1; -} +#include +#include +#include "cuda/psroi_pooling_kernel.h" + + + 
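The binding that follows is written against the legacy THC C API (THCState, THCudaTensor, THCState_getCurrentStream), which has been removed from recent PyTorch releases. For reference, a minimal, untested sketch of the same forward binding against the current C++ extension API is shown here; the launcher name and kernel header are taken from this file, while everything else (argument checks, module registration) is an assumption:

    #include <torch/extension.h>
    #include <ATen/cuda/CUDAContext.h>
    #include "cuda/psroi_pooling_kernel.h"

    // Same contract as the THC version that follows: rois must be (num_rois, 5);
    // returns -1 on a shape mismatch and 1 on success.
    int psroi_pooling_forward_cuda(int pooled_height, int pooled_width,
                                   float spatial_scale, int group_size, int output_dim,
                                   torch::Tensor features, torch::Tensor rois,
                                   torch::Tensor output, torch::Tensor mappingchannel) {
        TORCH_CHECK(features.is_cuda() && rois.is_cuda(), "inputs must be CUDA tensors");
        if (rois.size(1) != 5) {
            return -1;
        }
        const int num_rois     = rois.size(0);
        const int num_channels = features.size(1);
        const int data_height  = features.size(2);
        const int data_width   = features.size(3);

        cudaStream_t stream = at::cuda::getCurrentCUDAStream();

        PSROIPoolForwardLauncher(features.data_ptr<float>(), spatial_scale, num_rois,
                                 data_height, data_width, num_channels,
                                 pooled_height, pooled_width, rois.data_ptr<float>(),
                                 group_size, output_dim, output.data_ptr<float>(),
                                 mappingchannel.data_ptr<int>(), stream);
        return 1;
    }

    PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
        m.def("psroi_pooling_forward_cuda", &psroi_pooling_forward_cuda,
              "PSROI pooling forward (CUDA)");
    }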
+extern THCState* state; + +int psroi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale, int group_size, int output_dim,THCudaTensor *features, THCudaTensor* rois, THCudaTensor* output, THCudaIntTensor* mappingchannel){ + float* data_in = THCudaTensor_data(state, features); + float* rois_in = THCudaTensor_data(state, rois); + float* output_out = THCudaTensor_data(state, output); + int* mappingchannel_out = THCudaIntTensor_data(state, mappingchannel); + //Get # of Rois + int num_rois = THCudaTensor_size(state, rois, 0); + int size_rois = THCudaTensor_size(state, rois, 1); + if (size_rois!=5) + { + return -1; + } + + //Get # of batch_size + int batch_size = THCudaTensor_size(state, features, 0); + + int data_height = THCudaTensor_size(state, features, 2); + int data_width = THCudaTensor_size(state, features, 3); + int num_channels = THCudaTensor_size(state, features, 1); + + cudaStream_t stream = THCState_getCurrentStream(state); + + // call the gpu kernel for psroi_pooling + PSROIPoolForwardLauncher(data_in, spatial_scale, num_rois, data_height, data_width, num_channels, pooled_height, pooled_width,rois_in, group_size, + output_dim, output_out, mappingchannel_out,stream); + return 1; +} + + +int psroi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale, int output_dim, +THCudaTensor* top_grad, THCudaTensor* rois, THCudaTensor* bottom_grad, THCudaIntTensor* mappingchannel) +{ + float *top_grad_flat = THCudaTensor_data(state, top_grad); + float *rois_flat = THCudaTensor_data(state, rois); + + float *bottom_grad_flat = THCudaTensor_data(state, bottom_grad); + int *mappingchannel_flat = THCudaIntTensor_data(state, mappingchannel); + + // Number of ROIs + int num_rois = THCudaTensor_size(state, rois, 0); + int size_rois = THCudaTensor_size(state, rois, 1); + if (size_rois != 5) + { + return -1; + } + // batch size + int batch_size = THCudaTensor_size(state, bottom_grad, 0); + + // data height + int data_height = THCudaTensor_size(state, bottom_grad, 2); + // data width + int data_width = THCudaTensor_size(state, bottom_grad, 3); + // Number of channels + int num_channels = THCudaTensor_size(state, bottom_grad, 1); + + cudaStream_t stream = THCState_getCurrentStream(state); + + PSROIPoolBackwardLauncher(top_grad_flat, mappingchannel_flat, batch_size, num_rois, spatial_scale, num_channels, data_height, data_width, pooled_width, pooled_height, output_dim, bottom_grad_flat, rois_flat, stream); + return 1; +} diff --git a/cv/pose/alphapose/pytorch/trackers/tracker_cfg.py b/cv/pose/alphapose/pytorch/trackers/tracker_cfg.py index dc2751019..9d6bfe525 100755 --- a/cv/pose/alphapose/pytorch/trackers/tracker_cfg.py +++ b/cv/pose/alphapose/pytorch/trackers/tracker_cfg.py @@ -1,10 +1,10 @@ -from easydict import EasyDict as edict -cfg = edict() -cfg.nid = 1000 -cfg.arch = "osnet_ain" # "osnet" or "res50-fc512" -cfg.loadmodel = "trackers/weights/osnet_ain_x1_0_msmt17_256x128_amsgrad_ep50_lr0.0015_coslr_b64_fb10_softmax_labsmth_flip_jitter.pth" -cfg.frame_rate = 30 -cfg.track_buffer = 240 -cfg.conf_thres = 0.5 -cfg.nms_thres = 0.4 -cfg.iou_thres = 0.5 +from easydict import EasyDict as edict +cfg = edict() +cfg.nid = 1000 +cfg.arch = "osnet_ain" # "osnet" or "res50-fc512" +cfg.loadmodel = "trackers/weights/osnet_ain_x1_0_msmt17_256x128_amsgrad_ep50_lr0.0015_coslr_b64_fb10_softmax_labsmth_flip_jitter.pth" +cfg.frame_rate = 30 +cfg.track_buffer = 240 +cfg.conf_thres = 0.5 +cfg.nms_thres = 0.4 +cfg.iou_thres = 0.5 diff --git 
a/cv/pose/alphapose/pytorch/trackers/tracking/utils/kalman_filter.py b/cv/pose/alphapose/pytorch/trackers/tracking/utils/kalman_filter.py index 29706d8f7..5b10e8d24 100755 --- a/cv/pose/alphapose/pytorch/trackers/tracking/utils/kalman_filter.py +++ b/cv/pose/alphapose/pytorch/trackers/tracking/utils/kalman_filter.py @@ -1,229 +1,229 @@ -# vim: expandtab:ts=4:sw=4 -import numpy as np -import scipy.linalg - - -""" -Table for the 0.95 quantile of the chi-square distribution with N degrees of -freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv -function and used as Mahalanobis gating threshold. -""" -chi2inv95 = { - 1: 3.8415, - 2: 5.9915, - 3: 7.8147, - 4: 9.4877, - 5: 11.070, - 6: 12.592, - 7: 14.067, - 8: 15.507, - 9: 16.919} - - -class KalmanFilter(object): - """ - A simple Kalman filter for tracking bounding boxes in image space. - - The 8-dimensional state space - - x, y, a, h, vx, vy, va, vh - - contains the bounding box center position (x, y), aspect ratio a, height h, - and their respective velocities. - - Object motion follows a constant velocity model. The bounding box location - (x, y, a, h) is taken as direct observation of the state space (linear - observation model). - - """ - - def __init__(self): - ndim, dt = 4, 1. - - # Create Kalman filter model matrices. - self._motion_mat = np.eye(2 * ndim, 2 * ndim) - for i in range(ndim): - self._motion_mat[i, ndim + i] = dt - self._update_mat = np.eye(ndim, 2 * ndim) - - # Motion and observation uncertainty are chosen relative to the current - # state estimate. These weights control the amount of uncertainty in - # the model. This is a bit hacky. - self._std_weight_position = 1. / 20 - self._std_weight_velocity = 1. / 160 - - def initiate(self, measurement): - """Create track from unassociated measurement. - - Parameters - ---------- - measurement : ndarray - Bounding box coordinates (x, y, a, h) with center position (x, y), - aspect ratio a, and height h. - - Returns - ------- - (ndarray, ndarray) - Returns the mean vector (8 dimensional) and covariance matrix (8x8 - dimensional) of the new track. Unobserved velocities are initialized - to 0 mean. - - """ - mean_pos = measurement - mean_vel = np.zeros_like(mean_pos) - mean = np.r_[mean_pos, mean_vel] - - std = [ - 2 * self._std_weight_position * measurement[3], - 2 * self._std_weight_position * measurement[3], - 1e-2, - 2 * self._std_weight_position * measurement[3], - 10 * self._std_weight_velocity * measurement[3], - 10 * self._std_weight_velocity * measurement[3], - 1e-5, - 10 * self._std_weight_velocity * measurement[3]] - covariance = np.diag(np.square(std)) - return mean, covariance - - def predict(self, mean, covariance): - """Run Kalman filter prediction step. - - Parameters - ---------- - mean : ndarray - The 8 dimensional mean vector of the object state at the previous - time step. - covariance : ndarray - The 8x8 dimensional covariance matrix of the object state at the - previous time step. - - Returns - ------- - (ndarray, ndarray) - Returns the mean vector and covariance matrix of the predicted - state. Unobserved velocities are initialized to 0 mean. 
- - """ - std_pos = [ - self._std_weight_position * mean[3], - self._std_weight_position * mean[3], - 1e-2, - self._std_weight_position * mean[3]] - std_vel = [ - self._std_weight_velocity * mean[3], - self._std_weight_velocity * mean[3], - 1e-5, - self._std_weight_velocity * mean[3]] - motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) - - mean = np.dot(self._motion_mat, mean) - covariance = np.linalg.multi_dot(( - self._motion_mat, covariance, self._motion_mat.T)) + motion_cov - - return mean, covariance - - def project(self, mean, covariance): - """Project state distribution to measurement space. - - Parameters - ---------- - mean : ndarray - The state's mean vector (8 dimensional array). - covariance : ndarray - The state's covariance matrix (8x8 dimensional). - - Returns - ------- - (ndarray, ndarray) - Returns the projected mean and covariance matrix of the given state - estimate. - - """ - std = [ - self._std_weight_position * mean[3], - self._std_weight_position * mean[3], - 1e-1, - self._std_weight_position * mean[3]] - innovation_cov = np.diag(np.square(std)) - - mean = np.dot(self._update_mat, mean) - covariance = np.linalg.multi_dot(( - self._update_mat, covariance, self._update_mat.T)) - return mean, covariance + innovation_cov - - def update(self, mean, covariance, measurement): - """Run Kalman filter correction step. - - Parameters - ---------- - mean : ndarray - The predicted state's mean vector (8 dimensional). - covariance : ndarray - The state's covariance matrix (8x8 dimensional). - measurement : ndarray - The 4 dimensional measurement vector (x, y, a, h), where (x, y) - is the center position, a the aspect ratio, and h the height of the - bounding box. - - Returns - ------- - (ndarray, ndarray) - Returns the measurement-corrected state distribution. - - """ - projected_mean, projected_cov = self.project(mean, covariance) - - chol_factor, lower = scipy.linalg.cho_factor( - projected_cov, lower=True, check_finite=False) - kalman_gain = scipy.linalg.cho_solve( - (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, - check_finite=False).T - innovation = measurement - projected_mean - - new_mean = mean + np.dot(innovation, kalman_gain.T) - new_covariance = covariance - np.linalg.multi_dot(( - kalman_gain, projected_cov, kalman_gain.T)) - return new_mean, new_covariance - - def gating_distance(self, mean, covariance, measurements, - only_position=False): - """Compute gating distance between state distribution and measurements. - - A suitable distance threshold can be obtained from `chi2inv95`. If - `only_position` is False, the chi-square distribution has 4 degrees of - freedom, otherwise 2. - - Parameters - ---------- - mean : ndarray - Mean vector over the state distribution (8 dimensional). - covariance : ndarray - Covariance of the state distribution (8x8 dimensional). - measurements : ndarray - An Nx4 dimensional matrix of N measurements, each in - format (x, y, a, h) where (x, y) is the bounding box center - position, a the aspect ratio, and h the height. - only_position : Optional[bool] - If True, distance computation is done with respect to the bounding - box center position only. - - Returns - ------- - ndarray - Returns an array of length N, where the i-th element contains the - squared Mahalanobis distance between (mean, covariance) and - `measurements[i]`. 
- - """ - mean, covariance = self.project(mean, covariance) - if only_position: - mean, covariance = mean[:2], covariance[:2, :2] - measurements = measurements[:, :2] - - cholesky_factor = np.linalg.cholesky(covariance) - d = measurements - mean - z = scipy.linalg.solve_triangular( - cholesky_factor, d.T, lower=True, check_finite=False, - overwrite_b=True) - squared_maha = np.sum(z * z, axis=0) +# vim: expandtab:ts=4:sw=4 +import numpy as np +import scipy.linalg + + +""" +Table for the 0.95 quantile of the chi-square distribution with N degrees of +freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv +function and used as Mahalanobis gating threshold. +""" +chi2inv95 = { + 1: 3.8415, + 2: 5.9915, + 3: 7.8147, + 4: 9.4877, + 5: 11.070, + 6: 12.592, + 7: 14.067, + 8: 15.507, + 9: 16.919} + + +class KalmanFilter(object): + """ + A simple Kalman filter for tracking bounding boxes in image space. + + The 8-dimensional state space + + x, y, a, h, vx, vy, va, vh + + contains the bounding box center position (x, y), aspect ratio a, height h, + and their respective velocities. + + Object motion follows a constant velocity model. The bounding box location + (x, y, a, h) is taken as direct observation of the state space (linear + observation model). + + """ + + def __init__(self): + ndim, dt = 4, 1. + + # Create Kalman filter model matrices. + self._motion_mat = np.eye(2 * ndim, 2 * ndim) + for i in range(ndim): + self._motion_mat[i, ndim + i] = dt + self._update_mat = np.eye(ndim, 2 * ndim) + + # Motion and observation uncertainty are chosen relative to the current + # state estimate. These weights control the amount of uncertainty in + # the model. This is a bit hacky. + self._std_weight_position = 1. / 20 + self._std_weight_velocity = 1. / 160 + + def initiate(self, measurement): + """Create track from unassociated measurement. + + Parameters + ---------- + measurement : ndarray + Bounding box coordinates (x, y, a, h) with center position (x, y), + aspect ratio a, and height h. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector (8 dimensional) and covariance matrix (8x8 + dimensional) of the new track. Unobserved velocities are initialized + to 0 mean. + + """ + mean_pos = measurement + mean_vel = np.zeros_like(mean_pos) + mean = np.r_[mean_pos, mean_vel] + + std = [ + 2 * self._std_weight_position * measurement[3], + 2 * self._std_weight_position * measurement[3], + 1e-2, + 2 * self._std_weight_position * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 1e-5, + 10 * self._std_weight_velocity * measurement[3]] + covariance = np.diag(np.square(std)) + return mean, covariance + + def predict(self, mean, covariance): + """Run Kalman filter prediction step. + + Parameters + ---------- + mean : ndarray + The 8 dimensional mean vector of the object state at the previous + time step. + covariance : ndarray + The 8x8 dimensional covariance matrix of the object state at the + previous time step. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. 
+ + """ + std_pos = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-2, + self._std_weight_position * mean[3]] + std_vel = [ + self._std_weight_velocity * mean[3], + self._std_weight_velocity * mean[3], + 1e-5, + self._std_weight_velocity * mean[3]] + motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) + + mean = np.dot(self._motion_mat, mean) + covariance = np.linalg.multi_dot(( + self._motion_mat, covariance, self._motion_mat.T)) + motion_cov + + return mean, covariance + + def project(self, mean, covariance): + """Project state distribution to measurement space. + + Parameters + ---------- + mean : ndarray + The state's mean vector (8 dimensional array). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + + Returns + ------- + (ndarray, ndarray) + Returns the projected mean and covariance matrix of the given state + estimate. + + """ + std = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-1, + self._std_weight_position * mean[3]] + innovation_cov = np.diag(np.square(std)) + + mean = np.dot(self._update_mat, mean) + covariance = np.linalg.multi_dot(( + self._update_mat, covariance, self._update_mat.T)) + return mean, covariance + innovation_cov + + def update(self, mean, covariance, measurement): + """Run Kalman filter correction step. + + Parameters + ---------- + mean : ndarray + The predicted state's mean vector (8 dimensional). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + measurement : ndarray + The 4 dimensional measurement vector (x, y, a, h), where (x, y) + is the center position, a the aspect ratio, and h the height of the + bounding box. + + Returns + ------- + (ndarray, ndarray) + Returns the measurement-corrected state distribution. + + """ + projected_mean, projected_cov = self.project(mean, covariance) + + chol_factor, lower = scipy.linalg.cho_factor( + projected_cov, lower=True, check_finite=False) + kalman_gain = scipy.linalg.cho_solve( + (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, + check_finite=False).T + innovation = measurement - projected_mean + + new_mean = mean + np.dot(innovation, kalman_gain.T) + new_covariance = covariance - np.linalg.multi_dot(( + kalman_gain, projected_cov, kalman_gain.T)) + return new_mean, new_covariance + + def gating_distance(self, mean, covariance, measurements, + only_position=False): + """Compute gating distance between state distribution and measurements. + + A suitable distance threshold can be obtained from `chi2inv95`. If + `only_position` is False, the chi-square distribution has 4 degrees of + freedom, otherwise 2. + + Parameters + ---------- + mean : ndarray + Mean vector over the state distribution (8 dimensional). + covariance : ndarray + Covariance of the state distribution (8x8 dimensional). + measurements : ndarray + An Nx4 dimensional matrix of N measurements, each in + format (x, y, a, h) where (x, y) is the bounding box center + position, a the aspect ratio, and h the height. + only_position : Optional[bool] + If True, distance computation is done with respect to the bounding + box center position only. + + Returns + ------- + ndarray + Returns an array of length N, where the i-th element contains the + squared Mahalanobis distance between (mean, covariance) and + `measurements[i]`. 
+ + """ + mean, covariance = self.project(mean, covariance) + if only_position: + mean, covariance = mean[:2], covariance[:2, :2] + measurements = measurements[:, :2] + + cholesky_factor = np.linalg.cholesky(covariance) + d = measurements - mean + z = scipy.linalg.solve_triangular( + cholesky_factor, d.T, lower=True, check_finite=False, + overwrite_b=True) + squared_maha = np.sum(z * z, axis=0) return squared_maha \ No newline at end of file diff --git a/cv/pose/alphapose/pytorch/trackers/utils/io.py b/cv/pose/alphapose/pytorch/trackers/utils/io.py index e6ddd21eb..f70decdb2 100755 --- a/cv/pose/alphapose/pytorch/trackers/utils/io.py +++ b/cv/pose/alphapose/pytorch/trackers/utils/io.py @@ -1,112 +1,112 @@ -import os -from typing import Dict -import numpy as np - -from utils.log import logger - - -def write_results(filename, results_dict: Dict, data_type: str): - if not filename: - return - path = os.path.dirname(filename) - if not os.path.exists(path): - os.makedirs(path) - - if data_type in ('mot', 'mcmot', 'lab'): - save_format = '{frame},{id},{x1},{y1},{w},{h},1,-1,-1,-1\n' - elif data_type == 'kitti': - save_format = '{frame} {id} pedestrian -1 -1 -10 {x1} {y1} {x2} {y2} -1 -1 -1 -1000 -1000 -1000 -10 {score}\n' - else: - raise ValueError(data_type) - - with open(filename, 'w') as f: - for frame_id, frame_data in results_dict.items(): - if data_type == 'kitti': - frame_id -= 1 - for tlwh, track_id in frame_data: - if track_id < 0: - continue - x1, y1, w, h = tlwh - x2, y2 = x1 + w, y1 + h - line = save_format.format(frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h, score=1.0) - f.write(line) - logger.info('Save results to {}'.format(filename)) - - -def read_results(filename, data_type: str, is_gt=False, is_ignore=False): - if data_type in ('mot', 'lab'): - read_fun = read_mot_results - else: - raise ValueError('Unknown data type: {}'.format(data_type)) - - return read_fun(filename, is_gt, is_ignore) - - -""" -labels={'ped', ... % 1 -'person_on_vhcl', ... % 2 -'car', ... % 3 -'bicycle', ... % 4 -'mbike', ... % 5 -'non_mot_vhcl', ... % 6 -'static_person', ... % 7 -'distractor', ... % 8 -'occluder', ... % 9 -'occluder_on_grnd', ... %10 -'occluder_full', ... % 11 -'reflection', ... % 12 -'crowd' ... 
% 13 -}; -""" - - -def read_mot_results(filename, is_gt, is_ignore): - valid_labels = {1} - ignore_labels = {2, 7, 8, 12} - results_dict = dict() - if os.path.isfile(filename): - with open(filename, 'r') as f: - for line in f.readlines(): - linelist = line.split(',') - if len(linelist) < 7: - continue - fid = int(linelist[0]) - if fid < 1: - continue - results_dict.setdefault(fid, list()) - - if is_gt: - if 'MOT16-' in filename or 'MOT17-' in filename: - label = int(float(linelist[7])) - mark = int(float(linelist[6])) - if mark == 0 or label not in valid_labels: - continue - score = 1 - elif is_ignore: - if 'MOT16-' in filename or 'MOT17-' in filename: - label = int(float(linelist[7])) - vis_ratio = float(linelist[8]) - if label not in ignore_labels and vis_ratio >= 0: - continue - else: - continue - score = 1 - else: - score = float(linelist[6]) - - tlwh = tuple(map(float, linelist[2:6])) - target_id = int(linelist[1]) - - results_dict[fid].append((tlwh, target_id, score)) - - return results_dict - - -def unzip_objs(objs): - if len(objs) > 0: - tlwhs, ids, scores = zip(*objs) - else: - tlwhs, ids, scores = [], [], [] - tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) - +import os +from typing import Dict +import numpy as np + +from utils.log import logger + + +def write_results(filename, results_dict: Dict, data_type: str): + if not filename: + return + path = os.path.dirname(filename) + if not os.path.exists(path): + os.makedirs(path) + + if data_type in ('mot', 'mcmot', 'lab'): + save_format = '{frame},{id},{x1},{y1},{w},{h},1,-1,-1,-1\n' + elif data_type == 'kitti': + save_format = '{frame} {id} pedestrian -1 -1 -10 {x1} {y1} {x2} {y2} -1 -1 -1 -1000 -1000 -1000 -10 {score}\n' + else: + raise ValueError(data_type) + + with open(filename, 'w') as f: + for frame_id, frame_data in results_dict.items(): + if data_type == 'kitti': + frame_id -= 1 + for tlwh, track_id in frame_data: + if track_id < 0: + continue + x1, y1, w, h = tlwh + x2, y2 = x1 + w, y1 + h + line = save_format.format(frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h, score=1.0) + f.write(line) + logger.info('Save results to {}'.format(filename)) + + +def read_results(filename, data_type: str, is_gt=False, is_ignore=False): + if data_type in ('mot', 'lab'): + read_fun = read_mot_results + else: + raise ValueError('Unknown data type: {}'.format(data_type)) + + return read_fun(filename, is_gt, is_ignore) + + +""" +labels={'ped', ... % 1 +'person_on_vhcl', ... % 2 +'car', ... % 3 +'bicycle', ... % 4 +'mbike', ... % 5 +'non_mot_vhcl', ... % 6 +'static_person', ... % 7 +'distractor', ... % 8 +'occluder', ... % 9 +'occluder_on_grnd', ... %10 +'occluder_full', ... % 11 +'reflection', ... % 12 +'crowd' ... 
% 13 +}; +""" + + +def read_mot_results(filename, is_gt, is_ignore): + valid_labels = {1} + ignore_labels = {2, 7, 8, 12} + results_dict = dict() + if os.path.isfile(filename): + with open(filename, 'r') as f: + for line in f.readlines(): + linelist = line.split(',') + if len(linelist) < 7: + continue + fid = int(linelist[0]) + if fid < 1: + continue + results_dict.setdefault(fid, list()) + + if is_gt: + if 'MOT16-' in filename or 'MOT17-' in filename: + label = int(float(linelist[7])) + mark = int(float(linelist[6])) + if mark == 0 or label not in valid_labels: + continue + score = 1 + elif is_ignore: + if 'MOT16-' in filename or 'MOT17-' in filename: + label = int(float(linelist[7])) + vis_ratio = float(linelist[8]) + if label not in ignore_labels and vis_ratio >= 0: + continue + else: + continue + score = 1 + else: + score = float(linelist[6]) + + tlwh = tuple(map(float, linelist[2:6])) + target_id = int(linelist[1]) + + results_dict[fid].append((tlwh, target_id, score)) + + return results_dict + + +def unzip_objs(objs): + if len(objs) > 0: + tlwhs, ids, scores = zip(*objs) + else: + tlwhs, ids, scores = [], [], [] + tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) + return tlwhs, ids, scores \ No newline at end of file diff --git a/cv/pose/alphapose/pytorch/trackers/utils/kalman_filter.py b/cv/pose/alphapose/pytorch/trackers/utils/kalman_filter.py index e2052368c..5c74b7ca9 100755 --- a/cv/pose/alphapose/pytorch/trackers/utils/kalman_filter.py +++ b/cv/pose/alphapose/pytorch/trackers/utils/kalman_filter.py @@ -1,278 +1,278 @@ -# vim: expandtab:ts=4:sw=4 -#import numba -import numpy as np -import scipy.linalg - - -""" -Table for the 0.95 quantile of the chi-square distribution with N degrees of -freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv -function and used as Mahalanobis gating threshold. -""" -chi2inv95 = { - 1: 3.8415, - 2: 5.9915, - 3: 7.8147, - 4: 9.4877, - 5: 11.070, - 6: 12.592, - 7: 14.067, - 8: 15.507, - 9: 16.919} - - -class KalmanFilter(object): - """ - A simple Kalman filter for tracking bounding boxes in image space. - - The 8-dimensional state space - - x, y, a, h, vx, vy, va, vh - - contains the bounding box center position (x, y), aspect ratio a, height h, - and their respective velocities. - - Object motion follows a constant velocity model. The bounding box location - (x, y, a, h) is taken as direct observation of the state space (linear - observation model). - - """ - - def __init__(self): - ndim, dt = 4, 1. - - # Create Kalman filter model matrices. - self._motion_mat = np.eye(2 * ndim, 2 * ndim) - for i in range(ndim): - self._motion_mat[i, ndim + i] = dt - self._update_mat = np.eye(ndim, 2 * ndim) - - # Motion and observation uncertainty are chosen relative to the current - # state estimate. These weights control the amount of uncertainty in - # the model. This is a bit hacky. - self._std_weight_position = 1. / 20 - self._std_weight_velocity = 1. / 160 - - def initiate(self, measurement): - """Create track from unassociated measurement. - - Parameters - ---------- - measurement : ndarray - Bounding box coordinates (x, y, a, h) with center position (x, y), - aspect ratio a, and height h. - - Returns - ------- - (ndarray, ndarray) - Returns the mean vector (8 dimensional) and covariance matrix (8x8 - dimensional) of the new track. Unobserved velocities are initialized - to 0 mean. 
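# A hedged, illustrative sketch of feeding a detection into `initiate` (the
# tlwh -> (x, y, a, h) conversion and the numbers below are assumptions for
# illustration, not code or data from this module):
import numpy as np

def tlwh_to_xyah(tlwh):
    # top-left/width/height box -> center x, center y, aspect ratio, height
    x, y, w, h = tlwh
    return np.array([x + w / 2.0, y + h / 2.0, w / float(h), float(h)])

kf = KalmanFilter()                         # the class defined in this file
mean, covariance = kf.initiate(tlwh_to_xyah([50.0, 60.0, 40.0, 80.0]))
# mean[:4] == [70., 100., 0.5, 80.]; the four velocity components start at 0.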
- - """ - mean_pos = measurement - mean_vel = np.zeros_like(mean_pos) - mean = np.r_[mean_pos, mean_vel] - - std = [ - 2 * self._std_weight_position * measurement[3], - 2 * self._std_weight_position * measurement[3], - 1e-2, - 2 * self._std_weight_position * measurement[3], - 10 * self._std_weight_velocity * measurement[3], - 10 * self._std_weight_velocity * measurement[3], - 1e-5, - 10 * self._std_weight_velocity * measurement[3]] - covariance = np.diag(np.square(std)) - return mean, covariance - - def predict(self, mean, covariance): - """Run Kalman filter prediction step. - - Parameters - ---------- - mean : ndarray - The 8 dimensional mean vector of the object state at the previous - time step. - covariance : ndarray - The 8x8 dimensional covariance matrix of the object state at the - previous time step. - - Returns - ------- - (ndarray, ndarray) - Returns the mean vector and covariance matrix of the predicted - state. Unobserved velocities are initialized to 0 mean. - - """ - std_pos = [ - self._std_weight_position * mean[3], - self._std_weight_position * mean[3], - 1e-2, - self._std_weight_position * mean[3]] - std_vel = [ - self._std_weight_velocity * mean[3], - self._std_weight_velocity * mean[3], - 1e-5, - self._std_weight_velocity * mean[3]] - motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) - - mean = np.dot(mean, self._motion_mat.T) - covariance = np.linalg.multi_dot(( - self._motion_mat, covariance, self._motion_mat.T)) + motion_cov - - return mean, covariance - - def project(self, mean, covariance): - """Project state distribution to measurement space. - - Parameters - ---------- - mean : ndarray - The state's mean vector (8 dimensional array). - covariance : ndarray - The state's covariance matrix (8x8 dimensional). - - Returns - ------- - (ndarray, ndarray) - Returns the projected mean and covariance matrix of the given state - estimate. - - """ - std = [ - self._std_weight_position * mean[3], - self._std_weight_position * mean[3], - 1e-1, - self._std_weight_position * mean[3]] - innovation_cov = np.diag(np.square(std)) - - mean = np.dot(self._update_mat, mean) - covariance = np.linalg.multi_dot(( - self._update_mat, covariance, self._update_mat.T)) - return mean, covariance + innovation_cov - - def multi_predict(self, mean, covariance): - """Run Kalman filter prediction step (Vectorized version). - - Parameters - ---------- - mean : ndarray - The Nx8 dimensional mean matrix of the object states at the previous - time step. - covariance : ndarray - The Nx8x8 dimensional covariance matrics of the object states at the - previous time step. - - Returns - ------- - (ndarray, ndarray) - Returns the mean vector and covariance matrix of the predicted - state. Unobserved velocities are initialized to 0 mean. 
- - """ - std_pos = [ - self._std_weight_position * mean[:, 3], - self._std_weight_position * mean[:, 3], - 1e-2 * np.ones_like(mean[:, 3]), - self._std_weight_position * mean[:, 3]] - std_vel = [ - self._std_weight_velocity * mean[:, 3], - self._std_weight_velocity * mean[:, 3], - 1e-5 * np.ones_like(mean[:, 3]), - self._std_weight_velocity * mean[:, 3]] - sqr = np.square(np.r_[std_pos, std_vel]).T - - motion_cov = [] - for i in range(len(mean)): - motion_cov.append(np.diag(sqr[i])) - motion_cov = np.asarray(motion_cov) - - mean = np.dot(mean, self._motion_mat.T) - left = np.dot(self._motion_mat, covariance).transpose((1,0,2)) - covariance = np.dot(left, self._motion_mat.T) + motion_cov - - return mean, covariance - - def update(self, mean, covariance, measurement): - """Run Kalman filter correction step. - - Parameters - ---------- - mean : ndarray - The predicted state's mean vector (8 dimensional). - covariance : ndarray - The state's covariance matrix (8x8 dimensional). - measurement : ndarray - The 4 dimensional measurement vector (x, y, a, h), where (x, y) - is the center position, a the aspect ratio, and h the height of the - bounding box. - - Returns - ------- - (ndarray, ndarray) - Returns the measurement-corrected state distribution. - - """ - projected_mean, projected_cov = self.project(mean, covariance) - - chol_factor, lower = scipy.linalg.cho_factor( - projected_cov, lower=True, check_finite=False) - kalman_gain = scipy.linalg.cho_solve( - (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, - check_finite=False).T - innovation = measurement - projected_mean - - new_mean = mean + np.dot(innovation, kalman_gain.T) - new_covariance = covariance - np.linalg.multi_dot(( - kalman_gain, projected_cov, kalman_gain.T)) - return new_mean, new_covariance - - def gating_distance(self, mean, covariance, measurements, - only_position=False, metric='maha'): - """Compute gating distance between state distribution and measurements. - - A suitable distance threshold can be obtained from `chi2inv95`. If - `only_position` is False, the chi-square distribution has 4 degrees of - freedom, otherwise 2. - - Parameters - ---------- - mean : ndarray - Mean vector over the state distribution (8 dimensional). - covariance : ndarray - Covariance of the state distribution (8x8 dimensional). - measurements : ndarray - An Nx4 dimensional matrix of N measurements, each in - format (x, y, a, h) where (x, y) is the bounding box center - position, a the aspect ratio, and h the height. - only_position : Optional[bool] - If True, distance computation is done with respect to the bounding - box center position only. - - Returns - ------- - ndarray - Returns an array of length N, where the i-th element contains the - squared Mahalanobis distance between (mean, covariance) and - `measurements[i]`. 
- - """ - mean, covariance = self.project(mean, covariance) - if only_position: - mean, covariance = mean[:2], covariance[:2, :2] - measurements = measurements[:, :2] - - d = measurements - mean - if metric == 'gaussian': - return np.sum(d * d, axis=1) - elif metric == 'maha': - cholesky_factor = np.linalg.cholesky(covariance) - z = scipy.linalg.solve_triangular( - cholesky_factor, d.T, lower=True, check_finite=False, - overwrite_b=True) - squared_maha = np.sum(z * z, axis=0) - return squared_maha - else: - raise ValueError('invalid distance metric') - +# vim: expandtab:ts=4:sw=4 +#import numba +import numpy as np +import scipy.linalg + + +""" +Table for the 0.95 quantile of the chi-square distribution with N degrees of +freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv +function and used as Mahalanobis gating threshold. +""" +chi2inv95 = { + 1: 3.8415, + 2: 5.9915, + 3: 7.8147, + 4: 9.4877, + 5: 11.070, + 6: 12.592, + 7: 14.067, + 8: 15.507, + 9: 16.919} + + +class KalmanFilter(object): + """ + A simple Kalman filter for tracking bounding boxes in image space. + + The 8-dimensional state space + + x, y, a, h, vx, vy, va, vh + + contains the bounding box center position (x, y), aspect ratio a, height h, + and their respective velocities. + + Object motion follows a constant velocity model. The bounding box location + (x, y, a, h) is taken as direct observation of the state space (linear + observation model). + + """ + + def __init__(self): + ndim, dt = 4, 1. + + # Create Kalman filter model matrices. + self._motion_mat = np.eye(2 * ndim, 2 * ndim) + for i in range(ndim): + self._motion_mat[i, ndim + i] = dt + self._update_mat = np.eye(ndim, 2 * ndim) + + # Motion and observation uncertainty are chosen relative to the current + # state estimate. These weights control the amount of uncertainty in + # the model. This is a bit hacky. + self._std_weight_position = 1. / 20 + self._std_weight_velocity = 1. / 160 + + def initiate(self, measurement): + """Create track from unassociated measurement. + + Parameters + ---------- + measurement : ndarray + Bounding box coordinates (x, y, a, h) with center position (x, y), + aspect ratio a, and height h. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector (8 dimensional) and covariance matrix (8x8 + dimensional) of the new track. Unobserved velocities are initialized + to 0 mean. + + """ + mean_pos = measurement + mean_vel = np.zeros_like(mean_pos) + mean = np.r_[mean_pos, mean_vel] + + std = [ + 2 * self._std_weight_position * measurement[3], + 2 * self._std_weight_position * measurement[3], + 1e-2, + 2 * self._std_weight_position * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 1e-5, + 10 * self._std_weight_velocity * measurement[3]] + covariance = np.diag(np.square(std)) + return mean, covariance + + def predict(self, mean, covariance): + """Run Kalman filter prediction step. + + Parameters + ---------- + mean : ndarray + The 8 dimensional mean vector of the object state at the previous + time step. + covariance : ndarray + The 8x8 dimensional covariance matrix of the object state at the + previous time step. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. 
+ + """ + std_pos = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-2, + self._std_weight_position * mean[3]] + std_vel = [ + self._std_weight_velocity * mean[3], + self._std_weight_velocity * mean[3], + 1e-5, + self._std_weight_velocity * mean[3]] + motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) + + mean = np.dot(mean, self._motion_mat.T) + covariance = np.linalg.multi_dot(( + self._motion_mat, covariance, self._motion_mat.T)) + motion_cov + + return mean, covariance + + def project(self, mean, covariance): + """Project state distribution to measurement space. + + Parameters + ---------- + mean : ndarray + The state's mean vector (8 dimensional array). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + + Returns + ------- + (ndarray, ndarray) + Returns the projected mean and covariance matrix of the given state + estimate. + + """ + std = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-1, + self._std_weight_position * mean[3]] + innovation_cov = np.diag(np.square(std)) + + mean = np.dot(self._update_mat, mean) + covariance = np.linalg.multi_dot(( + self._update_mat, covariance, self._update_mat.T)) + return mean, covariance + innovation_cov + + def multi_predict(self, mean, covariance): + """Run Kalman filter prediction step (Vectorized version). + + Parameters + ---------- + mean : ndarray + The Nx8 dimensional mean matrix of the object states at the previous + time step. + covariance : ndarray + The Nx8x8 dimensional covariance matrics of the object states at the + previous time step. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. + + """ + std_pos = [ + self._std_weight_position * mean[:, 3], + self._std_weight_position * mean[:, 3], + 1e-2 * np.ones_like(mean[:, 3]), + self._std_weight_position * mean[:, 3]] + std_vel = [ + self._std_weight_velocity * mean[:, 3], + self._std_weight_velocity * mean[:, 3], + 1e-5 * np.ones_like(mean[:, 3]), + self._std_weight_velocity * mean[:, 3]] + sqr = np.square(np.r_[std_pos, std_vel]).T + + motion_cov = [] + for i in range(len(mean)): + motion_cov.append(np.diag(sqr[i])) + motion_cov = np.asarray(motion_cov) + + mean = np.dot(mean, self._motion_mat.T) + left = np.dot(self._motion_mat, covariance).transpose((1,0,2)) + covariance = np.dot(left, self._motion_mat.T) + motion_cov + + return mean, covariance + + def update(self, mean, covariance, measurement): + """Run Kalman filter correction step. + + Parameters + ---------- + mean : ndarray + The predicted state's mean vector (8 dimensional). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + measurement : ndarray + The 4 dimensional measurement vector (x, y, a, h), where (x, y) + is the center position, a the aspect ratio, and h the height of the + bounding box. + + Returns + ------- + (ndarray, ndarray) + Returns the measurement-corrected state distribution. 
+ + """ + projected_mean, projected_cov = self.project(mean, covariance) + + chol_factor, lower = scipy.linalg.cho_factor( + projected_cov, lower=True, check_finite=False) + kalman_gain = scipy.linalg.cho_solve( + (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, + check_finite=False).T + innovation = measurement - projected_mean + + new_mean = mean + np.dot(innovation, kalman_gain.T) + new_covariance = covariance - np.linalg.multi_dot(( + kalman_gain, projected_cov, kalman_gain.T)) + return new_mean, new_covariance + + def gating_distance(self, mean, covariance, measurements, + only_position=False, metric='maha'): + """Compute gating distance between state distribution and measurements. + + A suitable distance threshold can be obtained from `chi2inv95`. If + `only_position` is False, the chi-square distribution has 4 degrees of + freedom, otherwise 2. + + Parameters + ---------- + mean : ndarray + Mean vector over the state distribution (8 dimensional). + covariance : ndarray + Covariance of the state distribution (8x8 dimensional). + measurements : ndarray + An Nx4 dimensional matrix of N measurements, each in + format (x, y, a, h) where (x, y) is the bounding box center + position, a the aspect ratio, and h the height. + only_position : Optional[bool] + If True, distance computation is done with respect to the bounding + box center position only. + + Returns + ------- + ndarray + Returns an array of length N, where the i-th element contains the + squared Mahalanobis distance between (mean, covariance) and + `measurements[i]`. + + """ + mean, covariance = self.project(mean, covariance) + if only_position: + mean, covariance = mean[:2], covariance[:2, :2] + measurements = measurements[:, :2] + + d = measurements - mean + if metric == 'gaussian': + return np.sum(d * d, axis=1) + elif metric == 'maha': + cholesky_factor = np.linalg.cholesky(covariance) + z = scipy.linalg.solve_triangular( + cholesky_factor, d.T, lower=True, check_finite=False, + overwrite_b=True) + squared_maha = np.sum(z * z, axis=0) + return squared_maha + else: + raise ValueError('invalid distance metric') + diff --git a/cv/pose/alphapose/pytorch/trackers/utils/log.py b/cv/pose/alphapose/pytorch/trackers/utils/log.py index 1dbb69b0d..394e1e8c9 100755 --- a/cv/pose/alphapose/pytorch/trackers/utils/log.py +++ b/cv/pose/alphapose/pytorch/trackers/utils/log.py @@ -1,18 +1,18 @@ -import logging - - -def get_logger(name='root'): - formatter = logging.Formatter( - # fmt='%(asctime)s [%(levelname)s]: %(filename)s(%(funcName)s:%(lineno)s) >> %(message)s') - fmt='%(asctime)s [%(levelname)s]: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') - - handler = logging.StreamHandler() - handler.setFormatter(formatter) - - logger = logging.getLogger(name) - logger.setLevel(logging.DEBUG) - logger.addHandler(handler) - return logger - - -logger = get_logger('root') +import logging + + +def get_logger(name='root'): + formatter = logging.Formatter( + # fmt='%(asctime)s [%(levelname)s]: %(filename)s(%(funcName)s:%(lineno)s) >> %(message)s') + fmt='%(asctime)s [%(levelname)s]: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') + + handler = logging.StreamHandler() + handler.setFormatter(formatter) + + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + logger.addHandler(handler) + return logger + + +logger = get_logger('root') diff --git a/cv/pose/alphapose/pytorch/trackers/utils/parse_config.py b/cv/pose/alphapose/pytorch/trackers/utils/parse_config.py index a47b19370..70b4f4b2c 100755 --- 
a/cv/pose/alphapose/pytorch/trackers/utils/parse_config.py +++ b/cv/pose/alphapose/pytorch/trackers/utils/parse_config.py @@ -1,37 +1,37 @@ -def parse_model_cfg(path): - """Parses the yolo-v3 layer configuration file and returns module definitions""" - file = open(path, 'r') - lines = file.read().split('\n') - lines = [x for x in lines if x and not x.startswith('#')] - lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces - module_defs = [] - for line in lines: - if line.startswith('['): # This marks the start of a new block - module_defs.append({}) - module_defs[-1]['type'] = line[1:-1].rstrip() - if module_defs[-1]['type'] == 'convolutional': - module_defs[-1]['batch_normalize'] = 0 - else: - key, value = line.split("=") - value = value.strip() - if value[0] == '$': - value = module_defs[0].get(value.strip('$'), None) - module_defs[-1][key.rstrip()] = value - - return module_defs - - -def parse_data_cfg(path): - """Parses the data configuration file""" - options = dict() - options['gpus'] = '0' - options['num_workers'] = '10' - with open(path, 'r') as fp: - lines = fp.readlines() - for line in lines: - line = line.strip() - if line == '' or line.startswith('#'): - continue - key, value = line.split('=') - options[key.strip()] = value.strip() - return options +def parse_model_cfg(path): + """Parses the yolo-v3 layer configuration file and returns module definitions""" + file = open(path, 'r') + lines = file.read().split('\n') + lines = [x for x in lines if x and not x.startswith('#')] + lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces + module_defs = [] + for line in lines: + if line.startswith('['): # This marks the start of a new block + module_defs.append({}) + module_defs[-1]['type'] = line[1:-1].rstrip() + if module_defs[-1]['type'] == 'convolutional': + module_defs[-1]['batch_normalize'] = 0 + else: + key, value = line.split("=") + value = value.strip() + if value[0] == '$': + value = module_defs[0].get(value.strip('$'), None) + module_defs[-1][key.rstrip()] = value + + return module_defs + + +def parse_data_cfg(path): + """Parses the data configuration file""" + options = dict() + options['gpus'] = '0' + options['num_workers'] = '10' + with open(path, 'r') as fp: + lines = fp.readlines() + for line in lines: + line = line.strip() + if line == '' or line.startswith('#'): + continue + key, value = line.split('=') + options[key.strip()] = value.strip() + return options diff --git a/cv/pose/alphapose/pytorch/trackers/utils/timer.py b/cv/pose/alphapose/pytorch/trackers/utils/timer.py index b475fed73..e79f1a310 100755 --- a/cv/pose/alphapose/pytorch/trackers/utils/timer.py +++ b/cv/pose/alphapose/pytorch/trackers/utils/timer.py @@ -1,45 +1,45 @@ -# -------------------------------------------------------- -# Fast R-CNN -# Copyright (c) 2015 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Ross Girshick -# -------------------------------------------------------- - -import time - - -class Timer(object): - """A simple timer.""" - def __init__(self): - self.total_time = 0. - self.calls = 0 - self.start_time = 0. - self.diff = 0. - self.average_time = 0. - - self.duration = 0. 
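# A hedged usage sketch for Timer (the toy workload below is an assumption,
# not part of the tracker):
timer = Timer()
for _ in range(3):
    timer.tic()
    sum(range(100000))          # stand-in for per-frame work
    avg = timer.toc()           # by default returns the running average in seconds
fps = 1.0 / max(avg, 1e-9)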
- - def tic(self): - # using time.time instead of time.clock because time time.clock - # does not normalize for multithreading - self.start_time = time.time() - - def toc(self, average=True): - self.diff = time.time() - self.start_time - self.total_time += self.diff - self.calls += 1 - self.average_time = self.total_time / self.calls - if average: - self.duration = self.average_time - else: - self.duration = self.diff - return self.duration - - def clear(self): - self.total_time = 0. - self.calls = 0 - self.start_time = 0. - self.diff = 0. - self.average_time = 0. - self.duration = 0. - +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import time + + +class Timer(object): + """A simple timer.""" + def __init__(self): + self.total_time = 0. + self.calls = 0 + self.start_time = 0. + self.diff = 0. + self.average_time = 0. + + self.duration = 0. + + def tic(self): + # using time.time instead of time.clock because time time.clock + # does not normalize for multithreading + self.start_time = time.time() + + def toc(self, average=True): + self.diff = time.time() - self.start_time + self.total_time += self.diff + self.calls += 1 + self.average_time = self.total_time / self.calls + if average: + self.duration = self.average_time + else: + self.duration = self.diff + return self.duration + + def clear(self): + self.total_time = 0. + self.calls = 0 + self.start_time = 0. + self.diff = 0. + self.average_time = 0. + self.duration = 0. + diff --git a/cv/pose/alphapose/pytorch/trackers/utils/utils.py b/cv/pose/alphapose/pytorch/trackers/utils/utils.py index a4680eac2..97b8fb96d 100755 --- a/cv/pose/alphapose/pytorch/trackers/utils/utils.py +++ b/cv/pose/alphapose/pytorch/trackers/utils/utils.py @@ -1,750 +1,750 @@ -import glob -import random -import time -import os -import os.path as osp - -import cv2 -import warnings -from functools import partial -from collections import OrderedDict -import matplotlib.pyplot as plt -import numpy as np -import torch -import pickle -import torch.nn.functional as F -from torchvision.ops import nms -#import maskrcnn_benchmark.layers.nms as nms - -def mkdir_if_missing(d): - if not osp.exists(d): - os.makedirs(d) - - -def float3(x): # format floats to 3 decimals - return float(format(x, '.3f')) - - -def init_seeds(seed=0): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - - -def load_classes(path): - """ - Loads class labels at 'path' - """ - fp = open(path, 'r') - names = fp.read().split('\n') - return list(filter(None, names)) # filter removes empty strings (such as last line) - - -def model_info(model): # Plots a line-by-line description of a PyTorch model - n_p = sum(x.numel() for x in model.parameters()) # number parameters - n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients - print('\n%5s %50s %9s %12s %20s %12s %12s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma')) - for i, (name, p) in enumerate(model.named_parameters()): - name = name.replace('module_list.', '') - print('%5g %50s %9s %12g %20s %12.3g %12.3g' % ( - i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())) - print('Model Summary: %g layers, %g parameters, %g gradients\n' % (i + 1, n_p, n_g)) - - - -def plot_one_box(x, img, color=None, 
label=None, line_thickness=None): # Plots one bounding box on image img - tl = line_thickness or round(0.0004 * max(img.shape[0:2])) + 1 # line thickness - color = color or [random.randint(0, 255) for _ in range(3)] - c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) - cv2.rectangle(img, c1, c2, color, thickness=tl) - if label: - tf = max(tl - 1, 1) # font thickness - t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] - c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 - cv2.rectangle(img, c1, c2, color, -1) # filled - cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA) - - -def weights_init_normal(m): - classname = m.__class__.__name__ - if classname.find('Conv') != -1: - torch.nn.init.normal_(m.weight.data, 0.0, 0.03) - elif classname.find('BatchNorm2d') != -1: - torch.nn.init.normal_(m.weight.data, 1.0, 0.03) - torch.nn.init.constant_(m.bias.data, 0.0) - - -def xyxy2xywh(x): - # Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h] - y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) - y[:, 0] = (x[:, 0] + x[:, 2]) / 2 - y[:, 1] = (x[:, 1] + x[:, 3]) / 2 - y[:, 2] = x[:, 2] - x[:, 0] - y[:, 3] = x[:, 3] - x[:, 1] - return y - - -def xywh2xyxy(x): - # Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2] - y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) - y[:, 0] = (x[:, 0] - x[:, 2] / 2) - y[:, 1] = (x[:, 1] - x[:, 3] / 2) - y[:, 2] = (x[:, 0] + x[:, 2] / 2) - y[:, 3] = (x[:, 1] + x[:, 3] / 2) - return y -def x1y1x2y2_to_xywh(det): - x1, y1, x2, y2 = det - w, h = int(x2) - int(x1), int(y2) - int(y1) - return [x1, y1, w, h] -def xywh_to_x1y1x2y2(det): - x1, y1, w, h = det - x2, y2 = x1 + w, y1 + h - return [x1, y1, x2, y2] -def expandBbox(xywh,width,height): - scale = 0.05 - if isinstance(xywh, (tuple, list)): - if not len(xywh) == 4: - raise IndexError( - "Bounding boxes must have 4 elements, given {}".format(len(xywh))) - center_x = xywh[0] + xywh[2]/2 - center_y = xywh[1] + xywh[3]/2 - img_width,img_height = xywh[2]+scale*width,xywh[3]+scale*height - x1 = np.minimum(width - 1, np.maximum(0, center_x-img_width/2)) - y1 = np.minimum(height - 1, np.maximum(0, center_y-img_height/2)) - x2 = np.minimum(width - 1, np.maximum(0, center_x+img_width/2)) - y2 = np.minimum(height - 1, np.maximum(0, center_y+img_height/2)) - return (x1, y1, x2, y2) - else: - raise TypeError( - 'Expect input xywh a list or tuple, given {}'.format(type(xywh))) -def bbox_clip_xyxy(xyxy, width, height): - """Clip bounding box with format (xmin, ymin, xmax, ymax) to specified boundary. - - All bounding boxes will be clipped to the new region `(0, 0, width, height)`. - - Parameters - ---------- - xyxy : list, tuple or numpy.ndarray - The bbox in format (xmin, ymin, xmax, ymax). - If numpy.ndarray is provided, we expect multiple bounding boxes with - shape `(N, 4)`. - width : int or float - Boundary width. - height : int or float - Boundary height. - - Returns - ------- - type - Description of returned object. 
- - """ - if isinstance(xyxy, (tuple, list)): - if not len(xyxy) == 4: - raise IndexError( - "Bounding boxes must have 4 elements, given {}".format(len(xyxy))) - x1 = np.minimum(width - 1, np.maximum(0, xyxy[0])) - y1 = np.minimum(height - 1, np.maximum(0, xyxy[1])) - x2 = np.minimum(width - 1, np.maximum(0, xyxy[2])) - y2 = np.minimum(height - 1, np.maximum(0, xyxy[3])) - return (x1, y1, x2, y2) - elif isinstance(xyxy, np.ndarray): - if not xyxy.size % 4 == 0: - raise IndexError( - "Bounding boxes must have n * 4 elements, given {}".format(xyxy.shape)) - x1 = np.minimum(width - 1, np.maximum(0, xyxy[:, 0])) - y1 = np.minimum(height - 1, np.maximum(0, xyxy[:, 1])) - x2 = np.minimum(width - 1, np.maximum(0, xyxy[:, 2])) - y2 = np.minimum(height - 1, np.maximum(0, xyxy[:, 3])) - return np.hstack((x1, y1, x2, y2)) - else: - raise TypeError( - 'Expect input xywh a list, tuple or numpy.ndarray, given {}'.format(type(xyxy))) - -def scale_coords(img_size, coords, img0_shape): - # Rescale x1, y1, x2, y2 from 416 to image size - gain_w = float(img_size[0]) / img0_shape[1] # gain = old / new - gain_h = float(img_size[1]) / img0_shape[0] - gain = min(gain_w, gain_h) - pad_x = (img_size[0] - img0_shape[1] * gain) / 2 # width padding - pad_y = (img_size[1] - img0_shape[0] * gain) / 2 # height padding - coords[:, [0, 2]] -= pad_x - coords[:, [1, 3]] -= pad_y - coords[:, 0:4] /= gain - coords[:, :4] = torch.clamp(coords[:, :4], min=0) - return coords - - -def ap_per_class(tp, conf, pred_cls, target_cls): - """ Compute the average precision, given the recall and precision curves. - Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics. - # Arguments - tp: True positives (list). - conf: Objectness value from 0-1 (list). - pred_cls: Predicted object classes (list). - target_cls: True object classes (list). - # Returns - The average precision as computed in py-faster-rcnn. - """ - - # lists/pytorch to numpy - tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array(pred_cls), np.array(target_cls) - - # Sort by objectness - i = np.argsort(-conf) - tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] - - # Find unique classes - unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0)) - - # Create Precision-Recall curve and compute AP for each class - ap, p, r = [], [], [] - for c in unique_classes: - i = pred_cls == c - n_gt = sum(target_cls == c) # Number of ground truth objects - n_p = sum(i) # Number of predicted objects - - if (n_p == 0) and (n_gt == 0): - continue - elif (n_p == 0) or (n_gt == 0): - ap.append(0) - r.append(0) - p.append(0) - else: - # Accumulate FPs and TPs - fpc = np.cumsum(1 - tp[i]) - tpc = np.cumsum(tp[i]) - - # Recall - recall_curve = tpc / (n_gt + 1e-16) - r.append(tpc[-1] / (n_gt + 1e-16)) - - # Precision - precision_curve = tpc / (tpc + fpc) - p.append(tpc[-1] / (tpc[-1] + fpc[-1])) - - # AP from recall-precision curve - ap.append(compute_ap(recall_curve, precision_curve)) - - return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array(p) - - -def compute_ap(recall, precision): - """ Compute the average precision, given the recall and precision curves. - Code originally from https://github.com/rbgirshick/py-faster-rcnn. - # Arguments - recall: The recall curve (list). - precision: The precision curve (list). - # Returns - The average precision as computed in py-faster-rcnn. 
- """ - # correct AP calculation - # first append sentinel values at the end - - mrec = np.concatenate(([0.], recall, [1.])) - mpre = np.concatenate(([0.], precision, [0.])) - - # compute the precision envelope - for i in range(mpre.size - 1, 0, -1): - mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) - - # to calculate area under PR curve, look for points - # where X axis (recall) changes value - i = np.where(mrec[1:] != mrec[:-1])[0] - - # and sum (\Delta recall) * prec - ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) - return ap - - -def bbox_iou(box1, box2, x1y1x2y2=False): - """ - Returns the IoU of two bounding boxes - """ - N, M = len(box1), len(box2) - if x1y1x2y2: - # Get the coordinates of bounding boxes - b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] - b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] - else: - # Transform from center and width to exact coordinates - b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 - b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 - b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 - b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 - - # get the coordinates of the intersection rectangle - inter_rect_x1 = torch.max(b1_x1.unsqueeze(1), b2_x1) - inter_rect_y1 = torch.max(b1_y1.unsqueeze(1), b2_y1) - inter_rect_x2 = torch.min(b1_x2.unsqueeze(1), b2_x2) - inter_rect_y2 = torch.min(b1_y2.unsqueeze(1), b2_y2) - # Intersection area - inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, 0) * torch.clamp(inter_rect_y2 - inter_rect_y1, 0) - # Union Area - b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)) - b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).view(-1,1).expand(N,M) - b2_area = ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).view(1,-1).expand(N,M) - - return inter_area / (b1_area + b2_area - inter_area + 1e-16) - - -def build_targets_max(target, anchor_wh, nA, nC, nGh, nGw): - """ - returns nT, nCorrect, tx, ty, tw, th, tconf, tcls - """ - nB = len(target) # number of images in batch - - txy = torch.zeros(nB, nA, nGh, nGw, 2).cuda() # batch size, anchors, grid size - twh = torch.zeros(nB, nA, nGh, nGw, 2).cuda() - tconf = torch.LongTensor(nB, nA, nGh, nGw).fill_(0).cuda() - tcls = torch.ByteTensor(nB, nA, nGh, nGw, nC).fill_(0).cuda() # nC = number of classes - tid = torch.LongTensor(nB, nA, nGh, nGw, 1).fill_(-1).cuda() - for b in range(nB): - t = target[b] - t_id = t[:, 1].clone().long().cuda() - t = t[:,[0,2,3,4,5]] - nTb = len(t) # number of targets - if nTb == 0: - continue - - #gxy, gwh = t[:, 1:3] * nG, t[:, 3:5] * nG - gxy, gwh = t[: , 1:3].clone() , t[:, 3:5].clone() - gxy[:, 0] = gxy[:, 0] * nGw - gxy[:, 1] = gxy[:, 1] * nGh - gwh[:, 0] = gwh[:, 0] * nGw - gwh[:, 1] = gwh[:, 1] * nGh - gi = torch.clamp(gxy[:, 0], min=0, max=nGw -1).long() - gj = torch.clamp(gxy[:, 1], min=0, max=nGh -1).long() - - # Get grid box indices and prevent overflows (i.e. 
13.01 on 13 anchors) - #gi, gj = torch.clamp(gxy.long(), min=0, max=nG - 1).t() - #gi, gj = gxy.long().t() - - # iou of targets-anchors (using wh only) - box1 = gwh - box2 = anchor_wh.unsqueeze(1) - inter_area = torch.min(box1, box2).prod(2) - iou = inter_area / (box1.prod(1) + box2.prod(2) - inter_area + 1e-16) - - # Select best iou_pred and anchor - iou_best, a = iou.max(0) # best anchor [0-2] for each target - - # Select best unique target-anchor combinations - if nTb > 1: - _, iou_order = torch.sort(-iou_best) # best to worst - - # Unique anchor selection - u = torch.stack((gi, gj, a), 0)[:, iou_order] - # _, first_unique = np.unique(u, axis=1, return_index=True) # first unique indices - first_unique = return_torch_unique_index(u, torch.unique(u, dim=1)) # torch alternative - i = iou_order[first_unique] - # best anchor must share significant commonality (iou) with target - i = i[iou_best[i] > 0.60] # TODO: examine arbitrary threshold - if len(i) == 0: - continue - - a, gj, gi, t = a[i], gj[i], gi[i], t[i] - t_id = t_id[i] - if len(t.shape) == 1: - t = t.view(1, 5) - else: - if iou_best < 0.60: - continue - - tc, gxy, gwh = t[:, 0].long(), t[:, 1:3].clone(), t[:, 3:5].clone() - gxy[:, 0] = gxy[:, 0] * nGw - gxy[:, 1] = gxy[:, 1] * nGh - gwh[:, 0] = gwh[:, 0] * nGw - gwh[:, 1] = gwh[:, 1] * nGh - - # XY coordinates - txy[b, a, gj, gi] = gxy - gxy.floor() - - # Width and height - twh[b, a, gj, gi] = torch.log(gwh / anchor_wh[a]) # yolo method - # twh[b, a, gj, gi] = torch.sqrt(gwh / anchor_wh[a]) / 2 # power method - - # One-hot encoding of label - tcls[b, a, gj, gi, tc] = 1 - tconf[b, a, gj, gi] = 1 - tid[b, a, gj, gi] = t_id.unsqueeze(1) - tbox = torch.cat([txy, twh], -1) - return tconf, tbox, tid - - - -def build_targets_thres(target, anchor_wh, nA, nC, nGh, nGw): - ID_THRESH = 0.5 - FG_THRESH = 0.5 - BG_THRESH = 0.4 - nB = len(target) # number of images in batch - assert(len(anchor_wh)==nA) - - tbox = torch.zeros(nB, nA, nGh, nGw, 4).cuda() # batch size, anchors, grid size - tconf = torch.LongTensor(nB, nA, nGh, nGw).fill_(0).cuda() - tid = torch.LongTensor(nB, nA, nGh, nGw, 1).fill_(-1).cuda() - for b in range(nB): - t = target[b] - t_id = t[:, 1].clone().long().cuda() - t = t[:,[0,2,3,4,5]] - nTb = len(t) # number of targets - if nTb == 0: - continue - - gxy, gwh = t[: , 1:3].clone() , t[:, 3:5].clone() - gxy[:, 0] = gxy[:, 0] * nGw - gxy[:, 1] = gxy[:, 1] * nGh - gwh[:, 0] = gwh[:, 0] * nGw - gwh[:, 1] = gwh[:, 1] * nGh - gxy[:, 0] = torch.clamp(gxy[:, 0], min=0, max=nGw -1) - gxy[:, 1] = torch.clamp(gxy[:, 1], min=0, max=nGh -1) - - gt_boxes = torch.cat([gxy, gwh], dim=1) # Shape Ngx4 (xc, yc, w, h) - - anchor_mesh = generate_anchor(nGh, nGw, anchor_wh) - anchor_list = anchor_mesh.permute(0,2,3,1).contiguous().view(-1, 4) # Shpae (nA x nGh x nGw) x 4 - #print(anchor_list.shape, gt_boxes.shape) - iou_pdist = bbox_iou(anchor_list, gt_boxes) # Shape (nA x nGh x nGw) x Ng - iou_max, max_gt_index = torch.max(iou_pdist, dim=1) # Shape (nA x nGh x nGw), both - - iou_map = iou_max.view(nA, nGh, nGw) - gt_index_map = max_gt_index.view(nA, nGh, nGw) - - #nms_map = pooling_nms(iou_map, 3) - - id_index = iou_map > ID_THRESH - fg_index = iou_map > FG_THRESH - bg_index = iou_map < BG_THRESH - ign_index = (iou_map < FG_THRESH) * (iou_map > BG_THRESH) - tconf[b][fg_index] = 1 - tconf[b][bg_index] = 0 - tconf[b][ign_index] = -1 - - gt_index = gt_index_map[fg_index] - gt_box_list = gt_boxes[gt_index] - gt_id_list = t_id[gt_index_map[id_index]] - #print(gt_index.shape, gt_index_map[id_index].shape, 
gt_boxes.shape) - if torch.sum(fg_index) > 0: - tid[b][id_index] = gt_id_list.unsqueeze(1) - fg_anchor_list = anchor_list.view(nA, nGh, nGw, 4)[fg_index] - delta_target = encode_delta(gt_box_list, fg_anchor_list) - tbox[b][fg_index] = delta_target - return tconf, tbox, tid - -def generate_anchor(nGh, nGw, anchor_wh): - nA = len(anchor_wh) - yy, xx =torch.meshgrid(torch.arange(nGh), torch.arange(nGw)) - xx, yy = xx.cuda(), yy.cuda() - - mesh = torch.stack([xx, yy], dim=0) # Shape 2, nGh, nGw - mesh = mesh.unsqueeze(0).repeat(nA,1,1,1).float() # Shape nA x 2 x nGh x nGw - anchor_offset_mesh = anchor_wh.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, nGh,nGw) # Shape nA x 2 x nGh x nGw - anchor_mesh = torch.cat([mesh, anchor_offset_mesh], dim=1) # Shape nA x 4 x nGh x nGw - return anchor_mesh - -def encode_delta(gt_box_list, fg_anchor_list): - px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ - fg_anchor_list[:, 2], fg_anchor_list[:,3] - gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \ - gt_box_list[:, 2], gt_box_list[:, 3] - dx = (gx - px) / pw - dy = (gy - py) / ph - dw = torch.log(gw/pw) - dh = torch.log(gh/ph) - return torch.stack([dx, dy, dw, dh], dim=1) - -def decode_delta(delta, fg_anchor_list): - px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ - fg_anchor_list[:, 2], fg_anchor_list[:,3] - dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3] - gx = pw * dx + px - gy = ph * dy + py - gw = pw * torch.exp(dw) - gh = ph * torch.exp(dh) - return torch.stack([gx, gy, gw, gh], dim=1) - -def decode_delta_map(delta_map, anchors): - ''' - :param: delta_map, shape (nB, nA, nGh, nGw, 4) - :param: anchors, shape (nA,4) - ''' - nB, nA, nGh, nGw, _ = delta_map.shape - anchor_mesh = generate_anchor(nGh, nGw, anchors) - anchor_mesh = anchor_mesh.permute(0,2,3,1).contiguous() # Shpae (nA x nGh x nGw) x 4 - anchor_mesh = anchor_mesh.unsqueeze(0).repeat(nB,1,1,1,1) - pred_list = decode_delta(delta_map.view(-1,4), anchor_mesh.view(-1,4)) - pred_map = pred_list.view(nB, nA, nGh, nGw, 4) - return pred_map - - -def pooling_nms(heatmap, kernel=1): - pad = (kernel -1 ) // 2 - hmax = F.max_pool2d(heatmap, (kernel, kernel), stride=1, padding=pad) - keep = (hmax == heatmap).float() - return keep * heatmap - -def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4, method='standard'): - """ - Removes detections with lower object confidence score than 'conf_thres' - Non-Maximum Suppression to further filter detections. 
- Returns detections with shape: - (x1, y1, x2, y2, object_conf, class_score, class_pred) - Args: - prediction, - conf_thres, - nms_thres, - method = 'standard' or 'fast' - """ - - output = [None for _ in range(len(prediction))] - for image_i, pred in enumerate(prediction): - # Filter out confidence scores below threshold - # Get score and class with highest confidence - - v = pred[:, 4] > conf_thres - v = v.nonzero().squeeze() - if len(v.shape) == 0: - v = v.unsqueeze(0) - - pred = pred[v] - - # If none are remaining => process next image - nP = pred.shape[0] - if not nP: - continue - # From (center x, center y, width, height) to (x1, y1, x2, y2) - pred[:, :4] = xywh2xyxy(pred[:, :4]) - - - # Non-maximum suppression - if method == 'standard': - nms_indices = nms(pred[:, :4], pred[:, 4], nms_thres) - elif method == 'fast': - nms_indices = fast_nms(pred[:, :4], pred[:, 4], iou_thres=nms_thres, conf_thres=conf_thres) - else: - raise ValueError('Invalid NMS type!') - det_max = pred[nms_indices] - - if len(det_max) > 0: - # Add max detections to outputs - output[image_i] = det_max if output[image_i] is None else torch.cat((output[image_i], det_max)) - - return output - -def fast_nms(boxes, scores, iou_thres:float=0.5, top_k:int=200, second_threshold:bool=False, conf_thres:float=0.5): - ''' - Vectorized, approximated, fast NMS, adopted from YOLACT: - https://github.com/dbolya/yolact/blob/master/layers/functions/detection.py - The original version is for multi-class NMS, here we simplify the code for single-class NMS - ''' - scores, idx = scores.sort(0, descending=True) - - idx = idx[:top_k].contiguous() - scores = scores[:top_k] - num_dets = idx.size() - - boxes = boxes[idx, :] - - iou = jaccard(boxes, boxes) - iou.triu_(diagonal=1) - iou_max, _ = iou.max(dim=0) - - keep = (iou_max <= iou_thres) - - if second_threshold: - keep *= (scores > self.conf_thresh) - - return idx[keep] - - - -@torch.jit.script -def intersect(box_a, box_b): - """ We resize both tensors to [A,B,2] without new malloc: - [A,2] -> [A,1,2] -> [A,B,2] - [B,2] -> [1,B,2] -> [A,B,2] - Then we compute the area of intersect between box_a and box_b. - Args: - box_a: (tensor) bounding boxes, Shape: [n,A,4]. - box_b: (tensor) bounding boxes, Shape: [n,B,4]. - Return: - (tensor) intersection area, Shape: [n,A,B]. - """ - n = box_a.size(0) - A = box_a.size(1) - B = box_b.size(1) - max_xy = torch.min(box_a[:, :, 2:].unsqueeze(2).expand(n, A, B, 2), - box_b[:, :, 2:].unsqueeze(1).expand(n, A, B, 2)) - min_xy = torch.max(box_a[:, :, :2].unsqueeze(2).expand(n, A, B, 2), - box_b[:, :, :2].unsqueeze(1).expand(n, A, B, 2)) - inter = torch.clamp((max_xy - min_xy), min=0) - return inter[:, :, :, 0] * inter[:, :, :, 1] - - - -def jaccard(box_a, box_b, iscrowd:bool=False): - """Compute the jaccard overlap of two sets of boxes. The jaccard overlap - is simply the intersection over union of two boxes. Here we operate on - ground truth boxes and default boxes. If iscrowd=True, put the crowd in box_b. - E.g.: - A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) - Args: - box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] - box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] - Return: - jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] - """ - use_batch = True - if box_a.dim() == 2: - use_batch = False - box_a = box_a[None, ...] - box_b = box_b[None, ...] 
- - inter = intersect(box_a, box_b) - area_a = ((box_a[:, :, 2]-box_a[:, :, 0]) * - (box_a[:, :, 3]-box_a[:, :, 1])).unsqueeze(2).expand_as(inter) # [A,B] - area_b = ((box_b[:, :, 2]-box_b[:, :, 0]) * - (box_b[:, :, 3]-box_b[:, :, 1])).unsqueeze(1).expand_as(inter) # [A,B] - union = area_a + area_b - inter - - out = inter / area_a if iscrowd else inter / union - return out if use_batch else out.squeeze(0) - - - - -def return_torch_unique_index(u, uv): - n = uv.shape[1] # number of columns - first_unique = torch.zeros(n, device=u.device).long() - for j in range(n): - first_unique[j] = (uv[:, j:j + 1] == u).all(0).nonzero()[0] - - return first_unique - - -def strip_optimizer_from_checkpoint(filename='weights/best.pt'): - # Strip optimizer from *.pt files for lighter files (reduced by 2/3 size) - - a = torch.load(filename, map_location='cpu') - a['optimizer'] = [] - torch.save(a, filename.replace('.pt', '_lite.pt')) - - -def plot_results(): - # Plot YOLO training results file 'results.txt' - # import os; os.system('wget https://storage.googleapis.com/ultralytics/yolov3/results_v1.txt') - - plt.figure(figsize=(14, 7)) - s = ['X + Y', 'Width + Height', 'Confidence', 'Classification', 'Total Loss', 'mAP', 'Recall', 'Precision'] - files = sorted(glob.glob('results*.txt')) - for f in files: - results = np.loadtxt(f, usecols=[2, 3, 4, 5, 6, 9, 10, 11]).T # column 11 is mAP - x = range(1, results.shape[1]) - for i in range(8): - plt.subplot(2, 4, i + 1) - plt.plot(x, results[i, x], marker='.', label=f) - plt.title(s[i]) - if i == 0: - plt.legend() -def load_checkpoint(fpath): - r"""Loads checkpoint. - - ``UnicodeDecodeError`` can be well handled, which means - python2-saved files can be read from python3. - - Args: - fpath (str): path to checkpoint. - - Returns: - dict - - Examples:: - >>> from torchreid.utils import load_checkpoint - >>> fpath = 'log/my_model/model.pth.tar-10' - >>> checkpoint = load_checkpoint(fpath) - """ - if fpath is None: - raise ValueError('File path is None') - if not osp.exists(fpath): - raise FileNotFoundError('File is not found at "{}"'.format(fpath)) - map_location = None if torch.cuda.is_available() else 'cpu' - try: - checkpoint = torch.load(fpath, map_location=map_location) - except UnicodeDecodeError: - pickle.load = partial(pickle.load, encoding="latin1") - pickle.Unpickler = partial(pickle.Unpickler, encoding="latin1") - checkpoint = torch.load( - fpath, pickle_module=pickle, map_location=map_location - ) - except Exception: - print('Unable to load checkpoint from "{}"'.format(fpath)) - raise - return checkpoint -def load_pretrained_weights(model, weight_path): - r"""Loads pretrianed weights to model. - - Features:: - - Incompatible layers (unmatched in name or size) will be ignored. - - Can automatically deal with keys containing "module.". - - Args: - model (nn.Module): network model. - weight_path (str): path to pretrained weights. 
- - Examples:: - >>> from torchreid.utils import load_pretrained_weights - >>> weight_path = 'log/my_model/model-best.pth.tar' - >>> load_pretrained_weights(model, weight_path) - """ - checkpoint = load_checkpoint(weight_path) - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - else: - state_dict = checkpoint - - model_dict = model.state_dict() - new_state_dict = OrderedDict() - matched_layers, discarded_layers = [], [] - #show weight keys - ''' - print('keys of loaded model:') - for k, v in state_dict.items(): - print(k) - print('keys of model archs:') - for k, v in model_dict.items(): - print(k) - ''' - for k, v in state_dict.items(): - if not k.startswith('module.'): - k = 'module.' + k - - if k in model_dict and model_dict[k].size() == v.size(): - new_state_dict[k] = v - matched_layers.append(k) - else: - discarded_layers.append(k) - - model_dict.update(new_state_dict) - model.load_state_dict(model_dict) - - if len(matched_layers) == 0: - warnings.warn( - 'The pretrained weights "{}" cannot be loaded, ' - 'please check the key names manually ' - '(** ignored and continue **)'.format(weight_path) - ) - else: - print( - 'loading reid model from {}...'. - format(weight_path) - ) - ''' - if len(discarded_layers) > 0: - print( - '** The following layers are discarded ' - 'due to unmatched keys or layer size: {}'. - format(discarded_layers) - ) - ''' +import glob +import random +import time +import os +import os.path as osp + +import cv2 +import warnings +from functools import partial +from collections import OrderedDict +import matplotlib.pyplot as plt +import numpy as np +import torch +import pickle +import torch.nn.functional as F +from torchvision.ops import nms +#import maskrcnn_benchmark.layers.nms as nms + +def mkdir_if_missing(d): + if not osp.exists(d): + os.makedirs(d) + + +def float3(x): # format floats to 3 decimals + return float(format(x, '.3f')) + + +def init_seeds(seed=0): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def load_classes(path): + """ + Loads class labels at 'path' + """ + fp = open(path, 'r') + names = fp.read().split('\n') + return list(filter(None, names)) # filter removes empty strings (such as last line) + + +def model_info(model): # Plots a line-by-line description of a PyTorch model + n_p = sum(x.numel() for x in model.parameters()) # number parameters + n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients + print('\n%5s %50s %9s %12s %20s %12s %12s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma')) + for i, (name, p) in enumerate(model.named_parameters()): + name = name.replace('module_list.', '') + print('%5g %50s %9s %12g %20s %12.3g %12.3g' % ( + i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())) + print('Model Summary: %g layers, %g parameters, %g gradients\n' % (i + 1, n_p, n_g)) + + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): # Plots one bounding box on image img + tl = line_thickness or round(0.0004 * max(img.shape[0:2])) + 1 # line thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1) # filled + 
cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA) + + +def weights_init_normal(m): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + torch.nn.init.normal_(m.weight.data, 0.0, 0.03) + elif classname.find('BatchNorm2d') != -1: + torch.nn.init.normal_(m.weight.data, 1.0, 0.03) + torch.nn.init.constant_(m.bias.data, 0.0) + + +def xyxy2xywh(x): + # Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h] + y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) + y[:, 0] = (x[:, 0] + x[:, 2]) / 2 + y[:, 1] = (x[:, 1] + x[:, 3]) / 2 + y[:, 2] = x[:, 2] - x[:, 0] + y[:, 3] = x[:, 3] - x[:, 1] + return y + + +def xywh2xyxy(x): + # Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2] + y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) + y[:, 0] = (x[:, 0] - x[:, 2] / 2) + y[:, 1] = (x[:, 1] - x[:, 3] / 2) + y[:, 2] = (x[:, 0] + x[:, 2] / 2) + y[:, 3] = (x[:, 1] + x[:, 3] / 2) + return y +def x1y1x2y2_to_xywh(det): + x1, y1, x2, y2 = det + w, h = int(x2) - int(x1), int(y2) - int(y1) + return [x1, y1, w, h] +def xywh_to_x1y1x2y2(det): + x1, y1, w, h = det + x2, y2 = x1 + w, y1 + h + return [x1, y1, x2, y2] +def expandBbox(xywh,width,height): + scale = 0.05 + if isinstance(xywh, (tuple, list)): + if not len(xywh) == 4: + raise IndexError( + "Bounding boxes must have 4 elements, given {}".format(len(xywh))) + center_x = xywh[0] + xywh[2]/2 + center_y = xywh[1] + xywh[3]/2 + img_width,img_height = xywh[2]+scale*width,xywh[3]+scale*height + x1 = np.minimum(width - 1, np.maximum(0, center_x-img_width/2)) + y1 = np.minimum(height - 1, np.maximum(0, center_y-img_height/2)) + x2 = np.minimum(width - 1, np.maximum(0, center_x+img_width/2)) + y2 = np.minimum(height - 1, np.maximum(0, center_y+img_height/2)) + return (x1, y1, x2, y2) + else: + raise TypeError( + 'Expect input xywh a list or tuple, given {}'.format(type(xywh))) +def bbox_clip_xyxy(xyxy, width, height): + """Clip bounding box with format (xmin, ymin, xmax, ymax) to specified boundary. + + All bounding boxes will be clipped to the new region `(0, 0, width, height)`. + + Parameters + ---------- + xyxy : list, tuple or numpy.ndarray + The bbox in format (xmin, ymin, xmax, ymax). + If numpy.ndarray is provided, we expect multiple bounding boxes with + shape `(N, 4)`. + width : int or float + Boundary width. + height : int or float + Boundary height. + + Returns + ------- + type + Description of returned object. 
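# A hedged example for bbox_clip_xyxy (box and frame size are made up): a box
# that spills past a 640x480 frame is clamped to the valid pixel range.
x1, y1, x2, y2 = bbox_clip_xyxy((-5.0, 10.0, 700.0, 500.0), width=640, height=480)
# -> (0.0, 10.0, 639.0, 479.0): coordinates are limited to [0, width-1] / [0, height-1]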
+ + """ + if isinstance(xyxy, (tuple, list)): + if not len(xyxy) == 4: + raise IndexError( + "Bounding boxes must have 4 elements, given {}".format(len(xyxy))) + x1 = np.minimum(width - 1, np.maximum(0, xyxy[0])) + y1 = np.minimum(height - 1, np.maximum(0, xyxy[1])) + x2 = np.minimum(width - 1, np.maximum(0, xyxy[2])) + y2 = np.minimum(height - 1, np.maximum(0, xyxy[3])) + return (x1, y1, x2, y2) + elif isinstance(xyxy, np.ndarray): + if not xyxy.size % 4 == 0: + raise IndexError( + "Bounding boxes must have n * 4 elements, given {}".format(xyxy.shape)) + x1 = np.minimum(width - 1, np.maximum(0, xyxy[:, 0])) + y1 = np.minimum(height - 1, np.maximum(0, xyxy[:, 1])) + x2 = np.minimum(width - 1, np.maximum(0, xyxy[:, 2])) + y2 = np.minimum(height - 1, np.maximum(0, xyxy[:, 3])) + return np.hstack((x1, y1, x2, y2)) + else: + raise TypeError( + 'Expect input xywh a list, tuple or numpy.ndarray, given {}'.format(type(xyxy))) + +def scale_coords(img_size, coords, img0_shape): + # Rescale x1, y1, x2, y2 from 416 to image size + gain_w = float(img_size[0]) / img0_shape[1] # gain = old / new + gain_h = float(img_size[1]) / img0_shape[0] + gain = min(gain_w, gain_h) + pad_x = (img_size[0] - img0_shape[1] * gain) / 2 # width padding + pad_y = (img_size[1] - img0_shape[0] * gain) / 2 # height padding + coords[:, [0, 2]] -= pad_x + coords[:, [1, 3]] -= pad_y + coords[:, 0:4] /= gain + coords[:, :4] = torch.clamp(coords[:, :4], min=0) + return coords + + +def ap_per_class(tp, conf, pred_cls, target_cls): + """ Compute the average precision, given the recall and precision curves. + Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (list). + conf: Objectness value from 0-1 (list). + pred_cls: Predicted object classes (list). + target_cls: True object classes (list). + # Returns + The average precision as computed in py-faster-rcnn. + """ + + # lists/pytorch to numpy + tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array(pred_cls), np.array(target_cls) + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0)) + + # Create Precision-Recall curve and compute AP for each class + ap, p, r = [], [], [] + for c in unique_classes: + i = pred_cls == c + n_gt = sum(target_cls == c) # Number of ground truth objects + n_p = sum(i) # Number of predicted objects + + if (n_p == 0) and (n_gt == 0): + continue + elif (n_p == 0) or (n_gt == 0): + ap.append(0) + r.append(0) + p.append(0) + else: + # Accumulate FPs and TPs + fpc = np.cumsum(1 - tp[i]) + tpc = np.cumsum(tp[i]) + + # Recall + recall_curve = tpc / (n_gt + 1e-16) + r.append(tpc[-1] / (n_gt + 1e-16)) + + # Precision + precision_curve = tpc / (tpc + fpc) + p.append(tpc[-1] / (tpc[-1] + fpc[-1])) + + # AP from recall-precision curve + ap.append(compute_ap(recall_curve, precision_curve)) + + return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array(p) + + +def compute_ap(recall, precision): + """ Compute the average precision, given the recall and precision curves. + Code originally from https://github.com/rbgirshick/py-faster-rcnn. + # Arguments + recall: The recall curve (list). + precision: The precision curve (list). + # Returns + The average precision as computed in py-faster-rcnn. 
+ """ + # correct AP calculation + # first append sentinel values at the end + + mrec = np.concatenate(([0.], recall, [1.])) + mpre = np.concatenate(([0.], precision, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def bbox_iou(box1, box2, x1y1x2y2=False): + """ + Returns the IoU of two bounding boxes + """ + N, M = len(box1), len(box2) + if x1y1x2y2: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + else: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + + # get the coordinates of the intersection rectangle + inter_rect_x1 = torch.max(b1_x1.unsqueeze(1), b2_x1) + inter_rect_y1 = torch.max(b1_y1.unsqueeze(1), b2_y1) + inter_rect_x2 = torch.min(b1_x2.unsqueeze(1), b2_x2) + inter_rect_y2 = torch.min(b1_y2.unsqueeze(1), b2_y2) + # Intersection area + inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, 0) * torch.clamp(inter_rect_y2 - inter_rect_y1, 0) + # Union Area + b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)) + b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).view(-1,1).expand(N,M) + b2_area = ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).view(1,-1).expand(N,M) + + return inter_area / (b1_area + b2_area - inter_area + 1e-16) + + +def build_targets_max(target, anchor_wh, nA, nC, nGh, nGw): + """ + returns nT, nCorrect, tx, ty, tw, th, tconf, tcls + """ + nB = len(target) # number of images in batch + + txy = torch.zeros(nB, nA, nGh, nGw, 2).cuda() # batch size, anchors, grid size + twh = torch.zeros(nB, nA, nGh, nGw, 2).cuda() + tconf = torch.LongTensor(nB, nA, nGh, nGw).fill_(0).cuda() + tcls = torch.ByteTensor(nB, nA, nGh, nGw, nC).fill_(0).cuda() # nC = number of classes + tid = torch.LongTensor(nB, nA, nGh, nGw, 1).fill_(-1).cuda() + for b in range(nB): + t = target[b] + t_id = t[:, 1].clone().long().cuda() + t = t[:,[0,2,3,4,5]] + nTb = len(t) # number of targets + if nTb == 0: + continue + + #gxy, gwh = t[:, 1:3] * nG, t[:, 3:5] * nG + gxy, gwh = t[: , 1:3].clone() , t[:, 3:5].clone() + gxy[:, 0] = gxy[:, 0] * nGw + gxy[:, 1] = gxy[:, 1] * nGh + gwh[:, 0] = gwh[:, 0] * nGw + gwh[:, 1] = gwh[:, 1] * nGh + gi = torch.clamp(gxy[:, 0], min=0, max=nGw -1).long() + gj = torch.clamp(gxy[:, 1], min=0, max=nGh -1).long() + + # Get grid box indices and prevent overflows (i.e. 
13.01 on 13 anchors) + #gi, gj = torch.clamp(gxy.long(), min=0, max=nG - 1).t() + #gi, gj = gxy.long().t() + + # iou of targets-anchors (using wh only) + box1 = gwh + box2 = anchor_wh.unsqueeze(1) + inter_area = torch.min(box1, box2).prod(2) + iou = inter_area / (box1.prod(1) + box2.prod(2) - inter_area + 1e-16) + + # Select best iou_pred and anchor + iou_best, a = iou.max(0) # best anchor [0-2] for each target + + # Select best unique target-anchor combinations + if nTb > 1: + _, iou_order = torch.sort(-iou_best) # best to worst + + # Unique anchor selection + u = torch.stack((gi, gj, a), 0)[:, iou_order] + # _, first_unique = np.unique(u, axis=1, return_index=True) # first unique indices + first_unique = return_torch_unique_index(u, torch.unique(u, dim=1)) # torch alternative + i = iou_order[first_unique] + # best anchor must share significant commonality (iou) with target + i = i[iou_best[i] > 0.60] # TODO: examine arbitrary threshold + if len(i) == 0: + continue + + a, gj, gi, t = a[i], gj[i], gi[i], t[i] + t_id = t_id[i] + if len(t.shape) == 1: + t = t.view(1, 5) + else: + if iou_best < 0.60: + continue + + tc, gxy, gwh = t[:, 0].long(), t[:, 1:3].clone(), t[:, 3:5].clone() + gxy[:, 0] = gxy[:, 0] * nGw + gxy[:, 1] = gxy[:, 1] * nGh + gwh[:, 0] = gwh[:, 0] * nGw + gwh[:, 1] = gwh[:, 1] * nGh + + # XY coordinates + txy[b, a, gj, gi] = gxy - gxy.floor() + + # Width and height + twh[b, a, gj, gi] = torch.log(gwh / anchor_wh[a]) # yolo method + # twh[b, a, gj, gi] = torch.sqrt(gwh / anchor_wh[a]) / 2 # power method + + # One-hot encoding of label + tcls[b, a, gj, gi, tc] = 1 + tconf[b, a, gj, gi] = 1 + tid[b, a, gj, gi] = t_id.unsqueeze(1) + tbox = torch.cat([txy, twh], -1) + return tconf, tbox, tid + + + +def build_targets_thres(target, anchor_wh, nA, nC, nGh, nGw): + ID_THRESH = 0.5 + FG_THRESH = 0.5 + BG_THRESH = 0.4 + nB = len(target) # number of images in batch + assert(len(anchor_wh)==nA) + + tbox = torch.zeros(nB, nA, nGh, nGw, 4).cuda() # batch size, anchors, grid size + tconf = torch.LongTensor(nB, nA, nGh, nGw).fill_(0).cuda() + tid = torch.LongTensor(nB, nA, nGh, nGw, 1).fill_(-1).cuda() + for b in range(nB): + t = target[b] + t_id = t[:, 1].clone().long().cuda() + t = t[:,[0,2,3,4,5]] + nTb = len(t) # number of targets + if nTb == 0: + continue + + gxy, gwh = t[: , 1:3].clone() , t[:, 3:5].clone() + gxy[:, 0] = gxy[:, 0] * nGw + gxy[:, 1] = gxy[:, 1] * nGh + gwh[:, 0] = gwh[:, 0] * nGw + gwh[:, 1] = gwh[:, 1] * nGh + gxy[:, 0] = torch.clamp(gxy[:, 0], min=0, max=nGw -1) + gxy[:, 1] = torch.clamp(gxy[:, 1], min=0, max=nGh -1) + + gt_boxes = torch.cat([gxy, gwh], dim=1) # Shape Ngx4 (xc, yc, w, h) + + anchor_mesh = generate_anchor(nGh, nGw, anchor_wh) + anchor_list = anchor_mesh.permute(0,2,3,1).contiguous().view(-1, 4) # Shpae (nA x nGh x nGw) x 4 + #print(anchor_list.shape, gt_boxes.shape) + iou_pdist = bbox_iou(anchor_list, gt_boxes) # Shape (nA x nGh x nGw) x Ng + iou_max, max_gt_index = torch.max(iou_pdist, dim=1) # Shape (nA x nGh x nGw), both + + iou_map = iou_max.view(nA, nGh, nGw) + gt_index_map = max_gt_index.view(nA, nGh, nGw) + + #nms_map = pooling_nms(iou_map, 3) + + id_index = iou_map > ID_THRESH + fg_index = iou_map > FG_THRESH + bg_index = iou_map < BG_THRESH + ign_index = (iou_map < FG_THRESH) * (iou_map > BG_THRESH) + tconf[b][fg_index] = 1 + tconf[b][bg_index] = 0 + tconf[b][ign_index] = -1 + + gt_index = gt_index_map[fg_index] + gt_box_list = gt_boxes[gt_index] + gt_id_list = t_id[gt_index_map[id_index]] + #print(gt_index.shape, gt_index_map[id_index].shape, 
gt_boxes.shape) + if torch.sum(fg_index) > 0: + tid[b][id_index] = gt_id_list.unsqueeze(1) + fg_anchor_list = anchor_list.view(nA, nGh, nGw, 4)[fg_index] + delta_target = encode_delta(gt_box_list, fg_anchor_list) + tbox[b][fg_index] = delta_target + return tconf, tbox, tid + +def generate_anchor(nGh, nGw, anchor_wh): + nA = len(anchor_wh) + yy, xx =torch.meshgrid(torch.arange(nGh), torch.arange(nGw)) + xx, yy = xx.cuda(), yy.cuda() + + mesh = torch.stack([xx, yy], dim=0) # Shape 2, nGh, nGw + mesh = mesh.unsqueeze(0).repeat(nA,1,1,1).float() # Shape nA x 2 x nGh x nGw + anchor_offset_mesh = anchor_wh.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, nGh,nGw) # Shape nA x 2 x nGh x nGw + anchor_mesh = torch.cat([mesh, anchor_offset_mesh], dim=1) # Shape nA x 4 x nGh x nGw + return anchor_mesh + +def encode_delta(gt_box_list, fg_anchor_list): + px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ + fg_anchor_list[:, 2], fg_anchor_list[:,3] + gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \ + gt_box_list[:, 2], gt_box_list[:, 3] + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = torch.log(gw/pw) + dh = torch.log(gh/ph) + return torch.stack([dx, dy, dw, dh], dim=1) + +def decode_delta(delta, fg_anchor_list): + px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ + fg_anchor_list[:, 2], fg_anchor_list[:,3] + dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3] + gx = pw * dx + px + gy = ph * dy + py + gw = pw * torch.exp(dw) + gh = ph * torch.exp(dh) + return torch.stack([gx, gy, gw, gh], dim=1) + +def decode_delta_map(delta_map, anchors): + ''' + :param: delta_map, shape (nB, nA, nGh, nGw, 4) + :param: anchors, shape (nA,4) + ''' + nB, nA, nGh, nGw, _ = delta_map.shape + anchor_mesh = generate_anchor(nGh, nGw, anchors) + anchor_mesh = anchor_mesh.permute(0,2,3,1).contiguous() # Shpae (nA x nGh x nGw) x 4 + anchor_mesh = anchor_mesh.unsqueeze(0).repeat(nB,1,1,1,1) + pred_list = decode_delta(delta_map.view(-1,4), anchor_mesh.view(-1,4)) + pred_map = pred_list.view(nB, nA, nGh, nGw, 4) + return pred_map + + +def pooling_nms(heatmap, kernel=1): + pad = (kernel -1 ) // 2 + hmax = F.max_pool2d(heatmap, (kernel, kernel), stride=1, padding=pad) + keep = (hmax == heatmap).float() + return keep * heatmap + +def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4, method='standard'): + """ + Removes detections with lower object confidence score than 'conf_thres' + Non-Maximum Suppression to further filter detections. 
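+    The 'fast' method uses the vectorized, YOLACT-style fast_nms defined below; 'standard' uses the nms routine imported by this module.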
+ Returns detections with shape: + (x1, y1, x2, y2, object_conf, class_score, class_pred) + Args: + prediction, + conf_thres, + nms_thres, + method = 'standard' or 'fast' + """ + + output = [None for _ in range(len(prediction))] + for image_i, pred in enumerate(prediction): + # Filter out confidence scores below threshold + # Get score and class with highest confidence + + v = pred[:, 4] > conf_thres + v = v.nonzero().squeeze() + if len(v.shape) == 0: + v = v.unsqueeze(0) + + pred = pred[v] + + # If none are remaining => process next image + nP = pred.shape[0] + if not nP: + continue + # From (center x, center y, width, height) to (x1, y1, x2, y2) + pred[:, :4] = xywh2xyxy(pred[:, :4]) + + + # Non-maximum suppression + if method == 'standard': + nms_indices = nms(pred[:, :4], pred[:, 4], nms_thres) + elif method == 'fast': + nms_indices = fast_nms(pred[:, :4], pred[:, 4], iou_thres=nms_thres, conf_thres=conf_thres) + else: + raise ValueError('Invalid NMS type!') + det_max = pred[nms_indices] + + if len(det_max) > 0: + # Add max detections to outputs + output[image_i] = det_max if output[image_i] is None else torch.cat((output[image_i], det_max)) + + return output + +def fast_nms(boxes, scores, iou_thres:float=0.5, top_k:int=200, second_threshold:bool=False, conf_thres:float=0.5): + ''' + Vectorized, approximated, fast NMS, adopted from YOLACT: + https://github.com/dbolya/yolact/blob/master/layers/functions/detection.py + The original version is for multi-class NMS, here we simplify the code for single-class NMS + ''' + scores, idx = scores.sort(0, descending=True) + + idx = idx[:top_k].contiguous() + scores = scores[:top_k] + num_dets = idx.size() + + boxes = boxes[idx, :] + + iou = jaccard(boxes, boxes) + iou.triu_(diagonal=1) + iou_max, _ = iou.max(dim=0) + + keep = (iou_max <= iou_thres) + + if second_threshold: + keep *= (scores > self.conf_thresh) + + return idx[keep] + + + +@torch.jit.script +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [n,A,4]. + box_b: (tensor) bounding boxes, Shape: [n,B,4]. + Return: + (tensor) intersection area, Shape: [n,A,B]. + """ + n = box_a.size(0) + A = box_a.size(1) + B = box_b.size(1) + max_xy = torch.min(box_a[:, :, 2:].unsqueeze(2).expand(n, A, B, 2), + box_b[:, :, 2:].unsqueeze(1).expand(n, A, B, 2)) + min_xy = torch.max(box_a[:, :, :2].unsqueeze(2).expand(n, A, B, 2), + box_b[:, :, :2].unsqueeze(1).expand(n, A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, :, 0] * inter[:, :, :, 1] + + + +def jaccard(box_a, box_b, iscrowd:bool=False): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. If iscrowd=True, put the crowd in box_b. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + use_batch = True + if box_a.dim() == 2: + use_batch = False + box_a = box_a[None, ...] + box_b = box_b[None, ...] 
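+    # Both inputs now carry a leading batch dimension; compute pairwise intersection areas, then divide by the union (or by area_a alone when iscrowd=True).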
+ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, :, 2]-box_a[:, :, 0]) * + (box_a[:, :, 3]-box_a[:, :, 1])).unsqueeze(2).expand_as(inter) # [A,B] + area_b = ((box_b[:, :, 2]-box_b[:, :, 0]) * + (box_b[:, :, 3]-box_b[:, :, 1])).unsqueeze(1).expand_as(inter) # [A,B] + union = area_a + area_b - inter + + out = inter / area_a if iscrowd else inter / union + return out if use_batch else out.squeeze(0) + + + + +def return_torch_unique_index(u, uv): + n = uv.shape[1] # number of columns + first_unique = torch.zeros(n, device=u.device).long() + for j in range(n): + first_unique[j] = (uv[:, j:j + 1] == u).all(0).nonzero()[0] + + return first_unique + + +def strip_optimizer_from_checkpoint(filename='weights/best.pt'): + # Strip optimizer from *.pt files for lighter files (reduced by 2/3 size) + + a = torch.load(filename, map_location='cpu') + a['optimizer'] = [] + torch.save(a, filename.replace('.pt', '_lite.pt')) + + +def plot_results(): + # Plot YOLO training results file 'results.txt' + # import os; os.system('wget https://storage.googleapis.com/ultralytics/yolov3/results_v1.txt') + + plt.figure(figsize=(14, 7)) + s = ['X + Y', 'Width + Height', 'Confidence', 'Classification', 'Total Loss', 'mAP', 'Recall', 'Precision'] + files = sorted(glob.glob('results*.txt')) + for f in files: + results = np.loadtxt(f, usecols=[2, 3, 4, 5, 6, 9, 10, 11]).T # column 11 is mAP + x = range(1, results.shape[1]) + for i in range(8): + plt.subplot(2, 4, i + 1) + plt.plot(x, results[i, x], marker='.', label=f) + plt.title(s[i]) + if i == 0: + plt.legend() +def load_checkpoint(fpath): + r"""Loads checkpoint. + + ``UnicodeDecodeError`` can be well handled, which means + python2-saved files can be read from python3. + + Args: + fpath (str): path to checkpoint. + + Returns: + dict + + Examples:: + >>> from torchreid.utils import load_checkpoint + >>> fpath = 'log/my_model/model.pth.tar-10' + >>> checkpoint = load_checkpoint(fpath) + """ + if fpath is None: + raise ValueError('File path is None') + if not osp.exists(fpath): + raise FileNotFoundError('File is not found at "{}"'.format(fpath)) + map_location = None if torch.cuda.is_available() else 'cpu' + try: + checkpoint = torch.load(fpath, map_location=map_location) + except UnicodeDecodeError: + pickle.load = partial(pickle.load, encoding="latin1") + pickle.Unpickler = partial(pickle.Unpickler, encoding="latin1") + checkpoint = torch.load( + fpath, pickle_module=pickle, map_location=map_location + ) + except Exception: + print('Unable to load checkpoint from "{}"'.format(fpath)) + raise + return checkpoint +def load_pretrained_weights(model, weight_path): + r"""Loads pretrianed weights to model. + + Features:: + - Incompatible layers (unmatched in name or size) will be ignored. + - Can automatically deal with keys containing "module.". + + Args: + model (nn.Module): network model. + weight_path (str): path to pretrained weights. 
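+        Note: keys that lack a leading "module." prefix are prefixed automatically before being matched against the model's state_dict; layers whose names or sizes do not match are skipped.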
+ + Examples:: + >>> from torchreid.utils import load_pretrained_weights + >>> weight_path = 'log/my_model/model-best.pth.tar' + >>> load_pretrained_weights(model, weight_path) + """ + checkpoint = load_checkpoint(weight_path) + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + #show weight keys + ''' + print('keys of loaded model:') + for k, v in state_dict.items(): + print(k) + print('keys of model archs:') + for k, v in model_dict.items(): + print(k) + ''' + for k, v in state_dict.items(): + if not k.startswith('module.'): + k = 'module.' + k + + if k in model_dict and model_dict[k].size() == v.size(): + new_state_dict[k] = v + matched_layers.append(k) + else: + discarded_layers.append(k) + + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) + + if len(matched_layers) == 0: + warnings.warn( + 'The pretrained weights "{}" cannot be loaded, ' + 'please check the key names manually ' + '(** ignored and continue **)'.format(weight_path) + ) + else: + print( + 'loading reid model from {}...'. + format(weight_path) + ) + ''' + if len(discarded_layers) > 0: + print( + '** The following layers are discarded ' + 'due to unmatched keys or layer size: {}'. + format(discarded_layers) + ) + ''' diff --git a/cv/pose/openpose/mindspore/infer/mxbase/process/src/OpenPose.cpp b/cv/pose/openpose/mindspore/infer/mxbase/process/src/OpenPose.cpp index 8d859a334..1634dfe8c 100644 --- a/cv/pose/openpose/mindspore/infer/mxbase/process/src/OpenPose.cpp +++ b/cv/pose/openpose/mindspore/infer/mxbase/process/src/OpenPose.cpp @@ -1,349 +1,349 @@ -/* -* Copyright (c) 2021. Huawei Technologies Co., Ltd. All rights reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "acl/acl.h" -#include "MxBase/DeviceManager/DeviceManager.h" -#include "MxBase/Log/Log.h" -#include "OpenPose.h" - -using namespace MxBase; -namespace { - const uint32_t YUV_BYTE_NU = 3; - const uint32_t YUV_BYTE_DE = 2; - const uint32_t MODEL_HEIGHT = 560; - const uint32_t MODEL_WIDTH = 560; - const int NPOINTS = 18; -} // namespace - -void PrintTensorShape(const std::vector &tensorDescVec, const std::string &tensorName) { - LogInfo << "The shape of " << tensorName << " is as follows:"; - for (size_t i = 0; i < tensorDescVec.size(); ++i) { - LogInfo << " Tensor " << i << ":"; - for (size_t j = 0; j < tensorDescVec[i].tensorDims.size(); ++j) { - LogInfo << " dim: " << j << ": " << tensorDescVec[i].tensorDims[j]; - } - } -} - -APP_ERROR OpenPose::Init(const InitParam &initParam) { - deviceId_ = initParam.deviceId; - APP_ERROR ret = MxBase::DeviceManager::GetInstance()->InitDevices(); - if (ret != APP_ERR_OK) { - LogError << "Init devices failed, ret=" << ret << "."; - return ret; - } - ret = MxBase::TensorContext::GetInstance()->SetContext(initParam.deviceId); - if (ret != APP_ERR_OK) { - LogError << "Set context failed, ret=" << ret << "."; - return ret; - } - dvppWrapper_ = std::make_shared(); - ret = dvppWrapper_->Init(); - if (ret != APP_ERR_OK) { - LogError << "DvppWrapper init failed, ret=" << ret << "."; - return ret; - } - model_ = std::make_shared(); - ret = model_->Init(initParam.modelPath, modelDesc_); - if (ret != APP_ERR_OK) { - LogError << "ModelInferenceProcessor init failed, ret=" << ret << "."; - return ret; - } - PrintTensorShape(modelDesc_.inputTensors, "Model Input Tensors"); - PrintTensorShape(modelDesc_.outputTensors, "Model Output Tensors"); - post_ = std::make_shared(); - return APP_ERR_OK; -} - -APP_ERROR OpenPose::DeInit() { - dvppWrapper_->DeInit(); - model_->DeInit(); - MxBase::DeviceManager::GetInstance()->DestroyDevices(); - return APP_ERR_OK; -} - -APP_ERROR OpenPose::ReadImage(const std::string &imgPath, MxBase::DvppDataInfo *output, ImageShape *imgShape) { - APP_ERROR ret = dvppWrapper_->DvppJpegDecode(imgPath, *output); - if (ret != APP_ERR_OK) { - LogError << "DvppWrapper DvppJpegDecode failed, ret=" << ret << "."; - return ret; - } - imgShape->width = output->width; - imgShape->height = output->height; - return APP_ERR_OK; -} - -APP_ERROR OpenPose::Resize(const MxBase::DvppDataInfo &input, MxBase::TensorBase *outputTensor) { - MxBase::CropRoiConfig cropRoi = {0, input.width, input.height, 0}; - float ratio = - std::min(static_cast(MODEL_WIDTH) / input.width, static_cast(MODEL_HEIGHT) / input.height); - MxBase::CropRoiConfig pasteRoi = {0, 0, 0, 0}; - pasteRoi.x1 = input.width * ratio; - pasteRoi.y1 = input.height * ratio; - - MxBase::MemoryData memoryData(MODEL_WIDTH * MODEL_HEIGHT * YUV_BYTE_NU / YUV_BYTE_DE, - MemoryData::MemoryType::MEMORY_DVPP, deviceId_); - APP_ERROR ret = MxBase::MemoryHelper::MxbsMalloc(memoryData); - if (ret != APP_ERR_OK) { - LogError << "Fail to allocate dvpp memory."; - MemoryHelper::MxbsFree(memoryData); - return APP_ERR_COMM_INVALID_PARAM; - } - - ret = MxBase::MemoryHelper::MxbsMemset(memoryData, 0, memoryData.size); - if (ret != APP_ERR_OK) { - LogError << "Fail to set 0."; - MemoryHelper::MxbsFree(memoryData); - return APP_ERR_COMM_INVALID_PARAM; - } - - MxBase::DvppDataInfo output = {}; - output.dataSize = memoryData.size; - output.width = MODEL_WIDTH; - output.height = MODEL_HEIGHT; - output.widthStride = 
MODEL_WIDTH; - output.heightStride = MODEL_HEIGHT; - output.format = input.format; - output.data = static_cast(memoryData.ptrData); - - ret = dvppWrapper_->VpcCropAndPaste(input, output, pasteRoi, cropRoi); - if (ret != APP_ERR_OK) { - LogError << "VpcCropAndPaste failed, ret=" << ret << "."; - return ret; - } - - std::vector shape = {output.heightStride * YUV_BYTE_NU / YUV_BYTE_DE, output.widthStride}; - *outputTensor = TensorBase(memoryData, false, shape, TENSOR_DTYPE_UINT8); - return APP_ERR_OK; -} - -APP_ERROR OpenPose::Inference(const std::vector &inputs, - std::vector *outputs) { - auto dtypes = model_->GetOutputDataType(); - for (size_t i = 0; i < modelDesc_.outputTensors.size(); ++i) { - std::vector shape = {}; - for (size_t j = 0; j < modelDesc_.outputTensors[i].tensorDims.size(); ++j) { - shape.push_back((uint32_t)modelDesc_.outputTensors[i].tensorDims[j]); - } - TensorBase tensor(shape, dtypes[i], MemoryData::MemoryType::MEMORY_DEVICE, deviceId_); - APP_ERROR ret = TensorBase::TensorBaseMalloc(tensor); - if (ret != APP_ERR_OK) { - LogError << "TensorBaseMalloc failed, ret=" << ret << "."; - return ret; - } - outputs->push_back(tensor); - } - DynamicInfo dynamicInfo = {}; - dynamicInfo.dynamicType = DynamicType::STATIC_BATCH; - dynamicInfo.batchSize = 1; - - APP_ERROR ret = model_->ModelInference(inputs, *outputs, dynamicInfo); - if (ret != APP_ERR_OK) { - LogError << "ModelInference failed, ret=" << ret << "."; - return ret; - } - - return APP_ERR_OK; -} - -APP_ERROR OpenPose::PostProcess(const std::vector &inputs, - const std::vector &vision_infos, - std::vector > *person_list) { - APP_ERROR ret = post_->selfProcess(inputs, vision_infos, person_list); - if (ret != APP_ERR_OK) { - LogError << "Process failed, ret=" << ret << "."; - return ret; - } - return APP_ERR_OK; -} - -APP_ERROR OpenPose::GetInferResults(const std::string &imgPath, const std::vector > &person_list, - const std::string &resultPath) { - std::string fileName = imgPath.substr(imgPath.find_last_of("/") + 1); - size_t dot = fileName.find_last_of("."); - std::string resFileName = resultPath + fileName.substr(0, dot) + "_1.txt"; - std::ofstream outfile(resFileName); - if (outfile.fail()) { - LogError << "Failed to open result file: "; - return APP_ERR_COMM_FAILURE; - } - std::vector > coco_keypoints; - std::vector scores; - float coor_bias = 0.5; - float float_equal_zero_bias = 0.000001; - for (int k = 0; k < person_list.size(); k++) { - float person_score = post_->PersonScore(person_list[k]); - // Ignore person with score 0 - if (fabs(person_score - 0) < float_equal_zero_bias) { - continue; - } - person_score = person_score - 1; - std::vector keypoints(17*3, 0.0); - int to_coco_map[] = {0, -1, 6, 8, 10, 5, 7, 9, 12, 14, 16, 11, 13, 15, 2, 1, 4, 3}; - std::set seen_idx = {1}; - for (int j = 0; j < person_list[k].size(); j++) { - PartPair skele = person_list[k][j]; - int part_idx1 = skele.partIdx1; - // two end points of a skeleton - int part_idx2 = skele.partIdx2; - if (seen_idx.count(part_idx1) == 0) { - float center_x = skele.coord1[0] + coor_bias; - float center_y = skele.coord1[1] + coor_bias; - keypoints[to_coco_map[part_idx1] * 3 + 0] = center_x; - keypoints[to_coco_map[part_idx1] * 3 + 1] = center_y; - keypoints[to_coco_map[part_idx1] * 3 + 2] = 1; - seen_idx.insert(part_idx1); - } - if (seen_idx.count(part_idx2) == 0) { - float center_x = skele.coord2[0] + coor_bias; - float center_y = skele.coord2[1] + coor_bias; - keypoints[to_coco_map[part_idx2] * 3 + 0] = center_x; - keypoints[to_coco_map[part_idx2] * 3 
+ 1] = center_y; - keypoints[to_coco_map[part_idx2] * 3 + 2] = 1; - seen_idx.insert(part_idx2); - } - } - coco_keypoints.push_back(keypoints); - scores.push_back(person_score); - std::string resultStr; - resultStr += "["; - std::cout << "keypoints: ["; - int i = 0; - for (i = 0; i < keypoints.size()-1; i++) { - resultStr += std::to_string(keypoints[i]) + ","; - } - resultStr += "]"; - outfile << resultStr << std::endl; - resultStr = "person_score: "; - resultStr += std::to_string(person_score); - outfile << resultStr << std::endl; - } - outfile.close(); - return APP_ERR_OK; -} - -void OpenPose::DrawPoseBbox(const std::string &imgPath, const std::vector > &person_list, - const std::string &resultPath) { - std::vector > COCO_PAIRS_RENDER = {std::vector{1, 2}, std::vector{1, 5}, - std::vector{2, 3}, std::vector{3, 4}, std::vector{5, 6}, std::vector{6, 7}, - std::vector{1, 8}, std::vector{8, 9}, std::vector{9, 10}, std::vector{1, 11}, - std::vector{11, 12}, std::vector{12, 13}, std::vector{1, 0}, std::vector{0, 14}, - std::vector{14, 16}, std::vector{0, 15}, std::vector{15, 17}}; // = 19 - - std::vector COCO_COLORS = {cv::Scalar(255, 0, 0), cv::Scalar(255, 85, 0), cv::Scalar(255, 170, 0), - cv::Scalar(255, 255, 0), cv::Scalar(170, 255, 0), cv::Scalar(85, 255, 0), cv::Scalar(0, 255, 0), - cv::Scalar(0, 255, 85), cv::Scalar(0, 255, 170), cv::Scalar(0, 255, 255), cv::Scalar(0, 170, 255), - cv::Scalar(0, 85, 255), cv::Scalar(0, 0, 255), cv::Scalar(85, 0, 255), cv::Scalar(170, 0, 255), - cv::Scalar(255, 0, 255), cv::Scalar(255, 0, 170), cv::Scalar(255, 0, 85)}; - - cv::Mat imageMat = cv::imread(imgPath, cv::IMREAD_COLOR); - for (int k = 0; k < person_list.size(); k++) { - std::map centers; - std::set seen_idx; - for (int j = 0; j < person_list[k].size(); j++) { - PartPair skele = person_list[k][j]; - // two end points of a skeleton - int part_idx1 = skele.partIdx1; - int part_idx2 = skele.partIdx2; - if (seen_idx.count(part_idx1) == 0) { - cv::Point center; - center.x = static_cast(skele.coord1[0]); - center.y = static_cast(skele.coord1[1]); - centers[part_idx1] = center; - cv::circle(imageMat, center, 3, COCO_COLORS[part_idx1], -1, cv::LINE_AA); - seen_idx.insert(part_idx1); - } - if (seen_idx.count(part_idx2) == 0) { - cv::Point center; - center.x = static_cast(skele.coord2[0]); - center.y = static_cast(skele.coord2[1]); - centers[part_idx2] = center; - cv::circle(imageMat, center, 3, COCO_COLORS[part_idx2], -1, cv::LINE_AA); - seen_idx.insert(part_idx2); - } - } - for (int i = 0; i < COCO_PAIRS_RENDER.size(); i++) { - std::vector pair = COCO_PAIRS_RENDER[i]; - if ((seen_idx.count(pair[0]) != 0) && (seen_idx.count(pair[1]) != 0)) - cv::line(imageMat, centers[pair[0]], centers[pair[1]], COCO_COLORS[i], 2, cv::LINE_AA); - } - } - std::string fileName = imgPath.substr(imgPath.find_last_of("/") + 1); - size_t dot = fileName.find_last_of("."); - std::string resFileName = resultPath + fileName.substr(0, dot) + "_detect_result.jpg"; - cv::imwrite(resFileName, imageMat); -} - -APP_ERROR OpenPose::Process(const std::string &imgPath, const std::string &resultPath) { - ImageShape imageShape{}; - MxBase::DvppDataInfo dvppData = {}; - - APP_ERROR ret = ReadImage(imgPath, &dvppData, &imageShape); - if (ret != APP_ERR_OK) { - LogError << "ReadImage failed, ret=" << ret << "."; - return ret; - } - TensorBase resizeImage; - ret = Resize(dvppData, &resizeImage); - if (ret != APP_ERR_OK) { - LogError << "Resize failed, ret=" << ret << "."; - return ret; - } - - std::vector vision_infos; - 
vision_infos.push_back(imageShape.height); - vision_infos.push_back(imageShape.width); - vision_infos.push_back(MODEL_HEIGHT); - vision_infos.push_back(MODEL_WIDTH); - - std::vector inputs = {}; - std::vector outputs = {}; - - inputs.push_back(resizeImage); - - ret = Inference(inputs, &outputs); - if (ret != APP_ERR_OK) { - LogError << "Inference failed, ret=" << ret << "."; - return ret; - } - LogInfo << "Inference success, ret=" << ret << "."; - std::vector > person_list {}; - - ret = PostProcess(outputs, vision_infos, &person_list); - if (ret != APP_ERR_OK) { - LogError << "PostProcess failed, ret=" << ret << "."; - return ret; - } - // Get keypoints and person_score info. - ret = GetInferResults(imgPath, person_list, resultPath); - if (ret != APP_ERR_OK) { - LogError << "Save infer results into file failed. ret = " << ret << "."; - return ret; - } - // Visualize the postprocess results. - DrawPoseBbox(imgPath, person_list, resultPath); - return APP_ERR_OK; -} +/* +* Copyright (c) 2021. Huawei Technologies Co., Ltd. All rights reserved. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "MxBase/DeviceManager/DeviceManager.h" +#include "MxBase/Log/Log.h" +#include "OpenPose.h" + +using namespace MxBase; +namespace { + const uint32_t YUV_BYTE_NU = 3; + const uint32_t YUV_BYTE_DE = 2; + const uint32_t MODEL_HEIGHT = 560; + const uint32_t MODEL_WIDTH = 560; + const int NPOINTS = 18; +} // namespace + +void PrintTensorShape(const std::vector &tensorDescVec, const std::string &tensorName) { + LogInfo << "The shape of " << tensorName << " is as follows:"; + for (size_t i = 0; i < tensorDescVec.size(); ++i) { + LogInfo << " Tensor " << i << ":"; + for (size_t j = 0; j < tensorDescVec[i].tensorDims.size(); ++j) { + LogInfo << " dim: " << j << ": " << tensorDescVec[i].tensorDims[j]; + } + } +} + +APP_ERROR OpenPose::Init(const InitParam &initParam) { + deviceId_ = initParam.deviceId; + APP_ERROR ret = MxBase::DeviceManager::GetInstance()->InitDevices(); + if (ret != APP_ERR_OK) { + LogError << "Init devices failed, ret=" << ret << "."; + return ret; + } + ret = MxBase::TensorContext::GetInstance()->SetContext(initParam.deviceId); + if (ret != APP_ERR_OK) { + LogError << "Set context failed, ret=" << ret << "."; + return ret; + } + dvppWrapper_ = std::make_shared(); + ret = dvppWrapper_->Init(); + if (ret != APP_ERR_OK) { + LogError << "DvppWrapper init failed, ret=" << ret << "."; + return ret; + } + model_ = std::make_shared(); + ret = model_->Init(initParam.modelPath, modelDesc_); + if (ret != APP_ERR_OK) { + LogError << "ModelInferenceProcessor init failed, ret=" << ret << "."; + return ret; + } + PrintTensorShape(modelDesc_.inputTensors, "Model Input Tensors"); + PrintTensorShape(modelDesc_.outputTensors, "Model Output Tensors"); + post_ = std::make_shared(); + return APP_ERR_OK; +} + +APP_ERROR OpenPose::DeInit() { + dvppWrapper_->DeInit(); + model_->DeInit(); + 
MxBase::DeviceManager::GetInstance()->DestroyDevices(); + return APP_ERR_OK; +} + +APP_ERROR OpenPose::ReadImage(const std::string &imgPath, MxBase::DvppDataInfo *output, ImageShape *imgShape) { + APP_ERROR ret = dvppWrapper_->DvppJpegDecode(imgPath, *output); + if (ret != APP_ERR_OK) { + LogError << "DvppWrapper DvppJpegDecode failed, ret=" << ret << "."; + return ret; + } + imgShape->width = output->width; + imgShape->height = output->height; + return APP_ERR_OK; +} + +APP_ERROR OpenPose::Resize(const MxBase::DvppDataInfo &input, MxBase::TensorBase *outputTensor) { + MxBase::CropRoiConfig cropRoi = {0, input.width, input.height, 0}; + float ratio = + std::min(static_cast(MODEL_WIDTH) / input.width, static_cast(MODEL_HEIGHT) / input.height); + MxBase::CropRoiConfig pasteRoi = {0, 0, 0, 0}; + pasteRoi.x1 = input.width * ratio; + pasteRoi.y1 = input.height * ratio; + + MxBase::MemoryData memoryData(MODEL_WIDTH * MODEL_HEIGHT * YUV_BYTE_NU / YUV_BYTE_DE, + MemoryData::MemoryType::MEMORY_DVPP, deviceId_); + APP_ERROR ret = MxBase::MemoryHelper::MxbsMalloc(memoryData); + if (ret != APP_ERR_OK) { + LogError << "Fail to allocate dvpp memory."; + MemoryHelper::MxbsFree(memoryData); + return APP_ERR_COMM_INVALID_PARAM; + } + + ret = MxBase::MemoryHelper::MxbsMemset(memoryData, 0, memoryData.size); + if (ret != APP_ERR_OK) { + LogError << "Fail to set 0."; + MemoryHelper::MxbsFree(memoryData); + return APP_ERR_COMM_INVALID_PARAM; + } + + MxBase::DvppDataInfo output = {}; + output.dataSize = memoryData.size; + output.width = MODEL_WIDTH; + output.height = MODEL_HEIGHT; + output.widthStride = MODEL_WIDTH; + output.heightStride = MODEL_HEIGHT; + output.format = input.format; + output.data = static_cast(memoryData.ptrData); + + ret = dvppWrapper_->VpcCropAndPaste(input, output, pasteRoi, cropRoi); + if (ret != APP_ERR_OK) { + LogError << "VpcCropAndPaste failed, ret=" << ret << "."; + return ret; + } + + std::vector shape = {output.heightStride * YUV_BYTE_NU / YUV_BYTE_DE, output.widthStride}; + *outputTensor = TensorBase(memoryData, false, shape, TENSOR_DTYPE_UINT8); + return APP_ERR_OK; +} + +APP_ERROR OpenPose::Inference(const std::vector &inputs, + std::vector *outputs) { + auto dtypes = model_->GetOutputDataType(); + for (size_t i = 0; i < modelDesc_.outputTensors.size(); ++i) { + std::vector shape = {}; + for (size_t j = 0; j < modelDesc_.outputTensors[i].tensorDims.size(); ++j) { + shape.push_back((uint32_t)modelDesc_.outputTensors[i].tensorDims[j]); + } + TensorBase tensor(shape, dtypes[i], MemoryData::MemoryType::MEMORY_DEVICE, deviceId_); + APP_ERROR ret = TensorBase::TensorBaseMalloc(tensor); + if (ret != APP_ERR_OK) { + LogError << "TensorBaseMalloc failed, ret=" << ret << "."; + return ret; + } + outputs->push_back(tensor); + } + DynamicInfo dynamicInfo = {}; + dynamicInfo.dynamicType = DynamicType::STATIC_BATCH; + dynamicInfo.batchSize = 1; + + APP_ERROR ret = model_->ModelInference(inputs, *outputs, dynamicInfo); + if (ret != APP_ERR_OK) { + LogError << "ModelInference failed, ret=" << ret << "."; + return ret; + } + + return APP_ERR_OK; +} + +APP_ERROR OpenPose::PostProcess(const std::vector &inputs, + const std::vector &vision_infos, + std::vector > *person_list) { + APP_ERROR ret = post_->selfProcess(inputs, vision_infos, person_list); + if (ret != APP_ERR_OK) { + LogError << "Process failed, ret=" << ret << "."; + return ret; + } + return APP_ERR_OK; +} + +APP_ERROR OpenPose::GetInferResults(const std::string &imgPath, const std::vector > &person_list, + const std::string 
&resultPath) { + std::string fileName = imgPath.substr(imgPath.find_last_of("/") + 1); + size_t dot = fileName.find_last_of("."); + std::string resFileName = resultPath + fileName.substr(0, dot) + "_1.txt"; + std::ofstream outfile(resFileName); + if (outfile.fail()) { + LogError << "Failed to open result file: "; + return APP_ERR_COMM_FAILURE; + } + std::vector > coco_keypoints; + std::vector scores; + float coor_bias = 0.5; + float float_equal_zero_bias = 0.000001; + for (int k = 0; k < person_list.size(); k++) { + float person_score = post_->PersonScore(person_list[k]); + // Ignore person with score 0 + if (fabs(person_score - 0) < float_equal_zero_bias) { + continue; + } + person_score = person_score - 1; + std::vector keypoints(17*3, 0.0); + int to_coco_map[] = {0, -1, 6, 8, 10, 5, 7, 9, 12, 14, 16, 11, 13, 15, 2, 1, 4, 3}; + std::set seen_idx = {1}; + for (int j = 0; j < person_list[k].size(); j++) { + PartPair skele = person_list[k][j]; + int part_idx1 = skele.partIdx1; + // two end points of a skeleton + int part_idx2 = skele.partIdx2; + if (seen_idx.count(part_idx1) == 0) { + float center_x = skele.coord1[0] + coor_bias; + float center_y = skele.coord1[1] + coor_bias; + keypoints[to_coco_map[part_idx1] * 3 + 0] = center_x; + keypoints[to_coco_map[part_idx1] * 3 + 1] = center_y; + keypoints[to_coco_map[part_idx1] * 3 + 2] = 1; + seen_idx.insert(part_idx1); + } + if (seen_idx.count(part_idx2) == 0) { + float center_x = skele.coord2[0] + coor_bias; + float center_y = skele.coord2[1] + coor_bias; + keypoints[to_coco_map[part_idx2] * 3 + 0] = center_x; + keypoints[to_coco_map[part_idx2] * 3 + 1] = center_y; + keypoints[to_coco_map[part_idx2] * 3 + 2] = 1; + seen_idx.insert(part_idx2); + } + } + coco_keypoints.push_back(keypoints); + scores.push_back(person_score); + std::string resultStr; + resultStr += "["; + std::cout << "keypoints: ["; + int i = 0; + for (i = 0; i < keypoints.size()-1; i++) { + resultStr += std::to_string(keypoints[i]) + ","; + } + resultStr += "]"; + outfile << resultStr << std::endl; + resultStr = "person_score: "; + resultStr += std::to_string(person_score); + outfile << resultStr << std::endl; + } + outfile.close(); + return APP_ERR_OK; +} + +void OpenPose::DrawPoseBbox(const std::string &imgPath, const std::vector > &person_list, + const std::string &resultPath) { + std::vector > COCO_PAIRS_RENDER = {std::vector{1, 2}, std::vector{1, 5}, + std::vector{2, 3}, std::vector{3, 4}, std::vector{5, 6}, std::vector{6, 7}, + std::vector{1, 8}, std::vector{8, 9}, std::vector{9, 10}, std::vector{1, 11}, + std::vector{11, 12}, std::vector{12, 13}, std::vector{1, 0}, std::vector{0, 14}, + std::vector{14, 16}, std::vector{0, 15}, std::vector{15, 17}}; // = 19 + + std::vector COCO_COLORS = {cv::Scalar(255, 0, 0), cv::Scalar(255, 85, 0), cv::Scalar(255, 170, 0), + cv::Scalar(255, 255, 0), cv::Scalar(170, 255, 0), cv::Scalar(85, 255, 0), cv::Scalar(0, 255, 0), + cv::Scalar(0, 255, 85), cv::Scalar(0, 255, 170), cv::Scalar(0, 255, 255), cv::Scalar(0, 170, 255), + cv::Scalar(0, 85, 255), cv::Scalar(0, 0, 255), cv::Scalar(85, 0, 255), cv::Scalar(170, 0, 255), + cv::Scalar(255, 0, 255), cv::Scalar(255, 0, 170), cv::Scalar(255, 0, 85)}; + + cv::Mat imageMat = cv::imread(imgPath, cv::IMREAD_COLOR); + for (int k = 0; k < person_list.size(); k++) { + std::map centers; + std::set seen_idx; + for (int j = 0; j < person_list[k].size(); j++) { + PartPair skele = person_list[k][j]; + // two end points of a skeleton + int part_idx1 = skele.partIdx1; + int part_idx2 = skele.partIdx2; + if 
(seen_idx.count(part_idx1) == 0) { + cv::Point center; + center.x = static_cast(skele.coord1[0]); + center.y = static_cast(skele.coord1[1]); + centers[part_idx1] = center; + cv::circle(imageMat, center, 3, COCO_COLORS[part_idx1], -1, cv::LINE_AA); + seen_idx.insert(part_idx1); + } + if (seen_idx.count(part_idx2) == 0) { + cv::Point center; + center.x = static_cast(skele.coord2[0]); + center.y = static_cast(skele.coord2[1]); + centers[part_idx2] = center; + cv::circle(imageMat, center, 3, COCO_COLORS[part_idx2], -1, cv::LINE_AA); + seen_idx.insert(part_idx2); + } + } + for (int i = 0; i < COCO_PAIRS_RENDER.size(); i++) { + std::vector pair = COCO_PAIRS_RENDER[i]; + if ((seen_idx.count(pair[0]) != 0) && (seen_idx.count(pair[1]) != 0)) + cv::line(imageMat, centers[pair[0]], centers[pair[1]], COCO_COLORS[i], 2, cv::LINE_AA); + } + } + std::string fileName = imgPath.substr(imgPath.find_last_of("/") + 1); + size_t dot = fileName.find_last_of("."); + std::string resFileName = resultPath + fileName.substr(0, dot) + "_detect_result.jpg"; + cv::imwrite(resFileName, imageMat); +} + +APP_ERROR OpenPose::Process(const std::string &imgPath, const std::string &resultPath) { + ImageShape imageShape{}; + MxBase::DvppDataInfo dvppData = {}; + + APP_ERROR ret = ReadImage(imgPath, &dvppData, &imageShape); + if (ret != APP_ERR_OK) { + LogError << "ReadImage failed, ret=" << ret << "."; + return ret; + } + TensorBase resizeImage; + ret = Resize(dvppData, &resizeImage); + if (ret != APP_ERR_OK) { + LogError << "Resize failed, ret=" << ret << "."; + return ret; + } + + std::vector vision_infos; + vision_infos.push_back(imageShape.height); + vision_infos.push_back(imageShape.width); + vision_infos.push_back(MODEL_HEIGHT); + vision_infos.push_back(MODEL_WIDTH); + + std::vector inputs = {}; + std::vector outputs = {}; + + inputs.push_back(resizeImage); + + ret = Inference(inputs, &outputs); + if (ret != APP_ERR_OK) { + LogError << "Inference failed, ret=" << ret << "."; + return ret; + } + LogInfo << "Inference success, ret=" << ret << "."; + std::vector > person_list {}; + + ret = PostProcess(outputs, vision_infos, &person_list); + if (ret != APP_ERR_OK) { + LogError << "PostProcess failed, ret=" << ret << "."; + return ret; + } + // Get keypoints and person_score info. + ret = GetInferResults(imgPath, person_list, resultPath); + if (ret != APP_ERR_OK) { + LogError << "Save infer results into file failed. ret = " << ret << "."; + return ret; + } + // Visualize the postprocess results. + DrawPoseBbox(imgPath, person_list, resultPath); + return APP_ERR_OK; +} diff --git a/cv/pose/openpose/mindspore/infer/mxbase/process/src/OpenPose.h b/cv/pose/openpose/mindspore/infer/mxbase/process/src/OpenPose.h index 6c71e9a42..e78e86a65 100644 --- a/cv/pose/openpose/mindspore/infer/mxbase/process/src/OpenPose.h +++ b/cv/pose/openpose/mindspore/infer/mxbase/process/src/OpenPose.h @@ -1,66 +1,66 @@ -/* -* Copyright (c) 2021. Huawei Technologies Co., Ltd. All rights reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ - -#ifndef OPENPOSEPOST_OPENPOSE_H -#define OPENPOSEPOST_OPENPOSE_H - -#include -#include -#include -#include "MxBase/DvppWrapper/DvppWrapper.h" -#include "MxBase/ModelInfer/ModelInferenceProcessor.h" -#include "MxBase/PostProcessBases/ObjectPostProcessBase.h" -#include "MxBase/Tensor/TensorContext/TensorContext.h" -#include "PostProcess/OpenPoseMindsporePost.h" -#include "MxBase/DeviceManager/DeviceManager.h" - -struct InitParam { - uint32_t deviceId; - std::string labelPath; - uint32_t classNum; - bool checkTensor; - std::string modelPath; -}; - -struct ImageShape { - int width; - int height; -}; - -class OpenPose { - public: - APP_ERROR Init(const InitParam &initParam); - APP_ERROR DeInit(); - APP_ERROR ReadImage(const std::string &imgPath, MxBase::DvppDataInfo *output, ImageShape *imgShape); - APP_ERROR Resize(const MxBase::DvppDataInfo &input, MxBase::TensorBase *outputTensor); - APP_ERROR Inference(const std::vector &inputs, std::vector *outputs); - APP_ERROR PostProcess(const std::vector &inputs, const std::vector &vision_infos, - std::vector > *person_list); - APP_ERROR Process(const std::string &imgPath, const std::string &resultPath); - APP_ERROR GetInferResults(const std::string &imgPath, const std::vector > &person_list, - const std::string &resultPath); - void DrawPoseBbox(const std::string &imgPath, const std::vector > &person_list, - const std::string &resultPath); - - private: - std::shared_ptr dvppWrapper_; - std::shared_ptr model_; - std::shared_ptr post_; - MxBase::ModelDesc modelDesc_; - uint32_t deviceId_ = 1; -}; - -#endif // FASTERRCNNPOST_FASTERRCNN_H +/* +* Copyright (c) 2021. Huawei Technologies Co., Ltd. All rights reserved. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#ifndef OPENPOSEPOST_OPENPOSE_H +#define OPENPOSEPOST_OPENPOSE_H + +#include +#include +#include +#include "MxBase/DvppWrapper/DvppWrapper.h" +#include "MxBase/ModelInfer/ModelInferenceProcessor.h" +#include "MxBase/PostProcessBases/ObjectPostProcessBase.h" +#include "MxBase/Tensor/TensorContext/TensorContext.h" +#include "PostProcess/OpenPoseMindsporePost.h" +#include "MxBase/DeviceManager/DeviceManager.h" + +struct InitParam { + uint32_t deviceId; + std::string labelPath; + uint32_t classNum; + bool checkTensor; + std::string modelPath; +}; + +struct ImageShape { + int width; + int height; +}; + +class OpenPose { + public: + APP_ERROR Init(const InitParam &initParam); + APP_ERROR DeInit(); + APP_ERROR ReadImage(const std::string &imgPath, MxBase::DvppDataInfo *output, ImageShape *imgShape); + APP_ERROR Resize(const MxBase::DvppDataInfo &input, MxBase::TensorBase *outputTensor); + APP_ERROR Inference(const std::vector &inputs, std::vector *outputs); + APP_ERROR PostProcess(const std::vector &inputs, const std::vector &vision_infos, + std::vector > *person_list); + APP_ERROR Process(const std::string &imgPath, const std::string &resultPath); + APP_ERROR GetInferResults(const std::string &imgPath, const std::vector > &person_list, + const std::string &resultPath); + void DrawPoseBbox(const std::string &imgPath, const std::vector > &person_list, + const std::string &resultPath); + + private: + std::shared_ptr dvppWrapper_; + std::shared_ptr model_; + std::shared_ptr post_; + MxBase::ModelDesc modelDesc_; + uint32_t deviceId_ = 1; +}; + +#endif // FASTERRCNNPOST_FASTERRCNN_H diff --git a/cv/pose/openpose/mindspore/infer/mxbase/process/src/PostProcess/OpenPoseMindsporePost.cpp b/cv/pose/openpose/mindspore/infer/mxbase/process/src/PostProcess/OpenPoseMindsporePost.cpp index e5a37950c..ceec125b2 100644 --- a/cv/pose/openpose/mindspore/infer/mxbase/process/src/PostProcess/OpenPoseMindsporePost.cpp +++ b/cv/pose/openpose/mindspore/infer/mxbase/process/src/PostProcess/OpenPoseMindsporePost.cpp @@ -1,591 +1,591 @@ -/* -* Copyright (c) 2021. Huawei Technologies Co., Ltd. All rights reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ - -#include "OpenPoseMindsporePost.h" -#include -#include -#include -#include -#include "opencv2/opencv.hpp" -#include "MxBase/Log/Log.h" -#include "MxBase/Tensor/TensorBase/TensorBase.h" - -using namespace MxBase; -using namespace cv; - -namespace { - auto uint8Deleter = [](uint8_t *p) {}; - const int K_NUM_BODY_PARTS = 18; - const int K_UPSAMPLED_STRIDE = 8; - - // CocoSkeletonsNetwork - const std::vector K_POSE_MAP_INDEX { - 12, 13, 20, 21, 14, 15, 16, 17, 22, 23, - 24, 25, 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 28, 29, 30, 31, 34, 35, - 32, 33, 36, 37, 18, 19, 26, 27 - }; - - // CocoSkeletons - const std::vector K_POSE_BODY_PART_SKELETONS { - 1, 2, 1, 5, 2, 3, 3, 4, 5, 6, - 6, 7, 1, 8, 8, 9, 9, 10, 1, 11, - 11, 12, 12, 13, 1, 0, 0, 14, 14, 16, - 0, 15, 15, 17, 2, 16, 5, 17 - }; - // Nms score threshold - const float K_NMS_THRESHOLD = 0.05; - // Range of nearest neighbors - const int K_NEAREST_KEYPOINTS_THRESHOLD = 6; - // PAF score threshold as a valid inner point on a skeleton - const float K_LOCAL_PAF_SCORE_THRESHOLD = 0.05; - // The minimum number of valid inner points a skeleton includes to be regarded as a correct skeleton - const int K_LOCAL_PAF_COUNT_THRESHOLD = 8; - // The minimum number of skeletons needed to form a person - const int K_PERSON_SKELETON_COUNT_THRESHOLD = 3; - // The lowest average score per keypoint in a person - const float K_PERSON_KEYPOINT_AVG_SCORE_THRESHOLD = 0.2; -} // namespace - -namespace MxBase { - OpenPoseMindsporePost &OpenPoseMindsporePost::operator=(const OpenPoseMindsporePost &other) { - if (this == &other) { - return *this; - } - ObjectPostProcessBase::operator=(other); - return *this; - } - - /** - * @brief Parsing TensorBase data to keypoint heatmap and PAF heatmap of openpose model - * @param tensors - TensorBase vector - * @return Two-element vector, keeping keypoint heatmap and paf heatmap respectively - */ - static std::vector > ReadDataFromTensor(const std::vector &tensors) { - auto shape = tensors[11].GetShape(); - int channel_keypoint = shape[1]; - int height_index = 2, width_index = 3; - int height = shape[height_index]; - int width = shape[width_index]; - auto shape_p = tensors[5].GetShape(); - int channel_paf = shape_p[1]; - // Read keypoint data - auto dataPtr = reinterpret_cast(tensors[11].GetBuffer()); - std::shared_ptr keypoint_pointer; - keypoint_pointer.reset(dataPtr, uint8Deleter); - - std::vector keypoint_heatmap {}; - int idx = 0; - for (int i = 0; i < channel_keypoint; i++) { - cv::Mat single_channel_mat(height, width, CV_32FC1, cv::Scalar(0)); - for (int j = 0; j < height; j++) { - float *ptr = single_channel_mat.ptr(j); - for (int k = 0; k < width; k++) { - ptr[k] = static_cast(keypoint_pointer.get())[idx]; - idx += 1; - } - } - keypoint_heatmap.push_back(single_channel_mat); - } - // Read PAF data - auto data_paf_ptr = reinterpret_cast(tensors[5].GetBuffer()); - std::shared_ptr paf_pointer; - paf_pointer.reset(data_paf_ptr, uint8Deleter); - std::vector paf_heatmap {}; - idx = 0; - for (int i = 0; i < channel_paf; i++) { - cv::Mat single_channel_mat(height, width, CV_32FC1, cv::Scalar(0)); - for (int j = 0; j < height; j++) { - float *ptr = single_channel_mat.ptr(j); - for (int k = 0; k < width; k++) { - ptr[k] = static_cast(paf_pointer.get())[idx]; - idx += 1; - } - } - paf_heatmap.push_back(single_channel_mat); - } - std::vector > result = {keypoint_heatmap, paf_heatmap}; - return result; - } - - /** - * @brief Comparison between two PartPair elements - * @param p1 - PartPair p1 - * @param p2 - PartPair p2 - * 
@return True if the score of p1 is greater than that of p2 - */ - static bool GreaterSort(PartPair p1, PartPair p2) { - return p1.score > p2.score; - } - - /** - * @brief Comparison between two cv::Point elements - * @param p1 - cv::Point p1 - * @param p2 - cv::Point p2 - * @return True if the x coordinate of p2 is greater than that of p1 - */ - static bool PointSort(cv::Point p1, cv::Point p2) { - return p1.x < p2.x; - } - - /** - * @brief Resize output heatmaps to the size of the origin image - * @param keypoint_heatmap - Keypoint heatmap, each channel of the heatmap is stored as a Mat - * @param paf_heatmap - PAF heatmap, each channel of the heatmap is stored as a Mat - * @param vision_infos - Vision infos of origin image and aligned image - * @return APP_ERROR - */ - void OpenPoseMindsporePost::ResizeHeatmaps(const std::vector &vision_infos, - std::vector *keypoint_heatmap, std::vector *paf_heatmap) { - // Calculate padding direction and padding value - int origin_height = vision_infos[0]; - int origin_width = vision_infos[1]; - int inputHeight_ = vision_infos[2]; - int inputWidth_ = vision_infos[3]; - // padding along height - int padding_direction = 0; - if (origin_height > origin_width) { - // padding along width - padding_direction = 1; - } - int padding_value = 0; - if (padding_direction == 0) { - // pad height - padding_value = floor(inputHeight_ - inputWidth_ * origin_height / origin_width); - } else { - // pad width - padding_value = floor(inputWidth_ - inputHeight_ * origin_width / origin_height); - } - - // Channel Split Resize - for (int i = 0; i < keypoint_heatmap[0].size(); i++) { - cv::Mat single_channel_mat = keypoint_heatmap[0][i]; - cv::resize(single_channel_mat, single_channel_mat, Size(0, 0), - K_UPSAMPLED_STRIDE, K_UPSAMPLED_STRIDE, INTER_CUBIC); - if (padding_direction == 0) { - // remove height padding - single_channel_mat = - single_channel_mat(cv::Rect(0, 0, - single_channel_mat.cols, single_channel_mat.rows - padding_value)); - } else { - // remove width padding - single_channel_mat = - single_channel_mat(cv::Rect(0, 0, - single_channel_mat.cols - padding_value, single_channel_mat.rows)); - } - cv::resize(single_channel_mat, single_channel_mat, Size(origin_width, origin_height), 0, 0); - keypoint_heatmap[0][i] = single_channel_mat; - } - for (int i = 0; i < paf_heatmap[0].size(); i++) { - cv::Mat single_channel_mat = paf_heatmap[0][i]; - cv::resize(single_channel_mat, single_channel_mat, Size(0, 0), - K_UPSAMPLED_STRIDE, K_UPSAMPLED_STRIDE); - if (padding_direction == 0) { - single_channel_mat = - single_channel_mat(cv::Rect(0, 0, - single_channel_mat.cols, single_channel_mat.rows - padding_value)); - } else { - single_channel_mat = - single_channel_mat(cv::Rect(0, 0, - single_channel_mat.cols - padding_value, single_channel_mat.rows)); - } - cv::resize(single_channel_mat, single_channel_mat, Size(origin_width, origin_height), 0, 0); - paf_heatmap[0][i] = single_channel_mat; - } - } - - /** - * @brief Non-Maximum Suppression, keep points that is greater than all its four surround points, - * i.e. 
up, bottom, left and right points - * @param plain - 2D data for NMS - * @param threshold - NMS threshold - */ - static void NMS(cv::Mat *plain, float threshold) { - cv::GaussianBlur(*plain, *plain, cv::Size(17, 17), 2.5, 2.5); - // Keep points with score below the NMS score threshold are set to 0 - plain->setTo(0, *plain < threshold); - // Find points that is greater than all its four surround points - cv::Mat plain_with_border; - int border_padding = 2; - int bottom_right_index = 2; - cv::copyMakeBorder(*plain, plain_with_border, border_padding, border_padding, border_padding, border_padding, - BORDER_CONSTANT, cv::Scalar(0)); - cv::Mat plain_with_border_clone = plain_with_border.clone(); - int sub_mat_cols = plain_with_border.cols - border_padding; - int sub_mat_rows = plain_with_border.rows - border_padding; - cv::Mat plain_center = plain_with_border(cv::Rect(1, 1, sub_mat_cols, sub_mat_rows)); - cv::Mat plain_bottom = plain_with_border(cv::Rect(1, bottom_right_index, sub_mat_cols, sub_mat_rows)); - cv::Mat plain_up = plain_with_border(cv::Rect(1, 0, sub_mat_cols, sub_mat_rows)); - cv::Mat plain_left = plain_with_border(cv::Rect(0, 1, sub_mat_cols, sub_mat_rows)); - cv::Mat plain_right = plain_with_border(cv::Rect(bottom_right_index, 1, sub_mat_cols, sub_mat_rows)); - - for (int i = 0; i < plain_center.rows; i++) { - float *center_ptr = plain_center.ptr(i); - float *bottom_ptr = plain_bottom.ptr(i); - float *up_ptr = plain_up.ptr(i); - float *left_ptr = plain_left.ptr(i); - float *right_ptr = plain_right.ptr(i); - float *clone_border_ptr = plain_with_border_clone.ptr(i + 1); - for (int j = 0; j < plain_center.cols; j++) { - if (!((center_ptr[j] > up_ptr[j]) && (center_ptr[j] > bottom_ptr[j]) && - (center_ptr[j] > left_ptr[j]) && (center_ptr[j] > right_ptr[j]))) { - clone_border_ptr[j + 1] = 0; - } - } - } - *plain = plain_with_border_clone(cv::Rect(border_padding, border_padding, - plain_center.cols - border_padding, plain_center.rows - border_padding)).clone(); - } - - /** - * @brief Extract candidate keypoints - * @param keypoint_heatmap - Resized keypoint heatmap - * @param coor - Keep extracted result, store a point in a cv::Point object, - * store keypoints of different channel in different vectors - * @param coor_score - Scores corresponding to extracted keypoints - * @return APP_ERROR - */ - void OpenPoseMindsporePost::ExtractKeypoints(const std::vector &keypoint_heatmap, - std::vector > *coor, std::vector > *coor_score) { - int polynomial_exponent = 2; - int peak_counter = 0; - for (int i = 0; i < keypoint_heatmap.size() - 1; i++) { - // NMS - cv::Mat smoothProbMap; - cv::GaussianBlur(keypoint_heatmap[i], smoothProbMap, cv::Size(17, 17), 2.5, 2.5); - - // - NMS(&smoothProbMap, K_NMS_THRESHOLD); - std::vector non_zero_coordinates; - // - cv::findNonZero(smoothProbMap, non_zero_coordinates); - std::sort(non_zero_coordinates.begin(), non_zero_coordinates.end(), PointSort); - std::vector suppressed(non_zero_coordinates.size(), 0); - std::vector keypoints_without_nearest {}; - std::vector keypoints_score {}; - // Remove other keypoints within a certain range around one keypoints - for (int j = 0; j < non_zero_coordinates.size(); j++) { - if (suppressed[j]) { - continue; - } - int thrown_index = j + 1; - auto it = std::find_if(std::begin(non_zero_coordinates) + j + 1, std::end(non_zero_coordinates), - [non_zero_coordinates, j, polynomial_exponent](cv::Point p) { - float distance = powf((non_zero_coordinates[j].x - p.x), polynomial_exponent) + - powf((non_zero_coordinates[j].y - p.y), 
polynomial_exponent); - return sqrtf(distance) < K_NEAREST_KEYPOINTS_THRESHOLD; - }); - while (it != std::end(non_zero_coordinates)) { - thrown_index = std::distance(std::begin(non_zero_coordinates) + thrown_index, it) + thrown_index; - suppressed[thrown_index] = 1; - it = std::find_if(std::next(it), std::end(non_zero_coordinates), - [non_zero_coordinates, j, polynomial_exponent](cv::Point p) { - float distance = powf((non_zero_coordinates[j].x - p.x), polynomial_exponent) + - powf((non_zero_coordinates[j].y - p.y), polynomial_exponent); - return sqrtf(distance) < K_NEAREST_KEYPOINTS_THRESHOLD; - }); - } - keypoints_without_nearest.push_back(non_zero_coordinates[j]); - // - keypoints_score.push_back(smoothProbMap.at - (non_zero_coordinates[j].y, non_zero_coordinates[j].x)); - } - coor->push_back(keypoints_without_nearest); - coor_score->push_back(keypoints_score); - } - } - - /** - * @brief Compute expected confidence for each candidate skeleton - * @param endpoints - Coordinates of the two end points of a skeleton - * @param paf_x - PAF heatmap of x coordinate - * @param paf_y - PAF heatmap of y coordinate - * @return result - Keep confidence information of this skeleton in the form: - * [confidence score, number of successfully hit sub points] - */ - std::vector OpenPoseMindsporePost::OneSkeletonScore(const cv::Mat &paf_x, const cv::Mat &paf_y, - const std::vector &endpoints) { - int x1 = endpoints[0].x, y1 = endpoints[0].y; - int x2 = endpoints[1].x, y2 = endpoints[1].y; - // affinity score of this skeleton - float score = 0; - // count: number of valid inner points on this skeleton - int count = 0, num_inter = 10; - float dx = x2 - x1; - float dy = y2 - y1; - float norm_vec = sqrt(dx * dx + dy * dy); - float vx = dx / (norm_vec + 1e-6); - float vy = dy / (norm_vec + 1e-6); - // generate 10 points equally spaced on this skeleton - std::vector xs {}; - float step_x = dx / (num_inter - 1); - for (int k = 0; k < num_inter; k++) { - float temp_x = x1 + k * step_x; - xs.push_back(round(temp_x)); - } - std::vector ys {}; - float step_y = dy / (num_inter - 1); - for (int k = 0; k < num_inter; k++) { - float temp_y = y1 + k * step_y; - ys.push_back(round(temp_y)); - } - std::vector sub_score_vec; - for (int i = 0; i < xs.size(); i++) { - // calculate PAF value of each inner point - float sub_score = paf_x.at(ys[i], xs[i]) * vx + paf_y.at(ys[i], xs[i]) * vy; - sub_score_vec.push_back(sub_score); - } - // remove inner points such that has PAF value < K_LOCAL_PAF_SCORE_THRESHOLD - sub_score_vec.erase(std::remove_if( - sub_score_vec.begin(), sub_score_vec.end(), - [](const float &x) { - return x <= K_LOCAL_PAF_SCORE_THRESHOLD; - }), sub_score_vec.end()); - std::vector result {0.0, 0.0}; - score = std::accumulate(sub_score_vec.begin(), sub_score_vec.end(), 0.0); - count = sub_score_vec.size(); - result[0] = score / (count + 1e-6); - result[1] = count; - return result; - } - - /** - * @brief Remove conflict skeletons - * @param src - Source vector that stores skeletons to be processed - * @param dst - Target vector that collects candidate skeletons - * @return APP_ERROR - */ - void OpenPoseMindsporePost::ConntectionNms(std::vector *src, std::vector *dst) { - // Remove conflict skeletons, if two skeletons of the same type share a same end point, they are conflict - std::vector used_idx1 {}; - std::vector used_idx2 {}; - // Sort skeletons in ascending order of affinity score - std::sort(src[0].begin(), src[0].end(), GreaterSort); - for (int i = 0; i < src[0].size(); i++) { - PartPair candidate = 
src[0][i]; - if (std::find(used_idx1.begin(), used_idx1.end(), candidate.idx1) != used_idx1.end() - || std::find(used_idx2.begin(), used_idx2.end(), candidate.idx2) != used_idx2.end()) { - continue; - } - dst->push_back(candidate); - used_idx1.push_back(candidate.idx1); - used_idx2.push_back(candidate.idx2); - } - } - - /** - * @brief Calculate expected confidence of each possible skeleton and choose candidates - * @param part_idx - Index of skeleton in K_POSE_BODY_PART_SKELETONS - * @param coor - Candidate positions of endpoints - * @param coor_score - Corresponding score of coor - * @param paf_heatmap - PAF heatmap - * @param connections - Target vector that collects candidate skeletons - * @return APP_ERROR - */ - void OpenPoseMindsporePost::ScoreSkeletons(const int part_idx, - const std::vector > &coor, const std::vector > &coor_score, - const std::vector &paf_heatmap, std::vector *connections) { - // Use point1 and point2 to represent the two endpoints of a skeleton - int coco_skeleton_idx1 = K_POSE_BODY_PART_SKELETONS[2 * part_idx]; - int coco_skeleton_idx2 = K_POSE_BODY_PART_SKELETONS[2 * part_idx + 1]; - int index_stride = 2; - int end_point_num = 2; - int paf_x_idx = K_POSE_MAP_INDEX[index_stride * part_idx]; - int paf_y_idx = K_POSE_MAP_INDEX[index_stride * part_idx + 1]; - std::vector endpoints(end_point_num, cv::Point(0, 0)); - std::vector connection_temp {}; - std::vector result {0.0, 0.0}; - // Calculate the affinity score of each skeleton composed of all candidate point1 and point2 - for (int i = 0; i < coor[coco_skeleton_idx1].size(); i++) { - cv::Point point1; - point1.x = coor[coco_skeleton_idx1][i].x; - point1.y = coor[coco_skeleton_idx1][i].y; - endpoints[0] = point1; - for (int j = 0; j < coor[coco_skeleton_idx2].size(); j++) { - cv::Point point2; - point2.x = coor[coco_skeleton_idx2][j].x; - point2.y = coor[coco_skeleton_idx2][j].y; - endpoints[1] = point2; - result = OneSkeletonScore(paf_heatmap[paf_x_idx], paf_heatmap[paf_y_idx], endpoints); - // Keep skeletons with affinity scores greater than 0 and - // valid internal points greater than K_LOCAL_PAF_COUNT_THRESHOLD - if (result[1] <= K_LOCAL_PAF_COUNT_THRESHOLD || result[0] <= 0.0) { - continue; - } - // Store the information of a skeleton in a custom structure PartPair - PartPair skeleton; - skeleton.score = result[0]; - skeleton.partIdx1 = coco_skeleton_idx1; - skeleton.partIdx2 = coco_skeleton_idx2; - skeleton.idx1 = i; - skeleton.idx2 = j; - skeleton.coord1.push_back(point1.x); - skeleton.coord1.push_back(point1.y); - skeleton.coord2.push_back(point2.x); - skeleton.coord2.push_back(point2.y); - skeleton.score1 = coor_score[coco_skeleton_idx1][i]; - skeleton.score2 = coor_score[coco_skeleton_idx2][j]; - connection_temp.push_back(skeleton); - } - } - // For skeletons with the same endpoints, keep the one with larger affinity score - ConntectionNms(&connection_temp, connections); - } - - /** - * @brief Merge a skeleton to an existed person - * @param person_list - Currently existed person list - * @param current_pair - Skeleton to be merged - * @return True if merged successfully, otherwise false - */ - bool OpenPoseMindsporePost::MergeSkeletonToPerson(std::vector > *person_list, - PartPair current_pair) { - // Use point1 and point2 to represent the two endpoints of a skeleton - for (int k = 0; k < person_list[0].size(); k++) { - std::vector ¤t_person = person_list[0][k]; - for (int i = 0; i < current_person.size(); i++) { - if (current_pair.partIdx1 == current_person[i].partIdx1 && - current_pair.idx1 == 
current_person[i].idx1) { - // point1 of current skeleton is the same as point1 of a skeleton in current person - current_person.push_back(current_pair); - return true; - } else if (current_pair.partIdx1 == current_person[i].partIdx2 && - current_pair.idx1 == current_person[i].idx2) { - // point1 of current skeleton is the same as point2 of a skeleton in current person - current_person.push_back(current_pair); - return true; - } else if (current_pair.partIdx2 == current_person[i].partIdx1 && - current_pair.idx2 == current_person[i].idx1) { - // point2 of current skeleton is the same as point1 of a skeleton in current person - current_person.push_back(current_pair); - return true; - } else if (current_pair.partIdx2 == current_person[i].partIdx2 && - current_pair.idx2 == current_person[i].idx2) { - // point2 of current skeleton is the same as point2 of a skeleton in current person - current_person.push_back(current_pair); - return true; - } - } - } - // Can not merge to any existed person, create new person - std::vector new_person {}; - new_person.push_back(current_pair); - person_list->push_back(new_person); - return true; - } - - /** - * @brief Group keypoints to skeletons and assemble them to person - * @param paf_heatmap - PAF heatmap - * @param coor - Coordinates of all the candidate keypoints - * @param coor_score - Corresponding score of coordinates - * @param person_list - Target vector to store person, each person is stored as a vector of skeletons - * @return APP_ERROR - */ - void OpenPoseMindsporePost::GroupKeypoints(const std::vector &paf_heatmap, - const std::vector > &coor, const std::vector > &coor_score, - std::vector > *person_list) { - for (int i = 0; i < K_NUM_BODY_PARTS + 1; i++) { - // Choose candidate skeletons for each category, there are a total of - // kNumBodyPart + 1 categories of skeletons - std::vector part_connections {}; - ScoreSkeletons(i, coor, coor_score, paf_heatmap, &part_connections); - // Merge newly generated skeletons to existed person or create new person - if (i == 0) { - // For the first category, each different skeleton of this category stands for different person - for (int j = 0; j < part_connections.size(); j++) { - std::vector new_person {}; - new_person.push_back(part_connections[j]); - person_list->push_back(new_person); - } - } else if (i == K_NUM_BODY_PARTS - 1 || i == K_NUM_BODY_PARTS) { - // The last two skeletons do not contribute to person score - for (int j = 0; j < part_connections.size(); j++) { - part_connections[j].score = 0; - part_connections[j].score1 = 0; - part_connections[j].score2 = 0; - bool can_merge = MergeSkeletonToPerson(person_list, part_connections[j]); - } - } else { - for (int j = 0; j < part_connections.size(); j++) { - MergeSkeletonToPerson(person_list, part_connections[j]); - } - } - } - } - - /** - * @brief Calculate score of a person according to its skeletons - * @param person - Target person - * @return Score value - */ - float OpenPoseMindsporePost::PersonScore(const std::vector &person) { - // The score of a person is composed of the scores of all his keypoints and that of all his skeletons - std::vector seen_keypoints = {}; - float person_score = 0.0; - for (int i = 0; i < person.size(); i++) { - PartPair skeleton = person[i]; - if (std::find(seen_keypoints.begin(), seen_keypoints.end(), skeleton.partIdx1) == seen_keypoints.end()) { - seen_keypoints.push_back(skeleton.partIdx1); - person_score += skeleton.score1; - } - if (std::find(seen_keypoints.begin(), seen_keypoints.end(), skeleton.partIdx2) == 
seen_keypoints.end()) { - seen_keypoints.push_back(skeleton.partIdx2); - person_score += skeleton.score2; - } - person_score += skeleton.score; - } - // Ignore person whose number of skeletons is less than K_PERSON_SKELETON_COUNT_THRESHOLD or - // the average score of each keypoint is less than K_PERSON_KEYPOINT_AVG_SCORE_THRESHOLD - if (seen_keypoints.size() < K_PERSON_SKELETON_COUNT_THRESHOLD || - (person_score / seen_keypoints.size()) < K_PERSON_KEYPOINT_AVG_SCORE_THRESHOLD) { - return 0.0; - } - return person_score; - } - - void OpenPoseMindsporePost::GeneratePersonList(const std::vector &tensors, - const std::vector &vision_infos, std::vector > *person_list) { - std::vector > result = ReadDataFromTensor(tensors); - std::vector keypoint_heatmap, paf_heatmap; - keypoint_heatmap = result[0]; - paf_heatmap = result[1]; - // Resize heatmaps to the size of the input image - ResizeHeatmaps(vision_infos, &keypoint_heatmap, &paf_heatmap); - // Extract candidate keypoints - std::vector > coor {}; - std::vector > coor_score {}; - ExtractKeypoints(keypoint_heatmap, &coor, &coor_score); - // Group candidate keypoints to candidate skeletons and generate person - GroupKeypoints(paf_heatmap, coor, coor_score, person_list); - } - - APP_ERROR OpenPoseMindsporePost::selfProcess(const std::vector &tensors, - const std::vector &vision_infos, std::vector > *person_list) { - auto inputs = tensors; - APP_ERROR ret = CheckAndMoveTensors(inputs); - if (ret != APP_ERR_OK) { - LogError << "CheckAndMoveTensors failed, ret=" << ret; - return ret; - } - GeneratePersonList(tensors, vision_infos, person_list); - LogInfo << "Postprocess success."; - return APP_ERR_OK; - } -} // namespace MxBase - - - +/* +* Copyright (c) 2021. Huawei Technologies Co., Ltd. All rights reserved. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include "OpenPoseMindsporePost.h" +#include +#include +#include +#include +#include "opencv2/opencv.hpp" +#include "MxBase/Log/Log.h" +#include "MxBase/Tensor/TensorBase/TensorBase.h" + +using namespace MxBase; +using namespace cv; + +namespace { + auto uint8Deleter = [](uint8_t *p) {}; + const int K_NUM_BODY_PARTS = 18; + const int K_UPSAMPLED_STRIDE = 8; + + // CocoSkeletonsNetwork + const std::vector K_POSE_MAP_INDEX { + 12, 13, 20, 21, 14, 15, 16, 17, 22, 23, + 24, 25, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 28, 29, 30, 31, 34, 35, + 32, 33, 36, 37, 18, 19, 26, 27 + }; + + // CocoSkeletons + const std::vector K_POSE_BODY_PART_SKELETONS { + 1, 2, 1, 5, 2, 3, 3, 4, 5, 6, + 6, 7, 1, 8, 8, 9, 9, 10, 1, 11, + 11, 12, 12, 13, 1, 0, 0, 14, 14, 16, + 0, 15, 15, 17, 2, 16, 5, 17 + }; + // Nms score threshold + const float K_NMS_THRESHOLD = 0.05; + // Range of nearest neighbors + const int K_NEAREST_KEYPOINTS_THRESHOLD = 6; + // PAF score threshold as a valid inner point on a skeleton + const float K_LOCAL_PAF_SCORE_THRESHOLD = 0.05; + // The minimum number of valid inner points a skeleton includes to be regarded as a correct skeleton + const int K_LOCAL_PAF_COUNT_THRESHOLD = 8; + // The minimum number of skeletons needed to form a person + const int K_PERSON_SKELETON_COUNT_THRESHOLD = 3; + // The lowest average score per keypoint in a person + const float K_PERSON_KEYPOINT_AVG_SCORE_THRESHOLD = 0.2; +} // namespace + +namespace MxBase { + OpenPoseMindsporePost &OpenPoseMindsporePost::operator=(const OpenPoseMindsporePost &other) { + if (this == &other) { + return *this; + } + ObjectPostProcessBase::operator=(other); + return *this; + } + + /** + * @brief Parsing TensorBase data to keypoint heatmap and PAF heatmap of openpose model + * @param tensors - TensorBase vector + * @return Two-element vector, keeping keypoint heatmap and paf heatmap respectively + */ + static std::vector > ReadDataFromTensor(const std::vector &tensors) { + auto shape = tensors[11].GetShape(); + int channel_keypoint = shape[1]; + int height_index = 2, width_index = 3; + int height = shape[height_index]; + int width = shape[width_index]; + auto shape_p = tensors[5].GetShape(); + int channel_paf = shape_p[1]; + // Read keypoint data + auto dataPtr = reinterpret_cast(tensors[11].GetBuffer()); + std::shared_ptr keypoint_pointer; + keypoint_pointer.reset(dataPtr, uint8Deleter); + + std::vector keypoint_heatmap {}; + int idx = 0; + for (int i = 0; i < channel_keypoint; i++) { + cv::Mat single_channel_mat(height, width, CV_32FC1, cv::Scalar(0)); + for (int j = 0; j < height; j++) { + float *ptr = single_channel_mat.ptr(j); + for (int k = 0; k < width; k++) { + ptr[k] = static_cast(keypoint_pointer.get())[idx]; + idx += 1; + } + } + keypoint_heatmap.push_back(single_channel_mat); + } + // Read PAF data + auto data_paf_ptr = reinterpret_cast(tensors[5].GetBuffer()); + std::shared_ptr paf_pointer; + paf_pointer.reset(data_paf_ptr, uint8Deleter); + std::vector paf_heatmap {}; + idx = 0; + for (int i = 0; i < channel_paf; i++) { + cv::Mat single_channel_mat(height, width, CV_32FC1, cv::Scalar(0)); + for (int j = 0; j < height; j++) { + float *ptr = single_channel_mat.ptr(j); + for (int k = 0; k < width; k++) { + ptr[k] = static_cast(paf_pointer.get())[idx]; + idx += 1; + } + } + paf_heatmap.push_back(single_channel_mat); + } + std::vector > result = {keypoint_heatmap, paf_heatmap}; + return result; + } + + /** + * @brief Comparison between two PartPair elements + * @param p1 - PartPair p1 + * @param p2 - PartPair p2 + * 
@return True if the score of p1 is greater than that of p2 + */ + static bool GreaterSort(PartPair p1, PartPair p2) { + return p1.score > p2.score; + } + + /** + * @brief Comparison between two cv::Point elements + * @param p1 - cv::Point p1 + * @param p2 - cv::Point p2 + * @return True if the x coordinate of p2 is greater than that of p1 + */ + static bool PointSort(cv::Point p1, cv::Point p2) { + return p1.x < p2.x; + } + + /** + * @brief Resize output heatmaps to the size of the origin image + * @param keypoint_heatmap - Keypoint heatmap, each channel of the heatmap is stored as a Mat + * @param paf_heatmap - PAF heatmap, each channel of the heatmap is stored as a Mat + * @param vision_infos - Vision infos of origin image and aligned image + * @return APP_ERROR + */ + void OpenPoseMindsporePost::ResizeHeatmaps(const std::vector &vision_infos, + std::vector *keypoint_heatmap, std::vector *paf_heatmap) { + // Calculate padding direction and padding value + int origin_height = vision_infos[0]; + int origin_width = vision_infos[1]; + int inputHeight_ = vision_infos[2]; + int inputWidth_ = vision_infos[3]; + // padding along height + int padding_direction = 0; + if (origin_height > origin_width) { + // padding along width + padding_direction = 1; + } + int padding_value = 0; + if (padding_direction == 0) { + // pad height + padding_value = floor(inputHeight_ - inputWidth_ * origin_height / origin_width); + } else { + // pad width + padding_value = floor(inputWidth_ - inputHeight_ * origin_width / origin_height); + } + + // Channel Split Resize + for (int i = 0; i < keypoint_heatmap[0].size(); i++) { + cv::Mat single_channel_mat = keypoint_heatmap[0][i]; + cv::resize(single_channel_mat, single_channel_mat, Size(0, 0), + K_UPSAMPLED_STRIDE, K_UPSAMPLED_STRIDE, INTER_CUBIC); + if (padding_direction == 0) { + // remove height padding + single_channel_mat = + single_channel_mat(cv::Rect(0, 0, + single_channel_mat.cols, single_channel_mat.rows - padding_value)); + } else { + // remove width padding + single_channel_mat = + single_channel_mat(cv::Rect(0, 0, + single_channel_mat.cols - padding_value, single_channel_mat.rows)); + } + cv::resize(single_channel_mat, single_channel_mat, Size(origin_width, origin_height), 0, 0); + keypoint_heatmap[0][i] = single_channel_mat; + } + for (int i = 0; i < paf_heatmap[0].size(); i++) { + cv::Mat single_channel_mat = paf_heatmap[0][i]; + cv::resize(single_channel_mat, single_channel_mat, Size(0, 0), + K_UPSAMPLED_STRIDE, K_UPSAMPLED_STRIDE); + if (padding_direction == 0) { + single_channel_mat = + single_channel_mat(cv::Rect(0, 0, + single_channel_mat.cols, single_channel_mat.rows - padding_value)); + } else { + single_channel_mat = + single_channel_mat(cv::Rect(0, 0, + single_channel_mat.cols - padding_value, single_channel_mat.rows)); + } + cv::resize(single_channel_mat, single_channel_mat, Size(origin_width, origin_height), 0, 0); + paf_heatmap[0][i] = single_channel_mat; + } + } + + /** + * @brief Non-Maximum Suppression, keep points that is greater than all its four surround points, + * i.e. 
up, bottom, left and right points + * @param plain - 2D data for NMS + * @param threshold - NMS threshold + */ + static void NMS(cv::Mat *plain, float threshold) { + cv::GaussianBlur(*plain, *plain, cv::Size(17, 17), 2.5, 2.5); + // Keep points with score below the NMS score threshold are set to 0 + plain->setTo(0, *plain < threshold); + // Find points that is greater than all its four surround points + cv::Mat plain_with_border; + int border_padding = 2; + int bottom_right_index = 2; + cv::copyMakeBorder(*plain, plain_with_border, border_padding, border_padding, border_padding, border_padding, + BORDER_CONSTANT, cv::Scalar(0)); + cv::Mat plain_with_border_clone = plain_with_border.clone(); + int sub_mat_cols = plain_with_border.cols - border_padding; + int sub_mat_rows = plain_with_border.rows - border_padding; + cv::Mat plain_center = plain_with_border(cv::Rect(1, 1, sub_mat_cols, sub_mat_rows)); + cv::Mat plain_bottom = plain_with_border(cv::Rect(1, bottom_right_index, sub_mat_cols, sub_mat_rows)); + cv::Mat plain_up = plain_with_border(cv::Rect(1, 0, sub_mat_cols, sub_mat_rows)); + cv::Mat plain_left = plain_with_border(cv::Rect(0, 1, sub_mat_cols, sub_mat_rows)); + cv::Mat plain_right = plain_with_border(cv::Rect(bottom_right_index, 1, sub_mat_cols, sub_mat_rows)); + + for (int i = 0; i < plain_center.rows; i++) { + float *center_ptr = plain_center.ptr(i); + float *bottom_ptr = plain_bottom.ptr(i); + float *up_ptr = plain_up.ptr(i); + float *left_ptr = plain_left.ptr(i); + float *right_ptr = plain_right.ptr(i); + float *clone_border_ptr = plain_with_border_clone.ptr(i + 1); + for (int j = 0; j < plain_center.cols; j++) { + if (!((center_ptr[j] > up_ptr[j]) && (center_ptr[j] > bottom_ptr[j]) && + (center_ptr[j] > left_ptr[j]) && (center_ptr[j] > right_ptr[j]))) { + clone_border_ptr[j + 1] = 0; + } + } + } + *plain = plain_with_border_clone(cv::Rect(border_padding, border_padding, + plain_center.cols - border_padding, plain_center.rows - border_padding)).clone(); + } + + /** + * @brief Extract candidate keypoints + * @param keypoint_heatmap - Resized keypoint heatmap + * @param coor - Keep extracted result, store a point in a cv::Point object, + * store keypoints of different channel in different vectors + * @param coor_score - Scores corresponding to extracted keypoints + * @return APP_ERROR + */ + void OpenPoseMindsporePost::ExtractKeypoints(const std::vector &keypoint_heatmap, + std::vector > *coor, std::vector > *coor_score) { + int polynomial_exponent = 2; + int peak_counter = 0; + for (int i = 0; i < keypoint_heatmap.size() - 1; i++) { + // NMS + cv::Mat smoothProbMap; + cv::GaussianBlur(keypoint_heatmap[i], smoothProbMap, cv::Size(17, 17), 2.5, 2.5); + + // + NMS(&smoothProbMap, K_NMS_THRESHOLD); + std::vector non_zero_coordinates; + // + cv::findNonZero(smoothProbMap, non_zero_coordinates); + std::sort(non_zero_coordinates.begin(), non_zero_coordinates.end(), PointSort); + std::vector suppressed(non_zero_coordinates.size(), 0); + std::vector keypoints_without_nearest {}; + std::vector keypoints_score {}; + // Remove other keypoints within a certain range around one keypoints + for (int j = 0; j < non_zero_coordinates.size(); j++) { + if (suppressed[j]) { + continue; + } + int thrown_index = j + 1; + auto it = std::find_if(std::begin(non_zero_coordinates) + j + 1, std::end(non_zero_coordinates), + [non_zero_coordinates, j, polynomial_exponent](cv::Point p) { + float distance = powf((non_zero_coordinates[j].x - p.x), polynomial_exponent) + + powf((non_zero_coordinates[j].y - p.y), 
polynomial_exponent); + return sqrtf(distance) < K_NEAREST_KEYPOINTS_THRESHOLD; + }); + while (it != std::end(non_zero_coordinates)) { + thrown_index = std::distance(std::begin(non_zero_coordinates) + thrown_index, it) + thrown_index; + suppressed[thrown_index] = 1; + it = std::find_if(std::next(it), std::end(non_zero_coordinates), + [non_zero_coordinates, j, polynomial_exponent](cv::Point p) { + float distance = powf((non_zero_coordinates[j].x - p.x), polynomial_exponent) + + powf((non_zero_coordinates[j].y - p.y), polynomial_exponent); + return sqrtf(distance) < K_NEAREST_KEYPOINTS_THRESHOLD; + }); + } + keypoints_without_nearest.push_back(non_zero_coordinates[j]); + // + keypoints_score.push_back(smoothProbMap.at + (non_zero_coordinates[j].y, non_zero_coordinates[j].x)); + } + coor->push_back(keypoints_without_nearest); + coor_score->push_back(keypoints_score); + } + } + + /** + * @brief Compute expected confidence for each candidate skeleton + * @param endpoints - Coordinates of the two end points of a skeleton + * @param paf_x - PAF heatmap of x coordinate + * @param paf_y - PAF heatmap of y coordinate + * @return result - Confidence information of this skeleton in the form: + * [confidence score, number of valid inner points] + */ + std::vector OpenPoseMindsporePost::OneSkeletonScore(const cv::Mat &paf_x, const cv::Mat &paf_y, + const std::vector &endpoints) { + int x1 = endpoints[0].x, y1 = endpoints[0].y; + int x2 = endpoints[1].x, y2 = endpoints[1].y; + // affinity score of this skeleton + float score = 0; + // count: number of valid inner points on this skeleton + int count = 0, num_inter = 10; + float dx = x2 - x1; + float dy = y2 - y1; + float norm_vec = sqrt(dx * dx + dy * dy); + float vx = dx / (norm_vec + 1e-6); + float vy = dy / (norm_vec + 1e-6); + // generate 10 points equally spaced on this skeleton + std::vector xs {}; + float step_x = dx / (num_inter - 1); + for (int k = 0; k < num_inter; k++) { + float temp_x = x1 + k * step_x; + xs.push_back(round(temp_x)); + } + std::vector ys {}; + float step_y = dy / (num_inter - 1); + for (int k = 0; k < num_inter; k++) { + float temp_y = y1 + k * step_y; + ys.push_back(round(temp_y)); + } + std::vector sub_score_vec; + for (int i = 0; i < xs.size(); i++) { + // calculate PAF value of each inner point + float sub_score = paf_x.at(ys[i], xs[i]) * vx + paf_y.at(ys[i], xs[i]) * vy; + sub_score_vec.push_back(sub_score); + } + // remove inner points whose PAF value is not above K_LOCAL_PAF_SCORE_THRESHOLD + sub_score_vec.erase(std::remove_if( + sub_score_vec.begin(), sub_score_vec.end(), + [](const float &x) { + return x <= K_LOCAL_PAF_SCORE_THRESHOLD; + }), sub_score_vec.end()); + std::vector result {0.0, 0.0}; + score = std::accumulate(sub_score_vec.begin(), sub_score_vec.end(), 0.0); + count = sub_score_vec.size(); + result[0] = score / (count + 1e-6); + result[1] = count; + return result; + } + + /** + * @brief Remove conflicting skeletons + * @param src - Source vector that stores skeletons to be processed + * @param dst - Target vector that collects candidate skeletons + * @return APP_ERROR + */ + void OpenPoseMindsporePost::ConntectionNms(std::vector *src, std::vector *dst) { + // Remove conflicting skeletons: two skeletons of the same type conflict if they share an endpoint + std::vector used_idx1 {}; + std::vector used_idx2 {}; + // Sort skeletons in descending order of affinity score + std::sort(src[0].begin(), src[0].end(), GreaterSort); + for (int i = 0; i < src[0].size(); i++) { + PartPair candidate =
src[0][i]; + if (std::find(used_idx1.begin(), used_idx1.end(), candidate.idx1) != used_idx1.end() + || std::find(used_idx2.begin(), used_idx2.end(), candidate.idx2) != used_idx2.end()) { + continue; + } + dst->push_back(candidate); + used_idx1.push_back(candidate.idx1); + used_idx2.push_back(candidate.idx2); + } + } + + /** + * @brief Calculate expected confidence of each possible skeleton and choose candidates + * @param part_idx - Index of skeleton in K_POSE_BODY_PART_SKELETONS + * @param coor - Candidate positions of endpoints + * @param coor_score - Corresponding score of coor + * @param paf_heatmap - PAF heatmap + * @param connections - Target vector that collects candidate skeletons + * @return APP_ERROR + */ + void OpenPoseMindsporePost::ScoreSkeletons(const int part_idx, + const std::vector > &coor, const std::vector > &coor_score, + const std::vector &paf_heatmap, std::vector *connections) { + // Use point1 and point2 to represent the two endpoints of a skeleton + int coco_skeleton_idx1 = K_POSE_BODY_PART_SKELETONS[2 * part_idx]; + int coco_skeleton_idx2 = K_POSE_BODY_PART_SKELETONS[2 * part_idx + 1]; + int index_stride = 2; + int end_point_num = 2; + int paf_x_idx = K_POSE_MAP_INDEX[index_stride * part_idx]; + int paf_y_idx = K_POSE_MAP_INDEX[index_stride * part_idx + 1]; + std::vector endpoints(end_point_num, cv::Point(0, 0)); + std::vector connection_temp {}; + std::vector result {0.0, 0.0}; + // Calculate the affinity score of each skeleton composed of all candidate point1 and point2 + for (int i = 0; i < coor[coco_skeleton_idx1].size(); i++) { + cv::Point point1; + point1.x = coor[coco_skeleton_idx1][i].x; + point1.y = coor[coco_skeleton_idx1][i].y; + endpoints[0] = point1; + for (int j = 0; j < coor[coco_skeleton_idx2].size(); j++) { + cv::Point point2; + point2.x = coor[coco_skeleton_idx2][j].x; + point2.y = coor[coco_skeleton_idx2][j].y; + endpoints[1] = point2; + result = OneSkeletonScore(paf_heatmap[paf_x_idx], paf_heatmap[paf_y_idx], endpoints); + // Keep skeletons with affinity scores greater than 0 and + // valid internal points greater than K_LOCAL_PAF_COUNT_THRESHOLD + if (result[1] <= K_LOCAL_PAF_COUNT_THRESHOLD || result[0] <= 0.0) { + continue; + } + // Store the information of a skeleton in a custom structure PartPair + PartPair skeleton; + skeleton.score = result[0]; + skeleton.partIdx1 = coco_skeleton_idx1; + skeleton.partIdx2 = coco_skeleton_idx2; + skeleton.idx1 = i; + skeleton.idx2 = j; + skeleton.coord1.push_back(point1.x); + skeleton.coord1.push_back(point1.y); + skeleton.coord2.push_back(point2.x); + skeleton.coord2.push_back(point2.y); + skeleton.score1 = coor_score[coco_skeleton_idx1][i]; + skeleton.score2 = coor_score[coco_skeleton_idx2][j]; + connection_temp.push_back(skeleton); + } + } + // For skeletons with the same endpoints, keep the one with larger affinity score + ConntectionNms(&connection_temp, connections); + } + + /** + * @brief Merge a skeleton to an existed person + * @param person_list - Currently existed person list + * @param current_pair - Skeleton to be merged + * @return True if merged successfully, otherwise false + */ + bool OpenPoseMindsporePost::MergeSkeletonToPerson(std::vector > *person_list, + PartPair current_pair) { + // Use point1 and point2 to represent the two endpoints of a skeleton + for (int k = 0; k < person_list[0].size(); k++) { + std::vector ¤t_person = person_list[0][k]; + for (int i = 0; i < current_person.size(); i++) { + if (current_pair.partIdx1 == current_person[i].partIdx1 && + current_pair.idx1 == 
current_person[i].idx1) { + // point1 of current skeleton is the same as point1 of a skeleton in current person + current_person.push_back(current_pair); + return true; + } else if (current_pair.partIdx1 == current_person[i].partIdx2 && + current_pair.idx1 == current_person[i].idx2) { + // point1 of current skeleton is the same as point2 of a skeleton in current person + current_person.push_back(current_pair); + return true; + } else if (current_pair.partIdx2 == current_person[i].partIdx1 && + current_pair.idx2 == current_person[i].idx1) { + // point2 of current skeleton is the same as point1 of a skeleton in current person + current_person.push_back(current_pair); + return true; + } else if (current_pair.partIdx2 == current_person[i].partIdx2 && + current_pair.idx2 == current_person[i].idx2) { + // point2 of current skeleton is the same as point2 of a skeleton in current person + current_person.push_back(current_pair); + return true; + } + } + } + // Cannot merge into any existing person, create a new person + std::vector new_person {}; + new_person.push_back(current_pair); + person_list->push_back(new_person); + return true; + } + + /** + * @brief Group keypoints into skeletons and assemble them into persons + * @param paf_heatmap - PAF heatmap + * @param coor - Coordinates of all the candidate keypoints + * @param coor_score - Corresponding score of coordinates + * @param person_list - Target vector to store persons, each person is stored as a vector of skeletons + * @return APP_ERROR + */ + void OpenPoseMindsporePost::GroupKeypoints(const std::vector &paf_heatmap, + const std::vector > &coor, const std::vector > &coor_score, + std::vector > *person_list) { + for (int i = 0; i < K_NUM_BODY_PARTS + 1; i++) { + // Choose candidate skeletons for each category; there are a total of + // K_NUM_BODY_PARTS + 1 categories of skeletons + std::vector part_connections {}; + ScoreSkeletons(i, coor, coor_score, paf_heatmap, &part_connections); + // Merge newly generated skeletons into an existing person or create a new person + if (i == 0) { + // For the first category, each skeleton of this category stands for a different person + for (int j = 0; j < part_connections.size(); j++) { + std::vector new_person {}; + new_person.push_back(part_connections[j]); + person_list->push_back(new_person); + } + } else if (i == K_NUM_BODY_PARTS - 1 || i == K_NUM_BODY_PARTS) { + // The last two skeleton categories do not contribute to the person score + for (int j = 0; j < part_connections.size(); j++) { + part_connections[j].score = 0; + part_connections[j].score1 = 0; + part_connections[j].score2 = 0; + bool can_merge = MergeSkeletonToPerson(person_list, part_connections[j]); + } + } else { + for (int j = 0; j < part_connections.size(); j++) { + MergeSkeletonToPerson(person_list, part_connections[j]); + } + } + } + } + + /** + * @brief Calculate the score of a person according to its skeletons + * @param person - Target person + * @return Score value + */ + float OpenPoseMindsporePost::PersonScore(const std::vector &person) { + // The score of a person is composed of the scores of all its keypoints and all its skeletons + std::vector seen_keypoints = {}; + float person_score = 0.0; + for (int i = 0; i < person.size(); i++) { + PartPair skeleton = person[i]; + if (std::find(seen_keypoints.begin(), seen_keypoints.end(), skeleton.partIdx1) == seen_keypoints.end()) { + seen_keypoints.push_back(skeleton.partIdx1); + person_score += skeleton.score1; + } + if (std::find(seen_keypoints.begin(), seen_keypoints.end(), skeleton.partIdx2) ==
seen_keypoints.end()) { + seen_keypoints.push_back(skeleton.partIdx2); + person_score += skeleton.score2; + } + person_score += skeleton.score; + } + // Ignore person whose number of skeletons is less than K_PERSON_SKELETON_COUNT_THRESHOLD or + // the average score of each keypoint is less than K_PERSON_KEYPOINT_AVG_SCORE_THRESHOLD + if (seen_keypoints.size() < K_PERSON_SKELETON_COUNT_THRESHOLD || + (person_score / seen_keypoints.size()) < K_PERSON_KEYPOINT_AVG_SCORE_THRESHOLD) { + return 0.0; + } + return person_score; + } + + void OpenPoseMindsporePost::GeneratePersonList(const std::vector &tensors, + const std::vector &vision_infos, std::vector > *person_list) { + std::vector > result = ReadDataFromTensor(tensors); + std::vector keypoint_heatmap, paf_heatmap; + keypoint_heatmap = result[0]; + paf_heatmap = result[1]; + // Resize heatmaps to the size of the input image + ResizeHeatmaps(vision_infos, &keypoint_heatmap, &paf_heatmap); + // Extract candidate keypoints + std::vector > coor {}; + std::vector > coor_score {}; + ExtractKeypoints(keypoint_heatmap, &coor, &coor_score); + // Group candidate keypoints to candidate skeletons and generate person + GroupKeypoints(paf_heatmap, coor, coor_score, person_list); + } + + APP_ERROR OpenPoseMindsporePost::selfProcess(const std::vector &tensors, + const std::vector &vision_infos, std::vector > *person_list) { + auto inputs = tensors; + APP_ERROR ret = CheckAndMoveTensors(inputs); + if (ret != APP_ERR_OK) { + LogError << "CheckAndMoveTensors failed, ret=" << ret; + return ret; + } + GeneratePersonList(tensors, vision_infos, person_list); + LogInfo << "Postprocess success."; + return APP_ERR_OK; + } +} // namespace MxBase + + + diff --git a/cv/pose/openpose/mindspore/infer/mxbase/process/src/PostProcess/OpenPoseMindsporePost.h b/cv/pose/openpose/mindspore/infer/mxbase/process/src/PostProcess/OpenPoseMindsporePost.h index ceaff272a..44c0de7f3 100644 --- a/cv/pose/openpose/mindspore/infer/mxbase/process/src/PostProcess/OpenPoseMindsporePost.h +++ b/cv/pose/openpose/mindspore/infer/mxbase/process/src/PostProcess/OpenPoseMindsporePost.h @@ -1,81 +1,81 @@ -/* -* Copyright (c) 2021. Huawei Technologies Co., Ltd. All rights reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#ifndef OPENPOSEPOSTPROCESS_OPENPOSEPOSTPROCESS_H -#define OPENPOSEPOSTPROCESS_OPENPOSEPOSTPROCESS_H -#include -#include "MxTools/PluginToolkit/base/MxPluginGenerator.h" -#include "MxTools/PluginToolkit/base/MxPluginBase.h" -#include "MxTools/PluginToolkit/metadata/MxpiMetadataManager.h" -#include "MxTools/Proto/MxpiDataType.pb.h" -#include "MxBase/ErrorCode/ErrorCode.h" -#include "MxBase/PostProcessBases/PostProcessBase.h" -#include "MxBase/PostProcessBases/ObjectPostProcessBase.h" -#include "mxpiOpenposeProto.pb.h" -#include "opencv4/opencv2/opencv.hpp" - -struct PartPair { - float score; - int partIdx1; - int partIdx2; - int idx1; - int idx2; - std::vector coord1; - std::vector coord2; - float score1; - float score2; -}; - -namespace MxBase { -class OpenPoseMindsporePost : public ObjectPostProcessBase { - public: - OpenPoseMindsporePost() = default; - - ~OpenPoseMindsporePost() = default; - - OpenPoseMindsporePost(const OpenPoseMindsporePost &other) = default; - - OpenPoseMindsporePost &operator=(const OpenPoseMindsporePost &other); - - APP_ERROR selfProcess(const std::vector &tensors, - const std::vector &vision_infos, std::vector > *person_list); - - void GeneratePersonList(const std::vector &tensors, - const std::vector &vision_infos, std::vector > *person_list); - - void ResizeHeatmaps(const std::vector &vision_infos, std::vector *keypoint_heatmap, - std::vector *paf_heatmap); - - void ExtractKeypoints(const std::vector &keypoint_heatmap, - std::vector > *coor, std::vector > *coor_score); - - void GroupKeypoints(const std::vector& paf_heatmap, const std::vector > &coor, - const std::vector > &coor_score, std::vector > *person_list); - - void ScoreSkeletons(const int part_idx, const std::vector > &coor, - const std::vector > &coor_score, const std::vector &paf_heatmap, - std::vector *connections); - - std::vector OneSkeletonScore(const cv::Mat &paf_x, const cv::Mat &paf_y, - const std::vector &endpoints); - - void ConntectionNms(std::vector *src, std::vector *dst); - - bool MergeSkeletonToPerson(std::vector > *person_list, PartPair current_pair); - - float PersonScore(const std::vector &person); -}; -#endif // OpenPose_MINSPORE_PORT_H -} // namespace MxBase +/* +* Copyright (c) 2021. Huawei Technologies Co., Ltd. All rights reserved. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ +#ifndef OPENPOSEPOSTPROCESS_OPENPOSEPOSTPROCESS_H +#define OPENPOSEPOSTPROCESS_OPENPOSEPOSTPROCESS_H +#include +#include "MxTools/PluginToolkit/base/MxPluginGenerator.h" +#include "MxTools/PluginToolkit/base/MxPluginBase.h" +#include "MxTools/PluginToolkit/metadata/MxpiMetadataManager.h" +#include "MxTools/Proto/MxpiDataType.pb.h" +#include "MxBase/ErrorCode/ErrorCode.h" +#include "MxBase/PostProcessBases/PostProcessBase.h" +#include "MxBase/PostProcessBases/ObjectPostProcessBase.h" +#include "mxpiOpenposeProto.pb.h" +#include "opencv4/opencv2/opencv.hpp" + +struct PartPair { + float score; + int partIdx1; + int partIdx2; + int idx1; + int idx2; + std::vector coord1; + std::vector coord2; + float score1; + float score2; +}; + +namespace MxBase { +class OpenPoseMindsporePost : public ObjectPostProcessBase { + public: + OpenPoseMindsporePost() = default; + + ~OpenPoseMindsporePost() = default; + + OpenPoseMindsporePost(const OpenPoseMindsporePost &other) = default; + + OpenPoseMindsporePost &operator=(const OpenPoseMindsporePost &other); + + APP_ERROR selfProcess(const std::vector &tensors, + const std::vector &vision_infos, std::vector > *person_list); + + void GeneratePersonList(const std::vector &tensors, + const std::vector &vision_infos, std::vector > *person_list); + + void ResizeHeatmaps(const std::vector &vision_infos, std::vector *keypoint_heatmap, + std::vector *paf_heatmap); + + void ExtractKeypoints(const std::vector &keypoint_heatmap, + std::vector > *coor, std::vector > *coor_score); + + void GroupKeypoints(const std::vector& paf_heatmap, const std::vector > &coor, + const std::vector > &coor_score, std::vector > *person_list); + + void ScoreSkeletons(const int part_idx, const std::vector > &coor, + const std::vector > &coor_score, const std::vector &paf_heatmap, + std::vector *connections); + + std::vector OneSkeletonScore(const cv::Mat &paf_x, const cv::Mat &paf_y, + const std::vector &endpoints); + + void ConntectionNms(std::vector *src, std::vector *dst); + + bool MergeSkeletonToPerson(std::vector > *person_list, PartPair current_pair); + + float PersonScore(const std::vector &person); +}; +#endif // OpenPose_MINSPORE_PORT_H +} // namespace MxBase diff --git a/cv/pose/openpose/mindspore/infer/mxbase/proto/CMakeLists.txt b/cv/pose/openpose/mindspore/infer/mxbase/proto/CMakeLists.txt index 950e25643..293e42887 100644 --- a/cv/pose/openpose/mindspore/infer/mxbase/proto/CMakeLists.txt +++ b/cv/pose/openpose/mindspore/infer/mxbase/proto/CMakeLists.txt @@ -1,48 +1,48 @@ -cmake_minimum_required(VERSION 3.5.1) -project(ProtoFile) - -set(TARGET_LIBRARY mxpiopenposeproto) - -# Compile options -add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) -add_definitions(-Dgoogle=mindxsdk_private) -add_compile_options(-std=c++11 -fPIC -fstack-protector-all -pie -Wno-deprecated-declarations) - -set(MX_SDK_HOME "$ENV{MX_SDK_HOME}") - -if(EXISTS ${MX_SDK_HOME}) - set(PROTOBUF_FOUND TRUE) - set(PROTOBUF_PROTOC_EXECUTABLE ${MX_SDK_HOME}/opensource/bin/protoc) - set(PROTOBUF_INCLUDE_DIRS ${MX_SDK_HOME}/opensource/include) - include_directories(${PROTOBUF_INCLUDE_DIRS}) - link_directories(${MX_SDK_HOME}/opensource/lib) - -endif() -if(PROTOBUF_FOUND) - message(STATUS "protobuf library found") - -else() - message(FATAL_ERROR "protobuf library is needed but cant be found") -endif() - -LIST(APPEND PROTO_FLAGS -I${PROJECT_SOURCE_DIR}) - -EXECUTE_PROCESS( - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ${PROTO_FLAGS} --cpp_out=${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/mxpiOpenposeProto.proto - COMMAND 
${PROTOBUF_PROTOC_EXECUTABLE} ${PROTO_FLAGS} --python_out=${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/mxpiOpenposeProto.proto - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} -) - -file(GLOB_RECURSE SOURCE_FILES ${PROJECT_SOURCE_DIR}/*.cc) -add_library(${TARGET_LIBRARY} SHARED ${SOURCE_FILES}) -target_link_libraries(${TARGET_LIBRARY} mindxsdk_protobuf) -#target_link_libraries(${TARGET_LIBRARY} -Wl,-z,relro,-z,now,-z,noexecstack -s) - -if(ENABLE_TEST) - target_link_libraries(${TARGET_LIBRARY} gcov) -endif() - -set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib) -install(TARGETS ${TARGET_LIBRARY} LIBRARY DESTINATION ${PROJECT_SOURCE_DIR}/lib) +cmake_minimum_required(VERSION 3.5.1) +project(ProtoFile) + +set(TARGET_LIBRARY mxpiopenposeproto) + +# Compile options +add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) +add_definitions(-Dgoogle=mindxsdk_private) +add_compile_options(-std=c++11 -fPIC -fstack-protector-all -pie -Wno-deprecated-declarations) + +set(MX_SDK_HOME "$ENV{MX_SDK_HOME}") + +if(EXISTS ${MX_SDK_HOME}) + set(PROTOBUF_FOUND TRUE) + set(PROTOBUF_PROTOC_EXECUTABLE ${MX_SDK_HOME}/opensource/bin/protoc) + set(PROTOBUF_INCLUDE_DIRS ${MX_SDK_HOME}/opensource/include) + include_directories(${PROTOBUF_INCLUDE_DIRS}) + link_directories(${MX_SDK_HOME}/opensource/lib) + +endif() +if(PROTOBUF_FOUND) + message(STATUS "protobuf library found") + +else() + message(FATAL_ERROR "protobuf library is needed but cant be found") +endif() + +LIST(APPEND PROTO_FLAGS -I${PROJECT_SOURCE_DIR}) + +EXECUTE_PROCESS( + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ${PROTO_FLAGS} --cpp_out=${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/mxpiOpenposeProto.proto + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ${PROTO_FLAGS} --python_out=${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/mxpiOpenposeProto.proto + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +) + +file(GLOB_RECURSE SOURCE_FILES ${PROJECT_SOURCE_DIR}/*.cc) +add_library(${TARGET_LIBRARY} SHARED ${SOURCE_FILES}) +target_link_libraries(${TARGET_LIBRARY} mindxsdk_protobuf) +#target_link_libraries(${TARGET_LIBRARY} -Wl,-z,relro,-z,now,-z,noexecstack -s) + +if(ENABLE_TEST) + target_link_libraries(${TARGET_LIBRARY} gcov) +endif() + +set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib) +install(TARGETS ${TARGET_LIBRARY} LIBRARY DESTINATION ${PROJECT_SOURCE_DIR}/lib) diff --git a/cv/pose/openpose/mindspore/infer/mxbase/proto/mxpiOpenposeProto.proto b/cv/pose/openpose/mindspore/infer/mxbase/proto/mxpiOpenposeProto.proto index 4f3b4d88e..413259642 100644 --- a/cv/pose/openpose/mindspore/infer/mxbase/proto/mxpiOpenposeProto.proto +++ b/cv/pose/openpose/mindspore/infer/mxbase/proto/mxpiOpenposeProto.proto @@ -1,33 +1,33 @@ -syntax = "proto3"; -package mxpiopenposeproto; -message MxpiMetaHeader -{ -string parentName = 1; -int32 memberId = 2; -string dataSource = 3; -} - -message MxpiSkeletonInfo -{ -int32 cocoSkeletonIndex1 = 1; -int32 cocoSkeletonIndex2 = 2; -float x0 = 3; -float y0 = 4; -float point1Score = 5; -float x1 = 6; -float y1 = 7; -float point2Score = 8; -float skeletonScore = 9; -} - -message MxpiPersonInfo -{ -repeated MxpiMetaHeader headerVec = 1; -repeated MxpiSkeletonInfo skeletonInfoVec = 2; -float score = 3; -} - -message MxpiPersonList -{ -repeated MxpiPersonInfo personInfoVec = 1; +syntax = "proto3"; +package mxpiopenposeproto; +message MxpiMetaHeader +{ +string parentName = 1; +int32 memberId = 2; +string dataSource = 3; +} + +message MxpiSkeletonInfo +{ +int32 cocoSkeletonIndex1 = 1; +int32 cocoSkeletonIndex2 = 2; +float x0 = 3; +float y0 = 4; +float point1Score = 5; 
+float x1 = 6; +float y1 = 7; +float point2Score = 8; +float skeletonScore = 9; +} + +message MxpiPersonInfo +{ +repeated MxpiMetaHeader headerVec = 1; +repeated MxpiSkeletonInfo skeletonInfoVec = 2; +float score = 3; +} + +message MxpiPersonList +{ +repeated MxpiPersonInfo personInfoVec = 1; } \ No newline at end of file diff --git a/cv/pose/openpose/mindspore/infer/sdk/plugins/CMakeLists.txt b/cv/pose/openpose/mindspore/infer/sdk/plugins/CMakeLists.txt index 8f7799dfe..0b4a2a841 100644 --- a/cv/pose/openpose/mindspore/infer/sdk/plugins/CMakeLists.txt +++ b/cv/pose/openpose/mindspore/infer/sdk/plugins/CMakeLists.txt @@ -1,26 +1,26 @@ -cmake_minimum_required(VERSION 3.5.2) -project(OpenposePlugin) -add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0 -Dgoogle=mindxsdk_private) - -set(PLUGIN_NAME "mxpi_openposepostprocess") -set(TARGET_LIBRARY ${PLUGIN_NAME}) -set(MX_SDK_HOME "$ENV{MX_SDK_HOME}") - -include_directories(${CMAKE_CURRENT_BINARY_DIR}) -include_directories(${MX_SDK_HOME}/include) -include_directories(${MX_SDK_HOME}/opensource/include) -include_directories(${MX_SDK_HOME}/opensource/include/gstreamer-1.0) -include_directories(${MX_SDK_HOME}/opensource/include/glib-2.0) -include_directories(${MX_SDK_HOME}/opensource/lib/glib-2.0/include) -include_directories(${MX_SDK_HOME}/opensource/include/opencv4) -include_directories(${PROJECT_SOURCE_DIR}/../proto) -link_directories(${MX_SDK_HOME}/lib) -link_directories(${MX_SDK_HOME}/opensource/lib) -link_directories(${PROJECT_SOURCE_DIR}/../proto/lib) - -add_compile_options(-std=c++11 -fPIC -fstack-protector-all -pie -Wno-deprecated-declarations) -add_compile_options("-DPLUGIN_NAME=${PLUGIN_NAME}") -add_definitions(-DENABLE_DVPP_INTERFACE) -add_library(${TARGET_LIBRARY} SHARED MxpiOpenposePostProcess.cpp) -target_link_libraries(${TARGET_LIBRARY} glib-2.0 gstreamer-1.0 gobject-2.0 gstbase-1.0 gmodule-2.0 glog) +cmake_minimum_required(VERSION 3.5.2) +project(OpenposePlugin) +add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0 -Dgoogle=mindxsdk_private) + +set(PLUGIN_NAME "mxpi_openposepostprocess") +set(TARGET_LIBRARY ${PLUGIN_NAME}) +set(MX_SDK_HOME "$ENV{MX_SDK_HOME}") + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${MX_SDK_HOME}/include) +include_directories(${MX_SDK_HOME}/opensource/include) +include_directories(${MX_SDK_HOME}/opensource/include/gstreamer-1.0) +include_directories(${MX_SDK_HOME}/opensource/include/glib-2.0) +include_directories(${MX_SDK_HOME}/opensource/lib/glib-2.0/include) +include_directories(${MX_SDK_HOME}/opensource/include/opencv4) +include_directories(${PROJECT_SOURCE_DIR}/../proto) +link_directories(${MX_SDK_HOME}/lib) +link_directories(${MX_SDK_HOME}/opensource/lib) +link_directories(${PROJECT_SOURCE_DIR}/../proto/lib) + +add_compile_options(-std=c++11 -fPIC -fstack-protector-all -pie -Wno-deprecated-declarations) +add_compile_options("-DPLUGIN_NAME=${PLUGIN_NAME}") +add_definitions(-DENABLE_DVPP_INTERFACE) +add_library(${TARGET_LIBRARY} SHARED MxpiOpenposePostProcess.cpp) +target_link_libraries(${TARGET_LIBRARY} glib-2.0 gstreamer-1.0 gobject-2.0 gstbase-1.0 gmodule-2.0 glog) target_link_libraries(${TARGET_LIBRARY} mxpidatatype plugintoolkit mxbase mindxsdk_protobuf mxpiopenposeproto) \ No newline at end of file diff --git a/cv/pose/openpose/mindspore/infer/sdk/plugins/MxpiOpenposePostProcess.cpp b/cv/pose/openpose/mindspore/infer/sdk/plugins/MxpiOpenposePostProcess.cpp index 276de54d4..22e7d0146 100644 --- a/cv/pose/openpose/mindspore/infer/sdk/plugins/MxpiOpenposePostProcess.cpp +++ 
b/cv/pose/openpose/mindspore/infer/sdk/plugins/MxpiOpenposePostProcess.cpp @@ -1,806 +1,806 @@ -/* -* Copyright (c) 2021. Huawei Technologies Co., Ltd. All rights reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#include "MxpiOpenposePostProcess.h" -#include -#include -#include -#include -#include -#include -#include "opencv2/opencv.hpp" -#include "MxBase/Log/Log.h" -#include "MxBase/Tensor/TensorBase/TensorBase.h" - -using namespace MxBase; -using namespace MxPlugins; -using namespace MxTools; -using namespace std; -using namespace cv; - -namespace { - auto uint8Deleter = [] (uint8_t *p) { }; - const int K_NUM_BODY_PARTS = 18; - const int K_UPSAMPLED_STRIDE = 8; - - // CocoSkeletonsNetwork - const std::vector K_POSE_MAP_INDEX { - 12, 13, 20, 21, 14, 15, 16, 17, 22, 23, - 24, 25, 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 28, 29, 30, 31, 34, 35, - 32, 33, 36, 37, 18, 19, 26, 27 - }; - - // CocoSkeletons - const std::vector K_POSE_BODY_PART_SKELETONS { - 1, 2, 1, 5, 2, 3, 3, 4, 5, 6, - 6, 7, 1, 8, 8, 9, 9, 10, 1, 11, - 11, 12, 12, 13, 1, 0, 0, 14, 14, 16, - 0, 15, 15, 17, 2, 16, 5, 17 - }; - // Nms score threshold - const float K_NMS_THRESHOLD = 0.05; - // Range of nearest neighbors - const int K_NEAREST_KEYPOINTS_THRESHOLD = 6; - // PAF score threshold as a valid inner point on a skeleton - const float K_LOCAL_PAF_SCORE_THRESHOLD = 0.05; - // The minimum number of valid inner points a skeleton includes to be regarded as a correct skeleton - const int K_LOCAL_PAF_COUNT_THRESHOLD = 8; - // The minimum number of skeletons needed to form a person - const int K_PERSON_SKELETON_COUNT_THRESHOLD = 3; - // The lowest average score per keypoint in a person - const float K_PERSON_KEYPOINT_AVG_SCORE_THRESHOLD = 0.2; -} // namespace - -/** - * @brief decode MxpiTensorPackageList - * @param tensorPackageList - Source tensorPackageList - * @param tensors - Target TensorBase data - */ -static void GetTensors(const MxTools::MxpiTensorPackageList tensor_package_list, -std::vector *tensors) { - for (int i = 0; i < tensor_package_list.tensorpackagevec_size(); ++i) { - for (int j = 0; j < tensor_package_list.tensorpackagevec(i).tensorvec_size(); j++) { - MxBase::MemoryData memory_data = {}; - memory_data.deviceId = tensor_package_list.tensorpackagevec(i).tensorvec(j).deviceid(); - memory_data.type = (MxBase::MemoryData::MemoryType)tensor_package_list. - tensorpackagevec(i).tensorvec(j).memtype(); - memory_data.size = (uint32_t)tensor_package_list. - tensorpackagevec(i).tensorvec(j).tensordatasize(); - memory_data.ptrData = reinterpret_cast(tensor_package_list. - tensorpackagevec(i).tensorvec(j).tensordataptr()); - std::vector output_shape = {}; - for (int k = 0; k < tensor_package_list. - tensorpackagevec(i).tensorvec(j).tensorshape_size(); ++k) { - output_shape.push_back((uint32_t)tensor_package_list. - tensorpackagevec(i).tensorvec(j).tensorshape(k)); - } - MxBase::TensorBase tmp_tensor(memory_data, true, output_shape, - (MxBase::TensorDataType)tensor_package_list. 
- tensorpackagevec(i).tensorvec(j).tensordatatype()); - tensors->push_back(tmp_tensor); - } - } -} - -/** - * @brief Get information of origin input image and aligned image - * @param vision_list - MxpiVisionList object obtained from upstream plugin - * @param vision_infos - Result vector with elements [origin_height, origin_width, aligned_height, aligned_width] - */ -static void GetImageSizes(const MxTools::MxpiVisionList vision_list, std::vector *vision_infos) { - MxpiVision vision = vision_list.visionvec(0); - MxpiVisionInfo vision_info = vision.visioninfo(); - vision_infos->push_back(vision_info.height()); - vision_infos->push_back(vision_info.width()); - vision_infos->push_back(vision_info.heightaligned()); - vision_infos->push_back(vision_info.widthaligned()); -} - -/** - * @brief Parsing TensorBase data to keypoint heatmap and PAF heatmap of openpose model - * @param tensors - TensorBase vector - * @return Two-element vector, keeping keypoint heatmap and paf heatmap respectively - */ -static std::vector > ReadDataFromTensorPytorch(const std::vector &tensors) { - auto shape = tensors[11].GetShape(); - int channel_keypoint = shape[1]; - int height_index = 2, width_index = 3; - int height = shape[height_index]; - int width = shape[width_index]; - auto shape_p = tensors[5].GetShape(); - int channel_paf = shape_p[1]; - // Read keypoint data - auto dataPtr = reinterpret_cast(tensors[11].GetBuffer()); - std::shared_ptr keypoint_pointer; - keypoint_pointer.reset(dataPtr, uint8Deleter); - std::vector keypoint_heatmap {}; - int idx = 0; - for (int i = 0; i < channel_keypoint; i++) { - cv::Mat single_channel_mat(height, width, CV_32FC1, cv::Scalar(0)); - for (int j = 0; j < height; j++) { - float *ptr = single_channel_mat.ptr(j); - for (int k = 0; k < width; k++) { - ptr[k] = static_cast(keypoint_pointer.get())[idx]; - idx += 1; - } - } - keypoint_heatmap.push_back(single_channel_mat); - } - // Read PAF data - auto data_paf_ptr = reinterpret_cast(tensors[5].GetBuffer()); - std::shared_ptr paf_pointer; - paf_pointer.reset(data_paf_ptr, uint8Deleter); - std::vector paf_heatmap {}; - idx = 0; - for (int i = 0; i < channel_paf; i++) { - cv::Mat single_channel_mat(height, width, CV_32FC1, cv::Scalar(0)); - for (int j = 0; j < height; j++) { - float *ptr = single_channel_mat.ptr(j); - for (int k = 0; k < width; k++) { - ptr[k] = static_cast(paf_pointer.get())[idx]; - idx += 1; - } - } - paf_heatmap.push_back(single_channel_mat); - } - std::vector > result = {keypoint_heatmap, paf_heatmap}; - return result; -} - -/** - * @brief Non-Maximum Suppression, keep points that is greater than all its four surround points, - * i.e. 
up, bottom, left and right points - * @param plain - 2D data for NMS - * @param threshold - NMS threshold - */ -static void NMS(cv::Mat *plain, float threshold) { - cv::GaussianBlur(*plain, *plain, cv::Size(17, 17), 2.5, 2.5); - // Keep points with score below the NMS score threshold are set to 0 - plain->setTo(0, *plain < threshold); - // Find points that is greater than all its four surround points - cv::Mat plain_with_border; - int border_padding = 2; - int bottom_right_index = 2; - cv::copyMakeBorder(*plain, plain_with_border, border_padding, border_padding, border_padding, border_padding, - BORDER_CONSTANT, cv::Scalar(0)); - cv::Mat plain_with_border_clone = plain_with_border.clone(); - int sub_mat_cols = plain_with_border.cols - border_padding; - int sub_mat_rows = plain_with_border.rows - border_padding; - cv::Mat plain_center = plain_with_border(cv::Rect(1, 1, sub_mat_cols, sub_mat_rows)); - cv::Mat plain_bottom = plain_with_border(cv::Rect(1, bottom_right_index, sub_mat_cols, sub_mat_rows)); - cv::Mat plain_up = plain_with_border(cv::Rect(1, 0, sub_mat_cols, sub_mat_rows)); - cv::Mat plain_left = plain_with_border(cv::Rect(0, 1, sub_mat_cols, sub_mat_rows)); - cv::Mat plain_right = plain_with_border(cv::Rect(bottom_right_index, 1, sub_mat_cols, sub_mat_rows)); - - for (int i = 0; i < plain_center.rows; i++) { - float *center_ptr = plain_center.ptr(i); - float *bottom_ptr = plain_bottom.ptr(i); - float *up_ptr = plain_up.ptr(i); - float *left_ptr = plain_left.ptr(i); - float *right_ptr = plain_right.ptr(i); - float *clone_border_ptr = plain_with_border_clone.ptr(i + 1); - for (int j = 0; j < plain_center.cols; j++) { - if (!((center_ptr[j] > up_ptr[j]) && (center_ptr[j] > bottom_ptr[j]) && - (center_ptr[j] > left_ptr[j]) && (center_ptr[j] > right_ptr[j]))) { - clone_border_ptr[j + 1] = 0; - } - } - } - *plain = plain_with_border_clone(cv::Rect(border_padding, border_padding, plain_center.cols - border_padding, - plain_center.rows - border_padding)).clone(); -} - -/** - * @brief Comparison between two PartPair elements - * @param p1 - PartPair p1 - * @param p2 - PartPair p2 - * @return True if the score of p1 is greater than that of p2 - */ -static bool GreaterSort(PartPair p1, PartPair p2) { - return p1.score > p2.score; -} - -/** - * @brief Comparison between two cv::Point elements - * @param p1 - cv::Point p1 - * @param p2 - cv::Point p2 - * @return True if the x coordinate of p2 is greater than that of p1 - */ -static bool PointSort(cv::Point p1, cv::Point p2) { - return p1.x < p2.x; -} - -/** - * @brief Resize output heatmaps to the size of the origin image - * @param keypoint_heatmap - Keypoint heatmap, each channel of the heatmap is stored as a Mat - * @param paf_heatmap - PAF heatmap, each channel of the heatmap is stored as a Mat - * @param vision_infos - Vision infos of origin image and aligned image - * @return APP_ERROR - */ -APP_ERROR MxpiOpenposePostProcess::ResizeHeatmaps(const std::vector &vision_infos, - std::vector *keypoint_heatmap, std::vector *paf_heatmap) { - // Calculate padding direction and padding value - int origin_height = vision_infos[0]; - int origin_width = vision_infos[1]; - /* - std::cout << "origin_height= " << origin_height << std::endl; - std::cout << "origin_width= " << origin_width << std::endl; - std::cout << "inputHeight_= " << inputHeight_ << std::endl; - std::cout << "inputWidth_= " << inputWidth_ << std::endl; - */ - // padding along height - int padding_direction = 0; - if (origin_height > origin_width) { - // padding along width - 
padding_direction = 1; - } - int padding_value = 0; - if (padding_direction == 0) { - // pad height - padding_value = floor(inputHeight_ - inputWidth_ * origin_height / origin_width); - } else { - // pad width - padding_value = floor(inputWidth_ - inputHeight_ * origin_width / origin_height); - } - - // Channel Split Resize - for (int i = 0; i < keypoint_heatmap[0].size(); i++) { - cv::Mat single_channel_mat = keypoint_heatmap[0][i]; - cv::resize(single_channel_mat, single_channel_mat, Size(0, 0), - K_UPSAMPLED_STRIDE, K_UPSAMPLED_STRIDE, INTER_CUBIC); - if (padding_direction == 0) { - // remove height padding - single_channel_mat = - single_channel_mat(cv::Rect(0, 0, - single_channel_mat.cols, single_channel_mat.rows - padding_value)); - } else { - // remove width padding - single_channel_mat = - single_channel_mat(cv::Rect(0, 0, - single_channel_mat.cols - padding_value, single_channel_mat.rows)); - } - cv::resize(single_channel_mat, single_channel_mat, Size(origin_width, origin_height), 0, 0); - keypoint_heatmap[0][i] = single_channel_mat; - } - for (int i = 0; i < paf_heatmap[0].size(); i++) { - cv::Mat single_channel_mat = paf_heatmap[0][i]; - cv::resize(single_channel_mat, single_channel_mat, Size(0, 0), - K_UPSAMPLED_STRIDE, K_UPSAMPLED_STRIDE); - if (padding_direction == 0) { - single_channel_mat = - single_channel_mat(cv::Rect(0, 0, - single_channel_mat.cols, single_channel_mat.rows - padding_value)); - } else { - single_channel_mat = - single_channel_mat(cv::Rect(0, 0, - single_channel_mat.cols - padding_value, single_channel_mat.rows)); - } - cv::resize(single_channel_mat, single_channel_mat, Size(origin_width, origin_height), 0, 0); - paf_heatmap[0][i] = single_channel_mat; - } - return APP_ERR_OK; -} - -/** - * @brief Extract candidate keypoints - * @param keypoint_heatmap - Resized keypoint heatmap - * @param coor - Keep extracted result, store a point in a cv::Point object, - * store keypoints of different channel in different vectors - * @param coor_score - Scores corresponding to extracted keypoints - * @return APP_ERROR - */ -APP_ERROR MxpiOpenposePostProcess::ExtractKeypoints(const std::vector &keypoint_heatmap, -std::vector > *coor, std::vector > *coor_score) { - int polynomial_exponent = 2; - int peak_counter = 0; - for (int i = 0; i < keypoint_heatmap.size() - 1; i++) { - // NMS - cv::Mat smoothProbMap; - cv::GaussianBlur(keypoint_heatmap[i], smoothProbMap, cv::Size(17, 17), 2.5, 2.5); - - // - NMS(&smoothProbMap, K_NMS_THRESHOLD); - std::vector non_zero_coordinates; - // - cv::findNonZero(smoothProbMap, non_zero_coordinates); - std::sort(non_zero_coordinates.begin(), non_zero_coordinates.end(), PointSort); - std::vector suppressed(non_zero_coordinates.size(), 0); - std::vector keypoints_without_nearest {}; - std::vector keypoints_score {}; - // Remove other keypoints within a certain range around one keypoints - for (int j = 0; j < non_zero_coordinates.size(); j++) { - if (suppressed[j]) { - continue; - } - int thrown_index = j + 1; - auto it = std::find_if(std::begin(non_zero_coordinates) + j + 1, std::end(non_zero_coordinates), - [non_zero_coordinates, j, polynomial_exponent](cv::Point p) { - float distance = powf((non_zero_coordinates[j].x - p.x), polynomial_exponent) + - powf((non_zero_coordinates[j].y - p.y), polynomial_exponent); - return sqrtf(distance) < K_NEAREST_KEYPOINTS_THRESHOLD; - }); - while (it != std::end(non_zero_coordinates)) { - thrown_index = std::distance(std::begin(non_zero_coordinates) + thrown_index, it) + thrown_index; - 
suppressed[thrown_index] = 1; - it = std::find_if(std::next(it), std::end(non_zero_coordinates), - [non_zero_coordinates, j, polynomial_exponent](cv::Point p) { - float distance = powf((non_zero_coordinates[j].x - p.x), polynomial_exponent) + - powf((non_zero_coordinates[j].y - p.y), polynomial_exponent); - return sqrtf(distance) < K_NEAREST_KEYPOINTS_THRESHOLD; - }); - } - keypoints_without_nearest.push_back(non_zero_coordinates[j]); - // - keypoints_score.push_back(smoothProbMap.at - (non_zero_coordinates[j].y, non_zero_coordinates[j].x)); - } - coor->push_back(keypoints_without_nearest); - coor_score->push_back(keypoints_score); - } - return APP_ERR_OK; -} - -/** - * @brief Compute expected confidence for each candidate skeleton - * @param endpoints - Coordinates of the two end points of a skeleton - * @param paf_x - PAF heatmap of x coordinate - * @param paf_y - PAF heatmap of y coordinate - * @return result - Keep confidence information of this skeleton in the form: - * [confidence score, number of successfully hit sub points] - */ -std::vector MxpiOpenposePostProcess::OneSkeletonScore(const cv::Mat &paf_x, const cv::Mat &paf_y, - const std::vector &endpoints) { - int x1 = endpoints[0].x, y1 = endpoints[0].y; - int x2 = endpoints[1].x, y2 = endpoints[1].y; - // affinity score of this skeleton - float score = 0; - // count: number of valid inner points on this skeleton - int count = 0, num_inter = 10; - float dx = x2 - x1; - float dy = y2 - y1; - float norm_vec = sqrt(dx * dx + dy * dy); - float vx = dx / (norm_vec + 1e-6); - float vy = dy / (norm_vec + 1e-6); - // generate 10 points equally spaced on this skeleton - std::vector xs {}; - float step_x = dx / (num_inter - 1); - for (int k = 0; k < num_inter; k++) { - float temp_x = x1 + k * step_x; - xs.push_back(round(temp_x)); - } - std::vector ys {}; - float step_y = dy / (num_inter - 1); - for (int k = 0; k < num_inter; k++) { - float temp_y = y1 + k * step_y; - ys.push_back(round(temp_y)); - } - std::vector sub_score_vec; - // calculate PAF value of each inner point - for (int i = 0; i < xs.size(); i++) { - float sub_score = paf_x.at(ys[i], xs[i]) * vx + paf_y.at(ys[i], xs[i]) * vy; - sub_score_vec.push_back(sub_score); - } - // remove inner points such that has PAF value < K_LOCAL_PAF_SCORE_THRESHOLD - sub_score_vec.erase(std::remove_if( - sub_score_vec.begin(), sub_score_vec.end(), - [](const float &x) { - return x <= K_LOCAL_PAF_SCORE_THRESHOLD; - }), sub_score_vec.end()); - std::vector result {0.0, 0.0}; - score = std::accumulate(sub_score_vec.begin(), sub_score_vec.end(), 0.0); - count = sub_score_vec.size(); - result[0] = score / (count + 1e-6); - result[1] = count; - return result; -} - -/** - * @brief Remove conflict skeletons - * @param src - Source vector that stores skeletons to be processed - * @param dst - Target vector that collects candidate skeletons - * @return APP_ERROR - */ -APP_ERROR MxpiOpenposePostProcess::ConntectionNms(std::vector *src, std::vector *dst) { - // Remove conflict skeletons, if two skeletons of the same type share a same end point, they are conflict - std::vector used_idx1 {}; - std::vector used_idx2 {}; - // Sort skeletons in ascending order of affinity score - std::sort(src[0].begin(), src[0].end(), GreaterSort); - for (int i = 0; i < src[0].size(); i++) { - PartPair candidate = src[0][i]; - if (std::find(used_idx1.begin(), used_idx1.end(), candidate.idx1) != used_idx1.end() - || std::find(used_idx2.begin(), used_idx2.end(), candidate.idx2) != used_idx2.end()) { - continue; - } - 
dst->push_back(candidate); - used_idx1.push_back(candidate.idx1); - used_idx2.push_back(candidate.idx2); - } - return APP_ERR_OK; -} - -/** - * @brief Calculate expected confidence of each possible skeleton and choose candidates - * @param part_idx - Index of skeleton in K_POSE_BODY_PART_SKELETONS - * @param coor - Candidate positions of endpoints - * @param coor_score - Corresponding score of coor - * @param paf_heatmap - PAF heatmap - * @param connections - Target vector that collects candidate skeletons - * @return APP_ERROR - */ -APP_ERROR MxpiOpenposePostProcess::ScoreSkeletons(const int part_idx, - const std::vector > &coor, const std::vector > &coor_score, - const std::vector &paf_heatmap, std::vector *connections) { - // Use point1 and point2 to represent the two endpoints of a skeleton - int coco_skeleton_idx1 = K_POSE_BODY_PART_SKELETONS[2 * part_idx]; - int coco_skeleton_idx2 = K_POSE_BODY_PART_SKELETONS[2 * part_idx + 1]; - int index_stride = 2; - int end_point_num = 2; - int paf_x_idx = K_POSE_MAP_INDEX[index_stride * part_idx]; - int paf_y_idx = K_POSE_MAP_INDEX[index_stride * part_idx + 1]; - std::vector endpoints(end_point_num, cv::Point(0, 0)); - std::vector connection_temp {}; - std::vector result {0.0, 0.0}; - // Calculate the affinity score of each skeleton composed of all candidate point1 and point2 - for (int i = 0; i < coor[coco_skeleton_idx1].size(); i++) { - cv::Point point1; - point1.x = coor[coco_skeleton_idx1][i].x; - point1.y = coor[coco_skeleton_idx1][i].y; - endpoints[0] = point1; - for (int j = 0; j < coor[coco_skeleton_idx2].size(); j++) { - cv::Point point2; - point2.x = coor[coco_skeleton_idx2][j].x; - point2.y = coor[coco_skeleton_idx2][j].y; - endpoints[1] = point2; - result = OneSkeletonScore(paf_heatmap[paf_x_idx], paf_heatmap[paf_y_idx], endpoints); - // Keep skeletons with affinity scores greater than 0 and - // valid internal points greater than K_LOCAL_PAF_COUNT_THRESHOLD - if (result[1] <= K_LOCAL_PAF_COUNT_THRESHOLD || result[0] <= 0.0) { - continue; - } - // Store the information of a skeleton in a custom structure PartPair - PartPair skeleton; - skeleton.score = result[0]; - skeleton.partIdx1 = coco_skeleton_idx1; - skeleton.partIdx2 = coco_skeleton_idx2; - skeleton.idx1 = i; - skeleton.idx2 = j; - skeleton.coord1.push_back(point1.x); - skeleton.coord1.push_back(point1.y); - skeleton.coord2.push_back(point2.x); - skeleton.coord2.push_back(point2.y); - skeleton.score1 = coor_score[coco_skeleton_idx1][i]; - skeleton.score2 = coor_score[coco_skeleton_idx2][j]; - connection_temp.push_back(skeleton); - } - } - // For skeletons with the same endpoints, keep the one with larger affinity score - ConntectionNms(&connection_temp, connections); - return APP_ERR_OK; -} - -/** - * @brief Merge a skeleton to an existed person - * @param person_list - Currently existed person list - * @param current_pair - Skeleton to be merged - * @return True if merged successfully, otherwise false - */ -bool MxpiOpenposePostProcess::MergeSkeletonToPerson(std::vector > *person_list, -PartPair current_pair) { - // Use point1 and point2 to represent the two endpoints of a skeleton - for (int k = 0; k < person_list[0].size(); k++) { - std::vector ¤t_person = person_list[0][k]; - for (int i = 0; i < current_person.size(); i++) { - if (current_pair.partIdx1 == current_person[i].partIdx1 && - current_pair.idx1 == current_person[i].idx1) { - // point1 of current skeleton is the same as point1 of a skeleton in current person - current_person.push_back(current_pair); - return true; 
- } else if (current_pair.partIdx1 == current_person[i].partIdx2 && - current_pair.idx1 == current_person[i].idx2) { - // point1 of current skeleton is the same as point2 of a skeleton in current person - current_person.push_back(current_pair); - return true; - } else if (current_pair.partIdx2 == current_person[i].partIdx1 && - current_pair.idx2 == current_person[i].idx1) { - // point2 of current skeleton is the same as point1 of a skeleton in current person - current_person.push_back(current_pair); - return true; - } else if (current_pair.partIdx2 == current_person[i].partIdx2 && - current_pair.idx2 == current_person[i].idx2) { - // point2 of current skeleton is the same as point2 of a skeleton in current person - current_person.push_back(current_pair); - return true; - } - } - } - // Can not merge to any existed person, create new person - std::vector new_person {}; - new_person.push_back(current_pair); - person_list->push_back(new_person); - return true; -} - -/** - * @brief Group keypoints to skeletons and assemble them to person - * @param paf_heatmap - PAF heatmap - * @param coor - Coordinates of all the candidate keypoints - * @param coor_score - Corresponding score of coordinates - * @param person_list - Target vector to store person, each person is stored as a vector of skeletons - * @return APP_ERROR - */ -APP_ERROR MxpiOpenposePostProcess::GroupKeypoints(const std::vector &paf_heatmap, -const std::vector > &coor, const std::vector > &coor_score, -std::vector > *person_list) { - for (int i = 0; i < K_NUM_BODY_PARTS + 1; i++) { - // Choose candidate skeletons for each category, there are a total of kNumBodyPart + 1 categories of skeletons - std::vector part_connections {}; - ScoreSkeletons(i, coor, coor_score, paf_heatmap, &part_connections); - // Merge newly generated skeletons to existed person or create new person - if (i == 0) { - // For the first category, each different skeleton of this category stands for different person - for (int j = 0; j < part_connections.size(); j++) { - std::vector new_person {}; - new_person.push_back(part_connections[j]); - person_list->push_back(new_person); - } - } else if (i == K_NUM_BODY_PARTS - 1 || i == K_NUM_BODY_PARTS) { - // The last two skeletons do not contribute to person score - for (int j = 0; j < part_connections.size(); j++) { - part_connections[j].score = 0; - part_connections[j].score1 = 0; - part_connections[j].score2 = 0; - bool can_merge = MergeSkeletonToPerson(person_list, part_connections[j]); - } - } else { - for (int j = 0; j < part_connections.size(); j++) { - MergeSkeletonToPerson(person_list, part_connections[j]); - } - } - } - return APP_ERR_OK; -} - -/** - * @brief Calculate score of a person according to its skeletons - * @param person - Target person - * @return Score value - */ -float MxpiOpenposePostProcess::PersonScore(const std::vector &person) { - // The score of a person is composed of the scores of all his keypoints and that of all his skeletons - std::vector seen_keypoints = {}; - float person_score = 0.0; - for (int i = 0; i < person.size(); i++) { - PartPair skeleton = person[i]; - if (std::find(seen_keypoints.begin(), seen_keypoints.end(), skeleton.partIdx1) == seen_keypoints.end()) { - seen_keypoints.push_back(skeleton.partIdx1); - person_score += skeleton.score1; - } - if (std::find(seen_keypoints.begin(), seen_keypoints.end(), skeleton.partIdx2) == seen_keypoints.end()) { - seen_keypoints.push_back(skeleton.partIdx2); - person_score += skeleton.score2; - } - person_score += skeleton.score; - } - // 
Ignore person whose number of skeletons is less than K_PERSON_SKELETON_COUNT_THRESHOLD or - // the average score of each keypoint is less than K_PERSON_KEYPOINT_AVG_SCORE_THRESHOLD - if (seen_keypoints.size() < K_PERSON_SKELETON_COUNT_THRESHOLD || - (person_score / seen_keypoints.size()) < K_PERSON_KEYPOINT_AVG_SCORE_THRESHOLD) { - return 0.0; - } - return person_score; -} - -/** - * @brief Prepare output in the format of MxpiPersonList - * @param person_list - Source data in the format of std::vector > - * @param dst_mxpi_person_list - Target data in the format of MxpiPersonList - * @return APP_ERROR - */ -APP_ERROR MxpiOpenposePostProcess::GenerateMxpiOutput(const std::vector > &person_list, -mxpiopenposeproto::MxpiPersonList *dst_mxpi_person_list) { - float float_equal_zero_bias = 0.000001; - for (int k = 0; k < person_list.size(); k++) { - float person_score = PersonScore(person_list[k]); - // Ignore person with score 0 - if (fabs(person_score - 0) < float_equal_zero_bias) { - continue; - } - auto mxpi_person_ptr = dst_mxpi_person_list->add_personinfovec(); - mxpiopenposeproto::MxpiMetaHeader* dst_person_mxpi_metaheader_list = mxpi_person_ptr->add_headervec(); - dst_person_mxpi_metaheader_list->set_datasource(parentName_); - dst_person_mxpi_metaheader_list->set_memberid(0); - mxpi_person_ptr->set_score(person_score); - for (int j = 0; j < person_list[k].size(); j++) { - PartPair skeleton = person_list[k][j]; - auto mxpi_skeleton_ptr = mxpi_person_ptr->add_skeletoninfovec(); - mxpi_skeleton_ptr->set_cocoskeletonindex1(skeleton.partIdx1); - mxpi_skeleton_ptr->set_cocoskeletonindex2(skeleton.partIdx2); - mxpi_skeleton_ptr->set_x0(skeleton.coord1[0]); - mxpi_skeleton_ptr->set_y0(skeleton.coord1[1]); - mxpi_skeleton_ptr->set_point1score(skeleton.score1); - mxpi_skeleton_ptr->set_x1(skeleton.coord2[0]); - mxpi_skeleton_ptr->set_y1(skeleton.coord2[1]); - mxpi_skeleton_ptr->set_point2score(skeleton.score2); - mxpi_skeleton_ptr->set_skeletonscore(skeleton.score); - } - } - return APP_ERR_OK; -} - -/** - * Overall process to generate all person skeleton information - * @param image_decoder_visionListSptr - Source MxpiVisionList containing vision data including size of input image - * @param src_mxpi_tensor_package - Source MxpiTensorPackage containing heatmap data - * @param dst_mxpi_person_list - Target MxpiPersonList containing detection result list - * @return APP_ERROR - */ -APP_ERROR MxpiOpenposePostProcess::GeneratePersonList(const MxpiVisionList image_decoder_visionListSptr, -const MxpiTensorPackageList src_mxpi_tensor_package, mxpiopenposeproto::MxpiPersonList *dst_mxpi_person_list) { - // Get tensor - std::vector tensors = {}; - GetTensors(src_mxpi_tensor_package, &tensors); - // Get size of the input image and the aligned image - std::vector vision_infos = {}; - GetImageSizes(image_decoder_visionListSptr, &vision_infos); - std::vector keypoint_heatmap, paf_heatmap; - // Read data from tensor output by the upstream plugin - std::vector > result = ReadDataFromTensorPytorch(tensors); - keypoint_heatmap = result[0]; - paf_heatmap = result[1]; - // Resize heatmaps to the size of the input image - ResizeHeatmaps(vision_infos, &keypoint_heatmap, &paf_heatmap); - // Extract candidate keypoints - std::vector > coor {}; - std::vector > coor_score {}; - ExtractKeypoints(keypoint_heatmap, &coor, &coor_score); - // Group candidate keypoints to candidate skeletons and generate person - std::vector > person_list {}; - GroupKeypoints(paf_heatmap, coor, coor_score, &person_list); - // Prepare output in 
the format of MxpiPersonList - GenerateMxpiOutput(person_list, dst_mxpi_person_list); - return APP_ERR_OK; -} - -/** - * @brief Initialize configure parameter. - * @param config_param_map - * @return APP_ERROR - */ -APP_ERROR MxpiOpenposePostProcess::Init(std::map> &config_param_map) { - LogInfo << "MxpiOpenposePostProcess::Init start."; - // Get the property values by key - std::shared_ptr parent_name_prop_sptr = std::static_pointer_cast(config_param_map["dataSource"]); - this->parentName_ = *parent_name_prop_sptr.get(); - std::shared_ptr image_decoder_prop_sptr = std::static_pointer_cast(config_param_map["imageSource"]); - imageDecoderName_ = *image_decoder_prop_sptr.get(); - std::shared_ptr input_height_prop_sptr = - std::static_pointer_cast(config_param_map["inputHeight"]); - inputHeight_ = *input_height_prop_sptr.get(); - std::shared_ptr input_width_prop_sptr = - std::static_pointer_cast(config_param_map["inputWidth"]); - inputWidth_ = *input_width_prop_sptr.get(); - return APP_ERR_OK; -} - -/** - * @brief DeInitialize configure parameter. - * @return APP_ERROR - */ -APP_ERROR MxpiOpenposePostProcess::DeInit() { - LogInfo << "MxpiOpenposePostProcess::DeInit end."; - LogInfo << "MxpiOpenposePostProcess::DeInit end."; - return APP_ERR_OK; -} - -/** - * @brief Process the data of MxpiBuffer. - * @param mxpi_buffer - * @return APP_ERROR - */ -APP_ERROR MxpiOpenposePostProcess::Process(std::vector &mxpi_buffer) { - LogInfo << "MxpiOpenposePostProcess::Process start"; - MxpiBuffer *buffer = mxpi_buffer[0]; - MxpiMetadataManager mxpi_metadata_manager(*buffer); - MxpiErrorInfo mxpi_error_info; - ErrorInfo_.str(""); - auto errorInfoPtr = mxpi_metadata_manager.GetErrorInfo(); - if (errorInfoPtr != nullptr) { - ErrorInfo_ << GetError(APP_ERR_COMM_FAILURE, pluginName_) << - "MxpiOpenposePostProcess process is not implemented"; - mxpi_error_info.ret = APP_ERR_COMM_FAILURE; - mxpi_error_info.errorInfo = ErrorInfo_.str(); - SetMxpiErrorInfo(pluginName_, mxpi_error_info, buffer); - LogError << "MxpiOpenposePostProcess process is not implemented"; - return APP_ERR_COMM_FAILURE; - } - // Get the output of tensorinfer from buffer - shared_ptr metadata = mxpi_metadata_manager.GetMetadata(parentName_); - shared_ptr src_mxpi_tensor_packageListSptr - = static_pointer_cast(metadata); - - // Get the output of imagedecoder from buffer - shared_ptr id_metadata = mxpi_metadata_manager.GetMetadata(imageDecoderName_); - shared_ptr image_decoder_visionListSptr - = static_pointer_cast(id_metadata); - - // Generate output - shared_ptr dst_mxpi_person_listSptr = - make_shared(); - APP_ERROR ret = GeneratePersonList(*image_decoder_visionListSptr, - *src_mxpi_tensor_packageListSptr, &(*dst_mxpi_person_listSptr)); - if (ret != APP_ERR_OK) { - ErrorInfo_ << GetError(ret, pluginName_) << "MxpiOpenposePostProcess get skeleton information failed."; - mxpi_error_info.ret = ret; - mxpi_error_info.errorInfo = ErrorInfo_.str(); - SetMxpiErrorInfo(pluginName_, mxpi_error_info, buffer); - return ret; - } - - ret = mxpi_metadata_manager.AddProtoMetadata(pluginName_, static_pointer_cast(dst_mxpi_person_listSptr)); - if (ret != APP_ERR_OK) { - ErrorInfo_ << GetError(ret, pluginName_) << "MxpiOpenposePostProcess add metadata failed."; - mxpi_error_info.ret = ret; - mxpi_error_info.errorInfo = ErrorInfo_.str(); - SetMxpiErrorInfo(pluginName_, mxpi_error_info, buffer); - return ret; - } - - // Send the data to downstream plugin - SendData(0, *buffer); - LogInfo << "MxpiOpenposePostProcess::Process end"; - return APP_ERR_OK; -} - -/** - 
* @brief Definition the parameter of configure properties. - * @return std::vector> - */ -std::vector> MxpiOpenposePostProcess::DefineProperties() { - std::vector> properties; - // Set the type and related information of the properties, and the key is the name - auto parent_name_pro_sptr = std::make_shared>(ElementProperty{ - STRING, "dataSource", "parentName", "the name of previous plugin", "mxpi_modelinfer0", "NULL", "NULL"}); - auto image_decoder_name_pro_sptr = std::make_shared>(ElementProperty{ - STRING, "imageSource", "inputName", "the name of imagedecoder", "mxpi_imagedecoder0", "NULL", "NULL"}); - auto input_height_pro_sptr = std::make_shared>(ElementProperty{ - UINT, "inputHeight", "inputHeightValue", "the height of the input image", 368, 0, 1000}); - auto input_width_pro_sptr = std::make_shared>(ElementProperty{ - UINT, "inputWidth", "inputWidthValue", "the width of the input image", 368, 0, 1000}); - properties.push_back(image_decoder_name_pro_sptr); - properties.push_back(parent_name_pro_sptr); - properties.push_back(input_height_pro_sptr); - properties.push_back(input_width_pro_sptr); - return properties; -} - -APP_ERROR MxpiOpenposePostProcess::SetMxpiErrorInfo(const std::string &plugin_name, - const MxpiErrorInfo &mxpi_error_info, MxpiBuffer *buffer) { - APP_ERROR ret = APP_ERR_OK; - // Define an object of MxpiMetadataManager - MxpiMetadataManager mxpi_metadata_manager(*buffer); - ret = mxpi_metadata_manager.AddErrorInfo(plugin_name, mxpi_error_info); - if (ret != APP_ERR_OK) { - LogError << "Failed to AddErrorInfo."; - return ret; - } - ret = SendData(0, *buffer); - return ret; -} - -// Register the Sample plugin through macro -MX_PLUGIN_GENERATE(MxpiOpenposePostProcess) +/* +* Copyright (c) 2021. Huawei Technologies Co., Ltd. All rights reserved. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include "MxpiOpenposePostProcess.h" +#include +#include +#include +#include +#include +#include +#include "opencv2/opencv.hpp" +#include "MxBase/Log/Log.h" +#include "MxBase/Tensor/TensorBase/TensorBase.h" + +using namespace MxBase; +using namespace MxPlugins; +using namespace MxTools; +using namespace std; +using namespace cv; + +namespace { + auto uint8Deleter = [] (uint8_t *p) { }; + const int K_NUM_BODY_PARTS = 18; + const int K_UPSAMPLED_STRIDE = 8; + + // CocoSkeletonsNetwork + const std::vector K_POSE_MAP_INDEX { + 12, 13, 20, 21, 14, 15, 16, 17, 22, 23, + 24, 25, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 28, 29, 30, 31, 34, 35, + 32, 33, 36, 37, 18, 19, 26, 27 + }; + + // CocoSkeletons + const std::vector K_POSE_BODY_PART_SKELETONS { + 1, 2, 1, 5, 2, 3, 3, 4, 5, 6, + 6, 7, 1, 8, 8, 9, 9, 10, 1, 11, + 11, 12, 12, 13, 1, 0, 0, 14, 14, 16, + 0, 15, 15, 17, 2, 16, 5, 17 + }; + // Nms score threshold + const float K_NMS_THRESHOLD = 0.05; + // Range of nearest neighbors + const int K_NEAREST_KEYPOINTS_THRESHOLD = 6; + // PAF score threshold as a valid inner point on a skeleton + const float K_LOCAL_PAF_SCORE_THRESHOLD = 0.05; + // The minimum number of valid inner points a skeleton includes to be regarded as a correct skeleton + const int K_LOCAL_PAF_COUNT_THRESHOLD = 8; + // The minimum number of skeletons needed to form a person + const int K_PERSON_SKELETON_COUNT_THRESHOLD = 3; + // The lowest average score per keypoint in a person + const float K_PERSON_KEYPOINT_AVG_SCORE_THRESHOLD = 0.2; +} // namespace + +/** + * @brief decode MxpiTensorPackageList + * @param tensorPackageList - Source tensorPackageList + * @param tensors - Target TensorBase data + */ +static void GetTensors(const MxTools::MxpiTensorPackageList tensor_package_list, +std::vector *tensors) { + for (int i = 0; i < tensor_package_list.tensorpackagevec_size(); ++i) { + for (int j = 0; j < tensor_package_list.tensorpackagevec(i).tensorvec_size(); j++) { + MxBase::MemoryData memory_data = {}; + memory_data.deviceId = tensor_package_list.tensorpackagevec(i).tensorvec(j).deviceid(); + memory_data.type = (MxBase::MemoryData::MemoryType)tensor_package_list. + tensorpackagevec(i).tensorvec(j).memtype(); + memory_data.size = (uint32_t)tensor_package_list. + tensorpackagevec(i).tensorvec(j).tensordatasize(); + memory_data.ptrData = reinterpret_cast(tensor_package_list. + tensorpackagevec(i).tensorvec(j).tensordataptr()); + std::vector output_shape = {}; + for (int k = 0; k < tensor_package_list. + tensorpackagevec(i).tensorvec(j).tensorshape_size(); ++k) { + output_shape.push_back((uint32_t)tensor_package_list. + tensorpackagevec(i).tensorvec(j).tensorshape(k)); + } + MxBase::TensorBase tmp_tensor(memory_data, true, output_shape, + (MxBase::TensorDataType)tensor_package_list. 
+ tensorpackagevec(i).tensorvec(j).tensordatatype()); + tensors->push_back(tmp_tensor); + } + } +} + +/** + * @brief Get information of origin input image and aligned image + * @param vision_list - MxpiVisionList object obtained from upstream plugin + * @param vision_infos - Result vector with elements [origin_height, origin_width, aligned_height, aligned_width] + */ +static void GetImageSizes(const MxTools::MxpiVisionList vision_list, std::vector *vision_infos) { + MxpiVision vision = vision_list.visionvec(0); + MxpiVisionInfo vision_info = vision.visioninfo(); + vision_infos->push_back(vision_info.height()); + vision_infos->push_back(vision_info.width()); + vision_infos->push_back(vision_info.heightaligned()); + vision_infos->push_back(vision_info.widthaligned()); +} + +/** + * @brief Parsing TensorBase data to keypoint heatmap and PAF heatmap of openpose model + * @param tensors - TensorBase vector + * @return Two-element vector, keeping keypoint heatmap and paf heatmap respectively + */ +static std::vector > ReadDataFromTensorPytorch(const std::vector &tensors) { + auto shape = tensors[11].GetShape(); + int channel_keypoint = shape[1]; + int height_index = 2, width_index = 3; + int height = shape[height_index]; + int width = shape[width_index]; + auto shape_p = tensors[5].GetShape(); + int channel_paf = shape_p[1]; + // Read keypoint data + auto dataPtr = reinterpret_cast(tensors[11].GetBuffer()); + std::shared_ptr keypoint_pointer; + keypoint_pointer.reset(dataPtr, uint8Deleter); + std::vector keypoint_heatmap {}; + int idx = 0; + for (int i = 0; i < channel_keypoint; i++) { + cv::Mat single_channel_mat(height, width, CV_32FC1, cv::Scalar(0)); + for (int j = 0; j < height; j++) { + float *ptr = single_channel_mat.ptr(j); + for (int k = 0; k < width; k++) { + ptr[k] = static_cast(keypoint_pointer.get())[idx]; + idx += 1; + } + } + keypoint_heatmap.push_back(single_channel_mat); + } + // Read PAF data + auto data_paf_ptr = reinterpret_cast(tensors[5].GetBuffer()); + std::shared_ptr paf_pointer; + paf_pointer.reset(data_paf_ptr, uint8Deleter); + std::vector paf_heatmap {}; + idx = 0; + for (int i = 0; i < channel_paf; i++) { + cv::Mat single_channel_mat(height, width, CV_32FC1, cv::Scalar(0)); + for (int j = 0; j < height; j++) { + float *ptr = single_channel_mat.ptr(j); + for (int k = 0; k < width; k++) { + ptr[k] = static_cast(paf_pointer.get())[idx]; + idx += 1; + } + } + paf_heatmap.push_back(single_channel_mat); + } + std::vector > result = {keypoint_heatmap, paf_heatmap}; + return result; +} + +/** + * @brief Non-Maximum Suppression, keep points that is greater than all its four surround points, + * i.e. 
up, bottom, left and right points + * @param plain - 2D data for NMS + * @param threshold - NMS threshold + */ +static void NMS(cv::Mat *plain, float threshold) { + cv::GaussianBlur(*plain, *plain, cv::Size(17, 17), 2.5, 2.5); + // Keep points with score below the NMS score threshold are set to 0 + plain->setTo(0, *plain < threshold); + // Find points that is greater than all its four surround points + cv::Mat plain_with_border; + int border_padding = 2; + int bottom_right_index = 2; + cv::copyMakeBorder(*plain, plain_with_border, border_padding, border_padding, border_padding, border_padding, + BORDER_CONSTANT, cv::Scalar(0)); + cv::Mat plain_with_border_clone = plain_with_border.clone(); + int sub_mat_cols = plain_with_border.cols - border_padding; + int sub_mat_rows = plain_with_border.rows - border_padding; + cv::Mat plain_center = plain_with_border(cv::Rect(1, 1, sub_mat_cols, sub_mat_rows)); + cv::Mat plain_bottom = plain_with_border(cv::Rect(1, bottom_right_index, sub_mat_cols, sub_mat_rows)); + cv::Mat plain_up = plain_with_border(cv::Rect(1, 0, sub_mat_cols, sub_mat_rows)); + cv::Mat plain_left = plain_with_border(cv::Rect(0, 1, sub_mat_cols, sub_mat_rows)); + cv::Mat plain_right = plain_with_border(cv::Rect(bottom_right_index, 1, sub_mat_cols, sub_mat_rows)); + + for (int i = 0; i < plain_center.rows; i++) { + float *center_ptr = plain_center.ptr(i); + float *bottom_ptr = plain_bottom.ptr(i); + float *up_ptr = plain_up.ptr(i); + float *left_ptr = plain_left.ptr(i); + float *right_ptr = plain_right.ptr(i); + float *clone_border_ptr = plain_with_border_clone.ptr(i + 1); + for (int j = 0; j < plain_center.cols; j++) { + if (!((center_ptr[j] > up_ptr[j]) && (center_ptr[j] > bottom_ptr[j]) && + (center_ptr[j] > left_ptr[j]) && (center_ptr[j] > right_ptr[j]))) { + clone_border_ptr[j + 1] = 0; + } + } + } + *plain = plain_with_border_clone(cv::Rect(border_padding, border_padding, plain_center.cols - border_padding, + plain_center.rows - border_padding)).clone(); +} + +/** + * @brief Comparison between two PartPair elements + * @param p1 - PartPair p1 + * @param p2 - PartPair p2 + * @return True if the score of p1 is greater than that of p2 + */ +static bool GreaterSort(PartPair p1, PartPair p2) { + return p1.score > p2.score; +} + +/** + * @brief Comparison between two cv::Point elements + * @param p1 - cv::Point p1 + * @param p2 - cv::Point p2 + * @return True if the x coordinate of p2 is greater than that of p1 + */ +static bool PointSort(cv::Point p1, cv::Point p2) { + return p1.x < p2.x; +} + +/** + * @brief Resize output heatmaps to the size of the origin image + * @param keypoint_heatmap - Keypoint heatmap, each channel of the heatmap is stored as a Mat + * @param paf_heatmap - PAF heatmap, each channel of the heatmap is stored as a Mat + * @param vision_infos - Vision infos of origin image and aligned image + * @return APP_ERROR + */ +APP_ERROR MxpiOpenposePostProcess::ResizeHeatmaps(const std::vector &vision_infos, + std::vector *keypoint_heatmap, std::vector *paf_heatmap) { + // Calculate padding direction and padding value + int origin_height = vision_infos[0]; + int origin_width = vision_infos[1]; + /* + std::cout << "origin_height= " << origin_height << std::endl; + std::cout << "origin_width= " << origin_width << std::endl; + std::cout << "inputHeight_= " << inputHeight_ << std::endl; + std::cout << "inputWidth_= " << inputWidth_ << std::endl; + */ + // padding along height + int padding_direction = 0; + if (origin_height > origin_width) { + // padding along width + 
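        // Editorial worked example (not part of the patch), assuming the plugin defaults
        // inputHeight_ = inputWidth_ = 368 and the upsample stride 8 declared in this file:
        // for a 480x640 (width x height) portrait frame, origin_height > origin_width, so this
        // branch treats the width as the padded side; padding_value below evaluates to
        // floor(368 - 368 * 480 / 640) = 92 columns, which are cropped from the upsampled
        // heatmap (368x368 under these assumptions) before it is resized back to 480x640.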
padding_direction = 1; + } + int padding_value = 0; + if (padding_direction == 0) { + // pad height + padding_value = floor(inputHeight_ - inputWidth_ * origin_height / origin_width); + } else { + // pad width + padding_value = floor(inputWidth_ - inputHeight_ * origin_width / origin_height); + } + + // Channel Split Resize + for (int i = 0; i < keypoint_heatmap[0].size(); i++) { + cv::Mat single_channel_mat = keypoint_heatmap[0][i]; + cv::resize(single_channel_mat, single_channel_mat, Size(0, 0), + K_UPSAMPLED_STRIDE, K_UPSAMPLED_STRIDE, INTER_CUBIC); + if (padding_direction == 0) { + // remove height padding + single_channel_mat = + single_channel_mat(cv::Rect(0, 0, + single_channel_mat.cols, single_channel_mat.rows - padding_value)); + } else { + // remove width padding + single_channel_mat = + single_channel_mat(cv::Rect(0, 0, + single_channel_mat.cols - padding_value, single_channel_mat.rows)); + } + cv::resize(single_channel_mat, single_channel_mat, Size(origin_width, origin_height), 0, 0); + keypoint_heatmap[0][i] = single_channel_mat; + } + for (int i = 0; i < paf_heatmap[0].size(); i++) { + cv::Mat single_channel_mat = paf_heatmap[0][i]; + cv::resize(single_channel_mat, single_channel_mat, Size(0, 0), + K_UPSAMPLED_STRIDE, K_UPSAMPLED_STRIDE); + if (padding_direction == 0) { + single_channel_mat = + single_channel_mat(cv::Rect(0, 0, + single_channel_mat.cols, single_channel_mat.rows - padding_value)); + } else { + single_channel_mat = + single_channel_mat(cv::Rect(0, 0, + single_channel_mat.cols - padding_value, single_channel_mat.rows)); + } + cv::resize(single_channel_mat, single_channel_mat, Size(origin_width, origin_height), 0, 0); + paf_heatmap[0][i] = single_channel_mat; + } + return APP_ERR_OK; +} + +/** + * @brief Extract candidate keypoints + * @param keypoint_heatmap - Resized keypoint heatmap + * @param coor - Keep extracted result, store a point in a cv::Point object, + * store keypoints of different channel in different vectors + * @param coor_score - Scores corresponding to extracted keypoints + * @return APP_ERROR + */ +APP_ERROR MxpiOpenposePostProcess::ExtractKeypoints(const std::vector &keypoint_heatmap, +std::vector > *coor, std::vector > *coor_score) { + int polynomial_exponent = 2; + int peak_counter = 0; + for (int i = 0; i < keypoint_heatmap.size() - 1; i++) { + // NMS + cv::Mat smoothProbMap; + cv::GaussianBlur(keypoint_heatmap[i], smoothProbMap, cv::Size(17, 17), 2.5, 2.5); + + // + NMS(&smoothProbMap, K_NMS_THRESHOLD); + std::vector non_zero_coordinates; + // + cv::findNonZero(smoothProbMap, non_zero_coordinates); + std::sort(non_zero_coordinates.begin(), non_zero_coordinates.end(), PointSort); + std::vector suppressed(non_zero_coordinates.size(), 0); + std::vector keypoints_without_nearest {}; + std::vector keypoints_score {}; + // Remove other keypoints within a certain range around one keypoints + for (int j = 0; j < non_zero_coordinates.size(); j++) { + if (suppressed[j]) { + continue; + } + int thrown_index = j + 1; + auto it = std::find_if(std::begin(non_zero_coordinates) + j + 1, std::end(non_zero_coordinates), + [non_zero_coordinates, j, polynomial_exponent](cv::Point p) { + float distance = powf((non_zero_coordinates[j].x - p.x), polynomial_exponent) + + powf((non_zero_coordinates[j].y - p.y), polynomial_exponent); + return sqrtf(distance) < K_NEAREST_KEYPOINTS_THRESHOLD; + }); + while (it != std::end(non_zero_coordinates)) { + thrown_index = std::distance(std::begin(non_zero_coordinates) + thrown_index, it) + thrown_index; + 
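                // Editorial note (not part of the patch): std::distance(begin + thrown_index, it)
                // plus thrown_index is simply the absolute index of *it inside non_zero_coordinates,
                // so every later peak lying within K_NEAREST_KEYPOINTS_THRESHOLD of peak j gets
                // flagged below and is then skipped by the outer loop.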
suppressed[thrown_index] = 1; + it = std::find_if(std::next(it), std::end(non_zero_coordinates), + [non_zero_coordinates, j, polynomial_exponent](cv::Point p) { + float distance = powf((non_zero_coordinates[j].x - p.x), polynomial_exponent) + + powf((non_zero_coordinates[j].y - p.y), polynomial_exponent); + return sqrtf(distance) < K_NEAREST_KEYPOINTS_THRESHOLD; + }); + } + keypoints_without_nearest.push_back(non_zero_coordinates[j]); + // + keypoints_score.push_back(smoothProbMap.at + (non_zero_coordinates[j].y, non_zero_coordinates[j].x)); + } + coor->push_back(keypoints_without_nearest); + coor_score->push_back(keypoints_score); + } + return APP_ERR_OK; +} + +/** + * @brief Compute expected confidence for each candidate skeleton + * @param endpoints - Coordinates of the two end points of a skeleton + * @param paf_x - PAF heatmap of x coordinate + * @param paf_y - PAF heatmap of y coordinate + * @return result - Keep confidence information of this skeleton in the form: + * [confidence score, number of successfully hit sub points] + */ +std::vector MxpiOpenposePostProcess::OneSkeletonScore(const cv::Mat &paf_x, const cv::Mat &paf_y, + const std::vector &endpoints) { + int x1 = endpoints[0].x, y1 = endpoints[0].y; + int x2 = endpoints[1].x, y2 = endpoints[1].y; + // affinity score of this skeleton + float score = 0; + // count: number of valid inner points on this skeleton + int count = 0, num_inter = 10; + float dx = x2 - x1; + float dy = y2 - y1; + float norm_vec = sqrt(dx * dx + dy * dy); + float vx = dx / (norm_vec + 1e-6); + float vy = dy / (norm_vec + 1e-6); + // generate 10 points equally spaced on this skeleton + std::vector xs {}; + float step_x = dx / (num_inter - 1); + for (int k = 0; k < num_inter; k++) { + float temp_x = x1 + k * step_x; + xs.push_back(round(temp_x)); + } + std::vector ys {}; + float step_y = dy / (num_inter - 1); + for (int k = 0; k < num_inter; k++) { + float temp_y = y1 + k * step_y; + ys.push_back(round(temp_y)); + } + std::vector sub_score_vec; + // calculate PAF value of each inner point + for (int i = 0; i < xs.size(); i++) { + float sub_score = paf_x.at(ys[i], xs[i]) * vx + paf_y.at(ys[i], xs[i]) * vy; + sub_score_vec.push_back(sub_score); + } + // remove inner points such that has PAF value < K_LOCAL_PAF_SCORE_THRESHOLD + sub_score_vec.erase(std::remove_if( + sub_score_vec.begin(), sub_score_vec.end(), + [](const float &x) { + return x <= K_LOCAL_PAF_SCORE_THRESHOLD; + }), sub_score_vec.end()); + std::vector result {0.0, 0.0}; + score = std::accumulate(sub_score_vec.begin(), sub_score_vec.end(), 0.0); + count = sub_score_vec.size(); + result[0] = score / (count + 1e-6); + result[1] = count; + return result; +} + +/** + * @brief Remove conflict skeletons + * @param src - Source vector that stores skeletons to be processed + * @param dst - Target vector that collects candidate skeletons + * @return APP_ERROR + */ +APP_ERROR MxpiOpenposePostProcess::ConntectionNms(std::vector *src, std::vector *dst) { + // Remove conflict skeletons, if two skeletons of the same type share a same end point, they are conflict + std::vector used_idx1 {}; + std::vector used_idx2 {}; + // Sort skeletons in ascending order of affinity score + std::sort(src[0].begin(), src[0].end(), GreaterSort); + for (int i = 0; i < src[0].size(); i++) { + PartPair candidate = src[0][i]; + if (std::find(used_idx1.begin(), used_idx1.end(), candidate.idx1) != used_idx1.end() + || std::find(used_idx2.begin(), used_idx2.end(), candidate.idx2) != used_idx2.end()) { + continue; + } + 
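        // Editorial note (not part of the patch): this is the greedy assignment step of the
        // OpenPose grouping scheme. GreaterSort above orders candidates by descending score
        // (despite the "ascending" wording in the comment), so the best-scoring limb claims a
        // pair of endpoints first, and any lower-scoring limb that shares either endpoint is
        // dropped by the check above.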
dst->push_back(candidate); + used_idx1.push_back(candidate.idx1); + used_idx2.push_back(candidate.idx2); + } + return APP_ERR_OK; +} + +/** + * @brief Calculate expected confidence of each possible skeleton and choose candidates + * @param part_idx - Index of skeleton in K_POSE_BODY_PART_SKELETONS + * @param coor - Candidate positions of endpoints + * @param coor_score - Corresponding score of coor + * @param paf_heatmap - PAF heatmap + * @param connections - Target vector that collects candidate skeletons + * @return APP_ERROR + */ +APP_ERROR MxpiOpenposePostProcess::ScoreSkeletons(const int part_idx, + const std::vector > &coor, const std::vector > &coor_score, + const std::vector &paf_heatmap, std::vector *connections) { + // Use point1 and point2 to represent the two endpoints of a skeleton + int coco_skeleton_idx1 = K_POSE_BODY_PART_SKELETONS[2 * part_idx]; + int coco_skeleton_idx2 = K_POSE_BODY_PART_SKELETONS[2 * part_idx + 1]; + int index_stride = 2; + int end_point_num = 2; + int paf_x_idx = K_POSE_MAP_INDEX[index_stride * part_idx]; + int paf_y_idx = K_POSE_MAP_INDEX[index_stride * part_idx + 1]; + std::vector endpoints(end_point_num, cv::Point(0, 0)); + std::vector connection_temp {}; + std::vector result {0.0, 0.0}; + // Calculate the affinity score of each skeleton composed of all candidate point1 and point2 + for (int i = 0; i < coor[coco_skeleton_idx1].size(); i++) { + cv::Point point1; + point1.x = coor[coco_skeleton_idx1][i].x; + point1.y = coor[coco_skeleton_idx1][i].y; + endpoints[0] = point1; + for (int j = 0; j < coor[coco_skeleton_idx2].size(); j++) { + cv::Point point2; + point2.x = coor[coco_skeleton_idx2][j].x; + point2.y = coor[coco_skeleton_idx2][j].y; + endpoints[1] = point2; + result = OneSkeletonScore(paf_heatmap[paf_x_idx], paf_heatmap[paf_y_idx], endpoints); + // Keep skeletons with affinity scores greater than 0 and + // valid internal points greater than K_LOCAL_PAF_COUNT_THRESHOLD + if (result[1] <= K_LOCAL_PAF_COUNT_THRESHOLD || result[0] <= 0.0) { + continue; + } + // Store the information of a skeleton in a custom structure PartPair + PartPair skeleton; + skeleton.score = result[0]; + skeleton.partIdx1 = coco_skeleton_idx1; + skeleton.partIdx2 = coco_skeleton_idx2; + skeleton.idx1 = i; + skeleton.idx2 = j; + skeleton.coord1.push_back(point1.x); + skeleton.coord1.push_back(point1.y); + skeleton.coord2.push_back(point2.x); + skeleton.coord2.push_back(point2.y); + skeleton.score1 = coor_score[coco_skeleton_idx1][i]; + skeleton.score2 = coor_score[coco_skeleton_idx2][j]; + connection_temp.push_back(skeleton); + } + } + // For skeletons with the same endpoints, keep the one with larger affinity score + ConntectionNms(&connection_temp, connections); + return APP_ERR_OK; +} + +/** + * @brief Merge a skeleton to an existed person + * @param person_list - Currently existed person list + * @param current_pair - Skeleton to be merged + * @return True if merged successfully, otherwise false + */ +bool MxpiOpenposePostProcess::MergeSkeletonToPerson(std::vector > *person_list, +PartPair current_pair) { + // Use point1 and point2 to represent the two endpoints of a skeleton + for (int k = 0; k < person_list[0].size(); k++) { + std::vector ¤t_person = person_list[0][k]; + for (int i = 0; i < current_person.size(); i++) { + if (current_pair.partIdx1 == current_person[i].partIdx1 && + current_pair.idx1 == current_person[i].idx1) { + // point1 of current skeleton is the same as point1 of a skeleton in current person + current_person.push_back(current_pair); + return true; 
+ } else if (current_pair.partIdx1 == current_person[i].partIdx2 && + current_pair.idx1 == current_person[i].idx2) { + // point1 of current skeleton is the same as point2 of a skeleton in current person + current_person.push_back(current_pair); + return true; + } else if (current_pair.partIdx2 == current_person[i].partIdx1 && + current_pair.idx2 == current_person[i].idx1) { + // point2 of current skeleton is the same as point1 of a skeleton in current person + current_person.push_back(current_pair); + return true; + } else if (current_pair.partIdx2 == current_person[i].partIdx2 && + current_pair.idx2 == current_person[i].idx2) { + // point2 of current skeleton is the same as point2 of a skeleton in current person + current_person.push_back(current_pair); + return true; + } + } + } + // Can not merge to any existed person, create new person + std::vector new_person {}; + new_person.push_back(current_pair); + person_list->push_back(new_person); + return true; +} + +/** + * @brief Group keypoints to skeletons and assemble them to person + * @param paf_heatmap - PAF heatmap + * @param coor - Coordinates of all the candidate keypoints + * @param coor_score - Corresponding score of coordinates + * @param person_list - Target vector to store person, each person is stored as a vector of skeletons + * @return APP_ERROR + */ +APP_ERROR MxpiOpenposePostProcess::GroupKeypoints(const std::vector &paf_heatmap, +const std::vector > &coor, const std::vector > &coor_score, +std::vector > *person_list) { + for (int i = 0; i < K_NUM_BODY_PARTS + 1; i++) { + // Choose candidate skeletons for each category, there are a total of kNumBodyPart + 1 categories of skeletons + std::vector part_connections {}; + ScoreSkeletons(i, coor, coor_score, paf_heatmap, &part_connections); + // Merge newly generated skeletons to existed person or create new person + if (i == 0) { + // For the first category, each different skeleton of this category stands for different person + for (int j = 0; j < part_connections.size(); j++) { + std::vector new_person {}; + new_person.push_back(part_connections[j]); + person_list->push_back(new_person); + } + } else if (i == K_NUM_BODY_PARTS - 1 || i == K_NUM_BODY_PARTS) { + // The last two skeletons do not contribute to person score + for (int j = 0; j < part_connections.size(); j++) { + part_connections[j].score = 0; + part_connections[j].score1 = 0; + part_connections[j].score2 = 0; + bool can_merge = MergeSkeletonToPerson(person_list, part_connections[j]); + } + } else { + for (int j = 0; j < part_connections.size(); j++) { + MergeSkeletonToPerson(person_list, part_connections[j]); + } + } + } + return APP_ERR_OK; +} + +/** + * @brief Calculate score of a person according to its skeletons + * @param person - Target person + * @return Score value + */ +float MxpiOpenposePostProcess::PersonScore(const std::vector &person) { + // The score of a person is composed of the scores of all his keypoints and that of all his skeletons + std::vector seen_keypoints = {}; + float person_score = 0.0; + for (int i = 0; i < person.size(); i++) { + PartPair skeleton = person[i]; + if (std::find(seen_keypoints.begin(), seen_keypoints.end(), skeleton.partIdx1) == seen_keypoints.end()) { + seen_keypoints.push_back(skeleton.partIdx1); + person_score += skeleton.score1; + } + if (std::find(seen_keypoints.begin(), seen_keypoints.end(), skeleton.partIdx2) == seen_keypoints.end()) { + seen_keypoints.push_back(skeleton.partIdx2); + person_score += skeleton.score2; + } + person_score += skeleton.score; + } + // 
Ignore person whose number of skeletons is less than K_PERSON_SKELETON_COUNT_THRESHOLD or + // the average score of each keypoint is less than K_PERSON_KEYPOINT_AVG_SCORE_THRESHOLD + if (seen_keypoints.size() < K_PERSON_SKELETON_COUNT_THRESHOLD || + (person_score / seen_keypoints.size()) < K_PERSON_KEYPOINT_AVG_SCORE_THRESHOLD) { + return 0.0; + } + return person_score; +} + +/** + * @brief Prepare output in the format of MxpiPersonList + * @param person_list - Source data in the format of std::vector > + * @param dst_mxpi_person_list - Target data in the format of MxpiPersonList + * @return APP_ERROR + */ +APP_ERROR MxpiOpenposePostProcess::GenerateMxpiOutput(const std::vector > &person_list, +mxpiopenposeproto::MxpiPersonList *dst_mxpi_person_list) { + float float_equal_zero_bias = 0.000001; + for (int k = 0; k < person_list.size(); k++) { + float person_score = PersonScore(person_list[k]); + // Ignore person with score 0 + if (fabs(person_score - 0) < float_equal_zero_bias) { + continue; + } + auto mxpi_person_ptr = dst_mxpi_person_list->add_personinfovec(); + mxpiopenposeproto::MxpiMetaHeader* dst_person_mxpi_metaheader_list = mxpi_person_ptr->add_headervec(); + dst_person_mxpi_metaheader_list->set_datasource(parentName_); + dst_person_mxpi_metaheader_list->set_memberid(0); + mxpi_person_ptr->set_score(person_score); + for (int j = 0; j < person_list[k].size(); j++) { + PartPair skeleton = person_list[k][j]; + auto mxpi_skeleton_ptr = mxpi_person_ptr->add_skeletoninfovec(); + mxpi_skeleton_ptr->set_cocoskeletonindex1(skeleton.partIdx1); + mxpi_skeleton_ptr->set_cocoskeletonindex2(skeleton.partIdx2); + mxpi_skeleton_ptr->set_x0(skeleton.coord1[0]); + mxpi_skeleton_ptr->set_y0(skeleton.coord1[1]); + mxpi_skeleton_ptr->set_point1score(skeleton.score1); + mxpi_skeleton_ptr->set_x1(skeleton.coord2[0]); + mxpi_skeleton_ptr->set_y1(skeleton.coord2[1]); + mxpi_skeleton_ptr->set_point2score(skeleton.score2); + mxpi_skeleton_ptr->set_skeletonscore(skeleton.score); + } + } + return APP_ERR_OK; +} + +/** + * Overall process to generate all person skeleton information + * @param image_decoder_visionListSptr - Source MxpiVisionList containing vision data including size of input image + * @param src_mxpi_tensor_package - Source MxpiTensorPackage containing heatmap data + * @param dst_mxpi_person_list - Target MxpiPersonList containing detection result list + * @return APP_ERROR + */ +APP_ERROR MxpiOpenposePostProcess::GeneratePersonList(const MxpiVisionList image_decoder_visionListSptr, +const MxpiTensorPackageList src_mxpi_tensor_package, mxpiopenposeproto::MxpiPersonList *dst_mxpi_person_list) { + // Get tensor + std::vector tensors = {}; + GetTensors(src_mxpi_tensor_package, &tensors); + // Get size of the input image and the aligned image + std::vector vision_infos = {}; + GetImageSizes(image_decoder_visionListSptr, &vision_infos); + std::vector keypoint_heatmap, paf_heatmap; + // Read data from tensor output by the upstream plugin + std::vector > result = ReadDataFromTensorPytorch(tensors); + keypoint_heatmap = result[0]; + paf_heatmap = result[1]; + // Resize heatmaps to the size of the input image + ResizeHeatmaps(vision_infos, &keypoint_heatmap, &paf_heatmap); + // Extract candidate keypoints + std::vector > coor {}; + std::vector > coor_score {}; + ExtractKeypoints(keypoint_heatmap, &coor, &coor_score); + // Group candidate keypoints to candidate skeletons and generate person + std::vector > person_list {}; + GroupKeypoints(paf_heatmap, coor, coor_score, &person_list); + // Prepare output in 
the format of MxpiPersonList + GenerateMxpiOutput(person_list, dst_mxpi_person_list); + return APP_ERR_OK; +} + +/** + * @brief Initialize configure parameter. + * @param config_param_map + * @return APP_ERROR + */ +APP_ERROR MxpiOpenposePostProcess::Init(std::map> &config_param_map) { + LogInfo << "MxpiOpenposePostProcess::Init start."; + // Get the property values by key + std::shared_ptr parent_name_prop_sptr = std::static_pointer_cast(config_param_map["dataSource"]); + this->parentName_ = *parent_name_prop_sptr.get(); + std::shared_ptr image_decoder_prop_sptr = std::static_pointer_cast(config_param_map["imageSource"]); + imageDecoderName_ = *image_decoder_prop_sptr.get(); + std::shared_ptr input_height_prop_sptr = + std::static_pointer_cast(config_param_map["inputHeight"]); + inputHeight_ = *input_height_prop_sptr.get(); + std::shared_ptr input_width_prop_sptr = + std::static_pointer_cast(config_param_map["inputWidth"]); + inputWidth_ = *input_width_prop_sptr.get(); + return APP_ERR_OK; +} + +/** + * @brief DeInitialize configure parameter. + * @return APP_ERROR + */ +APP_ERROR MxpiOpenposePostProcess::DeInit() { + LogInfo << "MxpiOpenposePostProcess::DeInit end."; + LogInfo << "MxpiOpenposePostProcess::DeInit end."; + return APP_ERR_OK; +} + +/** + * @brief Process the data of MxpiBuffer. + * @param mxpi_buffer + * @return APP_ERROR + */ +APP_ERROR MxpiOpenposePostProcess::Process(std::vector &mxpi_buffer) { + LogInfo << "MxpiOpenposePostProcess::Process start"; + MxpiBuffer *buffer = mxpi_buffer[0]; + MxpiMetadataManager mxpi_metadata_manager(*buffer); + MxpiErrorInfo mxpi_error_info; + ErrorInfo_.str(""); + auto errorInfoPtr = mxpi_metadata_manager.GetErrorInfo(); + if (errorInfoPtr != nullptr) { + ErrorInfo_ << GetError(APP_ERR_COMM_FAILURE, pluginName_) << + "MxpiOpenposePostProcess process is not implemented"; + mxpi_error_info.ret = APP_ERR_COMM_FAILURE; + mxpi_error_info.errorInfo = ErrorInfo_.str(); + SetMxpiErrorInfo(pluginName_, mxpi_error_info, buffer); + LogError << "MxpiOpenposePostProcess process is not implemented"; + return APP_ERR_COMM_FAILURE; + } + // Get the output of tensorinfer from buffer + shared_ptr metadata = mxpi_metadata_manager.GetMetadata(parentName_); + shared_ptr src_mxpi_tensor_packageListSptr + = static_pointer_cast(metadata); + + // Get the output of imagedecoder from buffer + shared_ptr id_metadata = mxpi_metadata_manager.GetMetadata(imageDecoderName_); + shared_ptr image_decoder_visionListSptr + = static_pointer_cast(id_metadata); + + // Generate output + shared_ptr dst_mxpi_person_listSptr = + make_shared(); + APP_ERROR ret = GeneratePersonList(*image_decoder_visionListSptr, + *src_mxpi_tensor_packageListSptr, &(*dst_mxpi_person_listSptr)); + if (ret != APP_ERR_OK) { + ErrorInfo_ << GetError(ret, pluginName_) << "MxpiOpenposePostProcess get skeleton information failed."; + mxpi_error_info.ret = ret; + mxpi_error_info.errorInfo = ErrorInfo_.str(); + SetMxpiErrorInfo(pluginName_, mxpi_error_info, buffer); + return ret; + } + + ret = mxpi_metadata_manager.AddProtoMetadata(pluginName_, static_pointer_cast(dst_mxpi_person_listSptr)); + if (ret != APP_ERR_OK) { + ErrorInfo_ << GetError(ret, pluginName_) << "MxpiOpenposePostProcess add metadata failed."; + mxpi_error_info.ret = ret; + mxpi_error_info.errorInfo = ErrorInfo_.str(); + SetMxpiErrorInfo(pluginName_, mxpi_error_info, buffer); + return ret; + } + + // Send the data to downstream plugin + SendData(0, *buffer); + LogInfo << "MxpiOpenposePostProcess::Process end"; + return APP_ERR_OK; +} + +/** + 
* @brief Definition the parameter of configure properties. + * @return std::vector> + */ +std::vector> MxpiOpenposePostProcess::DefineProperties() { + std::vector> properties; + // Set the type and related information of the properties, and the key is the name + auto parent_name_pro_sptr = std::make_shared>(ElementProperty{ + STRING, "dataSource", "parentName", "the name of previous plugin", "mxpi_modelinfer0", "NULL", "NULL"}); + auto image_decoder_name_pro_sptr = std::make_shared>(ElementProperty{ + STRING, "imageSource", "inputName", "the name of imagedecoder", "mxpi_imagedecoder0", "NULL", "NULL"}); + auto input_height_pro_sptr = std::make_shared>(ElementProperty{ + UINT, "inputHeight", "inputHeightValue", "the height of the input image", 368, 0, 1000}); + auto input_width_pro_sptr = std::make_shared>(ElementProperty{ + UINT, "inputWidth", "inputWidthValue", "the width of the input image", 368, 0, 1000}); + properties.push_back(image_decoder_name_pro_sptr); + properties.push_back(parent_name_pro_sptr); + properties.push_back(input_height_pro_sptr); + properties.push_back(input_width_pro_sptr); + return properties; +} + +APP_ERROR MxpiOpenposePostProcess::SetMxpiErrorInfo(const std::string &plugin_name, + const MxpiErrorInfo &mxpi_error_info, MxpiBuffer *buffer) { + APP_ERROR ret = APP_ERR_OK; + // Define an object of MxpiMetadataManager + MxpiMetadataManager mxpi_metadata_manager(*buffer); + ret = mxpi_metadata_manager.AddErrorInfo(plugin_name, mxpi_error_info); + if (ret != APP_ERR_OK) { + LogError << "Failed to AddErrorInfo."; + return ret; + } + ret = SendData(0, *buffer); + return ret; +} + +// Register the Sample plugin through macro +MX_PLUGIN_GENERATE(MxpiOpenposePostProcess) diff --git a/cv/pose/openpose/mindspore/infer/sdk/plugins/MxpiOpenposePostProcess.h b/cv/pose/openpose/mindspore/infer/sdk/plugins/MxpiOpenposePostProcess.h index 4853269ed..262e2003f 100644 --- a/cv/pose/openpose/mindspore/infer/sdk/plugins/MxpiOpenposePostProcess.h +++ b/cv/pose/openpose/mindspore/infer/sdk/plugins/MxpiOpenposePostProcess.h @@ -1,189 +1,189 @@ -/* - * Copyright(C) 2021. Huawei Technologies Co.,Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef OPENPOSEPOSTPROCESS_MXPIOPENPOSEPOSTPROCESS_H -#define OPENPOSEPOSTPROCESS_MXPIOPENPOSEPOSTPROCESS_H -#include -#include -#include -#include -#include "MxTools/PluginToolkit/base/MxPluginGenerator.h" -#include "MxTools/PluginToolkit/base/MxPluginBase.h" -#include "MxTools/PluginToolkit/metadata/MxpiMetadataManager.h" -#include "MxTools/Proto/MxpiDataType.pb.h" -#include "MxBase/ErrorCode/ErrorCode.h" -#include "mxpiOpenposeProto.pb.h" -#include "opencv2/opencv.hpp" - -/** -* @api -* @brief Definition of MxpiOpenposePostProcess class. 
-*/ - -namespace MxPlugins { -struct PartPair { - float score; - int partIdx1; - int partIdx2; - int idx1; - int idx2; - std::vector coord1; - std::vector coord2; - float score1; - float score2; -}; - -class MxpiOpenposePostProcess : public MxTools::MxPluginBase { - public: - MxpiOpenposePostProcess() = default; - - ~MxpiOpenposePostProcess() = default; - /** - * @brief Initialize configure parameter. - * @param config_param_map - * @return APP_ERROR - */ - APP_ERROR Init(std::map> &config_param_map) override; - - /** - * @brief DeInitialize configure parameter. - * @return APP_ERROR - */ - APP_ERROR DeInit() override; - - /** - * @brief Process the data of MxpiBuffer. - * @param mxpi_buffer - * @return APP_ERROR - */ - APP_ERROR Process(std::vector &mxpi_buffer) override; - - /** - * @brief Definition the parameter of configure properties. - * @return std::vector> - */ - static std::vector> DefineProperties(); - - /** - * Overall process to generate all person skeleton information - * @param image_decoder_visionListSptr - Source MxpiVisionList containing vision data about input and aligned image - * @param src_mxpi_tensor_package - Source MxpiTensorPackage containing heatmap data - * @param dst_mxpi_person_list - Target MxpiPersonList containing detection result list - * @return APP_ERROR - */ - APP_ERROR GeneratePersonList(const MxTools::MxpiVisionList image_decoder_visionListSptr, - const MxTools::MxpiTensorPackageList src_mxpi_tensor_package, - mxpiopenposeproto::MxpiPersonList *dst_mxpi_person_list); - - /** - * @brief Resize output heatmaps to the size of the origin image - * @param keypoint_heatmap - Keypoint heatmap, each channel of the heatmap is stored as a Mat - * @param paf_heatmap - PAF heatmap, each channel of the heatmap is stored as a Mat - * @param vision_infos - Vision infos of origin image and aligned image - * @return APP_ERROR - */ - APP_ERROR ResizeHeatmaps(const std::vector &vision_infos, std::vector *keypoint_heatmap, - std::vector *paf_heatmap); - - /** - * @brief Extract candidate keypoints from output heatmap - * @param keypoint_heatmap - Keypoint heatmap stored in vector - * @param coor - Keep coor for candidate keypoints by category - * @param coor_score - Keep coor score for candidate keypoints by category - * @return APP_ERROR - */ - APP_ERROR ExtractKeypoints(const std::vector &keypoint_heatmap, - std::vector > *coor, std::vector > *coor_score); - - /** - * @brief Group keypoints to skeletons and assemble them to person - * @param paf_heatmap - PAF heatmap - * @param coor - Coordinates of all the candidate keypoints - * @param coor_score - Corresponding score of coordinates - * @param person_list - Target vector to store person, each person is stored as a vector of skeletons - * @return APP_ERROR - */ - APP_ERROR GroupKeypoints(const std::vector& paf_heatmap, - const std::vector > &coor, const std::vector > &coor_score, - std::vector > *person_list); - - /** - * @brief Calculate expected confidence of each possible skeleton and choose candidates - * @param part_idx - Index of skeleton in kPoseBodyPartSkeletons - * @param coor - Candidate positions of endpoints - * @param coor_score - Corresponding score of coor - * @param paf_heatmap - PAF heatmap - * @param connections - Target vector that collects candidate skeletons - * @return APP_ERROR - */ - APP_ERROR ScoreSkeletons(const int part_idx, const std::vector > &coor, - const std::vector > &coor_score, const std::vector &paf_heatmap, - std::vector *connections); - - /** - * @brief Compute expected confidence 
for each candidate skeleton - * @param endpoints - Coordinates of the two end points of a skeleton - * @param paf_x - PAF heatmap of x coordinate - * @param paf_y - PAF heatmap of y coordinate - * @return result - Keep confidence information of this skeleton in the form: - * [confidence score, number of successfully hit sub points] - */ - std::vector OneSkeletonScore(const cv::Mat &paf_x, const cv::Mat &paf_y, - const std::vector &endpoints); - - /** - * @brief Remove duplicate skeletons - * @param src - Source vector that stores skeletons to be processed - * @param dst - Target vector that collects filter skeletons - * @return APP_ERROR - */ - APP_ERROR ConntectionNms(std::vector *src, std::vector *dst); - - /** - * @brief Merge a skeleton to an existed person - * @param person_list - Currently existed person list - * @param current_pair - Skeleton to be merged - * @return True if merged successfully, otherwise false - */ - bool MergeSkeletonToPerson(std::vector > *person_list, PartPair current_pair); - - /** - * @brief Calculate score of a person according to its skeletons - * @param person - Target person - * @return Score value - */ - float PersonScore(const std::vector &person); - - /** - * @brief Prepare output in the format of MxpiPersonList - * @param person_list - Source data in the format of std::vector > - * @param dst_mxpi_person_list - Target data in the format of MxpiPersonList - * @return - */ - APP_ERROR GenerateMxpiOutput(const std::vector > &person_list, - mxpiopenposeproto::MxpiPersonList *dst_mxpi_person_list); - - private: - APP_ERROR SetMxpiErrorInfo(const std::string &plugin_name, - const MxTools::MxpiErrorInfo &mxpi_error_info, MxTools::MxpiBuffer *buffer); - std::string parentName_; - std::string imageDecoderName_; - std::uint32_t inputHeight_; - std::uint32_t inputWidth_; - std::ostringstream ErrorInfo_; -}; -} // namespace MxPlugins -#endif // OPENPOSEPOSTPROCESS_MXPIOPENPOSEPOSTPROCESS_H +/* + * Copyright(C) 2021. Huawei Technologies Co.,Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef OPENPOSEPOSTPROCESS_MXPIOPENPOSEPOSTPROCESS_H +#define OPENPOSEPOSTPROCESS_MXPIOPENPOSEPOSTPROCESS_H +#include +#include +#include +#include +#include "MxTools/PluginToolkit/base/MxPluginGenerator.h" +#include "MxTools/PluginToolkit/base/MxPluginBase.h" +#include "MxTools/PluginToolkit/metadata/MxpiMetadataManager.h" +#include "MxTools/Proto/MxpiDataType.pb.h" +#include "MxBase/ErrorCode/ErrorCode.h" +#include "mxpiOpenposeProto.pb.h" +#include "opencv2/opencv.hpp" + +/** +* @api +* @brief Definition of MxpiOpenposePostProcess class. +*/ + +namespace MxPlugins { +struct PartPair { + float score; + int partIdx1; + int partIdx2; + int idx1; + int idx2; + std::vector coord1; + std::vector coord2; + float score1; + float score2; +}; + +class MxpiOpenposePostProcess : public MxTools::MxPluginBase { + public: + MxpiOpenposePostProcess() = default; + + ~MxpiOpenposePostProcess() = default; + /** + * @brief Initialize configure parameter. 
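The ScoreSkeletons/OneSkeletonScore pair declared above follows the usual OpenPose bottom-up recipe: sample points along a candidate limb, project the PAF vectors onto the limb direction, and keep the mean response together with the number of sub-points above a threshold. A minimal NumPy sketch of that idea (the sample count and threshold below are illustrative, not the plugin's actual constants):

import numpy as np

def skeleton_score(paf_x, paf_y, p1, p2, num_samples=10, thresh=0.05):
    """Score one candidate limb by integrating the PAF along the p1->p2 segment.

    paf_x, paf_y: HxW arrays (one PAF channel pair at image resolution).
    p1, p2: (x, y) endpoint coordinates.
    Returns (mean confidence, hit count), mirroring the
    [confidence score, number of successfully hit sub points] pair described above.
    """
    p1 = np.asarray(p1, dtype=np.float32)
    p2 = np.asarray(p2, dtype=np.float32)
    vec = p2 - p1
    norm = np.linalg.norm(vec)
    if norm < 1e-6:
        return 0.0, 0
    unit = vec / norm
    xs = np.round(np.linspace(p1[0], p2[0], num_samples)).astype(int)
    ys = np.round(np.linspace(p1[1], p2[1], num_samples)).astype(int)
    h, w = paf_x.shape
    xs = np.clip(xs, 0, w - 1)
    ys = np.clip(ys, 0, h - 1)
    # dot product between the sampled PAF vectors and the limb direction
    dots = paf_x[ys, xs] * unit[0] + paf_y[ys, xs] * unit[1]
    hits = int((dots > thresh).sum())
    return float(dots.mean()), hits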
+ * @param config_param_map + * @return APP_ERROR + */ + APP_ERROR Init(std::map> &config_param_map) override; + + /** + * @brief DeInitialize configure parameter. + * @return APP_ERROR + */ + APP_ERROR DeInit() override; + + /** + * @brief Process the data of MxpiBuffer. + * @param mxpi_buffer + * @return APP_ERROR + */ + APP_ERROR Process(std::vector &mxpi_buffer) override; + + /** + * @brief Definition the parameter of configure properties. + * @return std::vector> + */ + static std::vector> DefineProperties(); + + /** + * Overall process to generate all person skeleton information + * @param image_decoder_visionListSptr - Source MxpiVisionList containing vision data about input and aligned image + * @param src_mxpi_tensor_package - Source MxpiTensorPackage containing heatmap data + * @param dst_mxpi_person_list - Target MxpiPersonList containing detection result list + * @return APP_ERROR + */ + APP_ERROR GeneratePersonList(const MxTools::MxpiVisionList image_decoder_visionListSptr, + const MxTools::MxpiTensorPackageList src_mxpi_tensor_package, + mxpiopenposeproto::MxpiPersonList *dst_mxpi_person_list); + + /** + * @brief Resize output heatmaps to the size of the origin image + * @param keypoint_heatmap - Keypoint heatmap, each channel of the heatmap is stored as a Mat + * @param paf_heatmap - PAF heatmap, each channel of the heatmap is stored as a Mat + * @param vision_infos - Vision infos of origin image and aligned image + * @return APP_ERROR + */ + APP_ERROR ResizeHeatmaps(const std::vector &vision_infos, std::vector *keypoint_heatmap, + std::vector *paf_heatmap); + + /** + * @brief Extract candidate keypoints from output heatmap + * @param keypoint_heatmap - Keypoint heatmap stored in vector + * @param coor - Keep coor for candidate keypoints by category + * @param coor_score - Keep coor score for candidate keypoints by category + * @return APP_ERROR + */ + APP_ERROR ExtractKeypoints(const std::vector &keypoint_heatmap, + std::vector > *coor, std::vector > *coor_score); + + /** + * @brief Group keypoints to skeletons and assemble them to person + * @param paf_heatmap - PAF heatmap + * @param coor - Coordinates of all the candidate keypoints + * @param coor_score - Corresponding score of coordinates + * @param person_list - Target vector to store person, each person is stored as a vector of skeletons + * @return APP_ERROR + */ + APP_ERROR GroupKeypoints(const std::vector& paf_heatmap, + const std::vector > &coor, const std::vector > &coor_score, + std::vector > *person_list); + + /** + * @brief Calculate expected confidence of each possible skeleton and choose candidates + * @param part_idx - Index of skeleton in kPoseBodyPartSkeletons + * @param coor - Candidate positions of endpoints + * @param coor_score - Corresponding score of coor + * @param paf_heatmap - PAF heatmap + * @param connections - Target vector that collects candidate skeletons + * @return APP_ERROR + */ + APP_ERROR ScoreSkeletons(const int part_idx, const std::vector > &coor, + const std::vector > &coor_score, const std::vector &paf_heatmap, + std::vector *connections); + + /** + * @brief Compute expected confidence for each candidate skeleton + * @param endpoints - Coordinates of the two end points of a skeleton + * @param paf_x - PAF heatmap of x coordinate + * @param paf_y - PAF heatmap of y coordinate + * @return result - Keep confidence information of this skeleton in the form: + * [confidence score, number of successfully hit sub points] + */ + std::vector OneSkeletonScore(const cv::Mat &paf_x, const 
cv::Mat &paf_y, + const std::vector &endpoints); + + /** + * @brief Remove duplicate skeletons + * @param src - Source vector that stores skeletons to be processed + * @param dst - Target vector that collects filter skeletons + * @return APP_ERROR + */ + APP_ERROR ConntectionNms(std::vector *src, std::vector *dst); + + /** + * @brief Merge a skeleton to an existed person + * @param person_list - Currently existed person list + * @param current_pair - Skeleton to be merged + * @return True if merged successfully, otherwise false + */ + bool MergeSkeletonToPerson(std::vector > *person_list, PartPair current_pair); + + /** + * @brief Calculate score of a person according to its skeletons + * @param person - Target person + * @return Score value + */ + float PersonScore(const std::vector &person); + + /** + * @brief Prepare output in the format of MxpiPersonList + * @param person_list - Source data in the format of std::vector > + * @param dst_mxpi_person_list - Target data in the format of MxpiPersonList + * @return + */ + APP_ERROR GenerateMxpiOutput(const std::vector > &person_list, + mxpiopenposeproto::MxpiPersonList *dst_mxpi_person_list); + + private: + APP_ERROR SetMxpiErrorInfo(const std::string &plugin_name, + const MxTools::MxpiErrorInfo &mxpi_error_info, MxTools::MxpiBuffer *buffer); + std::string parentName_; + std::string imageDecoderName_; + std::uint32_t inputHeight_; + std::uint32_t inputWidth_; + std::ostringstream ErrorInfo_; +}; +} // namespace MxPlugins +#endif // OPENPOSEPOSTPROCESS_MXPIOPENPOSEPOSTPROCESS_H diff --git a/cv/pose/openpose/mindspore/infer/sdk/proto/CMakeLists.txt b/cv/pose/openpose/mindspore/infer/sdk/proto/CMakeLists.txt index 95a24569e..5c787b7b3 100644 --- a/cv/pose/openpose/mindspore/infer/sdk/proto/CMakeLists.txt +++ b/cv/pose/openpose/mindspore/infer/sdk/proto/CMakeLists.txt @@ -1,47 +1,47 @@ -cmake_minimum_required(VERSION 3.5.1) -project(ProtoFile) - -set(TARGET_LIBRARY mxpiopenposeproto) - -# Compile options -add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) -add_definitions(-Dgoogle=mindxsdk_private) -add_compile_options(-std=c++11 -fPIC -fstack-protector-all -pie -Wno-deprecated-declarations) - -set(MX_SDK_HOME "$ENV{MX_SDK_HOME}") - -if(EXISTS ${MX_SDK_HOME}) - set(PROTOBUF_FOUND TRUE) - set(PROTOBUF_PROTOC_EXECUTABLE ${MX_SDK_HOME}/opensource/bin/protoc) - set(PROTOBUF_INCLUDE_DIRS ${MX_SDK_HOME}/opensource/include) - include_directories(${PROTOBUF_INCLUDE_DIRS}) - link_directories(${MX_SDK_HOME}/opensource/lib) - -endif() -if(PROTOBUF_FOUND) - message(STATUS "protobuf library found") - -else() - message(FATAL_ERROR "protobuf library is needed but cant be found") -endif() - -LIST(APPEND PROTO_FLAGS -I${PROJECT_SOURCE_DIR}) - -EXECUTE_PROCESS( - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ${PROTO_FLAGS} --cpp_out=${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/mxpiOpenposeProto.proto - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ${PROTO_FLAGS} --python_out=${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/mxpiOpenposeProto.proto - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} -) - -file(GLOB_RECURSE SOURCE_FILES ${PROJECT_SOURCE_DIR}/*.cc) -add_library(${TARGET_LIBRARY} SHARED ${SOURCE_FILES}) -target_link_libraries(${TARGET_LIBRARY} mindxsdk_protobuf) - -if(ENABLE_TEST) - target_link_libraries(${TARGET_LIBRARY} gcov) -endif() - -set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib) -install(TARGETS ${TARGET_LIBRARY} LIBRARY DESTINATION ${PROJECT_SOURCE_DIR}/lib) +cmake_minimum_required(VERSION 3.5.1) +project(ProtoFile) + +set(TARGET_LIBRARY mxpiopenposeproto) + +# 
Compile options +add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) +add_definitions(-Dgoogle=mindxsdk_private) +add_compile_options(-std=c++11 -fPIC -fstack-protector-all -pie -Wno-deprecated-declarations) + +set(MX_SDK_HOME "$ENV{MX_SDK_HOME}") + +if(EXISTS ${MX_SDK_HOME}) + set(PROTOBUF_FOUND TRUE) + set(PROTOBUF_PROTOC_EXECUTABLE ${MX_SDK_HOME}/opensource/bin/protoc) + set(PROTOBUF_INCLUDE_DIRS ${MX_SDK_HOME}/opensource/include) + include_directories(${PROTOBUF_INCLUDE_DIRS}) + link_directories(${MX_SDK_HOME}/opensource/lib) + +endif() +if(PROTOBUF_FOUND) + message(STATUS "protobuf library found") + +else() + message(FATAL_ERROR "protobuf library is needed but cant be found") +endif() + +LIST(APPEND PROTO_FLAGS -I${PROJECT_SOURCE_DIR}) + +EXECUTE_PROCESS( + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ${PROTO_FLAGS} --cpp_out=${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/mxpiOpenposeProto.proto + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ${PROTO_FLAGS} --python_out=${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/mxpiOpenposeProto.proto + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +) + +file(GLOB_RECURSE SOURCE_FILES ${PROJECT_SOURCE_DIR}/*.cc) +add_library(${TARGET_LIBRARY} SHARED ${SOURCE_FILES}) +target_link_libraries(${TARGET_LIBRARY} mindxsdk_protobuf) + +if(ENABLE_TEST) + target_link_libraries(${TARGET_LIBRARY} gcov) +endif() + +set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib) +install(TARGETS ${TARGET_LIBRARY} LIBRARY DESTINATION ${PROJECT_SOURCE_DIR}/lib) diff --git a/cv/pose/openpose/mindspore/infer/sdk/proto/mxpiOpenposeProto.proto b/cv/pose/openpose/mindspore/infer/sdk/proto/mxpiOpenposeProto.proto index 4f3b4d88e..413259642 100644 --- a/cv/pose/openpose/mindspore/infer/sdk/proto/mxpiOpenposeProto.proto +++ b/cv/pose/openpose/mindspore/infer/sdk/proto/mxpiOpenposeProto.proto @@ -1,33 +1,33 @@ -syntax = "proto3"; -package mxpiopenposeproto; -message MxpiMetaHeader -{ -string parentName = 1; -int32 memberId = 2; -string dataSource = 3; -} - -message MxpiSkeletonInfo -{ -int32 cocoSkeletonIndex1 = 1; -int32 cocoSkeletonIndex2 = 2; -float x0 = 3; -float y0 = 4; -float point1Score = 5; -float x1 = 6; -float y1 = 7; -float point2Score = 8; -float skeletonScore = 9; -} - -message MxpiPersonInfo -{ -repeated MxpiMetaHeader headerVec = 1; -repeated MxpiSkeletonInfo skeletonInfoVec = 2; -float score = 3; -} - -message MxpiPersonList -{ -repeated MxpiPersonInfo personInfoVec = 1; +syntax = "proto3"; +package mxpiopenposeproto; +message MxpiMetaHeader +{ +string parentName = 1; +int32 memberId = 2; +string dataSource = 3; +} + +message MxpiSkeletonInfo +{ +int32 cocoSkeletonIndex1 = 1; +int32 cocoSkeletonIndex2 = 2; +float x0 = 3; +float y0 = 4; +float point1Score = 5; +float x1 = 6; +float y1 = 7; +float point2Score = 8; +float skeletonScore = 9; +} + +message MxpiPersonInfo +{ +repeated MxpiMetaHeader headerVec = 1; +repeated MxpiSkeletonInfo skeletonInfoVec = 2; +float score = 3; +} + +message MxpiPersonList +{ +repeated MxpiPersonInfo personInfoVec = 1; } \ No newline at end of file diff --git a/cv/super_resolution/ttvsr/pytorch/configs/TTVSR_reds4.py b/cv/super_resolution/ttvsr/pytorch/configs/TTVSR_reds4.py index 9163be06c..39d50ed1d 100755 --- a/cv/super_resolution/ttvsr/pytorch/configs/TTVSR_reds4.py +++ b/cv/super_resolution/ttvsr/pytorch/configs/TTVSR_reds4.py @@ -1,113 +1,113 @@ -# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
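Because the proto CMake step runs protoc with both --cpp_out and --python_out, the same messages can be filled from Python as well. A small sketch, assuming the generated mxpiOpenposeProto_pb2 module is on the import path and using only field names defined in mxpiOpenposeProto.proto:

from mxpiOpenposeProto_pb2 import MxpiPersonList

person_list = MxpiPersonList()
person = person_list.personInfoVec.add()
person.score = 0.87

skeleton = person.skeletonInfoVec.add()
skeleton.cocoSkeletonIndex1 = 1
skeleton.cocoSkeletonIndex2 = 2
skeleton.x0, skeleton.y0, skeleton.point1Score = 120.0, 80.0, 0.91
skeleton.x1, skeleton.y1, skeleton.point2Score = 135.0, 150.0, 0.88
skeleton.skeletonScore = 0.84

header = person.headerVec.add()
header.parentName = "mxpi_openposepostprocess0"  # illustrative plugin instance name
header.dataSource = "mxpi_modelinfer0"

payload = person_list.SerializeToString()  # bytes ready to attach as stream metadata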
-exp_name = 'ttvsr_reds4' - -# model settings -model = dict( - type='TTVSR', - generator=dict( - type='TTVSRNet', mid_channels=64, num_blocks=60 ,stride=4, - spynet_pretrained='https://download.openmmlab.com/mmediting/restorers/basicvsr/spynet_20210409-c6c1bd09.pth'), - pixel_loss=dict(type='CharbonnierLoss', loss_weight=1.0, reduction='mean')) -# model training and testing settings -train_cfg = dict(fix_iter=5000) -test_cfg = dict(metrics=['PSNR', 'SSIM'], crop_border=0) - -# dataset settings -train_dataset_type = 'SRREDSMultipleGTDataset' -val_dataset_type = 'SRREDSMultipleGTDataset' -dataset_root = './data/REDS/' - -train_pipeline = [ - dict(type='GenerateSegmentIndices', interval_list=[1]), - dict(type='TemporalReverse', keys=['lq_path',"gt_path"], reverse_ratio=0.5), - dict(type='LoadImageFromFileList', io_backend='disk', key='lq', channel_order='rgb'), - dict(type='LoadImageFromFileList', io_backend='disk', key='gt', channel_order='rgb'), - dict(type='PairedRandomCrop', gt_patch_size=256), - dict(type='RescaleToZeroOne', keys=['lq', 'gt']), - dict(type='Flip', keys=['lq', 'gt'], flip_ratio=0.5, direction='horizontal'), - dict(type='Flip', keys=['lq', 'gt'], flip_ratio=0.5, direction='vertical'), - dict(type='RandomTransposeHW', keys=['lq', 'gt'], transpose_ratio=0.5), - dict(type='FramesToTensor', keys=['lq', 'gt']), - dict(type='Collect', keys=['lq', 'gt'], meta_keys=['lq_path', 'gt_path']) -] - -test_pipeline = [ - dict(type='GenerateSegmentIndices', interval_list=[1]), - dict(type='LoadImageFromFileList', io_backend='disk', key='lq', channel_order='rgb'), - dict(type='LoadImageFromFileList', io_backend='disk', key='gt', channel_order='rgb'), - dict(type='RescaleToZeroOne', keys=['lq', 'gt']), - dict(type='FramesToTensor', keys=['lq', 'gt']), - dict(type='Collect', keys=['lq', 'gt'], meta_keys=['lq_path', 'gt_path', 'key']) -] - -demo_pipeline = [ - dict(type='GenerateSegmentIndices', interval_list=[1]), - dict(type='LoadImageFromFileList', io_backend='disk', key='lq', channel_order='rgb'), - dict(type='RescaleToZeroOne', keys=['lq']), - dict(type='FramesToTensor', keys=['lq']), - dict(type='Collect', keys=['lq'], meta_keys=['lq_path', 'key']) -] - - -data = dict( - workers_per_gpu=4, - train_dataloader=dict(samples_per_gpu=2, drop_last=True), # 2 gpus - val_dataloader=dict(samples_per_gpu=1), - test_dataloader=dict(samples_per_gpu=1, workers_per_gpu=1), - - # train - train=dict( - type='RepeatDataset', - times=1000, - dataset=dict( - type=train_dataset_type, - lq_folder=dataset_root+'train/train_sharp_bicubic/X4', - gt_folder=dataset_root+'train/train_sharp', - num_input_frames=30, - pipeline=train_pipeline, - scale=4, - val_partition='REDS4', - test_mode=False)), - # val - val=dict( - type=val_dataset_type, - lq_folder=dataset_root+'train/train_sharp_bicubic/X4', - gt_folder=dataset_root+'train/train_sharp', - num_input_frames=100, - pipeline=test_pipeline, - scale=4, - val_partition='REDS4', - test_mode=True), - # test - test=dict( - type=val_dataset_type, - lq_folder=dataset_root+'train/train_sharp_bicubic/X4', - gt_folder=dataset_root+'train/train_sharp', - num_input_frames=100, - pipeline=test_pipeline, - scale=4, - val_partition='REDS4', - test_mode=True), -) -# optimizer -optimizers = dict( - generator=dict(type='Adam', lr=2e-4, betas=(0.9, 0.99), paramwise_cfg=dict(custom_keys={'spynet': dict(lr_mult=0.125)}))) - -# learning policy -total_iters = 400000 -lr_config = dict(policy='CosineRestart', by_epoch=False, periods=[400000], restart_weights=[1], min_lr=1e-7) 
-checkpoint_config = dict(interval=10000, save_optimizer=True, by_epoch=False, create_symlink=False) -# remove gpu_collect=True in non distributed training -evaluation = dict(interval=10000, save_image=False, gpu_collect=True) -log_config = dict(interval=1, hooks=[dict(type='TextLoggerHook', by_epoch=False, interval_exp_name=400000),]) -visual_config = None - -# runtime settings -dist_params = dict(backend='nccl') -log_level = 'INFO' -work_dir = 'work_dir/ttvsr_reds4' -load_from = None -resume_from = None -workflow = [('train', 1)] -find_unused_parameters = True -cudnn_benchmark = True +# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +exp_name = 'ttvsr_reds4' + +# model settings +model = dict( + type='TTVSR', + generator=dict( + type='TTVSRNet', mid_channels=64, num_blocks=60 ,stride=4, + spynet_pretrained='https://download.openmmlab.com/mmediting/restorers/basicvsr/spynet_20210409-c6c1bd09.pth'), + pixel_loss=dict(type='CharbonnierLoss', loss_weight=1.0, reduction='mean')) +# model training and testing settings +train_cfg = dict(fix_iter=5000) +test_cfg = dict(metrics=['PSNR', 'SSIM'], crop_border=0) + +# dataset settings +train_dataset_type = 'SRREDSMultipleGTDataset' +val_dataset_type = 'SRREDSMultipleGTDataset' +dataset_root = './data/REDS/' + +train_pipeline = [ + dict(type='GenerateSegmentIndices', interval_list=[1]), + dict(type='TemporalReverse', keys=['lq_path',"gt_path"], reverse_ratio=0.5), + dict(type='LoadImageFromFileList', io_backend='disk', key='lq', channel_order='rgb'), + dict(type='LoadImageFromFileList', io_backend='disk', key='gt', channel_order='rgb'), + dict(type='PairedRandomCrop', gt_patch_size=256), + dict(type='RescaleToZeroOne', keys=['lq', 'gt']), + dict(type='Flip', keys=['lq', 'gt'], flip_ratio=0.5, direction='horizontal'), + dict(type='Flip', keys=['lq', 'gt'], flip_ratio=0.5, direction='vertical'), + dict(type='RandomTransposeHW', keys=['lq', 'gt'], transpose_ratio=0.5), + dict(type='FramesToTensor', keys=['lq', 'gt']), + dict(type='Collect', keys=['lq', 'gt'], meta_keys=['lq_path', 'gt_path']) +] + +test_pipeline = [ + dict(type='GenerateSegmentIndices', interval_list=[1]), + dict(type='LoadImageFromFileList', io_backend='disk', key='lq', channel_order='rgb'), + dict(type='LoadImageFromFileList', io_backend='disk', key='gt', channel_order='rgb'), + dict(type='RescaleToZeroOne', keys=['lq', 'gt']), + dict(type='FramesToTensor', keys=['lq', 'gt']), + dict(type='Collect', keys=['lq', 'gt'], meta_keys=['lq_path', 'gt_path', 'key']) +] + +demo_pipeline = [ + dict(type='GenerateSegmentIndices', interval_list=[1]), + dict(type='LoadImageFromFileList', io_backend='disk', key='lq', channel_order='rgb'), + dict(type='RescaleToZeroOne', keys=['lq']), + dict(type='FramesToTensor', keys=['lq']), + dict(type='Collect', keys=['lq'], meta_keys=['lq_path', 'key']) +] + + +data = dict( + workers_per_gpu=4, + train_dataloader=dict(samples_per_gpu=2, drop_last=True), # 2 gpus + val_dataloader=dict(samples_per_gpu=1), + test_dataloader=dict(samples_per_gpu=1, workers_per_gpu=1), + + # train + train=dict( + type='RepeatDataset', + times=1000, + dataset=dict( + type=train_dataset_type, + lq_folder=dataset_root+'train/train_sharp_bicubic/X4', + gt_folder=dataset_root+'train/train_sharp', + num_input_frames=30, + pipeline=train_pipeline, + scale=4, + val_partition='REDS4', + test_mode=False)), + # val + val=dict( + type=val_dataset_type, + lq_folder=dataset_root+'train/train_sharp_bicubic/X4', + gt_folder=dataset_root+'train/train_sharp', + 
num_input_frames=100, + pipeline=test_pipeline, + scale=4, + val_partition='REDS4', + test_mode=True), + # test + test=dict( + type=val_dataset_type, + lq_folder=dataset_root+'train/train_sharp_bicubic/X4', + gt_folder=dataset_root+'train/train_sharp', + num_input_frames=100, + pipeline=test_pipeline, + scale=4, + val_partition='REDS4', + test_mode=True), +) +# optimizer +optimizers = dict( + generator=dict(type='Adam', lr=2e-4, betas=(0.9, 0.99), paramwise_cfg=dict(custom_keys={'spynet': dict(lr_mult=0.125)}))) + +# learning policy +total_iters = 400000 +lr_config = dict(policy='CosineRestart', by_epoch=False, periods=[400000], restart_weights=[1], min_lr=1e-7) +checkpoint_config = dict(interval=10000, save_optimizer=True, by_epoch=False, create_symlink=False) +# remove gpu_collect=True in non distributed training +evaluation = dict(interval=10000, save_image=False, gpu_collect=True) +log_config = dict(interval=1, hooks=[dict(type='TextLoggerHook', by_epoch=False, interval_exp_name=400000),]) +visual_config = None + +# runtime settings +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = 'work_dir/ttvsr_reds4' +load_from = None +resume_from = None +workflow = [('train', 1)] +find_unused_parameters = True +cudnn_benchmark = True diff --git a/cv/super_resolution/ttvsr/pytorch/mmedit/models/backbones/sr_backbones/ttvsrnet.py b/cv/super_resolution/ttvsr/pytorch/mmedit/models/backbones/sr_backbones/ttvsrnet.py index 4543c0d03..95794621e 100755 --- a/cv/super_resolution/ttvsr/pytorch/mmedit/models/backbones/sr_backbones/ttvsrnet.py +++ b/cv/super_resolution/ttvsr/pytorch/mmedit/models/backbones/sr_backbones/ttvsrnet.py @@ -1,715 +1,715 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable -from mmcv.cnn import ConvModule -from mmcv.runner import load_checkpoint -from mmedit.models.common import (PixelShufflePack, ResidualBlockNoBN, make_layer) -from mmedit.models.registry import BACKBONES -from mmedit.utils import get_root_logger -from torchvision import models - -@BACKBONES.register_module() -class TTVSRNet(nn.Module): - """TTVSR - - Support only x4 upsampling. - Paper: - Learning Trajectory-Aware Transformer for Video Super-Resolution, CVPR, 2022 - - Args: - mid_channels (int): Channel number of the intermediate features. - Default: 64. - num_blocks (int): Number of residual blocks in propagation branch. - Default: 60. - stride (int): the scale of tokens. - Default: 4. - frame_stride (int): Number determining the stride of frames. If frame_stride=3, - then the (0, 3, 6, 9, ...)-th frame will be the slected frames. - Default: 3. - spynet_pretrained (str): Pre-trained model path of SPyNet. - Default: None. 
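The TTVSR_reds4.py file above is a plain mmcv-style config, so it can be inspected without launching training. A quick sketch of loading it with mmcv (paths relative to the repo root are assumed):

from mmcv import Config

cfg = Config.fromfile('configs/TTVSR_reds4.py')
print(cfg.model.generator.type)               # 'TTVSRNet'
print(cfg.data.train_dataloader)              # samples_per_gpu=2, drop_last=True
print(cfg.total_iters, cfg.lr_config.policy)  # 400000 CosineRestart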
- """ - - def __init__(self, mid_channels=64, num_blocks=60, stride=4, frame_stride=3,spynet_pretrained=None): - - super().__init__() - - self.mid_channels = mid_channels - self.keyframe_stride = frame_stride - self.stride = stride - # optical flow network for feature alignment - self.spynet = SPyNet(pretrained=spynet_pretrained) - self.feat_extractor = ResidualBlocksWithInputConv( - 3, mid_channels, 5) - self.LTAM = LTAM(stride = self.stride) - # propagation branches - self.resblocks = ResidualBlocksWithInputConv( - 2 * mid_channels, mid_channels, num_blocks) - # upsample - self.fusion = nn.Conv2d( - 3 * mid_channels, mid_channels, 1, 1, 0, bias=True) - self.upsample1 = PixelShufflePack( - mid_channels, mid_channels, 2, upsample_kernel=3) - self.upsample2 = PixelShufflePack( - mid_channels, 64, 2, upsample_kernel=3) - self.conv_hr = nn.Conv2d(64, 64, 3, 1, 1) - self.conv_last = nn.Conv2d(64, 3, 3, 1, 1) - self.img_upsample = nn.Upsample( - scale_factor=4, mode='bilinear', align_corners=False) - # activation function - self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True) - - def check_if_mirror_extended(self, lrs): - """Check whether the input is a mirror-extended sequence. - - If mirror-extended, the i-th (i=0, ..., t-1) frame is equal to the - (t-1-i)-th frame. - - Args: - lrs (tensor): Input LR images with shape (n, t, c, h, w) - """ - - self.is_mirror_extended = False - if lrs.size(1) % 2 == 0: - lrs_1, lrs_2 = torch.chunk(lrs, 2, dim=1) - if torch.norm(lrs_1 - lrs_2.flip(1)) == 0: - self.is_mirror_extended = True - - def compute_flow(self, lrs): - """Compute optical flow using SPyNet for feature warping. - - Note that if the input is an mirror-extended sequence, 'flows_forward' - is not needed, since it is equal to 'flows_backward.flip(1)'. - - Args: - lrs (tensor): Input LR images with shape (n, t, c, h, w) - - Return: - tuple(Tensor): Optical flow. 'flows_forward' corresponds to the - flows used for forward-time propagation (current to previous). - 'flows_backward' corresponds to the flows used for - backward-time propagation (current to next). - """ - - n, t, c, h, w = lrs.size() - lrs_1 = lrs[:, :-1, :, :, :].reshape(-1, c, h, w) - lrs_2 = lrs[:, 1:, :, :, :].reshape(-1, c, h, w) - - flows_backward = self.spynet(lrs_1, lrs_2).view(n, t - 1, 2, h, w) - - if self.is_mirror_extended: # flows_forward = flows_backward.flip(1) - flows_forward = None - else: - flows_forward = self.spynet(lrs_2, lrs_1).view(n, t - 1, 2, h, w) - - return flows_forward, flows_backward - - def forward(self, lrs, to_cpu=False): - """Forward function for BasicVSR. - - Args: - lrs (Tensor): Input LR sequence with shape (n, t, c, h, w). - - Returns: - Tensor: Output HR sequence with shape (n, t, c, 4h, 4w). 
- """ - - n, t, c, h, w = lrs.size() - assert h >= 64 and w >= 64, ( - 'The height and width of inputs should be at least 64, ' - f'but got {h} and {w}.') - - # check whether the input is an extended sequence - self.check_if_mirror_extended(lrs) - - # compute optical flow - flows_forward, flows_backward = self.compute_flow(lrs) - outputs = self.feat_extractor(lrs.view(-1,c,h,w)).view(n,t,-1,h,w) - outputs = torch.unbind(outputs,dim=1) - outputs = list(outputs) - keyframe_idx_forward = list(range(0, t, self.keyframe_stride)) - keyframe_idx_backward = list(range(t-1, 0, 0-self.keyframe_stride)) - - # backward-time propgation - feat_buffers = [] - sparse_feat_buffers_s1 = [] - sparse_feat_buffers_s2 = [] - sparse_feat_buffers_s3 = [] - index_feat_buffers_s1 = [] - feat_prop = lrs.new_zeros(n, self.mid_channels, h, w) - grid_y, grid_x = torch.meshgrid(torch.arange(0, h//self.stride), torch.arange(0, w//self.stride)) - location_update = torch.stack([grid_x,grid_y],dim=0).type_as(lrs).expand(n,-1,-1,-1) - for i in range(t - 1, -1, -1): - lr_curr = lrs[:, i, :, :, :] - lr_curr_feat = outputs[i] - if i < t - 1: # no warping required for the last timestep - flow = flows_backward[:, i, :, :, :] - feat_prop = flow_warp(feat_prop, flow.permute(0, 2, 3, 1),padding_mode='border') - - # update the location map - flow = F.adaptive_avg_pool2d(flow,(h//self.stride,w//self.stride))/self.stride - location_update = flow_warp(location_update, flow.permute(0, 2, 3, 1),padding_mode='border',interpolation="nearest")# n , 2t , h , w - - feat_buffer = torch.stack(feat_buffers, dim=1) - sparse_feat_buffer_s1 = torch.stack(sparse_feat_buffers_s1, dim=1) - sparse_feat_buffer_s2 = torch.stack(sparse_feat_buffers_s2, dim=1) - sparse_feat_buffer_s3 = torch.stack(sparse_feat_buffers_s3, dim=1) - index_feat_buffer_s1 = torch.stack(index_feat_buffers_s1, dim=1) - feat_prop = self.LTAM(lr_curr_feat,index_feat_buffer_s1,feat_prop,sparse_feat_buffer_s1,sparse_feat_buffer_s2,sparse_feat_buffer_s3,location_update) - - # init the location map - if i in keyframe_idx_backward: - location_update = torch.cat([location_update,torch.stack([grid_x,grid_y],dim=0).type_as(lrs).expand(n,-1,-1,-1)],dim=1) # n , 2t , h , w - feat_prop = torch.cat([lr_curr_feat,feat_prop], dim=1) - feat_prop = self.resblocks(feat_prop) - feat_buffers.append(feat_prop) - if i in keyframe_idx_backward: - - # feature tokenization *4 - # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) - sparse_feat_prop_s1 = F.unfold(feat_prop, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) - # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 - sparse_feat_prop_s1 = F.fold(sparse_feat_prop_s1, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) - sparse_feat_buffers_s1.append(sparse_feat_prop_s1) - - # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) - index_feat_prop_s1 = F.unfold(lr_curr_feat, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) - # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 - index_feat_prop_s1 = F.fold(index_feat_prop_s1, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) - index_feat_buffers_s1.append(index_feat_prop_s1) - - # feature tokenization *6 - # bs * c * h * w --> # bs * (c*6*6) * (h//4*w//4) - sparse_feat_prop_s2 = F.unfold(feat_prop, kernel_size=(int(1.5*self.stride),int(1.5*self.stride)), padding=int(0.25*self.stride), stride=self.stride) - # bs * (c*6*6) * (h//4*w//4) --> bs * c * (h*1.5) * (w*1.5) - 
sparse_feat_prop_s2 = F.fold(sparse_feat_prop_s2, output_size=(int(1.5*h),int(1.5*w)), kernel_size=(int(1.5*self.stride),int(1.5*self.stride)), padding=0, stride=int(1.5*self.stride)) - # bs * c * (h*1.5) * (w*1.5) --> bs * c * h * w - sparse_feat_prop_s2 = F.adaptive_avg_pool2d(sparse_feat_prop_s2,(h,w)) - # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) - sparse_feat_prop_s2 = F.unfold(sparse_feat_prop_s2, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) - # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 - sparse_feat_prop_s2 = F.fold(sparse_feat_prop_s2, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) - sparse_feat_buffers_s2.append(sparse_feat_prop_s2) - - # feature tokenization * 8 - # bs * c * h * w --> # bs * (c*8*8) * (h//4*w//4) - sparse_feat_prop_s3 = F.unfold(feat_prop, kernel_size=(int(2*self.stride),int(2*self.stride)), padding=int(0.5*self.stride), stride=self.stride) - # bs * (c*8*8) * (h//4*w//4) --> bs * c * (h*2) * (w*2) - sparse_feat_prop_s3 = F.fold(sparse_feat_prop_s3, output_size=(int(2*h),int(2*w)), kernel_size=(int(2*self.stride),int(2*self.stride)), padding=0, stride=int(2*self.stride)) - # bs * c * (h*2) * (w*2) --> bs * c * h * w - sparse_feat_prop_s3 = F.adaptive_avg_pool2d(sparse_feat_prop_s3,(h,w)) - # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) - sparse_feat_prop_s3 = F.unfold(sparse_feat_prop_s3, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) - # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 - sparse_feat_prop_s3 = F.fold(sparse_feat_prop_s3, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) - sparse_feat_buffers_s3.append(sparse_feat_prop_s3) - - outputs_back = feat_buffers[::-1] - del location_update - del feat_buffers - del sparse_feat_buffers_s1 - del sparse_feat_buffers_s2 - del sparse_feat_buffers_s3 - del index_feat_buffers_s1 - - # forward-time propagation and upsampling - fina_out = [] - feat_buffers = [] - sparse_feat_buffers_s1 = [] - sparse_feat_buffers_s2 = [] - sparse_feat_buffers_s3 = [] - index_feat_buffers_s1 = [] - - feat_prop = torch.zeros_like(feat_prop) - grid_y, grid_x = torch.meshgrid(torch.arange(0, h//self.stride), torch.arange(0, w//self.stride)) - location_update = torch.stack([grid_x,grid_y],dim=0).type_as(lrs).expand(n,-1,-1,-1) - for i in range(0, t): - lr_curr = lrs[:, i, :, :, :] - lr_curr_feat = outputs[i] - if i > 0: # no warping required for the first timestep - if flows_forward is not None: - flow = flows_forward[:, i - 1, :, :, :] - else: - flow = flows_backward[:, -i, :, :, :] - feat_prop = flow_warp(feat_prop, flow.permute(0, 2, 3, 1),padding_mode='border') - - # update the location map - flow = F.adaptive_avg_pool2d(flow,(h//self.stride,w//self.stride))/self.stride - location_update = flow_warp(location_update, flow.permute(0, 2, 3, 1),padding_mode='border',interpolation="nearest")# n , 2t , h , w - - feat_buffer = torch.stack(feat_buffers, dim=1) - sparse_feat_buffer_s1 = torch.stack(sparse_feat_buffers_s1, dim=1) - sparse_feat_buffer_s2 = torch.stack(sparse_feat_buffers_s2, dim=1) - sparse_feat_buffer_s3 = torch.stack(sparse_feat_buffers_s3, dim=1) - index_feat_buffer_s1 = torch.stack(index_feat_buffers_s1, dim=1) - feat_prop = self.LTAM(lr_curr_feat,index_feat_buffer_s1,feat_prop,sparse_feat_buffer_s1,sparse_feat_buffer_s2,sparse_feat_buffer_s3,location_update) - - # init the location map - if i in keyframe_idx_forward: - location_update = 
torch.cat([location_update,torch.stack([grid_x,grid_y],dim=0).type_as(lrs).expand(n,-1,-1,-1)],dim=1) - feat_prop = torch.cat([outputs[i], feat_prop], dim=1) - feat_prop = self.resblocks(feat_prop) - feat_buffers.append(feat_prop) - - if i in keyframe_idx_forward: - # feature tokenization *4 - # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) - sparse_feat_prop_s1 = F.unfold(feat_prop, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) - # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 - sparse_feat_prop_s1 = F.fold(sparse_feat_prop_s1, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) - sparse_feat_buffers_s1.append(sparse_feat_prop_s1) - - # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) - index_feat_prop_s1 = F.unfold(lr_curr_feat, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) - # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 - index_feat_prop_s1 = F.fold(index_feat_prop_s1, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) - index_feat_buffers_s1.append(index_feat_prop_s1) - - - # feature tokenization *6 - # bs * c * h * w --> # bs * (c*6*6) * (h//4*w//4) - sparse_feat_prop_s2 = F.unfold(feat_prop, kernel_size=(int(1.5*self.stride),int(1.5*self.stride)), padding=int(0.25*self.stride), stride=self.stride) - # bs * (c*6*6) * (h//4*w//4) --> bs * c * (h*1.5) * (w*1.5) - sparse_feat_prop_s2 = F.fold(sparse_feat_prop_s2, output_size=(int(1.5*h),int(1.5*w)), kernel_size=(int(1.5*self.stride),int(1.5*self.stride)), padding=0, stride=int(1.5*self.stride)) - # bs * c * (h*1.5) * (w*1.5) --> bs * c * h * w - sparse_feat_prop_s2 = F.adaptive_avg_pool2d(sparse_feat_prop_s2,(h,w)) - # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) - sparse_feat_prop_s2 = F.unfold(sparse_feat_prop_s2, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) - # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 - sparse_feat_prop_s2 = F.fold(sparse_feat_prop_s2, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) - sparse_feat_buffers_s2.append(sparse_feat_prop_s2) - - - # feature tokenization *8 - # bs * c * h * w --> # bs * (c*8*8) * (h//4*w//4) - sparse_feat_prop_s3 = F.unfold(feat_prop, kernel_size=(int(2*self.stride),int(2*self.stride)), padding=int(0.5*self.stride), stride=self.stride) - # bs * (c*8*8) * (h//4*w//4) --> bs * c * (h*2) * (w*2) - sparse_feat_prop_s3 = F.fold(sparse_feat_prop_s3, output_size=(int(2*h),int(2*w)), kernel_size=(int(2*self.stride),int(2*self.stride)), padding=0, stride=int(2*self.stride)) - # bs * c * (h*2) * (w*2) --> bs * c * h * w - sparse_feat_prop_s3 = F.adaptive_avg_pool2d(sparse_feat_prop_s3,(h,w)) - # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) - sparse_feat_prop_s3 = F.unfold(sparse_feat_prop_s3, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) - # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 - sparse_feat_prop_s3 = F.fold(sparse_feat_prop_s3, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) - sparse_feat_buffers_s3.append(sparse_feat_prop_s3) - - # upsampling given the backward and forward features - out = torch.cat([outputs_back[i],lr_curr_feat,feat_prop], dim=1) - out = self.lrelu(self.fusion(out)) - out = self.lrelu(self.upsample1(out)) - out = self.lrelu(self.upsample2(out)) - out = self.lrelu(self.conv_hr(out)) - out = self.conv_last(out) - base = self.img_upsample(lr_curr) - out += base - if to_cpu: - 
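The "feature tokenization" comments above boil down to an F.unfold/F.fold round trip: unfold cuts the feature map into stride x stride tokens, and a 1x1 fold lays those token vectors out on the h//stride x w//stride grid. A shape-check sketch with illustrative sizes:

import torch
import torch.nn.functional as F

n, c, h, w, stride = 2, 64, 64, 64, 4
feat = torch.randn(n, c, h, w)

# each non-overlapping stride x stride patch becomes one token column
tokens = F.unfold(feat, kernel_size=(stride, stride), stride=stride)
print(tokens.shape)       # torch.Size([2, 1024, 256]) = (n, c*4*4, (h//4)*(w//4))

# fold with a 1x1 kernel only reshapes the token columns back to a spatial grid
token_map = F.fold(tokens, output_size=(h // stride, w // stride), kernel_size=(1, 1))
print(token_map.shape)    # torch.Size([2, 1024, 16, 16])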
fina_out.append(out.cpu()) - else: - fina_out.append(out) - del location_update - del sparse_feat_buffers_s1 - del sparse_feat_buffers_s2 - del sparse_feat_buffers_s3 - del index_feat_buffers_s1 - return torch.stack(fina_out, dim=1) - - def init_weights(self, pretrained=None, strict=True): - """Init weights for models. - - Args: - pretrained (str, optional): Path for pretrained weights. If given - None, pretrained weights will not be loaded. Defaults: None. - strict (boo, optional): Whether strictly load the pretrained model. - Defaults to True. - """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=strict, logger=logger) - elif pretrained is not None: - raise TypeError(f'"pretrained" must be a str or None. ' - f'But received {type(pretrained)}.') - - -class LTAM(nn.Module): - def __init__(self, stride=4): - super().__init__() - - self.stride = stride - self.fusion = nn.Conv2d(3 * 64, 64, 3, 1, 1, bias=True) - def forward(self, curr_feat, index_feat_set_s1 , anchor_feat, sparse_feat_set_s1 ,sparse_feat_set_s2, sparse_feat_set_s3, location_feat): - """Compute the long-range trajectory-aware attention. - - Args: - anchor_feat (tensor): Input feature with shape (n, c, h, w) - sparse_feat_set_s1 (tensor): Input tokens with shape (n, t, c*4*4, h//4, w//4) - sparse_feat_set_s2 (tensor): Input tokens with shape (n, t, c*4*4, h//4, w//4) - sparse_feat_set_s3 (tensor): Input tokens with shape (n, t, c*4*4, h//4, w//4) - location_feat (tensor): Input location map with shape (n, 2*t, h//4, w//4) - - Return: - fusion_feature (tensor): Output fusion feature with shape (n, c, h, w). - """ - - n, c, h, w = anchor_feat.size() - t = sparse_feat_set_s1.size(1) - feat_len = int(c*self.stride*self.stride) - feat_num = int((h//self.stride) * (w//self.stride)) - - # grid_flow [0,h-1][0,w-1] -> [-1,1][-1,1] - grid_flow = location_feat.contiguous().view(n,t,2,h//self.stride,w//self.stride).permute(0, 1, 3, 4, 2) - grid_flow_x = 2.0 * grid_flow[:, :, :, :, 0] / max(w//self.stride - 1, 1) - 1.0 - grid_flow_y = 2.0 * grid_flow[:, :, :, :, 1] / max(h//self.stride - 1, 1) - 1.0 - grid_flow = torch.stack((grid_flow_x, grid_flow_y), dim=4) - - output_s1 = F.grid_sample(sparse_feat_set_s1.contiguous().view(-1,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)),grid_flow.contiguous().view(-1,(h//self.stride),(w//self.stride),2),mode='nearest',padding_mode='zeros',align_corners=True) # (nt) * (c*4*4) * (h//4) * (w//4) - output_s2 = F.grid_sample(sparse_feat_set_s2.contiguous().view(-1,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)),grid_flow.contiguous().view(-1,(h//self.stride),(w//self.stride),2),mode='nearest',padding_mode='zeros',align_corners=True) # (nt) * (c*4*4) * (h//4) * (w//4) - output_s3 = F.grid_sample(sparse_feat_set_s3.contiguous().view(-1,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)),grid_flow.contiguous().view(-1,(h//self.stride),(w//self.stride),2),mode='nearest',padding_mode='zeros',align_corners=True) # (nt) * (c*4*4) * (h//4) * (w//4) - - index_output_s1 = F.grid_sample(index_feat_set_s1.contiguous().view(-1,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)),grid_flow.contiguous().view(-1,(h//self.stride),(w//self.stride),2),mode='nearest',padding_mode='zeros',align_corners=True) # (nt) * (c*4*4) * (h//4) * (w//4) - # n * c * h * w --> # n * (c*4*4) * (h//4*w//4) - curr_feat = F.unfold(curr_feat, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) - # n * (c*4*4) * 
(h//4*w//4) --> n * (h//4*w//4) * (c*4*4) - curr_feat = curr_feat.permute(0, 2, 1) - curr_feat = F.normalize(curr_feat, dim=2).unsqueeze(3) # n * (h//4*w//4) * (c*4*4) * 1 - - # cross-scale attention * 4 - # n * t * (c*4*4) * h//4 * w//4 --> nt * (c*4*4) * h//4 * w//4 - index_output_s1 = index_output_s1.contiguous().view(n*t,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)) - # nt * (c*4*4) * h//4 * w//4 --> n * t * (c*4*4) * (h//4*w//4) - index_output_s1 = F.unfold(index_output_s1, kernel_size=(1, 1), padding=0, stride=1).view(n,-1,feat_len,feat_num) - # n * t * (c*4*4) * (h//4*w//4) --> n * (h//4*w//4) * t * (c*4*4) - index_output_s1 = index_output_s1.permute(0, 3, 1, 2) - index_output_s1 = F.normalize(index_output_s1, dim=3) # n * (h//4*w//4) * t * (c*4*4) - # [ n * (h//4*w//4) * t * (c*4*4) ] * [ n * (h//4*w//4) * (c*4*4) * 1 ] --> n * (h//4*w//4) * t - matrix_index = torch.matmul(index_output_s1, curr_feat).squeeze(3) # n * (h//4*w//4) * t - matrix_index = matrix_index.view(n,feat_num,t)# n * (h//4*w//4) * t - corr_soft, corr_index = torch.max(matrix_index, dim=2)# n * (h//4*w//4) - # n * (h//4*w//4) --> n * (c*4*4) * (h//4*w//4) - corr_soft = corr_soft.unsqueeze(1).expand(-1,feat_len,-1) - # n * (c*4*4) * (h//4*w//4) --> n * c * h * w - corr_soft = F.fold(corr_soft, output_size=(h,w), kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) - - # Aggr - # n * t * (c*4*4) * h//4 * w//4 --> nt * (c*4*4) * h//4 * w//4 - output_s1 = output_s1.contiguous().view(n*t,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)) - # nt * (c*4*4) * h//4 * w//4 --> n * t * (c*4*4) * (h//4*w//4) - output_s1 = F.unfold(output_s1, kernel_size=(1, 1), padding=0, stride=1).view(n,-1,feat_len,feat_num) - # n * t * (c*4*4) * (h//4*w//4) --> n * 1 * (c*4*4) * (h//4*w//4) - output_s1 = torch.gather(output_s1.contiguous().view(n,t,feat_len,feat_num), 1, corr_index.view(n,1,1,feat_num).expand(-1,-1,feat_len,-1))# n * 1 * (c*4*4) * (h//4*w//4) - # n * 1 * (c*4*4) * (h//4*w//4) --> n * (c*4*4) * (h//4*w//4) - output_s1 = output_s1.squeeze(1) - # n * (c*4*4) * (h//4*w//4) --> n * c * h * w - output_s1 = F.fold(output_s1, output_size=(h,w), kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) - - # Aggr - # n * t * (c*4*4) * h//4 * w//4 --> nt * (c*4*4) * h//4 * w//4 - output_s2 = output_s2.contiguous().view(n*t,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)) - # nt * (c*4*4) * h//4 * w//4 --> n * t * (c*4*4) * (h//4*w//4) - output_s2 = F.unfold(output_s2, kernel_size=(1, 1), padding=0, stride=1).view(n,-1,feat_len,feat_num) - # n * t * (c*4*4) * (h//4*w//4) --> n * 1 * (c*4*4) * (h//4*w//4) - output_s2 = torch.gather(output_s2.contiguous().view(n,t,feat_len,feat_num), 1, corr_index.view(n,1,1,feat_num).expand(-1,-1,feat_len,-1))# n * 1 * (c*4*4) * (h//4*w//4) - # n * 1 * (c*4*4) * (h//4*w//4) --> n * (c*4*4) * (h//4*w//4) - output_s2 = output_s2.squeeze(1) - # n * (c*4*4) * (h//4*w//4) --> n * c * h * w - output_s2 = F.fold(output_s2, output_size=(h,w), kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) - - # Aggr - # n * t * (c*4*4) * h//4 * w//4 --> nt * (c*4*4) * h//4 * w//4 - output_s3 = output_s3.contiguous().view(n*t,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)) - # nt * (c*4*4) * h//4 * w//4 --> n * t * (c*4*4) * (h//4*w//4) - output_s3 = F.unfold(output_s3, kernel_size=(1, 1), padding=0, stride=1).view(n,-1,feat_len,feat_num) - # n * t * (c*4*4) * (h//4*w//4) --> n * 1 * (c*4*4) * (h//4*w//4) - output_s3 = 
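The hard attention in LTAM above reduces to a max over trajectory steps followed by a gather: for every spatial token, pick the single frame whose normalized token is most similar to the current one. A minimal sketch with toy sizes:

import torch

n, t, feat_len, feat_num = 1, 3, 8, 4
tokens = torch.randn(n, t, feat_len, feat_num)        # tokens tracked along trajectories
similarity = torch.randn(n, feat_num, t)              # current frame vs. each step

corr_soft, corr_index = torch.max(similarity, dim=2)  # best step per spatial location
index = corr_index.view(n, 1, 1, feat_num).expand(-1, -1, feat_len, -1)
best_tokens = torch.gather(tokens, 1, index).squeeze(1)
print(best_tokens.shape, corr_soft.shape)             # torch.Size([1, 8, 4]) torch.Size([1, 4])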
torch.gather(output_s3.contiguous().view(n,t,feat_len,feat_num), 1, corr_index.view(n,1,1,feat_num).expand(-1,-1,feat_len,-1))# n * 1 * (c*4*4) * (h//4*w//4) - # n * 1 * (c*4*4) * (h//4*w//4) --> n * (c*4*4) * (h//4*w//4) - output_s3 = output_s3.squeeze(1) - # n * (c*4*4) * (h//4*w//4) --> n * c * h * w - output_s3 = F.fold(output_s3, output_size=(h,w), kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) - - out = torch.cat([output_s1,output_s2,output_s3], dim=1) - out = self.fusion(out) - out = out * corr_soft - out += anchor_feat - return out - - - -class ResidualBlocksWithInputConv(nn.Module): - """Residual blocks with a convolution in front. - - Args: - in_channels (int): Number of input channels of the first conv. - out_channels (int): Number of channels of the residual blocks. - Default: 64. - num_blocks (int): Number of residual blocks. Default: 30. - """ - - def __init__(self, in_channels, out_channels=64, num_blocks=30): - super().__init__() - - main = [] - - # a convolution used to match the channels of the residual blocks - main.append(nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=True)) - main.append(nn.LeakyReLU(negative_slope=0.1, inplace=True)) - - # residual blocks - main.append( - make_layer( - ResidualBlockNoBN, num_blocks, mid_channels=out_channels)) - - self.main = nn.Sequential(*main) - - def forward(self, feat): - """ - Forward function for ResidualBlocksWithInputConv. - - Args: - feat (Tensor): Input feature with shape (n, in_channels, h, w) - - Returns: - Tensor: Output feature with shape (n, out_channels, h, w) - """ - return self.main(feat) - - -class SPyNet(nn.Module): - """SPyNet network structure. - - The difference to the SPyNet in [tof.py] is that - 1. more SPyNetBasicModule is used in this version, and - 2. no batch normalization is used in this version. - - Paper: - Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017 - - Args: - pretrained (str): path for pre-trained SPyNet. Default: None. - """ - - def __init__(self, pretrained): - super().__init__() - - self.basic_module = nn.ModuleList( - [SPyNetBasicModule() for _ in range(6)]) - - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=True, logger=logger) - elif pretrained is not None: - raise TypeError('[pretrained] should be str or None, ' - f'but got {type(pretrained)}.') - - self.register_buffer( - 'mean', - torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)) - self.register_buffer( - 'std', - torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)) - - def compute_flow(self, ref, supp): - """Compute flow from ref to supp. - - Note that in this function, the images are already resized to a - multiple of 32. - - Args: - ref (Tensor): Reference image with shape of (n, 3, h, w). - supp (Tensor): Supporting image with shape of (n, 3, h, w). - - Returns: - Tensor: Estimated optical flow: (n, 2, h, w). 
- """ - n, _, h, w = ref.size() - - # normalize the input images - ref = [(ref - self.mean) / self.std] - supp = [(supp - self.mean) / self.std] - - # generate downsampled frames - for level in range(5): - ref.append( - F.avg_pool2d( - input=ref[-1], - kernel_size=2, - stride=2, - count_include_pad=False)) - supp.append( - F.avg_pool2d( - input=supp[-1], - kernel_size=2, - stride=2, - count_include_pad=False)) - ref = ref[::-1] - supp = supp[::-1] - - # flow computation - flow = ref[0].new_zeros(n, 2, h // 32, w // 32) - for level in range(len(ref)): - if level == 0: - flow_up = flow - else: - flow_up = F.interpolate( - input=flow, - scale_factor=2, - mode='bilinear', - align_corners=True) * 2.0 - - # add the residue to the upsampled flow - flow = flow_up + self.basic_module[level]( - torch.cat([ - ref[level], - flow_warp( - supp[level], - flow_up.permute(0, 2, 3, 1), - padding_mode='border'), flow_up - ], 1)) - - return flow - - def forward(self, ref, supp): - """Forward function of SPyNet. - - This function computes the optical flow from ref to supp. - - Args: - ref (Tensor): Reference image with shape of (n, 3, h, w). - supp (Tensor): Supporting image with shape of (n, 3, h, w). - - Returns: - Tensor: Estimated optical flow: (n, 2, h, w). - """ - - # upsize to a multiple of 32 - h, w = ref.shape[2:4] - w_up = w if (w % 32) == 0 else 32 * (w // 32 + 1) - h_up = h if (h % 32) == 0 else 32 * (h // 32 + 1) - ref = F.interpolate( - input=ref, size=(h_up, w_up), mode='bilinear', align_corners=False) - supp = F.interpolate( - input=supp, - size=(h_up, w_up), - mode='bilinear', - align_corners=False) - - # compute flow, and resize back to the original resolution - flow = F.interpolate( - input=self.compute_flow(ref, supp), - size=(h, w), - mode='bilinear', - align_corners=False) - - # adjust the flow values - flow[:, 0, :, :] *= float(w) / float(w_up) - flow[:, 1, :, :] *= float(h) / float(h_up) - - return flow - - -class SPyNetBasicModule(nn.Module): - """Basic Module for SPyNet. - - Paper: - Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017 - """ - - def __init__(self): - super().__init__() - - self.basic_module = nn.Sequential( - ConvModule( - in_channels=8, - out_channels=32, - kernel_size=7, - stride=1, - padding=3, - norm_cfg=None, - act_cfg=dict(type='ReLU')), - ConvModule( - in_channels=32, - out_channels=64, - kernel_size=7, - stride=1, - padding=3, - norm_cfg=None, - act_cfg=dict(type='ReLU')), - ConvModule( - in_channels=64, - out_channels=32, - kernel_size=7, - stride=1, - padding=3, - norm_cfg=None, - act_cfg=dict(type='ReLU')), - ConvModule( - in_channels=32, - out_channels=16, - kernel_size=7, - stride=1, - padding=3, - norm_cfg=None, - act_cfg=dict(type='ReLU')), - ConvModule( - in_channels=16, - out_channels=2, - kernel_size=7, - stride=1, - padding=3, - norm_cfg=None, - act_cfg=None)) - - def forward(self, tensor_input): - """ - Args: - tensor_input (Tensor): Input tensor with shape (b, 8, h, w). - 8 channels contain: - [reference image (3), neighbor image (3), initial flow (2)]. - - Returns: - Tensor: Refined flow with shape (b, 2, h, w) - """ - return self.basic_module(tensor_input) - - -def flow_warp(x, - flow, - interpolation='bilinear', - padding_mode='zeros', - align_corners=True): - """Warp an image or a feature map with optical flow. - - Args: - x (Tensor): Tensor with size (n, c, h, w). - flow (Tensor): Tensor with size (n, h, w, 2). The last dimension is - a two-channel, denoting the width and height relative offsets. 
- Note that the values are not normalized to [-1, 1]. - interpolation (str): Interpolation mode: 'nearest' or 'bilinear'. - Default: 'bilinear'. - padding_mode (str): Padding mode: 'zeros' or 'border' or 'reflection'. - Default: 'zeros'. - align_corners (bool): Whether align corners. Default: True. - - Returns: - Tensor: Warped image or feature map. - """ - if x.size()[-2:] != flow.size()[1:3]: - raise ValueError(f'The spatial sizes of input ({x.size()[-2:]}) and ' - f'flow ({flow.size()[1:3]}) are not the same.') - _, _, h, w = x.size() - # create mesh grid - grid_y, grid_x = torch.meshgrid(torch.arange(0, h), torch.arange(0, w)) - grid = torch.stack((grid_x, grid_y), 2).type_as(x) # (w, h, 2) - grid.requires_grad = False - - grid_flow = grid + flow - # scale grid_flow to [-1,1] - grid_flow_x = 2.0 * grid_flow[:, :, :, 0] / max(w - 1, 1) - 1.0 - grid_flow_y = 2.0 * grid_flow[:, :, :, 1] / max(h - 1, 1) - 1.0 - grid_flow = torch.stack((grid_flow_x, grid_flow_y), dim=3) - output = F.grid_sample( - x, - grid_flow, - mode=interpolation, - padding_mode=padding_mode, - align_corners=align_corners) - return output +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +from mmcv.cnn import ConvModule +from mmcv.runner import load_checkpoint +from mmedit.models.common import (PixelShufflePack, ResidualBlockNoBN, make_layer) +from mmedit.models.registry import BACKBONES +from mmedit.utils import get_root_logger +from torchvision import models + +@BACKBONES.register_module() +class TTVSRNet(nn.Module): + """TTVSR + + Support only x4 upsampling. + Paper: + Learning Trajectory-Aware Transformer for Video Super-Resolution, CVPR, 2022 + + Args: + mid_channels (int): Channel number of the intermediate features. + Default: 64. + num_blocks (int): Number of residual blocks in propagation branch. + Default: 60. + stride (int): the scale of tokens. + Default: 4. + frame_stride (int): Number determining the stride of frames. If frame_stride=3, + then the (0, 3, 6, 9, ...)-th frame will be the slected frames. + Default: 3. + spynet_pretrained (str): Pre-trained model path of SPyNet. + Default: None. + """ + + def __init__(self, mid_channels=64, num_blocks=60, stride=4, frame_stride=3,spynet_pretrained=None): + + super().__init__() + + self.mid_channels = mid_channels + self.keyframe_stride = frame_stride + self.stride = stride + # optical flow network for feature alignment + self.spynet = SPyNet(pretrained=spynet_pretrained) + self.feat_extractor = ResidualBlocksWithInputConv( + 3, mid_channels, 5) + self.LTAM = LTAM(stride = self.stride) + # propagation branches + self.resblocks = ResidualBlocksWithInputConv( + 2 * mid_channels, mid_channels, num_blocks) + # upsample + self.fusion = nn.Conv2d( + 3 * mid_channels, mid_channels, 1, 1, 0, bias=True) + self.upsample1 = PixelShufflePack( + mid_channels, mid_channels, 2, upsample_kernel=3) + self.upsample2 = PixelShufflePack( + mid_channels, 64, 2, upsample_kernel=3) + self.conv_hr = nn.Conv2d(64, 64, 3, 1, 1) + self.conv_last = nn.Conv2d(64, 3, 3, 1, 1) + self.img_upsample = nn.Upsample( + scale_factor=4, mode='bilinear', align_corners=False) + # activation function + self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True) + + def check_if_mirror_extended(self, lrs): + """Check whether the input is a mirror-extended sequence. + + If mirror-extended, the i-th (i=0, ..., t-1) frame is equal to the + (t-1-i)-th frame. 
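flow_warp above is a thin wrapper over F.grid_sample driven by a pixel-offset flow; a zero flow should therefore return the input unchanged (up to border handling). A quick sanity-check sketch of that grid construction:

import torch
import torch.nn.functional as F

x = torch.arange(2 * 3 * 4 * 5, dtype=torch.float32).view(2, 3, 4, 5)
flow = torch.zeros(2, 4, 5, 2)                        # (n, h, w, 2) pixel offsets

grid_y, grid_x = torch.meshgrid(torch.arange(4), torch.arange(5))
grid = torch.stack((grid_x, grid_y), 2).float()       # (h, w, 2), broadcast over n
grid_flow = grid + flow
gx = 2.0 * grid_flow[..., 0] / max(5 - 1, 1) - 1.0    # scale to [-1, 1]
gy = 2.0 * grid_flow[..., 1] / max(4 - 1, 1) - 1.0
warped = F.grid_sample(x, torch.stack((gx, gy), dim=3), mode='bilinear',
                       padding_mode='zeros', align_corners=True)
print(torch.allclose(warped, x))                      # True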
+ + Args: + lrs (tensor): Input LR images with shape (n, t, c, h, w) + """ + + self.is_mirror_extended = False + if lrs.size(1) % 2 == 0: + lrs_1, lrs_2 = torch.chunk(lrs, 2, dim=1) + if torch.norm(lrs_1 - lrs_2.flip(1)) == 0: + self.is_mirror_extended = True + + def compute_flow(self, lrs): + """Compute optical flow using SPyNet for feature warping. + + Note that if the input is an mirror-extended sequence, 'flows_forward' + is not needed, since it is equal to 'flows_backward.flip(1)'. + + Args: + lrs (tensor): Input LR images with shape (n, t, c, h, w) + + Return: + tuple(Tensor): Optical flow. 'flows_forward' corresponds to the + flows used for forward-time propagation (current to previous). + 'flows_backward' corresponds to the flows used for + backward-time propagation (current to next). + """ + + n, t, c, h, w = lrs.size() + lrs_1 = lrs[:, :-1, :, :, :].reshape(-1, c, h, w) + lrs_2 = lrs[:, 1:, :, :, :].reshape(-1, c, h, w) + + flows_backward = self.spynet(lrs_1, lrs_2).view(n, t - 1, 2, h, w) + + if self.is_mirror_extended: # flows_forward = flows_backward.flip(1) + flows_forward = None + else: + flows_forward = self.spynet(lrs_2, lrs_1).view(n, t - 1, 2, h, w) + + return flows_forward, flows_backward + + def forward(self, lrs, to_cpu=False): + """Forward function for BasicVSR. + + Args: + lrs (Tensor): Input LR sequence with shape (n, t, c, h, w). + + Returns: + Tensor: Output HR sequence with shape (n, t, c, 4h, 4w). + """ + + n, t, c, h, w = lrs.size() + assert h >= 64 and w >= 64, ( + 'The height and width of inputs should be at least 64, ' + f'but got {h} and {w}.') + + # check whether the input is an extended sequence + self.check_if_mirror_extended(lrs) + + # compute optical flow + flows_forward, flows_backward = self.compute_flow(lrs) + outputs = self.feat_extractor(lrs.view(-1,c,h,w)).view(n,t,-1,h,w) + outputs = torch.unbind(outputs,dim=1) + outputs = list(outputs) + keyframe_idx_forward = list(range(0, t, self.keyframe_stride)) + keyframe_idx_backward = list(range(t-1, 0, 0-self.keyframe_stride)) + + # backward-time propgation + feat_buffers = [] + sparse_feat_buffers_s1 = [] + sparse_feat_buffers_s2 = [] + sparse_feat_buffers_s3 = [] + index_feat_buffers_s1 = [] + feat_prop = lrs.new_zeros(n, self.mid_channels, h, w) + grid_y, grid_x = torch.meshgrid(torch.arange(0, h//self.stride), torch.arange(0, w//self.stride)) + location_update = torch.stack([grid_x,grid_y],dim=0).type_as(lrs).expand(n,-1,-1,-1) + for i in range(t - 1, -1, -1): + lr_curr = lrs[:, i, :, :, :] + lr_curr_feat = outputs[i] + if i < t - 1: # no warping required for the last timestep + flow = flows_backward[:, i, :, :, :] + feat_prop = flow_warp(feat_prop, flow.permute(0, 2, 3, 1),padding_mode='border') + + # update the location map + flow = F.adaptive_avg_pool2d(flow,(h//self.stride,w//self.stride))/self.stride + location_update = flow_warp(location_update, flow.permute(0, 2, 3, 1),padding_mode='border',interpolation="nearest")# n , 2t , h , w + + feat_buffer = torch.stack(feat_buffers, dim=1) + sparse_feat_buffer_s1 = torch.stack(sparse_feat_buffers_s1, dim=1) + sparse_feat_buffer_s2 = torch.stack(sparse_feat_buffers_s2, dim=1) + sparse_feat_buffer_s3 = torch.stack(sparse_feat_buffers_s3, dim=1) + index_feat_buffer_s1 = torch.stack(index_feat_buffers_s1, dim=1) + feat_prop = self.LTAM(lr_curr_feat,index_feat_buffer_s1,feat_prop,sparse_feat_buffer_s1,sparse_feat_buffer_s2,sparse_feat_buffer_s3,location_update) + + # init the location map + if i in keyframe_idx_backward: + location_update = 
torch.cat([location_update,torch.stack([grid_x,grid_y],dim=0).type_as(lrs).expand(n,-1,-1,-1)],dim=1) # n , 2t , h , w + feat_prop = torch.cat([lr_curr_feat,feat_prop], dim=1) + feat_prop = self.resblocks(feat_prop) + feat_buffers.append(feat_prop) + if i in keyframe_idx_backward: + + # feature tokenization *4 + # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) + sparse_feat_prop_s1 = F.unfold(feat_prop, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) + # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 + sparse_feat_prop_s1 = F.fold(sparse_feat_prop_s1, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) + sparse_feat_buffers_s1.append(sparse_feat_prop_s1) + + # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) + index_feat_prop_s1 = F.unfold(lr_curr_feat, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) + # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 + index_feat_prop_s1 = F.fold(index_feat_prop_s1, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) + index_feat_buffers_s1.append(index_feat_prop_s1) + + # feature tokenization *6 + # bs * c * h * w --> # bs * (c*6*6) * (h//4*w//4) + sparse_feat_prop_s2 = F.unfold(feat_prop, kernel_size=(int(1.5*self.stride),int(1.5*self.stride)), padding=int(0.25*self.stride), stride=self.stride) + # bs * (c*6*6) * (h//4*w//4) --> bs * c * (h*1.5) * (w*1.5) + sparse_feat_prop_s2 = F.fold(sparse_feat_prop_s2, output_size=(int(1.5*h),int(1.5*w)), kernel_size=(int(1.5*self.stride),int(1.5*self.stride)), padding=0, stride=int(1.5*self.stride)) + # bs * c * (h*1.5) * (w*1.5) --> bs * c * h * w + sparse_feat_prop_s2 = F.adaptive_avg_pool2d(sparse_feat_prop_s2,(h,w)) + # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) + sparse_feat_prop_s2 = F.unfold(sparse_feat_prop_s2, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) + # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 + sparse_feat_prop_s2 = F.fold(sparse_feat_prop_s2, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) + sparse_feat_buffers_s2.append(sparse_feat_prop_s2) + + # feature tokenization * 8 + # bs * c * h * w --> # bs * (c*8*8) * (h//4*w//4) + sparse_feat_prop_s3 = F.unfold(feat_prop, kernel_size=(int(2*self.stride),int(2*self.stride)), padding=int(0.5*self.stride), stride=self.stride) + # bs * (c*8*8) * (h//4*w//4) --> bs * c * (h*2) * (w*2) + sparse_feat_prop_s3 = F.fold(sparse_feat_prop_s3, output_size=(int(2*h),int(2*w)), kernel_size=(int(2*self.stride),int(2*self.stride)), padding=0, stride=int(2*self.stride)) + # bs * c * (h*2) * (w*2) --> bs * c * h * w + sparse_feat_prop_s3 = F.adaptive_avg_pool2d(sparse_feat_prop_s3,(h,w)) + # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) + sparse_feat_prop_s3 = F.unfold(sparse_feat_prop_s3, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) + # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 + sparse_feat_prop_s3 = F.fold(sparse_feat_prop_s3, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) + sparse_feat_buffers_s3.append(sparse_feat_prop_s3) + + outputs_back = feat_buffers[::-1] + del location_update + del feat_buffers + del sparse_feat_buffers_s1 + del sparse_feat_buffers_s2 + del sparse_feat_buffers_s3 + del index_feat_buffers_s1 + + # forward-time propagation and upsampling + fina_out = [] + feat_buffers = [] + sparse_feat_buffers_s1 = [] + sparse_feat_buffers_s2 = [] + sparse_feat_buffers_s3 = 
[] + index_feat_buffers_s1 = [] + + feat_prop = torch.zeros_like(feat_prop) + grid_y, grid_x = torch.meshgrid(torch.arange(0, h//self.stride), torch.arange(0, w//self.stride)) + location_update = torch.stack([grid_x,grid_y],dim=0).type_as(lrs).expand(n,-1,-1,-1) + for i in range(0, t): + lr_curr = lrs[:, i, :, :, :] + lr_curr_feat = outputs[i] + if i > 0: # no warping required for the first timestep + if flows_forward is not None: + flow = flows_forward[:, i - 1, :, :, :] + else: + flow = flows_backward[:, -i, :, :, :] + feat_prop = flow_warp(feat_prop, flow.permute(0, 2, 3, 1),padding_mode='border') + + # update the location map + flow = F.adaptive_avg_pool2d(flow,(h//self.stride,w//self.stride))/self.stride + location_update = flow_warp(location_update, flow.permute(0, 2, 3, 1),padding_mode='border',interpolation="nearest")# n , 2t , h , w + + feat_buffer = torch.stack(feat_buffers, dim=1) + sparse_feat_buffer_s1 = torch.stack(sparse_feat_buffers_s1, dim=1) + sparse_feat_buffer_s2 = torch.stack(sparse_feat_buffers_s2, dim=1) + sparse_feat_buffer_s3 = torch.stack(sparse_feat_buffers_s3, dim=1) + index_feat_buffer_s1 = torch.stack(index_feat_buffers_s1, dim=1) + feat_prop = self.LTAM(lr_curr_feat,index_feat_buffer_s1,feat_prop,sparse_feat_buffer_s1,sparse_feat_buffer_s2,sparse_feat_buffer_s3,location_update) + + # init the location map + if i in keyframe_idx_forward: + location_update = torch.cat([location_update,torch.stack([grid_x,grid_y],dim=0).type_as(lrs).expand(n,-1,-1,-1)],dim=1) + feat_prop = torch.cat([outputs[i], feat_prop], dim=1) + feat_prop = self.resblocks(feat_prop) + feat_buffers.append(feat_prop) + + if i in keyframe_idx_forward: + # feature tokenization *4 + # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) + sparse_feat_prop_s1 = F.unfold(feat_prop, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) + # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 + sparse_feat_prop_s1 = F.fold(sparse_feat_prop_s1, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) + sparse_feat_buffers_s1.append(sparse_feat_prop_s1) + + # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) + index_feat_prop_s1 = F.unfold(lr_curr_feat, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) + # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 + index_feat_prop_s1 = F.fold(index_feat_prop_s1, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) + index_feat_buffers_s1.append(index_feat_prop_s1) + + + # feature tokenization *6 + # bs * c * h * w --> # bs * (c*6*6) * (h//4*w//4) + sparse_feat_prop_s2 = F.unfold(feat_prop, kernel_size=(int(1.5*self.stride),int(1.5*self.stride)), padding=int(0.25*self.stride), stride=self.stride) + # bs * (c*6*6) * (h//4*w//4) --> bs * c * (h*1.5) * (w*1.5) + sparse_feat_prop_s2 = F.fold(sparse_feat_prop_s2, output_size=(int(1.5*h),int(1.5*w)), kernel_size=(int(1.5*self.stride),int(1.5*self.stride)), padding=0, stride=int(1.5*self.stride)) + # bs * c * (h*1.5) * (w*1.5) --> bs * c * h * w + sparse_feat_prop_s2 = F.adaptive_avg_pool2d(sparse_feat_prop_s2,(h,w)) + # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) + sparse_feat_prop_s2 = F.unfold(sparse_feat_prop_s2, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) + # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 + sparse_feat_prop_s2 = F.fold(sparse_feat_prop_s2, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) + 
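Editor's note on the unfold/fold pairs used throughout this loop: they implement the "feature tokenization" described in the inline comments, turning every stride x stride patch of an (n, c, h, w) feature map into a single token of length c*stride*stride laid out on an (h//stride, w//stride) grid. A minimal standalone sketch of that shape transformation, assuming stride=4 and a 64-channel map (tensor names and sizes here are illustrative, not taken from the model):

    import torch
    import torch.nn.functional as F

    n, c, h, w, stride = 2, 64, 64, 64, 4
    feat = torch.randn(n, c, h, w)

    # (n, c, h, w) -> (n, c*stride*stride, (h//stride)*(w//stride)): one column per patch
    tokens = F.unfold(feat, kernel_size=stride, stride=stride)
    assert tokens.shape == (n, c * stride * stride, (h // stride) * (w // stride))

    # fold with a 1x1 kernel only rearranges the token columns onto the coarse grid
    token_map = F.fold(tokens, output_size=(h // stride, w // stride), kernel_size=1)
    assert token_map.shape == (n, c * stride * stride, h // stride, w // stride)

The 1x1 fold is effectively a reshape: tokens.view(n, c * stride * stride, h // stride, w // stride) produces the same layout. The larger-kernel variants in this hunk (the *6 and *8 branches) additionally extract overlapping patches, fold them back at 1.5x or 2x resolution, and average down with adaptive_avg_pool2d before re-tokenizing at the base stride.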
sparse_feat_buffers_s2.append(sparse_feat_prop_s2) + + + # feature tokenization *8 + # bs * c * h * w --> # bs * (c*8*8) * (h//4*w//4) + sparse_feat_prop_s3 = F.unfold(feat_prop, kernel_size=(int(2*self.stride),int(2*self.stride)), padding=int(0.5*self.stride), stride=self.stride) + # bs * (c*8*8) * (h//4*w//4) --> bs * c * (h*2) * (w*2) + sparse_feat_prop_s3 = F.fold(sparse_feat_prop_s3, output_size=(int(2*h),int(2*w)), kernel_size=(int(2*self.stride),int(2*self.stride)), padding=0, stride=int(2*self.stride)) + # bs * c * (h*2) * (w*2) --> bs * c * h * w + sparse_feat_prop_s3 = F.adaptive_avg_pool2d(sparse_feat_prop_s3,(h,w)) + # bs * c * h * w --> # bs * (c*4*4) * (h//4*w//4) + sparse_feat_prop_s3 = F.unfold(sparse_feat_prop_s3, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) + # bs * (c*4*4) * (h//4*w//4) --> bs * (c*4*4) * h//4 * w//4 + sparse_feat_prop_s3 = F.fold(sparse_feat_prop_s3, output_size=(h//self.stride,w//self.stride), kernel_size=(1,1), padding=0, stride=1) + sparse_feat_buffers_s3.append(sparse_feat_prop_s3) + + # upsampling given the backward and forward features + out = torch.cat([outputs_back[i],lr_curr_feat,feat_prop], dim=1) + out = self.lrelu(self.fusion(out)) + out = self.lrelu(self.upsample1(out)) + out = self.lrelu(self.upsample2(out)) + out = self.lrelu(self.conv_hr(out)) + out = self.conv_last(out) + base = self.img_upsample(lr_curr) + out += base + if to_cpu: + fina_out.append(out.cpu()) + else: + fina_out.append(out) + del location_update + del sparse_feat_buffers_s1 + del sparse_feat_buffers_s2 + del sparse_feat_buffers_s3 + del index_feat_buffers_s1 + return torch.stack(fina_out, dim=1) + + def init_weights(self, pretrained=None, strict=True): + """Init weights for models. + + Args: + pretrained (str, optional): Path for pretrained weights. If given + None, pretrained weights will not be loaded. Defaults: None. + strict (boo, optional): Whether strictly load the pretrained model. + Defaults to True. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=strict, logger=logger) + elif pretrained is not None: + raise TypeError(f'"pretrained" must be a str or None. ' + f'But received {type(pretrained)}.') + + +class LTAM(nn.Module): + def __init__(self, stride=4): + super().__init__() + + self.stride = stride + self.fusion = nn.Conv2d(3 * 64, 64, 3, 1, 1, bias=True) + def forward(self, curr_feat, index_feat_set_s1 , anchor_feat, sparse_feat_set_s1 ,sparse_feat_set_s2, sparse_feat_set_s3, location_feat): + """Compute the long-range trajectory-aware attention. + + Args: + anchor_feat (tensor): Input feature with shape (n, c, h, w) + sparse_feat_set_s1 (tensor): Input tokens with shape (n, t, c*4*4, h//4, w//4) + sparse_feat_set_s2 (tensor): Input tokens with shape (n, t, c*4*4, h//4, w//4) + sparse_feat_set_s3 (tensor): Input tokens with shape (n, t, c*4*4, h//4, w//4) + location_feat (tensor): Input location map with shape (n, 2*t, h//4, w//4) + + Return: + fusion_feature (tensor): Output fusion feature with shape (n, c, h, w). 
+ """ + + n, c, h, w = anchor_feat.size() + t = sparse_feat_set_s1.size(1) + feat_len = int(c*self.stride*self.stride) + feat_num = int((h//self.stride) * (w//self.stride)) + + # grid_flow [0,h-1][0,w-1] -> [-1,1][-1,1] + grid_flow = location_feat.contiguous().view(n,t,2,h//self.stride,w//self.stride).permute(0, 1, 3, 4, 2) + grid_flow_x = 2.0 * grid_flow[:, :, :, :, 0] / max(w//self.stride - 1, 1) - 1.0 + grid_flow_y = 2.0 * grid_flow[:, :, :, :, 1] / max(h//self.stride - 1, 1) - 1.0 + grid_flow = torch.stack((grid_flow_x, grid_flow_y), dim=4) + + output_s1 = F.grid_sample(sparse_feat_set_s1.contiguous().view(-1,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)),grid_flow.contiguous().view(-1,(h//self.stride),(w//self.stride),2),mode='nearest',padding_mode='zeros',align_corners=True) # (nt) * (c*4*4) * (h//4) * (w//4) + output_s2 = F.grid_sample(sparse_feat_set_s2.contiguous().view(-1,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)),grid_flow.contiguous().view(-1,(h//self.stride),(w//self.stride),2),mode='nearest',padding_mode='zeros',align_corners=True) # (nt) * (c*4*4) * (h//4) * (w//4) + output_s3 = F.grid_sample(sparse_feat_set_s3.contiguous().view(-1,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)),grid_flow.contiguous().view(-1,(h//self.stride),(w//self.stride),2),mode='nearest',padding_mode='zeros',align_corners=True) # (nt) * (c*4*4) * (h//4) * (w//4) + + index_output_s1 = F.grid_sample(index_feat_set_s1.contiguous().view(-1,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)),grid_flow.contiguous().view(-1,(h//self.stride),(w//self.stride),2),mode='nearest',padding_mode='zeros',align_corners=True) # (nt) * (c*4*4) * (h//4) * (w//4) + # n * c * h * w --> # n * (c*4*4) * (h//4*w//4) + curr_feat = F.unfold(curr_feat, kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) + # n * (c*4*4) * (h//4*w//4) --> n * (h//4*w//4) * (c*4*4) + curr_feat = curr_feat.permute(0, 2, 1) + curr_feat = F.normalize(curr_feat, dim=2).unsqueeze(3) # n * (h//4*w//4) * (c*4*4) * 1 + + # cross-scale attention * 4 + # n * t * (c*4*4) * h//4 * w//4 --> nt * (c*4*4) * h//4 * w//4 + index_output_s1 = index_output_s1.contiguous().view(n*t,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)) + # nt * (c*4*4) * h//4 * w//4 --> n * t * (c*4*4) * (h//4*w//4) + index_output_s1 = F.unfold(index_output_s1, kernel_size=(1, 1), padding=0, stride=1).view(n,-1,feat_len,feat_num) + # n * t * (c*4*4) * (h//4*w//4) --> n * (h//4*w//4) * t * (c*4*4) + index_output_s1 = index_output_s1.permute(0, 3, 1, 2) + index_output_s1 = F.normalize(index_output_s1, dim=3) # n * (h//4*w//4) * t * (c*4*4) + # [ n * (h//4*w//4) * t * (c*4*4) ] * [ n * (h//4*w//4) * (c*4*4) * 1 ] --> n * (h//4*w//4) * t + matrix_index = torch.matmul(index_output_s1, curr_feat).squeeze(3) # n * (h//4*w//4) * t + matrix_index = matrix_index.view(n,feat_num,t)# n * (h//4*w//4) * t + corr_soft, corr_index = torch.max(matrix_index, dim=2)# n * (h//4*w//4) + # n * (h//4*w//4) --> n * (c*4*4) * (h//4*w//4) + corr_soft = corr_soft.unsqueeze(1).expand(-1,feat_len,-1) + # n * (c*4*4) * (h//4*w//4) --> n * c * h * w + corr_soft = F.fold(corr_soft, output_size=(h,w), kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) + + # Aggr + # n * t * (c*4*4) * h//4 * w//4 --> nt * (c*4*4) * h//4 * w//4 + output_s1 = output_s1.contiguous().view(n*t,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)) + # nt * (c*4*4) * h//4 * w//4 --> n * t * (c*4*4) * (h//4*w//4) + output_s1 = 
F.unfold(output_s1, kernel_size=(1, 1), padding=0, stride=1).view(n,-1,feat_len,feat_num) + # n * t * (c*4*4) * (h//4*w//4) --> n * 1 * (c*4*4) * (h//4*w//4) + output_s1 = torch.gather(output_s1.contiguous().view(n,t,feat_len,feat_num), 1, corr_index.view(n,1,1,feat_num).expand(-1,-1,feat_len,-1))# n * 1 * (c*4*4) * (h//4*w//4) + # n * 1 * (c*4*4) * (h//4*w//4) --> n * (c*4*4) * (h//4*w//4) + output_s1 = output_s1.squeeze(1) + # n * (c*4*4) * (h//4*w//4) --> n * c * h * w + output_s1 = F.fold(output_s1, output_size=(h,w), kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) + + # Aggr + # n * t * (c*4*4) * h//4 * w//4 --> nt * (c*4*4) * h//4 * w//4 + output_s2 = output_s2.contiguous().view(n*t,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)) + # nt * (c*4*4) * h//4 * w//4 --> n * t * (c*4*4) * (h//4*w//4) + output_s2 = F.unfold(output_s2, kernel_size=(1, 1), padding=0, stride=1).view(n,-1,feat_len,feat_num) + # n * t * (c*4*4) * (h//4*w//4) --> n * 1 * (c*4*4) * (h//4*w//4) + output_s2 = torch.gather(output_s2.contiguous().view(n,t,feat_len,feat_num), 1, corr_index.view(n,1,1,feat_num).expand(-1,-1,feat_len,-1))# n * 1 * (c*4*4) * (h//4*w//4) + # n * 1 * (c*4*4) * (h//4*w//4) --> n * (c*4*4) * (h//4*w//4) + output_s2 = output_s2.squeeze(1) + # n * (c*4*4) * (h//4*w//4) --> n * c * h * w + output_s2 = F.fold(output_s2, output_size=(h,w), kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) + + # Aggr + # n * t * (c*4*4) * h//4 * w//4 --> nt * (c*4*4) * h//4 * w//4 + output_s3 = output_s3.contiguous().view(n*t,(c*self.stride*self.stride),(h//self.stride),(w//self.stride)) + # nt * (c*4*4) * h//4 * w//4 --> n * t * (c*4*4) * (h//4*w//4) + output_s3 = F.unfold(output_s3, kernel_size=(1, 1), padding=0, stride=1).view(n,-1,feat_len,feat_num) + # n * t * (c*4*4) * (h//4*w//4) --> n * 1 * (c*4*4) * (h//4*w//4) + output_s3 = torch.gather(output_s3.contiguous().view(n,t,feat_len,feat_num), 1, corr_index.view(n,1,1,feat_num).expand(-1,-1,feat_len,-1))# n * 1 * (c*4*4) * (h//4*w//4) + # n * 1 * (c*4*4) * (h//4*w//4) --> n * (c*4*4) * (h//4*w//4) + output_s3 = output_s3.squeeze(1) + # n * (c*4*4) * (h//4*w//4) --> n * c * h * w + output_s3 = F.fold(output_s3, output_size=(h,w), kernel_size=(self.stride,self.stride), padding=0, stride=self.stride) + + out = torch.cat([output_s1,output_s2,output_s3], dim=1) + out = self.fusion(out) + out = out * corr_soft + out += anchor_feat + return out + + + +class ResidualBlocksWithInputConv(nn.Module): + """Residual blocks with a convolution in front. + + Args: + in_channels (int): Number of input channels of the first conv. + out_channels (int): Number of channels of the residual blocks. + Default: 64. + num_blocks (int): Number of residual blocks. Default: 30. + """ + + def __init__(self, in_channels, out_channels=64, num_blocks=30): + super().__init__() + + main = [] + + # a convolution used to match the channels of the residual blocks + main.append(nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=True)) + main.append(nn.LeakyReLU(negative_slope=0.1, inplace=True)) + + # residual blocks + main.append( + make_layer( + ResidualBlockNoBN, num_blocks, mid_channels=out_channels)) + + self.main = nn.Sequential(*main) + + def forward(self, feat): + """ + Forward function for ResidualBlocksWithInputConv. 
+ + Args: + feat (Tensor): Input feature with shape (n, in_channels, h, w) + + Returns: + Tensor: Output feature with shape (n, out_channels, h, w) + """ + return self.main(feat) + + +class SPyNet(nn.Module): + """SPyNet network structure. + + The difference to the SPyNet in [tof.py] is that + 1. more SPyNetBasicModule is used in this version, and + 2. no batch normalization is used in this version. + + Paper: + Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017 + + Args: + pretrained (str): path for pre-trained SPyNet. Default: None. + """ + + def __init__(self, pretrained): + super().__init__() + + self.basic_module = nn.ModuleList( + [SPyNetBasicModule() for _ in range(6)]) + + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=True, logger=logger) + elif pretrained is not None: + raise TypeError('[pretrained] should be str or None, ' + f'but got {type(pretrained)}.') + + self.register_buffer( + 'mean', + torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)) + self.register_buffer( + 'std', + torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)) + + def compute_flow(self, ref, supp): + """Compute flow from ref to supp. + + Note that in this function, the images are already resized to a + multiple of 32. + + Args: + ref (Tensor): Reference image with shape of (n, 3, h, w). + supp (Tensor): Supporting image with shape of (n, 3, h, w). + + Returns: + Tensor: Estimated optical flow: (n, 2, h, w). + """ + n, _, h, w = ref.size() + + # normalize the input images + ref = [(ref - self.mean) / self.std] + supp = [(supp - self.mean) / self.std] + + # generate downsampled frames + for level in range(5): + ref.append( + F.avg_pool2d( + input=ref[-1], + kernel_size=2, + stride=2, + count_include_pad=False)) + supp.append( + F.avg_pool2d( + input=supp[-1], + kernel_size=2, + stride=2, + count_include_pad=False)) + ref = ref[::-1] + supp = supp[::-1] + + # flow computation + flow = ref[0].new_zeros(n, 2, h // 32, w // 32) + for level in range(len(ref)): + if level == 0: + flow_up = flow + else: + flow_up = F.interpolate( + input=flow, + scale_factor=2, + mode='bilinear', + align_corners=True) * 2.0 + + # add the residue to the upsampled flow + flow = flow_up + self.basic_module[level]( + torch.cat([ + ref[level], + flow_warp( + supp[level], + flow_up.permute(0, 2, 3, 1), + padding_mode='border'), flow_up + ], 1)) + + return flow + + def forward(self, ref, supp): + """Forward function of SPyNet. + + This function computes the optical flow from ref to supp. + + Args: + ref (Tensor): Reference image with shape of (n, 3, h, w). + supp (Tensor): Supporting image with shape of (n, 3, h, w). + + Returns: + Tensor: Estimated optical flow: (n, 2, h, w). + """ + + # upsize to a multiple of 32 + h, w = ref.shape[2:4] + w_up = w if (w % 32) == 0 else 32 * (w // 32 + 1) + h_up = h if (h % 32) == 0 else 32 * (h // 32 + 1) + ref = F.interpolate( + input=ref, size=(h_up, w_up), mode='bilinear', align_corners=False) + supp = F.interpolate( + input=supp, + size=(h_up, w_up), + mode='bilinear', + align_corners=False) + + # compute flow, and resize back to the original resolution + flow = F.interpolate( + input=self.compute_flow(ref, supp), + size=(h, w), + mode='bilinear', + align_corners=False) + + # adjust the flow values + flow[:, 0, :, :] *= float(w) / float(w_up) + flow[:, 1, :, :] *= float(h) / float(h_up) + + return flow + + +class SPyNetBasicModule(nn.Module): + """Basic Module for SPyNet. 
+ + Paper: + Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017 + """ + + def __init__(self): + super().__init__() + + self.basic_module = nn.Sequential( + ConvModule( + in_channels=8, + out_channels=32, + kernel_size=7, + stride=1, + padding=3, + norm_cfg=None, + act_cfg=dict(type='ReLU')), + ConvModule( + in_channels=32, + out_channels=64, + kernel_size=7, + stride=1, + padding=3, + norm_cfg=None, + act_cfg=dict(type='ReLU')), + ConvModule( + in_channels=64, + out_channels=32, + kernel_size=7, + stride=1, + padding=3, + norm_cfg=None, + act_cfg=dict(type='ReLU')), + ConvModule( + in_channels=32, + out_channels=16, + kernel_size=7, + stride=1, + padding=3, + norm_cfg=None, + act_cfg=dict(type='ReLU')), + ConvModule( + in_channels=16, + out_channels=2, + kernel_size=7, + stride=1, + padding=3, + norm_cfg=None, + act_cfg=None)) + + def forward(self, tensor_input): + """ + Args: + tensor_input (Tensor): Input tensor with shape (b, 8, h, w). + 8 channels contain: + [reference image (3), neighbor image (3), initial flow (2)]. + + Returns: + Tensor: Refined flow with shape (b, 2, h, w) + """ + return self.basic_module(tensor_input) + + +def flow_warp(x, + flow, + interpolation='bilinear', + padding_mode='zeros', + align_corners=True): + """Warp an image or a feature map with optical flow. + + Args: + x (Tensor): Tensor with size (n, c, h, w). + flow (Tensor): Tensor with size (n, h, w, 2). The last dimension is + a two-channel, denoting the width and height relative offsets. + Note that the values are not normalized to [-1, 1]. + interpolation (str): Interpolation mode: 'nearest' or 'bilinear'. + Default: 'bilinear'. + padding_mode (str): Padding mode: 'zeros' or 'border' or 'reflection'. + Default: 'zeros'. + align_corners (bool): Whether align corners. Default: True. + + Returns: + Tensor: Warped image or feature map. + """ + if x.size()[-2:] != flow.size()[1:3]: + raise ValueError(f'The spatial sizes of input ({x.size()[-2:]}) and ' + f'flow ({flow.size()[1:3]}) are not the same.') + _, _, h, w = x.size() + # create mesh grid + grid_y, grid_x = torch.meshgrid(torch.arange(0, h), torch.arange(0, w)) + grid = torch.stack((grid_x, grid_y), 2).type_as(x) # (w, h, 2) + grid.requires_grad = False + + grid_flow = grid + flow + # scale grid_flow to [-1,1] + grid_flow_x = 2.0 * grid_flow[:, :, :, 0] / max(w - 1, 1) - 1.0 + grid_flow_y = 2.0 * grid_flow[:, :, :, 1] / max(h - 1, 1) - 1.0 + grid_flow = torch.stack((grid_flow_x, grid_flow_y), dim=3) + output = F.grid_sample( + x, + grid_flow, + mode=interpolation, + padding_mode=padding_mode, + align_corners=align_corners) + return output diff --git a/cv/super_resolution/ttvsr/pytorch/mmedit/models/restorers/ttvsr.py b/cv/super_resolution/ttvsr/pytorch/mmedit/models/restorers/ttvsr.py index 02eeeea09..f36384086 100755 --- a/cv/super_resolution/ttvsr/pytorch/mmedit/models/restorers/ttvsr.py +++ b/cv/super_resolution/ttvsr/pytorch/mmedit/models/restorers/ttvsr.py @@ -1,205 +1,205 @@ -import numbers -import os.path as osp - -import mmcv -import numpy as np -import torch - -from mmedit.core import tensor2img -from ..registry import MODELS -from .basic_restorer import BasicRestorer - - -@MODELS.register_module() -class TTVSR(BasicRestorer): - """TTVSR - - Paper: - Learning Trajectory-Aware Transformer for Video Super-Resolution, CVPR, 2022 - - Args: - generator (dict): Config for the generator structure. - pixel_loss (dict): Config for pixel-wise loss. - train_cfg (dict): Config for training. Default: None. 
- test_cfg (dict): Config for testing. Default: None. - pretrained (str): Path for pretrained model. Default: None. - """ - - def __init__(self, - generator, - pixel_loss, - train_cfg=None, - test_cfg=None, - pretrained=None): - super().__init__(generator, pixel_loss, train_cfg, test_cfg, - pretrained) - - # fix pre-trained networks - self.fix_iter = train_cfg.get('fix_iter', 0) if train_cfg else 0 - self.is_weight_fixed = False - - # count training steps - self.register_buffer('step_counter', torch.zeros(1)) - - def check_if_mirror_extended(self, lrs): - """Check whether the input is a mirror-extended sequence. - - If mirror-extended, the i-th (i=0, ..., t-1) frame is equal to the - (t-1-i)-th frame. - - Args: - lrs (tensor): Input LR images with shape (n, t, c, h, w) - """ - - is_mirror_extended = False - if lrs.size(1) % 2 == 0: - lrs_1, lrs_2 = torch.chunk(lrs, 2, dim=1) - if torch.norm(lrs_1 - lrs_2.flip(1)) == 0: - is_mirror_extended = True - - return is_mirror_extended - - def train_step(self, data_batch, optimizer): - """Train step. - - Args: - data_batch (dict): A batch of data. - optimizer (obj): Optimizer. - - Returns: - dict: Returned output. - """ - # fix SPyNet and EDVR at the beginning - if self.step_counter < self.fix_iter: - if not self.is_weight_fixed: - self.is_weight_fixed = True - for k, v in self.generator.named_parameters(): - if 'spynet' in k or 'edvr' in k: - v.requires_grad_(False) - elif self.step_counter == self.fix_iter: - # train all the parameters - self.generator.requires_grad_(True) - - outputs = self(**data_batch, test_mode=False) - loss, log_vars = self.parse_losses(outputs.pop('losses')) - - # optimize - optimizer['generator'].zero_grad() - loss.backward() - optimizer['generator'].step() - - self.step_counter += 1 - - outputs.update({'log_vars': log_vars}) - return outputs - - def evaluate(self, output, gt): - """Evaluation function. - - If the output contains multiple frames, we compute the metric - one by one and take an average. - - Args: - output (Tensor): Model output with shape (n, t, c, h, w). - gt (Tensor): GT Tensor with shape (n, t, c, h, w). - - Returns: - dict: Evaluation results. - """ - crop_border = self.test_cfg.crop_border - convert_to = self.test_cfg.get('convert_to', None) - - eval_result = dict() - for metric in self.test_cfg.metrics: - if output.ndim == 5: # a sequence: (n, t, c, h, w) - avg = [] - for i in range(0, output.size(1)): - output_i = tensor2img(output[:, i, :, :, :]) - gt_i = tensor2img(gt[:, i, :, :, :]) - avg.append(self.allowed_metrics[metric]( - output_i, gt_i, crop_border, convert_to=convert_to)) - eval_result[metric] = np.mean(avg) - elif output.ndim == 4: # an image: (n, c, t, w), for Vimeo-90K-T - output_img = tensor2img(output) - gt_img = tensor2img(gt) - value = self.allowed_metrics[metric]( - output_img, gt_img, crop_border, convert_to=convert_to) - eval_result[metric] = value - - return eval_result - - def forward_test(self, - lq, - gt=None, - meta=None, - save_image=False, - save_path=None, - iteration=None): - """Testing forward function. - - Args: - lq (Tensor): LQ Tensor with shape (n, t, c, h, w). - gt (Tensor): GT Tensor with shape (n, t, c, h, w). Default: None. - save_image (bool): Whether to save image. Default: False. - save_path (str): Path to save image. Default: None. - iteration (int): Iteration for the saving image name. - Default: None. - - Returns: - dict: Output results. - """ - with torch.no_grad(): - output = self.generator(lq) - - # If the GT is an image (i.e. 
the center frame), the output sequence is - # turned to an image. - if gt is not None and gt.ndim == 4: - t = output.size(1) - if self.check_if_mirror_extended(lq): # with mirror extension - output = 0.25 * (output[:, t // 8] + output[:, t // 8 + 7] + output[:, -1 - t // 8 - 7] + output[:, -1 - t // 8]) - else: # without mirror extension - output = output[:, t // 2] - - if self.test_cfg is not None and self.test_cfg.get('metrics', None): - assert gt is not None, ( - 'evaluation with metrics must have gt images.') - results = dict(eval_result=self.evaluate(output, gt)) - else: - results = dict(lq=lq.cpu(), output=output.cpu()) - if gt is not None: - results['gt'] = gt.cpu() - - # save image - if save_image: - if output.ndim == 4: # an image, key = 000001/0000 (Vimeo-90K) - img_name = meta[0]['key'].replace('/', '_') - if isinstance(iteration, numbers.Number): - save_path = osp.join( - save_path, f'{img_name}-{iteration + 1:06d}.png') - elif iteration is None: - save_path = osp.join(save_path, f'{img_name}.png') - else: - raise ValueError('iteration should be number or None, ' - f'but got {type(iteration)}') - mmcv.imwrite(tensor2img(output), save_path) - elif output.ndim == 5: # a sequence, key = 000 - folder_name = meta[0]['key'].split('/')[0] - for i in range(0, output.size(1)): - if isinstance(iteration, numbers.Number): - save_path_i = osp.join( - save_path, folder_name, - f'{i:08d}-{iteration + 1:06d}.png') - elif iteration is None: - # for vid4 test - # save_path_i = osp.join(save_path, folder_name, - # 'frame_'+f'{(i+2):04d}.png') - # for REDS test - save_path_i = osp.join(save_path, folder_name, - f'{i:08d}.png') - else: - raise ValueError('iteration should be number or None, ' - f'but got {type(iteration)}') - mmcv.imwrite( - tensor2img(output[:, i, :, :, :]), save_path_i) - - return results +import numbers +import os.path as osp + +import mmcv +import numpy as np +import torch + +from mmedit.core import tensor2img +from ..registry import MODELS +from .basic_restorer import BasicRestorer + + +@MODELS.register_module() +class TTVSR(BasicRestorer): + """TTVSR + + Paper: + Learning Trajectory-Aware Transformer for Video Super-Resolution, CVPR, 2022 + + Args: + generator (dict): Config for the generator structure. + pixel_loss (dict): Config for pixel-wise loss. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path for pretrained model. Default: None. + """ + + def __init__(self, + generator, + pixel_loss, + train_cfg=None, + test_cfg=None, + pretrained=None): + super().__init__(generator, pixel_loss, train_cfg, test_cfg, + pretrained) + + # fix pre-trained networks + self.fix_iter = train_cfg.get('fix_iter', 0) if train_cfg else 0 + self.is_weight_fixed = False + + # count training steps + self.register_buffer('step_counter', torch.zeros(1)) + + def check_if_mirror_extended(self, lrs): + """Check whether the input is a mirror-extended sequence. + + If mirror-extended, the i-th (i=0, ..., t-1) frame is equal to the + (t-1-i)-th frame. + + Args: + lrs (tensor): Input LR images with shape (n, t, c, h, w) + """ + + is_mirror_extended = False + if lrs.size(1) % 2 == 0: + lrs_1, lrs_2 = torch.chunk(lrs, 2, dim=1) + if torch.norm(lrs_1 - lrs_2.flip(1)) == 0: + is_mirror_extended = True + + return is_mirror_extended + + def train_step(self, data_batch, optimizer): + """Train step. + + Args: + data_batch (dict): A batch of data. + optimizer (obj): Optimizer. + + Returns: + dict: Returned output. 
+ """ + # fix SPyNet and EDVR at the beginning + if self.step_counter < self.fix_iter: + if not self.is_weight_fixed: + self.is_weight_fixed = True + for k, v in self.generator.named_parameters(): + if 'spynet' in k or 'edvr' in k: + v.requires_grad_(False) + elif self.step_counter == self.fix_iter: + # train all the parameters + self.generator.requires_grad_(True) + + outputs = self(**data_batch, test_mode=False) + loss, log_vars = self.parse_losses(outputs.pop('losses')) + + # optimize + optimizer['generator'].zero_grad() + loss.backward() + optimizer['generator'].step() + + self.step_counter += 1 + + outputs.update({'log_vars': log_vars}) + return outputs + + def evaluate(self, output, gt): + """Evaluation function. + + If the output contains multiple frames, we compute the metric + one by one and take an average. + + Args: + output (Tensor): Model output with shape (n, t, c, h, w). + gt (Tensor): GT Tensor with shape (n, t, c, h, w). + + Returns: + dict: Evaluation results. + """ + crop_border = self.test_cfg.crop_border + convert_to = self.test_cfg.get('convert_to', None) + + eval_result = dict() + for metric in self.test_cfg.metrics: + if output.ndim == 5: # a sequence: (n, t, c, h, w) + avg = [] + for i in range(0, output.size(1)): + output_i = tensor2img(output[:, i, :, :, :]) + gt_i = tensor2img(gt[:, i, :, :, :]) + avg.append(self.allowed_metrics[metric]( + output_i, gt_i, crop_border, convert_to=convert_to)) + eval_result[metric] = np.mean(avg) + elif output.ndim == 4: # an image: (n, c, t, w), for Vimeo-90K-T + output_img = tensor2img(output) + gt_img = tensor2img(gt) + value = self.allowed_metrics[metric]( + output_img, gt_img, crop_border, convert_to=convert_to) + eval_result[metric] = value + + return eval_result + + def forward_test(self, + lq, + gt=None, + meta=None, + save_image=False, + save_path=None, + iteration=None): + """Testing forward function. + + Args: + lq (Tensor): LQ Tensor with shape (n, t, c, h, w). + gt (Tensor): GT Tensor with shape (n, t, c, h, w). Default: None. + save_image (bool): Whether to save image. Default: False. + save_path (str): Path to save image. Default: None. + iteration (int): Iteration for the saving image name. + Default: None. + + Returns: + dict: Output results. + """ + with torch.no_grad(): + output = self.generator(lq) + + # If the GT is an image (i.e. the center frame), the output sequence is + # turned to an image. 
+ if gt is not None and gt.ndim == 4: + t = output.size(1) + if self.check_if_mirror_extended(lq): # with mirror extension + output = 0.25 * (output[:, t // 8] + output[:, t // 8 + 7] + output[:, -1 - t // 8 - 7] + output[:, -1 - t // 8]) + else: # without mirror extension + output = output[:, t // 2] + + if self.test_cfg is not None and self.test_cfg.get('metrics', None): + assert gt is not None, ( + 'evaluation with metrics must have gt images.') + results = dict(eval_result=self.evaluate(output, gt)) + else: + results = dict(lq=lq.cpu(), output=output.cpu()) + if gt is not None: + results['gt'] = gt.cpu() + + # save image + if save_image: + if output.ndim == 4: # an image, key = 000001/0000 (Vimeo-90K) + img_name = meta[0]['key'].replace('/', '_') + if isinstance(iteration, numbers.Number): + save_path = osp.join( + save_path, f'{img_name}-{iteration + 1:06d}.png') + elif iteration is None: + save_path = osp.join(save_path, f'{img_name}.png') + else: + raise ValueError('iteration should be number or None, ' + f'but got {type(iteration)}') + mmcv.imwrite(tensor2img(output), save_path) + elif output.ndim == 5: # a sequence, key = 000 + folder_name = meta[0]['key'].split('/')[0] + for i in range(0, output.size(1)): + if isinstance(iteration, numbers.Number): + save_path_i = osp.join( + save_path, folder_name, + f'{i:08d}-{iteration + 1:06d}.png') + elif iteration is None: + # for vid4 test + # save_path_i = osp.join(save_path, folder_name, + # 'frame_'+f'{(i+2):04d}.png') + # for REDS test + save_path_i = osp.join(save_path, folder_name, + f'{i:08d}.png') + else: + raise ValueError('iteration should be number or None, ' + f'but got {type(iteration)}') + mmcv.imwrite( + tensor2img(output[:, i, :, :, :]), save_path_i) + + return results diff --git a/multimodal/vision-language_model/llava-1.5/pytorch/train.sh b/multimodal/vision-language_model/llava-1.5/pytorch/train.sh index b0dd8d64a..0178e7ea0 100644 --- a/multimodal/vision-language_model/llava-1.5/pytorch/train.sh +++ b/multimodal/vision-language_model/llava-1.5/pytorch/train.sh @@ -1,54 +1,54 @@ -#!/bin/bash - -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -export ENABLE_FLASH_ATTENTION_WITH_IXDNN=0 -export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1 -export PT_SDPA_CONVERT_ATTN_MASK_TO_INF=1 - -deepspeed llava/train/train_mem.py \ - --deepspeed ./scripts/zero2.json \ - --model_name_or_path checkpoints/llava-1.5-7b-hf \ - --version plain \ - --data_path data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ - --image_folder data/LLaVA-Pretrain/images \ - --vision_tower checkpoints/clip-vit-large-patch14 \ - --mm_projector_type mlp2x_gelu \ - --tune_mm_mlp_adapter True \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/llava-v1.5-13b-pretrain \ - --num_train_epochs 1 \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 24000 \ - --save_total_limit 1 \ - --learning_rate 1e-3 \ - --weight_decay 0. \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 False \ - --model_max_length 2048 \ - --gradient_checkpointing True \ - --dataloader_num_workers 4 \ - --lazy_preprocess True -# --report_to wandb +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +export ENABLE_FLASH_ATTENTION_WITH_IXDNN=0 +export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1 +export PT_SDPA_CONVERT_ATTN_MASK_TO_INF=1 + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path checkpoints/llava-1.5-7b-hf \ + --version plain \ + --data_path data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ + --image_folder data/LLaVA-Pretrain/images \ + --vision_tower checkpoints/clip-vit-large-patch14 \ + --mm_projector_type mlp2x_gelu \ + --tune_mm_mlp_adapter True \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-v1.5-13b-pretrain \ + --num_train_epochs 1 \ + --per_device_train_batch_size 32 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 24000 \ + --save_total_limit 1 \ + --learning_rate 1e-3 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True +# --report_to wandb diff --git a/nlp/dialogue_generation/cpm/pytorch/base/data_preprocessing/preprocess_stc_finetune.py b/nlp/dialogue_generation/cpm/pytorch/base/data_preprocessing/preprocess_stc_finetune.py index 032eb0331..fada1a8e9 100755 --- a/nlp/dialogue_generation/cpm/pytorch/base/data_preprocessing/preprocess_stc_finetune.py +++ b/nlp/dialogue_generation/cpm/pytorch/base/data_preprocessing/preprocess_stc_finetune.py @@ -1,48 +1,48 @@ -import json -import os -import argparse -from tqdm import tqdm - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument("--data_dir", default=None, type=str, help="The input dir of original ChID data.") - parser.add_argument("--output_dir", type=str, help="The processed data output dir.") - - args = parser.parse_args() - - # train - with open(os.path.join(args.data_dir, 'STC.json'), 'r', encoding='utf-8') as f: - raw_data = json.loads(f.read()) - raw_data = raw_data["train"] - - import random - random.seed(1234) - random.shuffle(raw_data) - - # with open(os.path.join(args.output_dir, 'train_all.txt'), 'w') as f_out: - # for pr_pair in tqdm(raw_data, desc="Building Train All"): - # f_out.write("对话上文:" + "".join(pr_pair[0].strip().split()) + " 回复:" + "".join(pr_pair[1].strip().split()) + "\n") - - with open(os.path.join(args.output_dir, 'train.txt'), 'w', encoding='utf-8') as f_out: - for pr_pair in tqdm(raw_data[:int(0.1 * len(raw_data))], desc="Building Train"): - f_out.write("对话上文:" + "".join(pr_pair[0].strip().split()) + " 回复:" + "".join(pr_pair[1].strip().split()) + "\n") - - # valid - with open(os.path.join(args.data_dir, 'STC.json'), 'r', encoding='utf-8') as f: - raw_data = json.loads(f.read()) - raw_data = raw_data["valid"] - - with open(os.path.join(args.output_dir, 'valid.txt'), 'w', encoding='utf-8') as f_out: - for pr_pair in tqdm(raw_data, desc="Building Valid"): - f_out.write("对话上文:" + "".join(pr_pair[0].strip().split()) + " 回复:" + "".join(pr_pair[1].strip().split()) + "\n") - - # # test - # with open(os.path.join(args.data_dir, 'STC_test.json'), 'r') as f: - # raw_data = json.loads(f.read()) - # raw_data = raw_data["test"] - - # with open(os.path.join(args.output_dir, 'test.txt'), 'w') as f_out: - # for pr_pair in tqdm(raw_data, desc="Building Test"): - # f_out.write("对话上文:" + "".join(pr_pair[0].strip().split()) + " 回复:" + "".join(pr_pair[1].strip().split()) + "\n") +import json +import os +import argparse +from tqdm import tqdm + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--data_dir", default=None, type=str, help="The input dir of original ChID data.") + parser.add_argument("--output_dir", type=str, help="The processed data output dir.") + + args = parser.parse_args() + + # train + with open(os.path.join(args.data_dir, 'STC.json'), 'r', encoding='utf-8') as f: + raw_data = json.loads(f.read()) + raw_data = raw_data["train"] + + import random + random.seed(1234) + random.shuffle(raw_data) + + # with open(os.path.join(args.output_dir, 'train_all.txt'), 'w') as f_out: + # for pr_pair in tqdm(raw_data, desc="Building Train All"): + # f_out.write("对话上文:" + "".join(pr_pair[0].strip().split()) + " 回复:" + "".join(pr_pair[1].strip().split()) + "\n") + + with open(os.path.join(args.output_dir, 'train.txt'), 'w', encoding='utf-8') as f_out: + for 
pr_pair in tqdm(raw_data[:int(0.1 * len(raw_data))], desc="Building Train"): + f_out.write("对话上文:" + "".join(pr_pair[0].strip().split()) + " 回复:" + "".join(pr_pair[1].strip().split()) + "\n") + + # valid + with open(os.path.join(args.data_dir, 'STC.json'), 'r', encoding='utf-8') as f: + raw_data = json.loads(f.read()) + raw_data = raw_data["valid"] + + with open(os.path.join(args.output_dir, 'valid.txt'), 'w', encoding='utf-8') as f_out: + for pr_pair in tqdm(raw_data, desc="Building Valid"): + f_out.write("对话上文:" + "".join(pr_pair[0].strip().split()) + " 回复:" + "".join(pr_pair[1].strip().split()) + "\n") + + # # test + # with open(os.path.join(args.data_dir, 'STC_test.json'), 'r') as f: + # raw_data = json.loads(f.read()) + # raw_data = raw_data["test"] + + # with open(os.path.join(args.output_dir, 'test.txt'), 'w') as f_out: + # for pr_pair in tqdm(raw_data, desc="Building Test"): + # f_out.write("对话上文:" + "".join(pr_pair[0].strip().split()) + " 回复:" + "".join(pr_pair[1].strip().split()) + "\n") diff --git a/nlp/dialogue_generation/cpm/pytorch/base/model/fp16/loss_scaler.py b/nlp/dialogue_generation/cpm/pytorch/base/model/fp16/loss_scaler.py index 90a99ce82..d686d0866 100755 --- a/nlp/dialogue_generation/cpm/pytorch/base/model/fp16/loss_scaler.py +++ b/nlp/dialogue_generation/cpm/pytorch/base/model/fp16/loss_scaler.py @@ -1,240 +1,240 @@ -# coding=utf-8 -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -# from models import mpu - -# item() is a recent addition, so this helps with backward compatibility. -def to_python_float(t): - if hasattr(t, 'item'): - return t.item() - else: - return t[0] - -class LossScaler: - """ - Class that manages a static loss scale. This class is intended to interact with - :class:`FP16_Optimizer`, and should not be directly manipulated by the user. - - Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to - :class:`FP16_Optimizer`'s constructor. - - Args: - scale (float, optional, default=1.0): The loss scale. - """ - - def __init__(self, scale=1): - self.cur_scale = scale - - # `params` is a list / generator of torch.Variable - def has_overflow(self, params): - return False - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - return False - - def update_scale(self, overflow): - pass - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - return tuple(self.loss_scale * g for g in grad_in) - - def backward(self, loss, retain_graph=False): - scaled_loss = loss*self.loss_scale - scaled_loss.backward(retain_graph=retain_graph) - -class DynamicLossScaler: - """ - Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` - indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of - :class:`FP16_Optimizer`. 
However, it's important to understand how :class:`DynamicLossScaler` - operates, because the default options can be changed using the - the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. - - Loss scaling is designed to combat the problem of underflowing gradients encountered at long - times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss - scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are - encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has - occurred. - :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, - and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. - If a certain number of iterations occur without overflowing gradients detected, - :class:`DynamicLossScaler` increases the loss scale once more. - In this way :class:`DynamicLossScaler` attempts to "ride the edge" of - always using the highest loss scale possible without incurring overflow. - - Args: - init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` - scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. - scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. - """ - - def __init__(self, - init_scale=2**32, - scale_factor=2., - scale_window=1000, - min_scale=1, - delayed_shift=1, - consecutive_hysteresis=False): - self.cur_scale = init_scale - self.cur_iter = 0 - self.last_overflow_iter = -1 - self.scale_factor = scale_factor - self.scale_window = scale_window - self.min_scale = min_scale - self.delayed_shift = delayed_shift - self.cur_hysteresis = delayed_shift - self.consecutive_hysteresis = consecutive_hysteresis - - # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params): - for p in params: - if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): - return True - - return False - - def has_overflow(self, params): - overflow = self.has_overflow_serial(params) - # Since each model parallel GPU carries only part of the model, - # make sure overflow flag is synced across all the model parallel GPUs - if torch.cuda.is_available(): - overflow_gpu = torch.cuda.ByteTensor([overflow]) - else: - overflow_gpu = torch.ByteTensor([overflow]) - # torch.distributed.all_reduce(overflow_gpu, - # op=torch.distributed.ReduceOp.MAX, - # group=mpu.get_model_parallel_group()) - overflow = overflow_gpu.item() - return bool(overflow) - - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. 
- if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - # `overflow` is boolean indicating whether the gradient overflowed - def update_scale(self, overflow): - - if not hasattr(self, 'min_scale'): - self.min_scale = 1 - if not hasattr(self, 'delayed_shift'): - self.delayed_shift = 1 - if not hasattr(self, 'cur_hysteresis'): - self.cur_hysteresis = 1 - if not hasattr(self, 'consecutive_hysteresis'): - self.consecutive_hysteresis = True - if overflow: - # self.cur_scale /= self.scale_factor - if self.delayed_shift == 1 or self.cur_hysteresis == 1: - self.cur_scale = max(self.cur_scale/self.scale_factor, self.min_scale) - else: - self.cur_hysteresis -= 1 - self.last_overflow_iter = self.cur_iter - else: - if self.consecutive_hysteresis: - self.cur_hysteresis = self.delayed_shift - if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: - if not self.consecutive_hysteresis: - self.cur_hysteresis = self.delayed_shift - self.cur_scale *= self.scale_factor - self.cur_iter += 1 - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - return tuple(self.loss_scale * g for g in grad_in) - - def backward(self, loss, retain_graph=False): - scaled_loss = loss*self.loss_scale - scaled_loss.backward(retain_graph=retain_graph) - -############################################################## -# Example usage below here -- assuming it's in a separate file -############################################################## -""" -TO-DO separate out into an example. -if __name__ == "__main__": - import torch - from torch.autograd import Variable - from dynamic_loss_scaler import DynamicLossScaler - - # N is batch size; D_in is input dimension; - # H is hidden dimension; D_out is output dimension. - N, D_in, H, D_out = 64, 1000, 100, 10 - - # Create random Tensors to hold inputs and outputs, and wrap them in Variables. - x = Variable(torch.randn(N, D_in), requires_grad=False) - y = Variable(torch.randn(N, D_out), requires_grad=False) - - w1 = Variable(torch.randn(D_in, H), requires_grad=True) - w2 = Variable(torch.randn(H, D_out), requires_grad=True) - parameters = [w1, w2] - - learning_rate = 1e-6 - optimizer = torch.optim.SGD(parameters, lr=learning_rate) - loss_scaler = DynamicLossScaler() - - for t in range(500): - y_pred = x.mm(w1).clamp(min=0).mm(w2) - loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale - print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) - print('Iter {} scaled loss: {}'.format(t, loss.data[0])) - print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) - - # Run backprop - optimizer.zero_grad() - loss.backward() - - # Check for overflow - has_overflow = DynamicLossScaler.has_overflow(parameters) - - # If no overflow, unscale grad and update as usual - if not has_overflow: - for param in parameters: - param.grad.data.mul_(1. / loss_scaler.loss_scale) - optimizer.step() - # Otherwise, don't do anything -- ie, skip iteration - else: - print('OVERFLOW!') - - # Update loss scale for next iteration - loss_scaler.update_scale(has_overflow) - -""" +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +# from models import mpu + +# item() is a recent addition, so this helps with backward compatibility. +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + +class LossScaler: + """ + Class that manages a static loss scale. This class is intended to interact with + :class:`FP16_Optimizer`, and should not be directly manipulated by the user. + + Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to + :class:`FP16_Optimizer`'s constructor. + + Args: + scale (float, optional, default=1.0): The loss scale. + """ + + def __init__(self, scale=1): + self.cur_scale = scale + + # `params` is a list / generator of torch.Variable + def has_overflow(self, params): + return False + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + return False + + def update_scale(self, overflow): + pass + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss*self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + +class DynamicLossScaler: + """ + Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` + indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of + :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` + operates, because the default options can be changed using the + the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. + + Loss scaling is designed to combat the problem of underflowing gradients encountered at long + times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss + scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are + encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has + occurred. + :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, + and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. + If a certain number of iterations occur without overflowing gradients detected, + :class:`DynamicLossScaler` increases the loss scale once more. + In this way :class:`DynamicLossScaler` attempts to "ride the edge" of + always using the highest loss scale possible without incurring overflow. + + Args: + init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` + scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. + scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 
+ """ + + def __init__(self, + init_scale=2**32, + scale_factor=2., + scale_window=1000, + min_scale=1, + delayed_shift=1, + consecutive_hysteresis=False): + self.cur_scale = init_scale + self.cur_iter = 0 + self.last_overflow_iter = -1 + self.scale_factor = scale_factor + self.scale_window = scale_window + self.min_scale = min_scale + self.delayed_shift = delayed_shift + self.cur_hysteresis = delayed_shift + self.consecutive_hysteresis = consecutive_hysteresis + + # `params` is a list / generator of torch.Variable + def has_overflow_serial(self, params): + for p in params: + if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): + return True + + return False + + def has_overflow(self, params): + overflow = self.has_overflow_serial(params) + # Since each model parallel GPU carries only part of the model, + # make sure overflow flag is synced across all the model parallel GPUs + if torch.cuda.is_available(): + overflow_gpu = torch.cuda.ByteTensor([overflow]) + else: + overflow_gpu = torch.ByteTensor([overflow]) + # torch.distributed.all_reduce(overflow_gpu, + # op=torch.distributed.ReduceOp.MAX, + # group=mpu.get_model_parallel_group()) + overflow = overflow_gpu.item() + return bool(overflow) + + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + try: + # if x is half, the .float() incurs an additional deep copy, but it's necessary if + # Pytorch's .sum() creates a one-element tensor of the same type as x + # (which is true for some recent version of pytorch). + cpu_sum = float(x.float().sum()) + # More efficient version that can be used if .sum() returns a Python scalar + # cpu_sum = float(x.sum()) + except RuntimeError as instance: + # We want to check if inst is actually an overflow exception. + # RuntimeError could come from a different error. + # If so, we still want the exception to propagate. + if "value cannot be converted" not in instance.args[0]: + raise + return True + else: + if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: + return True + return False + + # `overflow` is boolean indicating whether the gradient overflowed + def update_scale(self, overflow): + + if not hasattr(self, 'min_scale'): + self.min_scale = 1 + if not hasattr(self, 'delayed_shift'): + self.delayed_shift = 1 + if not hasattr(self, 'cur_hysteresis'): + self.cur_hysteresis = 1 + if not hasattr(self, 'consecutive_hysteresis'): + self.consecutive_hysteresis = True + if overflow: + # self.cur_scale /= self.scale_factor + if self.delayed_shift == 1 or self.cur_hysteresis == 1: + self.cur_scale = max(self.cur_scale/self.scale_factor, self.min_scale) + else: + self.cur_hysteresis -= 1 + self.last_overflow_iter = self.cur_iter + else: + if self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: + if not self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + self.cur_scale *= self.scale_factor + self.cur_iter += 1 + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss*self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + +############################################################## +# Example usage below here -- assuming it's in a separate file +############################################################## +""" +TO-DO separate out into an example. 
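One caveat worth noting before the commented-out example below:

# Note: in this file has_overflow(self, params) is an instance method, so the example
# below would need loss_scaler.has_overflow(parameters) rather than the class-level
# DynamicLossScaler.has_overflow(parameters) call it currently shows.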
+if __name__ == "__main__": + import torch + from torch.autograd import Variable + from dynamic_loss_scaler import DynamicLossScaler + + # N is batch size; D_in is input dimension; + # H is hidden dimension; D_out is output dimension. + N, D_in, H, D_out = 64, 1000, 100, 10 + + # Create random Tensors to hold inputs and outputs, and wrap them in Variables. + x = Variable(torch.randn(N, D_in), requires_grad=False) + y = Variable(torch.randn(N, D_out), requires_grad=False) + + w1 = Variable(torch.randn(D_in, H), requires_grad=True) + w2 = Variable(torch.randn(H, D_out), requires_grad=True) + parameters = [w1, w2] + + learning_rate = 1e-6 + optimizer = torch.optim.SGD(parameters, lr=learning_rate) + loss_scaler = DynamicLossScaler() + + for t in range(500): + y_pred = x.mm(w1).clamp(min=0).mm(w2) + loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale + print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) + print('Iter {} scaled loss: {}'.format(t, loss.data[0])) + print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) + + # Run backprop + optimizer.zero_grad() + loss.backward() + + # Check for overflow + has_overflow = DynamicLossScaler.has_overflow(parameters) + + # If no overflow, unscale grad and update as usual + if not has_overflow: + for param in parameters: + param.grad.data.mul_(1. / loss_scaler.loss_scale) + optimizer.step() + # Otherwise, don't do anything -- ie, skip iteration + else: + print('OVERFLOW!') + + # Update loss scale for next iteration + loss_scaler.update_scale(has_overflow) + +""" diff --git a/nlp/llm/Yi-VL-6B/pytorch/yi_vl_6b_full_sft.yaml b/nlp/llm/Yi-VL-6B/pytorch/yi_vl_6b_full_sft.yaml index 214dcde09..a0ff9072f 100644 --- a/nlp/llm/Yi-VL-6B/pytorch/yi_vl_6b_full_sft.yaml +++ b/nlp/llm/Yi-VL-6B/pytorch/yi_vl_6b_full_sft.yaml @@ -1,45 +1,45 @@ -### model -model_name_or_path: /home/model_zoos/nlp/Yi-VL-6B-hf -trust_remote_code: true - -### method -stage: sft -do_train: true -finetuning_type: full -freeze_vision_tower: true # choices: [true, false] -freeze_multi_modal_projector: true # choices: [true, false] -train_mm_proj_only: false # choices: [true, false] -deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json] - - -### dataset -dataset: mllm_demo -template: yi_vl -cutoff_len: 2048 -max_samples: 1000 -overwrite_cache: true -preprocessing_num_workers: 16 - -### output -output_dir: saves/Yi-VL-6B/sft -logging_steps: 10 -save_steps: 500 -plot_loss: true -overwrite_output_dir: true - -### train -per_device_train_batch_size: 1 -gradient_accumulation_steps: 2 -learning_rate: 1.0e-5 -num_train_epochs: 10 -lr_scheduler_type: cosine -warmup_ratio: 0.1 -bf16: true -ddp_timeout: 180000000 - -### eval -val_size: 0.01 -# val_size: 0.1 -# per_device_eval_batch_size: 1 -# eval_strategy: steps -# eval_steps: 500 +### model +model_name_or_path: /home/model_zoos/nlp/Yi-VL-6B-hf +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: full +freeze_vision_tower: true # choices: [true, false] +freeze_multi_modal_projector: true # choices: [true, false] +train_mm_proj_only: false # choices: [true, false] +deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json] + + +### dataset +dataset: mllm_demo +template: yi_vl +cutoff_len: 2048 +max_samples: 1000 +overwrite_cache: true +preprocessing_num_workers: 16 + +### output +output_dir: saves/Yi-VL-6B/sft +logging_steps: 10 +save_steps: 500 
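A hedged usage note for the Yi-VL-6B SFT recipe being rewritten in this hunk (the entry point is an assumption about the surrounding LLaMA-Factory setup, not something this patch confirms):

    llamafactory-cli train yi_vl_6b_full_sft.yaml

With deepspeed set to examples/deepspeed/ds_z3_config.json above, this is expected to launch a ZeRO-3 run across all visible GPUs.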
+plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 2 +learning_rate: 1.0e-5 +num_train_epochs: 10 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 + +### eval +val_size: 0.01 +# val_size: 0.1 +# per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500 diff --git a/nlp/llm/Yi-VL-6B/pytorch/yi_vl_6b_lora_sft.yaml b/nlp/llm/Yi-VL-6B/pytorch/yi_vl_6b_lora_sft.yaml index 33e436607..2a8278d19 100644 --- a/nlp/llm/Yi-VL-6B/pytorch/yi_vl_6b_lora_sft.yaml +++ b/nlp/llm/Yi-VL-6B/pytorch/yi_vl_6b_lora_sft.yaml @@ -1,42 +1,42 @@ -### model -model_name_or_path: /home/model_zoos/nlp/Yi-VL-6B-hf -trust_remote_code: true - -### method -stage: sft -do_train: true -finetuning_type: lora -lora_rank: 8 -lora_target: all - -### dataset -dataset: mllm_demo -template: llava -cutoff_len: 2048 -max_samples: 1000 -overwrite_cache: true -preprocessing_num_workers: 16 - -### output -output_dir: saves/yi1_5-6b/lora/sft -logging_steps: 10 -save_steps: 500 -plot_loss: true -overwrite_output_dir: true - -### train -per_device_train_batch_size: 1 -gradient_accumulation_steps: 8 -learning_rate: 1.0e-4 -num_train_epochs: 3.0 -lr_scheduler_type: cosine -warmup_ratio: 0.1 -bf16: true -ddp_timeout: 180000000 - -### eval -val_size: 0.01 -# val_size: 0.1 -# per_device_eval_batch_size: 1 -# eval_strategy: steps -# eval_steps: 500 +### model +model_name_or_path: /home/model_zoos/nlp/Yi-VL-6B-hf +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_rank: 8 +lora_target: all + +### dataset +dataset: mllm_demo +template: llava +cutoff_len: 2048 +max_samples: 1000 +overwrite_cache: true +preprocessing_num_workers: 16 + +### output +output_dir: saves/yi1_5-6b/lora/sft +logging_steps: 10 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 + +### eval +val_size: 0.01 +# val_size: 0.1 +# per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500 diff --git a/nlp/llm/chatglm-6b/pytorch/ptuning/README_bi.md b/nlp/llm/chatglm-6b/pytorch/ptuning/README_bi.md index d1b4e6031..fce47a7cf 100644 --- a/nlp/llm/chatglm-6b/pytorch/ptuning/README_bi.md +++ b/nlp/llm/chatglm-6b/pytorch/ptuning/README_bi.md @@ -1,51 +1,51 @@ -# 单机训练 - bash ./ds_train_finetune.sh - - -# 多机训练: -## 启动脚本 - 在 host 机器上调用以下脚本即可: - bash ./ds_train_finetune_multi_nodes.sh - -## 注意事项 -1. 起容器时使用 --network=host - -2. 非 host 机器需要与 host 之间设置ssh免密互连 - - 设置方式: - 2.1 多机上分别设置 sshd 端口监听端口 - /usr/sbin/sshd - /usr/sbin/sshd -p 12345 - - 如果未安装sshd,需要先安装sshd - - 2.2 设置公钥认证 - cd ~/.ssh - ssh-keygen -t rsa - ssh-copy-id -i ~/.ssh/id_rsa.pub root@10.113.2.103 -p 12345 (所有非host和host上执行此语句,将公钥写入host ssh 配置中,使host能免密登录host自身和所有非host机器上) - -3. ds_train_finetune_multi_nodes.sh 中 MASTER_ADDR, MASTER_ADDR 根据实际环境设置,其中 MASTER_PORT 为torch用来通信的,须与ssh远程登录的端口不要重复。 - -4. 在本目录中使用hostfile,其中 nv_103 和 nv_104 与 ~/.ssh/config(下一步添加) 中 host 对应。 - -5. 在环境中添加文件 ~/.ssh/config,格式如下: - host nv_103 - HostName 10.113.2.103 - Port 12345 - host nv_104 - HostName 10.113.2.104 - Port 12345 - - 其中 nv_103,nv_104 与hostfile中host对应; - HostName 分别为多机的ip; - Port 是供deepspeed使用,用于ssh远程登录。 - -## 其他说明 -1. 如果遇到环境变量导致的问题可进行如下解决,如: - 1.1. 
pip command not found - 添加文件 ~/.deepspeed_env,在其中添加环境变量PATH在真正训练的目标机上的值,如:PATH=/opt/conda/bin:/usr/local/mpi/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/tensorrt/bin - -2. 使用 deepspeed_no_cpu_offload.json 和 deepspeed.json 可以在32G 以下的 8*GPU 上进行双机训练 - -# 源码出处 +# 单机训练 + bash ./ds_train_finetune.sh + + +# 多机训练: +## 启动脚本 + 在 host 机器上调用以下脚本即可: + bash ./ds_train_finetune_multi_nodes.sh + +## 注意事项 +1. 起容器时使用 --network=host + +2. 非 host 机器需要与 host 之间设置ssh免密互连 + + 设置方式: + 2.1 多机上分别设置 sshd 端口监听端口 + /usr/sbin/sshd + /usr/sbin/sshd -p 12345 + + 如果未安装sshd,需要先安装sshd + + 2.2 设置公钥认证 + cd ~/.ssh + ssh-keygen -t rsa + ssh-copy-id -i ~/.ssh/id_rsa.pub root@10.113.2.103 -p 12345 (所有非host和host上执行此语句,将公钥写入host ssh 配置中,使host能免密登录host自身和所有非host机器上) + +3. ds_train_finetune_multi_nodes.sh 中 MASTER_ADDR, MASTER_ADDR 根据实际环境设置,其中 MASTER_PORT 为torch用来通信的,须与ssh远程登录的端口不要重复。 + +4. 在本目录中使用hostfile,其中 nv_103 和 nv_104 与 ~/.ssh/config(下一步添加) 中 host 对应。 + +5. 在环境中添加文件 ~/.ssh/config,格式如下: + host nv_103 + HostName 10.113.2.103 + Port 12345 + host nv_104 + HostName 10.113.2.104 + Port 12345 + + 其中 nv_103,nv_104 与hostfile中host对应; + HostName 分别为多机的ip; + Port 是供deepspeed使用,用于ssh远程登录。 + +## 其他说明 +1. 如果遇到环境变量导致的问题可进行如下解决,如: + 1.1. pip command not found + 添加文件 ~/.deepspeed_env,在其中添加环境变量PATH在真正训练的目标机上的值,如:PATH=/opt/conda/bin:/usr/local/mpi/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/tensorrt/bin + +2. 使用 deepspeed_no_cpu_offload.json 和 deepspeed.json 可以在32G 以下的 8*GPU 上进行双机训练 + +# 源码出处 https://github.com/THUDM/ChatGLM-6B.git \ No newline at end of file diff --git a/nlp/llm/chatglm-6b/pytorch/ptuning/deepspeed.json b/nlp/llm/chatglm-6b/pytorch/ptuning/deepspeed.json index 9a113eecb..64183a7e6 100644 --- a/nlp/llm/chatglm-6b/pytorch/ptuning/deepspeed.json +++ b/nlp/llm/chatglm-6b/pytorch/ptuning/deepspeed.json @@ -1,26 +1,26 @@ -{ - "train_micro_batch_size_per_gpu": "auto", - "zero_allow_untested_optimizer": true, - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "initial_scale_power": 16, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "zero_force_ds_cpu_optimizer": false, - "zero_optimization": { - "stage": 2, - "offload_optimizer": { - "device": "cpu", - "pin_memory": true - }, - "allgather_partitions": true, - "allgather_bucket_size": 5e8, - "overlap_comm": false, - "reduce_scatter": true, - "reduce_bucket_size": 5e8, - "contiguous_gradients" : true - } +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_allow_untested_optimizer": true, + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "initial_scale_power": 16, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "zero_force_ds_cpu_optimizer": false, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "overlap_comm": false, + "reduce_scatter": true, + "reduce_bucket_size": 5e8, + "contiguous_gradients" : true + } } \ No newline at end of file diff --git a/nlp/llm/chatglm-6b/pytorch/ptuning/deepspeed_no_cpu_offload.json b/nlp/llm/chatglm-6b/pytorch/ptuning/deepspeed_no_cpu_offload.json index 09eafd9b0..39cd89c5a 100644 --- a/nlp/llm/chatglm-6b/pytorch/ptuning/deepspeed_no_cpu_offload.json +++ b/nlp/llm/chatglm-6b/pytorch/ptuning/deepspeed_no_cpu_offload.json @@ -1,22 +1,22 @@ -{ - "train_micro_batch_size_per_gpu": "auto", - 
"zero_allow_untested_optimizer": true, - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "initial_scale_power": 16, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "zero_force_ds_cpu_optimizer": false, - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 5e7, - "overlap_comm": false, - "reduce_scatter": true, - "reduce_bucket_size": 5e7, - "contiguous_gradients" : true - } +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_allow_untested_optimizer": true, + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "initial_scale_power": 16, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "zero_force_ds_cpu_optimizer": false, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 5e7, + "overlap_comm": false, + "reduce_scatter": true, + "reduce_bucket_size": 5e7, + "contiguous_gradients" : true + } } \ No newline at end of file diff --git a/nlp/llm/chatglm-6b/pytorch/ptuning/hostfile b/nlp/llm/chatglm-6b/pytorch/ptuning/hostfile index b7e21352c..a5ec9fa8c 100644 --- a/nlp/llm/chatglm-6b/pytorch/ptuning/hostfile +++ b/nlp/llm/chatglm-6b/pytorch/ptuning/hostfile @@ -1,2 +1,2 @@ -nv_103 slots=8 +nv_103 slots=8 nv_104 slots=8 \ No newline at end of file diff --git a/nlp/llm/phi-3/pytorch/FineTrainingScript.py b/nlp/llm/phi-3/pytorch/FineTrainingScript.py index 846bf0340..d46342cdb 100644 --- a/nlp/llm/phi-3/pytorch/FineTrainingScript.py +++ b/nlp/llm/phi-3/pytorch/FineTrainingScript.py @@ -1,74 +1,74 @@ -# This code is for fine-tuning Phi-3 Models. -# Note thi requires 7.4 GB of GPU RAM for the process. -# Model available at https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3 -# Model Names -# microsoft/Phi-3-mini-4k-instruct -# microsoft/Phi-3-mini-128k-instruct -# microsoft/Phi-3-small-8k-instruct -# microsoft/Phi-3-small-128k-instruct -# microsoft/Phi-3-medium-4k-instruct -# microsoft/Phi-3-medium-128k-instruct -# microsoft/Phi-3-vision-128k-instruct -# microsoft/Phi-3-mini-4k-instruct-onnx -# microsoft/Phi-3-mini-4k-instruct-onnx-web -# microsoft/Phi-3-mini-128k-instruct-onnx -# microsoft/Phi-3-small-8k-instruct-onnx-cuda -# microsoft/Phi-3-small-128k-instruct-onnx-cuda -# microsoft/Phi-3-medium-4k-instruct-onnx-cpu -# microsoft/Phi-3-medium-4k-instruct-onnx-cuda -# microsoft/Phi-3-medium-4k-instruct-onnx-directml -# microsoft/Phi-3-medium-128k-instruct-onnx-cpu -# microsoft/Phi-3-medium-128k-instruct-onnx-cuda -# microsoft/Phi-3-medium-128k-instruct-onnx-directml -# microsoft/Phi-3-mini-4k-instruct-gguf - -# Load the pre-trained model and tokenizer -model = AutoModelForCausalLM.from_pretrained('Model_Name', torch_dtype=torch.float16) -tokenizer = AutoTokenizer.from_pretrained('Mode_Name') - -# Load the dataset for fine-tuning -dataset = load_dataset(DATASET_NAME, split="train") - -# Define the formatting function for the prompts -def formatting_prompts_func(examples): - convos = examples["conversations"] - texts = [] - mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"} - end_mapper = {"system": "", "human": "", "gpt": ""} - for convo in convos: - text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in convo) - texts.append(f"{text}{tokenizer.eos_token}") - return {"text": texts} - -# Apply the formatting function to the dataset -dataset = dataset.map(formatting_prompts_func, batched=True) - -# Define the training arguments -args = TrainingArguments( - 
evaluation_strategy="steps", - per_device_train_batch_size=7, - gradient_accumulation_steps=4, - gradient_checkpointing=True, - learning_rate=1e-4, - fp16=True, - max_steps=-1, - num_train_epochs=3, - save_strategy="epoch", - logging_steps=10, - output_dir=NEW_MODEL_NAME, - optim="paged_adamw_32bit", - lr_scheduler_type="linear" -) - -# Create the trainer -trainer = SFTTrainer( - model=model, - args=args, - train_dataset=dataset, - dataset_text_field="text", - max_seq_length=128, - formatting_func=formatting_prompts_func -) - -# Start the training process +# This code is for fine-tuning Phi-3 Models. +# Note thi requires 7.4 GB of GPU RAM for the process. +# Model available at https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3 +# Model Names +# microsoft/Phi-3-mini-4k-instruct +# microsoft/Phi-3-mini-128k-instruct +# microsoft/Phi-3-small-8k-instruct +# microsoft/Phi-3-small-128k-instruct +# microsoft/Phi-3-medium-4k-instruct +# microsoft/Phi-3-medium-128k-instruct +# microsoft/Phi-3-vision-128k-instruct +# microsoft/Phi-3-mini-4k-instruct-onnx +# microsoft/Phi-3-mini-4k-instruct-onnx-web +# microsoft/Phi-3-mini-128k-instruct-onnx +# microsoft/Phi-3-small-8k-instruct-onnx-cuda +# microsoft/Phi-3-small-128k-instruct-onnx-cuda +# microsoft/Phi-3-medium-4k-instruct-onnx-cpu +# microsoft/Phi-3-medium-4k-instruct-onnx-cuda +# microsoft/Phi-3-medium-4k-instruct-onnx-directml +# microsoft/Phi-3-medium-128k-instruct-onnx-cpu +# microsoft/Phi-3-medium-128k-instruct-onnx-cuda +# microsoft/Phi-3-medium-128k-instruct-onnx-directml +# microsoft/Phi-3-mini-4k-instruct-gguf + +# Load the pre-trained model and tokenizer +model = AutoModelForCausalLM.from_pretrained('Model_Name', torch_dtype=torch.float16) +tokenizer = AutoTokenizer.from_pretrained('Mode_Name') + +# Load the dataset for fine-tuning +dataset = load_dataset(DATASET_NAME, split="train") + +# Define the formatting function for the prompts +def formatting_prompts_func(examples): + convos = examples["conversations"] + texts = [] + mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"} + end_mapper = {"system": "", "human": "", "gpt": ""} + for convo in convos: + text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in convo) + texts.append(f"{text}{tokenizer.eos_token}") + return {"text": texts} + +# Apply the formatting function to the dataset +dataset = dataset.map(formatting_prompts_func, batched=True) + +# Define the training arguments +args = TrainingArguments( + evaluation_strategy="steps", + per_device_train_batch_size=7, + gradient_accumulation_steps=4, + gradient_checkpointing=True, + learning_rate=1e-4, + fp16=True, + max_steps=-1, + num_train_epochs=3, + save_strategy="epoch", + logging_steps=10, + output_dir=NEW_MODEL_NAME, + optim="paged_adamw_32bit", + lr_scheduler_type="linear" +) + +# Create the trainer +trainer = SFTTrainer( + model=model, + args=args, + train_dataset=dataset, + dataset_text_field="text", + max_seq_length=128, + formatting_func=formatting_prompts_func +) + +# Start the training process trainer.train() \ No newline at end of file diff --git a/nlp/llm/phi-3/pytorch/models/configuration_phi3.py b/nlp/llm/phi-3/pytorch/models/configuration_phi3.py index 780401034..958fcf30e 100644 --- a/nlp/llm/phi-3/pytorch/models/configuration_phi3.py +++ b/nlp/llm/phi-3/pytorch/models/configuration_phi3.py @@ -1,227 +1,227 @@ -# coding=utf-8 -# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. 
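The FineTrainingScript.py hunk above uses AutoModelForCausalLM, AutoTokenizer, load_dataset, TrainingArguments and SFTTrainer without importing them; a minimal sketch of the imports it appears to assume (package choices inferred, not confirmed by this patch):

# Imports the fine-tuning script above appears to rely on (inferred, not in the hunk):
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer  # SFTTrainer is provided by the TRL package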
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" Phi-3 model configuration""" - - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "microsoft/Phi-3-mini-4k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json", - "microsoft/Phi-3-mini-128k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json", -} - - -class Phi3Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3 - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the - [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct). - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - vocab_size (`int`, *optional*, defaults to 32064): - Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Phi3Model`]. - hidden_size (`int`, *optional*, defaults to 3072): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 8192): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer decoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - resid_pdrop (`float`, *optional*, defaults to 0.0): - Dropout probability for mlp outputs. - embd_pdrop (`int`, *optional*, defaults to 0.0): - The dropout ratio for the embeddings. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio after computing the attention scores. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 4096): - The maximum sequence length that this model might ever be used with. 
- original_max_position_embeddings (`int`, *optional*, defaults to 4096): - The maximum sequence length that this model was trained with. This is used to determine the size of the - original RoPE embeddings when using long scaling. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon value used for the RMSNorm. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): - The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must - contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and - the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size - divided by the number of attention heads divided by 2. - bos_token_id (`int`, *optional*, defaults to 1): - The id of the "beginning-of-sequence" token. - eos_token_id (`int`, *optional*, defaults to 32000): - The id of the "end-of-sequence" token. - pad_token_id (`int`, *optional*, defaults to 32000): - The id of the padding token. - sliding_window (`int`, *optional*): - Sliding window attention window size. If `None`, no sliding window is applied. - - Example: - - ```python - >>> from transformers import Phi3Model, Phi3Config - - >>> # Initializing a Phi-3 style configuration - >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") - - >>> # Initializing a model from the configuration - >>> model = Phi3Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "phi3" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32064, - hidden_size=3072, - intermediate_size=8192, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - resid_pdrop=0.0, - embd_pdrop=0.0, - attention_dropout=0.0, - hidden_act="silu", - max_position_embeddings=4096, - original_max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - bos_token_id=1, - eos_token_id=32000, - pad_token_id=32000, - sliding_window=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attention_dropout = attention_dropout - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.original_max_position_embeddings = original_max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = 
rope_scaling - self._rope_scaling_adjustment() - self._rope_scaling_validation() - self.sliding_window = sliding_window - - super().__init__( - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - def _rope_scaling_adjustment(self): - """ - Adjust the `type` of the `rope_scaling` configuration for backward compatibility. - """ - if self.rope_scaling is None: - return - - rope_scaling_type = self.rope_scaling.get("type", None) - - # For backward compatibility if previous version used "su" or "yarn" - if rope_scaling_type is not None and rope_scaling_type in ["su", "yarn"]: - self.rope_scaling["type"] = "longrope" - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. - """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3: - raise ValueError( - "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_short_factor = self.rope_scaling.get("short_factor", None) - rope_scaling_long_factor = self.rope_scaling.get("long_factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["longrope"]: - raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}") - if not ( - isinstance(rope_scaling_short_factor, list) - and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor) - ): - raise ValueError( - f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}" - ) - if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2: - raise ValueError( - f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}" - ) - if not ( - isinstance(rope_scaling_long_factor, list) - and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor) - ): - raise ValueError( - f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}" - ) - if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2: - raise ValueError( - f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}" - ) +# coding=utf-8 +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" Phi-3 model configuration""" + + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "microsoft/Phi-3-mini-4k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json", + "microsoft/Phi-3-mini-128k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json", +} + + +class Phi3Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the + [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32064): + Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Phi3Model`]. + hidden_size (`int`, *optional*, defaults to 3072): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 8192): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + resid_pdrop (`float`, *optional*, defaults to 0.0): + Dropout probability for mlp outputs. + embd_pdrop (`int`, *optional*, defaults to 0.0): + The dropout ratio for the embeddings. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio after computing the attention scores. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. + original_max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model was trained with. This is used to determine the size of the + original RoPE embeddings when using long scaling. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon value used for the RMSNorm. 
+ use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`dict`, *optional*): + The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must + contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and + the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size + divided by the number of attention heads divided by 2. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 32000): + The id of the "end-of-sequence" token. + pad_token_id (`int`, *optional*, defaults to 32000): + The id of the padding token. + sliding_window (`int`, *optional*): + Sliding window attention window size. If `None`, no sliding window is applied. + + Example: + + ```python + >>> from transformers import Phi3Model, Phi3Config + + >>> # Initializing a Phi-3 style configuration + >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + + >>> # Initializing a model from the configuration + >>> model = Phi3Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "phi3" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32064, + hidden_size=3072, + intermediate_size=8192, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + resid_pdrop=0.0, + embd_pdrop=0.0, + attention_dropout=0.0, + hidden_act="silu", + max_position_embeddings=4096, + original_max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + bos_token_id=1, + eos_token_id=32000, + pad_token_id=32000, + sliding_window=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attention_dropout = attention_dropout + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.original_max_position_embeddings = original_max_position_embeddings + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_adjustment() + self._rope_scaling_validation() + self.sliding_window = sliding_window + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_adjustment(self): + """ + Adjust the `type` of the `rope_scaling` configuration for backward compatibility. 
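A small, illustration-only sketch of what this adjustment (together with the validation method that follows) accepts, assuming the Phi3Config defined in this file; the factor values are invented, and each list needs hidden_size // num_attention_heads // 2 = 48 entries for the default geometry:

from configuration_phi3 import Phi3Config  # import path assumed for illustration

# Legacy checkpoints used "su"/"yarn"; the adjustment rewrites the type to "longrope",
# after which validation checks that both factor lists have 3072 // 32 // 2 = 48 entries.
cfg = Phi3Config(rope_scaling={"type": "su",
                               "short_factor": [1.0] * 48,
                               "long_factor": [1.5] * 48})
print(cfg.rope_scaling["type"])  # -> "longrope"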
+ """ + if self.rope_scaling is None: + return + + rope_scaling_type = self.rope_scaling.get("type", None) + + # For backward compatibility if previous version used "su" or "yarn" + if rope_scaling_type is not None and rope_scaling_type in ["su", "yarn"]: + self.rope_scaling["type"] = "longrope" + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3: + raise ValueError( + "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_short_factor = self.rope_scaling.get("short_factor", None) + rope_scaling_long_factor = self.rope_scaling.get("long_factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["longrope"]: + raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}") + if not ( + isinstance(rope_scaling_short_factor, list) + and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor) + ): + raise ValueError( + f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}" + ) + if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2: + raise ValueError( + f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}" + ) + if not ( + isinstance(rope_scaling_long_factor, list) + and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor) + ): + raise ValueError( + f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}" + ) + if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2: + raise ValueError( + f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}" + ) diff --git a/nlp/llm/phi-3/pytorch/models/modeling_phi3.py b/nlp/llm/phi-3/pytorch/models/modeling_phi3.py index 1374f1fca..04ebd03f3 100644 --- a/nlp/llm/phi-3/pytorch/models/modeling_phi3.py +++ b/nlp/llm/phi-3/pytorch/models/modeling_phi3.py @@ -1,1560 +1,1560 @@ -# coding=utf-8 -# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" PyTorch Phi-3 model.""" - -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast, - TokenClassifierOutput, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_phi3 import Phi3Config - - -logger = logging.get_logger(__name__) - -# Transformers scans dependencies in the modeling file, causing issues on conditional loading. The regex only ignores try/catch blocks, but not if statements -# if is_flash_attn_2_available(): -_flash_supports_window_size = False -try: - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) -except ImportError as error: - logger.warning( - f"`flash-attention` package not found, consider installing for better performance: {error}." - ) - if not _flash_supports_window_size: - logger.warning( - "Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`." 
- ) - -_CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct" -_CONFIG_FOR_DOC = "Phi3Config" - -PHI3_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "microsoft/Phi-3-mini-4k-instruct", - "microsoft/Phi-3-mini-128k-instruct", - # See all Phi-3 models at https://huggingface.co/models?filter=Phi-3 -] - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3 -class Phi3RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Phi3RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - # hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return (self.weight * hidden_states).to(input_dtype) - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3 -class Phi3RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - self.register_buffer("inv_freq", None, persistent=False) - - @torch.no_grad() - def forward(self, x, position_ids, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if self.inv_freq is None: - self.inv_freq = 1.0 / ( - self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim) - ) - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 - device_type = x.device.type - device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -class Phi3LongRoPEScaledRotaryEmbedding(Phi3RotaryEmbedding): - def __init__(self, dim, config, device=None): - super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) - - self.short_factor = config.rope_scaling["short_factor"] - self.long_factor = config.rope_scaling["long_factor"] - self.original_max_position_embeddings = config.original_max_position_embeddings - - @torch.no_grad() - def forward(self, x, position_ids, seq_len=None): - seq_len = torch.max(position_ids) + 1 - if seq_len > self.original_max_position_embeddings: - ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device) - else: - ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device) - - inv_freq_shape = 
torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim - self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape) - - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 - device_type = x.device.type - device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - - scale = self.max_position_embeddings / self.original_max_position_embeddings - if scale <= 1.0: - scaling_factor = 1.0 - else: - scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) - - cos = emb.cos() * scaling_factor - sin = emb.sin() * scaling_factor - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`, *optional*): - Deprecated and unused. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
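To make the shape conventions in this docstring concrete, a minimal sketch assuming the apply_rotary_pos_emb and Phi3RotaryEmbedding defined in this file; the tensor sizes are invented for illustration:

import torch

# Shapes follow the docstring above: q/k are [batch, heads, seq, head_dim] and the
# cos/sin returned by the rotary embedding are [batch, seq, head_dim].
bsz, heads, seq, head_dim = 2, 4, 16, 32
q = torch.randn(bsz, heads, seq, head_dim)
k = torch.randn(bsz, heads, seq, head_dim)
rope = Phi3RotaryEmbedding(head_dim, max_position_embeddings=seq)
position_ids = torch.arange(seq).unsqueeze(0).expand(bsz, -1)
cos, sin = rope(q, position_ids)
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)  # unsqueeze_dim=1 matches [b, h, s, d]
assert q_rot.shape == q.shape and k_rot.shape == k.shape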
- """ - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class Phi3MLP(nn.Module): - def __init__(self, config): - super().__init__() - - self.config = config - self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False) - self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) - - self.activation_fn = ACT2FN[config.hidden_act] - - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: - up_states = self.gate_up_proj(hidden_states) - - gate, up_states = up_states.chunk(2, dim=-1) - up_states = up_states * self.activation_fn(gate) - - return self.down_proj(up_states) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Phi3Attention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.attention_dropout = config.attention_dropout - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.original_max_position_embeddings = config.original_max_position_embeddings - self.rope_theta = config.rope_theta - self.rope_scaling = config.rope_scaling - self.is_causal = True - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - - op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False) - self._init_rope() - - def _init_rope(self): - if self.rope_scaling is None: - self.rotary_emb = Phi3RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - if scaling_type == "longrope": - self.rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(self.head_dim, self.config) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - logger.warning_once("You are not running the flash-attention implementation, expect numerical differences.") - - bsz, q_len, _ = hidden_states.size() - - qkv = self.qkv_proj(hidden_states) - query_pos = self.num_heads * self.head_dim - query_states = qkv[..., :query_pos] - key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] - value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class Phi3FlashAttention2(Phi3Attention): - """ - Phi-3 flash attention module. This module inherits from `Phi3Attention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # Phi3FlashAttention2 attention does not support output_attentions - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention. Please use `attn_implementation='eager'` or upgrade flash-attn library." - ) - raise ValueError("The current flash attention version does not support sliding window attention.") - - output_attentions = False - - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - - bsz, q_len, _ = hidden_states.size() - qkv = self.qkv_proj(hidden_states) - query_pos = self.num_heads * self.head_dim - query_states = qkv[..., :query_pos] - key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] - value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] - - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. 
- rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, position_ids, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_dropout = self.attention_dropout if self.training else 0.0 - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. - - if query_states.dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.qkv_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=attn_dropout, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._flash_attention_forward - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
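Reviewer note: the sliding-window branch above trims the cached keys/values to the most recent `sliding_window - 1` positions before appending the new token, so the cache never grows past the configured window. A rough illustration of that trim, with a hypothetical helper name and toy shapes:

    # Toy illustration of the rolling-cache trim above: once the cached length would exceed
    # the sliding window, only the most recent (window - 1) positions are kept so that the
    # incoming token completes a full window.
    import torch

    def trim_kv_for_sliding_window(past_key, past_value, window: int):
        keep = 1 - window  # negative index: keep the last (window - 1) positions
        return past_key[:, :, keep:, :].contiguous(), past_value[:, :, keep:, :].contiguous()

    past_k = torch.randn(1, 4, 10, 8)  # (bsz, kv_heads, cached_len, head_dim)
    past_v = torch.randn(1, 4, 10, 8)
    k, v = trim_kv_for_sliding_window(past_k, past_v, window=8)
    print(k.shape)                     # torch.Size([1, 4, 7, 8]) -> window - 1 positions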
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
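Reviewer note: `_upad_input` above relies on per-row sequence lengths, the flattened indices of non-padding tokens, and cumulative offsets (`cu_seqlens`) derived from the padding mask, matching the `_get_unpad_data` helper defined elsewhere in this file. A self-contained sketch of that metadata computation (hypothetical function name, toy mask):

    # Sketch of the variable-length metadata used by flash_attn_varlen_func: tokens per row,
    # flat indices of the non-pad positions, and cumulative sequence offsets.
    import torch
    import torch.nn.functional as F

    def unpad_metadata(attention_mask: torch.Tensor):
        seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)                     # tokens per row
        indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
        cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
        return indices, cu_seqlens, int(seqlens.max())

    mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
    indices, cu_seqlens, max_len = unpad_metadata(mask)
    print(cu_seqlens.tolist())  # [0, 3, 5]: row 0 packs tokens 0..2, row 1 packs tokens 3..4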
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi3 -# TODO @Arthur no longer copied from LLama after static cache -class Phi3SdpaAttention(Phi3Attention): - """ - Phi3 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Phi3Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from Phi3Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "Phi3Model is using Phi3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - qkv = self.qkv_proj(hidden_states) - query_pos = self.num_heads * self.head_dim - query_states = qkv[..., :query_pos] - key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] - value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with 
custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -PHI3_ATTENTION_CLASSES = { - "eager": Phi3Attention, - "flash_attention_2": Phi3FlashAttention2, - "sdpa": Phi3SdpaAttention, -} - - -class Phi3DecoderLayer(nn.Module): - def __init__(self, config: Phi3Config, layer_idx: int): - super().__init__() - - self.config = config - self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) - - self.mlp = Phi3MLP(config) - self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) - self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) - self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): - input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - position_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range - `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - attn_outputs, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = residual + self.resid_attn_dropout(attn_outputs) - - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + self.resid_mlp_dropout(hidden_states) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -PHI3_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Phi3Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Phi-3 model outputting raw hidden-states without any specific head on top.", - PHI3_START_DOCSTRING, -) -class Phi3PreTrainedModel(PreTrainedModel): - config_class = Phi3Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Phi3DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = False - _supports_cache_class = True - - _version = "0.0.5" - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -PHI3_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. 
- - If `past_key_values` is used, optionally only the last `input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Phi-3 model outputting raw hidden-states without any specific head on top.", - PHI3_START_DOCSTRING, -) -class Phi3Model(Phi3PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`Phi3DecoderLayer`] - - Args: - config: Phi3Config - """ - - def __init__(self, config: Phi3Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.embed_dropout = nn.Dropout(config.embd_pdrop) - self.layers = nn.ModuleList( - [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape[:2] - elif inputs_embeds is not None: - batch_size, seq_length = inputs_embeds.shape[:2] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - past_key_values_length = 0 - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Phi3ForCausalLM(Phi3PreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3 - def __init__(self, config): - super().__init__(config) - self.model = Phi3Model(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, 
bias=False) - - # Initialize weights and apply final processing - self.post_init() - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings - def get_input_embeddings(self): - return self.model.embed_tokens - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings - def get_output_embeddings(self): - return self.lm_head - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder - def set_decoder(self, decoder): - self.model = decoder - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder - def get_decoder(self): - return self.model - - # Ignore copy - @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Phi3ForCausalLM - - >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct") - >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct") - - >>> prompt = "This is an example script ." - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - 'This is an example script .\n Certainly! 
Below is a sample script that demonstrates a simple task, such as calculating the sum' - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - # Copied from transformers.models.persimmon.modeling_persimmon.PersimmonForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
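Reviewer note: the causal-LM loss in the hunk above shifts logits and labels by one position so the prediction at position t is scored against the token at position t + 1. A small worked example of the same shift, with an arbitrary toy vocabulary size:

    # Worked example of the next-token loss: drop the last prediction and the first label,
    # then flatten both for CrossEntropyLoss.
    import torch
    from torch.nn import CrossEntropyLoss

    vocab_size = 11
    logits = torch.randn(2, 6, vocab_size)           # (batch, seq_len, vocab)
    labels = torch.randint(0, vocab_size, (2, 6))    # same shape as input_ids

    shift_logits = logits[..., :-1, :].contiguous()  # last position has no next-token label
    shift_labels = labels[..., 1:].contiguous()      # first token has no predictor
    loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
    print(loss.item())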
- if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The [`Phi3Model`] with a sequence classification head on top (linear layer). - - [`Phi3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - PHI3_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi3, LLAMA->PHI3, self.transformer->self.model, transformer_outputs->model_outputs -class Phi3ForSequenceClassification(Phi3PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Phi3Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. 
Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - model_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = model_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + model_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=model_outputs.past_key_values, - hidden_states=model_outputs.hidden_states, - attentions=model_outputs.attentions, - ) - - -@add_start_docstrings( - """ - [`Phi3Model`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. 
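Reviewer note: the sequence-classification head above pools the logits at the last non-padding token, using `argmax` over the pad-token mask plus a modulo so that rows without any padding fall back to their final position. A standalone sketch of that index computation (hypothetical helper name):

    # Sketch of the last-token pooling index: first pad position minus one, wrapped to the
    # final index when a row contains no padding at all.
    import torch

    def last_token_indices(input_ids: torch.Tensor, pad_token_id: int) -> torch.Tensor:
        first_pad = torch.eq(input_ids, pad_token_id).int().argmax(-1)  # 0 when no pad in the row
        return (first_pad - 1) % input_ids.shape[-1]                    # no pad -> wraps to last index

    ids = torch.tensor([[5, 6, 7, 0, 0],    # pad_token_id = 0; last real token at index 2
                        [5, 6, 7, 8, 9]])   # no padding; falls back to index 4
    print(last_token_indices(ids, pad_token_id=0).tolist())  # [2, 4]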
- """, - PHI3_START_DOCSTRING, -) -# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi3,MPT->PHI3,self.transformer->self.model,transformer_outputs->model_outputs -class Phi3ForTokenClassification(Phi3PreTrainedModel): - def __init__(self, config: Phi3Config): - super().__init__(config) - self.num_labels = config.num_labels - - self.model = Phi3Model(config) - if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: - classifier_dropout = config.classifier_dropout - elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: - classifier_dropout = config.hidden_dropout - else: - classifier_dropout = 0.1 - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **deprecated_arguments, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - model_outputs = self.model( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = model_outputs[0] - hidden_states = self.dropout(hidden_states) - logits = self.classifier(hidden_states) - - loss = None - if labels is not None: - # move labels to correct device to enable model parallelism - labels = labels.to(logits.device) - batch_size, seq_length = labels.shape - loss_fct = CrossEntropyLoss() - loss = loss_fct( - logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length) - ) - - if not return_dict: - output = (logits,) + model_outputs[2:] - return ((loss,) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=model_outputs.hidden_states, - attentions=model_outputs.attentions, - ) +# coding=utf-8 +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" PyTorch Phi-3 model.""" + +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_phi3 import Phi3Config + + +logger = logging.get_logger(__name__) + +# Transformers scans dependencies in the modeling file, causing issues on conditional loading. The regex only ignores try/catch blocks, but not if statements +# if is_flash_attn_2_available(): +_flash_supports_window_size = False +try: + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) +except ImportError as error: + logger.warning( + f"`flash-attention` package not found, consider installing for better performance: {error}." + ) + if not _flash_supports_window_size: + logger.warning( + "Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`." 
+ ) + +_CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct" +_CONFIG_FOR_DOC = "Phi3Config" + +PHI3_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/Phi-3-mini-4k-instruct", + "microsoft/Phi-3-mini-128k-instruct", + # See all Phi-3 models at https://huggingface.co/models?filter=Phi-3 +] + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3 +class Phi3RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Phi3RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + # hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return (self.weight * hidden_states).to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3 +class Phi3RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.register_buffer("inv_freq", None, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if self.inv_freq is None: + self.inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim) + ) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class Phi3LongRoPEScaledRotaryEmbedding(Phi3RotaryEmbedding): + def __init__(self, dim, config, device=None): + super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) + + self.short_factor = config.rope_scaling["short_factor"] + self.long_factor = config.rope_scaling["long_factor"] + self.original_max_position_embeddings = config.original_max_position_embeddings + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + seq_len = torch.max(position_ids) + 1 + if seq_len > self.original_max_position_embeddings: + ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device) + else: + ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device) + + inv_freq_shape = 
torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim + self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape) + + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + + scale = self.max_position_embeddings / self.original_max_position_embeddings + if scale <= 1.0: + scaling_factor = 1.0 + else: + scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) + + cos = emb.cos() * scaling_factor + sin = emb.sin() * scaling_factor + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
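Reviewer note: the LongRoPE embedding above rescales cos/sin by sqrt(1 + log(scale) / log(original_max_position_embeddings)) once the extended context exceeds the original training length. A worked example of that factor, assuming a 4096 to 131072 extension as in the 128k checkpoints (the concrete numbers are an assumption for illustration, not read from this patch):

    # Worked example of the LongRoPE attention-scaling factor.
    import math

    original_max = 4096
    extended_max = 131072

    scale = extended_max / original_max  # 32.0
    if scale <= 1.0:
        scaling_factor = 1.0
    else:
        scaling_factor = math.sqrt(1 + math.log(scale) / math.log(original_max))
    print(round(scaling_factor, 4))      # ~1.1902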
+ """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Phi3MLP(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + + self.activation_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + up_states = self.gate_up_proj(hidden_states) + + gate, up_states = up_states.chunk(2, dim=-1) + up_states = up_states * self.activation_fn(gate) + + return self.down_proj(up_states) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Phi3Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.original_max_position_embeddings = config.original_max_position_embeddings + self.rope_theta = config.rope_theta + self.rope_scaling = config.rope_scaling + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + + op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False) + self._init_rope() + + def _init_rope(self): + if self.rope_scaling is None: + self.rotary_emb = Phi3RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + if scaling_type == "longrope": + self.rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(self.head_dim, self.config) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + logger.warning_once("You are not running the flash-attention implementation, expect numerical differences.") + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Phi3FlashAttention2(Phi3Attention): + """ + Phi-3 flash attention module. This module inherits from `Phi3Attention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
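# A minimal, self-contained sketch (illustrative only, not part of the patch) contrasting the two
# causal-mask alignments described in the comments above for the cached-decoding case q_len < kv_len.
# flash_attn>=2.1 uses the bottom-right alignment, which is what this module relies on; the tensor
# sizes below are made-up example values.
import torch

q_len, kv_len = 2, 5
q_idx = torch.arange(q_len).unsqueeze(1)   # (q_len, 1)
k_idx = torch.arange(kv_len).unsqueeze(0)  # (1, kv_len)

top_left = k_idx <= q_idx                          # flash_attn<2.1: query i only sees keys 0..i
bottom_right = k_idx <= (kv_len - q_len) + q_idx   # flash_attn>=2.1: query i sees keys up to its absolute position

print(top_left.int())
# tensor([[1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0]])
print(bottom_right.int())
# tensor([[1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])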
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # Phi3FlashAttention2 attention does not support output_attentions + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention. Please use `attn_implementation='eager'` or upgrade flash-attn library." + ) + raise ValueError("The current flash attention version does not support sliding window attention.") + + output_attentions = False + + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + bsz, q_len, _ = hidden_states.size() + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. 
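# A minimal, self-contained sketch (illustrative only) of why the rotary table is sized from the
# max position id rather than from kv_seq_len alone: with padded batches, position ids are derived
# from the attention mask (the same cumsum recipe used in prepare_inputs_for_generation further
# below), so the largest position id is what actually bounds the rotary cache. Values are made up.
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])    # row 0 is left-padded
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)

kv_seq_len = attention_mask.shape[-1]
rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1

print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
print(rotary_seq_len)  # 6, i.e. max(5, 4) + 1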
+ rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_dropout = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. + + if query_states.dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.qkv_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=attn_dropout, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._flash_attention_forward + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
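# A minimal, self-contained sketch (illustrative only) of the bookkeeping the unpad path relies on:
# from a padding mask, derive the flattened indices of real tokens, the cumulative sequence lengths
# (cu_seqlens) and the longest sequence in the batch. This is one way to compute the quantities that
# the `_get_unpad_data` helper provides to `flash_attn_varlen_func`; the mask below is a made-up example.
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])        # two right-padded sequences
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)              # tensor([3, 2])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()   # real-token positions
max_seqlen_in_batch = seqlens_in_batch.max().item()                           # 3
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

print(indices)      # tensor([0, 1, 2, 4, 5])
print(cu_seqlens)   # tensor([0, 3, 5], dtype=torch.int32)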
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
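# A minimal, self-contained sketch (illustrative only) of what the "-q_len:" slice applied just below
# and the subsequent gather achieve: keep only the mask columns for the current query tokens, then pack
# the non-padded positions into a single flat dimension, which is the layout the varlen kernel expects.
# The mask and shapes are made-up example values.
import torch

batch, q_len, heads, dim = 2, 2, 1, 4
attention_mask = torch.tensor([[0, 0, 0, 0, 0, 1],    # heavily left-padded row
                               [1, 1, 1, 1, 1, 1]])
query_layer = torch.randn(batch, q_len, heads, dim)

q_mask = attention_mask[:, -q_len:]                                   # the "-q_len:" slice
flat_idx = torch.nonzero(q_mask.flatten(), as_tuple=False).flatten()  # positions of real query tokens
packed_q = query_layer.reshape(batch * q_len, heads, dim)[flat_idx]

print(q_mask)          # tensor([[0, 1], [1, 1]])
print(packed_q.shape)  # torch.Size([3, 1, 4]) -> the padded query position was dropped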
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi3 +# TODO @Arthur no longer copied from LLama after static cache +class Phi3SdpaAttention(Phi3Attention): + """ + Phi3 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Phi3Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Phi3Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Phi3Model is using Phi3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with 
custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +PHI3_ATTENTION_CLASSES = { + "eager": Phi3Attention, + "flash_attention_2": Phi3FlashAttention2, + "sdpa": Phi3SdpaAttention, +} + + +class Phi3DecoderLayer(nn.Module): + def __init__(self, config: Phi3Config, layer_idx: int): + super().__init__() + + self.config = config + self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) + + self.mlp = Phi3MLP(config) + self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) + self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) + self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_outputs, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = residual + self.resid_attn_dropout(attn_outputs) + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.resid_mlp_dropout(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +PHI3_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Phi3Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Phi-3 model outputting raw hidden-states without any specific head on top.", + PHI3_START_DOCSTRING, +) +class Phi3PreTrainedModel(PreTrainedModel): + config_class = Phi3Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Phi3DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = False + _supports_cache_class = True + + _version = "0.0.5" + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +PHI3_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
+ + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Phi-3 model outputting raw hidden-states without any specific head on top.", + PHI3_START_DOCSTRING, +) +class Phi3Model(Phi3PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`Phi3DecoderLayer`] + + Args: + config: Phi3Config + """ + + def __init__(self, config: Phi3Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.embed_dropout = nn.Dropout(config.embd_pdrop) + self.layers = nn.ModuleList( + [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_key_values_length = 0 + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Phi3ForCausalLM(Phi3PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3 + def __init__(self, config): + super().__init__(config) + self.model = Phi3Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, 
bias=False) + + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings + def get_input_embeddings(self): + return self.model.embed_tokens + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings + def get_output_embeddings(self): + return self.lm_head + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder + def set_decoder(self, decoder): + self.model = decoder + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder + def get_decoder(self): + return self.model + + # Ignore copy + @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Phi3ForCausalLM + + >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct") + >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct") + + >>> prompt = "This is an example script ." + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + 'This is an example script .\n Certainly! 
Below is a sample script that demonstrates a simple task, such as calculating the sum' + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + # Copied from transformers.models.persimmon.modeling_persimmon.PersimmonForCausalLM.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
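# A minimal, self-contained sketch (illustrative only) of the token-trimming rule described in the
# comments above: once past_length tokens are already held in the KV cache, only the unprocessed
# suffix of input_ids needs to be fed to the model on the next generation step. Values are made up.
import torch

input_ids = torch.tensor([[11, 12, 13, 14, 15]])   # full sequence so far
past_length = 4                                    # tokens already cached

if past_length < input_ids.shape[1]:
    input_ids = input_ids[:, past_length:]

print(input_ids)   # tensor([[15]]) -> only the newest token is run through the model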
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The [`Phi3Model`] with a sequence classification head on top (linear layer). + + [`Phi3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + PHI3_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi3, LLAMA->PHI3, self.transformer->self.model, transformer_outputs->model_outputs +class Phi3ForSequenceClassification(Phi3PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Phi3Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. 
Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + model_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = model_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + model_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=model_outputs.past_key_values, + hidden_states=model_outputs.hidden_states, + attentions=model_outputs.attentions, + ) + + +@add_start_docstrings( + """ + [`Phi3Model`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + PHI3_START_DOCSTRING, +) +# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi3,MPT->PHI3,self.transformer->self.model,transformer_outputs->model_outputs +class Phi3ForTokenClassification(Phi3PreTrainedModel): + def __init__(self, config: Phi3Config): + super().__init__(config) + self.num_labels = config.num_labels + + self.model = Phi3Model(config) + if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: + classifier_dropout = config.classifier_dropout + elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + model_outputs = self.model( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = model_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + batch_size, seq_length = labels.shape + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length) + ) + + if not return_dict: + output = (logits,) + model_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=model_outputs.hidden_states, + attentions=model_outputs.attentions, + ) diff --git a/nlp/llm/phi-3/pytorch/olive-lab/data/data_sample_travel.jsonl b/nlp/llm/phi-3/pytorch/olive-lab/data/data_sample_travel.jsonl index cab17da34..32451c5a7 100644 --- a/nlp/llm/phi-3/pytorch/olive-lab/data/data_sample_travel.jsonl +++ b/nlp/llm/phi-3/pytorch/olive-lab/data/data_sample_travel.jsonl @@ -1,289 +1,289 @@ -{"prompt": "What's a must-see in Paris?","response": "Oh la la! 
You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} -{"prompt": "Best way to get around Tokyo?","response": "Hop on a bullet train for speed, explore the city's charming neighborhoods by foot or bike."} -{"prompt": "What's the best museum in New York?","response": "The Met is a must-visit,t don't overlook the MoMA – its collection of modern art is world-class!"} -{"prompt": "What should I pack for a trip to Australia?","response": "Don't forget sunscreen and a hat for those sunny days Down Under! And remember to bring comfortable shoes for all that hiking."} -{"prompt": "Best place to eat in Bangkok?","response": "For street food heaven,y the night markets – you won't be disappointed!"} -{"prompt": "What's the best way to see the Great Barrier Reef?","response": "Snorkeling or scuba diving are a must for this natural wonder."} -{"prompt": "Best place to visit in Barcelona?","response": "La Sagrada Familia is a must-see,t don't miss Park Güell and the stunning architecture of Casa Batlló."} -{"prompt": "What's the best way to get around Rome?","response": "Hop on a Vespa for an adrenaline rush, explore the city's winding streets by foot."} -{"prompt": "Best place to eat in New Orleans?","response": "Try beignets at Cafe du Monde and gumbo at Dooky Chase's Restaurant – it's a taste of the real NOLA!"} -{"prompt": "What's the best museum in London?","response": "The British Museum is world-class,t don't miss the quirky treasures at the V&A."} -{"prompt": "Best way to see the Amalfi Coast?","response": "Take a boat ride along the stunning coastline – it's breathtaking!"} -{"prompt": "What's the best time to visit Yellowstone National Park?","response": "Summer is peak tourist season,t spring and fall offer milder weather and fewer crowds."} -{"prompt": "Best place to stay in Santorini?","response": "Oia village offers breathtaking sunsets and picturesque whitewashed houses – it's a dream come true!"} -{"prompt": "What's the best way to get around Sydney?","response": "Hop on a ferry for a scenic ride or explore the city's charming neighborhoods by foot."} -{"prompt": "Where can I find the best sushi in Tokyo?","response": "Irasshaimase! For the freshest catch,y Sushi Zanmai in Tsukiji. It's a local favorite with affordable prices. Kanpai!"} -{"prompt": "I'm planning a trip to Tokyo,at are the must-see attractions?","response": "Konnichiwa! Tokyo is an incredible destination. You can't miss the Tokyo Skytree for panoramic views of the city,d the Meiji Shrine is a serene oasis amidst the bustling streets. Don't forget to explore the colorful streets of Shinjuku's Golden Gai and try some delicious street food at the Tsukiji Outer Market! What's your favorite type of cuisine?"} -{"prompt": "Can you recommend a beach resort in Bali?","response": "Bali is paradise! I'd recommend the Four Seasons Resort Bali at Jimbaran Bay for its stunning ocean views and luxurious villas. Alternatively,e Kayon Resort in Ubud offers a tranquil retreat amidst lush greenery. Both are perfect for a relaxing getaway. Are you looking for adventure activities or a romantic escape?"} -{"prompt": "What's the best way to get around New York City?","response": "The city that never sleeps! I'd suggest using the efficient subway system or taking a ride-hailing service like Uber or Lyft. You can also walk or bike across the city,pecially in neighborhoods like SoHo or Brooklyn. Don't forget to buy a MetroCard for easy travel. 
Want to explore a specific neighborhood or attraction?"} -{"prompt": "Can you suggest a budget-friendly accommodation in Bangkok?","response": "Bangkok on a budget! I'd recommend staying at the Naga Residence,cozy hostel with dorms and private rooms. Alternatively,e Baan Thai Residence offers affordable apartments with a homey atmosphere. Both are centrally located and close to public transportation. What's your preferred type of accommodation?"} -{"prompt": "What are the top attractions in Rome?","response": "Rome,e Eternal City! You can't miss the Colosseum,e Vatican City,d the Pantheon. Don't forget to explore the charming streets of Trastevere and indulge in delicious Italian cuisine. Want to visit any specific historical site or try a local dish?"} -{"prompt": "Can you recommend a family-friendly resort in the Maldives?","response": "The Maldives is a tropical paradise! I'd suggest the Soneva Fushi resort,ich offers luxurious villas and a kids' club. Alternatively,e Cheval Blanc Randheli resort has a water sports center and a kids' program. Both are perfect for a family vacation. Are you looking for snorkeling or diving activities?"} -{"prompt": "What's the best way to see the Great Barrier Reef?","response": "The Great Barrier Reef is a snorkeler's and scuba diver's dream! I'd recommend taking a guided tour from Cairns or the Whitsunday Islands. You can also visit the Reef World Aquarium for a glimpse of the marine life. Want to learn more about the reef's conservation efforts?"} -{"prompt": "Can you suggest a romantic getaway in the Swiss Alps?","response": "The Swiss Alps are a romantic haven! I'd recommend staying at the Hotel Bellevue Palace in Interlaken,ich offers breathtaking views of the Eiger,nch,d Jungfrau mountains. Alternatively,e Hotel de la Paix in Lucerne has a beautiful lakefront location. Both are perfect for a cozy retreat. Want to explore any specific mountain trail or take a scenic train ride?"} -{"prompt": "What's the best time to visit the Grand Canyon?","response": "The Grand Canyon is a natural wonder! I'd recommend visiting during the spring (March to May) or fall (September to November) for mild weather and fewer crowds. Avoid visiting during the peak summer months for a more peaceful experience. Want to learn more about the canyon's geology or take a scenic helicopter tour?"} -{"prompt": "Can you recommend a luxury hotel in Paris?","response": "Paris,e City of Love! I'd suggest staying at the Shangri-La Hotel,ris,ich offers opulent rooms and a stunning view of the Eiffel Tower. Alternatively,e Four Seasons Hotel George V Paris has a world-class spa and impeccable service. Both are perfect for a luxurious getaway. Want to explore any specific neighborhood or try a local patisserie?"} -{"prompt": "What are the top attractions in Sydney?","response": "Sydney is an iconic city! You can't miss the Sydney Opera House,e Harbour Bridge,d the famous Bondi Beach. Don't forget to explore the historic Rocks neighborhood and take a scenic ferry ride to Manly Beach. Want to visit any specific museum or try a local surf school?"} -{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} -{"prompt": "What's the best way to get around Tokyo?","response": "You'll love navigating Tokyo's efficient public transportation system! Consider purchasing a Suica or Pasmo card for convenient travel on trains,ses,d subways. 
Don't miss the scenic views from the Tokyo Skytree for a bird's-eye perspective!"} -{"prompt": "What's the best time to visit the Great Barrier Reef?","response": "The Great Barrier Reef is a year-round destination,t the best time to visit is during the dry season (April to November) for calmer waters and better visibility. Be sure to book a reputable tour operator that prioritizes reef conservation and your safety."} -{"prompt": "What's the best restaurant in New York City?","response": "You're in luck! New York City offers endless dining options. For a taste of classic NYC,y Carbone in Greenwich Village for an over-the-top Italian-American experience. Be sure to book reservations well in advance!"} -{"prompt": "What's the best way to see the Northern Lights?","response": "The Northern Lights are a breathtaking spectacle! For the best viewing experience,ad to Tromsø,rway, Lapland,nland,ring the peak season (September to April). Consider booking a guided tour that includes a photographer or astronomer for expert insights."} -{"prompt": "What's the best beach in Hawaii?","response": "You're in paradise! With so many stunning beaches to choose from,recommend Kaanapali Beach on Maui for its soft white sand,ystal-clear waters,d scenic sunsets. Don't miss the nearby Lahaina town for its rich history and cultural attractions."} -{"prompt": "What's the best way to get to the top of the Burj Khalifa?","response": "You're in for a thrilling experience! Take the high-speed elevator to the observation deck on the 124th floor for panoramic views of Dubai. Be sure to book tickets in advance and consider visiting during sunset for a truly unforgettable experience."} -{"prompt": "What's the best restaurant in Rome?","response": "You're in for a culinary adventure! Try La Rosetta for its modern Italian cuisine and stunning views of the Tiber Island. Be sure to book reservations well in advance and don't miss the nearby Campo de' Fiori market for a taste of authentic Italian street food."} -{"prompt": "What's the best way to see the Grand Canyon?","response": "One of the world's most iconic natural wonders! For a breathtaking view,ke the scenic drive along the Desert View Drive or hike down into the canyon for a more immersive experience. Consider booking a guided tour that includes a knowledgeable park ranger for expert insights."} -{"prompt": "What's the best time to visit the Amalfi Coast?","response": "The Amalfi Coast is a year-round destination,t the best time to visit is during the shoulder season (April to June or September to November) for mild weather and smaller crowds. Be sure to book accommodations and tours in advance to avoid the peak-season rush."} -{"prompt": "What's the best way to get around Barcelona?","response": "You'll love exploring Barcelona on foot or by bike! Consider purchasing a Hola BCN card for convenient travel on public transportation,cluding buses and metro lines. Don't miss the scenic views from the top of La Sagrada Familia for a bird's-eye perspective!"} -{"prompt": "What's the best restaurant in Bangkok?","response": "You're in for a flavorful adventure! Try Gaggan for its innovative Indian cuisine and stunning views of the city. Be sure to book reservations well in advance and don't miss the nearby Chatuchak Weekend Market for a taste of authentic Thai street food."} -{"prompt": "What's the best way to see the Great Wall of China?","response": "One of the world's most impressive architectural achievements! 
For a less crowded experience,sit the Mutianyu Great Wall,cated about an hour's drive north of Beijing. Consider booking a guided tour that includes a knowledgeable guide for expert insights."} -{"prompt": "What's the best time to visit the Swiss Alps?","response": "The Swiss Alps are a year-round destination,t the best time to visit is during the winter season (December to March) for world-class skiing and snowboarding. Be sure to book accommodations and ski passes in advance to avoid the peak-season rush."} -{"prompt": "Where can I find the best tacos in Mexico City?","response": "Taco 'bout a culinary adventure! Head to La Taqueria in the Condesa neighborhood for some mouth-watering,thentic street food. Don't forget to try the famous tacos al pastor!"} -{"prompt": "What's the best time to visit the Great Barrier Reef?","response": "Dive into the crystal-clear waters of the Great Barrier Reef! The best time to visit is from April to November when the weather is calm and the sea life is at its best. Book your snorkeling trip now!"} -{"prompt": "Can you recommend a budget-friendly hotel in Tokyo?","response": "Budget-friendly in Tokyo? You bet! Check out the Nui.Tokyo Central Hostel in the heart of the city. It's a stylish and affordable option with dorms and private rooms available. Book now and save!"} -{"prompt": "How do I get to the top of the Burj Khalifa?","response": "Reach for the stars! Take the high-speed elevator to the observation deck on the 124th floor for breathtaking views of Dubai. Don't forget to book your tickets in advance to avoid long wait times."} -{"prompt": "What's the best way to get around Rome?","response": "Get your Roman holiday started! Rent a Vespa or take the metro to explore the Eternal City. Don't forget to validate your ticket before boarding the bus or metro for a hassle-free ride."} -{"prompt": "Can you recommend a beach in Bali?","response": "Seas the day! Head to the stunning Nusa Dua Beach for crystal-clear waters,wdery white sand,d a picturesque sunset. Don't miss the nearby Uluwatu Temple for a spiritual experience."} -{"prompt": "How do I avoid tourist traps in New York City?","response": "Avoid the tourist traps and explore the real NYC! Venture off the beaten path to discover hidden gems like the High Line,elsea Market,d the Brooklyn Flea. Your wallet (and taste buds) will thank you!"} -{"prompt": "What's the best time to visit the Amalfi Coast?","response": "Sail into the Mediterranean! The Amalfi Coast is best visited from May to October when the weather is warm and sunny. Book your boat tour or hike the famous Sentiero degli Dei (Path of the Gods) for breathtaking views."} -{"prompt": "Can you recommend a family-friendly resort in Hawaii?","response": "Aloha! The Grand Wailea Resort in Maui is a paradise for families. Enjoy the water park,ds' club,d stunning ocean views. Don't miss the nearby Haleakala National Park for a sunrise hike and breathtaking views!"} -{"prompt": "How do I get to the top of the Eiffel Tower?","response": "Reach new heights! Take the stairs or elevator to the top of the iconic Eiffel Tower for panoramic views of Paris. Don't forget to book your tickets in advance to avoid long wait times."} -{"prompt": "What's the best way to explore the Swiss Alps?","response": "Get your mountain fix! Take the Jungfrau Railway to the 'Top of Europe' for breathtaking views of the Swiss Alps. 
Don't miss the nearby Interlaken for paragliding and hiking adventures!"} -{"prompt": "Can you recommend a boutique hotel in Barcelona?","response": "Experience la vida loca! The Hotel Casa Fuster in Passeig de Gracia is a stylish and luxurious boutique hotel with stunning architecture and a rooftop pool. Book now and soak up the Barcelona vibes!"} -{"prompt": "How do I get to the ancient city of Petra?","response": "Uncover the secrets of Petra! Take a guided tour or rent a car to explore the ancient city,cluding the famous Monastery and Treasury. Don't miss the nearby Wadi Rum for a desert adventure!"} -{"prompt": "What's the best time to visit the Galapagos Islands?","response": "Explore the Galapagos! The best time to visit is from June to November when the weather is dry and the wildlife is at its best. Book your boat tour or island-hopping adventure now and get ready for an unforgettable experience!"} -{"prompt": "Can you recommend a food tour in Bangkok?","response": "Savor the flavors of Bangkok! Join a food tour to explore the city's street food scene,cluding the famous Pad Thai and Tom Yum soup. Don't miss the nearby Chatuchak Weekend Market for a shopping extravaganza!"} -{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} -{"prompt": "What's the best way to get around Tokyo?","response": "Tokyo's a bustling city,t don't worry,ve got you covered! Consider purchasing a Suica or Pasmo card for easy access to the city's efficient public transportation system. You can also take a stroll through the beautiful Imperial Palace East Garden or explore the trendy Harajuku district on foot."} -{"prompt": "Can you recommend a budget-friendly hotel in New York City?","response": "The Big Apple can be pricey,t I've got a gem for you! Consider staying at the YOTEL New York or the Pod Hotel Brooklyn for a budget-friendly and stylish stay. Both offer comfortable rooms and a great location to explore the city."} -{"prompt": "What are some must-try foods in Italy?","response": "Buon appetito! Italy's a food lover's paradise,d you simply must try the authentic pizza in Naples,dulge in a decadent gelato in Rome,d sample the rich pasta dishes in Tuscany. Don't forget to wash it all down with a glass of Chianti!"} -{"prompt": "What's the best way to see the Grand Canyon?","response": "The Grand Canyon's a breathtaking wonder,d I've got the perfect plan for you! Take a scenic helicopter tour for a bird's-eye view,ke down into the canyon for an immersive experience, simply take in the stunning views from the South Rim. Whatever you choose,ke sure to bring your camera and a sense of adventure!"} -{"prompt": "Can you recommend a family-friendly resort in Hawaii?","response": "Aloha! Hawaii's a perfect destination for families,d I've got a top pick for you! Consider staying at the Aulani,Disney Resort & Spa in Ko Olina for a fun and relaxing vacation. With its beautiful beaches,ter parks,d kid-friendly activities,u'll create memories that will last a lifetime."} -{"prompt": "What are some safety tips for traveling in Europe?","response": "Safety first! When traveling in Europe,ways be mindful of your surroundings,ep an eye on your belongings,d avoid walking alone in dimly lit areas at night. Also,ke sure to stay up-to-date on local events and demonstrations,d follow local advice and instructions. 
And don't forget to register with your government's travel advisory program for added peace of mind!"} -{"prompt": "Can you recommend a romantic getaway in Paris?","response": "Romance is in the air! For a dreamy getaway in Paris,nsider staying at the Hotel Le Bristol or the Shangri-La Hotel for a luxurious and intimate experience. Take a stroll along the Seine,sit the iconic Eiffel Tower,d indulge in a romantic dinner at a charming bistro. Whatever you choose,ke sure to bring your loved one and a sense of romance!"} -{"prompt": "What's the best way to get to the Amalfi Coast?","response": "The Amalfi Coast's a stunning destination,d I've got the perfect plan for you! Fly into Naples International Airport and take a scenic drive along the coast to Positano,alfi, Ravello. Alternatively,nsider taking a ferry from Naples to the island of Capri for a more leisurely and picturesque journey."} -{"prompt": "Can you recommend a budget-friendly destination in Southeast Asia?","response": "Southeast Asia's a treasure trove of affordable destinations! Consider visiting Cambodia,ere you can explore the ancient temples of Angkor Wat,joy delicious street food,d stay in budget-friendly accommodations for under $20 a night. Or head to Vietnam,ere you can explore the bustling streets of Hanoi,lax on the stunning beaches of Nha Trang,d indulge in delicious pho for under $5 a bowl!"} -{"prompt": "What are some must-see landmarks in China?","response": "China's a land of breathtaking landmarks,d you simply must see the Great Wall of China,e Forbidden City in Beijing,d the stunning Terracotta Warriors in Xi'an. Don't forget to explore the vibrant city of Shanghai,th its stunning skyline,stling streets,d world-class shopping!"} -{"prompt": "Can you recommend a luxury resort in the Maldives?","response": "Luxury awaits in the Maldives! Consider staying at the Soneva Fushi or the Cheval Blanc Randheli for a truly unforgettable experience. With its crystal-clear waters,istine beaches,d world-class amenities,u'll feel like royalty in paradise!"} -{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} -{"prompt": "Best beaches in Bali?","response": "Bali's got some of the most stunning beaches in the world! Seminyak Beach is a must-visit for its crystal-clear waters and powdery sand. Nusa Dua is another favorite,th its calm and secluded coves. Want to explore more? I can give you some insider tips!"} -{"prompt": "How do I get to the Great Barrier Reef?","response": "The Great Barrier Reef is a snorkeler's and scuba diver's paradise! To get there,ke a ferry from Cairns or Port Douglas, fly into the Whitsunday Islands. If you're short on time,nsider a day trip from the mainland. Don't forget to book your reef tour in advance to ensure availability!"} -{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo's a year-round destination,t the best time to visit depends on your preferences! Spring (March to May) is cherry blossom season,ile autumn (September to November) is perfect for mild weather and vibrant foliage. Summer's hot and humid,t if you're a fan of festivals,is is the time for you!"} -{"prompt": "How do I get to the Pyramids of Giza?","response": "The Pyramids of Giza are an ancient wonder! To get there,ke a taxi or Uber from downtown Cairo, hop on the metro to Giza Station. From there,'s a short walk to the pyramids. 
Don't forget to haggle for a good price when taking a camel ride or buying souvenirs!"} -{"prompt": "What's the best way to see the Northern Lights?","response": "Seeing the Northern Lights is a once-in-a-lifetime experience! To increase your chances,ad to Tromsø,rway, Lapland,nland,tween September and April. Take a guided tour or stay in a glass-roofed igloo for the ultimate viewing experience. Don't forget your camera and a warm coat!"} -{"prompt": "How do I get to the Taj Mahal?","response": "The Taj Mahal is a breathtaking monument! To get there,ke a train from New Delhi to Agra, drive via the Yamuna Expressway. From Agra Station,ke a taxi or rickshaw to the Taj Mahal. Don't forget to book your tickets in advance to avoid long lines!"} -{"prompt": "What's the best time to visit New York City?","response": "The Big Apple's a year-round destination,t the best time to visit depends on your preferences! Spring (April to June) is perfect for mild weather and outdoor events,ile autumn (September to November) is great for foliage and festivals. Summer's hot and humid,t if you're a fan of music and art,is is the time for you!"} -{"prompt": "How do I get to the Colosseum?","response": "The Colosseum is an ancient wonder! To get there,ke the metro to Colosseo Station, hop on a bus from downtown Rome. From there,'s a short walk to the Colosseum. Don't forget to book your tickets in advance to avoid long lines and explore the surrounding Roman Forum!"} -{"prompt": "What's the best way to see the Grand Canyon?","response": "The Grand Canyon's a natural wonder! To see it in all its glory,ke a scenic drive along the Desert View Drive, hike down into the canyon via the Bright Angel Trail. For a bird's-eye view,ke a helicopter tour or a scenic flight over the canyon. Don't forget your sunscreen and a refillable water bottle!"} -{"prompt": "How do I get to the Eiffel Tower?","response": "The Eiffel Tower is a Parisian icon! To get there,ke the metro to Bir-Hakeim Station, hop on a bus from downtown Paris. From there,'s a short walk to the Eiffel Tower. Don't forget to book your tickets in advance to avoid long lines and enjoy the stunning views from the top!"} -{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} -{"prompt": "Best beaches in Bali?","response": "Tropical paradise alert! You'll adore the crystal-clear waters and powdery white sand of Nusa Dua Beach,minyak Beach,d Uluwatu Beach. Don't forget to watch the sunset at Uluwatu – it's simply breathtaking!"} -{"prompt": "Top restaurants in Tokyo?","response": "Foodie heaven! You must try the mouth-watering sushi at Sushi Saito,e tempura at Tempura Kondo,d the ramen at Ichiran. Don't forget to sample some delicious street food at the Tsukiji Outer Market!"} -{"prompt": "Best time to visit New York?","response": "The city that never sleeps! Spring (April-May) and autumn (September-October) are the best times to visit NYC,th mild temperatures and fewer crowds. Don't miss the iconic cherry blossoms in Central Park during spring!"} -{"prompt": "Top attractions in Sydney?","response": "Sydney,re you come! You can't miss the Sydney Opera House,e Harbour Bridge,d the stunning Bondi Beach. Take a scenic coastal walk from Bondi to Coogee Beach for breathtaking views of the city!"} -{"prompt": "Best ski resorts in the Swiss Alps?","response": "Get ready for a winter wonderland! Zermatt,. 
Moritz,d Interlaken are top picks for skiing and snowboarding. Don't miss the picturesque villages and delicious Swiss chocolate!"} -{"prompt": "Top museums in Paris?","response": "Art lovers rejoice! The Louvre,sée d'Orsay,d Centre Pompidou are must-visits. Don't miss the famous Mona Lisa at the Louvre and the stunning Impressionist collection at the Musée d'Orsay!"} -{"prompt": "Best hiking trails in New Zealand?","response": "Nature enthusiasts,semble! The famous Routeburn Track,lford Track,d Tongariro Alpine Crossing are not to be missed. Take in the breathtaking scenery and hike to the summit for panoramic views!"} -{"prompt": "Top restaurants in Bangkok?","response": "Street food heaven! Try the spicy tom yum soup at Jay Fai,e pad thai at Thip Samai,d the mango sticky rice at Or Tor Kor Market. Don't forget to sample some delicious night market snacks!"} -{"prompt": "Best time to visit the Great Barrier Reef?","response": "Dive into the crystal-clear waters! The dry season (April-October) is the best time to visit the Great Barrier Reef,th calm seas and clear visibility. Don't miss the vibrant coral and marine life!"} -{"prompt": "Top attractions in Rio de Janeiro?","response": "Carnival vibes! You can't miss Christ the Redeemer,garloaf Mountain,d Copacabana Beach. Take a scenic cable car ride to the top of Sugarloaf for breathtaking views of the city!"} -{"prompt": "Best time to visit the Amalfi Coast?","response": "La dolce vita! Spring (April-May) and autumn (September-October) are the best times to visit the Amalfi Coast,th mild temperatures and fewer crowds. Don't miss the stunning cliffside towns and crystal-clear waters!"} -{"prompt": "Top restaurants in San Francisco?","response": "Foodie paradise! Try the sourdough bread at Boudin Bakery,e seafood at Swan Oyster Depot,d the burritos at La Taqueria. Don't forget to sample some delicious fresh produce at the Ferry Plaza Farmers Market!"} -{"prompt": "Best time to visit the Serengeti?","response": "Wildlife enthusiasts,semble! The dry season (June-October) is the best time to visit the Serengeti,th the Great Migration taking place. Don't miss the stunning sunsets and abundant wildlife!"} -{"prompt": "Top attractions in Barcelona?","response": "Gaudí's masterpiece! You can't miss the Sagrada Familia,rk Güell,d the Gothic Quarter. Take a stroll along La Rambla and visit the famous La Boqueria Market for a taste of local cuisine!"} -{"prompt": "Best time to visit the Galapagos Islands?","response": "Island hopping! The dry season (June-October) is the best time to visit the Galapagos,th calm seas and clear visibility. Don't miss the unique wildlife and stunning landscapes!"} -{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} -{"prompt": "What are the top 3 beaches in Bali?","response": "Bali's beaches are stunning! My top 3 picks are Seminyak Beach for its crystal-clear waters and powdery white sand,sa Dua Beach for its picturesque sunsets and calm waters,d Kuta Beach for its lively atmosphere and surfing opportunities!"} -{"prompt": "What's the best way to get around Tokyo?","response": "Tokyo's public transportation system is incredibly efficient! I recommend purchasing a Suica or Pasmo card for easy travel on the subway,ses,d trains. 
You can also take a stroll through the beautiful Imperial Palace East Garden or rent a bike for a fun and scenic ride!"} -{"prompt": "What's the best time to visit the Grand Canyon?","response": "The Grand Canyon is breathtaking year-round,t the best time to visit is from September to November or March to May for mild weather and smaller crowds. If you're looking for a unique experience,nsider a sunrise or sunset hike for breathtaking views!"} -{"prompt": "What are some must-try foods in Italy?","response": "Italian cuisine is divine! Be sure to try authentic Neapolitan pizza in Naples,dulge in creamy gelato in Florence,d sample the world-famous pasta dishes in Rome. Don't forget to wash it all down with a glass of Chianti wine!"} -{"prompt": "What are the top 3 museums in New York City?","response": "New York City is a museum lover's paradise! My top 3 picks are the Metropolitan Museum of Art for its vast collection of art and artifacts,e American Museum of Natural History for its fascinating exhibits on science and nature,d the Guggenheim Museum for its stunning architecture and contemporary art!"} -{"prompt": "What's the best way to explore the Swiss Alps?","response": "The Swiss Alps are a hiker's dream! Consider taking a scenic train ride to the top of the Jungfraujoch for breathtaking views, hike through the picturesque valleys and villages of the Bernese Oberland. Don't forget to try some delicious Swiss chocolate and cheese along the way!"} -{"prompt": "What are some safety tips for solo female travelers?","response": "Solo female travelers,n't worry! Always trust your instincts and stay aware of your surroundings. Consider joining a group tour or using a reputable travel app for safety and support. Also, sure to research local customs and dress modestly to avoid drawing attention to yourself."} -{"prompt": "What's the best way to get from the airport to my hotel in Paris?","response": "Getting to your hotel in Paris is a breeze! Consider taking a taxi or Uber from the airport, use the convenient RER B train to get to the city center. You can also book a private transfer or shuttle service for added convenience."} -{"prompt": "What are some must-see landmarks in Bangkok?","response": "Bangkok is a city of stunning temples and grand palaces! Be sure to visit the iconic Wat Phra Kaew,plore the beautiful Grand Palace,d take a boat ride along the Chao Phraya River to see the city's majestic temples and bridges."} -{"prompt": "What's the best time to visit the Amalfi Coast?","response": "The Amalfi Coast is a summer paradise! The best time to visit is from June to September for warm weather and crystal-clear waters. If you're looking for a more relaxed experience,nsider visiting in the shoulder season (April to May or September to October) for fewer crowds and lower prices."} -{"prompt": "What are some tips for packing light?","response": "Packing light is a must for any traveler! Consider rolling your clothes to save space,ar your heaviest or bulkiest items (like your coat or boots) on the plane,d pack multipurpose items like a sarong or travel towel. Don't forget to leave some extra space in your luggage for souvenirs and shopping!"} -{"prompt": "What are some must-see attractions in Sydney?","response": "Sydney is a stunning city! Be sure to visit the iconic Sydney Opera House,ke a stroll along the picturesque Bondi Beach,d climb to the top of the Harbour Bridge for breathtaking views of the city. 
Don't forget to visit the famous Taronga Zoo and take a ferry ride to Manly Beach!"} -{"prompt": "I'm planning a trip to Tokyo for 7 days,at should I see?","response": "Konnichiwa! For a 7-day Tokyo adventure,ioritize the Tokyo Skytree for panoramic views,plore the vibrant streets of Shibuya,d visit the famous Tsukiji Outer Market for sushi and fresh seafood. Don't miss the beautiful Imperial Palace East Garden and the Meiji Shrine for a tranquil escape. Book a sumo tournament ticket if you're visiting in January,bruary, May. What's your budget for accommodations?"} -{"prompt": "I'm looking for a beach vacation in the Caribbean,at islands should I consider?","response": "Ah,land vibes! The Caribbean has so much to offer. Consider the Bahamas for crystal-clear waters and powdery white sand,maica for reggae vibes and jerk chicken, the U.S. Virgin Islands for secluded beaches and snorkeling. St. Lucia is perfect for nature lovers with its iconic twin volcanic peaks and lush rainforests. What's your preferred level of luxury?"} -{"prompt": "I'm traveling to New York City with my family,at are some kid-friendly attractions?","response": "The city that never sleeps is perfect for families! Take a stroll through Central Park,sit the American Museum of Natural History for dinosaur exhibits,d catch a Broadway show suitable for kids. Don't miss the iconic Statue of Liberty and Ellis Island,d consider a visit to the Top of the Rock for stunning city views. What's your family's age range?"} -{"prompt": "I'm planning a road trip across the United States,at's the best route?","response": "Road trip adventures! For a classic route,nsider the iconic Route 66 from Chicago to Los Angeles,opping at quirky roadside attractions and trying local cuisine. If you prefer a coastal route,ive along the Pacific Coast Highway from San Francisco to San Diego,king in breathtaking ocean views and scenic hiking trails. What's your preferred duration for the trip?"} -{"prompt": "I'm traveling to Italy for the first time,at are some must-try foods?","response": "Buon viaggio! Italian cuisine is a culinary dream. Be sure to try authentic pizza in Naples,dulge in a classic carbonara in Rome,d sample fresh seafood risotto in Venice. Don't miss the opportunity to taste gelato in Florence and enjoy a traditional Italian feast in Tuscany. What's your dietary preference?"} -{"prompt": "I'm looking for a budget-friendly destination in Southeast Asia,at are some options?","response": "Southeast Asia on a budget! Consider the beautiful island of Bali,donesia,r affordable accommodations and delicious local cuisine. Visit the vibrant city of Chiang Mai,ailand,r a rich cultural experience and try the street food. Don't miss the stunning beaches of Nha Trang,etnam,r a relaxing getaway. What's your budget for daily expenses?"} -{"prompt": "I'm planning a honeymoon in Bora Bora,at's the best resort?","response": "Romantic getaway! Bora Bora is a dream destination for couples. Consider the Four Seasons Resort Bora Bora for overwater bungalows and stunning views. Alternatively,e InterContinental Resort Bora Bora offers luxurious villas and a private beach. Don't miss the opportunity to snorkel or dive with sharks and rays. What's your preferred level of luxury?"} -{"prompt": "I'm traveling to Spain for the first time,at are some must-see landmarks?","response": "Olé! Spain is a treasure trove of history and culture. 
Be sure to visit the iconic Sagrada Familia in Barcelona,plore the vibrant streets of Madrid,d stroll along the picturesque Alhambra palace in Granada. Don't miss the opportunity to visit the famous Park Güell and take in the stunning views of the city. What's your preferred duration for the trip?"} -{"prompt": "I'm looking for a winter sports destination in the United States,at are some options?","response": "Winter sports enthusiast! Consider the Rocky Mountains in Colorado for world-class skiing and snowboarding. Visit the Lake Tahoe region in California/Nevada for stunning views and powdery slopes. Don't miss the opportunity to explore the snow-covered trails of Jackson Hole,oming,d enjoy a relaxing soak in a natural hot spring. What's your preferred level of difficulty?"} -{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo's cherry blossoms bloom in late March to early April,t September and November are great times to visit with comfortable temperatures and fewer crowds. Don't miss the neon lights of Shinjuku's Kabukicho district!"} -{"prompt": "Where can I find the best Italian food in Rome?","response": "Rome's Trastevere neighborhood is famous for its authentic Italian cuisine. Try La Gensola for some of the city's best carbonara and amatriciana. Don't forget to explore the charming alleys and piazzas around Piazza Santa Maria in Trastevere!"} -{"prompt": "What's the best way to get around New York City?","response": "The subway is the most efficient way to get around NYC. Consider purchasing a MetroCard for easy travel. You can also take a yellow cab or ride-hail,t be aware of traffic. Don't miss the iconic Statue of Liberty and Ellis Island ferries from Battery Park!"} -{"prompt": "Where can I find the best beaches in Bali?","response": "Bali's south coast is home to some of the island's most beautiful beaches. Seminyak Beach offers luxury resorts and sunbathing,ile Kuta Beach is perfect for surfers and partygoers. Don't miss the stunning sunsets at Uluwatu Beach!"} -{"prompt": "What's the best time to visit the Grand Canyon?","response": "The Grand Canyon is open year-round,t the best time to visit is from September to November or March to May,en the weather is mild and crowds are smaller. Don't miss the breathtaking sunrise or sunset at the South Rim!"} -{"prompt": "Where can I find the best tapas in Barcelona?","response": "Barcelona's El Raval neighborhood is a foodie's paradise,th many tapas bars and restaurants. Try Cal Pep for some of the city's best seafood and tapas. Don't miss the lively atmosphere and street performers in La Rambla!"} -{"prompt": "What's the best way to see the sights in Paris?","response": "Consider purchasing a Paris Museum Pass for skip-the-line access to popular attractions like the Eiffel Tower and Louvre. You can also take a Seine River cruise for a scenic view of the city. Don't miss the stunning Notre-Dame Cathedral!"} -{"prompt": "Where can I find the best hiking trails in New Zealand?","response": "New Zealand's South Island is home to some of the world's most beautiful hiking trails. Try the famous Routeburn Track or the scenic Tongariro Alpine Crossing. Don't miss the stunning Milford Sound and Fiordland National Park!"} -{"prompt": "What's the best time to visit the Great Barrier Reef?","response": "The Great Barrier Reef is open year-round,t the best time to visit is from April to November,en the weather is calm and visibility is good. 
Don't miss the opportunity to snorkel or scuba dive with sea turtles and colorful fish!"} -{"prompt": "Where can I find the best Indian food in Mumbai?","response": "Mumbai's Colaba neighborhood is famous for its street food and restaurants. Try Bademiya for some of the city's best kebabs and biryani. Don't miss the vibrant atmosphere and street performers in the Colaba Causeway!"} -{"prompt": "What's the best way to get around Sydney?","response": "Sydney's public transportation system is efficient and easy to use. Consider purchasing an Opal card for convenient travel on buses,ains,d ferries. Don't miss the stunning views of the Opera House and Harbour Bridge from a ferry ride!"} -{"prompt": "Where can I find the best coffee in Melbourne?","response": "Melbourne's CBD is home to many of the city's best coffee shops. Try Pellegrini's Espresso Bar for some of the city's best coffee and Italian cuisine. Don't miss the lively atmosphere and street performers in the Bourke Street Mall!"} -{"prompt": "What's the best time to visit the Amalfi Coast?","response": "The Amalfi Coast is open year-round,t the best time to visit is from April to October,en the weather is warm and sunny. Don't miss the stunning scenery and picturesque towns like Positano and Amalfi!"} -{"prompt": "Where can I find the best seafood in San Francisco?","response": "San Francisco's Fisherman's Wharf is famous for its fresh seafood. Try Alioto's for some of the city's best seafood and stunning views of the Bay Bridge. Don't miss the lively atmosphere and street performers in Pier 39!"} -{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} -{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo is a year-round destination,t spring (March to May) and autumn (September to November) are the best seasons to visit. Enjoy mild temperatures and vibrant cherry blossoms in spring, the golden foliage in autumn!"} -{"prompt": "Where can I find the best Italian food in Rome?","response": "Buon appetito! You'll find authentic Italian cuisine in the heart of Rome's Trastevere neighborhood. Try La Gensola for a taste of traditional Roman dishes, head to Pizzeria La Montecarlo for a delicious Neapolitan-style pizza!"} -{"prompt": "What's the best way to get around New York City?","response": "The city that never sleeps is best explored on foot or by public transportation! Grab a MetroCard and hop on the subway, take a leisurely stroll across the Brooklyn Bridge for spectacular city views."} -{"prompt": "What are some must-see landmarks in Bangkok?","response": "Get ready to explore the vibrant streets of Bangkok! Be sure to visit the stunning Grand Palace,e iconic Wat Phra Kaew,d the bustling Chatuchak Weekend Market for an unforgettable shopping experience."} -{"prompt": "What's the best time to visit the Great Barrier Reef?","response": "The Great Barrier Reef is a year-round destination,t the best time to visit is during the dry season (April to November). Enjoy calm seas and crystal-clear waters for an unforgettable snorkeling or diving experience!"} -{"prompt": "Where can I find the best street food in Bangkok?","response": "Bangkok's street food scene is a culinary paradise! 
Head to the famous Rot Fai Night Market or try the street food stalls at Sukhumvit Road for a taste of authentic Thai cuisine."} -{"prompt": "What are some tips for navigating airport security?","response": "Flying soon? Make sure to arrive at the airport with plenty of time to spare,d be prepared to remove electronics and liquids from your carry-on bags. Wear easy-to-remove shoes and avoid any prohibited items to speed up the security process!"} -{"prompt": "What's the best way to see the Northern Lights?","response": "Experience the breathtaking beauty of the Northern Lights in Tromsø,rway! Take a guided tour to maximize your chances of witnessing this natural phenomenon,d don't forget your camera to capture the moment!"} -{"prompt": "Where can I find the best coffee in Melbourne?","response": "Melbourne is a coffee lover's paradise! Visit the iconic Pellegrini's Espresso Bar or try the specialty coffee at Brother Baba Budan for a taste of the city's best brews."} -{"prompt": "What are some must-see museums in Paris?","response": "The City of Light is home to some of the world's most famous museums! Visit the Louvre for the Mona Lisa,e Musée d'Orsay for Impressionist masterpieces,d the Musée Rodin for a glimpse into the life of the famous sculptor."} -{"prompt": "What's the best way to get around Barcelona?","response": "Barcelona is a pedestrian-friendly city! Walk along La Rambla,plore the Gothic Quarter,d take a stroll along the beach for a taste of the city's vibrant atmosphere."} -{"prompt": "What are some tips for packing for a trip?","response": "Packing for a trip can be overwhelming,t don't worry,'ve got you covered! Pack lightweight clothing,mfortable shoes,d essential items like sunscreen,portable charger,d a travel adapter to ensure a stress-free journey!"} -{"prompt": "Where can I find the best Mexican food in Mexico City?","response": "Mexico City is a food lover's dream! Visit the famous La Merced market or try the authentic street food at El Fogoncito for a taste of the city's best Mexican cuisine."} -{"prompt": "What's the best time to visit the Amalfi Coast?","response": "The Amalfi Coast is a year-round destination,t the best time to visit is during the shoulder season (April to June or September to November). Enjoy mild temperatures,wer crowds,d stunning scenery!"} -{"prompt": "What are some must-see landmarks in Sydney?","response": "Sydney is a city of iconic landmarks! Visit the Sydney Opera House,e Harbour Bridge,d the famous Bondi Beach for a taste of the city's vibrant atmosphere."} -{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} -{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo is a year-round destination,t spring (March to May) and autumn (September to November) are ideal for comfortable temperatures and festivals. Don't miss the stunning cherry blossoms in spring or the vibrant autumn foliage!"} -{"prompt": "How do I get to the Great Barrier Reef?","response": "Snorkel or dive the Great Barrier Reef! Fly into Cairns or the Whitsunday Coast,en take a ferry or boat tour to the reef. For a more luxurious experience,nsider a liveaboard boat or a guided tour from the Gold Coast."} -{"prompt": "What's the best way to see the Grand Canyon?","response": "Hike,mp, take a scenic helicopter tour - the Grand Canyon has something for every adventurer! 
For a more leisurely experience,ive the Desert View Drive or take a guided tour from nearby towns like Flagstaff or Williams."} -{"prompt": "Where's the best place to try authentic Indian cuisine?","response": "Head to Old Delhi or Mumbai for an explosion of flavors! Try popular street food like chaat,babs,d naan at local eateries like Chandni Chowk or Chowpatty Beach."} -{"prompt": "What's the best time to visit New York City?","response": "The city that never sleeps is always buzzing,t spring (April to June) and autumn (September to November) are ideal for comfortable temperatures and outdoor events. Don't miss the cherry blossoms in Central Park or the Halloween parade!"} -{"prompt": "How do I get to the Amalfi Coast?","response": "Fly into Naples or Salerno,en take a ferry or bus to the Amalfi Coast. For a more scenic route,ive the SS163 road,ich winds through picturesque towns like Positano and Amalfi."} -{"prompt": "What's the best way to see the Pyramids of Giza?","response": "Explore the Pyramids of Giza at sunrise or sunset for a breathtaking experience! Take a guided tour to learn about the history and mythology behind these ancient wonders."} -{"prompt": "Where's the best place to try authentic Mexican cuisine?","response": "Visit Oaxaca or Mexico City for a taste of authentic mole,cos,d mezcal! Try local specialties like tacos al pastor or chiles rellenos at eateries like La Casa de Toño or El Fogoncito."} -{"prompt": "What's the best time to visit the Swiss Alps?","response": "The Swiss Alps are a year-round destination,t winter (December to March) is ideal for skiing and snowboarding,ile summer (June to August) is perfect for hiking and mountain biking."} -{"prompt": "How do I get to the Taj Mahal?","response": "Fly into Delhi or Agra,en take a taxi or train to the Taj Mahal. For a more scenic route,ke a guided tour from nearby cities like Jaipur or Varanasi."} -{"prompt": "What's the best way to see the Northern Lights?","response": "Head to Tromsø or Lapland for a chance to witness the breathtaking Northern Lights! Take a guided tour or stay in a glass-roofed cabin for the best viewing experience."} -{"prompt": "Where's the best place to try authentic Chinese cuisine?","response": "Visit Beijing or Shanghai for a taste of authentic Peking duck,odles,d dumplings! Try local specialties like jiaozi or hot pot at eateries like Quanjude or Jia Jia Tang Bao."} -{"prompt": "What's the best time to visit the Great Wall of China?","response": "The Great Wall is a year-round destination,t spring (March to May) and autumn (September to November) are ideal for comfortable temperatures and fewer crowds."} -{"prompt": "How do I get to the Colosseum?","response": "Fly into Rome,en take a taxi or metro to the Colosseum. For a more scenic route,lk from the Spanish Steps or take a guided tour from nearby attractions like the Pantheon or Vatican City."} -{"prompt": "What's the best way to see the Great Ocean Road?","response": "Drive the Great Ocean Road for a stunning coastal experience! Stop at famous landmarks like the Twelve Apostles,ndon Bridge, Loch Ard Gorge for breathtaking views."} -{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} -{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo is a year-round destination,t spring (March to May) and autumn (September to November) are the best times to visit. 
Enjoy mild temperatures and vibrant festivals!"} -{"prompt": "Can you recommend a beach in Bali?","response": "You're in luck! Bali has some of the world's most stunning beaches. I recommend Nusa Dua Beach for its crystal-clear waters and powdery white sand."} -{"prompt": "What's a good restaurant in New York City?","response": "You're in the city that never sleeps! For a taste of classic NYC,ad to Carbone for an over-the-top Italian-American experience."} -{"prompt": "Can you help me plan a trip to Italy?","response": "Italy,re you come! Let's start with Rome,orence,d Venice. I can recommend the best routes,commodations,d must-see landmarks. When are you planning to go?"} -{"prompt": "What's the best way to get around in Barcelona?","response": "Barcelona is a walkable city,t you can also use the metro or hop on a bus. Don't miss the scenic views from the top of Montjuïc Hill!"} -{"prompt": "Can you recommend a hotel in Las Vegas?","response": "Vegas,by! For a luxurious stay,recommend the Bellagio Hotel and Casino. Its fountains and Conservatory & Botanical Gardens are a sight to behold!"} -{"prompt": "What's a good time to visit the Great Barrier Reef?","response": "The Great Barrier Reef is a year-round destination,t the best time to visit is from April to November when the weather is calm and the water is clear."} -{"prompt": "Can you recommend a museum in London?","response": "London's museums are world-class! I recommend the British Museum for its vast collection of artifacts from ancient civilizations."} -{"prompt": "What's a good restaurant in Sydney?","response": "Sydney's food scene is amazing! For a taste of modern Australian cuisine,ad to Quay Restaurant for stunning harbor views and exceptional service."} -{"prompt": "Can you help me plan a honeymoon in Bora Bora?","response": "Bora Bora,e ultimate honeymoon destination! Let's plan a romantic getaway with overwater bungalows and crystal-clear lagoon. When are you planning to go?"} -{"prompt": "What's a good time to visit the Grand Canyon?","response": "The Grand Canyon is a year-round destination,t the best time to visit is from September to November or March to May when the weather is mild and the crowds are smaller."} -{"prompt": "Can you recommend a hotel in Paris?","response": "Paris,e City of Light! For a luxurious stay,recommend the Shangri-La Hotel Paris for its stunning views of the Eiffel Tower and exceptional service."} -{"prompt": "What's a good restaurant in Bangkok?","response": "Bangkok's street food is incredible! For a taste of authentic Thai cuisine,ad to Jay Fai for its spicy dishes and lively atmosphere."} -{"prompt": "Can you help me plan a trip to New Zealand?","response": "New Zealand,e adventure capital of the world! Let's plan a trip to explore its stunning landscapes,om Milford Sound to Rotorua. When are you planning to go?"} -{"prompt": "What's a good time to visit the Amalfi Coast?","response": "The Amalfi Coast is a year-round destination,t the best time to visit is from April to October when the weather is warm and the sea is calm."} -{"prompt": "Can you recommend a museum in New York City?","response": "The Big Apple has some of the world's best museums! I recommend the Metropolitan Museum of Art for its vast collection of art and artifacts from around the world."} -{"prompt": "What's a good restaurant in San Francisco?","response": "San Francisco's food scene is amazing! 
For a taste of fresh seafood,ad to The Slanted Door for its stunning views of the Bay Bridge and exceptional service."} -{"prompt": "Can you help me plan a trip to Iceland?","response": "Iceland,e land of fire and ice! Let's plan a trip to explore its stunning landscapes,om the Blue Lagoon to the Golden Circle. When are you planning to go?"} -{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo's cherry blossom season is breathtaking,t it's also peak tourist season. If you're looking for smaller crowds,nsider visiting in spring or autumn. The city's vibrant energy is still palpable,d the weather is mild and comfortable."} -{"prompt": "Where can I find the best Italian food in Rome?","response": "Rome's culinary scene is incredible! For authentic Italian cuisine,ad to Trastevere neighborhood and try La Gensola or Da Enzo al 29. Don't forget to indulge in a delicious carbonara or cacio e pepe!"} -{"prompt": "What's the best way to get around New York City?","response": "The Big Apple is best explored on foot or by subway! The subway system is efficient and affordable,d you can grab a MetroCard at any station. For a unique experience,p on a double-decker bus or take a ride-hailing service like Uber or Lyft."} -{"prompt": "What are the top attractions in Bangkok?","response": "Bangkok's Grand Palace is a must-visit,th its stunning architecture and rich history. Don't miss the Temple of the Emerald Buddha and the Wat Phra Kaew complex. For a taste of local culture,plore the Chatuchak Weekend Market or take a boat ride along the Chao Phraya River."} -{"prompt": "What's the best way to plan a trip to Australia?","response": "Australia's vast and beautiful! To make the most of your trip,nsider visiting during the shoulder season (April-May or September-October) for milder weather. Don't miss the iconic Sydney Opera House,e Great Barrier Reef,d the stunning beaches of the Gold Coast."} -{"prompt": "What are some safety tips for traveling in Europe?","response": "Europe can be a bit unpredictable,t with some basic precautions,u'll be just fine! Be mindful of pickpocketing in crowded areas,ep an eye on your belongings,d avoid walking alone in dimly lit streets at night. Also,ke sure to research any local festivals or events that might impact your travel plans."} -{"prompt": "What's the best way to pack for a trip to the beach?","response": "Beach trips are all about relaxation and comfort! Pack light,eathable clothing,wide-brimmed hat,nglasses,d plenty of sunscreen. Don't forget a beach towel,waterproof phone case,d a good book to get you in the mood for some well-deserved R&R."} -{"prompt": "What are some must-see landmarks in China?","response": "China's rich history and stunning landscapes are a treasure trove of attractions! The Great Wall of China,e Forbidden City,d the Terracotta Warriors are absolute must-sees. Don't miss the breathtaking scenery of the Li River in Guilin or the stunning natural beauty of the Zhangjiajie National Forest Park."} -{"prompt": "What's the best way to plan a road trip across the United States?","response": "The open road awaits! To make the most of your road trip,an ahead and research the best routes,commodations,d attractions along the way. Consider downloading a mapping app or using a GPS device to help navigate the roads. Don't forget to pack snacks,ter,d a good map to keep you company on the journey!"} -{"prompt": "What are some tips for navigating airport security?","response": "Airport security can be a breeze with a few insider tips! 
Arrive early,ar easy-to-remove shoes,d avoid wearing anything with metal. Keep your electronics and liquids in a separate bin and be prepared to remove your laptop and liquids from your carry-on bag. And don't forget to stay hydrated and take breaks to avoid fatigue!"} -{"prompt": "What are some must-see attractions in South Africa?","response": "South Africa's diverse landscapes and wildlife are a true marvel! Take a safari in Kruger National Park to spot the Big Five (lion,ephant,ffalo,opard,d rhinoceros). Visit the stunning Cape of Good Hope and the beautiful beaches of the Garden Route. Don't miss the vibrant city of Cape Town and the historic Robben Island,ere Nelson Mandela was imprisoned."} -{"prompt": "What's the best way to stay connected while traveling abroad?","response": "Staying connected while traveling abroad is easier than ever! Consider purchasing a local SIM card or renting a portable Wi-Fi hotspot. Many airlines and airports also offer free Wi-Fi, be sure to take advantage of that. Don't forget to download a translation app or carry a phrasebook to help you communicate with locals."} -{"prompt": "I'm planning a trip to Japan,at are the must-visit places?","response": "Konnichiwa! Japan is a dream destination! You must visit Tokyo's bustling streets,y some delicious sushi in the Tsukiji Fish Market,d explore the serene gardens of the Meiji Shrine. Next,ad to Kyoto and stroll through the breathtaking Arashiyama Bamboo Forest,sit the iconic Fushimi Inari Shrine,d relax in a traditional ryokan. Don't forget to try some mouth-watering ramen and visit the Gion district to spot geishas! Want more recommendations? "} -{"prompt": "What's the best way to get around Paris?","response": "Bonjour! Paris is a walkable city,t you can also use the efficient metro system or hop on a bike to explore. Consider purchasing a Paris Visite pass for free entry to attractions and discounts on public transport. If you prefer a more leisurely pace,ke a scenic river cruise along the Seine. What's your preferred mode of transportation? "} -{"prompt": "What's the best time to visit the Great Barrier Reef?","response": "G'day! The best time to visit the Great Barrier Reef is during the dry season,om April to November,en the weather is calm and the visibility is excellent. Try to avoid the peak tourist season from June to August for a more relaxed experience. Consider visiting the reef on a liveaboard boat to maximize your snorkeling or diving time. Want to know more about reef safety? "} -{"prompt": "Can you recommend a budget-friendly hotel in New York City?","response": "Hey there! For a budget-friendly hotel in NYC,nsider staying in a neighborhood like Harlem or Brooklyn. The YOTEL New York or the Pod Hotel Brooklyn offer stylish rooms at a lower price point. Alternatively,ok into hostels like the Jazz Hostel or the Vanderbilt YMCA for a more affordable option. What's your budget for accommodations? "} -{"prompt": "What's the best way to avoid crowds at popular tourist destinations?","response": "Travel tip! To avoid crowds at popular spots,nsider visiting early in the morning or later in the evening. Use online resources to plan your itinerary and book tickets or tours in advance. Look for lesser-known alternatives or take a guided tour that skips the lines. Want to know more about crowd-avoiding strategies? "} -{"prompt": "What are some must-try foods in Italy?","response": "Buon appetito! Italy is a food lover's paradise! 
You must try authentic Neapolitan pizza in Naples,dulge in a rich pasta dish like carbonara or cacio e pepe,d sample some delicious gelato in Florence. Don't forget to try some fresh seafood in Venice and sample some local wines like Chianti or Prosecco. Want to know more about Italian cuisine? "} -{"prompt": "What's the best way to stay safe while traveling?","response": "Travel safety tip! Always research your destination,ay informed about local conditions,d trust your instincts. Keep valuables secure, aware of your surroundings,d stay in well-lit areas at night. Consider purchasing travel insurance and register with your government's travel advisory program. Want to know more about staying safe on the road? "} -{"prompt": "Can you recommend a scenic hike in the Swiss Alps?","response": "Wanderlust! For a scenic hike in the Swiss Alps,nsider the challenging but rewarding trek to the top of the Jungfraujoch,e 'Top of Europe'. Alternatively,ke a leisurely stroll through the picturesque villages of Interlaken or hike to the stunning Trümmelbach Falls. Want to know more about Swiss hiking trails? "} -{"prompt": "What's the best way to get around Rome?","response": "Ciao! Rome is a walkable city,t you can also use the efficient metro system or hop on a bus to explore. Consider purchasing a Roma Pass for free entry to attractions and discounts on public transport. If you prefer a more leisurely pace,ke a scenic stroll along the Tiber Island or a romantic gondola ride along the canals. Want to know more about Rome's public transport? "} -{"prompt": "What are some popular festivals in Japan?","response": "Matsuri time! Japan is known for its vibrant festivals,ke the Cherry Blossom Festival (Hanami) in spring,e Golden Week in May,d the Obon Festival in August. Don't miss the colorful autumn foliage during the Koyo season or the lively New Year (Oshogatsu) celebrations. Want to know more about Japan's festivals and events? "} -{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} -{"prompt": "Can you recommend a budget-friendly hotel in Tokyo?","response": "Konnichiwa! For a budget-friendly stay in Tokyo,recommend the Nui.Tokyo Central Hostel. It's centrally located,fers dorms and private rooms,d has a great atmosphere. Want to explore more options?"} -{"prompt": "What's the best way to get around New York City?","response": "The city that never sleeps! For easy navigation in NYC,suggest purchasing a MetroCard for the subway system. You can also use ride-hailing services like Uber or Lyft, hop on a bus. Want to know more about exploring the city?"} -{"prompt": "What's the best time to visit the Grand Canyon?","response": "One of the most breathtaking natural wonders! The best time to visit the Grand Canyon is from September to November or from March to May,en the weather is mild and there are fewer crowds. Want to plan a hiking trip?"} -{"prompt": "Can you recommend a romantic restaurant in Rome?","response": "La dolce vita! For a romantic dinner in Rome,recommend La Gensola,cozy trattoria with delicious pasta dishes and a charming atmosphere. Want to know more about exploring the Eternal City?"} -{"prompt": "What's the best way to see the Northern Lights?","response": "The aurora borealis! For the best views of the Northern Lights,ad to Tromsø,rway, Lapland,nland,tween September and April. 
Want to know more about planning a Northern Lights tour?"} -{"prompt": "Can you recommend a family-friendly resort in Hawaii?","response": "Aloha! For a family-friendly resort in Hawaii,recommend the Grand Wailea,luxurious resort with a water park,ds' club,d plenty of activities. Want to know more about planning a Hawaiian vacation?"} -{"prompt": "What's the best way to get to the top of the Burj Khalifa?","response": "The tallest building in the world! For a smooth ride to the top of the Burj Khalifa,ke the high-speed elevator,ich offers stunning views of Dubai. Want to know more about exploring the city?"} -{"prompt": "Can you recommend a solo traveler's hostel in Barcelona?","response": "Hola! For a solo traveler's hostel in Barcelona,recommend the Generator Barcelona,stylish hostel with a lively atmosphere and plenty of opportunities to meet fellow travelers. Want to know more about exploring the city?"} -{"prompt": "What's the best time to visit the Great Barrier Reef?","response": "One of the most incredible marine ecosystems! The best time to visit the Great Barrier Reef is from April to November,en the weather is calm and the water is clear. Want to know more about planning a snorkeling or diving trip?"} -{"prompt": "Can you recommend a budget-friendly airline for a trip to Europe?","response": "Fly budget-friendly! For a budget-friendly airline to Europe,recommend Norwegian Air,ich offers affordable flights and a range of amenities. Want to know more about planning a European adventure?"} -{"prompt": "What's the best way to see the Great Wall of China?","response": "One of the most impressive ancient structures! For the best views of the Great Wall of China,ke the Mutianyu section,ich offers stunning scenery and fewer crowds. Want to know more about planning a China tour?"} -{"prompt": "Can you recommend a luxury hotel in Dubai?","response": "The city of gold! For a luxurious stay in Dubai,recommend the Burj Al Arab,7-star hotel with opulent amenities and stunning views of the city. Want to know more about exploring Dubai?"} -{"prompt": "What's the best way to get around Sydney?","response": "The harbor city! For easy navigation in Sydney,suggest purchasing an Opal card for the public transportation system,ich includes buses,ains,d ferries. Want to know more about exploring the city?"} -{"prompt": "Can you recommend a budget-friendly restaurant in Bangkok?","response": "Street food heaven! For a budget-friendly meal in Bangkok,recommend trying the street food at Som Tam Jay So,ich offers delicious and authentic Thai dishes. Want to know more about exploring the city?"} -{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} -{"prompt": "Best time to visit Tokyo?","response": "Tokyo is a year-round destination,t spring (March to May) and autumn (September to November) are ideal for comfortable weather and vibrant festivals!"} -{"prompt": "How to get to the Great Barrier Reef?","response": "You can take a ferry or a catamaran from Cairns or Port Douglas to the Great Barrier Reef. Alternatively,ke a scenic flight or a liveaboard boat for a more immersive experience!"} -{"prompt": "What's the best beach in Bali?","response": "Seminyak Beach is a stunning spot for sunset-watching and surfing. 
If you're looking for a more secluded experience,y Nusa Dua Beach or Uluwatu Beach for breathtaking views!"} -{"prompt": "How to navigate the London Underground?","response": "The Tube is a breeze to navigate! Use the London Underground map to plan your route,d consider purchasing a Visitor Oyster Card for convenient travel!"} -{"prompt": "Best restaurant in New York City?","response": "You must try Carbone in Greenwich Village for an over-the-top Italian-American dining experience. Or,r a taste of classic NYC,ad to Katz's Delicatessen on the Lower East Side!"} -{"prompt": "What's the best way to see the Grand Canyon?","response": "Hike down into the canyon for an unforgettable adventure, take a scenic helicopter tour for a bird's-eye view. If you're short on time,guided bus tour is a great way to see the highlights!"} -{"prompt": "How to get to the Amalfi Coast?","response": "Fly into Naples Airport and take a ferry or drive along the stunning Amalfi Coast road (SS163) to Positano,alfi,d Ravello. Don't forget to stop at the picturesque town of Sorrento!"} -{"prompt": "Best time to visit the Serengeti?","response": "The Great Migration takes place from July to October,th the best viewing times in August and September. If you're interested in the Big Five,sit during the dry season (December to March)!"} -{"prompt": "What's the best way to see the Pyramids of Giza?","response": "Take a guided tour to learn about the history and significance of the pyramids. Consider visiting at sunrise or sunset for a breathtaking view,d don't forget to explore the Solar Boat Museum!"} -{"prompt": "How to get to the Galapagos Islands?","response": "Fly into Quito or Guayaquil and take a domestic flight to the Galapagos Islands. From there,ke a ferry or a speedboat to your destination. Be sure to book in advance, spaces are limited!"} -{"prompt": "Best restaurant in Bangkok?","response": "Try Som Tam Jay So for authentic and delicious Thai street food, head to Gaggan for a Michelin-starred culinary experience. Don't forget to try some street food at Chatuchak Weekend Market!"} -{"prompt": "What's the best way to see the Northern Lights?","response": "Head to Tromsø,rway, Lapland,nland,r the best views of the aurora borealis. Consider taking a guided tour or staying in a glass-roofed igloo for an unforgettable experience!"} -{"prompt": "How to get to the Great Wall of China?","response": "Take a high-speed train from Beijing to Mutianyu or Jinshanling for a more remote and less crowded experience. Alternatively,ke a guided tour from Beijing for a hassle-free day trip!"} -{"prompt": "Best time to visit the Amazon Rainforest?","response": "The dry season (June to November) is the best time to visit the Amazon,th clear skies and minimal rainfall. If you're interested in seeing the pink river dolphins,sit during the wet season (December to May)!"} -{"prompt": "What's the best way to see the Taj Mahal?","response": "Take a guided tour to learn about the history and significance of the Taj Mahal. Consider visiting at sunrise or sunset for a breathtaking view,d don't forget to explore the surrounding gardens and mausoleums!"} -{"prompt": "How to get to the Maldives?","response": "Fly into Malé International Airport and take a speedboat or seaplane to your resort. Be sure to book in advance, spaces are limited!"} -{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? 
How about a visit to the grand Palace of Versailles next?"} -{"prompt": "How do I get around Tokyo?","response": "Tokyo is a city that's always on the move! For a convenient and efficient way to get around,recommend purchasing a Suica or Pasmo card for easy access to the metro and buses. You can also take a stroll through the beautiful Imperial Palace East Garden or rent a bike and explore the city like a local!"} -{"prompt": "What's the best beach in Hawaii?","response": "Aloha! With so many stunning beaches to choose from,d recommend Kaanapali Beach on the island of Maui. Its soft white sand and crystal-clear waters make it perfect for snorkeling,nbathing, simply relaxing with a good book. Don't forget to try some delicious poke bowls at one of the local eateries!"} -{"prompt": "Can you recommend a hotel in New York City?","response": "The city that never sleeps! For a luxurious stay in the heart of Manhattan,suggest checking out the Plaza Hotel. Its elegant rooms and top-notch service will make you feel like royalty. Plus,u'll be just a stone's throw away from iconic landmarks like Central Park and Times Square!"} -{"prompt": "What's the best time to visit Rome?","response": "La dolce vita! The best time to visit Rome is in the spring (April-May) or autumn (September-October),en the weather is mild and the crowds are smaller. You can explore the Colosseum,man Forum,d Pantheon without the sweltering heat of the summer or the chilly rain of the winter. Don't forget to indulge in some delicious Italian gelato!"} -{"prompt": "Can you recommend a restaurant in Bangkok?","response": "Spice up your life! For an authentic Thai dining experience,recommend trying Som Tam Jay So at the Ari neighborhood. Their spicy papaya salad and tom yum soup are must-try dishes. Don't forget to wash it down with a refreshing glass of Singha beer!"} -{"prompt": "What's the best way to get to the Great Barrier Reef?","response": "Get ready for an underwater adventure! The best way to get to the Great Barrier Reef is by taking a ferry from Cairns or Port Douglas. You can also take a scenic flight for a bird's-eye view of the reef. Once you arrive,orkel or scuba dive to explore the vibrant coral and marine life. Don't forget to book a reputable tour operator for a safe and enjoyable experience!"} -{"prompt": "Can you recommend a museum in London?","response": "The British Museum is a treasure trove of history and culture! With over 8 million objects on display,u'll find something to fascinate you. Be sure to check out the Rosetta Stone,e Elgin Marbles,d the mummies in the Ancient Egypt gallery. Afterward,ke a stroll through Covent Garden and enjoy the street performers and lively atmosphere!"} -{"prompt": "What's the best way to get to the Grand Canyon?","response": "One of the most iconic natural wonders in the United States! The best way to get to the Grand Canyon is by flying into Phoenix or Las Vegas and then taking a shuttle or driving to the South Rim. If you're feeling adventurous,ke a scenic helicopter tour for a bird's-eye view of the canyon. Don't forget to book your accommodations in advance,pecially during peak season!"} -{"prompt": "Can you recommend a park in Paris?","response": "Oui! The Luxembourg Gardens are a hidden gem in the heart of the city. Stroll through the beautiful gardens,sit the Luxembourg Palace,d enjoy the street performers and food vendors. 
Don't forget to take a romantic boat ride on the Seine River for a unique perspective on the city!"} -{"prompt": "What's the best time to visit the Amalfi Coast?","response": "La vita è bella! The best time to visit the Amalfi Coast is in the spring (April-May) or autumn (September-October),en the weather is mild and the crowds are smaller. You can explore the picturesque towns of Positano and Amalfi,sit the stunning island of Capri,d enjoy the delicious seafood and limoncello. Don't forget to book your accommodations in advance,pecially during peak season!"} -{"prompt": "I'm planning a trip to Tokyo,n you recommend some must-see spots?","response": "Konnichiwa! You're in for a treat! A visit to the famous Tsukiji Fish Market for sushi and fresh seafood is a must. Next,ad to the serene Meiji Shrine,peaceful oasis in the heart of the bustling city. Don't miss the neon-lit streets of Shinjuku's Golden Gai for an unforgettable night out!"} -{"prompt": "I'm looking for a romantic getaway in Italy,at are some tips?","response": "Amore! Italy is the perfect destination for a romantic escape. Consider a stay in Tuscany,rrounded by rolling hills and vineyards. Visit the charming town of San Gimignano for breathtaking views and a taste of la dolce vita. Don't forget to indulge in a wine tasting and a romantic dinner at a local trattoria!"} -{"prompt": "I'm planning a family vacation in Orlando,at are some kid-friendly activities?","response": "Magical memories await! Orlando is a top destination for families. Don't miss the Wizarding World of Harry Potter at Universal Studios,ere kids can cast spells and ride thrilling attractions. Next,ad to the Kennedy Space Center for an out-of-this-world experience. And, course, trip to Orlando is complete without a visit to the one and only Disney World!"} -{"prompt": "I'm looking for a budget-friendly destination in Southeast Asia,at are some options?","response": "Adventure awaits without breaking the bank! Consider a trip to Cambodia,ere you can explore the ancient temples of Angkor Wat for a fraction of the cost. Next,ad to Vietnam,ere delicious street food and affordable accommodations await. Don't miss the stunning Ha Long Bay for a budget-friendly boat tour!"} -{"prompt": "I'm planning a solo trip to New York City,at are some tips?","response": "The city that never sleeps is calling your name! As a solo traveler,nsider staying in a safe neighborhood like Greenwich Village or the Upper West Side. Don't miss a stroll through Central Park,ere you can people-watch and take in the sights. And, course,visit to the Top of the Rock for panoramic views of the city is a must!"} -{"prompt": "I'm looking for a relaxing beach vacation in the Caribbean,at are some recommendations?","response": "Paradise found! The Caribbean is the perfect destination for a relaxing beach getaway. Consider a stay in the U.S. Virgin Islands,ere stunning beaches and crystal-clear waters await. Don't miss a visit to the famous Magens Bay Beach,ere you can soak up the sun and enjoy water sports. And, course,snorkeling trip to the nearby Buck Island is a must!"} -{"prompt": "I'm planning a road trip across the American West,at are some must-see spots?","response": "Get ready for the adventure of a lifetime! A road trip across the American West is a dream come true. Don't miss the stunning Grand Canyon,ere you can hike and take in breathtaking views. Next,ad to Monument Valley,ere you can explore ancient Navajo ruins and take in the majestic scenery. 
And, course,visit to Yellowstone National Park for geysers and wildlife is a must!"} -{"prompt": "I'm looking for a cultural experience in India,at are some tips?","response": "Experience the vibrant culture of India! Consider a visit to the bustling city of Delhi,ere you can explore ancient temples and try delicious street food. Don't miss a visit to the Taj Mahal,stunning example of Mughal architecture. And, course,visit to the colorful city of Jaipur for a taste of royal Rajasthan is a must!"} -{"prompt": "I'm planning a foodie trip to Spain,at are some must-try dishes?","response": "Tapas and tortilla, my! Spain is a food lover's paradise. Don't miss a visit to the famous La Boqueria Market in Barcelona,ere you can sample local delicacies and artisanal cheeses. Next,ad to the region of Andalusia,ere you can try authentic paella and gazpacho. And, course,visit to the famous El Celler de Can Roca in Girona for a Michelin-starred meal is a must!"} -{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} -{"prompt": "What's the best way to get around Tokyo?","response": "Konnichiwa! For a seamless Tokyo experience,recommend purchasing a Suica or Pasmo card for easy travel on the metro and buses. Don't forget to try the famous Tokyo subway system,'s a marvel!"} -{"prompt": "What's the best time to visit the Grand Canyon?","response": "Ah,e Grand Canyon! The best time to visit is from September to November or from March to May when the weather is mild and the crowds are smaller. Don't miss the sunset at Yavapai Point,'s breathtaking!"} -{"prompt": "What's the best way to see the Northern Lights?","response": "Brrr! For a magical Northern Lights experience,ad to Tromsø,rway, Fairbanks,aska. Take a guided tour or find a secluded spot away from city lights for the best views. Don't forget your camera and warmest coat!"} -{"prompt": "What's the best restaurant in Rome?","response": "Mangia! For a taste of authentic Italy,ad to Trattoria al Moro,cozy spot serving up delicious carbonara and amatriciana. Don't miss the lively atmosphere and charming staff!"} -{"prompt": "What's the best way to get to the Great Barrier Reef?","response": "Dive into adventure! For a stress-free trip to the Great Barrier Reef,ke a guided tour from Cairns or the Whitsundays. Choose a reputable operator and enjoy the crystal-clear waters and vibrant coral!"} -{"prompt": "What's the best hotel in New York City?","response": "The city that never sleeps! For a luxurious stay,ok the Plaza Hotel, iconic NYC landmark. Enjoy the opulent decor,rld-class dining,d unbeatable location!"} -{"prompt": "What's the best time to visit the Amalfi Coast?","response": "La dolce vita! For a picturesque Amalfi Coast experience,sit in the shoulder season (April-May or September-October) when the crowds are smaller and the weather is mild. Don't miss the stunning towns perched on cliffs!"} -{"prompt": "What's the best way to see the Great Wall of China?","response": "The Great Wall awaits! For an unforgettable experience,ke the Mutianyu Great Wall,less crowded section with breathtaking views. Don't miss the watchtowers and stunning scenery!"} -{"prompt": "What's the best restaurant in Bangkok?","response": "Sawatdee! For a taste of authentic Thai cuisine,ad to Jay Fai,Michelin-starred restaurant serving up delicious stir-fries and curries. 
Don't miss the lively atmosphere and expert chefs!"} -{"prompt": "What's the best way to get to the Pyramids of Giza?","response": "Pyramid power! For a convenient trip to the Pyramids,ke a guided tour from Cairo. Choose a reputable operator and enjoy the ancient history and stunning architecture!"} -{"prompt": "What's the best hotel in Sydney?","response": "The Harbour City! For a luxurious stay,ok the Shangri-La Hotel,stunning property with breathtaking views of the Opera House and Harbour Bridge. Enjoy the world-class amenities and unbeatable location!"} -{"prompt": "What's the best time to visit the Serengeti?","response": "Wildlife wonder! For a thrilling Serengeti experience,sit during the Great Migration (July-October) when the wildebeest and zebras migrate across the plains. Don't miss the stunning sunsets and abundant wildlife!"} -{"prompt": "What's the best way to see the Taj Mahal?","response": "Love is in the air! For a romantic Taj Mahal experience,sit at sunrise or sunset when the light is golden and the crowds are smaller. Don't miss the stunning architecture and poetic history!"} -{"prompt": "What's the best restaurant in Barcelona?","response": "Tapas time! For a taste of authentic Catalan cuisine,ad to Can Culleretes,historic restaurant serving up delicious paella and seafood. Don't miss the lively atmosphere and charming staff!"} -{"prompt": "What's the best way to get to the Galapagos Islands?","response": "Island adventure! For a stress-free trip to the Galapagos,ke a guided tour from Quito or Guayaquil. Choose a reputable operator and enjoy the unique wildlife and stunning landscapes!"} -{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} -{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo is a year-round destination,t spring (March to May) and autumn (September to November) are ideal for comfortable weather and fewer crowds. Don't miss the cherry blossoms in spring!"} -{"prompt": "How do I get to the Great Barrier Reef?","response": "Take a short flight from major cities like Cairns or the Gold Coast to the Great Barrier Reef. From there,u can choose from various tour operators offering snorkeling,ving, glass-bottom boat tours."} -{"prompt": "What's the best way to get around Rome?","response": "Rome is a walkable city,t you can also use public transportation like buses and trams. Consider purchasing a Roma Pass for free entry to attractions and discounted public transportation."} -{"prompt": "Can you recommend a hotel in New York City?","response": "I'd be happy to! The Plaza Hotel is a classic choice,fering luxurious accommodations and an unbeatable location in Midtown. Alternatively,nsider the Pod Hotel Brooklyn for a trendy and budget-friendly option."} -{"prompt": "What's the best time to visit the Grand Canyon?","response": "The Grand Canyon is a year-round destination,t the best time to visit is from September to November or March to May for comfortable weather and smaller crowds."} -{"prompt": "How do I plan a trip to India?","response": "Planning a trip to India can be overwhelming,t don't worry,m here to help! Consider the time of year,dget,d your interests when choosing destinations like Delhi,ra, Jaipur. 
Don't forget to research visa requirements and vaccinations."} -{"prompt": "Can you recommend a restaurant in Bangkok?","response": "You must try Jay Fai,Michelin-starred street food restaurant serving authentic Thai cuisine. Be prepared for a lively atmosphere and long lines,t it's worth the wait!"} -{"prompt": "What's the best way to get around Paris?","response": "Paris has an excellent public transportation system,cluding the metro,ses,d trains. Consider purchasing a Paris Visite pass for unlimited public transportation and discounts on attractions."} -{"prompt": "Can you recommend a hotel in Sydney?","response": "The Shangri-La Hotel Sydney offers stunning views of the Opera House and Harbour Bridge. Alternatively,nsider the QT Sydney for a trendy and boutique hotel experience."} -{"prompt": "What's the best time to visit the Amalfi Coast?","response": "The Amalfi Coast is a year-round destination,t the best time to visit is from May to October for warm weather and access to beaches and hiking trails."} -{"prompt": "How do I plan a trip to Costa Rica?","response": "Costa Rica is a nature lover's paradise! Consider the time of year,dget,d your interests when choosing destinations like Arenal,nteverde, the beach towns of Tamarindo or Puerto Viejo."} -{"prompt": "Can you recommend a restaurant in New Orleans?","response": "You must try Dooky Chase's Restaurant,legendary eatery serving authentic Creole cuisine. Be prepared for a lively atmosphere and delicious gumbo!"} -{"prompt": "What's the best way to get around Barcelona?","response": "Barcelona has a comprehensive public transportation system,cluding the metro,ses,d trains. Consider purchasing a Hola BCN card for unlimited public transportation and discounts on attractions."} -{"prompt": "Can you recommend a hotel in San Francisco?","response": "The Fairmont San Francisco offers luxurious accommodations and stunning views of the city. Alternatively,nsider the Hotel Zeppelin for a trendy and boutique hotel experience."} -{"prompt": "What's the best time to visit the Great Wall of China?","response": "The best time to visit the Great Wall of China is from September to November or March to May for comfortable weather and smaller crowds."} -{"prompt": "How do I plan a trip to Iceland?","response": "Iceland is a unique destination! Consider the time of year,dget,d your interests when choosing destinations like Reykjavik,e Golden Circle, the stunning natural beauty of the South Coast."} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "Best way to get around Tokyo?","response": "Hop on a bullet train for speed, explore the city's charming neighborhoods by foot or bike."} +{"prompt": "What's the best museum in New York?","response": "The Met is a must-visit,t don't overlook the MoMA – its collection of modern art is world-class!"} +{"prompt": "What should I pack for a trip to Australia?","response": "Don't forget sunscreen and a hat for those sunny days Down Under! 
And remember to bring comfortable shoes for all that hiking."} +{"prompt": "Best place to eat in Bangkok?","response": "For street food heaven,y the night markets – you won't be disappointed!"} +{"prompt": "What's the best way to see the Great Barrier Reef?","response": "Snorkeling or scuba diving are a must for this natural wonder."} +{"prompt": "Best place to visit in Barcelona?","response": "La Sagrada Familia is a must-see,t don't miss Park Güell and the stunning architecture of Casa Batlló."} +{"prompt": "What's the best way to get around Rome?","response": "Hop on a Vespa for an adrenaline rush, explore the city's winding streets by foot."} +{"prompt": "Best place to eat in New Orleans?","response": "Try beignets at Cafe du Monde and gumbo at Dooky Chase's Restaurant – it's a taste of the real NOLA!"} +{"prompt": "What's the best museum in London?","response": "The British Museum is world-class,t don't miss the quirky treasures at the V&A."} +{"prompt": "Best way to see the Amalfi Coast?","response": "Take a boat ride along the stunning coastline – it's breathtaking!"} +{"prompt": "What's the best time to visit Yellowstone National Park?","response": "Summer is peak tourist season,t spring and fall offer milder weather and fewer crowds."} +{"prompt": "Best place to stay in Santorini?","response": "Oia village offers breathtaking sunsets and picturesque whitewashed houses – it's a dream come true!"} +{"prompt": "What's the best way to get around Sydney?","response": "Hop on a ferry for a scenic ride or explore the city's charming neighborhoods by foot."} +{"prompt": "Where can I find the best sushi in Tokyo?","response": "Irasshaimase! For the freshest catch,y Sushi Zanmai in Tsukiji. It's a local favorite with affordable prices. Kanpai!"} +{"prompt": "I'm planning a trip to Tokyo,at are the must-see attractions?","response": "Konnichiwa! Tokyo is an incredible destination. You can't miss the Tokyo Skytree for panoramic views of the city,d the Meiji Shrine is a serene oasis amidst the bustling streets. Don't forget to explore the colorful streets of Shinjuku's Golden Gai and try some delicious street food at the Tsukiji Outer Market! What's your favorite type of cuisine?"} +{"prompt": "Can you recommend a beach resort in Bali?","response": "Bali is paradise! I'd recommend the Four Seasons Resort Bali at Jimbaran Bay for its stunning ocean views and luxurious villas. Alternatively,e Kayon Resort in Ubud offers a tranquil retreat amidst lush greenery. Both are perfect for a relaxing getaway. Are you looking for adventure activities or a romantic escape?"} +{"prompt": "What's the best way to get around New York City?","response": "The city that never sleeps! I'd suggest using the efficient subway system or taking a ride-hailing service like Uber or Lyft. You can also walk or bike across the city,pecially in neighborhoods like SoHo or Brooklyn. Don't forget to buy a MetroCard for easy travel. Want to explore a specific neighborhood or attraction?"} +{"prompt": "Can you suggest a budget-friendly accommodation in Bangkok?","response": "Bangkok on a budget! I'd recommend staying at the Naga Residence,cozy hostel with dorms and private rooms. Alternatively,e Baan Thai Residence offers affordable apartments with a homey atmosphere. Both are centrally located and close to public transportation. What's your preferred type of accommodation?"} +{"prompt": "What are the top attractions in Rome?","response": "Rome,e Eternal City! You can't miss the Colosseum,e Vatican City,d the Pantheon. 
Don't forget to explore the charming streets of Trastevere and indulge in delicious Italian cuisine. Want to visit any specific historical site or try a local dish?"} +{"prompt": "Can you recommend a family-friendly resort in the Maldives?","response": "The Maldives is a tropical paradise! I'd suggest the Soneva Fushi resort,ich offers luxurious villas and a kids' club. Alternatively,e Cheval Blanc Randheli resort has a water sports center and a kids' program. Both are perfect for a family vacation. Are you looking for snorkeling or diving activities?"} +{"prompt": "What's the best way to see the Great Barrier Reef?","response": "The Great Barrier Reef is a snorkeler's and scuba diver's dream! I'd recommend taking a guided tour from Cairns or the Whitsunday Islands. You can also visit the Reef World Aquarium for a glimpse of the marine life. Want to learn more about the reef's conservation efforts?"} +{"prompt": "Can you suggest a romantic getaway in the Swiss Alps?","response": "The Swiss Alps are a romantic haven! I'd recommend staying at the Hotel Bellevue Palace in Interlaken,ich offers breathtaking views of the Eiger,nch,d Jungfrau mountains. Alternatively,e Hotel de la Paix in Lucerne has a beautiful lakefront location. Both are perfect for a cozy retreat. Want to explore any specific mountain trail or take a scenic train ride?"} +{"prompt": "What's the best time to visit the Grand Canyon?","response": "The Grand Canyon is a natural wonder! I'd recommend visiting during the spring (March to May) or fall (September to November) for mild weather and fewer crowds. Avoid visiting during the peak summer months for a more peaceful experience. Want to learn more about the canyon's geology or take a scenic helicopter tour?"} +{"prompt": "Can you recommend a luxury hotel in Paris?","response": "Paris,e City of Love! I'd suggest staying at the Shangri-La Hotel,ris,ich offers opulent rooms and a stunning view of the Eiffel Tower. Alternatively,e Four Seasons Hotel George V Paris has a world-class spa and impeccable service. Both are perfect for a luxurious getaway. Want to explore any specific neighborhood or try a local patisserie?"} +{"prompt": "What are the top attractions in Sydney?","response": "Sydney is an iconic city! You can't miss the Sydney Opera House,e Harbour Bridge,d the famous Bondi Beach. Don't forget to explore the historic Rocks neighborhood and take a scenic ferry ride to Manly Beach. Want to visit any specific museum or try a local surf school?"} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "What's the best way to get around Tokyo?","response": "You'll love navigating Tokyo's efficient public transportation system! Consider purchasing a Suica or Pasmo card for convenient travel on trains,ses,d subways. Don't miss the scenic views from the Tokyo Skytree for a bird's-eye perspective!"} +{"prompt": "What's the best time to visit the Great Barrier Reef?","response": "The Great Barrier Reef is a year-round destination,t the best time to visit is during the dry season (April to November) for calmer waters and better visibility. Be sure to book a reputable tour operator that prioritizes reef conservation and your safety."} +{"prompt": "What's the best restaurant in New York City?","response": "You're in luck! New York City offers endless dining options. 
For a taste of classic NYC,y Carbone in Greenwich Village for an over-the-top Italian-American experience. Be sure to book reservations well in advance!"} +{"prompt": "What's the best way to see the Northern Lights?","response": "The Northern Lights are a breathtaking spectacle! For the best viewing experience,ad to Tromsø,rway, Lapland,nland,ring the peak season (September to April). Consider booking a guided tour that includes a photographer or astronomer for expert insights."} +{"prompt": "What's the best beach in Hawaii?","response": "You're in paradise! With so many stunning beaches to choose from,recommend Kaanapali Beach on Maui for its soft white sand,ystal-clear waters,d scenic sunsets. Don't miss the nearby Lahaina town for its rich history and cultural attractions."} +{"prompt": "What's the best way to get to the top of the Burj Khalifa?","response": "You're in for a thrilling experience! Take the high-speed elevator to the observation deck on the 124th floor for panoramic views of Dubai. Be sure to book tickets in advance and consider visiting during sunset for a truly unforgettable experience."} +{"prompt": "What's the best restaurant in Rome?","response": "You're in for a culinary adventure! Try La Rosetta for its modern Italian cuisine and stunning views of the Tiber Island. Be sure to book reservations well in advance and don't miss the nearby Campo de' Fiori market for a taste of authentic Italian street food."} +{"prompt": "What's the best way to see the Grand Canyon?","response": "One of the world's most iconic natural wonders! For a breathtaking view,ke the scenic drive along the Desert View Drive or hike down into the canyon for a more immersive experience. Consider booking a guided tour that includes a knowledgeable park ranger for expert insights."} +{"prompt": "What's the best time to visit the Amalfi Coast?","response": "The Amalfi Coast is a year-round destination,t the best time to visit is during the shoulder season (April to June or September to November) for mild weather and smaller crowds. Be sure to book accommodations and tours in advance to avoid the peak-season rush."} +{"prompt": "What's the best way to get around Barcelona?","response": "You'll love exploring Barcelona on foot or by bike! Consider purchasing a Hola BCN card for convenient travel on public transportation,cluding buses and metro lines. Don't miss the scenic views from the top of La Sagrada Familia for a bird's-eye perspective!"} +{"prompt": "What's the best restaurant in Bangkok?","response": "You're in for a flavorful adventure! Try Gaggan for its innovative Indian cuisine and stunning views of the city. Be sure to book reservations well in advance and don't miss the nearby Chatuchak Weekend Market for a taste of authentic Thai street food."} +{"prompt": "What's the best way to see the Great Wall of China?","response": "One of the world's most impressive architectural achievements! For a less crowded experience,sit the Mutianyu Great Wall,cated about an hour's drive north of Beijing. Consider booking a guided tour that includes a knowledgeable guide for expert insights."} +{"prompt": "What's the best time to visit the Swiss Alps?","response": "The Swiss Alps are a year-round destination,t the best time to visit is during the winter season (December to March) for world-class skiing and snowboarding. 
Be sure to book accommodations and ski passes in advance to avoid the peak-season rush."} +{"prompt": "Where can I find the best tacos in Mexico City?","response": "Taco 'bout a culinary adventure! Head to La Taqueria in the Condesa neighborhood for some mouth-watering,thentic street food. Don't forget to try the famous tacos al pastor!"} +{"prompt": "What's the best time to visit the Great Barrier Reef?","response": "Dive into the crystal-clear waters of the Great Barrier Reef! The best time to visit is from April to November when the weather is calm and the sea life is at its best. Book your snorkeling trip now!"} +{"prompt": "Can you recommend a budget-friendly hotel in Tokyo?","response": "Budget-friendly in Tokyo? You bet! Check out the Nui.Tokyo Central Hostel in the heart of the city. It's a stylish and affordable option with dorms and private rooms available. Book now and save!"} +{"prompt": "How do I get to the top of the Burj Khalifa?","response": "Reach for the stars! Take the high-speed elevator to the observation deck on the 124th floor for breathtaking views of Dubai. Don't forget to book your tickets in advance to avoid long wait times."} +{"prompt": "What's the best way to get around Rome?","response": "Get your Roman holiday started! Rent a Vespa or take the metro to explore the Eternal City. Don't forget to validate your ticket before boarding the bus or metro for a hassle-free ride."} +{"prompt": "Can you recommend a beach in Bali?","response": "Seas the day! Head to the stunning Nusa Dua Beach for crystal-clear waters,wdery white sand,d a picturesque sunset. Don't miss the nearby Uluwatu Temple for a spiritual experience."} +{"prompt": "How do I avoid tourist traps in New York City?","response": "Avoid the tourist traps and explore the real NYC! Venture off the beaten path to discover hidden gems like the High Line,elsea Market,d the Brooklyn Flea. Your wallet (and taste buds) will thank you!"} +{"prompt": "What's the best time to visit the Amalfi Coast?","response": "Sail into the Mediterranean! The Amalfi Coast is best visited from May to October when the weather is warm and sunny. Book your boat tour or hike the famous Sentiero degli Dei (Path of the Gods) for breathtaking views."} +{"prompt": "Can you recommend a family-friendly resort in Hawaii?","response": "Aloha! The Grand Wailea Resort in Maui is a paradise for families. Enjoy the water park,ds' club,d stunning ocean views. Don't miss the nearby Haleakala National Park for a sunrise hike and breathtaking views!"} +{"prompt": "How do I get to the top of the Eiffel Tower?","response": "Reach new heights! Take the stairs or elevator to the top of the iconic Eiffel Tower for panoramic views of Paris. Don't forget to book your tickets in advance to avoid long wait times."} +{"prompt": "What's the best way to explore the Swiss Alps?","response": "Get your mountain fix! Take the Jungfrau Railway to the 'Top of Europe' for breathtaking views of the Swiss Alps. Don't miss the nearby Interlaken for paragliding and hiking adventures!"} +{"prompt": "Can you recommend a boutique hotel in Barcelona?","response": "Experience la vida loca! The Hotel Casa Fuster in Passeig de Gracia is a stylish and luxurious boutique hotel with stunning architecture and a rooftop pool. Book now and soak up the Barcelona vibes!"} +{"prompt": "How do I get to the ancient city of Petra?","response": "Uncover the secrets of Petra! Take a guided tour or rent a car to explore the ancient city,cluding the famous Monastery and Treasury. 
Don't miss the nearby Wadi Rum for a desert adventure!"} +{"prompt": "What's the best time to visit the Galapagos Islands?","response": "Explore the Galapagos! The best time to visit is from June to November when the weather is dry and the wildlife is at its best. Book your boat tour or island-hopping adventure now and get ready for an unforgettable experience!"} +{"prompt": "Can you recommend a food tour in Bangkok?","response": "Savor the flavors of Bangkok! Join a food tour to explore the city's street food scene,cluding the famous Pad Thai and Tom Yum soup. Don't miss the nearby Chatuchak Weekend Market for a shopping extravaganza!"} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "What's the best way to get around Tokyo?","response": "Tokyo's a bustling city,t don't worry,ve got you covered! Consider purchasing a Suica or Pasmo card for easy access to the city's efficient public transportation system. You can also take a stroll through the beautiful Imperial Palace East Garden or explore the trendy Harajuku district on foot."} +{"prompt": "Can you recommend a budget-friendly hotel in New York City?","response": "The Big Apple can be pricey,t I've got a gem for you! Consider staying at the YOTEL New York or the Pod Hotel Brooklyn for a budget-friendly and stylish stay. Both offer comfortable rooms and a great location to explore the city."} +{"prompt": "What are some must-try foods in Italy?","response": "Buon appetito! Italy's a food lover's paradise,d you simply must try the authentic pizza in Naples,dulge in a decadent gelato in Rome,d sample the rich pasta dishes in Tuscany. Don't forget to wash it all down with a glass of Chianti!"} +{"prompt": "What's the best way to see the Grand Canyon?","response": "The Grand Canyon's a breathtaking wonder,d I've got the perfect plan for you! Take a scenic helicopter tour for a bird's-eye view,ke down into the canyon for an immersive experience, simply take in the stunning views from the South Rim. Whatever you choose,ke sure to bring your camera and a sense of adventure!"} +{"prompt": "Can you recommend a family-friendly resort in Hawaii?","response": "Aloha! Hawaii's a perfect destination for families,d I've got a top pick for you! Consider staying at the Aulani,Disney Resort & Spa in Ko Olina for a fun and relaxing vacation. With its beautiful beaches,ter parks,d kid-friendly activities,u'll create memories that will last a lifetime."} +{"prompt": "What are some safety tips for traveling in Europe?","response": "Safety first! When traveling in Europe,ways be mindful of your surroundings,ep an eye on your belongings,d avoid walking alone in dimly lit areas at night. Also,ke sure to stay up-to-date on local events and demonstrations,d follow local advice and instructions. And don't forget to register with your government's travel advisory program for added peace of mind!"} +{"prompt": "Can you recommend a romantic getaway in Paris?","response": "Romance is in the air! For a dreamy getaway in Paris,nsider staying at the Hotel Le Bristol or the Shangri-La Hotel for a luxurious and intimate experience. Take a stroll along the Seine,sit the iconic Eiffel Tower,d indulge in a romantic dinner at a charming bistro. 
Whatever you choose,ke sure to bring your loved one and a sense of romance!"} +{"prompt": "What's the best way to get to the Amalfi Coast?","response": "The Amalfi Coast's a stunning destination,d I've got the perfect plan for you! Fly into Naples International Airport and take a scenic drive along the coast to Positano,alfi, Ravello. Alternatively,nsider taking a ferry from Naples to the island of Capri for a more leisurely and picturesque journey."} +{"prompt": "Can you recommend a budget-friendly destination in Southeast Asia?","response": "Southeast Asia's a treasure trove of affordable destinations! Consider visiting Cambodia,ere you can explore the ancient temples of Angkor Wat,joy delicious street food,d stay in budget-friendly accommodations for under $20 a night. Or head to Vietnam,ere you can explore the bustling streets of Hanoi,lax on the stunning beaches of Nha Trang,d indulge in delicious pho for under $5 a bowl!"} +{"prompt": "What are some must-see landmarks in China?","response": "China's a land of breathtaking landmarks,d you simply must see the Great Wall of China,e Forbidden City in Beijing,d the stunning Terracotta Warriors in Xi'an. Don't forget to explore the vibrant city of Shanghai,th its stunning skyline,stling streets,d world-class shopping!"} +{"prompt": "Can you recommend a luxury resort in the Maldives?","response": "Luxury awaits in the Maldives! Consider staying at the Soneva Fushi or the Cheval Blanc Randheli for a truly unforgettable experience. With its crystal-clear waters,istine beaches,d world-class amenities,u'll feel like royalty in paradise!"} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "Best beaches in Bali?","response": "Bali's got some of the most stunning beaches in the world! Seminyak Beach is a must-visit for its crystal-clear waters and powdery sand. Nusa Dua is another favorite,th its calm and secluded coves. Want to explore more? I can give you some insider tips!"} +{"prompt": "How do I get to the Great Barrier Reef?","response": "The Great Barrier Reef is a snorkeler's and scuba diver's paradise! To get there,ke a ferry from Cairns or Port Douglas, fly into the Whitsunday Islands. If you're short on time,nsider a day trip from the mainland. Don't forget to book your reef tour in advance to ensure availability!"} +{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo's a year-round destination,t the best time to visit depends on your preferences! Spring (March to May) is cherry blossom season,ile autumn (September to November) is perfect for mild weather and vibrant foliage. Summer's hot and humid,t if you're a fan of festivals,is is the time for you!"} +{"prompt": "How do I get to the Pyramids of Giza?","response": "The Pyramids of Giza are an ancient wonder! To get there,ke a taxi or Uber from downtown Cairo, hop on the metro to Giza Station. From there,'s a short walk to the pyramids. Don't forget to haggle for a good price when taking a camel ride or buying souvenirs!"} +{"prompt": "What's the best way to see the Northern Lights?","response": "Seeing the Northern Lights is a once-in-a-lifetime experience! To increase your chances,ad to Tromsø,rway, Lapland,nland,tween September and April. Take a guided tour or stay in a glass-roofed igloo for the ultimate viewing experience. 
Don't forget your camera and a warm coat!"} +{"prompt": "How do I get to the Taj Mahal?","response": "The Taj Mahal is a breathtaking monument! To get there,ke a train from New Delhi to Agra, drive via the Yamuna Expressway. From Agra Station,ke a taxi or rickshaw to the Taj Mahal. Don't forget to book your tickets in advance to avoid long lines!"} +{"prompt": "What's the best time to visit New York City?","response": "The Big Apple's a year-round destination,t the best time to visit depends on your preferences! Spring (April to June) is perfect for mild weather and outdoor events,ile autumn (September to November) is great for foliage and festivals. Summer's hot and humid,t if you're a fan of music and art,is is the time for you!"} +{"prompt": "How do I get to the Colosseum?","response": "The Colosseum is an ancient wonder! To get there,ke the metro to Colosseo Station, hop on a bus from downtown Rome. From there,'s a short walk to the Colosseum. Don't forget to book your tickets in advance to avoid long lines and explore the surrounding Roman Forum!"} +{"prompt": "What's the best way to see the Grand Canyon?","response": "The Grand Canyon's a natural wonder! To see it in all its glory,ke a scenic drive along the Desert View Drive, hike down into the canyon via the Bright Angel Trail. For a bird's-eye view,ke a helicopter tour or a scenic flight over the canyon. Don't forget your sunscreen and a refillable water bottle!"} +{"prompt": "How do I get to the Eiffel Tower?","response": "The Eiffel Tower is a Parisian icon! To get there,ke the metro to Bir-Hakeim Station, hop on a bus from downtown Paris. From there,'s a short walk to the Eiffel Tower. Don't forget to book your tickets in advance to avoid long lines and enjoy the stunning views from the top!"} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "Best beaches in Bali?","response": "Tropical paradise alert! You'll adore the crystal-clear waters and powdery white sand of Nusa Dua Beach,minyak Beach,d Uluwatu Beach. Don't forget to watch the sunset at Uluwatu – it's simply breathtaking!"} +{"prompt": "Top restaurants in Tokyo?","response": "Foodie heaven! You must try the mouth-watering sushi at Sushi Saito,e tempura at Tempura Kondo,d the ramen at Ichiran. Don't forget to sample some delicious street food at the Tsukiji Outer Market!"} +{"prompt": "Best time to visit New York?","response": "The city that never sleeps! Spring (April-May) and autumn (September-October) are the best times to visit NYC,th mild temperatures and fewer crowds. Don't miss the iconic cherry blossoms in Central Park during spring!"} +{"prompt": "Top attractions in Sydney?","response": "Sydney,re you come! You can't miss the Sydney Opera House,e Harbour Bridge,d the stunning Bondi Beach. Take a scenic coastal walk from Bondi to Coogee Beach for breathtaking views of the city!"} +{"prompt": "Best ski resorts in the Swiss Alps?","response": "Get ready for a winter wonderland! Zermatt,. Moritz,d Interlaken are top picks for skiing and snowboarding. Don't miss the picturesque villages and delicious Swiss chocolate!"} +{"prompt": "Top museums in Paris?","response": "Art lovers rejoice! The Louvre,sée d'Orsay,d Centre Pompidou are must-visits. 
Don't miss the famous Mona Lisa at the Louvre and the stunning Impressionist collection at the Musée d'Orsay!"} +{"prompt": "Best hiking trails in New Zealand?","response": "Nature enthusiasts,semble! The famous Routeburn Track,lford Track,d Tongariro Alpine Crossing are not to be missed. Take in the breathtaking scenery and hike to the summit for panoramic views!"} +{"prompt": "Top restaurants in Bangkok?","response": "Street food heaven! Try the spicy tom yum soup at Jay Fai,e pad thai at Thip Samai,d the mango sticky rice at Or Tor Kor Market. Don't forget to sample some delicious night market snacks!"} +{"prompt": "Best time to visit the Great Barrier Reef?","response": "Dive into the crystal-clear waters! The dry season (April-October) is the best time to visit the Great Barrier Reef,th calm seas and clear visibility. Don't miss the vibrant coral and marine life!"} +{"prompt": "Top attractions in Rio de Janeiro?","response": "Carnival vibes! You can't miss Christ the Redeemer,garloaf Mountain,d Copacabana Beach. Take a scenic cable car ride to the top of Sugarloaf for breathtaking views of the city!"} +{"prompt": "Best time to visit the Amalfi Coast?","response": "La dolce vita! Spring (April-May) and autumn (September-October) are the best times to visit the Amalfi Coast,th mild temperatures and fewer crowds. Don't miss the stunning cliffside towns and crystal-clear waters!"} +{"prompt": "Top restaurants in San Francisco?","response": "Foodie paradise! Try the sourdough bread at Boudin Bakery,e seafood at Swan Oyster Depot,d the burritos at La Taqueria. Don't forget to sample some delicious fresh produce at the Ferry Plaza Farmers Market!"} +{"prompt": "Best time to visit the Serengeti?","response": "Wildlife enthusiasts,semble! The dry season (June-October) is the best time to visit the Serengeti,th the Great Migration taking place. Don't miss the stunning sunsets and abundant wildlife!"} +{"prompt": "Top attractions in Barcelona?","response": "Gaudí's masterpiece! You can't miss the Sagrada Familia,rk Güell,d the Gothic Quarter. Take a stroll along La Rambla and visit the famous La Boqueria Market for a taste of local cuisine!"} +{"prompt": "Best time to visit the Galapagos Islands?","response": "Island hopping! The dry season (June-October) is the best time to visit the Galapagos,th calm seas and clear visibility. Don't miss the unique wildlife and stunning landscapes!"} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "What are the top 3 beaches in Bali?","response": "Bali's beaches are stunning! My top 3 picks are Seminyak Beach for its crystal-clear waters and powdery white sand,sa Dua Beach for its picturesque sunsets and calm waters,d Kuta Beach for its lively atmosphere and surfing opportunities!"} +{"prompt": "What's the best way to get around Tokyo?","response": "Tokyo's public transportation system is incredibly efficient! I recommend purchasing a Suica or Pasmo card for easy travel on the subway,ses,d trains. You can also take a stroll through the beautiful Imperial Palace East Garden or rent a bike for a fun and scenic ride!"} +{"prompt": "What's the best time to visit the Grand Canyon?","response": "The Grand Canyon is breathtaking year-round,t the best time to visit is from September to November or March to May for mild weather and smaller crowds. 
If you're looking for a unique experience,nsider a sunrise or sunset hike for breathtaking views!"} +{"prompt": "What are some must-try foods in Italy?","response": "Italian cuisine is divine! Be sure to try authentic Neapolitan pizza in Naples,dulge in creamy gelato in Florence,d sample the world-famous pasta dishes in Rome. Don't forget to wash it all down with a glass of Chianti wine!"} +{"prompt": "What are the top 3 museums in New York City?","response": "New York City is a museum lover's paradise! My top 3 picks are the Metropolitan Museum of Art for its vast collection of art and artifacts,e American Museum of Natural History for its fascinating exhibits on science and nature,d the Guggenheim Museum for its stunning architecture and contemporary art!"} +{"prompt": "What's the best way to explore the Swiss Alps?","response": "The Swiss Alps are a hiker's dream! Consider taking a scenic train ride to the top of the Jungfraujoch for breathtaking views, hike through the picturesque valleys and villages of the Bernese Oberland. Don't forget to try some delicious Swiss chocolate and cheese along the way!"} +{"prompt": "What are some safety tips for solo female travelers?","response": "Solo female travelers,n't worry! Always trust your instincts and stay aware of your surroundings. Consider joining a group tour or using a reputable travel app for safety and support. Also, sure to research local customs and dress modestly to avoid drawing attention to yourself."} +{"prompt": "What's the best way to get from the airport to my hotel in Paris?","response": "Getting to your hotel in Paris is a breeze! Consider taking a taxi or Uber from the airport, use the convenient RER B train to get to the city center. You can also book a private transfer or shuttle service for added convenience."} +{"prompt": "What are some must-see landmarks in Bangkok?","response": "Bangkok is a city of stunning temples and grand palaces! Be sure to visit the iconic Wat Phra Kaew,plore the beautiful Grand Palace,d take a boat ride along the Chao Phraya River to see the city's majestic temples and bridges."} +{"prompt": "What's the best time to visit the Amalfi Coast?","response": "The Amalfi Coast is a summer paradise! The best time to visit is from June to September for warm weather and crystal-clear waters. If you're looking for a more relaxed experience,nsider visiting in the shoulder season (April to May or September to October) for fewer crowds and lower prices."} +{"prompt": "What are some tips for packing light?","response": "Packing light is a must for any traveler! Consider rolling your clothes to save space,ar your heaviest or bulkiest items (like your coat or boots) on the plane,d pack multipurpose items like a sarong or travel towel. Don't forget to leave some extra space in your luggage for souvenirs and shopping!"} +{"prompt": "What are some must-see attractions in Sydney?","response": "Sydney is a stunning city! Be sure to visit the iconic Sydney Opera House,ke a stroll along the picturesque Bondi Beach,d climb to the top of the Harbour Bridge for breathtaking views of the city. Don't forget to visit the famous Taronga Zoo and take a ferry ride to Manly Beach!"} +{"prompt": "I'm planning a trip to Tokyo for 7 days,at should I see?","response": "Konnichiwa! For a 7-day Tokyo adventure,ioritize the Tokyo Skytree for panoramic views,plore the vibrant streets of Shibuya,d visit the famous Tsukiji Outer Market for sushi and fresh seafood. 
Don't miss the beautiful Imperial Palace East Garden and the Meiji Shrine for a tranquil escape. Book a sumo tournament ticket if you're visiting in January,bruary, May. What's your budget for accommodations?"} +{"prompt": "I'm looking for a beach vacation in the Caribbean,at islands should I consider?","response": "Ah,land vibes! The Caribbean has so much to offer. Consider the Bahamas for crystal-clear waters and powdery white sand,maica for reggae vibes and jerk chicken, the U.S. Virgin Islands for secluded beaches and snorkeling. St. Lucia is perfect for nature lovers with its iconic twin volcanic peaks and lush rainforests. What's your preferred level of luxury?"} +{"prompt": "I'm traveling to New York City with my family,at are some kid-friendly attractions?","response": "The city that never sleeps is perfect for families! Take a stroll through Central Park,sit the American Museum of Natural History for dinosaur exhibits,d catch a Broadway show suitable for kids. Don't miss the iconic Statue of Liberty and Ellis Island,d consider a visit to the Top of the Rock for stunning city views. What's your family's age range?"} +{"prompt": "I'm planning a road trip across the United States,at's the best route?","response": "Road trip adventures! For a classic route,nsider the iconic Route 66 from Chicago to Los Angeles,opping at quirky roadside attractions and trying local cuisine. If you prefer a coastal route,ive along the Pacific Coast Highway from San Francisco to San Diego,king in breathtaking ocean views and scenic hiking trails. What's your preferred duration for the trip?"} +{"prompt": "I'm traveling to Italy for the first time,at are some must-try foods?","response": "Buon viaggio! Italian cuisine is a culinary dream. Be sure to try authentic pizza in Naples,dulge in a classic carbonara in Rome,d sample fresh seafood risotto in Venice. Don't miss the opportunity to taste gelato in Florence and enjoy a traditional Italian feast in Tuscany. What's your dietary preference?"} +{"prompt": "I'm looking for a budget-friendly destination in Southeast Asia,at are some options?","response": "Southeast Asia on a budget! Consider the beautiful island of Bali,donesia,r affordable accommodations and delicious local cuisine. Visit the vibrant city of Chiang Mai,ailand,r a rich cultural experience and try the street food. Don't miss the stunning beaches of Nha Trang,etnam,r a relaxing getaway. What's your budget for daily expenses?"} +{"prompt": "I'm planning a honeymoon in Bora Bora,at's the best resort?","response": "Romantic getaway! Bora Bora is a dream destination for couples. Consider the Four Seasons Resort Bora Bora for overwater bungalows and stunning views. Alternatively,e InterContinental Resort Bora Bora offers luxurious villas and a private beach. Don't miss the opportunity to snorkel or dive with sharks and rays. What's your preferred level of luxury?"} +{"prompt": "I'm traveling to Spain for the first time,at are some must-see landmarks?","response": "Olé! Spain is a treasure trove of history and culture. Be sure to visit the iconic Sagrada Familia in Barcelona,plore the vibrant streets of Madrid,d stroll along the picturesque Alhambra palace in Granada. Don't miss the opportunity to visit the famous Park Güell and take in the stunning views of the city. What's your preferred duration for the trip?"} +{"prompt": "I'm looking for a winter sports destination in the United States,at are some options?","response": "Winter sports enthusiast! 
Consider the Rocky Mountains in Colorado for world-class skiing and snowboarding. Visit the Lake Tahoe region in California/Nevada for stunning views and powdery slopes. Don't miss the opportunity to explore the snow-covered trails of Jackson Hole,oming,d enjoy a relaxing soak in a natural hot spring. What's your preferred level of difficulty?"} +{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo's cherry blossoms bloom in late March to early April,t September and November are great times to visit with comfortable temperatures and fewer crowds. Don't miss the neon lights of Shinjuku's Kabukicho district!"} +{"prompt": "Where can I find the best Italian food in Rome?","response": "Rome's Trastevere neighborhood is famous for its authentic Italian cuisine. Try La Gensola for some of the city's best carbonara and amatriciana. Don't forget to explore the charming alleys and piazzas around Piazza Santa Maria in Trastevere!"} +{"prompt": "What's the best way to get around New York City?","response": "The subway is the most efficient way to get around NYC. Consider purchasing a MetroCard for easy travel. You can also take a yellow cab or ride-hail,t be aware of traffic. Don't miss the iconic Statue of Liberty and Ellis Island ferries from Battery Park!"} +{"prompt": "Where can I find the best beaches in Bali?","response": "Bali's south coast is home to some of the island's most beautiful beaches. Seminyak Beach offers luxury resorts and sunbathing,ile Kuta Beach is perfect for surfers and partygoers. Don't miss the stunning sunsets at Uluwatu Beach!"} +{"prompt": "What's the best time to visit the Grand Canyon?","response": "The Grand Canyon is open year-round,t the best time to visit is from September to November or March to May,en the weather is mild and crowds are smaller. Don't miss the breathtaking sunrise or sunset at the South Rim!"} +{"prompt": "Where can I find the best tapas in Barcelona?","response": "Barcelona's El Raval neighborhood is a foodie's paradise,th many tapas bars and restaurants. Try Cal Pep for some of the city's best seafood and tapas. Don't miss the lively atmosphere and street performers in La Rambla!"} +{"prompt": "What's the best way to see the sights in Paris?","response": "Consider purchasing a Paris Museum Pass for skip-the-line access to popular attractions like the Eiffel Tower and Louvre. You can also take a Seine River cruise for a scenic view of the city. Don't miss the stunning Notre-Dame Cathedral!"} +{"prompt": "Where can I find the best hiking trails in New Zealand?","response": "New Zealand's South Island is home to some of the world's most beautiful hiking trails. Try the famous Routeburn Track or the scenic Tongariro Alpine Crossing. Don't miss the stunning Milford Sound and Fiordland National Park!"} +{"prompt": "What's the best time to visit the Great Barrier Reef?","response": "The Great Barrier Reef is open year-round,t the best time to visit is from April to November,en the weather is calm and visibility is good. Don't miss the opportunity to snorkel or scuba dive with sea turtles and colorful fish!"} +{"prompt": "Where can I find the best Indian food in Mumbai?","response": "Mumbai's Colaba neighborhood is famous for its street food and restaurants. Try Bademiya for some of the city's best kebabs and biryani. Don't miss the vibrant atmosphere and street performers in the Colaba Causeway!"} +{"prompt": "What's the best way to get around Sydney?","response": "Sydney's public transportation system is efficient and easy to use. 
Consider purchasing an Opal card for convenient travel on buses,ains,d ferries. Don't miss the stunning views of the Opera House and Harbour Bridge from a ferry ride!"} +{"prompt": "Where can I find the best coffee in Melbourne?","response": "Melbourne's CBD is home to many of the city's best coffee shops. Try Pellegrini's Espresso Bar for some of the city's best coffee and Italian cuisine. Don't miss the lively atmosphere and street performers in the Bourke Street Mall!"} +{"prompt": "What's the best time to visit the Amalfi Coast?","response": "The Amalfi Coast is open year-round,t the best time to visit is from April to October,en the weather is warm and sunny. Don't miss the stunning scenery and picturesque towns like Positano and Amalfi!"} +{"prompt": "Where can I find the best seafood in San Francisco?","response": "San Francisco's Fisherman's Wharf is famous for its fresh seafood. Try Alioto's for some of the city's best seafood and stunning views of the Bay Bridge. Don't miss the lively atmosphere and street performers in Pier 39!"} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo is a year-round destination,t spring (March to May) and autumn (September to November) are the best seasons to visit. Enjoy mild temperatures and vibrant cherry blossoms in spring, the golden foliage in autumn!"} +{"prompt": "Where can I find the best Italian food in Rome?","response": "Buon appetito! You'll find authentic Italian cuisine in the heart of Rome's Trastevere neighborhood. Try La Gensola for a taste of traditional Roman dishes, head to Pizzeria La Montecarlo for a delicious Neapolitan-style pizza!"} +{"prompt": "What's the best way to get around New York City?","response": "The city that never sleeps is best explored on foot or by public transportation! Grab a MetroCard and hop on the subway, take a leisurely stroll across the Brooklyn Bridge for spectacular city views."} +{"prompt": "What are some must-see landmarks in Bangkok?","response": "Get ready to explore the vibrant streets of Bangkok! Be sure to visit the stunning Grand Palace,e iconic Wat Phra Kaew,d the bustling Chatuchak Weekend Market for an unforgettable shopping experience."} +{"prompt": "What's the best time to visit the Great Barrier Reef?","response": "The Great Barrier Reef is a year-round destination,t the best time to visit is during the dry season (April to November). Enjoy calm seas and crystal-clear waters for an unforgettable snorkeling or diving experience!"} +{"prompt": "Where can I find the best street food in Bangkok?","response": "Bangkok's street food scene is a culinary paradise! Head to the famous Rot Fai Night Market or try the street food stalls at Sukhumvit Road for a taste of authentic Thai cuisine."} +{"prompt": "What are some tips for navigating airport security?","response": "Flying soon? Make sure to arrive at the airport with plenty of time to spare,d be prepared to remove electronics and liquids from your carry-on bags. Wear easy-to-remove shoes and avoid any prohibited items to speed up the security process!"} +{"prompt": "What's the best way to see the Northern Lights?","response": "Experience the breathtaking beauty of the Northern Lights in Tromsø,rway! 
Take a guided tour to maximize your chances of witnessing this natural phenomenon,d don't forget your camera to capture the moment!"} +{"prompt": "Where can I find the best coffee in Melbourne?","response": "Melbourne is a coffee lover's paradise! Visit the iconic Pellegrini's Espresso Bar or try the specialty coffee at Brother Baba Budan for a taste of the city's best brews."} +{"prompt": "What are some must-see museums in Paris?","response": "The City of Light is home to some of the world's most famous museums! Visit the Louvre for the Mona Lisa,e Musée d'Orsay for Impressionist masterpieces,d the Musée Rodin for a glimpse into the life of the famous sculptor."} +{"prompt": "What's the best way to get around Barcelona?","response": "Barcelona is a pedestrian-friendly city! Walk along La Rambla,plore the Gothic Quarter,d take a stroll along the beach for a taste of the city's vibrant atmosphere."} +{"prompt": "What are some tips for packing for a trip?","response": "Packing for a trip can be overwhelming,t don't worry,'ve got you covered! Pack lightweight clothing,mfortable shoes,d essential items like sunscreen,portable charger,d a travel adapter to ensure a stress-free journey!"} +{"prompt": "Where can I find the best Mexican food in Mexico City?","response": "Mexico City is a food lover's dream! Visit the famous La Merced market or try the authentic street food at El Fogoncito for a taste of the city's best Mexican cuisine."} +{"prompt": "What's the best time to visit the Amalfi Coast?","response": "The Amalfi Coast is a year-round destination,t the best time to visit is during the shoulder season (April to June or September to November). Enjoy mild temperatures,wer crowds,d stunning scenery!"} +{"prompt": "What are some must-see landmarks in Sydney?","response": "Sydney is a city of iconic landmarks! Visit the Sydney Opera House,e Harbour Bridge,d the famous Bondi Beach for a taste of the city's vibrant atmosphere."} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo is a year-round destination,t spring (March to May) and autumn (September to November) are ideal for comfortable temperatures and festivals. Don't miss the stunning cherry blossoms in spring or the vibrant autumn foliage!"} +{"prompt": "How do I get to the Great Barrier Reef?","response": "Snorkel or dive the Great Barrier Reef! Fly into Cairns or the Whitsunday Coast,en take a ferry or boat tour to the reef. For a more luxurious experience,nsider a liveaboard boat or a guided tour from the Gold Coast."} +{"prompt": "What's the best way to see the Grand Canyon?","response": "Hike,mp, take a scenic helicopter tour - the Grand Canyon has something for every adventurer! For a more leisurely experience,ive the Desert View Drive or take a guided tour from nearby towns like Flagstaff or Williams."} +{"prompt": "Where's the best place to try authentic Indian cuisine?","response": "Head to Old Delhi or Mumbai for an explosion of flavors! Try popular street food like chaat,babs,d naan at local eateries like Chandni Chowk or Chowpatty Beach."} +{"prompt": "What's the best time to visit New York City?","response": "The city that never sleeps is always buzzing,t spring (April to June) and autumn (September to November) are ideal for comfortable temperatures and outdoor events. 
Don't miss the cherry blossoms in Central Park or the Halloween parade!"} +{"prompt": "How do I get to the Amalfi Coast?","response": "Fly into Naples or Salerno,en take a ferry or bus to the Amalfi Coast. For a more scenic route,ive the SS163 road,ich winds through picturesque towns like Positano and Amalfi."} +{"prompt": "What's the best way to see the Pyramids of Giza?","response": "Explore the Pyramids of Giza at sunrise or sunset for a breathtaking experience! Take a guided tour to learn about the history and mythology behind these ancient wonders."} +{"prompt": "Where's the best place to try authentic Mexican cuisine?","response": "Visit Oaxaca or Mexico City for a taste of authentic mole,cos,d mezcal! Try local specialties like tacos al pastor or chiles rellenos at eateries like La Casa de Toño or El Fogoncito."} +{"prompt": "What's the best time to visit the Swiss Alps?","response": "The Swiss Alps are a year-round destination,t winter (December to March) is ideal for skiing and snowboarding,ile summer (June to August) is perfect for hiking and mountain biking."} +{"prompt": "How do I get to the Taj Mahal?","response": "Fly into Delhi or Agra,en take a taxi or train to the Taj Mahal. For a more scenic route,ke a guided tour from nearby cities like Jaipur or Varanasi."} +{"prompt": "What's the best way to see the Northern Lights?","response": "Head to Tromsø or Lapland for a chance to witness the breathtaking Northern Lights! Take a guided tour or stay in a glass-roofed cabin for the best viewing experience."} +{"prompt": "Where's the best place to try authentic Chinese cuisine?","response": "Visit Beijing or Shanghai for a taste of authentic Peking duck,odles,d dumplings! Try local specialties like jiaozi or hot pot at eateries like Quanjude or Jia Jia Tang Bao."} +{"prompt": "What's the best time to visit the Great Wall of China?","response": "The Great Wall is a year-round destination,t spring (March to May) and autumn (September to November) are ideal for comfortable temperatures and fewer crowds."} +{"prompt": "How do I get to the Colosseum?","response": "Fly into Rome,en take a taxi or metro to the Colosseum. For a more scenic route,lk from the Spanish Steps or take a guided tour from nearby attractions like the Pantheon or Vatican City."} +{"prompt": "What's the best way to see the Great Ocean Road?","response": "Drive the Great Ocean Road for a stunning coastal experience! Stop at famous landmarks like the Twelve Apostles,ndon Bridge, Loch Ard Gorge for breathtaking views."} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo is a year-round destination,t spring (March to May) and autumn (September to November) are the best times to visit. Enjoy mild temperatures and vibrant festivals!"} +{"prompt": "Can you recommend a beach in Bali?","response": "You're in luck! Bali has some of the world's most stunning beaches. I recommend Nusa Dua Beach for its crystal-clear waters and powdery white sand."} +{"prompt": "What's a good restaurant in New York City?","response": "You're in the city that never sleeps! For a taste of classic NYC,ad to Carbone for an over-the-top Italian-American experience."} +{"prompt": "Can you help me plan a trip to Italy?","response": "Italy,re you come! Let's start with Rome,orence,d Venice. 
I can recommend the best routes,commodations,d must-see landmarks. When are you planning to go?"} +{"prompt": "What's the best way to get around in Barcelona?","response": "Barcelona is a walkable city,t you can also use the metro or hop on a bus. Don't miss the scenic views from the top of Montjuïc Hill!"} +{"prompt": "Can you recommend a hotel in Las Vegas?","response": "Vegas,by! For a luxurious stay,recommend the Bellagio Hotel and Casino. Its fountains and Conservatory & Botanical Gardens are a sight to behold!"} +{"prompt": "What's a good time to visit the Great Barrier Reef?","response": "The Great Barrier Reef is a year-round destination,t the best time to visit is from April to November when the weather is calm and the water is clear."} +{"prompt": "Can you recommend a museum in London?","response": "London's museums are world-class! I recommend the British Museum for its vast collection of artifacts from ancient civilizations."} +{"prompt": "What's a good restaurant in Sydney?","response": "Sydney's food scene is amazing! For a taste of modern Australian cuisine,ad to Quay Restaurant for stunning harbor views and exceptional service."} +{"prompt": "Can you help me plan a honeymoon in Bora Bora?","response": "Bora Bora,e ultimate honeymoon destination! Let's plan a romantic getaway with overwater bungalows and crystal-clear lagoon. When are you planning to go?"} +{"prompt": "What's a good time to visit the Grand Canyon?","response": "The Grand Canyon is a year-round destination,t the best time to visit is from September to November or March to May when the weather is mild and the crowds are smaller."} +{"prompt": "Can you recommend a hotel in Paris?","response": "Paris,e City of Light! For a luxurious stay,recommend the Shangri-La Hotel Paris for its stunning views of the Eiffel Tower and exceptional service."} +{"prompt": "What's a good restaurant in Bangkok?","response": "Bangkok's street food is incredible! For a taste of authentic Thai cuisine,ad to Jay Fai for its spicy dishes and lively atmosphere."} +{"prompt": "Can you help me plan a trip to New Zealand?","response": "New Zealand,e adventure capital of the world! Let's plan a trip to explore its stunning landscapes,om Milford Sound to Rotorua. When are you planning to go?"} +{"prompt": "What's a good time to visit the Amalfi Coast?","response": "The Amalfi Coast is a year-round destination,t the best time to visit is from April to October when the weather is warm and the sea is calm."} +{"prompt": "Can you recommend a museum in New York City?","response": "The Big Apple has some of the world's best museums! I recommend the Metropolitan Museum of Art for its vast collection of art and artifacts from around the world."} +{"prompt": "What's a good restaurant in San Francisco?","response": "San Francisco's food scene is amazing! For a taste of fresh seafood,ad to The Slanted Door for its stunning views of the Bay Bridge and exceptional service."} +{"prompt": "Can you help me plan a trip to Iceland?","response": "Iceland,e land of fire and ice! Let's plan a trip to explore its stunning landscapes,om the Blue Lagoon to the Golden Circle. When are you planning to go?"} +{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo's cherry blossom season is breathtaking,t it's also peak tourist season. If you're looking for smaller crowds,nsider visiting in spring or autumn. 
The city's vibrant energy is still palpable,d the weather is mild and comfortable."} +{"prompt": "Where can I find the best Italian food in Rome?","response": "Rome's culinary scene is incredible! For authentic Italian cuisine,ad to Trastevere neighborhood and try La Gensola or Da Enzo al 29. Don't forget to indulge in a delicious carbonara or cacio e pepe!"} +{"prompt": "What's the best way to get around New York City?","response": "The Big Apple is best explored on foot or by subway! The subway system is efficient and affordable,d you can grab a MetroCard at any station. For a unique experience,p on a double-decker bus or take a ride-hailing service like Uber or Lyft."} +{"prompt": "What are the top attractions in Bangkok?","response": "Bangkok's Grand Palace is a must-visit,th its stunning architecture and rich history. Don't miss the Temple of the Emerald Buddha and the Wat Phra Kaew complex. For a taste of local culture,plore the Chatuchak Weekend Market or take a boat ride along the Chao Phraya River."} +{"prompt": "What's the best way to plan a trip to Australia?","response": "Australia's vast and beautiful! To make the most of your trip,nsider visiting during the shoulder season (April-May or September-October) for milder weather. Don't miss the iconic Sydney Opera House,e Great Barrier Reef,d the stunning beaches of the Gold Coast."} +{"prompt": "What are some safety tips for traveling in Europe?","response": "Europe can be a bit unpredictable,t with some basic precautions,u'll be just fine! Be mindful of pickpocketing in crowded areas,ep an eye on your belongings,d avoid walking alone in dimly lit streets at night. Also,ke sure to research any local festivals or events that might impact your travel plans."} +{"prompt": "What's the best way to pack for a trip to the beach?","response": "Beach trips are all about relaxation and comfort! Pack light,eathable clothing,wide-brimmed hat,nglasses,d plenty of sunscreen. Don't forget a beach towel,waterproof phone case,d a good book to get you in the mood for some well-deserved R&R."} +{"prompt": "What are some must-see landmarks in China?","response": "China's rich history and stunning landscapes are a treasure trove of attractions! The Great Wall of China,e Forbidden City,d the Terracotta Warriors are absolute must-sees. Don't miss the breathtaking scenery of the Li River in Guilin or the stunning natural beauty of the Zhangjiajie National Forest Park."} +{"prompt": "What's the best way to plan a road trip across the United States?","response": "The open road awaits! To make the most of your road trip,an ahead and research the best routes,commodations,d attractions along the way. Consider downloading a mapping app or using a GPS device to help navigate the roads. Don't forget to pack snacks,ter,d a good map to keep you company on the journey!"} +{"prompt": "What are some tips for navigating airport security?","response": "Airport security can be a breeze with a few insider tips! Arrive early,ar easy-to-remove shoes,d avoid wearing anything with metal. Keep your electronics and liquids in a separate bin and be prepared to remove your laptop and liquids from your carry-on bag. And don't forget to stay hydrated and take breaks to avoid fatigue!"} +{"prompt": "What are some must-see attractions in South Africa?","response": "South Africa's diverse landscapes and wildlife are a true marvel! Take a safari in Kruger National Park to spot the Big Five (lion,ephant,ffalo,opard,d rhinoceros). 
Visit the stunning Cape of Good Hope and the beautiful beaches of the Garden Route. Don't miss the vibrant city of Cape Town and the historic Robben Island,ere Nelson Mandela was imprisoned."} +{"prompt": "What's the best way to stay connected while traveling abroad?","response": "Staying connected while traveling abroad is easier than ever! Consider purchasing a local SIM card or renting a portable Wi-Fi hotspot. Many airlines and airports also offer free Wi-Fi, be sure to take advantage of that. Don't forget to download a translation app or carry a phrasebook to help you communicate with locals."} +{"prompt": "I'm planning a trip to Japan,at are the must-visit places?","response": "Konnichiwa! Japan is a dream destination! You must visit Tokyo's bustling streets,y some delicious sushi in the Tsukiji Fish Market,d explore the serene gardens of the Meiji Shrine. Next,ad to Kyoto and stroll through the breathtaking Arashiyama Bamboo Forest,sit the iconic Fushimi Inari Shrine,d relax in a traditional ryokan. Don't forget to try some mouth-watering ramen and visit the Gion district to spot geishas! Want more recommendations? "} +{"prompt": "What's the best way to get around Paris?","response": "Bonjour! Paris is a walkable city,t you can also use the efficient metro system or hop on a bike to explore. Consider purchasing a Paris Visite pass for free entry to attractions and discounts on public transport. If you prefer a more leisurely pace,ke a scenic river cruise along the Seine. What's your preferred mode of transportation? "} +{"prompt": "What's the best time to visit the Great Barrier Reef?","response": "G'day! The best time to visit the Great Barrier Reef is during the dry season,om April to November,en the weather is calm and the visibility is excellent. Try to avoid the peak tourist season from June to August for a more relaxed experience. Consider visiting the reef on a liveaboard boat to maximize your snorkeling or diving time. Want to know more about reef safety? "} +{"prompt": "Can you recommend a budget-friendly hotel in New York City?","response": "Hey there! For a budget-friendly hotel in NYC,nsider staying in a neighborhood like Harlem or Brooklyn. The YOTEL New York or the Pod Hotel Brooklyn offer stylish rooms at a lower price point. Alternatively,ok into hostels like the Jazz Hostel or the Vanderbilt YMCA for a more affordable option. What's your budget for accommodations? "} +{"prompt": "What's the best way to avoid crowds at popular tourist destinations?","response": "Travel tip! To avoid crowds at popular spots,nsider visiting early in the morning or later in the evening. Use online resources to plan your itinerary and book tickets or tours in advance. Look for lesser-known alternatives or take a guided tour that skips the lines. Want to know more about crowd-avoiding strategies? "} +{"prompt": "What are some must-try foods in Italy?","response": "Buon appetito! Italy is a food lover's paradise! You must try authentic Neapolitan pizza in Naples,dulge in a rich pasta dish like carbonara or cacio e pepe,d sample some delicious gelato in Florence. Don't forget to try some fresh seafood in Venice and sample some local wines like Chianti or Prosecco. Want to know more about Italian cuisine? "} +{"prompt": "What's the best way to stay safe while traveling?","response": "Travel safety tip! Always research your destination,ay informed about local conditions,d trust your instincts. Keep valuables secure, aware of your surroundings,d stay in well-lit areas at night. 
Consider purchasing travel insurance and register with your government's travel advisory program. Want to know more about staying safe on the road? "} +{"prompt": "Can you recommend a scenic hike in the Swiss Alps?","response": "Wanderlust! For a scenic hike in the Swiss Alps,nsider the challenging but rewarding trek to the top of the Jungfraujoch,e 'Top of Europe'. Alternatively,ke a leisurely stroll through the picturesque villages of Interlaken or hike to the stunning Trümmelbach Falls. Want to know more about Swiss hiking trails? "} +{"prompt": "What's the best way to get around Rome?","response": "Ciao! Rome is a walkable city,t you can also use the efficient metro system or hop on a bus to explore. Consider purchasing a Roma Pass for free entry to attractions and discounts on public transport. If you prefer a more leisurely pace,ke a scenic stroll along the Tiber Island or a romantic gondola ride along the canals. Want to know more about Rome's public transport? "} +{"prompt": "What are some popular festivals in Japan?","response": "Matsuri time! Japan is known for its vibrant festivals,ke the Cherry Blossom Festival (Hanami) in spring,e Golden Week in May,d the Obon Festival in August. Don't miss the colorful autumn foliage during the Koyo season or the lively New Year (Oshogatsu) celebrations. Want to know more about Japan's festivals and events? "} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "Can you recommend a budget-friendly hotel in Tokyo?","response": "Konnichiwa! For a budget-friendly stay in Tokyo,recommend the Nui.Tokyo Central Hostel. It's centrally located,fers dorms and private rooms,d has a great atmosphere. Want to explore more options?"} +{"prompt": "What's the best way to get around New York City?","response": "The city that never sleeps! For easy navigation in NYC,suggest purchasing a MetroCard for the subway system. You can also use ride-hailing services like Uber or Lyft, hop on a bus. Want to know more about exploring the city?"} +{"prompt": "What's the best time to visit the Grand Canyon?","response": "One of the most breathtaking natural wonders! The best time to visit the Grand Canyon is from September to November or from March to May,en the weather is mild and there are fewer crowds. Want to plan a hiking trip?"} +{"prompt": "Can you recommend a romantic restaurant in Rome?","response": "La dolce vita! For a romantic dinner in Rome,recommend La Gensola,cozy trattoria with delicious pasta dishes and a charming atmosphere. Want to know more about exploring the Eternal City?"} +{"prompt": "What's the best way to see the Northern Lights?","response": "The aurora borealis! For the best views of the Northern Lights,ad to Tromsø,rway, Lapland,nland,tween September and April. Want to know more about planning a Northern Lights tour?"} +{"prompt": "Can you recommend a family-friendly resort in Hawaii?","response": "Aloha! For a family-friendly resort in Hawaii,recommend the Grand Wailea,luxurious resort with a water park,ds' club,d plenty of activities. Want to know more about planning a Hawaiian vacation?"} +{"prompt": "What's the best way to get to the top of the Burj Khalifa?","response": "The tallest building in the world! For a smooth ride to the top of the Burj Khalifa,ke the high-speed elevator,ich offers stunning views of Dubai. 
Want to know more about exploring the city?"} +{"prompt": "Can you recommend a solo traveler's hostel in Barcelona?","response": "Hola! For a solo traveler's hostel in Barcelona,recommend the Generator Barcelona,stylish hostel with a lively atmosphere and plenty of opportunities to meet fellow travelers. Want to know more about exploring the city?"} +{"prompt": "What's the best time to visit the Great Barrier Reef?","response": "One of the most incredible marine ecosystems! The best time to visit the Great Barrier Reef is from April to November,en the weather is calm and the water is clear. Want to know more about planning a snorkeling or diving trip?"} +{"prompt": "Can you recommend a budget-friendly airline for a trip to Europe?","response": "Fly budget-friendly! For a budget-friendly airline to Europe,recommend Norwegian Air,ich offers affordable flights and a range of amenities. Want to know more about planning a European adventure?"} +{"prompt": "What's the best way to see the Great Wall of China?","response": "One of the most impressive ancient structures! For the best views of the Great Wall of China,ke the Mutianyu section,ich offers stunning scenery and fewer crowds. Want to know more about planning a China tour?"} +{"prompt": "Can you recommend a luxury hotel in Dubai?","response": "The city of gold! For a luxurious stay in Dubai,recommend the Burj Al Arab,7-star hotel with opulent amenities and stunning views of the city. Want to know more about exploring Dubai?"} +{"prompt": "What's the best way to get around Sydney?","response": "The harbor city! For easy navigation in Sydney,suggest purchasing an Opal card for the public transportation system,ich includes buses,ains,d ferries. Want to know more about exploring the city?"} +{"prompt": "Can you recommend a budget-friendly restaurant in Bangkok?","response": "Street food heaven! For a budget-friendly meal in Bangkok,recommend trying the street food at Som Tam Jay So,ich offers delicious and authentic Thai dishes. Want to know more about exploring the city?"} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "Best time to visit Tokyo?","response": "Tokyo is a year-round destination,t spring (March to May) and autumn (September to November) are ideal for comfortable weather and vibrant festivals!"} +{"prompt": "How to get to the Great Barrier Reef?","response": "You can take a ferry or a catamaran from Cairns or Port Douglas to the Great Barrier Reef. Alternatively,ke a scenic flight or a liveaboard boat for a more immersive experience!"} +{"prompt": "What's the best beach in Bali?","response": "Seminyak Beach is a stunning spot for sunset-watching and surfing. If you're looking for a more secluded experience,y Nusa Dua Beach or Uluwatu Beach for breathtaking views!"} +{"prompt": "How to navigate the London Underground?","response": "The Tube is a breeze to navigate! Use the London Underground map to plan your route,d consider purchasing a Visitor Oyster Card for convenient travel!"} +{"prompt": "Best restaurant in New York City?","response": "You must try Carbone in Greenwich Village for an over-the-top Italian-American dining experience. 
Or,r a taste of classic NYC,ad to Katz's Delicatessen on the Lower East Side!"} +{"prompt": "What's the best way to see the Grand Canyon?","response": "Hike down into the canyon for an unforgettable adventure, take a scenic helicopter tour for a bird's-eye view. If you're short on time,guided bus tour is a great way to see the highlights!"} +{"prompt": "How to get to the Amalfi Coast?","response": "Fly into Naples Airport and take a ferry or drive along the stunning Amalfi Coast road (SS163) to Positano,alfi,d Ravello. Don't forget to stop at the picturesque town of Sorrento!"} +{"prompt": "Best time to visit the Serengeti?","response": "The Great Migration takes place from July to October,th the best viewing times in August and September. If you're interested in the Big Five,sit during the dry season (December to March)!"} +{"prompt": "What's the best way to see the Pyramids of Giza?","response": "Take a guided tour to learn about the history and significance of the pyramids. Consider visiting at sunrise or sunset for a breathtaking view,d don't forget to explore the Solar Boat Museum!"} +{"prompt": "How to get to the Galapagos Islands?","response": "Fly into Quito or Guayaquil and take a domestic flight to the Galapagos Islands. From there,ke a ferry or a speedboat to your destination. Be sure to book in advance, spaces are limited!"} +{"prompt": "Best restaurant in Bangkok?","response": "Try Som Tam Jay So for authentic and delicious Thai street food, head to Gaggan for a Michelin-starred culinary experience. Don't forget to try some street food at Chatuchak Weekend Market!"} +{"prompt": "What's the best way to see the Northern Lights?","response": "Head to Tromsø,rway, Lapland,nland,r the best views of the aurora borealis. Consider taking a guided tour or staying in a glass-roofed igloo for an unforgettable experience!"} +{"prompt": "How to get to the Great Wall of China?","response": "Take a high-speed train from Beijing to Mutianyu or Jinshanling for a more remote and less crowded experience. Alternatively,ke a guided tour from Beijing for a hassle-free day trip!"} +{"prompt": "Best time to visit the Amazon Rainforest?","response": "The dry season (June to November) is the best time to visit the Amazon,th clear skies and minimal rainfall. If you're interested in seeing the pink river dolphins,sit during the wet season (December to May)!"} +{"prompt": "What's the best way to see the Taj Mahal?","response": "Take a guided tour to learn about the history and significance of the Taj Mahal. Consider visiting at sunrise or sunset for a breathtaking view,d don't forget to explore the surrounding gardens and mausoleums!"} +{"prompt": "How to get to the Maldives?","response": "Fly into Malé International Airport and take a speedboat or seaplane to your resort. Be sure to book in advance, spaces are limited!"} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "How do I get around Tokyo?","response": "Tokyo is a city that's always on the move! For a convenient and efficient way to get around,recommend purchasing a Suica or Pasmo card for easy access to the metro and buses. You can also take a stroll through the beautiful Imperial Palace East Garden or rent a bike and explore the city like a local!"} +{"prompt": "What's the best beach in Hawaii?","response": "Aloha! 
With so many stunning beaches to choose from,d recommend Kaanapali Beach on the island of Maui. Its soft white sand and crystal-clear waters make it perfect for snorkeling,nbathing, simply relaxing with a good book. Don't forget to try some delicious poke bowls at one of the local eateries!"} +{"prompt": "Can you recommend a hotel in New York City?","response": "The city that never sleeps! For a luxurious stay in the heart of Manhattan,suggest checking out the Plaza Hotel. Its elegant rooms and top-notch service will make you feel like royalty. Plus,u'll be just a stone's throw away from iconic landmarks like Central Park and Times Square!"} +{"prompt": "What's the best time to visit Rome?","response": "La dolce vita! The best time to visit Rome is in the spring (April-May) or autumn (September-October),en the weather is mild and the crowds are smaller. You can explore the Colosseum,man Forum,d Pantheon without the sweltering heat of the summer or the chilly rain of the winter. Don't forget to indulge in some delicious Italian gelato!"} +{"prompt": "Can you recommend a restaurant in Bangkok?","response": "Spice up your life! For an authentic Thai dining experience,recommend trying Som Tam Jay So at the Ari neighborhood. Their spicy papaya salad and tom yum soup are must-try dishes. Don't forget to wash it down with a refreshing glass of Singha beer!"} +{"prompt": "What's the best way to get to the Great Barrier Reef?","response": "Get ready for an underwater adventure! The best way to get to the Great Barrier Reef is by taking a ferry from Cairns or Port Douglas. You can also take a scenic flight for a bird's-eye view of the reef. Once you arrive,orkel or scuba dive to explore the vibrant coral and marine life. Don't forget to book a reputable tour operator for a safe and enjoyable experience!"} +{"prompt": "Can you recommend a museum in London?","response": "The British Museum is a treasure trove of history and culture! With over 8 million objects on display,u'll find something to fascinate you. Be sure to check out the Rosetta Stone,e Elgin Marbles,d the mummies in the Ancient Egypt gallery. Afterward,ke a stroll through Covent Garden and enjoy the street performers and lively atmosphere!"} +{"prompt": "What's the best way to get to the Grand Canyon?","response": "One of the most iconic natural wonders in the United States! The best way to get to the Grand Canyon is by flying into Phoenix or Las Vegas and then taking a shuttle or driving to the South Rim. If you're feeling adventurous,ke a scenic helicopter tour for a bird's-eye view of the canyon. Don't forget to book your accommodations in advance,pecially during peak season!"} +{"prompt": "Can you recommend a park in Paris?","response": "Oui! The Luxembourg Gardens are a hidden gem in the heart of the city. Stroll through the beautiful gardens,sit the Luxembourg Palace,d enjoy the street performers and food vendors. Don't forget to take a romantic boat ride on the Seine River for a unique perspective on the city!"} +{"prompt": "What's the best time to visit the Amalfi Coast?","response": "La vita è bella! The best time to visit the Amalfi Coast is in the spring (April-May) or autumn (September-October),en the weather is mild and the crowds are smaller. You can explore the picturesque towns of Positano and Amalfi,sit the stunning island of Capri,d enjoy the delicious seafood and limoncello. 
Don't forget to book your accommodations in advance,pecially during peak season!"} +{"prompt": "I'm planning a trip to Tokyo,n you recommend some must-see spots?","response": "Konnichiwa! You're in for a treat! A visit to the famous Tsukiji Fish Market for sushi and fresh seafood is a must. Next,ad to the serene Meiji Shrine,peaceful oasis in the heart of the bustling city. Don't miss the neon-lit streets of Shinjuku's Golden Gai for an unforgettable night out!"} +{"prompt": "I'm looking for a romantic getaway in Italy,at are some tips?","response": "Amore! Italy is the perfect destination for a romantic escape. Consider a stay in Tuscany,rrounded by rolling hills and vineyards. Visit the charming town of San Gimignano for breathtaking views and a taste of la dolce vita. Don't forget to indulge in a wine tasting and a romantic dinner at a local trattoria!"} +{"prompt": "I'm planning a family vacation in Orlando,at are some kid-friendly activities?","response": "Magical memories await! Orlando is a top destination for families. Don't miss the Wizarding World of Harry Potter at Universal Studios,ere kids can cast spells and ride thrilling attractions. Next,ad to the Kennedy Space Center for an out-of-this-world experience. And, course, trip to Orlando is complete without a visit to the one and only Disney World!"} +{"prompt": "I'm looking for a budget-friendly destination in Southeast Asia,at are some options?","response": "Adventure awaits without breaking the bank! Consider a trip to Cambodia,ere you can explore the ancient temples of Angkor Wat for a fraction of the cost. Next,ad to Vietnam,ere delicious street food and affordable accommodations await. Don't miss the stunning Ha Long Bay for a budget-friendly boat tour!"} +{"prompt": "I'm planning a solo trip to New York City,at are some tips?","response": "The city that never sleeps is calling your name! As a solo traveler,nsider staying in a safe neighborhood like Greenwich Village or the Upper West Side. Don't miss a stroll through Central Park,ere you can people-watch and take in the sights. And, course,visit to the Top of the Rock for panoramic views of the city is a must!"} +{"prompt": "I'm looking for a relaxing beach vacation in the Caribbean,at are some recommendations?","response": "Paradise found! The Caribbean is the perfect destination for a relaxing beach getaway. Consider a stay in the U.S. Virgin Islands,ere stunning beaches and crystal-clear waters await. Don't miss a visit to the famous Magens Bay Beach,ere you can soak up the sun and enjoy water sports. And, course,snorkeling trip to the nearby Buck Island is a must!"} +{"prompt": "I'm planning a road trip across the American West,at are some must-see spots?","response": "Get ready for the adventure of a lifetime! A road trip across the American West is a dream come true. Don't miss the stunning Grand Canyon,ere you can hike and take in breathtaking views. Next,ad to Monument Valley,ere you can explore ancient Navajo ruins and take in the majestic scenery. And, course,visit to Yellowstone National Park for geysers and wildlife is a must!"} +{"prompt": "I'm looking for a cultural experience in India,at are some tips?","response": "Experience the vibrant culture of India! Consider a visit to the bustling city of Delhi,ere you can explore ancient temples and try delicious street food. Don't miss a visit to the Taj Mahal,stunning example of Mughal architecture. 
And, course,visit to the colorful city of Jaipur for a taste of royal Rajasthan is a must!"} +{"prompt": "I'm planning a foodie trip to Spain,at are some must-try dishes?","response": "Tapas and tortilla, my! Spain is a food lover's paradise. Don't miss a visit to the famous La Boqueria Market in Barcelona,ere you can sample local delicacies and artisanal cheeses. Next,ad to the region of Andalusia,ere you can try authentic paella and gazpacho. And, course,visit to the famous El Celler de Can Roca in Girona for a Michelin-starred meal is a must!"} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "What's the best way to get around Tokyo?","response": "Konnichiwa! For a seamless Tokyo experience,recommend purchasing a Suica or Pasmo card for easy travel on the metro and buses. Don't forget to try the famous Tokyo subway system,'s a marvel!"} +{"prompt": "What's the best time to visit the Grand Canyon?","response": "Ah,e Grand Canyon! The best time to visit is from September to November or from March to May when the weather is mild and the crowds are smaller. Don't miss the sunset at Yavapai Point,'s breathtaking!"} +{"prompt": "What's the best way to see the Northern Lights?","response": "Brrr! For a magical Northern Lights experience,ad to Tromsø,rway, Fairbanks,aska. Take a guided tour or find a secluded spot away from city lights for the best views. Don't forget your camera and warmest coat!"} +{"prompt": "What's the best restaurant in Rome?","response": "Mangia! For a taste of authentic Italy,ad to Trattoria al Moro,cozy spot serving up delicious carbonara and amatriciana. Don't miss the lively atmosphere and charming staff!"} +{"prompt": "What's the best way to get to the Great Barrier Reef?","response": "Dive into adventure! For a stress-free trip to the Great Barrier Reef,ke a guided tour from Cairns or the Whitsundays. Choose a reputable operator and enjoy the crystal-clear waters and vibrant coral!"} +{"prompt": "What's the best hotel in New York City?","response": "The city that never sleeps! For a luxurious stay,ok the Plaza Hotel, iconic NYC landmark. Enjoy the opulent decor,rld-class dining,d unbeatable location!"} +{"prompt": "What's the best time to visit the Amalfi Coast?","response": "La dolce vita! For a picturesque Amalfi Coast experience,sit in the shoulder season (April-May or September-October) when the crowds are smaller and the weather is mild. Don't miss the stunning towns perched on cliffs!"} +{"prompt": "What's the best way to see the Great Wall of China?","response": "The Great Wall awaits! For an unforgettable experience,ke the Mutianyu Great Wall,less crowded section with breathtaking views. Don't miss the watchtowers and stunning scenery!"} +{"prompt": "What's the best restaurant in Bangkok?","response": "Sawatdee! For a taste of authentic Thai cuisine,ad to Jay Fai,Michelin-starred restaurant serving up delicious stir-fries and curries. Don't miss the lively atmosphere and expert chefs!"} +{"prompt": "What's the best way to get to the Pyramids of Giza?","response": "Pyramid power! For a convenient trip to the Pyramids,ke a guided tour from Cairo. Choose a reputable operator and enjoy the ancient history and stunning architecture!"} +{"prompt": "What's the best hotel in Sydney?","response": "The Harbour City! 
For a luxurious stay,ok the Shangri-La Hotel,stunning property with breathtaking views of the Opera House and Harbour Bridge. Enjoy the world-class amenities and unbeatable location!"} +{"prompt": "What's the best time to visit the Serengeti?","response": "Wildlife wonder! For a thrilling Serengeti experience,sit during the Great Migration (July-October) when the wildebeest and zebras migrate across the plains. Don't miss the stunning sunsets and abundant wildlife!"} +{"prompt": "What's the best way to see the Taj Mahal?","response": "Love is in the air! For a romantic Taj Mahal experience,sit at sunrise or sunset when the light is golden and the crowds are smaller. Don't miss the stunning architecture and poetic history!"} +{"prompt": "What's the best restaurant in Barcelona?","response": "Tapas time! For a taste of authentic Catalan cuisine,ad to Can Culleretes,historic restaurant serving up delicious paella and seafood. Don't miss the lively atmosphere and charming staff!"} +{"prompt": "What's the best way to get to the Galapagos Islands?","response": "Island adventure! For a stress-free trip to the Galapagos,ke a guided tour from Quito or Guayaquil. Choose a reputable operator and enjoy the unique wildlife and stunning landscapes!"} +{"prompt": "What's a must-see in Paris?","response": "Oh la la! You simply must twirl around the Eiffel Tower and snap a chic selfie! Want to feel like royalty? How about a visit to the grand Palace of Versailles next?"} +{"prompt": "What's the best time to visit Tokyo?","response": "Tokyo is a year-round destination,t spring (March to May) and autumn (September to November) are ideal for comfortable weather and fewer crowds. Don't miss the cherry blossoms in spring!"} +{"prompt": "How do I get to the Great Barrier Reef?","response": "Take a short flight from major cities like Cairns or the Gold Coast to the Great Barrier Reef. From there,u can choose from various tour operators offering snorkeling,ving, glass-bottom boat tours."} +{"prompt": "What's the best way to get around Rome?","response": "Rome is a walkable city,t you can also use public transportation like buses and trams. Consider purchasing a Roma Pass for free entry to attractions and discounted public transportation."} +{"prompt": "Can you recommend a hotel in New York City?","response": "I'd be happy to! The Plaza Hotel is a classic choice,fering luxurious accommodations and an unbeatable location in Midtown. Alternatively,nsider the Pod Hotel Brooklyn for a trendy and budget-friendly option."} +{"prompt": "What's the best time to visit the Grand Canyon?","response": "The Grand Canyon is a year-round destination,t the best time to visit is from September to November or March to May for comfortable weather and smaller crowds."} +{"prompt": "How do I plan a trip to India?","response": "Planning a trip to India can be overwhelming,t don't worry,m here to help! Consider the time of year,dget,d your interests when choosing destinations like Delhi,ra, Jaipur. Don't forget to research visa requirements and vaccinations."} +{"prompt": "Can you recommend a restaurant in Bangkok?","response": "You must try Jay Fai,Michelin-starred street food restaurant serving authentic Thai cuisine. Be prepared for a lively atmosphere and long lines,t it's worth the wait!"} +{"prompt": "What's the best way to get around Paris?","response": "Paris has an excellent public transportation system,cluding the metro,ses,d trains. 
Consider purchasing a Paris Visite pass for unlimited public transportation and discounts on attractions."} +{"prompt": "Can you recommend a hotel in Sydney?","response": "The Shangri-La Hotel Sydney offers stunning views of the Opera House and Harbour Bridge. Alternatively,nsider the QT Sydney for a trendy and boutique hotel experience."} +{"prompt": "What's the best time to visit the Amalfi Coast?","response": "The Amalfi Coast is a year-round destination,t the best time to visit is from May to October for warm weather and access to beaches and hiking trails."} +{"prompt": "How do I plan a trip to Costa Rica?","response": "Costa Rica is a nature lover's paradise! Consider the time of year,dget,d your interests when choosing destinations like Arenal,nteverde, the beach towns of Tamarindo or Puerto Viejo."} +{"prompt": "Can you recommend a restaurant in New Orleans?","response": "You must try Dooky Chase's Restaurant,legendary eatery serving authentic Creole cuisine. Be prepared for a lively atmosphere and delicious gumbo!"} +{"prompt": "What's the best way to get around Barcelona?","response": "Barcelona has a comprehensive public transportation system,cluding the metro,ses,d trains. Consider purchasing a Hola BCN card for unlimited public transportation and discounts on attractions."} +{"prompt": "Can you recommend a hotel in San Francisco?","response": "The Fairmont San Francisco offers luxurious accommodations and stunning views of the city. Alternatively,nsider the Hotel Zeppelin for a trendy and boutique hotel experience."} +{"prompt": "What's the best time to visit the Great Wall of China?","response": "The best time to visit the Great Wall of China is from September to November or March to May for comfortable weather and smaller crowds."} +{"prompt": "How do I plan a trip to Iceland?","response": "Iceland is a unique destination! Consider the time of year,dget,d your interests when choosing destinations like Reykjavik,e Golden Circle, the stunning natural beauty of the South Coast."} {"prompt": "Can you recommend a restaurant in Tokyo?","response": "You must try Sushi Saito,Michelin-starred sushi restaurant serving the freshest and highest-quality ingredients. 
Be prepared for a unique and unforgettable dining experience!"} \ No newline at end of file diff --git a/nlp/llm/qwen2.5-7b/pytorch/qwen2_5-7b_full_sft.yaml b/nlp/llm/qwen2.5-7b/pytorch/qwen2_5-7b_full_sft.yaml index f50bac541..239198caf 100644 --- a/nlp/llm/qwen2.5-7b/pytorch/qwen2_5-7b_full_sft.yaml +++ b/nlp/llm/qwen2.5-7b/pytorch/qwen2_5-7b_full_sft.yaml @@ -1,39 +1,39 @@ -### model -model_name_or_path: checkpoints/Qwen2.5-7B - -### method -stage: sft -do_train: true -finetuning_type: full -deepspeed: examples/deepspeed/ds_z3_config.json - -### dataset -dataset: identity,alpaca_en_demo -template: qwen -cutoff_len: 1024 -max_samples: 1000 -overwrite_cache: true -preprocessing_num_workers: 16 - -### output -output_dir: saves/qwen2_5-7b/full/sft -logging_steps: 10 -save_steps: 500 -plot_loss: true -overwrite_output_dir: true - -### train -per_device_train_batch_size: 1 -gradient_accumulation_steps: 2 -learning_rate: 1.0e-5 -num_train_epochs: 3.0 -lr_scheduler_type: cosine -warmup_ratio: 0.1 -bf16: true -ddp_timeout: 180000000 - -### eval -val_size: 0.1 -per_device_eval_batch_size: 1 -eval_strategy: steps -eval_steps: 500 +### model +model_name_or_path: checkpoints/Qwen2.5-7B + +### method +stage: sft +do_train: true +finetuning_type: full +deepspeed: examples/deepspeed/ds_z3_config.json + +### dataset +dataset: identity,alpaca_en_demo +template: qwen +cutoff_len: 1024 +max_samples: 1000 +overwrite_cache: true +preprocessing_num_workers: 16 + +### output +output_dir: saves/qwen2_5-7b/full/sft +logging_steps: 10 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 2 +learning_rate: 1.0e-5 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 + +### eval +val_size: 0.1 +per_device_eval_batch_size: 1 +eval_strategy: steps +eval_steps: 500 diff --git a/nlp/llm/qwen2.5-7b/pytorch/qwen2_5-7b_lora_sft.yaml b/nlp/llm/qwen2.5-7b/pytorch/qwen2_5-7b_lora_sft.yaml index f0e99a65b..9fc4c4426 100644 --- a/nlp/llm/qwen2.5-7b/pytorch/qwen2_5-7b_lora_sft.yaml +++ b/nlp/llm/qwen2.5-7b/pytorch/qwen2_5-7b_lora_sft.yaml @@ -1,39 +1,39 @@ -### model -model_name_or_path: checkpoints/Qwen2.5-7B - -### method -stage: sft -do_train: true -finetuning_type: lora -lora_target: all - -### dataset -dataset: identity,alpaca_en_demo -template: qwen -cutoff_len: 1024 -max_samples: 1000 -overwrite_cache: true -preprocessing_num_workers: 16 - -### output -output_dir: saves/qwen2_5-7b/lora/sft -logging_steps: 10 -save_steps: 500 -plot_loss: true -overwrite_output_dir: true - -### train -per_device_train_batch_size: 1 -gradient_accumulation_steps: 2 -learning_rate: 1.0e-5 -num_train_epochs: 3.0 -lr_scheduler_type: cosine -warmup_ratio: 0.1 -bf16: true -ddp_timeout: 180000000 - -### eval -val_size: 0.1 -per_device_eval_batch_size: 1 -eval_strategy: steps -eval_steps: 500 +### model +model_name_or_path: checkpoints/Qwen2.5-7B + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_target: all + +### dataset +dataset: identity,alpaca_en_demo +template: qwen +cutoff_len: 1024 +max_samples: 1000 +overwrite_cache: true +preprocessing_num_workers: 16 + +### output +output_dir: saves/qwen2_5-7b/lora/sft +logging_steps: 10 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 2 +learning_rate: 1.0e-5 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 + 
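+# Usage note (an assumption added here, not part of the original file): the fields above
+# match the LLaMA-Factory YAML schema (stage/finetuning_type/template), so training is
+# typically launched with a command along the lines of
+#   llamafactory-cli train nlp/llm/qwen2.5-7b/pytorch/qwen2_5-7b_lora_sft.yaml
+# once the base weights are available under checkpoints/Qwen2.5-7B.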
+### eval +val_size: 0.1 +per_device_eval_batch_size: 1 +eval_strategy: steps +eval_steps: 500 diff --git a/others/graph_machine_learning/graph_wavenet/pytorch/generate_training_data.py b/others/graph_machine_learning/graph_wavenet/pytorch/generate_training_data.py index 01b4dd2de..bcd613d88 100644 --- a/others/graph_machine_learning/graph_wavenet/pytorch/generate_training_data.py +++ b/others/graph_machine_learning/graph_wavenet/pytorch/generate_training_data.py @@ -1,109 +1,109 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import numpy as np -import os -import pandas as pd - - -def generate_graph_seq2seq_io_data( - df, x_offsets, y_offsets, add_time_in_day=True, add_day_in_week=False, scaler=None -): - """ - Generate samples from - :param df: - :param x_offsets: - :param y_offsets: - :param add_time_in_day: - :param add_day_in_week: - :param scaler: - :return: - # x: (epoch_size, input_length, num_nodes, input_dim) - # y: (epoch_size, output_length, num_nodes, output_dim) - """ - - num_samples, num_nodes = df.shape - data = np.expand_dims(df.values, axis=-1) - feature_list = [data] - if add_time_in_day: - time_ind = (df.index.values - df.index.values.astype("datetime64[D]")) / np.timedelta64(1, "D") - time_in_day = np.tile(time_ind, [1, num_nodes, 1]).transpose((2, 1, 0)) - feature_list.append(time_in_day) - if add_day_in_week: - dow = df.index.dayofweek - dow_tiled = np.tile(dow, [1, num_nodes, 1]).transpose((2, 1, 0)) - feature_list.append(dow_tiled) - - data = np.concatenate(feature_list, axis=-1) - x, y = [], [] - min_t = abs(min(x_offsets)) - max_t = abs(num_samples - abs(max(y_offsets))) # Exclusive - for t in range(min_t, max_t): # t is the index of the last observation. - x.append(data[t + x_offsets, ...]) - y.append(data[t + y_offsets, ...]) - x = np.stack(x, axis=0) - y = np.stack(y, axis=0) - return x, y - - -def generate_train_val_test(args): - seq_length_x, seq_length_y = args.seq_length_x, args.seq_length_y - df = pd.read_hdf(args.traffic_df_filename) - # 0 is the latest observed sample. - x_offsets = np.sort(np.concatenate((np.arange(-(seq_length_x - 1), 1, 1),))) - # Predict the next one hour - y_offsets = np.sort(np.arange(args.y_start, (seq_length_y + 1), 1)) - # x: (num_samples, input_length, num_nodes, input_dim) - # y: (num_samples, output_length, num_nodes, output_dim) - x, y = generate_graph_seq2seq_io_data( - df, - x_offsets=x_offsets, - y_offsets=y_offsets, - add_time_in_day=True, - add_day_in_week=args.dow, - ) - - print("x shape: ", x.shape, ", y shape: ", y.shape) - # Write the data into npz file. 
- num_samples = x.shape[0] - num_test = round(num_samples * 0.2) - num_train = round(num_samples * 0.7) - num_val = num_samples - num_test - num_train - x_train, y_train = x[:num_train], y[:num_train] - x_val, y_val = ( - x[num_train: num_train + num_val], - y[num_train: num_train + num_val], - ) - x_test, y_test = x[-num_test:], y[-num_test:] - - for cat in ["train", "val", "test"]: - _x, _y = locals()["x_" + cat], locals()["y_" + cat] - print(cat, "x: ", _x.shape, "y:", _y.shape) - np.savez_compressed( - os.path.join(args.output_dir, f"{cat}.npz"), - x=_x, - y=_y, - x_offsets=x_offsets.reshape(list(x_offsets.shape) + [1]), - y_offsets=y_offsets.reshape(list(y_offsets.shape) + [1]), - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--output_dir", type=str, default="data/METR-LA", help="Output directory.") - parser.add_argument("--traffic_df_filename", type=str, default="data/metr-la.h5", help="Raw traffic readings.",) - parser.add_argument("--seq_length_x", type=int, default=12, help="Sequence Length.",) - parser.add_argument("--seq_length_y", type=int, default=12, help="Sequence Length.",) - parser.add_argument("--y_start", type=int, default=1, help="Y pred start", ) - parser.add_argument("--dow", action='store_true',) - - args = parser.parse_args() - if os.path.exists(args.output_dir): - reply = str(input(f'{args.output_dir} exists. Do you want to overwrite it? (y/n)')).lower().strip() - if reply[0] != 'y': exit - else: - os.makedirs(args.output_dir) - generate_train_val_test(args) +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import numpy as np +import os +import pandas as pd + + +def generate_graph_seq2seq_io_data( + df, x_offsets, y_offsets, add_time_in_day=True, add_day_in_week=False, scaler=None +): + """ + Generate samples from + :param df: + :param x_offsets: + :param y_offsets: + :param add_time_in_day: + :param add_day_in_week: + :param scaler: + :return: + # x: (epoch_size, input_length, num_nodes, input_dim) + # y: (epoch_size, output_length, num_nodes, output_dim) + """ + + num_samples, num_nodes = df.shape + data = np.expand_dims(df.values, axis=-1) + feature_list = [data] + if add_time_in_day: + time_ind = (df.index.values - df.index.values.astype("datetime64[D]")) / np.timedelta64(1, "D") + time_in_day = np.tile(time_ind, [1, num_nodes, 1]).transpose((2, 1, 0)) + feature_list.append(time_in_day) + if add_day_in_week: + dow = df.index.dayofweek + dow_tiled = np.tile(dow, [1, num_nodes, 1]).transpose((2, 1, 0)) + feature_list.append(dow_tiled) + + data = np.concatenate(feature_list, axis=-1) + x, y = [], [] + min_t = abs(min(x_offsets)) + max_t = abs(num_samples - abs(max(y_offsets))) # Exclusive + for t in range(min_t, max_t): # t is the index of the last observation. + x.append(data[t + x_offsets, ...]) + y.append(data[t + y_offsets, ...]) + x = np.stack(x, axis=0) + y = np.stack(y, axis=0) + return x, y + + +def generate_train_val_test(args): + seq_length_x, seq_length_y = args.seq_length_x, args.seq_length_y + df = pd.read_hdf(args.traffic_df_filename) + # 0 is the latest observed sample. 
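+    # Illustration (added comment, not in the original script): with the default
+    # arguments (seq_length_x=12, seq_length_y=12, y_start=1) the two lines below
+    # produce x_offsets = [-11, -10, ..., 0] and y_offsets = [1, 2, ..., 12], so each
+    # sample pairs the 12 most recent observations with the 12 steps that follow.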
+ x_offsets = np.sort(np.concatenate((np.arange(-(seq_length_x - 1), 1, 1),))) + # Predict the next one hour + y_offsets = np.sort(np.arange(args.y_start, (seq_length_y + 1), 1)) + # x: (num_samples, input_length, num_nodes, input_dim) + # y: (num_samples, output_length, num_nodes, output_dim) + x, y = generate_graph_seq2seq_io_data( + df, + x_offsets=x_offsets, + y_offsets=y_offsets, + add_time_in_day=True, + add_day_in_week=args.dow, + ) + + print("x shape: ", x.shape, ", y shape: ", y.shape) + # Write the data into npz file. + num_samples = x.shape[0] + num_test = round(num_samples * 0.2) + num_train = round(num_samples * 0.7) + num_val = num_samples - num_test - num_train + x_train, y_train = x[:num_train], y[:num_train] + x_val, y_val = ( + x[num_train: num_train + num_val], + y[num_train: num_train + num_val], + ) + x_test, y_test = x[-num_test:], y[-num_test:] + + for cat in ["train", "val", "test"]: + _x, _y = locals()["x_" + cat], locals()["y_" + cat] + print(cat, "x: ", _x.shape, "y:", _y.shape) + np.savez_compressed( + os.path.join(args.output_dir, f"{cat}.npz"), + x=_x, + y=_y, + x_offsets=x_offsets.reshape(list(x_offsets.shape) + [1]), + y_offsets=y_offsets.reshape(list(y_offsets.shape) + [1]), + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--output_dir", type=str, default="data/METR-LA", help="Output directory.") + parser.add_argument("--traffic_df_filename", type=str, default="data/metr-la.h5", help="Raw traffic readings.",) + parser.add_argument("--seq_length_x", type=int, default=12, help="Sequence Length.",) + parser.add_argument("--seq_length_y", type=int, default=12, help="Sequence Length.",) + parser.add_argument("--y_start", type=int, default=1, help="Y pred start", ) + parser.add_argument("--dow", action='store_true',) + + args = parser.parse_args() + if os.path.exists(args.output_dir): + reply = str(input(f'{args.output_dir} exists. Do you want to overwrite it? 
(y/n)')).lower().strip() + if reply[0] != 'y': exit + else: + os.makedirs(args.output_dir) + generate_train_val_test(args) diff --git a/others/recommendation_systems/dlrm/pytorch/dlrm/config/criteo_kaggle.json b/others/recommendation_systems/dlrm/pytorch/dlrm/config/criteo_kaggle.json index 6566e0c97..066b45372 100644 --- a/others/recommendation_systems/dlrm/pytorch/dlrm/config/criteo_kaggle.json +++ b/others/recommendation_systems/dlrm/pytorch/dlrm/config/criteo_kaggle.json @@ -1,46 +1,46 @@ -{ - "num_numerical_features": 13, - "categorical_feature_sizes": [ - 1461, - 584, - 10131227, - 2202608, - 306, - 24, - 12518, - 634, - 4, - 93146, - 5684, - 8351593, - 3195, - 28, - 14993, - 5461306, - 11, - 5653, - 2173, - 4, - 7046547, - 18, - 16, - 286181, - 105, - 142572 - ], - "bottom_mlp_sizes": [ - 512, - 256, - 128 - ], - "top_mlp_sizes": [ - 1024, - 1024, - 512, - 256, - 1 - ], - "embedding_dim": 128, - "interaction_op": "dot", - "self_interaction": false -} +{ + "num_numerical_features": 13, + "categorical_feature_sizes": [ + 1461, + 584, + 10131227, + 2202608, + 306, + 24, + 12518, + 634, + 4, + 93146, + 5684, + 8351593, + 3195, + 28, + 14993, + 5461306, + 11, + 5653, + 2173, + 4, + 7046547, + 18, + 16, + 286181, + 105, + 142572 + ], + "bottom_mlp_sizes": [ + 512, + 256, + 128 + ], + "top_mlp_sizes": [ + 1024, + 1024, + 512, + 256, + 1 + ], + "embedding_dim": 128, + "interaction_op": "dot", + "self_interaction": false +} diff --git a/others/recommendation_systems/dlrm/pytorch/dlrm/config/criteo_kaggle_tiny.json b/others/recommendation_systems/dlrm/pytorch/dlrm/config/criteo_kaggle_tiny.json index 8d212f78a..20a8c990f 100644 --- a/others/recommendation_systems/dlrm/pytorch/dlrm/config/criteo_kaggle_tiny.json +++ b/others/recommendation_systems/dlrm/pytorch/dlrm/config/criteo_kaggle_tiny.json @@ -1,46 +1,46 @@ -{ - "num_numerical_features": 13, - "categorical_feature_sizes": [ - 1461, - 584, - 10131227, - 2202608, - 306, - 24, - 12518, - 634, - 4, - 93146, - 5684, - 8351593, - 3195, - 28, - 14993, - 5461306, - 11, - 5653, - 2173, - 4, - 7046547, - 18, - 16, - 286181, - 105, - 142572 - ], - "bottom_mlp_sizes": [ - 512, - 256, - 64, - 16 - ], - "top_mlp_sizes": [ - 512, - 256, - 1 - ], - "embedding_dim": 16, - "interaction_op": "dot", - "self_interaction": false, - "hash_indices": true -} +{ + "num_numerical_features": 13, + "categorical_feature_sizes": [ + 1461, + 584, + 10131227, + 2202608, + 306, + 24, + 12518, + 634, + 4, + 93146, + 5684, + 8351593, + 3195, + 28, + 14993, + 5461306, + 11, + 5653, + 2173, + 4, + 7046547, + 18, + 16, + 286181, + 105, + 142572 + ], + "bottom_mlp_sizes": [ + 512, + 256, + 64, + 16 + ], + "top_mlp_sizes": [ + 512, + 256, + 1 + ], + "embedding_dim": 16, + "interaction_op": "dot", + "self_interaction": false, + "hash_indices": true +} diff --git a/others/recommendation_systems/dlrm/pytorch/dlrm/config/mlperf_10m.limit.json b/others/recommendation_systems/dlrm/pytorch/dlrm/config/mlperf_10m.limit.json index 8f5179f49..65a9b5be4 100644 --- a/others/recommendation_systems/dlrm/pytorch/dlrm/config/mlperf_10m.limit.json +++ b/others/recommendation_systems/dlrm/pytorch/dlrm/config/mlperf_10m.limit.json @@ -1,46 +1,46 @@ -{ - "num_numerical_features": 13, - "categorical_feature_sizes": [ - 10000000, - 38985, - 17278, - 7418, - 20242, - 3, - 7120, - 1543, - 63, - 9999978, - 2642264, - 397262, - 10, - 2208, - 11931, - 155, - 4, - 976, - 14, - 10000000, - 9832963, - 10000000, - 573162, - 12969, - 108, - 36 - ], - "bottom_mlp_sizes": [ - 512, - 256, - 128 - ], - 
"top_mlp_sizes": [ - 1024, - 1024, - 512, - 256, - 1 - ], - "embedding_dim": 128, - "interaction_op": "dot", - "hash_indices": true -} +{ + "num_numerical_features": 13, + "categorical_feature_sizes": [ + 10000000, + 38985, + 17278, + 7418, + 20242, + 3, + 7120, + 1543, + 63, + 9999978, + 2642264, + 397262, + 10, + 2208, + 11931, + 155, + 4, + 976, + 14, + 10000000, + 9832963, + 10000000, + 573162, + 12969, + 108, + 36 + ], + "bottom_mlp_sizes": [ + 512, + 256, + 128 + ], + "top_mlp_sizes": [ + 1024, + 1024, + 512, + 256, + 1 + ], + "embedding_dim": 128, + "interaction_op": "dot", + "hash_indices": true +} diff --git a/others/recommendation_systems/dlrm/pytorch/dlrm/config/official_config.json b/others/recommendation_systems/dlrm/pytorch/dlrm/config/official_config.json index 54ade437a..75132d0ce 100644 --- a/others/recommendation_systems/dlrm/pytorch/dlrm/config/official_config.json +++ b/others/recommendation_systems/dlrm/pytorch/dlrm/config/official_config.json @@ -1,47 +1,47 @@ -{ - "num_numerical_features": 13, - "categorical_feature_sizes": [ - 932209, - 18814, - 13624, - 6862, - 18488, - 3, - 6242, - 1229, - 49, - 654428, - 96850, - 69004, - 10, - 2134, - 7345, - 60, - 4, - 918, - 14, - 984280, - 320932, - 838408, - 76380, - 8906, - 74, - 33 - ], - - - "bottom_mlp_sizes": [ - 512, - 256, - 64 - ], - "top_mlp_sizes": [ - 512, - 512, - 256, - 1 - ], - "embedding_dim": 64, - "interaction_op": "dot", - "hash_indices": false -} +{ + "num_numerical_features": 13, + "categorical_feature_sizes": [ + 932209, + 18814, + 13624, + 6862, + 18488, + 3, + 6242, + 1229, + 49, + 654428, + 96850, + 69004, + 10, + 2134, + 7345, + 60, + 4, + 918, + 14, + 984280, + 320932, + 838408, + 76380, + 8906, + 74, + 33 + ], + + + "bottom_mlp_sizes": [ + 512, + 256, + 64 + ], + "top_mlp_sizes": [ + 512, + 512, + 256, + 1 + ], + "embedding_dim": 64, + "interaction_op": "dot", + "hash_indices": false +} diff --git a/others/recommendation_systems/dlrm/pytorch/dlrm/deprecated_model.py b/others/recommendation_systems/dlrm/pytorch/dlrm/deprecated_model.py index bca0e4090..75e225c88 100644 --- a/others/recommendation_systems/dlrm/pytorch/dlrm/deprecated_model.py +++ b/others/recommendation_systems/dlrm/pytorch/dlrm/deprecated_model.py @@ -1,296 +1,296 @@ -"""Reimplementation of Facebook's DLRM model""" -import copy -import json -import math - -from absl import logging - -import torch -from torch import nn - -import dlrm.nn -from dlrm.cuda_ext import dotBasedInteract - -CRITEO_LARGE_EMBEDDING_IDS = [0, 9, 19, 20, 21] - -class DlrmBase(nn.Module): - """Base class of DLRM model - - Base class of DLRM model. There are several possible implementation of embeddings. Base model abstract embedding - related things including creation and forward, and does the remaining (intitlization, interaction, MLP, etc.). - - Args: - num_numerical_features (int): Number of dense features fed into bottom MLP - categorical_feature_sizes (list): A list of integer indicating number of features of each embedding table - bottom_mlp_sizes (list): A list of integer indicating bottom MLP layer sizes. Last bottom MLP layer - must be embedding_dim - top_mlp_sizes (list): A list of integers indicating top MLP layer sizes. - embedding_dim (int): Length of embedding vectors. Default 32 - interaction_op (string): Type of interactions. Default "dot" - self_interaction (bool): Default False. - hash_indices (bool): If True, hashed_index = index % categorical_feature_size. Default False. 
- """ - def __init__(self, num_numerical_features, categorical_feature_sizes, bottom_mlp_sizes, top_mlp_sizes, - embedding_dim=32, interaction_op="dot", self_interaction=False, hash_indices=False): - super(DlrmBase, self).__init__() - if embedding_dim != bottom_mlp_sizes[-1]: - raise TypeError("The last bottom MLP layer must have same size as embedding.") - - self._embedding_dim = embedding_dim - self._interaction_op = interaction_op - self._self_interaction = self_interaction - self._hash_indices = hash_indices - self._categorical_feature_sizes = copy.copy(categorical_feature_sizes) - - # Interactions are among outputs of all the embedding tables and bottom MLP, total number of - # (num_embedding_tables + 1) vectors with size embdding_dim. ``dot`` product interaction computes dot product - # between any 2 vectors. ``cat`` interaction concatenate all the vectors together. - # Output of interaction will have shape [num_interactions, embdding_dim]. - num_interaction_inputs = len(categorical_feature_sizes) + 1 - if interaction_op == "dot": - if self_interaction: - raise NotImplementedError - num_interactions = (num_interaction_inputs * (num_interaction_inputs - 1)) // 2 + embedding_dim - elif interaction_op == "cat": - num_interactions = num_interaction_inputs * embedding_dim - else: - raise TypeError(F"Unknown interaction {interaction_op}.") - - self.embeddings = nn.ModuleList() - self._create_embeddings(self.embeddings, embedding_dim, categorical_feature_sizes) - - # Create bottom MLP - bottom_mlp_layers = [] - input_dims = num_numerical_features - for output_dims in bottom_mlp_sizes: - bottom_mlp_layers.append( - nn.Linear(input_dims, output_dims)) - bottom_mlp_layers.append(nn.ReLU(inplace=True)) - input_dims = output_dims - self.bottom_mlp = nn.Sequential(*bottom_mlp_layers) - - # Create Top MLP - top_mlp_layers = [] - input_dims = num_interactions + 1 # pad 1 to be multiple of 8 - for output_dims in top_mlp_sizes[:-1]: - top_mlp_layers.append(nn.Linear(input_dims, output_dims)) - top_mlp_layers.append(nn.ReLU(inplace=True)) - input_dims = output_dims - # last Linear layer uses sigmoid - top_mlp_layers.append(nn.Linear(input_dims, top_mlp_sizes[-1])) - top_mlp_layers.append(nn.Sigmoid()) - self.top_mlp = nn.Sequential(*top_mlp_layers) - - self._initialize_mlp_weights() - - def _interaction(self, bottom_mlp_output, embedding_outputs, batch_size): - """Interaction - - "dot" interaction is a bit tricky to implement and test. Break it out from forward so that it can be tested - independently. 
- - Args: - bottom_mlp_output (Tensor): - embedding_outputs (list): Sequence of tensors - batch_size (int): - """ - concat = torch.cat([bottom_mlp_output] + embedding_outputs, dim=1) - if self._interaction_op == "dot" and not self._self_interaction: - concat = concat.view((batch_size, -1, self._embedding_dim)) - if concat.dtype == torch.half: - interaction_output = dotBasedInteract(concat, bottom_mlp_output) - else: # Legacy path - interaction = torch.bmm(concat, torch.transpose(concat, 1, 2)) - tril_indices_row, tril_indices_col = torch.tril_indices( - interaction.shape[1], interaction.shape[2], offset=-1) - interaction_flat = interaction[:, tril_indices_row, tril_indices_col] - - # concatenate dense features and interactions - zero_padding = torch.zeros( - concat.shape[0], 1, dtype=concat.dtype, device=concat.device) - interaction_output = torch.cat((bottom_mlp_output, interaction_flat, zero_padding), dim=1) - - elif self._interaction_op == "cat": - interaction_output = concat - else: - raise NotImplementedError - - return interaction_output - - def _initialize_mlp_weights(self): - """Initializing weights same as original DLRM""" - for module in self.modules(): - if isinstance(module, nn.Linear): - nn.init.normal_(module.weight.data, 0., math.sqrt(2. / (module.in_features + module.out_features))) - nn.init.normal_(module.bias.data, 0., math.sqrt(1. / module.out_features)) - - # Explicitly set weight corresponding to zero padded interaction output. They will - # stay 0 throughout the entire training. An assert can be added to the end of the training - # to prove it doesn't increase model capacity but just 0 paddings. - nn.init.zeros_(self.top_mlp[0].weight[:, -1].data) - - # pylint:disable=missing-docstring - def _create_embeddings(self, embeddings, embedding_dim, categorical_feature_sizes): - raise NotImplementedError - - @property - def num_categorical_features(self): - return len(self._categorical_feature_sizes) - - - def forward(self, numerical_input, categorical_inputs): - raise NotImplementedError - - def extra_repr(self): - s = (F"interaction_op={self._interaction_op}, self_interaction={self._self_interaction}, " - F"hash_indices={self._hash_indices}") - return s - # pylint:enable=missing-docstring - - @classmethod - def from_json(cls, json_str, **kwargs): - """Create from json str""" - obj_dict = json.loads(json_str) - return cls(**obj_dict, **kwargs) - -class Dlrm(DlrmBase): - """Reimplement Facebook's DLRM model - - Original implementation is from https://github.com/facebookresearch/dlrm. 
- - """ - def __init__(self, num_numerical_features, categorical_feature_sizes, bottom_mlp_sizes, top_mlp_sizes, - embedding_dim=32, interaction_op="dot", self_interaction=False, hash_indices=False, - base_device="cuda", mem_donor_devices=None): - # Running everything on gpu by default - self._base_device = base_device - self._embedding_device_map = [base_device for _ in range(len(categorical_feature_sizes))] - if mem_donor_devices is not None: - for i, large_embedding_id in enumerate(CRITEO_LARGE_EMBEDDING_IDS): - self._embedding_device_map[large_embedding_id] = mem_donor_devices[i] - - super(Dlrm, self).__init__( - num_numerical_features, categorical_feature_sizes, bottom_mlp_sizes, top_mlp_sizes, - embedding_dim, interaction_op, self_interaction, hash_indices) - - def _create_embeddings(self, embeddings, embedding_dim, categorical_feature_sizes): - # Each embedding table has size [num_features, embedding_dim] - for i, num_features in enumerate(categorical_feature_sizes): - # Allocate directly on GPU is much faster than allocating on CPU then copying over - embedding_weight = torch.empty((num_features, embedding_dim), device=self._embedding_device_map[i]) - embedding = nn.Embedding.from_pretrained(embedding_weight, freeze=False, sparse=True) - - # Initializing embedding same as original DLRM - nn.init.uniform_( - embedding.weight.data, - -math.sqrt(1. / embedding.num_embeddings), - math.sqrt(1. / embedding.num_embeddings)) - - embeddings.append(embedding) - - def set_devices(self, base_device, mem_donor_devices=None): - """Set devices to run the model - - Put small embeddings and MLPs on base device. Put 5 largest embeddings on mem_donar_devices. 5 largest - embeddings are [0, 9, 19, 20, 21]. - - Args: - base_device (string); - mem_donor_devices (list): list of 5 strings indicates where to run 5 largest embeddings. Default None. 
- """ - self._base_device = base_device - self.bottom_mlp.to(base_device) - self.top_mlp.to(base_device) - self._embedding_device_map = [base_device for _ in range(self.num_categorical_features)] - - if mem_donor_devices is not None: - if len(mem_donor_devices) != 5: - raise ValueError(F"Must specify 5 mem_donor_devices, got {len(mem_donor_devices)}.") - - for i, large_embedding_id in enumerate(CRITEO_LARGE_EMBEDDING_IDS): - self._embedding_device_map[large_embedding_id] = mem_donor_devices[i] - - for embedding_id, device in enumerate(self._embedding_device_map): - logging.info("Place embedding %d on device %s", embedding_id, device) - self.embeddings[embedding_id].to(device) - - def forward(self, numerical_input, categorical_inputs): - """ - - Args: - numerical_input (Tensor): with shape [batch_size, num_numerical_features] - categorical_inputs (Tensor): with shape [num_categorical_features, batch_size] - """ - batch_size = numerical_input.size()[0] - # TODO(haow): Maybe check batch size of sparse input - - # Put indices on the same device as corresponding embedding - device_indices = [] - for embedding_id, embedding in enumerate(self.embeddings): - device_indices.append(categorical_inputs[embedding_id].to(self._embedding_device_map[embedding_id])) - - bottom_mlp_output = self.bottom_mlp(numerical_input) - - # embedding_outputs will be a list of (26 in the case of Criteo) fetched embeddings with shape - # [batch_size, embedding_size] - embedding_outputs = [] - for embedding_id, embedding in enumerate(self.embeddings): - if self._hash_indices: - device_indices[embedding_id] %= embedding.num_embeddings - logging.log_first_n( - logging.WARNING, F"Hashed indices out of range.", 1) - embedding_outputs.append(embedding(device_indices[embedding_id]).to(self._base_device)) - - interaction_output = self._interaction(bottom_mlp_output, embedding_outputs, batch_size) - - top_mlp_output = self.top_mlp(interaction_output) - - return top_mlp_output - -class DlrmJointEmbedding(DlrmBase): - """DLRM uses one hot embedding only - - If all embeddings are one hot, they can be easily combined and will have better performance. - """ - - def _create_embeddings(self, embeddings, embedding_dim, categorical_feature_sizes): - """Combine all one hot embeddings as one""" - logging.warning("Combined all categorical features to single embedding table.") - embeddings.append(dlrm.nn.BuckleEmbedding(categorical_feature_sizes, embedding_dim)) - - for cat, size in enumerate(categorical_feature_sizes): - module = embeddings[0] # Only one embedding module in the ModuleList - nn.init.uniform_( - module.embedding.weight.data[module.offsets[cat]:module.offsets[cat + 1]], - -math.sqrt(1. / size), - math.sqrt(1. / size)) - - def forward(self, numerical_input, categorical_inputs): - """ - Args: - numerical_input (Tensor): with shape [batch_size, num_numerical_features] - categorical_inputs (Tensor): with shape [num_categorical_features, batch_size] - """ - batch_size = numerical_input.size()[0] - bottom_mlp_output = self.bottom_mlp(numerical_input) - - # Change indices based on hash_shift - # It would be more efficient to change on the data loader side. But in order to keep the interface consistent - # with the base Dlrm model, it is handled here. - if self._hash_indices: - for cat, size in enumerate(self._categorical_feature_sizes): - categorical_inputs[cat] %= size - logging.log_first_n( - logging.WARNING, F"Hashed indices out of range.", 1) - - # self._interaction takes list of tensor as input. 
So make this single element list - # categorical_inputs is transposed here only to keep interface consistent with base model, - # which makes it easy to test. Will change them to be the best performing version. - # TODO(haow): Remove transpose. - embedding_outputs = [self.embeddings[0](categorical_inputs.t()).view(batch_size, -1)] - - interaction_output = self._interaction(bottom_mlp_output, embedding_outputs, batch_size) - - top_mlp_output = self.top_mlp(interaction_output) - - return top_mlp_output +"""Reimplementation of Facebook's DLRM model""" +import copy +import json +import math + +from absl import logging + +import torch +from torch import nn + +import dlrm.nn +from dlrm.cuda_ext import dotBasedInteract + +CRITEO_LARGE_EMBEDDING_IDS = [0, 9, 19, 20, 21] + +class DlrmBase(nn.Module): + """Base class of DLRM model + + Base class of DLRM model. There are several possible implementation of embeddings. Base model abstract embedding + related things including creation and forward, and does the remaining (intitlization, interaction, MLP, etc.). + + Args: + num_numerical_features (int): Number of dense features fed into bottom MLP + categorical_feature_sizes (list): A list of integer indicating number of features of each embedding table + bottom_mlp_sizes (list): A list of integer indicating bottom MLP layer sizes. Last bottom MLP layer + must be embedding_dim + top_mlp_sizes (list): A list of integers indicating top MLP layer sizes. + embedding_dim (int): Length of embedding vectors. Default 32 + interaction_op (string): Type of interactions. Default "dot" + self_interaction (bool): Default False. + hash_indices (bool): If True, hashed_index = index % categorical_feature_size. Default False. + """ + def __init__(self, num_numerical_features, categorical_feature_sizes, bottom_mlp_sizes, top_mlp_sizes, + embedding_dim=32, interaction_op="dot", self_interaction=False, hash_indices=False): + super(DlrmBase, self).__init__() + if embedding_dim != bottom_mlp_sizes[-1]: + raise TypeError("The last bottom MLP layer must have same size as embedding.") + + self._embedding_dim = embedding_dim + self._interaction_op = interaction_op + self._self_interaction = self_interaction + self._hash_indices = hash_indices + self._categorical_feature_sizes = copy.copy(categorical_feature_sizes) + + # Interactions are among outputs of all the embedding tables and bottom MLP, total number of + # (num_embedding_tables + 1) vectors with size embdding_dim. ``dot`` product interaction computes dot product + # between any 2 vectors. ``cat`` interaction concatenate all the vectors together. + # Output of interaction will have shape [num_interactions, embdding_dim]. 
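+ # Worked example (illustrative, using the criteo_kaggle.json config in this patch: 26 categorical
+ # features, embedding_dim=128): num_interaction_inputs = 26 + 1 = 27, so the "dot" op produces
+ # 27 * 26 // 2 = 351 pairwise dot products; concatenating the 128-dim bottom MLP output back in
+ # gives num_interactions = 479, and the extra zero pad added below brings the top MLP input to
+ # 480, a multiple of 8.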
+ num_interaction_inputs = len(categorical_feature_sizes) + 1 + if interaction_op == "dot": + if self_interaction: + raise NotImplementedError + num_interactions = (num_interaction_inputs * (num_interaction_inputs - 1)) // 2 + embedding_dim + elif interaction_op == "cat": + num_interactions = num_interaction_inputs * embedding_dim + else: + raise TypeError(F"Unknown interaction {interaction_op}.") + + self.embeddings = nn.ModuleList() + self._create_embeddings(self.embeddings, embedding_dim, categorical_feature_sizes) + + # Create bottom MLP + bottom_mlp_layers = [] + input_dims = num_numerical_features + for output_dims in bottom_mlp_sizes: + bottom_mlp_layers.append( + nn.Linear(input_dims, output_dims)) + bottom_mlp_layers.append(nn.ReLU(inplace=True)) + input_dims = output_dims + self.bottom_mlp = nn.Sequential(*bottom_mlp_layers) + + # Create Top MLP + top_mlp_layers = [] + input_dims = num_interactions + 1 # pad 1 to be multiple of 8 + for output_dims in top_mlp_sizes[:-1]: + top_mlp_layers.append(nn.Linear(input_dims, output_dims)) + top_mlp_layers.append(nn.ReLU(inplace=True)) + input_dims = output_dims + # last Linear layer uses sigmoid + top_mlp_layers.append(nn.Linear(input_dims, top_mlp_sizes[-1])) + top_mlp_layers.append(nn.Sigmoid()) + self.top_mlp = nn.Sequential(*top_mlp_layers) + + self._initialize_mlp_weights() + + def _interaction(self, bottom_mlp_output, embedding_outputs, batch_size): + """Interaction + + "dot" interaction is a bit tricky to implement and test. Break it out from forward so that it can be tested + independently. + + Args: + bottom_mlp_output (Tensor): + embedding_outputs (list): Sequence of tensors + batch_size (int): + """ + concat = torch.cat([bottom_mlp_output] + embedding_outputs, dim=1) + if self._interaction_op == "dot" and not self._self_interaction: + concat = concat.view((batch_size, -1, self._embedding_dim)) + if concat.dtype == torch.half: + interaction_output = dotBasedInteract(concat, bottom_mlp_output) + else: # Legacy path + interaction = torch.bmm(concat, torch.transpose(concat, 1, 2)) + tril_indices_row, tril_indices_col = torch.tril_indices( + interaction.shape[1], interaction.shape[2], offset=-1) + interaction_flat = interaction[:, tril_indices_row, tril_indices_col] + + # concatenate dense features and interactions + zero_padding = torch.zeros( + concat.shape[0], 1, dtype=concat.dtype, device=concat.device) + interaction_output = torch.cat((bottom_mlp_output, interaction_flat, zero_padding), dim=1) + + elif self._interaction_op == "cat": + interaction_output = concat + else: + raise NotImplementedError + + return interaction_output + + def _initialize_mlp_weights(self): + """Initializing weights same as original DLRM""" + for module in self.modules(): + if isinstance(module, nn.Linear): + nn.init.normal_(module.weight.data, 0., math.sqrt(2. / (module.in_features + module.out_features))) + nn.init.normal_(module.bias.data, 0., math.sqrt(1. / module.out_features)) + + # Explicitly set weight corresponding to zero padded interaction output. They will + # stay 0 throughout the entire training. An assert can be added to the end of the training + # to prove it doesn't increase model capacity but just 0 paddings. 
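+ # (Sketch of why this invariant holds: the padded interaction column is always zero, so the
+ # gradient of the loss w.r.t. this weight column is identically zero and plain SGD never moves
+ # it away from the zero initialization below.)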
+ nn.init.zeros_(self.top_mlp[0].weight[:, -1].data) + + # pylint:disable=missing-docstring + def _create_embeddings(self, embeddings, embedding_dim, categorical_feature_sizes): + raise NotImplementedError + + @property + def num_categorical_features(self): + return len(self._categorical_feature_sizes) + + + def forward(self, numerical_input, categorical_inputs): + raise NotImplementedError + + def extra_repr(self): + s = (F"interaction_op={self._interaction_op}, self_interaction={self._self_interaction}, " + F"hash_indices={self._hash_indices}") + return s + # pylint:enable=missing-docstring + + @classmethod + def from_json(cls, json_str, **kwargs): + """Create from json str""" + obj_dict = json.loads(json_str) + return cls(**obj_dict, **kwargs) + +class Dlrm(DlrmBase): + """Reimplement Facebook's DLRM model + + Original implementation is from https://github.com/facebookresearch/dlrm. + + """ + def __init__(self, num_numerical_features, categorical_feature_sizes, bottom_mlp_sizes, top_mlp_sizes, + embedding_dim=32, interaction_op="dot", self_interaction=False, hash_indices=False, + base_device="cuda", mem_donor_devices=None): + # Running everything on gpu by default + self._base_device = base_device + self._embedding_device_map = [base_device for _ in range(len(categorical_feature_sizes))] + if mem_donor_devices is not None: + for i, large_embedding_id in enumerate(CRITEO_LARGE_EMBEDDING_IDS): + self._embedding_device_map[large_embedding_id] = mem_donor_devices[i] + + super(Dlrm, self).__init__( + num_numerical_features, categorical_feature_sizes, bottom_mlp_sizes, top_mlp_sizes, + embedding_dim, interaction_op, self_interaction, hash_indices) + + def _create_embeddings(self, embeddings, embedding_dim, categorical_feature_sizes): + # Each embedding table has size [num_features, embedding_dim] + for i, num_features in enumerate(categorical_feature_sizes): + # Allocate directly on GPU is much faster than allocating on CPU then copying over + embedding_weight = torch.empty((num_features, embedding_dim), device=self._embedding_device_map[i]) + embedding = nn.Embedding.from_pretrained(embedding_weight, freeze=False, sparse=True) + + # Initializing embedding same as original DLRM + nn.init.uniform_( + embedding.weight.data, + -math.sqrt(1. / embedding.num_embeddings), + math.sqrt(1. / embedding.num_embeddings)) + + embeddings.append(embedding) + + def set_devices(self, base_device, mem_donor_devices=None): + """Set devices to run the model + + Put small embeddings and MLPs on base device. Put 5 largest embeddings on mem_donar_devices. 5 largest + embeddings are [0, 9, 19, 20, 21]. + + Args: + base_device (string); + mem_donor_devices (list): list of 5 strings indicates where to run 5 largest embeddings. Default None. 
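+
+ Example (illustrative only; the device strings are assumptions, not taken from this code):
+ model.set_devices("cuda:0", mem_donor_devices=["cuda:1", "cuda:2", "cuda:3", "cuda:4", "cuda:5"])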
+ """ + self._base_device = base_device + self.bottom_mlp.to(base_device) + self.top_mlp.to(base_device) + self._embedding_device_map = [base_device for _ in range(self.num_categorical_features)] + + if mem_donor_devices is not None: + if len(mem_donor_devices) != 5: + raise ValueError(F"Must specify 5 mem_donor_devices, got {len(mem_donor_devices)}.") + + for i, large_embedding_id in enumerate(CRITEO_LARGE_EMBEDDING_IDS): + self._embedding_device_map[large_embedding_id] = mem_donor_devices[i] + + for embedding_id, device in enumerate(self._embedding_device_map): + logging.info("Place embedding %d on device %s", embedding_id, device) + self.embeddings[embedding_id].to(device) + + def forward(self, numerical_input, categorical_inputs): + """ + + Args: + numerical_input (Tensor): with shape [batch_size, num_numerical_features] + categorical_inputs (Tensor): with shape [num_categorical_features, batch_size] + """ + batch_size = numerical_input.size()[0] + # TODO(haow): Maybe check batch size of sparse input + + # Put indices on the same device as corresponding embedding + device_indices = [] + for embedding_id, embedding in enumerate(self.embeddings): + device_indices.append(categorical_inputs[embedding_id].to(self._embedding_device_map[embedding_id])) + + bottom_mlp_output = self.bottom_mlp(numerical_input) + + # embedding_outputs will be a list of (26 in the case of Criteo) fetched embeddings with shape + # [batch_size, embedding_size] + embedding_outputs = [] + for embedding_id, embedding in enumerate(self.embeddings): + if self._hash_indices: + device_indices[embedding_id] %= embedding.num_embeddings + logging.log_first_n( + logging.WARNING, F"Hashed indices out of range.", 1) + embedding_outputs.append(embedding(device_indices[embedding_id]).to(self._base_device)) + + interaction_output = self._interaction(bottom_mlp_output, embedding_outputs, batch_size) + + top_mlp_output = self.top_mlp(interaction_output) + + return top_mlp_output + +class DlrmJointEmbedding(DlrmBase): + """DLRM uses one hot embedding only + + If all embeddings are one hot, they can be easily combined and will have better performance. + """ + + def _create_embeddings(self, embeddings, embedding_dim, categorical_feature_sizes): + """Combine all one hot embeddings as one""" + logging.warning("Combined all categorical features to single embedding table.") + embeddings.append(dlrm.nn.BuckleEmbedding(categorical_feature_sizes, embedding_dim)) + + for cat, size in enumerate(categorical_feature_sizes): + module = embeddings[0] # Only one embedding module in the ModuleList + nn.init.uniform_( + module.embedding.weight.data[module.offsets[cat]:module.offsets[cat + 1]], + -math.sqrt(1. / size), + math.sqrt(1. / size)) + + def forward(self, numerical_input, categorical_inputs): + """ + Args: + numerical_input (Tensor): with shape [batch_size, num_numerical_features] + categorical_inputs (Tensor): with shape [num_categorical_features, batch_size] + """ + batch_size = numerical_input.size()[0] + bottom_mlp_output = self.bottom_mlp(numerical_input) + + # Change indices based on hash_shift + # It would be more efficient to change on the data loader side. But in order to keep the interface consistent + # with the base Dlrm model, it is handled here. + if self._hash_indices: + for cat, size in enumerate(self._categorical_feature_sizes): + categorical_inputs[cat] %= size + logging.log_first_n( + logging.WARNING, F"Hashed indices out of range.", 1) + + # self._interaction takes list of tensor as input. 
So make this single element list + # categorical_inputs is transposed here only to keep interface consistent with base model, + # which makes it easy to test. Will change them to be the best performing version. + # TODO(haow): Remove transpose. + embedding_outputs = [self.embeddings[0](categorical_inputs.t()).view(batch_size, -1)] + + interaction_output = self._interaction(bottom_mlp_output, embedding_outputs, batch_size) + + top_mlp_output = self.top_mlp(interaction_output) + + return top_mlp_output diff --git a/others/recommendation_systems/dlrm/pytorch/dlrm/nn/__init__.py b/others/recommendation_systems/dlrm/pytorch/dlrm/nn/__init__.py index 45feb62bc..9b90d873e 100644 --- a/others/recommendation_systems/dlrm/pytorch/dlrm/nn/__init__.py +++ b/others/recommendation_systems/dlrm/pytorch/dlrm/nn/__init__.py @@ -1,2 +1,2 @@ -from .modules.buckle_embedding import * -from .modules.gather import * +from .modules.buckle_embedding import * +from .modules.gather import * diff --git a/others/recommendation_systems/dlrm/pytorch/dlrm/nn/functional.py b/others/recommendation_systems/dlrm/pytorch/dlrm/nn/functional.py index eaa40a470..b7a8a8902 100644 --- a/others/recommendation_systems/dlrm/pytorch/dlrm/nn/functional.py +++ b/others/recommendation_systems/dlrm/pytorch/dlrm/nn/functional.py @@ -1,28 +1,28 @@ -import torch -from torch.autograd import Function -from apex import amp - -from dlrm import cuda_ext - - -class DotBasedInteract(Function): - """ Forward and Backward paths of cuda extension for dot-based feature interact.""" - - @staticmethod - @amp.half_function - def forward(ctx, input, bottom_mlp_output, output_padding_width = 1): - output = cuda_ext.dotBasedInteractFwd(input, bottom_mlp_output, output_padding_width) - ctx.save_for_backward(input) - ctx.output_padding_width = output_padding_width - return output - - @staticmethod - @amp.half_function - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output_padding_width = ctx.output_padding_width - grad, mlp_grad = cuda_ext.dotBasedInteractBwd(input, grad_output, output_padding_width) - return grad, mlp_grad, None - - -dotBasedInteract = DotBasedInteract.apply +import torch +from torch.autograd import Function +from apex import amp + +from dlrm import cuda_ext + + +class DotBasedInteract(Function): + """ Forward and Backward paths of cuda extension for dot-based feature interact.""" + + @staticmethod + @amp.half_function + def forward(ctx, input, bottom_mlp_output, output_padding_width = 1): + output = cuda_ext.dotBasedInteractFwd(input, bottom_mlp_output, output_padding_width) + ctx.save_for_backward(input) + ctx.output_padding_width = output_padding_width + return output + + @staticmethod + @amp.half_function + def backward(ctx, grad_output): + input, = ctx.saved_tensors + output_padding_width = ctx.output_padding_width + grad, mlp_grad = cuda_ext.dotBasedInteractBwd(input, grad_output, output_padding_width) + return grad, mlp_grad, None + + +dotBasedInteract = DotBasedInteract.apply diff --git a/others/recommendation_systems/dlrm/pytorch/dlrm/nn/modules/buckle_embedding.py b/others/recommendation_systems/dlrm/pytorch/dlrm/nn/modules/buckle_embedding.py index f0be9122b..c463316a5 100644 --- a/others/recommendation_systems/dlrm/pytorch/dlrm/nn/modules/buckle_embedding.py +++ b/others/recommendation_systems/dlrm/pytorch/dlrm/nn/modules/buckle_embedding.py @@ -1,43 +1,43 @@ -"""Buckle Embedding""" - -import copy - -import torch -from torch import nn - -__all__ = ["BuckleEmbedding"] - -class BuckleEmbedding(nn.Module): - 
"""Buckle multiple one hot embedding together - - Multiple one hot embedding can be done as one embedding (indexing). Use nn.Embedding to deal with sparse wgrad - before I fully customizing it. - - Args: - categorical_feature_sizes (list): A list of integer indicating number of features of each embedding table - embedding_dim (int): the size of each embedding vector - device (torch.device): where to create the embedding. Default "cuda" - """ - def __init__(self, categorical_feature_sizes, embedding_dim, device="cuda"): - super(BuckleEmbedding, self).__init__() - self.embedding_dim = embedding_dim - self.categorical_feature_sizes = copy.copy(categorical_feature_sizes) - - self.register_buffer("offsets", torch.tensor([0] + categorical_feature_sizes, device=device).cumsum(0)) - - embedding_weight = torch.empty((self.offsets[-1].item(), embedding_dim), device=device) - self.embedding = nn.Embedding.from_pretrained(embedding_weight, freeze=False, sparse=True) - - # pylint:disable=missing-docstring - def forward(self, categorical_inputs): - # Check input has the right shape - assert categorical_inputs.shape[1] == len(self.categorical_feature_sizes) - - embedding_out = self.embedding(categorical_inputs + self.offsets[:-1]) - - return embedding_out - - def extra_repr(self): - s = F"offsets={self.offsets.cpu().numpy()}" - return s - # pylint:enable=missing-docstring +"""Buckle Embedding""" + +import copy + +import torch +from torch import nn + +__all__ = ["BuckleEmbedding"] + +class BuckleEmbedding(nn.Module): + """Buckle multiple one hot embedding together + + Multiple one hot embedding can be done as one embedding (indexing). Use nn.Embedding to deal with sparse wgrad + before I fully customizing it. + + Args: + categorical_feature_sizes (list): A list of integer indicating number of features of each embedding table + embedding_dim (int): the size of each embedding vector + device (torch.device): where to create the embedding. 
Default "cuda" + """ + def __init__(self, categorical_feature_sizes, embedding_dim, device="cuda"): + super(BuckleEmbedding, self).__init__() + self.embedding_dim = embedding_dim + self.categorical_feature_sizes = copy.copy(categorical_feature_sizes) + + self.register_buffer("offsets", torch.tensor([0] + categorical_feature_sizes, device=device).cumsum(0)) + + embedding_weight = torch.empty((self.offsets[-1].item(), embedding_dim), device=device) + self.embedding = nn.Embedding.from_pretrained(embedding_weight, freeze=False, sparse=True) + + # pylint:disable=missing-docstring + def forward(self, categorical_inputs): + # Check input has the right shape + assert categorical_inputs.shape[1] == len(self.categorical_feature_sizes) + + embedding_out = self.embedding(categorical_inputs + self.offsets[:-1]) + + return embedding_out + + def extra_repr(self): + s = F"offsets={self.offsets.cpu().numpy()}" + return s + # pylint:enable=missing-docstring diff --git a/others/recommendation_systems/dlrm/pytorch/dlrm/nn/modules/gather.py b/others/recommendation_systems/dlrm/pytorch/dlrm/nn/modules/gather.py index 6f6d6ac55..5652ea22b 100644 --- a/others/recommendation_systems/dlrm/pytorch/dlrm/nn/modules/gather.py +++ b/others/recommendation_systems/dlrm/pytorch/dlrm/nn/modules/gather.py @@ -1,62 +1,62 @@ -"""Customized embedding gather""" -import copy - -import torch -from torch.autograd import Function -from torch import nn - -from apex import amp - -from dlrm import cuda_ext - -__all__ = ["EmbeddingGatherFunction", "JointSparseEmbedding", "embedding_gather"] - -class EmbeddingGatherFunction(Function): - """Customized embedding gather with fused plain SGD""" - @staticmethod - def forward(ctx, embedding, indices): - output = cuda_ext.gather_gpu_fwd(embedding, indices) - ctx.save_for_backward(indices) - ctx.num_features = embedding.size(0) - return output - - @staticmethod - def backward(ctx, grad_output): - indices = ctx.saved_tensors[0] - - grad_embedding = cuda_ext.gather_gpu_bwd(grad_output, indices, ctx.num_features) - - return grad_embedding, None - -class JointSparseEmbedding(nn.Module): - """Joint multiple one hot embedding together - - Multiple one hot embedding can be done as one embedding (indexing). - - Args: - categorical_feature_sizes (list): A list of integer indicating number of features of each embedding table - embedding_dim (int): the size of each embedding vector - device (torch.device): where to create the embedding. 
Default "cuda" - """ - def __init__(self, categorical_feature_sizes, embedding_dim, device="cuda"): - super(JointSparseEmbedding, self).__init__() - self.embedding_dim = embedding_dim - self.categorical_feature_sizes = copy.copy(categorical_feature_sizes) - - self.register_buffer("offsets", torch.tensor([0] + categorical_feature_sizes).cumsum(0).to(device)) - self.weights = torch.nn.Parameter(torch.rand((self.offsets[-1].item(), embedding_dim), device=device)) - - def forward(self, categorical_inputs): - # Check input has the right shape - assert categorical_inputs.shape[1] == len(self.categorical_feature_sizes) - - embedding_out = embedding_gather(self.weights, categorical_inputs + self.offsets[:-1]) - - return embedding_out - - def extra_repr(self): - s = F"categorical_feature_sizes={self.categorical_feature_sizes}\n" - s += F"offsets={self.offsets.cpu().numpy()}" - return s - -embedding_gather = amp.float_function(EmbeddingGatherFunction.apply) +"""Customized embedding gather""" +import copy + +import torch +from torch.autograd import Function +from torch import nn + +from apex import amp + +from dlrm import cuda_ext + +__all__ = ["EmbeddingGatherFunction", "JointSparseEmbedding", "embedding_gather"] + +class EmbeddingGatherFunction(Function): + """Customized embedding gather with fused plain SGD""" + @staticmethod + def forward(ctx, embedding, indices): + output = cuda_ext.gather_gpu_fwd(embedding, indices) + ctx.save_for_backward(indices) + ctx.num_features = embedding.size(0) + return output + + @staticmethod + def backward(ctx, grad_output): + indices = ctx.saved_tensors[0] + + grad_embedding = cuda_ext.gather_gpu_bwd(grad_output, indices, ctx.num_features) + + return grad_embedding, None + +class JointSparseEmbedding(nn.Module): + """Joint multiple one hot embedding together + + Multiple one hot embedding can be done as one embedding (indexing). + + Args: + categorical_feature_sizes (list): A list of integer indicating number of features of each embedding table + embedding_dim (int): the size of each embedding vector + device (torch.device): where to create the embedding. 
Default "cuda" + """ + def __init__(self, categorical_feature_sizes, embedding_dim, device="cuda"): + super(JointSparseEmbedding, self).__init__() + self.embedding_dim = embedding_dim + self.categorical_feature_sizes = copy.copy(categorical_feature_sizes) + + self.register_buffer("offsets", torch.tensor([0] + categorical_feature_sizes).cumsum(0).to(device)) + self.weights = torch.nn.Parameter(torch.rand((self.offsets[-1].item(), embedding_dim), device=device)) + + def forward(self, categorical_inputs): + # Check input has the right shape + assert categorical_inputs.shape[1] == len(self.categorical_feature_sizes) + + embedding_out = embedding_gather(self.weights, categorical_inputs + self.offsets[:-1]) + + return embedding_out + + def extra_repr(self): + s = F"categorical_feature_sizes={self.categorical_feature_sizes}\n" + s += F"offsets={self.offsets.cpu().numpy()}" + return s + +embedding_gather = amp.float_function(EmbeddingGatherFunction.apply) diff --git a/others/recommendation_systems/dlrm/pytorch/dlrm/utils/metrics.py b/others/recommendation_systems/dlrm/pytorch/dlrm/utils/metrics.py index ccd38f356..4af324f26 100644 --- a/others/recommendation_systems/dlrm/pytorch/dlrm/utils/metrics.py +++ b/others/recommendation_systems/dlrm/pytorch/dlrm/utils/metrics.py @@ -1,100 +1,100 @@ -"""Customized implementation of metrics""" -import numpy as np -import torch - -def ref_roc_auc_score(y_true, y_score, exact=True): - """Compute AUC exactly the same as sklearn - - sklearn.metrics.roc_auc_score is a very genalized function that supports all kind of situation. - See https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/metrics/_ranking.py. The AUC computation - used by DLRM is a very small subset. This function is bear minimum codes of computing AUC exactly the same way - as sklearn numerically. - - A lot of things are removed: - Anything is not required by binary class. - thresholds is not returned since we only need score. - - Args: - y_true (ndarray or list of array): - y_score (ndarray or list of array): - exact (bool): If False, skip some computation used in sklearn. Default True - """ - y_true = np.r_[y_true].flatten() - y_score = np.r_[y_score].flatten() - if y_true.shape != y_score.shape: - raise TypeError(F"Shapre of y_true and y_score must match. Got {y_true.shape} and {y_score.shape}.") - - # sklearn label_binarize y_true which effectively make it integer - y_true = y_true.astype(np.int) - - # sort scores and corresponding truth values - desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1] - y_score = y_score[desc_score_indices] - y_true = y_true[desc_score_indices] - - # y_score typically has many tied values. Here we extract - # the indices associated with the distinct values. We also - # concatenate a value for the end of the curve. - distinct_value_indices = np.where(np.diff(y_score))[0] - threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1] - - # accumulate the true positives with decreasing threshold - tps = np.cumsum(y_true)[threshold_idxs] - fps = 1 + threshold_idxs - tps - - if exact: - # Attempt to drop thresholds corresponding to points in between and collinear with other points. 
- if len(fps) > 2: - optimal_idxs = np.where(np.r_[True, np.logical_or(np.diff(fps, 2), np.diff(tps, 2)), True])[0] - fps = fps[optimal_idxs] - tps = tps[optimal_idxs] - - # Add an extra threshold position to make sure that the curve starts at (0, 0) - tps = np.r_[0, tps] - fps = np.r_[0, fps] - - fpr = fps / fps[-1] - tpr = tps / tps[-1] - - direction = 1 - if exact: - # I don't understand why it is needed since it is sorted before - if np.any(np.diff(fpr) < 0): - direction = -1 - - area = direction * np.trapz(tpr, fpr) - - return area - -def roc_auc_score(y_true, y_score): - """Pytorch implementation almost follows sklearn - - Args: - y_true (Tensor): - y_score (Tensor): - """ - device = y_true.device - y_true.squeeze_() - y_score.squeeze_() - if y_true.shape != y_score.shape: - raise TypeError(F"Shapre of y_true and y_score must match. Got {y_true.shape()} and {y_score.shape()}.") - - desc_score_indices = torch.argsort(y_score, descending=True) - y_score = y_score[desc_score_indices] - y_true = y_true[desc_score_indices] - - distinct_value_indices = torch.nonzero(y_score[1:] - y_score[:-1], as_tuple=False).squeeze() - threshold_idxs = torch.cat([distinct_value_indices, torch.tensor([y_true.numel() - 1], device=device)]) - - tps = torch.cumsum(y_true, dim=0)[threshold_idxs] - fps = 1 + threshold_idxs - tps - - tps = torch.cat([torch.zeros(1, device=device), tps]) - fps = torch.cat([torch.zeros(1, device=device), fps]) - - fpr = fps / fps[-1] - tpr = tps / tps[-1] - - area = torch.trapz(tpr, fpr) - - return area +"""Customized implementation of metrics""" +import numpy as np +import torch + +def ref_roc_auc_score(y_true, y_score, exact=True): + """Compute AUC exactly the same as sklearn + + sklearn.metrics.roc_auc_score is a very genalized function that supports all kind of situation. + See https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/metrics/_ranking.py. The AUC computation + used by DLRM is a very small subset. This function is bear minimum codes of computing AUC exactly the same way + as sklearn numerically. + + A lot of things are removed: + Anything is not required by binary class. + thresholds is not returned since we only need score. + + Args: + y_true (ndarray or list of array): + y_score (ndarray or list of array): + exact (bool): If False, skip some computation used in sklearn. Default True + """ + y_true = np.r_[y_true].flatten() + y_score = np.r_[y_score].flatten() + if y_true.shape != y_score.shape: + raise TypeError(F"Shapre of y_true and y_score must match. Got {y_true.shape} and {y_score.shape}.") + + # sklearn label_binarize y_true which effectively make it integer + y_true = y_true.astype(np.int) + + # sort scores and corresponding truth values + desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1] + y_score = y_score[desc_score_indices] + y_true = y_true[desc_score_indices] + + # y_score typically has many tied values. Here we extract + # the indices associated with the distinct values. We also + # concatenate a value for the end of the curve. + distinct_value_indices = np.where(np.diff(y_score))[0] + threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1] + + # accumulate the true positives with decreasing threshold + tps = np.cumsum(y_true)[threshold_idxs] + fps = 1 + threshold_idxs - tps + + if exact: + # Attempt to drop thresholds corresponding to points in between and collinear with other points. 
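+ # (Explanatory note: np.diff(fps, 2) and np.diff(tps, 2) are second differences, which vanish
+ # exactly where three consecutive (fps, tps) points are evenly spaced on both axes, i.e. lie on
+ # a straight segment of the ROC curve; dropping such interior points leaves the trapezoidal
+ # area unchanged. The np.r_[True, ..., True] wrapper always keeps the first and last points.)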
+ if len(fps) > 2: + optimal_idxs = np.where(np.r_[True, np.logical_or(np.diff(fps, 2), np.diff(tps, 2)), True])[0] + fps = fps[optimal_idxs] + tps = tps[optimal_idxs] + + # Add an extra threshold position to make sure that the curve starts at (0, 0) + tps = np.r_[0, tps] + fps = np.r_[0, fps] + + fpr = fps / fps[-1] + tpr = tps / tps[-1] + + direction = 1 + if exact: + # I don't understand why it is needed since it is sorted before + if np.any(np.diff(fpr) < 0): + direction = -1 + + area = direction * np.trapz(tpr, fpr) + + return area + +def roc_auc_score(y_true, y_score): + """Pytorch implementation almost follows sklearn + + Args: + y_true (Tensor): + y_score (Tensor): + """ + device = y_true.device + y_true.squeeze_() + y_score.squeeze_() + if y_true.shape != y_score.shape: + raise TypeError(F"Shapre of y_true and y_score must match. Got {y_true.shape()} and {y_score.shape()}.") + + desc_score_indices = torch.argsort(y_score, descending=True) + y_score = y_score[desc_score_indices] + y_true = y_true[desc_score_indices] + + distinct_value_indices = torch.nonzero(y_score[1:] - y_score[:-1], as_tuple=False).squeeze() + threshold_idxs = torch.cat([distinct_value_indices, torch.tensor([y_true.numel() - 1], device=device)]) + + tps = torch.cumsum(y_true, dim=0)[threshold_idxs] + fps = 1 + threshold_idxs - tps + + tps = torch.cat([torch.zeros(1, device=device), tps]) + fps = torch.cat([torch.zeros(1, device=device), fps]) + + fpr = fps / fps[-1] + tpr = tps / tps[-1] + + area = torch.trapz(tpr, fpr) + + return area diff --git a/others/recommendation_systems/dlrm/pytorch/scripts/split_data.py b/others/recommendation_systems/dlrm/pytorch/scripts/split_data.py index b45719849..edd2299e9 100644 --- a/others/recommendation_systems/dlrm/pytorch/scripts/split_data.py +++ b/others/recommendation_systems/dlrm/pytorch/scripts/split_data.py @@ -1,58 +1,58 @@ -"""split criteo data in mlperf format""" -import math -import os -import numpy as np -from tqdm import tqdm -from absl import app -from absl import logging -from absl import flags - - -FLAGS = flags.FLAGS - -flags.DEFINE_enum("stage", "val", ["train", "val", "test"], "") -flags.DEFINE_string("bin_data_root", "", "path to binary criteo dataset") -flags.DEFINE_string("out_root", "", "path where to save") - - -def main(argv): - bin_data = os.path.join(FLAGS.bin_data_root, F"{FLAGS.stage}_data.bin") - out_path = os.path.join(FLAGS.out_root, FLAGS.stage) - logging.info("processing %s", bin_data) - - if not os.path.exists(out_path): - logging.info("Creating %s", out_path) - os.mkdir(out_path) - - input_data_f = open(bin_data, "rb") - numerical_f = open(os.path.join(out_path, "numerical.bin"), "wb+") - label_f = open(os.path.join(out_path, 'label.bin'), 'wb+') - categorical_fs = [] - for i in range(26): - categorical_fs.append(open(os.path.join(out_path, F'cat_{i}.bin'), 'wb+')) - - bytes_per_entry = 160 - total_size = os.path.getsize(bin_data) - block = 16384 - for i in tqdm(range(math.ceil((total_size // bytes_per_entry) / block))): - raw_data = np.frombuffer(input_data_f.read(bytes_per_entry * block), dtype=np.int32) - if raw_data.shape[0] != 655360: - print(raw_data.shape) - batch_data = raw_data.reshape(-1, 40) - - numerical_features = batch_data[:, 1:14] - numerical_f.write(numerical_features.astype(np.float32).tobytes()) - - label = batch_data[:, 0] - label_f.write(label.astype(np.float32).tobytes()) - for i in range(26): - categorical_fs[i].write(batch_data[:, (i + 14):(i + 15)].tobytes()) - - input_data_f.close() - numerical_f.close() - 
label_f.close() - for f in categorical_fs: - f.close() - -if __name__ == '__main__': - app.run(main) +"""split criteo data in mlperf format""" +import math +import os +import numpy as np +from tqdm import tqdm +from absl import app +from absl import logging +from absl import flags + + +FLAGS = flags.FLAGS + +flags.DEFINE_enum("stage", "val", ["train", "val", "test"], "") +flags.DEFINE_string("bin_data_root", "", "path to binary criteo dataset") +flags.DEFINE_string("out_root", "", "path where to save") + + +def main(argv): + bin_data = os.path.join(FLAGS.bin_data_root, F"{FLAGS.stage}_data.bin") + out_path = os.path.join(FLAGS.out_root, FLAGS.stage) + logging.info("processing %s", bin_data) + + if not os.path.exists(out_path): + logging.info("Creating %s", out_path) + os.mkdir(out_path) + + input_data_f = open(bin_data, "rb") + numerical_f = open(os.path.join(out_path, "numerical.bin"), "wb+") + label_f = open(os.path.join(out_path, 'label.bin'), 'wb+') + categorical_fs = [] + for i in range(26): + categorical_fs.append(open(os.path.join(out_path, F'cat_{i}.bin'), 'wb+')) + + bytes_per_entry = 160 + total_size = os.path.getsize(bin_data) + block = 16384 + for i in tqdm(range(math.ceil((total_size // bytes_per_entry) / block))): + raw_data = np.frombuffer(input_data_f.read(bytes_per_entry * block), dtype=np.int32) + if raw_data.shape[0] != 655360: + print(raw_data.shape) + batch_data = raw_data.reshape(-1, 40) + + numerical_features = batch_data[:, 1:14] + numerical_f.write(numerical_features.astype(np.float32).tobytes()) + + label = batch_data[:, 0] + label_f.write(label.astype(np.float32).tobytes()) + for i in range(26): + categorical_fs[i].write(batch_data[:, (i + 14):(i + 15)].tobytes()) + + input_data_f.close() + numerical_f.close() + label_f.close() + for f in categorical_fs: + f.close() + +if __name__ == '__main__': + app.run(main) diff --git a/others/recommendation_systems/dlrm/pytorch/setup.py b/others/recommendation_systems/dlrm/pytorch/setup.py index 534832070..c9fed7d36 100644 --- a/others/recommendation_systems/dlrm/pytorch/setup.py +++ b/others/recommendation_systems/dlrm/pytorch/setup.py @@ -1,41 +1,41 @@ -"""Simple setup script""" - -import os -import subprocess -from setuptools import setup, find_packages -from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension - -abspath = os.path.dirname(os.path.realpath(__file__)) - -with open("requirements.txt") as f: - requirements = f.read().splitlines() - -print(find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"])) - -setup(name="dlrm", - package_dir={'dlrm': 'dlrm'}, - version="1.0.0", - description="Reimplementation of Facebook's DLRM", - packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), - install_requires=requirements, - zip_safe=False, - ext_modules=[ - CUDAExtension(name="dlrm.cuda_ext", - sources=[ - os.path.join(abspath, "src/pytorch_ops.cpp"), - # os.path.join(abspath, "src/dot_based_interact_pytorch_types.cu"), - os.path.join(abspath, "src/gather_gpu.cu") - ], - extra_compile_args={ - 'cxx': [], - 'nvcc' : [ - '-DCUDA_HAS_FP16=1', - '-D__CUDA_NO_HALF_OPERATORS__', - '-D__CUDA_NO_HALF_CONVERSIONS__', - '-D__CUDA_NO_HALF2_OPERATORS__', - '-gencode', 'arch=compute_70,code=sm_70', - '-gencode', 'arch=compute_70,code=compute_70', - '-gencode', 'arch=compute_80,code=sm_80'] - }) - ], - cmdclass={"build_ext": BuildExtension}) +"""Simple setup script""" + +import os +import subprocess +from setuptools import setup, find_packages +from torch.utils.cpp_extension 
import BuildExtension, CppExtension, CUDAExtension + +abspath = os.path.dirname(os.path.realpath(__file__)) + +with open("requirements.txt") as f: + requirements = f.read().splitlines() + +print(find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"])) + +setup(name="dlrm", + package_dir={'dlrm': 'dlrm'}, + version="1.0.0", + description="Reimplementation of Facebook's DLRM", + packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), + install_requires=requirements, + zip_safe=False, + ext_modules=[ + CUDAExtension(name="dlrm.cuda_ext", + sources=[ + os.path.join(abspath, "src/pytorch_ops.cpp"), + # os.path.join(abspath, "src/dot_based_interact_pytorch_types.cu"), + os.path.join(abspath, "src/gather_gpu.cu") + ], + extra_compile_args={ + 'cxx': [], + 'nvcc' : [ + '-DCUDA_HAS_FP16=1', + '-D__CUDA_NO_HALF_OPERATORS__', + '-D__CUDA_NO_HALF_CONVERSIONS__', + '-D__CUDA_NO_HALF2_OPERATORS__', + '-gencode', 'arch=compute_70,code=sm_70', + '-gencode', 'arch=compute_70,code=compute_70', + '-gencode', 'arch=compute_80,code=sm_80'] + }) + ], + cmdclass={"build_ext": BuildExtension}) diff --git a/others/recommendation_systems/dlrm/pytorch/src/gather_gpu.cu b/others/recommendation_systems/dlrm/pytorch/src/gather_gpu.cu index ec564bcec..1184d68e2 100644 --- a/others/recommendation_systems/dlrm/pytorch/src/gather_gpu.cu +++ b/others/recommendation_systems/dlrm/pytorch/src/gather_gpu.cu @@ -1,171 +1,171 @@ -#include -#include -#include -#include - -#include -#include - -#include -#include - -// For simplicity reason, boundry checks are removed -// All the kernels MUST be launched with grid size = batch size and block size = embedding size - -__global__ void GatherKernel(const float* params, - int64_t num_features, - int embed_size, - int batch_size, - int query_nnz, - const int64_t* indices, - float* ret) { - int tid = threadIdx.x, bid = blockIdx.x; - - extern __shared__ int shmem_indices[]; - - // each CTA load one row of indices in the mini batch into shared memory - for (int i = tid; i < query_nnz; i += blockDim.x) { - shmem_indices[i] = indices[query_nnz * bid + i]; - } - __syncthreads(); - -#pragma unroll - for (int i = 0; i < query_nnz; ++i) { - // printf("%d, %d, %d\n", bid, i, shmem_indices[i]); - ret[(bid * query_nnz + i) * embed_size + tid] = - params[(int64_t)shmem_indices[i] * embed_size + tid]; - } -} - -__global__ void OneHotKernel(const float* params, - int64_t num_features, - int embed_size, - int batch_size, - const int64_t* indices, - float* ret) { - int tid = threadIdx.x, bid = blockIdx.x; - - ret[bid * embed_size + tid] = params[(int64_t)indices[bid] * embed_size + tid]; -} - -// grads is used to update params directly by atomic instead of forming wgrad -// Only SGD without momentum and without weight decay is supported -__global__ void GatherBackwardFuseSgdKernel(const float* grads, - int64_t num_features, - int embed_size, - int batch_size, - int query_nnz, - const int64_t* indices, - float lr, - float* params) { - int tid = threadIdx.x, bid = blockIdx.x; - - extern __shared__ int shmem_indices[]; - - for (int i = tid; i < query_nnz; i += blockDim.x) { - shmem_indices[i] = indices[query_nnz * bid + i]; - } - __syncthreads(); - -#pragma unroll - for (int i = 0; i < query_nnz; ++i) { - atomicAdd(¶ms[(int64_t)shmem_indices[i] * embed_size + tid], - -lr * grads[(bid * query_nnz + i) * embed_size + tid]); - } -} - -// Keep the interface and argument name as torch.embedding() -// input is indices, and weight is embedding table -torch::Tensor 
gather_gpu_fwd(const torch::Tensor weight, const torch::Tensor indices) { - AT_ASSERT(indices.is_cuda()); - AT_ASSERT(weight.is_cuda()); - AT_ASSERT(indices.scalar_type() == torch::ScalarType::Long); - AT_ASSERT(weight.scalar_type() == torch::ScalarType::Float); - AT_ASSERT(weight.is_contiguous()); - - int batch_size = indices.size(0); - int query_nnz = 1; - if (indices.dim() > 1) { - query_nnz = indices.size(1); - } - - // Shared memory size limit. Larger nnz can also be supported by skipping shared memory if necessary - TORCH_CHECK(query_nnz <= 12288, "Embedding width must be smaller than 48k"); - - int num_features = weight.size(0); - int embed_size = weight.size(1); - - // Block dimension limit. Large than 1024 width can be easily supported by letting each block read - // from different strides if necessary. - TORCH_CHECK(embed_size <= 1024, "Embedding width must be smaller than 1024"); - - auto outputs = - torch::empty(batch_size * query_nnz * embed_size, at::device(at::kCUDA).dtype(at::kFloat)); - - if (query_nnz != 1) { - GatherKernel<<>>(weight.data_ptr(), - num_features, - embed_size, - batch_size, - query_nnz, - indices.contiguous().data_ptr(), - outputs.data_ptr()); - } else { - OneHotKernel<<>>( - weight.data_ptr(), - num_features, - embed_size, - batch_size, - indices.contiguous().data_ptr(), - outputs.data_ptr()); - } - - return outputs.reshape({batch_size, query_nnz, embed_size}); -} - -// Because complication of handling sparse tensor, use the native backward function is still faster -// TODO(haow): Figure out a way to write out sparse tensor directly to avoid addintional copy which makes -// customized implementation slower than Pytorch's own desipte kernels are more efficient -torch::Tensor gather_gpu_bwd(const torch::Tensor grad, - const torch::Tensor indices, - const int num_features) { - return at::embedding_sparse_backward(grad, indices, num_features, /*padding_idx=*/-1, /*scale_grad_by_freq=*/false); -} - -// Backward gather with fused plain SGD (no weight decay nor momentum) -void gather_gpu_bwd_fuse_sgd(const torch::Tensor grad, - const torch::Tensor indices, - float lr, - torch::Tensor weight) { - AT_ASSERT(grad.is_cuda()); - AT_ASSERT(indices.is_cuda()); - AT_ASSERT(weight.is_cuda()); - AT_ASSERT(grad.scalar_type() == torch::ScalarType::Float); - AT_ASSERT(indices.scalar_type() == torch::ScalarType::Long); - AT_ASSERT(weight.scalar_type() == torch::ScalarType::Float); - AT_ASSERT(weight.is_contiguous()); - - int batch_size = indices.size(0); - int query_nnz = 1; - if (indices.dim() > 1) { - query_nnz = indices.size(1); - } - int num_features = weight.size(0); - int embed_size = weight.size(1); - - GatherBackwardFuseSgdKernel<<>>( - grad.contiguous().data_ptr(), - num_features, - embed_size, - batch_size, - query_nnz, - indices.contiguous().data_ptr(), - lr, - weight.data_ptr()); -} +#include +#include +#include +#include + +#include +#include + +#include +#include + +// For simplicity reason, boundry checks are removed +// All the kernels MUST be launched with grid size = batch size and block size = embedding size + +__global__ void GatherKernel(const float* params, + int64_t num_features, + int embed_size, + int batch_size, + int query_nnz, + const int64_t* indices, + float* ret) { + int tid = threadIdx.x, bid = blockIdx.x; + + extern __shared__ int shmem_indices[]; + + // each CTA load one row of indices in the mini batch into shared memory + for (int i = tid; i < query_nnz; i += blockDim.x) { + shmem_indices[i] = indices[query_nnz * bid + i]; + } + 
__syncthreads(); + +#pragma unroll + for (int i = 0; i < query_nnz; ++i) { + // printf("%d, %d, %d\n", bid, i, shmem_indices[i]); + ret[(bid * query_nnz + i) * embed_size + tid] = + params[(int64_t)shmem_indices[i] * embed_size + tid]; + } +} + +__global__ void OneHotKernel(const float* params, + int64_t num_features, + int embed_size, + int batch_size, + const int64_t* indices, + float* ret) { + int tid = threadIdx.x, bid = blockIdx.x; + + ret[bid * embed_size + tid] = params[(int64_t)indices[bid] * embed_size + tid]; +} + +// grads is used to update params directly by atomic instead of forming wgrad +// Only SGD without momentum and without weight decay is supported +__global__ void GatherBackwardFuseSgdKernel(const float* grads, + int64_t num_features, + int embed_size, + int batch_size, + int query_nnz, + const int64_t* indices, + float lr, + float* params) { + int tid = threadIdx.x, bid = blockIdx.x; + + extern __shared__ int shmem_indices[]; + + for (int i = tid; i < query_nnz; i += blockDim.x) { + shmem_indices[i] = indices[query_nnz * bid + i]; + } + __syncthreads(); + +#pragma unroll + for (int i = 0; i < query_nnz; ++i) { + atomicAdd(¶ms[(int64_t)shmem_indices[i] * embed_size + tid], + -lr * grads[(bid * query_nnz + i) * embed_size + tid]); + } +} + +// Keep the interface and argument name as torch.embedding() +// input is indices, and weight is embedding table +torch::Tensor gather_gpu_fwd(const torch::Tensor weight, const torch::Tensor indices) { + AT_ASSERT(indices.is_cuda()); + AT_ASSERT(weight.is_cuda()); + AT_ASSERT(indices.scalar_type() == torch::ScalarType::Long); + AT_ASSERT(weight.scalar_type() == torch::ScalarType::Float); + AT_ASSERT(weight.is_contiguous()); + + int batch_size = indices.size(0); + int query_nnz = 1; + if (indices.dim() > 1) { + query_nnz = indices.size(1); + } + + // Shared memory size limit. Larger nnz can also be supported by skipping shared memory if necessary + TORCH_CHECK(query_nnz <= 12288, "Embedding width must be smaller than 48k"); + + int num_features = weight.size(0); + int embed_size = weight.size(1); + + // Block dimension limit. Large than 1024 width can be easily supported by letting each block read + // from different strides if necessary. 
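For reference, the result that GatherKernel and OneHotKernel produce can be written in a few lines of plain PyTorch. This is a sketch used only to describe the semantics (out[b, i, :] = weight[indices[b, i], :]), not the optimized path.

import torch

def gather_fwd_reference(weight: torch.Tensor, indices: torch.Tensor) -> torch.Tensor:
    # Treat 1-D indices as query_nnz == 1, mirroring the C++ entry point.
    if indices.dim() == 1:
        indices = indices.unsqueeze(1)
    batch_size, query_nnz = indices.shape
    embed_size = weight.shape[1]
    rows = weight[indices.reshape(-1)]            # (batch_size * query_nnz, embed_size)
    return rows.reshape(batch_size, query_nnz, embed_size)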
+ TORCH_CHECK(embed_size <= 1024, "Embedding width must be smaller than 1024"); + + auto outputs = + torch::empty(batch_size * query_nnz * embed_size, at::device(at::kCUDA).dtype(at::kFloat)); + + if (query_nnz != 1) { + GatherKernel<<>>(weight.data_ptr(), + num_features, + embed_size, + batch_size, + query_nnz, + indices.contiguous().data_ptr(), + outputs.data_ptr()); + } else { + OneHotKernel<<>>( + weight.data_ptr(), + num_features, + embed_size, + batch_size, + indices.contiguous().data_ptr(), + outputs.data_ptr()); + } + + return outputs.reshape({batch_size, query_nnz, embed_size}); +} + +// Because complication of handling sparse tensor, use the native backward function is still faster +// TODO(haow): Figure out a way to write out sparse tensor directly to avoid addintional copy which makes +// customized implementation slower than Pytorch's own desipte kernels are more efficient +torch::Tensor gather_gpu_bwd(const torch::Tensor grad, + const torch::Tensor indices, + const int num_features) { + return at::embedding_sparse_backward(grad, indices, num_features, /*padding_idx=*/-1, /*scale_grad_by_freq=*/false); +} + +// Backward gather with fused plain SGD (no weight decay nor momentum) +void gather_gpu_bwd_fuse_sgd(const torch::Tensor grad, + const torch::Tensor indices, + float lr, + torch::Tensor weight) { + AT_ASSERT(grad.is_cuda()); + AT_ASSERT(indices.is_cuda()); + AT_ASSERT(weight.is_cuda()); + AT_ASSERT(grad.scalar_type() == torch::ScalarType::Float); + AT_ASSERT(indices.scalar_type() == torch::ScalarType::Long); + AT_ASSERT(weight.scalar_type() == torch::ScalarType::Float); + AT_ASSERT(weight.is_contiguous()); + + int batch_size = indices.size(0); + int query_nnz = 1; + if (indices.dim() > 1) { + query_nnz = indices.size(1); + } + int num_features = weight.size(0); + int embed_size = weight.size(1); + + GatherBackwardFuseSgdKernel<<>>( + grad.contiguous().data_ptr(), + num_features, + embed_size, + batch_size, + query_nnz, + indices.contiguous().data_ptr(), + lr, + weight.data_ptr()); +} diff --git a/others/recommendation_systems/dlrm/pytorch/tests/buckle_embedding_test.py b/others/recommendation_systems/dlrm/pytorch/tests/buckle_embedding_test.py index bdbdf03f4..6f8033a42 100644 --- a/others/recommendation_systems/dlrm/pytorch/tests/buckle_embedding_test.py +++ b/others/recommendation_systems/dlrm/pytorch/tests/buckle_embedding_test.py @@ -1,57 +1,57 @@ -"""Tests for buckle embedding""" -from absl.testing import absltest - -import torch -from torch import nn - -from dlrm.nn import BuckleEmbedding - -# pylint:disable=missing-docstring, no-self-use - -class DistEmbeddingBagTest(absltest.TestCase): - - def test_smoke(self): - test_buckle_embedding = BuckleEmbedding([3, 5, 7, 11], 3, device="cpu") - test_buckle_embedding(torch.tensor([[1, 2, 3, 4], [2, 4, 6, 10]])) - - def test_2embeddings_batch1(self): - test_sizes = [3, 5] - - test_buckle_embedding = BuckleEmbedding(test_sizes, 3, device="cpu") - ref_embeddings = nn.ModuleList() - for size in test_sizes: - ref_embeddings.append(nn.Embedding(size, 3)) - - test_buckle_embedding.embedding.weight.data = torch.cat( - [embedding.weight for embedding in ref_embeddings]).clone() - - test_indices = torch.tensor([[1, 3]]) - embedding_out = test_buckle_embedding(test_indices) - ref_out = [] - for embedding_id, embedding in enumerate(ref_embeddings): - ref_out.append(embedding(test_indices[:, embedding_id])) - ref_out = torch.cat(ref_out) - assert (ref_out == embedding_out).all() - - def test_4embeddings_batch2(self): - test_sizes = [3, 
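The fused backward below applies the SGD step directly with atomicAdd instead of materializing a weight gradient. The same update can be sketched in plain PyTorch with index_add_, where duplicate indices accumulate just as the atomics do (illustrative only; no momentum or weight decay, matching the kernel's stated limitation).

import torch

def gather_bwd_fuse_sgd_reference(grad: torch.Tensor, indices: torch.Tensor,
                                  lr: float, weight: torch.Tensor) -> None:
    flat_idx = indices.reshape(-1)                        # (batch_size * query_nnz,)
    flat_grad = grad.reshape(flat_idx.shape[0], -1)       # matching gradient rows
    weight.index_add_(0, flat_idx, -lr * flat_grad)       # duplicates accumulate, like atomicAdd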
5, 11, 13] - - test_buckle_embedding = BuckleEmbedding(test_sizes, 3, device="cpu") - ref_embeddings = nn.ModuleList() - for size in test_sizes: - ref_embeddings.append(nn.Embedding(size, 3)) - - test_buckle_embedding.embedding.weight.data = torch.cat( - [embedding.weight for embedding in ref_embeddings]).clone() - - test_indices = torch.tensor([[1, 3, 5, 7], [2, 4, 10, 12]]) - embedding_out = test_buckle_embedding(test_indices) - ref_out = [] - for embedding_id, embedding in enumerate(ref_embeddings): - ref_out.append(embedding(test_indices[:, embedding_id].unsqueeze(-1))) - ref_out = torch.cat(ref_out, dim=1) - assert (ref_out == embedding_out).all() - - -if __name__ == '__main__': - absltest.main() +"""Tests for buckle embedding""" +from absl.testing import absltest + +import torch +from torch import nn + +from dlrm.nn import BuckleEmbedding + +# pylint:disable=missing-docstring, no-self-use + +class DistEmbeddingBagTest(absltest.TestCase): + + def test_smoke(self): + test_buckle_embedding = BuckleEmbedding([3, 5, 7, 11], 3, device="cpu") + test_buckle_embedding(torch.tensor([[1, 2, 3, 4], [2, 4, 6, 10]])) + + def test_2embeddings_batch1(self): + test_sizes = [3, 5] + + test_buckle_embedding = BuckleEmbedding(test_sizes, 3, device="cpu") + ref_embeddings = nn.ModuleList() + for size in test_sizes: + ref_embeddings.append(nn.Embedding(size, 3)) + + test_buckle_embedding.embedding.weight.data = torch.cat( + [embedding.weight for embedding in ref_embeddings]).clone() + + test_indices = torch.tensor([[1, 3]]) + embedding_out = test_buckle_embedding(test_indices) + ref_out = [] + for embedding_id, embedding in enumerate(ref_embeddings): + ref_out.append(embedding(test_indices[:, embedding_id])) + ref_out = torch.cat(ref_out) + assert (ref_out == embedding_out).all() + + def test_4embeddings_batch2(self): + test_sizes = [3, 5, 11, 13] + + test_buckle_embedding = BuckleEmbedding(test_sizes, 3, device="cpu") + ref_embeddings = nn.ModuleList() + for size in test_sizes: + ref_embeddings.append(nn.Embedding(size, 3)) + + test_buckle_embedding.embedding.weight.data = torch.cat( + [embedding.weight for embedding in ref_embeddings]).clone() + + test_indices = torch.tensor([[1, 3, 5, 7], [2, 4, 10, 12]]) + embedding_out = test_buckle_embedding(test_indices) + ref_out = [] + for embedding_id, embedding in enumerate(ref_embeddings): + ref_out.append(embedding(test_indices[:, embedding_id].unsqueeze(-1))) + ref_out = torch.cat(ref_out, dim=1) + assert (ref_out == embedding_out).all() + + +if __name__ == '__main__': + absltest.main() diff --git a/others/recommendation_systems/dlrm/pytorch/tests/dataset_test.py b/others/recommendation_systems/dlrm/pytorch/tests/dataset_test.py index 97ea2aaaf..b68dfa4cb 100644 --- a/others/recommendation_systems/dlrm/pytorch/tests/dataset_test.py +++ b/others/recommendation_systems/dlrm/pytorch/tests/dataset_test.py @@ -1,171 +1,171 @@ -"""Test dataset""" -import os -import time - -import numpy as np - -from absl import flags -from absl import logging -from absl.testing import absltest - -import torch - -from dlrm.data import dataset - -FLAGS = flags.FLAGS - -flags.DEFINE_string("bin_dataset", None, "Full path to reference binary dataset") -flags.DEFINE_string("dist_dataset", None, "Full path to split binary dataset") -flags.DEFINE_integer("batch_size", 16384, "") -flags.DEFINE_integer("num_batches", 500, "Number of batches to test.") -flags.DEFINE_boolean("shuffle_batch_order", False, "Read batch in train dataset by random order", short_name="shuffle") - -# 
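The tests above build a reference out of separate nn.Embedding tables and compare it against BuckleEmbedding, which, as the weight concatenation in the test suggests, keeps all categorical tables in one matrix. A minimal sketch of that idea follows, assuming per-feature offsets into the concatenated table; the real module lives in dlrm/nn and may differ in details.

import torch
from torch import nn

class TinyBuckleEmbedding(nn.Module):
    """Illustrative only: one concatenated table plus per-feature index offsets."""
    def __init__(self, categorical_feature_sizes, embedding_dim):
        super().__init__()
        offsets = torch.tensor([0] + list(categorical_feature_sizes[:-1])).cumsum(dim=0)
        self.register_buffer("offsets", offsets)
        self.embedding = nn.Embedding(sum(categorical_feature_sizes), embedding_dim)

    def forward(self, categorical_inputs):                # (batch, num_features) of per-table indices
        return self.embedding(categorical_inputs + self.offsets)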
pylint:disable=missing-docstring, no-self-use - -class RefCriteoTerabyteLoaderTest(absltest.TestCase): - - def test_dataloader(self): - """Test reference binary data loader - - It tests data loader function also benchmark performance. It does NOT verify correctness of the dataset - """ - batch_size = FLAGS.batch_size - num_batches = FLAGS.num_batches - dataset_test = dataset.CriteoBinDataset( - data_file=FLAGS.bin_dataset, - batch_size=batch_size, - shuffle=FLAGS.shuffle) - - data_loader_test = torch.utils.data.DataLoader( - dataset_test, batch_size=None, num_workers=0, pin_memory=False, collate_fn=dataset.data_collate_fn) - - if num_batches > len(data_loader_test): - logging.warning(F"Only {len(data_loader_test)} batches in the dataset, asked for {num_batches}") - num_batches = len(data_loader_test) - - for i, _ in enumerate(data_loader_test): - if i == 0: - start_time = time.time() - if i % 100 == 0: - partial_time = time.time() - logging.info("Fetched %d batches in %.2fs", i, partial_time - start_time) - if i > num_batches: - break - end_time = time.time() - logging.info("Finished fetching %d records at %.1f records/s", - num_batches * batch_size, - num_batches * batch_size / (end_time - start_time)) - -class CriteoMemmapLoaderTest(absltest.TestCase): - - def test_dataloader(self): - """Test reference binary data loader - - It tests data loader function also benchmark performance. It does NOT verify correctness of the dataset - """ - batch_size = FLAGS.batch_size - num_batches = FLAGS.num_batches - dataset_test = dataset.CriteoMemmapDataset( - data_file=FLAGS.bin_dataset, - batch_size=batch_size, - shuffle=FLAGS.shuffle) - - data_loader_test = torch.utils.data.DataLoader( - dataset_test, batch_size=None, num_workers=0, pin_memory=False, collate_fn=dataset.data_collate_fn) - - if num_batches > len(data_loader_test): - logging.warning(F"Only {len(data_loader_test)} batches in the dataset, asked for {num_batches}") - num_batches = len(data_loader_test) - - for i, _ in enumerate(data_loader_test): - if i == 0: - start_time = time.time() - if i % 100 == 0: - partial_time = time.time() - logging.info("Fetched %d batches in %.2fs", i, partial_time - start_time) - if i > num_batches: - break - end_time = time.time() - logging.info("Finished fetching %d records at %.1f records/s", - num_batches * batch_size, - num_batches * batch_size / (end_time - start_time)) - - -class DistCriteoDatasetTest(absltest.TestCase): - - def test_creation(self): - _ = dataset.DistCriteoDataset( - data_path=FLAGS.dist_dataset, - batch_size=FLAGS.batch_size, - numerical_features=True, - categorical_features=[0, 1, 2, 3, 4]) - - def test_against_bin(self): - dist_dataset = dataset.DistCriteoDataset( - FLAGS.dist_dataset, - batch_size=FLAGS.batch_size, - numerical_features=True, - categorical_features=range(26)) - bin_dataset = dataset.CriteoBinDataset( - FLAGS.bin_dataset, - batch_size=FLAGS.batch_size) - - data_loader_dist = torch.utils.data.DataLoader( - dist_dataset, batch_size=None, num_workers=0, pin_memory=False, collate_fn=dataset.data_collate_fn) - - data_loader_bin = torch.utils.data.DataLoader( - bin_dataset, batch_size=None, num_workers=0, pin_memory=False, collate_fn=dataset.data_collate_fn) - - for i, (data, ref) in enumerate(zip(data_loader_dist, data_loader_bin)): - if i > FLAGS.num_batches: - break - np.testing.assert_equal(data[0].cpu().numpy(), ref[0].cpu().numpy(), err_msg=F"Miss match in batch {i}.") - np.testing.assert_equal(data[1].cpu().numpy(), ref[1].cpu().numpy(), err_msg=F"Miss match in batch 
{i}.") - np.testing.assert_equal(data[2].cpu().numpy(), ref[2].cpu().numpy(), err_msg=F"Miss match in batch {i}.") - - def test_dataloader(self): - batch_size = FLAGS.batch_size - num_batches = FLAGS.num_batches - - try: - rank = int(os.environ["RANK"]) - except KeyError: - rank = 0 - - if rank == 0: - numerical_features = True - categorical_features = None - else: - numerical_features = False - categorical_features = range(rank * 4, (rank + 1) * 4) - - dataset_test = dataset.DistCriteoDataset( - data_path=FLAGS.dist_dataset, - batch_size=batch_size, - shuffle=FLAGS.shuffle, - numerical_features=numerical_features, - categorical_features=categorical_features) - - data_loader_test = torch.utils.data.DataLoader( - dataset_test, batch_size=None, num_workers=0, pin_memory=False, collate_fn=dataset.data_collate_fn) - - if num_batches > len(data_loader_test): - logging.warning(F"Only {len(data_loader_test)} batches in the dataset, asked for {num_batches}") - num_batches = len(data_loader_test) - - for i, data_batch in enumerate(data_loader_test): - if i == 0: - start_time = time.time() - if i % 100 == 0: - partial_time = time.time() - logging.info("Fetched %d batches in %.2fs", i, partial_time - start_time) - if i > num_batches: - break - end_time = time.time() - logging.info("Finished fetching %d records at %.1f records/s", - num_batches * batch_size, - num_batches * batch_size / (end_time - start_time)) - - -if __name__ == '__main__': - absltest.main() +"""Test dataset""" +import os +import time + +import numpy as np + +from absl import flags +from absl import logging +from absl.testing import absltest + +import torch + +from dlrm.data import dataset + +FLAGS = flags.FLAGS + +flags.DEFINE_string("bin_dataset", None, "Full path to reference binary dataset") +flags.DEFINE_string("dist_dataset", None, "Full path to split binary dataset") +flags.DEFINE_integer("batch_size", 16384, "") +flags.DEFINE_integer("num_batches", 500, "Number of batches to test.") +flags.DEFINE_boolean("shuffle_batch_order", False, "Read batch in train dataset by random order", short_name="shuffle") + +# pylint:disable=missing-docstring, no-self-use + +class RefCriteoTerabyteLoaderTest(absltest.TestCase): + + def test_dataloader(self): + """Test reference binary data loader + + It tests data loader function also benchmark performance. It does NOT verify correctness of the dataset + """ + batch_size = FLAGS.batch_size + num_batches = FLAGS.num_batches + dataset_test = dataset.CriteoBinDataset( + data_file=FLAGS.bin_dataset, + batch_size=batch_size, + shuffle=FLAGS.shuffle) + + data_loader_test = torch.utils.data.DataLoader( + dataset_test, batch_size=None, num_workers=0, pin_memory=False, collate_fn=dataset.data_collate_fn) + + if num_batches > len(data_loader_test): + logging.warning(F"Only {len(data_loader_test)} batches in the dataset, asked for {num_batches}") + num_batches = len(data_loader_test) + + for i, _ in enumerate(data_loader_test): + if i == 0: + start_time = time.time() + if i % 100 == 0: + partial_time = time.time() + logging.info("Fetched %d batches in %.2fs", i, partial_time - start_time) + if i > num_batches: + break + end_time = time.time() + logging.info("Finished fetching %d records at %.1f records/s", + num_batches * batch_size, + num_batches * batch_size / (end_time - start_time)) + +class CriteoMemmapLoaderTest(absltest.TestCase): + + def test_dataloader(self): + """Test reference binary data loader + + It tests data loader function also benchmark performance. 
It does NOT verify correctness of the dataset + """ + batch_size = FLAGS.batch_size + num_batches = FLAGS.num_batches + dataset_test = dataset.CriteoMemmapDataset( + data_file=FLAGS.bin_dataset, + batch_size=batch_size, + shuffle=FLAGS.shuffle) + + data_loader_test = torch.utils.data.DataLoader( + dataset_test, batch_size=None, num_workers=0, pin_memory=False, collate_fn=dataset.data_collate_fn) + + if num_batches > len(data_loader_test): + logging.warning(F"Only {len(data_loader_test)} batches in the dataset, asked for {num_batches}") + num_batches = len(data_loader_test) + + for i, _ in enumerate(data_loader_test): + if i == 0: + start_time = time.time() + if i % 100 == 0: + partial_time = time.time() + logging.info("Fetched %d batches in %.2fs", i, partial_time - start_time) + if i > num_batches: + break + end_time = time.time() + logging.info("Finished fetching %d records at %.1f records/s", + num_batches * batch_size, + num_batches * batch_size / (end_time - start_time)) + + +class DistCriteoDatasetTest(absltest.TestCase): + + def test_creation(self): + _ = dataset.DistCriteoDataset( + data_path=FLAGS.dist_dataset, + batch_size=FLAGS.batch_size, + numerical_features=True, + categorical_features=[0, 1, 2, 3, 4]) + + def test_against_bin(self): + dist_dataset = dataset.DistCriteoDataset( + FLAGS.dist_dataset, + batch_size=FLAGS.batch_size, + numerical_features=True, + categorical_features=range(26)) + bin_dataset = dataset.CriteoBinDataset( + FLAGS.bin_dataset, + batch_size=FLAGS.batch_size) + + data_loader_dist = torch.utils.data.DataLoader( + dist_dataset, batch_size=None, num_workers=0, pin_memory=False, collate_fn=dataset.data_collate_fn) + + data_loader_bin = torch.utils.data.DataLoader( + bin_dataset, batch_size=None, num_workers=0, pin_memory=False, collate_fn=dataset.data_collate_fn) + + for i, (data, ref) in enumerate(zip(data_loader_dist, data_loader_bin)): + if i > FLAGS.num_batches: + break + np.testing.assert_equal(data[0].cpu().numpy(), ref[0].cpu().numpy(), err_msg=F"Miss match in batch {i}.") + np.testing.assert_equal(data[1].cpu().numpy(), ref[1].cpu().numpy(), err_msg=F"Miss match in batch {i}.") + np.testing.assert_equal(data[2].cpu().numpy(), ref[2].cpu().numpy(), err_msg=F"Miss match in batch {i}.") + + def test_dataloader(self): + batch_size = FLAGS.batch_size + num_batches = FLAGS.num_batches + + try: + rank = int(os.environ["RANK"]) + except KeyError: + rank = 0 + + if rank == 0: + numerical_features = True + categorical_features = None + else: + numerical_features = False + categorical_features = range(rank * 4, (rank + 1) * 4) + + dataset_test = dataset.DistCriteoDataset( + data_path=FLAGS.dist_dataset, + batch_size=batch_size, + shuffle=FLAGS.shuffle, + numerical_features=numerical_features, + categorical_features=categorical_features) + + data_loader_test = torch.utils.data.DataLoader( + dataset_test, batch_size=None, num_workers=0, pin_memory=False, collate_fn=dataset.data_collate_fn) + + if num_batches > len(data_loader_test): + logging.warning(F"Only {len(data_loader_test)} batches in the dataset, asked for {num_batches}") + num_batches = len(data_loader_test) + + for i, data_batch in enumerate(data_loader_test): + if i == 0: + start_time = time.time() + if i % 100 == 0: + partial_time = time.time() + logging.info("Fetched %d batches in %.2fs", i, partial_time - start_time) + if i > num_batches: + break + end_time = time.time() + logging.info("Finished fetching %d records at %.1f records/s", + num_batches * batch_size, + num_batches * batch_size / 
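DistCriteoDatasetTest.test_dataloader derives each rank's slice of the Criteo features from the RANK environment variable: rank 0 keeps the dense features, every other rank reads four categorical features. That policy, isolated as a helper for clarity (the four-per-rank width is simply what the test hard-codes, not a general rule):

def features_for_rank(rank: int, features_per_rank: int = 4):
    """Return (numerical_features, categorical_features) as passed to DistCriteoDataset."""
    if rank == 0:
        return True, None
    lo = rank * features_per_rank
    return False, range(lo, lo + features_per_rank)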
(end_time - start_time)) + + +if __name__ == '__main__': + absltest.main() diff --git a/others/recommendation_systems/dlrm/pytorch/tests/dist_model_test.py b/others/recommendation_systems/dlrm/pytorch/tests/dist_model_test.py index 7a82cbf99..8a00eca2f 100644 --- a/others/recommendation_systems/dlrm/pytorch/tests/dist_model_test.py +++ b/others/recommendation_systems/dlrm/pytorch/tests/dist_model_test.py @@ -1,88 +1,88 @@ -"""Tests for distributed model""" -from copy import copy -from absl.testing import absltest - -import torch -from torch import nn - -from dlrm import model -from dlrm import dist_model - -# pylint:disable=missing-docstring, no-self-use - -_DUMMY_BOTTOM_CONFIG = { - "num_numerical_features" : 13, - "categorical_feature_sizes" : [5, 7], - "bottom_mlp_sizes" : [512, 256, 64, 5], - "embedding_dim": 5, -} - -_DUMMY_TOP_CONFIG = { - "top_mlp_sizes" : [512, 256, 1], - "num_interaction_inputs": 3, - "embedding_dim": 5 -} - -class DlrmBottomTest(absltest.TestCase): - - def test_simple(self): - # test creation - test_model = dist_model.DlrmBottom(**_DUMMY_BOTTOM_CONFIG) - - # Test forward - test_numerical_input = torch.randn(2, 13, device="cuda") - test_sparse_inputs = torch.tensor([[1, 1], [2, 2]], device="cuda") # pylint:disable=not-callable - test_out = test_model(test_numerical_input, test_sparse_inputs) - - def test_empty_bottom_mlp(self): - config = copy(_DUMMY_BOTTOM_CONFIG) - config.pop('bottom_mlp_sizes') - test_model = dist_model.DlrmBottom(**config) - - test_numerical_input = torch.randn(2, 13, device="cuda") - test_sparse_inputs = torch.tensor([[1, 1], [2, 2]], device="cuda") # pylint:disable=not-callable - test_out = test_model(test_numerical_input, test_sparse_inputs) - - -class DlrmTopTest(absltest.TestCase): - - def test_simple(self): - # test creation - test_model = dist_model.DlrmTop(**_DUMMY_TOP_CONFIG).to("cuda") - - # Test forward - test_bottom_output = torch.rand(2, 3, 5, device="cuda") - test_model(test_bottom_output) - -class DlrmBottomAndTopTest(absltest.TestCase): - - def test_against_base_model(self): - model_config = copy(_DUMMY_BOTTOM_CONFIG) - model_config.update(_DUMMY_TOP_CONFIG) - model_config.pop('num_interaction_inputs') - ref_model = model.DlrmJointEmbedding(**model_config) - ref_model.to("cuda") - - test_model = dist_model.DistDlrm(**model_config) - test_model.to("cuda") - print(test_model) - - # Copy weight to make to models identical - test_model.bottom_model.joint_embedding.embedding.weight.data.copy_(ref_model.embeddings[0].embedding.weight) - for i in range(len(test_model.bottom_model.bottom_mlp)): - if isinstance(ref_model.bottom_mlp[i], nn.Linear): - test_model.bottom_model.bottom_mlp[i].weight.data.copy_(ref_model.bottom_mlp[i].weight) - test_model.bottom_model.bottom_mlp[i].bias.data.copy_(ref_model.bottom_mlp[i].bias) - for i in range(len(test_model.top_model.top_mlp)): - if isinstance(ref_model.bottom_mlp[i], nn.Linear): - test_model.top_model.top_mlp[i].weight.data.copy_(ref_model.top_mlp[i].weight) - test_model.top_model.top_mlp[i].bias.data.copy_(ref_model.top_mlp[i].bias) - - test_numerical_input = torch.randn(2, 13, device="cuda") - test_sparse_inputs = torch.tensor([[1, 1], [2, 2]], device="cuda") # pylint:disable=not-callable - test_top_out = test_model(test_numerical_input, test_sparse_inputs) - ref_top_out = ref_model(test_numerical_input, test_sparse_inputs.t()) - assert (test_top_out == ref_top_out).all() - -if __name__ == '__main__': - absltest.main() +"""Tests for distributed model""" +from copy import copy +from 
absl.testing import absltest + +import torch +from torch import nn + +from dlrm import model +from dlrm import dist_model + +# pylint:disable=missing-docstring, no-self-use + +_DUMMY_BOTTOM_CONFIG = { + "num_numerical_features" : 13, + "categorical_feature_sizes" : [5, 7], + "bottom_mlp_sizes" : [512, 256, 64, 5], + "embedding_dim": 5, +} + +_DUMMY_TOP_CONFIG = { + "top_mlp_sizes" : [512, 256, 1], + "num_interaction_inputs": 3, + "embedding_dim": 5 +} + +class DlrmBottomTest(absltest.TestCase): + + def test_simple(self): + # test creation + test_model = dist_model.DlrmBottom(**_DUMMY_BOTTOM_CONFIG) + + # Test forward + test_numerical_input = torch.randn(2, 13, device="cuda") + test_sparse_inputs = torch.tensor([[1, 1], [2, 2]], device="cuda") # pylint:disable=not-callable + test_out = test_model(test_numerical_input, test_sparse_inputs) + + def test_empty_bottom_mlp(self): + config = copy(_DUMMY_BOTTOM_CONFIG) + config.pop('bottom_mlp_sizes') + test_model = dist_model.DlrmBottom(**config) + + test_numerical_input = torch.randn(2, 13, device="cuda") + test_sparse_inputs = torch.tensor([[1, 1], [2, 2]], device="cuda") # pylint:disable=not-callable + test_out = test_model(test_numerical_input, test_sparse_inputs) + + +class DlrmTopTest(absltest.TestCase): + + def test_simple(self): + # test creation + test_model = dist_model.DlrmTop(**_DUMMY_TOP_CONFIG).to("cuda") + + # Test forward + test_bottom_output = torch.rand(2, 3, 5, device="cuda") + test_model(test_bottom_output) + +class DlrmBottomAndTopTest(absltest.TestCase): + + def test_against_base_model(self): + model_config = copy(_DUMMY_BOTTOM_CONFIG) + model_config.update(_DUMMY_TOP_CONFIG) + model_config.pop('num_interaction_inputs') + ref_model = model.DlrmJointEmbedding(**model_config) + ref_model.to("cuda") + + test_model = dist_model.DistDlrm(**model_config) + test_model.to("cuda") + print(test_model) + + # Copy weight to make to models identical + test_model.bottom_model.joint_embedding.embedding.weight.data.copy_(ref_model.embeddings[0].embedding.weight) + for i in range(len(test_model.bottom_model.bottom_mlp)): + if isinstance(ref_model.bottom_mlp[i], nn.Linear): + test_model.bottom_model.bottom_mlp[i].weight.data.copy_(ref_model.bottom_mlp[i].weight) + test_model.bottom_model.bottom_mlp[i].bias.data.copy_(ref_model.bottom_mlp[i].bias) + for i in range(len(test_model.top_model.top_mlp)): + if isinstance(ref_model.bottom_mlp[i], nn.Linear): + test_model.top_model.top_mlp[i].weight.data.copy_(ref_model.top_mlp[i].weight) + test_model.top_model.top_mlp[i].bias.data.copy_(ref_model.top_mlp[i].bias) + + test_numerical_input = torch.randn(2, 13, device="cuda") + test_sparse_inputs = torch.tensor([[1, 1], [2, 2]], device="cuda") # pylint:disable=not-callable + test_top_out = test_model(test_numerical_input, test_sparse_inputs) + ref_top_out = ref_model(test_numerical_input, test_sparse_inputs.t()) + assert (test_top_out == ref_top_out).all() + +if __name__ == '__main__': + absltest.main() diff --git a/others/recommendation_systems/dlrm/pytorch/tests/metrics_test.py b/others/recommendation_systems/dlrm/pytorch/tests/metrics_test.py index 7fed35f4f..ee44b85e4 100644 --- a/others/recommendation_systems/dlrm/pytorch/tests/metrics_test.py +++ b/others/recommendation_systems/dlrm/pytorch/tests/metrics_test.py @@ -1,41 +1,41 @@ -"""Tests for metrics""" -from absl.testing import absltest -from sklearn.metrics import roc_auc_score - -import numpy as np - -import torch - -from dlrm.utils import metrics - -# pylint:disable=missing-docstring, 
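The dummy configs above fit together as follows: DlrmBottom emits one embedding_dim-wide vector per categorical feature plus one from the bottom MLP, which is presumably why DlrmTop is built with num_interaction_inputs = 3 and DlrmTopTest feeds it a (batch, 3, 5) tensor. A one-line check of that bookkeeping:

categorical_feature_sizes = [5, 7]
num_interaction_inputs = len(categorical_feature_sizes) + 1   # 2 embeddings + bottom-MLP output
assert num_interaction_inputs == 3                            # matches _DUMMY_TOP_CONFIG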
no-self-use - -class AucTest(absltest.TestCase): - - def test_against_sklearn_exact(self): - for num_samples in [100, 1000, 10000, 100000, 1048576]: - y = np.random.randint(0, 2, num_samples) - scores = np.random.power(10, num_samples) - ref_auc = roc_auc_score(y, scores) - test_auc = metrics.ref_roc_auc_score(y, scores) - assert ref_auc == test_auc - - def test_against_sklearn_almost_exact(self): - for num_samples in [100, 1000, 10000, 100000, 1048576]: - y = np.random.randint(0, 2, num_samples) - scores = np.random.power(10, num_samples) - ref_auc = roc_auc_score(y, scores) - test_auc = metrics.ref_roc_auc_score(y, scores, exact=False) - np.testing.assert_almost_equal(ref_auc, test_auc) - - def test_pytorch_against_sklearn(self): - for num_samples in [100, 1000, 10000, 100000, 1048576]: - y = np.random.randint(0, 2, num_samples).astype(np.float32) - scores = np.random.power(10, num_samples).astype(np.float32) - ref_auc = roc_auc_score(y, scores) - - test_auc = metrics.roc_auc_score(torch.from_numpy(y).cuda(), torch.from_numpy(scores).cuda()) - np.testing.assert_almost_equal(ref_auc, test_auc.cpu().numpy()) - -if __name__ == '__main__': - absltest.main() +"""Tests for metrics""" +from absl.testing import absltest +from sklearn.metrics import roc_auc_score + +import numpy as np + +import torch + +from dlrm.utils import metrics + +# pylint:disable=missing-docstring, no-self-use + +class AucTest(absltest.TestCase): + + def test_against_sklearn_exact(self): + for num_samples in [100, 1000, 10000, 100000, 1048576]: + y = np.random.randint(0, 2, num_samples) + scores = np.random.power(10, num_samples) + ref_auc = roc_auc_score(y, scores) + test_auc = metrics.ref_roc_auc_score(y, scores) + assert ref_auc == test_auc + + def test_against_sklearn_almost_exact(self): + for num_samples in [100, 1000, 10000, 100000, 1048576]: + y = np.random.randint(0, 2, num_samples) + scores = np.random.power(10, num_samples) + ref_auc = roc_auc_score(y, scores) + test_auc = metrics.ref_roc_auc_score(y, scores, exact=False) + np.testing.assert_almost_equal(ref_auc, test_auc) + + def test_pytorch_against_sklearn(self): + for num_samples in [100, 1000, 10000, 100000, 1048576]: + y = np.random.randint(0, 2, num_samples).astype(np.float32) + scores = np.random.power(10, num_samples).astype(np.float32) + ref_auc = roc_auc_score(y, scores) + + test_auc = metrics.roc_auc_score(torch.from_numpy(y).cuda(), torch.from_numpy(scores).cuda()) + np.testing.assert_almost_equal(ref_auc, test_auc.cpu().numpy()) + +if __name__ == '__main__': + absltest.main() diff --git a/others/recommendation_systems/dlrm/pytorch/tests/model_test.py b/others/recommendation_systems/dlrm/pytorch/tests/model_test.py index dd884656e..a508ce597 100644 --- a/others/recommendation_systems/dlrm/pytorch/tests/model_test.py +++ b/others/recommendation_systems/dlrm/pytorch/tests/model_test.py @@ -1,111 +1,111 @@ -"""Tests for model class""" -from absl.testing import absltest - -import torch - -from dlrm import model - -# pylint:disable=missing-docstring, no-self-use - -_DUMMY_CONFIG = { - "num_numerical_features" : 13, - "categorical_feature_sizes" : [5, 7], - "bottom_mlp_sizes" : [512, 256, 64, 3], - "top_mlp_sizes" : [512, 256, 1], - "embedding_dim": 3, -} - -class DlrmTest(absltest.TestCase): - - def test_simple(self): - # test creation - test_model = model.Dlrm(**_DUMMY_CONFIG) - test_model.set_devices("cuda") - - # Test forward - test_numerical_input = torch.randn(2, 13, device="cuda") - test_sparse_inputs = torch.tensor([[1, 1], [2, 2]], device="cuda") 
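The metrics tests compare the package's roc_auc_score against sklearn on random scores. For context, with no tied scores the AUC reduces to the Mann-Whitney rank-sum statistic, which is easy to verify directly (a sketch, not part of the test suite):

import numpy as np
from sklearn.metrics import roc_auc_score

def auc_rank_sum(y, scores):
    order = np.argsort(scores)
    ranks = np.empty(len(scores), dtype=np.float64)
    ranks[order] = np.arange(1, len(scores) + 1)          # 1-based ranks, no ties assumed
    n_pos = y.sum()
    n_neg = len(y) - n_pos
    return (ranks[y == 1].sum() - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)

y = np.random.randint(0, 2, 1000)
scores = np.random.rand(1000)
assert abs(auc_rank_sum(y, scores) - roc_auc_score(y, scores)) < 1e-6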
# pylint:disable=not-callable - test_model(test_numerical_input, test_sparse_inputs) - - def test_kaggle_criteo(self): - """Test a real configuration stored in json - It is not tiny so will take a while to create all the embedding tables - """ - with open("dlrm/config/criteo_kaggle_tiny.json", "r") as jsonf: - dlrm_criteo_kaggle = model.Dlrm.from_json(jsonf.read()) - dlrm_criteo_kaggle.cuda() - print(dlrm_criteo_kaggle) - - def test_interaction(self): - """Test interaction ops - TODO(haow): It probably deserves more tests, especially the dot interaction - """ - test_model = model.Dlrm( - num_numerical_features=13, - categorical_feature_sizes=range(2, 28), - bottom_mlp_sizes=[128, 32], - top_mlp_sizes=[256, 1],) - - # 26 sparse features + 13 dense feature with embedding size 32, plus padding 1 - assert test_model.top_mlp[0].in_features == 383 + 1 - - def test_hash(self): - # test creation - test_model = model.Dlrm(**_DUMMY_CONFIG, hash_indices=True) - test_model.set_devices("cuda") - - # Test forward - ref_numerical_input = torch.randn(2, 13, device="cuda") - ref_sparse_inputs = torch.tensor([[1, 2], [2, 3]], device="cuda") # pylint:disable=not-callable - ref = test_model(ref_numerical_input, ref_sparse_inputs) - - # Test indices that will be hashed to the same value as ref - test_sparse_inputs = torch.tensor([[1, 7], [9, 3]], device="cuda") # pylint:disable=not-callable - test_result = test_model(ref_numerical_input, test_sparse_inputs) - - assert (ref == test_result).all() - - -class DlrmJointEmbeddingTest(absltest.TestCase): - - def test_against_base(self): - torch.set_printoptions(precision=4, sci_mode=False) - ref_model = model.Dlrm(**_DUMMY_CONFIG) - test_model = model.DlrmJointEmbedding(**_DUMMY_CONFIG) - ref_model.set_devices("cuda") - test_model.to("cuda") - - # Copy model weight from ref_model - test_model.embeddings[0].embedding.weight.data = torch.cat( - [embedding.weight for embedding in ref_model.embeddings]).clone() - test_module_dict = dict(test_model.named_modules()) - for name, module in ref_model.named_modules(): - if isinstance(module, torch.nn.Linear): - test_module_dict[name].weight.data.copy_(module.weight) - test_module_dict[name].bias.data.copy_(module.bias) - - test_numerical_input = torch.randn(3, 13, device="cuda") - test_sparse_inputs = torch.randint(0, 3, (2, 3), device="cuda") # pylint:disable=not-callable - - ref_out = ref_model(test_numerical_input, test_sparse_inputs) - test_out = test_model(test_numerical_input, test_sparse_inputs) - assert (ref_out == test_out).all() - - def test_hash(self): - # test creation - test_model = model.DlrmJointEmbedding(**_DUMMY_CONFIG, hash_indices=True) - test_model.to("cuda") - - # Test forward - ref_numerical_input = torch.randn(2, 13, device="cuda") - ref_sparse_inputs = torch.tensor([[1, 2], [2, 3]], device="cuda") # pylint:disable=not-callable - ref = test_model(ref_numerical_input, ref_sparse_inputs) - - # Test indices that will be hashed to the same value as ref - test_sparse_inputs = torch.tensor([[1, 7], [9, 3]], device="cuda") # pylint:disable=not-callable - test_result = test_model(ref_numerical_input, test_sparse_inputs) - - assert (ref == test_result).all() - -if __name__ == '__main__': - absltest.main() +"""Tests for model class""" +from absl.testing import absltest + +import torch + +from dlrm import model + +# pylint:disable=missing-docstring, no-self-use + +_DUMMY_CONFIG = { + "num_numerical_features" : 13, + "categorical_feature_sizes" : [5, 7], + "bottom_mlp_sizes" : [512, 256, 64, 3], + "top_mlp_sizes" : 
[512, 256, 1], + "embedding_dim": 3, +} + +class DlrmTest(absltest.TestCase): + + def test_simple(self): + # test creation + test_model = model.Dlrm(**_DUMMY_CONFIG) + test_model.set_devices("cuda") + + # Test forward + test_numerical_input = torch.randn(2, 13, device="cuda") + test_sparse_inputs = torch.tensor([[1, 1], [2, 2]], device="cuda") # pylint:disable=not-callable + test_model(test_numerical_input, test_sparse_inputs) + + def test_kaggle_criteo(self): + """Test a real configuration stored in json + It is not tiny so will take a while to create all the embedding tables + """ + with open("dlrm/config/criteo_kaggle_tiny.json", "r") as jsonf: + dlrm_criteo_kaggle = model.Dlrm.from_json(jsonf.read()) + dlrm_criteo_kaggle.cuda() + print(dlrm_criteo_kaggle) + + def test_interaction(self): + """Test interaction ops + TODO(haow): It probably deserves more tests, especially the dot interaction + """ + test_model = model.Dlrm( + num_numerical_features=13, + categorical_feature_sizes=range(2, 28), + bottom_mlp_sizes=[128, 32], + top_mlp_sizes=[256, 1],) + + # 26 sparse features + 13 dense feature with embedding size 32, plus padding 1 + assert test_model.top_mlp[0].in_features == 383 + 1 + + def test_hash(self): + # test creation + test_model = model.Dlrm(**_DUMMY_CONFIG, hash_indices=True) + test_model.set_devices("cuda") + + # Test forward + ref_numerical_input = torch.randn(2, 13, device="cuda") + ref_sparse_inputs = torch.tensor([[1, 2], [2, 3]], device="cuda") # pylint:disable=not-callable + ref = test_model(ref_numerical_input, ref_sparse_inputs) + + # Test indices that will be hashed to the same value as ref + test_sparse_inputs = torch.tensor([[1, 7], [9, 3]], device="cuda") # pylint:disable=not-callable + test_result = test_model(ref_numerical_input, test_sparse_inputs) + + assert (ref == test_result).all() + + +class DlrmJointEmbeddingTest(absltest.TestCase): + + def test_against_base(self): + torch.set_printoptions(precision=4, sci_mode=False) + ref_model = model.Dlrm(**_DUMMY_CONFIG) + test_model = model.DlrmJointEmbedding(**_DUMMY_CONFIG) + ref_model.set_devices("cuda") + test_model.to("cuda") + + # Copy model weight from ref_model + test_model.embeddings[0].embedding.weight.data = torch.cat( + [embedding.weight for embedding in ref_model.embeddings]).clone() + test_module_dict = dict(test_model.named_modules()) + for name, module in ref_model.named_modules(): + if isinstance(module, torch.nn.Linear): + test_module_dict[name].weight.data.copy_(module.weight) + test_module_dict[name].bias.data.copy_(module.bias) + + test_numerical_input = torch.randn(3, 13, device="cuda") + test_sparse_inputs = torch.randint(0, 3, (2, 3), device="cuda") # pylint:disable=not-callable + + ref_out = ref_model(test_numerical_input, test_sparse_inputs) + test_out = test_model(test_numerical_input, test_sparse_inputs) + assert (ref_out == test_out).all() + + def test_hash(self): + # test creation + test_model = model.DlrmJointEmbedding(**_DUMMY_CONFIG, hash_indices=True) + test_model.to("cuda") + + # Test forward + ref_numerical_input = torch.randn(2, 13, device="cuda") + ref_sparse_inputs = torch.tensor([[1, 2], [2, 3]], device="cuda") # pylint:disable=not-callable + ref = test_model(ref_numerical_input, ref_sparse_inputs) + + # Test indices that will be hashed to the same value as ref + test_sparse_inputs = torch.tensor([[1, 7], [9, 3]], device="cuda") # pylint:disable=not-callable + test_result = test_model(ref_numerical_input, test_sparse_inputs) + + assert (ref == test_result).all() + +if __name__ 
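The 383 + 1 asserted in test_interaction above is worth unpacking. Assuming the standard DLRM lower-triangular dot interaction (which matches the number the test checks): 26 categorical embeddings plus the bottom-MLP output give 27 vectors of width 32, so 27*26/2 = 351 pairwise dot products are concatenated with the 32-wide dense vector and padded by one column.

num_sparse, embed_dim = 26, 32
num_inputs = num_sparse + 1                      # 26 embeddings + bottom-MLP output
pairwise = num_inputs * (num_inputs - 1) // 2    # 351 lower-triangular dot products
top_mlp_in = pairwise + embed_dim                # 351 + 32 = 383
assert top_mlp_in + 1 == 384                     # the padded width the test asserts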
== '__main__': + absltest.main() diff --git a/toolbox/Fairseq/patch/bart.py b/toolbox/Fairseq/patch/bart.py index 40c8999f2..0e3047591 100644 --- a/toolbox/Fairseq/patch/bart.py +++ b/toolbox/Fairseq/patch/bart.py @@ -1,40 +1,40 @@ -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -from fairseq.models.bart import BARTModel - -bart = BARTModel.from_pretrained( - 'checkpoints/bart', - checkpoint_file='checkpoint_best.pt', - data_name_or_path='RTE-bin' -) - -label_fn = lambda label: bart.task.label_dictionary.string( - [label + bart.task.label_dictionary.nspecial] -) -ncorrect, nsamples = 0, 0 -bart.cuda() -bart.eval() -with open('glue_data/RTE/dev.tsv') as fin: - fin.readline() - for index, line in enumerate(fin): - tokens = line.strip().split('\t') - sent1, sent2, target = tokens[1], tokens[2], tokens[3] - tokens = bart.encode(sent1, sent2) - prediction = bart.predict('sentence_classification_head', tokens).argmax().item() - prediction_label = label_fn(prediction) - ncorrect += int(prediction_label == target) - nsamples += 1 +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from fairseq.models.bart import BARTModel + +bart = BARTModel.from_pretrained( + 'checkpoints/bart', + checkpoint_file='checkpoint_best.pt', + data_name_or_path='RTE-bin' +) + +label_fn = lambda label: bart.task.label_dictionary.string( + [label + bart.task.label_dictionary.nspecial] +) +ncorrect, nsamples = 0, 0 +bart.cuda() +bart.eval() +with open('glue_data/RTE/dev.tsv') as fin: + fin.readline() + for index, line in enumerate(fin): + tokens = line.strip().split('\t') + sent1, sent2, target = tokens[1], tokens[2], tokens[3] + tokens = bart.encode(sent1, sent2) + prediction = bart.predict('sentence_classification_head', tokens).argmax().item() + prediction_label = label_fn(prediction) + ncorrect += int(prediction_label == target) + nsamples += 1 print('| Accuracy: ', float(ncorrect)/float(nsamples)) \ No newline at end of file diff --git a/toolbox/Fairseq/patch/roberta.py b/toolbox/Fairseq/patch/roberta.py index 39a7e6eb8..cfab494ed 100644 --- a/toolbox/Fairseq/patch/roberta.py +++ b/toolbox/Fairseq/patch/roberta.py @@ -1,40 +1,40 @@ -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -from fairseq.models.roberta import RobertaModel - -roberta = RobertaModel.from_pretrained( - 'checkpoints/roberta', - checkpoint_file='checkpoint_best.pt', - data_name_or_path='RTE-bin' -) - -label_fn = lambda label: roberta.task.label_dictionary.string( - [label + roberta.task.label_dictionary.nspecial] -) -ncorrect, nsamples = 0, 0 -roberta.cuda() -roberta.eval() -with open('glue_data/RTE/dev.tsv') as fin: - fin.readline() - for index, line in enumerate(fin): - tokens = line.strip().split('\t') - sent1, sent2, target = tokens[1], tokens[2], tokens[3] - tokens = roberta.encode(sent1, sent2) - prediction = roberta.predict('sentence_classification_head', tokens).argmax().item() - prediction_label = label_fn(prediction) - ncorrect += int(prediction_label == target) - nsamples += 1 -print('| Accuracy: ', float(ncorrect)/float(nsamples)) +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +from fairseq.models.roberta import RobertaModel + +roberta = RobertaModel.from_pretrained( + 'checkpoints/roberta', + checkpoint_file='checkpoint_best.pt', + data_name_or_path='RTE-bin' +) + +label_fn = lambda label: roberta.task.label_dictionary.string( + [label + roberta.task.label_dictionary.nspecial] +) +ncorrect, nsamples = 0, 0 +roberta.cuda() +roberta.eval() +with open('glue_data/RTE/dev.tsv') as fin: + fin.readline() + for index, line in enumerate(fin): + tokens = line.strip().split('\t') + sent1, sent2, target = tokens[1], tokens[2], tokens[3] + tokens = roberta.encode(sent1, sent2) + prediction = roberta.predict('sentence_classification_head', tokens).argmax().item() + prediction_label = label_fn(prediction) + ncorrect += int(prediction_label == target) + nsamples += 1 +print('| Accuracy: ', float(ncorrect)/float(nsamples)) diff --git a/toolbox/MoE-LLaVA/moellava/vis/vis1.py b/toolbox/MoE-LLaVA/moellava/vis/vis1.py index 6ca430097..bf5d3f634 100644 --- a/toolbox/MoE-LLaVA/moellava/vis/vis1.py +++ b/toolbox/MoE-LLaVA/moellava/vis/vis1.py @@ -1,169 +1,169 @@ -import torch -from tqdm import tqdm -from torch.nn import functional as F -from collections import Counter -import numpy as np -import argparse -import matplotlib.pyplot as plt - - -def draw(args): - data = torch.load(args.input) - all_text_img_expert_counter_list = [] - for k, v in tqdm(data.items()): - gating_logit = v['gating_logit'] - images = v['images'][0] if v['images'] is not None else v['images'] - input_ids = v['input_ids'][0].tolist() - output_ids = v['output_ids'][0].tolist() - gating_logit = v['gating_logit'] - num_moe_layers = len(gating_logit) - - if images is not None: - assert gating_logit[0].shape[0] + 1 == len(output_ids) + 575 - img_idx = output_ids.index(-200) - output_ids = output_ids[:img_idx] + [-200] * 576 + output_ids[img_idx+1:] - - text_img_expert_counters = [] - for layer_idx, logits in enumerate(gating_logit): - - assert logits.shape[0] + 1 == len(output_ids) # double check - num_expert = logits.shape[1] - gates = F.softmax(logits, dim=1) - indices1_s = torch.argmax(gates, dim=1) # Create a mask for 1st's expert per token - mask1 = F.one_hot(indices1_s, num_classes=int(gates.shape[1])) - exp_counts = torch.sum(mask1, dim=0).detach().to('cpu') # gating decisions - - text_indices1_s = torch.cat([indices1_s[:img_idx], indices1_s[img_idx+576+1:]]) - img_indices1_s = indices1_s[img_idx:img_idx+576] - text_expert_counter = Counter(text_indices1_s.tolist()) - img_expert_counter = Counter(img_indices1_s.tolist()) - - len_text = len(text_indices1_s) - len_img = len(img_indices1_s) - scale = len_img / len_text - - text_img_expert_counter_list = [[int(text_expert_counter[k] * scale), img_expert_counter[k]] for k in range(num_expert)] - - text_img_expert_counters.append(text_img_expert_counter_list) - all_text_img_expert_counter_list.append(text_img_expert_counters) - print() - - all_text_img_expert_counter_list = np.array(all_text_img_expert_counter_list) - all_text_img_expert_counter = all_text_img_expert_counter_list / np.sum(all_text_img_expert_counter_list, axis=-1, keepdims=True) - all_text_img_expert_counter = np.mean(all_text_img_expert_counter, axis=0) - - all_text_img_expert = np.sum(all_text_img_expert_counter_list, axis=-1) - all_text_img_expert = all_text_img_expert / np.sum(all_text_img_expert, axis=-1, keepdims=True) - all_text_img_expert = np.mean(all_text_img_expert, axis=0) - - num_layer = all_text_img_expert_counter.shape[0] - categories = [i*2+1 for i in range(num_layer)] - fig, (ax0, 
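The bart.py and roberta.py patches above are identical apart from the model class they load; the shared evaluation loop could be expressed once as a helper (a sketch, not part of either patch; the data path and classification-head name follow the scripts above):

def evaluate_rte(model, dev_tsv='glue_data/RTE/dev.tsv'):
    label_fn = lambda label: model.task.label_dictionary.string(
        [label + model.task.label_dictionary.nspecial])
    model.cuda()
    model.eval()
    ncorrect, nsamples = 0, 0
    with open(dev_tsv) as fin:
        fin.readline()                                    # skip the TSV header row
        for line in fin:
            tokens = line.strip().split('\t')
            sent1, sent2, target = tokens[1], tokens[2], tokens[3]
            prediction = model.predict('sentence_classification_head',
                                       model.encode(sent1, sent2)).argmax().item()
            ncorrect += int(label_fn(prediction) == target)
            nsamples += 1
    return float(ncorrect) / float(nsamples)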
ax1, ax2, ax3, ax4) = plt.subplots(1, 5, figsize=(20, 4), sharey=True) - - bar_positions = np.arange(len(categories)) - colors = ['#62A0CA', '#FFA556', '#6BBC6B', '#E26868'] - colors1 = ['#7CABCA', '#4996CA'] - colors2 = ['#FFBC80', '#FF9A40'] - colors3 = ['#7FBC7F', '#508D50'] - colors4 = ['#E28787', '#E24E4E'] - - ax0.bar(bar_positions, all_text_img_expert[:, 0], color=colors[0], label='Expert 1') - ax0.bar(bar_positions, all_text_img_expert[:, 1], color=colors[1], bottom=all_text_img_expert[:, 0], label='Expert 2') - ax0.bar(bar_positions, all_text_img_expert[:, 2], color=colors[2], bottom=all_text_img_expert[:, 0]+all_text_img_expert[:, 1], label='Expert 3') - ax0.bar(bar_positions, all_text_img_expert[:, 3], color=colors[3], bottom=all_text_img_expert[:, 0]+all_text_img_expert[:, 1]+all_text_img_expert[:, 2], label='Expert 4') - - - ax1.bar(bar_positions, all_text_img_expert_counter[:, 0, 0], color=colors1[0], label='Text') - ax1.bar(bar_positions, all_text_img_expert_counter[:, 0, 1], color=colors1[1], bottom=all_text_img_expert_counter[:, 0, 0], label='Image') - - ax2.bar(bar_positions, all_text_img_expert_counter[:, 1, 0], color=colors2[0], label='Text') - ax2.bar(bar_positions, all_text_img_expert_counter[:, 1, 1], color=colors2[1], bottom=all_text_img_expert_counter[:, 1, 0], label='Image') - - ax3.bar(bar_positions, all_text_img_expert_counter[:, 2, 0], color=colors3[0], label='Text') - ax3.bar(bar_positions, all_text_img_expert_counter[:, 2, 1], color=colors3[1], bottom=all_text_img_expert_counter[:, 2, 0], label='Image') - - ax4.bar(bar_positions, all_text_img_expert_counter[:, 3, 0], color=colors4[0], label='Text') - ax4.bar(bar_positions, all_text_img_expert_counter[:, 3, 1], color=colors4[1], bottom=all_text_img_expert_counter[:, 3, 0], label='Image') - - - ax0.set_xlabel('MoE layer') - ax0.set_ylabel('Percentage') - ax0.set_xticks(bar_positions) - ax0.set_xticklabels(categories) - # ax0.legend(loc='upper center', ncol=4) - ax0.legend(loc='upper center', ncol=2) - ax0.set_title('All experts') - ax0.set_ylim(0, 1.25) - ax0.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0]) - ax0.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) - ax0.axhline(y=0.25, color='gray', linestyle='--') - ax0.axhline(y=0.5, color='gray', linestyle='--') - ax0.axhline(y=0.75, color='gray', linestyle='--') - - ax1.set_xlabel('MoE layer') - # ax1.set_ylabel('Percentage') - ax1.set_xticks(bar_positions) - ax1.set_xticklabels(categories) - ax1.legend(loc=(0.24, 0.85), ncol=2) - ax1.set_title('Expert 1') - ax1.set_ylim(0, 1.25) - ax1.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0]) - ax1.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) - # ax1.axhline(y=0.25, color='gray', linestyle='--') - ax1.axhline(y=0.5, color='gray', linestyle='--') - # ax1.axhline(y=0.75, color='gray', linestyle='--') - - ax2.set_xlabel('MoE layer') - # ax2.set_ylabel('Percentage') - ax2.set_xticks(bar_positions) - ax2.set_xticklabels(categories) - ax2.legend(loc=(0.24, 0.85), ncol=2) - ax2.set_title('Expert 2') - ax2.set_ylim(0, 1.25) - ax2.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0]) - ax2.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) - # ax2.axhline(y=0.25, color='gray', linestyle='--') - ax2.axhline(y=0.5, color='gray', linestyle='--') - # ax2.axhline(y=0.75, color='gray', linestyle='--') - - ax3.set_xlabel('MoE layer') - # ax3.set_ylabel('Percentage') - ax3.set_xticks(bar_positions) - ax3.set_xticklabels(categories) - ax3.legend(loc=(0.24, 0.85), ncol=2) - ax3.set_title('Expert 3') - ax3.set_ylim(0, 1.25) - ax3.set_yticks([0.0, 0.25, 
0.5, 0.75, 1.0]) - ax3.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) - # ax3.axhline(y=0.25, color='gray', linestyle='--') - ax3.axhline(y=0.5, color='gray', linestyle='--') - # ax3.axhline(y=0.75, color='gray', linestyle='--') - - ax4.set_xlabel('MoE layer') - # ax4.set_ylabel('Percentage') - ax4.set_xticks(bar_positions) - ax4.set_xticklabels(categories) - ax4.legend(loc=(0.24, 0.85), ncol=2) - ax4.set_title('Expert 4') - ax4.set_ylim(0, 1.25) - ax4.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0]) - ax4.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) - # ax4.axhline(y=0.25, color='gray', linestyle='--') - ax4.axhline(y=0.5, color='gray', linestyle='--') - # ax4.axhline(y=0.75, color='gray', linestyle='--') - - plt.tight_layout() - if args.output is not None: - plt.savefig(args.output) - else: - plt.show() - - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--input", type=str, default='phi_sciqa.pt') - parser.add_argument("--output", type=str, default=None) - args = parser.parse_args() - - draw(args) +import torch +from tqdm import tqdm +from torch.nn import functional as F +from collections import Counter +import numpy as np +import argparse +import matplotlib.pyplot as plt + + +def draw(args): + data = torch.load(args.input) + all_text_img_expert_counter_list = [] + for k, v in tqdm(data.items()): + gating_logit = v['gating_logit'] + images = v['images'][0] if v['images'] is not None else v['images'] + input_ids = v['input_ids'][0].tolist() + output_ids = v['output_ids'][0].tolist() + gating_logit = v['gating_logit'] + num_moe_layers = len(gating_logit) + + if images is not None: + assert gating_logit[0].shape[0] + 1 == len(output_ids) + 575 + img_idx = output_ids.index(-200) + output_ids = output_ids[:img_idx] + [-200] * 576 + output_ids[img_idx+1:] + + text_img_expert_counters = [] + for layer_idx, logits in enumerate(gating_logit): + + assert logits.shape[0] + 1 == len(output_ids) # double check + num_expert = logits.shape[1] + gates = F.softmax(logits, dim=1) + indices1_s = torch.argmax(gates, dim=1) # Create a mask for 1st's expert per token + mask1 = F.one_hot(indices1_s, num_classes=int(gates.shape[1])) + exp_counts = torch.sum(mask1, dim=0).detach().to('cpu') # gating decisions + + text_indices1_s = torch.cat([indices1_s[:img_idx], indices1_s[img_idx+576+1:]]) + img_indices1_s = indices1_s[img_idx:img_idx+576] + text_expert_counter = Counter(text_indices1_s.tolist()) + img_expert_counter = Counter(img_indices1_s.tolist()) + + len_text = len(text_indices1_s) + len_img = len(img_indices1_s) + scale = len_img / len_text + + text_img_expert_counter_list = [[int(text_expert_counter[k] * scale), img_expert_counter[k]] for k in range(num_expert)] + + text_img_expert_counters.append(text_img_expert_counter_list) + all_text_img_expert_counter_list.append(text_img_expert_counters) + print() + + all_text_img_expert_counter_list = np.array(all_text_img_expert_counter_list) + all_text_img_expert_counter = all_text_img_expert_counter_list / np.sum(all_text_img_expert_counter_list, axis=-1, keepdims=True) + all_text_img_expert_counter = np.mean(all_text_img_expert_counter, axis=0) + + all_text_img_expert = np.sum(all_text_img_expert_counter_list, axis=-1) + all_text_img_expert = all_text_img_expert / np.sum(all_text_img_expert, axis=-1, keepdims=True) + all_text_img_expert = np.mean(all_text_img_expert, axis=0) + + num_layer = all_text_img_expert_counter.shape[0] + categories = [i*2+1 for i in range(num_layer)] + fig, (ax0, ax1, ax2, ax3, ax4) = 
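The gating analysis in vis1.py (and in vis2.py below) boils down to one statistic per MoE layer: the fraction of tokens whose top-1 gate selects each expert. That core step, isolated as a sketch with random logits standing in for a saved gating_logit entry:

import torch
import torch.nn.functional as F

def top1_expert_fractions(logits: torch.Tensor) -> torch.Tensor:
    gates = F.softmax(logits, dim=1)                      # (num_tokens, num_experts)
    top1 = torch.argmax(gates, dim=1)                     # chosen expert per token
    counts = F.one_hot(top1, num_classes=logits.shape[1]).sum(dim=0)
    return counts.float() / logits.shape[0]               # per-expert fractions, sum to 1

print(top1_expert_fractions(torch.randn(512, 4)))         # e.g. tensor([0.26, 0.25, 0.24, 0.25])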
plt.subplots(1, 5, figsize=(20, 4), sharey=True) + + bar_positions = np.arange(len(categories)) + colors = ['#62A0CA', '#FFA556', '#6BBC6B', '#E26868'] + colors1 = ['#7CABCA', '#4996CA'] + colors2 = ['#FFBC80', '#FF9A40'] + colors3 = ['#7FBC7F', '#508D50'] + colors4 = ['#E28787', '#E24E4E'] + + ax0.bar(bar_positions, all_text_img_expert[:, 0], color=colors[0], label='Expert 1') + ax0.bar(bar_positions, all_text_img_expert[:, 1], color=colors[1], bottom=all_text_img_expert[:, 0], label='Expert 2') + ax0.bar(bar_positions, all_text_img_expert[:, 2], color=colors[2], bottom=all_text_img_expert[:, 0]+all_text_img_expert[:, 1], label='Expert 3') + ax0.bar(bar_positions, all_text_img_expert[:, 3], color=colors[3], bottom=all_text_img_expert[:, 0]+all_text_img_expert[:, 1]+all_text_img_expert[:, 2], label='Expert 4') + + + ax1.bar(bar_positions, all_text_img_expert_counter[:, 0, 0], color=colors1[0], label='Text') + ax1.bar(bar_positions, all_text_img_expert_counter[:, 0, 1], color=colors1[1], bottom=all_text_img_expert_counter[:, 0, 0], label='Image') + + ax2.bar(bar_positions, all_text_img_expert_counter[:, 1, 0], color=colors2[0], label='Text') + ax2.bar(bar_positions, all_text_img_expert_counter[:, 1, 1], color=colors2[1], bottom=all_text_img_expert_counter[:, 1, 0], label='Image') + + ax3.bar(bar_positions, all_text_img_expert_counter[:, 2, 0], color=colors3[0], label='Text') + ax3.bar(bar_positions, all_text_img_expert_counter[:, 2, 1], color=colors3[1], bottom=all_text_img_expert_counter[:, 2, 0], label='Image') + + ax4.bar(bar_positions, all_text_img_expert_counter[:, 3, 0], color=colors4[0], label='Text') + ax4.bar(bar_positions, all_text_img_expert_counter[:, 3, 1], color=colors4[1], bottom=all_text_img_expert_counter[:, 3, 0], label='Image') + + + ax0.set_xlabel('MoE layer') + ax0.set_ylabel('Percentage') + ax0.set_xticks(bar_positions) + ax0.set_xticklabels(categories) + # ax0.legend(loc='upper center', ncol=4) + ax0.legend(loc='upper center', ncol=2) + ax0.set_title('All experts') + ax0.set_ylim(0, 1.25) + ax0.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0]) + ax0.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) + ax0.axhline(y=0.25, color='gray', linestyle='--') + ax0.axhline(y=0.5, color='gray', linestyle='--') + ax0.axhline(y=0.75, color='gray', linestyle='--') + + ax1.set_xlabel('MoE layer') + # ax1.set_ylabel('Percentage') + ax1.set_xticks(bar_positions) + ax1.set_xticklabels(categories) + ax1.legend(loc=(0.24, 0.85), ncol=2) + ax1.set_title('Expert 1') + ax1.set_ylim(0, 1.25) + ax1.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0]) + ax1.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) + # ax1.axhline(y=0.25, color='gray', linestyle='--') + ax1.axhline(y=0.5, color='gray', linestyle='--') + # ax1.axhline(y=0.75, color='gray', linestyle='--') + + ax2.set_xlabel('MoE layer') + # ax2.set_ylabel('Percentage') + ax2.set_xticks(bar_positions) + ax2.set_xticklabels(categories) + ax2.legend(loc=(0.24, 0.85), ncol=2) + ax2.set_title('Expert 2') + ax2.set_ylim(0, 1.25) + ax2.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0]) + ax2.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) + # ax2.axhline(y=0.25, color='gray', linestyle='--') + ax2.axhline(y=0.5, color='gray', linestyle='--') + # ax2.axhline(y=0.75, color='gray', linestyle='--') + + ax3.set_xlabel('MoE layer') + # ax3.set_ylabel('Percentage') + ax3.set_xticks(bar_positions) + ax3.set_xticklabels(categories) + ax3.legend(loc=(0.24, 0.85), ncol=2) + ax3.set_title('Expert 3') + ax3.set_ylim(0, 1.25) + ax3.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0]) + 
ax3.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) + # ax3.axhline(y=0.25, color='gray', linestyle='--') + ax3.axhline(y=0.5, color='gray', linestyle='--') + # ax3.axhline(y=0.75, color='gray', linestyle='--') + + ax4.set_xlabel('MoE layer') + # ax4.set_ylabel('Percentage') + ax4.set_xticks(bar_positions) + ax4.set_xticklabels(categories) + ax4.legend(loc=(0.24, 0.85), ncol=2) + ax4.set_title('Expert 4') + ax4.set_ylim(0, 1.25) + ax4.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0]) + ax4.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) + # ax4.axhline(y=0.25, color='gray', linestyle='--') + ax4.axhline(y=0.5, color='gray', linestyle='--') + # ax4.axhline(y=0.75, color='gray', linestyle='--') + + plt.tight_layout() + if args.output is not None: + plt.savefig(args.output) + else: + plt.show() + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", type=str, default='phi_sciqa.pt') + parser.add_argument("--output", type=str, default=None) + args = parser.parse_args() + + draw(args) diff --git a/toolbox/MoE-LLaVA/moellava/vis/vis2.py b/toolbox/MoE-LLaVA/moellava/vis/vis2.py index f7977fc44..5d7738c78 100644 --- a/toolbox/MoE-LLaVA/moellava/vis/vis2.py +++ b/toolbox/MoE-LLaVA/moellava/vis/vis2.py @@ -1,122 +1,122 @@ -import argparse -import torch -from tqdm import tqdm -from torch.nn import functional as F -from collections import Counter -import numpy as np -import matplotlib.pyplot as plt - -def draw(args): - data = torch.load(args.input) - all_text_expert_counter_list = [] - all_img_expert_counter_list = [] - for k, v in tqdm(data.items()): - gating_logit = v['gating_logit'] - images = v['images'][0] if v['images'] is not None else v['images'] - input_ids = v['input_ids'][0].tolist() - output_ids = v['output_ids'][0].tolist() - gating_logit = v['gating_logit'] - num_moe_layers = len(gating_logit) - - if images is not None: - assert gating_logit[0].shape[0] + 1 == len(output_ids) + 575 - img_idx = output_ids.index(-200) - output_ids = output_ids[:img_idx] + [-200] * 576 + output_ids[img_idx+1:] - - text_expert_counters = [] - img_expert_counters = [] - for layer_idx, logits in enumerate(gating_logit): - - assert logits.shape[0] + 1 == len(output_ids) # double check - num_expert = logits.shape[1] - gates = F.softmax(logits, dim=1) - indices1_s = torch.argmax(gates, dim=1) # Create a mask for 1st's expert per token - mask1 = F.one_hot(indices1_s, num_classes=int(gates.shape[1])) - exp_counts = torch.sum(mask1, dim=0).detach().to('cpu') # gating decisions - - text_indices1_s = torch.cat([indices1_s[:img_idx], indices1_s[img_idx+576+1:]]) - img_indices1_s = indices1_s[img_idx:img_idx+576] - text_expert_counter = Counter(text_indices1_s.tolist()) - img_expert_counter = Counter(img_indices1_s.tolist()) - - - text_expert_counter_list = [text_expert_counter[k] for k in range(num_expert)] - img_expert_counter_list = [img_expert_counter[k] for k in range(num_expert)] - - - text_expert_counters.append(text_expert_counter_list) - img_expert_counters.append(img_expert_counter_list) - all_text_expert_counter_list.append(text_expert_counters) - all_img_expert_counter_list.append(img_expert_counters) - print() - - all_text_expert_counter_list = np.array(all_text_expert_counter_list) - all_img_expert_counter_list = np.array(all_img_expert_counter_list) - - all_text_expert_counter = np.sum(all_text_expert_counter_list, axis=-1, keepdims=True) - all_text_expert_counter = np.mean(all_text_expert_counter_list / all_text_expert_counter, axis=0) - - 
all_img_expert_counter = np.sum(all_img_expert_counter_list, axis=-1, keepdims=True) - all_img_expert_counter = np.mean(all_img_expert_counter_list / all_img_expert_counter, axis=0) - - - - num_layer = all_text_expert_counter.shape[0] - categories = [i*2+1 for i in range(num_layer)] - fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4), sharey=True) - - bar_positions = np.arange(len(categories)) - - colors = ['#62A0CA', '#FFA556', '#6BBC6B', '#E26868'] - ax1.bar(bar_positions, all_text_expert_counter[:, 0], color=colors[0], label='Expert 1') - for i in range(1, num_expert): - ax1.bar(bar_positions, all_text_expert_counter[:, i], bottom=np.sum(all_text_expert_counter[:, :i], axis=1), color=colors[i], label=f'Expert {i+1}') - - ax2.bar(bar_positions, all_img_expert_counter[:, 0], color=colors[0], label='Expert 1') - for i in range(1, num_expert): - ax2.bar(bar_positions, all_img_expert_counter[:, i], bottom=np.sum(all_img_expert_counter[:, :i], axis=1), color=colors[i], label=f'Expert {i+1}') - - # 设置x轴标签、标题和图例 - ax1.set_xlabel('MoE layer idx') - ax1.set_ylabel('Percentage') - ax1.set_xticks(bar_positions) - ax1.set_xticklabels(categories) - ax1.legend(loc='upper center', ncol=2) - ax1.set_title('Text') - ax1.set_ylim(0, 1.25) - ax1.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0]) - ax1.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) - ax1.axhline(y=0.25, color='gray', linestyle='--') - ax1.axhline(y=0.5, color='gray', linestyle='--') - ax1.axhline(y=0.75, color='gray', linestyle='--') - - # 设置x轴标签、标题和图例 - ax2.set_xlabel('MoE layer idx') - # ax2.set_ylabel('Percentage') - ax2.set_xticks(bar_positions) - ax2.set_xticklabels(categories) - ax2.legend(loc='upper center', ncol=2) - ax2.set_title('Image') - ax2.set_ylim(0, 1.25) - ax2.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0]) - ax2.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) - ax2.axhline(y=0.25, color='gray', linestyle='--') - ax2.axhline(y=0.5, color='gray', linestyle='--') - ax2.axhline(y=0.75, color='gray', linestyle='--') - - # 显示图形 - plt.tight_layout() - if args.output is not None: - plt.savefig(args.output) - else: - plt.show() - - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--input", type=str, default='phi_sciqa.pt') - parser.add_argument("--output", type=str, default=None) - args = parser.parse_args() - - draw(args) +import argparse +import torch +from tqdm import tqdm +from torch.nn import functional as F +from collections import Counter +import numpy as np +import matplotlib.pyplot as plt + +def draw(args): + data = torch.load(args.input) + all_text_expert_counter_list = [] + all_img_expert_counter_list = [] + for k, v in tqdm(data.items()): + gating_logit = v['gating_logit'] + images = v['images'][0] if v['images'] is not None else v['images'] + input_ids = v['input_ids'][0].tolist() + output_ids = v['output_ids'][0].tolist() + gating_logit = v['gating_logit'] + num_moe_layers = len(gating_logit) + + if images is not None: + assert gating_logit[0].shape[0] + 1 == len(output_ids) + 575 + img_idx = output_ids.index(-200) + output_ids = output_ids[:img_idx] + [-200] * 576 + output_ids[img_idx+1:] + + text_expert_counters = [] + img_expert_counters = [] + for layer_idx, logits in enumerate(gating_logit): + + assert logits.shape[0] + 1 == len(output_ids) # double check + num_expert = logits.shape[1] + gates = F.softmax(logits, dim=1) + indices1_s = torch.argmax(gates, dim=1) # Create a mask for 1st's expert per token + mask1 = F.one_hot(indices1_s, num_classes=int(gates.shape[1])) + 
exp_counts = torch.sum(mask1, dim=0).detach().to('cpu') # gating decisions + + text_indices1_s = torch.cat([indices1_s[:img_idx], indices1_s[img_idx+576+1:]]) + img_indices1_s = indices1_s[img_idx:img_idx+576] + text_expert_counter = Counter(text_indices1_s.tolist()) + img_expert_counter = Counter(img_indices1_s.tolist()) + + + text_expert_counter_list = [text_expert_counter[k] for k in range(num_expert)] + img_expert_counter_list = [img_expert_counter[k] for k in range(num_expert)] + + + text_expert_counters.append(text_expert_counter_list) + img_expert_counters.append(img_expert_counter_list) + all_text_expert_counter_list.append(text_expert_counters) + all_img_expert_counter_list.append(img_expert_counters) + print() + + all_text_expert_counter_list = np.array(all_text_expert_counter_list) + all_img_expert_counter_list = np.array(all_img_expert_counter_list) + + all_text_expert_counter = np.sum(all_text_expert_counter_list, axis=-1, keepdims=True) + all_text_expert_counter = np.mean(all_text_expert_counter_list / all_text_expert_counter, axis=0) + + all_img_expert_counter = np.sum(all_img_expert_counter_list, axis=-1, keepdims=True) + all_img_expert_counter = np.mean(all_img_expert_counter_list / all_img_expert_counter, axis=0) + + + + num_layer = all_text_expert_counter.shape[0] + categories = [i*2+1 for i in range(num_layer)] + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4), sharey=True) + + bar_positions = np.arange(len(categories)) + + colors = ['#62A0CA', '#FFA556', '#6BBC6B', '#E26868'] + ax1.bar(bar_positions, all_text_expert_counter[:, 0], color=colors[0], label='Expert 1') + for i in range(1, num_expert): + ax1.bar(bar_positions, all_text_expert_counter[:, i], bottom=np.sum(all_text_expert_counter[:, :i], axis=1), color=colors[i], label=f'Expert {i+1}') + + ax2.bar(bar_positions, all_img_expert_counter[:, 0], color=colors[0], label='Expert 1') + for i in range(1, num_expert): + ax2.bar(bar_positions, all_img_expert_counter[:, i], bottom=np.sum(all_img_expert_counter[:, :i], axis=1), color=colors[i], label=f'Expert {i+1}') + + # 设置x轴标签、标题和图例 + ax1.set_xlabel('MoE layer idx') + ax1.set_ylabel('Percentage') + ax1.set_xticks(bar_positions) + ax1.set_xticklabels(categories) + ax1.legend(loc='upper center', ncol=2) + ax1.set_title('Text') + ax1.set_ylim(0, 1.25) + ax1.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0]) + ax1.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) + ax1.axhline(y=0.25, color='gray', linestyle='--') + ax1.axhline(y=0.5, color='gray', linestyle='--') + ax1.axhline(y=0.75, color='gray', linestyle='--') + + # 设置x轴标签、标题和图例 + ax2.set_xlabel('MoE layer idx') + # ax2.set_ylabel('Percentage') + ax2.set_xticks(bar_positions) + ax2.set_xticklabels(categories) + ax2.legend(loc='upper center', ncol=2) + ax2.set_title('Image') + ax2.set_ylim(0, 1.25) + ax2.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0]) + ax2.set_yticklabels(['0%', '25%', '50%', '75%', '100%']) + ax2.axhline(y=0.25, color='gray', linestyle='--') + ax2.axhline(y=0.5, color='gray', linestyle='--') + ax2.axhline(y=0.75, color='gray', linestyle='--') + + # 显示图形 + plt.tight_layout() + if args.output is not None: + plt.savefig(args.output) + else: + plt.show() + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", type=str, default='phi_sciqa.pt') + parser.add_argument("--output", type=str, default=None) + args = parser.parse_args() + + draw(args) diff --git a/toolbox/MoE-LLaVA/moellava/vis/vis3.py b/toolbox/MoE-LLaVA/moellava/vis/vis3.py index 422bb7a39..08ca1904d 100644 
--- a/toolbox/MoE-LLaVA/moellava/vis/vis3.py +++ b/toolbox/MoE-LLaVA/moellava/vis/vis3.py @@ -1,181 +1,181 @@ -import argparse -import matplotlib.pyplot as plt -import numpy as np -from matplotlib.lines import Line2D -from mplsoccer import Bumpy, FontManager, add_image -import torch -from tqdm import tqdm -from torch.nn import functional as F - - -def draw(args): - data = torch.load(args.input) - all_text_token_path = [] - all_img_token_path = [] - for k, v in tqdm(data.items()): - gating_logit = v['gating_logit'] - images = v['images'][0] if v['images'] is not None else v['images'] - input_ids = v['input_ids'][0].tolist() - output_ids = v['output_ids'][0].tolist() - gating_logit = v['gating_logit'] - num_moe_layers = len(gating_logit) - - if images is not None: - assert gating_logit[0].shape[0] + 1 == len(output_ids) + 575 - img_idx = output_ids.index(-200) - output_ids = output_ids[:img_idx] + [-200] * 576 + output_ids[img_idx+1:] - - text_token_path = [] - img_token_path = [] - for layer_idx, logits in enumerate(gating_logit): - - assert logits.shape[0] + 1 == len(output_ids) # double check - num_expert = logits.shape[1] - gates = F.softmax(logits, dim=1) - indices1_s = torch.argmax(gates, dim=1) # Create a mask for 1st's expert per token - mask1 = F.one_hot(indices1_s, num_classes=int(gates.shape[1])) - exp_counts = torch.sum(mask1, dim=0).detach().to('cpu') # gating decisions - - text_indices1_s = torch.cat([indices1_s[:img_idx], indices1_s[img_idx+576+1:]]) - img_indices1_s = indices1_s[img_idx:img_idx+576] - - text_token_path.append(text_indices1_s) - img_token_path.append(img_indices1_s) - text_token_path = torch.stack(text_token_path).T # 每个token沿层的路径 - img_token_path = torch.stack(img_token_path).T # 每个token沿层的路径 - - all_text_token_path.append(text_token_path) - all_img_token_path.append(img_token_path) - print() - - all_text_token_path = torch.cat(all_text_token_path, dim=0) - all_img_token_path = torch.cat(all_img_token_path, dim=0) - - from sklearn.decomposition import PCA - pca = PCA(n_components=12) - new_all_text_token_path = pca.fit_transform(all_text_token_path.T) - new_all_text_token_path = new_all_text_token_path.T - new_all_text_token_path = ((new_all_text_token_path - np.min(new_all_text_token_path, keepdims=True, axis=1)) / (np.max(new_all_text_token_path, keepdims=True, axis=1) - np.min(new_all_text_token_path, keepdims=True, axis=1))) - new_all_text_token_path = np.clip(new_all_text_token_path, a_min=0.01, a_max=0.99) - new_all_text_token_path = {'text_'+str(n+1): [int(i // (1/num_expert) + 1) for i in j] for n, j in enumerate(new_all_text_token_path)} - - - from sklearn.decomposition import PCA - pca = PCA(n_components=12) - new_all_img_token_path = pca.fit_transform(all_img_token_path.T) - new_all_img_token_path = new_all_img_token_path.T - new_all_img_token_path = ((new_all_img_token_path - np.min(new_all_img_token_path, keepdims=True, axis=1)) / (np.max(new_all_img_token_path, keepdims=True, axis=1) - np.min(new_all_img_token_path, keepdims=True, axis=1))) - new_all_img_token_path = np.clip(new_all_img_token_path, a_min=0.01, a_max=0.99) - new_all_img_token_path = {'img_'+str(n+1): [int(i // (1/num_expert) + 1) for i in j] for n, j in enumerate(new_all_img_token_path)} - - - # new_all_text_token_path.update(new_all_img_token_path) - # highlight dict --> team to highlight and their corresponding colors - highlight_dict = { - 'text_1': "#BF5029", - 'text_2': "#FF9069" - } - - # match-week - match_day = [str(num*2+1) for num in range(num_moe_layers)] - - - fig, 
(ax1, ax2) = plt.subplots(2, 1, figsize=(16, 8), sharex=True) - - # instantiate object - bumpy = Bumpy( - background_color="#FFFFFF", scatter_color="#808080", - label_color="#000000", line_color="#C0C0C0", - # rotate_xticks=90, # rotate x-ticks by 90 degrees - ticklabel_size=19, label_size=23, # ticklable and label font-size - scatter_points='o', # other markers - scatter_primary='D', # marker to be used for teams - scatter_size=150, # size of the marker - show_right=False, # show position on the rightside - plot_labels=True, # plot the labels - alignment_yvalue=0.5, # y label alignment - alignment_xvalue=0.5, # x label alignment - ) - - # plot bumpy chart - bumpy.plot( - x_list=match_day, # match-day or match-week - y_list=list(range(1, num_expert+1))[::-1], # position value from 1 to 20 - values=new_all_text_token_path, # values having positions for each team - secondary_alpha=0.2, # alpha value for non-shaded lines/markers - highlight_dict=highlight_dict, # team to be highlighted with their colors - # figsize=(16, 8), # size of the figure - # x_label='MoE layer idx', - y_label='Expert idx', # label name - ylim=(0.8, num_expert+0.2), # y-axis limit - lw=2.5, # linewidth of the connecting lines - upside_down=True, - ax=ax1, - # fontproperties=font_normal.prop, # fontproperties for ticklabels/labels - ) - - highlight_dict = { - 'img_1': "#365CBF", - 'img_2': "#76BEFF" - } - - # plot bumpy chart - bumpy.plot( - x_list=match_day, # match-day or match-week - y_list=list(reversed(list(range(1, num_expert+1)))), # position value from 1 to 20 - values=new_all_img_token_path, # values having positions for each team - secondary_alpha=0.2, # alpha value for non-shaded lines/markers - highlight_dict=highlight_dict, # team to be highlighted with their colors - # figsize=(16, 8), # size of the figure - x_label='MoE layer idx', y_label='Expert idx', # label name - ylim=(0.8, num_expert+0.2), # y-axis limit - lw=2.5, # linewidth of the connecting lines - upside_down=True, - ax=ax2, - # fontproperties=font_normal.prop, # fontproperties for ticklabels/labels - ) - - - legend_elements = [Line2D([0], [0], marker='D', ms=12, color='#BF5029', lw=3.5, label='Top-1'), - Line2D([0], [0], marker='D', ms=12, color='#FF9069', lw=3.5, label='Top-2'), - Line2D([0], [0], marker='o', ms=12, color='gray', lw=3.5, label='Others')] - - ax1.spines['top'].set_visible(True) - ax1.spines['right'].set_visible(True) - ax1.spines['left'].set_visible(True) - ax1.spines['bottom'].set_visible(True) - ax1.set_title('Text', fontsize=28) - ax1.legend(handles=legend_elements, loc=(1.01, 0.25), - ncol=1, fontsize=23) - - - legend_elements = [Line2D([0], [0], marker='D', ms=12, color='#365CBF', lw=3.5, label='Top-1'), - Line2D([0], [0], marker='D', ms=12, color='#76BEFF', lw=3.5, label='Top-2'), - Line2D([0], [0], marker='o', ms=12, color='gray', lw=3.5, label='Others')] - - ax2.spines['top'].set_visible(True) - ax2.spines['right'].set_visible(True) - ax2.spines['left'].set_visible(True) - ax2.spines['bottom'].set_visible(True) - ax2.set_title('Image', fontsize=28) - ax2.legend(handles=legend_elements, loc=(1.01, 0.25), - ncol=1, fontsize=23) - - - - plt.tight_layout() - - if args.output is not None: - plt.savefig(args.output) - else: - plt.show() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--input", type=str, default='phi_sciqa.pt') - parser.add_argument("--output", type=str, default=None) - args = parser.parse_args() - - draw(args) - +import argparse +import matplotlib.pyplot as plt 
+import numpy as np +from matplotlib.lines import Line2D +from mplsoccer import Bumpy, FontManager, add_image +import torch +from tqdm import tqdm +from torch.nn import functional as F + + +def draw(args): + data = torch.load(args.input) + all_text_token_path = [] + all_img_token_path = [] + for k, v in tqdm(data.items()): + gating_logit = v['gating_logit'] + images = v['images'][0] if v['images'] is not None else v['images'] + input_ids = v['input_ids'][0].tolist() + output_ids = v['output_ids'][0].tolist() + gating_logit = v['gating_logit'] + num_moe_layers = len(gating_logit) + + if images is not None: + assert gating_logit[0].shape[0] + 1 == len(output_ids) + 575 + img_idx = output_ids.index(-200) + output_ids = output_ids[:img_idx] + [-200] * 576 + output_ids[img_idx+1:] + + text_token_path = [] + img_token_path = [] + for layer_idx, logits in enumerate(gating_logit): + + assert logits.shape[0] + 1 == len(output_ids) # double check + num_expert = logits.shape[1] + gates = F.softmax(logits, dim=1) + indices1_s = torch.argmax(gates, dim=1) # Create a mask for 1st's expert per token + mask1 = F.one_hot(indices1_s, num_classes=int(gates.shape[1])) + exp_counts = torch.sum(mask1, dim=0).detach().to('cpu') # gating decisions + + text_indices1_s = torch.cat([indices1_s[:img_idx], indices1_s[img_idx+576+1:]]) + img_indices1_s = indices1_s[img_idx:img_idx+576] + + text_token_path.append(text_indices1_s) + img_token_path.append(img_indices1_s) + text_token_path = torch.stack(text_token_path).T # 每个token沿层的路径 + img_token_path = torch.stack(img_token_path).T # 每个token沿层的路径 + + all_text_token_path.append(text_token_path) + all_img_token_path.append(img_token_path) + print() + + all_text_token_path = torch.cat(all_text_token_path, dim=0) + all_img_token_path = torch.cat(all_img_token_path, dim=0) + + from sklearn.decomposition import PCA + pca = PCA(n_components=12) + new_all_text_token_path = pca.fit_transform(all_text_token_path.T) + new_all_text_token_path = new_all_text_token_path.T + new_all_text_token_path = ((new_all_text_token_path - np.min(new_all_text_token_path, keepdims=True, axis=1)) / (np.max(new_all_text_token_path, keepdims=True, axis=1) - np.min(new_all_text_token_path, keepdims=True, axis=1))) + new_all_text_token_path = np.clip(new_all_text_token_path, a_min=0.01, a_max=0.99) + new_all_text_token_path = {'text_'+str(n+1): [int(i // (1/num_expert) + 1) for i in j] for n, j in enumerate(new_all_text_token_path)} + + + from sklearn.decomposition import PCA + pca = PCA(n_components=12) + new_all_img_token_path = pca.fit_transform(all_img_token_path.T) + new_all_img_token_path = new_all_img_token_path.T + new_all_img_token_path = ((new_all_img_token_path - np.min(new_all_img_token_path, keepdims=True, axis=1)) / (np.max(new_all_img_token_path, keepdims=True, axis=1) - np.min(new_all_img_token_path, keepdims=True, axis=1))) + new_all_img_token_path = np.clip(new_all_img_token_path, a_min=0.01, a_max=0.99) + new_all_img_token_path = {'img_'+str(n+1): [int(i // (1/num_expert) + 1) for i in j] for n, j in enumerate(new_all_img_token_path)} + + + # new_all_text_token_path.update(new_all_img_token_path) + # highlight dict --> team to highlight and their corresponding colors + highlight_dict = { + 'text_1': "#BF5029", + 'text_2': "#FF9069" + } + + # match-week + match_day = [str(num*2+1) for num in range(num_moe_layers)] + + + fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 8), sharex=True) + + # instantiate object + bumpy = Bumpy( + background_color="#FFFFFF", scatter_color="#808080", + 
label_color="#000000", line_color="#C0C0C0", + # rotate_xticks=90, # rotate x-ticks by 90 degrees + ticklabel_size=19, label_size=23, # ticklable and label font-size + scatter_points='o', # other markers + scatter_primary='D', # marker to be used for teams + scatter_size=150, # size of the marker + show_right=False, # show position on the rightside + plot_labels=True, # plot the labels + alignment_yvalue=0.5, # y label alignment + alignment_xvalue=0.5, # x label alignment + ) + + # plot bumpy chart + bumpy.plot( + x_list=match_day, # match-day or match-week + y_list=list(range(1, num_expert+1))[::-1], # position value from 1 to 20 + values=new_all_text_token_path, # values having positions for each team + secondary_alpha=0.2, # alpha value for non-shaded lines/markers + highlight_dict=highlight_dict, # team to be highlighted with their colors + # figsize=(16, 8), # size of the figure + # x_label='MoE layer idx', + y_label='Expert idx', # label name + ylim=(0.8, num_expert+0.2), # y-axis limit + lw=2.5, # linewidth of the connecting lines + upside_down=True, + ax=ax1, + # fontproperties=font_normal.prop, # fontproperties for ticklabels/labels + ) + + highlight_dict = { + 'img_1': "#365CBF", + 'img_2': "#76BEFF" + } + + # plot bumpy chart + bumpy.plot( + x_list=match_day, # match-day or match-week + y_list=list(reversed(list(range(1, num_expert+1)))), # position value from 1 to 20 + values=new_all_img_token_path, # values having positions for each team + secondary_alpha=0.2, # alpha value for non-shaded lines/markers + highlight_dict=highlight_dict, # team to be highlighted with their colors + # figsize=(16, 8), # size of the figure + x_label='MoE layer idx', y_label='Expert idx', # label name + ylim=(0.8, num_expert+0.2), # y-axis limit + lw=2.5, # linewidth of the connecting lines + upside_down=True, + ax=ax2, + # fontproperties=font_normal.prop, # fontproperties for ticklabels/labels + ) + + + legend_elements = [Line2D([0], [0], marker='D', ms=12, color='#BF5029', lw=3.5, label='Top-1'), + Line2D([0], [0], marker='D', ms=12, color='#FF9069', lw=3.5, label='Top-2'), + Line2D([0], [0], marker='o', ms=12, color='gray', lw=3.5, label='Others')] + + ax1.spines['top'].set_visible(True) + ax1.spines['right'].set_visible(True) + ax1.spines['left'].set_visible(True) + ax1.spines['bottom'].set_visible(True) + ax1.set_title('Text', fontsize=28) + ax1.legend(handles=legend_elements, loc=(1.01, 0.25), + ncol=1, fontsize=23) + + + legend_elements = [Line2D([0], [0], marker='D', ms=12, color='#365CBF', lw=3.5, label='Top-1'), + Line2D([0], [0], marker='D', ms=12, color='#76BEFF', lw=3.5, label='Top-2'), + Line2D([0], [0], marker='o', ms=12, color='gray', lw=3.5, label='Others')] + + ax2.spines['top'].set_visible(True) + ax2.spines['right'].set_visible(True) + ax2.spines['left'].set_visible(True) + ax2.spines['bottom'].set_visible(True) + ax2.set_title('Image', fontsize=28) + ax2.legend(handles=legend_elements, loc=(1.01, 0.25), + ncol=1, fontsize=23) + + + + plt.tight_layout() + + if args.output is not None: + plt.savefig(args.output) + else: + plt.show() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", type=str, default='phi_sciqa.pt') + parser.add_argument("--output", type=str, default=None) + args = parser.parse_args() + + draw(args) + diff --git a/toolbox/openpcdet/pcdet/datasets/custom/custom_dataset.py b/toolbox/openpcdet/pcdet/datasets/custom/custom_dataset.py index 3715210b1..0d294a809 100644 --- 
a/toolbox/openpcdet/pcdet/datasets/custom/custom_dataset.py +++ b/toolbox/openpcdet/pcdet/datasets/custom/custom_dataset.py @@ -1,283 +1,283 @@ -import copy -import pickle -import os - -import numpy as np - -from ...ops.roiaware_pool3d import roiaware_pool3d_utils -from ...utils import box_utils, common_utils -from ..dataset import DatasetTemplate - - -class CustomDataset(DatasetTemplate): - def __init__(self, dataset_cfg, class_names, training=True, root_path=None, logger=None): - """ - Args: - root_path: - dataset_cfg: - class_names: - training: - logger: - """ - super().__init__( - dataset_cfg=dataset_cfg, class_names=class_names, training=training, root_path=root_path, logger=logger - ) - self.split = self.dataset_cfg.DATA_SPLIT[self.mode] - - split_dir = os.path.join(self.root_path, 'ImageSets', (self.split + '.txt')) - self.sample_id_list = [x.strip() for x in open(split_dir).readlines()] if os.path.exists(split_dir) else None - - self.custom_infos = [] - self.include_data(self.mode) - self.map_class_to_kitti = self.dataset_cfg.MAP_CLASS_TO_KITTI - - def include_data(self, mode): - self.logger.info('Loading Custom dataset.') - custom_infos = [] - - for info_path in self.dataset_cfg.INFO_PATH[mode]: - info_path = self.root_path / info_path - if not info_path.exists(): - continue - with open(info_path, 'rb') as f: - infos = pickle.load(f) - custom_infos.extend(infos) - - self.custom_infos.extend(custom_infos) - self.logger.info('Total samples for CUSTOM dataset: %d' % (len(custom_infos))) - - def get_label(self, idx): - label_file = self.root_path / 'labels' / ('%s.txt' % idx) - assert label_file.exists() - with open(label_file, 'r') as f: - lines = f.readlines() - - # [N, 8]: (x y z dx dy dz heading_angle category_id) - gt_boxes = [] - gt_names = [] - for line in lines: - line_list = line.strip().split(' ') - gt_boxes.append(line_list[:-1]) - gt_names.append(line_list[-1]) - - return np.array(gt_boxes, dtype=np.float32), np.array(gt_names) - - def get_lidar(self, idx): - lidar_file = self.root_path / 'points' / ('%s.npy' % idx) - assert lidar_file.exists() - point_features = np.load(lidar_file) - return point_features - - def set_split(self, split): - super().__init__( - dataset_cfg=self.dataset_cfg, class_names=self.class_names, training=self.training, - root_path=self.root_path, logger=self.logger - ) - self.split = split - - split_dir = self.root_path / 'ImageSets' / (self.split + '.txt') - self.sample_id_list = [x.strip() for x in open(split_dir).readlines()] if split_dir.exists() else None - - def __len__(self): - if self._merge_all_iters_to_one_epoch: - return len(self.sample_id_list) * self.total_epochs - - return len(self.custom_infos) - - def __getitem__(self, index): - if self._merge_all_iters_to_one_epoch: - index = index % len(self.custom_infos) - - info = copy.deepcopy(self.custom_infos[index]) - sample_idx = info['point_cloud']['lidar_idx'] - points = self.get_lidar(sample_idx) - input_dict = { - 'frame_id': self.sample_id_list[index], - 'points': points - } - - if 'annos' in info: - annos = info['annos'] - annos = common_utils.drop_info_with_name(annos, name='DontCare') - gt_names = annos['name'] - gt_boxes_lidar = annos['gt_boxes_lidar'] - input_dict.update({ - 'gt_names': gt_names, - 'gt_boxes': gt_boxes_lidar - }) - - data_dict = self.prepare_data(data_dict=input_dict) - - return data_dict - - def evaluation(self, det_annos, class_names, **kwargs): - if 'annos' not in self.custom_infos[0].keys(): - return 'No ground-truth boxes for evaluation', {} - - def 
kitti_eval(eval_det_annos, eval_gt_annos, map_name_to_kitti): - from ..kitti.kitti_object_eval_python import eval as kitti_eval - from ..kitti import kitti_utils - - kitti_utils.transform_annotations_to_kitti_format(eval_det_annos, map_name_to_kitti=map_name_to_kitti) - kitti_utils.transform_annotations_to_kitti_format( - eval_gt_annos, map_name_to_kitti=map_name_to_kitti, - info_with_fakelidar=self.dataset_cfg.get('INFO_WITH_FAKELIDAR', False) - ) - kitti_class_names = [map_name_to_kitti[x] for x in class_names] - ap_result_str, ap_dict = kitti_eval.get_official_eval_result( - gt_annos=eval_gt_annos, dt_annos=eval_det_annos, current_classes=kitti_class_names - ) - return ap_result_str, ap_dict - - eval_det_annos = copy.deepcopy(det_annos) - eval_gt_annos = [copy.deepcopy(info['annos']) for info in self.custom_infos] - - if kwargs['eval_metric'] == 'kitti': - ap_result_str, ap_dict = kitti_eval(eval_det_annos, eval_gt_annos, self.map_class_to_kitti) - else: - raise NotImplementedError - - return ap_result_str, ap_dict - - def get_infos(self, class_names, num_workers=4, has_label=True, sample_id_list=None, num_features=4): - import concurrent.futures as futures - - def process_single_scene(sample_idx): - print('%s sample_idx: %s' % (self.split, sample_idx)) - info = {} - pc_info = {'num_features': num_features, 'lidar_idx': sample_idx} - info['point_cloud'] = pc_info - - if has_label: - annotations = {} - gt_boxes_lidar, name = self.get_label(sample_idx) - annotations['name'] = name - annotations['gt_boxes_lidar'] = gt_boxes_lidar[:, :7] - info['annos'] = annotations - - return info - - sample_id_list = sample_id_list if sample_id_list is not None else self.sample_id_list - - # create a thread pool to improve the velocity - with futures.ThreadPoolExecutor(num_workers) as executor: - infos = executor.map(process_single_scene, sample_id_list) - return list(infos) - - def create_groundtruth_database(self, info_path=None, used_classes=None, split='train'): - import torch - - database_save_path = Path(self.root_path) / ('gt_database' if split == 'train' else ('gt_database_%s' % split)) - db_info_save_path = Path(self.root_path) / ('custom_dbinfos_%s.pkl' % split) - - database_save_path.mkdir(parents=True, exist_ok=True) - all_db_infos = {} - - with open(info_path, 'rb') as f: - infos = pickle.load(f) - - for k in range(len(infos)): - print('gt_database sample: %d/%d' % (k + 1, len(infos))) - info = infos[k] - sample_idx = info['point_cloud']['lidar_idx'] - points = self.get_lidar(sample_idx) - annos = info['annos'] - names = annos['name'] - gt_boxes = annos['gt_boxes_lidar'] - - num_obj = gt_boxes.shape[0] - point_indices = roiaware_pool3d_utils.points_in_boxes_cpu( - torch.from_numpy(points[:, 0:3]), torch.from_numpy(gt_boxes) - ).numpy() # (nboxes, npoints) - - for i in range(num_obj): - filename = '%s_%s_%d.bin' % (sample_idx, names[i], i) - filepath = database_save_path / filename - gt_points = points[point_indices[i] > 0] - - gt_points[:, :3] -= gt_boxes[i, :3] - with open(filepath, 'w') as f: - gt_points.tofile(f) - - if (used_classes is None) or names[i] in used_classes: - db_path = str(filepath.relative_to(self.root_path)) # gt_database/xxxxx.bin - db_info = {'name': names[i], 'path': db_path, 'gt_idx': i, - 'box3d_lidar': gt_boxes[i], 'num_points_in_gt': gt_points.shape[0]} - if names[i] in all_db_infos: - all_db_infos[names[i]].append(db_info) - else: - all_db_infos[names[i]] = [db_info] - - # Output the num of all classes in database - for k, v in all_db_infos.items(): - 
print('Database %s: %d' % (k, len(v))) - - with open(db_info_save_path, 'wb') as f: - pickle.dump(all_db_infos, f) - - @staticmethod - def create_label_file_with_name_and_box(class_names, gt_names, gt_boxes, save_label_path): - with open(save_label_path, 'w') as f: - for idx in range(gt_boxes.shape[0]): - boxes = gt_boxes[idx] - name = gt_names[idx] - if name not in class_names: - continue - line = "{x} {y} {z} {l} {w} {h} {angle} {name}\n".format( - x=boxes[0], y=boxes[1], z=(boxes[2]), l=boxes[3], - w=boxes[4], h=boxes[5], angle=boxes[6], name=name - ) - f.write(line) - - -def create_custom_infos(dataset_cfg, class_names, data_path, save_path, workers=4): - dataset = CustomDataset( - dataset_cfg=dataset_cfg, class_names=class_names, root_path=data_path, - training=False, logger=common_utils.create_logger() - ) - train_split, val_split = 'train', 'val' - num_features = len(dataset_cfg.POINT_FEATURE_ENCODING.src_feature_list) - - train_filename = save_path / ('custom_infos_%s.pkl' % train_split) - val_filename = save_path / ('custom_infos_%s.pkl' % val_split) - - print('------------------------Start to generate data infos------------------------') - - dataset.set_split(train_split) - custom_infos_train = dataset.get_infos( - class_names, num_workers=workers, has_label=True, num_features=num_features - ) - with open(train_filename, 'wb') as f: - pickle.dump(custom_infos_train, f) - print('Custom info train file is saved to %s' % train_filename) - - dataset.set_split(val_split) - custom_infos_val = dataset.get_infos( - class_names, num_workers=workers, has_label=True, num_features=num_features - ) - with open(val_filename, 'wb') as f: - pickle.dump(custom_infos_val, f) - print('Custom info train file is saved to %s' % val_filename) - - print('------------------------Start create groundtruth database for data augmentation------------------------') - dataset.set_split(train_split) - dataset.create_groundtruth_database(train_filename, split=train_split) - print('------------------------Data preparation done------------------------') - - -if __name__ == '__main__': - import sys - - if sys.argv.__len__() > 1 and sys.argv[1] == 'create_custom_infos': - import yaml - from pathlib import Path - from easydict import EasyDict - - dataset_cfg = EasyDict(yaml.safe_load(open(sys.argv[2]))) - ROOT_DIR = (Path(__file__).resolve().parent / '../../../').resolve() - create_custom_infos( - dataset_cfg=dataset_cfg, - class_names=['Vehicle', 'Pedestrian', 'Cyclist'], - data_path=ROOT_DIR / 'data' / 'custom', - save_path=ROOT_DIR / 'data' / 'custom', - ) +import copy +import pickle +import os + +import numpy as np + +from ...ops.roiaware_pool3d import roiaware_pool3d_utils +from ...utils import box_utils, common_utils +from ..dataset import DatasetTemplate + + +class CustomDataset(DatasetTemplate): + def __init__(self, dataset_cfg, class_names, training=True, root_path=None, logger=None): + """ + Args: + root_path: + dataset_cfg: + class_names: + training: + logger: + """ + super().__init__( + dataset_cfg=dataset_cfg, class_names=class_names, training=training, root_path=root_path, logger=logger + ) + self.split = self.dataset_cfg.DATA_SPLIT[self.mode] + + split_dir = os.path.join(self.root_path, 'ImageSets', (self.split + '.txt')) + self.sample_id_list = [x.strip() for x in open(split_dir).readlines()] if os.path.exists(split_dir) else None + + self.custom_infos = [] + self.include_data(self.mode) + self.map_class_to_kitti = self.dataset_cfg.MAP_CLASS_TO_KITTI + + def include_data(self, mode): + 
self.logger.info('Loading Custom dataset.') + custom_infos = [] + + for info_path in self.dataset_cfg.INFO_PATH[mode]: + info_path = self.root_path / info_path + if not info_path.exists(): + continue + with open(info_path, 'rb') as f: + infos = pickle.load(f) + custom_infos.extend(infos) + + self.custom_infos.extend(custom_infos) + self.logger.info('Total samples for CUSTOM dataset: %d' % (len(custom_infos))) + + def get_label(self, idx): + label_file = self.root_path / 'labels' / ('%s.txt' % idx) + assert label_file.exists() + with open(label_file, 'r') as f: + lines = f.readlines() + + # [N, 8]: (x y z dx dy dz heading_angle category_id) + gt_boxes = [] + gt_names = [] + for line in lines: + line_list = line.strip().split(' ') + gt_boxes.append(line_list[:-1]) + gt_names.append(line_list[-1]) + + return np.array(gt_boxes, dtype=np.float32), np.array(gt_names) + + def get_lidar(self, idx): + lidar_file = self.root_path / 'points' / ('%s.npy' % idx) + assert lidar_file.exists() + point_features = np.load(lidar_file) + return point_features + + def set_split(self, split): + super().__init__( + dataset_cfg=self.dataset_cfg, class_names=self.class_names, training=self.training, + root_path=self.root_path, logger=self.logger + ) + self.split = split + + split_dir = self.root_path / 'ImageSets' / (self.split + '.txt') + self.sample_id_list = [x.strip() for x in open(split_dir).readlines()] if split_dir.exists() else None + + def __len__(self): + if self._merge_all_iters_to_one_epoch: + return len(self.sample_id_list) * self.total_epochs + + return len(self.custom_infos) + + def __getitem__(self, index): + if self._merge_all_iters_to_one_epoch: + index = index % len(self.custom_infos) + + info = copy.deepcopy(self.custom_infos[index]) + sample_idx = info['point_cloud']['lidar_idx'] + points = self.get_lidar(sample_idx) + input_dict = { + 'frame_id': self.sample_id_list[index], + 'points': points + } + + if 'annos' in info: + annos = info['annos'] + annos = common_utils.drop_info_with_name(annos, name='DontCare') + gt_names = annos['name'] + gt_boxes_lidar = annos['gt_boxes_lidar'] + input_dict.update({ + 'gt_names': gt_names, + 'gt_boxes': gt_boxes_lidar + }) + + data_dict = self.prepare_data(data_dict=input_dict) + + return data_dict + + def evaluation(self, det_annos, class_names, **kwargs): + if 'annos' not in self.custom_infos[0].keys(): + return 'No ground-truth boxes for evaluation', {} + + def kitti_eval(eval_det_annos, eval_gt_annos, map_name_to_kitti): + from ..kitti.kitti_object_eval_python import eval as kitti_eval + from ..kitti import kitti_utils + + kitti_utils.transform_annotations_to_kitti_format(eval_det_annos, map_name_to_kitti=map_name_to_kitti) + kitti_utils.transform_annotations_to_kitti_format( + eval_gt_annos, map_name_to_kitti=map_name_to_kitti, + info_with_fakelidar=self.dataset_cfg.get('INFO_WITH_FAKELIDAR', False) + ) + kitti_class_names = [map_name_to_kitti[x] for x in class_names] + ap_result_str, ap_dict = kitti_eval.get_official_eval_result( + gt_annos=eval_gt_annos, dt_annos=eval_det_annos, current_classes=kitti_class_names + ) + return ap_result_str, ap_dict + + eval_det_annos = copy.deepcopy(det_annos) + eval_gt_annos = [copy.deepcopy(info['annos']) for info in self.custom_infos] + + if kwargs['eval_metric'] == 'kitti': + ap_result_str, ap_dict = kitti_eval(eval_det_annos, eval_gt_annos, self.map_class_to_kitti) + else: + raise NotImplementedError + + return ap_result_str, ap_dict + + def get_infos(self, class_names, num_workers=4, has_label=True, 
sample_id_list=None, num_features=4): + import concurrent.futures as futures + + def process_single_scene(sample_idx): + print('%s sample_idx: %s' % (self.split, sample_idx)) + info = {} + pc_info = {'num_features': num_features, 'lidar_idx': sample_idx} + info['point_cloud'] = pc_info + + if has_label: + annotations = {} + gt_boxes_lidar, name = self.get_label(sample_idx) + annotations['name'] = name + annotations['gt_boxes_lidar'] = gt_boxes_lidar[:, :7] + info['annos'] = annotations + + return info + + sample_id_list = sample_id_list if sample_id_list is not None else self.sample_id_list + + # create a thread pool to improve the velocity + with futures.ThreadPoolExecutor(num_workers) as executor: + infos = executor.map(process_single_scene, sample_id_list) + return list(infos) + + def create_groundtruth_database(self, info_path=None, used_classes=None, split='train'): + import torch + + database_save_path = Path(self.root_path) / ('gt_database' if split == 'train' else ('gt_database_%s' % split)) + db_info_save_path = Path(self.root_path) / ('custom_dbinfos_%s.pkl' % split) + + database_save_path.mkdir(parents=True, exist_ok=True) + all_db_infos = {} + + with open(info_path, 'rb') as f: + infos = pickle.load(f) + + for k in range(len(infos)): + print('gt_database sample: %d/%d' % (k + 1, len(infos))) + info = infos[k] + sample_idx = info['point_cloud']['lidar_idx'] + points = self.get_lidar(sample_idx) + annos = info['annos'] + names = annos['name'] + gt_boxes = annos['gt_boxes_lidar'] + + num_obj = gt_boxes.shape[0] + point_indices = roiaware_pool3d_utils.points_in_boxes_cpu( + torch.from_numpy(points[:, 0:3]), torch.from_numpy(gt_boxes) + ).numpy() # (nboxes, npoints) + + for i in range(num_obj): + filename = '%s_%s_%d.bin' % (sample_idx, names[i], i) + filepath = database_save_path / filename + gt_points = points[point_indices[i] > 0] + + gt_points[:, :3] -= gt_boxes[i, :3] + with open(filepath, 'w') as f: + gt_points.tofile(f) + + if (used_classes is None) or names[i] in used_classes: + db_path = str(filepath.relative_to(self.root_path)) # gt_database/xxxxx.bin + db_info = {'name': names[i], 'path': db_path, 'gt_idx': i, + 'box3d_lidar': gt_boxes[i], 'num_points_in_gt': gt_points.shape[0]} + if names[i] in all_db_infos: + all_db_infos[names[i]].append(db_info) + else: + all_db_infos[names[i]] = [db_info] + + # Output the num of all classes in database + for k, v in all_db_infos.items(): + print('Database %s: %d' % (k, len(v))) + + with open(db_info_save_path, 'wb') as f: + pickle.dump(all_db_infos, f) + + @staticmethod + def create_label_file_with_name_and_box(class_names, gt_names, gt_boxes, save_label_path): + with open(save_label_path, 'w') as f: + for idx in range(gt_boxes.shape[0]): + boxes = gt_boxes[idx] + name = gt_names[idx] + if name not in class_names: + continue + line = "{x} {y} {z} {l} {w} {h} {angle} {name}\n".format( + x=boxes[0], y=boxes[1], z=(boxes[2]), l=boxes[3], + w=boxes[4], h=boxes[5], angle=boxes[6], name=name + ) + f.write(line) + + +def create_custom_infos(dataset_cfg, class_names, data_path, save_path, workers=4): + dataset = CustomDataset( + dataset_cfg=dataset_cfg, class_names=class_names, root_path=data_path, + training=False, logger=common_utils.create_logger() + ) + train_split, val_split = 'train', 'val' + num_features = len(dataset_cfg.POINT_FEATURE_ENCODING.src_feature_list) + + train_filename = save_path / ('custom_infos_%s.pkl' % train_split) + val_filename = save_path / ('custom_infos_%s.pkl' % val_split) + + 
print('------------------------Start to generate data infos------------------------') + + dataset.set_split(train_split) + custom_infos_train = dataset.get_infos( + class_names, num_workers=workers, has_label=True, num_features=num_features + ) + with open(train_filename, 'wb') as f: + pickle.dump(custom_infos_train, f) + print('Custom info train file is saved to %s' % train_filename) + + dataset.set_split(val_split) + custom_infos_val = dataset.get_infos( + class_names, num_workers=workers, has_label=True, num_features=num_features + ) + with open(val_filename, 'wb') as f: + pickle.dump(custom_infos_val, f) + print('Custom info train file is saved to %s' % val_filename) + + print('------------------------Start create groundtruth database for data augmentation------------------------') + dataset.set_split(train_split) + dataset.create_groundtruth_database(train_filename, split=train_split) + print('------------------------Data preparation done------------------------') + + +if __name__ == '__main__': + import sys + + if sys.argv.__len__() > 1 and sys.argv[1] == 'create_custom_infos': + import yaml + from pathlib import Path + from easydict import EasyDict + + dataset_cfg = EasyDict(yaml.safe_load(open(sys.argv[2]))) + ROOT_DIR = (Path(__file__).resolve().parent / '../../../').resolve() + create_custom_infos( + dataset_cfg=dataset_cfg, + class_names=['Vehicle', 'Pedestrian', 'Cyclist'], + data_path=ROOT_DIR / 'data' / 'custom', + save_path=ROOT_DIR / 'data' / 'custom', + ) -- Gitee
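
For anyone adapting the files touched by this patch, a short usage sketch may help; it is reconstructed only from how the code above reads its inputs, and every concrete name below (the sample key, tensor shapes, the 2-layer/4-expert sizes, and the dataset config path) is an illustrative assumption rather than something taken from the repositories.

    # Minimal sketch of a gating dump whose layout matches what the MoE-LLaVA
    # vis scripts index into; they are then run as, for example,
    #   python vis2.py --input phi_sciqa.pt --output out.png
    import torch

    num_experts = 4
    output_ids = [1, 2, -200, 3, 4, 5]        # -200 marks where the 576 image tokens get spliced in
    num_tokens = len(output_ids) + 574        # satisfies the scripts' length assertion

    record = {
        "gating_logit": [torch.randn(num_tokens, num_experts) for _ in range(2)],  # one (tokens, experts) tensor per MoE layer
        "images": torch.randn(1, 3, 336, 336),                                     # any non-None value enables the image branch
        "input_ids": torch.tensor([[1, 2, -200]]),
        "output_ids": torch.tensor([output_ids]),
    }
    torch.save({"sample_0": record}, "phi_sciqa.pt")

The OpenPCDet custom dataset module is likewise meant to be executed directly to build the info pickles and the ground-truth database, roughly as `python -m pcdet.datasets.custom.custom_dataset create_custom_infos <dataset_cfg.yaml>`, where the YAML path is whatever custom dataset config the deployment uses.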