From 46b071c2f7ecc056fbdd3827fab8a02565a73a51 Mon Sep 17 00:00:00 2001 From: wsqRichard <229242333@qq.com> Date: Mon, 28 Jul 2025 16:12:56 +0800 Subject: [PATCH] =?UTF-8?q?=E6=80=A7=E8=83=BD=E4=BC=98=E5=8C=96=EF=BC=9A?= =?UTF-8?q?=E9=87=87=E7=94=A8=E9=9B=B6=E6=8B=B7=E8=B4=9D=EF=BC=8C=E4=BC=98?= =?UTF-8?q?=E5=8C=96CPU=20=E4=B8=8EGPU=20=E4=B9=8B=E9=97=B4=E7=9A=84?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E4=BC=A0=E8=BE=93=E6=80=A7=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: wsqRichard <229242333@qq.com> --- calculatemovingcorrelation.cpp | 98 +++++-- calculatemovingcorrelation.h | 20 +- cuda_correlation.cu | 499 ++++++++++++++++++--------------- cuda_correlation.h | 45 +-- mainwindow.cpp | 50 ++-- mainwindow.h | 8 +- 6 files changed, 402 insertions(+), 318 deletions(-) diff --git a/calculatemovingcorrelation.cpp b/calculatemovingcorrelation.cpp index 5980f80..dc80da8 100644 --- a/calculatemovingcorrelation.cpp +++ b/calculatemovingcorrelation.cpp @@ -19,10 +19,12 @@ CalculateMovingCorrelation::CalculateMovingCorrelation() {} #endif CalculateMovingCorrelation::~CalculateMovingCorrelation() { - if (sequenceDatas_) { - free(sequenceDatas_); - sequenceDatas_ = nullptr; +#ifndef USE_PEEKSKERNEL + if (h_vecSeqLen) { + free(h_vecSeqLen); + h_vecSeqLen = nullptr; } +#endif } // 预计算sequence:初始化阶段预计算所有Sequence的FFT @@ -33,24 +35,20 @@ void CalculateMovingCorrelation::ComputeAllSequence(uint fftLength) { std::cerr << __FUNCTION__ << " sequenceDatas_ ptr is null!" << std::endl; return; } - cudaCorrelation->ComputeSequenceFFT(sequenceDatas_, numSequences_, fftLength); + cudaCorrelation->ComputeSequenceFFT(numSequences_, fftLength); #endif } -int CalculateMovingCorrelation::CalMovingCorrlationRoutine( - const cpuComplex *signalDatas, uint numChannels, uint signalLength) { +int CalculateMovingCorrelation::CalMovingCorrlationRoutine(uint signalChannels, + uint signalLength) { int result = 0; -#if USE_CUDA - // 预先计算所有batch的signals的fft值 - if (signalDatas == nullptr) { - std::cerr << __FUNCTION__ << " signalDatas ptr is null!" << std::endl; - return 0; - } +#if USE_CUDA qDebug() << "Starting CUDA processing for compute all signals fft"; - cudaCorrelation->ComputeSignalsFFT(signalDatas, numChannels, signalLength); + // 预先计算所有batch的signals的fft值 + cudaCorrelation->ComputeSignalsFFT(signalChannels, signalLength); - qDebug() << __FUNCTION__ << "Starting CUDA processing for ComputeConjMul()"; + qDebug() << "Starting CUDA processing for ComputeConjMul()"; try { // ComputeConjMul:函数完成共轭乘、IFFT、CalculatePeaks result = cudaCorrelation->ComputeConjMul(); @@ -67,27 +65,38 @@ int CalculateMovingCorrelation::CalMovingCorrlationRoutine( } #endif #endif + return result; } #ifndef USE_PEEKSKERNEL int CalculateMovingCorrelation::CalculatePeaks(uint numSequences) { - std::vector vecCaculatePeaks; - vecCaculatePeaks.reserve(cudaCorrelation->signalChannels_); + if (h_vecSeqLen == nullptr) { + std::cerr << __FUNCTION__ << " h_vecSeqLen ptr is null!" << std::endl; + return 0; + } + if (cudaCorrelation->h_results == nullptr) { + std::cerr << __FUNCTION__ << " cudaCorrelation->h_results ptr is null!" + << std::endl; + return 0; + } + + std::vector vecCaculatePeaks(cudaCorrelation->signalChannels_); uint signalLength = cudaCorrelation->signalLength_; int ret = 0; for (int seqIdx = 0; seqIdx < numSequences; ++seqIdx) { vecCaculatePeaks.clear(); + // 原始sequence的长度 - uint sequenceLength = cudaCorrelation->vecSequenceLength_[seqIdx]; + uint sequenceLength = h_vecSeqLen[seqIdx]; const uint num_elements = signalLength - sequenceLength; if (num_elements == 0) { return 0; } const auto &conj_results = - cudaCorrelation->cpu_results + seqIdx * cudaCorrelation->signalsNum_; + cudaCorrelation->h_results + seqIdx * cudaCorrelation->signalsNum_; for (int res_i = 0; res_i < cudaCorrelation->signalChannels_; ++res_i) { Real max_abs = -10000; @@ -238,37 +247,70 @@ void CalculateMovingCorrelation::LoadAllSequenceBin(QString basePath, numSequences_ = list.size(); - // malloc申请空间,需手动free,防止内存泄漏(在析构函数中free) if (sequenceDatas_ == nullptr) { - sequenceDatas_ = - (cpuComplex *)malloc(numSequences_ * SamplePoint * sizeof(cpuComplex)); - if (sequenceDatas_ == nullptr) { - std::cerr << __FUNCTION__ << " Memory allocation failed!" << std::endl; + // 申请零拷贝内存,自动完成CPU内存与GPU显存数据同步 + if (!cudaCorrelation->AllocMappMemory( + (void **)&(cudaCorrelation->h_sequence), + (void **)&(cudaCorrelation->d_sequence), + numSequences_ * SamplePoint * sizeof(cpuComplex))) { + std::cerr << __FUNCTION__ << " AllocMappMemory failed." << std::endl; return; } + sequenceDatas_ = (cpuComplex *)cudaCorrelation->h_sequence; memset(sequenceDatas_, 0, numSequences_ * SamplePoint * sizeof(cpuComplex)); } +#ifdef USE_PEEKSKERNEL + if (h_vecSeqLen == nullptr) { + // 申请零拷贝内存,自动完成CPU内存与GPU显存数据同步 + if (!cudaCorrelation->AllocMappMemory( + (void **)&(cudaCorrelation->h_vecSeqLen), + (void **)&(cudaCorrelation->d_vecSeqLen), + numSequences_ * sizeof(uint))) { + std::cerr << __FUNCTION__ << " AllocMappMemory failed." << std::endl; + return; + } + h_vecSeqLen = (uint *)cudaCorrelation->h_vecSeqLen; + memset(h_vecSeqLen, 0, numSequences_ * sizeof(uint)); + } +#else + // malloc申请空间,需手动free,防止内存泄漏(在析构函数中free) + if (h_vecSeqLen == nullptr) { + h_vecSeqLen = (uint *)malloc(numSequences_ * sizeof(uint)); + if (h_vecSeqLen == nullptr) { + std::cerr << __FUNCTION__ << " Memory allocation failed!" << std::endl; + return; + } + memset(h_vecSeqLen, 0, numSequences_ * sizeof(uint)); + } +#endif + for (int i = 0; i < numSequences_; ++i) { QFileInfo fileInfo = list.at(i); cpuComplex *sequence = sequenceDatas_ + i * SamplePoint; + uint *seqLenth = h_vecSeqLen + i; if (!fileInfo.isDir()) { if (fileInfo.suffix() == "bin") { ReadSequenceFile(PathName + "/" + fileInfo.fileName(), sequence, - cudaCorrelation->vecSequenceLength_); + seqLenth); } } } } -void CalculateMovingCorrelation::ReadSequenceFile( - QString strFileName, cpuComplex *sequence, - std::vector &vecSequenceLength) { +void CalculateMovingCorrelation::ReadSequenceFile(QString strFileName, + cpuComplex *sequence, + uint *seqLenth) { if (sequence == nullptr) { std::cerr << __FUNCTION__ << " sequence ptr is null!" << std::endl; return; } + if (seqLenth == nullptr) { + std::cerr << __FUNCTION__ << " seqLenth ptr is null!" << std::endl; + return; + } + std::ifstream inFile(strFileName.toStdString(), std::ios::in | std::ios::binary); // 二进制读方式打开 if (!inFile) { @@ -289,7 +331,7 @@ void CalculateMovingCorrelation::ReadSequenceFile( uint sizeOfSequenceData = vecSequenceData.size(); uint SequenceLength = sizeOfSequenceData / 2; - vecSequenceLength.push_back(SequenceLength); + *seqLenth = SequenceLength; for (int index = 0; index < SequenceLength; index++) { cpuComplex data(vecSequenceData[index * 2], vecSequenceData[index * 2 + 1]); diff --git a/calculatemovingcorrelation.h b/calculatemovingcorrelation.h index 87cdb6c..cf3a3b2 100644 --- a/calculatemovingcorrelation.h +++ b/calculatemovingcorrelation.h @@ -25,17 +25,25 @@ class CalculateMovingCorrelation { void LoadAllSequenceBin(QString basePath, uint SamplePoint); // 计算所有序列的fft + // 不需要传sequenceDatas参数,因采用零拷贝,数据已经传到显存了 void ComputeAllSequence(uint fftLength); // 计算滑动相关总流程 输入 8路 I数据 和 8路 Q数据 返回 1--找到相关峰 // 0--未找到相关峰 - int CalMovingCorrlationRoutine(const cpuComplex *signalDatas, - uint numChannels, uint signalLength); + int CalMovingCorrlationRoutine(uint signalChannels, uint signalLength); - // 序列文件数据 + // 序列数据 + // 零拷贝:sequenceDatas_被填充之后,数据会自动同步到显存中 cpuComplex *sequenceDatas_ = nullptr; + + // 序列原始长度 + uint *h_vecSeqLen = nullptr; uint numSequences_ = 0; +#if USE_CUDA + CUDACorrelation *cudaCorrelation; +#endif + private: // 滑动相关函数 // 输入 一路数据的 IQ (组合成 复数) SingleChannelData @@ -54,15 +62,11 @@ class CalculateMovingCorrelation { // 读取单个序列文件 void ReadSequenceFile(QString strFileName, cpuComplex *sequence, - std::vector &vecSequenceLength); + uint *seqLenth); // 字节转换成Real(float/double) void BytesToRealInv(char *buf, int ReadFileLength, std::vector &VecReturn); - -#if USE_CUDA - CUDACorrelation *cudaCorrelation; -#endif }; #endif // CALCULATEMOVINGCORRELATION_H diff --git a/cuda_correlation.cu b/cuda_correlation.cu index 33fafcd..a5a62dd 100644 --- a/cuda_correlation.cu +++ b/cuda_correlation.cu @@ -10,6 +10,10 @@ #include "cuda_correlation.h" +// cpu侧对 peekMaxKernel 计算结果还需要进一步处理 +// 则使能该宏定义,并在cpu侧增加相关逻辑处理 h_vecPeaks +// #define CPU_NEED_PROCESS_PEEKMAXKERNEL_RESULT + using namespace std; template class CUDACorrelation; // 明确告诉编译器生成float特例 template class CUDACorrelation; // 明确告诉编译器生成double特例 @@ -20,8 +24,8 @@ template class CUDACorrelation; // 明确告诉编译器生成double特 if (err != cudaSuccess) { \ std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":" \ << __LINE__ << ": " << cudaGetErrorString(err) << std::endl; \ - throw std::runtime_error("CUDA error"); \ Cleanup(); \ + throw std::runtime_error("CUDA error"); \ } \ } while (0) @@ -164,7 +168,7 @@ T hypot(T x, T y) { // peekMax的核函数:参考CPU侧旧的计算逻辑实现 __global__ void CalculatePeaksKernelFloat( - const cufftComplex* __restrict__ d_results, + const cufftComplex* __restrict__ d_conjResults, const uint* __restrict__ d_seqLen, uint seqChannels, uint signalChannels, uint signalLength, uint* __restrict__ d_vecPeaks) { int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -182,9 +186,9 @@ __global__ void CalculatePeaksKernelFloat( // int max_index = -1; float total_abs = 0.0; - // d_results的维度:[seqChannels * signalChannels][signalLength] - // 每个GPU线程(线程idx),处理相应的d_results[idx][signalLength] - const auto& CorrelationValue = d_results + idx * signalLength; + // d_conjResults的维度:[seqChannels * signalChannels][signalLength] + // 每个GPU线程(线程idx),处理相应的d_conjResults[idx][signalLength] + const auto& CorrelationValue = d_conjResults + idx * signalLength; #pragma unroll for (int i = 0; i < num_elements; ++i) { @@ -203,7 +207,7 @@ __global__ void CalculatePeaksKernelFloat( } __global__ void CalculatePeaksKernelDouble( - const cufftDoubleComplex* __restrict__ d_results, + const cufftDoubleComplex* __restrict__ d_conjResults, const uint* __restrict__ d_seqLen, uint seqChannels, uint signalChannels, uint signalLength, uint* __restrict__ d_vecPeaks) { int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -221,9 +225,9 @@ __global__ void CalculatePeaksKernelDouble( // int max_index = -1; double total_abs = 0.0; - // d_results的维度:[seqChannels * signalChannels][signalLength] - // 每个GPU线程(线程idx),处理相应的d_results[idx][signalLength] - const auto& CorrelationValue = d_results + idx * signalLength; + // d_conjResults的维度:[seqChannels * signalChannels][signalLength] + // 每个GPU线程(线程idx),处理相应的d_conjResults[idx][signalLength] + const auto& CorrelationValue = d_conjResults + idx * signalLength; #pragma unroll for (int i = 0; i < num_elements; ++i) { @@ -243,7 +247,7 @@ __global__ void CalculatePeaksKernelDouble( template CUDACorrelation::CUDACorrelation() - : fftPlan_(0), + : sigfftPlan_(0), ifftPlan_(0), d_signals(nullptr), d_sequence(nullptr), @@ -253,189 +257,223 @@ CUDACorrelation::CUDACorrelation() signalsSize_(0), sequenceSize_(0), fftLength_(0) { - seqChannels_ = 0; - -#ifdef USE_PEEKSKERNEL - h_vecPeaks = nullptr; - d_vecPeaks = nullptr; - d_vecSeqLen = nullptr; -#else - signalsNum_ = 0; - cpu_results = nullptr; -#endif - - // CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); stream_ = cudaStreamDefault; CHECK_CUDA_ERROR(cudaGetDeviceProperties(&deviceProp_, 0)); + + // 检查设备是否支持零拷贝 + CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&canMapHostMemory, + cudaDevAttrCanMapHostMemory, 0)); + + if (canMapHostMemory) { + // 设置设备支持映射内存 + CHECK_CUDA_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost)); + } else { + std::cerr + << __FUNCTION__ + << " The current device does not support the allocation of mappMemory." + << std::endl; + } } template CUDACorrelation::~CUDACorrelation() { Cleanup(); - cudaDeviceSynchronize(); - // cudaStreamDestroy(stream_); } template -void CUDACorrelation::Cleanup() { - if (fftPlan_) { - cufftDestroy(fftPlan_); - fftPlan_ = 0; +void CUDACorrelation::FreeMemory() { + if (d_signals_fft) { + cudaFreeAsync(d_signals_fft, stream_); + d_signals_fft = nullptr; } - if (ifftPlan_) { - cufftDestroy(ifftPlan_); - ifftPlan_ = 0; + if (d_sequence_fft) { + cudaFreeAsync(d_sequence_fft, stream_); + d_sequence_fft = nullptr; } - FreeMemory(); - - cudaStreamSynchronize(stream_); -} - -template -void CUDACorrelation::FreeMemory() { - if (d_signals) { - cudaFreeAsync(d_signals, stream_); + if (h_signals) { + cudaFreeHost(h_signals); + h_signals = nullptr; d_signals = nullptr; } - if (d_sequence) { - cudaFreeAsync(d_sequence, stream_); + if (h_sequence) { + cudaFreeHost(h_sequence); + h_sequence = nullptr; d_sequence = nullptr; } + +#ifdef USE_PEEKSKERNEL + // 共轭乘及IFFT计算结果显存 if (d_results) { cudaFreeAsync(d_results, stream_); d_results = nullptr; } -#ifdef USE_PEEKSKERNEL - if (d_vecSeqLen) { - cudaFreeAsync(d_vecSeqLen, stream_); + // 原始序列长度相关内存空间 + if (h_vecSeqLen) { + cudaFreeHost(h_vecSeqLen); + h_vecSeqLen = nullptr; d_vecSeqLen = nullptr; } + // peekmax核函数计算结果显存 +#ifdef CPU_NEED_PROCESS_PEEKMAXKERNEL_RESULT + if (h_vecPeaks) { + cudaFreeHost(h_vecPeaks); + h_vecPeaks = nullptr; + d_vecPeaks = nullptr; + } +#else if (d_vecPeaks) { cudaFreeAsync(d_vecPeaks, stream_); d_vecPeaks = nullptr; } - if (h_vecPeaks) { - free(h_vecPeaks); - h_vecPeaks = nullptr; - } +#endif + #else - if (cpu_results) { - free(cpu_results); - cpu_results = nullptr; + if (h_results) { + cudaFreeHost(h_results); + h_results = nullptr; + d_results = nullptr; } #endif } -// 预先计算sequence的fft -// 调用该接口时,需要先初始化 vecSequenceLength_ -// sequenceDatas:非vector类型,可在初始化时通过malloc分配并初始化 -// sequenceDatas 内存连续,可直接cudaMemcpy到显存中 -// 减少memcpy调用,提升性能 template -void CUDACorrelation::ComputeSequenceFFT(const Complex* sequenceDatas, - uint numSequence, uint fftLength) { - fftLength_ = fftLength; - uint sequenceSize = numSequence * fftLength * sizeof(CUDAComplex); +void CUDACorrelation::Cleanup() { + if (sigfftPlan_) { + cufftDestroy(sigfftPlan_); + sigfftPlan_ = 0; + } + if (seqfftPlan_) { + cufftDestroy(seqfftPlan_); + seqfftPlan_ = 0; + } + if (ifftPlan_) { + cufftDestroy(ifftPlan_); + ifftPlan_ = 0; + } - // 分配显存d_sequence - if ((sequenceSize_ == 0) || (d_sequence == nullptr)) { - if (d_sequence) { - cudaFreeAsync(d_sequence, stream_); - d_sequence = nullptr; - } - sequenceSize_ = sequenceSize; - CHECK_CUDA_ERROR(cudaMallocAsync(&d_sequence, sequenceSize, stream_)); - } else { - if (sequenceSize_ != sequenceSize) { - if (d_sequence) { - cudaFreeAsync(d_sequence, stream_); - d_sequence = nullptr; - } - sequenceSize_ = sequenceSize; - CHECK_CUDA_ERROR(cudaMallocAsync(&d_sequence, sequenceSize, stream_)); + FreeMemory(); + cudaStreamSynchronize(stream_); +} + +// 申请MappMemory(实现CPU与GPU的零拷贝:提升性能) +template +bool CUDACorrelation::AllocMappMemory(void** host_ptr, void** device_ptr, + size_t size) { + if (canMapHostMemory) { + if (size == 0) { + std::cerr << __FUNCTION__ << "AllocMappMemory size is zero." << std::endl; + return false; } - } - // 拷贝sequenceDatas数据到d_sequence - CHECK_CUDA_ERROR(cudaMemcpyAsync(d_sequence, sequenceDatas, sequenceSize, - cudaMemcpyHostToDevice, stream_)); + // 分配锁页映射内存 + CHECK_CUDA_ERROR(cudaHostAlloc( + host_ptr, size, cudaHostAllocMapped | cudaHostAllocWriteCombined)); -#ifdef USE_PEEKSKERNEL - // 需要预先初始化 vecSequenceLength_ - if (vecSequenceLength_.size() == 0) { + // 获取设备指针 + CHECK_CUDA_ERROR(cudaHostGetDevicePointer(device_ptr, *host_ptr, 0)); + + return true; + } else { std::cerr << __FUNCTION__ - << " vecSequenceLength_ size is 0 (需要预先初始化 vecSequenceLength_)" + << " The current device does not support the allocation of mappMemory." << std::endl; + return false; + } +} + +/** + * 预先计算sequence的FFT + * 预先计算完所有序列的FFT,调用该接口时,需要先初始化 d_vecSeqLen + * + * @param numSequence 序列的batch + * @param fftLength 序列FFT结果的长度 + * @return void + */ +template +void CUDACorrelation::ComputeSequenceFFT(uint numSequence, uint fftLength) { + if (d_sequence == nullptr) { + std::cerr << __FUNCTION__ << " d_sequence ptr is null!" << std::endl; return; } - // 分配 d_vecSeqLen:该显存保存序列的原始长度,在peekMax的核函数中使用 - if ((seqChannels_ == 0) || (d_vecSeqLen == nullptr)) { - if (d_vecSeqLen) { - cudaFreeAsync(d_vecSeqLen, stream_); - d_vecSeqLen = nullptr; - } + // 共轭乘核函数线程数配置的grid的y维度 + grid_.y = numSequence; // y: 序列维度 - CHECK_CUDA_ERROR( - cudaMallocAsync(&d_vecSeqLen, numSequence * sizeof(uint), stream_)); - } else { - if (seqChannels_ != numSequence) { - if (d_vecSeqLen) { - cudaFreeAsync(d_vecSeqLen, stream_); - d_vecSeqLen = nullptr; - } + uint sequenceSize = numSequence * fftLength * sizeof(CUDAComplex); - CHECK_CUDA_ERROR( - cudaMallocAsync(&d_vecSeqLen, numSequence * sizeof(uint), stream_)); - } + // 因申请MapHostMemory,需单独分配显存d_sequence_fft,避免额外的copy操作 + if (d_sequence_fft && (sequenceSize_ != sequenceSize || sequenceSize_ == 0)) { + cudaFreeAsync(d_sequence_fft, stream_); + d_sequence_fft = nullptr; + } + if (!d_sequence_fft) { + sequenceSize_ = sequenceSize; + CHECK_CUDA_ERROR(cudaMallocAsync(&d_sequence_fft, sequenceSize, stream_)); } - - // 拷贝vecSequenceLength_数据到显存d_vecSeqLen - CHECK_CUDA_ERROR(cudaMemcpyAsync(d_vecSeqLen, vecSequenceLength_.data(), - numSequence * sizeof(uint), - cudaMemcpyHostToDevice, stream_)); -#endif - - seqChannels_ = numSequence; - grid_.y = seqChannels_; // y: 序列维度 // 创建并执行FFT cufftResult cufftStatus; - cufftHandle fftPlan; - cufftStatus = cufftPlan1d(&fftPlan, fftLength, getFFTType(), seqChannels_); - if (cufftStatus != CUFFT_SUCCESS) { - std::cerr << __FUNCTION__ << " Failed to create CUFFT plan" << std::endl; - return; + bool needNewPlan = (seqfftPlan_ == 0) || (fftLength_ != fftLength) || + (seqChannels_ != numSequence); + if (needNewPlan) { + if (seqfftPlan_) { + cufftStatus = cufftDestroy(seqfftPlan_); + if (cufftStatus != CUFFT_SUCCESS) { + throw std::runtime_error("Error destroying FFT plan"); + } + seqfftPlan_ = 0; + } + + // 更新参数并创建新计划 + fftLength_ = fftLength; + seqChannels_ = numSequence; + + cufftStatus = + cufftPlan1d(&seqfftPlan_, fftLength_, getFFTType(), seqChannels_); + if (cufftStatus != CUFFT_SUCCESS) { + seqfftPlan_ = 0; + throw std::runtime_error("Failed to create CUFFT plan"); + } + + // 设置计算流 + cufftSetStream(seqfftPlan_, stream_); } - cufftSetStream(fftPlan, stream_); if constexpr (std::is_same_v) { cufftStatus = cufftExecC2C( - fftPlan, reinterpret_cast(d_sequence), - reinterpret_cast(d_sequence), CUFFT_FORWARD); + seqfftPlan_, reinterpret_cast(d_sequence), + reinterpret_cast(d_sequence_fft), CUFFT_FORWARD); } else { cufftStatus = cufftExecZ2Z( - fftPlan, reinterpret_cast(d_sequence), - reinterpret_cast(d_sequence), CUFFT_FORWARD); + seqfftPlan_, reinterpret_cast(d_sequence), + reinterpret_cast(d_sequence_fft), CUFFT_FORWARD); } - cufftDestroy(fftPlan); if (cufftStatus != CUFFT_SUCCESS) { - std::cerr << __FUNCTION__ << " Failed to execute forward FFT" << std::endl; + std::cerr << __FUNCTION__ + << " Failed to execute forward FFT on sequenceDatas." + << std::endl; return; } } -// 预先计算signals的fft -// signalDatas:非vector类型,可在初始化时通过malloc分配并初始化 -// signalDatas 内存连续,可直接cudaMemcpy到显存中 -// 减少memcpy调用,提升性能 +/** + * 预先计算一组信号的FFT + * + * @param numChannels 信号的batch + * @param signalLength 每个信号的长度 + * @return void + */ template -void CUDACorrelation::ComputeSignalsFFT(const Complex* signalDatas, - uint numChannels, +void CUDACorrelation::ComputeSignalsFFT(uint numChannels, uint signalLength) { + if (d_signals == nullptr) { + std::cerr << __FUNCTION__ << " d_signals ptr is null!" << std::endl; + return; + } + uint signalsNum = numChannels * signalLength; uint signalsSize = signalsNum * sizeof(Complex); @@ -443,94 +481,62 @@ void CUDACorrelation::ComputeSignalsFFT(const Complex* signalDatas, signalsNum_ = signalsNum; #endif + // 共轭乘核函数线程配置 if (signalLength_ != signalLength) { block_.x = std::min((int)signalLength, (int)deviceProp_.maxThreadsPerBlock); grid_.x = (signalLength + block_.x - 1) / block_.x; // x: 位置维度 } - // 分配设备侧显存(优化:复用之前的显存,避免重复的显存分配和释放,带来的性能损失) - if ((signalsSize_ == 0) || (d_signals == nullptr) || (d_results == nullptr)) { + // 因申请MapHostMemory,需单独分配显存 d_signals_fft,避免额外的copy + if (d_signals_fft && (signalsSize_ != signalsSize || signalsSize_ == 0)) { + cudaFreeAsync(d_signals_fft, stream_); + d_signals_fft = nullptr; + } + if (!d_signals_fft) { signalsSize_ = signalsSize; - if (d_signals) { - cudaFreeAsync(d_signals, stream_); - d_signals = nullptr; - } - if (d_results) { - cudaFreeAsync(d_results, stream_); - d_results = nullptr; - } - CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); - CHECK_CUDA_ERROR( - cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_)); - } else { - if (signalsSize_ != signalsSize) { - signalsSize_ = signalsSize; - if (d_signals) { - cudaFreeAsync(d_signals, stream_); - d_signals = nullptr; - } - if (d_results) { - cudaFreeAsync(d_results, stream_); - d_results = nullptr; - } - CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); - CHECK_CUDA_ERROR( - cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_)); - } + CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals_fft, signalsSize_, stream_)); } try { - // 拷贝数据到显存:CPU->GPU - CHECK_CUDA_ERROR(cudaMemcpyAsync(d_signals, signalDatas, signalsSize, - cudaMemcpyHostToDevice, stream_)); - // 创建fftPlan cufftResult cufftStatus; - if (fftPlan_ == 0) { + bool needNewPlan = (sigfftPlan_ == 0) || (signalLength_ != signalLength) || + (signalChannels_ != numChannels); + if (needNewPlan) { + if (sigfftPlan_) { + cufftStatus = cufftDestroy(sigfftPlan_); + if (cufftStatus != CUFFT_SUCCESS) { + throw std::runtime_error("Error destroying FFT plan"); + } + sigfftPlan_ = 0; + } + + // 更新参数并创建新计划 signalLength_ = signalLength; signalChannels_ = numChannels; - cufftStatus = - cufftPlan1d(&fftPlan_, signalLength, getFFTType(), numChannels); - if (cufftStatus != CUFFT_SUCCESS) { - fftPlan_ = 0; - throw std::runtime_error("Failed to create CUFFT fftPlan_"); - } - cufftSetStream(fftPlan_, stream_); - } else { - if ((signalLength_ != signalLength) || (signalChannels_ != numChannels)) { - if (fftPlan_) { - cufftResult result = cufftDestroy(fftPlan_); - fftPlan_ = 0; - if (result != CUFFT_SUCCESS) { - throw std::runtime_error("Error destroying FFT plan"); - } - } - signalLength_ = signalLength; - signalChannels_ = numChannels; - cufftStatus = - cufftPlan1d(&fftPlan_, signalLength, getFFTType(), numChannels); - if (cufftStatus != CUFFT_SUCCESS) { - fftPlan_ = 0; - throw std::runtime_error("Failed to create CUFFT fftPlan_"); - } - cufftSetStream(fftPlan_, stream_); + cufftStatus = cufftPlan1d(&sigfftPlan_, signalLength_, getFFTType(), + signalChannels_); + if (cufftStatus != CUFFT_SUCCESS) { + sigfftPlan_ = 0; + throw std::runtime_error("Failed to create CUFFT sigfftPlan_"); } + cufftSetStream(sigfftPlan_, stream_); } // 计算signals的fft if constexpr (std::is_same_v) { cufftStatus = cufftExecC2C( - fftPlan_, reinterpret_cast(d_signals), - reinterpret_cast(d_signals), CUFFT_FORWARD); + sigfftPlan_, reinterpret_cast(d_signals), + reinterpret_cast(d_signals_fft), CUFFT_FORWARD); if (cufftStatus != CUFFT_SUCCESS) { throw std::runtime_error( "Failed to execute forward FFT on signalDatas"); } } else { cufftStatus = cufftExecZ2Z( - fftPlan_, reinterpret_cast(d_signals), - reinterpret_cast(d_signals), CUFFT_FORWARD); + sigfftPlan_, reinterpret_cast(d_signals), + reinterpret_cast(d_signals_fft), CUFFT_FORWARD); if (cufftStatus != CUFFT_SUCCESS) { throw std::runtime_error( "Failed to execute forward FFT on signalDatas"); @@ -553,43 +559,90 @@ void CUDACorrelation::ComputeSignalsFFT(const Complex* signalDatas, } } -// 需预先计算sequence的fft -// 需预先计算signals的fft -// 计算所有序列sequenceFFT与signalsFFT的共轭乘、IFFT +/** + * 完成所有序列和信号的共轭乘、IFFT、peekmax计算 + * 计算所有序列sequenceFFT与signalsFFT的共轭乘、IFFT、peekMax等 + * 需预先计算sequence的fft + * 需预先计算signals的fft + * + * @return int + */ template int CUDACorrelation::ComputeConjMul(void) { - if (d_signals == nullptr) { + if (d_signals_fft == nullptr) { std::cerr << __FUNCTION__ << " 请先调用 ComputeSignalsFFT(...) 接口" << std::endl; return 0; } - if (d_sequence == nullptr) { + if (d_sequence_fft == nullptr) { std::cerr << __FUNCTION__ << " 请先调用 ComputeSequenceFFT(...) 接口" << std::endl; return 0; } - // 分配设备侧显存(优化:复用之前的显存,避免重复的显存分配和释放,带来的性能损失) - if (d_results == nullptr) { - CHECK_CUDA_ERROR( - cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_)); - } + uint resultsSize = seqChannels_ * signalsSize_; #ifdef USE_PEEKSKERNEL - if (d_vecPeaks == nullptr) { - CHECK_CUDA_ERROR(cudaMallocAsync( - &d_vecPeaks, seqChannels_ * signalChannels_ * sizeof(uint), stream_)); + // 使用 PeeksMaxKernel 核函数 + if (d_vecSeqLen == nullptr) { + std::cerr << __FUNCTION__ << " d_vecSeqLen ptr is null!" << std::endl; + return 0; } + // 分配共轭乘结果显存,不需要copy回cpu内存 + if (d_results && (resultsSize_ != resultsSize || resultsSize_ == 0)) { + cudaFreeAsync(d_results, stream_); + d_results = nullptr; + } + if (!d_results) { + resultsSize_ = resultsSize; + CHECK_CUDA_ERROR(cudaMallocAsync(&d_results, resultsSize_, stream_)); + } + + // 申请 PeeksMaxKernel 核函数 计算结果所需的显存 + uint vecPeaksSize = seqChannels_ * signalChannels_ * sizeof(uint); +#ifdef CPU_NEED_PROCESS_PEEKMAXKERNEL_RESULT + // 若cpu侧还需要进一步处理 + // 分配零拷贝内存,不需要显示调用cudaMemcpy 回传给cpu内存 + // 待GPU计算结束,则实现CPU侧相关逻辑,对 h_vecPeaks 进行进一步处理即可 + if (h_vecPeaks && (vecPeaksSize_ != vecPeaksSize || vecPeaksSize_ == 0)) { + cudaFreeHost(h_vecPeaks); + h_vecPeaks = nullptr; + } if (h_vecPeaks == nullptr) { - h_vecPeaks = (uint*)malloc(seqChannels_ * signalChannels_ * sizeof(uint)); + vecPeaksSize_ = vecPeaksSize; + // 申请零拷贝内存,系统自动同步 + AllocMappMemory((void**)&h_vecPeaks, (void**)&d_vecPeaks, vecPeaksSize_); } #else - if (cpu_results == nullptr) { - cpu_results = (CUDAComplex*)malloc(seqChannels_ * signalsSize_); + // 若cpu侧不需要进一步处理,则申请普通显存 + if (d_vecPeaks && (vecPeaksSize_ != vecPeaksSize || vecPeaksSize_ == 0)) { + if (d_vecPeaks) { + cudaFreeAsync(d_vecPeaks, stream_); + d_vecPeaks = nullptr; + } } -#endif + + if (!d_vecPeaks) { + vecPeaksSize_ = vecPeaksSize; + CHECK_CUDA_ERROR(cudaMallocAsync(&d_vecPeaks, vecPeaksSize_, stream_)); + } +#endif // CPU_NEED_PROCESS_PEEKMAXKERNEL_RESULT +#else // USE_PEEKSKERNEL + // 不使用 PeeksMaxKernel 核函数 + // 则需要把共轭乘IFFT结果copy回cpu内存,进行进一步处理 + // 分配零拷贝内存,不需要显示调用cudaMemcpy,系统自动将IFFT结果copy回cpu内存 + if (h_results && (resultsSize_ != resultsSize || resultsSize_ == 0)) { + cudaFreeHost(h_results); + h_results = nullptr; + } + if (h_results == nullptr) { + resultsSize_ = resultsSize; + // 申请零拷贝内存,系统自动同步 + AllocMappMemory((void**)&h_results, (void**)&d_results, resultsSize_); + } +#endif // USE_PEEKSKERNEL try { // 创建ifftPlan_:批量计算共轭乘结果的IFFT @@ -614,12 +667,7 @@ int CUDACorrelation::ComputeConjMul(void) { } #ifdef USE_PEEKSKERNEL - // GPU计算peeksMax - if (d_vecSeqLen == nullptr) { - std::cerr << __FUNCTION__ << " d_vecSeqLen ptr is null!" << std::endl; - return 0; - } - + // GPU计算peeksMax的线程数配置 dim3 peeksMax_block(signalChannels_); dim3 peeksMax_grid((seqChannels_ * signalChannels_ + peeksMax_block.x - 1) / peeksMax_block.x); @@ -628,8 +676,8 @@ int CUDACorrelation::ComputeConjMul(void) { if constexpr (std::is_same_v) { // 批量计算共轭乘 batchConjugateMultiplyKernelFloat<<>>( - reinterpret_cast(d_signals), - reinterpret_cast(d_sequence), + reinterpret_cast(d_signals_fft), + reinterpret_cast(d_sequence_fft), reinterpret_cast(d_results), signalLength_, signalChannels_, seqChannels_); @@ -650,8 +698,8 @@ int CUDACorrelation::ComputeConjMul(void) { } else { // double类型 // 批量计算共轭乘 batchConjugateMultiplyKernelDouble<<>>( - reinterpret_cast(d_signals), - reinterpret_cast(d_sequence), + reinterpret_cast(d_signals_fft), + reinterpret_cast(d_sequence_fft), reinterpret_cast(d_results), signalLength_, signalChannels_, seqChannels_); @@ -671,22 +719,7 @@ int CUDACorrelation::ComputeConjMul(void) { #endif } -#ifdef USE_PEEKSKERNEL - // GPU计算peeksMax - // 如果CPU侧还需要对计算结果进步一步处理,则需要把结果copy回CPU的内存 - // 如果不需要,如下代码可注释掉 - CHECK_CUDA_ERROR(cudaMemcpyAsync( - h_vecPeaks, d_vecPeaks, seqChannels_ * signalChannels_ * sizeof(uint), - cudaMemcpyDeviceToHost, stream_)); CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); -#else - // 拷贝IFFT结果回主机,cpu侧计算PeeksMax - CHECK_CUDA_ERROR(cudaMemcpyAsync(cpu_results, d_results, - seqChannels_ * signalsSize_, - cudaMemcpyDeviceToHost, stream_)); - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); -#endif - return 0; } catch (const std::exception& e) { std::cerr << __FUNCTION__ << " CUDA error: " << e.what() << std::endl; diff --git a/cuda_correlation.h b/cuda_correlation.h index 99ca276..7f90895 100644 --- a/cuda_correlation.h +++ b/cuda_correlation.h @@ -25,20 +25,19 @@ class CUDACorrelation { CUDACorrelation(); ~CUDACorrelation(); + // 申请cpu与GPU的map内存(零拷贝:提升性能) + bool AllocMappMemory(void** host_ptr, void** device_ptr, size_t size); + // 预先计算sequence的fft - // 调用该接口时,需要先初始化 vecSequenceLength_ - // sequenceDatas:非vector类型,可在初始化时通过malloc分配并初始化 - // sequenceDatas 内存连续,可直接cudaMemcpy到显存中 + // 调用该接口时,需要先初始化 d_vecSeqLen + // 采用零拷贝内存,d_sequence显存中已有数据 // 减少memcpy调用,提升性能 - void ComputeSequenceFFT(const Complex* sequenceDatas, uint numSequence, - uint fftLength); + void ComputeSequenceFFT(uint numSequence, uint fftLength); // 预先计算signals的fft - // signalDatas:非vector类型,可在初始化时通过malloc分配并初始化 - // signalDatas 内存连续,可直接cudaMemcpy到显存中 + // 采用零拷贝内存,d_signals显存中已有数据 // 减少memcpy调用,提升性能 - void ComputeSignalsFFT(const Complex* signalDatas, uint numChannels, - uint signalLength); + void ComputeSignalsFFT(uint numChannels, uint signalLength); // 需预先计算sequence的fft // 需预先计算signals的fft @@ -48,15 +47,19 @@ class CUDACorrelation { #ifdef USE_PEEKSKERNEL uint* h_vecPeaks = nullptr; uint* d_vecPeaks = nullptr; - uint* d_vecSeqLen = nullptr; -#else - CUDAComplex* cpu_results = nullptr; #endif + CUDAComplex* h_signals = nullptr; + CUDAComplex* h_sequence = nullptr; + CUDAComplex* h_results = nullptr; + CUDAComplex* d_signals = nullptr; CUDAComplex* d_sequence = nullptr; CUDAComplex* d_results = nullptr; + CUDAComplex* d_signals_fft = nullptr; + CUDAComplex* d_sequence_fft = nullptr; + uint seqChannels_ = 0; uint fftLength_ = 0; uint sequenceSize_ = 0; @@ -68,15 +71,25 @@ class CUDACorrelation { #endif uint signalsSize_ = 0; // signalsSize_ = signalsNum_ * sizeof(CUDAComplex) + uint vecPeaksSize_ = 0; + uint resultsSize_ = 0; + + // GPU设备是否可以进行内存map + int canMapHostMemory = 0; + // 保存每个sequence的原始长度 - std::vector vecSequenceLength_; - std::vector> vecSequenceFFT_; + uint* h_vecSeqLen = nullptr; + uint* d_vecSeqLen = nullptr; private: - cudaStream_t stream_; + cudaStream_t stream_ = cudaStreamDefault; cudaDeviceProp deviceProp_; - cufftHandle fftPlan_ = 0; + + cufftHandle seqfftPlan_ = 0; + cufftHandle sigfftPlan_ = 0; cufftHandle ifftPlan_ = 0; + + // 共轭乘核函数线程数配置 dim3 block_; dim3 grid_; diff --git a/mainwindow.cpp b/mainwindow.cpp index 00e5290..b9f433f 100644 --- a/mainwindow.cpp +++ b/mainwindow.cpp @@ -16,12 +16,7 @@ MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent) { InitConnect(); } -MainWindow::~MainWindow() { - if (signalDatas_) { - free(signalDatas_); - signalDatas_ = nullptr; - } -} +MainWindow::~MainWindow() {} void MainWindow::InitControlValues() { m_btnCalculate = new QPushButton(QStringLiteral("加载数据计算"), this); @@ -35,10 +30,8 @@ void MainWindow::InitConnect() { connect(m_btnCalculate, SIGNAL(clicked()), this, SLOT(SlotCalculateClick())); } -int MainWindow::CalculateRoutine(const cpuComplex *signalDatas, - uint numChannels, uint signalLength) { - return m_calMC.CalMovingCorrlationRoutine(signalDatas, numChannels, - signalLength); +int MainWindow::CalculateRoutine(uint signalChannels, uint signalLength) { + return m_calMC.CalMovingCorrlationRoutine(signalChannels, signalLength); } void MainWindow::SlotCalculateClick() { @@ -67,11 +60,11 @@ void MainWindow::SlotCalculateClick() { m_ReplayfilesizeDetect); // 初始化:读取已知序列 - m_calMC.LoadAllSequenceBin(basePath, SamplePoint_); + m_calMC.LoadAllSequenceBin(basePath, signalLength_); #if defined(USE_CUDA) // 初始化:提前计算完所有Sequence的fft - m_calMC.ComputeAllSequence(SamplePoint_); + m_calMC.ComputeAllSequence(signalLength_); #endif int m_iframeCnt = m_vecReplayHeadposDetect.size(); @@ -132,7 +125,8 @@ void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath, char *buff = new char[Replayfilesize]; replayfileforcalculate.read(buff, Replayfilesize); - SamplePoint_ = m_droneIQParse.GetSamplePoint(buff); + signalLength_ = m_droneIQParse.GetSamplePoint(buff); + signalChannels_ = m_droneIQParse.GetChannelNumber(buff); std::string source(buff, Replayfilesize); string match(cHeader, 4); @@ -157,35 +151,31 @@ void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath, } void MainWindow::ReplayIQDataParse(char *buf) { - uint SamplePoints = m_droneIQParse.GetSamplePoint(buf); - if (SamplePoints > 0) { - uint channelnumber = - m_droneIQParse.GetChannelNumber(buf); // 8->16 16->32 32->64 + if (signalLength_ > 0) { + if (signalChannels_ == 32) { + uint channelnumber = 8; //原逻辑也是只取了前8个通道 - if (channelnumber == 32) { - channelnumber = 8; //原逻辑也是只取了前8个通道 - - // malloc申请空间,需手动free,防止内存泄漏(在析构函数中free) if (signalDatas_ == nullptr) { - signalDatas_ = (cpuComplex *)malloc(channelnumber * SamplePoints * - sizeof(cpuComplex)); - if (signalDatas_ == nullptr) { - std::cerr << __FUNCTION__ << " Memory allocation failed!" - << std::endl; + // 申请零拷贝内存,自动完成CPU内存与GPU显存数据同步 + if (!m_calMC.cudaCorrelation->AllocMappMemory( + (void **)&(m_calMC.cudaCorrelation->h_signals), + (void **)&(m_calMC.cudaCorrelation->d_signals), + channelnumber * signalLength_ * sizeof(cpuComplex))) { + std::cerr << __FUNCTION__ << " AllocMappMemory failed." << std::endl; return; } - memset(signalDatas_, 0, - channelnumber * SamplePoints * sizeof(cpuComplex)); + + signalDatas_ = (cpuComplex *)m_calMC.cudaCorrelation->h_signals; } - m_droneIQParse.ResolveIQData(buf, SamplePoints, channelnumber, + m_droneIQParse.ResolveIQData(buf, signalLength_, channelnumber, signalDatas_); QElapsedTimer tm; tm.start(); // 每帧 SamplePoints 个点 IQ 输入 // 计算总流程 获得最终结果 1--找到相关峰 0--未找到相关峰 - int result = CalculateRoutine(signalDatas_, channelnumber, SamplePoints); + int result = CalculateRoutine(channelnumber, signalLength_); std::cout << __FUNCTION__ << " result:" << result << " tm(ns):" << tm.nsecsElapsed() << std::endl; diff --git a/mainwindow.h b/mainwindow.h index 3742949..a63a935 100644 --- a/mainwindow.h +++ b/mainwindow.h @@ -26,11 +26,13 @@ class MainWindow : public QMainWindow { void InitConnect(); // 计算总流程 - int CalculateRoutine(const cpuComplex *signalDatas, uint numChannels, - uint signalLength); + // 不需要传signalDatas参数,因采用零拷贝,数据已经传到显存了 + int CalculateRoutine(uint signalChannels, uint signalLength); + // 零拷贝:signalDatas_被填充之后,数据会自动同步到显存中 cpuComplex *signalDatas_ = nullptr; - uint SamplePoint_; + uint signalChannels_ = 0; + uint signalLength_ = 0; private: // 获取测试数据文件中 每一帧数据的帧头下标 -- Gitee