From b8ee2ef8a033a5547496d5c60bf92119ce6405b8 Mon Sep 17 00:00:00 2001 From: wsqRichard <229242333@qq.com> Date: Wed, 23 Jul 2025 18:08:36 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=EF=BC=9A=E6=89=B9=E5=A4=84?= =?UTF-8?q?=E7=90=86=E6=89=80=E6=9C=89=E5=BA=8F=E5=88=97=E5=92=8C=E4=BF=A1?= =?UTF-8?q?=E5=8F=B7=E7=9A=84=E5=85=B1=E8=BD=AD=E4=B9=98=E5=92=8CIFFT?= =?UTF-8?q?=E8=AE=A1=E7=AE=97=EF=BC=8C=E6=8F=90=E5=8D=87=E8=AE=A1=E7=AE=97?= =?UTF-8?q?=E6=80=A7=E8=83=BD=20=20=201.=E4=BF=AE=E6=94=B9=E5=85=B1?= =?UTF-8?q?=E8=BD=AD=E4=B9=98=E6=A0=B8=E5=87=BD=E6=95=B0=EF=BC=9A=E4=B8=80?= =?UTF-8?q?=E6=AC=A1=E8=B0=83=E7=94=A8=E5=AE=8C=E6=88=90=E6=89=80=E6=9C=89?= =?UTF-8?q?=E5=BA=8F=E5=88=97=E5=92=8C=E4=BF=A1=E5=8F=B7=E7=9A=84=E5=85=B1?= =?UTF-8?q?=E8=BD=AD=E4=B9=98=20=20=202.=E5=85=B1=E8=BD=AD=E4=B9=98?= =?UTF-8?q?=E7=BB=93=E6=9E=9C=E8=AE=A1=E7=AE=97IFFT=EF=BC=9A=E4=B8=80?= =?UTF-8?q?=E6=AC=A1=E5=AE=8C=E6=88=90=E6=89=80=E6=9C=89=E5=85=B1=E8=BD=AD?= =?UTF-8?q?=E4=B9=98=E7=BB=93=E6=9E=9C=E7=9A=84IFFT=E8=AE=A1=E7=AE=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: wsqRichard <229242333@qq.com> --- calculatemovingcorrelation.cpp | 6 +- cuda_correlation.cu | 653 ++++++++++++--------------------- cuda_correlation.h | 14 +- 3 files changed, 246 insertions(+), 427 deletions(-) diff --git a/calculatemovingcorrelation.cpp b/calculatemovingcorrelation.cpp index 920abd8..5980f80 100644 --- a/calculatemovingcorrelation.cpp +++ b/calculatemovingcorrelation.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -287,8 +287,8 @@ void CalculateMovingCorrelation::ReadSequenceFile( std::vector vecSequenceData; BytesToRealInv(tempdata, len, vecSequenceData); - int sizeOfSequenceData = vecSequenceData.size(); - int SequenceLength = sizeOfSequenceData / 2; + uint sizeOfSequenceData = vecSequenceData.size(); + uint SequenceLength = sizeOfSequenceData / 2; vecSequenceLength.push_back(SequenceLength); for (int index = 0; index < SequenceLength; index++) { diff --git a/cuda_correlation.cu b/cuda_correlation.cu index 0b946e5..33fafcd 100644 --- a/cuda_correlation.cu +++ b/cuda_correlation.cu @@ -39,8 +39,9 @@ static constexpr cufftType getFFTType() { } } -// 单精度核函数 -__global__ void batchConjugateMultiplyKernelFloat( +// 单精度共轭乘核函数 +// 一次计算一个序列和所有信号的共轭乘 +__global__ void ConjugateMultiplyKernelFloat( const cufftComplex* __restrict__ signalDatas, const cufftComplex* __restrict__ sequenceFFT, cufftComplex* __restrict__ outputs, int signalLength, int numChannels) { @@ -58,8 +59,46 @@ __global__ void batchConjugateMultiplyKernelFloat( outputs[idx].y = signal.y * seq.x - signal.x * seq.y; // 虚部 } -// 双精度核函数 -__global__ void batchConjugateMultiplyKernelDouble( +// 批处理:计算所有序列和信号的共轭乘 +__global__ void batchConjugateMultiplyKernelFloat( + const cufftComplex* __restrict__ signalDatas, + const cufftComplex* __restrict__ sequenceFFT, + cufftComplex* __restrict__ outputs, int signalLength, int numChannels, + int seqChannels) { + // 使用2D网格布局:x维度处理位置,y维度处理序列 + int pos = blockIdx.x * blockDim.x + threadIdx.x; + // 检查位置索引是否有效 + if (pos >= signalLength) return; + + int seqIdx = blockIdx.y; + if (seqIdx >= seqChannels) return; + + // 预计算常用偏移量 + const int signalOffset = pos; // 信号内存访问基础偏移 + const int sequenceOffset = seqIdx * signalLength + pos; + + // 预取序列FFT值(所有通道共享) + const cufftComplex seq = sequenceFFT[sequenceOffset]; + +// 处理所有通道 +#pragma unroll + for (int chIdx = 0; chIdx < numChannels; ++chIdx) { + // 计算当前通道的全局索引 + int globalIdx = + seqIdx * numChannels * signalLength + chIdx * signalLength + pos; + + // 读取信号数据(合并访问) + cufftComplex signal = signalDatas[chIdx * signalLength + signalOffset]; + + // 计算共轭乘法 + outputs[globalIdx].x = signal.x * seq.x + signal.y * seq.y; + outputs[globalIdx].y = signal.y * seq.x - signal.x * seq.y; + } +} + +// 双精度共轭乘核函数 +// 一次计算一个序列和所有信号的共轭乘 +__global__ void ConjugateMultiplyKernelDouble( const cufftDoubleComplex* __restrict__ signalDatas, const cufftDoubleComplex* __restrict__ sequenceFFT, cufftDoubleComplex* __restrict__ outputs, int signalLength, @@ -78,6 +117,44 @@ __global__ void batchConjugateMultiplyKernelDouble( outputs[idx].y = signal.y * seq.x - signal.x * seq.y; // 虚部 } +// 批处理:计算所有序列和信号的共轭乘 +__global__ void batchConjugateMultiplyKernelDouble( + const cufftDoubleComplex* __restrict__ signalDatas, + const cufftDoubleComplex* __restrict__ sequenceFFT, + cufftDoubleComplex* __restrict__ outputs, int signalLength, int numChannels, + int seqChannels) { + // 使用2D网格布局:x维度处理位置,y维度处理序列 + int pos = blockIdx.x * blockDim.x + threadIdx.x; + // 检查位置索引是否有效 + if (pos >= signalLength) return; + + int seqIdx = blockIdx.y; + if (seqIdx >= seqChannels) return; + + // 预计算常用偏移量 + const int signalOffset = pos; // 信号内存访问基础偏移 + const int sequenceOffset = seqIdx * signalLength + pos; + + // 预取序列FFT值(所有通道共享) + const cufftDoubleComplex seq = sequenceFFT[sequenceOffset]; + +// 处理所有通道 +#pragma unroll + for (int chIdx = 0; chIdx < numChannels; ++chIdx) { + // 计算当前通道的全局索引 + int globalIdx = + seqIdx * numChannels * signalLength + chIdx * signalLength + pos; + + // 读取信号数据(合并访问) + cufftDoubleComplex signal = + signalDatas[chIdx * signalLength + signalOffset]; + + // 计算共轭乘法 + outputs[globalIdx].x = signal.x * seq.x + signal.y * seq.y; + outputs[globalIdx].y = signal.y * seq.x - signal.x * seq.y; + } +} + #ifdef USE_PEEKSKERNEL template T hypot(T x, T y) { @@ -85,10 +162,11 @@ T hypot(T x, T y) { return sqrt(fabs(x) * fabs(x) + fabs(y) * fabs(y)); } -__global__ void CalculatePeaksKernelFloat(const cufftComplex* d_results, - const uint* d_seqLen, - uint seqChannels, uint signalChannels, - uint signalLength, uint* d_vecPeaks) { +// peekMax的核函数:参考CPU侧旧的计算逻辑实现 +__global__ void CalculatePeaksKernelFloat( + const cufftComplex* __restrict__ d_results, + const uint* __restrict__ d_seqLen, uint seqChannels, uint signalChannels, + uint signalLength, uint* __restrict__ d_vecPeaks) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= seqChannels * signalChannels) return; @@ -108,6 +186,7 @@ __global__ void CalculatePeaksKernelFloat(const cufftComplex* d_results, // 每个GPU线程(线程idx),处理相应的d_results[idx][signalLength] const auto& CorrelationValue = d_results + idx * signalLength; +#pragma unroll for (int i = 0; i < num_elements; ++i) { const float abs_val = hypot(CorrelationValue[i].x, CorrelationValue[i].y); total_abs += abs_val; @@ -124,8 +203,9 @@ __global__ void CalculatePeaksKernelFloat(const cufftComplex* d_results, } __global__ void CalculatePeaksKernelDouble( - const cufftDoubleComplex* d_results, const uint* d_seqLen, uint seqChannels, - uint signalChannels, uint signalLength, uint* d_vecPeaks) { + const cufftDoubleComplex* __restrict__ d_results, + const uint* __restrict__ d_seqLen, uint seqChannels, uint signalChannels, + uint signalLength, uint* __restrict__ d_vecPeaks) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= seqChannels * signalChannels) return; @@ -145,6 +225,7 @@ __global__ void CalculatePeaksKernelDouble( // 每个GPU线程(线程idx),处理相应的d_results[idx][signalLength] const auto& CorrelationValue = d_results + idx * signalLength; +#pragma unroll for (int i = 0; i < num_elements; ++i) { const double abs_val = hypot(CorrelationValue[i].x, CorrelationValue[i].y); total_abs += abs_val; @@ -158,105 +239,12 @@ __global__ void CalculatePeaksKernelDouble( const double avg_abs = total_abs / num_elements; d_vecPeaks[idx] = ((max_abs > (avg_abs * 7)) ? 1 : 0); } - -// std::map& -// CalculateMovingCorrelation::CalculatePeaksMaxKernelFloat( -// std::vector>>& CorrelationResults, -// std::string SequenceName, int multiple) { -// int sizeofResult = CorrelationResults.size(); -// int sizeofDataGroup = sizeofResult / 8; - -// std::vector vecTestCheck; -// vecTestCheck.reserve(sizeofResult); - -// for (int i = 0; i < sizeofDataGroup; i++) { -// int groupstartindex = i * 8; -// int groupstopIndex = groupstartindex + 8; - -// DataMaxinfostruct datamaxinfo; -// datamaxinfo.maxvaluePos.reserve(8); -// datamaxinfo.maxvalue.reserve(8); -// datamaxinfo.sequencyName = SequenceName; - -// for (int j = groupstartindex; j < groupstopIndex; j++) { -// auto CorrelationValue = CorrelationResults[j]; -// double max_abs = -10000; -// int max_index = -1; -// double total_abs = 0.0; - -// const int num_elements = CorrelationValue.size(); - -// if (num_elements == 0) continue; - -// for (int i = 0; i < num_elements; ++i) { -// const double abs_val = -// std::hypot(CorrelationValue[i].real(), -// CorrelationValue[i].imag()); -// // total_abs += abs_val; -// if (abs_val > max_abs) { -// max_abs = abs_val; -// max_index = i; -// } -// } - -// datamaxinfo.maxvalue.push_back(max_abs); -// datamaxinfo.maxvaluePos.push_back(max_index); - -// if (max_index < 512) continue; - -// if ((max_index + 1000) > (num_elements - 512)) { -// continue; -// } - -// int startindex = max_index - 24; -// int stopindex = max_index + 1000; -// int totalPoint = stopindex - startindex; - -// for (int j = startindex; j < stopindex; j++) { -// const auto& val = CorrelationValue[j]; -// total_abs += std::hypot(val.real(), val.imag()); -// } - -// const double avg_abs = total_abs / totalPoint; - -// int value = (max_abs > (avg_abs * multiple)) ? 1 : 0; - -// if (value == 1) { -// auto itrfind = m_mapResultMaxinfoAndSequenceName.find(i); -// if (itrfind == m_mapResultMaxinfoAndSequenceName.end()) { -// m_mapResultMaxinfoAndSequenceName[i] = datamaxinfo; -// } -// } -// } -// } - -// return m_mapResultMaxinfoAndSequenceName; -// } - #endif -// 动态计算最优配置 -template -void CUDACorrelation::configureKernel(uint dataSize) { - int minGridSize, bestBlockSize; - - if constexpr (std::is_same_v) { - cudaOccupancyMaxPotentialBlockSize(&minGridSize, &bestBlockSize, - (void*)batchConjugateMultiplyKernelFloat, - 0, dataSize); - } else { - cudaOccupancyMaxPotentialBlockSize( - &minGridSize, &bestBlockSize, (void*)batchConjugateMultiplyKernelDouble, - 0, dataSize); - } - - block_.x = std::min(bestBlockSize, (int)deviceProp_.maxThreadsPerBlock); - grid_.x = (dataSize + block_.x - 1) / block_.x; -} - template CUDACorrelation::CUDACorrelation() : fftPlan_(0), + ifftPlan_(0), d_signals(nullptr), d_sequence(nullptr), d_results(nullptr), @@ -266,17 +254,18 @@ CUDACorrelation::CUDACorrelation() sequenceSize_(0), fftLength_(0) { seqChannels_ = 0; - signalsNum_ = 0; #ifdef USE_PEEKSKERNEL h_vecPeaks = nullptr; d_vecPeaks = nullptr; d_vecSeqLen = nullptr; #else + signalsNum_ = 0; cpu_results = nullptr; #endif - CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); + // CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); + stream_ = cudaStreamDefault; CHECK_CUDA_ERROR(cudaGetDeviceProperties(&deviceProp_, 0)); } @@ -284,7 +273,7 @@ template CUDACorrelation::~CUDACorrelation() { Cleanup(); cudaDeviceSynchronize(); - cudaStreamDestroy(stream_); + // cudaStreamDestroy(stream_); } template @@ -293,6 +282,10 @@ void CUDACorrelation::Cleanup() { cufftDestroy(fftPlan_); fftPlan_ = 0; } + if (ifftPlan_) { + cufftDestroy(ifftPlan_); + ifftPlan_ = 0; + } FreeMemory(); cudaStreamSynchronize(stream_); @@ -334,118 +327,79 @@ void CUDACorrelation::FreeMemory() { #endif } +// 预先计算sequence的fft +// 调用该接口时,需要先初始化 vecSequenceLength_ +// sequenceDatas:非vector类型,可在初始化时通过malloc分配并初始化 +// sequenceDatas 内存连续,可直接cudaMemcpy到显存中 +// 减少memcpy调用,提升性能 template -void CUDACorrelation::ComputeSequenceFFT(const Complex2D& VecSequence, - uint fftLength) { - uint numSequence = VecSequence.size(); +void CUDACorrelation::ComputeSequenceFFT(const Complex* sequenceDatas, + uint numSequence, uint fftLength) { fftLength_ = fftLength; - seqChannels_ = numSequence; uint sequenceSize = numSequence * fftLength * sizeof(CUDAComplex); - // 扁平化数据并填充零 - std::vector hSequenceFlat(numSequence * fftLength, - CUDAComplex{0.0, 0.0}); - vecSequenceLength_.reserve(numSequence); - std::vector tmpLengths(numSequence); // 临时存储 -#pragma omp parallel for num_threads(numSequence) - { - for (int i = 0; i < numSequence; ++i) { - tmpLengths[i] = VecSequence[i].size(); // 无竞争写入 - - uint minlen = std::min(tmpLengths[i], fftLength); - memcpy(hSequenceFlat.data() + i * fftLength, VecSequence[i].data(), - minlen * sizeof(Complex)); + // 分配显存d_sequence + if ((sequenceSize_ == 0) || (d_sequence == nullptr)) { + if (d_sequence) { + cudaFreeAsync(d_sequence, stream_); + d_sequence = nullptr; + } + sequenceSize_ = sequenceSize; + CHECK_CUDA_ERROR(cudaMallocAsync(&d_sequence, sequenceSize, stream_)); + } else { + if (sequenceSize_ != sequenceSize) { + if (d_sequence) { + cudaFreeAsync(d_sequence, stream_); + d_sequence = nullptr; + } + sequenceSize_ = sequenceSize; + CHECK_CUDA_ERROR(cudaMallocAsync(&d_sequence, sequenceSize, stream_)); } } - vecSequenceLength_.assign(tmpLengths.begin(), tmpLengths.end()); - // 分配并拷贝数据到显存 - if (d_sequence) { - cudaFreeAsync(d_sequence, stream_); - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - d_sequence = nullptr; - } - CHECK_CUDA_ERROR(cudaMallocAsync(&d_sequence, sequenceSize, stream_)); - CHECK_CUDA_ERROR(cudaMemcpyAsync(d_sequence, hSequenceFlat.data(), - sequenceSize, cudaMemcpyHostToDevice, - stream_)); -#ifdef USE_PEEKSKERNEL - if (d_vecSeqLen) { - cudaFreeAsync(d_vecSeqLen, stream_); - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - d_vecSeqLen = nullptr; - } - CHECK_CUDA_ERROR( - cudaMallocAsync(&d_vecSeqLen, numSequence * sizeof(uint), stream_)); - CHECK_CUDA_ERROR(cudaMemcpyAsync(d_vecSeqLen, vecSequenceLength_.data(), - numSequence * sizeof(uint), + // 拷贝sequenceDatas数据到d_sequence + CHECK_CUDA_ERROR(cudaMemcpyAsync(d_sequence, sequenceDatas, sequenceSize, cudaMemcpyHostToDevice, stream_)); -#endif - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - - // 创建并执行FFT - cufftResult cufftStatus; - cufftHandle fftPlan; - cufftStatus = cufftPlan1d(&fftPlan, fftLength, getFFTType(), numSequence); - if (cufftStatus != CUFFT_SUCCESS) { - std::cerr << __FUNCTION__ << " Failed to create CUFFT plan" << std::endl; +#ifdef USE_PEEKSKERNEL + // 需要预先初始化 vecSequenceLength_ + if (vecSequenceLength_.size() == 0) { + std::cerr + << __FUNCTION__ + << " vecSequenceLength_ size is 0 (需要预先初始化 vecSequenceLength_)" + << std::endl; return; } - cufftSetStream(fftPlan, stream_); - if constexpr (std::is_same_v) { - cufftStatus = cufftExecC2C( - fftPlan, reinterpret_cast(d_sequence), - reinterpret_cast(d_sequence), CUFFT_FORWARD); + // 分配 d_vecSeqLen:该显存保存序列的原始长度,在peekMax的核函数中使用 + if ((seqChannels_ == 0) || (d_vecSeqLen == nullptr)) { + if (d_vecSeqLen) { + cudaFreeAsync(d_vecSeqLen, stream_); + d_vecSeqLen = nullptr; + } + + CHECK_CUDA_ERROR( + cudaMallocAsync(&d_vecSeqLen, numSequence * sizeof(uint), stream_)); } else { - cufftStatus = cufftExecZ2Z( - fftPlan, reinterpret_cast(d_sequence), - reinterpret_cast(d_sequence), CUFFT_FORWARD); - } + if (seqChannels_ != numSequence) { + if (d_vecSeqLen) { + cudaFreeAsync(d_vecSeqLen, stream_); + d_vecSeqLen = nullptr; + } - cufftDestroy(fftPlan); - if (cufftStatus != CUFFT_SUCCESS) { - std::cerr << __FUNCTION__ << " Failed to execute forward FFT" << std::endl; - return; + CHECK_CUDA_ERROR( + cudaMallocAsync(&d_vecSeqLen, numSequence * sizeof(uint), stream_)); + } } -} - -// 调用该接口需要先初始化vecSequenceLength_ -template -void CUDACorrelation::ComputeSequenceFFT(const Complex* sequenceDatas, - uint numSequence, uint fftLength) { - fftLength_ = fftLength; - seqChannels_ = numSequence; - uint sequenceSize = numSequence * fftLength * sizeof(CUDAComplex); - // 分配并拷贝sequence数据到显存d_sequence - if (d_sequence) { - cudaFreeAsync(d_sequence, stream_); - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - d_sequence = nullptr; - } - CHECK_CUDA_ERROR(cudaMallocAsync(&d_sequence, sequenceSize, stream_)); - CHECK_CUDA_ERROR(cudaMemcpyAsync(d_sequence, sequenceDatas, sequenceSize, + // 拷贝vecSequenceLength_数据到显存d_vecSeqLen + CHECK_CUDA_ERROR(cudaMemcpyAsync(d_vecSeqLen, vecSequenceLength_.data(), + numSequence * sizeof(uint), cudaMemcpyHostToDevice, stream_)); -#ifdef USE_PEEKSKERNEL - if (d_vecSeqLen) { - cudaFreeAsync(d_vecSeqLen, stream_); - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - d_vecSeqLen = nullptr; - } - // 需要先初始化 vecSequenceLength_ - // 分配并拷贝vecSequenceLength_数据到显存d_vecSeqLen - if (seqChannels_ > 0) { - CHECK_CUDA_ERROR( - cudaMallocAsync(&d_vecSeqLen, numSequence * sizeof(uint), stream_)); - CHECK_CUDA_ERROR(cudaMemcpyAsync(d_vecSeqLen, vecSequenceLength_.data(), - seqChannels_ * sizeof(uint), - cudaMemcpyHostToDevice, stream_)); - } #endif - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + seqChannels_ = numSequence; + grid_.y = seqChannels_; // y: 序列维度 // 创建并执行FFT cufftResult cufftStatus; @@ -475,152 +429,24 @@ void CUDACorrelation::ComputeSequenceFFT(const Complex* sequenceDatas, } // 预先计算signals的fft -template -void CUDACorrelation::ComputeSignalsFFT(const Complex2D& signalDatas) { - uint numChannels = signalDatas.size(); - uint signalLength = signalDatas[0].size(); - uint signalsNum = numChannels * signalLength; - uint signalsSize = signalsNum * sizeof(Complex); - signalsNum_ = signalsNum; - configureKernel(signalsNum_); - - // 分配设备侧显存(优化:复用之前的显存,避免重复的显存分配和释放,带来的性能损失) - if ((signalsSize_ == 0) || (d_signals == nullptr) || (d_results == nullptr)) { - signalsSize_ = signalsSize; - if (d_signals) { - cudaFreeAsync(d_signals, stream_); - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - d_signals = nullptr; - } - if (d_results) { - cudaFreeAsync(d_results, stream_); - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - d_results = nullptr; - } - - CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); - CHECK_CUDA_ERROR( - cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_)); - } else { - if (signalsSize_ != signalsSize) { - signalsSize_ = signalsSize; - if (d_signals) { - cudaFreeAsync(d_signals, stream_); - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - d_signals = nullptr; - } - if (d_results) { - cudaFreeAsync(d_results, stream_); - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - d_results = nullptr; - } - - CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); - CHECK_CUDA_ERROR( - cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_)); - } - } - - try { - // 扁平化数据:二维转一维(二维vector内存不连续,不能直接copy到显存) - Complex1D hSignalsFlat(signalsNum, Complex{0.0, 0.0}); - uint copySize = signalLength * sizeof(Complex); -#pragma omp parallel for num_threads(numChannels) - { - for (uint i = 0; i < numChannels; ++i) { - memcpy(hSignalsFlat.data() + i * signalLength, // 目标地址偏移 - signalDatas[i].data(), // 源数据起始点 - copySize // 拷贝字节数 - ); - } - } - - // 拷贝数据到显存:CPU->GPU - CHECK_CUDA_ERROR(cudaMemcpyAsync(d_signals, hSignalsFlat.data(), - signalsSize, cudaMemcpyHostToDevice, - stream_)); - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - - // 创建fftPlan - cufftResult cufftStatus; - if (fftPlan_ == 0) { - signalLength_ = signalLength; - signalChannels_ = numChannels; - cufftStatus = - cufftPlan1d(&fftPlan_, signalLength, getFFTType(), numChannels); - if (cufftStatus != CUFFT_SUCCESS) { - fftPlan_ = 0; - throw std::runtime_error("Failed to create CUFFT fftPlan_"); - } - cufftSetStream(fftPlan_, stream_); - - } else { - if ((signalLength_ != signalLength) || (signalChannels_ != numChannels)) { - if (fftPlan_) { - cufftResult result = cufftDestroy(fftPlan_); - fftPlan_ = 0; - if (result != CUFFT_SUCCESS) { - throw std::runtime_error("Error destroying FFT plan"); - } - } - signalLength_ = signalLength; - signalChannels_ = numChannels; - cufftStatus = - cufftPlan1d(&fftPlan_, signalLength, getFFTType(), numChannels); - if (cufftStatus != CUFFT_SUCCESS) { - fftPlan_ = 0; - throw std::runtime_error("Failed to create CUFFT fftPlan_"); - } - cufftSetStream(fftPlan_, stream_); - } - } - - // 计算signals的fft - if constexpr (std::is_same_v) { - cufftStatus = cufftExecC2C( - fftPlan_, reinterpret_cast(d_signals), - reinterpret_cast(d_signals), CUFFT_FORWARD); - if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error( - "Failed to execute forward FFT on signalDatas"); - } - } else { - cufftStatus = cufftExecZ2Z( - fftPlan_, reinterpret_cast(d_signals), - reinterpret_cast(d_signals), CUFFT_FORWARD); - if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error( - "Failed to execute forward FFT on signalDatas"); - } - } - - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - - return; - } catch (const std::exception& e) { - std::cerr << __FUNCTION__ << " CUDA error: " << e.what() << std::endl; - - cudaError_t lastError = cudaGetLastError(); - if (lastError != cudaSuccess) { - std::cerr << __FUNCTION__ - << " Last CUDA error: " << cudaGetErrorString(lastError) - << std::endl; - } - - Cleanup(); - throw; - } -} - -// +// signalDatas:非vector类型,可在初始化时通过malloc分配并初始化 +// signalDatas 内存连续,可直接cudaMemcpy到显存中 +// 减少memcpy调用,提升性能 template void CUDACorrelation::ComputeSignalsFFT(const Complex* signalDatas, uint numChannels, uint signalLength) { uint signalsNum = numChannels * signalLength; uint signalsSize = signalsNum * sizeof(Complex); + +#ifndef USE_PEEKSKERNEL signalsNum_ = signalsNum; - configureKernel(signalsNum_); +#endif + + if (signalLength_ != signalLength) { + block_.x = std::min((int)signalLength, (int)deviceProp_.maxThreadsPerBlock); + grid_.x = (signalLength + block_.x - 1) / block_.x; // x: 位置维度 + } // 分配设备侧显存(优化:复用之前的显存,避免重复的显存分配和释放,带来的性能损失) if ((signalsSize_ == 0) || (d_signals == nullptr) || (d_results == nullptr)) { @@ -633,7 +459,6 @@ void CUDACorrelation::ComputeSignalsFFT(const Complex* signalDatas, cudaFreeAsync(d_results, stream_); d_results = nullptr; } - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); CHECK_CUDA_ERROR( cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_)); @@ -648,7 +473,6 @@ void CUDACorrelation::ComputeSignalsFFT(const Complex* signalDatas, cudaFreeAsync(d_results, stream_); d_results = nullptr; } - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); CHECK_CUDA_ERROR( cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_)); @@ -659,7 +483,6 @@ void CUDACorrelation::ComputeSignalsFFT(const Complex* signalDatas, // 拷贝数据到显存:CPU->GPU CHECK_CUDA_ERROR(cudaMemcpyAsync(d_signals, signalDatas, signalsSize, cudaMemcpyHostToDevice, stream_)); - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); // 创建fftPlan cufftResult cufftStatus; @@ -714,8 +537,6 @@ void CUDACorrelation::ComputeSignalsFFT(const Complex* signalDatas, } } - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - return; } catch (const std::exception& e) { std::cerr << __FUNCTION__ << " CUDA error: " << e.what() << std::endl; @@ -732,19 +553,20 @@ void CUDACorrelation::ComputeSignalsFFT(const Complex* signalDatas, } } -// 需要预先计算完一个batch的 signalDatas 的FFT结果 -// 调用此接口,直接计算signalsFFT与sequenceFFT的共轭乘 +// 需预先计算sequence的fft +// 需预先计算signals的fft +// 计算所有序列sequenceFFT与signalsFFT的共轭乘、IFFT template int CUDACorrelation::ComputeConjMul(void) { if (d_signals == nullptr) { - std::cerr << __FUNCTION__ << " 请先调用 ComputeSignalsFFT(signalDatas) 接口" + std::cerr << __FUNCTION__ << " 请先调用 ComputeSignalsFFT(...) 接口" << std::endl; return 0; } if (d_sequence == nullptr) { - std::cerr << __FUNCTION__ - << " 请先调用 ComputeSequenceFFT(VecSequence) 接口" << std::endl; + std::cerr << __FUNCTION__ << " 请先调用 ComputeSequenceFFT(...) 接口" + << std::endl; return 0; } @@ -769,102 +591,103 @@ int CUDACorrelation::ComputeConjMul(void) { } #endif - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - try { - // 创建fftPlan + // 创建ifftPlan_:批量计算共轭乘结果的IFFT cufftResult cufftStatus; - if (fftPlan_ == 0) { - cufftStatus = cufftPlan1d(&fftPlan_, signalLength_, getFFTType(), - signalChannels_); - if (cufftStatus != CUFFT_SUCCESS) { - fftPlan_ = 0; - throw std::runtime_error("Failed to create CUFFT fftPlan_"); - } - cufftSetStream(fftPlan_, stream_); - } + if (ifftPlan_ == 0) { + int rank = 1; + int n = signalLength_; + int inembed = 0, onembed = 0; + int istride = 1, ostride = 1; + int idist = signalLength_; + int odist = signalLength_; + int batch = seqChannels_ * signalChannels_; - // sequence_fft的指针 - CUDAComplex* sequence_ptr = nullptr; - CUDAComplex* d_output = nullptr; - - // 循环计算每个序列与信号的共轭乘和IFFT - for (int seqIdx = 0; seqIdx < seqChannels_; ++seqIdx) { - // 计算sequenceFFT的显存地址(initSequence计算之后,d_sequence显存资源没有释放) - sequence_ptr = d_sequence + seqIdx * fftLength_; - - // 共轭乘结果的地址偏移 - d_output = d_results + seqIdx * signalsNum_; - - if constexpr (std::is_same_v) { - // 计算共轭乘 - batchConjugateMultiplyKernelFloat<<>>( - reinterpret_cast(d_signals), sequence_ptr, - reinterpret_cast(d_output), signalLength_, - signalChannels_); - - // 计算IFFT - cufftStatus = cufftExecC2C( - fftPlan_, reinterpret_cast(d_output), - reinterpret_cast(d_output), CUFFT_INVERSE); - if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error("Failed to execute inverse FFT"); - } - } else { - // 计算共轭乘 - batchConjugateMultiplyKernelDouble<<>>( - reinterpret_cast(d_signals), sequence_ptr, - reinterpret_cast(d_output), signalLength_, - signalChannels_); - - // 计算IFFT - cufftStatus = cufftExecZ2Z( - fftPlan_, reinterpret_cast(d_output), - reinterpret_cast(d_output), CUFFT_INVERSE); - if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error("Failed to execute inverse FFT"); - } + cufftStatus = + cufftPlanMany(&ifftPlan_, rank, &n, &inembed, istride, idist, + &onembed, ostride, odist, getFFTType(), batch); + if (cufftStatus != CUFFT_SUCCESS) { + ifftPlan_ = 0; + throw std::runtime_error("Failed to create CUFFT ifftPlan_"); } + cufftSetStream(ifftPlan_, stream_); } #ifdef USE_PEEKSKERNEL + // GPU计算peeksMax if (d_vecSeqLen == nullptr) { std::cerr << __FUNCTION__ << " d_vecSeqLen ptr is null!" << std::endl; return 0; } - // GPU上计算PeeksMax - dim3 block(signalChannels_); - dim3 grid((seqChannels_ * signalChannels_ + block.x - 1) / block.x); + dim3 peeksMax_block(signalChannels_); + dim3 peeksMax_grid((seqChannels_ * signalChannels_ + peeksMax_block.x - 1) / + peeksMax_block.x); +#endif + if constexpr (std::is_same_v) { - CalculatePeaksKernelFloat<<>>( + // 批量计算共轭乘 + batchConjugateMultiplyKernelFloat<<>>( + reinterpret_cast(d_signals), + reinterpret_cast(d_sequence), + reinterpret_cast(d_results), signalLength_, + signalChannels_, seqChannels_); + + // 批量计算IFFT + cufftStatus = cufftExecC2C( + ifftPlan_, reinterpret_cast(d_results), + reinterpret_cast(d_results), CUFFT_INVERSE); + if (cufftStatus != CUFFT_SUCCESS) { + throw std::runtime_error("Failed to execute inverse FFT"); + } + + // GPU计算peeksMax +#ifdef USE_PEEKSKERNEL + CalculatePeaksKernelFloat<<>>( reinterpret_cast(d_results), d_vecSeqLen, seqChannels_, signalChannels_, signalLength_, d_vecPeaks); - } else { - CalculatePeaksKernelDouble<<>>( +#endif + } else { // double类型 + // 批量计算共轭乘 + batchConjugateMultiplyKernelDouble<<>>( + reinterpret_cast(d_signals), + reinterpret_cast(d_sequence), + reinterpret_cast(d_results), signalLength_, + signalChannels_, seqChannels_); + + // 批量计算IFFT + cufftStatus = cufftExecZ2Z( + ifftPlan_, reinterpret_cast(d_results), + reinterpret_cast(d_results), CUFFT_INVERSE); + if (cufftStatus != CUFFT_SUCCESS) { + throw std::runtime_error("Failed to execute inverse FFT"); + } + + // GPU计算peeksMax +#ifdef USE_PEEKSKERNEL + CalculatePeaksKernelDouble<<>>( reinterpret_cast(d_results), d_vecSeqLen, seqChannels_, signalChannels_, signalLength_, d_vecPeaks); +#endif } +#ifdef USE_PEEKSKERNEL + // GPU计算peeksMax + // 如果CPU侧还需要对计算结果进步一步处理,则需要把结果copy回CPU的内存 + // 如果不需要,如下代码可注释掉 CHECK_CUDA_ERROR(cudaMemcpyAsync( h_vecPeaks, d_vecPeaks, seqChannels_ * signalChannels_ * sizeof(uint), cudaMemcpyDeviceToHost, stream_)); - // for (int i = 0; i < signalChannels_; i++) { - // if (h_vecPeaks[i] == 1) { - // return 1; - // } - // } CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - - return 0; #else // 拷贝IFFT结果回主机,cpu侧计算PeeksMax CHECK_CUDA_ERROR(cudaMemcpyAsync(cpu_results, d_results, seqChannels_ * signalsSize_, cudaMemcpyDeviceToHost, stream_)); CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); - return 0; #endif + + return 0; } catch (const std::exception& e) { std::cerr << __FUNCTION__ << " CUDA error: " << e.what() << std::endl; diff --git a/cuda_correlation.h b/cuda_correlation.h index b3286c4..99ca276 100644 --- a/cuda_correlation.h +++ b/cuda_correlation.h @@ -26,8 +26,6 @@ class CUDACorrelation { ~CUDACorrelation(); // 预先计算sequence的fft - void ComputeSequenceFFT(const Complex2D& VecSequence, uint fftLength); - // 调用该接口时,需要先初始化 vecSequenceLength_ // sequenceDatas:非vector类型,可在初始化时通过malloc分配并初始化 // sequenceDatas 内存连续,可直接cudaMemcpy到显存中 @@ -36,8 +34,6 @@ class CUDACorrelation { uint fftLength); // 预先计算signals的fft - void ComputeSignalsFFT(const Complex2D& signalDatas); - // signalDatas:非vector类型,可在初始化时通过malloc分配并初始化 // signalDatas 内存连续,可直接cudaMemcpy到显存中 // 减少memcpy调用,提升性能 @@ -67,7 +63,9 @@ class CUDACorrelation { uint signalChannels_ = 0; uint signalLength_ = 0; - uint signalsNum_ = 0; // signalsNum_ = signalChannels_ * signalLength_ +#ifndef USE_PEEKSKERNEL + uint signalsNum_ = 0; // signalsNum_ = signalChannels_ * signalLength_ +#endif uint signalsSize_ = 0; // signalsSize_ = signalsNum_ * sizeof(CUDAComplex) // 保存每个sequence的原始长度 @@ -77,13 +75,11 @@ class CUDACorrelation { private: cudaStream_t stream_; cudaDeviceProp deviceProp_; - cufftHandle fftPlan_; + cufftHandle fftPlan_ = 0; + cufftHandle ifftPlan_ = 0; dim3 block_; dim3 grid_; - // 计算最优线程数配置 - void configureKernel(uint dataSize); - // 资源释放 void FreeMemory(); void Cleanup(); -- Gitee