From 46b071c2f7ecc056fbdd3827fab8a02565a73a51 Mon Sep 17 00:00:00 2001
From: wsqRichard <229242333@qq.com>
Date: Mon, 28 Jul 2025 16:12:56 +0800
Subject: [PATCH] =?UTF-8?q?=E6=80=A7=E8=83=BD=E4=BC=98=E5=8C=96=EF=BC=9A?=
 =?UTF-8?q?=E9=87=87=E7=94=A8=E9=9B=B6=E6=8B=B7=E8=B4=9D=EF=BC=8C=E4=BC=98?=
 =?UTF-8?q?=E5=8C=96CPU=20=E4=B8=8EGPU=20=E4=B9=8B=E9=97=B4=E7=9A=84?=
 =?UTF-8?q?=E6=95=B0=E6=8D=AE=E4=BC=A0=E8=BE=93=E6=80=A7=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: wsqRichard <229242333@qq.com>
---
 calculatemovingcorrelation.cpp |  98 +++++--
 calculatemovingcorrelation.h   |  20 +-
 cuda_correlation.cu            | 499 ++++++++++++++++++---------------
 cuda_correlation.h             |  45 +--
 mainwindow.cpp                 |  50 ++--
 mainwindow.h                   |   8 +-
 6 files changed, 402 insertions(+), 318 deletions(-)
diff --git a/calculatemovingcorrelation.cpp b/calculatemovingcorrelation.cpp
index 5980f80..dc80da8 100644
--- a/calculatemovingcorrelation.cpp
+++ b/calculatemovingcorrelation.cpp
@@ -19,10 +19,12 @@ CalculateMovingCorrelation::CalculateMovingCorrelation() {}
 #endif
 
 CalculateMovingCorrelation::~CalculateMovingCorrelation() {
-  if (sequenceDatas_) {
-    free(sequenceDatas_);
-    sequenceDatas_ = nullptr;
+#ifndef USE_PEEKSKERNEL
+  if (h_vecSeqLen) {
+    free(h_vecSeqLen);
+    h_vecSeqLen = nullptr;
   }
+#endif
 }
 
 // 预计算sequence：初始化阶段预计算所有Sequence的FFT
@@ -33,24 +35,20 @@ void CalculateMovingCorrelation::ComputeAllSequence(uint fftLength) {
     std::cerr << __FUNCTION__ << " sequenceDatas_ ptr is null!" << std::endl;
     return;
   }
-  cudaCorrelation->ComputeSequenceFFT(sequenceDatas_, numSequences_, fftLength);
+  cudaCorrelation->ComputeSequenceFFT(numSequences_, fftLength);
 #endif
 }
 
-int CalculateMovingCorrelation::CalMovingCorrlationRoutine(
-    const cpuComplex *signalDatas, uint numChannels, uint signalLength) {
+int CalculateMovingCorrelation::CalMovingCorrlationRoutine(uint signalChannels,
+                                                           uint signalLength) {
   int result = 0;
-#if USE_CUDA
-  // 预先计算所有batch的signals的fft值
-  if (signalDatas == nullptr) {
-    std::cerr << __FUNCTION__ << " signalDatas ptr is null!" << std::endl;
-    return 0;
-  }
 
+#if USE_CUDA
   qDebug() << "Starting CUDA processing for compute all signals fft";
-  cudaCorrelation->ComputeSignalsFFT(signalDatas, numChannels, signalLength);
+  // 预先计算所有batch的signals的fft值
+  cudaCorrelation->ComputeSignalsFFT(signalChannels, signalLength);
 
-  qDebug() << __FUNCTION__ << "Starting CUDA processing for ComputeConjMul()";
+  qDebug() << "Starting CUDA processing for ComputeConjMul()";
   try {
     // ComputeConjMul：函数完成共轭乘、IFFT、CalculatePeaks
     result = cudaCorrelation->ComputeConjMul();
@@ -67,27 +65,38 @@ int CalculateMovingCorrelation::CalMovingCorrlationRoutine(
   }
 #endif
 #endif
+
   return result;
 }
 
 #ifndef USE_PEEKSKERNEL
 int CalculateMovingCorrelation::CalculatePeaks(uint numSequences) {
-  std::vector<int> vecCaculatePeaks;
-  vecCaculatePeaks.reserve(cudaCorrelation->signalChannels_);
+  if (h_vecSeqLen == nullptr) {
+    std::cerr << __FUNCTION__ << " h_vecSeqLen ptr is null!" << std::endl;
+    return 0;
+  }
+  if (cudaCorrelation->h_results == nullptr) {
+    std::cerr << __FUNCTION__ << " cudaCorrelation->h_results ptr is null!"
+              << std::endl;
+    return 0;
+  }
+
+  std::vector<int> vecCaculatePeaks(cudaCorrelation->signalChannels_);
   uint signalLength = cudaCorrelation->signalLength_;
   int ret = 0;
 
   for (int seqIdx = 0; seqIdx < numSequences; ++seqIdx) {
     vecCaculatePeaks.clear();
+
     // 原始sequence的长度
-    uint sequenceLength = cudaCorrelation->vecSequenceLength_[seqIdx];
+    uint sequenceLength = h_vecSeqLen[seqIdx];
     const uint num_elements = signalLength - sequenceLength;
     if (num_elements == 0) {
       return 0;
     }
 
     const auto &conj_results =
-        cudaCorrelation->cpu_results + seqIdx * cudaCorrelation->signalsNum_;
+        cudaCorrelation->h_results + seqIdx * cudaCorrelation->signalsNum_;
 
     for (int res_i = 0; res_i < cudaCorrelation->signalChannels_; ++res_i) {
       Real max_abs = -10000;
@@ -238,37 +247,70 @@ void CalculateMovingCorrelation::LoadAllSequenceBin(QString basePath,
 
   numSequences_ = list.size();
 
-  // malloc申请空间，需手动free，防止内存泄漏（在析构函数中free）
   if (sequenceDatas_ == nullptr) {
-    sequenceDatas_ =
-        (cpuComplex *)malloc(numSequences_ * SamplePoint * sizeof(cpuComplex));
-    if (sequenceDatas_ == nullptr) {
-      std::cerr << __FUNCTION__ << " Memory allocation failed!" << std::endl;
+    // 申请零拷贝内存，自动完成CPU内存与GPU显存数据同步
+    if (!cudaCorrelation->AllocMappMemory(
+            (void **)&(cudaCorrelation->h_sequence),
+            (void **)&(cudaCorrelation->d_sequence),
+            numSequences_ * SamplePoint * sizeof(cpuComplex))) {
+      std::cerr << __FUNCTION__ << " AllocMappMemory failed." << std::endl;
       return;
     }
+    sequenceDatas_ = (cpuComplex *)cudaCorrelation->h_sequence;
     memset(sequenceDatas_, 0, numSequences_ * SamplePoint * sizeof(cpuComplex));
   }
 
+#ifdef USE_PEEKSKERNEL
+  if (h_vecSeqLen == nullptr) {
+    // 申请零拷贝内存，自动完成CPU内存与GPU显存数据同步
+    if (!cudaCorrelation->AllocMappMemory(
+            (void **)&(cudaCorrelation->h_vecSeqLen),
+            (void **)&(cudaCorrelation->d_vecSeqLen),
+            numSequences_ * sizeof(uint))) {
+      std::cerr << __FUNCTION__ << " AllocMappMemory failed." << std::endl;
+      return;
+    }
+    h_vecSeqLen = (uint *)cudaCorrelation->h_vecSeqLen;
+    memset(h_vecSeqLen, 0, numSequences_ * sizeof(uint));
+  }
+#else
+  // malloc申请空间，需手动free，防止内存泄漏（在析构函数中free）
+  if (h_vecSeqLen == nullptr) {
+    h_vecSeqLen = (uint *)malloc(numSequences_ * sizeof(uint));
+    if (h_vecSeqLen == nullptr) {
+      std::cerr << __FUNCTION__ << " Memory allocation failed!" << std::endl;
+      return;
+    }
+    memset(h_vecSeqLen, 0, numSequences_ * sizeof(uint));
+  }
+#endif
+
   for (int i = 0; i < numSequences_; ++i) {
     QFileInfo fileInfo = list.at(i);
     cpuComplex *sequence = sequenceDatas_ + i * SamplePoint;
+    uint *seqLenth = h_vecSeqLen + i;
     if (!fileInfo.isDir()) {
       if (fileInfo.suffix() == "bin") {
         ReadSequenceFile(PathName + "/" + fileInfo.fileName(), sequence,
-                         cudaCorrelation->vecSequenceLength_);
+                         seqLenth);
       }
     }
   }
 }
 
-void CalculateMovingCorrelation::ReadSequenceFile(
-    QString strFileName, cpuComplex *sequence,
-    std::vector<uint> &vecSequenceLength) {
+void CalculateMovingCorrelation::ReadSequenceFile(QString strFileName,
+                                                  cpuComplex *sequence,
+                                                  uint *seqLenth) {
   if (sequence == nullptr) {
     std::cerr << __FUNCTION__ << " sequence ptr is null!" << std::endl;
     return;
   }
 
+  if (seqLenth == nullptr) {
+    std::cerr << __FUNCTION__ << " seqLenth ptr is null!" << std::endl;
+    return;
+  }
+
   std::ifstream inFile(strFileName.toStdString(),
                        std::ios::in | std::ios::binary);  // 二进制读方式打开
   if (!inFile) {
@@ -289,7 +331,7 @@ void CalculateMovingCorrelation::ReadSequenceFile(
 
   uint sizeOfSequenceData = vecSequenceData.size();
   uint SequenceLength = sizeOfSequenceData / 2;
-  vecSequenceLength.push_back(SequenceLength);
+  *seqLenth = SequenceLength;
 
   for (int index = 0; index < SequenceLength; index++) {
     cpuComplex data(vecSequenceData[index * 2], vecSequenceData[index * 2 + 1]);
diff --git a/calculatemovingcorrelation.h b/calculatemovingcorrelation.h
index 87cdb6c..cf3a3b2 100644
--- a/calculatemovingcorrelation.h
+++ b/calculatemovingcorrelation.h
@@ -25,17 +25,25 @@ class CalculateMovingCorrelation {
   void LoadAllSequenceBin(QString basePath, uint SamplePoint);
 
   // 计算所有序列的fft
+  // 不需要传sequenceDatas参数，因采用零拷贝，数据已经传到显存了
   void ComputeAllSequence(uint fftLength);
 
   // 计算滑动相关总流程 输入 8路 I数据 和 8路 Q数据  返回 1--找到相关峰
   // 0--未找到相关峰
-  int CalMovingCorrlationRoutine(const cpuComplex *signalDatas,
-                                 uint numChannels, uint signalLength);
+  int CalMovingCorrlationRoutine(uint signalChannels, uint signalLength);
 
-  // 序列文件数据
+  // 序列数据
+  // 零拷贝：sequenceDatas_被填充之后，数据会自动同步到显存中
   cpuComplex *sequenceDatas_ = nullptr;
+
+  // 序列原始长度
+  uint *h_vecSeqLen = nullptr;
   uint numSequences_ = 0;
 
+#if USE_CUDA
+  CUDACorrelation<Real> *cudaCorrelation;
+#endif
+
  private:
   // 滑动相关函数
   // 输入 一路数据的 IQ （组合成 复数） SingleChannelData
@@ -54,15 +62,11 @@ class CalculateMovingCorrelation {
 
   // 读取单个序列文件
   void ReadSequenceFile(QString strFileName, cpuComplex *sequence,
-                        std::vector<uint> &vecSequenceLength);
+                        uint *seqLenth);
 
   // 字节转换成Real（float/double）
   void BytesToRealInv(char *buf, int ReadFileLength,
                       std::vector<Real> &VecReturn);
-
-#if USE_CUDA
-  CUDACorrelation<Real> *cudaCorrelation;
-#endif
 };
 
 #endif  // CALCULATEMOVINGCORRELATION_H
diff --git a/cuda_correlation.cu b/cuda_correlation.cu
index 33fafcd..a5a62dd 100644
--- a/cuda_correlation.cu
+++ b/cuda_correlation.cu
@@ -10,6 +10,10 @@
 
 #include "cuda_correlation.h"
 
+// cpu侧对 peekMaxKernel 计算结果还需要进一步处理
+// 则使能该宏定义，并在cpu侧增加相关逻辑处理 h_vecPeaks
+// #define CPU_NEED_PROCESS_PEEKMAXKERNEL_RESULT
+
 using namespace std;
 template class CUDACorrelation<float>;  // 明确告诉编译器生成float特例
 template class CUDACorrelation<double>;  // 明确告诉编译器生成double特例
@@ -20,8 +24,8 @@ template class CUDACorrelation<double>;  // 明确告诉编译器生成double特
     if (err != cudaSuccess) {                                                \
       std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":"      \
                 << __LINE__ << ": " << cudaGetErrorString(err) << std::endl; \
-      throw std::runtime_error("CUDA error");                                \
       Cleanup();                                                             \
+      throw std::runtime_error("CUDA error");                                \
     }                                                                        \
   } while (0)
 
@@ -164,7 +168,7 @@ T hypot(T x, T y) {
 
 // peekMax的核函数：参考CPU侧旧的计算逻辑实现
 __global__ void CalculatePeaksKernelFloat(
-    const cufftComplex* __restrict__ d_results,
+    const cufftComplex* __restrict__ d_conjResults,
     const uint* __restrict__ d_seqLen, uint seqChannels, uint signalChannels,
     uint signalLength, uint* __restrict__ d_vecPeaks) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -182,9 +186,9 @@ __global__ void CalculatePeaksKernelFloat(
   // int max_index = -1;
   float total_abs = 0.0;
 
-  // d_results的维度：[seqChannels * signalChannels][signalLength]
-  // 每个GPU线程（线程idx），处理相应的d_results[idx][signalLength]
-  const auto& CorrelationValue = d_results + idx * signalLength;
+  // d_conjResults的维度：[seqChannels * signalChannels][signalLength]
+  // 每个GPU线程（线程idx），处理相应的d_conjResults[idx][signalLength]
+  const auto& CorrelationValue = d_conjResults + idx * signalLength;
 
 #pragma unroll
   for (int i = 0; i < num_elements; ++i) {
@@ -203,7 +207,7 @@ __global__ void CalculatePeaksKernelFloat(
 }
 
 __global__ void CalculatePeaksKernelDouble(
-    const cufftDoubleComplex* __restrict__ d_results,
+    const cufftDoubleComplex* __restrict__ d_conjResults,
     const uint* __restrict__ d_seqLen, uint seqChannels, uint signalChannels,
     uint signalLength, uint* __restrict__ d_vecPeaks) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -221,9 +225,9 @@ __global__ void CalculatePeaksKernelDouble(
   // int max_index = -1;
   double total_abs = 0.0;
 
-  // d_results的维度：[seqChannels * signalChannels][signalLength]
-  // 每个GPU线程（线程idx），处理相应的d_results[idx][signalLength]
-  const auto& CorrelationValue = d_results + idx * signalLength;
+  // d_conjResults的维度：[seqChannels * signalChannels][signalLength]
+  // 每个GPU线程（线程idx），处理相应的d_conjResults[idx][signalLength]
+  const auto& CorrelationValue = d_conjResults + idx * signalLength;
 
 #pragma unroll
   for (int i = 0; i < num_elements; ++i) {
@@ -243,7 +247,7 @@ __global__ void CalculatePeaksKernelDouble(
 
 template <typename T>
 CUDACorrelation<T>::CUDACorrelation()
-    : fftPlan_(0),
+    : sigfftPlan_(0),
       ifftPlan_(0),
       d_signals(nullptr),
       d_sequence(nullptr),
@@ -253,189 +257,223 @@ CUDACorrelation<T>::CUDACorrelation()
       signalsSize_(0),
       sequenceSize_(0),
       fftLength_(0) {
-  seqChannels_ = 0;
-
-#ifdef USE_PEEKSKERNEL
-  h_vecPeaks = nullptr;
-  d_vecPeaks = nullptr;
-  d_vecSeqLen = nullptr;
-#else
-  signalsNum_ = 0;
-  cpu_results = nullptr;
-#endif
-
-  // CHECK_CUDA_ERROR(cudaStreamCreate(&stream_));
   stream_ = cudaStreamDefault;
   CHECK_CUDA_ERROR(cudaGetDeviceProperties(&deviceProp_, 0));
+
+  // 检查设备是否支持零拷贝
+  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&canMapHostMemory,
+                                          cudaDevAttrCanMapHostMemory, 0));
+
+  if (canMapHostMemory) {
+    // 设置设备支持映射内存
+    CHECK_CUDA_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));
+  } else {
+    std::cerr
+        << __FUNCTION__
+        << " The current device does not support the allocation of mappMemory."
+        << std::endl;
+  }
 }
 
 template <typename T>
 CUDACorrelation<T>::~CUDACorrelation() {
   Cleanup();
-  cudaDeviceSynchronize();
-  // cudaStreamDestroy(stream_);
 }
 
 template <typename T>
-void CUDACorrelation<T>::Cleanup() {
-  if (fftPlan_) {
-    cufftDestroy(fftPlan_);
-    fftPlan_ = 0;
+void CUDACorrelation<T>::FreeMemory() {
+  if (d_signals_fft) {
+    cudaFreeAsync(d_signals_fft, stream_);
+    d_signals_fft = nullptr;
   }
-  if (ifftPlan_) {
-    cufftDestroy(ifftPlan_);
-    ifftPlan_ = 0;
+  if (d_sequence_fft) {
+    cudaFreeAsync(d_sequence_fft, stream_);
+    d_sequence_fft = nullptr;
   }
-  FreeMemory();
-
-  cudaStreamSynchronize(stream_);
-}
-
-template <typename T>
-void CUDACorrelation<T>::FreeMemory() {
-  if (d_signals) {
-    cudaFreeAsync(d_signals, stream_);
+  if (h_signals) {
+    cudaFreeHost(h_signals);
+    h_signals = nullptr;
     d_signals = nullptr;
   }
-  if (d_sequence) {
-    cudaFreeAsync(d_sequence, stream_);
+  if (h_sequence) {
+    cudaFreeHost(h_sequence);
+    h_sequence = nullptr;
     d_sequence = nullptr;
   }
+
+#ifdef USE_PEEKSKERNEL
+  // 共轭乘及IFFT计算结果显存
   if (d_results) {
     cudaFreeAsync(d_results, stream_);
     d_results = nullptr;
   }
 
-#ifdef USE_PEEKSKERNEL
-  if (d_vecSeqLen) {
-    cudaFreeAsync(d_vecSeqLen, stream_);
+  // 原始序列长度相关内存空间
+  if (h_vecSeqLen) {
+    cudaFreeHost(h_vecSeqLen);
+    h_vecSeqLen = nullptr;
     d_vecSeqLen = nullptr;
   }
+  // peekmax核函数计算结果显存
+#ifdef CPU_NEED_PROCESS_PEEKMAXKERNEL_RESULT
+  if (h_vecPeaks) {
+    cudaFreeHost(h_vecPeaks);
+    h_vecPeaks = nullptr;
+    d_vecPeaks = nullptr;
+  }
+#else
   if (d_vecPeaks) {
     cudaFreeAsync(d_vecPeaks, stream_);
     d_vecPeaks = nullptr;
   }
-  if (h_vecPeaks) {
-    free(h_vecPeaks);
-    h_vecPeaks = nullptr;
-  }
+#endif
+
 #else
-  if (cpu_results) {
-    free(cpu_results);
-    cpu_results = nullptr;
+  if (h_results) {
+    cudaFreeHost(h_results);
+    h_results = nullptr;
+    d_results = nullptr;
   }
 #endif
 }
 
-// 预先计算sequence的fft
-// 调用该接口时，需要先初始化 vecSequenceLength_
-// sequenceDatas：非vector类型，可在初始化时通过malloc分配并初始化
-// sequenceDatas 内存连续，可直接cudaMemcpy到显存中
-// 减少memcpy调用，提升性能
 template <typename T>
-void CUDACorrelation<T>::ComputeSequenceFFT(const Complex* sequenceDatas,
-                                            uint numSequence, uint fftLength) {
-  fftLength_ = fftLength;
-  uint sequenceSize = numSequence * fftLength * sizeof(CUDAComplex);
+void CUDACorrelation<T>::Cleanup() {
+  if (sigfftPlan_) {
+    cufftDestroy(sigfftPlan_);
+    sigfftPlan_ = 0;
+  }
+  if (seqfftPlan_) {
+    cufftDestroy(seqfftPlan_);
+    seqfftPlan_ = 0;
+  }
+  if (ifftPlan_) {
+    cufftDestroy(ifftPlan_);
+    ifftPlan_ = 0;
+  }
 
-  // 分配显存d_sequence
-  if ((sequenceSize_ == 0) || (d_sequence == nullptr)) {
-    if (d_sequence) {
-      cudaFreeAsync(d_sequence, stream_);
-      d_sequence = nullptr;
-    }
-    sequenceSize_ = sequenceSize;
-    CHECK_CUDA_ERROR(cudaMallocAsync(&d_sequence, sequenceSize, stream_));
-  } else {
-    if (sequenceSize_ != sequenceSize) {
-      if (d_sequence) {
-        cudaFreeAsync(d_sequence, stream_);
-        d_sequence = nullptr;
-      }
-      sequenceSize_ = sequenceSize;
-      CHECK_CUDA_ERROR(cudaMallocAsync(&d_sequence, sequenceSize, stream_));
+  FreeMemory();
+  cudaStreamSynchronize(stream_);
+}
+
+// 申请MappMemory（实现CPU与GPU的零拷贝：提升性能）
+template <typename T>
+bool CUDACorrelation<T>::AllocMappMemory(void** host_ptr, void** device_ptr,
+                                         size_t size) {
+  if (canMapHostMemory) {
+    if (size == 0) {
+      std::cerr << __FUNCTION__ << "AllocMappMemory size is zero." << std::endl;
+      return false;
     }
-  }
 
-  // 拷贝sequenceDatas数据到d_sequence
-  CHECK_CUDA_ERROR(cudaMemcpyAsync(d_sequence, sequenceDatas, sequenceSize,
-                                   cudaMemcpyHostToDevice, stream_));
+    // 分配锁页映射内存
+    CHECK_CUDA_ERROR(cudaHostAlloc(
+        host_ptr, size, cudaHostAllocMapped | cudaHostAllocWriteCombined));
 
-#ifdef USE_PEEKSKERNEL
-  // 需要预先初始化 vecSequenceLength_
-  if (vecSequenceLength_.size() == 0) {
+    // 获取设备指针
+    CHECK_CUDA_ERROR(cudaHostGetDevicePointer(device_ptr, *host_ptr, 0));
+
+    return true;
+  } else {
     std::cerr
         << __FUNCTION__
-        << " vecSequenceLength_ size is 0 (需要预先初始化 vecSequenceLength_)"
+        << " The current device does not support the allocation of mappMemory."
         << std::endl;
+    return false;
+  }
+}
+
+/**
+ * 预先计算sequence的FFT
+ * 预先计算完所有序列的FFT，调用该接口时，需要先初始化 d_vecSeqLen
+ *
+ * @param numSequence 序列的batch
+ * @param fftLength 序列FFT结果的长度
+ * @return void
+ */
+template <typename T>
+void CUDACorrelation<T>::ComputeSequenceFFT(uint numSequence, uint fftLength) {
+  if (d_sequence == nullptr) {
+    std::cerr << __FUNCTION__ << " d_sequence ptr is null!" << std::endl;
     return;
   }
 
-  // 分配 d_vecSeqLen：该显存保存序列的原始长度，在peekMax的核函数中使用
-  if ((seqChannels_ == 0) || (d_vecSeqLen == nullptr)) {
-    if (d_vecSeqLen) {
-      cudaFreeAsync(d_vecSeqLen, stream_);
-      d_vecSeqLen = nullptr;
-    }
+  // 共轭乘核函数线程数配置的grid的y维度
+  grid_.y = numSequence;  // y: 序列维度
 
-    CHECK_CUDA_ERROR(
-        cudaMallocAsync(&d_vecSeqLen, numSequence * sizeof(uint), stream_));
-  } else {
-    if (seqChannels_ != numSequence) {
-      if (d_vecSeqLen) {
-        cudaFreeAsync(d_vecSeqLen, stream_);
-        d_vecSeqLen = nullptr;
-      }
+  uint sequenceSize = numSequence * fftLength * sizeof(CUDAComplex);
 
-      CHECK_CUDA_ERROR(
-          cudaMallocAsync(&d_vecSeqLen, numSequence * sizeof(uint), stream_));
-    }
+  // 因申请MapHostMemory，需单独分配显存d_sequence_fft，避免额外的copy操作
+  if (d_sequence_fft && (sequenceSize_ != sequenceSize || sequenceSize_ == 0)) {
+    cudaFreeAsync(d_sequence_fft, stream_);
+    d_sequence_fft = nullptr;
+  }
+  if (!d_sequence_fft) {
+    sequenceSize_ = sequenceSize;
+    CHECK_CUDA_ERROR(cudaMallocAsync(&d_sequence_fft, sequenceSize, stream_));
   }
-
-  // 拷贝vecSequenceLength_数据到显存d_vecSeqLen
-  CHECK_CUDA_ERROR(cudaMemcpyAsync(d_vecSeqLen, vecSequenceLength_.data(),
-                                   numSequence * sizeof(uint),
-                                   cudaMemcpyHostToDevice, stream_));
-#endif
-
-  seqChannels_ = numSequence;
-  grid_.y = seqChannels_;  // y: 序列维度
 
   // 创建并执行FFT
   cufftResult cufftStatus;
-  cufftHandle fftPlan;
-  cufftStatus = cufftPlan1d(&fftPlan, fftLength, getFFTType<T>(), seqChannels_);
-  if (cufftStatus != CUFFT_SUCCESS) {
-    std::cerr << __FUNCTION__ << " Failed to create CUFFT plan" << std::endl;
-    return;
+  bool needNewPlan = (seqfftPlan_ == 0) || (fftLength_ != fftLength) ||
+                     (seqChannels_ != numSequence);
+  if (needNewPlan) {
+    if (seqfftPlan_) {
+      cufftStatus = cufftDestroy(seqfftPlan_);
+      if (cufftStatus != CUFFT_SUCCESS) {
+        throw std::runtime_error("Error destroying FFT plan");
+      }
+      seqfftPlan_ = 0;
+    }
+
+    // 更新参数并创建新计划
+    fftLength_ = fftLength;
+    seqChannels_ = numSequence;
+
+    cufftStatus =
+        cufftPlan1d(&seqfftPlan_, fftLength_, getFFTType<T>(), seqChannels_);
+    if (cufftStatus != CUFFT_SUCCESS) {
+      seqfftPlan_ = 0;
+      throw std::runtime_error("Failed to create CUFFT plan");
+    }
+
+    // 设置计算流
+    cufftSetStream(seqfftPlan_, stream_);
   }
-  cufftSetStream(fftPlan, stream_);
 
   if constexpr (std::is_same_v<T, float>) {
     cufftStatus = cufftExecC2C(
-        fftPlan, reinterpret_cast<cufftComplex*>(d_sequence),
-        reinterpret_cast<cufftComplex*>(d_sequence), CUFFT_FORWARD);
+        seqfftPlan_, reinterpret_cast<cufftComplex*>(d_sequence),
+        reinterpret_cast<cufftComplex*>(d_sequence_fft), CUFFT_FORWARD);
   } else {
     cufftStatus = cufftExecZ2Z(
-        fftPlan, reinterpret_cast<cufftDoubleComplex*>(d_sequence),
-        reinterpret_cast<cufftDoubleComplex*>(d_sequence), CUFFT_FORWARD);
+        seqfftPlan_, reinterpret_cast<cufftDoubleComplex*>(d_sequence),
+        reinterpret_cast<cufftDoubleComplex*>(d_sequence_fft), CUFFT_FORWARD);
   }
 
-  cufftDestroy(fftPlan);
   if (cufftStatus != CUFFT_SUCCESS) {
-    std::cerr << __FUNCTION__ << " Failed to execute forward FFT" << std::endl;
+    std::cerr << __FUNCTION__
+              << " Failed to execute forward FFT on sequenceDatas."
+              << std::endl;
     return;
   }
 }
 
-// 预先计算signals的fft
-// signalDatas：非vector类型，可在初始化时通过malloc分配并初始化
-// signalDatas 内存连续，可直接cudaMemcpy到显存中
-// 减少memcpy调用，提升性能
+/**
+ * 预先计算一组信号的FFT
+ *
+ * @param numChannels 信号的batch
+ * @param signalLength 每个信号的长度
+ * @return void
+ */
 template <typename T>
-void CUDACorrelation<T>::ComputeSignalsFFT(const Complex* signalDatas,
-                                           uint numChannels,
+void CUDACorrelation<T>::ComputeSignalsFFT(uint numChannels,
                                            uint signalLength) {
+  if (d_signals == nullptr) {
+    std::cerr << __FUNCTION__ << " d_signals ptr is null!" << std::endl;
+    return;
+  }
+
   uint signalsNum = numChannels * signalLength;
   uint signalsSize = signalsNum * sizeof(Complex);
 
@@ -443,94 +481,62 @@ void CUDACorrelation<T>::ComputeSignalsFFT(const Complex* signalDatas,
   signalsNum_ = signalsNum;
 #endif
 
+  // 共轭乘核函数线程配置
   if (signalLength_ != signalLength) {
     block_.x = std::min((int)signalLength, (int)deviceProp_.maxThreadsPerBlock);
     grid_.x = (signalLength + block_.x - 1) / block_.x;  // x: 位置维度
   }
 
-  // 分配设备侧显存（优化：复用之前的显存，避免重复的显存分配和释放，带来的性能损失）
-  if ((signalsSize_ == 0) || (d_signals == nullptr) || (d_results == nullptr)) {
+  // 因申请MapHostMemory，需单独分配显存 d_signals_fft，避免额外的copy
+  if (d_signals_fft && (signalsSize_ != signalsSize || signalsSize_ == 0)) {
+    cudaFreeAsync(d_signals_fft, stream_);
+    d_signals_fft = nullptr;
+  }
+  if (!d_signals_fft) {
     signalsSize_ = signalsSize;
-    if (d_signals) {
-      cudaFreeAsync(d_signals, stream_);
-      d_signals = nullptr;
-    }
-    if (d_results) {
-      cudaFreeAsync(d_results, stream_);
-      d_results = nullptr;
-    }
-    CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_));
-    CHECK_CUDA_ERROR(
-        cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_));
-  } else {
-    if (signalsSize_ != signalsSize) {
-      signalsSize_ = signalsSize;
-      if (d_signals) {
-        cudaFreeAsync(d_signals, stream_);
-        d_signals = nullptr;
-      }
-      if (d_results) {
-        cudaFreeAsync(d_results, stream_);
-        d_results = nullptr;
-      }
-      CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_));
-      CHECK_CUDA_ERROR(
-          cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_));
-    }
+    CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals_fft, signalsSize_, stream_));
   }
 
   try {
-    // 拷贝数据到显存:CPU->GPU
-    CHECK_CUDA_ERROR(cudaMemcpyAsync(d_signals, signalDatas, signalsSize,
-                                     cudaMemcpyHostToDevice, stream_));
-
     // 创建fftPlan
     cufftResult cufftStatus;
-    if (fftPlan_ == 0) {
+    bool needNewPlan = (sigfftPlan_ == 0) || (signalLength_ != signalLength) ||
+                       (signalChannels_ != numChannels);
+    if (needNewPlan) {
+      if (sigfftPlan_) {
+        cufftStatus = cufftDestroy(sigfftPlan_);
+        if (cufftStatus != CUFFT_SUCCESS) {
+          throw std::runtime_error("Error destroying FFT plan");
+        }
+        sigfftPlan_ = 0;
+      }
+
+      // 更新参数并创建新计划
       signalLength_ = signalLength;
       signalChannels_ = numChannels;
-      cufftStatus =
-          cufftPlan1d(&fftPlan_, signalLength, getFFTType<T>(), numChannels);
-      if (cufftStatus != CUFFT_SUCCESS) {
-        fftPlan_ = 0;
-        throw std::runtime_error("Failed to create CUFFT fftPlan_");
-      }
-      cufftSetStream(fftPlan_, stream_);
 
-    } else {
-      if ((signalLength_ != signalLength) || (signalChannels_ != numChannels)) {
-        if (fftPlan_) {
-          cufftResult result = cufftDestroy(fftPlan_);
-          fftPlan_ = 0;
-          if (result != CUFFT_SUCCESS) {
-            throw std::runtime_error("Error destroying FFT plan");
-          }
-        }
-        signalLength_ = signalLength;
-        signalChannels_ = numChannels;
-        cufftStatus =
-            cufftPlan1d(&fftPlan_, signalLength, getFFTType<T>(), numChannels);
-        if (cufftStatus != CUFFT_SUCCESS) {
-          fftPlan_ = 0;
-          throw std::runtime_error("Failed to create CUFFT fftPlan_");
-        }
-        cufftSetStream(fftPlan_, stream_);
+      cufftStatus = cufftPlan1d(&sigfftPlan_, signalLength_, getFFTType<T>(),
+                                signalChannels_);
+      if (cufftStatus != CUFFT_SUCCESS) {
+        sigfftPlan_ = 0;
+        throw std::runtime_error("Failed to create CUFFT sigfftPlan_");
       }
+      cufftSetStream(sigfftPlan_, stream_);
     }
 
     // 计算signals的fft
     if constexpr (std::is_same_v<T, float>) {
       cufftStatus = cufftExecC2C(
-          fftPlan_, reinterpret_cast<cufftComplex*>(d_signals),
-          reinterpret_cast<cufftComplex*>(d_signals), CUFFT_FORWARD);
+          sigfftPlan_, reinterpret_cast<cufftComplex*>(d_signals),
+          reinterpret_cast<cufftComplex*>(d_signals_fft), CUFFT_FORWARD);
       if (cufftStatus != CUFFT_SUCCESS) {
         throw std::runtime_error(
             "Failed to execute forward FFT on signalDatas");
       }
     } else {
       cufftStatus = cufftExecZ2Z(
-          fftPlan_, reinterpret_cast<cufftDoubleComplex*>(d_signals),
-          reinterpret_cast<cufftDoubleComplex*>(d_signals), CUFFT_FORWARD);
+          sigfftPlan_, reinterpret_cast<cufftDoubleComplex*>(d_signals),
+          reinterpret_cast<cufftDoubleComplex*>(d_signals_fft), CUFFT_FORWARD);
       if (cufftStatus != CUFFT_SUCCESS) {
         throw std::runtime_error(
             "Failed to execute forward FFT on signalDatas");
@@ -553,43 +559,90 @@ void CUDACorrelation<T>::ComputeSignalsFFT(const Complex* signalDatas,
   }
 }
 
-// 需预先计算sequence的fft
-// 需预先计算signals的fft
-// 计算所有序列sequenceFFT与signalsFFT的共轭乘、IFFT
+/**
+ * 完成所有序列和信号的共轭乘、IFFT、peekmax计算
+ * 计算所有序列sequenceFFT与signalsFFT的共轭乘、IFFT、peekMax等
+ * 需预先计算sequence的fft
+ * 需预先计算signals的fft
+ *
+ * @return int
+ */
 template <typename T>
 int CUDACorrelation<T>::ComputeConjMul(void) {
-  if (d_signals == nullptr) {
+  if (d_signals_fft == nullptr) {
     std::cerr << __FUNCTION__ << " 请先调用 ComputeSignalsFFT(...) 接口"
               << std::endl;
     return 0;
   }
 
-  if (d_sequence == nullptr) {
+  if (d_sequence_fft == nullptr) {
     std::cerr << __FUNCTION__ << " 请先调用 ComputeSequenceFFT(...) 接口"
               << std::endl;
     return 0;
   }
 
-  // 分配设备侧显存（优化：复用之前的显存，避免重复的显存分配和释放，带来的性能损失）
-  if (d_results == nullptr) {
-    CHECK_CUDA_ERROR(
-        cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_));
-  }
+  uint resultsSize = seqChannels_ * signalsSize_;
 
 #ifdef USE_PEEKSKERNEL
-  if (d_vecPeaks == nullptr) {
-    CHECK_CUDA_ERROR(cudaMallocAsync(
-        &d_vecPeaks, seqChannels_ * signalChannels_ * sizeof(uint), stream_));
+  // 使用 PeeksMaxKernel 核函数
+  if (d_vecSeqLen == nullptr) {
+    std::cerr << __FUNCTION__ << " d_vecSeqLen ptr is null!" << std::endl;
+    return 0;
   }
 
+  // 分配共轭乘结果显存，不需要copy回cpu内存
+  if (d_results && (resultsSize_ != resultsSize || resultsSize_ == 0)) {
+    cudaFreeAsync(d_results, stream_);
+    d_results = nullptr;
+  }
+  if (!d_results) {
+    resultsSize_ = resultsSize;
+    CHECK_CUDA_ERROR(cudaMallocAsync(&d_results, resultsSize_, stream_));
+  }
+
+  // 申请 PeeksMaxKernel 核函数 计算结果所需的显存
+  uint vecPeaksSize = seqChannels_ * signalChannels_ * sizeof(uint);
+#ifdef CPU_NEED_PROCESS_PEEKMAXKERNEL_RESULT
+  // 若cpu侧还需要进一步处理
+  // 分配零拷贝内存，不需要显示调用cudaMemcpy 回传给cpu内存
+  // 待GPU计算结束，则实现CPU侧相关逻辑，对 h_vecPeaks 进行进一步处理即可
+  if (h_vecPeaks && (vecPeaksSize_ != vecPeaksSize || vecPeaksSize_ == 0)) {
+    cudaFreeHost(h_vecPeaks);
+    h_vecPeaks = nullptr;
+  }
   if (h_vecPeaks == nullptr) {
-    h_vecPeaks = (uint*)malloc(seqChannels_ * signalChannels_ * sizeof(uint));
+    vecPeaksSize_ = vecPeaksSize;
+    // 申请零拷贝内存，系统自动同步
+    AllocMappMemory((void**)&h_vecPeaks, (void**)&d_vecPeaks, vecPeaksSize_);
   }
 #else
-  if (cpu_results == nullptr) {
-    cpu_results = (CUDAComplex*)malloc(seqChannels_ * signalsSize_);
+  // 若cpu侧不需要进一步处理，则申请普通显存
+  if (d_vecPeaks && (vecPeaksSize_ != vecPeaksSize || vecPeaksSize_ == 0)) {
+    if (d_vecPeaks) {
+      cudaFreeAsync(d_vecPeaks, stream_);
+      d_vecPeaks = nullptr;
+    }
   }
-#endif
+
+  if (!d_vecPeaks) {
+    vecPeaksSize_ = vecPeaksSize;
+    CHECK_CUDA_ERROR(cudaMallocAsync(&d_vecPeaks, vecPeaksSize_, stream_));
+  }
+#endif  // CPU_NEED_PROCESS_PEEKMAXKERNEL_RESULT
+#else   // USE_PEEKSKERNEL
+  // 不使用 PeeksMaxKernel 核函数
+  // 则需要把共轭乘IFFT结果copy回cpu内存，进行进一步处理
+  // 分配零拷贝内存，不需要显示调用cudaMemcpy，系统自动将IFFT结果copy回cpu内存
+  if (h_results && (resultsSize_ != resultsSize || resultsSize_ == 0)) {
+    cudaFreeHost(h_results);
+    h_results = nullptr;
+  }
+  if (h_results == nullptr) {
+    resultsSize_ = resultsSize;
+    // 申请零拷贝内存，系统自动同步
+    AllocMappMemory((void**)&h_results, (void**)&d_results, resultsSize_);
+  }
+#endif  // USE_PEEKSKERNEL
 
   try {
     // 创建ifftPlan_：批量计算共轭乘结果的IFFT
@@ -614,12 +667,7 @@ int CUDACorrelation<T>::ComputeConjMul(void) {
     }
 
 #ifdef USE_PEEKSKERNEL
-    // GPU计算peeksMax
-    if (d_vecSeqLen == nullptr) {
-      std::cerr << __FUNCTION__ << " d_vecSeqLen ptr is null!" << std::endl;
-      return 0;
-    }
-
+    // GPU计算peeksMax的线程数配置
     dim3 peeksMax_block(signalChannels_);
     dim3 peeksMax_grid((seqChannels_ * signalChannels_ + peeksMax_block.x - 1) /
                        peeksMax_block.x);
@@ -628,8 +676,8 @@ int CUDACorrelation<T>::ComputeConjMul(void) {
     if constexpr (std::is_same_v<T, float>) {
       // 批量计算共轭乘
       batchConjugateMultiplyKernelFloat<<<grid_, block_, 0, stream_>>>(
-          reinterpret_cast<cufftComplex*>(d_signals),
-          reinterpret_cast<cufftComplex*>(d_sequence),
+          reinterpret_cast<cufftComplex*>(d_signals_fft),
+          reinterpret_cast<cufftComplex*>(d_sequence_fft),
           reinterpret_cast<cufftComplex*>(d_results), signalLength_,
           signalChannels_, seqChannels_);
 
@@ -650,8 +698,8 @@ int CUDACorrelation<T>::ComputeConjMul(void) {
     } else {  // double类型
       // 批量计算共轭乘
       batchConjugateMultiplyKernelDouble<<<grid_, block_, 0, stream_>>>(
-          reinterpret_cast<cufftDoubleComplex*>(d_signals),
-          reinterpret_cast<cufftDoubleComplex*>(d_sequence),
+          reinterpret_cast<cufftDoubleComplex*>(d_signals_fft),
+          reinterpret_cast<cufftDoubleComplex*>(d_sequence_fft),
           reinterpret_cast<cufftDoubleComplex*>(d_results), signalLength_,
           signalChannels_, seqChannels_);
 
@@ -671,22 +719,7 @@ int CUDACorrelation<T>::ComputeConjMul(void) {
 #endif
     }
 
-#ifdef USE_PEEKSKERNEL
-    // GPU计算peeksMax
-    // 如果CPU侧还需要对计算结果进步一步处理，则需要把结果copy回CPU的内存
-    // 如果不需要，如下代码可注释掉
-    CHECK_CUDA_ERROR(cudaMemcpyAsync(
-        h_vecPeaks, d_vecPeaks, seqChannels_ * signalChannels_ * sizeof(uint),
-        cudaMemcpyDeviceToHost, stream_));
     CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
-#else
-    // 拷贝IFFT结果回主机，cpu侧计算PeeksMax
-    CHECK_CUDA_ERROR(cudaMemcpyAsync(cpu_results, d_results,
-                                     seqChannels_ * signalsSize_,
-                                     cudaMemcpyDeviceToHost, stream_));
-    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
-#endif
-
     return 0;
   } catch (const std::exception& e) {
     std::cerr << __FUNCTION__ << " CUDA error: " << e.what() << std::endl;
diff --git a/cuda_correlation.h b/cuda_correlation.h
index 99ca276..7f90895 100644
--- a/cuda_correlation.h
+++ b/cuda_correlation.h
@@ -25,20 +25,19 @@ class CUDACorrelation {
   CUDACorrelation();
   ~CUDACorrelation();
 
+  // 申请cpu与GPU的map内存（零拷贝：提升性能）
+  bool AllocMappMemory(void** host_ptr, void** device_ptr, size_t size);
+
   // 预先计算sequence的fft
-  // 调用该接口时，需要先初始化 vecSequenceLength_
-  // sequenceDatas：非vector类型，可在初始化时通过malloc分配并初始化
-  // sequenceDatas 内存连续，可直接cudaMemcpy到显存中
+  // 调用该接口时，需要先初始化 d_vecSeqLen
+  // 采用零拷贝内存，d_sequence显存中已有数据
   // 减少memcpy调用，提升性能
-  void ComputeSequenceFFT(const Complex* sequenceDatas, uint numSequence,
-                          uint fftLength);
+  void ComputeSequenceFFT(uint numSequence, uint fftLength);
 
   // 预先计算signals的fft
-  // signalDatas：非vector类型，可在初始化时通过malloc分配并初始化
-  // signalDatas 内存连续，可直接cudaMemcpy到显存中
+  // 采用零拷贝内存，d_signals显存中已有数据
   // 减少memcpy调用，提升性能
-  void ComputeSignalsFFT(const Complex* signalDatas, uint numChannels,
-                         uint signalLength);
+  void ComputeSignalsFFT(uint numChannels, uint signalLength);
 
   // 需预先计算sequence的fft
   // 需预先计算signals的fft
@@ -48,15 +47,19 @@ class CUDACorrelation {
 #ifdef USE_PEEKSKERNEL
   uint* h_vecPeaks = nullptr;
   uint* d_vecPeaks = nullptr;
-  uint* d_vecSeqLen = nullptr;
-#else
-  CUDAComplex* cpu_results = nullptr;
 #endif
 
+  CUDAComplex* h_signals = nullptr;
+  CUDAComplex* h_sequence = nullptr;
+  CUDAComplex* h_results = nullptr;
+
   CUDAComplex* d_signals = nullptr;
   CUDAComplex* d_sequence = nullptr;
   CUDAComplex* d_results = nullptr;
 
+  CUDAComplex* d_signals_fft = nullptr;
+  CUDAComplex* d_sequence_fft = nullptr;
+
   uint seqChannels_ = 0;
   uint fftLength_ = 0;
   uint sequenceSize_ = 0;
@@ -68,15 +71,25 @@ class CUDACorrelation {
 #endif
   uint signalsSize_ = 0;  // signalsSize_ = signalsNum_ * sizeof(CUDAComplex)
 
+  uint vecPeaksSize_ = 0;
+  uint resultsSize_ = 0;
+
+  // GPU设备是否可以进行内存map
+  int canMapHostMemory = 0;
+
   // 保存每个sequence的原始长度
-  std::vector<uint> vecSequenceLength_;
-  std::vector<std::vector<CUDAComplex>> vecSequenceFFT_;
+  uint* h_vecSeqLen = nullptr;
+  uint* d_vecSeqLen = nullptr;
 
  private:
-  cudaStream_t stream_;
+  cudaStream_t stream_ = cudaStreamDefault;
   cudaDeviceProp deviceProp_;
-  cufftHandle fftPlan_ = 0;
+
+  cufftHandle seqfftPlan_ = 0;
+  cufftHandle sigfftPlan_ = 0;
   cufftHandle ifftPlan_ = 0;
+
+  // 共轭乘核函数线程数配置
   dim3 block_;
   dim3 grid_;
 
diff --git a/mainwindow.cpp b/mainwindow.cpp
index 00e5290..b9f433f 100644
--- a/mainwindow.cpp
+++ b/mainwindow.cpp
@@ -16,12 +16,7 @@ MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent) {
   InitConnect();
 }
 
-MainWindow::~MainWindow() {
-  if (signalDatas_) {
-    free(signalDatas_);
-    signalDatas_ = nullptr;
-  }
-}
+MainWindow::~MainWindow() {}
 
 void MainWindow::InitControlValues() {
   m_btnCalculate = new QPushButton(QStringLiteral("加载数据计算"), this);
@@ -35,10 +30,8 @@ void MainWindow::InitConnect() {
   connect(m_btnCalculate, SIGNAL(clicked()), this, SLOT(SlotCalculateClick()));
 }
 
-int MainWindow::CalculateRoutine(const cpuComplex *signalDatas,
-                                 uint numChannels, uint signalLength) {
-  return m_calMC.CalMovingCorrlationRoutine(signalDatas, numChannels,
-                                            signalLength);
+int MainWindow::CalculateRoutine(uint signalChannels, uint signalLength) {
+  return m_calMC.CalMovingCorrlationRoutine(signalChannels, signalLength);
 }
 
 void MainWindow::SlotCalculateClick() {
@@ -67,11 +60,11 @@ void MainWindow::SlotCalculateClick() {
                        m_ReplayfilesizeDetect);
 
   // 初始化：读取已知序列
-  m_calMC.LoadAllSequenceBin(basePath, SamplePoint_);
+  m_calMC.LoadAllSequenceBin(basePath, signalLength_);
 
 #if defined(USE_CUDA)
   // 初始化：提前计算完所有Sequence的fft
-  m_calMC.ComputeAllSequence(SamplePoint_);
+  m_calMC.ComputeAllSequence(signalLength_);
 #endif
 
   int m_iframeCnt = m_vecReplayHeadposDetect.size();
@@ -132,7 +125,8 @@ void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath,
 
   char *buff = new char[Replayfilesize];
   replayfileforcalculate.read(buff, Replayfilesize);
-  SamplePoint_ = m_droneIQParse.GetSamplePoint(buff);
+  signalLength_ = m_droneIQParse.GetSamplePoint(buff);
+  signalChannels_ = m_droneIQParse.GetChannelNumber(buff);
 
   std::string source(buff, Replayfilesize);
   string match(cHeader, 4);
@@ -157,35 +151,31 @@ void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath,
 }
 
 void MainWindow::ReplayIQDataParse(char *buf) {
-  uint SamplePoints = m_droneIQParse.GetSamplePoint(buf);
-  if (SamplePoints > 0) {
-    uint channelnumber =
-        m_droneIQParse.GetChannelNumber(buf);  // 8->16 16->32 32->64
+  if (signalLength_ > 0) {
+    if (signalChannels_ == 32) {
+      uint channelnumber = 8;  //原逻辑也是只取了前8个通道
 
-    if (channelnumber == 32) {
-      channelnumber = 8;  //原逻辑也是只取了前8个通道
-
-      // malloc申请空间，需手动free，防止内存泄漏（在析构函数中free）
       if (signalDatas_ == nullptr) {
-        signalDatas_ = (cpuComplex *)malloc(channelnumber * SamplePoints *
-                                            sizeof(cpuComplex));
-        if (signalDatas_ == nullptr) {
-          std::cerr << __FUNCTION__ << " Memory allocation failed!"
-                    << std::endl;
+        // 申请零拷贝内存，自动完成CPU内存与GPU显存数据同步
+        if (!m_calMC.cudaCorrelation->AllocMappMemory(
+                (void **)&(m_calMC.cudaCorrelation->h_signals),
+                (void **)&(m_calMC.cudaCorrelation->d_signals),
+                channelnumber * signalLength_ * sizeof(cpuComplex))) {
+          std::cerr << __FUNCTION__ << " AllocMappMemory failed." << std::endl;
           return;
         }
-        memset(signalDatas_, 0,
-               channelnumber * SamplePoints * sizeof(cpuComplex));
+
+        signalDatas_ = (cpuComplex *)m_calMC.cudaCorrelation->h_signals;
       }
 
-      m_droneIQParse.ResolveIQData(buf, SamplePoints, channelnumber,
+      m_droneIQParse.ResolveIQData(buf, signalLength_, channelnumber,
                                    signalDatas_);
 
       QElapsedTimer tm;
       tm.start();
       // 每帧 SamplePoints 个点 IQ 输入
       // 计算总流程 获得最终结果 1--找到相关峰 0--未找到相关峰
-      int result = CalculateRoutine(signalDatas_, channelnumber, SamplePoints);
+      int result = CalculateRoutine(channelnumber, signalLength_);
 
       std::cout << __FUNCTION__ << " result:" << result
                 << " tm(ns):" << tm.nsecsElapsed() << std::endl;
diff --git a/mainwindow.h b/mainwindow.h
index 3742949..a63a935 100644
--- a/mainwindow.h
+++ b/mainwindow.h
@@ -26,11 +26,13 @@ class MainWindow : public QMainWindow {
   void InitConnect();
 
   // 计算总流程
-  int CalculateRoutine(const cpuComplex *signalDatas, uint numChannels,
-                       uint signalLength);
+  // 不需要传signalDatas参数，因采用零拷贝，数据已经传到显存了
+  int CalculateRoutine(uint signalChannels, uint signalLength);
 
+  // 零拷贝：signalDatas_被填充之后，数据会自动同步到显存中
   cpuComplex *signalDatas_ = nullptr;
-  uint SamplePoint_;
+  uint signalChannels_ = 0;
+  uint signalLength_ = 0;
 
  private:
   // 获取测试数据文件中 每一帧数据的帧头下标
-- 
Gitee