From 9c900049cce8d9d9c891b663227150e7583047bc Mon Sep 17 00:00:00 2001
From: wsqRichards <229242333@qq.com>
Date: Fri, 12 Dec 2025 18:27:06 +0800
Subject: [PATCH 01/27] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E9=87=8D=E9=87=87?=
 =?UTF-8?q?=E6=A0=B7=E7=9B=B8=E5=85=B3=E7=9A=84cuda=E5=87=BD=E6=95=B0?=
 =?UTF-8?q?=E5=AE=9E=E7=8E=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CMakeLists.txt    |   2 +-
 cuda_resample.cu  | 710 ++++++++++++++++++++++++++++++++++++++++++++++
 cuda_resample.h   |  50 ++++
 upfirdn_device.cu | 330 +++++++++++++++++++++
 upfirdn_device.h  |  46 +++
 5 files changed, 1137 insertions(+), 1 deletion(-)
 create mode 100644 cuda_resample.cu
 create mode 100644 cuda_resample.h
 create mode 100644 upfirdn_device.cu
 create mode 100644 upfirdn_device.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ad2858..969d97d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,7 +56,7 @@ if(USE_CUDA)
 endif()
 
 # debug OR release mode
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wdeprecated-declarations -fPIC -std=c++11 -pthread -pipe")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wdeprecated-declarations -fPIC -std=c++17 -pthread -pipe")
 if (NOT CMAKE_BUILD_TYPE)
   set(_CMAKE_BUILD_TYPE_LOWER "release")
 else()
diff --git a/cuda_resample.cu b/cuda_resample.cu
new file mode 100644
index 0000000..5238cb8
--- /dev/null
+++ b/cuda_resample.cu
@@ -0,0 +1,710 @@
+#include "cuda_resample.h"
+#include "upfirdn_device.h"
+
+// CHECK_CUDA_ERROR: on a failed CUDA call, log it, run Cleanup() and throw.
+#define CHECK_CUDA_ERROR(call)                                                \
+  do {                                                                        \
+    const cudaError_t err_ = (call);                                          \
+    if (err_ != cudaSuccess) {                                                \
+      std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":"       \
+                << __LINE__ << ": " << cudaGetErrorString(err_) << std::endl; \
+      Cleanup();                                                              \
+      throw std::runtime_error("CUDA error");                                 \
+    }                                                                         \
+  } while (0)
+
+// Cosine for device code: cosf for float, double-precision cos for any
+// other T (the double result is converted back to T on return).
+template <typename T>
+__device__ __forceinline__ T dev_cos(T x) {
+  if constexpr (!std::is_same_v<T, float>) {
+    // double and all remaining types share the double-precision path
+    return cos(static_cast<double>(x));
+  } else {
+    return cosf(x);
+  }
+}
+
+// Sine for device code: sinf for float, double-precision sin for any other
+// T (the double result is converted back to T on return).
+template <typename T>
+__device__ __forceinline__ T dev_sin(T x) {
+  if constexpr (!std::is_same_v<T, float>) {
+    // double and all remaining types share the double-precision path
+    return sin(static_cast<double>(x));
+  } else {
+    return sinf(x);
+  }
+}
+
+// Absolute value for device code: fabsf for float, double-precision fabs for
+// any other T (the double result is converted back to T on return).
+template <typename T>
+__device__ __forceinline__ T dev_abs(T x) {
+  if constexpr (!std::is_same_v<T, float>) {
+    // double and all remaining types share the double-precision path
+    return fabs(static_cast<double>(x));
+  } else {
+    return fabsf(x);
+  }
+}
+
+inline int quotientCeil(int num1, int num2) {
+  // Truncated quotient, bumped by one whenever the division is inexact.
+  return num1 / num2 + ((num1 % num2 != 0) ? 1 : 0);
+}
+
+// Integer division rounded up, for device code (same rule as quotientCeil).
+// Uses / and % directly: div()/div_t from <cstdlib> are host-only functions.
+__device__ __forceinline__ int dev_quotientCeil(int num1, int num2) {
+  return num1 / num2 + (num1 % num2 != 0);
+}
+
+// Greatest common divisor by Euclid's algorithm, callable from device code.
+__device__ __forceinline__ int dev_gcd(int a, int b) {
+  // Replace (a, b) by (b, a % b) until the remainder vanishes.
+  for (int rem; b != 0; a = b, b = rem) {
+    rem = a % b;
+  }
+  // With b == 0 the answer is a (this also covers b == 0 on entry).
+  return a;
+}
+
+// Fills data[0..size) with the increasing sequence start, start + 1, ...
+template <typename T>
+__device__ __forceinline__ void dev_iota(T* data, int size, T start) {
+  // Element idx is computed directly as start + T(idx), not as a running sum.
+  for (int idx = 0; idx < size; ++idx) {
+    data[idx] = start + T(idx);
+  }
+}
+
+// Sets every element of data[0..size) to value.
+template <typename T>
+__device__ __forceinline__ void dev_fill(T* data, int size, T value) {
+  // A non-positive size leaves the buffer untouched.
+  for (int idx = 0; idx < size; ++idx) {
+    data[idx] = value;
+  }
+}
+
+// Least-squares linear-phase FIR design; writes length + 1 taps to result.
+// freq (band edges, halved in place) and amplitude (gain at each edge) hold
+// freqSize entries each; consecutive pairs of entries describe one band, and
+// the two edges of a band must differ (their difference is a divisor).
+// The temporaries live on the device heap, so the heap must be large enough.
+// Returns 0 on success, -1 if a heap allocation fails.
+template <typename T>
+__device__ int dev_firls(T* result, int length, T* freq, const T* amplitude,
+                         int freqSize) {
+  // One weight per band (pair of edges); every band is weighted equally.
+  int weightSize = freqSize / 2;
+  T* weight = new T[weightSize];
+  if (weight == nullptr) {
+    return -1;
+  }
+  dev_fill(weight, weightSize, T(1.0));
+
+  // Halve the band edges; note that this modifies the caller's array.
+  for (int i = 0; i < freqSize; i++) {
+    freq[i] = freq[i] / T(2.0);
+  }
+
+  // From here on, length is the half-length (taps - 1) / 2.
+  int filterLength = length + 1;
+  length = (filterLength - 1) / 2;
+
+  // Odd or even number of taps.
+  bool Nodd = filterLength & 1;
+
+  // k holds the harmonic indices: 0.5, 1.5, ... for an even tap count and
+  // 1, 2, ... for an odd one (index 0 is handled separately through b0).
+  int kLength = length + 1;
+  T* k = new T[kLength];
+  if (k == nullptr) {
+    delete[] weight;  // do not leak the earlier allocation
+    return -1;
+  }
+  dev_iota(k, kLength, T(0.0));
+
+  if (!Nodd) {
+    for (int i = 0; i < kLength; i++) {
+      k[i] += T(0.5);
+    }
+  } else {
+    // Drop k[0] by shifting left. The loop stops one short of kLength so
+    // that the read of k[i + 1] stays inside the allocation.
+    for (int i = 0; i + 1 < kLength; i++) {
+      k[i] = k[i + 1];
+    }
+    kLength--;
+  }
+
+  // For an odd tap count b (and a) get one extra slot, because b0 is
+  // inserted at b[0] after the band loop.
+  int bLength = kLength;
+  if (Nodd) {
+    bLength++;
+  }
+  T* b = new T[bLength];
+  T* a = new T[bLength];
+  if (b == nullptr || a == nullptr) {
+    // delete[] on a null pointer is a no-op, so release all four.
+    delete[] weight;
+    delete[] k;
+    delete[] b;
+    delete[] a;
+    return -1;
+  }
+  dev_fill(b, bLength, T(0.0));
+
+  T b0 = T(0.0);
+  const T twoPi = T(2.0) * T(M_PI);
+  for (int i = 0; i < freqSize; i += 2) {
+    T Fi = freq[i];
+    T Fip1 = freq[i + 1];
+    T ampi = amplitude[i];
+    T ampip1 = amplitude[i + 1];
+    T wt2 = weight[i / 2] * weight[i / 2];
+    T m_s = (ampip1 - ampi) / (Fip1 - Fi);  // slope of the desired response
+    T b1 = ampi - (m_s * Fi);                // and its intercept
+
+    if (Nodd) {
+      // The band weight scales the whole integral, not only the slope term.
+      b0 += (b1 * (Fip1 - Fi) + m_s / T(2.0) * (Fip1 * Fip1 - Fi * Fi)) * wt2;
+    }
+
+    for (int j = 0; j < kLength; j++) {
+      T kj = k[j];
+      T w = twoPi * kj;  // kj >= 0.5 in both cases, so w is never zero
+      // Slope part: m / (4 pi^2) * (cos(w F2) - cos(w F1)) / k^2.
+      b[j] += (m_s / (twoPi * twoPi) *
+               (dev_cos(w * Fip1) - dev_cos(w * Fi)) / (kj * kj)) *
+              wt2;
+      // Level part: F * sinc(2 k F) is written as sin(w F) / w, which needs
+      // no special case at F = 0.
+      b[j] += ((m_s * Fip1 + b1) * dev_sin(w * Fip1) -
+               (m_s * Fi + b1) * dev_sin(w * Fi)) /
+              w * wt2;
+    }
+  }
+
+  // Odd tap count: shift b right by one and put b0 in front.
+  if (Nodd) {
+    for (int i = kLength; i > 0; i--) {
+      b[i] = b[i - 1];
+    }
+    b[0] = b0;
+  }
+
+  // a = 4 * w0^2 * b; the first half of result is a in reverse order.
+  T w0 = weight[0];
+  int aLength = bLength;
+  for (int i = 0; i < aLength; i++) {
+    a[i] = w0 * w0 * T(4.0) * b[i];
+    result[aLength - 1 - i] = a[i];
+  }
+
+  // For an odd tap count a[0] is the centre tap and is not repeated;
+  // for an even one the whole of a is appended.
+  int it = Nodd ? 1 : 0;
+
+  // Second half: a in forward order; every tap is scaled by 0.5.
+  for (int i = 0; i < aLength; i++) {
+    result[i] = result[i] * T(0.5);
+    if ((i + it) < aLength) {
+      result[aLength + i] = a[i + it] * T(0.5);
+    }
+  }
+
+  // Release the temporaries.
+  delete[] weight;
+  delete[] k;
+  delete[] b;
+  delete[] a;
+  return 0;
+}
+
+// Modified Bessel function I_n(x), n >= 0: sum_k (x/2)^(2k+n) / (k! (k+n)!).
+template <typename T>
+__device__ T dev_cyl_bessel_i(int n, T x) {
+  const T half = x / T(2);
+  T term = T(1);
+  for (int i = 1; i <= n; ++i) term *= half / T(i);  // (x/2)^n / n!
+  T sum = term;
+  for (int k = 1; k <= 500 && sum + term != sum; ++k)  // until terms vanish
+    sum += (term *= half * half / (T(k) * T(k + n)));
+  return sum;
+}
+
+// Kaiser window of `order` points with shape parameter bta:
+// window[n] = I0(bta * sqrt(1 - ((n - c) / c)^2)) / I0(bta), c = (order-1)/2.
+template <typename T>
+__device__ void dev_kaiser(T* window, int order, T bta) {
+  const T od2 = (order - T(1)) / T(2);
+  const T Denominator = dev_cyl_bessel_i(0, bta);
+  for (int n = 0; n < order; n++) {
+    // A one-point window has c == 0; use r = 0 there instead of 0 / 0.
+    const T r = (od2 == T(0)) ? T(0) : (n - od2) / od2;
+    const T x = bta * sqrt(T(1) - r * r);
+    window[n] = dev_cyl_bessel_i(0, x) / Denominator;
+  }
+}
+
+/**
+ * Rational resampling of inputSignal by upFactor / downFactor: designs a
+ * Kaiser-windowed least-squares low-pass filter, runs upfirdn_device and
+ * copies the delay-compensated samples to outputSignal.
+ *
+ * @tparam T           sample and coefficient type
+ * @param upFactor     interpolation factor (> 0; reduced by the GCD here)
+ * @param downFactor   decimation factor (> 0; reduced by the GCD here)
+ * @param inputSignal  inputSize input samples (read only)
+ * @param inputSize    number of input samples
+ * @param outputSignal receives ceil(inputSize * upFactor / downFactor)
+ *                     samples (inputSize samples when the reduced factors
+ *                     are equal)
+ *
+ * All temporaries come from the device heap. If a factor is not positive,
+ * an allocation fails or the filter design fails, the function returns
+ * without writing outputSignal.
+ */
+template <typename T>
+__device__ void dev_resample(int upFactor, int downFactor, const T* inputSignal,
+                             int inputSize, T* outputSignal) {
+  const int n = 10;      // filter half-length per unit of max(up, down)
+  const T bta = T(5.0);  // Kaiser window shape parameter
+  if (upFactor <= 0 || downFactor <= 0) {
+    return;
+  }
+
+  int gcd_o = dev_gcd(upFactor, downFactor);
+  upFactor /= gcd_o;
+  downFactor /= gcd_o;
+
+  if (upFactor == downFactor) {
+    // Same rate: copy the samples. Assigning the pointer would only change
+    // the local parameter (and drops const), leaving the output untouched.
+    for (int i = 0; i < inputSize; i++) {
+      outputSignal[i] = inputSignal[i];
+    }
+    return;
+  }
+
+  // Output length of the rate change, rounded up.
+  int outputSize = dev_quotientCeil(inputSize * upFactor, downFactor);
+
+  int maxFactor = max(upFactor, downFactor);
+  T firlsFreq = T(1.0) / (T(2.0) * static_cast<T>(maxFactor));
+
+  // Band edges [0, 2f] and [2f, 1] with desired gains 1 and 0.
+  T firlsFreqsV[4] = {T(0.0), T(2.0) * firlsFreq, T(2.0) * firlsFreq, T(1.0)};
+  T firlsAmplitudeV[4] = {T(1.0), T(1.0), T(0.0), T(0.0)};
+  int freqSize = 4;
+
+  // The tap count is odd, so the design has a single centre tap.
+  int length = 2 * n * maxFactor + 1;
+  int coefficientsLength = length;
+
+  // Leading zeros: make the filter delay a whole number of output samples.
+  int lengthHalf = (length - 1) / 2;
+  int nzFront = downFactor - lengthHalf % downFactor;
+  // hSize is the real filter length so far (zeros + taps). It must match
+  // what is passed to upfirdn_device, otherwise too few samples come out.
+  int hSize = nzFront + coefficientsLength;
+  lengthHalf += nzFront;
+  int delay = lengthHalf / downFactor;
+
+  // Trailing zeros: flush until outputSize samples exist past the delay.
+  int nzBack = 0;
+  while (dev_quotientCeil((inputSize - 1) * upFactor + hSize + nzBack,
+                          downFactor) -
+             delay <
+         outputSize) {
+    nzBack++;
+  }
+  // The buffer is sized only now, so the trailing zeros always fit.
+  int filterLength = hSize + nzBack;
+
+  // Number of samples upfirdn_device writes for this filter and input.
+  int paddedCoefCount = filterLength;
+  while (paddedCoefCount % upFactor) {
+    paddedCoefCount++;
+  }
+  int coefsPerPhase = paddedCoefCount / upFactor;
+  int padding = coefsPerPhase - 1;
+  int outputCount =
+      ((inputSize + padding) * upFactor + downFactor - 1) / downFactor;
+
+  // All scratch buffers are allocated together and released together.
+  T* coefficients = new T[coefficientsLength];
+  T* window = new T[length];
+  T* filter = new T[filterLength];
+  T* results = new T[outputCount];
+  bool ok = coefficients != nullptr && window != nullptr &&
+            filter != nullptr && results != nullptr;
+
+  if (ok) {
+    ok = dev_firls(coefficients, length - 1, firlsFreqsV, firlsAmplitudeV,
+                   freqSize) == 0;
+  }
+
+  if (ok) {
+    dev_kaiser(window, length, bta);
+
+    // filter = [nzFront zeros | upFactor * window * taps | nzBack zeros]
+    for (int i = 0; i < nzFront; i++) {
+      filter[i] = T(0.0);
+    }
+    for (int i = 0; i < coefficientsLength; i++) {
+      filter[nzFront + i] = coefficients[i] * (upFactor * window[i]);
+    }
+    for (int i = 0; i < nzBack; i++) {
+      filter[hSize + i] = T(0.0);
+    }
+
+    // upfirdn_device copies its input and never writes through it; the cast
+    // keeps its S1 template parameter non-const so it can allocate S1[].
+    int resultsCount = 0;
+    upfirdn_device(upFactor, downFactor, const_cast<T*>(inputSignal),
+                   inputSize, filter, filterLength, results, &resultsCount);
+
+    // Skip the filter delay; never read past what upfirdn reported.
+    for (int i = 0; i < outputSize && i + delay < resultsCount; i++) {
+      outputSignal[i] = results[i + delay];
+    }
+  }
+
+  // Single exit: delete[] accepts null, so partial failures leak nothing.
+  delete[] coefficients;
+  delete[] window;
+  delete[] filter;
+  delete[] results;
+}
+
+/**
+ * Frequency-shift and resample kernel. Each thread handles one pair of
+ * (detection result r, channel c); its global index is r * numChannels + c.
+ * @param MapChannelandIdata  I samples, signalLength per channel, back to back
+ * @param MapChannelandQdata  Q samples, same layout
+ * @param VDownFactor         down-sampling factor of each result
+ * @param VFrequency          frequency of each result
+ * @param numResultPerFrame   number of results
+ * @param numChannels         number of channels
+ * @param signalLength        number of samples per channel
+ * @param CurrentRealfreq     current frequency, same unit as VFrequency
+ * @param outputIdata         resampled I, ordered by result, then by channel
+ * @param outputQdata         resampled Q, same layout as outputIdata
+ */
+template <typename T>
+__global__ void ShiftingAndResamplingKernel(
+    const T* __restrict__ MapChannelandIdata,
+    const T* __restrict__ MapChannelandQdata,
+    const int* __restrict__ VDownFactor, const T* __restrict__ VFrequency,
+    const int numResultPerFrame, const int numChannels, const int signalLength,
+    const T CurrentRealfreq, T* __restrict__ outputIdata,
+    T* __restrict__ outputQdata) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= numChannels * numResultPerFrame) return;
+
+  int FrameIdx = idx / numChannels;
+  int chIdx = idx % numChannels;
+
+  const T sampling_rate = T(245.76e6);
+  T frequency = VFrequency[FrameIdx];
+  T deltaFreq = (CurrentRealfreq - frequency) * 1e6;  // shift, scaled by 1e6
+
+  // Plain pointers: a reference cannot bind to these temporary values.
+  const T* I_orig = MapChannelandIdata + chIdx * signalLength;
+  const T* Q_orig = MapChannelandQdata + chIdx * signalLength;
+
+  // Rational resampling with up factor 1 and the per-result down factor.
+  int upFactor = 1;
+  int downFactor = VDownFactor[FrameIdx];
+  if (downFactor <= 0) return;  // nothing sensible to compute or to size
+
+  // Output offset: the blocks of all earlier results come first, so that
+  // results with different down factors do not overwrite each other.
+  size_t outBase = 0;
+  for (int r = 0; r < FrameIdx; r++) {
+    if (VDownFactor[r] <= 0) continue;  // such results write nothing
+    outBase += static_cast<size_t>(numChannels) *
+               dev_quotientCeil(signalLength * upFactor, VDownFactor[r]);
+  }
+  int outputSizePerData = dev_quotientCeil(signalLength * upFactor, downFactor);
+  T* I_resampled = outputIdata + outBase + chIdx * outputSizePerData;
+  T* Q_resampled = outputQdata + outBase + chIdx * outputSizePerData;
+
+  // Device-heap scratch; the heap limit must cover it for every thread.
+  T* I_shifted = new T[signalLength];
+  T* Q_shifted = new T[signalLength];
+  if (I_shifted != nullptr && Q_shifted != nullptr) {
+    // Mix with a complex oscillator at deltaFreq.
+    for (int i = 0; i < signalLength; i++) {
+      T phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+      T cosVal = dev_cos(phase);
+      T sinVal = dev_sin(phase);
+      I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+      Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
+    }
+    dev_resample(upFactor, downFactor, I_shifted, signalLength, I_resampled);
+    dev_resample(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+  }
+  // delete[] accepts null, so a partial allocation is released as well.
+  delete[] I_shifted;
+  delete[] Q_shifted;
+}
+// Per-channel detection handler; launches the resampling kernel (see notes).
+void SlotRecvDLDetectResult(frameBox& val, char* frame, int channelNum,
+                            float CurrentRealfreq) {
+  QString strObjname = sender()->objectName();
+  int channel = strObjname.replace("RadarDet", "").toInt();
+  // NOTE(review): sender()/emit/m_* imply this is meant to be a class member.
+  qDebug() << __FUNCTION__ << "strObjname" << strObjname;
+  qDebug() << __FUNCTION__ << "val.boxes.size()" << val.boxes.size();
+
+  // Indices of the boxes with a unique X1
+  QVector<int> uniqueIndex = FindUniqueBox(val.boxes);
+
+  if (uniqueIndex.empty()) {  // TODO: how should the empty case be handled?
+    float maxConfidence = FindMaxConfidence(val.boxes);
+    m_mapChannelMaxConfidence[channel] = maxConfidence;
+  }
+
+  // The signal is 524288 samples long; the bandwidth is 245.76 MHz
+  double rbw = 245.76 / 524288.00;
+
+  // Locate the positions in the source data
+  for (int index : std::as_const(uniqueIndex)) {
+    if (val.boxes[index].x1 < 0) val.boxes[index].x1 = 0;
+
+    if (val.boxes[index].y1 < 0) val.boxes[index].y1 = 0;
+
+    if (val.boxes[index].x2 < 0) val.boxes[index].x2 = 0;
+
+    if (val.boxes[index].y2 < 0) val.boxes[index].y2 = 0;
+
+    int StartPint = (DLBoxCoordinateTransfer(val.boxes[index].y1) - 1) * 1024 +
+                    DLBoxCoordinateTransfer(val.boxes[index].x1) - 1;
+    int StopPoint = (DLBoxCoordinateTransfer(val.boxes[index].y2) - 1) * 1024 +
+                    DLBoxCoordinateTransfer(val.boxes[index].x2) - 1;
+
+    int centerfreqPos = (StopPoint + StartPint) / 2;
+    float startfreq = CurrentRealfreq - 122.88;
+    double MHzPerImageWidthPix = 245.76 / 640.0;
+
+    float detectfreq =
+        startfreq + ((val.boxes[index].x1 + val.boxes[index].x2) / 2.0) *
+                        MHzPerImageWidthPix;  // MHz
+    double Bandwidth = (val.boxes[index].x2 - val.boxes[index].x1) *
+                       MHzPerImageWidthPix;  // MHz
+
+    YOLOFreqBandValue tempchannelDetectRes;
+    tempchannelDetectRes.startIndex = StartPint;
+    tempchannelDetectRes.stopIndex = StopPoint;
+    tempchannelDetectRes.bandwidth = Bandwidth;
+    tempchannelDetectRes.frequency = detectfreq;  // labelled Hz; computed in MHz
+
+    m_MultiMapSingleChannelDetect.insert(index, tempchannelDetectRes);
+  }
+
+  // Split the current channel's data into I and Q
+  QVector<short> IData, QData;
+  DataSplit(frame, IData, QData);
+
+  MapChannelandIdata[channel] = IData;
+  MapChannelandQdata[channel] = QData;
+
+  if (MapChannelandQdata.size() != 8) return;
+
+  // Start processing the data
+  // Find the channel that contains the most boxes
+  bool ifmultiPicBox = false;  // whether one detection found several boxes
+  int maxMultiPicBoxNum = 1;      // number of boxes in that case
+  int maxMultiPicBoxChannel = 0;  // channel on which they were found
+
+  for (auto itr = m_MultiMapSingleChannelDetect.begin(),
+            itrend = m_MultiMapSingleChannelDetect.end();
+       itr != itrend; itr++) {
+    if (m_MultiMapSingleChannelDetect.values(itr.key()).size() >
+        maxMultiPicBoxNum) {
+      maxMultiPicBoxNum = m_MultiMapSingleChannelDetect.values().size();
+      // NOTE(review): values() has no key here, unlike the test above — confirm.
+      ifmultiPicBox = true;
+
+      maxMultiPicBoxChannel = itr.key();
+    }
+  }
+
+  // If there are several boxes:
+  // cut every channel at the start/stop positions of the channel with most boxes.
+  // If all have one box: use the start/stop of the highest-confidence channel.
+  std::map<qint64, FreqBandValue> FinalDetectChannelFreqIndex;
+
+  std::vector<std::vector<std::vector<double>>> CutWholeIdata;
+  std::vector<std::vector<std::vector<double>>> CutWholeQdata;
+  std::vector<int> cutstartIndex;
+  std::vector<int> cutstopIndex;
+  std::vector<double> detectFreq;
+  std::vector<double> detectBandwidth;
+
+  if (ifmultiPicBox) {
+    QList<YOLOFreqBandValue> PicBoxList =
+        m_MultiMapSingleChannelDetect.values(maxMultiPicBoxChannel);
+
+    for (int maxi = 0; maxi < PicBoxList.count(); maxi++) {
+      FreqBandValue FBV;
+      FBV.startIndex = PicBoxList.at(maxi).startIndex;
+      FBV.stopIndex = PicBoxList.at(maxi).stopIndex;
+      FBV.bandwidth = PicBoxList.at(maxi).bandwidth;
+      FBV.frequency = PicBoxList.at(maxi).frequency;  // Hz
+      FinalDetectChannelFreqIndex[FBV.frequency] = FBV;
+
+      m_vecOneFrameDetectResult.push_back(FinalDetectChannelFreqIndex);
+    }
+  } else {  // find the channel with the highest confidence
+    QList<float> maxconfidencelist = m_mapChannelMaxConfidence.values();
+    float MaxConfidence =
+        *std::max_element(maxconfidencelist.begin(), maxconfidencelist.end());
+    // NOTE(review): this dereferences end() if maxconfidencelist is empty.
+    int maxchannel = 0;
+    for (auto it = m_mapChannelMaxConfidence.begin();
+         it != m_mapChannelMaxConfidence.end(); ++it) {
+      if (it.value() == MaxConfidence) {
+        maxchannel = it.key();
+      }
+    }
+
+    auto maxChannelInfo = m_MultiMapSingleChannelDetect.find(maxchannel);
+    // NOTE(review): the find() result is used without an end() check.
+    FreqBandValue FBV;
+    FBV.startIndex = maxChannelInfo.value().startIndex;
+    FBV.stopIndex = maxChannelInfo.value().stopIndex;
+    FBV.bandwidth = maxChannelInfo.value().bandwidth;
+    FBV.frequency = maxChannelInfo.value().frequency;  // Hz
+    FinalDetectChannelFreqIndex[FBV.frequency] = FBV;
+
+    m_vecOneFrameDetectResult.push_back(FinalDetectChannelFreqIndex);
+  }
+
+  qDebug() << __FUNCTION__ << "m_vecOneFrameDetectResult"
+           << m_vecOneFrameDetectResult.size();
+
+  // int detectindex = 0;
+  // for (const auto& [freq, fbv] : m_vecOneFrameDetectResult.back()) {
+  // }
+
+  int upFactor = 1;
+  int outputTotalLength = 0;
+  std::vector<int> downFactor;
+
+  for (const auto& [freq, fbv] : m_vecOneFrameDetectResult.back()) {
+    double bandwidth = fbv.bandwidth * 1e6;
+    int decimation = 0;
+    if (std::abs(bandwidth - 40e6) < 2 * 1e6) {
+      decimation = 4;
+    } else if (std::abs(bandwidth - 20e6) < 2 * 1e6) {
+      decimation = 8;
+    } else if (std::abs(bandwidth - 10e6) < 2 * 1e6) {
+      decimation = 16;
+    } else {
+      // Bandwidth is not one of the supported values; skip it
+      qDebug() << __FUNCTION__ << "else  --- 不符合 ";
+      continue;
+    }
+
+    downFactor.push_back(decimation);
+    detectFreq.push_back(fbv.frequency * 1e6);
+    detectBandwidth.push_back(bandwidth);
+    // NOTE(review): signalLength is declared only further down in this function.
+    outputTotalLength += quotientCeil(signalLength * upFactor, decimation);
+  }
+
+  // ========== replaces the for loop above ==========
+  // ShiftingAndResamplingKernel launch
+  int numChannels = CHANNEL_COUNT;
+  int numResult = m_vecOneFrameDetectResult.size();
+
+  // Allocate GPU memory for the detection results and copy them over
+  int* d_downFactor = nullptr;
+  double* d_frequency = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResult * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResult * sizeof(double))));
+  // Copy the data into that memory
+  for (int i = 0; i < numResult; i++) {
+    void* dst_downFactor = d_downFactor + i * sizeof(int);
+    const void* src_downFactor = downFactor[i].data();
+    // NOTE(review): d_downFactor is int*; "+ i * sizeof(int)" scales twice.
+    CHECK_CUDA_ERROR(cudaMemcpy(dst_downFactor, src_downFactor, i * sizeof(int),
+                                cudaMemcpyHostToDevice));
+    // NOTE(review): the copy above sends i * sizeof(int) bytes (0 when i == 0).
+    void* dst_frequency = d_frequency + i * sizeof(double);
+    const void* src_frequency = detectFreq[i].data();
+    // NOTE(review): detectFreq[i] is a double, so .data() looks wrong — confirm.
+    CHECK_CUDA_ERROR(cudaMemcpy(dst_frequency, src_frequency,
+                                i * sizeof(double), cudaMemcpyHostToDevice));
+  }
+
+  // Allocate GPU memory for the original I/Q data and copy it over
+  int signalLength = MapChannelandIdata[0].value.size();
+  double* d_Idata = nullptr;
+  double* d_Qdata = nullptr;
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(double))));
+  // NOTE(review): IData/QData above are QVector<short> — confirm element size.
+  // Copy every channel's data to GPU memory
+  size_t copySize = signalLength * sizeof(double);
+  for (int i = 0; i < numChannels; i++) {
+    // copy the original I data to GPU memory
+    double* dst_idata = d_Idata + i * signalLength;
+    const void* src_idata = MapChannelandIdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
+    // NOTE(review): d_Qdata is double*, but dst_qdata is cufftDoubleComplex*.
+    // copy the original Q data to GPU memory
+    cufftDoubleComplex* dst_qdata = d_Qdata + i * signalLength;
+    const void* src_qdata = MapChannelandQdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
+  }
+  // NOTE(review): outputTotalLength is already declared earlier in this scope.
+  // Total length of the resampled signals, used to size the GPU buffers
+  int outputTotalLength =
+      ComputeOutputLength(m_vecOneFrameDetectResult, signalLength);
+  // Allocate GPU memory for the resampled output
+  double* d_outputIdata = nullptr;
+  double* d_outputQdata = nullptr;
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_outputIdata, (outputTotalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_outputQdata, (outputTotalLength * sizeof(double))));
+  // NOTE(review): numResultPerFrame is not declared in the visible code.
+  // Launch configuration
+  dim3 block(numChannels);
+  dim3 grid((numChannels * numResultPerFrame + block.x - 1) / block.x);
+  // NOTE(review): no launch-error check, copy-back or cudaFree follows.
+  ShiftingAndResamplingKernel<<<grid, block>>>(
+      d_Idata, d_Qdata, d_downFactor, d_frequency, numResultPerFrame,
+      numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
+  // =======================
+
+  MapChannelandIdata.clear();
+  MapChannelandQdata.clear();
+
+  m_MultiMapSingleChannelDetect.clear();
+
+  m_vecOneFrameDetectResult.clear();
+
+  // clear the boxes
+  val.boxes.clear();
+
+  qDebug() << __FUNCTION__
+           << "Emit -------------------m_strSamplePoint.toInt()--------"
+           << m_strSamplePoint.toInt();
+
+  emit SignalEmitCalculateMovingData_afterDetect(
+      m_strSamplePoint.toInt(), CHANNEL_COUNT, CutWholeIdata, CutWholeQdata,
+      detectFreq, detectBandwidth);
+}
diff --git a/cuda_resample.h b/cuda_resample.h
new file mode 100644
index 0000000..c60d74b
--- /dev/null
+++ b/cuda_resample.h
@@ -0,0 +1,50 @@
+#ifndef CUDA_RESAMPLE_H
+#define CUDA_RESAMPLE_H
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <thrust/device_vector.h>
+
+#include <cmath>
+#include <map>
+#include <type_traits>
+#include <vector>
+
+#ifndef M_PI
+#define M_PI 3.141592653589793238462643
+#endif
+// Per-band record. NOTE(review): field meanings below are inferred from names.
+template <typename T>
+struct FreqBandValue_k {
+  T frequency;     // presumably the band frequency — unit not stated here
+  T bandwidth;     // presumably the band width — unit not stated here
+  T maxvalue;
+  int startIndex;  // presumably first sample index of the band — confirm
+  int stopIndex;   // presumably last sample index of the band — confirm
+  T ebn0;          // presumably Eb/N0 — confirm
+};
+
+/**
+ * Frequency-shift and resample kernel (defined in cuda_resample.cu).
+ *
+ * @param MapChannelandIdata  original I data
+ * @param MapChannelandQdata  original Q data
+ * @param VDownFactor         down-sampling factor, indexed per result
+ * @param VFrequency          frequency, indexed per result
+ * @param numResultPerFrame   number of results per frame
+ * @param numChannels         number of signal channels
+ * @param signalLength        signal length
+ * @param CurrentRealfreq     current actual frequency
+ * @param outputIdata, outputQdata  resampled I/Q; caller provides enough room
+ * @return void
+ */
+template <typename T>
+__global__ void ShiftingAndResamplingKernel(
+    const T* __restrict__ MapChannelandIdata,
+    const T* __restrict__ MapChannelandQdata,
+    const int* __restrict__ VDownFactor, const T* __restrict__ VFrequency,
+    const int numResultPerFrame, const int numChannels, const int signalLength,
+    const T CurrentRealfreq, T* __restrict__ outputIdata,
+    T* __restrict__ outputQdata);
+
+#endif  // CUDA_RESAMPLE_H
diff --git a/upfirdn_device.cu b/upfirdn_device.cu
new file mode 100644
index 0000000..a040a56
--- /dev/null
+++ b/upfirdn_device.cu
@@ -0,0 +1,330 @@
+#include <cuda_runtime.h>
+
+#include "upfirdn_device.h"
+
+// Device-side resampler setup: stores the parameters in *state and allocates
+// a zeroed history buffer of coefsPerPhase - 1 samples on the device heap.
+template <class S1, class C>
+__device__ void resampler_init_state_device(DeviceResamplerState<S1, C> *state,
+                                            C *transposedCoefs,
+                                            int coefsPerPhase, int upRate,
+                                            int downRate) {
+  state->_t = 0;
+  state->_xOffset = 0;
+  state->_transposedCoefs = transposedCoefs;
+  state->_coefsPerPhase = coefsPerPhase;
+  state->_upRate = upRate;
+  state->_downRate = downRate;
+
+  // Callers must check state->_state: it stays null if the allocation fails.
+  state->_state = new S1[coefsPerPhase - 1];
+  if (state->_state == nullptr) return;  // never zero-fill a null buffer
+  for (int i = 0; i < coefsPerPhase - 1; i++) {
+    state->_state[i] = 0;
+  }
+}
+
+// Device side: number of output samples produced by feeding inCount inputs
+// to the resampler in its current state.
+template <class S1, class S2, class C>
+__device__ int resampler_needed_out_count_device(
+    int inCount, DeviceResamplerState<S1, C> *state) {
+  const int upsampled = inCount * state->_upRate;
+  const int whole = upsampled / state->_downRate;
+  const int leftover = upsampled % state->_downRate;
+  // Position of the next output, counted in upsampled samples.
+  const int phase = state->_t + state->_upRate * state->_xOffset;
+
+  // One extra output when that position lies inside the leftover part.
+  return (phase < leftover) ? whole + 1 : whole;
+}
+
+// Device side: filters in[0..inCount) into out; returns the sample count.
+template <class S1, class S2, class C>
+__device__ int resampler_apply_device(S1 *in, int inCount, S2 *out,
+                                      int outCount,
+                                      DeviceResamplerState<S1, C> *state) {
+  if (outCount < resampler_needed_out_count_device<S1, S2, C>(inCount, state)) {
+    // Device code cannot throw, so report the short buffer as -1.
+    return -1;
+  }
+
+  // x points at the most recent input sample being processed.
+  S1 *x = in + state->_xOffset;
+  S2 *y = out;
+  S1 *end = in + inCount;
+
+  while (x < end) {
+    S2 acc = 0;
+    C *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase;
+    S1 *xPtr = x - state->_coefsPerPhase + 1;
+
+    int offset = in - xPtr;
+    if (offset > 0) {
+      // The window starts before in[0]: take those samples from _state.
+      S1 *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset;
+
+      while (statePtr < state->_state + (state->_coefsPerPhase - 1)) {
+        acc += (*statePtr++) * (*h++);
+      }
+
+      xPtr += offset;
+    }
+
+    while (xPtr <= x) {
+      acc += (*xPtr++) * (*h++);
+    }
+
+    *y++ = acc;
+    state->_t += state->_downRate;
+
+    int advanceAmount = state->_t / state->_upRate;
+    x += advanceAmount;
+    state->_t %= state->_upRate;
+  }
+
+  state->_xOffset = x - end;
+
+  // Update the _state history buffer for the next call.
+  int retain = (state->_coefsPerPhase - 1) - inCount;
+
+  if (retain > 0) {
+    // Short input: move the newest `retain` history samples to the front,
+    for (int i = 0; i < retain; i++) {
+      state->_state[i] =
+          state->_state[(state->_coefsPerPhase - 1) - retain + i];
+    }
+
+    // then append the whole (short) input after them.
+    for (int i = 0; i < inCount; i++) {
+      state->_state[retain + i] = in[i];
+    }
+  } else {
+    // Store the last input samples themselves (dereferenced, not addresses).
+    for (int i = 0; i < state->_coefsPerPhase - 1; i++) {
+      state->_state[i] = *(end - (state->_coefsPerPhase - 1) + i);
+    }
+  }
+
+  // Number of samples written to out.
+  return y - out;
+}
+
+// Device side: releases the history buffer held by *state and clears the
+// pointer, so that a second call does nothing.
+template <class S1, class C>
+__device__ void resampler_free_state_device(
+    DeviceResamplerState<S1, C> *state) {
+  if (state->_state == nullptr) return;  // nothing allocated
+  delete[] state->_state;
+  state->_state = nullptr;
+}
+
+// Device side: rearranges the filter into upRate polyphase branches.
+// Branch i occupies transposedCoefs[i * coefsPerPhase ...] and holds
+// coefs[i], coefs[i + upRate], ... in reverse order; missing taps are zero.
+template <class C>
+__device__ void transpose_filter_coefs_device(C *transposedCoefs, C *coefs,
+                                              int upRate, int coefCount,
+                                              int coefsPerPhase) {
+  const int total = upRate * coefsPerPhase;
+  for (int idx = 0; idx < total; idx++) {
+    transposedCoefs[idx] = 0;
+  }
+
+  for (int phase = 0; phase < upRate; ++phase) {
+    C *branch = transposedCoefs + phase * coefsPerPhase;
+    for (int tap = 0; tap < coefsPerPhase; ++tap) {
+      const int src = tap * upRate + phase;
+      if (src < coefCount) branch[coefsPerPhase - 1 - tap] = coefs[src];
+    }
+  }
+}
+
+// Device-side upfirdn: upsample by upRate, FIR-filter, downsample by downRate.
+// Writes the outputs to results and, when resultsCount is not null, their
+// number to *resultsCount; the count stays 0 (and results is untouched) if
+// a buffer allocated here is unavailable. results must have room for
+// ceil((inLength + coefsPerPhase - 1) * upRate / downRate) samples, where
+// coefsPerPhase is filterLength / upRate rounded up.
+template <class S1, class S2, class C>
+__device__ void upfirdn_device(int upRate, int downRate, S1 *input,
+                               int inLength, C *filter, int filterLength,
+                               S2 *results, int *resultsCount) {
+  if (resultsCount != nullptr) {
+    *resultsCount = 0;
+  }
+
+  // Pad the coefficient count up to a multiple of upRate.
+  int paddedCoefCount = filterLength;
+  while (paddedCoefCount % upRate) {
+    paddedCoefCount++;
+  }
+  int coefsPerPhase = paddedCoefCount / upRate;
+
+  // The input is followed by coefsPerPhase - 1 zeros to flush the filter.
+  int padding = coefsPerPhase - 1;
+
+  // Device-heap buffers; all of them are released at the end.
+  C *transposedCoefs = new C[paddedCoefCount];
+  S1 *inputPadded = new S1[inLength + padding];
+  DeviceResamplerState<S1, C> state;
+  resampler_init_state_device(&state, transposedCoefs, coefsPerPhase, upRate,
+                              downRate);
+
+  // Only run when every buffer exists; the cleanup below handles nulls.
+  if (transposedCoefs != nullptr && inputPadded != nullptr &&
+      state._state != nullptr) {
+    transpose_filter_coefs_device(transposedCoefs, filter, upRate,
+                                  filterLength, coefsPerPhase);
+
+    for (int i = 0; i < inLength + padding; i++) {
+      inputPadded[i] = (i < inLength) ? input[i] : S1(0);
+    }
+
+    int resultsCountValue = resampler_needed_out_count_device<S1, S2, C>(
+        inLength + padding, &state);
+
+    int numSamplesComputed = resampler_apply_device<S1, S2, C>(
+        inputPadded, inLength + padding, results, resultsCountValue, &state);
+
+    // Report what was actually written (apply returns -1 on a size mismatch).
+    if (resultsCount != nullptr && numSamplesComputed > 0) {
+      *resultsCount = numSamplesComputed;
+    }
+  }
+
+  // delete[] accepts null pointers.
+  delete[] transposedCoefs;
+  delete[] inputPadded;
+  resampler_free_state_device(&state);
+}
+
+// Overload without an output count: forwards with a null resultsCount.
+template <class S1, class S2, class C>
+__device__ void upfirdn_device(int upRate, int downRate, S1 *input,
+                               int inputLength, C *filter, int filterLength,
+                               S2 *results) {
+  upfirdn_device<S1, S2, C>(upRate, downRate, input, inputLength, filter,
+                            filterLength, results, static_cast<int *>(nullptr));
+}
+
+// CUDA kernel: each thread (not each block) runs one independent upfirdn
+// job, chosen by its global thread index. outputLengths is not used here.
+template <class S1, class S2, class C>
+__global__ void upfirdn_kernel_batch(int upRate, int downRate, S1 **inputs,
+                                     int *inputLengths, C **filters,
+                                     int *filterLengths, S2 **outputs,
+                                     int *outputLengths, int batchSize) {
+  const int job = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Only threads that map to a job do any work.
+  if (job < batchSize) {
+    // Per-job arguments.
+    S1 *jobInput = inputs[job];
+    const int jobInLength = inputLengths[job];
+    C *jobFilter = filters[job];
+    const int jobFilterLength = filterLengths[job];
+    S2 *jobOutput = outputs[job];
+
+    // Seven-argument overload: no output count is reported.
+    upfirdn_device<S1, S2, C>(upRate, downRate, jobInput, jobInLength,
+                              jobFilter, jobFilterLength, jobOutput);
+  }
+}
+
+// Device-side upfirdn that works entirely in caller-provided buffers (no
+// device-heap allocation). With P = filterLength rounded up to a multiple
+// of upRate and cpp = P / upRate, the buffers must hold:
+//   transposedCoefsBuffer: P, stateBuffer: cpp - 1,
+//   inputPaddedBuffer: inLength + cpp - 1,
+//   results: ceil((inLength + cpp - 1) * upRate / downRate).
+template <class S1, class S2, class C>
+__device__ void upfirdn_device_optimized(
+    int upRate, int downRate, S1 *input, int inLength, C *filter,
+    int filterLength, S2 *results,
+    C *transposedCoefsBuffer,  // preallocated transposed-coefficient buffer
+    S1 *stateBuffer,           // preallocated state (history) buffer
+    S1 *inputPaddedBuffer) {   // preallocated padded-input buffer
+
+  // Pad the coefficient count up to a multiple of upRate.
+  int paddedCoefCount = filterLength;
+  while (paddedCoefCount % upRate) {
+    paddedCoefCount++;
+  }
+  int coefsPerPhase = paddedCoefCount / upRate;
+
+  // Transpose the filter: zero everything, then store each phase reversed.
+  for (int i = 0; i < upRate * coefsPerPhase; i++) {
+    transposedCoefsBuffer[i] = 0;
+  }
+
+  for (int i = 0; i < upRate; ++i) {
+    for (int j = 0; j < coefsPerPhase; ++j) {
+      if (j * upRate + i < filterLength) {
+        transposedCoefsBuffer[(coefsPerPhase - 1 - j) + i * coefsPerPhase] =
+            filter[j * upRate + i];
+      }
+    }
+  }
+
+  // Set up the resampler state.
+  DeviceResamplerState<S1, C> state;
+  state._t = 0;
+  state._xOffset = 0;
+  state._transposedCoefs = transposedCoefsBuffer;
+  state._coefsPerPhase = coefsPerPhase;
+  state._upRate = upRate;
+  state._downRate = downRate;
+  state._state = stateBuffer;
+
+  // The history starts out as zeros.
+  for (int i = 0; i < coefsPerPhase - 1; i++) {
+    state._state[i] = 0;
+  }
+
+  // Copy the input and append coefsPerPhase - 1 zeros.
+  int padding = coefsPerPhase - 1;
+  for (int i = 0; i < inLength + padding; i++) {
+    if (i < inLength) {
+      inputPaddedBuffer[i] = input[i];
+    } else {
+      inputPaddedBuffer[i] = 0;
+    }
+  }
+
+  // Apply the polyphase filter.
+  S1 *in = inputPaddedBuffer;
+  S2 *out = results;
+  S1 *end = in + inLength + padding;
+
+  // x must live outside the loop: when it was re-derived from the (never
+  // updated) offset on every pass, the advance below was lost and the loop
+  // could not terminate.
+  S1 *x = in + state._xOffset;
+  while (x < end) {
+    S2 acc = 0;
+    C *h = transposedCoefsBuffer + state._t * coefsPerPhase;
+    S1 *xPtr = x - coefsPerPhase + 1;
+
+    int offset = in - xPtr;
+    if (offset > 0) {
+      // Samples before the start of the input come from the history buffer.
+      S1 *statePtr = state._state + (coefsPerPhase - 1) - offset;
+      while (statePtr < state._state + (coefsPerPhase - 1)) {
+        acc += (*statePtr++) * (*h++);
+      }
+      xPtr += offset;
+    }
+
+    while (xPtr <= x) {
+      acc += (*xPtr++) * (*h++);
+    }
+
+    *out++ = acc;
+    state._t += downRate;
+    int advanceAmount = state._t / upRate;
+    x += advanceAmount;
+    state._t %= upRate;
+  }
+}
\ No newline at end of file
diff --git a/upfirdn_device.h b/upfirdn_device.h
new file mode 100644
index 0000000..8d5de6d
--- /dev/null
+++ b/upfirdn_device.h
@@ -0,0 +1,46 @@
+#ifndef UPFIRDN_DEVICE_H
+#define UPFIRDN_DEVICE_H
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+
+// Device-side resampler state structure
+template <class S1, class C>
+struct DeviceResamplerState {
+  int _t;               // "time" (modulo upRate)
+  int _xOffset;         // input offset
+  S1 *_state;           // pointer to the state buffer
+  C *_transposedCoefs;  // pointer to the transposed coefficients
+  int _coefsPerPhase;   // number of coefficients per phase
+  int _upRate;          // upsampling rate
+  int _downRate;        // downsampling rate
+};
+
+// Device-side function declarations
+template <class S1, class S2, class C>
+__device__ int resampler_apply_device(S1 *in, int inCount, S2 *out,
+                                      int outCount,
+                                      DeviceResamplerState<S1, C> *state);
+
+template <class S1, class S2, class C>
+__device__ int resampler_needed_out_count_device(
+    int inCount, DeviceResamplerState<S1, C> *state);
+
+template <class S1, class C>
+__device__ void resampler_init_state_device(DeviceResamplerState<S1, C> *state,
+                                            C *transposedCoefs,
+                                            int coefsPerPhase, int upRate,
+                                            int downRate);
+
+// Device-side upfirdn overloads (with and without the resultsCount parameter)
+template <class S1, class S2, class C>
+__device__ void upfirdn_device(int upRate, int downRate, S1 *input,
+                               int inLength, C *filter, int filterLength,
+                               S2 *results, int *resultsCount);
+
+template <class S1, class S2, class C>
+__device__ void upfirdn_device(int upRate, int downRate, S1 *input,
+                               int inputLength, C *filter, int filterLength,
+                               S2 *results);
+
+#endif  // UPFIRDN_DEVICE_H
\ No newline at end of file
-- 
Gitee


From 5c0cf7e383de3d6f8c8adde45288024474340d3b Mon Sep 17 00:00:00 2001
From: wsqRichards <229242333@qq.com>
Date: Mon, 15 Dec 2025 16:30:17 +0800
Subject: [PATCH 02/27] =?UTF-8?q?=E5=AE=8C=E5=96=84resample=20=E6=A0=B8?=
 =?UTF-8?q?=E5=87=BD=E6=95=B0=E5=8F=8A=E7=9B=B8=E5=85=B3=E4=BB=A3=E7=A0=81?=
 =?UTF-8?q?=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: wsqRichards <229242333@qq.com>
---
 cuda_resample.cu | 441 +++++++++++++++++++++--------------------------
 cuda_resample.h  |  35 ++--
 2 files changed, 216 insertions(+), 260 deletions(-)

diff --git a/cuda_resample.cu b/cuda_resample.cu
index 5238cb8..ffdfc18 100644
--- a/cuda_resample.cu
+++ b/cuda_resample.cu
@@ -8,11 +8,18 @@
     if (err != cudaSuccess) {                                                \
       std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":"      \
                 << __LINE__ << ": " << cudaGetErrorString(err) << std::endl; \
-      Cleanup();                                                             \
       throw std::runtime_error("CUDA error");                                \
     }                                                                        \
   } while (0)
 
+#define LOG_INFO(fmt, ...) /* printf: valid in host and device code */     \
+  printf("[INFO] %s:%d (%s) " fmt, __FILE__, __LINE__, __func__,            \
+         ##__VA_ARGS__)
+
+#define LOG_ERROR(fmt, ...) /* fprintf/stderr cannot be used on device */  \
+  printf("[ERROR] %s:%d (%s) " fmt, __FILE__, __LINE__, __func__,           \
+         ##__VA_ARGS__)
+
 // 余弦函数
 template <typename T>
 __device__ __forceinline__ T dev_cos(T x) {
@@ -49,11 +56,6 @@ __device__ __forceinline__ T dev_abs(T x) {
   }
 }
 
-inline int quotientCeil(int num1, int num2) {
-  if (num1 % num2 != 0) return num1 / num2 + 1;
-  return num1 / num2;
-}
-
 // 整数向上取整除法
 __device__ __forceinline__ int dev_quotientCeil(int num1, int num2) {
   div_t result = div(num1, num2);
@@ -100,7 +102,7 @@ __device__ int dev_firls(T* result, int length, T* freq, const T* amplitude,
     return -1;
   }
 
-  // 初始化weight
+  // 初始化weight为全1
   dev_fill(weight, weightSize, T(1.0));
 
   // 处理频率向量
@@ -199,6 +201,7 @@ __device__ int dev_firls(T* result, int length, T* freq, const T* amplitude,
     return -1;
   };
 
+  // vector<T> result = {a.rbegin(), a.rend()};
   for (int i = 0; i < aLength; i++) {
     a[i] = pow(w0, T(2.0)) * T(4.0) * b[i];
     result[aLength - 1 - i] = a[i];
@@ -251,14 +254,13 @@ __device__ void dev_kaiser(T* window, int order, T bta) {
   }
 }
 
-/**
- *
- */
 template <typename T>
-__device__ void dev_resample(int upFactor, int downFactor, const T* inputSignal,
-                             int inputSize, T* outputSignal) {
+__device__ void dev_resample(const int upFactor, const int downFactor,
+                             const T* inputSignal, const int inputSize,
+                             T* outputSignal) {
   const int n = 10;
   const T bta = T(5.0);
+
   if (upFactor <= 0 || downFactor <= 0) {
     return;
   }
@@ -275,8 +277,8 @@ __device__ void dev_resample(int upFactor, int downFactor, const T* inputSignal,
 
   int outputSize = dev_quotientCeil(inputSize * upFactor, downFactor);
 
-  int maxFactor = max(upFactor, downFactor);
-  T firlsFreq = T(1.0) / (T(2.0) * static_cast<T>(maxFactor));
+  int maxFactor = (upFactor > downFactor) ? upFactor : downFactor;
+  T firlsFreq = T(1.0) / T(2.0) / static_cast<T>(maxFactor);
 
   T firlsFreqsV[4];
   firlsFreqsV[0] = T(0.0);
@@ -293,6 +295,7 @@ __device__ void dev_resample(int upFactor, int downFactor, const T* inputSignal,
   int freqSize = 4;
   int length = 2 * n * maxFactor + 1;
   int coefficientsLength = length;
+
   T* coefficients = new T[coefficientsLength];
   if (coefficients == nullptr) {
     return;
@@ -300,6 +303,7 @@ __device__ void dev_resample(int upFactor, int downFactor, const T* inputSignal,
   int ret = dev_firls(coefficients, length - 1, firlsFreqsV, firlsAmplitudeV,
                       freqSize);
   if (ret == -1) {
+    LOG_ERROR("dev_firls调用失败\n");
     return;
   }
 
@@ -383,40 +387,48 @@ __device__ void dev_resample(int upFactor, int downFactor, const T* inputSignal,
 }
 
 /**
+ * ShiftingAndResamplingKernel
  * 重采样核函数：完成原始信号的移频，重采样等计算
+ * 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
+ * 因此共 numChannels * numResults 个线程并行计算
  *
- * @param MapChannelandIdata：原始Idata
- * @param MapChannelandQdata：原始Qdata
- * @param vecOneFrameDetectResult
- * @param numResultPerFrame：每帧的结果数
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param VDownFactor：下采样率
+ * @param VFrequency：频率
+ * @param VOutputLength：每个检测结果的重采样输出信号长度
+ * @param numResults：每帧的检测结果总数
  * @param numChannels：信号通道数
- * @param signalLength：信号长度
+ * @param signalLength：每个通道的信号长度
  * @param CurrentRealfreq：当前实际频率
- * @param outputsIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
- * @param outputsQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
+ * @param outputTotalLength：重采样后的输出信号总的长度，
+ * @param outputIdata：存放所有重采样后的Idata，调用时需确保显存空间足够
+ * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
  * @return void
  */
 template <typename T>
 __global__ void ShiftingAndResamplingKernel(
-    const T* __restrict__ MapChannelandIdata,
-    const T* __restrict__ MapChannelandQdata,
+    const T* __restrict__ origIdata, const T* __restrict__ origQdata,
     const int* __restrict__ VDownFactor, const T* __restrict__ VFrequency,
-    const int numResultPerFrame, const int numChannels, const int signalLength,
-    const T CurrentRealfreq, T* __restrict__ outputIdata,
+    const T* __restrict__ VOutputLength, const int numResults,
+    const int numChannels, const int signalLength, const T CurrentRealfreq,
+    const int outputTotalLength, T* __restrict__ outputIdata,
     T* __restrict__ outputQdata) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= numChannels * numResultPerFrame) return;
+  if (idx >= numChannels * numResults) return;
 
-  int FrameIdx = idx / numChannels;
-  int chIdx = idx % numChannels;
+  // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
+  int ResIdx = idx / numChannels;  // 第几个检测结果
+  int chIdx = idx % numChannels;   // 第几个通道
 
   const T sampling_rate = T(245.76e6);
 
-  T frequency = VFrequency[FrameIdx];
+  T frequency = VFrequency[ResIdx];  // 频率
   T deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
-  auto& I_orig = MapChannelandIdata + chIdx * signalLength;
-  auto& Q_orig = MapChannelandQdata + chIdx * signalLength;
+  // 获取当前线程处理的通道数据地址
+  auto& I_orig = origIdata + chIdx * signalLength;
+  auto& Q_orig = origQdata + chIdx * signalLength;
 
   // 移频：生成本振信号并相乘
   T* I_shifted = new T[signalLength];
@@ -435,16 +447,19 @@ __global__ void ShiftingAndResamplingKernel(
     Q_shifted[i] = Q_orig[i] * dev_cosVal + I_orig[i] * sinVal;
   }
 
-  // 使用有理重采样替换原来的滤波+降采样
-  // 上采样因子为1，下采样因子为decimation
+  // 上采样因子为1，下采样因子为downFactor
   int upFactor = 1;
-  int downFactor = VDownFactor[FrameIdx];
+  int downFactor = VDownFactor[ResIdx];
+
+  // 获取当前检测结果对应的输出信号长度（这里假设的是每个输出信号长度可能不相同）
+  int outputLength = VOutputLength[ResIdx];
 
-  // outputsIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  // outputsQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  int outputSizePerData = dev_quotientCeil(signalLength * upFactor, downFactor);
-  auto& I_resampled = outputIdata + chIdx * outputSizePerData;
-  auto& Q_resampled = outputQdata + chIdx * outputSizePerData;
+  // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
+  auto& I_resampled =
+      outputIdata + ResIdx * outputLength + chIdx * signalLength;
+  // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
+  auto& Q_resampled =
+      outputQdata + ResIdx * outputLength + chIdx * signalLength;
 
   // 重采样
   dev_resample(upFactor, downFactor, I_shifted, signalLength, I_resampled);
@@ -455,151 +470,58 @@ __global__ void ShiftingAndResamplingKernel(
   delete[] Q_shifted;
 }
 
-void SlotRecvDLDetectResult(frameBox& val, char* frame, int channelNum,
-                            float CurrentRealfreq) {
-  QString strObjname = sender()->objectName();
-  int channel = strObjname.replace("RadarDet", "").toInt();
-
-  qDebug() << __FUNCTION__ << "strObjname" << strObjname;
-  qDebug() << __FUNCTION__ << "val.boxes.size()" << val.boxes.size();
-
-  // 获取独一无二的X1的下标
-  QVector<int> uniqueIndex = FindUniqueBox(val.boxes);
-
-  if (uniqueIndex.empty()) {  // 如果为空 如何处理
-    float maxConfidence = FindMaxConfidence(val.boxes);
-    m_mapChannelMaxConfidence[channel] = maxConfidence;
-  }
-
-  // 信号的数据长度 524288  带宽为 245.76Mhz
-  double rbw = 245.76 / 524288.00;
-
-  // 定位源数据位置
-  for (int index : std::as_const(uniqueIndex)) {
-    if (val.boxes[index].x1 < 0) val.boxes[index].x1 = 0;
-
-    if (val.boxes[index].y1 < 0) val.boxes[index].y1 = 0;
-
-    if (val.boxes[index].x2 < 0) val.boxes[index].x2 = 0;
-
-    if (val.boxes[index].y2 < 0) val.boxes[index].y2 = 0;
-
-    int StartPint = (DLBoxCoordinateTransfer(val.boxes[index].y1) - 1) * 1024 +
-                    DLBoxCoordinateTransfer(val.boxes[index].x1) - 1;
-    int StopPoint = (DLBoxCoordinateTransfer(val.boxes[index].y2) - 1) * 1024 +
-                    DLBoxCoordinateTransfer(val.boxes[index].x2) - 1;
-
-    int centerfreqPos = (StopPoint + StartPint) / 2;
-    float startfreq = CurrentRealfreq - 122.88;
-    double MHzPerImageWidthPix = 245.76 / 640.0;
-
-    float detectfreq =
-        startfreq + ((val.boxes[index].x1 + val.boxes[index].x2) / 2.0) *
-                        MHzPerImageWidthPix;  // Mhz
-    double Bandwidth = (val.boxes[index].x2 - val.boxes[index].x1) *
-                       MHzPerImageWidthPix;  // Mhz
-
-    YOLOFreqBandValue tempchannelDetectRes;
-    tempchannelDetectRes.startIndex = StartPint;
-    tempchannelDetectRes.stopIndex = StopPoint;
-    tempchannelDetectRes.bandwidth = Bandwidth;
-    tempchannelDetectRes.frequency = detectfreq;  // Hz
-
-    m_MultiMapSingleChannelDetect.insert(index, tempchannelDetectRes);
-  }
-
-  // 当前通道的数据 拆分为IQ
-  QVector<short> IData, QData;
-  DataSplit(frame, IData, QData);
-
-  MapChannelandIdata[channel] = IData;
-  MapChannelandQdata[channel] = QData;
-
-  if (MapChannelandQdata.size() != 8) return;
-
-  // 数据开始处理
-  // 找到 包含box最多的 那个通道
-  bool ifmultiPicBox = false;  //  是否某一次检测 对于某一个通道检出多个框
-  int maxMultiPicBoxNum = 1;      // 检出多个框的个数
-  int maxMultiPicBoxChannel = 0;  // 检出多个框的通道
-
-  for (auto itr = m_MultiMapSingleChannelDetect.begin(),
-            itrend = m_MultiMapSingleChannelDetect.end();
-       itr != itrend; itr++) {
-    if (m_MultiMapSingleChannelDetect.values(itr.key()).size() >
-        maxMultiPicBoxNum) {
-      maxMultiPicBoxNum = m_MultiMapSingleChannelDetect.values().size();
-
-      ifmultiPicBox = true;
-
-      maxMultiPicBoxChannel = itr.key();
-    }
-  }
-
-  // 如果有多个框  那么
-  // 处理数据截取则按照最多个的位置对应的开始结束位置对于其他进行截取
-  // 全部只有一个时，找到最大置信度 对于所有数据按照最大的开始结束位置截取
-  std::map<qint64, FreqBandValue> FinalDetectChannelFreqIndex;
-
-  std::vector<std::vector<std::vector<double>>> CutWholeIdata;
-  std::vector<std::vector<std::vector<double>>> CutWholeQdata;
-  std::vector<int> cutstartIndex;
-  std::vector<int> cutstopIndex;
-  std::vector<double> detectFreq;
-  std::vector<double> detectBandwidth;
-
-  if (ifmultiPicBox) {
-    QList<YOLOFreqBandValue> PicBoxList =
-        m_MultiMapSingleChannelDetect.values(maxMultiPicBoxChannel);
-
-    for (int maxi = 0; maxi < PicBoxList.count(); maxi++) {
-      FreqBandValue FBV;
-      FBV.startIndex = PicBoxList.at(maxi).startIndex;
-      FBV.stopIndex = PicBoxList.at(maxi).stopIndex;
-      FBV.bandwidth = PicBoxList.at(maxi).bandwidth;
-      FBV.frequency = PicBoxList.at(maxi).frequency;  // Hz
-      FinalDetectChannelFreqIndex[FBV.frequency] = FBV;
-
-      m_vecOneFrameDetectResult.push_back(FinalDetectChannelFreqIndex);
-    }
-  } else {  // 找最大置信度的通道
-    QList<float> maxconfidencelist = m_mapChannelMaxConfidence.values();
-    float MaxConfidence =
-        *std::max_element(maxconfidencelist.begin(), maxconfidencelist.end());
-
-    int maxchannel = 0;
-    for (auto it = m_mapChannelMaxConfidence.begin();
-         it != m_mapChannelMaxConfidence.end(); ++it) {
-      if (it.value() == MaxConfidence) {
-        maxchannel = it.key();
-      }
-    }
-
-    auto maxChannelInfo = m_MultiMapSingleChannelDetect.find(maxchannel);
-
-    FreqBandValue FBV;
-    FBV.startIndex = maxChannelInfo.value().startIndex;
-    FBV.stopIndex = maxChannelInfo.value().stopIndex;
-    FBV.bandwidth = maxChannelInfo.value().bandwidth;
-    FBV.frequency = maxChannelInfo.value().frequency;  // Hz
-    FinalDetectChannelFreqIndex[FBV.frequency] = FBV;
-
-    m_vecOneFrameDetectResult.push_back(FinalDetectChannelFreqIndex);
-  }
-
-  qDebug() << __FUNCTION__ << "m_vecOneFrameDetectResult"
-           << m_vecOneFrameDetectResult.size();
-
-  // int detectindex = 0;
-  // for (const auto& [freq, fbv] : m_vecOneFrameDetectResult.back()) {
-  // }
+inline int quotientCeil(int num1, int num2) {
+  // Quotient of num1 / num2, bumped by one whenever a remainder is left over.
+  return num1 / num2 + ((num1 % num2 != 0) ? 1 : 0);
+}
 
-  int upFactor = 1;
-  int outputTotalLength = 0;
+/**
+ * ShiftAndResampleDetectResult
+ * 重采样函数：完成原始信号的移频，重采样等计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param vecOneFrameDetectResult：检测结果数据
+ * @param numChannels：信号通道数
+ * @param signalLength：每个通道的信号长度
+ * @param CurrentRealfreq：当前实际频率
+ * @param outputTotalLength：重采样后输出信号的总长度
+ * @param
+ * outputIdata：重采样后的Idata，格式：[numResults][numChannels][lengthPerResult]
+ * @param
+ * outputQdata：重采样后的Qdata，格式：[numResults][numChannels][lengthPerResult]
+ * @return void
+ */
+template <typename T>
+bool ShiftAndResampleDetectResult(
+    const std::vector<std::vector<T>>& origIdata,
+    const std::vector<std::vector<T>>& origQdata,
+    const std::vector<std::map<int64_t, FreqBandValue<T>>>&
+        vecOneFrameDetectResult,
+    const int numChannels, const float CurrentRealfreq, T* outputIdata,
+    T* outputQdata) {
+  // 每个通道的信号长度：这里假设所有通道的长度是相同的
+  int signalLength = origIdata[0].size();
+
+  // 检测结果数量
+  int numResults = vecOneFrameDetectResult.size();
+
+  int upFactor = 1;           // 上采样率，默认为1
+  int outputTotalLength = 0;  // 保存总的输出信号长度
+
+  // 每个检测加过对应的输出信号长度（这里假设输出长度不相同）
+  std::vector<int> outputLength;
+
+  // 上采样率
   std::vector<int> downFactor;
-
-  for (const auto& [freq, fbv] : m_vecOneFrameDetectResult.back()) {
-    double bandwidth = fbv.bandwidth * 1e6;
+  // 检测结果的频率
+  std::vector<T> detectFreq;
+  // 检测结果的带宽
+  std::vector<T> detectBandwidth;
+
+  // 根据检测结果，初始化相关变量或者vector
+  for (const auto& [freq, fbv] : vecOneFrameDetectResult.back()) {
+    T bandwidth = fbv.bandwidth * 1e6;
     int decimation = 0;
     if (std::abs(bandwidth - 40e6) < 2 * 1e6) {
       decimation = 4;
@@ -609,102 +531,133 @@ void SlotRecvDLDetectResult(frameBox& val, char* frame, int channelNum,
       decimation = 16;
     } else {
       // 带宽不符合要求，跳过处理
-      qDebug() << __FUNCTION__ << "else  --- 不符合 ";
+      std::cout << __FUNCTION__ << ":带宽 " << bandwidth << "--- 不符合 ";
       continue;
     }
 
     downFactor.push_back(decimation);
-    detectFreq.push_back(fbv.frequency * 1e6);
+    detectFreq.push_back(fbv.frequency);
     detectBandwidth.push_back(bandwidth);
 
-    outputTotalLength += quotientCeil(signalLength * upFactor, decimation);
-  }
-
-  // ==========替换上面的for循环=============
-  // ShiftingAndResamplingKernel 核函数调用
-  int numChannels = CHANNEL_COUNT;
-  int numResult = m_vecOneFrameDetectResult.size();
-
-  // 申请DetectResult的GPU显存，并将数据copy到该显存中
-  int* d_downFactor = nullptr;
-  double* d_frequency = nullptr;
-  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResult * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResult * sizeof(double))));
-  // 将数据copy到该显存中
-  for (int i = 0; i < numResult; i++) {
-    void* dst_downFactor = d_downFactor + i * sizeof(int);
-    const void* src_downFactor = downFactor[i].data();
-
-    CHECK_CUDA_ERROR(cudaMemcpy(dst_downFactor, src_downFactor, i * sizeof(int),
-                                cudaMemcpyHostToDevice));
-
-    void* dst_frequency = d_frequency + i * sizeof(double);
-    const void* src_frequency = detectFreq[i].data();
+    // 每个带宽,重采样后的输出信号长度
+    int length = quotientCeil(signalLength * upFactor, decimation);
+    outputLength.push_back(length);
 
-    CHECK_CUDA_ERROR(cudaMemcpy(dst_frequency, src_frequency,
-                                i * sizeof(double), cudaMemcpyHostToDevice));
+    // 重采样后输出信号的总长度
+    outputTotalLength += length;
   }
 
-  // 申请原始的idata和qdata的GPU显存，并将数据copy到GPU显存中
-  int signalLength = MapChannelandIdata[0].value.size();
-  double* d_Idata = nullptr;
-  double* d_Qdata = nullptr;
+  // ====准备调用重采样核函数=====
+  // copy下采样率，频率等数据到显存中
+  int* d_downFactor = nullptr;
+  int* d_outputLength = nullptr;
+  T* d_frequency = nullptr;
+  // 申请显存
+  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(T))));
+
+  // copy下采样率到显存中
+  const int* src_downFactor = downFactor.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // copy频率到显存中
+  const T* src_frequency = detectFreq.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
+                              numResults * sizeof(T), cudaMemcpyHostToDevice));
+
+  // copy每个带宽，重采样后输出信号长度到显存中
+  const int* src_outputLength = outputLength.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, src_outputLength,
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
+  T* d_Idata = nullptr;
+  T* d_Qdata = nullptr;
   CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(double))));
+      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(T))));
   CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(double))));
+      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(T))));
 
   // 将所有通道数据循环拷贝到GPU显存
-  size_t copySize = signalLength * sizeof(double);
+  size_t copySize = signalLength * sizeof(T);
   for (int i = 0; i < numChannels; i++) {
     // copy 原始的idata 到gpu显存
-    double* dst_idata = d_Idata + i * signalLength;
-    const void* src_idata = MapChannelandIdata[i].data();
+    T* dst_idata = d_Idata + i * signalLength;
+    const void* src_idata = origIdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
 
     // copy 原始的qdata 到gpu显存
-    cufftDoubleComplex* dst_qdata = d_Qdata + i * signalLength;
-    const void* src_qdata = MapChannelandQdata[i].data();
+    T* dst_qdata = d_Qdata + i * signalLength;
+    const void* src_qdata = origQdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
   }
 
-  // 计算重采样后信号的总长度，用于申请GPU显存
-  int outputTotalLength =
-      ComputeOutputLength(m_vecOneFrameDetectResult, signalLength);
   // 申请重采样后输出信号的GPU显存
-  double* d_outputIdata = nullptr;
-  double* d_outputQdata = nullptr;
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_outputIdata, (outputTotalLength * sizeof(double))));
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_outputQdata, (outputTotalLength * sizeof(double))));
+  T* d_outputIdata = nullptr;
+  T* d_outputQdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata,
+                              (numChannels * outputTotalLength * sizeof(T))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
+                              (numChannels * outputTotalLength * sizeof(T))));
 
   // 线程数配置
   dim3 block(numChannels);
-  dim3 grid((numChannels * numResultPerFrame + block.x - 1) / block.x);
+  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
 
   ShiftingAndResamplingKernel<<<grid, block>>>(
-      d_Idata, d_Qdata, d_downFactor, d_frequency, numResultPerFrame,
+      d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
       numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
-  // =======================
 
-  MapChannelandIdata.clear();
-  MapChannelandQdata.clear();
+  // copy重采样计算结果到主存
+  // 存储格式：[numResults][numChannels][lengthPerResults]
+  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
+                              (numChannels * outputTotalLength * sizeof(T)),
+                              cudaMemcpyHostToDevice));
 
-  m_MultiMapSingleChannelDetect.clear();
+  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
+                              (numChannels * outputTotalLength * sizeof(T)),
+                              cudaMemcpyHostToDevice));
 
-  m_vecOneFrameDetectResult.clear();
+  // 释放显存
+  if (d_downFactor) {
+    cudaFree(d_downFactor);
+    d_downFactor = nullptr;
+  }
+
+  if (d_outputLength) {
+    cudaFree(d_outputLength);
+    d_outputLength = nullptr;
+  }
+
+  if (d_frequency) {
+    cudaFree(d_frequency);
+    d_frequency = nullptr;
+  }
 
-  // box 清除
-  val.boxes.clear();
+  if (d_Idata) {
+    cudaFree(d_Idata);
+    d_Idata = nullptr;
+  }
+
+  if (d_Qdata) {
+    cudaFree(d_Qdata);
+    d_Qdata = nullptr;
+  }
 
-  qDebug() << __FUNCTION__
-           << "Emit -------------------m_strSamplePoint.toInt()--------"
-           << m_strSamplePoint.toInt();
+  if (d_outputIdata) {
+    cudaFree(d_outputIdata);
+    d_outputIdata = nullptr;
+  }
+  if (d_outputIdata) {
+    cudaFree(d_outputIdata);
+    d_outputIdata = nullptr;
+  }
 
-  emit SignalEmitCalculateMovingData_afterDetect(
-      m_strSamplePoint.toInt(), CHANNEL_COUNT, CutWholeIdata, CutWholeQdata,
-      detectFreq, detectBandwidth);
+  return true;
 }
diff --git a/cuda_resample.h b/cuda_resample.h
index c60d74b..adfeda7 100644
--- a/cuda_resample.h
+++ b/cuda_resample.h
@@ -15,7 +15,7 @@
 #endif
 
 template <typename T>
-struct FreqBandValue_k {
+struct FreqBandValue {
   T frequency;
   T bandwidth;
   T maxvalue;
@@ -25,26 +25,29 @@ struct FreqBandValue_k {
 };
 
 /**
- * 重采样核函数：完成原始信号的移频，重采样等计算
+ * ShiftAndResampleDetectResult
+ * 重采样函数：完成原始信号的移频，重采样等计算
  *
- * @param MapChannelandIdata：原始Idata
- * @param MapChannelandQdata：原始Qdata
- * @param vecOneFrameDetectResult
- * @param numResultPerFrame：每帧的结果数
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param vecOneFrameDetectResult：检测结果数据
  * @param numChannels：信号通道数
- * @param signalLength：信号长度
+ * @param signalLength：每个通道的信号长度
  * @param CurrentRealfreq：当前实际频率
- * @param outputsIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
- * @param outputsQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
+ * @param outputTotalLength：重采样后输出信号的总长度
+ * @param
+ * outputIdata：重采样后的Idata，格式：[numResults][numChannels][lengthPerResult]
+ * @param
+ * outputQdata：重采样后的Qdata，格式：[numResults][numChannels][lengthPerResult]
  * @return void
  */
 template <typename T>
-__global__ void ShiftingAndResamplingKernel(
-    const T* __restrict__ MapChannelandIdata,
-    const T* __restrict__ MapChannelandQdata,
-    const int* __restrict__ VDownFactor, const T* __restrict__ VFrequency,
-    const int numResultPerFrame, const int numChannels, const int signalLength,
-    const T CurrentRealfreq, T* __restrict__ outputIdata,
-    T* __restrict__ outputQdata);
+bool ShiftAndResampleDetectResult(
+    const std::vector<std::vector<T>>& origIdata,
+    const std::vector<std::vector<T>>& origQdata,
+    const std::vector<std::map<int64_t, FreqBandValue<T>>>&
+        vecOneFrameDetectResult,
+    const int numChannels, const float CurrentRealfreq, T* outputIdata,
+    T* outputQdata);
 
 #endif  // CUDA_RESAMPLE_H
-- 
Gitee


From a7928e3a5e6cf71819d26b6296f4aaf3a5c49809 Mon Sep 17 00:00:00 2001
From: wsqRichards <229242333@qq.com>
Date: Mon, 15 Dec 2025 17:11:30 +0800
Subject: [PATCH 03/27] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E7=9B=B8=E5=85=B3?=
 =?UTF-8?q?=E6=B3=A8=E9=87=8A=E8=AF=B4=E6=98=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: wsqRichards <229242333@qq.com>
---
 cuda_resample.cu | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/cuda_resample.cu b/cuda_resample.cu
index ffdfc18..2f47cbc 100644
--- a/cuda_resample.cu
+++ b/cuda_resample.cu
@@ -487,9 +487,9 @@ inline int quotientCeil(int num1, int num2) {
  * @param CurrentRealfreq：当前实际频率
  * @param outputTotalLength：重采样后输出信号的总长度
  * @param
- * outputIdata：重采样后的Idata，格式：[numResults][numChannels][lengthPerResult]
+ * outputIdata：重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
  * @param
- * outputQdata：重采样后的Qdata，格式：[numResults][numChannels][lengthPerResult]
+ * outputQdata：重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
  * @return void
  */
 template <typename T>
@@ -509,9 +509,8 @@ bool ShiftAndResampleDetectResult(
   int upFactor = 1;           // 上采样率，默认为1
   int outputTotalLength = 0;  // 保存总的输出信号长度
 
-  // 每个检测加过对应的输出信号长度（这里假设输出长度不相同）
+  // 每个检测结果，根据下采样率计算的输出信号长度（这里假设输出信号长度不相同）
   std::vector<int> outputLength;
-
   // 上采样率
   std::vector<int> downFactor;
   // 检测结果的频率
@@ -520,6 +519,7 @@ bool ShiftAndResampleDetectResult(
   std::vector<T> detectBandwidth;
 
   // 根据检测结果，初始化相关变量或者vector
+  // vecOneFrameDetectResult 可以不使用map，使用vector
   for (const auto& [freq, fbv] : vecOneFrameDetectResult.back()) {
     T bandwidth = fbv.bandwidth * 1e6;
     int decimation = 0;
@@ -539,7 +539,7 @@ bool ShiftAndResampleDetectResult(
     detectFreq.push_back(fbv.frequency);
     detectBandwidth.push_back(bandwidth);
 
-    // 每个带宽,重采样后的输出信号长度
+    // 计算每个下采样率,重采样后的输出信号长度
     int length = quotientCeil(signalLength * upFactor, decimation);
     outputLength.push_back(length);
 
@@ -547,7 +547,7 @@ bool ShiftAndResampleDetectResult(
     outputTotalLength += length;
   }
 
-  // ====准备调用重采样核函数=====
+  // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
   // copy下采样率，频率等数据到显存中
   int* d_downFactor = nullptr;
   int* d_outputLength = nullptr;
@@ -615,7 +615,10 @@ bool ShiftAndResampleDetectResult(
       numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
 
   // copy重采样计算结果到主存
+  // outputIdata 确保空间够
+  // outputQdata 确保空间够
   // 存储格式：[numResults][numChannels][lengthPerResults]
+  // 且在内存中是连续存放的
   CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
                               (numChannels * outputTotalLength * sizeof(T)),
                               cudaMemcpyHostToDevice));
@@ -654,6 +657,7 @@ bool ShiftAndResampleDetectResult(
     cudaFree(d_outputIdata);
     d_outputIdata = nullptr;
   }
+
   if (d_outputIdata) {
     cudaFree(d_outputIdata);
     d_outputIdata = nullptr;
-- 
Gitee


From f569030054d5cc4070d98abc4e5fd0f2ed2f6f60 Mon Sep 17 00:00:00 2001
From: wsqRichards <229242333@qq.com>
Date: Mon, 15 Dec 2025 17:18:44 +0800
Subject: [PATCH 04/27] =?UTF-8?q?=E5=AE=8C=E5=96=84=E4=B8=80=E4=BA=9B?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81=E6=B3=A8=E9=87=8A=E8=AF=B4=E6=98=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: wsqRichards <229242333@qq.com>
---
 cuda_resample.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_resample.cu b/cuda_resample.cu
index 2f47cbc..6060390 100644
--- a/cuda_resample.cu
+++ b/cuda_resample.cu
@@ -503,7 +503,7 @@ bool ShiftAndResampleDetectResult(
   // 每个通道的信号长度：这里假设所有通道的长度是相同的
   int signalLength = origIdata[0].size();
 
-  // 检测结果数量
+  // 检测结果数据，采用vector存储更好，没必要使用map
   int numResults = vecOneFrameDetectResult.size();
 
   int upFactor = 1;           // 上采样率，默认为1
-- 
Gitee


From d42576add951650843418c318c8910b8ef33ae05 Mon Sep 17 00:00:00 2001
From: wsqRichards <229242333@qq.com>
Date: Mon, 15 Dec 2025 18:29:22 +0800
Subject: [PATCH 05/27] =?UTF-8?q?=E5=AE=8C=E5=96=84=E9=83=A8=E5=88=86?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: wsqRichards <229242333@qq.com>
---
 cuda_resample.cu | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/cuda_resample.cu b/cuda_resample.cu
index 6060390..c325753 100644
--- a/cuda_resample.cu
+++ b/cuda_resample.cu
@@ -441,25 +441,31 @@ __global__ void ShiftingAndResamplingKernel(
   }
   for (int i = 0; i < signalLength; i++) {
     T phase = 2 * M_PI * deltaFreq * i / sampling_rate;
-    T dev_cosVal = dev_cos(phase);
+    T cosVal = dev_cos(phase);
     T sinVal = dev_sin(phase);
-    I_shifted[i] = I_orig[i] * dev_cosVal - Q_orig[i] * sinVal;
-    Q_shifted[i] = Q_orig[i] * dev_cosVal + I_orig[i] * sinVal;
+    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
   }
 
   // 上采样因子为1，下采样因子为downFactor
   int upFactor = 1;
   int downFactor = VDownFactor[ResIdx];
 
-  // 获取当前检测结果对应的输出信号长度（这里假设的是每个输出信号长度可能不相同）
+  // 计算之前带宽，对应的输出信号的总长度
+  int beforeTotalLength = 0;
+  for (int i = 0; i < ResIdx; i++) {
+    beforeTotalLength += VOutputLength[i];
+  }
+  // 当前带宽对应的输出信号的起始地址偏移
+  int offset = beforeTotalLength * numChannels;
+
+  // 获取当前检测结果对应的输出信号长度（这里假设的是每个带宽对应的输出信号长度可能不相同）
   int outputLength = VOutputLength[ResIdx];
 
   // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  auto& I_resampled =
-      outputIdata + ResIdx * outputLength + chIdx * signalLength;
+  auto& I_resampled = outputIdata + offset + chIdx * outputLength;
   // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  auto& Q_resampled =
-      outputQdata + ResIdx * outputLength + chIdx * signalLength;
+  auto& Q_resampled = outputQdata + offset + chIdx * outputLength;
 
   // 重采样
   dev_resample(upFactor, downFactor, I_shifted, signalLength, I_resampled);
-- 
Gitee


From 09a9f5ee0a4fbd89bd0b4895e6b394ae9d1697d1 Mon Sep 17 00:00:00 2001
From: wsqRichards <229242333@qq.com>
Date: Mon, 15 Dec 2025 18:33:28 +0800
Subject: [PATCH 06/27] =?UTF-8?q?=E5=AE=8C=E5=96=84=E4=BB=A3=E7=A0=81?=
 =?UTF-8?q?=E6=B3=A8=E9=87=8A=E5=92=8C=E9=83=A8=E5=88=86=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: wsqRichards <229242333@qq.com>
---
 cuda_resample.cu | 14 +++++---------
 cuda_resample.h  | 10 ++++------
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/cuda_resample.cu b/cuda_resample.cu
index c325753..5e4ed10 100644
--- a/cuda_resample.cu
+++ b/cuda_resample.cu
@@ -401,7 +401,6 @@ __device__ void dev_resample(const int upFactor, const int downFactor,
  * @param numChannels：信号通道数
  * @param signalLength：每个通道的信号长度
  * @param CurrentRealfreq：当前实际频率
- * @param outputTotalLength：重采样后的输出信号总的长度，
  * @param outputIdata：存放所有重采样后的Idata，调用时需确保显存空间足够
  * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
  * @return void
@@ -412,8 +411,7 @@ __global__ void ShiftingAndResamplingKernel(
     const int* __restrict__ VDownFactor, const T* __restrict__ VFrequency,
     const T* __restrict__ VOutputLength, const int numResults,
     const int numChannels, const int signalLength, const T CurrentRealfreq,
-    const int outputTotalLength, T* __restrict__ outputIdata,
-    T* __restrict__ outputQdata) {
+    T* __restrict__ outputIdata, T* __restrict__ outputQdata) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx >= numChannels * numResults) return;
 
@@ -489,13 +487,11 @@ inline int quotientCeil(int num1, int num2) {
  * @param origQdata：原始Qdata
  * @param vecOneFrameDetectResult：检测结果数据
  * @param numChannels：信号通道数
- * @param signalLength：每个通道的信号长度
  * @param CurrentRealfreq：当前实际频率
- * @param outputTotalLength：重采样后输出信号的总长度
- * @param
- * outputIdata：重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @param
- * outputQdata：重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @param outputIdata：
+ * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @param outputQdata：
+ * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
  * @return void
  */
 template <typename T>
diff --git a/cuda_resample.h b/cuda_resample.h
index adfeda7..33bf998 100644
--- a/cuda_resample.h
+++ b/cuda_resample.h
@@ -32,13 +32,11 @@ struct FreqBandValue {
  * @param origQdata：原始Qdata
  * @param vecOneFrameDetectResult：检测结果数据
  * @param numChannels：信号通道数
- * @param signalLength：每个通道的信号长度
  * @param CurrentRealfreq：当前实际频率
- * @param outputTotalLength：重采样后输出信号的总长度
- * @param
- * outputIdata：重采样后的Idata，格式：[numResults][numChannels][lengthPerResult]
- * @param
- * outputQdata：重采样后的Qdata，格式：[numResults][numChannels][lengthPerResult]
+ * @param outputIdata：
+ * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @param outputQdata：
+ * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
  * @return void
  */
 template <typename T>
-- 
Gitee


From 1efb18d7981359d2f8a47ac12d9f2972ebd785b1 Mon Sep 17 00:00:00 2001
From: amor <15820865+Amor_23456@user.noreply.gitee.com>
Date: Wed, 17 Dec 2025 11:14:35 +0800
Subject: [PATCH 07/27] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=AF=B9=E9=BD=90?=
 =?UTF-8?q?=E9=87=8D=E9=87=87=E6=A0=B7=E5=90=8E=E4=BF=A1=E5=8F=B7=E9=95=BF?=
 =?UTF-8?q?=E5=BA=A6=E7=9A=84=E5=A4=84=E7=90=86=E5=87=BD=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: amor <15820865+Amor_23456@user.noreply.gitee.com>
---
 cuda_resample.cu | 1055 ++++++++++++++++++++++++++++++++++++++++------
 cuda_resample.h  |  113 ++++-
 mainwindow.cpp   |  102 ++++-
 3 files changed, 1104 insertions(+), 166 deletions(-)

diff --git a/cuda_resample.cu b/cuda_resample.cu
index 5e4ed10..01206e5 100644
--- a/cuda_resample.cu
+++ b/cuda_resample.cu
@@ -3,9 +3,11 @@
 
 // CHECK_CUDA_ERROR：cuda api调用错误处理
 #define CHECK_CUDA_ERROR(call)                                               \
-  do {                                                                       \
+  do                                                                         \
+  {                                                                          \
     cudaError_t err = call;                                                  \
-    if (err != cudaSuccess) {                                                \
+    if (err != cudaSuccess)                                                  \
+    {                                                                        \
       std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":"      \
                 << __LINE__ << ": " << cudaGetErrorString(err) << std::endl; \
       throw std::runtime_error("CUDA error");                                \
@@ -22,49 +24,70 @@
 
 // 余弦函数
 template <typename T>
-__device__ __forceinline__ T dev_cos(T x) {
-  if constexpr (std::is_same_v<T, float>) {
+__device__ __forceinline__ T dev_cos(T x)
+{
+  if constexpr (std::is_same_v<T, float>)
+  {
     return cosf(x);
-  } else if constexpr (std::is_same_v<T, double>) {
+  }
+  else if constexpr (std::is_same_v<T, double>)
+  {
     return cos(x);
-  } else {
+  }
+  else
+  {
     return cos(static_cast<double>(x));
   }
 }
 
 // 正弦函数
 template <typename T>
-__device__ __forceinline__ T dev_sin(T x) {
-  if constexpr (std::is_same_v<T, float>) {
+__device__ __forceinline__ T dev_sin(T x)
+{
+  if constexpr (std::is_same_v<T, float>)
+  {
     return sinf(x);
-  } else if constexpr (std::is_same_v<T, double>) {
+  }
+  else if constexpr (std::is_same_v<T, double>)
+  {
     return sin(x);
-  } else {
+  }
+  else
+  {
     return sin(static_cast<double>(x));
   }
 }
 
 // 浮点数绝对值
 template <typename T>
-__device__ __forceinline__ T dev_abs(T x) {
-  if constexpr (std::is_same_v<T, float>) {
+__device__ __forceinline__ T dev_abs(T x)
+{
+  if constexpr (std::is_same_v<T, float>)
+  {
     return fabsf(x);
-  } else if constexpr (std::is_same_v<T, double>) {
+  }
+  else if constexpr (std::is_same_v<T, double>)
+  {
     return fabs(x);
-  } else {
+  }
+  else
+  {
     return fabs(static_cast<double>(x));
   }
 }
 
 // 整数向上取整除法
-__device__ __forceinline__ int dev_quotientCeil(int num1, int num2) {
+__device__ __forceinline__ int dev_quotientCeil(int num1, int num2)
+{
   div_t result = div(num1, num2);
   return result.quot + (result.rem != 0);
 }
 
 // CUDA设备端GCD函数:最大公约数
-__device__ __forceinline__ int dev_gcd(int a, int b) {
-  while (b != 0) {
+__device__ __forceinline__ int dev_gcd(int a, int b)
+{
+  while (b != 0)
+  {
     int temp = b;
     b = a % b;
     a = temp;
@@ -74,8 +97,10 @@ __device__ __forceinline__ int dev_gcd(int a, int b) {
 
 // 生成连续递增的序列
 template <typename T>
-__device__ __forceinline__ void dev_iota(T* data, int size, T start) {
-  for (int i = 0; i < size; i++) {
+__device__ __forceinline__ void dev_iota(T *data, int size, T start)
+{
+  for (int i = 0; i < size; i++)
+  {
     data[i] = start + T(i);
   }
   return;
@@ -83,22 +108,26 @@ __device__ __forceinline__ void dev_iota(T* data, int size, T start) {
 
 // 填充data为value
 template <typename T>
-__device__ __forceinline__ void dev_fill(T* data, int size, T value) {
-  for (int i = 0; i < size; i++) {
+__device__ __forceinline__ void dev_fill(T *data, int size, T value)
+{
+  for (int i = 0; i < size; i++)
+  {
     data[i] = value;
   }
   return;
 }
 
 template <typename T>
-__device__ int dev_firls(T* result, int length, T* freq, const T* amplitude,
-                         int freqSize) {
+__device__ int dev_firls(T *result, int length, T *freq, const T *amplitude,
+                         int freqSize)
+{
   // 计算权重大小
   int weightSize = freqSize / 2;
 
   // 初始化权重向量
-  T* weight = new T[weightSize];
-  if (weight == nullptr) {
+  T *weight = new T[weightSize];
+  if (weight == nullptr)
+  {
     return -1;
   }
 
@@ -106,35 +135,41 @@ __device__ int dev_firls(T* result, int length, T* freq, const T* amplitude,
   dev_fill(weight, weightSize, T(1.0));
 
   // 处理频率向量
-  for (int i = 0; i < freqSize; i++) {
+  for (int i = 0; i < freqSize; i++)
+  {
     freq[i] = freq[i] / T(2.0);
   }
 
   int filterLength = length + 1;
   length = (filterLength - 1) / 2;
 
-  //奇偶判断
+  // 奇偶判断
   bool Nodd = filterLength & 1;
 
   // 创建和初始化向量k
   int kLength = length + 1;
-  T* k = new T[kLength];
-  if (k == nullptr) {
+  T *k = new T[kLength];
+  if (k == nullptr)
+  {
     return -1;
   };
 
   // 初始化k向量为递增序列：0，1，2...
   dev_iota(k, kLength, T(0.0));
 
-  if (!Nodd) {
-    for (int i = 0; i < kLength; i++) {
+  if (!Nodd)
+  {
+    for (int i = 0; i < kLength; i++)
+    {
       k[i] += T(0.5);
     }
   }
 
   // k.erase(k.begin());
-  if (Nodd) {
-    for (int i = 0; i < kLength; i++) {
+  if (Nodd)
+  {
+    for (int i = 0; i < kLength; i++)
+    {
       k[i] = k[i + 1];
     }
     kLength--;
@@ -142,18 +177,21 @@ __device__ int dev_firls(T* result, int length, T* freq, const T* amplitude,
 
   // 创建和初始化向量b
   int bLength = kLength;
-  if (Nodd) {
-    bLength++;  // 此处++，因为后面需要在b[0]处插入b0
+  if (Nodd)
+  {
+    bLength++; // 此处++，因为后面需要在b[0]处插入b0
   }
-  T* b = new T[bLength];
-  if (b == nullptr) {
+  T *b = new T[bLength];
+  if (b == nullptr)
+  {
     return -1;
   };
 
   dev_fill(b, bLength, T(0.0));
 
   T b0 = T(0.0);
-  for (int i = 0; i < freqSize; i += 2) {
+  for (int i = 0; i < freqSize; i += 2)
+  {
     T Fi = freq[i];
     T Fip1 = freq[i + 1];
     T ampi = amplitude[i];
@@ -162,13 +200,15 @@ __device__ int dev_firls(T* result, int length, T* freq, const T* amplitude,
     T m_s = (ampip1 - ampi) / (Fip1 - Fi);
     T b1 = ampi - (m_s * Fi);
 
-    if (Nodd) {
+    if (Nodd)
+    {
       b0 += (b1 * (Fip1 - Fi)) +
             m_s / T(2.0) * (pow(Fip1, T(2.0)) - pow(Fi, T(2.0))) * wt2;
     }
 
     // 并行计算b向量
-    for (int j = 0; j < kLength; j++) {
+    for (int j = 0; j < kLength; j++)
+    {
       T kj = k[j];
       b[j] += (m_s / (T(4.0) * pow(M_PI, T(2.0))) *
                (dev_cos(T(2.0) * M_PI * Fip1) - dev_cos(T(2.0) * M_PI * Fi)) /
@@ -182,11 +222,16 @@ __device__ int dev_firls(T* result, int length, T* freq, const T* amplitude,
   }
 
   // 处理最终结果，将b0插入到b向量的开始
-  if (Nodd) {
-    for (int i = kLength; i >= 0; i--) {
-      if (i > 0) {
+  if (Nodd)
+  {
+    for (int i = kLength; i >= 0; i--)
+    {
+      if (i > 0)
+      {
         b[i] = b[i - 1];
-      } else {
+      }
+      else
+      {
         b[i] = b0;
       }
     }
@@ -196,44 +241,52 @@ __device__ int dev_firls(T* result, int length, T* freq, const T* amplitude,
   T w0 = weight[0];
 
   int aLength = bLength;
-  T* a = new T[aLength];
-  if (a == nullptr) {
+  T *a = new T[aLength];
+  if (a == nullptr)
+  {
     return -1;
   };
 
   // vector<T> result = {a.rbegin(), a.rend()};
-  for (int i = 0; i < aLength; i++) {
+  for (int i = 0; i < aLength; i++)
+  {
     a[i] = pow(w0, T(2.0)) * T(4.0) * b[i];
     result[aLength - 1 - i] = a[i];
   }
 
   int it = 0;
-  if (Nodd) {
+  if (Nodd)
+  {
     it = 1;
   }
 
   // 构建结果向量
-  for (int i = 0; i < aLength; i++) {
+  for (int i = 0; i < aLength; i++)
+  {
     result[i] = result[i] * T(0.5);
-    if ((i + it) < aLength) {
+    if ((i + it) < aLength)
+    {
       result[aLength + i] = a[i + it] * T(0.5);
     }
   }
 
   // 释放动态分配的内存
-  delete[] weight;  // 释放内存
-  delete[] k;       // 释放内存
-  delete[] b;       // 释放内存
-  delete[] a;       // 释放内存
+  delete[] weight; // 释放内存
+  delete[] k;      // 释放内存
+  delete[] b;      // 释放内存
+  delete[] a;      // 释放内存
   return 0;
 }
 
 // 设备端Bessel函数模板
 template <typename T>
-__device__ T dev_cyl_bessel_i(int n, T x) {
-  if (n == 0) return T(1);
+__device__ T dev_cyl_bessel_i(int n, T x)
+{
+  if (n == 0)
+    return T(1);
   T bessel = T(1), bessel_prev = T(1);
-  for (int i = 1; i <= n; ++i) {
+  for (int i = 1; i <= n; ++i)
+  {
     bessel = (T(2) * i - T(1)) / i * x * bessel_prev - bessel;
     bessel_prev = bessel;
   }
@@ -242,12 +295,14 @@ __device__ T dev_cyl_bessel_i(int n, T x) {
 
 // 设备端凯塞窗核函数模板
 template <typename T>
-__device__ void dev_kaiser(T* window, int order, T bta) {
+__device__ void dev_kaiser(T *window, int order, T bta)
+{
   T Numerator, Denominator;
   Denominator = dev_cyl_bessel_i(0, bta);
   T od2 = (order - T(1)) / T(2);
 
-  for (int n = 0; n < order; n++) {
+  for (int n = 0; n < order; n++)
+  {
     T x = bta * sqrt(T(1) - pow((n - od2) / od2, T(2)));
     Numerator = dev_cyl_bessel_i(0, x);
     window[n] = Numerator / Denominator;
@@ -256,12 +311,14 @@ __device__ void dev_kaiser(T* window, int order, T bta) {
 
 template <typename T>
 __device__ void dev_resample(const int upFactor, const int downFactor,
-                             const T* inputSignal, const int inputSize,
-                             T* outputSignal) {
+                             const T *inputSignal, const int inputSize,
+                             T *outputSignal)
+{
   const int n = 10;
   const T bta = T(5.0);
 
-  if (upFactor <= 0 || downFactor <= 0) {
+  if (upFactor <= 0 || downFactor <= 0)
+  {
     return;
   }
 
@@ -270,7 +327,8 @@ __device__ void dev_resample(const int upFactor, const int downFactor,
   upFactor /= gcd_o;
   downFactor /= gcd_o;
 
-  if (upFactor == downFactor) {
+  if (upFactor == downFactor)
+  {
     outputSignal = inputSignal;
     return;
   }
@@ -296,25 +354,29 @@ __device__ void dev_resample(const int upFactor, const int downFactor,
   int length = 2 * n * maxFactor + 1;
   int coefficientsLength = length;
 
-  T* coefficients = new T[coefficientsLength];
-  if (coefficients == nullptr) {
+  T *coefficients = new T[coefficientsLength];
+  if (coefficients == nullptr)
+  {
     return;
   }
   int ret = dev_firls(coefficients, length - 1, firlsFreqsV, firlsAmplitudeV,
                       freqSize);
-  if (ret == -1) {
+  if (ret == -1)
+  {
     LOG_ERROR("dev_firls调用失败\n");
     return;
   }
 
   int windowSize = length;
-  T* window = new T[windowSize];
-  if (window == nullptr) {
+  T *window = new T[windowSize];
+  if (window == nullptr)
+  {
     return;
   }
   dev_kaiser(window, length, bta);
 
-  for (int i = 0; i < coefficientsLength; i++) {
+  for (int i = 0; i < coefficientsLength; i++)
+  {
     coefficients[i] *= (upFactor * window[i]);
   }
 
@@ -323,18 +385,21 @@ __device__ void dev_resample(const int upFactor, const int downFactor,
 
   // 分配filter空间
   int hSize = coefficientsLength + 2 * nz;
-  T* filter = new T[hSize];
-  if (filter == nullptr) {
+  T *filter = new T[hSize];
+  if (filter == nullptr)
+  {
     return;
   }
 
   int filterLength = 0;
-  for (int i = 0; i < nz; i++) {
+  for (int i = 0; i < nz; i++)
+  {
     filter[i + filterLength] = T(0.0);
   }
   filterLength += nz;
 
-  for (int i = 0; i < coefficientsLength; i++) {
+  for (int i = 0; i < coefficientsLength; i++)
+  {
     filter[i + filterLength] = coefficients[i];
   }
   filterLength += coefficientsLength;
@@ -344,18 +409,21 @@ __device__ void dev_resample(const int upFactor, const int downFactor,
   nz = 0;
   while (dev_quotientCeil((inputSize - 1) * upFactor + hSize + nz, downFactor) -
              delay <
-         outputSize) {
+         outputSize)
+  {
     nz++;
   }
 
-  for (int i = 0; i < nz; i++) {
+  for (int i = 0; i < nz; i++)
+  {
     filter[i + filterLength] = T(0.0);
   }
   filterLength += nz;
 
   // 计算
   int paddedCoefCount = filterLength;
-  while (paddedCoefCount % upFactor) {
+  while (paddedCoefCount % upFactor)
+  {
     paddedCoefCount++;
   }
 
@@ -364,8 +432,9 @@ __device__ void dev_resample(const int upFactor, const int downFactor,
   int outputCount =
       ((inputSize + padding) * upFactor + downFactor - 1) / downFactor;
 
-  T* results = new T[outputCount];
-  if (results == nullptr) {
+  T *results = new T[outputCount];
+  if (results == nullptr)
+  {
     return;
   }
 
@@ -374,7 +443,8 @@ __device__ void dev_resample(const int upFactor, const int downFactor,
                  filterLength, results, &resultsCount);
 
   int j = 0;
-  for (int i = delay; i < outputSize + delay; i++) {
+  for (int i = delay; i < outputSize + delay; i++)
+  {
     outputSignal[j++] = results[i];
   }
 
@@ -403,41 +473,46 @@ __device__ void dev_resample(const int upFactor, const int downFactor,
  * @param CurrentRealfreq：当前实际频率
  * @param outputIdata：存放所有重采样后的Idata，调用时需确保显存空间足够
  * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
- * @return void
+ * @return true or false
  */
 template <typename T>
 __global__ void ShiftingAndResamplingKernel(
-    const T* __restrict__ origIdata, const T* __restrict__ origQdata,
-    const int* __restrict__ VDownFactor, const T* __restrict__ VFrequency,
-    const T* __restrict__ VOutputLength, const int numResults,
+    const T *__restrict__ origIdata, const T *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const T *__restrict__ VFrequency,
+    const T *__restrict__ VOutputLength, const int numResults,
     const int numChannels, const int signalLength, const T CurrentRealfreq,
-    T* __restrict__ outputIdata, T* __restrict__ outputQdata) {
+    T *__restrict__ outputIdata, T *__restrict__ outputQdata)
+{
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= numChannels * numResults) return;
+  if (idx >= numChannels * numResults)
+    return;
 
   // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
-  int ResIdx = idx / numChannels;  // 第几个检测结果
-  int chIdx = idx % numChannels;   // 第几个通道
+  int ResIdx = idx / numChannels; // 第几个检测结果
+  int chIdx = idx % numChannels;  // 第几个通道
 
   const T sampling_rate = T(245.76e6);
 
-  T frequency = VFrequency[ResIdx];  // 频率
+  T frequency = VFrequency[ResIdx]; // 频率
   T deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
   // 获取当前线程处理的通道数据地址
-  auto& I_orig = origIdata + chIdx * signalLength;
-  auto& Q_orig = origQdata + chIdx * signalLength;
+  auto &I_orig = origIdata + chIdx * signalLength;
+  auto &Q_orig = origQdata + chIdx * signalLength;
 
   // 移频：生成本振信号并相乘
-  T* I_shifted = new T[signalLength];
-  if (I_shifted == nullptr) {
+  T *I_shifted = new T[signalLength];
+  if (I_shifted == nullptr)
+  {
     return;
   }
-  T* Q_shifted = new T[signalLength];
-  if (Q_shifted == nullptr) {
+  T *Q_shifted = new T[signalLength];
+  if (Q_shifted == nullptr)
+  {
     return;
   }
-  for (int i = 0; i < signalLength; i++) {
+  for (int i = 0; i < signalLength; i++)
+  {
     T phase = 2 * M_PI * deltaFreq * i / sampling_rate;
     T cosVal = dev_cos(phase);
     T sinVal = dev_sin(phase);
@@ -451,7 +526,8 @@ __global__ void ShiftingAndResamplingKernel(
 
   // 计算之前带宽，对应的输出信号的总长度
   int beforeTotalLength = 0;
-  for (int i = 0; i < ResIdx; i++) {
+  for (int i = 0; i < ResIdx; i++)
+  {
     beforeTotalLength += VOutputLength[i];
   }
   // 当前带宽对应的输出信号的起始地址偏移
@@ -461,9 +537,9 @@ __global__ void ShiftingAndResamplingKernel(
   int outputLength = VOutputLength[ResIdx];
 
   // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  auto& I_resampled = outputIdata + offset + chIdx * outputLength;
+  auto &I_resampled = outputIdata + offset + chIdx * outputLength;
   // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  auto& Q_resampled = outputQdata + offset + chIdx * outputLength;
+  auto &Q_resampled = outputQdata + offset + chIdx * outputLength;
 
   // 重采样
   dev_resample(upFactor, downFactor, I_shifted, signalLength, I_resampled);
@@ -474,13 +550,97 @@ __global__ void ShiftingAndResamplingKernel(
   delete[] Q_shifted;
 }
 
-inline int quotientCeil(int num1, int num2) {
-  if (num1 % num2 != 0) return num1 / num2 + 1;
+/**
+ * ShiftingAndResamplingKernelV2
+ * 重采样核函数：完成原始信号的移频，重采样等计算
+ * 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
+ * 因此共 numChannels * numResults 个线程并行计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param VDownFactor：下采样率
+ * @param VFrequency：频率
+ * @param numResults：每帧的检测结果总数
+ * @param numChannels：信号通道数
+ * @param signalLength：每个通道的原始信号长度
+ * @param CurrentRealfreq：当前实际频率
+ * @param alignSignalLength：重采样后信号的对齐长度
+ * @param outputIdata：存放所有重采样后的Idata，调用时需确保显存空间足够
+ * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
+ * @return true or false
+ */
+template <typename T>
+__global__ void ShiftingAndResamplingKernelV2(
+    const T *__restrict__ origIdata, const T *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const T *__restrict__ VFrequency,
+    const int numResults, const int numChannels, const int signalLength,
+    const T CurrentRealfreq, const int alignSignalLength,
+    T *__restrict__ outputIdata, T *__restrict__ outputQdata)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= numChannels * numResults)
+    return;
+
+  // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
+  int ResIdx = idx / numChannels; // 第几个检测结果
+  int chIdx = idx % numChannels;  // 第几个通道
+
+  const T sampling_rate = T(245.76e6);
+
+  T frequency = VFrequency[ResIdx]; // 频率
+  T deltaFreq = (CurrentRealfreq - frequency) * 1e6;
+
+  // 获取当前线程处理的通道数据地址
+  auto &I_orig = origIdata + chIdx * signalLength;
+  auto &Q_orig = origQdata + chIdx * signalLength;
+
+  // 移频：生成本振信号并相乘
+  T *I_shifted = new T[signalLength];
+  if (I_shifted == nullptr)
+  {
+    return;
+  }
+  T *Q_shifted = new T[signalLength];
+  if (Q_shifted == nullptr)
+  {
+    return;
+  }
+  for (int i = 0; i < signalLength; i++)
+  {
+    T phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+    T cosVal = dev_cos(phase);
+    T sinVal = dev_sin(phase);
+    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
+  }
+
+  // 上采样因子为1，下采样因子为downFactor
+  int upFactor = 1;
+  int downFactor = VDownFactor[ResIdx];
+
+  // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
+  auto &I_resampled = outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+  // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
+  auto &Q_resampled = outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+
+  // 重采样
+  dev_resample(upFactor, downFactor, I_shifted, signalLength, I_resampled);
+  dev_resample(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+
+  // 释放动态分配的内存
+  delete[] I_shifted;
+  delete[] Q_shifted;
+}
+
+inline int quotientCeil(int num1, int num2)
+{
+  if (num1 % num2 != 0)
+    return num1 / num2 + 1;
   return num1 / num2;
 }
 
 /**
- * ShiftAndResampleDetectResult
+ * ShiftAndResampleSignal
  * 重采样函数：完成原始信号的移频，重采样等计算
  *
  * @param origIdata：原始Idata
@@ -492,24 +652,25 @@ inline int quotientCeil(int num1, int num2) {
  * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
  * @param outputQdata：
  * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @return void
+ * @return true or false
  */
 template <typename T>
-bool ShiftAndResampleDetectResult(
-    const std::vector<std::vector<T>>& origIdata,
-    const std::vector<std::vector<T>>& origQdata,
-    const std::vector<std::map<int64_t, FreqBandValue<T>>>&
+bool ShiftAndResampleSignal(
+    const std::vector<std::vector<T>> &origIdata,
+    const std::vector<std::vector<T>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValue<T>>> &
         vecOneFrameDetectResult,
-    const int numChannels, const float CurrentRealfreq, T* outputIdata,
-    T* outputQdata) {
+    const int numChannels, const float CurrentRealfreq, T *outputIdata,
+    T *outputQdata)
+{
   // 每个通道的信号长度：这里假设所有通道的长度是相同的
   int signalLength = origIdata[0].size();
 
   // 检测结果数据，采用vector存储更好，没必要使用map
   int numResults = vecOneFrameDetectResult.size();
 
-  int upFactor = 1;           // 上采样率，默认为1
-  int outputTotalLength = 0;  // 保存总的输出信号长度
+  int upFactor = 1;          // 上采样率，默认为1
+  int outputTotalLength = 0; // 保存总的输出信号长度
 
   // 每个检测结果，根据下采样率计算的输出信号长度（这里假设输出信号长度不相同）
   std::vector<int> outputLength;
@@ -522,16 +683,24 @@ bool ShiftAndResampleDetectResult(
 
   // 根据检测结果，初始化相关变量或者vector
   // vecOneFrameDetectResult 可以不使用map，使用vector
-  for (const auto& [freq, fbv] : vecOneFrameDetectResult.back()) {
+  for (const auto &[freq, fbv] : vecOneFrameDetectResult.back())
+  {
     T bandwidth = fbv.bandwidth * 1e6;
     int decimation = 0;
-    if (std::abs(bandwidth - 40e6) < 2 * 1e6) {
+    if (std::abs(bandwidth - 40e6) < 2 * 1e6)
+    {
       decimation = 4;
-    } else if (std::abs(bandwidth - 20e6) < 2 * 1e6) {
+    }
+    else if (std::abs(bandwidth - 20e6) < 2 * 1e6)
+    {
       decimation = 8;
-    } else if (std::abs(bandwidth - 10e6) < 2 * 1e6) {
+    }
+    else if (std::abs(bandwidth - 10e6) < 2 * 1e6)
+    {
       decimation = 16;
-    } else {
+    }
+    else
+    {
       // 带宽不符合要求，跳过处理
       std::cout << __FUNCTION__ << ":带宽 " << bandwidth << "--- 不符合 ";
       continue;
@@ -551,34 +720,34 @@ bool ShiftAndResampleDetectResult(
 
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
   // copy下采样率，频率等数据到显存中
-  int* d_downFactor = nullptr;
-  int* d_outputLength = nullptr;
-  T* d_frequency = nullptr;
+  int *d_downFactor = nullptr;
+  int *d_outputLength = nullptr;
+  T *d_frequency = nullptr;
   // 申请显存
   CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
   CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(T))));
 
   // copy下采样率到显存中
-  const int* src_downFactor = downFactor.data();
+  const int *src_downFactor = downFactor.data();
   CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
                               numResults * sizeof(int),
                               cudaMemcpyHostToDevice));
 
   // copy频率到显存中
-  const T* src_frequency = detectFreq.data();
+  const T *src_frequency = detectFreq.data();
   CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
                               numResults * sizeof(T), cudaMemcpyHostToDevice));
 
   // copy每个带宽，重采样后输出信号长度到显存中
-  const int* src_outputLength = outputLength.data();
+  const int *src_outputLength = outputLength.data();
   CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, src_outputLength,
                               numResults * sizeof(int),
                               cudaMemcpyHostToDevice));
 
   // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
-  T* d_Idata = nullptr;
-  T* d_Qdata = nullptr;
+  T *d_Idata = nullptr;
+  T *d_Qdata = nullptr;
   CHECK_CUDA_ERROR(
       cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(T))));
   CHECK_CUDA_ERROR(
@@ -586,23 +755,24 @@ bool ShiftAndResampleDetectResult(
 
   // 将所有通道数据循环拷贝到GPU显存
   size_t copySize = signalLength * sizeof(T);
-  for (int i = 0; i < numChannels; i++) {
+  for (int i = 0; i < numChannels; i++)
+  {
     // copy 原始的idata 到gpu显存
-    T* dst_idata = d_Idata + i * signalLength;
-    const void* src_idata = origIdata[i].data();
+    T *dst_idata = d_Idata + i * signalLength;
+    const void *src_idata = origIdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
 
     // copy 原始的qdata 到gpu显存
-    T* dst_qdata = d_Qdata + i * signalLength;
-    const void* src_qdata = origQdata[i].data();
+    T *dst_qdata = d_Qdata + i * signalLength;
+    const void *src_qdata = origQdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
   }
 
   // 申请重采样后输出信号的GPU显存
-  T* d_outputIdata = nullptr;
-  T* d_outputQdata = nullptr;
+  T *d_outputIdata = nullptr;
+  T *d_outputQdata = nullptr;
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata,
                               (numChannels * outputTotalLength * sizeof(T))));
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
@@ -630,37 +800,648 @@ bool ShiftAndResampleDetectResult(
                               cudaMemcpyHostToDevice));
 
   // 释放显存
-  if (d_downFactor) {
+  if (d_downFactor)
+  {
+    cudaFree(d_downFactor);
+    d_downFactor = nullptr;
+  }
+
+  if (d_outputLength)
+  {
+    cudaFree(d_outputLength);
+    d_outputLength = nullptr;
+  }
+
+  if (d_frequency)
+  {
+    cudaFree(d_frequency);
+    d_frequency = nullptr;
+  }
+
+  if (d_Idata)
+  {
+    cudaFree(d_Idata);
+    d_Idata = nullptr;
+  }
+
+  if (d_Qdata)
+  {
+    cudaFree(d_Qdata);
+    d_Qdata = nullptr;
+  }
+
+  if (d_outputIdata)
+  {
+    cudaFree(d_outputIdata);
+    d_outputIdata = nullptr;
+  }
+
+  if (d_outputIdata)
+  {
+    cudaFree(d_outputIdata);
+    d_outputIdata = nullptr;
+  }
+
+  return true;
+}
+
+/**
+ * ShiftAndResampleSignalFloat
+ * Frequency-shifts and resamples the original signal on the GPU.
+ *
+ * @param origIdata original I data, one vector per channel
+ * @param origQdata original Q data, one vector per channel
+ * @param vecOneFrameDetectResult detection results; only back() is read
+ * @param numChannels number of signal channels
+ * @param CurrentRealfreq current real frequency
+ * @param outputIdata resampled I data, stored contiguously as
+ * [numResults][numChannels][lengthPerResult]; the caller provides the space
+ * @param outputQdata resampled Q data, stored contiguously as
+ * [numResults][numChannels][lengthPerResult]; the caller provides the space
+ * @return false when the inputs are unusable, true otherwise
+ */
+bool ShiftAndResampleSignalFloat(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValueFloat>> &
+        vecOneFrameDetectResult,
+    const int numChannels, const float CurrentRealfreq, float *outputIdata,
+    float *outputQdata)
+{
+  // Guard the unchecked accesses below (origIdata[i], origQdata[i], back()).
+  if (numChannels <= 0 || vecOneFrameDetectResult.empty() ||
+      origIdata.size() < static_cast<size_t>(numChannels) ||
+      origQdata.size() < static_cast<size_t>(numChannels))
+  {
+    return false;
+  }
+
+  // Per-channel signal length; all channels are assumed to be equally long.
+  int signalLength = origIdata[0].size();
+
+  int upFactor = 1;          // upsampling factor, fixed to 1
+  int outputTotalLength = 0; // sum of the per-detection output lengths
+
+  // Output length of each accepted detection.
+  std::vector<int> outputLength;
+  // Decimation (downsampling) factor of each accepted detection.
+  std::vector<int> downFactor;
+  // Frequency of each accepted detection.
+  std::vector<float> detectFreq;
+
+  // Map every entry of the last detection map to a decimation factor.
+  for (const auto &[freq, fbv] : vecOneFrameDetectResult.back())
+  {
+    float bandwidth = fbv.bandwidth * 1e6;
+    int decimation = 0;
+    if (std::abs(bandwidth - 40e6) < 2 * 1e6)
+    {
+      decimation = 4;
+    }
+    else if (std::abs(bandwidth - 20e6) < 2 * 1e6)
+    {
+      decimation = 8;
+    }
+    else if (std::abs(bandwidth - 10e6) < 2 * 1e6)
+    {
+      decimation = 16;
+    }
+    else
+    {
+      // Unsupported bandwidth: skip this detection.
+      std::cout << __FUNCTION__ << ":带宽 " << bandwidth << "--- 不符合 ";
+      continue;
+    }
+
+    downFactor.push_back(decimation);
+    detectFreq.push_back(fbv.frequency);
+
+    // Output length of this detection after resampling.
+    int length = quotientCeil(signalLength * upFactor, decimation);
+    outputLength.push_back(length);
+
+    // Running total over all accepted detections.
+    outputTotalLength += length;
+  }
+
+  // Size every per-detection buffer by the entries actually kept above; the
+  // size of the outer vector says nothing about how many were collected.
+  const int numResults = static_cast<int>(downFactor.size());
+  if (numResults == 0)
+  {
+    return true; // nothing to resample; an empty grid cannot be launched
+  }
+
+  // ==== Prepare the ShiftingAndResamplingKernel launch ====
+  // Device copies of the decimation factors, output lengths and frequencies.
+  int *d_downFactor = nullptr;
+  int *d_outputLength = nullptr;
+  float *d_frequency = nullptr;
+  // Allocate device memory.
+  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(float))));
+
+  // Decimation factors -> device.
+  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, downFactor.data(),
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // Frequencies -> device (detectFreq and d_frequency are both float).
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, detectFreq.data(),
+                              numResults * sizeof(float),
+                              cudaMemcpyHostToDevice));
+
+  // Per-detection output lengths -> device.
+  CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, outputLength.data(),
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // Device buffers for the original I/Q data, one channel after another.
+  const size_t copySize = signalLength * sizeof(float);
+  float *d_Idata = nullptr;
+  float *d_Qdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_Idata, numChannels * copySize));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_Qdata, numChannels * copySize));
+
+  for (int i = 0; i < numChannels; i++)
+  {
+    // Channel i of the original I data.
+    CHECK_CUDA_ERROR(cudaMemcpy(d_Idata + i * signalLength,
+                                origIdata[i].data(), copySize,
+                                cudaMemcpyHostToDevice));
+
+    // Channel i of the original Q data.
+    CHECK_CUDA_ERROR(cudaMemcpy(d_Qdata + i * signalLength,
+                                origQdata[i].data(), copySize,
+                                cudaMemcpyHostToDevice));
+  }
+
+  // Device buffers for the resampled output (size computed in size_t).
+  const size_t outputSize =
+      sizeof(float) * numChannels * outputTotalLength;
+  float *d_outputIdata = nullptr;
+  float *d_outputQdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, outputSize));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, outputSize));
+
+  // Launch configuration: numChannels threads per block.
+  dim3 block(numChannels);
+  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
+
+  ShiftingAndResamplingKernel<<<grid, block>>>(
+      d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
+      numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
+  // A kernel launch returns no status; fetch launch errors explicitly.
+  CHECK_CUDA_ERROR(cudaGetLastError());
+
+  // Copy the results back to the host (device -> host). The caller must
+  // provide enough room in outputIdata and outputQdata.
+  // Layout: [numResults][numChannels][lengthPerResults], stored contiguously.
+  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata, outputSize,
+                              cudaMemcpyDeviceToHost));
+
+  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata, outputSize,
+                              cudaMemcpyDeviceToHost));
+
+  // Release device memory.
+  if (d_downFactor)
+  {
+    cudaFree(d_downFactor);
+    d_downFactor = nullptr;
+  }
+
+  if (d_outputLength)
+  {
+    cudaFree(d_outputLength);
+    d_outputLength = nullptr;
+  }
+
+  if (d_frequency)
+  {
+    cudaFree(d_frequency);
+    d_frequency = nullptr;
+  }
+
+  if (d_Idata)
+  {
+    cudaFree(d_Idata);
+    d_Idata = nullptr;
+  }
+
+  if (d_Qdata)
+  {
+    cudaFree(d_Qdata);
+    d_Qdata = nullptr;
+  }
+
+  if (d_outputIdata)
+  {
+    cudaFree(d_outputIdata);
+    d_outputIdata = nullptr;
+  }
+
+  if (d_outputQdata)
+  {
+    cudaFree(d_outputQdata);
+    d_outputQdata = nullptr;
+  }
+
+  return true;
+}
+
+/**
+ * ShiftAndResampleSignalDouble
+ * Frequency-shifts and resamples the original signal on the GPU.
+ *
+ * @param origIdata original I data, one vector per channel
+ * @param origQdata original Q data, one vector per channel
+ * @param vecOneFrameDetectResult detection results; only back() is read
+ * @param numChannels number of signal channels
+ * @param CurrentRealfreq current real frequency
+ * @param outputIdata resampled I data, stored contiguously as
+ * [numResults][numChannels][lengthPerResult]; the caller provides the space
+ * @param outputQdata resampled Q data, stored contiguously as
+ * [numResults][numChannels][lengthPerResult]; the caller provides the space
+ * @return false when the inputs are unusable, true otherwise
+ */
+bool ShiftAndResampleSignalDouble(
+    const std::vector<std::vector<double>> &origIdata,
+    const std::vector<std::vector<double>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValueDouble>> &
+        vecOneFrameDetectResult,
+    const int numChannels, const double CurrentRealfreq, double *outputIdata,
+    double *outputQdata)
+{
+  // Guard the unchecked accesses below (origIdata[i], origQdata[i], back()).
+  if (numChannels <= 0 || vecOneFrameDetectResult.empty() ||
+      origIdata.size() < static_cast<size_t>(numChannels) ||
+      origQdata.size() < static_cast<size_t>(numChannels))
+  {
+    return false;
+  }
+
+  // Per-channel signal length; all channels are assumed to be equally long.
+  int signalLength = origIdata[0].size();
+
+  int upFactor = 1;          // upsampling factor, fixed to 1
+  int outputTotalLength = 0; // sum of the per-detection output lengths
+
+  // Output length of each accepted detection.
+  std::vector<int> outputLength;
+  // Decimation (downsampling) factor of each accepted detection.
+  std::vector<int> downFactor;
+  // Frequency of each accepted detection.
+  std::vector<double> detectFreq;
+
+  // Map every entry of the last detection map to a decimation factor.
+  for (const auto &[freq, fbv] : vecOneFrameDetectResult.back())
+  {
+    double bandwidth = fbv.bandwidth * 1e6;
+    int decimation = 0;
+    if (std::abs(bandwidth - 40e6) < 2 * 1e6)
+    {
+      decimation = 4;
+    }
+    else if (std::abs(bandwidth - 20e6) < 2 * 1e6)
+    {
+      decimation = 8;
+    }
+    else if (std::abs(bandwidth - 10e6) < 2 * 1e6)
+    {
+      decimation = 16;
+    }
+    else
+    {
+      // Unsupported bandwidth: skip this detection.
+      std::cout << __FUNCTION__ << ":带宽 " << bandwidth << "--- 不符合 ";
+      continue;
+    }
+
+    downFactor.push_back(decimation);
+    detectFreq.push_back(fbv.frequency);
+
+    // Output length of this detection after resampling.
+    int length = quotientCeil(signalLength * upFactor, decimation);
+    outputLength.push_back(length);
+
+    // Running total over all accepted detections.
+    outputTotalLength += length;
+  }
+
+  // Size every per-detection buffer by the entries actually kept above; the
+  // size of the outer vector says nothing about how many were collected.
+  const int numResults = static_cast<int>(downFactor.size());
+  if (numResults == 0)
+  {
+    return true; // nothing to resample; an empty grid cannot be launched
+  }
+
+  // ==== Prepare the ShiftingAndResamplingKernel launch ====
+  // Device copies of the decimation factors, output lengths and frequencies.
+  int *d_downFactor = nullptr;
+  int *d_outputLength = nullptr;
+  double *d_frequency = nullptr;
+  // Allocate device memory.
+  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(double))));
+
+  // Decimation factors -> device.
+  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, downFactor.data(),
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // Frequencies -> device (detectFreq and d_frequency are both double).
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, detectFreq.data(),
+                              numResults * sizeof(double),
+                              cudaMemcpyHostToDevice));
+
+  // Per-detection output lengths -> device.
+  CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, outputLength.data(),
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // Device buffers for the original I/Q data, one channel after another.
+  const size_t copySize = signalLength * sizeof(double);
+  double *d_Idata = nullptr;
+  double *d_Qdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_Idata, numChannels * copySize));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_Qdata, numChannels * copySize));
+
+  for (int i = 0; i < numChannels; i++)
+  {
+    // Channel i of the original I data.
+    CHECK_CUDA_ERROR(cudaMemcpy(d_Idata + i * signalLength,
+                                origIdata[i].data(), copySize,
+                                cudaMemcpyHostToDevice));
+
+    // Channel i of the original Q data.
+    CHECK_CUDA_ERROR(cudaMemcpy(d_Qdata + i * signalLength,
+                                origQdata[i].data(), copySize,
+                                cudaMemcpyHostToDevice));
+  }
+
+  // Device buffers for the resampled output (size computed in size_t).
+  const size_t outputSize =
+      sizeof(double) * numChannels * outputTotalLength;
+  double *d_outputIdata = nullptr;
+  double *d_outputQdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, outputSize));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, outputSize));
+
+  // Launch configuration: numChannels threads per block.
+  dim3 block(numChannels);
+  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
+
+  ShiftingAndResamplingKernel<<<grid, block>>>(
+      d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
+      numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
+  // A kernel launch returns no status; fetch launch errors explicitly.
+  CHECK_CUDA_ERROR(cudaGetLastError());
+
+  // Copy the results back to the host (device -> host). The caller must
+  // provide enough room in outputIdata and outputQdata.
+  // Layout: [numResults][numChannels][lengthPerResults], stored contiguously.
+  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata, outputSize,
+                              cudaMemcpyDeviceToHost));
+
+  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata, outputSize,
+                              cudaMemcpyDeviceToHost));
+
+  // Release device memory.
+  if (d_downFactor)
+  {
+    cudaFree(d_downFactor);
+    d_downFactor = nullptr;
+  }
+
+  if (d_outputLength)
+  {
+    cudaFree(d_outputLength);
+    d_outputLength = nullptr;
+  }
+
+  if (d_frequency)
+  {
+    cudaFree(d_frequency);
+    d_frequency = nullptr;
+  }
+
+  if (d_Idata)
+  {
+    cudaFree(d_Idata);
+    d_Idata = nullptr;
+  }
+
+  if (d_Qdata)
+  {
+    cudaFree(d_Qdata);
+    d_Qdata = nullptr;
+  }
+
+  if (d_outputIdata)
+  {
+    cudaFree(d_outputIdata);
+    d_outputIdata = nullptr;
+  }
+
+  if (d_outputQdata)
+  {
+    cudaFree(d_outputQdata);
+    d_outputQdata = nullptr;
+  }
+
+  return true;
+}
+
+/**
+ * ShiftAndResampleSignalV2
+ * 重采样函数：完成原始信号的移频，重采样等计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param vecOneFrameDetectResult：检测结果数据
+ * @param alignSignalLength：重采样后信号的对齐长度
+ * @param numChannels：信号通道数
+ * @param CurrentRealfreq：当前实际频率
+ * @param outputIdata：
+ * 重采样后的Idata，连续存储，格式：[numResults][numChannels][alignSignalLength]
+ * @param outputQdata：
+ * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][alignSignalLength]
+ * @return true or false
+ */
+template <typename T>
+bool ShiftAndResampleSignalV2(
+    const std::vector<std::vector<T>> &origIdata,
+    const std::vector<std::vector<T>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValue<T>>> &
+        vecOneFrameDetectResult,
+    const int alignSignalLength,
+    const int numChannels, const float CurrentRealfreq, T *outputIdata,
+    T *outputQdata)
+{
+  // 每个通道的信号长度：这里假设所有通道的长度是相同的
+  int signalLength = origIdata[0].size();
+
+  // 检测结果数据，采用vector存储更好，没必要使用map
+  int numResults = vecOneFrameDetectResult.size();
+
+  int upFactor = 1;          // 上采样率，默认为1
+  int outputTotalLength = 0; // 保存总的输出信号长度
+
+  // 上采样率
+  std::vector<int> downFactor;
+  // 检测结果的频率
+  std::vector<T> detectFreq;
+  // 检测结果的带宽
+  std::vector<T> detectBandwidth;
+
+  // 根据检测结果，初始化相关变量或者vector
+  // vecOneFrameDetectResult 可以不使用map，使用vector
+  for (const auto &[freq, fbv] : vecOneFrameDetectResult.back())
+  {
+    T bandwidth = fbv.bandwidth * 1e6;
+    int decimation = 0;
+    if (std::abs(bandwidth - 40e6) < 2 * 1e6)
+    {
+      decimation = 4;
+    }
+    else if (std::abs(bandwidth - 20e6) < 2 * 1e6)
+    {
+      decimation = 8;
+    }
+    else if (std::abs(bandwidth - 10e6) < 2 * 1e6)
+    {
+      decimation = 16;
+    }
+    else
+    {
+      // 带宽不符合要求，跳过处理
+      std::cout << __FUNCTION__ << ":带宽 " << bandwidth << "--- 不符合 ";
+      continue;
+    }
+
+    downFactor.push_back(decimation);
+    detectFreq.push_back(fbv.frequency);
+    detectBandwidth.push_back(bandwidth);
+  }
+
+  // ====准备调用重采样核函数：ShiftingAndResamplingKernelV2=====
+  // copy下采样率，频率等数据到显存中
+  int *d_downFactor = nullptr;
+  T *d_frequency = nullptr;
+  // 申请显存
+  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(T))));
+
+  // copy下采样率到显存中
+  const int *src_downFactor = downFactor.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // copy频率到显存中
+  const T *src_frequency = detectFreq.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
+                              numResults * sizeof(T), cudaMemcpyHostToDevice));
+
+  // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
+  T *d_Idata = nullptr;
+  T *d_Qdata = nullptr;
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(T))));
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(T))));
+
+  // 将所有通道数据循环拷贝到GPU显存
+  size_t copySize = signalLength * sizeof(T);
+  for (int i = 0; i < numChannels; i++)
+  {
+    // copy 原始的idata 到gpu显存
+    T *dst_idata = d_Idata + i * signalLength;
+    const void *src_idata = origIdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
+
+    // copy 原始的qdata 到gpu显存
+    T *dst_qdata = d_Qdata + i * signalLength;
+    const void *src_qdata = origQdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
+  }
+
+  // 申请重采样后输出信号的GPU显存
+  size_t totalsize = numResults * numChannels * alignSignalLength * sizeof(T);
+  T *d_outputIdata = nullptr;
+  T *d_outputQdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, totalsize));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, totalsize));
+
+  // 初始化为0
+  CHECK_CUDA_ERROR(cudaMemset(d_outputIdata, 0, totalsize));
+  CHECK_CUDA_ERROR(cudaMemset(d_outputQdata, 0, totalsize));
+
+  // 线程数配置，总的线程数：numChannels * numResults
+  dim3 block(numChannels);
+  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
+  ShiftingAndResamplingKernelV2<<<grid, block>>>(
+      d_Idata, d_Qdata, d_downFactor, d_frequency, numResults,
+      numChannels, signalLength, CurrentRealfreq, alignSignalLength,
+      d_outputIdata, d_outputQdata);
+
+  // copy重采样计算结果到主存
+  // outputIdata 确保申请的内存空间够
+  // outputQdata 确保申请的内存空间够
+  // 存储格式：[numResults][numChannels][lengthPerResults]
+  // 且在内存中是连续存放的
+  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
+                              (numChannels * outputTotalLength * sizeof(T)),
+                              cudaMemcpyDeviceToHost));
+
+  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
+                              (numChannels * outputTotalLength * sizeof(T)),
+                              cudaMemcpyDeviceToHost));
+
+  // 释放显存
+  if (d_downFactor)
+  {
     cudaFree(d_downFactor);
     d_downFactor = nullptr;
   }
 
-  if (d_outputLength) {
+  if (d_outputLength)
+  {
     cudaFree(d_outputLength);
     d_outputLength = nullptr;
   }
 
-  if (d_frequency) {
+  if (d_frequency)
+  {
     cudaFree(d_frequency);
     d_frequency = nullptr;
   }
 
-  if (d_Idata) {
+  if (d_Idata)
+  {
     cudaFree(d_Idata);
     d_Idata = nullptr;
   }
 
-  if (d_Qdata) {
+  if (d_Qdata)
+  {
     cudaFree(d_Qdata);
     d_Qdata = nullptr;
   }
 
-  if (d_outputIdata) {
+  if (d_outputIdata)
+  {
     cudaFree(d_outputIdata);
     d_outputIdata = nullptr;
   }
 
-  if (d_outputIdata) {
+  if (d_outputIdata)
+  {
     cudaFree(d_outputIdata);
     d_outputIdata = nullptr;
   }
diff --git a/cuda_resample.h b/cuda_resample.h
index 33bf998..b6ed491 100644
--- a/cuda_resample.h
+++ b/cuda_resample.h
@@ -15,7 +15,8 @@
 #endif
 
 template <typename T>
-struct FreqBandValue {
+struct FreqBandValue
+{
   T frequency;
   T bandwidth;
   T maxvalue;
@@ -24,8 +25,75 @@ struct FreqBandValue {
   T ebn0;
 };
 
+struct FreqBandValueFloat // non-template, float-typed detection record
+{
+  float frequency; // read by ShiftAndResampleSignalFloat as the detection frequency
+  float bandwidth; // scaled by 1e6 before matching 10e6/20e6/40e6; presumably MHz - confirm
+  float maxvalue;  // NOTE(review): not used by the visible resampling code
+  int startIndex;  // NOTE(review): index range semantics not visible here
+  int stopIndex;   // NOTE(review): inclusive or exclusive - confirm with producer
+  float ebn0;      // presumably Eb/N0 - confirm
+};
+
+struct FreqBandValueDouble // non-template, double-typed detection record
+{
+  double frequency; // read by ShiftAndResampleSignalDouble as the detection frequency
+  double bandwidth; // scaled by 1e6 before matching 10e6/20e6/40e6; presumably MHz - confirm
+  double maxvalue;  // NOTE(review): not used by the visible resampling code
+  int startIndex;   // NOTE(review): index range semantics not visible here
+  int stopIndex;    // NOTE(review): inclusive or exclusive - confirm with producer
+  double ebn0;      // presumably Eb/N0 - confirm
+};
+
+/**
+ * ShiftAndResampleSignal
+ * Resampling function: frequency-shifts and resamples the original signal.
+ *
+ * @param origIdata original I data
+ * @param origQdata original Q data
+ * @param vecOneFrameDetectResult detection result data
+ * @param numChannels number of signal channels
+ * @param CurrentRealfreq current real frequency
+ * @param outputIdata
+ * resampled I data, stored contiguously as [numResults][numChannels][lengthPerResult]
+ * @param outputQdata
+ * resampled Q data, stored contiguously as [numResults][numChannels][lengthPerResult]
+ * @return true or false
+ */
+template <typename T>
+bool ShiftAndResampleSignal(
+    const std::vector<std::vector<T>> &origIdata,
+    const std::vector<std::vector<T>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValue<T>>> &
+        vecOneFrameDetectResult,
+    const int numChannels, const float CurrentRealfreq, T *outputIdata,
+    T *outputQdata);
+
+/**
+ * ShiftAndResampleSignalFloat
+ * Resampling function: frequency-shifts and resamples the original signal.
+ *
+ * @param origIdata original I data
+ * @param origQdata original Q data
+ * @param vecOneFrameDetectResult detection result data
+ * @param numChannels number of signal channels
+ * @param CurrentRealfreq current real frequency
+ * @param outputIdata
+ * resampled I data, stored contiguously as [numResults][numChannels][lengthPerResult]
+ * @param outputQdata
+ * resampled Q data, stored contiguously as [numResults][numChannels][lengthPerResult]
+ * @return true or false
+ */
+bool ShiftAndResampleSignalFloat(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValueFloat>> &
+        vecOneFrameDetectResult,
+    const int numChannels, const float CurrentRealfreq, float *outputIdata,
+    float *outputQdata);
+
 /**
- * ShiftAndResampleDetectResult
+ * ShiftAndResampleSignalDouble
  * 重采样函数：完成原始信号的移频，重采样等计算
  *
  * @param origIdata：原始Idata
@@ -37,15 +105,40 @@ struct FreqBandValue {
  * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
  * @param outputQdata：
  * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @return void
+ * @return true or false
+ */
+bool ShiftAndResampleSignalDouble(
+    const std::vector<std::vector<double>> &origIdata,
+    const std::vector<std::vector<double>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValueDouble>> &
+        vecOneFrameDetectResult,
+    const int numChannels, const double CurrentRealfreq, double *outputIdata,
+    double *outputQdata);
+
+/**
+ * ShiftAndResampleSignalV2
+ * 重采样函数：完成原始信号的移频，重采样等计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param vecOneFrameDetectResult：检测结果数据
+ * @param alignSignalLength：重采样后信号的对齐长度
+ * @param numChannels：信号通道数
+ * @param CurrentRealfreq：当前实际频率
+ * @param outputIdata：
+ * 重采样后的Idata，连续存储，格式：[numResults][numChannels][alignSignalLength]
+ * @param outputQdata：
+ * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][alignSignalLength]
+ * @return true or false
  */
 template <typename T>
-bool ShiftAndResampleDetectResult(
-    const std::vector<std::vector<T>>& origIdata,
-    const std::vector<std::vector<T>>& origQdata,
-    const std::vector<std::map<int64_t, FreqBandValue<T>>>&
+bool ShiftAndResampleSignalV2(
+    const std::vector<std::vector<T>> &origIdata,
+    const std::vector<std::vector<T>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValue<T>>> &
         vecOneFrameDetectResult,
-    const int numChannels, const float CurrentRealfreq, T* outputIdata,
-    T* outputQdata);
+    const int alignSignalLength,
+    const int numChannels, const float CurrentRealfreq, T *outputIdata,
+    T *outputQdata);
 
-#endif  // CUDA_RESAMPLE_H
+#endif // CUDA_RESAMPLE_H
diff --git a/mainwindow.cpp b/mainwindow.cpp
index 8114345..f171e09 100644
--- a/mainwindow.cpp
+++ b/mainwindow.cpp
@@ -10,7 +10,8 @@
 
 using namespace std;
 
-MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent) {
+MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent)
+{
   InitControlValues();
   InitUI();
   InitConnect();
@@ -18,7 +19,8 @@ MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent) {
 
 MainWindow::~MainWindow() {}
 
-void MainWindow::InitControlValues() {
+void MainWindow::InitControlValues()
+{
   m_btnCalculate = new QPushButton(QStringLiteral("加载数据计算"), this);
 
   basePath = "/../data/";
@@ -26,15 +28,18 @@ void MainWindow::InitControlValues() {
 
 void MainWindow::InitUI() { setCentralWidget(m_btnCalculate); }
 
-void MainWindow::InitConnect() {
+void MainWindow::InitConnect()
+{
   connect(m_btnCalculate, SIGNAL(clicked()), this, SLOT(SlotCalculateClick()));
 }
 
-int MainWindow::CalculateRoutine(uint signalChannels, uint signalLength) {
+int MainWindow::CalculateRoutine(uint signalChannels, uint signalLength)
+{
   return m_calMC.CalMovingCorrlationRoutine(signalChannels, signalLength);
 }
 
-void MainWindow::SlotCalculateClick() {
+void MainWindow::SlotCalculateClick()
+{
   QString strFileName =
       "20250213105831_Detect_FreqList5720-5750MHz_Span15.36MHz_Point4096_2025-"
       "02-13 10-58-31_0.dat";
@@ -47,7 +52,8 @@ void MainWindow::SlotCalculateClick() {
   QString strIQFileName =
       QCoreApplication::applicationDirPath() + basePath + strFileName;
 
-  if (QFile::exists(strIQFileName) != true) {
+  if (QFile::exists(strIQFileName) != true)
+  {
     std::cerr << __FUNCTION__ << strIQFileName.toStdString() << ":文件不存在"
               << std::endl;
     return;
@@ -72,18 +78,23 @@ void MainWindow::SlotCalculateClick() {
 
   // 打开回放文件
   m_ReplayFile.open(strIQFileName.toStdString(), ios::in | ios::binary);
-  if (!m_ReplayFile) {
+  if (!m_ReplayFile)
+  {
     qDebug() << __FUNCTION__ << "file open error";
     return;
   }
 
   // 循环读取每一帧数据 4096点
   for (int m_uiCurrentFrame = 0; m_uiCurrentFrame < m_iframeCnt;
-       ++m_uiCurrentFrame) {
-    if (m_uiCurrentFrame < m_iframeCnt - 1) {
+       ++m_uiCurrentFrame)
+  {
+    if (m_uiCurrentFrame < m_iframeCnt - 1)
+    {
       oneframesize = m_vecReplayHeadposDetect.at(m_uiCurrentFrame + 1) -
                      m_vecReplayHeadposDetect.at(m_uiCurrentFrame);
-    } else {
+    }
+    else
+    {
       oneframesize = m_ReplayfilesizeDetect -
                      m_vecReplayHeadposDetect.at(m_uiCurrentFrame);
     }
@@ -111,11 +122,13 @@ void MainWindow::SlotCalculateClick() {
 
 void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath,
                                       std::vector<size_t> &headPos,
-                                      qint64 &Replayfilesize) {
+                                      qint64 &Replayfilesize)
+{
   std::ifstream replayfileforcalculate;
   replayfileforcalculate.open(ReplayFilePath.toStdString(),
                               ios::in | ios::binary);
-  if (!replayfileforcalculate) {
+  if (!replayfileforcalculate)
+  {
     qDebug() << __FUNCTION__ << "file open error";
     return;
   }
@@ -135,7 +148,8 @@ void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath,
   size_t count = 0;
   headPos.clear();
   headPos.reserve(5000);
-  while ((index = source.find(match, index)) < Replayfilesize) {
+  while ((index = source.find(match, index)) < Replayfilesize)
+  {
     unsigned int DataSize = m_droneIQParse.GetDataSize(buff + index);
     headPos.push_back(index);
     index += 8;
@@ -150,17 +164,22 @@ void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath,
   buff = nullptr;
 }
 
-void MainWindow::ReplayIQDataParse(char *buf) {
-  if (signalLength_ > 0) {
-    if (signalChannels_ == 32) {
-      uint channelnumber = 8;  //原逻辑也是只取了前8个通道
+void MainWindow::ReplayIQDataParse(char *buf)
+{
+  if (signalLength_ > 0)
+  {
+    if (signalChannels_ == 32)
+    {
+      uint channelnumber = 8; // 原逻辑也是只取了前8个通道
 
-      if (signalDatas_ == nullptr) {
+      if (signalDatas_ == nullptr)
+      {
         // 申请零拷贝内存，自动完成CPU内存与GPU显存数据同步
         if (!m_calMC.cudaCorrelation->AllocMappMemory(
                 (void **)&(m_calMC.cudaCorrelation->h_signals),
                 (void **)&(m_calMC.cudaCorrelation->d_signals),
-                channelnumber * signalLength_ * sizeof(cpuComplex))) {
+                channelnumber * signalLength_ * sizeof(cpuComplex)))
+        {
           std::cerr << __FUNCTION__ << " AllocMappMemory failed." << std::endl;
           return;
         }
@@ -182,3 +201,48 @@ void MainWindow::ReplayIQDataParse(char *buf) {
     }
   }
 }
+
+template <typename T>
+void MainWindow::ReplayIQDataParseV2(const T *outputIdata,
+                                     const T *outputQdata, const int numResults,
+                                     const int numChannels,
+                                     const int signalLength)
+{
+  if (signalDatas_ == nullptr)
+  {
+    // Request zero-copy (mapped) memory shared between CPU and GPU.
+    if (!m_calMC.cudaCorrelation->AllocMappMemory(
+            (void **)&(m_calMC.cudaCorrelation->h_signals),
+            (void **)&(m_calMC.cudaCorrelation->d_signals),
+            numResults * numChannels * signalLength * sizeof(cpuComplex)))
+    {
+      std::cerr << __FUNCTION__ << " AllocMappMemory failed." << std::endl;
+      return;
+    }
+
+    signalDatas_ = (cpuComplex *)m_calMC.cudaCorrelation->h_signals;
+  }
+
+  // NOTE(review): an already allocated buffer is reused without a size check.
+  int index = 0;
+  for (int i = 0; i < numResults; i++)
+  {
+    for (int j = 0; j < numChannels; j++)
+    {
+      for (int k = 0; k < signalLength; k++)
+      {
+        int idx = (i * numChannels + j) * signalLength + k;
+        signalDatas_[index++] = cpuComplex((T)outputIdata[idx], (T)outputQdata[idx]);
+      }
+    }
+  }
+
+  QElapsedTimer tm;
+  tm.start();
+  // Run the whole calculation with the dimensions of the data written above
+  // (result: 1 = correlation peak found, 0 = no correlation peak found).
+  int result = CalculateRoutine(numResults * numChannels, signalLength);
+
+  std::cout << __FUNCTION__ << " result:" << result
+            << " tm(ns):" << tm.nsecsElapsed() << std::endl;
+}
-- 
Gitee


From 82e38c644fa60a2810a1695b93ac89be37a9a4c5 Mon Sep 17 00:00:00 2001
From: amor <15820865+Amor_23456@user.noreply.gitee.com>
Date: Wed, 17 Dec 2025 11:41:36 +0800
Subject: [PATCH 08/27] =?UTF-8?q?=E6=96=B0=E5=A2=9EShiftAndResampleSignalD?=
 =?UTF-8?q?ouble=E7=9A=84=E9=87=8D=E8=BD=BD=E6=8E=A5=E5=8F=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cuda_resample.cu | 160 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 160 insertions(+)

diff --git a/cuda_resample.cu b/cuda_resample.cu
index 01206e5..23f457c 100644
--- a/cuda_resample.cu
+++ b/cuda_resample.cu
@@ -1255,6 +1255,166 @@ bool ShiftAndResampleSignalDouble(
   return true;
 }
 
+/**
+ * ShiftAndResampleSignalDouble
+ * Resampling entry point: frequency-shifts and resamples the raw signal on
+ * the GPU. This overload takes the per-result tables precomputed by the
+ * caller.
+ *
+ * @param origIdata raw I data, one vector per channel
+ * @param origQdata raw Q data, one vector per channel
+ * @param outputLength per-result output signal length after resampling
+ * @param downFactor per-result down-sampling factor
+ * @param detectFreq per-result frequency
+ * @param outputTotalLength per-channel total output length; the output
+ * buffers hold numChannels * outputTotalLength doubles (presumably the sum of
+ * outputLength -- confirm with callers)
+ * @param numResults number of detection results
+ * @param numChannels number of signal channels
+ * @param CurrentRealfreq current actual frequency
+ * @param outputIdata
+ * resampled I data, contiguous: [numResults][numChannels][lengthPerResult]
+ * @param outputQdata
+ * resampled Q data, contiguous: [numResults][numChannels][lengthPerResult]
+ * @return true once the results are copied back; false when the input
+ * containers are smaller than numResults / numChannels require
+ */
+bool ShiftAndResampleSignalDouble(
+    const std::vector<std::vector<double>> &origIdata,
+    const std::vector<std::vector<double>> &origQdata,
+    std::vector<int> &outputLength,
+    std::vector<int> &downFactor,
+    std::vector<double> &detectFreq,
+    const int outputTotalLength, const int numResults,
+    const int numChannels, const double CurrentRealfreq, double *outputIdata,
+    double *outputQdata)
+{
+  // Guard the indexing below: origIdata[0] and the per-result / per-channel
+  // copies would otherwise read past the end of the host containers.
+  if (numResults <= 0 || numChannels <= 0 ||
+      origIdata.size() < static_cast<size_t>(numChannels) ||
+      origQdata.size() < static_cast<size_t>(numChannels) ||
+      downFactor.size() < static_cast<size_t>(numResults) ||
+      detectFreq.size() < static_cast<size_t>(numResults) ||
+      outputLength.size() < static_cast<size_t>(numResults))
+  {
+    std::cerr << __FUNCTION__ << " invalid input sizes." << std::endl;
+    return false;
+  }
+
+  // Per-channel signal length, taken from channel 0.
+  int signalLength = origIdata[0].size();
+
+  // Exactly signalLength samples are uploaded per channel, so every channel
+  // must hold at least that many I and Q samples.
+  for (int i = 0; i < numChannels; i++)
+  {
+    if (origIdata[i].size() < static_cast<size_t>(signalLength) ||
+        origQdata[i].size() < static_cast<size_t>(signalLength))
+    {
+      std::cerr << __FUNCTION__ << " channel too short." << std::endl;
+      return false;
+    }
+  }
+
+  // ==== Prepare the resampling kernel launch ====
+  // Device copies of the per-result down-sampling factors, output lengths and
+  // frequencies.
+  int *d_downFactor = nullptr;
+  int *d_outputLength = nullptr;
+  double *d_frequency = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(double))));
+
+  // Copy the down-sampling factors to the device.
+  const int *src_downFactor = downFactor.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // Copy the frequencies to the device.
+  const double *src_frequency = detectFreq.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
+                              numResults * sizeof(double), cudaMemcpyHostToDevice));
+
+  // Copy the per-result output lengths to the device.
+  const int *src_outputLength = outputLength.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, src_outputLength,
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // Device buffers for the raw I/Q data, channels stored back to back.
+  double *d_Idata = nullptr;
+  double *d_Qdata = nullptr;
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(double))));
+
+  // Upload every channel into its slot of the device buffers.
+  size_t copySize = signalLength * sizeof(double);
+  for (int i = 0; i < numChannels; i++)
+  {
+    // Raw I data of channel i.
+    double *dst_idata = d_Idata + i * signalLength;
+    const void *src_idata = origIdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
+
+    // Raw Q data of channel i.
+    double *dst_qdata = d_Qdata + i * signalLength;
+    const void *src_qdata = origQdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
+  }
+
+  // Device buffers for the resampled output.
+  double *d_outputIdata = nullptr;
+  double *d_outputQdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata,
+                              (numChannels * outputTotalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
+                              (numChannels * outputTotalLength * sizeof(double))));
+
+  // Launch configuration: numChannels threads per block.
+  dim3 block(numChannels);
+  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
+
+  ShiftingAndResamplingKernel<<<grid, block>>>(
+      d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
+      numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
+
+  // A kernel launch returns no status; pick up launch errors explicitly.
+  CHECK_CUDA_ERROR(cudaGetLastError());
+
+  // Copy the results back to the host (device -> host). The caller must size
+  // outputIdata and outputQdata for numChannels * outputTotalLength doubles.
+  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
+                              (numChannels * outputTotalLength * sizeof(double)),
+                              cudaMemcpyDeviceToHost));
+
+  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
+                              (numChannels * outputTotalLength * sizeof(double)),
+                              cudaMemcpyDeviceToHost));
+
+  // Release device memory; cudaFree(nullptr) is a no-op, so no null checks.
+  // Per-result tables.
+  cudaFree(d_downFactor);
+  cudaFree(d_outputLength);
+  cudaFree(d_frequency);
+
+  // Raw input buffers.
+  cudaFree(d_Idata);
+  cudaFree(d_Qdata);
+
+  // Resampled output buffers.
+  cudaFree(d_outputIdata);
+  cudaFree(d_outputQdata);
+
+  return true;
+}
+
 /**
  * ShiftAndResampleSignalV2
  * 重采样函数：完成原始信号的移频，重采样等计算
-- 
Gitee


From 24a81fa1bff2c1f11bbfa89f7cc719972793bcb4 Mon Sep 17 00:00:00 2001
From: amor <15820865+Amor_23456@user.noreply.gitee.com>
Date: Wed, 17 Dec 2025 11:51:01 +0800
Subject: [PATCH 09/27] =?UTF-8?q?=E6=96=B0=E5=A2=9EShiftAndResampleSignalD?=
 =?UTF-8?q?ouble=E9=87=8D=E8=BD=BD=E6=8E=A5=E5=8F=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cuda_resample.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/cuda_resample.h b/cuda_resample.h
index b6ed491..04eff2a 100644
--- a/cuda_resample.h
+++ b/cuda_resample.h
@@ -115,6 +115,34 @@ bool ShiftAndResampleSignalDouble(
     const int numChannels, const double CurrentRealfreq, double *outputIdata,
     double *outputQdata);
 
+/**
+ * ShiftAndResampleSignalDouble
+ * Resampling entry point: frequency-shifts and resamples the raw signal.
+ *
+ * @param origIdata raw I data
+ * @param origQdata raw Q data
+ * @param outputLength per-bandwidth output signal length after resampling
+ * @param downFactor per-bandwidth down-sampling factor
+ * @param detectFreq per-bandwidth frequency
+ * @param outputTotalLength total output length (TODO confirm exact meaning)
+ * @param numResults number of detection results
+ * @param numChannels number of signal channels
+ * @param CurrentRealfreq current actual frequency
+ * @param outputIdata resampled I data, stored contiguously
+ * @param outputQdata resampled Q data, stored contiguously
+ * Both outputs use the layout [numResults][numChannels][lengthPerResult].
+ * @return true or false
+ */
+bool ShiftAndResampleSignalDouble(
+    const std::vector<std::vector<double>> &origIdata,
+    const std::vector<std::vector<double>> &origQdata,
+    std::vector<int> &outputLength,
+    std::vector<int> &downFactor,
+    std::vector<double> &detectFreq,
+    const int outputTotalLength, const int numResults,
+    const int numChannels, const double CurrentRealfreq, double *outputIdata,
+    double *outputQdata);
+
 /**
  * ShiftAndResampleSignalV2
  * 重采样函数：完成原始信号的移频，重采样等计算
-- 
Gitee


From 60345d149f8e6587ac287c39bc10bd48c2f1ad6e Mon Sep 17 00:00:00 2001
From: amor <15820865+Amor_23456@user.noreply.gitee.com>
Date: Wed, 17 Dec 2025 11:56:26 +0800
Subject: [PATCH 10/27] =?UTF-8?q?=E6=96=B0=E5=A2=9E=20ReplayIQDataParseV2?=
 =?UTF-8?q?=20=E6=8E=A5=E5=8F=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: amor <15820865+Amor_23456@user.noreply.gitee.com>
---
 mainwindow.h | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/mainwindow.h b/mainwindow.h
index a63a935..9a6e006 100644
--- a/mainwindow.h
+++ b/mainwindow.h
@@ -14,10 +14,11 @@
 
 using namespace std;
 
-class MainWindow : public QMainWindow {
+class MainWindow : public QMainWindow
+{
   Q_OBJECT
 
- public:
+public:
   MainWindow(QWidget *parent = nullptr);
   ~MainWindow();
 
@@ -34,7 +35,7 @@ class MainWindow : public QMainWindow {
   uint signalChannels_ = 0;
   uint signalLength_ = 0;
 
- private:
+private:
   // 获取测试数据文件中 每一帧数据的帧头下标
   void GetReplayFileHeadPos(QString ReplayFilePath,
                             std::vector<size_t> &headPos,
@@ -43,7 +44,7 @@ class MainWindow : public QMainWindow {
   // 解析每一帧数据为 8路的 IQ 原始数据
   void ReplayIQDataParse(char *buf);
 
- private:
+private:
   QPushButton *m_btnCalculate;
 
   CalculateMovingCorrelation m_calMC;
@@ -57,7 +58,13 @@ class MainWindow : public QMainWindow {
   ifstream m_ReplayFile;
   int oneframesize;
 
- public slots:
+public slots:
   void SlotCalculateClick();
 };
-#endif  // MAINWINDOW_H
+
+// I/Q inputs are indexed as [numResults][numChannels][signalLength] (flat).
+template <typename T>
+void ReplayIQDataParseV2(const T *outputIdata, const T *outputQdata,
+                         const int numResults, const int numChannels,
+                         const int signalLength);
+#endif // MAINWINDOW_H
-- 
Gitee


From bc10a0b0dae83e852a0522ef59383b86f590630d Mon Sep 17 00:00:00 2001
From: amor <15820865+Amor_23456@user.noreply.gitee.com>
Date: Wed, 17 Dec 2025 13:37:35 +0800
Subject: [PATCH 11/27] =?UTF-8?q?=E6=89=8B=E5=8A=A8=E5=AE=9E=E7=8E=B0doubl?=
 =?UTF-8?q?e=E5=92=8Cfloat=E7=9A=84=E7=89=88=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: amor <15820865+Amor_23456@user.noreply.gitee.com>
---
 cuda_resample.cu | 154 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 151 insertions(+), 3 deletions(-)

diff --git a/cuda_resample.cu b/cuda_resample.cu
index 23f457c..e8fbb11 100644
--- a/cuda_resample.cu
+++ b/cuda_resample.cu
@@ -550,6 +550,154 @@ __global__ void ShiftingAndResamplingKernel(
   delete[] Q_shifted;
 }
 
+__global__ void ShiftingAndResamplingKernelFloat(
+    const float *__restrict__ origIdata, const float *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
+    const int *__restrict__ VOutputLength, const int numResults,
+    const int numChannels, const int signalLength, const float CurrentRealfreq,
+    float *__restrict__ outputIdata, float *__restrict__ outputQdata)
+{
+  // Each thread frequency-shifts one channel for one detection result and
+  // resamples it (up 1, down VDownFactor[result]) into outputIdata/outputQdata.
+  // VOutputLength is an int table, matching the int buffer the host passes.
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= numChannels * numResults)
+    return;
+
+  int ResIdx = idx / numChannels; // detection-result index
+  int chIdx = idx % numChannels;  // channel index
+
+  const float sampling_rate = float(245.76e6);
+  float frequency = VFrequency[ResIdx];
+  float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
+
+  // Input samples of the channel handled by this thread.
+  const float *I_orig = origIdata + chIdx * signalLength;
+  const float *Q_orig = origQdata + chIdx * signalLength;
+
+  // Per-thread scratch buffers that receive the frequency-shifted signal.
+  float *I_shifted = new float[signalLength];
+  float *Q_shifted = new float[signalLength];
+  if (I_shifted == nullptr || Q_shifted == nullptr)
+  {
+    // delete[] of nullptr is a no-op, so this frees whichever one succeeded.
+    delete[] I_shifted;
+    delete[] Q_shifted;
+    return;
+  }
+  // Frequency shift: multiply by the local-oscillator signal.
+  for (int i = 0; i < signalLength; i++)
+  {
+    float phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+    float cosVal = dev_cos(phase);
+    float sinVal = dev_sin(phase);
+    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
+  }
+
+  // The up-sampling factor is fixed at 1; only down-sampling varies.
+  int upFactor = 1;
+  int downFactor = VDownFactor[ResIdx];
+
+  // Sum of the per-channel output lengths of all preceding results.
+  int beforeTotalLength = 0;
+  for (int i = 0; i < ResIdx; i++)
+  {
+    beforeTotalLength += VOutputLength[i];
+  }
+  // Offset of this result's region in the output buffers.
+  int offset = beforeTotalLength * numChannels;
+
+  // Output length of this result (lengths may differ between results).
+  int outputLength = VOutputLength[ResIdx];
+
+  // Destination slices; the caller must provide large enough output buffers.
+  float *I_resampled = outputIdata + offset + chIdx * outputLength;
+  float *Q_resampled = outputQdata + offset + chIdx * outputLength;
+
+  // Resample both components.
+  dev_resample(upFactor, downFactor, I_shifted, signalLength, I_resampled);
+  dev_resample(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+
+  // Release the scratch buffers.
+  delete[] I_shifted;
+  delete[] Q_shifted;
+}
+
+__global__ void ShiftingAndResamplingKernelDouble(
+    const double *__restrict__ origIdata, const double *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency,
+    const int *__restrict__ VOutputLength, const int numResults,
+    const int numChannels, const int signalLength, const double CurrentRealfreq,
+    double *__restrict__ outputIdata, double *__restrict__ outputQdata)
+{
+  // Each thread frequency-shifts one channel for one detection result and
+  // resamples it (up 1, down VDownFactor[result]) into outputIdata/outputQdata.
+  // VOutputLength is an int table, matching the int buffer the host passes.
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= numChannels * numResults)
+    return;
+
+  int ResIdx = idx / numChannels; // detection-result index
+  int chIdx = idx % numChannels;  // channel index
+
+  const double sampling_rate = double(245.76e6);
+  double frequency = VFrequency[ResIdx];
+  double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
+
+  // Input samples of the channel handled by this thread.
+  const double *I_orig = origIdata + chIdx * signalLength;
+  const double *Q_orig = origQdata + chIdx * signalLength;
+
+  // Per-thread scratch buffers that receive the frequency-shifted signal.
+  double *I_shifted = new double[signalLength];
+  double *Q_shifted = new double[signalLength];
+  if (I_shifted == nullptr || Q_shifted == nullptr)
+  {
+    // delete[] of nullptr is a no-op, so this frees whichever one succeeded.
+    delete[] I_shifted;
+    delete[] Q_shifted;
+    return;
+  }
+  // Frequency shift: multiply by the local-oscillator signal.
+  for (int i = 0; i < signalLength; i++)
+  {
+    double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+    double cosVal = dev_cos(phase);
+    double sinVal = dev_sin(phase);
+    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
+  }
+
+  // The up-sampling factor is fixed at 1; only down-sampling varies.
+  int upFactor = 1;
+  int downFactor = VDownFactor[ResIdx];
+
+  // Sum of the per-channel output lengths of all preceding results.
+  int beforeTotalLength = 0;
+  for (int i = 0; i < ResIdx; i++)
+  {
+    beforeTotalLength += VOutputLength[i];
+  }
+  // Offset of this result's region in the output buffers.
+  int offset = beforeTotalLength * numChannels;
+
+  // Output length of this result (lengths may differ between results).
+  int outputLength = VOutputLength[ResIdx];
+
+  // Destination slices; the caller must provide large enough output buffers.
+  double *I_resampled = outputIdata + offset + chIdx * outputLength;
+  double *Q_resampled = outputQdata + offset + chIdx * outputLength;
+
+  // Resample both components.
+  dev_resample(upFactor, downFactor, I_shifted, signalLength, I_resampled);
+  dev_resample(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+
+  // Release the scratch buffers.
+  delete[] I_shifted;
+  delete[] Q_shifted;
+}
+
 /**
  * ShiftingAndResamplingKernelV2
  * 重采样核函数：完成原始信号的移频，重采样等计算
@@ -987,7 +1135,7 @@ bool ShiftAndResampleSignalFloat(
   dim3 block(numChannels);
   dim3 grid((numChannels * numResults + block.x - 1) / block.x);
 
-  ShiftingAndResamplingKernel<<<grid, block>>>(
+  ShiftingAndResamplingKernelFloat<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
       numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
 
@@ -1192,7 +1340,7 @@ bool ShiftAndResampleSignalDouble(
   dim3 block(numChannels);
   dim3 grid((numChannels * numResults + block.x - 1) / block.x);
 
-  ShiftingAndResamplingKernel<<<grid, block>>>(
+  ShiftingAndResamplingKernelDouble<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
       numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
 
@@ -1352,7 +1500,7 @@ bool ShiftAndResampleSignalDouble(
   dim3 block(numChannels);
   dim3 grid((numChannels * numResults + block.x - 1) / block.x);
 
-  ShiftingAndResamplingKernel<<<grid, block>>>(
+  ShiftingAndResamplingKernelDouble<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
       numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
 
-- 
Gitee


From fd5481c20e4c8354f759dd3678581db8916db63d Mon Sep 17 00:00:00 2001
From: amor <15820865+Amor_23456@user.noreply.gitee.com>
Date: Wed, 17 Dec 2025 13:50:18 +0800
Subject: [PATCH 12/27] =?UTF-8?q?=E6=89=8B=E5=8A=A8=E5=AE=9E=E7=8E=B0?=
 =?UTF-8?q?=E6=A8=A1=E6=9D=BF=E5=87=BD=E6=95=B0=E7=9A=84double=E5=92=8Cflo?=
 =?UTF-8?q?at=E7=89=88=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cuda_resample.cu | 616 +++++++++++++++++++++++++++--------------------
 cuda_resample.h  |  75 ++----
 2 files changed, 383 insertions(+), 308 deletions(-)

diff --git a/cuda_resample.cu b/cuda_resample.cu
index e8fbb11..3e50213 100644
--- a/cuda_resample.cu
+++ b/cuda_resample.cu
@@ -475,81 +475,6 @@ __device__ void dev_resample(const int upFactor, const int downFactor,
  * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
  * @return true or false
  */
-template <typename T>
-__global__ void ShiftingAndResamplingKernel(
-    const T *__restrict__ origIdata, const T *__restrict__ origQdata,
-    const int *__restrict__ VDownFactor, const T *__restrict__ VFrequency,
-    const T *__restrict__ VOutputLength, const int numResults,
-    const int numChannels, const int signalLength, const T CurrentRealfreq,
-    T *__restrict__ outputIdata, T *__restrict__ outputQdata)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= numChannels * numResults)
-    return;
-
-  // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
-  int ResIdx = idx / numChannels; // 第几个检测结果
-  int chIdx = idx % numChannels;  // 第几个通道
-
-  const T sampling_rate = T(245.76e6);
-
-  T frequency = VFrequency[ResIdx]; // 频率
-  T deltaFreq = (CurrentRealfreq - frequency) * 1e6;
-
-  // 获取当前线程处理的通道数据地址
-  auto &I_orig = origIdata + chIdx * signalLength;
-  auto &Q_orig = origQdata + chIdx * signalLength;
-
-  // 移频：生成本振信号并相乘
-  T *I_shifted = new T[signalLength];
-  if (I_shifted == nullptr)
-  {
-    return;
-  }
-  T *Q_shifted = new T[signalLength];
-  if (Q_shifted == nullptr)
-  {
-    return;
-  }
-  for (int i = 0; i < signalLength; i++)
-  {
-    T phase = 2 * M_PI * deltaFreq * i / sampling_rate;
-    T cosVal = dev_cos(phase);
-    T sinVal = dev_sin(phase);
-    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
-    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
-  }
-
-  // 上采样因子为1，下采样因子为downFactor
-  int upFactor = 1;
-  int downFactor = VDownFactor[ResIdx];
-
-  // 计算之前带宽，对应的输出信号的总长度
-  int beforeTotalLength = 0;
-  for (int i = 0; i < ResIdx; i++)
-  {
-    beforeTotalLength += VOutputLength[i];
-  }
-  // 当前带宽对应的输出信号的起始地址偏移
-  int offset = beforeTotalLength * numChannels;
-
-  // 获取当前检测结果对应的输出信号长度（这里假设的是每个带宽对应的输出信号长度可能不相同）
-  int outputLength = VOutputLength[ResIdx];
-
-  // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  auto &I_resampled = outputIdata + offset + chIdx * outputLength;
-  // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  auto &Q_resampled = outputQdata + offset + chIdx * outputLength;
-
-  // 重采样
-  dev_resample(upFactor, downFactor, I_shifted, signalLength, I_resampled);
-  dev_resample(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
-
-  // 释放动态分配的内存
-  delete[] I_shifted;
-  delete[] Q_shifted;
-}
-
 __global__ void ShiftingAndResamplingKernelFloat(
     const float *__restrict__ origIdata, const float *__restrict__ origQdata,
     const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
@@ -717,13 +642,74 @@ __global__ void ShiftingAndResamplingKernelDouble(
  * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
  * @return true or false
  */
-template <typename T>
-__global__ void ShiftingAndResamplingKernelV2(
-    const T *__restrict__ origIdata, const T *__restrict__ origQdata,
-    const int *__restrict__ VDownFactor, const T *__restrict__ VFrequency,
+__global__ void ShiftingAndResamplingKernelFloatV2(
+    const float *__restrict__ origIdata, const float *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
     const int numResults, const int numChannels, const int signalLength,
-    const T CurrentRealfreq, const int alignSignalLength,
-    T *__restrict__ outputIdata, T *__restrict__ outputQdata)
+    const float CurrentRealfreq, const int alignSignalLength,
+    float *__restrict__ outputIdata, float *__restrict__ outputQdata)
+{
+  // Each thread frequency-shifts one channel for one detection result and
+  // resamples it into an output slot of alignSignalLength samples.
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= numChannels * numResults)
+    return;
+
+  int ResIdx = idx / numChannels; // detection-result index
+  int chIdx = idx % numChannels;  // channel index
+
+  const float sampling_rate = float(245.76e6);
+  float frequency = VFrequency[ResIdx];
+  float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
+
+  // Input samples of the channel handled by this thread.
+  const float *I_orig = origIdata + chIdx * signalLength;
+  const float *Q_orig = origQdata + chIdx * signalLength;
+
+  // Per-thread scratch buffers that receive the frequency-shifted signal.
+  float *I_shifted = new float[signalLength];
+  float *Q_shifted = new float[signalLength];
+  if (I_shifted == nullptr || Q_shifted == nullptr)
+  {
+    // delete[] of nullptr is a no-op, so this frees whichever one succeeded.
+    delete[] I_shifted;
+    delete[] Q_shifted;
+    return;
+  }
+  // Frequency shift: multiply by the local-oscillator signal.
+  for (int i = 0; i < signalLength; i++)
+  {
+    float phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+    float cosVal = dev_cos(phase);
+    float sinVal = dev_sin(phase);
+    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
+  }
+
+  // The up-sampling factor is fixed at 1; only down-sampling varies.
+  int upFactor = 1;
+  int downFactor = VDownFactor[ResIdx];
+
+  // Destination slots, indexed [ResIdx][chIdx] with stride alignSignalLength.
+  int slot = (ResIdx * numChannels + chIdx) * alignSignalLength;
+  float *I_resampled = outputIdata + slot;
+  float *Q_resampled = outputQdata + slot;
+
+  // Resample both components.
+  dev_resample(upFactor, downFactor, I_shifted, signalLength, I_resampled);
+  dev_resample(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+
+  // Release the scratch buffers.
+  delete[] I_shifted;
+  delete[] Q_shifted;
+}
+
+__global__ void ShiftingAndResamplingKernelDoubleV2(
+    const double *__restrict__ origIdata, const double *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency,
+    const int numResults, const int numChannels, const int signalLength,
+    const double CurrentRealfreq, const int alignSignalLength,
+    double *__restrict__ outputIdata, double *__restrict__ outputQdata)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx >= numChannels * numResults)
@@ -733,31 +719,31 @@ __global__ void ShiftingAndResamplingKernelV2(
   int ResIdx = idx / numChannels; // 第几个检测结果
   int chIdx = idx % numChannels;  // 第几个通道
 
-  const T sampling_rate = T(245.76e6);
+  const double sampling_rate = double(245.76e6);
 
-  T frequency = VFrequency[ResIdx]; // 频率
-  T deltaFreq = (CurrentRealfreq - frequency) * 1e6;
+  double frequency = VFrequency[ResIdx]; // 频率
+  double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
   // 获取当前线程处理的通道数据地址
   auto &I_orig = origIdata + chIdx * signalLength;
   auto &Q_orig = origQdata + chIdx * signalLength;
 
   // 移频：生成本振信号并相乘
-  T *I_shifted = new T[signalLength];
+  double *I_shifted = new double[signalLength];
   if (I_shifted == nullptr)
   {
     return;
   }
-  T *Q_shifted = new T[signalLength];
+  double *Q_shifted = new double[signalLength];
   if (Q_shifted == nullptr)
   {
     return;
   }
   for (int i = 0; i < signalLength; i++)
   {
-    T phase = 2 * M_PI * deltaFreq * i / sampling_rate;
-    T cosVal = dev_cos(phase);
-    T sinVal = dev_sin(phase);
+    double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+    double cosVal = dev_cos(phase);
+    double sinVal = dev_sin(phase);
     I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
     Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
   }
@@ -788,7 +774,7 @@ inline int quotientCeil(int num1, int num2)
 }
 
 /**
- * ShiftAndResampleSignal
+ * ShiftAndResampleSignalFloat
  * 重采样函数：完成原始信号的移频，重采样等计算
  *
  * @param origIdata：原始Idata
@@ -802,14 +788,13 @@ inline int quotientCeil(int num1, int num2)
  * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
  * @return true or false
  */
-template <typename T>
-bool ShiftAndResampleSignal(
-    const std::vector<std::vector<T>> &origIdata,
-    const std::vector<std::vector<T>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValue<T>>> &
+bool ShiftAndResampleSignalFloat(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValueFloat>> &
         vecOneFrameDetectResult,
-    const int numChannels, const float CurrentRealfreq, T *outputIdata,
-    T *outputQdata)
+    const int numChannels, const float CurrentRealfreq, float *outputIdata,
+    float *outputQdata)
 {
   // 每个通道的信号长度：这里假设所有通道的长度是相同的
   int signalLength = origIdata[0].size();
@@ -825,15 +810,15 @@ bool ShiftAndResampleSignal(
   // 上采样率
   std::vector<int> downFactor;
   // 检测结果的频率
-  std::vector<T> detectFreq;
+  std::vector<float> detectFreq;
   // 检测结果的带宽
-  std::vector<T> detectBandwidth;
+  std::vector<float> detectBandwidth;
 
   // 根据检测结果，初始化相关变量或者vector
   // vecOneFrameDetectResult 可以不使用map，使用vector
   for (const auto &[freq, fbv] : vecOneFrameDetectResult.back())
   {
-    T bandwidth = fbv.bandwidth * 1e6;
+    float bandwidth = fbv.bandwidth * 1e6;
     int decimation = 0;
     if (std::abs(bandwidth - 40e6) < 2 * 1e6)
     {
@@ -870,11 +855,11 @@ bool ShiftAndResampleSignal(
   // copy下采样率，频率等数据到显存中
   int *d_downFactor = nullptr;
   int *d_outputLength = nullptr;
-  T *d_frequency = nullptr;
+  float *d_frequency = nullptr;
   // 申请显存
   CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(T))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(float))));
 
   // copy下采样率到显存中
   const int *src_downFactor = downFactor.data();
@@ -885,7 +870,7 @@ bool ShiftAndResampleSignal(
   // copy频率到显存中
   const T *src_frequency = detectFreq.data();
   CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(T), cudaMemcpyHostToDevice));
+                              numResults * sizeof(float), cudaMemcpyHostToDevice));
 
   // copy每个带宽，重采样后输出信号长度到显存中
   const int *src_outputLength = outputLength.data();
@@ -894,43 +879,43 @@ bool ShiftAndResampleSignal(
                               cudaMemcpyHostToDevice));
 
   // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
-  T *d_Idata = nullptr;
-  T *d_Qdata = nullptr;
+  float *d_Idata = nullptr;
+  float *d_Qdata = nullptr;
   CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(T))));
+      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(float))));
   CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(T))));
+      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(float))));
 
   // 将所有通道数据循环拷贝到GPU显存
-  size_t copySize = signalLength * sizeof(T);
+  size_t copySize = signalLength * sizeof(float);
   for (int i = 0; i < numChannels; i++)
   {
     // copy 原始的idata 到gpu显存
-    T *dst_idata = d_Idata + i * signalLength;
+    float *dst_idata = d_Idata + i * signalLength;
     const void *src_idata = origIdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
 
     // copy 原始的qdata 到gpu显存
-    T *dst_qdata = d_Qdata + i * signalLength;
+    float *dst_qdata = d_Qdata + i * signalLength;
     const void *src_qdata = origQdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
   }
 
   // 申请重采样后输出信号的GPU显存
-  T *d_outputIdata = nullptr;
-  T *d_outputQdata = nullptr;
+  float *d_outputIdata = nullptr;
+  float *d_outputQdata = nullptr;
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(T))));
+                              (numChannels * outputTotalLength * sizeof(float))));
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(T))));
+                              (numChannels * outputTotalLength * sizeof(float))));
 
   // 线程数配置
   dim3 block(numChannels);
   dim3 grid((numChannels * numResults + block.x - 1) / block.x);
 
-  ShiftingAndResamplingKernel<<<grid, block>>>(
+  ShiftingAndResamplingKernelFloat<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
       numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
 
@@ -940,11 +925,11 @@ bool ShiftAndResampleSignal(
   // 存储格式：[numResults][numChannels][lengthPerResults]
   // 且在内存中是连续存放的
   CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(T)),
+                              (numChannels * outputTotalLength * sizeof(float)),
                               cudaMemcpyHostToDevice));
 
   CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(T)),
+                              (numChannels * outputTotalLength * sizeof(float)),
                               cudaMemcpyHostToDevice));
 
   // 释放显存
@@ -994,7 +979,7 @@ bool ShiftAndResampleSignal(
 }
 
 /**
- * ShiftAndResampleSignalFloat
+ * ShiftAndResampleSignalDouble
  * 重采样函数：完成原始信号的移频，重采样等计算
  *
  * @param origIdata：原始Idata
@@ -1008,13 +993,13 @@ bool ShiftAndResampleSignal(
  * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
  * @return true or false
  */
-bool ShiftAndResampleSignalFloat(
-    const std::vector<std::vector<float>> &origIdata,
-    const std::vector<std::vector<float>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValueFloat>> &
+bool ShiftAndResampleSignalDouble(
+    const std::vector<std::vector<double>> &origIdata,
+    const std::vector<std::vector<double>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValueDouble>> &
         vecOneFrameDetectResult,
-    const int numChannels, const float CurrentRealfreq, float *outputIdata,
-    float *outputQdata)
+    const int numChannels, const double CurrentRealfreq, double *outputIdata,
+    double *outputQdata)
 {
   // 每个通道的信号长度：这里假设所有通道的长度是相同的
   int signalLength = origIdata[0].size();
@@ -1030,15 +1015,15 @@ bool ShiftAndResampleSignalFloat(
   // 上采样率
   std::vector<int> downFactor;
   // 检测结果的频率
-  std::vector<float> detectFreq;
+  std::vector<double> detectFreq;
   // 检测结果的带宽
-  std::vector<float> detectBandwidth;
+  std::vector<double> detectBandwidth;
 
   // 根据检测结果，初始化相关变量或者vector
   // vecOneFrameDetectResult 可以不使用map，使用vector
   for (const auto &[freq, fbv] : vecOneFrameDetectResult.back())
   {
-    float bandwidth = fbv.bandwidth * 1e6;
+    double bandwidth = fbv.bandwidth * 1e6;
     int decimation = 0;
     if (std::abs(bandwidth - 40e6) < 2 * 1e6)
     {
@@ -1075,11 +1060,11 @@ bool ShiftAndResampleSignalFloat(
   // copy下采样率，频率等数据到显存中
   int *d_downFactor = nullptr;
   int *d_outputLength = nullptr;
-  float *d_frequency = nullptr;
+  double *d_frequency = nullptr;
   // 申请显存
   CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(float))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(double))));
 
   // copy下采样率到显存中
   const int *src_downFactor = downFactor.data();
@@ -1090,7 +1075,7 @@ bool ShiftAndResampleSignalFloat(
   // copy频率到显存中
   const T *src_frequency = detectFreq.data();
   CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(float), cudaMemcpyHostToDevice));
+                              numResults * sizeof(double), cudaMemcpyHostToDevice));
 
   // copy每个带宽，重采样后输出信号长度到显存中
   const int *src_outputLength = outputLength.data();
@@ -1099,43 +1084,43 @@ bool ShiftAndResampleSignalFloat(
                               cudaMemcpyHostToDevice));
 
   // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
-  float *d_Idata = nullptr;
-  float *d_Qdata = nullptr;
+  double *d_Idata = nullptr;
+  double *d_Qdata = nullptr;
   CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(float))));
+      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(double))));
   CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(float))));
+      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(double))));
 
   // 将所有通道数据循环拷贝到GPU显存
-  size_t copySize = signalLength * sizeof(float);
+  size_t copySize = signalLength * sizeof(double);
   for (int i = 0; i < numChannels; i++)
   {
     // copy 原始的idata 到gpu显存
-    float *dst_idata = d_Idata + i * signalLength;
+    double *dst_idata = d_Idata + i * signalLength;
     const void *src_idata = origIdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
 
     // copy 原始的qdata 到gpu显存
-    float *dst_qdata = d_Qdata + i * signalLength;
+    double *dst_qdata = d_Qdata + i * signalLength;
     const void *src_qdata = origQdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
   }
 
   // 申请重采样后输出信号的GPU显存
-  float *d_outputIdata = nullptr;
-  float *d_outputQdata = nullptr;
+  double *d_outputIdata = nullptr;
+  double *d_outputQdata = nullptr;
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(float))));
+                              (numChannels * outputTotalLength * sizeof(double))));
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(float))));
+                              (numChannels * outputTotalLength * sizeof(double))));
 
   // 线程数配置
   dim3 block(numChannels);
   dim3 grid((numChannels * numResults + block.x - 1) / block.x);
 
-  ShiftingAndResamplingKernelFloat<<<grid, block>>>(
+  ShiftingAndResamplingKernelDouble<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
       numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
 
@@ -1145,11 +1130,11 @@ bool ShiftAndResampleSignalFloat(
   // 存储格式：[numResults][numChannels][lengthPerResults]
   // 且在内存中是连续存放的
   CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(float)),
+                              (numChannels * outputTotalLength * sizeof(double)),
                               cudaMemcpyHostToDevice));
 
   CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(float)),
+                              (numChannels * outputTotalLength * sizeof(double)),
                               cudaMemcpyHostToDevice));
 
   // 释放显存
@@ -1198,93 +1183,30 @@ bool ShiftAndResampleSignalFloat(
   return true;
 }
 
-/**
- * ShiftAndResampleSignalDouble
- * 重采样函数：完成原始信号的移频，重采样等计算
- *
- * @param origIdata：原始Idata
- * @param origQdata：原始Qdata
- * @param vecOneFrameDetectResult：检测结果数据
- * @param numChannels：信号通道数
- * @param CurrentRealfreq：当前实际频率
- * @param outputIdata：
- * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @param outputQdata：
- * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @return true or false
- */
-bool ShiftAndResampleSignalDouble(
-    const std::vector<std::vector<double>> &origIdata,
-    const std::vector<std::vector<double>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValueDouble>> &
-        vecOneFrameDetectResult,
-    const int numChannels, const double CurrentRealfreq, double *outputIdata,
-    double *outputQdata)
+bool ShiftAndResampleSignalFloat(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    std::vector<int> &outputLength,
+    std::vector<int> &downFactor,
+    std::vector<float> &detectFreq,
+    const int outputTotalLength, const int numResults,
+    const int numChannels, const float CurrentRealfreq, float *outputIdata,
+    float *outputQdata)
 {
   // 每个通道的信号长度：这里假设所有通道的长度是相同的
   int signalLength = origIdata[0].size();
 
-  // 检测结果数据，采用vector存储更好，没必要使用map
-  int numResults = vecOneFrameDetectResult.size();
-
-  int upFactor = 1;          // 上采样率，默认为1
-  int outputTotalLength = 0; // 保存总的输出信号长度
-
-  // 每个检测结果，根据下采样率计算的输出信号长度（这里假设输出信号长度不相同）
-  std::vector<int> outputLength;
-  // 上采样率
-  std::vector<int> downFactor;
-  // 检测结果的频率
-  std::vector<double> detectFreq;
-  // 检测结果的带宽
-  std::vector<double> detectBandwidth;
-
-  // 根据检测结果，初始化相关变量或者vector
-  // vecOneFrameDetectResult 可以不使用map，使用vector
-  for (const auto &[freq, fbv] : vecOneFrameDetectResult.back())
-  {
-    double bandwidth = fbv.bandwidth * 1e6;
-    int decimation = 0;
-    if (std::abs(bandwidth - 40e6) < 2 * 1e6)
-    {
-      decimation = 4;
-    }
-    else if (std::abs(bandwidth - 20e6) < 2 * 1e6)
-    {
-      decimation = 8;
-    }
-    else if (std::abs(bandwidth - 10e6) < 2 * 1e6)
-    {
-      decimation = 16;
-    }
-    else
-    {
-      // 带宽不符合要求，跳过处理
-      std::cout << __FUNCTION__ << ":带宽 " << bandwidth << "--- 不符合 ";
-      continue;
-    }
-
-    downFactor.push_back(decimation);
-    detectFreq.push_back(fbv.frequency);
-    detectBandwidth.push_back(bandwidth);
-
-    // 计算每个下采样率,重采样后的输出信号长度
-    int length = quotientCeil(signalLength * upFactor, decimation);
-    outputLength.push_back(length);
-
-    // 重采样后输出信号的总长度
-    outputTotalLength += length;
-  }
+  int upFactor = 1; // 上采样率，默认为1
 
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
   // copy下采样率，频率等数据到显存中
   int *d_downFactor = nullptr;
   int *d_outputLength = nullptr;
-  double *d_frequency = nullptr;
+  float *d_frequency = nullptr;
   // 申请显存
   CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(double))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(float))));
 
   // copy下采样率到显存中
   const int *src_downFactor = downFactor.data();
@@ -1295,7 +1217,7 @@ bool ShiftAndResampleSignalDouble(
   // copy频率到显存中
   const T *src_frequency = detectFreq.data();
   CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(double), cudaMemcpyHostToDevice));
+                              numResults * sizeof(float), cudaMemcpyHostToDevice));
 
   // copy每个带宽，重采样后输出信号长度到显存中
   const int *src_outputLength = outputLength.data();
@@ -1304,37 +1226,37 @@ bool ShiftAndResampleSignalDouble(
                               cudaMemcpyHostToDevice));
 
   // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
-  double *d_Idata = nullptr;
-  double *d_Qdata = nullptr;
+  float *d_Idata = nullptr;
+  float *d_Qdata = nullptr;
   CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(double))));
+      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(float))));
   CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(double))));
+      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(float))));
 
   // 将所有通道数据循环拷贝到GPU显存
-  size_t copySize = signalLength * sizeof(double);
+  size_t copySize = signalLength * sizeof(float);
   for (int i = 0; i < numChannels; i++)
   {
     // copy 原始的idata 到gpu显存
-    double *dst_idata = d_Idata + i * signalLength;
+    float *dst_idata = d_Idata + i * signalLength;
     const void *src_idata = origIdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
 
     // copy 原始的qdata 到gpu显存
-    double *dst_qdata = d_Qdata + i * signalLength;
+    float *dst_qdata = d_Qdata + i * signalLength;
     const void *src_qdata = origQdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
   }
 
   // 申请重采样后输出信号的GPU显存
-  double *d_outputIdata = nullptr;
-  double *d_outputQdata = nullptr;
+  float *d_outputIdata = nullptr;
+  float *d_outputQdata = nullptr;
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(double))));
+                              (numChannels * outputTotalLength * sizeof(float))));
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(double))));
+                              (numChannels * outputTotalLength * sizeof(float))));
 
   // 线程数配置
   dim3 block(numChannels);
@@ -1350,11 +1272,11 @@ bool ShiftAndResampleSignalDouble(
   // 存储格式：[numResults][numChannels][lengthPerResults]
   // 且在内存中是连续存放的
   CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(double)),
+                              (numChannels * outputTotalLength * sizeof(float)),
                               cudaMemcpyHostToDevice));
 
   CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(double)),
+                              (numChannels * outputTotalLength * sizeof(float)),
                               cudaMemcpyHostToDevice));
 
   // 释放显存
@@ -1579,15 +1501,14 @@ bool ShiftAndResampleSignalDouble(
  * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][alignSignalLength]
  * @return true or false
  */
-template <typename T>
-bool ShiftAndResampleSignalV2(
-    const std::vector<std::vector<T>> &origIdata,
-    const std::vector<std::vector<T>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValue<T>>> &
+bool ShiftAndResampleSignalFloatV2(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValue<float>>> &
         vecOneFrameDetectResult,
     const int alignSignalLength,
-    const int numChannels, const float CurrentRealfreq, T *outputIdata,
-    T *outputQdata)
+    const int numChannels, const float CurrentRealfreq, float *outputIdata,
+    float *outputQdata)
 {
   // 每个通道的信号长度：这里假设所有通道的长度是相同的
   int signalLength = origIdata[0].size();
@@ -1601,15 +1522,15 @@ bool ShiftAndResampleSignalV2(
   // 上采样率
   std::vector<int> downFactor;
   // 检测结果的频率
-  std::vector<T> detectFreq;
+  std::vector<float> detectFreq;
   // 检测结果的带宽
-  std::vector<T> detectBandwidth;
+  std::vector<float> detectBandwidth;
 
   // 根据检测结果，初始化相关变量或者vector
   // vecOneFrameDetectResult 可以不使用map，使用vector
   for (const auto &[freq, fbv] : vecOneFrameDetectResult.back())
   {
-    T bandwidth = fbv.bandwidth * 1e6;
+    float bandwidth = fbv.bandwidth * 1e6;
     int decimation = 0;
     if (std::abs(bandwidth - 40e6) < 2 * 1e6)
     {
@@ -1638,10 +1559,10 @@ bool ShiftAndResampleSignalV2(
   // ====准备调用重采样核函数：ShiftingAndResamplingKernelV2=====
   // copy下采样率，频率等数据到显存中
   int *d_downFactor = nullptr;
-  T *d_frequency = nullptr;
+  float *d_frequency = nullptr;
   // 申请显存
   CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(T))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(float))));
 
   // copy下采样率到显存中
   const int *src_downFactor = downFactor.data();
@@ -1650,39 +1571,39 @@ bool ShiftAndResampleSignalV2(
                               cudaMemcpyHostToDevice));
 
   // copy频率到显存中
-  const T *src_frequency = detectFreq.data();
+  const float *src_frequency = detectFreq.data();
   CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(T), cudaMemcpyHostToDevice));
+                              numResults * sizeof(float), cudaMemcpyHostToDevice));
 
   // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
-  T *d_Idata = nullptr;
-  T *d_Qdata = nullptr;
+  float *d_Idata = nullptr;
+  float *d_Qdata = nullptr;
   CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(T))));
+      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(float))));
   CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(T))));
+      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(float))));
 
   // 将所有通道数据循环拷贝到GPU显存
-  size_t copySize = signalLength * sizeof(T);
+  size_t copySize = signalLength * sizeof(float);
   for (int i = 0; i < numChannels; i++)
   {
     // copy 原始的idata 到gpu显存
-    T *dst_idata = d_Idata + i * signalLength;
+    float *dst_idata = d_Idata + i * signalLength;
     const void *src_idata = origIdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
 
     // copy 原始的qdata 到gpu显存
-    T *dst_qdata = d_Qdata + i * signalLength;
+    float *dst_qdata = d_Qdata + i * signalLength;
     const void *src_qdata = origQdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
   }
 
   // 申请重采样后输出信号的GPU显存
-  size_t totalsize = numResults * numChannels * alignSignalLength * sizeof(T);
-  T *d_outputIdata = nullptr;
-  T *d_outputQdata = nullptr;
+  size_t totalsize = numResults * numChannels * alignSignalLength * sizeof(float);
+  float *d_outputIdata = nullptr;
+  float *d_outputQdata = nullptr;
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, totalsize));
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, totalsize));
 
@@ -1704,11 +1625,188 @@ bool ShiftAndResampleSignalV2(
   // 存储格式：[numResults][numChannels][lengthPerResults]
   // 且在内存中是连续存放的
   CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(T)),
+                              (numChannels * outputTotalLength * sizeof(float)),
                               cudaMemcpyDeviceToHost));
 
   CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(T)),
+                              (numChannels * outputTotalLength * sizeof(float)),
+                              cudaMemcpyDeviceToHost));
+
+  // 释放显存
+  if (d_downFactor)
+  {
+    cudaFree(d_downFactor);
+    d_downFactor = nullptr;
+  }
+
+  if (d_outputLength)
+  {
+    cudaFree(d_outputLength);
+    d_outputLength = nullptr;
+  }
+
+  if (d_frequency)
+  {
+    cudaFree(d_frequency);
+    d_frequency = nullptr;
+  }
+
+  if (d_Idata)
+  {
+    cudaFree(d_Idata);
+    d_Idata = nullptr;
+  }
+
+  if (d_Qdata)
+  {
+    cudaFree(d_Qdata);
+    d_Qdata = nullptr;
+  }
+
+  if (d_outputIdata)
+  {
+    cudaFree(d_outputIdata);
+    d_outputIdata = nullptr;
+  }
+
+  if (d_outputQdata) // release the Q output buffer; d_outputIdata is already freed just above
+  {
+    cudaFree(d_outputQdata);
+    d_outputQdata = nullptr;
+  }
+
+  return true;
+}
+
+bool ShiftAndResampleSignalDoubleV2(
+    const std::vector<std::vector<double>> &origIdata,
+    const std::vector<std::vector<double>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValue<double>>> &
+        vecOneFrameDetectResult,
+    const int alignSignalLength,
+    const int numChannels, const double CurrentRealfreq, double *outputIdata,
+    double *outputQdata)
+{
+  // 每个通道的信号长度：这里假设所有通道的长度是相同的
+  int signalLength = origIdata[0].size();
+
+  // 检测结果数据，采用vector存储更好，没必要使用map
+  int numResults = vecOneFrameDetectResult.size();
+
+  int upFactor = 1;          // 上采样率，默认为1
+  int outputTotalLength = 0; // 保存总的输出信号长度
+
+  // 上采样率
+  std::vector<int> downFactor;
+  // 检测结果的频率
+  std::vector<double> detectFreq;
+  // 检测结果的带宽
+  std::vector<double> detectBandwidth;
+
+  // 根据检测结果，初始化相关变量或者vector
+  // vecOneFrameDetectResult 可以不使用map，使用vector
+  for (const auto &[freq, fbv] : vecOneFrameDetectResult.back())
+  {
+    double bandwidth = fbv.bandwidth * 1e6;
+    int decimation = 0;
+    if (std::abs(bandwidth - 40e6) < 2 * 1e6)
+    {
+      decimation = 4;
+    }
+    else if (std::abs(bandwidth - 20e6) < 2 * 1e6)
+    {
+      decimation = 8;
+    }
+    else if (std::abs(bandwidth - 10e6) < 2 * 1e6)
+    {
+      decimation = 16;
+    }
+    else
+    {
+      // 带宽不符合要求，跳过处理
+      std::cout << __FUNCTION__ << ":带宽 " << bandwidth << "--- 不符合 ";
+      continue;
+    }
+
+    downFactor.push_back(decimation);
+    detectFreq.push_back(fbv.frequency);
+    detectBandwidth.push_back(bandwidth);
+  }
+
+  // ====准备调用重采样核函数：ShiftingAndResamplingKernelV2=====
+  // copy下采样率，频率等数据到显存中
+  int *d_downFactor = nullptr;
+  double *d_frequency = nullptr;
+  // 申请显存
+  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(double))));
+
+  // copy下采样率到显存中
+  const int *src_downFactor = downFactor.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // copy频率到显存中
+  const double *src_frequency = detectFreq.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
+                              numResults * sizeof(double), cudaMemcpyHostToDevice));
+
+  // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
+  double *d_Idata = nullptr;
+  double *d_Qdata = nullptr;
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(double))));
+
+  // 将所有通道数据循环拷贝到GPU显存
+  size_t copySize = signalLength * sizeof(double);
+  for (int i = 0; i < numChannels; i++)
+  {
+    // copy 原始的idata 到gpu显存
+    double *dst_idata = d_Idata + i * signalLength;
+    const void *src_idata = origIdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
+
+    // copy 原始的qdata 到gpu显存
+    double *dst_qdata = d_Qdata + i * signalLength;
+    const void *src_qdata = origQdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
+  }
+
+  // 申请重采样后输出信号的GPU显存
+  size_t totalsize = numResults * numChannels * alignSignalLength * sizeof(double);
+  double *d_outputIdata = nullptr;
+  double *d_outputQdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, totalsize));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, totalsize));
+
+  // 初始化为0
+  CHECK_CUDA_ERROR(cudaMemset(d_outputIdata, 0, totalsize));
+  CHECK_CUDA_ERROR(cudaMemset(d_outputQdata, 0, totalsize));
+
+  // 线程数配置，总的线程数：numChannels * numResults
+  dim3 block(numChannels);
+  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
+  ShiftingAndResamplingKernelV2<<<grid, block>>>(
+      d_Idata, d_Qdata, d_downFactor, d_frequency, numResults,
+      numChannels, signalLength, CurrentRealfreq, alignSignalLength,
+      d_outputIdata, d_outputQdata);
+
+  // copy重采样计算结果到主存
+  // outputIdata 确保申请的内存空间够
+  // outputQdata 确保申请的内存空间够
+  // 存储格式：[numResults][numChannels][lengthPerResults]
+  // 且在内存中是连续存放的
+  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
+                              totalsize, // full device buffer; outputTotalLength is still 0 here
+                              cudaMemcpyDeviceToHost));
+
+  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
+                              totalsize, // same byte count that was allocated for d_outputQdata
                               cudaMemcpyDeviceToHost));
 
   // 释放显存
diff --git a/cuda_resample.h b/cuda_resample.h
index 04eff2a..ee9b2af 100644
--- a/cuda_resample.h
+++ b/cuda_resample.h
@@ -46,31 +46,7 @@ struct FreqBandValueDouble
 };
 
 /**
- * ShiftAndResampleSignal
- * 重采样函数：完成原始信号的移频，重采样等计算
- *
- * @param origIdata：原始Idata
- * @param origQdata：原始Qdata
- * @param vecOneFrameDetectResult：检测结果数据
- * @param numChannels：信号通道数
- * @param CurrentRealfreq：当前实际频率
- * @param outputIdata：
- * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @param outputQdata：
- * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @return true or false
- */
-template <typename T>
-bool ShiftAndResampleSignal(
-    const std::vector<std::vector<T>> &origIdata,
-    const std::vector<std::vector<T>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValue<T>>> &
-        vecOneFrameDetectResult,
-    const int numChannels, const float CurrentRealfreq, T *outputIdata,
-    T *outputQdata);
-
-/**
- * ShiftAndResampleSignalFloat
+ * ShiftAndResampleSignalFloat 和 ShiftAndResampleSignalDouble
  * 重采样函数：完成原始信号的移频，重采样等计算
  *
  * @param origIdata：原始Idata
@@ -91,22 +67,6 @@ bool ShiftAndResampleSignalFloat(
         vecOneFrameDetectResult,
     const int numChannels, const float CurrentRealfreq, float *outputIdata,
     float *outputQdata);
-
-/**
- * ShiftAndResampleSignalDouble
- * 重采样函数：完成原始信号的移频，重采样等计算
- *
- * @param origIdata：原始Idata
- * @param origQdata：原始Qdata
- * @param vecOneFrameDetectResult：检测结果数据
- * @param numChannels：信号通道数
- * @param CurrentRealfreq：当前实际频率
- * @param outputIdata：
- * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @param outputQdata：
- * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @return true or false
- */
 bool ShiftAndResampleSignalDouble(
     const std::vector<std::vector<double>> &origIdata,
     const std::vector<std::vector<double>> &origQdata,
@@ -116,7 +76,7 @@ bool ShiftAndResampleSignalDouble(
     double *outputQdata);
 
 /**
- * ShiftAndResampleSignalDouble
+ * ShiftAndResampleSignalDouble 和  ShiftAndResampleSignalFloat
  * 重采样函数：完成原始信号的移频，重采样等计算
  *
  * @param origIdata：原始Idata
@@ -133,6 +93,15 @@ bool ShiftAndResampleSignalDouble(
  * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
  * @return true or false
  */
+bool ShiftAndResampleSignalFloat(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    std::vector<int> &outputLength,
+    std::vector<int> &downFactor,
+    std::vector<float> &detectFreq,
+    const int outputTotalLength, const int numResults,
+    const int numChannels, const float CurrentRealfreq, float *outputIdata,
+    float *outputQdata);
 bool ShiftAndResampleSignalDouble(
     const std::vector<std::vector<double>> &origIdata,
     const std::vector<std::vector<double>> &origQdata,
@@ -146,6 +115,7 @@ bool ShiftAndResampleSignalDouble(
 /**
  * ShiftAndResampleSignalV2
  * 重采样函数：完成原始信号的移频，重采样等计算
+ * 输出信号长度对齐到指定长度
  *
  * @param origIdata：原始Idata
  * @param origQdata：原始Qdata
@@ -159,14 +129,21 @@ bool ShiftAndResampleSignalDouble(
  * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][alignSignalLength]
  * @return true or false
  */
-template <typename T>
-bool ShiftAndResampleSignalV2(
-    const std::vector<std::vector<T>> &origIdata,
-    const std::vector<std::vector<T>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValue<T>>> &
+bool ShiftAndResampleSignalFloatV2(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValue<float>>> &
+        vecOneFrameDetectResult,
+    const int alignSignalLength,
+    const int numChannels, const float CurrentRealfreq, float *outputIdata,
+    float *outputQdata);
+bool ShiftAndResampleSignalDoubleV2(
+    const std::vector<std::vector<double>> &origIdata,
+    const std::vector<std::vector<double>> &origQdata,
+    const std::vector<std::map<int64_t, FreqBandValue<double>>> &
         vecOneFrameDetectResult,
     const int alignSignalLength,
-    const int numChannels, const float CurrentRealfreq, T *outputIdata,
-    T *outputQdata);
+    const int numChannels, const double CurrentRealfreq, double *outputIdata,
+    double *outputQdata);
 
 #endif // CUDA_RESAMPLE_H
-- 
Gitee


From 51c616302ee76f1d6fc1638276a15c6497309b8a Mon Sep 17 00:00:00 2001
From: amor <15820865+Amor_23456@user.noreply.gitee.com>
Date: Wed, 17 Dec 2025 15:24:40 +0800
Subject: [PATCH 13/27] =?UTF-8?q?=E6=89=8B=E5=8A=A8=E5=AE=9E=E7=8E=B0?=
 =?UTF-8?q?=E6=89=80=E6=9C=89=E6=A8=A1=E6=9D=BF=E5=87=BD=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: amor <15820865+Amor_23456@user.noreply.gitee.com>
---
 cuda_resample_double.cu                    |  889 +++++++
 cuda_resample.h => cuda_resample_double.h  |  110 +-
 cuda_resample.cu => cuda_resample_float.cu | 2747 +++++++-------------
 cuda_resample_float.h                      |   79 +
 4 files changed, 1879 insertions(+), 1946 deletions(-)
 create mode 100644 cuda_resample_double.cu
 rename cuda_resample.h => cuda_resample_double.h (39%)
 rename cuda_resample.cu => cuda_resample_float.cu (37%)
 create mode 100644 cuda_resample_float.h

diff --git a/cuda_resample_double.cu b/cuda_resample_double.cu
new file mode 100644
index 0000000..67ad85c
--- /dev/null
+++ b/cuda_resample_double.cu
@@ -0,0 +1,889 @@
+#include "cuda_resample.h"
+#include "upfirdn_device.h"
+
+// CHECK_CUDA_ERROR: runs a CUDA API call; on failure logs the error to std::cerr and throws std::runtime_error
+#define CHECK_CUDA_ERROR(call)                                               \
+  do                                                                         \
+  {                                                                          \
+    cudaError_t err = call;                                                  \
+    if (err != cudaSuccess)                                                  \
+    {                                                                        \
+      std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":"      \
+                << __LINE__ << ": " << cudaGetErrorString(err) << std::endl; \
+      throw std::runtime_error("CUDA error");                                \
+    }                                                                        \
+  } while (0)
+
+#define LOG_INFO(fmt, ...)                                                \
+  fprintf(stderr, "[INFO] %s:%d (%s) " fmt, __FILE__, __LINE__, __func__, \
+          ##__VA_ARGS__)
+
+#define LOG_ERROR(fmt, ...)                                                \
+  fprintf(stderr, "[ERROR] %s:%d (%s) " fmt, __FILE__, __LINE__, __func__, \
+          ##__VA_ARGS__)
+
+inline int quotientCeil(int num1, int num2)
+{
+  // Host-side ceiling division: bump the truncated quotient by one when a remainder is left.
+  const int wholePart = num1 / num2;
+  return (num1 % num2 == 0) ? wholePart : wholePart + 1;
+}
+
+// Device-side ceiling division; uses / and % because the host-only C library div() is not callable here
+__device__ __forceinline__ int dev_quotientCeil(int num1, int num2)
+{
+  // Truncated quotient plus one whenever the division leaves a remainder.
+  return num1 / num2 + (num1 % num2 != 0);
+}
+
+// Device-side greatest common divisor: swap in (b, a % b) until the remainder is zero
+__device__ __forceinline__ int dev_gcd(int a, int b)
+{
+  for (; b != 0;)
+  {
+    const int remainder = a % b;
+    a = b;
+    b = remainder;
+  }
+  return a;
+}
+
+// Write the increasing sequence start, start + 1, start + 2, ... into data[0 .. size - 1]
+__device__ __forceinline__ void dev_iota_double(double *data, int size, double start)
+{
+  // Nothing is written when size <= 0
+  for (int idx = 0; idx < size; ++idx)
+  {
+    data[idx] = start + static_cast<double>(idx);
+  }
+}
+
+// Set each of the first `size` elements of data to value (no write when size <= 0)
+__device__ __forceinline__ void dev_fill_double(double *data, int size, double value)
+{
+  int pos = 0;
+  while (pos < size)
+  {
+    data[pos++] = value;
+  }
+}
+
+/**
+ * dev_firls_double
+ * Device-side FIR design helper (double precision). Builds a symmetric set of length + 1
+ * coefficients from band edges and the desired amplitude at each edge.
+ *
+ * @param result: output buffer; length + 1 coefficients are written to it
+ * @param length: filter order, i.e. number of coefficients minus one
+ * @param freq: band edges, two per band; NOTE: every entry is halved in place
+ * @param amplitude: desired amplitude at each entry of freq
+ * @param freqSize: number of entries in freq / amplitude (assumed even - TODO confirm)
+ * @return 0 on success, -1 on invalid sizes or if a scratch buffer cannot be allocated
+ */
+__device__ int dev_firls_double(double *result, int length, double *freq, const double *amplitude,
+                                int freqSize)
+{
+  // At least one complete band is needed (weight[0] is read below) and the order must be >= 0
+  if (freqSize < 2 || length < 0)
+    return -1;
+
+  // One weight per band (pair of band edges)
+  const int weightSize = freqSize / 2;
+
+  const int filterLength = length + 1;
+  length = (filterLength - 1) / 2;
+
+  // An odd filter length has a centre tap, which is accumulated separately in b0
+  const bool Nodd = filterLength & 1;
+
+  // k starts with length + 1 entries; b and a need length + 1 entries for either parity
+  int kLength = length + 1;
+  const int bLength = length + 1;
+  const int aLength = bLength;
+
+  // Allocate all scratch buffers up front so that a failure releases every one of them
+  // (returning as soon as one allocation fails would leak the buffers obtained before it)
+  double *weight = new double[weightSize];
+  double *k = new double[kLength];
+  double *b = new double[bLength];
+  double *a = new double[aLength];
+  if (weight == nullptr || k == nullptr || b == nullptr || a == nullptr)
+  {
+    delete[] weight;
+    delete[] k;
+    delete[] b;
+    delete[] a;
+    return -1;
+  }
+
+  // Every band gets the same weight of 1
+  dev_fill_double(weight, weightSize, double(1.0));
+
+  // Halve the band edges (done in place, so the caller's array changes too)
+  for (int i = 0; i < freqSize; i++)
+  {
+    freq[i] = freq[i] / double(2.0);
+  }
+
+  // k = 0, 1, 2, ...
+  dev_iota_double(k, kLength, double(0.0));
+
+  if (!Nodd)
+  {
+    // Even filter length: shift every k by one half
+    for (int i = 0; i < kLength; i++)
+    {
+      k[i] += double(0.5);
+    }
+  }
+  else
+  {
+    // Odd filter length: drop k[0] (equivalent of k.erase(k.begin())).
+    // The loop stops one element early so that k[i + 1] never reads past the buffer.
+    for (int i = 0; i + 1 < kLength; i++)
+    {
+      k[i] = k[i + 1];
+    }
+    kLength--;
+  }
+
+  dev_fill_double(b, bLength, double(0.0));
+
+  double b0 = double(0.0);
+  for (int i = 0; i < freqSize; i += 2)
+  {
+    double Fi = freq[i];
+    double Fip1 = freq[i + 1];
+    double ampi = amplitude[i];
+    double ampip1 = amplitude[i + 1];
+    double wt2 = pow(weight[i / 2], double(2.0));
+    // Slope and intercept of the straight line through the two band edges
+    double m_s = (ampip1 - ampi) / (Fip1 - Fi);
+    double b1 = ampi - (m_s * Fi);
+
+    if (Nodd)
+    {
+      b0 += (b1 * (Fip1 - Fi)) +
+            m_s / double(2.0) * (pow(Fip1, double(2.0)) - pow(Fi, double(2.0))) * wt2;
+    }
+
+    // Accumulate this band's contribution to every b[j].
+    // cos()/sin() are used instead of cosf()/sinf() so double arguments are not cut to float.
+    // NOTE(review): the usual least-squares FIR formula uses cos(2*pi*k*F) and sinc(2*k*F)
+    // in these two terms; the expressions are kept as they were - confirm against the reference.
+    for (int j = 0; j < kLength; j++)
+    {
+      double kj = k[j];
+      b[j] += (m_s / (double(4.0) * pow(M_PI, double(2.0))) *
+               (cos(double(2.0) * M_PI * Fip1) - cos(double(2.0) * M_PI * Fi)) /
+               (pow(kj, double(2.0)))) *
+              wt2;
+
+      b[j] += (Fip1 * (m_s * Fip1 + b1) * sin(double(2.0) * kj * Fip1) -
+               Fi * (m_s * Fi + b1) * sin(double(2.0) * kj * Fi)) *
+              wt2;
+    }
+  }
+
+  // For an odd filter length, shift b right by one and insert b0 at the front
+  if (Nodd)
+  {
+    for (int i = kLength; i > 0; i--)
+    {
+      b[i] = b[i - 1];
+    }
+    b[0] = b0;
+  }
+
+  // a = 4 * w0^2 * b; the first half of result is a in reverse order
+  double w0 = weight[0];
+  for (int i = 0; i < aLength; i++)
+  {
+    a[i] = pow(w0, double(2.0)) * double(4.0) * b[i];
+    result[aLength - 1 - i] = a[i];
+  }
+
+  // For an odd filter length the centre tap a[0] must not be repeated in the second half
+  int it = 0;
+  if (Nodd)
+  {
+    it = 1;
+  }
+
+  // Halve the first half and append a (from index it) as the mirrored second half
+  for (int i = 0; i < aLength; i++)
+  {
+    result[i] = result[i] * double(0.5);
+    if ((i + it) < aLength)
+    {
+      result[aLength + i] = a[i + it] * double(0.5);
+    }
+  }
+
+  // Release the scratch buffers
+  delete[] weight;
+  delete[] k;
+  delete[] b;
+  delete[] a;
+  return 0;
+}
+
+// Device-side modified Bessel function of the first kind I_n(x), integer order n >= 0 assumed
+__device__ double dev_cyl_bessel_i_double(int n, double x)
+{
+  // Ascending series I_n(x) = sum_k (x/2)^(2k+n) / (k! (k+n)!); each term is built from the last.
+  const double halfSq = (x / double(2)) * (x / double(2));
+  double term = double(1);
+  for (int i = 1; i <= n; ++i)
+    term *= (x / double(2)) / i; // leading term (x/2)^n / n!
+  double sum = term;
+  for (int k = 1; k <= 1000 && fabs(term) > fabs(sum) * 1e-17; ++k) // capped at 1000 terms
+    sum += (term *= halfSq / (double(k) * (k + n)));
+  return sum;
+}
+
+// Fills window[0..order-1] with a Kaiser window of shape parameter bta:
+// w[n] = I0(bta * sqrt(1 - ((n - c) / c)^2)) / I0(bta), with c = (order - 1) / 2.
+__device__ void dev_kaiser_double(double *window, int order, double bta)
+{
+  const double center = (order - double(1)) / double(2);
+  const double i0Beta = dev_cyl_bessel_i_double(0, bta);
+  for (int n = 0; n < order; n++)
+  {
+    // order == 1 makes center 0; use ratio 0 there rather than dividing 0 by 0.
+    const double ratio = (center > double(0)) ? (n - center) / center : double(0);
+    const double radicand = fmax(double(1) - ratio * ratio, double(0)); // rounding guard
+    window[n] = dev_cyl_bessel_i_double(0, bta * sqrt(radicand)) / i0Beta;
+  }
+}
+
+// Rational-rate resampler for one real signal, run entirely by the calling device thread.
+//
+// The factors are reduced by their GCD to up / down. A low-pass FIR with
+// 2 * 10 * max(up, down) + 1 taps is designed by dev_firls_double, shaped with a Kaiser
+// window (bta = 5) and scaled by up; upfirdn_device applies it, and
+// dev_quotientCeil(inputSize * up, down) samples, taken after the filter delay, are written
+// to outputSignal. The caller must provide at least that many output elements.
+//
+// Nothing is written when a factor is not positive, when a device-heap allocation fails or
+// when the filter design fails. Equal factors copy the input through unchanged.
+//
+// Changes from the previous revision: the const factor parameters are no longer assigned
+// to, the equal-rate path copies samples instead of rebinding the local output pointer, the
+// failure message goes through device printf (device code has no stderr), and every exit
+// path releases its scratch buffers.
+__device__ void dev_resample_double(const int upFactor, const int downFactor,
+                                    const double *inputSignal, const int inputSize,
+                                    double *outputSignal)
+{
+  const int n = 10;               // filter half-length per unit of maxFactor
+  const double bta = double(5.0); // Kaiser window shape parameter
+
+  if (upFactor <= 0 || downFactor <= 0)
+  {
+    return;
+  }
+
+  // Work on reduced copies; the parameters themselves are const.
+  const int gcd_o = dev_gcd(upFactor, downFactor);
+  const int up = upFactor / gcd_o;
+  const int down = downFactor / gcd_o;
+
+  if (up == down)
+  {
+    // Same rate: copy the samples; assigning to outputSignal would only change the local pointer.
+    for (int i = 0; i < inputSize; i++)
+    {
+      outputSignal[i] = inputSignal[i];
+    }
+    return;
+  }
+
+  const int outputSize = dev_quotientCeil(inputSize * up, down);
+
+  const int maxFactor = (up > down) ? up : down;
+  const double firlsFreq = double(1.0) / double(2.0) / static_cast<double>(maxFactor);
+
+  // Band edges for the filter design and the desired amplitude at each edge.
+  double firlsFreqsV[4];
+  firlsFreqsV[0] = double(0.0);
+  firlsFreqsV[1] = double(2.0) * firlsFreq;
+  firlsFreqsV[2] = double(2.0) * firlsFreq;
+  firlsFreqsV[3] = double(1.0);
+
+  double firlsAmplitudeV[4];
+  firlsAmplitudeV[0] = double(1.0);
+  firlsAmplitudeV[1] = double(1.0);
+  firlsAmplitudeV[2] = double(0.0);
+  firlsAmplitudeV[3] = double(0.0);
+
+  const int freqSize = 4;
+  const int length = 2 * n * maxFactor + 1;
+
+  // Leading zeros: pad so the filter delay is a whole number of output samples.
+  int lengthHalf = (length - 1) / 2;
+  const int leadZeros = down - lengthHalf % down;
+  lengthHalf += leadZeros;
+  const int delay = lengthHalf / down;
+
+  // Trailing zeros: extend the filter until the delayed output covers outputSize samples.
+  // The count uses the real filter length, so the filter buffer below is sized exactly.
+  int tailZeros = 0;
+  while (dev_quotientCeil((inputSize - 1) * up + leadZeros + length + tailZeros, down) -
+             delay <
+         outputSize)
+  {
+    tailZeros++;
+  }
+  const int filterLength = leadZeros + length + tailZeros;
+
+  // Result size: the tap count is rounded up to a multiple of up (coefsPerPhase per phase).
+  int paddedCoefCount = filterLength;
+  while (paddedCoefCount % up)
+  {
+    paddedCoefCount++;
+  }
+  const int coefsPerPhase = paddedCoefCount / up;
+  const int padding = coefsPerPhase - 1;
+  const int outputCount = ((inputSize + padding) * up + down - 1) / down;
+
+  // Scratch buffers from the device heap, checked against nullptr like before.
+  double *coefficients = new double[length];
+  double *window = new double[length];
+  double *filter = new double[filterLength];
+  double *results = new double[outputCount];
+  bool ok = coefficients != nullptr && window != nullptr && filter != nullptr &&
+            results != nullptr;
+
+  // Design the prototype low-pass filter; report and give up if the design fails.
+  if (ok && dev_firls_double(coefficients, length - 1, firlsFreqsV, firlsAmplitudeV,
+                             freqSize) == -1)
+  {
+    printf("[ERROR] %s:%d (%s) dev_firls调用失败\n", __FILE__, __LINE__,
+           "dev_resample_double");
+    ok = false;
+  }
+
+  if (ok)
+  {
+    dev_kaiser_double(window, length, bta);
+
+    // filter = [leadZeros zeros | up * window[i] * coefficients[i] | tailZeros zeros]
+    for (int i = 0; i < leadZeros; i++)
+    {
+      filter[i] = double(0.0);
+    }
+    for (int i = 0; i < length; i++)
+    {
+      filter[leadZeros + i] = coefficients[i] * (up * window[i]);
+    }
+    for (int i = 0; i < tailZeros; i++)
+    {
+      filter[leadZeros + length + i] = double(0.0);
+    }
+
+    int resultsCount = 0;
+    upfirdn_device(up, down, inputSignal, inputSize, filter, filterLength, results,
+                   &resultsCount);
+
+    // Skip the first `delay` results and keep outputSize samples, never reading past
+    // the end of the results buffer.
+    const int available = outputCount - delay;
+    const int copyCount = (outputSize < available) ? outputSize : available;
+    for (int i = 0; i < copyCount; i++)
+    {
+      outputSignal[i] = results[i + delay];
+    }
+  }
+
+  // Single exit point: delete[] ignores null pointers, so partial allocations are fine.
+  delete[] coefficients;
+  delete[] window;
+  delete[] filter;
+  delete[] results;
+}
+
+/**
+ * ShiftingAndResamplingKernelDoubleV1
+ * Resampling kernel: frequency-shifts the original I/Q signal and resamples it.
+ * Each GPU thread handles one channel of one detection result, so
+ * numChannels * numResults threads are needed in total (1-D grid of 1-D blocks).
+ * Per-thread scratch buffers are allocated with device-side new[].
+ *
+ * @param origIdata original I data, numChannels rows of signalLength samples
+ * @param origQdata original Q data, same layout as origIdata
+ * @param VDownFactor down-sampling factor of each detection result
+ * @param VFrequency frequency of each detection result
+ * @param VOutputLength resampled output length of each detection result (int, as uploaded by the host wrapper)
+ * @param numResults number of detection results in the frame
+ * @param numChannels number of signal channels
+ * @param signalLength number of samples per channel
+ * @param CurrentRealfreq current actual frequency
+ * @param outputIdata all resampled I data; the caller must provide enough device memory
+ * @param outputQdata all resampled Q data; the caller must provide enough device memory
+ */
+__global__ void ShiftingAndResamplingKernelDoubleV1(
+    const double *__restrict__ origIdata, const double *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency,
+    const int *__restrict__ VOutputLength, const int numResults,
+    const int numChannels, const int signalLength, const double CurrentRealfreq,
+    double *__restrict__ outputIdata, double *__restrict__ outputQdata)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= numChannels * numResults)
+    return;
+
+  // One thread per (detection result, channel) pair.
+  int ResIdx = idx / numChannels; // detection result index
+  int chIdx = idx % numChannels;  // channel index
+
+  const double sampling_rate = double(245.76e6);
+
+  double frequency = VFrequency[ResIdx];
+  // Frequency offset to shift by; the 1e6 factor suggests MHz inputs (TODO confirm).
+  double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
+
+  // Input rows of the channel handled by this thread (plain pointers, not references).
+  const double *I_orig = origIdata + chIdx * signalLength;
+  const double *Q_orig = origQdata + chIdx * signalLength;
+
+  // Frequency shift: multiply the I/Q samples by the local oscillator signal.
+  double *I_shifted = new double[signalLength];
+  double *Q_shifted = new double[signalLength];
+  if (I_shifted == nullptr || Q_shifted == nullptr)
+  {
+    // Free whichever buffer was obtained; delete[] ignores a null pointer.
+    delete[] I_shifted;
+    delete[] Q_shifted;
+    return;
+  }
+  for (int i = 0; i < signalLength; i++)
+  {
+    double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+    // Double-precision cos/sin: the float versions lose accuracy as the phase grows.
+    double cosVal = cos(phase);
+    double sinVal = sin(phase);
+    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
+  }
+
+  // The up-sampling factor is 1; the down-sampling factor belongs to the detection result.
+  int upFactor = 1;
+  int downFactor = VDownFactor[ResIdx];
+
+  // Total per-channel output length of all earlier detection results.
+  int beforeTotalLength = 0;
+  for (int i = 0; i < ResIdx; i++)
+  {
+    beforeTotalLength += VOutputLength[i];
+  }
+  // Start of this detection result's section in the output buffers.
+  int offset = beforeTotalLength * numChannels;
+
+  // Output length of this detection result (lengths may differ between results).
+  int outputLength = VOutputLength[ResIdx];
+
+  // Destination rows of this thread inside the output buffers.
+  double *I_resampled = outputIdata + offset + chIdx * outputLength;
+  double *Q_resampled = outputQdata + offset + chIdx * outputLength;
+
+  // Resample both components.
+  dev_resample_double(upFactor, downFactor, I_shifted, signalLength, I_resampled);
+  dev_resample_double(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+
+  // Release the scratch buffers.
+  delete[] I_shifted;
+  delete[] Q_shifted;
+}
+
+/**
+ * ShiftingAndResamplingKernelDoubleV2
+ * Resampling kernel: frequency-shifts the original I/Q signal and resamples it.
+ * Each GPU thread handles one channel of one detection result, so
+ * numChannels * numResults threads are needed in total (1-D grid of 1-D blocks).
+ * Every (result, channel) pair writes to its own slot of alignSignalLength samples.
+ *
+ * @param origIdata original I data, numChannels rows of signalLength samples
+ * @param origQdata original Q data, same layout as origIdata
+ * @param VDownFactor down-sampling factor of each detection result
+ * @param VFrequency frequency of each detection result
+ * @param numResults number of detection results in the frame
+ * @param numChannels number of signal channels
+ * @param signalLength number of original samples per channel
+ * @param CurrentRealfreq current actual frequency
+ * @param alignSignalLength aligned length of every resampled signal
+ * @param outputIdata all resampled I data; the caller must provide enough device memory
+ * @param outputQdata all resampled Q data; the caller must provide enough device memory
+ */
+__global__ void ShiftingAndResamplingKernelDoubleV2(
+    const double *__restrict__ origIdata, const double *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency,
+    const int numResults, const int numChannels, const int signalLength,
+    const double CurrentRealfreq, const int alignSignalLength,
+    double *__restrict__ outputIdata, double *__restrict__ outputQdata)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= numChannels * numResults)
+    return;
+
+  // One thread per (detection result, channel) pair.
+  int ResIdx = idx / numChannels; // detection result index
+  int chIdx = idx % numChannels;  // channel index
+
+  const double sampling_rate = double(245.76e6);
+
+  double frequency = VFrequency[ResIdx];
+  // Frequency offset to shift by; the 1e6 factor suggests MHz inputs (TODO confirm).
+  double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
+
+  // Input rows of the channel handled by this thread (plain pointers, not references).
+  const double *I_orig = origIdata + chIdx * signalLength;
+  const double *Q_orig = origQdata + chIdx * signalLength;
+
+  // Frequency shift: multiply the I/Q samples by the local oscillator signal.
+  double *I_shifted = new double[signalLength];
+  double *Q_shifted = new double[signalLength];
+  if (I_shifted == nullptr || Q_shifted == nullptr)
+  {
+    // Free whichever buffer was obtained; delete[] ignores a null pointer.
+    delete[] I_shifted;
+    delete[] Q_shifted;
+    return;
+  }
+  for (int i = 0; i < signalLength; i++)
+  {
+    double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+    // Double-precision cos/sin: the float versions lose accuracy as the phase grows.
+    double cosVal = cos(phase);
+    double sinVal = sin(phase);
+    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
+  }
+
+  // The up-sampling factor is 1; the down-sampling factor belongs to the detection result.
+  int upFactor = 1;
+  int downFactor = VDownFactor[ResIdx];
+
+  // Output slot of this thread: index (ResIdx * numChannels + chIdx), alignSignalLength wide.
+  double *I_resampled = outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+  double *Q_resampled = outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+
+  // Resample both components.
+  dev_resample_double(upFactor, downFactor, I_shifted, signalLength, I_resampled);
+  dev_resample_double(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+
+  // Release the scratch buffers.
+  delete[] I_shifted;
+  delete[] Q_shifted;
+}
+
+/**
+ * ShiftAndResampleSignalDoubleV1
+ * Host wrapper: uploads the inputs, launches ShiftingAndResamplingKernelDoubleV1
+ * (frequency shift + resampling) and copies the resampled signals back to the host.
+ * One GPU thread is launched per (detection result, channel) pair.
+ *
+ * @param origIdata original I data, one vector per channel; every channel needs at
+ * least as many samples as the first one, whose length is used for all of them
+ * @param origQdata original Q data, same shape as origIdata
+ * @param outputLength resampled output length of each detection result
+ * @param downFactor down-sampling factor of each detection result
+ * @param detectFreq frequency of each detection result
+ * @param outputTotalLength total resampled length of one channel over all results
+ * @param numResults number of detection results
+ * @param numChannels number of signal channels
+ * @param CurrentRealfreq current actual frequency
+ * @param outputIdata host buffer of numChannels * outputTotalLength doubles; receives the
+ * resampled I data, contiguous, format [numResults][numChannels][lengthPerResult]
+ * @param outputQdata host buffer for the resampled Q data, same size and format
+ * @return true on success, false when the arguments are inconsistent; a failing CUDA call
+ * is reported by CHECK_CUDA_ERROR after the device buffers have been released
+ */
+bool ShiftAndResampleSignalDoubleV1(
+    const std::vector<std::vector<double>> &origIdata,
+    const std::vector<std::vector<double>> &origQdata,
+    std::vector<int> &outputLength,
+    std::vector<int> &downFactor,
+    std::vector<double> &detectFreq,
+    const int outputTotalLength,
+    const int numResults,
+    const int numChannels,
+    const double CurrentRealfreq,
+    double *outputIdata,
+    double *outputQdata)
+{
+  // Reject arguments that would make the copies below run out of range.
+  if (numResults <= 0 || numChannels <= 0 || outputTotalLength <= 0 ||
+      outputIdata == nullptr || outputQdata == nullptr)
+  {
+    return false;
+  }
+
+  const size_t resultCount = static_cast<size_t>(numResults);
+  const size_t channelCount = static_cast<size_t>(numChannels);
+  if (origIdata.size() < channelCount || origQdata.size() < channelCount ||
+      outputLength.size() < resultCount || downFactor.size() < resultCount ||
+      detectFreq.size() < resultCount)
+  {
+    return false;
+  }
+
+  // Per-channel signal length: taken from the first channel, the others must cover it.
+  const size_t signalSamples = origIdata[0].size();
+  for (size_t ch = 0; ch < channelCount; ch++)
+  {
+    if (origIdata[ch].size() < signalSamples || origQdata[ch].size() < signalSamples)
+    {
+      return false;
+    }
+  }
+  const int signalLength = static_cast<int>(signalSamples);
+
+  // Byte counts are computed in size_t so large frames cannot overflow int arithmetic.
+  const size_t factorBytes = resultCount * sizeof(int);
+  const size_t frequencyBytes = resultCount * sizeof(double);
+  const size_t channelBytes = signalSamples * sizeof(double);
+  const size_t outputBytes =
+      channelCount * static_cast<size_t>(outputTotalLength) * sizeof(double);
+
+  // Device buffers; all start as nullptr so the clean-up can run at any point.
+  int *d_downFactor = nullptr;
+  int *d_outputLength = nullptr;
+  double *d_frequency = nullptr;
+  double *d_Idata = nullptr;
+  double *d_Qdata = nullptr;
+  double *d_outputIdata = nullptr;
+  double *d_outputQdata = nullptr;
+
+  // Frees every device buffer exactly once; cudaFree(nullptr) is a no-op.
+  auto releaseDeviceMemory = [&]()
+  {
+    cudaFree(d_downFactor);
+    cudaFree(d_outputLength);
+    cudaFree(d_frequency);
+    cudaFree(d_Idata);
+    cudaFree(d_Qdata);
+    cudaFree(d_outputIdata);
+    cudaFree(d_outputQdata);
+  };
+
+  // Every call that can fail runs inside the try block so the buffers are always freed.
+  try
+  {
+    // Allocate the per-result parameter arrays.
+    CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, factorBytes));
+    CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, factorBytes));
+    CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, frequencyBytes));
+
+    // Upload the down-sampling factors.
+    CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, downFactor.data(), factorBytes,
+                                cudaMemcpyHostToDevice));
+
+    // Upload the frequencies.
+    CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, detectFreq.data(), frequencyBytes,
+                                cudaMemcpyHostToDevice));
+
+    // Upload the resampled output length of each detection result.
+    CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, outputLength.data(), factorBytes,
+                                cudaMemcpyHostToDevice));
+
+    // Original I/Q data, stored on the device as [numChannels][signalLength].
+    CHECK_CUDA_ERROR(cudaMalloc(&d_Idata, channelCount * channelBytes));
+    CHECK_CUDA_ERROR(cudaMalloc(&d_Qdata, channelCount * channelBytes));
+
+    // Copy the channels one by one; each host vector is a separate allocation.
+    for (size_t ch = 0; ch < channelCount; ch++)
+    {
+      // I component of channel ch
+      CHECK_CUDA_ERROR(cudaMemcpy(d_Idata + ch * signalSamples, origIdata[ch].data(),
+                                  channelBytes, cudaMemcpyHostToDevice));
+
+      // Q component of channel ch
+      CHECK_CUDA_ERROR(cudaMemcpy(d_Qdata + ch * signalSamples, origQdata[ch].data(),
+                                  channelBytes, cudaMemcpyHostToDevice));
+    }
+
+    // Device buffers for the resampled output.
+    CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, outputBytes));
+    CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, outputBytes));
+
+    // Launch configuration: numChannels threads per block, one thread per
+    // (detection result, channel) pair.
+    dim3 block(numChannels);
+    dim3 grid((numChannels * numResults + block.x - 1) / block.x);
+
+    ShiftingAndResamplingKernelDoubleV1<<<grid, block>>>(
+        d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
+        numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
+    // A kernel launch returns no status itself; fetch launch failures explicitly.
+    CHECK_CUDA_ERROR(cudaGetLastError());
+
+    // Copy the results back to the host (device -> host; the old code passed
+    // cudaMemcpyHostToDevice here). The blocking copy runs after the kernel.
+    // Format: [numResults][numChannels][lengthPerResult], contiguous.
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(outputIdata, d_outputIdata, outputBytes, cudaMemcpyDeviceToHost));
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(outputQdata, d_outputQdata, outputBytes, cudaMemcpyDeviceToHost));
+  }
+  catch (...)
+  {
+    // Do not leak device memory when CHECK_CUDA_ERROR throws; pass the error on.
+    releaseDeviceMemory();
+    throw;
+  }
+
+  // Normal path: release all seven buffers (the old code freed d_outputIdata twice
+  // and never freed d_outputQdata).
+  releaseDeviceMemory();
+  return true;
+}
+
+/**
+ * ShiftAndResampleSignalDoubleV2
+ * Host wrapper: uploads the inputs, launches ShiftingAndResamplingKernelDoubleV2
+ * (frequency shift + resampling) and copies the resampled signals back to the host.
+ * Each (detection result, channel) pair owns alignSignalLength output samples; the device
+ * output buffers are zero-filled before the kernel runs.
+ *
+ * @param origIdata original I data, one vector per channel; every channel needs at
+ * least as many samples as the first one, whose length is used for all of them
+ * @param origQdata original Q data, same shape as origIdata
+ * @param downFactor down-sampling factor of each detection result
+ * @param detectFreq frequency of each detection result
+ * @param alignSignalLength aligned length of every resampled signal
+ * @param numResults number of detection results
+ * @param numChannels number of signal channels
+ * @param CurrentRealfreq current actual frequency
+ * @param outputIdata host buffer of numResults * numChannels * alignSignalLength doubles;
+ * receives the resampled I data as [numResults][numChannels][alignSignalLength]
+ * @param outputQdata host buffer for the resampled Q data, same size and format
+ * @return true on success, false when the arguments are inconsistent; a failing CUDA call
+ * is reported by CHECK_CUDA_ERROR after the device buffers have been released
+ */
+bool ShiftAndResampleSignalDoubleV2(
+    const std::vector<std::vector<double>> &origIdata,
+    const std::vector<std::vector<double>> &origQdata,
+    std::vector<int> &downFactor,
+    std::vector<double> &detectFreq,
+    const int alignSignalLength,
+    const int numResults,
+    const int numChannels,
+    const double CurrentRealfreq,
+    double *outputIdata,
+    double *outputQdata)
+{
+  // Reject arguments that would make the copies below run out of range.
+  if (numResults <= 0 || numChannels <= 0 || alignSignalLength <= 0 ||
+      outputIdata == nullptr || outputQdata == nullptr)
+  {
+    return false;
+  }
+
+  const size_t resultCount = static_cast<size_t>(numResults);
+  const size_t channelCount = static_cast<size_t>(numChannels);
+  if (origIdata.size() < channelCount || origQdata.size() < channelCount ||
+      downFactor.size() < resultCount || detectFreq.size() < resultCount)
+  {
+    return false;
+  }
+
+  // Per-channel signal length: taken from the first channel, the others must cover it.
+  const size_t signalSamples = origIdata[0].size();
+  for (size_t ch = 0; ch < channelCount; ch++)
+  {
+    if (origIdata[ch].size() < signalSamples || origQdata[ch].size() < signalSamples)
+    {
+      return false;
+    }
+  }
+  const int signalLength = static_cast<int>(signalSamples);
+
+  // Byte counts are computed in size_t so large frames cannot overflow int arithmetic.
+  const size_t factorBytes = resultCount * sizeof(int);
+  const size_t frequencyBytes = resultCount * sizeof(double);
+  const size_t channelBytes = signalSamples * sizeof(double);
+  const size_t totalsize =
+      resultCount * channelCount * static_cast<size_t>(alignSignalLength) * sizeof(double);
+
+  // Device buffers; all start as nullptr so the clean-up can run at any point.
+  int *d_downFactor = nullptr;
+  double *d_frequency = nullptr;
+  double *d_Idata = nullptr;
+  double *d_Qdata = nullptr;
+  double *d_outputIdata = nullptr;
+  double *d_outputQdata = nullptr;
+
+  // Frees every device buffer exactly once; cudaFree(nullptr) is a no-op.
+  auto releaseDeviceMemory = [&]()
+  {
+    cudaFree(d_downFactor);
+    cudaFree(d_frequency);
+    cudaFree(d_Idata);
+    cudaFree(d_Qdata);
+    cudaFree(d_outputIdata);
+    cudaFree(d_outputQdata);
+  };
+
+  // Every call that can fail runs inside the try block so the buffers are always freed.
+  try
+  {
+    // Per-result parameters: down-sampling factors and frequencies.
+    CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, factorBytes));
+    CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, frequencyBytes));
+
+    CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, downFactor.data(), factorBytes,
+                                cudaMemcpyHostToDevice));
+    CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, detectFreq.data(), frequencyBytes,
+                                cudaMemcpyHostToDevice));
+
+    // Original I/Q data, stored on the device as [numChannels][signalLength].
+    CHECK_CUDA_ERROR(cudaMalloc(&d_Idata, channelCount * channelBytes));
+    CHECK_CUDA_ERROR(cudaMalloc(&d_Qdata, channelCount * channelBytes));
+    for (size_t ch = 0; ch < channelCount; ch++)
+    {
+      // I component of channel ch
+      CHECK_CUDA_ERROR(cudaMemcpy(d_Idata + ch * signalSamples, origIdata[ch].data(),
+                                  channelBytes, cudaMemcpyHostToDevice));
+
+      // Q component of channel ch
+      CHECK_CUDA_ERROR(cudaMemcpy(d_Qdata + ch * signalSamples, origQdata[ch].data(),
+                                  channelBytes, cudaMemcpyHostToDevice));
+    }
+
+    // Device buffers for the resampled output, initialised to zero.
+    CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, totalsize));
+    CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, totalsize));
+    CHECK_CUDA_ERROR(cudaMemset(d_outputIdata, 0, totalsize));
+    CHECK_CUDA_ERROR(cudaMemset(d_outputQdata, 0, totalsize));
+
+    // Launch configuration: numChannels threads per block, numChannels * numResults in total.
+    dim3 block(numChannels);
+    dim3 grid((numChannels * numResults + block.x - 1) / block.x);
+
+    ShiftingAndResamplingKernelDoubleV2<<<grid, block>>>(
+        d_Idata, d_Qdata, d_downFactor, d_frequency, numResults,
+        numChannels, signalLength, CurrentRealfreq, alignSignalLength,
+        d_outputIdata, d_outputQdata);
+    // A kernel launch returns no status itself; fetch launch failures explicitly.
+    CHECK_CUDA_ERROR(cudaGetLastError());
+
+    // Copy the results back to the host; the blocking copy runs after the kernel.
+    // Format: [numResults][numChannels][alignSignalLength], contiguous.
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(outputIdata, d_outputIdata, totalsize, cudaMemcpyDeviceToHost));
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(outputQdata, d_outputQdata, totalsize, cudaMemcpyDeviceToHost));
+  }
+  catch (...)
+  {
+    // Do not leak device memory when CHECK_CUDA_ERROR throws; pass the error on.
+    releaseDeviceMemory();
+    throw;
+  }
+
+  // Normal path: release all six buffers (the old code freed d_outputIdata twice
+  // and never freed d_outputQdata).
+  releaseDeviceMemory();
+  return true;
+}
\ No newline at end of file
diff --git a/cuda_resample.h b/cuda_resample_double.h
similarity index 39%
rename from cuda_resample.h
rename to cuda_resample_double.h
index ee9b2af..acbbae2 100644
--- a/cuda_resample.h
+++ b/cuda_resample_double.h
@@ -14,69 +14,8 @@
 #define M_PI 3.141592653589793238462643
 #endif
 
-template <typename T>
-struct FreqBandValue
-{
-  T frequency;
-  T bandwidth;
-  T maxvalue;
-  int startIndex;
-  int stopIndex;
-  T ebn0;
-};
-
-struct FreqBandValueFloat
-{
-  float frequency;
-  float bandwidth;
-  float maxvalue;
-  int startIndex;
-  int stopIndex;
-  float ebn0;
-};
-
-struct FreqBandValueDouble
-{
-  double frequency;
-  double bandwidth;
-  double maxvalue;
-  int startIndex;
-  int stopIndex;
-  double ebn0;
-};
-
-/**
- * ShiftAndResampleSignalFloat 和 ShiftAndResampleSignalDouble
- * 重采样函数：完成原始信号的移频，重采样等计算
- *
- * @param origIdata：原始Idata
- * @param origQdata：原始Qdata
- * @param vecOneFrameDetectResult：检测结果数据
- * @param numChannels：信号通道数
- * @param CurrentRealfreq：当前实际频率
- * @param outputIdata：
- * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @param outputQdata：
- * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @return true or false
- */
-bool ShiftAndResampleSignalFloat(
-    const std::vector<std::vector<float>> &origIdata,
-    const std::vector<std::vector<float>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValueFloat>> &
-        vecOneFrameDetectResult,
-    const int numChannels, const float CurrentRealfreq, float *outputIdata,
-    float *outputQdata);
-bool ShiftAndResampleSignalDouble(
-    const std::vector<std::vector<double>> &origIdata,
-    const std::vector<std::vector<double>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValueDouble>> &
-        vecOneFrameDetectResult,
-    const int numChannels, const double CurrentRealfreq, double *outputIdata,
-    double *outputQdata);
-
 /**
- * ShiftAndResampleSignalDouble 和  ShiftAndResampleSignalFloat
+ * ShiftAndResampleSignalDoubleV1
  * 重采样函数：完成原始信号的移频，重采样等计算
  *
  * @param origIdata：原始Idata
@@ -84,6 +23,7 @@ bool ShiftAndResampleSignalDouble(
  * @param outputLength：重采样后每个带宽对应的输出信号长度
  * @param downFactor：每个带宽对应的下采样率
  * @param detectFreq：每个带宽对应的频率
+ * @param outputTotalLength：一个通道重采样后信号的总长度
  * @param numResults：检测结果数量
  * @param numChannels：信号通道数
  * @param CurrentRealfreq：当前实际频率
@@ -93,57 +33,47 @@ bool ShiftAndResampleSignalDouble(
  * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
  * @return true or false
  */
-bool ShiftAndResampleSignalFloat(
-    const std::vector<std::vector<float>> &origIdata,
-    const std::vector<std::vector<float>> &origQdata,
-    std::vector<int> &outputLength,
-    std::vector<int> &downFactor,
-    std::vector<float> &detectFreq,
-    const int outputTotalLength, const int numResults,
-    const int numChannels, const float CurrentRealfreq, float *outputIdata,
-    float *outputQdata);
-bool ShiftAndResampleSignalDouble(
+bool ShiftAndResampleSignalDoubleV1(
     const std::vector<std::vector<double>> &origIdata,
     const std::vector<std::vector<double>> &origQdata,
     std::vector<int> &outputLength,
     std::vector<int> &downFactor,
     std::vector<double> &detectFreq,
-    const int outputTotalLength, const int numResults,
-    const int numChannels, const double CurrentRealfreq, double *outputIdata,
+    const int outputTotalLength,
+    const int numResults,
+    const int numChannels,
+    const double CurrentRealfreq,
+    double *outputIdata,
     double *outputQdata);
 
 /**
- * ShiftAndResampleSignalV2
+ * ShiftAndResampleSignalDoubleV2
  * 重采样函数：完成原始信号的移频，重采样等计算
- * 输出信号长度对齐到指定长度
  *
  * @param origIdata：原始Idata
  * @param origQdata：原始Qdata
- * @param vecOneFrameDetectResult：检测结果数据
+ * @param downFactor：每个带宽对应的下采样率
+ * @param detectFreq：每个带宽对应的频率
  * @param alignSignalLength：重采样后信号的对齐长度
+ * @param numResults：检测结果数量
  * @param numChannels：信号通道数
  * @param CurrentRealfreq：当前实际频率
  * @param outputIdata：
- * 重采样后的Idata，连续存储，格式：[numResults][numChannels][alignSignalLength]
+ * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
  * @param outputQdata：
- * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][alignSignalLength]
+ * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
  * @return true or false
  */
-bool ShiftAndResampleSignalFloatV2(
-    const std::vector<std::vector<float>> &origIdata,
-    const std::vector<std::vector<float>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValue<float>>> &
-        vecOneFrameDetectResult,
-    const int alignSignalLength,
-    const int numChannels, const float CurrentRealfreq, float *outputIdata,
-    float *outputQdata);
 bool ShiftAndResampleSignalDoubleV2(
     const std::vector<std::vector<double>> &origIdata,
     const std::vector<std::vector<double>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValue<double>>> &
-        vecOneFrameDetectResult,
+    std::vector<int> &downFactor,
+    std::vector<double> &detectFreq,
     const int alignSignalLength,
-    const int numChannels, const double CurrentRealfreq, double *outputIdata,
+    const int numResults,
+    const int numChannels,
+    const double CurrentRealfreq,
+    double *outputIdata,
     double *outputQdata);
 
 #endif // CUDA_RESAMPLE_H
diff --git a/cuda_resample.cu b/cuda_resample_float.cu
similarity index 37%
rename from cuda_resample.cu
rename to cuda_resample_float.cu
index 3e50213..da8e9e0 100644
--- a/cuda_resample.cu
+++ b/cuda_resample_float.cu
@@ -1,1856 +1,891 @@
-#include "cuda_resample.h"
-#include "upfirdn_device.h"
-
-// CHECK_CUDA_ERROR：cuda api调用错误处理
-#define CHECK_CUDA_ERROR(call)                                               \
-  do                                                                         \
-  {                                                                          \
-    cudaError_t err = call;                                                  \
-    if (err != cudaSuccess)                                                  \
-    {                                                                        \
-      std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":"      \
-                << __LINE__ << ": " << cudaGetErrorString(err) << std::endl; \
-      throw std::runtime_error("CUDA error");                                \
-    }                                                                        \
-  } while (0)
-
-#define LOG_INFO(fmt, ...)                                                \
-  fprintf(stderr, "[INFO] %s:%d (%s) " fmt, __FILE__, __LINE__, __func__, \
-          ##__VA_ARGS__)
-
-#define LOG_ERROR(fmt, ...)                                                \
-  fprintf(stderr, "[ERROR] %s:%d (%s) " fmt, __FILE__, __LINE__, __func__, \
-          ##__VA_ARGS__)
-
-// 余弦函数
-template <typename T>
-__device__ __forceinline__ T dev_cos(T x)
-{
-  if constexpr (std::is_same_v<T, float>)
-  {
-    return cosf(x);
-  }
-  else if constexpr (std::is_same_v<T, double>)
-  {
-    return cos(x);
-  }
-  else
-  {
-    return cos(static_cast<double>(x));
-  }
-}
-
-// 正弦函数
-template <typename T>
-__device__ __forceinline__ T dev_sin(T x)
-{
-  if constexpr (std::is_same_v<T, float>)
-  {
-    return sinf(x);
-  }
-  else if constexpr (std::is_same_v<T, double>)
-  {
-    return sin(x);
-  }
-  else
-  {
-    return sin(static_cast<double>(x));
-  }
-}
-
-// 浮点数绝对值
-template <typename T>
-__device__ __forceinline__ T dev_abs(T x)
-{
-  if constexpr (std::is_same_v<T, float>)
-  {
-    return fabsf(x);
-  }
-  else if constexpr (std::is_same_v<T, double>)
-  {
-    return fabs(x);
-  }
-  else
-  {
-    return fabs(static_cast<double>(x));
-  }
-}
-
-// 整数向上取整除法
-__device__ __forceinline__ int dev_quotientCeil(int num1, int num2)
-{
-  div_t result = div(num1, num2);
-  return result.quot + (result.rem != 0);
-}
-
-// CUDA设备端GCD函数:最大公约数
-__device__ __forceinline__ int dev_gcd(int a, int b)
-{
-  while (b != 0)
-  {
-    int temp = b;
-    b = a % b;
-    a = temp;
-  }
-  return a;
-}
-
-// 生成连续递增的序列
-template <typename T>
-__device__ __forceinline__ void dev_iota(T *data, int size, T start)
-{
-  for (int i = 0; i < size; i++)
-  {
-    data[i] = start + T(i);
-  }
-  return;
-}
-
-// 填充data为value
-template <typename T>
-__device__ __forceinline__ void dev_fill(T *data, int size, T value)
-{
-  for (int i = 0; i < size; i++)
-  {
-    data[i] = value;
-  }
-  return;
-}
-
-template <typename T>
-__device__ int dev_firls(T *result, int length, T *freq, const T *amplitude,
-                         int freqSize)
-{
-  // 计算权重大小
-  int weightSize = freqSize / 2;
-
-  // 初始化权重向量
-  T *weight = new T[weightSize];
-  if (weight == nullptr)
-  {
-    return -1;
-  }
-
-  // 初始化weight为全1
-  dev_fill(weight, weightSize, T(1.0));
-
-  // 处理频率向量
-  for (int i = 0; i < freqSize; i++)
-  {
-    freq[i] = freq[i] / T(2.0);
-  }
-
-  int filterLength = length + 1;
-  length = (filterLength - 1) / 2;
-
-  // 奇偶判断
-  bool Nodd = filterLength & 1;
-
-  // 创建和初始化向量k
-  int kLength = length + 1;
-  T *k = new T[kLength];
-  if (k == nullptr)
-  {
-    return -1;
-  };
-
-  // 初始化k向量为递增序列：0，1，2...
-  dev_iota(k, kLength, T(0.0));
-
-  if (!Nodd)
-  {
-    for (int i = 0; i < kLength; i++)
-    {
-      k[i] += T(0.5);
-    }
-  }
-
-  // k.erase(k.begin());
-  if (Nodd)
-  {
-    for (int i = 0; i < kLength; i++)
-    {
-      k[i] = k[i + 1];
-    }
-    kLength--;
-  }
-
-  // 创建和初始化向量b
-  int bLength = kLength;
-  if (Nodd)
-  {
-    bLength++; // 此处++，因为后面需要在b[0]处插入b0
-  }
-  T *b = new T[bLength];
-  if (b == nullptr)
-  {
-    return -1;
-  };
-
-  dev_fill(b, bLength, T(0.0));
-
-  T b0 = T(0.0);
-  for (int i = 0; i < freqSize; i += 2)
-  {
-    T Fi = freq[i];
-    T Fip1 = freq[i + 1];
-    T ampi = amplitude[i];
-    T ampip1 = amplitude[i + 1];
-    T wt2 = pow(weight[i / 2], T(2.0));
-    T m_s = (ampip1 - ampi) / (Fip1 - Fi);
-    T b1 = ampi - (m_s * Fi);
-
-    if (Nodd)
-    {
-      b0 += (b1 * (Fip1 - Fi)) +
-            m_s / T(2.0) * (pow(Fip1, T(2.0)) - pow(Fi, T(2.0))) * wt2;
-    }
-
-    // 并行计算b向量
-    for (int j = 0; j < kLength; j++)
-    {
-      T kj = k[j];
-      b[j] += (m_s / (T(4.0) * pow(M_PI, T(2.0))) *
-               (dev_cos(T(2.0) * M_PI * Fip1) - dev_cos(T(2.0) * M_PI * Fi)) /
-               (pow(kj, T(2.0)))) *
-              wt2;
-
-      b[j] += (Fip1 * (m_s * Fip1 + b1) * dev_sin(T(2.0) * kj * Fip1) -
-               Fi * (m_s * Fi + b1) * dev_sin(T(2.0) * kj * Fi)) *
-              wt2;
-    }
-  }
-
-  // 处理最终结果，将b0插入到b向量的开始
-  if (Nodd)
-  {
-    for (int i = kLength; i >= 0; i--)
-    {
-      if (i > 0)
-      {
-        b[i] = b[i - 1];
-      }
-      else
-      {
-        b[i] = b0;
-      }
-    }
-  }
-
-  // 计算a向量
-  T w0 = weight[0];
-
-  int aLength = bLength;
-  T *a = new T[aLength];
-  if (a == nullptr)
-  {
-    return -1;
-  };
-
-  // vector<T> result = {a.rbegin(), a.rend()};
-  for (int i = 0; i < aLength; i++)
-  {
-    a[i] = pow(w0, T(2.0)) * T(4.0) * b[i];
-    result[aLength - 1 - i] = a[i];
-  }
-
-  int it = 0;
-  if (Nodd)
-  {
-    it = 1;
-  }
-
-  // 构建结果向量
-  for (int i = 0; i < aLength; i++)
-  {
-    result[i] = result[i] * T(0.5);
-    if ((i + it) < aLength)
-    {
-      result[aLength + i] = a[i + it] * T(0.5);
-    }
-  }
-
-  // 释放动态分配的内存
-  delete[] weight; // 释放内存
-  delete[] k;      // 释放内存
-  delete[] b;      // 释放内存
-  delete[] a;      // 释放内存
-  return 0;
-}
-
-// 设备端Bessel函数模板
-template <typename T>
-__device__ T dev_cyl_bessel_i(int n, T x)
-{
-  if (n == 0)
-    return T(1);
-  T bessel = T(1), bessel_prev = T(1);
-  for (int i = 1; i <= n; ++i)
-  {
-    bessel = (T(2) * i - T(1)) / i * x * bessel_prev - bessel;
-    bessel_prev = bessel;
-  }
-  return bessel;
-}
-
-// 设备端凯塞窗核函数模板
-template <typename T>
-__device__ void dev_kaiser(T *window, int order, T bta)
-{
-  T Numerator, Denominator;
-  Denominator = dev_cyl_bessel_i(0, bta);
-  T od2 = (order - T(1)) / T(2);
-
-  for (int n = 0; n < order; n++)
-  {
-    T x = bta * sqrt(T(1) - pow((n - od2) / od2, T(2)));
-    Numerator = dev_cyl_bessel_i(0, x);
-    window[n] = Numerator / Denominator;
-  }
-}
-
-template <typename T>
-__device__ void dev_resample(const int upFactor, const int downFactor,
-                             const T *inputSignal, const int inputSize,
-                             T *outputSignal)
-{
-  const int n = 10;
-  const T bta = T(5.0);
-
-  if (upFactor <= 0 || downFactor <= 0)
-  {
-    return;
-  }
-
-  int gcd_o = dev_gcd(upFactor, downFactor);
-
-  upFactor /= gcd_o;
-  downFactor /= gcd_o;
-
-  if (upFactor == downFactor)
-  {
-    outputSignal = inputSignal;
-    return;
-  }
-
-  int outputSize = dev_quotientCeil(inputSize * upFactor, downFactor);
-
-  int maxFactor = (upFactor > downFactor) ? upFactor : downFactor;
-  T firlsFreq = T(1.0) / T(2.0) / static_cast<T>(maxFactor);
-
-  T firlsFreqsV[4];
-  firlsFreqsV[0] = T(0.0);
-  firlsFreqsV[1] = T(2.0) * firlsFreq;
-  firlsFreqsV[2] = T(2.0) * firlsFreq;
-  firlsFreqsV[3] = T(1.0);
-
-  T firlsAmplitudeV[4];
-  firlsAmplitudeV[0] = T(1.0);
-  firlsAmplitudeV[1] = T(1.0);
-  firlsAmplitudeV[2] = T(0.0);
-  firlsAmplitudeV[3] = T(0.0);
-
-  int freqSize = 4;
-  int length = 2 * n * maxFactor + 1;
-  int coefficientsLength = length;
-
-  T *coefficients = new T[coefficientsLength];
-  if (coefficients == nullptr)
-  {
-    return;
-  }
-  int ret = dev_firls(coefficients, length - 1, firlsFreqsV, firlsAmplitudeV,
-                      freqSize);
-  if (ret == -1)
-  {
-    LOG_ERROR("dev_firls调用失败\n");
-    return;
-  }
-
-  int windowSize = length;
-  T *window = new T[windowSize];
-  if (window == nullptr)
-  {
-    return;
-  }
-  dev_kaiser(window, length, bta);
-
-  for (int i = 0; i < coefficientsLength; i++)
-  {
-    coefficients[i] *= (upFactor * window[i]);
-  }
-
-  int lengthHalf = (length - 1) / 2;
-  int nz = downFactor - lengthHalf % downFactor;
-
-  // 分配filter空间
-  int hSize = coefficientsLength + 2 * nz;
-  T *filter = new T[hSize];
-  if (filter == nullptr)
-  {
-    return;
-  }
-
-  int filterLength = 0;
-  for (int i = 0; i < nz; i++)
-  {
-    filter[i + filterLength] = T(0.0);
-  }
-  filterLength += nz;
-
-  for (int i = 0; i < coefficientsLength; i++)
-  {
-    filter[i + filterLength] = coefficients[i];
-  }
-  filterLength += coefficientsLength;
-
-  lengthHalf += nz;
-  int delay = lengthHalf / downFactor;
-  nz = 0;
-  while (dev_quotientCeil((inputSize - 1) * upFactor + hSize + nz, downFactor) -
-             delay <
-         outputSize)
-  {
-    nz++;
-  }
-
-  for (int i = 0; i < nz; i++)
-  {
-    filter[i + filterLength] = T(0.0);
-  }
-  filterLength += nz;
-
-  // 计算
-  int paddedCoefCount = filterLength;
-  while (paddedCoefCount % upFactor)
-  {
-    paddedCoefCount++;
-  }
-
-  int coefsPerPhase = paddedCoefCount / upFactor;
-  int padding = coefsPerPhase - 1;
-  int outputCount =
-      ((inputSize + padding) * upFactor + downFactor - 1) / downFactor;
-
-  T *results = new T[outputCount];
-  if (results == nullptr)
-  {
-    return;
-  }
-
-  int resultsCount = 0;
-  upfirdn_device(upFactor, downFactor, inputSignal, inputSize, filter,
-                 filterLength, results, &resultsCount);
-
-  int j = 0;
-  for (int i = delay; i < outputSize + delay; i++)
-  {
-    outputSignal[j++] = results[i];
-  }
-
-  // 释放动态分配的内存
-  delete[] coefficients;
-  delete[] window;
-  delete[] filter;
-  delete[] results;
-  return;
-}
-
-/**
- * ShiftingAndResamplingKernel
- * 重采样核函数：完成原始信号的移频，重采样等计算
- * 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
- * 因此共 numChannels * numResults 个线程并行计算
- *
- * @param origIdata：原始Idata
- * @param origQdata：原始Qdata
- * @param VDownFactor：下采样率
- * @param VFrequency：频率
- * @param VOutputLength：每个检测结果的重采样输出信号长度
- * @param numResults：每帧的检测结果总数
- * @param numChannels：信号通道数
- * @param signalLength：每个通道的信号长度
- * @param CurrentRealfreq：当前实际频率
- * @param outputIdata：存放所有重采样后的Idata，调用时需确保显存空间足够
- * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
- * @return true or false
- */
-__global__ void ShiftingAndResamplingKernelFloat(
-    const float *__restrict__ origIdata, const float *__restrict__ origQdata,
-    const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
-    const float *__restrict__ VOutputLength, const int numResults,
-    const int numChannels, const int signalLength, const float CurrentRealfreq,
-    float *__restrict__ outputIdata, float *__restrict__ outputQdata)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= numChannels * numResults)
-    return;
-
-  // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
-  int ResIdx = idx / numChannels; // 第几个检测结果
-  int chIdx = idx % numChannels;  // 第几个通道
-
-  const float sampling_rate = float(245.76e6);
-
-  float frequency = VFrequency[ResIdx]; // 频率
-  float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
-
-  // 获取当前线程处理的通道数据地址
-  auto &I_orig = origIdata + chIdx * signalLength;
-  auto &Q_orig = origQdata + chIdx * signalLength;
-
-  // 移频：生成本振信号并相乘
-  float *I_shifted = new float[signalLength];
-  if (I_shifted == nullptr)
-  {
-    return;
-  }
-  float *Q_shifted = new float[signalLength];
-  if (Q_shifted == nullptr)
-  {
-    return;
-  }
-  for (int i = 0; i < signalLength; i++)
-  {
-    float phase = 2 * M_PI * deltaFreq * i / sampling_rate;
-    float cosVal = dev_cos(phase);
-    float sinVal = dev_sin(phase);
-    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
-    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
-  }
-
-  // 上采样因子为1，下采样因子为downFactor
-  int upFactor = 1;
-  int downFactor = VDownFactor[ResIdx];
-
-  // 计算之前带宽，对应的输出信号的总长度
-  int beforeTotalLength = 0;
-  for (int i = 0; i < ResIdx; i++)
-  {
-    beforeTotalLength += VOutputLength[i];
-  }
-  // 当前带宽对应的输出信号的起始地址偏移
-  int offset = beforeTotalLength * numChannels;
-
-  // 获取当前检测结果对应的输出信号长度（这里假设的是每个带宽对应的输出信号长度可能不相同）
-  int outputLength = VOutputLength[ResIdx];
-
-  // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  auto &I_resampled = outputIdata + offset + chIdx * outputLength;
-  // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  auto &Q_resampled = outputQdata + offset + chIdx * outputLength;
-
-  // 重采样
-  dev_resample(upFactor, downFactor, I_shifted, signalLength, I_resampled);
-  dev_resample(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
-
-  // 释放动态分配的内存
-  delete[] I_shifted;
-  delete[] Q_shifted;
-}
-
-__global__ void ShiftingAndResamplingKernelDouble(
-    const double *__restrict__ origIdata, const double *__restrict__ origQdata,
-    const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency,
-    const double *__restrict__ VOutputLength, const int numResults,
-    const int numChannels, const int signalLength, const double CurrentRealfreq,
-    double *__restrict__ outputIdata, double *__restrict__ outputQdata)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= numChannels * numResults)
-    return;
-
-  // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
-  int ResIdx = idx / numChannels; // 第几个检测结果
-  int chIdx = idx % numChannels;  // 第几个通道
-
-  const double sampling_rate = double(245.76e6);
-
-  double frequency = VFrequency[ResIdx]; // 频率
-  double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
-
-  // 获取当前线程处理的通道数据地址
-  auto &I_orig = origIdata + chIdx * signalLength;
-  auto &Q_orig = origQdata + chIdx * signalLength;
-
-  // 移频：生成本振信号并相乘
-  double *I_shifted = new double[signalLength];
-  if (I_shifted == nullptr)
-  {
-    return;
-  }
-  double *Q_shifted = new double[signalLength];
-  if (Q_shifted == nullptr)
-  {
-    return;
-  }
-  for (int i = 0; i < signalLength; i++)
-  {
-    double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
-    double cosVal = dev_cos(phase);
-    double sinVal = dev_sin(phase);
-    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
-    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
-  }
-
-  // 上采样因子为1，下采样因子为downFactor
-  int upFactor = 1;
-  int downFactor = VDownFactor[ResIdx];
-
-  // 计算之前带宽，对应的输出信号的总长度
-  int beforeTotalLength = 0;
-  for (int i = 0; i < ResIdx; i++)
-  {
-    beforeTotalLength += VOutputLength[i];
-  }
-  // 当前带宽对应的输出信号的起始地址偏移
-  int offset = beforeTotalLength * numChannels;
-
-  // 获取当前检测结果对应的输出信号长度（这里假设的是每个带宽对应的输出信号长度可能不相同）
-  int outputLength = VOutputLength[ResIdx];
-
-  // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  auto &I_resampled = outputIdata + offset + chIdx * outputLength;
-  // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  auto &Q_resampled = outputQdata + offset + chIdx * outputLength;
-
-  // 重采样
-  dev_resample(upFactor, downFactor, I_shifted, signalLength, I_resampled);
-  dev_resample(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
-
-  // 释放动态分配的内存
-  delete[] I_shifted;
-  delete[] Q_shifted;
-}
-
-/**
- * ShiftingAndResamplingKernelV2
- * 重采样核函数：完成原始信号的移频，重采样等计算
- * 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
- * 因此共 numChannels * numResults 个线程并行计算
- *
- * @param origIdata：原始Idata
- * @param origQdata：原始Qdata
- * @param VDownFactor：下采样率
- * @param VFrequency：频率
- * @param numResults：每帧的检测结果总数
- * @param numChannels：信号通道数
- * @param signalLength：每个通道的原始信号长度
- * @param CurrentRealfreq：当前实际频率
- * @param alignSignalLength：重采样后信号的对齐长度
- * @param outputIdata：存放所有重采样后的Idata，调用时需确保显存空间足够
- * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
- * @return true or false
- */
-__global__ void ShiftingAndResamplingKernelFloatV2(
-    const float *__restrict__ origIdata, const float *__restrict__ origQdata,
-    const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
-    const int numResults, const int numChannels, const int signalLength,
-    const float CurrentRealfreq, const int alignSignalLength,
-    float *__restrict__ outputIdata, float *__restrict__ outputQdata)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= numChannels * numResults)
-    return;
-
-  // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
-  int ResIdx = idx / numChannels; // 第几个检测结果
-  int chIdx = idx % numChannels;  // 第几个通道
-
-  const float sampling_rate = float(245.76e6);
-
-  float frequency = VFrequency[ResIdx]; // 频率
-  float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
-
-  // 获取当前线程处理的通道数据地址
-  auto &I_orig = origIdata + chIdx * signalLength;
-  auto &Q_orig = origQdata + chIdx * signalLength;
-
-  // 移频：生成本振信号并相乘
-  float *I_shifted = new float[signalLength];
-  if (I_shifted == nullptr)
-  {
-    return;
-  }
-  float *Q_shifted = new float[signalLength];
-  if (Q_shifted == nullptr)
-  {
-    return;
-  }
-  for (int i = 0; i < signalLength; i++)
-  {
-    float phase = 2 * M_PI * deltaFreq * i / sampling_rate;
-    float cosVal = dev_cos(phase);
-    float sinVal = dev_sin(phase);
-    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
-    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
-  }
-
-  // 上采样因子为1，下采样因子为downFactor
-  int upFactor = 1;
-  int downFactor = VDownFactor[ResIdx];
-
-  // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  auto &I_resampled = outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
-  // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  auto &Q_resampled = outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
-
-  // 重采样
-  dev_resample(upFactor, downFactor, I_shifted, signalLength, I_resampled);
-  dev_resample(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
-
-  // 释放动态分配的内存
-  delete[] I_shifted;
-  delete[] Q_shifted;
-}
-
-__global__ void ShiftingAndResamplingKernelDoubleV2(
-    const double *__restrict__ origIdata, const double *__restrict__ origQdata,
-    const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency,
-    const int numResults, const int numChannels, const int signalLength,
-    const double CurrentRealfreq, const int alignSignalLength,
-    double *__restrict__ outputIdata, double *__restrict__ outputQdata)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= numChannels * numResults)
-    return;
-
-  // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
-  int ResIdx = idx / numChannels; // 第几个检测结果
-  int chIdx = idx % numChannels;  // 第几个通道
-
-  const double sampling_rate = double(245.76e6);
-
-  double frequency = VFrequency[ResIdx]; // 频率
-  double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
-
-  // 获取当前线程处理的通道数据地址
-  auto &I_orig = origIdata + chIdx * signalLength;
-  auto &Q_orig = origQdata + chIdx * signalLength;
-
-  // 移频：生成本振信号并相乘
-  double *I_shifted = new double[signalLength];
-  if (I_shifted == nullptr)
-  {
-    return;
-  }
-  double *Q_shifted = new double[signalLength];
-  if (Q_shifted == nullptr)
-  {
-    return;
-  }
-  for (int i = 0; i < signalLength; i++)
-  {
-    double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
-    double cosVal = dev_cos(phase);
-    double sinVal = dev_sin(phase);
-    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
-    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
-  }
-
-  // 上采样因子为1，下采样因子为downFactor
-  int upFactor = 1;
-  int downFactor = VDownFactor[ResIdx];
-
-  // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  auto &I_resampled = outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
-  // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  auto &Q_resampled = outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
-
-  // 重采样
-  dev_resample(upFactor, downFactor, I_shifted, signalLength, I_resampled);
-  dev_resample(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
-
-  // 释放动态分配的内存
-  delete[] I_shifted;
-  delete[] Q_shifted;
-}
-
-inline int quotientCeil(int num1, int num2)
-{
-  if (num1 % num2 != 0)
-    return num1 / num2 + 1;
-  return num1 / num2;
-}
-
-/**
- * ShiftAndResampleSignalFloat
- * 重采样函数：完成原始信号的移频，重采样等计算
- *
- * @param origIdata：原始Idata
- * @param origQdata：原始Qdata
- * @param vecOneFrameDetectResult：检测结果数据
- * @param numChannels：信号通道数
- * @param CurrentRealfreq：当前实际频率
- * @param outputIdata：
- * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @param outputQdata：
- * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @return true or false
- */
-bool ShiftAndResampleSignalFloat(
-    const std::vector<std::vector<float>> &origIdata,
-    const std::vector<std::vector<float>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValueFloat>> &
-        vecOneFrameDetectResult,
-    const int numChannels, const float CurrentRealfreq, float *outputIdata,
-    float *outputQdata)
-{
-  // 每个通道的信号长度：这里假设所有通道的长度是相同的
-  int signalLength = origIdata[0].size();
-
-  // 检测结果数据，采用vector存储更好，没必要使用map
-  int numResults = vecOneFrameDetectResult.size();
-
-  int upFactor = 1;          // 上采样率，默认为1
-  int outputTotalLength = 0; // 保存总的输出信号长度
-
-  // 每个检测结果，根据下采样率计算的输出信号长度（这里假设输出信号长度不相同）
-  std::vector<int> outputLength;
-  // 上采样率
-  std::vector<int> downFactor;
-  // 检测结果的频率
-  std::vector<float> detectFreq;
-  // 检测结果的带宽
-  std::vector<float> detectBandwidth;
-
-  // 根据检测结果，初始化相关变量或者vector
-  // vecOneFrameDetectResult 可以不使用map，使用vector
-  for (const auto &[freq, fbv] : vecOneFrameDetectResult.back())
-  {
-    float bandwidth = fbv.bandwidth * 1e6;
-    int decimation = 0;
-    if (std::abs(bandwidth - 40e6) < 2 * 1e6)
-    {
-      decimation = 4;
-    }
-    else if (std::abs(bandwidth - 20e6) < 2 * 1e6)
-    {
-      decimation = 8;
-    }
-    else if (std::abs(bandwidth - 10e6) < 2 * 1e6)
-    {
-      decimation = 16;
-    }
-    else
-    {
-      // 带宽不符合要求，跳过处理
-      std::cout << __FUNCTION__ << ":带宽 " << bandwidth << "--- 不符合 ";
-      continue;
-    }
-
-    downFactor.push_back(decimation);
-    detectFreq.push_back(fbv.frequency);
-    detectBandwidth.push_back(bandwidth);
-
-    // 计算每个下采样率,重采样后的输出信号长度
-    int length = quotientCeil(signalLength * upFactor, decimation);
-    outputLength.push_back(length);
-
-    // 重采样后输出信号的总长度
-    outputTotalLength += length;
-  }
-
-  // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
-  // copy下采样率，频率等数据到显存中
-  int *d_downFactor = nullptr;
-  int *d_outputLength = nullptr;
-  float *d_frequency = nullptr;
-  // 申请显存
-  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(float))));
-
-  // copy下采样率到显存中
-  const int *src_downFactor = downFactor.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
-                              numResults * sizeof(int),
-                              cudaMemcpyHostToDevice));
-
-  // copy频率到显存中
-  const T *src_frequency = detectFreq.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(float), cudaMemcpyHostToDevice));
-
-  // copy每个带宽，重采样后输出信号长度到显存中
-  const int *src_outputLength = outputLength.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, src_outputLength,
-                              numResults * sizeof(int),
-                              cudaMemcpyHostToDevice));
-
-  // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
-  float *d_Idata = nullptr;
-  float *d_Qdata = nullptr;
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(float))));
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(float))));
-
-  // 将所有通道数据循环拷贝到GPU显存
-  size_t copySize = signalLength * sizeof(float);
-  for (int i = 0; i < numChannels; i++)
-  {
-    // copy 原始的idata 到gpu显存
-    float *dst_idata = d_Idata + i * signalLength;
-    const void *src_idata = origIdata[i].data();
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
-
-    // copy 原始的qdata 到gpu显存
-    float *dst_qdata = d_Qdata + i * signalLength;
-    const void *src_qdata = origQdata[i].data();
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
-  }
-
-  // 申请重采样后输出信号的GPU显存
-  float *d_outputIdata = nullptr;
-  float *d_outputQdata = nullptr;
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(float))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(float))));
-
-  // 线程数配置
-  dim3 block(numChannels);
-  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
-
-  ShiftingAndResamplingKernelFloat<<<grid, block>>>(
-      d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
-      numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
-
-  // copy重采样计算结果到主存
-  // outputIdata 确保空间够
-  // outputQdata 确保空间够
-  // 存储格式：[numResults][numChannels][lengthPerResults]
-  // 且在内存中是连续存放的
-  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(float)),
-                              cudaMemcpyHostToDevice));
-
-  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(float)),
-                              cudaMemcpyHostToDevice));
-
-  // 释放显存
-  if (d_downFactor)
-  {
-    cudaFree(d_downFactor);
-    d_downFactor = nullptr;
-  }
-
-  if (d_outputLength)
-  {
-    cudaFree(d_outputLength);
-    d_outputLength = nullptr;
-  }
-
-  if (d_frequency)
-  {
-    cudaFree(d_frequency);
-    d_frequency = nullptr;
-  }
-
-  if (d_Idata)
-  {
-    cudaFree(d_Idata);
-    d_Idata = nullptr;
-  }
-
-  if (d_Qdata)
-  {
-    cudaFree(d_Qdata);
-    d_Qdata = nullptr;
-  }
-
-  if (d_outputIdata)
-  {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
-  }
-
-  if (d_outputIdata)
-  {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
-  }
-
-  return true;
-}
-
-/**
- * ShiftAndResampleSignalDouble
- * 重采样函数：完成原始信号的移频，重采样等计算
- *
- * @param origIdata：原始Idata
- * @param origQdata：原始Qdata
- * @param vecOneFrameDetectResult：检测结果数据
- * @param numChannels：信号通道数
- * @param CurrentRealfreq：当前实际频率
- * @param outputIdata：
- * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @param outputQdata：
- * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @return true or false
- */
-bool ShiftAndResampleSignalDouble(
-    const std::vector<std::vector<double>> &origIdata,
-    const std::vector<std::vector<double>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValueDouble>> &
-        vecOneFrameDetectResult,
-    const int numChannels, const double CurrentRealfreq, double *outputIdata,
-    double *outputQdata)
-{
-  // 每个通道的信号长度：这里假设所有通道的长度是相同的
-  int signalLength = origIdata[0].size();
-
-  // 检测结果数据，采用vector存储更好，没必要使用map
-  int numResults = vecOneFrameDetectResult.size();
-
-  int upFactor = 1;          // 上采样率，默认为1
-  int outputTotalLength = 0; // 保存总的输出信号长度
-
-  // 每个检测结果，根据下采样率计算的输出信号长度（这里假设输出信号长度不相同）
-  std::vector<int> outputLength;
-  // 上采样率
-  std::vector<int> downFactor;
-  // 检测结果的频率
-  std::vector<double> detectFreq;
-  // 检测结果的带宽
-  std::vector<double> detectBandwidth;
-
-  // 根据检测结果，初始化相关变量或者vector
-  // vecOneFrameDetectResult 可以不使用map，使用vector
-  for (const auto &[freq, fbv] : vecOneFrameDetectResult.back())
-  {
-    double bandwidth = fbv.bandwidth * 1e6;
-    int decimation = 0;
-    if (std::abs(bandwidth - 40e6) < 2 * 1e6)
-    {
-      decimation = 4;
-    }
-    else if (std::abs(bandwidth - 20e6) < 2 * 1e6)
-    {
-      decimation = 8;
-    }
-    else if (std::abs(bandwidth - 10e6) < 2 * 1e6)
-    {
-      decimation = 16;
-    }
-    else
-    {
-      // 带宽不符合要求，跳过处理
-      std::cout << __FUNCTION__ << ":带宽 " << bandwidth << "--- 不符合 ";
-      continue;
-    }
-
-    downFactor.push_back(decimation);
-    detectFreq.push_back(fbv.frequency);
-    detectBandwidth.push_back(bandwidth);
-
-    // 计算每个下采样率,重采样后的输出信号长度
-    int length = quotientCeil(signalLength * upFactor, decimation);
-    outputLength.push_back(length);
-
-    // 重采样后输出信号的总长度
-    outputTotalLength += length;
-  }
-
-  // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
-  // copy下采样率，频率等数据到显存中
-  int *d_downFactor = nullptr;
-  int *d_outputLength = nullptr;
-  double *d_frequency = nullptr;
-  // 申请显存
-  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(double))));
-
-  // copy下采样率到显存中
-  const int *src_downFactor = downFactor.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
-                              numResults * sizeof(int),
-                              cudaMemcpyHostToDevice));
-
-  // copy频率到显存中
-  const T *src_frequency = detectFreq.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(double), cudaMemcpyHostToDevice));
-
-  // copy每个带宽，重采样后输出信号长度到显存中
-  const int *src_outputLength = outputLength.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, src_outputLength,
-                              numResults * sizeof(int),
-                              cudaMemcpyHostToDevice));
-
-  // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
-  double *d_Idata = nullptr;
-  double *d_Qdata = nullptr;
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(double))));
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(double))));
-
-  // 将所有通道数据循环拷贝到GPU显存
-  size_t copySize = signalLength * sizeof(double);
-  for (int i = 0; i < numChannels; i++)
-  {
-    // copy 原始的idata 到gpu显存
-    double *dst_idata = d_Idata + i * signalLength;
-    const void *src_idata = origIdata[i].data();
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
-
-    // copy 原始的qdata 到gpu显存
-    double *dst_qdata = d_Qdata + i * signalLength;
-    const void *src_qdata = origQdata[i].data();
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
-  }
-
-  // 申请重采样后输出信号的GPU显存
-  double *d_outputIdata = nullptr;
-  double *d_outputQdata = nullptr;
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(double))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(double))));
-
-  // 线程数配置
-  dim3 block(numChannels);
-  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
-
-  ShiftingAndResamplingKernelDouble<<<grid, block>>>(
-      d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
-      numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
-
-  // copy重采样计算结果到主存
-  // outputIdata 确保空间够
-  // outputQdata 确保空间够
-  // 存储格式：[numResults][numChannels][lengthPerResults]
-  // 且在内存中是连续存放的
-  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(double)),
-                              cudaMemcpyHostToDevice));
-
-  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(double)),
-                              cudaMemcpyHostToDevice));
-
-  // 释放显存
-  if (d_downFactor)
-  {
-    cudaFree(d_downFactor);
-    d_downFactor = nullptr;
-  }
-
-  if (d_outputLength)
-  {
-    cudaFree(d_outputLength);
-    d_outputLength = nullptr;
-  }
-
-  if (d_frequency)
-  {
-    cudaFree(d_frequency);
-    d_frequency = nullptr;
-  }
-
-  if (d_Idata)
-  {
-    cudaFree(d_Idata);
-    d_Idata = nullptr;
-  }
-
-  if (d_Qdata)
-  {
-    cudaFree(d_Qdata);
-    d_Qdata = nullptr;
-  }
-
-  if (d_outputIdata)
-  {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
-  }
-
-  if (d_outputIdata)
-  {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
-  }
-
-  return true;
-}
-
-bool ShiftAndResampleSignalFloat(
-    const std::vector<std::vector<float>> &origIdata,
-    const std::vector<std::vector<float>> &origQdata,
-    std::vector<int> &outputLength,
-    std::vector<int> &downFactor,
-    std::vector<float> &detectFreq,
-    const int outputTotalLength, const int numResults,
-    const int numChannels, const float CurrentRealfreq, float *outputIdata,
-    float *outputQdata)
-{
-  // 每个通道的信号长度：这里假设所有通道的长度是相同的
-  int signalLength = origIdata[0].size();
-
-  int upFactor = 1; // 上采样率，默认为1
-
-  // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
-  // copy下采样率，频率等数据到显存中
-  int *d_downFactor = nullptr;
-  int *d_outputLength = nullptr;
-  float *d_frequency = nullptr;
-  // 申请显存
-  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(float))));
-
-  // copy下采样率到显存中
-  const int *src_downFactor = downFactor.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
-                              numResults * sizeof(int),
-                              cudaMemcpyHostToDevice));
-
-  // copy频率到显存中
-  const T *src_frequency = detectFreq.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(float), cudaMemcpyHostToDevice));
-
-  // copy每个带宽，重采样后输出信号长度到显存中
-  const int *src_outputLength = outputLength.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, src_outputLength,
-                              numResults * sizeof(int),
-                              cudaMemcpyHostToDevice));
-
-  // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
-  float *d_Idata = nullptr;
-  float *d_Qdata = nullptr;
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(float))));
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(float))));
-
-  // 将所有通道数据循环拷贝到GPU显存
-  size_t copySize = signalLength * sizeof(float);
-  for (int i = 0; i < numChannels; i++)
-  {
-    // copy 原始的idata 到gpu显存
-    float *dst_idata = d_Idata + i * signalLength;
-    const void *src_idata = origIdata[i].data();
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
-
-    // copy 原始的qdata 到gpu显存
-    float *dst_qdata = d_Qdata + i * signalLength;
-    const void *src_qdata = origQdata[i].data();
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
-  }
-
-  // 申请重采样后输出信号的GPU显存
-  float *d_outputIdata = nullptr;
-  float *d_outputQdata = nullptr;
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(float))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(float))));
-
-  // 线程数配置
-  dim3 block(numChannels);
-  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
-
-  ShiftingAndResamplingKernelDouble<<<grid, block>>>(
-      d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
-      numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
-
-  // copy重采样计算结果到主存
-  // outputIdata 确保空间够
-  // outputQdata 确保空间够
-  // 存储格式：[numResults][numChannels][lengthPerResults]
-  // 且在内存中是连续存放的
-  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(float)),
-                              cudaMemcpyHostToDevice));
-
-  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(float)),
-                              cudaMemcpyHostToDevice));
-
-  // 释放显存
-  if (d_downFactor)
-  {
-    cudaFree(d_downFactor);
-    d_downFactor = nullptr;
-  }
-
-  if (d_outputLength)
-  {
-    cudaFree(d_outputLength);
-    d_outputLength = nullptr;
-  }
-
-  if (d_frequency)
-  {
-    cudaFree(d_frequency);
-    d_frequency = nullptr;
-  }
-
-  if (d_Idata)
-  {
-    cudaFree(d_Idata);
-    d_Idata = nullptr;
-  }
-
-  if (d_Qdata)
-  {
-    cudaFree(d_Qdata);
-    d_Qdata = nullptr;
-  }
-
-  if (d_outputIdata)
-  {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
-  }
-
-  if (d_outputIdata)
-  {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
-  }
-
-  return true;
-}
-
-/**
- * ShiftAndResampleSignalDouble
- * 重采样函数：完成原始信号的移频，重采样等计算
- *
- * @param origIdata：原始Idata
- * @param origQdata：原始Qdata
- * @param outputLength：重采样后每个带宽对应的输出信号长度
- * @param downFactor：每个带宽对应的下采样率
- * @param detectFreq：每个带宽对应的频率
- * @param numResults：检测结果数量
- * @param numChannels：信号通道数
- * @param CurrentRealfreq：当前实际频率
- * @param outputIdata：
- * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @param outputQdata：
- * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
- * @return true or false
- */
-bool ShiftAndResampleSignalDouble(
-    const std::vector<std::vector<double>> &origIdata,
-    const std::vector<std::vector<double>> &origQdata,
-    std::vector<int> &outputLength,
-    std::vector<int> &downFactor,
-    std::vector<double> &detectFreq,
-    const int outputTotalLength, const int numResults,
-    const int numChannels, const double CurrentRealfreq, double *outputIdata,
-    double *outputQdata)
-{
-  // 每个通道的信号长度：这里假设所有通道的长度是相同的
-  int signalLength = origIdata[0].size();
-
-  int upFactor = 1; // 上采样率，默认为1
-
-  // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
-  // copy下采样率，频率等数据到显存中
-  int *d_downFactor = nullptr;
-  int *d_outputLength = nullptr;
-  double *d_frequency = nullptr;
-  // 申请显存
-  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(double))));
-
-  // copy下采样率到显存中
-  const int *src_downFactor = downFactor.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
-                              numResults * sizeof(int),
-                              cudaMemcpyHostToDevice));
-
-  // copy频率到显存中
-  const T *src_frequency = detectFreq.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(double), cudaMemcpyHostToDevice));
-
-  // copy每个带宽，重采样后输出信号长度到显存中
-  const int *src_outputLength = outputLength.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, src_outputLength,
-                              numResults * sizeof(int),
-                              cudaMemcpyHostToDevice));
-
-  // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
-  double *d_Idata = nullptr;
-  double *d_Qdata = nullptr;
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(double))));
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(double))));
-
-  // 将所有通道数据循环拷贝到GPU显存
-  size_t copySize = signalLength * sizeof(double);
-  for (int i = 0; i < numChannels; i++)
-  {
-    // copy 原始的idata 到gpu显存
-    double *dst_idata = d_Idata + i * signalLength;
-    const void *src_idata = origIdata[i].data();
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
-
-    // copy 原始的qdata 到gpu显存
-    double *dst_qdata = d_Qdata + i * signalLength;
-    const void *src_qdata = origQdata[i].data();
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
-  }
-
-  // 申请重采样后输出信号的GPU显存
-  double *d_outputIdata = nullptr;
-  double *d_outputQdata = nullptr;
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(double))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(double))));
-
-  // 线程数配置
-  dim3 block(numChannels);
-  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
-
-  ShiftingAndResamplingKernelDouble<<<grid, block>>>(
-      d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
-      numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
-
-  // copy重采样计算结果到主存
-  // outputIdata 确保空间够
-  // outputQdata 确保空间够
-  // 存储格式：[numResults][numChannels][lengthPerResults]
-  // 且在内存中是连续存放的
-  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(double)),
-                              cudaMemcpyHostToDevice));
-
-  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(double)),
-                              cudaMemcpyHostToDevice));
-
-  // 释放显存
-  if (d_downFactor)
-  {
-    cudaFree(d_downFactor);
-    d_downFactor = nullptr;
-  }
-
-  if (d_outputLength)
-  {
-    cudaFree(d_outputLength);
-    d_outputLength = nullptr;
-  }
-
-  if (d_frequency)
-  {
-    cudaFree(d_frequency);
-    d_frequency = nullptr;
-  }
-
-  if (d_Idata)
-  {
-    cudaFree(d_Idata);
-    d_Idata = nullptr;
-  }
-
-  if (d_Qdata)
-  {
-    cudaFree(d_Qdata);
-    d_Qdata = nullptr;
-  }
-
-  if (d_outputIdata)
-  {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
-  }
-
-  if (d_outputIdata)
-  {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
-  }
-
-  return true;
-}
-
-/**
- * ShiftAndResampleSignalV2
- * 重采样函数：完成原始信号的移频，重采样等计算
- *
- * @param origIdata：原始Idata
- * @param origQdata：原始Qdata
- * @param vecOneFrameDetectResult：检测结果数据
- * @param alignSignalLength：重采样后信号的对齐长度
- * @param numChannels：信号通道数
- * @param CurrentRealfreq：当前实际频率
- * @param outputIdata：
- * 重采样后的Idata，连续存储，格式：[numResults][numChannels][alignSignalLength]
- * @param outputQdata：
- * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][alignSignalLength]
- * @return true or false
- */
-bool ShiftAndResampleSignalFloatV2(
-    const std::vector<std::vector<float>> &origIdata,
-    const std::vector<std::vector<float>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValue<float>>> &
-        vecOneFrameDetectResult,
-    const int alignSignalLength,
-    const int numChannels, const float CurrentRealfreq, float *outputIdata,
-    float *outputQdata)
-{
-  // 每个通道的信号长度：这里假设所有通道的长度是相同的
-  int signalLength = origIdata[0].size();
-
-  // 检测结果数据，采用vector存储更好，没必要使用map
-  int numResults = vecOneFrameDetectResult.size();
-
-  int upFactor = 1;          // 上采样率，默认为1
-  int outputTotalLength = 0; // 保存总的输出信号长度
-
-  // 上采样率
-  std::vector<int> downFactor;
-  // 检测结果的频率
-  std::vector<float> detectFreq;
-  // 检测结果的带宽
-  std::vector<float> detectBandwidth;
-
-  // 根据检测结果，初始化相关变量或者vector
-  // vecOneFrameDetectResult 可以不使用map，使用vector
-  for (const auto &[freq, fbv] : vecOneFrameDetectResult.back())
-  {
-    float bandwidth = fbv.bandwidth * 1e6;
-    int decimation = 0;
-    if (std::abs(bandwidth - 40e6) < 2 * 1e6)
-    {
-      decimation = 4;
-    }
-    else if (std::abs(bandwidth - 20e6) < 2 * 1e6)
-    {
-      decimation = 8;
-    }
-    else if (std::abs(bandwidth - 10e6) < 2 * 1e6)
-    {
-      decimation = 16;
-    }
-    else
-    {
-      // 带宽不符合要求，跳过处理
-      std::cout << __FUNCTION__ << ":带宽 " << bandwidth << "--- 不符合 ";
-      continue;
-    }
-
-    downFactor.push_back(decimation);
-    detectFreq.push_back(fbv.frequency);
-    detectBandwidth.push_back(bandwidth);
-  }
-
-  // ====准备调用重采样核函数：ShiftingAndResamplingKernelV2=====
-  // copy下采样率，频率等数据到显存中
-  int *d_downFactor = nullptr;
-  float *d_frequency = nullptr;
-  // 申请显存
-  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(float))));
-
-  // copy下采样率到显存中
-  const int *src_downFactor = downFactor.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
-                              numResults * sizeof(int),
-                              cudaMemcpyHostToDevice));
-
-  // copy频率到显存中
-  const float *src_frequency = detectFreq.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(float), cudaMemcpyHostToDevice));
-
-  // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
-  float *d_Idata = nullptr;
-  float *d_Qdata = nullptr;
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(float))));
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(float))));
-
-  // 将所有通道数据循环拷贝到GPU显存
-  size_t copySize = signalLength * sizeof(float);
-  for (int i = 0; i < numChannels; i++)
-  {
-    // copy 原始的idata 到gpu显存
-    float *dst_idata = d_Idata + i * signalLength;
-    const void *src_idata = origIdata[i].data();
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
-
-    // copy 原始的qdata 到gpu显存
-    float *dst_qdata = d_Qdata + i * signalLength;
-    const void *src_qdata = origQdata[i].data();
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
-  }
-
-  // 申请重采样后输出信号的GPU显存
-  size_t totalsize = numResults * numChannels * alignSignalLength * sizeof(float);
-  float *d_outputIdata = nullptr;
-  float *d_outputQdata = nullptr;
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, totalsize));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, totalsize));
-
-  // 初始化为0
-  CHECK_CUDA_ERROR(cudaMemset(d_outputIdata, 0, totalsize));
-  CHECK_CUDA_ERROR(cudaMemset(d_outputQdata, 0, totalsize));
-
-  // 线程数配置，总的线程数：numChannels * numResults
-  dim3 block(numChannels);
-  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
-  ShiftingAndResamplingKernelV2<<<grid, block>>>(
-      d_Idata, d_Qdata, d_downFactor, d_frequency, numResults,
-      numChannels, signalLength, CurrentRealfreq, alignSignalLength,
-      d_outputIdata, d_outputQdata);
-
-  // copy重采样计算结果到主存
-  // outputIdata 确保申请的内存空间够
-  // outputQdata 确保申请的内存空间够
-  // 存储格式：[numResults][numChannels][lengthPerResults]
-  // 且在内存中是连续存放的
-  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(float)),
-                              cudaMemcpyDeviceToHost));
-
-  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(float)),
-                              cudaMemcpyDeviceToHost));
-
-  // 释放显存
-  if (d_downFactor)
-  {
-    cudaFree(d_downFactor);
-    d_downFactor = nullptr;
-  }
-
-  if (d_outputLength)
-  {
-    cudaFree(d_outputLength);
-    d_outputLength = nullptr;
-  }
-
-  if (d_frequency)
-  {
-    cudaFree(d_frequency);
-    d_frequency = nullptr;
-  }
-
-  if (d_Idata)
-  {
-    cudaFree(d_Idata);
-    d_Idata = nullptr;
-  }
-
-  if (d_Qdata)
-  {
-    cudaFree(d_Qdata);
-    d_Qdata = nullptr;
-  }
-
-  if (d_outputIdata)
-  {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
-  }
-
-  if (d_outputIdata)
-  {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
-  }
-
-  return true;
-}
-
-bool ShiftAndResampleSignalDoubleV2(
-    const std::vector<std::vector<double>> &origIdata,
-    const std::vector<std::vector<double>> &origQdata,
-    const std::vector<std::map<int64_t, FreqBandValue<double>>> &
-        vecOneFrameDetectResult,
-    const int alignSignalLength,
-    const int numChannels, const double CurrentRealfreq, double *outputIdata,
-    double *outputQdata)
-{
-  // 每个通道的信号长度：这里假设所有通道的长度是相同的
-  int signalLength = origIdata[0].size();
-
-  // 检测结果数据，采用vector存储更好，没必要使用map
-  int numResults = vecOneFrameDetectResult.size();
-
-  int upFactor = 1;          // 上采样率，默认为1
-  int outputTotalLength = 0; // 保存总的输出信号长度
-
-  // 上采样率
-  std::vector<int> downFactor;
-  // 检测结果的频率
-  std::vector<double> detectFreq;
-  // 检测结果的带宽
-  std::vector<double> detectBandwidth;
-
-  // 根据检测结果，初始化相关变量或者vector
-  // vecOneFrameDetectResult 可以不使用map，使用vector
-  for (const auto &[freq, fbv] : vecOneFrameDetectResult.back())
-  {
-    double bandwidth = fbv.bandwidth * 1e6;
-    int decimation = 0;
-    if (std::abs(bandwidth - 40e6) < 2 * 1e6)
-    {
-      decimation = 4;
-    }
-    else if (std::abs(bandwidth - 20e6) < 2 * 1e6)
-    {
-      decimation = 8;
-    }
-    else if (std::abs(bandwidth - 10e6) < 2 * 1e6)
-    {
-      decimation = 16;
-    }
-    else
-    {
-      // 带宽不符合要求，跳过处理
-      std::cout << __FUNCTION__ << ":带宽 " << bandwidth << "--- 不符合 ";
-      continue;
-    }
-
-    downFactor.push_back(decimation);
-    detectFreq.push_back(fbv.frequency);
-    detectBandwidth.push_back(bandwidth);
-  }
-
-  // ====准备调用重采样核函数：ShiftingAndResamplingKernelV2=====
-  // copy下采样率，频率等数据到显存中
-  int *d_downFactor = nullptr;
-  double *d_frequency = nullptr;
-  // 申请显存
-  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(double))));
-
-  // copy下采样率到显存中
-  const int *src_downFactor = downFactor.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
-                              numResults * sizeof(int),
-                              cudaMemcpyHostToDevice));
-
-  // copy频率到显存中
-  const double *src_frequency = detectFreq.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(double), cudaMemcpyHostToDevice));
-
-  // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
-  double *d_Idata = nullptr;
-  double *d_Qdata = nullptr;
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(double))));
-  CHECK_CUDA_ERROR(
-      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(double))));
-
-  // 将所有通道数据循环拷贝到GPU显存
-  size_t copySize = signalLength * sizeof(double);
-  for (int i = 0; i < numChannels; i++)
-  {
-    // copy 原始的idata 到gpu显存
-    double *dst_idata = d_Idata + i * signalLength;
-    const void *src_idata = origIdata[i].data();
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
-
-    // copy 原始的qdata 到gpu显存
-    double *dst_qdata = d_Qdata + i * signalLength;
-    const void *src_qdata = origQdata[i].data();
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
-  }
-
-  // 申请重采样后输出信号的GPU显存
-  size_t totalsize = numResults * numChannels * alignSignalLength * sizeof(double);
-  double *d_outputIdata = nullptr;
-  double *d_outputQdata = nullptr;
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, totalsize));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, totalsize));
-
-  // 初始化为0
-  CHECK_CUDA_ERROR(cudaMemset(d_outputIdata, 0, totalsize));
-  CHECK_CUDA_ERROR(cudaMemset(d_outputQdata, 0, totalsize));
-
-  // 线程数配置，总的线程数：numChannels * numResults
-  dim3 block(numChannels);
-  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
-  ShiftingAndResamplingKernelV2<<<grid, block>>>(
-      d_Idata, d_Qdata, d_downFactor, d_frequency, numResults,
-      numChannels, signalLength, CurrentRealfreq, alignSignalLength,
-      d_outputIdata, d_outputQdata);
-
-  // copy重采样计算结果到主存
-  // outputIdata 确保申请的内存空间够
-  // outputQdata 确保申请的内存空间够
-  // 存储格式：[numResults][numChannels][lengthPerResults]
-  // 且在内存中是连续存放的
-  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(double)),
-                              cudaMemcpyDeviceToHost));
-
-  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(double)),
-                              cudaMemcpyDeviceToHost));
-
-  // 释放显存
-  if (d_downFactor)
-  {
-    cudaFree(d_downFactor);
-    d_downFactor = nullptr;
-  }
-
-  if (d_outputLength)
-  {
-    cudaFree(d_outputLength);
-    d_outputLength = nullptr;
-  }
-
-  if (d_frequency)
-  {
-    cudaFree(d_frequency);
-    d_frequency = nullptr;
-  }
-
-  if (d_Idata)
-  {
-    cudaFree(d_Idata);
-    d_Idata = nullptr;
-  }
-
-  if (d_Qdata)
-  {
-    cudaFree(d_Qdata);
-    d_Qdata = nullptr;
-  }
-
-  if (d_outputIdata)
-  {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
-  }
-
-  if (d_outputIdata)
-  {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
-  }
-
-  return true;
-}
+#include "cuda_resample.h"
+#include "upfirdn_device.h"
+
+// CHECK_CUDA_ERROR: run a CUDA API call; on failure log it and throw std::runtime_error
+#define CHECK_CUDA_ERROR(call)                                               \
+  do                                                                         \
+  {                                                                          \
+    cudaError_t rc_ = (call);                                                \
+    if (rc_ != cudaSuccess)                                                  \
+    {                                                                        \
+      std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":"      \
+                << __LINE__ << ": " << cudaGetErrorString(rc_) << std::endl; \
+      throw std::runtime_error("CUDA error");                                \
+    }                                                                        \
+  } while (0)
+
+#define LOG_INFO(fmt, ...) /* logs to stderr with an [INFO] prefix */     \
+  fprintf(stderr, "[INFO] %s:%d (%s) " fmt, __FILE__, __LINE__, __func__, \
+          ##__VA_ARGS__)
+
+#define LOG_ERROR(fmt, ...) /* logs to stderr with an [ERROR] prefix */    \
+  fprintf(stderr, "[ERROR] %s:%d (%s) " fmt, __FILE__, __LINE__, __func__, \
+          ##__VA_ARGS__)
+
+// Host-side integer division that rounds up when there is a remainder
+inline int quotientCeil(int num1, int num2)
+{
+  const int quotient = num1 / num2;
+  return (num1 % num2 == 0) ? quotient : quotient + 1;
+}
+
+// Integer ceiling division (device side)
+__device__ __forceinline__ int dev_quotientCeil(int num1, int num2)
+{
+  // div()/div_t belong to the host C library and cannot be called from device code
+  return num1 / num2 + (num1 % num2 != 0);
+}
+
+// Greatest common divisor via the Euclidean algorithm (device side)
+__device__ __forceinline__ int dev_gcd(int a, int b)
+{
+  while (b != 0)
+  {
+    const int remainder = a % b;
+    a = b;
+    b = remainder;
+  }
+  return a;
+}
+
+// Writes the increasing sequence start, start + 1, ... into data[0 .. size-1]
+__device__ __forceinline__ void dev_iota_float(float *data, int size, float start)
+{
+  // Filled back to front; every element depends only on its own index
+  for (int idx = size - 1; idx >= 0; --idx)
+  {
+    data[idx] = start + float(idx);
+  }
+}
+
+// Sets every element of data[0 .. size-1] to value
+__device__ __forceinline__ void dev_fill_float(float *data, int size, float value)
+{
+  int remaining = size;
+  while (remaining-- > 0)
+  {
+    data[remaining] = value;
+  }
+}
+
+/**
+ * dev_firls_float
+ * Device-side least-squares linear-phase FIR design with unit band weights.
+ *
+ * @param result: output taps; (length + 1) floats are written
+ * @param length: filter order, i.e. the filter has (length + 1) taps
+ * @param freq: band edges, two per band; every entry is halved IN PLACE
+ * @param amplitude: desired amplitude at each band edge
+ * @param freqSize: number of entries in freq / amplitude (consumed in pairs)
+ * @return 0 on success, -1 on invalid sizes or failed device-heap allocation
+ */
+__device__ int dev_firls_float(float *result, int length, float *freq, const float *amplitude,
+                               int freqSize)
+{
+  // Need at least one band (weight[0] is read below) and a non-negative order
+  if (freqSize < 2 || length < 0)
+  {
+    return -1;
+  }
+
+  const float pi = float(M_PI);
+
+  // One weight per band
+  int weightSize = freqSize / 2;
+
+  int filterLength = length + 1;
+  length = (filterLength - 1) / 2;
+  // Odd or even number of taps
+  bool Nodd = filterLength & 1;
+
+  // k starts as 0..length; b and a hold length + 1 values for either parity
+  int kLength = length + 1;
+  int bLength = length + 1;
+  int aLength = bLength;
+
+  // Allocate all scratch buffers first so a single failure path frees them;
+  // separate early returns would leak whatever had been allocated before
+  float *weight = new float[weightSize];
+  float *k = new float[kLength];
+  float *b = new float[bLength];
+  float *a = new float[aLength];
+  if (weight == nullptr || k == nullptr || b == nullptr || a == nullptr)
+  {
+    delete[] weight;
+    delete[] k;
+    delete[] b;
+    delete[] a;
+    return -1;
+  }
+
+  // Every band is weighted 1
+  dev_fill_float(weight, weightSize, float(1.0));
+
+  // Halve the band edges
+  for (int i = 0; i < freqSize; i++)
+  {
+    freq[i] = freq[i] / float(2.0);
+  }
+
+  // k = 0, 1, 2, ...
+  dev_iota_float(k, kLength, float(0.0));
+
+  if (!Nodd)
+  {
+    // Even tap count: k = 0.5, 1.5, ...
+    for (int i = 0; i < kLength; i++)
+    {
+      k[i] += float(0.5);
+    }
+  }
+  else
+  {
+    // Odd tap count: drop k[0], like k.erase(k.begin()). The loop stops one
+    // element early so that k[i + 1] never reads past the end of the buffer.
+    for (int i = 0; i + 1 < kLength; i++)
+    {
+      k[i] = k[i + 1];
+    }
+    kLength--;
+  }
+
+  dev_fill_float(b, bLength, float(0.0));
+
+  float b0 = float(0.0);
+  // "i + 1 < freqSize" keeps the pair reads in bounds when freqSize is odd
+  for (int i = 0; i + 1 < freqSize; i += 2)
+  {
+    float Fi = freq[i];
+    float Fip1 = freq[i + 1];
+    float ampi = amplitude[i];
+    float ampip1 = amplitude[i + 1];
+    float wt2 = weight[i / 2] * weight[i / 2];
+    float m_s = (ampip1 - ampi) / (Fip1 - Fi); // slope across the band
+    float b1 = ampi - (m_s * Fi);              // intercept of the band
+
+    if (Nodd)
+    {
+      b0 += (b1 * (Fip1 - Fi)) +
+            m_s / float(2.0) * (Fip1 * Fip1 - Fi * Fi) * wt2;
+    }
+
+    for (int j = 0; j < kLength; j++)
+    {
+      float kj = k[j];
+      float xHi = float(2.0) * kj * Fip1;
+      float xLo = float(2.0) * kj * Fi;
+
+      // Cosine term: cos(2*pi*k*F) -- the tap index k belongs in the argument
+      b[j] += (m_s / (float(4.0) * pi * pi) *
+               (cospif(xHi) - cospif(xLo)) / (kj * kj)) *
+              wt2;
+
+      // Sinc term: sinc(x) = sin(pi*x) / (pi*x) with sinc(0) = 1, not sin(x)
+      float sincHi = (xHi == float(0.0)) ? float(1.0) : sinpif(xHi) / (pi * xHi);
+      float sincLo = (xLo == float(0.0)) ? float(1.0) : sinpif(xLo) / (pi * xLo);
+      b[j] += (Fip1 * (m_s * Fip1 + b1) * sincHi -
+               Fi * (m_s * Fi + b1) * sincLo) *
+              wt2;
+    }
+  }
+
+  // Odd tap count: shift b right by one and put b0 in front
+  if (Nodd)
+  {
+    for (int i = kLength; i > 0; i--)
+    {
+      b[i] = b[i - 1];
+    }
+    b[0] = b0;
+  }
+
+  float w0 = weight[0];
+
+  // a = 4 * w0^2 * b, written reversed into the first half of result
+  for (int i = 0; i < aLength; i++)
+  {
+    a[i] = w0 * w0 * float(4.0) * b[i];
+    result[aLength - 1 - i] = a[i];
+  }
+
+  // With an odd tap count a[0] is the centre tap and is not repeated
+  int it = Nodd ? 1 : 0;
+
+  // Halve the first half and append a[it..] (halved) as the second half
+  for (int i = 0; i < aLength; i++)
+  {
+    result[i] = result[i] * float(0.5);
+    if ((i + it) < aLength)
+    {
+      result[aLength + i] = a[i + it] * float(0.5);
+    }
+  }
+
+  delete[] weight;
+  delete[] k;
+  delete[] b;
+  delete[] a;
+  return 0;
+}
+
+// Modified Bessel function of the first kind I_n(x), device side (power series)
+__device__ float dev_cyl_bessel_i_float(int n, float x)
+{
+  const int order = (n < 0) ? -n : n; // I_{-n}(x) == I_n(x) for integer n
+  float term = float(1); // becomes the k = 0 term (x/2)^order / order!
+  for (int i = 1; i <= order; ++i)
+    term *= float(0.5) * x / float(i);
+  float sum = term;
+  // I_n(x) = sum_k (x/2)^(2k+n) / (k! (k+n)!); stop once a term is negligible
+  for (int k = 1; k <= 64 && fabsf(term) > float(1e-8) * fabsf(sum); ++k)
+    sum += (term *= float(0.25) * x * x / (float(k) * float(k + order)));
+  return sum;
+}
+
+// Device-side Kaiser window: fills window[0 .. order-1] for shape parameter bta
+__device__ void dev_kaiser_float(float *window, int order, float bta)
+{
+  // window[i] = B(bta * sqrt(1 - r^2)) / B(bta), with B(x) =
+  // dev_cyl_bessel_i_float(0, x) and r = (i - centre) / centre
+  const float centre = (order - float(1)) / float(2);
+  const float scale = dev_cyl_bessel_i_float(0, bta);
+  for (int tap = order - 1; tap >= 0; --tap)
+  {
+    const float ratio = (tap - centre) / centre;
+    const float arg = bta * sqrt(float(1) - pow(ratio, float(2)));
+    window[tap] = dev_cyl_bessel_i_float(0, arg) / scale;
+  }
+}
+
+/**
+ * dev_resample_float
+ * Device-side rational resampling by upFactor / downFactor: builds a windowed
+ * low-pass filter (dev_firls_float shaped by dev_kaiser_float), runs
+ * upfirdn_device and copies the delay-compensated samples to outputSignal.
+ *
+ * All scratch buffers are allocated with in-kernel new[] and released before
+ * the function returns, on the failure paths as well.
+ *
+ * @param upFactor: upsampling factor; nothing is written unless > 0
+ * @param downFactor: downsampling factor; nothing is written unless > 0
+ * @param inputSignal: input samples
+ * @param inputSize: number of input samples; nothing is written unless > 0
+ * @param outputSignal: receives ceil(inputSize * up / down) samples, where
+ *        up / down are the two factors divided by their GCD
+ */
+__device__ void dev_resample_float(const int upFactor, const int downFactor,
+                                   const float *inputSignal, const int inputSize,
+                                   float *outputSignal)
+{
+  // The filter has 2 * n * maxFactor + 1 taps; bta goes to dev_kaiser_float
+  const int n = 10;
+  const float bta = float(5.0);
+
+  if (upFactor <= 0 || downFactor <= 0 || inputSize <= 0)
+  {
+    return;
+  }
+
+  // The parameters are const, so the reduced ratio lives in local copies
+  const int gcd_o = dev_gcd(upFactor, downFactor);
+  const int up = upFactor / gcd_o;
+  const int down = downFactor / gcd_o;
+
+  if (up == down)
+  {
+    // Ratio 1:1 -- copy the samples; re-pointing the local outputSignal
+    // parameter would leave the caller's buffer untouched
+    for (int i = 0; i < inputSize; i++)
+    {
+      outputSignal[i] = inputSignal[i];
+    }
+    return;
+  }
+
+  const int outputSize = dev_quotientCeil(inputSize * up, down);
+
+  const int maxFactor = (up > down) ? up : down;
+  const float firlsFreq = float(1.0) / float(2.0) / static_cast<float>(maxFactor);
+
+  // Low-pass specification: amplitude 1 up to 2 * firlsFreq, 0 above it
+  float firlsFreqsV[4] = {float(0.0), float(2.0) * firlsFreq,
+                          float(2.0) * firlsFreq, float(1.0)};
+  float firlsAmplitudeV[4] = {float(1.0), float(1.0), float(0.0), float(0.0)};
+  const int freqSize = 4;
+
+  const int length = 2 * n * maxFactor + 1;
+  const int coefficientsLength = length;
+
+  // Leading zeros round (half length + padding) up to a multiple of down
+  int lengthHalf = (length - 1) / 2;
+  const int leadZeros = down - lengthHalf % down;
+  lengthHalf += leadZeros;
+  const int delay = lengthHalf / down;
+
+  // Trailing zeros: enough that outputSize samples exist after the delay.
+  // The count is based on the real filter length (padding + coefficients);
+  // the buffer is allocated only once the final length is known, so the
+  // trailing zeros can never run past its end.
+  const int baseLength = leadZeros + coefficientsLength;
+  int trailZeros = 0;
+  while (dev_quotientCeil((inputSize - 1) * up + baseLength + trailZeros, down) -
+             delay <
+         outputSize)
+  {
+    trailZeros++;
+  }
+  const int filterLength = baseLength + trailZeros;
+
+  // Size of the buffer handed to upfirdn_device for its output
+  int paddedCoefCount = filterLength;
+  while (paddedCoefCount % up)
+  {
+    paddedCoefCount++;
+  }
+  const int coefsPerPhase = paddedCoefCount / up;
+  const int padding = coefsPerPhase - 1;
+  const int outputCount = ((inputSize + padding) * up + down - 1) / down;
+
+  // Allocate every buffer up front so that one exit path releases them all
+  float *coefficients = new float[coefficientsLength];
+  float *window = new float[length];
+  float *filter = new float[filterLength];
+  float *results = new float[outputCount];
+  bool ok = (coefficients != nullptr && window != nullptr &&
+             filter != nullptr && results != nullptr);
+
+  if (ok && dev_firls_float(coefficients, length - 1, firlsFreqsV,
+                            firlsAmplitudeV, freqSize) == -1)
+  {
+    // Device code can only use printf (LOG_ERROR expands to fprintf(stderr))
+    printf("[ERROR] %s:%d (%s) dev_firls调用失败\n", __FILE__, __LINE__,
+           "dev_resample_float");
+    ok = false;
+  }
+
+  // Window the design, assemble the padded filter and run upfirdn_device
+  if (ok)
+  {
+    dev_kaiser_float(window, length, bta);
+
+    // filter = [leadZeros zeros | up * window * coefficients | trailZeros zeros]
+    int pos = 0;
+    for (int i = 0; i < leadZeros; i++)
+    {
+      filter[pos++] = float(0.0);
+    }
+
+    for (int i = 0; i < coefficientsLength; i++)
+    {
+      filter[pos++] = coefficients[i] * (up * window[i]);
+    }
+
+    for (int i = 0; i < trailZeros; i++)
+    {
+      filter[pos++] = float(0.0);
+    }
+
+    int resultsCount = 0;
+    upfirdn_device(up, down, inputSignal, inputSize, filter,
+                   filterLength, results, &resultsCount);
+
+    // Skip the filter delay and keep outputSize samples
+    for (int i = 0; i < outputSize; i++)
+    {
+      outputSignal[i] = results[i + delay];
+    }
+  }
+
+  delete[] coefficients;
+  delete[] window;
+  delete[] filter;
+  delete[] results;
+  return;
+}
+
+/**
+ * ShiftingAndResamplingKernelFloatV1
+ * Resampling kernel: frequency-shifts the original signal, then resamples it.
+ * Each GPU thread handles one channel of one detection result, so
+ * numChannels * numResults threads do work; only the x dimension is indexed.
+ *
+ * @param origIdata: original I data, read as [channel][signalLength]
+ * @param origQdata: original Q data, read as [channel][signalLength]
+ * @param VDownFactor: downsampling factor of each detection result
+ * @param VFrequency: frequency of each detection result
+ * @param VOutputLength: resampled output length of each detection result
+ *        (integer counts, matching the int buffer the host side uploads)
+ * @param numResults: number of detection results in the frame
+ * @param numChannels: number of signal channels
+ * @param signalLength: number of samples per channel
+ * @param CurrentRealfreq: current actual frequency
+ * @param outputIdata: all resampled I data; the caller must size the buffer
+ * @param outputQdata: all resampled Q data; the caller must size the buffer
+ */
+__global__ void ShiftingAndResamplingKernelFloatV1(
+    const float *__restrict__ origIdata, const float *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
+    const int *__restrict__ VOutputLength, const int numResults,
+    const int numChannels, const int signalLength, const float CurrentRealfreq,
+    float *__restrict__ outputIdata, float *__restrict__ outputQdata)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= numChannels * numResults)
+    return;
+
+  // Each GPU thread handles one channel of one detection result
+  int ResIdx = idx / numChannels; // detection result index
+  int chIdx = idx % numChannels;  // channel index
+
+  const float sampling_rate = float(245.76e6);
+
+  float frequency = VFrequency[ResIdx];
+  // NOTE(review): the 1e6 factor suggests the frequencies are in MHz -- confirm
+  float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
+
+  // Input rows of this thread's channel. Plain pointers: a non-const
+  // reference (auto &) cannot bind to the temporary the addition yields.
+  const float *I_orig = origIdata + chIdx * signalLength;
+  const float *Q_orig = origQdata + chIdx * signalLength;
+
+  // Frequency shift: multiply by a generated local-oscillator signal
+  float *I_shifted = new float[signalLength];
+  float *Q_shifted = new float[signalLength];
+  if (I_shifted == nullptr || Q_shifted == nullptr)
+  {
+    // delete[] of nullptr is a no-op, so a partial allocation is not leaked
+    delete[] I_shifted;
+    delete[] Q_shifted;
+    return;
+  }
+  for (int i = 0; i < signalLength; i++)
+  {
+    float phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+    float cosVal = cosf(phase);
+    float sinVal = sinf(phase);
+    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
+  }
+
+  // Upsampling factor is 1; the downsampling factor comes from the result
+  int upFactor = 1;
+  int downFactor = VDownFactor[ResIdx];
+
+  // Total output length of all results that precede this one
+  int beforeTotalLength = 0;
+  for (int i = 0; i < ResIdx; i++)
+  {
+    beforeTotalLength += VOutputLength[i];
+  }
+  // Offset of this result's first output sample
+  int offset = beforeTotalLength * numChannels;
+
+  // Output length of this result (lengths may differ between results)
+  int outputLength = VOutputLength[ResIdx];
+
+  // Destination rows for this result and channel
+  float *I_resampled = outputIdata + offset + chIdx * outputLength;
+  float *Q_resampled = outputQdata + offset + chIdx * outputLength;
+
+  // Resample
+  dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled);
+  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+
+  // Release the per-thread scratch buffers
+  delete[] I_shifted;
+  delete[] Q_shifted;
+}
+
+/**
+ * ShiftingAndResamplingKernelFloatV2
+ * Resampling kernel: frequency-shifts the original signal, then resamples it.
+ * Each GPU thread handles one channel of one detection result, so
+ * numChannels * numResults threads do work; only the x dimension is indexed.
+ *
+ * @param origIdata: original I data, read as [channel][signalLength]
+ * @param origQdata: original Q data, read as [channel][signalLength]
+ * @param VDownFactor: downsampling factor of each detection result
+ * @param VFrequency: frequency of each detection result
+ * @param numResults: number of detection results in the frame
+ * @param numChannels: number of signal channels
+ * @param signalLength: number of original samples per channel
+ * @param CurrentRealfreq: current actual frequency
+ * @param alignSignalLength: stride, in samples, between two output rows
+ * @param outputIdata: resampled I data; row (result * numChannels + channel)
+ *        starts at that index times alignSignalLength; caller sizes the buffer
+ * @param outputQdata: resampled Q data, addressed the same way as outputIdata
+ */
+__global__ void ShiftingAndResamplingKernelFloatV2(
+    const float *__restrict__ origIdata, const float *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
+    const int numResults, const int numChannels, const int signalLength,
+    const float CurrentRealfreq, const int alignSignalLength,
+    float *__restrict__ outputIdata, float *__restrict__ outputQdata)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= numChannels * numResults)
+    return;
+
+  // Each GPU thread handles one channel of one detection result
+  int ResIdx = idx / numChannels; // detection result index
+  int chIdx = idx % numChannels;  // channel index
+
+  const float sampling_rate = float(245.76e6);
+
+  float frequency = VFrequency[ResIdx];
+  float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
+
+  // Input rows of this thread's channel. Plain pointers: a non-const
+  // reference (auto &) cannot bind to the temporary the addition yields.
+  const float *I_orig = origIdata + chIdx * signalLength;
+  const float *Q_orig = origQdata + chIdx * signalLength;
+
+  // Frequency shift: multiply by a generated local-oscillator signal
+  float *I_shifted = new float[signalLength];
+  float *Q_shifted = new float[signalLength];
+  if (I_shifted == nullptr || Q_shifted == nullptr)
+  {
+    // delete[] of nullptr is a no-op, so a partial allocation is not leaked
+    delete[] I_shifted;
+    delete[] Q_shifted;
+    return;
+  }
+  for (int i = 0; i < signalLength; i++)
+  {
+    float phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+    float cosVal = cosf(phase);
+    float sinVal = sinf(phase);
+    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
+  }
+
+  // Upsampling factor is 1; the downsampling factor comes from the result
+  int upFactor = 1;
+  int downFactor = VDownFactor[ResIdx];
+
+  // Destination rows. NOTE(review): dev_resample_float is not told
+  // alignSignalLength -- confirm the resampled length never exceeds it
+  float *I_resampled = outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+  float *Q_resampled = outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+
+  // Resample
+  dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled);
+  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+
+  // Release the per-thread scratch buffers
+  delete[] I_shifted;
+  delete[] Q_shifted;
+}
+
+/**
+ * ShiftAndResampleSignalFloatV1
+ * Host-side entry point: uploads the original I/Q samples and the per-result
+ * parameters to the GPU, launches ShiftingAndResamplingKernelFloatV1 and
+ * copies the resampled output back to host memory.
+ *
+ * @param origIdata original I data, one vector per channel; channel 0 defines
+ *        the signal length and every channel must hold at least that many
+ *        samples
+ * @param origQdata original Q data, same layout as origIdata
+ * @param outputLength output signal length after resampling, one entry per
+ *        result (at least numResults entries)
+ * @param downFactor down-sampling factor, one entry per result
+ * @param detectFreq frequency, one entry per result
+ * @param outputTotalLength total length of one channel's resampled signal
+ * @param numResults number of detection results
+ * @param numChannels number of signal channels
+ * @param CurrentRealfreq current actual frequency
+ * @param outputIdata host buffer receiving the resampled I data, stored
+ *        contiguously; the caller must provide room for
+ *        numChannels * outputTotalLength floats. Documented layout:
+ *        [numResults][numChannels][lengthPerResult] (the exact offsets are
+ *        computed inside the kernel - confirm there)
+ * @param outputQdata host buffer receiving the resampled Q data, same size
+ *        and layout as outputIdata
+ * @return false when the arguments are rejected by the validation below;
+ *         true once both result buffers have been copied back. CUDA API
+ *         failures are reported through CHECK_CUDA_ERROR.
+ */
+bool ShiftAndResampleSignalFloatV1(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    std::vector<int> &outputLength,
+    std::vector<int> &downFactor,
+    std::vector<float> &detectFreq,
+    const int outputTotalLength,
+    const int numResults,
+    const int numChannels,
+    const float CurrentRealfreq,
+    float *outputIdata,
+    float *outputQdata)
+{
+  // Reject arguments that would make the byte counts or the launch
+  // configuration invalid (numChannels == 0 would also divide by zero when
+  // the grid size is computed).
+  if (numResults <= 0 || numChannels <= 0 || outputTotalLength <= 0 ||
+      outputIdata == nullptr || outputQdata == nullptr)
+  {
+    return false;
+  }
+
+  // The raw copies below read numResults / numChannels elements from the
+  // host vectors, so each of them must be at least that long.
+  const size_t resultCount = static_cast<size_t>(numResults);
+  const size_t channelCount = static_cast<size_t>(numChannels);
+  if (origIdata.size() < channelCount || origQdata.size() < channelCount ||
+      outputLength.size() < resultCount || downFactor.size() < resultCount ||
+      detectFreq.size() < resultCount)
+  {
+    return false;
+  }
+
+  // Per-channel signal length: channel 0 is the reference.
+  const size_t sampleCount = origIdata[0].size();
+  if (sampleCount == 0)
+  {
+    return false;
+  }
+  for (size_t ch = 0; ch < channelCount; ++ch)
+  {
+    if (origIdata[ch].size() < sampleCount || origQdata[ch].size() < sampleCount)
+    {
+      return false;
+    }
+  }
+  const int signalLength = static_cast<int>(sampleCount);
+
+  // Byte counts are computed in size_t so the products cannot overflow int.
+  const size_t intParamBytes = resultCount * sizeof(int);
+  const size_t floatParamBytes = resultCount * sizeof(float);
+  const size_t channelBytes = sampleCount * sizeof(float);
+  const size_t inputBytes = channelCount * channelBytes;
+  const size_t outputBytes =
+      channelCount * static_cast<size_t>(outputTotalLength) * sizeof(float);
+
+  // ==== Prepare the launch of ShiftingAndResamplingKernelFloatV1 ====
+  // Device copies of the per-result parameters.
+  int *d_downFactor = nullptr;
+  int *d_outputLength = nullptr;
+  float *d_frequency = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, intParamBytes));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, intParamBytes));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, floatParamBytes));
+
+  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, downFactor.data(), intParamBytes,
+                              cudaMemcpyHostToDevice));
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, detectFreq.data(), floatParamBytes,
+                              cudaMemcpyHostToDevice));
+  CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, outputLength.data(),
+                              intParamBytes, cudaMemcpyHostToDevice));
+
+  // Device copies of the original I/Q samples, channels stored back to back.
+  float *d_Idata = nullptr;
+  float *d_Qdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_Idata, inputBytes));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_Qdata, inputBytes));
+
+  for (size_t ch = 0; ch < channelCount; ++ch)
+  {
+    const size_t channelOffset = ch * sampleCount;
+    CHECK_CUDA_ERROR(cudaMemcpy(d_Idata + channelOffset, origIdata[ch].data(),
+                                channelBytes, cudaMemcpyHostToDevice));
+    CHECK_CUDA_ERROR(cudaMemcpy(d_Qdata + channelOffset, origQdata[ch].data(),
+                                channelBytes, cudaMemcpyHostToDevice));
+  }
+
+  // Device buffers for the resampled output. They are zeroed first (as the V2
+  // variant does) so that any region the kernel does not write is copied back
+  // as zeros rather than uninitialized memory.
+  float *d_outputIdata = nullptr;
+  float *d_outputQdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, outputBytes));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, outputBytes));
+  CHECK_CUDA_ERROR(cudaMemset(d_outputIdata, 0, outputBytes));
+  CHECK_CUDA_ERROR(cudaMemset(d_outputQdata, 0, outputBytes));
+
+  // Launch configuration: numChannels threads per block, enough blocks for
+  // numChannels * numResults threads in total.
+  dim3 block(numChannels);
+  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
+
+  ShiftingAndResamplingKernelFloatV1<<<grid, block>>>(
+      d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
+      numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
+  // A kernel launch returns no status; an invalid configuration is only
+  // visible through cudaGetLastError().
+  CHECK_CUDA_ERROR(cudaGetLastError());
+
+  // Copy the results back to host memory. The source is device memory, so the
+  // direction must be device-to-host. These blocking copies run after the
+  // kernel, which was launched on the default stream.
+  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata, outputBytes,
+                              cudaMemcpyDeviceToHost));
+  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata, outputBytes,
+                              cudaMemcpyDeviceToHost));
+
+  // Release every device allocation, including d_outputQdata.
+  cudaFree(d_downFactor);
+  cudaFree(d_outputLength);
+  cudaFree(d_frequency);
+  cudaFree(d_Idata);
+  cudaFree(d_Qdata);
+  cudaFree(d_outputIdata);
+  cudaFree(d_outputQdata);
+
+  return true;
+}
+
+/**
+ * ShiftAndResampleSignalFloatV2
+ * Host-side entry point: uploads the original I/Q samples and the per-result
+ * parameters to the GPU, launches ShiftingAndResamplingKernelFloatV2 and
+ * copies the resampled output back to host memory.
+ *
+ * @param origIdata original I data, one vector per channel; channel 0 defines
+ *        the signal length and every channel must hold at least that many
+ *        samples
+ * @param origQdata original Q data, same layout as origIdata
+ * @param downFactor down-sampling factor, one entry per result (at least
+ *        numResults entries)
+ * @param detectFreq frequency, one entry per result
+ * @param alignSignalLength aligned length of each resampled signal
+ * @param numResults number of detection results
+ * @param numChannels number of signal channels
+ * @param CurrentRealfreq current actual frequency
+ * @param outputIdata host buffer receiving the resampled I data, stored
+ *        contiguously as [numResults][numChannels][alignSignalLength]; the
+ *        caller must provide room for
+ *        numResults * numChannels * alignSignalLength floats
+ * @param outputQdata host buffer receiving the resampled Q data, same size
+ *        and layout as outputIdata
+ * @return false when the arguments are rejected by the validation below;
+ *         true once both result buffers have been copied back. CUDA API
+ *         failures are reported through CHECK_CUDA_ERROR.
+ */
+bool ShiftAndResampleSignalFloatV2(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    std::vector<int> &downFactor,
+    std::vector<float> &detectFreq,
+    const int alignSignalLength,
+    const int numResults,
+    const int numChannels,
+    const float CurrentRealfreq,
+    float *outputIdata,
+    float *outputQdata)
+{
+  // Reject arguments that would make the byte counts or the launch
+  // configuration invalid (numChannels == 0 would also divide by zero when
+  // the grid size is computed).
+  if (numResults <= 0 || numChannels <= 0 || alignSignalLength <= 0 ||
+      outputIdata == nullptr || outputQdata == nullptr)
+  {
+    return false;
+  }
+
+  // The raw copies below read numResults / numChannels elements from the
+  // host vectors, so each of them must be at least that long.
+  const size_t resultCount = static_cast<size_t>(numResults);
+  const size_t channelCount = static_cast<size_t>(numChannels);
+  if (origIdata.size() < channelCount || origQdata.size() < channelCount ||
+      downFactor.size() < resultCount || detectFreq.size() < resultCount)
+  {
+    return false;
+  }
+
+  // Per-channel signal length: channel 0 is the reference.
+  const size_t sampleCount = origIdata[0].size();
+  if (sampleCount == 0)
+  {
+    return false;
+  }
+  for (size_t ch = 0; ch < channelCount; ++ch)
+  {
+    if (origIdata[ch].size() < sampleCount || origQdata[ch].size() < sampleCount)
+    {
+      return false;
+    }
+  }
+  const int signalLength = static_cast<int>(sampleCount);
+
+  // Byte counts are computed in size_t so the products cannot overflow int.
+  const size_t intParamBytes = resultCount * sizeof(int);
+  const size_t floatParamBytes = resultCount * sizeof(float);
+  const size_t channelBytes = sampleCount * sizeof(float);
+  const size_t inputBytes = channelCount * channelBytes;
+  const size_t totalsize = resultCount * channelCount *
+                           static_cast<size_t>(alignSignalLength) *
+                           sizeof(float);
+
+  // ==== Prepare the launch of ShiftingAndResamplingKernelFloatV2 ====
+  // Device copies of the per-result parameters.
+  int *d_downFactor = nullptr;
+  float *d_frequency = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, intParamBytes));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, floatParamBytes));
+
+  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, downFactor.data(), intParamBytes,
+                              cudaMemcpyHostToDevice));
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, detectFreq.data(), floatParamBytes,
+                              cudaMemcpyHostToDevice));
+
+  // Device copies of the original I/Q samples, channels stored back to back.
+  float *d_Idata = nullptr;
+  float *d_Qdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_Idata, inputBytes));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_Qdata, inputBytes));
+
+  for (size_t ch = 0; ch < channelCount; ++ch)
+  {
+    const size_t channelOffset = ch * sampleCount;
+    CHECK_CUDA_ERROR(cudaMemcpy(d_Idata + channelOffset, origIdata[ch].data(),
+                                channelBytes, cudaMemcpyHostToDevice));
+    CHECK_CUDA_ERROR(cudaMemcpy(d_Qdata + channelOffset, origQdata[ch].data(),
+                                channelBytes, cudaMemcpyHostToDevice));
+  }
+
+  // Device buffers for the resampled output, zero-initialized so that any
+  // part of an aligned slot the kernel does not write reads back as zeros.
+  float *d_outputIdata = nullptr;
+  float *d_outputQdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, totalsize));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, totalsize));
+  CHECK_CUDA_ERROR(cudaMemset(d_outputIdata, 0, totalsize));
+  CHECK_CUDA_ERROR(cudaMemset(d_outputQdata, 0, totalsize));
+
+  // Launch configuration: numChannels threads per block, enough blocks for
+  // numChannels * numResults threads in total.
+  dim3 block(numChannels);
+  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
+  ShiftingAndResamplingKernelFloatV2<<<grid, block>>>(
+      d_Idata, d_Qdata, d_downFactor, d_frequency, numResults,
+      numChannels, signalLength, CurrentRealfreq, alignSignalLength,
+      d_outputIdata, d_outputQdata);
+  // A kernel launch returns no status; an invalid configuration is only
+  // visible through cudaGetLastError().
+  CHECK_CUDA_ERROR(cudaGetLastError());
+
+  // Copy the results back to host memory. These blocking copies run after
+  // the kernel, which was launched on the default stream.
+  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata, totalsize,
+                              cudaMemcpyDeviceToHost));
+  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata, totalsize,
+                              cudaMemcpyDeviceToHost));
+
+  // Release every device allocation, including d_outputQdata.
+  cudaFree(d_downFactor);
+  cudaFree(d_frequency);
+  cudaFree(d_Idata);
+  cudaFree(d_Qdata);
+  cudaFree(d_outputIdata);
+  cudaFree(d_outputQdata);
+
+  return true;
+}
\ No newline at end of file
diff --git a/cuda_resample_float.h b/cuda_resample_float.h
new file mode 100644
index 0000000..aa981e9
--- /dev/null
+++ b/cuda_resample_float.h
@@ -0,0 +1,79 @@
+#ifndef CUDA_RESAMPLE_H
+#define CUDA_RESAMPLE_H
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <thrust/device_vector.h>
+
+#include <cmath>
+#include <map>
+#include <type_traits>
+#include <vector>
+
+#ifndef M_PI
+#define M_PI 3.141592653589793238462643
+#endif
+
+/**
+ * ShiftAndResampleSignalFloatV1
+ * Resampling function: performs the frequency shift and the resampling of
+ * the original signal.
+ *
+ * @param origIdata original I data
+ * @param origQdata original Q data
+ * @param outputLength output signal length after resampling, one entry per bandwidth
+ * @param downFactor down-sampling factor, one entry per bandwidth
+ * @param detectFreq frequency, one entry per bandwidth
+ * @param outputTotalLength total length of one channel's signal after resampling
+ * @param numResults number of detection results
+ * @param numChannels number of signal channels
+ * @param CurrentRealfreq current actual frequency
+ * @param outputIdata
+ * resampled I data, stored contiguously, layout: [numResults][numChannels][lengthPerResult]
+ * @param outputQdata
+ * resampled Q data, stored contiguously, layout: [numResults][numChannels][lengthPerResult]
+ * @return true or false
+ */
+bool ShiftAndResampleSignalFloatV1(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    std::vector<int> &outputLength,
+    std::vector<int> &downFactor,
+    std::vector<float> &detectFreq,
+    const int outputTotalLength,
+    const int numResults,
+    const int numChannels,
+    const float CurrentRealfreq,
+    float *outputIdata,
+    float *outputQdata);
+
+/**
+ * ShiftAndResampleSignalFloatV2
+ * Resampling function: performs the frequency shift and the resampling of
+ * the original signal.
+ *
+ * @param origIdata original I data
+ * @param origQdata original Q data
+ * @param downFactor down-sampling factor, one entry per bandwidth
+ * @param detectFreq frequency, one entry per bandwidth
+ * @param alignSignalLength aligned length of the signal after resampling
+ * @param numResults number of detection results
+ * @param numChannels number of signal channels
+ * @param CurrentRealfreq current actual frequency
+ * @param outputIdata
+ * resampled I data, stored contiguously, layout: [numResults][numChannels][lengthPerResult]
+ * @param outputQdata
+ * resampled Q data, stored contiguously, layout: [numResults][numChannels][lengthPerResult]
+ * @return true or false
+ */
+bool ShiftAndResampleSignalFloatV2(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    std::vector<int> &downFactor,
+    std::vector<float> &detectFreq,
+    const int alignSignalLength,
+    const int numResults,
+    const int numChannels,
+    const float CurrentRealfreq,
+    float *outputIdata,
+    float *outputQdata);
+
+#endif // CUDA_RESAMPLE_H
-- 
Gitee


From a1b2e3202ab1bac54893c2f92f25640f36dd6652 Mon Sep 17 00:00:00 2001
From: QAiCode <229242333@qq.com>
Date: Wed, 17 Dec 2025 17:21:37 +0800
Subject: [PATCH 14/27] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=9C=80=E6=96=B0?=
 =?UTF-8?q?=E7=BC=96=E8=AF=91=E9=94=99=E8=AF=AF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cuda_resample_double.cu | 41 ++++++++++++++---------------------------
 cuda_resample_float.cu  | 41 ++++++++++++++---------------------------
 mainwindow.cpp          | 18 ++++++++++--------
 mainwindow.h            | 10 ++++++----
 4 files changed, 44 insertions(+), 66 deletions(-)

diff --git a/cuda_resample_double.cu b/cuda_resample_double.cu
index 67ad85c..afcade9 100644
--- a/cuda_resample_double.cu
+++ b/cuda_resample_double.cu
@@ -1,4 +1,4 @@
-#include "cuda_resample.h"
+#include "cuda_resample_double.h"
 #include "upfirdn_device.h"
 
 // CHECK_CUDA_ERROR：cuda api调用错误处理
@@ -14,14 +14,6 @@
     }                                                                        \
   } while (0)
 
-#define LOG_INFO(fmt, ...)                                                \
-  fprintf(stderr, "[INFO] %s:%d (%s) " fmt, __FILE__, __LINE__, __func__, \
-          ##__VA_ARGS__)
-
-#define LOG_ERROR(fmt, ...)                                                \
-  fprintf(stderr, "[ERROR] %s:%d (%s) " fmt, __FILE__, __LINE__, __func__, \
-          ##__VA_ARGS__)
-
 inline int quotientCeil(int num1, int num2)
 {
   if (num1 % num2 != 0)
@@ -32,8 +24,8 @@ inline int quotientCeil(int num1, int num2)
 // 整数向上取整除法
 __device__ __forceinline__ int dev_quotientCeil(int num1, int num2)
 {
-  div_t result = div(num1, num2);
-  return result.quot + (result.rem != 0);
+  // 标准的上取整公式：(a + b - 1) / b
+  return (num1 + num2 - 1) / num2;
 }
 
 // CUDA设备端GCD函数:最大公约数
@@ -257,8 +249,8 @@ __device__ void dev_kaiser_double(double *window, int order, double bta)
   }
 }
 
-__device__ void dev_resample_double(const int upFactor, const int downFactor,
-                                    const double *inputSignal, const int inputSize,
+__device__ void dev_resample_double(int upFactor, int downFactor,
+                                    double *inputSignal, const int inputSize,
                                     double *outputSignal)
 {
   const int n = 10;
@@ -310,7 +302,6 @@ __device__ void dev_resample_double(const int upFactor, const int downFactor,
                              freqSize);
   if (ret == -1)
   {
-    LOG_ERROR("dev_firls调用失败\n");
     return;
   }
 
@@ -425,7 +416,7 @@ __device__ void dev_resample_double(const int upFactor, const int downFactor,
 __global__ void ShiftingAndResamplingKernelDoubleV1(
     const double *__restrict__ origIdata, const double *__restrict__ origQdata,
     const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency,
-    const double *__restrict__ VOutputLength, const int numResults,
+    const int *__restrict__ VOutputLength, const int numResults,
     const int numChannels, const int signalLength, const double CurrentRealfreq,
     double *__restrict__ outputIdata, double *__restrict__ outputQdata)
 {
@@ -443,8 +434,8 @@ __global__ void ShiftingAndResamplingKernelDoubleV1(
   double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
   // 获取当前线程处理的通道数据地址
-  auto &I_orig = origIdata + chIdx * signalLength;
-  auto &Q_orig = origQdata + chIdx * signalLength;
+  const auto I_orig = origIdata + chIdx * signalLength;
+  const auto Q_orig = origQdata + chIdx * signalLength;
 
   // 移频：生成本振信号并相乘
   double *I_shifted = new double[signalLength];
@@ -483,9 +474,9 @@ __global__ void ShiftingAndResamplingKernelDoubleV1(
   int outputLength = VOutputLength[ResIdx];
 
   // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  auto &I_resampled = outputIdata + offset + chIdx * outputLength;
+  auto I_resampled = outputIdata + offset + chIdx * outputLength;
   // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  auto &Q_resampled = outputQdata + offset + chIdx * outputLength;
+  auto Q_resampled = outputQdata + offset + chIdx * outputLength;
 
   // 重采样
   dev_resample_double(upFactor, downFactor, I_shifted, signalLength, I_resampled);
@@ -536,8 +527,8 @@ __global__ void ShiftingAndResamplingKernelDoubleV2(
   double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
   // 获取当前线程处理的通道数据地址
-  auto &I_orig = origIdata + chIdx * signalLength;
-  auto &Q_orig = origQdata + chIdx * signalLength;
+  auto I_orig = origIdata + chIdx * signalLength;
+  auto Q_orig = origQdata + chIdx * signalLength;
 
   // 移频：生成本振信号并相乘
   double *I_shifted = new double[signalLength];
@@ -564,9 +555,9 @@ __global__ void ShiftingAndResamplingKernelDoubleV2(
   int downFactor = VDownFactor[ResIdx];
 
   // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  auto &I_resampled = outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+  auto I_resampled = outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
   // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  auto &Q_resampled = outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+  auto Q_resampled = outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
 
   // 重采样
   dev_resample_double(upFactor, downFactor, I_shifted, signalLength, I_resampled);
@@ -612,8 +603,6 @@ bool ShiftAndResampleSignalDoubleV1(
   // 每个通道的信号长度：这里假设所有通道的长度是相同的
   int signalLength = origIdata[0].size();
 
-  int upFactor = 1; // 上采样率，默认为1
-
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
   // copy下采样率，频率等数据到显存中
   int *d_downFactor = nullptr;
@@ -772,8 +761,6 @@ bool ShiftAndResampleSignalDoubleV2(
   // 每个通道的信号长度：这里假设所有通道的长度是相同的
   int signalLength = origIdata[0].size();
 
-  int upFactor = 1; // 上采样率，默认为1
-
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
   // copy下采样率，频率等数据到显存中
   int *d_downFactor = nullptr;
diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
index da8e9e0..92b064f 100644
--- a/cuda_resample_float.cu
+++ b/cuda_resample_float.cu
@@ -1,4 +1,4 @@
-#include "cuda_resample.h"
+#include "cuda_resample_float.h"
 #include "upfirdn_device.h"
 
 // CHECK_CUDA_ERROR：cuda api调用错误处理
@@ -14,14 +14,6 @@
     }                                                                        \
   } while (0)
 
-#define LOG_INFO(fmt, ...)                                                \
-  fprintf(stderr, "[INFO] %s:%d (%s) " fmt, __FILE__, __LINE__, __func__, \
-          ##__VA_ARGS__)
-
-#define LOG_ERROR(fmt, ...)                                                \
-  fprintf(stderr, "[ERROR] %s:%d (%s) " fmt, __FILE__, __LINE__, __func__, \
-          ##__VA_ARGS__)
-
 inline int quotientCeil(int num1, int num2)
 {
   if (num1 % num2 != 0)
@@ -32,8 +24,8 @@ inline int quotientCeil(int num1, int num2)
 // 整数向上取整除法
 __device__ __forceinline__ int dev_quotientCeil(int num1, int num2)
 {
-  div_t result = div(num1, num2);
-  return result.quot + (result.rem != 0);
+  // 标准的上取整公式：(a + b - 1) / b
+  return (num1 + num2 - 1) / num2;
 }
 
 // CUDA设备端GCD函数:最大公约数
@@ -257,8 +249,8 @@ __device__ void dev_kaiser_float(float *window, int order, float bta)
   }
 }
 
-__device__ void dev_resample_float(const int upFactor, const int downFactor,
-                                   const float *inputSignal, const int inputSize,
+__device__ void dev_resample_float(int upFactor, int downFactor,
+                                   float *inputSignal, const int inputSize,
                                    float *outputSignal)
 {
   const int n = 10;
@@ -310,7 +302,6 @@ __device__ void dev_resample_float(const int upFactor, const int downFactor,
                             freqSize);
   if (ret == -1)
   {
-    LOG_ERROR("dev_firls调用失败\n");
     return;
   }
 
@@ -425,7 +416,7 @@ __device__ void dev_resample_float(const int upFactor, const int downFactor,
 __global__ void ShiftingAndResamplingKernelFloatV1(
     const float *__restrict__ origIdata, const float *__restrict__ origQdata,
     const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
-    const float *__restrict__ VOutputLength, const int numResults,
+    const int *__restrict__ VOutputLength, const int numResults,
     const int numChannels, const int signalLength, const float CurrentRealfreq,
     float *__restrict__ outputIdata, float *__restrict__ outputQdata)
 {
@@ -443,8 +434,8 @@ __global__ void ShiftingAndResamplingKernelFloatV1(
   float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
   // 获取当前线程处理的通道数据地址
-  auto &I_orig = origIdata + chIdx * signalLength;
-  auto &Q_orig = origQdata + chIdx * signalLength;
+  const auto I_orig = origIdata + chIdx * signalLength;
+  const auto Q_orig = origQdata + chIdx * signalLength;
 
   // 移频：生成本振信号并相乘
   float *I_shifted = new float[signalLength];
@@ -483,9 +474,9 @@ __global__ void ShiftingAndResamplingKernelFloatV1(
   int outputLength = VOutputLength[ResIdx];
 
   // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  auto &I_resampled = outputIdata + offset + chIdx * outputLength;
+  auto I_resampled = outputIdata + offset + chIdx * outputLength;
   // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  auto &Q_resampled = outputQdata + offset + chIdx * outputLength;
+  auto Q_resampled = outputQdata + offset + chIdx * outputLength;
 
   // 重采样
   dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled);
@@ -536,8 +527,8 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
   float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
   // 获取当前线程处理的通道数据地址
-  auto &I_orig = origIdata + chIdx * signalLength;
-  auto &Q_orig = origQdata + chIdx * signalLength;
+  const auto I_orig = origIdata + chIdx * signalLength;
+  const auto Q_orig = origQdata + chIdx * signalLength;
 
   // 移频：生成本振信号并相乘
   float *I_shifted = new float[signalLength];
@@ -564,9 +555,9 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
   int downFactor = VDownFactor[ResIdx];
 
   // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  auto &I_resampled = outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+  auto I_resampled = outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
   // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  auto &Q_resampled = outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+  auto Q_resampled = outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
 
   // 重采样
   dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled);
@@ -612,8 +603,6 @@ bool ShiftAndResampleSignalFloatV1(
   // 每个通道的信号长度：这里假设所有通道的长度是相同的
   int signalLength = origIdata[0].size();
 
-  int upFactor = 1; // 上采样率，默认为1
-
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
   // copy下采样率，频率等数据到显存中
   int *d_downFactor = nullptr;
@@ -774,8 +763,6 @@ bool ShiftAndResampleSignalFloatV2(
   // 每个通道的信号长度：这里假设所有通道的长度是相同的
   int signalLength = origIdata[0].size();
 
-  int upFactor = 1; // 上采样率，默认为1
-
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
   // copy下采样率，频率等数据到显存中
   int *d_downFactor = nullptr;
diff --git a/mainwindow.cpp b/mainwindow.cpp
index f171e09..a8c9d8c 100644
--- a/mainwindow.cpp
+++ b/mainwindow.cpp
@@ -203,12 +203,13 @@ void MainWindow::ReplayIQDataParse(char *buf)
 }
 
 template <typename T>
-ReplayIQDataParseV2(const T *outputIdata,
-                    const T *outputQdata, const int numResults,
-                    const int numChannels,
-                    const int signalLength)
+void ReplayIQDataParseV2(CalculateMovingCorrelation &m_calMC,
+                         cpuComplex *signalDatas, const T *outputIdata,
+                         const T *outputQdata, const int numResults,
+                         const int numChannels,
+                         const int signalLength)
 {
-  if (signalDatas_ == nullptr)
+  if (signalDatas == nullptr)
   {
     // 申请零拷贝内存，自动完成CPU内存与GPU显存数据同步
     if (!m_calMC.cudaCorrelation->AllocMappMemory(
@@ -220,7 +221,7 @@ ReplayIQDataParseV2(const T *outputIdata,
       return;
     }
 
-    signalDatas_ = (cpuComplex *)m_calMC.cudaCorrelation->h_signals;
+    signalDatas = (cpuComplex *)m_calMC.cudaCorrelation->h_signals;
   }
 
   int index = 0;
@@ -232,16 +233,17 @@ ReplayIQDataParseV2(const T *outputIdata,
       {
         int idx = (i * numChannels + j) * signalLength + k;
         cpuComplex data((T)outputIdata[idx], (T)outputQdata[idx]); // cpuComplex
-        signalDatas_[index++] = data;
+        signalDatas[index++] = data;
       }
     }
   }
 
   QElapsedTimer tm;
   tm.start();
+
   // 每帧 SamplePoints 个点 IQ 输入
   // 计算总流程 获得最终结果 1--找到相关峰 0--未找到相关峰
-  int result = CalculateRoutine(numResults * channelnumber, signalLength_);
+  int result = m_calMC.CalMovingCorrlationRoutine(numResults * numChannels, signalLength);
 
   std::cout << __FUNCTION__ << " result:" << result
             << " tm(ns):" << tm.nsecsElapsed() << std::endl;
diff --git a/mainwindow.h b/mainwindow.h
index 9a6e006..a5951b4 100644
--- a/mainwindow.h
+++ b/mainwindow.h
@@ -63,8 +63,10 @@ public slots:
 };
 
 template <typename T>
-ReplayIQDataParseV2(const T *outputIdata,
-                    const T *outputQdata, const int numResults,
-                    const int numChannels,
-                    const int signalLength);
+void ReplayIQDataParseV2(CalculateMovingCorrelation &m_calMC,
+                         cpuComplex *signalDatas, const T *outputIdata,
+                         const T *outputQdata, const int numResults,
+                         const int numChannels,
+                         const int signalLength);
+
 #endif // MAINWINDOW_H
-- 
Gitee


From 34c4019741c4feb1c22bd454ce52f9e3420925fd Mon Sep 17 00:00:00 2001
From: QAiCode <229242333@qq.com>
Date: Wed, 17 Dec 2025 17:55:14 +0800
Subject: [PATCH 15/27] =?UTF-8?q?=20=20=E6=89=8B=E5=8A=A8=E5=AE=9E?=
 =?UTF-8?q?=E7=8E=B0=E6=A8=A1=E6=9D=BF=E5=87=BD=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: QAiCode <229242333@qq.com>
---
 CMakeLists.txt           |   7 +
 cuda_resample_double.cu  |   6 +-
 cuda_resample_float.cu   |   6 +-
 upfirdn_device.cu        | 330 ---------------------------------------
 upfirdn_device_double.cu | 234 +++++++++++++++++++++++++++
 upfirdn_device_double.h  |  28 ++++
 upfirdn_device_float.cu  | 234 +++++++++++++++++++++++++++
 upfirdn_device_float.h   |  28 ++++
 8 files changed, 537 insertions(+), 336 deletions(-)
 delete mode 100644 upfirdn_device.cu
 create mode 100644 upfirdn_device_double.cu
 create mode 100644 upfirdn_device_double.h
 create mode 100644 upfirdn_device_float.cu
 create mode 100644 upfirdn_device_float.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 969d97d..c428db2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,13 @@ if(USE_CUDA)
   message(STATUS "CMAKE_CUDA_ARCHITECTURES=" ${CMAKE_CUDA_ARCHITECTURES})
 endif()
 
+# 对于模板核函数需要这些选项
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
+    -Wno-deprecated-gpu-targets")
+
+# 如果使用分离编译，需要添加
+set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+
 # debug OR release mode
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wdeprecated-declarations -fPIC -std=c++17 -pthread -pipe")
 if (NOT CMAKE_BUILD_TYPE)
diff --git a/cuda_resample_double.cu b/cuda_resample_double.cu
index afcade9..8567b99 100644
--- a/cuda_resample_double.cu
+++ b/cuda_resample_double.cu
@@ -1,5 +1,5 @@
 #include "cuda_resample_double.h"
-#include "upfirdn_device.h"
+#include "upfirdn_device_double.h"
 
 // CHECK_CUDA_ERROR：cuda api调用错误处理
 #define CHECK_CUDA_ERROR(call)                                               \
@@ -377,8 +377,8 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
   }
 
   int resultsCount = 0;
-  upfirdn_device(upFactor, downFactor, inputSignal, inputSize, filter,
-                 filterLength, results, &resultsCount);
+  upfirdn_device_double(upFactor, downFactor, inputSignal, inputSize, filter,
+                        filterLength, results, &resultsCount);
 
   int j = 0;
   for (int i = delay; i < outputSize + delay; i++)
diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
index 92b064f..e70844a 100644
--- a/cuda_resample_float.cu
+++ b/cuda_resample_float.cu
@@ -1,5 +1,5 @@
 #include "cuda_resample_float.h"
-#include "upfirdn_device.h"
+#include "upfirdn_device_float.h"
 
 // CHECK_CUDA_ERROR：cuda api调用错误处理
 #define CHECK_CUDA_ERROR(call)                                               \
@@ -377,8 +377,8 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
   }
 
   int resultsCount = 0;
-  upfirdn_device(upFactor, downFactor, inputSignal, inputSize, filter,
-                 filterLength, results, &resultsCount);
+  upfirdn_device_float(upFactor, downFactor, inputSignal, inputSize, filter,
+                       filterLength, results, &resultsCount);
 
   int j = 0;
   for (int i = delay; i < outputSize + delay; i++)
diff --git a/upfirdn_device.cu b/upfirdn_device.cu
deleted file mode 100644
index a040a56..0000000
--- a/upfirdn_device.cu
+++ /dev/null
@@ -1,330 +0,0 @@
-#include <cuda_runtime.h>
-
-#include "upfirdn_device.h"
-
-// 设备端Resampler初始化
-template <class S1, class C>
-__device__ void resampler_init_state_device(DeviceResamplerState<S1, C> *state,
-                                            C *transposedCoefs,
-                                            int coefsPerPhase, int upRate,
-                                            int downRate) {
-  state->_t = 0;
-  state->_xOffset = 0;
-  state->_transposedCoefs = transposedCoefs;
-  state->_coefsPerPhase = coefsPerPhase;
-  state->_upRate = upRate;
-  state->_downRate = downRate;
-
-  // 分配状态缓冲区
-  state->_state = new S1[coefsPerPhase - 1];
-
-  // 初始化状态为零
-  for (int i = 0; i < coefsPerPhase - 1; i++) {
-    state->_state[i] = 0;
-  }
-}
-
-// 设备端：计算所需输出数量
-template <class S1, class S2, class C>
-__device__ int resampler_needed_out_count_device(
-    int inCount, DeviceResamplerState<S1, C> *state) {
-  int np = inCount * state->_upRate;
-  int need = np / state->_downRate;
-
-  if ((state->_t + state->_upRate * state->_xOffset) <
-      (np % state->_downRate)) {
-    need++;
-  }
-
-  return need;
-}
-
-// 设备端：应用重采样
-template <class S1, class S2, class C>
-__device__ int resampler_apply_device(S1 *in, int inCount, S2 *out,
-                                      int outCount,
-                                      DeviceResamplerState<S1, C> *state) {
-  if (outCount < resampler_needed_out_count_device<S1, S2, C>(inCount, state)) {
-    // 在设备端无法抛出异常，返回错误代码
-    return -1;
-  }
-
-  // x指向最新处理的输入样本
-  S1 *x = in + state->_xOffset;
-  S2 *y = out;
-  S1 *end = in + inCount;
-
-  while (x < end) {
-    S2 acc = 0;
-    C *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase;
-    S1 *xPtr = x - state->_coefsPerPhase + 1;
-
-    int offset = in - xPtr;
-    if (offset > 0) {
-      // 需要从_state缓冲区中获取
-      S1 *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset;
-
-      while (statePtr < state->_state + (state->_coefsPerPhase - 1)) {
-        acc += (*statePtr++) * (*h++);
-      }
-
-      xPtr += offset;
-    }
-
-    while (xPtr <= x) {
-      acc += (*xPtr++) * (*h++);
-    }
-
-    *y++ = acc;
-    state->_t += state->_downRate;
-
-    int advanceAmount = state->_t / state->_upRate;
-    x += advanceAmount;
-    state->_t %= state->_upRate;
-  }
-
-  state->_xOffset = x - end;
-
-  // 管理_state缓冲区
-  int retain = (state->_coefsPerPhase - 1) - inCount;
-
-  if (retain > 0) {
-    // 对于小于状态缓冲区的inCount，将缓冲区的末尾复制到开头
-    for (int i = 0; i < retain; i++) {
-      state->_state[i] =
-          state->_state[(state->_coefsPerPhase - 1) - retain + i];
-    }
-
-    // 然后将整个（短）输入复制到缓冲区末尾
-    for (int i = 0; i < inCount; i++) {
-      state->_state[retain + i] = in[i];
-    }
-  } else {
-    // 只将最后几个输入样本复制到状态缓冲区
-    for (int i = 0; i < state->_coefsPerPhase - 1; i++) {
-      state->_state[i] = end - (state->_coefsPerPhase - 1) + i;
-    }
-  }
-
-  // 返回计算的样本数
-  return y - out;
-}
-
-// 设备端：释放Resampler状态
-template <class S1, class C>
-__device__ void resampler_free_state_device(
-    DeviceResamplerState<S1, C> *state) {
-  if (state->_state != nullptr) {
-    delete[] state->_state;
-    state->_state = nullptr;
-  }
-}
-
-// 设备端：转置滤波器系数（每个线程执行）
-template <class C>
-__device__ void transpose_filter_coefs_device(C *transposedCoefs, C *coefs,
-                                              int upRate, int coefCount,
-                                              int coefsPerPhase) {
-  // 初始化转置系数为零
-  for (int i = 0; i < upRate * coefsPerPhase; i++) {
-    transposedCoefs[i] = 0;
-  }
-
-  // 转置并翻转每个相位
-  for (int i = 0; i < upRate; ++i) {
-    for (int j = 0; j < coefsPerPhase; ++j) {
-      if (j * upRate + i < coefCount) {
-        transposedCoefs[(coefsPerPhase - 1 - j) + i * coefsPerPhase] =
-            coefs[j * upRate + i];
-      }
-    }
-  }
-}
-
-// 设备端upfirdn主函数
-template <class S1, class S2, class C>
-__device__ void upfirdn_device(int upRate, int downRate, S1 *input,
-                               int inLength, C *filter, int filterLength,
-                               S2 *results, int *resultsCount) {
-  // 计算填充后的系数数量
-  int paddedCoefCount = filterLength;
-  while (paddedCoefCount % upRate) {
-    paddedCoefCount++;
-  }
-
-  int coefsPerPhase = paddedCoefCount / upRate;
-
-  // 分配转置系数内存
-  C *transposedCoefs = new C[paddedCoefCount];
-
-  // 转置滤波器系数
-  transpose_filter_coefs_device(transposedCoefs, filter, upRate, filterLength,
-                                coefsPerPhase);
-
-  // 创建Resampler状态
-  DeviceResamplerState<S1, C> state;
-  resampler_init_state_device(&state, transposedCoefs, coefsPerPhase, upRate,
-                              downRate);
-
-  // 计算填充量
-  int padding = coefsPerPhase - 1;
-
-  // 分配填充输入内存
-  S1 *inputPadded = new S1[inLength + padding];
-
-  // 复制输入并填充
-  for (int i = 0; i < inLength + padding; i++) {
-    if (i < inLength) {
-      inputPadded[i] = input[i];
-    } else {
-      inputPadded[i] = 0;
-    }
-  }
-
-  // 计算输出大小
-  int resultsCountValue =
-      resampler_needed_out_count_device<S1, S2, C>(inLength + padding, &state);
-
-  // 设置输出计数
-  if (resultsCount != nullptr) {
-    *resultsCount = resultsCountValue;
-  }
-
-  // 运行滤波
-  int numSamplesComputed = resampler_apply_device<S1, S2, C>(
-      inputPadded, inLength + padding, results, resultsCountValue, &state);
-
-  // 清理设备内存
-  delete[] transposedCoefs;
-  delete[] inputPadded;
-  resampler_free_state_device(&state);
-}
-
-// 向量版本的设备端upfirdn
-template <class S1, class S2, class C>
-__device__ void upfirdn_device(int upRate, int downRate, S1 *input,
-                               int inputLength, C *filter, int filterLength,
-                               S2 *results) {
-  upfirdn_device<S1, S2, C>(upRate, downRate, input, inputLength, filter,
-                            filterLength, results, nullptr);
-}
-
-// CUDA内核：每个线程块处理一个独立的upfirdn操作
-template <class S1, class S2, class C>
-__global__ void upfirdn_kernel_batch(int upRate, int downRate, S1 **inputs,
-                                     int *inputLengths, C **filters,
-                                     int *filterLengths, S2 **outputs,
-                                     int *outputLengths, int batchSize) {
-  int batchIdx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (batchIdx >= batchSize) {
-    return;
-  }
-
-  // 获取当前批次的参数
-  S1 *input = inputs[batchIdx];
-  int inLength = inputLengths[batchIdx];
-  C *filter = filters[batchIdx];
-  int filterLength = filterLengths[batchIdx];
-  S2 *output = outputs[batchIdx];
-
-  // 执行设备端upfirdn
-  upfirdn_device<S1, S2, C>(upRate, downRate, input, inLength, filter,
-                            filterLength, output);
-}
-
-// 优化的设备端upfirdn（预分配所有内存）
-template <class S1, class S2, class C>
-__device__ void upfirdn_device_optimized(
-    int upRate, int downRate, S1 *input, int inLength, C *filter,
-    int filterLength, S2 *results,
-    C *transposedCoefsBuffer,  // 预分配的转置系数缓冲区
-    S1 *stateBuffer,           // 预分配的状态缓冲区
-    S1 *inputPaddedBuffer) {   // 预分配的输入填充缓冲区
-
-  // 计算填充后的系数数量
-  int paddedCoefCount = filterLength;
-  while (paddedCoefCount % upRate) {
-    paddedCoefCount++;
-  }
-
-  int coefsPerPhase = paddedCoefCount / upRate;
-
-  // 转置滤波器系数
-  for (int i = 0; i < upRate * coefsPerPhase; i++) {
-    transposedCoefsBuffer[i] = 0;
-  }
-
-  for (int i = 0; i < upRate; ++i) {
-    for (int j = 0; j < coefsPerPhase; ++j) {
-      if (j * upRate + i < filterLength) {
-        transposedCoefsBuffer[(coefsPerPhase - 1 - j) + i * coefsPerPhase] =
-            filter[j * upRate + i];
-      }
-    }
-  }
-
-  // 创建Resampler状态
-  DeviceResamplerState<S1, C> state;
-  state._t = 0;
-  state._xOffset = 0;
-  state._transposedCoefs = transposedCoefsBuffer;
-  state._coefsPerPhase = coefsPerPhase;
-  state._upRate = upRate;
-  state._downRate = downRate;
-  state._state = stateBuffer;
-
-  // 初始化状态为零
-  for (int i = 0; i < coefsPerPhase - 1; i++) {
-    state._state[i] = 0;
-  }
-
-  // 填充输入
-  int padding = coefsPerPhase - 1;
-  for (int i = 0; i < inLength + padding; i++) {
-    if (i < inLength) {
-      inputPaddedBuffer[i] = input[i];
-    } else {
-      inputPaddedBuffer[i] = 0;
-    }
-  }
-
-  // 计算输出数量
-  int np = (inLength + padding) * upRate;
-  int need = np / downRate;
-  if ((state._t + upRate * state._xOffset) < (np % downRate)) {
-    need++;
-  }
-
-  // 应用重采样
-  S1 *in = inputPaddedBuffer;
-  S2 *out = results;
-  S1 *end = in + inLength + padding;
-
-  while (in + state._xOffset < end) {
-    S1 *x = in + state._xOffset;
-    S2 acc = 0;
-    C *h = transposedCoefsBuffer + state._t * coefsPerPhase;
-    S1 *xPtr = x - coefsPerPhase + 1;
-
-    int offset = in - xPtr;
-    if (offset > 0) {
-      S1 *statePtr = state._state + (coefsPerPhase - 1) - offset;
-      while (statePtr < state._state + (coefsPerPhase - 1)) {
-        acc += (*statePtr++) * (*h++);
-      }
-      xPtr += offset;
-    }
-
-    while (xPtr <= x) {
-      acc += (*xPtr++) * (*h++);
-    }
-
-    *out++ = acc;
-    state._t += downRate;
-
-    int advanceAmount = state._t / upRate;
-    x += advanceAmount;
-    state._t %= upRate;
-  }
-}
\ No newline at end of file
diff --git a/upfirdn_device_double.cu b/upfirdn_device_double.cu
new file mode 100644
index 0000000..bd78d74
--- /dev/null
+++ b/upfirdn_device_double.cu
@@ -0,0 +1,234 @@
+#include <cuda_runtime.h>
+
+#include "upfirdn_device_double.h"
+
+// Device-side resampler state initializer. NOTE(review): named like the apply routine, but this 5-argument overload only sets up `state`.
+__device__ void resampler_apply_device_double(DeviceResamplerStateDouble *state,
+                                              double *transposedCoefs,
+                                              int coefsPerPhase, int upRate,
+                                              int downRate)
+{
+  state->_t = 0;
+  state->_xOffset = 0;
+  state->_transposedCoefs = transposedCoefs;
+  state->_coefsPerPhase = coefsPerPhase;
+  state->_upRate = upRate;
+  state->_downRate = downRate;
+
+  // Allocate the state buffer (coefsPerPhase - 1 samples) with in-kernel new; the result is not checked for nullptr
+  state->_state = new double[coefsPerPhase - 1];
+
+  // Zero-initialize the state buffer
+  for (int i = 0; i < coefsPerPhase - 1; i++)
+  {
+    state->_state[i] = 0;
+  }
+}
+
+// Device side: returns how many output samples are needed for inCount input samples, given the current `state`
+__device__ int resampler_needed_out_count_device_double(
+    int inCount, DeviceResamplerStateDouble *state)
+{
+  int np = inCount * state->_upRate; // input count scaled by the up rate
+  int need = np / state->_downRate;  // whole outputs after dividing by the down rate
+
+  if ((state->_t + state->_upRate * state->_xOffset) <
+      (np % state->_downRate))
+  {
+    need++; // one extra output when the current phase/offset falls inside the remainder
+  }
+
+  return need;
+}
+
+// Device side: run the resampler over `in` (inCount samples) writing to `out`; returns the number of samples written, or -1 if outCount is too small
+__device__ int resampler_apply_device_double(double *in, int inCount, double *out,
+                                             int outCount,
+                                             DeviceResamplerStateDouble *state)
+{
+  if (outCount < resampler_needed_out_count_device_double(inCount, state))
+  {
+    // Exceptions cannot be thrown in device code, so report the error through the return value
+    return -1;
+  }
+
+  // x points to the latest processed input sample
+  double *x = in + state->_xOffset;
+  double *y = out;
+  double *end = in + inCount;
+
+  while (x < end)
+  {
+    double acc = 0;
+    double *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase;
+    double *xPtr = x - state->_coefsPerPhase + 1;
+
+    int offset = in - xPtr;
+    if (offset > 0)
+    {
+      // The window starts before `in`: take the first `offset` samples from the _state buffer
+      double *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset;
+
+      while (statePtr < state->_state + (state->_coefsPerPhase - 1))
+      {
+        acc += (*statePtr++) * (*h++);
+      }
+
+      xPtr += offset;
+    }
+
+    while (xPtr <= x)
+    {
+      acc += (*xPtr++) * (*h++);
+    }
+
+    *y++ = acc;
+    state->_t += state->_downRate;
+
+    int advanceAmount = state->_t / state->_upRate;
+    x += advanceAmount;
+    state->_t %= state->_upRate;
+  }
+
+  state->_xOffset = x - end;
+
+  // Maintain the _state buffer for the next call
+  int retain = (state->_coefsPerPhase - 1) - inCount;
+
+  if (retain > 0)
+  {
+    // inCount is smaller than the state buffer: move the tail of the buffer to its front
+    for (int i = 0; i < retain; i++)
+    {
+      state->_state[i] =
+          state->_state[(state->_coefsPerPhase - 1) - retain + i];
+    }
+
+    // Then copy the whole (short) input to the end of the buffer
+    for (int i = 0; i < inCount; i++)
+    {
+      state->_state[retain + i] = in[i];
+    }
+  }
+  else
+  {
+    // Copy only the last coefsPerPhase - 1 input samples into the state buffer
+    for (int i = 0; i < state->_coefsPerPhase - 1; i++)
+    {
+      state->_state[i] = *(end - (state->_coefsPerPhase - 1) + i); // fix: index the trailing samples instead of reading *end (one past the input)
+    }
+  }
+
+  // Return the number of samples computed
+  return y - out;
+}
+
+// Device side: release the resampler state buffer. NOTE(review): named like the apply routine, but this 1-argument overload only frees `state->_state`.
+__device__ void resampler_apply_device_double(
+    DeviceResamplerStateDouble *state)
+{
+  if (state->_state != nullptr)
+  {
+    delete[] state->_state;
+    state->_state = nullptr; // cleared so a repeated call is a no-op
+  }
+}
+
+// Device side: transpose the filter coefficients into per-phase rows (executed serially by each calling thread)
+__device__ void transpose_filter_coefs_device_double(double *transposedCoefs, double *coefs,
+                                                     int upRate, int coefCount,
+                                                     int coefsPerPhase)
+{
+  // Zero the transposed coefficients so padded positions stay 0
+  for (int i = 0; i < upRate * coefsPerPhase; i++)
+  {
+    transposedCoefs[i] = 0;
+  }
+
+  // Transpose and flip each phase: row i holds coefs[j * upRate + i] in reversed j order
+  for (int i = 0; i < upRate; ++i)
+  {
+    for (int j = 0; j < coefsPerPhase; ++j)
+    {
+      if (j * upRate + i < coefCount)
+      {
+        transposedCoefs[(coefsPerPhase - 1 - j) + i * coefsPerPhase] =
+            coefs[j * upRate + i];
+      }
+    }
+  }
+}
+
+// Device-side upfirdn entry point: filters `input` with `filter` at rates upRate/downRate into `results`; writes the output count to *resultsCount when non-null
+__device__ void upfirdn_device_double(int upRate, int downRate, double *input,
+                                      int inLength, double *filter, int filterLength,
+                                      double *results, int *resultsCount)
+{
+  // Round the coefficient count up to the next multiple of upRate
+  int paddedCoefCount = filterLength;
+  while (paddedCoefCount % upRate)
+  {
+    paddedCoefCount++;
+  }
+
+  int coefsPerPhase = paddedCoefCount / upRate;
+
+  // Allocate memory for the transposed coefficients (NOTE(review): in-kernel new is not checked for nullptr)
+  double *transposedCoefs = new double[paddedCoefCount];
+
+  // Transpose the filter coefficients
+  transpose_filter_coefs_device_double(transposedCoefs, filter, upRate, filterLength,
+                                       coefsPerPhase);
+
+  // Create and initialize the resampler state (5-argument overload acts as the initializer)
+  DeviceResamplerStateDouble state;
+  resampler_apply_device_double(&state, transposedCoefs, coefsPerPhase, upRate,
+                                downRate);
+
+  // Amount of zero padding appended to the input
+  int padding = coefsPerPhase - 1;
+
+  // Allocate memory for the padded input
+  double *inputPadded = new double[inLength + padding];
+
+  // Copy the input and zero-fill the padding
+  for (int i = 0; i < inLength + padding; i++)
+  {
+    if (i < inLength)
+    {
+      inputPadded[i] = input[i];
+    }
+    else
+    {
+      inputPadded[i] = 0;
+    }
+  }
+
+  // Compute the output size
+  int resultsCountValue =
+      resampler_needed_out_count_device_double(inLength + padding, &state);
+
+  // Report the output count to the caller if requested
+  if (resultsCount != nullptr)
+  {
+    *resultsCount = resultsCountValue;
+  }
+
+  // Run the filter (the returned sample count is stored but not used afterwards)
+  int numSamplesComputed = resampler_apply_device_double(
+      inputPadded, inLength + padding, results, resultsCountValue, &state);
+
+  // Free the temporary buffers (1-argument overload releases the state buffer)
+  delete[] transposedCoefs;
+  delete[] inputPadded;
+  resampler_apply_device_double(&state);
+}
+
+// Overload of the device-side upfirdn that does not report the output count (forwards with resultsCount = nullptr)
+__device__ void upfirdn_device_double(int upRate, int downRate, double *input,
+                                      int inputLength, double *filter, int filterLength,
+                                      double *results)
+{
+  upfirdn_device_double(upRate, downRate, input, inputLength, filter,
+                        filterLength, results, nullptr);
+}
\ No newline at end of file
diff --git a/upfirdn_device_double.h b/upfirdn_device_double.h
new file mode 100644
index 0000000..06f6797
--- /dev/null
+++ b/upfirdn_device_double.h
@@ -0,0 +1,28 @@
+#ifndef UPFIRDN_DEVICE_DOUBLE_H
+#define UPFIRDN_DEVICE_DOUBLE_H
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+
+// Device-side resampler state (double precision); guard is file-specific so it cannot collide with the float header
+struct DeviceResamplerStateDouble
+{
+  int _t;                   // "time" (modulo upRate)
+  int _xOffset;             // input offset
+  double *_state;           // state buffer pointer
+  double *_transposedCoefs; // transposed coefficients pointer
+  int _coefsPerPhase;       // number of coefficients per phase
+  int _upRate;              // upsampling rate
+  int _downRate;            // downsampling rate
+};
+
+// Device-side upfirdn: this overload writes the output count to *resultsCount when non-null
+__device__ void upfirdn_device_double(int upRate, int downRate, double *input,
+                                      int inLength, double *filter, int filterLength,
+                                      double *results, int *resultsCount);
+
+__device__ void upfirdn_device_double(int upRate, int downRate, double *input,
+                                      int inputLength, double *filter, int filterLength,
+                                      double *results);
+
+#endif // UPFIRDN_DEVICE_DOUBLE_H
\ No newline at end of file
diff --git a/upfirdn_device_float.cu b/upfirdn_device_float.cu
new file mode 100644
index 0000000..bdaf921
--- /dev/null
+++ b/upfirdn_device_float.cu
@@ -0,0 +1,234 @@
+#include <cuda_runtime.h>
+
+#include "upfirdn_device_float.h"
+
+// Device-side resampler state initializer: resets counters, stores rates/coefficient pointer and allocates the state buffer
+__device__ void resampler_init_state_device_float(DeviceResamplerStateFloat *state,
+                                                  float *transposedCoefs,
+                                                  int coefsPerPhase, int upRate,
+                                                  int downRate)
+{
+  state->_t = 0;
+  state->_xOffset = 0;
+  state->_transposedCoefs = transposedCoefs;
+  state->_coefsPerPhase = coefsPerPhase;
+  state->_upRate = upRate;
+  state->_downRate = downRate;
+
+  // Allocate the state buffer (coefsPerPhase - 1 samples) with in-kernel new; the result is not checked for nullptr
+  state->_state = new float[coefsPerPhase - 1];
+
+  // Zero-initialize the state buffer
+  for (int i = 0; i < coefsPerPhase - 1; i++)
+  {
+    state->_state[i] = 0;
+  }
+}
+
+// Device side: returns how many output samples are needed for inCount input samples, given the current `state`
+__device__ int resampler_needed_out_count_device_float(
+    int inCount, DeviceResamplerStateFloat *state)
+{
+  int np = inCount * state->_upRate; // input count scaled by the up rate
+  int need = np / state->_downRate;  // whole outputs after dividing by the down rate
+
+  if ((state->_t + state->_upRate * state->_xOffset) <
+      (np % state->_downRate))
+  {
+    need++; // one extra output when the current phase/offset falls inside the remainder
+  }
+
+  return need;
+}
+
+// Device side: run the resampler over `in` (inCount samples) writing to `out`; returns the number of samples written, or -1 if outCount is too small
+__device__ int resampler_apply_device_float(float *in, int inCount, float *out,
+                                            int outCount,
+                                            DeviceResamplerStateFloat *state)
+{
+  if (outCount < resampler_needed_out_count_device_float(inCount, state))
+  {
+    // Exceptions cannot be thrown in device code, so report the error through the return value
+    return -1;
+  }
+
+  // x points to the latest processed input sample
+  float *x = in + state->_xOffset;
+  float *y = out;
+  float *end = in + inCount;
+
+  while (x < end)
+  {
+    float acc = 0;
+    float *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase;
+    float *xPtr = x - state->_coefsPerPhase + 1;
+
+    int offset = in - xPtr;
+    if (offset > 0)
+    {
+      // The window starts before `in`: take the first `offset` samples from the _state buffer
+      float *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset;
+
+      while (statePtr < state->_state + (state->_coefsPerPhase - 1))
+      {
+        acc += (*statePtr++) * (*h++);
+      }
+
+      xPtr += offset;
+    }
+
+    while (xPtr <= x)
+    {
+      acc += (*xPtr++) * (*h++);
+    }
+
+    *y++ = acc;
+    state->_t += state->_downRate;
+
+    int advanceAmount = state->_t / state->_upRate;
+    x += advanceAmount;
+    state->_t %= state->_upRate;
+  }
+
+  state->_xOffset = x - end;
+
+  // Maintain the _state buffer for the next call
+  int retain = (state->_coefsPerPhase - 1) - inCount;
+
+  if (retain > 0)
+  {
+    // inCount is smaller than the state buffer: move the tail of the buffer to its front
+    for (int i = 0; i < retain; i++)
+    {
+      state->_state[i] =
+          state->_state[(state->_coefsPerPhase - 1) - retain + i];
+    }
+
+    // Then copy the whole (short) input to the end of the buffer
+    for (int i = 0; i < inCount; i++)
+    {
+      state->_state[retain + i] = in[i];
+    }
+  }
+  else
+  {
+    // Copy only the last coefsPerPhase - 1 input samples into the state buffer
+    for (int i = 0; i < state->_coefsPerPhase - 1; i++)
+    {
+      state->_state[i] = *(end - (state->_coefsPerPhase - 1) + i); // fix: index the trailing samples instead of reading *end (one past the input)
+    }
+  }
+
+  // Return the number of samples computed
+  return y - out;
+}
+
+// Device side: release the resampler state buffer. NOTE(review): named like the apply routine, but this 1-argument overload only frees `state->_state`.
+__device__ void resampler_apply_device_float(
+    DeviceResamplerStateFloat *state)
+{
+  if (state->_state != nullptr)
+  {
+    delete[] state->_state;
+    state->_state = nullptr; // cleared so a repeated call is a no-op
+  }
+}
+
+// Device side: transpose the filter coefficients into per-phase rows (executed serially by each calling thread)
+__device__ void transpose_filter_coefs_device_float(float *transposedCoefs, float *coefs,
+                                                    int upRate, int coefCount,
+                                                    int coefsPerPhase)
+{
+  // Zero the transposed coefficients so padded positions stay 0
+  for (int i = 0; i < upRate * coefsPerPhase; i++)
+  {
+    transposedCoefs[i] = 0;
+  }
+
+  // Transpose and flip each phase: row i holds coefs[j * upRate + i] in reversed j order
+  for (int i = 0; i < upRate; ++i)
+  {
+    for (int j = 0; j < coefsPerPhase; ++j)
+    {
+      if (j * upRate + i < coefCount)
+      {
+        transposedCoefs[(coefsPerPhase - 1 - j) + i * coefsPerPhase] =
+            coefs[j * upRate + i];
+      }
+    }
+  }
+}
+
+// Device-side upfirdn entry point: filters `input` with `filter` at rates upRate/downRate into `results`; writes the output count to *resultsCount when non-null
+__device__ void upfirdn_device_float(int upRate, int downRate, float *input,
+                                     int inLength, float *filter, int filterLength,
+                                     float *results, int *resultsCount)
+{
+  // Round the coefficient count up to the next multiple of upRate
+  int paddedCoefCount = filterLength;
+  while (paddedCoefCount % upRate)
+  {
+    paddedCoefCount++;
+  }
+
+  int coefsPerPhase = paddedCoefCount / upRate;
+
+  // Allocate memory for the transposed coefficients (NOTE(review): in-kernel new is not checked for nullptr)
+  float *transposedCoefs = new float[paddedCoefCount];
+
+  // Transpose the filter coefficients
+  transpose_filter_coefs_device_float(transposedCoefs, filter, upRate, filterLength,
+                                      coefsPerPhase);
+
+  // Create and initialize the resampler state
+  DeviceResamplerStateFloat state;
+  resampler_init_state_device_float(&state, transposedCoefs, coefsPerPhase, upRate,
+                                    downRate);
+
+  // Amount of zero padding appended to the input
+  int padding = coefsPerPhase - 1;
+
+  // Allocate memory for the padded input
+  float *inputPadded = new float[inLength + padding];
+
+  // Copy the input and zero-fill the padding
+  for (int i = 0; i < inLength + padding; i++)
+  {
+    if (i < inLength)
+    {
+      inputPadded[i] = input[i];
+    }
+    else
+    {
+      inputPadded[i] = 0;
+    }
+  }
+
+  // Compute the output size
+  int resultsCountValue =
+      resampler_needed_out_count_device_float(inLength + padding, &state);
+
+  // Report the output count to the caller if requested
+  if (resultsCount != nullptr)
+  {
+    *resultsCount = resultsCountValue;
+  }
+
+  // Run the filter (the returned sample count is stored but not used afterwards)
+  int numSamplesComputed = resampler_apply_device_float(
+      inputPadded, inLength + padding, results, resultsCountValue, &state);
+
+  // Free the temporary buffers (1-argument overload releases the state buffer)
+  delete[] transposedCoefs;
+  delete[] inputPadded;
+  resampler_apply_device_float(&state);
+}
+
+// Overload of the device-side upfirdn that does not report the output count (forwards with resultsCount = nullptr)
+__device__ void upfirdn_device_float(int upRate, int downRate, float *input,
+                                     int inputLength, float *filter, int filterLength,
+                                     float *results)
+{
+  upfirdn_device_float(upRate, downRate, input, inputLength, filter,
+                       filterLength, results, nullptr);
+}
\ No newline at end of file
diff --git a/upfirdn_device_float.h b/upfirdn_device_float.h
new file mode 100644
index 0000000..875436e
--- /dev/null
+++ b/upfirdn_device_float.h
@@ -0,0 +1,28 @@
+#ifndef UPFIRDN_DEVICE_FLOAT_H
+#define UPFIRDN_DEVICE_FLOAT_H
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+
+// Device-side resampler state (single precision); guard is file-specific so it cannot collide with the double header
+struct DeviceResamplerStateFloat
+{
+  int _t;                  // "time" (modulo upRate)
+  int _xOffset;            // input offset
+  float *_state;           // state buffer pointer
+  float *_transposedCoefs; // transposed coefficients pointer
+  int _coefsPerPhase;      // number of coefficients per phase
+  int _upRate;             // upsampling rate
+  int _downRate;           // downsampling rate
+};
+
+// Device-side upfirdn: this overload writes the output count to *resultsCount when non-null
+__device__ void upfirdn_device_float(int upRate, int downRate, float *input,
+                                     int inLength, float *filter, int filterLength,
+                                     float *results, int *resultsCount);
+
+__device__ void upfirdn_device_float(int upRate, int downRate, float *input,
+                                     int inputLength, float *filter, int filterLength,
+                                     float *results);
+
+#endif // UPFIRDN_DEVICE_FLOAT_H
\ No newline at end of file
-- 
Gitee


From 48aae6c43f09d2a398fe2234f985af1f35f995f2 Mon Sep 17 00:00:00 2001
From: QAiCode <229242333@qq.com>
Date: Wed, 17 Dec 2025 18:00:46 +0800
Subject: [PATCH 16/27] =?UTF-8?q?=E5=88=A0=E9=99=A4upfirdn=5Fdevice.h?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: QAiCode <229242333@qq.com>
---
 upfirdn_device.h | 46 ----------------------------------------------
 1 file changed, 46 deletions(-)
 delete mode 100644 upfirdn_device.h

diff --git a/upfirdn_device.h b/upfirdn_device.h
deleted file mode 100644
index 8d5de6d..0000000
--- a/upfirdn_device.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef UPFIRDN_DEVICE_H
-#define UPFIRDN_DEVICE_H
-
-#include <cuda_runtime.h>
-#include <device_launch_parameters.h>
-
-// 设备端Resampler状态结构
-template <class S1, class C>
-struct DeviceResamplerState {
-  int _t;               // "time" (modulo upRate)
-  int _xOffset;         // 输入偏移量
-  S1 *_state;           // 状态缓冲区指针
-  C *_transposedCoefs;  // 转置系数指针
-  int _coefsPerPhase;   // 每相系数数量
-  int _upRate;          // 上采样率
-  int _downRate;        // 下采样率
-};
-
-// 设备端函数声明
-template <class S1, class S2, class C>
-__device__ int resampler_apply_device(S1 *in, int inCount, S2 *out,
-                                      int outCount,
-                                      DeviceResamplerState<S1, C> *state);
-
-template <class S1, class S2, class C>
-__device__ int resampler_needed_out_count_device(
-    int inCount, DeviceResamplerState<S1, C> *state);
-
-template <class S1, class C>
-__device__ void resampler_init_state_device(DeviceResamplerState<S1, C> *state,
-                                            C *transposedCoefs,
-                                            int coefsPerPhase, int upRate,
-                                            int downRate);
-
-// 设备端upfirdn函数
-template <class S1, class S2, class C>
-__device__ void upfirdn_device(int upRate, int downRate, S1 *input,
-                               int inLength, C *filter, int filterLength,
-                               S2 *results, int *resultsCount);
-
-template <class S1, class S2, class C>
-__device__ void upfirdn_device(int upRate, int downRate, S1 *input,
-                               int inputLength, C *filter, int filterLength,
-                               S2 *results);
-
-#endif  // UPFIRDN_DEVICE_H
\ No newline at end of file
-- 
Gitee


From d27ca7225509e0a6a3518f12db29c2f4eb635850 Mon Sep 17 00:00:00 2001
From: QAiCode <229242333@qq.com>
Date: Wed, 17 Dec 2025 18:28:03 +0800
Subject: [PATCH 17/27] =?UTF-8?q?=E5=AE=8C=E5=96=84resample=E7=9B=B8?=
 =?UTF-8?q?=E5=85=B3=E6=A0=B8=E5=87=BD=E6=95=B0=EF=BC=8C=E5=B9=B6=E4=BF=AE?=
 =?UTF-8?q?=E5=A4=8D=E9=94=99=E8=AF=AF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: QAiCode <229242333@qq.com>
---
 CMakeLists.txt          |  4 ++--
 cuda_resample_double.cu | 14 +++++++-------
 cuda_resample_double.h  |  2 +-
 cuda_resample_float.cu  | 12 ++++++------
 cuda_resample_float.h   |  1 +
 5 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c428db2..ad31eb3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,8 +56,8 @@ if(USE_CUDA)
 endif()
 
 # 对于模板核函数需要这些选项
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
-    -Wno-deprecated-gpu-targets")
+# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
+#     -Wno-deprecated-gpu-targets")
 
 # 如果使用分离编译，需要添加
 set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
diff --git a/cuda_resample_double.cu b/cuda_resample_double.cu
index 8567b99..adfd7c9 100644
--- a/cuda_resample_double.cu
+++ b/cuda_resample_double.cu
@@ -153,12 +153,12 @@ __device__ int dev_firls_double(double *result, int length, double *freq, const
     {
       double kj = k[j];
       b[j] += (m_s / (double(4.0) * pow(M_PI, double(2.0))) *
-               (cosf(double(2.0) * M_PI * Fip1) - cosf(double(2.0) * M_PI * Fi)) /
+               (cos(double(2.0) * M_PI * Fip1) - cos(double(2.0) * M_PI * Fi)) /
                (pow(kj, double(2.0)))) *
               wt2;
 
-      b[j] += (Fip1 * (m_s * Fip1 + b1) * sinf(double(2.0) * kj * Fip1) -
-               Fi * (m_s * Fi + b1) * sinf(double(2.0) * kj * Fi)) *
+      b[j] += (Fip1 * (m_s * Fip1 + b1) * sin(double(2.0) * kj * Fip1) -
+               Fi * (m_s * Fi + b1) * sin(double(2.0) * kj * Fi)) *
               wt2;
     }
   }
@@ -451,8 +451,8 @@ __global__ void ShiftingAndResamplingKernelDoubleV1(
   for (int i = 0; i < signalLength; i++)
   {
     double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
-    double cosVal = cosf(phase);
-    double sinVal = sinf(phase);
+    double cosVal = cos(phase);
+    double sinVal = sin(phase);
     I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
     Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
   }
@@ -544,8 +544,8 @@ __global__ void ShiftingAndResamplingKernelDoubleV2(
   for (int i = 0; i < signalLength; i++)
   {
     double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
-    double cosVal = cosf(phase);
-    double sinVal = sinf(phase);
+    double cosVal = cos(phase);
+    double sinVal = sin(phase);
     I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
     Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
   }
diff --git a/cuda_resample_double.h b/cuda_resample_double.h
index acbbae2..8fb8b53 100644
--- a/cuda_resample_double.h
+++ b/cuda_resample_double.h
@@ -4,7 +4,7 @@
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
 #include <thrust/device_vector.h>
-
+#include <math_constants.h> // CUDA数学常量头文件
 #include <cmath>
 #include <map>
 #include <type_traits>
diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
index e70844a..88b6ebc 100644
--- a/cuda_resample_float.cu
+++ b/cuda_resample_float.cu
@@ -138,23 +138,23 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
     float Fip1 = freq[i + 1];
     float ampi = amplitude[i];
     float ampip1 = amplitude[i + 1];
-    float wt2 = pow(weight[i / 2], float(2.0));
+    float wt2 = powf(weight[i / 2], float(2.0));
     float m_s = (ampip1 - ampi) / (Fip1 - Fi);
     float b1 = ampi - (m_s * Fi);
 
     if (Nodd)
     {
       b0 += (b1 * (Fip1 - Fi)) +
-            m_s / float(2.0) * (pow(Fip1, float(2.0)) - pow(Fi, float(2.0))) * wt2;
+            m_s / float(2.0) * (powf(Fip1, float(2.0)) - powf(Fi, float(2.0))) * wt2;
     }
 
     // 并行计算b向量
     for (int j = 0; j < kLength; j++)
     {
       float kj = k[j];
-      b[j] += (m_s / (float(4.0) * pow(M_PI, float(2.0))) *
+      b[j] += (m_s / (float(4.0) * powf(M_PI, float(2.0))) *
                (cosf(float(2.0) * M_PI * Fip1) - cosf(float(2.0) * M_PI * Fi)) /
-               (pow(kj, float(2.0)))) *
+               (powf(kj, float(2.0)))) *
               wt2;
 
       b[j] += (Fip1 * (m_s * Fip1 + b1) * sinf(float(2.0) * kj * Fip1) -
@@ -192,7 +192,7 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
   // vector<float> result = {a.rbegin(), a.rend()};
   for (int i = 0; i < aLength; i++)
   {
-    a[i] = pow(w0, float(2.0)) * float(4.0) * b[i];
+    a[i] = powf(w0, float(2.0)) * float(4.0) * b[i];
     result[aLength - 1 - i] = a[i];
   }
 
@@ -243,7 +243,7 @@ __device__ void dev_kaiser_float(float *window, int order, float bta)
 
   for (int n = 0; n < order; n++)
   {
-    float x = bta * sqrt(float(1) - pow((n - od2) / od2, float(2)));
+    float x = bta * sqrt(float(1) - powf((n - od2) / od2, float(2)));
     Numerator = dev_cyl_bessel_i_float(0, x);
     window[n] = Numerator / Denominator;
   }
diff --git a/cuda_resample_float.h b/cuda_resample_float.h
index aa981e9..73b4fff 100644
--- a/cuda_resample_float.h
+++ b/cuda_resample_float.h
@@ -4,6 +4,7 @@
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
 #include <thrust/device_vector.h>
+#include <math_constants.h> // CUDA数学常量头文件
 
 #include <cmath>
 #include <map>
-- 
Gitee


From 4524014b3f049771762da13b4ab720608b61a917 Mon Sep 17 00:00:00 2001
From: QAiCode <229242333@qq.com>
Date: Wed, 17 Dec 2025 19:12:31 +0800
Subject: [PATCH 18/27] =?UTF-8?q?=E5=90=88=E5=B9=B6upfirdn=5Fdevice?=
 =?UTF-8?q?=E5=88=B0=E5=93=8D=E5=BA=94=E7=9A=84=E6=BA=90=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=E4=B8=AD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: QAiCode <229242333@qq.com>
---
 CMakeLists.txt           |   2 +-
 cuda_resample_double.cu  | 232 +++++++++++++++++++++++++++++++++++++-
 cuda_resample_double.h   |  12 ++
 cuda_resample_float.cu   | 232 +++++++++++++++++++++++++++++++++++++-
 cuda_resample_float.h    |  12 ++
 upfirdn_device_double.cu | 234 ---------------------------------------
 upfirdn_device_double.h  |  28 -----
 upfirdn_device_float.cu  | 234 ---------------------------------------
 upfirdn_device_float.h   |  28 -----
 9 files changed, 487 insertions(+), 527 deletions(-)
 delete mode 100644 upfirdn_device_double.cu
 delete mode 100644 upfirdn_device_double.h
 delete mode 100644 upfirdn_device_float.cu
 delete mode 100644 upfirdn_device_float.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad31eb3..aa1e423 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,7 +60,7 @@ endif()
 #     -Wno-deprecated-gpu-targets")
 
 # 如果使用分离编译，需要添加
-set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+# set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
 
 # debug OR release mode
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wdeprecated-declarations -fPIC -std=c++17 -pthread -pipe")
diff --git a/cuda_resample_double.cu b/cuda_resample_double.cu
index adfd7c9..f31f686 100644
--- a/cuda_resample_double.cu
+++ b/cuda_resample_double.cu
@@ -1,5 +1,4 @@
 #include "cuda_resample_double.h"
-#include "upfirdn_device_double.h"
 
 // CHECK_CUDA_ERROR：cuda api调用错误处理
 #define CHECK_CUDA_ERROR(call)                                               \
@@ -21,6 +20,237 @@ inline int quotientCeil(int num1, int num2)
   return num1 / num2;
 }
 
+// Device-side resampler state initialization (5-argument overload of the resampler_apply_device_double name).
+__device__ void resampler_apply_device_double(DeviceResamplerStateDouble *state,
+                                              double *transposedCoefs,
+                                              int coefsPerPhase, int upRate,
+                                              int downRate)
+{
+  state->_t = 0;
+  state->_xOffset = 0;
+  state->_transposedCoefs = transposedCoefs; // pointer is stored, not copied
+  state->_coefsPerPhase = coefsPerPhase;
+  state->_upRate = upRate;
+  state->_downRate = downRate;
+
+  // Allocate the history buffer (coefsPerPhase - 1 samples); the result of new is not null-checked
+  state->_state = new double[coefsPerPhase - 1];
+
+  // Zero-initialize the history
+  for (int i = 0; i < coefsPerPhase - 1; i++)
+  {
+    state->_state[i] = 0;
+  }
+}
+
+// Device side: number of output samples needed to process inCount input samples
+__device__ int resampler_needed_out_count_device_double(
+    int inCount, DeviceResamplerStateDouble *state)
+{
+  int np = inCount * state->_upRate; // input length scaled by the upsampling rate
+  int need = np / state->_downRate;
+
+  if ((state->_t + state->_upRate * state->_xOffset) <
+      (np % state->_downRate))
+  {
+    need++; // one extra output for the partial remainder
+  }
+
+  return need;
+}
+
+// Device side: run the polyphase filter over in[0..inCount) and write to out; returns the output count, or -1 if outCount is too small
+__device__ int resampler_apply_device_double(double *in, int inCount, double *out,
+                                             int outCount,
+                                             DeviceResamplerStateDouble *state)
+{
+  if (outCount < resampler_needed_out_count_device_double(inCount, state))
+  {
+    // Exceptions cannot be thrown in device code; return an error code instead
+    return -1;
+  }
+
+  // x points to the most recently processed input sample
+  double *x = in + state->_xOffset;
+  double *y = out;
+  double *end = in + inCount;
+
+  while (x < end)
+  {
+    double acc = 0;
+    double *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase; // coefficients of phase _t
+    double *xPtr = x - state->_coefsPerPhase + 1;
+
+    int offset = in - xPtr;
+    if (offset > 0)
+    {
+      // The leading taps fall before in[0]: read them from the _state history buffer
+      double *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset;
+
+      while (statePtr < state->_state + (state->_coefsPerPhase - 1))
+      {
+        acc += (*statePtr++) * (*h++);
+      }
+
+      xPtr += offset;
+    }
+
+    while (xPtr <= x)
+    {
+      acc += (*xPtr++) * (*h++);
+    }
+
+    *y++ = acc;
+    state->_t += state->_downRate;
+
+    int advanceAmount = state->_t / state->_upRate;
+    x += advanceAmount;
+    state->_t %= state->_upRate;
+  }
+
+  state->_xOffset = x - end;
+
+  // Update the _state history buffer
+  int retain = (state->_coefsPerPhase - 1) - inCount;
+
+  if (retain > 0)
+  {
+    // inCount is shorter than the history buffer: move the buffer tail to the front
+    for (int i = 0; i < retain; i++)
+    {
+      state->_state[i] =
+          state->_state[(state->_coefsPerPhase - 1) - retain + i];
+    }
+
+    // then copy the whole (short) input to the end of the buffer
+    for (int i = 0; i < inCount; i++)
+    {
+      state->_state[retain + i] = in[i];
+    }
+  }
+  else
+  {
+    // Keep only the last coefsPerPhase - 1 input samples as history (index relative to end; *end itself is out of bounds)
+    for (int i = 0; i < state->_coefsPerPhase - 1; i++)
+    {
+      state->_state[i] = *(end - (state->_coefsPerPhase - 1) + i);
+    }
+  }
+
+  // Return the number of output samples computed
+  return y - out;
+}
+
+// Device side: release the resampler history buffer (1-argument overload of the resampler_apply_device_double name)
+__device__ void resampler_apply_device_double(
+    DeviceResamplerStateDouble *state)
+{
+  if (state->_state != nullptr)
+  {
+    delete[] state->_state;
+    state->_state = nullptr; // cleared so a second call does not free again
+  }
+}
+
+// Device side: transpose filter coefficients into per-phase rows (no thread indexing; runs serially in the calling thread)
+__device__ void transpose_filter_coefs_device_double(double *transposedCoefs, double *coefs,
+                                                     int upRate, int coefCount,
+                                                     int coefsPerPhase)
+{
+  // Zero the transposed coefficients (entries past coefCount stay zero)
+  for (int i = 0; i < upRate * coefsPerPhase; i++)
+  {
+    transposedCoefs[i] = 0;
+  }
+
+  // Transpose and reverse each phase
+  for (int i = 0; i < upRate; ++i)
+  {
+    for (int j = 0; j < coefsPerPhase; ++j)
+    {
+      if (j * upRate + i < coefCount)
+      {
+        transposedCoefs[(coefsPerPhase - 1 - j) + i * coefsPerPhase] =
+            coefs[j * upRate + i];
+      }
+    }
+  }
+}
+
+// Device-side upfirdn entry point: filters input with filter at rates upRate/downRate, writes results and optionally *resultsCount
+__device__ void upfirdn_device_double(int upRate, int downRate, double *input,
+                                      int inLength, double *filter, int filterLength,
+                                      double *results, int *resultsCount)
+{
+  // Round the coefficient count up to a multiple of upRate
+  int paddedCoefCount = filterLength;
+  while (paddedCoefCount % upRate)
+  {
+    paddedCoefCount++;
+  }
+
+  int coefsPerPhase = paddedCoefCount / upRate;
+
+  // Allocate storage for the transposed coefficients (result of new is not null-checked)
+  double *transposedCoefs = new double[paddedCoefCount];
+
+  // Transpose the filter coefficients
+  transpose_filter_coefs_device_double(transposedCoefs, filter, upRate, filterLength,
+                                       coefsPerPhase);
+
+  // Create the resampler state (5-argument overload: initializes state and allocates its history buffer)
+  DeviceResamplerStateDouble state;
+  resampler_apply_device_double(&state, transposedCoefs, coefsPerPhase, upRate,
+                                downRate);
+
+  // Number of zero samples appended to the input
+  int padding = coefsPerPhase - 1;
+
+  // Allocate the padded input buffer (result of new is not null-checked)
+  double *inputPadded = new double[inLength + padding];
+
+  // Copy the input, then zero-fill the padding
+  for (int i = 0; i < inLength + padding; i++)
+  {
+    if (i < inLength)
+    {
+      inputPadded[i] = input[i];
+    }
+    else
+    {
+      inputPadded[i] = 0;
+    }
+  }
+
+  // Compute the output size
+  int resultsCountValue =
+      resampler_needed_out_count_device_double(inLength + padding, &state);
+
+  // Report the output count if the caller asked for it
+  if (resultsCount != nullptr)
+  {
+    *resultsCount = resultsCountValue;
+  }
+
+  // Run the filter (the returned sample count is stored but not used afterwards)
+  int numSamplesComputed = resampler_apply_device_double(
+      inputPadded, inLength + padding, results, resultsCountValue, &state);
+
+  // Release the allocations made above
+  delete[] transposedCoefs;
+  delete[] inputPadded;
+  resampler_apply_device_double(&state); // 1-argument overload: frees state._state
+}
+
+// Overload of device-side upfirdn that does not report the output count (passes nullptr for resultsCount)
+__device__ void upfirdn_device_double(int upRate, int downRate, double *input,
+                                      int inputLength, double *filter, int filterLength,
+                                      double *results)
+{
+  upfirdn_device_double(upRate, downRate, input, inputLength, filter,
+                        filterLength, results, nullptr);
+}
+
 // 整数向上取整除法
 __device__ __forceinline__ int dev_quotientCeil(int num1, int num2)
 {
diff --git a/cuda_resample_double.h b/cuda_resample_double.h
index 8fb8b53..f8e5556 100644
--- a/cuda_resample_double.h
+++ b/cuda_resample_double.h
@@ -14,6 +14,18 @@
 #define M_PI 3.141592653589793238462643
 #endif
 
+// Device-side resampler state
+struct DeviceResamplerStateDouble
+{
+    int _t;                   // "time" (modulo upRate)
+    int _xOffset;             // input offset
+    double *_state;           // pointer to the state (history) buffer
+    double *_transposedCoefs; // pointer to the transposed coefficients
+    int _coefsPerPhase;       // number of coefficients per phase
+    int _upRate;              // upsampling rate
+    int _downRate;            // downsampling rate
+};
+
 /**
  * ShiftAndResampleSignalDoubleV1
  * 重采样函数：完成原始信号的移频，重采样等计算
diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
index 88b6ebc..75793b5 100644
--- a/cuda_resample_float.cu
+++ b/cuda_resample_float.cu
@@ -1,5 +1,4 @@
 #include "cuda_resample_float.h"
-#include "upfirdn_device_float.h"
 
 // CHECK_CUDA_ERROR：cuda api调用错误处理
 #define CHECK_CUDA_ERROR(call)                                               \
@@ -21,6 +20,237 @@ inline int quotientCeil(int num1, int num2)
   return num1 / num2;
 }
 
+// Device-side resampler state initialization
+__device__ void resampler_init_state_device_float(DeviceResamplerStateFloat *state,
+                                                  float *transposedCoefs,
+                                                  int coefsPerPhase, int upRate,
+                                                  int downRate)
+{
+  state->_t = 0;
+  state->_xOffset = 0;
+  state->_transposedCoefs = transposedCoefs; // pointer is stored, not copied
+  state->_coefsPerPhase = coefsPerPhase;
+  state->_upRate = upRate;
+  state->_downRate = downRate;
+
+  // Allocate the history buffer (coefsPerPhase - 1 samples); the result of new is not null-checked
+  state->_state = new float[coefsPerPhase - 1];
+
+  // Zero-initialize the history
+  for (int i = 0; i < coefsPerPhase - 1; i++)
+  {
+    state->_state[i] = 0;
+  }
+}
+
+// Device side: number of output samples needed to process inCount input samples
+__device__ int resampler_needed_out_count_device_float(
+    int inCount, DeviceResamplerStateFloat *state)
+{
+  int np = inCount * state->_upRate; // input length scaled by the upsampling rate
+  int need = np / state->_downRate;
+
+  if ((state->_t + state->_upRate * state->_xOffset) <
+      (np % state->_downRate))
+  {
+    need++; // one extra output for the partial remainder
+  }
+
+  return need;
+}
+
+// Device side: run the polyphase filter over in[0..inCount) and write to out; returns the output count, or -1 if outCount is too small
+__device__ int resampler_apply_device_float(float *in, int inCount, float *out,
+                                            int outCount,
+                                            DeviceResamplerStateFloat *state)
+{
+  if (outCount < resampler_needed_out_count_device_float(inCount, state))
+  {
+    // Exceptions cannot be thrown in device code; return an error code instead
+    return -1;
+  }
+
+  // x points to the most recently processed input sample
+  float *x = in + state->_xOffset;
+  float *y = out;
+  float *end = in + inCount;
+
+  while (x < end)
+  {
+    float acc = 0;
+    float *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase; // coefficients of phase _t
+    float *xPtr = x - state->_coefsPerPhase + 1;
+
+    int offset = in - xPtr;
+    if (offset > 0)
+    {
+      // The leading taps fall before in[0]: read them from the _state history buffer
+      float *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset;
+
+      while (statePtr < state->_state + (state->_coefsPerPhase - 1))
+      {
+        acc += (*statePtr++) * (*h++);
+      }
+
+      xPtr += offset;
+    }
+
+    while (xPtr <= x)
+    {
+      acc += (*xPtr++) * (*h++);
+    }
+
+    *y++ = acc;
+    state->_t += state->_downRate;
+
+    int advanceAmount = state->_t / state->_upRate;
+    x += advanceAmount;
+    state->_t %= state->_upRate;
+  }
+
+  state->_xOffset = x - end;
+
+  // Update the _state history buffer
+  int retain = (state->_coefsPerPhase - 1) - inCount;
+
+  if (retain > 0)
+  {
+    // inCount is shorter than the history buffer: move the buffer tail to the front
+    for (int i = 0; i < retain; i++)
+    {
+      state->_state[i] =
+          state->_state[(state->_coefsPerPhase - 1) - retain + i];
+    }
+
+    // then copy the whole (short) input to the end of the buffer
+    for (int i = 0; i < inCount; i++)
+    {
+      state->_state[retain + i] = in[i];
+    }
+  }
+  else
+  {
+    // Keep only the last coefsPerPhase - 1 input samples as history (index relative to end; *end itself is out of bounds)
+    for (int i = 0; i < state->_coefsPerPhase - 1; i++)
+    {
+      state->_state[i] = *(end - (state->_coefsPerPhase - 1) + i);
+    }
+  }
+
+  // Return the number of output samples computed
+  return y - out;
+}
+
+// Device side: release the resampler history buffer (1-argument overload of the resampler_apply_device_float name)
+__device__ void resampler_apply_device_float(
+    DeviceResamplerStateFloat *state)
+{
+  if (state->_state != nullptr)
+  {
+    delete[] state->_state;
+    state->_state = nullptr; // cleared so a second call does not free again
+  }
+}
+
+// Device side: transpose filter coefficients into per-phase rows (no thread indexing; runs serially in the calling thread)
+__device__ void transpose_filter_coefs_device_float(float *transposedCoefs, float *coefs,
+                                                    int upRate, int coefCount,
+                                                    int coefsPerPhase)
+{
+  // Zero the transposed coefficients (entries past coefCount stay zero)
+  for (int i = 0; i < upRate * coefsPerPhase; i++)
+  {
+    transposedCoefs[i] = 0;
+  }
+
+  // Transpose and reverse each phase
+  for (int i = 0; i < upRate; ++i)
+  {
+    for (int j = 0; j < coefsPerPhase; ++j)
+    {
+      if (j * upRate + i < coefCount)
+      {
+        transposedCoefs[(coefsPerPhase - 1 - j) + i * coefsPerPhase] =
+            coefs[j * upRate + i];
+      }
+    }
+  }
+}
+
+// Device-side upfirdn entry point: filters input with filter at rates upRate/downRate, writes results and optionally *resultsCount
+__device__ void upfirdn_device_float(int upRate, int downRate, float *input,
+                                     int inLength, float *filter, int filterLength,
+                                     float *results, int *resultsCount)
+{
+  // Round the coefficient count up to a multiple of upRate
+  int paddedCoefCount = filterLength;
+  while (paddedCoefCount % upRate)
+  {
+    paddedCoefCount++;
+  }
+
+  int coefsPerPhase = paddedCoefCount / upRate;
+
+  // Allocate storage for the transposed coefficients (result of new is not null-checked)
+  float *transposedCoefs = new float[paddedCoefCount];
+
+  // Transpose the filter coefficients
+  transpose_filter_coefs_device_float(transposedCoefs, filter, upRate, filterLength,
+                                      coefsPerPhase);
+
+  // Create the resampler state (initializes state and allocates its history buffer)
+  DeviceResamplerStateFloat state;
+  resampler_init_state_device_float(&state, transposedCoefs, coefsPerPhase, upRate,
+                                    downRate);
+
+  // Number of zero samples appended to the input
+  int padding = coefsPerPhase - 1;
+
+  // Allocate the padded input buffer (result of new is not null-checked)
+  float *inputPadded = new float[inLength + padding];
+
+  // Copy the input, then zero-fill the padding
+  for (int i = 0; i < inLength + padding; i++)
+  {
+    if (i < inLength)
+    {
+      inputPadded[i] = input[i];
+    }
+    else
+    {
+      inputPadded[i] = 0;
+    }
+  }
+
+  // Compute the output size
+  int resultsCountValue =
+      resampler_needed_out_count_device_float(inLength + padding, &state);
+
+  // Report the output count if the caller asked for it
+  if (resultsCount != nullptr)
+  {
+    *resultsCount = resultsCountValue;
+  }
+
+  // Run the filter (the returned sample count is stored but not used afterwards)
+  int numSamplesComputed = resampler_apply_device_float(
+      inputPadded, inLength + padding, results, resultsCountValue, &state);
+
+  // Release the allocations made above
+  delete[] transposedCoefs;
+  delete[] inputPadded;
+  resampler_apply_device_float(&state); // 1-argument overload: frees state._state
+}
+
+// Overload of device-side upfirdn that does not report the output count (passes nullptr for resultsCount)
+__device__ void upfirdn_device_float(int upRate, int downRate, float *input,
+                                     int inputLength, float *filter, int filterLength,
+                                     float *results)
+{
+  upfirdn_device_float(upRate, downRate, input, inputLength, filter,
+                       filterLength, results, nullptr);
+}
+
 // 整数向上取整除法
 __device__ __forceinline__ int dev_quotientCeil(int num1, int num2)
 {
diff --git a/cuda_resample_float.h b/cuda_resample_float.h
index 73b4fff..e278445 100644
--- a/cuda_resample_float.h
+++ b/cuda_resample_float.h
@@ -15,6 +15,18 @@
 #define M_PI 3.141592653589793238462643
 #endif
 
+// Device-side resampler state
+struct DeviceResamplerStateFloat
+{
+    int _t;                  // "time" (modulo upRate)
+    int _xOffset;            // input offset
+    float *_state;           // pointer to the state (history) buffer
+    float *_transposedCoefs; // pointer to the transposed coefficients
+    int _coefsPerPhase;      // number of coefficients per phase
+    int _upRate;             // upsampling rate
+    int _downRate;           // downsampling rate
+};
+
 /**
  * ShiftAndResampleSignalFloatV1
  * 重采样函数：完成原始信号的移频，重采样等计算
diff --git a/upfirdn_device_double.cu b/upfirdn_device_double.cu
deleted file mode 100644
index bd78d74..0000000
--- a/upfirdn_device_double.cu
+++ /dev/null
@@ -1,234 +0,0 @@
-#include <cuda_runtime.h>
-
-#include "upfirdn_device_double.h"
-
-// 设备端Resampler初始化
-__device__ void resampler_apply_device_double(DeviceResamplerStateDouble *state,
-                                              double *transposedCoefs,
-                                              int coefsPerPhase, int upRate,
-                                              int downRate)
-{
-  state->_t = 0;
-  state->_xOffset = 0;
-  state->_transposedCoefs = transposedCoefs;
-  state->_coefsPerPhase = coefsPerPhase;
-  state->_upRate = upRate;
-  state->_downRate = downRate;
-
-  // 分配状态缓冲区
-  state->_state = new double[coefsPerPhase - 1];
-
-  // 初始化状态为零
-  for (int i = 0; i < coefsPerPhase - 1; i++)
-  {
-    state->_state[i] = 0;
-  }
-}
-
-// 设备端：计算所需输出数量
-__device__ int resampler_needed_out_count_device_double(
-    int inCount, DeviceResamplerStateDouble *state)
-{
-  int np = inCount * state->_upRate;
-  int need = np / state->_downRate;
-
-  if ((state->_t + state->_upRate * state->_xOffset) <
-      (np % state->_downRate))
-  {
-    need++;
-  }
-
-  return need;
-}
-
-// 设备端：应用重采样
-__device__ int resampler_apply_device_double(double *in, int inCount, double *out,
-                                             int outCount,
-                                             DeviceResamplerStateDouble *state)
-{
-  if (outCount < resampler_needed_out_count_device_double(inCount, state))
-  {
-    // 在设备端无法抛出异常，返回错误代码
-    return -1;
-  }
-
-  // x指向最新处理的输入样本
-  double *x = in + state->_xOffset;
-  double *y = out;
-  double *end = in + inCount;
-
-  while (x < end)
-  {
-    double acc = 0;
-    double *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase;
-    double *xPtr = x - state->_coefsPerPhase + 1;
-
-    int offset = in - xPtr;
-    if (offset > 0)
-    {
-      // 需要从_state缓冲区中获取
-      double *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset;
-
-      while (statePtr < state->_state + (state->_coefsPerPhase - 1))
-      {
-        acc += (*statePtr++) * (*h++);
-      }
-
-      xPtr += offset;
-    }
-
-    while (xPtr <= x)
-    {
-      acc += (*xPtr++) * (*h++);
-    }
-
-    *y++ = acc;
-    state->_t += state->_downRate;
-
-    int advanceAmount = state->_t / state->_upRate;
-    x += advanceAmount;
-    state->_t %= state->_upRate;
-  }
-
-  state->_xOffset = x - end;
-
-  // 管理_state缓冲区
-  int retain = (state->_coefsPerPhase - 1) - inCount;
-
-  if (retain > 0)
-  {
-    // 对于小于状态缓冲区的inCount，将缓冲区的末尾复制到开头
-    for (int i = 0; i < retain; i++)
-    {
-      state->_state[i] =
-          state->_state[(state->_coefsPerPhase - 1) - retain + i];
-    }
-
-    // 然后将整个（短）输入复制到缓冲区末尾
-    for (int i = 0; i < inCount; i++)
-    {
-      state->_state[retain + i] = in[i];
-    }
-  }
-  else
-  {
-    // 只将最后几个输入样本复制到状态缓冲区
-    for (int i = 0; i < state->_coefsPerPhase - 1; i++)
-    {
-      state->_state[i] = *end - (double)(state->_coefsPerPhase - 1) + (double)i;
-    }
-  }
-
-  // 返回计算的样本数
-  return y - out;
-}
-
-// 设备端：释放Resampler状态
-__device__ void resampler_apply_device_double(
-    DeviceResamplerStateDouble *state)
-{
-  if (state->_state != nullptr)
-  {
-    delete[] state->_state;
-    state->_state = nullptr;
-  }
-}
-
-// 设备端：转置滤波器系数（每个线程执行）
-__device__ void transpose_filter_coefs_device_double(double *transposedCoefs, double *coefs,
-                                                     int upRate, int coefCount,
-                                                     int coefsPerPhase)
-{
-  // 初始化转置系数为零
-  for (int i = 0; i < upRate * coefsPerPhase; i++)
-  {
-    transposedCoefs[i] = 0;
-  }
-
-  // 转置并翻转每个相位
-  for (int i = 0; i < upRate; ++i)
-  {
-    for (int j = 0; j < coefsPerPhase; ++j)
-    {
-      if (j * upRate + i < coefCount)
-      {
-        transposedCoefs[(coefsPerPhase - 1 - j) + i * coefsPerPhase] =
-            coefs[j * upRate + i];
-      }
-    }
-  }
-}
-
-// 设备端upfirdn主函数
-__device__ void upfirdn_device_double(int upRate, int downRate, double *input,
-                                      int inLength, double *filter, int filterLength,
-                                      double *results, int *resultsCount)
-{
-  // 计算填充后的系数数量
-  int paddedCoefCount = filterLength;
-  while (paddedCoefCount % upRate)
-  {
-    paddedCoefCount++;
-  }
-
-  int coefsPerPhase = paddedCoefCount / upRate;
-
-  // 分配转置系数内存
-  double *transposedCoefs = new double[paddedCoefCount];
-
-  // 转置滤波器系数
-  transpose_filter_coefs_device_double(transposedCoefs, filter, upRate, filterLength,
-                                       coefsPerPhase);
-
-  // 创建Resampler状态
-  DeviceResamplerStateDouble state;
-  resampler_apply_device_double(&state, transposedCoefs, coefsPerPhase, upRate,
-                                downRate);
-
-  // 计算填充量
-  int padding = coefsPerPhase - 1;
-
-  // 分配填充输入内存
-  double *inputPadded = new double[inLength + padding];
-
-  // 复制输入并填充
-  for (int i = 0; i < inLength + padding; i++)
-  {
-    if (i < inLength)
-    {
-      inputPadded[i] = input[i];
-    }
-    else
-    {
-      inputPadded[i] = 0;
-    }
-  }
-
-  // 计算输出大小
-  int resultsCountValue =
-      resampler_needed_out_count_device_double(inLength + padding, &state);
-
-  // 设置输出计数
-  if (resultsCount != nullptr)
-  {
-    *resultsCount = resultsCountValue;
-  }
-
-  // 运行滤波
-  int numSamplesComputed = resampler_apply_device_double(
-      inputPadded, inLength + padding, results, resultsCountValue, &state);
-
-  // 清理设备内存
-  delete[] transposedCoefs;
-  delete[] inputPadded;
-  resampler_apply_device_double(&state);
-}
-
-// 向量版本的设备端upfirdn
-__device__ void upfirdn_device_double(int upRate, int downRate, double *input,
-                                      int inputLength, double *filter, int filterLength,
-                                      double *results)
-{
-  upfirdn_device_double(upRate, downRate, input, inputLength, filter,
-                        filterLength, results, nullptr);
-}
\ No newline at end of file
diff --git a/upfirdn_device_double.h b/upfirdn_device_double.h
deleted file mode 100644
index 06f6797..0000000
--- a/upfirdn_device_double.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef UPFIRDN_DEVICE_H
-#define UPFIRDN_DEVICE_H
-
-#include <cuda_runtime.h>
-#include <device_launch_parameters.h>
-
-// 设备端Resampler状态结构
-struct DeviceResamplerStateDouble
-{
-  int _t;                   // "time" (modulo upRate)
-  int _xOffset;             // 输入偏移量
-  double *_state;           // 状态缓冲区指针
-  double *_transposedCoefs; // 转置系数指针
-  int _coefsPerPhase;       // 每相系数数量
-  int _upRate;              // 上采样率
-  int _downRate;            // 下采样率
-};
-
-// 设备端upfirdn函数
-__device__ void upfirdn_device_double(int upRate, int downRate, double *input,
-                                      int inLength, double *filter, int filterLength,
-                                      double *results, int *resultsCount);
-
-__device__ void upfirdn_device_double(int upRate, int downRate, double *input,
-                                      int inputLength, double *filter, int filterLength,
-                                      double *results);
-
-#endif // UPFIRDN_DEVICE_H
\ No newline at end of file
diff --git a/upfirdn_device_float.cu b/upfirdn_device_float.cu
deleted file mode 100644
index bdaf921..0000000
--- a/upfirdn_device_float.cu
+++ /dev/null
@@ -1,234 +0,0 @@
-#include <cuda_runtime.h>
-
-#include "upfirdn_device_float.h"
-
-// 设备端Resampler初始化
-__device__ void resampler_init_state_device_float(DeviceResamplerStateFloat *state,
-                                                  float *transposedCoefs,
-                                                  int coefsPerPhase, int upRate,
-                                                  int downRate)
-{
-  state->_t = 0;
-  state->_xOffset = 0;
-  state->_transposedCoefs = transposedCoefs;
-  state->_coefsPerPhase = coefsPerPhase;
-  state->_upRate = upRate;
-  state->_downRate = downRate;
-
-  // 分配状态缓冲区
-  state->_state = new float[coefsPerPhase - 1];
-
-  // 初始化状态为零
-  for (int i = 0; i < coefsPerPhase - 1; i++)
-  {
-    state->_state[i] = 0;
-  }
-}
-
-// 设备端：计算所需输出数量
-__device__ int resampler_needed_out_count_device_float(
-    int inCount, DeviceResamplerStateFloat *state)
-{
-  int np = inCount * state->_upRate;
-  int need = np / state->_downRate;
-
-  if ((state->_t + state->_upRate * state->_xOffset) <
-      (np % state->_downRate))
-  {
-    need++;
-  }
-
-  return need;
-}
-
-// 设备端：应用重采样
-__device__ int resampler_apply_device_float(float *in, int inCount, float *out,
-                                            int outCount,
-                                            DeviceResamplerStateFloat *state)
-{
-  if (outCount < resampler_needed_out_count_device_float(inCount, state))
-  {
-    // 在设备端无法抛出异常，返回错误代码
-    return -1;
-  }
-
-  // x指向最新处理的输入样本
-  float *x = in + state->_xOffset;
-  float *y = out;
-  float *end = in + inCount;
-
-  while (x < end)
-  {
-    float acc = 0;
-    float *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase;
-    float *xPtr = x - state->_coefsPerPhase + 1;
-
-    int offset = in - xPtr;
-    if (offset > 0)
-    {
-      // 需要从_state缓冲区中获取
-      float *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset;
-
-      while (statePtr < state->_state + (state->_coefsPerPhase - 1))
-      {
-        acc += (*statePtr++) * (*h++);
-      }
-
-      xPtr += offset;
-    }
-
-    while (xPtr <= x)
-    {
-      acc += (*xPtr++) * (*h++);
-    }
-
-    *y++ = acc;
-    state->_t += state->_downRate;
-
-    int advanceAmount = state->_t / state->_upRate;
-    x += advanceAmount;
-    state->_t %= state->_upRate;
-  }
-
-  state->_xOffset = x - end;
-
-  // 管理_state缓冲区
-  int retain = (state->_coefsPerPhase - 1) - inCount;
-
-  if (retain > 0)
-  {
-    // 对于小于状态缓冲区的inCount，将缓冲区的末尾复制到开头
-    for (int i = 0; i < retain; i++)
-    {
-      state->_state[i] =
-          state->_state[(state->_coefsPerPhase - 1) - retain + i];
-    }
-
-    // 然后将整个（短）输入复制到缓冲区末尾
-    for (int i = 0; i < inCount; i++)
-    {
-      state->_state[retain + i] = in[i];
-    }
-  }
-  else
-  {
-    // 只将最后几个输入样本复制到状态缓冲区
-    for (int i = 0; i < state->_coefsPerPhase - 1; i++)
-    {
-      state->_state[i] = *end - (float)(state->_coefsPerPhase - 1) + (float)i;
-    }
-  }
-
-  // 返回计算的样本数
-  return y - out;
-}
-
-// 设备端：释放Resampler状态
-__device__ void resampler_apply_device_float(
-    DeviceResamplerStateFloat *state)
-{
-  if (state->_state != nullptr)
-  {
-    delete[] state->_state;
-    state->_state = nullptr;
-  }
-}
-
-// 设备端：转置滤波器系数（每个线程执行）
-__device__ void transpose_filter_coefs_device_float(float *transposedCoefs, float *coefs,
-                                                    int upRate, int coefCount,
-                                                    int coefsPerPhase)
-{
-  // 初始化转置系数为零
-  for (int i = 0; i < upRate * coefsPerPhase; i++)
-  {
-    transposedCoefs[i] = 0;
-  }
-
-  // 转置并翻转每个相位
-  for (int i = 0; i < upRate; ++i)
-  {
-    for (int j = 0; j < coefsPerPhase; ++j)
-    {
-      if (j * upRate + i < coefCount)
-      {
-        transposedCoefs[(coefsPerPhase - 1 - j) + i * coefsPerPhase] =
-            coefs[j * upRate + i];
-      }
-    }
-  }
-}
-
-// 设备端upfirdn主函数
-__device__ void upfirdn_device_float(int upRate, int downRate, float *input,
-                                     int inLength, float *filter, int filterLength,
-                                     float *results, int *resultsCount)
-{
-  // 计算填充后的系数数量
-  int paddedCoefCount = filterLength;
-  while (paddedCoefCount % upRate)
-  {
-    paddedCoefCount++;
-  }
-
-  int coefsPerPhase = paddedCoefCount / upRate;
-
-  // 分配转置系数内存
-  float *transposedCoefs = new float[paddedCoefCount];
-
-  // 转置滤波器系数
-  transpose_filter_coefs_device_float(transposedCoefs, filter, upRate, filterLength,
-                                      coefsPerPhase);
-
-  // 创建Resampler状态
-  DeviceResamplerStateFloat state;
-  resampler_init_state_device_float(&state, transposedCoefs, coefsPerPhase, upRate,
-                                    downRate);
-
-  // 计算填充量
-  int padding = coefsPerPhase - 1;
-
-  // 分配填充输入内存
-  float *inputPadded = new float[inLength + padding];
-
-  // 复制输入并填充
-  for (int i = 0; i < inLength + padding; i++)
-  {
-    if (i < inLength)
-    {
-      inputPadded[i] = input[i];
-    }
-    else
-    {
-      inputPadded[i] = 0;
-    }
-  }
-
-  // 计算输出大小
-  int resultsCountValue =
-      resampler_needed_out_count_device_float(inLength + padding, &state);
-
-  // 设置输出计数
-  if (resultsCount != nullptr)
-  {
-    *resultsCount = resultsCountValue;
-  }
-
-  // 运行滤波
-  int numSamplesComputed = resampler_apply_device_float(
-      inputPadded, inLength + padding, results, resultsCountValue, &state);
-
-  // 清理设备内存
-  delete[] transposedCoefs;
-  delete[] inputPadded;
-  resampler_apply_device_float(&state);
-}
-
-// 向量版本的设备端upfirdn
-__device__ void upfirdn_device_float(int upRate, int downRate, float *input,
-                                     int inputLength, float *filter, int filterLength,
-                                     float *results)
-{
-  upfirdn_device_float(upRate, downRate, input, inputLength, filter,
-                       filterLength, results, nullptr);
-}
\ No newline at end of file
diff --git a/upfirdn_device_float.h b/upfirdn_device_float.h
deleted file mode 100644
index 875436e..0000000
--- a/upfirdn_device_float.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef UPFIRDN_DEVICE_H
-#define UPFIRDN_DEVICE_H
-
-#include <cuda_runtime.h>
-#include <device_launch_parameters.h>
-
-// 设备端Resampler状态结构
-struct DeviceResamplerStateFloat
-{
-  int _t;                  // "time" (modulo upRate)
-  int _xOffset;            // 输入偏移量
-  float *_state;           // 状态缓冲区指针
-  float *_transposedCoefs; // 转置系数指针
-  int _coefsPerPhase;      // 每相系数数量
-  int _upRate;             // 上采样率
-  int _downRate;           // 下采样率
-};
-
-// 设备端upfirdn函数
-__device__ void upfirdn_device_float(int upRate, int downRate, float *input,
-                                     int inLength, float *filter, int filterLength,
-                                     float *results, int *resultsCount);
-
-__device__ void upfirdn_device_float(int upRate, int downRate, float *input,
-                                     int inputLength, float *filter, int filterLength,
-                                     float *results);
-
-#endif // UPFIRDN_DEVICE_H
\ No newline at end of file
-- 
Gitee


From 2417bf0533ba218e0e4c723047f7bee636e7ebd0 Mon Sep 17 00:00:00 2001
From: QAiCode <229242333@qq.com>
Date: Wed, 17 Dec 2025 19:34:53 +0800
Subject: [PATCH 19/27] =?UTF-8?q?=20=E4=BF=AE=E5=A4=8D=E9=94=99=E8=AF=AF?=
 =?UTF-8?q?=EF=BC=9A=E7=BB=93=E6=9E=9Ccopy=E7=9A=84=E6=96=B9=E5=90=91=20De?=
 =?UTF-8?q?viceToHost?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: QAiCode <229242333@qq.com>
---
 cuda_resample_double.cu |  8 ++++----
 cuda_resample_float.cu  | 10 ++++------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/cuda_resample_double.cu b/cuda_resample_double.cu
index f31f686..0ca63c4 100644
--- a/cuda_resample_double.cu
+++ b/cuda_resample_double.cu
@@ -830,7 +830,7 @@ bool ShiftAndResampleSignalDoubleV1(
     double *outputIdata,
     double *outputQdata)
 {
-  // 每个通道的信号长度：这里假设所有通道的长度是相同的
+  // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
   int signalLength = origIdata[0].size();
 
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
@@ -906,11 +906,11 @@ bool ShiftAndResampleSignalDoubleV1(
   // 且在内存中是连续存放的
   CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
                               (numChannels * outputTotalLength * sizeof(double)),
-                              cudaMemcpyHostToDevice));
+                              cudaMemcpyDeviceToHost));
 
   CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
                               (numChannels * outputTotalLength * sizeof(double)),
-                              cudaMemcpyHostToDevice));
+                              cudaMemcpyDeviceToHost));
 
   // 释放显存
   if (d_downFactor)
@@ -988,7 +988,7 @@ bool ShiftAndResampleSignalDoubleV2(
     double *outputIdata,
     double *outputQdata)
 {
-  // 每个通道的信号长度：这里假设所有通道的长度是相同的
+  // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
   int signalLength = origIdata[0].size();
 
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
index 75793b5..1a99982 100644
--- a/cuda_resample_float.cu
+++ b/cuda_resample_float.cu
@@ -830,7 +830,7 @@ bool ShiftAndResampleSignalFloatV1(
     float *outputIdata,
     float *outputQdata)
 {
-  // 每个通道的信号长度：这里假设所有通道的长度是相同的
+  // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
   int signalLength = origIdata[0].size();
 
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
@@ -902,17 +902,15 @@ bool ShiftAndResampleSignalFloatV1(
       numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
 
   // copy重采样计算结果到主存
-  // outputIdata 确保空间够
-  // outputQdata 确保空间够
   // 存储格式：[numResults][numChannels][lengthPerResults]
   // 且在内存中是连续存放的
   CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
                               (numChannels * outputTotalLength * sizeof(float)),
-                              cudaMemcpyHostToDevice));
+                              cudaMemcpyDeviceToHost));
 
   CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
                               (numChannels * outputTotalLength * sizeof(float)),
-                              cudaMemcpyHostToDevice));
+                              cudaMemcpyDeviceToHost));
 
   // 释放显存
   if (d_downFactor)
@@ -990,7 +988,7 @@ bool ShiftAndResampleSignalFloatV2(
     float *outputIdata,
     float *outputQdata)
 {
-  // 每个通道的信号长度：这里假设所有通道的长度是相同的
+  // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
   int signalLength = origIdata[0].size();
 
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
-- 
Gitee


From cd8bafe5f560818775032c71bf0eafff735021ec Mon Sep 17 00:00:00 2001
From: QAiCode <229242333@qq.com>
Date: Thu, 18 Dec 2025 16:56:22 +0800
Subject: [PATCH 20/27] =?UTF-8?q?=E5=AE=8C=E5=96=84resample?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cuda_resample_float.cu | 63 ++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 33 deletions(-)

diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
index 1a99982..d207daa 100644
--- a/cuda_resample_float.cu
+++ b/cuda_resample_float.cu
@@ -498,7 +498,10 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
 
   if (upFactor == downFactor)
   {
-    outputSignal = inputSignal;
+    for (int i = 0; i < inputSize; i++)
+    {
+      outputSignal[i] = inputSignal[i];
+    }
     return;
   }
 
@@ -648,6 +651,7 @@ __global__ void ShiftingAndResamplingKernelFloatV1(
     const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
     const int *__restrict__ VOutputLength, const int numResults,
     const int numChannels, const int signalLength, const float CurrentRealfreq,
+    float *I_shifted, float *Q_shifted,
     float *__restrict__ outputIdata, float *__restrict__ outputQdata)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -668,16 +672,6 @@ __global__ void ShiftingAndResamplingKernelFloatV1(
   const auto Q_orig = origQdata + chIdx * signalLength;
 
   // 移频：生成本振信号并相乘
-  float *I_shifted = new float[signalLength];
-  if (I_shifted == nullptr)
-  {
-    return;
-  }
-  float *Q_shifted = new float[signalLength];
-  if (Q_shifted == nullptr)
-  {
-    return;
-  }
   for (int i = 0; i < signalLength; i++)
   {
     float phase = 2 * M_PI * deltaFreq * i / sampling_rate;
@@ -711,10 +705,6 @@ __global__ void ShiftingAndResamplingKernelFloatV1(
   // 重采样
   dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled);
   dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
-
-  // 释放动态分配的内存
-  delete[] I_shifted;
-  delete[] Q_shifted;
 }
 
 /**
@@ -741,6 +731,7 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
     const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
     const int numResults, const int numChannels, const int signalLength,
     const float CurrentRealfreq, const int alignSignalLength,
+    float *I_shifted, float *Q_shifted,
     float *__restrict__ outputIdata, float *__restrict__ outputQdata)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -761,16 +752,6 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
   const auto Q_orig = origQdata + chIdx * signalLength;
 
   // 移频：生成本振信号并相乘
-  float *I_shifted = new float[signalLength];
-  if (I_shifted == nullptr)
-  {
-    return;
-  }
-  float *Q_shifted = new float[signalLength];
-  if (Q_shifted == nullptr)
-  {
-    return;
-  }
   for (int i = 0; i < signalLength; i++)
   {
     float phase = 2 * M_PI * deltaFreq * i / sampling_rate;
@@ -792,10 +773,6 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
   // 重采样
   dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled);
   dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
-
-  // 释放动态分配的内存
-  delete[] I_shifted;
-  delete[] Q_shifted;
 }
 
 /**
@@ -885,6 +862,14 @@ bool ShiftAndResampleSignalFloatV1(
         cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
   }
 
+  // 申请重采样后输出信号的GPU显存
+  float *I_shifted = nullptr;
+  float *Q_shifted = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&I_shifted,
+                              (signalLength * sizeof(float))));
+  CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted,
+                              (signalLength * sizeof(float))));
+
   // 申请重采样后输出信号的GPU显存
   float *d_outputIdata = nullptr;
   float *d_outputQdata = nullptr;
@@ -899,7 +884,7 @@ bool ShiftAndResampleSignalFloatV1(
 
   ShiftingAndResamplingKernelFloatV1<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
-      numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
+      numChannels, signalLength, CurrentRealfreq, I_shifted, Q_shifted, d_outputIdata, d_outputQdata);
 
   // copy重采样计算结果到主存
   // 存储格式：[numResults][numChannels][lengthPerResults]
@@ -943,10 +928,16 @@ bool ShiftAndResampleSignalFloatV1(
     d_Qdata = nullptr;
   }
 
-  if (d_outputIdata)
+  if (I_shifted)
   {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
+    cudaFree(I_shifted);
+    I_shifted = nullptr;
+  }
+
+  if (Q_shifted)
+  {
+    cudaFree(Q_shifted);
+    Q_shifted = nullptr;
   }
 
   if (d_outputIdata)
@@ -955,6 +946,12 @@ bool ShiftAndResampleSignalFloatV1(
     d_outputIdata = nullptr;
   }
 
+  if (d_outputQdata)
+  {
+    cudaFree(d_outputQdata);
+    d_outputQdata = nullptr;
+  }
+
   return true;
 }
 
-- 
Gitee


From 375f8fd6d44f09e0b38cb1defa079e9efd7f056c Mon Sep 17 00:00:00 2001
From: QAiCode <229242333@qq.com>
Date: Thu, 18 Dec 2025 17:16:04 +0800
Subject: [PATCH 21/27] =?UTF-8?q?=E5=AE=8C=E5=96=84resample=E7=9A=84?=
 =?UTF-8?q?=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: QAiCode <229242333@qq.com>
---
 cuda_resample_float.cu | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
index d207daa..24e6150 100644
--- a/cuda_resample_float.cu
+++ b/cuda_resample_float.cu
@@ -862,7 +862,7 @@ bool ShiftAndResampleSignalFloatV1(
         cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
   }
 
-  // 申请重采样后输出信号的GPU显存
+  // 申请移频所需的空间
   float *I_shifted = nullptr;
   float *Q_shifted = nullptr;
   CHECK_CUDA_ERROR(cudaMalloc(&I_shifted,
@@ -1032,6 +1032,14 @@ bool ShiftAndResampleSignalFloatV2(
         cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
   }
 
+  // 申请移频所需的空间
+  float *I_shifted = nullptr;
+  float *Q_shifted = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&I_shifted,
+                              (signalLength * sizeof(float))));
+  CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted,
+                              (signalLength * sizeof(float))));
+
   // 申请重采样后输出信号的GPU显存
   size_t totalsize = numResults * numChannels * alignSignalLength * sizeof(float);
   float *d_outputIdata = nullptr;
@@ -1049,6 +1057,7 @@ bool ShiftAndResampleSignalFloatV2(
   ShiftingAndResamplingKernelFloatV2<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, numResults,
       numChannels, signalLength, CurrentRealfreq, alignSignalLength,
+      I_shifted, Q_shifted,
       d_outputIdata, d_outputQdata);
 
   // copy重采样计算结果到主存
@@ -1087,10 +1096,16 @@ bool ShiftAndResampleSignalFloatV2(
     d_Qdata = nullptr;
   }
 
-  if (d_outputIdata)
+  if (I_shifted)
   {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
+    cudaFree(I_shifted);
+    I_shifted = nullptr;
+  }
+
+  if (Q_shifted)
+  {
+    cudaFree(Q_shifted);
+    Q_shifted = nullptr;
   }
 
   if (d_outputIdata)
@@ -1099,5 +1114,11 @@ bool ShiftAndResampleSignalFloatV2(
     d_outputIdata = nullptr;
   }
 
+  if (d_outputQdata)
+  {
+    cudaFree(d_outputQdata);
+    d_outputQdata = nullptr;
+  }
+
   return true;
 }
\ No newline at end of file
-- 
Gitee


From db4491f9381949156cd0147527abffed9d1d7593 Mon Sep 17 00:00:00 2001
From: QAiCode <229242333@qq.com>
Date: Thu, 18 Dec 2025 19:33:40 +0800
Subject: [PATCH 22/27] =?UTF-8?q?=E5=AE=8C=E5=96=84resample=E8=AE=A1?=
 =?UTF-8?q?=E7=AE=97=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: QAiCode <229242333@qq.com>
---
 cuda_resample_double.cu |   5 +-
 cuda_resample_float.cu  | 141 +++++++++++++++++++++++-----------------
 2 files changed, 85 insertions(+), 61 deletions(-)

diff --git a/cuda_resample_double.cu b/cuda_resample_double.cu
index 0ca63c4..1c5658b 100644
--- a/cuda_resample_double.cu
+++ b/cuda_resample_double.cu
@@ -498,7 +498,10 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
 
   if (upFactor == downFactor)
   {
-    outputSignal = inputSignal;
+    for (int i = 0; i < inputSize; i++)
+    {
+      outputSignal[i] = inputSignal[i];
+    }
     return;
   }
 
diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
index 24e6150..5bece1c 100644
--- a/cuda_resample_float.cu
+++ b/cuda_resample_float.cu
@@ -26,6 +26,7 @@ __device__ void resampler_init_state_device_float(DeviceResamplerStateFloat *sta
                                                   int coefsPerPhase, int upRate,
                                                   int downRate)
 {
+
   state->_t = 0;
   state->_xOffset = 0;
   state->_transposedCoefs = transposedCoefs;
@@ -293,15 +294,17 @@ __device__ __forceinline__ void dev_fill_float(float *data, int size, float valu
 __device__ int dev_firls_float(float *result, int length, float *freq, const float *amplitude,
                                int freqSize)
 {
+  // 最大固定大小，根据GPU能力调整
+  const int MAX_WEIGHT_SIZE = 256;
+  const int MAX_K_SIZE = 256;
+  const int MAX_B_SIZE = 256;
+  const int MAX_A_SIZE = 256;
+
   // 计算权重大小
   int weightSize = freqSize / 2;
 
   // 初始化权重向量
-  float *weight = new float[weightSize];
-  if (weight == nullptr)
-  {
-    return -1;
-  }
+  __shared__ float weight[MAX_WEIGHT_SIZE];
 
   // 初始化weight为全1
   dev_fill_float(weight, weightSize, float(1.0));
@@ -320,11 +323,7 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
 
   // 创建和初始化向量k
   int kLength = length + 1;
-  float *k = new float[kLength];
-  if (k == nullptr)
-  {
-    return -1;
-  };
+  __shared__ float k[MAX_K_SIZE];
 
   // 初始化k向量为递增序列：0，1，2...
   dev_iota_float(k, kLength, float(0.0));
@@ -353,12 +352,8 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
   {
     bLength++; // 此处++，因为后面需要在b[0]处插入b0
   }
-  float *b = new float[bLength];
-  if (b == nullptr)
-  {
-    return -1;
-  };
 
+  __shared__ float b[MAX_B_SIZE];
   dev_fill_float(b, bLength, float(0.0));
 
   float b0 = float(0.0);
@@ -413,11 +408,7 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
   float w0 = weight[0];
 
   int aLength = bLength;
-  float *a = new float[aLength];
-  if (a == nullptr)
-  {
-    return -1;
-  };
+  __shared__ float a[MAX_A_SIZE];
 
   // vector<float> result = {a.rbegin(), a.rend()};
   for (int i = 0; i < aLength; i++)
@@ -442,11 +433,6 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
     }
   }
 
-  // 释放动态分配的内存
-  delete[] weight; // 释放内存
-  delete[] k;      // 释放内存
-  delete[] b;      // 释放内存
-  delete[] a;      // 释放内存
   return 0;
 }
 
@@ -481,7 +467,7 @@ __device__ void dev_kaiser_float(float *window, int order, float bta)
 
 __device__ void dev_resample_float(int upFactor, int downFactor,
                                    float *inputSignal, const int inputSize,
-                                   float *outputSignal)
+                                   float *outputSignal, float *workspace)
 {
   const int n = 10;
   const float bta = float(5.0);
@@ -526,11 +512,7 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
   int length = 2 * n * maxFactor + 1;
   int coefficientsLength = length;
 
-  float *coefficients = new float[coefficientsLength];
-  if (coefficients == nullptr)
-  {
-    return;
-  }
+  float *coefficients = workspace;
   int ret = dev_firls_float(coefficients, length - 1, firlsFreqsV, firlsAmplitudeV,
                             freqSize);
   if (ret == -1)
@@ -539,11 +521,7 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
   }
 
   int windowSize = length;
-  float *window = new float[windowSize];
-  if (window == nullptr)
-  {
-    return;
-  }
+  float *window = coefficients + coefficientsLength;
   dev_kaiser_float(window, length, bta);
 
   for (int i = 0; i < coefficientsLength; i++)
@@ -556,11 +534,7 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
 
   // 分配filter空间
   int hSize = coefficientsLength + 2 * nz;
-  float *filter = new float[hSize];
-  if (filter == nullptr)
-  {
-    return;
-  }
+  float *filter = window + windowSize;
 
   int filterLength = 0;
   for (int i = 0; i < nz; i++)
@@ -598,12 +572,7 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
     paddedCoefCount++;
   }
 
-  int coefsPerPhase = paddedCoefCount / upFactor;
-  int padding = coefsPerPhase - 1;
-  int outputCount =
-      ((inputSize + padding) * upFactor + downFactor - 1) / downFactor;
-
-  float *results = new float[outputCount];
+  float *results = filter + filterLength;
   if (results == nullptr)
   {
     return;
@@ -619,11 +588,6 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
     outputSignal[j++] = results[i];
   }
 
-  // 释放动态分配的内存
-  delete[] coefficients;
-  delete[] window;
-  delete[] filter;
-  delete[] results;
   return;
 }
 
@@ -651,7 +615,7 @@ __global__ void ShiftingAndResamplingKernelFloatV1(
     const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
     const int *__restrict__ VOutputLength, const int numResults,
     const int numChannels, const int signalLength, const float CurrentRealfreq,
-    float *I_shifted, float *Q_shifted,
+    float *I_shifted, float *Q_shifted, float *workspace,
     float *__restrict__ outputIdata, float *__restrict__ outputQdata)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -703,8 +667,8 @@ __global__ void ShiftingAndResamplingKernelFloatV1(
   auto Q_resampled = outputQdata + offset + chIdx * outputLength;
 
   // 重采样
-  dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled);
-  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+  dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled, workspace);
+  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled, workspace);
 }
 
 /**
@@ -731,7 +695,7 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
     const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
     const int numResults, const int numChannels, const int signalLength,
     const float CurrentRealfreq, const int alignSignalLength,
-    float *I_shifted, float *Q_shifted,
+    float *I_shifted, float *Q_shifted, float *workspace,
     float *__restrict__ outputIdata, float *__restrict__ outputQdata)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -771,8 +735,30 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
   auto Q_resampled = outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
 
   // 重采样
-  dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled);
-  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+  dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled, workspace);
+  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled, workspace);
+}
+
+int dev_resample_calc_workspace_size(
+    int upFactor, int downFactor, int inputSize)
+{
+  const int n = 10;
+  int maxFactor = max(upFactor, downFactor);
+  int length = 2 * n * maxFactor + 1;
+
+  // 计算各缓冲区大小
+  int coeffsSize = length;
+  int windowSize = length;
+
+  int lengthHalf = (length - 1) / 2;
+  int nz = downFactor - lengthHalf % downFactor;
+  int filterSize = coeffsSize + 2 * nz;
+
+  // 输出大小
+  int outputSize = (inputSize * upFactor + downFactor - 1) / downFactor;
+  int resultsSize = outputSize;
+
+  return coeffsSize + windowSize + filterSize + resultsSize;
 }
 
 /**
@@ -878,13 +864,25 @@ bool ShiftAndResampleSignalFloatV1(
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
                               (numChannels * outputTotalLength * sizeof(float))));
 
+  int upFactor = 1;
+  int workspaceSize = 0;
+  for (int i = 0; i < numResults; i++)
+  {
+    int dFactor = downFactor[i];
+    int size = dev_resample_calc_workspace_size(upFactor, dFactor, signalLength);
+    workspaceSize = max(workspaceSize, size);
+  }
+  float *workspace = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&workspace, (workspaceSize * sizeof(float))));
+
   // 线程数配置
   dim3 block(numChannels);
   dim3 grid((numChannels * numResults + block.x - 1) / block.x);
 
   ShiftingAndResamplingKernelFloatV1<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
-      numChannels, signalLength, CurrentRealfreq, I_shifted, Q_shifted, d_outputIdata, d_outputQdata);
+      numChannels, signalLength, CurrentRealfreq, I_shifted, Q_shifted, workspace,
+      d_outputIdata, d_outputQdata);
 
   // copy重采样计算结果到主存
   // 存储格式：[numResults][numChannels][lengthPerResults]
@@ -952,6 +950,12 @@ bool ShiftAndResampleSignalFloatV1(
     d_outputQdata = nullptr;
   }
 
+  if (workspace)
+  {
+    cudaFree(workspace);
+    workspace = nullptr;
+  }
+
   return true;
 }
 
@@ -1051,13 +1055,24 @@ bool ShiftAndResampleSignalFloatV2(
   CHECK_CUDA_ERROR(cudaMemset(d_outputIdata, 0, totalsize));
   CHECK_CUDA_ERROR(cudaMemset(d_outputQdata, 0, totalsize));
 
+  int upFactor = 1;
+  int workspaceSize = 0;
+  for (int i = 0; i < numResults; i++)
+  {
+    int dFactor = downFactor[i];
+    int size = dev_resample_calc_workspace_size(upFactor, dFactor, signalLength);
+    workspaceSize = max(workspaceSize, size);
+  }
+  float *workspace = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&workspace, (workspaceSize * sizeof(float))));
+
   // 线程数配置，总的线程数：numChannels * numResults
   dim3 block(numChannels);
   dim3 grid((numChannels * numResults + block.x - 1) / block.x);
   ShiftingAndResamplingKernelFloatV2<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, numResults,
       numChannels, signalLength, CurrentRealfreq, alignSignalLength,
-      I_shifted, Q_shifted,
+      I_shifted, Q_shifted, workspace,
       d_outputIdata, d_outputQdata);
 
   // copy重采样计算结果到主存
@@ -1120,5 +1135,11 @@ bool ShiftAndResampleSignalFloatV2(
     d_outputQdata = nullptr;
   }
 
+  if (workspace)
+  {
+    cudaFree(workspace);
+    workspace = nullptr;
+  }
+
   return true;
 }
\ No newline at end of file
-- 
Gitee


From bf6ce29230778d3a90d5655964e4a2163c683270 Mon Sep 17 00:00:00 2001
From: QAiCode <229242333@qq.com>
Date: Sat, 20 Dec 2025 15:59:45 +0800
Subject: [PATCH 23/27] =?UTF-8?q?=E5=AE=8C=E5=96=8410:wq?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cuda_resample_float.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
index 5bece1c..152f72d 100644
--- a/cuda_resample_float.cu
+++ b/cuda_resample_float.cu
@@ -1,4 +1,5 @@
 #include "cuda_resample_float.h"
+#include <cuda/std/cstdlib>
 
 // CHECK_CUDA_ERROR：cuda api调用错误处理
 #define CHECK_CUDA_ERROR(call)                                               \
@@ -35,7 +36,9 @@ __device__ void resampler_init_state_device_float(DeviceResamplerStateFloat *sta
   state->_downRate = downRate;
 
   // 分配状态缓冲区
-  state->_state = new float[coefsPerPhase - 1];
+  // state->_state = new float[coefsPerPhase - 1];
+  int *deviceArray = (int *)malloc(100 * sizeof(int));
+  state->_state = cuda::std::allocator<float>().allocate(100);
 
   // 初始化状态为零
   for (int i = 0; i < coefsPerPhase - 1; i++)
-- 
Gitee


From c926a98bdae12b162a794a04b8b569d5f0cd0504 Mon Sep 17 00:00:00 2001
From: QAiCode <229242333@qq.com>
Date: Sat, 20 Dec 2025 19:40:28 +0800
Subject: [PATCH 24/27] =?UTF-8?q?=E5=AE=8C=E5=96=84resample=2020?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cuda_resample_float.cu | 429 ++++++++++++++++++++++++++++-------------
 1 file changed, 300 insertions(+), 129 deletions(-)

diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
index 152f72d..4c07c72 100644
--- a/cuda_resample_float.cu
+++ b/cuda_resample_float.cu
@@ -1,5 +1,5 @@
 #include "cuda_resample_float.h"
-#include <cuda/std/cstdlib>
+#include <stdio.h>
 
 // CHECK_CUDA_ERROR：cuda api调用错误处理
 #define CHECK_CUDA_ERROR(call)                                               \
@@ -14,6 +14,14 @@
     }                                                                        \
   } while (0)
 
+#define LOG_ERROR_DEVICE(fmt, ...)              \
+  printf("[DEVICE ERROR] %s:%d (%s) " fmt "\n", \
+         __FILE__, __LINE__, __func__, ##__VA_ARGS__)
+
+#define LOG_ERROR_HOST(message)                               \
+  std::cerr << "[HOST ERROR] " << __FILE__ << ":" << __LINE__ \
+            << " (" << __FUNCTION__ << ") " << message << std::endl
+
 inline int quotientCeil(int num1, int num2)
 {
   if (num1 % num2 != 0)
@@ -21,13 +29,77 @@ inline int quotientCeil(int num1, int num2)
   return num1 / num2;
 }
 
+/**
+ * @brief Size the CUDA device-side malloc/free heap for one device.
+ *
+ * The heap is heapSizePercent of the device's total global memory, capped at
+ * maxHeapSizeMB and then raised to at least minHeapSizeMB. The limit is applied
+ * to deviceId and the caller's current device is restored afterwards.
+ * NOTE(review): the CUDA docs require this limit to be set before any kernel
+ * using device-side malloc()/free() is launched -- confirm the call order.
+ * @param deviceId        device to configure (default 0)
+ * @param heapSizePercent percent of global memory, clamped to [1, 50] (default 10)
+ * @param minHeapSizeMB   lower bound of the heap in MB (default 16)
+ * @param maxHeapSizeMB   upper bound of the heap in MB (default 512)
+ * @return 0 on success, -1 if the device is unsupported or a CUDA call fails
+ */
+int SetGPUHeapSize(int deviceId = 0,
+                   float heapSizePercent = 10.0f,
+                   size_t minHeapSizeMB = 16,
+                   size_t maxHeapSizeMB = 512)
+{
+  cudaDeviceProp prop;
+  cudaError_t err = cudaGetDeviceProperties(&prop, deviceId);
+  if (err != cudaSuccess)
+  {
+    std::cerr << "获取设备属性失败: " << cudaGetErrorString(err) << std::endl;
+    return -1;
+  }
+
+  // Device-side malloc needs compute capability 2.0 or newer.
+  if (prop.major < 2)
+  {
+    std::cerr << "设备不支持设备端malloc (计算能力 < 2.0)" << std::endl;
+    std::cerr << "设备计算能力: " << prop.major << "." << prop.minor << std::endl;
+    return -1;
+  }
+
+  // Honour the requested percentage instead of a hard-coded 10%.
+  if (heapSizePercent < 1.0f)
+    heapSizePercent = 1.0f;
+  if (heapSizePercent > 50.0f)
+    heapSizePercent = 50.0f;
+  size_t heapSize = static_cast<size_t>(
+      static_cast<double>(prop.totalGlobalMem) * (heapSizePercent / 100.0));
+
+  // Cap first, then floor, so the minimum wins if the two bounds conflict.
+  const size_t maxHeapSizeBytes = maxHeapSizeMB * 1024 * 1024;
+  const size_t minHeapSizeBytes = minHeapSizeMB * 1024 * 1024;
+  heapSize = (heapSize > maxHeapSizeBytes) ? maxHeapSizeBytes : heapSize;
+  heapSize = (heapSize < minHeapSizeBytes) ? minHeapSizeBytes : heapSize;
+
+  // cudaDeviceSetLimit acts on the current device, so switch to deviceId first.
+  int previousDevice = deviceId;
+  err = cudaGetDevice(&previousDevice);
+  if (err == cudaSuccess)
+    err = cudaSetDevice(deviceId);
+  if (err == cudaSuccess)
+    err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapSize);
+  cudaSetDevice(previousDevice); // best effort: restore the caller's device
+  if (err != cudaSuccess)
+  {
+    std::cerr << "Failed to set malloc heap size: " << cudaGetErrorString(err) << std::endl;
+    return -1;
+  }
+  return 0;
+}
+
 // 设备端Resampler初始化
 __device__ void resampler_init_state_device_float(DeviceResamplerStateFloat *state,
                                                   float *transposedCoefs,
                                                   int coefsPerPhase, int upRate,
                                                   int downRate)
 {
-
   state->_t = 0;
   state->_xOffset = 0;
   state->_transposedCoefs = transposedCoefs;
@@ -36,9 +108,12 @@ __device__ void resampler_init_state_device_float(DeviceResamplerStateFloat *sta
   state->_downRate = downRate;
 
   // 分配状态缓冲区
-  // state->_state = new float[coefsPerPhase - 1];
-  int *deviceArray = (int *)malloc(100 * sizeof(int));
-  state->_state = cuda::std::allocator<float>().allocate(100);
+  state->_state = (float *)malloc((coefsPerPhase - 1) * sizeof(float));
+  if (state->_state == nullptr)
+  {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for state->_state!");
+    return;
+  }
 
   // 初始化状态为零
   for (int i = 0; i < coefsPerPhase - 1; i++)
@@ -145,17 +220,6 @@ __device__ int resampler_apply_device_float(float *in, int inCount, float *out,
   return y - out;
 }
 
-// 设备端：释放Resampler状态
-__device__ void resampler_apply_device_float(
-    DeviceResamplerStateFloat *state)
-{
-  if (state->_state != nullptr)
-  {
-    delete[] state->_state;
-    state->_state = nullptr;
-  }
-}
-
 // 设备端：转置滤波器系数（每个线程执行）
 __device__ void transpose_filter_coefs_device_float(float *transposedCoefs, float *coefs,
                                                     int upRate, int coefCount,
@@ -196,7 +260,12 @@ __device__ void upfirdn_device_float(int upRate, int downRate, float *input,
   int coefsPerPhase = paddedCoefCount / upRate;
 
   // 分配转置系数内存
-  float *transposedCoefs = new float[paddedCoefCount];
+  float *transposedCoefs = (float *)malloc(paddedCoefCount * sizeof(float));
+  if (transposedCoefs == nullptr)
+  {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for transposedCoefs!");
+    return;
+  }
 
   // 转置滤波器系数
   transpose_filter_coefs_device_float(transposedCoefs, filter, upRate, filterLength,
@@ -211,7 +280,12 @@ __device__ void upfirdn_device_float(int upRate, int downRate, float *input,
   int padding = coefsPerPhase - 1;
 
   // 分配填充输入内存
-  float *inputPadded = new float[inLength + padding];
+  float *inputPadded = (float *)malloc((inLength + padding) * sizeof(float));
+  if (inputPadded == nullptr)
+  {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for inputPadded!");
+    return;
+  }
 
   // 复制输入并填充
   for (int i = 0; i < inLength + padding; i++)
@@ -241,24 +315,19 @@ __device__ void upfirdn_device_float(int upRate, int downRate, float *input,
       inputPadded, inLength + padding, results, resultsCountValue, &state);
 
   // 清理设备内存
-  delete[] transposedCoefs;
-  delete[] inputPadded;
-  resampler_apply_device_float(&state);
-}
-
-// 向量版本的设备端upfirdn
-__device__ void upfirdn_device_float(int upRate, int downRate, float *input,
-                                     int inputLength, float *filter, int filterLength,
-                                     float *results)
-{
-  upfirdn_device_float(upRate, downRate, input, inputLength, filter,
-                       filterLength, results, nullptr);
+  free(transposedCoefs);
+  free(inputPadded);
+  if (state._state != nullptr)
+  {
+    free(state._state);
+    state._state = nullptr;
+  }
+  state._transposedCoefs = nullptr;
 }
 
 // 整数向上取整除法
 __device__ __forceinline__ int dev_quotientCeil(int num1, int num2)
 {
-  // 标准的上取整公式：(a + b - 1) / b
   return (num1 + num2 - 1) / num2;
 }
 
@@ -297,17 +366,16 @@ __device__ __forceinline__ void dev_fill_float(float *data, int size, float valu
 __device__ int dev_firls_float(float *result, int length, float *freq, const float *amplitude,
                                int freqSize)
 {
-  // 最大固定大小，根据GPU能力调整
-  const int MAX_WEIGHT_SIZE = 256;
-  const int MAX_K_SIZE = 256;
-  const int MAX_B_SIZE = 256;
-  const int MAX_A_SIZE = 256;
-
   // 计算权重大小
   int weightSize = freqSize / 2;
 
   // 初始化权重向量
-  __shared__ float weight[MAX_WEIGHT_SIZE];
+  float *weight = (float *)malloc(weightSize * sizeof(float));
+  if (weight == nullptr)
+  {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for weight!");
+    return -1;
+  }
 
   // 初始化weight为全1
   dev_fill_float(weight, weightSize, float(1.0));
@@ -326,7 +394,12 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
 
   // 创建和初始化向量k
   int kLength = length + 1;
-  __shared__ float k[MAX_K_SIZE];
+  float *k = (float *)malloc(kLength * sizeof(float));
+  if (k == nullptr)
+  {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for k向量!");
+    return -1;
+  };
 
   // 初始化k向量为递增序列：0，1，2...
   dev_iota_float(k, kLength, float(0.0));
@@ -339,7 +412,6 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
     }
   }
 
-  // k.erase(k.begin());
   if (Nodd)
   {
     for (int i = 0; i < kLength; i++)
@@ -356,7 +428,13 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
     bLength++; // 此处++，因为后面需要在b[0]处插入b0
   }
 
-  __shared__ float b[MAX_B_SIZE];
+  float *b = (float *)malloc(bLength * sizeof(float));
+  if (b == nullptr)
+  {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for b向量!");
+    return -1;
+  };
+
   dev_fill_float(b, bLength, float(0.0));
 
   float b0 = float(0.0);
@@ -411,7 +489,12 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
   float w0 = weight[0];
 
   int aLength = bLength;
-  __shared__ float a[MAX_A_SIZE];
+  float *a = (float *)malloc(aLength * sizeof(float));
+  if (a == nullptr)
+  {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for a向量!");
+    return -1;
+  };
 
   // vector<float> result = {a.rbegin(), a.rend()};
   for (int i = 0; i < aLength; i++)
@@ -436,6 +519,11 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
     }
   }
 
+  // 释放动态分配的内存
+  free(weight);
+  free(k);
+  free(b);
+  free(a);
   return 0;
 }
 
@@ -470,13 +558,14 @@ __device__ void dev_kaiser_float(float *window, int order, float bta)
 
 __device__ void dev_resample_float(int upFactor, int downFactor,
                                    float *inputSignal, const int inputSize,
-                                   float *outputSignal, float *workspace)
+                                   float *outputSignal)
 {
   const int n = 10;
   const float bta = float(5.0);
 
   if (upFactor <= 0 || downFactor <= 0)
   {
+    LOG_ERROR_DEVICE("upFactor and downFactor must be positive integer!");
     return;
   }
 
@@ -495,7 +584,6 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
   }
 
   int outputSize = dev_quotientCeil(inputSize * upFactor, downFactor);
-
   int maxFactor = (upFactor > downFactor) ? upFactor : downFactor;
   float firlsFreq = float(1.0) / float(2.0) / static_cast<float>(maxFactor);
 
@@ -515,16 +603,28 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
   int length = 2 * n * maxFactor + 1;
   int coefficientsLength = length;
 
-  float *coefficients = workspace;
+  float *coefficients = (float *)malloc(coefficientsLength * sizeof(float));
+  if (coefficients == nullptr)
+  {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for coefficients!");
+    return;
+  }
+
   int ret = dev_firls_float(coefficients, length - 1, firlsFreqsV, firlsAmplitudeV,
                             freqSize);
   if (ret == -1)
   {
+    LOG_ERROR_DEVICE("dev_firls_float function error!");
     return;
   }
 
   int windowSize = length;
-  float *window = coefficients + coefficientsLength;
+  float *window = (float *)malloc(windowSize * sizeof(float));
+  if (window == nullptr)
+  {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for window!");
+    return;
+  }
   dev_kaiser_float(window, length, bta);
 
   for (int i = 0; i < coefficientsLength; i++)
@@ -536,8 +636,13 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
   int nz = downFactor - lengthHalf % downFactor;
 
   // 分配filter空间
-  int hSize = coefficientsLength + 2 * nz;
-  float *filter = window + windowSize;
+  int hSize = coefficientsLength + nz;
+  float *filter = (float *)malloc((coefficientsLength + 3 * nz) * sizeof(float));
+  if (filter == nullptr)
+  {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for filter!");
+    return;
+  }
 
   int filterLength = 0;
   for (int i = 0; i < nz; i++)
@@ -575,9 +680,15 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
     paddedCoefCount++;
   }
 
-  float *results = filter + filterLength;
+  int coefsPerPhase = paddedCoefCount / upFactor;
+  int padding = coefsPerPhase - 1;
+  int outputCount =
+      ((inputSize + padding) * upFactor + downFactor - 1) / downFactor;
+
+  float *results = (float *)malloc(outputCount * sizeof(float));
   if (results == nullptr)
   {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for upfirdn results!");
     return;
   }
 
@@ -591,6 +702,11 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
     outputSignal[j++] = results[i];
   }
 
+  // 释放动态分配的内存
+  free(coefficients);
+  free(window);
+  free(filter);
+  free(results);
   return;
 }
 
@@ -609,6 +725,7 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
  * @param numChannels：信号通道数
  * @param signalLength：每个通道的信号长度
  * @param CurrentRealfreq：当前实际频率
+ * @param sampling_rate
  * @param outputIdata：存放所有重采样后的Idata，调用时需确保显存空间足够
  * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
  * @return true or false
@@ -618,23 +735,22 @@ __global__ void ShiftingAndResamplingKernelFloatV1(
     const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
     const int *__restrict__ VOutputLength, const int numResults,
     const int numChannels, const int signalLength, const float CurrentRealfreq,
-    float *I_shifted, float *Q_shifted, float *workspace,
+    const float sampling_rate,
+    float *I_shifted, float *Q_shifted,
     float *__restrict__ outputIdata, float *__restrict__ outputQdata)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= numChannels * numResults)
+  if (idx >= numResults * numChannels)
     return;
 
   // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
   int ResIdx = idx / numChannels; // 第几个检测结果
   int chIdx = idx % numChannels;  // 第几个通道
 
-  const float sampling_rate = float(245.76e6);
-
   float frequency = VFrequency[ResIdx]; // 频率
   float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
-  // 获取当前线程处理的通道数据地址
+  // 获取当前线程处理的原始数据 （某一个通道的原始数据的地址）
   const auto I_orig = origIdata + chIdx * signalLength;
   const auto Q_orig = origQdata + chIdx * signalLength;
 
@@ -670,8 +786,8 @@ __global__ void ShiftingAndResamplingKernelFloatV1(
   auto Q_resampled = outputQdata + offset + chIdx * outputLength;
 
   // 重采样
-  dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled, workspace);
-  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled, workspace);
+  dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled);
+  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
 }
 
 /**
@@ -698,7 +814,8 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
     const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
     const int numResults, const int numChannels, const int signalLength,
     const float CurrentRealfreq, const int alignSignalLength,
-    float *I_shifted, float *Q_shifted, float *workspace,
+    const float sampling_rate,
+    float *I_shifted, float *Q_shifted,
     float *__restrict__ outputIdata, float *__restrict__ outputQdata)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -709,8 +826,6 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
   int ResIdx = idx / numChannels; // 第几个检测结果
   int chIdx = idx % numChannels;  // 第几个通道
 
-  const float sampling_rate = float(245.76e6);
-
   float frequency = VFrequency[ResIdx]; // 频率
   float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
@@ -738,30 +853,8 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
   auto Q_resampled = outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
 
   // 重采样
-  dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled, workspace);
-  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled, workspace);
-}
-
-int dev_resample_calc_workspace_size(
-    int upFactor, int downFactor, int inputSize)
-{
-  const int n = 10;
-  int maxFactor = max(upFactor, downFactor);
-  int length = 2 * n * maxFactor + 1;
-
-  // 计算各缓冲区大小
-  int coeffsSize = length;
-  int windowSize = length;
-
-  int lengthHalf = (length - 1) / 2;
-  int nz = downFactor - lengthHalf % downFactor;
-  int filterSize = coeffsSize + 2 * nz;
-
-  // 输出大小
-  int outputSize = (inputSize * upFactor + downFactor - 1) / downFactor;
-  int resultsSize = outputSize;
-
-  return coeffsSize + windowSize + filterSize + resultsSize;
+  dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled);
+  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
 }
 
 /**
@@ -796,14 +889,71 @@ bool ShiftAndResampleSignalFloatV1(
     float *outputIdata,
     float *outputQdata)
 {
+  // 参数合法性检查
+  if (outputTotalLength <= 0)
+  {
+    LOG_ERROR_HOST("outputTotalLength <= 0");
+    return false;
+  }
+  if (numResults <= 0)
+  {
+    LOG_ERROR_HOST("numResults <= 0");
+    return false;
+  }
+  if (numChannels <= 0)
+  {
+    LOG_ERROR_HOST("numChannels <= 0");
+    return false;
+  }
+
+  if (outputLength.size() != numResults)
+  {
+    LOG_ERROR_HOST("vector outputLength lenght != numResults");
+    return false;
+  }
+  if (downFactor.size() != numResults)
+  {
+    LOG_ERROR_HOST("vector downFactor lenght != numResults");
+    return false;
+  }
+  if (detectFreq.size() != numResults)
+  {
+    LOG_ERROR_HOST("vector detectFreq lenght != numResults");
+    return false;
+  }
+
+  if (outputIdata == nullptr)
+  {
+    LOG_ERROR_HOST("outputIdata is null ptr");
+    return false;
+  }
+  if (outputQdata == nullptr)
+  {
+    LOG_ERROR_HOST("outputQdata is null ptr");
+    return false;
+  }
+
   // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
   int signalLength = origIdata[0].size();
+  if (signalLength <= 0)
+  {
+    LOG_ERROR_HOST("signalLength <= 0");
+    return false;
+  }
+
+  // 设置CUDA设备堆大小以便支持设备端malloc/free
+  int deviceId = 0;
+  float heapSizePercent = 10.0f;
+  size_t minHeapSizeMB = 16;
+  size_t maxHeapSizeMB = 512;
+  SetGPUHeapSize(deviceId, heapSizePercent, minHeapSizeMB, maxHeapSizeMB);
 
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
-  // copy下采样率，频率等数据到显存中
+  // copy下采样率，频率，输出信号长度等数据到显存中
   int *d_downFactor = nullptr;
   int *d_outputLength = nullptr;
   float *d_frequency = nullptr;
+
   // 申请显存
   CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
@@ -815,17 +965,18 @@ bool ShiftAndResampleSignalFloatV1(
                               numResults * sizeof(int),
                               cudaMemcpyHostToDevice));
 
-  // copy频率到显存中
-  const float *src_frequency = detectFreq.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(float), cudaMemcpyHostToDevice));
-
   // copy每个带宽，重采样后输出信号长度到显存中
   const int *src_outputLength = outputLength.data();
   CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, src_outputLength,
                               numResults * sizeof(int),
                               cudaMemcpyHostToDevice));
 
+  // copy频率到显存中
+  const float *src_frequency = detectFreq.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
+                              numResults * sizeof(float),
+                              cudaMemcpyHostToDevice));
+
   // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
   float *d_Idata = nullptr;
   float *d_Qdata = nullptr;
@@ -838,13 +989,13 @@ bool ShiftAndResampleSignalFloatV1(
   size_t copySize = signalLength * sizeof(float);
   for (int i = 0; i < numChannels; i++)
   {
-    // copy 原始的idata 到gpu显存
+    // copy 原始的 idata 到gpu显存
     float *dst_idata = d_Idata + i * signalLength;
     const void *src_idata = origIdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
 
-    // copy 原始的qdata 到gpu显存
+    // copy 原始的 qdata 到gpu显存
     float *dst_qdata = d_Qdata + i * signalLength;
     const void *src_qdata = origQdata[i].data();
     CHECK_CUDA_ERROR(
@@ -867,24 +1018,14 @@ bool ShiftAndResampleSignalFloatV1(
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
                               (numChannels * outputTotalLength * sizeof(float))));
 
-  int upFactor = 1;
-  int workspaceSize = 0;
-  for (int i = 0; i < numResults; i++)
-  {
-    int dFactor = downFactor[i];
-    int size = dev_resample_calc_workspace_size(upFactor, dFactor, signalLength);
-    workspaceSize = max(workspaceSize, size);
-  }
-  float *workspace = nullptr;
-  CHECK_CUDA_ERROR(cudaMalloc(&workspace, (workspaceSize * sizeof(float))));
-
   // 线程数配置
   dim3 block(numChannels);
-  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
-
+  dim3 grid((numResults * numChannels + block.x - 1) / block.x);
+  const float sampling_rate = float(245.76e6);
   ShiftingAndResamplingKernelFloatV1<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
-      numChannels, signalLength, CurrentRealfreq, I_shifted, Q_shifted, workspace,
+      numChannels, signalLength, CurrentRealfreq, sampling_rate,
+      I_shifted, Q_shifted,
       d_outputIdata, d_outputQdata);
 
   // copy重采样计算结果到主存
@@ -953,12 +1094,6 @@ bool ShiftAndResampleSignalFloatV1(
     d_outputQdata = nullptr;
   }
 
-  if (workspace)
-  {
-    cudaFree(workspace);
-    workspace = nullptr;
-  }
-
   return true;
 }
 
@@ -992,8 +1127,59 @@ bool ShiftAndResampleSignalFloatV2(
     float *outputIdata,
     float *outputQdata)
 {
+  // 参数合法性检查
+  if (alignSignalLength <= 0)
+  {
+    LOG_ERROR_HOST("alignSignalLength <= 0");
+    return false;
+  }
+  if (numResults <= 0)
+  {
+    LOG_ERROR_HOST("numResults <= 0");
+    return false;
+  }
+  if (numChannels <= 0)
+  {
+    LOG_ERROR_HOST("numChannels <= 0");
+    return false;
+  }
+
+  if (downFactor.size() != numResults)
+  {
+    LOG_ERROR_HOST("vector downFactor lenght != numResults");
+    return false;
+  }
+  if (detectFreq.size() != numResults)
+  {
+    LOG_ERROR_HOST("vector detectFreq lenght != numResults");
+    return false;
+  }
+
+  if (outputIdata == nullptr)
+  {
+    LOG_ERROR_HOST("outputIdata is null ptr");
+    return false;
+  }
+  if (outputQdata == nullptr)
+  {
+    LOG_ERROR_HOST("outputQdata is null ptr");
+    return false;
+  }
+
   // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
   int signalLength = origIdata[0].size();
+  if (signalLength <= 0)
+  {
+    LOG_ERROR_HOST("signalLength <= 0");
+    return false;
+  }
+
+  // 设置CUDA设备堆大小以便支持设备端malloc/free
+  int deviceId = 0;
+  float heapSizePercent = 10.0f;
+  size_t minHeapSizeMB = 16;
+  size_t maxHeapSizeMB = 512;
+  SetGPUHeapSize(deviceId, heapSizePercent, minHeapSizeMB, maxHeapSizeMB);
 
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
   // copy下采样率，频率等数据到显存中
@@ -1058,24 +1244,15 @@ bool ShiftAndResampleSignalFloatV2(
   CHECK_CUDA_ERROR(cudaMemset(d_outputIdata, 0, totalsize));
   CHECK_CUDA_ERROR(cudaMemset(d_outputQdata, 0, totalsize));
 
-  int upFactor = 1;
-  int workspaceSize = 0;
-  for (int i = 0; i < numResults; i++)
-  {
-    int dFactor = downFactor[i];
-    int size = dev_resample_calc_workspace_size(upFactor, dFactor, signalLength);
-    workspaceSize = max(workspaceSize, size);
-  }
-  float *workspace = nullptr;
-  CHECK_CUDA_ERROR(cudaMalloc(&workspace, (workspaceSize * sizeof(float))));
-
   // 线程数配置，总的线程数：numChannels * numResults
   dim3 block(numChannels);
   dim3 grid((numChannels * numResults + block.x - 1) / block.x);
+  const float sampling_rate = float(245.76e6);
   ShiftingAndResamplingKernelFloatV2<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, numResults,
       numChannels, signalLength, CurrentRealfreq, alignSignalLength,
-      I_shifted, Q_shifted, workspace,
+      sampling_rate,
+      I_shifted, Q_shifted,
       d_outputIdata, d_outputQdata);
 
   // copy重采样计算结果到主存
@@ -1138,11 +1315,5 @@ bool ShiftAndResampleSignalFloatV2(
     d_outputQdata = nullptr;
   }
 
-  if (workspace)
-  {
-    cudaFree(workspace);
-    workspace = nullptr;
-  }
-
   return true;
 }
\ No newline at end of file
-- 
Gitee


From 354426caff294d09c3f4957ee3aac950a767e0b9 Mon Sep 17 00:00:00 2001
From: QAiCode <229242333@qq.com>
Date: Sat, 20 Dec 2025 20:35:46 +0800
Subject: [PATCH 25/27] =?UTF-8?q?i=E5=AE=8C=E5=96=84resample=2021?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cuda_resample_float.cu | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
index 4c07c72..9c69458 100644
--- a/cuda_resample_float.cu
+++ b/cuda_resample_float.cu
@@ -118,7 +118,7 @@ __device__ void resampler_init_state_device_float(DeviceResamplerStateFloat *sta
   // 初始化状态为零
   for (int i = 0; i < coefsPerPhase - 1; i++)
   {
-    state->_state[i] = 0;
+    state->_state[i] = float(0);
   }
 }
 
@@ -228,7 +228,7 @@ __device__ void transpose_filter_coefs_device_float(float *transposedCoefs, floa
   // 初始化转置系数为零
   for (int i = 0; i < upRate * coefsPerPhase; i++)
   {
-    transposedCoefs[i] = 0;
+    transposedCoefs[i] = float(0);
   }
 
   // 转置并翻转每个相位
@@ -584,8 +584,10 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
   }
 
   int outputSize = dev_quotientCeil(inputSize * upFactor, downFactor);
+
   int maxFactor = (upFactor > downFactor) ? upFactor : downFactor;
   float firlsFreq = float(1.0) / float(2.0) / static_cast<float>(maxFactor);
+  int length = 2 * n * maxFactor + 1;
 
   float firlsFreqsV[4];
   firlsFreqsV[0] = float(0.0);
@@ -600,7 +602,6 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
   firlsAmplitudeV[3] = float(0.0);
 
   int freqSize = 4;
-  int length = 2 * n * maxFactor + 1;
   int coefficientsLength = length;
 
   float *coefficients = (float *)malloc(coefficientsLength * sizeof(float));
-- 
Gitee


From 6d3cab7c8db3ff525c3dce05ce932be18d8764fe Mon Sep 17 00:00:00 2001
From: wsqRichards <229242333@qq.com>
Date: Mon, 22 Dec 2025 09:58:43 +0800
Subject: [PATCH 26/27] =?UTF-8?q?=E5=AE=8C=E5=96=84resample-22?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cuda_resample.cpp       | 101 +++++++
 cuda_resample.h         | 125 ++++++++
 cuda_resample_double.cu | 477 ++++++++++++-------------------
 cuda_resample_double.h  |  50 ++--
 cuda_resample_float.cu  | 611 +++++++++++++---------------------------
 cuda_resample_float.h   |  49 ++--
 6 files changed, 633 insertions(+), 780 deletions(-)
 create mode 100644 cuda_resample.cpp
 create mode 100644 cuda_resample.h

diff --git a/cuda_resample.cpp b/cuda_resample.cpp
new file mode 100644
index 0000000..e93b584
--- /dev/null
+++ b/cuda_resample.cpp
@@ -0,0 +1,101 @@
+#ifndef CUDA_RESAMPLE_H
+#define CUDA_RESAMPLE_H
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <stdio.h>
+
+#include <iostream>
+
+/**
+ * @brief Integer division rounded up (toward positive infinity).
+ *
+ * Matches the mathematical ceil(num1 / num2) using integer arithmetic only.
+ * Examples: 7/3 -> 3; -7/3 -> -2; 7/-3 -> -2.
+ *
+ * @param[in] num1 dividend (numerator)
+ * @param[in] num2 divisor (denominator), must not be 0
+ * @return int the quotient rounded up
+ * @note
+ *   1. A zero divisor prints an error to stderr and returns 0.
+ *   2. The quotient is incremented only when the remainder is non-zero and
+ *      both operands have the same sign, so mixed signs never over-round.
+ *   3. Unlike std::ceil, no floating-point conversion is involved.
+ * @warning Returning 0 for a zero divisor is a stopgap, not a valid result.
+ */
+int quotientCeil(int num1, int num2) {
+  // A zero divisor cannot be divided by: report it and fall back to 0.
+  if (num2 == 0) {
+    std::cerr << "Division by zero error" << std::endl;
+    return 0;
+  }
+
+  // Integer division truncates toward zero, which already is the ceiling
+  // whenever the exact quotient is negative or the division is exact.
+  const int truncated = num1 / num2;
+  const bool hasRemainder = (num1 % num2) != 0;
+
+  // With a non-zero remainder neither operand is 0, so comparing the two
+  // "is positive" flags tells whether the operands share a sign.
+  const bool sameSign = (num1 > 0) == (num2 > 0);
+
+  return (hasRemainder && sameSign) ? truncated + 1 : truncated;
+}
+
+/**
+ * @brief Size the CUDA device heap that backs device-side malloc/free.
+ *
+ * The heap is heapSizePercent of total global memory, clamped into
+ * [minHeapSizeMB, maxHeapSizeMB]; an already sufficient limit is left as is.
+ *
+ * @param deviceId device whose properties are queried (default 0).
+ *        NOTE(review): the limit is applied to the current device - confirm.
+ * @param heapSizePercent percent of global memory (default 10, clamped to 1-50)
+ * @param minHeapSizeMB lower bound of the heap size in MB (default 16)
+ * @param maxHeapSizeMB upper bound of the heap size in MB (default 512)
+ * @return 0 on success, -1 on failure
+ */
+int SetGPUHeapSize(int deviceId = 0, float heapSizePercent = 10.0f,
+                   size_t minHeapSizeMB = 16, size_t maxHeapSizeMB = 512) {
+  cudaDeviceProp prop;
+  cudaError_t err = cudaGetDeviceProperties(&prop, deviceId);
+  if (err != cudaSuccess) {
+    std::cerr << "获取设备属性失败: " << cudaGetErrorString(err) << std::endl;
+    return -1;
+  }
+
+  // Device-side malloc needs compute capability 2.0 or newer.
+  if (prop.major < 2) {
+    std::cerr << "设备不支持设备端malloc (计算能力 < 2.0)" << std::endl;
+    std::cerr << "设备计算能力: " << prop.major << "." << prop.minor
+              << std::endl;
+    return -1;
+  }
+
+  // Use the caller's percentage, kept within the documented 1-50 range.
+  float percent = (heapSizePercent >= 1.0f) ? heapSizePercent : 1.0f;
+  if (percent > 50.0f) percent = 50.0f;
+  size_t heapSize =
+      static_cast<size_t>(prop.totalGlobalMem * (percent / 100.0));
+
+  // The lower bound is applied last, so it wins if the bounds conflict.
+  const size_t maxHeapSizeBytes = maxHeapSizeMB * 1024 * 1024;
+  const size_t minHeapSizeBytes = minHeapSizeMB * 1024 * 1024;
+  if (heapSize > maxHeapSizeBytes) heapSize = maxHeapSizeBytes;
+  if (heapSize < minHeapSizeBytes) heapSize = minHeapSizeBytes;
+
+  // Keep a limit that is already big enough: CUDA rejects changes once a
+  // kernel that uses device malloc/free has run.
+  size_t currentLimit = 0;
+  err = cudaDeviceGetLimit(&currentLimit, cudaLimitMallocHeapSize);
+  if (err == cudaSuccess && currentLimit >= heapSize) return 0;
+  // Report a rejected limit instead of silently ignoring it.
+  err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapSize);
+  if (err != cudaSuccess) {
+    std::cerr << "设置设备堆大小失败: " << cudaGetErrorString(err) << std::endl;
+    return -1;
+  }
+  return 0;
+}
+
+#endif  // CUDA_RESAMPLE_H
\ No newline at end of file
diff --git a/cuda_resample.h b/cuda_resample.h
new file mode 100644
index 0000000..377eb63
--- /dev/null
+++ b/cuda_resample.h
@@ -0,0 +1,125 @@
+#ifndef CUDA_RESAMPLE_H
+#define CUDA_RESAMPLE_H
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <stdio.h>
+
+#include <iostream>
+#include <stdexcept>
+
+/**
+ * @brief CUDA runtime error-check macro that reports the call site and throws.
+ *
+ * Wraps a CUDA API call and, when it does not return cudaSuccess:
+ * 1. prints the current function name, file name and line number,
+ * 2. prints the human-readable description from cudaGetErrorString,
+ * 3. throws std::runtime_error to the caller.
+ *
+ * The do-while(0) wrapper makes the macro expand to a single statement in any
+ * context (for example after an unbraced if) and keeps its local variable out
+ * of the enclosing scope.
+ *
+ * @param call CUDA API call expression to check (cudaMalloc, cudaMemcpy, ...)
+ * @note Requires <cuda_runtime.h>, <iostream> and <stdexcept>.
+ * @warning The exception unwinds the current call stack; resources acquired
+ *          before the failing call are not released by this macro.
+ * @example
+ *   CHECK_CUDA_ERROR(cudaMalloc(&dev_ptr, size));
+ */
+#define CHECK_CUDA_ERROR(call)                                                \
+  do {                                                                        \
+    cudaError_t err_ = (call);                                                \
+    if (err_ != cudaSuccess) {                                                \
+      std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":"       \
+                << __LINE__ << ": " << cudaGetErrorString(err_) << std::endl; \
+      throw std::runtime_error("CUDA error");                                 \
+    }                                                                         \
+  } while (0)
+
+/**
+ * @brief Device-side (GPU/CUDA) error logging macro.
+ *
+ * Prints an error message with C-style printf, accepting a format string plus
+ * variadic arguments, and prefixes it with the file name, line and function.
+ *
+ * @note 1. Relies on variadic macros and the GNU `##__VA_ARGS__` extension.
+ *       2. printf is used because std::cerr is not usable in device code.
+ *       3. fmt follows printf rules, e.g. "%d" for int, "%s" for strings.
+ *
+ * @param[in] fmt printf-style format string literal (for example
+ *            "GPU memory allocation failed, size=%d bytes")
+ * @param[in] ... arguments matching the placeholders in fmt
+ *
+ * @example
+ * // Inside a CUDA kernel or device function
+ * __device__ void cuda_kernel() {
+ *     if (threadIdx.x >= blockDim.x) {
+ *         LOG_ERROR_DEVICE("Invalid thread index: %u (max allowed: %u)",
+ *                         threadIdx.x, blockDim.x - 1);
+ *     }
+ * }
+ */
+#define LOG_ERROR_DEVICE(fmt, ...)                                            \
+  printf("[DEVICE ERROR] %s:%d (%s) " fmt "\n", __FILE__, __LINE__, __func__, \
+         ##__VA_ARGS__)
+
+/**
+ * @brief Host-side (CPU) error logging macro.
+ *
+ * Writes an error message through std::cerr, so stream-insertion syntax can
+ * be used, and prefixes it with the file name, line and function name. The
+ * output goes to stderr, not stdout (e.g. collect it with `2> error.log`).
+ *
+ * @note 1. Host C++ code only; std::cerr is not usable in CUDA device code.
+ *       2. message may be anything insertable into a std::ostream (strings,
+ *          numbers, user types overloading <<).
+ *       3. std::endl is appended, so no trailing "\n" is needed.
+ *       4. Expands to a bare expression without a trailing semicolon.
+ *
+ * @param[in] message error text; several values can be chained with <<, e.g.
+ *            "Load model failed: " << filename
+ *
+ * @example
+ * // In a host function
+ * void load_config_file(const std::string& path) {
+ *     std::ifstream file(path);
+ *     if (!file.is_open()) {
+ *         LOG_ERROR_HOST("Failed to open config file: " << path << " (error: "
+ *                        << strerror(errno) << ")");
+ *     }
+ * }
+ */
+#define LOG_ERROR_HOST(message)                                       \
+  std::cerr << "[HOST ERROR] " << __FILE__ << ":" << __LINE__ << " (" \
+            << __FUNCTION__ << ") " << message << std::endl
+
+/**
+ * @brief Integer division rounded up (toward positive infinity).
+ *
+ * Matches the mathematical ceil(num1 / num2) using integer arithmetic only.
+ * Examples: 7/3 -> 3; -7/3 -> -2; 7/-3 -> -2.
+ *
+ * @param[in] num1 dividend (numerator)
+ * @param[in] num2 divisor (denominator), must not be 0
+ * @return int the quotient rounded up
+ * @note
+ *   1. A zero divisor prints an error to stderr and returns 0.
+ *   2. The quotient is incremented only when the remainder is non-zero and
+ *      both operands have the same sign, so mixed signs never over-round.
+ *   3. Unlike std::ceil, no floating-point conversion is involved.
+ * @warning Returning 0 for a zero divisor is a stopgap, not a valid result.
+ */
+int quotientCeil(int num1, int num2);
+
+/**
+ * @brief Set the CUDA device heap size for device-side malloc/free.
+ * @param deviceId device ID (no default arguments on this declaration)
+ * @param heapSizePercent intended share of global memory, percent (1-50)
+ * @param minHeapSizeMB minimum heap size in MB
+ * @param maxHeapSizeMB maximum heap size in MB
+ * @return 0 on success, -1 on failure
+ */
+int SetGPUHeapSize(int deviceId, float heapSizePercent, size_t minHeapSizeMB,
+                   size_t maxHeapSizeMB);
+
+#endif  // CUDA_RESAMPLE_H
\ No newline at end of file
diff --git a/cuda_resample_double.cu b/cuda_resample_double.cu
index 1c5658b..23970bc 100644
--- a/cuda_resample_double.cu
+++ b/cuda_resample_double.cu
@@ -1,31 +1,11 @@
+#include "cuda_resample.h"
 #include "cuda_resample_double.h"
 
-// CHECK_CUDA_ERROR：cuda api调用错误处理
-#define CHECK_CUDA_ERROR(call)                                               \
-  do                                                                         \
-  {                                                                          \
-    cudaError_t err = call;                                                  \
-    if (err != cudaSuccess)                                                  \
-    {                                                                        \
-      std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":"      \
-                << __LINE__ << ": " << cudaGetErrorString(err) << std::endl; \
-      throw std::runtime_error("CUDA error");                                \
-    }                                                                        \
-  } while (0)
-
-inline int quotientCeil(int num1, int num2)
-{
-  if (num1 % num2 != 0)
-    return num1 / num2 + 1;
-  return num1 / num2;
-}
-
 // 设备端Resampler初始化
 __device__ void resampler_apply_device_double(DeviceResamplerStateDouble *state,
                                               double *transposedCoefs,
                                               int coefsPerPhase, int upRate,
-                                              int downRate)
-{
+                                              int downRate) {
   state->_t = 0;
   state->_xOffset = 0;
   state->_transposedCoefs = transposedCoefs;
@@ -37,22 +17,19 @@ __device__ void resampler_apply_device_double(DeviceResamplerStateDouble *state,
   state->_state = new double[coefsPerPhase - 1];
 
   // 初始化状态为零
-  for (int i = 0; i < coefsPerPhase - 1; i++)
-  {
+  for (int i = 0; i < coefsPerPhase - 1; i++) {
     state->_state[i] = 0;
   }
 }
 
 // 设备端：计算所需输出数量
 __device__ int resampler_needed_out_count_device_double(
-    int inCount, DeviceResamplerStateDouble *state)
-{
+    int inCount, DeviceResamplerStateDouble *state) {
   int np = inCount * state->_upRate;
   int need = np / state->_downRate;
 
   if ((state->_t + state->_upRate * state->_xOffset) <
-      (np % state->_downRate))
-  {
+      (np % state->_downRate)) {
     need++;
   }
 
@@ -60,12 +37,10 @@ __device__ int resampler_needed_out_count_device_double(
 }
 
 // 设备端：应用重采样
-__device__ int resampler_apply_device_double(double *in, int inCount, double *out,
-                                             int outCount,
-                                             DeviceResamplerStateDouble *state)
-{
-  if (outCount < resampler_needed_out_count_device_double(inCount, state))
-  {
+__device__ int resampler_apply_device_double(
+    double *in, int inCount, double *out, int outCount,
+    DeviceResamplerStateDouble *state) {
+  if (outCount < resampler_needed_out_count_device_double(inCount, state)) {
     // 在设备端无法抛出异常，返回错误代码
     return -1;
   }
@@ -75,28 +50,24 @@ __device__ int resampler_apply_device_double(double *in, int inCount, double *ou
   double *y = out;
   double *end = in + inCount;
 
-  while (x < end)
-  {
+  while (x < end) {
     double acc = 0;
     double *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase;
     double *xPtr = x - state->_coefsPerPhase + 1;
 
     int offset = in - xPtr;
-    if (offset > 0)
-    {
+    if (offset > 0) {
       // 需要从_state缓冲区中获取
       double *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset;
 
-      while (statePtr < state->_state + (state->_coefsPerPhase - 1))
-      {
+      while (statePtr < state->_state + (state->_coefsPerPhase - 1)) {
         acc += (*statePtr++) * (*h++);
       }
 
       xPtr += offset;
     }
 
-    while (xPtr <= x)
-    {
+    while (xPtr <= x) {
       acc += (*xPtr++) * (*h++);
     }
 
@@ -113,26 +84,20 @@ __device__ int resampler_apply_device_double(double *in, int inCount, double *ou
   // 管理_state缓冲区
   int retain = (state->_coefsPerPhase - 1) - inCount;
 
-  if (retain > 0)
-  {
+  if (retain > 0) {
     // 对于小于状态缓冲区的inCount，将缓冲区的末尾复制到开头
-    for (int i = 0; i < retain; i++)
-    {
+    for (int i = 0; i < retain; i++) {
       state->_state[i] =
           state->_state[(state->_coefsPerPhase - 1) - retain + i];
     }
 
     // 然后将整个（短）输入复制到缓冲区末尾
-    for (int i = 0; i < inCount; i++)
-    {
+    for (int i = 0; i < inCount; i++) {
       state->_state[retain + i] = in[i];
     }
-  }
-  else
-  {
+  } else {
     // 只将最后几个输入样本复制到状态缓冲区
-    for (int i = 0; i < state->_coefsPerPhase - 1; i++)
-    {
+    for (int i = 0; i < state->_coefsPerPhase - 1; i++) {
       state->_state[i] = *end - (double)(state->_coefsPerPhase - 1) + (double)i;
     }
   }
@@ -143,33 +108,27 @@ __device__ int resampler_apply_device_double(double *in, int inCount, double *ou
 
 // 设备端：释放Resampler状态
 __device__ void resampler_apply_device_double(
-    DeviceResamplerStateDouble *state)
-{
-  if (state->_state != nullptr)
-  {
+    DeviceResamplerStateDouble *state) {
+  if (state->_state != nullptr) {
     delete[] state->_state;
     state->_state = nullptr;
   }
 }
 
 // 设备端：转置滤波器系数（每个线程执行）
-__device__ void transpose_filter_coefs_device_double(double *transposedCoefs, double *coefs,
-                                                     int upRate, int coefCount,
-                                                     int coefsPerPhase)
-{
+__device__ void transpose_filter_coefs_device_double(double *transposedCoefs,
+                                                     double *coefs, int upRate,
+                                                     int coefCount,
+                                                     int coefsPerPhase) {
   // 初始化转置系数为零
-  for (int i = 0; i < upRate * coefsPerPhase; i++)
-  {
+  for (int i = 0; i < upRate * coefsPerPhase; i++) {
     transposedCoefs[i] = 0;
   }
 
   // 转置并翻转每个相位
-  for (int i = 0; i < upRate; ++i)
-  {
-    for (int j = 0; j < coefsPerPhase; ++j)
-    {
-      if (j * upRate + i < coefCount)
-      {
+  for (int i = 0; i < upRate; ++i) {
+    for (int j = 0; j < coefsPerPhase; ++j) {
+      if (j * upRate + i < coefCount) {
         transposedCoefs[(coefsPerPhase - 1 - j) + i * coefsPerPhase] =
             coefs[j * upRate + i];
       }
@@ -179,13 +138,12 @@ __device__ void transpose_filter_coefs_device_double(double *transposedCoefs, do
 
 // 设备端upfirdn主函数
 __device__ void upfirdn_device_double(int upRate, int downRate, double *input,
-                                      int inLength, double *filter, int filterLength,
-                                      double *results, int *resultsCount)
-{
+                                      int inLength, double *filter,
+                                      int filterLength, double *results,
+                                      int *resultsCount) {
   // 计算填充后的系数数量
   int paddedCoefCount = filterLength;
-  while (paddedCoefCount % upRate)
-  {
+  while (paddedCoefCount % upRate) {
     paddedCoefCount++;
   }
 
@@ -195,8 +153,8 @@ __device__ void upfirdn_device_double(int upRate, int downRate, double *input,
   double *transposedCoefs = new double[paddedCoefCount];
 
   // 转置滤波器系数
-  transpose_filter_coefs_device_double(transposedCoefs, filter, upRate, filterLength,
-                                       coefsPerPhase);
+  transpose_filter_coefs_device_double(transposedCoefs, filter, upRate,
+                                       filterLength, coefsPerPhase);
 
   // 创建Resampler状态
   DeviceResamplerStateDouble state;
@@ -210,14 +168,10 @@ __device__ void upfirdn_device_double(int upRate, int downRate, double *input,
   double *inputPadded = new double[inLength + padding];
 
   // 复制输入并填充
-  for (int i = 0; i < inLength + padding; i++)
-  {
-    if (i < inLength)
-    {
+  for (int i = 0; i < inLength + padding; i++) {
+    if (i < inLength) {
       inputPadded[i] = input[i];
-    }
-    else
-    {
+    } else {
       inputPadded[i] = 0;
     }
   }
@@ -227,8 +181,7 @@ __device__ void upfirdn_device_double(int upRate, int downRate, double *input,
       resampler_needed_out_count_device_double(inLength + padding, &state);
 
   // 设置输出计数
-  if (resultsCount != nullptr)
-  {
+  if (resultsCount != nullptr) {
     *resultsCount = resultsCountValue;
   }
 
@@ -244,25 +197,21 @@ __device__ void upfirdn_device_double(int upRate, int downRate, double *input,
 
 // 向量版本的设备端upfirdn
 __device__ void upfirdn_device_double(int upRate, int downRate, double *input,
-                                      int inputLength, double *filter, int filterLength,
-                                      double *results)
-{
+                                      int inputLength, double *filter,
+                                      int filterLength, double *results) {
   upfirdn_device_double(upRate, downRate, input, inputLength, filter,
                         filterLength, results, nullptr);
 }
 
 // 整数向上取整除法
-__device__ __forceinline__ int dev_quotientCeil(int num1, int num2)
-{
+__device__ __forceinline__ int dev_quotientCeil(int num1, int num2) {
   // 标准的上取整公式：(a + b - 1) / b
   return (num1 + num2 - 1) / num2;
 }
 
 // CUDA设备端GCD函数:最大公约数
-__device__ __forceinline__ int dev_gcd(int a, int b)
-{
-  while (b != 0)
-  {
+__device__ __forceinline__ int dev_gcd(int a, int b) {
+  while (b != 0) {
     int temp = b;
     b = a % b;
     a = temp;
@@ -271,35 +220,31 @@ __device__ __forceinline__ int dev_gcd(int a, int b)
 }
 
 // 生成连续递增的序列
-__device__ __forceinline__ void dev_iota_double(double *data, int size, double start)
-{
-  for (int i = 0; i < size; i++)
-  {
+__device__ __forceinline__ void dev_iota_double(double *data, int size,
+                                                double start) {
+  for (int i = 0; i < size; i++) {
     data[i] = start + double(i);
   }
   return;
 }
 
 // 填充data为value
-__device__ __forceinline__ void dev_fill_double(double *data, int size, double value)
-{
-  for (int i = 0; i < size; i++)
-  {
+__device__ __forceinline__ void dev_fill_double(double *data, int size,
+                                                double value) {
+  for (int i = 0; i < size; i++) {
     data[i] = value;
   }
   return;
 }
 
-__device__ int dev_firls_double(double *result, int length, double *freq, const double *amplitude,
-                                int freqSize)
-{
+__device__ int dev_firls_double(double *result, int length, double *freq,
+                                const double *amplitude, int freqSize) {
   // 计算权重大小
   int weightSize = freqSize / 2;
 
   // 初始化权重向量
   double *weight = new double[weightSize];
-  if (weight == nullptr)
-  {
+  if (weight == nullptr) {
     return -1;
   }
 
@@ -307,8 +252,7 @@ __device__ int dev_firls_double(double *result, int length, double *freq, const
   dev_fill_double(weight, weightSize, double(1.0));
 
   // 处理频率向量
-  for (int i = 0; i < freqSize; i++)
-  {
+  for (int i = 0; i < freqSize; i++) {
     freq[i] = freq[i] / double(2.0);
   }
 
@@ -321,27 +265,22 @@ __device__ int dev_firls_double(double *result, int length, double *freq, const
   // 创建和初始化向量k
   int kLength = length + 1;
   double *k = new double[kLength];
-  if (k == nullptr)
-  {
+  if (k == nullptr) {
     return -1;
   };
 
   // 初始化k向量为递增序列：0，1，2...
   dev_iota_double(k, kLength, double(0.0));
 
-  if (!Nodd)
-  {
-    for (int i = 0; i < kLength; i++)
-    {
+  if (!Nodd) {
+    for (int i = 0; i < kLength; i++) {
       k[i] += double(0.5);
     }
   }
 
   // k.erase(k.begin());
-  if (Nodd)
-  {
-    for (int i = 0; i < kLength; i++)
-    {
+  if (Nodd) {
+    for (int i = 0; i < kLength; i++) {
       k[i] = k[i + 1];
     }
     kLength--;
@@ -349,21 +288,18 @@ __device__ int dev_firls_double(double *result, int length, double *freq, const
 
   // 创建和初始化向量b
   int bLength = kLength;
-  if (Nodd)
-  {
-    bLength++; // 此处++，因为后面需要在b[0]处插入b0
+  if (Nodd) {
+    bLength++;  // 此处++，因为后面需要在b[0]处插入b0
   }
   double *b = new double[bLength];
-  if (b == nullptr)
-  {
+  if (b == nullptr) {
     return -1;
   };
 
   dev_fill_double(b, bLength, double(0.0));
 
   double b0 = double(0.0);
-  for (int i = 0; i < freqSize; i += 2)
-  {
+  for (int i = 0; i < freqSize; i += 2) {
     double Fi = freq[i];
     double Fip1 = freq[i + 1];
     double ampi = amplitude[i];
@@ -372,15 +308,14 @@ __device__ int dev_firls_double(double *result, int length, double *freq, const
     double m_s = (ampip1 - ampi) / (Fip1 - Fi);
     double b1 = ampi - (m_s * Fi);
 
-    if (Nodd)
-    {
+    if (Nodd) {
       b0 += (b1 * (Fip1 - Fi)) +
-            m_s / double(2.0) * (pow(Fip1, double(2.0)) - pow(Fi, double(2.0))) * wt2;
+            m_s / double(2.0) *
+                (pow(Fip1, double(2.0)) - pow(Fi, double(2.0))) * wt2;
     }
 
     // 并行计算b向量
-    for (int j = 0; j < kLength; j++)
-    {
+    for (int j = 0; j < kLength; j++) {
       double kj = k[j];
       b[j] += (m_s / (double(4.0) * pow(M_PI, double(2.0))) *
                (cos(double(2.0) * M_PI * Fip1) - cos(double(2.0) * M_PI * Fi)) /
@@ -394,16 +329,11 @@ __device__ int dev_firls_double(double *result, int length, double *freq, const
   }
 
   // 处理最终结果，将b0插入到b向量的开始
-  if (Nodd)
-  {
-    for (int i = kLength; i >= 0; i--)
-    {
-      if (i > 0)
-      {
+  if (Nodd) {
+    for (int i = kLength; i >= 0; i--) {
+      if (i > 0) {
         b[i] = b[i - 1];
-      }
-      else
-      {
+      } else {
         b[i] = b0;
       }
     }
@@ -414,50 +344,42 @@ __device__ int dev_firls_double(double *result, int length, double *freq, const
 
   int aLength = bLength;
   double *a = new double[aLength];
-  if (a == nullptr)
-  {
+  if (a == nullptr) {
     return -1;
   };
 
   // vector<double> result = {a.rbegin(), a.rend()};
-  for (int i = 0; i < aLength; i++)
-  {
+  for (int i = 0; i < aLength; i++) {
     a[i] = pow(w0, double(2.0)) * double(4.0) * b[i];
     result[aLength - 1 - i] = a[i];
   }
 
   int it = 0;
-  if (Nodd)
-  {
+  if (Nodd) {
     it = 1;
   }
 
   // 构建结果向量
-  for (int i = 0; i < aLength; i++)
-  {
+  for (int i = 0; i < aLength; i++) {
     result[i] = result[i] * double(0.5);
-    if ((i + it) < aLength)
-    {
+    if ((i + it) < aLength) {
       result[aLength + i] = a[i + it] * double(0.5);
     }
   }
 
   // 释放动态分配的内存
-  delete[] weight; // 释放内存
-  delete[] k;      // 释放内存
-  delete[] b;      // 释放内存
-  delete[] a;      // 释放内存
+  delete[] weight;  // 释放内存
+  delete[] k;       // 释放内存
+  delete[] b;       // 释放内存
+  delete[] a;       // 释放内存
   return 0;
 }
 
 // 设备端Bessel函数
-__device__ double dev_cyl_bessel_i_double(int n, double x)
-{
-  if (n == 0)
-    return double(1);
+__device__ double dev_cyl_bessel_i_double(int n, double x) {
+  if (n == 0) return double(1);
   double bessel = double(1), bessel_prev = double(1);
-  for (int i = 1; i <= n; ++i)
-  {
+  for (int i = 1; i <= n; ++i) {
     bessel = (double(2) * i - double(1)) / i * x * bessel_prev - bessel;
     bessel_prev = bessel;
   }
@@ -465,14 +387,12 @@ __device__ double dev_cyl_bessel_i_double(int n, double x)
 }
 
 // 设备端凯塞窗核函数
-__device__ void dev_kaiser_double(double *window, int order, double bta)
-{
+__device__ void dev_kaiser_double(double *window, int order, double bta) {
   double Numerator, Denominator;
   Denominator = dev_cyl_bessel_i_double(0, bta);
   double od2 = (order - double(1)) / double(2);
 
-  for (int n = 0; n < order; n++)
-  {
+  for (int n = 0; n < order; n++) {
     double x = bta * sqrt(double(1) - pow((n - od2) / od2, double(2)));
     Numerator = dev_cyl_bessel_i_double(0, x);
     window[n] = Numerator / Denominator;
@@ -481,13 +401,11 @@ __device__ void dev_kaiser_double(double *window, int order, double bta)
 
 __device__ void dev_resample_double(int upFactor, int downFactor,
                                     double *inputSignal, const int inputSize,
-                                    double *outputSignal)
-{
+                                    double *outputSignal) {
   const int n = 10;
   const double bta = double(5.0);
 
-  if (upFactor <= 0 || downFactor <= 0)
-  {
+  if (upFactor <= 0 || downFactor <= 0) {
     return;
   }
 
@@ -496,10 +414,8 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
   upFactor /= gcd_o;
   downFactor /= gcd_o;
 
-  if (upFactor == downFactor)
-  {
-    for (int i = 0; i < inputSize; i++)
-    {
+  if (upFactor == downFactor) {
+    for (int i = 0; i < inputSize; i++) {
       outputSignal[i] = inputSignal[i];
     }
     return;
@@ -527,27 +443,23 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
   int coefficientsLength = length;
 
   double *coefficients = new double[coefficientsLength];
-  if (coefficients == nullptr)
-  {
+  if (coefficients == nullptr) {
     return;
   }
-  int ret = dev_firls_double(coefficients, length - 1, firlsFreqsV, firlsAmplitudeV,
-                             freqSize);
-  if (ret == -1)
-  {
+  int ret = dev_firls_double(coefficients, length - 1, firlsFreqsV,
+                             firlsAmplitudeV, freqSize);
+  if (ret == -1) {
     return;
   }
 
   int windowSize = length;
   double *window = new double[windowSize];
-  if (window == nullptr)
-  {
+  if (window == nullptr) {
     return;
   }
   dev_kaiser_double(window, length, bta);
 
-  for (int i = 0; i < coefficientsLength; i++)
-  {
+  for (int i = 0; i < coefficientsLength; i++) {
     coefficients[i] *= (upFactor * window[i]);
   }
 
@@ -557,20 +469,17 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
   // 分配filter空间
   int hSize = coefficientsLength + 2 * nz;
   double *filter = new double[hSize];
-  if (filter == nullptr)
-  {
+  if (filter == nullptr) {
     return;
   }
 
   int filterLength = 0;
-  for (int i = 0; i < nz; i++)
-  {
+  for (int i = 0; i < nz; i++) {
     filter[i + filterLength] = double(0.0);
   }
   filterLength += nz;
 
-  for (int i = 0; i < coefficientsLength; i++)
-  {
+  for (int i = 0; i < coefficientsLength; i++) {
     filter[i + filterLength] = coefficients[i];
   }
   filterLength += coefficientsLength;
@@ -580,21 +489,18 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
   nz = 0;
   while (dev_quotientCeil((inputSize - 1) * upFactor + hSize + nz, downFactor) -
              delay <
-         outputSize)
-  {
+         outputSize) {
     nz++;
   }
 
-  for (int i = 0; i < nz; i++)
-  {
+  for (int i = 0; i < nz; i++) {
     filter[i + filterLength] = double(0.0);
   }
   filterLength += nz;
 
   // 计算
   int paddedCoefCount = filterLength;
-  while (paddedCoefCount % upFactor)
-  {
+  while (paddedCoefCount % upFactor) {
     paddedCoefCount++;
   }
 
@@ -604,8 +510,7 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
       ((inputSize + padding) * upFactor + downFactor - 1) / downFactor;
 
   double *results = new double[outputCount];
-  if (results == nullptr)
-  {
+  if (results == nullptr) {
     return;
   }
 
@@ -614,8 +519,7 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
                         filterLength, results, &resultsCount);
 
   int j = 0;
-  for (int i = delay; i < outputSize + delay; i++)
-  {
+  for (int i = delay; i < outputSize + delay; i++) {
     outputSignal[j++] = results[i];
   }
 
@@ -651,19 +555,17 @@ __global__ void ShiftingAndResamplingKernelDoubleV1(
     const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency,
     const int *__restrict__ VOutputLength, const int numResults,
     const int numChannels, const int signalLength, const double CurrentRealfreq,
-    double *__restrict__ outputIdata, double *__restrict__ outputQdata)
-{
+    double *__restrict__ outputIdata, double *__restrict__ outputQdata) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= numChannels * numResults)
-    return;
+  if (idx >= numChannels * numResults) return;
 
   // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
-  int ResIdx = idx / numChannels; // 第几个检测结果
-  int chIdx = idx % numChannels;  // 第几个通道
+  int ResIdx = idx / numChannels;  // 第几个检测结果
+  int chIdx = idx % numChannels;   // 第几个通道
 
   const double sampling_rate = double(245.76e6);
 
-  double frequency = VFrequency[ResIdx]; // 频率
+  double frequency = VFrequency[ResIdx];  // 频率
   double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
   // 获取当前线程处理的通道数据地址
@@ -672,17 +574,14 @@ __global__ void ShiftingAndResamplingKernelDoubleV1(
 
   // 移频：生成本振信号并相乘
   double *I_shifted = new double[signalLength];
-  if (I_shifted == nullptr)
-  {
+  if (I_shifted == nullptr) {
     return;
   }
   double *Q_shifted = new double[signalLength];
-  if (Q_shifted == nullptr)
-  {
+  if (Q_shifted == nullptr) {
     return;
   }
-  for (int i = 0; i < signalLength; i++)
-  {
+  for (int i = 0; i < signalLength; i++) {
     double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
     double cosVal = cos(phase);
     double sinVal = sin(phase);
@@ -696,8 +595,7 @@ __global__ void ShiftingAndResamplingKernelDoubleV1(
 
   // 计算之前带宽，对应的输出信号的总长度
   int beforeTotalLength = 0;
-  for (int i = 0; i < ResIdx; i++)
-  {
+  for (int i = 0; i < ResIdx; i++) {
     beforeTotalLength += VOutputLength[i];
   }
   // 当前带宽对应的输出信号的起始地址偏移
@@ -712,8 +610,10 @@ __global__ void ShiftingAndResamplingKernelDoubleV1(
   auto Q_resampled = outputQdata + offset + chIdx * outputLength;
 
   // 重采样
-  dev_resample_double(upFactor, downFactor, I_shifted, signalLength, I_resampled);
-  dev_resample_double(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+  dev_resample_double(upFactor, downFactor, I_shifted, signalLength,
+                      I_resampled);
+  dev_resample_double(upFactor, downFactor, Q_shifted, signalLength,
+                      Q_resampled);
 
   // 释放动态分配的内存
   delete[] I_shifted;
@@ -744,19 +644,17 @@ __global__ void ShiftingAndResamplingKernelDoubleV2(
     const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency,
     const int numResults, const int numChannels, const int signalLength,
     const double CurrentRealfreq, const int alignSignalLength,
-    double *__restrict__ outputIdata, double *__restrict__ outputQdata)
-{
+    double *__restrict__ outputIdata, double *__restrict__ outputQdata) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= numChannels * numResults)
-    return;
+  if (idx >= numChannels * numResults) return;
 
   // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
-  int ResIdx = idx / numChannels; // 第几个检测结果
-  int chIdx = idx % numChannels;  // 第几个通道
+  int ResIdx = idx / numChannels;  // 第几个检测结果
+  int chIdx = idx % numChannels;   // 第几个通道
 
   const double sampling_rate = double(245.76e6);
 
-  double frequency = VFrequency[ResIdx]; // 频率
+  double frequency = VFrequency[ResIdx];  // 频率
   double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
   // 获取当前线程处理的通道数据地址
@@ -765,17 +663,14 @@ __global__ void ShiftingAndResamplingKernelDoubleV2(
 
   // 移频：生成本振信号并相乘
   double *I_shifted = new double[signalLength];
-  if (I_shifted == nullptr)
-  {
+  if (I_shifted == nullptr) {
     return;
   }
   double *Q_shifted = new double[signalLength];
-  if (Q_shifted == nullptr)
-  {
+  if (Q_shifted == nullptr) {
     return;
   }
-  for (int i = 0; i < signalLength; i++)
-  {
+  for (int i = 0; i < signalLength; i++) {
     double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
     double cosVal = cos(phase);
     double sinVal = sin(phase);
@@ -788,13 +683,17 @@ __global__ void ShiftingAndResamplingKernelDoubleV2(
   int downFactor = VDownFactor[ResIdx];
 
   // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  auto I_resampled = outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+  auto I_resampled =
+      outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
   // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  auto Q_resampled = outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+  auto Q_resampled =
+      outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
 
   // 重采样
-  dev_resample_double(upFactor, downFactor, I_shifted, signalLength, I_resampled);
-  dev_resample_double(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+  dev_resample_double(upFactor, downFactor, I_shifted, signalLength,
+                      I_resampled);
+  dev_resample_double(upFactor, downFactor, Q_shifted, signalLength,
+                      Q_resampled);
 
   // 释放动态分配的内存
   delete[] I_shifted;
@@ -823,16 +722,10 @@ __global__ void ShiftingAndResamplingKernelDoubleV2(
 bool ShiftAndResampleSignalDoubleV1(
     const std::vector<std::vector<double>> &origIdata,
     const std::vector<std::vector<double>> &origQdata,
-    std::vector<int> &outputLength,
-    std::vector<int> &downFactor,
-    std::vector<double> &detectFreq,
-    const int outputTotalLength,
-    const int numResults,
-    const int numChannels,
-    const double CurrentRealfreq,
-    double *outputIdata,
-    double *outputQdata)
-{
+    std::vector<int> &outputLength, std::vector<int> &downFactor,
+    std::vector<double> &detectFreq, const int outputTotalLength,
+    const int numResults, const int numChannels, const double CurrentRealfreq,
+    double *outputIdata, double *outputQdata) {
   // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
   int signalLength = origIdata[0].size();
 
@@ -855,7 +748,8 @@ bool ShiftAndResampleSignalDoubleV1(
   // copy频率到显存中
   const double *src_frequency = detectFreq.data();
   CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(double), cudaMemcpyHostToDevice));
+                              numResults * sizeof(double),
+                              cudaMemcpyHostToDevice));
 
   // copy每个带宽，重采样后输出信号长度到显存中
   const int *src_outputLength = outputLength.data();
@@ -873,8 +767,7 @@ bool ShiftAndResampleSignalDoubleV1(
 
   // 将所有通道数据循环拷贝到GPU显存
   size_t copySize = signalLength * sizeof(double);
-  for (int i = 0; i < numChannels; i++)
-  {
+  for (int i = 0; i < numChannels; i++) {
     // copy 原始的idata 到gpu显存
     double *dst_idata = d_Idata + i * signalLength;
     const void *src_idata = origIdata[i].data();
@@ -891,10 +784,10 @@ bool ShiftAndResampleSignalDoubleV1(
   // 申请重采样后输出信号的GPU显存
   double *d_outputIdata = nullptr;
   double *d_outputQdata = nullptr;
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(double))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(cudaMalloc(
+      &d_outputIdata, (numChannels * outputTotalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(cudaMalloc(
+      &d_outputQdata, (numChannels * outputTotalLength * sizeof(double))));
 
   // 线程数配置
   dim3 block(numChannels);
@@ -907,53 +800,48 @@ bool ShiftAndResampleSignalDoubleV1(
   // copy重采样计算结果到主存
   // 存储格式：[numResults][numChannels][lengthPerResults]
   // 且在内存中是连续存放的
-  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(double)),
-                              cudaMemcpyDeviceToHost));
+  CHECK_CUDA_ERROR(
+      cudaMemcpy(outputIdata, d_outputIdata,
+                 (numChannels * outputTotalLength * sizeof(double)),
+                 cudaMemcpyDeviceToHost));
 
-  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(double)),
-                              cudaMemcpyDeviceToHost));
+  CHECK_CUDA_ERROR(
+      cudaMemcpy(outputQdata, d_outputQdata,
+                 (numChannels * outputTotalLength * sizeof(double)),
+                 cudaMemcpyDeviceToHost));
 
   // 释放显存
-  if (d_downFactor)
-  {
+  if (d_downFactor) {
     cudaFree(d_downFactor);
     d_downFactor = nullptr;
   }
 
-  if (d_outputLength)
-  {
+  if (d_outputLength) {
     cudaFree(d_outputLength);
     d_outputLength = nullptr;
   }
 
-  if (d_frequency)
-  {
+  if (d_frequency) {
     cudaFree(d_frequency);
     d_frequency = nullptr;
   }
 
-  if (d_Idata)
-  {
+  if (d_Idata) {
     cudaFree(d_Idata);
     d_Idata = nullptr;
   }
 
-  if (d_Qdata)
-  {
+  if (d_Qdata) {
     cudaFree(d_Qdata);
     d_Qdata = nullptr;
   }
 
-  if (d_outputIdata)
-  {
+  if (d_outputIdata) {
     cudaFree(d_outputIdata);
     d_outputIdata = nullptr;
   }
 
-  if (d_outputIdata)
-  {
+  if (d_outputIdata) {
     cudaFree(d_outputIdata);
     d_outputIdata = nullptr;
   }
@@ -982,15 +870,9 @@ bool ShiftAndResampleSignalDoubleV1(
 bool ShiftAndResampleSignalDoubleV2(
     const std::vector<std::vector<double>> &origIdata,
     const std::vector<std::vector<double>> &origQdata,
-    std::vector<int> &downFactor,
-    std::vector<double> &detectFreq,
-    const int alignSignalLength,
-    const int numResults,
-    const int numChannels,
-    const double CurrentRealfreq,
-    double *outputIdata,
-    double *outputQdata)
-{
+    std::vector<int> &downFactor, std::vector<double> &detectFreq,
+    const int alignSignalLength, const int numResults, const int numChannels,
+    const double CurrentRealfreq, double *outputIdata, double *outputQdata) {
   // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
   int signalLength = origIdata[0].size();
 
@@ -1011,7 +893,8 @@ bool ShiftAndResampleSignalDoubleV2(
   // copy频率到显存中
   const double *src_frequency = detectFreq.data();
   CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(double), cudaMemcpyHostToDevice));
+                              numResults * sizeof(double),
+                              cudaMemcpyHostToDevice));
 
   // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
   double *d_Idata = nullptr;
@@ -1023,8 +906,7 @@ bool ShiftAndResampleSignalDoubleV2(
 
   // 将所有通道数据循环拷贝到GPU显存
   size_t copySize = signalLength * sizeof(double);
-  for (int i = 0; i < numChannels; i++)
-  {
+  for (int i = 0; i < numChannels; i++) {
     // copy 原始的idata 到gpu显存
     double *dst_idata = d_Idata + i * signalLength;
     const void *src_idata = origIdata[i].data();
@@ -1039,7 +921,8 @@ bool ShiftAndResampleSignalDoubleV2(
   }
 
   // 申请重采样后输出信号的GPU显存
-  size_t totalsize = numResults * numChannels * alignSignalLength * sizeof(double);
+  size_t totalsize =
+      numResults * numChannels * alignSignalLength * sizeof(double);
   double *d_outputIdata = nullptr;
   double *d_outputQdata = nullptr;
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, totalsize));
@@ -1053,54 +936,46 @@ bool ShiftAndResampleSignalDoubleV2(
   dim3 block(numChannels);
   dim3 grid((numChannels * numResults + block.x - 1) / block.x);
   ShiftingAndResamplingKernelDoubleV2<<<grid, block>>>(
-      d_Idata, d_Qdata, d_downFactor, d_frequency, numResults,
-      numChannels, signalLength, CurrentRealfreq, alignSignalLength,
-      d_outputIdata, d_outputQdata);
+      d_Idata, d_Qdata, d_downFactor, d_frequency, numResults, numChannels,
+      signalLength, CurrentRealfreq, alignSignalLength, d_outputIdata,
+      d_outputQdata);
 
   // copy重采样计算结果到主存
   // 存储格式：[numResults][numChannels][lengthPerResults]
   // 且在内存中是连续存放的
-  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
-                              totalsize,
+  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata, totalsize,
                               cudaMemcpyDeviceToHost));
 
-  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
-                              totalsize,
+  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata, totalsize,
                               cudaMemcpyDeviceToHost));
 
   // 释放显存
-  if (d_downFactor)
-  {
+  if (d_downFactor) {
     cudaFree(d_downFactor);
     d_downFactor = nullptr;
   }
 
-  if (d_frequency)
-  {
+  if (d_frequency) {
     cudaFree(d_frequency);
     d_frequency = nullptr;
   }
 
-  if (d_Idata)
-  {
+  if (d_Idata) {
     cudaFree(d_Idata);
     d_Idata = nullptr;
   }
 
-  if (d_Qdata)
-  {
+  if (d_Qdata) {
     cudaFree(d_Qdata);
     d_Qdata = nullptr;
   }
 
-  if (d_outputIdata)
-  {
+  if (d_outputIdata) {
     cudaFree(d_outputIdata);
     d_outputIdata = nullptr;
   }
 
-  if (d_outputIdata)
-  {
+  if (d_outputIdata) {
     cudaFree(d_outputIdata);
     d_outputIdata = nullptr;
   }
diff --git a/cuda_resample_double.h b/cuda_resample_double.h
index f8e5556..48d3328 100644
--- a/cuda_resample_double.h
+++ b/cuda_resample_double.h
@@ -1,10 +1,11 @@
-#ifndef CUDA_RESAMPLE_H
-#define CUDA_RESAMPLE_H
+#ifndef CUDA_RESAMPLE_DOUBLE_H
+#define CUDA_RESAMPLE_DOUBLE_H
 
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
+#include <math_constants.h>  // CUDA数学常量头文件
 #include <thrust/device_vector.h>
-#include <math_constants.h> // CUDA数学常量头文件
+
 #include <cmath>
 #include <map>
 #include <type_traits>
@@ -15,15 +16,14 @@
 #endif
 
 // 设备端Resampler状态结构
-struct DeviceResamplerStateDouble
-{
-    int _t;                   // "time" (modulo upRate)
-    int _xOffset;             // 输入偏移量
-    double *_state;           // 状态缓冲区指针
-    double *_transposedCoefs; // 转置系数指针
-    int _coefsPerPhase;       // 每相系数数量
-    int _upRate;              // 上采样率
-    int _downRate;            // 下采样率
+struct DeviceResamplerStateDouble {
+  int _t;                    // "time" (modulo upRate)
+  int _xOffset;              // 输入偏移量
+  double *_state;            // 状态缓冲区指针
+  double *_transposedCoefs;  // 转置系数指针
+  int _coefsPerPhase;        // 每相系数数量
+  int _upRate;               // 上采样率
+  int _downRate;             // 下采样率
 };
 
 /**
@@ -48,15 +48,10 @@ struct DeviceResamplerStateDouble
 bool ShiftAndResampleSignalDoubleV1(
     const std::vector<std::vector<double>> &origIdata,
     const std::vector<std::vector<double>> &origQdata,
-    std::vector<int> &outputLength,
-    std::vector<int> &downFactor,
-    std::vector<double> &detectFreq,
-    const int outputTotalLength,
-    const int numResults,
-    const int numChannels,
-    const double CurrentRealfreq,
-    double *outputIdata,
-    double *outputQdata);
+    std::vector<int> &outputLength, std::vector<int> &downFactor,
+    std::vector<double> &detectFreq, const int outputTotalLength,
+    const int numResults, const int numChannels, const double CurrentRealfreq,
+    double *outputIdata, double *outputQdata);
 
 /**
  * ShiftAndResampleSignalDoubleV2
@@ -79,13 +74,8 @@ bool ShiftAndResampleSignalDoubleV1(
 bool ShiftAndResampleSignalDoubleV2(
     const std::vector<std::vector<double>> &origIdata,
     const std::vector<std::vector<double>> &origQdata,
-    std::vector<int> &downFactor,
-    std::vector<double> &detectFreq,
-    const int alignSignalLength,
-    const int numResults,
-    const int numChannels,
-    const double CurrentRealfreq,
-    double *outputIdata,
-    double *outputQdata);
+    std::vector<int> &downFactor, std::vector<double> &detectFreq,
+    const int alignSignalLength, const int numResults, const int numChannels,
+    const double CurrentRealfreq, double *outputIdata, double *outputQdata);
 
-#endif // CUDA_RESAMPLE_H
+#endif  // CUDA_RESAMPLE_DOUBLE_H
diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
index 9c69458..9b66559 100644
--- a/cuda_resample_float.cu
+++ b/cuda_resample_float.cu
@@ -1,105 +1,12 @@
-#include "cuda_resample_float.h"
 #include <stdio.h>
 
-// CHECK_CUDA_ERROR：cuda api调用错误处理
-#define CHECK_CUDA_ERROR(call)                                               \
-  do                                                                         \
-  {                                                                          \
-    cudaError_t err = call;                                                  \
-    if (err != cudaSuccess)                                                  \
-    {                                                                        \
-      std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":"      \
-                << __LINE__ << ": " << cudaGetErrorString(err) << std::endl; \
-      throw std::runtime_error("CUDA error");                                \
-    }                                                                        \
-  } while (0)
-
-#define LOG_ERROR_DEVICE(fmt, ...)              \
-  printf("[DEVICE ERROR] %s:%d (%s) " fmt "\n", \
-         __FILE__, __LINE__, __func__, ##__VA_ARGS__)
-
-#define LOG_ERROR_HOST(message)                               \
-  std::cerr << "[HOST ERROR] " << __FILE__ << ":" << __LINE__ \
-            << " (" << __FUNCTION__ << ") " << message << std::endl
-
-inline int quotientCeil(int num1, int num2)
-{
-  if (num1 % num2 != 0)
-    return num1 / num2 + 1;
-  return num1 / num2;
-}
-
-/**
- * @brief 设置CUDA设备堆大小以便支持设备端malloc/free
- * @param deviceId 要设置的设备ID（默认0）
- * @param heapSizePercent 要分配的全局内存百分比（默认10%，范围1-50）
- * @param minHeapSizeMB 最小堆大小MB（默认16MB）
- * @param maxHeapSizeMB 最大堆大小MB（默认512MB）
- * @return 0成功，-1失败
- */
-int SetGPUHeapSize(int deviceId = 0,
-                   float heapSizePercent = 10.0f,
-                   size_t minHeapSizeMB = 16,
-                   size_t maxHeapSizeMB = 512)
-{
-  // 获取设备属性
-  cudaDeviceProp prop;
-  cudaError_t err = cudaGetDeviceProperties(&prop, deviceId);
-  if (err != cudaSuccess)
-  {
-    std::cerr << "获取设备属性失败: " << cudaGetErrorString(err) << std::endl;
-    return -1;
-  }
-
-  // 检查设备是否支持设备端malloc
-  if (prop.major < 2)
-  {
-    std::cerr << "设备不支持设备端malloc (计算能力 < 2.0)" << std::endl;
-    std::cerr << "设备计算能力: " << prop.major << "." << prop.minor << std::endl;
-    return -1;
-  }
-
-  // 根据设备内存动态设置堆大小
-  size_t totalGlobalMem = prop.totalGlobalMem;
-  size_t heapSize = 0;
-
-  if (prop.major >= 2)
-  {
-    // 对于支持动态分配的设备
-    // 堆大小建议：总内存的10-25%，但不超过一定限制
-    heapSize = totalGlobalMem * 0.1; // 10%的总内存
-
-    // 设置上限，避免太大
-    const size_t maxHeapSizeBytes = maxHeapSizeMB * 1024 * 1024; // 512MB
-    if (heapSize > maxHeapSizeBytes)
-    {
-      heapSize = maxHeapSizeBytes;
-    }
-
-    // 设置下限，确保足够
-    const size_t minHeapSizeBytes = minHeapSizeMB * 1024 * 1024; // 16MB
-    if (heapSize < minHeapSizeBytes)
-    {
-      heapSize = minHeapSizeBytes;
-    }
-
-    cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapSize);
-  }
-  else
-  {
-    std::cout << "设备不支持设备端malloc" << std::endl;
-    return -1;
-  }
-
-  return 0;
-}
+#include "cuda_resample.h"
+#include "cuda_resample_float.h"
 
 // 设备端Resampler初始化
-__device__ void resampler_init_state_device_float(DeviceResamplerStateFloat *state,
-                                                  float *transposedCoefs,
-                                                  int coefsPerPhase, int upRate,
-                                                  int downRate)
-{
+__device__ void resampler_init_state_device_float(
+    DeviceResamplerStateFloat *state, float *transposedCoefs, int coefsPerPhase,
+    int upRate, int downRate) {
   state->_t = 0;
   state->_xOffset = 0;
   state->_transposedCoefs = transposedCoefs;
@@ -109,29 +16,25 @@ __device__ void resampler_init_state_device_float(DeviceResamplerStateFloat *sta
 
   // 分配状态缓冲区
   state->_state = (float *)malloc((coefsPerPhase - 1) * sizeof(float));
-  if (state->_state == nullptr)
-  {
+  if (state->_state == nullptr) {
     LOG_ERROR_DEVICE("Failed to allocate device memory for state->_state!");
     return;
   }
 
   // 初始化状态为零
-  for (int i = 0; i < coefsPerPhase - 1; i++)
-  {
+  for (int i = 0; i < coefsPerPhase - 1; i++) {
     state->_state[i] = float(0);
   }
 }
 
 // 设备端：计算所需输出数量
 __device__ int resampler_needed_out_count_device_float(
-    int inCount, DeviceResamplerStateFloat *state)
-{
+    int inCount, DeviceResamplerStateFloat *state) {
   int np = inCount * state->_upRate;
   int need = np / state->_downRate;
 
   if ((state->_t + state->_upRate * state->_xOffset) <
-      (np % state->_downRate))
-  {
+      (np % state->_downRate)) {
     need++;
   }
 
@@ -141,10 +44,8 @@ __device__ int resampler_needed_out_count_device_float(
 // 设备端：应用重采样
 __device__ int resampler_apply_device_float(float *in, int inCount, float *out,
                                             int outCount,
-                                            DeviceResamplerStateFloat *state)
-{
-  if (outCount < resampler_needed_out_count_device_float(inCount, state))
-  {
+                                            DeviceResamplerStateFloat *state) {
+  if (outCount < resampler_needed_out_count_device_float(inCount, state)) {
     // 在设备端无法抛出异常，返回错误代码
     return -1;
   }
@@ -154,28 +55,24 @@ __device__ int resampler_apply_device_float(float *in, int inCount, float *out,
   float *y = out;
   float *end = in + inCount;
 
-  while (x < end)
-  {
+  while (x < end) {
     float acc = 0;
     float *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase;
     float *xPtr = x - state->_coefsPerPhase + 1;
 
     int offset = in - xPtr;
-    if (offset > 0)
-    {
+    if (offset > 0) {
       // 需要从_state缓冲区中获取
       float *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset;
 
-      while (statePtr < state->_state + (state->_coefsPerPhase - 1))
-      {
+      while (statePtr < state->_state + (state->_coefsPerPhase - 1)) {
         acc += (*statePtr++) * (*h++);
       }
 
       xPtr += offset;
     }
 
-    while (xPtr <= x)
-    {
+    while (xPtr <= x) {
       acc += (*xPtr++) * (*h++);
     }
 
@@ -192,26 +89,20 @@ __device__ int resampler_apply_device_float(float *in, int inCount, float *out,
   // 管理_state缓冲区
   int retain = (state->_coefsPerPhase - 1) - inCount;
 
-  if (retain > 0)
-  {
+  if (retain > 0) {
     // 对于小于状态缓冲区的inCount，将缓冲区的末尾复制到开头
-    for (int i = 0; i < retain; i++)
-    {
+    for (int i = 0; i < retain; i++) {
       state->_state[i] =
           state->_state[(state->_coefsPerPhase - 1) - retain + i];
     }
 
     // 然后将整个（短）输入复制到缓冲区末尾
-    for (int i = 0; i < inCount; i++)
-    {
+    for (int i = 0; i < inCount; i++) {
       state->_state[retain + i] = in[i];
     }
-  }
-  else
-  {
+  } else {
     // 只将最后几个输入样本复制到状态缓冲区
-    for (int i = 0; i < state->_coefsPerPhase - 1; i++)
-    {
+    for (int i = 0; i < state->_coefsPerPhase - 1; i++) {
       state->_state[i] = *end - (float)(state->_coefsPerPhase - 1) + (float)i;
     }
   }
@@ -221,23 +112,19 @@ __device__ int resampler_apply_device_float(float *in, int inCount, float *out,
 }
 
 // 设备端：转置滤波器系数（每个线程执行）
-__device__ void transpose_filter_coefs_device_float(float *transposedCoefs, float *coefs,
-                                                    int upRate, int coefCount,
-                                                    int coefsPerPhase)
-{
+__device__ void transpose_filter_coefs_device_float(float *transposedCoefs,
+                                                    float *coefs, int upRate,
+                                                    int coefCount,
+                                                    int coefsPerPhase) {
   // 初始化转置系数为零
-  for (int i = 0; i < upRate * coefsPerPhase; i++)
-  {
+  for (int i = 0; i < upRate * coefsPerPhase; i++) {
     transposedCoefs[i] = float(0);
   }
 
   // 转置并翻转每个相位
-  for (int i = 0; i < upRate; ++i)
-  {
-    for (int j = 0; j < coefsPerPhase; ++j)
-    {
-      if (j * upRate + i < coefCount)
-      {
+  for (int i = 0; i < upRate; ++i) {
+    for (int j = 0; j < coefsPerPhase; ++j) {
+      if (j * upRate + i < coefCount) {
         transposedCoefs[(coefsPerPhase - 1 - j) + i * coefsPerPhase] =
             coefs[j * upRate + i];
       }
@@ -247,13 +134,12 @@ __device__ void transpose_filter_coefs_device_float(float *transposedCoefs, floa
 
 // 设备端upfirdn主函数
 __device__ void upfirdn_device_float(int upRate, int downRate, float *input,
-                                     int inLength, float *filter, int filterLength,
-                                     float *results, int *resultsCount)
-{
+                                     int inLength, float *filter,
+                                     int filterLength, float *results,
+                                     int *resultsCount) {
   // 计算填充后的系数数量
   int paddedCoefCount = filterLength;
-  while (paddedCoefCount % upRate)
-  {
+  while (paddedCoefCount % upRate) {
     paddedCoefCount++;
   }
 
@@ -261,41 +147,35 @@ __device__ void upfirdn_device_float(int upRate, int downRate, float *input,
 
   // 分配转置系数内存
   float *transposedCoefs = (float *)malloc(paddedCoefCount * sizeof(float));
-  if (transposedCoefs == nullptr)
-  {
+  if (transposedCoefs == nullptr) {
     LOG_ERROR_DEVICE("Failed to allocate device memory for transposedCoefs!");
     return;
   }
 
   // 转置滤波器系数
-  transpose_filter_coefs_device_float(transposedCoefs, filter, upRate, filterLength,
-                                      coefsPerPhase);
+  transpose_filter_coefs_device_float(transposedCoefs, filter, upRate,
+                                      filterLength, coefsPerPhase);
 
   // 创建Resampler状态
   DeviceResamplerStateFloat state;
-  resampler_init_state_device_float(&state, transposedCoefs, coefsPerPhase, upRate,
-                                    downRate);
+  resampler_init_state_device_float(&state, transposedCoefs, coefsPerPhase,
+                                    upRate, downRate);
 
   // 计算填充量
   int padding = coefsPerPhase - 1;
 
   // 分配填充输入内存
   float *inputPadded = (float *)malloc((inLength + padding) * sizeof(float));
-  if (inputPadded == nullptr)
-  {
+  if (inputPadded == nullptr) {
     LOG_ERROR_DEVICE("Failed to allocate device memory for inputPadded!");
     return;
   }
 
   // 复制输入并填充
-  for (int i = 0; i < inLength + padding; i++)
-  {
-    if (i < inLength)
-    {
+  for (int i = 0; i < inLength + padding; i++) {
+    if (i < inLength) {
       inputPadded[i] = input[i];
-    }
-    else
-    {
+    } else {
       inputPadded[i] = 0;
     }
   }
@@ -305,8 +185,7 @@ __device__ void upfirdn_device_float(int upRate, int downRate, float *input,
       resampler_needed_out_count_device_float(inLength + padding, &state);
 
   // 设置输出计数
-  if (resultsCount != nullptr)
-  {
+  if (resultsCount != nullptr) {
     *resultsCount = resultsCountValue;
   }
 
@@ -317,8 +196,7 @@ __device__ void upfirdn_device_float(int upRate, int downRate, float *input,
   // 清理设备内存
   free(transposedCoefs);
   free(inputPadded);
-  if (state._state != nullptr)
-  {
+  if (state._state != nullptr) {
     free(state._state);
     state._state = nullptr;
   }
@@ -326,16 +204,13 @@ __device__ void upfirdn_device_float(int upRate, int downRate, float *input,
 }
 
 // 整数向上取整除法
-__device__ __forceinline__ int dev_quotientCeil(int num1, int num2)
-{
+__device__ __forceinline__ int dev_quotientCeil(int num1, int num2) {
   return (num1 + num2 - 1) / num2;
 }
 
 // CUDA设备端GCD函数:最大公约数
-__device__ __forceinline__ int dev_gcd(int a, int b)
-{
-  while (b != 0)
-  {
+__device__ __forceinline__ int dev_gcd(int a, int b) {
+  while (b != 0) {
     int temp = b;
     b = a % b;
     a = temp;
@@ -344,35 +219,31 @@ __device__ __forceinline__ int dev_gcd(int a, int b)
 }
 
 // 生成连续递增的序列
-__device__ __forceinline__ void dev_iota_float(float *data, int size, float start)
-{
-  for (int i = 0; i < size; i++)
-  {
+__device__ __forceinline__ void dev_iota_float(float *data, int size,
+                                               float start) {
+  for (int i = 0; i < size; i++) {
     data[i] = start + float(i);
   }
   return;
 }
 
 // 填充data为value
-__device__ __forceinline__ void dev_fill_float(float *data, int size, float value)
-{
-  for (int i = 0; i < size; i++)
-  {
+__device__ __forceinline__ void dev_fill_float(float *data, int size,
+                                               float value) {
+  for (int i = 0; i < size; i++) {
     data[i] = value;
   }
   return;
 }
 
-__device__ int dev_firls_float(float *result, int length, float *freq, const float *amplitude,
-                               int freqSize)
-{
+__device__ int dev_firls_float(float *result, int length, float *freq,
+                               const float *amplitude, int freqSize) {
   // 计算权重大小
   int weightSize = freqSize / 2;
 
   // 初始化权重向量
   float *weight = (float *)malloc(weightSize * sizeof(float));
-  if (weight == nullptr)
-  {
+  if (weight == nullptr) {
     LOG_ERROR_DEVICE("Failed to allocate device memory for weight!");
     return -1;
   }
@@ -381,8 +252,7 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
   dev_fill_float(weight, weightSize, float(1.0));
 
   // 处理频率向量
-  for (int i = 0; i < freqSize; i++)
-  {
+  for (int i = 0; i < freqSize; i++) {
     freq[i] = freq[i] / float(2.0);
   }
 
@@ -395,8 +265,7 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
   // 创建和初始化向量k
   int kLength = length + 1;
   float *k = (float *)malloc(kLength * sizeof(float));
-  if (k == nullptr)
-  {
+  if (k == nullptr) {
     LOG_ERROR_DEVICE("Failed to allocate device memory for k向量!");
     return -1;
   };
@@ -404,18 +273,14 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
   // 初始化k向量为递增序列：0，1，2...
   dev_iota_float(k, kLength, float(0.0));
 
-  if (!Nodd)
-  {
-    for (int i = 0; i < kLength; i++)
-    {
+  if (!Nodd) {
+    for (int i = 0; i < kLength; i++) {
       k[i] += float(0.5);
     }
   }
 
-  if (Nodd)
-  {
-    for (int i = 0; i < kLength; i++)
-    {
+  if (Nodd) {
+    for (int i = 0; i < kLength; i++) {
       k[i] = k[i + 1];
     }
     kLength--;
@@ -423,14 +288,12 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
 
   // 创建和初始化向量b
   int bLength = kLength;
-  if (Nodd)
-  {
-    bLength++; // 此处++，因为后面需要在b[0]处插入b0
+  if (Nodd) {
+    bLength++;  // 此处++，因为后面需要在b[0]处插入b0
   }
 
   float *b = (float *)malloc(bLength * sizeof(float));
-  if (b == nullptr)
-  {
+  if (b == nullptr) {
     LOG_ERROR_DEVICE("Failed to allocate device memory for b向量!");
     return -1;
   };
@@ -438,8 +301,7 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
   dev_fill_float(b, bLength, float(0.0));
 
   float b0 = float(0.0);
-  for (int i = 0; i < freqSize; i += 2)
-  {
+  for (int i = 0; i < freqSize; i += 2) {
     float Fi = freq[i];
     float Fip1 = freq[i + 1];
     float ampi = amplitude[i];
@@ -448,15 +310,14 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
     float m_s = (ampip1 - ampi) / (Fip1 - Fi);
     float b1 = ampi - (m_s * Fi);
 
-    if (Nodd)
-    {
+    if (Nodd) {
       b0 += (b1 * (Fip1 - Fi)) +
-            m_s / float(2.0) * (powf(Fip1, float(2.0)) - powf(Fi, float(2.0))) * wt2;
+            m_s / float(2.0) * (powf(Fip1, float(2.0)) - powf(Fi, float(2.0))) *
+                wt2;
     }
 
     // 并行计算b向量
-    for (int j = 0; j < kLength; j++)
-    {
+    for (int j = 0; j < kLength; j++) {
       float kj = k[j];
       b[j] += (m_s / (float(4.0) * powf(M_PI, float(2.0))) *
                (cosf(float(2.0) * M_PI * Fip1) - cosf(float(2.0) * M_PI * Fi)) /
@@ -470,16 +331,11 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
   }
 
   // 处理最终结果，将b0插入到b向量的开始
-  if (Nodd)
-  {
-    for (int i = kLength; i >= 0; i--)
-    {
-      if (i > 0)
-      {
+  if (Nodd) {
+    for (int i = kLength; i >= 0; i--) {
+      if (i > 0) {
         b[i] = b[i - 1];
-      }
-      else
-      {
+      } else {
         b[i] = b0;
       }
     }
@@ -490,31 +346,26 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
 
   int aLength = bLength;
   float *a = (float *)malloc(aLength * sizeof(float));
-  if (a == nullptr)
-  {
+  if (a == nullptr) {
     LOG_ERROR_DEVICE("Failed to allocate device memory for a向量!");
     return -1;
   };
 
   // vector<float> result = {a.rbegin(), a.rend()};
-  for (int i = 0; i < aLength; i++)
-  {
+  for (int i = 0; i < aLength; i++) {
     a[i] = powf(w0, float(2.0)) * float(4.0) * b[i];
     result[aLength - 1 - i] = a[i];
   }
 
   int it = 0;
-  if (Nodd)
-  {
+  if (Nodd) {
     it = 1;
   }
 
   // 构建结果向量
-  for (int i = 0; i < aLength; i++)
-  {
+  for (int i = 0; i < aLength; i++) {
     result[i] = result[i] * float(0.5);
-    if ((i + it) < aLength)
-    {
+    if ((i + it) < aLength) {
       result[aLength + i] = a[i + it] * float(0.5);
     }
   }
@@ -528,13 +379,10 @@ __device__ int dev_firls_float(float *result, int length, float *freq, const flo
 }
 
 // 设备端Bessel函数
-__device__ float dev_cyl_bessel_i_float(int n, float x)
-{
-  if (n == 0)
-    return float(1);
+__device__ float dev_cyl_bessel_i_float(int n, float x) {
+  if (n == 0) return float(1);
   float bessel = float(1), bessel_prev = float(1);
-  for (int i = 1; i <= n; ++i)
-  {
+  for (int i = 1; i <= n; ++i) {
     bessel = (float(2) * i - float(1)) / i * x * bessel_prev - bessel;
     bessel_prev = bessel;
   }
@@ -542,14 +390,12 @@ __device__ float dev_cyl_bessel_i_float(int n, float x)
 }
 
 // 设备端凯塞窗核函数
-__device__ void dev_kaiser_float(float *window, int order, float bta)
-{
+__device__ void dev_kaiser_float(float *window, int order, float bta) {
   float Numerator, Denominator;
   Denominator = dev_cyl_bessel_i_float(0, bta);
   float od2 = (order - float(1)) / float(2);
 
-  for (int n = 0; n < order; n++)
-  {
+  for (int n = 0; n < order; n++) {
     float x = bta * sqrt(float(1) - powf((n - od2) / od2, float(2)));
     Numerator = dev_cyl_bessel_i_float(0, x);
     window[n] = Numerator / Denominator;
@@ -558,13 +404,11 @@ __device__ void dev_kaiser_float(float *window, int order, float bta)
 
 __device__ void dev_resample_float(int upFactor, int downFactor,
                                    float *inputSignal, const int inputSize,
-                                   float *outputSignal)
-{
+                                   float *outputSignal) {
   const int n = 10;
   const float bta = float(5.0);
 
-  if (upFactor <= 0 || downFactor <= 0)
-  {
+  if (upFactor <= 0 || downFactor <= 0) {
     LOG_ERROR_DEVICE("upFactor and downFactor must be positive integer!");
     return;
   }
@@ -574,10 +418,8 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
   upFactor /= gcd_o;
   downFactor /= gcd_o;
 
-  if (upFactor == downFactor)
-  {
-    for (int i = 0; i < inputSize; i++)
-    {
+  if (upFactor == downFactor) {
+    for (int i = 0; i < inputSize; i++) {
       outputSignal[i] = inputSignal[i];
     }
     return;
@@ -605,31 +447,27 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
   int coefficientsLength = length;
 
   float *coefficients = (float *)malloc(coefficientsLength * sizeof(float));
-  if (coefficients == nullptr)
-  {
+  if (coefficients == nullptr) {
     LOG_ERROR_DEVICE("Failed to allocate device memory for coefficients!");
     return;
   }
 
-  int ret = dev_firls_float(coefficients, length - 1, firlsFreqsV, firlsAmplitudeV,
-                            freqSize);
-  if (ret == -1)
-  {
+  int ret = dev_firls_float(coefficients, length - 1, firlsFreqsV,
+                            firlsAmplitudeV, freqSize);
+  if (ret == -1) {
     LOG_ERROR_DEVICE("dev_firls_float function error!");
     return;
   }
 
   int windowSize = length;
   float *window = (float *)malloc(windowSize * sizeof(float));
-  if (window == nullptr)
-  {
+  if (window == nullptr) {
     LOG_ERROR_DEVICE("Failed to allocate device memory for window!");
     return;
   }
   dev_kaiser_float(window, length, bta);
 
-  for (int i = 0; i < coefficientsLength; i++)
-  {
+  for (int i = 0; i < coefficientsLength; i++) {
     coefficients[i] *= (upFactor * window[i]);
   }
 
@@ -638,22 +476,20 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
 
   // 分配filter空间
   int hSize = coefficientsLength + nz;
-  float *filter = (float *)malloc((coefficientsLength + 3 * nz) * sizeof(float));
-  if (filter == nullptr)
-  {
+  float *filter =
+      (float *)malloc((coefficientsLength + 3 * nz) * sizeof(float));
+  if (filter == nullptr) {
     LOG_ERROR_DEVICE("Failed to allocate device memory for filter!");
     return;
   }
 
   int filterLength = 0;
-  for (int i = 0; i < nz; i++)
-  {
+  for (int i = 0; i < nz; i++) {
     filter[i + filterLength] = float(0.0);
   }
   filterLength += nz;
 
-  for (int i = 0; i < coefficientsLength; i++)
-  {
+  for (int i = 0; i < coefficientsLength; i++) {
     filter[i + filterLength] = coefficients[i];
   }
   filterLength += coefficientsLength;
@@ -663,21 +499,18 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
   nz = 0;
   while (dev_quotientCeil((inputSize - 1) * upFactor + hSize + nz, downFactor) -
              delay <
-         outputSize)
-  {
+         outputSize) {
     nz++;
   }
 
-  for (int i = 0; i < nz; i++)
-  {
+  for (int i = 0; i < nz; i++) {
     filter[i + filterLength] = float(0.0);
   }
   filterLength += nz;
 
   // 计算
   int paddedCoefCount = filterLength;
-  while (paddedCoefCount % upFactor)
-  {
+  while (paddedCoefCount % upFactor) {
     paddedCoefCount++;
   }
 
@@ -687,8 +520,7 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
       ((inputSize + padding) * upFactor + downFactor - 1) / downFactor;
 
   float *results = (float *)malloc(outputCount * sizeof(float));
-  if (results == nullptr)
-  {
+  if (results == nullptr) {
     LOG_ERROR_DEVICE("Failed to allocate device memory for upfirdn results!");
     return;
   }
@@ -698,8 +530,7 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
                        filterLength, results, &resultsCount);
 
   int j = 0;
-  for (int i = delay; i < outputSize + delay; i++)
-  {
+  for (int i = delay; i < outputSize + delay; i++) {
     outputSignal[j++] = results[i];
   }
 
@@ -736,19 +567,16 @@ __global__ void ShiftingAndResamplingKernelFloatV1(
     const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
     const int *__restrict__ VOutputLength, const int numResults,
     const int numChannels, const int signalLength, const float CurrentRealfreq,
-    const float sampling_rate,
-    float *I_shifted, float *Q_shifted,
-    float *__restrict__ outputIdata, float *__restrict__ outputQdata)
-{
+    const float sampling_rate, float *I_shifted, float *Q_shifted,
+    float *__restrict__ outputIdata, float *__restrict__ outputQdata) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= numResults * numChannels)
-    return;
+  if (idx >= numResults * numChannels) return;
 
   // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
-  int ResIdx = idx / numChannels; // 第几个检测结果
-  int chIdx = idx % numChannels;  // 第几个通道
+  int ResIdx = idx / numChannels;  // 第几个检测结果
+  int chIdx = idx % numChannels;   // 第几个通道
 
-  float frequency = VFrequency[ResIdx]; // 频率
+  float frequency = VFrequency[ResIdx];  // 频率
   float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
   // 获取当前线程处理的原始数据 （某一个通道的原始数据的地址）
@@ -756,8 +584,7 @@ __global__ void ShiftingAndResamplingKernelFloatV1(
   const auto Q_orig = origQdata + chIdx * signalLength;
 
   // 移频：生成本振信号并相乘
-  for (int i = 0; i < signalLength; i++)
-  {
+  for (int i = 0; i < signalLength; i++) {
     float phase = 2 * M_PI * deltaFreq * i / sampling_rate;
     float cosVal = cosf(phase);
     float sinVal = sinf(phase);
@@ -771,8 +598,7 @@ __global__ void ShiftingAndResamplingKernelFloatV1(
 
   // 计算之前带宽，对应的输出信号的总长度
   int beforeTotalLength = 0;
-  for (int i = 0; i < ResIdx; i++)
-  {
+  for (int i = 0; i < ResIdx; i++) {
     beforeTotalLength += VOutputLength[i];
   }
   // 当前带宽对应的输出信号的起始地址偏移
@@ -787,8 +613,10 @@ __global__ void ShiftingAndResamplingKernelFloatV1(
   auto Q_resampled = outputQdata + offset + chIdx * outputLength;
 
   // 重采样
-  dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled);
-  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+  dev_resample_float(upFactor, downFactor, I_shifted, signalLength,
+                     I_resampled);
+  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength,
+                     Q_resampled);
 }
 
 /**
@@ -815,19 +643,16 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
     const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
     const int numResults, const int numChannels, const int signalLength,
     const float CurrentRealfreq, const int alignSignalLength,
-    const float sampling_rate,
-    float *I_shifted, float *Q_shifted,
-    float *__restrict__ outputIdata, float *__restrict__ outputQdata)
-{
+    const float sampling_rate, float *I_shifted, float *Q_shifted,
+    float *__restrict__ outputIdata, float *__restrict__ outputQdata) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= numChannels * numResults)
-    return;
+  if (idx >= numChannels * numResults) return;
 
   // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
-  int ResIdx = idx / numChannels; // 第几个检测结果
-  int chIdx = idx % numChannels;  // 第几个通道
+  int ResIdx = idx / numChannels;  // 第几个检测结果
+  int chIdx = idx % numChannels;   // 第几个通道
 
-  float frequency = VFrequency[ResIdx]; // 频率
+  float frequency = VFrequency[ResIdx];  // 频率
   float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
   // 获取当前线程处理的通道数据地址
@@ -835,8 +660,7 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
   const auto Q_orig = origQdata + chIdx * signalLength;
 
   // 移频：生成本振信号并相乘
-  for (int i = 0; i < signalLength; i++)
-  {
+  for (int i = 0; i < signalLength; i++) {
     float phase = 2 * M_PI * deltaFreq * i / sampling_rate;
     float cosVal = cosf(phase);
     float sinVal = sinf(phase);
@@ -849,13 +673,17 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
   int downFactor = VDownFactor[ResIdx];
 
   // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
-  auto I_resampled = outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+  auto I_resampled =
+      outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
   // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
-  auto Q_resampled = outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+  auto Q_resampled =
+      outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
 
   // 重采样
-  dev_resample_float(upFactor, downFactor, I_shifted, signalLength, I_resampled);
-  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, Q_resampled);
+  dev_resample_float(upFactor, downFactor, I_shifted, signalLength,
+                     I_resampled);
+  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength,
+                     Q_resampled);
 }
 
 /**
@@ -880,64 +708,49 @@ __global__ void ShiftingAndResamplingKernelFloatV2(
 bool ShiftAndResampleSignalFloatV1(
     const std::vector<std::vector<float>> &origIdata,
     const std::vector<std::vector<float>> &origQdata,
-    std::vector<int> &outputLength,
-    std::vector<int> &downFactor,
-    std::vector<float> &detectFreq,
-    const int outputTotalLength,
-    const int numResults,
-    const int numChannels,
-    const float CurrentRealfreq,
-    float *outputIdata,
-    float *outputQdata)
-{
+    std::vector<int> &outputLength, std::vector<int> &downFactor,
+    std::vector<float> &detectFreq, const int outputTotalLength,
+    const int numResults, const int numChannels, const float CurrentRealfreq,
+    float *outputIdata, float *outputQdata) {
   // 参数合法性检查
-  if (outputTotalLength <= 0)
-  {
+  if (outputTotalLength <= 0) {
     LOG_ERROR_HOST("outputTotalLength <= 0");
     return false;
   }
-  if (numResults <= 0)
-  {
+  if (numResults <= 0) {
     LOG_ERROR_HOST("numResults <= 0");
     return false;
   }
-  if (numChannels <= 0)
-  {
+  if (numChannels <= 0) {
     LOG_ERROR_HOST("numChannels <= 0");
     return false;
   }
 
-  if (outputLength.size() != numResults)
-  {
+  if (outputLength.size() != numResults) {
     LOG_ERROR_HOST("vector outputLength lenght != numResults");
     return false;
   }
-  if (downFactor.size() != numResults)
-  {
+  if (downFactor.size() != numResults) {
     LOG_ERROR_HOST("vector downFactor lenght != numResults");
     return false;
   }
-  if (detectFreq.size() != numResults)
-  {
+  if (detectFreq.size() != numResults) {
     LOG_ERROR_HOST("vector detectFreq lenght != numResults");
     return false;
   }
 
-  if (outputIdata == nullptr)
-  {
+  if (outputIdata == nullptr) {
     LOG_ERROR_HOST("outputIdata is null ptr");
     return false;
   }
-  if (outputQdata == nullptr)
-  {
+  if (outputQdata == nullptr) {
     LOG_ERROR_HOST("outputQdata is null ptr");
     return false;
   }
 
   // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
   int signalLength = origIdata[0].size();
-  if (signalLength <= 0)
-  {
+  if (signalLength <= 0) {
     LOG_ERROR_HOST("signalLength <= 0");
     return false;
   }
@@ -988,8 +801,7 @@ bool ShiftAndResampleSignalFloatV1(
 
   // 将所有通道数据循环拷贝到GPU显存
   size_t copySize = signalLength * sizeof(float);
-  for (int i = 0; i < numChannels; i++)
-  {
+  for (int i = 0; i < numChannels; i++) {
     // copy 原始的 idata 到gpu显存
     float *dst_idata = d_Idata + i * signalLength;
     const void *src_idata = origIdata[i].data();
@@ -1006,18 +818,16 @@ bool ShiftAndResampleSignalFloatV1(
   // 申请移频所需的空间
   float *I_shifted = nullptr;
   float *Q_shifted = nullptr;
-  CHECK_CUDA_ERROR(cudaMalloc(&I_shifted,
-                              (signalLength * sizeof(float))));
-  CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted,
-                              (signalLength * sizeof(float))));
+  CHECK_CUDA_ERROR(cudaMalloc(&I_shifted, (signalLength * sizeof(float))));
+  CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted, (signalLength * sizeof(float))));
 
   // 申请重采样后输出信号的GPU显存
   float *d_outputIdata = nullptr;
   float *d_outputQdata = nullptr;
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata,
-                              (numChannels * outputTotalLength * sizeof(float))));
-  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata,
-                              (numChannels * outputTotalLength * sizeof(float))));
+  CHECK_CUDA_ERROR(cudaMalloc(
+      &d_outputIdata, (numChannels * outputTotalLength * sizeof(float))));
+  CHECK_CUDA_ERROR(cudaMalloc(
+      &d_outputQdata, (numChannels * outputTotalLength * sizeof(float))));
 
   // 线程数配置
   dim3 block(numChannels);
@@ -1025,9 +835,8 @@ bool ShiftAndResampleSignalFloatV1(
   const float sampling_rate = float(245.76e6);
   ShiftingAndResamplingKernelFloatV1<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
-      numChannels, signalLength, CurrentRealfreq, sampling_rate,
-      I_shifted, Q_shifted,
-      d_outputIdata, d_outputQdata);
+      numChannels, signalLength, CurrentRealfreq, sampling_rate, I_shifted,
+      Q_shifted, d_outputIdata, d_outputQdata);
 
   // copy重采样计算结果到主存
   // 存储格式：[numResults][numChannels][lengthPerResults]
@@ -1041,56 +850,47 @@ bool ShiftAndResampleSignalFloatV1(
                               cudaMemcpyDeviceToHost));
 
   // 释放显存
-  if (d_downFactor)
-  {
+  if (d_downFactor) {
     cudaFree(d_downFactor);
     d_downFactor = nullptr;
   }
 
-  if (d_outputLength)
-  {
+  if (d_outputLength) {
     cudaFree(d_outputLength);
     d_outputLength = nullptr;
   }
 
-  if (d_frequency)
-  {
+  if (d_frequency) {
     cudaFree(d_frequency);
     d_frequency = nullptr;
   }
 
-  if (d_Idata)
-  {
+  if (d_Idata) {
     cudaFree(d_Idata);
     d_Idata = nullptr;
   }
 
-  if (d_Qdata)
-  {
+  if (d_Qdata) {
     cudaFree(d_Qdata);
     d_Qdata = nullptr;
   }
 
-  if (I_shifted)
-  {
+  if (I_shifted) {
     cudaFree(I_shifted);
     I_shifted = nullptr;
   }
 
-  if (Q_shifted)
-  {
+  if (Q_shifted) {
     cudaFree(Q_shifted);
     Q_shifted = nullptr;
   }
 
-  if (d_outputIdata)
-  {
+  if (d_outputIdata) {
     cudaFree(d_outputIdata);
     d_outputIdata = nullptr;
   }
 
-  if (d_outputQdata)
-  {
+  if (d_outputQdata) {
     cudaFree(d_outputQdata);
     d_outputQdata = nullptr;
   }
@@ -1119,58 +919,44 @@ bool ShiftAndResampleSignalFloatV1(
 bool ShiftAndResampleSignalFloatV2(
     const std::vector<std::vector<float>> &origIdata,
     const std::vector<std::vector<float>> &origQdata,
-    std::vector<int> &downFactor,
-    std::vector<float> &detectFreq,
-    const int alignSignalLength,
-    const int numResults,
-    const int numChannels,
-    const float CurrentRealfreq,
-    float *outputIdata,
-    float *outputQdata)
-{
+    std::vector<int> &downFactor, std::vector<float> &detectFreq,
+    const int alignSignalLength, const int numResults, const int numChannels,
+    const float CurrentRealfreq, float *outputIdata, float *outputQdata) {
   // 参数合法性检查
-  if (alignSignalLength <= 0)
-  {
+  if (alignSignalLength <= 0) {
     LOG_ERROR_HOST("alignSignalLength <= 0");
     return false;
   }
-  if (numResults <= 0)
-  {
+  if (numResults <= 0) {
     LOG_ERROR_HOST("numResults <= 0");
     return false;
   }
-  if (numChannels <= 0)
-  {
+  if (numChannels <= 0) {
     LOG_ERROR_HOST("numChannels <= 0");
     return false;
   }
 
-  if (downFactor.size() != numResults)
-  {
+  if (downFactor.size() != numResults) {
     LOG_ERROR_HOST("vector downFactor lenght != numResults");
     return false;
   }
-  if (detectFreq.size() != numResults)
-  {
+  if (detectFreq.size() != numResults) {
     LOG_ERROR_HOST("vector detectFreq lenght != numResults");
     return false;
   }
 
-  if (outputIdata == nullptr)
-  {
+  if (outputIdata == nullptr) {
     LOG_ERROR_HOST("outputIdata is null ptr");
     return false;
   }
-  if (outputQdata == nullptr)
-  {
+  if (outputQdata == nullptr) {
     LOG_ERROR_HOST("outputQdata is null ptr");
     return false;
   }
 
   // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
   int signalLength = origIdata[0].size();
-  if (signalLength <= 0)
-  {
+  if (signalLength <= 0) {
     LOG_ERROR_HOST("signalLength <= 0");
     return false;
   }
@@ -1199,7 +985,8 @@ bool ShiftAndResampleSignalFloatV2(
   // copy频率到显存中
   const float *src_frequency = detectFreq.data();
   CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(float), cudaMemcpyHostToDevice));
+                              numResults * sizeof(float),
+                              cudaMemcpyHostToDevice));
 
   // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
   float *d_Idata = nullptr;
@@ -1211,8 +998,7 @@ bool ShiftAndResampleSignalFloatV2(
 
   // 将所有通道数据循环拷贝到GPU显存
   size_t copySize = signalLength * sizeof(float);
-  for (int i = 0; i < numChannels; i++)
-  {
+  for (int i = 0; i < numChannels; i++) {
     // copy 原始的idata 到gpu显存
     float *dst_idata = d_Idata + i * signalLength;
     const void *src_idata = origIdata[i].data();
@@ -1229,13 +1015,12 @@ bool ShiftAndResampleSignalFloatV2(
   // 申请移频所需的空间
   float *I_shifted = nullptr;
   float *Q_shifted = nullptr;
-  CHECK_CUDA_ERROR(cudaMalloc(&I_shifted,
-                              (signalLength * sizeof(float))));
-  CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted,
-                              (signalLength * sizeof(float))));
+  CHECK_CUDA_ERROR(cudaMalloc(&I_shifted, (signalLength * sizeof(float))));
+  CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted, (signalLength * sizeof(float))));
 
   // 申请重采样后输出信号的GPU显存
-  size_t totalsize = numResults * numChannels * alignSignalLength * sizeof(float);
+  size_t totalsize =
+      numResults * numChannels * alignSignalLength * sizeof(float);
   float *d_outputIdata = nullptr;
   float *d_outputQdata = nullptr;
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, totalsize));
@@ -1250,68 +1035,56 @@ bool ShiftAndResampleSignalFloatV2(
   dim3 grid((numChannels * numResults + block.x - 1) / block.x);
   const float sampling_rate = float(245.76e6);
   ShiftingAndResamplingKernelFloatV2<<<grid, block>>>(
-      d_Idata, d_Qdata, d_downFactor, d_frequency, numResults,
-      numChannels, signalLength, CurrentRealfreq, alignSignalLength,
-      sampling_rate,
-      I_shifted, Q_shifted,
-      d_outputIdata, d_outputQdata);
+      d_Idata, d_Qdata, d_downFactor, d_frequency, numResults, numChannels,
+      signalLength, CurrentRealfreq, alignSignalLength, sampling_rate,
+      I_shifted, Q_shifted, d_outputIdata, d_outputQdata);
 
   // copy重采样计算结果到主存
   // 存储格式：[numResults][numChannels][lengthPerResults]
   // 且在内存中是连续存放的
-  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
-                              totalsize,
+  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata, totalsize,
                               cudaMemcpyDeviceToHost));
 
-  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
-                              totalsize,
+  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata, totalsize,
                               cudaMemcpyDeviceToHost));
 
   // 释放显存
-  if (d_downFactor)
-  {
+  if (d_downFactor) {
     cudaFree(d_downFactor);
     d_downFactor = nullptr;
   }
 
-  if (d_frequency)
-  {
+  if (d_frequency) {
     cudaFree(d_frequency);
     d_frequency = nullptr;
   }
 
-  if (d_Idata)
-  {
+  if (d_Idata) {
     cudaFree(d_Idata);
     d_Idata = nullptr;
   }
 
-  if (d_Qdata)
-  {
+  if (d_Qdata) {
     cudaFree(d_Qdata);
     d_Qdata = nullptr;
   }
 
-  if (I_shifted)
-  {
+  if (I_shifted) {
     cudaFree(I_shifted);
     I_shifted = nullptr;
   }
 
-  if (Q_shifted)
-  {
+  if (Q_shifted) {
     cudaFree(Q_shifted);
     Q_shifted = nullptr;
   }
 
-  if (d_outputIdata)
-  {
+  if (d_outputIdata) {
     cudaFree(d_outputIdata);
     d_outputIdata = nullptr;
   }
 
-  if (d_outputQdata)
-  {
+  if (d_outputQdata) {
     cudaFree(d_outputQdata);
     d_outputQdata = nullptr;
   }
diff --git a/cuda_resample_float.h b/cuda_resample_float.h
index e278445..5d6ac97 100644
--- a/cuda_resample_float.h
+++ b/cuda_resample_float.h
@@ -1,10 +1,10 @@
-#ifndef CUDA_RESAMPLE_H
-#define CUDA_RESAMPLE_H
+#ifndef CUDA_RESAMPLE_FLOAT_H
+#define CUDA_RESAMPLE_FLOAT_H
 
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
+#include <math_constants.h>  // CUDA数学常量头文件
 #include <thrust/device_vector.h>
-#include <math_constants.h> // CUDA数学常量头文件
 
 #include <cmath>
 #include <map>
@@ -16,15 +16,14 @@
 #endif
 
 // 设备端Resampler状态结构
-struct DeviceResamplerStateFloat
-{
-    int _t;                  // "time" (modulo upRate)
-    int _xOffset;            // 输入偏移量
-    float *_state;           // 状态缓冲区指针
-    float *_transposedCoefs; // 转置系数指针
-    int _coefsPerPhase;      // 每相系数数量
-    int _upRate;             // 上采样率
-    int _downRate;           // 下采样率
+struct DeviceResamplerStateFloat {
+  int _t;                   // "time" (modulo upRate)
+  int _xOffset;             // 输入偏移量
+  float *_state;            // 状态缓冲区指针
+  float *_transposedCoefs;  // 转置系数指针
+  int _coefsPerPhase;       // 每相系数数量
+  int _upRate;              // 上采样率
+  int _downRate;            // 下采样率
 };
 
 /**
@@ -49,15 +48,10 @@ struct DeviceResamplerStateFloat
 bool ShiftAndResampleSignalFloatV1(
     const std::vector<std::vector<float>> &origIdata,
     const std::vector<std::vector<float>> &origQdata,
-    std::vector<int> &outputLength,
-    std::vector<int> &downFactor,
-    std::vector<float> &detectFreq,
-    const int outputTotalLength,
-    const int numResults,
-    const int numChannels,
-    const float CurrentRealfreq,
-    float *outputIdata,
-    float *outputQdata);
+    std::vector<int> &outputLength, std::vector<int> &downFactor,
+    std::vector<float> &detectFreq, const int outputTotalLength,
+    const int numResults, const int numChannels, const float CurrentRealfreq,
+    float *outputIdata, float *outputQdata);
 
 /**
  * ShiftAndResampleSignalFloatV2
@@ -80,13 +74,8 @@ bool ShiftAndResampleSignalFloatV1(
 bool ShiftAndResampleSignalFloatV2(
     const std::vector<std::vector<float>> &origIdata,
     const std::vector<std::vector<float>> &origQdata,
-    std::vector<int> &downFactor,
-    std::vector<float> &detectFreq,
-    const int alignSignalLength,
-    const int numResults,
-    const int numChannels,
-    const float CurrentRealfreq,
-    float *outputIdata,
-    float *outputQdata);
+    std::vector<int> &downFactor, std::vector<float> &detectFreq,
+    const int alignSignalLength, const int numResults, const int numChannels,
+    const float CurrentRealfreq, float *outputIdata, float *outputQdata);
 
-#endif // CUDA_RESAMPLE_H
+#endif  // CUDA_RESAMPLE_FLOAT_H
-- 
Gitee


From 7f92be842f4afb5366a906b6dc8f2437ab0a7be8 Mon Sep 17 00:00:00 2001
From: wsqRichards <229242333@qq.com>
Date: Mon, 22 Dec 2025 10:39:16 +0800
Subject: [PATCH 27/27] =?UTF-8?q?=E5=AE=8C=E5=96=84resample-23?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cuda_resample.cpp => common.cpp |   4 +-
 cuda_resample.h => common.h     |   6 +-
 cuda_resample_double.cu         | 433 +++++++++++++++++++++++---------
 cuda_resample_float.cu          |  89 ++++++-
 4 files changed, 407 insertions(+), 125 deletions(-)
 rename cuda_resample.cpp => common.cpp (98%)
 rename cuda_resample.h => common.h (98%)

diff --git a/cuda_resample.cpp b/common.cpp
similarity index 98%
rename from cuda_resample.cpp
rename to common.cpp
index e93b584..822f9b2 100644
--- a/cuda_resample.cpp
+++ b/common.cpp
@@ -1,5 +1,5 @@
-#ifndef CUDA_RESAMPLE_H
-#define CUDA_RESAMPLE_H
+#ifndef __COMMON_H__
+#define __COMMON_H__
 
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
diff --git a/cuda_resample.h b/common.h
similarity index 98%
rename from cuda_resample.h
rename to common.h
index 377eb63..d573f60 100644
--- a/cuda_resample.h
+++ b/common.h
@@ -1,5 +1,5 @@
-#ifndef CUDA_RESAMPLE_H
-#define CUDA_RESAMPLE_H
+#ifndef __COMMON_H__
+#define __COMMON_H__
 
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
@@ -122,4 +122,4 @@ int quotientCeil(int num1, int num2);
 int SetGPUHeapSize(int deviceId, float heapSizePercent, size_t minHeapSizeMB,
                    size_t maxHeapSizeMB);
 
-#endif  // CUDA_RESAMPLE_H
\ No newline at end of file
+#endif  // __COMMON_H__
\ No newline at end of file
diff --git a/cuda_resample_double.cu b/cuda_resample_double.cu
index 23970bc..9f9ceb5 100644
--- a/cuda_resample_double.cu
+++ b/cuda_resample_double.cu
@@ -1,30 +1,48 @@
-#include "cuda_resample.h"
+#include <stdio.h>
+
+#include "common.h"
 #include "cuda_resample_double.h"
 
 // 设备端Resampler初始化
-__device__ void resampler_apply_device_double(DeviceResamplerStateDouble *state,
-                                              double *transposedCoefs,
-                                              int coefsPerPhase, int upRate,
-                                              int downRate) {
+__device__ void resampler_init_state_device_double(
+    DeviceResamplerStateDouble *state, double *transposedCoefs,
+    int coefsPerPhase, int upRate, int downRate) {
+  if (state == nullptr) {
+    LOG_ERROR_DEVICE("state ptr is nullptr!");
+    return;
+  }
   state->_t = 0;
   state->_xOffset = 0;
+  if (transposedCoefs == nullptr) {
+    LOG_ERROR_DEVICE("transposedCoefs ptr is nullptr!");
+    return;
+  }
   state->_transposedCoefs = transposedCoefs;
   state->_coefsPerPhase = coefsPerPhase;
   state->_upRate = upRate;
   state->_downRate = downRate;
 
   // 分配状态缓冲区
-  state->_state = new double[coefsPerPhase - 1];
+  state->_state = (double *)malloc((coefsPerPhase - 1) * sizeof(double));
+  if (state->_state == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for state->_state!");
+    return;
+  }
 
   // 初始化状态为零
   for (int i = 0; i < coefsPerPhase - 1; i++) {
-    state->_state[i] = 0;
+    state->_state[i] = double(0);
   }
 }
 
 // 设备端：计算所需输出数量
 __device__ int resampler_needed_out_count_device_double(
     int inCount, DeviceResamplerStateDouble *state) {
+  if (state == nullptr) {
+    LOG_ERROR_DEVICE("state ptr is nullptr!");
+    return -1;  // int-returning function: report the error instead of a bare return
+  }
+
   int np = inCount * state->_upRate;
   int need = np / state->_downRate;
 
@@ -40,6 +58,19 @@ __device__ int resampler_needed_out_count_device_double(
 __device__ int resampler_apply_device_double(
     double *in, int inCount, double *out, int outCount,
     DeviceResamplerStateDouble *state) {
+  if (in == nullptr) {
+    LOG_ERROR_DEVICE("in ptr is nullptr!");
+    return -1;  // same error code as the outCount check below
+  }
+  if (out == nullptr) {
+    LOG_ERROR_DEVICE("out ptr is nullptr!");
+    return -1;
+  }
+  if (state == nullptr) {
+    LOG_ERROR_DEVICE("state ptr is nullptr!");
+    return -1;
+  }
+
   if (outCount < resampler_needed_out_count_device_double(inCount, state)) {
     // 在设备端无法抛出异常，返回错误代码
     return -1;
@@ -106,23 +137,22 @@ __device__ int resampler_apply_device_double(
   return y - out;
 }
 
-// 设备端：释放Resampler状态
-__device__ void resampler_apply_device_double(
-    DeviceResamplerStateDouble *state) {
-  if (state->_state != nullptr) {
-    delete[] state->_state;
-    state->_state = nullptr;
-  }
-}
-
 // 设备端：转置滤波器系数（每个线程执行）
 __device__ void transpose_filter_coefs_device_double(double *transposedCoefs,
                                                      double *coefs, int upRate,
                                                      int coefCount,
                                                      int coefsPerPhase) {
+  if (transposedCoefs == nullptr) {
+    LOG_ERROR_DEVICE("transposedCoefs ptr is nullptr!");
+    return;
+  }
+  if (coefs == nullptr) {
+    LOG_ERROR_DEVICE("coefs ptr is nullptr!");
+    return;
+  }
   // 初始化转置系数为零
   for (int i = 0; i < upRate * coefsPerPhase; i++) {
-    transposedCoefs[i] = 0;
+    transposedCoefs[i] = double(0);
   }
 
   // 转置并翻转每个相位
@@ -141,6 +171,19 @@ __device__ void upfirdn_device_double(int upRate, int downRate, double *input,
                                       int inLength, double *filter,
                                       int filterLength, double *results,
                                       int *resultsCount) {
+  if (input == nullptr) {
+    LOG_ERROR_DEVICE("input ptr is nullptr!");
+    return;
+  }
+  if (filter == nullptr) {
+    LOG_ERROR_DEVICE("filter ptr is nullptr!");
+    return;
+  }
+  if (results == nullptr) {
+    LOG_ERROR_DEVICE("results ptr is nullptr!");
+    return;
+  }
+
   // 计算填充后的系数数量
   int paddedCoefCount = filterLength;
   while (paddedCoefCount % upRate) {
@@ -150,7 +193,11 @@ __device__ void upfirdn_device_double(int upRate, int downRate, double *input,
   int coefsPerPhase = paddedCoefCount / upRate;
 
   // 分配转置系数内存
-  double *transposedCoefs = new double[paddedCoefCount];
+  double *transposedCoefs = (double *)malloc(paddedCoefCount * sizeof(double));
+  if (transposedCoefs == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for transposedCoefs!");
+    return;
+  }
 
   // 转置滤波器系数
   transpose_filter_coefs_device_double(transposedCoefs, filter, upRate,
@@ -158,14 +205,18 @@ __device__ void upfirdn_device_double(int upRate, int downRate, double *input,
 
   // 创建Resampler状态
   DeviceResamplerStateDouble state;
-  resampler_apply_device_double(&state, transposedCoefs, coefsPerPhase, upRate,
-                                downRate);
+  resampler_init_state_device_double(&state, transposedCoefs, coefsPerPhase,
+                                     upRate, downRate);
 
   // 计算填充量
   int padding = coefsPerPhase - 1;
 
   // 分配填充输入内存
-  double *inputPadded = new double[inLength + padding];
+  double *inputPadded = (double *)malloc((inLength + padding) * sizeof(double));
+  if (inputPadded == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for inputPadded!");
+    return;
+  }
 
   // 复制输入并填充
   for (int i = 0; i < inLength + padding; i++) {
@@ -190,22 +241,19 @@ __device__ void upfirdn_device_double(int upRate, int downRate, double *input,
       inputPadded, inLength + padding, results, resultsCountValue, &state);
 
   // 清理设备内存
-  delete[] transposedCoefs;
-  delete[] inputPadded;
-  resampler_apply_device_double(&state);
-}
+  free(inputPadded);
 
-// 向量版本的设备端upfirdn
-__device__ void upfirdn_device_double(int upRate, int downRate, double *input,
-                                      int inputLength, double *filter,
-                                      int filterLength, double *results) {
-  upfirdn_device_double(upRate, downRate, input, inputLength, filter,
-                        filterLength, results, nullptr);
+  if (state._state != nullptr) {
+    free(state._state);
+    state._state = nullptr;
+  }
+
+  free(transposedCoefs);
+  state._transposedCoefs = nullptr;
 }
 
 // 整数向上取整除法
 __device__ __forceinline__ int dev_quotientCeil(int num1, int num2) {
-  // 标准的上取整公式：(a + b - 1) / b
   return (num1 + num2 - 1) / num2;
 }
 
@@ -222,6 +270,10 @@ __device__ __forceinline__ int dev_gcd(int a, int b) {
 // 生成连续递增的序列
 __device__ __forceinline__ void dev_iota_double(double *data, int size,
                                                 double start) {
+  if (data == nullptr) {
+    LOG_ERROR_DEVICE("data ptr is nullptr!");
+    return;
+  }
   for (int i = 0; i < size; i++) {
     data[i] = start + double(i);
   }
@@ -231,6 +283,10 @@ __device__ __forceinline__ void dev_iota_double(double *data, int size,
 // 填充data为value
 __device__ __forceinline__ void dev_fill_double(double *data, int size,
                                                 double value) {
+  if (data == nullptr) {
+    LOG_ERROR_DEVICE("data ptr is nullptr!");
+    return;
+  }
   for (int i = 0; i < size; i++) {
     data[i] = value;
   }
@@ -239,12 +295,26 @@ __device__ __forceinline__ void dev_fill_double(double *data, int size,
 
 __device__ int dev_firls_double(double *result, int length, double *freq,
                                 const double *amplitude, int freqSize) {
+  if (result == nullptr) {
+    LOG_ERROR_DEVICE("result ptr is nullptr!");
+    return -1;  // same error code as the allocation-failure paths below
+  }
+  if (freq == nullptr) {
+    LOG_ERROR_DEVICE("freq ptr is nullptr!");
+    return -1;
+  }
+  if (amplitude == nullptr) {
+    LOG_ERROR_DEVICE("amplitude ptr is nullptr!");
+    return -1;
+  }
+
   // 计算权重大小
   int weightSize = freqSize / 2;
 
   // 初始化权重向量
-  double *weight = new double[weightSize];
+  double *weight = (double *)malloc(weightSize * sizeof(double));
   if (weight == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for weight!");
     return -1;
   }
 
@@ -264,8 +334,9 @@ __device__ int dev_firls_double(double *result, int length, double *freq,
 
   // 创建和初始化向量k
   int kLength = length + 1;
-  double *k = new double[kLength];
+  double *k = (double *)malloc(kLength * sizeof(double));
   if (k == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for k向量!");
     return -1;
   };
 
@@ -278,7 +349,6 @@ __device__ int dev_firls_double(double *result, int length, double *freq,
     }
   }
 
-  // k.erase(k.begin());
   if (Nodd) {
     for (int i = 0; i < kLength; i++) {
       k[i] = k[i + 1];
@@ -291,8 +361,10 @@ __device__ int dev_firls_double(double *result, int length, double *freq,
   if (Nodd) {
     bLength++;  // 此处++，因为后面需要在b[0]处插入b0
   }
-  double *b = new double[bLength];
+
+  double *b = (double *)malloc(bLength * sizeof(double));
   if (b == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for b向量!");
     return -1;
   };
 
@@ -304,26 +376,27 @@ __device__ int dev_firls_double(double *result, int length, double *freq,
     double Fip1 = freq[i + 1];
     double ampi = amplitude[i];
     double ampip1 = amplitude[i + 1];
-    double wt2 = pow(weight[i / 2], double(2.0));
+    double wt2 = pow(weight[i / 2], double(2.0));  // keep double precision (powf rounds through float)
     double m_s = (ampip1 - ampi) / (Fip1 - Fi);
     double b1 = ampi - (m_s * Fi);
 
     if (Nodd) {
       b0 += (b1 * (Fip1 - Fi)) +
             m_s / double(2.0) *
-                (pow(Fip1, double(2.0)) - pow(Fi, double(2.0))) * wt2;
+                (pow(Fip1, double(2.0)) - pow(Fi, double(2.0))) * wt2;
     }
 
     // 并行计算b向量
     for (int j = 0; j < kLength; j++) {
       double kj = k[j];
-      b[j] += (m_s / (double(4.0) * pow(M_PI, double(2.0))) *
-               (cos(double(2.0) * M_PI * Fip1) - cos(double(2.0) * M_PI * Fi)) /
-               (pow(kj, double(2.0)))) *
-              wt2;
-
-      b[j] += (Fip1 * (m_s * Fip1 + b1) * sin(double(2.0) * kj * Fip1) -
-               Fi * (m_s * Fi + b1) * sin(double(2.0) * kj * Fi)) *
+      b[j] +=
+          (m_s / (double(4.0) * pow(M_PI, double(2.0))) *
+           (cos(double(2.0) * M_PI * Fip1) - cos(double(2.0) * M_PI * Fi)) /
+           (pow(kj, double(2.0)))) *
+          wt2;
+
+      b[j] += (Fip1 * (m_s * Fip1 + b1) * sin(double(2.0) * kj * Fip1) -
+               Fi * (m_s * Fi + b1) * sin(double(2.0) * kj * Fi)) *
               wt2;
     }
   }
@@ -343,14 +416,15 @@ __device__ int dev_firls_double(double *result, int length, double *freq,
   double w0 = weight[0];
 
   int aLength = bLength;
-  double *a = new double[aLength];
+  double *a = (double *)malloc(aLength * sizeof(double));
   if (a == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for a向量!");
     return -1;
   };
 
   // vector<double> result = {a.rbegin(), a.rend()};
   for (int i = 0; i < aLength; i++) {
-    a[i] = pow(w0, double(2.0)) * double(4.0) * b[i];
+    a[i] = pow(w0, double(2.0)) * double(4.0) * b[i];
     result[aLength - 1 - i] = a[i];
   }
 
@@ -368,10 +442,10 @@ __device__ int dev_firls_double(double *result, int length, double *freq,
   }
 
   // 释放动态分配的内存
-  delete[] weight;  // 释放内存
-  delete[] k;       // 释放内存
-  delete[] b;       // 释放内存
-  delete[] a;       // 释放内存
+  free(weight);
+  free(k);
+  free(b);
+  free(a);
   return 0;
 }
 
@@ -388,12 +462,17 @@ __device__ double dev_cyl_bessel_i_double(int n, double x) {
 
 // 设备端凯塞窗核函数
 __device__ void dev_kaiser_double(double *window, int order, double bta) {
+  if (window == nullptr) {
+    LOG_ERROR_DEVICE("window ptr is nullptr!");
+    return;
+  }
+
   double Numerator, Denominator;
   Denominator = dev_cyl_bessel_i_double(0, bta);
   double od2 = (order - double(1)) / double(2);
 
   for (int n = 0; n < order; n++) {
-    double x = bta * sqrt(double(1) - pow((n - od2) / od2, double(2)));
+    double x = bta * sqrt(double(1) - pow((n - od2) / od2, double(2)));
     Numerator = dev_cyl_bessel_i_double(0, x);
     window[n] = Numerator / Denominator;
   }
@@ -402,10 +481,20 @@ __device__ void dev_kaiser_double(double *window, int order, double bta) {
 __device__ void dev_resample_double(int upFactor, int downFactor,
                                     double *inputSignal, const int inputSize,
                                     double *outputSignal) {
+  if (inputSignal == nullptr) {
+    LOG_ERROR_DEVICE("inputSignal ptr is nullptr!");
+    return;
+  }
+  if (outputSignal == nullptr) {
+    LOG_ERROR_DEVICE("outputSignal ptr is nullptr!");
+    return;
+  }
+
   const int n = 10;
   const double bta = double(5.0);
 
   if (upFactor <= 0 || downFactor <= 0) {
+    LOG_ERROR_DEVICE("upFactor and downFactor must be positive integer!");
     return;
   }
 
@@ -425,6 +514,7 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
 
   int maxFactor = (upFactor > downFactor) ? upFactor : downFactor;
   double firlsFreq = double(1.0) / double(2.0) / static_cast<double>(maxFactor);
+  int length = 2 * n * maxFactor + 1;
 
   double firlsFreqsV[4];
   firlsFreqsV[0] = double(0.0);
@@ -439,24 +529,28 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
   firlsAmplitudeV[3] = double(0.0);
 
   int freqSize = 4;
-  int length = 2 * n * maxFactor + 1;
   int coefficientsLength = length;
 
-  double *coefficients = new double[coefficientsLength];
+  double *coefficients = (double *)malloc(coefficientsLength * sizeof(double));
   if (coefficients == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for coefficients!");
     return;
   }
+
   int ret = dev_firls_double(coefficients, length - 1, firlsFreqsV,
                              firlsAmplitudeV, freqSize);
   if (ret == -1) {
+    LOG_ERROR_DEVICE("dev_firls_double function error!");
     return;
   }
 
   int windowSize = length;
-  double *window = new double[windowSize];
+  double *window = (double *)malloc(windowSize * sizeof(double));
   if (window == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for window!");
     return;
   }
+
   dev_kaiser_double(window, length, bta);
 
   for (int i = 0; i < coefficientsLength; i++) {
@@ -467,9 +561,11 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
   int nz = downFactor - lengthHalf % downFactor;
 
   // 分配filter空间
-  int hSize = coefficientsLength + 2 * nz;
-  double *filter = new double[hSize];
+  int hSize = coefficientsLength + nz;
+  double *filter =
+      (double *)malloc((coefficientsLength + 3 * nz) * sizeof(double));
   if (filter == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for filter!");
     return;
   }
 
@@ -509,8 +605,9 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
   int outputCount =
       ((inputSize + padding) * upFactor + downFactor - 1) / downFactor;
 
-  double *results = new double[outputCount];
+  double *results = (double *)malloc(outputCount * sizeof(double));
   if (results == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for upfirdn results!");
     return;
   }
 
@@ -524,10 +621,10 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
   }
 
   // 释放动态分配的内存
-  delete[] coefficients;
-  delete[] window;
-  delete[] filter;
-  delete[] results;
+  free(coefficients);
+  free(window);
+  free(filter);
+  free(results);
   return;
 }
 
@@ -546,6 +643,7 @@ __device__ void dev_resample_double(int upFactor, int downFactor,
  * @param numChannels：信号通道数
  * @param signalLength：每个通道的信号长度
  * @param CurrentRealfreq：当前实际频率
+ * @param sampling_rate
  * @param outputIdata：存放所有重采样后的Idata，调用时需确保显存空间足够
  * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
  * @return true or false
@@ -555,36 +653,27 @@ __global__ void ShiftingAndResamplingKernelDoubleV1(
     const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency,
     const int *__restrict__ VOutputLength, const int numResults,
     const int numChannels, const int signalLength, const double CurrentRealfreq,
+    const double sampling_rate, double *I_shifted, double *Q_shifted,
     double *__restrict__ outputIdata, double *__restrict__ outputQdata) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= numChannels * numResults) return;
+  if (idx >= numResults * numChannels) return;
 
   // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
   int ResIdx = idx / numChannels;  // 第几个检测结果
   int chIdx = idx % numChannels;   // 第几个通道
 
-  const double sampling_rate = double(245.76e6);
-
   double frequency = VFrequency[ResIdx];  // 频率
   double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
-  // 获取当前线程处理的通道数据地址
+  // 获取当前线程处理的原始数据 （某一个通道的原始数据的地址）
   const auto I_orig = origIdata + chIdx * signalLength;
   const auto Q_orig = origQdata + chIdx * signalLength;
 
   // 移频：生成本振信号并相乘
-  double *I_shifted = new double[signalLength];
-  if (I_shifted == nullptr) {
-    return;
-  }
-  double *Q_shifted = new double[signalLength];
-  if (Q_shifted == nullptr) {
-    return;
-  }
   for (int i = 0; i < signalLength; i++) {
     double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
-    double cosVal = cos(phase);
-    double sinVal = sin(phase);
+    double cosVal = cos(phase);  // double-precision: cosf/sinf would round the growing phase to float
+    double sinVal = sin(phase);
     I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
     Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
   }
@@ -614,10 +703,6 @@ __global__ void ShiftingAndResamplingKernelDoubleV1(
                       I_resampled);
   dev_resample_double(upFactor, downFactor, Q_shifted, signalLength,
                       Q_resampled);
-
-  // 释放动态分配的内存
-  delete[] I_shifted;
-  delete[] Q_shifted;
 }
 
 /**
@@ -644,6 +729,7 @@ __global__ void ShiftingAndResamplingKernelDoubleV2(
     const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency,
     const int numResults, const int numChannels, const int signalLength,
     const double CurrentRealfreq, const int alignSignalLength,
+    const double sampling_rate, double *I_shifted, double *Q_shifted,
     double *__restrict__ outputIdata, double *__restrict__ outputQdata) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx >= numChannels * numResults) return;
@@ -652,28 +738,18 @@ __global__ void ShiftingAndResamplingKernelDoubleV2(
   int ResIdx = idx / numChannels;  // 第几个检测结果
   int chIdx = idx % numChannels;   // 第几个通道
 
-  const double sampling_rate = double(245.76e6);
-
   double frequency = VFrequency[ResIdx];  // 频率
   double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
 
   // 获取当前线程处理的通道数据地址
-  auto I_orig = origIdata + chIdx * signalLength;
-  auto Q_orig = origQdata + chIdx * signalLength;
+  const auto I_orig = origIdata + chIdx * signalLength;
+  const auto Q_orig = origQdata + chIdx * signalLength;
 
   // 移频：生成本振信号并相乘
-  double *I_shifted = new double[signalLength];
-  if (I_shifted == nullptr) {
-    return;
-  }
-  double *Q_shifted = new double[signalLength];
-  if (Q_shifted == nullptr) {
-    return;
-  }
   for (int i = 0; i < signalLength; i++) {
     double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
-    double cosVal = cos(phase);
-    double sinVal = sin(phase);
+    double cosVal = cos(phase);  // double-precision: cosf/sinf would round the growing phase to float
+    double sinVal = sin(phase);
     I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
     Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
   }
@@ -694,10 +770,6 @@ __global__ void ShiftingAndResamplingKernelDoubleV2(
                       I_resampled);
   dev_resample_double(upFactor, downFactor, Q_shifted, signalLength,
                       Q_resampled);
-
-  // 释放动态分配的内存
-  delete[] I_shifted;
-  delete[] Q_shifted;
 }
 
 /**
@@ -726,14 +798,62 @@ bool ShiftAndResampleSignalDoubleV1(
     std::vector<double> &detectFreq, const int outputTotalLength,
     const int numResults, const int numChannels, const double CurrentRealfreq,
     double *outputIdata, double *outputQdata) {
+  // 参数合法性检查
+  if (outputTotalLength <= 0) {
+    LOG_ERROR_HOST("outputTotalLength <= 0");
+    return false;
+  }
+  if (numResults <= 0) {
+    LOG_ERROR_HOST("numResults <= 0");
+    return false;
+  }
+  if (numChannels <= 0) {
+    LOG_ERROR_HOST("numChannels <= 0");
+    return false;
+  }
+
+  if (outputLength.size() != numResults) {
+    LOG_ERROR_HOST("vector outputLength lenght != numResults");
+    return false;
+  }
+  if (downFactor.size() != numResults) {
+    LOG_ERROR_HOST("vector downFactor lenght != numResults");
+    return false;
+  }
+  if (detectFreq.size() != numResults) {
+    LOG_ERROR_HOST("vector detectFreq lenght != numResults");
+    return false;
+  }
+
+  if (outputIdata == nullptr) {
+    LOG_ERROR_HOST("outputIdata is null ptr");
+    return false;
+  }
+  if (outputQdata == nullptr) {
+    LOG_ERROR_HOST("outputQdata is null ptr");
+    return false;
+  }
+
   // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
   int signalLength = origIdata[0].size();
+  if (signalLength <= 0) {
+    LOG_ERROR_HOST("signalLength <= 0");
+    return false;
+  }
+
+  // 设置CUDA设备堆大小以便支持设备端malloc/free
+  int deviceId = 0;
+  double heapSizePercent = 10.0f;
+  size_t minHeapSizeMB = 16;
+  size_t maxHeapSizeMB = 512;
+  SetGPUHeapSize(deviceId, heapSizePercent, minHeapSizeMB, maxHeapSizeMB);
 
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
-  // copy下采样率，频率等数据到显存中
+  // copy下采样率，频率，输出信号长度等数据到显存中
   int *d_downFactor = nullptr;
   int *d_outputLength = nullptr;
   double *d_frequency = nullptr;
+
   // 申请显存
   CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
   CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
@@ -745,18 +865,18 @@ bool ShiftAndResampleSignalDoubleV1(
                               numResults * sizeof(int),
                               cudaMemcpyHostToDevice));
 
-  // copy频率到显存中
-  const double *src_frequency = detectFreq.data();
-  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
-                              numResults * sizeof(double),
-                              cudaMemcpyHostToDevice));
-
   // copy每个带宽，重采样后输出信号长度到显存中
   const int *src_outputLength = outputLength.data();
   CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, src_outputLength,
                               numResults * sizeof(int),
                               cudaMemcpyHostToDevice));
 
+  // copy频率到显存中
+  const double *src_frequency = detectFreq.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
+                              numResults * sizeof(double),
+                              cudaMemcpyHostToDevice));
+
   // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
   double *d_Idata = nullptr;
   double *d_Qdata = nullptr;
@@ -768,19 +888,25 @@ bool ShiftAndResampleSignalDoubleV1(
   // 将所有通道数据循环拷贝到GPU显存
   size_t copySize = signalLength * sizeof(double);
   for (int i = 0; i < numChannels; i++) {
-    // copy 原始的idata 到gpu显存
+    // copy 原始的 idata 到gpu显存
     double *dst_idata = d_Idata + i * signalLength;
     const void *src_idata = origIdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
 
-    // copy 原始的qdata 到gpu显存
+    // copy 原始的 qdata 到gpu显存
     double *dst_qdata = d_Qdata + i * signalLength;
     const void *src_qdata = origQdata[i].data();
     CHECK_CUDA_ERROR(
         cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
   }
 
+  // Scratch buffers for the frequency-shifted I/Q signal, signalLength doubles each.
+  double *I_shifted = nullptr;  // NOTE(review): the kernel launch below passes this one buffer to
+  double *Q_shifted = nullptr;  // all threads and each thread writes [0, signalLength) -- looks racy; confirm.
+  CHECK_CUDA_ERROR(cudaMalloc(&I_shifted, (signalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted, (signalLength * sizeof(double))));
+
   // 申请重采样后输出信号的GPU显存
   double *d_outputIdata = nullptr;
   double *d_outputQdata = nullptr;
@@ -791,11 +917,12 @@ bool ShiftAndResampleSignalDoubleV1(
 
   // 线程数配置
   dim3 block(numChannels);
-  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
-
+  dim3 grid((numResults * numChannels + block.x - 1) / block.x);
+  const double sampling_rate = double(245.76e6);
   ShiftingAndResamplingKernelDoubleV1<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
-      numChannels, signalLength, CurrentRealfreq, d_outputIdata, d_outputQdata);
+      numChannels, signalLength, CurrentRealfreq, sampling_rate, I_shifted,
+      Q_shifted, d_outputIdata, d_outputQdata);
 
   // copy重采样计算结果到主存
   // 存储格式：[numResults][numChannels][lengthPerResults]
@@ -836,9 +963,14 @@ bool ShiftAndResampleSignalDoubleV1(
     d_Qdata = nullptr;
   }
 
-  if (d_outputIdata) {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
+  if (I_shifted) {
+    cudaFree(I_shifted);
+    I_shifted = nullptr;
+  }
+
+  if (Q_shifted) {
+    cudaFree(Q_shifted);
+    Q_shifted = nullptr;
   }
 
   if (d_outputIdata) {
@@ -846,6 +978,11 @@ bool ShiftAndResampleSignalDoubleV1(
     d_outputIdata = nullptr;
   }
 
+  if (d_outputQdata) {
+    cudaFree(d_outputQdata);
+    d_outputQdata = nullptr;
+  }
+
   return true;
 }
 
@@ -873,8 +1010,51 @@ bool ShiftAndResampleSignalDoubleV2(
     std::vector<int> &downFactor, std::vector<double> &detectFreq,
     const int alignSignalLength, const int numResults, const int numChannels,
     const double CurrentRealfreq, double *outputIdata, double *outputQdata) {
+  // 参数合法性检查
+  if (alignSignalLength <= 0) {
+    LOG_ERROR_HOST("alignSignalLength <= 0");
+    return false;
+  }
+  if (numResults <= 0) {
+    LOG_ERROR_HOST("numResults <= 0");
+    return false;
+  }
+  if (numChannels <= 0) {
+    LOG_ERROR_HOST("numChannels <= 0");
+    return false;
+  }
+
+  if (downFactor.size() != static_cast<size_t>(numResults)) {
+    LOG_ERROR_HOST("vector downFactor length != numResults");
+    return false;  // numResults > 0 was verified above, so the cast is safe
+  }
+  if (detectFreq.size() != static_cast<size_t>(numResults)) {
+    LOG_ERROR_HOST("vector detectFreq length != numResults");
+    return false;
+  }
+
+  if (outputIdata == nullptr) {
+    LOG_ERROR_HOST("outputIdata is null ptr");
+    return false;
+  }
+  if (outputQdata == nullptr) {
+    LOG_ERROR_HOST("outputQdata is null ptr");
+    return false;
+  }
+
   // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
   int signalLength = origIdata[0].size();
+  if (signalLength <= 0) {
+    LOG_ERROR_HOST("signalLength <= 0");
+    return false;
+  }
+
+  // 设置CUDA设备堆大小以便支持设备端malloc/free
+  int deviceId = 0;
+  double heapSizePercent = 10.0f;
+  size_t minHeapSizeMB = 16;
+  size_t maxHeapSizeMB = 512;
+  SetGPUHeapSize(deviceId, heapSizePercent, minHeapSizeMB, maxHeapSizeMB);
 
   // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
   // copy下采样率，频率等数据到显存中
@@ -920,6 +1100,12 @@ bool ShiftAndResampleSignalDoubleV2(
         cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
   }
 
+  // Frequency-shift scratch. NOTE(review): one buffer shared by all threads?
+  double *I_shifted = nullptr;
+  double *Q_shifted = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&I_shifted, (signalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted, (signalLength * sizeof(double))));
+
   // 申请重采样后输出信号的GPU显存
   size_t totalsize =
       numResults * numChannels * alignSignalLength * sizeof(double);
@@ -935,10 +1121,11 @@ bool ShiftAndResampleSignalDoubleV2(
   // 线程数配置，总的线程数：numChannels * numResults
   dim3 block(numChannels);
   dim3 grid((numChannels * numResults + block.x - 1) / block.x);
+  const double sampling_rate = double(245.76e6);
   ShiftingAndResamplingKernelDoubleV2<<<grid, block>>>(
       d_Idata, d_Qdata, d_downFactor, d_frequency, numResults, numChannels,
-      signalLength, CurrentRealfreq, alignSignalLength, d_outputIdata,
-      d_outputQdata);
+      signalLength, CurrentRealfreq, alignSignalLength, sampling_rate,
+      I_shifted, Q_shifted, d_outputIdata, d_outputQdata);
 
   // copy重采样计算结果到主存
   // 存储格式：[numResults][numChannels][lengthPerResults]
@@ -970,9 +1157,14 @@ bool ShiftAndResampleSignalDoubleV2(
     d_Qdata = nullptr;
   }
 
-  if (d_outputIdata) {
-    cudaFree(d_outputIdata);
-    d_outputIdata = nullptr;
+  if (I_shifted) {
+    cudaFree(I_shifted);
+    I_shifted = nullptr;
+  }
+
+  if (Q_shifted) {
+    cudaFree(Q_shifted);
+    Q_shifted = nullptr;
   }
 
   if (d_outputIdata) {
@@ -980,5 +1172,10 @@ bool ShiftAndResampleSignalDoubleV2(
     d_outputIdata = nullptr;
   }
 
+  if (d_outputQdata) {
+    cudaFree(d_outputQdata);
+    d_outputQdata = nullptr;
+  }
+
   return true;
 }
\ No newline at end of file
diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
index 9b66559..508bfc5 100644
--- a/cuda_resample_float.cu
+++ b/cuda_resample_float.cu
@@ -1,14 +1,23 @@
 #include <stdio.h>
 
-#include "cuda_resample.h"
+#include "common.h"
 #include "cuda_resample_float.h"
 
 // 设备端Resampler初始化
 __device__ void resampler_init_state_device_float(
     DeviceResamplerStateFloat *state, float *transposedCoefs, int coefsPerPhase,
     int upRate, int downRate) {
+  if (state == nullptr) {
+    LOG_ERROR_DEVICE("state ptr is nullptr!");
+    return;
+  }
+
   state->_t = 0;
   state->_xOffset = 0;
+  if (transposedCoefs == nullptr) {
+    LOG_ERROR_DEVICE("transposedCoefs ptr is nullptr!");
+    return;
+  }
   state->_transposedCoefs = transposedCoefs;
   state->_coefsPerPhase = coefsPerPhase;
   state->_upRate = upRate;
@@ -30,6 +39,11 @@ __device__ void resampler_init_state_device_float(
 // 设备端：计算所需输出数量
 __device__ int resampler_needed_out_count_device_float(
     int inCount, DeviceResamplerStateFloat *state) {
+  if (state == nullptr) {
+    LOG_ERROR_DEVICE("state ptr is nullptr!");
+    return -1;  // int function: bare `return;` is ill-formed; -1 flags error
+  }
+
   int np = inCount * state->_upRate;
   int need = np / state->_downRate;
 
@@ -45,6 +59,18 @@ __device__ int resampler_needed_out_count_device_float(
 __device__ int resampler_apply_device_float(float *in, int inCount, float *out,
                                             int outCount,
                                             DeviceResamplerStateFloat *state) {
+  if (in == nullptr) {
+    LOG_ERROR_DEVICE("in ptr is nullptr!");
+    return -1;  // same error code as the output-size check just below
+  }
+  if (out == nullptr) {
+    LOG_ERROR_DEVICE("out ptr is nullptr!");
+    return -1;
+  }
+  if (state == nullptr) {
+    LOG_ERROR_DEVICE("state ptr is nullptr!");
+    return -1;
+  }
   if (outCount < resampler_needed_out_count_device_float(inCount, state)) {
     // 在设备端无法抛出异常，返回错误代码
     return -1;
@@ -116,6 +142,15 @@ __device__ void transpose_filter_coefs_device_float(float *transposedCoefs,
                                                     float *coefs, int upRate,
                                                     int coefCount,
                                                     int coefsPerPhase) {
+  if (transposedCoefs == nullptr) {
+    LOG_ERROR_DEVICE("transposedCoefs ptr is nullptr!");
+    return;
+  }
+  if (coefs == nullptr) {
+    LOG_ERROR_DEVICE("coefs ptr is nullptr!");
+    return;
+  }
+
   // 初始化转置系数为零
   for (int i = 0; i < upRate * coefsPerPhase; i++) {
     transposedCoefs[i] = float(0);
@@ -137,6 +172,19 @@ __device__ void upfirdn_device_float(int upRate, int downRate, float *input,
                                      int inLength, float *filter,
                                      int filterLength, float *results,
                                      int *resultsCount) {
+  if (input == nullptr) {
+    LOG_ERROR_DEVICE("input ptr is nullptr!");
+    return;
+  }
+  if (filter == nullptr) {
+    LOG_ERROR_DEVICE("filter ptr is nullptr!");
+    return;
+  }
+  if (results == nullptr) {
+    LOG_ERROR_DEVICE("results ptr is nullptr!");
+    return;
+  }
+
   // 计算填充后的系数数量
   int paddedCoefCount = filterLength;
   while (paddedCoefCount % upRate) {
@@ -194,12 +242,14 @@ __device__ void upfirdn_device_float(int upRate, int downRate, float *input,
       inputPadded, inLength + padding, results, resultsCountValue, &state);
 
   // 清理设备内存
-  free(transposedCoefs);
   free(inputPadded);
+
   if (state._state != nullptr) {
     free(state._state);
     state._state = nullptr;
   }
+
+  free(transposedCoefs);
   state._transposedCoefs = nullptr;
 }
 
@@ -221,6 +271,10 @@ __device__ __forceinline__ int dev_gcd(int a, int b) {
 // 生成连续递增的序列
 __device__ __forceinline__ void dev_iota_float(float *data, int size,
                                                float start) {
+  if (data == nullptr) {
+    LOG_ERROR_DEVICE("data ptr is nullptr!");
+    return;
+  }
   for (int i = 0; i < size; i++) {
     data[i] = start + float(i);
   }
@@ -230,6 +284,10 @@ __device__ __forceinline__ void dev_iota_float(float *data, int size,
 // 填充data为value
 __device__ __forceinline__ void dev_fill_float(float *data, int size,
                                                float value) {
+  if (data == nullptr) {
+    LOG_ERROR_DEVICE("data ptr is nullptr!");
+    return;
+  }
   for (int i = 0; i < size; i++) {
     data[i] = value;
   }
@@ -238,6 +296,19 @@ __device__ __forceinline__ void dev_fill_float(float *data, int size,
 
 __device__ int dev_firls_float(float *result, int length, float *freq,
                                const float *amplitude, int freqSize) {
+  if (result == nullptr) {
+    LOG_ERROR_DEVICE("result ptr is nullptr!");
+    return -1;  // NOTE(review): confirm -1 matches the other error returns
+  }
+  if (freq == nullptr) {
+    LOG_ERROR_DEVICE("freq ptr is nullptr!");
+    return -1;
+  }
+  if (amplitude == nullptr) {
+    LOG_ERROR_DEVICE("amplitude ptr is nullptr!");
+    return -1;
+  }
+
   // 计算权重大小
   int weightSize = freqSize / 2;
 
@@ -391,6 +462,10 @@ __device__ float dev_cyl_bessel_i_float(int n, float x) {
 
 // 设备端凯塞窗核函数
 __device__ void dev_kaiser_float(float *window, int order, float bta) {
+  if (window == nullptr) {
+    LOG_ERROR_DEVICE("window ptr is nullptr!");
+    return;
+  }
   float Numerator, Denominator;
   Denominator = dev_cyl_bessel_i_float(0, bta);
   float od2 = (order - float(1)) / float(2);
@@ -405,6 +480,15 @@ __device__ void dev_kaiser_float(float *window, int order, float bta) {
 __device__ void dev_resample_float(int upFactor, int downFactor,
                                    float *inputSignal, const int inputSize,
                                    float *outputSignal) {
+  if (inputSignal == nullptr) {
+    LOG_ERROR_DEVICE("inputSignal ptr is nullptr!");
+    return;
+  }
+  if (outputSignal == nullptr) {
+    LOG_ERROR_DEVICE("outputSignal ptr is nullptr!");
+    return;
+  }
+
   const int n = 10;
   const float bta = float(5.0);
 
@@ -465,6 +549,7 @@ __device__ void dev_resample_float(int upFactor, int downFactor,
     LOG_ERROR_DEVICE("Failed to allocate device memory for window!");
     return;
   }
+
   dev_kaiser_float(window, length, bta);
 
   for (int i = 0; i < coefficientsLength; i++) {
-- 
Gitee