diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ad285811ccaef4c6ec3addc080ed19e702c4fef..aa1e423cbc5ad317aa747a484c47fcb3d5e40ed4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,8 +55,15 @@ if(USE_CUDA)
   message(STATUS "CMAKE_CUDA_ARCHITECTURES=" ${CMAKE_CUDA_ARCHITECTURES})
 endif()
 
+# 对于模板核函数需要这些选项
+# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
+#     -Wno-deprecated-gpu-targets")
+
+# 如果使用分离编译，需要添加
+# set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+
 # debug OR release mode
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wdeprecated-declarations -fPIC -std=c++11 -pthread -pipe")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wdeprecated-declarations -fPIC -std=c++17 -pthread -pipe")
 if (NOT CMAKE_BUILD_TYPE)
   set(_CMAKE_BUILD_TYPE_LOWER "release")
 else()
diff --git a/common.cpp b/common.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..822f9b27988c24c5a711aaf56e446647690a0b1c
--- /dev/null
+++ b/common.cpp
@@ -0,0 +1,101 @@
+#ifndef __COMMON_H__
+#define __COMMON_H__
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <stdio.h>
+
+#include <iostream>
+
+/**
+ * @brief 计算两个整数相除的向上取整结果（商向正无穷方向取整）
+ *
+ * 该函数实现整数除法的向上取整，与数学中的ceil(num1/num2)行为一致。
+ * 例如：7/3=2.333→向上取整为3；-7/3=-2.333→向上取整为-2；7/-3=-2.333→向上取整为-2
+ *
+ * @param[in] num1 被除数（分子）
+ * @param[in] num2 除数（分母），不能为0
+ * @return int 向上取整后的商
+ * @note
+ *   1. 当除数为0时，会输出错误信息到stderr并返回0（实际应用中建议改为抛出异常）
+ *   2.
+ * 仅当存在非零余数且除数与被除数同号时，商才需要加1（避免异号情况下错误进位）
+ *   3. 与C++标准库的std::ceil不同，该函数仅处理整数运算，无浮点精度损失
+ * @warning 除数为0时返回0是临时容错处理，生产环境应使用assert或异常机制终止程序
+ */
+int quotientCeil(int num1, int num2) {
+  // 处理除数为0的情况（根据需求选择返回值或抛出异常）
+  if (num2 == 0) {
+    std::cerr << "Division by zero error" << std::endl;
+    return 0;  // 或使用 assert(0) 终止程序
+  }
+
+  // 计算商和余数
+  int quotient = num1 / num2;
+  int remainder = num1 % num2;
+
+  // 仅当余数不为0且除数与被除数同号时，才需要向上取整（商+1）
+  if (remainder != 0 && ((num1 > 0 && num2 > 0) || (num1 < 0 && num2 < 0))) {
+    quotient += 1;
+  }
+
+  return quotient;
+}
+
+/**
+ * @brief 设置CUDA设备堆大小以便支持设备端malloc/free
+ * @param deviceId 要设置的设备ID（默认0）
+ * @param heapSizePercent 要分配的全局内存百分比（默认10%，范围1-50）
+ * @param minHeapSizeMB 最小堆大小MB（默认16MB）
+ * @param maxHeapSizeMB 最大堆大小MB（默认512MB）
+ * @return 0成功，-1失败
+ */
+int SetGPUHeapSize(int deviceId = 0, float heapSizePercent = 10.0f,
+                   size_t minHeapSizeMB = 16, size_t maxHeapSizeMB = 512) {
+  // 获取设备属性
+  cudaDeviceProp prop;
+  cudaError_t err = cudaGetDeviceProperties(&prop, deviceId);
+  if (err != cudaSuccess) {
+    std::cerr << "获取设备属性失败: " << cudaGetErrorString(err) << std::endl;
+    return -1;
+  }
+
+  // 检查设备是否支持设备端malloc
+  if (prop.major < 2) {
+    std::cerr << "设备不支持设备端malloc (计算能力 < 2.0)" << std::endl;
+    std::cerr << "设备计算能力: " << prop.major << "." << prop.minor
+              << std::endl;
+    return -1;
+  }
+
+  // 根据设备内存动态设置堆大小
+  size_t totalGlobalMem = prop.totalGlobalMem;
+  size_t heapSize = 0;
+
+  if (prop.major >= 2) {
+    // 对于支持动态分配的设备
+    // 堆大小建议：总内存的10-25%，但不超过一定限制
+    heapSize = totalGlobalMem * 0.1;  // 10%的总内存
+
+    // 设置上限，避免太大
+    const size_t maxHeapSizeBytes = maxHeapSizeMB * 1024 * 1024;  // 512MB
+    if (heapSize > maxHeapSizeBytes) {
+      heapSize = maxHeapSizeBytes;
+    }
+
+    // 设置下限，确保足够
+    const size_t minHeapSizeBytes = minHeapSizeMB * 1024 * 1024;  // 16MB
+    if (heapSize < minHeapSizeBytes) {
+      heapSize = minHeapSizeBytes;
+    }
+
+    cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapSize);
+  } else {
+    std::cout << "设备不支持设备端malloc" << std::endl;
+    return -1;
+  }
+
+  return 0;
+}
+
+#endif  // CUDA_RESAMPLE_H
\ No newline at end of file
diff --git a/common.h b/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..d573f60b7d2f8eb4c4c3c679ab4a170753e27cd2
--- /dev/null
+++ b/common.h
@@ -0,0 +1,125 @@
+#ifndef __COMMON_H__
+#define __COMMON_H__
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <stdio.h>
+
+#include <iostream>
+
+/**
+ * @brief CUDA运行时错误检查宏，带自动定位和异常抛出功能
+ *
+ * 该宏封装了CUDA API调用的错误检查逻辑，当CUDA操作失败时：
+ * 1. 自动获取当前函数名、文件名和行号，精确定位错误位置
+ * 2. 输出人类可读的错误描述信息（通过cudaGetErrorString）
+ * 3. 抛出std::runtime_error异常终止程序（或触发上层异常处理）
+ *
+ * 采用do-while(0)结构确保宏在各种上下文（如if语句后）都能正确展开，
+ * 同时避免变量作用域污染和语法错误。
+ *
+ * @param call 待检查的CUDA API调用表达式（如cudaMalloc、cudaMemcpy等）
+ * @note 必须在包含<cuda_runtime.h>和<iostream>的环境中使用
+ * @warning 异常抛出会终止当前调用栈，建议在关键CUDA操作后立即使用
+ * @example
+ *   CHECK_CUDA_ERROR(cudaMalloc(&dev_ptr, size));  // 检查内存分配
+ *   CHECK_CUDA_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice));  //
+ * 检查数据拷贝
+ */
+#define CHECK_CUDA_ERROR(call)                                               \
+  do {                                                                       \
+    cudaError_t err = call;                                                  \
+    if (err != cudaSuccess) {                                                \
+      std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":"      \
+                << __LINE__ << ": " << cudaGetErrorString(err) << std::endl; \
+      throw std::runtime_error("CUDA error");                                \
+    }                                                                        \
+  } while (0)
+
+/**
+ * @brief 设备端（GPU/CUDA）错误日志输出宏
+ *
+ * 用于在CUDA设备代码或设备端工具函数中输出错误日志，采用C风格printf实现，
+ * 支持格式化字符串和可变参数。自动附加文件名、行号和函数名，便于错误定位。
+ *
+ * @note 1. 必须在支持C99可变参数宏的编译器中使用（如GCC 4.0+、Clang 3.0+）
+ *       2. 设备端代码通常不支持C++标准库（如std::cerr），故使用printf实现
+ *       3. fmt参数需遵循printf格式规范，例如"%d"对应整数，"%s"对应字符串
+ *
+ * @param[in] fmt 格式化字符串（例如"GPU memory allocation failed, size=%d
+ * bytes"）
+ * @param[in] ... 可变参数列表，与fmt中的格式占位符对应
+ *
+ * @example
+ * // 在CUDA核函数或设备函数中使用
+ * __device__ void cuda_kernel() {
+ *     if (threadIdx.x >= blockDim.x) {
+ *         LOG_ERROR_DEVICE("Invalid thread index: %d (max allowed: %d)",
+ *                         threadIdx.x, blockDim.x - 1);
+ *     }
+ * }
+ */
+#define LOG_ERROR_DEVICE(fmt, ...)                                            \
+  printf("[DEVICE ERROR] %s:%d (%s) " fmt "\n", __FILE__, __LINE__, __func__, \
+         ##__VA_ARGS__)
+
+/**
+ * @brief 主机端（CPU）错误日志输出宏
+ *
+ * 用于在CPU端代码中输出错误日志，基于C++标准库std::cerr实现，支持C++流操作符语法。
+ * 自动附加文件名、行号和函数名，输出至标准错误流（stderr）而非标准输出（stdout），
+ * 便于日志重定向和错误捕获（例如在shell中通过2> error.log单独收集错误）。
+ *
+ * @note 1.
+ * 仅可在主机端C++代码中使用，不可用于CUDA设备代码（设备端不支持std::cerr）
+ *       2.
+ * message参数支持任意可被std::ostream输出的类型（如字符串、数字、自定义类型重载<<）
+ *       3. 输出自动追加换行符，无需手动添加"\n"
+ *
+ * @param[in] message 错误信息（可通过流操作符合并多个值，例如"Load model
+ * failed: " + filename）
+ *
+ * @example
+ * // 在主机端函数中使用
+ * void load_config_file(const std::string& path) {
+ *     std::ifstream file(path);
+ *     if (!file.is_open()) {
+ *         LOG_ERROR_HOST("Failed to open config file: " << path << " (error: "
+ * << strerror(errno) << ")");
+ *     }
+ * }
+ */
+#define LOG_ERROR_HOST(message)                                       \
+  std::cerr << "[HOST ERROR] " << __FILE__ << ":" << __LINE__ << " (" \
+            << __FUNCTION__ << ") " << message << std::endl
+
+/**
+ * @brief 计算两个整数相除的向上取整结果（商向正无穷方向取整）
+ *
+ * 该函数实现整数除法的向上取整，与数学中的ceil(num1/num2)行为一致。
+ * 例如：7/3=2.333→向上取整为3；-7/3=-2.333→向上取整为-2；7/-3=-2.333→向上取整为-2
+ *
+ * @param[in] num1 被除数（分子）
+ * @param[in] num2 除数（分母），不能为0
+ * @return int 向上取整后的商
+ * @note
+ *   1. 当除数为0时，会输出错误信息到stderr并返回0（实际应用中建议改为抛出异常）
+ *   2.
+ * 仅当存在非零余数且除数与被除数同号时，商才需要加1（避免异号情况下错误进位）
+ *   3. 与C++标准库的std::ceil不同，该函数仅处理整数运算，无浮点精度损失
+ * @warning 除数为0时返回0是临时容错处理，生产环境应使用assert或异常机制终止程序
+ */
+int quotientCeil(int num1, int num2);
+
+/**
+ * @brief 设置CUDA设备堆大小以便支持设备端malloc/free
+ * @param deviceId 要设置的设备ID（默认0）
+ * @param heapSizePercent 要分配的全局内存百分比（默认10%，范围1-50）
+ * @param minHeapSizeMB 最小堆大小MB（默认16MB）
+ * @param maxHeapSizeMB 最大堆大小MB（默认512MB）
+ * @return 0成功，-1失败
+ */
+int SetGPUHeapSize(int deviceId, float heapSizePercent, size_t minHeapSizeMB,
+                   size_t maxHeapSizeMB);
+
+#endif  // __COMMON_H__
\ No newline at end of file
diff --git a/cuda_resample_double.cu b/cuda_resample_double.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9f9ceb50caccab3b03c7463fbc0dccb08d92d361
--- /dev/null
+++ b/cuda_resample_double.cu
@@ -0,0 +1,1181 @@
+#include <stdio.h>
+
+#include "common.h"
+#include "cuda_resample_double.h"
+
+// 设备端Resampler初始化
+__device__ void resampler_init_state_device_double(
+    DeviceResamplerStateDouble *state, double *transposedCoefs,
+    int coefsPerPhase, int upRate, int downRate) {
+  if (state == nullptr) {
+    LOG_ERROR_DEVICE("state ptr is nullptr!");
+    return;
+  }
+  state->_t = 0;
+  state->_xOffset = 0;
+  if (transposedCoefs == nullptr) {
+    LOG_ERROR_DEVICE("transposedCoefs ptr is nullptr!");
+    return;
+  }
+  state->_transposedCoefs = transposedCoefs;
+  state->_coefsPerPhase = coefsPerPhase;
+  state->_upRate = upRate;
+  state->_downRate = downRate;
+
+  // 分配状态缓冲区
+  state->_state = (double *)malloc((coefsPerPhase - 1) * sizeof(double));
+  if (state->_state == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for state->_state!");
+    return;
+  }
+
+  // 初始化状态为零
+  for (int i = 0; i < coefsPerPhase - 1; i++) {
+    state->_state[i] = double(0);
+  }
+}
+
+// 设备端：计算所需输出数量
+__device__ int resampler_needed_out_count_device_double(
+    int inCount, DeviceResamplerStateDouble *state) {
+  if (state == nullptr) {
+    LOG_ERROR_DEVICE("state ptr is nullptr!");
+    return;
+  }
+
+  int np = inCount * state->_upRate;
+  int need = np / state->_downRate;
+
+  if ((state->_t + state->_upRate * state->_xOffset) <
+      (np % state->_downRate)) {
+    need++;
+  }
+
+  return need;
+}
+
+// 设备端：应用重采样
+__device__ int resampler_apply_device_double(
+    double *in, int inCount, double *out, int outCount,
+    DeviceResamplerStateDouble *state) {
+  if (in == nullptr) {
+    LOG_ERROR_DEVICE("in ptr is nullptr!");
+    return;
+  }
+  if (out == nullptr) {
+    LOG_ERROR_DEVICE("out ptr is nullptr!");
+    return;
+  }
+  if (state == nullptr) {
+    LOG_ERROR_DEVICE("state ptr is nullptr!");
+    return;
+  }
+
+  if (outCount < resampler_needed_out_count_device_double(inCount, state)) {
+    // 在设备端无法抛出异常，返回错误代码
+    return -1;
+  }
+
+  // x指向最新处理的输入样本
+  double *x = in + state->_xOffset;
+  double *y = out;
+  double *end = in + inCount;
+
+  while (x < end) {
+    double acc = 0;
+    double *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase;
+    double *xPtr = x - state->_coefsPerPhase + 1;
+
+    int offset = in - xPtr;
+    if (offset > 0) {
+      // 需要从_state缓冲区中获取
+      double *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset;
+
+      while (statePtr < state->_state + (state->_coefsPerPhase - 1)) {
+        acc += (*statePtr++) * (*h++);
+      }
+
+      xPtr += offset;
+    }
+
+    while (xPtr <= x) {
+      acc += (*xPtr++) * (*h++);
+    }
+
+    *y++ = acc;
+    state->_t += state->_downRate;
+
+    int advanceAmount = state->_t / state->_upRate;
+    x += advanceAmount;
+    state->_t %= state->_upRate;
+  }
+
+  state->_xOffset = x - end;
+
+  // 管理_state缓冲区
+  int retain = (state->_coefsPerPhase - 1) - inCount;
+
+  if (retain > 0) {
+    // 对于小于状态缓冲区的inCount，将缓冲区的末尾复制到开头
+    for (int i = 0; i < retain; i++) {
+      state->_state[i] =
+          state->_state[(state->_coefsPerPhase - 1) - retain + i];
+    }
+
+    // 然后将整个（短）输入复制到缓冲区末尾
+    for (int i = 0; i < inCount; i++) {
+      state->_state[retain + i] = in[i];
+    }
+  } else {
+    // 只将最后几个输入样本复制到状态缓冲区
+    for (int i = 0; i < state->_coefsPerPhase - 1; i++) {
+      state->_state[i] = *end - (double)(state->_coefsPerPhase - 1) + (double)i;
+    }
+  }
+
+  // 返回计算的样本数
+  return y - out;
+}
+
+// 设备端：转置滤波器系数（每个线程执行）
+__device__ void transpose_filter_coefs_device_double(double *transposedCoefs,
+                                                     double *coefs, int upRate,
+                                                     int coefCount,
+                                                     int coefsPerPhase) {
+  if (transposedCoefs == nullptr) {
+    LOG_ERROR_DEVICE("transposedCoefs ptr is nullptr!");
+    return;
+  }
+  if (coefs == nullptr) {
+    LOG_ERROR_DEVICE("coefs ptr is nullptr!");
+    return;
+  }
+  // 初始化转置系数为零
+  for (int i = 0; i < upRate * coefsPerPhase; i++) {
+    transposedCoefs[i] = double(0);
+  }
+
+  // 转置并翻转每个相位
+  for (int i = 0; i < upRate; ++i) {
+    for (int j = 0; j < coefsPerPhase; ++j) {
+      if (j * upRate + i < coefCount) {
+        transposedCoefs[(coefsPerPhase - 1 - j) + i * coefsPerPhase] =
+            coefs[j * upRate + i];
+      }
+    }
+  }
+}
+
+// 设备端upfirdn主函数
+__device__ void upfirdn_device_double(int upRate, int downRate, double *input,
+                                      int inLength, double *filter,
+                                      int filterLength, double *results,
+                                      int *resultsCount) {
+  if (input == nullptr) {
+    LOG_ERROR_DEVICE("input ptr is nullptr!");
+    return;
+  }
+  if (filter == nullptr) {
+    LOG_ERROR_DEVICE("filter ptr is nullptr!");
+    return;
+  }
+  if (results == nullptr) {
+    LOG_ERROR_DEVICE("results ptr is nullptr!");
+    return;
+  }
+
+  // 计算填充后的系数数量
+  int paddedCoefCount = filterLength;
+  while (paddedCoefCount % upRate) {
+    paddedCoefCount++;
+  }
+
+  int coefsPerPhase = paddedCoefCount / upRate;
+
+  // 分配转置系数内存
+  double *transposedCoefs = (double *)malloc(paddedCoefCount * sizeof(double));
+  if (transposedCoefs == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for transposedCoefs!");
+    return;
+  }
+
+  // 转置滤波器系数
+  transpose_filter_coefs_device_double(transposedCoefs, filter, upRate,
+                                       filterLength, coefsPerPhase);
+
+  // 创建Resampler状态
+  DeviceResamplerStateDouble state;
+  resampler_init_state_device_double(&state, transposedCoefs, coefsPerPhase,
+                                     upRate, downRate);
+
+  // 计算填充量
+  int padding = coefsPerPhase - 1;
+
+  // 分配填充输入内存
+  double *inputPadded = (double *)malloc((inLength + padding) * sizeof(double));
+  if (inputPadded == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for inputPadded!");
+    return;
+  }
+
+  // 复制输入并填充
+  for (int i = 0; i < inLength + padding; i++) {
+    if (i < inLength) {
+      inputPadded[i] = input[i];
+    } else {
+      inputPadded[i] = 0;
+    }
+  }
+
+  // 计算输出大小
+  int resultsCountValue =
+      resampler_needed_out_count_device_double(inLength + padding, &state);
+
+  // 设置输出计数
+  if (resultsCount != nullptr) {
+    *resultsCount = resultsCountValue;
+  }
+
+  // 运行滤波
+  int numSamplesComputed = resampler_apply_device_double(
+      inputPadded, inLength + padding, results, resultsCountValue, &state);
+
+  // 清理设备内存
+  free(inputPadded);
+
+  if (state._state != nullptr) {
+    free(state._state);
+    state._state = nullptr;
+  }
+
+  free(transposedCoefs);
+  state._transposedCoefs = nullptr;
+}
+
+// 整数向上取整除法
+__device__ __forceinline__ int dev_quotientCeil(int num1, int num2) {
+  return (num1 + num2 - 1) / num2;
+}
+
+// CUDA设备端GCD函数:最大公约数
+__device__ __forceinline__ int dev_gcd(int a, int b) {
+  while (b != 0) {
+    int temp = b;
+    b = a % b;
+    a = temp;
+  }
+  return a;
+}
+
+// 生成连续递增的序列
+__device__ __forceinline__ void dev_iota_double(double *data, int size,
+                                                double start) {
+  if (data == nullptr) {
+    LOG_ERROR_DEVICE("data ptr is nullptr!");
+    return;
+  }
+  for (int i = 0; i < size; i++) {
+    data[i] = start + double(i);
+  }
+  return;
+}
+
+// 填充data为value
+__device__ __forceinline__ void dev_fill_double(double *data, int size,
+                                                double value) {
+  if (data == nullptr) {
+    LOG_ERROR_DEVICE("data ptr is nullptr!");
+    return;
+  }
+  for (int i = 0; i < size; i++) {
+    data[i] = value;
+  }
+  return;
+}
+
+__device__ int dev_firls_double(double *result, int length, double *freq,
+                                const double *amplitude, int freqSize) {
+  if (result == nullptr) {
+    LOG_ERROR_DEVICE("result ptr is nullptr!");
+    return;
+  }
+  if (freq == nullptr) {
+    LOG_ERROR_DEVICE("freq ptr is nullptr!");
+    return;
+  }
+  if (amplitude == nullptr) {
+    LOG_ERROR_DEVICE("amplitude ptr is nullptr!");
+    return;
+  }
+
+  // 计算权重大小
+  int weightSize = freqSize / 2;
+
+  // 初始化权重向量
+  double *weight = (double *)malloc(weightSize * sizeof(double));
+  if (weight == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for weight!");
+    return -1;
+  }
+
+  // 初始化weight为全1
+  dev_fill_double(weight, weightSize, double(1.0));
+
+  // 处理频率向量
+  for (int i = 0; i < freqSize; i++) {
+    freq[i] = freq[i] / double(2.0);
+  }
+
+  int filterLength = length + 1;
+  length = (filterLength - 1) / 2;
+
+  // 奇偶判断
+  bool Nodd = filterLength & 1;
+
+  // 创建和初始化向量k
+  int kLength = length + 1;
+  double *k = (double *)malloc(kLength * sizeof(double));
+  if (k == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for k向量!");
+    return -1;
+  };
+
+  // 初始化k向量为递增序列：0，1，2...
+  dev_iota_double(k, kLength, double(0.0));
+
+  if (!Nodd) {
+    for (int i = 0; i < kLength; i++) {
+      k[i] += double(0.5);
+    }
+  }
+
+  if (Nodd) {
+    for (int i = 0; i < kLength; i++) {
+      k[i] = k[i + 1];
+    }
+    kLength--;
+  }
+
+  // 创建和初始化向量b
+  int bLength = kLength;
+  if (Nodd) {
+    bLength++;  // 此处++，因为后面需要在b[0]处插入b0
+  }
+
+  double *b = (double *)malloc(bLength * sizeof(double));
+  if (b == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for b向量!");
+    return -1;
+  };
+
+  dev_fill_double(b, bLength, double(0.0));
+
+  double b0 = double(0.0);
+  for (int i = 0; i < freqSize; i += 2) {
+    double Fi = freq[i];
+    double Fip1 = freq[i + 1];
+    double ampi = amplitude[i];
+    double ampip1 = amplitude[i + 1];
+    double wt2 = powf(weight[i / 2], double(2.0));
+    double m_s = (ampip1 - ampi) / (Fip1 - Fi);
+    double b1 = ampi - (m_s * Fi);
+
+    if (Nodd) {
+      b0 += (b1 * (Fip1 - Fi)) +
+            m_s / double(2.0) *
+                (powf(Fip1, double(2.0)) - powf(Fi, double(2.0))) * wt2;
+    }
+
+    // 并行计算b向量
+    for (int j = 0; j < kLength; j++) {
+      double kj = k[j];
+      b[j] +=
+          (m_s / (double(4.0) * powf(M_PI, double(2.0))) *
+           (cosf(double(2.0) * M_PI * Fip1) - cosf(double(2.0) * M_PI * Fi)) /
+           (powf(kj, double(2.0)))) *
+          wt2;
+
+      b[j] += (Fip1 * (m_s * Fip1 + b1) * sinf(double(2.0) * kj * Fip1) -
+               Fi * (m_s * Fi + b1) * sinf(double(2.0) * kj * Fi)) *
+              wt2;
+    }
+  }
+
+  // 处理最终结果，将b0插入到b向量的开始
+  if (Nodd) {
+    for (int i = kLength; i >= 0; i--) {
+      if (i > 0) {
+        b[i] = b[i - 1];
+      } else {
+        b[i] = b0;
+      }
+    }
+  }
+
+  // 计算a向量
+  double w0 = weight[0];
+
+  int aLength = bLength;
+  double *a = (double *)malloc(aLength * sizeof(double));
+  if (a == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for a向量!");
+    return -1;
+  };
+
+  // vector<double> result = {a.rbegin(), a.rend()};
+  for (int i = 0; i < aLength; i++) {
+    a[i] = powf(w0, double(2.0)) * double(4.0) * b[i];
+    result[aLength - 1 - i] = a[i];
+  }
+
+  int it = 0;
+  if (Nodd) {
+    it = 1;
+  }
+
+  // 构建结果向量
+  for (int i = 0; i < aLength; i++) {
+    result[i] = result[i] * double(0.5);
+    if ((i + it) < aLength) {
+      result[aLength + i] = a[i + it] * double(0.5);
+    }
+  }
+
+  // 释放动态分配的内存
+  free(weight);
+  free(k);
+  free(b);
+  free(a);
+  return 0;
+}
+
+// 设备端Bessel函数
+__device__ double dev_cyl_bessel_i_double(int n, double x) {
+  if (n == 0) return double(1);
+  double bessel = double(1), bessel_prev = double(1);
+  for (int i = 1; i <= n; ++i) {
+    bessel = (double(2) * i - double(1)) / i * x * bessel_prev - bessel;
+    bessel_prev = bessel;
+  }
+  return bessel;
+}
+
+// 设备端凯塞窗核函数
+__device__ void dev_kaiser_double(double *window, int order, double bta) {
+  if (window == nullptr) {
+    LOG_ERROR_DEVICE("window ptr is nullptr!");
+    return;
+  }
+
+  double Numerator, Denominator;
+  Denominator = dev_cyl_bessel_i_double(0, bta);
+  double od2 = (order - double(1)) / double(2);
+
+  for (int n = 0; n < order; n++) {
+    double x = bta * sqrt(double(1) - powf((n - od2) / od2, double(2)));
+    Numerator = dev_cyl_bessel_i_double(0, x);
+    window[n] = Numerator / Denominator;
+  }
+}
+
+__device__ void dev_resample_double(int upFactor, int downFactor,
+                                    double *inputSignal, const int inputSize,
+                                    double *outputSignal) {
+  if (inputSignal == nullptr) {
+    LOG_ERROR_DEVICE("inputSignal ptr is nullptr!");
+    return;
+  }
+  if (outputSignal == nullptr) {
+    LOG_ERROR_DEVICE("outputSignal ptr is nullptr!");
+    return;
+  }
+
+  const int n = 10;
+  const double bta = double(5.0);
+
+  if (upFactor <= 0 || downFactor <= 0) {
+    LOG_ERROR_DEVICE("upFactor and downFactor must be positive integer!");
+    return;
+  }
+
+  int gcd_o = dev_gcd(upFactor, downFactor);
+
+  upFactor /= gcd_o;
+  downFactor /= gcd_o;
+
+  if (upFactor == downFactor) {
+    for (int i = 0; i < inputSize; i++) {
+      outputSignal[i] = inputSignal[i];
+    }
+    return;
+  }
+
+  int outputSize = dev_quotientCeil(inputSize * upFactor, downFactor);
+
+  int maxFactor = (upFactor > downFactor) ? upFactor : downFactor;
+  double firlsFreq = double(1.0) / double(2.0) / static_cast<double>(maxFactor);
+  int length = 2 * n * maxFactor + 1;
+
+  double firlsFreqsV[4];
+  firlsFreqsV[0] = double(0.0);
+  firlsFreqsV[1] = double(2.0) * firlsFreq;
+  firlsFreqsV[2] = double(2.0) * firlsFreq;
+  firlsFreqsV[3] = double(1.0);
+
+  double firlsAmplitudeV[4];
+  firlsAmplitudeV[0] = double(1.0);
+  firlsAmplitudeV[1] = double(1.0);
+  firlsAmplitudeV[2] = double(0.0);
+  firlsAmplitudeV[3] = double(0.0);
+
+  int freqSize = 4;
+  int coefficientsLength = length;
+
+  double *coefficients = (double *)malloc(coefficientsLength * sizeof(double));
+  if (coefficients == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for coefficients!");
+    return;
+  }
+
+  int ret = dev_firls_double(coefficients, length - 1, firlsFreqsV,
+                             firlsAmplitudeV, freqSize);
+  if (ret == -1) {
+    LOG_ERROR_DEVICE("dev_firls_double function error!");
+    return;
+  }
+
+  int windowSize = length;
+  double *window = (double *)malloc(windowSize * sizeof(double));
+  if (window == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for window!");
+    return;
+  }
+
+  dev_kaiser_double(window, length, bta);
+
+  for (int i = 0; i < coefficientsLength; i++) {
+    coefficients[i] *= (upFactor * window[i]);
+  }
+
+  int lengthHalf = (length - 1) / 2;
+  int nz = downFactor - lengthHalf % downFactor;
+
+  // 分配filter空间
+  int hSize = coefficientsLength + nz;
+  double *filter =
+      (double *)malloc((coefficientsLength + 3 * nz) * sizeof(double));
+  if (filter == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for filter!");
+    return;
+  }
+
+  int filterLength = 0;
+  for (int i = 0; i < nz; i++) {
+    filter[i + filterLength] = double(0.0);
+  }
+  filterLength += nz;
+
+  for (int i = 0; i < coefficientsLength; i++) {
+    filter[i + filterLength] = coefficients[i];
+  }
+  filterLength += coefficientsLength;
+
+  lengthHalf += nz;
+  int delay = lengthHalf / downFactor;
+  nz = 0;
+  while (dev_quotientCeil((inputSize - 1) * upFactor + hSize + nz, downFactor) -
+             delay <
+         outputSize) {
+    nz++;
+  }
+
+  for (int i = 0; i < nz; i++) {
+    filter[i + filterLength] = double(0.0);
+  }
+  filterLength += nz;
+
+  // 计算
+  int paddedCoefCount = filterLength;
+  while (paddedCoefCount % upFactor) {
+    paddedCoefCount++;
+  }
+
+  int coefsPerPhase = paddedCoefCount / upFactor;
+  int padding = coefsPerPhase - 1;
+  int outputCount =
+      ((inputSize + padding) * upFactor + downFactor - 1) / downFactor;
+
+  double *results = (double *)malloc(outputCount * sizeof(double));
+  if (results == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for upfirdn results!");
+    return;
+  }
+
+  int resultsCount = 0;
+  upfirdn_device_double(upFactor, downFactor, inputSignal, inputSize, filter,
+                        filterLength, results, &resultsCount);
+
+  int j = 0;
+  for (int i = delay; i < outputSize + delay; i++) {
+    outputSignal[j++] = results[i];
+  }
+
+  // 释放动态分配的内存
+  free(coefficients);
+  free(window);
+  free(filter);
+  free(results);
+  return;
+}
+
+/**
+ * ShiftingAndResamplingKernelDoubleV1
+ * 重采样核函数：完成原始信号的移频，重采样等计算
+ * 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
+ * 因此共 numChannels * numResults 个线程并行计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param VDownFactor：下采样率
+ * @param VFrequency：频率
+ * @param VOutputLength：每个检测结果的重采样输出信号长度
+ * @param numResults：每帧的检测结果总数
+ * @param numChannels：信号通道数
+ * @param signalLength：每个通道的信号长度
+ * @param CurrentRealfreq：当前实际频率
+ * @param sampling_rate
+ * @param outputIdata：存放所有重采样后的Idata，调用时需确保显存空间足够
+ * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
+ * @return true or false
+ */
+__global__ void ShiftingAndResamplingKernelDoubleV1(
+    const double *__restrict__ origIdata, const double *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency,
+    const int *__restrict__ VOutputLength, const int numResults,
+    const int numChannels, const int signalLength, const double CurrentRealfreq,
+    const double sampling_rate, double *I_shifted, double *Q_shifted,
+    double *__restrict__ outputIdata, double *__restrict__ outputQdata) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= numResults * numChannels) return;
+
+  // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
+  int ResIdx = idx / numChannels;  // 第几个检测结果
+  int chIdx = idx % numChannels;   // 第几个通道
+
+  double frequency = VFrequency[ResIdx];  // 频率
+  double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
+
+  // 获取当前线程处理的原始数据 （某一个通道的原始数据的地址）
+  const auto I_orig = origIdata + chIdx * signalLength;
+  const auto Q_orig = origQdata + chIdx * signalLength;
+
+  // 移频：生成本振信号并相乘
+  for (int i = 0; i < signalLength; i++) {
+    double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+    double cosVal = cosf(phase);
+    double sinVal = sinf(phase);
+    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
+  }
+
+  // 上采样因子为1，下采样因子为downFactor
+  int upFactor = 1;
+  int downFactor = VDownFactor[ResIdx];
+
+  // 计算之前带宽，对应的输出信号的总长度
+  int beforeTotalLength = 0;
+  for (int i = 0; i < ResIdx; i++) {
+    beforeTotalLength += VOutputLength[i];
+  }
+  // 当前带宽对应的输出信号的起始地址偏移
+  int offset = beforeTotalLength * numChannels;
+
+  // 获取当前检测结果对应的输出信号长度（这里假设的是每个带宽对应的输出信号长度可能不相同）
+  int outputLength = VOutputLength[ResIdx];
+
+  // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
+  auto I_resampled = outputIdata + offset + chIdx * outputLength;
+  // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
+  auto Q_resampled = outputQdata + offset + chIdx * outputLength;
+
+  // 重采样
+  dev_resample_double(upFactor, downFactor, I_shifted, signalLength,
+                      I_resampled);
+  dev_resample_double(upFactor, downFactor, Q_shifted, signalLength,
+                      Q_resampled);
+}
+
+/**
+ * ShiftingAndResamplingKernelDoubleV2
+ * 重采样核函数：完成原始信号的移频，重采样等计算
+ * 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
+ * 因此共 numChannels * numResults 个线程并行计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param VDownFactor：下采样率
+ * @param VFrequency：频率
+ * @param numResults：每帧的检测结果总数
+ * @param numChannels：信号通道数
+ * @param signalLength：每个通道的原始信号长度
+ * @param CurrentRealfreq：当前实际频率
+ * @param alignSignalLength：重采样后信号的对齐长度
+ * @param outputIdata：存放所有重采样后的Idata，调用时需确保显存空间足够
+ * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
+ * @return true or false
+ */
+__global__ void ShiftingAndResamplingKernelDoubleV2(
+    const double *__restrict__ origIdata, const double *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency,
+    const int numResults, const int numChannels, const int signalLength,
+    const double CurrentRealfreq, const int alignSignalLength,
+    const double sampling_rate, double *I_shifted, double *Q_shifted,
+    double *__restrict__ outputIdata, double *__restrict__ outputQdata) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= numChannels * numResults) return;
+
+  // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
+  int ResIdx = idx / numChannels;  // 第几个检测结果
+  int chIdx = idx % numChannels;   // 第几个通道
+
+  double frequency = VFrequency[ResIdx];  // 频率
+  double deltaFreq = (CurrentRealfreq - frequency) * 1e6;
+
+  // 获取当前线程处理的通道数据地址
+  const auto I_orig = origIdata + chIdx * signalLength;
+  const auto Q_orig = origQdata + chIdx * signalLength;
+
+  // 移频：生成本振信号并相乘
+  for (int i = 0; i < signalLength; i++) {
+    double phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+    double cosVal = cosf(phase);
+    double sinVal = sinf(phase);
+    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
+  }
+
+  // 上采样因子为1，下采样因子为downFactor
+  int upFactor = 1;
+  int downFactor = VDownFactor[ResIdx];
+
+  // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
+  auto I_resampled =
+      outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+  // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
+  auto Q_resampled =
+      outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+
+  // 重采样
+  dev_resample_double(upFactor, downFactor, I_shifted, signalLength,
+                      I_resampled);
+  dev_resample_double(upFactor, downFactor, Q_shifted, signalLength,
+                      Q_resampled);
+}
+
+/**
+ * ShiftAndResampleSignalDoubleV1
+ * 重采样函数：完成原始信号的移频，重采样等计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param outputLength：重采样后每个带宽对应的输出信号长度
+ * @param downFactor：每个带宽对应的下采样率
+ * @param detectFreq：每个带宽对应的频率
+ * @param outputTotalLength：一个通道重采样后信号的总长度
+ * @param numResults：检测结果数量
+ * @param numChannels：信号通道数
+ * @param CurrentRealfreq：当前实际频率
+ * @param outputIdata：
+ * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @param outputQdata：
+ * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @return true or false
+ */
+bool ShiftAndResampleSignalDoubleV1(
+    const std::vector<std::vector<double>> &origIdata,
+    const std::vector<std::vector<double>> &origQdata,
+    std::vector<int> &outputLength, std::vector<int> &downFactor,
+    std::vector<double> &detectFreq, const int outputTotalLength,
+    const int numResults, const int numChannels, const double CurrentRealfreq,
+    double *outputIdata, double *outputQdata) {
+  // 参数合法性检查
+  if (outputTotalLength <= 0) {
+    LOG_ERROR_HOST("outputTotalLength <= 0");
+    return false;
+  }
+  if (numResults <= 0) {
+    LOG_ERROR_HOST("numResults <= 0");
+    return false;
+  }
+  if (numChannels <= 0) {
+    LOG_ERROR_HOST("numChannels <= 0");
+    return false;
+  }
+
+  if (outputLength.size() != numResults) {
+    LOG_ERROR_HOST("vector outputLength lenght != numResults");
+    return false;
+  }
+  if (downFactor.size() != numResults) {
+    LOG_ERROR_HOST("vector downFactor lenght != numResults");
+    return false;
+  }
+  if (detectFreq.size() != numResults) {
+    LOG_ERROR_HOST("vector detectFreq lenght != numResults");
+    return false;
+  }
+
+  if (outputIdata == nullptr) {
+    LOG_ERROR_HOST("outputIdata is null ptr");
+    return false;
+  }
+  if (outputQdata == nullptr) {
+    LOG_ERROR_HOST("outputQdata is null ptr");
+    return false;
+  }
+
+  // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
+  int signalLength = origIdata[0].size();
+  if (signalLength <= 0) {
+    LOG_ERROR_HOST("signalLength <= 0");
+    return false;
+  }
+
+  // 设置CUDA设备堆大小以便支持设备端malloc/free
+  int deviceId = 0;
+  double heapSizePercent = 10.0f;
+  size_t minHeapSizeMB = 16;
+  size_t maxHeapSizeMB = 512;
+  SetGPUHeapSize(deviceId, heapSizePercent, minHeapSizeMB, maxHeapSizeMB);
+
+  // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
+  // copy下采样率，频率，输出信号长度等数据到显存中
+  int *d_downFactor = nullptr;
+  int *d_outputLength = nullptr;
+  double *d_frequency = nullptr;
+
+  // 申请显存
+  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(double))));
+
+  // copy下采样率到显存中
+  const int *src_downFactor = downFactor.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // copy每个带宽，重采样后输出信号长度到显存中
+  const int *src_outputLength = outputLength.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, src_outputLength,
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // copy频率到显存中
+  const double *src_frequency = detectFreq.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
+                              numResults * sizeof(double),
+                              cudaMemcpyHostToDevice));
+
+  // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
+  double *d_Idata = nullptr;
+  double *d_Qdata = nullptr;
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(double))));
+
+  // 将所有通道数据循环拷贝到GPU显存
+  size_t copySize = signalLength * sizeof(double);
+  for (int i = 0; i < numChannels; i++) {
+    // copy 原始的 idata 到gpu显存
+    double *dst_idata = d_Idata + i * signalLength;
+    const void *src_idata = origIdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
+
+    // copy 原始的 qdata 到gpu显存
+    double *dst_qdata = d_Qdata + i * signalLength;
+    const void *src_qdata = origQdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
+  }
+
+  // 申请移频所需的空间
+  double *I_shifted = nullptr;
+  double *Q_shifted = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&I_shifted, (signalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted, (signalLength * sizeof(double))));
+
+  // 申请重采样后输出信号的GPU显存
+  double *d_outputIdata = nullptr;
+  double *d_outputQdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(
+      &d_outputIdata, (numChannels * outputTotalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(cudaMalloc(
+      &d_outputQdata, (numChannels * outputTotalLength * sizeof(double))));
+
+  // 线程数配置
+  dim3 block(numChannels);
+  dim3 grid((numResults * numChannels + block.x - 1) / block.x);
+  const double sampling_rate = double(245.76e6);
+  ShiftingAndResamplingKernelDoubleV1<<<grid, block>>>(
+      d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
+      numChannels, signalLength, CurrentRealfreq, sampling_rate, I_shifted,
+      Q_shifted, d_outputIdata, d_outputQdata);
+
+  // copy重采样计算结果到主存
+  // 存储格式：[numResults][numChannels][lengthPerResults]
+  // 且在内存中是连续存放的
+  CHECK_CUDA_ERROR(
+      cudaMemcpy(outputIdata, d_outputIdata,
+                 (numChannels * outputTotalLength * sizeof(double)),
+                 cudaMemcpyDeviceToHost));
+
+  CHECK_CUDA_ERROR(
+      cudaMemcpy(outputQdata, d_outputQdata,
+                 (numChannels * outputTotalLength * sizeof(double)),
+                 cudaMemcpyDeviceToHost));
+
+  // 释放显存
+  if (d_downFactor) {
+    cudaFree(d_downFactor);
+    d_downFactor = nullptr;
+  }
+
+  if (d_outputLength) {
+    cudaFree(d_outputLength);
+    d_outputLength = nullptr;
+  }
+
+  if (d_frequency) {
+    cudaFree(d_frequency);
+    d_frequency = nullptr;
+  }
+
+  if (d_Idata) {
+    cudaFree(d_Idata);
+    d_Idata = nullptr;
+  }
+
+  if (d_Qdata) {
+    cudaFree(d_Qdata);
+    d_Qdata = nullptr;
+  }
+
+  if (I_shifted) {
+    cudaFree(I_shifted);
+    I_shifted = nullptr;
+  }
+
+  if (Q_shifted) {
+    cudaFree(Q_shifted);
+    Q_shifted = nullptr;
+  }
+
+  if (d_outputIdata) {
+    cudaFree(d_outputIdata);
+    d_outputIdata = nullptr;
+  }
+
+  if (d_outputQdata) {
+    cudaFree(d_outputQdata);
+    d_outputQdata = nullptr;
+  }
+
+  return true;
+}
+
+/**
+ * ShiftAndResampleSignalDoubleV2
+ * 重采样函数：完成原始信号的移频，重采样等计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param downFactor：每个带宽对应的下采样率
+ * @param detectFreq：每个带宽对应的频率
+ * @param alignSignalLength：重采样后信号的对齐长度
+ * @param numResults：检测结果数量
+ * @param numChannels：信号通道数
+ * @param CurrentRealfreq：当前实际频率
+ * @param outputIdata：
+ * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @param outputQdata：
+ * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @return true or false
+ */
+bool ShiftAndResampleSignalDoubleV2(
+    const std::vector<std::vector<double>> &origIdata,
+    const std::vector<std::vector<double>> &origQdata,
+    std::vector<int> &downFactor, std::vector<double> &detectFreq,
+    const int alignSignalLength, const int numResults, const int numChannels,
+    const double CurrentRealfreq, double *outputIdata, double *outputQdata) {
+  // 参数合法性检查
+  if (alignSignalLength <= 0) {
+    LOG_ERROR_HOST("alignSignalLength <= 0");
+    return false;
+  }
+  if (numResults <= 0) {
+    LOG_ERROR_HOST("numResults <= 0");
+    return false;
+  }
+  if (numChannels <= 0) {
+    LOG_ERROR_HOST("numChannels <= 0");
+    return false;
+  }
+
+  if (downFactor.size() != numResults) {
+    LOG_ERROR_HOST("vector downFactor lenght != numResults");
+    return false;
+  }
+  if (detectFreq.size() != numResults) {
+    LOG_ERROR_HOST("vector detectFreq lenght != numResults");
+    return false;
+  }
+
+  if (outputIdata == nullptr) {
+    LOG_ERROR_HOST("outputIdata is null ptr");
+    return false;
+  }
+  if (outputQdata == nullptr) {
+    LOG_ERROR_HOST("outputQdata is null ptr");
+    return false;
+  }
+
+  // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
+  int signalLength = origIdata[0].size();
+  if (signalLength <= 0) {
+    LOG_ERROR_HOST("signalLength <= 0");
+    return false;
+  }
+
+  // 设置CUDA设备堆大小以便支持设备端malloc/free
+  int deviceId = 0;
+  double heapSizePercent = 10.0f;
+  size_t minHeapSizeMB = 16;
+  size_t maxHeapSizeMB = 512;
+  SetGPUHeapSize(deviceId, heapSizePercent, minHeapSizeMB, maxHeapSizeMB);
+
+  // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
+  // copy下采样率，频率等数据到显存中
+  int *d_downFactor = nullptr;
+  double *d_frequency = nullptr;
+  // 申请显存
+  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(double))));
+
+  // copy下采样率到显存中
+  const int *src_downFactor = downFactor.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // copy频率到显存中
+  const double *src_frequency = detectFreq.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
+                              numResults * sizeof(double),
+                              cudaMemcpyHostToDevice));
+
+  // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
+  double *d_Idata = nullptr;
+  double *d_Qdata = nullptr;
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(double))));
+
+  // 将所有通道数据循环拷贝到GPU显存
+  size_t copySize = signalLength * sizeof(double);
+  for (int i = 0; i < numChannels; i++) {
+    // copy 原始的idata 到gpu显存
+    double *dst_idata = d_Idata + i * signalLength;
+    const void *src_idata = origIdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
+
+    // copy 原始的qdata 到gpu显存
+    double *dst_qdata = d_Qdata + i * signalLength;
+    const void *src_qdata = origQdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
+  }
+
+  // 申请移频所需的空间
+  double *I_shifted = nullptr;
+  double *Q_shifted = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&I_shifted, (signalLength * sizeof(double))));
+  CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted, (signalLength * sizeof(double))));
+
+  // 申请重采样后输出信号的GPU显存
+  size_t totalsize =
+      numResults * numChannels * alignSignalLength * sizeof(double);
+  double *d_outputIdata = nullptr;
+  double *d_outputQdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, totalsize));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, totalsize));
+
+  // 初始化为0
+  CHECK_CUDA_ERROR(cudaMemset(d_outputIdata, 0, totalsize));
+  CHECK_CUDA_ERROR(cudaMemset(d_outputQdata, 0, totalsize));
+
+  // 线程数配置，总的线程数：numChannels * numResults
+  dim3 block(numChannels);
+  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
+  const double sampling_rate = double(245.76e6);
+  ShiftingAndResamplingKernelDoubleV2<<<grid, block>>>(
+      d_Idata, d_Qdata, d_downFactor, d_frequency, numResults, numChannels,
+      signalLength, CurrentRealfreq, alignSignalLength, sampling_rate,
+      I_shifted, Q_shifted, d_outputIdata, d_outputQdata);
+
+  // copy重采样计算结果到主存
+  // 存储格式：[numResults][numChannels][lengthPerResults]
+  // 且在内存中是连续存放的
+  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata, totalsize,
+                              cudaMemcpyDeviceToHost));
+
+  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata, totalsize,
+                              cudaMemcpyDeviceToHost));
+
+  // 释放显存
+  if (d_downFactor) {
+    cudaFree(d_downFactor);
+    d_downFactor = nullptr;
+  }
+
+  if (d_frequency) {
+    cudaFree(d_frequency);
+    d_frequency = nullptr;
+  }
+
+  if (d_Idata) {
+    cudaFree(d_Idata);
+    d_Idata = nullptr;
+  }
+
+  if (d_Qdata) {
+    cudaFree(d_Qdata);
+    d_Qdata = nullptr;
+  }
+
+  if (I_shifted) {
+    cudaFree(I_shifted);
+    I_shifted = nullptr;
+  }
+
+  if (Q_shifted) {
+    cudaFree(Q_shifted);
+    Q_shifted = nullptr;
+  }
+
+  if (d_outputIdata) {
+    cudaFree(d_outputIdata);
+    d_outputIdata = nullptr;
+  }
+
+  if (d_outputQdata) {
+    cudaFree(d_outputQdata);
+    d_outputQdata = nullptr;
+  }
+
+  return true;
+}
\ No newline at end of file
diff --git a/cuda_resample_double.h b/cuda_resample_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..48d33285235ebc3cbc110090e47a2eb9612e0576
--- /dev/null
+++ b/cuda_resample_double.h
@@ -0,0 +1,81 @@
+#ifndef CUDA_RESAMPLE_DOUBLE_H
+#define CUDA_RESAMPLE_DOUBLE_H
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <math_constants.h>  // CUDA数学常量头文件
+#include <thrust/device_vector.h>
+
+#include <cmath>
+#include <map>
+#include <type_traits>
+#include <vector>
+
+#ifndef M_PI
+#define M_PI 3.141592653589793238462643
+#endif
+
+// 设备端Resampler状态结构
+struct DeviceResamplerStateDouble {
+  int _t;                    // "time" (modulo upRate)
+  int _xOffset;              // 输入偏移量
+  double *_state;            // 状态缓冲区指针
+  double *_transposedCoefs;  // 转置系数指针
+  int _coefsPerPhase;        // 每相系数数量
+  int _upRate;               // 上采样率
+  int _downRate;             // 下采样率
+};
+
+/**
+ * ShiftAndResampleSignalDoubleV1
+ * 重采样函数：完成原始信号的移频，重采样等计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param outputLength：重采样后每个带宽对应的输出信号长度
+ * @param downFactor：每个带宽对应的下采样率
+ * @param detectFreq：每个带宽对应的频率
+ * @param outputTotalLength：一个通道重采样后信号的总长度
+ * @param numResults：检测结果数量
+ * @param numChannels：信号通道数
+ * @param CurrentRealfreq：当前实际频率
+ * @param outputIdata：
+ * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @param outputQdata：
+ * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @return true or false
+ */
+bool ShiftAndResampleSignalDoubleV1(
+    const std::vector<std::vector<double>> &origIdata,
+    const std::vector<std::vector<double>> &origQdata,
+    std::vector<int> &outputLength, std::vector<int> &downFactor,
+    std::vector<double> &detectFreq, const int outputTotalLength,
+    const int numResults, const int numChannels, const double CurrentRealfreq,
+    double *outputIdata, double *outputQdata);
+
+/**
+ * ShiftAndResampleSignalDoubleV2
+ * 重采样函数：完成原始信号的移频，重采样等计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param downFactor：每个带宽对应的下采样率
+ * @param detectFreq：每个带宽对应的频率
+ * @param alignSignalLength：重采样后信号的对齐长度
+ * @param numResults：检测结果数量
+ * @param numChannels：信号通道数
+ * @param CurrentRealfreq：当前实际频率
+ * @param outputIdata：
+ * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @param outputQdata：
+ * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @return true or false
+ */
+bool ShiftAndResampleSignalDoubleV2(
+    const std::vector<std::vector<double>> &origIdata,
+    const std::vector<std::vector<double>> &origQdata,
+    std::vector<int> &downFactor, std::vector<double> &detectFreq,
+    const int alignSignalLength, const int numResults, const int numChannels,
+    const double CurrentRealfreq, double *outputIdata, double *outputQdata);
+
+#endif  // CUDA_RESAMPLE_DOUBLE_H
diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu
new file mode 100644
index 0000000000000000000000000000000000000000..508bfc58d58c59ef2196fbedb391b02867b1f3ac
--- /dev/null
+++ b/cuda_resample_float.cu
@@ -0,0 +1,1178 @@
+#include <stdio.h>
+
+#include "common.h"
+#include "cuda_resample_float.h"
+
+// 设备端Resampler初始化
+__device__ void resampler_init_state_device_float(
+    DeviceResamplerStateFloat *state, float *transposedCoefs, int coefsPerPhase,
+    int upRate, int downRate) {
+  if (state == nullptr) {
+    LOG_ERROR_DEVICE("state ptr is nullptr!");
+    return;
+  }
+
+  state->_t = 0;
+  state->_xOffset = 0;
+  if (transposedCoefs == nullptr) {
+    LOG_ERROR_DEVICE("transposedCoefs ptr is nullptr!");
+    return;
+  }
+  state->_transposedCoefs = transposedCoefs;
+  state->_coefsPerPhase = coefsPerPhase;
+  state->_upRate = upRate;
+  state->_downRate = downRate;
+
+  // 分配状态缓冲区
+  state->_state = (float *)malloc((coefsPerPhase - 1) * sizeof(float));
+  if (state->_state == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for state->_state!");
+    return;
+  }
+
+  // 初始化状态为零
+  for (int i = 0; i < coefsPerPhase - 1; i++) {
+    state->_state[i] = float(0);
+  }
+}
+
+// 设备端：计算所需输出数量
+__device__ int resampler_needed_out_count_device_float(
+    int inCount, DeviceResamplerStateFloat *state) {
+  if (state == nullptr) {
+    LOG_ERROR_DEVICE("state ptr is nullptr!");
+    return;
+  }
+
+  int np = inCount * state->_upRate;
+  int need = np / state->_downRate;
+
+  if ((state->_t + state->_upRate * state->_xOffset) <
+      (np % state->_downRate)) {
+    need++;
+  }
+
+  return need;
+}
+
+// 设备端：应用重采样
+__device__ int resampler_apply_device_float(float *in, int inCount, float *out,
+                                            int outCount,
+                                            DeviceResamplerStateFloat *state) {
+  if (in == nullptr) {
+    LOG_ERROR_DEVICE("in ptr is nullptr!");
+    return;
+  }
+  if (out == nullptr) {
+    LOG_ERROR_DEVICE("out ptr is nullptr!");
+    return;
+  }
+  if (state == nullptr) {
+    LOG_ERROR_DEVICE("state ptr is nullptr!");
+    return;
+  }
+  if (outCount < resampler_needed_out_count_device_float(inCount, state)) {
+    // 在设备端无法抛出异常，返回错误代码
+    return -1;
+  }
+
+  // x指向最新处理的输入样本
+  float *x = in + state->_xOffset;
+  float *y = out;
+  float *end = in + inCount;
+
+  while (x < end) {
+    float acc = 0;
+    float *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase;
+    float *xPtr = x - state->_coefsPerPhase + 1;
+
+    int offset = in - xPtr;
+    if (offset > 0) {
+      // 需要从_state缓冲区中获取
+      float *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset;
+
+      while (statePtr < state->_state + (state->_coefsPerPhase - 1)) {
+        acc += (*statePtr++) * (*h++);
+      }
+
+      xPtr += offset;
+    }
+
+    while (xPtr <= x) {
+      acc += (*xPtr++) * (*h++);
+    }
+
+    *y++ = acc;
+    state->_t += state->_downRate;
+
+    int advanceAmount = state->_t / state->_upRate;
+    x += advanceAmount;
+    state->_t %= state->_upRate;
+  }
+
+  state->_xOffset = x - end;
+
+  // 管理_state缓冲区
+  int retain = (state->_coefsPerPhase - 1) - inCount;
+
+  if (retain > 0) {
+    // 对于小于状态缓冲区的inCount，将缓冲区的末尾复制到开头
+    for (int i = 0; i < retain; i++) {
+      state->_state[i] =
+          state->_state[(state->_coefsPerPhase - 1) - retain + i];
+    }
+
+    // 然后将整个（短）输入复制到缓冲区末尾
+    for (int i = 0; i < inCount; i++) {
+      state->_state[retain + i] = in[i];
+    }
+  } else {
+    // 只将最后几个输入样本复制到状态缓冲区
+    for (int i = 0; i < state->_coefsPerPhase - 1; i++) {
+      state->_state[i] = *end - (float)(state->_coefsPerPhase - 1) + (float)i;
+    }
+  }
+
+  // 返回计算的样本数
+  return y - out;
+}
+
+// 设备端：转置滤波器系数（每个线程执行）
+__device__ void transpose_filter_coefs_device_float(float *transposedCoefs,
+                                                    float *coefs, int upRate,
+                                                    int coefCount,
+                                                    int coefsPerPhase) {
+  if (transposedCoefs == nullptr) {
+    LOG_ERROR_DEVICE("transposedCoefs ptr is nullptr!");
+    return;
+  }
+  if (coefs == nullptr) {
+    LOG_ERROR_DEVICE("coefs ptr is nullptr!");
+    return;
+  }
+
+  // 初始化转置系数为零
+  for (int i = 0; i < upRate * coefsPerPhase; i++) {
+    transposedCoefs[i] = float(0);
+  }
+
+  // 转置并翻转每个相位
+  for (int i = 0; i < upRate; ++i) {
+    for (int j = 0; j < coefsPerPhase; ++j) {
+      if (j * upRate + i < coefCount) {
+        transposedCoefs[(coefsPerPhase - 1 - j) + i * coefsPerPhase] =
+            coefs[j * upRate + i];
+      }
+    }
+  }
+}
+
+// 设备端upfirdn主函数
+__device__ void upfirdn_device_float(int upRate, int downRate, float *input,
+                                     int inLength, float *filter,
+                                     int filterLength, float *results,
+                                     int *resultsCount) {
+  if (input == nullptr) {
+    LOG_ERROR_DEVICE("input ptr is nullptr!");
+    return;
+  }
+  if (filter == nullptr) {
+    LOG_ERROR_DEVICE("filter ptr is nullptr!");
+    return;
+  }
+  if (results == nullptr) {
+    LOG_ERROR_DEVICE("results ptr is nullptr!");
+    return;
+  }
+
+  // 计算填充后的系数数量
+  int paddedCoefCount = filterLength;
+  while (paddedCoefCount % upRate) {
+    paddedCoefCount++;
+  }
+
+  int coefsPerPhase = paddedCoefCount / upRate;
+
+  // 分配转置系数内存
+  float *transposedCoefs = (float *)malloc(paddedCoefCount * sizeof(float));
+  if (transposedCoefs == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for transposedCoefs!");
+    return;
+  }
+
+  // 转置滤波器系数
+  transpose_filter_coefs_device_float(transposedCoefs, filter, upRate,
+                                      filterLength, coefsPerPhase);
+
+  // 创建Resampler状态
+  DeviceResamplerStateFloat state;
+  resampler_init_state_device_float(&state, transposedCoefs, coefsPerPhase,
+                                    upRate, downRate);
+
+  // 计算填充量
+  int padding = coefsPerPhase - 1;
+
+  // 分配填充输入内存
+  float *inputPadded = (float *)malloc((inLength + padding) * sizeof(float));
+  if (inputPadded == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for inputPadded!");
+    return;
+  }
+
+  // 复制输入并填充
+  for (int i = 0; i < inLength + padding; i++) {
+    if (i < inLength) {
+      inputPadded[i] = input[i];
+    } else {
+      inputPadded[i] = 0;
+    }
+  }
+
+  // 计算输出大小
+  int resultsCountValue =
+      resampler_needed_out_count_device_float(inLength + padding, &state);
+
+  // 设置输出计数
+  if (resultsCount != nullptr) {
+    *resultsCount = resultsCountValue;
+  }
+
+  // 运行滤波
+  int numSamplesComputed = resampler_apply_device_float(
+      inputPadded, inLength + padding, results, resultsCountValue, &state);
+
+  // 清理设备内存
+  free(inputPadded);
+
+  if (state._state != nullptr) {
+    free(state._state);
+    state._state = nullptr;
+  }
+
+  free(transposedCoefs);
+  state._transposedCoefs = nullptr;
+}
+
+// 整数向上取整除法
+__device__ __forceinline__ int dev_quotientCeil(int num1, int num2) {
+  return (num1 + num2 - 1) / num2;
+}
+
+// CUDA设备端GCD函数:最大公约数
+__device__ __forceinline__ int dev_gcd(int a, int b) {
+  while (b != 0) {
+    int temp = b;
+    b = a % b;
+    a = temp;
+  }
+  return a;
+}
+
+// 生成连续递增的序列
+__device__ __forceinline__ void dev_iota_float(float *data, int size,
+                                               float start) {
+  if (data == nullptr) {
+    LOG_ERROR_DEVICE("data ptr is nullptr!");
+    return;
+  }
+  for (int i = 0; i < size; i++) {
+    data[i] = start + float(i);
+  }
+  return;
+}
+
+// 填充data为value
+__device__ __forceinline__ void dev_fill_float(float *data, int size,
+                                               float value) {
+  if (data == nullptr) {
+    LOG_ERROR_DEVICE("data ptr is nullptr!");
+    return;
+  }
+  for (int i = 0; i < size; i++) {
+    data[i] = value;
+  }
+  return;
+}
+
+__device__ int dev_firls_float(float *result, int length, float *freq,
+                               const float *amplitude, int freqSize) {
+  if (result == nullptr) {
+    LOG_ERROR_DEVICE("result ptr is nullptr!");
+    return;
+  }
+  if (freq == nullptr) {
+    LOG_ERROR_DEVICE("freq ptr is nullptr!");
+    return;
+  }
+  if (amplitude == nullptr) {
+    LOG_ERROR_DEVICE("amplitude ptr is nullptr!");
+    return;
+  }
+
+  // 计算权重大小
+  int weightSize = freqSize / 2;
+
+  // 初始化权重向量
+  float *weight = (float *)malloc(weightSize * sizeof(float));
+  if (weight == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for weight!");
+    return -1;
+  }
+
+  // 初始化weight为全1
+  dev_fill_float(weight, weightSize, float(1.0));
+
+  // 处理频率向量
+  for (int i = 0; i < freqSize; i++) {
+    freq[i] = freq[i] / float(2.0);
+  }
+
+  int filterLength = length + 1;
+  length = (filterLength - 1) / 2;
+
+  // 奇偶判断
+  bool Nodd = filterLength & 1;
+
+  // 创建和初始化向量k
+  int kLength = length + 1;
+  float *k = (float *)malloc(kLength * sizeof(float));
+  if (k == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for k向量!");
+    return -1;
+  };
+
+  // 初始化k向量为递增序列：0，1，2...
+  dev_iota_float(k, kLength, float(0.0));
+
+  if (!Nodd) {
+    for (int i = 0; i < kLength; i++) {
+      k[i] += float(0.5);
+    }
+  }
+
+  if (Nodd) {
+    for (int i = 0; i < kLength; i++) {
+      k[i] = k[i + 1];
+    }
+    kLength--;
+  }
+
+  // 创建和初始化向量b
+  int bLength = kLength;
+  if (Nodd) {
+    bLength++;  // 此处++，因为后面需要在b[0]处插入b0
+  }
+
+  float *b = (float *)malloc(bLength * sizeof(float));
+  if (b == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for b向量!");
+    return -1;
+  };
+
+  dev_fill_float(b, bLength, float(0.0));
+
+  float b0 = float(0.0);
+  for (int i = 0; i < freqSize; i += 2) {
+    float Fi = freq[i];
+    float Fip1 = freq[i + 1];
+    float ampi = amplitude[i];
+    float ampip1 = amplitude[i + 1];
+    float wt2 = powf(weight[i / 2], float(2.0));
+    float m_s = (ampip1 - ampi) / (Fip1 - Fi);
+    float b1 = ampi - (m_s * Fi);
+
+    if (Nodd) {
+      b0 += (b1 * (Fip1 - Fi)) +
+            m_s / float(2.0) * (powf(Fip1, float(2.0)) - powf(Fi, float(2.0))) *
+                wt2;
+    }
+
+    // 并行计算b向量
+    for (int j = 0; j < kLength; j++) {
+      float kj = k[j];
+      b[j] += (m_s / (float(4.0) * powf(M_PI, float(2.0))) *
+               (cosf(float(2.0) * M_PI * Fip1) - cosf(float(2.0) * M_PI * Fi)) /
+               (powf(kj, float(2.0)))) *
+              wt2;
+
+      b[j] += (Fip1 * (m_s * Fip1 + b1) * sinf(float(2.0) * kj * Fip1) -
+               Fi * (m_s * Fi + b1) * sinf(float(2.0) * kj * Fi)) *
+              wt2;
+    }
+  }
+
+  // 处理最终结果，将b0插入到b向量的开始
+  if (Nodd) {
+    for (int i = kLength; i >= 0; i--) {
+      if (i > 0) {
+        b[i] = b[i - 1];
+      } else {
+        b[i] = b0;
+      }
+    }
+  }
+
+  // 计算a向量
+  float w0 = weight[0];
+
+  int aLength = bLength;
+  float *a = (float *)malloc(aLength * sizeof(float));
+  if (a == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for a向量!");
+    return -1;
+  };
+
+  // vector<float> result = {a.rbegin(), a.rend()};
+  for (int i = 0; i < aLength; i++) {
+    a[i] = powf(w0, float(2.0)) * float(4.0) * b[i];
+    result[aLength - 1 - i] = a[i];
+  }
+
+  int it = 0;
+  if (Nodd) {
+    it = 1;
+  }
+
+  // 构建结果向量
+  for (int i = 0; i < aLength; i++) {
+    result[i] = result[i] * float(0.5);
+    if ((i + it) < aLength) {
+      result[aLength + i] = a[i + it] * float(0.5);
+    }
+  }
+
+  // 释放动态分配的内存
+  free(weight);
+  free(k);
+  free(b);
+  free(a);
+  return 0;
+}
+
+// 设备端Bessel函数
+__device__ float dev_cyl_bessel_i_float(int n, float x) {
+  if (n == 0) return float(1);
+  float bessel = float(1), bessel_prev = float(1);
+  for (int i = 1; i <= n; ++i) {
+    bessel = (float(2) * i - float(1)) / i * x * bessel_prev - bessel;
+    bessel_prev = bessel;
+  }
+  return bessel;
+}
+
+// 设备端凯塞窗核函数
+__device__ void dev_kaiser_float(float *window, int order, float bta) {
+  if (window == nullptr) {
+    LOG_ERROR_DEVICE("window ptr is nullptr!");
+    return;
+  }
+  float Numerator, Denominator;
+  Denominator = dev_cyl_bessel_i_float(0, bta);
+  float od2 = (order - float(1)) / float(2);
+
+  for (int n = 0; n < order; n++) {
+    float x = bta * sqrt(float(1) - powf((n - od2) / od2, float(2)));
+    Numerator = dev_cyl_bessel_i_float(0, x);
+    window[n] = Numerator / Denominator;
+  }
+}
+
+__device__ void dev_resample_float(int upFactor, int downFactor,
+                                   float *inputSignal, const int inputSize,
+                                   float *outputSignal) {
+  if (inputSignal == nullptr) {
+    LOG_ERROR_DEVICE("inputSignal ptr is nullptr!");
+    return;
+  }
+  if (outputSignal == nullptr) {
+    LOG_ERROR_DEVICE("outputSignal ptr is nullptr!");
+    return;
+  }
+
+  const int n = 10;
+  const float bta = float(5.0);
+
+  if (upFactor <= 0 || downFactor <= 0) {
+    LOG_ERROR_DEVICE("upFactor and downFactor must be positive integer!");
+    return;
+  }
+
+  int gcd_o = dev_gcd(upFactor, downFactor);
+
+  upFactor /= gcd_o;
+  downFactor /= gcd_o;
+
+  if (upFactor == downFactor) {
+    for (int i = 0; i < inputSize; i++) {
+      outputSignal[i] = inputSignal[i];
+    }
+    return;
+  }
+
+  int outputSize = dev_quotientCeil(inputSize * upFactor, downFactor);
+
+  int maxFactor = (upFactor > downFactor) ? upFactor : downFactor;
+  float firlsFreq = float(1.0) / float(2.0) / static_cast<float>(maxFactor);
+  int length = 2 * n * maxFactor + 1;
+
+  float firlsFreqsV[4];
+  firlsFreqsV[0] = float(0.0);
+  firlsFreqsV[1] = float(2.0) * firlsFreq;
+  firlsFreqsV[2] = float(2.0) * firlsFreq;
+  firlsFreqsV[3] = float(1.0);
+
+  float firlsAmplitudeV[4];
+  firlsAmplitudeV[0] = float(1.0);
+  firlsAmplitudeV[1] = float(1.0);
+  firlsAmplitudeV[2] = float(0.0);
+  firlsAmplitudeV[3] = float(0.0);
+
+  int freqSize = 4;
+  int coefficientsLength = length;
+
+  float *coefficients = (float *)malloc(coefficientsLength * sizeof(float));
+  if (coefficients == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for coefficients!");
+    return;
+  }
+
+  int ret = dev_firls_float(coefficients, length - 1, firlsFreqsV,
+                            firlsAmplitudeV, freqSize);
+  if (ret == -1) {
+    LOG_ERROR_DEVICE("dev_firls_float function error!");
+    return;
+  }
+
+  int windowSize = length;
+  float *window = (float *)malloc(windowSize * sizeof(float));
+  if (window == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for window!");
+    return;
+  }
+
+  dev_kaiser_float(window, length, bta);
+
+  for (int i = 0; i < coefficientsLength; i++) {
+    coefficients[i] *= (upFactor * window[i]);
+  }
+
+  int lengthHalf = (length - 1) / 2;
+  int nz = downFactor - lengthHalf % downFactor;
+
+  // 分配filter空间
+  int hSize = coefficientsLength + nz;
+  float *filter =
+      (float *)malloc((coefficientsLength + 3 * nz) * sizeof(float));
+  if (filter == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for filter!");
+    return;
+  }
+
+  int filterLength = 0;
+  for (int i = 0; i < nz; i++) {
+    filter[i + filterLength] = float(0.0);
+  }
+  filterLength += nz;
+
+  for (int i = 0; i < coefficientsLength; i++) {
+    filter[i + filterLength] = coefficients[i];
+  }
+  filterLength += coefficientsLength;
+
+  lengthHalf += nz;
+  int delay = lengthHalf / downFactor;
+  nz = 0;
+  while (dev_quotientCeil((inputSize - 1) * upFactor + hSize + nz, downFactor) -
+             delay <
+         outputSize) {
+    nz++;
+  }
+
+  for (int i = 0; i < nz; i++) {
+    filter[i + filterLength] = float(0.0);
+  }
+  filterLength += nz;
+
+  // 计算
+  int paddedCoefCount = filterLength;
+  while (paddedCoefCount % upFactor) {
+    paddedCoefCount++;
+  }
+
+  int coefsPerPhase = paddedCoefCount / upFactor;
+  int padding = coefsPerPhase - 1;
+  int outputCount =
+      ((inputSize + padding) * upFactor + downFactor - 1) / downFactor;
+
+  float *results = (float *)malloc(outputCount * sizeof(float));
+  if (results == nullptr) {
+    LOG_ERROR_DEVICE("Failed to allocate device memory for upfirdn results!");
+    return;
+  }
+
+  int resultsCount = 0;
+  upfirdn_device_float(upFactor, downFactor, inputSignal, inputSize, filter,
+                       filterLength, results, &resultsCount);
+
+  int j = 0;
+  for (int i = delay; i < outputSize + delay; i++) {
+    outputSignal[j++] = results[i];
+  }
+
+  // 释放动态分配的内存
+  free(coefficients);
+  free(window);
+  free(filter);
+  free(results);
+  return;
+}
+
+/**
+ * ShiftingAndResamplingKernelFloatV1
+ * 重采样核函数：完成原始信号的移频，重采样等计算
+ * 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
+ * 因此共 numChannels * numResults 个线程并行计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param VDownFactor：下采样率
+ * @param VFrequency：频率
+ * @param VOutputLength：每个检测结果的重采样输出信号长度
+ * @param numResults：每帧的检测结果总数
+ * @param numChannels：信号通道数
+ * @param signalLength：每个通道的信号长度
+ * @param CurrentRealfreq：当前实际频率
+ * @param sampling_rate
+ * @param outputIdata：存放所有重采样后的Idata，调用时需确保显存空间足够
+ * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
+ * @return true or false
+ */
+__global__ void ShiftingAndResamplingKernelFloatV1(
+    const float *__restrict__ origIdata, const float *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
+    const int *__restrict__ VOutputLength, const int numResults,
+    const int numChannels, const int signalLength, const float CurrentRealfreq,
+    const float sampling_rate, float *I_shifted, float *Q_shifted,
+    float *__restrict__ outputIdata, float *__restrict__ outputQdata) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= numResults * numChannels) return;
+
+  // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
+  int ResIdx = idx / numChannels;  // 第几个检测结果
+  int chIdx = idx % numChannels;   // 第几个通道
+
+  float frequency = VFrequency[ResIdx];  // 频率
+  float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
+
+  // 获取当前线程处理的原始数据 （某一个通道的原始数据的地址）
+  const auto I_orig = origIdata + chIdx * signalLength;
+  const auto Q_orig = origQdata + chIdx * signalLength;
+
+  // 移频：生成本振信号并相乘
+  for (int i = 0; i < signalLength; i++) {
+    float phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+    float cosVal = cosf(phase);
+    float sinVal = sinf(phase);
+    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
+  }
+
+  // 上采样因子为1，下采样因子为downFactor
+  int upFactor = 1;
+  int downFactor = VDownFactor[ResIdx];
+
+  // 计算之前带宽，对应的输出信号的总长度
+  int beforeTotalLength = 0;
+  for (int i = 0; i < ResIdx; i++) {
+    beforeTotalLength += VOutputLength[i];
+  }
+  // 当前带宽对应的输出信号的起始地址偏移
+  int offset = beforeTotalLength * numChannels;
+
+  // 获取当前检测结果对应的输出信号长度（这里假设的是每个带宽对应的输出信号长度可能不相同）
+  int outputLength = VOutputLength[ResIdx];
+
+  // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
+  auto I_resampled = outputIdata + offset + chIdx * outputLength;
+  // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
+  auto Q_resampled = outputQdata + offset + chIdx * outputLength;
+
+  // 重采样
+  dev_resample_float(upFactor, downFactor, I_shifted, signalLength,
+                     I_resampled);
+  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength,
+                     Q_resampled);
+}
+
+/**
+ * ShiftingAndResamplingKernelFloatV2
+ * 重采样核函数：完成原始信号的移频，重采样等计算
+ * 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
+ * 因此共 numChannels * numResults 个线程并行计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param VDownFactor：下采样率
+ * @param VFrequency：频率
+ * @param numResults：每帧的检测结果总数
+ * @param numChannels：信号通道数
+ * @param signalLength：每个通道的原始信号长度
+ * @param CurrentRealfreq：当前实际频率
+ * @param alignSignalLength：重采样后信号的对齐长度
+ * @param outputIdata：存放所有重采样后的Idata，调用时需确保显存空间足够
+ * @param outputQdata：存放所有重采样后的Qdata，调用时需确保显存空间足够
+ * @return true or false
+ */
+__global__ void ShiftingAndResamplingKernelFloatV2(
+    const float *__restrict__ origIdata, const float *__restrict__ origQdata,
+    const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency,
+    const int numResults, const int numChannels, const int signalLength,
+    const float CurrentRealfreq, const int alignSignalLength,
+    const float sampling_rate, float *I_shifted, float *Q_shifted,
+    float *__restrict__ outputIdata, float *__restrict__ outputQdata) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= numChannels * numResults) return;
+
+  // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样
+  int ResIdx = idx / numChannels;  // 第几个检测结果
+  int chIdx = idx % numChannels;   // 第几个通道
+
+  float frequency = VFrequency[ResIdx];  // 频率
+  float deltaFreq = (CurrentRealfreq - frequency) * 1e6;
+
+  // 获取当前线程处理的通道数据地址
+  const auto I_orig = origIdata + chIdx * signalLength;
+  const auto Q_orig = origQdata + chIdx * signalLength;
+
+  // 移频：生成本振信号并相乘
+  for (int i = 0; i < signalLength; i++) {
+    float phase = 2 * M_PI * deltaFreq * i / sampling_rate;
+    float cosVal = cosf(phase);
+    float sinVal = sinf(phase);
+    I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal;
+    Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal;
+  }
+
+  // 上采样因子为1，下采样因子为downFactor
+  int upFactor = 1;
+  int downFactor = VDownFactor[ResIdx];
+
+  // outputIdata：存放所有重采样后的Idata，调用时需确保内存空间足够
+  auto I_resampled =
+      outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+  // outputQdata：存放所有重采样后的Qdata，调用时需确保内存空间足够
+  auto Q_resampled =
+      outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength;
+
+  // 重采样
+  dev_resample_float(upFactor, downFactor, I_shifted, signalLength,
+                     I_resampled);
+  dev_resample_float(upFactor, downFactor, Q_shifted, signalLength,
+                     Q_resampled);
+}
+
+/**
+ * ShiftAndResampleSignalFloatV1
+ * 重采样函数：完成原始信号的移频，重采样等计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param outputLength：重采样后每个带宽对应的输出信号长度
+ * @param downFactor：每个带宽对应的下采样率
+ * @param detectFreq：每个带宽对应的频率
+ * @param outputTotalLength：一个通道重采样后信号的总长度
+ * @param numResults：检测结果数量
+ * @param numChannels：信号通道数
+ * @param CurrentRealfreq：当前实际频率
+ * @param outputIdata：
+ * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @param outputQdata：
+ * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @return true or false
+ */
+bool ShiftAndResampleSignalFloatV1(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    std::vector<int> &outputLength, std::vector<int> &downFactor,
+    std::vector<float> &detectFreq, const int outputTotalLength,
+    const int numResults, const int numChannels, const float CurrentRealfreq,
+    float *outputIdata, float *outputQdata) {
+  // 参数合法性检查
+  if (outputTotalLength <= 0) {
+    LOG_ERROR_HOST("outputTotalLength <= 0");
+    return false;
+  }
+  if (numResults <= 0) {
+    LOG_ERROR_HOST("numResults <= 0");
+    return false;
+  }
+  if (numChannels <= 0) {
+    LOG_ERROR_HOST("numChannels <= 0");
+    return false;
+  }
+
+  if (outputLength.size() != numResults) {
+    LOG_ERROR_HOST("vector outputLength lenght != numResults");
+    return false;
+  }
+  if (downFactor.size() != numResults) {
+    LOG_ERROR_HOST("vector downFactor lenght != numResults");
+    return false;
+  }
+  if (detectFreq.size() != numResults) {
+    LOG_ERROR_HOST("vector detectFreq lenght != numResults");
+    return false;
+  }
+
+  if (outputIdata == nullptr) {
+    LOG_ERROR_HOST("outputIdata is null ptr");
+    return false;
+  }
+  if (outputQdata == nullptr) {
+    LOG_ERROR_HOST("outputQdata is null ptr");
+    return false;
+  }
+
+  // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
+  int signalLength = origIdata[0].size();
+  if (signalLength <= 0) {
+    LOG_ERROR_HOST("signalLength <= 0");
+    return false;
+  }
+
+  // 设置CUDA设备堆大小以便支持设备端malloc/free
+  int deviceId = 0;
+  float heapSizePercent = 10.0f;
+  size_t minHeapSizeMB = 16;
+  size_t maxHeapSizeMB = 512;
+  SetGPUHeapSize(deviceId, heapSizePercent, minHeapSizeMB, maxHeapSizeMB);
+
+  // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
+  // copy下采样率，频率，输出信号长度等数据到显存中
+  int *d_downFactor = nullptr;
+  int *d_outputLength = nullptr;
+  float *d_frequency = nullptr;
+
+  // 申请显存
+  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(float))));
+
+  // copy下采样率到显存中
+  const int *src_downFactor = downFactor.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // copy每个带宽，重采样后输出信号长度到显存中
+  const int *src_outputLength = outputLength.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, src_outputLength,
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // copy频率到显存中
+  const float *src_frequency = detectFreq.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
+                              numResults * sizeof(float),
+                              cudaMemcpyHostToDevice));
+
+  // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
+  float *d_Idata = nullptr;
+  float *d_Qdata = nullptr;
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(float))));
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(float))));
+
+  // 将所有通道数据循环拷贝到GPU显存
+  size_t copySize = signalLength * sizeof(float);
+  for (int i = 0; i < numChannels; i++) {
+    // copy 原始的 idata 到gpu显存
+    float *dst_idata = d_Idata + i * signalLength;
+    const void *src_idata = origIdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
+
+    // copy 原始的 qdata 到gpu显存
+    float *dst_qdata = d_Qdata + i * signalLength;
+    const void *src_qdata = origQdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
+  }
+
+  // 申请移频所需的空间
+  float *I_shifted = nullptr;
+  float *Q_shifted = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&I_shifted, (signalLength * sizeof(float))));
+  CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted, (signalLength * sizeof(float))));
+
+  // 申请重采样后输出信号的GPU显存
+  float *d_outputIdata = nullptr;
+  float *d_outputQdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(
+      &d_outputIdata, (numChannels * outputTotalLength * sizeof(float))));
+  CHECK_CUDA_ERROR(cudaMalloc(
+      &d_outputQdata, (numChannels * outputTotalLength * sizeof(float))));
+
+  // 线程数配置
+  dim3 block(numChannels);
+  dim3 grid((numResults * numChannels + block.x - 1) / block.x);
+  const float sampling_rate = float(245.76e6);
+  ShiftingAndResamplingKernelFloatV1<<<grid, block>>>(
+      d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults,
+      numChannels, signalLength, CurrentRealfreq, sampling_rate, I_shifted,
+      Q_shifted, d_outputIdata, d_outputQdata);
+
+  // copy重采样计算结果到主存
+  // 存储格式：[numResults][numChannels][lengthPerResults]
+  // 且在内存中是连续存放的
+  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata,
+                              (numChannels * outputTotalLength * sizeof(float)),
+                              cudaMemcpyDeviceToHost));
+
+  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata,
+                              (numChannels * outputTotalLength * sizeof(float)),
+                              cudaMemcpyDeviceToHost));
+
+  // 释放显存
+  if (d_downFactor) {
+    cudaFree(d_downFactor);
+    d_downFactor = nullptr;
+  }
+
+  if (d_outputLength) {
+    cudaFree(d_outputLength);
+    d_outputLength = nullptr;
+  }
+
+  if (d_frequency) {
+    cudaFree(d_frequency);
+    d_frequency = nullptr;
+  }
+
+  if (d_Idata) {
+    cudaFree(d_Idata);
+    d_Idata = nullptr;
+  }
+
+  if (d_Qdata) {
+    cudaFree(d_Qdata);
+    d_Qdata = nullptr;
+  }
+
+  if (I_shifted) {
+    cudaFree(I_shifted);
+    I_shifted = nullptr;
+  }
+
+  if (Q_shifted) {
+    cudaFree(Q_shifted);
+    Q_shifted = nullptr;
+  }
+
+  if (d_outputIdata) {
+    cudaFree(d_outputIdata);
+    d_outputIdata = nullptr;
+  }
+
+  if (d_outputQdata) {
+    cudaFree(d_outputQdata);
+    d_outputQdata = nullptr;
+  }
+
+  return true;
+}
+
+/**
+ * ShiftAndResampleSignalFloatV2
+ * 重采样函数：完成原始信号的移频，重采样等计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param downFactor：每个带宽对应的下采样率
+ * @param detectFreq：每个带宽对应的频率
+ * @param alignSignalLength：重采样后信号的对齐长度
+ * @param numResults：检测结果数量
+ * @param numChannels：信号通道数
+ * @param CurrentRealfreq：当前实际频率
+ * @param outputIdata：
+ * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @param outputQdata：
+ * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @return true or false
+ */
+bool ShiftAndResampleSignalFloatV2(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    std::vector<int> &downFactor, std::vector<float> &detectFreq,
+    const int alignSignalLength, const int numResults, const int numChannels,
+    const float CurrentRealfreq, float *outputIdata, float *outputQdata) {
+  // 参数合法性检查
+  if (alignSignalLength <= 0) {
+    LOG_ERROR_HOST("alignSignalLength <= 0");
+    return false;
+  }
+  if (numResults <= 0) {
+    LOG_ERROR_HOST("numResults <= 0");
+    return false;
+  }
+  if (numChannels <= 0) {
+    LOG_ERROR_HOST("numChannels <= 0");
+    return false;
+  }
+
+  if (downFactor.size() != numResults) {
+    LOG_ERROR_HOST("vector downFactor lenght != numResults");
+    return false;
+  }
+  if (detectFreq.size() != numResults) {
+    LOG_ERROR_HOST("vector detectFreq lenght != numResults");
+    return false;
+  }
+
+  if (outputIdata == nullptr) {
+    LOG_ERROR_HOST("outputIdata is null ptr");
+    return false;
+  }
+  if (outputQdata == nullptr) {
+    LOG_ERROR_HOST("outputQdata is null ptr");
+    return false;
+  }
+
+  // 每个通道的信号长度：这里假设所有通道的原始信号长度是相同的
+  int signalLength = origIdata[0].size();
+  if (signalLength <= 0) {
+    LOG_ERROR_HOST("signalLength <= 0");
+    return false;
+  }
+
+  // 设置CUDA设备堆大小以便支持设备端malloc/free
+  int deviceId = 0;
+  float heapSizePercent = 10.0f;
+  size_t minHeapSizeMB = 16;
+  size_t maxHeapSizeMB = 512;
+  SetGPUHeapSize(deviceId, heapSizePercent, minHeapSizeMB, maxHeapSizeMB);
+
+  // ====准备调用重采样核函数：ShiftingAndResamplingKernel=====
+  // copy下采样率，频率等数据到显存中
+  int *d_downFactor = nullptr;
+  float *d_frequency = nullptr;
+  // 申请显存
+  CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int))));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(float))));
+
+  // copy下采样率到显存中
+  const int *src_downFactor = downFactor.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor,
+                              numResults * sizeof(int),
+                              cudaMemcpyHostToDevice));
+
+  // copy频率到显存中
+  const float *src_frequency = detectFreq.data();
+  CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency,
+                              numResults * sizeof(float),
+                              cudaMemcpyHostToDevice));
+
+  // 申请原始的idata和qdata所需的GPU显存，并将数据copy到GPU显存中
+  float *d_Idata = nullptr;
+  float *d_Qdata = nullptr;
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(float))));
+  CHECK_CUDA_ERROR(
+      cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(float))));
+
+  // 将所有通道数据循环拷贝到GPU显存
+  size_t copySize = signalLength * sizeof(float);
+  for (int i = 0; i < numChannels; i++) {
+    // copy 原始的idata 到gpu显存
+    float *dst_idata = d_Idata + i * signalLength;
+    const void *src_idata = origIdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice));
+
+    // copy 原始的qdata 到gpu显存
+    float *dst_qdata = d_Qdata + i * signalLength;
+    const void *src_qdata = origQdata[i].data();
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice));
+  }
+
+  // 申请移频所需的空间
+  float *I_shifted = nullptr;
+  float *Q_shifted = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&I_shifted, (signalLength * sizeof(float))));
+  CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted, (signalLength * sizeof(float))));
+
+  // 申请重采样后输出信号的GPU显存
+  size_t totalsize =
+      numResults * numChannels * alignSignalLength * sizeof(float);
+  float *d_outputIdata = nullptr;
+  float *d_outputQdata = nullptr;
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, totalsize));
+  CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, totalsize));
+
+  // 初始化为0
+  CHECK_CUDA_ERROR(cudaMemset(d_outputIdata, 0, totalsize));
+  CHECK_CUDA_ERROR(cudaMemset(d_outputQdata, 0, totalsize));
+
+  // 线程数配置，总的线程数：numChannels * numResults
+  dim3 block(numChannels);
+  dim3 grid((numChannels * numResults + block.x - 1) / block.x);
+  const float sampling_rate = float(245.76e6);
+  ShiftingAndResamplingKernelFloatV2<<<grid, block>>>(
+      d_Idata, d_Qdata, d_downFactor, d_frequency, numResults, numChannels,
+      signalLength, CurrentRealfreq, alignSignalLength, sampling_rate,
+      I_shifted, Q_shifted, d_outputIdata, d_outputQdata);
+
+  // copy重采样计算结果到主存
+  // 存储格式：[numResults][numChannels][lengthPerResults]
+  // 且在内存中是连续存放的
+  CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata, totalsize,
+                              cudaMemcpyDeviceToHost));
+
+  CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata, totalsize,
+                              cudaMemcpyDeviceToHost));
+
+  // 释放显存
+  if (d_downFactor) {
+    cudaFree(d_downFactor);
+    d_downFactor = nullptr;
+  }
+
+  if (d_frequency) {
+    cudaFree(d_frequency);
+    d_frequency = nullptr;
+  }
+
+  if (d_Idata) {
+    cudaFree(d_Idata);
+    d_Idata = nullptr;
+  }
+
+  if (d_Qdata) {
+    cudaFree(d_Qdata);
+    d_Qdata = nullptr;
+  }
+
+  if (I_shifted) {
+    cudaFree(I_shifted);
+    I_shifted = nullptr;
+  }
+
+  if (Q_shifted) {
+    cudaFree(Q_shifted);
+    Q_shifted = nullptr;
+  }
+
+  if (d_outputIdata) {
+    cudaFree(d_outputIdata);
+    d_outputIdata = nullptr;
+  }
+
+  if (d_outputQdata) {
+    cudaFree(d_outputQdata);
+    d_outputQdata = nullptr;
+  }
+
+  return true;
+}
\ No newline at end of file
diff --git a/cuda_resample_float.h b/cuda_resample_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d6ac976cab17eb3ef8ceb96f75b87d9f9493698
--- /dev/null
+++ b/cuda_resample_float.h
@@ -0,0 +1,81 @@
+#ifndef CUDA_RESAMPLE_FLOAT_H
+#define CUDA_RESAMPLE_FLOAT_H
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <math_constants.h>  // CUDA数学常量头文件
+#include <thrust/device_vector.h>
+
+#include <cmath>
+#include <map>
+#include <type_traits>
+#include <vector>
+
+#ifndef M_PI
+#define M_PI 3.141592653589793238462643
+#endif
+
+// 设备端Resampler状态结构
+struct DeviceResamplerStateFloat {
+  int _t;                   // "time" (modulo upRate)
+  int _xOffset;             // 输入偏移量
+  float *_state;            // 状态缓冲区指针
+  float *_transposedCoefs;  // 转置系数指针
+  int _coefsPerPhase;       // 每相系数数量
+  int _upRate;              // 上采样率
+  int _downRate;            // 下采样率
+};
+
+/**
+ * ShiftAndResampleSignalFloatV1
+ * 重采样函数：完成原始信号的移频，重采样等计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param outputLength：重采样后每个带宽对应的输出信号长度
+ * @param downFactor：每个带宽对应的下采样率
+ * @param detectFreq：每个带宽对应的频率
+ * @param outputTotalLength：一个通道重采样后信号的总长度
+ * @param numResults：检测结果数量
+ * @param numChannels：信号通道数
+ * @param CurrentRealfreq：当前实际频率
+ * @param outputIdata：
+ * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @param outputQdata：
+ * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @return true or false
+ */
+bool ShiftAndResampleSignalFloatV1(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    std::vector<int> &outputLength, std::vector<int> &downFactor,
+    std::vector<float> &detectFreq, const int outputTotalLength,
+    const int numResults, const int numChannels, const float CurrentRealfreq,
+    float *outputIdata, float *outputQdata);
+
+/**
+ * ShiftAndResampleSignalFloatV2
+ * 重采样函数：完成原始信号的移频，重采样等计算
+ *
+ * @param origIdata：原始Idata
+ * @param origQdata：原始Qdata
+ * @param downFactor：每个带宽对应的下采样率
+ * @param detectFreq：每个带宽对应的频率
+ * @param alignSignalLength：重采样后信号的对齐长度
+ * @param numResults：检测结果数量
+ * @param numChannels：信号通道数
+ * @param CurrentRealfreq：当前实际频率
+ * @param outputIdata：
+ * 重采样后的Idata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @param outputQdata：
+ * 重采样后的Qdata，连续存储，格式：[numResults][numChannels][lengthPerResult]
+ * @return true or false
+ */
+bool ShiftAndResampleSignalFloatV2(
+    const std::vector<std::vector<float>> &origIdata,
+    const std::vector<std::vector<float>> &origQdata,
+    std::vector<int> &downFactor, std::vector<float> &detectFreq,
+    const int alignSignalLength, const int numResults, const int numChannels,
+    const float CurrentRealfreq, float *outputIdata, float *outputQdata);
+
+#endif  // CUDA_RESAMPLE_FLOAT_H
diff --git a/mainwindow.cpp b/mainwindow.cpp
index 81143458bc38b390c208d825d616d0a3b55dc88b..a8c9d8ce66aa7303eb88cf27d877bddeead3adf1 100644
--- a/mainwindow.cpp
+++ b/mainwindow.cpp
@@ -10,7 +10,8 @@
 
 using namespace std;
 
-MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent) {
+MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent)
+{
   InitControlValues();
   InitUI();
   InitConnect();
@@ -18,7 +19,8 @@ MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent) {
 
 MainWindow::~MainWindow() {}
 
-void MainWindow::InitControlValues() {
+void MainWindow::InitControlValues()
+{
   m_btnCalculate = new QPushButton(QStringLiteral("加载数据计算"), this);
 
   basePath = "/../data/";
@@ -26,15 +28,18 @@ void MainWindow::InitControlValues() {
 
 void MainWindow::InitUI() { setCentralWidget(m_btnCalculate); }
 
-void MainWindow::InitConnect() {
+void MainWindow::InitConnect()
+{
   connect(m_btnCalculate, SIGNAL(clicked()), this, SLOT(SlotCalculateClick()));
 }
 
-int MainWindow::CalculateRoutine(uint signalChannels, uint signalLength) {
+int MainWindow::CalculateRoutine(uint signalChannels, uint signalLength)
+{
   return m_calMC.CalMovingCorrlationRoutine(signalChannels, signalLength);
 }
 
-void MainWindow::SlotCalculateClick() {
+void MainWindow::SlotCalculateClick()
+{
   QString strFileName =
       "20250213105831_Detect_FreqList5720-5750MHz_Span15.36MHz_Point4096_2025-"
       "02-13 10-58-31_0.dat";
@@ -47,7 +52,8 @@ void MainWindow::SlotCalculateClick() {
   QString strIQFileName =
       QCoreApplication::applicationDirPath() + basePath + strFileName;
 
-  if (QFile::exists(strIQFileName) != true) {
+  if (QFile::exists(strIQFileName) != true)
+  {
     std::cerr << __FUNCTION__ << strIQFileName.toStdString() << ":文件不存在"
               << std::endl;
     return;
@@ -72,18 +78,23 @@ void MainWindow::SlotCalculateClick() {
 
   // 打开回放文件
   m_ReplayFile.open(strIQFileName.toStdString(), ios::in | ios::binary);
-  if (!m_ReplayFile) {
+  if (!m_ReplayFile)
+  {
     qDebug() << __FUNCTION__ << "file open error";
     return;
   }
 
   // 循环读取每一帧数据 4096点
   for (int m_uiCurrentFrame = 0; m_uiCurrentFrame < m_iframeCnt;
-       ++m_uiCurrentFrame) {
-    if (m_uiCurrentFrame < m_iframeCnt - 1) {
+       ++m_uiCurrentFrame)
+  {
+    if (m_uiCurrentFrame < m_iframeCnt - 1)
+    {
       oneframesize = m_vecReplayHeadposDetect.at(m_uiCurrentFrame + 1) -
                      m_vecReplayHeadposDetect.at(m_uiCurrentFrame);
-    } else {
+    }
+    else
+    {
       oneframesize = m_ReplayfilesizeDetect -
                      m_vecReplayHeadposDetect.at(m_uiCurrentFrame);
     }
@@ -111,11 +122,13 @@ void MainWindow::SlotCalculateClick() {
 
 void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath,
                                       std::vector<size_t> &headPos,
-                                      qint64 &Replayfilesize) {
+                                      qint64 &Replayfilesize)
+{
   std::ifstream replayfileforcalculate;
   replayfileforcalculate.open(ReplayFilePath.toStdString(),
                               ios::in | ios::binary);
-  if (!replayfileforcalculate) {
+  if (!replayfileforcalculate)
+  {
     qDebug() << __FUNCTION__ << "file open error";
     return;
   }
@@ -135,7 +148,8 @@ void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath,
   size_t count = 0;
   headPos.clear();
   headPos.reserve(5000);
-  while ((index = source.find(match, index)) < Replayfilesize) {
+  while ((index = source.find(match, index)) < Replayfilesize)
+  {
     unsigned int DataSize = m_droneIQParse.GetDataSize(buff + index);
     headPos.push_back(index);
     index += 8;
@@ -150,17 +164,22 @@ void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath,
   buff = nullptr;
 }
 
-void MainWindow::ReplayIQDataParse(char *buf) {
-  if (signalLength_ > 0) {
-    if (signalChannels_ == 32) {
-      uint channelnumber = 8;  //原逻辑也是只取了前8个通道
+void MainWindow::ReplayIQDataParse(char *buf)
+{
+  if (signalLength_ > 0)
+  {
+    if (signalChannels_ == 32)
+    {
+      uint channelnumber = 8; // 原逻辑也是只取了前8个通道
 
-      if (signalDatas_ == nullptr) {
+      if (signalDatas_ == nullptr)
+      {
         // 申请零拷贝内存，自动完成CPU内存与GPU显存数据同步
         if (!m_calMC.cudaCorrelation->AllocMappMemory(
                 (void **)&(m_calMC.cudaCorrelation->h_signals),
                 (void **)&(m_calMC.cudaCorrelation->d_signals),
-                channelnumber * signalLength_ * sizeof(cpuComplex))) {
+                channelnumber * signalLength_ * sizeof(cpuComplex)))
+        {
           std::cerr << __FUNCTION__ << " AllocMappMemory failed." << std::endl;
           return;
         }
@@ -182,3 +201,50 @@ void MainWindow::ReplayIQDataParse(char *buf) {
     }
   }
 }
+
+template <typename T>
+void ReplayIQDataParseV2(CalculateMovingCorrelation &m_calMC,
+                         cpuComplex *signalDatas, const T *outputIdata,
+                         const T *outputQdata, const int numResults,
+                         const int numChannels,
+                         const int signalLength)
+{
+  if (signalDatas == nullptr)
+  {
+    // 申请零拷贝内存，自动完成CPU内存与GPU显存数据同步
+    if (!m_calMC.cudaCorrelation->AllocMappMemory(
+            (void **)&(m_calMC.cudaCorrelation->h_signals),
+            (void **)&(m_calMC.cudaCorrelation->d_signals),
+            numResults * numChannels * signalLength * sizeof(cpuComplex)))
+    {
+      std::cerr << __FUNCTION__ << " AllocMappMemory failed." << std::endl;
+      return;
+    }
+
+    signalDatas = (cpuComplex *)m_calMC.cudaCorrelation->h_signals;
+  }
+
+  int index = 0;
+  for (int i = 0; i < numResults; i++)
+  {
+    for (int j = 0; j < numChannels; j++)
+    {
+      for (int k = 0; k < signalLength; k++)
+      {
+        int idx = (i * numChannels + j) * signalLength + k;
+        cpuComplex data((T)outputIdata[idx], (T)outputQdata[idx]); // cpuComplex
+        signalDatas[index++] = data;
+      }
+    }
+  }
+
+  QElapsedTimer tm;
+  tm.start();
+
+  // 每帧 SamplePoints 个点 IQ 输入
+  // 计算总流程 获得最终结果 1--找到相关峰 0--未找到相关峰
+  int result = m_calMC.CalMovingCorrlationRoutine(numResults * numChannels, signalLength);
+
+  std::cout << __FUNCTION__ << " result:" << result
+            << " tm(ns):" << tm.nsecsElapsed() << std::endl;
+}
diff --git a/mainwindow.h b/mainwindow.h
index a63a935761b36cadc43988f28f00ee0e69fd8d10..a5951b48d202835f038e3cffc89061b74a379ac7 100644
--- a/mainwindow.h
+++ b/mainwindow.h
@@ -14,10 +14,11 @@
 
 using namespace std;
 
-class MainWindow : public QMainWindow {
+class MainWindow : public QMainWindow
+{
   Q_OBJECT
 
- public:
+public:
   MainWindow(QWidget *parent = nullptr);
   ~MainWindow();
 
@@ -34,7 +35,7 @@ class MainWindow : public QMainWindow {
   uint signalChannels_ = 0;
   uint signalLength_ = 0;
 
- private:
+private:
   // 获取测试数据文件中 每一帧数据的帧头下标
   void GetReplayFileHeadPos(QString ReplayFilePath,
                             std::vector<size_t> &headPos,
@@ -43,7 +44,7 @@ class MainWindow : public QMainWindow {
   // 解析每一帧数据为 8路的 IQ 原始数据
   void ReplayIQDataParse(char *buf);
 
- private:
+private:
   QPushButton *m_btnCalculate;
 
   CalculateMovingCorrelation m_calMC;
@@ -57,7 +58,15 @@ class MainWindow : public QMainWindow {
   ifstream m_ReplayFile;
   int oneframesize;
 
- public slots:
+public slots:
   void SlotCalculateClick();
 };
-#endif  // MAINWINDOW_H
+
+template <typename T>
+void ReplayIQDataParseV2(CalculateMovingCorrelation &m_calMC,
+                         cpuComplex *signalDatas, const T *outputIdata,
+                         const T *outputQdata, const int numResults,
+                         const int numChannels,
+                         const int signalLength);
+
+#endif // MAINWINDOW_H