diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ad285811ccaef4c6ec3addc080ed19e702c4fef..aa1e423cbc5ad317aa747a484c47fcb3d5e40ed4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,8 +55,15 @@ if(USE_CUDA) message(STATUS "CMAKE_CUDA_ARCHITECTURES=" ${CMAKE_CUDA_ARCHITECTURES}) endif() +# 对于模板核函数需要这些选项 +# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \ +# -Wno-deprecated-gpu-targets") + +# 如果使用分离编译,需要添加 +# set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + # debug OR release mode -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wdeprecated-declarations -fPIC -std=c++11 -pthread -pipe") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wdeprecated-declarations -fPIC -std=c++17 -pthread -pipe") if (NOT CMAKE_BUILD_TYPE) set(_CMAKE_BUILD_TYPE_LOWER "release") else() diff --git a/common.cpp b/common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..822f9b27988c24c5a711aaf56e446647690a0b1c --- /dev/null +++ b/common.cpp @@ -0,0 +1,101 @@ +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include +#include +#include + +#include + +/** + * @brief 计算两个整数相除的向上取整结果(商向正无穷方向取整) + * + * 该函数实现整数除法的向上取整,与数学中的ceil(num1/num2)行为一致。 + * 例如:7/3=2.333→向上取整为3;-7/3=-2.333→向上取整为-2;7/-3=-2.333→向上取整为-2 + * + * @param[in] num1 被除数(分子) + * @param[in] num2 除数(分母),不能为0 + * @return int 向上取整后的商 + * @note + * 1. 当除数为0时,会输出错误信息到stderr并返回0(实际应用中建议改为抛出异常) + * 2. + * 仅当存在非零余数且除数与被除数同号时,商才需要加1(避免异号情况下错误进位) + * 3. 与C++标准库的std::ceil不同,该函数仅处理整数运算,无浮点精度损失 + * @warning 除数为0时返回0是临时容错处理,生产环境应使用assert或异常机制终止程序 + */ +int quotientCeil(int num1, int num2) { + // 处理除数为0的情况(根据需求选择返回值或抛出异常) + if (num2 == 0) { + std::cerr << "Division by zero error" << std::endl; + return 0; // 或使用 assert(0) 终止程序 + } + + // 计算商和余数 + int quotient = num1 / num2; + int remainder = num1 % num2; + + // 仅当余数不为0且除数与被除数同号时,才需要向上取整(商+1) + if (remainder != 0 && ((num1 > 0 && num2 > 0) || (num1 < 0 && num2 < 0))) { + quotient += 1; + } + + return quotient; +} + +/** + * @brief 设置CUDA设备堆大小以便支持设备端malloc/free + * @param deviceId 要设置的设备ID(默认0) + * @param heapSizePercent 要分配的全局内存百分比(默认10%,范围1-50) + * @param minHeapSizeMB 最小堆大小MB(默认16MB) + * @param maxHeapSizeMB 最大堆大小MB(默认512MB) + * @return 0成功,-1失败 + */ +int SetGPUHeapSize(int deviceId = 0, float heapSizePercent = 10.0f, + size_t minHeapSizeMB = 16, size_t maxHeapSizeMB = 512) { + // 获取设备属性 + cudaDeviceProp prop; + cudaError_t err = cudaGetDeviceProperties(&prop, deviceId); + if (err != cudaSuccess) { + std::cerr << "获取设备属性失败: " << cudaGetErrorString(err) << std::endl; + return -1; + } + + // 检查设备是否支持设备端malloc + if (prop.major < 2) { + std::cerr << "设备不支持设备端malloc (计算能力 < 2.0)" << std::endl; + std::cerr << "设备计算能力: " << prop.major << "." << prop.minor + << std::endl; + return -1; + } + + // 根据设备内存动态设置堆大小 + size_t totalGlobalMem = prop.totalGlobalMem; + size_t heapSize = 0; + + if (prop.major >= 2) { + // 对于支持动态分配的设备 + // 堆大小建议:总内存的10-25%,但不超过一定限制 + heapSize = totalGlobalMem * 0.1; // 10%的总内存 + + // 设置上限,避免太大 + const size_t maxHeapSizeBytes = maxHeapSizeMB * 1024 * 1024; // 512MB + if (heapSize > maxHeapSizeBytes) { + heapSize = maxHeapSizeBytes; + } + + // 设置下限,确保足够 + const size_t minHeapSizeBytes = minHeapSizeMB * 1024 * 1024; // 16MB + if (heapSize < minHeapSizeBytes) { + heapSize = minHeapSizeBytes; + } + + cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapSize); + } else { + std::cout << "设备不支持设备端malloc" << std::endl; + return -1; + } + + return 0; +} + +#endif // CUDA_RESAMPLE_H \ No newline at end of file diff --git a/common.h b/common.h new file mode 100644 index 0000000000000000000000000000000000000000..d573f60b7d2f8eb4c4c3c679ab4a170753e27cd2 --- /dev/null +++ b/common.h @@ -0,0 +1,125 @@ +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include +#include +#include + +#include + +/** + * @brief CUDA运行时错误检查宏,带自动定位和异常抛出功能 + * + * 该宏封装了CUDA API调用的错误检查逻辑,当CUDA操作失败时: + * 1. 自动获取当前函数名、文件名和行号,精确定位错误位置 + * 2. 输出人类可读的错误描述信息(通过cudaGetErrorString) + * 3. 抛出std::runtime_error异常终止程序(或触发上层异常处理) + * + * 采用do-while(0)结构确保宏在各种上下文(如if语句后)都能正确展开, + * 同时避免变量作用域污染和语法错误。 + * + * @param call 待检查的CUDA API调用表达式(如cudaMalloc、cudaMemcpy等) + * @note 必须在包含的环境中使用 + * @warning 异常抛出会终止当前调用栈,建议在关键CUDA操作后立即使用 + * @example + * CHECK_CUDA_ERROR(cudaMalloc(&dev_ptr, size)); // 检查内存分配 + * CHECK_CUDA_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice)); // + * 检查数据拷贝 + */ +#define CHECK_CUDA_ERROR(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":" \ + << __LINE__ << ": " << cudaGetErrorString(err) << std::endl; \ + throw std::runtime_error("CUDA error"); \ + } \ + } while (0) + +/** + * @brief 设备端(GPU/CUDA)错误日志输出宏 + * + * 用于在CUDA设备代码或设备端工具函数中输出错误日志,采用C风格printf实现, + * 支持格式化字符串和可变参数。自动附加文件名、行号和函数名,便于错误定位。 + * + * @note 1. 必须在支持C99可变参数宏的编译器中使用(如GCC 4.0+、Clang 3.0+) + * 2. 设备端代码通常不支持C++标准库(如std::cerr),故使用printf实现 + * 3. fmt参数需遵循printf格式规范,例如"%d"对应整数,"%s"对应字符串 + * + * @param[in] fmt 格式化字符串(例如"GPU memory allocation failed, size=%d + * bytes") + * @param[in] ... 可变参数列表,与fmt中的格式占位符对应 + * + * @example + * // 在CUDA核函数或设备函数中使用 + * __device__ void cuda_kernel() { + * if (threadIdx.x >= blockDim.x) { + * LOG_ERROR_DEVICE("Invalid thread index: %d (max allowed: %d)", + * threadIdx.x, blockDim.x - 1); + * } + * } + */ +#define LOG_ERROR_DEVICE(fmt, ...) \ + printf("[DEVICE ERROR] %s:%d (%s) " fmt "\n", __FILE__, __LINE__, __func__, \ + ##__VA_ARGS__) + +/** + * @brief 主机端(CPU)错误日志输出宏 + * + * 用于在CPU端代码中输出错误日志,基于C++标准库std::cerr实现,支持C++流操作符语法。 + * 自动附加文件名、行号和函数名,输出至标准错误流(stderr)而非标准输出(stdout), + * 便于日志重定向和错误捕获(例如在shell中通过2> error.log单独收集错误)。 + * + * @note 1. + * 仅可在主机端C++代码中使用,不可用于CUDA设备代码(设备端不支持std::cerr) + * 2. + * message参数支持任意可被std::ostream输出的类型(如字符串、数字、自定义类型重载<<) + * 3. 输出自动追加换行符,无需手动添加"\n" + * + * @param[in] message 错误信息(可通过流操作符合并多个值,例如"Load model + * failed: " + filename) + * + * @example + * // 在主机端函数中使用 + * void load_config_file(const std::string& path) { + * std::ifstream file(path); + * if (!file.is_open()) { + * LOG_ERROR_HOST("Failed to open config file: " << path << " (error: " + * << strerror(errno) << ")"); + * } + * } + */ +#define LOG_ERROR_HOST(message) \ + std::cerr << "[HOST ERROR] " << __FILE__ << ":" << __LINE__ << " (" \ + << __FUNCTION__ << ") " << message << std::endl + +/** + * @brief 计算两个整数相除的向上取整结果(商向正无穷方向取整) + * + * 该函数实现整数除法的向上取整,与数学中的ceil(num1/num2)行为一致。 + * 例如:7/3=2.333→向上取整为3;-7/3=-2.333→向上取整为-2;7/-3=-2.333→向上取整为-2 + * + * @param[in] num1 被除数(分子) + * @param[in] num2 除数(分母),不能为0 + * @return int 向上取整后的商 + * @note + * 1. 当除数为0时,会输出错误信息到stderr并返回0(实际应用中建议改为抛出异常) + * 2. + * 仅当存在非零余数且除数与被除数同号时,商才需要加1(避免异号情况下错误进位) + * 3. 与C++标准库的std::ceil不同,该函数仅处理整数运算,无浮点精度损失 + * @warning 除数为0时返回0是临时容错处理,生产环境应使用assert或异常机制终止程序 + */ +int quotientCeil(int num1, int num2); + +/** + * @brief 设置CUDA设备堆大小以便支持设备端malloc/free + * @param deviceId 要设置的设备ID(默认0) + * @param heapSizePercent 要分配的全局内存百分比(默认10%,范围1-50) + * @param minHeapSizeMB 最小堆大小MB(默认16MB) + * @param maxHeapSizeMB 最大堆大小MB(默认512MB) + * @return 0成功,-1失败 + */ +int SetGPUHeapSize(int deviceId, float heapSizePercent, size_t minHeapSizeMB, + size_t maxHeapSizeMB); + +#endif // __COMMON_H__ \ No newline at end of file diff --git a/cuda_resample_double.cu b/cuda_resample_double.cu new file mode 100644 index 0000000000000000000000000000000000000000..9f9ceb50caccab3b03c7463fbc0dccb08d92d361 --- /dev/null +++ b/cuda_resample_double.cu @@ -0,0 +1,1181 @@ +#include + +#include "common.h" +#include "cuda_resample_double.h" + +// 设备端Resampler初始化 +__device__ void resampler_init_state_device_double( + DeviceResamplerStateDouble *state, double *transposedCoefs, + int coefsPerPhase, int upRate, int downRate) { + if (state == nullptr) { + LOG_ERROR_DEVICE("state ptr is nullptr!"); + return; + } + state->_t = 0; + state->_xOffset = 0; + if (transposedCoefs == nullptr) { + LOG_ERROR_DEVICE("transposedCoefs ptr is nullptr!"); + return; + } + state->_transposedCoefs = transposedCoefs; + state->_coefsPerPhase = coefsPerPhase; + state->_upRate = upRate; + state->_downRate = downRate; + + // 分配状态缓冲区 + state->_state = (double *)malloc((coefsPerPhase - 1) * sizeof(double)); + if (state->_state == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for state->_state!"); + return; + } + + // 初始化状态为零 + for (int i = 0; i < coefsPerPhase - 1; i++) { + state->_state[i] = double(0); + } +} + +// 设备端:计算所需输出数量 +__device__ int resampler_needed_out_count_device_double( + int inCount, DeviceResamplerStateDouble *state) { + if (state == nullptr) { + LOG_ERROR_DEVICE("state ptr is nullptr!"); + return; + } + + int np = inCount * state->_upRate; + int need = np / state->_downRate; + + if ((state->_t + state->_upRate * state->_xOffset) < + (np % state->_downRate)) { + need++; + } + + return need; +} + +// 设备端:应用重采样 +__device__ int resampler_apply_device_double( + double *in, int inCount, double *out, int outCount, + DeviceResamplerStateDouble *state) { + if (in == nullptr) { + LOG_ERROR_DEVICE("in ptr is nullptr!"); + return; + } + if (out == nullptr) { + LOG_ERROR_DEVICE("out ptr is nullptr!"); + return; + } + if (state == nullptr) { + LOG_ERROR_DEVICE("state ptr is nullptr!"); + return; + } + + if (outCount < resampler_needed_out_count_device_double(inCount, state)) { + // 在设备端无法抛出异常,返回错误代码 + return -1; + } + + // x指向最新处理的输入样本 + double *x = in + state->_xOffset; + double *y = out; + double *end = in + inCount; + + while (x < end) { + double acc = 0; + double *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase; + double *xPtr = x - state->_coefsPerPhase + 1; + + int offset = in - xPtr; + if (offset > 0) { + // 需要从_state缓冲区中获取 + double *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset; + + while (statePtr < state->_state + (state->_coefsPerPhase - 1)) { + acc += (*statePtr++) * (*h++); + } + + xPtr += offset; + } + + while (xPtr <= x) { + acc += (*xPtr++) * (*h++); + } + + *y++ = acc; + state->_t += state->_downRate; + + int advanceAmount = state->_t / state->_upRate; + x += advanceAmount; + state->_t %= state->_upRate; + } + + state->_xOffset = x - end; + + // 管理_state缓冲区 + int retain = (state->_coefsPerPhase - 1) - inCount; + + if (retain > 0) { + // 对于小于状态缓冲区的inCount,将缓冲区的末尾复制到开头 + for (int i = 0; i < retain; i++) { + state->_state[i] = + state->_state[(state->_coefsPerPhase - 1) - retain + i]; + } + + // 然后将整个(短)输入复制到缓冲区末尾 + for (int i = 0; i < inCount; i++) { + state->_state[retain + i] = in[i]; + } + } else { + // 只将最后几个输入样本复制到状态缓冲区 + for (int i = 0; i < state->_coefsPerPhase - 1; i++) { + state->_state[i] = *end - (double)(state->_coefsPerPhase - 1) + (double)i; + } + } + + // 返回计算的样本数 + return y - out; +} + +// 设备端:转置滤波器系数(每个线程执行) +__device__ void transpose_filter_coefs_device_double(double *transposedCoefs, + double *coefs, int upRate, + int coefCount, + int coefsPerPhase) { + if (transposedCoefs == nullptr) { + LOG_ERROR_DEVICE("transposedCoefs ptr is nullptr!"); + return; + } + if (coefs == nullptr) { + LOG_ERROR_DEVICE("coefs ptr is nullptr!"); + return; + } + // 初始化转置系数为零 + for (int i = 0; i < upRate * coefsPerPhase; i++) { + transposedCoefs[i] = double(0); + } + + // 转置并翻转每个相位 + for (int i = 0; i < upRate; ++i) { + for (int j = 0; j < coefsPerPhase; ++j) { + if (j * upRate + i < coefCount) { + transposedCoefs[(coefsPerPhase - 1 - j) + i * coefsPerPhase] = + coefs[j * upRate + i]; + } + } + } +} + +// 设备端upfirdn主函数 +__device__ void upfirdn_device_double(int upRate, int downRate, double *input, + int inLength, double *filter, + int filterLength, double *results, + int *resultsCount) { + if (input == nullptr) { + LOG_ERROR_DEVICE("input ptr is nullptr!"); + return; + } + if (filter == nullptr) { + LOG_ERROR_DEVICE("filter ptr is nullptr!"); + return; + } + if (results == nullptr) { + LOG_ERROR_DEVICE("results ptr is nullptr!"); + return; + } + + // 计算填充后的系数数量 + int paddedCoefCount = filterLength; + while (paddedCoefCount % upRate) { + paddedCoefCount++; + } + + int coefsPerPhase = paddedCoefCount / upRate; + + // 分配转置系数内存 + double *transposedCoefs = (double *)malloc(paddedCoefCount * sizeof(double)); + if (transposedCoefs == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for transposedCoefs!"); + return; + } + + // 转置滤波器系数 + transpose_filter_coefs_device_double(transposedCoefs, filter, upRate, + filterLength, coefsPerPhase); + + // 创建Resampler状态 + DeviceResamplerStateDouble state; + resampler_init_state_device_double(&state, transposedCoefs, coefsPerPhase, + upRate, downRate); + + // 计算填充量 + int padding = coefsPerPhase - 1; + + // 分配填充输入内存 + double *inputPadded = (double *)malloc((inLength + padding) * sizeof(double)); + if (inputPadded == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for inputPadded!"); + return; + } + + // 复制输入并填充 + for (int i = 0; i < inLength + padding; i++) { + if (i < inLength) { + inputPadded[i] = input[i]; + } else { + inputPadded[i] = 0; + } + } + + // 计算输出大小 + int resultsCountValue = + resampler_needed_out_count_device_double(inLength + padding, &state); + + // 设置输出计数 + if (resultsCount != nullptr) { + *resultsCount = resultsCountValue; + } + + // 运行滤波 + int numSamplesComputed = resampler_apply_device_double( + inputPadded, inLength + padding, results, resultsCountValue, &state); + + // 清理设备内存 + free(inputPadded); + + if (state._state != nullptr) { + free(state._state); + state._state = nullptr; + } + + free(transposedCoefs); + state._transposedCoefs = nullptr; +} + +// 整数向上取整除法 +__device__ __forceinline__ int dev_quotientCeil(int num1, int num2) { + return (num1 + num2 - 1) / num2; +} + +// CUDA设备端GCD函数:最大公约数 +__device__ __forceinline__ int dev_gcd(int a, int b) { + while (b != 0) { + int temp = b; + b = a % b; + a = temp; + } + return a; +} + +// 生成连续递增的序列 +__device__ __forceinline__ void dev_iota_double(double *data, int size, + double start) { + if (data == nullptr) { + LOG_ERROR_DEVICE("data ptr is nullptr!"); + return; + } + for (int i = 0; i < size; i++) { + data[i] = start + double(i); + } + return; +} + +// 填充data为value +__device__ __forceinline__ void dev_fill_double(double *data, int size, + double value) { + if (data == nullptr) { + LOG_ERROR_DEVICE("data ptr is nullptr!"); + return; + } + for (int i = 0; i < size; i++) { + data[i] = value; + } + return; +} + +__device__ int dev_firls_double(double *result, int length, double *freq, + const double *amplitude, int freqSize) { + if (result == nullptr) { + LOG_ERROR_DEVICE("result ptr is nullptr!"); + return; + } + if (freq == nullptr) { + LOG_ERROR_DEVICE("freq ptr is nullptr!"); + return; + } + if (amplitude == nullptr) { + LOG_ERROR_DEVICE("amplitude ptr is nullptr!"); + return; + } + + // 计算权重大小 + int weightSize = freqSize / 2; + + // 初始化权重向量 + double *weight = (double *)malloc(weightSize * sizeof(double)); + if (weight == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for weight!"); + return -1; + } + + // 初始化weight为全1 + dev_fill_double(weight, weightSize, double(1.0)); + + // 处理频率向量 + for (int i = 0; i < freqSize; i++) { + freq[i] = freq[i] / double(2.0); + } + + int filterLength = length + 1; + length = (filterLength - 1) / 2; + + // 奇偶判断 + bool Nodd = filterLength & 1; + + // 创建和初始化向量k + int kLength = length + 1; + double *k = (double *)malloc(kLength * sizeof(double)); + if (k == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for k向量!"); + return -1; + }; + + // 初始化k向量为递增序列:0,1,2... + dev_iota_double(k, kLength, double(0.0)); + + if (!Nodd) { + for (int i = 0; i < kLength; i++) { + k[i] += double(0.5); + } + } + + if (Nodd) { + for (int i = 0; i < kLength; i++) { + k[i] = k[i + 1]; + } + kLength--; + } + + // 创建和初始化向量b + int bLength = kLength; + if (Nodd) { + bLength++; // 此处++,因为后面需要在b[0]处插入b0 + } + + double *b = (double *)malloc(bLength * sizeof(double)); + if (b == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for b向量!"); + return -1; + }; + + dev_fill_double(b, bLength, double(0.0)); + + double b0 = double(0.0); + for (int i = 0; i < freqSize; i += 2) { + double Fi = freq[i]; + double Fip1 = freq[i + 1]; + double ampi = amplitude[i]; + double ampip1 = amplitude[i + 1]; + double wt2 = powf(weight[i / 2], double(2.0)); + double m_s = (ampip1 - ampi) / (Fip1 - Fi); + double b1 = ampi - (m_s * Fi); + + if (Nodd) { + b0 += (b1 * (Fip1 - Fi)) + + m_s / double(2.0) * + (powf(Fip1, double(2.0)) - powf(Fi, double(2.0))) * wt2; + } + + // 并行计算b向量 + for (int j = 0; j < kLength; j++) { + double kj = k[j]; + b[j] += + (m_s / (double(4.0) * powf(M_PI, double(2.0))) * + (cosf(double(2.0) * M_PI * Fip1) - cosf(double(2.0) * M_PI * Fi)) / + (powf(kj, double(2.0)))) * + wt2; + + b[j] += (Fip1 * (m_s * Fip1 + b1) * sinf(double(2.0) * kj * Fip1) - + Fi * (m_s * Fi + b1) * sinf(double(2.0) * kj * Fi)) * + wt2; + } + } + + // 处理最终结果,将b0插入到b向量的开始 + if (Nodd) { + for (int i = kLength; i >= 0; i--) { + if (i > 0) { + b[i] = b[i - 1]; + } else { + b[i] = b0; + } + } + } + + // 计算a向量 + double w0 = weight[0]; + + int aLength = bLength; + double *a = (double *)malloc(aLength * sizeof(double)); + if (a == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for a向量!"); + return -1; + }; + + // vector result = {a.rbegin(), a.rend()}; + for (int i = 0; i < aLength; i++) { + a[i] = powf(w0, double(2.0)) * double(4.0) * b[i]; + result[aLength - 1 - i] = a[i]; + } + + int it = 0; + if (Nodd) { + it = 1; + } + + // 构建结果向量 + for (int i = 0; i < aLength; i++) { + result[i] = result[i] * double(0.5); + if ((i + it) < aLength) { + result[aLength + i] = a[i + it] * double(0.5); + } + } + + // 释放动态分配的内存 + free(weight); + free(k); + free(b); + free(a); + return 0; +} + +// 设备端Bessel函数 +__device__ double dev_cyl_bessel_i_double(int n, double x) { + if (n == 0) return double(1); + double bessel = double(1), bessel_prev = double(1); + for (int i = 1; i <= n; ++i) { + bessel = (double(2) * i - double(1)) / i * x * bessel_prev - bessel; + bessel_prev = bessel; + } + return bessel; +} + +// 设备端凯塞窗核函数 +__device__ void dev_kaiser_double(double *window, int order, double bta) { + if (window == nullptr) { + LOG_ERROR_DEVICE("window ptr is nullptr!"); + return; + } + + double Numerator, Denominator; + Denominator = dev_cyl_bessel_i_double(0, bta); + double od2 = (order - double(1)) / double(2); + + for (int n = 0; n < order; n++) { + double x = bta * sqrt(double(1) - powf((n - od2) / od2, double(2))); + Numerator = dev_cyl_bessel_i_double(0, x); + window[n] = Numerator / Denominator; + } +} + +__device__ void dev_resample_double(int upFactor, int downFactor, + double *inputSignal, const int inputSize, + double *outputSignal) { + if (inputSignal == nullptr) { + LOG_ERROR_DEVICE("inputSignal ptr is nullptr!"); + return; + } + if (outputSignal == nullptr) { + LOG_ERROR_DEVICE("outputSignal ptr is nullptr!"); + return; + } + + const int n = 10; + const double bta = double(5.0); + + if (upFactor <= 0 || downFactor <= 0) { + LOG_ERROR_DEVICE("upFactor and downFactor must be positive integer!"); + return; + } + + int gcd_o = dev_gcd(upFactor, downFactor); + + upFactor /= gcd_o; + downFactor /= gcd_o; + + if (upFactor == downFactor) { + for (int i = 0; i < inputSize; i++) { + outputSignal[i] = inputSignal[i]; + } + return; + } + + int outputSize = dev_quotientCeil(inputSize * upFactor, downFactor); + + int maxFactor = (upFactor > downFactor) ? upFactor : downFactor; + double firlsFreq = double(1.0) / double(2.0) / static_cast(maxFactor); + int length = 2 * n * maxFactor + 1; + + double firlsFreqsV[4]; + firlsFreqsV[0] = double(0.0); + firlsFreqsV[1] = double(2.0) * firlsFreq; + firlsFreqsV[2] = double(2.0) * firlsFreq; + firlsFreqsV[3] = double(1.0); + + double firlsAmplitudeV[4]; + firlsAmplitudeV[0] = double(1.0); + firlsAmplitudeV[1] = double(1.0); + firlsAmplitudeV[2] = double(0.0); + firlsAmplitudeV[3] = double(0.0); + + int freqSize = 4; + int coefficientsLength = length; + + double *coefficients = (double *)malloc(coefficientsLength * sizeof(double)); + if (coefficients == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for coefficients!"); + return; + } + + int ret = dev_firls_double(coefficients, length - 1, firlsFreqsV, + firlsAmplitudeV, freqSize); + if (ret == -1) { + LOG_ERROR_DEVICE("dev_firls_double function error!"); + return; + } + + int windowSize = length; + double *window = (double *)malloc(windowSize * sizeof(double)); + if (window == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for window!"); + return; + } + + dev_kaiser_double(window, length, bta); + + for (int i = 0; i < coefficientsLength; i++) { + coefficients[i] *= (upFactor * window[i]); + } + + int lengthHalf = (length - 1) / 2; + int nz = downFactor - lengthHalf % downFactor; + + // 分配filter空间 + int hSize = coefficientsLength + nz; + double *filter = + (double *)malloc((coefficientsLength + 3 * nz) * sizeof(double)); + if (filter == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for filter!"); + return; + } + + int filterLength = 0; + for (int i = 0; i < nz; i++) { + filter[i + filterLength] = double(0.0); + } + filterLength += nz; + + for (int i = 0; i < coefficientsLength; i++) { + filter[i + filterLength] = coefficients[i]; + } + filterLength += coefficientsLength; + + lengthHalf += nz; + int delay = lengthHalf / downFactor; + nz = 0; + while (dev_quotientCeil((inputSize - 1) * upFactor + hSize + nz, downFactor) - + delay < + outputSize) { + nz++; + } + + for (int i = 0; i < nz; i++) { + filter[i + filterLength] = double(0.0); + } + filterLength += nz; + + // 计算 + int paddedCoefCount = filterLength; + while (paddedCoefCount % upFactor) { + paddedCoefCount++; + } + + int coefsPerPhase = paddedCoefCount / upFactor; + int padding = coefsPerPhase - 1; + int outputCount = + ((inputSize + padding) * upFactor + downFactor - 1) / downFactor; + + double *results = (double *)malloc(outputCount * sizeof(double)); + if (results == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for upfirdn results!"); + return; + } + + int resultsCount = 0; + upfirdn_device_double(upFactor, downFactor, inputSignal, inputSize, filter, + filterLength, results, &resultsCount); + + int j = 0; + for (int i = delay; i < outputSize + delay; i++) { + outputSignal[j++] = results[i]; + } + + // 释放动态分配的内存 + free(coefficients); + free(window); + free(filter); + free(results); + return; +} + +/** + * ShiftingAndResamplingKernelDoubleV1 + * 重采样核函数:完成原始信号的移频,重采样等计算 + * 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样 + * 因此共 numChannels * numResults 个线程并行计算 + * + * @param origIdata:原始Idata + * @param origQdata:原始Qdata + * @param VDownFactor:下采样率 + * @param VFrequency:频率 + * @param VOutputLength:每个检测结果的重采样输出信号长度 + * @param numResults:每帧的检测结果总数 + * @param numChannels:信号通道数 + * @param signalLength:每个通道的信号长度 + * @param CurrentRealfreq:当前实际频率 + * @param sampling_rate + * @param outputIdata:存放所有重采样后的Idata,调用时需确保显存空间足够 + * @param outputQdata:存放所有重采样后的Qdata,调用时需确保显存空间足够 + * @return true or false + */ +__global__ void ShiftingAndResamplingKernelDoubleV1( + const double *__restrict__ origIdata, const double *__restrict__ origQdata, + const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency, + const int *__restrict__ VOutputLength, const int numResults, + const int numChannels, const int signalLength, const double CurrentRealfreq, + const double sampling_rate, double *I_shifted, double *Q_shifted, + double *__restrict__ outputIdata, double *__restrict__ outputQdata) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numResults * numChannels) return; + + // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样 + int ResIdx = idx / numChannels; // 第几个检测结果 + int chIdx = idx % numChannels; // 第几个通道 + + double frequency = VFrequency[ResIdx]; // 频率 + double deltaFreq = (CurrentRealfreq - frequency) * 1e6; + + // 获取当前线程处理的原始数据 (某一个通道的原始数据的地址) + const auto I_orig = origIdata + chIdx * signalLength; + const auto Q_orig = origQdata + chIdx * signalLength; + + // 移频:生成本振信号并相乘 + for (int i = 0; i < signalLength; i++) { + double phase = 2 * M_PI * deltaFreq * i / sampling_rate; + double cosVal = cosf(phase); + double sinVal = sinf(phase); + I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal; + Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal; + } + + // 上采样因子为1,下采样因子为downFactor + int upFactor = 1; + int downFactor = VDownFactor[ResIdx]; + + // 计算之前带宽,对应的输出信号的总长度 + int beforeTotalLength = 0; + for (int i = 0; i < ResIdx; i++) { + beforeTotalLength += VOutputLength[i]; + } + // 当前带宽对应的输出信号的起始地址偏移 + int offset = beforeTotalLength * numChannels; + + // 获取当前检测结果对应的输出信号长度(这里假设的是每个带宽对应的输出信号长度可能不相同) + int outputLength = VOutputLength[ResIdx]; + + // outputIdata:存放所有重采样后的Idata,调用时需确保内存空间足够 + auto I_resampled = outputIdata + offset + chIdx * outputLength; + // outputQdata:存放所有重采样后的Qdata,调用时需确保内存空间足够 + auto Q_resampled = outputQdata + offset + chIdx * outputLength; + + // 重采样 + dev_resample_double(upFactor, downFactor, I_shifted, signalLength, + I_resampled); + dev_resample_double(upFactor, downFactor, Q_shifted, signalLength, + Q_resampled); +} + +/** + * ShiftingAndResamplingKernelDoubleV2 + * 重采样核函数:完成原始信号的移频,重采样等计算 + * 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样 + * 因此共 numChannels * numResults 个线程并行计算 + * + * @param origIdata:原始Idata + * @param origQdata:原始Qdata + * @param VDownFactor:下采样率 + * @param VFrequency:频率 + * @param numResults:每帧的检测结果总数 + * @param numChannels:信号通道数 + * @param signalLength:每个通道的原始信号长度 + * @param CurrentRealfreq:当前实际频率 + * @param alignSignalLength:重采样后信号的对齐长度 + * @param outputIdata:存放所有重采样后的Idata,调用时需确保显存空间足够 + * @param outputQdata:存放所有重采样后的Qdata,调用时需确保显存空间足够 + * @return true or false + */ +__global__ void ShiftingAndResamplingKernelDoubleV2( + const double *__restrict__ origIdata, const double *__restrict__ origQdata, + const int *__restrict__ VDownFactor, const double *__restrict__ VFrequency, + const int numResults, const int numChannels, const int signalLength, + const double CurrentRealfreq, const int alignSignalLength, + const double sampling_rate, double *I_shifted, double *Q_shifted, + double *__restrict__ outputIdata, double *__restrict__ outputQdata) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numChannels * numResults) return; + + // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样 + int ResIdx = idx / numChannels; // 第几个检测结果 + int chIdx = idx % numChannels; // 第几个通道 + + double frequency = VFrequency[ResIdx]; // 频率 + double deltaFreq = (CurrentRealfreq - frequency) * 1e6; + + // 获取当前线程处理的通道数据地址 + const auto I_orig = origIdata + chIdx * signalLength; + const auto Q_orig = origQdata + chIdx * signalLength; + + // 移频:生成本振信号并相乘 + for (int i = 0; i < signalLength; i++) { + double phase = 2 * M_PI * deltaFreq * i / sampling_rate; + double cosVal = cosf(phase); + double sinVal = sinf(phase); + I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal; + Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal; + } + + // 上采样因子为1,下采样因子为downFactor + int upFactor = 1; + int downFactor = VDownFactor[ResIdx]; + + // outputIdata:存放所有重采样后的Idata,调用时需确保内存空间足够 + auto I_resampled = + outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength; + // outputQdata:存放所有重采样后的Qdata,调用时需确保内存空间足够 + auto Q_resampled = + outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength; + + // 重采样 + dev_resample_double(upFactor, downFactor, I_shifted, signalLength, + I_resampled); + dev_resample_double(upFactor, downFactor, Q_shifted, signalLength, + Q_resampled); +} + +/** + * ShiftAndResampleSignalDoubleV1 + * 重采样函数:完成原始信号的移频,重采样等计算 + * + * @param origIdata:原始Idata + * @param origQdata:原始Qdata + * @param outputLength:重采样后每个带宽对应的输出信号长度 + * @param downFactor:每个带宽对应的下采样率 + * @param detectFreq:每个带宽对应的频率 + * @param outputTotalLength:一个通道重采样后信号的总长度 + * @param numResults:检测结果数量 + * @param numChannels:信号通道数 + * @param CurrentRealfreq:当前实际频率 + * @param outputIdata: + * 重采样后的Idata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @param outputQdata: + * 重采样后的Qdata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @return true or false + */ +bool ShiftAndResampleSignalDoubleV1( + const std::vector> &origIdata, + const std::vector> &origQdata, + std::vector &outputLength, std::vector &downFactor, + std::vector &detectFreq, const int outputTotalLength, + const int numResults, const int numChannels, const double CurrentRealfreq, + double *outputIdata, double *outputQdata) { + // 参数合法性检查 + if (outputTotalLength <= 0) { + LOG_ERROR_HOST("outputTotalLength <= 0"); + return false; + } + if (numResults <= 0) { + LOG_ERROR_HOST("numResults <= 0"); + return false; + } + if (numChannels <= 0) { + LOG_ERROR_HOST("numChannels <= 0"); + return false; + } + + if (outputLength.size() != numResults) { + LOG_ERROR_HOST("vector outputLength lenght != numResults"); + return false; + } + if (downFactor.size() != numResults) { + LOG_ERROR_HOST("vector downFactor lenght != numResults"); + return false; + } + if (detectFreq.size() != numResults) { + LOG_ERROR_HOST("vector detectFreq lenght != numResults"); + return false; + } + + if (outputIdata == nullptr) { + LOG_ERROR_HOST("outputIdata is null ptr"); + return false; + } + if (outputQdata == nullptr) { + LOG_ERROR_HOST("outputQdata is null ptr"); + return false; + } + + // 每个通道的信号长度:这里假设所有通道的原始信号长度是相同的 + int signalLength = origIdata[0].size(); + if (signalLength <= 0) { + LOG_ERROR_HOST("signalLength <= 0"); + return false; + } + + // 设置CUDA设备堆大小以便支持设备端malloc/free + int deviceId = 0; + double heapSizePercent = 10.0f; + size_t minHeapSizeMB = 16; + size_t maxHeapSizeMB = 512; + SetGPUHeapSize(deviceId, heapSizePercent, minHeapSizeMB, maxHeapSizeMB); + + // ====准备调用重采样核函数:ShiftingAndResamplingKernel===== + // copy下采样率,频率,输出信号长度等数据到显存中 + int *d_downFactor = nullptr; + int *d_outputLength = nullptr; + double *d_frequency = nullptr; + + // 申请显存 + CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int)))); + CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int)))); + CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(double)))); + + // copy下采样率到显存中 + const int *src_downFactor = downFactor.data(); + CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor, + numResults * sizeof(int), + cudaMemcpyHostToDevice)); + + // copy每个带宽,重采样后输出信号长度到显存中 + const int *src_outputLength = outputLength.data(); + CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, src_outputLength, + numResults * sizeof(int), + cudaMemcpyHostToDevice)); + + // copy频率到显存中 + const double *src_frequency = detectFreq.data(); + CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency, + numResults * sizeof(double), + cudaMemcpyHostToDevice)); + + // 申请原始的idata和qdata所需的GPU显存,并将数据copy到GPU显存中 + double *d_Idata = nullptr; + double *d_Qdata = nullptr; + CHECK_CUDA_ERROR( + cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(double)))); + CHECK_CUDA_ERROR( + cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(double)))); + + // 将所有通道数据循环拷贝到GPU显存 + size_t copySize = signalLength * sizeof(double); + for (int i = 0; i < numChannels; i++) { + // copy 原始的 idata 到gpu显存 + double *dst_idata = d_Idata + i * signalLength; + const void *src_idata = origIdata[i].data(); + CHECK_CUDA_ERROR( + cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice)); + + // copy 原始的 qdata 到gpu显存 + double *dst_qdata = d_Qdata + i * signalLength; + const void *src_qdata = origQdata[i].data(); + CHECK_CUDA_ERROR( + cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice)); + } + + // 申请移频所需的空间 + double *I_shifted = nullptr; + double *Q_shifted = nullptr; + CHECK_CUDA_ERROR(cudaMalloc(&I_shifted, (signalLength * sizeof(double)))); + CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted, (signalLength * sizeof(double)))); + + // 申请重采样后输出信号的GPU显存 + double *d_outputIdata = nullptr; + double *d_outputQdata = nullptr; + CHECK_CUDA_ERROR(cudaMalloc( + &d_outputIdata, (numChannels * outputTotalLength * sizeof(double)))); + CHECK_CUDA_ERROR(cudaMalloc( + &d_outputQdata, (numChannels * outputTotalLength * sizeof(double)))); + + // 线程数配置 + dim3 block(numChannels); + dim3 grid((numResults * numChannels + block.x - 1) / block.x); + const double sampling_rate = double(245.76e6); + ShiftingAndResamplingKernelDoubleV1<<>>( + d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults, + numChannels, signalLength, CurrentRealfreq, sampling_rate, I_shifted, + Q_shifted, d_outputIdata, d_outputQdata); + + // copy重采样计算结果到主存 + // 存储格式:[numResults][numChannels][lengthPerResults] + // 且在内存中是连续存放的 + CHECK_CUDA_ERROR( + cudaMemcpy(outputIdata, d_outputIdata, + (numChannels * outputTotalLength * sizeof(double)), + cudaMemcpyDeviceToHost)); + + CHECK_CUDA_ERROR( + cudaMemcpy(outputQdata, d_outputQdata, + (numChannels * outputTotalLength * sizeof(double)), + cudaMemcpyDeviceToHost)); + + // 释放显存 + if (d_downFactor) { + cudaFree(d_downFactor); + d_downFactor = nullptr; + } + + if (d_outputLength) { + cudaFree(d_outputLength); + d_outputLength = nullptr; + } + + if (d_frequency) { + cudaFree(d_frequency); + d_frequency = nullptr; + } + + if (d_Idata) { + cudaFree(d_Idata); + d_Idata = nullptr; + } + + if (d_Qdata) { + cudaFree(d_Qdata); + d_Qdata = nullptr; + } + + if (I_shifted) { + cudaFree(I_shifted); + I_shifted = nullptr; + } + + if (Q_shifted) { + cudaFree(Q_shifted); + Q_shifted = nullptr; + } + + if (d_outputIdata) { + cudaFree(d_outputIdata); + d_outputIdata = nullptr; + } + + if (d_outputQdata) { + cudaFree(d_outputQdata); + d_outputQdata = nullptr; + } + + return true; +} + +/** + * ShiftAndResampleSignalDoubleV2 + * 重采样函数:完成原始信号的移频,重采样等计算 + * + * @param origIdata:原始Idata + * @param origQdata:原始Qdata + * @param downFactor:每个带宽对应的下采样率 + * @param detectFreq:每个带宽对应的频率 + * @param alignSignalLength:重采样后信号的对齐长度 + * @param numResults:检测结果数量 + * @param numChannels:信号通道数 + * @param CurrentRealfreq:当前实际频率 + * @param outputIdata: + * 重采样后的Idata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @param outputQdata: + * 重采样后的Qdata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @return true or false + */ +bool ShiftAndResampleSignalDoubleV2( + const std::vector> &origIdata, + const std::vector> &origQdata, + std::vector &downFactor, std::vector &detectFreq, + const int alignSignalLength, const int numResults, const int numChannels, + const double CurrentRealfreq, double *outputIdata, double *outputQdata) { + // 参数合法性检查 + if (alignSignalLength <= 0) { + LOG_ERROR_HOST("alignSignalLength <= 0"); + return false; + } + if (numResults <= 0) { + LOG_ERROR_HOST("numResults <= 0"); + return false; + } + if (numChannels <= 0) { + LOG_ERROR_HOST("numChannels <= 0"); + return false; + } + + if (downFactor.size() != numResults) { + LOG_ERROR_HOST("vector downFactor lenght != numResults"); + return false; + } + if (detectFreq.size() != numResults) { + LOG_ERROR_HOST("vector detectFreq lenght != numResults"); + return false; + } + + if (outputIdata == nullptr) { + LOG_ERROR_HOST("outputIdata is null ptr"); + return false; + } + if (outputQdata == nullptr) { + LOG_ERROR_HOST("outputQdata is null ptr"); + return false; + } + + // 每个通道的信号长度:这里假设所有通道的原始信号长度是相同的 + int signalLength = origIdata[0].size(); + if (signalLength <= 0) { + LOG_ERROR_HOST("signalLength <= 0"); + return false; + } + + // 设置CUDA设备堆大小以便支持设备端malloc/free + int deviceId = 0; + double heapSizePercent = 10.0f; + size_t minHeapSizeMB = 16; + size_t maxHeapSizeMB = 512; + SetGPUHeapSize(deviceId, heapSizePercent, minHeapSizeMB, maxHeapSizeMB); + + // ====准备调用重采样核函数:ShiftingAndResamplingKernel===== + // copy下采样率,频率等数据到显存中 + int *d_downFactor = nullptr; + double *d_frequency = nullptr; + // 申请显存 + CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int)))); + CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(double)))); + + // copy下采样率到显存中 + const int *src_downFactor = downFactor.data(); + CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor, + numResults * sizeof(int), + cudaMemcpyHostToDevice)); + + // copy频率到显存中 + const double *src_frequency = detectFreq.data(); + CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency, + numResults * sizeof(double), + cudaMemcpyHostToDevice)); + + // 申请原始的idata和qdata所需的GPU显存,并将数据copy到GPU显存中 + double *d_Idata = nullptr; + double *d_Qdata = nullptr; + CHECK_CUDA_ERROR( + cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(double)))); + CHECK_CUDA_ERROR( + cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(double)))); + + // 将所有通道数据循环拷贝到GPU显存 + size_t copySize = signalLength * sizeof(double); + for (int i = 0; i < numChannels; i++) { + // copy 原始的idata 到gpu显存 + double *dst_idata = d_Idata + i * signalLength; + const void *src_idata = origIdata[i].data(); + CHECK_CUDA_ERROR( + cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice)); + + // copy 原始的qdata 到gpu显存 + double *dst_qdata = d_Qdata + i * signalLength; + const void *src_qdata = origQdata[i].data(); + CHECK_CUDA_ERROR( + cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice)); + } + + // 申请移频所需的空间 + double *I_shifted = nullptr; + double *Q_shifted = nullptr; + CHECK_CUDA_ERROR(cudaMalloc(&I_shifted, (signalLength * sizeof(double)))); + CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted, (signalLength * sizeof(double)))); + + // 申请重采样后输出信号的GPU显存 + size_t totalsize = + numResults * numChannels * alignSignalLength * sizeof(double); + double *d_outputIdata = nullptr; + double *d_outputQdata = nullptr; + CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, totalsize)); + CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, totalsize)); + + // 初始化为0 + CHECK_CUDA_ERROR(cudaMemset(d_outputIdata, 0, totalsize)); + CHECK_CUDA_ERROR(cudaMemset(d_outputQdata, 0, totalsize)); + + // 线程数配置,总的线程数:numChannels * numResults + dim3 block(numChannels); + dim3 grid((numChannels * numResults + block.x - 1) / block.x); + const double sampling_rate = double(245.76e6); + ShiftingAndResamplingKernelDoubleV2<<>>( + d_Idata, d_Qdata, d_downFactor, d_frequency, numResults, numChannels, + signalLength, CurrentRealfreq, alignSignalLength, sampling_rate, + I_shifted, Q_shifted, d_outputIdata, d_outputQdata); + + // copy重采样计算结果到主存 + // 存储格式:[numResults][numChannels][lengthPerResults] + // 且在内存中是连续存放的 + CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata, totalsize, + cudaMemcpyDeviceToHost)); + + CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata, totalsize, + cudaMemcpyDeviceToHost)); + + // 释放显存 + if (d_downFactor) { + cudaFree(d_downFactor); + d_downFactor = nullptr; + } + + if (d_frequency) { + cudaFree(d_frequency); + d_frequency = nullptr; + } + + if (d_Idata) { + cudaFree(d_Idata); + d_Idata = nullptr; + } + + if (d_Qdata) { + cudaFree(d_Qdata); + d_Qdata = nullptr; + } + + if (I_shifted) { + cudaFree(I_shifted); + I_shifted = nullptr; + } + + if (Q_shifted) { + cudaFree(Q_shifted); + Q_shifted = nullptr; + } + + if (d_outputIdata) { + cudaFree(d_outputIdata); + d_outputIdata = nullptr; + } + + if (d_outputQdata) { + cudaFree(d_outputQdata); + d_outputQdata = nullptr; + } + + return true; +} \ No newline at end of file diff --git a/cuda_resample_double.h b/cuda_resample_double.h new file mode 100644 index 0000000000000000000000000000000000000000..48d33285235ebc3cbc110090e47a2eb9612e0576 --- /dev/null +++ b/cuda_resample_double.h @@ -0,0 +1,81 @@ +#ifndef CUDA_RESAMPLE_DOUBLE_H +#define CUDA_RESAMPLE_DOUBLE_H + +#include +#include +#include // CUDA数学常量头文件 +#include + +#include +#include +#include +#include + +#ifndef M_PI +#define M_PI 3.141592653589793238462643 +#endif + +// 设备端Resampler状态结构 +struct DeviceResamplerStateDouble { + int _t; // "time" (modulo upRate) + int _xOffset; // 输入偏移量 + double *_state; // 状态缓冲区指针 + double *_transposedCoefs; // 转置系数指针 + int _coefsPerPhase; // 每相系数数量 + int _upRate; // 上采样率 + int _downRate; // 下采样率 +}; + +/** + * ShiftAndResampleSignalDoubleV1 + * 重采样函数:完成原始信号的移频,重采样等计算 + * + * @param origIdata:原始Idata + * @param origQdata:原始Qdata + * @param outputLength:重采样后每个带宽对应的输出信号长度 + * @param downFactor:每个带宽对应的下采样率 + * @param detectFreq:每个带宽对应的频率 + * @param outputTotalLength:一个通道重采样后信号的总长度 + * @param numResults:检测结果数量 + * @param numChannels:信号通道数 + * @param CurrentRealfreq:当前实际频率 + * @param outputIdata: + * 重采样后的Idata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @param outputQdata: + * 重采样后的Qdata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @return true or false + */ +bool ShiftAndResampleSignalDoubleV1( + const std::vector> &origIdata, + const std::vector> &origQdata, + std::vector &outputLength, std::vector &downFactor, + std::vector &detectFreq, const int outputTotalLength, + const int numResults, const int numChannels, const double CurrentRealfreq, + double *outputIdata, double *outputQdata); + +/** + * ShiftAndResampleSignalDoubleV2 + * 重采样函数:完成原始信号的移频,重采样等计算 + * + * @param origIdata:原始Idata + * @param origQdata:原始Qdata + * @param downFactor:每个带宽对应的下采样率 + * @param detectFreq:每个带宽对应的频率 + * @param alignSignalLength:重采样后信号的对齐长度 + * @param numResults:检测结果数量 + * @param numChannels:信号通道数 + * @param CurrentRealfreq:当前实际频率 + * @param outputIdata: + * 重采样后的Idata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @param outputQdata: + * 重采样后的Qdata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @return true or false + */ +bool ShiftAndResampleSignalDoubleV2( + const std::vector> &origIdata, + const std::vector> &origQdata, + std::vector &downFactor, std::vector &detectFreq, + const int alignSignalLength, const int numResults, const int numChannels, + const double CurrentRealfreq, double *outputIdata, double *outputQdata); + +#endif // CUDA_RESAMPLE_DOUBLE_H diff --git a/cuda_resample_float.cu b/cuda_resample_float.cu new file mode 100644 index 0000000000000000000000000000000000000000..508bfc58d58c59ef2196fbedb391b02867b1f3ac --- /dev/null +++ b/cuda_resample_float.cu @@ -0,0 +1,1178 @@ +#include + +#include "common.h" +#include "cuda_resample_float.h" + +// 设备端Resampler初始化 +__device__ void resampler_init_state_device_float( + DeviceResamplerStateFloat *state, float *transposedCoefs, int coefsPerPhase, + int upRate, int downRate) { + if (state == nullptr) { + LOG_ERROR_DEVICE("state ptr is nullptr!"); + return; + } + + state->_t = 0; + state->_xOffset = 0; + if (transposedCoefs == nullptr) { + LOG_ERROR_DEVICE("transposedCoefs ptr is nullptr!"); + return; + } + state->_transposedCoefs = transposedCoefs; + state->_coefsPerPhase = coefsPerPhase; + state->_upRate = upRate; + state->_downRate = downRate; + + // 分配状态缓冲区 + state->_state = (float *)malloc((coefsPerPhase - 1) * sizeof(float)); + if (state->_state == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for state->_state!"); + return; + } + + // 初始化状态为零 + for (int i = 0; i < coefsPerPhase - 1; i++) { + state->_state[i] = float(0); + } +} + +// 设备端:计算所需输出数量 +__device__ int resampler_needed_out_count_device_float( + int inCount, DeviceResamplerStateFloat *state) { + if (state == nullptr) { + LOG_ERROR_DEVICE("state ptr is nullptr!"); + return; + } + + int np = inCount * state->_upRate; + int need = np / state->_downRate; + + if ((state->_t + state->_upRate * state->_xOffset) < + (np % state->_downRate)) { + need++; + } + + return need; +} + +// 设备端:应用重采样 +__device__ int resampler_apply_device_float(float *in, int inCount, float *out, + int outCount, + DeviceResamplerStateFloat *state) { + if (in == nullptr) { + LOG_ERROR_DEVICE("in ptr is nullptr!"); + return; + } + if (out == nullptr) { + LOG_ERROR_DEVICE("out ptr is nullptr!"); + return; + } + if (state == nullptr) { + LOG_ERROR_DEVICE("state ptr is nullptr!"); + return; + } + if (outCount < resampler_needed_out_count_device_float(inCount, state)) { + // 在设备端无法抛出异常,返回错误代码 + return -1; + } + + // x指向最新处理的输入样本 + float *x = in + state->_xOffset; + float *y = out; + float *end = in + inCount; + + while (x < end) { + float acc = 0; + float *h = state->_transposedCoefs + state->_t * state->_coefsPerPhase; + float *xPtr = x - state->_coefsPerPhase + 1; + + int offset = in - xPtr; + if (offset > 0) { + // 需要从_state缓冲区中获取 + float *statePtr = state->_state + (state->_coefsPerPhase - 1) - offset; + + while (statePtr < state->_state + (state->_coefsPerPhase - 1)) { + acc += (*statePtr++) * (*h++); + } + + xPtr += offset; + } + + while (xPtr <= x) { + acc += (*xPtr++) * (*h++); + } + + *y++ = acc; + state->_t += state->_downRate; + + int advanceAmount = state->_t / state->_upRate; + x += advanceAmount; + state->_t %= state->_upRate; + } + + state->_xOffset = x - end; + + // 管理_state缓冲区 + int retain = (state->_coefsPerPhase - 1) - inCount; + + if (retain > 0) { + // 对于小于状态缓冲区的inCount,将缓冲区的末尾复制到开头 + for (int i = 0; i < retain; i++) { + state->_state[i] = + state->_state[(state->_coefsPerPhase - 1) - retain + i]; + } + + // 然后将整个(短)输入复制到缓冲区末尾 + for (int i = 0; i < inCount; i++) { + state->_state[retain + i] = in[i]; + } + } else { + // 只将最后几个输入样本复制到状态缓冲区 + for (int i = 0; i < state->_coefsPerPhase - 1; i++) { + state->_state[i] = *end - (float)(state->_coefsPerPhase - 1) + (float)i; + } + } + + // 返回计算的样本数 + return y - out; +} + +// 设备端:转置滤波器系数(每个线程执行) +__device__ void transpose_filter_coefs_device_float(float *transposedCoefs, + float *coefs, int upRate, + int coefCount, + int coefsPerPhase) { + if (transposedCoefs == nullptr) { + LOG_ERROR_DEVICE("transposedCoefs ptr is nullptr!"); + return; + } + if (coefs == nullptr) { + LOG_ERROR_DEVICE("coefs ptr is nullptr!"); + return; + } + + // 初始化转置系数为零 + for (int i = 0; i < upRate * coefsPerPhase; i++) { + transposedCoefs[i] = float(0); + } + + // 转置并翻转每个相位 + for (int i = 0; i < upRate; ++i) { + for (int j = 0; j < coefsPerPhase; ++j) { + if (j * upRate + i < coefCount) { + transposedCoefs[(coefsPerPhase - 1 - j) + i * coefsPerPhase] = + coefs[j * upRate + i]; + } + } + } +} + +// 设备端upfirdn主函数 +__device__ void upfirdn_device_float(int upRate, int downRate, float *input, + int inLength, float *filter, + int filterLength, float *results, + int *resultsCount) { + if (input == nullptr) { + LOG_ERROR_DEVICE("input ptr is nullptr!"); + return; + } + if (filter == nullptr) { + LOG_ERROR_DEVICE("filter ptr is nullptr!"); + return; + } + if (results == nullptr) { + LOG_ERROR_DEVICE("results ptr is nullptr!"); + return; + } + + // 计算填充后的系数数量 + int paddedCoefCount = filterLength; + while (paddedCoefCount % upRate) { + paddedCoefCount++; + } + + int coefsPerPhase = paddedCoefCount / upRate; + + // 分配转置系数内存 + float *transposedCoefs = (float *)malloc(paddedCoefCount * sizeof(float)); + if (transposedCoefs == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for transposedCoefs!"); + return; + } + + // 转置滤波器系数 + transpose_filter_coefs_device_float(transposedCoefs, filter, upRate, + filterLength, coefsPerPhase); + + // 创建Resampler状态 + DeviceResamplerStateFloat state; + resampler_init_state_device_float(&state, transposedCoefs, coefsPerPhase, + upRate, downRate); + + // 计算填充量 + int padding = coefsPerPhase - 1; + + // 分配填充输入内存 + float *inputPadded = (float *)malloc((inLength + padding) * sizeof(float)); + if (inputPadded == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for inputPadded!"); + return; + } + + // 复制输入并填充 + for (int i = 0; i < inLength + padding; i++) { + if (i < inLength) { + inputPadded[i] = input[i]; + } else { + inputPadded[i] = 0; + } + } + + // 计算输出大小 + int resultsCountValue = + resampler_needed_out_count_device_float(inLength + padding, &state); + + // 设置输出计数 + if (resultsCount != nullptr) { + *resultsCount = resultsCountValue; + } + + // 运行滤波 + int numSamplesComputed = resampler_apply_device_float( + inputPadded, inLength + padding, results, resultsCountValue, &state); + + // 清理设备内存 + free(inputPadded); + + if (state._state != nullptr) { + free(state._state); + state._state = nullptr; + } + + free(transposedCoefs); + state._transposedCoefs = nullptr; +} + +// 整数向上取整除法 +__device__ __forceinline__ int dev_quotientCeil(int num1, int num2) { + return (num1 + num2 - 1) / num2; +} + +// CUDA设备端GCD函数:最大公约数 +__device__ __forceinline__ int dev_gcd(int a, int b) { + while (b != 0) { + int temp = b; + b = a % b; + a = temp; + } + return a; +} + +// 生成连续递增的序列 +__device__ __forceinline__ void dev_iota_float(float *data, int size, + float start) { + if (data == nullptr) { + LOG_ERROR_DEVICE("data ptr is nullptr!"); + return; + } + for (int i = 0; i < size; i++) { + data[i] = start + float(i); + } + return; +} + +// 填充data为value +__device__ __forceinline__ void dev_fill_float(float *data, int size, + float value) { + if (data == nullptr) { + LOG_ERROR_DEVICE("data ptr is nullptr!"); + return; + } + for (int i = 0; i < size; i++) { + data[i] = value; + } + return; +} + +__device__ int dev_firls_float(float *result, int length, float *freq, + const float *amplitude, int freqSize) { + if (result == nullptr) { + LOG_ERROR_DEVICE("result ptr is nullptr!"); + return; + } + if (freq == nullptr) { + LOG_ERROR_DEVICE("freq ptr is nullptr!"); + return; + } + if (amplitude == nullptr) { + LOG_ERROR_DEVICE("amplitude ptr is nullptr!"); + return; + } + + // 计算权重大小 + int weightSize = freqSize / 2; + + // 初始化权重向量 + float *weight = (float *)malloc(weightSize * sizeof(float)); + if (weight == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for weight!"); + return -1; + } + + // 初始化weight为全1 + dev_fill_float(weight, weightSize, float(1.0)); + + // 处理频率向量 + for (int i = 0; i < freqSize; i++) { + freq[i] = freq[i] / float(2.0); + } + + int filterLength = length + 1; + length = (filterLength - 1) / 2; + + // 奇偶判断 + bool Nodd = filterLength & 1; + + // 创建和初始化向量k + int kLength = length + 1; + float *k = (float *)malloc(kLength * sizeof(float)); + if (k == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for k向量!"); + return -1; + }; + + // 初始化k向量为递增序列:0,1,2... + dev_iota_float(k, kLength, float(0.0)); + + if (!Nodd) { + for (int i = 0; i < kLength; i++) { + k[i] += float(0.5); + } + } + + if (Nodd) { + for (int i = 0; i < kLength; i++) { + k[i] = k[i + 1]; + } + kLength--; + } + + // 创建和初始化向量b + int bLength = kLength; + if (Nodd) { + bLength++; // 此处++,因为后面需要在b[0]处插入b0 + } + + float *b = (float *)malloc(bLength * sizeof(float)); + if (b == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for b向量!"); + return -1; + }; + + dev_fill_float(b, bLength, float(0.0)); + + float b0 = float(0.0); + for (int i = 0; i < freqSize; i += 2) { + float Fi = freq[i]; + float Fip1 = freq[i + 1]; + float ampi = amplitude[i]; + float ampip1 = amplitude[i + 1]; + float wt2 = powf(weight[i / 2], float(2.0)); + float m_s = (ampip1 - ampi) / (Fip1 - Fi); + float b1 = ampi - (m_s * Fi); + + if (Nodd) { + b0 += (b1 * (Fip1 - Fi)) + + m_s / float(2.0) * (powf(Fip1, float(2.0)) - powf(Fi, float(2.0))) * + wt2; + } + + // 并行计算b向量 + for (int j = 0; j < kLength; j++) { + float kj = k[j]; + b[j] += (m_s / (float(4.0) * powf(M_PI, float(2.0))) * + (cosf(float(2.0) * M_PI * Fip1) - cosf(float(2.0) * M_PI * Fi)) / + (powf(kj, float(2.0)))) * + wt2; + + b[j] += (Fip1 * (m_s * Fip1 + b1) * sinf(float(2.0) * kj * Fip1) - + Fi * (m_s * Fi + b1) * sinf(float(2.0) * kj * Fi)) * + wt2; + } + } + + // 处理最终结果,将b0插入到b向量的开始 + if (Nodd) { + for (int i = kLength; i >= 0; i--) { + if (i > 0) { + b[i] = b[i - 1]; + } else { + b[i] = b0; + } + } + } + + // 计算a向量 + float w0 = weight[0]; + + int aLength = bLength; + float *a = (float *)malloc(aLength * sizeof(float)); + if (a == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for a向量!"); + return -1; + }; + + // vector result = {a.rbegin(), a.rend()}; + for (int i = 0; i < aLength; i++) { + a[i] = powf(w0, float(2.0)) * float(4.0) * b[i]; + result[aLength - 1 - i] = a[i]; + } + + int it = 0; + if (Nodd) { + it = 1; + } + + // 构建结果向量 + for (int i = 0; i < aLength; i++) { + result[i] = result[i] * float(0.5); + if ((i + it) < aLength) { + result[aLength + i] = a[i + it] * float(0.5); + } + } + + // 释放动态分配的内存 + free(weight); + free(k); + free(b); + free(a); + return 0; +} + +// 设备端Bessel函数 +__device__ float dev_cyl_bessel_i_float(int n, float x) { + if (n == 0) return float(1); + float bessel = float(1), bessel_prev = float(1); + for (int i = 1; i <= n; ++i) { + bessel = (float(2) * i - float(1)) / i * x * bessel_prev - bessel; + bessel_prev = bessel; + } + return bessel; +} + +// 设备端凯塞窗核函数 +__device__ void dev_kaiser_float(float *window, int order, float bta) { + if (window == nullptr) { + LOG_ERROR_DEVICE("window ptr is nullptr!"); + return; + } + float Numerator, Denominator; + Denominator = dev_cyl_bessel_i_float(0, bta); + float od2 = (order - float(1)) / float(2); + + for (int n = 0; n < order; n++) { + float x = bta * sqrt(float(1) - powf((n - od2) / od2, float(2))); + Numerator = dev_cyl_bessel_i_float(0, x); + window[n] = Numerator / Denominator; + } +} + +__device__ void dev_resample_float(int upFactor, int downFactor, + float *inputSignal, const int inputSize, + float *outputSignal) { + if (inputSignal == nullptr) { + LOG_ERROR_DEVICE("inputSignal ptr is nullptr!"); + return; + } + if (outputSignal == nullptr) { + LOG_ERROR_DEVICE("outputSignal ptr is nullptr!"); + return; + } + + const int n = 10; + const float bta = float(5.0); + + if (upFactor <= 0 || downFactor <= 0) { + LOG_ERROR_DEVICE("upFactor and downFactor must be positive integer!"); + return; + } + + int gcd_o = dev_gcd(upFactor, downFactor); + + upFactor /= gcd_o; + downFactor /= gcd_o; + + if (upFactor == downFactor) { + for (int i = 0; i < inputSize; i++) { + outputSignal[i] = inputSignal[i]; + } + return; + } + + int outputSize = dev_quotientCeil(inputSize * upFactor, downFactor); + + int maxFactor = (upFactor > downFactor) ? upFactor : downFactor; + float firlsFreq = float(1.0) / float(2.0) / static_cast(maxFactor); + int length = 2 * n * maxFactor + 1; + + float firlsFreqsV[4]; + firlsFreqsV[0] = float(0.0); + firlsFreqsV[1] = float(2.0) * firlsFreq; + firlsFreqsV[2] = float(2.0) * firlsFreq; + firlsFreqsV[3] = float(1.0); + + float firlsAmplitudeV[4]; + firlsAmplitudeV[0] = float(1.0); + firlsAmplitudeV[1] = float(1.0); + firlsAmplitudeV[2] = float(0.0); + firlsAmplitudeV[3] = float(0.0); + + int freqSize = 4; + int coefficientsLength = length; + + float *coefficients = (float *)malloc(coefficientsLength * sizeof(float)); + if (coefficients == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for coefficients!"); + return; + } + + int ret = dev_firls_float(coefficients, length - 1, firlsFreqsV, + firlsAmplitudeV, freqSize); + if (ret == -1) { + LOG_ERROR_DEVICE("dev_firls_float function error!"); + return; + } + + int windowSize = length; + float *window = (float *)malloc(windowSize * sizeof(float)); + if (window == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for window!"); + return; + } + + dev_kaiser_float(window, length, bta); + + for (int i = 0; i < coefficientsLength; i++) { + coefficients[i] *= (upFactor * window[i]); + } + + int lengthHalf = (length - 1) / 2; + int nz = downFactor - lengthHalf % downFactor; + + // 分配filter空间 + int hSize = coefficientsLength + nz; + float *filter = + (float *)malloc((coefficientsLength + 3 * nz) * sizeof(float)); + if (filter == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for filter!"); + return; + } + + int filterLength = 0; + for (int i = 0; i < nz; i++) { + filter[i + filterLength] = float(0.0); + } + filterLength += nz; + + for (int i = 0; i < coefficientsLength; i++) { + filter[i + filterLength] = coefficients[i]; + } + filterLength += coefficientsLength; + + lengthHalf += nz; + int delay = lengthHalf / downFactor; + nz = 0; + while (dev_quotientCeil((inputSize - 1) * upFactor + hSize + nz, downFactor) - + delay < + outputSize) { + nz++; + } + + for (int i = 0; i < nz; i++) { + filter[i + filterLength] = float(0.0); + } + filterLength += nz; + + // 计算 + int paddedCoefCount = filterLength; + while (paddedCoefCount % upFactor) { + paddedCoefCount++; + } + + int coefsPerPhase = paddedCoefCount / upFactor; + int padding = coefsPerPhase - 1; + int outputCount = + ((inputSize + padding) * upFactor + downFactor - 1) / downFactor; + + float *results = (float *)malloc(outputCount * sizeof(float)); + if (results == nullptr) { + LOG_ERROR_DEVICE("Failed to allocate device memory for upfirdn results!"); + return; + } + + int resultsCount = 0; + upfirdn_device_float(upFactor, downFactor, inputSignal, inputSize, filter, + filterLength, results, &resultsCount); + + int j = 0; + for (int i = delay; i < outputSize + delay; i++) { + outputSignal[j++] = results[i]; + } + + // 释放动态分配的内存 + free(coefficients); + free(window); + free(filter); + free(results); + return; +} + +/** + * ShiftingAndResamplingKernelFloatV1 + * 重采样核函数:完成原始信号的移频,重采样等计算 + * 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样 + * 因此共 numChannels * numResults 个线程并行计算 + * + * @param origIdata:原始Idata + * @param origQdata:原始Qdata + * @param VDownFactor:下采样率 + * @param VFrequency:频率 + * @param VOutputLength:每个检测结果的重采样输出信号长度 + * @param numResults:每帧的检测结果总数 + * @param numChannels:信号通道数 + * @param signalLength:每个通道的信号长度 + * @param CurrentRealfreq:当前实际频率 + * @param sampling_rate + * @param outputIdata:存放所有重采样后的Idata,调用时需确保显存空间足够 + * @param outputQdata:存放所有重采样后的Qdata,调用时需确保显存空间足够 + * @return true or false + */ +__global__ void ShiftingAndResamplingKernelFloatV1( + const float *__restrict__ origIdata, const float *__restrict__ origQdata, + const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency, + const int *__restrict__ VOutputLength, const int numResults, + const int numChannels, const int signalLength, const float CurrentRealfreq, + const float sampling_rate, float *I_shifted, float *Q_shifted, + float *__restrict__ outputIdata, float *__restrict__ outputQdata) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numResults * numChannels) return; + + // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样 + int ResIdx = idx / numChannels; // 第几个检测结果 + int chIdx = idx % numChannels; // 第几个通道 + + float frequency = VFrequency[ResIdx]; // 频率 + float deltaFreq = (CurrentRealfreq - frequency) * 1e6; + + // 获取当前线程处理的原始数据 (某一个通道的原始数据的地址) + const auto I_orig = origIdata + chIdx * signalLength; + const auto Q_orig = origQdata + chIdx * signalLength; + + // 移频:生成本振信号并相乘 + for (int i = 0; i < signalLength; i++) { + float phase = 2 * M_PI * deltaFreq * i / sampling_rate; + float cosVal = cosf(phase); + float sinVal = sinf(phase); + I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal; + Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal; + } + + // 上采样因子为1,下采样因子为downFactor + int upFactor = 1; + int downFactor = VDownFactor[ResIdx]; + + // 计算之前带宽,对应的输出信号的总长度 + int beforeTotalLength = 0; + for (int i = 0; i < ResIdx; i++) { + beforeTotalLength += VOutputLength[i]; + } + // 当前带宽对应的输出信号的起始地址偏移 + int offset = beforeTotalLength * numChannels; + + // 获取当前检测结果对应的输出信号长度(这里假设的是每个带宽对应的输出信号长度可能不相同) + int outputLength = VOutputLength[ResIdx]; + + // outputIdata:存放所有重采样后的Idata,调用时需确保内存空间足够 + auto I_resampled = outputIdata + offset + chIdx * outputLength; + // outputQdata:存放所有重采样后的Qdata,调用时需确保内存空间足够 + auto Q_resampled = outputQdata + offset + chIdx * outputLength; + + // 重采样 + dev_resample_float(upFactor, downFactor, I_shifted, signalLength, + I_resampled); + dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, + Q_resampled); +} + +/** + * ShiftingAndResamplingKernelFloatV2 + * 重采样核函数:完成原始信号的移频,重采样等计算 + * 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样 + * 因此共 numChannels * numResults 个线程并行计算 + * + * @param origIdata:原始Idata + * @param origQdata:原始Qdata + * @param VDownFactor:下采样率 + * @param VFrequency:频率 + * @param numResults:每帧的检测结果总数 + * @param numChannels:信号通道数 + * @param signalLength:每个通道的原始信号长度 + * @param CurrentRealfreq:当前实际频率 + * @param alignSignalLength:重采样后信号的对齐长度 + * @param outputIdata:存放所有重采样后的Idata,调用时需确保显存空间足够 + * @param outputQdata:存放所有重采样后的Qdata,调用时需确保显存空间足够 + * @return true or false + */ +__global__ void ShiftingAndResamplingKernelFloatV2( + const float *__restrict__ origIdata, const float *__restrict__ origQdata, + const int *__restrict__ VDownFactor, const float *__restrict__ VFrequency, + const int numResults, const int numChannels, const int signalLength, + const float CurrentRealfreq, const int alignSignalLength, + const float sampling_rate, float *I_shifted, float *Q_shifted, + float *__restrict__ outputIdata, float *__restrict__ outputQdata) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numChannels * numResults) return; + + // 每个GPU线程处理一个检测结果的一个通道的原始信号的重采样 + int ResIdx = idx / numChannels; // 第几个检测结果 + int chIdx = idx % numChannels; // 第几个通道 + + float frequency = VFrequency[ResIdx]; // 频率 + float deltaFreq = (CurrentRealfreq - frequency) * 1e6; + + // 获取当前线程处理的通道数据地址 + const auto I_orig = origIdata + chIdx * signalLength; + const auto Q_orig = origQdata + chIdx * signalLength; + + // 移频:生成本振信号并相乘 + for (int i = 0; i < signalLength; i++) { + float phase = 2 * M_PI * deltaFreq * i / sampling_rate; + float cosVal = cosf(phase); + float sinVal = sinf(phase); + I_shifted[i] = I_orig[i] * cosVal - Q_orig[i] * sinVal; + Q_shifted[i] = Q_orig[i] * cosVal + I_orig[i] * sinVal; + } + + // 上采样因子为1,下采样因子为downFactor + int upFactor = 1; + int downFactor = VDownFactor[ResIdx]; + + // outputIdata:存放所有重采样后的Idata,调用时需确保内存空间足够 + auto I_resampled = + outputIdata + (ResIdx * numChannels + chIdx) * alignSignalLength; + // outputQdata:存放所有重采样后的Qdata,调用时需确保内存空间足够 + auto Q_resampled = + outputQdata + (ResIdx * numChannels + chIdx) * alignSignalLength; + + // 重采样 + dev_resample_float(upFactor, downFactor, I_shifted, signalLength, + I_resampled); + dev_resample_float(upFactor, downFactor, Q_shifted, signalLength, + Q_resampled); +} + +/** + * ShiftAndResampleSignalFloatV1 + * 重采样函数:完成原始信号的移频,重采样等计算 + * + * @param origIdata:原始Idata + * @param origQdata:原始Qdata + * @param outputLength:重采样后每个带宽对应的输出信号长度 + * @param downFactor:每个带宽对应的下采样率 + * @param detectFreq:每个带宽对应的频率 + * @param outputTotalLength:一个通道重采样后信号的总长度 + * @param numResults:检测结果数量 + * @param numChannels:信号通道数 + * @param CurrentRealfreq:当前实际频率 + * @param outputIdata: + * 重采样后的Idata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @param outputQdata: + * 重采样后的Qdata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @return true or false + */ +bool ShiftAndResampleSignalFloatV1( + const std::vector> &origIdata, + const std::vector> &origQdata, + std::vector &outputLength, std::vector &downFactor, + std::vector &detectFreq, const int outputTotalLength, + const int numResults, const int numChannels, const float CurrentRealfreq, + float *outputIdata, float *outputQdata) { + // 参数合法性检查 + if (outputTotalLength <= 0) { + LOG_ERROR_HOST("outputTotalLength <= 0"); + return false; + } + if (numResults <= 0) { + LOG_ERROR_HOST("numResults <= 0"); + return false; + } + if (numChannels <= 0) { + LOG_ERROR_HOST("numChannels <= 0"); + return false; + } + + if (outputLength.size() != numResults) { + LOG_ERROR_HOST("vector outputLength lenght != numResults"); + return false; + } + if (downFactor.size() != numResults) { + LOG_ERROR_HOST("vector downFactor lenght != numResults"); + return false; + } + if (detectFreq.size() != numResults) { + LOG_ERROR_HOST("vector detectFreq lenght != numResults"); + return false; + } + + if (outputIdata == nullptr) { + LOG_ERROR_HOST("outputIdata is null ptr"); + return false; + } + if (outputQdata == nullptr) { + LOG_ERROR_HOST("outputQdata is null ptr"); + return false; + } + + // 每个通道的信号长度:这里假设所有通道的原始信号长度是相同的 + int signalLength = origIdata[0].size(); + if (signalLength <= 0) { + LOG_ERROR_HOST("signalLength <= 0"); + return false; + } + + // 设置CUDA设备堆大小以便支持设备端malloc/free + int deviceId = 0; + float heapSizePercent = 10.0f; + size_t minHeapSizeMB = 16; + size_t maxHeapSizeMB = 512; + SetGPUHeapSize(deviceId, heapSizePercent, minHeapSizeMB, maxHeapSizeMB); + + // ====准备调用重采样核函数:ShiftingAndResamplingKernel===== + // copy下采样率,频率,输出信号长度等数据到显存中 + int *d_downFactor = nullptr; + int *d_outputLength = nullptr; + float *d_frequency = nullptr; + + // 申请显存 + CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int)))); + CHECK_CUDA_ERROR(cudaMalloc(&d_outputLength, (numResults * sizeof(int)))); + CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(float)))); + + // copy下采样率到显存中 + const int *src_downFactor = downFactor.data(); + CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor, + numResults * sizeof(int), + cudaMemcpyHostToDevice)); + + // copy每个带宽,重采样后输出信号长度到显存中 + const int *src_outputLength = outputLength.data(); + CHECK_CUDA_ERROR(cudaMemcpy(d_outputLength, src_outputLength, + numResults * sizeof(int), + cudaMemcpyHostToDevice)); + + // copy频率到显存中 + const float *src_frequency = detectFreq.data(); + CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency, + numResults * sizeof(float), + cudaMemcpyHostToDevice)); + + // 申请原始的idata和qdata所需的GPU显存,并将数据copy到GPU显存中 + float *d_Idata = nullptr; + float *d_Qdata = nullptr; + CHECK_CUDA_ERROR( + cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(float)))); + CHECK_CUDA_ERROR( + cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(float)))); + + // 将所有通道数据循环拷贝到GPU显存 + size_t copySize = signalLength * sizeof(float); + for (int i = 0; i < numChannels; i++) { + // copy 原始的 idata 到gpu显存 + float *dst_idata = d_Idata + i * signalLength; + const void *src_idata = origIdata[i].data(); + CHECK_CUDA_ERROR( + cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice)); + + // copy 原始的 qdata 到gpu显存 + float *dst_qdata = d_Qdata + i * signalLength; + const void *src_qdata = origQdata[i].data(); + CHECK_CUDA_ERROR( + cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice)); + } + + // 申请移频所需的空间 + float *I_shifted = nullptr; + float *Q_shifted = nullptr; + CHECK_CUDA_ERROR(cudaMalloc(&I_shifted, (signalLength * sizeof(float)))); + CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted, (signalLength * sizeof(float)))); + + // 申请重采样后输出信号的GPU显存 + float *d_outputIdata = nullptr; + float *d_outputQdata = nullptr; + CHECK_CUDA_ERROR(cudaMalloc( + &d_outputIdata, (numChannels * outputTotalLength * sizeof(float)))); + CHECK_CUDA_ERROR(cudaMalloc( + &d_outputQdata, (numChannels * outputTotalLength * sizeof(float)))); + + // 线程数配置 + dim3 block(numChannels); + dim3 grid((numResults * numChannels + block.x - 1) / block.x); + const float sampling_rate = float(245.76e6); + ShiftingAndResamplingKernelFloatV1<<>>( + d_Idata, d_Qdata, d_downFactor, d_frequency, d_outputLength, numResults, + numChannels, signalLength, CurrentRealfreq, sampling_rate, I_shifted, + Q_shifted, d_outputIdata, d_outputQdata); + + // copy重采样计算结果到主存 + // 存储格式:[numResults][numChannels][lengthPerResults] + // 且在内存中是连续存放的 + CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata, + (numChannels * outputTotalLength * sizeof(float)), + cudaMemcpyDeviceToHost)); + + CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata, + (numChannels * outputTotalLength * sizeof(float)), + cudaMemcpyDeviceToHost)); + + // 释放显存 + if (d_downFactor) { + cudaFree(d_downFactor); + d_downFactor = nullptr; + } + + if (d_outputLength) { + cudaFree(d_outputLength); + d_outputLength = nullptr; + } + + if (d_frequency) { + cudaFree(d_frequency); + d_frequency = nullptr; + } + + if (d_Idata) { + cudaFree(d_Idata); + d_Idata = nullptr; + } + + if (d_Qdata) { + cudaFree(d_Qdata); + d_Qdata = nullptr; + } + + if (I_shifted) { + cudaFree(I_shifted); + I_shifted = nullptr; + } + + if (Q_shifted) { + cudaFree(Q_shifted); + Q_shifted = nullptr; + } + + if (d_outputIdata) { + cudaFree(d_outputIdata); + d_outputIdata = nullptr; + } + + if (d_outputQdata) { + cudaFree(d_outputQdata); + d_outputQdata = nullptr; + } + + return true; +} + +/** + * ShiftAndResampleSignalFloatV2 + * 重采样函数:完成原始信号的移频,重采样等计算 + * + * @param origIdata:原始Idata + * @param origQdata:原始Qdata + * @param downFactor:每个带宽对应的下采样率 + * @param detectFreq:每个带宽对应的频率 + * @param alignSignalLength:重采样后信号的对齐长度 + * @param numResults:检测结果数量 + * @param numChannels:信号通道数 + * @param CurrentRealfreq:当前实际频率 + * @param outputIdata: + * 重采样后的Idata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @param outputQdata: + * 重采样后的Qdata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @return true or false + */ +bool ShiftAndResampleSignalFloatV2( + const std::vector> &origIdata, + const std::vector> &origQdata, + std::vector &downFactor, std::vector &detectFreq, + const int alignSignalLength, const int numResults, const int numChannels, + const float CurrentRealfreq, float *outputIdata, float *outputQdata) { + // 参数合法性检查 + if (alignSignalLength <= 0) { + LOG_ERROR_HOST("alignSignalLength <= 0"); + return false; + } + if (numResults <= 0) { + LOG_ERROR_HOST("numResults <= 0"); + return false; + } + if (numChannels <= 0) { + LOG_ERROR_HOST("numChannels <= 0"); + return false; + } + + if (downFactor.size() != numResults) { + LOG_ERROR_HOST("vector downFactor lenght != numResults"); + return false; + } + if (detectFreq.size() != numResults) { + LOG_ERROR_HOST("vector detectFreq lenght != numResults"); + return false; + } + + if (outputIdata == nullptr) { + LOG_ERROR_HOST("outputIdata is null ptr"); + return false; + } + if (outputQdata == nullptr) { + LOG_ERROR_HOST("outputQdata is null ptr"); + return false; + } + + // 每个通道的信号长度:这里假设所有通道的原始信号长度是相同的 + int signalLength = origIdata[0].size(); + if (signalLength <= 0) { + LOG_ERROR_HOST("signalLength <= 0"); + return false; + } + + // 设置CUDA设备堆大小以便支持设备端malloc/free + int deviceId = 0; + float heapSizePercent = 10.0f; + size_t minHeapSizeMB = 16; + size_t maxHeapSizeMB = 512; + SetGPUHeapSize(deviceId, heapSizePercent, minHeapSizeMB, maxHeapSizeMB); + + // ====准备调用重采样核函数:ShiftingAndResamplingKernel===== + // copy下采样率,频率等数据到显存中 + int *d_downFactor = nullptr; + float *d_frequency = nullptr; + // 申请显存 + CHECK_CUDA_ERROR(cudaMalloc(&d_downFactor, (numResults * sizeof(int)))); + CHECK_CUDA_ERROR(cudaMalloc(&d_frequency, (numResults * sizeof(float)))); + + // copy下采样率到显存中 + const int *src_downFactor = downFactor.data(); + CHECK_CUDA_ERROR(cudaMemcpy(d_downFactor, src_downFactor, + numResults * sizeof(int), + cudaMemcpyHostToDevice)); + + // copy频率到显存中 + const float *src_frequency = detectFreq.data(); + CHECK_CUDA_ERROR(cudaMemcpy(d_frequency, src_frequency, + numResults * sizeof(float), + cudaMemcpyHostToDevice)); + + // 申请原始的idata和qdata所需的GPU显存,并将数据copy到GPU显存中 + float *d_Idata = nullptr; + float *d_Qdata = nullptr; + CHECK_CUDA_ERROR( + cudaMalloc(&d_Idata, (numChannels * signalLength * sizeof(float)))); + CHECK_CUDA_ERROR( + cudaMalloc(&d_Qdata, (numChannels * signalLength * sizeof(float)))); + + // 将所有通道数据循环拷贝到GPU显存 + size_t copySize = signalLength * sizeof(float); + for (int i = 0; i < numChannels; i++) { + // copy 原始的idata 到gpu显存 + float *dst_idata = d_Idata + i * signalLength; + const void *src_idata = origIdata[i].data(); + CHECK_CUDA_ERROR( + cudaMemcpy(dst_idata, src_idata, copySize, cudaMemcpyHostToDevice)); + + // copy 原始的qdata 到gpu显存 + float *dst_qdata = d_Qdata + i * signalLength; + const void *src_qdata = origQdata[i].data(); + CHECK_CUDA_ERROR( + cudaMemcpy(dst_qdata, src_qdata, copySize, cudaMemcpyHostToDevice)); + } + + // 申请移频所需的空间 + float *I_shifted = nullptr; + float *Q_shifted = nullptr; + CHECK_CUDA_ERROR(cudaMalloc(&I_shifted, (signalLength * sizeof(float)))); + CHECK_CUDA_ERROR(cudaMalloc(&Q_shifted, (signalLength * sizeof(float)))); + + // 申请重采样后输出信号的GPU显存 + size_t totalsize = + numResults * numChannels * alignSignalLength * sizeof(float); + float *d_outputIdata = nullptr; + float *d_outputQdata = nullptr; + CHECK_CUDA_ERROR(cudaMalloc(&d_outputIdata, totalsize)); + CHECK_CUDA_ERROR(cudaMalloc(&d_outputQdata, totalsize)); + + // 初始化为0 + CHECK_CUDA_ERROR(cudaMemset(d_outputIdata, 0, totalsize)); + CHECK_CUDA_ERROR(cudaMemset(d_outputQdata, 0, totalsize)); + + // 线程数配置,总的线程数:numChannels * numResults + dim3 block(numChannels); + dim3 grid((numChannels * numResults + block.x - 1) / block.x); + const float sampling_rate = float(245.76e6); + ShiftingAndResamplingKernelFloatV2<<>>( + d_Idata, d_Qdata, d_downFactor, d_frequency, numResults, numChannels, + signalLength, CurrentRealfreq, alignSignalLength, sampling_rate, + I_shifted, Q_shifted, d_outputIdata, d_outputQdata); + + // copy重采样计算结果到主存 + // 存储格式:[numResults][numChannels][lengthPerResults] + // 且在内存中是连续存放的 + CHECK_CUDA_ERROR(cudaMemcpy(outputIdata, d_outputIdata, totalsize, + cudaMemcpyDeviceToHost)); + + CHECK_CUDA_ERROR(cudaMemcpy(outputQdata, d_outputQdata, totalsize, + cudaMemcpyDeviceToHost)); + + // 释放显存 + if (d_downFactor) { + cudaFree(d_downFactor); + d_downFactor = nullptr; + } + + if (d_frequency) { + cudaFree(d_frequency); + d_frequency = nullptr; + } + + if (d_Idata) { + cudaFree(d_Idata); + d_Idata = nullptr; + } + + if (d_Qdata) { + cudaFree(d_Qdata); + d_Qdata = nullptr; + } + + if (I_shifted) { + cudaFree(I_shifted); + I_shifted = nullptr; + } + + if (Q_shifted) { + cudaFree(Q_shifted); + Q_shifted = nullptr; + } + + if (d_outputIdata) { + cudaFree(d_outputIdata); + d_outputIdata = nullptr; + } + + if (d_outputQdata) { + cudaFree(d_outputQdata); + d_outputQdata = nullptr; + } + + return true; +} \ No newline at end of file diff --git a/cuda_resample_float.h b/cuda_resample_float.h new file mode 100644 index 0000000000000000000000000000000000000000..5d6ac976cab17eb3ef8ceb96f75b87d9f9493698 --- /dev/null +++ b/cuda_resample_float.h @@ -0,0 +1,81 @@ +#ifndef CUDA_RESAMPLE_FLOAT_H +#define CUDA_RESAMPLE_FLOAT_H + +#include +#include +#include // CUDA数学常量头文件 +#include + +#include +#include +#include +#include + +#ifndef M_PI +#define M_PI 3.141592653589793238462643 +#endif + +// 设备端Resampler状态结构 +struct DeviceResamplerStateFloat { + int _t; // "time" (modulo upRate) + int _xOffset; // 输入偏移量 + float *_state; // 状态缓冲区指针 + float *_transposedCoefs; // 转置系数指针 + int _coefsPerPhase; // 每相系数数量 + int _upRate; // 上采样率 + int _downRate; // 下采样率 +}; + +/** + * ShiftAndResampleSignalFloatV1 + * 重采样函数:完成原始信号的移频,重采样等计算 + * + * @param origIdata:原始Idata + * @param origQdata:原始Qdata + * @param outputLength:重采样后每个带宽对应的输出信号长度 + * @param downFactor:每个带宽对应的下采样率 + * @param detectFreq:每个带宽对应的频率 + * @param outputTotalLength:一个通道重采样后信号的总长度 + * @param numResults:检测结果数量 + * @param numChannels:信号通道数 + * @param CurrentRealfreq:当前实际频率 + * @param outputIdata: + * 重采样后的Idata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @param outputQdata: + * 重采样后的Qdata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @return true or false + */ +bool ShiftAndResampleSignalFloatV1( + const std::vector> &origIdata, + const std::vector> &origQdata, + std::vector &outputLength, std::vector &downFactor, + std::vector &detectFreq, const int outputTotalLength, + const int numResults, const int numChannels, const float CurrentRealfreq, + float *outputIdata, float *outputQdata); + +/** + * ShiftAndResampleSignalFloatV2 + * 重采样函数:完成原始信号的移频,重采样等计算 + * + * @param origIdata:原始Idata + * @param origQdata:原始Qdata + * @param downFactor:每个带宽对应的下采样率 + * @param detectFreq:每个带宽对应的频率 + * @param alignSignalLength:重采样后信号的对齐长度 + * @param numResults:检测结果数量 + * @param numChannels:信号通道数 + * @param CurrentRealfreq:当前实际频率 + * @param outputIdata: + * 重采样后的Idata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @param outputQdata: + * 重采样后的Qdata,连续存储,格式:[numResults][numChannels][lengthPerResult] + * @return true or false + */ +bool ShiftAndResampleSignalFloatV2( + const std::vector> &origIdata, + const std::vector> &origQdata, + std::vector &downFactor, std::vector &detectFreq, + const int alignSignalLength, const int numResults, const int numChannels, + const float CurrentRealfreq, float *outputIdata, float *outputQdata); + +#endif // CUDA_RESAMPLE_FLOAT_H diff --git a/mainwindow.cpp b/mainwindow.cpp index 81143458bc38b390c208d825d616d0a3b55dc88b..a8c9d8ce66aa7303eb88cf27d877bddeead3adf1 100644 --- a/mainwindow.cpp +++ b/mainwindow.cpp @@ -10,7 +10,8 @@ using namespace std; -MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent) { +MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent) +{ InitControlValues(); InitUI(); InitConnect(); @@ -18,7 +19,8 @@ MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent) { MainWindow::~MainWindow() {} -void MainWindow::InitControlValues() { +void MainWindow::InitControlValues() +{ m_btnCalculate = new QPushButton(QStringLiteral("加载数据计算"), this); basePath = "/../data/"; @@ -26,15 +28,18 @@ void MainWindow::InitControlValues() { void MainWindow::InitUI() { setCentralWidget(m_btnCalculate); } -void MainWindow::InitConnect() { +void MainWindow::InitConnect() +{ connect(m_btnCalculate, SIGNAL(clicked()), this, SLOT(SlotCalculateClick())); } -int MainWindow::CalculateRoutine(uint signalChannels, uint signalLength) { +int MainWindow::CalculateRoutine(uint signalChannels, uint signalLength) +{ return m_calMC.CalMovingCorrlationRoutine(signalChannels, signalLength); } -void MainWindow::SlotCalculateClick() { +void MainWindow::SlotCalculateClick() +{ QString strFileName = "20250213105831_Detect_FreqList5720-5750MHz_Span15.36MHz_Point4096_2025-" "02-13 10-58-31_0.dat"; @@ -47,7 +52,8 @@ void MainWindow::SlotCalculateClick() { QString strIQFileName = QCoreApplication::applicationDirPath() + basePath + strFileName; - if (QFile::exists(strIQFileName) != true) { + if (QFile::exists(strIQFileName) != true) + { std::cerr << __FUNCTION__ << strIQFileName.toStdString() << ":文件不存在" << std::endl; return; @@ -72,18 +78,23 @@ void MainWindow::SlotCalculateClick() { // 打开回放文件 m_ReplayFile.open(strIQFileName.toStdString(), ios::in | ios::binary); - if (!m_ReplayFile) { + if (!m_ReplayFile) + { qDebug() << __FUNCTION__ << "file open error"; return; } // 循环读取每一帧数据 4096点 for (int m_uiCurrentFrame = 0; m_uiCurrentFrame < m_iframeCnt; - ++m_uiCurrentFrame) { - if (m_uiCurrentFrame < m_iframeCnt - 1) { + ++m_uiCurrentFrame) + { + if (m_uiCurrentFrame < m_iframeCnt - 1) + { oneframesize = m_vecReplayHeadposDetect.at(m_uiCurrentFrame + 1) - m_vecReplayHeadposDetect.at(m_uiCurrentFrame); - } else { + } + else + { oneframesize = m_ReplayfilesizeDetect - m_vecReplayHeadposDetect.at(m_uiCurrentFrame); } @@ -111,11 +122,13 @@ void MainWindow::SlotCalculateClick() { void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath, std::vector &headPos, - qint64 &Replayfilesize) { + qint64 &Replayfilesize) +{ std::ifstream replayfileforcalculate; replayfileforcalculate.open(ReplayFilePath.toStdString(), ios::in | ios::binary); - if (!replayfileforcalculate) { + if (!replayfileforcalculate) + { qDebug() << __FUNCTION__ << "file open error"; return; } @@ -135,7 +148,8 @@ void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath, size_t count = 0; headPos.clear(); headPos.reserve(5000); - while ((index = source.find(match, index)) < Replayfilesize) { + while ((index = source.find(match, index)) < Replayfilesize) + { unsigned int DataSize = m_droneIQParse.GetDataSize(buff + index); headPos.push_back(index); index += 8; @@ -150,17 +164,22 @@ void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath, buff = nullptr; } -void MainWindow::ReplayIQDataParse(char *buf) { - if (signalLength_ > 0) { - if (signalChannels_ == 32) { - uint channelnumber = 8; //原逻辑也是只取了前8个通道 +void MainWindow::ReplayIQDataParse(char *buf) +{ + if (signalLength_ > 0) + { + if (signalChannels_ == 32) + { + uint channelnumber = 8; // 原逻辑也是只取了前8个通道 - if (signalDatas_ == nullptr) { + if (signalDatas_ == nullptr) + { // 申请零拷贝内存,自动完成CPU内存与GPU显存数据同步 if (!m_calMC.cudaCorrelation->AllocMappMemory( (void **)&(m_calMC.cudaCorrelation->h_signals), (void **)&(m_calMC.cudaCorrelation->d_signals), - channelnumber * signalLength_ * sizeof(cpuComplex))) { + channelnumber * signalLength_ * sizeof(cpuComplex))) + { std::cerr << __FUNCTION__ << " AllocMappMemory failed." << std::endl; return; } @@ -182,3 +201,50 @@ void MainWindow::ReplayIQDataParse(char *buf) { } } } + +template +void ReplayIQDataParseV2(CalculateMovingCorrelation &m_calMC, + cpuComplex *signalDatas, const T *outputIdata, + const T *outputQdata, const int numResults, + const int numChannels, + const int signalLength) +{ + if (signalDatas == nullptr) + { + // 申请零拷贝内存,自动完成CPU内存与GPU显存数据同步 + if (!m_calMC.cudaCorrelation->AllocMappMemory( + (void **)&(m_calMC.cudaCorrelation->h_signals), + (void **)&(m_calMC.cudaCorrelation->d_signals), + numResults * numChannels * signalLength * sizeof(cpuComplex))) + { + std::cerr << __FUNCTION__ << " AllocMappMemory failed." << std::endl; + return; + } + + signalDatas = (cpuComplex *)m_calMC.cudaCorrelation->h_signals; + } + + int index = 0; + for (int i = 0; i < numResults; i++) + { + for (int j = 0; j < numChannels; j++) + { + for (int k = 0; k < signalLength; k++) + { + int idx = (i * numChannels + j) * signalLength + k; + cpuComplex data((T)outputIdata[idx], (T)outputQdata[idx]); // cpuComplex + signalDatas[index++] = data; + } + } + } + + QElapsedTimer tm; + tm.start(); + + // 每帧 SamplePoints 个点 IQ 输入 + // 计算总流程 获得最终结果 1--找到相关峰 0--未找到相关峰 + int result = m_calMC.CalMovingCorrlationRoutine(numResults * numChannels, signalLength); + + std::cout << __FUNCTION__ << " result:" << result + << " tm(ns):" << tm.nsecsElapsed() << std::endl; +} diff --git a/mainwindow.h b/mainwindow.h index a63a935761b36cadc43988f28f00ee0e69fd8d10..a5951b48d202835f038e3cffc89061b74a379ac7 100644 --- a/mainwindow.h +++ b/mainwindow.h @@ -14,10 +14,11 @@ using namespace std; -class MainWindow : public QMainWindow { +class MainWindow : public QMainWindow +{ Q_OBJECT - public: +public: MainWindow(QWidget *parent = nullptr); ~MainWindow(); @@ -34,7 +35,7 @@ class MainWindow : public QMainWindow { uint signalChannels_ = 0; uint signalLength_ = 0; - private: +private: // 获取测试数据文件中 每一帧数据的帧头下标 void GetReplayFileHeadPos(QString ReplayFilePath, std::vector &headPos, @@ -43,7 +44,7 @@ class MainWindow : public QMainWindow { // 解析每一帧数据为 8路的 IQ 原始数据 void ReplayIQDataParse(char *buf); - private: +private: QPushButton *m_btnCalculate; CalculateMovingCorrelation m_calMC; @@ -57,7 +58,15 @@ class MainWindow : public QMainWindow { ifstream m_ReplayFile; int oneframesize; - public slots: +public slots: void SlotCalculateClick(); }; -#endif // MAINWINDOW_H + +template +void ReplayIQDataParseV2(CalculateMovingCorrelation &m_calMC, + cpuComplex *signalDatas, const T *outputIdata, + const T *outputQdata, const int numResults, + const int numChannels, + const int signalLength); + +#endif // MAINWINDOW_H