
AlphaA/HPP_learning

This repository has not declared an open-source license file (LICENSE); before using it, check the project description and the code's upstream dependencies.
mp8_implement.cu 6.34 KB
dalong committed on 2021-06-24 14:10: update
#include <wb.h>
#include <iostream>
#define SEGMENT_LENGTH 256
#define BLOCK_SIZE 256
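// Each input vector is processed in SEGMENT_LENGTH-sized chunks; four chunks are
// in flight at a time, each bound to its own CUDA stream, so that host-to-device
// copies, kernel execution, and device-to-host copies of different chunks can overlap.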
__global__ void vecAdd(float *in1, float *in2, float *out, int len) {
    //@@ Insert code to implement vector addition here
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < len) {
        out[idx] = in1[idx] + in2[idx];
    }
}
// Return the smaller of two ints.
int myMin(int a, int b) {
    if (a < b) {
        return a;
    }
    return b;
}

// Integer ceiling division: smallest integer >= a / b (for positive a, b).
int ceil(int a, int b) {
    return (a + b - 1) / b;
}
int main(int argc, char **argv) {
    wbArg_t args;
    int inputLength;
    float *hostInput1;
    float *hostInput2;
    float *deviceInput1;
    float *deviceInput2;
    float *deviceOutput;
    float *pinnedHostInput1;
    float *pinnedHostInput2;
    float *pinnedHostOutput;

    // Four CUDA streams, one per in-flight segment.
    cudaStream_t stream0, stream1, stream2, stream3;
    cudaStreamCreate(&stream0);
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    cudaStreamCreate(&stream3);
    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength);
    hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength);
    wbTime_stop(Generic, "Importing data and creating memory on host");

    // Allocate page-locked (pinned) memory on the CPU.
    cudaMallocHost((void **) &pinnedHostInput1, sizeof(float) * inputLength);
    cudaMallocHost((void **) &pinnedHostInput2, sizeof(float) * inputLength);
    cudaMallocHost((void **) &pinnedHostOutput, sizeof(float) * inputLength);

    // Copy the imported inputs into the pinned buffers.
    memcpy(pinnedHostInput1, hostInput1, sizeof(float) * inputLength);
    memcpy(pinnedHostInput2, hostInput2, sizeof(float) * inputLength);
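    // Note: cudaMemcpyAsync can only overlap copies with kernel execution when the
    // host buffers are page-locked, which is why the inputs are staged into the
    // cudaMallocHost buffers above instead of being used directly.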
    // 1. Allocate device buffers sized for four segments (one per stream).
    cudaMalloc((void **) &deviceInput1, sizeof(float) * 4 * SEGMENT_LENGTH);
    cudaMalloc((void **) &deviceInput2, sizeof(float) * 4 * SEGMENT_LENGTH);
    cudaMalloc((void **) &deviceOutput, sizeof(float) * 4 * SEGMENT_LENGTH);

    dim3 DimGrid(ceil(SEGMENT_LENGTH, BLOCK_SIZE), 1, 1);
    dim3 DimBlock(BLOCK_SIZE, 1, 1);
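    // With SEGMENT_LENGTH == BLOCK_SIZE, each segment is handled by a single block of 256 threads.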
    // 2. Do the computation using breadth-first kernel issue:
    //    queue all H2D copies, then all kernels, then all D2H copies per round.
    for (int index = 0; index < inputLength; index += 4 * SEGMENT_LENGTH) {
        int currentPtr1 = index;
        int currentPtr2 = currentPtr1 + SEGMENT_LENGTH;
        int currentPtr3 = currentPtr2 + SEGMENT_LENGTH;
        int currentPtr4 = currentPtr3 + SEGMENT_LENGTH;
        int length1 = 0, length2 = 0, length3 = 0, length4 = 0;

        // Copy each segment's inputs to the device on its own stream.
        if (currentPtr1 < inputLength) {
            length1 = myMin(SEGMENT_LENGTH, inputLength - currentPtr1);
            cudaMemcpyAsync(&deviceInput1[0], &pinnedHostInput1[currentPtr1], sizeof(float) * length1, cudaMemcpyHostToDevice, stream0);
            cudaMemcpyAsync(&deviceInput2[0], &pinnedHostInput2[currentPtr1], sizeof(float) * length1, cudaMemcpyHostToDevice, stream0);
        }
        if (currentPtr2 < inputLength) {
            length2 = myMin(SEGMENT_LENGTH, inputLength - currentPtr2);
            cudaMemcpyAsync(&deviceInput1[SEGMENT_LENGTH], &pinnedHostInput1[currentPtr2], sizeof(float) * length2, cudaMemcpyHostToDevice, stream1);
            cudaMemcpyAsync(&deviceInput2[SEGMENT_LENGTH], &pinnedHostInput2[currentPtr2], sizeof(float) * length2, cudaMemcpyHostToDevice, stream1);
        }
        if (currentPtr3 < inputLength) {
            length3 = myMin(SEGMENT_LENGTH, inputLength - currentPtr3);
            cudaMemcpyAsync(&deviceInput1[SEGMENT_LENGTH * 2], &pinnedHostInput1[currentPtr3], sizeof(float) * length3, cudaMemcpyHostToDevice, stream2);
            cudaMemcpyAsync(&deviceInput2[SEGMENT_LENGTH * 2], &pinnedHostInput2[currentPtr3], sizeof(float) * length3, cudaMemcpyHostToDevice, stream2);
        }
        if (currentPtr4 < inputLength) {
            length4 = myMin(SEGMENT_LENGTH, inputLength - currentPtr4);
            cudaMemcpyAsync(&deviceInput1[SEGMENT_LENGTH * 3], &pinnedHostInput1[currentPtr4], sizeof(float) * length4, cudaMemcpyHostToDevice, stream3);
            cudaMemcpyAsync(&deviceInput2[SEGMENT_LENGTH * 3], &pinnedHostInput2[currentPtr4], sizeof(float) * length4, cudaMemcpyHostToDevice, stream3);
        }

        // Launch one kernel per segment on its stream.
        if (currentPtr1 < inputLength) {
            vecAdd<<<DimGrid, DimBlock, 0, stream0>>>(&deviceInput1[0], &deviceInput2[0], &deviceOutput[0], length1);
        }
        if (currentPtr2 < inputLength) {
            vecAdd<<<DimGrid, DimBlock, 0, stream1>>>(&deviceInput1[SEGMENT_LENGTH], &deviceInput2[SEGMENT_LENGTH], &deviceOutput[SEGMENT_LENGTH], length2);
        }
        if (currentPtr3 < inputLength) {
            vecAdd<<<DimGrid, DimBlock, 0, stream2>>>(&deviceInput1[SEGMENT_LENGTH * 2], &deviceInput2[SEGMENT_LENGTH * 2], &deviceOutput[SEGMENT_LENGTH * 2], length3);
        }
        if (currentPtr4 < inputLength) {
            vecAdd<<<DimGrid, DimBlock, 0, stream3>>>(&deviceInput1[SEGMENT_LENGTH * 3], &deviceInput2[SEGMENT_LENGTH * 3], &deviceOutput[SEGMENT_LENGTH * 3], length4);
        }

        // Copy each segment's result back to the pinned host buffer.
        if (currentPtr1 < inputLength) {
            cudaMemcpyAsync(&pinnedHostOutput[currentPtr1], &deviceOutput[0], sizeof(float) * length1, cudaMemcpyDeviceToHost, stream0);
        }
        if (currentPtr2 < inputLength) {
            cudaMemcpyAsync(&pinnedHostOutput[currentPtr2], &deviceOutput[SEGMENT_LENGTH], sizeof(float) * length2, cudaMemcpyDeviceToHost, stream1);
        }
        if (currentPtr3 < inputLength) {
            cudaMemcpyAsync(&pinnedHostOutput[currentPtr3], &deviceOutput[SEGMENT_LENGTH * 2], sizeof(float) * length3, cudaMemcpyDeviceToHost, stream2);
        }
        if (currentPtr4 < inputLength) {
            cudaMemcpyAsync(&pinnedHostOutput[currentPtr4], &deviceOutput[SEGMENT_LENGTH * 3], sizeof(float) * length4, cudaMemcpyDeviceToHost, stream3);
        }
    }
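    // All copies and kernels have been queued on the four streams; wait for
    // everything to finish before reading pinnedHostOutput on the host.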
    cudaDeviceSynchronize();

    std::cout << "check hostoutput" << std::endl;
    for (int index = 0; index < myMin(10, inputLength); index++) {
        std::cout << pinnedHostOutput[index] << ", ";
    }
    std::cout << std::endl;

    wbSolution(args, pinnedHostOutput, inputLength);

    // Free GPU memory.
    cudaFree(deviceInput1);
    cudaFree(deviceInput2);
    cudaFree(deviceOutput);

    // Destroy the CUDA streams.
    cudaStreamDestroy(stream0);
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaStreamDestroy(stream3);

    // Free page-locked memory.
    cudaFreeHost(pinnedHostInput1);
    cudaFreeHost(pinnedHostInput2);
    cudaFreeHost(pinnedHostOutput);

    // Free pageable memory.
    free(hostInput1);
    free(hostInput2);

    return 0;
}
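For a quick sanity check outside the wb grading harness, a CPU reference comparison like the sketch below could be called just before wbSolution; the checkResult helper is hypothetical and not part of the repository.

#include <cmath>

// Hypothetical helper: compare the streamed GPU result against a CPU reference.
// Call it after cudaDeviceSynchronize(), e.g.
//   checkResult(pinnedHostInput1, pinnedHostInput2, pinnedHostOutput, inputLength);
bool checkResult(const float *in1, const float *in2, const float *out, int len) {
    for (int i = 0; i < len; ++i) {
        float expected = in1[i] + in2[i];
        if (fabsf(out[i] - expected) > 1e-5f) {
            return false;  // mismatch at element i
        }
    }
    return true;
}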