// CUDA example: 256-bin histogram computed with per-block shared-memory partials.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define SIZE 1000
#define NUM_BIN 256
// Accumulates a NUM_BIN-bin histogram of d_a (SIZE ints, each value must be
// in [0, NUM_BIN)) into d_b. Each block builds a partial histogram in shared
// memory, then flushes it to global memory with one atomicAdd per bin.
//
// Launch: any 1D grid/block configuration. The input is consumed with a
// grid-stride loop and the bin init/flush loops stride by blockDim.x, so
// correctness does not depend on blockDim.x == NUM_BIN.
// d_b must be zeroed by the caller before the launch.
__global__ void histogram_shared_memory(int *d_b, int *d_a)
{
	__shared__ int cache[NUM_BIN];

	// Zero the block-local bins; strided so any block size works.
	for (int bin = threadIdx.x; bin < NUM_BIN; bin += blockDim.x)
		cache[bin] = 0;
	__syncthreads();

	// Grid-stride loop over the input; each element increments its bin.
	// Atomics are needed: threads of the same block may hit the same bin.
	int stride = blockDim.x * gridDim.x;
	for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < SIZE; i += stride)
		atomicAdd(&cache[d_a[i]], 1);
	__syncthreads();

	// Merge this block's partial histogram into the global result.
	for (int bin = threadIdx.x; bin < NUM_BIN; bin += blockDim.x)
		atomicAdd(&d_b[bin], cache[bin]);
}
// Aborts with a descriptive message if a CUDA runtime call failed.
static void checkCuda(cudaError_t err, const char *what)
{
	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}

int main()
{
	// Build the input on the host: values cycle through [0, NUM_BIN),
	// so every bin receives either SIZE/NUM_BIN or SIZE/NUM_BIN + 1 counts.
	int h_a[SIZE];
	for (int i = 0; i < SIZE; i++) {
		h_a[i] = i % NUM_BIN;
	}
	int h_b[NUM_BIN];

	// Allocate device buffers.
	int *d_a;
	int *d_b;
	checkCuda(cudaMalloc((void **)&d_a, SIZE * sizeof(int)), "cudaMalloc d_a");
	checkCuda(cudaMalloc((void **)&d_b, NUM_BIN * sizeof(int)), "cudaMalloc d_b");

	// Copy the input over; zero the result bins directly on the device
	// (the kernel requires d_b to start at zero).
	checkCuda(cudaMemcpy(d_a, h_a, SIZE * sizeof(int), cudaMemcpyHostToDevice),
	          "cudaMemcpy h_a -> d_a");
	checkCuda(cudaMemset(d_b, 0, NUM_BIN * sizeof(int)), "cudaMemset d_b");

	// Ceil-divide so at least one block is launched and every element is
	// covered even when SIZE is not a multiple of the block size; the
	// kernel's grid-stride loop tolerates any grid size anyway.
	int threads = 256;
	int blocks = (SIZE + threads - 1) / threads;
	histogram_shared_memory<<<blocks, threads>>>(d_b, d_a);
	checkCuda(cudaGetLastError(), "kernel launch");

	// Blocking copy back to the host; this also synchronizes with the kernel.
	checkCuda(cudaMemcpy(h_b, d_b, NUM_BIN * sizeof(int), cudaMemcpyDeviceToHost),
	          "cudaMemcpy d_b -> h_b");

	printf("Histogram using %d bins is:\n", NUM_BIN);
	for (int i = 0; i < NUM_BIN; i++) {
		printf("bin %d: count %d\n", i, h_b[i]);
	}

	// Release device memory.
	checkCuda(cudaFree(d_a), "cudaFree d_a");
	checkCuda(cudaFree(d_b), "cudaFree d_b");
	return 0;
}