코드 >>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include <stdlib.h>
#include <cstring>
#include <time.h>
#include <stdio.h>
#include <cmath>
#include <iostream>
#include "common.h"
// Wraps a CUDA API call so failures are reported with file/line context.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prints the CUDA error string with its source location and, when `abort`
// is true (the default), terminates the process with the error code.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPU 경고: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Sequentially sums `size` ints starting at `input`.
// Serves as the host-side reference result for validating the GPU reduction.
int reduction_cpu(int* input, int size)
{
    int total = 0;
    int* end = input + size;
    for (int* p = input; p != end; ++p)
    {
        total += *p;
    }
    return total;
}
// Prints both reduction results and reports whether they agree.
void compare_results(int gpu_result, int cpu_result)
{
    printf("GPU 결과: %d, CPU 결과: %d \n", gpu_result, cpu_result);

    const bool match = (gpu_result == cpu_result);
    if (match)
    {
        printf("GPU와 CPU의 결과 같음 \n");
    }
    else
    {
        printf("GPU와 CPU의 결과 다름 \n");
    }
}
// Neighbored-pairs block reduction.
// Each block reduces its blockDim.x-element segment of `input` in place and
// writes the block's partial sum to temp[blockIdx.x]; the host sums `temp`.
// Preconditions: blockDim.x is a power of two and `size` is a multiple of
// blockDim.x (the launch in main uses grid = size / block), so the early
// return below is uniform across every block and the pair access
// input[gid + offset] stays inside the block's segment.
__global__ void reduction_neighbored_pairs(int* input,
    int* temp, int size)
{
    int tid = threadIdx.x;
    int gid = blockDim.x * blockIdx.x + threadIdx.x;

    // gid == size is already one past the end, so the guard must be >=
    // (the original `gid > size` let thread gid == size run out of bounds).
    if (gid >= size)
        return;

    for (int offset = 1; offset <= blockDim.x / 2; offset *= 2)
    {
        if (tid % (2 * offset) == 0)
        {
            input[gid] += input[gid + offset];
        }
        // Barrier between rounds: with blockDim.x > warp size, the partial
        // sums written above by other warps must be visible before the next
        // round reads them. Without this the in-place update races.
        __syncthreads();
    }

    if (tid == 0)
    {
        temp[blockIdx.x] = input[gid];
    }
}
// Driver: fills a host buffer, reduces it on both CPU and GPU, and compares.
int main()
{
    printf("Start neighbored pairs reduction kernel. \n");

    int size = 1 << 27;                      // 2^27 elements = 512 MB of int data
    size_t byte_size = size * sizeof(int);   // size_t avoids any int-overflow risk
    int block_size = 128;

    int* h_input = (int*)malloc(byte_size);
    if (h_input == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        return EXIT_FAILURE;
    }
    // NOTE(review): the input is all zeros, so both reductions trivially
    // yield 0 — a non-zero fill would make this comparison a stronger check.
    memset(h_input, 0, byte_size);

    int cpu_result = reduction_cpu(h_input, size);

    dim3 block(block_size);
    dim3 grid(size / block.x);   // size is a power of two, so this divides evenly
    printf("grid.x : %d, block.x : %d\n",
        grid.x, block.x);

    size_t temp_array_byte_size = sizeof(int) * grid.x;
    int* h_ref = (int*)malloc(temp_array_byte_size);
    if (h_ref == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        free(h_input);
        return EXIT_FAILURE;
    }

    int* d_input, * d_temp;
    gpuErrchk(cudaMalloc((void**)&d_input, byte_size));
    gpuErrchk(cudaMalloc((void**)&d_temp, temp_array_byte_size));
    gpuErrchk(cudaMemset(d_temp, 0, temp_array_byte_size));
    gpuErrchk(cudaMemcpy(d_input, h_input, byte_size, cudaMemcpyHostToDevice));

    reduction_neighbored_pairs << <grid, block >> > (d_input, d_temp, size);
    gpuErrchk(cudaGetLastError());        // catches launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize());   // catches asynchronous execution errors

    gpuErrchk(cudaMemcpy(h_ref, d_temp, temp_array_byte_size, cudaMemcpyDeviceToHost));

    // Final pass on the host: sum the per-block partial results.
    int gpu_result = 0;
    for (int i = 0; i < grid.x; i++)
    {
        gpu_result += h_ref[i];
    }

    compare_results(gpu_result, cpu_result);

    gpuErrchk(cudaFree(d_input));
    gpuErrchk(cudaFree(d_temp));
    free(h_ref);
    free(h_input);
    gpuErrchk(cudaDeviceReset());
    return 0;
}
'Cuda' 카테고리의 다른 글:

- C++ CUDA Warp 발산 (0) | 2021.05.31
- C++ Cuda 시스템 정보, GPU 정보 불러오기 (0) | 2021.05.31
- C++ Cuda CPU와 GPU에서의 연산 속도 비교하기 (0) | 2021.05.30
- C++ Cuda 메모리 할당에서의 예외 처리 (0) | 2021.05.30
- C++ Cuda GPU에서의 더하기 연산 (0) | 2021.05.30