https://mangkyu.tistory.com/85
NVIDIA가 만든 병렬 컴퓨팅 플랫폼 및 API 모델
GPGPU(General-Purpose Computing on Graphics Processing Unit)
CUDA Platform은 GPU의 가상 명령어셋과 병렬처리 요소들을 사용할 수 있도록 만들어주는 소프트웨어 레이어
Host: CPU and its memory (Host Memory)
Device: GPU and its memory (Device Memory)
1. Data copied from CPU to GPU
2. Launch VectorAdd kernel on the GPU
3. Resulting data copied from GPU to CPU
디바이스 코드는 nvcc (NVIDIA's Compiler)가 컴파일하는데, 몇가지 키워드가 필요함
__global__ 키워드는 Host에서 호출되어 Device에서 실행되는 함수에 붙이는 키워드
Block과 Thread
GPU 코드를 병렬로 처리하기 위한 단위
1개의 Block은 N개의 Thread로 구성됨
kernel <<< BlockCount, Threads-per-Block >>>(...)
#define N (2048 * 2048)
#define THREADS_PER_BLOCK 512

// Computes the dot product of a and b (each N ints) and accumulates it into *c.
// Expected launch: N / THREADS_PER_BLOCK blocks of THREADS_PER_BLOCK threads.
// Precondition: *c must be zero-initialized (e.g. cudaMemset) before launch,
// because every block adds its partial sum into *c with atomicAdd.
__global__ void dot(int *a, int *b, int *c)
{
    __shared__ int temp[THREADS_PER_BLOCK];

    int index = threadIdx.x + blockIdx.x * blockDim.x;
    // Guard the grid tail: harmless when N divides evenly (as here),
    // required for any launch config that overshoots N.
    temp[threadIdx.x] = (index < N) ? a[index] * b[index] : 0;
    __syncthreads();

    // Parallel tree reduction in shared memory: O(log blockDim) steps
    // instead of a serial O(blockDim) loop on thread 0.
    // Requires blockDim.x to be a power of two (THREADS_PER_BLOCK = 512 is).
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (threadIdx.x < stride)
            temp[threadIdx.x] += temp[threadIdx.x + stride];
        __syncthreads(); // barrier outside the divergent if: all threads reach it
    }

    // One atomic per block combines the per-block partial sums.
    if (threadIdx.x == 0)
        atomicAdd(c, temp[0]);
}
// Host driver: allocates and initializes two N-element vectors, computes
// their dot product on the GPU, and prints the result.
int main(void)
{
    int *p_a, *p_b, *p_c;
    int *p_dev_a, *p_dev_b, *p_dev_c;
    size_t cbSize = (size_t)N * sizeof(int);

    // allocate host memories
    p_a = (int *)malloc(cbSize);
    p_b = (int *)malloc(cbSize);
    p_c = (int *)malloc(sizeof(int));
    if (p_a == NULL || p_b == NULL || p_c == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // allocate device memories (cudaMalloc takes void **, hence the casts —
    // without them this does not compile as C++)
    cudaMalloc((void **)&p_dev_a, cbSize);
    cudaMalloc((void **)&p_dev_b, cbSize);
    cudaMalloc((void **)&p_dev_c, sizeof(int));

    // initialize host inputs
    for (int i = 0; i < N; ++i)
    {
        p_a[i] = i;
        p_b[i] = i;
    }

    // copy host memories to device memories
    // (fixed: the runtime API is cudaMemcpy, not cudaMemCpy)
    cudaMemcpy(p_dev_a, p_a, cbSize, cudaMemcpyHostToDevice);
    cudaMemcpy(p_dev_b, p_b, cbSize, cudaMemcpyHostToDevice);
    // the kernel accumulates into *p_dev_c with atomicAdd, so it must start
    // at 0 — cudaMalloc does not zero memory
    cudaMemset(p_dev_c, 0, sizeof(int));

    // run dot with N threads total (N / THREADS_PER_BLOCK blocks)
    dot<<<N / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(p_dev_a, p_dev_b, p_dev_c);
    // kernel launches are asynchronous and report config errors only via
    // cudaGetLastError
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));

    // copy the device sum back to the host; cudaMemcpy blocks until the
    // kernel finishes, so no explicit synchronize is needed here
    cudaMemcpy(p_c, p_dev_c, sizeof(int), cudaMemcpyDeviceToHost);
    printf("Total Sum: %d\n", *p_c);

    free(p_a);
    free(p_b);
    free(p_c);
    cudaFree(p_dev_a);
    cudaFree(p_dev_b);
    cudaFree(p_dev_c);
    return 0;
}
Atomic Operations
- atomicAdd()
- atomicSub()
- atomicMin()
- atomicMax()
- atomicInc()
- atomicDec()
- atomicExch()
- atomicCAS()