//Ref : https://youtu.be/Ed_h2km0liI
#include <stdio.h>
#define SIZE 1024
//1. parallelize this function
//3. modify function call as gpu
// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
// Expects a 1-D launch. Each thread handles one element; the bounds check
// makes any launch with >= n total threads safe even when the grid does not
// divide n evenly.
__global__ void VectorAdd(int *a, int *b, int *c, int n){
    // Global thread index across the whole grid. The original used only
    // threadIdx.x, which silently drops all work outside block 0 when the
    // kernel is launched with more than one block.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        c[i] = a[i] + b[i];
}
//2. allocate memory on GPU memory
//2.1) Data copied from CPU to GPU
//2.2) Launch VectorAdd kernel on the GPU
//2.3) Resulting data copied from GPU to CPU
// Host driver: allocates host and device buffers, initializes the inputs,
// copies them to the GPU, launches VectorAdd with one block of SIZE threads,
// copies the result back, prints the first 10 entries, and frees everything.
int main(){
    int *a, *b, *c, i;
    int *d_a, *d_b, *d_c; // device pointers
    a = (int*)malloc(sizeof(int)*SIZE);
    b = (int*)malloc(sizeof(int)*SIZE);
    c = (int*)malloc(sizeof(int)*SIZE);
    // GPU-side memory
    cudaMalloc(&d_a, SIZE*sizeof(int));
    cudaMalloc(&d_b, SIZE*sizeof(int));
    cudaMalloc(&d_c, SIZE*sizeof(int));
    for(i=0;i<SIZE;++i){
        a[i] = i;
        b[i] = i;
        c[i] = 0;
    }
    cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice); // => 2.1
    cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice); // => 2.1
    // No H2D copy of c is needed: the kernel overwrites every element of d_c.
    VectorAdd<<<1, SIZE>>>(d_a, d_b, d_c, SIZE); // 1 block, SIZE threads => 2.2
    // Kernel launches return no status directly; check for launch errors here.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "VectorAdd launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Blocking copy: also synchronizes with the kernel before reading c. => 2.3
    cudaMemcpy(c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
    for(i=0;i<10;++i){
        // Fixed: the original split this string literal across two physical
        // lines (a compile error); the newline belongs inside as \n.
        printf("c[%d] = %d\n", i, c[i]);
    }
    free(a);
    free(b);
    free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}