Any CUDA program broadly consists of the following components:
1) Include header files
2) Kernel that executes on the CUDA device, e.g:
//__global__ void MatrixMulKernel(float *Md, float *Nd, float *Pd, int Width)
3) main( ) routine, the entry point executed by the CPU (host).
3.1:- Define pointer to host and device arrays
3.2:- Define other variables used in the program e.g. arrays etc.
3.3:- Allocate array on the host
// e.g. a_h=(float*)malloc(size);
3.4:- Allocate array on device (DRAM of the GPU)
// e.g. cudaMalloc((void**)&a_d,size);
3.5:- Copy the data from host array to device array.
// cudaMemcpy(Md_d,Md_h,size,cudaMemcpyHostToDevice);
3.6:- Kernel Call, Execution Configuration // e.g. add_array<<<n_blocks,block_size>>>(…..)
3.7:- Retrieve the result from the device into host memory, e.g.:
cudaMemcpy(Pd_h,Pd_d,size,cudaMemcpyDeviceToHost);
3.8:- Print result // for (i=0,………)
printf("%f ",a_h[i]);
3.9:- Free allocated device and host memories // e.g
free(a_h);
cudaFree(a_d);
Following the programming steps above, the program below calculates and prints the squares of the
first 100 non-negative integers (0 through 99).
// 1) Include header files
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <conio.h>
// 2) Device kernel: squares the elements of an array in place.
//    a : pointer to the float array the kernel reads and overwrites
//    N : number of elements to process
//    Each thread derives one element index from its block and thread
//    indices (x dimension only) and handles that single element; threads
//    whose index is N or greater return without touching memory.
__global__ void square_array(float*a,int N)
{
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= N)
        return;

    const float v = a[tid];
    a[tid] = v * v;
}
// 3) main( ) routine, executed on the host (CPU)
// Reports a failed CUDA runtime call on stderr.
// Returns 1 when `err` is an error, 0 when it is cudaSuccess.
static int cuda_failed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return 0;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 1;
}

// Host driver: fills an array with 0..N-1, squares it on the device with
// square_array, prints "index<TAB>value" for every element, releases the
// buffers and waits for a key press.
// Returns EXIT_SUCCESS when every step succeeded, EXIT_FAILURE otherwise.
int main(void)
{
    // 3.1:- Define pointers to the host and device arrays
    float *a_h = NULL;
    float *a_d = NULL;

    // 3.2:- Define other variables used in the program
    const int N = 100;
    const size_t size = N * sizeof(float);
    int status = EXIT_FAILURE;

    // 3.3:- Allocate the array on the host and fill it with 0..N-1
    a_h = (float *)malloc(size);
    if (a_h == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)size);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; i++)
        a_h[i] = (float)i;

    // 3.4:- Allocate the array on the device (DRAM of the GPU)
    // 3.5:- Copy the data from the host array to the device array
    if (!cuda_failed(cudaMalloc((void **)&a_d, size), "cudaMalloc") &&
        !cuda_failed(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)")) {
        // 3.6:- Kernel call, execution configuration.
        // 4 threads per block keeps the example small; real kernels
        // normally use a multiple of 32.
        const int block_size = 4;
        // Ceil-division: one extra block only when N is not a multiple of
        // block_size, so the tail elements are always covered.
        const int n_blocks = (N + block_size - 1) / block_size;
        square_array<<<n_blocks, block_size>>>(a_d, N);

        // A launch returns no status itself; a bad configuration is
        // reported through cudaGetLastError().
        // 3.7:- Retrieve the result from the device into host memory
        if (!cuda_failed(cudaGetLastError(), "square_array launch") &&
            !cuda_failed(cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost),
                         "cudaMemcpy (device to host)")) {
            // 3.8:- Print result
            for (int i = 0; i < N; i++)
                printf("%d\t%f\n", i, a_h[i]);
            status = EXIT_SUCCESS;
        }
    }

    // 3.9:- Free allocated memories on the device and host
    free(a_h);
    cudaFree(a_d);  // a_d is still NULL if cudaMalloc failed; freeing NULL is a no-op

    // Wait for a key press before exiting (getch() is from the non-standard <conio.h>)
    getch();
    return status;
}