Use cudaThreadSynchronise()
Since CUDA kernel launch is asynchronous, and returns immediately, this function can be used to make sure that all kernel launches are synchronised. This can be particularly helpful in situations where we have a number of kernels in a for loop. Another situation can be in following sequential Kernel calls:
Kernel1();
Kernel2();
Kernel3(); //Kernel()3 uses the results of kernel1 and Kernel2
------
Calculating Array Indices:
Instead of
int idx = blockDim.x, blockIdx.x + threadIdx.x;
do this:
int idx = __umul24( blockDim.x, blockIdx.x) + threadIdx.x;
as this will be faster !
-------
Avoiding Branch Operations
Branch operations are Performance Killer. Let us see how we can avoid them in certain situations.
Consider the CPU code:
void CPUCode( int* input, int* output, int length)
{
for ( int i = 0; i < length; ++i )
{
output[ i ] = input[ i ] + 2 * input[ i + 1 ];
}
}
The corresponding CUDA Code can be:
__global__void GPUCode( int* input, int* output, int length)
{
int idx = __umul24( blockDim.x, blockIdx.x) + threadIdx.x;
if ( idx < length )
{
output[ idx ] = input[ idx ] + 2 * input[ idx + 1 ];
}
}
The if statement above will give rise to branch divergence, which will make the entire process serialised.
The above CUDA Code can be modified as follows:
__global__void GPUCode( int* input, int* output, int length)
{
int idx = __umul24( blockDim.x, blockIdx.x) + threadIdx.x;
idx=max(idx, 0);
idx=min(idx, length);
output[ idx ] = input[ idx ] + 2 * input[ idx + 1 ];
}
There is no branch divergence in the above CUDA code as all the threads are running the same code.
----------
Using single block for multiplying matrices with dimension 32 X 32
Accessing two data elements per Thread
int thid = threadIdx.x;
temp[2*thid] = g_idata[2*thid];
temp[2*thid+1] = g_idata[2*thid+1]
But this suffers from Bank Conflicts
----------
use_fast_math compiler option
use_fast_math compiler option of nvcc coerces every functionName() call
to the equivalent __func() call. This makes the code run faster at the cost of
slightly diminished precision and accuracy. For example __sin() will be faster than sin()
----------
Suppressing to Single Precision Calculationsif you have:
float val=0
user instead
val=0.f if you do not need double precision (by default its double precision). This will be faster being single precision .
Any CUDA program broadly consist of the following components:
1) Include header files
2) Kernel that executes on the CUDA device, e.g:
//__global__ void MatrixMulKernel(float *Md, float *Nd, float *Pd, int Width)
3) main( ) routine, the CPU must find.
3.1:- Define pointer to host and device arrays
3.2:- Define other variables used in the program e.g. arrays etc.
3.3:- Allocate array on the host
/e.g. a_h=(float*)malloc(size)
3.4:- Allocate array on device (DRAM of the GPU)
/e.g. cudaMalloc ((void**) (a_d,size))
3.5:- Copy the data from host array to device array.
// cudaMemcpy(Md_d,Md_h,size,cudaMemcpyHostToDevice);
3.6:- Kernel Call, Execution Configuration // e.g add_array<<<n block,p size>>>(…..)
3.7:- Retrieve result from device to host in the host memory, e.g;
cudaMemcpy(Pd_h,Pd_d,size,cudaMemcpyDeviceToHost);
3.8:- Print result // for (i=0,………)
printf(“%f “,,a_h[i]) ;
3.9:- Free allocated device and host memories // e.g
free(a_h);
cudaFree(a_d);
Using the above programming steps, the following program calculates and prints the square of first
1000 integers.
// 1) Include header files
#include <stdio.h>
#include <cuda.h>
#include <conio.h>
// 2) Kernel that executes on the CUDA device
__global__ void square_array(float*a,int N)
{
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N)a[idx]=a[idx]*a[idx];
}// 3) main( ) routine, the CPU must find
int main(void)
{
// 3.1:- Define pointer to host and device arrays
float*a_h,*a_d;
// 3.2:- Define other variables used in the program e.g. arrays etc.
const int N=100;
size_t size=N*sizeof(float);
// 3.3:- Allocate array on the host
a_h=(float*)malloc(size);
// 3.4:- Allocate array on device (DRAM of the GPU)
cudaMalloc((void**)&a_d,size);
for(int i=0;i<N;i++)a_h[i]=(float)i;
// 3.5:- Copy the data from host array to device array.
cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);
// 3.6:- Kernel Call, Execution Configuration
int block_size=4;
int n_blocks=N/block_size+(N%block_size==0);
square_array<<<n_blocks,block_size>>>(a_d,N);
// 3.7:- Retrieve result from device to host in the host memory, e.g;
cudaMemcpy(a_h,a_d,sizeof(float)*N,cudaMemcpyDeviceToHost);
// 3.8:- Print result
for(int i=0;i<N;i++)
printf("%d\t%f\n",i,a_h[i]);
// 3.9:- Free allocated memories on the device and host
free(a_h);
cudaFree(a_d);
getch();
}
)