Syntax

API

cudaMalloc

cudaError_t cudaMalloc ( void** devPtr, size_t size )

cudaMemcpy

cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )

This function exhibits synchronous behavior because the host application blocks until cudaMemcpy returns and the transfer is complete.

cudaMemset

cudaFree

__syncthreads

Thread configuration

1D block: dim3 BlockDim(int Ntx) 2D block: dim3 BlockDim(int Ntx, int Nty) 3D block: dim3 BlockDim(int Ntx, int Nty, int Ntz)

1D grid: dim3 GridDim(int Nbx) 2D grid: dim3 GridDim(int Nbx, int Nby) 3D grid: dim3 GridDim(int Nbx, int Nby, int Nbz)

Nt[xyz] is the number of threads in x/y/z direction. Nb[xyz] is the number of blocks in x/y/z direction.

Shared memory

Dynamic shared memory

Static shared Memory

See here

Kernel call

Declaration

__global__ void Kernel(argument list)

Do it

Kernel<<<dim3 GridDim, dim3 BlockDim, size_t Ns, cudaStream_t S>>>(argument list)

:::info

  1. Access to device memory only
  2. Must have void return type
  3. No support for a variable number of arguments
  4. No support for static variables
  5. No support for function pointers
  6. Exhibits asynchronous behavior: the launch returns to the host immediately, before the kernel completes

:::

Built-in variables

gridDim

blockIdx

blockDim

threadIdx

warpSize

Handling errors

// Wraps a CUDA runtime API call, printing file/line and the decoded error
// string before exiting on any failure. The do { } while (0) wrapper makes
// the macro behave as a single statement, so it expands safely inside
// unbraced if/else bodies and always requires a trailing semicolon.
#define CHECK(call)                                                        \
do                                                                         \
{                                                                          \
    const cudaError_t error = call;                                        \
    if (error != cudaSuccess)                                              \
    {                                                                      \
        printf("Error: %s:%d, ", __FILE__, __LINE__);                      \
        printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
        exit(1);                                                           \
    }                                                                      \
} while (0)

Built-in API usage:

// Wrap every runtime API call in CHECK so a failure is reported with file/line immediately.
CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));

Kernel call usage:

kernel_function<<<grid, block>>>(argument list);
// A kernel launch returns no error code; synchronizing afterwards surfaces
// any asynchronous execution error through CHECK.
CHECK(cudaDeviceSynchronize());

Timing with CPU timer

// Wall-clock time in seconds since the Unix epoch, with microsecond
// resolution, read via POSIX gettimeofday().
double cpuSecond() {
    struct timeval now;
    gettimeofday(&now, NULL);
    return (double)now.tv_sec + 1.e-6 * (double)now.tv_usec;
}

Timing kernel:

// Measure kernel wall-clock time with a host-side timer.
double iStart = cpuSecond();
kernel_name<<<grid, block>>>(argument list);
// The launch is asynchronous: block until the kernel finishes,
// otherwise iElaps would only measure the launch overhead.
cudaDeviceSynchronize();
double iElaps = cpuSecond() - iStart;