
// Example: pageable (default) host memory.
// Allocation — malloc'd host memory is pageable; the driver must stage each
// transfer through an internal pinned buffer, so this is the slowest path.
int *host_input_arr = (int*)malloc(sizeof(int) * elementSize);
int *host_output_arr = (int*)malloc(sizeof(int) * elementSize);
int *device_arr;
cudaMalloc((void**)&device_arr, sizeof(int) * elementSize);
// Data Transfer and Kernel Function
// NOTE(review): host_input_arr is copied before being filled — presumably the
// initialization step is elided for brevity in this snippet.
cudaMemcpy(device_arr, host_input_arr, sizeof(int) * elementSize, cudaMemcpyHostToDevice);
kernel<<<blockSize, threadsPerBlock>>>(device_arr, elementSize);
// Block until the kernel finishes (the following blocking cudaMemcpy would
// also synchronize, so this call is for clarity rather than correctness).
cudaDeviceSynchronize();
cudaMemcpy(host_output_arr, device_arr, sizeof(int) * elementSize, cudaMemcpyDeviceToHost);
// Free — device memory via cudaFree, malloc'd host memory via free.
cudaFree(device_arr);
free(host_input_arr);
free(host_output_arr);
Page-locking excessive amounts of memory with cudaMallocHost() may degrade system performance, since it reduces the amount of memory available to the system for paging. As a result, this function is best used sparingly to allocate staging areas for data exchange between host and device.

// Example: pinned (page-locked) host memory.
// Allocation — cudaHostAlloc() page-locks the host buffers so the GPU can DMA
// directly from them: faster transfers, and a prerequisite for cudaMemcpyAsync.
// (Fixed: the C runtime API cudaMallocHost(void**, size_t) takes no flags
// argument — the flags-taking allocator is cudaHostAlloc().)
int *host_input_arr;
int *host_output_arr;
int *device_arr;
cudaHostAlloc((void**)&host_input_arr, sizeof(int) * elementSize, cudaHostAllocDefault);
cudaHostAlloc((void**)&host_output_arr, sizeof(int) * elementSize, cudaHostAllocDefault);
cudaMalloc((void**)&device_arr, sizeof(int) * elementSize);
// Data Transfer and Kernel Function
cudaMemcpy(device_arr, host_input_arr, sizeof(int) * elementSize, cudaMemcpyHostToDevice);
vecMultiply<<<blockSize, threadsPerBlock>>>(device_arr, elementSize);
cudaDeviceSynchronize();
cudaMemcpy(host_output_arr, device_arr, sizeof(int) * elementSize, cudaMemcpyDeviceToHost);
// Free — page-locked host memory must be released with cudaFreeHost, not free().
cudaFree(device_arr);
cudaFreeHost(host_input_arr);
cudaFreeHost(host_output_arr);

// Example: zero-copy (mapped) host memory — the kernel accesses host memory
// directly through a device-side alias; no explicit cudaMemcpy is needed.
// NOTE(review): on systems without unified addressing, cudaHostAllocMapped
// requires cudaSetDeviceFlags(cudaDeviceMapHost) before context creation —
// confirm for the target platform.
// Allocation
int *host_input_arr;
int *device_input_arr;  // fixed: was used below without being declared
cudaHostAlloc((void**)&host_input_arr, sizeof(int) * elementSize, cudaHostAllocMapped);
cudaHostGetDevicePointer((void **)&device_input_arr, (void *) host_input_arr , 0);
// Kernel Function
vecMultiply<<<blockSize, threadsPerBlock>>>(device_input_arr, elementSize);
cudaDeviceSynchronize();
// Free — release the cudaHostAlloc'd buffer with cudaFreeHost().
// (Fixed: device_input_arr is only an alias of the host allocation and must
// not be passed to cudaFree().)
cudaFreeHost(host_input_arr);

// Example: unified (managed) memory — one pointer is valid on both host and
// device; the CUDA runtime migrates the data between them automatically.
// Allocation
int *host_input_arr;
cudaMallocManaged((void**)&host_input_arr, sizeof(int) * elementSize);
// Kernel Function — the managed pointer is passed to the kernel directly;
// no explicit cudaMemcpy is required.
vecMultiply<<<blockSize, threadsPerBlock>>>(host_input_arr, elementSize);
// Synchronize before the host reads the buffer again — kernel launches are
// asynchronous, and touching migrating managed memory early is unsafe.
cudaDeviceSynchronize();
// Free — managed memory is released with cudaFree().
cudaFree(host_input_arr);
| | Pageable | Pinned | Zero-copy | Unified Memory Access |
|:--------------:|:----------:|:----------------------------:|:------------------------:|:---------------------------:|
| GPU cached | Y | Y | N | Y |
| CPU allocation | malloc | cudaHostAlloc | cudaHostAlloc | cudaMallocManaged |
| GPU allocation | cudaMalloc | cudaMalloc | N/A | cudaMallocManaged |
| Copy | cudaMemcpy | cudaMemcpy<br>cudaMemcpyAsync | cudaHostGetDevicePointer | N/A (migrated automatically) |
| Pros | - Easy to use and manage.<br>- Abundant compared to other types.<br>- Managed automatically by OS. | - Fast data transfer to GPU.<br>- Allows asynchronous operations.<br>- Avoids paging overhead. | - Direct GPU access to host memory.<br>- No need to explicitly copy data. | - Simplifies memory management.<br>- Automatically migrates data between host and device. |
| Cons | - Slowest for GPU access due to copying overhead.<br>- Data might be paged out, causing further delays. | - Scarce resource; can lead to system performance issues if overused.<br>- Requires manual management. | - Slower than pinned memory for large data sets.<br>- May lead to performance issues in some architectures. | - Performance can be unpredictable.<br>- May not be optimal for all workloads or hardware configurations. |