
// Example: pageable (default) host memory.
// Allocation — malloc'd host memory is pageable; the driver must stage each
// transfer through an internal pinned buffer, so this is the slowest path.
int *host_input_arr = (int*)malloc(sizeof(int) * elementSize);
int *host_output_arr = (int*)malloc(sizeof(int) * elementSize);
int *device_arr;
cudaMalloc((void**)&device_arr, sizeof(int) * elementSize);
// Data Transfer and Kernel Function
// NOTE(review): host_input_arr is copied before being filled — presumably the
// initialization step is elided for brevity in this snippet.
cudaMemcpy(device_arr, host_input_arr, sizeof(int) * elementSize, cudaMemcpyHostToDevice);
kernel<<<blockSize, threadsPerBlock>>>(device_arr, elementSize);
// Block until the kernel finishes (the following blocking cudaMemcpy would
// also synchronize, so this call is for clarity rather than correctness).
cudaDeviceSynchronize();
cudaMemcpy(host_output_arr, device_arr, sizeof(int) * elementSize, cudaMemcpyDeviceToHost);
// Free — device memory via cudaFree, malloc'd host memory via free.
cudaFree(device_arr);
free(host_input_arr);
free(host_output_arr);
Page-locking excessive amounts of memory with cudaMallocHost() may degrade system performance, since it reduces the amount of memory available to the system for paging. As a result, this function is best used sparingly to allocate staging areas for data exchange between host and device.

// Example: pinned (page-locked) host memory.
// Allocation — cudaHostAlloc() page-locks the host buffers so the GPU can DMA
// directly from them: faster transfers, and a prerequisite for cudaMemcpyAsync.
// (Fixed: the C runtime API cudaMallocHost(void**, size_t) takes no flags
// argument — the flags-taking allocator is cudaHostAlloc().)
int *host_input_arr;
int *host_output_arr;
int *device_arr;
cudaHostAlloc((void**)&host_input_arr, sizeof(int) * elementSize, cudaHostAllocDefault);
cudaHostAlloc((void**)&host_output_arr, sizeof(int) * elementSize, cudaHostAllocDefault);
cudaMalloc((void**)&device_arr, sizeof(int) * elementSize);
// Data Transfer and Kernel Function
cudaMemcpy(device_arr, host_input_arr, sizeof(int) * elementSize, cudaMemcpyHostToDevice);
vecMultiply<<<blockSize, threadsPerBlock>>>(device_arr, elementSize);
cudaDeviceSynchronize();
cudaMemcpy(host_output_arr, device_arr, sizeof(int) * elementSize, cudaMemcpyDeviceToHost);
// Free — page-locked host memory must be released with cudaFreeHost, not free().
cudaFree(device_arr);
cudaFreeHost(host_input_arr);
cudaFreeHost(host_output_arr);

// Example: zero-copy (mapped) host memory — the kernel accesses host memory
// directly through a device-side alias; no explicit cudaMemcpy is needed.
// NOTE(review): on systems without unified addressing, cudaHostAllocMapped
// requires cudaSetDeviceFlags(cudaDeviceMapHost) before context creation —
// confirm for the target platform.
// Allocation
int *host_input_arr;
int *device_input_arr;  // fixed: was used below without being declared
cudaHostAlloc((void**)&host_input_arr, sizeof(int) * elementSize, cudaHostAllocMapped);
cudaHostGetDevicePointer((void **)&device_input_arr, (void *) host_input_arr , 0);
// Kernel Function
vecMultiply<<<blockSize, threadsPerBlock>>>(device_input_arr, elementSize);
cudaDeviceSynchronize();
// Free — release the cudaHostAlloc'd buffer with cudaFreeHost().
// (Fixed: device_input_arr is only an alias of the host allocation and must
// not be passed to cudaFree().)
cudaFreeHost(host_input_arr);

// Example: unified (managed) memory — one pointer is valid on both host and
// device; the CUDA runtime migrates the data between them automatically.
// Allocation
int *host_input_arr;
cudaMallocManaged((void**)&host_input_arr, sizeof(int) * elementSize);
// Kernel Function — the managed pointer is passed to the kernel directly;
// no explicit cudaMemcpy is required.
vecMultiply<<<blockSize, threadsPerBlock>>>(host_input_arr, elementSize);
// Synchronize before the host reads the buffer again — kernel launches are
// asynchronous, and touching migrating managed memory early is unsafe.
cudaDeviceSynchronize();
// Free — managed memory is released with cudaFree().
cudaFree(host_input_arr);
| | Pageable | Pinned | Zero-copy | Unified Memory Access |
|:--------------:|:----------:|:----------------------------:|:------------------------:|:---------------------------:|
| GPU cached | Y | Y | N | Y |
| CPU allocation | malloc | cudaHostAlloc | cudaHostAlloc | cudaMallocManaged |
| GPU allocation | cudaMalloc | cudaMalloc | N/A | cudaMallocManaged |
| Copy | cudaMemcpy | cudaMemcpy<br>cudaMemcpyAsync | cudaHostGetDevicePointer | N/A (migrated automatically) |
| Pros | - Easy to use and manage.<br>- Abundant compared to other types.<br>- Managed automatically by OS. | - Fast data transfer to GPU.<br>- Allows asynchronous operations.<br>- Avoids paging overhead. | - Direct GPU access to host memory.<br>- No need to explicitly copy data. | - Simplifies memory management.<br>- Automatically migrates data between host and device. |
| Cons | - Slowest for GPU access due to copying overhead.<br>- Data might be paged out, causing further delays. | - Scarce resource; can lead to system performance issues if overused.<br>- Requires manual management. | - Slower than pinned memory for large data sets.<br>- May lead to performance issues in some architectures. | - Performance can be unpredictable.<br>- May not be optimal for all workloads or hardware configurations. |