An individual CUDA thread can access 1,2,4,8,or 16 bytes in a single instruction or transaction. When considered warp-wide, that translates to 32 bytes all the way up to 512 bytes. The GPU memory controller can typically issue requests to memory in granularities of 32 bytes, up to 128 bytes. Larger requests (say, 512 bytes, considered warp wide) will get issued via multiple “transactions” of typically no more than 128 bytes.


Assuming each thread in a warp read 4-byte element in cache/memory:
| 128-byte granularity (cached load) | 32-byte granularity (uncached load) | |
|---|---|---|
| Aligned and coalesced | ![]() |
![]() |
| Aligned and uncoalesced | ![]() |
![]() |
| Misaligned and coalesced | ![]() Bus utilization: 128/256=50% |
![]() Bus utilization: 128/(224-64)=80% |
| Misaligned and uncoalesced | ![]() Bus utilization: 128/384=33.3% |
![]() Bus utilization: 128/(32*6)=66.6% |
| Broadcast | ![]() Bus utilization: 4/128=3.125% |
![]() Bus utilization: 4/32=12.5% |
Check the size of the granularity on the L1/L1 cache:
nvprof --metrics l1_cache_line_size my_gpu_application
nvprof --metrics l2_cache_line_size my_gpu_application
Keep in mind that the size of the granularity on the L1/L2 cache can vary depending on the specific GPU model and architecture. Additionally, the size of the granularity may also be affected by the specific configuration of the GPU and the operating system.
Misalignment read/write occurs when offset is not a multiple of the size of granuality.
__global__ void readOffset(float *A, float *B, float *C, float *D, const int n) {
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
int offset, k;
offset = 11;
k = i + offset;
if (k < n) C[i] = A[k] + B[k]; // misalignment read
if (k < n) D[k] = A[i] + B[i]; // misalignment write
offset = 128;
k = i + offset;
if (k < n) C[i] = A[k] + B[k]; // alignment read
if (k < n) D[k] = A[i] + B[i]; // alignment write
}
Uncoalesce read/write occurs when all 32 threads in a warp do not access a contiguous chunk of memory.
__global__ void readOffset(float *A, float *B, float *C, const int n, int offset) {
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int k = (i%2==0) ? (2*i) : (i);
if (k < n) C[i] = A[k] + B[k]; // uncoaleased read
if (k < n) C[k] = A[i] + B[i]; // uncoaleased read
}
Array copy
#include <stdio.h>
// Kernel that processes an array
__global__ void processArray(float *array, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
// Each thread does some simple operation
array[idx] = array[idx] * 2.0f;
}
}
int main() {
const int numElements = 1024; // Number of elements in the array
const int elementSize = sizeof(float); // Size of each element
const int blockSize = 128; // Block size, chosen based on memory transaction size
// Ensure the array size is a multiple of the memory transaction size for alignment
const int arraySize = numElements * elementSize;
const int transactionSize = 128; // Transaction size in bytes, could be 32, 64, or 128
const int padding = (transactionSize - (arraySize % transactionSize)) % transactionSize;
const int totalArraySize = arraySize + padding;
float *d_array;
// Allocate aligned device memory
cudaMalloc(&d_array, totalArraySize);
// Initialize array on host
float *h_array = new float[numElements + padding / elementSize];
for (int i = 0; i < numElements; i++) {
h_array[i] = 1.0f; // Example initialization
}
// Copy data to device
cudaMemcpy(d_array, h_array, arraySize, cudaMemcpyHostToDevice);
// Configure blocks and grid
dim3 blocks(blockSize);
dim3 grid((numElements + blocks.x - 1) / blocks.x);
// Launch the kernel
processArray<<<grid, blocks>>>(d_array, numElements);
// Copy back the results to host
cudaMemcpy(h_array, d_array, arraySize, cudaMemcpyDeviceToHost);
// Free memory
cudaFree(d_array);
delete[] h_array;
return 0;
}
The starting address of device memory is neither a multiple of 32, 64, nor 128 bytes.
#include <cuda_runtime.h>
#include <iostream>
__global__ void processMisalignedArray(float *array, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
array[idx] = array[idx] * 2.0f; // Simple operation to demonstrate access
}
}
int main() {
const int numElements = 1024;
const int elementSize = sizeof(float);
const int misalignmentOffset = 7; // Deliberately misalign by 7 bytes
char *d_base; // Allocate as char* for byte-wise manipulation
float *d_misalignedArray;
// Allocate more memory than needed to ensure we can misalign manually
cudaMalloc((void**)&d_base, numElements * elementSize + misalignmentOffset);
// Intentionally create a misaligned float pointer
d_misalignedArray = reinterpret_cast<float*>(d_base + misalignmentOffset);
// Initialize array on host
float *h_array = new float[numElements];
for (int i = 0; i < numElements; i++) {
h_array[i] = 1.0f; // Initialize with some values
}
// Copy data to device (to the misaligned address)
cudaMemcpy(d_misalignedArray, h_array, numElements * elementSize, cudaMemcpyHostToDevice);
// Configure blocks and grid
dim3 blocks(256);
dim3 grid((numElements + blocks.x - 1) / blocks.x);
// Launch the kernel
processMisalignedArray<<<grid, blocks>>>(d_misalignedArray, numElements);
// Copy back the results to host
cudaMemcpy(h_array, d_misalignedArray, numElements * elementSize, cudaMemcpyDeviceToHost);
// Output the first few elements for demonstration
for (int i = 0; i < 10; i++) {
std::cout << "Element " << i << ": " << h_array[i] << std::endl;
}
// Clean up
cudaFree(d_base);
delete[] h_array;
return 0;
}
struct innerStruct {
float x;
float y;
};
__global__ void testInnerStruct(innerStruct *data,
innerStruct *result, const int n) {
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
// read a structure with one thread
innerStruct tmp = data[i];
tmp.x += 10.f;
tmp.y += 20.f;
// write a structure with one thread
result[i] = tmp;
}
}
The above code shows uncoalesced reading/writing, since t0 reads at data[0], t1 reads data[1], but the memory locations of data[0] and data[1] are not contiguous.
struct innerArray {
float x[N];
float y[N];
};
__global__ void testInnerArray(InnerArray *data, InnerArray *result, const int n) {
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n) {
float tmpx = data->x[i];
float tmpy = data->y[i];
tmpx += 10.f;
tmpy += 20.f;
result->x[i] = tmpx;
result->y[i] = tmpy;
}
}


https://www.cs.nthu.edu.tw/~cherung/teaching/2010gpucell/CUDA02.pdf