threadIdx.x.

Row-major write/column-major read:
/**
 * Stores each thread's linear index into a shared-memory tile in row-major
 * order, then loads the tile back in column-major (transposed) order and
 * writes the loaded value to global memory.
 *
 * Launch layout: a single 2D thread block that fits inside the tile
 * (blockDim.x <= BDIMX, blockDim.y <= BDIMY). blockIdx is not used, so every
 * block of a larger grid would write the same out[] range.
 * Shared memory: BDIMY * BDIMX ints, statically allocated.
 *
 * NOTE(review): the name says "ReadRow", but the load below walks the tile by
 * column (row-major write / column-major read). The name is kept so existing
 * launch sites keep compiling -- confirm whether it was meant to be
 * setRowReadCol.
 *
 * @param out  device-accessible buffer of at least blockDim.x * blockDim.y
 *             ints; out[idx] receives the tile element loaded by thread idx.
 */
__global__ void setRowReadRow(int *out) {
    // static shared memory
    __shared__ int tile[BDIMY][BDIMX];

    // mapping from thread index to global memory index
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;

    // Transposed coordinates recovered from the linear index. For a square
    // block these are exactly (threadIdx.y, threadIdx.x); for a rectangular
    // block they stay inside the region written below, whereas indexing the
    // tile directly with [threadIdx.x][threadIdx.y] would run past it.
    unsigned int irow = idx / blockDim.y;  // < blockDim.x: valid tile column
    unsigned int icol = idx % blockDim.y;  // < blockDim.y: valid tile row

    // shared memory store operation (conflict-free): neighbouring threadIdx.x
    // values write neighbouring ints of one tile row
    tile[threadIdx.y][threadIdx.x] = idx;

    // wait for all threads to complete their stores before any thread loads
    // an element written by a different thread
    __syncthreads();

    // shared memory load operation (bank conflict): neighbouring threads step
    // down a tile column, i.e. with a stride of one full row (BDIMX ints);
    // when that stride is a multiple of the bank count the loads of a warp
    // fall into the same bank and are serialized
    out[idx] = tile[icol][irow];
}


threadIdx.x. threadIdx.x accessing consecutive locations in shared memory.

/**
 * Padded variant of the row-major write / column-major read kernel: each
 * thread stores its linear index into a shared-memory tile row-wise, then
 * loads the tile column-wise and writes the loaded value to global memory.
 * Every tile row carries IPAD extra, unused ints so that consecutive rows do
 * not start at the same offset modulo the row length.
 *
 * Launch layout: a single 2D thread block that fits inside the unpadded tile
 * (blockDim.x <= BDIMX, blockDim.y <= BDIMY). blockIdx is not used, so every
 * block of a larger grid would write the same out[] range.
 * Shared memory: BDIMY * (BDIMX + IPAD) ints, statically allocated.
 *
 * NOTE(review): whether the padding removes the bank conflicts completely
 * depends on the values of BDIMX and IPAD, which are defined outside this
 * block -- confirm against those definitions.
 *
 * @param out  device-accessible buffer of at least blockDim.x * blockDim.y
 *             ints; out[idx] receives the tile element loaded by thread idx.
 */
__global__ void setRowReadColPad(int *out) {
    // static shared memory; the last IPAD columns of each row are padding
    // and are never stored to or loaded from
    __shared__ int tile[BDIMY][BDIMX + IPAD];

    // mapping from thread index to global memory offset
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;

    // Transposed coordinates recovered from the linear index. For a square
    // block these are exactly (threadIdx.y, threadIdx.x); for a rectangular
    // block they stay inside the region written below, whereas indexing the
    // tile directly with [threadIdx.x][threadIdx.y] would leave it.
    unsigned int irow = idx / blockDim.y;  // < blockDim.x: valid tile column
    unsigned int icol = idx % blockDim.y;  // < blockDim.y: valid tile row

    // shared memory store operation: neighbouring threadIdx.x values write
    // neighbouring ints of one tile row
    tile[threadIdx.y][threadIdx.x] = idx;

    // wait for all threads to complete their stores before any thread loads
    // an element written by a different thread
    __syncthreads();

    // shared memory load operation: neighbouring threads step down a tile
    // column with a stride of BDIMX + IPAD ints, so the padding shifts each
    // successive row's element to a different offset
    out[idx] = tile[icol][irow];
}

| Conflict-free | Conflict-free | Conflict-free | Conflict-free (broadcast) | Bank conflict |
|---|---|---|---|---|
| ![]() | ![]() | ![]() | ![]() | ![]() |