For a non-default stream, all operations are non-blocking (asynchronous) with respect to the host.
#include <cuda_runtime.h>
#include <cstdio>

// Create a non-default stream with cudaStreamCreate(), use it, destroy it.
// cudaStreamCreate() yields a non-default *blocking* stream: its work is
// asynchronous with respect to the host but can still be serialized against
// the legacy default stream (see the table below).
int main() {
    cudaStream_t stream;
    cudaError_t result = cudaStreamCreate(&stream);
    if (result != cudaSuccess) {
        // Never ignore a CUDA API return code: a sticky error here makes
        // every later call fail mysteriously.
        fprintf(stderr, "cudaStreamCreate failed: %s\n", cudaGetErrorString(result));
        return 1;
    }
    // Use the stream for operations...
    // Clean up and destroy the stream
    result = cudaStreamDestroy(stream);
    if (result != cudaSuccess) {
        fprintf(stderr, "cudaStreamDestroy failed: %s\n", cudaGetErrorString(result));
        return 1;
    }
    return 0;
}
A non-default blocking stream is equivalent to the stream created by cudaStreamCreate(&stream).
#include <cuda_runtime.h>
#include <cstdio>

// Create a non-default *blocking* stream explicitly via
// cudaStreamCreateWithFlags(..., cudaStreamDefault).
// This is equivalent to the stream produced by cudaStreamCreate(&stream).
int main() {
    cudaStream_t stream;
    cudaError_t result = cudaStreamCreateWithFlags(&stream, cudaStreamDefault);
    if (result != cudaSuccess) {
        // Check every CUDA API return code rather than discarding it.
        fprintf(stderr, "cudaStreamCreateWithFlags failed: %s\n", cudaGetErrorString(result));
        return 1;
    }
    // Use the stream for operations...
    // Clean up and destroy the stream
    result = cudaStreamDestroy(stream);
    if (result != cudaSuccess) {
        fprintf(stderr, "cudaStreamDestroy failed: %s\n", cudaGetErrorString(result));
        return 1;
    }
    return 0;
}
#include <cuda_runtime.h>
#include <cstdio>

// Create a non-default *non-blocking* stream via
// cudaStreamCreateWithFlags(..., cudaStreamNonBlocking).
// Work in this stream does not synchronize with the legacy default stream.
int main() {
    cudaStream_t stream;
    cudaError_t result = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    if (result != cudaSuccess) {
        // Check every CUDA API return code rather than discarding it.
        fprintf(stderr, "cudaStreamCreateWithFlags failed: %s\n", cudaGetErrorString(result));
        return 1;
    }
    // Use the stream for operations...
    // Clean up and destroy the stream
    result = cudaStreamDestroy(stream);
    if (result != cudaSuccess) {
        fprintf(stderr, "cudaStreamDestroy failed: %s\n", cudaGetErrorString(result));
        return 1;
    }
    return 0;
}
| | Host's view | Can be blocked by the default stream? | Creation |
|---|---|---|---|
| Default stream | Synchronous<br>(except kernel calls) | N/A | N/A |
| Non-default stream | Asynchronous | Yes | cudaStreamCreateWithFlags(&stream, cudaStreamDefault)<br>or<br>cudaStreamCreate(&stream) |
| Non-default blocking stream | Asynchronous | Yes | cudaStreamCreateWithFlags(&stream, cudaStreamDefault) |
| Non-default non-blocking stream | Asynchronous | No | cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) |
// K1 is launched into non-default stream stream_1.
K1<<<1, 1, 0, stream_1>>>();
// K2 is launched into the default stream (no stream argument); whether it
// serializes with K1/K3 depends on whether stream_1/stream_2 were created
// as blocking or non-blocking streams (see the table below).
K2<<<1, 1>>>();
// K3 is launched into non-default stream stream_2.
K3<<<1, 1, 0, stream_2>>>();
| Non-default blocking stream(s) | Non-default non-blocking stream(s) | Execution timeline |
|---|---|---|
| K1, K3 | | ![]() |
| | K1, K3 | ![]() |
| K1 | K3 | ![]() |
| K3 | K1 | ![]() |
cudaDeviceSynchronize() blocks the host thread until all previously issued operations on the device have completed.
cudaStreamSynchronize(stream) blocks the host thread until all previously issued operations in the specified stream have completed.
Hyper-Q is a feature introduced in the Kepler architecture. It expands the capability of a single GPU to handle work from multiple CPU cores simultaneously. Traditionally, a single CPU core would queue tasks to the GPU. Hyper-Q allows multiple CPU cores to place tasks in the GPU’s queue, which increases GPU utilization and reduces CPU idle times. It essentially allows for more concurrent operations to be processed by the GPU, making it significantly more efficient in multi-threaded and parallel computing environments.
// Depth-first dispatch: enqueue the entire four-kernel pipeline into one
// stream before moving on to the next stream.
for (int s = 0; s < n_streams; ++s) {
    kernel_1<<<grid, block, 0, streams[s]>>>();
    kernel_2<<<grid, block, 0, streams[s]>>>();
    kernel_3<<<grid, block, 0, streams[s]>>>();
    kernel_4<<<grid, block, 0, streams[s]>>>();
}

| with Hyper-Q<br>(multiple hardware work queues) | without Hyper-Q<br>(single hardware work queue) |
|---|---|
| ![]() | ![]() |
// Breadth-first dispatch: launch kernel_1 in every stream, then kernel_2
// in every stream, and so on — interleaving work across streams.
for (int s = 0; s < n_streams; ++s) {
    kernel_1<<<grid, block, 0, streams[s]>>>();
}
for (int s = 0; s < n_streams; ++s) {
    kernel_2<<<grid, block, 0, streams[s]>>>();
}
for (int s = 0; s < n_streams; ++s) {
    kernel_3<<<grid, block, 0, streams[s]>>>();
}
for (int s = 0; s < n_streams; ++s) {
    kernel_4<<<grid, block, 0, streams[s]>>>();
}

| with Hyper-Q<br>(multiple hardware work queues) | without Hyper-Q<br>(single hardware work queue) |
|---|---|
| ![]() | ![]() |
// Depth-first issue order: for each stream, enqueue the full
// copy -> compute -> copy pipeline (H2D of A and B, the kernel, D2H of C)
// before touching the next stream.
// NOTE(review): cudaMemcpyAsync only overlaps if h_A/h_B/gpuRef are pinned
// host memory — confirm against the allocation code, which is not shown here.
for (int i = 0; i < NSTREAM; ++i) {
// Element offset of this stream's chunk of the arrays.
int ioffset = i * iElem;
cudaMemcpyAsync(&d_A[ioffset], &h_A[ioffset], iBytes, cudaMemcpyHostToDevice, stream[i]);
cudaMemcpyAsync(&d_B[ioffset], &h_B[ioffset], iBytes, cudaMemcpyHostToDevice, stream[i]);
sumArrays<<<grid, block,0,stream[i]>>>(&d_A[ioffset], &d_B[ioffset], &d_C[ioffset],iElem);
cudaMemcpyAsync(&gpuRef[ioffset],&d_C[ioffset],iBytes, cudaMemcpyDeviceToHost, stream[i]);
}

| | w/ GMU | w/o GMU |
|---|---|---|
| One work queue | ![]() | ![]() |
| Eight work queues | ![]() | ![]() |
// Breadth-first issue order: queue all H2D copies across every stream first,
// then all kernels, then all D2H copies. This ordering can improve overlap
// when the hardware has a single work queue (no Hyper-Q).
// initiate all asynchronous transfers to the device
for (int i = 0; i < NSTREAM; ++i) {
// Element offset of this stream's chunk of the arrays.
int ioffset = i * iElem;
cudaMemcpyAsync(&d_A[ioffset], &h_A[ioffset], iBytes, cudaMemcpyHostToDevice, stream[i]);
cudaMemcpyAsync(&d_B[ioffset], &h_B[ioffset], iBytes, cudaMemcpyHostToDevice, stream[i]);
}
// launch a kernel in each stream
for (int i = 0; i < NSTREAM; ++i) {
int ioffset = i * iElem;
sumArrays<<<grid, block, 0, stream[i]>>>(&d_A[ioffset], &d_B[ioffset], &d_C[ioffset],iElem);
}
// queue asynchronous transfers from the device
for (int i = 0; i < NSTREAM; ++i) {
int ioffset = i * iElem;
cudaMemcpyAsync(&gpuRef[ioffset],&d_C[ioffset], iBytes,
cudaMemcpyDeviceToHost, stream[i]);
}

| | w/ GMU | w/o GMU |
|---|---|---|
| One work queue | ![]() | ![]() |
| Eight work queues | ![]() | ![]() |
