Definitions

laneID = threadIdx.x % warpSize orlaneID = threadIdx.x & 0x1f warpID = threadIdx.x / warpSize

Broadcast

T __shfl_sync(unsigned mask, T var, int srcLane, int width=warpSize);

__shfl_sync(0xffffffff, var, 2, warpSize), where var=input[2]. __shfl_sync(0xffffffff, var, 2, warpSize/2), where var=input[2] or var=input[18].

Shuffle Up

T __shfl_up_sync(unsigned mask, T var, unsigned int delta, int width=warpSize);

__shfl_up_sync(0xffffffff, var, 2, warpSize), where var=input[2]. __shfl_up_sync(0xffffffff, var, 2, warpSize/2), where var=input[2].

Shuffle Down

T __shfl_down_sync(unsigned mask, T var, unsigned int delta, int width=warpSize);

__shfl_down_sync(0xffffffff, var, 3, warpSize) __shfl_down_sync(0xffffffff, var, 3, warpSize/4)

XOR Shuffle

T __shfl_xor_sync(unsigned mask, T var, int laneMask, int width=warpSize);

The table below shows the XOR result between tid and laneMask: image

__shfl_xor_sync(0xffffffff, var, 1, warpSize)

__shfl_xor_sync(0xffffffff, var, 3, warpSize)

Examples

Local Reduction with XOR shuffle

for (int i=1; i<warpSize; i*=2)
   value += __shfl_xor_sync(0xffffffff, value, i);

Local Reduction with Down Shuffle

for (int i=warpSize/2; i>0; i=i/2)
   value += __shfl_down_sync(0xffffffff, value, i);

Block Reduction with Warp Shuffle

__inline__ __device__
int WarpReduceMin(int value)                                                               
{  
   for ( int offset=warpSize/2; offset>0; offset/=2 ){
      int value = max(value, __shfl_down_sync( 0xffffffff, value, offset, warpSize));                       
   }
   return value;
}
__inline__ __device__                                                                          
int BlockReduceMin(int value)                                                                  
{                                                                                              
   // size of array must have a constant value                                                 
   static __shared__ int buffer[32];                                                           
                                                                                               
   int laneID = threadIdx.x % warpSize;                                                        
   int warpID = threadIdx.x / warpSize;                                                        
                                                                                               
   // assumming number of warps <= 32                                                          
   int numWarp = blockDim.x / warpSize;                                                        
                                                                                               
   // execute warp shuffle for each thread; the result will be at the thread with laneID = 0                                             
   value = WarpShuffle(value);                                                                 
                                                                                               
   if (laneID==0)  buffer[warpID] = value;                                                     
                                                                                               
   __syncthreads();                                                                            
                                                                                               
   // fill the rest of warps with INT_MAX in case numWarp is less than 32                      
   value = (threadIdx.x < numWarp) ? buffer[threadIdx.x] : INT_MAX;                            
                                                                                               
   if (threadIdx.x < warpSize){                                                                 
      value = WarpShuffle(value);                                                              
   }
    
   return value;                                 
}

Reference