BookGPU/Chapters/chapter19/code.cu

   1 // compute y = B*x (B is stored in SCOO formats [ cols, rows, values,
   2 //offsets, numPacks, numRows ])
   3 // LANE_SIZE = 2^k
   4 // NUM_ROWS_PER_SLICE is computed based on sparsity
   5 template <const uint32_t THREADS_PER_BLOCK, const uint32_t NUM_ROWS_PER_SLICE, const uint32_t LANE_SIZE>
   6 __global__ void
   7 sliced_coo_kernel(
   8                 const uint32_t numRows,
   9                 const uint32_t numPacks,
  10                 const uint32_t * cols,
  11                 const uint16_t * rows,
  12                 const float * values,
  13                 const uint32_t * offsets,
  14                 const float * x,
  15                       float * y)
  16 {
  17     // ~ threadIdx.x % LANE_SIZE
  18          const int thread_lane = threadIdx.x & (LANE_SIZE-1);
  19
  20     const int row_lane = threadIdx.x/(LANE_SIZE);
  21
  22     __shared__ float sdata[NUM_ROWS_PER_SLICE][LANE_SIZE];
  23
  24     const uint32_t packNo=blockIdx.x;
  25     const uint32_t limit = ( (packNo==numPacks-1)?((numRows-1)%NUM_ROWS_PER_SLICE)+1:NUM_ROWS_PER_SLICE );
  26
  27     const uint32_t begin = offsets[packNo];
  28     const uint32_t end = offsets[packNo+1];
  29     for(int i=row_lane; i<limit; i+=THREADS_PER_BLOCK/LANE_SIZE)
  30         sdata[i][thread_lane] = 0;
  31
  32     __syncthreads();
  33
  34     for(int32_t index=begin+threadIdx.x; index<end; index+=THREADS_PER_BLOCK){
  35         const uint32_t col = cols[index];
  36         const uint16_t row = rows[index];
  37         const float value = values[index];
  38
  39         // fetch x[col] from texture cache
  40         const float input = fetch_x(col, x) * value;
  41         atomic_add(&sdata[row][thread_lane], input);
  42     }
  43
  44     __syncthreads();
  45
  46     // Parallel reduction
  47     // Sum of row i is available in column i%LANE_SIZE
  48     for (uint32_t i=row_lane; i<limit; i+=THREADS_PER_BLOCK/LANE_SIZE) {
  49         float *p = sdata[i];
  50         int des = (thread_lane+i) & (LANE_SIZE-1);
  51
  52         // assuming lane size <= 32
  53         if (LANE_SIZE>16 && thread_lane<16)
  54             p[des]+=p[(des+16)&(LANE_SIZE-1)]; __syncthreads();
  55         if (LANE_SIZE>8 && thread_lane<8)
  56             p[des]+=p[(des+8)&(LANE_SIZE-1)]; __syncthreads();
  57         if (LANE_SIZE>4 && thread_lane<4)
  58             p[des]+=p[(des+4)&(LANE_SIZE-1)]; __syncthreads();
  59         if (LANE_SIZE>2 && thread_lane<2)
  60             p[des]+=p[(des+2)&(LANE_SIZE-1)]; __syncthreads();
  61         if (LANE_SIZE>1 && thread_lane<1)
  62             p[des]+=p[(des+1)&(LANE_SIZE-1)]; __syncthreads();
  63     }
  64
  65     __syncthreads();
  66     const uint32_t actualRow = packNo * NUM_ROWS_PER_SLICE;
  67
  68     for(int r = threadIdx.x; r < limit; r+=THREADS_PER_BLOCK)
  69         y[actualRow+r] = sdata[r][thread_lane];
  70 }