1 // compute y = B*x (B is stored in SCOO formats [ cols, rows, values,
2 //offsets, numPacks, numRows ])
4 // NUM_ROWS_PER_SLICE is computed based on sparsity
5 template <const uint32_t THREADS_PER_BLOCK, const uint32_t NUM_ROWS_PER_SLICE, const uint32_t LANE_SIZE>
8 const uint32_t numRows,
9 const uint32_t numPacks,
10 const uint32_t * cols,
11 const uint16_t * rows,
13 const uint32_t * offsets,
17 // ~ threadIdx.x % LANE_SIZE
18 const int thread_lane = threadIdx.x & (LANE_SIZE-1);
20 const int row_lane = threadIdx.x/(LANE_SIZE);
22 __shared__ float sdata[NUM_ROWS_PER_SLICE][LANE_SIZE];
24 const uint32_t packNo=blockIdx.x;
25 const uint32_t limit = ( (packNo==numPacks-1)?((numRows-1)%NUM_ROWS_PER_SLICE)+1:NUM_ROWS_PER_SLICE );
27 const uint32_t begin = offsets[packNo];
28 const uint32_t end = offsets[packNo+1];
29 for(int i=row_lane; i<limit; i+=THREADS_PER_BLOCK/LANE_SIZE)
30 sdata[i][thread_lane] = 0;
34 for(int32_t index=begin+threadIdx.x; index<end; index+=THREADS_PER_BLOCK){
35 const uint32_t col = cols[index];
36 const uint16_t row = rows[index];
37 const float value = values[index];
39 // fetch x[col] from texture cache
40 const float input = fetch_x(col, x) * value;
41 atomic_add(&sdata[row][thread_lane], input);
47 // Sum of row i is available in column i%LANE_SIZE
48 for (uint32_t i=row_lane; i<limit; i+=THREADS_PER_BLOCK/LANE_SIZE) {
50 int des = (thread_lane+i) & (LANE_SIZE-1);
52 // assuming lane size <= 32
53 if (LANE_SIZE>16 && thread_lane<16)
54 p[des]+=p[(des+16)&(LANE_SIZE-1)]; __syncthreads();
55 if (LANE_SIZE>8 && thread_lane<8)
56 p[des]+=p[(des+8)&(LANE_SIZE-1)]; __syncthreads();
57 if (LANE_SIZE>4 && thread_lane<4)
58 p[des]+=p[(des+4)&(LANE_SIZE-1)]; __syncthreads();
59 if (LANE_SIZE>2 && thread_lane<2)
60 p[des]+=p[(des+2)&(LANE_SIZE-1)]; __syncthreads();
61 if (LANE_SIZE>1 && thread_lane<1)
62 p[des]+=p[(des+1)&(LANE_SIZE-1)]; __syncthreads();
66 const uint32_t actualRow = packNo * NUM_ROWS_PER_SLICE;
68 for(int r = threadIdx.x; r < limit; r+=THREADS_PER_BLOCK)
69 y[actualRow+r] = sdata[r][thread_lane];