-// compute y = B*x (B is stored in SCOO formats [ cols, rows, values, offsets, numPacks, numRows ])
+// compute y = B*x (B is stored in SCOO format: [ cols, rows, values,
+// offsets, numPacks, numRows ])
// LANE_SIZE = 2^k
// NUM_ROWS_PER_SLICE is computed based on sparsity
template <const uint32_t THREADS_PER_BLOCK, const uint32_t NUM_ROWS_PER_SLICE, const uint32_t LANE_SIZE>
const float * x,
float * y)
{
- const int thread_lane = threadIdx.x & (LANE_SIZE-1); // ~ threadIdx.x % LANE_SIZE
+ // ~ threadIdx.x % LANE_SIZE
+ const int thread_lane = threadIdx.x & (LANE_SIZE-1);
+
const int row_lane = threadIdx.x/(LANE_SIZE);
__shared__ float sdata[NUM_ROWS_PER_SLICE][LANE_SIZE];