1 extern __shared__ volatile double sdata[]; // Unsized (dynamic) shared-memory buffer; sdata[0] holds the value broadcast to the block's threads.
3 updateBasisKernel(int m, uint l, double d_l, double *B,
4 uint pitch_B, double *d) {
5 uint bId = blockIdx.x, tId = threadIdx.x;
6 uint colStart = bId*pitch_B;
9 // First thread loads Blj so it can be
10 // broadcast via shared memory to each thread
12 sdata[0] = B[colStart+l] / d_l;
15 // Each thread processes multiple elements
19 Bij = B[colStart+tId];
27 B[colStart+tId] = B2ij;