1 extern __shared__ volatile double sData[]; // unsized extern __shared__ array: dynamically sized shared memory, volatile-qualified
3 updateBasisKernel(int m, uint l, double d_l, double *B, uint pitch_B, double *d)
5 uint bId = blockIdx.x, tId = threadIdx.x;
6 uint colStart = bId*pitch_B;
9 // The first thread loads Blj so that it can be
10 // broadcast via shared memory to every thread
12 sdata[0] = B[colStart+leave] / d_l;
15 // Each thread processes multiple elements
19 Bij = B[colStart+tId];
27 B[colStart+tId] = B2ij;