BookGPU/Chapters/chapter10/optiSE.cu

   // Dynamically sized shared-memory buffer. It is not referenced directly by
   // the kernel below; presumably reduceSum() (defined elsewhere) uses it as
   // its scratch area -- TODO confirm against that helper.
   extern __shared__ volatile double sData[];

   /**
    * Selects, per block, the candidate column with the largest absolute score
    *     score(j) = c[j] * rsqrt( sum_i AN[i + j*pitchAN]^2 ).
    *
    * Work distribution:
    *   - Block b handles columns b, b + gridDim.x, b + 2*gridDim.x, ... (< n).
    *   - Inside a column, thread t accumulates the squares of the elements
    *     t, t + blockDim.x, t + 2*blockDim.x, ... (< m).
    *   - reduceSum() (defined elsewhere) combines the per-thread partial sums;
    *     thread 0 is expected to hold the block-wide total afterwards.
    *   - Only thread 0 tracks the best column and writes the block's result.
    *
    * Columns for which isPotentialEnteringVar() (defined elsewhere) is false
    * are skipped. That predicate must evaluate identically for every thread
    * of a block, because a __syncthreads() sits inside the guarded branch.
    *
    * @param m        Number of elements read per column.
    * @param n        Number of columns examined.
    * @param c        Per-column coefficient, indexed by column.
    * @param AN       Matrix data; element i of column j is AN[i + j*pitchAN].
    * @param pitchAN  Distance between consecutive columns of AN, in elements.
    * @param resIdx   Output, one entry per block: index of the best column, or
    *                 (uint)-1 when the block selected no column.
    * @param resVal   Output, one entry per block: signed score of that column
    *                 (0.0 when no column was selected).
    *
    * NOTE(review): the original listing used the undeclared names `tid` and
    * `cVal`. They are replaced here by threadIdx.x and c[bid] respectively;
    * c[bid] is inferred from the otherwise unused parameter `c` -- confirm.
    */
   __global__ void
   selectInVar(int m, int n, double *c, double *AN, uint pitchAN, uint *resIdx, double *resVal)
   {
       // (uint)-1 is the "nothing selected" sentinel (all bits set).
       uint i, maxIdx = (uint)-1, bid = blockIdx.x;
       double val, locSum, xScore, maxScore = 0.0;

       while (bid < n) { // Each block processes multiple columns
           i = threadIdx.x;
           locSum = 0.0;
           if (isPotentialEnteringVar(bid)) { // Do the local processing
               // Compute the column offset in size_t so that bid * pitchAN
               // cannot wrap around in 32-bit arithmetic on large matrices.
               const size_t colOffset = (size_t)bid * pitchAN;
               while (i < m) { // Each thread processes multiple elements
                   val = AN[colOffset + i];
                   locSum += val * val;
                   i += blockDim.x;
               }
               // Reduce the partial sums using the shared memory
               reduceSum(locSum);
               if (threadIdx.x == 0) { // Is this the best variable encountered?
                   // On thread 0, locSum equals the s.e. (presumably
                   // steepest-edge) coefficient of the column.
                   // NOTE(review): an all-zero column gives rsqrt(0) = +inf;
                   // confirm such columns are filtered out beforehand.
                   xScore = c[bid] * rsqrt(locSum);
                   if (fabs(maxScore) < fabs(xScore)) {
                       maxIdx = bid;
                       maxScore = xScore;
                   }
               }
               // Block-wide barrier before the next column is processed. It is
               // reached by all threads or by none, since bid is block-uniform.
               __syncthreads();
           }
           bid += gridDim.x;
       }

       // Write the block's result into global memory
       if (threadIdx.x == 0) {
           resIdx[blockIdx.x] = maxIdx;
           resVal[blockIdx.x] = maxScore;
       }
   }