BookGPU/Chapters/chapter10/optiSE.cu

   // Dynamically sized shared-memory buffer; its size is supplied as the
   // third kernel-launch parameter. It is not referenced directly below.
   // NOTE(review): presumably the scratch area used by reduceSum() -- confirm
   // against that helper's definition.
   extern __shared__ volatile double sData[];

   /*
    * selectInVar -- per-block search for the best-scoring candidate column.
    *
    * Work distribution (only the .x components of the launch are used):
    *   - each block handles columns blockIdx.x, blockIdx.x + gridDim.x, ...
    *     while the column index is < n;
    *   - inside a column, each thread handles rows threadIdx.x,
    *     threadIdx.x + blockDim.x, ... while the row index is < m.
    *
    * For every column accepted by isPotentialEnteringVar(), the block sums
    * the squares of the column entries, and thread 0 computes
    *     score = cVal * rsqrt(sum of squares)
    * (called the steepest-edge coefficient in the original listing). Thread 0
    * keeps the column whose score has the largest absolute value.
    *
    * Parameters:
    *   m        number of rows visited in each column
    *   n        number of columns
    *   c        per-column coefficients, read as c[bid]
    *            NOTE(review): the original listing used an undeclared `cVal`
    *            and never read `c`; binding cVal = c[bid] is the natural
    *            reading of the formula -- confirm against the full project.
    *   AN       matrix data, element (row i, column j) read at AN[i + j*pitchAN]
    *   pitchAN  distance between consecutive columns, in elements of AN
    *   resIdx   output, one entry per block: index of the selected column, or
    *            (uint)-1 if this block selected none
    *   resVal   output, one entry per block: score of the selected column
    *            (0.0 if none)
    *
    * resIdx and resVal are indexed by blockIdx.x, so they need at least
    * gridDim.x entries each.
    */
   __global__ void
   selectInVar(int m, int n, double *c, double *AN, uint pitchAN,
               uint *resIdx, double *resVal) {
           const uint tid = threadIdx.x;
           uint maxIdx = (uint)-1;   // sentinel: nothing selected yet
           double maxScore = 0.0;

           // Processing multiple columns
           for (uint bid = blockIdx.x; bid < n; bid += gridDim.x) {
                   // bid is identical for every thread of the block, so this
                   // branch -- and the barrier inside it -- is taken uniformly,
                   // provided isPotentialEnteringVar() depends only on state
                   // shared by the whole block (NOTE(review): confirm).
                   if (isPotentialEnteringVar(bid)) {
                           // Widen before multiplying so the offset cannot wrap
                           // in 32-bit arithmetic on very large matrices.
                           const size_t colOffset = (size_t)bid * pitchAN;
                           double locSum = 0.0;

                           // Each thread processes multiple elements
                           for (uint i = tid; i < m; i += blockDim.x) {
                                   const double val = AN[colOffset + i];
                                   locSum += val * val;
                           }

                           // Reduce the value using shared memory; afterwards
                           // thread 0 is expected to hold the block-wide total
                           // in locSum.
                           reduceSum(locSum);

                           if (tid == 0) { // Is this the best variable encountered?
                                   const double cVal = c[bid];
                                   const double xScore = cVal * rsqrt(locSum);
                                   if (fabs(maxScore) < fabs(xScore)) {
                                           maxIdx = bid;
                                           maxScore = xScore;
                                   }
                           }
                           // Keep the block together before the next column is
                           // processed (presumably so the reduction's shared
                           // buffer is not reused while still being read).
                           __syncthreads();
                   }
           }

           // Write the result into global memory
           if (tid == 0) {
                   resIdx[blockIdx.x] = maxIdx;
                   resVal[blockIdx.x] = maxScore;
           }
   }