BookGPU/Chapters/chapter11/code2.cu

   1 template<typename Tx, typename Ty>
   2 __global__ void CalculateBeta(Tx *u, Ty *v, double *b, int N)
   3 {
   4    int tid = threadIdx.x + blockIdx.x * blockDim.x;
   5    while(tid<=(N-2)) {
   6      b[tid]=(v[tid+1]-v[tid])/fmax(1e-20,double(u[tid+1]-u[tid]));
   7      tid += blockDim.x * gridDim.x;
   8   }
   9  __syncthreads();
  10 }
  11
  12 __global__ void CalculateDGeneral( double *b, double *c, int N)
  13 {
  14    int tid = threadIdx.x + blockIdx.x * blockDim.x;
  15    while(tid<=(N-2)) {
  16      if((b[tid-1]*b[tid])<=0) c[tid]=0;
  17        else c[tid]=(2*b[tid-1]*b[tid])/(b[tid-1]+b[tid]);
  18      }
  19      tid += blockDim.x * gridDim.x;
  20   }
  21   __syncthreads();
  22 }
  23
  24 __global__ void CalculateD( double *b, double *c, int N )
  25 {
  26    if((b[0]*(2*b[0]-c[1]))<=0)  c[0]=0;
  27      else  c[0]=2*b[0] - c[1];
  28    if((b[N-2]*(2*b[N-2]-c[N-2]))<=0) c[N-1]=0;
  29      else c[N-1]=2*b[N-2] - c[N-2];
  30    __syncthreads();
  31 }
  32
  33 template<typename Tx, typename Ty>
  34 int BuildMonotonSpline(Tx *d_X, Ty *d_Y, int N, double *t, double *alpha, double *beta, double *gamma)
  35 {
  36   int T =  (N-1)*2+1; // length of the output array
  37   double *b, *c; // temp variables
  38   cudaMalloc( (void**)&b, 1*N*sizeof(double) );
  39   cudaMalloc( (void**)&c, 2*N*sizeof(double) );
  40   int threads=256;
  41   int blocks = (N-1)/threads + 1;
  42   CalculateBeta<<<blocks,threads>>>(d_X,d_Y,b,N);
  43   CalculateDGeneral<<<blocks,threads>>>(b,c,N);
  44   CalculateD<<<1,1>>>(b,c,NN);  // calculate d_1 and d_N
  45   CalculateCoefficientsKnots<<<blocks,threads>>>(d_X,d_Y,b,c,h,alpha,beta,gamma,N);
  46   cudaFree(b); cudaFree(c);
  47   return T;
  48 }