BookGPU/Chapters/chapter13/ex1.cu

   1 /* GPU kernel
      *
      * Each thread derives a 2D position (tx, ty) from its block and thread
      * indices, converts it to a linear index tid = tx + ty * nx, and then runs
      * `slices` loop iterations, adding `stride` to tid after each one. The
      * elided per-element work executes only while tx < nx, ty < ny and tid < n.
      *
      * Parameters visible in this listing (the others are elided):
      *   n      - exclusive upper bound on tid
      *   nx     - exclusive upper bound on tx; also the row length used to
      *            linearise (tx, ty)
      *   ny     - exclusive upper bound on ty
      *   slices - number of loop iterations executed by every thread
      *   stride - amount added to tid after each iteration
      *
      * The kernel reads the .x and .y components of blockIdx, blockDim and
      * threadIdx, so it expects a launch laid out in x and y. The host code in
      * this listing passes stride = NX * ny and slices = nz.
      */
   2 __global__ void kernel(..., int n, int nx, int ny, int slices, int stride, ...)
   3 {
   4         int tx = blockIdx.x * blockDim.x + threadIdx.x;//x-coordinate of this thread in the whole grid
   5         int ty = blockIdx.y * blockDim.y + threadIdx.y;//y-coordinate of this thread in the whole grid
   6         int tid = tx + ty * nx;                        //thread ID: linear index of (tx,ty), nx elements per row
   7         for(int i=0; i<slices; i++){                   //every thread runs all `slices` iterations
   8                 if((tx<nx) && (ty<ny) && (tid<n)){     //skip threads outside nx x ny and indices at or beyond n
   9                         ...
  10                 }
  11                 tid += stride;                         //move the index forward by `stride` for the next iteration
  12         }
  13 }
  14
  15 /* CPU function
      *
      * Host-side driver: computes the sizes passed to the kernel, builds a 2D
      * launch configuration and launches `kernel`.
      *
      * - Blocks are bx x by = 64 x 4 threads.
      * - The grid is sized by ceiling division, so gx * bx >= NX and
      *   gy * by >= ny.
      * - n, slices and stride are passed to the kernel as NX * ny * nz, nz and
      *   NX * ny respectively.
      *
      * NOTE(review): NX, ny and nz are not declared in the visible part of this
      * listing; they presumably come from the elided parameters or from
      * file-level definitions - confirm.
      */
  16 void Function(...)
  17 {
  18         int n = NX * ny * nz; //size of the sub-problem
  19         int slices = nz;      //loop count used by the kernel
  20         int stride = NX * ny; //index increment between consecutive kernel loop iterations
  21         int bx = 64, by = 4;  //block shape: 64 x 4 = 256 threads
  22         int gx = (NX + bx - 1) / bx; //ceiling division: blocks needed to cover NX in x
  23         int gy = (ny + by - 1) / by; //ceiling division: blocks needed to cover ny in y
  24         dim3 block(bx,by);   //dimensions of a thread block
  25         dim3 grid(gx,gy);    //dimensions of the grid
  26         ...
  27         kernel<<<grid,block>>>(..., n, NX, ny, slices, stride, ...); //2D launch; kernel's nx parameter receives NX
  28         ...
  29 }
  30