// Kernel excerpt: each thread derives a global 2D coordinate (tx, ty) from its
// block and thread indices, flattens it to a linear index using nx as the row
// width, and then iterates `slices` times over a bounds-guarded body.
// Only part of the kernel is visible here; the guarded body is not shown.
2 __global__ void kernel(..., int n, int nx, int ny, int slices, int stride, ...)
4 int tx = blockIdx.x * blockDim.x + threadIdx.x;//x-coordinate
5 int ty = blockIdx.y * blockDim.y + threadIdx.y;//y-coordinate
// Row-major flattening of (tx, ty) with row width nx.
6 int tid = tx + ty * nx; //thread ID
// NOTE(review): in the visible lines neither tid nor the guard below depends on
// i, and `stride` is not used yet -- presumably the hidden body offsets the
// access per slice (e.g. by i * stride); confirm against the full kernel.
7 for(int i=0; i<slices; i++){
// Guard for threads that fall outside the nx-by-ny domain when the grid
// overshoots it. NOTE(review): tx<nx && ty<ny already implies tid < nx*ny, and
// the launch visible in this excerpt passes n = NX * ny * nz, so `tid<n` looks
// redundant whenever nz >= 1 -- confirm the intended meaning of n.
8 if((tx<nx) && (ty<ny) && (tid<n)){
// Host-side launch setup excerpt: computes the element count, sizes a 2D grid
// by ceiling division, and launches `kernel`.
// NOTE(review): the types of NX, ny and nz are not visible here; if they are
// int, the product below is evaluated in int and may overflow for large
// volumes -- confirm.
18 int n = NX * ny * nz; //size of the subproblem
// Ceiling division: enough blocks to cover NX columns and ny rows even when
// they are not multiples of the block dimensions bx and by.
22 int gx = (NX + bx - 1) / bx;
23 int gy = (ny + by - 1) / by;
24 dim3 block(bx,by); //dimensions of a thread block
25 dim3 grid(gx,gy); //dimensions of the grid
// NX is passed as the kernel's nx parameter (the row width used for the flat
// index). NOTE(review): no error check follows the launch in the visible
// lines; consider checking cudaGetLastError() after it if the surrounding code
// does not already do so.
27 kernel<<<grid,block>>>(..., n, NX, ny, slices, stride, ...);