/*
 * 3x3 median filter producing two horizontally adjacent output pixels per
 * thread.
 *
 * Each thread handles the pixel pair (i, j) and (i, j+1), where
 *   j = 2 * (global thread index along x)
 *   i = global thread index along y.
 * The two 3x3 windows (centred on column j and on column j+1) share the six
 * pixels in columns j and j+1, so those are fetched once and pre-sorted by a
 * single minmax6 pass before the two windows are processed separately.
 *
 * Input is read from the file-level texture `tex_img_ins`; reads at
 * coordinates outside the image (j-1, j+2, i-1, i+1 at the borders) rely on
 * that texture's addressing mode, which is configured outside this kernel.
 *
 * NOTE(review): minmax3..minmax6 are defined elsewhere. This kernel presumably
 * relies on each of them moving the minimum of its arguments into the first
 * argument and the maximum into the last, so the extremes can be discarded
 * after every pass -- confirm against their definitions.
 *
 * Parameters:
 *   output - destination image, indexed as output[i * j_dim + j].
 *   i_dim  - extent of the i (row) index; used here as the row bound.
 *            NOTE(review): the original kernel never read this parameter;
 *            confirm callers pass the number of rows.
 *   j_dim  - extent of the j (column) index; also the row stride of `output`.
 *
 * Threads whose pixel pair lies entirely outside the i_dim x j_dim image do
 * nothing, so the launch grid may over-cover the image.
 */
__global__ void kernel_median3_2pix(short *output, int i_dim, int j_dim)
{
    // j base coordinate = 2 * (thread index). Plain multiplication replaces
    // the legacy __mul24 intrinsic: same result for 24-bit operands, and no
    // 24-bit operand limit.
    const int j = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
    const int i = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against grids that overshoot the image: without this, the stores
    // at the end of the kernel would land outside `output`.
    if (i >= i_dim || j >= j_dim) {
        return;
    }

    int a0, a1, a2, a3, a4, a5;  // working set for the left window  (centre j)
    int b0, b1, b2, b3, b4, b5;  // working set for the right window (centre j+1)

    // The 6 pixels common to both windows: columns j and j+1, rows i-1..i+1.
    a0 = tex2D(tex_img_ins, j,     i - 1);
    a1 = tex2D(tex_img_ins, j + 1, i - 1);
    a2 = tex2D(tex_img_ins, j,     i);
    a3 = tex2D(tex_img_ins, j + 1, i);
    a4 = tex2D(tex_img_ins, j,     i + 1);
    a5 = tex2D(tex_img_ins, j + 1, i + 1);

    // Shared min/max pass over the common pixels.
    minmax6(&a0, &a1, &a2, &a3, &a4, &a5);

    // Fork the partially processed common set into the right-window copy.
    b0 = a0; b1 = a1; b2 = a2; b3 = a3; b4 = a4; b5 = a5;

    // From here on the windows are processed separately. Each step overwrites
    // slot 5 with one window-specific pixel (column j-1 for the left window,
    // column j+2 for the right) and runs a pass over one fewer slot.
    a5 = tex2D(tex_img_ins, j - 1, i);
    b5 = tex2D(tex_img_ins, j + 2, i);
    minmax5(&a1, &a2, &a3, &a4, &a5);
    minmax5(&b1, &b2, &b3, &b4, &b5);

    a5 = tex2D(tex_img_ins, j - 1, i - 1);
    b5 = tex2D(tex_img_ins, j + 2, i - 1);
    minmax4(&a2, &a3, &a4, &a5);
    minmax4(&b2, &b3, &b4, &b5);

    a5 = tex2D(tex_img_ins, j - 1, i + 1);
    b5 = tex2D(tex_img_ins, j + 2, i + 1);
    minmax3(&a3, &a4, &a5);
    minmax3(&b3, &b4, &b5);

    // Two outputs per thread: the middle slot of the final 3-element pass.
    const int row_offset = i * j_dim;
    output[row_offset + j] = static_cast<short>(a4);

    // When j_dim is odd the last pair has no right-hand pixel; skipping the
    // store avoids writing into the next row (or past the buffer on the last
    // row).
    if (j + 1 < j_dim) {
        output[row_offset + j + 1] = static_cast<short>(b4);
    }
}