/*
  Initializes a 4-node snake as a rectangle lying dist_bords pixels inside
  the [i_dim x j_dim] image, in (posi,posj) = (row,col) coordinates:
    n0 = (dist_bords, dist_bords)            n1 = (i_dim-dist_bords, dist_bords)
    n2 = (i_dim-dist_bords, j_dim-dist_bords) n3 = (dist_bords, j_dim-dist_bords)
  All per-node bookkeeping fields (freeman codes, centre, last_move,
  nb_pixels, code_segment) are zeroed.
  Sequential: only thread 0 does any work — presumably launched with a
  single block/thread; confirm against the caller.
*/
__global__ void genere_snake_rectangle_4nodes_gpu(snake_node_gpu * d_snake, int dist_bords, int i_dim, int j_dim){
  if (threadIdx.x == 0){
    int n = 0;
    /* n0 */
    d_snake[n].posi = dist_bords ;
    d_snake[n].posj = dist_bords ;
    n++ ;
    /* n1 */
    d_snake[n].posi = i_dim - dist_bords ;
    d_snake[n].posj = dist_bords ;
    n++ ;
    /* n2 */
    d_snake[n].posi = i_dim - dist_bords ;
    d_snake[n].posj = j_dim - dist_bords ;
    n++ ;
    /* n3 */
    d_snake[n].posi = dist_bords ;
    d_snake[n].posj = j_dim - dist_bords ;
    // zero the bookkeeping fields of all 4 nodes
    for (int i=0; i<4; i++)
      {
        d_snake[i].freeman_in = 0;
        d_snake[i].freeman_out = 0;
        d_snake[i].centre_i = 0;
        d_snake[i].centre_j = 0;
        d_snake[i].last_move = 0;
        d_snake[i].nb_pixels = 0;
        d_snake[i].code_segment = 0;
      }
  }
}

/*
  Initializes a snake on the same rectangle, but with nb_node_seg
  intermediate nodes inserted on each side between the 4 corners.
  Intermediate nodes advance by a full increment (inch/incl) while more
  than 'limite' pixels remain before the next corner, else by half an
  increment; each node's nb_pixels receives the length of the segment
  leaving it. Sequential: thread 0 only.
  NOTE(review): in the side n0->n1 loop below, the test reads
  (pos + inch) - (i_dim - dist_bords) > limite — the OPPOSITE operand
  order of the three other sides (e.g. S1->S2 uses border - pos). For
  nodes inside the image that difference is negative, so the
  half-increment branch is always taken; the operands were probably
  meant to be swapped. Confirm intent before fixing.
*/
__global__ void genere_snake_rectangle_Nnodes_gpu(snake_node_gpu * d_snake, int dist_bords, int i_dim, int j_dim){
  int nb_node_seg = 9 ;
  int limite = 64 ;
  int i , h= i_dim-2*dist_bords, l= j_dim-2*dist_bords ;
  int inch = h/(nb_node_seg+1), incl= l/(nb_node_seg+1) ;
  if (threadIdx.x == 0){
    int n = 0;
    /* n0 */
    d_snake[n].posi = dist_bords ;
    d_snake[n].posj = dist_bords ;
    n++ ;
    /* between node 0 and node 1 */
    i = 0 ;
    while (i < nb_node_seg)
      {
        if ( (d_snake[n-1].posi + inch)-(i_dim - dist_bords) > limite )
          d_snake[n].posi = d_snake[n-1].posi + inch ;
        else
          d_snake[n].posi = d_snake[n-1].posi + inch/2 ;
        d_snake[n].posj = dist_bords ;
        d_snake[n-1].nb_pixels = d_snake[n].posi - d_snake[n-1].posi ;
        n++ ;
        i++ ;
      }
    /* n1 */
    d_snake[n].posi = i_dim - dist_bords ;
    d_snake[n].posj = dist_bords ;
    d_snake[n-1].nb_pixels = d_snake[n].posi - d_snake[n-1].posi ;
    n++ ;
    /* between S1 and S2 */
    i = 0 ;
    while (i< nb_node_seg)
      {
        if ( (j_dim - dist_bords) - (d_snake[n-1].posj + incl) > limite )
          d_snake[n].posj = d_snake[n-1].posj + incl ;
        else
          d_snake[n].posj = d_snake[n-1].posj + incl/2 ;
        d_snake[n].posi = i_dim - dist_bords ;
        d_snake[n-1].nb_pixels = d_snake[n].posj - d_snake[n-1].posj ;
        n++ ;
        i++ ;
      }
    /* n2 */
    d_snake[n].posi = i_dim - dist_bords ;
    d_snake[n].posj = j_dim - dist_bords ;
    d_snake[n-1].nb_pixels =
// (continuation: right-hand side of the nb_pixels assignment started on
//  the previous line)
d_snake[n].posj - d_snake[n-1].posj ;
    n++ ;
    /* between S2 and S3 */
    i = 0 ;
    while (i< nb_node_seg)
      {
        if ( (d_snake[n-1].posi - inch) - dist_bords > limite )
          d_snake[n].posi = d_snake[n-1].posi - inch ;
        else
          d_snake[n].posi = d_snake[n-1].posi - inch/2 ;
        d_snake[n].posj = j_dim - dist_bords ;
        d_snake[n-1].nb_pixels = d_snake[n-1].posi - d_snake[n].posi ;
        n++ ;
        i++ ;
      }
    /* n3 */
    d_snake[n].posi = dist_bords ;
    d_snake[n].posj = j_dim - dist_bords ;
    d_snake[n-1].nb_pixels = d_snake[n-1].posi - d_snake[n].posi ;
    n++ ;
    /* between S3 and S0 */
    i = 0 ;
    while (i< nb_node_seg)
      {
        if ( (d_snake[n-1].posj - incl) - dist_bords > limite)
          d_snake[n].posj = d_snake[n-1].posj - incl ;
        else
          d_snake[n].posj = d_snake[n-1].posj - incl/2 ;
        d_snake[n].posi = dist_bords ;
        d_snake[n-1].nb_pixels = d_snake[n-1].posj - d_snake[n].posj ;
        n++ ;
        i++ ;
      }
    // last segment closes the contour back onto node 0
    d_snake[n-1].nb_pixels = d_snake[n-1].posj - d_snake[0].posj ;
// NOTE(review): the text between "i" and "taille smem dynamique" on the
// next line has been LOST (it looks like an angle-bracket-delimited span
// was stripped, starting at the "<" of the loop condition). The missing
// part must contain the end of this for-loop and kernel, plus the
// signature and prologue of the next kernel — which declares the names
// used below without visible definitions: scumuls_1, tis, tib, idx,
// segment, blockSize, nb_nodes, l, liste_pix, cumul_x, cumul_x2,
// gsombloc and d_table_freeman. Recover the lost text from version
// control before building.
for (i=0; i taille smem dynamique t_sum_x* scumuls_x = (t_sum_x*) &scumuls_1[CFI(blockDim.x)] ;
t_sum_x2* scumuls_x2 = (t_sum_x2*) &scumuls_x[CFI(blockDim.x)] ;
// node indices of the segment handled by this thread block
uint x1, y1, x2, y2 ;
int n1, n2 ;
n1 = segment ;
n2 = segment +1 ;
// handle snake wrap-around (last segment ends on node 0)
if (n2 >= nb_nodes) n2 = 0 ;
// assign the node positions to the segments' thread blocks
x1 = d_snake[n1].posj ;
y1 = d_snake[n1].posi ;
x2 = d_snake[n2].posj ;
y2 = d_snake[n2].posi ;
// displacement parameters of the segment
int dx=x2-x1;
int dy=y2-y1;
uint abs_dx = ABS(dx);
uint abs_dy = ABS(dy);
// one pixel per row or per column, whichever direction dominates
uint nb_pix = abs_dy>abs_dx?(abs_dy+1):(abs_dx+1); // alternative -> read from liste_points[]
int incx=0, incy=0;
uint2 p ;
int xprec, xsuiv ;
// build the pixel list of segment (x1,y1)-(x2,y2)
if (dy > 0) incy=1; else incy=-1 ;
if (dx > 0) incx=1; else incx=-1 ;
if (tis < nb_pix){
  if (abs_dy > abs_dx){
    // mostly vertical segment: one thread per row
    double k = (double)dx/dy ;
    p.x = y1 + incy*tis ;
    p.y = x1 + floor((double)incy*k*tis+0.5) ;
    //enreg. coords.
pixels en global mem pour freemans
// NOTE(review): the stray words above are the tail of the previous
// line's "//enreg. coords. ..." comment, split by the file mangling;
// the intended meaning is "store pixel coords to global memory for the
// Freeman-code computation". Only the two first / two last pixels and
// the midpoint are saved.
if ((tis < 2)||(tis > nb_pix - 3)||(tis == nb_pix/2))
  {
    liste_pix[idx].x = p.x ;
    liste_pix[idx].y = p.y ;
  }
} else {
  // mostly horizontal segment: one thread per column
  double k=(double)dy/dx ;
  p.x = y1 + floor((double)(incx*k*tis)+0.5) ;
  p.y = x1 + incx*tis ;
  // rows of the neighbouring pixels, used below to keep a single
  // contributing pixel per image row
  if ( tis > 0 ){
    xsuiv = y1 + floor((double)(incx*k*(tis+1))+0.5) ;
    xprec = y1 + floor((double)(incx*k*(tis-1))+0.5) ;
  }
  // store pixel coords in global mem for the Freeman codes
  //TODO
  // the segments' Freeman codes could be computed here from the
  // derivatives at the two end points only, without storing the whole
  // set of pixel values
  if ((tis < 2)||(tis > nb_pix - 3)||(tis == nb_pix/2))
    {
      liste_pix[idx].x = p.x ;
      liste_pix[idx].y = p.y ;
    }
 }
}
__syncthreads();
// per-pixel contributions: inner pixels only, and for mostly horizontal
// segments only pixels where the row changes (precedence note: '&&'
// binds tighter than '||', so the condition reads
// (abs_dy<=abs_dx && (xprec>p.x || xsuiv>p.x)) || abs_dy>abs_dx).
// xprec/xsuiv are only assigned when tis>0 and in the abs_dy<=abs_dx
// branch — the (tis>0) guard here relies on that pairing.
if ( (tis >0) && (tis < nb_pix-1)
     && ( (abs_dy <= abs_dx)
          && ( (xprec > p.x) || (xsuiv > p.x))
          || (abs_dy > abs_dx) ) )
  {
    int pos = p.x * l + p.y ;
    // contribution triple: pixel count term, cumulated x, cumulated x^2
    scumuls_1[ CFI(tib)] = 1+p.y;
    scumuls_x[ CFI(tib)] = cumul_x[ pos ] ;
    scumuls_x2[CFI(tib)] = cumul_x2[ pos ];
  } else {
  // non-contributing thread: neutral element for the sum
  scumuls_1[ CFI(tib)] = 0;
  scumuls_x[ CFI(tib)] = 0;
  scumuls_x2[CFI(tib)] = 0;
 }
__syncthreads();
// block-wide reduction of the three partial sums in shared memory
// (unrolled; assumes blockSize is a power of two <= 512)
if (blockSize >= 512) {
  if (tib < 256) {
    scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib + 256) ];
    scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib + 256) ];
    scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib + 256) ];
  }
  __syncthreads();
}
if (blockSize >= 256) {
  if (tib < 128) {
    scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib + 128) ];
    scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib + 128) ];
    scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib + 128) ];
  }
  __syncthreads();
}
if (blockSize >= 128) {
  if (tib < 64) {
    scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib + 64) ];
    scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib + 64) ];
    scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib + 64) ];
  }
  __syncthreads();
}
// 32 threads <==> 1 warp
// NOTE(review): this warp-synchronous tail uses neither volatile shared
// pointers nor __syncwarp(); that idiom is unsafe under independent
// thread scheduling (Volta and later) and relies on pre-Volta lockstep
// plus the compiler not caching shared values in registers — confirm
// the target compute capability.
if (tib < 32)
  {
    {
      scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib + 32) ];
      scumuls_x[ CFI(tib)] += scumuls_x[
// (continuation of the warp-level tail of the shared-memory reduction)
CFI(tib + 32) ];
      scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib + 32) ];
    }
    {
      scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib + 16) ];
      scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib + 16) ];
      scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib + 16) ];
    }
    {
      scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib + 8) ];
      scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib + 8) ];
      scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib + 8) ];
    }
    scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib + 4) ];
    scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib + 4) ];
    scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib + 4) ];
    scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib + 2) ];
    scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib + 2) ];
    scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib + 2) ];
    scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib + 1) ];
    scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib + 1) ];
    scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib + 1) ];
  }
// write the block's three partial sums to global mem, laid out as
// [sum_1 per block | sum_x per block | sum_x2 per block]
if (tib == 0) {
  gsombloc[ blockIdx.x ] = (t_sum_x2) scumuls_1[0];
  gsombloc[ blockIdx.x + gridDim.x ] = (t_sum_x2) scumuls_x[0];
  gsombloc[ blockIdx.x + 2*gridDim.x ] = (t_sum_x2) scumuls_x2[0];
}
// Freeman codes, midpoint and segment code
// 1 uint4 per segment
int Di, Dj;
if (tis == 0){
  // outgoing Freeman code from a 3x3 neighbourhood lookup
  // (Di, Dj in {0,1,2})
  Di = 1 + liste_pix[idx+1].x - liste_pix[idx].x ;
  Dj = 1 + liste_pix[idx+1].y - liste_pix[idx].y ;
  d_snake[segment].freeman_out = d_table_freeman[3*Di + Dj] ;
  // segment code: sign of the vertical displacement
  if (dy > 0 ) d_snake[segment].code_segment = -1 ;
  if (dy < 0 ) d_snake[segment].code_segment = 1 ;
  if (dy == 0) d_snake[segment].code_segment = 0 ;
}
if (tis == nb_pix-1){
  // incoming Freeman code at the segment's last pixel
  Di = 1 + liste_pix[idx].x - liste_pix[idx-1].x ;
  Dj = 1 + liste_pix[idx].y - liste_pix[idx-1].y;
  d_snake[segment].freeman_in = d_table_freeman[3*Di + Dj] ;
}
if (tis == (nb_pix/2)){
  // segment midpoint
  d_snake[segment].centre_i = liste_pix[idx].x ;
  d_snake[segment].centre_j = liste_pix[idx].y ;
}
}

/*
  Sum of the per-block contributions -> per-segment contributions for
  the snake. Execution: 1 block / 1 thread per segment.
*/
__global__ void somsom_snake(t_sum_x2 * somblocs, int nb_nodes, unsigned int nb_bl_seg,
snake_node_gpu * d_snake){
  // Per-segment totals: thread/block 'seg' accumulates the nb_bl_seg
  // partial sums of its segment (somblocs layout:
  // [sum_1 | sum_x | sum_x2], each section nb_nodes*nb_bl_seg wide)
  // and stores the three totals into d_snake[seg].
  t_sum_x2 sdata[3];
  unsigned int seg = blockIdx.x ;
  // one thread per segment
  {
    sdata[0] = 0;
    sdata[1] = 0;
    sdata[2] = 0;
  }
  for (int b=0; b < nb_bl_seg ; b++){
    sdata[0] += somblocs[seg*nb_bl_seg + b];
    sdata[1] += somblocs[(seg + nb_nodes)*nb_bl_seg + b];
    sdata[2] += somblocs[(seg + 2*nb_nodes)*nb_bl_seg + b];
  }
  // totals to global mem
  {
    d_snake[seg].sum_1 = sdata[0];
    d_snake[seg].sum_x = sdata[1];
    d_snake[seg].sum_x2 = sdata[2];
  }
}

/*
  Gaussian gray-level criterion of the current segmentation.
  stat_sum_1 / stat_sum_x / stat_sum_x2 : pixel count, sum and sum of
  squares of the gray levels inside the snake.
  n_dim / SUM_X / SUM_X2 : the same quantities over the whole image.
  Returns 0.5*(ni*log(sigi2) + ne*log(sige2)), or -1 when either
  variance is not strictly positive (log undefined).
  NOTE(review): ne = n_dim - stat_sum_1 is assumed > 0 (snake strictly
  smaller than the image) — a zero ne would divide by zero.
*/
__device__ double codage_gl_gauss(uint64 stat_sum_1, uint64 stat_sum_x, uint64 stat_sum_x2,
                                  uint64 n_dim, uint64 SUM_X, uint64 SUM_X2){
  uint64 stat_sum_xe ;  /* sum of the gray levels of the outside region */
  uint32 ne ;           /* pixel count of the outside region */
  double sigi2, sige2;  /* inside / outside variances */

  /* variance of the gray levels inside the snake */
  sigi2 =
    ((double)stat_sum_x2/(double)stat_sum_1) -
    ((double)stat_sum_x/(double)stat_sum_1)*((double)stat_sum_x/(double)stat_sum_1) ;
  /* variance of the gray levels outside the snake */
  ne = n_dim-stat_sum_1 ;
  stat_sum_xe = SUM_X - stat_sum_x ;
  sige2 =
    ((double)SUM_X2-stat_sum_x2)/(double)ne -
    ((double)stat_sum_xe/(double)ne)*((double)stat_sum_xe/(double)ne) ;
  /* BUGFIX: was ((sigi2 > 0)|(sige2 > 0)) — a bitwise OR that let the
     criterion be evaluated when only ONE variance was positive, feeding
     log() a non-positive argument (NaN / -inf). Both variances must be
     strictly positive for the logs to be defined. */
  if ((sigi2 > 0) && (sige2 > 0))
    return 0.5*((double)stat_sum_1*log(sigi2) + (double)ne*log(sige2)) ;
  return -1 ;
}

/*
  Computes the global snake statistics (pixel count term, sum and sum of
  squares of the inside gray levels) from the per-segment totals and the
  cumulated-image columns, then evaluates the gray-level criterion into
  *vrais_min.
  d_stats_snake[0..2] receive the three totals; d_stats_snake[3..5] are
  READ as inputs (image-wide N, SUM_X, SUM_X2) — they must be filled
  beforehand; confirm against the caller.
  NOTE(review): fully sequential and unguarded by threadIdx — presumably
  launched <<<1,1>>>; confirm.
*/
__global__ void calcul_stats_snake(snake_node_gpu * d_snake, int nnodes, int64 * d_stats_snake, double * vrais_min,
                                   t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, int * TABLE_CODAGE, uint32 l ) {
  int id_nx, id_nprec, id_nprecprec ;
  int code_noeud, code_segment, pos ;
  __shared__ int64 s_stats_snake[3] ;

  // init stats in shared mem
  s_stats_snake[0] = 0 ;
  s_stats_snake[1] = 0 ;
  s_stats_snake[2] = 0 ;

  for (id_nx = 0; id_nx < nnodes; id_nx++)
    {
      // previous and previous-previous node indices, with wrap-around
      if (id_nx == 0) id_nprec = nnodes - 1;
      else id_nprec = id_nx - 1;
      if (id_nprec == 0) id_nprecprec = nnodes -1 ;
      else id_nprecprec = id_nprec - 1 ;
      /* handle the segments leaving the node */
      /* vers le noeud suivant dans l'ordre
trigo */
      // (end of the comment split across the line break above: segments
      //  are walked toward the next node in trigonometric order)
      code_segment = d_snake[id_nprec].code_segment ;
      if (code_segment > 0)
        {
          /* add the segment's contributions */
          s_stats_snake[0] += d_snake[id_nprec].sum_1 ;
          s_stats_snake[1] += d_snake[id_nprec].sum_x ;
          s_stats_snake[2] += d_snake[id_nprec].sum_x2 ;
        }
      else if (code_segment < 0)
        {
          /* subtract the segment's contributions */
          s_stats_snake[0] -= d_snake[id_nprec].sum_1 ;
          s_stats_snake[1] -= d_snake[id_nprec].sum_x ;
          s_stats_snake[2] -= d_snake[id_nprec].sum_x2 ;
        }
      // else (code_segment == 0): nothing to do

      /* handle the pixels connecting the segments */
      /* start pixel of the current segment np --> np->noeud_suiv */
      /* freeman_out = np->freeman_out ; */
      /* freeman_in = np->noeud_prec->freeman_in ; */
      // node code looked up from the (freeman_in, freeman_out) pair,
      // 8 possible freeman_out values per freeman_in
      pos = d_snake[id_nprecprec].freeman_in*8 + d_snake[id_nprec].freeman_out ;
      code_noeud = TABLE_CODAGE[pos] ;
      // flat index of the node pixel in the cumulated images (row-major,
      // row length l)
      pos = d_snake[id_nprec].posi*l + d_snake[id_nprec].posj ;
      if (code_noeud > 0)
        {
          /* add the pixel's contributions */
          s_stats_snake[0] += 1 + d_snake[id_nprec].posj ;
          s_stats_snake[1] += cumul_x[pos] ;
          s_stats_snake[2] += cumul_x2[pos] ;
        }
      else if (code_noeud < 0)
        {
          /* subtract the pixel's contributions */
          s_stats_snake[0] -= 1 + d_snake[id_nprec].posj ;
          s_stats_snake[1] -= cumul_x[pos] ;
          s_stats_snake[2] -= cumul_x2[pos] ;
        }
      // else (code_pixel == 0): nothing to do
    }
  // publish the totals, then the criterion value.
  // NOTE(review): d_stats_snake[3..5] are read here as inputs (image-wide
  // N, SUM_X, SUM_X2) — they must have been filled before this kernel
  // runs; confirm against the caller.
  d_stats_snake[0] = s_stats_snake[0] ;
  d_stats_snake[1] = s_stats_snake[1] ;
  d_stats_snake[2] = s_stats_snake[2] ;
  *vrais_min = codage_gl_gauss(s_stats_snake[0], s_stats_snake[1], s_stats_snake[2],
                               d_stats_snake[3], d_stats_snake[4], d_stats_snake[5]);
}