X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/snake_gpu.git/blobdiff_plain/935bdd1c8b99ce5d70b6e4a53ca29f06353e8baa..af4b787ce73a80f23e9e2b1ef9ac52660e8ab754:/src/lib_gpu.cu?ds=sidebyside diff --git a/src/lib_gpu.cu b/src/lib_gpu.cu index d6df5ec..1bbd61d 100644 --- a/src/lib_gpu.cu +++ b/src/lib_gpu.cu @@ -233,8 +233,32 @@ void cuda_init_img_cumul(unsigned short ** img_in, int H, int L, int nb_nodes, int dist = 140 ; tic(&chrono, NULL); - if (nb_nodes == 4) genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, 140, H, L) ; - else if (nb_nodes == 40) genere_snake_rectangle_Nnodes_gpu<<< 1, 1>>>(*d_snake, (H+L)/20, H, L) ; + + int MAX_DIAGOS = 1024*65536 ; + int ret, Q = 120 ; + int * d_n_diagos, h_n_diagos; + uint4 * d_diagos_snake ; + uint4 * h_diagos_snake = new uint4[MAX_DIAGOS]; + + + cudaMalloc( (void**) &d_n_diagos, sizeof(int)) ; + cudaMalloc( (void**) &d_diagos_snake, MAX_DIAGOS*sizeof(uint4)) ; + + genere_diagos_rectangle<<<1,1>>>(d_diagos_snake, H,L,Q, d_n_diagos); + + cudaMemcpy( &h_n_diagos, d_n_diagos, sizeof(int), cudaMemcpyDeviceToHost) ; + ret = cudaMemcpy( h_diagos_snake, d_diagos_snake, MAX_DIAGOS*sizeof(uint4), cudaMemcpyDeviceToHost) ; + + toc(chrono, "\tCalcul diagos"); + + printf("COPY : %s, DIAGOS = %d / %d\n", (ret==0)?"OK":"ERREUR", h_n_diagos,MAX_DIAGOS); + for (int d=0; d<200;d++){ + printf("| (%d,%d)-(%d,%d) ", h_diagos_snake[d].x, h_diagos_snake[d].y, h_diagos_snake[d].z, h_diagos_snake[d].w ); + } + + exit(0); + genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, 140, H, L) ; + int nnodes = nb_nodes ; snake_node_gpu * h_snake = new snake_node_gpu[nnodes]; @@ -261,15 +285,19 @@ void cuda_init_img_cumul(unsigned short ** img_in, int H, int L, int nb_nodes, cudaMalloc((void**) &d_stats_ref, 3*nnodes*sizeof(int64)); //DEBUG : pour forcer la mise à zero du tableau intermediaire d_stats_ref + /* int64 h_stats_ref[3*nnodes] ; for (int a=0; a<3*nnodes ; a++) h_stats_ref[a] = 0 ; cudaMemcpy( h_stats_ref, d_stats_ref, sizeof(int64), cudaMemcpyHostToDevice) ; + */ //fin forçage a 0 //DEBUG : pour forcer la mise à zero du tableau intermediaire d_sompart + /* t_sum_x2 h_sompart[ 3*nnodes*bps ] ; for (int a=0; a<3*nnodes*bps ; a++) h_sompart[a] = 0 ; cudaMemcpy( h_sompart, d_sompart, sizeof(t_sum_x2), cudaMemcpyHostToDevice) ; + */ //fin forçage a 0 calcul_contribs_segments_snake<<< nnodes*bps, tpb, (CFI(tpb))*(3*sizeof(t_sum_x2))>>>