# compilers
# NOTE(review): the '+' variant of NVCC is a bare command name, so it relies on nvcc being found on PATH.
CC = gcc
-NVCC = /usr/local/cuda/bin/nvcc
+NVCC = nvcc
CXX = g++
# compilation options
# libraries used when compiling/linking
# NOTE(review): the '+' variant of LIBSNV hard-codes a CUDA 3.1 toolkit directory; presumably site-specific, confirm before reuse.
LIB_CC = -lm
-LIBSNV = -L/usr/local/cuda/lib64 -lcuda -lcudart
+LIBSNV = -L/cm/shared/apps/cuda31/toolkit/3.1/lib64 -lcuda -lcudart
# sources needed to build the main programs
SRCS = lib_alloc.c lib_images.c lib_math.c lib_snake_common.c lib_contour.c
# --use_fast_math
# --ptxas-options=-v
# Pattern rule: compile $(PATH_SRC)<name>_gpu.cu into $(PATH_LIB)<name>_gpu.o with $(NVCC).
# The two variants differ only in the -arch value (sm_20 vs sm_13).
$(PATH_LIB)%_gpu.o : $(PATH_SRC)%_gpu.cu
- $(NVCC) -arch=sm_20 --use_fast_math -c $< -o $@
+ $(NVCC) -arch=sm_13 --use_fast_math -c $< -o $@
# Pattern rule: compile $(PATH_SRC)<name>.c into $(PATH_LIB)<name>.o with $(CC); also depends on $(DEPS).
$(PATH_LIB)%.o : $(PATH_SRC)%.c $(DEPS)
$(CC) $(OPTION_CC) -c $< -o $@
int dist = 140 ;
tic(&chrono, NULL);
- if (nb_nodes == 4) genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, 140, H, L) ;
- else if (nb_nodes == 40) genere_snake_rectangle_Nnodes_gpu<<< 1, 1>>>(*d_snake, (H+L)/20, H, L) ;
+
+ int MAX_DIAGOS = 1024*65536 ;
+ int ret, Q = 100 ;
+ uint4 * d_diagos_snake ;
+ uint4 * h_diagos_snake = new uint4[MAX_DIAGOS];
+
+
+ ret = cudaMalloc( (void**) &d_diagos_snake, MAX_DIAGOS*sizeof(uint4)) ;
+
+ genere_diagos_rectangle<<<1,1>>>(d_diagos_snake, H,L,Q);
+
+ ret = cudaMemcpy( h_diagos_snake, d_diagos_snake, MAX_DIAGOS*sizeof(uint4), cudaMemcpyDeviceToHost) ;
+ //cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidMemcpyDirection
+
+ printf("COPY : %d, MAX_DIAGOS = %d\n", ret, MAX_DIAGOS);
+ for (int d=0; d<200;d++){
+ printf("| (%d,%d)-(%d,%d) ", h_diagos_snake[d].x, h_diagos_snake[d].y, h_diagos_snake[d].z, h_diagos_snake[d].w );
+ }
+
+ exit(0);
+ genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, 140, H, L) ;
+
int nnodes = nb_nodes ;
snake_node_gpu * h_snake = new snake_node_gpu[nnodes];
cudaMalloc((void**) &d_stats_ref, 3*nnodes*sizeof(int64));
//DEBUG : pour forcer la mise à zéro du tableau intermédiaire d_stats_ref
+ /*
int64 h_stats_ref[3*nnodes] ;
for (int a=0; a<3*nnodes ; a++) h_stats_ref[a] = 0 ;
cudaMemcpy( h_stats_ref, d_stats_ref, sizeof(int64), cudaMemcpyHostToDevice) ;
+ */
//fin forçage à 0
//DEBUG : pour forcer la mise à zéro du tableau intermédiaire d_sompart
+ /*
t_sum_x2 h_sompart[ 3*nnodes*bps ] ;
for (int a=0; a<3*nnodes*bps ; a++) h_sompart[a] = 0 ;
cudaMemcpy( h_sompart, d_sompart, sizeof(t_sum_x2), cudaMemcpyHostToDevice) ;
+ */
//fin forçage à 0
calcul_contribs_segments_snake<<< nnodes*bps, tpb, (CFI(tpb))*(3*sizeof(t_sum_x2))>>>
}
}
/**
 * Enumerates rectangle diagonals on a regular q-by-q grid of positions and
 * writes them to d_diagos, one uint4 per diagonal.
 *
 * The grid steps are inci = h / q and incj = l / q (integer division, so any
 * remainder is dropped). For every index pair iM < iN and jM < jN, all taken
 * in [0, q), one entry is stored as:
 *   .x = iM * inci, .y = jM * incj   first end of the diagonal (north-west point)
 *   .z = iN * inci, .w = jN * incj   second end of the diagonal (south-east point)
 *
 * Entries are written contiguously from index 0 in nested-loop order
 * iM, jM, iN, jN (jN varies fastest).
 *
 * @param d_diagos  output buffer; the kernel writes (q*(q-1)/2)^2 entries and
 *                  has no way to check the capacity, so the caller must
 *                  provide at least that many elements
 * @param h         extent divided by q to obtain the step of the x/z fields
 *                  (presumably the image height; confirm against the caller)
 * @param l         extent divided by q to obtain the step of the y/w fields
 *                  (presumably the image width; confirm against the caller)
 * @param q         number of grid positions per axis; nothing is written when
 *                  q < 2
 *
 * Launch layout: the enumeration is serial. Only the first thread of the
 * first block does the work, so <<<1,1>>> is the expected configuration and
 * any additional threads return immediately. No shared memory is used.
 */
__global__ void genere_diagos_rectangle(uint4 * d_diagos, int h, int l, int q){
    // The running output index makes this loop nest inherently sequential;
    // let a single thread execute it instead of having every launched thread
    // redo the same writes to the same slots.
    const bool first_thread =
        blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 &&
        threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0;
    if (!first_thread) return;

    // With fewer than two positions per axis no diagonal exists. Returning
    // here also keeps q == 0 from reaching the divisions below.
    if (q < 2) return;

    const int inci = h / q;
    const int incj = l / q;

    // size_t: the number of entries grows like q^4 / 4 and would overflow a
    // signed 32-bit counter for moderately large q.
    size_t idxDiago = 0;

    // Outer double loop: position of the north-west end of the diagonal.
    for (int iM = 0; iM < q - 1; iM++) {
        const unsigned int xM = iM * inci;
        for (int jM = 0; jM < q - 1; jM++) {
            const unsigned int yM = jM * incj;
            // Inner double loop: position of the south-east end of the diagonal.
            for (int iN = iM + 1; iN < q; iN++) {
                const unsigned int xN = iN * inci;
                for (int jN = jM + 1; jN < q; jN++) {
                    const unsigned int yN = jN * incj;
                    // One 16-byte store per diagonal instead of four component stores.
                    d_diagos[idxDiago] = make_uint4(xM, yM, xN, yN);
                    idxDiago++;
                }
            }
        }
    }
}
__global__ void genere_snake_rectangle_Nnodes_gpu(snake_node_gpu * d_snake, int dist_bords, int i_dim, int j_dim){
int nb_node_seg = 9 ;