X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/blobdiff_plain/44b8f845847505b81dc0f1199c49e67a495ed7a0..17d1891ee5feec4f52ef9c51bfa60b78f0bd14c2:/BookGPU/Chapters/chapter6/PartieSync.tex?ds=sidebyside diff --git a/BookGPU/Chapters/chapter6/PartieSync.tex b/BookGPU/Chapters/chapter6/PartieSync.tex index ac9a3dd..d213e50 100755 --- a/BookGPU/Chapters/chapter6/PartieSync.tex +++ b/BookGPU/Chapters/chapter6/PartieSync.tex @@ -142,16 +142,16 @@ int src = ... // Computation loop (using the GPU) for (int i = 0; i < NbIter; i++) { - cudaMemcpy(gpuInputTabAdr, cpuInputTabAdr, // Data transfer: - sizeof(float)*N, // CPU --> GPU (sync. op) + cudaMemcpy(gpuInputTabAdr, cpuInputTabAdr, // Data transfer: + sizeof(float)*N, // CPU --> GPU (sync. op) cudaMemcpyHostToDevice); - gpuKernel_k1<<>>(); // GPU comp. (async. op) - MPI_Sendrecv_replace(cpuInputTabAdr, // MPI comms. (sync. op) + gpuKernel_k1<<>>(); // GPU comp. (async. op) + MPI_Sendrecv_replace(cpuInputTabAdr, // MPI comms. (sync. op) N,MPI_FLOAT, dest, 0, src, 0, ...); // IF there is (now) a result to transfer from the GPU to the CPU: - cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer: - sizeof(float), // GPU --> CPU (sync. op) + cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer: + sizeof(float), // GPU --> CPU (sync. op) cudaMemcpyDeviceToHost); } ... @@ -258,7 +258,7 @@ omp_set_num_threads(2); // - Thread 0: achieves MPI communications if (omp_get_thread_num() == 0) { - MPI_Sendrecv(current, // MPI comms. (sync. op) + MPI_Sendrecv(current, // MPI comms. (sync. op) N, MPI_FLOAT, dest, 0, future, N, MPI_FLOAT, dest, 0, ...); @@ -266,13 +266,13 @@ omp_set_num_threads(2); // - Thread 1: achieves the GPU sequence (GPU computations and // CPU/GPU data transfers) } else if (omp_get_thread_num() == 1) { - cudaMemcpy(gpuInputTabAdr, current, // Data transfer: - sizeof(float)*N, // CPU --> GPU (sync. op) + cudaMemcpy(gpuInputTabAdr, current, // Data transfer: + sizeof(float)*N, // CPU --> GPU (sync. op) cudaMemcpyHostToDevice); - gpuKernel_k1<<>>(); // GPU comp. (async. op) - // IF there is (now) a result to transfer from the GPU to the CPU: - cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer: - sizeof(float), // GPU --> CPU (sync. op) + gpuKernel_k1<<>>(); // GPU comp. (async. op) + // IF there is (now) a result to transfer from the GPU to the CPU: + cudaMemcpy(cpuResTabAdr + i, gpuResAdr,// Data transfer: + sizeof(float), // GPU --> CPU (sync. op) cudaMemcpyDeviceToHost); } @@ -393,27 +393,27 @@ omp_set_num_threads(2); for (int i = 0; i < NbIter; i++) { // - Thread 0: achieves MPI communications if (omp_get_thread_num() == 0) { - MPI_Sendrecv(current, // MPI comms. (sync. op) + MPI_Sendrecv(current, // MPI comms. (sync. op) N, MPI_FLOAT, dest, 0, future, N, MPI_FLOAT, dest, 0, ...); - // - Thread 1: achieves the streamed GPU sequence (GPU computations - // and CPU/GPU data transfers) + // - Thread 1: achieves the streamed GPU sequence (GPU + // computations and CPU/GPU data transfers) } else if (omp_get_thread_num() == 1) { - for (int s = 0; s < NbS; s++) { // Start all data transfers: + for (int s = 0; s < NbS; s++) { // Start all data transfers: cudaMemcpyAsync(gpuInputTabAdr + s*stride, // CPU --> GPU current + s*stride, // (async. ops) sizeof(float)*stride, cudaMemcpyHostToDevice, TabS[s]); } - for (int s = 0; s < NbS; s++) { // Start all GPU comps. (async.) + for (int s = 0; s < NbS; s++) { // Start all GPU comps. (async.) gpuKernel_k1<<>>(gpuInputTabAdr + s*stride); } - cudaThreadSynchronize(); // Wait all threads are ended - // IF there is (now) a result to transfer from the GPU to the CPU: - cudaMemcpy(cpuResTabAdr, // Data transfers: - gpuResAdr, // GPU --> CPU (sync. op) + cudaThreadSynchronize(); // Wait all threads are ended + // IF there is (now) a result to transfer from the GPU to the CPU: + cudaMemcpy(cpuResTabAdr, // Data transfers: + gpuResAdr, // GPU --> CPU (sync. op) sizeof(float), cudaMemcpyDeviceToHost); } @@ -539,7 +539,7 @@ omp_set_num_threads(3); // - Thread 0: achieves MPI communications if (omp_get_thread_num() == 0) { if (i < NbIter) { - MPI_Sendrecv(cpuCurrent, // MPI comms. (sync. op) + MPI_Sendrecv(cpuCurrent, // MPI comms. (sync. op) N, MPI_FLOAT, dest, 0, cpuFuture, N, MPI_FLOAT, dest, 0, ...); @@ -547,17 +547,17 @@ omp_set_num_threads(3); // - Thread 1: achieves the CPU/GPU data transfers } else if (omp_get_thread_num() == 1) { if (i < NbIter) { - cudaMemcpy(gpuFuture, cpuCurrent, // Data transfer: - sizeof(float)*N, // CPU --> GPU (sync. op) + cudaMemcpy(gpuFuture, cpuCurrent, // Data transfer: + sizeof(float)*N, // CPU --> GPU (sync. op) cudaMemcpyHostToDevice); } - // - Thread 2: achieves the GPU computations and the result transfer + // - Thread 2: achieves the GPU computations and the result transfer } else if (omp_get_thread_num() == 2) { if (i > 0) { - gpuKernel_k1<<>>(gpuCurrent); // GPU comp. (async. op) + gpuKernel_k1<<>>(gpuCurrent);// GPU comp. (async. op) // IF there is (now) a result to transfer from GPU to CPU: - cudaMemcpy(cpuResTabAdr + (i-1), // Data transfer: - gpuResAdr, sizeof(float), // GPU --> CPU (sync. op) + cudaMemcpy(cpuResTabAdr + (i-1), // Data transfer: + gpuResAdr, sizeof(float),// GPU --> CPU (sync. op) cudaMemcpyDeviceToHost); } }