ch10

[book_gpu.git] / BookGPU / Chapters / chapter6 / PartieSync.tex
diff --git a/BookGPU/Chapters/chapter6/PartieSync.tex b/BookGPU/Chapters/chapter6/PartieSync.tex

index ac9a3dde235c8ef0f3a8eadae9ecaa1d51b137e2..d213e50234857612b8ab5cf322a870a187b11318 100755 (executable)
--- a/BookGPU/Chapters/chapter6/PartieSync.tex
+++ b/BookGPU/Chapters/chapter6/PartieSync.tex
@@ -142,16 +142,16 @@ int src = ...
  
  // Computation loop (using the GPU)
  for (int i = 0; i < NbIter; i++) {
  
  // Computation loop (using the GPU)
  for (int i = 0; i < NbIter; i++) {
-  cudaMemcpy(gpuInputTabAdr, cpuInputTabAdr,  // Data transfer:
-             sizeof(float)*N,                 // CPU --> GPU (sync. op)
+  cudaMemcpy(gpuInputTabAdr, cpuInputTabAdr, // Data transfer:
+             sizeof(float)*N,                // CPU --> GPU (sync. op)
               cudaMemcpyHostToDevice);
               cudaMemcpyHostToDevice);
-  gpuKernel_k1<<<Dg,Db>>>();                  // GPU comp. (async. op)
-  MPI_Sendrecv_replace(cpuInputTabAdr,        // MPI comms. (sync. op)
+  gpuKernel_k1<<<Dg,Db>>>();                 // GPU comp. (async. op)
+  MPI_Sendrecv_replace(cpuInputTabAdr,       // MPI comms. (sync. op)
                         N,MPI_FLOAT,
                         dest, 0, src, 0, ...);
    // IF there is (now) a result to transfer from the GPU to the CPU:
                         N,MPI_FLOAT,
                         dest, 0, src, 0, ...);
    // IF there is (now) a result to transfer from the GPU to the CPU:
-  cudaMemcpy(cpuResTabAdr + i, gpuResAdr,     // Data transfer:
-             sizeof(float),                   // GPU --> CPU (sync. op)
+  cudaMemcpy(cpuResTabAdr + i, gpuResAdr,    // Data transfer:
+             sizeof(float),                  // GPU --> CPU (sync. op)
               cudaMemcpyDeviceToHost);
  }
  ...
               cudaMemcpyDeviceToHost);
  }
  ...
@@ -258,7 +258,7 @@ omp_set_num_threads(2);
  
      // - Thread 0: performs MPI communications
      if (omp_get_thread_num() == 0) {
  
      // - Thread 0: performs MPI communications
      if (omp_get_thread_num() == 0) {
-      MPI_Sendrecv(current,                   // MPI comms. (sync. op)
+      MPI_Sendrecv(current,                  // MPI comms. (sync. op)
                     N, MPI_FLOAT, dest, 0,
                     future,
                     N, MPI_FLOAT, dest, 0, ...);
                     N, MPI_FLOAT, dest, 0,
                     future,
                     N, MPI_FLOAT, dest, 0, ...);
@@ -266,13 +266,13 @@ omp_set_num_threads(2);
      // - Thread 1: performs the GPU sequence (GPU computations and
      //             CPU/GPU data transfers)
      } else if (omp_get_thread_num() == 1) {
      // - Thread 1: performs the GPU sequence (GPU computations and
      //             CPU/GPU data transfers)
      } else if (omp_get_thread_num() == 1) {
-      cudaMemcpy(gpuInputTabAdr, current,     // Data transfer:
-                 sizeof(float)*N,             // CPU --> GPU (sync. op)
+      cudaMemcpy(gpuInputTabAdr, current,    // Data transfer:
+                 sizeof(float)*N,            // CPU --> GPU (sync. op)
                   cudaMemcpyHostToDevice);
                   cudaMemcpyHostToDevice);
-      gpuKernel_k1<<<Dg,Db>>>();              // GPU comp. (async. op)
-      // IF there is (now) a result to transfer from the GPU to the CPU:
-      cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer:
-                 sizeof(float),               // GPU --> CPU (sync. op)
+      gpuKernel_k1<<<Dg,Db>>>();             // GPU comp. (async. op)
+    // IF there is (now) a result to transfer from the GPU to the CPU:
+      cudaMemcpy(cpuResTabAdr + i, gpuResAdr,// Data transfer:
+                 sizeof(float),              // GPU --> CPU (sync. op)
                   cudaMemcpyDeviceToHost);
      }
  
                   cudaMemcpyDeviceToHost);
      }
  
@@ -393,27 +393,27 @@ omp_set_num_threads(2);
    for (int i = 0; i < NbIter; i++) {
      // - Thread 0: performs MPI communications
      if (omp_get_thread_num() == 0) {
    for (int i = 0; i < NbIter; i++) {
      // - Thread 0: performs MPI communications
      if (omp_get_thread_num() == 0) {
-      MPI_Sendrecv(current,                // MPI comms. (sync. op)
+      MPI_Sendrecv(current,             // MPI comms. (sync. op)
                     N, MPI_FLOAT, dest, 0,
                     future,
                     N, MPI_FLOAT, dest, 0, ...);
                     N, MPI_FLOAT, dest, 0,
                     future,
                     N, MPI_FLOAT, dest, 0, ...);
-    // - Thread 1: performs the streamed GPU sequence (GPU computations
-    //             and CPU/GPU data transfers)
+    // - Thread 1: performs the streamed GPU sequence (GPU 
+    //   computations and CPU/GPU data transfers)
      } else if (omp_get_thread_num() == 1) {
      } else if (omp_get_thread_num() == 1) {
-      for (int s = 0; s < NbS; s++) {      // Start all data transfers:
+      for (int s = 0; s < NbS; s++) {   // Start all data transfers:
          cudaMemcpyAsync(gpuInputTabAdr + s*stride, // CPU --> GPU
                          current + s*stride,        // (async. ops)
                          sizeof(float)*stride,
                          cudaMemcpyHostToDevice,
                          TabS[s]);
        }
          cudaMemcpyAsync(gpuInputTabAdr + s*stride, // CPU --> GPU
                          current + s*stride,        // (async. ops)
                          sizeof(float)*stride,
                          cudaMemcpyHostToDevice,
                          TabS[s]);
        }
-      for (int s = 0; s < NbS; s++) {   // Start all GPU comps. (async.)
+      for (int s = 0; s < NbS; s++) { // Start all GPU comps. (async.)
          gpuKernel_k1<<<Dg, Db, 0, TabS[s]>>>(gpuInputTabAdr + s*stride);
        }
          gpuKernel_k1<<<Dg, Db, 0, TabS[s]>>>(gpuInputTabAdr + s*stride);
        }
-      cudaThreadSynchronize();             // Wait until all threads end
-      // IF there is (now) a result to transfer from the GPU to the CPU:
-      cudaMemcpy(cpuResTabAdr,             // Data transfers:
-                 gpuResAdr,                // GPU --> CPU (sync. op)
+      cudaThreadSynchronize();          // Wait until all threads end
+   // IF there is (now) a result to transfer from the GPU to the CPU:
+      cudaMemcpy(cpuResTabAdr,          // Data transfers:
+                 gpuResAdr,             // GPU --> CPU (sync. op)
                   sizeof(float),
                   cudaMemcpyDeviceToHost);
      }
                   sizeof(float),
                   cudaMemcpyDeviceToHost);
      }
@@ -539,7 +539,7 @@ omp_set_num_threads(3);
      // - Thread 0: performs MPI communications
      if (omp_get_thread_num() == 0) {
        if (i < NbIter) {
      // - Thread 0: performs MPI communications
      if (omp_get_thread_num() == 0) {
        if (i < NbIter) {
-        MPI_Sendrecv(cpuCurrent,              // MPI comms. (sync. op)
+        MPI_Sendrecv(cpuCurrent,            // MPI comms. (sync. op)
                       N, MPI_FLOAT, dest, 0,
                       cpuFuture,
                       N, MPI_FLOAT, dest, 0, ...);
                       N, MPI_FLOAT, dest, 0,
                       cpuFuture,
                       N, MPI_FLOAT, dest, 0, ...);
@@ -547,17 +547,17 @@ omp_set_num_threads(3);
      // - Thread 1: performs the CPU/GPU data transfers
      } else if (omp_get_thread_num() == 1) {
        if (i < NbIter) {
      // - Thread 1: performs the CPU/GPU data transfers
      } else if (omp_get_thread_num() == 1) {
        if (i < NbIter) {
-        cudaMemcpy(gpuFuture, cpuCurrent,     // Data transfer:
-                   sizeof(float)*N,           // CPU --> GPU (sync. op)
+        cudaMemcpy(gpuFuture, cpuCurrent,   // Data transfer:
+                   sizeof(float)*N,         // CPU --> GPU (sync. op)
                     cudaMemcpyHostToDevice);
        }
                     cudaMemcpyHostToDevice);
        }
-    // - Thread 2: performs the GPU computations and the result transfer
+ // - Thread 2: performs the GPU computations and the result transfer
      } else if (omp_get_thread_num() == 2) {
        if (i > 0) {
      } else if (omp_get_thread_num() == 2) {
        if (i > 0) {
-        gpuKernel_k1<<<Dg,Db>>>(gpuCurrent);  // GPU comp. (async. op)
+        gpuKernel_k1<<<Dg,Db>>>(gpuCurrent);// GPU comp. (async. op)
          // IF there is (now) a result to transfer from GPU to CPU:
          // IF there is (now) a result to transfer from GPU to CPU:
-        cudaMemcpy(cpuResTabAdr + (i-1),      // Data transfer:
-                   gpuResAdr, sizeof(float),  // GPU --> CPU (sync. op)
+        cudaMemcpy(cpuResTabAdr + (i-1),    // Data transfer:
+                   gpuResAdr, sizeof(float),// GPU --> CPU (sync. op)
                     cudaMemcpyDeviceToHost);
        }
      }
                     cudaMemcpyDeviceToHost);
        }
      }