// Computation loop (using the GPU), with no host-side threading: the steps
// of each of the NbIter iterations are issued one after another. Per
// iteration: upload N floats to the GPU, launch the kernel, exchange the host
// input buffer over MPI, then download one float of result into slot i.
// Only the kernel launch is asynchronous, so the MPI exchange is the only
// step that can overlap with the GPU computation.
for (int i = 0; i < NbIter; i++) {
- cudaMemcpy(gpuInputTabAdr, cpuInputTabAdr, // Data transfer:
- sizeof(float)*N, // CPU --> GPU (sync. op)
+ cudaMemcpy(gpuInputTabAdr, cpuInputTabAdr, // Data transfer:
+ sizeof(float)*N, // CPU --> GPU (sync. op)
cudaMemcpyHostToDevice);
// The upload above is blocking, so cpuInputTabAdr may safely be overwritten
// in place by the MPI exchange below while the kernel is still running.
- gpuKernel_k1<<<Dg,Db>>>(); // GPU comp. (async. op)
- MPI_Sendrecv_replace(cpuInputTabAdr, // MPI comms. (sync. op)
+ gpuKernel_k1<<<Dg,Db>>>(); // GPU comp. (async. op)
+ MPI_Sendrecv_replace(cpuInputTabAdr, // MPI comms. (sync. op)
N,MPI_FLOAT,
dest, 0, src, 0, ...);
// IF there is (now) a result to transfer from the GPU to the CPU:
// (blocking copy issued after the kernel on the default stream, so it also
// acts as the synchronization point for the kernel launched above)
- cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer:
- sizeof(float), // GPU --> CPU (sync. op)
+ cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer:
+ sizeof(float), // GPU --> CPU (sync. op)
cudaMemcpyDeviceToHost);
}
...
// Two-thread version: the work of one iteration is split by OpenMP thread
// number so that the MPI exchange and the GPU sequence run concurrently.
// NOTE(review): this fragment uses the iteration index `i`, but the enclosing
// loop and the OpenMP parallel region are not visible here -- confirm.
// - Thread 0: performs the MPI communications
if (omp_get_thread_num() == 0) {
// Sends `current` and receives into `future`; thread 1 only reads `current`,
// so the two threads do not write to a shared buffer in this fragment.
- MPI_Sendrecv(current, // MPI comms. (sync. op)
+ MPI_Sendrecv(current, // MPI comms. (sync. op)
N, MPI_FLOAT, dest, 0,
future,
// NOTE(review): the receive side also names `dest`, whereas the
// MPI_Sendrecv_replace loop receives from `src` -- confirm the intended peer.
N, MPI_FLOAT, dest, 0, ...);
// - Thread 1: performs the GPU sequence (GPU computations and
// CPU/GPU data transfers)
} else if (omp_get_thread_num() == 1) {
- cudaMemcpy(gpuInputTabAdr, current, // Data transfer:
- sizeof(float)*N, // CPU --> GPU (sync. op)
+ cudaMemcpy(gpuInputTabAdr, current, // Data transfer:
+ sizeof(float)*N, // CPU --> GPU (sync. op)
cudaMemcpyHostToDevice);
- gpuKernel_k1<<<Dg,Db>>>(); // GPU comp. (async. op)
- // IF there is (now) a result to transfer from the GPU to the CPU:
- cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer:
- sizeof(float), // GPU --> CPU (sync. op)
+ gpuKernel_k1<<<Dg,Db>>>(); // GPU comp. (async. op)
+ // IF there is (now) a result to transfer from the GPU to the CPU:
+ cudaMemcpy(cpuResTabAdr + i, gpuResAdr,// Data transfer:
+ sizeof(float), // GPU --> CPU (sync. op)
cudaMemcpyDeviceToHost);
}
// Streamed two-thread version: thread 0 exchanges data over MPI while
// thread 1 drives the GPU through the NbS CUDA streams stored in TabS. The
// input is cut into NbS chunks of `stride` floats; each chunk is uploaded
// and processed in its own stream.
// NOTE(review): the closing brace of this loop and the enclosing OpenMP
// parallel region are not visible in this fragment.
for (int i = 0; i < NbIter; i++) {
// - Thread 0: performs the MPI communications
if (omp_get_thread_num() == 0) {
- MPI_Sendrecv(current, // MPI comms. (sync. op)
+ MPI_Sendrecv(current, // MPI comms. (sync. op)
N, MPI_FLOAT, dest, 0,
future,
N, MPI_FLOAT, dest, 0, ...);
- // - Thread 1: performs the streamed GPU sequence (GPU computations
- // and CPU/GPU data transfers)
+ // - Thread 1: performs the streamed GPU sequence (GPU
+ // computations and CPU/GPU data transfers)
} else if (omp_get_thread_num() == 1) {
// NOTE(review): `current` must be page-locked host memory (cudaMallocHost /
// cudaHostAlloc) for these copies to be truly asynchronous -- its allocation
// is not visible here.
- for (int s = 0; s < NbS; s++) { // Start all data transfers:
+ for (int s = 0; s < NbS; s++) { // Start all data transfers:
cudaMemcpyAsync(gpuInputTabAdr + s*stride, // CPU --> GPU
current + s*stride, // (async. ops)
sizeof(float)*stride,
cudaMemcpyHostToDevice,
TabS[s]);
}
// Each kernel is queued in the same stream as the upload of its chunk, so it
// starts only after that chunk has arrived on the GPU.
- for (int s = 0; s < NbS; s++) { // Start all GPU comps. (async.)
+ for (int s = 0; s < NbS; s++) { // Start all GPU comps. (async.)
gpuKernel_k1<<<Dg, Db, 0, TabS[s]>>>(gpuInputTabAdr + s*stride);
}
// cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize();
// it blocks until the work queued in every stream has completed.
// NOTE(review): the result is copied to cpuResTabAdr without the `+ i`
// offset used by the non-streamed versions, so every iteration overwrites
// the same element -- confirm this is intended.
- cudaDeviceSynchronize(); // Wait until all streams have finished
- // IF there is (now) a result to transfer from the GPU to the CPU:
- cudaMemcpy(cpuResTabAdr, // Data transfers:
- gpuResAdr, // GPU --> CPU (sync. op)
+ cudaDeviceSynchronize(); // Wait until all streams have finished
+ // IF there is (now) a result to transfer from the GPU to the CPU:
+ cudaMemcpy(cpuResTabAdr, // Data transfers:
+ gpuResAdr, // GPU --> CPU (sync. op)
sizeof(float),
cudaMemcpyDeviceToHost);
}
// Three-thread pipelined version: in iteration i, thread 0 exchanges data
// over MPI and thread 1 uploads cpuCurrent into gpuFuture (both only while
// i < NbIter), while thread 2 (only when i > 0) runs the kernel on gpuCurrent
// and stores its result in slot i-1.
// NOTE(review): the enclosing loop, the OpenMP parallel region and the
// swapping of the current/future buffers are not visible in this fragment;
// presumably `i` runs from 0 to NbIter inclusive so that the last
// computation gets drained -- confirm.
// - Thread 0: performs the MPI communications
if (omp_get_thread_num() == 0) {
if (i < NbIter) {
- MPI_Sendrecv(cpuCurrent, // MPI comms. (sync. op)
+ MPI_Sendrecv(cpuCurrent, // MPI comms. (sync. op)
N, MPI_FLOAT, dest, 0,
cpuFuture,
N, MPI_FLOAT, dest, 0, ...);
}
// (The brace above closes `if (i < NbIter)`. Without it the `else if` chain
// below binds to that inner test inside the thread-0 branch, so the thread-1
// and thread-2 branches could never execute.)
// - Thread 1: performs the CPU/GPU data transfers
} else if (omp_get_thread_num() == 1) {
if (i < NbIter) {
- cudaMemcpy(gpuFuture, cpuCurrent, // Data transfer:
- sizeof(float)*N, // CPU --> GPU (sync. op)
+ cudaMemcpy(gpuFuture, cpuCurrent, // Data transfer:
+ sizeof(float)*N, // CPU --> GPU (sync. op)
cudaMemcpyHostToDevice);
}
- // - Thread 2: performs the GPU computations and the result transfer
+ // - Thread 2: performs the GPU computations and the result transfer
} else if (omp_get_thread_num() == 2) {
if (i > 0) {
- gpuKernel_k1<<<Dg,Db>>>(gpuCurrent); // GPU comp. (async. op)
+ gpuKernel_k1<<<Dg,Db>>>(gpuCurrent);// GPU comp. (async. op)
// IF there is (now) a result to transfer from GPU to CPU:
- cudaMemcpy(cpuResTabAdr + (i-1), // Data transfer:
- gpuResAdr, sizeof(float), // GPU --> CPU (sync. op)
+ cudaMemcpy(cpuResTabAdr + (i-1), // Data transfer:
+ gpuResAdr, sizeof(float),// GPU --> CPU (sync. op)
cudaMemcpyDeviceToHost);
}
}