X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/blobdiff_plain/6e645292c9053655e4e44bbebf1c1eb751a554a6..a60840091cd318f2c8e530740dd7940fc393e898:/BookGPU/Chapters/chapter8/ch8.tex?ds=inline diff --git a/BookGPU/Chapters/chapter8/ch8.tex b/BookGPU/Chapters/chapter8/ch8.tex index a2529b1..5d012eb 100644 --- a/BookGPU/Chapters/chapter8/ch8.tex +++ b/BookGPU/Chapters/chapter8/ch8.tex @@ -1,6 +1,6 @@ -\chapterauthor{Imen Chakroun}{Universit\'e Lille 1 CNRS/LIFL, INRIA Lille Nord Europe, Cit\'e scientifique - 59655, Villeneuve d'Ascq cedex, France\\} -\chapterauthor{Nouredine Melab}{Universit\'e Lille 1 CNRS/LIFL, INRIA Lille Nord Europe, Cit\'e scientifique - 59655, Villeneuve d'Ascq cedex, France\\} +\chapterauthor{Imen Chakroun and Nouredine Melab}{University of Lille 1 CNRS/LIFL, INRIA Lille Nord Europe, Cit\'e scientifique, 59655 Villeneuve d'Ascq cedex, France\\} +%\chapterauthor{Nouredine Melab}{Universit\'e Lille 1 CNRS/LIFL, INRIA Lille Nord Europe, Cit\'e scientifique - 59655, Villeneuve d'Ascq cedex, France\\} \chapter{GPU-accelerated Tree-based Exact Optimization Methods} \label{ch8:GPU-accelerated-tree-based-exact-optimization-methods} @@ -254,11 +254,12 @@ In the following, we present how we dealt with the thread/branch divergence issu \vspace{-0.4cm} -\section{Thread divergence \index{Thread divergence}} +\section{Thread divergence} +\label{ch8:ThreadDivergence} \subsection{The thread divergence issue} -During the execution of an application on GPU, to each GPU multiprocessor is assigned one or more thread block(s) to execute. Those threads are partitioned into warps that get scheduled for execution. For each instruction of the flow, the multiprocessor selects a warp that is ready to be run. A warp executes one common instruction at a time, so full efficiency is realized when all threads of a warp agree on their execution path. In this chapter, the G80 model, in which a warp is a pool of 32 threads, is used. If threads of a warp diverge via a data-dependent conditional branch, the warp serially executes each branch path taken. Threads that are not on the taken path are disabled, and when all paths complete, the threads converge back to the same execution path. This phenomenon is called thread/branch divergence and often causes serious performance degradations. Branch divergence occurs only within a warp; different warps execute independently regardless of whether they are executing common or disjointed code paths. +During the execution of an application on GPU, to each GPU multiprocessor is assigned one or more thread block(s) to execute. Those threads are partitioned into warps that get scheduled for execution. For each instruction of the flow, the multiprocessor selects a warp that is ready to be run. A warp executes one common instruction at a time, so full efficiency is realized when all threads of a warp agree on their execution path. In this chapter, the G80 model, in which a warp is a pool of 32 threads, is used. If threads of a warp diverge via a data-dependent conditional branch, the warp serially executes each branch path taken. Threads that are not on the taken path are disabled, and when all paths complete, the threads converge back to the same execution path. This phenomenon is called thread/branch divergence\index{Thread divergence} and often causes serious performance degradations. Branch divergence occurs only within a warp; different warps execute independently regardless of whether they are executing common or disjointed code paths. \vspace{0.2cm} @@ -499,7 +500,7 @@ To reduce the computation time cost of the term $\min\limits_{(i,j)\in \jmath^2, MM & $m \times (m-1)$ & $m \times (m-1)$ \\ \hline \end{tabular} - \caption{The different data structures of the $LB$ algorithm and their associated complexities in memory size and numbers of accesses. The parameters $n$, $m$ and $n'$ designate respectively the total number of jobs, the total number of machines and the number of remaining jobs to be scheduled for the sub-problems the lower bound is being computed.} + \caption[The different data structures of the $LB$ algorithm and their associated complexities in memory size and numbers of accesses.]{The different data structures of the $LB$ algorithm and their associated complexities in memory size and numbers of accesses. The parameters $n$, $m$ and $n'$ designate respectively the total number of jobs, the total number of machines and the number of remaining jobs to be scheduled for the sub-problems the lower bound is being computed.} \label{ch8:tabMemComplex} \end{table} @@ -534,7 +535,7 @@ $50 \times 20$ & 9.500 (9.5KB) & 9.500 (19KB) & 1.000 (1KB) & 20 (0.04KB) & 380 $20 \times 20$ & 3.800 (3.8KB) & 3.800 (7.6KB) & 400 (0.4KB) & 20 (0.04KB) & 380 (0.76KB) \\ \hline \end{tabular} -\caption{The sizes of each data structure for the different experimented problem instances. The sizes are given in number of elements and in bytes (between brackets).} +\caption[The sizes of each data structure for the different experimented problem instances.]{The sizes of each data structure for the different experimented problem instances. The sizes are given in number of elements and in bytes (between brackets).} \label{ch8:tabMemSizes} \end{table} @@ -765,7 +766,7 @@ $20 \times $20 & 41.94 & \textbf{60.10} & 48.28 & 39.86 & 39.61 & 38.93 & 37.79 % \hline % \hline \end{tabular} - \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. $PTM$ is placed in shared memory and all others are placed in global memory.} + \caption[Speedup for different FSP instances and pool sizes obtained with data access optimization.]{Speedup for different FSP instances and pool sizes obtained with data access optimization. $PTM$ is placed in shared memory and all others are placed in global memory.} \label{ch8:PTM-on-SM} \end{table} @@ -795,7 +796,7 @@ $20 \times $20 & 49.00 & \textbf{60.25} & 55.50 & 45.88 & 44.47 & 43.11 & 42.82 % \hline % \hline \end{tabular} - \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. + \caption[Speedup for different FSP instances and pool sizes obtained with data access optimization.]{Speedup for different FSP instances and pool sizes obtained with data access optimization. $JM$ is placed in shared memory and all others are placed in global memory.} \label{ch8:JM-on-SM} \end{table} @@ -826,7 +827,7 @@ $20 \times $20 & 53.64 & \textbf{61.47} & 59.55 & 51.39 & 47.40 & 46.53 & 46.37\ % \hline % \hline \end{tabular} - \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. $PTM$ and $JM$ are placed together in shared memory and all others are placed in global memory.} + \caption[Speedup for different FSP instances and pool sizes obtained with data access optimization.]{Speedup for different FSP instances and pool sizes obtained with data access optimization. $PTM$ and $JM$ are placed together in shared memory and all others are placed in global memory.} \label{ch8:JM-PTM-on-SM} \end{table}