X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/blobdiff_plain/0d39f3bfb1736ae41805f75a779e0bb01f4f5139..55ce7168c6e69a2462d76c95dc9a5298ceedb04f:/BookGPU/Chapters/chapter4/ch4.tex?ds=inline diff --git a/BookGPU/Chapters/chapter4/ch4.tex b/BookGPU/Chapters/chapter4/ch4.tex index 90612c9..be254fe 100644 --- a/BookGPU/Chapters/chapter4/ch4.tex +++ b/BookGPU/Chapters/chapter4/ch4.tex @@ -14,7 +14,7 @@ operation} basically consists of taking the sum of products of elements from two 2D functions, letting one of the two functions move over every element of the other, producing a third function that is typically viewed as a modified version of one of the original functions. To -begin with, we shall examine non separable or generic convolutions, +begin with, we shall examine nonseparable or generic convolutions, before addressing the matter of separable convolutions. We shall refer to $I$ as an $H\times L$ pixel gray-level image and to $I(x,y)$ as the gray-level value of each pixel of coordinates $(x,y)$. @@ -113,7 +113,7 @@ $\mathbf{2048\times 2048}$&1.178&1549 &\bf 3.265&\bf 875 &6.398&529 \\\hline $\mathbf{4096\times 4096}$&4.700&1585 &13.05&533 &25.56&533 \\\hline \end{tabular} } -\caption[Timings (time) and throughput values (TP in MP/s) of one register-only non-separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a C2070 card.]{Timings (time) and throughput values (TP in MPx/s) of one register-only non-separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a C2070 card (fermi architecture). Data transfer duration are those of Table \ref{tab:memcpy1}. The bold value points out the result obtained in the reference situation.} +\caption[Timings (time) and throughput values (TP in MP/s) of one register-only nonseparable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a C2070 card.]{Timings (time) and throughput values (TP in MPx/s) of one register-only nonseparable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a C2070 card (fermi architecture). Data transfer duration are those of Table \ref{tab:memcpy1}. The bold value points out the result obtained in the reference situation.} \label{tab:convoNonSepReg1} \end{table} @@ -142,7 +142,7 @@ $\mathbf{2048\times 2048}$&0.801&1092 &\bf 2.189&\bf 802 &4.278&573 \\\hline $\mathbf{4096\times 4096}$&3.171&1075 &8.720&793 &17.076&569 \\\hline \end{tabular} } -\caption[Timings (time) and throughput values (TP in MP/s) of one register-only non-separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a GTX280.]{Timings (time) and throughput values (TP in MP/s) of one register-only non-separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer duration are those of Table \ref{tab:memcpy1}. The bold value points out the result obtained in the reference situation.} +\caption[Timings (time) and throughput values (TP in MP/s) of one register-only nonseparable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a GTX280.]{Timings (time) and throughput values (TP in MP/s) of one register-only nonseparable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer duration are those of Table \ref{tab:memcpy1}. The bold value points out the result obtained in the reference situation.} \label{tab:convoNonSepReg3} \end{table} @@ -239,7 +239,7 @@ However, our technique requires writing one kernel per mask size, which can be s \lstinputlisting[label={lst:convoGene8x8pL3},caption=CUDA kernel achieving a $3\times 3$ convolution operation with the mask in symbol memory and direct data fetches in texture memory]{Chapters/chapter4/code/convoGene8x8pL3.cu} -\subsection{Using shared memory to store prefetched data\index{prefetching}.} +\subsection{Using shared memory to store prefetched data\index{prefetching}} \index{memory hierarchy!shared memory} A more convenient way of coding a convolution kernel is to use shared memory to perform a prefetching stage of the whole halo before computing the convolution sums. This proves to be quite efficient and more versatile, but it obviously generates some overhead because @@ -302,7 +302,7 @@ This saves a lot of arithmetic operations, as a generic $n\times n$ convolution However, besides reducing the operation count, performing a separable convolution also means writing an intermediate image into global memory. CPU implementations of separable convolutions often use a single function to perform both 1D convolution stages. To do so, this function reads the input image and actually ouputs the transposed filtered image. -Applying this principle to GPUs is not efficient, as outputting the transposed image means non coalescent writes into global memory, generating severe performance loss. Hence the idea of developing two different kernels, one for each of the vertical and horizontal convolutions. +Applying this principle to GPUs is not efficient, as outputting the transposed image means noncoalescent writes into global memory, generating severe performance loss. Hence the idea of developing two different kernels, one for each of the vertical and horizontal convolutions. Here, the use of shared memory is the best choice, as there is no overlapping between neighbor windows and thus no possible optimization. Moreover, to ensure efficiency, it is important to read the input image from texture memory, which implies an internal GPU data copy between both 1D convolution stages. @@ -322,7 +322,7 @@ $\mathbf{1024\times 1024}$&0.306 &0.333 &\bf 0.333 &\bf 0.378&\bf 0.404&\bf 0.46 $\mathbf{2048\times 2048}$&1.094 &1.191 &\bf 1.260 &\bf 1.444&\bf 1.545&\bf 1.722\\\hline $\mathbf{4096\times 4096}$&4.262 &4.631 &\bf 5.000 &\bf 5.676&\bf 6.105&\bf 6.736\\\hline \end{tabular}} -\caption[Performances, in milliseconds, of our generic 8 pixels per thread 1D convolution kernels using shared memory, run on a C2070 card.]{Performances, in milliseconds, of our generic 8 pixels per thread 1D convolution kernels using shared memory, run on a C2070 card. Timings include data copy. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones.} +\caption[Performances, in milliseconds, of our generic 8 pixels per thread 1D convolution kernels using shared memory, run on a C2070 card.]{Performances, in milliseconds, of our generic 8 pixels per thread 1D convolution kernels using shared memory, run on a C2070 card. Timings include data copy. Bold values correspond to situations where separable-convolution kernels run faster than nonseparable ones.} \label{tab:convoSepSh1} \end{table} \begin{table}[h] @@ -337,7 +337,7 @@ $\mathbf{2048\times 2048}$&1598 &1541 &\bf 1503 &\bf 1410&\bf 1364&\bf 1290\\\hl $\mathbf{4096\times 4096}$&1654 &1596 &\bf 1542 &\bf 1452&\bf 1400&\bf 1330\\\hline \end{tabular} } -\caption[Throughput values, in megapixel per second, of our generic 8 pixels per thread 1D convolution kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1D convolution kernel using shared memory, run on a C2070 card. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones (data transfer durations are those of Table \ref{tab:memcpy1}).} +\caption[Throughput values, in megapixel per second, of our generic 8 pixels per thread 1D convolution kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1D convolution kernel using shared memory, run on a C2070 card. Bold values correspond to situations where separable-convolution kernels run faster than nonseparable ones (data transfer durations are those of Table \ref{tab:memcpy1}).} \label{tab:convoSepSh2} \end{table} \begin{table}[h]