new

[book_gpu.git] / BookGPU / Chapters / chapter4 / ch4.tex
diff --git a/BookGPU/Chapters/chapter4/ch4.tex b/BookGPU/Chapters/chapter4/ch4.tex

index ea473a15bec738d54b08222571fd45558eab4530..d618753ae9b6fe8a1cf4ab2797c536c8d4bbff1f 100644 (file)
--- a/BookGPU/Chapters/chapter4/ch4.tex
+++ b/BookGPU/Chapters/chapter4/ch4.tex
@@ -70,7 +70,7 @@ For more readability, only part of the connecting lines are shown.
   \begin{figure}
  \centering
     \includegraphics[width=11cm]{Chapters/chapter4/img/convo1.png}
   \begin{figure}
  \centering
     \includegraphics[width=11cm]{Chapters/chapter4/img/convo1.png}
-   \caption{Principle of a generic convolution implementation. The center pixel is represented with a black background and the pixels of its neighborhood are denoted $I_{p,q}$ where $(p,q)$ is the relative position of the neighbor pixel. Elements $h_{t,u}$ are the values of the convolution mask.}
+   \caption[Principle of a generic convolution implementation.]{Principle of a generic convolution implementation. The center pixel is represented with a black background and the pixels of its neighborhood are denoted $I_{p,q}$ where $(p,q)$ is the relative position of the neighbor pixel. Elements $h_{t,u}$ are the values of the convolution mask.}
     \label{fig:convoPrinciple}
  \end{figure}
  \begin{algorithm}
     \label{fig:convoPrinciple}
  \end{figure}
  \begin{algorithm}
@@ -138,7 +138,7 @@ $\mathbf{2048\times 2048}$&1.178&1549 &\bf 3.265&\bf 875 &6.398&529 \\\hline
  $\mathbf{4096\times 4096}$&4.700&1585 &13.05&533     &25.56&533 \\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&4.700&1585 &13.05&533     &25.56&533 \\\hline
  \end{tabular}
  }  
-\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card (fermi architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.}
+\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card (Fermi architecture). Data transfer durations are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoNonSepReg1}
  \end{table} 
  
  \label{tab:convoNonSepReg1}
  \end{table} 
  
@@ -155,7 +155,7 @@ $\mathbf{2048\times 2048}$&0.801&1092 &\bf 2.189&\bf 802 &4.278&573 \\\hline
  $\mathbf{4096\times 4096}$&3.171&1075 &8.720&793 &17.076&569 \\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&3.171&1075 &8.720&793 &17.076&569 \\\hline
  \end{tabular}
  }  
-\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.}
+\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer durations are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoNonSepReg3}
  \end{table}
  
  \label{tab:convoNonSepReg3}
  \end{table}
  
@@ -250,7 +250,7 @@ $\mathbf{2048\times 2048}$&0.495&2071 &\bf 0.987&1666 &1.615&1334 \\\hline
  $\mathbf{4096\times 4096}$&1.964&2138 &3.926&1711 &6.416&1364 \\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&1.964&2138 &3.926&1711 &6.416&1364 \\\hline
  \end{tabular}
  }  
-\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
+\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoGene8x8p}
  \end{table}
   
  \label{tab:convoGene8x8p}
  \end{table}
   
@@ -269,7 +269,7 @@ This proves to be quite efficient and more versatile, but it obviously generates
   \begin{figure}
  \centering
     \includegraphics[width=12cm]{Chapters/chapter4/img/convoShMem.png}
   \begin{figure}
  \centering
     \includegraphics[width=12cm]{Chapters/chapter4/img/convoShMem.png}
-   \caption{Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$. Threads in both top corners of the top figure are identified either by a circle or by a star symbol. The image tile, loaded into shared memory includes the pixels to be updated by the threads of the block, as well as its 2-pixel wide halo. Here, circle and star symbols in the image tile show which pixels are actually loaded into one shared memory vector by its corresponding thread. }
+   \caption[Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$.]{Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$. Threads in both top corners of the top figure are identified either by a circle or by a star symbol. The image tile, loaded into shared memory, includes the pixels to be updated by the threads of the block, as well as its 2-pixel wide halo. Here, circle and star symbols in the image tile show which pixels are actually loaded into one shared memory vector by its corresponding thread. }
     \label{fig:ShMem1}
  \end{figure}
  Still, we also implemented this method, in a similar manner as Nvidia did in its SDK sample code.
     \label{fig:ShMem1}
  \end{figure}
  Still, we also implemented this method, in a similar manner as Nvidia did in its SDK sample code.
@@ -304,7 +304,7 @@ $\mathbf{2048\times 2048}$&2023 &\bf 1586 &1172 &818&676&554\\\hline
  $\mathbf{4096\times 4096}$&2090 &1637 &1195     &830&684&561\\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&2090 &1637 &1195     &830&684&561\\\hline
  \end{tabular}
  }  
-\caption{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
+\caption[Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoGeneSh2}
  \end{table} 
  \lstinputlisting[label={lst:convoGeneSh1},caption=CUDA kernel achieving a generic convolution operation after a preloading of data in shared memory.]{Chapters/chapter4/code/convoGeneSh1.cu}
  \label{tab:convoGeneSh2}
  \end{table} 
  \lstinputlisting[label={lst:convoGeneSh1},caption=CUDA kernel achieving a generic convolution operation after a preloading of data in shared memory.]{Chapters/chapter4/code/convoGeneSh1.cu}
@@ -354,7 +354,7 @@ $\mathbf{1024\times 1024}$&0.306 &0.333 &\bf 0.333 &\bf 0.378&\bf 0.404&\bf 0.46
  $\mathbf{2048\times 2048}$&1.094 &1.191 &\bf 1.260 &\bf 1.444&\bf 1.545&\bf 1.722\\\hline
  $\mathbf{4096\times 4096}$&4.262 &4.631 &\bf 5.000 &\bf 5.676&\bf 6.105&\bf 6.736\\\hline
  \end{tabular}}  
  $\mathbf{2048\times 2048}$&1.094 &1.191 &\bf 1.260 &\bf 1.444&\bf 1.545&\bf 1.722\\\hline
  $\mathbf{4096\times 4096}$&4.262 &4.631 &\bf 5.000 &\bf 5.676&\bf 6.105&\bf 6.736\\\hline
  \end{tabular}}  
-\caption{Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run  on a C2070 card. Timings include data copy. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones.}
+\caption[Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run  on a C2070 card.]{Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run  on a C2070 card. Timings include data copy. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones.}
  \label{tab:convoSepSh1}
  \end{table}
  \begin{table}[h]
  \label{tab:convoSepSh1}
  \end{table}
  \begin{table}[h]
@@ -369,7 +369,7 @@ $\mathbf{2048\times 2048}$&1598 &1541 &\bf 1503 &\bf 1410&\bf 1364&\bf 1290\\\hl
  $\mathbf{4096\times 4096}$&1654 &1596 &\bf 1542 &\bf 1452&\bf 1400&\bf 1330\\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&1654 &1596 &\bf 1542 &\bf 1452&\bf 1400&\bf 1330\\\hline
  \end{tabular}
  }  
-\caption{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
+\caption[Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoSepSh2}
  \end{table} 
  
  \label{tab:convoSepSh2}
  \end{table}