\usepackage{subfigure}
%\usepackage{epsfig}
\usepackage{makeidx}
+\usepackage{listings}
+\usepackage{caption}
+\usepackage{courier}
+\usepackage{color}
\usepackage[sectionbib]{bibunits}
\usepackage{multicol}
\frenchspacing
\makeatother
+
+
+
+ % Global appearance of all source-code listings (listings package).
+ \lstset{
+ basicstyle=\footnotesize\ttfamily, % default font
+ %numbers=left, % position of the line numbers
+ numberstyle=\tiny, % style of the line numbers
+ %stepnumber=2, % step between two printed line numbers
+ numbersep=5pt, % distance between line numbers and code
+ tabsize=2, % width of a tab
+ extendedchars=true, %
+ breaklines=true, % wrap long lines
+ keywordstyle=\color{red},
+ frame=b,
+ % keywordstyle=[1]\textbf, % style of the keywords
+ % keywordstyle=[2]\textbf, %
+ % keywordstyle=[3]\textbf, %
+ % keywordstyle=[4]\textbf, %
+ stringstyle=\color{blue}\ttfamily, % string color; must be visible since no background color is set
+ showspaces=false, % show spaces?
+ showtabs=false, % show tabs?
+ xleftmargin=17pt,
+ framexleftmargin=17pt,
+ framexrightmargin=5pt,
+ framexbottommargin=4pt,
+ %backgroundcolor=\color{lightgray},
+ showstringspaces=false % show spaces inside strings?
+ }
+ % Preload the listings language definitions; only C is enabled, the
+ % commented-out entries are other candidates.
+ \lstloadlanguages{% Check documentation for further languages ...
+ %[Visual]Basic
+ %Pascal
+ C
+ %C++
+ %XML
+ %HTML
+ %Java
+ }
+ %\DeclareCaptionFont{blue}{\color{blue}}
+
+ %\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}}
+% Caption font named "white": typesets the caption in white.
+\DeclareCaptionFont{white}{\color{white}}
+% Caption format named "listing": label, separator and text (#1#2#3) are put,
+% indented by 15pt, in a \textwidth-wide box on a CMYK colored background.
+\DeclareCaptionFormat{listing}{\colorbox[cmyk]{0.43, 0.35, 0.35,0.01}{\parbox{\textwidth}{\hspace{15pt}#1#2#3}}}
+% Use that format, with white bold footnotesize text, for lstlisting captions.
+\captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white, singlelinecheck=false, margin=0pt, font={bf,footnotesize}}
+
+
+
\makeindex
\begin{document}
--- /dev/null
+@Book{Sanders:2010:CEI,
+ author = "J. Sanders and E. Kandrot",
+ title = "{CUDA} by example: an introduction to general-purpose
+ {GPU} programming",
+ publisher = "Ad{\-d}i{\-s}on-Wes{\-l}ey",
+ address = "Upper Saddle River, NJ, USA",
+ pages = "xix + 290",
+ year = "2010",
+ LCCN = "QA76.76.A65",
+}
\ No newline at end of file
In this chapter we give some simple examples on CUDA programming. The goal is
not to provide an exhaustive presentation of all the functionalities of CUDA but
rather to give some basic elements. Of course, readers who do not know CUDA are
-invited to read other books that are specialized on CUDA programming.
+invited to read other books that specialize in CUDA programming (for example, \cite{Sanders:2010:CEI}).
\section{First example}
This first example is intended to show how to build a very simple program with
CUDA. The goal of this example is to compute the sum of two arrays and
put the result into a third array. A CUDA program consists of C code
-which calls CUDA kernels that are executed on a GPU.
+which calls CUDA kernels that are executed on a GPU. This code is given in Listing~\ref{ch2:lst:ex1}.
As GPUs have their own memory, the first step consists in allocating memory on
function is the destination array, the second is the source array and the third
is the number of elements to copy (expressed in bytes).
-\putbib[biblio]
+Now the GPU contains the data needed to perform the addition. In a sequential
+program, such an addition is achieved with a loop over all the elements. With a
+GPU, it is possible to perform the addition of all the elements of the arrays in
+parallel (if the number of blocks and threads per block is sufficient). At the
+beginning of Listing~\ref{ch2:lst:ex1}, a simple kernel
+called \texttt{addition} is defined to compute in parallel the sum of the
+two arrays. With CUDA, a kernel starts with the keyword \texttt{\_\_global\_\_}
+which indicates that this kernel can be called from the C code. The first
+instruction in this kernel is used to compute \texttt{tid}, which
+represents the thread index. This thread index is computed according to the
+values of the block index (a CUDA variable
+called \texttt{blockIdx}\index{CUDA~keywords!blockIdx}). Blocks of threads can
+be decomposed into 1 dimension, 2 dimensions or 3 dimensions. Depending on the
+dimensionality of the data manipulated, the appropriate number of dimensions
+can be chosen. In our example, only one dimension is used. Then, using the
+notation \texttt{.x}, we can access the first dimension (\texttt{.y}
+and \texttt{.z} give access to the second and third dimensions, respectively). The
+variable \texttt{blockDim}\index{CUDA~keywords!blockDim} gives the size of each
+block.
+
+
+
+\lstinputlisting[label=ch2:lst:ex1,caption=A simple example]{Chapters/chapter2/ex1.cu}
+
+\putbib[Chapters/chapter2/biblio]
--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include "cutil_inline.h"
+
+const int nbThreadsPerBloc=256;
+
+/*
+ * Element-wise sum of two integer arrays: d_C[tid] = d_A[tid] + d_B[tid].
+ * Each thread handles the single element whose index is its global thread
+ * index tid; threads whose tid is not below size do nothing.
+ */
+__global__
+void addition(int size, int *d_C, int *d_A, int *d_B) {
+  const int tid = threadIdx.x + blockDim.x * blockIdx.x;
+  /* the grid may hold more threads than elements */
+  if(tid>=size)
+    return;
+  d_C[tid]=d_A[tid]+d_B[tid];
+}
+
+
+/*
+ * Host driver of the example: adds two integer arrays of nb_components
+ * elements on the CPU and on the GPU, prints the time of both versions and
+ * asserts that the two results are identical.
+ *
+ * Usage: ex1 nb_components
+ * Returns 0 on success. Exits with EXIT_FAILURE on a usage error, on a
+ * non-positive size or when a host allocation fails.
+ */
+int main( int argc, char** argv)
+{
+
+  if(argc!=2) {
+    printf("usage: ex1 nb_components\n");
+    exit(EXIT_FAILURE);
+  }
+
+  int size=atoi(argv[1]);
+  /* atoi gives 0 for non-numeric input; a size <= 0 would produce an
+     empty grid, which is not a valid kernel launch */
+  if(size<=0) {
+    printf("nb_components must be a positive integer\n");
+    exit(EXIT_FAILURE);
+  }
+
+  int i;
+  /* host arrays: inputs A and B, CPU result C, copy of the GPU result */
+  int *h_arrayA=(int*)malloc(size*sizeof(int));
+  int *h_arrayB=(int*)malloc(size*sizeof(int));
+  int *h_arrayC=(int*)malloc(size*sizeof(int));
+  int *h_arrayCgpu=(int*)malloc(size*sizeof(int));
+  if(h_arrayA==NULL || h_arrayB==NULL || h_arrayC==NULL || h_arrayCgpu==NULL) {
+    printf("host memory allocation failed\n");
+    exit(EXIT_FAILURE);
+  }
+
+  /* device arrays; cutilSafeCall (cutil_inline.h) is expected to report
+     a failing CUDA call and stop the program */
+  int *d_arrayA, *d_arrayB, *d_arrayC;
+  cutilSafeCall(cudaMalloc((void**)&d_arrayA,size*sizeof(int)));
+  cutilSafeCall(cudaMalloc((void**)&d_arrayB,size*sizeof(int)));
+  cutilSafeCall(cudaMalloc((void**)&d_arrayC,size*sizeof(int)));
+
+  for(i=0;i<size;i++) {
+    h_arrayA[i]=i;
+    h_arrayB[i]=2*i;
+  }
+
+  /* reference computation on the CPU, timed */
+  unsigned int timer_cpu = 0;
+  cutilCheckError(cutCreateTimer(&timer_cpu));
+  cutilCheckError(cutStartTimer(timer_cpu));
+  for(i=0;i<size;i++) {
+    h_arrayC[i]=h_arrayA[i]+h_arrayB[i];
+  }
+  cutilCheckError(cutStopTimer(timer_cpu));
+  printf("CPU processing time : %f (ms) \n", cutGetTimerValue(timer_cpu));
+  cutDeleteTimer(timer_cpu);
+
+  /* GPU computation; the timed region includes both transfers */
+  unsigned int timer_gpu = 0;
+  cutilCheckError(cutCreateTimer(&timer_gpu));
+  cutilCheckError(cutStartTimer(timer_gpu));
+  cutilSafeCall(cudaMemcpy(d_arrayA,h_arrayA, size * sizeof(int), cudaMemcpyHostToDevice));
+  cutilSafeCall(cudaMemcpy(d_arrayB,h_arrayB, size * sizeof(int), cudaMemcpyHostToDevice));
+
+  /* round up so that every element gets a thread */
+  int nbBlocs=(size+nbThreadsPerBloc-1)/nbThreadsPerBloc;
+
+  addition<<<nbBlocs,nbThreadsPerBloc>>>(size,d_arrayC,d_arrayA,d_arrayB);
+  /* a kernel launch returns no status: query the last error explicitly */
+  cutilCheckMsg("addition kernel launch failed");
+
+  cutilSafeCall(cudaMemcpy(h_arrayCgpu,d_arrayC, size * sizeof(int), cudaMemcpyDeviceToHost));
+
+  cutilCheckError(cutStopTimer(timer_gpu));
+  printf("GPU processing time : %f (ms) \n", cutGetTimerValue(timer_gpu));
+  cutDeleteTimer(timer_gpu);
+
+  /* the GPU result must match the CPU reference exactly */
+  for(i=0;i<size;i++)
+    assert(h_arrayC[i]==h_arrayCgpu[i]);
+
+  cutilSafeCall(cudaFree(d_arrayA));
+  cutilSafeCall(cudaFree(d_arrayB));
+  cutilSafeCall(cudaFree(d_arrayC));
+  free(h_arrayA);
+  free(h_arrayB);
+  free(h_arrayC);
+  free(h_arrayCgpu);
+
+  return 0;
+
+}
pdflatex ${BOOK}
bibtex bu1
bibtex bu2
- bibtex bu3
makeindex ${BOOK}.idx
pdflatex ${BOOK}
pdflatex ${BOOK}