From fc4670d0de6814f682df0ce247905cba40b9d547 Mon Sep 17 00:00:00 2001 From: Raphael Couturier Date: Mon, 8 Oct 2012 18:18:13 +0200 Subject: [PATCH 1/1] suite --- BookGPU/Chapters/chapter2/biblio.bib | 19 ++++- BookGPU/Chapters/chapter2/ch2.tex | 58 +++++++++++--- BookGPU/Chapters/chapter2/ex2.cu | 109 +++++++++++++++++++++++++++ 3 files changed, 174 insertions(+), 12 deletions(-) create mode 100644 BookGPU/Chapters/chapter2/ex2.cu diff --git a/BookGPU/Chapters/chapter2/biblio.bib b/BookGPU/Chapters/chapter2/biblio.bib index f52ff92..d577ae8 100644 --- a/BookGPU/Chapters/chapter2/biblio.bib +++ b/BookGPU/Chapters/chapter2/biblio.bib @@ -1,4 +1,4 @@ -@Book{Sanders:2010:CEI, +@Book{ch2:Sanders:2010:CEI, author = "J. Sanders and E. Kandrot", title = "{CUDA} by example: an introduction to general-purpose {GPU} programming", @@ -7,4 +7,19 @@ pages = "xix + 290", year = "2010", LCCN = "QA76.76.A65", -} \ No newline at end of file +} + + +@Article{ch2:journals/ijhpca/Dongarra02, + title = "Basic Linear Algebra Subprograms Technical (Blast) + Forum Standard (1)", + author = "Jack Dongarra", + journal = "IJHPCA", + year = "2002", + number = "1", + volume = "16", + bibdate = "2009-11-11", + bibsource = "DBLP, + http://dblp.uni-trier.de/db/journals/ijhpca/ijhpca16.html#Dongarra02", + pages = "1--111", +} diff --git a/BookGPU/Chapters/chapter2/ch2.tex b/BookGPU/Chapters/chapter2/ch2.tex index 804afc2..a9e9a87 100755 --- a/BookGPU/Chapters/chapter2/ch2.tex +++ b/BookGPU/Chapters/chapter2/ch2.tex @@ -1,6 +1,4 @@ -\chapterauthor{Author Name1}{Affiliation text1} -\chapterauthor{Author Name2}{Affiliation text2} - +\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte} \chapter{Introduction to CUDA} \label{chapter2} @@ -9,7 +7,8 @@ In this chapter we give some simple examples on CUDA programming. The goal is not to provide an exhaustive presentation of all the functionalities of CUDA but rather giving some basic elements. 
Of course, readers that do not know CUDA are -invited to read other books that are specialized on CUDA programming (for example: \cite{Sanders:2010:CEI}). +invited to read other books that are specialized on CUDA programming (for +example: \cite{ch2:Sanders:2010:CEI}). \section{First example} @@ -17,7 +16,8 @@ invited to read other books that are specialized on CUDA programming (for exampl This first example is intented to show how to build a very simple example with CUDA. The goal of this example is to performed the sum of two arrays and putting the result into a third array. A cuda program consists in a C code -which calls CUDA kernels that are executed on a GPU. The listing of this code is in Listing~\ref{ch2:lst:ex1} +which calls CUDA kernels that are executed on a GPU. The listing of this code is +in Listing~\ref{ch2:lst:ex1}. As GPUs have their own memory, the first step consists in allocating memory on @@ -47,11 +47,12 @@ is possible to perform the addition of all elements of the arrays in parallel (if the number of blocks and threads per blocks is sufficient). In Listing\ref{ch2:lst:ex1} at the beginning, a simple kernel, called \texttt{addition} is defined to compute in parallel the summation of the -two arrays. With CUDA, a kernel starts with the keyword \texttt{\_\_global\_\_} \index{CUDA~keywords!\_\_shared\_\_} -which indicates that this kernel can be called from the C code. The first -instruction in this kernel is used to compute the variable \texttt{tid} which -represents the thread index. This thread index\index{thread index} is computed -according to the values of the block index (it is a variable of CUDA +two arrays. With CUDA, a kernel starts with the +keyword \texttt{\_\_global\_\_} \index{CUDA~keywords!\_\_global\_\_} which +indicates that this kernel can be called from the C code. The first instruction +in this kernel is used to compute the variable \texttt{tid} which represents the +thread index. 
This thread index\index{thread index} is computed according to +the values of the block index (it is a variable of CUDA called \texttt{blockIdx}\index{CUDA~keywords!blockIdx}). Blocks of threads can be decomposed into 1 dimension, 2 dimensions or 3 dimensions. According to the dimension of data manipulated, the appropriate dimension can be useful. In our @@ -65,5 +66,42 @@ block. \lstinputlisting[label=ch2:lst:ex1,caption=A simple example]{Chapters/chapter2/ex1.cu} +\section{Second example: using CUBLAS} + +The Basic Linear Algebra Subprograms (BLAS) allow programmers to use efficient +routines that are often used. Those routines are heavily used in many scientific +applications and are highly optimized for vector operations, matrix-vector +operations and matrix-matrix +operations~\cite{ch2:journals/ijhpca/Dongarra02}. Some of those operations seem +to be easy to implement with CUDA. Nevertheless, as soon as a reduction is +needed, implementing an efficient reduction routine with CUDA is far from being +simple. + +In this second example, we consider that we have two vectors $A$ and $B$. First +of all we want to compute the sum of both vectors in a vector $C$. Then we want +to compute the scalar product between $1/C$ and $1/A$. This is just an example +which has no direct interest except to show how to program it with CUDA. + +Listing~\ref{ch2:lst:ex2} shows this example with CUDA. The first kernel for the +addition of two arrays is exactly the same as the one described in the +previous example. + +The kernel to compute the inverse of the elements of an array is very +simple. For each thread index, the inverse of the corresponding element +replaces the initial value in the array. + +In the main function, the beginning is very similar to the one in the previous +example. First, the user is asked for the number of elements. Then a call +to \texttt{cublasCreate} initializes the CUBLAS library. It creates a +handle. 
Then all the arrays are allocated in the host and the device, as in the +previous example. Both arrays $A$ and $B$ are initialized. Then the CPU +computation is performed and the time for this CPU computation is measured. In +order to compute the same result on the GPU, first of all, data from the CPU +need to be copied into the memory of the GPU. For that, it is possible to use +cublas function \texttt{cublasSetVector}. + +\lstinputlisting[label=ch2:lst:ex2,caption=A simple example]{Chapters/chapter2/ex2.cu} + + \putbib[Chapters/chapter2/biblio] diff --git a/BookGPU/Chapters/chapter2/ex2.cu b/BookGPU/Chapters/chapter2/ex2.cu new file mode 100644 index 0000000..762654c --- /dev/null +++ b/BookGPU/Chapters/chapter2/ex2.cu @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include "cutil_inline.h" +#include + + +const int nbThreadsPerBloc=256; + +__global__ +void addition(int size, double *d_C, double *d_A, double *d_B) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if(tid>>(size,d_arrayC,d_arrayA,d_arrayB); + inverse<<>>(size,d_arrayC); + inverse<<>>(size,d_arrayA); + double dot_gpu=0; + stat = cublasDdot(handle,size,d_arrayC,1,d_arrayA,1,&dot_gpu); + + + cutilCheckError(cutStopTimer(timer_gpu)); + printf("GPU processing time : %f (ms) \n", cutGetTimerValue(timer_gpu)); + cutDeleteTimer(timer_gpu); + + cublasGetVector(size,sizeof(double),d_arrayC,1,h_arrayCgpu,1); + + printf("cpu dot %e --- gpu dot %e\n",dot,dot_gpu); + + + cudaFree(d_arrayA); + cudaFree(d_arrayB); + cudaFree(d_arrayC); + free(h_arrayA); + free(h_arrayB); + free(h_arrayC); + free(h_arrayCgpu); + + cublasDestroy(handle); + return 0; + +} -- 2.39.5