%% %\renewcommand{\baselinestretch}{1}
-%% \begin{table}[!h]
-%% \begin{center}
-%% \caption{The average CPU time (sec) of the serial PAVA, MLS and parallel MLS algorithms. } \label{ch11:table1}
-%% \begin{tabular}{|r|r|r|r|}
-
-%% Data & PAVA & MLS & GPU MLS \\ \hline
-
-%% monotone increasing $f$ & & & \\
-%% $n=5\times 10^4$ &0.01&5& 0.092\\
-%% $n=10^5$ &0.03&40& 0.35\\
-%% $n=5\times 10^5$ &0.4&1001&8.6 \\
-%% $n=10^6$ &0.8& 5000& 38 \\
-%% $n=2 \times 10^6$ & 1.6 &-- &152 \\
-%% $n=10 \times 10^6$ & 2 &-- & 3500 \\
-%% $n=20 \times 10^6$ & 4.5&-- & --\\
-%% $n=50 \times 10^6$ & 12 &-- & --\\
-%% \hline
-
-%% constant or decreasing $f$ & & & \\
-%% $n=10^6$ &0.2&0.1& 38\\
-%% $n=10 \times 10^6$ &1.9& 1.9& 3500 \\
-%% $n=20 \times 10^6$ &3.5& 4.0&-- \\
-%% $n=50 \times 10^6$ &11& 11& -- \\
-
-%% \end{tabular}
-%% \end{center}
-%% \end{table}
+\begin{table}[!h]
+\begin{center}
+\caption{The average CPU time (sec) of the serial PAVA, MLS and parallel MLS algorithms. } \label{ch11:table1}
+\begin{tabular}{|r|r|r|r|}
+
+Data & PAVA & MLS & GPU MLS \\ \hline
+
+monotone increasing $f$ & & & \\
+$n=5\times 10^4$ &0.01&5& 0.092\\
+$n=10^5$ &0.03&40& 0.35\\
+$n=5\times 10^5$ &0.4&1001&8.6 \\
+$n=10^6$ &0.8& 5000& 38 \\
+$n=2 \times 10^6$ & 1.6 &-- &152 \\
+$n=10 \times 10^6$ & 2 &-- & 3500 \\
+$n=20 \times 10^6$ & 4.5&-- & --\\
+$n=50 \times 10^6$ & 12 &-- & --\\
+\hline
+
+constant or decreasing $f$ & & & \\
+$n=10^6$ &0.2&0.1& 38\\
+$n=10 \times 10^6$ &1.9& 1.9& 3500 \\
+$n=20 \times 10^6$ &3.5& 4.0&-- \\
+$n=50 \times 10^6$ &11& 11& -- \\
+
+\end{tabular}
+\end{center}
+\end{table}
%% %\renewcommand{\baselinestretch}{2}
\chapter{Introduction to CUDA}
\label{chapter2}
-\section{Introduction}\label{intro}
+\section{Introduction}
+\label{ch2:intro}
+
In this chapter we give some simple examples on CUDA programming. The goal is
not to provide an exhaustive presentation of all the functionalities of CUDA but
rather giving some basic elements. Of course, readers that do not know CUDA are
\section{First example}
+\label{ch2:1ex}
This first example is intented to show how to build a very simple example with
CUDA. The goal of this example is to performed the sum of two arrays and
\lstinputlisting[label=ch2:lst:ex1,caption=A simple example]{Chapters/chapter2/ex1.cu}
\section{Second example: using CUBLAS}
+\label{ch2:2ex}
The Basic Linear Algebra Subprograms (BLAS) allows programmer to use performant
routines that are often used. Those routines are heavily used in many scientific
-applications and are very optimized for vector operations, matrix-vector
+applications and are very optimized for vector operations, matrix-vector
operations and matrix-matrix
-operations~\cite{ch2:journals/ijhpca/Dongarra02}. Some of those operations seem
+operations~\cite{ch2:journals/ijhpca/Dongarra02}. Some of those operations seem
to be easy to implement with CUDA. Nevertheless, as soon as a reduction is
needed, implementing an efficient reduction routines with CUDA is far from being
-simple.
+simple. Roughly speaking, a reduction operation\index{reduction~operation} is an
+operation which combines all the elements of an array and extract a number
+computed with all the elements. For example, a sum, a maximum or a dot product
+are reduction operations.
In this second example, we consider that we have two vectors $A$ and $B$. First
of all, we want to compute the sum of both vectors in a vector $C$. Then we want
source of the array to transfer (in the GPU), the fourth is an offset between
each element of the source (usually this value is set to 1), the fifth is the
destination (in the GPU) and the last is an offset between each element of the
-destination.
-
-\lstinputlisting[label=ch2:lst:ex2,caption=A simple example]{Chapters/chapter2/ex2.cu}
-
+destination. Then we call the kernel \texttt{addition} which computes the sum of
+all elements of arrays $A$ and $B$. The \texttt{inverse} kernel is called twice,
+once to inverse elements of array $C$ and once for $A$. Finally, we call the
+function \texttt{cublasDdot} which computes the dot product of two vectors. To
+use this routine, we must specify the handle initialized by Cuda, the number of
+elements to consider, then each vector is followed by the offset between every
+element. After the GPU computation, it is possible to check that both
+computation produce the same result.
+
+\lstinputlisting[label=ch2:lst:ex2,caption=A simple example with cublas]{Chapters/chapter2/ex2.cu}
+
+\section{Third example: matrix-matrix multiplication}
+\label{ch2:3ex}
+
+
+
+Matrix-matrix multiplication is an operation which is quite easy to parallelize
+with a GPU. If we consider that a matrix is represented using a two dimensional
+array, A[i][j] represents the the element of the $i^{th}$ row and of the
+$j^{th}$ column. In many case, it is easier to manipulate 1D array instead of 2D
+array. With Cuda, even if it is possible to manipulate 2D arrays, in the
+following we present an example based on 1D array. For sake of simplicity we
+consider we have a squared matrix of size \texttt{size}. So with a 1D
+array, \texttt{A[i*size+j]} allows us to access to the element of the $i^{th}$
+row and of the $j^{th}$ column.
+
+On C2070M Tesla card, this code take 37.68ms to perform the multiplication. On a
+Intel Xeon E31245 at 3.30GHz, it takes 2465ms without any parallelization (using
+only one core). Consequently the speed up between the CPU and GPU version is
+about 65 which is very good regarding the difficulty of parallelizing this code.
+
+\lstinputlisting[label=ch2:lst:ex3,caption=simple Matrix-matrix multiplication with cuda]{Chapters/chapter2/ex3.cu}
\putbib[Chapters/chapter2/biblio]
cutilCheckError(cutStopTimer(timer_gpu));
printf("GPU processing time : %f (ms) \n", cutGetTimerValue(timer_gpu));
cutDeleteTimer(timer_gpu);
-
- cublasGetVector(size,sizeof(double),d_arrayC,1,h_arrayCgpu,1);
-
printf("cpu dot %e --- gpu dot %e\n",dot,dot_gpu);
-
cudaFree(d_arrayA);
cudaFree(d_arrayB);
cudaFree(d_arrayC);
--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include "cutil_inline.h"
+#include <cublas_v2.h>
+
+
+const int width=16;
+const int nbTh=width*width;
+
+const int size=1024;
+const int sizeMat=size*size;
+
+
+
+
+__global__
+void matmul(float *d_A, float *d_B, float *d_C) {
+ int i= blockIdx.y*blockDim.y+ threadIdx.y;
+ int j= blockIdx.x*blockDim.x+ threadIdx.x;
+
+ float sum=0;
+ for(int k=0;k<size;k++) {
+ sum+=d_A[i*size+k]*d_B[k*size+j];
+ }
+ d_C[i*size+j]=sum;
+
+}
+
+
+
+
+int main( int argc, char** argv)
+{
+
+ float *h_arrayA=(float*)malloc(sizeMat*sizeof(float));
+ float *h_arrayB=(float*)malloc(sizeMat*sizeof(float));
+ float *h_arrayC=(float*)malloc(sizeMat*sizeof(float));
+ float *h_arrayCgpu=(float*)malloc(sizeMat*sizeof(float));
+
+ float *d_arrayA, *d_arrayB, *d_arrayC;
+
+ cudaMalloc((void**)&d_arrayA,sizeMat*sizeof(float));
+ cudaMalloc((void**)&d_arrayB,sizeMat*sizeof(float));
+ cudaMalloc((void**)&d_arrayC,sizeMat*sizeof(float));
+
+
+ srand48(32);
+
+ for(int i=0;i<sizeMat;i++) {
+ h_arrayA[i]=drand48();
+ h_arrayB[i]=drand48();
+ h_arrayC[i]=0;
+ h_arrayCgpu[i]=0;
+
+ }
+
+ cudaMemcpy(d_arrayA,h_arrayA, sizeMat * sizeof(float), cudaMemcpyHostToDevice);
+ cudaMemcpy(d_arrayB,h_arrayB, sizeMat * sizeof(float), cudaMemcpyHostToDevice);
+ cudaMemcpy(d_arrayC,h_arrayC, sizeMat * sizeof(float), cudaMemcpyHostToDevice);
+
+
+ unsigned int timer_cpu = 0;
+ cutilCheckError(cutCreateTimer(&timer_cpu));
+ cutilCheckError(cutStartTimer(timer_cpu));
+ int sum=0;
+ for(int i=0;i<size;i++) {
+ for(int j=0;j<size;j++) {
+ for(int k=0;k<size;k++) {
+ h_arrayC[size*i+j]+=h_arrayA[size*i+k]*h_arrayB[size*k+j];
+ }
+ }
+ }
+ cutilCheckError(cutStopTimer(timer_cpu));
+ printf("CPU processing time : %f (ms) \n", cutGetTimerValue(timer_cpu));
+ cutDeleteTimer(timer_cpu);
+
+
+
+
+ unsigned int timer_gpu = 0;
+ cutilCheckError(cutCreateTimer(&timer_gpu));
+ cutilCheckError(cutStartTimer(timer_gpu));
+
+
+
+ dim3 dimGrid(size/width,size/width);
+ dim3 dimBlock(width,width);
+
+ printf("%d %d\n",dimGrid.x,dimBlock.x);
+
+ matmul<<<dimGrid,dimBlock>>>(d_arrayA,d_arrayB,d_arrayC);
+ cudaThreadSynchronize();
+
+ cutilCheckError(cutStopTimer(timer_gpu));
+ printf("GPU processing time : %f (ms) \n", cutGetTimerValue(timer_gpu));
+ cutDeleteTimer(timer_gpu);
+
+ cudaMemcpy(h_arrayCgpu,d_arrayC, sizeMat * sizeof(float), cudaMemcpyDeviceToHost);
+
+ int good=1;
+ for(int i=0;i<sizeMat;i++)
+ if (fabs(h_arrayC[i]-h_arrayCgpu[i])>1e-4)
+ printf("%f %f\n",h_arrayC[i],h_arrayCgpu[i]);
+
+
+ cudaFree(d_arrayA);
+ cudaFree(d_arrayB);
+ cudaFree(d_arrayC);
+ free(h_arrayA);
+ free(h_arrayB);
+ free(h_arrayC);
+ free(h_arrayCgpu);
+
+ return 0;
+
+}