]> AND Private Git Repository - book_gpu.git/blob - BookGPU/Chapters/chapter2/ex2.cu
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
762654ca88ba6d5346274e86d5bc168ba3f20b04
[book_gpu.git] / BookGPU / Chapters / chapter2 / ex2.cu
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <math.h>
5 #include <assert.h>
6 #include "cutil_inline.h"
7 #include <cublas_v2.h>
8
9
10 const int nbThreadsPerBloc=256;
11
12 __global__ 
13 void addition(int size, double *d_C, double *d_A, double *d_B) {
14         int tid = blockIdx.x * blockDim.x + threadIdx.x;
15         if(tid<size) {
16                 d_C[tid]=d_A[tid]+d_B[tid];
17         }
18 }
19
20 __global__ 
21 void inverse(int size, double *d_x) {
22         int tid = blockIdx.x * blockDim.x + threadIdx.x;
23         if(tid<size) {
24                 d_x[tid]=1./d_x[tid];
25         }
26 }
27
28
29 int main( int argc, char** argv) 
30 {
31
32         if(argc!=2) { 
33                 printf("usage: ex2 nb_components\n");
34                 exit(0);
35         }
36
37         int size=atoi(argv[1]);
38
39         cublasStatus_t stat;
40         cublasHandle_t handle; 
41         stat=cublasCreate(&handle);
42
43
44         int i;
45         double *h_arrayA=(double*)malloc(size*sizeof(double));
46         double *h_arrayB=(double*)malloc(size*sizeof(double));
47         double *h_arrayC=(double*)malloc(size*sizeof(double));
48         double *h_arrayCgpu=(double*)malloc(size*sizeof(double));
49         double *d_arrayA, *d_arrayB, *d_arrayC;
50
51
52         cudaMalloc((void**)&d_arrayA,size*sizeof(double));
53         cudaMalloc((void**)&d_arrayB,size*sizeof(double));
54         cudaMalloc((void**)&d_arrayC,size*sizeof(double));
55
56         for(i=0;i<size;i++) {
57                 h_arrayA[i]=i+1;
58                 h_arrayB[i]=2*(i+1);
59         }
60
61
62         unsigned int timer_cpu = 0;
63         cutilCheckError(cutCreateTimer(&timer_cpu));
64   cutilCheckError(cutStartTimer(timer_cpu));
65         double dot=0;
66         for(i=0;i<size;i++) {
67                 h_arrayC[i]=h_arrayA[i]+h_arrayB[i];
68                 dot+=(1./h_arrayC[i])*(1./h_arrayA[i]);
69         }
70         cutilCheckError(cutStopTimer(timer_cpu));
71         printf("CPU processing time : %f (ms) \n", cutGetTimerValue(timer_cpu));
72         cutDeleteTimer(timer_cpu);
73
74
75         unsigned int timer_gpu = 0;
76         cutilCheckError(cutCreateTimer(&timer_gpu));
77   cutilCheckError(cutStartTimer(timer_gpu));
78         stat = cublasSetVector(size,sizeof(double),h_arrayA,1,d_arrayA,1);
79         stat = cublasSetVector(size,sizeof(double),h_arrayB,1,d_arrayB,1);
80         int nbBlocs=(size+nbThreadsPerBloc-1)/nbThreadsPerBloc;
81
82         addition<<<nbBlocs,nbThreadsPerBloc>>>(size,d_arrayC,d_arrayA,d_arrayB);
83         inverse<<<nbBlocs,nbThreadsPerBloc>>>(size,d_arrayC);
84         inverse<<<nbBlocs,nbThreadsPerBloc>>>(size,d_arrayA);
85         double dot_gpu=0;
86         stat = cublasDdot(handle,size,d_arrayC,1,d_arrayA,1,&dot_gpu);
87
88
89         cutilCheckError(cutStopTimer(timer_gpu));
90         printf("GPU processing time : %f (ms) \n", cutGetTimerValue(timer_gpu));
91         cutDeleteTimer(timer_gpu);
92         
93         cublasGetVector(size,sizeof(double),d_arrayC,1,h_arrayCgpu,1);
94
95         printf("cpu dot %e --- gpu dot %e\n",dot,dot_gpu);
96
97
98         cudaFree(d_arrayA);
99         cudaFree(d_arrayB);
100         cudaFree(d_arrayC);
101         free(h_arrayA);
102         free(h_arrayB);
103         free(h_arrayC);
104         free(h_arrayCgpu);
105
106         cublasDestroy(handle);
107         return 0;
108
109 }