]> AND Private Git Repository - book_gpu.git/blob - BookGPU/Chapters/chapter2/ex2.cu
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
add ch17
[book_gpu.git] / BookGPU / Chapters / chapter2 / ex2.cu
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <math.h>
5 #include <assert.h>
6 #include "cutil_inline.h"
7 #include <cublas_v2.h>
8
9 const int nbThreadsPerBloc=256;
10
11 __global__ 
12 void addition(int size, double *d_C, double *d_A, double *d_B) {
13         int tid = blockIdx.x * blockDim.x + threadIdx.x;
14         if(tid<size) {
15                 d_C[tid]=d_A[tid]+d_B[tid];
16         }
17 }
18
19 __global__ 
20 void inverse(int size, double *d_x) {
21         int tid = blockIdx.x * blockDim.x + threadIdx.x;
22         if(tid<size) {
23                 d_x[tid]=1./d_x[tid];
24         }
25 }
26
27
28 int main( int argc, char** argv) 
29 {
30         if(argc!=2) { 
31                 printf("usage: ex2 nb_components\n");
32                 exit(0);
33         }
34
35         int size=atoi(argv[1]);
36         cublasStatus_t stat;
37         cublasHandle_t handle; 
38         stat=cublasCreate(&handle);
39         int i;
40         double *h_arrayA=(double*)malloc(size*sizeof(double));
41         double *h_arrayB=(double*)malloc(size*sizeof(double));
42         double *h_arrayC=(double*)malloc(size*sizeof(double));
43         double *h_arrayCgpu=(double*)malloc(size*sizeof(double));
44         double *d_arrayA, *d_arrayB, *d_arrayC;
45
46         cudaMalloc((void**)&d_arrayA,size*sizeof(double));
47         cudaMalloc((void**)&d_arrayB,size*sizeof(double));
48         cudaMalloc((void**)&d_arrayC,size*sizeof(double));
49
50         for(i=0;i<size;i++) {
51                 h_arrayA[i]=i+1;
52                 h_arrayB[i]=2*(i+1);
53         }
54
55         unsigned int timer_cpu = 0;
56         cutilCheckError(cutCreateTimer(&timer_cpu));
57   cutilCheckError(cutStartTimer(timer_cpu));
58         double dot=0;
59         for(i=0;i<size;i++) {
60                 h_arrayC[i]=h_arrayA[i]+h_arrayB[i];
61                 dot+=(1./h_arrayC[i])*(1./h_arrayA[i]);
62         }
63         cutilCheckError(cutStopTimer(timer_cpu));
64         printf("CPU processing time : %f (ms) \n", cutGetTimerValue(timer_cpu));
65         cutDeleteTimer(timer_cpu);
66
67         unsigned int timer_gpu = 0;
68         cutilCheckError(cutCreateTimer(&timer_gpu));
69   cutilCheckError(cutStartTimer(timer_gpu));
70         stat = cublasSetVector(size,sizeof(double),h_arrayA,1,d_arrayA,1);
71         stat = cublasSetVector(size,sizeof(double),h_arrayB,1,d_arrayB,1);
72         int nbBlocs=(size+nbThreadsPerBloc-1)/nbThreadsPerBloc;
73
74         addition<<<nbBlocs,nbThreadsPerBloc>>>(size,d_arrayC,d_arrayA,d_arrayB);
75         inverse<<<nbBlocs,nbThreadsPerBloc>>>(size,d_arrayC);
76         inverse<<<nbBlocs,nbThreadsPerBloc>>>(size,d_arrayA);
77         double dot_gpu=0;
78         stat = cublasDdot(handle,size,d_arrayC,1,d_arrayA,1,&dot_gpu);
79
80         cutilCheckError(cutStopTimer(timer_gpu));
81         printf("GPU processing time : %f (ms) \n", cutGetTimerValue(timer_gpu));
82         cutDeleteTimer(timer_gpu);
83         printf("cpu dot %e --- gpu dot %e\n",dot,dot_gpu);
84
85         cudaFree(d_arrayA);
86         cudaFree(d_arrayB);
87         cudaFree(d_arrayC);
88         free(h_arrayA);
89         free(h_arrayB);
90         free(h_arrayC);
91         free(h_arrayCgpu);
92         cublasDestroy(handle);
93         return 0;
94 }