]> AND Private Git Repository - book_gpu.git/blob - BookGPU/Chapters/chapter2/ex3.cu
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
Merge branch 'master' of ssh://info.iut-bm.univ-fcomte.fr/book_gpu
[book_gpu.git] / BookGPU / Chapters / chapter2 / ex3.cu
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <math.h>
5 #include <assert.h>
6 #include "cutil_inline.h"
7 #include <cublas_v2.h>
8
/* Edge length of a square thread block, and the resulting threads per block. */
const int width = 16;
const int nbTh  = width * width;

/* Matrix dimension (matrices are size x size) and total element count. */
const int size    = 1024;
const int sizeMat = size * size;
14
/*
 * Naive matrix product: d_C = d_A * d_B.
 *
 * All three buffers are indexed as size x size arrays stored row by row
 * (element (i, j) lives at index i*size + j), where `size` is the file-level
 * constant.
 *
 * Launch layout: a 2D grid of 2D blocks. Each thread computes one element of
 * d_C; the y index selects the row i and the x index selects the column j.
 * The grid must cover at least size x size threads; threads that fall
 * outside the matrix return without touching memory.
 *
 * d_A, d_B : input matrices (read only by this kernel)
 * d_C      : output matrix (every in-range element is overwritten)
 */
__global__
void matmul(float *d_A, float *d_B, float *d_C) {
        int i = blockIdx.y * blockDim.y + threadIdx.y;
        int j = blockIdx.x * blockDim.x + threadIdx.x;

        /* Guard against grids that overshoot the matrix so that a launch
           configuration not dividing `size` evenly cannot access out of range. */
        if (i >= size || j >= size)
                return;

        /* Dot product of row i of d_A with column j of d_B. */
        float sum = 0.0f;
        for (int k = 0; k < size; k++) {
                sum += d_A[i * size + k] * d_B[k * size + j];
        }
        d_C[i * size + j] = sum;
}
26
/* Print a diagnostic and terminate the program when a CUDA runtime call fails. */
static void checkCudaCall(cudaError_t status, const char *what)
{
        if (status != cudaSuccess) {
                fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
                exit(EXIT_FAILURE);
        }
}

/*
 * Fills two size x size matrices with pseudo-random values, multiplies them
 * once on the CPU and once on the GPU with the matmul kernel, prints the
 * time of each computation, and prints every pair of result elements whose
 * absolute difference exceeds 1e-4.
 *
 * The GPU timer covers the kernel launch up to the following device
 * synchronization; host<->device copies are outside the timed region.
 *
 * Returns 0 on completion; exits with EXIT_FAILURE if a host allocation or a
 * CUDA runtime call fails.
 */
int main( int argc, char** argv)
{
        (void)argc;
        (void)argv;

        const size_t bytes = sizeMat * sizeof(float);

        float *h_arrayA    = (float*)malloc(bytes);
        float *h_arrayB    = (float*)malloc(bytes);
        float *h_arrayC    = (float*)malloc(bytes);   /* CPU reference result */
        float *h_arrayCgpu = (float*)malloc(bytes);   /* result copied back from the GPU */
        if (!h_arrayA || !h_arrayB || !h_arrayC || !h_arrayCgpu) {
                fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
                return EXIT_FAILURE;
        }

        float *d_arrayA, *d_arrayB, *d_arrayC;

        checkCudaCall(cudaMalloc((void**)&d_arrayA, bytes), "cudaMalloc d_arrayA");
        checkCudaCall(cudaMalloc((void**)&d_arrayB, bytes), "cudaMalloc d_arrayB");
        checkCudaCall(cudaMalloc((void**)&d_arrayC, bytes), "cudaMalloc d_arrayC");

        /* Fixed seed so that every run uses the same input matrices. */
        srand48(32);
        for (int i = 0; i < sizeMat; i++) {
                h_arrayA[i] = drand48();
                h_arrayB[i] = drand48();
                h_arrayC[i] = 0;
                h_arrayCgpu[i] = 0;
        }

        checkCudaCall(cudaMemcpy(d_arrayA, h_arrayA, bytes, cudaMemcpyHostToDevice), "copy A to device");
        checkCudaCall(cudaMemcpy(d_arrayB, h_arrayB, bytes, cudaMemcpyHostToDevice), "copy B to device");
        checkCudaCall(cudaMemcpy(d_arrayC, h_arrayC, bytes, cudaMemcpyHostToDevice), "copy C to device");

        /* CPU reference: accumulates directly into h_arrayC, which was zeroed above. */
        unsigned int timer_cpu = 0;
        cutilCheckError(cutCreateTimer(&timer_cpu));
        cutilCheckError(cutStartTimer(timer_cpu));
        for (int i = 0; i < size; i++) {
                for (int j = 0; j < size; j++) {
                        for (int k = 0; k < size; k++) {
                                h_arrayC[size*i+j] += h_arrayA[size*i+k] * h_arrayB[size*k+j];
                        }
                }
        }
        cutilCheckError(cutStopTimer(timer_cpu));
        printf("CPU processing time : %f (ms) \n", cutGetTimerValue(timer_cpu));
        cutilCheckError(cutDeleteTimer(timer_cpu));

        unsigned int timer_gpu = 0;
        cutilCheckError(cutCreateTimer(&timer_gpu));
        cutilCheckError(cutStartTimer(timer_gpu));

        /* One thread per output element: width x width threads per block,
           (size/width) x (size/width) blocks. */
        dim3 dimGrid(size/width, size/width);
        dim3 dimBlock(width, width);

        matmul<<<dimGrid,dimBlock>>>(d_arrayA, d_arrayB, d_arrayC);
        /* A launch does not return a status itself: query it explicitly. */
        checkCudaCall(cudaGetLastError(), "matmul launch");
        /* Wait for the kernel to finish so the timer measures its execution;
           this also reports errors raised while the kernel was running. */
        checkCudaCall(cudaDeviceSynchronize(), "matmul execution");

        cutilCheckError(cutStopTimer(timer_gpu));
        printf("GPU processing time : %f (ms) \n", cutGetTimerValue(timer_gpu));
        cutilCheckError(cutDeleteTimer(timer_gpu));

        checkCudaCall(cudaMemcpy(h_arrayCgpu, d_arrayC, bytes, cudaMemcpyDeviceToHost), "copy C to host");

        /* Report every element where CPU and GPU results disagree.
           NOTE(review): the tolerance is absolute; since each result is a sum
           of `size` products, a relative tolerance may be more appropriate --
           confirm before relying on this check. */
        for (int i = 0; i < sizeMat; i++)
                if (fabs(h_arrayC[i]-h_arrayCgpu[i]) > 1e-4)
                        printf("%f %f\n", h_arrayC[i], h_arrayCgpu[i]);

        checkCudaCall(cudaFree(d_arrayA), "cudaFree d_arrayA");
        checkCudaCall(cudaFree(d_arrayB), "cudaFree d_arrayB");
        checkCudaCall(cudaFree(d_arrayC), "cudaFree d_arrayC");
        free(h_arrayA);
        free(h_arrayB);
        free(h_arrayC);
        free(h_arrayCgpu);
        return 0;
}