AND Private Git Repository - book_gpu.git/blob - BookGPU/Chapters/chapter2/ex3.cu
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
fbdf3a24d3d0b8f70975712be40383111e4bcab0
[book_gpu.git] / BookGPU / Chapters / chapter2 / ex3.cu
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <math.h>
5 #include <assert.h>
6 #include "cutil_inline.h"
7 #include <cublas_v2.h>
8
9
// Launch geometry: thread blocks are width x width threads.
const int width = 16;
const int nbTh  = width * width;   // number of threads in one block

// Problem size: matrices are size x size, stored as flat arrays.
const int size    = 1024;
const int sizeMat = size * size;   // number of elements in one matrix
15
16
17
18
/*
 * Naive dense matrix product: d_C = d_A * d_B.
 *
 * All three matrices are square with dimension `size` (file-level constant)
 * and are indexed as flat row-major arrays (element (r,c) at r*size+c).
 *
 * Launch layout: 2D grid of 2D blocks. Each thread produces exactly one
 * output element; the y coordinate selects the row i and the x coordinate
 * selects the column j. The grid must cover at least size x size threads.
 * Threads that fall outside the matrix do nothing, so the grid does not
 * have to tile the matrix exactly.
 *
 * No shared memory is used.
 *
 * d_A, d_B : input matrices, size*size floats each (read only here)
 * d_C      : output matrix, size*size floats (fully overwritten)
 */
__global__
void matmul(float *d_A, float *d_B, float *d_C) {
        int i = blockIdx.y*blockDim.y + threadIdx.y;   // output row
        int j = blockIdx.x*blockDim.x + threadIdx.x;   // output column

        // Guard against launches whose grid overshoots the matrix;
        // without it such threads would read and write out of bounds.
        if (i >= size || j >= size)
                return;

        // Dot product of row i of A with column j of B.
        float sum = 0.0f;
        for (int k = 0; k < size; k++) {
                sum += d_A[i*size+k]*d_B[k*size+j];
        }
        d_C[i*size+j] = sum;
}
31
32
33
34
/*
 * Abort the program with a diagnostic when a CUDA runtime call fails.
 *
 * status : value returned by the CUDA runtime call
 * what   : short description of the operation, used in the message
 */
static void checkCuda(cudaError_t status, const char *what)
{
        if (status != cudaSuccess) {
                fprintf(stderr, "CUDA error during %s: %s\n",
                        what, cudaGetErrorString(status));
                exit(EXIT_FAILURE);
        }
}

/*
 * Multiplies two size x size random matrices on the CPU and on the GPU
 * (matmul kernel), prints the time taken by each, and prints every pair
 * of result elements that differ by more than 1e-4.
 *
 * Returns 0 on completion; exits with EXIT_FAILURE if a host allocation
 * or a CUDA runtime call fails.
 */
int main( int argc, char** argv)
{
        const size_t bytes = sizeMat*sizeof(float);

        // Host buffers: inputs A and B, CPU result C, GPU result copied back.
        float *h_arrayA=(float*)malloc(bytes);
        float *h_arrayB=(float*)malloc(bytes);
        float *h_arrayC=(float*)malloc(bytes);
        float *h_arrayCgpu=(float*)malloc(bytes);

        if (!h_arrayA || !h_arrayB || !h_arrayC || !h_arrayCgpu) {
                fprintf(stderr, "Host allocation of %lu bytes failed\n",
                        (unsigned long)bytes);
                // free(NULL) is a no-op, so releasing all four is safe.
                free(h_arrayA);
                free(h_arrayB);
                free(h_arrayC);
                free(h_arrayCgpu);
                return EXIT_FAILURE;
        }

        float *d_arrayA, *d_arrayB, *d_arrayC;

        checkCuda(cudaMalloc((void**)&d_arrayA,bytes), "cudaMalloc d_arrayA");
        checkCuda(cudaMalloc((void**)&d_arrayB,bytes), "cudaMalloc d_arrayB");
        checkCuda(cudaMalloc((void**)&d_arrayC,bytes), "cudaMalloc d_arrayC");

        // Fixed seed so that every run uses the same input matrices.
        srand48(32);

        for(int i=0;i<sizeMat;i++) {
                h_arrayA[i]=drand48();
                h_arrayB[i]=drand48();
                h_arrayC[i]=0;
                h_arrayCgpu[i]=0;
        }

        checkCuda(cudaMemcpy(d_arrayA,h_arrayA, bytes, cudaMemcpyHostToDevice),
                  "copy A to device");
        checkCuda(cudaMemcpy(d_arrayB,h_arrayB, bytes, cudaMemcpyHostToDevice),
                  "copy B to device");
        checkCuda(cudaMemcpy(d_arrayC,h_arrayC, bytes, cudaMemcpyHostToDevice),
                  "copy C to device");

        // ---- CPU reference: triple loop accumulating into h_arrayC ----
        unsigned int timer_cpu = 0;
        cutilCheckError(cutCreateTimer(&timer_cpu));
        cutilCheckError(cutStartTimer(timer_cpu));
        for(int i=0;i<size;i++) {
                for(int j=0;j<size;j++) {
                        for(int k=0;k<size;k++) {
                                h_arrayC[size*i+j]+=h_arrayA[size*i+k]*h_arrayB[size*k+j];
                        }
                }
        }
        cutilCheckError(cutStopTimer(timer_cpu));
        printf("CPU processing time : %f (ms) \n", cutGetTimerValue(timer_cpu));
        cutDeleteTimer(timer_cpu);

        // ---- GPU version ----
        unsigned int timer_gpu = 0;
        cutilCheckError(cutCreateTimer(&timer_gpu));
        cutilCheckError(cutStartTimer(timer_gpu));

        // One thread per output element: width x width threads per block,
        // (size/width) x (size/width) blocks.
        dim3 dimGrid(size/width,size/width);
        dim3 dimBlock(width,width);

        printf("%d %d\n",dimGrid.x,dimBlock.x);

        matmul<<<dimGrid,dimBlock>>>(d_arrayA,d_arrayB,d_arrayC);
        // A launch returns no status itself: query launch-configuration
        // errors explicitly, then wait for the kernel so that the timer
        // covers its execution and execution errors are reported here.
        checkCuda(cudaGetLastError(), "matmul launch");
        checkCuda(cudaDeviceSynchronize(), "matmul execution");

        cutilCheckError(cutStopTimer(timer_gpu));
        printf("GPU processing time : %f (ms) \n", cutGetTimerValue(timer_gpu));
        cutDeleteTimer(timer_gpu);

        checkCuda(cudaMemcpy(h_arrayCgpu,d_arrayC, bytes, cudaMemcpyDeviceToHost),
                  "copy C to host");

        // Print every element where CPU and GPU results disagree.
        // NOTE(review): 1e-4 is an absolute tolerance; it may be tight for
        // sums of `size` single-precision products -- confirm it is intended.
        for(int i=0;i<sizeMat;i++)
                if (fabs(h_arrayC[i]-h_arrayCgpu[i])>1e-4)
                        printf("%f %f\n",h_arrayC[i],h_arrayCgpu[i]);

        checkCuda(cudaFree(d_arrayA), "cudaFree d_arrayA");
        checkCuda(cudaFree(d_arrayB), "cudaFree d_arrayB");
        checkCuda(cudaFree(d_arrayC), "cudaFree d_arrayC");
        free(h_arrayA);
        free(h_arrayB);
        free(h_arrayC);
        free(h_arrayCgpu);

        return 0;

}