6 #include "cutil_inline.h"
// Number of threads in one width x width thread block, which is the block
// shape main() launches with (dimBlock(width,width)). `width` is defined
// outside this excerpt.
11 const int nbTh=width*width;
// Number of elements in one size x size matrix; every host and device
// buffer in main() is allocated and copied as sizeMat floats. `size` is
// defined outside this excerpt.
// NOTE(review): the product is evaluated in int -- confirm `size` is small
// enough that size*size cannot overflow.
14 const int sizeMat=size*size;
// matmul: each thread accumulates the dot product of row i of d_A with
// column j of d_B. Both arrays are indexed as size x size matrices stored
// row by row (element (r,c) lives at r*size+c).
//   d_A, d_B : matrices read by the kernel
//   d_C      : not referenced on the lines visible here -- presumably
//              receives `sum` at (i,j); TODO confirm against the full file.
// NOTE(review): main() launches this with <<<...>>>, which requires the
// __global__ qualifier; it is not visible on this line -- confirm it sits on
// a preceding line.
// NOTE(review): no `i < size && j < size` guard is visible. The launch in
// main() keeps i and j below size, but any launch whose grid overshoots the
// matrix would index out of range.
20 void matmul(float *d_A, float *d_B, float *d_C) {
// Global row index, taken from the y dimension of the launch.
21 int i= blockIdx.y*blockDim.y+ threadIdx.y;
// Global column index, taken from the x dimension of the launch.
22 int j= blockIdx.x*blockDim.x+ threadIdx.x;
// Walk along row i of d_A and down column j of d_B.
25 for(int k=0;k<size;k++) {
// `sum` is declared on a line not visible in this excerpt -- presumably a
// float initialised to zero; TODO confirm.
26 sum+=d_A[i*size+k]*d_B[k*size+j];
// main: multiplies two size x size matrices on the CPU and on the GPU
// (matmul kernel), times both with cutil timers, and prints every element
// where the two results differ by more than 1e-4.
// argc/argv are not used on the lines visible here.
35 int main( int argc, char** argv)
// Host buffers, sizeMat floats each: inputs A and B, the CPU result C, and
// the copy of the GPU result.
// NOTE(review): the malloc results are not checked for NULL.
38 float *h_arrayA=(float*)malloc(sizeMat*sizeof(float));
39 float *h_arrayB=(float*)malloc(sizeMat*sizeof(float));
40 float *h_arrayC=(float*)malloc(sizeMat*sizeof(float));
41 float *h_arrayCgpu=(float*)malloc(sizeMat*sizeof(float));
// Device-side counterparts of A, B and C.
43 float *d_arrayA, *d_arrayB, *d_arrayC;
// NOTE(review): the cudaError_t returned by each cudaMalloc is ignored; a
// failed allocation would only surface later as a confusing failure.
45 cudaMalloc((void**)&d_arrayA,sizeMat*sizeof(float));
46 cudaMalloc((void**)&d_arrayB,sizeMat*sizeof(float));
47 cudaMalloc((void**)&d_arrayC,sizeMat*sizeof(float));
// Fill both input matrices with drand48() values (narrowed from double to
// float on assignment).
// NOTE(review): no initialisation of h_arrayC is visible in this excerpt,
// yet it is copied to the device and accumulated into with += below --
// confirm it is zeroed on a line not shown here.
52 for(int i=0;i<sizeMat;i++) {
53 h_arrayA[i]=drand48();
54 h_arrayB[i]=drand48();
// Upload A, B and the current contents of C to the device.
// NOTE(review): cudaMemcpy return codes are ignored.
60 cudaMemcpy(d_arrayA,h_arrayA, sizeMat * sizeof(float), cudaMemcpyHostToDevice);
61 cudaMemcpy(d_arrayB,h_arrayB, sizeMat * sizeof(float), cudaMemcpyHostToDevice);
62 cudaMemcpy(d_arrayC,h_arrayC, sizeMat * sizeof(float), cudaMemcpyHostToDevice);
// CPU reference: the timer brackets only the triple loop below.
65 unsigned int timer_cpu = 0;
66 cutilCheckError(cutCreateTimer(&timer_cpu));
67 cutilCheckError(cutStartTimer(timer_cpu));
// Row-by-row product: C(i,j) += A(i,k) * B(k,j) for every k.
69 for(int i=0;i<size;i++) {
70 for(int j=0;j<size;j++) {
71 for(int k=0;k<size;k++) {
72 h_arrayC[size*i+j]+=h_arrayA[size*i+k]*h_arrayB[size*k+j];
76 cutilCheckError(cutStopTimer(timer_cpu));
// The timer value is printed with an "(ms)" label.
77 printf("CPU processing time : %f (ms) \n", cutGetTimerValue(timer_cpu));
// Unlike the other timer calls, this one is not wrapped in cutilCheckError.
78 cutDeleteTimer(timer_cpu);
// GPU run: the timer brackets the launch and the synchronisation only; the
// host<->device copies sit outside the timed region.
83 unsigned int timer_gpu = 0;
84 cutilCheckError(cutCreateTimer(&timer_gpu));
85 cutilCheckError(cutStartTimer(timer_gpu));
// One width x width block per tile of the matrix.
// NOTE(review): size/width is integer division, so if size is not a
// multiple of width the trailing rows/columns get no thread and are never
// computed by the kernel -- confirm the values of size and width.
89 dim3 dimGrid(size/width,size/width);
90 dim3 dimBlock(width,width);
// Prints the grid and block extent along x.
92 printf("%d %d\n",dimGrid.x,dimBlock.x);
// NOTE(review): no cudaGetLastError() follows the launch, so a bad launch
// configuration would go unnoticed.
94 matmul<<<dimGrid,dimBlock>>>(d_arrayA,d_arrayB,d_arrayC);
// Kernel launches are asynchronous: wait for completion so the timer
// measures the kernel's execution rather than just the launch call.
// NOTE(review): cudaThreadSynchronize is deprecated in the CUDA runtime in
// favour of cudaDeviceSynchronize; its return value is also ignored here.
95 cudaThreadSynchronize();
97 cutilCheckError(cutStopTimer(timer_gpu));
98 printf("GPU processing time : %f (ms) \n", cutGetTimerValue(timer_gpu));
99 cutDeleteTimer(timer_gpu);
// Download the GPU result into its own host buffer for comparison.
101 cudaMemcpy(h_arrayCgpu,d_arrayC, sizeMat * sizeof(float), cudaMemcpyDeviceToHost);
// Report every element where CPU and GPU disagree by more than 1e-4, as a
// "cpu gpu" pair. Silence means the results agree within that tolerance.
// NOTE(review): the tolerance is absolute only; confirm it is adequate for
// the magnitude of the sums produced at the chosen size.
// NOTE(review): no free()/cudaFree() calls are visible in this excerpt --
// confirm the buffers are released further down.
104 for(int i=0;i<sizeMat;i++)
105 if (fabs(h_arrayC[i]-h_arrayCgpu[i])>1e-4)
106 printf("%f %f\n",h_arrayC[i],h_arrayCgpu[i]);