#include "cutil_inline.h"
#include <cublas_v2.h>
-
const int nbThreadsPerBloc=256;
__global__
int main( int argc, char** argv)
{
-
if(argc!=2) {
printf("usage: ex2 nb_components\n");
exit(0);
}
int size=atoi(argv[1]);
-
cublasStatus_t stat;
cublasHandle_t handle;
stat=cublasCreate(&handle);
-
-
int i;
double *h_arrayA=(double*)malloc(size*sizeof(double));
double *h_arrayB=(double*)malloc(size*sizeof(double));
double *h_arrayCgpu=(double*)malloc(size*sizeof(double));
double *d_arrayA, *d_arrayB, *d_arrayC;
-
cudaMalloc((void**)&d_arrayA,size*sizeof(double));
cudaMalloc((void**)&d_arrayB,size*sizeof(double));
cudaMalloc((void**)&d_arrayC,size*sizeof(double));
h_arrayB[i]=2*(i+1);
}
-
unsigned int timer_cpu = 0;
cutilCheckError(cutCreateTimer(&timer_cpu));
- cutilCheckError(cutStartTimer(timer_cpu));
+ cutilCheckError(cutStartTimer(timer_cpu));
double dot=0;
for(i=0;i<size;i++) {
h_arrayC[i]=h_arrayA[i]+h_arrayB[i];
printf("CPU processing time : %f (ms) \n", cutGetTimerValue(timer_cpu));
cutDeleteTimer(timer_cpu);
-
unsigned int timer_gpu = 0;
cutilCheckError(cutCreateTimer(&timer_gpu));
- cutilCheckError(cutStartTimer(timer_gpu));
+ cutilCheckError(cutStartTimer(timer_gpu));
stat = cublasSetVector(size,sizeof(double),h_arrayA,1,d_arrayA,1);
stat = cublasSetVector(size,sizeof(double),h_arrayB,1,d_arrayB,1);
int nbBlocs=(size+nbThreadsPerBloc-1)/nbThreadsPerBloc;
double dot_gpu=0;
stat = cublasDdot(handle,size,d_arrayC,1,d_arrayA,1,&dot_gpu);
-
cutilCheckError(cutStopTimer(timer_gpu));
printf("GPU processing time : %f (ms) \n", cutGetTimerValue(timer_gpu));
cutDeleteTimer(timer_gpu);
-
- cublasGetVector(size,sizeof(double),d_arrayC,1,h_arrayCgpu,1);
-
printf("cpu dot %e --- gpu dot %e\n",dot,dot_gpu);
-
cudaFree(d_arrayA);
cudaFree(d_arrayB);
cudaFree(d_arrayC);
free(h_arrayB);
free(h_arrayC);
free(h_arrayCgpu);
-
cublasDestroy(handle);
return 0;
-
}