-#include <gpulab/vector.h>\r
-\r
-__global__ void add(double* a, double const* b, int N)\r
-{\r
- int i = blockDim.x*blockIdx.x + threadIdx.x;\r
- if(i<N)\r
- a[i] += b[i];\r
-}\r
-\r
-int main(int argc, char *argv[])\r
-{\r
- int N = 1000;\r
-\r
- // Plain CUDA example\r
- double *a1, *b1;\r
- cudaMalloc((void**)&a1, N*sizeof(double));\r
- cudaMalloc((void**)&b1, N*sizeof(double));\r
- cudaMemset(a1, 2.0, N);\r
- cudaMemset(b1, 3.0, N);\r
- int blocksize = 128;\r
- add<<<(N+blocksize-1)/blocksize,blocksize>>>(a1, b1, N);\r
-\r
- // gpulab example\r
- gpulab::vector<double,gpulab::device_memory> a2(N, 2.0);\r
- gpulab::vector<double,gpulab::device_memory> b2(N, 3.0);\r
- a2.axpy(1.0, b2); // BLAS1: a2 = 1*b2 + a2\r
- \r
- return 0;\r
+#include <gpulab/vector.h>
+#include <cassert>
+#include <vector>
+
+// CUDA kernel: element-wise a[i] += b[i] for i in [0, N)
+__global__ void add(double* a, double const* b, int N)
+{
+    // one thread per element; guard against threads past the end of the arrays
+    int i = blockDim.x*blockIdx.x + threadIdx.x;
+    if(i<N)
+        a[i] += b[i];
+}
+
+int main(int argc, char *argv[])
+{
+    int N = 1000;
+
+    // Basic CUDA example: allocate raw device arrays and launch the kernel by hand
+    double *a1, *b1;
+    cudaMalloc((void**)&a1, N*sizeof(double));
+    cudaMalloc((void**)&b1, N*sizeof(double));
+    // cudaMemset writes byte values only, so it cannot initialise doubles to
+    // 2.0 and 3.0; stage the values on the host and copy them to the device
+    std::vector<double> ha(N, 2.0), hb(N, 3.0);
+    cudaMemcpy(a1, ha.data(), N*sizeof(double), cudaMemcpyHostToDevice);
+    cudaMemcpy(b1, hb.data(), N*sizeof(double), cudaMemcpyHostToDevice);
+    int blocksize = 128;
+    add<<<(N+blocksize-1)/blocksize,blocksize>>>(a1, b1, N);
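+
+    // Optional check, not part of the original example: synchronise, copy the
+    // result back to the host, verify that each entry equals 2.0 + 3.0 = 5.0,
+    // and release the raw device buffers.
+    cudaDeviceSynchronize();
+    std::vector<double> result(N);
+    cudaMemcpy(result.data(), a1, N*sizeof(double), cudaMemcpyDeviceToHost);
+    assert(result[0] == 5.0 && result[N-1] == 5.0);
+    cudaFree(a1);
+    cudaFree(b1);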
+
+    // gpulab example: the same operation using gpulab's device vector class
+    gpulab::vector<double,gpulab::device_memory> a2(N, 2.0);
+    gpulab::vector<double,gpulab::device_memory> b2(N, 3.0);
+    a2.axpy(1.0, b2); // BLAS1: a2 = 1*b2 + a2
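+    // Presumably the gpulab::vector destructor releases its device memory when
+    // it goes out of scope, so no explicit cudaFree is needed for a2 and b2.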
+
+    return 0;