#include <gpulab/vector.h>

// CUDA kernel: element-wise a[i] += b[i], one thread per element
__global__ void add(double* a, double const* b, int N)
{
    int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i < N) a[i] += b[i];
}

int main(int argc, char *argv[])
{
    int N = 1000; // problem size (example value)

    // Plain CUDA example
    double *a1, *b1;
    cudaMalloc((void**)&a1, N*sizeof(double));
    cudaMalloc((void**)&b1, N*sizeof(double));
    cudaMemset(a1, 2.0, N); // NB: cudaMemset sets bytes, not double values
    cudaMemset(b1, 3.0, N);
    int blocksize = 128;
    add<<<(N+blocksize-1)/blocksize, blocksize>>>(a1, b1, N);

    // gpulab example
    gpulab::vector<double, gpulab::device_memory> a2(N, 2.0);
    gpulab::vector<double, gpulab::device_memory> b2(N, 3.0);
    a2.axpy(1.0, b2); // BLAS1: a2 = 1*b2 + a2

    return 0;
}