1 #include <gpulab/vector.h>
// Kernel `add` (fragment): only the signature and the index computation are
// shown in this listing; the braces and the statement that uses `i` are elided.
// Parameters as declared: `a` is a writable double pointer, `b` points to
// const double, `N` is an int.
// NOTE(review): presumably a[i] += b[i] for i < N, mirroring the axpy call
// further down — the body is not visible here, confirm.
3 __global__ void add(double* a, double const* b, int N)
// Flat 1D index of this thread: block offset (blockDim.x*blockIdx.x) plus the
// thread's position inside its block (threadIdx.x).
// NOTE(review): the launch in main rounds the grid size up, so `i` can be >= N
// in the last block; the elided body needs an `if (i < N)` guard — confirm.
5 int i = blockDim.x*blockIdx.x + threadIdx.x;
// Host entry point (fragment). The visible lines allocate two raw buffers,
// fill them, launch `add` on them, and then repeat the operation with
// gpulab::vector objects. Declarations of N, blocksize, a1 and b1 are in
// lines not shown in this listing.
10 int main(int argc, char *argv[])
// Request N*sizeof(double) bytes for each of a1 and b1.
// NOTE(review): the cudaError_t return values are discarded here and on the
// cudaMemset calls below; a failed allocation would go unnoticed.
16 cudaMalloc((void**)&a1, N*sizeof(double));
17 cudaMalloc((void**)&b1, N*sizeof(double));
// NOTE(review): cudaMemset is byte-wise. The value argument is an int that is
// written into every byte, and the count is in bytes. As written, these calls
// set only the first N bytes of each buffer to 0x02 / 0x03; they do NOT store
// N doubles equal to 2.0 / 3.0, and the remaining N*sizeof(double) - N bytes
// are never written by these two calls. To match the gpulab vectors below, a fill kernel or a
// cudaMemcpy from an initialized host array would be needed.
18 cudaMemset(a1, 2.0, N);
19 cudaMemset(b1, 3.0, N);
// Launch with (N+blocksize-1)/blocksize blocks (ceiling division) of
// `blocksize` threads each, so at least N threads are started.
// NOTE(review): no cudaGetLastError()/synchronization is visible after the
// launch in this listing, so launch or execution errors would be silent.
21 add<<<(N+blocksize-1)/blocksize,blocksize>>>(a1, b1, N);
// Library-based counterpart of the code above.
// NOTE(review): presumably these construct N-element device vectors filled
// with 2.0 and 3.0 — the gpulab constructor semantics are not visible here,
// confirm against the gpulab headers.
24 gpulab::vector<double,gpulab::device_memory> a2(N, 2.0);
25 gpulab::vector<double,gpulab::device_memory> b2(N, 3.0);
26 a2.axpy(1.0, b2); // BLAS1: a2 = 1*b2 + a2