From 88ce161b6f410c4d12b011560d4be74f23c4c471 Mon Sep 17 00:00:00 2001 From: couturie Date: Wed, 17 Oct 2012 13:53:23 +0200 Subject: [PATCH] new --- BookGPU/Chapters/chapter11/ch11.tex | 354 ++++++++++++++-------------- BookGPU/Chapters/chapter11/code1.cu | 44 ++++ BookGPU/Chapters/chapter11/code2.cu | 48 ++++ BookGPU/Chapters/chapter11/code3.cu | 40 ++++ 4 files changed, 312 insertions(+), 174 deletions(-) create mode 100644 BookGPU/Chapters/chapter11/code1.cu create mode 100644 BookGPU/Chapters/chapter11/code2.cu create mode 100644 BookGPU/Chapters/chapter11/code3.cu diff --git a/BookGPU/Chapters/chapter11/ch11.tex b/BookGPU/Chapters/chapter11/ch11.tex index 8a8d54c..1bf638e 100644 --- a/BookGPU/Chapters/chapter11/ch11.tex +++ b/BookGPU/Chapters/chapter11/ch11.tex @@ -103,180 +103,186 @@ It is almost straightforward to parallelise this scheme for GPUs, by processing At the spline evaluation stage we need to compute $s(z_k)$ for a sequence of query values ${z_k}, k=1,\ldots,K$. For each $z_k$ we locate the interval $[t_i,t_{i+1}]$ containing $z_k$, using bisection algorithm presented in Figure \ref{ch11:algeval}, and then apply the appropriate coefficients of the quadratic function. This is also done in parallel. The bisection algorithm could be implemented using texture memory (to cache the array \texttt{z}), but this is not shown in Figure \ref{ch11:algeval}. -\begin{figure}[!hp] -\renewcommand{\baselinestretch}{1} - \begin{alltt} -\begin{center} -\begin{minipage}{13cm}\small - -template -\_\_global\_\_ void CalculateCoefficientsKnots( Tx *u, Ty *v, double *b, double *c, - double *t, double *alpha, double *beta, double *gamma, int N ) -\{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - int s = tid*2; - while(tid<=(N-2)) - \{ - // decide whether an additional knot is necessary - if(fabs(c[tid]+c[tid+1]- 2*b[tid])<=0.1e-5) // tolerance - \{ //no additional knot - h[s]=h[s+1]=u[tid]; - alpha[s]=alpha[s+1]=v[tid]; - beta[s]=beta[s+1]=c[tid]; - gamma[s]=gamma[s+1]=(c[tid+1]-c[tid])/(2*(fmax(1e-10,u[tid+1]-u[tid]))); - \} else \{ //adding a knot - h[s]=u[tid]; - //determine the position of the knot - if((c[tid+1] - b[tid])*(c[tid] - b[tid])<0) - h[s+1]=u[tid+1] + (c[tid] - b[tid])*(fmax(1e-10,u[tid+1]-u[tid]))/ - fmax(1e-10,(c[tid+1] - c[tid])); - else - h[s+1]=0.5*(u[tid+1] + u[tid]); - //calculate coefficients - double dtemp = (2*b[tid] - c[tid+1])+((c[tid+1] - c[tid])*(h[s+1] - u[tid]))/ - fmax(1e-10,(u[tid+1] - u[tid])); - alpha[s]=v[tid]; beta[s]=c[tid]; - gamma[s]=(dtemp - c[tid])/(2*fmax(1e-10,(h[s+1] - u[tid]))); - alpha[s+1]=v[tid] + c[tid]*(h[s+1] - u[tid]) + - (dtemp - c[tid])*(h[s+1] - u[tid])/2; - gamma[s+1]=(c[tid+1] - dtemp)/(2*fmax(1e-10,(u[tid+1] - h[s+1]))); - beta[s+1]=dtemp; - \} - tid += blockDim.x * gridDim.x; s = tid*2; - \} - \_\_syncthreads(); - // Select a single thread to perform the last operation - if((threadIdx.x ) == 0) \{ - s = (N-1) * 2; h[s]=u[N-1]; - \} - \_\_syncthreads(); -\} -\end{minipage} -\end{center} -\end{alltt} -\caption{Implementation of the kernel for calcuating spline knots and coefficients. Function fmax is used to avoid division by zero for data with coinciding abscissae.} -\label{ch11:algcoef} -\renewcommand{\baselinestretch}{2} -\end{figure} - -\begin{figure}[!hp] -\renewcommand{\baselinestretch}{1} - \begin{alltt} -\begin{center} -\begin{minipage}{13cm}\small - -template -\_\_global\_\_ void CalculateBeta(Tx *u, Ty *v, double *b, int N) -\{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - while(tid<=(N-2)) \{ - b[tid]=(v[tid+1]-v[tid])/fmax(1e-20,double(u[tid+1]-u[tid])); - tid += blockDim.x * gridDim.x; - \} - \_\_syncthreads(); -\} -\_\_global\_\_ void CalculateDGeneral( double *b, double *c, int N) -\{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - while(tid<=(N-2)) \{ - if((b[tid-1]*b[tid])<=0) c[tid]=0; - else c[tid]=(2*b[tid-1]*b[tid])/(b[tid-1]+b[tid]); - \} - tid += blockDim.x * gridDim.x; - \} - \_\_syncthreads(); -\} -\_\_global\_\_ void CalculateD( double *b, double *c, int N ) -\{ - if((b[0]*(2*b[0]-c[1]))<=0) c[0]=0; - else c[0]=2*b[0] - c[1]; - if((b[N-2]*(2*b[N-2]-c[N-2]))<=0) c[N-1]=0; - else c[N-1]=2*b[N-2] - c[N-2]; - \_\_syncthreads(); -\} -template -int BuildMonotonSpline(Tx *d_X, Ty *d_Y, int N, - double *t, double *alpha, double *beta, double *gamma) -\{ - int T = (N-1)*2+1; // length of the output array - double *b, *c; // temp variables - cudaMalloc( (void**)&b, 1*N*sizeof(double) ); - cudaMalloc( (void**)&c, 2*N*sizeof(double) ); - int threads=256; - int blocks = (N-1)/threads + 1; - CalculateBeta<<>>(d_X,d_Y,b,N); - CalculateDGeneral<<>>(b,c,N); - CalculateD<<<1,1>>>(b,c,NN); // calculate d_1 and d_N - CalculateCoefficientsKnots<<>>(d_X, - d_Y,b,c,h,alpha,beta,gamma,N); - cudaFree(b); cudaFree(c); - return T; -\} -\end{minipage} -\end{center} -\end{alltt} -\caption{Calculation of monotone spline knots and coefficients.} -\label{ch11:algcoef1} -\renewcommand{\baselinestretch}{2} -\end{figure} - -\begin{figure}[!hp] -\renewcommand{\baselinestretch}{1} - \begin{alltt} -\begin{center} -\begin{minipage}{13cm}\small - -template -\_\_device\_\_ void Bisection\_device(T z, T* t, int mi, int ma, int* l) -\{ - int i; ma--; - while(1) \{ - i=(mi+ma)/2; - if(z >= t[i]) mi=i+1; - else ma=i; - if(mi>=ma) break; - \} - *l = mi-1; -\} - -/* Kernel to evaluates monotone spline for a sequence of query points - residing in the array z of size m -*/ -template -\_\_global\_\_ void d\_MonSplineValue(Tx* z, int K, double* t, - double * alpha, double * beta, double * gamma, int T, Ty *value) -\{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - int mi=0, ma=T, i=0; - Ty r; - while(tid -void MonotoneSplineValue(Tx *z, int K, double* t, - double * alpha, double * beta, double * gamma, int T, Ty* result) -\{ - int blocks,threads=256; - blocks=(K-1)/threads+1; - d\_MonSplineValue<<>>(z,K,t,alpha,beta,gamma,T,result); -\} -\end{minipage} -\end{center} -\end{alltt} -\caption{Implementation of the spline evaluation algorithm for GPU.} -\label{ch11:algeval} -\renewcommand{\baselinestretch}{2} -\end{figure} - - +\lstinputlisting[label=ch11:algcoef,caption=Implementation of the kernel for calcuating spline knots and coefficients. Function fmax is used to avoid division by zero for data with coinciding abscissae.]{Chapters/chapter11/code1.cu} + + +%% \begin{figure}[!hp] +%% \renewcommand{\baselinestretch}{1} +%% \begin{alltt} +%% \begin{center} +%% \begin{minipage}{13cm}\small + +%% template +%% \_\_global\_\_ void CalculateCoefficientsKnots( Tx *u, Ty *v, double *b, double *c, +%% double *t, double *alpha, double *beta, double *gamma, int N ) +%% \{ +%% int tid = threadIdx.x + blockIdx.x * blockDim.x; +%% int s = tid*2; +%% while(tid<=(N-2)) +%% \{ +%% // decide whether an additional knot is necessary +%% if(fabs(c[tid]+c[tid+1]- 2*b[tid])<=0.1e-5) // tolerance +%% \{ //no additional knot +%% h[s]=h[s+1]=u[tid]; +%% alpha[s]=alpha[s+1]=v[tid]; +%% beta[s]=beta[s+1]=c[tid]; +%% gamma[s]=gamma[s+1]=(c[tid+1]-c[tid])/(2*(fmax(1e-10,u[tid+1]-u[tid]))); +%% \} else \{ //adding a knot +%% h[s]=u[tid]; +%% //determine the position of the knot +%% if((c[tid+1] - b[tid])*(c[tid] - b[tid])<0) +%% h[s+1]=u[tid+1] + (c[tid] - b[tid])*(fmax(1e-10,u[tid+1]-u[tid]))/ +%% fmax(1e-10,(c[tid+1] - c[tid])); +%% else +%% h[s+1]=0.5*(u[tid+1] + u[tid]); +%% //calculate coefficients +%% double dtemp = (2*b[tid] - c[tid+1])+((c[tid+1] - c[tid])*(h[s+1] - u[tid]))/ +%% fmax(1e-10,(u[tid+1] - u[tid])); +%% alpha[s]=v[tid]; beta[s]=c[tid]; +%% gamma[s]=(dtemp - c[tid])/(2*fmax(1e-10,(h[s+1] - u[tid]))); +%% alpha[s+1]=v[tid] + c[tid]*(h[s+1] - u[tid]) + +%% (dtemp - c[tid])*(h[s+1] - u[tid])/2; +%% gamma[s+1]=(c[tid+1] - dtemp)/(2*fmax(1e-10,(u[tid+1] - h[s+1]))); +%% beta[s+1]=dtemp; +%% \} +%% tid += blockDim.x * gridDim.x; s = tid*2; +%% \} +%% \_\_syncthreads(); +%% // Select a single thread to perform the last operation +%% if((threadIdx.x ) == 0) \{ +%% s = (N-1) * 2; h[s]=u[N-1]; +%% \} +%% \_\_syncthreads(); +%% \} +%% \end{minipage} +%% \end{center} +%% \end{alltt} +%% \caption{Implementation of the kernel for calcuating spline knots and coefficients. Function fmax is used to avoid division by zero for data with coinciding abscissae.} +%% \label{ch11:algcoef} +%% \renewcommand{\baselinestretch}{2} +%% \end{figure} + + +\lstinputlisting[label=ch11:algcoef1,caption=Calculation of monotone spline knots and coefficients.]{Chapters/chapter11/code2.cu} + +%% \begin{figure}[!hp] +%% \renewcommand{\baselinestretch}{1} +%% \begin{alltt} +%% \begin{center} +%% \begin{minipage}{13cm}\small + +%% template +%% \_\_global\_\_ void CalculateBeta(Tx *u, Ty *v, double *b, int N) +%% \{ +%% int tid = threadIdx.x + blockIdx.x * blockDim.x; +%% while(tid<=(N-2)) \{ +%% b[tid]=(v[tid+1]-v[tid])/fmax(1e-20,double(u[tid+1]-u[tid])); +%% tid += blockDim.x * gridDim.x; +%% \} +%% \_\_syncthreads(); +%% \} +%% \_\_global\_\_ void CalculateDGeneral( double *b, double *c, int N) +%% \{ +%% int tid = threadIdx.x + blockIdx.x * blockDim.x; +%% while(tid<=(N-2)) \{ +%% if((b[tid-1]*b[tid])<=0) c[tid]=0; +%% else c[tid]=(2*b[tid-1]*b[tid])/(b[tid-1]+b[tid]); +%% \} +%% tid += blockDim.x * gridDim.x; +%% \} +%% \_\_syncthreads(); +%% \} +%% \_\_global\_\_ void CalculateD( double *b, double *c, int N ) +%% \{ +%% if((b[0]*(2*b[0]-c[1]))<=0) c[0]=0; +%% else c[0]=2*b[0] - c[1]; +%% if((b[N-2]*(2*b[N-2]-c[N-2]))<=0) c[N-1]=0; +%% else c[N-1]=2*b[N-2] - c[N-2]; +%% \_\_syncthreads(); +%% \} +%% template +%% int BuildMonotonSpline(Tx *d_X, Ty *d_Y, int N, +%% double *t, double *alpha, double *beta, double *gamma) +%% \{ +%% int T = (N-1)*2+1; // length of the output array +%% double *b, *c; // temp variables +%% cudaMalloc( (void**)&b, 1*N*sizeof(double) ); +%% cudaMalloc( (void**)&c, 2*N*sizeof(double) ); +%% int threads=256; +%% int blocks = (N-1)/threads + 1; +%% CalculateBeta<<>>(d_X,d_Y,b,N); +%% CalculateDGeneral<<>>(b,c,N); +%% CalculateD<<<1,1>>>(b,c,NN); // calculate d_1 and d_N +%% CalculateCoefficientsKnots<<>>(d_X, +%% d_Y,b,c,h,alpha,beta,gamma,N); +%% cudaFree(b); cudaFree(c); +%% return T; +%% \} +%% \end{minipage} +%% \end{center} +%% \end{alltt} +%% \caption{Calculation of monotone spline knots and coefficients.} +%% \label{ch11:algcoef1} +%% \renewcommand{\baselinestretch}{2} +%% \end{figure} + +%% \begin{figure}[!hp] +%% \renewcommand{\baselinestretch}{1} +%% \begin{alltt} +%% \begin{center} +%% \begin{minipage}{13cm}\small + +%% template +%% \_\_device\_\_ void Bisection\_device(T z, T* t, int mi, int ma, int* l) +%% \{ +%% int i; ma--; +%% while(1) \{ +%% i=(mi+ma)/2; +%% if(z >= t[i]) mi=i+1; +%% else ma=i; +%% if(mi>=ma) break; +%% \} +%% *l = mi-1; +%% \} + +%% /* Kernel to evaluates monotone spline for a sequence of query points +%% residing in the array z of size m +%% */ +%% template +%% \_\_global\_\_ void d\_MonSplineValue(Tx* z, int K, double* t, +%% double * alpha, double * beta, double * gamma, int T, Ty *value) +%% \{ +%% int tid = threadIdx.x + blockIdx.x * blockDim.x; +%% int mi=0, ma=T, i=0; +%% Ty r; +%% while(tid +%% void MonotoneSplineValue(Tx *z, int K, double* t, +%% double * alpha, double * beta, double * gamma, int T, Ty* result) +%% \{ +%% int blocks,threads=256; +%% blocks=(K-1)/threads+1; +%% d\_MonSplineValue<<>>(z,K,t,alpha,beta,gamma,T,result); +%% \} +%% \end{minipage} +%% \end{center} +%% \end{alltt} +%% \caption{Implementation of the spline evaluation algorithm for GPU.} +%% \label{ch11:algeval} +%% \renewcommand{\baselinestretch}{2} +%% \end{figure} + +\lstinputlisting[label=ch11:algeval,caption=Implementation of the spline evaluation algorithm for GPU.]{Chapters/chapter11/code3.cu} \subsection{Monotone Hermite splines} diff --git a/BookGPU/Chapters/chapter11/code1.cu b/BookGPU/Chapters/chapter11/code1.cu new file mode 100644 index 0000000..3b77cc2 --- /dev/null +++ b/BookGPU/Chapters/chapter11/code1.cu @@ -0,0 +1,44 @@ + +template +__global__ void CalculateCoefficientsKnots( Tx *u, Ty *v, double *b, double *c, + double *t, double *alpha, double *beta, double *gamma, int N ) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int s = tid*2; + while(tid<=(N-2)) + { + // decide whether an additional knot is necessary + if(fabs(c[tid]+c[tid+1]- 2*b[tid])<=0.1e-5) // tolerance + { + //no additional knot + h[s]=h[s+1]=u[tid]; + alpha[s]=alpha[s+1]=v[tid]; + beta[s]=beta[s+1]=c[tid]; + gamma[s]=gamma[s+1]=(c[tid+1]-c[tid])/(2*(fmax(1e-10,u[tid+1]-u[tid]))); + } + else { + //adding a knot + h[s]=u[tid]; + //determine the position of the knot + if((c[tid+1] - b[tid])*(c[tid] - b[tid])<0) + h[s+1]=u[tid+1] + (c[tid] - b[tid])*(fmax(1e-10,u[tid+1]-u[tid]))/fmax(1e-10,(c[tid+1] - c[tid])); + else + h[s+1]=0.5*(u[tid+1] + u[tid]); + //calculate coefficients + double dtemp = (2*b[tid] - c[tid+1])+((c[tid+1] - c[tid])*(h[s+1] - u[tid]))/fmax(1e-10,(u[tid+1] - u[tid])); + alpha[s]=v[tid]; beta[s]=c[tid]; + gamma[s]=(dtemp - c[tid])/(2*fmax(1e-10,(h[s+1] - u[tid]))); + alpha[s+1]=v[tid] + c[tid]*(h[s+1] - u[tid]) + + (dtemp - c[tid])*(h[s+1] - u[tid])/2; + gamma[s+1]=(c[tid+1] - dtemp)/(2*fmax(1e-10,(u[tid+1] - h[s+1]))); + beta[s+1]=dtemp; + } + tid += blockDim.x * gridDim.x; s = tid*2; + } + __syncthreads(); + // Select a single thread to perform the last operation + if((threadIdx.x ) == 0) { + s = (N-1) * 2; h[s]=u[N-1]; + } + __syncthreads(); +} diff --git a/BookGPU/Chapters/chapter11/code2.cu b/BookGPU/Chapters/chapter11/code2.cu new file mode 100644 index 0000000..76b6c1a --- /dev/null +++ b/BookGPU/Chapters/chapter11/code2.cu @@ -0,0 +1,48 @@ +template +__global__ void CalculateBeta(Tx *u, Ty *v, double *b, int N) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + while(tid<=(N-2)) { + b[tid]=(v[tid+1]-v[tid])/fmax(1e-20,double(u[tid+1]-u[tid])); + tid += blockDim.x * gridDim.x; + } + __syncthreads(); +} + +__global__ void CalculateDGeneral( double *b, double *c, int N) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + while(tid<=(N-2)) { + if((b[tid-1]*b[tid])<=0) c[tid]=0; + else c[tid]=(2*b[tid-1]*b[tid])/(b[tid-1]+b[tid]); + } + tid += blockDim.x * gridDim.x; + } + __syncthreads(); +} + +__global__ void CalculateD( double *b, double *c, int N ) +{ + if((b[0]*(2*b[0]-c[1]))<=0) c[0]=0; + else c[0]=2*b[0] - c[1]; + if((b[N-2]*(2*b[N-2]-c[N-2]))<=0) c[N-1]=0; + else c[N-1]=2*b[N-2] - c[N-2]; + __syncthreads(); +} + +template +int BuildMonotonSpline(Tx *d_X, Ty *d_Y, int N, double *t, double *alpha, double *beta, double *gamma) +{ + int T = (N-1)*2+1; // length of the output array + double *b, *c; // temp variables + cudaMalloc( (void**)&b, 1*N*sizeof(double) ); + cudaMalloc( (void**)&c, 2*N*sizeof(double) ); + int threads=256; + int blocks = (N-1)/threads + 1; + CalculateBeta<<>>(d_X,d_Y,b,N); + CalculateDGeneral<<>>(b,c,N); + CalculateD<<<1,1>>>(b,c,NN); // calculate d_1 and d_N + CalculateCoefficientsKnots<<>>(d_X,d_Y,b,c,h,alpha,beta,gamma,N); + cudaFree(b); cudaFree(c); + return T; +} \ No newline at end of file diff --git a/BookGPU/Chapters/chapter11/code3.cu b/BookGPU/Chapters/chapter11/code3.cu new file mode 100644 index 0000000..de460f6 --- /dev/null +++ b/BookGPU/Chapters/chapter11/code3.cu @@ -0,0 +1,40 @@ +template +__device__ void Bisection_device(T z, T* t, int mi, int ma, int* l) +{ + int i; ma--; + while(1) { + i=(mi+ma)/2; + if(z >= t[i]) mi=i+1; + else ma=i; + if(mi>=ma) break; + } + *l = mi-1; +} + +/* Kernel to evaluates monotone spline for a sequence of query points residing in the array z of size m +*/ +template +__global__ void d_MonSplineValue(Tx* z, int K, double* t, double * alpha, double * beta, double * gamma, int T, Ty *value) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int mi=0, ma=T, i=0; + Ty r; + while(tid +void MonotoneSplineValue(Tx *z, int K, double* t, + double * alpha, double * beta, double * gamma, int T, Ty* result) +{ + int blocks,threads=256; + blocks=(K-1)/threads+1; + d_MonSplineValue<<>>(z,K,t,alpha,beta,gamma,T,result); +} \ No newline at end of file -- 2.39.5