+\pagebreak
+\lstinputlisting[label=ch11:algcoef,caption=Implementation of the kernel for calculating spline knots and coefficients. Function fmax is used to avoid division by zero for data with coinciding abscissae.]{Chapters/chapter11/code1.cu}
+
+
+%% \begin{figure}[!hp]
+%% \renewcommand{\baselinestretch}{1}
+%% \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+
+%% template<typename Tx, typename Ty>
+%% \_\_global\_\_ void CalculateCoefficientsKnots( Tx *u, Ty *v, double *b, double *c,
+%% double *t, double *alpha, double *beta, double *gamma, int N )
+%% \{
+%% int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%% int s = tid*2;
+%% while(tid<=(N-2))
+%% \{
+%% // decide whether an additional knot is necessary
+%% if(fabs(c[tid]+c[tid+1]- 2*b[tid])<=0.1e-5) // tolerance
+%% \{ //no additional knot
+%% h[s]=h[s+1]=u[tid];
+%% alpha[s]=alpha[s+1]=v[tid];
+%% beta[s]=beta[s+1]=c[tid];
+%% gamma[s]=gamma[s+1]=(c[tid+1]-c[tid])/(2*(fmax(1e-10,u[tid+1]-u[tid])));
+%% \} else \{ //adding a knot
+%% h[s]=u[tid];
+%% //determine the position of the knot
+%% if((c[tid+1] - b[tid])*(c[tid] - b[tid])<0)
+%% h[s+1]=u[tid+1] + (c[tid] - b[tid])*(fmax(1e-10,u[tid+1]-u[tid]))/
+%% fmax(1e-10,(c[tid+1] - c[tid]));
+%% else
+%% h[s+1]=0.5*(u[tid+1] + u[tid]);
+%% //calculate coefficients
+%% double dtemp = (2*b[tid] - c[tid+1])+((c[tid+1] - c[tid])*(h[s+1] - u[tid]))/
+%% fmax(1e-10,(u[tid+1] - u[tid]));
+%% alpha[s]=v[tid]; beta[s]=c[tid];
+%% gamma[s]=(dtemp - c[tid])/(2*fmax(1e-10,(h[s+1] - u[tid])));
+%% alpha[s+1]=v[tid] + c[tid]*(h[s+1] - u[tid]) +
+%% (dtemp - c[tid])*(h[s+1] - u[tid])/2;
+%% gamma[s+1]=(c[tid+1] - dtemp)/(2*fmax(1e-10,(u[tid+1] - h[s+1])));
+%% beta[s+1]=dtemp;
+%% \}
+%% tid += blockDim.x * gridDim.x; s = tid*2;
+%% \}
+%% \_\_syncthreads();
+%% // Select a single thread to perform the last operation
+%% if((threadIdx.x ) == 0) \{
+%% s = (N-1) * 2; h[s]=u[N-1];
+%% \}
+%% \_\_syncthreads();
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Implementation of the kernel for calcuating spline knots and coefficients. Function fmax is used to avoid division by zero for data with coinciding abscissae.}
+%% \label{ch11:algcoef}
+%% \renewcommand{\baselinestretch}{2}
+%% \end{figure}
+
+
+
+%% \begin{figure}[!hp]
+%% \renewcommand{\baselinestretch}{1}
+%% \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+
+%% template<typename Tx, typename Ty>
+%% \_\_global\_\_ void CalculateBeta(Tx *u, Ty *v, double *b, int N)
+%% \{
+%% int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%% while(tid<=(N-2)) \{
+%% b[tid]=(v[tid+1]-v[tid])/fmax(1e-20,double(u[tid+1]-u[tid]));
+%% tid += blockDim.x * gridDim.x;
+%% \}
+%% \_\_syncthreads();
+%% \}
+%% \_\_global\_\_ void CalculateDGeneral( double *b, double *c, int N)
+%% \{
+%% int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%% while(tid<=(N-2)) \{
+%% if((b[tid-1]*b[tid])<=0) c[tid]=0;
+%% else c[tid]=(2*b[tid-1]*b[tid])/(b[tid-1]+b[tid]);
+%% \}
+%% tid += blockDim.x * gridDim.x;
+%% \}
+%% \_\_syncthreads();
+%% \}
+%% \_\_global\_\_ void CalculateD( double *b, double *c, int N )
+%% \{
+%% if((b[0]*(2*b[0]-c[1]))<=0) c[0]=0;
+%% else c[0]=2*b[0] - c[1];
+%% if((b[N-2]*(2*b[N-2]-c[N-2]))<=0) c[N-1]=0;
+%% else c[N-1]=2*b[N-2] - c[N-2];
+%% \_\_syncthreads();
+%% \}
+%% template<typename Tx, typename Ty>
+%% int BuildMonotonSpline(Tx *d_X, Ty *d_Y, int N,
+%% double *t, double *alpha, double *beta, double *gamma)
+%% \{
+%% int T = (N-1)*2+1; // length of the output array
+%% double *b, *c; // temp variables
+%% cudaMalloc( (void**)&b, 1*N*sizeof(double) );
+%% cudaMalloc( (void**)&c, 2*N*sizeof(double) );
+%% int threads=256;
+%% int blocks = (N-1)/threads + 1;
+%% CalculateBeta<<<blocks,threads>>>(d_X,d_Y,b,N);
+%% CalculateDGeneral<<<blocks,threads>>>(b,c,N);
+%% CalculateD<<<1,1>>>(b,c,NN); // calculate d_1 and d_N
+%% CalculateCoefficientsKnots<<<blocks,threads>>>(d_X,
+%% d_Y,b,c,h,alpha,beta,gamma,N);
+%% cudaFree(b); cudaFree(c);
+%% return T;
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Calculation of monotone spline knots and coefficients.}
+%% \label{ch11:algcoef1}
+%% \renewcommand{\baselinestretch}{2}
+%% \end{figure}
+
+%% \begin{figure}[!hp]
+%% \renewcommand{\baselinestretch}{1}
+%% \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+
+%% template<typename T>
+%% \_\_device\_\_ void Bisection\_device(T z, T* t, int mi, int ma, int* l)
+%% \{
+%% int i; ma--;
+%% while(1) \{
+%% i=(mi+ma)/2;
+%% if(z >= t[i]) mi=i+1;
+%% else ma=i;
+%% if(mi>=ma) break;
+%% \}
+%% *l = mi-1;
+%% \}
+
+%% /* Kernel to evaluates monotone spline for a sequence of query points
+%% residing in the array z of size m
+%% */
+%% template<typename Tx, typename Ty>
+%% \_\_global\_\_ void d\_MonSplineValue(Tx* z, int K, double* t,
+%% double * alpha, double * beta, double * gamma, int T, Ty *value)
+%% \{
+%% int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%% int mi=0, ma=T, i=0;
+%% Ty r;
+%% while(tid<K)
+%% \{
+%% Bisection\_device(z[tid], t, mi, ma, &i);
+%% r= z[tid]-t[i];
+%% r= alpha[i] + r*(beta[i] + gamma[i]*r);
+%% value[tid]=r;
+%% tid += blockDim.x * gridDim.x;
+%% \}
+%% \_\_syncthreads();
+%% \}
+
+%% template<typename Tx, typename Ty>
+%% void MonotoneSplineValue(Tx *z, int K, double* t,
+%% double * alpha, double * beta, double * gamma, int T, Ty* result)
+%% \{
+%% int blocks,threads=256;
+%% blocks=(K-1)/threads+1;
+%% d\_MonSplineValue<<<blocks,threads>>>(z,K,t,alpha,beta,gamma,T,result);
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Implementation of the spline evaluation algorithm for GPU.}
+%% \label{ch11:algeval}
+%% \renewcommand{\baselinestretch}{2}
+%% \end{figure}
+
+\lstinputlisting[label=ch11:algeval,caption=Implementation of the spline evaluation algorithm for GPU.]{Chapters/chapter11/code3.cu}