-It is almost straightforward to parallelise this scheme for GPUs, by processing each subinterval $[x_i,x_{i+1}]$ independently in a separate thread. However, it is not known in advance whether an extra knot $t_i$ needs to be inserted or not, and therefore calculation of the position of the knot in the output sequence of knots ${t_i}$ is problematic for parallel implementation (for a sequential algorithm no such issue arises). To avoid serialisation, we decided to insert an additional knot in every interval $[x_i,x_{i+1}]$, but set $t_i=x_i$ when the extra knot is not actually needed. This way we know in advance the position of the output knots and the length of this sequence is $2(n-1)$, and therefore all calculations can now be performed independently. The price we pay is that some of the spline knots can coincide. However, this does not affect spline evaluation, as one of the coinciding knots is simply disregarded, and the spline coefficients are replicated (so for a double knot $t_i=t_{i+1}$, we have $\alpha_i=\alpha_{i+1}$, $\beta_i=\beta_{i+1}$, $\gamma_i=\gamma_{i+1}$). Our implementation is presented in Figures \ref{ch11:algcoef}-\ref{ch11:algcoef1}.
-
-At the spline evaluation stage we need to compute $s(z_k)$ for a sequence of query values ${z_k}, k=1,\ldots,K$. For each $z_k$ we locate the interval $[t_i,t_{i+1}]$ containing $z_k$, using bisection algorithm presented in Figure \ref{ch11:algeval}, and then apply the appropriate coefficients of the quadratic function. This is also done in parallel.
-The bisection algorithm could be implemented using texture memory (to cache the array \texttt{z}), but this is not shown in Figure \ref{ch11:algeval}.
-
-\begin{figure}[!hp]
-\renewcommand{\baselinestretch}{1}
- \begin{alltt}
-\begin{center}
-\begin{minipage}{13cm}\small
-
-template<typename Tx, typename Ty>
-\_\_global\_\_ void CalculateCoefficientsKnots( Tx *u, Ty *v, double *b, double *c,
- double *t, double *alpha, double *beta, double *gamma, int N )
-\{
- int tid = threadIdx.x + blockIdx.x * blockDim.x;
- int s = tid*2;
- while(tid<=(N-2))
- \{
- // decide whether an additional knot is necessary
- if(fabs(c[tid]+c[tid+1]- 2*b[tid])<=0.1e-5) // tolerance
- \{ //no additional knot
- h[s]=h[s+1]=u[tid];
- alpha[s]=alpha[s+1]=v[tid];
- beta[s]=beta[s+1]=c[tid];
- gamma[s]=gamma[s+1]=(c[tid+1]-c[tid])/(2*(fmax(1e-10,u[tid+1]-u[tid])));
- \} else \{ //adding a knot
- h[s]=u[tid];
- //determine the position of the knot
- if((c[tid+1] - b[tid])*(c[tid] - b[tid])<0)
- h[s+1]=u[tid+1] + (c[tid] - b[tid])*(fmax(1e-10,u[tid+1]-u[tid]))/
- fmax(1e-10,(c[tid+1] - c[tid]));
- else
- h[s+1]=0.5*(u[tid+1] + u[tid]);
- //calculate coefficients
- double dtemp = (2*b[tid] - c[tid+1])+((c[tid+1] - c[tid])*(h[s+1] - u[tid]))/
- fmax(1e-10,(u[tid+1] - u[tid]));
- alpha[s]=v[tid]; beta[s]=c[tid];
- gamma[s]=(dtemp - c[tid])/(2*fmax(1e-10,(h[s+1] - u[tid])));
- alpha[s+1]=v[tid] + c[tid]*(h[s+1] - u[tid]) +
- (dtemp - c[tid])*(h[s+1] - u[tid])/2;
- gamma[s+1]=(c[tid+1] - dtemp)/(2*fmax(1e-10,(u[tid+1] - h[s+1])));
- beta[s+1]=dtemp;
- \}
- tid += blockDim.x * gridDim.x; s = tid*2;
- \}
- \_\_syncthreads();
- // Select a single thread to perform the last operation
- if((threadIdx.x ) == 0) \{
- s = (N-1) * 2; h[s]=u[N-1];
- \}
- \_\_syncthreads();
-\}
-\end{minipage}
-\end{center}
-\end{alltt}
-\caption{Implementation of the kernel for calcuating spline knots and coefficients. Function fmax is used to avoid division by zero for data with coinciding abscissae.}
-\label{ch11:algcoef}
-\renewcommand{\baselinestretch}{2}
-\end{figure}
-
-\begin{figure}[!hp]
-\renewcommand{\baselinestretch}{1}
- \begin{alltt}
-\begin{center}
-\begin{minipage}{13cm}\small
-
-template<typename Tx, typename Ty>
-\_\_global\_\_ void CalculateBeta(Tx *u, Ty *v, double *b, int N)
-\{
- int tid = threadIdx.x + blockIdx.x * blockDim.x;
- while(tid<=(N-2)) \{
- b[tid]=(v[tid+1]-v[tid])/fmax(1e-20,double(u[tid+1]-u[tid]));
- tid += blockDim.x * gridDim.x;
- \}
- \_\_syncthreads();
-\}
-\_\_global\_\_ void CalculateDGeneral( double *b, double *c, int N)
-\{
- int tid = threadIdx.x + blockIdx.x * blockDim.x;
- while(tid<=(N-2)) \{
- if((b[tid-1]*b[tid])<=0) c[tid]=0;
- else c[tid]=(2*b[tid-1]*b[tid])/(b[tid-1]+b[tid]);
- \}
- tid += blockDim.x * gridDim.x;
- \}
- \_\_syncthreads();
-\}
-\_\_global\_\_ void CalculateD( double *b, double *c, int N )
-\{
- if((b[0]*(2*b[0]-c[1]))<=0) c[0]=0;
- else c[0]=2*b[0] - c[1];
- if((b[N-2]*(2*b[N-2]-c[N-2]))<=0) c[N-1]=0;
- else c[N-1]=2*b[N-2] - c[N-2];
- \_\_syncthreads();
-\}
-template<typename Tx, typename Ty>
-int BuildMonotonSpline(Tx *d_X, Ty *d_Y, int N,
- double *t, double *alpha, double *beta, double *gamma)
-\{
- int T = (N-1)*2+1; // length of the output array
- double *b, *c; // temp variables
- cudaMalloc( (void**)&b, 1*N*sizeof(double) );
- cudaMalloc( (void**)&c, 2*N*sizeof(double) );
- int threads=256;
- int blocks = (N-1)/threads + 1;
- CalculateBeta<<<blocks,threads>>>(d_X,d_Y,b,N);
- CalculateDGeneral<<<blocks,threads>>>(b,c,N);
- CalculateD<<<1,1>>>(b,c,NN); // calculate d_1 and d_N
- CalculateCoefficientsKnots<<<blocks,threads>>>(d_X,
- d_Y,b,c,h,alpha,beta,gamma,N);
- cudaFree(b); cudaFree(c);
- return T;
-\}
-\end{minipage}
-\end{center}
-\end{alltt}
-\caption{Calculation of monotone spline knots and coefficients.}
-\label{ch11:algcoef1}
-\renewcommand{\baselinestretch}{2}
-\end{figure}
-
-\begin{figure}[!hp]
-\renewcommand{\baselinestretch}{1}
- \begin{alltt}
-\begin{center}
-\begin{minipage}{13cm}\small
-
-template<typename T>
-\_\_device\_\_ void Bisection\_device(T z, T* t, int mi, int ma, int* l)
-\{
- int i; ma--;
- while(1) \{
- i=(mi+ma)/2;
- if(z >= t[i]) mi=i+1;
- else ma=i;
- if(mi>=ma) break;
- \}
- *l = mi-1;
-\}
-
-/* Kernel to evaluates monotone spline for a sequence of query points
- residing in the array z of size m
-*/
-template<typename Tx, typename Ty>
-\_\_global\_\_ void d\_MonSplineValue(Tx* z, int K, double* t,
- double * alpha, double * beta, double * gamma, int T, Ty *value)
-\{
- int tid = threadIdx.x + blockIdx.x * blockDim.x;
- int mi=0, ma=T, i=0;
- Ty r;
- while(tid<K)
- \{
- Bisection\_device(z[tid], t, mi, ma, &i);
- r= z[tid]-t[i];
- r= alpha[i] + r*(beta[i] + gamma[i]*r);
- value[tid]=r;
- tid += blockDim.x * gridDim.x;
- \}
- \_\_syncthreads();
-\}
-
-template<typename Tx, typename Ty>
-void MonotoneSplineValue(Tx *z, int K, double* t,
- double * alpha, double * beta, double * gamma, int T, Ty* result)
-\{
- int blocks,threads=256;
- blocks=(K-1)/threads+1;
- d\_MonSplineValue<<<blocks,threads>>>(z,K,t,alpha,beta,gamma,T,result);
-\}
-\end{minipage}
-\end{center}
-\end{alltt}
-\caption{Implementation of the spline evaluation algorithm for GPU.}
-\label{ch11:algeval}
-\renewcommand{\baselinestretch}{2}
-\end{figure}
-
-
+It is almost straightforward to parallelize this scheme for GPUs, by processing each subinterval $[x_i,x_{i+1}]$ independently in a separate thread. However, it is not known in advance whether an extra knot $t_i$ needs to be inserted, and therefore calculation of the position of the knot in the output sequence of knots ${t_i}$ is problematic for parallel implementation (for a sequential algorithm no such issue arises). To avoid serialization, we decided to insert an additional knot in every interval $[x_i,x_{i+1}]$, but set $t_i=x_i$ when the extra knot is not actually needed. This way we know in advance the position of the output knots and the length of this sequence is $2(n-1)$, and therefore all calculations can now be performed independently. The price we pay is that some of the spline knots can coincide. However, this does not affect spline evaluation, as one of the coinciding knots is simply disregarded, and the spline coefficients are replicated (so for a double knot $t_i=t_{i+1}$, we have $\alpha_i=\alpha_{i+1}$, $\beta_i=\beta_{i+1}$, $\gamma_i=\gamma_{i+1}$). Our implementation is presented in Listings \ref{ch11:algcoef1}-\ref{ch11:algcoef}.
+
+\lstinputlisting[label=ch11:algcoef1,caption=calculation of monotone spline knots and coefficients.]{Chapters/chapter11/code2.cu}
+
+
+At the spline evaluation stage we need to compute $s(z_k)$ for a sequence of query values ${z_k}, k=1,\ldots,K$. For each $z_k$ we locate the interval $[t_i,t_{i+1}]$ containing $z_k$, using the bisection algorithm presented in Listing \ref{ch11:algeval}, and then apply the appropriate coefficients of the quadratic function. This is also done in parallel.
+The bisection algorithm could be implemented using texture memory (to cache the array \texttt{z}), but this is not shown in Listing \ref{ch11:algeval}.
+
+\pagebreak
+\lstinputlisting[label=ch11:algcoef,caption=implementation of the kernel for calculating spline knots and coefficients; function fmax is used to avoid division by zero for data with coinciding abscissae.]{Chapters/chapter11/code1.cu}
+
+
+%% \begin{figure}[!hp]
+%% \renewcommand{\baselinestretch}{1}
+%% \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+
+%% template<typename Tx, typename Ty>
+%% \_\_global\_\_ void CalculateCoefficientsKnots( Tx *u, Ty *v, double *b, double *c,
+%% double *t, double *alpha, double *beta, double *gamma, int N )
+%% \{
+%% int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%% int s = tid*2;
+%% while(tid<=(N-2))
+%% \{
+%% // decide whether an additional knot is necessary
+%% if(fabs(c[tid]+c[tid+1]- 2*b[tid])<=0.1e-5) // tolerance
+%% \{ //no additional knot
+%% h[s]=h[s+1]=u[tid];
+%% alpha[s]=alpha[s+1]=v[tid];
+%% beta[s]=beta[s+1]=c[tid];
+%% gamma[s]=gamma[s+1]=(c[tid+1]-c[tid])/(2*(fmax(1e-10,u[tid+1]-u[tid])));
+%% \} else \{ //adding a knot
+%% h[s]=u[tid];
+%% //determine the position of the knot
+%% if((c[tid+1] - b[tid])*(c[tid] - b[tid])<0)
+%% h[s+1]=u[tid+1] + (c[tid] - b[tid])*(fmax(1e-10,u[tid+1]-u[tid]))/
+%% fmax(1e-10,(c[tid+1] - c[tid]));
+%% else
+%% h[s+1]=0.5*(u[tid+1] + u[tid]);
+%% //calculate coefficients
+%% double dtemp = (2*b[tid] - c[tid+1])+((c[tid+1] - c[tid])*(h[s+1] - u[tid]))/
+%% fmax(1e-10,(u[tid+1] - u[tid]));
+%% alpha[s]=v[tid]; beta[s]=c[tid];
+%% gamma[s]=(dtemp - c[tid])/(2*fmax(1e-10,(h[s+1] - u[tid])));
+%% alpha[s+1]=v[tid] + c[tid]*(h[s+1] - u[tid]) +
+%% (dtemp - c[tid])*(h[s+1] - u[tid])/2;
+%% gamma[s+1]=(c[tid+1] - dtemp)/(2*fmax(1e-10,(u[tid+1] - h[s+1])));
+%% beta[s+1]=dtemp;
+%% \}
+%% tid += blockDim.x * gridDim.x; s = tid*2;
+%% \}
+%% \_\_syncthreads();
+%% // Select a single thread to perform the last operation
+%% if((threadIdx.x ) == 0) \{
+%% s = (N-1) * 2; h[s]=u[N-1];
+%% \}
+%% \_\_syncthreads();
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Implementation of the kernel for calcuating spline knots and coefficients. Function fmax is used to avoid division by zero for data with coinciding abscissae.}
+%% \label{ch11:algcoef}
+%% \renewcommand{\baselinestretch}{2}
+%% \end{figure}
+
+
+
+%% \begin{figure}[!hp]
+%% \renewcommand{\baselinestretch}{1}
+%% \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+
+%% template<typename Tx, typename Ty>
+%% \_\_global\_\_ void CalculateBeta(Tx *u, Ty *v, double *b, int N)
+%% \{
+%% int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%% while(tid<=(N-2)) \{
+%% b[tid]=(v[tid+1]-v[tid])/fmax(1e-20,double(u[tid+1]-u[tid]));
+%% tid += blockDim.x * gridDim.x;
+%% \}
+%% \_\_syncthreads();
+%% \}
+%% \_\_global\_\_ void CalculateDGeneral( double *b, double *c, int N)
+%% \{
+%% int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%% while(tid<=(N-2)) \{
+%% if((b[tid-1]*b[tid])<=0) c[tid]=0;
+%% else c[tid]=(2*b[tid-1]*b[tid])/(b[tid-1]+b[tid]);
+%% \}
+%% tid += blockDim.x * gridDim.x;
+%% \}
+%% \_\_syncthreads();
+%% \}
+%% \_\_global\_\_ void CalculateD( double *b, double *c, int N )
+%% \{
+%% if((b[0]*(2*b[0]-c[1]))<=0) c[0]=0;
+%% else c[0]=2*b[0] - c[1];
+%% if((b[N-2]*(2*b[N-2]-c[N-2]))<=0) c[N-1]=0;
+%% else c[N-1]=2*b[N-2] - c[N-2];
+%% \_\_syncthreads();
+%% \}
+%% template<typename Tx, typename Ty>
+%% int BuildMonotonSpline(Tx *d_X, Ty *d_Y, int N,
+%% double *t, double *alpha, double *beta, double *gamma)
+%% \{
+%% int T = (N-1)*2+1; // length of the output array
+%% double *b, *c; // temp variables
+%% cudaMalloc( (void**)&b, 1*N*sizeof(double) );
+%% cudaMalloc( (void**)&c, 2*N*sizeof(double) );
+%% int threads=256;
+%% int blocks = (N-1)/threads + 1;
+%% CalculateBeta<<<blocks,threads>>>(d_X,d_Y,b,N);
+%% CalculateDGeneral<<<blocks,threads>>>(b,c,N);
+%% CalculateD<<<1,1>>>(b,c,NN); // calculate d_1 and d_N
+%% CalculateCoefficientsKnots<<<blocks,threads>>>(d_X,
+%% d_Y,b,c,h,alpha,beta,gamma,N);
+%% cudaFree(b); cudaFree(c);
+%% return T;
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Calculation of monotone spline knots and coefficients.}
+%% \label{ch11:algcoef1}
+%% \renewcommand{\baselinestretch}{2}
+%% \end{figure}
+
+%% \begin{figure}[!hp]
+%% \renewcommand{\baselinestretch}{1}
+%% \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+
+%% template<typename T>
+%% \_\_device\_\_ void Bisection\_device(T z, T* t, int mi, int ma, int* l)
+%% \{
+%% int i; ma--;
+%% while(1) \{
+%% i=(mi+ma)/2;
+%% if(z >= t[i]) mi=i+1;
+%% else ma=i;
+%% if(mi>=ma) break;
+%% \}
+%% *l = mi-1;
+%% \}
+
+%% /* Kernel to evaluates monotone spline for a sequence of query points
+%% residing in the array z of size m
+%% */
+%% template<typename Tx, typename Ty>
+%% \_\_global\_\_ void d\_MonSplineValue(Tx* z, int K, double* t,
+%% double * alpha, double * beta, double * gamma, int T, Ty *value)
+%% \{
+%% int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%% int mi=0, ma=T, i=0;
+%% Ty r;
+%% while(tid<K)
+%% \{
+%% Bisection\_device(z[tid], t, mi, ma, &i);
+%% r= z[tid]-t[i];
+%% r= alpha[i] + r*(beta[i] + gamma[i]*r);
+%% value[tid]=r;
+%% tid += blockDim.x * gridDim.x;
+%% \}
+%% \_\_syncthreads();
+%% \}
+
+%% template<typename Tx, typename Ty>
+%% void MonotoneSplineValue(Tx *z, int K, double* t,
+%% double * alpha, double * beta, double * gamma, int T, Ty* result)
+%% \{
+%% int blocks,threads=256;
+%% blocks=(K-1)/threads+1;
+%% d\_MonSplineValue<<<blocks,threads>>>(z,K,t,alpha,beta,gamma,T,result);
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Implementation of the spline evaluation algorithm for GPU.}
+%% \label{ch11:algeval}
+%% \renewcommand{\baselinestretch}{2}
+%% \end{figure}
+
+\lstinputlisting[label=ch11:algeval,caption=implementation of the spline evaluation algorithm for GPU.]{Chapters/chapter11/code3.cu}