new ch11

[book_gpu.git] / BookGPU / Chapters / chapter11 / ch11.tex
diff --git a/BookGPU/Chapters/chapter11/ch11.tex b/BookGPU/Chapters/chapter11/ch11.tex

index 8a8d54c5d379ca23758588deb423ae6220998654..80071382b5966b1d871323f1238bc2dd7c37b2c2 100644 (file)
--- a/BookGPU/Chapters/chapter11/ch11.tex
+++ b/BookGPU/Chapters/chapter11/ch11.tex
@@ -7,7 +7,7 @@
  
  \section{Introduction} \label{ch11:Introduction}
  
-Monotonicity preserving interpolation and approximation have received substantial attention in the last thirty years because of their numerous applications in computer aided design, statistics and machine learning \cite{Dierckx1995_book,Kvasov2000_book,deboor2001_book}. Constrained splines are particularly popular because of their flexibility in modelling different geometrical shapes, sound theoretical properties and availability of numerically stable algorithms \cite{Dierckx1995_book,Schumaker1981_book,deboor2001_book}. 
+Monotonicity preserving interpolation and approximation have received substantial attention in the last thirty years because of their numerous applications in computer aided design, statistics and machine learning \cite{Dierckx1995_book,Kvasov2000_book,deboor2001_book}. Constrained splines are particularly popular because of their flexibility in modelling different geometrical shapes, sound theoretical properties and availability of numerically stable algorithms \cite{Dierckx1995_book,Schumaker1981_book,deboor2001_book}.
  % It is surprising though that few parallel spline algorithms are available.
  In this work we examine parallelisation and adaptation for GPUs of a few algorithms of monotone spline interpolation and data smoothing, which arose in the context of estimating probability distributions.
  
@@ -28,14 +28,14 @@ The rest of the chapter is organised as follows. Section \ref{ch11:splines} disc
  \begin{figure}[h]
  \centering
  \includegraphics[angle=0,width=8cm]{Chapters/chapter11/gregory1_plot1.pdf}
-\caption{Cubic spline (solid) and monotone quadratic spline (dashed) interpolating monotone data from \cite{Gregory1982}. Cubic spline fails to preserve monotonicity of the data.}
+\caption[Cubic spline (solid) and monotone quadratic spline (dashed) interpolating monotone data]{Cubic spline (solid) and monotone quadratic spline (dashed) interpolating monotone data from \cite{Gregory1982}. Cubic spline fails to preserve monotonicity of the data.}
  \label{ch11:fig1}
  \end{figure}
  
  \begin{figure}[h]
  \centering
  \includegraphics[angle=00,width=8cm]{Chapters/chapter11/gregory1_plot2_b.pdf}
-\caption{Hermite cubic spline (solid) and Hermite rational spline interpolating monotone data from \cite{Gregory1982} with non-negative prescribed slopes. Despite non-negative slopes, Hermite cubic spline is not monotone.}
+\caption[Hermite cubic spline (solid) and Hermite rational spline interpolating monotone data]{Hermite cubic spline (solid) and Hermite rational spline interpolating monotone data from \cite{Gregory1982} with non-negative prescribed slopes. Despite non-negative slopes, Hermite cubic spline is not monotone.}
  \label{ch11:fig2}
  \end{figure}
  
@@ -71,7 +71,9 @@ d_1=\left\{\begin{array}{ll}
              2\delta_{1}-d_2, & \mbox{if } \delta_{1}(2\delta_1-d_2)>0, \\
              0 & \mbox{otherwise},
            \end{array}
- \right. \;
+ \right. 
+ $$
+ $$
   d_n=\left\{\begin{array}{ll}
              2\delta_{n-1}-d_{n-1}, & \mbox{if } \delta_{n-1}(2\delta_{n-1}-d_{n-1})>0,\\
              0 & \mbox{otherwise}.
@@ -103,180 +105,186 @@ It is almost straightforward to parallelise this scheme for GPUs, by processing
  At the spline evaluation stage we need to compute $s(z_k)$ for a sequence of query values $\{z_k\}, k=1,\ldots,K$. For each $z_k$ we locate the interval $[t_i,t_{i+1}]$ containing $z_k$, using the bisection algorithm presented in Listing~\ref{ch11:algeval}, and then apply the appropriate coefficients of the quadratic function. This is also done in parallel.
  The bisection algorithm could be implemented using texture memory (to cache the array \texttt{z}), but this is not shown in Listing~\ref{ch11:algeval}.
  
-\begin{figure}[!hp]
-\renewcommand{\baselinestretch}{1}
- \begin{alltt}
-\begin{center}
-\begin{minipage}{13cm}\small
-
-template<typename Tx, typename Ty>
-\_\_global\_\_ void CalculateCoefficientsKnots( Tx *u, Ty *v, double *b, double *c,
-  double *t, double *alpha, double *beta, double *gamma, int N )
-\{
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  int s = tid*2;
-  while(tid<=(N-2))
-  \{
-   // decide whether an additional knot is necessary
-   if(fabs(c[tid]+c[tid+1]- 2*b[tid])<=0.1e-5) // tolerance
-   \{  //no additional knot
-      h[s]=h[s+1]=u[tid];
-      alpha[s]=alpha[s+1]=v[tid];
-      beta[s]=beta[s+1]=c[tid];
-      gamma[s]=gamma[s+1]=(c[tid+1]-c[tid])/(2*(fmax(1e-10,u[tid+1]-u[tid])));
-   \} else  \{  //adding a knot
-      h[s]=u[tid];
-      //determine the position of the knot
-      if((c[tid+1] - b[tid])*(c[tid] - b[tid])<0)
-        h[s+1]=u[tid+1] + (c[tid] - b[tid])*(fmax(1e-10,u[tid+1]-u[tid]))/
-               fmax(1e-10,(c[tid+1] - c[tid]));
-      else
-        h[s+1]=0.5*(u[tid+1] + u[tid]);
-   //calculate coefficients
-      double dtemp = (2*b[tid] - c[tid+1])+((c[tid+1] - c[tid])*(h[s+1] - u[tid]))/
-             fmax(1e-10,(u[tid+1] - u[tid]));
-      alpha[s]=v[tid];   beta[s]=c[tid];
-      gamma[s]=(dtemp - c[tid])/(2*fmax(1e-10,(h[s+1] - u[tid])));
-      alpha[s+1]=v[tid] + c[tid]*(h[s+1] - u[tid]) +
-                (dtemp - c[tid])*(h[s+1] - u[tid])/2;
-      gamma[s+1]=(c[tid+1] - dtemp)/(2*fmax(1e-10,(u[tid+1] - h[s+1])));
-      beta[s+1]=dtemp;
-    \}
-    tid += blockDim.x * gridDim.x;   s = tid*2;
-  \}
-  \_\_syncthreads();
-   // Select a single thread  to perform the last operation
-  if((threadIdx.x  ) == 0)  \{
-   s = (N-1) * 2;   h[s]=u[N-1];
-  \}
-  \_\_syncthreads();
-\}
-\end{minipage}
-\end{center}
-\end{alltt}
-\caption{Implementation of the kernel for calcuating spline knots and coefficients. Function fmax is used to avoid division by zero for data with coinciding abscissae.}
-\label{ch11:algcoef}
-\renewcommand{\baselinestretch}{2}
-\end{figure}
-
-\begin{figure}[!hp]
-\renewcommand{\baselinestretch}{1}
- \begin{alltt}
-\begin{center}
-\begin{minipage}{13cm}\small
-
-template<typename Tx, typename Ty>
-\_\_global\_\_ void CalculateBeta(Tx *u, Ty *v, double *b, int N)
-\{
-   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-   while(tid<=(N-2)) \{
-     b[tid]=(v[tid+1]-v[tid])/fmax(1e-20,double(u[tid+1]-u[tid]));
-     tid += blockDim.x * gridDim.x;
-  \}
- \_\_syncthreads();
-\}
-\_\_global\_\_ void CalculateDGeneral( double *b, double *c, int N)
-\{
-   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-   while(tid<=(N-2)) \{
-     if((b[tid-1]*b[tid])<=0) c[tid]=0;
-       else c[tid]=(2*b[tid-1]*b[tid])/(b[tid-1]+b[tid]);
-     \}
-     tid += blockDim.x * gridDim.x;
-  \}
-  \_\_syncthreads();
-\}
-\_\_global\_\_ void CalculateD( double *b, double *c, int N )
-\{
-   if((b[0]*(2*b[0]-c[1]))<=0) c[0]=0;
-     else  c[0]=2*b[0] - c[1];
-   if((b[N-2]*(2*b[N-2]-c[N-2]))<=0) c[N-1]=0;
-     else c[N-1]=2*b[N-2] - c[N-2];
-   \_\_syncthreads();
-\}
-template<typename Tx, typename Ty>             
-int BuildMonotonSpline(Tx *d_X, Ty *d_Y, int N,
-   double *t, double *alpha, double *beta, double *gamma)
-\{
-  int T =  (N-1)*2+1; // length of the output array
-  double *b, *c; // temp variables
-  cudaMalloc( (void**)&b, 1*N*sizeof(double) );
-  cudaMalloc( (void**)&c, 2*N*sizeof(double) );
-  int threads=256;
-  int blocks = (N-1)/threads + 1;
-  CalculateBeta<<<blocks,threads>>>(d_X,d_Y,b,N);
-  CalculateDGeneral<<<blocks,threads>>>(b,c,N);
-  CalculateD<<<1,1>>>(b,c,NN);  // calculate d_1 and d_N
-  CalculateCoefficientsKnots<<<blocks,threads>>>(d_X,
-                          d_Y,b,c,h,alpha,beta,gamma,N);
-  cudaFree(b); cudaFree(c);
-  return T;
-\}
-\end{minipage}
-\end{center}
-\end{alltt}
-\caption{Calculation of monotone spline knots and coefficients.}
-\label{ch11:algcoef1}
-\renewcommand{\baselinestretch}{2}
-\end{figure}
-
-\begin{figure}[!hp]
-\renewcommand{\baselinestretch}{1}
- \begin{alltt}
-\begin{center}
-\begin{minipage}{13cm}\small
-
-template<typename T>
-\_\_device\_\_ void Bisection\_device(T z, T* t, int mi, int ma,  int* l)
-\{
-  int i; ma--;
-  while(1) \{
-    i=(mi+ma)/2;
-    if(z >= t[i]) mi=i+1;
-             else ma=i;
-    if(mi>=ma) break;
-  \}           
-  *l = mi-1;
-\}
-
-/* Kernel to evaluates monotone spline for a sequence of query points
-   residing in the array z of size m
-*/
-template<typename Tx, typename Ty>     
-\_\_global\_\_ void d\_MonSplineValue(Tx* z, int K, double* t,
-  double * alpha, double * beta, double * gamma, int T, Ty *value)
-\{
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  int mi=0, ma=T, i=0;
-  Ty r;
-  while(tid<K)
-  \{
-     Bisection\_device(z[tid], t, mi, ma,  &i);
-     r= z[tid]-t[i];
-     r= alpha[i] + r*(beta[i] + gamma[i]*r);
-     value[tid]=r;
-     tid += blockDim.x * gridDim.x;
-   \}
-   \_\_syncthreads();
-\}
-
-template<typename Tx, typename Ty>     
-void MonotoneSplineValue(Tx *z, int K, double* t,
-   double * alpha, double * beta, double * gamma, int T, Ty* result)
-\{     
-  int blocks,threads=256;
-  blocks=(K-1)/threads+1;
-  d\_MonSplineValue<<<blocks,threads>>>(z,K,t,alpha,beta,gamma,T,result);
-\}
-\end{minipage}
-\end{center}
-\end{alltt}
-\caption{Implementation of the spline evaluation algorithm for GPU.}
-\label{ch11:algeval}
-\renewcommand{\baselinestretch}{2}
-\end{figure}
-
-
+\lstinputlisting[label=ch11:algcoef,caption={Implementation of the kernel for calculating spline knots and coefficients. Function fmax is used to avoid division by zero for data with coinciding abscissae.}]{Chapters/chapter11/code1.cu}
+
+
+%% \begin{figure}[!hp]
+%% \renewcommand{\baselinestretch}{1}
+%%  \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+
+%% template<typename Tx, typename Ty>
+%% \_\_global\_\_ void CalculateCoefficientsKnots( Tx *u, Ty *v, double *b, double *c,
+%%   double *t, double *alpha, double *beta, double *gamma, int N )
+%% \{
+%%   int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%%   int s = tid*2;
+%%   while(tid<=(N-2))
+%%   \{
+%%    // decide whether an additional knot is necessary
+%%    if(fabs(c[tid]+c[tid+1]- 2*b[tid])<=0.1e-5) // tolerance
+%%    \{  //no additional knot
+%%       h[s]=h[s+1]=u[tid];
+%%       alpha[s]=alpha[s+1]=v[tid];
+%%       beta[s]=beta[s+1]=c[tid];
+%%       gamma[s]=gamma[s+1]=(c[tid+1]-c[tid])/(2*(fmax(1e-10,u[tid+1]-u[tid])));
+%%    \} else  \{  //adding a knot
+%%       h[s]=u[tid];
+%%       //determine the position of the knot
+%%       if((c[tid+1] - b[tid])*(c[tid] - b[tid])<0)
+%%         h[s+1]=u[tid+1] + (c[tid] - b[tid])*(fmax(1e-10,u[tid+1]-u[tid]))/
+%%                fmax(1e-10,(c[tid+1] - c[tid]));
+%%       else
+%%         h[s+1]=0.5*(u[tid+1] + u[tid]);
+%%    //calculate coefficients
+%%       double dtemp = (2*b[tid] - c[tid+1])+((c[tid+1] - c[tid])*(h[s+1] - u[tid]))/
+%%              fmax(1e-10,(u[tid+1] - u[tid]));
+%%       alpha[s]=v[tid];   beta[s]=c[tid];
+%%       gamma[s]=(dtemp - c[tid])/(2*fmax(1e-10,(h[s+1] - u[tid])));
+%%       alpha[s+1]=v[tid] + c[tid]*(h[s+1] - u[tid]) +
+%%                 (dtemp - c[tid])*(h[s+1] - u[tid])/2;
+%%       gamma[s+1]=(c[tid+1] - dtemp)/(2*fmax(1e-10,(u[tid+1] - h[s+1])));
+%%       beta[s+1]=dtemp;
+%%     \}
+%%     tid += blockDim.x * gridDim.x;   s = tid*2;
+%%   \}
+%%   \_\_syncthreads();
+%%    // Select a single thread  to perform the last operation
+%%   if((threadIdx.x  ) == 0)  \{
+%%    s = (N-1) * 2;   h[s]=u[N-1];
+%%   \}
+%%   \_\_syncthreads();
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Implementation of the kernel for calcuating spline knots and coefficients. Function fmax is used to avoid division by zero for data with coinciding abscissae.}
+%% \label{ch11:algcoef}
+%% \renewcommand{\baselinestretch}{2}
+%% \end{figure}
+
+
+\lstinputlisting[label=ch11:algcoef1,caption=Calculation of monotone spline knots and coefficients.]{Chapters/chapter11/code2.cu}
+
+%% \begin{figure}[!hp]
+%% \renewcommand{\baselinestretch}{1}
+%%  \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+
+%% template<typename Tx, typename Ty>
+%% \_\_global\_\_ void CalculateBeta(Tx *u, Ty *v, double *b, int N)
+%% \{
+%%    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%%    while(tid<=(N-2)) \{
+%%      b[tid]=(v[tid+1]-v[tid])/fmax(1e-20,double(u[tid+1]-u[tid]));
+%%      tid += blockDim.x * gridDim.x;
+%%   \}
+%%  \_\_syncthreads();
+%% \}
+%% \_\_global\_\_ void CalculateDGeneral( double *b, double *c, int N)
+%% \{
+%%    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%%    while(tid<=(N-2)) \{
+%%      if((b[tid-1]*b[tid])<=0) c[tid]=0;
+%%        else c[tid]=(2*b[tid-1]*b[tid])/(b[tid-1]+b[tid]);
+%%      \}
+%%      tid += blockDim.x * gridDim.x;
+%%   \}
+%%   \_\_syncthreads();
+%% \}
+%% \_\_global\_\_ void CalculateD( double *b, double *c, int N )
+%% \{
+%%    if((b[0]*(2*b[0]-c[1]))<=0)      c[0]=0;
+%%      else  c[0]=2*b[0] - c[1];
+%%    if((b[N-2]*(2*b[N-2]-c[N-2]))<=0) c[N-1]=0;
+%%      else c[N-1]=2*b[N-2] - c[N-2];
+%%    \_\_syncthreads();
+%% \}
+%% template<typename Tx, typename Ty>          
+%% int BuildMonotonSpline(Tx *d_X, Ty *d_Y, int N,
+%%    double *t, double *alpha, double *beta, double *gamma)
+%% \{
+%%   int T =  (N-1)*2+1; // length of the output array
+%%   double *b, *c; // temp variables
+%%   cudaMalloc( (void**)&b, 1*N*sizeof(double) );
+%%   cudaMalloc( (void**)&c, 2*N*sizeof(double) );
+%%   int threads=256;
+%%   int blocks = (N-1)/threads + 1;
+%%   CalculateBeta<<<blocks,threads>>>(d_X,d_Y,b,N);
+%%   CalculateDGeneral<<<blocks,threads>>>(b,c,N);
+%%   CalculateD<<<1,1>>>(b,c,NN);  // calculate d_1 and d_N
+%%   CalculateCoefficientsKnots<<<blocks,threads>>>(d_X,
+%%                           d_Y,b,c,h,alpha,beta,gamma,N);
+%%   cudaFree(b); cudaFree(c);
+%%   return T;
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Calculation of monotone spline knots and coefficients.}
+%% \label{ch11:algcoef1}
+%% \renewcommand{\baselinestretch}{2}
+%% \end{figure}
+
+%% \begin{figure}[!hp]
+%% \renewcommand{\baselinestretch}{1}
+%%  \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+
+%% template<typename T>
+%% \_\_device\_\_ void Bisection\_device(T z, T* t, int mi, int ma,  int* l)
+%% \{
+%%   int i; ma--;
+%%   while(1) \{
+%%     i=(mi+ma)/2;
+%%     if(z >= t[i]) mi=i+1;
+%%              else ma=i;
+%%     if(mi>=ma) break;
+%%   \}                
+%%   *l = mi-1;
+%% \}
+
+%% /* Kernel to evaluates monotone spline for a sequence of query points
+%%    residing in the array z of size m
+%% */
+%% template<typename Tx, typename Ty>  
+%% \_\_global\_\_ void d\_MonSplineValue(Tx* z, int K, double* t,
+%%   double * alpha, double * beta, double * gamma, int T, Ty *value)
+%% \{
+%%   int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%%   int mi=0, ma=T, i=0;
+%%   Ty r;
+%%   while(tid<K)
+%%   \{
+%%      Bisection\_device(z[tid], t, mi, ma,  &i);
+%%      r= z[tid]-t[i];
+%%      r= alpha[i] + r*(beta[i] + gamma[i]*r);
+%%      value[tid]=r;
+%%      tid += blockDim.x * gridDim.x;
+%%    \}
+%%    \_\_syncthreads();
+%% \}
+
+%% template<typename Tx, typename Ty>  
+%% void MonotoneSplineValue(Tx *z, int K, double* t,
+%%    double * alpha, double * beta, double * gamma, int T, Ty* result)
+%% \{  
+%%   int blocks,threads=256;
+%%   blocks=(K-1)/threads+1;
+%%   d\_MonSplineValue<<<blocks,threads>>>(z,K,t,alpha,beta,gamma,T,result);
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Implementation of the spline evaluation algorithm for GPU.}
+%% \label{ch11:algeval}
+%% \renewcommand{\baselinestretch}{2}
+%% \end{figure}
+
+\lstinputlisting[label=ch11:algeval,caption=Implementation of the spline evaluation algorithm for GPU.]{Chapters/chapter11/code3.cu}
  
  \subsection{Monotone Hermite splines}
  
@@ -381,15 +389,17 @@ As expected,  the runtimes of both methods differed significantly, as shown in T
  
  From the results in Table \ref{ch11:table1} we conclude that serial PAVA is superior to MLS for $n>10^4$. While it is possible to transfer data from GPU to CPU and run PAVA there, it is warranted only for sufficiently large data $n\geq 5 \times 10^5$, for otherwise the data transfer overheads will dominate CPU time. For smaller $n$, isotone regression is best performed on GPU.
  
-We also see that the use of GPU accelerated MLS by a factor of at least 100. The cost of serial MLS is prohibitive for  $n>10^6$. 
+We also see that the use of GPU accelerated MLS by a factor of at least 100. The cost of serial MLS is prohibitive for  $n>10^6$.
  
  We should mention that not all isotone regression problems allow a PAV-like algorithm linear in time. When the data may contain large outliers, monotonizing the data is better done not in the least squares sense, but using other cost functionals, such as by minimizing the sum of absolute deviations \cite{Wang} or using M-estimators \cite{Yohai}, which are less sensitive to outliers. It is interesting that in all such cases the solution to isotone regression problem can be found by solving the maximin problem
  $$
-u_i=\max_{k\leq i} \min_{l \geq i} \hat y(k,l), 
+u_i=\max_{k\leq i} \min_{l \geq i} \hat y(k,l),
  $$
  with $\hat y(k,l)$ being the unrestricted maximum likelihood estimator of $y_k,\ldots,y_l$. For quadratic cost function $\hat y(k,l)$ is the mean, as in PAV and MLS algorithms, for the absolute deviations it becomes the median, and for other cost functions an M-estimator of location. The MLS algorithm can be applied to such isotone regression problems with very little modification, while a linear in time algorithm may not be available. Our parallel MLS algorithm will be valuable in such cases.
  
-%\renewcommand{\baselinestretch}{1}
+
+
+%% %\renewcommand{\baselinestretch}{1}
  \begin{table}[!h]
  \begin{center}
  \caption{The average CPU time (sec) of the serial PAVA, MLS and parallel MLS algorithms.  } \label{ch11:table1}
@@ -417,70 +427,74 @@ $n=50 \times 10^6$ &11& 11& -- \\
  \end{tabular}
  \end{center}
  \end{table}
-%\renewcommand{\baselinestretch}{2}
-
-
-\begin{figure}[!hp]
- \begin{alltt}
-\begin{center}
-\begin{minipage}{13cm}\small
-template<typename Tx>   
-__device__ Tx Aver(Tx z,int i,int j, Tx *z) \{return (z-z[j+1])/(j-i+1);\}
-
-template<typename Tx>
-__global__ void monotonizekernel(Tx *y, Tx *z, Tx *u, int *key, int n)  
-\{ int i = threadIdx.x + blockIdx.x * blockDim.x;
-   if(i<n) \{
-      int smallestJ = i;
-      Tx curP, smallestP, curz=z[i];
-      smallestP=Aver(curz,i,i,z);
-      for(int j = i+1; j < n; j++) \{
-          curP=Aver(curz,i,j,z);
-          if(smallestP>curP) \{
-               smallestJ = j;
-               smallestP = curP;
-          \}   
-      \}
-      curP=y[i];
-      if(curP > smallestP) t=smallestP;
-                      else smallestJ=i;
-      key[i]=smallestJ;
-      u[i]=t;
-   \}
-\}
-
-template< typename Tx >
-void MonotonizeData(Tx *y, int n, Tx *u) \{
-    thrust::less_equal<int> binary_pred;
-    thrust::maximum<Tx>     binary_op2;
-    thrust::device_vector<Tx> z_d(n+1);
-    thrust::device_vector<int> keys_d(n);      
-    thrust::device_ptr<Tx> y_d(y), u_d(u);
-    thrust::fill(u_d, u_d+n, -1e100);
-    thrust::fill(keys_d.begin(), keys_d.end(), 0);
-
-    thrust::reverse_iterator< typename thrust::device_vector<Tx>::iterator >
-            y_reverse_b(y_d+n), y_reverse_end(y_d), z_reverse_b(z_d.end());
+%% %\renewcommand{\baselinestretch}{2}
+
+
+%% \begin{figure}[!hp]
+%%  \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+%% template<typename Tx>       
+%% __device__ Tx Aver(Tx z,int i,int j, Tx *z) \{return (z-z[j+1])/(j-i+1);\}
+
+%% template<typename Tx>
+%% __global__ void monotonizekernel(Tx *y, Tx *z, Tx *u, int *key, int n)
+%% \{ int i = threadIdx.x + blockIdx.x * blockDim.x;
+%%    if(i<n) \{
+%%       int smallestJ = i;
+%%       Tx curP, smallestP, curz=z[i];
+%%       smallestP=Aver(curz,i,i,z);
+%%       for(int j = i+1; j < n; j++) \{
+%%           curP=Aver(curz,i,j,z);
+%%           if(smallestP>curP) \{
+%%                smallestJ = j;
+%%                smallestP = curP;
+%%           \}        
+%%       \}
+%%       curP=y[i];
+%%       if(curP > smallestP) t=smallestP;
+%%                       else smallestJ=i;
+%%       key[i]=smallestJ;
+%%       u[i]=t;
+%%    \}
+%% \}
+
+%% template< typename Tx >
+%% void MonotonizeData(Tx *y, int n, Tx *u) \{
+%%     thrust::less_equal<int> binary_pred;
+%%     thrust::maximum<Tx>     binary_op2;
+%%     thrust::device_vector<Tx> z_d(n+1);
+%%     thrust::device_vector<int> keys_d(n);   
+%%     thrust::device_ptr<Tx> y_d(y), u_d(u);
+%%     thrust::fill(u_d, u_d+n, -1e100);
+%%     thrust::fill(keys_d.begin(), keys_d.end(), 0);
+
+%%     thrust::reverse_iterator< typename thrust::device_vector<Tx>::iterator >
+%%             y_reverse_b(y_d+n), y_reverse_end(y_d), z_reverse_b(z_d.end());
         
-    thrust::inclusive_scan(y_reverse_b, y_reverse_end, z_reverse_b+1);
+%%     thrust::inclusive_scan(y_reverse_b, y_reverse_end, z_reverse_b+1);
  
-    monotonizekernel<<<grid, block>>>(y, thrust::raw_pointer_cast(&z_d[0]), 
-                               u, thrust::raw_pointer_cast(&keys_d[0]), n );
+%%     monotonizekernel<<<grid, block>>>(y, thrust::raw_pointer_cast(&z_d[0]),
+%%                                u, thrust::raw_pointer_cast(&keys_d[0]), n );
  
-    thrust::sort(keys_d.begin(), keys_d.end());
-    thrust::inclusive_scan_by_key(keys_d.begin(), keys_d.end(), 
-                                  u_d, u_d, binary_pred, binary_op2);
-\}
-\end{minipage}
-\end{center}
-\end{alltt}
-\caption{Fragments of implementation of a parallel version of the MLS algorithm using Thrust library.}
-\label{ch11:algMLS}
-\end{figure}
+%%     thrust::sort(keys_d.begin(), keys_d.end());
+%%     thrust::inclusive_scan_by_key(keys_d.begin(), keys_d.end(),
+%%                                   u_d, u_d, binary_pred, binary_op2);
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Fragments of implementation of a parallel version of the MLS algorithm using Thrust library.}
+%% \label{ch11:algMLS}
+%% \end{figure}
+
+\lstinputlisting[label=ch11:algMLS,caption=Fragments of implementation of a parallel version of the MLS algorithm using Thrust library.]{Chapters/chapter11/code4.cu}
  
  \section{Conclusion} \label{ch11:conc}
  
  We presented three GPU-based parallel algorithms for approximating monotone data: monotone quadratic spline, monotone Hermite rational spline and minimum lower sets algorithm for monotonizing noisy data. These tools are valuable in a number of applications that involve large data sets modeled by monotone nonlinear functions.
-The source code of the package monospline is available from \texttt{www.deakin.edu.au/$\sim$ gleb/monospline.html }
+The source code of the package monospline is available from \texttt{www.deakin.edu.au/$\sim$gleb/monospline.html}.
+
+
  
  \putbib[Chapters/chapter11/biblio11]