As expected, the runtimes of the two methods differed significantly, as shown in Table \ref{ch11:table1}, and the linear-time PAVA was clearly superior to the serial MLS algorithm. For some special cases, e.g., the test function $f=\mathrm{const}$, both serial methods gave the same running time; this is explained by the fact that large blocks of data allowed MLS to skip the majority of tests. This did not happen in the parallel version of MLS, where the smallest value of $P_{ij^*}$ was computed (in parallel) for each datum, so the average running times were the same for all data.

From the results in Table \ref{ch11:table1} we conclude that serial PAVA is superior to MLS for $n>10^4$. While it is possible to transfer the data from the GPU to the CPU and run PAVA there, this is warranted only for sufficiently large data, $n\geq 5 \times 10^5$; otherwise the data transfer overheads dominate the CPU time. For smaller $n$, isotone regression is best performed on the GPU.

We also see that the use of the GPU accelerated MLS by a factor of at least 100, except for antitone data. The cost of serial MLS is prohibitive for $n>10^6$.

We should mention that not all isotone regression problems admit a PAV-like algorithm that is linear in time. When the data may contain large outliers, monotonizing the data is better done not in the least squares sense but with other cost functionals, such as the sum of absolute deviations \cite{Wang} or M-estimators \cite{Yohai}, which are less sensitive to outliers. It is interesting that in all such cases the solution to the isotone regression problem can be found by solving the maximin problem
$$
u_i=\max_{k\leq i} \min_{l \geq i} \hat y(k,l),
$$
where $\hat y(k,l)$ is the unrestricted maximum likelihood estimator of $y_k,\ldots,y_l$. For the quadratic cost function $\hat y(k,l)$ is the mean of these data (as in the PAV and MLS algorithms), for the sum of absolute deviations it is the median, and for other cost functions it is an M-estimator of location. The MLS algorithm can be applied to such isotone regression problems with very little modification. However, we are unaware of other algorithms for solving the modified problem that are linear in time. Our parallel MLS algorithm will therefore be valuable in such cases.

%% %\renewcommand{\baselinestretch}{2}
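To make the maximin characterization concrete, the following is a minimal serial sketch, not part of the implementation described in this chapter, that evaluates $u_i=\max_{k\leq i} \min_{l \geq i} \hat y(k,l)$ by brute force with the median as the block estimator $\hat y(k,l)$, i.e., isotone regression under the sum of absolute deviations; replacing the median by the mean recovers the least squares solution produced by PAV and MLS. The function names \texttt{IsotoneMaximin} and \texttt{BlockEstimator} and the $O(n^3 \log n)$ loop structure are illustrative only, intended for checking results on small inputs.

\begin{verbatim}
// Brute-force check of u_i = max_{k<=i} min_{l>=i} yhat(k,l), where
// yhat(k,l) is the median of y[k..l] (sum-of-absolute-deviations cost).
// Swap the median for the mean in BlockEstimator to obtain the
// least squares (PAV/MLS) solution instead.
#include <algorithm>
#include <cstdio>
#include <vector>

double BlockEstimator(const std::vector<double>& y, int k, int l) {
    // median of y[k..l]
    std::vector<double> b(y.begin() + k, y.begin() + l + 1);
    std::sort(b.begin(), b.end());
    int m = (int)b.size();
    return (m % 2) ? b[m/2] : 0.5 * (b[m/2 - 1] + b[m/2]);
}

std::vector<double> IsotoneMaximin(const std::vector<double>& y) {
    int n = (int)y.size();
    std::vector<double> u(n);
    for (int i = 0; i < n; i++) {
        double outer = -1e100;                     // max over k <= i
        for (int k = 0; k <= i; k++) {
            double inner = 1e100;                  // min over l >= i
            for (int l = i; l < n; l++)
                inner = std::min(inner, BlockEstimator(y, k, l));
            outer = std::max(outer, inner);
        }
        u[i] = outer;
    }
    return u;                                      // nondecreasing fit
}

int main() {
    std::vector<double> y = {1.0, 3.0, 2.0, 100.0, 4.0, 5.0};  // 100.0 is an outlier
    for (double v : IsotoneMaximin(y)) std::printf("%g ", v);
    std::printf("\n");
    return 0;
}
\end{verbatim}

In this brute-force form the sketch is only practical for small $n$; its purpose is to show that only the block estimator changes between cost functionals, which is why the MLS approach carries over to these problems with little modification.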
%% \begin{figure}[!hp]
%%  \begin{alltt}
%% \begin{center}
%% \begin{minipage}{13cm}\small
%% template<typename Tx>
%% /* average of y[i..j] recovered from the array z of partial sums */
%% __device__ Tx Aver(Tx zi, int i, int j, Tx *z) \{return (zi-z[j+1])/(j-i+1);\}

%% template<typename Tx>
%% __global__ void monotonizekernel(Tx *y, Tx *z, Tx *u, int *key, int n)
%% \{ int i = threadIdx.x + blockIdx.x * blockDim.x;
%%   if(i<n) \{
%%      int smallestJ = i;
%%      Tx t, curP, smallestP, curz=z[i];
%%      smallestP=Aver(curz,i,i,z);
%%      for(int j = i+1; j < n; j++) \{
%%          curP=Aver(curz,i,j,z);
%%          if(smallestP>curP) \{
%%              smallestJ = j;
%%              smallestP = curP;
%%          \}
%%      \}
%%      curP=y[i];
%%      /* u[i] = min(y[i], smallest average); key marks the block */
%%      if(curP > smallestP) t=smallestP;
%%                      else \{smallestJ=i; t=curP;\}
%%      key[i]=smallestJ;
%%      u[i]=t;
%%   \}
%% \}

%% template< typename Tx >
%% void MonotonizeData(Tx *y, int n, Tx *u) \{
%%     thrust::less_equal<int> binary_pred;
%%     thrust::maximum<Tx> binary_op2;
%%     thrust::device_vector<Tx> z_d(n+1);
%%     thrust::device_vector<int> keys_d(n);
%%     thrust::device_ptr<Tx> y_d(y), u_d(u);
%%     thrust::fill(u_d, u_d+n, -1e100);
%%     thrust::fill(keys_d.begin(), keys_d.end(), 0);

%%     thrust::reverse_iterator< typename thrust::device_vector<Tx>::iterator >
%%                   y_reverse_b(y_d+n), y_reverse_end(y_d), z_reverse_b(z_d.end());