From 1a01129f963257afbf1ca4effbb4e6e1f378cefa Mon Sep 17 00:00:00 2001
From: Raphael Couturier
Date: Wed, 17 Oct 2012 15:23:25 +0200
Subject: [PATCH 1/1] new

---
 BookGPU/Chapters/chapter11/ch11.tex | 178 ++++++++++++++--------------
 BookGPU/Chapters/chapter11/code1.cu |   3 +-
 BookGPU/Chapters/chapter11/code3.cu |   5 +-
 BookGPU/Chapters/chapter11/code4.cu |  46 +++++++
 4 files changed, 140 insertions(+), 92 deletions(-)
 create mode 100644 BookGPU/Chapters/chapter11/code4.cu

diff --git a/BookGPU/Chapters/chapter11/ch11.tex b/BookGPU/Chapters/chapter11/ch11.tex
index 1bf638e..7e319fd 100644
--- a/BookGPU/Chapters/chapter11/ch11.tex
+++ b/BookGPU/Chapters/chapter11/ch11.tex
@@ -395,94 +395,98 @@ u_i=\max_{k\leq i} \min_{l \geq i} \hat y(k,l),
 $$
 with $\hat y(k,l)$ being the unrestricted maximum likelihood estimator of $y_k,\ldots,y_l$. For the quadratic cost function $\hat y(k,l)$ is the mean, as in the PAV and MLS algorithms; for absolute deviations it becomes the median, and for other cost functions an M-estimator of location. The MLS algorithm can be applied to such isotone regression problems with very little modification, while a linear-in-time algorithm may not be available. Our parallel MLS algorithm will be valuable in such cases.
 
-%\renewcommand{\baselinestretch}{1}
-\begin{table}[!h]
-\begin{center}
-\caption{The average CPU time (sec) of the serial PAVA, MLS, and parallel MLS algorithms.} \label{ch11:table1}
-\begin{tabular}{|r|r|r|r|}
-
-Data & PAVA & MLS & GPU MLS \\ \hline
-
-monotone increasing $f$ & & & \\
-$n=5\times 10^4$ &0.01&5& 0.092\\
-$n=10^5$ &0.03&40& 0.35\\
-$n=5\times 10^5$ &0.4&1001&8.6 \\
-$n=10^6$ &0.8& 5000& 38 \\
-$n=2 \times 10^6$ & 1.6 &-- &152 \\
-$n=10 \times 10^6$ & 2 &-- & 3500 \\
-$n=20 \times 10^6$ & 4.5&-- & --\\
-$n=50 \times 10^6$ & 12 &-- & --\\
-\hline
-
-constant or decreasing $f$ & & & \\
-$n=10^6$ &0.2&0.1& 38\\
-$n=10 \times 10^6$ &1.9& 1.9& 3500 \\
-$n=20 \times 10^6$ &3.5& 4.0&-- \\
-$n=50 \times 10^6$ &11& 11& -- \\
-
-\end{tabular}
-\end{center}
-\end{table}
-%\renewcommand{\baselinestretch}{2}
-
-
-\begin{figure}[!hp]
- \begin{alltt}
-\begin{center}
-\begin{minipage}{13cm}\small
-template<typename Tx>
-__device__ Tx Aver(Tx zi,int i,int j, Tx *z) \{return (zi-z[j+1])/(j-i+1);\}
-
-template<typename Tx>
-__global__ void monotonizekernel(Tx *y, Tx *z, Tx *u, int *key, int n)
-\{ int i = threadIdx.x + blockIdx.x * blockDim.x;
-  if(i<n) \{
-    int smallestJ=i, j;
-    Tx smallestP, curP, t=y[i];
-    smallestP=Aver(z[i],i,i,z);
-    for(j=i+1; j<n; j++) \{
-      curP=Aver(z[i],i,j,z);
-      if(smallestP > curP) \{
-        smallestJ = j;
-        smallestP = curP;
-      \}
-    \}
-    curP=y[i];
-    if(curP > smallestP) t=smallestP;
-    else smallestJ=i;
-    key[i]=smallestJ;
-    u[i]=t;
-  \}
-\}
-
-template< typename Tx >
-void MonotonizeData(Tx *y, int n, Tx *u) \{
- thrust::less_equal<int> binary_pred;
- thrust::maximum<Tx> binary_op2;
- thrust::device_vector<Tx> z_d(n+1);
- thrust::device_vector<int> keys_d(n);
- thrust::device_ptr<Tx> y_d(y), u_d(u);
- thrust::fill(u_d, u_d+n, -1e100);
- thrust::fill(keys_d.begin(), keys_d.end(), 0);
-
- thrust::reverse_iterator< typename thrust::device_vector<Tx>::iterator >
-  y_reverse_b(y_d+n), y_reverse_end(y_d), z_reverse_b(z_d.end());
+
+
+%% %\renewcommand{\baselinestretch}{1}
+%% \begin{table}[!h]
+%% \begin{center}
+%% \caption{The average CPU time (sec) of the serial PAVA, MLS, and parallel MLS algorithms.} \label{ch11:table1}
+%% \begin{tabular}{|r|r|r|r|}
+
+%% Data & PAVA & MLS & GPU MLS \\ \hline
+
+%% monotone increasing $f$ & & & \\
+%% $n=5\times 10^4$ &0.01&5& 0.092\\
+%% $n=10^5$ &0.03&40& 0.35\\
+%% $n=5\times 10^5$ &0.4&1001&8.6 \\
+%% $n=10^6$ &0.8& 5000& 38 \\
+%% $n=2 \times 10^6$ & 1.6 &-- &152 \\
+%% $n=10 \times 10^6$ & 2 &-- & 3500 \\
+%% $n=20 \times 10^6$ & 4.5&-- & --\\
+%% $n=50 \times 10^6$ & 12 &-- & --\\
+%% \hline
+
+%% constant or decreasing $f$ & & & \\
+%% $n=10^6$ &0.2&0.1& 38\\
+%% $n=10 \times 10^6$ &1.9& 1.9& 3500 \\
+%% $n=20 \times 10^6$ &3.5& 4.0&-- \\
+%% $n=50 \times 10^6$ &11& 11& -- \\
+
+%% \end{tabular}
+%% \end{center}
+%% \end{table}
+%% %\renewcommand{\baselinestretch}{2}
+
+
+%% \begin{figure}[!hp]
+%% \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+%% template<typename Tx>
+%% __device__ Tx Aver(Tx zi,int i,int j, Tx *z) \{return (zi-z[j+1])/(j-i+1);\}
+
+%% template<typename Tx>
+%% __global__ void monotonizekernel(Tx *y, Tx *z, Tx *u, int *key, int n)
+%% \{ int i = threadIdx.x + blockIdx.x * blockDim.x;
+%%   if(i<n) \{
+%%     int smallestJ=i, j;
+%%     Tx smallestP, curP, t=y[i];
+%%     smallestP=Aver(z[i],i,i,z);
+%%     for(j=i+1; j<n; j++) \{
+%%       curP=Aver(z[i],i,j,z);
+%%       if(smallestP > curP) \{
+%%         smallestJ = j;
+%%         smallestP = curP;
+%%       \}
+%%     \}
+%%     curP=y[i];
+%%     if(curP > smallestP) t=smallestP;
+%%     else smallestJ=i;
+%%     key[i]=smallestJ;
+%%     u[i]=t;
+%%   \}
+%% \}
+
+%% template< typename Tx >
+%% void MonotonizeData(Tx *y, int n, Tx *u) \{
+%%  thrust::less_equal<int> binary_pred;
+%%  thrust::maximum<Tx> binary_op2;
+%%  thrust::device_vector<Tx> z_d(n+1);
+%%  thrust::device_vector<int> keys_d(n);
+%%  thrust::device_ptr<Tx> y_d(y), u_d(u);
+%%  thrust::fill(u_d, u_d+n, -1e100);
+%%  thrust::fill(keys_d.begin(), keys_d.end(), 0);
+
+%%  thrust::reverse_iterator< typename thrust::device_vector<Tx>::iterator >
+%%   y_reverse_b(y_d+n), y_reverse_end(y_d), z_reverse_b(z_d.end());
 
- thrust::inclusive_scan(y_reverse_b, y_reverse_end, z_reverse_b+1);
-
- monotonizekernel<<<(n-1)/256+1,256>>>(y, thrust::raw_pointer_cast(&z_d[0]),
-  u, thrust::raw_pointer_cast(&keys_d[0]), n );
-
- thrust::sort(keys_d.begin(), keys_d.end());
- thrust::inclusive_scan_by_key(keys_d.begin(), keys_d.end(),
-   u_d, u_d, binary_pred, binary_op2);
-\}
-\end{minipage}
-\end{center}
-\end{alltt}
-\caption{Fragments of the implementation of a parallel version of the MLS algorithm using the Thrust library.}
-\label{ch11:algMLS}
-\end{figure}
+%%  thrust::inclusive_scan(y_reverse_b, y_reverse_end, z_reverse_b+1);
+
+%%  monotonizekernel<<<(n-1)/256+1,256>>>(y, thrust::raw_pointer_cast(&z_d[0]),
+%%   u, thrust::raw_pointer_cast(&keys_d[0]), n );
+
+%%  thrust::sort(keys_d.begin(), keys_d.end());
+%%  thrust::inclusive_scan_by_key(keys_d.begin(), keys_d.end(),
+%%   u_d, u_d, binary_pred, binary_op2);
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Fragments of the implementation of a parallel version of the MLS algorithm using the Thrust library.}
+%% \label{ch11:algMLS}
+%% \end{figure}
+
+\lstinputlisting[label=ch11:algMLS,caption=Fragments of the implementation of a parallel version of the MLS algorithm using the Thrust library.]{Chapters/chapter11/code4.cu}
 
 \section{Conclusion} \label{ch11:conc}
 
diff --git a/BookGPU/Chapters/chapter11/code1.cu b/BookGPU/Chapters/chapter11/code1.cu
index 3b77cc2..4f2316e 100644
--- a/BookGPU/Chapters/chapter11/code1.cu
+++ b/BookGPU/Chapters/chapter11/code1.cu
@@ -1,7 +1,6 @@
 template<typename Tx, typename Ty>
-__global__ void CalculateCoefficientsKnots( Tx *u, Ty *v, double *b, double *c,
-    double *t, double *alpha, double *beta, double *gamma, int N )
+__global__ void CalculateCoefficientsKnots( Tx *u, Ty *v, double *b, double *c, double *t, double *alpha, double *beta, double *gamma, int N )
 {
   int tid = threadIdx.x +
   blockIdx.x * blockDim.x;
   int s = tid*2;
diff --git a/BookGPU/Chapters/chapter11/code3.cu b/BookGPU/Chapters/chapter11/code3.cu
index de460f6..fa658eb 100644
--- a/BookGPU/Chapters/chapter11/code3.cu
+++ b/BookGPU/Chapters/chapter11/code3.cu
@@ -31,10 +31,9 @@ __global__ void d_MonSplineValue(Tx* z, int K, double* t, double * alpha, double
 }
 
 template<typename Tx, typename Ty>
-void MonotoneSplineValue(Tx *z, int K, double* t,
-    double * alpha, double * beta, double * gamma, int T, Ty* result)
+void MonotoneSplineValue(Tx *z, int K, double* t, double * alpha, double * beta, double * gamma, int T, Ty* result)
 {
   int blocks,threads=256;
   blocks=(K-1)/threads+1;
   d_MonSplineValue<<<blocks,threads>>>(z,K,t,alpha,beta,gamma,T,result);
-}
\ No newline at end of file
+}
diff --git a/BookGPU/Chapters/chapter11/code4.cu b/BookGPU/Chapters/chapter11/code4.cu
new file mode 100644
index 0000000..6eb646d
--- /dev/null
+++ b/BookGPU/Chapters/chapter11/code4.cu
@@ -0,0 +1,46 @@
+template<typename Tx>
+__device__ Tx Aver(Tx zi,int i,int j, Tx *z) {return (zi-z[j+1])/(j-i+1);}
+
+template<typename Tx>
+__global__ void monotonizekernel(Tx *y, Tx *z, Tx *u, int *key, int n)
+{ int i = threadIdx.x + blockIdx.x * blockDim.x;
+  if(i<n) {
+    // find the smallest average of y[i..j] over j>=i;
+    // z[i] holds the suffix sum y[i]+...+y[n-1], so Aver(z[i],i,j,z) is that average
+    int smallestJ=i, j;
+    Tx smallestP, curP, t=y[i];
+    smallestP=Aver(z[i],i,i,z);
+    for(j=i+1; j<n; j++) {
+      curP=Aver(z[i],i,j,z);
+      if(smallestP > curP) {
+        smallestJ = j;
+        smallestP = curP;
+      }
+    }
+    curP=y[i];
+    if(curP > smallestP)
+      t=smallestP;
+    else
+      smallestJ=i;
+    key[i]=smallestJ;
+    u[i]=t;
+  }
+}
+
+template< typename Tx >
+void MonotonizeData(Tx *y, int n, Tx *u) {
+    thrust::less_equal<int> binary_pred;
+    thrust::maximum<Tx> binary_op2;
+    thrust::device_vector<Tx> z_d(n+1);
+    thrust::device_vector<int> keys_d(n);
+    thrust::device_ptr<Tx> y_d(y), u_d(u);
+    thrust::fill(u_d, u_d+n, -1e100);
+    thrust::fill(keys_d.begin(), keys_d.end(), 0);
+
+    thrust::reverse_iterator< typename thrust::device_vector<Tx>::iterator > y_reverse_b(y_d+n), y_reverse_end(y_d), z_reverse_b(z_d.end());
+
+    // suffix sums of y via a reverse inclusive scan: z_d[i]=y[i]+...+y[n-1], z_d[n]=0
+    thrust::inclusive_scan(y_reverse_b, y_reverse_end, z_reverse_b+1);
+
+    // one thread per data point; 256 threads per block as in the other listings of this chapter
+    monotonizekernel<<<(n-1)/256+1,256>>>(y, thrust::raw_pointer_cast(&z_d[0]), u, thrust::raw_pointer_cast(&keys_d[0]), n );
+
+    thrust::sort(keys_d.begin(), keys_d.end());
+    thrust::inclusive_scan_by_key(keys_d.begin(), keys_d.end(), u_d, u_d, binary_pred, binary_op2);
+}
-- 
2.39.5
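
For readers who want to try the new code4.cu fragment on its own, the sketch below shows one possible host-side driver. It is an illustration, not part of the patch: the Thrust headers, the main() wrapper, the sample data, and the direct #include "code4.cu" are assumptions made here for a standalone build; only MonotonizeData and monotonizekernel come from the fragment introduced above.

// Hypothetical standalone driver for the MonotonizeData fragment (not part of the patch).
#include <cstdio>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/scan.h>
#include <thrust/sort.h>
#include <thrust/functional.h>
#include <thrust/iterator/reverse_iterator.h>

#include "code4.cu"   // the fragment added by this patch

int main() {
  const int n = 8;
  // Non-monotone input; the MLS step should smooth the dips into a non-decreasing fit.
  double h_y[n] = {1.0, 2.0, 1.5, 3.0, 2.5, 4.0, 3.5, 5.0};
  double h_u[n];

  double *d_y, *d_u;
  cudaMalloc(&d_y, n*sizeof(double));
  cudaMalloc(&d_u, n*sizeof(double));
  cudaMemcpy(d_y, h_y, n*sizeof(double), cudaMemcpyHostToDevice);

  // u receives the monotone (isotone) approximation of y.
  MonotonizeData(d_y, n, d_u);

  cudaMemcpy(h_u, d_u, n*sizeof(double), cudaMemcpyDeviceToHost);
  for(int i = 0; i < n; i++) printf("%g ", h_u[i]);
  printf("\n");

  cudaFree(d_y); cudaFree(d_u);
  return 0;
}

Compiling with something like nvcc -O2 driver.cu -o driver should work, assuming code4.cu sits in the same directory. If the fragment behaves as intended, the printed u values form a non-decreasing sequence close to the input y.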