From 88ce161b6f410c4d12b011560d4be74f23c4c471 Mon Sep 17 00:00:00 2001
From: couturie <couturie@extinction.(none)>
Date: Wed, 17 Oct 2012 13:53:23 +0200
Subject: [PATCH] new

---
 BookGPU/Chapters/chapter11/ch11.tex | 354 ++++++++++++++--------------
 BookGPU/Chapters/chapter11/code1.cu |  44 ++++
 BookGPU/Chapters/chapter11/code2.cu |  48 ++++
 BookGPU/Chapters/chapter11/code3.cu |  40 ++++
 4 files changed, 312 insertions(+), 174 deletions(-)
 create mode 100644 BookGPU/Chapters/chapter11/code1.cu
 create mode 100644 BookGPU/Chapters/chapter11/code2.cu
 create mode 100644 BookGPU/Chapters/chapter11/code3.cu
diff --git a/BookGPU/Chapters/chapter11/ch11.tex b/BookGPU/Chapters/chapter11/ch11.tex
index 8a8d54c..1bf638e 100644
--- a/BookGPU/Chapters/chapter11/ch11.tex
+++ b/BookGPU/Chapters/chapter11/ch11.tex
@@ -103,180 +103,186 @@ It is almost straightforward to parallelise this scheme for GPUs, by processing
 At the spline evaluation stage we need to compute $s(z_k)$ for a sequence of query values ${z_k}, k=1,\ldots,K$. For each $z_k$ we locate the interval $[t_i,t_{i+1}]$ containing $z_k$, using bisection algorithm presented in Figure \ref{ch11:algeval}, and then apply the appropriate coefficients of the quadratic function. This is also  done in parallel.
 The bisection algorithm could be implemented using texture memory (to cache the array \texttt{z}), but this is not shown in Figure \ref{ch11:algeval}.
 
-\begin{figure}[!hp]
-\renewcommand{\baselinestretch}{1}
- \begin{alltt}
-\begin{center}
-\begin{minipage}{13cm}\small
-
-template<typename Tx, typename Ty>
-\_\_global\_\_ void CalculateCoefficientsKnots( Tx *u, Ty *v, double *b, double *c,
-  double *t, double *alpha, double *beta, double *gamma, int N )
-\{
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  int s = tid*2;
-  while(tid<=(N-2))
-  \{
-   // decide whether an additional knot is necessary
-   if(fabs(c[tid]+c[tid+1]- 2*b[tid])<=0.1e-5) // tolerance
-   \{  //no additional knot
-      h[s]=h[s+1]=u[tid];
-      alpha[s]=alpha[s+1]=v[tid];
-      beta[s]=beta[s+1]=c[tid];
-      gamma[s]=gamma[s+1]=(c[tid+1]-c[tid])/(2*(fmax(1e-10,u[tid+1]-u[tid])));
-   \} else  \{  //adding a knot
-      h[s]=u[tid];
-      //determine the position of the knot
-      if((c[tid+1] - b[tid])*(c[tid] - b[tid])<0)
-        h[s+1]=u[tid+1] + (c[tid] - b[tid])*(fmax(1e-10,u[tid+1]-u[tid]))/
-               fmax(1e-10,(c[tid+1] - c[tid]));
-      else
-        h[s+1]=0.5*(u[tid+1] + u[tid]);
-   //calculate coefficients
-      double dtemp = (2*b[tid] - c[tid+1])+((c[tid+1] - c[tid])*(h[s+1] - u[tid]))/
-             fmax(1e-10,(u[tid+1] - u[tid]));
-      alpha[s]=v[tid];   beta[s]=c[tid];
-      gamma[s]=(dtemp - c[tid])/(2*fmax(1e-10,(h[s+1] - u[tid])));
-      alpha[s+1]=v[tid] + c[tid]*(h[s+1] - u[tid]) +
-                (dtemp - c[tid])*(h[s+1] - u[tid])/2;
-      gamma[s+1]=(c[tid+1] - dtemp)/(2*fmax(1e-10,(u[tid+1] - h[s+1])));
-      beta[s+1]=dtemp;
-    \}
-    tid += blockDim.x * gridDim.x;   s = tid*2;
-  \}
-  \_\_syncthreads();
-   // Select a single thread  to perform the last operation
-  if((threadIdx.x  ) == 0)  \{
-   s = (N-1) * 2;   h[s]=u[N-1];
-  \}
-  \_\_syncthreads();
-\}
-\end{minipage}
-\end{center}
-\end{alltt}
-\caption{Implementation of the kernel for calcuating spline knots and coefficients. Function fmax is used to avoid division by zero for data with coinciding abscissae.}
-\label{ch11:algcoef}
-\renewcommand{\baselinestretch}{2}
-\end{figure}
-
-\begin{figure}[!hp]
-\renewcommand{\baselinestretch}{1}
- \begin{alltt}
-\begin{center}
-\begin{minipage}{13cm}\small
-
-template<typename Tx, typename Ty>
-\_\_global\_\_ void CalculateBeta(Tx *u, Ty *v, double *b, int N)
-\{
-   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-   while(tid<=(N-2)) \{
-     b[tid]=(v[tid+1]-v[tid])/fmax(1e-20,double(u[tid+1]-u[tid]));
-     tid += blockDim.x * gridDim.x;
-  \}
- \_\_syncthreads();
-\}
-\_\_global\_\_ void CalculateDGeneral( double *b, double *c, int N)
-\{
-   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-   while(tid<=(N-2)) \{
-     if((b[tid-1]*b[tid])<=0) c[tid]=0;
-       else c[tid]=(2*b[tid-1]*b[tid])/(b[tid-1]+b[tid]);
-     \}
-     tid += blockDim.x * gridDim.x;
-  \}
-  \_\_syncthreads();
-\}
-\_\_global\_\_ void CalculateD( double *b, double *c, int N )
-\{
-   if((b[0]*(2*b[0]-c[1]))<=0)	c[0]=0;
-     else  c[0]=2*b[0] - c[1];
-   if((b[N-2]*(2*b[N-2]-c[N-2]))<=0) c[N-1]=0;
-     else c[N-1]=2*b[N-2] - c[N-2];
-   \_\_syncthreads();
-\}
-template<typename Tx, typename Ty>		
-int BuildMonotonSpline(Tx *d_X, Ty *d_Y, int N,
-   double *t, double *alpha, double *beta, double *gamma)
-\{
-  int T =  (N-1)*2+1; // length of the output array
-  double *b, *c; // temp variables
-  cudaMalloc( (void**)&b, 1*N*sizeof(double) );
-  cudaMalloc( (void**)&c, 2*N*sizeof(double) );
-  int threads=256;
-  int blocks = (N-1)/threads + 1;
-  CalculateBeta<<<blocks,threads>>>(d_X,d_Y,b,N);
-  CalculateDGeneral<<<blocks,threads>>>(b,c,N);
-  CalculateD<<<1,1>>>(b,c,NN);  // calculate d_1 and d_N
-  CalculateCoefficientsKnots<<<blocks,threads>>>(d_X,
-                          d_Y,b,c,h,alpha,beta,gamma,N);
-  cudaFree(b); cudaFree(c);
-  return T;
-\}
-\end{minipage}
-\end{center}
-\end{alltt}
-\caption{Calculation of monotone spline knots and coefficients.}
-\label{ch11:algcoef1}
-\renewcommand{\baselinestretch}{2}
-\end{figure}
-
-\begin{figure}[!hp]
-\renewcommand{\baselinestretch}{1}
- \begin{alltt}
-\begin{center}
-\begin{minipage}{13cm}\small
-
-template<typename T>
-\_\_device\_\_ void Bisection\_device(T z, T* t, int mi, int ma,  int* l)
-\{
-  int i; ma--;
-  while(1) \{
-    i=(mi+ma)/2;
-    if(z >= t[i]) mi=i+1;
-             else ma=i;
-    if(mi>=ma) break;
-  \}		
-  *l = mi-1;
-\}
-
-/* Kernel to evaluates monotone spline for a sequence of query points
-   residing in the array z of size m
-*/
-template<typename Tx, typename Ty>	
-\_\_global\_\_ void d\_MonSplineValue(Tx* z, int K, double* t,
-  double * alpha, double * beta, double * gamma, int T, Ty *value)
-\{
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  int mi=0, ma=T, i=0;
-  Ty r;
-  while(tid<K)
-  \{
-     Bisection\_device(z[tid], t, mi, ma,  &i);
-     r= z[tid]-t[i];
-     r= alpha[i] + r*(beta[i] + gamma[i]*r);
-     value[tid]=r;
-     tid += blockDim.x * gridDim.x;
-   \}
-   \_\_syncthreads();
-\}
-
-template<typename Tx, typename Ty>	
-void MonotoneSplineValue(Tx *z, int K, double* t,
-   double * alpha, double * beta, double * gamma, int T, Ty* result)
-\{	
-  int blocks,threads=256;
-  blocks=(K-1)/threads+1;
-  d\_MonSplineValue<<<blocks,threads>>>(z,K,t,alpha,beta,gamma,T,result);
-\}
-\end{minipage}
-\end{center}
-\end{alltt}
-\caption{Implementation of the spline evaluation algorithm for GPU.}
-\label{ch11:algeval}
-\renewcommand{\baselinestretch}{2}
-\end{figure}
-
-
+\lstinputlisting[label=ch11:algcoef,caption=Implementation of the kernel for calcuating spline knots and coefficients. Function fmax is used to avoid division by zero for data with coinciding abscissae.]{Chapters/chapter11/code1.cu}
+
+
+%% \begin{figure}[!hp]
+%% \renewcommand{\baselinestretch}{1}
+%%  \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+
+%% template<typename Tx, typename Ty>
+%% \_\_global\_\_ void CalculateCoefficientsKnots( Tx *u, Ty *v, double *b, double *c,
+%%   double *t, double *alpha, double *beta, double *gamma, int N )
+%% \{
+%%   int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%%   int s = tid*2;
+%%   while(tid<=(N-2))
+%%   \{
+%%    // decide whether an additional knot is necessary
+%%    if(fabs(c[tid]+c[tid+1]- 2*b[tid])<=0.1e-5) // tolerance
+%%    \{  //no additional knot
+%%       h[s]=h[s+1]=u[tid];
+%%       alpha[s]=alpha[s+1]=v[tid];
+%%       beta[s]=beta[s+1]=c[tid];
+%%       gamma[s]=gamma[s+1]=(c[tid+1]-c[tid])/(2*(fmax(1e-10,u[tid+1]-u[tid])));
+%%    \} else  \{  //adding a knot
+%%       h[s]=u[tid];
+%%       //determine the position of the knot
+%%       if((c[tid+1] - b[tid])*(c[tid] - b[tid])<0)
+%%         h[s+1]=u[tid+1] + (c[tid] - b[tid])*(fmax(1e-10,u[tid+1]-u[tid]))/
+%%                fmax(1e-10,(c[tid+1] - c[tid]));
+%%       else
+%%         h[s+1]=0.5*(u[tid+1] + u[tid]);
+%%    //calculate coefficients
+%%       double dtemp = (2*b[tid] - c[tid+1])+((c[tid+1] - c[tid])*(h[s+1] - u[tid]))/
+%%              fmax(1e-10,(u[tid+1] - u[tid]));
+%%       alpha[s]=v[tid];   beta[s]=c[tid];
+%%       gamma[s]=(dtemp - c[tid])/(2*fmax(1e-10,(h[s+1] - u[tid])));
+%%       alpha[s+1]=v[tid] + c[tid]*(h[s+1] - u[tid]) +
+%%                 (dtemp - c[tid])*(h[s+1] - u[tid])/2;
+%%       gamma[s+1]=(c[tid+1] - dtemp)/(2*fmax(1e-10,(u[tid+1] - h[s+1])));
+%%       beta[s+1]=dtemp;
+%%     \}
+%%     tid += blockDim.x * gridDim.x;   s = tid*2;
+%%   \}
+%%   \_\_syncthreads();
+%%    // Select a single thread  to perform the last operation
+%%   if((threadIdx.x  ) == 0)  \{
+%%    s = (N-1) * 2;   h[s]=u[N-1];
+%%   \}
+%%   \_\_syncthreads();
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Implementation of the kernel for calcuating spline knots and coefficients. Function fmax is used to avoid division by zero for data with coinciding abscissae.}
+%% \label{ch11:algcoef}
+%% \renewcommand{\baselinestretch}{2}
+%% \end{figure}
+
+
+\lstinputlisting[label=ch11:algcoef1,caption=Calculation of monotone spline knots and coefficients.]{Chapters/chapter11/code2.cu}
+
+%% \begin{figure}[!hp]
+%% \renewcommand{\baselinestretch}{1}
+%%  \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+
+%% template<typename Tx, typename Ty>
+%% \_\_global\_\_ void CalculateBeta(Tx *u, Ty *v, double *b, int N)
+%% \{
+%%    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%%    while(tid<=(N-2)) \{
+%%      b[tid]=(v[tid+1]-v[tid])/fmax(1e-20,double(u[tid+1]-u[tid]));
+%%      tid += blockDim.x * gridDim.x;
+%%   \}
+%%  \_\_syncthreads();
+%% \}
+%% \_\_global\_\_ void CalculateDGeneral( double *b, double *c, int N)
+%% \{
+%%    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%%    while(tid<=(N-2)) \{
+%%      if((b[tid-1]*b[tid])<=0) c[tid]=0;
+%%        else c[tid]=(2*b[tid-1]*b[tid])/(b[tid-1]+b[tid]);
+%%      \}
+%%      tid += blockDim.x * gridDim.x;
+%%   \}
+%%   \_\_syncthreads();
+%% \}
+%% \_\_global\_\_ void CalculateD( double *b, double *c, int N )
+%% \{
+%%    if((b[0]*(2*b[0]-c[1]))<=0)	c[0]=0;
+%%      else  c[0]=2*b[0] - c[1];
+%%    if((b[N-2]*(2*b[N-2]-c[N-2]))<=0) c[N-1]=0;
+%%      else c[N-1]=2*b[N-2] - c[N-2];
+%%    \_\_syncthreads();
+%% \}
+%% template<typename Tx, typename Ty>		
+%% int BuildMonotonSpline(Tx *d_X, Ty *d_Y, int N,
+%%    double *t, double *alpha, double *beta, double *gamma)
+%% \{
+%%   int T =  (N-1)*2+1; // length of the output array
+%%   double *b, *c; // temp variables
+%%   cudaMalloc( (void**)&b, 1*N*sizeof(double) );
+%%   cudaMalloc( (void**)&c, 2*N*sizeof(double) );
+%%   int threads=256;
+%%   int blocks = (N-1)/threads + 1;
+%%   CalculateBeta<<<blocks,threads>>>(d_X,d_Y,b,N);
+%%   CalculateDGeneral<<<blocks,threads>>>(b,c,N);
+%%   CalculateD<<<1,1>>>(b,c,NN);  // calculate d_1 and d_N
+%%   CalculateCoefficientsKnots<<<blocks,threads>>>(d_X,
+%%                           d_Y,b,c,h,alpha,beta,gamma,N);
+%%   cudaFree(b); cudaFree(c);
+%%   return T;
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Calculation of monotone spline knots and coefficients.}
+%% \label{ch11:algcoef1}
+%% \renewcommand{\baselinestretch}{2}
+%% \end{figure}
+
+%% \begin{figure}[!hp]
+%% \renewcommand{\baselinestretch}{1}
+%%  \begin{alltt}
+%% \begin{center}
+%% \begin{minipage}{13cm}\small
+
+%% template<typename T>
+%% \_\_device\_\_ void Bisection\_device(T z, T* t, int mi, int ma,  int* l)
+%% \{
+%%   int i; ma--;
+%%   while(1) \{
+%%     i=(mi+ma)/2;
+%%     if(z >= t[i]) mi=i+1;
+%%              else ma=i;
+%%     if(mi>=ma) break;
+%%   \}		
+%%   *l = mi-1;
+%% \}
+
+%% /* Kernel to evaluates monotone spline for a sequence of query points
+%%    residing in the array z of size m
+%% */
+%% template<typename Tx, typename Ty>	
+%% \_\_global\_\_ void d\_MonSplineValue(Tx* z, int K, double* t,
+%%   double * alpha, double * beta, double * gamma, int T, Ty *value)
+%% \{
+%%   int tid = threadIdx.x + blockIdx.x * blockDim.x;
+%%   int mi=0, ma=T, i=0;
+%%   Ty r;
+%%   while(tid<K)
+%%   \{
+%%      Bisection\_device(z[tid], t, mi, ma,  &i);
+%%      r= z[tid]-t[i];
+%%      r= alpha[i] + r*(beta[i] + gamma[i]*r);
+%%      value[tid]=r;
+%%      tid += blockDim.x * gridDim.x;
+%%    \}
+%%    \_\_syncthreads();
+%% \}
+
+%% template<typename Tx, typename Ty>	
+%% void MonotoneSplineValue(Tx *z, int K, double* t,
+%%    double * alpha, double * beta, double * gamma, int T, Ty* result)
+%% \{	
+%%   int blocks,threads=256;
+%%   blocks=(K-1)/threads+1;
+%%   d\_MonSplineValue<<<blocks,threads>>>(z,K,t,alpha,beta,gamma,T,result);
+%% \}
+%% \end{minipage}
+%% \end{center}
+%% \end{alltt}
+%% \caption{Implementation of the spline evaluation algorithm for GPU.}
+%% \label{ch11:algeval}
+%% \renewcommand{\baselinestretch}{2}
+%% \end{figure}
+
+\lstinputlisting[label=ch11:algeval,caption=Implementation of the spline evaluation algorithm for GPU.]{Chapters/chapter11/code3.cu}
 
 \subsection{Monotone Hermite splines}
 
diff --git a/BookGPU/Chapters/chapter11/code1.cu b/BookGPU/Chapters/chapter11/code1.cu
new file mode 100644
index 0000000..3b77cc2
--- /dev/null
+++ b/BookGPU/Chapters/chapter11/code1.cu
@@ -0,0 +1,44 @@
+
+template<typename Tx, typename Ty>
+__global__ void CalculateCoefficientsKnots( Tx *u, Ty *v, double *b, double *c,
+  double *t, double *alpha, double *beta, double *gamma, int N )
+{
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int s = tid*2;
+  while(tid<=(N-2))
+  {
+    // decide whether an additional knot is necessary
+    if(fabs(c[tid]+c[tid+1]- 2*b[tid])<=0.1e-5) // tolerance
+    { 
+      //no additional knot
+      h[s]=h[s+1]=u[tid];
+      alpha[s]=alpha[s+1]=v[tid];
+      beta[s]=beta[s+1]=c[tid];
+      gamma[s]=gamma[s+1]=(c[tid+1]-c[tid])/(2*(fmax(1e-10,u[tid+1]-u[tid])));
+    } 
+    else {  
+      //adding a knot
+      h[s]=u[tid];
+      //determine the position of the knot
+      if((c[tid+1] - b[tid])*(c[tid] - b[tid])<0)
+        h[s+1]=u[tid+1] + (c[tid] - b[tid])*(fmax(1e-10,u[tid+1]-u[tid]))/fmax(1e-10,(c[tid+1] - c[tid]));
+      else
+        h[s+1]=0.5*(u[tid+1] + u[tid]);
+      //calculate coefficients
+      double dtemp = (2*b[tid] - c[tid+1])+((c[tid+1] - c[tid])*(h[s+1] - u[tid]))/fmax(1e-10,(u[tid+1] - u[tid]));
+      alpha[s]=v[tid];   beta[s]=c[tid];
+      gamma[s]=(dtemp - c[tid])/(2*fmax(1e-10,(h[s+1] - u[tid])));
+      alpha[s+1]=v[tid] + c[tid]*(h[s+1] - u[tid]) +
+                (dtemp - c[tid])*(h[s+1] - u[tid])/2;
+      gamma[s+1]=(c[tid+1] - dtemp)/(2*fmax(1e-10,(u[tid+1] - h[s+1])));
+      beta[s+1]=dtemp;
+    }
+    tid += blockDim.x * gridDim.x;   s = tid*2;
+  }
+  __syncthreads();
+  // Select a single thread  to perform the last operation
+  if((threadIdx.x  ) == 0)  {
+    s = (N-1) * 2;   h[s]=u[N-1];
+  }
+  __syncthreads();
+}
diff --git a/BookGPU/Chapters/chapter11/code2.cu b/BookGPU/Chapters/chapter11/code2.cu
new file mode 100644
index 0000000..76b6c1a
--- /dev/null
+++ b/BookGPU/Chapters/chapter11/code2.cu
@@ -0,0 +1,48 @@
+template<typename Tx, typename Ty>
+__global__ void CalculateBeta(Tx *u, Ty *v, double *b, int N)
+{
+   int tid = threadIdx.x + blockIdx.x * blockDim.x;
+   while(tid<=(N-2)) {
+     b[tid]=(v[tid+1]-v[tid])/fmax(1e-20,double(u[tid+1]-u[tid]));
+     tid += blockDim.x * gridDim.x;
+  }
+ __syncthreads();
+}
+
+__global__ void CalculateDGeneral( double *b, double *c, int N)
+{
+   int tid = threadIdx.x + blockIdx.x * blockDim.x;
+   while(tid<=(N-2)) {
+     if((b[tid-1]*b[tid])<=0) c[tid]=0;
+       else c[tid]=(2*b[tid-1]*b[tid])/(b[tid-1]+b[tid]);
+     }
+     tid += blockDim.x * gridDim.x;
+  }
+  __syncthreads();
+}
+
+__global__ void CalculateD( double *b, double *c, int N )
+{
+   if((b[0]*(2*b[0]-c[1]))<=0)	c[0]=0;
+     else  c[0]=2*b[0] - c[1];
+   if((b[N-2]*(2*b[N-2]-c[N-2]))<=0) c[N-1]=0;
+     else c[N-1]=2*b[N-2] - c[N-2];
+   __syncthreads();
+}
+
+template<typename Tx, typename Ty>		
+int BuildMonotonSpline(Tx *d_X, Ty *d_Y, int N, double *t, double *alpha, double *beta, double *gamma)
+{
+  int T =  (N-1)*2+1; // length of the output array
+  double *b, *c; // temp variables
+  cudaMalloc( (void**)&b, 1*N*sizeof(double) );
+  cudaMalloc( (void**)&c, 2*N*sizeof(double) );
+  int threads=256;
+  int blocks = (N-1)/threads + 1;
+  CalculateBeta<<<blocks,threads>>>(d_X,d_Y,b,N);
+  CalculateDGeneral<<<blocks,threads>>>(b,c,N);
+  CalculateD<<<1,1>>>(b,c,NN);  // calculate d_1 and d_N
+  CalculateCoefficientsKnots<<<blocks,threads>>>(d_X,d_Y,b,c,h,alpha,beta,gamma,N);
+  cudaFree(b); cudaFree(c);
+  return T;
+}
\ No newline at end of file
diff --git a/BookGPU/Chapters/chapter11/code3.cu b/BookGPU/Chapters/chapter11/code3.cu
new file mode 100644
index 0000000..de460f6
--- /dev/null
+++ b/BookGPU/Chapters/chapter11/code3.cu
@@ -0,0 +1,40 @@
+template<typename T>
+__device__ void Bisection_device(T z, T* t, int mi, int ma,  int* l)
+{
+  int i; ma--;
+  while(1) {
+    i=(mi+ma)/2;
+    if(z >= t[i]) mi=i+1;
+             else ma=i;
+    if(mi>=ma) break;
+  }		
+  *l = mi-1;
+}
+
+/* Kernel to evaluates monotone spline for a sequence of query points residing in the array z of size m
+*/
+template<typename Tx, typename Ty>	
+__global__ void d_MonSplineValue(Tx* z, int K, double* t, double * alpha, double * beta, double * gamma, int T, Ty *value)
+{
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int mi=0, ma=T, i=0;
+  Ty r;
+  while(tid<K)
+  {
+     Bisection_device(z[tid], t, mi, ma,  &i);
+     r= z[tid]-t[i];
+     r= alpha[i] + r*(beta[i] + gamma[i]*r);
+     value[tid]=r;
+     tid += blockDim.x * gridDim.x;
+   }
+   __syncthreads();
+}
+
+template<typename Tx, typename Ty>	
+void MonotoneSplineValue(Tx *z, int K, double* t,
+   double * alpha, double * beta, double * gamma, int T, Ty* result)
+{	
+  int blocks,threads=256;
+  blocks=(K-1)/threads+1;
+  d_MonSplineValue<<<blocks,threads>>>(z,K,t,alpha,beta,gamma,T,result);
+}
\ No newline at end of file
-- 
2.39.5