From b16860cf1aa5417092807647bf36283d4c566809 Mon Sep 17 00:00:00 2001
From: asider <ar.sider@univ-bejaia.dz>
Date: Tue, 20 Oct 2015 15:36:33 +0100
Subject: [PATCH] section 1 2, et 3.

---
 paper.tex | 246 +++++++++++++++++++++++++++---------------------------
 1 file changed, 123 insertions(+), 123 deletions(-)

diff --git a/paper.tex b/paper.tex
index 73a2b47..c213794 100644
--- a/paper.tex
+++ b/paper.tex
@@ -1,10 +1,9 @@
 \documentclass[review]{elsarticle}
 
-\usepackage{lineno,hyperref}  
-\usepackage[utf8]{inputenc}
+\usepackage{lineno,hyperref}
+%%\usepackage[utf8]{inputenc}
 %%\usepackage[T1]{fontenc}
 %%\usepackage[french]{babel}
-
 \usepackage{amsmath,amsfonts,amssymb}
 \usepackage[ruled,vlined]{algorithm2e}
 \usepackage{array,multirow,makecell}
@@ -53,12 +52,12 @@
 
 \begin{frontmatter}
 
-\title{A parallel  root finding polynomial on GPU}
+\title{Parallel polynomial root finding  using GPU}
 
 %% Group authors per affiliation:
-%\author{Elsevier\fnref{myfootnote}}
-%\address{Radarweg 29, Amsterdam}
-%\fntext[myfootnote]{Since 1880.}
+\author{Elsevier\fnref{myfootnote}}
+\address{Radarweg 29, Amsterdam}
+\fntext[myfootnote]{Since 1880.}
 
 %% or include affiliations in footnotes:
 \author[mymainaddress]{Ghidouche Kahina\corref{mycorrespondingauthor}}
@@ -66,7 +65,7 @@
 \cortext[mycorrespondingauthor]{Corresponding author}
 \ead{kahina.ghidouche@gmail.com}
 
-\author[mysecondaryaddress]{Couturier RaphaÃ«l\corref{mycorrespondingauthor}}
+\author[mysecondaryaddress]{Couturier Raphael\corref{mycorrespondingauthor}}
 %%\cortext[mycorrespondingauthor]{Corresponding author}
 \ead{raphael.couturier@univ-fcomte.fr}
 
@@ -74,10 +73,8 @@
 %%\cortext[mycorrespondingauthor]{Corresponding author}
 \ead{ar.sider@univ-bejaia.dz}
 
-\address[mymainaddress]{Department of informatics, University of
-  BÃ©jaia, Algeria}
-\address[mysecondaryaddress]{FEMTO-ST Institute, University of
-  Bourgogne Franche-Comte }
+\address[mymainaddress]{Department of informatics,University of Bejaia,Algeria}
+\address[mysecondaryaddress]{FEMTO-ST Institute, University of Franche-ComptÃ© }
 
 \begin{abstract}
 in this article we present a parallel implementation
@@ -94,130 +91,129 @@ root finding of polynomials, high degree, iterative methods, Durant-Kerner, GPU,
 
 \linenumbers
 
-\section{Root finding problem}
-We consider a polynomial of degree \textit{n} having coefficients
-in the complex \textit{C} and zeros $\alpha_{i},\textit{i=1,...,n}$. 
+\section{The problem of finding roots of a polynomial}
+Polynomials are algebraic structures used in mathematics that capture physical phenomenons and that express the outcome in the form of a function of some unknown variable. Formally speaking,  a polynomial $p(x)$ of degree \textit{n} having $n$ coefficients in the complex plane \textit{C} and zeros $\alpha_{i},\textit{i=1,...,n}$ 
 %%\begin{center}
 \begin{equation}
-     {\Large p(x)=\sum{a_{i}x^{i}}=a_{n}\prod(x-\alpha_{i}),a_{0} a_{n}\neq 0}
+     {\Large p(x)=\sum{a_{i}x^{i}}=a_{n}\prod(x-\alpha_{i}),a_{0} a_{n}\neq 0}.
 \end{equation}
 %%\end{center}
 
- The root finding problem consist to find
-all n root of \textit{p(x)}. the problem of finding a root is
-equivalent to the problem of finding a fixed-point. To see this
-consider the fixed-point problem of finding the n-dimensional
-vector x such that
+The root finding problem consists in finding the values of all the $n$ values of the variable $x$ for which \textit{p(x)} is nullified. Such values are called zeroes of $p$. The problem of finding a root is equivalent to that of solving a fixed-point problem. To see this, consider the fixed-point problem of finding the $n$-dimensional
+vector $x$ such that
 \begin{center}
-$x=g(x).  $
+$x=g(x)$
 \end{center}
-Where $g : C^{n}\longrightarrow C^{n}$. Note that we can easily
+where $g : C^{n}\longrightarrow C^{n}$. Usually, we can easily
 rewrite this fixed-point problem as a root-finding problem by
-setting $f (x) = x-g(x)$ and likewise we can recast the
+setting $f(x) = x-g(x)$ and likewise we can recast the
 root-finding problem into a fixed-point problem by setting
 \begin{center}
-$g(x)= f(x)-x$
+$g(x)= f(x)-x$.
 \end{center}
-Often it will not be possible to solve such nonlinear equation
+
+Often it is not be possible to solve such nonlinear equation
 root-finding problems analytically. When this occurs we turn to
-numerical methods to approximate the solution. Generally speaking,
-algorithms for solving problems numerically can be divided into
+numerical methods to approximate the solution. 
+Generally speaking, algorithms for solving problems can be divided into
 two main groups: direct methods and iterative methods.
 \\
- Direct methods exist only for $n \leq 4$,solved in closed form by G. Cardano
+Direct methods exist only for $n \leq 4$, solved in closed form by G. Cardano
 in the mid-16th century. However, N.H. Abel in the early 19th
 century showed that polynomials of degree five or more could not
-be solved by  directs methods. Since then researchers have
-concentrated on numerical (iterative) methods such as the famous
+be solved by  directs methods. Since then, mathmathicians have
+focussed on numerical (iterative) methods such as the famous
 Newton's method, Bernoulli's method of the 18th, and Graeffe's.
-With the advent of electronic computers, different methods has
-been developed such as the Jenkins-Traub method, Larkin s method,
+
+Later on, with the advent of electronic computers, other methods has
+been developed such as the Jenkins-Traub method, Larkin's method,
 Muller's method, and several methods for simultaneous
-approximation of all the roots, starting with the Durand-Kerner
-method:
+approximation of all the roots, starting with the Durand-Kerner (DK)
+method :
 %%\begin{center}
 \begin{equation}
  Z_{i}=Z_{i}-\frac{P(Z_{i})}{\prod_{i\neq j}(z_{i}-z_{j})}
 \end{equation}
 %%\end{center}
 
-This formula is mentioned for the first time from
+This formula is mentioned for the first time by
 Weiestrass~\cite{Weierstrass03} as part of the fundamental theorem
-of Algebra and is rediscovered from Ilieff~\cite{Ilie50},
+of Algebra and is rediscovered by Ilieff~\cite{Ilie50},
 Docev~\cite{Docev62}, Durand~\cite{Durand60},
-Kerner~\cite{Kerner66}. Another method discovered from
+Kerner~\cite{Kerner66}. Another method discovered by
 Borsch-Supan~\cite{ Borch-Supan63} and also described and brought
-in the following form from Ehrlich~\cite{Ehrlich67} and
-Aberth~\cite{Aberth73}.
+in the following form by Ehrlich~\cite{Ehrlich67} and
+Aberth~\cite{Aberth73} uses a different iteration formula given as fellows :
 %%\begin{center}
 \begin{equation}
- Z_{i}=Z_{i}-\frac{1}{{\frac {P'(Z_{i})} {P(Z_{i})}}-{\sum_{i\neq j}(z_{i}-z_{j})}}
+ Z_{i}=Z_{i}-\frac{1}{{\frac {P'(Z_{i})} {P(Z_{i})}}-{\sum_{i\neq j}(z_{i}-z_{j})}}.
 \end{equation}
 %%\end{center}
 
 Aberth, Ehrlich and Farmer-Loizou~\cite{Loizon83} have proved that
-the above method has cubic order of convergence for simple roots.
+the Ehrlisch-Aberth method (EA) has a cubic order of convergence for simple roots whereas the Durand-Kerner has a quadratic order of convergence.
 
 
 Iterative methods raise several problem when implemented e.g.
 specific sizes of numbers must be used to deal with this
-difficulty.Moreover,the convergence time of iterative methods
-drastically increase like the degrees of high polynomials. The
+difficulty. Moreover, the convergence time of iterative methods
+drastically increases like the degrees of high polynomials. It is expected that the
 parallelization of these algorithms will improve the convergence
 time.
 
-Many authors have treated the problem of parallelization of
-simultaneous methods. Freeman~\cite{Freeman89} has tested the DK
-method, EA method and another method of the fourth order proposed
-from Farmer and Loizou~\cite{Loizon83},on a 8- processor linear
-chain, for polynomial of degree up to 8. The third method often
+Many authors have dealt with parallelisation of
+simultaneous methods, i.e. that find all the zeros simultaneously. 
+Freeman~\cite{Freeman89} implemeted and compared DK, EA and another method of the fourth order proposed
+by Farmer and Loizou~\cite{Loizon83}, on a 8- processor linear
+chain, for polynomials of degree up to 8. The third method often
 diverges, but the first two methods have speed-up 5.5
-(speed-up=(Time on one processor)/(Time on p processors)). Later
-Freeman and Bane~\cite{Freemanall90}  consider asynchronous
+(speed-up=(Time on one processor)/(Time on p processors)). Later,
+Freeman and Bane~\cite{Freemanall90}  considered asynchronous
 algorithms, in which each processor continues to update its
-approximations even although the latest values of other $z_i((k))$
-have not received from the other processors, in difference with
-the synchronous version where it would wait.
-in~\cite{Raphaelall01}proposed two methods of parallelization for
-architecture with shared memory and distributed memory,it able to
-compute the root of polynomial degree  10000 on 430 s with only 8
-pc and 2 communications per iteration. Compare to the sequential
-it take 3300 s to obtain the same results.
-
-After this few works discuses this problem until the apparition of
-the Compute Unified Device Architecture (CUDA)~\cite{CUDA10},a
+approximations even though the latest values of other $z_i((k))$
+have not been received from the other processors, in contrast with synchronous algorithms where it would wait those values before making a new iteration.
+Couturier et al. ~\cite{Raphaelall01} proposed two methods of parallelisation for
+a shared memory architecture and for distributed memory one. They were able to
+compute the roots of polynomials of degree 10000 in 430 seconds with only 8
+personal computers and 2 communications per iteration. Comparing to the sequential implementation
+where it takes up to 3300 seconds to obtain the same results, the authors show an interesting speedup, indeed.
+
+Very few works had been since this last work until the appearing of
+the Compute Unified Device Architecture (CUDA)~\cite{CUDA10}, a
 parallel computing platform and a programming model invented by
-NVIDIA. The computing ability of GPU has exceeded the counterpart
-of CPU. It is a waste of resource to be just a graphics card for
-GPU. CUDA adopts a totally new computing architecture to use the
+NVIDIA. The computing power of GPUs (Graphics Processing Unit) has exceeded that of
+of CPUs. However, CUDA adopts a totally new computing architecture to use the
 hardware resources provided by GPU in order to offer a stronger
 computing ability to the massive data computing.
 
 
-Indeed,~\cite{Kahinall14}proposed the implementation of the
-Durand-Kerner method on GPU (Graphics Processing Unit). The main
-result prove that a parallel implementation is 10 times as fast as
+Ghidouche et al. ~\cite{Kahinall14} proposed an implementation of the
+Durand-Kerner method on GPU. Their main
+result showed that a parallel CUDA implementation is 10 times as fast as
 the sequential implementation on a single CPU for high degree
-polynomials that is greater than about 48000.
-\paragraph{}
-The mean part of our work is to implement the Aberth method for the problem root finding for
-high degree polynomials on GPU architecture (Graphics Processing Unit). Initially we present the Aberth method in section 1. Amelioration of Aberth method was proposed in section 2. A related works for the implementation of simultaneous methods in a parallel computer was discuss in section 3. Section 4 we propose a parallel implementation of Aberth method on GPU. Section 5, we present our result and discuss it. Finally, in Section 6, we present our conclusions and future research directions.  
+polynomials of about 48000. To our knowledge, it is the first time such high degree polynomials are numerically solved.
+
 
-\section{Aberth method}
+In this paper, we focus on the implementation of the Aberth method for
+high degree polynomials on GPU. The paper is organised as fellows. Initially, we recall the Aberth method in Section.\ref{sec1}. Improvements for the Aberth method are proposed in Section.\ref{sec2}. Related work to the implementation of simultaneous methods using a parallel approach is presented in Section.\ref{secStateofArt}.
+In Section.4 we propose a parallel implementation of the Aberth method on GPU and discuss it. Section 5 presents and investigates our implementation and experimental study results. Finally, Section 6 concludes this paper and gives some hints for future research directions in this topic.  
+
+\section{The Sequential Aberth method}
+\label{sec1}
 A cubically convergent iteration method for finding zeros of
 polynomials was proposed by O.Aberth~\cite{Aberth73}. The Aberth
-method is a purely algebraic derivation.To illustrate the
-derivation, we let $w_{i}(z)$ be the product of linear factor 
+method is a purely algebraic derivation. To illustrate the
+derivation, we let $w_{i}(z)$ be the product of linear factors 
 
 \begin{equation}
 w_{i}(z)=\prod_{j=1,j \neq i}^{n} (z-x_{j})
 \end{equation}
 
-And rational function $R_{i}(z)$ be the correction term of
-Weistrass method~\cite{Weierstrass03}:
+And let a rational function $R_{i}(z)$ be the correction term of the
+Weistrass method~\cite{Weierstrass03}
 
 \begin{equation}
-R_{i}(z)=\frac{p(z)}{w_{i}(z)} , i=1,2,...,n
+R_{i}(z)=\frac{p(z)}{w_{i}(z)} , i=1,2,...,n.
 \end{equation}
 
 Differentiating the rational function $R_{i}(z)$ and applying the
@@ -227,29 +223,28 @@ Newton method, we have:
 \frac{R_{i}(z)}{R_{i}^{'}(z)}= \frac{p(z)}{p^{'}(z)-p(z)\frac{w_{i}(z)}{w_{i}^{'}(z)}}= \frac{p(z)}{p^{'}(z)-p(z) \sum _{j=1,j \neq i}^{n}\frac{1}{z-x_{i}}}, i=1,2,...,n
 \end{equation}
 
-Substituting $x_{j}$ for z we obtain the Aberth iteration method
+Substituting $x_{j}$ for z we obtain the Aberth iteration method.
 
-Let present the means stages of Aberth method.
+In the fellowing we present the main stages of the running of the Aberth method.
 
 \subsection{Polynomials Initialization}
- The initialization of polynomial P(z) with complex coefficients
- are given by:
+The initialization of a polynomial p(z) is done by setting each of the $n$ complex coefficients $a_{i}$
+:
 
 \begin{equation}
   p(z)=\sum{a_{i}z^{n-i}} , a_{n} \neq 0,a_{0}=1, a_{i}\subset C
 \end{equation}
 
 
-\subsection{Vector $Z^{(0)}$ Initialization}
+\subsection{Vector $z^{(0)}$ Initialization}
 
-The choice of the initial points $z^{(0)}_{i}, i = 1, . . . , n.$
-from which starting the iteration  (2) or (3), is rather delicate
-since the number of steps needed by the iterative method to reach
+Like for any iterative method, we need to choose $n$ initial guess points $z^{(0)}_{i}, i = 1, . . . , n.$
+The initial guess is very important since the number of steps needed by the iterative method to reach
 a given approximation strongly depends on it.
-In~\cite{Aberth73}the Aberth iteration is started by selecting n
-equispaced points on a circle of center 0 and radius r, where r is
-an upper bound to the moduli of the zeros. After,~\cite{Bini96}
-performs this choice by selecting complex numbers along different
+In~\cite{Aberth73} the Aberth iteration is started by selecting $n$
+equi-spaced points on a circle of center 0 and radius r, where r is
+an upper bound to the moduli of the zeros. Later, Bini et al.~\cite{Bini96}
+performed this choice by selecting complex numbers along different
 circles and relies on the result of~\cite{Ostrowski41}.
 
 \begin{equation}
@@ -265,19 +260,19 @@ v_{i}=\frac{|\frac{a_{n}}{a_{i}}|^{\frac{1}{n-i}}}{2}.
 \end{equation}
 
 \subsection{Iterative Function $H_{i}$}
-The operator used with Aberth method is corresponding to the
+The operator used by the Aberth method is corresponding to the
 following equation which will enable the convergence towards
 polynomial solutions, provided all the roots are distinct.
 
 \begin{equation}
-H_{i}(z)=z_{i}-\frac{1}{\frac{P^{'}(z_{i})}{P(z_{i})}-\sum_{j\neq
+H_{i}(z)=z_{i}-\frac{1}{\frac{p^{'}(z_{i})}{p(z_{i})}-\sum_{j\neq
 i}{\frac{1}{z_{i}-z_{j}}}}
 \end{equation}
 
-\subsection{Convergence condition}
-Determines the success of the termination. It consists in stopping
-the iterative function $H_{i}(z)$ when the root are stable, the method
-converge sufficiently:
+\subsection{Convergence Condition}
+The convergence condition determines the termination of the algorithm. It consists in stopping from running
+the iterative function $H_{i}(z)$ when the roots are sufficiently stable. We consider that the method
+converges sufficiently when:
 
 \begin{equation}
 \forall i \in
@@ -285,33 +280,34 @@ converge sufficiently:
 \end{equation}
 
 
-\section{Amelioration of Aberth method }
-The Aberth method implementation suffer of overflow problems. This
+\section{Improving the Ehrlisch-Aberth Method}
+\label{sec2}
+The Ehrlisch-Aberth method implementation suffers of overflow problems. This
 situation occurs, for instance, in the case where a polynomial
-having positive coefficients and large degree is computed at a
-point $\xi$ where $|\xi| > 1$. Indeed the limited number in the
-mantissa of floating takings the computation of P(z) wrong when z
-is large. for example $(10^{50}) +1+ (- 10^{50})$ will give result
-0 instead of 1 in reality. Consequently we can not compute the roots
-for large polynomial's degree. This problem was discuss in
+having positive coefficients and a large degree is computed at a
+point $\xi$ where $|\xi| > 1$, where $|x|$ stands for the modolus of a complex $x$. Indeed, the limited number in the
+mantissa of floating points representations makes the computation of p(z) wrong when z
+is large. For example $(10^{50}) +1+ (- 10^{50})$ will give the wrong result
+of $0$ instead of $1$. Consequently, we can not compute the roots
+for large degrees. This problem was early discussed in
 ~\cite{Karimall98} for the Durand-Kerner method, the authors
-propose to use the logarithm and the exponential of a complex:
+propose to use the logarithm and the exponential of a complex in order to compute the power at a high exponent.
 
 \begin{equation}
+\label{deflncomplex}
  \forall(x,y)\in R^{*2}; \ln (x+i.y)=\ln(x^{2}+y^{2})
 2+i.\arcsin(y\sqrt{x^{2}+y^{2}})_{\left] -\pi, \pi\right[ }
 \end{equation}
 %%\begin{equation}
 \begin{align}
+\label{defexpcomplex}
  \forall(x,y)\in R^{*2}; \exp(x+i.y) & = \exp(x).\exp(i.y)\\
-                                     & =\exp(x).\cos(y)+i.\exp(x).\sin(y)
+                                     & =\exp(x).\cos(y)+i.\exp(x).\sin(y)\label{defexpcomplex}
 \end{align}
 %%\end{equation}
 
-The application of logarithm can replace any multiplications and
-divisions with additions and subtractions. Consequently, it
-manipulates lower absolute values and can be compute the roots for
-large polynomial's degree exceed~\cite{Karimall98}.
+Using the logarithm (eq. \ref{deflncomplex}) and the exponential (eq. \ref{defexpcomplex}) operators, we can replace any multiplications and divisions with additions and subtractions. Consequently, computations
+manipulate lower absolute values and the roots for large polynomial's degrees can be looked for successfully~\cite{Karimall98}.
 
 Applying this solution for the Aberth method we obtain the
 iteration function with logarithm:
@@ -319,29 +315,33 @@ iteration function with logarithm:
 \begin{equation}
 H_{i}(z)=z_{i}^{k}-\exp \left(\ln \left(
 p(z_{k})\right)-\ln\left(p(z_{k}^{'})\right)- \ln
-\left(1-Q(z_{k})\right)\right)
+\left(1-Q(z_{k})\right)\right),
 \end{equation}
-Where:
+
+where:
 
 \begin{equation}
 Q(z_{k})=\exp\left( \ln (p(z_{k}))-\ln(p(z_{k}^{'}))+\ln \left(
-\sum_{k\neq j}^{n}\frac{1}{z_{k}-z_{j}}\right)\right)
+\sum_{k\neq j}^{n}\frac{1}{z_{k}-z_{j}}\right)\right).
 \end{equation}
 
-This solution is applying when it is necessary
+This solution is applied when it is necessary ??? When ??? (SIDER)
 
 \section{The implementation of simultaneous methods in a parallel computer}
-    The main problem of the simultaneous methods is that the necessary
-time needed for the convergence is increased with the increasing
-of the degree of the polynomial. The parallelization of these
-algorithms will improve the convergence time. Researchers usually
-adopt one of the two following approaches to parallelize root
-finding algorithms. One approach is to reduce the total number of
-iterations as implemented by Miranker
+\label{secStateofArt}   
+The main problem of simultaneous methods is that the necessary
+time needed for convergence is increased when we increase
+the degree of the polynomial. The parallelisation of these
+algorithms is expected to improve the convergence time.
+Authors usually adopt one of the two following approaches to parallelize root
+finding algorithms. The first approach aims at reducing the total number of
+iterations as by Miranker
 ~\cite{Mirankar68,Mirankar71}, Schedler~\cite{Schedler72} and
-Winogard~\cite{Winogard72}. Another approach is to reduce the
+Winogard~\cite{Winogard72}. The second approach aims at reducing the
 computation time per iteration, as reported
-in~\cite{Benall68,Jana06,Janall99,Riceall06}. There are many
+in~\cite{Benall68,Jana06,Janall99,Riceall06}. 
+
+There are many
 schemes for simultaneous approximations of all roots of a given
 polynomial. Several works on different methods and issues of root
 finding have been reported in~\cite{Azad07, Gemignani07, Kalantari08, Skachek08, Zhancall08, Zhuall08}. However, Durand-Kerner and Ehrlich methods are the most practical choices among
-- 
2.39.5