From: Raphael Couturier Date: Mon, 17 Dec 2012 20:39:12 +0000 (+0100) Subject: ch3 X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/b0cfcc742771497c83313352b59170ead2f99f40?ds=inline;hp=-c ch3 --- b0cfcc742771497c83313352b59170ead2f99f40 diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index 293141d..8bc7b38 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -117,6 +117,7 @@ \part{This is a Part} \include{Chapters/chapter1/ch1} \include{Chapters/chapter2/ch2} +\include{Chapters/chapter3/ch3} \include{Chapters/chapter6/ch6} \include{Chapters/chapter8/ch8} \include{Chapters/chapter11/ch11} diff --git a/BookGPU/Chapters/chapter3/biblio3.bib b/BookGPU/Chapters/chapter3/biblio3.bib new file mode 100644 index 0000000..6b91912 --- /dev/null +++ b/BookGPU/Chapters/chapter3/biblio3.bib @@ -0,0 +1,450 @@ +@inproceedings{AllainBG08, + author = {Marc Allain and Nicolas Bertaux and Fr{\'e}d{\'e}ric Galland}, + bibsource = {DBLP, http://dblp.uni-trier.de}, + booktitle = {ACIVS}, + crossref = {DBLP:conf/acivs/2008}, + ee = {http://dx.doi.org/10.1007/978-3-540-88458-3_46}, + pages = {506--517}, + title = {Nonparametric Level-Set Segmentation Based on the Minimization of the Stochastic Complexity}, + year = {2008} +} + +@inproceedings{KauffmannP08, + author = {Claude Kauffmann and Nicolas Piche}, + bibsource = {DBLP, http://dblp.uni-trier.de}, + booktitle = {ICPR}, + crossref = {DBLP:conf/icpr/2008}, + ee = {http://dx.doi.org/10.1109/ICPR.2008.4761628}, + pages = {1--4}, + title = {Cellular automaton for ultra-fast watershed transform on GPU}, + year = {2008} +} + +@article{cremD09, + address = {Los Alamitos, CA, USA}, + author = {Thomas Schoenemann and Daniel Cremers}, + doi = {10.1109/TPAMI.2009.79}, + issn = {0162-8828}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + pages = {1153--1164}, + publisher = {IEEE Computer Society}, + title = {A Combinatorial Solution for Model-Based Image Segmentation and Real-Time Tracking}, + volume = {32}, + year = {2010} +} + +@article{KassWT88, + author = {Michael Kass and Andrew P. Witkin and Demetri Terzopoulos}, + bibsource = {DBLP, http://dblp.uni-trier.de}, + ee = {http://dx.doi.org/10.1007/BF00133570}, + journal = {International Journal of Computer Vision}, + number = {4}, + pages = {321--331}, + title = {Snakes: Active contour models}, + volume = {1}, + year = {1988} +} + +@article{XuP98, + author = {Chenyang Xu and Jerry L. Prince}, + bibsource = {DBLP, http://dblp.uni-trier.de}, + ee = {http://dx.doi.org/10.1109/83.661186}, + journal = {IEEE Transactions on Image Processing}, + number = {3}, + pages = {359--369}, + title = {Snakes, shapes, and gradient vector flow}, + volume = {7}, + year = {1998} +} + +@article{GallandBR03, + author = {Fr{\'e}d{\'e}ric Galland and Nicolas Bertaux and Philippe R{\'e}fr{\'e}gier}, + bibsource = {DBLP, http://dblp.uni-trier.de}, + ee = {http://dx.doi.org/10.1109/TIP.2003.816005}, + journal = {IEEE Transactions on Image Processing}, + number = {9}, + pages = {995--1006}, + title = {Minimum description length synthetic aperture radar image segmentation}, + volume = {12}, + year = {2003} +} + +@article{Brunett, + abstract = {Abstract. Active contours have been proven to be a powerful semiautomatic image segmentation approach, that seems to cope with many applications and different image modalities. However, they exhibit inherent drawbacks, including the sensibility to contour initialization due to the limited capture range of image edges and problems with concave boundary regions. The Gradient Vector Flow replaces the traditional image force and provides an enlarged capture range as well as enhanced concavity extraction capabilities, but it involves an expensive computational effort and considerably increased memory requirements at the time of computation. In this paper, we present an enhancement of the active contour model to facilitate semiautomatic contour detection in huge images. We propose a tile-based image decomposition accompanying an image force computation scheme on demand in order to minimize both computational and memory requirements. We show an efficient implementation of this approach on the basis of general purpose GPU processing providing for continuous active contour deformation without a considerable delay.}, + author = {Enrico {Dipl.-Inf. Kienel} and Guido {Prof. Dr. Brunnett}}, + institution = {MONARCH - Dokumenten- und Publikationsservice [http://archiv.tu-chemnitz.de/cgi-bin/interfaces/oai/oai2.pl] (Germany)}, + keywords = {Active Contours; GPGPU; Gradient Vector Flow; Image Segmentation; Snakes; Tiling; 004}, + location = {http://www.scientificcommons.org/41680702}, + publisher = {TU Chemnitz, Fakult{\"a}t f{\"u}r Informatik}, + title = {GPU-Accelerated Contour Extraction on Large Images Using Snakes}, + url = {http://archiv.tu-chemnitz.de/pub/2009/0035}, + year = {2009} +} + +@article{ChesnaudRB99, + author = {Christophe Chesnaud and Philippe R{\'e}fr{\'e}gier and Vlady Boulet}, + bibsource = {DBLP, http://dblp.uni-trier.de}, + ee = {http://www.computer.org/tpami/tp1999/i1145abs.htm}, + journal = {IEEE Trans. Pattern Anal. Mach. Intell.}, + number = {11}, + pages = {1145--1157}, + title = {Statistical Region Snake-Based Segmentation Adapted to Different Physical Noise Models}, + volume = {21}, + year = {1999} +} + +@article{GermainR01, + author = {Olivier Germain and Philippe R{\'e}fr{\'e}gier}, + bibsource = {DBLP, http://dblp.uni-trier.de}, + journal = {Pattern Recognition Letters}, + number = {10}, + pages = {1125--1132}, + title = {Statistical active grid for segmentation refinement}, + volume = {22}, + year = {2001} +} + +@article{Ruch01, + author = {Olivier Ruch and Philippe R{\'e}fr{\'e}gier}, + bibsource = {DBLP, http://dblp.uni-trier.de}, + ee = {http://www.computer.org/tpami/tp1999/i1145abs.htm}, + journal = {Optics Letters}, + month = {july}, + number = {13}, + title = {Minimal-complexity segmentation with a polygonal snake adapted to different optical noise models}, + volume = {26}, + year = {2001} +} + +@techreport{BlellochTR90, + author = {Guy~E. Blelloch}, + institution = {School of Computer Science, Carnegie Mellon University}, + month = nov, + number = {CMU-CS-90-190}, + title = {Prefix Sums and Their Applications}, + year = 1990 +} + +@inbook{Harris07, + author = {Mark Harris and Shubhabrata Sengupta and John D. Owens}, + chapter = {39 - Parallel Prefix Sum with CUDA}, + edition = {first}, + isbn = {9780321545428}, + publisher = {Addison-Wesley Professional}, + title = {Gpu gems 3}, + year = {2007} +} + +@manual{CUDAPG, + month = {7}, + organization = {NVIDIA Corporation}, + title = {NVIDIA CUDA C Programming Guide v3.1.1}, + year = {2010} +} + +@manual{CUDAFC, + month = {7}, + organization = {NVIDIA Corporation}, + title = {NVIDIA Fermi Compatibility Guide}, + year = {2010} +} + +@manual{CUDAFT, + month = {7}, + organization = {NVIDIA Corporation}, + title = {NVIDIA Fermi Tuning Guide}, + year = {2010} +} + +@inproceedings{Dabov09bm3dimage, + author = {Kostadin Dabov and Ro Foi and Vladimir Katkovnik and Karen Egiazarian}, + booktitle = {Proc. Workshop on Signal Processing with Adaptive Sparse Structured Representations (SPARS{\rq}09}, + title = {BM3D Image Denoising with Shape-Adaptive Principal Component Analysis}, + year = {2009} +} + +@article{Bertaux:04, + abstract = {We propose a method based on the maximum-likelihood technique for removing speckle patterns that plague coherent images. The proposed method is designed for images whose gray levels vary continuously in space. The image model is based on a lattice of nodes corresponding to vertices of triangles in which the gray level of each pixel is produced by linear interpolation. A constraint on isoline gray levels is introduced to regularize the solution.}, + author = {Nicolas Bertaux and Yann Frauel and Philippe R{\'e}fr{\'e}gier and Bahram Javidi}, + doi = {10.1364/JOSAA.21.002283}, + journal = {J. Opt. Soc. Am. A}, + keywords = {Coherence and statistical optics; Noise in imaging systems; Image processing; Image reconstruction techniques}, + month = {Dec}, + number = {12}, + pages = {2283--2291}, + publisher = {OSA}, + title = {Speckle removal using a maximum-likelihood technique with isoline gray-level regularization}, + url = {http://josaa.osa.org/abstract.cfm?URI=josaa-21-12-2283}, + volume = {21}, + year = {2004} +} + +@article{Wang04imagequality, + author = {Zhou Wang and Alan Conrad Bovik and Hamid Rahim Sheikh and Student Member and Eero P. Simoncelli and Senior Member}, + journal = {IEEE Transactions on Image Processing}, + pages = {600--612}, + title = {Image Quality Assessment: From Error Visibility to Structural Similarity}, + volume = {13}, + year = {2004} +} + +@proceedings{denoiselab, + author = {Steven Lansel}, + journal = {Scholl of electrical Engineering}, + month = oct, + school = {Stanford University}, + title = {DenoiseLab Philosophy: A Standard Test Set and Evaluation Method to Compare Denoising Algorithms}, + year = {2007} +} + +@article{denoisereview, + author = {A. Buades and B. Coll and J.M. Morel}, + issn = {1540-3459}, + journal = {Multiscale Modeling and Simulation}, + number = {2}, + pages = {490--530}, + publisher = {Society for Industrial and Applied Mathematics}, + title = {A Review of Image Denoising Algorithms, with a New One}, + volume = {4}, + year = {2005} +} + +@incollection{springerlink:10.1007/3-540-48236-9_16, + affiliation = {CMLA, ENS Cachan 61, av du Pr{\'e}sident Wilson 94235 Cachan Cedex France}, + author = {Pascal Monasse and Fr{\'e}d{\'e}ric Guichard}, + booktitle = {Scale-Space Theories in Computer Vision}, + editor = {Mads Nielsen and Peter Johansen and Ole Olsen and Joachim Weickert}, + isbn = {978-3-540-66498-7}, + keyword = {Computer Science}, + note = {10.1007/3-540-48236-9\_16}, + pages = {175--186}, + publisher = {Springer Berlin / Heidelberg}, + series = {Lecture Notes in Computer Science}, + title = {Scale-Space from a Level Lines Tree}, + url = {http://dx.doi.org/10.1007/3-540-48236-9\_16}, + volume = {1682}, + year = {1999} +} + +@inproceedings{caselles97, + author = {Vincent Caselles and Bartomeu Coll and Jean-Michel Morel}, + isbn = {978-3-540-63167-5}, + journal = {First International Conference on Scale-Space Theory in Computer Vision (Scale-Space'97)}, + month = {07}, + pages = {29--49}, + publisher = {Springer}, + title = {Scale space versus topographic map for natural images}, + year = {1997} +} + +@book{matheron75, + author = {Georges Matheron}, + isbn = {0-471-57621-2}, + publisher = {Wiley}, + title = {Random sets and integral geometry}, + year = {1975} +} + +@article{BuadesCM06, + author = {Antoni Buades and Bartomeu Coll and Jean-Michel Morel}, + bibsource = {DBLP, http://dblp.uni-trier.de}, + ee = {http://doi.ieeecomputersociety.org/10.1109/TIP.2006.871137}, + journal = {IEEE Transactions on Image Processing}, + number = {6}, + pages = {1499--1505}, + title = {The staircasing effect in neighborhood filters and its solution}, + volume = {15}, + year = {2006} +} + +@inproceedings{mcguire2008median, + author = {Morgan Mc{G}uire}, + booktitle = {ShaderX6}, + month = {February}, + title = {A fast, small-radius GPU median filter}, + url = {http://graphics.cs.williams.edu/papers/MedianShaderX6}, + year = {2008} +} + +@article{ipol.2011.bcm_nlm, + author = {Bartomeu Coll and Jean-Michel Morel and Antoni Buades}, + journal = {Image Processing On Line}, + title = {Non-local Means Denoising}, + doi = {10.5201/ipol.2011.bcm_nlm}, + year = 2011 +} + +@article{PALHANOXAVIERDEFONTES, + hal_id = {inria-00476122}, + url = {http://hal.inria.fr/inria-00476122}, + title = {{Real time ultrasound image denoising}}, + author = {Palhano Xavier De Fontes, Fernanda and Andrade Barroso, Guillermo and Coup{\'e}, Pierrick and Hellier, Pierre}, + abstract = {{Image denoising is the process of removing the noise that perturbs image analysis methods. In some applications like segmentation or registration, denoising is intended to smooth homogeneous areas while preserving the contours. In many applications like video analysis, visual servoing or image-guided surgical interventions, real-time denoising is required. This paper presents a method for real-time denoising of ultrasound images: a modified version of the NL-means method is presented that incorporates an ultrasound dedicated noise model, as well as a GPU implementation of the algorithm. Results demonstrate that the proposed method is very efficient in terms of denoising quality and is real-time.}}, + language = {Anglais}, + affiliation = {VISAGES : Vision Action et Gestion d'Informations en Sant{\'e} - VISAGES , Service Exp{\'e}rimentation et D{\'e}veloppement - SED , Montreal Neurological Institute , SERPICO - INRIA}, + publisher = {Springer}, + journal = {Journal of Real-Time Image Processing}, + audience = {internationale }, + doi = {10.1007/s11554-010-0158-5 }, + year = {2010}, + month = May, + pdf = {http://hal.inria.fr/inria-00476122/PDF/JRTIP.pdf}, +} + + +@inproceedings{YangTA09, + author = {Qingxiong Yang and Kar-Han Tan and Narendra Ahuja}, + bibsource = {DBLP, http://dblp.uni-trier.de}, + booktitle = {CVPR}, + crossref = {DBLP:conf/cvpr/2009}, + pages = {557--564}, + title = {Real-time O(1) bilateral filtering}, + url = {http://doi.ieeecomputersociety.org/10.1109/CVPRW.2009.5206542}, + year = {2009} +} + +@proceedings{DBLP:conf/cvpr/2009, + title = {2009 IEEE Computer Society Conference on Computer Vision + and Pattern Recognition (CVPR 2009), 20-25 June 2009, Miami, + Florida, USA}, + booktitle = {CVPR}, + publisher = {IEEE}, + year = {2009}, + isbn = {978-1-4244-3992-8}, + bibsource = {DBLP, http://dblp.uni-trier.de} +} + + + +@article{abs-1104, + author = {Gleb Beliakov}, + bibsource = {DBLP, http://dblp.uni-trier.de}, + journal = {CoRR}, + title = {Parallel calculation of the median and order statistics on GPUs with application to robust regression}, + url = {http://arxiv.org/abs/1104.2732}, + volume = {abs/1104.2732}, + year = {2011} +} + +@inproceedings{chen09, + author = {Wei Chen and M. Beister and Y. Kyriakou and M. Kachelries}, + booktitle = {Nuclear Science Symposium Conference Record (NSS/MIC), 2009 IEEE}, + doi = {10.1109/NSSMIC.2009.5402323}, + issn = {1095-7863}, + keywords = {CUDA-based BVM filter; NVIDIA compute unified device architecture; O(M In M) computational complexity; O(M2) computational complexity; branchless vectorized median filter; computerised tomography; data-level parallelism; fast accessing scheme; high performance median filtering; memory layout; modern commodity graphics processing units; pivot median filter; vectorized median computation; biology computing; computerised tomography; medical image processing}, + month = {24 2009-nov. 1}, + pages = {4142--4147}, + title = {High performance median filtering using commodity graphics hardware}, + year = {2009} +} + +@inproceedings{sanchezICASSP12, + author = {Ricardo M. Sanchez and Paul A. Rodriguez}, + booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2012 IEEE International Conference on}, + doi = {10.1109/ICASSP.2012.6288187}, + issn = {1520-6149}, + month = {march}, + pages = {1549--1552}, + title = {Bidimensional median filter for parallel computing architectures}, + year = {2012} +} + +@INPROCEEDINGS{6036776, +author={Perrot, G. and Domas, S. and Couturier, R. and Bertaux, N.}, +booktitle={Computer and Information Technology (CIT), 2011 IEEE 11th International Conference on}, title={GPU Implementation of a Region Based Algorithm for Large Images Segmentation}, +year={2011}, +month={31 2011-sept. 2}, +volume={}, +number={}, +pages={291 -298}, +keywords={GPU implementation;Nvidia GPU architecture;algorithmic optimization;graphical processing units;image computing;image segmentation;image size;multicore CPU;multithreaded execution capability;region based algorithm;region-based active contour technique;snake algorithm;computer graphic equipment;coprocessors;image enhancement;image segmentation;multi-threading;multiprocessing systems;optimisation;}, +doi={10.1109/CIT.2011.60}, +ISSN={},} + +@book{tukey77, +author = {Tukey, John Wilder}, +isbn = {0-201-07616-0}, +publisher = {Addison-Wesley}, +title = {Exploratory Data Analysis}, +year = 1977 +} + +@INPROCEEDINGS{5402362, +author={Kachelriess, M.}, +booktitle={Nuclear Science Symposium Conference Record (NSS/MIC), 2009 IEEE}, title={Branchless vectorized median filtering}, +year={2009}, +month={24 2009-nov. 1}, +volume={}, +number={}, +pages={4099 -4105}, +keywords={CPU-based implementations;Intel performance primitives library;branchless vectorized median filtering;computational complexity;conventional fast median filters;data sorting algorithm;data-level parallelism;doubly linked lists;image processing;instruction pipeline;median algorithm;median filtering noisy data;modern hardware;one-dimensional signals;random data;scalar data;signal processing;sorted array;spin-off effect;vector capabilities;vectorized mask operation;vectorized max operation;vectorized min operation;computational complexity;high energy physics instrumentation computing;median filters;sorting;}, +doi={10.1109/NSSMIC.2009.5402362}, +ISSN={1095-7863},} + + +@article{Weiss:2006:FMB:1141911.1141918, + author = {Weiss, Ben}, + title = {Fast median and bilateral filtering}, + journal = {ACM Trans. Graph.}, + issue_date = {July 2006}, + volume = {25}, + number = {3}, + month = jul, + year = {2006}, + issn = {0730-0301}, + pages = {519--526}, + numpages = {8}, + url = {http://doi.acm.org/10.1145/1141911.1141918}, + doi = {10.1145/1141911.1141918}, + acmid = {1141918}, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {SIMD, algorithms, bilateral filtering, complexity, data structures, histograms, image processing, median filtering, rank-order filtering, sorting, vector processing}, +} + +@inproceedings{Weiss:2006:FMB:1179352.1141918, + author = {Weiss, Ben}, + title = {Fast median and bilateral filtering}, + booktitle = {ACM SIGGRAPH 2006 Papers}, + series = {SIGGRAPH '06}, + year = {2006}, + isbn = {1-59593-364-6}, + location = {Boston, Massachusetts}, + pages = {519--526}, + numpages = {8}, + url = {http://doi.acm.org/10.1145/1179352.1141918}, + doi = {10.1145/1179352.1141918}, + acmid = {1141918}, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {SIMD, algorithms, bilateral filtering, complexity, data structures, histograms, image processing, median filtering, rank-order filtering, sorting, vector processing}, +} + +@book{Huang:1981:TDS:539567, + author = {Huang, Thomas S.}, + title = {Two-Dimensional Digital Signal Processing II: Transforms and Median Filters}, + year = {1981}, + isbn = {0387103597}, + publisher = {Springer-Verlag New York, Inc.}, + address = {Secaucus, NJ, USA}, +} + +@article{zheng2011performance, + title={Performance Tuning for CUDA-Accelerated Neighborhood Denoising Filters}, + author={Zheng, Z. and Xu, W. and Mueller, K.}, + journal={Workshop on High Performance Image Reconstruction (HPIR)}, + pages={52--55}, + year={2011} +} + +@INPROCEEDINGS{6288187, +author={Sanchez, R.M. and Rodriguez, P.A.}, +booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2012 IEEE International Conference on}, title={Bidimensional median filter for parallel computing architectures}, +year={2012}, +month={march}, +volume={}, +number={}, +pages={1549 -1552}, +keywords={adaptive rate compressive sensing;background subtraction;classical CS theory;cross validation;current measurement rate;sensor measurements;signal reconstruction;signal sparsity;time-varying signal;visual surveillance applications;compressed sensing;image reconstruction;video surveillance;}, +doi={10.1109/ICASSP.2012.6288187}, +ISSN={1520-6149},} diff --git a/BookGPU/Chapters/chapter3/ch3.tex b/BookGPU/Chapters/chapter3/ch3.tex new file mode 100755 index 0000000..2afb337 --- /dev/null +++ b/BookGPU/Chapters/chapter3/ch3.tex @@ -0,0 +1,454 @@ +\chapterauthor{Zulu pero}{Zulumachine Institute} +%\graphicspath{{img/}} + + +% \begin{VF} +% ``A '' + +% \VA{Thomas Davenport}{Senior Adjutant to the Junior Marketing VP} +% \end{VF} + + + +% \begin{shadebox} +% A component part for an electronic item is +% manufactured at one of three different factories, and then delivered to +% the main assembly line.Of the total number supplied, factory A supplies +% 50\%, factory B 30\%, and factory C 20\%. Of the components +% manufactured at factory A, 1\% are faulty and the corresponding +% proportions for factories B and C are 4\% and 2\% respectively. A +% component is picked at random from the assembly line. What is the +% probability that it is faulty? +% \end{shadebox} + + +% \begin{equation} +% \mbox{var}\widehat{\Delta} = \sum_{j = 1}^t \sum_{k = j+1}^t +% \mbox{var}\,(\hat{\alpha}_j - \hat{\alpha}_k) = \sum_{j = 1}^t +% \sum_{k = j+1}^t \sigma^2(1/n_j + 1/n_k). \label{2delvart2} +% \end{equation} + + +% \begin{shortbox} +% \Boxhead{Box Title Here} +% \end{shortbox} + +% \begin{theorem}\label{1th:Z_m} +% Let $m$ be a prime number. With the addition and multiplication as +% defined above, $Z_m$ is a field. +% \end{theorem} + +% \begin{proof} +% \end{proof} + +% \begin{notelist}{000000} +% \notes{Note:}{The process of integrating reengineering is best accomplished with an engineer, a dog, and a cat.} +% \end{notelist} + + +% \begin{VT1} +% \VH{Think About It...} +% Com +% \VT +% \VTA{The Information Revolution}{Business Week} +% \end{VT1} + + +%\begin{definition}\label{1def:linearcomb}{}\end{definition} + + + +% \begin{extract} +% text +% \end{extract} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Listings +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\lstset{ + language=C, + columns=fixed, + basicstyle=\footnotesize\ttfamily, + numbers=left, + firstnumber=1, + numberstyle=\tiny, + stepnumber=5, + numbersep=5pt, + tabsize=3, + extendedchars=true, + breaklines=true, + keywordstyle=\textbf, + frame=single, + % keywordstyle=[1]\textbf, + %identifierstyle=\textbf, + commentstyle=\color{white}\textbf, + stringstyle=\color{white}\ttfamily, + % xleftmargin=17pt, + % framexleftmargin=17pt, + % framexrightmargin=5pt, + % framexbottommargin=4pt, + backgroundcolor=\color{lightgray}, + } + +%\DeclareCaptionFont{blue}{\color{blue}} +%\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}} + +%\DeclareCaptionFont{white}{\color{white}} +%\DeclareCaptionFormat{listing}{\colorbox{gray}{\parbox{\textwidth}{\hspace{15pt}#1#2#3}}} +%\captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white, singleline} +%%%%%%%%%%%%%%%%%%%%%%%% Fin Listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\newcommand{\kl}{\includegraphics[scale=0.6]{Chapters/chapter3/img/kernLeft.png}~} +\newcommand{\kr}{\includegraphics[scale=0.6]{Chapters/chapter3/img/kernRight.png}} + +\chapter{Setting up the environnement.} +Image processing using a GPU often means using it as a general purpose computing processor, which soon brings up the issue of data transfers, especially when kernel runtime is fast and/or when large data sets are processed. +The truth is that, in certain cases, data transfers between GPU and CPU are slower than the actual computation on GPU. +It remains that global runtime can still be faster than similar processes run on CPU. +Therefore, to fully optimize global runtimes, it is important to pay attention to how memory transfers are done. +This leads us to propose, in the following section, an overall code structure to be used with all our kernel examples. + +Obviously, our code originally accepts various image dimensions and can process color images. +However, so as to propose concise and more readable code, we will assume the following limitations: +8 or 16~bit-coded gray-level input images whose dimensions $H\times W$ are multiples of 512 pixels. + +\section{Data transfers, memory management.} +This section deals with the following issues: +\begin{enumerate} +\item data transfer from CPU memory to GPU global memory: several GPU memory areas are available as destination memory but the 2-D caching mechanism of texture memory, specifically designed for fetching neighboring pixels, is currently the fastest way to fetch gray-level pixel values inside a kernel computation. This has lead us to choose \textbf{texture memory} as primary GPU memory area for images. +\item data fetching from GPU global memory to kernel local memory: as said above, we use texture memory. Depending on which process is run, texture data is used either by direct fetching in kernel local memory or through a prefetching in thread block shared memory. +\item data outputting from kernels to GPU memory: there is actually no alternative to global memory, as kernels can not directly write into texture memory and as copying from texture to CPU memory would not be faster than from simple global memory. +\item data transfer from GPU global memory to CPU memory: it can be drastically accelerated by use of \textbf{pinned memory}, keeping in mind it has to be used sparingly. +\end{enumerate} +Algorithm \ref{algo:memcopy} summarizes all the above considerations and describe how data are handled in our examples. For more information on how to handle the different types of GPU memory, we suggest to refer to CUDA programmer's guide. + +At debug stage, for simplicity's sake, we use the \textbf{cutil} library supplied by the NVidia developpement kit (SDK). Thus, in order to easily implement our examples, we suggest readers download download and install the latest NVidia-SDK (ours is SDK4.0), create a new directory \textit{SDK-root-dir/C/src/fast\_kernels} and adapt the generic \textit{Makefile} present in each sub-directory of \textit{SDK-root-dir/C/src/}. Then, only two more files will be enough to have a fully operational environnement: \textit{main.cu} and \textit{fast\_kernels.cu}. +Listings \ref{lst:main1}, \ref{lst:fkern1} and \ref{lst:mkfile} implement all the above considerations minimally, while remaining functional. + +The main file of Listing \ref{lst:main1} is a simplified version of our actual main file. +It has to be noticed that cutil functions \texttt{cutLoadPGMi} and \texttt{cutSavePGMi} only operate on unsigned integer data. As data is coded in short integer format for performance reasons, the use of these functions involves casting data after loading and before saving. This may be overcome by use of a different library. Actually, our choice was to modify the above mentioned cutil functions. + +Listing \ref{lst:fkern1} gives a minimal kernel skeleton that will serve as the basis for all other kernels. Lines 5 and 6 determine the coordinates $(i, j)$ of the pixel to be processed. Each pixel is associated with one thread. +The instruction in line 8 combines writing the output gray-level value into global memory and fetching the input gray-level value from 2-D texture memory. +The Makefile given in Listing \ref{lst:mkfile} shows how to adapt examples given in SDK. + +\begin{algorithm} + \SetNlSty{textbf}{}{:} + allocate and populate CPU memory \textbf{h\_in}\; + allocate CPU pinned-memory \textbf{h\_out}\; + allocate GPU global memory \textbf{d\_out}\; + declare GPU texture reference \textbf{tex\_img\_in}\; + allocate GPU array in global memory \textbf{array\_img\_in}\; + bind GPU array \textbf{array\_img\_in} to texture \textbf{tex\_img\_in}\; + copy data from \textbf{h\_in} to \textbf{array\_img\_in}\label{algo:memcopy:H2D}\; + kernel\kl gridDim,blockDim\kr()\tcc*[f]{outputs to d\_out}\label{algo:memcopy:kernel}\; + copy data from \textbf{d\_out} to \textbf{h\_out} \label{algo:memcopy:D2H}\; +\caption{Global memory management on CPU and GPU sides.} +\label{algo:memcopy} +\end{algorithm} + +\lstinputlisting[label={lst:main1},caption=Generic main.cu file used to launch CUDA kernels]{Chapters/chapter3/code/mainSkel.cu} + +\lstinputlisting[label={lst:fkern1},caption=fast\_kernels.cu file featuring one kernel skeleton]{Chapters/chapter3/code/kernSkel.cu} + +\lstinputlisting[label={lst:mkfile},caption=Generic Makefile based on those provided by NV SDK]{Chapters/chapter3/code/Makefile} + + +\section{Performance measurements} +As our goal is to design very fast implementations of basic image processing algorithms, we need to make quite accurate time-measurements, within the order of magnitude of $0.01~ms$. Again, the easiest way of doing so is to use the helper functions of the cutil library. As usual, as the durations we are measuring are short and possibly suject to non neglectable variations, a good practice is to measure multiple executions and issue the mean runtime. All time results given in this chapter have been obtained through 1000 calls to each kernel. + +Listing \ref{lst:chronos} shows how to use the dedicated cutil functions. Timer declaration and creation only need to be performed once while reset, start and stop can be used as often as necessary. Synchronization is mandatory before stopping the timer (Line 7), to avoid runtime measure being biased. +\lstinputlisting[label={lst:chronos},caption=Time measurement technique using cutil functions]{Chapters/chapter3/code/exChronos.cu} + +In an attempt to provide relevant speedup values, we either implemented CPU versions of the algorithms studied, or used the values found in existing literature. Still, the large number and diversity of hardware platforms and GPU cards make it impossible to benchmark every possible combination and significant differences may occur between the speedups we announce and those obtained with different devices. As a reference, our developing platform details as follows: + +\begin{itemize} +\item CPU codes run on: + \begin{itemize} + \item Quad Core Xeon E31245 at 3.3GHz-8GByte RAM running Linux kernel 3.2 + \item Quad Core Xeon E5620 at 2.40GHz-12GByte RAM running Linux kernel 2.6.18 + \end{itemize} +\item GPU codes run on: +\begin{itemize} + \item Nvidia Tesla C2070 hosted by a PC QuadCore Xeon E5620 at 2.4GHz-12GByte RAM, running Linux kernel 2.6.18 + \item NVidia GeForce GTX 280 hosted by a PC QuadCore Xeon X5482 at 3.20GHz-4GByte RAM, running Linux kernel 2.6.32 + \end{itemize} +\end{itemize} + +All kernels have also been tested with various image sizes from 512$\times$512 to 4096$\times$4096 pixels. This allows to guess runtime dependancy over image size. + +Last, like many authors, we chose to use the pixel throughput value of each process in Mega Pixels per second (MP/s) as a performance indicator, including data transfers and kernel runtimes. +In order to estimate the potential for improvement of each kernel, a reference throughput measurement, involving identity kernel of Listing \ref{lst:fkern1}, was performed. As this kernel only fetches input values from texture memory and outputs them to global memory without doing any computation, it represents the smallest, thus fastest, possible process and is taken as the reference throughput value (100\%). The same measurement was performed on CPU, with a maximum effective pixel throughput of 130~Mpixel per second. On GPU, depending on grid parameters it amounts to 800~MPixels/s on GTX280 and 1300~Mpixels/s on C2070. + + + +\chapter{Implementing a fast median filter} +\section{Introduction} +Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter especially for \textit{salt and pepper} denoising. It is able to highly reduce power of noise without blurring edges too much. + +First introduced by Tukey in \cite{tukey77}, it has been widely studied since then, and many researchers have proposed efficient implementations of it, adapted to various hypothesis, architectures and processors. +Originally, its main drawbacks were its compute complexity, its non linearity and its data-dependent runtime. Several researchers have adress these issues and designed, for example, efficient histogram-based median filter with predictible runtime \cite{Huang:1981:TDS:539567, Weiss:2006:FMB:1179352.1141918}. + +More recently, the advent of GPUs opened new perspectives in terms of image processing performance, and some researchers managed to take advantage of the new graphic capabilities: in that respect, we can cite the Branchless Vectorized Median filter (BVM) \cite{5402362, chen09} which allows very interesting runtimes on CUDA-enabled devices but, as far as we know, the fastest implementation to date is the histogram-based CCDS median filter \cite{6288187}. + +Some of the following implementations, feature very fast runtimes. They are targeted on Nvidia Tesla GPU (Fermi architecture, compute capability 2.x) but may easily be adapted to other models e.g. those of compute capability 1.3. + +The fastest ones are based on one efficient parallel implementation of the BVM algorithm described in \cite{mcguire2008median}, improving its performance through fine tuning of its implementation. + +\section{Median filtering} +\subsection{Basic principles} +DEsigning a 2-D median filter basically consists in defining a square window $H(i,j)$ for each pixel $I(i,j)$ of the input image, containing $n\times n$ pixels and centered on $I(i,j)$. The output value $I'(i,j)$ is the median value of the gray level values of the $n\times n$ pixels of $H(i,j)$. Figure \ref{fig:median_1} illustrates this principle with an example of a 5x5 median filter applied on pixel $I(5,6)$. The output value is the median value of the 25 values of the dark gray window centered on pixel $I(5,6)$. + The generic filtering method is given by Algorithm \ref{algo_median_generic}. After the data transfer stage of line \ref{algo_median_generic:memcpyH2D} which copies data from CPU memory to GPU texture memory, the actual median computing occurs between lines \ref{algo_median_generic:cptstart} and lines \ref{algo_median_generic:cptend}, before the final transfer which copies data back to CPU memory at line \ref{algo_median_generic:memcpyD2H}. Obviously, on key issue is the selection method that identifies the median value. But, as shown in figure \ref{fig:median_overlap}, since two neighboring pixels share part of the values to be sorted, a second key issue is how to rule redundancy between consecutive positions of the running window $H(i,j)$. +As mentioned earlier, the selection of the median value can be performed by por than one technique, using either histogram-based or sorting methods, each of them having its own benefits and drawbacks as will be discussed further down. + +\subsection{A naive implementation} +As a reference, Listing \ref{lst:medianGeneric} gives a simple, not to say simplistic implementation of a CUDA kernel (\texttt{kernel\_medianR}) achieving generic $n\times n$ histogram-based median filtering. Its runtime has a very low data dependency, but this implementation does not suit very well GPU architecture. Each pixel loads the whole of its $n\times n$ neighborhood meaning that one pixel is loaded multiple times inside one single thread block, and above all, the use of a local vector (histogram[]) considerably downgrades performance, as the compiler automatically stores such vectors in local memory (slow). + +Table \ref{tab:medianHisto1} displays measured runtimes of \texttt{kernel\_medianR} and pixel throughputs for each GPU version and for both CPU and GPU implementations. Usual window sizes of $3\times 3$, $5\times 5$ and $7\times 7$ are shown. Though some specific applications require larger window sizes and dedicated algorithms , such small square window sizes are most widely used in general purpose image processing. GPU runtimes have been obtained with a grid of 64-thread blocks. This block size, is a good compromise in this case. + +The first observation to make when analysing results of Table \ref{tab:medianHisto1} is that, on CPU, window size has almost no influence on the effective pixel throughput. +Since inner loops that fill the histogram vector contain very few fetching instructions (from 9 to 49, depending on the window size), it is not surprising to note neglectable runtime compared to the runtime of outer loops that fetch image pixels (from 256k to 16M instructions). +One could be tempted to claim that CPU has no chance to win, which is not so obvious as it highly depends on what kind of algorithm is run and above all, how it is implemented. Despite a maximum effective throughput potential that is almost five times higher, measured GTX280 throughput values sometimes prove slower than CPU values, as shown in Table \ref{tab:medianHisto1}. + +On the GPU's side, we note high dependence on window size due to the redundancy induced by the multiple fetches of each pixel inside each block, becoming higher with the window size as illustrated by Figure \ref{fig:median_overlap}. On C2070 card, thanks to a more efficient caching mechanism, this effect is lesser. On GPUs, dependency over image size is low, due to slightly more efficient data transfers when copying larger data amounts. Thus transferring a 4096$\times$4096 pixel image (32~MBytes) is a bit faster than transferring 64 times a 512$\times$512 pixel image (0.5~MBytes). + +%% mettre l'eau à la bouche + +\lstinputlisting[label={lst:medianGeneric},caption=Generic CUDA kernel achieving median filtering]{Chapters/chapter3/code/medianGeneric.cu} + +\begin{figure} + \centering + \includegraphics[width=8cm]{Chapters/chapter3/img/median_1.png} + \caption{Exemple of 5x5 median filtering} + \label{fig:median_1} +\end{figure} + +\begin{algorithm} + \SetNlSty{textbf}{}{:} + copy data from CPU to GPU texture memory\label{algo_median_generic:memcpyH2D}\; + \ForEach(\tcc*[f]{in parallel}){pixel at position $(x, y)$}{ + Read gray-level values of the n$\times$n neighborhood\label{algo_median_generic:cptstart}\; + Selects the median ($(n^2/2)^{th}$) value among those n$\times$n values\; + Outputs the new gray-level value \label{algo_median_generic:cptend}\; + } +copy data from GPU global memory to CPU memory\label{algo_median_generic:memcpyD2H}\; +\caption{generic n$\times$n median filter} +\label{algo_median_generic} +\end{algorithm} + +\begin{figure} + \centering + \includegraphics[width=5cm]{Chapters/chapter3/img/median_overlap.png} + \caption{Illustration of window overlapping in 5x5 median filtering} + \label{fig:median_overlap} +\end{figure} + + +\begin{table}[h] +%\newcolumntype{I}{!{\vrule width 1.5pt}} +\newlength\savedwidth +\newcommand\whline{\noalign{\global\savedwidth + \arrayrulewidth\global\arrayrulewidth 1.5pt} + \hline \noalign{\global\arrayrulewidth + \savedwidth} +} +\renewcommand{\arraystretch}{1.5} +\centering +{\tiny +\begin{tabular}{|c|l||c|c|c|c|c|c|c|c|c|} +\hline +\multicolumn{2}{|l||}{Processor} & \multicolumn{3}{c|}{\textbf{GTX280}} & \multicolumn{3}{c|}{\textbf{C2070}} & \multicolumn{3}{c|}{\textbf{Xeon}} \\ \hline +\multicolumn{2}{|l||}{\shortstack{Performances$\rightarrow$\\sizes (pixels)$\downarrow$}} & \shortstack{t\\(ms)}& \shortstack{output\\(MP/s)}& \shortstack{rate\\\% }&\shortstack{t\\(ms)}& \shortstack{output\\(MP/s)}& \shortstack{rate\\\% }&\shortstack{t\\(ms)}& \shortstack{output\\(MP/s)}& \shortstack{rate\\\% } \\ \whline +\multirow{3}{*}{\rotatebox{90}{512$^2$}} &3$\times$3&11.50 &22 &2.2 &7.58 &33 &3.4 & 19.25& 14&-\\ + &5$\times$5&19.10 &14 &1.3 &8.60 &30 &3.0 &18.49 &14 &-\\ + &7$\times$7&31.30 &8 &0.8 &10.60 &24 &2.5 &20.27 &13 &-\\\whline +\multirow{3}{*}{\rotatebox{90}{1024$^2$}}&3$\times$3&44.50 &23 &2.3 &29.60 &34 &3.5 &75.49 &14 &-\\ + &5$\times$5&71.10 &14 &1.4 &33.00 &31 &3.2 &73.88 &14 &-\\ + &7$\times$7&114.50 &9 &0.9 &39.10 &26 &2.7 &77.40 &13 &-\\\whline +\multirow{3}{*}{\rotatebox{90}{2048$^2$}}&3$\times$3&166.00 &24 &2.4 &115.20 &36 &3.6 &296.18&14 &-\\ + &5$\times$5&261.00&16 &1.5 &128.20&32 &3.3 &294.55&14 &-\\ + &7$\times$7&411.90 &10&1.0 &143.30&28 &2.8 &303.48&14 &-\\\whline +\multirow{3}{*}{\rotatebox{90}{4096$^2$}}&3$\times$3&523.80 &31 &3.0 &435.00 &38 &3.9 &1184.16&14 &-\\ + &5$\times$5&654.10&25 &2.4 &460.20&36 &3.7 &1158.26&14 &-\\ + &7$\times$7&951.30 &17&1.7 &509.60&32 &3.3 &1213.55&14 &-\\\whline + +\end{tabular}} +\caption{Performance results of \texttt{kernel medianR}. } +\label{tab:medianHisto1} +\end{table} + +\section{NVidia GPU tuning recipes} +When designing GPU code, besides thinking of the actual data computing process, one must choose the memory type into which to store temporary data. Three type of GPU memory are available: +\begin{enumerate} +\item \textbf{Global memory, the most versatile:}\\Offers the largest storing space and global scope but is slowest (400 cycles latency). \textbf{Texture memory} is physically included into it, but allows access through an efficient 2-D caching mechanism. +\item \textbf{Registers, the fastest:}\\Allows access wihtout latency, but only 63 registers are available per thread (thread scope), with a maximum of 32K per Symetric Multiprocessor (SM). +\item \textbf{Shared memory, a complex compromise:}\\All threads in one block can access 48~KBytes of shared memory, which is faster than global memory (20 cycles latency) but slower than registers. +However, bank conflicts can occur if two threads of a warp try to access data stored in one single memory bank. In such cases, the parallel process is re-serialized which may cause significant performance decrease. One easy way to avoid it is to ensure that two consecutive threads in one block always access 32 bit data at two consecutive adresses. +\end{enumerate} + +\noindent As observed earlier, designing a median filter GPU implementation using only global memory is fairly straightforward, but its performance remains quite low even if it is faster than CPU. +To overcome this, the most frequent choice made in efficient implementations found in literature is to use shared memory. Such option implies prefetching data prior to doing the actual computations, a relevant choice, as each pixel of an image belongs to n$\times$n different neighborhoods. Thus, it can be expected that fetching each gray-level value from global memory only once should be more efficient than do it each time it is required. One of the most efficient implementations using shared memory is presented in \cite{5402362}. In the case of the generic kernel of Listing \ref{lst:medianGeneric}, using shared memory without further optimization would not bring valuable speedup because that would just move redundancy from texture to shared memory fetching and would generate bank conflicts. For information, we wrote such a version of the generic median kernel and our measurements showed a speedup of around 3\% (for example: 32ms for 5$\times$5 median on a 1024$^2$ pixel image). + +As for registers, designing a generic median filter that would only use that type of memory seems difficult, due to the above mentioned 63 register-per-thread limitation. +Yet, nothing forbids us to design fixed-size filters, each of them specific to one of the most popular window sizes. It might be worth the effort as dramatic increase in performance could be expected. + +Another track to follow in order to improve performance of GPU implementations consists in hiding latencies generated by arithmetic instruction calls and memory accesses. Both can be partially hidden by introducing Instruction-Level Parallelism (ILP) and by increasing the data count output by each thread. Though such techniques may seem to break the NVidia occupancy paradigm, they can lead to dramatically higher data throughput values. +The following sections illustrate these ideas and detail the design of the fastest CUDA median filter known to date. + +\section{A 3$\times$3 median filter: using registers } +Designing a median filter dedicated to the smallest possible square window size is a good challenge to start using registers. +One first issue is that the exclusive use of registers forbids us to implement a naive histogram-based method. In a \textit{8-bit gray level pixel per thread} rule, each histogram requires one 256-element vector to store its values, e.g. four times the maximum register count allowed per thread (63). Considering a 3$\times$3 median filter involves only 9 pixel values per thread, it seem obvious they can be sorted within the 63-register limit. + +\subsection{The simplest way} +In the case of a 3$\times$3 median filter, the simplest solution consists in associating one register to each gray-level value, then sorting those 9 values and selecting the fifth one, e.g. the median value. For such a small amount of data to sort, a simple selection method is well indicated. As shown in Listing \ref{lst:kernelMedian3RegTri9} (\texttt{kernelMedian3RegTri9()}), the constraint of only using registers leads to adopt an unusual manner of coding. However, results are persuasive: runtimes are divided by around 120 on GTX280 and 80 on C2070, while only reduced by a 3.5 factor on CPU. +The diagram of Figure \ref{fig:compMedians1} summarizes these first results. Only C2070 throughputs are shown and compared to CPU results. We included the maximum effective pixel throughput in order to see the improvement potential of the different implementations. We also introduced throughputd achieved by \textit{libJacket}, a commercial implementation, as it was the fastest known implementation of 3$\times$3 median filter to date, as illustrated in \cite{chen09}. One of the authors of libJacket kindly posted the CUDA code of its 3$\times$3 median filter, that we inserted into our own coding structure. The algorithm itself is quite similar to ours, but running it in our own environement produced higher throughput values than those published in \cite{chen09}, not due to different hardware capabilities between our GTX280 and the GTX260 used in the paper, but to the way we perform memory transfers and to our register-only method of storing temporary data. + +\lstinputlisting[label={lst:kernelMedian3RegTri9},caption= 3$\times$3 median filter kernel using one register per neighborhood pixel and bubble sort]{Chapters/chapter3/code/kernMedianRegTri9.cu} + +\begin{figure} + \centering + \includegraphics[width=11cm]{Chapters/chapter3/img/debitPlot1.png} + \caption{Comparison of pixel throughputs on GPU C2070 and CPU for generic median, in 3$\times$3 median register-only and \textit{libJacket}.} + \label{fig:compMedians1} +\end{figure} + +\subsection{Further optimization} +Running the above register-only 3$\times$3 median filter through the NVidia CUDA profiler teaches us that the memory throughput achieved by the kernel remains quite low. To improve this, two methods can be used: one is to increase the number of concurrent threads by reducing the number of registers used, the other to have each thread process more data which can be achieved by outputting the gray-level value of two pixels or more. +\subsubsection{Reducing register count} +Our current kernel (\texttt{kernelMedian3RegTri9}) uses one register per gray-level value, which amounts to 9 registers for the entire 3$\times$3 window. +This count can be reduced by use of an iterative sorting process called \textit{forgetful selection}, where both \textit{extrema} are eliminated at each sorting stage, until only 3 elements remain. The question is to find out the minimal register count $k_n$ that allows the selection of the median amoung $n^2$ values. The answer can be evaluated considering that, when eliminating the maximum and the minimum values, one has to make sure not to eliminate the global median value, e.g. $k_n=\lceil n^2/2\rceil+1$. +%To ensure this, the number of values that are not part of the process must remain lower than the number of values that would have had an index higher (or lower) than the middle one in the fully sorted $n^2$ value vector. +This rule can be applied to the first eliminating stage and remains true with the next ones as each stage suppresses exactly two values. +In our 3$\times$3 pixel window example, the minimum register count becomes $k_9=\lceil 9/2\rceil+1 = 6$. + +The \textit{forgetful selection} method, used in \cite{mcguire2008median} does not imply full sorting of values, but only selecting minimum and maximum values, which, at the price of a few iteration steps ($n^2-k$), reduces arithmetic complexity. +Listing \ref{lst:medianForget1pix3} details this process where forgetful selection is achieved by use of simple 2-value sorting function ($s()$, lines 1 to 5) that swaps input values if necessary. Moreover, whenever possible, in order to increase the Instruction-Level Parallelism, successive calls to $s()$ are done with independant elements as arguments. This is illustrated by the macro definitions of lines 7 to 14. + +\lstinputlisting[label={lst:medianForget1pix3},caption= 3$\times$3 median filter kernel using the minimum register count of 6 and finding the median value by forgetful selection method]{Chapters/chapter3/code/kernMedianForget1pix3.cu} + +Our such modified kernel provides significantly improved runtimes: a speedup of around 16\% is obtained, and pixel throughput reaches around 1000~MPixel/s on C2070. + +\subsubsection{More data output per thread} +In the case of a kernel achieving an effective memory throughput much lower than the peak value, and if enough threads are run, another technique may help hiding memory latency and thus leverage performance: one thread produces multiple pixel outputs. +Attentive readers should notice that it would increase the register count per thread. That's true, but dividing thread block size by the same quantity allow, at least, to keep the same register count per block, which is the parallelism limiting factor. +Moreover, it is now possible to take advantage of the window overlapping, first illustrated In Figure \ref{fig:median_overlap}, and more detailed in Figure \ref{fig:median3_overlap}. As the selection is first processed on the first 6 gray-level values and as it is exactly the number of pixels that overlap between two neighbor window of adjacent pixels, it allows to save 6 texture fetches and one \texttt{minmax6} selection per thread. Again, speedup is expected through the modified kernel source code and the associated grid dimensions presented in Listing \ref{lst:medianForget2pix3}. Important differences to be noticed are pixel coordinates computation given thread index. As each thread has to process two pixels, the number of threads in each block is divided by 2, while the grid size remains the same. Consequently, in kernel code, each thread of block coordinates $(tx, ty)$ will be in charge of processing pixels of block coordinates $(2tx, ty)$ and $(2tx+1, ty)$; lines 5 and 6 implement this. + +\begin{figure} + \centering + \includegraphics[width=4cm]{Chapters/chapter3/img/median3_overlap.png} + \caption{Illustration of how window overlapping is used to combine 2 pixel selections in 3$\times$3 median kernel.} + \label{fig:median3_overlap} +\end{figure} + +\lstinputlisting[label={lst:medianForget2pix3},caption=kernel 3$\times$3 median filter processing 2 output pixel values per thread by a combined forgetfull selection.]{Chapters/chapter3/code/kernMedian2pix3.cu} + +Running this ultimate kernel saves another 10\% of runtime, as shown in Figure \ref{fig:compMedians2} and provides the best peak pixel throughput known so far on C2070 of 1155~Mpixel/s which is 86\% of the maximum effective throughput. + +\begin{figure} + \centering + \includegraphics[width=11cm]{Chapters/chapter3/img/debitPlot2.png} + \caption{Comparison of pixel throughput on GPU C2070 for the different 3$\times$3 median kernels.} + \label{fig:compMedians2} +\end{figure} + +\section{Median filter 5$\times$5 and more} +Considering the maximum register count allowed dper thread (63) and trying to push this technique to its limit would let us design median filters up to 9$\times$9 pixel window. This maximum would actually use $k_{81}=\lceil 81/2\rceil+1 = 42$ registers per thread plus a few ones used by the compiler to complete arithmetic operations (9) leading to a total register count of 51. +This would oviously forbids us to compute more than one pixel per thread, but also would limit the number of concurrent threads per block. Our measurements show that this technique is still worth using for the 5$\times$5 median but that larger window sizes could take advantage of using shared memory. +The next two sections will first detail the particular case of the 5$\times$5 median through register-only method and then a generic kernel for larger window sizes. + +\subsection{Median filter 5$\times$5: register only } +The minimum register count allowing to apply the forgetfull selection method to 5$\times$5 median filter is $k_{25}=\lceil 25/2\rceil+1 = 14$. Moreover, two adjacent overlapping windows share 20 pixels ($n^2-one\_column$) so that, when processing 2 pixels at once, from the first selection stage with 14 common values to the passing of the last common value, a count of 6 common selection stages can be carried out. That allows to limit the register count to 14+8=22 per thread. Figure \ref{fig:median5overlap} +\begin{figure} + \centering + \includegraphics[width=8cm]{Chapters/chapter3/img/median5_overlap.png} + \caption{Reduction of register count in 5$\times$5 register only median kernel, outputting 2 pixel at once. The first 6 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.} + \label{fig:median5overlap} +\end{figure} +Listing \ref{lst:medianForget2pix5} reproduces the kernel \texttt{kernel\_medianForget2pix5} code where the common selection stages take place from line XX to line YY. The remaining separate selection stages occur between lines XX and YY after the separation of line GG. + +\lstinputlisting[label={lst:medianForget2pix5},caption=kernel 5$\times$5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.]{Chapters/chapter3/code/kernMedian2pix5.cu} + +Timing results follow the same variations with image size than previous ones. That is why Table \ref{tab:median5comp} shows only throughput values obtained for C2070 card and 4096$\times$4096 pixel image. + +\begin{table}[h] +%\newlength\savedwidth +\newcommand\whline{\noalign{\global\savedwidth + \arrayrulewidth\global\arrayrulewidth 1.5pt} + \hline \noalign{\global\arrayrulewidth + \savedwidth} +} +\centering +{\scriptsize +\begin{tabular}{|l||c|c|c|c|} +\hline +\textbf{Implementation}&\shortstack{\textbf{registers only}\\\textbf{1 pix/thread}}&\shortstack{\textbf{registers only}\\\textbf{2 pix/thread}}&\shortstack{\textbf{libJacket}\\(interpolated)}&\shortstack{\textbf{shared mem}}\\\whline + \shortstack{\textbf{Throughput}\\\textbf{(MP/s)}}&551&738&152&540\\\hline +\end{tabular} +} +\caption{Performance of various 5$\times$5 median kernel implementations, applied on 4096$\times$4096 pixel image with C2070 GPU card..} +\label{tab:median5comp} +\end{table} + +\subsection{True median filter n$\times$n} +Shared memory can represent an efficient way to reduce global or texture loads, but it is also a limiting factor for performance. +On Fermi GPUs (as C2070), a maximum of 48~kB of per block shared memory is avalaible. With 16-bit coded gray levels, that allows to store up to 24576 values, which can be organised as a square of 156$\times$156 pixels maximum. +A point is that it is not efficient to use the shared memory at its maximum, as it would reduce the number of blocks beeing run in parallel on each SM. +Another point is that it is not possible to avoid bank conflicts when designing a generic median kernel. +Thus, the most efficient way to code a generic, large window, median filter, is to do without shared memory but use texture direct fetching. +Listing \ref{lst:medianForgetGeneric} reproduce such a code, where the most interesting part is between lines XX and YY, where the forgetfull selection has been generalized to an arbitrary window size. +Performance results summarized in table \ref{tab:medianForgetGeneric} demonstrate that such a method is far from beeing as efficient as small fixed-size implementations. + +\begin{table}[h] +%\newlength\savedwidth +\newcommand\whline{\noalign{\global\savedwidth + \arrayrulewidth\global\arrayrulewidth 1.5pt} + \hline \noalign{\global\arrayrulewidth + \savedwidth} +} +\centering +{\scriptsize +\begin{tabular}{|l||c|c|c|c|} +\hline +\shortstack{\textbf{Window size}\\(in pixels)}&\textbf{121}&\textbf{169}&\textbf{225}&\textbf{441}\\\whline + \shortstack{\textbf{Throughput}\\\textbf{(MP/s)}}& & & & \\\hline +\end{tabular} +} +\caption{Performance of generic median kernel applied to various window sizes on 4096$\times$4096 pixel image.} +\label{tab:medianForgetGeneric} +\end{table} + +\lstinputlisting[label={lst:medianForgetGeneric},caption= generic median kernel by forgetfull selection.]{Chapters/chapter3/code/kernMedianForgetGeneric.cu} + +\subsection{Fast approximated median filter n$\times$n} +If faster process is required, a possible technique is to split median selection in two separate 1-D stages: one in the vertical direction and the other in the horizontal direction. Image processing specialists would say that this method does not selects the actual median value. They would be right, but for large window sizes and \textit{real-life} images, the so selected value is statically near the true median value and often represents an acceptable approximation. +In this particular case, we use a Torben Morgensen sorting algorithm, as it only needs a few and fixed register count. + +\begin{table}[h] +%\newlength\savedwidth +\newcommand\whline{\noalign{\global\savedwidth + \arrayrulewidth\global\arrayrulewidth 1.5pt} + \hline \noalign{\global\arrayrulewidth + \savedwidth} +} +\centering +{\scriptsize +\begin{tabular}{|l||c|c|c|c|} +\hline +\shortstack{\textbf{Window size}\\(in pixels)}&\textbf{121}&\textbf{169}&\textbf{225}&\textbf{441}\\\whline + \shortstack{\textbf{Throughput}\\\textbf{(MP/s)}}& & & & \\\hline +\end{tabular} +} +\caption{Performance of generic pseudo separable median kernel applied to various window sizes on 4096$\times$4096 pixel image.} +\label{tab:medianSeparable} +\end{table} + +\lstinputlisting[label={lst:medianSeparable},caption= generic pseudo median kernel.]{Chapters/chapter3/code/kernMedianSeparable.cu} + + +\section{Glossary} +\begin{Glossary} +\item[CUDA] Compute Unified Device Architecture. +\end{Glossary} + +\putbib[Chapters/chapter3/biblio3] + diff --git a/BookGPU/Chapters/chapter3/code/Makefile b/BookGPU/Chapters/chapter3/code/Makefile new file mode 100644 index 0000000..1a8ec5d --- /dev/null +++ b/BookGPU/Chapters/chapter3/code/Makefile @@ -0,0 +1,7 @@ +EXECUTABLE := fast_median +CUFILES := main.cu + +include ../../common/common.mk + +NVCCFLAGS += -arch=sm_20 +NVCCFLAGS += --ptxas-options=-v diff --git a/BookGPU/Chapters/chapter3/code/exChronos.cu b/BookGPU/Chapters/chapter3/code/exChronos.cu new file mode 100644 index 0000000..8926a00 --- /dev/null +++ b/BookGPU/Chapters/chapter3/code/exChronos.cu @@ -0,0 +1,10 @@ +unsigned int timer ; +cutilCheckError( cutCreateTimer(&timer)); +cutilCheckError( cutResetTimer(timer) ); +cutilCheckError( cutStartTimer(timer) ); +for (int ct=0; ct<1000 ; ct++) + kernel_ident<<< dimGrid, dimBlock, 0>>>(d_out, W); +cudaThreadSynchronize() ; +cutilCheckError( cutStopTimer(timer) ); +cutilCheckError( cutStopTimer(timer) ); +printf("Mean runtime: %f ms,\n", cutGetTimerValue(timer)/1000); diff --git a/BookGPU/Chapters/chapter3/code/kernMedian2pix3.cu b/BookGPU/Chapters/chapter3/code/kernMedian2pix3.cu new file mode 100644 index 0000000..f6f48da --- /dev/null +++ b/BookGPU/Chapters/chapter3/code/kernMedian2pix3.cu @@ -0,0 +1,38 @@ +__global__ void kernel_median3_2pix( short *output, + int i_dim, int j_dim) +{ + // j base coordinate = 2*(thread index) + int j= __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2) ; + int i= __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; + int a0, a1, a2, a3, a4, a5 ; // for left window + int b0, b1, b2, b3, b4, b5 ; // for right window + + a0 = tex2D(tex_img_ins, j , i-1); // 6 common pixels + a1 = tex2D(tex_img_ins, j+1, i-1); + a2 = tex2D(tex_img_ins, j , i ); + a3 = tex2D(tex_img_ins, j+1, i ); + a4 = tex2D(tex_img_ins, j , i+1); + a5 = tex2D(tex_img_ins, j+1, i+1); + + minmax6(&a0, &a1, &a2, &a3, &a4, &a5);// common minmax + b0=a0; b1=a1; b2=a2; b3=a3; b4=a4; b5=a5;// separation + + a5 = tex2D(tex_img_ins, j-1, i); //separate processes + b5 = tex2D(tex_img_ins, j+2, i); + minmax5(&a1, &a2, &a3, &a4, &a5); + minmax5(&b1, &b2, &b3, &b4, &b5); + a5 = tex2D(tex_img_ins, j-1, i-1); + b5 = tex2D(tex_img_ins, j+2, i-1); + minmax4(&a2, &a3, &a4, &a5); + minmax4(&b2, &b3, &b4, &b5); + a5 = tex2D(tex_img_ins, j-1, i+1); + b5 = tex2D(tex_img_ins, j+2, i+1); + minmax3(&a3, &a4, &a5); + minmax3(&b3, &b4, &b5); + + output[ __mul24(i, j_dim) +j ] = a4 ; //2 outputs + output[ __mul24(i, j_dim) +j+1 ] = b4 ; +} + +//grid dimensions to be set in main.cu file + dimGrid = dim3( (W/dimBlock.x)/2, H/dimBlock.y, 1 ) ; diff --git a/BookGPU/Chapters/chapter3/code/kernMedian2pix5.cu b/BookGPU/Chapters/chapter3/code/kernMedian2pix5.cu new file mode 100644 index 0000000..b01619e --- /dev/null +++ b/BookGPU/Chapters/chapter3/code/kernMedian2pix5.cu @@ -0,0 +1,66 @@ +__global__ void kernel_median5_2pix( short *output, + int i_dim, int j_dim) +{ + int j= __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2); + int i= __mul24(blockIdx.y,blockDim.y) + threadIdx.y; + int a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13;//left window + int b6,b7,b8,b9,b10,b11,b12,b13 ; //right window + //first 14 common pixels + a0 = tex2D(tex_img_ins, j-1, i-2) ; // first line + a1 = tex2D(tex_img_ins, j , i-2) ; + a2 = tex2D(tex_img_ins, j+1, i-2) ; + a3 = tex2D(tex_img_ins, j+2, i-2) ; + a4 = tex2D(tex_img_ins, j-1, i-1) ; //seconde line + a5 = tex2D(tex_img_ins, j , i-1) ; + a6 = tex2D(tex_img_ins, j+1, i-1) ; + a7 = tex2D(tex_img_ins, j+2, i-1) ; + a8 = tex2D(tex_img_ins, j-1, i) ; // third line + a9 = tex2D(tex_img_ins, j , i) ; + a10 = tex2D(tex_img_ins, j+1, i) ; + a11 = tex2D(tex_img_ins, j+2, i) ; // first 2 of fourth line + a12 = tex2D(tex_img_ins, j-1, i+1) ; + a13 = tex2D(tex_img_ins, j , i+1) ; + + //common selection + minmax14(&a0,&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); + a13 = tex2D(tex_img_ins, j+1, i+1); + minmax13(&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); + a13 = tex2D(tex_img_ins, j+2, i+1); + minmax12(&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); + a13 = tex2D(tex_img_ins, j-1, i+2); + minmax11(&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); + a13 = tex2D(tex_img_ins, j , i+2); + minmax10(&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); + a13 = tex2D(tex_img_ins, j+1, i+2); + minmax9(&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); + a13 = tex2D(tex_img_ins, j+2, i+2); + minmax8(&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); + + // separation + b6=a6; b7=a7; b8=a8; b9=a9; b10=a10; b11=a11; b12=a12; b13=a13; + + // separate selections: 5 remaining pixels in both windows + a13 = tex2D(tex_img_ins, j-2, i-2); + b13 = tex2D(tex_img_ins, j+3, i-2); + minmax7(&a7,&a8,&a9,&a10,&a11,&a12,&a13); + minmax7(&b7,&b8,&b9,&b10,&b11,&b12,&b13); + a13 = tex2D(tex_img_ins, j-2, i-1); + b13 = tex2D(tex_img_ins, j+3, i-1); + minmax6(&a8,&a9,&a10,&a11,&a12,&a13); + minmax6(&b8,&b9,&b10,&b11,&b12,&b13); + a13 = tex2D(tex_img_ins, j-2, i ); + b13 = tex2D(tex_img_ins, j+3, i ); + minmax5(&a9,&a10,&a11,&a12,&a13); + minmax5(&b9,&b10,&b11,&b12,&b13); + a13 = tex2D(tex_img_ins, j-2, i+1); + b13 = tex2D(tex_img_ins, j+3, i+1); + minmax4(&a10,&a11,&a12,&a13); + minmax4(&b10,&b11,&b12,&b13); + a13 = tex2D(tex_img_ins, j-2, i+2); + b13 = tex2D(tex_img_ins, j+3, i+2); + minmax3(&a11,&a12,&a13); + minmax3(&b11,&b12,&b13); + + output[ __mul24(i, j_dim) +j ] = a12 ; //middle values + output[ __mul24(i, j_dim) +j+1 ] = b12 ; +} diff --git a/BookGPU/Chapters/chapter3/code/kernMedianForget1pix3.cu b/BookGPU/Chapters/chapter3/code/kernMedianForget1pix3.cu new file mode 100644 index 0000000..a34d784 --- /dev/null +++ b/BookGPU/Chapters/chapter3/code/kernMedianForget1pix3.cu @@ -0,0 +1,38 @@ +__device__ inline void s(int* a, int* b) +{ + int tmp ; + if (*a > *b) { tmp = *b; *b = *a; *a = tmp;} +} + +#define min3(a, b, c) s(a, b); s(a, c); +#define max3(a, b, c) s(b, c); s(a, c); +#define minmax3(a, b, c) max3(a, b, c); s(a, b); +#define minmax4(a, b, c, d) s(a, b); s(c, d); s(a, c); s(b, d); +#define minmax5(a, b, c, d, e) s(a, b); s(c, d); min3(a, c, e); max3(b, d, e); +#define minmax6(a, b, c, d, e, f) s(a,d); s(b, e); s(c, f); min3(a, b, c); max3(d, e, f); + +__global__ void kernel_medianForget1pix3( short *output, + int i_dim, int j_dim) +{ + int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; + int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; + int a0, a1, a2, a3, a4, a5 ; + + a0 = tex2D(tex_img_ins, j-1, i-1) ; // first 6 values + a1 = tex2D(tex_img_ins, j, i-1) ; + a2 = tex2D(tex_img_ins, j+1, i-1) ; + a3 = tex2D(tex_img_ins, j-1, i) ; + a4 = tex2D(tex_img_ins, j, i) ; + a5 = tex2D(tex_img_ins, j+1, i) ; + + minmax6(&a0, &a1, &a2, &a3, &a4, &a5);//min->a0 max->a5 + a5 = tex2D(tex_img_in, j-1, i+1) ; //next value in a5 + minmax5(&a1, &a2, &a3, &a4, &a5) ; //min->a1 max->a5 + a5 = tex2D(tex_img_ins, j, i+1) ; //next value in a5 + minmax4(&a2, &a3, &a4, &a5) ; //min->a1 max->a5 + a5 = tex2D(tex_img_ins, j+1, i+1) ; //next value in a5 + minmax3(&a3, &a4, &a5) ; //min->a1 max->a5 + + output[ __mul24(i, j_dim) +j ] = a4 ; //middle value + +} diff --git a/BookGPU/Chapters/chapter3/code/kernMedianForgetGeneric.cu b/BookGPU/Chapters/chapter3/code/kernMedianForgetGeneric.cu new file mode 100644 index 0000000..3bb8ad6 --- /dev/null +++ b/BookGPU/Chapters/chapter3/code/kernMedianForgetGeneric.cu @@ -0,0 +1,31 @@ +__global__ void kernel_medianForgetR( short *output, int i_dim, int j_dim, int r) +{ + // coordonnees absolues du point + int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; + int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; + short ic, jc ; + short Nreg = ((2*r+1)*(2*r+1))/2 + 2 ; + + // remplissage du vecteur de tri minmax + short vect[8066] ; + int Freg=Nreg ; + for (ic=0; ic<2*r+1; ic++) + { + for (jc=0; jc<2*r+1; jc++) + { + if ( ic*(2*r+1)+jc < Nreg ) + { + vect[ ic*(2*r+1)+jc ] = tex2D(tex_img_ins, j-r+jc, i-r+ic) ; + } else + { + minmaxN(vect, Freg--) ; + vect[ Nreg-1 ] = tex2D(tex_img_ins, j-r+jc, i-r+ic) ; + } + } + } + minmax3(&vect[Nreg-3], &vect[Nreg-2], &vect[Nreg-1]) + + //medRoi[ (threadIdx.y+ic)*(bdimX+L-1)+ (tidX+jc) ] + + output[ __mul24(i, j_dim) +j ] = vect[ Nreg-2 ]; +} diff --git a/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu b/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu new file mode 100644 index 0000000..363b181 --- /dev/null +++ b/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu @@ -0,0 +1,21 @@ +__global__ void kernel_Median3RegTri9( short *output, + int i_dim, int j_dim) +{ + int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; + int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; + int a0, a1, a2, a3, a4, a5, a6, a7, a8 ; // 1 register per pixel + + a0 = tex2D(tex_img_ins, j-1, i-1) ; // fetching values + a1 = tex2D(tex_img_ins, j , i-1) ; + a2 = tex2D(tex_img_ins, j+1, i-1) ; + a3 = tex2D(tex_img_ins, j-1, i) ; + a4 = tex2D(tex_img_ins, j , i) ; + a5 = tex2D(tex_img_ins, j+1, i) ; + a6 = tex2D(tex_img_ins, j-1, i+1) ; + a7 = tex2D(tex_img_ins, j , i+1) ; + a8 = tex2D(tex_img_ins, j+1, i+1) ; + + bubReg9(&a0,&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8); // bubble sort + + output[ __mul24(i, j_dim) +j ] = a4 ; // median at the middle + } diff --git a/BookGPU/Chapters/chapter3/code/kernMedianSeparable.cu b/BookGPU/Chapters/chapter3/code/kernMedianSeparable.cu new file mode 100644 index 0000000..5c79c82 --- /dev/null +++ b/BookGPU/Chapters/chapter3/code/kernMedianSeparable.cu @@ -0,0 +1,74 @@ +__global__ void kernel_medianV_sh( short *output, int i_dim, int j_dim, int r) +{ + + int idc, val, min, max, inf, egal, sup, mxinf, minsup, estim ; + + //coordinates in the block + int ib = threadIdx.y ; + int jb = threadIdx.x ; + int idx_h = __mul24(ib+r,blockDim.x) + jb ; // index pixel deans shmem (bloc+halo) + int offset = __mul24(blockDim.x,r) ; + + // coordonnees absolues du point + int j = __mul24(blockIdx.x,blockDim.x) + jb ; + int i = __mul24(blockIdx.y,blockDim.y) + ib ; + + extern __shared__ int buff[] ; + /*********************************************************************************** + * CHARGEMENT DATA EN SHARED MEM + ***********************************************************************************/ + buff[ idx_h ] = tex2D(tex_img_ins, j, i) ; + + if (ib < r) + { + buff[ idx_h - offset ] = tex2D(tex_img_ins, j, i-r) ; + } else + if (ib >= (blockDim.y-r)) + { + buff[ idx_h + offset ] = tex2D(tex_img_ins, j, i+r) ; + } + + __syncthreads() ; + /********************************************************************************************** + * TRI VERTICAL par algo TORBEN MOGENSEN + * (a little bit slow but saves memory => faster !) + **********************************************************************************************/ + min = max = buff[ ib*blockDim.x +jb] ; + + for (idc= 0 ; idc< 2*r+1 ; idc++ ) + { + val = buff[ __mul24(ib+idc, blockDim.x) +jb ] ; + if ( val < min ) min = val ; + if ( val > max ) max = val ; + } + + while (1) + { + estim = (min+max)/2 ; + inf = sup = egal = 0 ; + mxinf = min ; + minsup= max ; + for (idc =0; idc< 2*r+1 ; idc++) + { + val = buff[ __mul24(ib+idc, blockDim.x) +jb ] ; + if( val < estim ) + { + inf++; + if( val > mxinf) mxinf = val ; + } else if (val > estim) + { + sup++; + if( val < minsup) minsup = val ; + } else egal++ ; + } + if ( (inf <= (r+1))&&(sup <=(r+1)) ) break ; + else if (inf>sup) max = mxinf ; + else min = minsup ; + } + + if ( inf >= r+1 ) val = mxinf ; + else if (inf+egal >= r+1) val = estim ; + else val = minsup ; + + output[ __mul24(j, i_dim) +i ] = val ; +} diff --git a/BookGPU/Chapters/chapter3/code/kernSkel.cu b/BookGPU/Chapters/chapter3/code/kernSkel.cu new file mode 100644 index 0000000..bacc10a --- /dev/null +++ b/BookGPU/Chapters/chapter3/code/kernSkel.cu @@ -0,0 +1,12 @@ +texture tex_img_in ; + +__global__ void kernel_ident( short *output, int w) +{ + int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; + int i = __mul24( blockIdx.y, blockDim.y) + threadIdx.y ; + + output[ __mul24(i, w) + j ] = tex2D(tex_img_in, j, i) ; + +} + + diff --git a/BookGPU/Chapters/chapter3/code/mainSkel.cu b/BookGPU/Chapters/chapter3/code/mainSkel.cu new file mode 100644 index 0000000..fc691e4 --- /dev/null +++ b/BookGPU/Chapters/chapter3/code/mainSkel.cu @@ -0,0 +1,43 @@ +#include +#include +#include "fast_kernels.cu" + +int main(int argc, char **argv){ + cudaSetDevice( 0 ); // select first GPU + char filename[80] = "image.pgm" ; + short *h_in, *h_out, *d_out ; + int size, bsx=16, bsy=16 ; + dim3 dimBlock, dimGrid ; + cudaChannelFormatDesc channelD=cudaCreateChannelDesc(); + cudaArray * array_img_in ; + /*....................... load image and cast...........*/ + unsigned int * h_img = NULL ; + unsigned int *h_outui, H, L ; + cutilCheckError( cutLoadPGMi(filename, &h_img, &L, &H)); + size = H * L * sizeof( short ); + h_in = new short[H*L] ; + for (int k=0; k>>(d_out, W, H) ; + + cutilSafeCall( cudaMemcpy(h_out , d_out, size, cudaMemcpyDeviceToHost) ) ; + /*...............cast and save output image (optional) */ + h_outui = new unsigned int[H*L] ; + for (int k=0; k ((2*r+1)*(2*r+1))>>1 ) break ; + } + output[ __mul24(i, j_dim) +j ] = ic ; +} diff --git a/BookGPU/Chapters/chapter3/code/medianGeneric.cu.aux b/BookGPU/Chapters/chapter3/code/medianGeneric.cu.aux new file mode 100644 index 0000000..03ee019 --- /dev/null +++ b/BookGPU/Chapters/chapter3/code/medianGeneric.cu.aux @@ -0,0 +1,32 @@ +\relax +\@setckpt{code/medianGeneric.cu}{ +\setcounter{page}{4} +\setcounter{equation}{0} +\setcounter{enumi}{0} +\setcounter{enumii}{0} +\setcounter{enumiii}{0} +\setcounter{enumiv}{0} +\setcounter{footnote}{0} +\setcounter{mpfootnote}{0} +\setcounter{part}{0} +\setcounter{chapter}{1} +\setcounter{section}{2} +\setcounter{subsection}{1} +\setcounter{subsubsection}{0} +\setcounter{paragraph}{0} +\setcounter{subparagraph}{0} +\setcounter{figure}{2} +\setcounter{table}{0} +\setcounter{parentequation}{0} +\setcounter{subfigure}{0} +\setcounter{lofdepth}{1} +\setcounter{subtable}{0} +\setcounter{lotdepth}{1} +\setcounter{AlgoLine}{7} +\setcounter{algocfline}{1} +\setcounter{algocfproc}{1} +\setcounter{algocf}{1} +\setcounter{lstnumber}{1} +\setcounter{ContinuedFloat}{0} +\setcounter{lstlisting}{0} +} diff --git a/BookGPU/Chapters/chapter3/code/memSkel.cu b/BookGPU/Chapters/chapter3/code/memSkel.cu new file mode 100644 index 0000000..bd7ae06 --- /dev/null +++ b/BookGPU/Chapters/chapter3/code/memSkel.cu @@ -0,0 +1,64 @@ +// C libraries +#include +#include + +// NVidia libraries +#include +#include + +// our kernels +#include "fast_kernels.cu" + +int main(int argc, char **argv){ + // raw way of selecting GPU + cudaSetDevice( 0 ); + + unsigned int timer ; + + // CPU memory allocation + short *h_in, *h_out ; + int *h_img, H, L, size ; + + // allocation mem GPU + short * d_out ; + + dim3 dimBlock, dimGrid ; + int bsx=16, bsy=16 ; + + cudaChannelFormatDesc channelDescS = cudaCreateChannelDesc(); + cudaArray * array_img_in ; + + // chargt image + cutilCheckError( cutLoadPGMi("image.pgm", &h_data, &L, &H)); + size = H * L * sizeof( short ); + + /* transfert en zone short ;) */ + h_in = new short[H*L] ; + for (int k=0; k>>(d_outs, H, L) ; + + + return 0; +} + diff --git a/BookGPU/Chapters/chapter3/img/debitPlot1.png b/BookGPU/Chapters/chapter3/img/debitPlot1.png new file mode 100644 index 0000000..4b3d036 Binary files /dev/null and b/BookGPU/Chapters/chapter3/img/debitPlot1.png differ diff --git a/BookGPU/Chapters/chapter3/img/debitPlot2.png b/BookGPU/Chapters/chapter3/img/debitPlot2.png new file mode 100644 index 0000000..7121b54 Binary files /dev/null and b/BookGPU/Chapters/chapter3/img/debitPlot2.png differ diff --git a/BookGPU/Chapters/chapter3/img/kernLeft.bb b/BookGPU/Chapters/chapter3/img/kernLeft.bb new file mode 100644 index 0000000..73f0816 --- /dev/null +++ b/BookGPU/Chapters/chapter3/img/kernLeft.bb @@ -0,0 +1,5 @@ +%%Title: ./kernLeft.png +%%Creator: extractbb 20120420 +%%BoundingBox: 0 0 79 25 +%%CreationDate: Thu Nov 15 10:51:52 2012 + diff --git a/BookGPU/Chapters/chapter3/img/kernLeft.png b/BookGPU/Chapters/chapter3/img/kernLeft.png new file mode 100644 index 0000000..ae5d0ca Binary files /dev/null and b/BookGPU/Chapters/chapter3/img/kernLeft.png differ diff --git a/BookGPU/Chapters/chapter3/img/kernRight.bb b/BookGPU/Chapters/chapter3/img/kernRight.bb new file mode 100644 index 0000000..f9047a3 --- /dev/null +++ b/BookGPU/Chapters/chapter3/img/kernRight.bb @@ -0,0 +1,5 @@ +%%Title: ./kernRight.png +%%Creator: extractbb 20120420 +%%BoundingBox: 0 0 79 25 +%%CreationDate: Thu Nov 15 10:51:56 2012 + diff --git a/BookGPU/Chapters/chapter3/img/kernRight.png b/BookGPU/Chapters/chapter3/img/kernRight.png new file mode 100644 index 0000000..34ae66a Binary files /dev/null and b/BookGPU/Chapters/chapter3/img/kernRight.png differ diff --git a/BookGPU/Chapters/chapter3/img/median3_overlap.png b/BookGPU/Chapters/chapter3/img/median3_overlap.png new file mode 100644 index 0000000..0957bd3 Binary files /dev/null and b/BookGPU/Chapters/chapter3/img/median3_overlap.png differ diff --git a/BookGPU/Chapters/chapter3/img/median5_overlap.png b/BookGPU/Chapters/chapter3/img/median5_overlap.png new file mode 100644 index 0000000..f616641 Binary files /dev/null and b/BookGPU/Chapters/chapter3/img/median5_overlap.png differ diff --git a/BookGPU/Chapters/chapter3/img/median_1.png b/BookGPU/Chapters/chapter3/img/median_1.png new file mode 100644 index 0000000..0968e06 Binary files /dev/null and b/BookGPU/Chapters/chapter3/img/median_1.png differ diff --git a/BookGPU/Chapters/chapter3/img/median_1.svg b/BookGPU/Chapters/chapter3/img/median_1.svg new file mode 100644 index 0000000..3c17e21 --- /dev/null +++ b/BookGPU/Chapters/chapter3/img/median_1.svg @@ -0,0 +1,18010 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 12 + 21 + 100 + 194 + 12 + 29 + 73 + 91 + 34 + 201 + 36 + 45 + 45 + 59 + 63 + 42 + 61 + 30 + 64 + 62 + 28 + 22 + 35 + 19 + 17 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + i + j + + + + + i + j + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 5 + 6 + 5 + 6 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + i + j + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + first pixel + second pixel + 6 common + pixels + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + i + j + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 14 first + 6 last + 5 specific pixels for both center pixels + pixels + common + common pixels + + + + + + diff --git a/BookGPU/Chapters/chapter3/img/median_overlap.png b/BookGPU/Chapters/chapter3/img/median_overlap.png new file mode 100644 index 0000000..3ef8cab Binary files /dev/null and b/BookGPU/Chapters/chapter3/img/median_overlap.png differ diff --git a/BookGPU/Chapters/chapter6/ch6.aux b/BookGPU/Chapters/chapter6/ch6.aux index 973672c..eec3fee 100644 --- a/BookGPU/Chapters/chapter6/ch6.aux +++ b/BookGPU/Chapters/chapter6/ch6.aux @@ -3,110 +3,110 @@ \@writefile{toc}{\author{Stephane Vialle}{}} \@writefile{toc}{\author{Jens Gustedt}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {3}Development methodologies for GPU and cluster of GPUs}{23}} +\@writefile{toc}{\contentsline {chapter}{\numberline {5}Development methodologies for GPU and cluster of GPUs}{49}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {section}{\numberline {3.1}Introduction}{24}} -\newlabel{ch6:intro}{{3.1}{24}} -\@writefile{toc}{\contentsline {section}{\numberline {3.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{24}} -\newlabel{ch6:part1}{{3.2}{24}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.1}Synchronous parallel algorithms on GPU clusters}{24}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{26}} -\newlabel{fig:ch6p1overlapnative}{{3.1}{26}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.2}Native overlap of CPU communications and GPU computations}{26}} -\newlabel{algo:ch6p1overlapnative}{{3.1}{27}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{27}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{28}} -\newlabel{fig:ch6p1overlapseqsequence}{{3.2}{28}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.3}Overlapping with sequences of transfers and computations}{28}} -\newlabel{algo:ch6p1overlapseqsequence}{{3.2}{29}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{29}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{30}} -\newlabel{fig:ch6p1overlapstreamsequence}{{3.3}{30}} -\newlabel{algo:ch6p1overlapstreamsequence}{{3.3}{31}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{31}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{33}} -\newlabel{fig:ch6p1overlapinterleaved}{{3.4}{33}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.4}Interleaved communications-transfers-computations overlapping}{33}} -\newlabel{algo:ch6p1overlapinterleaved}{{3.4}{34}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{34}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.5}Experimental validation}{36}} -\newlabel{ch6:p1expes}{{3.2.5}{36}} -\newlabel{ch6:p1block-cyclic}{{3.2.5}{36}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{37}} -\newlabel{fig:ch6p1syncexpematrixprod}{{3.5}{37}} -\@writefile{toc}{\contentsline {section}{\numberline {3.3}General scheme of asynchronous parallel code with computation/communication overlapping}{38}} -\newlabel{ch6:part2}{{3.3}{38}} -\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Synchronous iterative scheme\relax }}{38}} -\newlabel{algo:ch6p2sync}{{1}{38}} -\@writefile{loa}{\contentsline {algorithm}{\numberline {2}{\ignorespaces Asynchronous iterative scheme\relax }}{38}} -\newlabel{algo:ch6p2async}{{2}{38}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.3.1}A basic asynchronous scheme}{40}} -\newlabel{ch6:p2BasicAsync}{{3.3.1}{40}} -\newlabel{algo:ch6p2BasicAsync}{{3.5}{40}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.5}Initialization of the basic asynchronous scheme}{40}} -\newlabel{algo:ch6p2BasicAsyncComp}{{3.6}{41}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.6}Computing function in the basic asynchronous scheme}{41}} -\newlabel{algo:ch6p2BasicAsyncSendings}{{3.7}{42}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.7}Sending function in the basic asynchronous scheme}{42}} -\newlabel{algo:ch6p2BasicAsyncReceptions}{{3.8}{43}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.8}Reception function in the basic asynchronous scheme}{43}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.3.2}Synchronization of the asynchronous scheme}{44}} -\newlabel{ch6:p2SsyncOverAsync}{{3.3.2}{44}} -\newlabel{algo:ch6p2Sync}{{3.9}{45}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.9}Initialization of the synchronized scheme}{45}} -\newlabel{algo:ch6p2SyncComp}{{3.10}{46}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.10}Computing function in the synchronized scheme}{46}} -\newlabel{algo:ch6p2SyncReceptions}{{3.11}{47}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.11}Reception function in the synchronized scheme}{47}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{48}} -\newlabel{ch6:p2GPUAsync}{{3.3.3}{48}} -\newlabel{algo:ch6p2AsyncSyncComp}{{3.12}{50}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.12}Computing function in the final asynchronous scheme}{50}} -\newlabel{algo:ch6p2syncGPU}{{3.13}{51}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.13}Computing function in the final asynchronous scheme}{51}} -\newlabel{algo:ch6p2FullOverAsyncMain}{{3.14}{53}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.14}Initialization of the main process of complete overlap with asynchronism}{53}} -\newlabel{algo:ch6p2FullOverAsyncComp1}{{3.15}{54}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{54}} -\newlabel{algo:ch6p2FullOverAsyncComp2}{{3.16}{55}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{55}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.3.4}Experimental validation}{56}} -\newlabel{sec:ch6p2expes}{{3.3.4}{56}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{57}} -\newlabel{fig:ch6p2syncasync}{{3.6}{57}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{58}} -\newlabel{fig:ch6p2aux}{{3.7}{58}} -\@writefile{toc}{\contentsline {section}{\numberline {3.4}Perspective: A unifying programming model}{59}} -\newlabel{sec:ch6p3unify}{{3.4}{59}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.1}Resources}{59}} -\newlabel{sec:ch6p3resources}{{3.4.1}{59}} -\newlabel{algo:ch6p3ORWLresources}{{3.17}{60}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{60}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.2}Control}{60}} -\newlabel{sec:ch6p3ORWLcontrol}{{3.4.2}{60}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.3}Example: block-cyclic matrix multiplication (MM)}{61}} -\newlabel{sec:ch6p3ORWLMM}{{3.4.3}{61}} -\newlabel{algo:ch6p3ORWLBCCMM}{{3.18}{61}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.18}Block-cyclic matrix multiplication, high level per task view}{61}} -\newlabel{algo:ch6p3ORWLlcopy}{{3.19}{62}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.19}An iterative local copy operation}{62}} -\newlabel{algo:ch6p3ORWLrcopy}{{3.20}{62}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{62}} -\newlabel{algo:ch6p3ORWLtrans}{{3.21}{62}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{62}} -\newlabel{algo:ch6p3ORWLdecl}{{3.22}{63}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.22}Dynamic declaration of handles to represent the resources}{63}} -\newlabel{algo:ch6p3ORWLinit}{{3.23}{64}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.23}Dynamic initialization of access mode and priorities}{64}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.4}Tasks and operations}{64}} -\newlabel{sec:ch6p3tasks}{{3.4.4}{64}} -\@writefile{toc}{\contentsline {section}{\numberline {3.5}Conclusion}{65}} -\newlabel{ch6:conclu}{{3.5}{65}} -\@writefile{toc}{\contentsline {section}{\numberline {3.6}Glossary}{65}} -\@writefile{toc}{\contentsline {section}{Bibliography}{66}} +\@writefile{toc}{\contentsline {section}{\numberline {5.1}Introduction}{50}} +\newlabel{ch6:intro}{{5.1}{50}} +\@writefile{toc}{\contentsline {section}{\numberline {5.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{50}} +\newlabel{ch6:part1}{{5.2}{50}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.1}Synchronous parallel algorithms on GPU clusters}{50}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{52}} +\newlabel{fig:ch6p1overlapnative}{{5.1}{52}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.2}Native overlap of CPU communications and GPU computations}{52}} +\newlabel{algo:ch6p1overlapnative}{{5.1}{53}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{53}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{54}} +\newlabel{fig:ch6p1overlapseqsequence}{{5.2}{54}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.3}Overlapping with sequences of transfers and computations}{54}} +\newlabel{algo:ch6p1overlapseqsequence}{{5.2}{55}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{55}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{56}} +\newlabel{fig:ch6p1overlapstreamsequence}{{5.3}{56}} +\newlabel{algo:ch6p1overlapstreamsequence}{{5.3}{57}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{57}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{59}} +\newlabel{fig:ch6p1overlapinterleaved}{{5.4}{59}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.4}Interleaved communications-transfers-computations overlapping}{59}} +\newlabel{algo:ch6p1overlapinterleaved}{{5.4}{60}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{60}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.5}Experimental validation}{62}} +\newlabel{ch6:p1expes}{{5.2.5}{62}} +\newlabel{ch6:p1block-cyclic}{{5.2.5}{62}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{63}} +\newlabel{fig:ch6p1syncexpematrixprod}{{5.5}{63}} +\@writefile{toc}{\contentsline {section}{\numberline {5.3}General scheme of asynchronous parallel code with computation/communication overlapping}{64}} +\newlabel{ch6:part2}{{5.3}{64}} +\@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{64}} +\newlabel{algo:ch6p2sync}{{3}{64}} +\@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{64}} +\newlabel{algo:ch6p2async}{{4}{64}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.1}A basic asynchronous scheme}{66}} +\newlabel{ch6:p2BasicAsync}{{5.3.1}{66}} +\newlabel{algo:ch6p2BasicAsync}{{5.5}{66}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.5}Initialization of the basic asynchronous scheme}{66}} +\newlabel{algo:ch6p2BasicAsyncComp}{{5.6}{67}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.6}Computing function in the basic asynchronous scheme}{67}} +\newlabel{algo:ch6p2BasicAsyncSendings}{{5.7}{68}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.7}Sending function in the basic asynchronous scheme}{68}} +\newlabel{algo:ch6p2BasicAsyncReceptions}{{5.8}{69}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.8}Reception function in the basic asynchronous scheme}{69}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.2}Synchronization of the asynchronous scheme}{70}} +\newlabel{ch6:p2SsyncOverAsync}{{5.3.2}{70}} +\newlabel{algo:ch6p2Sync}{{5.9}{71}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.9}Initialization of the synchronized scheme}{71}} +\newlabel{algo:ch6p2SyncComp}{{5.10}{72}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.10}Computing function in the synchronized scheme}{72}} +\newlabel{algo:ch6p2SyncReceptions}{{5.11}{73}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.11}Reception function in the synchronized scheme}{73}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{74}} +\newlabel{ch6:p2GPUAsync}{{5.3.3}{74}} +\newlabel{algo:ch6p2AsyncSyncComp}{{5.12}{76}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.12}Computing function in the final asynchronous scheme}{76}} +\newlabel{algo:ch6p2syncGPU}{{5.13}{77}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.13}Computing function in the final asynchronous scheme}{77}} +\newlabel{algo:ch6p2FullOverAsyncMain}{{5.14}{79}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.14}Initialization of the main process of complete overlap with asynchronism}{79}} +\newlabel{algo:ch6p2FullOverAsyncComp1}{{5.15}{80}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{80}} +\newlabel{algo:ch6p2FullOverAsyncComp2}{{5.16}{81}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{81}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.4}Experimental validation}{82}} +\newlabel{sec:ch6p2expes}{{5.3.4}{82}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{83}} +\newlabel{fig:ch6p2syncasync}{{5.6}{83}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{84}} +\newlabel{fig:ch6p2aux}{{5.7}{84}} +\@writefile{toc}{\contentsline {section}{\numberline {5.4}Perspective: A unifying programming model}{85}} +\newlabel{sec:ch6p3unify}{{5.4}{85}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.4.1}Resources}{85}} +\newlabel{sec:ch6p3resources}{{5.4.1}{85}} +\newlabel{algo:ch6p3ORWLresources}{{5.17}{86}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{86}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.4.2}Control}{86}} +\newlabel{sec:ch6p3ORWLcontrol}{{5.4.2}{86}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.4.3}Example: block-cyclic matrix multiplication (MM)}{87}} +\newlabel{sec:ch6p3ORWLMM}{{5.4.3}{87}} +\newlabel{algo:ch6p3ORWLBCCMM}{{5.18}{87}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.18}Block-cyclic matrix multiplication, high level per task view}{87}} +\newlabel{algo:ch6p3ORWLlcopy}{{5.19}{88}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.19}An iterative local copy operation}{88}} +\newlabel{algo:ch6p3ORWLrcopy}{{5.20}{88}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{88}} +\newlabel{algo:ch6p3ORWLtrans}{{5.21}{88}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{88}} +\newlabel{algo:ch6p3ORWLdecl}{{5.22}{89}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.22}Dynamic declaration of handles to represent the resources}{89}} +\newlabel{algo:ch6p3ORWLinit}{{5.23}{90}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.23}Dynamic initialization of access mode and priorities}{90}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.4.4}Tasks and operations}{90}} +\newlabel{sec:ch6p3tasks}{{5.4.4}{90}} +\@writefile{toc}{\contentsline {section}{\numberline {5.5}Conclusion}{91}} +\newlabel{ch6:conclu}{{5.5}{91}} +\@writefile{toc}{\contentsline {section}{\numberline {5.6}Glossary}{91}} +\@writefile{toc}{\contentsline {section}{Bibliography}{92}} \@setckpt{Chapters/chapter6/ch6}{ -\setcounter{page}{68} +\setcounter{page}{94} \setcounter{equation}{0} \setcounter{enumi}{4} \setcounter{enumii}{0} @@ -115,7 +115,7 @@ \setcounter{footnote}{0} \setcounter{mpfootnote}{0} \setcounter{part}{1} -\setcounter{chapter}{3} +\setcounter{chapter}{5} \setcounter{section}{6} \setcounter{subsection}{0} \setcounter{subsubsection}{0} @@ -132,7 +132,7 @@ \setcounter{lstnumber}{17} \setcounter{ContinuedFloat}{0} \setcounter{float@type}{16} -\setcounter{algorithm}{2} +\setcounter{algorithm}{4} \setcounter{ALC@unique}{0} \setcounter{ALC@line}{0} \setcounter{ALC@rem}{0} diff --git a/BookGPU/Makefile b/BookGPU/Makefile index 56526d8..62572ba 100644 --- a/BookGPU/Makefile +++ b/BookGPU/Makefile @@ -9,7 +9,8 @@ all: bibtex bu3 bibtex bu4 bibtex bu5 - bibtex bu7 + bibtex bu6 + bibtex bu8 makeindex ${BOOK}.idx pdflatex ${BOOK} pdflatex ${BOOK}