From: Gilles Perrot Date: Tue, 15 Mar 2011 14:28:43 +0000 (+0100) Subject: Added paper source X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/snake_gpu.git/commitdiff_plain/9f26a74684e7eb33d10029d02e3ea2759be5f183 Added paper source --- diff --git a/doc/biblio.bib b/doc/biblio.bib new file mode 100644 index 0000000..1d370d2 --- /dev/null +++ b/doc/biblio.bib @@ -0,0 +1,134 @@ +@InProceedings{ AllainBG08, + author = "Marc Allain and Nicolas Bertaux and Fr{\'e}d{\'e}ric Galland", + title = "Nonparametric Level-Set Segmentation Based on the Minimization of the Stochastic Complexity", + booktitle = "Advanced Concepts for Intelligent Vision Systems (ACIVS)", + year = "2008", + pages = "506--517", + ee = "http://dx.doi.org/10.1007/978-3-540-88458-3_46", + publisher = "Springer", + bibsource = "DBLP, http://dblp.uni-trier.de" +} + +@Article{ KassWT88, + author = "Michael Kass and Andrew P. Witkin and Demetri Terzopoulos", + title = "Snakes: Active contour models", + journal = "International Journal of Computer Vision", + volume = "1", + number = "4", + year = "1988", + pages = "321--331", + ee = "http://dx.doi.org/10.1007/BF00133570", + bibsource = "DBLP, http://dblp.uni-trier.de" +} + +@Article{ XuP98, + author = "Chenyang Xu and Jerry L. 
Prince", + title = "Snakes, shapes, and gradient vector flow", + journal = "IEEE Transactions on Image Processing", + volume = "7", + number = "3", + year = "1998", + pages = "359--369", + bibsource = "DBLP, http://dblp.uni-trier.de", + ee = "http://dx.doi.org/10.1109/83.661186" +} + +@Article{ GallandBR03, + author = "Fr{\'e}d{\'e}ric Galland and Nicolas Bertaux and Philippe R{\'e}fr{\'e}gier", + title = "Minimum description length synthetic aperture radar image segmentation", + journal = "IEEE Transactions on Image Processing", + volume = "12", + number = "9", + year = "2003", + pages = "995--1006", + ee = "http://dx.doi.org/10.1109/TIP.2003.816005", + bibsource = "DBLP, http://dblp.uni-trier.de" +} + +@Article{ Brunett, + location = "http://www.scientificcommons.org/41680702", + title = "GPU-Accelerated Contour Extraction on Large Images Using Snakes", + author = "Enrico Kienel and Guido Brunnett", + year = "2009", + keywords = "Active Contours, GPGPU, Gradient Vector Flow, Image Segmentation, Snakes, Tiling, 004", + abstract = "Abstract. Active contours have been proven to be a powerful semiautomatic image segmentation approach, that seems to cope with many applications and different image modalities. However, they exhibit inherent drawbacks, including the sensibility to contour initialization due to the limited capture range of image edges and problems with concave boundary regions. The Gradient Vector Flow replaces the traditional image force and provides an enlarged capture range as well as enhanced concavity extraction capabilities, but it involves an expensive computational effort and considerably increased memory requirements at the time of computation. In this paper, we present an enhancement of the active contour model to facilitate semiautomatic contour detection in huge images. 
We propose a tile-based image decomposition accompanying an image force computation scheme on demand in order to minimize both computational and memory requirements. We show an efficient implementation of this approach on the basis of general purpose GPU processing providing for continuous active contour deformation without a considerable delay.", + publisher = {TU Chemnitz, Fakult{\"a}t f{\"u}r Informatik}, + url = "http://archiv.tu-chemnitz.de/pub/2009/0035", + institution = "MONARCH - Dokumenten- und Publikationsservice [http://archiv.tu-chemnitz.de/cgi-bin/interfaces/oai/oai2.pl] (Germany)" +} + +@Article{ ChesnaudRB99, + author = "Christophe Chesnaud and Philippe R{\'e}fr{\'e}gier and Vlady Boulet", + title = "Statistical Region Snake-Based Segmentation Adapted to Different Physical Noise Models", + journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence", + volume = "21", + number = "11", + year = "1999", + pages = "1145--1157", + ee = "http://www.computer.org/tpami/tp1999/i1145abs.htm", + bibsource = "DBLP, http://dblp.uni-trier.de" +} + +@Article{ GermainR01, + author = "Olivier Germain and Philippe R{\'e}fr{\'e}gier", + title = "Statistical active grid for segmentation refinement", + journal = "Pattern Recognition Letters", + volume = "22", + number = "10", + year = "2001", + pages = "1125--1132", + bibsource = "DBLP, http://dblp.uni-trier.de" +} + +@Article{ Ruch01, + author = "Olivier Ruch and Philippe R{\'e}fr{\'e}gier", + title = "Minimal-complexity segmentation with a polygonal snake adapted to different optical noise models", + journal = "Optics Letters", + volume = "26", + number = "13", + month = jul, + year = "2001", + ee = "http://dx.doi.org/10.1364/OL.26.000977", + bibsource = "DBLP, http://dblp.uni-trier.de" +} + +@TechReport{ BlellochTR90, + author = "Guy~E. 
Blelloch", + title = "Prefix Sums and Their Applications", + institution = "School of Computer Science, Carnegie Mellon University", + number = "CMU-CS-90-190", + month = nov, + year = 1990 +} + +@InBook{ Harris07, + author = "Mark Harris and Shubhabrata Sengupta and John D. Owens", + title = "{GPU} Gems 3", + year = "2007", + isbn = "9780321545428", + edition = "First", + publisher = "Addison-Wesley Professional", + chapter = "39 - Parallel Prefix Sum with CUDA" +} + +@Manual{ CUDAPG, + title = "NVIDIA CUDA C Programming Guide v3.1.1", + organization = "NVIDIA Corporation", + month = jul, + year = "2010" +} + +@Manual{ CUDAFC, + title = "NVIDIA Fermi Compatibility Guide", + organization = "NVIDIA Corporation", + month = jul, + year = "2010" +} + +@Manual{ CUDAFT, + title = "NVIDIA Fermi Tuning Guide", + organization = "NVIDIA Corporation", + month = jul, + year = "2010" +} + diff --git a/doc/img/GPU_block.png b/doc/img/GPU_block.png new file mode 100644 index 0000000..fd3233e Binary files /dev/null and b/doc/img/GPU_block.png differ diff --git a/doc/img/GPUaddsoms2cumuls.fig b/doc/img/GPUaddsoms2cumuls.fig new file mode 100644 index 0000000..0b208e6 --- /dev/null +++ b/doc/img/GPUaddsoms2cumuls.fig @@ -0,0 +1,322 @@ +#FIG 3.2 Produced by xfig version 3.2.5b +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +0 32 #9c0000 +0 33 #8c8c8c +0 34 #8c8c8c +0 35 #424242 +0 36 #8c8c8c +0 37 #424242 +0 38 #8c8c8c +0 39 #424242 +0 40 #8c8c8c +0 41 #424242 +0 42 #8c8c8c +0 43 #424242 +0 44 #c6b797 +0 45 #eff8ff +0 46 #dccba6 +0 47 #404040 +0 48 #808080 +0 49 #c0c0c0 +0 50 #e0e0e0 +0 51 #8e8f8e +0 52 #aaaaaa +0 53 #555555 +0 54 #c7c3c7 +0 55 #565151 +0 56 #8e8e8e +0 57 #d7d7d7 +0 58 #85807d +0 59 #d2d2d2 +0 60 #3a3a3a +0 61 #4573aa +0 62 #aeaeae +0 63 #7b79a5 +0 64 #444444 +0 65 #73758c +0 66 #f7f7f7 +0 67 #414541 +0 68 #635dce +0 69 #bebebe +0 70 #515151 +0 71 #e7e3e7 +0 72 #000049 +0 73 #797979 +0 74 #303430 +0 75 #414141 +0 76 #c7b696 +0 77 #dd9d93 +0 78 #f1ece0 +0 79 
#c3c3c3 +0 80 #e2c8a8 +0 81 #e1e1e1 +0 82 #ededed +0 83 #da7a1a +0 84 #f1e41a +0 85 #887dc2 +0 86 #b0a193 +0 87 #837cdd +0 88 #d6d6d6 +0 89 #8c8ca5 +0 90 #4a4a4a +0 91 #8c6b6b +0 92 #5a5a5a +0 93 #636363 +0 94 #b79b73 +0 95 #4193ff +0 96 #bf703b +0 97 #db7700 +0 98 #dab800 +0 99 #006400 +0 100 #5a6b3b +0 101 #d3d3d3 +0 102 #8e8ea4 +0 103 #f3b95d +0 104 #89996b +0 105 #646464 +0 106 #b7e6ff +0 107 #86c0ec +0 108 #bdbdbd +0 109 #d39552 +0 110 #98d2fe +0 111 #616161 +0 112 #aeb2ae +0 113 #717171 +0 114 #ff9a00 +0 115 #8c9c6b +0 116 #f76b00 +0 117 #5a6b39 +0 118 #8c9c6b +0 119 #8c9c7b +0 120 #184a18 +0 121 #adadad +0 122 #f7bd5a +0 123 #636b9c +0 124 #de0000 +0 125 #adadad +0 126 #f7bd5a +0 127 #adadad +0 128 #f7bd5a +0 129 #636b9c +0 130 #526b29 +0 131 #949494 +0 132 #006300 +0 133 #00634a +0 134 #7b844a +0 135 #e7bd7b +0 136 #a5b5c6 +0 137 #6b6b94 +0 138 #846b6b +0 139 #529c4a +0 140 #d6e7e7 +0 141 #526363 +0 142 #186b4a +0 143 #9ca5b5 +0 144 #ff9400 +0 145 #ff9400 +0 146 #00634a +0 147 #7b844a +0 148 #63737b +0 149 #e7bd7b +0 150 #184a18 +0 151 #f7bd5a +0 152 #000000 +0 153 #f73829 +0 154 #000000 +0 155 #ffff52 +0 156 #52794a +0 157 #639a5a +0 158 #c66142 +0 159 #e76942 +0 160 #ff7952 +0 161 #dedede +0 162 #f3eed3 +0 163 #f5ae5d +0 164 #95ce99 +0 165 #b5157d +0 166 #eeeeee +0 167 #848484 +0 168 #7b7b7b +0 169 #005a00 +0 170 #e77373 +0 171 #ffcb31 +0 172 #29794a +0 173 #de2821 +0 174 #2159c6 +0 175 #f8f8f8 +0 176 #e6e6e6 +0 177 #21845a +0 178 #cccccc +6 1530 11160 15570 11880 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 3089 11205 11626 11205 11626 11835 3089 11835 3089 11205 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 4365 11205 4365 11835 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 5715 11205 5715 11835 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 7515 11205 7515 11835 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 7605 11520 9630 11520 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 9675 11205 9675 11835 +4 2 -1 50 -1 4 10 0.0000 2 150 270 3825 11565 $0$\001 +4 0 0 50 -1 4 12 
0.0000 3 195 5670 5760 11565 $\\displaystyle\\sum_{k=0}^{k=1}\\sum_{j=k.bs}^{j=(k+1).bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3360 4410 11565 $\\displaystyle\\sum_{j=0}^{j=bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 5895 9675 11565 $\\displaystyle\\sum_{k=0}^{k=(n-1)}\\sum_{j=k.bs}^{j=(k+1)bs-1}z(i,j)$\001 +4 2 0 50 -1 4 12 0.0000 2 180 945 3015 11475 vector $V$\001 +4 2 0 50 -1 4 12 0.0000 0 195 1485 3015 11655 in global memory\001 +-6 +6 810 13185 4140 13905 +6 1755 13230 3015 13860 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 3015 13230 3015 13860 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 2700 13230 2700 13860 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 1755 13230 1755 13860 +-6 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 855 13230 4095 13230 4095 13860 855 13860 855 13230 +-6 +6 810 9000 4140 9720 +6 1755 9045 3015 9675 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 3015 9045 3015 9675 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 2700 9045 2700 9675 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 1755 9045 1755 9675 +-6 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 855 9045 4095 9045 4095 9675 855 9675 855 9045 +-6 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 2250 9675 3645 11205 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 3285 9675 3780 11205 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 4590 9675 4590 11205 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 5400 9675 4770 11205 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 6660 9675 4950 11205 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 8640 9675 10530 11205 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 10710 9675 10710 11205 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 3510 11835 1350 13230 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 3645 11835 2250 13230 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 3780 
11835 3420 13230 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 4590 11835 4590 13230 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 4770 11835 5400 13230 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 4950 11835 6660 13230 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 10530 11835 8640 13230 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 10710 11835 10710 13230 +2 1 1 2 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7425 13230 8235 13230 +2 1 1 2 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7425 13860 8235 13860 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 4185 13230 7425 13230 7425 13860 4185 13860 4185 13230 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 6345 13230 6345 13860 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 5085 13230 5085 13860 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 6075 13230 6075 13860 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 10395 13230 10395 13860 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 8280 13230 11610 13230 11610 13860 8280 13860 8280 13230 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 9495 13230 9495 13860 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 2745 13500 2970 13500 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 6120 13500 6345 13500 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 9585 13545 10305 13545 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 1260 9675 3555 11205 +2 1 1 2 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7425 9045 8235 9045 +2 1 1 2 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7425 9675 8235 9675 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 4185 9045 7425 9045 7425 9675 4185 9675 4185 9045 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 6345 9045 6345 9675 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 5085 9045 5085 9675 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 6075 9045 6075 9675 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 10395 9045 10395 9675 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 8280 9045 11610 9045 11610 9675 8280 9675 8280 9045 +2 1 0 1 0 7 50 -1 -1 0.000 
0 0 -1 0 0 2 + 9495 9045 9495 9675 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 2745 9315 2970 9315 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 6120 9315 6345 9315 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 9585 9360 10305 9360 +4 0 0 50 -1 4 12 0.0000 3 195 4020 10395 9405 $\\displaystyle\\sum_{j=(n-1)bs}^{j=n.bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 645 990 13590 $z(i,0)$\001 +4 2 -1 50 -1 4 10 0.0000 2 150 825 11610 14040 block $n-1$\001 +4 2 -1 50 -1 4 10 0.0000 2 150 675 7425 14040 block $1$\001 +4 2 -1 50 -1 4 10 0.0000 2 150 675 4095 14040 block $0$\001 +4 0 0 50 -1 4 12 0.0000 2 180 1245 855 14085 row i of $C_z$\001 +4 0 0 50 -1 4 12 0.0000 0 195 1215 855 14310 in global mem\001 +4 0 0 50 -1 4 12 0.0000 3 195 3525 10440 13590 $\\displaystyle\\sum_{j=0}^{j=n.bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3585 8325 13590 $\\displaystyle\\sum_{j=0}^{j=(n-1)bs}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3465 6345 13590 $\\displaystyle\\sum_{j=0}^{j=2bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3420 5085 13590 $\\displaystyle\\sum_{j=0}^{j=bs+1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3195 4230 13590 $\\displaystyle\\sum_{j=0}^{j=bs}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3360 3060 13590 $\\displaystyle\\sum_{j=0}^{j=bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3090 1800 13590 $\\displaystyle\\sum_{j=0}^{j=1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 645 990 9405 $z(i,0)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 750 4320 9405 $z(i,bs)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3570 6390 9405 $\\displaystyle\\sum_{j=bs}^{j=2bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3525 5085 9405 $\\displaystyle\\sum_{j=bs}^{j=bs+1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3360 3015 9405 $\\displaystyle\\sum_{j=0}^{j=bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 1200 8325 9405 $z(i,(n-1).bs)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3090 1800 9405 $\\displaystyle\\sum_{j=0}^{j=1}z(i,j)$\001 +4 2 -1 50 -1 4 10 0.0000 2 150 825 11610 9855 block $n-1$\001 +4 2 -1 50 -1 4 10 
0.0000 2 150 675 7425 9855 block $1$\001 +4 2 -1 50 -1 4 10 0.0000 2 150 675 4095 9855 block $0$\001 +4 0 0 50 -1 4 12 0.0000 0 195 1695 855 8955 in GPU global mem\001 +4 0 0 50 -1 4 12 0.0000 0 195 960 855 8775 prefixsums\001 diff --git a/doc/img/GPUaddsoms2cumuls.pdf b/doc/img/GPUaddsoms2cumuls.pdf new file mode 100644 index 0000000..ea7ff50 Binary files /dev/null and b/doc/img/GPUaddsoms2cumuls.pdf differ diff --git a/doc/img/GPUaddsoms2cumuls.pdf_t b/doc/img/GPUaddsoms2cumuls.pdf_t new file mode 100644 index 0000000..809bf36 --- /dev/null +++ b/doc/img/GPUaddsoms2cumuls.pdf_t @@ -0,0 +1,62 @@ +\begin{picture}(0,0)% +\includegraphics{GPUaddsoms2cumuls.pdf}% +\end{picture}% +\setlength{\unitlength}{4144sp}% +% +\begingroup\makeatletter\ifx\SetFigFont\undefined% +\gdef\SetFigFont#1#2#3#4#5{% + \reset@font\fontsize{#1}{#2pt}% + \fontfamily{#3}\fontseries{#4}\fontshape{#5}% + \selectfont}% +\fi\endgroup% +\begin{picture}(10815,5779)(834,-13544) +\put(3826,-10726){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}$0$}}}} +\put(5761,-10726){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{k=0}^{k=1}\sum_{j=k.bs}^{j=(k+1).bs-1}z(i,j)$}% +}}}} +\put(4411,-10726){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=bs-1}z(i,j)$}% +}}}} +\put(9676,-10726){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{k=0}^{k=(n-1)}\sum_{j=k.bs}^{j=(k+1)bs-1}z(i,j)$}% +}}}} +\put(3016,-10636){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}vector $V$}% +}}}} +\put(10396,-8566){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=(n-1)bs}^{j=n.bs-1}z(i,j)$}% +}}}} 
+\put(991,-12751){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,0)$}% +}}}} +\put(11611,-13201){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}block $n-1$}}}} +\put(7426,-13201){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}block $1$}}}} +\put(4096,-13201){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}block $0$}}}} +\put(856,-13246){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}row i of $C_z$}% +}}}} +\put(10441,-12751){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=n.bs-1}z(i,j)$}% +}}}} +\put(8326,-12751){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=(n-1)bs}z(i,j)$}% +}}}} +\put(6346,-12751){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=2bs-1}z(i,j)$}% +}}}} +\put(5086,-12751){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=bs+1}z(i,j)$}% +}}}} +\put(4231,-12751){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=bs}z(i,j)$}% +}}}} +\put(3061,-12751){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=bs-1}z(i,j)$}% +}}}} +\put(1801,-12751){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=1}z(i,j)$}% +}}}} +\put(991,-8566){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,0)$}% +}}}} 
+\put(4321,-8566){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,bs)$}% +}}}} +\put(6391,-8566){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=bs}^{j=2bs-1}z(i,j)$}% +}}}} +\put(5086,-8566){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=bs}^{j=bs+1}z(i,j)$}% +}}}} +\put(3016,-8566){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=bs-1}z(i,j)$}% +}}}} +\put(8326,-8566){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,(n-1).bs)$}% +}}}} +\put(1801,-8566){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=1}z(i,j)$}% +}}}} +\put(11611,-9016){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}block $n-1$}}}} +\put(7426,-9016){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}block $1$}}}} +\put(4096,-9016){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}block $0$}}}} +\end{picture}% diff --git a/doc/img/GPUcumuls.fig b/doc/img/GPUcumuls.fig new file mode 100644 index 0000000..720b20d --- /dev/null +++ b/doc/img/GPUcumuls.fig @@ -0,0 +1,331 @@ +#FIG 3.2 Produced by xfig version 3.2.5b +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +0 32 #9c0000 +0 33 #8c8c8c +0 34 #8c8c8c +0 35 #424242 +0 36 #8c8c8c +0 37 #424242 +0 38 #8c8c8c +0 39 #424242 +0 40 #8c8c8c +0 41 #424242 +0 42 #8c8c8c +0 43 #424242 +0 44 #c6b797 +0 45 #eff8ff +0 46 #dccba6 +0 47 #404040 +0 48 #808080 +0 49 #c0c0c0 +0 50 #e0e0e0 +0 51 #8e8f8e +0 52 #aaaaaa +0 53 #555555 +0 54 #c7c3c7 +0 55 #565151 +0 56 #8e8e8e +0 57 #d7d7d7 +0 58 #85807d +0 59 #d2d2d2 +0 60 #3a3a3a +0 61 
#4573aa +0 62 #aeaeae +0 63 #7b79a5 +0 64 #444444 +0 65 #73758c +0 66 #f7f7f7 +0 67 #414541 +0 68 #635dce +0 69 #bebebe +0 70 #515151 +0 71 #e7e3e7 +0 72 #000049 +0 73 #797979 +0 74 #303430 +0 75 #414141 +0 76 #c7b696 +0 77 #dd9d93 +0 78 #f1ece0 +0 79 #c3c3c3 +0 80 #e2c8a8 +0 81 #e1e1e1 +0 82 #ededed +0 83 #da7a1a +0 84 #f1e41a +0 85 #887dc2 +0 86 #b0a193 +0 87 #837cdd +0 88 #d6d6d6 +0 89 #8c8ca5 +0 90 #4a4a4a +0 91 #8c6b6b +0 92 #5a5a5a +0 93 #636363 +0 94 #b79b73 +0 95 #4193ff +0 96 #bf703b +0 97 #db7700 +0 98 #dab800 +0 99 #006400 +0 100 #5a6b3b +0 101 #d3d3d3 +0 102 #8e8ea4 +0 103 #f3b95d +0 104 #89996b +0 105 #646464 +0 106 #b7e6ff +0 107 #86c0ec +0 108 #bdbdbd +0 109 #d39552 +0 110 #98d2fe +0 111 #616161 +0 112 #aeb2ae +0 113 #717171 +0 114 #ff9a00 +0 115 #8c9c6b +0 116 #f76b00 +0 117 #5a6b39 +0 118 #8c9c6b +0 119 #8c9c7b +0 120 #184a18 +0 121 #adadad +0 122 #f7bd5a +0 123 #636b9c +0 124 #de0000 +0 125 #adadad +0 126 #f7bd5a +0 127 #adadad +0 128 #f7bd5a +0 129 #636b9c +0 130 #526b29 +0 131 #949494 +0 132 #006300 +0 133 #00634a +0 134 #7b844a +0 135 #e7bd7b +0 136 #a5b5c6 +0 137 #6b6b94 +0 138 #846b6b +0 139 #529c4a +0 140 #d6e7e7 +0 141 #526363 +0 142 #186b4a +0 143 #9ca5b5 +0 144 #ff9400 +0 145 #ff9400 +0 146 #00634a +0 147 #7b844a +0 148 #63737b +0 149 #e7bd7b +0 150 #184a18 +0 151 #f7bd5a +0 152 #000000 +0 153 #f73829 +0 154 #000000 +0 155 #ffff52 +0 156 #52794a +0 157 #639a5a +0 158 #c66142 +0 159 #e76942 +0 160 #ff7952 +0 161 #dedede +0 162 #f3eed3 +0 163 #f5ae5d +0 164 #95ce99 +0 165 #b5157d +0 166 #eeeeee +0 167 #848484 +0 168 #7b7b7b +0 169 #005a00 +0 170 #e77373 +0 171 #ffcb31 +0 172 #29794a +0 173 #de2821 +0 174 #2159c6 +0 175 #f8f8f8 +0 176 #e6e6e6 +0 177 #21845a +0 178 #cccccc +6 1710 1170 2970 1710 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 2970 1170 2970 1710 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 2655 1170 2655 1710 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 1710 1170 1710 1710 +-6 +6 765 2610 4095 3330 +6 1710 2655 2970 3285 +2 1 0 1 0 7 
50 -1 -1 0.000 0 0 -1 0 0 2 + 2970 2655 2970 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 2655 2655 2655 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 1710 2655 1710 3285 +-6 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 810 2655 4050 2655 4050 3285 810 3285 810 2655 +-6 +6 5400 1800 5670 2610 +6 5400 2340 5670 2610 +2 3 0 1 0 7 100 0 20 0.000 0 0 7 0 0 8 + 5587 2340 5483 2340 5483 2501 5400 2501 5535 2610 5670 2501 + 5587 2501 5587 2340 +-6 +6 5400 1800 5670 2070 +2 3 0 1 0 7 100 0 20 0.000 0 0 7 0 0 8 + 5587 1800 5483 1800 5483 1961 5400 1961 5535 2070 5670 1961 + 5587 1961 5587 1800 +-6 +-6 +6 9540 1800 9810 2610 +6 9540 2340 9810 2610 +2 3 0 1 0 7 100 0 20 0.000 0 0 7 0 0 8 + 9727 2340 9623 2340 9623 2501 9540 2501 9675 2610 9810 2501 + 9727 2501 9727 2340 +-6 +6 9540 1800 9810 2070 +2 3 0 1 0 7 100 0 20 0.000 0 0 7 0 0 8 + 9727 1800 9623 1800 9623 1961 9540 1961 9675 2070 9810 1961 + 9727 1961 9727 1800 +-6 +-6 +6 2700 1800 2970 2610 +6 2700 2340 2970 2610 +2 3 0 1 0 7 100 0 20 0.000 0 0 7 0 0 8 + 2887 2340 2783 2340 2783 2501 2700 2501 2835 2610 2970 2501 + 2887 2501 2887 2340 +-6 +6 2700 1800 2970 2070 +2 3 0 1 0 7 100 0 20 0.000 0 0 7 0 0 8 + 2887 1800 2783 1800 2783 1961 2700 1961 2835 2070 2970 1961 + 2887 1961 2887 1800 +-6 +-6 +6 2025 3330 2295 3600 +2 3 0 1 0 7 100 0 20 0.000 0 0 7 0 0 8 + 2212 3330 2108 3330 2108 3491 2025 3491 2160 3600 2295 3491 + 2212 3491 2212 3330 +-6 +6 5445 3330 5715 3600 +2 3 0 1 0 7 100 0 20 0.000 0 0 7 0 0 8 + 5632 3330 5528 3330 5528 3491 5445 3491 5580 3600 5715 3491 + 5632 3491 5632 3330 +-6 +6 9540 3330 9810 3600 +2 3 0 1 0 7 100 0 20 0.000 0 0 7 0 0 8 + 9727 3330 9623 3330 9623 3491 9540 3491 9675 3600 9810 3491 + 9727 3491 9727 3330 +-6 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 10350 1170 10350 1710 +2 1 1 2 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7380 1710 8190 1710 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 6300 1170 6300 1710 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 5040 1170 5040 1710 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 
-1 0 0 5 + 810 1170 4050 1170 4050 1710 810 1710 810 1170 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 4140 1170 7380 1170 7380 1710 4140 1710 4140 1170 +2 1 1 2 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7380 1170 8190 1170 +2 1 1 2 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7380 2655 8190 2655 +2 1 1 2 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7380 3285 8190 3285 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 4140 2655 7380 2655 7380 3285 4140 3285 4140 2655 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 6300 2655 6300 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 5040 2655 5040 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 6030 2655 6030 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 10350 2655 10350 3285 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 8190 1170 11520 1170 11520 1710 8190 1710 8190 1170 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 8235 2655 11565 2655 11565 3285 8235 3285 8235 2655 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 6030 1170 6030 1710 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 3044 4320 11581 4320 11581 4950 3044 4950 3044 4320 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 4320 4320 4320 4950 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 5580 4320 5580 4950 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 10305 4320 10305 4950 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 2700 1440 2925 1440 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 9450 2655 9450 3285 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 2700 2925 2925 2925 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 6075 1440 6300 1440 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 6075 2925 6300 2925 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 9540 1440 10260 1440 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 5670 4635 10170 4635 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 3105 3285 3240 4320 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 6398 3284 4860 4320 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 90.00 150.00 + 10442 3285 10440 4275 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 9450 1170 9450 1710 +2 1 
2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 9540 2970 10260 2970 +4 0 0 50 -1 4 12 0.0000 2 180 645 945 3015 $z(i,0)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 750 4275 3015 $z(i,bs)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 1080 10395 1530 $z(i,n.bs-1)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 750 4275 1530 $z(i,bs)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 975 5085 1530 $z(i,bs+1)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 1020 6345 1530 $z(i,2bs-1)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 915 3015 1530 $z(i,bs-1)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 645 945 1530 $z(i,0)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 645 1845 1530 $z(i,1)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 1200 8235 1530 $z(i,(n-1).bs)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3570 6345 3015 $\\displaystyle\\sum_{j=bs}^{j=2bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3525 5040 3015 $\\displaystyle\\sum_{j=bs}^{j=bs+1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 4020 10350 3015 $\\displaystyle\\sum_{j=(n-1)bs}^{j=n.bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3360 2970 3015 $\\displaystyle\\sum_{j=0}^{j=bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 1200 8280 3015 $z(i,(n-1).bs)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3570 4365 4680 $\\displaystyle\\sum_{j=bs}^{j=2bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3360 3105 4680 $\\displaystyle\\sum_{j=0}^{j=bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 4020 10305 4680 $\\displaystyle\\sum_{j=(n-1)bs}^{j=n.bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3090 1755 3015 $\\displaystyle\\sum_{j=0}^{j=1}z(i,j)$\001 +4 2 -1 50 -1 4 10 0.0000 2 150 675 4050 1890 block $0$\001 +4 2 -1 50 -1 4 10 0.0000 2 150 675 7380 1890 block $1$\001 +4 2 -1 50 -1 4 10 0.0000 2 150 825 11520 1890 block $n-1$\001 +4 2 -1 50 -1 4 10 0.0000 2 150 825 11565 3465 block $n-1$\001 +4 2 -1 50 -1 4 10 0.0000 2 150 675 7380 3465 block $1$\001 +4 2 -1 50 -1 4 10 0.0000 2 150 675 4050 3465 block $0$\001 +4 1 0 50 -1 4 12 0.0000 0 195 2760 5535 2250 one parallel prefixsum per block\001 +4 0 0 50 -1 4 12 0.0000 0 195 1695 810 1080 in GPU 
global mem\001 +4 0 0 50 -1 4 12 0.0000 0 150 1770 810 2565 in GPU shared mem\001 +4 0 0 50 -1 4 12 0.0000 0 195 960 810 2385 prefixsums\001 +4 0 0 50 -1 4 12 0.0000 0 195 1545 810 900 row i of the image\001 +4 2 0 50 -1 4 12 0.0000 2 180 2235 3015 4545 vector $V$ of block sums\001 +4 2 0 50 -1 4 12 0.0000 0 195 1965 3015 4770 in GPU global memory\001 +4 1 0 50 -1 4 12 0.0000 0 195 1980 2115 3780 to GPU global memory\001 +4 1 0 50 -1 4 12 0.0000 0 195 1980 9360 3780 to GPU global memory\001 +4 1 0 50 -1 4 12 0.0000 0 195 1980 5490 3780 to GPU global memory\001 diff --git a/doc/img/GPUcumuls.pdf b/doc/img/GPUcumuls.pdf new file mode 100644 index 0000000..6893d89 Binary files /dev/null and b/doc/img/GPUcumuls.pdf differ diff --git a/doc/img/GPUcumuls.pdf_t b/doc/img/GPUcumuls.pdf_t new file mode 100644 index 0000000..56e7c6f --- /dev/null +++ b/doc/img/GPUcumuls.pdf_t @@ -0,0 +1,59 @@ +\begin{picture}(0,0)% +\includegraphics{GPUcumuls.pdf}% +\end{picture}% +\setlength{\unitlength}{4144sp}% +% +\begingroup\makeatletter\ifx\SetFigFont\undefined% +\gdef\SetFigFont#1#2#3#4#5{% + \reset@font\fontsize{#1}{#2pt}% + \fontfamily{#3}\fontseries{#4}\fontshape{#5}% + \selectfont}% +\fi\endgroup% +\begin{picture}(10815,4243)(789,-4133) +\put(946,-2176){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,0)$}% +}}}} +\put(4276,-2176){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,bs)$}% +}}}} +\put(10396,-691){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,n.bs-1)$}% +}}}} +\put(4276,-691){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,bs)$}% +}}}} +\put(5086,-691){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,bs+1)$}% +}}}} 
+\put(6346,-691){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,2bs-1)$}% +}}}} +\put(3016,-691){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,bs-1)$}% +}}}} +\put(946,-691){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,0)$}% +}}}} +\put(1846,-691){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,1)$}% +}}}} +\put(8236,-691){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,(n-1).bs)$}% +}}}} +\put(6346,-2176){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=bs}^{j=2bs-1}z(i,j)$}% +}}}} +\put(5041,-2176){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=bs}^{j=bs+1}z(i,j)$}% +}}}} +\put(10351,-2176){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=(n-1)bs}^{j=n.bs-1}z(i,j)$}% +}}}} +\put(2971,-2176){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=bs-1}z(i,j)$}% +}}}} +\put(8281,-2176){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$z(i,(n-1).bs)$}% +}}}} +\put(4366,-3841){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=bs}^{j=2bs-1}z(i,j)$}% +}}}} +\put(3106,-3841){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=bs-1}z(i,j)$}% +}}}} 
+\put(10306,-3841){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=(n-1)bs}^{j=n.bs-1}z(i,j)$}% +}}}} +\put(1756,-2176){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=1}z(i,j)$}% +}}}} +\put(4051,-1051){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}block $0$}}}} +\put(7381,-1051){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}block $1$}}}} +\put(11521,-1051){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}block $n-1$}}}} +\put(11566,-2626){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}block $n-1$}}}} +\put(7381,-2626){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}block $1$}}}} +\put(4051,-2626){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}block $0$}}}} +\put(3016,-3706){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}vector $V$ of block sums}% +}}}} +\end{picture}% diff --git a/doc/img/GPUscansomblocs.fig b/doc/img/GPUscansomblocs.fig new file mode 100644 index 0000000..ab71cb9 --- /dev/null +++ b/doc/img/GPUscansomblocs.fig @@ -0,0 +1,202 @@ +#FIG 3.2 Produced by xfig version 3.2.5b +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +0 32 #9c0000 +0 33 #8c8c8c +0 34 #8c8c8c +0 35 #424242 +0 36 #8c8c8c +0 37 #424242 +0 38 #8c8c8c +0 39 #424242 +0 40 #8c8c8c +0 41 #424242 +0 42 #8c8c8c +0 43 #424242 +0 44 #c6b797 +0 45 #eff8ff +0 46 #dccba6 +0 47 #404040 +0 48 #808080 +0 49 #c0c0c0 +0 50 #e0e0e0 +0 51 #8e8f8e +0 52 #aaaaaa +0 53 #555555 +0 54 #c7c3c7 +0 55 #565151 +0 56 #8e8e8e +0 57 #d7d7d7 +0 58 #85807d +0 59 #d2d2d2 +0 60 #3a3a3a +0 61 #4573aa +0 62 #aeaeae +0 63 #7b79a5 +0 64 #444444 +0 65 #73758c +0 66 #f7f7f7 +0 67 #414541 +0 
68 #635dce +0 69 #bebebe +0 70 #515151 +0 71 #e7e3e7 +0 72 #000049 +0 73 #797979 +0 74 #303430 +0 75 #414141 +0 76 #c7b696 +0 77 #dd9d93 +0 78 #f1ece0 +0 79 #c3c3c3 +0 80 #e2c8a8 +0 81 #e1e1e1 +0 82 #ededed +0 83 #da7a1a +0 84 #f1e41a +0 85 #887dc2 +0 86 #b0a193 +0 87 #837cdd +0 88 #d6d6d6 +0 89 #8c8ca5 +0 90 #4a4a4a +0 91 #8c6b6b +0 92 #5a5a5a +0 93 #636363 +0 94 #b79b73 +0 95 #4193ff +0 96 #bf703b +0 97 #db7700 +0 98 #dab800 +0 99 #006400 +0 100 #5a6b3b +0 101 #d3d3d3 +0 102 #8e8ea4 +0 103 #f3b95d +0 104 #89996b +0 105 #646464 +0 106 #b7e6ff +0 107 #86c0ec +0 108 #bdbdbd +0 109 #d39552 +0 110 #98d2fe +0 111 #616161 +0 112 #aeb2ae +0 113 #717171 +0 114 #ff9a00 +0 115 #8c9c6b +0 116 #f76b00 +0 117 #5a6b39 +0 118 #8c9c6b +0 119 #8c9c7b +0 120 #184a18 +0 121 #adadad +0 122 #f7bd5a +0 123 #636b9c +0 124 #de0000 +0 125 #adadad +0 126 #f7bd5a +0 127 #adadad +0 128 #f7bd5a +0 129 #636b9c +0 130 #526b29 +0 131 #949494 +0 132 #006300 +0 133 #00634a +0 134 #7b844a +0 135 #e7bd7b +0 136 #a5b5c6 +0 137 #6b6b94 +0 138 #846b6b +0 139 #529c4a +0 140 #d6e7e7 +0 141 #526363 +0 142 #186b4a +0 143 #9ca5b5 +0 144 #ff9400 +0 145 #ff9400 +0 146 #00634a +0 147 #7b844a +0 148 #63737b +0 149 #e7bd7b +0 150 #184a18 +0 151 #f7bd5a +0 152 #000000 +0 153 #f73829 +0 154 #000000 +0 155 #ffff52 +0 156 #52794a +0 157 #639a5a +0 158 #c66142 +0 159 #e76942 +0 160 #ff7952 +0 161 #dedede +0 162 #f3eed3 +0 163 #f5ae5d +0 164 #95ce99 +0 165 #b5157d +0 166 #eeeeee +0 167 #848484 +0 168 #7b7b7b +0 169 #005a00 +0 170 #e77373 +0 171 #ffcb31 +0 172 #29794a +0 173 #de2821 +0 174 #2159c6 +0 175 #f8f8f8 +0 176 #e6e6e6 +0 177 #21845a +0 178 #cccccc +6 7290 4995 7695 6209 +6 7290 5804 7695 6209 +2 3 0 1 0 7 100 0 20 0.000 0 0 7 0 0 8 + 7571 5804 7415 5804 7415 6046 7290 6046 7493 6209 7695 6046 + 7571 6046 7571 5804 +-6 +6 7290 4995 7695 5400 +2 3 0 1 0 7 100 0 20 0.000 0 0 7 0 0 8 + 7571 4995 7415 4995 7415 5236 7290 5236 7493 5400 7695 5236 + 7571 5236 7571 4995 +-6 +-6 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 
+ 3044 6255 11581 6255 11581 6885 3044 6885 3044 6255 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 4320 6255 4320 6885 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 3044 4320 11581 4320 11581 4950 3044 4950 3044 4320 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 4320 4320 4320 4950 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 5670 4320 5670 4950 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 5670 6255 5670 6885 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 7470 6255 7470 6885 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 5760 4635 9540 4635 +2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 2 + 7560 6570 9585 6570 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 9630 6255 9630 6885 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 9630 4320 9630 4950 +4 0 0 50 -1 4 12 0.0000 3 195 3570 4365 4680 $\\displaystyle\\sum_{j=bs}^{j=2bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3360 3105 4680 $\\displaystyle\\sum_{j=0}^{j=bs-1}z(i,j)$\001 +4 2 -1 50 -1 4 10 0.0000 2 150 270 3780 6615 $0$\001 +4 0 0 50 -1 4 12 0.0000 3 195 5670 5715 6615 $\\displaystyle\\sum_{k=0}^{k=1}\\sum_{j=k.bs}^{j=(k+1).bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3360 4365 6615 $\\displaystyle\\sum_{j=0}^{j=bs-1}z(i,j)$\001 +4 1 0 50 -1 4 12 0.0000 0 195 2415 7470 5625 parallel exclusive prefixsum\001 +4 0 0 50 -1 4 12 0.0000 3 195 4020 10080 4680 $\\displaystyle\\sum_{j=(n-1)bs}^{j=n.bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 5895 9630 6615 $\\displaystyle\\sum_{k=0}^{k=(n-1)}\\sum_{j=k.bs}^{j=(k+1)bs-1}z(i,j)$\001 +4 2 0 50 -1 4 12 0.0000 2 180 2235 2970 4590 vector $V$ of block sums\001 +4 2 0 50 -1 4 12 0.0000 0 195 1485 2970 4770 in global memory\001 +4 2 0 50 -1 4 12 0.0000 2 180 945 2970 6525 vector $V$\001 +4 2 0 50 -1 4 12 0.0000 0 195 1485 2970 6705 in global memory\001 diff --git a/doc/img/GPUscansomblocs.pdf b/doc/img/GPUscansomblocs.pdf new file mode 100644 index 0000000..16367be Binary files /dev/null and b/doc/img/GPUscansomblocs.pdf differ diff --git a/doc/img/GPUscansomblocs.pdf_t b/doc/img/GPUscansomblocs.pdf_t new 
file mode 100644 index 0000000..fabdb8d --- /dev/null +++ b/doc/img/GPUscansomblocs.pdf_t @@ -0,0 +1,30 @@ +\begin{picture}(0,0)% +\includegraphics{GPUscansomblocs.pdf}% +\end{picture}% +\setlength{\unitlength}{4144sp}% +% +\begingroup\makeatletter\ifx\SetFigFont\undefined% +\gdef\SetFigFont#1#2#3#4#5{% + \reset@font\fontsize{#1}{#2pt}% + \fontfamily{#3}\fontseries{#4}\fontshape{#5}% + \selectfont}% +\fi\endgroup% +\begin{picture}(10172,2609)(1432,-6068) +\put(4366,-3841){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=bs}^{j=2bs-1}z(i,j)$}% +}}}} +\put(3106,-3841){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=bs-1}z(i,j)$}% +}}}} +\put(3781,-5776){\makebox(0,0)[rb]{\smash{{\SetFigFont{10}{12.0}{\sfdefault}{\mddefault}{\updefault}$0$}}}} +\put(5716,-5776){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{k=0}^{k=1}\sum_{j=k.bs}^{j=(k+1).bs-1}z(i,j)$}% +}}}} +\put(4366,-5776){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=0}^{j=bs-1}z(i,j)$}% +}}}} +\put(10081,-3841){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{j=(n-1)bs}^{j=n.bs-1}z(i,j)$}% +}}}} +\put(9631,-5776){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\displaystyle\sum_{k=0}^{k=(n-1)}\sum_{j=k.bs}^{j=(k+1)bs-1}z(i,j)$}% +}}}} +\put(2971,-3751){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}vector $V$ of block sums}% +}}}} +\put(2971,-5686){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}vector $V$}% +}}}} +\end{picture}% diff --git a/doc/img/SAVE.fig 
b/doc/img/SAVE.fig new file mode 100644 index 0000000..6900cc8 --- /dev/null +++ b/doc/img/SAVE.fig @@ -0,0 +1,244 @@ +#FIG 3.2 Produced by xfig version 3.2.5b +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +0 32 #9c0000 +0 33 #8c8c8c +0 34 #8c8c8c +0 35 #424242 +0 36 #8c8c8c +0 37 #424242 +0 38 #8c8c8c +0 39 #424242 +0 40 #8c8c8c +0 41 #424242 +0 42 #8c8c8c +0 43 #424242 +0 44 #c6b797 +0 45 #eff8ff +0 46 #dccba6 +0 47 #404040 +0 48 #808080 +0 49 #c0c0c0 +0 50 #e0e0e0 +0 51 #8e8f8e +0 52 #aaaaaa +0 53 #555555 +0 54 #c7c3c7 +0 55 #565151 +0 56 #8e8e8e +0 57 #d7d7d7 +0 58 #85807d +0 59 #d2d2d2 +0 60 #3a3a3a +0 61 #4573aa +0 62 #aeaeae +0 63 #7b79a5 +0 64 #444444 +0 65 #73758c +0 66 #f7f7f7 +0 67 #414541 +0 68 #635dce +0 69 #bebebe +0 70 #515151 +0 71 #e7e3e7 +0 72 #000049 +0 73 #797979 +0 74 #303430 +0 75 #414141 +0 76 #c7b696 +0 77 #dd9d93 +0 78 #f1ece0 +0 79 #c3c3c3 +0 80 #e2c8a8 +0 81 #e1e1e1 +0 82 #ededed +0 83 #da7a1a +0 84 #f1e41a +0 85 #887dc2 +0 86 #b0a193 +0 87 #837cdd +0 88 #d6d6d6 +0 89 #8c8ca5 +0 90 #4a4a4a +0 91 #8c6b6b +0 92 #5a5a5a +0 93 #636363 +0 94 #b79b73 +0 95 #4193ff +0 96 #bf703b +0 97 #db7700 +0 98 #dab800 +0 99 #006400 +0 100 #5a6b3b +0 101 #d3d3d3 +0 102 #8e8ea4 +0 103 #f3b95d +0 104 #89996b +0 105 #646464 +0 106 #b7e6ff +0 107 #86c0ec +0 108 #bdbdbd +0 109 #d39552 +0 110 #98d2fe +0 111 #616161 +0 112 #aeb2ae +0 113 #717171 +0 114 #ff9a00 +0 115 #8c9c6b +0 116 #f76b00 +0 117 #5a6b39 +0 118 #8c9c6b +0 119 #8c9c7b +0 120 #184a18 +0 121 #adadad +0 122 #f7bd5a +0 123 #636b9c +0 124 #de0000 +0 125 #adadad +0 126 #f7bd5a +0 127 #adadad +0 128 #f7bd5a +0 129 #636b9c +0 130 #526b29 +0 131 #949494 +0 132 #006300 +0 133 #00634a +0 134 #7b844a +0 135 #e7bd7b +0 136 #a5b5c6 +0 137 #6b6b94 +0 138 #846b6b +0 139 #529c4a +0 140 #d6e7e7 +0 141 #526363 +0 142 #186b4a +0 143 #9ca5b5 +0 144 #ff9400 +0 145 #ff9400 +0 146 #00634a +0 147 #7b844a +0 148 #63737b +0 149 #e7bd7b +0 150 #184a18 +0 151 #f7bd5a +0 152 #000000 +0 153 #f73829 +0 154 #000000 +0 
155 #ffff52 +0 156 #52794a +0 157 #639a5a +0 158 #c66142 +0 159 #e76942 +0 160 #ff7952 +0 161 #dedede +0 162 #f3eed3 +0 163 #f5ae5d +0 164 #95ce99 +0 165 #b5157d +0 166 #eeeeee +0 167 #848484 +0 168 #7b7b7b +0 169 #005a00 +0 170 #e77373 +0 171 #ffcb31 +0 172 #29794a +0 173 #de2821 +0 174 #2159c6 +0 175 #f8f8f8 +0 176 #e6e6e6 +0 177 #21845a +0 178 #cccccc +6 1710 1170 2970 1710 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 2970 1170 2970 1710 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 2655 1170 2655 1710 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 1710 1170 1710 1710 +-6 +6 1710 2655 2970 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 2970 2655 2970 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 2655 2655 2655 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 1710 2655 1710 3285 +-6 +6 5040 2655 6300 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 6300 2655 6300 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 5985 2655 5985 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 5040 2655 5040 3285 +-6 +6 9090 2655 10350 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 10350 2655 10350 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 10035 2655 10035 3285 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 9090 2655 9090 3285 +-6 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 10350 1170 10350 1710 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 9090 1170 9090 1710 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 10035 1170 10035 1710 +2 1 1 2 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7380 1710 8190 1710 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 6300 1170 6300 1710 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 5985 1170 5985 1710 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 5040 1170 5040 1710 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 810 1170 4050 1170 4050 1710 810 1710 810 1170 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 4140 1170 7380 1170 7380 1710 4140 1710 4140 1170 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 8190 1170 11430 1170 11430 1710 8190 1710 8190 1170 +2 1 1 2 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7380 
1170 8190 1170 +2 1 1 2 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7380 2655 8190 2655 +2 1 1 2 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7380 3285 8190 3285 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 4140 2655 7380 2655 7380 3285 4140 3285 4140 2655 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 8190 2655 11430 2655 11430 3285 8190 3285 8190 2655 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 810 2655 4050 2655 4050 3285 810 3285 810 2655 +4 0 0 50 -1 4 12 0.0000 2 195 4335 585 5895 $N_{TB}$ blocks of $bs$ threads for one segment\001 +4 2 0 50 -1 4 12 0.0000 2 195 2730 3555 6435 $\\overrightarrow{P_{i-1}T_{i,0}}$\001 +4 2 -1 50 -1 4 12 0.0000 2 180 1245 11430 3465 block $N_b-1$\001 +4 0 0 50 -1 4 12 0.0000 2 180 645 945 3015 $z(i,0)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 1080 10395 1530 $z(i,n.bs-1)$\001 +4 2 -1 50 -1 4 12 0.0000 2 180 1005 11430 1980 block $n-1$\001 +4 0 0 50 -1 4 12 0.0000 2 180 750 4275 1530 $z(i,bs)$\001 +4 2 -1 50 -1 4 12 0.0000 2 180 840 7380 1980 block $1$\001 +4 0 0 50 -1 4 12 0.0000 2 180 975 5085 1530 $z(i,bs+1)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 1020 6345 1530 $z(i,2bs-1)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 915 3015 1530 $z(i,bs-1)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 645 945 1530 $z(i,0)$\001 +4 0 0 50 -1 4 12 0.0000 2 180 645 1845 1530 $z(i,1)$\001 +4 2 -1 50 -1 4 12 0.0000 2 180 840 4050 1980 block $0$\001 +4 0 0 50 -1 4 12 0.0000 2 180 750 4275 3015 $z(i,bs)$\001 +4 2 -1 50 -1 4 12 0.0000 0 150 630 11430 3465 block 1\001 +4 0 0 50 -1 4 12 0.0000 3 195 3570 6345 3015 $\\displaystyle\\sum_{j=bs}^{j=2bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3525 5130 3015 $\\displaystyle\\sum_{j=bs}^{j=bs+1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3360 3015 3015 $\\displaystyle\\sum_{j=0}^{j=bs-1}z(i,j)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3090 1755 3015 $\\displaystyle\\sum_{j=0}^{j=1}z(i,j)$\001 +4 2 -1 50 -1 4 12 0.0000 2 180 840 7380 3510 block $1$\001 +4 2 -1 50 -1 4 12 0.0000 2 180 840 4050 3510 block $0$\001 +4 0 0 50 -1 4 12 0.0000 2 180 1200 8235 1530 
$z(i,(n-1).bs)$\001 +4 0 0 50 -1 4 12 0.0000 3 195 3570 10395 3015 $\\displaystyle\\sum_{j=bs}^{j=2bs-1}z(i,j)$\001 diff --git a/doc/img/cochon_b_entier.jpg b/doc/img/cochon_b_entier.jpg new file mode 100644 index 0000000..55e9615 Binary files /dev/null and b/doc/img/cochon_b_entier.jpg differ diff --git a/doc/img/cochon_init.png b/doc/img/cochon_init.png new file mode 100644 index 0000000..24f20dd Binary files /dev/null and b/doc/img/cochon_init.png differ diff --git a/doc/img/cochon_it1.png b/doc/img/cochon_it1.png new file mode 100644 index 0000000..185024e Binary files /dev/null and b/doc/img/cochon_it1.png differ diff --git a/doc/img/cochon_it2_1.png b/doc/img/cochon_it2_1.png new file mode 100644 index 0000000..eba6c45 Binary files /dev/null and b/doc/img/cochon_it2_1.png differ diff --git a/doc/img/cochon_it2_2.png b/doc/img/cochon_it2_2.png new file mode 100644 index 0000000..632fa87 Binary files /dev/null and b/doc/img/cochon_it2_2.png differ diff --git a/doc/img/cochon_it4.png b/doc/img/cochon_it4.png new file mode 100644 index 0000000..26961ed Binary files /dev/null and b/doc/img/cochon_it4.png differ diff --git a/doc/img/cochon_petit_detail_16segments.jpg b/doc/img/cochon_petit_detail_16segments.jpg new file mode 100644 index 0000000..ce38be7 Binary files /dev/null and b/doc/img/cochon_petit_detail_16segments.jpg differ diff --git a/doc/img/cochon_petit_init.jpg b/doc/img/cochon_petit_init.jpg new file mode 100644 index 0000000..38093b2 Binary files /dev/null and b/doc/img/cochon_petit_init.jpg differ diff --git a/doc/img/cochon_petit_init.pgm b/doc/img/cochon_petit_init.pgm new file mode 100644 index 0000000..4e6f62f Binary files /dev/null and b/doc/img/cochon_petit_init.pgm differ diff --git a/doc/img/cochon_petit_it1.jpg b/doc/img/cochon_petit_it1.jpg new file mode 100644 index 0000000..32c59ac Binary files /dev/null and b/doc/img/cochon_petit_it1.jpg differ diff --git a/doc/img/cochon_petit_it1.pgm b/doc/img/cochon_petit_it1.pgm new file mode 100644 
index 0000000..833de0b Binary files /dev/null and b/doc/img/cochon_petit_it1.pgm differ diff --git a/doc/img/cochon_petit_it2.jpg b/doc/img/cochon_petit_it2.jpg new file mode 100644 index 0000000..49e7adb Binary files /dev/null and b/doc/img/cochon_petit_it2.jpg differ diff --git a/doc/img/cochon_petit_it2.pgm b/doc/img/cochon_petit_it2.pgm new file mode 100644 index 0000000..a05cda8 Binary files /dev/null and b/doc/img/cochon_petit_it2.pgm differ diff --git a/doc/img/cochon_petit_it3.jpg b/doc/img/cochon_petit_it3.jpg new file mode 100644 index 0000000..44dc59d Binary files /dev/null and b/doc/img/cochon_petit_it3.jpg differ diff --git a/doc/img/cochon_petit_it3.pgm b/doc/img/cochon_petit_it3.pgm new file mode 100644 index 0000000..176f5bb Binary files /dev/null and b/doc/img/cochon_petit_it3.pgm differ diff --git a/doc/img/cochon_petit_it4.jpg b/doc/img/cochon_petit_it4.jpg new file mode 100644 index 0000000..b7d6532 Binary files /dev/null and b/doc/img/cochon_petit_it4.jpg differ diff --git a/doc/img/cochon_petit_it4.pgm b/doc/img/cochon_petit_it4.pgm new file mode 100644 index 0000000..0a7f74b Binary files /dev/null and b/doc/img/cochon_petit_it4.pgm differ diff --git a/doc/img/cochon_positions.png b/doc/img/cochon_positions.png new file mode 100644 index 0000000..ef0fbaa Binary files /dev/null and b/doc/img/cochon_positions.png differ diff --git a/doc/img/cochon_small.jpg b/doc/img/cochon_small.jpg new file mode 100644 index 0000000..f4686f8 Binary files /dev/null and b/doc/img/cochon_small.jpg differ diff --git a/doc/img/contribs_segments.fig b/doc/img/contribs_segments.fig new file mode 100644 index 0000000..549b078 --- /dev/null +++ b/doc/img/contribs_segments.fig @@ -0,0 +1,1189 @@ +#FIG 3.2 Produced by xfig version 3.2.5b +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +0 32 #9c0000 +0 33 #8c8c8c +0 34 #8c8c8c +0 35 #424242 +0 36 #8c8c8c +0 37 #424242 +0 38 #8c8c8c +0 39 #424242 +0 40 #8c8c8c +0 41 #424242 +0 42 #8c8c8c +0 43 #424242 +0 44 
#c6b797 +0 45 #eff8ff +0 46 #dccba6 +0 47 #404040 +0 48 #808080 +0 49 #c0c0c0 +0 50 #e0e0e0 +0 51 #8e8f8e +0 52 #aaaaaa +0 53 #555555 +0 54 #c7c3c7 +0 55 #565151 +0 56 #8e8e8e +0 57 #d7d7d7 +0 58 #85807d +0 59 #d2d2d2 +0 60 #3a3a3a +0 61 #4573aa +0 62 #aeaeae +0 63 #7b79a5 +0 64 #444444 +0 65 #73758c +0 66 #f7f7f7 +0 67 #414541 +0 68 #635dce +0 69 #bebebe +0 70 #515151 +0 71 #e7e3e7 +0 72 #000049 +0 73 #797979 +0 74 #303430 +0 75 #414141 +0 76 #c7b696 +0 77 #dd9d93 +0 78 #f1ece0 +0 79 #c3c3c3 +0 80 #e2c8a8 +0 81 #e1e1e1 +0 82 #ededed +0 83 #da7a1a +0 84 #f1e41a +0 85 #887dc2 +0 86 #b0a193 +0 87 #837cdd +0 88 #d6d6d6 +0 89 #8c8ca5 +0 90 #4a4a4a +0 91 #8c6b6b +0 92 #5a5a5a +0 93 #636363 +0 94 #b79b73 +0 95 #4193ff +0 96 #bf703b +0 97 #db7700 +0 98 #dab800 +0 99 #006400 +0 100 #5a6b3b +0 101 #d3d3d3 +0 102 #8e8ea4 +0 103 #f3b95d +0 104 #89996b +0 105 #646464 +0 106 #b7e6ff +0 107 #86c0ec +0 108 #bdbdbd +0 109 #d39552 +0 110 #98d2fe +0 111 #616161 +0 112 #aeb2ae +0 113 #717171 +0 114 #ff9a00 +0 115 #8c9c6b +0 116 #f76b00 +0 117 #5a6b39 +0 118 #8c9c6b +0 119 #8c9c7b +0 120 #184a18 +0 121 #adadad +0 122 #f7bd5a +0 123 #636b9c +0 124 #de0000 +0 125 #adadad +0 126 #f7bd5a +0 127 #adadad +0 128 #f7bd5a +0 129 #636b9c +0 130 #526b29 +0 131 #949494 +0 132 #006300 +0 133 #00634a +0 134 #7b844a +0 135 #e7bd7b +0 136 #a5b5c6 +0 137 #6b6b94 +0 138 #846b6b +0 139 #529c4a +0 140 #d6e7e7 +0 141 #526363 +0 142 #186b4a +0 143 #9ca5b5 +0 144 #ff9400 +0 145 #ff9400 +0 146 #00634a +0 147 #7b844a +0 148 #63737b +0 149 #e7bd7b +0 150 #184a18 +0 151 #f7bd5a +0 152 #000000 +0 153 #f73829 +0 154 #000000 +0 155 #ffff52 +0 156 #52794a +0 157 #639a5a +0 158 #c66142 +0 159 #e76942 +0 160 #ff7952 +0 161 #dedede +0 162 #f3eed3 +0 163 #f5ae5d +0 164 #95ce99 +0 165 #b5157d +0 166 #eeeeee +0 167 #848484 +0 168 #7b7b7b +0 169 #005a00 +0 170 #e77373 +0 171 #ffcb31 +0 172 #29794a +0 173 #de2821 +0 174 #2159c6 +0 175 #f8f8f8 +0 176 #e6e6e6 +0 177 #21845a +0 178 #cccccc +6 855 1350 2475 1755 +6 1251 1350 
1395 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 1337 1350 1280 1408 1395 1466 1251 1524 1395 1581 1280 1639 + 1337 1697 1337 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1386 1350 1530 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 1472 1350 1415 1408 1530 1466 1386 1524 1530 1581 1415 1639 + 1472 1697 1472 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1521 1350 1665 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 1607 1350 1550 1408 1665 1466 1521 1524 1665 1581 1550 1639 + 1607 1697 1607 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1656 1350 1800 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 1742 1350 1685 1408 1800 1466 1656 1524 1800 1581 1685 1639 + 1742 1697 1742 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1791 1350 1935 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 1877 1350 1820 1408 1935 1466 1791 1524 1935 1581 1820 1639 + 1877 1697 1877 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1926 1350 2070 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 2012 1350 1955 1408 2070 1466 1926 1524 2070 1581 1955 1639 + 2012 1697 2012 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 2061 1350 2205 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 2147 1350 2090 1408 2205 1466 2061 1524 2205 1581 2090 1639 + 2147 1697 2147 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 2196 1350 2340 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 2282 1350 2225 1408 2340 1466 2196 1524 2340 1581 2225 1639 + 2282 1697 2282 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 2331 1350 2475 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 2417 1350 2360 1408 2475 1466 2331 1524 2475 1581 2360 1639 + 2417 1697 2417 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1125 1350 1269 1755 +3 0 0 1 
0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 1211 1350 1154 1408 1269 1466 1125 1524 1269 1581 1154 1639 + 1211 1697 1211 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 990 1350 1134 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 1076 1350 1019 1408 1134 1466 990 1524 1134 1581 1019 1639 + 1076 1697 1076 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 855 1350 999 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 941 1350 884 1408 999 1466 855 1524 999 1581 884 1639 + 941 1697 941 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +-6 +6 2700 1350 4320 1755 +6 3096 1350 3240 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 3182 1350 3125 1408 3240 1466 3096 1524 3240 1581 3125 1639 + 3182 1697 3182 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3231 1350 3375 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 3317 1350 3260 1408 3375 1466 3231 1524 3375 1581 3260 1639 + 3317 1697 3317 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3366 1350 3510 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 3452 1350 3395 1408 3510 1466 3366 1524 3510 1581 3395 1639 + 3452 1697 3452 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3501 1350 3645 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 3587 1350 3530 1408 3645 1466 3501 1524 3645 1581 3530 1639 + 3587 1697 3587 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3636 1350 3780 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 3722 1350 3665 1408 3780 1466 3636 1524 3780 1581 3665 1639 + 3722 1697 3722 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3771 1350 3915 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 3857 1350 3800 1408 3915 1466 3771 1524 3915 1581 3800 1639 + 3857 1697 3857 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3906 1350 4050 1755 +3 0 0 1 0 7 
50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 3992 1350 3935 1408 4050 1466 3906 1524 4050 1581 3935 1639 + 3992 1697 3992 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 4041 1350 4185 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 4127 1350 4070 1408 4185 1466 4041 1524 4185 1581 4070 1639 + 4127 1697 4127 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 4176 1350 4320 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 4262 1350 4205 1408 4320 1466 4176 1524 4320 1581 4205 1639 + 4262 1697 4262 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 2970 1350 3114 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 3056 1350 2999 1408 3114 1466 2970 1524 3114 1581 2999 1639 + 3056 1697 3056 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 2835 1350 2979 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 2921 1350 2864 1408 2979 1466 2835 1524 2979 1581 2864 1639 + 2921 1697 2921 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 2700 1350 2844 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 2786 1350 2729 1408 2844 1466 2700 1524 2844 1581 2729 1639 + 2786 1697 2786 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +-6 +6 5616 1350 5760 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 5702 1350 5645 1408 5760 1466 5616 1524 5760 1581 5645 1639 + 5702 1697 5702 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 5751 1350 5895 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 5837 1350 5780 1408 5895 1466 5751 1524 5895 1581 5780 1639 + 5837 1697 5837 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 5886 1350 6030 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 5972 1350 5915 1408 6030 1466 5886 1524 6030 1581 5915 1639 + 5972 1697 5972 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6021 1350 6165 1755 +3 0 0 1 0 7 50 -1 -1 0.000 
0 1 0 8 + 1 1 1.00 38.49 48.11 + 6107 1350 6050 1408 6165 1466 6021 1524 6165 1581 6050 1639 + 6107 1697 6107 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6156 1350 6300 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 6242 1350 6185 1408 6300 1466 6156 1524 6300 1581 6185 1639 + 6242 1697 6242 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6291 1350 6435 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 6377 1350 6320 1408 6435 1466 6291 1524 6435 1581 6320 1639 + 6377 1697 6377 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6426 1350 6570 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 6512 1350 6455 1408 6570 1466 6426 1524 6570 1581 6455 1639 + 6512 1697 6512 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 5490 1350 5634 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 5576 1350 5519 1408 5634 1466 5490 1524 5634 1581 5519 1639 + 5576 1697 5576 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 5355 1350 5499 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 5441 1350 5384 1408 5499 1466 5355 1524 5499 1581 5384 1639 + 5441 1697 5441 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 5220 1350 5364 1755 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 5306 1350 5249 1408 5364 1466 5220 1524 5364 1581 5249 1639 + 5306 1697 5306 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6615 1350 6795 1755 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 39.00 48.00 + 6701 1350 6644 1408 6759 1466 6615 1524 6759 1581 6644 1639 + 6701 1697 6701 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6750 1350 6930 1755 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 39.00 48.00 + 6836 1350 6779 1408 6894 1466 6750 1524 6894 1581 6779 1639 + 6836 1697 6836 1755 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 810 2835 1755 3240 +6 810 2835 954 3240 +3 0 0 1 0 7 50 -1 -1 
0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 896 2835 839 2893 954 2951 810 3009 954 3066 839 3124 + 896 3182 896 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 945 2835 1089 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 1031 2835 974 2893 1089 2951 945 3009 1089 3066 974 3124 + 1031 3182 1031 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1575 2835 1755 3240 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 39.00 48.00 + 1661 2835 1604 2893 1719 2951 1575 3009 1719 3066 1604 3124 + 1661 3182 1661 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1071 2835 1215 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 1157 2835 1100 2893 1215 2951 1071 3009 1215 3066 1100 3124 + 1157 3182 1157 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1206 2835 1350 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 1292 2835 1235 2893 1350 2951 1206 3009 1350 3066 1235 3124 + 1292 3182 1292 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1341 2835 1485 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 1427 2835 1370 2893 1485 2951 1341 3009 1485 3066 1370 3124 + 1427 3182 1427 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +-6 +6 2835 2835 3780 3240 +6 2835 2835 2979 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 2921 2835 2864 2893 2979 2951 2835 3009 2979 3066 2864 3124 + 2921 3182 2921 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 2970 2835 3114 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 3056 2835 2999 2893 3114 2951 2970 3009 3114 3066 2999 3124 + 3056 3182 3056 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3600 2835 3780 3240 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 39.00 48.00 + 3686 2835 3629 2893 3744 2951 3600 3009 3744 3066 3629 3124 + 3686 3182 3686 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3096 2835 3240 3240 +3 0 0 1 0 7 50 -1 -1 
0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 3182 2835 3125 2893 3240 2951 3096 3009 3240 3066 3125 3124 + 3182 3182 3182 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3231 2835 3375 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 3317 2835 3260 2893 3375 2951 3231 3009 3375 3066 3260 3124 + 3317 3182 3317 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3366 2835 3510 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 3452 2835 3395 2893 3510 2951 3366 3009 3510 3066 3395 3124 + 3452 3182 3452 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +-6 +6 3870 2835 4815 3240 +6 3870 2835 4014 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 3956 2835 3899 2893 4014 2951 3870 3009 4014 3066 3899 3124 + 3956 3182 3956 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 4005 2835 4149 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 4091 2835 4034 2893 4149 2951 4005 3009 4149 3066 4034 3124 + 4091 3182 4091 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 4635 2835 4815 3240 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 39.00 48.00 + 4721 2835 4664 2893 4779 2951 4635 3009 4779 3066 4664 3124 + 4721 3182 4721 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 4131 2835 4275 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 4217 2835 4160 2893 4275 2951 4131 3009 4275 3066 4160 3124 + 4217 3182 4217 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 4266 2835 4410 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 4352 2835 4295 2893 4410 2951 4266 3009 4410 3066 4295 3124 + 4352 3182 4352 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 4401 2835 4545 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 4487 2835 4430 2893 4545 2951 4401 3009 4545 3066 4430 3124 + 4487 3182 4487 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +-6 +6 5895 2835 6840 3240 +6 5895 
2835 6039 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 5981 2835 5924 2893 6039 2951 5895 3009 6039 3066 5924 3124 + 5981 3182 5981 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6030 2835 6174 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 6116 2835 6059 2893 6174 2951 6030 3009 6174 3066 6059 3124 + 6116 3182 6116 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6660 2835 6840 3240 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 39.00 48.00 + 6746 2835 6689 2893 6804 2951 6660 3009 6804 3066 6689 3124 + 6746 3182 6746 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6156 2835 6300 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 6242 2835 6185 2893 6300 2951 6156 3009 6300 3066 6185 3124 + 6242 3182 6242 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6291 2835 6435 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 6377 2835 6320 2893 6435 2951 6291 3009 6435 3066 6320 3124 + 6377 3182 6377 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6426 2835 6570 3240 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 38.49 48.11 + 6512 2835 6455 2893 6570 2951 6426 3009 6570 3066 6455 3124 + 6512 3182 6512 3240 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +-6 +6 2385 4365 4005 4770 +6 2798 4365 3214 4770 +6 2914 4365 3048 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 2994 4365 2941 4423 3048 4481 2914 4539 3048 4596 2941 4654 + 2994 4712 2994 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 2798 4365 2931 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 2877 4365 2825 4423 2931 4481 2798 4539 2931 4596 2825 4654 + 2877 4712 2877 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 3131 4365 3078 4423 3185 4481 3051 4539 3185 4596 3078 4654 + 3131 4712 3131 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 
+-6 +6 3214 4365 3630 4770 +6 3331 4365 3464 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 3410 4365 3358 4423 3464 4481 3331 4539 3464 4596 3358 4654 + 3410 4712 3410 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3214 4365 3347 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 3294 4365 3241 4423 3347 4481 3214 4539 3347 4596 3241 4654 + 3294 4712 3294 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 3547 4365 3494 4423 3601 4481 3468 4539 3601 4596 3494 4654 + 3547 4712 3547 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3589 4365 4005 4770 +6 3705 4365 3839 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 3785 4365 3732 4423 3839 4481 3705 4539 3839 4596 3732 4654 + 3785 4712 3785 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3589 4365 3722 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 3668 4365 3616 4423 3722 4481 3589 4539 3722 4596 3616 4654 + 3668 4712 3668 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 3922 4365 3869 4423 3975 4481 3842 4539 3975 4596 3869 4654 + 3922 4712 3922 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 2423 4365 2840 4770 +6 2540 4365 2673 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 2619 4365 2567 4423 2673 4481 2540 4539 2673 4596 2567 4654 + 2619 4712 2619 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 2423 4365 2556 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 2503 4365 2450 4423 2556 4481 2423 4539 2556 4596 2450 4654 + 2503 4712 2503 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 2756 4365 2704 4423 2810 4481 2677 4539 2810 4596 2704 4654 + 2756 4712 2756 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +-6 +6 5310 
4365 6930 4770 +6 5723 4365 6139 4770 +6 5839 4365 5973 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 5919 4365 5866 4423 5973 4481 5839 4539 5973 4596 5866 4654 + 5919 4712 5919 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 5723 4365 5856 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 5802 4365 5750 4423 5856 4481 5723 4539 5856 4596 5750 4654 + 5802 4712 5802 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 6056 4365 6003 4423 6110 4481 5976 4539 6110 4596 6003 4654 + 6056 4712 6056 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6139 4365 6555 4770 +6 6256 4365 6389 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 6335 4365 6283 4423 6389 4481 6256 4539 6389 4596 6283 4654 + 6335 4712 6335 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6139 4365 6272 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 6219 4365 6166 4423 6272 4481 6139 4539 6272 4596 6166 4654 + 6219 4712 6219 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 6472 4365 6419 4423 6526 4481 6393 4539 6526 4596 6419 4654 + 6472 4712 6472 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6514 4365 6930 4770 +6 6630 4365 6764 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 6710 4365 6657 4423 6764 4481 6630 4539 6764 4596 6657 4654 + 6710 4712 6710 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6514 4365 6647 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 6593 4365 6541 4423 6647 4481 6514 4539 6647 4596 6541 4654 + 6593 4712 6593 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 6847 4365 6794 4423 6900 4481 6767 4539 6900 4596 6794 4654 + 6847 4712 6847 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 
+6 5348 4365 5765 4770 +6 5465 4365 5598 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 5544 4365 5492 4423 5598 4481 5465 4539 5598 4596 5492 4654 + 5544 4712 5544 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 5348 4365 5481 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 5428 4365 5375 4423 5481 4481 5348 4539 5481 4596 5375 4654 + 5428 4712 5428 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 5681 4365 5629 4423 5735 4481 5602 4539 5735 4596 5629 4654 + 5681 4712 5681 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +-6 +6 765 4365 2385 4770 +6 1178 4365 1594 4770 +6 1294 4365 1428 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 1374 4365 1321 4423 1428 4481 1294 4539 1428 4596 1321 4654 + 1374 4712 1374 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1178 4365 1311 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 1257 4365 1205 4423 1311 4481 1178 4539 1311 4596 1205 4654 + 1257 4712 1257 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 1511 4365 1458 4423 1565 4481 1431 4539 1565 4596 1458 4654 + 1511 4712 1511 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1594 4365 2010 4770 +6 1711 4365 1844 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 1790 4365 1738 4423 1844 4481 1711 4539 1844 4596 1738 4654 + 1790 4712 1790 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1594 4365 1727 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 1674 4365 1621 4423 1727 4481 1594 4539 1727 4596 1621 4654 + 1674 4712 1674 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 1927 4365 1874 4423 1981 4481 1848 4539 1981 4596 1874 4654 + 1927 4712 1927 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 
0.000 +-6 +6 1969 4365 2385 4770 +6 2085 4365 2219 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 2165 4365 2112 4423 2219 4481 2085 4539 2219 4596 2112 4654 + 2165 4712 2165 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1969 4365 2102 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 2048 4365 1996 4423 2102 4481 1969 4539 2102 4596 1996 4654 + 2048 4712 2048 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 2302 4365 2249 4423 2355 4481 2222 4539 2355 4596 2249 4654 + 2302 4712 2302 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 803 4365 1220 4770 +6 920 4365 1053 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 999 4365 947 4423 1053 4481 920 4539 1053 4596 947 4654 + 999 4712 999 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 803 4365 936 4770 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 883 4365 830 4423 936 4481 803 4539 936 4596 830 4654 + 883 4712 883 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 1136 4365 1084 4423 1190 4481 1057 4539 1190 4596 1084 4654 + 1136 4712 1136 4770 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +-6 +6 5310 5535 6930 5940 +6 5723 5535 6139 5940 +6 5839 5535 5973 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 5919 5535 5866 5593 5973 5651 5839 5709 5973 5766 5866 5824 + 5919 5882 5919 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 5723 5535 5856 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 5802 5535 5750 5593 5856 5651 5723 5709 5856 5766 5750 5824 + 5802 5882 5802 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 6056 5535 6003 5593 6110 5651 5976 5709 6110 5766 6003 5824 + 6056 5882 6056 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 
+-6 +6 6139 5535 6555 5940 +6 6256 5535 6389 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 6335 5535 6283 5593 6389 5651 6256 5709 6389 5766 6283 5824 + 6335 5882 6335 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6139 5535 6272 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 6219 5535 6166 5593 6272 5651 6139 5709 6272 5766 6166 5824 + 6219 5882 6219 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 6472 5535 6419 5593 6526 5651 6393 5709 6526 5766 6419 5824 + 6472 5882 6472 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6514 5535 6930 5940 +6 6630 5535 6764 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 6710 5535 6657 5593 6764 5651 6630 5709 6764 5766 6657 5824 + 6710 5882 6710 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 6514 5535 6647 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 6593 5535 6541 5593 6647 5651 6514 5709 6647 5766 6541 5824 + 6593 5882 6593 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 6847 5535 6794 5593 6900 5651 6767 5709 6900 5766 6794 5824 + 6847 5882 6847 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 5348 5535 5765 5940 +6 5465 5535 5598 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 5544 5535 5492 5593 5598 5651 5465 5709 5598 5766 5492 5824 + 5544 5882 5544 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 5348 5535 5481 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 5428 5535 5375 5593 5481 5651 5348 5709 5481 5766 5375 5824 + 5428 5882 5428 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 5681 5535 5629 5593 5735 5651 5602 5709 5735 5766 5629 5824 + 5681 5882 5681 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +-6 +6 2385 
5535 4005 5940 +6 2798 5535 3214 5940 +6 2914 5535 3048 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 2994 5535 2941 5593 3048 5651 2914 5709 3048 5766 2941 5824 + 2994 5882 2994 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 2798 5535 2931 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 2877 5535 2825 5593 2931 5651 2798 5709 2931 5766 2825 5824 + 2877 5882 2877 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 3131 5535 3078 5593 3185 5651 3051 5709 3185 5766 3078 5824 + 3131 5882 3131 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3214 5535 3630 5940 +6 3331 5535 3464 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 3410 5535 3358 5593 3464 5651 3331 5709 3464 5766 3358 5824 + 3410 5882 3410 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3214 5535 3347 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 3294 5535 3241 5593 3347 5651 3214 5709 3347 5766 3241 5824 + 3294 5882 3294 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 3547 5535 3494 5593 3601 5651 3468 5709 3601 5766 3494 5824 + 3547 5882 3547 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3589 5535 4005 5940 +6 3705 5535 3839 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 3785 5535 3732 5593 3839 5651 3705 5709 3839 5766 3732 5824 + 3785 5882 3785 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 3589 5535 3722 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 3668 5535 3616 5593 3722 5651 3589 5709 3722 5766 3616 5824 + 3668 5882 3668 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 3922 5535 3869 5593 3975 5651 3842 5709 3975 5766 3869 5824 + 3922 5882 3922 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 
+6 2423 5535 2840 5940 +6 2540 5535 2673 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 2619 5535 2567 5593 2673 5651 2540 5709 2673 5766 2567 5824 + 2619 5882 2619 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 2423 5535 2556 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 2503 5535 2450 5593 2556 5651 2423 5709 2556 5766 2450 5824 + 2503 5882 2503 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 2756 5535 2704 5593 2810 5651 2677 5709 2810 5766 2704 5824 + 2756 5882 2756 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +-6 +6 765 5535 2385 5940 +6 1178 5535 1594 5940 +6 1294 5535 1428 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 1374 5535 1321 5593 1428 5651 1294 5709 1428 5766 1321 5824 + 1374 5882 1374 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1178 5535 1311 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 1257 5535 1205 5593 1311 5651 1178 5709 1311 5766 1205 5824 + 1257 5882 1257 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 1511 5535 1458 5593 1565 5651 1431 5709 1565 5766 1458 5824 + 1511 5882 1511 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1594 5535 2010 5940 +6 1711 5535 1844 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 1790 5535 1738 5593 1844 5651 1711 5709 1844 5766 1738 5824 + 1790 5882 1790 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1594 5535 1727 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 1674 5535 1621 5593 1727 5651 1594 5709 1727 5766 1621 5824 + 1674 5882 1674 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 1927 5535 1874 5593 1981 5651 1848 5709 1981 5766 1874 5824 + 1927 5882 1927 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 
0.000 +-6 +6 1969 5535 2385 5940 +6 2085 5535 2219 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 2165 5535 2112 5593 2219 5651 2085 5709 2219 5766 2112 5824 + 2165 5882 2165 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 1969 5535 2102 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 2048 5535 1996 5593 2102 5651 1969 5709 2102 5766 1996 5824 + 2048 5882 2048 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 2302 5535 2249 5593 2355 5651 2222 5709 2355 5766 2249 5824 + 2302 5882 2302 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 803 5535 1220 5940 +6 920 5535 1053 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 999 5535 947 5593 1053 5651 920 5709 1053 5766 947 5824 + 999 5882 999 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +6 803 5535 936 5940 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.05 46.31 + 883 5535 830 5593 936 5651 803 5709 936 5766 830 5824 + 883 5882 883 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +3 0 0 1 52 7 50 -1 -1 0.000 0 1 0 8 + 1 1 1.00 37.54 46.20 + 1136 5535 1084 5593 1190 5651 1057 5709 1190 5766 1084 5824 + 1136 5882 1136 5940 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +-6 +-6 +2 1 1 2 0 0 50 -1 -1 6.000 0 0 -1 0 0 2 + 4860 2790 5895 2790 +2 1 1 2 0 0 50 -1 -1 6.000 0 0 -1 0 0 2 + 4860 3330 5895 3330 +2 1 1 2 0 0 50 -1 -1 6.000 0 0 -1 0 0 2 + 1800 2790 2835 2790 +2 1 0 1 0 0 50 -1 -1 0.000 0 0 -1 0 0 2 + 765 1890 765 2745 +2 1 0 1 0 0 50 -1 -1 0.000 0 0 -1 0 0 4 + 6930 1890 6930 2070 1800 2565 1800 2745 +2 1 1 2 0 0 50 -1 -1 6.000 0 0 -1 0 0 2 + 4455 1260 5130 1260 +2 1 0 1 0 0 50 -1 -1 0.000 0 0 -1 0 0 2 + 765 3420 765 4275 +2 1 0 1 0 0 50 -1 -1 0.000 0 0 -1 0 0 4 + 6885 3375 6885 3600 2385 4095 2385 4275 +2 1 1 2 0 0 50 -1 -1 6.000 0 0 -1 0 0 2 + 4455 1800 5130 1800 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 6615 1260 6930 1260 6930 1800 6615 
1800 6615 1260 +2 1 1 2 0 0 50 -1 8 6.000 0 0 -1 0 0 2 + 1800 3330 2835 3330 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 2835 2790 3825 2790 3825 3330 2835 3330 2835 2790 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 3825 2790 4860 2790 4860 3330 3825 3330 3825 2790 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 5895 2790 6885 2790 6885 3330 5895 3330 5895 2790 +2 2 0 2 0 -1 50 -1 -1 0.000 0 0 -1 0 0 5 + 5323 4320 6930 4320 6930 4860 5323 4860 5323 4320 +2 1 1 2 0 0 50 -1 -1 6.000 0 0 -1 0 0 2 + 4050 4320 5310 4320 +2 1 1 2 0 0 50 -1 -1 6.000 0 0 -1 0 0 2 + 4050 4860 5310 4860 +2 2 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 765 2790 1800 2790 1800 3330 765 3330 765 2790 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 1530 2790 1530 3330 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 3555 2790 3555 3330 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 4590 2790 4590 3330 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 6615 2790 6615 3330 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 778 1260 2565 1260 2565 1800 778 1800 778 1260 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 2623 1260 4410 1260 4410 1800 2623 1800 2623 1260 +2 2 0 2 0 0 50 -1 -1 0.000 0 0 -1 0 0 5 + 5143 1260 6615 1260 6615 1800 5143 1800 5143 1260 +2 2 0 2 0 -1 50 -1 -1 0.000 0 0 -1 0 0 5 + 5323 5490 6930 5490 6930 6030 5323 6030 5323 5490 +2 2 0 2 0 -1 50 -1 -1 0.000 0 0 -1 0 0 5 + 778 5490 2385 5490 2385 6030 778 6030 778 5490 +2 2 0 2 0 -1 50 -1 -1 0.000 0 0 -1 0 0 5 + 2398 5490 4005 5490 4005 6030 2398 6030 2398 5490 +2 2 0 2 0 -1 50 -1 -1 0.000 0 0 -1 0 0 5 + 2398 5490 4005 5490 4005 6030 2398 6030 2398 5490 +2 1 1 2 0 0 50 -1 -1 6.000 0 0 -1 0 0 2 + 4050 5490 5310 5490 +2 1 1 2 0 0 50 -1 -1 6.000 0 0 -1 0 0 2 + 4050 6030 5310 6030 +2 2 0 2 0 -1 50 -1 -1 0.000 0 0 -1 0 0 5 + 2398 4320 4005 4320 4005 4860 2398 4860 2398 4320 +2 2 0 2 0 -1 50 -1 -1 0.000 0 0 -1 0 0 5 + 2385 4320 4005 4320 4005 4860 2385 4860 2385 4320 +2 2 0 2 0 -1 50 -1 -1 0.000 0 0 -1 0 0 5 + 765 4320 2376 4320 2376 4860 765 4860 765 4320 +4 0 0 50 -1 4 12 0.0000 2 
195 4335 810 2160 $N_{TB}$ blocks of $bs$ threads for one segment\001 +4 2 -1 50 -1 4 12 0.0000 0 150 630 2565 1980 block 0\001 +4 2 -1 50 -1 4 12 0.0000 2 195 1515 6930 1980 block $N_{TB}-1$\001 +4 2 -1 50 -1 4 12 0.0000 0 150 630 4410 1980 block 1\001 +4 0 0 50 -1 4 12 0.0000 0 195 2625 810 3780 16 segments around one node\001 +4 2 0 50 -1 4 12 0.0000 2 195 2730 1800 3555 $\\overrightarrow{P_{i-1}T_{i,0}}$\001 +4 2 0 50 -1 4 12 0.0000 2 195 2730 3825 3555 $\\overrightarrow{P_{i-1}T_{i,7}}$\001 +4 2 0 50 -1 4 12 0.0000 2 195 2790 4860 3555 $\\overrightarrow{T_{i,0}P_{i+1}}$\001 +4 2 0 50 -1 4 12 0.0000 2 195 2790 6885 3555 $\\overrightarrow{T_{i,7}P_{i+1}}$\001 +4 2 0 50 -1 4 12 0.0000 2 195 2280 6930 5085 $P_{2(N_n/2+N_n\\%2-1)}$\001 +4 2 0 50 -1 4 12 0.0000 2 195 675 4005 5085 $P_{2}$\001 +4 2 0 50 -1 4 12 0.0000 2 195 675 2385 5085 $P_{0}$\001 +4 1 0 50 -1 4 12 0.0000 2 180 2715 4005 5310 $(N_n/2+N_n\\%2)$ even nodes\001 +4 2 0 50 -1 4 12 0.0000 2 195 675 2385 6255 $P_{1}$\001 +4 2 0 50 -1 4 12 0.0000 2 195 675 4005 6255 $P_{3}$\001 +4 2 0 50 -1 4 12 0.0000 2 195 1695 6930 6255 $P_{2(N_n/2-1)+1}$\001 +4 1 0 50 -1 4 12 0.0000 2 180 1800 4005 6480 $(N_n/2)$ odd nodes\001 diff --git a/doc/img/contribs_segments.pdf b/doc/img/contribs_segments.pdf new file mode 100644 index 0000000..5eefe73 Binary files /dev/null and b/doc/img/contribs_segments.pdf differ diff --git a/doc/img/contribs_segments.pdf_t b/doc/img/contribs_segments.pdf_t new file mode 100644 index 0000000..eeda410 --- /dev/null +++ b/doc/img/contribs_segments.pdf_t @@ -0,0 +1,40 @@ +\begin{picture}(0,0)% +\includegraphics{contribs_segments.pdf}% +\end{picture}% +\setlength{\unitlength}{4144sp}% +% +\begingroup\makeatletter\ifx\SetFigFont\undefined% +\gdef\SetFigFont#1#2#3#4#5{% + \reset@font\fontsize{#1}{#2pt}% + \fontfamily{#3}\fontseries{#4}\fontshape{#5}% + \selectfont}% +\fi\endgroup% +\begin{picture}(6209,5311)(744,-5710) 
+\put(811,-1321){\makebox(0,0)[lb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$N_{TB}$ blocks of $bs$ threads for one segment}% +}}}} +\put(6931,-1141){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}block $N_{TB}-1$}}}} +\put(1801,-2716){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\overrightarrow{P_{i-1}T_{i,0}}$}% +}}}} +\put(3826,-2716){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\overrightarrow{P_{i-1}T_{i,7}}$}% +}}}} +\put(4861,-2716){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\overrightarrow{T_{i,0}P_{i+1}}$}% +}}}} +\put(6886,-2716){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$\overrightarrow{T_{i,7}P_{i+1}}$}% +}}}} +\put(6931,-4246){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$P_{2(N_n/2+N_n\%2-1)}$}% +}}}} +\put(4006,-4246){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$P_{2}$}% +}}}} +\put(2386,-4246){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$P_{0}$}% +}}}} +\put(4006,-4471){\makebox(0,0)[b]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$(N_n/2+N_n\%2)$ even nodes}% +}}}} +\put(2386,-5416){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$P_{1}$}% +}}}} +\put(4006,-5416){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$P_{3}$}% +}}}} +\put(6931,-5416){\makebox(0,0)[rb]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$P_{2(N_n/2-1)+1}$}% +}}}} 
+\put(4006,-5641){\makebox(0,0)[b]{\smash{{\SetFigFont{12}{14.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}$(N_n/2)$ odd nodes}% +}}}} +\end{picture}% diff --git a/doc/img/data_profile_cpu.png b/doc/img/data_profile_cpu.png new file mode 100644 index 0000000..d3271ed Binary files /dev/null and b/doc/img/data_profile_cpu.png differ diff --git a/doc/img/data_profile_cpu.txt b/doc/img/data_profile_cpu.txt new file mode 100644 index 0000000..a7c13b5 --- /dev/null +++ b/doc/img/data_profile_cpu.txt @@ -0,0 +1,18 @@ +# graph du profiling de la version CPU +=cluster;compute_segment_contribution();compute_cumulated_images();compute_pixels_coordinates() +# green instead of gray since not planning on printing this +colors=black,yellow,red +=table +yformat=%g%% +max=100 +font=Helvetica Bold +=norotate +ylabel=relative time costs +# stretch it out in x direction +#extraops=set size 1,1 + +# seg cumuls pixels +15MPix 46 26 6 +50MPix 50 25 6 +100MPix 53 23 7 + diff --git a/doc/img/detail_segments.fig b/doc/img/detail_segments.fig new file mode 100644 index 0000000..4eba135 --- /dev/null +++ b/doc/img/detail_segments.fig @@ -0,0 +1,35 @@ +#FIG 3.2 Produced by xfig version 3.2.5b +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +2 5 0 1 0 -1 50 -1 -1 0.000 0 0 -1 0 0 5 + 0 cochon_petit_detail_16segments.jpg + 945 765 6120 765 6120 5310 945 5310 945 765 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 150.00 180.00 + 6210 3780 2610 3195 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 150.00 180.00 + 6165 3285 3915 1665 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 150.00 180.00 + 6165 4275 2790 4320 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 150.00 180.00 + 6210 1800 5040 1575 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 150.00 180.00 + 6210 1035 2880 1530 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 1 + 2835 1215 +4 0 0 50 -1 4 18 0.0000 2 270 1845 6255 1845 node $P_{i-1}$\001 +4 0 0 50 -1 4 18 0.0000 2 270 1905 6255 4320 node 
$P_{i+1}$\001 +4 0 0 50 -1 4 18 0.0000 2 270 3795 6255 3330 segment with 1 thread per row\001 +4 0 0 50 -1 4 18 0.0000 2 270 4245 6255 3825 segment with 1 thread per column\001 +4 0 0 50 -1 4 18 0.0000 2 270 4350 6255 1035 8 test positions around node $P_i$\001 +4 0 0 50 -1 4 18 0.0000 2 270 5520 6255 1305 from $T_{i,0}$ to $T_{i,7}$ counterclockwise\001 diff --git a/doc/img/detail_segments.pdf b/doc/img/detail_segments.pdf new file mode 100644 index 0000000..7e9f6c9 Binary files /dev/null and b/doc/img/detail_segments.pdf differ diff --git a/doc/img/detail_segments.pdf_t b/doc/img/detail_segments.pdf_t new file mode 100644 index 0000000..1266683 --- /dev/null +++ b/doc/img/detail_segments.pdf_t @@ -0,0 +1,25 @@ +\begin{picture}(0,0)% +\includegraphics{detail_segments.pdf}% +\end{picture}% +\setlength{\unitlength}{4144sp}% +% +\begingroup\makeatletter\ifx\SetFigFont\undefined% +\gdef\SetFigFont#1#2#3#4#5{% + \reset@font\fontsize{#1}{#2pt}% + \fontfamily{#3}\fontseries{#4}\fontshape{#5}% + \selectfont}% +\fi\endgroup% +\begin{picture}(9678,4545)(946,-4471) +\put(6256,-1006){\makebox(0,0)[lb]{\smash{{\SetFigFont{17}{20.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}node $P_{i-1}$}% +}}}} +\put(6256,-3481){\makebox(0,0)[lb]{\smash{{\SetFigFont{17}{20.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}node $P_{i+1}$}% +}}}} +\put(6256,-2491){\makebox(0,0)[lb]{\smash{{\SetFigFont{17}{20.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}segment with 1 thread per row}% +}}}} +\put(6256,-2986){\makebox(0,0)[lb]{\smash{{\SetFigFont{17}{20.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}segment with 1 thread per column}% +}}}} +\put(6256,-196){\makebox(0,0)[lb]{\smash{{\SetFigFont{17}{20.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}8 test positions around node $P_i$}% +}}}} +\put(6256,-466){\makebox(0,0)[lb]{\smash{{\SetFigFont{17}{20.4}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{0,0,0}from $T_{i,0}$ to $T_{i,7}$ 
counterclockwise}% +}}}} +\end{picture}% diff --git a/doc/img/topologie.fig b/doc/img/topologie.fig new file mode 100644 index 0000000..4f0f9c1 --- /dev/null +++ b/doc/img/topologie.fig @@ -0,0 +1,34 @@ +#FIG 3.2 Produced by xfig version 3.2.5b +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +2 5 0 1 0 -1 50 -1 -1 0.000 0 0 -1 0 0 5 + 0 cochon_positions.png + 3375 2835 11947 2835 11947 10907 3375 10907 3375 2835 +2 1 0 1 7 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 0 1.00 90.00 150.00 + 10350 4095 10980 3780 +2 1 0 1 7 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 0 1.00 90.00 150.00 + 4815 10620 4320 10260 +2 1 0 1 7 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 0 1.00 90.00 150.00 + 9135 10530 9900 9945 +2 1 0 1 7 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 0 1.00 90.00 150.00 + 10395 7965 11436 8738 +3 0 0 1 7 7 50 -1 -1 0.000 0 1 0 2 + 3 0 1.00 90.00 150.00 + 5085 3150 4590 3555 + 0.000 0.000 +4 0 7 50 -1 4 14 0.0000 2 225 3810 5220 3060 8 tests positions around node $P_i$\001 +4 0 7 50 -1 4 14 0.0000 2 225 4665 5220 3300 from $T_{i,0}$ to $T_{i,7}$ counterclockwise\001 +4 0 7 50 -1 4 12 0.0000 0 195 2610 8640 7875 segment with 1 thread per row\001 +4 0 7 50 -1 4 12 0.0000 0 195 2940 7335 10710 segment with 1 thread per column\001 +4 2 7 50 -1 4 14 0.0000 2 225 1590 10305 4320 node $P_{i-1}$\001 +4 0 7 50 -1 4 14 0.0000 2 225 1650 4860 10710 node $P_{i+1}$\001 diff --git a/doc/img/topologie.pdf b/doc/img/topologie.pdf new file mode 100644 index 0000000..762db25 Binary files /dev/null and b/doc/img/topologie.pdf differ diff --git a/doc/img/topologie.pdf_t b/doc/img/topologie.pdf_t new file mode 100644 index 0000000..aa48a6a --- /dev/null +++ b/doc/img/topologie.pdf_t @@ -0,0 +1,21 @@ +\begin{picture}(0,0)% +\includegraphics{topologie.pdf}% +\end{picture}% +\setlength{\unitlength}{4144sp}% +% +\begingroup\makeatletter\ifx\SetFigFont\undefined% +\gdef\SetFigFont#1#2#3#4#5{% + \reset@font\fontsize{#1}{#2pt}% + \fontfamily{#3}\fontseries{#4}\fontshape{#5}% + \selectfont}% +\fi\endgroup% 
+\begin{picture}(8572,8072)(3376,-10068) +\put(5221,-2221){\makebox(0,0)[lb]{\smash{{\SetFigFont{14}{16.8}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{1,1,1}8 tests positions around node $P_i$}% +}}}} +\put(5221,-2461){\makebox(0,0)[lb]{\smash{{\SetFigFont{14}{16.8}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{1,1,1}from $T_{i,0}$ to $T_{i,7}$ counterclockwise}% +}}}} +\put(10306,-3481){\makebox(0,0)[rb]{\smash{{\SetFigFont{14}{16.8}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{1,1,1}node $P_{i-1}$}% +}}}} +\put(4861,-9871){\makebox(0,0)[lb]{\smash{{\SetFigFont{14}{16.8}{\sfdefault}{\mddefault}{\updefault}{\color[rgb]{1,1,1}node $P_{i+1}$}% +}}}} +\end{picture}% diff --git a/doc/img/tripix.fig b/doc/img/tripix.fig new file mode 100644 index 0000000..1fb6710 --- /dev/null +++ b/doc/img/tripix.fig @@ -0,0 +1,112 @@ +#FIG 3.2 Produced by xfig version 3.2.5b +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +6 5175 4275 6525 4725 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 5175 4275 5625 4275 5625 4725 5175 4725 5175 4275 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 5625 4275 6075 4275 6075 4725 5625 4725 5625 4275 +2 2 0 1 0 0 50 -1 12 0.000 0 0 -1 0 0 5 + 6075 4275 6525 4275 6525 4725 6075 4725 6075 4275 +-6 +6 6525 4725 7875 5175 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 6525 4725 6975 4725 6975 5175 6525 5175 6525 4725 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 6975 4725 7425 4725 7425 5175 6975 5175 6975 4725 +2 2 0 1 0 0 50 -1 12 0.000 0 0 -1 0 0 5 + 7425 4725 7875 4725 7875 5175 7425 5175 7425 4725 +-6 +6 7875 4275 8775 4725 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 8775 4275 8325 4275 8325 4725 8775 4725 8775 4275 +2 2 0 1 0 0 50 -1 12 0.000 0 0 -1 0 0 5 + 8325 4275 7875 4275 7875 4725 8325 4725 8325 4275 +-6 +6 8775 3825 9675 4275 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 9675 3825 9225 3825 9225 4275 9675 4275 9675 3825 +2 2 0 1 0 0 50 -1 12 0.000 0 0 -1 0 0 5 + 9225 3825 8775 3825 8775 4275 9225 4275 9225 3825 +-6 +6 9675 3375 10575 3825 
+2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 10575 3375 10125 3375 10125 3825 10575 3825 10575 3375 +2 2 0 1 0 0 50 -1 12 0.000 0 0 -1 0 0 5 + 10125 3375 9675 3375 9675 3825 10125 3825 10125 3375 +-6 +6 10575 2925 11475 3375 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 11475 2925 11025 2925 11025 3375 11475 3375 11475 2925 +2 2 0 1 0 0 50 -1 12 0.000 0 0 -1 0 0 5 + 11025 2925 10575 2925 10575 3375 11025 3375 11025 2925 +-6 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 1125 2925 1575 2925 1575 3375 1125 3375 1125 2925 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 1575 2925 2025 2925 2025 3375 1575 3375 1575 2925 +2 2 0 1 0 0 50 -1 12 0.000 0 0 -1 0 0 5 + 2025 2925 2475 2925 2475 3375 2025 3375 2025 2925 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2475 3375 2925 3375 2925 3825 2475 3825 2475 3375 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2925 3375 3375 3375 3375 3825 2925 3825 2925 3375 +2 2 0 1 0 0 50 -1 12 0.000 0 0 -1 0 0 5 + 3375 3375 3825 3375 3825 3825 3375 3825 3375 3375 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 3825 3825 4275 3825 4275 4275 3825 4275 3825 3825 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 4275 3825 4725 3825 4725 4275 4275 4275 4275 3825 +2 2 0 1 0 0 50 -1 12 0.000 0 0 -1 0 0 5 + 4725 3825 5175 3825 5175 4275 4725 4275 4725 3825 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 180.00 210.00 + 1800 3780 4860 4725 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 120.00 180.00 + 4275 3060 3780 3510 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 3 1 1.00 120.00 180.00 + 9225 3060 9720 3465 +2 2 1 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 5 + 3825 3375 9675 3375 9675 3825 3825 3825 3825 3375 +2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 4275 3825 4275 3375 +2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 4725 3825 4725 3375 +2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 5175 3825 5175 3375 +2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 5625 3825 5625 3375 +2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 6075 3825 6075 3375 +2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 6525 3825 6525 3375 +2 1 
1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 6975 3825 6975 3375 +2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7425 3825 7425 3375 +2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 7875 3825 7875 3375 +2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 8325 3825 8325 3375 +2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 8775 3825 8775 3375 +2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 + 9225 3825 9225 3375 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 1125 5625 1350 5625 1350 5850 1125 5850 1125 5625 +2 2 0 1 0 0 50 -1 12 0.000 0 0 -1 0 0 5 + 5850 5625 6075 5625 6075 5850 5850 5850 5850 5625 +4 0 0 50 -1 4 18 0.0000 0 210 2250 1125 4950 Outside the snake\001 +4 0 0 50 -1 4 18 0.0000 0 270 1725 8145 2925 high end pixel\001 +4 0 0 50 -1 4 18 0.0000 0 270 1635 4050 2925 low end pixel\001 +4 0 0 50 -1 4 18 0.0000 0 210 2025 5985 2520 Inside the snake\001 +4 0 0 50 -1 4 18 0.0000 0 270 3495 1440 5850 pixels with null contributions\001 +4 0 0 50 -1 4 18 0.0000 0 270 4050 6165 5850 pixels with non-null contributions\001 diff --git a/doc/img/tripix.pdf b/doc/img/tripix.pdf new file mode 100644 index 0000000..eb3abcd Binary files /dev/null and b/doc/img/tripix.pdf differ diff --git a/doc/img/tripix.pdf_t b/doc/img/tripix.pdf_t new file mode 100644 index 0000000..2c10360 --- /dev/null +++ b/doc/img/tripix.pdf_t @@ -0,0 +1,13 @@ +\begin{picture}(0,0)% +\includegraphics{tripix.pdf}% +\end{picture}% +\setlength{\unitlength}{4144sp}% +% +\begingroup\makeatletter\ifx\SetFigFont\undefined% +\gdef\SetFigFont#1#2#3#4#5{% + \reset@font\fontsize{#1}{#2pt}% + \fontfamily{#3}\fontseries{#4}\fontshape{#5}% + \selectfont}% +\fi\endgroup% +\begin{picture}(10377,3651)(1111,-5107) +\end{picture}% diff --git a/doc/snake_gpu.aux b/doc/snake_gpu.aux new file mode 100644 index 0000000..cab60de --- /dev/null +++ b/doc/snake_gpu.aux @@ -0,0 +1,99 @@ +\relax +\citation{KassWT88} +\citation{Ruch01} +\citation{XuP98} +\citation{ChesnaudRB99,AllainBG08} +\citation{Brunett} +\citation{Ruch01} +\citation{ChesnaudRB99} 
+\@writefile{toc}{\contentsline {section}{\numberline {I}Introduction}{1}} +\newlabel{secCPUalgooutlines}{{II}{1}} +\@writefile{toc}{\contentsline {section}{\numberline {II}Sequential algorithm : outlines}{1}} +\citation{ChesnaudRB99} +\newlabel{cpualgosimple}{{1}{2}} +\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Sequential algorithm : outlines}}{2}} +\newlabel{fig:labelinit}{{1a}{2}} +\newlabel{sub@fig:labelinit}{{(a)}{a}} +\newlabel{fig:labelit1}{{1b}{2}} +\newlabel{sub@fig:labelit1}{{(b)}{b}} +\newlabel{fig:labelit2}{{1c}{2}} +\newlabel{sub@fig:labelit2}{{(c)}{c}} +\newlabel{fig:labelit4}{{1d}{2}} +\newlabel{sub@fig:labelit4}{{(d)}{d}} +\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces segmentation of a noisy image}}{2}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Initial snake }}}{2}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {End of first iteration (4 nodes) }}}{2}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {End of second iteration (8 nodes)}}}{2}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {End of fourth iteration (29 nodes)}}}{2}} +\newlabel{images_algo}{{1}{2}} +\newlabel{secCPUalgodetails}{{III}{2}} +\@writefile{toc}{\contentsline {section}{\numberline {III}Sequential algorithm : details}{2}} +\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {III-A}}Criterion}{2}} +\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {III-B}}CPU implementation}{2}} +\newlabel{CPUcontrib_segments}{{2}{2}} +\newlabel{CPUbresenham}{{2}{2}} +\newlabel{CPUcontrib_pixels}{{2}{2}} +\citation{CUDAPG} +\citation{CUDAPG} +\citation{CUDAFT} +\citation{CUDAFC} +\@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces Sequential simplified algorithm}}{3}} +\newlabel{cpualgo}{{2}{3}} +\newlabel{cumuls}{{2}{3}} +\newlabel{debinit}{{3}{3}} 
+\newlabel{fininit}{{9}{3}} +\newlabel{loopnewnodes}{{10}{3}} +\newlabel{loopmovenodes}{{11}{3}} +\newlabel{kernelPLH}{{15}{3}} +\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces the three most-consumming functions for various image sizes}}{3}} +\newlabel{CPUprofile}{{2}{3}} +\newlabel{GPUgeneralites}{{IV}{3}} +\@writefile{toc}{\contentsline {section}{\numberline {IV}NVidia's GPU architecture}{3}} +\citation{ChesnaudRB99} +\citation{Harris07} +\citation{CUDAPG} +\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces schematic diagram of GPU's internal architecture}}{4}} +\newlabel{GPUC1060}{{3}{4}} +\newlabel{GPUimplementation}{{V}{4}} +\@writefile{toc}{\contentsline {section}{\numberline {V}GPU implementation}{4}} +\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {V-A}}Pre-computations}{4}} +\citation{CUDAFT} +\citation{CUDAPG} +\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces \texttt {compute\_blocks\_prefixes()} details.}}{5}} +\newlabel{GPUcumuls}{{4}{5}} +\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {V-B}}Segment contributions}{5}} +\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces \texttt {scan\_blocksums()} details.}}{6}} +\newlabel{GPUscansomblocs}{{5}{6}} +\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces \texttt {add\_sums2prefixes()} details.}}{6}} +\newlabel{GPUaddsoms2cumuls}{{6}{6}} +\@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces topology around nodes}}{6}} +\newlabel{GPUtopo}{{7}{6}} +\citation{AllainBG08} +\@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces structure for segments contributions computation. 
Gray symbols help to locate inactive threads as opposed to black ones that figure active threads.}}{7}} +\newlabel{contribs_segments}{{8}{7}} +\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {V-C}}Segments with a slope $k$ such as $|k|\leq 1$}{7}} +\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {V-D}}Parameters estimation}{7}} +\@writefile{lof}{\contentsline {figure}{\numberline {9}{\ignorespaces Zoom on part a of segment with $|k| < 1$, at pixel level.}}{7}} +\newlabel{tripix}{{9}{7}} +\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {V-E}}End of segmentation}{7}} +\newlabel{secSpeedups}{{VI}{7}} +\@writefile{toc}{\contentsline {section}{\numberline {VI}Speedups}{7}} +\bibstyle{IEEEtran} +\bibdata{IEEEabrv,biblio} +\bibcite{KassWT88}{1} +\bibcite{Ruch01}{2} +\bibcite{XuP98}{3} +\bibcite{ChesnaudRB99}{4} +\bibcite{AllainBG08}{5} +\bibcite{Brunett}{6} +\bibcite{CUDAPG}{7} +\bibcite{CUDAFT}{8} +\bibcite{CUDAFC}{9} +\bibcite{Harris07}{10} +\newlabel{gpualgosimple}{{3}{8}} +\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Parralel GPU algorithm : outlines. 
\texttt {<<<...>>>} indicates a GPU kernel parallel process.}}{8}} +\@writefile{lot}{\contentsline {table}{\numberline {I}{\ignorespaces GPU (C2050, sm20) vs CPU timings.}}{8}} +\newlabel{tabresults}{{I}{8}} +\newlabel{secConclusion}{{VII}{8}} +\@writefile{toc}{\contentsline {section}{\numberline {VII}Conclusion}{8}} +\@writefile{toc}{\contentsline {section}{References}{8}} diff --git a/doc/snake_gpu.bbl b/doc/snake_gpu.bbl new file mode 100644 index 0000000..92a7ea5 --- /dev/null +++ b/doc/snake_gpu.bbl @@ -0,0 +1,71 @@ +% Generated by IEEEtran.bst, version: 1.13 (2008/09/30) +\begin{thebibliography}{10} +\providecommand{\url}[1]{#1} +\csname url@samestyle\endcsname +\providecommand{\newblock}{\relax} +\providecommand{\bibinfo}[2]{#2} +\providecommand{\BIBentrySTDinterwordspacing}{\spaceskip=0pt\relax} +\providecommand{\BIBentryALTinterwordstretchfactor}{4} +\providecommand{\BIBentryALTinterwordspacing}{\spaceskip=\fontdimen2\font plus +\BIBentryALTinterwordstretchfactor\fontdimen3\font minus + \fontdimen4\font\relax} +\providecommand{\BIBforeignlanguage}[2]{{% +\expandafter\ifx\csname l@#1\endcsname\relax +\typeout{** WARNING: IEEEtran.bst: No hyphenation pattern has been}% +\typeout{** loaded for the language `#1'. Using the pattern for}% +\typeout{** the default language instead.}% +\else +\language=\csname l@#1\endcsname +\fi +#2}} +\providecommand{\BIBdecl}{\relax} +\BIBdecl + +\bibitem{KassWT88} +M.~Kass, A.~P. Witkin, and D.~Terzopoulos, ``Snakes: Active contour models,'' + \emph{International Journal of Computer Vision}, vol.~1, no.~4, pp. 321--331, + 1988. + +\bibitem{Ruch01} +O.~Ruch and P.~R{\'e}fr{\'e}gier, ``Minimal-complexity segmentation with a + polygonal snake adapted to different optical noise models,'' \emph{Optics + Letters}, vol.~26, no.~13, july 2001. + +\bibitem{XuP98} +C.~Xu and J.~L. Prince, ``Snakes, shapes, and gradient vector flow,'' + \emph{IEEE Transactions on Image Processing}, vol.~7, no.~3, pp. 359--369, + 1998. 
+ +\bibitem{ChesnaudRB99} +C.~Chesnaud, P.~R{\'e}fr{\'e}gier, and V.~Boulet, ``Statistical region + snake-based segmentation adapted to different physical noise models,'' + \emph{IEEE Trans. Pattern Anal. Mach. Intell.}, vol.~21, no.~11, pp. + 1145--1157, 1999. + +\bibitem{AllainBG08} +M.~Allain, N.~Bertaux, and F.~Galland, ``Nonparametric level-set segmentation + based on the minimization of the stochastic complexity,'' in \emph{ACIVS}, + 2008, pp. 506--517. + +\bibitem{Brunett} +\BIBentryALTinterwordspacing +E.~{Dipl.-Inf. Kienel} and G.~{Prof. Dr. Brunnett}, ``Gpu-accelerated contour + extraction on large images using snakes,'' 2009. [Online]. Available: + \url{http://archiv.tu-chemnitz.de/pub/2009/0035} +\BIBentrySTDinterwordspacing + +\bibitem{CUDAPG} +\emph{NVIDIA CUDA C Programming Guide v3.1.1}, NVIDIA Corporation, 7 2010. + +\bibitem{CUDAFT} +\emph{NVIDIA Fermi Tuning Guide}, NVIDIA Corporation, 7 2010. + +\bibitem{CUDAFC} +\emph{NVIDIA Fermi Compatibility Guide}, NVIDIA Corporation, 7 2010. + +\bibitem{Harris07} +M.~Harris, S.~Sengupta, and J.~D. Owens, \emph{Gpu gems 3}, 1st~ed.\hskip 1em + plus 0.5em minus 0.4em\relax Addison-Wesley Professional, 2007, ch. 39 - + Parallel Prefix Sum with CUDA. + +\end{thebibliography} diff --git a/doc/snake_gpu.blg b/doc/snake_gpu.blg new file mode 100644 index 0000000..69feba7 --- /dev/null +++ b/doc/snake_gpu.blg @@ -0,0 +1,57 @@ +This is BibTeX, Version 0.99c (TeX Live 2009/Debian) +The top-level auxiliary file: snake_gpu.aux +The style file: IEEEtran.bst +Reallocated wiz_functions (elt_size=4) to 6000 items from 3000. +Database file #1: IEEEabrv.bib +Database file #2: biblio.bib +A bad cross reference---entry "AllainBG08" +refers to entry "DBLP:conf/acivs/2008", which doesn't exist +Warning--I didn't find a database entry for "DBLP:conf/acivs/2008" +-- IEEEtran.bst version 1.13 (2008/09/30) by Michael Shell. 
+-- http://www.michaelshell.org/tex/ieeetran/bibtex/ +-- See the "IEEEtran_bst_HOWTO.pdf" manual for usage information. +Warning--empty journal in Brunett + +Done. +You've used 10 entries, + 4039 wiz_defined-function locations, + 1273 strings with 16462 characters, +and the built_in function-call counts, 6085 in all, are: += -- 504 +> -- 118 +< -- 49 ++ -- 65 +- -- 18 +* -- 297 +:= -- 950 +add.period$ -- 17 +call.type$ -- 10 +change.case$ -- 8 +chr.to.int$ -- 108 +cite$ -- 11 +duplicate$ -- 456 +empty$ -- 487 +format.name$ -- 25 +if$ -- 1370 +int.to.chr$ -- 0 +int.to.str$ -- 10 +missing$ -- 79 +newline$ -- 55 +num.names$ -- 7 +pop$ -- 174 +preamble$ -- 1 +purify$ -- 0 +quote$ -- 2 +skip$ -- 480 +stack$ -- 0 +substring$ -- 248 +swap$ -- 359 +text.length$ -- 13 +text.prefix$ -- 0 +top$ -- 5 +type$ -- 10 +warning$ -- 1 +while$ -- 23 +width$ -- 12 +write$ -- 113 +(There was 1 error message) diff --git a/doc/snake_gpu.log b/doc/snake_gpu.log new file mode 100644 index 0000000..ecb3737 --- /dev/null +++ b/doc/snake_gpu.log @@ -0,0 +1,470 @@ +This is pdfTeX, Version 3.1415926-1.40.10 (TeX Live 2009/Debian) (format=pdflatex 2011.1.7) 15 MAR 2011 15:27 +entering extended mode + restricted \write18 enabled. + %&-line parsing enabled. +**snake_gpu.tex +(./snake_gpu.tex +LaTeX2e <2009/09/24> +Babel and hyphenation patterns for english, usenglishmax, dumylang, noh +yphenation, farsi, arabic, croatian, bulgarian, ukrainian, russian, czech, slov +ak, danish, dutch, finnish, french, basque, ngerman, german, german-x-2009-06-1 +9, ngerman-x-2009-06-19, ibycus, monogreek, greek, ancientgreek, hungarian, san +skrit, italian, latin, latvian, lithuanian, mongolian2a, mongolian, bokmal, nyn +orsk, romanian, irish, coptic, serbian, turkish, welsh, esperanto, uppersorbian +, estonian, indonesian, interlingua, icelandic, kurmanji, slovenian, polish, po +rtuguese, spanish, galician, catalan, swedish, ukenglish, pinyin, loaded. 
+(/home/perrot/texmf/tex/latex/base/IEEEtran.cls +Document Class: IEEEtran 2007/03/05 V1.7a by Michael Shell +-- See the "IEEEtran_HOWTO" manual for usage information. +-- http://www.michaelshell.org/tex/ieeetran/ +\@IEEEtrantmpdimenA=\dimen102 +\@IEEEtrantmpdimenB=\dimen103 +\@IEEEtrantmpcountA=\count79 +\@IEEEtrantmpcountB=\count80 +\@IEEEtrantmptoksA=\toks14 +LaTeX Font Info: Try loading font information for OT1+ptm on input line 373. + +(/usr/share/texmf-texlive/tex/latex/psnfss/ot1ptm.fd +File: ot1ptm.fd 2001/06/04 font definitions for OT1/ptm. +) +-- Using 8.5in x 11in (letter) paper. +-- Using PDF output. +\@IEEEnormalsizeunitybaselineskip=\dimen104 +-- This is a 10 point document. +\CLASSINFOnormalsizebaselineskip=\dimen105 +\CLASSINFOnormalsizeunitybaselineskip=\dimen106 +\IEEEnormaljot=\dimen107 +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <5> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <5> not available +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <7> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <7> not available +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <8> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <8> not available +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <9> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <9> not available +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 731. 
+LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <10> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <10> not available +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <11> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <11> not available +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <12> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <12> not available +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <17> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <17> not available +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <20> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <20> not available +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <24> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 731. +LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <24> not available +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 731. 
+\IEEEilabelindentA=\dimen108 +\IEEEilabelindentB=\dimen109 +\IEEEilabelindent=\dimen110 +\IEEEelabelindent=\dimen111 +\IEEEdlabelindent=\dimen112 +\IEEElabelindent=\dimen113 +\IEEEiednormlabelsep=\dimen114 +\IEEEiedmathlabelsep=\dimen115 +\IEEEiedtopsep=\skip41 +\c@section=\count81 +\c@subsection=\count82 +\c@subsubsection=\count83 +\c@paragraph=\count84 +\c@IEEEsubequation=\count85 +\abovecaptionskip=\skip42 +\belowcaptionskip=\skip43 +\c@figure=\count86 +\c@table=\count87 +\@IEEEeqnnumcols=\count88 +\@IEEEeqncolcnt=\count89 +\@IEEEtmpitemindent=\dimen116 +\c@IEEEbiography=\count90 +\@IEEEtranrubishbin=\box26 +) (/usr/share/texmf-texlive/tex/latex/cite/cite.sty +LaTeX Info: Redefining \cite on input line 285. +LaTeX Info: Redefining \nocite on input line 356. +Package: cite 2009/08/29 v 5.2 +) +(/usr/share/texmf-texlive/tex/latex/graphics/graphicx.sty +Package: graphicx 1999/02/16 v1.0f Enhanced LaTeX Graphics (DPC,SPQR) + +(/usr/share/texmf-texlive/tex/latex/graphics/keyval.sty +Package: keyval 1999/03/16 v1.13 key=value parser (DPC) +\KV@toks@=\toks15 +) +(/usr/share/texmf-texlive/tex/latex/graphics/graphics.sty +Package: graphics 2009/02/05 v1.0o Standard LaTeX Graphics (DPC,SPQR) + +(/usr/share/texmf-texlive/tex/latex/graphics/trig.sty +Package: trig 1999/03/16 v1.09 sin cos tan (DPC) +) +(/etc/texmf/tex/latex/config/graphics.cfg +File: graphics.cfg 2009/08/28 v1.8 graphics configuration of TeX Live +) +Package graphics Info: Driver file: pdftex.def on input line 91. 
+ +(/usr/share/texmf-texlive/tex/latex/pdftex-def/pdftex.def +File: pdftex.def 2010/03/12 v0.04p Graphics/color for pdfTeX +\Gread@gobject=\count91 +)) +\Gin@req@height=\dimen117 +\Gin@req@width=\dimen118 +) +(/usr/share/texmf-texlive/tex/latex/graphics/color.sty +Package: color 2005/11/14 v1.0j Standard LaTeX Color (DPC) + +(/etc/texmf/tex/latex/config/color.cfg +File: color.cfg 2007/01/18 v1.5 color configuration of teTeX/TeXLive +) +Package color Info: Driver file: pdftex.def on input line 130. +) +(/home/perrot/texmf/tex/latex/algorithm2e.sty +Package: algorithm2e 2008/00/00 v3.10 algorithms environments +\c@AlgoLine=\count92 + +(/usr/share/texmf-texlive/tex/latex/base/ifthen.sty +Package: ifthen 2001/05/26 v1.1c Standard LaTeX ifthen package (DPC) +) +(/usr/share/texmf-texlive/tex/latex/tools/xspace.sty +Package: xspace 2006/05/08 v1.12 Space after command names (DPC,MH) +) +(/usr/share/texmf-texlive/tex/latex/ltxmisc/relsize.sty +Package: relsize 2003/07/04 ver 3.1 +) +******************************************************** +Package `algorithm2e' Release 4.01 -- december 14 2009 -- +- algorithm2e-announce@lirmm.fr mailing list for announcement about releases +- algorithm2e-discussion@lirmm.fr mailing list for discussion about package +subscribe by emailing sympa@lirmm.fr with 'subscribe ' +- Author: Christophe Fiorio (fiorio@lirmm.fr) +******************************************************** +\skiptotal=\skip44 +\skiplinenumber=\skip45 +\skiprule=\skip46 +\skiphlne=\skip47 +\skiptext=\skip48 +\skiplength=\skip49 +\algomargin=\skip50 +\skipalgocfslide=\skip51 +\algowidth=\dimen119 +\inoutsize=\dimen120 +\inoutline=\dimen121 +\interspacetitleruled=\dimen122 +\interspacealgoruled=\dimen123 +\interspacetitleboxruled=\dimen124 +\algocf@inoutbox=\box27 +\algocf@inputbox=\box28 +\AlCapSkip=\skip52 +\AlCapHSkip=\skip53 +\algocf@nlbox=\box29 +\algocf@hangingbox=\box30 +\algocf@capbox=\box31 +\algoheightruledefault=\skip54 +\algoheightrule=\skip55 
+\algotitleheightruledefault=\skip56 +\algotitleheightrule=\skip57 +\c@algocfline=\count93 +\c@algocfproc=\count94 +\c@algocf=\count95 +\algocf@algoframe=\box32 +\algocf@algobox=\box33 +) (/usr/share/texmf-texlive/tex/latex/tools/array.sty +Package: array 2008/09/09 v2.4c Tabular extension package (FMi) +\col@sep=\dimen125 +\extrarowheight=\dimen126 +\NC@list=\toks16 +\extratabsurround=\skip58 +\backup@length=\skip59 +) +(/usr/share/texmf-texlive/tex/latex/mdwtools/mdwmath.sty +Package: mdwmath 1996/04/11 1.1 Nice mathematical things +\sq@sqrt=\count96 +LaTeX Info: Redefining \sqrt on input line 84. +) +(/usr/share/texmf-texlive/tex/latex/mdwtools/mdwtab.sty +Package: mdwtab 1998/04/28 1.9 Table typesetting with style +\tab@state=\count97 +\tab@columns=\count98 +\tab@preamble=\toks17 +\tab@shortline=\toks18 +\extrarowheight=\dimen127 +\tabextrasep=\dimen128 +\arrayextrasep=\dimen129 +\smarraycolsep=\dimen130 +\smarrayextrasep=\dimen131 +\tab@width=\dimen132 +\col@sep=\dimen133 +\tab@endheight=\dimen134 +\tab@leftskip=\skip60 +\tab@rightskip=\skip61 +\fn@notes=\box34 +\fn@width=\dimen135 +) +(/usr/share/texmf-texlive/tex/latex/subfig/subfig.sty +Package: subfig 2005/06/28 ver: 1.3 subfig package + +(/usr/share/texmf-texlive/tex/latex/caption/caption3.sty +Package: caption3 2009/10/09 v3.1k caption3 kernel (AR) +\captionmargin=\dimen136 +\captionmargin@=\dimen137 +\captionwidth=\dimen138 +\caption@indent=\dimen139 +\caption@parindent=\dimen140 +\caption@hangindent=\dimen141 +Package caption Info: Unknown document class (or package), +(caption) standard defaults will be used on input line 1068. 
+) +\c@KVtest=\count99 +\sf@farskip=\skip62 +\sf@captopadj=\dimen142 +\sf@capskip=\skip63 +\sf@nearskip=\skip64 +\c@subfigure=\count100 +\c@subfigure@save=\count101 +\c@lofdepth=\count102 +\c@subtable=\count103 +\c@subtable@save=\count104 +\c@lotdepth=\count105 +\sf@top=\skip65 +\sf@bottom=\skip66 +) +(/usr/share/texmf-texlive/tex/latex/base/fixltx2e.sty +Package: fixltx2e 2006/09/13 v1.1m fixes to LaTeX +LaTeX Info: Redefining \em on input line 420. +) (./snake_gpu.aux) +\openout1 = `snake_gpu.aux'. + +LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 331. +LaTeX Font Info: ... okay on input line 331. +LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 331. +LaTeX Font Info: ... okay on input line 331. +LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 331. +LaTeX Font Info: ... okay on input line 331. +LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 331. +LaTeX Font Info: ... okay on input line 331. +LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 331. +LaTeX Font Info: ... okay on input line 331. +LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 331. +LaTeX Font Info: ... okay on input line 331. + +(/usr/share/texmf/tex/context/base/supp-pdf.mkii +[Loading MPS to PDF converter (version 2006.09.02).] +\scratchcounter=\count106 +\scratchdimen=\dimen143 +\scratchbox=\box35 +\nofMPsegments=\count107 +\nofMParguments=\count108 +\everyMPshowfont=\toks19 +\MPscratchCnt=\count109 +\MPscratchDim=\dimen144 +\MPnumerator=\count110 +\everyMPtoPDFconversion=\toks20 +) +Package caption Info: Begin \AtBeginDocument code. +Package caption3 Info: subfig package 1.2 or 1.3 is loaded. +LaTeX Info: Redefining \subref on input line 331. +Package caption Info: End \AtBeginDocument code. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <14> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 358. 
+LaTeX Font Info: External font `cmex10' loaded for size +(Font) <7> on input line 394. +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <5> on input line 394. +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <8> on input line 394. +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <6> on input line 394. + + +LaTeX Warning: Command \textcopyright invalid in math mode on input line 395. + + +LaTeX Warning: Command \textcircled invalid in math mode on input line 395. + +LaTeX Font Info: Try loading font information for OMS+ptm on input line 395. + +(/usr/share/texmf-texlive/tex/latex/psnfss/omsptm.fd +File: omsptm.fd +) +LaTeX Font Info: Font shape `OMS/ptm/m/n' in size <10> not available +(Font) Font shape `OMS/cmsy/m/n' tried instead on input line 395. + + +LaTeX Warning: Command \textcircled invalid in math mode on input line 395. + +[1{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map} + + +] +LaTeX Font Info: Try loading font information for OT1+pcr on input line 437. + + +(/usr/share/texmf-texlive/tex/latex/psnfss/ot1pcr.fd +File: ot1pcr.fd 2001/06/04 font definitions for OT1/pcr. 
+) +<./img/cochon_petit_init.jpg, id=13, 397.485pt x 298.11375pt> +File: ./img/cochon_petit_init.jpg Graphic file (type jpg) + + +<./img/cochon_petit_it1.jpg, id=14, 397.485pt x 298.11375pt> +File: ./img/cochon_petit_it1.jpg Graphic file (type jpg) + + +<./img/cochon_petit_it2.jpg, id=15, 397.485pt x 298.11375pt> +File: ./img/cochon_petit_it2.jpg Graphic file (type jpg) + + +<./img/cochon_petit_it4.jpg, id=16, 397.485pt x 298.11375pt> +File: ./img/cochon_petit_it4.jpg Graphic file (type jpg) + + +Underfull \vbox (badness 4621) has occurred while \output is active [] + + [2 <./img/cochon_petit_init.jpg> <./img/cochon_petit_it1.jpg> <./img/cochon_pe +tit_it2.jpg> <./img/cochon_petit_it4.jpg>] +Underfull \hbox (badness 10000) in paragraph at lines 539--540 +[]\OT1/pcr/m/n/10 compute_pixels_coordinate() \OT1/ptm/m/n/10 which is + [] + +<./img/data_profile_cpu.png, id=25, 695.59875pt x 420.97275pt> +File: ./img/data_profile_cpu.png Graphic file (type png) + + + +LaTeX Warning: `h' float specifier changed to `ht'. + + +Underfull \hbox (badness 2334) in paragraph at lines 548--553 +\OT1/ptm/m/n/10 that func-tion \OT1/pcr/m/n/10 compute_segment_contribution() + [] + + +Underfull \hbox (badness 1824) in paragraph at lines 548--553 +\OT1/ptm/m/n/10 the func-tion \OT1/pcr/m/n/10 compute_cumulated_images() \OT1/p +tm/m/n/10 costs + [] + +<./img/GPU_block.png, id=27, 2719.15875pt x 1077.02374pt> +File: ./img/GPU_block.png Graphic file (type png) + + [3 <./img/data_profile_cpu.png>] +(./img/GPUcumuls.pdf_t +File: img/GPUcumuls.pdf Graphic file (type pdf) + + +LaTeX Font Info: Try loading font information for OT1+phv on input line 14. + (/usr/share/texmf-texlive/tex/latex/psnfss/ot1phv.fd +File: ot1phv.fd 2001/06/04 scalable font definitions for OT1/phv. +) +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <12> on input line 14. 
+) +[4 <./img/GPU_block.png>] (./img/GPUscansomblocs.pdf_t + +File: img/GPUscansomblocs.pdf Graphic file (type pdf) + +) (./img/GPUaddsoms2cumuls.pdf_t + +File: img/GPUaddsoms2cumuls.pdf Graphic file (type pdf) + +) (./img/topologie.pdf_t + +File: img/topologie.pdf Graphic file (type pdf) + +LaTeX Font Info: Calculating math sizes for size <14> on input line 14. +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <14> on input line 14. +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <9.79996> on input line 14. +) + +LaTeX Warning: `h' float specifier changed to `ht'. + +[5 <./img/GPUcumuls.pdf>] +Underfull \vbox (badness 10000) has occurred while \output is active [] + + +(./img/contribs_segments.pdf_t + +File: img/contribs_segments.pdf Graphic file (type pdf) + +) [6 <./img/GPUscansomblocs.pdf> <./img/GPUaddso +ms2cumuls.pdf> <./img/topologie.pdf>] (./img/tripix.pdf_t + +File: img/tripix.pdf Graphic file (type pdf) + ) + +LaTeX Warning: `h' float specifier changed to `ht'. + + +Underfull \hbox (badness 2799) in paragraph at lines 786--786 +[]\OT1/ptm/b/n/10 Algorithm 3: \OT1/ptm/m/n/10 Par-ralel GPU al-go-rithm : out- +lines. + [] + + +LaTeX Warning: `h' float specifier changed to `ht'. + +[7 <./img/contribs_segments.pdf> <./img/tripix.pdf>] (./snake_gpu.bbl) + +** Conference Paper ** +Before submitting the final camera ready copy, remember to: + + 1. Manually equalize the lengths of two columns on the last page + of your paper; + + 2. Ensure that any PostScript and/or PDF output post-processing + uses only Type 1 fonts and that every step in the generation + process uses the appropriate paper size. 
+ +[8] (./snake_gpu.aux) ) +Here is how much of TeX's memory you used: + 4161 strings out of 493848 + 62117 string characters out of 1152824 + 150152 words of memory out of 3000000 + 7379 multiletter control sequences out of 15000+50000 + 52358 words of font info for 94 fonts, out of 3000000 for 9000 + 714 hyphenation exceptions out of 8191 + 39i,21n,45p,287b,512s stack positions out of 5000i,500n,10000p,200000b,50000s +{/usr/share/texmf-texlive/fonts/enc/dvips/base/8r.enc} +Output written on snake_gpu.pdf (8 pages, 928099 bytes). +PDF statistics: + 161 PDF objects out of 1000 (max. 8388607) + 0 named destinations out of 1000 (max. 500000) + 61 words of extra memory for PDF output out of 10000 (max. 10000000) + diff --git a/doc/snake_gpu.pdf b/doc/snake_gpu.pdf new file mode 100644 index 0000000..831fae4 Binary files /dev/null and b/doc/snake_gpu.pdf differ diff --git a/doc/snake_gpu.tex b/doc/snake_gpu.tex new file mode 100644 index 0000000..da5b694 --- /dev/null +++ b/doc/snake_gpu.tex @@ -0,0 +1,910 @@ + +%% bare_conf.tex +%% V1.3 +%% 2007/01/11 +%% by Michael Shell +%% See: +%% http://www.michaelshell.org/ +%% for current contact information. +%% +%% This is a skeleton file demonstrating the use of IEEEtran.cls +%% (requires IEEEtran.cls version 1.7 or later) with an IEEE conference paper. +%% +%% Support sites: +%% http://www.michaelshell.org/tex/ieeetran/ +%% http://www.ctan.org/tex-archive/macros/latex/contrib/IEEEtran/ +%% and +%% http://www.ieee.org/ + +%%************************************************************************* +%% Legal Notice: +%% This code is offered as-is without any warranty either expressed or +%% implied; without even the implied warranty of MERCHANTABILITY or +%% FITNESS FOR A PARTICULAR PURPOSE! +%% User assumes all risk. 
+%% In no event shall IEEE or any contributor to this code be liable for +%% any damages or losses, including, but not limited to, incidental, +%% consequential, or any other damages, resulting from the use or misuse +%% of any information contained here. +%% +%% All comments are the opinions of their respective authors and are not +%% necessarily endorsed by the IEEE. +%% +%% This work is distributed under the LaTeX Project Public License (LPPL) +%% ( http://www.latex-project.org/ ) version 1.3, and may be freely used, +%% distributed and modified. A copy of the LPPL, version 1.3, is included +%% in the base LaTeX documentation of all distributions of LaTeX released +%% 2003/12/01 or later. +%% Retain all contribution notices and credits. +%% ** Modified files should be clearly indicated as such, including ** +%% ** renaming them and changing author support contact information. ** +%% +%% File list of work: IEEEtran.cls, IEEEtran_HOWTO.pdf, bare_adv.tex, +%% bare_conf.tex, bare_jrnl.tex, bare_jrnl_compsoc.tex +%%************************************************************************* + +% *** Authors should verify (and, if needed, correct) their LaTeX system *** +% *** with the testflow diagnostic prior to trusting their LaTeX platform *** +% *** with production work. IEEE's font choices can trigger bugs that do *** +% *** not appear when using other class files. *** +% The testflow support page is at: +% http://www.michaelshell.org/tex/testflow/ + + + +% Note that the a4paper option is mainly intended so that authors in +% countries using A4 can easily print to A4 and see how their papers will +% look in print - the typesetting of the document will not typically be +% affected with changes in paper size (but the bottom and side margins will). +% Use the testflow package mentioned above to verify correct handling of +% both paper sizes by the user's LaTeX system. 
+% +% Also note that the "draftcls" or "draftclsnofoot", not "draft", option +% should be used if it is desired that the figures are to be displayed in +% draft mode. +% +\documentclass[10pt, conference, compsocconf]{IEEEtran} +% Add the compsocconf option for Computer Society conferences. +% +% If IEEEtran.cls has not been installed into the LaTeX system files, +% manually specify the path to it like: +% \documentclass[conference]{../sty/IEEEtran} + + +% \usepackage[latin1]{inputenc} +% \usepackage[cyr]{aeguill} +% \usepackage[francais]{babel} + + +% Some very useful LaTeX packages include: +% (uncomment the ones you want to load) + + +% *** MISC UTILITY PACKAGES *** +% +%\usepackage{ifpdf} +% Heiko Oberdiek's ifpdf.sty is very useful if you need conditional +% compilation based on whether the output is pdf or dvi. +% usage: +% \ifpdf +% % pdf code +% \else +% % dvi code +% \fi +% The latest version of ifpdf.sty can be obtained from: +% http://www.ctan.org/tex-archive/macros/latex/contrib/oberdiek/ +% Also, note that IEEEtran.cls V1.7 and later provides a builtin +% \ifCLASSINFOpdf conditional that works the same way. +% When switching from latex to pdflatex and vice-versa, the compiler may +% have to be run twice to clear warning/error messages. + + + + + + +% *** CITATION PACKAGES *** +% +\usepackage{cite} +% cite.sty was written by Donald Arseneau +% V1.6 and later of IEEEtran pre-defines the format of the cite.sty package +% \cite{} output to follow that of IEEE. Loading the cite package will +% result in citation numbers being automatically sorted and properly +% "compressed/ranged". e.g., [1], [9], [2], [7], [5], [6] without using +% cite.sty will become [1], [2], [5]--[7], [9] using cite.sty. cite.sty's +% \cite will automatically add leading space, if needed. Use cite.sty's +% noadjust option (cite.sty V3.8 and later) if you want to turn this off. +% cite.sty is already installed on most LaTeX systems. 
Be sure and use +% version 4.0 (2003-05-27) and later if using hyperref.sty. cite.sty does +% not currently provide for hyperlinked citations. +% The latest version can be obtained at: +% http://www.ctan.org/tex-archive/macros/latex/contrib/cite/ +% The documentation is contained in the cite.sty file itself. + + + + + + +% *** GRAPHICS RELATED PACKAGES *** +% +\ifCLASSINFOpdf + \usepackage[pdftex]{graphicx,color} + % declare the path(s) where your graphic files are + \graphicspath{{img/}} + % and their extensions so you won't have to specify these with + % every instance of \includegraphics + \DeclareGraphicsExtensions{.pdf,.jpeg,.png} +\else + % or other class option (dvipsone, dvipdf, if not using dvips). graphicx + % will default to the driver specified in the system graphics.cfg if no + % driver is specified. + % \usepackage[dvips]{graphicx} + % declare the path(s) where your graphic files are + % \graphicspath{{../eps/}} + % and their extensions so you won't have to specify these with + % every instance of \includegraphics + % \DeclareGraphicsExtensions{.eps} +\fi +% graphicx was written by David Carlisle and Sebastian Rahtz. It is +% required if you want graphics, photos, etc. graphicx.sty is already +% installed on most LaTeX systems. The latest version and documentation can +% be obtained at: +% http://www.ctan.org/tex-archive/macros/latex/required/graphics/ +% Another good source of documentation is "Using Imported Graphics in +% LaTeX2e" by Keith Reckdahl which can be found as epslatex.ps or +% epslatex.pdf at: http://www.ctan.org/tex-archive/info/ +% +% latex, and pdflatex in dvi mode, support graphics in encapsulated +% postscript (.eps) format. pdflatex in pdf mode supports graphics +% in .pdf, .jpeg, .png and .mps (metapost) formats. Users should ensure +% that all non-photo figures use a vector format (.eps, .pdf, .mps) and +% not a bitmapped format (.jpeg, .png).
IEEE frowns on bitmapped formats +% which can result in "jaggedy"/blurry rendering of lines and letters as +% well as large increases in file sizes. +% +% You can find documentation about the pdfTeX application at: +% http://www.tug.org/applications/pdftex + + + + + +% *** MATH PACKAGES *** +% +%\usepackage[cmex10]{amsmath} +% A popular package from the American Mathematical Society that provides +% many useful and powerful commands for dealing with mathematics. If using +% it, be sure to load this package with the cmex10 option to ensure that +% only type 1 fonts will utilized at all point sizes. Without this option, +% it is possible that some math symbols, particularly those within +% footnotes, will be rendered in bitmap form which will result in a +% document that can not be IEEE Xplore compliant! +% +% Also, note that the amsmath package sets \interdisplaylinepenalty to 10000 +% thus preventing page breaks from occurring within multiline equations. Use: +%\interdisplaylinepenalty=2500 +% after loading amsmath to restore such page breaks as IEEEtran.cls normally +% does. amsmath.sty is already installed on most LaTeX systems. The latest +% version and documentation can be obtained at: +% http://www.ctan.org/tex-archive/macros/latex/required/amslatex/math/ + + + + + +% *** SPECIALIZED LIST PACKAGES *** +% +\usepackage[ruled,lined,linesnumbered]{algorithm2e} +%\usepackage{algorithmic} +% algorithmic.sty was written by Peter Williams and Rogerio Brito. +% This package provides an algorithmic environment fo describing algorithms. +% You can use the algorithmic environment in-text or within a figure +% environment to provide for a floating algorithm. Do NOT use the algorithm +% floating environment provided by algorithm.sty (by the same authors) or +% algorithm2e.sty (by Christophe Fiorio) as IEEE does not use dedicated +% algorithm float types and packages that provide these will not provide +% correct IEEE style captions. 
The latest version and documentation of +% algorithmic.sty can be obtained at: +% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithms/ +% There is also a support site at: +% http://algorithms.berlios.de/index.html +% Also of interest may be the (relatively newer and more customizable) +% algorithmicx.sty package by Szasz Janos: +% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithmicx/ + + + + +% *** ALIGNMENT PACKAGES *** +% +\usepackage{array} +% Frank Mittelbach's and David Carlisle's array.sty patches and improves +% the standard LaTeX2e array and tabular environments to provide better +% appearance and additional user controls. As the default LaTeX2e table +% generation code is lacking to the point of almost being broken with +% respect to the quality of the end results, all users are strongly +% advised to use an enhanced (at the very least that provided by array.sty) +% set of table tools. array.sty is already installed on most systems. The +% latest version and documentation can be obtained at: +% http://www.ctan.org/tex-archive/macros/latex/required/tools/ + + +\usepackage{mdwmath} +\usepackage{mdwtab} +% Also highly recommended is Mark Wooding's extremely powerful MDW tools, +% especially mdwmath.sty and mdwtab.sty which are used to format equations +% and tables, respectively. The MDWtools set is already installed on most +% LaTeX systems. The lastest version and documentation is available at: +% http://www.ctan.org/tex-archive/macros/latex/contrib/mdwtools/ + + +% IEEEtran contains the IEEEeqnarray family of commands that can be used to +% generate multiline equations as well as matrices, tables, etc., of high +% quality. + + +%\usepackage{eqparbox} +% Also of notable interest is Scott Pakin's eqparbox package for creating +% (automatically sized) equal width boxes - aka "natural width parboxes". 
+% Available at: +% http://www.ctan.org/tex-archive/macros/latex/contrib/eqparbox/ + + + + + +% *** SUBFIGURE PACKAGES *** +%\usepackage[tight,footnotesize]{subfigure} +% subfigure.sty was written by Steven Douglas Cochran. This package makes it +% easy to put subfigures in your figures. e.g., "Figure 1a and 1b". For IEEE +% work, it is a good idea to load it with the tight package option to reduce +% the amount of white space around the subfigures. subfigure.sty is already +% installed on most LaTeX systems. The latest version and documentation can +% be obtained at: +% http://www.ctan.org/tex-archive/obsolete/macros/latex/contrib/subfigure/ +% subfigure.sty has been superceeded by subfig.sty. + + + +%\usepackage[caption=false]{caption} +%\usepackage[font=footnotesize]{subfig} +% subfig.sty, also written by Steven Douglas Cochran, is the modern +% replacement for subfigure.sty. However, subfig.sty requires and +% automatically loads Axel Sommerfeldt's caption.sty which will override +% IEEEtran.cls handling of captions and this will result in nonIEEE style +% figure/table captions. To prevent this problem, be sure and preload +% caption.sty with its "caption=false" package option. This is will preserve +% IEEEtran.cls handing of captions. Version 1.3 (2005/06/28) and later +% (recommended due to many improvements over 1.2) of subfig.sty supports +% the caption=false option directly: +\usepackage[caption=false,font=footnotesize]{subfig} +% +% The latest version and documentation can be obtained at: +% http://www.ctan.org/tex-archive/macros/latex/contrib/subfig/ +% The latest version and documentation of caption.sty can be obtained at: +% http://www.ctan.org/tex-archive/macros/latex/contrib/caption/ + + + + +% *** FLOAT PACKAGES *** +% +\usepackage{fixltx2e} +% fixltx2e, the successor to the earlier fix2col.sty, was written by +% Frank Mittelbach and David Carlisle. 
This package corrects a few problems +% in the LaTeX2e kernel, the most notable of which is that in current +% LaTeX2e releases, the ordering of single and double column floats is not +% guaranteed to be preserved. Thus, an unpatched LaTeX2e can allow a +% single column figure to be placed prior to an earlier double column +% figure. The latest version and documentation can be found at: +% http://www.ctan.org/tex-archive/macros/latex/base/ + + + +%\usepackage{stfloats} +% stfloats.sty was written by Sigitas Tolusis. This package gives LaTeX2e +% the ability to do double column floats at the bottom of the page as well +% as the top. (e.g., "\begin{figure*}[!b]" is not normally possible in +% LaTeX2e). It also provides a command: +%\fnbelowfloat +% to enable the placement of footnotes below bottom floats (the standard +% LaTeX2e kernel puts them above bottom floats). This is an invasive package +% which rewrites many portions of the LaTeX2e float routines. It may not work +% with other packages that modify the LaTeX2e float routines. The latest +% version and documentation can be obtained at: +% http://www.ctan.org/tex-archive/macros/latex/contrib/sttools/ +% Documentation is contained in the stfloats.sty comments as well as in the +% presfull.pdf file. Do not use the stfloats baselinefloat ability as IEEE +% does not allow \baselineskip to stretch. Authors submitting work to the +% IEEE should note that IEEE rarely uses double column equations and +% that authors should try to avoid such use. Do not be tempted to use the +% cuted.sty or midfloat.sty packages (also by Sigitas Tolusis) as IEEE does +% not format its papers in such ways. 
+ + + +% correct bad hyphenation here +% \hyphenation{op-tical net-works semi-conduc-tor} + + +\begin{document} +% +% paper title +% can use linebreaks \\ within to get better formatting as desired +\title{GPU implementation of a region based algorithm \\ for large images segmentation} + + +% author names and affiliations +% use a multiple column layout for up to two different +% affiliations + +\author{ +\IEEEauthorblockN{Gilles Perrot, St\'{e}phane Domas, Rapha\"{e}l Couturier} +\IEEEauthorblockA{Distributed Numerical Algorithmics team (AND), Laboratoire d'Informatique de Franche-comt\'{e}\\ +Rue Engel Gros, 90000 Belfort, France\\ +forename.name@univ-fcomte.fr} +} + + + +% use for special paper notices +%\IEEEspecialpapernotice{(Invited Paper)} + + + + +% make the title area +\maketitle + +\begin{abstract} +Image segmentation is one of the most challenging issues in image computing. +In this work, we focus on region-based active contour techniques (snakes) as they seem to achieve a high level of robustness and fit with a large range of +applications. Some algorithmic optimizations provide significant speedups, but even so, execution times are still non-negligible +with the continuing increase of image sizes. Moreover, these algorithms are not well suited for running on multi-core CPUs. +At the same time, recent developments of Graphics Processing Units (GPUs) suggest that higher speedups could be obtained +by use of their specific design. We have managed to adapt a particularly efficient snake algorithm that fits recent Nvidia GPU architecture +and takes advantage of its massive multithreaded execution capabilities. The speedup obtained is most often around 7. +\end{abstract} + +\begin{IEEEkeywords} + GPU; segmentation; snake; +\end{IEEEkeywords} + +\section{Introduction} +Segmentation and shape detection are still key issues in image computing. 
These techniques are used in numerous fields ranging from medical imaging to video tracking, shape recognition or localization. +Since 1988, the active contours (snakes) introduced by Kass et al. \cite{KassWT88} have proved to be efficient and robust, especially against noise, for a wide range of image types. + +The main shortcoming of these algorithms is often their high dependence on the initial shape, though several contributions have lowered this dependency and also brought +more accurate segmentation of non-convex shapes \cite{Ruch01,XuP98}. + +The information that drives a snake model comes either from the contour itself or from the characteristics of the regions it defines. +For noisy images, the second option is often more suitable as it takes into account the statistical fluctuations of the pixels. +One approach \cite{ChesnaudRB99,AllainBG08} proposes a geometric (polygonal) region-based snake driven by the minimization of the stochastic complexity. One significant +advantage is that it runs without any free parameter which can be helpful when dealing with image sequences or slices (3D). + +An important issue of image processing, especially segmentation, has always been the computation time of most algorithms. Over the years, the increase of CPU computing capabilities, +although quite impressive, has not been able to fulfill the combined needs of growing resolution and real-time computation. +Since their introduction in the early 1980s, graphics accelerators have steadily increased in capabilities and speed. So much so that the recent GPGPUs +(General-Purpose Graphics Processing Units) currently benefit from a massively parallel architecture for general purpose programming, especially when dealing with large matrices +or vectors. On the other hand, their specific design obviously imposes a number of limitations and constraints. +Some implementations of parametric snakes have already been tested, such as \cite{Brunett}. 
However, a similar solution (computation per small tile) +is not suited for the algorithm we have implemented. + +Our goal, in collaboration with the PhyTI team\footnote{Physics and Image Processing Group, Fresnel Institute, Ecole Centrale de Marseille (France)}, was to propose a way to fit their algorithm +to the Nvidia$^{\textcopyright}$ Tesla GPU architecture. +The remainder of this paper presents the principles of the algorithm and notations in section \ref{secCPUalgooutlines}. In section \ref{secCPUalgodetails}, the details of +the sequential CPU implementation are explained. Section \ref{GPUgeneralites} summarizes the important characteristics of +Nvidia's GPUs and how to deal with them efficiently. Then sections \ref{GPUimplementation} and \ref{secSpeedups} detail our GPU implementation and timing results. +Finally, the conclusion of section \ref{secConclusion} evaluates the pros and cons of this implementation and then gives a few directions to be followed in future work. + + + +\section{\label{secCPUalgooutlines}Sequential algorithm : outlines} +The goal of the active contour segmentation (snake) method we studied \cite{Ruch01} is to distinguish, inside an image $I$, a target region $T$ from the background region +$B$. The size of $I$ is $L \times H$ pixels of coordinates $(i,j)$ and gray level $z(i,j)$. +We assume that the gray levels of $T$ and $B$ are independent random vectors, each with a distribution $p^{\Omega}$ of its components $(\Omega \in \{T ; B\})$. +The present implementation uses a Gaussian distribution, but another one, such as Gamma or Poisson, can easily be used (cf. \cite{ChesnaudRB99}). + +The \textit{active contour} $S$, which represents the shape of $T$, is chosen as polygonal. +The purpose of the segmentation is then to determine the shape that optimizes a pseudo log-likelihood-based criterion (PLH). 
+This is done by a very simple iterative process which is initialized with an arbitrary shape, then at each step : +\begin{enumerate} + \item it modifies the shape + \item it estimates the parameters of the Gaussian functions for the two regions and evaluates the criterion. + \item it validates the new shape if the criterion has a better value. +\end{enumerate} +A simplified description of it is given in \emph{Algorithm \ref{cpualgosimple}} which features two nested loops : the main one, on iteration level, is +responsible for tuning the number of nodes ; the inner one, on step level, takes care of finding the best shape for a given number of nodes. +\emph{Figure \ref{images_algo}} shows intermediate results at iteration level. Sub-figure \emph{\ref{fig:labelinit}} shows the initial rectangular shape, \emph{\ref{fig:labelit1}} + shows the best four-node shape that ends +the first iteration. Sub-figures \emph{\ref{fig:labelit2}} and \emph{\ref{fig:labelit4}} show the best shape for an eight-node snake (resp. 29-node) +which occurs at the end of the second iteration (resp. fourth). 
+ +\begin{algorithm}[h] +\caption{Sequential algorithm : outlines} +\label{cpualgosimple} +\SetNlSty{textbf}{}{:} + + %compute\_cumulated\_images()\; + begin with a rectangular 4-node snake\; + \Repeat(\tcc*[f]{iteration level}){no more node can be added}{ + \Repeat(\tcc*[f]{step level}){no more node can be moved}{ + Test some other positions for each node, near its current position\; + Find the best PLH and adjust the node's position\; + } + Add a node in the middle of each \emph{long enough} segment\; + } +\end{algorithm} + + +\begin{figure}[h] + \centering +\subfloat[Initial snake ]{\label{fig:labelinit} \includegraphics[width=0.4\linewidth]{./img/cochon_petit_init.jpg}}\qquad +\subfloat[End of first iteration (4 nodes) ]{\label{fig:labelit1} \includegraphics[width=0.4\linewidth]{./img/cochon_petit_it1.jpg}}\\ +\subfloat[End of second iteration (8 nodes)]{\label{fig:labelit2} \includegraphics[width=0.4\linewidth]{./img/cochon_petit_it2.jpg}}\qquad +\subfloat[End of fourth iteration (29 nodes)]{\label{fig:labelit4} \includegraphics[width=0.4\linewidth]{./img/cochon_petit_it4.jpg}} +%\subfloat[width=0.4\linewidth]{./img/cochon_b_entier.jpg} + % cochon_b_entier.jpg: 3960x2970 pixel, 72dpi, 139.70x104.78 cm, bb=0 0 3960 2970 + \caption{Segmentation of a noisy image} + \label{images_algo} +\end{figure} + + + +\section{\label{secCPUalgodetails}Sequential algorithm : details} +\subsection{Criterion} +For $p^{\Omega}$ a Gaussian function, $\Theta_{\Omega}$ ($\Omega \in \{T ; B\}$) has two components, the average value $\mu$ and the standard deviation $\sigma$ which are estimated by +$$ +\widehat{\Theta_{\Omega}} \left( +\begin{array}{l} +\widehat{\mu} = \frac{1}{N_{\Omega}} \displaystyle\sum_{(i,j)\in \Omega} z(i,j) \\ +\widehat{\sigma^2} = \frac{1}{N_{\Omega}} \displaystyle\sum_{(i,j)\in \Omega} z^2(i,j) - \widehat{\mu}^2 \\ + \end{array} +\right. 
+$$ +The likelihood of a region is given by +$$ P[I|S_{n,l}, \Theta_T, \Theta_B] = P(\chi_T | \Theta_T)P(\chi_B | \Theta_B)$$ +where +$$P(\chi_{\Omega} | \Theta_{\Omega}) = \prod_{(i,j)\in \Omega} p^{\Omega}[z(i,j)] ~~~~(\Omega \in \{T ; B\})$$ +And then the log-likelihood by +$$-N_{\Omega}\log\left(\sqrt{2\pi}\right) -N_{\Omega}\log\left(\sigma\right) - \frac{1}{2\sigma^2}\sum_{(i,j)\in \Omega} \left( z(i,j)-\mu \right)^2 $$ +Considering the two regions, the criterion to be optimized is then : +$$C = \frac{1}{2}\left( N_B\log\left(\widehat{\sigma_B}^2\right) + N_T\log\left(\widehat{\sigma_T}^2\right)\right)$$ + +\subsection{CPU implementation} +Let $S_{n,l}$ be the snake state at step $l$ of iteration $n$, and $S_{n,l}^i$ the node $i$ of $S_{n,l}$ ($i \in [0;N_n]$). +Each segment of $S_{n,l}$ is considered as an oriented list of discrete points. +Chesnaud \& Refregier \cite{ChesnaudRB99} have shown how to replace the two-dimensional sums needed to estimate $\Theta_{\Omega}$ by one-dimensional sums along $S_{n,l}$. +However, this approach involves weighting coefficients for every single point of $S_{n,l}$, which requires computing a pair of transformed images at the very +beginning of the process. Such images are called cumulated images and will be used as lookup tables. +Therefore, beyond this point, we will talk about the \emph{contribution} of each point to the 1D sums. By extension, we also talk about the \emph{contribution} of each segment to the 1D sums. + +A more detailed description of the sequential algorithm is given by \emph{Algorithm \ref{cpualgo}}. +The process starts with the computation of cumulated images ; an initialization stage takes place from line \ref{debinit} to line \ref{fininit}. 
+Then we recognize the two nested loops (line \ref{loopnewnodes} and line \ref{loopmovenodes}) and finally the heart of the algorithm stands on line \ref{kernelPLH} which represents +the main part of the calculations to be done : +\begin{enumerate} + \item compute the various sums without the contributions of both segments connected to current node $S_{n,l}^i$. + \item \label{CPUcontrib_segments} compute the contributions of both segments, which requires : + \begin{itemize} + \item \label{CPUbresenham} To determine the coordinates of every discrete pixel of both segments connected to $S_{n,l}^{i,w}$. + \item \label{CPUcontrib_pixels} To compute every pixel contribution. + \item To sum pixel contributions to obtain segment contributions. + \end{itemize} + \item compute the PLH given the contribution of each segment of the tested snake. +\end{enumerate} + +\begin{algorithm}[h] +\SetNlSty{textbf}{}{:} +\caption{Sequential simplified algorithm} +\label{cpualgo} + read image from HDD\; + compute\_cumulated\_images()\label{cumuls}\; + iteration $n \leftarrow 0$\label{debinit}\; + $N_0 \leftarrow 4$\; + $S_{n,l} \leftarrow S_{0,0}$\; + step $d \leftarrow d_{max} = 2^q$\; + current node $S_{0,0}^i \leftarrow S_{0,0}^0$\; + $l \leftarrow 0$\; + compute $PLH_{ref}$, the PLH of $S_{n,0}$\label{fininit}\; + \Repeat(\tcc*[f]{iteration level}){no new node added}{\label{loopnewnodes} + \Repeat(\tcc*[f]{step level}){no node move occured}{\label{loopmovenodes} + \For{$i=0$ to $N_n$}{ + $S_{n,l}^{i,w}$ ($w \in [0;7]$) are the neighbors of $S_{n,l}^i$ by distance $d$\; + \For{$w=0$ to $7$}{ + compute $PLH_w$ for $S_{n,l}$ when $S_{n,l}^{i,w}$ replaces $S_{n,l}^i$ \label{kernelPLH}\; + \lIf{$PLH_w$ is better than $PLH_{ref}$}{ + $PLH_{ref} \leftarrow PLH_w$\; + move node $S_{n,l}^i \leftarrow S_{n,l}^{i,w}$\; + } + } + } + $l \leftarrow l+1$\; + } + add new nodes, $N_n \leftarrow N_n + N_{newnodes}$\; + \lIf{$d > 1$}{ $d \leftarrow d/2$ } \lElse{ $d=1$ }\; + $n \leftarrow n+1$\; + 
compute $PLH_{ref}$, the PLH of $S_{n,0}$ \; + } +\end{algorithm} + + + +The profiling results of the CPU implementation shown in \emph{Figure \ref{CPUprofile}} display the relative costs of the most time-consuming functions. +It appears that more than 80\% of the total execution time is always spent by only three functions~: +\begin{itemize} + \item \texttt{compute\_segment\_contribution()} which is responsible for point \ref{CPUcontrib_segments} above, + \item \texttt{compute\_cumulated\_images()} which computes the 3 lookup tables at the very beginning, + \item \texttt{compute\_pixels\_coordinate()} which is called by \texttt{compute\_segment\_contribution()}. +\end{itemize} + +\begin{figure}[h] + \centering + \includegraphics[width=0.9\linewidth, height=0.5\linewidth]{./img/data_profile_cpu.png} + \caption{\label{CPUprofile}the three most time-consuming functions for various image sizes} +\end{figure} + +Measurements have been performed for several image sizes from 15~MPixels (about 3900 x 3900) +to 144~MPixels (about 12000 x 12000). On the one hand, we can notice that function \texttt{compute\_segment\_contribution()} always accounts for more than 45\% of the total running time, and even +more when the image gets larger. +On the other hand, the function \texttt{compute\_cumulated\_images()} costs more than 23\%, decreasing with image size, while function \texttt{compute\_pixels\_coordinate()} always takes around 6\%. +It confirms that the need for parallelization resides in line \ref{kernelPLH} and line \ref{cumuls} of Algorithm \ref{cpualgo} as they contain every call to those three functions. + +The following sections detail how we managed to implement these time-consuming functions in parallel, but +a brief reminder on recent GPU architecture is presented first. + + + +\section{\label{GPUgeneralites}Nvidia's GPU architecture} +GPUs are multi-core, multi-threaded processors, optimized for highly parallel computation. 
Their design focuses on the SIMT model by devoting +more transistors to data processing rather than data-caching and flow control \cite{CUDAPG}. + +For example, Figure \ref{GPUC1060} shows a Tesla C1060 with its 4GB of global memory and 30 SM processors, each including : +\begin{itemize} + \item 8 Scalar Processors (SP) + \item a Floating Point Unit (FPU) + \item a parallel execution unit (SIMT) that runs threads by warps of 32. + \item 16KB of shared memory, organized in 16 banks of 32-bit words +\end{itemize} +Nvidia uses a parameter called the \emph{compute capability} of each GPU model. Its value is composed of a major number and a minor number ; for example the C1060 is a sm13 GPU (major=1 minor=3) +and C2050 is a sm20 GPU. + +\begin{figure*}[htbp] + \centering + \includegraphics[width=0.7\linewidth]{./img/GPU_block.png} + \caption{\label{GPUC1060}schematic diagram of GPU's internal architecture} +\end{figure*} + +The recent Fermi cards (e.g. C2050) have improved performances by supplying more shared memory in a 32-bank array, a second execution +unit and several managing +capabilities on both the shared memory and level 1 cache memory \cite{CUDAPG,CUDAFT,CUDAFC}. +However, writing efficient code for such architectures is not obvious, as re-serialization must be avoided as much as possible. Thus, when designing, one must +keep a few key points in mind : +\begin{itemize} + \item CUDA model organizes threads by a) thread blocks in which synchronization is possible, b) a grid of blocks with no possible synchronization + between blocks. + \item there is no way to know in what order the blocks are to be scheduled during one single kernel execution. + \item data must be kept in GPU memory, to reduce the overhead due to copying between CPU and GPU memories. + \item the total number of threads running the same computation must be maximized. + \item the number of execution branches inside a block should be reduced as much as possible. 
+ \item global memory accesses should be coalesced, \emph{i.e.} memory accesses done by physically parallel threads (16 at a time) must be consecutive and contained in a 128 Bytes range. + \item shared memory is organized by 16 x 32 bits wide banks. To avoid bank conflicts, each parallel thread (16 at a time) must access a different bank. +\end{itemize} + +All the above characteristics make designing GPU code a highly constrained problem. +%detailler +Moreover, unsuitable code would probably run even slower on GPU than on CPU due to the automatic serialization which would be done at run time. +%% + +\section{\label{GPUimplementation}GPU implementation} + +In the implementation described below, pre-computations and proper segmentation are discussed separately. +To keep data in GPU memory, the whole computation is assigned to the GPU. CPU still hosts : +\begin{itemize} + \item data reading from HDD + \item data writing on HDD if needed + \item main loops control (corresponding to lines \ref{loopnewnodes} and \ref{loopmovenodes} of Algorithm \ref{cpualgo}) +\end{itemize} + +It must be noticed that controlling these loops is achieved with only a very small amount of data being transferred between host (CPU) and device (GPU), +which does not produce high overhead. \\ +Moreover, the structures described below need 20 Bytes per pixel of the image to process (plus an offset of about 50~MByte). +It defines the maximum image size we can accept : approximately 150~MPixels. + +\subsection{Pre-computations} +To replace 2D sums by 1D sums, Chesnaud \textit{et al.} \cite{ChesnaudRB99} have shown that the three matrices below should be computed : +$$C_1(i,j) = \sum_{k=0}^{k=j} (1+k)$$ +$$C_z(i,j) = \sum_{k=0}^{k=j} z(i,k)$$ and +$$C_{z^2}(i,j) = \sum_{k=0}^{k=j} z^2(i,k)$$ +where $z(i,k)$ is the gray level of the pixel of coordinates $(i,k)$, so that $C_1$, $C_z$ and $C_{z^2}$ are the same size as image $I$. 
+ +\begin{figure*}[htbp] + \centering + \resizebox{0.8\linewidth}{0.3\linewidth}{\input{./img/GPUcumuls.pdf_t}} + \caption{\label{GPUcumuls}\texttt{compute\_blocks\_prefixes()} details.} +\end{figure*} + +\medskip +First, we chose not to generate $C_1(i,j)$, which requires that values should be computed when needed, but saves global memory and does not lead to any overhead. +The computation of $C_{z}$ and $C_{z^2}$ easily decomposes into series of \emph{inclusive prefixsums} \cite{Harris07}. +However, by keeping the \emph{1 thread per pixel} rule, as the total number of threads that can be run in a grid cannot exceed $2^{25}$ (Cf. \cite{CUDAPG}), +slicing is necessary for images exceeding a size threshold which can vary according to the GPU model (e.g. 33 MPix for sm13 GPU, eg. C1060). +It's quite easy to do, but it leads to a small overhead as the process requires multiple calls to one kernel. +Slicing can be done in two ways : +\begin{itemize} + \item all slices are of the same size (balanced) + \item slices fit the maximum size allowed by the GPU, leaving one smaller slice at the end of the process (full-sized). +\end{itemize} +The balanced slice option has proved to run faster.\\ +For example : if a given image has 9000 lines and the GPU can process up to 4000 lines at a time, it's faster to run 3 times with 3000 lines rather than twice with +4000 and once with 1000. + +As the sums in $C_z$ and $C_{z^2}$ are row-wide, it is easy to see that every block-wide sum will be needed before being able to use it in the global sum. +But as mentioned earlier, the scheduling of blocks must be considered as random. +So, in order to ensure synchronizations, each row of the original image is then treated by three different kernels : +\begin{itemize} + \item \texttt{compute\_blocks\_prefixes()}. + \item \texttt{scan\_blocksums()}. + \item \texttt{add\_sums2prefixes()}. 
+\end{itemize} +Figures \ref{GPUcumuls}, \ref{GPUscansomblocs} and \ref{GPUaddsoms2cumuls} show relevant data structures for a given row $i$ of $I$. +We assume that each thread block runs $bs$ threads in parallel and each row of $C_z$ needs $n$ blocks to cover its $L$ pixels. + +Figure \ref{GPUcumuls} shows the details of the process for row $i$ of the original image $I$, already stored in GPU global memory. +Operands are first copied into GPU shared memory for efficiency reasons. +An inclusive prefixsum is then performed inside each independant thread block. +At this point, only the first shared memory block contains the final values. Its last element contains the sum of all +elements in the corresponding block of $I$. +In order to obtain the right values for the row $i$ of $C_z$, every element value in the other blocks must then be summed with an offset value. +This offset value is the sum of all element values in every corresponding previous block of row $i$. + +As the scheduling of blocks is fully unpredictable, the necessary intermediate results have to be stored in GPU global memory before exiting from kernel. +Each element of the prefixsums in GPU shared memory has been stored in its corresponding position in $C_z$ (GPU global mem), +along with the vector of block sums which will be passed later to the next kernel \texttt{scan\_blocksums()}. + +The kernel \texttt{scan\_blocksums()} (Figure \ref{GPUscansomblocs}) only makes an exclusive prefixsum on the vector of block sums described above. +The result is a vector containing, at index $x$, the value to be added to every element of block $x$ in each line of $C_z$. + +This summing is done in shared memory by kernel \texttt{add\_sums2prefixes()} as described by Figure \ref{GPUaddsoms2cumuls}. + +The values of $C_{z^2}$ are obtained together with those of $C_{z}$ and in exactly the same way. +For publishing reasons, figures do not show the $C_{z^2}$ part of structures. 
+ + + +\begin{figure*}[htbp] + \centering + \resizebox{0.6\linewidth}{0.2\linewidth}{\input{./img/GPUscansomblocs.pdf_t}} + \caption{\label{GPUscansomblocs}\texttt{scan\_blocksums()} details.} +\end{figure*} + +\begin{figure*}[htbp] + \centering + \resizebox{0.7\linewidth}{0.4\linewidth}{\input{./img/GPUaddsoms2cumuls.pdf_t}} + \caption{\label{GPUaddsoms2cumuls}\texttt{add\_sums2prefixes()} details.} +\end{figure*} + +With this implementation, speedups are quite significant (Table \ref{tabresults}). Moreover, the larger the image, +the higher the speedup is, as the step-complexity of the sequential algorithm is of $O(N^2)$ and $O(N\log(N))$ for the parallel version. +Even higher speedups are achieved by adapting the code to specific-size images, especially when the number of columns is a power of 2. This avoids +inactive threads in the grid, and thus improves efficiency. +However, on sm13 GPUs, these computations are made with a 2-way bank conflict as sums are based on 64-bit words, thus creating overhead. + + +\subsection{Segment contributions} +The choice made for this implementation has been to keep the \emph{1 thread per pixel} rule for the main kernels. +Of course, some reduction stages need to override this principle and will be pointed out. + +As each of the $N_n$ nodes of the snake $S_{n,l}$ may move to one of the eight neighbor positions as shown in \emph{Figure \ref{GPUtopo}}, +there is $16 N_n$ segments whose contribution has to be estimated. +The best combination is then chosen to obtain $S_{n,l+1}$ (Figure \ref{GPUtopo}). +Segment contributions are computed in parallel by kernel \texttt{GPU\_compute\_segments\_contrib()}. + +\begin{figure}[h] + \centering + \resizebox{0.9\linewidth}{0.81\linewidth}{\input{./img/topologie.pdf_t}} + \caption{\label{GPUtopo}topology around nodes} +\end{figure} + +The grid parameters for this kernel are determined according to the size of the longest segment $npix_{max}$. 
+If $bs_{max}$ is the maximum theoretical blocksize that a GPU can accept, +\begin{itemize} + \item the block size $bs$ is taken as + \begin{itemize} + \item $npix_{max}$'s next power of two if \\${npix_{max} \in [33 ; bs_{max} ] }$ + \item 32 if ${npix_{max} \leq 32 }$ + \item $bs_{max}$ if ${npix_{max} > bs_{max} }$ + \end{itemize} + \item the number of thread blocks assigned to each segment, $N_{TB} = \left\lfloor \frac{npix_{max} + bs -1 }{bs} \right\rfloor$ +\end{itemize} +Our implementation makes intensive use of shared memory and does not allow the use of the maximum theoretical blocksizes +(512 for sm13, 1024 for sm20, see \cite{CUDAFT} and \cite{CUDAPG}). +Instead we set $bs_{max}^{sm13} = 256$ and $bs_{max}^{sm20} = 512$. +Anyway, testing has shown that most often, the best value is 256 for both \textit{sm13} and \textit{sm20} GPUs. + +\begin{figure*}[htbp] + \centering + \resizebox{0.6\linewidth}{0.35\linewidth}{\input{./img/contribs_segments.pdf_t}} + \caption{\label{contribs_segments}structure for segments contributions computation. Gray symbols help to locate inactive threads as opposed to black + ones, which represent active threads.} +\end{figure*} + +Then \texttt{GPU\_compute\_segments\_contrib()} computes in parallel : +\begin{itemize} + \item every pixel coordinates for all $16 N_n$ segments. Since the snake is only read in one direction, we have been able + to use a very simple parallel algorithm instead of Bresenham's. +It is based on the slope $k$ of each segment~: one pixel per row if $|k|>1$, one pixel per column otherwise. + \item every pixel contribution by reading the corresponding values in the lookup tables. + \item every thread block's sum of individual pixel contributions by running a \textit{reduction} stage for each block. +\end{itemize} +The top line of Figure \ref{contribs_segments} shows the base data structure in GPU shared memory which is relative to one segment. 
+We concatenate the single segment structure as many times as necessary to create a large vector representing every pixel of every test segment. +As each segment has a different size (most often different from any power of two), there is a non-negligible number of inactive threads scattered in the whole structure. +Two stages are processed separately : one for all even nodes and another one for odd nodes, +as shown in the two bottom lines of Figure \ref{contribs_segments}. + + +The process is entirely done in shared memory ; only a small amount of data needs to be stored in global memory for each segment~: +\begin{itemize} + \item the coordinates of its middle point, in order to be able to add nodes easily if needed. + \item the coordinates of its first two and last two points, to compute the slope at each end of the segment. +\end{itemize} +The five points above are part of the weighting coefficients determination for each segment and node. + + The \texttt{GPU\_sum\_contribs()} kernel takes the block sums obtained by \texttt{GPU\_compute\_segments\_contrib()} and performs a second-stage parallel summation to provide +the $16 N_n$ segment contributions. + +\subsection{Segments with a slope $k$ such that $|k|\leq1$} +Such a segment is treated with 1 thread per column and consequently, it often has more than one pixel per row as shown by Figure \ref{tripix}. +In an image row, consecutive pixels which belong to the target define an interval which can only have one low and one high end. +That's why, on each row, we choose to consider only the contributions of the innermost pixels. +This selection is also done inside \texttt{GPU\_compute\_segments\_contrib()} when reading the lookup tables for each pixel contribution. +We simply set a null contribution for pixels that need to be ignored.
+\begin{figure}[h] + \centering + \resizebox{0.75\linewidth}{0.35\linewidth}{\input{./img/tripix.pdf_t}} + \caption{\label{tripix}Zoom on a part of a segment with $|k| < 1$, at pixel level.} +\end{figure} + + +\subsection{Parameters estimation} +A \texttt{GPU\_compute\_PLH()} kernel computes in parallel : +\begin{itemize} + \item each of the $8N_n$ vectors of parameter values corresponding to each possible next state of the snake. Summing is done in shared memory but relevant + data for these operations are stored in global memory. + \item every associated pseudo-likelihood value. + \item node substitutions when better PLH have been found and if it does not lead to segments crossing. +\end{itemize} + +\subsection{End of segmentation} +Segmentation is considered achieved when no other node can be added to the snake (Algorithm \ref{gpualgosimple}). +A very simple GPU kernel adds every possible node and returns the number it added. + +\begin{algorithm}[h] +\caption{Parallel GPU algorithm : outlines. \texttt{<<<...>>>} indicates a GPU kernel parallel process.} +\label{gpualgosimple} +\SetNlSty{textbf}{}{:} + load images\; + transfer image from CPU to GPU\; + \texttt{<<<}compute the 2 cumulated images\texttt{>>>}\; + \texttt{<<<}initialize the snake\texttt{>>>}\; + \Repeat(\tcc*[f]{iteration level}){no more node can be added}{ + \Repeat(\tcc*[f]{step level}){no more node can be moved}{ + \texttt{<<<}find best neighbor snake\texttt{>>>}\; + \texttt{<<<}adjust node's positions\texttt{>>>}\; + transfer the number of moves achieved from GPU memory to CPU memory. + } + \texttt{<<<}Add nodes\texttt{>>>}\; + transfer the number of nodes added from GPU memory to CPU memory. + } +\end{algorithm} + +\section{\label{secSpeedups}Speedups} +The CPU (SSE) implementation by N. Bertaux from the PhyTI team, based on \cite{AllainBG08} has been our reference to ensure segmentation's quality and to estimate speedups. +Results are given in Table \ref{tabresults}.
+CPU timings were measured on an Intel Xeon E5530-2.4GHz with 12GB RAM (LIFC cluster). +GPU timings were obtained on a C2050 GPU with 3GB RAM (adonis-11.grenoble.grid5000.fr).\\ +Execution times reported are means over ten executions. +%Measurements on CPU may vary more than on GPU. So CPU results given in \ref{tabresults} are near the fastest values we observed. +The image of Figure \ref{fig:labelinit} (scaled down for printing reasons) is a 16-bit gray level photo from PhyTI team, +deliberately noisy for testing reasons. The contrast has been enhanced for better viewing. + +We separately give the timings of pre-computations as they are a very general purpose piece of code. +Segmentations have been performed with strictly the same parameters (initial shape, threshold length). +The neighborhood distance for the first iteration is 32 pixels. It has a slight influence on the +processing time, but it leads to similar speedup values of approximately 7 times faster than CPU. + +Though it does not appear in Table \ref{tabresults}, we observed that during the segmentation stage, higher speedups are obtained in the very first iterations, when segments are made of a lot of pixels, leading to a higher parallelism ratio.\\ +Several factors prevent us from achieving higher speedups~: +\begin{itemize} + \item accesses in the lookup tables in global memory cannot be coalesced. Coalescing would require that the pixel contributions of a segment be stored in consecutive spaces in $C_z$ and $C_{z^2}$. + This is only the case for horizontal segments. + \item the use of 64-bit words for computations in shared memory often leads to 2-way bank conflicts. + \item the level of parallelism is not so high, i.e. the total number of pixels is not large enough to achieve impressive speedups. For example, on a C2050 GPU, a grid can + run about 66 million threads, but a snake in a $10000 \times 10000$ image would be less than 0.1 million pixels long.
+\end{itemize} + + +% \begin{table} +% \begin{center} +% \begin{tabular}{|l| r|r r r|} +% \hline +% && CPU & GPU & Speedup\\\cline{3-5} +% Image 15MP & \bf total & \bf0.51 s & \bf0.06 s & \bf x8.5 \\ +% & pre-comp. & 0.21 s & 0.02 s & x10\\ +% & segment. & 0.34 s & 0.04 s & x8.5\\\hline +% Image 100MP & \bf total & \bf 4.33 s & \bf 0.59 s & \bf x7.3\\ +% & pre-comp. & 1.49 s & 0.13 s & x11\\ +% & segment. & 2.84 s & 0.46 s & x6.1\\\hline +% Image 150Mp & \bf total & \bf 26.4 s & \bf 0.79 s & \bf x33\\ +% & pre-comp. & 8.4 s & 0.20 s & x42\\ +% & segment. & 18.0 s & 0.59 s & x30\\\hline +% \end{tabular} +% \end{center} +% +% \caption{\label{tabresults} GPU (C2050, sm20) vs CPU timings.} +% \end{table} + + +\begin{table} + \begin{center} +\begin{tabular}{|l| r|r r r|} +\hline +&& CPU & GPU & Speedup\\\cline{3-5} +Image 15MP & \bf total & \bf0.51 s & \bf0.06 s & \bf x8.5 \\ + & pre-comp. & 0.13 s & 0.02 s & x6.5\\ + & segment. & 0.46 s & 0.04 s & x11.5\\\hline +Image 100MP & \bf total & \bf 4.08 s & \bf 0.59 s & \bf x6.9\\ + & pre-comp. & 0.91 s & 0.13 s & x6.9\\ + & segment. & 3.17 s & 0.46 s & x6.9\\\hline +Image 150MP & \bf total & \bf 5.7 s & \bf 0.79 s & \bf x7.2\\ + & pre-comp. & 1.4 s & 0.20 s & x7.0\\ + & segment. & 4.3 s & 0.59 s & x7.3\\\hline + \end{tabular} + \end{center} + +\caption{\label{tabresults} GPU (C2050, sm20) vs CPU timings.} +\end{table} + +\IEEEpeerreviewmaketitle + +\vspace{1cm} + +\section{\label{secConclusion}Conclusion} +The algorithm we have focused on is not easy to adapt for high speedups on GPGPU, though we managed to make it work faster than on CPU. +The main drawback is clearly its relatively low level of parallelism. Nevertheless, we proposed different kernels that allowed us to take advantage of the computation power of GPUs. + In future work, we plan to try to benefit from larger computing grids of thread blocks.
 Among the possible solutions, we plan to work on: +\begin{itemize} + \item slicing the image and processing the parts in parallel. This is made possible since sm20 GPUs provide multi-kernel capabilities. + \item slicing the image and processing the parts on two different GPUs, hosted by the same CPU. + \item translating the parallelism from pixel level (\emph{1 thread per pixel}) to snake level (\emph{1 thread per snake}), at least during the first iteration, which +is often the longest lasting one. + \item designing an algorithm, in a GPU way of thinking, instead of adapting the existing CPU-designed algorithm to GPU constraints as we did. +\end{itemize} + + +%%RAPH +%%Est ce qu'on parle du fait qu'on va également réfléchir à repenser l'algo en gpu? + + +% trigger a \newpage just before the given reference +% number - used to balance the columns on the last page +% adjust value as needed - may need to be readjusted if +% the document is modified later +%\IEEEtriggeratref{8} +% The "triggered" command can be changed if desired: +%\IEEEtriggercmd{\enlargethispage{-5in}} + +% references section + + +\bibliographystyle{IEEEtran} + +\bibliography{IEEEabrv,biblio} + + +% that's all folks +\end{document} + +