From 2ab6a29cc145e0798db4fac4804cbb9bf24c9d16 Mon Sep 17 00:00:00 2001 From: asider Date: Thu, 7 Jan 2016 22:35:21 +0100 Subject: [PATCH 1/1] Relecture section Expermients --- Simulations/mpi.eps | 454 ++++++++++++++++++-------------------------- Simulations/omp.eps | 454 ++++++++++++++++++-------------------------- paper.tex | 86 +++++---- 3 files changed, 411 insertions(+), 583 deletions(-) diff --git a/Simulations/mpi.eps b/Simulations/mpi.eps index f287ff5..c0daa18 100644 --- a/Simulations/mpi.eps +++ b/Simulations/mpi.eps @@ -1,7 +1,7 @@ %!PS-Adobe-2.0 -%%Title: mpi.eps +%%Title: MPI.eps %%Creator: gnuplot 4.6 patchlevel 0 -%%CreationDate: Fri Dec 25 23:08:11 2015 +%%CreationDate: Fri Dec 25 21:35:56 2015 %%DocumentFonts: (atend) %%BoundingBox: 50 50 554 770 %%Orientation: Landscape @@ -456,13 +456,13 @@ systemdict /pdfmark known not { userdict /pdfmark systemdict /cleartomark get put } if SDict begin [ - /Title (mpi.eps) + /Title (MPI.eps) /Subject (gnuplot plot) /Creator (gnuplot 4.6 patchlevel 0) /Author (lilia) % /Producer (gnuplot) % /Keywords () - /CreationDate (Fri Dec 25 23:08:11 2015) + /CreationDate (Fri Dec 25 21:35:56 2015) /DOCINFO pdfmark end } ifelse @@ -488,107 +488,165 @@ LTb -63 0 V stroke 854 448 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 1)] +[ [(Helvetica) 140.0 0.0 true true 0 ( 10)] ] -46.7 MRshow 1.000 UL LTb -938 714 M +938 781 M 31 0 V 5978 0 R -31 0 V -938 1066 M +938 976 M 31 0 V 5978 0 R -31 0 V -938 1247 M +938 1114 M 31 0 V 5978 0 R -31 0 V -938 1333 M -63 0 V -5946 0 R --63 0 V -stroke -854 1333 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 10)] -] -46.7 MRshow -1.000 UL -LTb -938 1599 M +938 1221 M +31 0 V +5978 0 R +-31 0 V +938 1308 M +31 0 V +5978 0 R +-31 0 V +938 1382 M 31 0 V 5978 0 R -31 0 V -938 1951 M +938 1447 M 31 0 V 5978 0 R -31 0 V -938 2131 M +938 1503 M 31 0 V 5978 0 R -31 0 V -938 2217 M +938 1554 M 63 0 V 5946 0 R -63 0 V stroke -854 2217 M +854 1554 M [ [(Helvetica) 140.0 0.0 true true 0 ( 100)] ] -46.7 MRshow 
1.000 UL LTb -938 2483 M +938 1887 M 31 0 V 5978 0 R -31 0 V -938 2836 M +938 2081 M 31 0 V 5978 0 R -31 0 V -938 3016 M +938 2219 M 31 0 V 5978 0 R -31 0 V -938 3102 M +938 2327 M +31 0 V +5978 0 R +-31 0 V +938 2414 M +31 0 V +5978 0 R +-31 0 V +938 2488 M +31 0 V +5978 0 R +-31 0 V +938 2552 M +31 0 V +5978 0 R +-31 0 V +938 2609 M +31 0 V +5978 0 R +-31 0 V +938 2660 M 63 0 V 5946 0 R -63 0 V stroke -854 3102 M +854 2660 M [ [(Helvetica) 140.0 0.0 true true 0 ( 1000)] ] -46.7 MRshow 1.000 UL LTb -938 3368 M +938 2992 M +31 0 V +5978 0 R +-31 0 V +938 3187 M +31 0 V +5978 0 R +-31 0 V +938 3325 M +31 0 V +5978 0 R +-31 0 V +938 3432 M 31 0 V 5978 0 R -31 0 V -938 3720 M +938 3520 M 31 0 V 5978 0 R -31 0 V -938 3901 M +938 3594 M 31 0 V 5978 0 R -31 0 V -938 3986 M +938 3658 M +31 0 V +5978 0 R +-31 0 V +938 3715 M +31 0 V +5978 0 R +-31 0 V +938 3765 M 63 0 V 5946 0 R -63 0 V stroke -854 3986 M +854 3765 M [ [(Helvetica) 140.0 0.0 true true 0 ( 10000)] ] -46.7 MRshow 1.000 UL LTb -938 4253 M +938 4098 M +31 0 V +5978 0 R +-31 0 V +938 4293 M +31 0 V +5978 0 R +-31 0 V +938 4431 M 31 0 V 5978 0 R -31 0 V -938 4605 M +938 4538 M 31 0 V 5978 0 R -31 0 V -938 4785 M +938 4626 M +31 0 V +5978 0 R +-31 0 V +938 4700 M +31 0 V +5978 0 R +-31 0 V +938 4764 M +31 0 V +5978 0 R +-31 0 V +938 4820 M 31 0 V 5978 0 R -31 0 V @@ -608,67 +666,27 @@ LTb 0 -63 V stroke 938 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 0)] -] -46.7 MCshow -1.000 UL -LTb -1796 448 M -0 63 V -0 4360 R -0 -63 V -stroke -1796 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 200000)] +[ [(Helvetica) 140.0 0.0 true true 0 (1)] ] -46.7 MCshow 1.000 UL LTb -2655 448 M +2941 448 M 0 63 V 0 4360 R 0 -63 V stroke -2655 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 400000)] +2941 308 M +[ [(Helvetica) 140.0 0.0 true true 0 (2)] ] -46.7 MCshow 1.000 UL LTb -3513 448 M +4944 448 M 0 63 V 0 4360 R 0 -63 V stroke -3513 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 600000)] -] -46.7 MCshow -1.000 UL -LTb -4372 448 M 
-0 63 V -0 4360 R -0 -63 V -stroke -4372 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 800000)] -] -46.7 MCshow -1.000 UL -LTb -5230 448 M -0 63 V -0 4360 R -0 -63 V -stroke -5230 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 1e+06)] -] -46.7 MCshow -1.000 UL -LTb -6089 448 M -0 63 V -0 4360 R -0 -63 V -stroke -6089 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 1.2e+06)] +4944 308 M +[ [(Helvetica) 140.0 0.0 true true 0 (3)] ] -46.7 MCshow 1.000 UL LTb @@ -678,7 +696,7 @@ LTb 0 -63 V stroke 6947 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 1.4e+06)] +[ [(Helvetica) 140.0 0.0 true true 0 (4)] ] -46.7 MCshow 1.000 UL LTb @@ -699,7 +717,7 @@ grestore LTb LCb setrgbcolor 3942 98 M -[ [(Helvetica) 140.0 0.0 true true 0 (Polynomial's degrees)] +[ [(Helvetica) 140.0 0.0 true true 0 (Number of GPUs)] ] -46.7 MCshow LTb 1.000 UP @@ -710,235 +728,129 @@ LTb 3.000 UL LT0 LCb setrgbcolor -1022 4738 M -[ [(Helvetica) 140.0 0.0 true true 0 (Sparse 1 GPU)] -] -46.7 MLshow +6296 4738 M +[ [(Helvetica) 140.0 0.0 true true 0 (200K sparse)] +] -46.7 MRshow LT0 -2114 4738 M +6380 4738 M 399 0 V -1367 1516 M -429 527 V -859 754 V -858 137 V -859 362 V -858 249 V -859 80 V -858 -73 V -1367 1516 Pls -1796 2043 Pls -2655 2797 Pls -3513 2934 Pls -4372 3296 Pls -5230 3545 Pls -6089 3625 Pls -6947 3552 Pls -2313 4738 Pls +938 1336 M +2941 1125 L +2003 -17 V +6947 818 L +938 1336 BoxF +2941 1125 BoxF +4944 1108 BoxF +6947 818 BoxF +6579 4738 BoxF % End plot #1 % Begin plot #2 1.000 UP 3.000 UL LT1 LCb setrgbcolor -1022 4598 M -[ [(Helvetica) 140.0 0.0 true true 0 (Sparse 2 GPU)] -] -46.7 MLshow +6296 4598 M +[ [(Helvetica) 140.0 0.0 true true 0 (200K full)] +] -46.7 MRshow LT1 -2114 4598 M +6380 4598 M 399 0 V -1367 1277 M -429 597 V -859 420 V -858 342 V -859 353 V -858 33 V -859 272 V -858 73 V -1367 1277 Crs -1796 1874 Crs -2655 2294 Crs -3513 2636 Crs -4372 2989 Crs -5230 3022 Crs -6089 3294 Crs -6947 3367 Crs -2313 4598 Crs +938 2013 M +2941 1685 L +4944 1538 L +6947 1430 L +938 2013 
TriUF +2941 1685 TriUF +4944 1538 TriUF +6947 1430 TriUF +6579 4598 TriUF % End plot #2 % Begin plot #3 1.000 UP 3.000 UL LT2 LCb setrgbcolor -1022 4458 M -[ [(Helvetica) 140.0 0.0 true true 0 (Sparse 3 GPU)] -] -46.7 MLshow +6296 4458 M +[ [(Helvetica) 140.0 0.0 true true 0 (800K sparse)] +] -46.7 MRshow LT2 -2114 4458 M +6380 4458 M 399 0 V -1367 1203 M -429 658 V -859 276 V -858 334 V -859 398 V -858 -2 V -859 420 V -858 20 V -1367 1203 Star -1796 1861 Star -2655 2137 Star -3513 2471 Star -4372 2869 Star -5230 2867 Star -6089 3287 Star -6947 3307 Star -2313 4458 Star +938 2902 M +2941 2519 L +4944 2368 L +6947 2052 L +938 2902 BoxF +2941 2519 BoxF +4944 2368 BoxF +6947 2052 BoxF +6579 4458 BoxF % End plot #3 % Begin plot #4 1.000 UP 3.000 UL LT3 LCb setrgbcolor -1022 4318 M -[ [(Helvetica) 140.0 0.0 true true 0 (Sparse 4 GPU)] -] -46.7 MLshow +6296 4318 M +[ [(Helvetica) 140.0 0.0 true true 0 (800K full)] +] -46.7 MRshow LT3 -2114 4318 M +6380 4318 M 399 0 V -1367 1035 M -429 593 V -859 415 V -858 339 V -859 234 V -858 170 V -859 534 V -858 -68 V -1367 1035 Box -1796 1628 Box -2655 2043 Box -3513 2382 Box -4372 2616 Box -5230 2786 Box -6089 3320 Box -6947 3252 Box -2313 4318 Box +938 3346 M +2941 3008 L +4944 2816 L +6947 2690 L +938 3346 TriUF +2941 3008 TriUF +4944 2816 TriUF +6947 2690 TriUF +6579 4318 TriUF % End plot #4 % Begin plot #5 1.000 UP 3.000 UL LT4 LCb setrgbcolor -1022 4178 M -[ [(Helvetica) 140.0 0.0 true true 0 (Full 1 GPU)] -] -46.7 MLshow +6296 4178 M +[ [(Helvetica) 140.0 0.0 true true 0 (1.4M sparse)] +] -46.7 MRshow LT4 -2114 4178 M +6380 4178 M 399 0 V -1367 2018 M -429 566 V -859 630 V -858 313 V -859 124 V -858 271 V -859 66 V -858 120 V -1367 2018 BoxF -1796 2584 BoxF -2655 3214 BoxF -3513 3527 BoxF -4372 3651 BoxF -5230 3922 BoxF -6089 3988 BoxF -6947 4108 BoxF -2313 4178 BoxF +938 3223 M +2941 2990 L +2003 -74 V +2003 -69 V +938 3223 BoxF +2941 2990 BoxF +4944 2916 BoxF +6947 2847 BoxF +6579 4178 BoxF % End plot #5 % Begin plot #6 
1.000 UP 3.000 UL LT5 -LC8 setrgbcolor +LC7 setrgbcolor LCb setrgbcolor -1022 4038 M -[ [(Helvetica) 140.0 0.0 true true 0 (Full 2 GPU)] -] -46.7 MLshow +6296 4038 M +[ [(Helvetica) 140.0 0.0 true true 0 (1.4M full)] +] -46.7 MRshow LT5 -LC8 setrgbcolor -2114 4038 M +LC7 setrgbcolor +6380 4038 M 399 0 V -1367 1950 M -429 372 V -859 639 V -858 365 V -859 55 V -858 297 V -859 120 V -858 10 V -1367 1950 Circle -1796 2322 Circle -2655 2961 Circle -3513 3326 Circle -4372 3381 Circle -5230 3678 Circle -6089 3798 Circle -6947 3808 Circle -2313 4038 Circle +938 3917 M +2941 3542 L +4944 3370 L +6947 3230 L +938 3917 TriUF +2941 3542 TriUF +4944 3370 TriUF +6947 3230 TriUF +6579 4038 TriUF % End plot #6 -% Begin plot #7 -1.000 UP -3.000 UL -LT6 -LCb setrgbcolor -1022 3898 M -[ [(Helvetica) 140.0 0.0 true true 0 (Full 3 GPU)] -] -46.7 MLshow -LT6 -2114 3898 M -399 0 V -1367 1884 M -429 321 V -859 674 V -858 644 V -859 -296 V -858 293 V -859 159 V -858 -9 V -1367 1884 CircleF -1796 2205 CircleF -2655 2879 CircleF -3513 3523 CircleF -4372 3227 CircleF -5230 3520 CircleF -6089 3679 CircleF -6947 3670 CircleF -2313 3898 CircleF -% End plot #7 -% Begin plot #8 -1.000 UP -3.000 UL -LT7 -LCb setrgbcolor -1022 3758 M -[ [(Helvetica) 140.0 0.0 true true 0 (Full 4 GPU)] -] -46.7 MLshow -LT7 -2114 3758 M -399 0 V -1367 1723 M -429 395 V -859 609 V -858 422 V -859 -22 V -858 332 V -859 80 V -858 19 V -1367 1723 TriU -1796 2118 TriU -2655 2727 TriU -3513 3149 TriU -4372 3127 TriU -5230 3459 TriU -6089 3539 TriU -6947 3558 TriU -2313 3758 TriU -% End plot #8 1.000 UL LTb 938 4871 N diff --git a/Simulations/omp.eps b/Simulations/omp.eps index 7397ef4..554272c 100644 --- a/Simulations/omp.eps +++ b/Simulations/omp.eps @@ -1,7 +1,7 @@ %!PS-Adobe-2.0 -%%Title: omp.eps +%%Title: OMP.eps %%Creator: gnuplot 4.6 patchlevel 0 -%%CreationDate: Fri Dec 25 23:10:53 2015 +%%CreationDate: Fri Dec 25 21:39:39 2015 %%DocumentFonts: (atend) %%BoundingBox: 50 50 554 770 %%Orientation: Landscape @@ -456,13 
+456,13 @@ systemdict /pdfmark known not { userdict /pdfmark systemdict /cleartomark get put } if SDict begin [ - /Title (omp.eps) + /Title (OMP.eps) /Subject (gnuplot plot) /Creator (gnuplot 4.6 patchlevel 0) /Author (lilia) % /Producer (gnuplot) % /Keywords () - /CreationDate (Fri Dec 25 23:10:53 2015) + /CreationDate (Fri Dec 25 21:39:39 2015) /DOCINFO pdfmark end } ifelse @@ -488,107 +488,165 @@ LTb -63 0 V stroke 854 448 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 1)] +[ [(Helvetica) 140.0 0.0 true true 0 ( 10)] ] -46.7 MRshow 1.000 UL LTb -938 714 M +938 781 M 31 0 V 5978 0 R -31 0 V -938 1066 M +938 976 M 31 0 V 5978 0 R -31 0 V -938 1247 M +938 1114 M 31 0 V 5978 0 R -31 0 V -938 1333 M -63 0 V -5946 0 R --63 0 V -stroke -854 1333 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 10)] -] -46.7 MRshow -1.000 UL -LTb -938 1599 M +938 1221 M +31 0 V +5978 0 R +-31 0 V +938 1308 M +31 0 V +5978 0 R +-31 0 V +938 1382 M 31 0 V 5978 0 R -31 0 V -938 1951 M +938 1447 M 31 0 V 5978 0 R -31 0 V -938 2131 M +938 1503 M 31 0 V 5978 0 R -31 0 V -938 2217 M +938 1554 M 63 0 V 5946 0 R -63 0 V stroke -854 2217 M +854 1554 M [ [(Helvetica) 140.0 0.0 true true 0 ( 100)] ] -46.7 MRshow 1.000 UL LTb -938 2483 M +938 1887 M 31 0 V 5978 0 R -31 0 V -938 2836 M +938 2081 M 31 0 V 5978 0 R -31 0 V -938 3016 M +938 2219 M 31 0 V 5978 0 R -31 0 V -938 3102 M +938 2327 M +31 0 V +5978 0 R +-31 0 V +938 2414 M +31 0 V +5978 0 R +-31 0 V +938 2488 M +31 0 V +5978 0 R +-31 0 V +938 2552 M +31 0 V +5978 0 R +-31 0 V +938 2609 M +31 0 V +5978 0 R +-31 0 V +938 2660 M 63 0 V 5946 0 R -63 0 V stroke -854 3102 M +854 2660 M [ [(Helvetica) 140.0 0.0 true true 0 ( 1000)] ] -46.7 MRshow 1.000 UL LTb -938 3368 M +938 2992 M +31 0 V +5978 0 R +-31 0 V +938 3187 M +31 0 V +5978 0 R +-31 0 V +938 3325 M +31 0 V +5978 0 R +-31 0 V +938 3432 M 31 0 V 5978 0 R -31 0 V -938 3720 M +938 3520 M 31 0 V 5978 0 R -31 0 V -938 3901 M +938 3594 M 31 0 V 5978 0 R -31 0 V -938 3986 M +938 3658 M +31 0 V +5978 0 R +-31 
0 V +938 3715 M +31 0 V +5978 0 R +-31 0 V +938 3765 M 63 0 V 5946 0 R -63 0 V stroke -854 3986 M +854 3765 M [ [(Helvetica) 140.0 0.0 true true 0 ( 10000)] ] -46.7 MRshow 1.000 UL LTb -938 4253 M +938 4098 M +31 0 V +5978 0 R +-31 0 V +938 4293 M +31 0 V +5978 0 R +-31 0 V +938 4431 M 31 0 V 5978 0 R -31 0 V -938 4605 M +938 4538 M 31 0 V 5978 0 R -31 0 V -938 4785 M +938 4626 M +31 0 V +5978 0 R +-31 0 V +938 4700 M +31 0 V +5978 0 R +-31 0 V +938 4764 M +31 0 V +5978 0 R +-31 0 V +938 4820 M 31 0 V 5978 0 R -31 0 V @@ -608,67 +666,27 @@ LTb 0 -63 V stroke 938 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 0)] -] -46.7 MCshow -1.000 UL -LTb -1796 448 M -0 63 V -0 4360 R -0 -63 V -stroke -1796 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 200000)] +[ [(Helvetica) 140.0 0.0 true true 0 (1)] ] -46.7 MCshow 1.000 UL LTb -2655 448 M +2941 448 M 0 63 V 0 4360 R 0 -63 V stroke -2655 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 400000)] +2941 308 M +[ [(Helvetica) 140.0 0.0 true true 0 (2)] ] -46.7 MCshow 1.000 UL LTb -3513 448 M +4944 448 M 0 63 V 0 4360 R 0 -63 V stroke -3513 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 600000)] -] -46.7 MCshow -1.000 UL -LTb -4372 448 M -0 63 V -0 4360 R -0 -63 V -stroke -4372 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 800000)] -] -46.7 MCshow -1.000 UL -LTb -5230 448 M -0 63 V -0 4360 R -0 -63 V -stroke -5230 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 1e+06)] -] -46.7 MCshow -1.000 UL -LTb -6089 448 M -0 63 V -0 4360 R -0 -63 V -stroke -6089 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 1.2e+06)] +4944 308 M +[ [(Helvetica) 140.0 0.0 true true 0 (3)] ] -46.7 MCshow 1.000 UL LTb @@ -678,7 +696,7 @@ LTb 0 -63 V stroke 6947 308 M -[ [(Helvetica) 140.0 0.0 true true 0 ( 1.4e+06)] +[ [(Helvetica) 140.0 0.0 true true 0 (4)] ] -46.7 MCshow 1.000 UL LTb @@ -699,7 +717,7 @@ grestore LTb LCb setrgbcolor 3942 98 M -[ [(Helvetica) 140.0 0.0 true true 0 (Polynomial's degrees)] +[ [(Helvetica) 140.0 0.0 true true 0 (Number of GPUs)] ] -46.7 
MCshow LTb 1.000 UP @@ -710,235 +728,129 @@ LTb 3.000 UL LT0 LCb setrgbcolor -1022 4738 M -[ [(Helvetica) 140.0 0.0 true true 0 (Sparse 1 GPU)] -] -46.7 MLshow +6296 4738 M +[ [(Helvetica) 140.0 0.0 true true 0 (200K sparse)] +] -46.7 MRshow LT0 -2114 4738 M +6380 4738 M 399 0 V -1367 1518 M -429 563 V -859 487 V -858 331 V -859 267 V -858 122 V -859 360 V -858 -62 V -1367 1518 Pls -1796 2081 Pls -2655 2568 Pls -3513 2899 Pls -4372 3166 Pls -5230 3288 Pls -6089 3648 Pls -6947 3586 Pls -2313 4738 Pls +938 1383 M +2941 1058 L +2003 29 V +6947 847 L +938 1383 BoxF +2941 1058 BoxF +4944 1087 BoxF +6947 847 BoxF +6579 4738 BoxF % End plot #1 % Begin plot #2 1.000 UP 3.000 UL LT1 LCb setrgbcolor -1022 4598 M -[ [(Helvetica) 140.0 0.0 true true 0 (Sparse 2 GPU)] -] -46.7 MLshow +6296 4598 M +[ [(Helvetica) 140.0 0.0 true true 0 (200K full)] +] -46.7 MRshow LT1 -2114 4598 M +6380 4598 M 399 0 V -1367 1297 M -429 524 V -859 474 V -858 342 V -859 253 V -858 149 V -859 251 V -858 143 V -1367 1297 Crs -1796 1821 Crs -2655 2295 Crs -3513 2637 Crs -4372 2890 Crs -5230 3039 Crs -6089 3290 Crs -6947 3433 Crs -2313 4598 Crs +938 2014 M +2941 1703 L +4944 1544 L +6947 1437 L +938 2014 TriUF +2941 1703 TriUF +4944 1544 TriUF +6947 1437 TriUF +6579 4598 TriUF % End plot #2 % Begin plot #3 1.000 UP 3.000 UL LT2 LCb setrgbcolor -1022 4458 M -[ [(Helvetica) 140.0 0.0 true true 0 (Sparse 3 GPU)] -] -46.7 MLshow +6296 4458 M +[ [(Helvetica) 140.0 0.0 true true 0 (800K sparse)] +] -46.7 MRshow LT2 -2114 4458 M +6380 4458 M 399 0 V -1367 1247 M -429 597 V -859 316 V -858 328 V -859 212 V -858 205 V -859 474 V -858 -54 V -1367 1247 Star -1796 1844 Star -2655 2160 Star -3513 2488 Star -4372 2700 Star -5230 2905 Star -6089 3379 Star -6947 3325 Star -2313 4458 Star +938 2739 M +2941 2395 L +4944 2157 L +6947 2010 L +938 2739 BoxF +2941 2395 BoxF +4944 2157 BoxF +6947 2010 BoxF +6579 4458 BoxF % End plot #3 % Begin plot #4 1.000 UP 3.000 UL LT3 LCb setrgbcolor -1022 4318 M -[ [(Helvetica) 140.0 
0.0 true true 0 (Sparse 4 GPU)] -] -46.7 MLshow +6296 4318 M +[ [(Helvetica) 140.0 0.0 true true 0 (800K full)] +] -46.7 MRshow LT3 -2114 4318 M +6380 4318 M 399 0 V -1367 1128 M -429 524 V -859 417 V -858 315 V -859 199 V -858 188 V -859 501 V -858 35 V -1367 1128 Box -1796 1652 Box -2655 2069 Box -3513 2384 Box -4372 2583 Box -5230 2771 Box -6089 3272 Box -6947 3307 Box -2313 4318 Box +938 3315 M +2941 3006 L +4944 2820 L +6947 2685 L +938 3315 TriUF +2941 3006 TriUF +4944 2820 TriUF +6947 2685 TriUF +6579 4318 TriUF % End plot #4 % Begin plot #5 1.000 UP 3.000 UL LT4 LCb setrgbcolor -1022 4178 M -[ [(Helvetica) 140.0 0.0 true true 0 (Full 1 GPU)] -] -46.7 MLshow +6296 4178 M +[ [(Helvetica) 140.0 0.0 true true 0 (1.4M sparse)] +] -46.7 MRshow LT4 -2114 4178 M +6380 4178 M 399 0 V -1367 2017 M -429 568 V -859 623 V -858 346 V -859 73 V -858 304 V -859 100 V -858 86 V -1367 2017 BoxF -1796 2585 BoxF -2655 3208 BoxF -3513 3554 BoxF -4372 3627 BoxF -5230 3931 BoxF -6089 4031 BoxF -6947 4117 BoxF -2313 4178 BoxF +938 3264 M +2941 3074 L +4944 2938 L +2003 -22 V +938 3264 BoxF +2941 3074 BoxF +4944 2938 BoxF +6947 2916 BoxF +6579 4178 BoxF % End plot #5 % Begin plot #6 1.000 UP 3.000 UL LT5 -LC8 setrgbcolor +LC7 setrgbcolor LCb setrgbcolor -1022 4038 M -[ [(Helvetica) 140.0 0.0 true true 0 (Full 2 GPU)] -] -46.7 MLshow +6296 4038 M +[ [(Helvetica) 140.0 0.0 true true 0 (1.4M full)] +] -46.7 MRshow LT5 -LC8 setrgbcolor -2114 4038 M +LC7 setrgbcolor +6380 4038 M 399 0 V -1367 1940 M -429 397 V -859 637 V -858 327 V -859 78 V -858 297 V -859 81 V -858 79 V -1367 1940 Circle -1796 2337 Circle -2655 2974 Circle -3513 3301 Circle -4372 3379 Circle -5230 3676 Circle -6089 3757 Circle -6947 3836 Circle -2313 4038 Circle +938 3929 M +2941 3577 L +4944 3377 L +6947 3237 L +938 3929 TriUF +2941 3577 TriUF +4944 3377 TriUF +6947 3237 TriUF +6579 4038 TriUF % End plot #6 -% Begin plot #7 -1.000 UP -3.000 UL -LT6 -LCb setrgbcolor -1022 3898 M -[ [(Helvetica) 140.0 0.0 true true 0 
(Full 3 GPU)] -] -46.7 MLshow -LT6 -2114 3898 M -399 0 V -1367 1893 M -429 316 V -859 662 V -858 654 V -859 -295 V -858 297 V -859 105 V -858 44 V -1367 1893 CircleF -1796 2209 CircleF -2655 2871 CircleF -3513 3525 CircleF -4372 3230 CircleF -5230 3527 CircleF -6089 3632 CircleF -6947 3676 CircleF -2313 3898 CircleF -% End plot #7 -% Begin plot #8 -1.000 UP -3.000 UL -LT7 -LCb setrgbcolor -1022 3758 M -[ [(Helvetica) 140.0 0.0 true true 0 (Full 4 GPU)] -] -46.7 MLshow -LT7 -2114 3758 M -399 0 V -1367 1719 M -429 405 V -859 625 V -858 411 V -859 -37 V -858 329 V -859 105 V -858 7 V -1367 1719 TriU -1796 2124 TriU -2655 2749 TriU -3513 3160 TriU -4372 3123 TriU -5230 3452 TriU -6089 3557 TriU -6947 3564 TriU -2313 3758 TriU -% End plot #8 1.000 UL LTb 938 4871 N diff --git a/paper.tex b/paper.tex index be7faa0..d301646 100644 --- a/paper.tex +++ b/paper.tex @@ -701,11 +701,11 @@ Algorithm~\ref{alg1-cuda} shows the GPU parallel implementation of Ehrlich-Abert -\section{The EA algorithm on Multi-GPU} +\section{The EA algorithm on Multiple GPUs} \label{sec4} -\subsection{MGPU : an OpenMP-CUDA approach} +\subsection{M-GPU : an OpenMP-CUDA approach} Our OpenMP-CUDA implementation of EA algorithm is based on the hybrid OpenMP and CUDA programming model. It works as follows. -Based on the metadata, a shared memory is used to make data evenly shared among OpenMP threads. The shared data are the solution vector $Z$, the polynomial to solve $P$, and the error vector $\Delta z$. Let (T\_omp) the number of OpenMP threads be equal to the number of GPUs, each OpenMP thread binds to one GPU, and controls a part of the shared memory, that is a part of the vector Z , that is $(n/num\_gpu)$ roots where $n$ is the polynomial's degree and $num\_gpu$ the total number of available GPUs. 
Each OpenMP thread copies its data from host memory to GPU’s device memory.Then every GPU will have a grid of computation organized according to the device performance and the size of data on which it runs the computation kernels. %In principle a grid is set by two parameter DimGrid, the number of block per grid, DimBloc: the number of threads per block. The following schema shows the architecture of (CUDA,OpenMP). +Based on the metadata, a shared memory is used to make data evenly shared among OpenMP threads. The shared data are the solution vector $Z$, the polynomial to solve $P$, and the error vector $\Delta z$. Let (T\_omp) the number of OpenMP threads be equal to the number of GPUs, each OpenMP thread binds to one GPU, and controls a part of the shared memory, that is a part of the vector Z , that is $(n/num\_gpu)$ roots where $n$ is the polynomial's degree and $num\_gpu$ the total number of available GPUs. Each OpenMP thread copies its data from host memory to GPU’s device memory. Then every GPU will have a grid of computation organized according to the device performance and the size of data on which it runs the computation kernels. %In principle a grid is set by two parameter DimGrid, the number of block per grid, DimBloc: the number of threads per block. The following schema shows the architecture of (CUDA,OpenMP). %\begin{figure}[htbp] %\centering @@ -817,8 +817,8 @@ We study two categories of polynomials: sparse polynomials and full polynomials. For our tests, a CPU Intel(R) Xeon(R) CPU E5620@2.40GHz and a GPU K40 (with 6 Go of ram) are used. %SIDER : Une meilleure présentation de l'architecture est à faire ici. -In order to evaluate both the MGPU and Multi-GPU approaches, we performed a set of experiments on a single GPU and multiple GPUs using OpenMP or MPI by EA algorithm, for both sparse and full polynomials of different sizes. 
-All experimental results obtained are made in double precision data, the convergence threshold of the methods is set to $10^{-7}$. +In order to evaluate both the M-GPU and Multi-GPU approaches, we performed a set of experiments on a single GPU and multiple GPUs using OpenMP or MPI with the EA algorithm, for both sparse and full polynomials of different sizes. +All experiments are performed in double precision, and the convergence threshold of the EA method is set to $10^{-7}$. %Since we were more interested in the comparison of the %performance behaviors of Ehrlich-Aberth and Durand-Kerner methods on %CPUs versus on GPUs. @@ -827,117 +827,121 @@ of the methods are given in %Section~\ref{sec:vec_initialization}. \subsection{Evaluating the M-GPU (CUDA-OpenMP) approach} -We report here the results of the set of experiments with M-GPU approach for full and sparse polynomials of different degrees, and we compare it with a Single GPU execution. -\subsubsection{Execution times in seconds of the EA method for solving sparse polynomials on GPUs using shared memory paradigm with OpenMP} +We report here the results of the set of experiments with the M-GPU approach for full and sparse polynomials of different degrees, and we compare them with a single GPU execution. +\subsubsection{Execution time of the EA method for solving sparse polynomials on multiple GPUs using the M-GPU approach} In this experiments we report the execution time of the EA algorithm, on single GPU and Multi-GPU with (2,3,4) GPUs, for different sparse polynomial degrees ranging from 100,000 to 1,400,000. 
\begin{figure}[htbp] \centering \includegraphics[angle=-90,width=0.5\textwidth]{Sparse_omp} -\caption{Execution times in seconds of the Ehrlich-Aberth method for solving sparse polynomials on GPUs using shared memory paradigm with OpenMP} +\caption{Execution time in seconds of the Ehrlich-Aberth method for solving sparse polynomials on multiple GPUs using the M-GPU approach} \label{fig:01} \end{figure} -This figure~\ref{fig:01} shows that the (CUDA-OpenMP) Multi-GPU approach reduces the execution time by a factor up to 100 w.r.t the single GPU apparaoch and a by a factor of 1000 for polynomials exceeding degree 1,000,000. It shows the advantage to use the OpenMP parallel paradigm to gather the capabilities of several GPUs and solve polynomials of very high degrees. +Figure~\ref{fig:01} shows that the (CUDA-OpenMP) M-GPU approach reduces the execution time by a factor of up to 100 w.r.t.\ the single GPU approach and by a factor of 1000 for polynomials exceeding degree 1,000,000. It shows the advantage of using the OpenMP parallel paradigm to gather the capabilities of several GPUs and solve polynomials of very high degrees. -\subsubsection{Execution times in seconds of the Ehrlich-Aberth method for solving full polynomials on GPUs using shared memory paradigm with OpenMP} +\subsubsection{Execution time in seconds of the Ehrlich-Aberth method for solving full polynomials on multiple GPUs using the M-GPU approach} The experiments shows the execution time of the EA algorithm, on a single GPU and on multiple GPUs using the CUDA OpenMP approach for full polynomials of degrees ranging from 100,000 to 1,400,000. 
\begin{figure}[htbp] \centering \includegraphics[angle=-90,width=0.5\textwidth]{Full_omp} -\caption{Execution times in seconds of the Ehrlich-Aberth method for solving full polynomials on multiple GPUs using shared memory paradigm with OpenMP} +\caption{Execution time in seconds of the Ehrlich-Aberth method for solving full polynomials on multiple GPUs using the M-GPU approach} \label{fig:03} \end{figure} Results with full polynomials show very important savings in execution time. For a polynomial of degree 1,4 million, the CUDA-OpenMP approach with 4 GPUs solves it 4 times as fast as single GPU, thus achieving a quasi-linear speedup. \subsection{Evaluating the Multi-GPU (CUDA-MPI) approach} -In this part we perform a set of experiments to compare Multi-GPU (CUDA MPI) approach with single GPU, for solving full and sparse polynomials of degrees ranging from 100,000 to 1,400,000. +In this part we perform a set of experiments to compare the Multi-GPU (CUDA-MPI) approach with a single GPU, for solving full and sparse polynomials of degrees ranging from 100,000 to 1,400,000. -\subsubsection{Execution times in seconds of the Ehrlich-Aberth method for solving sparse polynomials on GPUs using distributed memory paradigm with MPI} +\subsubsection{Execution time of the Ehrlich-Aberth method for solving sparse polynomials on multiple GPUs using the Multi-GPU approach} \begin{figure}[htbp] \centering \includegraphics[angle=-90,width=0.5\textwidth]{Sparse_mpi} -\caption{Execution times in seconds of the Ehrlich-Aberth method for solving sparse polynomials on GPUs using distributed memory paradigm with MPI} +\caption{Execution time in seconds of the Ehrlich-Aberth method for solving sparse polynomials on multiple GPUs using the Multi-GPU approach} \label{fig:02} \end{figure} ~\\ -This figure shows 4 curves of execution time of EA algorithm, a curve with single GPU, 3 curves with multiple GPUs (2, 3, 4). 
We can clearly see that the curve with single GPU is above the other curves, which shows consumption in execution time compared to the Multi-GPU. We can see also that the CUDA-MPI approach reduces the execution time by a factor of 100 for polynomials of degree more than 1,000,000 whereas a single GPU is of the scale 1000. +Figure~\ref{fig:02} shows the execution time of the EA algorithm for a single GPU and for multiple GPUs (2, 3, 4) on 2, 3 and 4 MPI nodes respectively. We can clearly see that the curve for a single GPU is above the other curves, which shows its longer execution time compared to the Multi-GPU approach. We can also see that the CUDA-MPI approach reduces the execution time by a factor of 10 for polynomials of degree more than 1,000,000. For example, at degree 1,000,000, the execution time with a single GPU amounts to about 10 thousand seconds, while with 4 GPUs it is lowered to about one thousand seconds, which makes for a tenfold speedup. %%SIDER : Je n'ai pas reformuler car je n'ai pas compris la phrase, merci de l'ecrire ici en fran\cais. \\cette figure montre 4 courbes de temps d'exécution pour l'algorithme EA, une courbe avec un seul GPU, 3 courbes pour multiple GPUs(2, 3, 4), on peut constaté clairement que la courbe à un seul GPU est au-dessus des autres courbes, vue sa consomation en temps d'exècution. On peut voir aussi qu'avec l'approche Multi-GPU (CUDA-MPI) reduit le temps d'exècution jusqu'à l'echelle 100 pour le polynômes qui dépasse 1,000,000 tandis que Single GPU est de l'echelle 1000. 
-\subsubsection{Execution times in seconds of the Ehrlich-Aberth method for solving full polynomials on GPUs using distributed memory paradigm with MPI} +\subsubsection{Execution time of the Ehrlich-Aberth method for solving full polynomials on multiple GPUs using the Multi-GPU approach} \begin{figure}[htbp] \centering - \includegraphics[angle=-90,width=0.5\textwidth]{Full_mpi} -\caption{Execution times in seconds of the Ehrlich-Aberth method for full polynomials on GPUs using distributed memory paradigm with MPI} + \includegraphics[angle=-90,width=0.5\textwidth]{Full_mpi} +\caption{Execution time in seconds of the Ehrlich-Aberth method for solving full polynomials on multiple GPUs using the Multi-GPU approach} \label{fig:04} \end{figure} -%SIDER : Corriger le point de la courbe 3-GPUs qui correpsond à un degré de 600000 -Figure \ref{fig:04} shows the execution time of the algorithm on single GPU and on multipe GPUs with (2, 3, 4) GPUs for full polynomials. With the CUDA-MPI approach, we notice that the three curves are distinct from each other, more we use GPUs more the execution time decreases. On the other hand the curve with a single GPU is well above the other curves. + + Figure~\ref{fig:04} shows the execution time for a single GPU and for multiple GPUs (2, 3, 4) on 2, 3 and 4 MPI nodes respectively. With the CUDA-MPI approach, we notice that the three curves are distinct from each other: the more GPUs we use, the more the execution time decreases. On the other hand, the curve for a single GPU is well above the other curves. This is due to the use of MPI parallel paradigm that divides the problem computations and assigns portions to each GPU. But unlike the single GPU which carries all the computations on a single GPU, data communications are introduced, consequently engendering more execution time. But experiments show that execution time is still highly reduced. 
-\subsection{Comparative between (CUDA-OpenMP) approach and (CUDA-MPI) approach} -In this part we present some experiment comparing the two Multi-GPU approach (OpenMP versus MPI) for solving sparse polynomial, full polynomials than we compare the execution time of the Ehrlich-Aberth method for solving sparse and full polynomials on GPUs with MPI and with OpenMP. +\subsection{Comparing the CUDA-OpenMP approach and the CUDA-MPI approach} + +In the previous section we saw that both approaches are very effective in reducing execution time for sparse as well as full polynomials. At this stage, the interesting question is which approach is better. In the following, we present appropriate experiments comparing the two Multi-GPU approaches to answer the question. -\subsubsection{Comparison between MPI and OpenMP versions of the Ehrlich-Aberth method for solving sparse polynomials on GPUs} -In this experiment we chose three polynomials of different size like (200K, 800K, 1,4M). We compare their execution time according to the number of the GPUs. +\subsubsection{Solving sparse polynomials} +In this experiment three sparse polynomials of size 200K, 800K and 1.4M are investigated. \begin{figure}[htbp] \centering \includegraphics[angle=-90,width=0.5\textwidth]{Sparse} -\caption{Comparison between MPI and OpenMP versions of the Ehrlich-Aberth method for solving sparse polynomials on GPUs.} +\caption{Execution time of the Ehrlich-Aberth method for solving sparse polynomials of three distinct sizes on multiple GPUs with the MPI and OpenMP approaches} \label{fig:05} \end{figure} -in figure ~\ref{fig:05} we have two curves: MPI curve and OpenMP curve for each polynomials size. We can see that the results are similar between OpenMP curves and MPI curves for the polynomials size (200K, 1,4M), but there is a slight different between MPI curve and OpenMP curve for the polynomial of size 800K. ... 
+In Figure~\ref{fig:05} there are two curves for each polynomial size: one for the CUDA-MPI approach and another for the CUDA-OpenMP approach. We can see that the results are similar between OpenMP and MPI for the polynomial of size 200K. For the size of 800K, the MPI version is a little slower than the OpenMP version, but for the 1,4M size there is a slight advantage for the MPI version. -\subsubsection{Comparison between MPI and OpenMP versions of the Ehrlich-Aberth method for solving full polynomials on GPUs} +\subsubsection{Solving full polynomials} \begin{figure}[htbp] \centering \includegraphics[angle=-90,width=0.5\textwidth]{Full} -\caption{Comparison between MPI and OpenMP versions of the Ehrlich-Aberth method for solving full polynomials on GPUs.} +\caption{Execution time of the Ehrlich-Aberth method for solving full polynomials of three distinct sizes on multiple GPUs using the MPI and OpenMP approaches} \label{fig:06} \end{figure} -in figure ~\ref{fig:06}, we can see that the two paradigm MPI and OpenMP give the same result for solving full polynomials with EA algorithm. -% size (200k,800K, 1,4M) are very similar for solving full polynomials with the EA algorithm. +In Figure~\ref{fig:06}, we can see that when it comes to full polynomials, both approaches are almost equivalent. -\subsubsection{Comparison of execution times of the Ehrlich-Aberth method for solving sparse and full polynomials on GPUs with distributed memory paradigm using MPI} -in this experiment we compare the execution time of EA algorithm according to the number of the GPU for solving sparse and full polynomials on Multi-GPU using MPI. We chose three sparse and full polynomials of different size like (200K, 800K, 1,4M). +\subsubsection{Solving sparse and full polynomials of the same size with CUDA-MPI} +In this experiment we compare the execution time of the EA algorithm according to the number of GPUs for solving sparse and full polynomials on Multi-GPU using MPI. 
 We chose three sparse and full polynomials of size 200K, 800K and 1,4M. \begin{figure}[htbp] \centering \includegraphics[angle=-90,width=0.5\textwidth]{MPI} -\caption{Comparison of execution times of the Ehrlich-Aberth method for solving sparse and full polynomials on GPUs with distributed memory paradigm using MPI.} +\caption{Execution time for solving sparse and full polynomials of three distinct sizes on multiple GPUs using MPI} \label{fig:07} \end{figure} -in figure ~\ref{fig:07} we can see that (CUDA-MPI) can solve sparse and full polynomials of high degrees, the execution time with sparse polynomial are very low comparing to full polynomials. with sparse polynomials the number of monomial are reduce, consequently the number of operation are reduce than the execution time decrease. +In Figure~\ref{fig:07} we can see that CUDA-MPI can solve sparse and full polynomials of high degrees, and that the execution times for sparse polynomials are very low compared to those for full polynomials. With sparse polynomials the number of monomials is reduced; consequently the number of operations is reduced and the execution time decreases. 
-\subsubsection{Comparison of execution times of the Ehrlich-Aberth method for solving sparse and full polynomials on GPUs with shared memory paradigm using OpenMP} +\subsubsection{Solving sparse and full polynomials of the same size with CUDA-OpenMP} \begin{figure}[htbp] \centering \includegraphics[angle=-90,width=0.5\textwidth]{OMP} -\caption{Comparison of execution times of the Ehrlich-Aberth method for solving sparse and full polynomials on GPUs with shared memory paradigm using OpenMP.} +\caption{Execution time for solving sparse and full polynomials of three distinct sizes on multiple GPUs using OpenMP} \label{fig:08} \end{figure} -in figure ~\ref{fig:08} -\subsection{Scalability of the EA method on Multi-GPU to solve very high polynomials degrees} - This experiment we report the execution time according to the degrees polynomials ranging from 1,000,000 to 5,000,000 for both approaches (cuda-OpenMP) and (CUDA-MPI) with sparse and full polynomials. +Figure~\ref{fig:08} shows the impact of sparsity on the effectiveness of the CUDA-OpenMP approach. We can see that the impact follows the same pattern: a difference in execution time in favor of the sparse polynomials. +%SIDER : il faut une explication ici. je ne vois pas de prime abords, qu'est-ce qui engendre cette différence, car quelques soient les coefficients nulls ou non nulls, c'est toutes les racines qui sont calculées qu'elles soient similaires ou non (degrés de multiplicité). +\subsection{Scalability of the EA method on Multi-GPU to solve very high degree polynomials} +These experiments report the execution time according to the degree of the polynomials, ranging from 1,000,000 to 5,000,000, for both approaches with sparse and full polynomials. 
 \begin{figure}[htbp] \centering \includegraphics[angle=-90,width=0.5\textwidth]{big} - \caption{Execution times in seconds of the Ehrlich-Aberth method for solving full polynomials of high degrees on 4 GPUs.} + \caption{Execution times in seconds of the Ehrlich-Aberth method for solving full polynomials of high degree on 4 GPUs for sizes ranging from 1M to 5M} \label{fig:09} \end{figure} -in figure ~\ref{fig:09} we can see that both (cuda-OpenMP) and (CUDA-MPI) approaches are scalable can solve very high polynomials degrees. with full polynomial the both approaches give very interesting ans similar results for polynomials of 5,000,000 degrees we not reach 30,000 s +In Figure~\ref{fig:09} we can see that both approaches are scalable and can solve very high degree polynomials. With full polynomials, both approaches give, interestingly, very similar results. For the sparse case, however, there is a noticeable difference in favour of MPI when the degree is above 4M. Between 1M and 3M, the OMP approach is more effective, and under 1M degree the OMP and MPI approaches are almost equivalent. + +%SIDER : il faut une explication sur les différences ici aussi. + %for sparse and full polynomials % An example of a floating figure using the graphicx package. % Note that \label must occur AFTER (or within) \caption. -- 2.39.5