@Conference{ch19:spmv-iccs,
author = {Dang, H.-V. and Schmidt, B.},
- title = {{The Sliced COO format for Sparse Matrix-Vector Multiplication on CUDA-enabled GPUs}},
+ title = {The Sliced {COO} format for Sparse Matrix-Vector Multiplication on {CUDA}-enabled {GPUs}},
year = {2012},
pages = {57-66},
- booktitle = {Proc. ICCS, Procedia Vol. 9}
+ booktitle = {International Conference on Computational Science ICCS, Procedia Vol. 9}
}
@article{ch19:spmv-ccpe,
Processing Unit Systems}},
journal = {Concurrency and Computation: Practice and Experience},
volume = {25},
- number = {4},
+ issue = {4},
year = {2013},
pages = {586-603},
}
@misc{ch19:matview,
author = { J. Kohl },
- url = {http://www.csm.ornl.gov/~kohl/MatView/},
- title = {MatView: Scalable Sparse Matrix Viewer},
- year = {2008},
+ note = {http://www.csm.ornl.gov/~kohl/MatView/},
+ title = {Mat{V}iew: Scalable Sparse Matrix Viewer},
+ OPTyear = {2008},
}
@inproceedings{ch19:nvidia-spmv,
@article{ch19:bellpack,
author = {Choi, J. W. and Singh, A. and Vuduc, R. W.},
- title = {{Model-driven autotuning of sparse matrix-vector multiply on GPUs}},
- journal = {SIGPLAN Not.},
+ title = {Model-driven autotuning of sparse matrix-vector multiply on {GPUs}},
+ journal = {ACM SIGPLAN Notices - PPoPP '10},
volume = {45},
issue = {5},
month = {January},
author = {Bertil Schmidt and
Hans Aribowo and
Hoang-Vu Dang},
- title = { {Iterative Sparse Matrix-Vector Multiplication for Integer
- Factorization on GPUs} },
+ title = { Iterative Sparse Matrix-Vector Multiplication for Integer
+ Factorization on {GPUs} },
booktitle = {Euro-Par (2)},
year = {2011},
pages = {413-424},
@article{ch19:Thome:subqad,
author = {Thom\'{e}, E.},
- title = { {Subquadratic Computation of Vector Generating Polynomials and Improvement of the Block Wiedemann Algorithm} },
+ title = { Subquadratic Computation of Vector Generating Polynomials and Improvement of the Block {W}iedemann Algorithm },
journal = {J. Symb. Comput.},
volume = {33},
issue = {5},
@InProceedings{ch19:aoki,
author = {Aoki, K. and Shimoyama, T. and Ueda, H.},
- title = {{Experiments on the Linear Algebra Step in the Number Field Sieve}},
+ title = {Experiments on the Linear Algebra Step in the Number Field Sieve},
booktitle = {Proceedings of the Security 2nd international conference on Advances in information and computer security},
series = {IWSEC'07},
year = {2007},
@InProceedings{ch19:kilobit,
author = {K. Aoki and J. Franke and T. Kleinjung and A. K. Lenstra and D. A. Osvik},
-title = {{A Kilobit Special Number Field Sieve Factorization.}},
+title = {A Kilobit Special Number Field Sieve Factorization.},
+address ={Kuching, Malaysia},
booktitle = {ASIACRYPT},
year = {2007}
}
@techreport{ch19:nvidia,
author = {N. Bell and M. Garland},
- title = {{Efficient Sparse Matrix-Vector Multiplication on {CUDA}}},
+ title = {Efficient Sparse Matrix-Vector Multiplication on {CUDA}},
month = dec,
year = {2008},
institution = {NVIDIA Corporation},
author = {B. Boyer and
J.-G. Dumas and
P. Giorgi},
- title = {{Exact Sparse Matrix-Vector Multiplication on GPU's and Multicore
- Architectures}},
+ title = {Exact Sparse Matrix-Vector Multiplication on GPU's and Multicore
+ Architectures},
journal = {CoRR},
volume = {abs/1004.3719},
year = {2010},
@article{ch19:bw,
author = {D. Coppersmith},
-title = {{Solving Homogeneous Linear Equations Over GF(2) via Block Wiedemann Algorithm}},
+title = {Solving Homogeneous Linear Equations Over {GF(2)} via Block {W}iedemann Algorithm},
journal = {Mathematics of Computation},
volume = {62},
year = {1994},
D. Kim},
title = {{Load Balanced Block Lanczos Algorithm over GF(2) for Factorization
of Large Keys}},
- booktitle = {HiPC},
+ booktitle = {High Performance Computing, HiPC},
+address = {Bangalore, India},
year = {2006},
pages = {375-386},
ee = {http://dx.doi.org/10.1007/11945918_38},
@InProceedings{ch19:rsa768,
author = {{T. Kleinjung et al.}},
-title = {{Factorization of a 768-Bit RSA Modulus}},
+title = {Factorization of a 768-bit {RSA} modulus},
booktitle = {International Crytology Conference},
year = {2010},
+adress= {Melaka, Malaysia},
pages = {333--350},
doi = {10.1007/978-3-642-14623-7_18},
masid = {13669015}
author = {{T. Kleinjung et al.}},
HAL_ID = {inria-00535765},
_URL = {http://hal.inria.fr/inria-00535765/en/},
- title = { {A} {H}eterogeneous {C}omputing {E}nvironment to {S}olve the 768-bit {RSA} {C}hallenge},
- language = {{A}nglais},
- affiliation = {{L}aboratory for {C}ryptologic {A}lgorithms - {LACAL} - {\'E}cole {P}olytechnique {F}{\'e}d{\'e}rale de {L}ausanne - {NTT} {I}nformation {S}haring {P}latform {L}aboratories - {ISL} - {N}ippon {T}elegraph \& {T}elephone {C}orporation - {NTT} - {S}ilverbrook {R}esearch - silverbrook research - {D}epartment of {M}athematics - {U}niversity of {B}onn - {B}onn {U}niversit{\"a}t - {U}niversity of {B}onn - {CARAMEL} - {INRIA} {N}ancy - {G}rand {E}st / {LORIA} - {INRIA} - {CNRS} : {UMR}7503 - {U}niversit{\'e} {H}enri {P}oincar{\'e} - {N}ancy {I} - {U}niversit{\'e} {N}ancy {II} - {I}nstitut {N}ational {P}olytechnique de {L}orraine - {EPFL} / {D}omaine {IT} - {DIT} - {\'E}cole {P}olytechnique {F}{\'e}d{\'e}rale de {L}ausanne - {M}icrosoft {R}esearch - {M}icrosoft - {S}cientific {C}omputing and {C}ontrol {T}heory - {MAS}2 - {CWI} - {S}wiss {I}nstitute of {B}ioinformatics - {L}ausanne - {SIB} - {S}wiss {I}nstitute of {B}ioinformatics },
+ title = { A Heterogeneous Computing Environment to Solve the 768-bit {RSA} Challenge},
publisher = {{S}pringer-{V}erlag },
- journal = {{C}luster {C}omputing },
- audience = {internationale },
- year = {2010},
+ journal = {Cluster Computing },
+ year = {2012},
+ volume={15},
+number={1},
+pages = {53-68},
}
@inproceedings{ch19:grid,
HAL_ID = {inria-00502899},
_URL = {http://hal.inria.fr/inria-00502899/en/},
- title = { {Using a Grid Platform for Slving Large Sparse Linear Systems over {GF}(2)}},
+ title = { Using a Grid Platform for Slving Large Sparse Linear Systems over {GF}(2)},
author = {{K}leinjung, {T}. and {N}ussbaum, {L}. and {T}hom{\'e}, {E}.},
language = {{A}nglais},
affiliation = {{L}aboratory for {C}ryptologic {A}lgorithms - {LACAL} - {\'E}cole {P}olytechnique {F}{\'e}d{\'e}rale de {L}ausanne - {ALGORILLE} - {INRIA} {L}orraine - {LORIA} - {INRIA} - {CNRS} : {UMR}7503 - {U}niversit{\'e} {H}enri {P}oincar{\'e} - {N}ancy {I} - {U}niversit{\'e} {N}ancy {II} - {I}nstitut {N}ational {P}olytechnique de {L}orraine - {CARAMEL} - {INRIA} {N}ancy - {G}rand {E}st / {LORIA} - {INRIA} - {CNRS} : {UMR}7503 - {U}niversit{\'e} {H}enri {P}oincar{\'e} - {N}ancy {I} - {U}niversit{\'e} {N}ancy {II} - {I}nstitut {N}ational {P}olytechnique de {L}orraine },
author = {A. Monakov and
A. Lokhmotov and
A. Avetisyan},
- title = {{Automatically Tuning Sparse Matrix-Vector Multiplication
- for GPU Architectures}},
- booktitle = {HiPEAC},
+ title = {Automatically Tuning Sparse Matrix-Vector Multiplication
+ for {GPU} Architectures},
+ booktitle = {International conference on High-Performance
+Embedded Architectures and Compilers,HiPEAC},
+ address= {Pisa, Italy},
year = {2010},
pages = {111-125},
ee = {http://dx.doi.org/10.1007/978-3-642-11515-8_10},
}
@misc{ch19:cuda-guide,
- author = {NVIDIA},
+ author = {{NVIDIA}},
url = {http://developer.download.nvidia.com/compute/DevZone/docs/html/C/doc/CUDA_C_Programming_Guide.pdf},
title = {{CUDA C Programming Guide}},
year={2011},
@misc{ch19:cadonfs,
author = {{P. Gaudry et al.}},
title = {{CADO-NFS}},
- url = {http://cado-nfs.gforge.inria.fr/},
- year = {2010}
+ note = {http://cado-nfs.gforge.inria.fr/},
+ OPTyear = {2010}
}
@MISC{ch19:cusp,
author = "N. Bell and M. Garland",
- title = {{Cusp: Generic Parallel Algorithms for Sparse Matrix and Graph Computations}},
- year = "2010",
- url = "http://cusp-library.googlecode.com",
- note = "Version 0.1.0"
+ title = {CUSP: Generic Parallel Algorithms for Sparse Matrix and Graph Computations},
+OPTyear = "2012",
+ note = "http://cusplibrary.github.io/",
+
}
@misc{ch19:msieve,
-The SCOO format achieves a stable performance for different matrices in single-precision mode. In most cases a performance of over 10 Gflop/s can be sustained. For some highly unstructured matrices such as \emph{GL7d19}, \emph{wikipedia-20070206}, \emph{rgg\_n\_2\_24\_s0} and \emph{kron\_g500-logn21} SCOO achieves high speedups ranging from 3 to 6 compared to the best performaning Cusp format.
+The SCOO format achieves a stable performance for different matrices in single-precision mode. In most cases a performance of over 10 Gflop/s can be sustained. For some highly unstructured matrices such as \emph{GL7d19}, \emph{wikipedia-20070206}, \emph{rgg\_n\_2\_24\_s0}, and \emph{kron\_g500-logn21}, SCOO achieves high speedups ranging from 3 to 6 compared to the best performaning Cusp format.
-For most matrices, HYB produces the best performance among the tested Cusp formats. HYB is able to outperform SCOO only for two matrices: \emph{nlpkkt120} and \emph{nlpkkt160}. Both matrices have a similar structure i.e. they consist of consecutive rows that have a very similar number of nonzero coefficients which is suitable to be stored in the ELL section of the HYB format. Moreover the nonzeros are close to each other facilitating coaleasing and cache-friendly access patterns by nature. SCOO is able to outperform COO and CSR for all tested matrices.
+For most matrices, HYB produces the best performance among the tested Cusp formats. HYB is able to outperform SCOO for only two matrices: \emph{nlpkkt120} and \emph{nlpkkt160}. Both matrices have a similar structure, i.e., they consist of consecutive rows that have a very similar number of nonzero coefficients which are suitable to be stored in the ELL section of the HYB format. Moreover the nonzeros are close to each other facilitating coaleasing and cache-friendly access patterns by nature. SCOO is able to outperform COO and CSR for all tested matrices.
In matrix $Relat9$ we observe some patterns but the matrix is still generally unstructured, thus SCOO is able to achieve about 2 times speed up compared to HYB which is the best among tested Cusp formats in this case. The average speedup of SCOO for the tested matrices is 3.0 compared to CSR, 5.02 compared to COO, 2.15 compared to HYB.
-We show the visualization of sparse matrices \emph{nlpkkt120}, \emph{Relat9}, \emph{GL7d19} in Figure \ref{fig:mat-str}, \ref{fig:mat-mid}, \ref{fig:mat-unstr} using MatView \cite{ch19:matview}. The white color represents zero entries, gray color represents nonzero entries.
+We show the visualization of sparse matrices \emph{nlpkkt120}, \emph{relat9}, \emph{GL7d19} in Figure \ref{fig:mat-str}, \ref{fig:mat-mid}, \ref{fig:mat-unstr} using MatView \cite{ch19:matview}. The white color represents zero entries, gray color represents nonzero entries.
\begin{figure}[htbp]
\centering
\includegraphics[width=100pt]{Chapters/chapter19/fig/matrix-str.pdf}
\label{fig:mat-str}
}
- \subfigure[Relat9 - first 10000 rows] {
+ \subfigure[relat9--first 10000 rows] {
\includegraphics[width=100pt]{Chapters/chapter19/fig/matrix-mid.pdf}
\label{fig:mat-mid}
}
- \subfigure[GL7d19 - first 500 rows and columns] {
+ \subfigure[GL7d19--first 500 rows and columns] {
\includegraphics[width=100pt]{Chapters/chapter19/fig/matrix-uns.pdf}
\label{fig:mat-unstr}
}
- \caption{Visualization of \emph{nlpkkt120}, \emph{Relat9}, and \emph{GL7d19} matrix.}
+ \caption{Visualization of \emph{nlpkkt120}, \emph{relat9}, and \emph{GL7d19} matrix.}
% \label{fig:mat-visual}
\end{figure}
\label{fig:scoo-vs-cpu}
\end{figure}
-We use the Intel MKL library 10.3 in order to compare SCOO performance to an optimized CPU implementation. MKL SpMV receives the input matrices in CSR format. The results are shown in Figure \ref{fig:scoo-vs-cpu}. Using a GTX-580, we achieve speedups ranging between 5.5 and 18 over MKL on a 4-core CPU with hyper-threading using 8 threads. Also note that the SCOO performance on a GTX-580 is around 1.5 times faster than on the C2075 due to the increased memory bandwidth and clock speed. The storage requirement for the \emph{rgg\_n\_2\_24\_s0} and \emph{uk-2002} matrices and associated input/output vectors slightly exceeds the 3 GB global memory of the GTX-580 and thus are not included.
+We used the Intel MKL library 10.3 in order to compare SCOO performance to an optimized CPU implementation. MKL SpMV receives the input matrices in CSR format. The results are shown in Figure \ref{fig:scoo-vs-cpu}. Using a GTX-580, we achieved speedups ranging between 5.5 and 18 over MKL on a 4-core CPU with hyper-threading using 8 threads. Also note that the SCOO performance on a GTX-580 is around 1.5 times faster than on the C2075 due to the increased memory bandwidth and clock speed. The storage requirement for the \emph{rgg\_n\_2\_24\_s0} and \emph{uk-2002} matrices and associated input/output vectors slightly exceeds the 3 GB global memory of the GTX-580 and thus are not included.
\section{Conclusion}
\label{ch19:conclusion}
-In this chapter, we have presented our implementation of iterative SpMV for NFS matrices on GPUs with the CUDA programming language. Our GPU implementation takes advantage of the variety of sparseness properties in NFS matrices to produce suitable formats for different parts. The GPU implementation shows promising improvement over an optimized CPU implementation. As the size of integers in factorization projects is expected to increase further, the linear algebrea step of NFS will become an even bigger bottleneck. The size and sparseness of matrices generated by the NFS sieving step are growing significantly with the size of the integer to be factored. Thus, a big GPU cluster is required to accelerate the linear algebra step. However, in order to achieve scalability for bigger problem sizes, the amount of GPU RAM and data transfer bandwidth need to be increased in addition to the number of GPUs.
+In this chapter, we have presented our implementation of iterative SpMV for NFS matrices on GPUs with the CUDA programming language. Our GPU implementation takes advantage of the variety of sparseness properties in NFS matrices to produce suitable formats for different parts. The GPU implementation shows promising improvement over an optimized CPU implementation. As the size of integers in factorization projects is expected to increase further, the linear algebrea step of NFS will become an even bigger bottleneck. The size and sparseness of matrices generated by the NFS sieving step are growing significantly with the size of the integer to be factored. Thus, a large GPU cluster is required to accelerate the linear algebra step. However, in order to achieve scalability for larger problem sizes, the amounts of GPU RAM and data transfer bandwidth need to be increased in addition to the number of GPUs.
-We further adapted the proposed Sliced COO format to single-precision floating-point numbers and evaluated it with large and sparse matrices derived from other computational science applications. We have published our code at https://github.com/danghvu/cudaSpmv.
+We further adapted the proposed Sliced COO format to single-precision floating-point numbers and evaluated it with large and sparse matrices derived from other computational science applications. We have published our code at https://github.com/danghvu/cudaSpmv
\putbib[Chapters/chapter19/biblio]