\relax \citation{tukey77} \citation{Huang:1981:TDS:539567} \citation{Weiss:2006:FMB:1179352.1141918} \citation{5402362} \citation{chen09} \citation{6288187} \@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}} \@writefile{toc}{\contentsline {section}{\numberline {2}General structure}{2}} \providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} \newlabel{algo:memcopy:H2D}{{7}{2}} \newlabel{algo:memcopy:kernel}{{8}{2}} \newlabel{algo:memcopy:D2H}{{9}{2}} \@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{2}} \newlabel{algo:memcopy}{{1}{2}} \@writefile{toc}{\contentsline {section}{\numberline {3}Implementing a fast median filter}{2}} \@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Basic principles}{2}} \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Illustration of $5\times 5$ median filtering\relax }}{2}} \newlabel{median1}{{1}{2}} \@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Using registers}{2}} \citation{Batcher:1968:SNA:1468075.1468121} \citation{cormen2001introduction} \@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{3}} \newlabel{forget}{{2}{3}} \@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Hiding Latencies}{3}} \@writefile{toc}{\contentsline {subsection}{\numberline {3.4}Compute complexity}{3}} \citation{Sanchez-2-2012} \@writefile{toc}{\contentsline {section}{\numberline {4}Experiments}{4}} \@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel processing 2 input pixels. The first 7 forgetful selection stages are common to both processed center pixels: the first one needs 14 pixels, leaving 6 more pixels to be processedone after another.\relax }}{4}} \newlabel{median5overlap}{{3}{4}} \@writefile{toc}{\contentsline {section}{\numberline {5}Results}{4}} \bibstyle{plain} \bibdata{biblio3} \bibcite{chen09}{1} \@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Time cost of data transfer for each image size and gray-level format on C2070 GPU.\relax }}{5}} \newlabel{tabmemcpy}{{1}{5}} \@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Pixel throughput value comparison, in million pixels per second, of several implementation against our PRMF. From left to right: PCMF, BVM, PRMF, ArrayFire (impossible with 4096$\times $4096)\relax }}{5}} \newlabel{figcomp}{{4}{5}} \@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Maximum effective pixel throughput values for $T_8$ and $T_{16}$ (in MPixel per second) on C2070, achieved when processing 8 and 16 bit-coded gray-level images.\relax }}{5}} \newlabel{tabmaxtp}{{2}{5}} \bibcite{Huang:1981:TDS:539567}{2} \bibcite{5402362}{3} \bibcite{6288187}{4} \bibcite{Sanchez-2-2012}{5} \bibcite{tukey77}{6} \bibcite{Weiss:2006:FMB:1179352.1141918}{7} \@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Runtime and pixel throughput of fast median kernels processing 8 and 16 bit-coded gray-level images and run by C2070 GPU.\relax }}{6}} \newlabel{tabresults}{{3}{6}}