@article{46120,
  abstract = {{The rise of exascale supercomputers has fueled competition among GPU vendors, driving lattice QCD developers to write code that supports multiple APIs. Moreover, new developments in algorithms and physics research require frequent updates to existing software. These challenges have to be balanced against constantly changing personnel. At the same time, there is a wide range of applications for HISQ fermions in QCD studies. This situation encourages the development of software featuring a HISQ action that is flexible, high-performing, open source, easy to use, and easy to adapt. In this technical paper, we explain the design strategy, provide implementation details, list available algorithms and modules, and show key performance indicators for SIMULATeQCD, a simple multi-GPU lattice code for large-scale QCD calculations, mainly developed and used by the HotQCD collaboration. The code is publicly available on GitHub.}},
  author = {{Mazur, Lukas and Bollweg, Dennis and Clarke, David A. and Altenkort, Luis and Kaczmarek, Olaf and Larsen, Rasmus and Shu, Hai-Tao and Goswami, Jishnu and Scior, Philipp and Sandmeyer, Hauke and Neumann, Marius and Dick, Henrik and Ali, Sajid and Kim, Jangho and Schmidt, Christian and Petreczky, Peter and Mukherjee, Swagato}},
  journal = {{Computer Physics Communications}},
  title = {{{SIMULATeQCD: A simple multi-GPU lattice code for QCD calculations}}},
  doi = {{10.48550/ARXIV.2306.01098}},
  year = {{2023}},
}

@article{46119,
  author = {{Altenkort, Luis and Eller, Alexander M. and Francis, Anthony and Kaczmarek, Olaf and Mazur, Lukas and Moore, Guy D. and Shu, Hai-Tao}},
  issn = {{2470-0010}},
  journal = {{Physical Review D}},
  number = {{1}},
  publisher = {{American Physical Society (APS)}},
  title = {{{Viscosity of pure-glue QCD from the lattice}}},
  doi = {{10.1103/physrevd.108.014503}},
  volume = {{108}},
  year = {{2023}},
}

@article{38041,
  abstract = {{While FPGA accelerator boards and their respective high-level design tools are maturing, there is still a lack of multi-FPGA applications, libraries, and not least, benchmarks and reference implementations towards sustained HPC usage of these devices. As in the early days of GPUs in HPC, for workloads that can reasonably be decoupled into loosely coupled working sets, multi-accelerator support can be achieved by using standard communication interfaces like MPI on the host side. However, for performance and productivity, some applications can profit from a tighter coupling of the accelerators. FPGAs offer unique opportunities here when extending the dataflow characteristics to their communication interfaces. In this work, we extend the HPCC FPGA benchmark suite by multi-FPGA support and three missing benchmarks that particularly characterize or stress inter-device communication: b_eff, PTRANS, and LINPACK. With all benchmarks implemented for current boards with Intel and Xilinx FPGAs, we established a baseline for multi-FPGA performance. Additionally, for the communication-centric benchmarks, we explored the potential of direct FPGA-to-FPGA communication with a circuit-switched inter-FPGA network that is currently only available for one of the boards.
  The evaluation with parallel execution on up to 26 FPGA boards makes use of one of the largest academic FPGA installations.}},
  author = {{Meyer, Marius and Kenter, Tobias and Plessl, Christian}},
  issn = {{1936-7406}},
  journal = {{ACM Transactions on Reconfigurable Technology and Systems}},
  keywords = {{General Computer Science}},
  publisher = {{Association for Computing Machinery (ACM)}},
  title = {{{Multi-FPGA Designs and Scaling of HPC Challenge Benchmarks via MPI and Circuit-Switched Inter-FPGA Networks}}},
  doi = {{10.1145/3576200}},
  year = {{2023}},
}

@inbook{45893,
  author = {{Hansmeier, Tim and Kenter, Tobias and Meyer, Marius and Riebler, Heinrich and Platzner, Marco and Plessl, Christian}},
  booktitle = {{On-The-Fly Computing -- Individualized IT-services in dynamic markets}},
  editor = {{Haake, Claus-Jochen and Meyer auf der Heide, Friedhelm and Platzner, Marco and Wachsmuth, Henning and Wehrheim, Heike}},
  pages = {{165--182}},
  publisher = {{Heinz Nixdorf Institut, Universität Paderborn}},
  title = {{{Compute Centers I: Heterogeneous Execution Environments}}},
  doi = {{10.5281/zenodo.8068642}},
  volume = {{412}},
  year = {{2023}},
}

@inproceedings{46190,
  author = {{Opdenhövel, Jan-Oliver and Plessl, Christian and Kenter, Tobias}},
  booktitle = {{Proceedings of the 13th International Symposium on Highly Efficient Accelerators and Reconfigurable Technologies}},
  publisher = {{ACM}},
  title = {{{Mutation Tree Reconstruction of Tumor Cells on FPGAs Using a Bit-Level Matrix Representation}}},
  doi = {{10.1145/3597031.3597050}},
  year = {{2023}},
}

@inproceedings{46188,
  author = {{Faj, Jennifer and Kenter, Tobias and Faghih-Naini, Sara and Plessl, Christian and Aizinger, Vadym}},
  booktitle = {{Proceedings of the Platform for Advanced Scientific Computing Conference}},
  publisher = {{ACM}},
  title = {{{Scalable Multi-FPGA Design of a Discontinuous Galerkin Shallow-Water Model on Unstructured Meshes}}},
  doi = {{10.1145/3592979.3593407}},
  year = {{2023}},
}

@inproceedings{46189,
  author = {{Prouveur, Charles and Haefele, Matthieu and Kenter, Tobias and Voss, Nils}},
  booktitle = {{Proceedings of the Platform for Advanced Scientific Computing Conference}},
  publisher = {{ACM}},
  title = {{{FPGA Acceleration for HPC Supercapacitor Simulations}}},
  doi = {{10.1145/3592979.3593419}},
  year = {{2023}},
}

@inproceedings{43228,
  abstract = {{The computation of electron repulsion integrals (ERIs) over Gaussian-type orbitals (GTOs) is a challenging problem in quantum-mechanics-based atomistic simulations. In practical simulations, several trillions of ERIs may have to be computed for every time step. In this work, we investigate FPGAs as accelerators for the ERI computation. We use template parameters, here within the Intel oneAPI tool flow, to create customized designs for 256 different ERI quartet classes, based on their orbitals. To maximize data reuse, all intermediates are buffered in FPGA on-chip memory with customized layout. The pre-calculation of intermediates also helps to overcome data dependencies caused by multi-dimensional recurrence relations. The involved loop structures are partially or even fully unrolled for high throughput of FPGA kernels. Furthermore, a lossy compression algorithm utilizing arbitrary bitwidth integers is integrated in the FPGA kernels. To the best of our knowledge, this is the first work on ERI computation on FPGAs that supports more than just the single most basic quartet class.
  Also, the integration of ERI computation and compression is a novelty that is not yet covered by CPU or GPU libraries. Our evaluation shows that, using 16-bit integers for the ERI compression, the fastest FPGA kernels exceed a performance of 10 GERIS ($10 \times 10^9$ ERIs per second) on one Intel Stratix 10 GX 2800 FPGA, with maximum absolute errors around $10^{-7}$--$10^{-5}$ Hartree. The measured throughput can be accurately explained by a performance model. The FPGA kernels deployed on 2 FPGAs outperform similar computations using the widely used libint reference on a two-socket server with 40 Xeon Gold 6148 CPU cores of the same process technology by factors of up to 6.0x, and on a newer two-socket server with 128 EPYC 7713 CPU cores by up to 1.9x.}},
  author = {{Wu, Xin and Kenter, Tobias and Schade, Robert and Kühne, Thomas and Plessl, Christian}},
  booktitle = {{2023 IEEE 31st Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)}},
  pages = {{162--173}},
  title = {{{Computing and Compressing Electron Repulsion Integrals on FPGAs}}},
  doi = {{10.1109/FCCM57271.2023.00026}},
  year = {{2023}},
}

@article{45361,
  abstract = {{The non-orthogonal local submatrix method applied to electronic structure–based molecular dynamics simulations is shown to exceed 1.1 EFLOP/s in FP16/FP32-mixed floating-point arithmetic when using 4400 NVIDIA A100 GPUs of the Perlmutter system. This is enabled by a modification of the original method that pushes the sustained fraction of the peak performance to about 80%. Example calculations are performed for SARS-CoV-2 spike proteins with up to 83 million atoms.}},
  author = {{Schade, Robert and Kenter, Tobias and Elgabarty, Hossam and Lass, Michael and Kühne, Thomas and Plessl, Christian}},
  issn = {{1094-3420}},
  journal = {{The International Journal of High Performance Computing Applications}},
  keywords = {{Hardware and Architecture, Theoretical Computer Science, Software}},
  publisher = {{SAGE Publications}},
  title = {{{Breaking the exascale barrier for the electronic structure problem in ab-initio molecular dynamics}}},
  doi = {{10.1177/10943420231177631}},
  year = {{2023}},
}

@unpublished{50172,
  abstract = {{Viscous hydrodynamics serves as a successful mesoscopic description of the Quark-Gluon Plasma produced in relativistic heavy-ion collisions. In order to investigate how such an effective description emerges from the underlying microscopic dynamics, we calculate the hydrodynamic and non-hydrodynamic modes of linear response in the sound channel from a first-principles calculation in kinetic theory. We do this with a new approach wherein we discretize the collision kernel to directly calculate eigenvalues and eigenmodes of the evolution operator. This allows us to study the Green's functions at any point in the complex frequency space. Our study focuses on scalar field theory with quartic interaction, and we find that the analytic structure of the Green's functions in the complex plane is far more complicated than just poles or cuts, which is a first step towards an equivalent study in QCD kinetic theory.}},
  author = {{Ochsenfeld, Stephan and Schlichting, Sören}},
  booktitle = {{arXiv:2308.04491}},
  title = {{{Hydrodynamic and Non-hydrodynamic Excitations in Kinetic Theory -- A Numerical Analysis in Scalar Field Theory}}},
  year = {{2023}},
}