@article{38041, abstract = {{While FPGA accelerator boards and their respective high-level design tools are maturing, there is still a lack of multi-FPGA applications, libraries, and, not least, benchmarks and reference implementations towards sustained HPC usage of these devices. As in the early days of GPUs in HPC, for workloads that can reasonably be decoupled into loosely coupled working sets, multi-accelerator support can be achieved by using standard communication interfaces like MPI on the host side. However, for performance and productivity, some applications can profit from a tighter coupling of the accelerators. FPGAs offer unique opportunities here when extending their dataflow characteristics to their communication interfaces. In this work, we extend the HPCC FPGA benchmark suite with multi-FPGA support and three missing benchmarks that particularly characterize or stress inter-device communication: b_eff, PTRANS, and LINPACK. With all benchmarks implemented for current boards with Intel and Xilinx FPGAs, we establish a baseline for multi-FPGA performance. Additionally, for the communication-centric benchmarks, we explore the potential of direct FPGA-to-FPGA communication with a circuit-switched inter-FPGA network that is currently only available for one of the boards. The evaluation with parallel execution on up to 26 FPGA boards makes use of one of the largest academic FPGA installations.}}, author = {{Meyer, Marius and Kenter, Tobias and Plessl, Christian}}, issn = {{1936-7406}}, journal = {{ACM Transactions on Reconfigurable Technology and Systems}}, keywords = {{General Computer Science}}, publisher = {{Association for Computing Machinery (ACM)}}, title = {{{Multi-FPGA Designs and Scaling of HPC Challenge Benchmarks via MPI and Circuit-Switched Inter-FPGA Networks}}}, doi = {{10.1145/3576200}}, year = {{2023}}, }
@inbook{45893, author = {{Hansmeier, Tim and Kenter, Tobias and Meyer, Marius and Riebler, Heinrich and Platzner, Marco and Plessl, Christian}}, booktitle = {{On-The-Fly Computing -- Individualized IT-services in dynamic markets}}, editor = {{Haake, Claus-Jochen and Meyer auf der Heide, Friedhelm and Platzner, Marco and Wachsmuth, Henning and Wehrheim, Heike}}, pages = {{165--182}}, publisher = {{Heinz Nixdorf Institut, Universität Paderborn}}, title = {{{Compute Centers I: Heterogeneous Execution Environments}}}, doi = {{10.5281/zenodo.8068642}}, volume = {{412}}, year = {{2023}}, }
@inproceedings{46190, author = {{Opdenhövel, Jan-Oliver and Plessl, Christian and Kenter, Tobias}}, booktitle = {{Proceedings of the 13th International Symposium on Highly Efficient Accelerators and Reconfigurable Technologies}}, publisher = {{ACM}}, title = {{{Mutation Tree Reconstruction of Tumor Cells on FPGAs Using a Bit-Level Matrix Representation}}}, doi = {{10.1145/3597031.3597050}}, year = {{2023}}, }
@inproceedings{46188, author = {{Faj, Jennifer and Kenter, Tobias and Faghih-Naini, Sara and Plessl, Christian and Aizinger, Vadym}}, booktitle = {{Proceedings of the Platform for Advanced Scientific Computing Conference}}, publisher = {{ACM}}, title = {{{Scalable Multi-FPGA Design of a Discontinuous Galerkin Shallow-Water Model on Unstructured Meshes}}}, doi = {{10.1145/3592979.3593407}}, year = {{2023}}, }
@inproceedings{46189, author = {{Prouveur, Charles and Haefele, Matthieu and Kenter, Tobias and Voss, Nils}}, booktitle = {{Proceedings of the Platform for Advanced Scientific Computing Conference}}, publisher = {{ACM}}, title = {{{FPGA Acceleration for HPC Supercapacitor Simulations}}}, doi = {{10.1145/3592979.3593419}}, year = {{2023}}, }
@inproceedings{43228, abstract = {{The computation of electron repulsion integrals (ERIs) over Gaussian-type orbitals (GTOs) is a challenging problem in quantum-mechanics-based atomistic simulations. In practical simulations, several trillions of ERIs may have to be computed for every time step. In this work, we investigate FPGAs as accelerators for the ERI computation. We use template parameters, here within the Intel oneAPI tool flow, to create customized designs for 256 different ERI quartet classes, based on their orbitals. To maximize data reuse, all intermediates are buffered in FPGA on-chip memory with customized layout. The pre-calculation of intermediates also helps to overcome data dependencies caused by multi-dimensional recurrence relations. The involved loop structures are partially or even fully unrolled for high throughput of the FPGA kernels. Furthermore, a lossy compression algorithm utilizing arbitrary-bitwidth integers is integrated in the FPGA kernels. To the best of our knowledge, this is the first work on ERI computation on FPGAs that supports more than just the single most basic quartet class. Also, the integration of ERI computation and compression is a novelty that is not yet covered by CPU or GPU libraries. Our evaluation shows that, using 16-bit integers for the ERI compression, the fastest FPGA kernels exceed a performance of 10 GERIS ($10 \times 10^9$ ERIs per second) on one Intel Stratix 10 GX 2800 FPGA, with maximum absolute errors around $10^{-7}$–$10^{-5}$ Hartree. The measured throughput can be accurately explained by a performance model. The FPGA kernels deployed on 2 FPGAs outperform similar computations using the widely used libint reference on a two-socket server with 40 Xeon Gold 6148 CPU cores of the same process technology by factors of up to 6.0x, and on a newer two-socket server with 128 EPYC 7713 CPU cores by up to 1.9x.}}, author = {{Wu, Xin and Kenter, Tobias and Schade, Robert and Kühne, Thomas and Plessl, Christian}}, booktitle = {{2023 IEEE 31st Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)}}, pages = {{162--173}}, title = {{{Computing and Compressing Electron Repulsion Integrals on FPGAs}}}, doi = {{10.1109/FCCM57271.2023.00026}}, year = {{2023}}, }
@article{45361, abstract = {{The non-orthogonal local submatrix method applied to electronic structure–based molecular dynamics simulations is shown to exceed 1.1 EFLOP/s in FP16/FP32-mixed floating-point arithmetic when using 4400 NVIDIA A100 GPUs of the Perlmutter system. This is enabled by a modification of the original method that pushes the sustained fraction of the peak performance to about 80%. Example calculations are performed for SARS-CoV-2 spike proteins with up to 83 million atoms.}}, author = {{Schade, Robert and Kenter, Tobias and Elgabarty, Hossam and Lass, Michael and Kühne, Thomas and Plessl, Christian}}, issn = {{1094-3420}}, journal = {{The International Journal of High Performance Computing Applications}}, keywords = {{Hardware and Architecture, Theoretical Computer Science, Software}}, publisher = {{SAGE Publications}}, title = {{{Breaking the exascale barrier for the electronic structure problem in ab-initio molecular dynamics}}}, doi = {{10.1177/10943420231177631}}, year = {{2023}}, }
@inbook{46191, author = {{Alt, Christoph and Kenter, Tobias and Faghih-Naini, Sara and Faj, Jennifer and Opdenhövel, Jan-Oliver and Plessl, Christian and Aizinger, Vadym and Hönig, Jan and Köstler, Harald}}, booktitle = {{Lecture Notes in Computer Science}}, isbn = {{9783031320408}}, issn = {{0302-9743}}, publisher = {{Springer Nature Switzerland}}, title = {{{Shallow Water DG Simulations on FPGAs: Design and Comparison of a Novel Code Generation Pipeline}}}, doi = {{10.1007/978-3-031-32041-5_5}}, year = {{2023}}, }
@unpublished{43439, abstract = {{This preprint claims the computation of the $9^{th}$ Dedekind number. This was done by building an efficient FPGA accelerator for the core operation of the process and parallelizing it on the Noctua 2 supercomputer at Paderborn University. The resulting value is 286386577668298411128469151667598498812366. This value can be verified in two steps: we have made the data file containing the 490M results available, each of which can be verified separately on a CPU, and the whole file sums to our proposed value.}}, author = {{Van Hirtum, Lennart and De Causmaecker, Patrick and Goemaere, Jens and Kenter, Tobias and Riebler, Heinrich and Lass, Michael and Plessl, Christian}}, booktitle = {{arXiv:2304.03039}}, title = {{{A computation of D(9) using FPGA Supercomputing}}}, year = {{2023}}, }
@inproceedings{46193, author = {{Karp, Martin and Podobas, Artur and Kenter, Tobias and Jansson, Niclas and Plessl, Christian and Schlatter, Philipp and Markidis, Stefano}}, booktitle = {{International Conference on High Performance Computing in Asia-Pacific Region}}, publisher = {{ACM}}, title = {{{A High-Fidelity Flow Solver for Unstructured Meshes on Field-Programmable Gate Arrays: Design, Evaluation, and Future Challenges}}}, doi = {{10.1145/3492805.3492808}}, year = {{2022}}, }
@article{33684, author = {{Schade, Robert and Kenter, Tobias and Elgabarty, Hossam and Lass, Michael and Schütt, Ole and Lazzaro, Alfio and Pabst, Hans and Mohr, Stephan and Hutter, Jürg and Kühne, Thomas and Plessl, Christian}}, issn = {{0167-8191}}, journal = {{Parallel Computing}}, keywords = {{Artificial Intelligence, Computer Graphics and Computer-Aided Design, Computer Networks and Communications, Hardware and Architecture, Theoretical Computer Science, Software}}, publisher = {{Elsevier BV}}, title = {{{Towards electronic structure-based ab-initio molecular dynamics simulations with hundreds of millions of atoms}}}, doi = {{10.1016/j.parco.2022.102920}}, volume = {{111}}, year = {{2022}}, }
@article{27364, author = {{Meyer, Marius and Kenter, Tobias and Plessl, Christian}}, issn = {{0743-7315}}, journal = {{Journal of Parallel and Distributed Computing}}, title = {{{In-depth FPGA Accelerator Performance Evaluation with Single Node Benchmarks from the HPC Challenge Benchmark Suite for Intel and Xilinx FPGAs using OpenCL}}}, doi = {{10.1016/j.jpdc.2021.10.007}}, year = {{2022}}, }
@article{28099, abstract = {{N-body methods are one of the essential algorithmic building blocks of high-performance and parallel computing. Previous research has shown promising performance for implementing n-body simulations with pairwise force calculations on FPGAs. However, to avoid challenges with accumulation and memory access patterns, the presented designs calculate each pair of forces twice, along with both force sums of the involved particles. Also, they require large problem instances with hundreds of thousands of particles to reach their respective peak performance, limiting their applicability in strong scaling scenarios. This work addresses both issues by presenting a novel FPGA design that uses each calculated force twice and overlaps data transfers and computations in a way that allows it to reach peak performance even for small problem instances, outperforming previous single-precision results even in double precision, and scaling linearly over multiple interconnected FPGAs. For a comparison across architectures, we provide an equally optimized CPU reference, which for large problems actually achieves a higher peak performance per device; however, given the strong scaling advantages of the FPGA design, in parallel setups with a few thousand particles per device, the FPGA platform achieves the highest performance and power efficiency.}}, author = {{Menzel, Johannes and Plessl, Christian and Kenter, Tobias}}, issn = {{1936-7406}}, journal = {{ACM Transactions on Reconfigurable Technology and Systems}}, number = {{1}}, pages = {{1--30}}, title = {{{The Strong Scaling Advantage of FPGAs in HPC for N-body Simulations}}}, doi = {{10.1145/3491235}}, volume = {{15}}, year = {{2021}}, }
@inproceedings{46194, author = {{Kenter, Tobias and Shambhu, Adesh and Faghih-Naini, Sara and Aizinger, Vadym}}, booktitle = {{Proceedings of the Platform for Advanced Scientific Computing Conference}}, publisher = {{ACM}}, title = {{{Algorithm-hardware co-design of a discontinuous Galerkin shallow-water model for a dataflow architecture on FPGA}}}, doi = {{10.1145/3468267.3470617}}, year = {{2021}}, }
@inproceedings{46195, author = {{Karp, Martin and Podobas, Artur and Jansson, Niclas and Kenter, Tobias and Plessl, Christian and Schlatter, Philipp and Markidis, Stefano}}, booktitle = {{2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}}, publisher = {{IEEE}}, title = {{{High-Performance Spectral Element Methods on Field-Programmable Gate Arrays: Implementation, Evaluation, and Future Projection}}}, doi = {{10.1109/ipdps49936.2021.00116}}, year = {{2021}}, }
@inbook{21587, abstract = {{Solving partial differential equations on unstructured grids is a cornerstone of engineering and scientific computing. Nowadays, heterogeneous parallel platforms with CPUs, GPUs, and FPGAs enable energy-efficient and computationally demanding simulations. We developed the HighPerMeshes C++-embedded Domain-Specific Language (DSL) for bridging the abstraction gap between the mathematical and algorithmic formulation of mesh-based algorithms for PDE problems on the one hand and an increasing number of heterogeneous platforms with their different parallel programming and runtime models on the other hand. Thus, the HighPerMeshes DSL aims at higher productivity in the code development process for multiple target platforms.
We introduce the concepts as well as the basic structure of the HighPerMeshes DSL and demonstrate its usage with three examples: a Poisson problem and a monodomain problem, both solved with the continuous finite element method, and Maxwell's equations solved with the discontinuous Galerkin method. The mapping of the abstract algorithmic description onto parallel hardware, including distributed-memory compute clusters, is presented. Finally, the achievable performance and scalability are demonstrated for a typical example problem on a multi-core CPU cluster.}}, author = {{Alhaddad, Samer and Förstner, Jens and Groth, Stefan and Grünewald, Daniel and Grynko, Yevgen and Hannig, Frank and Kenter, Tobias and Pfreundt, Franz-Josef and Plessl, Christian and Schotte, Merlind and Steinke, Thomas and Teich, Jürgen and Weiser, Martin and Wende, Florian}}, booktitle = {{Euro-Par 2020: Parallel Processing Workshops}}, isbn = {{9783030715922}}, issn = {{0302-9743}}, keywords = {{tet_topic_hpc}}, title = {{{HighPerMeshes – A Domain-Specific Language for Numerical Algorithms on Unstructured Grids}}}, doi = {{10.1007/978-3-030-71593-9_15}}, year = {{2021}}, }
@inbook{29936, author = {{Ramaswami, Arjun and Kenter, Tobias and Kühne, Thomas and Plessl, Christian}}, booktitle = {{Applied Reconfigurable Computing. Architectures, Tools, and Applications}}, isbn = {{9783030790240}}, issn = {{0302-9743}}, publisher = {{Springer International Publishing}}, title = {{{Evaluating the Design Space for Offloading 3D FFT Calculations to an FPGA for High-Performance Computing}}}, doi = {{10.1007/978-3-030-79025-7_21}}, year = {{2021}}, }
@article{24788, author = {{Alhaddad, Samer and Förstner, Jens and Groth, Stefan and Grünewald, Daniel and Grynko, Yevgen and Hannig, Frank and Kenter, Tobias and Pfreundt, Franz-Josef and Plessl, Christian and Schotte, Merlind and Steinke, Thomas and Teich, Jürgen and Weiser, Martin and Wende, Florian}}, issn = {{1532-0626}}, journal = {{Concurrency and Computation: Practice and Experience}}, keywords = {{tet_topic_hpc}}, pages = {{e6616}}, title = {{{The HighPerMeshes framework for numerical algorithms on unstructured grids}}}, doi = {{10.1002/cpe.6616}}, year = {{2021}}, }
@inproceedings{21632, abstract = {{FPGAs have found increasing adoption in data center applications since a new generation of high-level tools has become available that noticeably reduces development time for FPGA accelerators while still providing high-quality results. There is, however, no high-level benchmark suite available that specifically enables a comparison of FPGA architectures, programming tools, and libraries for HPC applications. To fill this gap, we have developed an OpenCL-based open-source implementation of the HPCC benchmark suite for Xilinx and Intel FPGAs. This benchmark can serve to analyze the current capabilities of FPGA devices, cards, and development tool flows, track progress over time, and point out specific difficulties for FPGA acceleration in the HPC domain.
Additionally, the benchmark documents proven performance optimization patterns. We will continue optimizing and porting the benchmark for new generations of FPGAs and design tools and encourage active participation to create a valuable tool for the community.}}, author = {{Meyer, Marius and Kenter, Tobias and Plessl, Christian}}, booktitle = {{2020 IEEE/ACM International Workshop on Heterogeneous High-performance Reconfigurable Computing (H2RC)}}, isbn = {{9781665415927}}, keywords = {{FPGA, OpenCL, High Level Synthesis, HPC benchmarking}}, title = {{{Evaluating FPGA Accelerator Performance with a Parameterized OpenCL Adaptation of Selected Benchmarks of the HPCChallenge Benchmark Suite}}}, doi = {{10.1109/h2rc51942.2020.00007}}, year = {{2020}}, }