@article{38041, abstract = {{While FPGA accelerator boards and their respective high-level design tools are maturing, there is still a lack of multi-FPGA applications, libraries, and, not least, benchmarks and reference implementations towards sustained HPC usage of these devices. As in the early days of GPUs in HPC, for workloads that can reasonably be decoupled into loosely coupled working sets, multi-accelerator support can be achieved by using standard communication interfaces like MPI on the host side. However, for performance and productivity, some applications can profit from a tighter coupling of the accelerators. FPGAs offer unique opportunities here when extending their dataflow characteristics to their communication interfaces. In this work, we extend the HPCC FPGA benchmark suite with multi-FPGA support and three missing benchmarks that particularly characterize or stress inter-device communication: b_eff, PTRANS, and LINPACK. With all benchmarks implemented for current boards with Intel and Xilinx FPGAs, we establish a baseline for multi-FPGA performance. Additionally, for the communication-centric benchmarks, we explore the potential of direct FPGA-to-FPGA communication with a circuit-switched inter-FPGA network that is currently only available for one of the boards. The evaluation with parallel execution on up to 26 FPGA boards makes use of one of the largest academic FPGA installations.}}, author = {{Meyer, Marius and Kenter, Tobias and Plessl, Christian}}, issn = {{1936-7406}}, journal = {{ACM Transactions on Reconfigurable Technology and Systems}}, keywords = {{General Computer Science}}, publisher = {{Association for Computing Machinery (ACM)}}, title = {{{Multi-FPGA Designs and Scaling of HPC Challenge Benchmarks via MPI and Circuit-Switched Inter-FPGA Networks}}}, doi = {{10.1145/3576200}}, year = {{2023}}, }
@inbook{45893, author = {{Hansmeier, Tim and Kenter, Tobias and Meyer, Marius and Riebler, Heinrich and Platzner, Marco and Plessl, Christian}}, booktitle = {{On-The-Fly Computing -- Individualized IT-services in dynamic markets}}, editor = {{Haake, Claus-Jochen and Meyer auf der Heide, Friedhelm and Platzner, Marco and Wachsmuth, Henning and Wehrheim, Heike}}, pages = {{165--182}}, publisher = {{Heinz Nixdorf Institut, Universität Paderborn}}, title = {{{Compute Centers I: Heterogeneous Execution Environments}}}, doi = {{10.5281/zenodo.8068642}}, volume = {{412}}, year = {{2023}}, }
@inproceedings{46190, author = {{Opdenhövel, Jan-Oliver and Plessl, Christian and Kenter, Tobias}}, booktitle = {{Proceedings of the 13th International Symposium on Highly Efficient Accelerators and Reconfigurable Technologies}}, publisher = {{ACM}}, title = {{{Mutation Tree Reconstruction of Tumor Cells on FPGAs Using a Bit-Level Matrix Representation}}}, doi = {{10.1145/3597031.3597050}}, year = {{2023}}, }
@inproceedings{46188, author = {{Faj, Jennifer and Kenter, Tobias and Faghih-Naini, Sara and Plessl, Christian and Aizinger, Vadym}}, booktitle = {{Proceedings of the Platform for Advanced Scientific Computing Conference}}, publisher = {{ACM}}, title = {{{Scalable Multi-FPGA Design of a Discontinuous Galerkin Shallow-Water Model on Unstructured Meshes}}}, doi = {{10.1145/3592979.3593407}}, year = {{2023}}, }
@inproceedings{46189, author = {{Prouveur, Charles and Haefele, Matthieu and Kenter, Tobias and Voss, Nils}}, booktitle = {{Proceedings of the Platform for Advanced Scientific Computing Conference}}, publisher = {{ACM}}, title = {{{FPGA Acceleration for HPC Supercapacitor Simulations}}}, doi = {{10.1145/3592979.3593419}}, year = {{2023}}, }
@inproceedings{43228, abstract = {{The computation of electron repulsion integrals (ERIs) over Gaussian-type orbitals (GTOs) is a challenging problem in quantum-mechanics-based atomistic simulations. In practical simulations, several trillions of ERIs may have to be computed for every time step. In this work, we investigate FPGAs as accelerators for the ERI computation. We use template parameters, here within the Intel oneAPI tool flow, to create customized designs for 256 different ERI quartet classes, based on their orbitals. To maximize data reuse, all intermediates are buffered in FPGA on-chip memory with customized layout. The pre-calculation of intermediates also helps to overcome data dependencies caused by multi-dimensional recurrence relations. The involved loop structures are partially or even fully unrolled for high throughput of the FPGA kernels. Furthermore, a lossy compression algorithm utilizing arbitrary-bitwidth integers is integrated in the FPGA kernels. To the best of our knowledge, this is the first work on ERI computation on FPGAs that supports more than just the single most basic quartet class. Also, the integration of ERI computation and compression is a novelty that is not yet covered by CPU or GPU libraries. Our evaluation shows that, using 16-bit integers for the ERI compression, the fastest FPGA kernels exceed a performance of 10 GERIS ($10 \times 10^9$ ERIs per second) on one Intel Stratix 10 GX 2800 FPGA, with maximum absolute errors around $10^{-7}$–$10^{-5}$ Hartree. The measured throughput can be accurately explained by a performance model. The FPGA kernels deployed on 2 FPGAs outperform similar computations using the widely used libint reference on a two-socket server with 40 Xeon Gold 6148 CPU cores of the same process technology by factors of up to 6.0x, and on a newer two-socket server with 128 EPYC 7713 CPU cores by up to 1.9x.}}, author = {{Wu, Xin and Kenter, Tobias and Schade, Robert and Kühne, Thomas and Plessl, Christian}}, booktitle = {{2023 IEEE 31st Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)}}, pages = {{162--173}}, title = {{{Computing and Compressing Electron Repulsion Integrals on FPGAs}}}, doi = {{10.1109/FCCM57271.2023.00026}}, year = {{2023}}, }
@article{45361, abstract = {{The non-orthogonal local submatrix method applied to electronic structure–based molecular dynamics simulations is shown to exceed 1.1 EFLOP/s in FP16/FP32-mixed floating-point arithmetic when using 4400 NVIDIA A100 GPUs of the Perlmutter system. This is enabled by a modification of the original method that pushes the sustained fraction of the peak performance to about 80%. Example calculations are performed for SARS-CoV-2 spike proteins with up to 83 million atoms.}}, author = {{Schade, Robert and Kenter, Tobias and Elgabarty, Hossam and Lass, Michael and Kühne, Thomas and Plessl, Christian}}, issn = {{1094-3420}}, journal = {{The International Journal of High Performance Computing Applications}}, keywords = {{Hardware and Architecture, Theoretical Computer Science, Software}}, publisher = {{SAGE Publications}}, title = {{{Breaking the exascale barrier for the electronic structure problem in ab-initio molecular dynamics}}}, doi = {{10.1177/10943420231177631}}, year = {{2023}}, }
@inbook{46191, author = {{Alt, Christoph and Kenter, Tobias and Faghih-Naini, Sara and Faj, Jennifer and Opdenhövel, Jan-Oliver and Plessl, Christian and Aizinger, Vadym and Hönig, Jan and Köstler, Harald}}, booktitle = {{Lecture Notes in Computer Science}}, isbn = {{9783031320408}}, issn = {{0302-9743}}, publisher = {{Springer Nature Switzerland}}, title = {{{Shallow Water DG Simulations on FPGAs: Design and Comparison of a Novel Code Generation Pipeline}}}, doi = {{10.1007/978-3-031-32041-5_5}}, year = {{2023}}, }
@unpublished{43439, abstract = {{This preprint claims the computation of the $9^{th}$ Dedekind number. This was done by building an efficient FPGA accelerator for the core operation of the process and parallelizing it on the Noctua 2 supercomputer at Paderborn University. The resulting value is 286386577668298411128469151667598498812366. This value can be verified in two steps: we have made the data file containing the 490M results available, each of which can be verified separately on a CPU, and the whole file sums to our proposed value.}}, author = {{Van Hirtum, Lennart and De Causmaecker, Patrick and Goemaere, Jens and Kenter, Tobias and Riebler, Heinrich and Lass, Michael and Plessl, Christian}}, booktitle = {{arXiv:2304.03039}}, title = {{{A computation of D(9) using FPGA Supercomputing}}}, year = {{2023}}, }
@inproceedings{46193, author = {{Karp, Martin and Podobas, Artur and Kenter, Tobias and Jansson, Niclas and Plessl, Christian and Schlatter, Philipp and Markidis, Stefano}}, booktitle = {{International Conference on High Performance Computing in Asia-Pacific Region}}, publisher = {{ACM}}, title = {{{A High-Fidelity Flow Solver for Unstructured Meshes on Field-Programmable Gate Arrays: Design, Evaluation, and Future Challenges}}}, doi = {{10.1145/3492805.3492808}}, year = {{2022}}, }
@article{33684, author = {{Schade, Robert and Kenter, Tobias and Elgabarty, Hossam and Lass, Michael and Schütt, Ole and Lazzaro, Alfio and Pabst, Hans and Mohr, Stephan and Hutter, Jürg and Kühne, Thomas and Plessl, Christian}}, issn = {{0167-8191}}, journal = {{Parallel Computing}}, keywords = {{Artificial Intelligence, Computer Graphics and Computer-Aided Design, Computer Networks and Communications, Hardware and Architecture, Theoretical Computer Science, Software}}, publisher = {{Elsevier BV}}, title = {{{Towards electronic structure-based ab-initio molecular dynamics simulations with hundreds of millions of atoms}}}, doi = {{10.1016/j.parco.2022.102920}}, volume = {{111}}, year = {{2022}}, }
@article{27364, author = {{Meyer, Marius and Kenter, Tobias and Plessl, Christian}}, issn = {{0743-7315}}, journal = {{Journal of Parallel and Distributed Computing}}, title = {{{In-depth FPGA Accelerator Performance Evaluation with Single Node Benchmarks from the HPC Challenge Benchmark Suite for Intel and Xilinx FPGAs using OpenCL}}}, doi = {{10.1016/j.jpdc.2021.10.007}}, year = {{2022}}, }
@article{28099, abstract = {{N-body methods are one of the essential algorithmic building blocks of high-performance and parallel computing. Previous research has shown promising performance for implementing n-body simulations with pairwise force calculations on FPGAs. However, to avoid challenges with accumulation and memory access patterns, the presented designs calculate each pair of forces twice, along with both force sums of the involved particles. Also, they require large problem instances with hundreds of thousands of particles to reach their respective peak performance, limiting their applicability in strong scaling scenarios. This work addresses both issues by presenting a novel FPGA design that uses each calculated force twice and overlaps data transfers and computations in a way that allows it to reach peak performance even for small problem instances, outperforming previous single-precision results even in double precision, and scaling linearly over multiple interconnected FPGAs. For a comparison across architectures, we provide an equally optimized CPU reference, which for large problems actually achieves a higher peak performance per device; however, given the strong scaling advantages of the FPGA design, in parallel setups with a few thousand particles per device, the FPGA platform achieves the highest performance and power efficiency.}}, author = {{Menzel, Johannes and Plessl, Christian and Kenter, Tobias}}, issn = {{1936-7406}}, journal = {{ACM Transactions on Reconfigurable Technology and Systems}}, number = {{1}}, pages = {{1--30}}, title = {{{The Strong Scaling Advantage of FPGAs in HPC for N-body Simulations}}}, doi = {{10.1145/3491235}}, volume = {{15}}, year = {{2021}}, }
@inproceedings{46194, author = {{Kenter, Tobias and Shambhu, Adesh and Faghih-Naini, Sara and Aizinger, Vadym}}, booktitle = {{Proceedings of the Platform for Advanced Scientific Computing Conference}}, publisher = {{ACM}}, title = {{{Algorithm-hardware co-design of a discontinuous Galerkin shallow-water model for a dataflow architecture on FPGA}}}, doi = {{10.1145/3468267.3470617}}, year = {{2021}}, }
@inproceedings{46195, author = {{Karp, Martin and Podobas, Artur and Jansson, Niclas and Kenter, Tobias and Plessl, Christian and Schlatter, Philipp and Markidis, Stefano}}, booktitle = {{2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}}, publisher = {{IEEE}}, title = {{{High-Performance Spectral Element Methods on Field-Programmable Gate Arrays: Implementation, Evaluation, and Future Projection}}}, doi = {{10.1109/ipdps49936.2021.00116}}, year = {{2021}}, }
@inbook{21587, abstract = {{Solving partial differential equations on unstructured grids is a cornerstone of engineering and scientific computing. Nowadays, heterogeneous parallel platforms with CPUs, GPUs, and FPGAs enable energy-efficient and computationally demanding simulations. We developed the HighPerMeshes C++-embedded Domain-Specific Language (DSL) for bridging the abstraction gap between the mathematical and algorithmic formulation of mesh-based algorithms for PDE problems on the one hand and an increasing number of heterogeneous platforms with their different parallel programming and runtime models on the other hand. Thus, the HighPerMeshes DSL aims at higher productivity in the code development process for multiple target platforms.
We introduce the concepts as well as the basic structure of the HighPerMeshes DSL and demonstrate its usage with three examples: a Poisson problem and a monodomain problem, both solved with the continuous finite element method, and Maxwell's equations solved with the discontinuous Galerkin method. The mapping of the abstract algorithmic description onto parallel hardware, including distributed-memory compute clusters, is presented. Finally, the achievable performance and scalability are demonstrated for a typical example problem on a multi-core CPU cluster.}}, author = {{Alhaddad, Samer and Förstner, Jens and Groth, Stefan and Grünewald, Daniel and Grynko, Yevgen and Hannig, Frank and Kenter, Tobias and Pfreundt, Franz-Josef and Plessl, Christian and Schotte, Merlind and Steinke, Thomas and Teich, Jürgen and Weiser, Martin and Wende, Florian}}, booktitle = {{Euro-Par 2020: Parallel Processing Workshops}}, isbn = {{9783030715922}}, issn = {{0302-9743}}, keywords = {{tet_topic_hpc}}, title = {{{HighPerMeshes – A Domain-Specific Language for Numerical Algorithms on Unstructured Grids}}}, doi = {{10.1007/978-3-030-71593-9_15}}, year = {{2021}}, }
@inbook{29936, author = {{Ramaswami, Arjun and Kenter, Tobias and Kühne, Thomas and Plessl, Christian}}, booktitle = {{Applied Reconfigurable Computing. Architectures, Tools, and Applications}}, isbn = {{9783030790240}}, issn = {{0302-9743}}, publisher = {{Springer International Publishing}}, title = {{{Evaluating the Design Space for Offloading 3D FFT Calculations to an FPGA for High-Performance Computing}}}, doi = {{10.1007/978-3-030-79025-7_21}}, year = {{2021}}, }
@article{24788, author = {{Alhaddad, Samer and Förstner, Jens and Groth, Stefan and Grünewald, Daniel and Grynko, Yevgen and Hannig, Frank and Kenter, Tobias and Pfreundt, Franz-Josef and Plessl, Christian and Schotte, Merlind and Steinke, Thomas and Teich, Jürgen and Weiser, Martin and Wende, Florian}}, issn = {{1532-0626}}, journal = {{Concurrency and Computation: Practice and Experience}}, keywords = {{tet_topic_hpc}}, pages = {{e6616}}, title = {{{The HighPerMeshes framework for numerical algorithms on unstructured grids}}}, doi = {{10.1002/cpe.6616}}, year = {{2021}}, }
@inproceedings{21632, abstract = {{FPGAs have found increasing adoption in data center applications since a new generation of high-level tools has become available that noticeably reduces development time for FPGA accelerators while still providing high-quality results. There is, however, no high-level benchmark suite available that specifically enables a comparison of FPGA architectures, programming tools, and libraries for HPC applications. To fill this gap, we have developed an OpenCL-based open-source implementation of the HPCC benchmark suite for Xilinx and Intel FPGAs. This benchmark can serve to analyze the current capabilities of FPGA devices, cards, and development tool flows, track progress over time, and point out specific difficulties for FPGA acceleration in the HPC domain.
Additionally, the benchmark documents proven performance optimization patterns. We will continue optimizing and porting the benchmark for new generations of FPGAs and design tools and encourage active participation to create a valuable tool for the community.}}, author = {{Meyer, Marius and Kenter, Tobias and Plessl, Christian}}, booktitle = {{2020 IEEE/ACM International Workshop on Heterogeneous High-performance Reconfigurable Computing (H2RC)}}, isbn = {{9781665415927}}, keywords = {{FPGA, OpenCL, High Level Synthesis, HPC benchmarking}}, title = {{{Evaluating FPGA Accelerator Performance with a Parameterized OpenCL Adaptation of Selected Benchmarks of the HPCChallenge Benchmark Suite}}}, doi = {{10.1109/h2rc51942.2020.00007}}, year = {{2020}}, }