@article{53663,
  abstract     = {{Noctua 2 is a supercomputer operated at the Paderborn Center for Parallel Computing (PC2) at Paderborn University in Germany. Noctua 2 was inaugurated in 2022 and is an Atos BullSequana XH2000 system. It consists mainly of three node types: 1) CPU Compute nodes with AMD EPYC processors in different main memory configurations, 2) GPU nodes with NVIDIA A100 GPUs, and 3) FPGA nodes with Xilinx Alveo U280 and Intel Stratix 10 FPGA cards. While CPUs and GPUs are known off-the-shelf components in HPC systems, the operation of a large number of FPGA cards from different vendors and a dedicated FPGA-to-FPGA network are unique characteristics of Noctua 2. This paper describes in detail the overall setup of Noctua 2 and gives insights into the operation of the cluster from a hardware, software and facility perspective.}},
  author       = {{Bauer, Carsten and Kenter, Tobias and Lass, Michael and Mazur, Lukas and Meyer, Marius and Nitsche, Holger and Riebler, Heinrich and Schade, Robert and Schwarz, Michael and Winnwa, Nils and Wiens, Alex and Wu, Xin and Plessl, Christian and Simon, Jens}},
  journal      = {{Journal of large-scale research facilities}},
  keywords     = {{Noctua 2, Supercomputer, FPGA, PC2, Paderborn Center for Parallel Computing}},
  title        = {{{Noctua 2 Supercomputer}}},
  doi          = {{10.17815/jlsrf-8-187}},
  volume       = {{8}},
  year         = {{2024}},
}

@inproceedings{56607,
  author       = {{Tareen, Abdul Rehman and Meyer, Marius and Plessl, Christian and Kenter, Tobias}},
  booktitle    = {{2024 IEEE 32nd Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)}},
  publisher    = {{IEEE}},
  title        = {{{HiHiSpMV: Sparse Matrix Vector Multiplication with Hierarchical Row Reductions on FPGAs with High Bandwidth Memory}}},
  doi          = {{10.1109/fccm60383.2024.00014}},
  year         = {{2024}},
}

@inbook{62067,
  abstract     = {{Most FPGA boards in the HPC domain are well-suited for parallel scaling because of the direct integration of versatile and high-throughput network ports. However, the utilization of their network capabilities is often challenging and error-prone because the whole network stack and communication patterns have to be implemented and managed on the FPGAs. Also, this approach conceptually involves a trade-off between the performance potential of improved communication and the impact of resource consumption for communication infrastructure, since the utilized resources on the FPGAs could otherwise be used for computations. In this work, we investigate this trade-off, firstly, by using synthetic benchmarks to evaluate the different configuration options of the communication framework ACCL and their impact on communication latency and throughput. Finally, we use our findings to implement a shallow water simulation whose scalability heavily depends on low-latency communication. With a suitable configuration of ACCL, good scaling behavior can be shown to all 48 FPGAs installed in the system. Overall, the results show that the availability of inter-FPGA communication frameworks as well as the configurability of framework and network stack are crucial to achieve the best application performance with low latency communication.}},
  author       = {{Meyer, Marius and Kenter, Tobias and Petrica, Lucian and O’Brien, Kenneth and Blott, Michaela and Plessl, Christian}},
  booktitle    = {{Lecture Notes in Computer Science}},
  isbn         = {{9783031697654}},
  issn         = {{0302-9743}},
  publisher    = {{Springer Nature Switzerland}},
  title        = {{{Optimizing Communication for Latency Sensitive HPC Applications on up to 48 FPGAs Using ACCL}}},
  doi          = {{10.1007/978-3-031-69766-1_9}},
  year         = {{2024}},
}

@inbook{45893,
  author       = {{Hansmeier, Tim and Kenter, Tobias and Meyer, Marius and Riebler, Heinrich and Platzner, Marco and Plessl, Christian}},
  booktitle    = {{On-The-Fly Computing -- Individualized IT-services in dynamic markets}},
  editor       = {{Haake, Claus-Jochen and Meyer auf der Heide, Friedhelm and Platzner, Marco and Wachsmuth, Henning and Wehrheim, Heike}},
  pages        = {{165--182}},
  publisher    = {{Heinz Nixdorf Institut, Universität Paderborn}},
  title        = {{{Compute Centers I: Heterogeneous Execution Environments}}},
  doi          = {{10.5281/zenodo.8068642}},
  volume       = {{412}},
  year         = {{2023}},
}

@article{38041,
  abstract     = {{While FPGA accelerator boards and their respective high-level design tools are maturing, there is still a lack of multi-FPGA applications, libraries, and not least, benchmarks and reference implementations towards sustained HPC usage of these devices. As in the early days of GPUs in HPC, for workloads that can reasonably be decoupled into loosely coupled working sets, multi-accelerator support can be achieved by using standard communication interfaces like MPI on the host side. However, for performance and productivity, some applications can profit from a tighter coupling of the accelerators. FPGAs offer unique opportunities here when extending the dataflow characteristics to their communication interfaces. In this work, we extend the HPCC FPGA benchmark suite by multi-FPGA support and three missing benchmarks that particularly characterize or stress inter-device communication: b_eff, PTRANS, and LINPACK. With all benchmarks implemented for current boards with Intel and Xilinx FPGAs, we established a baseline for multi-FPGA performance. Additionally, for the communication-centric benchmarks, we explored the potential of direct FPGA-to-FPGA communication with a circuit-switched inter-FPGA network that is currently only available for one of the boards. The evaluation with parallel execution on up to 26 FPGA boards makes use of one of the largest academic FPGA installations.}},
  author       = {{Meyer, Marius and Kenter, Tobias and Plessl, Christian}},
  issn         = {{1936-7406}},
  journal      = {{ACM Transactions on Reconfigurable Technology and Systems}},
  keywords     = {{General Computer Science}},
  publisher    = {{Association for Computing Machinery (ACM)}},
  title        = {{{Multi-FPGA Designs and Scaling of HPC Challenge Benchmarks via MPI and Circuit-Switched Inter-FPGA Networks}}},
  doi          = {{10.1145/3576200}},
  year         = {{2023}},
}

@article{27364,
  author       = {{Meyer, Marius and Kenter, Tobias and Plessl, Christian}},
  issn         = {{0743-7315}},
  journal      = {{Journal of Parallel and Distributed Computing}},
  title        = {{{In-depth FPGA Accelerator Performance Evaluation with Single Node Benchmarks from the HPC Challenge Benchmark Suite for Intel and Xilinx FPGAs using OpenCL}}},
  doi          = {{10.1016/j.jpdc.2021.10.007}},
  year         = {{2022}},
}

@inproceedings{27365,
  author       = {{Meyer, Marius}},
  booktitle    = {{Proceedings of the 11th International Symposium on Highly Efficient Accelerators and Reconfigurable Technologies}},
  title        = {{{Towards Performance Characterization of FPGAs in Context of HPC using OpenCL Benchmarks}}},
  doi          = {{10.1145/3468044.3468058}},
  year         = {{2021}},
}

@inproceedings{21632,
  abstract     = {{FPGAs have found increasing adoption in data center applications since a new generation of high-level tools have become available which noticeably reduce development time for FPGA accelerators and still provide high-quality results. There is, however, no high-level benchmark suite available, which specifically enables a comparison of FPGA architectures, programming tools, and libraries for HPC applications. To fill this gap, we have developed an OpenCL-based open-source implementation of the HPCC benchmark suite for Xilinx and Intel FPGAs. This benchmark can serve to analyze the current capabilities of FPGA devices, cards, and development tool flows, track progress over time, and point out specific difficulties for FPGA acceleration in the HPC domain. Additionally, the benchmark documents proven performance optimization patterns. We will continue optimizing and porting the benchmark for new generations of FPGAs and design tools and encourage active participation to create a valuable tool for the community.}},
  author       = {{Meyer, Marius and Kenter, Tobias and Plessl, Christian}},
  booktitle    = {{2020 IEEE/ACM International Workshop on Heterogeneous High-performance Reconfigurable Computing (H2RC)}},
  isbn         = {{9781665415927}},
  keywords     = {{FPGA, OpenCL, High Level Synthesis, HPC benchmarking}},
  title        = {{{Evaluating FPGA Accelerator Performance with a Parameterized OpenCL Adaptation of Selected Benchmarks of the HPCChallenge Benchmark Suite}}},
  doi          = {{10.1109/h2rc51942.2020.00007}},
  year         = {{2020}},
}
