@article{56604,
  abstract     = {{<jats:p>
            This manuscript makes the claim of having computed the
            <jats:inline-formula content-type="math/tex">
              <jats:tex-math notation="LaTeX" version="MathJax">\(9\)</jats:tex-math>
            </jats:inline-formula>
            th Dedekind number, D(9). This was done by accelerating the core operation of the process with an efficient FPGA design that outperforms an optimized 64-core CPU reference by 95
            <jats:inline-formula content-type="math/tex">
              <jats:tex-math notation="LaTeX" version="MathJax">\(\times\)</jats:tex-math>
            </jats:inline-formula>
            . The FPGA execution was parallelized on the Noctua 2 supercomputer at Paderborn University. The resulting value for D(9) is 286386577668298411128469151667598498812366. This value can be verified in two steps. We have made the data file containing the 490 M results available, each of which can be verified separately on CPU, and the whole file sums to our proposed value. The paper explains the mathematical approach in the first part, before putting the focus on a deep dive into the FPGA accelerator implementation followed by a performance analysis. The FPGA implementation was done in Register-Transfer Level using a dual-clock architecture and shows how we achieved an impressive FMax of 450 MHz on the targeted Stratix 10 GX 2,800 FPGAs. The total compute time used was 47,000 FPGA hours.
          </jats:p>}},
  author       = {{Van Hirtum, Lennart and De Causmaecker, Patrick and Goemaere, Jens and Kenter, Tobias and Riebler, Heinrich and Lass, Michael and Plessl, Christian}},
  issn         = {{1936-7406}},
  journal      = {{ACM Transactions on Reconfigurable Technology and Systems}},
  number       = {{3}},
  pages        = {{1--28}},
  publisher    = {{Association for Computing Machinery (ACM)}},
  title        = {{{A Computation of the Ninth Dedekind Number Using FPGA Supercomputing}}},
  doi          = {{10.1145/3674147}},
  volume       = {{17}},
  year         = {{2024}},
}

@article{38041,
  abstract     = {{<jats:p>While FPGA accelerator boards and their respective high-level design tools are maturing, there is still a lack of multi-FPGA applications, libraries, and not least, benchmarks and reference implementations towards sustained HPC usage of these devices. As in the early days of GPUs in HPC, for workloads that can reasonably be decoupled into loosely coupled working sets, multi-accelerator support can be achieved by using standard communication interfaces like MPI on the host side. However, for performance and productivity, some applications can profit from a tighter coupling of the accelerators. FPGAs offer unique opportunities here when extending the dataflow characteristics to their communication interfaces.</jats:p>
          <jats:p>In this work, we extend the HPCC FPGA benchmark suite by multi-FPGA support and three missing benchmarks that particularly characterize or stress inter-device communication: b_eff, PTRANS, and LINPACK. With all benchmarks implemented for current boards with Intel and Xilinx FPGAs, we established a baseline for multi-FPGA performance. Additionally, for the communication-centric benchmarks, we explored the potential of direct FPGA-to-FPGA communication with a circuit-switched inter-FPGA network that is currently only available for one of the boards. The evaluation with parallel execution on up to 26 FPGA boards makes use of one of the largest academic FPGA installations.</jats:p>}},
  author       = {{Meyer, Marius and Kenter, Tobias and Plessl, Christian}},
  issn         = {{1936-7406}},
  journal      = {{ACM Transactions on Reconfigurable Technology and Systems}},
  keywords     = {{General Computer Science}},
  publisher    = {{Association for Computing Machinery (ACM)}},
  title        = {{{Multi-FPGA Designs and Scaling of HPC Challenge Benchmarks via MPI and Circuit-Switched Inter-FPGA Networks}}},
  doi          = {{10.1145/3576200}},
  year         = {{2023}},
}

@article{28099,
  abstract     = {{N-body methods are one of the essential algorithmic building blocks of high-performance and parallel computing. Previous research has shown promising performance for implementing n-body simulations with pairwise force calculations on FPGAs. However, to avoid challenges with accumulation and memory access patterns, the presented designs calculate each pair of forces twice, along with both force sums of the involved particles. Also, they require large problem instances with hundreds of thousands of particles to reach their respective peak performance, limiting the applicability for strong scaling scenarios. This work addresses both issues by presenting a novel FPGA design that uses each calculated force twice and overlaps data transfers and computations in a way that allows to reach peak performance even for small problem instances, outperforming previous single precision results even in double precision, and scaling linearly over multiple interconnected FPGAs. For a comparison across architectures, we provide an equally optimized CPU reference, which for large problems actually achieves higher peak performance per device, however, given the strong scaling advantages of the FPGA design, in parallel setups with few thousand particles per device, the FPGA platform achieves highest performance and power efficiency.}},
  author       = {{Menzel, Johannes and Plessl, Christian and Kenter, Tobias}},
  issn         = {{1936-7406}},
  journal      = {{ACM Transactions on Reconfigurable Technology and Systems}},
  number       = {{1}},
  pages        = {{1--30}},
  title        = {{{The Strong Scaling Advantage of FPGAs in HPC for N-body Simulations}}},
  doi          = {{10.1145/3491235}},
  volume       = {{15}},
  year         = {{2021}},
}

@article{29150,
  abstract     = {{Robotics applications process large amounts of data in real time and require compute platforms that provide high performance and energy efficiency. FPGAs are well suited for many of these applications, but there is a reluctance in the robotics community to use hardware acceleration due to increased design complexity and a lack of consistent programming models across the software/hardware boundary. In this article, we present ReconROS, a framework that integrates the widely used robot operating system (ROS) with ReconOS, which features multithreaded programming of hardware and software threads for reconfigurable computers. This unique combination gives ROS 2 developers the flexibility to transparently accelerate parts of their robotics applications in hardware. We elaborate on the architecture and the design flow for ReconROS and report on a set of experiments that underline the feasibility and flexibility of our approach.}},
  author       = {{Lienen, Christian and Platzner, Marco}},
  issn         = {{1936-7406}},
  journal      = {{ACM Transactions on Reconfigurable Technology and Systems}},
  pages        = {{1--20}},
  title        = {{{Design of Distributed Reconfigurable Robotics Systems with ReconROS}}},
  doi          = {{10.1145/3494571}},
  year         = {{2021}},
}

@article{18,
  abstract     = {{Branch and bound (B&B) algorithms structure the search space as a tree and eliminate infeasible solutions early by pruning subtrees that cannot lead to a valid or optimal solution. Custom hardware designs significantly accelerate the execution of these algorithms. In this article, we demonstrate a high-performance B&B implementation on FPGAs. First, we identify general elements of B&B algorithms and describe their implementation as a finite state machine. Then, we introduce workers that autonomously cooperate using work stealing to allow parallel execution and full utilization of the target FPGA. Finally, we explore advantages of instance-specific designs that target a specific problem instance to improve performance.

We evaluate our concepts by applying them to a branch and bound problem, the reconstruction of corrupted AES keys obtained from cold-boot attacks. The evaluation shows that our work stealing approach is scalable with the available resources and provides speedups proportional to the number of workers. Instance-specific designs allow us to achieve an overall speedup of 47 × compared to the fastest implementation of AES key reconstruction so far. Finally, we demonstrate how instance-specific designs can be generated just-in-time such that the provided speedups outweigh the additional time required for design synthesis.}},
  author       = {{Riebler, Heinrich and Lass, Michael and Mittendorf, Robert and Löcke, Thomas and Plessl, Christian}},
  issn         = {{1936-7406}},
  journal      = {{ACM Transactions on Reconfigurable Technology and Systems (TRETS)}},
  keywords     = {{coldboot}},
  number       = {{3}},
  pages        = {{24:1--24:23}},
  publisher    = {{Association for Computing Machinery (ACM)}},
  title        = {{{Efficient Branch and Bound on FPGAs Using Work Stealing and Instance-Specific Designs}}},
  doi          = {{10.1145/3053687}},
  volume       = {{10}},
  year         = {{2017}},
}