@inproceedings{63890,
  abstract     = {{The computation of highly contracted electron repulsion integrals (ERIs) is essential to achieve quantum accuracy in atomistic simulations based on quantum mechanics. Its growing computational demands make energy efficiency a critical concern. Recent studies demonstrate FPGAs’ superior performance and energy efficiency for computing primitive ERIs, but the computation of highly contracted ERIs introduces significant algorithmic complexity and new design challenges for FPGA acceleration. In this work, we present SORCERI, the first streaming overlay acceleration for highly contracted ERI computations on FPGAs. SORCERI introduces a novel streaming Rys computing unit to calculate roots and weights of Rys polynomials on-chip, and a streaming contraction unit for the contraction of primitive ERIs. This shifts the design bottleneck from limited CPU-FPGA communication bandwidth to available FPGA computation resources. To address practical deployment challenges for a large number of quartet classes, we design three streaming overlays, together with an efficient memory transpose optimization, to cover the 21 most commonly used quartet classes in realistic atomistic simulations. To address the new computation constraints, we use flexible calculation stages with a free-running streaming architecture to achieve high DSP utilization and good timing closure. Experiments demonstrate that SORCERI achieves an average 5.96x, 1.99x, and 1.16x better performance per watt than libint on a 64-core AMD EPYC 7713 CPU, libintx on an Nvidia A40 GPU, and SERI, the prior best-performing FPGA design for primitive ERIs. Furthermore, SORCERI reaches a peak throughput of 44.11 GERIS ($10^9$ ERIs per second) that is 1.52x, 1.13x, and 1.93x greater than libint, libintx and SERI, respectively. SORCERI will be released soon at https://github.com/SFU-HiAccel/SORCERI.}},
  author       = {{Stachura, Philip and Wu, Xin and Plessl, Christian and Fang, Zhenman}},
  booktitle    = {{Proceedings of the 2026 ACM/SIGDA International Symposium on Field Programmable Gate Arrays (FPGA '26)}},
  isbn         = {{9798400720796}},
  keywords     = {{electron repulsion integrals, quantum chemistry, atomistic simulation, overlay architecture, fpga acceleration}},
  pages        = {{224--234}},
  publisher    = {{Association for Computing Machinery}},
  title        = {{{SORCERI: Streaming Overlay Acceleration for Highly Contracted Electron Repulsion Integral Computations in Quantum Chemistry}}},
  doi          = {{10.1145/3748173.3779198}},
  year         = {{2026}},
}

@inproceedings{62066,
  abstract     = {{In the context of high-performance computing (HPC) for distributed workloads, individual field-programmable gate arrays (FPGAs) need efficient ways to exchange data, which requires network infrastructure and software abstractions. Dedicated multi-FPGA clusters provide inter-FPGA networks for direct device to device communication. The oneAPI high-level synthesis toolchain offers I/O pipes to allow user kernels to interact with the networking ports of the FPGA board. In this work, we evaluate using oneAPI I/O pipes for direct FPGA-to-FPGA communication by scaling a SYCL implementation of a Jacobi solver on up to 25 FPGAs in the Noctua 2 cluster. We see good results in weak and strong scaling experiments.}},
  author       = {{Alt, Christoph and Plessl, Christian and Kenter, Tobias}},
  booktitle    = {{Proceedings of the 13th International Workshop on OpenCL and SYCL}},
  isbn         = {{9798400713606}},
  keywords     = {{Multi-FPGA, High-level Synthesis, oneAPI, FPGA}},
  publisher    = {{Association for Computing Machinery}},
  title        = {{{Evaluating oneAPI I/O Pipes in a Case Study of Scaling a SYCL Jacobi Solver to multiple FPGAs}}},
  doi          = {{10.1145/3731125.3731131}},
  year         = {{2025}},
}

@techreport{62981,
  abstract     = {{Otus is a high-performance computing cluster that was launched in 2025 and is operated by the Paderborn Center for Parallel Computing (PC2) at Paderborn University in Germany. The system is part of the National High Performance Computing (NHR) initiative. Otus complements the previous supercomputer Noctua 2, offering approximately twice the computing power while retaining the three node types that were characteristic of Noctua 2: 1) CPU compute nodes with different memory capacities, 2) high-end GPU nodes, and 3) HPC-grade FPGA nodes. On the Top500 list, which ranks the 500 most powerful supercomputers in the world, Otus is in position 164 with the CPU partition and in position 255 with the GPU partition (June 2025). On the Green500 list, ranking the 500 most energy-efficient supercomputers in the world, Otus is in position 5 with the GPU partition (June 2025).


This article provides a comprehensive overview of the system in terms of its hardware, software, system integration, and its overall integration into the data center building to ensure energy-efficient operation. The article aims to provide unique insights for scientists using the system and for other centers operating HPC clusters. The article will be continuously updated to reflect the latest system setup and measurements. }},
  author       = {{Ehtesabi, Sadaf and Hossain, Manoar and Kenter, Tobias and Krawinkel, Andreas and Ostermann, Lukas and Plessl, Christian and Riebler, Heinrich and Rohde, Stefan and Schade, Robert and Schwarz, Michael and Simon, Jens and Winnwa, Nils and Wiens, Alex and Wu, Xin}},
  keywords     = {{Otus, Supercomputer, FPGA, PC2, Paderborn Center for Parallel Computing, Noctua 2, HPC}},
  pages        = {{33}},
  publisher    = {{Paderborn Center for Parallel Computing (PC2)}},
  title        = {{{Otus Supercomputer}}},
  doi          = {{10.48550/ARXIV.2512.07401}},
  volume       = {{1}},
  year         = {{2025}},
}

@article{53663,
  abstract     = {{Noctua 2 is a supercomputer operated at the Paderborn Center for Parallel Computing (PC2) at Paderborn University in Germany. Noctua 2 was inaugurated in 2022 and is an Atos BullSequana XH2000 system. It consists mainly of three node types: 1) CPU Compute nodes with AMD EPYC processors in different main memory configurations, 2) GPU nodes with NVIDIA A100 GPUs, and 3) FPGA nodes with Xilinx Alveo U280 and Intel Stratix 10 FPGA cards. While CPUs and GPUs are known off-the-shelf components in HPC systems, the operation of a large number of FPGA cards from different vendors and a dedicated FPGA-to-FPGA network are unique characteristics of Noctua 2. This paper describes in detail the overall setup of Noctua 2 and gives insights into the operation of the cluster from a hardware, software and facility perspective.}},
  author       = {{Bauer, Carsten and Kenter, Tobias and Lass, Michael and Mazur, Lukas and Meyer, Marius and Nitsche, Holger and Riebler, Heinrich and Schade, Robert and Schwarz, Michael and Winnwa, Nils and Wiens, Alex and Wu, Xin and Plessl, Christian and Simon, Jens}},
  journal      = {{Journal of large-scale research facilities}},
  keywords     = {{Noctua 2, Supercomputer, FPGA, PC2, Paderborn Center for Parallel Computing}},
  title        = {{{Noctua 2 Supercomputer}}},
  doi          = {{10.17815/jlsrf-8-187}},
  volume       = {{8}},
  year         = {{2024}},
}

@phdthesis{29769,
  abstract     = {{Im Wettstreit zwischen der Entwicklung neuer Hardwaretrojaner und entsprechender Gegenmaßnahmen beschreiten Widersacher immer raffiniertere Wege um Schaltungsentwürfe zu infizieren und dabei selbst fortgeschrittene Test- und Verifikationsmethoden zu überlisten. Abgesehen von den konventionellen Methoden um einen Trojaner in eine Schaltung für ein Field-programmable Gate Array (FPGA) einzuschleusen, können auch die Entwurfswerkzeuge heimlich kompromittiert werden um einen Angreifer dabei zu unterstützen einen erfolgreichen Angriff durchzuführen, der zum Beispiel Fehlfunktionen oder ungewollte Informationsabflüsse bewirken kann. Diese Dissertation beschäftigt sich hauptsächlich mit den beiden Blickwinkeln auf Hardwaretrojaner in rekonfigurierbaren Systemen, einerseits der Perspektive des Verteidigers mit einer Methode zur Erkennung von Trojanern auf der Bitstromebene, und andererseits derjenigen des Angreifers mit einer neuartigen Angriffsmethode für FPGA Trojaner. Für die Verteidigung gegen den Trojaner ``Heimtückische LUT'' stellen wir die allererste erfolgreiche Gegenmaßnahme vor, die durch Verifikation mittels Proof-carrying Hardware (PCH) auf der Bitstromebene direkt vor der Konfiguration der Hardware angewendet werden kann, und präsentieren ein vollständiges Schema für den Entwurf und die Verifikation von Schaltungen für iCE40 FPGAs. Für die Gegenseite führen wir einen neuen Angriff ein, welcher bösartiges Routing im eingefügten Trojaner ausnutzt um selbst im fertigen Bitstrom in einem inaktiven Zustand zu verbleiben: Hierdurch kann dieser neuartige Angriff zur Zeit weder von herkömmlichen Test- und Verifikationsmethoden, noch von unserer vorher vorgestellten Verifikation auf der Bitstromebene entdeckt werden.}},
  author       = {{Ahmed, Qazi Arbab}},
  keywords     = {{FPGA Security, Hardware Trojans, Bitstream-level Trojans, Bitstream Verification}},
  publisher    = {{Paderborn University, Paderborn, Germany}},
  title        = {{{Hardware Trojans in Reconfigurable Computing}}},
  doi          = {{10.17619/UNIPB/1-1271}},
  year         = {{2022}},
}

@inproceedings{21632,
  abstract     = {{FPGAs have found increasing adoption in data center applications since a new generation of high-level tools have become available which noticeably reduce development time for FPGA accelerators and still provide high-quality results. There is, however, no high-level benchmark suite available, which specifically enables a comparison of FPGA architectures, programming tools, and libraries for HPC applications. To fill this gap, we have developed an OpenCL-based open-source implementation of the HPCC benchmark suite for Xilinx and Intel FPGAs. This benchmark can serve to analyze the current capabilities of FPGA devices, cards, and development tool flows, track progress over time, and point out specific difficulties for FPGA acceleration in the HPC domain. Additionally, the benchmark documents proven performance optimization patterns. We will continue optimizing and porting the benchmark for new generations of FPGAs and design tools and encourage active participation to create a valuable tool for the community.}},
  author       = {{Meyer, Marius and Kenter, Tobias and Plessl, Christian}},
  booktitle    = {{2020 IEEE/ACM International Workshop on Heterogeneous High-performance Reconfigurable Computing (H2RC)}},
  isbn         = {{9781665415927}},
  keywords     = {{FPGA, OpenCL, High Level Synthesis, HPC benchmarking}},
  title        = {{{Evaluating FPGA Accelerator Performance with a Parameterized OpenCL Adaptation of Selected Benchmarks of the HPCChallenge Benchmark Suite}}},
  doi          = {{10.1109/h2rc51942.2020.00007}},
  year         = {{2020}},
}

@article{11950,
  abstract     = {{Advances in electromyographic (EMG) sensor technology and machine learning algorithms have led to an increased research effort into high density EMG-based pattern recognition methods for prosthesis control. With the goal set on an autonomous multi-movement prosthesis capable of performing training and classification of an amputee’s EMG signals, the focus of this paper lies in the acceleration of the embedded signal processing chain. We present two Xilinx Zynq-based architectures for accelerating two inherently different high density EMG-based control algorithms. The first hardware accelerated design achieves speed-ups of up to 4.8 over the software-only solution, allowing for a processing delay lower than the sample period of 1 ms. The second system achieved a speed-up of 5.5 over the software-only version and operates at a still satisfactory low processing delay of up to 15 ms while providing a higher reliability and robustness against electrode shift and noisy channels.}},
  author       = {{Boschmann, Alexander and Agne, Andreas and Thombansen, Georg and Witschen, Linus Matthias and Kraus, Florian and Platzner, Marco}},
  issn         = {{0743-7315}},
  journal      = {{Journal of Parallel and Distributed Computing}},
  keywords     = {{High density electromyography, FPGA acceleration, Medical signal processing, Pattern recognition, Prosthetics}},
  pages        = {{77--89}},
  publisher    = {{Elsevier}},
  title        = {{{Zynq-based acceleration of robust high density myoelectric signal processing}}},
  doi          = {{10.1016/j.jpdc.2018.07.004}},
  volume       = {{123}},
  year         = {{2019}},
}

@misc{5417,
  abstract     = {{Molecular Dynamic (MD) simulations are computationally intensive and accelerating them using specialized hardware is a topic of investigation in many studies. One of the routines in the critical path of MD simulations is the three-dimensional Fast Fourier Transformation (FFT3d). The potential in accelerating FFT3d using hardware is usually bound by bandwidth and memory. Therefore, designing a high throughput solution for an FPGA that overcomes this problem is challenging.
In this thesis, the feasibility of offloading FFT3d computations to FPGA implemented using OpenCL is investigated. In order to mask the latency in memory access, an FFT3d that overlaps computation with communication is designed. The implementation of this design is synthesized for the Arria 10 GX 1150 FPGA and evaluated with the FFTW benchmark. Analysis shows a better performance using FPGA over CPU for larger FFT sizes, with the $64^3$ FFT showing a 70\% improvement in runtime using FPGAs.
This FFT3d design is integrated with CP2K to explore the potential in accelerating molecular dynamic simulations. Evaluation of CP2K simulations using FPGA shows a 41\% improvement in runtime in FFT3d computations over CPU for larger FFT3d designs.}},
  author       = {{Ramaswami, Arjun}},
  keywords     = {{FFT, FPGA, CP2K, OpenCL}},
  publisher    = {{Universität Paderborn}},
  title        = {{{Accelerating Molecular Dynamic Simulations by Offloading Fast Fourier Transformations to FPGA}}},
  year         = {{2018}},
}

@inproceedings{10673,
  author       = {{Ho, Nam and Ahmed, Abdullah Fathi and Kaufmann, Paul and Platzner, Marco}},
  booktitle    = {{Proc. NASA/ESA Conf. Adaptive Hardware and Systems (AHS)}},
  keywords     = {{cache storage, field programmable gate arrays, multiprocessing systems, parallel architectures, reconfigurable architectures, FPGA, dynamic reconfiguration, evolvable cache mapping, many-core architecture, memory-to-cache address mapping function, microarchitectural optimization, multicore architecture, nature-inspired optimization, parallelization degrees, processor, reconfigurable cache mapping, reconfigurable computing, Field programmable gate arrays, Software, Tuning}},
  pages        = {{1--7}},
  title        = {{{Microarchitectural optimization by means of reconfigurable and evolvable cache mappings}}},
  doi          = {{10.1109/AHS.2015.7231178}},
  year         = {{2015}},
}

@inproceedings{10620,
  author       = {{Anwer, Jahanzeb and Meisner, Sebastian and Platzner, Marco}},
  booktitle    = {{Reconfigurable Computing and FPGAs (ReConFig), 2013 International Conference on}},
  keywords     = {{fault tolerant computing, field programmable gate arrays, logic design, reliability, BYU-LANL tool, DRM tool flow, FPGA based hardware designs, avionic application, device technologies, dynamic reliability management, fault-tolerant operation, hardware designs, reconfiguring reliability levels, space applications, Field programmable gate arrays, Hardware, Redundancy, Reliability engineering, Runtime, Tunneling magnetoresistance}},
  pages        = {{1--6}},
  title        = {{{Dynamic reliability management: Reconfiguring reliability-levels of hardware designs at runtime}}},
  doi          = {{10.1109/ReConFig.2013.6732280}},
  year         = {{2013}},
}

@article{2412,
  abstract     = {{Reconfigurable architectures that tightly integrate a standard CPU core with a field-programmable hardware structure have recently been receiving increased attention. Evaluating the impact of these design decisions on the overall system performance is a challenging task. In this paper, we first present a framework for the cycle-accurate performance evaluation of hybrid reconfigurable processors on the system level. Then, we discuss a reconfigurable processor for data-streaming applications, which attaches a coarse-grained reconfigurable unit to the coprocessor interface of a standard embedded CPU core. By means of a case study we evaluate the system-level impact of certain design features for the reconfigurable unit, such as multiple contexts, register replication, and hardware context scheduling. The results illustrate that a system-level evaluation framework is of paramount importance for studying the architectural trade-offs and optimizing design parameters for reconfigurable processors.}},
  author       = {{Enzler, Rolf and Plessl, Christian and Platzner, Marco}},
  journal      = {{Microprocessors and Microsystems}},
  keywords     = {{FPGA, reconfigurable computing, co-simulation, Zippy}},
  number       = {{2--3}},
  pages        = {{63--73}},
  publisher    = {{Elsevier}},
  title        = {{{System-level performance evaluation of reconfigurable processors}}},
  doi          = {{10.1016/j.micpro.2004.06.004}},
  volume       = {{29}},
  year         = {{2005}},
}

@inproceedings{2418,
  abstract     = {{ This paper presents TKDM, a PC-based high-performance reconfigurable computing environment. The TKDM hardware consists of an FPGA module that uses the DIMM (dual inline memory module) bus for high-bandwidth and low-latency communication with the host CPU. The system's firmware is integrated with the Linux host operating system and offers functions for data communication and FPGA reconfiguration. The intended use of TKDM is that of a dynamically reconfigurable co-processor for data streaming applications. The system's firmware can be customized for specific application domains to facilitate simple and easy-to-use programming interfaces. }},
  author       = {{Plessl, Christian and Platzner, Marco}},
  booktitle    = {{Proc. Int. Conf. on Field Programmable Technology (ICFPT)}},
  keywords     = {{coprocessor, DIMM, memory bus, FPGA, high performance computing}},
  pages        = {{252--259}},
  publisher    = {{IEEE Computer Society}},
  title        = {{{TKDM – A Reconfigurable Co-processor in a PC's Memory Slot}}},
  doi          = {{10.1109/FPT.2003.1275755}},
  year         = {{2003}},
}

@inproceedings{2421,
  abstract     = {{In contrast to processors, current reconfigurable devices totally lack programming models that would allow for device independent compilation and forward compatibility. The key to overcome this limitation is hardware virtualization. In this paper, we resort to a macro-pipelined execution model to achieve hardware virtualization for data streaming applications. As a hardware implementation we present a hybrid multi-context architecture that attaches a coarse-grained reconfigurable array to a host CPU. A co-simulation framework enables cycle-accurate simulation of the complete architecture. As a case study we map an FIR filter to our virtualized hardware model and evaluate different designs. We discuss the impact of the number of contexts and the feature of context state on the speedup and the CPU load.}},
  author       = {{Enzler, Rolf and Plessl, Christian and Platzner, Marco}},
  booktitle    = {{Proc. Int. Conf. on Field Programmable Logic and Applications (FPL)}},
  keywords     = {{Zippy, multi-context, FPGA}},
  pages        = {{151--160}},
  publisher    = {{Springer}},
  title        = {{{Virtualizing Hardware with Multi-Context Reconfigurable Arrays}}},
  doi          = {{10.1007/b12007}},
  volume       = {{2778}},
  year         = {{2003}},
}

