@inproceedings{63890,
  abstract     = {{The computation of highly contracted electron repulsion integrals (ERIs) is essential to achieve quantum accuracy in atomistic simulations based on quantum mechanics. Its growing computational demands make energy efficiency a critical concern. Recent studies demonstrate FPGAs’ superior performance and energy efficiency for computing primitive ERIs, but the computation of highly contracted ERIs introduces significant algorithmic complexity and new design challenges for FPGA acceleration. In this work, we present SORCERI, the first streaming overlay acceleration for highly contracted ERI computations on FPGAs. SORCERI introduces a novel streaming Rys computing unit to calculate roots and weights of Rys polynomials on-chip, and a streaming contraction unit for the contraction of primitive ERIs. This shifts the design bottleneck from limited CPU-FPGA communication bandwidth to available FPGA computation resources. To address practical deployment challenges for a large number of quartet classes, we design three streaming overlays, together with an efficient memory transpose optimization, to cover the 21 most commonly used quartet classes in realistic atomistic simulations. To address the new computation constraints, we use flexible calculation stages with a free-running streaming architecture to achieve high DSP utilization and good timing closure. Experiments demonstrate that SORCERI achieves an average 5.96x, 1.99x, and 1.16x better performance per watt than libint on a 64-core AMD EPYC 7713 CPU, libintx on an Nvidia A40 GPU, and SERI, the prior best-performing FPGA design for primitive ERIs. Furthermore, SORCERI reaches a peak throughput of 44.11 GERIS ($10^9$ ERIs per second) that is 1.52x, 1.13x, and 1.93x greater than libint, libintx and SERI, respectively. SORCERI will be released soon at https://github.com/SFU-HiAccel/SORCERI.}},
  author       = {{Stachura, Philip and Wu, Xin and Plessl, Christian and Fang, Zhenman}},
  booktitle    = {{Proceedings of the 2026 ACM/SIGDA International Symposium on Field Programmable Gate Arrays (FPGA '26)}},
  isbn         = {{9798400720796}},
  keywords     = {{electron repulsion integrals, quantum chemistry, atomistic simulation, overlay architecture, fpga acceleration}},
  pages        = {{224--234}},
  publisher    = {{Association for Computing Machinery}},
  title        = {{{SORCERI: Streaming Overlay Acceleration for Highly Contracted Electron Repulsion Integral Computations in Quantum Chemistry}}},
  doi          = {{10.1145/3748173.3779198}},
  year         = {{2026}},
}

@unpublished{64071,
  abstract     = {{Stimulated by the renewed interest and recent developments in semi-empirical quantum chemical (SQC) methods for noncovalent interactions, we examine the properties of liquid water at ambient conditions by means of molecular dynamics (MD) simulations, both with the conventional NDDO-type (neglect of diatomic differential overlap) methods, e.g. AM1 and PM6, and with DFTB-type (density-functional tight-binding) methods, e.g. DFTB2 and GFN-xTB. Besides the original parameter sets, some specifically reparametrized SQC methods (denoted as AM1-W, PM6-fm, and DFTB2-iBi) targeting various smaller water systems ranging from molecular clusters to bulk are considered as well. The quality of these different SQC methods for describing liquid water properties at ambient conditions are assessed by comparison to well-established experimental data and also to BLYP-D3 density functional theory-based ab initio MD simulations. Our analyses reveal that static and dynamics properties of bulk water are poorly described by all considered SQC methods with the original parameters, regardless of the underlying theoretical models, with most of the methods suffering from too weak hydrogen bonds and hence predicting a far too fluid water with highly distorted hydrogen bond kinetics. On the other hand, the reparametrized force-matched PM6-fm method is shown to be able to quantitatively reproduce the static and dynamic features of liquid water, and thus can be used as a computationally efficient alternative to electronic structure-based MD simulations for liquid water that requires extended length and time scales. DFTB2-iBi predicts a slightly overstructured water with reduced fluidity, whereas AM1-W gives an amorphous ice-like structure for water at ambient conditions.}},
  author       = {{Wu, Xin and Elgabarty, Hossam and Alizadeh, Vahideh and Henao Aristizabal, Andres and Zysk, Frederik and Plessl, Christian and Ehlert, Sebastian and Hutter, Jürg and Kühne, Thomas D.}},
  note         = {{Preprint}},
  title        = {{{Benchmarking semi-empirical quantum chemical methods on liquid water}}},
  year         = {{2025}},
}

@techreport{62981,
  abstract     = {{Otus is a high-performance computing cluster that was launched in 2025 and is operated by the Paderborn Center for Parallel Computing (PC2) at Paderborn University in Germany. The system is part of the National High Performance Computing (NHR) initiative. Otus complements the previous supercomputer Noctua 2, offering approximately twice the computing power while retaining the three node types that were characteristic of Noctua 2: 1) CPU compute nodes with different memory capacities, 2) high-end GPU nodes, and 3) HPC-grade FPGA nodes. On the Top500 list, which ranks the 500 most powerful supercomputers in the world, Otus is in position 164 with the CPU partition and in position 255 with the GPU partition (June 2025). On the Green500 list, ranking the 500 most energy-efficient supercomputers in the world, Otus is in position 5 with the GPU partition (June 2025).


This article provides a comprehensive overview of the system in terms of its hardware, software, system integration, and its overall integration into the data center building to ensure energy-efficient operation. The article aims to provide unique insights for scientists using the system and for other centers operating HPC clusters. The article will be continuously updated to reflect the latest system setup and measurements.}},
  author       = {{Ehtesabi, Sadaf and Hossain, Manoar and Kenter, Tobias and Krawinkel, Andreas and Ostermann, Lukas and Plessl, Christian and Riebler, Heinrich and Rohde, Stefan and Schade, Robert and Schwarz, Michael and Simon, Jens and Winnwa, Nils and Wiens, Alex and Wu, Xin}},
  institution  = {{Paderborn Center for Parallel Computing (PC2), Paderborn University}},
  keywords     = {{Otus, Supercomputer, FPGA, PC2, Paderborn Center for Parallel Computing, Noctua 2, HPC}},
  pages        = {{33}},
  publisher    = {{Paderborn Center for Parallel Computing (PC2)}},
  title        = {{{Otus Supercomputer}}},
  doi          = {{10.48550/ARXIV.2512.07401}},
  volume       = {{1}},
  year         = {{2025}},
}

@article{53663,
  abstract     = {{Noctua 2 is a supercomputer operated at the Paderborn Center for Parallel Computing (PC2) at Paderborn University in Germany. Noctua 2 was inaugurated in 2022 and is an Atos BullSequana XH2000 system. It consists mainly of three node types: 1) CPU Compute nodes with AMD EPYC processors in different main memory configurations, 2) GPU nodes with NVIDIA A100 GPUs, and 3) FPGA nodes with Xilinx Alveo U280 and Intel Stratix 10 FPGA cards. While CPUs and GPUs are known off-the-shelf components in HPC systems, the operation of a large number of FPGA cards from different vendors and a dedicated FPGA-to-FPGA network are unique characteristics of Noctua 2. This paper describes in detail the overall setup of Noctua 2 and gives insights into the operation of the cluster from a hardware, software and facility perspective.}},
  author       = {{Bauer, Carsten and Kenter, Tobias and Lass, Michael and Mazur, Lukas and Meyer, Marius and Nitsche, Holger and Riebler, Heinrich and Schade, Robert and Schwarz, Michael and Winnwa, Nils and Wiens, Alex and Wu, Xin and Plessl, Christian and Simon, Jens}},
  journal      = {{Journal of large-scale research facilities}},
  keywords     = {{Noctua 2, Supercomputer, FPGA, PC2, Paderborn Center for Parallel Computing}},
  title        = {{{Noctua 2 Supercomputer}}},
  doi          = {{10.17815/jlsrf-8-187}},
  internal-note = {{NOTE(review): volume 9 disagrees with DOI suffix jlsrf-8-187 (suggests volume 8) -- verify against publisher record}},
  volume       = {{9}},
  year         = {{2024}},
}

@inproceedings{56609,
  abstract     = {{The computation of electron repulsion integrals (ERIs) is a key component for quantum chemical methods. The intensive computation and bandwidth demand for ERI evaluation presents a significant challenge for quantum-mechanics-based atomistic simulations with hybrid density functional theory: due to the tens of trillions of ERI computations in each time step, practical applications are usually limited to thousands of atoms. In this work, we propose SERI, a high-throughput streaming accelerator for ERI computation on HBM-based FPGAs. In contrast to prior buffer-based designs, SERI proposes a novel streaming architecture to address the on-chip buffer limitation and the floorplanning challenge, and leverages the high-bandwidth memory to overcome the bandwidth bottleneck in prior designs. Moreover, to meet the varying computation, bandwidth, and floorplanning requirements between the 55 canonical quartet classes in ERI calculation, we design an automation tool, together with an accurate performance model, to automatically customize the architecture and floorplanning strategy for each canonical quartet class to maximize their throughput. Our performance evaluation on the AMD/Xilinx Alveo U280 FPGA board shows that, SERI achieves an average speedup of 9.80x over the previous best-performing FPGA design, a 3.21x speedup over a 64-core AMD EPYC 7713 CPU, and a 15.64x speedup over an Nvidia A40 GPU. It reaches a peak throughput of 23.8 GERIS ($10^9$ ERIs per second) on one Alveo U280 FPGA. SERI will be released soon at https://github.com/SFU-HiAccel/SERI.}},
  author       = {{Stachura, Philip and Li, Guanyu and Wu, Xin and Plessl, Christian and Fang, Zhenman}},
  booktitle    = {{2024 34th International Conference on Field-Programmable Logic and Applications (FPL)}},
  pages        = {{60--68}},
  publisher    = {{IEEE}},
  title        = {{{SERI: High-Throughput Streaming Acceleration of Electron Repulsion Integral Computation in Quantum Chemistry using HBM-based FPGAs}}},
  doi          = {{10.1109/fpl64840.2024.00018}},
  year         = {{2024}},
}

@inproceedings{43228,
  abstract     = {{The computation of electron repulsion integrals (ERIs) over Gaussian-type orbitals (GTOs) is a challenging problem in quantum-mechanics-based atomistic simulations. In practical simulations, several trillions of ERIs may have to be computed for every time step. In this work, we investigate FPGAs as accelerators for the ERI computation. We use template parameters, here within the Intel oneAPI tool flow, to create customized designs for 256 different ERI quartet classes, based on their orbitals. To maximize data reuse, all intermediates are buffered in FPGA on-chip memory with customized layout. The pre-calculation of intermediates also helps to overcome data dependencies caused by multi-dimensional recurrence relations. The involved loop structures are partially or even fully unrolled for high throughput of FPGA kernels. Furthermore, a lossy compression algorithm utilizing arbitrary bitwidth integers is integrated in the FPGA kernels. To our best knowledge, this is the first work on ERI computation on FPGAs that supports more than just the single most basic quartet class. Also, the integration of ERI computation and compression is a novelty that is not even covered by CPU or GPU libraries so far. Our evaluation shows that using 16-bit integer for the ERI compression, the fastest FPGA kernels exceed the performance of 10 GERIS ($10 \times 10^9$ ERIs per second) on one Intel Stratix 10 GX 2800 FPGA, with maximum absolute errors around $10^{-7}$ - $10^{-5}$ Hartree. The measured throughput can be accurately explained by a performance model. The FPGA kernels deployed on 2 FPGAs outperform similar computations using the widely used libint reference on a two-socket server with 40 Xeon Gold 6148 CPU cores of the same process technology by factors up to 6.0x and on a new two-socket server with 128 EPYC 7713 CPU cores by up to 1.9x.}},
  author       = {{Wu, Xin and Kenter, Tobias and Schade, Robert and Kühne, Thomas and Plessl, Christian}},
  booktitle    = {{2023 IEEE 31st Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)}},
  pages        = {{162--173}},
  publisher    = {{IEEE}},
  title        = {{{Computing and Compressing Electron Repulsion Integrals on FPGAs}}},
  doi          = {{10.1109/FCCM57271.2023.00026}},
  year         = {{2023}},
}

