@article{60298,
  abstract     = {In this work, we introduce PHOENIX, a highly optimized explicit open-source solver for two-dimensional nonlinear Schrödinger equations with extensions. The nonlinear Schrödinger equation and its extensions (Gross-Pitaevskii equation) are widely studied to model and analyze complex phenomena in fields such as optics, condensed matter physics, fluid dynamics, and plasma physics. It serves as a powerful tool for understanding nonlinear wave dynamics, soliton formation, and the interplay between nonlinearity, dispersion, and diffraction. By extending the nonlinear Schrödinger equation, various physical effects such as non-Hermiticity, spin-orbit interaction, and quantum optical aspects can be incorporated. PHOENIX is designed to accommodate a wide range of applications by a straightforward extendability without the need for user knowledge of computing architectures or performance optimization. The high performance and power efficiency of PHOENIX are demonstrated on a wide range of entry-class to high-end consumer and high-performance computing GPUs and CPUs. Compared to a more conventional MATLAB implementation, a speedup of up to three orders of magnitude and energy savings of up to 99.8\% are achieved. The performance is compared to a performance model showing that PHOENIX performs close to the relevant performance bounds in many situations. The possibilities of PHOENIX are demonstrated with a range of practical examples from the realm of nonlinear (quantum) photonics in planar microresonators with active media including exciton-polariton condensates. Examples range from solutions on very large grids, the use of local optimization algorithms, to Monte Carlo ensemble evolutions with quantum noise enabling the tomography of the system's quantum state.},
  author       = {Wingenbach, Jan and Bauch, David and Ma, Xuekai and Schade, Robert and Plessl, Christian and Schumacher, Stefan},
  issn         = {0010-4655},
  journal      = {Computer Physics Communications},
  publisher    = {Elsevier BV},
  title        = {{PHOENIX} – {Paderborn} highly optimized and energy efficient solver for two-dimensional nonlinear {Schrödinger} equations with integrated extensions},
  doi          = {10.1016/j.cpc.2025.109689},
  volume       = {315},
  year         = {2025},
}

@unpublished{60975,
  abstract     = {CP2K is a versatile open-source software package for simulations across a
wide range of atomistic systems, from isolated molecules in the gas phase to
low-dimensional functional materials and interfaces, as well as highly
symmetric crystalline solids, disordered amorphous glasses, and weakly
interacting soft-matter systems in the liquid state and in solution. This
review highlights CP2K's capabilities for computing both static and dynamical
properties using quantum-mechanical and classical simulation methods. In
contrast to the accompanying theory and code paper [J. Chem. Phys. 152, 194103
(2020)], the focus here is on the practical usage and applications of CP2K,
with underlying theoretical concepts introduced only as needed.},
  author       = {Iannuzzi, Marcella and Wilhelm, Jan and Stein, Frederick and Bussy, Augustin and Elgabarty, Hossam and Golze, Dorothea and Hehn, Anna and Graml, Maximilian and Marek, Stepan and Gökmen, Beliz Sertcan and Schran, Christoph and Forbert, Harald and Khaliullin, Rustam Z. and Kozhevnikov, Anton and Taillefumier, Mathieu and Meli, Rocco and Rybkin, Vladimir and Brehm, Martin and Schade, Robert and Schütt, Ole and Pototschnig, Johann V. and Mirhosseini, Hossein and Knüpfer, Andreas and Marx, Dominik and Krack, Matthias and Hutter, Jürg and Kühne, Thomas D.},
  eprint       = {2508.15559},
  eprinttype   = {arXiv},
  note         = {Preprint},
  title        = {The {CP2K} Program Package Made Simple},
  year         = {2025},
}

@article{62034,
  abstract     = {Effective single-particle theories, such as Hartree–Fock, density functional theory, and tight-binding, are limited by the computational cost of the self-consistent field (SCF) procedure, which typically scales cubically with the system size. This makes large-scale applications impractical without specialized algorithms and hardware. Here, we present the submatrix and graphical processing unit (GPU)-accelerated software implementation of the PTB tight-binding potential, realized in the open-source ptb codebase [M. Mueller, A. Katbashev, and S. Ehlert (2025). “grimme-lab/ptb: v3.8.1,” Zenodo. https://zenodo.org/records/17015872]. We first benchmark a traditional diagonalization-based SCF solver against density-matrix-based purification approaches, systematically varying both system size and computer hardware. Our findings show that the usage of GPUs permits shifting the boundaries to much larger systems than previously thought feasible, achieving an overall 10–15-fold performance speedup. Second, we introduce the implementation of a decomposition-type submatrix method, specifically designed for efficient operation on mid- to large-sized systems, to address the computational overhead associated with full-system diagonalization. We demonstrate that, from a certain dimension ($\approx 10^4$ basis functions) on, our submatrix method reduces the overall computational cost while maintaining acceptable numerical accuracy. Our study demonstrates the significance of the interplay between modern hardware, algorithmic considerations, and novel tight-binding methods, paving the way for further development in this direction.},
  author       = {Katbashev, Abylay and Schade, Robert and Laß, Michael and Müller, Marcel and Grimme, Stefan and Hansen, Andreas and Kühne, Thomas},
  issn         = {0021-9606},
  journal      = {The Journal of Chemical Physics},
  number       = {13},
  publisher    = {AIP Publishing},
  title        = {Submatrix and {GPU}-accelerated implementation of density matrix tight-binding},
  doi          = {10.1063/5.0271379},
  volume       = {163},
  year         = {2025},
}

@techreport{62981,
  abstract     = {Otus is a high-performance computing cluster that was launched in 2025 and is operated by the Paderborn Center for Parallel Computing (PC2) at Paderborn University in Germany. The system is part of the National High Performance Computing (NHR) initiative. Otus complements the previous supercomputer Noctua 2, offering approximately twice the computing power while retaining the three node types that were characteristic of Noctua 2: 1) CPU compute nodes with different memory capacities, 2) high-end GPU nodes, and 3) HPC-grade FPGA nodes. On the Top500 list, which ranks the 500 most powerful supercomputers in the world, Otus is in position 164 with the CPU partition and in position 255 with the GPU partition (June 2025). On the Green500 list, ranking the 500 most energy-efficient supercomputers in the world, Otus is in position 5 with the GPU partition (June 2025).

This article provides a comprehensive overview of the system in terms of its hardware, software, system integration, and its overall integration into the data center building to ensure energy-efficient operation. The article aims to provide unique insights for scientists using the system and for other centers operating HPC clusters. The article will be continuously updated to reflect the latest system setup and measurements.},
  author       = {Ehtesabi, Sadaf and Hossain, Manoar and Kenter, Tobias and Krawinkel, Andreas and Ostermann, Lukas and Plessl, Christian and Riebler, Heinrich and Rohde, Stefan and Schade, Robert and Schwarz, Michael and Simon, Jens and Winnwa, Nils and Wiens, Alex and Wu, Xin},
  institution  = {Paderborn Center for Parallel Computing (PC2)},
  keywords     = {Otus, Supercomputer, FPGA, PC2, Paderborn Center for Parallel Computing, Noctua 2, HPC},
  pages        = {33},
  title        = {{Otus} Supercomputer},
  doi          = {10.48550/ARXIV.2512.07401},
  number       = {1},
  year         = {2025},
}

@article{53663,
  abstract     = {Noctua 2 is a supercomputer operated at the Paderborn Center for Parallel Computing (PC2) at Paderborn University in Germany. Noctua 2 was inaugurated in 2022 and is an Atos BullSequana XH2000 system. It consists mainly of three node types: 1) CPU Compute nodes with AMD EPYC processors in different main memory configurations, 2) GPU nodes with NVIDIA A100 GPUs, and 3) FPGA nodes with Xilinx Alveo U280 and Intel Stratix 10 FPGA cards. While CPUs and GPUs are known off-the-shelf components in HPC systems, the operation of a large number of FPGA cards from different vendors and a dedicated FPGA-to-FPGA network are unique characteristics of Noctua 2. This paper describes in detail the overall setup of Noctua 2 and gives insights into the operation of the cluster from a hardware, software and facility perspective.},
  author       = {Bauer, Carsten and Kenter, Tobias and Lass, Michael and Mazur, Lukas and Meyer, Marius and Nitsche, Holger and Riebler, Heinrich and Schade, Robert and Schwarz, Michael and Winnwa, Nils and Wiens, Alex and Wu, Xin and Plessl, Christian and Simon, Jens},
  journal      = {Journal of large-scale research facilities},
  keywords     = {Noctua 2, Supercomputer, FPGA, PC2, Paderborn Center for Parallel Computing},
  title        = {{Noctua 2} Supercomputer},
  doi          = {10.17815/jlsrf-8-187},
  volume       = {9},
  internal-note = {NOTE(review): DOI suffix jlsrf-8-187 suggests volume 8, but volume is recorded as 9 — verify against the journal},
  year         = {2024},
}

@article{53202,
  abstract     = {At large scales, quantum systems may become advantageous over their classical counterparts at performing certain tasks. Developing tools to analyze these systems at the relevant scales, in a manner consistent with quantum mechanics, is therefore critical to benchmarking performance and characterizing their operation. While classical computational approaches cannot perform like-for-like computations of quantum systems beyond a certain scale, classical high-performance computing (HPC) may nevertheless be useful for precisely these characterization and certification tasks. By developing open-source customized algorithms using high-performance computing, we perform quantum tomography on a megascale quantum photonic detector covering a Hilbert space of $10^6$. This requires finding $10^8$ elements of the matrix corresponding to the positive operator valued measure (POVM), the quantum description of the detector, and is achieved in minutes of computation time. Moreover, by exploiting the structure of the problem, we achieve highly efficient parallel scaling, paving the way for quantum objects up to a system size of $10^{12}$ elements to be reconstructed using this method. In general, this shows that a consistent quantum mechanical description of quantum phenomena is applicable at everyday scales. More concretely, this enables the reconstruction of large-scale quantum sources, processes and detectors used in computation and sampling tasks, which may be necessary to prove their nonclassical character or quantum computational advantage.},
  author       = {Schapeler, Timon and Schade, Robert and Lass, Michael and Plessl, Christian and Bartley, Tim},
  journal      = {Quantum Science and Technology},
  number       = {1},
  publisher    = {IOP Publishing},
  title        = {Scalable quantum detector tomography by high-performance computing},
  doi          = {10.1088/2058-9565/ad8511},
  volume       = {10},
  year         = {2024},
}

@inproceedings{43228,
  abstract     = {The computation of electron repulsion integrals (ERIs) over Gaussian-type orbitals (GTOs) is a challenging problem in quantum-mechanics-based atomistic simulations. In practical simulations, several trillions of ERIs may have to be computed for every time step.
In this work, we investigate FPGAs as accelerators for the ERI computation. We use template parameters, here within the Intel oneAPI tool flow, to create customized designs for 256 different ERI quartet classes, based on their orbitals. To maximize data reuse, all intermediates are buffered in FPGA on-chip memory with customized layout. The pre-calculation of intermediates also helps to overcome data dependencies caused by multi-dimensional recurrence relations. The involved loop structures are partially or even fully unrolled for high throughput of FPGA kernels. Furthermore, a lossy compression algorithm utilizing arbitrary bitwidth integers is integrated in the FPGA kernels. To our best knowledge, this is the first work on ERI computation on FPGAs that supports more than just the single most basic quartet class. Also, the integration of ERI computation and compression is a novelty that is not even covered by CPU or GPU libraries so far.
Our evaluation shows that using 16-bit integer for the ERI compression, the fastest FPGA kernels exceed the performance of 10 GERIS ($10 \times 10^9$ ERIs per second) on one Intel Stratix 10 GX 2800 FPGA, with maximum absolute errors around $10^{-7}$--$10^{-5}$ Hartree. The measured throughput can be accurately explained by a performance model. The FPGA kernels deployed on 2 FPGAs outperform similar computations using the widely used libint reference on a two-socket server with 40 Xeon Gold 6148 CPU cores of the same process technology by factors up to 6.0x and on a new two-socket server with 128 EPYC 7713 CPU cores by up to 1.9x.},
  author       = {Wu, Xin and Kenter, Tobias and Schade, Robert and Kühne, Thomas and Plessl, Christian},
  booktitle    = {2023 IEEE 31st Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)},
  pages        = {162--173},
  title        = {Computing and Compressing Electron Repulsion Integrals on {FPGAs}},
  doi          = {10.1109/FCCM57271.2023.00026},
  year         = {2023},
}

@article{45361,
  abstract     = {The non-orthogonal local submatrix method applied to electronic structure–based molecular dynamics simulations is shown to exceed 1.1 EFLOP/s in FP16/FP32-mixed floating-point arithmetic when using 4400 NVIDIA A100 GPUs of the Perlmutter system. This is enabled by a modification of the original method that pushes the sustained fraction of the peak performance to about 80\%. Example calculations are performed for SARS-CoV-2 spike proteins with up to 83 million atoms.},
  author       = {Schade, Robert and Kenter, Tobias and Elgabarty, Hossam and Lass, Michael and Kühne, Thomas and Plessl, Christian},
  issn         = {1094-3420},
  journal      = {The International Journal of High Performance Computing Applications},
  keywords     = {Hardware and Architecture, Theoretical Computer Science, Software},
  publisher    = {SAGE Publications},
  title        = {Breaking the exascale barrier for the electronic structure problem in ab-initio molecular dynamics},
  doi          = {10.1177/10943420231177631},
  year         = {2023},
}

@unpublished{33493,
  abstract     = {Electronic structure calculations have been instrumental in providing many
important insights into a range of physical and chemical properties of various
molecular and solid-state systems. Their importance to various fields,
including materials science, chemical sciences, computational chemistry and
device physics, is underscored by the large fraction of available public
supercomputing resources devoted to these calculations. As we enter the
exascale era, exciting new opportunities to increase simulation numbers, sizes,
and accuracies present themselves. In order to realize these promises, the
community of electronic structure software developers will however first have
to tackle a number of challenges pertaining to the efficient use of new
architectures that will rely heavily on massive parallelism and hardware
accelerators. This roadmap provides a broad overview of the state-of-the-art in
electronic structure calculations and of the various new directions being
pursued by the community. It covers 14 electronic structure codes, presenting
their current status, their development priorities over the next five years,
and their plans towards tackling the challenges and leveraging the
opportunities presented by the advent of exascale computing.},
  author       = {Gavini, Vikram and Baroni, Stefano and Blum, Volker and Bowler, David R. and Buccheri, Alexander and Chelikowsky, James R. and Das, Sambit and Dawson, William and Delugas, Pietro and Dogan, Mehmet and Draxl, Claudia and Galli, Giulia and Genovese, Luigi and Giannozzi, Paolo and Giantomassi, Matteo and Gonze, Xavier and Govoni, Marco and Gulans, Andris and Gygi, François and Herbert, John M. and Kokott, Sebastian and Kühne, Thomas and Liou, Kai-Hsin and Miyazaki, Tsuyoshi and Motamarri, Phani and Nakata, Ayako and Pask, John E. and Plessl, Christian and Ratcliff, Laura E. and Richard, Ryan M. and Rossi, Mariana and Schade, Robert and Scheffler, Matthias and Schütt, Ole and Suryanarayana, Phanish and Torrent, Marc and Truflandier, Lionel and Windus, Theresa L. and Xu, Qimen and Yu, Victor W. -Z. and Perez, Danny},
  eprint       = {2209.12747},
  eprinttype   = {arXiv},
  note         = {Preprint},
  title        = {Roadmap on Electronic Structure Codes in the Exascale Era},
  year         = {2022},
}

@unpublished{32404,
  abstract     = {The CP2K program package, which can be considered as the Swiss army knife of
atomistic simulations, is presented with a special emphasis on ab-initio
molecular dynamics using the second-generation Car-Parrinello method. After
outlining current and near-term development efforts with regards to massively
parallel low-scaling post-Hartree-Fock and eigenvalue solvers, novel approaches
on how we plan to take full advantage of future low-precision hardware
architectures are introduced. Our focus here is on combining our submatrix
method with the approximate computing paradigm to address the immanent exascale
era.},
  author       = {Kühne, Thomas and Plessl, Christian and Schade, Robert and Schütt, Ole},
  eprint       = {2205.14741},
  eprinttype   = {arXiv},
  note         = {Preprint},
  title        = {{CP2K} on the road to exascale},
  year         = {2022},
}

@article{33226,
  abstract     = {A parallel hybrid quantum-classical algorithm for the solution of the quantum-chemical ground-state energy problem on gate-based quantum computers is presented. This approach is based on the reduced density-matrix functional theory (RDMFT) formulation of the electronic structure problem. For that purpose, the density-matrix functional of the full system is decomposed into an indirectly coupled sum of density-matrix functionals for all its subsystems using the adaptive cluster approximation to RDMFT. The approximations involved in the decomposition and the adaptive cluster approximation itself can be systematically converged to the exact result. The solutions for the density-matrix functionals of the effective subsystems involves a constrained minimization over many-particle states that are approximated by parametrized trial states on the quantum computer similarly to the variational quantum eigensolver. The independence of the density-matrix functionals of the effective subsystems introduces a new level of parallelization and allows for the computational treatment of much larger molecules on a quantum computer with a given qubit count. In addition, for the proposed algorithm techniques are presented to reduce the qubit count, the number of quantum programs, as well as its depth. The evaluation of a density-matrix functional as the essential part of our approach is demonstrated for Hubbard-like systems on IBM quantum computers based on superconducting transmon qubits.},
  author       = {Schade, Robert and Bauer, Carsten and Tamoev, Konstantin and Mazur, Lukas and Plessl, Christian and Kühne, Thomas},
  journal      = {Physical Review Research},
  pages        = {033160},
  publisher    = {American Physical Society},
  title        = {Parallel quantum chemistry on noisy intermediate-scale quantum computers},
  doi          = {10.1103/PhysRevResearch.4.033160},
  volume       = {4},
  year         = {2022},
}

@unpublished{46275,
  abstract     = {Electronic structure calculations have been instrumental in providing many
important insights into a range of physical and chemical properties of various
molecular and solid-state systems. Their importance to various fields,
including materials science, chemical sciences, computational chemistry and
device physics, is underscored by the large fraction of available public
supercomputing resources devoted to these calculations. As we enter the
exascale era, exciting new opportunities to increase simulation numbers, sizes,
and accuracies present themselves. In order to realize these promises, the
community of electronic structure software developers will however first have
to tackle a number of challenges pertaining to the efficient use of new
architectures that will rely heavily on massive parallelism and hardware
accelerators. This roadmap provides a broad overview of the state-of-the-art in
electronic structure calculations and of the various new directions being
pursued by the community. It covers 14 electronic structure codes, presenting
their current status, their development priorities over the next five years,
and their plans towards tackling the challenges and leveraging the
opportunities presented by the advent of exascale computing.},
  author       = {Gavini, Vikram and Baroni, Stefano and Blum, Volker and Bowler, David R. and Buccheri, Alexander and Chelikowsky, James R. and Das, Sambit and Dawson, William and Delugas, Pietro and Dogan, Mehmet and Draxl, Claudia and Galli, Giulia and Genovese, Luigi and Giannozzi, Paolo and Giantomassi, Matteo and Gonze, Xavier and Govoni, Marco and Gulans, Andris and Gygi, François and Herbert, John M. and Kokott, Sebastian and Kühne, Thomas and Liou, Kai-Hsin and Miyazaki, Tsuyoshi and Motamarri, Phani and Nakata, Ayako and Pask, John E. and Plessl, Christian and Ratcliff, Laura E. and Richard, Ryan M. and Rossi, Mariana and Schade, Robert and Scheffler, Matthias and Schütt, Ole and Suryanarayana, Phanish and Torrent, Marc and Truflandier, Lionel and Windus, Theresa L. and Xu, Qimen and Yu, Victor W. -Z. and Perez, Danny},
  eprint       = {2209.12747},
  eprinttype   = {arXiv},
  note         = {Preprint},
  internal-note = {NOTE(review): duplicate of entry 33493 (same work, different key) — keep one and alias or remove the other},
  title        = {Roadmap on Electronic Structure Codes in the Exascale Era},
  year         = {2022},
}

@article{33684,
  author       = {Schade, Robert and Kenter, Tobias and Elgabarty, Hossam and Lass, Michael and Schütt, Ole and Lazzaro, Alfio and Pabst, Hans and Mohr, Stephan and Hutter, Jürg and Kühne, Thomas and Plessl, Christian},
  issn         = {0167-8191},
  journal      = {Parallel Computing},
  keywords     = {Artificial Intelligence, Computer Graphics and Computer-Aided Design, Computer Networks and Communications, Hardware and Architecture, Theoretical Computer Science, Software},
  publisher    = {Elsevier BV},
  title        = {Towards electronic structure-based ab-initio molecular dynamics simulations with hundreds of millions of atoms},
  doi          = {10.1016/j.parco.2022.102920},
  volume       = {111},
  year         = {2022},
}

@article{16277,
  abstract     = {CP2K is an open source electronic structure and molecular dynamics software package to perform atomistic simulations of solid-state, liquid, molecular, and biological systems. It is especially aimed at massively parallel and linear-scaling electronic structure methods and state-of-the-art ab initio molecular dynamics simulations. Excellent performance for electronic structure calculations is achieved using novel algorithms implemented for modern high-performance computing systems. This review revisits the main capabilities of CP2K to perform efficient and accurate electronic structure simulations. The emphasis is put on density functional theory and multiple post-Hartree–Fock methods using the Gaussian and plane wave approach and its augmented all-electron extension.},
  author       = {Kühne, Thomas and Iannuzzi, Marcella and Del Ben, Mauro and Rybkin, Vladimir V. and Seewald, Patrick and Stein, Frederick and Laino, Teodoro and Khaliullin, Rustam Z. and Schütt, Ole and Schiffmann, Florian and Golze, Dorothea and Wilhelm, Jan and Chulkov, Sergey and Bani-Hashemian, Mohammad Hossein and Weber, Valéry and Borstnik, Urban and Taillefumier, Mathieu and Jakobovits, Alice Shoshana and Lazzaro, Alfio and Pabst, Hans and Müller, Tiziano and Schade, Robert and Guidon, Manuel and Andermatt, Samuel and Holmberg, Nico and Schenter, Gregory K. and Hehn, Anna and Bussy, Augustin and Belleflamme, Fabian and Tabacchi, Gloria and Glöß, Andreas and Lass, Michael and Bethune, Iain and Mundy, Christopher J. and Plessl, Christian and Watkins, Matt and VandeVondele, Joost and Krack, Matthias and Hutter, Jürg},
  journal      = {The Journal of Chemical Physics},
  number       = {19},
  title        = {{CP2K}: An electronic structure and molecular dynamics software package - {Quickstep}: Efficient and accurate electronic structure calculations},
  doi          = {10.1063/5.0007045},
  volume       = {152},
  year         = {2020},
}

@inproceedings{16898,
  abstract     = {Electronic structure calculations based on density-functional theory (DFT)
represent a significant part of today's HPC workloads and pose high demands on
high-performance computing resources. To perform these quantum-mechanical DFT
calculations on complex large-scale systems, so-called linear scaling methods
instead of conventional cubic scaling methods are required. In this work, we
take up the idea of the submatrix method and apply it to the DFT computations
in the software package CP2K. For that purpose, we transform the underlying
numeric operations on distributed, large, sparse matrices into computations on
local, much smaller and nearly dense matrices. This allows us to exploit the
full floating-point performance of modern CPUs and to make use of dedicated
accelerator hardware, where performance has been limited by memory bandwidth
before. We demonstrate both functionality and performance of our implementation
and show how it can be accelerated with GPUs and FPGAs.},
  author       = {Lass, Michael and Schade, Robert and Kühne, Thomas and Plessl, Christian},
  booktitle    = {Proc. International Conference for High Performance Computing, Networking, Storage and Analysis (SC)},
  location     = {Atlanta, GA, US},
  pages        = {1127--1140},
  publisher    = {IEEE Computer Society},
  title        = {A Submatrix-Based Method for Approximate Matrix Function Evaluation in the Quantum Chemistry Code {CP2K}},
  doi          = {10.1109/SC41405.2020.00084},
  year         = {2020},
}

