% Bibliography database: ten journal-article records, keyed by numeric record id.
% Conventions used in this block:
%   - one field per line, single-brace delimiters, bare DOIs (no resolver prefix);
%   - author lists are separated by a top-level " and " so every name is parsed
%     individually (an extra brace pair around the list would fuse it into one name);
%   - titles protect only acronyms, proper nouns and formulae with braces, so the
%     bibliography style remains free to apply its own casing;
%   - accented letters are written as BibTeX special characters (brace group that
%     starts with the accent command) so sorting and labels work with classic BibTeX;
%   - LaTeX specials inside abstracts are escaped in case a style prints them.

@article{45361,
  author    = {Schade, Robert and Kenter, Tobias and Elgabarty, Hossam and Lass, Michael and K{\"u}hne, Thomas and Plessl, Christian},
  title     = {Breaking the exascale barrier for the electronic structure problem in ab-initio molecular dynamics},
  journal   = {The International Journal of High Performance Computing Applications},
  year      = {2023},
  publisher = {SAGE Publications},
  issn      = {1094-3420},
  doi       = {10.1177/10943420231177631},
  keywords  = {Hardware and Architecture, Theoretical Computer Science, Software},
  abstract  = {The non-orthogonal local submatrix method applied to electronic structure--based molecular dynamics simulations is shown to exceed 1.1 EFLOP/s in FP16/FP32-mixed floating-point arithmetic when using 4400 NVIDIA A100 GPUs of the Perlmutter system. This is enabled by a modification of the original method that pushes the sustained fraction of the peak performance to about 80\%. Example calculations are performed for SARS-CoV-2 spike proteins with up to 83 million atoms.},
}

@article{38041,
  author    = {Meyer, Marius and Kenter, Tobias and Plessl, Christian},
  title     = {Multi-{FPGA} Designs and Scaling of {HPC Challenge} Benchmarks via {MPI} and Circuit-Switched Inter-{FPGA} Networks},
  journal   = {ACM Transactions on Reconfigurable Technology and Systems},
  year      = {2023},
  publisher = {Association for Computing Machinery (ACM)},
  issn      = {1936-7406},
  doi       = {10.1145/3576200},
  keywords  = {General Computer Science},
  abstract  = {While FPGA accelerator boards and their respective high-level design tools are maturing, there is still a lack of multi-FPGA applications, libraries, and not least, benchmarks and reference implementations towards sustained HPC usage of these devices. As in the early days of GPUs in HPC, for workloads that can reasonably be decoupled into loosely coupled working sets, multi-accelerator support can be achieved by using standard communication interfaces like MPI on the host side. However, for performance and productivity, some applications can profit from a tighter coupling of the accelerators. FPGAs offer unique opportunities here when extending the dataflow characteristics to their communication interfaces. In this work, we extend the HPCC FPGA benchmark suite by multi-FPGA support and three missing benchmarks that particularly characterize or stress inter-device communication: b\_eff, PTRANS, and LINPACK. With all benchmarks implemented for current boards with Intel and Xilinx FPGAs, we established a baseline for multi-FPGA performance. Additionally, for the communication-centric benchmarks, we explored the potential of direct FPGA-to-FPGA communication with a circuit-switched inter-FPGA network that is currently only available for one of the boards. The evaluation with parallel execution on up to 26 FPGA boards makes use of one of the largest academic FPGA installations.},
}

% NOTE(review): the DOI of the next record is the arXiv-assigned one, so the
% eprint fields are derived from it. The journal field is kept as exported;
% volume/pages and the publisher DOI are not present in the record -- TODO confirm.
@article{46120,
  author     = {Mazur, Lukas and Bollweg, Dennis and Clarke, David A. and Altenkort, Luis and Kaczmarek, Olaf and Larsen, Rasmus and Shu, Hai-Tao and Goswami, Jishnu and Scior, Philipp and Sandmeyer, Hauke and Neumann, Marius and Dick, Henrik and Ali, Sajid and Kim, Jangho and Schmidt, Christian and Petreczky, Peter and Mukherjee, Swagato},
  title      = {{SIMULATeQCD}: A simple multi-{GPU} lattice code for {QCD} calculations},
  journal    = {Computer Physics Communications},
  year       = {2023},
  doi        = {10.48550/ARXIV.2306.01098},
  eprint     = {2306.01098},
  eprinttype = {arXiv},
  abstract   = {The rise of exascale supercomputers has fueled competition among GPU vendors, driving lattice QCD developers to write code that supports multiple APIs. Moreover, new developments in algorithms and physics research require frequent updates to existing software. These challenges have to be balanced against constantly changing personnel. At the same time, there is a wide range of applications for HISQ fermions in QCD studies. This situation encourages the development of software featuring a HISQ action that is flexible, high-performing, open source, easy to use, and easy to adapt. In this technical paper, we explain the design strategy, provide implementation details, list available algorithms and modules, and show key performance indicators for SIMULATeQCD, a simple multi-GPU lattice code for large-scale QCD calculations, mainly developed and used by the HotQCD collaboration. The code is publicly available on GitHub.},
}

@article{46119,
  author    = {Altenkort, Luis and Eller, Alexander M. and Francis, Anthony and Kaczmarek, Olaf and Mazur, Lukas and Moore, Guy D. and Shu, Hai-Tao},
  title     = {Viscosity of pure-glue {QCD} from the lattice},
  journal   = {Physical Review D},
  volume    = {108},
  number    = {1},
  year      = {2023},
  publisher = {American Physical Society (APS)},
  issn      = {2470-0010},
  doi       = {10.1103/physrevd.108.014503},
}

@article{32234,
  author  = {Wojciechowski, M.},
  title   = {Dataset for random uniform distributions of {2D} circles and {3D} spheres},
  journal = {Data in Brief},
  volume  = {43},
  pages   = {108318},
  year    = {2022},
  issn    = {2352-3409},
}

@article{27364,
  author  = {Meyer, Marius and Kenter, Tobias and Plessl, Christian},
  title   = {In-depth {FPGA} Accelerator Performance Evaluation with Single Node Benchmarks from the {HPC Challenge} Benchmark Suite for {Intel} and {Xilinx} {FPGAs} using {OpenCL}},
  journal = {Journal of Parallel and Distributed Computing},
  year    = {2022},
  issn    = {0743-7315},
  doi     = {10.1016/j.jpdc.2021.10.007},
}

@article{46121,
  author    = {Altenkort, Luis and Eller, Alexander M. and Kaczmarek, Olaf and Mazur, Lukas and Moore, Guy D. and Shu, Hai-Tao},
  title     = {Lattice {QCD} noise reduction for bosonic correlators through blocking},
  journal   = {Physical Review D},
  volume    = {105},
  number    = {9},
  year      = {2022},
  publisher = {American Physical Society (APS)},
  issn      = {2470-0010},
  doi       = {10.1103/physrevd.105.094505},
}

@article{32183,
  author  = {Hou, W. and Yao, Y. and Li, Y. and Peng, B. and Shi, K. and Zhou, Z. and Pan, J. and Liu, M. and Hu, J.},
  title   = {Linearly shifting ferromagnetic resonance response of {La0.7Sr0.3MnO3} thin film for body temperature sensors},
  journal = {Frontiers of Materials Science},
  volume  = {16},
  number  = {1},
  year    = {2022},
  issn    = {2095-025X},
}

@article{40523,
  author    = {Jonas, B. and Heinze, Dirk Florian and Sch{\"o}ll, E. and Kallert, P. and Langer, T. and Krehs, S. and Widhalm, A. and J{\"o}ns, Klaus and Reuter, Dirk and Schumacher, Stefan and Zrenner, Artur},
  title     = {Nonlinear down-conversion in a single quantum dot},
  journal   = {Nature Communications},
  volume    = {13},
  number    = {1},
  year      = {2022},
  publisher = {Springer Science and Business Media LLC},
  issn      = {2041-1723},
  doi       = {10.1038/s41467-022-28993-3},
  keywords  = {General Physics and Astronomy, General Biochemistry, Genetics and Molecular Biology, General Chemistry, Multidisciplinary},
  abstract  = {Tailored nanoscale quantum light sources, matching the specific needs of use cases, are crucial building blocks for photonic quantum technologies. Several different approaches to realize solid-state quantum emitters with high performance have been pursued and different concepts for energy tuning have been established. However, the properties of the emitted photons are always defined by the individual quantum emitter and can therefore not be controlled with full flexibility. Here we introduce an all-optical nonlinear method to tailor and control the single photon emission. We demonstrate a laser-controlled down-conversion process from an excited state of a semiconductor quantum three-level system. Based on this concept, we realize energy tuning and polarization control of the single photon emission with a control-laser field. Our results mark an important step towards tailored single photon emission from a photonic quantum system based on quantum optical principles.},
}

@article{33226,
  author    = {Schade, Robert and Bauer, Carsten and Tamoev, Konstantin and Mazur, Lukas and Plessl, Christian and K{\"u}hne, Thomas},
  title     = {Parallel quantum chemistry on noisy intermediate-scale quantum computers},
  journal   = {Physical Review Research},
  volume    = {4},
  pages     = {033160},
  year      = {2022},
  publisher = {American Physical Society (APS)},
  doi       = {10.1103/PhysRevResearch.4.033160},
  abstract  = {A parallel hybrid quantum-classical algorithm for the solution of the quantum-chemical ground-state energy problem on gate-based quantum computers is presented. This approach is based on the reduced density-matrix functional theory (RDMFT) formulation of the electronic structure problem. For that purpose, the density-matrix functional of the full system is decomposed into an indirectly coupled sum of density-matrix functionals for all its subsystems using the adaptive cluster approximation to RDMFT. The approximations involved in the decomposition and the adaptive cluster approximation itself can be systematically converged to the exact result. The solutions for the density-matrix functionals of the effective subsystems involves a constrained minimization over many-particle states that are approximated by parametrized trial states on the quantum computer similarly to the variational quantum eigensolver. The independence of the density-matrix functionals of the effective subsystems introduces a new level of parallelization and allows for the computational treatment of much larger molecules on a quantum computer with a given qubit count. In addition, for the proposed algorithm techniques are presented to reduce the qubit count, the number of quantum programs, as well as its depth. The evaluation of a density-matrix functional as the essential part of our approach is demonstrated for Hubbard-like systems on IBM quantum computers based on superconducting transmon qubits.},
}