@inproceedings{59804,
  author       = {Jungemann, Linus and Wintermann, Bjarne and Riebler, Heinrich and Plessl, Christian},
  booktitle    = {Proceedings of the 15th International Symposium on Highly Efficient Accelerators and Reconfigurable Technologies},
  location     = {Kumamoto, Japan},
  publisher    = {ACM},
  title        = {{FINN-HPC}: Closing the Gap for Energy-Efficient Neural Network Inference on {FPGAs} in {HPC}},
  doi          = {10.1145/3728179.3728189},
  year         = {2025},
}

@inproceedings{59816,
  author       = {Pape, Gerrit and Wintermann, Bjarne and Jungemann, Linus and Lass, Michael and Meyer, Marius and Riebler, Heinrich and Plessl, Christian},
  booktitle    = {Proceedings of the 15th International Symposium on Highly Efficient Accelerators and Reconfigurable Technologies},
  location     = {Kumamoto, Japan},
  publisher    = {ACM},
  title        = {{AuroraFlow}, an Easy-to-Use, Low-Latency {FPGA} Communication Solution Demonstrated on Multi-{FPGA} Neural Network Inference},
  doi          = {10.1145/3728179.3728190},
  year         = {2025},
}

@techreport{62981,
  abstract     = {Otus is a high-performance computing cluster that was launched in 2025 and is operated by the Paderborn Center for Parallel Computing (PC2) at Paderborn University in Germany. The system is part of the National High Performance Computing (NHR) initiative. Otus complements the previous supercomputer Noctua 2, offering approximately twice the computing power while retaining the three node types that were characteristic of Noctua 2: 1) CPU compute nodes with different memory capacities, 2) high-end GPU nodes, and 3) HPC-grade FPGA nodes. On the Top500 list, which ranks the 500 most powerful supercomputers in the world, Otus is in position 164 with the CPU partition and in position 255 with the GPU partition (June 2025). On the Green500 list, ranking the 500 most energy-efficient supercomputers in the world, Otus is in position 5 with the GPU partition (June 2025).

This article provides a comprehensive overview of the system in terms of its hardware, software, system integration, and its overall integration into the data center building to ensure energy-efficient operation. The article aims to provide unique insights for scientists using the system and for other centers operating HPC clusters. The article will be continuously updated to reflect the latest system setup and measurements.},
  author       = {Ehtesabi, Sadaf and Hossain, Manoar and Kenter, Tobias and Krawinkel, Andreas and Ostermann, Lukas and Plessl, Christian and Riebler, Heinrich and Rohde, Stefan and Schade, Robert and Schwarz, Michael and Simon, Jens and Winnwa, Nils and Wiens, Alex and Wu, Xin},
  institution  = {Paderborn Center for Parallel Computing (PC2)},
  keywords     = {Otus, Supercomputer, FPGA, PC2, Paderborn Center for Parallel Computing, Noctua 2, HPC},
  number       = {1},
  pages        = {33},
  title        = {{Otus} Supercomputer},
  doi          = {10.48550/arXiv.2512.07401},
  year         = {2025},
}

@article{53663,
  abstract     = {Noctua 2 is a supercomputer operated at the Paderborn Center for Parallel Computing (PC2) at Paderborn University in Germany. Noctua 2 was inaugurated in 2022 and is an Atos BullSequana XH2000 system. It consists mainly of three node types: 1) CPU Compute nodes with AMD EPYC processors in different main memory configurations, 2) GPU nodes with NVIDIA A100 GPUs, and 3) FPGA nodes with Xilinx Alveo U280 and Intel Stratix 10 FPGA cards. While CPUs and GPUs are known off-the-shelf components in HPC systems, the operation of a large number of FPGA cards from different vendors and a dedicated FPGA-to-FPGA network are unique characteristics of Noctua 2. This paper describes in detail the overall setup of Noctua 2 and gives insights into the operation of the cluster from a hardware, software and facility perspective.},
  author       = {Bauer, Carsten and Kenter, Tobias and Lass, Michael and Mazur, Lukas and Meyer, Marius and Nitsche, Holger and Riebler, Heinrich and Schade, Robert and Schwarz, Michael and Winnwa, Nils and Wiens, Alex and Wu, Xin and Plessl, Christian and Simon, Jens},
  journal      = {Journal of large-scale research facilities},
  keywords     = {Noctua 2, Supercomputer, FPGA, PC2, Paderborn Center for Parallel Computing},
  title        = {{Noctua} 2 Supercomputer},
  doi          = {10.17815/jlsrf-8-187},
  volume       = {9},
  year         = {2024},
}

@article{56604,
  abstract     = {This manuscript makes the claim of having computed the 9th Dedekind number, D(9). This was done by accelerating the core operation of the process with an efficient FPGA design that outperforms an optimized 64-core CPU reference by 95x. The FPGA execution was parallelized on the Noctua 2 supercomputer at Paderborn University. The resulting value for D(9) is 286386577668298411128469151667598498812366. This value can be verified in two steps. We have made the data file containing the 490 M results available, each of which can be verified separately on CPU, and the whole file sums to our proposed value. The paper explains the mathematical approach in the first part, before putting the focus on a deep dive into the FPGA accelerator implementation followed by a performance analysis. The FPGA implementation was done in Register-Transfer Level using a dual-clock architecture and shows how we achieved an impressive FMax of 450 MHz on the targeted Stratix 10 GX 2,800 FPGAs. The total compute time used was 47,000 FPGA hours.},
  author       = {Van Hirtum, Lennart and De Causmaecker, Patrick and Goemaere, Jens and Kenter, Tobias and Riebler, Heinrich and Lass, Michael and Plessl, Christian},
  issn         = {1936-7406},
  journal      = {ACM Transactions on Reconfigurable Technology and Systems},
  number       = {3},
  pages        = {1--28},
  publisher    = {Association for Computing Machinery (ACM)},
  title        = {A Computation of the Ninth {Dedekind} Number Using {FPGA} Supercomputing},
  doi          = {10.1145/3674147},
  volume       = {17},
  year         = {2024},
}

@unpublished{43439,
  abstract     = {This preprint makes the claim of having computed the $9^{th}$ Dedekind Number. This was done by building an efficient FPGA Accelerator for the core operation of the process, and parallelizing it on the Noctua 2 Supercluster at Paderborn University. The resulting value is 286386577668298411128469151667598498812366. This value can be verified in two steps. We have made the data file containing the 490M results available, each of which can be verified separately on CPU, and the whole file sums to our proposed value.},
  author       = {Van Hirtum, Lennart and De Causmaecker, Patrick and Goemaere, Jens and Kenter, Tobias and Riebler, Heinrich and Lass, Michael and Plessl, Christian},
  note         = {arXiv:2304.03039},
  title        = {A computation of {D(9)} using {FPGA} Supercomputing},
  year         = {2023},
}

@incollection{45893,
  author       = {Hansmeier, Tim and Kenter, Tobias and Meyer, Marius and Riebler, Heinrich and Platzner, Marco and Plessl, Christian},
  booktitle    = {On-The-Fly Computing -- Individualized IT-services in dynamic markets},
  editor       = {Haake, Claus-Jochen and Meyer auf der Heide, Friedhelm and Platzner, Marco and Wachsmuth, Henning and Wehrheim, Heike},
  pages        = {165--182},
  publisher    = {Heinz Nixdorf Institut, Universit{\"a}t Paderborn},
  title        = {Compute Centers {I}: Heterogeneous Execution Environments},
  doi          = {10.5281/zenodo.8068642},
  volume       = {412},
  year         = {2023},
}

@article{7689,
  author       = {Riebler, Heinrich and Vaz, Gavin Francis and Kenter, Tobias and Plessl, Christian},
  journal      = {ACM Transactions on Architecture and Code Optimization (TACO)},
  keywords     = {htrop},
  number       = {2},
  pages        = {14:1--14:26},
  publisher    = {ACM},
  title        = {Transparent Acceleration for Heterogeneous Platforms with Compilation to {OpenCL}},
  doi          = {10.1145/3319423},
  volume       = {16},
  year         = {2019},
}

@phdthesis{34167,
  author       = {Riebler, Heinrich},
  school       = {Paderborn University},
  title        = {Efficient parallel branch-and-bound search on {FPGAs} using work stealing and instance-specific designs},
  doi          = {10.17619/UNIPB/1-830},
  year         = {2019},
}

@inproceedings{1204,
  author       = {Riebler, Heinrich and Vaz, Gavin Francis and Kenter, Tobias and Plessl, Christian},
  booktitle    = {Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP)},
  isbn         = {9781450349826},
  keywords     = {htrop},
  publisher    = {ACM},
  title        = {Automated Code Acceleration Targeting Heterogeneous {OpenCL} Devices},
  doi          = {10.1145/3178487.3178534},
  year         = {2018},
}

@article{18,
  abstract     = {Branch and bound (B&B) algorithms structure the search space as a tree and eliminate infeasible solutions early by pruning subtrees that cannot lead to a valid or optimal solution. Custom hardware designs significantly accelerate the execution of these algorithms. In this article, we demonstrate a high-performance B&B implementation on FPGAs. First, we identify general elements of B&B algorithms and describe their implementation as a finite state machine. Then, we introduce workers that autonomously cooperate using work stealing to allow parallel execution and full utilization of the target FPGA. Finally, we explore advantages of instance-specific designs that target a specific problem instance to improve performance.

We evaluate our concepts by applying them to a branch and bound problem, the reconstruction of corrupted AES keys obtained from cold-boot attacks. The evaluation shows that our work stealing approach is scalable with the available resources and provides speedups proportional to the number of workers. Instance-specific designs allow us to achieve an overall speedup of 47x compared to the fastest implementation of AES key reconstruction so far. Finally, we demonstrate how instance-specific designs can be generated just-in-time such that the provided speedups outweigh the additional time required for design synthesis.},
  author       = {Riebler, Heinrich and Lass, Michael and Mittendorf, Robert and L{\"o}cke, Thomas and Plessl, Christian},
  issn         = {1936-7406},
  journal      = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  keywords     = {coldboot},
  number       = {3},
  pages        = {24:1--24:23},
  publisher    = {Association for Computing Machinery (ACM)},
  title        = {Efficient Branch and Bound on {FPGAs} Using Work Stealing and Instance-Specific Designs},
  doi          = {10.1145/3053687},
  volume       = {10},
  year         = {2017},
}

@inproceedings{31,
  author       = {Riebler, Heinrich and Vaz, Gavin Francis and Plessl, Christian and Trainiti, Ettore M. G. and Durelli, Gianluca C. and Bolchini, Cristiana},
  booktitle    = {Proceedings of the HiPEAC Workshop on Reconfigurable Computing (WRC)},
  title        = {Using Just-in-Time Code Generation for Transparent Resource Management in Heterogeneous Systems},
  year         = {2016},
}

@inproceedings{138,
  abstract     = {Hardware accelerators are becoming popular in academia and industry. To move one step further from the state-of-the-art multicore plus accelerator approaches, we present in this paper our innovative SAVEHSA architecture. It comprises of a heterogeneous hardware platform with three different high-end accelerators attached over PCIe (GPGPU, FPGA and Intel MIC). Such systems can process parallel workloads very efficiently whilst being more energy efficient than regular CPU systems. To leverage the heterogeneity, the workload has to be distributed among the computing units in a way that each unit is well-suited for the assigned task and executable code must be available. To tackle this problem we present two software components; the first can perform resource allocation at runtime while respecting system and application goals (in terms of throughput, energy, latency, etc.) and the second is able to analyze an application and generate executable code for an accelerator at runtime. We demonstrate the first proof-of-concept implementation of our framework on the heterogeneous platform, discuss different runtime policies and measure the introduced overheads.},
  author       = {Riebler, Heinrich and Vaz, Gavin Francis and Plessl, Christian and Trainiti, Ettore M. G. and Durelli, Gianluca C. and Del Sozzo, Emanuele and Santambrogio, Marco D. and Bolchini, Cristiana},
  booktitle    = {Proceedings of the International Forum on Research and Technologies for Society and Industry (RTSI)},
  pages        = {1--5},
  publisher    = {IEEE},
  title        = {Using Just-in-Time Code Generation for Transparent Resource Management in Heterogeneous Systems},
  doi          = {10.1109/RTSI.2016.7740545},
  year         = {2016},
}

@article{165,
  abstract     = {A broad spectrum of applications can be accelerated by offloading computation intensive parts to reconfigurable hardware. However, to achieve speedups, the number of loop iterations (trip count) needs to be sufficiently large to amortize offloading overheads. Trip counts are frequently not known at compile time, but only at runtime just before entering a loop. Therefore, we propose to generate code for both the CPU and the coprocessor, and defer the offloading decision to the application runtime. We demonstrate how a toolflow, based on the LLVM compiler framework, can automatically embed dynamic offloading decisions into the application code. We perform in-depth static and dynamic analysis of popular benchmarks, which confirm the general potential of such an approach. We also propose to optimize the offloading process by decoupling the runtime decision from the loop execution (decision slack). The feasibility of our approach is demonstrated by a toolflow that automatically identifies suitable data-parallel loops and generates code for the FPGA coprocessor of a Convey HC-1. We evaluate the integrated toolflow with representative loops executed for different input data sizes.},
  author       = {Vaz, Gavin Francis and Riebler, Heinrich and Kenter, Tobias and Plessl, Christian},
  issn         = {0045-7906},
  journal      = {Computers and Electrical Engineering},
  pages        = {91--111},
  publisher    = {Elsevier},
  title        = {Potential and Methods for Embedding Dynamic Offloading Decisions into Application Code},
  doi          = {10.1016/j.compeleceng.2016.04.021},
  volume       = {55},
  year         = {2016},
}

@inproceedings{171,
  author       = {Kenter, Tobias and Vaz, Gavin Francis and Riebler, Heinrich and Plessl, Christian},
  booktitle    = {Workshop on Reconfigurable Computing (WRC)},
  title        = {Opportunities for deferring application partitioning and accelerator synthesis to runtime (extended abstract)},
  year         = {2016},
}

@inproceedings{238,
  abstract     = {In this paper, we study how binary applications can be transparently accelerated with novel heterogeneous computing resources without requiring any manual porting or developer-provided hints. Our work is based on Binary Acceleration At Runtime (BAAR), our previously introduced binary acceleration mechanism that uses the LLVM Compiler Infrastructure. BAAR is designed as a client-server architecture. The client runs the program to be accelerated in an environment, which allows program analysis and profiling and identifies and extracts suitable program parts to be offloaded. The server compiles and optimizes these offloaded program parts for the accelerator and offers access to these functions to the client with a remote procedure call (RPC) interface. Our previous work proved the feasibility of our approach, but also showed that communication time and overheads limit the granularity of functions that can be meaningfully offloaded. In this work, we motivate the importance of a lightweight, high-performance communication between server and client and present a communication mechanism based on the Message Passing Interface (MPI). We evaluate our approach by using an Intel Xeon Phi 5110P as the acceleration target and show that the communication overhead can be reduced from 40% to 10%, thus enabling even small hotspots to benefit from offloading to an accelerator.},
  author       = {Damschen, Marvin and Riebler, Heinrich and Vaz, Gavin Francis and Plessl, Christian},
  booktitle    = {Proceedings of the 2015 Conference on Design, Automation and Test in Europe (DATE)},
  pages        = {1078--1083},
  publisher    = {EDA Consortium / IEEE},
  title        = {Transparent offloading of computational hotspots from binary code to {Xeon Phi}},
  doi          = {10.7873/DATE.2015.1124},
  year         = {2015},
}

@inproceedings{377,
  abstract     = {In this paper, we study how AES key schedules can be reconstructed from decayed memory. This operation is a crucial and time consuming operation when trying to break encryption systems with cold-boot attacks. In software, the reconstruction of the AES master key can be performed using a recursive, branch-and-bound tree-search algorithm that exploits redundancies in the key schedule for constraining the search space. In this work, we investigate how this branch-and-bound algorithm can be accelerated with FPGAs. We translated the recursive search procedure to a state machine with an explicit stack for each recursion level and create optimized datapaths to accelerate in particular the processing of the most frequently accessed tree levels. We support two different decay models, of which especially the more realistic non-idealized asymmetric decay model causes very high runtimes in software. Our implementation on a Maxeler dataflow computing system outperforms a software implementation for this model by up to 27x, which makes cold-boot attacks against AES practical even for high error rates.},
  author       = {Riebler, Heinrich and Kenter, Tobias and Plessl, Christian and Sorge, Christoph},
  booktitle    = {Proceedings of Field-Programmable Custom Computing Machines (FCCM)},
  keywords     = {coldboot},
  pages        = {222--229},
  publisher    = {IEEE},
  title        = {Reconstructing {AES} Key Schedules from Decayed Memory with {FPGAs}},
  doi          = {10.1109/FCCM.2014.67},
  year         = {2014},
}

@inproceedings{1778,
  author       = {Durelli, Gianluca C. and Pogliani, Marcello and Miele, Antonio and Plessl, Christian and Riebler, Heinrich and Vaz, Gavin Francis and Santambrogio, Marco D. and Bolchini, Cristiana},
  booktitle    = {Proceedings of the International Symposium on Parallel and Distributed Processing with Applications (ISPA)},
  pages        = {142--149},
  publisher    = {IEEE},
  title        = {Runtime Resource Management in Heterogeneous System Architectures: The {SAVE} Approach},
  doi          = {10.1109/ISPA.2014.27},
  year         = {2014},
}

@inproceedings{439,
  abstract     = {Reconfigurable architectures provide an opportunity to accelerate a wide range of applications, frequently by exploiting data-parallelism, where the same operations are homogeneously executed on a (large) set of data. However, when the sequential code is executed on a host CPU and only data-parallel loops are executed on an FPGA coprocessor, a sufficiently large number of loop iterations (trip counts) is required, such that the control- and data-transfer overheads to the coprocessor can be amortized. However, the trip count of large data-parallel loops is frequently not known at compile time, but only at runtime just before entering a loop. Therefore, we propose to generate code both for the CPU and the coprocessor, and to defer the decision where to execute the appropriate code to the runtime of the application when the trip count of the loop can be determined just at runtime. We demonstrate how an LLVM compiler based toolflow can automatically insert appropriate decision blocks into the application code. Analyzing popular benchmark suites, we show that this kind of runtime decisions is often applicable. The practical feasibility of our approach is demonstrated by a toolflow that automatically identifies loops suitable for vectorization and generates code for the FPGA coprocessor of a Convey HC-1. The toolflow adds decisions based on a comparison of the runtime-computed trip counts to thresholds for specific loops and also includes support to move just the required data to the coprocessor. We evaluate the integrated toolflow with characteristic loops executed on different input data sizes.},
  author       = {Vaz, Gavin Francis and Riebler, Heinrich and Kenter, Tobias and Plessl, Christian},
  booktitle    = {Proceedings of the International Conference on ReConFigurable Computing and FPGAs (ReConFig)},
  pages        = {1--8},
  publisher    = {IEEE},
  title        = {Deferring Accelerator Offloading Decisions to Application Runtime},
  doi          = {10.1109/ReConFig.2014.7032509},
  year         = {2014},
}

@misc{521,
  author       = {Riebler, Heinrich},
  howpublished = {Universit{\"a}t Paderborn},
  keywords     = {coldboot},
  title        = {Identifikation und Wiederherstellung von kryptographischen Schl{\"u}sseln mit {FPGAs}},
  year         = {2013},
}

