@article{18, abstract = {{Branch and bound (B&B) algorithms structure the search space as a tree and eliminate infeasible solutions early by pruning subtrees that cannot lead to a valid or optimal solution. Custom hardware designs significantly accelerate the execution of these algorithms. In this article, we demonstrate a high-performance B&B implementation on FPGAs. First, we identify general elements of B&B algorithms and describe their implementation as a finite state machine. Then, we introduce workers that autonomously cooperate using work stealing to allow parallel execution and full utilization of the target FPGA. Finally, we explore advantages of instance-specific designs that target a specific problem instance to improve performance. We evaluate our concepts by applying them to a branch and bound problem, the reconstruction of corrupted AES keys obtained from cold-boot attacks. The evaluation shows that our work stealing approach is scalable with the available resources and provides speedups proportional to the number of workers. Instance-specific designs allow us to achieve an overall speedup of 47 × compared to the fastest implementation of AES key reconstruction so far. Finally, we demonstrate how instance-specific designs can be generated just-in-time such that the provided speedups outweigh the additional time required for design synthesis.}}, author = {{Riebler, Heinrich and Lass, Michael and Mittendorf, Robert and Löcke, Thomas and Plessl, Christian}}, issn = {{1936-7406}}, journal = {{ACM Transactions on Reconfigurable Technology and Systems (TRETS)}}, keywords = {{coldboot}}, number = {{3}}, pages = {{24:1--24:23}}, publisher = {{Association for Computing Machinery (ACM)}}, title = {{{Efficient Branch and Bound on FPGAs Using Work Stealing and Instance-Specific Designs}}}, doi = {{10.1145/3053687}}, volume = {{10}}, year = {{2017}}, } @article{1589, author = {{Schumacher, Jörn and Plessl, Christian and Vandelli, Wainer}}, journal = {{Journal of Physics: Conference Series}}, publisher = {{IOP Publishing}}, title = {{{High-Throughput and Low-Latency Network Communication with NetIO}}}, doi = {{10.1088/1742-6596/898/8/082003}}, volume = {{898}}, year = {{2017}}, } @article{165, abstract = {{A broad spectrum of applications can be accelerated by offloading computation intensive parts to reconfigurable hardware. However, to achieve speedups, the number of loop it- erations (trip count) needs to be sufficiently large to amortize offloading overheads. Trip counts are frequently not known at compile time, but only at runtime just before entering a loop. Therefore, we propose to generate code for both the CPU and the coprocessor, and defer the offloading decision to the application runtime. We demonstrate how a toolflow, based on the LLVM compiler framework, can automatically embed dynamic offloading de- cisions into the application code. We perform in-depth static and dynamic analysis of pop- ular benchmarks, which confirm the general potential of such an approach. We also pro- pose to optimize the offloading process by decoupling the runtime decision from the loop execution (decision slack). The feasibility of our approach is demonstrated by a toolflow that automatically identifies suitable data-parallel loops and generates code for the FPGA coprocessor of a Convey HC-1. We evaluate the integrated toolflow with representative loops executed for different input data sizes.}}, author = {{Vaz, Gavin Francis and Riebler, Heinrich and Kenter, Tobias and Plessl, Christian}}, issn = {{0045-7906}}, journal = {{Computers and Electrical Engineering}}, pages = {{91--111}}, publisher = {{Elsevier}}, title = {{{Potential and Methods for Embedding Dynamic Offloading Decisions into Application Code}}}, doi = {{10.1016/j.compeleceng.2016.04.021}}, volume = {{55}}, year = {{2016}}, } @article{1768, author = {{Plessl, Christian and Platzner, Marco and Schreier, Peter J.}}, journal = {{Informatik Spektrum}}, keywords = {{approximate computing, survey}}, number = {{5}}, pages = {{396--399}}, publisher = {{Springer}}, title = {{{Aktuelles Schlagwort: Approximate Computing}}}, doi = {{10.1007/s00287-015-0911-z}}, year = {{2015}}, } @article{296, abstract = {{FPGAs are known to permit huge gains in performance and efficiency for suitable applications but still require reduced design efforts and shorter development cycles for wider adoption. In this work, we compare the resulting performance of two design concepts that in different ways promise such increased productivity. As common starting point, we employ a kernel-centric design approach, where computational hotspots in an application are identified and individually accelerated on FPGA. By means of a complex stereo matching application, we evaluate two fundamentally different design philosophies and approaches for implementing the required kernels on FPGAs. In the first implementation approach, we designed individually specialized data flow kernels in a spatial programming language for a Maxeler FPGA platform; in the alternative design approach, we target a vector coprocessor with large vector lengths, which is implemented as a form of programmable overlay on the application FPGAs of a Convey HC-1. We assess both approaches in terms of overall system performance, raw kernel performance, and performance relative to invested resources. After compensating for the effects of the underlying hardware platforms, the specialized dataflow kernels on the Maxeler platform are around 3x faster than kernels executing on the Convey vector coprocessor. In our concrete scenario, due to trade-offs between reconfiguration overheads and exposed parallelism, the advantage of specialized dataflow kernels is reduced to around 2.5x.}}, author = {{Kenter, Tobias and Schmitz, Henning and Plessl, Christian}}, journal = {{International Journal of Reconfigurable Computing (IJRC)}}, publisher = {{Hindawi}}, title = {{{Exploring Tradeoffs between Specialized Kernels and a Reusable Overlay in a Stereo-Matching Case Study}}}, doi = {{10.1155/2015/859425}}, volume = {{2015}}, year = {{2015}}, } @article{1775, abstract = {{The ATLAS experiment at CERN is planning full deployment of a new unified optical link technology for connecting detector front end electronics on the timescale of the LHC Run 4 (2025). It is estimated that roughly 8000 GBT (GigaBit Transceiver) links, with transfer rates up to 10.24 Gbps, will replace existing links used for readout, detector control and distribution of timing and trigger information. A new class of devices will be needed to interface many GBT links to the rest of the trigger, data-acquisition and detector control systems. In this paper FELIX (Front End LInk eXchange) is presented, a PC-based device to route data from and to multiple GBT links via a high-performance general purpose network capable of a total throughput up to O(20 Tbps). FELIX implies architectural changes to the ATLAS data acquisition system, such as the use of industry standard COTS components early in the DAQ chain. Additionally the design and implementation of a FELIX demonstration platform is presented and hardware and software aspects will be discussed.}}, author = {{Anderson, J and Borga, A and Boterenbrood, H and Chen, H and Chen, K and Drake, G and Francis, D and Gorini, B and Lanni, F and Lehmann Miotto, G and Levinson, L and Narevicius, J and Plessl, Christian and Roich, A and Ryu, S and Schreuder, F and Schumacher, Jörn and Vandelli, Wainer and Vermeulen, J and Zhang, J}}, journal = {{Journal of Physics: Conference Series}}, publisher = {{IOP Publishing}}, title = {{{FELIX: a High-Throughput Network Approach for Interfacing to Front End Electronics for ATLAS Upgrades}}}, doi = {{10.1088/1742-6596/664/8/082050}}, volume = {{664}}, year = {{2015}}, } @article{1774, abstract = {{In this article an efficient numerical method to solve multiobjective optimization problems for fluid flow governed by the Navier Stokes equations is presented. In order to decrease the computational effort, a reduced order model is introduced using Proper Orthogonal Decomposition and a corresponding Galerkin Projection. A global, derivative free multiobjective optimization algorithm is applied to compute the Pareto set (i.e. the set of optimal compromises) for the concurrent objectives minimization of flow field fluctuations and control cost. The method is illustrated for a 2D flow around a cylinder at Re = 100.}}, author = {{Peitz, Sebastian and Dellnitz, Michael}}, issn = {{1617-7061}}, journal = {{PAMM}}, number = {{1}}, pages = {{613--614}}, publisher = {{WILEY-VCH Verlag}}, title = {{{Multiobjective Optimization of the Flow Around a Cylinder Using Model Order Reduction}}}, doi = {{10.1002/pamm.201510296}}, volume = {{15}}, year = {{2015}}, } @article{1772, author = {{Torresen, Jim and Plessl, Christian and Yao, Xin}}, journal = {{IEEE Computer}}, keywords = {{self-awareness, self-expression}}, number = {{7}}, pages = {{18--20}}, publisher = {{IEEE Computer Society}}, title = {{{Self-Aware and Self-Expressive Systems – Guest Editor's Introduction}}}, doi = {{10.1109/MC.2015.205}}, volume = {{48}}, year = {{2015}}, } @article{1769, abstract = {{Große zylindrische Stahlprüflinge werden mittels der Methode der finiten Differenzen im Zeitbereich (engl. finite differences in time domain, FDTD) simulativ untersucht. Dabei werden Pitch-Catch-Messanordnungen verwendet. Es werden zwei Bildgebungsansätze vorgestellt: ersterer basiert auf dem Imaging Principle nach Claerbout, letzterer basiert auf gradientenbasierter Optimierung eines Zielfunktionals.}}, author = {{Hegler, Sebastian and Statz, Christoph and Mütze, Marco and Mooshofer, Hubert and Goldammer, Matthias and Fendt, Karl and Schwarzer, Stefan and Feldhoff, Kim and Flehmig, Martin and Markwardt, Ulf and E. Nagel, Wolfgang and Schütte, Maria and Walther, Andrea and Meinel, Michael and Basermann, Achim and Plettemeier, Dirk}}, journal = {{tm - Technisches Messen}}, number = {{9}}, pages = {{440--450}}, publisher = {{Walter de Gruyter}}, title = {{{Simulative Ultraschall-Untersuchung von Pitch-Catch-Messanordnungen für große zylindrische Stahl-Prüflinge und gradientenbasierte Bildgebung}}}, doi = {{doi:10.1515/teme-2015-0031}}, volume = {{82}}, year = {{2015}}, } @article{1779, author = {{Giefers, Heiner and Plessl, Christian and Förstner, Jens}}, issn = {{0163-5964}}, journal = {{ACM SIGARCH Computer Architecture News}}, keywords = {{funding-maxup, tet_topic_hpc}}, number = {{5}}, pages = {{65--70}}, publisher = {{ACM}}, title = {{{Accelerating Finite Difference Time Domain Simulations with Reconfigurable Dataflow Computers}}}, doi = {{10.1145/2641361.2641372}}, volume = {{41}}, year = {{2014}}, }