@inproceedings{168,
  abstract     = {{The use of heterogeneous computing resources, such as Graphic Processing Units or other specialized coprocessors, has become widespread in recent years because of their performance and energy efficiency advantages. Approaches for managing and scheduling tasks to heterogeneous resources are still subject to research. Although queuing systems have recently been extended to support accelerator resources, a general solution that manages heterogeneous resources at the operating system level to exploit a global view of the system state is still missing. In this paper we present a user space scheduler that enables task scheduling and migration on heterogeneous processing resources in Linux. Using run queues for available resources we perform scheduling decisions based on the system state and on task characterization from earlier measurements. With a programming pattern that supports the integration of checkpoints into applications, we preempt tasks and migrate them between three very different compute resources. Considering static and dynamic workload scenarios, we show that this approach can gain up to 17\% performance, on average 7\%, by effectively avoiding idle resources. We demonstrate that a work-conserving strategy without migration is no suitable alternative.}},
  author       = {{Lösch, Achim and Beisel, Tobias and Kenter, Tobias and Plessl, Christian and Platzner, Marco}},
  booktitle    = {{Proceedings of the 2016 Design, Automation \& Test in Europe Conference \& Exhibition (DATE)}},
  pages        = {{912--917}},
  publisher    = {{EDA Consortium / IEEE}},
  title        = {{{Performance-centric scheduling with task migration for a heterogeneous compute node in the data center}}},
  year         = {{2016}},
}

@inproceedings{171,
  author       = {{Kenter, Tobias and Vaz, Gavin Francis and Riebler, Heinrich and Plessl, Christian}},
  booktitle    = {{Workshop on Reconfigurable Computing (WRC)}},
  title        = {{{Opportunities for deferring application partitioning and accelerator synthesis to runtime (extended abstract)}}},
  year         = {{2016}},
}

@misc{3364,
  author       = {{Knorr, Christoph}},
  publisher    = {{Universität Paderborn}},
  title        = {{{Evaluation von Bildverarbeitungsalgorithmen in heterogenen Rechenknoten}}},
  year         = {{2015}},
}

@article{1772,
  author       = {{Torresen, Jim and Plessl, Christian and Yao, Xin}},
  journal      = {{IEEE Computer}},
  keywords     = {{self-awareness, self-expression}},
  number       = {{7}},
  pages        = {{18--20}},
  publisher    = {{IEEE Computer Society}},
  title        = {{{Self-Aware and Self-Expressive Systems – Guest Editors' Introduction}}},
  doi          = {{10.1109/MC.2015.205}},
  volume       = {{48}},
  year         = {{2015}},
}

@misc{5413,
  author       = {{Funke, Lukas}},
  publisher    = {{Universität Paderborn}},
  title        = {{{An LLVM Based Toolchain for Transparent Acceleration of Digital Image Processing Applications using FPGA Overlay Architectures}}},
  year         = {{2015}},
}

@misc{5416,
  author       = {{Löcke, Thomas}},
  publisher    = {{Universität Paderborn}},
  title        = {{{Instance-Specific Computing in Hard- and Software for Faster Solving of Complex Problems}}},
  year         = {{2015}},
}

@misc{5419,
  author       = {{Wallaschek, Felix}},
  publisher    = {{Universität Paderborn}},
  title        = {{{Accelerating Programmable Logic Controllers with the use of FPGAs}}},
  year         = {{2015}},
}

@article{296,
  abstract     = {{FPGAs are known to permit huge gains in performance and efficiency for suitable applications but still require reduced design efforts and shorter development cycles for wider adoption. In this work, we compare the resulting performance of two design concepts that in different ways promise such increased productivity. As common starting point, we employ a kernel-centric design approach, where computational hotspots in an application are identified and individually accelerated on FPGA. By means of a complex stereo matching application, we evaluate two fundamentally different design philosophies and approaches for implementing the required kernels on FPGAs. In the first implementation approach, we designed individually specialized data flow kernels in a spatial programming language for a Maxeler FPGA platform; in the alternative design approach, we target a vector coprocessor with large vector lengths, which is implemented as a form of programmable overlay on the application FPGAs of a Convey HC-1. We assess both approaches in terms of overall system performance, raw kernel performance, and performance relative to invested resources. After compensating for the effects of the underlying hardware platforms, the specialized dataflow kernels on the Maxeler platform are around 3x faster than kernels executing on the Convey vector coprocessor. In our concrete scenario, due to trade-offs between reconfiguration overheads and exposed parallelism, the advantage of specialized dataflow kernels is reduced to around 2.5x.}},
  author       = {{Kenter, Tobias and Schmitz, Henning and Plessl, Christian}},
  journal      = {{International Journal of Reconfigurable Computing (IJRC)}},
  publisher    = {{Hindawi}},
  title        = {{{Exploring Tradeoffs between Specialized Kernels and a Reusable Overlay in a Stereo-Matching Case Study}}},
  doi          = {{10.1155/2015/859425}},
  volume       = {{2015}},
  year         = {{2015}},
}

@inproceedings{303,
  abstract     = {{This paper introduces Binary Acceleration At Runtime (BAAR), an easy-to-use on-the-fly binary acceleration mechanism which aims to tackle the problem of enabling existent software to automatically utilize accelerators at runtime. BAAR is based on the LLVM Compiler Infrastructure and has a client-server architecture. The client runs the program to be accelerated in an environment which allows program analysis and profiling. Program parts which are identified as suitable for the available accelerator are exported and sent to the server. The server optimizes these program parts for the accelerator and provides RPC execution for the client. The client transforms its program to utilize accelerated execution on the server for offloaded program parts. We evaluate our work with a proof-of-concept implementation of BAAR that uses an Intel Xeon Phi 5110P as the acceleration target and performs automatic offloading, parallelization and vectorization of suitable program parts. The practicality of BAAR for real-world examples is shown based on a study of stencil codes. Our results show a speedup of up to 4 without any developer-provided hints and 5.77 with hints over the same code compiled with the Intel Compiler at optimization level O2 and running on an Intel Xeon E5-2670 machine. Based on our insights gained during implementation and evaluation we outline future directions of research, e.g., offloading more fine-granular program parts than functions, a more sophisticated communication mechanism or introducing on-stack replacement.}},
  author       = {{Damschen, Marvin and Plessl, Christian}},
  booktitle    = {{Proceedings of the 5th International Workshop on Adaptive Self-tuning Computing Systems (ADAPT)}},
  title        = {{{Easy-to-Use On-The-Fly Binary Program Acceleration on Many-Cores}}},
  year         = {{2015}},
}

@inproceedings{238,
  abstract     = {{In this paper, we study how binary applications can be transparently accelerated with novel heterogeneous computing resources without requiring any manual porting or developer-provided hints. Our work is based on Binary Acceleration At Runtime (BAAR), our previously introduced binary acceleration mechanism that uses the LLVM Compiler Infrastructure. BAAR is designed as a client-server architecture. The client runs the program to be accelerated in an environment, which allows program analysis and profiling and identifies and extracts suitable program parts to be offloaded. The server compiles and optimizes these offloaded program parts for the accelerator and offers access to these functions to the client with a remote procedure call (RPC) interface. Our previous work proved the feasibility of our approach, but also showed that communication time and overheads limit the granularity of functions that can be meaningfully offloaded. In this work, we motivate the importance of a lightweight, high-performance communication between server and client and present a communication mechanism based on the Message Passing Interface (MPI). We evaluate our approach by using an Intel Xeon Phi 5110P as the acceleration target and show that the communication overhead can be reduced from 40\% to 10\%, thus enabling even small hotspots to benefit from offloading to an accelerator.}},
  author       = {{Damschen, Marvin and Riebler, Heinrich and Vaz, Gavin Francis and Plessl, Christian}},
  booktitle    = {{Proceedings of the 2015 Conference on Design, Automation and Test in Europe (DATE)}},
  pages        = {{1078--1083}},
  publisher    = {{EDA Consortium / IEEE}},
  title        = {{{Transparent offloading of computational hotspots from binary code to Xeon Phi}}},
  doi          = {{10.7873/DATE.2015.1124}},
  year         = {{2015}},
}

@misc{334,
  author       = {{Wagener, Peter}},
  publisher    = {{Universität Paderborn}},
  title        = {{{Vertical Thread Migration in FPGA based Sound Localization}}},
  year         = {{2014}},
}

@inproceedings{347,
  abstract     = {{Dynamic thread duplication is a known redundancy technique for multi-cores. The approach duplicates a thread under observation for some time period and compares the signatures of the two threads to detect errors. Hybrid multi-cores, typically implemented on platform FPGAs, enable the unique option of running the thread under observation and its copy in different modalities, i.e., software and hardware. We denote our dynamic redundancy technique on hybrid multi-cores as thread shadowing. In this paper we present the concept of thread shadowing and an implementation on a multi-threaded hybrid multi-core architecture. We report on experiments with a block-processing application and demonstrate the overheads, detection latencies and coverage for a range of thread shadowing modes. The results show that trans-modal thread shadowing, although bearing long detection latencies, offers attractive coverage at a low overhead.}},
  author       = {{Meisner, Sebastian and Platzner, Marco}},
  booktitle    = {{Proceedings of the 10th International Symposium on Applied Reconfigurable Computing (ARC)}},
  editor       = {{Goehringer, Diana and Santambrogio, Marco Domenico and Cardoso, João M. P. and Bertels, Koen}},
  pages        = {{283--290}},
  publisher    = {{Springer}},
  title        = {{{Thread Shadowing: Using Dynamic Redundancy on Hybrid Multi-cores for Error Detection}}},
  doi          = {{10.1007/978-3-319-05960-0_30}},
  year         = {{2014}},
}

@misc{348,
  author       = {{Rüthing, Christoph}},
  publisher    = {{Universität Paderborn}},
  title        = {{{The Xilinx Zynq Architecture as a Platform for Reconfigurable Heterogeneous Multi-Cores}}},
  year         = {{2014}},
}

@inproceedings{368,
  abstract     = {{We consider the problem of scheduling a number of jobs on $m$ identical processors sharing a continuously divisible resource. Each job $j$ comes with a resource requirement $r_j \in [0,1]$. The job can be processed at full speed if granted its full resource requirement. If receiving only an $x$-portion of $r_j$, it is processed at an $x$-fraction of the full speed. Our goal is to find a resource assignment that minimizes the makespan (i.e., the latest completion time). Variants of such problems, relating the resource assignment of jobs to their \emph{processing speeds}, have been studied under the term discrete-continuous scheduling. Known results are either very pessimistic or heuristic in nature. In this paper, we suggest and analyze a slightly simplified model. It focuses on the assignment of shared continuous resources to the processors. The job assignment to processors and the ordering of the jobs have already been fixed. It is shown that, even for unit size jobs, finding an optimal solution is NP-hard if the number of processors is part of the input. Positive results for unit size jobs include an efficient optimal algorithm for 2 processors. Moreover, we prove that balanced schedules yield a $(2-1/m)$-approximation for a fixed number of processors. Such schedules are computed by our GreedyBalance algorithm, for which the bound is tight.}},
  author       = {{Brinkmann, Andre and Kling, Peter and Meyer auf der Heide, Friedhelm and Nagel, Lars and Riechers, Sören and Suess, Tim}},
  booktitle    = {{Proceedings of the 26th ACM Symposium on Parallelism in Algorithms and Architectures (SPAA)}},
  pages        = {{128--137}},
  title        = {{{Scheduling Shared Continuous Resources on Many-Cores}}},
  doi          = {{10.1145/2612669.2612698}},
  year         = {{2014}},
}

@inproceedings{374,
  abstract     = {{Run-time reconfiguration provides an opportunity to increase performance, reduce cost and improve energy efficiency in FPGA-based systems. However, run-time reconfigurable systems are more complex to implement than static only systems. This increases time to market, and introduces run-time overhead into the system. Our research aims to raise the abstraction level to develop run-time reconfigurable systems. We present operating system extensions which enable seamless integration of run-time reconfigurable hardware threads into applications. To improve resource utilization, the hardware threads are placed on a fine granularity tile grid. We take advantage of a relocatable module placer targeting modern FPGA to manage the reconfigurable area. The module placer accurately models the FPGA resources to compute feasible placement locations for the hardware threads at run-time. Finally, we evaluate our work by means of a case study that consists of a synthetic application to validate the functionality and performance of the implementation. The results show a reduction in reconfiguration time of up to 42\% and more than double resource utilization.}},
  author       = {{Wold, Alexander and Agne, Andreas and Torresen, Jim}},
  booktitle    = {{Proceedings of the 10th International Symposium on Reconfigurable Computing: Architectures, Tools, and Applications}},
  editor       = {{Goehringer, Diana and Santambrogio, Marco Domenico and Cardoso, João M. P. and Bertels, Koen}},
  pages        = {{61--72}},
  title        = {{{Relocatable Hardware Threads in Run-Time Reconfigurable Systems}}},
  doi          = {{10.1007/978-3-319-05960-0_6}},
  year         = {{2014}},
}

@inproceedings{451,
  abstract     = {{We introduce the concept of budget games. Players choose a set of tasks and each task has a certain demand on every resource in the game. Each resource has a budget. If the budget is not enough to satisfy the sum of all demands, it has to be shared between the tasks. We study strategic budget games, where the budget is shared proportionally. We also consider a variant in which the order of the strategic decisions influences the distribution of the budgets. The complexity of the optimal solution as well as existence, complexity and quality of equilibria are analysed. Finally, we show that the time an ordered budget game needs to convergence towards an equilibrium may be exponential.}},
  author       = {{Drees, Maximilian and Riechers, Sören and Skopalik, Alexander}},
  booktitle    = {{Proceedings of the 7th International Symposium on Algorithmic Game Theory (SAGT)}},
  editor       = {{Lavi, Ron}},
  pages        = {{110--121}},
  title        = {{{Budget-restricted utility games with ordered strategic decisions}}},
  doi          = {{10.1007/978-3-662-44803-8_10}},
  year         = {{2014}},
}

@misc{460,
  author       = {{Mittendorf, Robert}},
  publisher    = {{Universität Paderborn}},
  title        = {{{Advanced AES-key recovery from decayed RAM-dumps using multi-threading and FPGAs}}},
  year         = {{2014}},
}

@misc{466,
  author       = {{Brand, Marcel}},
  publisher    = {{Universität Paderborn}},
  title        = {{{A generalized loop accelerator implemented as a coarse grained array}}},
  year         = {{2014}},
}

@phdthesis{431,
  abstract     = {{In meiner Dissertation besch{\"a}ftige ich mich mit dem Entwurf und der Analyse energieeffizienter Schedulingalgorithmen, insbesondere f{\"u}r sogenannte Speed-Scaling Modelle. Diese stellen das theoretische Pendant von Techniken wie AMDs PowerNOW! und Intels SpeedStep dar, welche es erlauben die Geschwindigkeit von Prozessoren zur Laufzeit an die derzeitigen Bedingungen anzupassen. Theoretische Untersuchungen solcher Modelle sind auf eine Arbeit von Yao, Demers und Shenker (FOCS'95) zur{\"u}ckzuf{\"u}hren. Hier kombinieren die Autoren klassisches Deadline-Scheduling mit einem Prozessor, der Speed-Scaling beherrscht. Es gilt Jobs verschiedener Gr{\"o}ße fristgerecht abzuarbeiten und die dabei verwendete Energie zu minimieren. Der Energieverbrauch des Prozessors wird durch eine konvexe Funktion $P\colon\mathbb{R}_{\geq0}\to\mathbb{R}_{\geq0}$ modelliert, welche die Geschwindigkeit auf den Energieverbrauch abbildet. Meine Dissertation betrachtet verschiedene Varianten des urspr{\"u}nglichen Speed-Scaling Modells. Forschungsrelevante Ergebnisse sind in den Kapiteln 3 bis 6 zu finden und erstrecken sich {\"u}ber die im Folgenden beschriebenen Aspekte:- Kapitel 3 und 4 betrachten verschiedene \emph{Price-Collecting} Varianten des Originalproblems. Hier d{\"u}rfen einzelne Deadlines verfehlt werden, sofern eine jobabh{\"a}ngige Strafe gezahlt wird. Ich entwerfe insbesondere Online-Algorithmen mit einer beweisbar guten Competitiveness. Dabei liefern meine Ergebnisse substantielle Verbesserungen bestehender Arbeiten und erweitern diese unter anderem auf Szenarien mit mehreren Prozessoren.- In Kapitel 5 wird statt des klassischen Deadline-Schedulings eine Linearkombination der durchschnittlichen Antwortzeit und des Energieverbrauchs betrachtet. Die Frage, ob dieses Problem NP-schwer ist, stellt eine der zentralen Forschungsfragen in diesem Gebiet dar. 
 F{\"u}r eine relaxierte Form dieser Frage entwerfe ich einen effizienten Algorithmus und beweise seine Optimalit{\"a}t.- Das letzte Kapitel betrachtet ein Modell, welches – auf den ersten Blick – nicht direkt zur Speed-Scaling Literatur z{\"a}hlt. Hier geht es stattdessen um ein allgemeines Resource-Constrained Scheduling, in dem sich die Prozessoren zusammen eine gemeinsame, beliebig aufteilbare Ressource teilen. Ich untersuche die Komplexit{\"a}t des Problems und entwerfe verschiedene Approximationsalgorithmen.}},
  author       = {{Kling, Peter}},
  school       = {{Universität Paderborn}},
  title        = {{{Energy-efficient Scheduling Algorithms}}},
  year         = {{2014}},
}

@inproceedings{435,
  abstract     = {{We give a polynomial time algorithm to compute an optimal energy and fractional weighted flow trade-off schedule for a speed-scalable processor with discrete speeds. Our algorithm uses a geometric approach that is based on structural properties obtained from a primal-dual formulation of the problem.}},
  author       = {{Antoniadis, Antonios and Barcelo, Neal and Consuegra, Mario and Kling, Peter and Nugent, Michael and Pruhs, Kirk and Scquizzato, Michele}},
  booktitle    = {{Proceedings of the 31st Symposium on Theoretical Aspects of Computer Science (STACS)}},
  pages        = {{63--74}},
  title        = {{{Efficient Computation of Optimal Energy and Fractional Weighted Flow Trade-off Schedules}}},
  doi          = {{10.4230/LIPIcs.STACS.2014.63}},
  year         = {{2014}},
}

