@article{7689,
  author       = {Riebler, Heinrich and Vaz, Gavin Francis and Kenter, Tobias and Plessl, Christian},
  journal      = {{ACM} Transactions on Architecture and Code Optimization ({TACO})},
  keywords     = {htrop},
  number       = {2},
  pages        = {14:1--14:26},
  publisher    = {ACM},
  title        = {Transparent Acceleration for Heterogeneous Platforms with Compilation to {OpenCL}},
  doi          = {10.1145/3319423},
  volume       = {16},
  year         = {2019},
}

@phdthesis{14849,
  author       = {Vaz, Gavin Francis},
  school       = {Universität Paderborn},
  title        = {Using Just-in-Time Code Generation to Transparently Accelerate Applications in Heterogeneous Systems},
  year         = {2019},
}

@inproceedings{1204,
  author       = {Riebler, Heinrich and Vaz, Gavin Francis and Kenter, Tobias and Plessl, Christian},
  booktitle    = {Proceedings of the {ACM} {SIGPLAN} Symposium on Principles and Practice of Parallel Programming ({PPoPP})},
  isbn         = {9781450349826},
  keywords     = {htrop},
  publisher    = {ACM},
  title        = {Automated Code Acceleration Targeting Heterogeneous {OpenCL} Devices},
  doi          = {10.1145/3178487.3178534},
  year         = {2018},
}

@inproceedings{31,
  author       = {Riebler, Heinrich and Vaz, Gavin Francis and Plessl, Christian and Trainiti, Ettore M. G. and Durelli, Gianluca C. and Bolchini, Cristiana},
  booktitle    = {Proceedings of the {HiPEAC} Workshop on Reconfigurable Computing ({WRC})},
  title        = {Using Just-in-Time Code Generation for Transparent Resource Management in Heterogeneous Systems},
  year         = {2016},
}

@inproceedings{138,
  abstract     = {Hardware accelerators are becoming popular in academia and industry. To move one step further from the state-of-the-art multicore plus accelerator approaches, we present in this paper our innovative SAVEHSA architecture. It comprises of a heterogeneous hardware platform with three different high-end accelerators attached over PCIe (GPGPU, FPGA and Intel MIC). Such systems can process parallel workloads very efficiently whilst being more energy efficient than regular CPU systems. To leverage the heterogeneity, the workload has to be distributed among the computing units in a way that each unit is well-suited for the assigned task and executable code must be available. To tackle this problem we present two software components; the first can perform resource allocation at runtime while respecting system and application goals (in terms of throughput, energy, latency, etc.) and the second is able to analyze an application and generate executable code for an accelerator at runtime. We demonstrate the first proof-of-concept implementation of our framework on the heterogeneous platform, discuss different runtime policies and measure the introduced overheads.},
  author       = {Riebler, Heinrich and Vaz, Gavin Francis and Plessl, Christian and Trainiti, Ettore M. G. and Durelli, Gianluca C. and Del Sozzo, Emanuele and Santambrogio, Marco D. and Bolchini, Cristiana},
  booktitle    = {Proceedings of the International Forum on Research and Technologies for Society and Industry ({RTSI})},
  pages        = {1--5},
  publisher    = {IEEE},
  title        = {Using Just-in-Time Code Generation for Transparent Resource Management in Heterogeneous Systems},
  doi          = {10.1109/RTSI.2016.7740545},
  year         = {2016},
}

@article{165,
  abstract     = {A broad spectrum of applications can be accelerated by offloading computation intensive parts to reconfigurable hardware. However, to achieve speedups, the number of loop iterations (trip count) needs to be sufficiently large to amortize offloading overheads. Trip counts are frequently not known at compile time, but only at runtime just before entering a loop. Therefore, we propose to generate code for both the CPU and the coprocessor, and defer the offloading decision to the application runtime. We demonstrate how a toolflow, based on the LLVM compiler framework, can automatically embed dynamic offloading decisions into the application code. We perform in-depth static and dynamic analysis of popular benchmarks, which confirm the general potential of such an approach. We also propose to optimize the offloading process by decoupling the runtime decision from the loop execution (decision slack). The feasibility of our approach is demonstrated by a toolflow that automatically identifies suitable data-parallel loops and generates code for the FPGA coprocessor of a Convey HC-1. We evaluate the integrated toolflow with representative loops executed for different input data sizes.},
  author       = {Vaz, Gavin Francis and Riebler, Heinrich and Kenter, Tobias and Plessl, Christian},
  issn         = {0045-7906},
  journal      = {Computers and Electrical Engineering},
  pages        = {91--111},
  publisher    = {Elsevier},
  title        = {Potential and Methods for Embedding Dynamic Offloading Decisions into Application Code},
  doi          = {10.1016/j.compeleceng.2016.04.021},
  volume       = {55},
  year         = {2016},
}

@inproceedings{171,
  author       = {Kenter, Tobias and Vaz, Gavin Francis and Riebler, Heinrich and Plessl, Christian},
  booktitle    = {Workshop on Reconfigurable Computing ({WRC})},
  title        = {Opportunities for Deferring Application Partitioning and Accelerator Synthesis to Runtime (Extended Abstract)},
  year         = {2016},
}

@inproceedings{238,
  abstract     = {In this paper, we study how binary applications can be transparently accelerated with novel heterogeneous computing resources without requiring any manual porting or developer-provided hints. Our work is based on Binary Acceleration At Runtime (BAAR), our previously introduced binary acceleration mechanism that uses the LLVM Compiler Infrastructure. BAAR is designed as a client-server architecture. The client runs the program to be accelerated in an environment, which allows program analysis and profiling and identifies and extracts suitable program parts to be offloaded. The server compiles and optimizes these offloaded program parts for the accelerator and offers access to these functions to the client with a remote procedure call (RPC) interface. Our previous work proved the feasibility of our approach, but also showed that communication time and overheads limit the granularity of functions that can be meaningfully offloaded. In this work, we motivate the importance of a lightweight, high-performance communication between server and client and present a communication mechanism based on the Message Passing Interface (MPI). We evaluate our approach by using an Intel Xeon Phi 5110P as the acceleration target and show that the communication overhead can be reduced from 40\% to 10\%, thus enabling even small hotspots to benefit from offloading to an accelerator.},
  author       = {Damschen, Marvin and Riebler, Heinrich and Vaz, Gavin Francis and Plessl, Christian},
  booktitle    = {Proceedings of the 2015 Conference on Design, Automation and Test in Europe ({DATE})},
  pages        = {1078--1083},
  publisher    = {EDA Consortium / IEEE},
  title        = {Transparent Offloading of Computational Hotspots from Binary Code to {Xeon Phi}},
  doi          = {10.7873/DATE.2015.1124},
  year         = {2015},
}

@inproceedings{388,
  abstract     = {In order to leverage the use of reconfigurable architectures in general-purpose computing, quick and automated methods to find suitable accelerator designs are required. We tackle this challenge in both regards. In order to avoid long synthesis times, we target a vector coprocessor, implemented on the FPGAs of a Convey HC-1. Previous studies showed that existing tools were not able to accelerate a real-world application with low effort. We present a toolflow to automatically identify suitable loops for vectorization, generate a corresponding hardware/software bipartition, and generate coprocessor code. Where applicable, we leverage outer-loop vectorization. We evaluate our tools with a set of characteristic loops, systematically analyzing different dependency and data layout properties.},
  author       = {Kenter, Tobias and Vaz, Gavin Francis and Plessl, Christian},
  booktitle    = {Proceedings of the International Symposium on Reconfigurable Computing: Architectures, Tools, and Applications ({ARC})},
  pages        = {144--155},
  publisher    = {Springer International Publishing},
  series       = {Lecture Notes in Computer Science},
  title        = {Partitioning and Vectorizing Binary Applications for a Reconfigurable Vector Computer},
  doi          = {10.1007/978-3-319-05960-0_13},
  volume       = {8405},
  year         = {2014},
}

@inproceedings{1778,
  author       = {Durelli, Gianluca C. and Pogliani, Marcello and Miele, Antonio and Plessl, Christian and Riebler, Heinrich and Vaz, Gavin Francis and Santambrogio, Marco D. and Bolchini, Cristiana},
  booktitle    = {Proceedings of the International Symposium on Parallel and Distributed Processing with Applications ({ISPA})},
  pages        = {142--149},
  publisher    = {IEEE},
  title        = {Runtime Resource Management in Heterogeneous System Architectures: The {SAVE} Approach},
  doi          = {10.1109/ISPA.2014.27},
  year         = {2014},
}

@inproceedings{439,
  abstract     = {Reconfigurable architectures provide an opportunity to accelerate a wide range of applications, frequently by exploiting data-parallelism, where the same operations are homogeneously executed on a (large) set of data. However, when the sequential code is executed on a host CPU and only data-parallel loops are executed on an FPGA coprocessor, a sufficiently large number of loop iterations (trip counts) is required, such that the control- and data-transfer overheads to the coprocessor can be amortized. However, the trip count of large data-parallel loops is frequently not known at compile time, but only at runtime just before entering a loop. Therefore, we propose to generate code both for the CPU and the coprocessor, and to defer the decision where to execute the appropriate code to the runtime of the application when the trip count of the loop can be determined just at runtime. We demonstrate how an LLVM compiler based toolflow can automatically insert appropriate decision blocks into the application code. Analyzing popular benchmark suites, we show that this kind of runtime decisions is often applicable. The practical feasibility of our approach is demonstrated by a toolflow that automatically identifies loops suitable for vectorization and generates code for the FPGA coprocessor of a Convey HC-1. The toolflow adds decisions based on a comparison of the runtime-computed trip counts to thresholds for specific loops and also includes support to move just the required data to the coprocessor. We evaluate the integrated toolflow with characteristic loops executed on different input data sizes.},
  author       = {Vaz, Gavin Francis and Riebler, Heinrich and Kenter, Tobias and Plessl, Christian},
  booktitle    = {Proceedings of the International Conference on {ReConFigurable} Computing and {FPGAs} ({ReConFig})},
  pages        = {1--8},
  publisher    = {IEEE},
  title        = {Deferring Accelerator Offloading Decisions to Application Runtime},
  doi          = {10.1109/ReConFig.2014.7032509},
  year         = {2014},
}

% NOTE(review): same title, co-authors and year as entry 1785 (ISORC); possibly the
% same paper recorded twice -- confirm before citing both.
@inproceedings{25292,
  author       = {Rammig, Franz-Josef and Stahl, Katharina and Vaz, Gavin Francis},
  booktitle    = {Proceedings of the 4th {IEEE} Workshop on Self-Organizing Real-Time Systems ({SORT})},
  publisher    = {IEEE},
  title        = {A Framework for Enhancing Dependability in Self-x Systems by Artificial Immune Systems},
  year         = {2013},
}

@inproceedings{1785,
  author       = {Rammig, Franz and Stahl, Katharina and Vaz, Gavin Francis},
  booktitle    = {{IEEE} International Symposium on Object/Component/Service-Oriented Real-Time Distributed Computing ({ISORC})},
  pages        = {1--10},
  publisher    = {IEEE},
  title        = {A Framework for Enhancing Dependability in Self-x Systems by Artificial Immune Systems},
  doi          = {10.1109/ISORC.2013.6913240},
  year         = {2013},
}