% Journal article (2019, The Journal of Supercomputing): scheduling on the Xeon+FPGA (HARP) platform.
% Field values use a single brace pair so names are split at " and " and the style controls title casing.
@article{16423,
  abstract     = {Heterogeneous computing that exploits simultaneous co-processing with different device types has been shown to be effective at both increasing performance and reducing energy consumption. In this paper, we extend a scheduling framework encapsulated in a high-level C++ template and previously developed for heterogeneous chips comprising CPU and GPU cores, to new high-performance platforms for the data center, which include a cache coherent FPGA fabric and many-core CPU resources. Our goal is to evaluate the suitability of our framework with these new FPGA-based platforms, identifying performance benefits and limitations. We target the state-of-the-art HARP processor that includes 14 high-end Xeon classes tightly coupled to a FPGA device located in the same package. We select eight benchmarks from the high-performance computing domain that have been ported and optimized for this heterogeneous platform. The results show that a dynamic and adaptive scheduler that exploits simultaneous processing among the devices can improve performance up to a factor of 8 × compared to the best alternative solutions that only use the CPU cores or the FPGA fabric. Moreover, our proposal achieves up to 15\% and 37\% of improvement compared to the best heterogeneous solutions found with a dynamic and static schedulers, respectively.},
  author       = {Rodríguez, Andrés and Navarro, Angeles and Asenjo, Rafael and Corbera, Francisco and Gran, Rubén and Suárez, Darío and Nunez-Yanez, Jose},
  issn         = {0920-8542},
  journal      = {The Journal of Supercomputing},
  keywords     = {pc2-harp-ressources},
  title        = {Parallel multiprocessing and scheduling on the heterogeneous {Xeon+FPGA} platform},
  doi          = {10.1007/s11227-019-02935-1},
  year         = {2019},
}

% Journal article (2003, The Journal of Supercomputing 26(2)): instance-specific accelerators for minimum covering.
% Field values use a single brace pair so names are split at " and " and the style controls title casing.
@article{2420,
  abstract     = {This paper presents the acceleration of minimum-cost covering problems by instance-specific hardware. First, we formulate the minimum-cost covering problem and discuss a branch \& bound algorithm to solve it. Then we describe instance-specific hardware architectures that implement branch \& bound in 3-valued logic and use reduction techniques similar to those found in software solvers. We further present prototypical accelerator implementations and a corresponding design tool flow. Our experiments reveal significant raw speedups up to five orders of magnitude for a set of smaller unate covering problems. Provided that hardware compilation times can be reduced, we conclude that instance-specific acceleration of hard minimum-cost covering problems will lead to substantial overall speedups.},
  author       = {Plessl, Christian and Platzner, Marco},
  issn         = {0920-8542},
  journal      = {The Journal of Supercomputing},
  keywords     = {reconfigurable computing, instance-specific acceleration, minimum covering},
  number       = {2},
  pages        = {109--129},
  publisher    = {Kluwer Academic Publishers},
  title        = {Instance-Specific Accelerators for Minimum Covering},
  doi          = {10.1023/a:1024443416592},
  volume       = {26},
  year         = {2003},
}

