% Conventions for the entries below:
% - author lists use a single pair of braces so that every name separated by
%   the word and is parsed as an individual author rather than one literal name;
% - titles keep their published capitalisation, with braces only around proper
%   nouns and acronyms that must survive style-driven recasing.

% Journal article on the HighPerMeshes framework (2021).
@article{24788,
  author   = {Alhaddad, Samer and Förstner, Jens and Groth, Stefan and
              Grünewald, Daniel and Grynko, Yevgen and Hannig, Frank and
              Kenter, Tobias and Pfreundt, Franz-Josef and Plessl, Christian and
              Schotte, Merlind and Steinke, Thomas and Teich, Jürgen and
              Weiser, Martin and Wende, Florian},
  title    = {The {HighPerMeshes} framework for numerical algorithms on unstructured grids},
  journal  = {Concurrency and Computation: Practice and Experience},
  year     = {2021},
  pages    = {e6616},
  issn     = {1532-0626},
  doi      = {10.1002/cpe.6616},
  keywords = {tet_topic_hpc},
}

% Conference paper (Computing Frontiers, 2020).
% NOTE(review): location holds the conference venue here; biblatex interprets
% location as the publisher's place -- confirm against the target style.
@inproceedings{16852,
  author    = {Groth, Stefan and Grünewald, Daniel and Teich, Jürgen and Hannig, Frank},
  title     = {A Runtime System for Finite Element Methods in a Partitioned Global Address Space},
  booktitle = {Proceedings of the 17th {ACM} International Conference on Computing Frontiers ({CF} '2020)},
  publisher = {ACM},
  location  = {Catania, Sicily, Italy},
  year      = {2020},
  doi       = {10.1145/3387902.3392628},
}

% Conference paper (FPT, 2019). The abstract is reproduced verbatim.
@inproceedings{15478,
  author    = {Gorlani, Paolo and Kenter, Tobias and Plessl, Christian},
  title     = {{OpenCL} Implementation of {Cannon's} Matrix Multiplication Algorithm on {Intel} {Stratix} 10 {FPGAs}},
  booktitle = {Proceedings of the International Conference on Field-Programmable Technology ({FPT})},
  publisher = {IEEE},
  year      = {2019},
  doi       = {10.1109/ICFPT47387.2019.00020},
  abstract  = {Stratix 10 FPGA cards have a good potential for the acceleration of HPC workloads since the Stratix 10 product line introduces devices with a large number of DSP and memory blocks. The high level synthesis of OpenCL codes can play a fundamental role for FPGAs in HPC, because it allows to implement different designs with lower development effort compared to hand optimized HDL. However, Stratix 10 cards are still hard to fully exploit using the Intel FPGA SDK for OpenCL. The implementation of designs with thousands of concurrent arithmetic operations often suffers from place and route problems that limit the maximum frequency or entirely prevent a successful synthesis. In order to overcome these issues for the implementation of the matrix multiplication, we formulate Cannon's matrix multiplication algorithm with regard to its efficient synthesis within the FPGA logic.
    We obtain a two-level block algorithm, where the lower level sub-matrices are multiplied using our Cannon's algorithm implementation. Following this design approach with multiple compute units, we are able to get maximum frequencies close to and above 300 MHz with high utilization of DSP and memory blocks. This allows for performance results above 1 TeraFLOPS.},
}

% Workshop paper (SCOPES, 2019). The abstract is reproduced verbatim.
@inproceedings{16223,
  author    = {Groth, Stefan and Schmitt, Christian and Teich, Jürgen and Hannig, Frank},
  title     = {{SYCL} Code Generation for Multigrid Methods},
  booktitle = {Proceedings of the 22nd International Workshop on Software and Compilers for Embedded Systems - {SCOPES} '19},
  year      = {2019},
  isbn      = {9781450367622},
  doi       = {10.1145/3323439.3323984},
  abstract  = {Multigrid methods are fast and scalable numerical solvers for partial differential equations (PDEs) that possess a large design space for implementing their algorithmic components. Code generation approaches allow formulating multigrid methods on a higher level of abstraction that can then be used to derive a problem- and hardware-specific solutions. Since these problems have a considerable implementation variability, it is crucial to investigate a general mapping of core components in multigrid methods to the target software. With SYCL there exists a high-level C++ abstraction layer that is capable of targeting a multitude of architectures.
    We contribute a general way to map multigrid components to SYCL functionality and provide a performance evaluation for specific algorithmic component.},
}

@inproceedings{3588,
  abstract = {{In scientific computing, unstructured meshes are a crucial foundation for the simulation of real-world physical phenomena. Compared to regular grids, they allow resembling the computational domain with a much higher accuracy, which in turn leads to more efficient computations.
There exists a wealth of supporting libraries and frameworks that aid programmers with the implementation of applications working on such grids, each built on top of existing parallelization technologies. However, many approaches require the programmer to introduce a different programming paradigm into their application or provide different variants of the code. SYCL is a new programming standard providing a remedy to this dilemma by building on standard C++17 with its so-called single-source approach: Programmers write standard C++ code and expose parallelism using C++17 keywords. The application is
then transformed into a concrete implementation by the SYCL implementation. By encapsulating the OpenCL ecosystem, different SYCL implementations enable not only the programming of CPUs but also of heterogeneous platforms such as GPUs or other devices. For the first time, this paper showcases a
SYCL-based solver for the nodal Discontinuous Galerkin method for Maxwell’s equations on unstructured meshes. We compare our solution to a previous C-based implementation with respect to programmability and performance on heterogeneous platforms.