[{"citation":{"ama":"Stachura P, Wu X, Plessl C, Fang Z. SORCERI: Streaming Overlay Acceleration for Highly Contracted Electron Repulsion Integral Computations in Quantum Chemistry. In: <i>Proceedings of the 2026 ACM/SIGDA International Symposium on Field Programmable Gate Arrays (FPGA ’26)</i>. Association for Computing Machinery; 2026:224-234. doi:<a href=\"https://doi.org/10.1145/3748173.3779198\">10.1145/3748173.3779198</a>","chicago":"Stachura, Philip, Xin Wu, Christian Plessl, and Zhenman Fang. “SORCERI: Streaming Overlay Acceleration for Highly Contracted Electron Repulsion Integral Computations in Quantum Chemistry.” In <i>Proceedings of the 2026 ACM/SIGDA International Symposium on Field Programmable Gate Arrays (FPGA ’26)</i>, 224–34. New York, NY, USA: Association for Computing Machinery, 2026. <a href=\"https://doi.org/10.1145/3748173.3779198\">https://doi.org/10.1145/3748173.3779198</a>.","ieee":"P. Stachura, X. Wu, C. Plessl, and Z. Fang, “SORCERI: Streaming Overlay Acceleration for Highly Contracted Electron Repulsion Integral Computations in Quantum Chemistry,” in <i>Proceedings of the 2026 ACM/SIGDA International Symposium on Field Programmable Gate Arrays (FPGA ’26)</i>, 2026, pp. 224–234, doi: <a href=\"https://doi.org/10.1145/3748173.3779198\">10.1145/3748173.3779198</a>.","bibtex":"@inproceedings{Stachura_Wu_Plessl_Fang_2026, place={New York, NY, USA}, title={SORCERI: Streaming Overlay Acceleration for Highly Contracted Electron Repulsion Integral Computations in Quantum Chemistry}, DOI={<a href=\"https://doi.org/10.1145/3748173.3779198\">10.1145/3748173.3779198</a>}, booktitle={Proceedings of the 2026 ACM/SIGDA International Symposium on Field Programmable Gate Arrays (FPGA ’26)}, publisher={Association for Computing Machinery}, author={Stachura, Philip and Wu, Xin and Plessl, Christian and Fang, Zhenman}, year={2026}, pages={224–234} }","mla":"Stachura, Philip, et al. 
“SORCERI: Streaming Overlay Acceleration for Highly Contracted Electron Repulsion Integral Computations in Quantum Chemistry.” <i>Proceedings of the 2026 ACM/SIGDA International Symposium on Field Programmable Gate Arrays (FPGA ’26)</i>, Association for Computing Machinery, 2026, pp. 224–34, doi:<a href=\"https://doi.org/10.1145/3748173.3779198\">10.1145/3748173.3779198</a>.","short":"P. Stachura, X. Wu, C. Plessl, Z. Fang, in: Proceedings of the 2026 ACM/SIGDA International Symposium on Field Programmable Gate Arrays (FPGA ’26), Association for Computing Machinery, New York, NY, USA, 2026, pp. 224–234.","apa":"Stachura, P., Wu, X., Plessl, C., &#38; Fang, Z. (2026). SORCERI: Streaming Overlay Acceleration for Highly Contracted Electron Repulsion Integral Computations in Quantum Chemistry. <i>Proceedings of the 2026 ACM/SIGDA International Symposium on Field Programmable Gate Arrays (FPGA ’26)</i>, 224–234. <a href=\"https://doi.org/10.1145/3748173.3779198\">https://doi.org/10.1145/3748173.3779198</a>"},"page":"224-234","place":"New York, NY, USA","publication_status":"published","publication_identifier":{"isbn":["9798400720796"]},"main_file_link":[{"url":"https://dl.acm.org/doi/10.1145/3748173.3779198"}],"doi":"10.1145/3748173.3779198","author":[{"last_name":"Stachura","full_name":"Stachura, Philip","first_name":"Philip"},{"id":"77439","full_name":"Wu, Xin","last_name":"Wu","first_name":"Xin"},{"orcid":"0000-0001-5728-9982","last_name":"Plessl","id":"16153","full_name":"Plessl, Christian","first_name":"Christian"},{"first_name":"Zhenman","full_name":"Fang, Zhenman","last_name":"Fang"}],"date_updated":"2026-02-09T09:16:32Z","status":"public","type":"conference","user_id":"77439","department":[{"_id":"27"},{"_id":"518"}],"project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"63890","year":"2026","title":"SORCERI: Streaming Overlay Acceleration for Highly Contracted Electron Repulsion Integral Computations 
in Quantum Chemistry","date_created":"2026-02-06T06:43:22Z","publisher":"Association for Computing Machinery","abstract":[{"text":"The computation of highly contracted electron repulsion integrals (ERIs) is essential to achieve quantum accuracy in atomistic simulations based on quantum mechanics. Its growing computational demands make energy efficiency a critical concern. Recent studies demonstrate FPGAs’ superior performance and energy efficiency for computing primitive ERIs, but the computation of highly contracted ERIs introduces significant algorithmic complexity and new design challenges for FPGA acceleration. In this work, we present SORCERI, the first streaming overlay acceleration for highly contracted ERI computations on FPGAs. SORCERI introduces a novel streaming Rys computing unit to calculate roots and weights of Rys polynomials on-chip, and a streaming contraction unit for the contraction of primitive ERIs. This shifts the design bottleneck from limited CPU-FPGA communication bandwidth to available FPGA computation resources. To address practical deployment challenges for a large number of quartet classes, we design three streaming overlays, together with an efficient memory transpose optimization, to cover the 21 most commonly used quartet classes in realistic atomistic simulations. To address the new computation constraints, we use flexible calculation stages with a free-running streaming architecture to achieve high DSP utilization and good timing closure. Experiments demonstrate that SORCERI achieves an average 5.96x, 1.99x, and 1.16x better performance per watt than libint on a 64-core AMD EPYC 7713 CPU, libintx on an Nvidia A40 GPU, and SERI, the prior best-performing FPGA design for primitive ERIs. Furthermore, SORCERI reaches a peak throughput of 44.11 GERIS (10^9 ERIs per second) that is 1.52x, 1.13x, and 1.93x greater than libint, libintx and SERI, respectively. 
SORCERI will be released soon at https://github.com/SFU-HiAccel/SORCERI.","lang":"eng"}],"publication":"Proceedings of the 2026 ACM/SIGDA International Symposium on Field Programmable Gate Arrays (FPGA '26)","language":[{"iso":"eng"}],"keyword":["electron repulsion integrals","quantum chemistry","atomistic simulation","overlay architecture","fpga acceleration"]},{"year":"2019","intvolume":"       123","page":"77-89","citation":{"mla":"Boschmann, Alexander, et al. “Zynq-Based Acceleration of Robust High Density Myoelectric Signal Processing.” <i>Journal of Parallel and Distributed Computing</i>, vol. 123, Elsevier, 2019, pp. 77–89, doi:<a href=\"https://doi.org/10.1016/j.jpdc.2018.07.004\">10.1016/j.jpdc.2018.07.004</a>.","bibtex":"@article{Boschmann_Agne_Thombansen_Witschen_Kraus_Platzner_2019, title={Zynq-based acceleration of robust high density myoelectric signal processing}, volume={123}, DOI={<a href=\"https://doi.org/10.1016/j.jpdc.2018.07.004\">10.1016/j.jpdc.2018.07.004</a>}, journal={Journal of Parallel and Distributed Computing}, publisher={Elsevier}, author={Boschmann, Alexander and Agne, Andreas and Thombansen, Georg and Witschen, Linus Matthias and Kraus, Florian and Platzner, Marco}, year={2019}, pages={77–89} }","short":"A. Boschmann, A. Agne, G. Thombansen, L.M. Witschen, F. Kraus, M. Platzner, Journal of Parallel and Distributed Computing 123 (2019) 77–89.","apa":"Boschmann, A., Agne, A., Thombansen, G., Witschen, L. M., Kraus, F., &#38; Platzner, M. (2019). Zynq-based acceleration of robust high density myoelectric signal processing. <i>Journal of Parallel and Distributed Computing</i>, <i>123</i>, 77–89. <a href=\"https://doi.org/10.1016/j.jpdc.2018.07.004\">https://doi.org/10.1016/j.jpdc.2018.07.004</a>","ama":"Boschmann A, Agne A, Thombansen G, Witschen LM, Kraus F, Platzner M. Zynq-based acceleration of robust high density myoelectric signal processing. <i>Journal of Parallel and Distributed Computing</i>. 2019;123:77-89. 
doi:<a href=\"https://doi.org/10.1016/j.jpdc.2018.07.004\">10.1016/j.jpdc.2018.07.004</a>","chicago":"Boschmann, Alexander, Andreas Agne, Georg Thombansen, Linus Matthias Witschen, Florian Kraus, and Marco Platzner. “Zynq-Based Acceleration of Robust High Density Myoelectric Signal Processing.” <i>Journal of Parallel and Distributed Computing</i> 123 (2019): 77–89. <a href=\"https://doi.org/10.1016/j.jpdc.2018.07.004\">https://doi.org/10.1016/j.jpdc.2018.07.004</a>.","ieee":"A. Boschmann, A. Agne, G. Thombansen, L. M. Witschen, F. Kraus, and M. Platzner, “Zynq-based acceleration of robust high density myoelectric signal processing,” <i>Journal of Parallel and Distributed Computing</i>, vol. 123, pp. 77–89, 2019."},"publication_identifier":{"issn":["0743-7315"]},"publication_status":"published","title":"Zynq-based acceleration of robust high density myoelectric signal processing","doi":"10.1016/j.jpdc.2018.07.004","publisher":"Elsevier","date_updated":"2022-01-06T06:51:13Z","volume":123,"author":[{"first_name":"Alexander","full_name":"Boschmann, Alexander","last_name":"Boschmann"},{"last_name":"Agne","full_name":"Agne, Andreas","first_name":"Andreas"},{"first_name":"Georg","full_name":"Thombansen, Georg","last_name":"Thombansen"},{"first_name":"Linus Matthias","last_name":"Witschen","id":"49051","full_name":"Witschen, Linus Matthias"},{"full_name":"Kraus, Florian","last_name":"Kraus","first_name":"Florian"},{"first_name":"Marco","last_name":"Platzner","id":"398","full_name":"Platzner, Marco"}],"date_created":"2019-07-12T13:13:55Z","abstract":[{"lang":"eng","text":"Advances in electromyographic (EMG) sensor technology and machine learning algorithms have led to an increased research effort into high density EMG-based pattern recognition methods for prosthesis control. 
With the goal set on an autonomous multi-movement prosthesis capable of performing training and classification of an amputee’s EMG signals, the focus of this paper lies in the acceleration of the embedded signal processing chain. We present two Xilinx Zynq-based architectures for accelerating two inherently different high density EMG-based control algorithms. The first hardware accelerated design achieves speed-ups of up to 4.8 over the software-only solution, allowing for a processing delay lower than the sample period of 1 ms. The second system achieved a speed-up of 5.5 over the software-only version and operates at a still satisfactory low processing delay of up to 15 ms while providing a higher reliability and robustness against electrode shift and noisy channels."}],"status":"public","publication":"Journal of Parallel and Distributed Computing","type":"journal_article","keyword":["High density electromyography","FPGA acceleration","Medical signal processing","Pattern recognition","Prosthetics"],"language":[{"iso":"eng"}],"_id":"11950","department":[{"_id":"78"}],"user_id":"398"}]