@unpublished{61152,
  abstract     = {While neural network quantization effectively reduces the cost of matrix multiplications, aggressive quantization can expose non-matrix-multiply operations as significant performance and resource bottlenecks on embedded systems. Addressing such bottlenecks requires a comprehensive approach to tailoring the precision across operations in the inference computation. To this end, we introduce scaled-integer range analysis (SIRA), a static analysis technique employing interval arithmetic to determine the range, scale, and bias for tensors in quantized neural networks. We show how this information can be exploited to reduce the resource footprint of FPGA dataflow neural network accelerators via tailored bitwidth adaptation for accumulators and downstream operations, aggregation of scales and biases, and conversion of consecutive elementwise operations to thresholding operations. We integrate SIRA-driven optimizations into the open-source FINN framework, then evaluate their effectiveness across a range of quantized neural network workloads and compare implementation alternatives for non-matrix-multiply operations. We demonstrate an average reduction of 17\% for LUTs, 66\% for DSPs, and 22\% for accumulator bitwidths with SIRA optimizations, providing detailed benchmark analysis and analytical models to guide the implementation style for non-matrix layers. Finally, we open-source SIRA to facilitate community exploration of its benefits across various applications and hardware platforms.},
  author       = {Umuroglu, Yaman and Berganski, Christoph and Jentzsch, Felix and Danilowicz, Michal and Kryjak, Tomasz and Bezaitis, Charalampos and Sjalander, Magnus and Colbert, Ian and Preusser, Thomas and Petri-Koenig, Jakoba and Blott, Michaela},
  title        = {{SIRA}: Scaled-Integer Range Analysis for Optimizing {FPGA} Dataflow Neural Network Accelerators},
  note         = {Preprint},
  year         = {2025},
}

@inproceedings{56481,
  author        = {Berganski, Christoph and Jentzsch, Felix and Platzner, Marco and Kuhmichel, Max and Giefers, Heiner},
  location      = {Sydney},
  title         = {{FINN-T}: Compiling Custom Dataflow Accelerators for Quantized Transformers},
  year          = {2024},
  internal-note = {missing required booktitle (proceedings title) for @inproceedings -- confirm and add},
}

@incollection{45899,
  author       = {Boschmann, Alexander and Clausing, Lennart and Jentzsch, Felix and Ghasemzadeh Mohammadi, Hassan and Platzner, Marco},
  booktitle    = {On-The-Fly Computing -- Individualized {IT}-services in dynamic markets},
  editor       = {Haake, Claus-Jochen and Meyer auf der Heide, Friedhelm and Platzner, Marco and Wachsmuth, Henning and Wehrheim, Heike},
  pages        = {225--236},
  publisher    = {Heinz Nixdorf Institut, Universit{\"a}t Paderborn},
  title        = {Flexible Industrial Analytics on Reconfigurable {Systems-On-Chip}},
  doi          = {10.5281/zenodo.8068713},
  volume       = {412},
  year         = {2023},
}

@inproceedings{53435,
  author       = {Jentzsch, Felix},
  booktitle    = {2023 33rd International Conference on Field-Programmable Logic and Applications (FPL)},
  publisher    = {IEEE},
  title        = {Hardware-Aware {AutoML} for Exploration of Custom {FPGA} Accelerators for {RadioML}},
  doi          = {10.1109/fpl60245.2023.00066},
  year         = {2023},
}

@article{33990,
  abstract     = {Deep neural networks (DNNs) are penetrating into a broad spectrum of applications and replacing manual algorithmic implementations, including the radio frequency communications domain with classical signal processing algorithms. However, the high throughput (gigasamples per second) and low latency requirements of this application domain pose a significant hurdle for adopting computationally demanding DNNs. In this article, we explore highly specialized DNN inference accelerator approaches on field-programmable gate arrays (FPGAs) for RadioML modulation classification. Using an automated end-to-end flow for the generation of the FPGA solution, we can easily explore a spectrum of solutions that optimize for different design targets, including accuracy, power efficiency, resources, throughput, and latency. By leveraging reduced precision arithmetic and customized streaming dataflow, we demonstrate a solution that meets the application requirements and outperforms alternative FPGA efforts by 3.5x in terms of throughput. Against modern embedded graphics processing units (GPUs), we measure $>$10x higher throughput and $>$100x lower latency under comparable accuracy and power envelopes.},
  author       = {Jentzsch, Felix and Umuroglu, Yaman and Pappalardo, Alessandro and Blott, Michaela and Platzner, Marco},
  journal      = {IEEE Micro},
  number       = {6},
  pages        = {125--133},
  publisher    = {IEEE},
  title        = {{RadioML} Meets {FINN}: Enabling Future {RF} Applications With {FPGA} Streaming Architectures},
  doi          = {10.1109/MM.2022.3202091},
  volume       = {42},
  year         = {2022},
}

@inproceedings{30908,
  author       = {Ghasemzadeh Mohammadi, Hassan and Jentzsch, Felix and Kuschel, Maurice and Arshad, Rahil and Rautmare, Sneha and Manjunatha, Suraj and Platzner, Marco and Boschmann, Alexander and Schollbach, Dirk},
  booktitle    = {Machine Learning and Principles and Practice of Knowledge Discovery in Databases},
  publisher    = {Springer},
  title        = {{FLight}: {FPGA} Acceleration of Lightweight {DNN} Model Inference in Industrial Analytics},
  doi          = {10.1007/978-3-030-93736-2_27},
  year         = {2021},
}

