[{"abstract":[{"text":"The intelligibility of demodulated audio signals from analog high frequency transmissions, e.g., using single-sideband\r\n(SSB) modulation, can be severely degraded by channel distortions and/or a mismatch between modulation and demodulation carrier frequency. In this work a neural network (NN)-based approach for carrier frequency offset (CFO) estimation from demodulated SSB signals is proposed, whereby a task specific architecture is presented. Additionally, a simulation framework for SSB signals is introduced and utilized for training the NNs. The CFO estimator is combined with a speech enhancement network to investigate its influence on the enhancement performance. The NN-based system is compared to a recently proposed pitch tracking based approach on publicly available data from real high frequency transmissions. Experiments show that the NN exhibits good CFO estimation properties and results in significant improvements in speech intelligibility, especially when combined with a noise reduction network.","lang":"eng"}],"status":"public","file":[{"relation":"main_file","success":1,"content_type":"application/pdf","access_level":"closed","file_name":"cfo.pdf","file_id":"33472","file_size":1231379,"creator":"jensheit","date_created":"2022-09-22T10:48:31Z","date_updated":"2022-09-22T10:48:31Z"}],"publication":"Proceedings of the 30th European Signal Processing Conference (EUSIPCO)","type":"conference","ddc":["000"],"file_date_updated":"2022-09-22T10:48:31Z","language":[{"iso":"eng"}],"_id":"33471","project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"460","place":"Belgrad","year":"2022","citation":{"short":"J. Heitkämper, J. Schmalenstroeer, R. Haeb-Umbach, in: Proceedings of the 30th European Signal Processing Conference (EUSIPCO), Belgrad, n.d.","mla":"Heitkämper, Jens, et al. 
“Neural Network Based Carrier Frequency Offset Estimation From Speech Transmitted Over High Frequency Channels.” <i>Proceedings of the 30th European Signal Processing Conference (EUSIPCO)</i>.","bibtex":"@inproceedings{Heitkämper_Schmalenstroeer_Haeb-Umbach, place={Belgrad}, title={Neural Network Based Carrier Frequency Offset Estimation From Speech Transmitted Over High Frequency Channels}, booktitle={Proceedings of the 30th European Signal Processing Conference (EUSIPCO)}, author={Heitkämper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold} }","apa":"Heitkämper, J., Schmalenstroeer, J., &#38; Haeb-Umbach, R. (n.d.). Neural Network Based Carrier Frequency Offset Estimation From Speech Transmitted Over High Frequency Channels. <i>Proceedings of the 30th European Signal Processing Conference (EUSIPCO)</i>. 30th European Signal Processing Conference (EUSIPCO), Belgrad.","ama":"Heitkämper J, Schmalenstroeer J, Haeb-Umbach R. Neural Network Based Carrier Frequency Offset Estimation From Speech Transmitted Over High Frequency Channels. In: <i>Proceedings of the 30th European Signal Processing Conference (EUSIPCO)</i>.","ieee":"J. Heitkämper, J. Schmalenstroeer, and R. Haeb-Umbach, “Neural Network Based Carrier Frequency Offset Estimation From Speech Transmitted Over High Frequency Channels,” presented at the 30th European Signal Processing Conference (EUSIPCO), Belgrad.","chicago":"Heitkämper, Jens, Joerg Schmalenstroeer, and Reinhold Haeb-Umbach. “Neural Network Based Carrier Frequency Offset Estimation From Speech Transmitted Over High Frequency Channels.” In <i>Proceedings of the 30th European Signal Processing Conference (EUSIPCO)</i>. 
Belgrad, n.d."},"has_accepted_license":"1","quality_controlled":"1","publication_status":"accepted","title":"Neural Network Based Carrier Frequency Offset Estimation From Speech Transmitted Over High Frequency Channels","conference":{"start_date":"2022-08-29","name":"30th European Signal Processing Conference (EUSIPCO)","location":"Belgrad","end_date":"2022-09-02"},"date_updated":"2023-10-26T08:15:57Z","author":[{"last_name":"Heitkämper","id":"27643","full_name":"Heitkämper, Jens","first_name":"Jens"},{"id":"460","full_name":"Schmalenstroeer, Joerg","last_name":"Schmalenstroeer","first_name":"Joerg"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"date_created":"2022-09-22T10:56:13Z"},{"language":[{"iso":"eng"}],"ddc":["004"],"publication":"2022 International Workshop on Acoustic Signal Enhancement (IWAENC)","file":[{"relation":"main_file","content_type":"application/pdf","file_id":"48991","access_level":"open_access","file_name":"iwaenc_22_camera_ready_ieee_check.pdf","file_size":266475,"date_created":"2023-11-17T06:40:40Z","creator":"tgburrek","date_updated":"2023-11-17T06:40:40Z"}],"date_created":"2022-10-18T09:30:24Z","publisher":"IEEE","title":"Informed vs. 
Blind Beamforming in Ad-Hoc Acoustic Sensor Networks for Meeting Transcription","quality_controlled":"1","year":"2022","user_id":"44006","department":[{"_id":"54"}],"project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"33808","file_date_updated":"2023-11-17T06:40:40Z","type":"conference","status":"public","author":[{"full_name":"Gburrek, Tobias","id":"44006","last_name":"Gburrek","first_name":"Tobias"},{"full_name":"Schmalenstroeer, Joerg","id":"460","last_name":"Schmalenstroeer","first_name":"Joerg"},{"first_name":"Jens","id":"27643","full_name":"Heitkaemper, Jens","last_name":"Heitkaemper"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"}],"oa":"1","date_updated":"2023-11-17T06:40:58Z","doi":"10.1109/IWAENC53105.2022.9914772","conference":{"location":" Bamberg, Germany ","end_date":"2022-09-08","start_date":"2022-09-05","name":"17th International Workshop on Acoustic Signal Enhancement (IWAENC 2022)"},"has_accepted_license":"1","citation":{"apa":"Gburrek, T., Schmalenstroeer, J., Heitkaemper, J., &#38; Haeb-Umbach, R. (2022). Informed vs. Blind Beamforming in Ad-Hoc Acoustic Sensor Networks for Meeting Transcription. <i>2022 International Workshop on Acoustic Signal Enhancement (IWAENC)</i>. 17th International Workshop on Acoustic Signal Enhancement (IWAENC 2022),  Bamberg, Germany . <a href=\"https://doi.org/10.1109/IWAENC53105.2022.9914772\">https://doi.org/10.1109/IWAENC53105.2022.9914772</a>","mla":"Gburrek, Tobias, et al. “Informed vs. Blind Beamforming in Ad-Hoc Acoustic Sensor Networks for Meeting Transcription.” <i>2022 International Workshop on Acoustic Signal Enhancement (IWAENC)</i>, IEEE, 2022, doi:<a href=\"https://doi.org/10.1109/IWAENC53105.2022.9914772\">10.1109/IWAENC53105.2022.9914772</a>.","bibtex":"@inproceedings{Gburrek_Schmalenstroeer_Heitkaemper_Haeb-Umbach_2022, title={Informed vs. 
Blind Beamforming in Ad-Hoc Acoustic Sensor Networks for Meeting Transcription}, DOI={<a href=\"https://doi.org/10.1109/IWAENC53105.2022.9914772\">10.1109/IWAENC53105.2022.9914772</a>}, booktitle={2022 International Workshop on Acoustic Signal Enhancement (IWAENC)}, publisher={IEEE}, author={Gburrek, Tobias and Schmalenstroeer, Joerg and Heitkaemper, Jens and Haeb-Umbach, Reinhold}, year={2022} }","short":"T. Gburrek, J. Schmalenstroeer, J. Heitkaemper, R. Haeb-Umbach, in: 2022 International Workshop on Acoustic Signal Enhancement (IWAENC), IEEE, 2022.","ama":"Gburrek T, Schmalenstroeer J, Heitkaemper J, Haeb-Umbach R. Informed vs. Blind Beamforming in Ad-Hoc Acoustic Sensor Networks for Meeting Transcription. In: <i>2022 International Workshop on Acoustic Signal Enhancement (IWAENC)</i>. IEEE; 2022. doi:<a href=\"https://doi.org/10.1109/IWAENC53105.2022.9914772\">10.1109/IWAENC53105.2022.9914772</a>","ieee":"T. Gburrek, J. Schmalenstroeer, J. Heitkaemper, and R. Haeb-Umbach, “Informed vs. Blind Beamforming in Ad-Hoc Acoustic Sensor Networks for Meeting Transcription,” presented at the 17th International Workshop on Acoustic Signal Enhancement (IWAENC 2022),  Bamberg, Germany , 2022, doi: <a href=\"https://doi.org/10.1109/IWAENC53105.2022.9914772\">10.1109/IWAENC53105.2022.9914772</a>.","chicago":"Gburrek, Tobias, Joerg Schmalenstroeer, Jens Heitkaemper, and Reinhold Haeb-Umbach. “Informed vs. Blind Beamforming in Ad-Hoc Acoustic Sensor Networks for Meeting Transcription.” In <i>2022 International Workshop on Acoustic Signal Enhancement (IWAENC)</i>. IEEE, 2022. <a href=\"https://doi.org/10.1109/IWAENC53105.2022.9914772\">https://doi.org/10.1109/IWAENC53105.2022.9914772</a>."}},{"citation":{"short":"J. Heitkaemper, J. Schmalenstroeer, V. Ion, R. Haeb-Umbach, in: Speech Communication; 14th ITG-Symposium, 2021, pp. 
1–5.","bibtex":"@inproceedings{Heitkaemper_Schmalenstroeer_Ion_Haeb-Umbach_2021, title={A Database for Research on Detection and Enhancement of Speech Transmitted over HF links}, booktitle={Speech Communication; 14th ITG-Symposium}, author={Heitkaemper, Jens and Schmalenstroeer, Joerg and Ion, Valentin and Haeb-Umbach, Reinhold}, year={2021}, pages={1–5} }","mla":"Heitkaemper, Jens, et al. “A Database for Research on Detection and Enhancement of Speech Transmitted over HF Links.” <i>Speech Communication; 14th ITG-Symposium</i>, 2021, pp. 1–5.","apa":"Heitkaemper, J., Schmalenstroeer, J., Ion, V., &#38; Haeb-Umbach, R. (2021). A Database for Research on Detection and Enhancement of Speech Transmitted over HF links. <i>Speech Communication; 14th ITG-Symposium</i>, 1–5.","ama":"Heitkaemper J, Schmalenstroeer J, Ion V, Haeb-Umbach R. A Database for Research on Detection and Enhancement of Speech Transmitted over HF links. In: <i>Speech Communication; 14th ITG-Symposium</i>. ; 2021:1-5.","ieee":"J. Heitkaemper, J. Schmalenstroeer, V. Ion, and R. Haeb-Umbach, “A Database for Research on Detection and Enhancement of Speech Transmitted over HF links,” in <i>Speech Communication; 14th ITG-Symposium</i>, 2021, pp. 1–5.","chicago":"Heitkaemper, Jens, Joerg Schmalenstroeer, Valentin Ion, and Reinhold Haeb-Umbach. 
“A Database for Research on Detection and Enhancement of Speech Transmitted over HF Links.” In <i>Speech Communication; 14th ITG-Symposium</i>, 1–5, 2021."},"page":"1-5","year":"2021","quality_controlled":"1","title":"A Database for Research on Detection and Enhancement of Speech Transmitted over HF links","author":[{"last_name":"Heitkaemper","full_name":"Heitkaemper, Jens","id":"27643","first_name":"Jens"},{"id":"460","full_name":"Schmalenstroeer, Joerg","last_name":"Schmalenstroeer","first_name":"Joerg"},{"first_name":"Valentin","last_name":"Ion","full_name":"Ion, Valentin"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"date_created":"2021-09-09T08:41:25Z","date_updated":"2023-10-26T08:06:57Z","status":"public","type":"conference","publication":"Speech Communication; 14th ITG-Symposium","language":[{"iso":"eng"}],"user_id":"460","department":[{"_id":"54"}],"project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"24000"},{"type":"conference","publication":"29th European Signal Processing Conference (EUSIPCO)","status":"public","_id":"23998","user_id":"460","department":[{"_id":"54"}],"extern":"1","language":[{"iso":"eng"}],"year":"2021","citation":{"ama":"Schmalenstroeer J, Heitkaemper J, Ullmann J, Haeb-Umbach R. Open Range Pitch Tracking for Carrier Frequency Difference Estimation from HF Transmitted Speech. In: <i>29th European Signal Processing Conference (EUSIPCO)</i>. ; 2021:1-5.","chicago":"Schmalenstroeer, Joerg, Jens Heitkaemper, Joerg Ullmann, and Reinhold Haeb-Umbach. “Open Range Pitch Tracking for Carrier Frequency Difference Estimation from HF Transmitted Speech.” In <i>29th European Signal Processing Conference (EUSIPCO)</i>, 1–5, 2021.","ieee":"J. Schmalenstroeer, J. Heitkaemper, J. Ullmann, and R. 
Haeb-Umbach, “Open Range Pitch Tracking for Carrier Frequency Difference Estimation from HF Transmitted Speech,” in <i>29th European Signal Processing Conference (EUSIPCO)</i>, 2021, pp. 1–5.","bibtex":"@inproceedings{Schmalenstroeer_Heitkaemper_Ullmann_Haeb-Umbach_2021, title={Open Range Pitch Tracking for Carrier Frequency Difference Estimation from HF Transmitted Speech}, booktitle={29th European Signal Processing Conference (EUSIPCO)}, author={Schmalenstroeer, Joerg and Heitkaemper, Jens and Ullmann, Joerg and Haeb-Umbach, Reinhold}, year={2021}, pages={1–5} }","short":"J. Schmalenstroeer, J. Heitkaemper, J. Ullmann, R. Haeb-Umbach, in: 29th European Signal Processing Conference (EUSIPCO), 2021, pp. 1–5.","mla":"Schmalenstroeer, Joerg, et al. “Open Range Pitch Tracking for Carrier Frequency Difference Estimation from HF Transmitted Speech.” <i>29th European Signal Processing Conference (EUSIPCO)</i>, 2021, pp. 1–5.","apa":"Schmalenstroeer, J., Heitkaemper, J., Ullmann, J., &#38; Haeb-Umbach, R. (2021). Open Range Pitch Tracking for Carrier Frequency Difference Estimation from HF Transmitted Speech. 
<i>29th European Signal Processing Conference (EUSIPCO)</i>, 1–5."},"page":"1-5","date_updated":"2023-11-15T14:56:38Z","oa":"1","date_created":"2021-09-09T08:40:04Z","author":[{"first_name":"Joerg","full_name":"Schmalenstroeer, Joerg","id":"460","last_name":"Schmalenstroeer"},{"first_name":"Jens","last_name":"Heitkaemper","id":"27643","full_name":"Heitkaemper, Jens"},{"id":"16256","full_name":"Ullmann, Joerg","last_name":"Ullmann","first_name":"Joerg"},{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"title":"Open Range Pitch Tracking for Carrier Frequency Difference Estimation from HF Transmitted Speech","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2103.01599","open_access":"1"}]},{"date_created":"2020-12-11T12:49:13Z","author":[{"first_name":"Christoph","last_name":"Boeddeker","full_name":"Boeddeker, Christoph","id":"40767"},{"full_name":"Cord-Landwehr, Tobias","id":"44393","last_name":"Cord-Landwehr","first_name":"Tobias"},{"last_name":"Heitkaemper","full_name":"Heitkaemper, Jens","id":"27643","first_name":"Jens"},{"full_name":"Zorila, Catalin","last_name":"Zorila","first_name":"Catalin"},{"first_name":"Daichi","full_name":"Hayakawa, Daichi","last_name":"Hayakawa"},{"first_name":"Mohan","last_name":"Li","full_name":"Li, Mohan"},{"first_name":"Min","last_name":"Liu","full_name":"Liu, Min"},{"first_name":"Rama","full_name":"Doddipatla, Rama","last_name":"Doddipatla"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"oa":"1","date_updated":"2022-01-06T06:54:33Z","title":"Towards a speaker diarization system for the CHiME 2020 dinner party transcription","has_accepted_license":"1","citation":{"ama":"Boeddeker C, Cord-Landwehr T, Heitkaemper J, et al. Towards a speaker diarization system for the CHiME 2020 dinner party transcription. In: <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>. ; 2020.","ieee":"C. 
Boeddeker <i>et al.</i>, “Towards a speaker diarization system for the CHiME 2020 dinner party transcription,” in <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>, 2020.","chicago":"Boeddeker, Christoph, Tobias Cord-Landwehr, Jens Heitkaemper, Catalin Zorila, Daichi Hayakawa, Mohan Li, Min Liu, Rama Doddipatla, and Reinhold Haeb-Umbach. “Towards a Speaker Diarization System for the CHiME 2020 Dinner Party Transcription.” In <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>, 2020.","apa":"Boeddeker, C., Cord-Landwehr, T., Heitkaemper, J., Zorila, C., Hayakawa, D., Li, M., … Haeb-Umbach, R. (2020). Towards a speaker diarization system for the CHiME 2020 dinner party transcription. In <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>.","mla":"Boeddeker, Christoph, et al. “Towards a Speaker Diarization System for the CHiME 2020 Dinner Party Transcription.” <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>, 2020.","short":"C. Boeddeker, T. Cord-Landwehr, J. Heitkaemper, C. Zorila, D. Hayakawa, M. Li, M. Liu, R. Doddipatla, R. Haeb-Umbach, in: Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments, 2020.","bibtex":"@inproceedings{Boeddeker_Cord-Landwehr_Heitkaemper_Zorila_Hayakawa_Li_Liu_Doddipatla_Haeb-Umbach_2020, title={Towards a speaker diarization system for the CHiME 2020 dinner party transcription}, booktitle={Proc. 
CHiME 2020 Workshop on Speech Processing in Everyday Environments}, author={Boeddeker, Christoph and Cord-Landwehr, Tobias and Heitkaemper, Jens and Zorila, Catalin and Hayakawa, Daichi and Li, Mohan and Liu, Min and Doddipatla, Rama and Haeb-Umbach, Reinhold}, year={2020} }"},"year":"2020","department":[{"_id":"54"}],"user_id":"40767","_id":"20700","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"file_date_updated":"2020-12-11T12:48:48Z","language":[{"iso":"eng"}],"ddc":["000"],"publication":"Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments","type":"conference","status":"public","file":[{"relation":"main_file","content_type":"application/pdf","file_name":"template.pdf","file_id":"20702","access_level":"open_access","file_size":115421,"date_created":"2020-12-11T12:48:48Z","creator":"cbj","date_updated":"2020-12-11T12:48:48Z"}]},{"language":[{"iso":"eng"}],"user_id":"27643","_id":"20701","status":"public","abstract":[{"lang":"eng","text":"This paper describes Asteroid , the PyTorch -based audio source separation toolkit for researchers. Inspired by the most successful neural source separation systems, it provides all neural building blocks required to build such a system. To improve reproducibility, Kaldi-style recipes on common audio source separation datasets are also provided. This paper describes the software architecture of Asteroid and its most important features. By showing experimental results obtained with Asteroid ’s recipes, we show that our implementations are at least on par with most results reported in reference papers. 
The toolkit is publicly available at github.com/mpariente/asteroid."}],"publication":"Interspeech 2020","popular_science":"1","type":"conference","doi":"10.21437/interspeech.2020-1673","title":"Asteroid: The PyTorch-Based Audio Source Separation Toolkit for Researchers","date_created":"2020-12-11T12:46:16Z","author":[{"last_name":"Pariente","full_name":"Pariente, Manuel","first_name":"Manuel"},{"first_name":"Samuele","full_name":"Cornell, Samuele","last_name":"Cornell"},{"first_name":"Joris","last_name":"Cosentino","full_name":"Cosentino, Joris"},{"first_name":"Sunit","full_name":"Sivasankaran, Sunit","last_name":"Sivasankaran"},{"full_name":"Tzinis, Efthymios","last_name":"Tzinis","first_name":"Efthymios"},{"last_name":"Heitkaemper","id":"27643","full_name":"Heitkaemper, Jens","first_name":"Jens"},{"last_name":"Olvera","full_name":"Olvera, Michel","first_name":"Michel"},{"full_name":"Stöter, Fabian-Robert","last_name":"Stöter","first_name":"Fabian-Robert"},{"first_name":"Mathieu","last_name":"Hu","full_name":"Hu, Mathieu"},{"first_name":"Juan M.","full_name":"Martín-Doñas, Juan M.","last_name":"Martín-Doñas"},{"first_name":"David","last_name":"Ditter","full_name":"Ditter, David"},{"first_name":"Ariel","last_name":"Frank","full_name":"Frank, Ariel"},{"last_name":"Deleforge","full_name":"Deleforge, Antoine","first_name":"Antoine"},{"last_name":"Vincent","full_name":"Vincent, Emmanuel","first_name":"Emmanuel"}],"date_updated":"2022-01-06T06:54:33Z","citation":{"mla":"Pariente, Manuel, et al. 
“Asteroid: The PyTorch-Based Audio Source Separation Toolkit for Researchers.” <i>Interspeech 2020</i>, 2020, doi:<a href=\"https://doi.org/10.21437/interspeech.2020-1673\">10.21437/interspeech.2020-1673</a>.","bibtex":"@inproceedings{Pariente_Cornell_Cosentino_Sivasankaran_Tzinis_Heitkaemper_Olvera_Stöter_Hu_Martín-Doñas_et al._2020, title={Asteroid: The PyTorch-Based Audio Source Separation Toolkit for Researchers}, DOI={<a href=\"https://doi.org/10.21437/interspeech.2020-1673\">10.21437/interspeech.2020-1673</a>}, booktitle={Interspeech 2020}, author={Pariente, Manuel and Cornell, Samuele and Cosentino, Joris and Sivasankaran, Sunit and Tzinis, Efthymios and Heitkaemper, Jens and Olvera, Michel and Stöter, Fabian-Robert and Hu, Mathieu and Martín-Doñas, Juan M. and et al.}, year={2020} }","short":"M. Pariente, S. Cornell, J. Cosentino, S. Sivasankaran, E. Tzinis, J. Heitkaemper, M. Olvera, F.-R. Stöter, M. Hu, J.M. Martín-Doñas, D. Ditter, A. Frank, A. Deleforge, E. Vincent, in: Interspeech 2020, 2020.","apa":"Pariente, M., Cornell, S., Cosentino, J., Sivasankaran, S., Tzinis, E., Heitkaemper, J., … Vincent, E. (2020). Asteroid: The PyTorch-Based Audio Source Separation Toolkit for Researchers. In <i>Interspeech 2020</i>. <a href=\"https://doi.org/10.21437/interspeech.2020-1673\">https://doi.org/10.21437/interspeech.2020-1673</a>","chicago":"Pariente, Manuel, Samuele Cornell, Joris Cosentino, Sunit Sivasankaran, Efthymios Tzinis, Jens Heitkaemper, Michel Olvera, et al. “Asteroid: The PyTorch-Based Audio Source Separation Toolkit for Researchers.” In <i>Interspeech 2020</i>, 2020. <a href=\"https://doi.org/10.21437/interspeech.2020-1673\">https://doi.org/10.21437/interspeech.2020-1673</a>.","ieee":"M. Pariente <i>et al.</i>, “Asteroid: The PyTorch-Based Audio Source Separation Toolkit for Researchers,” in <i>Interspeech 2020</i>, 2020.","ama":"Pariente M, Cornell S, Cosentino J, et al. Asteroid: The PyTorch-Based Audio Source Separation Toolkit for Researchers. 
In: <i>Interspeech 2020</i>. ; 2020. doi:<a href=\"https://doi.org/10.21437/interspeech.2020-1673\">10.21437/interspeech.2020-1673</a>"},"year":"2020","publication_status":"published"},{"file":[{"file_name":"ms.pdf","file_id":"20699","access_level":"closed","file_size":3871374,"date_created":"2020-12-11T12:36:37Z","creator":"jensheit","date_updated":"2020-12-11T12:36:37Z","relation":"main_file","success":1,"content_type":"application/pdf"}],"abstract":[{"lang":"eng","text":"In recent years time domain speech separation has excelled over frequency domain separation in single channel scenarios and noise-free environments. In this paper we dissect the gains of the time-domain audio separation network (TasNet) approach by gradually replacing components of an utterance-level permutation invariant training (u-PIT) based separation system in the frequency domain until the TasNet system is reached, thus blending components of frequency domain approaches with those of time domain approaches. Some of the intermediate variants achieve comparable signal-to-distortion ratio (SDR) gains to TasNet, but retain the advantage of frequency domain processing: compatibility with classic signal processing tools such as frequency-domain beamforming and the human interpretability of the masks. Furthermore, we show that the scale invariant signal-to-distortion ratio (si-SDR) criterion used as loss function in TasNet is related to a logarithmic mean square error criterion and that it is this criterion which contributes most reliable to the performance advantage of TasNet. 
Finally, we critically assess which gains in a noise-free single channel environment generalize to more realistic reverberant conditions."}],"publication":"ICASSP 2020 Virtual Barcelona Spain","language":[{"iso":"eng"}],"ddc":["000"],"keyword":["voice activity detection","speech activity detection","neural network","statistical speech processing"],"year":"2020","quality_controlled":"1","title":"Demystifying TasNet: A Dissecting Approach","date_created":"2020-11-25T14:56:53Z","status":"public","type":"conference","file_date_updated":"2020-12-11T12:36:37Z","user_id":"40767","department":[{"_id":"54"}],"project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"20504","citation":{"ieee":"J. Heitkaemper, D. Jakobeit, C. Boeddeker, L. Drude, and R. Haeb-Umbach, “Demystifying TasNet: A Dissecting Approach,” 2020.","chicago":"Heitkaemper, Jens, Darius Jakobeit, Christoph Boeddeker, Lukas Drude, and Reinhold Haeb-Umbach. “Demystifying TasNet: A Dissecting Approach.” In <i>ICASSP 2020 Virtual Barcelona Spain</i>, 2020.","ama":"Heitkaemper J, Jakobeit D, Boeddeker C, Drude L, Haeb-Umbach R. Demystifying TasNet: A Dissecting Approach. In: <i>ICASSP 2020 Virtual Barcelona Spain</i>. ; 2020.","mla":"Heitkaemper, Jens, et al. “Demystifying TasNet: A Dissecting Approach.” <i>ICASSP 2020 Virtual Barcelona Spain</i>, 2020.","bibtex":"@inproceedings{Heitkaemper_Jakobeit_Boeddeker_Drude_Haeb-Umbach_2020, title={Demystifying TasNet: A Dissecting Approach}, booktitle={ICASSP 2020 Virtual Barcelona Spain}, author={Heitkaemper, Jens and Jakobeit, Darius and Boeddeker, Christoph and Drude, Lukas and Haeb-Umbach, Reinhold}, year={2020} }","short":"J. Heitkaemper, D. Jakobeit, C. Boeddeker, L. Drude, R. Haeb-Umbach, in: ICASSP 2020 Virtual Barcelona Spain, 2020.","apa":"Heitkaemper, J., Jakobeit, D., Boeddeker, C., Drude, L., &#38; Haeb-Umbach, R. (2020). Demystifying TasNet: A Dissecting Approach. 
<i>ICASSP 2020 Virtual Barcelona Spain</i>."},"has_accepted_license":"1","author":[{"first_name":"Jens","last_name":"Heitkaemper","id":"27643","full_name":"Heitkaemper, Jens"},{"first_name":"Darius","full_name":"Jakobeit, Darius","last_name":"Jakobeit"},{"id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker","first_name":"Christoph"},{"last_name":"Drude","full_name":"Drude, Lukas","first_name":"Lukas"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"date_updated":"2022-01-13T08:47:32Z"},{"title":"Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments","author":[{"full_name":"Heitkaemper, Jens","id":"27643","last_name":"Heitkaemper","first_name":"Jens"},{"first_name":"Joerg","last_name":"Schmalenstroeer","full_name":"Schmalenstroeer, Joerg","id":"460"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"date_created":"2020-11-25T15:03:19Z","date_updated":"2023-10-26T08:28:49Z","citation":{"bibtex":"@inproceedings{Heitkaemper_Schmalenstroeer_Haeb-Umbach_2020, title={Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments}, booktitle={INTERSPEECH 2020 Virtual Shanghai China}, author={Heitkaemper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}, year={2020} }","mla":"Heitkaemper, Jens, et al. “Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments.” <i>INTERSPEECH 2020 Virtual Shanghai China</i>, 2020.","short":"J. Heitkaemper, J. Schmalenstroeer, R. Haeb-Umbach, in: INTERSPEECH 2020 Virtual Shanghai China, 2020.","apa":"Heitkaemper, J., Schmalenstroeer, J., &#38; Haeb-Umbach, R. (2020). Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments. <i>INTERSPEECH 2020 Virtual Shanghai China</i>.","ama":"Heitkaemper J, Schmalenstroeer J, Haeb-Umbach R. 
Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments. In: <i>INTERSPEECH 2020 Virtual Shanghai China</i>. ; 2020.","ieee":"J. Heitkaemper, J. Schmalenstroeer, and R. Haeb-Umbach, “Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments,” 2020.","chicago":"Heitkaemper, Jens, Joerg Schmalenstroeer, and Reinhold Haeb-Umbach. “Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments.” In <i>INTERSPEECH 2020 Virtual Shanghai China</i>, 2020."},"year":"2020","has_accepted_license":"1","file_date_updated":"2020-12-11T12:33:04Z","language":[{"iso":"eng"}],"keyword":["voice activity detection","speech activity detection","neural network","statistical speech processing"],"ddc":["000"],"department":[{"_id":"54"}],"user_id":"460","_id":"20505","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"status":"public","file":[{"date_created":"2020-12-11T12:33:04Z","creator":"jensheit","date_updated":"2020-12-11T12:33:04Z","file_name":"ms.pdf","file_id":"20697","access_level":"closed","file_size":998706,"content_type":"application/pdf","relation":"main_file","success":1}],"abstract":[{"lang":"eng","text":"Speech activity detection (SAD), which often rests on the fact that the noise is \"more'' stationary than speech, is particularly challenging in non-stationary environments, because the time variance of the acoustic scene makes it difficult to discriminate  speech from noise. We propose two approaches to SAD, where one is based on statistical signal processing, while the other utilizes neural networks. 
The former employs sophisticated signal processing to track the noise and speech energies and is meant to support the case for a resource efficient, unsupervised signal processing approach.\r\nThe latter introduces a recurrent network layer that operates on short segments of the input speech to do temporal smoothing in the presence of non-stationary noise. The systems are tested on the Fearless Steps challenge database, which consists of the transmission data from the Apollo-11 space mission.\r\nThe statistical SAD  achieves comparable detection performance to earlier proposed neural network based SADs, while the neural network based approach leads to a decision cost function of 1.07% on the evaluation set of the 2020 Fearless Steps Challenge, which sets a new state of the art."}],"publication":"INTERSPEECH 2020 Virtual Shanghai China","type":"conference"},{"oa":"1","date_updated":"2022-01-06T06:54:04Z","author":[{"full_name":"Drude, Lukas","last_name":"Drude","first_name":"Lukas"},{"last_name":"Heitkaemper","full_name":"Heitkaemper, Jens","id":"27643","first_name":"Jens"},{"first_name":"Christoph","id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"date_created":"2020-09-16T07:59:46Z","title":"SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition","has_accepted_license":"1","year":"2019","citation":{"ieee":"L. Drude, J. Heitkaemper, C. Boeddeker, and R. Haeb-Umbach, “SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition,” <i>ArXiv e-prints</i>, 2019.","chicago":"Drude, Lukas, Jens Heitkaemper, Christoph Boeddeker, and Reinhold Haeb-Umbach. 
“SMS-WSJ: Database, Performance Measures, and Baseline Recipe for Multi-Channel Source Separation and Recognition.” <i>ArXiv E-Prints</i>, 2019.","ama":"Drude L, Heitkaemper J, Boeddeker C, Haeb-Umbach R. SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition. <i>ArXiv e-prints</i>. 2019.","short":"L. Drude, J. Heitkaemper, C. Boeddeker, R. Haeb-Umbach, ArXiv E-Prints (2019).","mla":"Drude, Lukas, et al. “SMS-WSJ: Database, Performance Measures, and Baseline Recipe for Multi-Channel Source Separation and Recognition.” <i>ArXiv E-Prints</i>, 2019.","bibtex":"@article{Drude_Heitkaemper_Boeddeker_Haeb-Umbach_2019, title={SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition}, journal={ArXiv e-prints}, author={Drude, Lukas and Heitkaemper, Jens and Boeddeker, Christoph and Haeb-Umbach, Reinhold}, year={2019} }","apa":"Drude, L., Heitkaemper, J., Boeddeker, C., &#38; Haeb-Umbach, R. (2019). SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition. <i>ArXiv E-Prints</i>."},"_id":"19446","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"40767","ddc":["000"],"language":[{"iso":"eng"}],"file_date_updated":"2020-12-11T12:22:31Z","publication":"ArXiv e-prints","type":"journal_article","abstract":[{"text":"We present a multi-channel database of overlapping speech for training, evaluation, and detailed analysis of source separation and extraction algorithms: SMS-WSJ -- Spatialized Multi-Speaker Wall Street Journal. It consists of artificially mixed speech taken from the WSJ database, but unlike earlier databases we consider all WSJ0+1 utterances and take care of strictly separating the speaker sets present in the training, validation and test sets. 
When spatializing the data we ensure a high degree of randomness w.r.t. room size, array center and rotation, as well as speaker position. Furthermore, this paper offers a critical assessment of recently proposed measures of source separation performance. Alongside the code to generate the database we provide a source separation baseline and a Kaldi recipe with competitive word error rates to provide common ground for evaluation.","lang":"eng"}],"status":"public","file":[{"date_created":"2020-09-16T08:00:56Z","creator":"huesera","date_updated":"2020-12-11T12:22:31Z","access_level":"open_access","file_id":"19448","file_name":"ArXiv_2019_Drude.pdf","file_size":288594,"content_type":"application/pdf","relation":"main_file"}]},{"title":"A Study on Online Source Extraction in the Presence of Changing Speaker Positions","date_updated":"2022-01-06T06:52:06Z","oa":"1","date_created":"2019-11-06T09:43:03Z","author":[{"last_name":"Heitkaemper","id":"27643","full_name":"Heitkaemper, Jens","first_name":"Jens"},{"last_name":"Feher","full_name":"Feher, Thomas","first_name":"Thomas"},{"full_name":"Freitag, Michael","last_name":"Freitag","first_name":"Michael"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"year":"2019","citation":{"apa":"Heitkaemper, J., Feher, T., Freitag, M., &#38; Haeb-Umbach, R. (2019). A Study on Online Source Extraction in the Presence of Changing Speaker Positions. In <i>International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia</i>.","mla":"Heitkaemper, Jens, et al. “A Study on Online Source Extraction in the Presence of Changing Speaker Positions.” <i>International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia</i>, 2019.","short":"J. Heitkaemper, T. Feher, M. Freitag, R. 
Haeb-Umbach, in: International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia, 2019.","bibtex":"@inproceedings{Heitkaemper_Feher_Freitag_Haeb-Umbach_2019, title={A Study on Online Source Extraction in the Presence of Changing Speaker Positions}, booktitle={International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia}, author={Heitkaemper, Jens and Feher, Thomas and Freitag, Michael and Haeb-Umbach, Reinhold}, year={2019} }","ama":"Heitkaemper J, Feher T, Freitag M, Haeb-Umbach R. A Study on Online Source Extraction in the Presence of Changing Speaker Positions. In: <i>International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia</i>. ; 2019.","chicago":"Heitkaemper, Jens, Thomas Feher, Michael Freitag, and Reinhold Haeb-Umbach. “A Study on Online Source Extraction in the Presence of Changing Speaker Positions.” In <i>International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia</i>, 2019.","ieee":"J. Heitkaemper, T. Feher, M. Freitag, and R. Haeb-Umbach, “A Study on Online Source Extraction in the Presence of Changing Speaker Positions,” in <i>International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia</i>, 2019."},"has_accepted_license":"1","ddc":["006"],"language":[{"iso":"eng"}],"file_date_updated":"2019-11-08T07:47:12Z","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"14822","user_id":"59789","department":[{"_id":"54"}],"abstract":[{"lang":"eng","text":"Multi-talker speech and moving speakers still pose a significant challenge to automatic speech recognition systems. Assuming an enrollment utterance of the target speaker is available, the so-called SpeakerBeam concept has been recently proposed to extract the target speaker from a speech mixture. 
If multi-channel input is available, spatial properties of the speaker can be exploited to support the source extraction. In this contribution we investigate different approaches to exploit such spatial information. In particular, we are interested in the question, how useful this information is if the target speaker changes his/her position. To this end, we present a SpeakerBeam-based source extraction network that is adapted to work on moving speakers by recursively updating the beamformer coefficients. Experimental results are presented on two data sets, one with artificially created room impulse responses, and one with real room impulse responses and noise recorded in a conference room. Interestingly, spatial features turn out to be advantageous even if the speaker position changes."}],"file":[{"relation":"main_file","content_type":"application/pdf","file_name":"SLSP_2019_Heitkaemper_Paper.pdf","file_id":"14823","access_level":"open_access","file_size":578595,"date_created":"2019-11-06T10:02:26Z","creator":"huesera","date_updated":"2019-11-08T07:47:12Z"}],"status":"public","type":"conference","publication":"International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia"},{"date_updated":"2022-01-06T06:52:07Z","oa":"1","date_created":"2019-11-06T10:04:49Z","author":[{"first_name":"Juan M.","full_name":"Martin-Donas, Juan M.","last_name":"Martin-Donas"},{"first_name":"Jens","last_name":"Heitkaemper","full_name":"Heitkaemper, Jens","id":"27643"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"},{"last_name":"Gomez","full_name":"Gomez, Angel M.","first_name":"Angel M."},{"first_name":"Antonio M.","full_name":"Peinado, Antonio M.","last_name":"Peinado"}],"title":"Multi-Channel Block-Online Source Extraction based on Utterance Adaptation","has_accepted_license":"1","year":"2019","citation":{"chicago":"Martin-Donas, Juan M., Jens Heitkaemper, Reinhold Haeb-Umbach, Angel M. 
Gomez, and Antonio M. Peinado. “Multi-Channel Block-Online Source Extraction Based on Utterance Adaptation.” In <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","ieee":"J. M. Martin-Donas, J. Heitkaemper, R. Haeb-Umbach, A. M. Gomez, and A. M. Peinado, “Multi-Channel Block-Online Source Extraction based on Utterance Adaptation,” in <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","ama":"Martin-Donas JM, Heitkaemper J, Haeb-Umbach R, Gomez AM, Peinado AM. Multi-Channel Block-Online Source Extraction based on Utterance Adaptation. In: <i>INTERSPEECH 2019, Graz, Austria</i>. ; 2019.","apa":"Martin-Donas, J. M., Heitkaemper, J., Haeb-Umbach, R., Gomez, A. M., &#38; Peinado, A. M. (2019). Multi-Channel Block-Online Source Extraction based on Utterance Adaptation. In <i>INTERSPEECH 2019, Graz, Austria</i>.","short":"J.M. Martin-Donas, J. Heitkaemper, R. Haeb-Umbach, A.M. Gomez, A.M. Peinado, in: INTERSPEECH 2019, Graz, Austria, 2019.","bibtex":"@inproceedings{Martin-Donas_Heitkaemper_Haeb-Umbach_Gomez_Peinado_2019, title={Multi-Channel Block-Online Source Extraction based on Utterance Adaptation}, booktitle={INTERSPEECH 2019, Graz, Austria}, author={Martin-Donas, Juan M. and Heitkaemper, Jens and Haeb-Umbach, Reinhold and Gomez, Angel M. and Peinado, Antonio M.}, year={2019} }","mla":"Martin-Donas, Juan M., et al. “Multi-Channel Block-Online Source Extraction Based on Utterance Adaptation.” <i>INTERSPEECH 2019, Graz, Austria</i>, 2019."},"_id":"14824","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"59789","ddc":["000"],"file_date_updated":"2019-11-08T07:46:37Z","language":[{"iso":"eng"}],"publication":"INTERSPEECH 2019, Graz, Austria","type":"conference","abstract":[{"text":"This paper deals with multi-channel speech recognition in scenarios with multiple speakers. 
Recently, the spectral characteristics of a target speaker, extracted from an adaptation utterance, have been used to guide a neural network mask estimator to focus on that speaker. In this work we present two variants of speakeraware neural networks, which exploit both spectral and spatial information to allow better discrimination between target and interfering speakers. Thus, we introduce either a spatial preprocessing prior to the mask estimation or a spatial plus spectral speaker characterization block whose output is directly fed into the neural mask estimator. The target speaker’s spectral and spatial signature is extracted from an adaptation utterance recorded at the beginning of a session. We further adapt the architecture for low-latency processing by means of block-online beamforming that recursively updates the signal statistics. Experimental results show that the additional spatial information clearly improves source extraction, in particular in the same-gender case, and that our proposal achieves state-of-the-art performance in terms of distortion reduction and recognition accuracy.","lang":"eng"}],"status":"public","file":[{"content_type":"application/pdf","relation":"main_file","creator":"huesera","date_created":"2019-11-06T10:07:15Z","date_updated":"2019-11-08T07:46:37Z","file_name":"INTERSPEECH_2019_Heitkaemper_Paper.pdf","access_level":"open_access","file_id":"14825","file_size":225689}]},{"file_date_updated":"2019-11-08T07:45:15Z","language":[{"iso":"eng"}],"ddc":["000"],"department":[{"_id":"54"}],"user_id":"59789","_id":"14826","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel 
Computing","_id":"52"}],"status":"public","file":[{"relation":"main_file","content_type":"application/pdf","file_id":"14827","access_level":"open_access","file_name":"INTERSPEECH_2019_Boeddeker_Paper.pdf","file_size":216202,"date_created":"2019-11-06T10:10:23Z","creator":"huesera","date_updated":"2019-11-08T07:45:15Z"}],"abstract":[{"lang":"eng","text":"In this paper, we present Hitachi and Paderborn University’s joint effort for automatic speech recognition (ASR) in a dinner party scenario. The main challenges of ASR systems for dinner party recordings obtained by multiple microphone arrays are (1) heavy speech overlaps, (2) severe noise and reverberation, (3) very natural conversational content, and possibly (4) insufficient training data. As an example of a dinner party scenario, we have chosen the data presented during the CHiME-5 speech recognition challenge, where the baseline ASR had a 73.3% word error rate (WER), and even the best performing system at the CHiME-5 challenge had a 46.1% WER. We extensively investigated a combination of the guided source separation-based speech enhancement technique and an already proposed strong ASR backend and found that a tight combination of these techniques provided substantial accuracy improvements. Our final system achieved WERs of 39.94% and 41.64% for the development and evaluation data, respectively, both of which are the best published results for the dataset. 
We also investigated with additional training data on the official small data in the CHiME-5 corpus to assess the intrinsic difficulty of this ASR task."}],"publication":"INTERSPEECH 2019, Graz, Austria","type":"conference","title":"Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR","author":[{"first_name":"Naoyuki","full_name":"Kanda, Naoyuki","last_name":"Kanda"},{"full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker","first_name":"Christoph"},{"last_name":"Heitkaemper","id":"27643","full_name":"Heitkaemper, Jens","first_name":"Jens"},{"last_name":"Fujita","full_name":"Fujita, Yusuke","first_name":"Yusuke"},{"last_name":"Horiguchi","full_name":"Horiguchi, Shota","first_name":"Shota"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_created":"2019-11-06T10:08:49Z","oa":"1","date_updated":"2022-01-06T06:52:07Z","citation":{"ieee":"N. Kanda, C. Boeddeker, J. Heitkaemper, Y. Fujita, S. Horiguchi, and R. Haeb-Umbach, “Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR,” in <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","chicago":"Kanda, Naoyuki, Christoph Boeddeker, Jens Heitkaemper, Yusuke Fujita, Shota Horiguchi, and Reinhold Haeb-Umbach. “Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR.” In <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","ama":"Kanda N, Boeddeker C, Heitkaemper J, Fujita Y, Horiguchi S, Haeb-Umbach R. Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR. In: <i>INTERSPEECH 2019, Graz, Austria</i>. ; 2019.","mla":"Kanda, Naoyuki, et al. 
“Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR.” <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","short":"N. Kanda, C. Boeddeker, J. Heitkaemper, Y. Fujita, S. Horiguchi, R. Haeb-Umbach, in: INTERSPEECH 2019, Graz, Austria, 2019.","bibtex":"@inproceedings{Kanda_Boeddeker_Heitkaemper_Fujita_Horiguchi_Haeb-Umbach_2019, title={Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR}, booktitle={INTERSPEECH 2019, Graz, Austria}, author={Kanda, Naoyuki and Boeddeker, Christoph and Heitkaemper, Jens and Fujita, Yusuke and Horiguchi, Shota and Haeb-Umbach, Reinhold}, year={2019} }","apa":"Kanda, N., Boeddeker, C., Heitkaemper, J., Fujita, Y., Horiguchi, S., &#38; Haeb-Umbach, R. (2019). Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR. In <i>INTERSPEECH 2019, Graz, Austria</i>."},"year":"2019","has_accepted_license":"1"},{"department":[{"_id":"54"}],"user_id":"44006","_id":"11837","language":[{"iso":"eng"}],"publication":"ITG 2018, Oldenburg, Germany","type":"conference","status":"public","abstract":[{"lang":"eng","text":"We present a block-online multi-channel front end for automatic speech recognition in noisy and reverberated environments. It is an online version of our earlier proposed neural network supported acoustic beamformer, whose coefficients are calculated from noise and speech spatial covariance matrices which are estimated utilizing a neural mask estimator. However, the sparsity of speech in the STFT domain causes problems for the initial beamformer coefficients estimation in some frequency bins due to lack of speech observations. We propose two methods to mitigate this issue. 
The first is to lower the frequency resolution of the STFT, which comes with the additional advantage of a reduced time window, thus lowering the latency introduced by block processing. The second approach is to smooth beamforming coefficients along the frequency axis, thus exploiting their high interfrequency correlation. With both approaches the gap between offline and block-online beamformer performance, as measured by the word error rate achieved by a downstream speech recognizer, is significantly reduced. Experiments are carried out on two corpora, representing noisy (CHiME-4) and noisy reverberant (voiceHome) environments."}],"date_created":"2019-07-12T05:29:13Z","author":[{"last_name":"Heitkaemper","full_name":"Heitkaemper, Jens","id":"27643","first_name":"Jens"},{"id":"9168","full_name":"Heymann, Jahn","last_name":"Heymann","first_name":"Jahn"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"oa":"1","date_updated":"2022-01-06T06:51:11Z","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/ITG_2018_Heitkaemper_Paper.pdf","open_access":"1"}],"title":"Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming","related_material":{"link":[{"relation":"supplementary_material","description":"Slides","url":"https://groups.uni-paderborn.de/nt/pubs/2018/ITG_2018_Heitkaemper_Slides.pdf"}]},"citation":{"apa":"Heitkaemper, J., Heymann, J., &#38; Haeb-Umbach, R. (2018). Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming. In <i>ITG 2018, Oldenburg, Germany</i>.","bibtex":"@inproceedings{Heitkaemper_Heymann_Haeb-Umbach_2018, title={Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming}, booktitle={ITG 2018, Oldenburg, Germany}, author={Heitkaemper, Jens and Heymann, Jahn and Haeb-Umbach, Reinhold}, year={2018} }","mla":"Heitkaemper, Jens, et al. 
“Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming.” <i>ITG 2018, Oldenburg, Germany</i>, 2018.","short":"J. Heitkaemper, J. Heymann, R. Haeb-Umbach, in: ITG 2018, Oldenburg, Germany, 2018.","ieee":"J. Heitkaemper, J. Heymann, and R. Haeb-Umbach, “Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming,” in <i>ITG 2018, Oldenburg, Germany</i>, 2018.","chicago":"Heitkaemper, Jens, Jahn Heymann, and Reinhold Haeb-Umbach. “Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming.” In <i>ITG 2018, Oldenburg, Germany</i>, 2018.","ama":"Heitkaemper J, Heymann J, Haeb-Umbach R. Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming. In: <i>ITG 2018, Oldenburg, Germany</i>. ; 2018."},"year":"2018"},{"title":"Front-End Processing for the CHiME-5 Dinner Party Scenario","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/INTERSPEECH_2018_Heitkaemper_Paper.pdf","open_access":"1"}],"date_updated":"2023-10-26T08:14:15Z","oa":"1","author":[{"full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker","first_name":"Christoph"},{"last_name":"Heitkaemper","id":"27643","full_name":"Heitkaemper, Jens","first_name":"Jens"},{"first_name":"Joerg","last_name":"Schmalenstroeer","id":"460","full_name":"Schmalenstroeer, Joerg"},{"first_name":"Lukas","id":"11213","full_name":"Drude, Lukas","last_name":"Drude"},{"full_name":"Heymann, Jahn","last_name":"Heymann","first_name":"Jahn"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"date_created":"2019-07-30T14:35:15Z","year":"2018","citation":{"apa":"Boeddeker, C., Heitkaemper, J., Schmalenstroeer, J., Drude, L., Heymann, J., &#38; Haeb-Umbach, R. (2018). Front-End Processing for the CHiME-5 Dinner Party Scenario. <i>Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India</i>.","short":"C. Boeddeker, J. Heitkaemper, J. 
Schmalenstroeer, L. Drude, J. Heymann, R. Haeb-Umbach, in: Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India, 2018.","bibtex":"@inproceedings{Boeddeker_Heitkaemper_Schmalenstroeer_Drude_Heymann_Haeb-Umbach_2018, title={Front-End Processing for the CHiME-5 Dinner Party Scenario}, booktitle={Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India}, author={Boeddeker, Christoph and Heitkaemper, Jens and Schmalenstroeer, Joerg and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold}, year={2018} }","mla":"Boeddeker, Christoph, et al. “Front-End Processing for the CHiME-5 Dinner Party Scenario.” <i>Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India</i>, 2018.","ama":"Boeddeker C, Heitkaemper J, Schmalenstroeer J, Drude L, Heymann J, Haeb-Umbach R. Front-End Processing for the CHiME-5 Dinner Party Scenario. In: <i>Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India</i>. ; 2018.","ieee":"C. Boeddeker, J. Heitkaemper, J. Schmalenstroeer, L. Drude, J. Heymann, and R. Haeb-Umbach, “Front-End Processing for the CHiME-5 Dinner Party Scenario,” 2018.","chicago":"Boeddeker, Christoph, Jens Heitkaemper, Joerg Schmalenstroeer, Lukas Drude, Jahn Heymann, and Reinhold Haeb-Umbach. “Front-End Processing for the CHiME-5 Dinner Party Scenario.” In <i>Proc. 
CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India</i>, 2018."},"quality_controlled":"1","related_material":{"link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/INTERSPEECH_2018_Heitkaemper_Poster.pdf","relation":"supplementary_material","description":"Poster"}]},"language":[{"iso":"eng"}],"project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"12899","user_id":"460","department":[{"_id":"54"}],"abstract":[{"text":"This contribution presents a speech enhancement system for the CHiME-5 Dinner Party Scenario. The front-end employs multi-channel linear time-variant filtering and achieves its gains without the use of a neural network. We present an adaptation of blind source separation techniques to the CHiME-5 database which we call Guided Source Separation (GSS). Using the baseline acoustic and language model, the combination of Weighted Prediction Error based dereverberation, guided source separation, and beamforming reduces the WER by 10.54% (relative) for the single array track and by 21.12% (relative) on the multiple array track.","lang":"eng"}],"status":"public","type":"conference","publication":"Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India"},{"type":"conference","publication":"Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India","status":"public","abstract":[{"text":"This paper describes the systems for the single-array track and the multiple-array track of the 5th CHiME Challenge. The final system is a combination of multiple systems, using Confusion Network Combination (CNC). The different systems presented here are utilizing different front-ends and training sets for a Bidirectional Long Short-Term Memory (BLSTM) Acoustic Model (AM). The front-end was replaced by enhancements provided by Paderborn University [1]. 
The back-end has been implemented using RASR [2] and RETURNN [3]. Additionally, a system combination including the hypothesis word graphs from the system of the submission [1] has been performed, which results in the final best system.","lang":"eng"}],"user_id":"460","department":[{"_id":"54"}],"_id":"11876","language":[{"iso":"eng"}],"quality_controlled":"1","citation":{"mla":"Kitza, Markus, et al. “The RWTH/UPB System Combination for the CHiME 2018 Workshop.” <i>Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India</i>, 2018.","bibtex":"@inproceedings{Kitza_Michel_Boeddeker_Heitkaemper_Menne_Schlüter_Ney_Schmalenstroeer_Drude_Heymann_et al._2018, title={The RWTH/UPB System Combination for the CHiME 2018 Workshop}, booktitle={Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India}, author={Kitza, Markus and Michel, Wilfried and Boeddeker, Christoph and Heitkaemper, Jens and Menne, Tobias and Schlüter, Ralf and Ney, Hermann and Schmalenstroeer, Joerg and Drude, Lukas and Heymann, Jahn and et al.}, year={2018} }","short":"M. Kitza, W. Michel, C. Boeddeker, J. Heitkaemper, T. Menne, R. Schlüter, H. Ney, J. Schmalenstroeer, L. Drude, J. Heymann, R. Haeb-Umbach, in: Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India, 2018.","apa":"Kitza, M., Michel, W., Boeddeker, C., Heitkaemper, J., Menne, T., Schlüter, R., Ney, H., Schmalenstroeer, J., Drude, L., Heymann, J., &#38; Haeb-Umbach, R. (2018). The RWTH/UPB System Combination for the CHiME 2018 Workshop. <i>Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India</i>.","ama":"Kitza M, Michel W, Boeddeker C, et al. The RWTH/UPB System Combination for the CHiME 2018 Workshop. In: <i>Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India</i>. ; 2018.","ieee":"M. 
Kitza <i>et al.</i>, “The RWTH/UPB System Combination for the CHiME 2018 Workshop,” 2018.","chicago":"Kitza, Markus, Wilfried Michel, Christoph Boeddeker, Jens Heitkaemper, Tobias Menne, Ralf Schlüter, Hermann Ney, et al. “The RWTH/UPB System Combination for the CHiME 2018 Workshop.” In <i>Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India</i>, 2018."},"year":"2018","date_created":"2019-07-12T05:29:58Z","author":[{"first_name":"Markus","full_name":"Kitza, Markus","last_name":"Kitza"},{"first_name":"Wilfried","full_name":"Michel, Wilfried","last_name":"Michel"},{"first_name":"Christoph","last_name":"Boeddeker","full_name":"Boeddeker, Christoph","id":"40767"},{"first_name":"Jens","last_name":"Heitkaemper","id":"27643","full_name":"Heitkaemper, Jens"},{"last_name":"Menne","full_name":"Menne, Tobias","first_name":"Tobias"},{"first_name":"Ralf","last_name":"Schlüter","full_name":"Schlüter, Ralf"},{"full_name":"Ney, Hermann","last_name":"Ney","first_name":"Hermann"},{"first_name":"Joerg","full_name":"Schmalenstroeer, Joerg","id":"460","last_name":"Schmalenstroeer"},{"full_name":"Drude, Lukas","id":"11213","last_name":"Drude","first_name":"Lukas"},{"first_name":"Jahn","last_name":"Heymann","full_name":"Heymann, Jahn","id":"9168"},{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"oa":"1","date_updated":"2023-10-26T08:12:14Z","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2018/INTERSPEECH_2018_Heitkaemper_RWTH_Paper.pdf"}],"title":"The RWTH/UPB System Combination for the CHiME 2018 Workshop"},{"type":"conference","publication":"ITG 2018, Oldenburg, Germany","abstract":[{"text":"Due to their distributed nature wireless acoustic sensor networks offer great potential for improved signal acquisition, processing and classification for applications such as monitoring and surveillance, home automation, or hands-free telecommunication. 
To reduce the communication demand with a central server and to raise the privacy level it is desirable to perform processing at node level. The limited processing and memory capabilities on a sensor node, however, stand in contrast to the compute and memory intensive deep learning algorithms used in modern speech and audio processing. In this work, we perform benchmarking of commonly used convolutional and recurrent neural network architectures on a Raspberry Pi based acoustic sensor node. We show that it is possible to run medium-sized neural network topologies used for speech enhancement and speech recognition in real time. For acoustic event recognition, where predictions in a lower temporal resolution are sufficient, it is even possible to run current state-of-the-art deep convolutional models with a real-time-factor of 0.11.","lang":"eng"}],"status":"public","_id":"11836","user_id":"460","department":[{"_id":"54"}],"language":[{"iso":"eng"}],"quality_controlled":"1","related_material":{"link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/ITG_2018_Ebbers_Poster.pdf","relation":"supplementary_material","description":"Poster"}]},"year":"2018","citation":{"bibtex":"@inproceedings{Ebbers_Heitkaemper_Schmalenstroeer_Haeb-Umbach_2018, title={Benchmarking Neural Network Architectures for Acoustic Sensor Networks}, booktitle={ITG 2018, Oldenburg, Germany}, author={Ebbers, Janek and Heitkaemper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}, year={2018} }","mla":"Ebbers, Janek, et al. “Benchmarking Neural Network Architectures for Acoustic Sensor Networks.” <i>ITG 2018, Oldenburg, Germany</i>, 2018.","short":"J. Ebbers, J. Heitkaemper, J. Schmalenstroeer, R. Haeb-Umbach, in: ITG 2018, Oldenburg, Germany, 2018.","apa":"Ebbers, J., Heitkaemper, J., Schmalenstroeer, J., &#38; Haeb-Umbach, R. (2018). Benchmarking Neural Network Architectures for Acoustic Sensor Networks. 
<i>ITG 2018, Oldenburg, Germany</i>.","ama":"Ebbers J, Heitkaemper J, Schmalenstroeer J, Haeb-Umbach R. Benchmarking Neural Network Architectures for Acoustic Sensor Networks. In: <i>ITG 2018, Oldenburg, Germany</i>. ; 2018.","ieee":"J. Ebbers, J. Heitkaemper, J. Schmalenstroeer, and R. Haeb-Umbach, “Benchmarking Neural Network Architectures for Acoustic Sensor Networks,” 2018.","chicago":"Ebbers, Janek, Jens Heitkaemper, Joerg Schmalenstroeer, and Reinhold Haeb-Umbach. “Benchmarking Neural Network Architectures for Acoustic Sensor Networks.” In <i>ITG 2018, Oldenburg, Germany</i>, 2018."},"date_updated":"2023-10-26T08:12:40Z","oa":"1","author":[{"first_name":"Janek","last_name":"Ebbers","full_name":"Ebbers, Janek","id":"34851"},{"first_name":"Jens","last_name":"Heitkaemper","id":"27643","full_name":"Heitkaemper, Jens"},{"last_name":"Schmalenstroeer","full_name":"Schmalenstroeer, Joerg","id":"460","first_name":"Joerg"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"}],"date_created":"2019-07-12T05:29:11Z","title":"Benchmarking Neural Network Architectures for Acoustic Sensor Networks","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2018/ITG_2018_Ebbers_Paper.pdf"}]},{"title":"A Priori SNR Estimation Using Weibull Mixture Model","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2016/ChHeiHa16.pdf"}],"date_updated":"2022-01-06T06:51:08Z","oa":"1","date_created":"2019-07-12T05:27:24Z","author":[{"first_name":"Aleksej","full_name":"Chinaev, Aleksej","last_name":"Chinaev"},{"first_name":"Jens","last_name":"Heitkaemper","id":"27643","full_name":"Heitkaemper, Jens"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"}],"year":"2016","citation":{"bibtex":"@inproceedings{Chinaev_Heitkaemper_Haeb-Umbach_2016, title={A Priori SNR Estimation Using Weibull Mixture Model}, booktitle={12. 
ITG Fachtagung Sprachkommunikation (ITG 2016)}, author={Chinaev, Aleksej and Heitkaemper, Jens and Haeb-Umbach, Reinhold}, year={2016} }","short":"A. Chinaev, J. Heitkaemper, R. Haeb-Umbach, in: 12. ITG Fachtagung Sprachkommunikation (ITG 2016), 2016.","mla":"Chinaev, Aleksej, et al. “A Priori SNR Estimation Using Weibull Mixture Model.” <i>12. ITG Fachtagung Sprachkommunikation (ITG 2016)</i>, 2016.","apa":"Chinaev, A., Heitkaemper, J., &#38; Haeb-Umbach, R. (2016). A Priori SNR Estimation Using Weibull Mixture Model. In <i>12. ITG Fachtagung Sprachkommunikation (ITG 2016)</i>.","ieee":"A. Chinaev, J. Heitkaemper, and R. Haeb-Umbach, “A Priori SNR Estimation Using Weibull Mixture Model,” in <i>12. ITG Fachtagung Sprachkommunikation (ITG 2016)</i>, 2016.","chicago":"Chinaev, Aleksej, Jens Heitkaemper, and Reinhold Haeb-Umbach. “A Priori SNR Estimation Using Weibull Mixture Model.” In <i>12. ITG Fachtagung Sprachkommunikation (ITG 2016)</i>, 2016.","ama":"Chinaev A, Heitkaemper J, Haeb-Umbach R. A Priori SNR Estimation Using Weibull Mixture Model. In: <i>12. ITG Fachtagung Sprachkommunikation (ITG 2016)</i>. ; 2016."},"related_material":{"link":[{"description":"Presentation","relation":"supplementary_material","url":"https://groups.uni-paderborn.de/nt/pubs/2016/ChHeiHa16_Presentation.pdf"}]},"language":[{"iso":"eng"}],"_id":"11743","user_id":"44006","department":[{"_id":"54"}],"abstract":[{"lang":"eng","text":"This contribution introduces a novel causal a priori signal-to-noise ratio (SNR) estimator for single-channel speech enhancement. To exploit the advantages of the generalized spectral subtraction, a normalized α-order magnitude (NAOM) domain is introduced where an a priori SNR estimation is carried out. In this domain, the NAOM coefficients of noise and clean speech signals are modeled by a Weibull distribution and a Weibull mixture model (WMM), respectively. 
While the parameters of the noise model are calculated from the noise power spectral density estimates, the speech WMM parameters are estimated from the noisy signal by applying a causal Expectation-Maximization algorithm. Further, a maximum a posteriori estimate of the a priori SNR is developed. The experiments in different noisy environments show the superiority of the proposed estimator compared to the well-known decision-directed approach in terms of estimation error, estimator variance and speech quality of the enhanced signals when used for speech enhancement."}],"status":"public","type":"conference","publication":"12. ITG Fachtagung Sprachkommunikation (ITG 2016)"}]
