[{"publication":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","type":"journal_article","status":"public","department":[{"_id":"54"}],"user_id":"40767","_id":"17598","language":[{"iso":"eng"}],"page":"1-1","citation":{"ama":"Nakatani T, Boeddeker C, Kinoshita K, Ikeshita R, Delcroix M, Haeb-Umbach R. Jointly optimal denoising, dereverberation, and source separation. <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>. Published online 2020:1-1. doi:<a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">10.1109/TASLP.2020.3013118</a>","ieee":"T. Nakatani, C. Boeddeker, K. Kinoshita, R. Ikeshita, M. Delcroix, and R. Haeb-Umbach, “Jointly optimal denoising, dereverberation, and source separation,” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, pp. 1–1, 2020, doi: <a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">10.1109/TASLP.2020.3013118</a>.","chicago":"Nakatani, Tomohiro, Christoph Boeddeker, Keisuke Kinoshita, Rintaro Ikeshita, Marc Delcroix, and Reinhold Haeb-Umbach. “Jointly Optimal Denoising, Dereverberation, and Source Separation.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, 2020, 1–1. <a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">https://doi.org/10.1109/TASLP.2020.3013118</a>.","bibtex":"@article{Nakatani_Boeddeker_Kinoshita_Ikeshita_Delcroix_Haeb-Umbach_2020, title={Jointly optimal denoising, dereverberation, and source separation}, DOI={<a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">10.1109/TASLP.2020.3013118</a>}, journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, author={Nakatani, Tomohiro and Boeddeker, Christoph and Kinoshita, Keisuke and Ikeshita, Rintaro and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2020}, pages={1–1} }","mla":"Nakatani, Tomohiro, et al. “Jointly Optimal Denoising, Dereverberation, and Source Separation.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, 2020, pp. 
1–1, doi:<a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">10.1109/TASLP.2020.3013118</a>.","short":"T. Nakatani, C. Boeddeker, K. Kinoshita, R. Ikeshita, M. Delcroix, R. Haeb-Umbach, IEEE/ACM Transactions on Audio, Speech, and Language Processing (2020) 1–1.","apa":"Nakatani, T., Boeddeker, C., Kinoshita, K., Ikeshita, R., Delcroix, M., &#38; Haeb-Umbach, R. (2020). Jointly optimal denoising, dereverberation, and source separation. <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, 1–1. <a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">https://doi.org/10.1109/TASLP.2020.3013118</a>"},"year":"2020","author":[{"full_name":"Nakatani, Tomohiro","last_name":"Nakatani","first_name":"Tomohiro"},{"first_name":"Christoph","last_name":"Boeddeker","id":"40767","full_name":"Boeddeker, Christoph"},{"first_name":"Keisuke","full_name":"Kinoshita, Keisuke","last_name":"Kinoshita"},{"first_name":"Rintaro","last_name":"Ikeshita","full_name":"Ikeshita, Rintaro"},{"full_name":"Delcroix, Marc","last_name":"Delcroix","first_name":"Marc"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"date_created":"2020-08-05T06:16:56Z","date_updated":"2022-12-05T12:34:01Z","oa":"1","doi":"10.1109/TASLP.2020.3013118","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2020/journal_2020_boeddeker.pdf"}],"title":"Jointly optimal denoising, dereverberation, and source separation"},{"date_created":"2020-11-25T14:56:53Z","title":"Demystifying TasNet: A Dissecting Approach","quality_controlled":"1","year":"2020","ddc":["000"],"keyword":["voice activity detection","speech activity detection","neural network","statistical speech processing"],"language":[{"iso":"eng"}],"publication":"ICASSP 2020 Virtual Barcelona Spain","abstract":[{"text":"In recent years time domain speech separation has excelled over frequency domain separation in single channel scenarios and noise-free environments. 
In this paper we dissect the gains of the time-domain audio separation network (TasNet) approach by gradually replacing components of an utterance-level permutation invariant training (u-PIT) based separation system in the frequency domain until the TasNet system is reached, thus blending components of frequency domain approaches with those of time domain approaches. Some of the intermediate variants achieve comparable signal-to-distortion ratio (SDR) gains to TasNet, but retain the advantage of frequency domain processing: compatibility with classic signal processing tools such as frequency-domain beamforming and the human interpretability of the masks. Furthermore, we show that the scale invariant signal-to-distortion ratio (si-SDR) criterion used as loss function in TasNet is related to a logarithmic mean square error criterion and that it is this criterion which contributes most reliable to the performance advantage of TasNet. Finally, we critically assess which gains in a noise-free single channel environment generalize to more realistic reverberant conditions.","lang":"eng"}],"file":[{"creator":"jensheit","date_created":"2020-12-11T12:36:37Z","date_updated":"2020-12-11T12:36:37Z","access_level":"closed","file_id":"20699","file_name":"ms.pdf","file_size":3871374,"content_type":"application/pdf","relation":"main_file","success":1}],"date_updated":"2022-01-13T08:47:32Z","author":[{"first_name":"Jens","id":"27643","full_name":"Heitkaemper, Jens","last_name":"Heitkaemper"},{"last_name":"Jakobeit","full_name":"Jakobeit, Darius","first_name":"Darius"},{"first_name":"Christoph","full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker"},{"last_name":"Drude","full_name":"Drude, Lukas","first_name":"Lukas"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"has_accepted_license":"1","citation":{"chicago":"Heitkaemper, Jens, Darius Jakobeit, Christoph Boeddeker, Lukas Drude, and Reinhold Haeb-Umbach. 
“Demystifying TasNet: A Dissecting Approach.” In <i>ICASSP 2020 Virtual Barcelona Spain</i>, 2020.","ieee":"J. Heitkaemper, D. Jakobeit, C. Boeddeker, L. Drude, and R. Haeb-Umbach, “Demystifying TasNet: A Dissecting Approach,” 2020.","ama":"Heitkaemper J, Jakobeit D, Boeddeker C, Drude L, Haeb-Umbach R. Demystifying TasNet: A Dissecting Approach. In: <i>ICASSP 2020 Virtual Barcelona Spain</i>. ; 2020.","bibtex":"@inproceedings{Heitkaemper_Jakobeit_Boeddeker_Drude_Haeb-Umbach_2020, title={Demystifying TasNet: A Dissecting Approach}, booktitle={ICASSP 2020 Virtual Barcelona Spain}, author={Heitkaemper, Jens and Jakobeit, Darius and Boeddeker, Christoph and Drude, Lukas and Haeb-Umbach, Reinhold}, year={2020} }","mla":"Heitkaemper, Jens, et al. “Demystifying TasNet: A Dissecting Approach.” <i>ICASSP 2020 Virtual Barcelona Spain</i>, 2020.","short":"J. Heitkaemper, D. Jakobeit, C. Boeddeker, L. Drude, R. Haeb-Umbach, in: ICASSP 2020 Virtual Barcelona Spain, 2020.","apa":"Heitkaemper, J., Jakobeit, D., Boeddeker, C., Drude, L., &#38; Haeb-Umbach, R. (2020). Demystifying TasNet: A Dissecting Approach. <i>ICASSP 2020 Virtual Barcelona Spain</i>."},"project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"20504","user_id":"40767","department":[{"_id":"54"}],"file_date_updated":"2020-12-11T12:36:37Z","type":"conference","status":"public"},{"title":"Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments","author":[{"full_name":"Heitkaemper, Jens","id":"27643","last_name":"Heitkaemper","first_name":"Jens"},{"first_name":"Joerg","full_name":"Schmalenstroeer, Joerg","id":"460","last_name":"Schmalenstroeer"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_created":"2020-11-25T15:03:19Z","date_updated":"2023-10-26T08:28:49Z","citation":{"short":"J. Heitkaemper, J. Schmalenstroeer, R. 
Haeb-Umbach, in: INTERSPEECH 2020 Virtual Shanghai China, 2020.","bibtex":"@inproceedings{Heitkaemper_Schmalenstroeer_Haeb-Umbach_2020, title={Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments}, booktitle={INTERSPEECH 2020 Virtual Shanghai China}, author={Heitkaemper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}, year={2020} }","mla":"Heitkaemper, Jens, et al. “Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments.” <i>INTERSPEECH 2020 Virtual Shanghai China</i>, 2020.","apa":"Heitkaemper, J., Schmalenstroeer, J., &#38; Haeb-Umbach, R. (2020). Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments. <i>INTERSPEECH 2020 Virtual Shanghai China</i>.","chicago":"Heitkaemper, Jens, Joerg Schmalenstroeer, and Reinhold Haeb-Umbach. “Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments.” In <i>INTERSPEECH 2020 Virtual Shanghai China</i>, 2020.","ieee":"J. Heitkaemper, J. Schmalenstroeer, and R. Haeb-Umbach, “Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments,” 2020.","ama":"Heitkaemper J, Schmalenstroeer J, Haeb-Umbach R. Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments. In: <i>INTERSPEECH 2020 Virtual Shanghai China</i>. 
; 2020."},"year":"2020","has_accepted_license":"1","file_date_updated":"2020-12-11T12:33:04Z","language":[{"iso":"eng"}],"keyword":["voice activity detection","speech activity detection","neural network","statistical speech processing"],"ddc":["000"],"department":[{"_id":"54"}],"user_id":"460","_id":"20505","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"status":"public","file":[{"file_name":"ms.pdf","access_level":"closed","file_id":"20697","file_size":998706,"creator":"jensheit","date_created":"2020-12-11T12:33:04Z","date_updated":"2020-12-11T12:33:04Z","relation":"main_file","success":1,"content_type":"application/pdf"}],"abstract":[{"lang":"eng","text":"Speech activity detection (SAD), which often rests on the fact that the noise is \"more'' stationary than speech, is particularly challenging in non-stationary environments, because the time variance of the acoustic scene makes it difficult to discriminate  speech from noise. We propose two approaches to SAD, where one is based on statistical signal processing, while the other utilizes neural networks. The former employs sophisticated signal processing to track the noise and speech energies and is meant to support the case for a resource efficient, unsupervised signal processing approach.\r\nThe latter introduces a recurrent network layer that operates on short segments of the input speech to do temporal smoothing in the presence of non-stationary noise. 
The systems are tested on the Fearless Steps challenge database, which consists of the transmission data from the Apollo-11 space mission.\r\nThe statistical SAD  achieves comparable detection performance to earlier proposed neural network based SADs, while the neural network based approach leads to a decision cost function of 1.07% on the evaluation set of the 2020 Fearless Steps Challenge, which sets a new state of the art."}],"publication":"INTERSPEECH 2020 Virtual Shanghai China","type":"conference"},{"title":"End-to-End Training of Time Domain Audio Separation and Recognition","date_created":"2020-12-16T14:07:54Z","year":"2020","quality_controlled":"1","language":[{"iso":"eng"}],"ddc":["000"],"file":[{"file_size":192529,"access_level":"open_access","file_id":"20763","file_name":"ICASSP_2020_vonNeumann_Paper.pdf","date_updated":"2020-12-16T14:09:48Z","date_created":"2020-12-16T14:09:48Z","creator":"huesera","relation":"main_file","content_type":"application/pdf"}],"abstract":[{"text":"The rising interest in single-channel multi-speaker speech separation sparked development of End-to-End (E2E) approaches to multispeaker speech recognition. However, up until now, state-of-theart neural network–based time domain source separation has not yet been combined with E2E speech recognition. We here demonstrate how to combine a separation module based on a Convolutional Time domain Audio Separation Network (Conv-TasNet) with an E2E speech recognizer and how to train such a model jointly by distributing it over multiple GPUs or by approximating truncated back-propagation for the convolutional front-end. To put this work into perspective and illustrate the complexity of the design space, we provide a compact overview of single-channel multi-speaker recognition systems. 
Our experiments show a word error rate of 11.0% on WSJ0-2mix and indicate that our joint time domain model can yield substantial improvements over cascade DNN-HMM and monolithic E2E frequency domain systems proposed so far.","lang":"eng"}],"publication":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","doi":"10.1109/ICASSP40776.2020.9053461","author":[{"orcid":"https://orcid.org/0000-0002-7717-8670","last_name":"von Neumann","full_name":"von Neumann, Thilo","id":"49870","first_name":"Thilo"},{"first_name":"Keisuke","full_name":"Kinoshita, Keisuke","last_name":"Kinoshita"},{"full_name":"Drude, Lukas","last_name":"Drude","first_name":"Lukas"},{"last_name":"Boeddeker","full_name":"Boeddeker, Christoph","id":"40767","first_name":"Christoph"},{"first_name":"Marc","full_name":"Delcroix, Marc","last_name":"Delcroix"},{"full_name":"Nakatani, Tomohiro","last_name":"Nakatani","first_name":"Tomohiro"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"date_updated":"2023-11-15T12:17:45Z","oa":"1","citation":{"ama":"von Neumann T, Kinoshita K, Drude L, et al. End-to-End Training of Time Domain Audio Separation and Recognition. In: <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. ; 2020:7004-7008. doi:<a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">10.1109/ICASSP40776.2020.9053461</a>","chicago":"Neumann, Thilo von, Keisuke Kinoshita, Lukas Drude, Christoph Boeddeker, Marc Delcroix, Tomohiro Nakatani, and Reinhold Haeb-Umbach. “End-to-End Training of Time Domain Audio Separation and Recognition.” In <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 7004–8, 2020. <a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">https://doi.org/10.1109/ICASSP40776.2020.9053461</a>.","ieee":"T. 
von Neumann <i>et al.</i>, “End-to-End Training of Time Domain Audio Separation and Recognition,” in <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2020, pp. 7004–7008, doi: <a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">10.1109/ICASSP40776.2020.9053461</a>.","mla":"von Neumann, Thilo, et al. “End-to-End Training of Time Domain Audio Separation and Recognition.” <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2020, pp. 7004–08, doi:<a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">10.1109/ICASSP40776.2020.9053461</a>.","bibtex":"@inproceedings{von Neumann_Kinoshita_Drude_Boeddeker_Delcroix_Nakatani_Haeb-Umbach_2020, title={End-to-End Training of Time Domain Audio Separation and Recognition}, DOI={<a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">10.1109/ICASSP40776.2020.9053461</a>}, booktitle={ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={von Neumann, Thilo and Kinoshita, Keisuke and Drude, Lukas and Boeddeker, Christoph and Delcroix, Marc and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}, year={2020}, pages={7004–7008} }","short":"T. von Neumann, K. Kinoshita, L. Drude, C. Boeddeker, M. Delcroix, T. Nakatani, R. Haeb-Umbach, in: ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2020, pp. 7004–7008.","apa":"von Neumann, T., Kinoshita, K., Drude, L., Boeddeker, C., Delcroix, M., Nakatani, T., &#38; Haeb-Umbach, R. (2020). End-to-End Training of Time Domain Audio Separation and Recognition. <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 7004–7008. 
<a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">https://doi.org/10.1109/ICASSP40776.2020.9053461</a>"},"page":"7004-7008","has_accepted_license":"1","file_date_updated":"2020-12-16T14:09:48Z","user_id":"49870","department":[{"_id":"54"}],"project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"20762","status":"public","type":"conference"},{"author":[{"first_name":"Thilo","id":"49870","full_name":"von Neumann, Thilo","last_name":"von Neumann","orcid":"https://orcid.org/0000-0002-7717-8670"},{"first_name":"Christoph","id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker"},{"first_name":"Lukas","last_name":"Drude","full_name":"Drude, Lukas"},{"first_name":"Keisuke","full_name":"Kinoshita, Keisuke","last_name":"Kinoshita"},{"first_name":"Marc","last_name":"Delcroix","full_name":"Delcroix, Marc"},{"last_name":"Nakatani","full_name":"Nakatani, Tomohiro","first_name":"Tomohiro"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"date_updated":"2023-11-15T12:17:57Z","oa":"1","doi":"10.21437/Interspeech.2020-2519","has_accepted_license":"1","citation":{"ama":"von Neumann T, Boeddeker C, Drude L, et al. Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR. In: <i>Proc. Interspeech 2020</i>. ; 2020:3097-3101. doi:<a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">10.21437/Interspeech.2020-2519</a>","ieee":"T. von Neumann <i>et al.</i>, “Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR,” in <i>Proc. Interspeech 2020</i>, 2020, pp. 3097–3101, doi: <a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">10.21437/Interspeech.2020-2519</a>.","chicago":"Neumann, Thilo von, Christoph Boeddeker, Lukas Drude, Keisuke Kinoshita, Marc Delcroix, Tomohiro Nakatani, and Reinhold Haeb-Umbach. 
“Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR.” In <i>Proc. Interspeech 2020</i>, 3097–3101, 2020. <a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">https://doi.org/10.21437/Interspeech.2020-2519</a>.","apa":"von Neumann, T., Boeddeker, C., Drude, L., Kinoshita, K., Delcroix, M., Nakatani, T., &#38; Haeb-Umbach, R. (2020). Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR. <i>Proc. Interspeech 2020</i>, 3097–3101. <a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">https://doi.org/10.21437/Interspeech.2020-2519</a>","bibtex":"@inproceedings{von Neumann_Boeddeker_Drude_Kinoshita_Delcroix_Nakatani_Haeb-Umbach_2020, title={Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR}, DOI={<a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">10.21437/Interspeech.2020-2519</a>}, booktitle={Proc. Interspeech 2020}, author={von Neumann, Thilo and Boeddeker, Christoph and Drude, Lukas and Kinoshita, Keisuke and Delcroix, Marc and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}, year={2020}, pages={3097–3101} }","short":"T. von Neumann, C. Boeddeker, L. Drude, K. Kinoshita, M. Delcroix, T. Nakatani, R. Haeb-Umbach, in: Proc. Interspeech 2020, 2020, pp. 3097–3101.","mla":"von Neumann, Thilo, et al. “Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR.” <i>Proc. Interspeech 2020</i>, 2020, pp. 
3097–101, doi:<a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">10.21437/Interspeech.2020-2519</a>."},"page":"3097-3101","user_id":"49870","department":[{"_id":"54"}],"project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"20764","file_date_updated":"2020-12-16T14:14:14Z","type":"conference","status":"public","date_created":"2020-12-16T14:12:45Z","title":"Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR","quality_controlled":"1","year":"2020","language":[{"iso":"eng"}],"ddc":["000"],"publication":"Proc. Interspeech 2020","file":[{"file_size":267893,"file_id":"20765","file_name":"INTERSPEECH_2020_vonNeumann_Paper.pdf","access_level":"open_access","date_updated":"2020-12-16T14:14:14Z","date_created":"2020-12-16T14:14:14Z","creator":"huesera","relation":"main_file","content_type":"application/pdf"}],"abstract":[{"lang":"eng","text":"Most approaches to multi-talker overlapped speech separation and recognition assume that the number of simultaneously active speakers is given, but in realistic situations, it is typically unknown. To cope with this, we extend an iterative speech extraction system with mechanisms to count the number of sources and combine it with a single-talker speech recognizer to form the first end-to-end multi-talker automatic speech recognition system for an unknown number of active speakers. Our experiments show very promising performance in counting accuracy, source separation and speech recognition on simulated clean mixtures from WSJ0-2mix and WSJ0-3mix. Among others, we set a new state-of-the-art word error rate on the WSJ0-2mix database. Furthermore, our system generalizes well to a larger number of speakers than it ever saw during training, as shown in experiments with the WSJ0-4mix database. "}]},{"citation":{"short":"T. Gburrek, J. Schmalenstroeer, A. Brendel, W. Kellermann, R. 
Haeb-Umbach, in: European Signal Processing Conference (EUSIPCO), 2020.","mla":"Gburrek, Tobias, et al. “Deep Neural Network Based Distance Estimation for Geometry Calibration in Acoustic Sensor Network.” <i>European Signal Processing Conference (EUSIPCO)</i>, 2020.","bibtex":"@inproceedings{Gburrek_Schmalenstroeer_Brendel_Kellermann_Haeb-Umbach_2020, title={Deep Neural Network based Distance Estimation for Geometry Calibration in Acoustic Sensor Network}, booktitle={European Signal Processing Conference (EUSIPCO)}, author={Gburrek, Tobias and Schmalenstroeer, Joerg and Brendel, Andreas and Kellermann, Walter and Haeb-Umbach, Reinhold}, year={2020} }","apa":"Gburrek, T., Schmalenstroeer, J., Brendel, A., Kellermann, W., &#38; Haeb-Umbach, R. (2020). Deep Neural Network based Distance Estimation for Geometry Calibration in Acoustic Sensor Network. <i>European Signal Processing Conference (EUSIPCO)</i>.","ieee":"T. Gburrek, J. Schmalenstroeer, A. Brendel, W. Kellermann, and R. Haeb-Umbach, “Deep Neural Network based Distance Estimation for Geometry Calibration in Acoustic Sensor Network,” 2020.","chicago":"Gburrek, Tobias, Joerg Schmalenstroeer, Andreas Brendel, Walter Kellermann, and Reinhold Haeb-Umbach. “Deep Neural Network Based Distance Estimation for Geometry Calibration in Acoustic Sensor Network.” In <i>European Signal Processing Conference (EUSIPCO)</i>, 2020.","ama":"Gburrek T, Schmalenstroeer J, Brendel A, Kellermann W, Haeb-Umbach R. Deep Neural Network based Distance Estimation for Geometry Calibration in Acoustic Sensor Network. In: <i>European Signal Processing Conference (EUSIPCO)</i>. 
; 2020."},"year":"2020","has_accepted_license":"1","quality_controlled":"1","title":"Deep Neural Network based Distance Estimation for Geometry Calibration in Acoustic Sensor Network","date_created":"2020-08-31T07:20:57Z","author":[{"first_name":"Tobias","id":"44006","full_name":"Gburrek, Tobias","last_name":"Gburrek"},{"first_name":"Joerg","id":"460","full_name":"Schmalenstroeer, Joerg","last_name":"Schmalenstroeer"},{"full_name":"Brendel, Andreas","last_name":"Brendel","first_name":"Andreas"},{"first_name":"Walter","full_name":"Kellermann, Walter","last_name":"Kellermann"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"}],"date_updated":"2023-11-17T06:23:39Z","oa":"1","file":[{"date_created":"2023-11-17T06:21:40Z","creator":"tgburrek","date_updated":"2023-11-17T06:21:40Z","file_id":"48987","access_level":"open_access","file_name":"Gburrek2020.pdf","file_size":292159,"content_type":"application/pdf","relation":"main_file"}],"status":"public","abstract":[{"lang":"eng","text":"We present an approach to deep neural network based (DNN-based) distance estimation in reverberant rooms for supporting geometry calibration tasks in wireless acoustic sensor networks. Signal diffuseness information from acoustic signals is aggregated via the coherent-to-diffuse power ratio to obtain a distance-related feature, which is mapped to a source-to-microphone distance estimate by means of a DNN. This information is then combined with direction-of-arrival estimates from compact microphone arrays to infer the geometry of the sensor network. Unlike many other approaches to geometry calibration, the proposed scheme does only require that the sampling clocks of the sensor nodes are roughly synchronized. In simulations we show that the proposed DNN-based distance estimator generalizes to unseen acoustic environments and that precise estimates of the sensor node positions are obtained. 
"}],"type":"conference","publication":"European Signal Processing Conference (EUSIPCO)","language":[{"iso":"eng"}],"file_date_updated":"2023-11-17T06:21:40Z","ddc":["004"],"user_id":"44006","department":[{"_id":"54"}],"_id":"18651"},{"publication":"Proc. Interspeech 2020","abstract":[{"lang":"eng","text":"Recently, the source separation performance was greatly improved by time-domain audio source separation based on dual-path recurrent neural network (DPRNN). DPRNN is a simple but effective model for a long sequential data. While DPRNN is quite efficient in modeling a sequential data of the length of an utterance, i.e., about 5 to 10 second data, it is harder to apply it to longer sequences such as whole conversations consisting of multiple utterances. It is simply because, in such a case, the number of time steps consumed by its internal module called inter-chunk RNN becomes extremely large. To mitigate this problem, this paper proposes a multi-path RNN (MPRNN), a generalized version of DPRNN, that models the input data in a hierarchical manner. In the MPRNN framework, the input data is represented at several (>_ 3) time-resolutions, each of which is modeled by a specific RNN sub-module. For example, the RNN sub-module that deals with the finest resolution may model temporal relationship only within a phoneme, while the RNN sub-module handling the most coarse resolution may capture only the relationship between utterances such as speaker information. 
We perform experiments using simulated dialogue-like mixtures and show that MPRNN has greater model capacity, and it outperforms the current state-of-the-art DPRNN framework especially in online processing scenarios."}],"file":[{"content_type":"application/pdf","relation":"main_file","date_created":"2020-12-16T14:16:32Z","creator":"huesera","date_updated":"2020-12-16T14:16:32Z","access_level":"open_access","file_id":"20767","file_name":"INTERSPEECH_2020_vonNeumann1_Paper.pdf","file_size":1725219}],"ddc":["000"],"language":[{"iso":"eng"}],"quality_controlled":"1","year":"2020","date_created":"2020-12-16T14:15:24Z","title":"Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and its Application to Speaker Stream Separation","type":"conference","status":"public","_id":"20766","user_id":"49870","department":[{"_id":"54"}],"file_date_updated":"2020-12-16T14:16:32Z","has_accepted_license":"1","citation":{"apa":"Kinoshita, K., von Neumann, T., Delcroix, M., Nakatani, T., &#38; Haeb-Umbach, R. (2020). Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and its Application to Speaker Stream Separation. <i>Proc. Interspeech 2020</i>, 2652–2656. <a href=\"https://doi.org/10.21437/Interspeech.2020-2388\">https://doi.org/10.21437/Interspeech.2020-2388</a>","mla":"Kinoshita, Keisuke, et al. “Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and Its Application to Speaker Stream Separation.” <i>Proc. Interspeech 2020</i>, 2020, pp. 2652–56, doi:<a href=\"https://doi.org/10.21437/Interspeech.2020-2388\">10.21437/Interspeech.2020-2388</a>.","short":"K. Kinoshita, T. von Neumann, M. Delcroix, T. Nakatani, R. Haeb-Umbach, in: Proc. Interspeech 2020, 2020, pp. 
2652–2656.","bibtex":"@inproceedings{Kinoshita_von Neumann_Delcroix_Nakatani_Haeb-Umbach_2020, title={Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and its Application to Speaker Stream Separation}, DOI={<a href=\"https://doi.org/10.21437/Interspeech.2020-2388\">10.21437/Interspeech.2020-2388</a>}, booktitle={Proc. Interspeech 2020}, author={Kinoshita, Keisuke and von Neumann, Thilo and Delcroix, Marc and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}, year={2020}, pages={2652–2656} }","chicago":"Kinoshita, Keisuke, Thilo von Neumann, Marc Delcroix, Tomohiro Nakatani, and Reinhold Haeb-Umbach. “Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and Its Application to Speaker Stream Separation.” In <i>Proc. Interspeech 2020</i>, 2652–56, 2020. <a href=\"https://doi.org/10.21437/Interspeech.2020-2388\">https://doi.org/10.21437/Interspeech.2020-2388</a>.","ieee":"K. Kinoshita, T. von Neumann, M. Delcroix, T. Nakatani, and R. Haeb-Umbach, “Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and its Application to Speaker Stream Separation,” in <i>Proc. Interspeech 2020</i>, 2020, pp. 2652–2656, doi: <a href=\"https://doi.org/10.21437/Interspeech.2020-2388\">10.21437/Interspeech.2020-2388</a>.","ama":"Kinoshita K, von Neumann T, Delcroix M, Nakatani T, Haeb-Umbach R. Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and its Application to Speaker Stream Separation. In: <i>Proc. Interspeech 2020</i>. ; 2020:2652-2656. 
doi:<a href=\"https://doi.org/10.21437/Interspeech.2020-2388\">10.21437/Interspeech.2020-2388</a>"},"page":"2652-2656","oa":"1","date_updated":"2023-11-15T12:14:25Z","author":[{"last_name":"Kinoshita","full_name":"Kinoshita, Keisuke","first_name":"Keisuke"},{"first_name":"Thilo","last_name":"von Neumann","orcid":"https://orcid.org/0000-0002-7717-8670","full_name":"von Neumann, Thilo","id":"49870"},{"first_name":"Marc","last_name":"Delcroix","full_name":"Delcroix, Marc"},{"first_name":"Tomohiro","last_name":"Nakatani","full_name":"Nakatani, Tomohiro"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"doi":"10.21437/Interspeech.2020-2388"},{"file_date_updated":"2020-12-16T08:57:22Z","user_id":"34851","department":[{"_id":"54"}],"project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"20753","status":"public","type":"conference","author":[{"last_name":"Ebbers","id":"34851","full_name":"Ebbers, Janek","first_name":"Janek"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_updated":"2023-11-22T08:27:32Z","oa":"1","citation":{"ieee":"J. Ebbers and R. Haeb-Umbach, “Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection,” 2020.","chicago":"Ebbers, Janek, and Reinhold Haeb-Umbach. “Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection.” In <i>Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)</i>, 2020.","ama":"Ebbers J, Haeb-Umbach R. Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection. 
In: <i>Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)</i>. ; 2020.","mla":"Ebbers, Janek, and Reinhold Haeb-Umbach. “Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection.” <i>Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)</i>, 2020.","short":"J. Ebbers, R. Haeb-Umbach, in: Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020), 2020.","bibtex":"@inproceedings{Ebbers_Haeb-Umbach_2020, title={Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection}, booktitle={Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)}, author={Ebbers, Janek and Haeb-Umbach, Reinhold}, year={2020} }","apa":"Ebbers, J., &#38; Haeb-Umbach, R. (2020). Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection. <i>Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)</i>."},"has_accepted_license":"1","language":[{"iso":"eng"}],"ddc":["000"],"file":[{"date_updated":"2020-12-16T08:57:22Z","date_created":"2020-12-16T08:57:22Z","creator":"huesera","file_size":108326,"file_name":"DCASE2020Workshop_Ebbers_Paper.pdf","file_id":"20754","access_level":"open_access","content_type":"application/pdf","relation":"main_file"}],"abstract":[{"text":"In this paper we present our system for the detection and classification of acoustic scenes and events (DCASE) 2020 Challenge Task 4: Sound event detection and separation in domestic environments. 
We introduce two new models: the forward-backward convolutional recurrent neural network (FBCRNN) and the tag-conditioned convolutional neural network (CNN). The FBCRNN employs two recurrent neural network (RNN) classifiers sharing the same CNN for preprocessing. With one RNN processing a recording in forward direction and the other in backward direction, the two networks are trained to jointly predict audio tags, i.e., weak labels, at each time step within a recording, given that at each time step they have jointly processed the whole recording. The proposed training encourages the classifiers to tag events as soon as possible. Therefore, after training, the networks can be applied to shorter audio segments of, e.g., 200ms, allowing sound event detection (SED). Further, we propose a tag-conditioned CNN to complement SED. It is trained to predict strong labels while using (predicted) tags, i.e., weak labels, as additional input. For training pseudo strong labels from a FBCRNN ensemble are used. The presented system scored the fourth and third place in the systems and teams rankings, respectively. Subsequent improvements allow our system to even outperform the challenge baseline and winner systems in average by, respectively, 18.0% and 2.2% event-based F1-score on the validation set. 
Source code is publicly available at https://github.com/fgnt/pb_sed.","lang":"eng"}],"publication":"Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)","title":"Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection","date_created":"2020-12-16T08:55:27Z","year":"2020","quality_controlled":"1"},{"oa":"1","date_updated":"2024-11-14T09:17:32Z","author":[{"first_name":"Christoph","last_name":"Boeddeker","id":"40767","full_name":"Boeddeker, Christoph"},{"first_name":"Tomohiro","full_name":"Nakatani, Tomohiro","last_name":"Nakatani"},{"first_name":"Keisuke","full_name":"Kinoshita, Keisuke","last_name":"Kinoshita"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"date_created":"2020-12-11T12:28:49Z","title":"Jointly Optimal Dereverberation and Beamforming","doi":"10.1109/icassp40776.2020.9054393","publication_identifier":{"isbn":["9781509066315"]},"has_accepted_license":"1","publication_status":"published","year":"2020","citation":{"mla":"Boeddeker, Christoph, et al. “Jointly Optimal Dereverberation and Beamforming.” <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2020, doi:<a href=\"https://doi.org/10.1109/icassp40776.2020.9054393\">10.1109/icassp40776.2020.9054393</a>.","short":"C. Boeddeker, T. Nakatani, K. Kinoshita, R. 
Haeb-Umbach, in: ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2020.","bibtex":"@inproceedings{Boeddeker_Nakatani_Kinoshita_Haeb-Umbach_2020, title={Jointly Optimal Dereverberation and Beamforming}, DOI={<a href=\"https://doi.org/10.1109/icassp40776.2020.9054393\">10.1109/icassp40776.2020.9054393</a>}, booktitle={ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={Boeddeker, Christoph and Nakatani, Tomohiro and Kinoshita, Keisuke and Haeb-Umbach, Reinhold}, year={2020} }","apa":"Boeddeker, C., Nakatani, T., Kinoshita, K., &#38; Haeb-Umbach, R. (2020). Jointly Optimal Dereverberation and Beamforming. <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. <a href=\"https://doi.org/10.1109/icassp40776.2020.9054393\">https://doi.org/10.1109/icassp40776.2020.9054393</a>","ieee":"C. Boeddeker, T. Nakatani, K. Kinoshita, and R. Haeb-Umbach, “Jointly Optimal Dereverberation and Beamforming,” 2020, doi: <a href=\"https://doi.org/10.1109/icassp40776.2020.9054393\">10.1109/icassp40776.2020.9054393</a>.","chicago":"Boeddeker, Christoph, Tomohiro Nakatani, Keisuke Kinoshita, and Reinhold Haeb-Umbach. “Jointly Optimal Dereverberation and Beamforming.” In <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2020. <a href=\"https://doi.org/10.1109/icassp40776.2020.9054393\">https://doi.org/10.1109/icassp40776.2020.9054393</a>.","ama":"Boeddeker C, Nakatani T, Kinoshita K, Haeb-Umbach R. Jointly Optimal Dereverberation and Beamforming. In: <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. ; 2020. 
doi:<a href=\"https://doi.org/10.1109/icassp40776.2020.9054393\">10.1109/icassp40776.2020.9054393</a>"},"_id":"20695","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"40767","ddc":["000"],"language":[{"iso":"eng"}],"file_date_updated":"2020-12-11T12:32:44Z","publication":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","type":"conference","status":"public","file":[{"relation":"main_file","content_type":"application/pdf","file_size":200127,"file_name":"convBF.pdf","access_level":"open_access","file_id":"20698","date_updated":"2020-12-11T12:32:44Z","date_created":"2020-12-11T12:32:44Z","creator":"cbj"}]},{"title":"Lektionen für Alexa \\& Co?!","doi":"10.1002/fors.201970104","date_updated":"2022-01-06T06:53:19Z","author":[{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"date_created":"2020-08-10T09:51:09Z","volume":44,"year":"2019","citation":{"ama":"Haeb-Umbach R. Lektionen für Alexa \\&#38; Co?! <i>forschung</i>. 2019;44(1):12-15. doi:<a href=\"https://doi.org/10.1002/fors.201970104\">10.1002/fors.201970104</a>","chicago":"Haeb-Umbach, Reinhold. “Lektionen Für Alexa \\&#38; Co?!” <i>Forschung</i> 44, no. 1 (2019): 12–15. <a href=\"https://doi.org/10.1002/fors.201970104\">https://doi.org/10.1002/fors.201970104</a>.","ieee":"R. Haeb-Umbach, “Lektionen für Alexa \\&#38; Co?!,” <i>forschung</i>, vol. 44, no. 1, pp. 12–15, 2019.","apa":"Haeb-Umbach, R. (2019). Lektionen für Alexa \\&#38; Co?! <i>Forschung</i>, <i>44</i>(1), 12–15. <a href=\"https://doi.org/10.1002/fors.201970104\">https://doi.org/10.1002/fors.201970104</a>","short":"R. Haeb-Umbach, Forschung 44 (2019) 12–15.","mla":"Haeb-Umbach, Reinhold. “Lektionen Für Alexa \\&#38; Co?!” <i>Forschung</i>, vol. 44, no. 1, 2019, pp. 
12–15, doi:<a href=\"https://doi.org/10.1002/fors.201970104\">10.1002/fors.201970104</a>.","bibtex":"@article{Haeb-Umbach_2019, title={Lektionen für Alexa \\&#38; Co?!}, volume={44}, DOI={<a href=\"https://doi.org/10.1002/fors.201970104\">10.1002/fors.201970104</a>}, number={1}, journal={forschung}, author={Haeb-Umbach, Reinhold}, year={2019}, pages={12–15} }"},"page":"12-15","intvolume":"        44","issue":"1","language":[{"iso":"eng"}],"_id":"17762","user_id":"44006","department":[{"_id":"54"}],"abstract":[{"lang":"eng","text":"Abstract Wenn akustische Signalverarbeitung mit automatisiertem Lernen verknüpft wird: Nachrichtentechniker arbeiten mit mehreren Mikrofonen und tiefen neuronalen Netzen an besserer Spracherkennung unter widrigsten Bedingungen. Von solchen Sensornetzwerken könnten langfristig auch digitale Sprachassistenten profitieren."}],"status":"public","type":"journal_article","publication":"forschung"},{"type":"journal_article","publication":"ArXiv e-prints","abstract":[{"text":"We present a multi-channel database of overlapping speech for training, evaluation, and detailed analysis of source separation and extraction algorithms: SMS-WSJ -- Spatialized Multi-Speaker Wall Street Journal. It consists of artificially mixed speech taken from the WSJ database, but unlike earlier databases we consider all WSJ0+1 utterances and take care of strictly separating the speaker sets present in the training, validation and test sets. When spatializing the data we ensure a high degree of randomness w.r.t. room size, array center and rotation, as well as speaker position. Furthermore, this paper offers a critical assessment of recently proposed measures of source separation performance. 
Alongside the code to generate the database we provide a source separation baseline and a Kaldi recipe with competitive word error rates to provide common ground for evaluation.","lang":"eng"}],"file":[{"access_level":"open_access","file_name":"ArXiv_2019_Drude.pdf","file_id":"19448","file_size":288594,"date_created":"2020-09-16T08:00:56Z","creator":"huesera","date_updated":"2020-12-11T12:22:31Z","relation":"main_file","content_type":"application/pdf"}],"status":"public","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"19446","user_id":"40767","department":[{"_id":"54"}],"ddc":["000"],"file_date_updated":"2020-12-11T12:22:31Z","language":[{"iso":"eng"}],"has_accepted_license":"1","year":"2019","citation":{"mla":"Drude, Lukas, et al. “SMS-WSJ: Database, Performance Measures, and Baseline Recipe for Multi-Channel Source Separation and Recognition.” <i>ArXiv E-Prints</i>, 2019.","short":"L. Drude, J. Heitkaemper, C. Boeddeker, R. Haeb-Umbach, ArXiv E-Prints (2019).","bibtex":"@article{Drude_Heitkaemper_Boeddeker_Haeb-Umbach_2019, title={SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition}, journal={ArXiv e-prints}, author={Drude, Lukas and Heitkaemper, Jens and Boeddeker, Christoph and Haeb-Umbach, Reinhold}, year={2019} }","apa":"Drude, L., Heitkaemper, J., Boeddeker, C., &#38; Haeb-Umbach, R. (2019). SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition. <i>ArXiv E-Prints</i>.","ama":"Drude L, Heitkaemper J, Boeddeker C, Haeb-Umbach R. SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition. <i>ArXiv e-prints</i>. 2019.","chicago":"Drude, Lukas, Jens Heitkaemper, Christoph Boeddeker, and Reinhold Haeb-Umbach. 
“SMS-WSJ: Database, Performance Measures, and Baseline Recipe for Multi-Channel Source Separation and Recognition.” <i>ArXiv E-Prints</i>, 2019.","ieee":"L. Drude, J. Heitkaemper, C. Boeddeker, and R. Haeb-Umbach, “SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition,” <i>ArXiv e-prints</i>, 2019."},"date_updated":"2022-01-06T06:54:04Z","oa":"1","date_created":"2020-09-16T07:59:46Z","author":[{"first_name":"Lukas","full_name":"Drude, Lukas","last_name":"Drude"},{"full_name":"Heitkaemper, Jens","id":"27643","last_name":"Heitkaemper","first_name":"Jens"},{"full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker","first_name":"Christoph"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"title":"SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition"},{"abstract":[{"text":"We present an unsupervised training approach for a neural network-based mask estimator in an acoustic beamforming application. The network is trained to maximize a likelihood criterion derived from a spatial mixture model of the observations. It is trained from scratch without requiring any parallel data consisting of degraded input and clean training targets. Thus, training can be carried out on real recordings of noisy speech rather than simulated ones. In contrast to previous work on unsupervised training of neural mask estimators, our approach avoids the need for a possibly pre-trained teacher model entirely. We demonstrate the effectiveness of our approach by speech recognition experiments on two different datasets: one mainly deteriorated by noise (CHiME 4) and one by reverberation (REVERB). 
The results show that the performance of the proposed system is on par with a supervised system using oracle target masks for training and with a system trained using a model-based teacher.","lang":"eng"}],"status":"public","file":[{"file_name":"INTERSPEECH_2019_Drude_Paper.pdf","access_level":"open_access","file_id":"12914","file_size":223413,"date_created":"2019-08-13T06:36:44Z","creator":"huesera","date_updated":"2019-08-13T06:41:35Z","relation":"main_file","content_type":"application/pdf"}],"publication":"INTERSPEECH 2019, Graz, Austria","type":"conference","ddc":["000"],"file_date_updated":"2019-08-13T06:41:35Z","language":[{"iso":"eng"}],"_id":"11965","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"59789","year":"2019","citation":{"chicago":"Drude, Lukas, Jahn Heymann, and Reinhold Haeb-Umbach. “Unsupervised Training of Neural Mask-Based Beamforming.” In <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","ieee":"L. Drude, J. Heymann, and R. Haeb-Umbach, “Unsupervised training of neural mask-based beamforming,” in <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","ama":"Drude L, Heymann J, Haeb-Umbach R. Unsupervised training of neural mask-based beamforming. In: <i>INTERSPEECH 2019, Graz, Austria</i>. ; 2019.","apa":"Drude, L., Heymann, J., &#38; Haeb-Umbach, R. (2019). Unsupervised training of neural mask-based beamforming. In <i>INTERSPEECH 2019, Graz, Austria</i>.","bibtex":"@inproceedings{Drude_Heymann_Haeb-Umbach_2019, title={Unsupervised training of neural mask-based beamforming}, booktitle={INTERSPEECH 2019, Graz, Austria}, author={Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold}, year={2019} }","mla":"Drude, Lukas, et al. “Unsupervised Training of Neural Mask-Based Beamforming.” <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","short":"L. Drude, J. Heymann, R. 
Haeb-Umbach, in: INTERSPEECH 2019, Graz, Austria, 2019."},"has_accepted_license":"1","title":"Unsupervised training of neural mask-based beamforming","oa":"1","date_updated":"2022-01-06T06:51:14Z","author":[{"full_name":"Drude, Lukas","id":"11213","last_name":"Drude","first_name":"Lukas"},{"last_name":"Heymann","full_name":"Heymann, Jahn","id":"9168","first_name":"Jahn"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"}],"date_created":"2019-07-18T09:11:39Z"},{"ddc":["000"],"file_date_updated":"2019-08-14T07:19:13Z","language":[{"iso":"eng"}],"_id":"12874","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"59789","abstract":[{"lang":"eng","text":"We propose a training scheme to train neural network-based source separation algorithms from scratch when parallel clean data is unavailable. In particular, we demonstrate that an unsupervised spatial clustering algorithm is sufficient to guide the training of a deep clustering system. We argue that previous work on deep clustering requires strong supervision and elaborate on why this is a limitation. 
We demonstrate that (a) the single-channel deep clustering system trained according to the proposed scheme alone is able to achieve a similar performance as the multi-channel teacher in terms of word error rates and (b) initializing the spatial clustering approach with the deep clustering result yields a relative word error rate reduction of 26% over the unsupervised teacher."}],"status":"public","file":[{"file_name":"ICASSP_2019_Drude_Paper.pdf","access_level":"open_access","file_id":"12925","file_size":368225,"creator":"huesera","date_created":"2019-08-14T07:19:13Z","date_updated":"2019-08-14T07:19:13Z","relation":"main_file","content_type":"application/pdf"}],"publication":"ICASSP 2019, Brighton, UK","type":"conference","title":"Unsupervised Training of a Deep Clustering Model for Multichannel Blind Source Separation","date_updated":"2022-01-06T06:51:21Z","oa":"1","author":[{"first_name":"Lukas","id":"11213","full_name":"Drude, Lukas","last_name":"Drude"},{"first_name":"Daniel","last_name":"Hasenklever","full_name":"Hasenklever, Daniel"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"date_created":"2019-07-23T07:37:54Z","year":"2019","citation":{"apa":"Drude, L., Hasenklever, D., &#38; Haeb-Umbach, R. (2019). Unsupervised Training of a Deep Clustering Model for Multichannel Blind Source Separation. In <i>ICASSP 2019, Brighton, UK</i>.","mla":"Drude, Lukas, et al. “Unsupervised Training of a Deep Clustering Model for Multichannel Blind Source Separation.” <i>ICASSP 2019, Brighton, UK</i>, 2019.","short":"L. Drude, D. Hasenklever, R. Haeb-Umbach, in: ICASSP 2019, Brighton, UK, 2019.","bibtex":"@inproceedings{Drude_Hasenklever_Haeb-Umbach_2019, title={Unsupervised Training of a Deep Clustering Model for Multichannel Blind Source Separation}, booktitle={ICASSP 2019, Brighton, UK}, author={Drude, Lukas and Hasenklever, Daniel and Haeb-Umbach, Reinhold}, year={2019} }","ama":"Drude L, Hasenklever D, Haeb-Umbach R. 
Unsupervised Training of a Deep Clustering Model for Multichannel Blind Source Separation. In: <i>ICASSP 2019, Brighton, UK</i>. ; 2019.","chicago":"Drude, Lukas, Daniel Hasenklever, and Reinhold Haeb-Umbach. “Unsupervised Training of a Deep Clustering Model for Multichannel Blind Source Separation.” In <i>ICASSP 2019, Brighton, UK</i>, 2019.","ieee":"L. Drude, D. Hasenklever, and R. Haeb-Umbach, “Unsupervised Training of a Deep Clustering Model for Multichannel Blind Source Separation,” in <i>ICASSP 2019, Brighton, UK</i>, 2019."},"has_accepted_license":"1"},{"has_accepted_license":"1","citation":{"ama":"Heymann J, Drude L, Haeb-Umbach R, Kinoshita K, Nakatani T. Joint Optimization of Neural Network-based WPE Dereverberation and Acoustic Model for Robust Online ASR. In: <i>ICASSP 2019, Brighton, UK</i>. ; 2019.","chicago":"Heymann, Jahn, Lukas Drude, Reinhold Haeb-Umbach, Keisuke Kinoshita, and Tomohiro Nakatani. “Joint Optimization of Neural Network-Based WPE Dereverberation and Acoustic Model for Robust Online ASR.” In <i>ICASSP 2019, Brighton, UK</i>, 2019.","ieee":"J. Heymann, L. Drude, R. Haeb-Umbach, K. Kinoshita, and T. Nakatani, “Joint Optimization of Neural Network-based WPE Dereverberation and Acoustic Model for Robust Online ASR,” in <i>ICASSP 2019, Brighton, UK</i>, 2019.","short":"J. Heymann, L. Drude, R. Haeb-Umbach, K. Kinoshita, T. Nakatani, in: ICASSP 2019, Brighton, UK, 2019.","mla":"Heymann, Jahn, et al. 
“Joint Optimization of Neural Network-Based WPE Dereverberation and Acoustic Model for Robust Online ASR.” <i>ICASSP 2019, Brighton, UK</i>, 2019.","bibtex":"@inproceedings{Heymann_Drude_Haeb-Umbach_Kinoshita_Nakatani_2019, title={Joint Optimization of Neural Network-based WPE Dereverberation and Acoustic Model for Robust Online ASR}, booktitle={ICASSP 2019, Brighton, UK}, author={Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold and Kinoshita, Keisuke and Nakatani, Tomohiro}, year={2019} }","apa":"Heymann, J., Drude, L., Haeb-Umbach, R., Kinoshita, K., &#38; Nakatani, T. (2019). Joint Optimization of Neural Network-based WPE Dereverberation and Acoustic Model for Robust Online ASR. In <i>ICASSP 2019, Brighton, UK</i>."},"year":"2019","date_created":"2019-07-23T07:42:26Z","author":[{"first_name":"Jahn","last_name":"Heymann","full_name":"Heymann, Jahn","id":"9168"},{"last_name":"Drude","full_name":"Drude, Lukas","id":"11213","first_name":"Lukas"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"},{"first_name":"Keisuke","full_name":"Kinoshita, Keisuke","last_name":"Kinoshita"},{"first_name":"Tomohiro","full_name":"Nakatani, Tomohiro","last_name":"Nakatani"}],"date_updated":"2022-01-06T06:51:22Z","oa":"1","title":"Joint Optimization of Neural Network-based WPE Dereverberation and Acoustic Model for Robust Online ASR","publication":"ICASSP 2019, Brighton, UK","type":"conference","status":"public","file":[{"date_created":"2019-12-17T07:28:06Z","creator":"huesera","date_updated":"2019-12-17T07:28:06Z","file_name":"ICASSP_2019_Heymann_Paper.pdf","file_id":"15334","access_level":"open_access","file_size":199109,"content_type":"application/pdf","relation":"main_file"}],"abstract":[{"lang":"eng","text":"Signal dereverberation using the Weighted Prediction Error (WPE) method has been proven to be an effective means to raise the accuracy of far-field speech recognition. 
First proposed as an iterative algorithm, follow-up works have reformulated it as a recursive least squares algorithm and therefore enabled its use in online applications. For this algorithm, the estimation of the power spectral density (PSD) of the anechoic signal plays an important role and strongly influences its performance. Recently, we showed that using a neural network PSD estimator leads to improved performance for online automatic speech recognition. This, however, comes at a price. To train the network, we require parallel data, i.e., utterances simultaneously available in clean and reverberated form. Here we propose to overcome this limitation by training the network jointly with the acoustic model of the speech recognizer. To be specific, the gradients computed from the cross-entropy loss between the target senone sequence and the acoustic model network output is backpropagated through the complex-valued dereverberation filter estimation to the neural network for PSD estimation. Evaluation on two databases demonstrates improved performance for on-line processing scenarios while imposing fewer requirements on the available training data and thus widening the range of applications."}],"department":[{"_id":"54"}],"user_id":"59789","_id":"12875","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"language":[{"iso":"eng"}],"file_date_updated":"2019-12-17T07:28:06Z","ddc":["000"]},{"publication":"Journal of Statistical Software 89(4)","type":"conference","status":"public","file":[{"content_type":"application/pdf","relation":"main_file","date_updated":"2019-08-14T07:16:05Z","date_created":"2019-08-14T07:16:05Z","creator":"huesera","file_size":1522964,"access_level":"open_access","file_name":"JournalofStatisticalSoftware_2019_Drude_Paper.pdf","file_id":"12923"}],"abstract":[{"text":"In this paper, we present libDirectional, a MATLAB library for directional statistics and directional estimation. 
It supports a variety of commonly used distributions on the unit circle, such as the von Mises, wrapped normal, and wrapped Cauchy distributions. Furthermore, various distributions on higher-dimensional manifolds such as the unit hypersphere and the hypertorus are available. Based on these distributions, several recursive filtering algorithms in libDirectional allow estimation on these manifolds. The functionality is implemented in a clear, well-documented, and object-oriented structure that is both easy to use and easy to extend.","lang":"eng"}],"department":[{"_id":"54"}],"user_id":"59789","_id":"12876","language":[{"iso":"eng"}],"file_date_updated":"2019-08-14T07:16:05Z","ddc":["000"],"has_accepted_license":"1","citation":{"ama":"Kurz G, Gilitschenski I, Pfaff F, et al. Directional Statistics and Filtering Using libDirectional. In: <i>Journal of Statistical Software 89(4)</i>. ; 2019.","chicago":"Kurz, Gerhard, Igor Gilitschenski, Florian Pfaff, Lukas Drude, Uwe D. Hanebeck, Reinhold Haeb-Umbach, and Roland Y. Siegwart. “Directional Statistics and Filtering Using LibDirectional.” In <i>Journal of Statistical Software 89(4)</i>, 2019.","ieee":"G. Kurz <i>et al.</i>, “Directional Statistics and Filtering Using libDirectional,” in <i>Journal of Statistical Software 89(4)</i>, 2019.","apa":"Kurz, G., Gilitschenski, I., Pfaff, F., Drude, L., Hanebeck, U. D., Haeb-Umbach, R., &#38; Siegwart, R. Y. (2019). Directional Statistics and Filtering Using libDirectional. In <i>Journal of Statistical Software 89(4)</i>.","short":"G. Kurz, I. Gilitschenski, F. Pfaff, L. Drude, U.D. Hanebeck, R. Haeb-Umbach, R.Y. Siegwart, in: Journal of Statistical Software 89(4), 2019.","bibtex":"@inproceedings{Kurz_Gilitschenski_Pfaff_Drude_Hanebeck_Haeb-Umbach_Siegwart_2019, title={Directional Statistics and Filtering Using libDirectional}, booktitle={Journal of Statistical Software 89(4)}, author={Kurz, Gerhard and Gilitschenski, Igor and Pfaff, Florian and Drude, Lukas and Hanebeck, Uwe D. 
and Haeb-Umbach, Reinhold and Siegwart, Roland Y.}, year={2019} }","mla":"Kurz, Gerhard, et al. “Directional Statistics and Filtering Using LibDirectional.” <i>Journal of Statistical Software 89(4)</i>, 2019."},"year":"2019","author":[{"first_name":"Gerhard","last_name":"Kurz","full_name":"Kurz, Gerhard"},{"last_name":"Gilitschenski","full_name":"Gilitschenski, Igor","first_name":"Igor"},{"last_name":"Pfaff","full_name":"Pfaff, Florian","first_name":"Florian"},{"last_name":"Drude","id":"11213","full_name":"Drude, Lukas","first_name":"Lukas"},{"full_name":"Hanebeck, Uwe D.","last_name":"Hanebeck","first_name":"Uwe D."},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"},{"last_name":"Siegwart","full_name":"Siegwart, Roland Y.","first_name":"Roland Y."}],"date_created":"2019-07-23T07:44:59Z","oa":"1","date_updated":"2022-01-06T06:51:22Z","title":"Directional Statistics and Filtering Using libDirectional"},{"date_created":"2019-07-26T08:38:46Z","title":"Integration of Neural Networks and Probabilistic Spatial Models for Acoustic Blind Source Separation","year":"2019","ddc":["050"],"language":[{"iso":"eng"}],"publication":"IEEE Journal of Selected Topics in Signal Processing","abstract":[{"lang":"eng","text":"We formulate a generic framework for blind source separation (BSS), which allows integrating data-driven spectro-temporal methods, such as deep clustering and deep attractor networks, with physically motivated probabilistic spatial methods, such as complex angular central Gaussian mixture models. The integrated model exploits the complementary strengths of the two approaches to BSS: the strong modeling power of neural networks, which, however, is based on supervised learning, and the ease of unsupervised learning of the spatial mixture models whose few parameters can be estimated on as little as a single segment of a real mixture of speech. 
Experiments are carried out on both artificially mixed speech and true recordings of speech mixtures. The experiments verify that the integrated models consistently outperform the individual components. We further extend the models to cope with noisy, reverberant speech and introduce a cross-domain teacher–student training where the mixture model serves as the teacher to provide training targets for the student neural network."}],"file":[{"date_created":"2019-08-07T07:12:21Z","creator":"huesera","date_updated":"2019-08-14T07:11:22Z","access_level":"open_access","file_name":"IEEE Jounal_2019_Drude_Paper.pdf","file_id":"12903","file_size":967424,"content_type":"application/pdf","relation":"main_file"}],"date_updated":"2022-01-06T06:51:23Z","oa":"1","author":[{"first_name":"Lukas","id":"11213","full_name":"Drude, Lukas","last_name":"Drude"},{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"doi":"10.1109/JSTSP.2019.2912565","publication_identifier":{"eissn":["1941-0484"]},"has_accepted_license":"1","citation":{"bibtex":"@article{Drude_Haeb-Umbach_2019, title={Integration of Neural Networks and Probabilistic Spatial Models for Acoustic Blind Source Separation}, DOI={<a href=\"https://doi.org/10.1109/JSTSP.2019.2912565\">10.1109/JSTSP.2019.2912565</a>}, journal={IEEE Journal of Selected Topics in Signal Processing}, author={Drude, Lukas and Haeb-Umbach, Reinhold}, year={2019} }","mla":"Drude, Lukas, and Reinhold Haeb-Umbach. “Integration of Neural Networks and Probabilistic Spatial Models for Acoustic Blind Source Separation.” <i>IEEE Journal of Selected Topics in Signal Processing</i>, 2019, doi:<a href=\"https://doi.org/10.1109/JSTSP.2019.2912565\">10.1109/JSTSP.2019.2912565</a>.","short":"L. Drude, R. Haeb-Umbach, IEEE Journal of Selected Topics in Signal Processing (2019).","apa":"Drude, L., &#38; Haeb-Umbach, R. (2019). 
Integration of Neural Networks and Probabilistic Spatial Models for Acoustic Blind Source Separation. <i>IEEE Journal of Selected Topics in Signal Processing</i>. <a href=\"https://doi.org/10.1109/JSTSP.2019.2912565\">https://doi.org/10.1109/JSTSP.2019.2912565</a>","ama":"Drude L, Haeb-Umbach R. Integration of Neural Networks and Probabilistic Spatial Models for Acoustic Blind Source Separation. <i>IEEE Journal of Selected Topics in Signal Processing</i>. 2019. doi:<a href=\"https://doi.org/10.1109/JSTSP.2019.2912565\">10.1109/JSTSP.2019.2912565</a>","chicago":"Drude, Lukas, and Reinhold Haeb-Umbach. “Integration of Neural Networks and Probabilistic Spatial Models for Acoustic Blind Source Separation.” <i>IEEE Journal of Selected Topics in Signal Processing</i>, 2019. <a href=\"https://doi.org/10.1109/JSTSP.2019.2912565\">https://doi.org/10.1109/JSTSP.2019.2912565</a>.","ieee":"L. Drude and R. Haeb-Umbach, “Integration of Neural Networks and Probabilistic Spatial Models for Acoustic Blind Source Separation,” <i>IEEE Journal of Selected Topics in Signal Processing</i>, 2019."},"project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"12890","user_id":"11213","department":[{"_id":"54"}],"file_date_updated":"2019-08-14T07:11:22Z","type":"journal_article","status":"public"},{"ddc":["000"],"language":[{"iso":"eng"}],"file_date_updated":"2020-02-06T07:42:55Z","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"15816","user_id":"59789","department":[{"_id":"54"}],"abstract":[{"lang":"eng","text":"Despite the strong modeling power of neural network acoustic models, speech enhancement has been shown to deliver additional word error rate improvements if multi-channel data is available. However, there has been a longstanding debate whether enhancement should also be carried out on the ASR training data. 
In an extensive experimental evaluation on the acoustically very challenging CHiME-5 dinner party data we show that: (i) cleaning up the training data can lead to substantial error rate reductions, and (ii) enhancement in training is advisable as long as enhancement in test is at least as strong as in training. This approach stands in contrast and delivers larger gains than the common strategy reported in the literature to augment the training database with additional artificially degraded speech. Together with an acoustic model topology consisting of initial CNN layers followed by factorized TDNN layers we achieve with 41.6% and 43.2% WER on the DEV and EVAL test sets, respectively, a new single-system state-of-the-art result on the CHiME-5 data. This is an 8% relative improvement compared to the best word error rate published so far for a speech recognizer without system combination."}],"file":[{"date_updated":"2020-02-06T07:42:42Z","date_created":"2020-02-06T07:42:42Z","creator":"huesera","file_size":200256,"file_id":"15817","access_level":"open_access","file_name":"ASRU_2019_Boeddeker_Paper.pdf","content_type":"application/pdf","relation":"main_file"},{"relation":"main_file","content_type":"application/pdf","file_name":"ASRU_2019_Boeddeker_Poster.pdf","access_level":"open_access","file_id":"15818","file_size":123963,"date_created":"2020-02-06T07:42:55Z","creator":"huesera","date_updated":"2020-02-06T07:42:55Z"}],"status":"public","type":"conference","publication":"ASRU 2019, Sentosa, Singapore","title":"An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription","oa":"1","date_updated":"2022-01-06T06:52:37Z","date_created":"2020-02-06T07:35:08Z","author":[{"last_name":"Zorila","full_name":"Zorila, Catalin","first_name":"Catalin"},{"last_name":"Boeddeker","full_name":"Boeddeker, Christoph","id":"40767","first_name":"Christoph"},{"full_name":"Doddipatla, 
Rama","last_name":"Doddipatla","first_name":"Rama"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"year":"2019","citation":{"bibtex":"@inproceedings{Zorila_Boeddeker_Doddipatla_Haeb-Umbach_2019, title={An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription}, booktitle={ASRU 2019, Sentosa, Singapore}, author={Zorila, Catalin and Boeddeker, Christoph and Doddipatla, Rama and Haeb-Umbach, Reinhold}, year={2019} }","short":"C. Zorila, C. Boeddeker, R. Doddipatla, R. Haeb-Umbach, in: ASRU 2019, Sentosa, Singapore, 2019.","mla":"Zorila, Catalin, et al. “An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription.” <i>ASRU 2019, Sentosa, Singapore</i>, 2019.","apa":"Zorila, C., Boeddeker, C., Doddipatla, R., &#38; Haeb-Umbach, R. (2019). An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription. In <i>ASRU 2019, Sentosa, Singapore</i>.","ama":"Zorila C, Boeddeker C, Doddipatla R, Haeb-Umbach R. An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription. In: <i>ASRU 2019, Sentosa, Singapore</i>. ; 2019.","ieee":"C. Zorila, C. Boeddeker, R. Doddipatla, and R. Haeb-Umbach, “An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription,” in <i>ASRU 2019, Sentosa, Singapore</i>, 2019.","chicago":"Zorila, Catalin, Christoph Boeddeker, Rama Doddipatla, and Reinhold Haeb-Umbach. 
“An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription.” In <i>ASRU 2019, Sentosa, Singapore</i>, 2019."},"has_accepted_license":"1"},{"author":[{"full_name":"Heitkaemper, Jens","id":"27643","last_name":"Heitkaemper","first_name":"Jens"},{"last_name":"Feher","full_name":"Feher, Thomas","first_name":"Thomas"},{"last_name":"Freitag","full_name":"Freitag, Michael","first_name":"Michael"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_created":"2019-11-06T09:43:03Z","oa":"1","date_updated":"2022-01-06T06:52:06Z","title":"A Study on Online Source Extraction in the Presence of Changing Speaker Positions","has_accepted_license":"1","citation":{"chicago":"Heitkaemper, Jens, Thomas Feher, Michael Freitag, and Reinhold Haeb-Umbach. “A Study on Online Source Extraction in the Presence of Changing Speaker Positions.” In <i>International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia</i>, 2019.","ieee":"J. Heitkaemper, T. Feher, M. Freitag, and R. Haeb-Umbach, “A Study on Online Source Extraction in the Presence of Changing Speaker Positions,” in <i>International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia</i>, 2019.","ama":"Heitkaemper J, Feher T, Freitag M, Haeb-Umbach R. A Study on Online Source Extraction in the Presence of Changing Speaker Positions. In: <i>International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia</i>. ; 2019.","apa":"Heitkaemper, J., Feher, T., Freitag, M., &#38; Haeb-Umbach, R. (2019). A Study on Online Source Extraction in the Presence of Changing Speaker Positions. 
In <i>International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia</i>.","bibtex":"@inproceedings{Heitkaemper_Feher_Freitag_Haeb-Umbach_2019, title={A Study on Online Source Extraction in the Presence of Changing Speaker Positions}, booktitle={International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia}, author={Heitkaemper, Jens and Feher, Thomas and Freitag, Michael and Haeb-Umbach, Reinhold}, year={2019} }","short":"J. Heitkaemper, T. Feher, M. Freitag, R. Haeb-Umbach, in: International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia, 2019.","mla":"Heitkaemper, Jens, et al. “A Study on Online Source Extraction in the Presence of Changing Speaker Positions.” <i>International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia</i>, 2019."},"year":"2019","department":[{"_id":"54"}],"user_id":"59789","_id":"14822","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"file_date_updated":"2019-11-08T07:47:12Z","language":[{"iso":"eng"}],"ddc":["006"],"publication":"International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia","type":"conference","status":"public","file":[{"file_size":578595,"file_id":"14823","access_level":"open_access","file_name":"SLSP_2019_Heitkaemper_Paper.pdf","date_updated":"2019-11-08T07:47:12Z","creator":"huesera","date_created":"2019-11-06T10:02:26Z","relation":"main_file","content_type":"application/pdf"}],"abstract":[{"lang":"eng","text":"Multi-talker speech and moving speakers still pose a significant challenge to automatic speech recognition systems. Assuming an enrollment utterance of the target speaker is available, the so-called SpeakerBeam concept has been recently proposed to extract the target speaker from a speech mixture. 
If multi-channel input is available, spatial properties of the speaker can be exploited to support the source extraction. In this contribution we investigate different approaches to exploit such spatial information. In particular, we are interested in the question, how useful this information is if the target speaker changes his/her position. To this end, we present a SpeakerBeam-based source extraction network that is adapted to work on moving speakers by recursively updating the beamformer coefficients. Experimental results are presented on two data sets, one with artificially created room impulse responses, and one with real room impulse responses and noise recorded in a conference room. Interestingly, spatial features turn out to be advantageous even if the speaker position changes."}]},{"status":"public","file":[{"content_type":"application/pdf","relation":"main_file","date_updated":"2019-11-08T07:46:37Z","date_created":"2019-11-06T10:07:15Z","creator":"huesera","file_size":225689,"access_level":"open_access","file_id":"14825","file_name":"INTERSPEECH_2019_Heitkaemper_Paper.pdf"}],"abstract":[{"lang":"eng","text":"This paper deals with multi-channel speech recognition in scenarios with multiple speakers. Recently, the spectral characteristics of a target speaker, extracted from an adaptation utterance, have been used to guide a neural network mask estimator to focus on that speaker. In this work we present two variants of speaker-aware neural networks, which exploit both spectral and spatial information to allow better discrimination between target and interfering speakers. Thus, we introduce either a spatial preprocessing prior to the mask estimation or a spatial plus spectral speaker characterization block whose output is directly fed into the neural mask estimator. The target speaker’s spectral and spatial signature is extracted from an adaptation utterance recorded at the beginning of a session. 
We further adapt the architecture for low-latency processing by means of block-online beamforming that recursively updates the signal statistics. Experimental results show that the additional spatial information clearly improves source extraction, in particular in the same-gender case, and that our proposal achieves state-of-the-art performance in terms of distortion reduction and recognition accuracy."}],"publication":"INTERSPEECH 2019, Graz, Austria","type":"conference","language":[{"iso":"eng"}],"file_date_updated":"2019-11-08T07:46:37Z","ddc":["000"],"department":[{"_id":"54"}],"user_id":"59789","_id":"14824","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"citation":{"apa":"Martin-Donas, J. M., Heitkaemper, J., Haeb-Umbach, R., Gomez, A. M., &#38; Peinado, A. M. (2019). Multi-Channel Block-Online Source Extraction based on Utterance Adaptation. In <i>INTERSPEECH 2019, Graz, Austria</i>.","short":"J.M. Martin-Donas, J. Heitkaemper, R. Haeb-Umbach, A.M. Gomez, A.M. Peinado, in: INTERSPEECH 2019, Graz, Austria, 2019.","bibtex":"@inproceedings{Martin-Donas_Heitkaemper_Haeb-Umbach_Gomez_Peinado_2019, title={Multi-Channel Block-Online Source Extraction based on Utterance Adaptation}, booktitle={INTERSPEECH 2019, Graz, Austria}, author={Martin-Donas, Juan M. and Heitkaemper, Jens and Haeb-Umbach, Reinhold and Gomez, Angel M. and Peinado, Antonio M.}, year={2019} }","mla":"Martin-Donas, Juan M., et al. “Multi-Channel Block-Online Source Extraction Based on Utterance Adaptation.” <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","ama":"Martin-Donas JM, Heitkaemper J, Haeb-Umbach R, Gomez AM, Peinado AM. Multi-Channel Block-Online Source Extraction based on Utterance Adaptation. In: <i>INTERSPEECH 2019, Graz, Austria</i>. ; 2019.","chicago":"Martin-Donas, Juan M., Jens Heitkaemper, Reinhold Haeb-Umbach, Angel M. Gomez, and Antonio M. Peinado. 
“Multi-Channel Block-Online Source Extraction Based on Utterance Adaptation.” In <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","ieee":"J. M. Martin-Donas, J. Heitkaemper, R. Haeb-Umbach, A. M. Gomez, and A. M. Peinado, “Multi-Channel Block-Online Source Extraction based on Utterance Adaptation,” in <i>INTERSPEECH 2019, Graz, Austria</i>, 2019."},"year":"2019","has_accepted_license":"1","title":"Multi-Channel Block-Online Source Extraction based on Utterance Adaptation","date_created":"2019-11-06T10:04:49Z","author":[{"first_name":"Juan M.","full_name":"Martin-Donas, Juan M.","last_name":"Martin-Donas"},{"first_name":"Jens","id":"27643","full_name":"Heitkaemper, Jens","last_name":"Heitkaemper"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"},{"full_name":"Gomez, Angel M.","last_name":"Gomez","first_name":"Angel M."},{"last_name":"Peinado","full_name":"Peinado, Antonio M.","first_name":"Antonio M."}],"date_updated":"2022-01-06T06:52:07Z","oa":"1"},{"citation":{"bibtex":"@inproceedings{Kanda_Boeddeker_Heitkaemper_Fujita_Horiguchi_Haeb-Umbach_2019, title={Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR}, booktitle={INTERSPEECH 2019, Graz, Austria}, author={Kanda, Naoyuki and Boeddeker, Christoph and Heitkaemper, Jens and Fujita, Yusuke and Horiguchi, Shota and Haeb-Umbach, Reinhold}, year={2019} }","short":"N. Kanda, C. Boeddeker, J. Heitkaemper, Y. Fujita, S. Horiguchi, R. Haeb-Umbach, in: INTERSPEECH 2019, Graz, Austria, 2019.","mla":"Kanda, Naoyuki, et al. “Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR.” <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","apa":"Kanda, N., Boeddeker, C., Heitkaemper, J., Fujita, Y., Horiguchi, S., &#38; Haeb-Umbach, R. (2019). 
Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR. In <i>INTERSPEECH 2019, Graz, Austria</i>.","chicago":"Kanda, Naoyuki, Christoph Boeddeker, Jens Heitkaemper, Yusuke Fujita, Shota Horiguchi, and Reinhold Haeb-Umbach. “Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR.” In <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","ieee":"N. Kanda, C. Boeddeker, J. Heitkaemper, Y. Fujita, S. Horiguchi, and R. Haeb-Umbach, “Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR,” in <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","ama":"Kanda N, Boeddeker C, Heitkaemper J, Fujita Y, Horiguchi S, Haeb-Umbach R. Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR. In: <i>INTERSPEECH 2019, Graz, Austria</i>. ; 2019."},"year":"2019","has_accepted_license":"1","title":"Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR","author":[{"last_name":"Kanda","full_name":"Kanda, Naoyuki","first_name":"Naoyuki"},{"last_name":"Boeddeker","id":"40767","full_name":"Boeddeker, Christoph","first_name":"Christoph"},{"first_name":"Jens","last_name":"Heitkaemper","id":"27643","full_name":"Heitkaemper, Jens"},{"full_name":"Fujita, Yusuke","last_name":"Fujita","first_name":"Yusuke"},{"first_name":"Shota","full_name":"Horiguchi, Shota","last_name":"Horiguchi"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, 
Reinhold","last_name":"Haeb-Umbach"}],"date_created":"2019-11-06T10:08:49Z","date_updated":"2022-01-06T06:52:07Z","oa":"1","status":"public","file":[{"file_id":"14827","access_level":"open_access","file_name":"INTERSPEECH_2019_Boeddeker_Paper.pdf","file_size":216202,"creator":"huesera","date_created":"2019-11-06T10:10:23Z","date_updated":"2019-11-08T07:45:15Z","relation":"main_file","content_type":"application/pdf"}],"abstract":[{"text":"In this paper, we present Hitachi and Paderborn University’s joint effort for automatic speech recognition (ASR) in a dinner party scenario. The main challenges of ASR systems for dinner party recordings obtained by multiple microphone arrays are (1) heavy speech overlaps, (2) severe noise and reverberation, (3) very natural conversational content, and possibly (4) insufficient training data. As an example of a dinner party scenario, we have chosen the data presented during the CHiME-5 speech recognition challenge, where the baseline ASR had a 73.3% word error rate (WER), and even the best performing system at the CHiME-5 challenge had a 46.1% WER. We extensively investigated a combination of the guided source separation-based speech enhancement technique and an already proposed strong ASR backend and found that a tight combination of these techniques provided substantial accuracy improvements. Our final system achieved WERs of 39.94% and 41.64% for the development and evaluation data, respectively, both of which are the best published results for the dataset. 
We also investigated with additional training data on the official small data in the CHiME-5 corpus to assess the intrinsic difficulty of this ASR task.","lang":"eng"}],"publication":"INTERSPEECH 2019, Graz, Austria","type":"conference","language":[{"iso":"eng"}],"file_date_updated":"2019-11-08T07:45:15Z","ddc":["000"],"department":[{"_id":"54"}],"user_id":"59789","_id":"14826","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}]}]
