[{"status":"public","abstract":[{"text":"Due to the ad hoc nature of wireless acoustic sensor networks, the position of the sensor nodes is typically unknown. This contribution proposes a technique to estimate the position and orientation of the sensor nodes from the recorded speech signals. The method assumes that a node comprises a microphone array with synchronously sampled microphones rather than a single microphone, but does not require the sampling clocks of the nodes to be synchronized. From the observed audio signals, the distances between the acoustic sources and arrays, as well as the directions of arrival, are estimated. They serve as input to a non-linear least squares problem, from which both the sensor nodes’ positions and orientations, as well as the source positions, are alternatingly estimated in an iterative process. Given one set of unknowns, i.e., either the source positions or the sensor nodes’ geometry, the other set of unknowns can be computed in closed-form. The proposed approach is computationally efficient and the first one, which employs both distance and directional information for geometry calibration in a common cost function. Since both distance and direction of arrival measurements suffer from outliers, e.g., caused by strong reflections of the sound waves on the surfaces of the room, we introduce measures to deemphasize or remove unreliable measurements. Additionally, we discuss modifications of our previously proposed deep neural network-based acoustic distance estimator, to account not only for omnidirectional sources but also for directional sources. Simulation results show good positioning accuracy and compare very favorably with alternative approaches from the literature.","lang":"eng"}],"publication":"EURASIP Journal on Audio, Speech, and Music Processing","type":"journal_article","language":[{"iso":"eng"}],"department":[{"_id":"54"}],"user_id":"44006","_id":"22528","citation":{"chicago":"Gburrek, Tobias, Joerg Schmalenstroeer, and Reinhold Haeb-Umbach. “Geometry Calibration in Wireless Acoustic Sensor Networks Utilizing DoA and Distance Information.” <i>EURASIP Journal on Audio, Speech, and Music Processing</i>, 2021. <a href=\"https://doi.org/10.1186/s13636-021-00210-x\">https://doi.org/10.1186/s13636-021-00210-x</a>.","ieee":"T. Gburrek, J. Schmalenstroeer, and R. Haeb-Umbach, “Geometry calibration in wireless acoustic sensor networks utilizing DoA and distance information,” <i>EURASIP Journal on Audio, Speech, and Music Processing</i>, 2021, doi: <a href=\"https://doi.org/10.1186/s13636-021-00210-x\">10.1186/s13636-021-00210-x</a>.","ama":"Gburrek T, Schmalenstroeer J, Haeb-Umbach R. Geometry calibration in wireless acoustic sensor networks utilizing DoA and distance information. <i>EURASIP Journal on Audio, Speech, and Music Processing</i>. Published online 2021. doi:<a href=\"https://doi.org/10.1186/s13636-021-00210-x\">10.1186/s13636-021-00210-x</a>","apa":"Gburrek, T., Schmalenstroeer, J., &#38; Haeb-Umbach, R. (2021). Geometry calibration in wireless acoustic sensor networks utilizing DoA and distance information. <i>EURASIP Journal on Audio, Speech, and Music Processing</i>. <a href=\"https://doi.org/10.1186/s13636-021-00210-x\">https://doi.org/10.1186/s13636-021-00210-x</a>","short":"T. Gburrek, J. Schmalenstroeer, R. Haeb-Umbach, EURASIP Journal on Audio, Speech, and Music Processing (2021).","bibtex":"@article{Gburrek_Schmalenstroeer_Haeb-Umbach_2021, title={Geometry calibration in wireless acoustic sensor networks utilizing DoA and distance information}, DOI={<a href=\"https://doi.org/10.1186/s13636-021-00210-x\">10.1186/s13636-021-00210-x</a>}, journal={EURASIP Journal on Audio, Speech, and Music Processing}, author={Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}, year={2021} }","mla":"Gburrek, Tobias, et al. “Geometry Calibration in Wireless Acoustic Sensor Networks Utilizing DoA and Distance Information.” <i>EURASIP Journal on Audio, Speech, and Music Processing</i>, 2021, doi:<a href=\"https://doi.org/10.1186/s13636-021-00210-x\">10.1186/s13636-021-00210-x</a>."},"year":"2021","publication_identifier":{"issn":["1687-4722"]},"quality_controlled":"1","publication_status":"published","doi":"10.1186/s13636-021-00210-x","main_file_link":[{"open_access":"1","url":"https://asmp-eurasipjournals.springeropen.com/articles/10.1186/s13636-021-00210-x"}],"title":"Geometry calibration in wireless acoustic sensor networks utilizing DoA and distance information","author":[{"first_name":"Tobias","full_name":"Gburrek, Tobias","id":"44006","last_name":"Gburrek"},{"first_name":"Joerg","last_name":"Schmalenstroeer","id":"460","full_name":"Schmalenstroeer, Joerg"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"}],"date_created":"2021-07-05T05:30:15Z","oa":"1","date_updated":"2023-11-17T06:36:17Z"},{"ddc":["004"],"language":[{"iso":"eng"}],"file_date_updated":"2023-11-17T06:30:11Z","_id":"23994","user_id":"44006","department":[{"_id":"54"}],"file":[{"file_name":"icassp21.pdf","file_id":"48988","access_level":"open_access","file_size":312400,"creator":"tgburrek","date_created":"2023-11-17T06:29:40Z","date_updated":"2023-11-17T06:30:11Z","relation":"main_file","content_type":"application/pdf"}],"status":"public","type":"conference","publication":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","title":"Iterative Geometry Calibration from Distance Estimates for Wireless Acoustic Sensor Networks","doi":"10.1109/icassp39728.2021.9413831","date_updated":"2023-11-17T06:30:12Z","oa":"1","author":[{"full_name":"Gburrek, Tobias","id":"44006","last_name":"Gburrek","first_name":"Tobias"},{"full_name":"Schmalenstroeer, Joerg","id":"460","last_name":"Schmalenstroeer","first_name":"Joerg"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"}],"date_created":"2021-09-09T08:30:16Z","year":"2021","citation":{"ama":"Gburrek T, Schmalenstroeer J, Haeb-Umbach R. Iterative Geometry Calibration from Distance Estimates for Wireless Acoustic Sensor Networks. In: <i>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. ; 2021. doi:<a href=\"https://doi.org/10.1109/icassp39728.2021.9413831\">10.1109/icassp39728.2021.9413831</a>","ieee":"T. Gburrek, J. Schmalenstroeer, and R. Haeb-Umbach, “Iterative Geometry Calibration from Distance Estimates for Wireless Acoustic Sensor Networks,” 2021, doi: <a href=\"https://doi.org/10.1109/icassp39728.2021.9413831\">10.1109/icassp39728.2021.9413831</a>.","chicago":"Gburrek, Tobias, Joerg Schmalenstroeer, and Reinhold Haeb-Umbach. “Iterative Geometry Calibration from Distance Estimates for Wireless Acoustic Sensor Networks.” In <i>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2021. <a href=\"https://doi.org/10.1109/icassp39728.2021.9413831\">https://doi.org/10.1109/icassp39728.2021.9413831</a>.","apa":"Gburrek, T., Schmalenstroeer, J., &#38; Haeb-Umbach, R. (2021). Iterative Geometry Calibration from Distance Estimates for Wireless Acoustic Sensor Networks. <i>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. <a href=\"https://doi.org/10.1109/icassp39728.2021.9413831\">https://doi.org/10.1109/icassp39728.2021.9413831</a>","bibtex":"@inproceedings{Gburrek_Schmalenstroeer_Haeb-Umbach_2021, title={Iterative Geometry Calibration from Distance Estimates for Wireless Acoustic Sensor Networks}, DOI={<a href=\"https://doi.org/10.1109/icassp39728.2021.9413831\">10.1109/icassp39728.2021.9413831</a>}, booktitle={ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}, year={2021} }","mla":"Gburrek, Tobias, et al. “Iterative Geometry Calibration from Distance Estimates for Wireless Acoustic Sensor Networks.” <i>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2021, doi:<a href=\"https://doi.org/10.1109/icassp39728.2021.9413831\">10.1109/icassp39728.2021.9413831</a>.","short":"T. Gburrek, J. Schmalenstroeer, R. Haeb-Umbach, in: ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021."},"publication_status":"published","has_accepted_license":"1","quality_controlled":"1"},{"quality_controlled":"1","has_accepted_license":"1","citation":{"short":"T. Gburrek, J. Schmalenstroeer, R. Haeb-Umbach, in: Speech Communication; 14th ITG-Symposium, 2021, pp. 1–5.","bibtex":"@inproceedings{Gburrek_Schmalenstroeer_Haeb-Umbach_2021, title={On Source-Microphone Distance Estimation Using Convolutional Recurrent Neural Networks}, booktitle={Speech Communication; 14th ITG-Symposium}, author={Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}, year={2021}, pages={1–5} }","mla":"Gburrek, Tobias, et al. “On Source-Microphone Distance Estimation Using Convolutional Recurrent Neural Networks.” <i>Speech Communication; 14th ITG-Symposium</i>, 2021, pp. 1–5.","apa":"Gburrek, T., Schmalenstroeer, J., &#38; Haeb-Umbach, R. (2021). On Source-Microphone Distance Estimation Using Convolutional Recurrent Neural Networks. <i>Speech Communication; 14th ITG-Symposium</i>, 1–5.","chicago":"Gburrek, Tobias, Joerg Schmalenstroeer, and Reinhold Haeb-Umbach. “On Source-Microphone Distance Estimation Using Convolutional Recurrent Neural Networks.” In <i>Speech Communication; 14th ITG-Symposium</i>, 1–5, 2021.","ieee":"T. Gburrek, J. Schmalenstroeer, and R. Haeb-Umbach, “On Source-Microphone Distance Estimation Using Convolutional Recurrent Neural Networks,” in <i>Speech Communication; 14th ITG-Symposium</i>, 2021, pp. 1–5.","ama":"Gburrek T, Schmalenstroeer J, Haeb-Umbach R. On Source-Microphone Distance Estimation Using Convolutional Recurrent Neural Networks. In: <i>Speech Communication; 14th ITG-Symposium</i>. ; 2021:1-5."},"page":"1-5","year":"2021","date_created":"2021-09-09T08:40:44Z","author":[{"first_name":"Tobias","last_name":"Gburrek","id":"44006","full_name":"Gburrek, Tobias"},{"full_name":"Schmalenstroeer, Joerg","id":"460","last_name":"Schmalenstroeer","first_name":"Joerg"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"}],"oa":"1","date_updated":"2023-11-17T06:32:20Z","title":"On Source-Microphone Distance Estimation Using Convolutional Recurrent Neural Networks","type":"conference","publication":"Speech Communication; 14th ITG-Symposium","file":[{"date_updated":"2023-11-17T06:31:37Z","date_created":"2023-11-17T06:31:37Z","creator":"tgburrek","file_size":449694,"access_level":"open_access","file_name":"dist_est.pdf","file_id":"48989","content_type":"application/pdf","relation":"main_file"}],"status":"public","user_id":"44006","department":[{"_id":"54"}],"_id":"23999","file_date_updated":"2023-11-17T06:31:37Z","language":[{"iso":"eng"}],"ddc":["004"]},{"_id":"23997","department":[{"_id":"54"}],"user_id":"44006","language":[{"iso":"eng"}],"publication":"29th European Signal Processing Conference (EUSIPCO)","type":"conference","status":"public","oa":"1","date_updated":"2023-11-17T06:37:10Z","date_created":"2021-09-09T08:39:06Z","author":[{"first_name":"Aleksej","last_name":"Chinaev","full_name":"Chinaev, Aleksej"},{"first_name":"Gerald","last_name":"Enzner","full_name":"Enzner, Gerald"},{"id":"44006","full_name":"Gburrek, Tobias","last_name":"Gburrek","first_name":"Tobias"},{"full_name":"Schmalenstroeer, Joerg","id":"460","last_name":"Schmalenstroeer","first_name":"Joerg"}],"title":"Online Estimation of Sampling Rate Offsets in Wireless Acoustic Sensor Networks with Packet Loss","main_file_link":[{"open_access":"1","url":"https://eurasip.org/Proceedings/Eusipco/Eusipco2021/pdfs/0001110.pdf"}],"quality_controlled":"1","year":"2021","page":"1-5","citation":{"apa":"Chinaev, A., Enzner, G., Gburrek, T., &#38; Schmalenstroeer, J. (2021). Online Estimation of Sampling Rate Offsets in Wireless Acoustic Sensor Networks with Packet Loss. <i>29th European Signal Processing Conference (EUSIPCO)</i>, 1–5.","bibtex":"@inproceedings{Chinaev_Enzner_Gburrek_Schmalenstroeer_2021, title={Online Estimation of Sampling Rate Offsets in Wireless Acoustic Sensor Networks with Packet Loss}, booktitle={29th European Signal Processing Conference (EUSIPCO)}, author={Chinaev, Aleksej and Enzner, Gerald and Gburrek, Tobias and Schmalenstroeer, Joerg}, year={2021}, pages={1–5} }","mla":"Chinaev, Aleksej, et al. “Online Estimation of Sampling Rate Offsets in Wireless Acoustic Sensor Networks with Packet Loss.” <i>29th European Signal Processing Conference (EUSIPCO)</i>, 2021, pp. 1–5.","short":"A. Chinaev, G. Enzner, T. Gburrek, J. Schmalenstroeer, in: 29th European Signal Processing Conference (EUSIPCO), 2021, pp. 1–5.","ama":"Chinaev A, Enzner G, Gburrek T, Schmalenstroeer J. Online Estimation of Sampling Rate Offsets in Wireless Acoustic Sensor Networks with Packet Loss. In: <i>29th European Signal Processing Conference (EUSIPCO)</i>. ; 2021:1-5.","chicago":"Chinaev, Aleksej, Gerald Enzner, Tobias Gburrek, and Joerg Schmalenstroeer. “Online Estimation of Sampling Rate Offsets in Wireless Acoustic Sensor Networks with Packet Loss.” In <i>29th European Signal Processing Conference (EUSIPCO)</i>, 1–5, 2021.","ieee":"A. Chinaev, G. Enzner, T. Gburrek, and J. Schmalenstroeer, “Online Estimation of Sampling Rate Offsets in Wireless Acoustic Sensor Networks with Packet Loss,” in <i>29th European Signal Processing Conference (EUSIPCO)</i>, 2021, pp. 1–5."}},{"title":"Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations","date_created":"2022-01-13T07:55:29Z","year":"2021","quality_controlled":"1","ddc":["000"],"language":[{"iso":"eng"}],"abstract":[{"text":"In this work we address disentanglement of style and content in speech signals. We propose a fully convolutional variational autoencoder employing two encoders: a content encoder and a style encoder. To foster disentanglement, we propose adversarial contrastive predictive coding. This new disentanglement method does neither need parallel data nor any supervision. We show that the proposed technique is capable of separating speaker and content traits into the two different representations and show competitive speaker-content disentanglement performance compared to other unsupervised approaches. We further demonstrate an increased robustness of the content representation against a train-test mismatch compared to spectral features, when used for phone recognition.","lang":"eng"}],"file":[{"access_level":"open_access","file_id":"29305","file_name":"Template.pdf","file_size":236628,"creator":"ebbers","date_created":"2022-01-13T07:56:30Z","date_updated":"2022-01-13T08:19:19Z","relation":"main_file","content_type":"application/pdf"}],"publication":"Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","date_updated":"2023-11-22T08:29:42Z","oa":"1","author":[{"last_name":"Ebbers","full_name":"Ebbers, Janek","id":"34851","first_name":"Janek"},{"last_name":"Kuhlmann","full_name":"Kuhlmann, Michael","id":"49871","first_name":"Michael"},{"full_name":"Cord-Landwehr, Tobias","id":"44393","last_name":"Cord-Landwehr","first_name":"Tobias"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"}],"page":"3860–3864","citation":{"apa":"Ebbers, J., Kuhlmann, M., Cord-Landwehr, T., &#38; Haeb-Umbach, R. (2021). Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations. <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 3860–3864.","bibtex":"@inproceedings{Ebbers_Kuhlmann_Cord-Landwehr_Haeb-Umbach_2021, title={Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations}, booktitle={Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={Ebbers, Janek and Kuhlmann, Michael and Cord-Landwehr, Tobias and Haeb-Umbach, Reinhold}, year={2021}, pages={3860–3864} }","short":"J. Ebbers, M. Kuhlmann, T. Cord-Landwehr, R. Haeb-Umbach, in: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021, pp. 3860–3864.","mla":"Ebbers, Janek, et al. “Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations.” <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2021, pp. 3860–3864.","ama":"Ebbers J, Kuhlmann M, Cord-Landwehr T, Haeb-Umbach R. Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations. In: <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. ; 2021:3860–3864.","chicago":"Ebbers, Janek, Michael Kuhlmann, Tobias Cord-Landwehr, and Reinhold Haeb-Umbach. “Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations.” In <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 3860–3864, 2021.","ieee":"J. Ebbers, M. Kuhlmann, T. Cord-Landwehr, and R. Haeb-Umbach, “Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations,” in <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2021, pp. 3860–3864."},"has_accepted_license":"1","file_date_updated":"2022-01-13T08:19:19Z","_id":"29304","project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"department":[{"_id":"54"}],"user_id":"34851","status":"public","type":"conference"},{"ddc":["000"],"keyword":["Continuous speech separation","automatic speech recognition","overlapped speech","permutation invariant training"],"language":[{"iso":"eng"}],"abstract":[{"lang":"eng","text":"Automatic transcription of meetings requires handling of overlapped speech, which calls for continuous speech separation (CSS) systems. The uPIT criterion was proposed for utterance-level separation with neural networks and introduces the constraint that the total number of speakers must not exceed the number of output channels. When processing meeting-like data in a segment-wise manner, i.e., by separating overlapping segments independently and stitching adjacent segments to continuous output streams, this constraint has to be fulfilled for any segment. In this contribution, we show that this constraint can be significantly relaxed. We propose a novel graph-based PIT criterion, which casts the assignment of utterances to output channels in a graph coloring problem. It only requires that the number of concurrently active speakers must not exceed the number of output channels. As a consequence, the system can process an arbitrary number of speakers and arbitrarily long segments and thus can handle more diverse scenarios.\r\nFurther, the stitching algorithm for obtaining a consistent output order in neighboring segments is of less importance and can even be eliminated completely, not the least reducing the computational effort. Experiments on meeting-style WSJ data show improvements in recognition performance over using the uPIT criterion. "}],"file":[{"relation":"supplementary_material","content_type":"video/mp4","title":"Video for INTERSPEECH 2021","file_size":9550220,"file_id":"28327","file_name":"Interspeech 2021 voiceover-002-compressed.mp4","access_level":"open_access","date_updated":"2021-12-06T10:48:30Z","creator":"tvn","date_created":"2021-12-06T10:39:13Z"},{"relation":"slides","content_type":"application/vnd.openxmlformats-officedocument.presentationml.presentation","title":"Slides from INTERSPEECH 2021","file_size":1337297,"access_level":"open_access","file_id":"28328","file_name":"Graph-PIT-poster-presentation.pptx","date_updated":"2021-12-06T10:47:01Z","creator":"tvn","date_created":"2021-12-06T10:47:01Z"},{"file_name":"INTERSPEECH2021_Graph_PIT.pdf","file_id":"28329","access_level":"open_access","file_size":226589,"date_created":"2021-12-06T10:48:21Z","creator":"tvn","date_updated":"2021-12-06T10:48:21Z","relation":"main_file","content_type":"application/pdf"}],"publication":"Interspeech 2021","title":"Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers","date_created":"2021-10-25T08:50:01Z","year":"2021","quality_controlled":"1","file_date_updated":"2021-12-06T10:48:30Z","project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"26770","user_id":"49870","department":[{"_id":"54"}],"status":"public","type":"conference","doi":"10.21437/interspeech.2021-1177","conference":{"name":"Interspeech"},"date_updated":"2023-11-15T12:14:40Z","oa":"1","author":[{"full_name":"von Neumann, Thilo","id":"49870","last_name":"von Neumann","orcid":"https://orcid.org/0000-0002-7717-8670","first_name":"Thilo"},{"full_name":"Kinoshita, Keisuke","last_name":"Kinoshita","first_name":"Keisuke"},{"first_name":"Christoph","last_name":"Boeddeker","id":"40767","full_name":"Boeddeker, Christoph"},{"last_name":"Delcroix","full_name":"Delcroix, Marc","first_name":"Marc"},{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"citation":{"ieee":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, and R. Haeb-Umbach, “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers,” presented at the Interspeech, 2021, doi: <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>.","chicago":"Neumann, Thilo von, Keisuke Kinoshita, Christoph Boeddeker, Marc Delcroix, and Reinhold Haeb-Umbach. “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers.” In <i>Interspeech 2021</i>, 2021. <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">https://doi.org/10.21437/interspeech.2021-1177</a>.","ama":"von Neumann T, Kinoshita K, Boeddeker C, Delcroix M, Haeb-Umbach R. Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers. In: <i>Interspeech 2021</i>. ; 2021. doi:<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>","bibtex":"@inproceedings{von Neumann_Kinoshita_Boeddeker_Delcroix_Haeb-Umbach_2021, title={Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers}, DOI={<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>}, booktitle={Interspeech 2021}, author={von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2021} }","short":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, R. Haeb-Umbach, in: Interspeech 2021, 2021.","mla":"von Neumann, Thilo, et al. “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers.” <i>Interspeech 2021</i>, 2021, doi:<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>.","apa":"von Neumann, T., Kinoshita, K., Boeddeker, C., Delcroix, M., &#38; Haeb-Umbach, R. (2021). Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers. <i>Interspeech 2021</i>. Interspeech. <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">https://doi.org/10.21437/interspeech.2021-1177</a>"},"publication_status":"published","has_accepted_license":"1","related_material":{"link":[{"relation":"software","url":"https://github.com/fgnt/graph_pit"}]}},{"conference":{"start_date":"2021-09-29","name":"Speech Communication; 14th ITG Conference","location":"Kiel","end_date":"2021-10-01"},"title":"Speeding Up Permutation Invariant Training for Source Separation","date_created":"2022-01-07T10:40:56Z","author":[{"last_name":"von Neumann","orcid":"https://orcid.org/0000-0002-7717-8670","id":"49870","full_name":"von Neumann, Thilo","first_name":"Thilo"},{"id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker","first_name":"Christoph"},{"first_name":"Keisuke","last_name":"Kinoshita","full_name":"Kinoshita, Keisuke"},{"first_name":"Marc","last_name":"Delcroix","full_name":"Delcroix, Marc"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"oa":"1","date_updated":"2023-11-15T12:16:31Z","citation":{"mla":"von Neumann, Thilo, et al. “Speeding Up Permutation Invariant Training for Source Separation.” <i>Speech Communication; 14th ITG Conference</i>, 2021.","short":"T. von Neumann, C. Boeddeker, K. Kinoshita, M. Delcroix, R. Haeb-Umbach, in: Speech Communication; 14th ITG Conference, 2021.","bibtex":"@inproceedings{von Neumann_Boeddeker_Kinoshita_Delcroix_Haeb-Umbach_2021, title={Speeding Up Permutation Invariant Training for Source Separation}, booktitle={Speech Communication; 14th ITG Conference}, author={von Neumann, Thilo and Boeddeker, Christoph and Kinoshita, Keisuke and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2021} }","apa":"von Neumann, T., Boeddeker, C., Kinoshita, K., Delcroix, M., &#38; Haeb-Umbach, R. (2021). Speeding Up Permutation Invariant Training for Source Separation. <i>Speech Communication; 14th ITG Conference</i>. Speech Communication; 14th ITG Conference, Kiel.","chicago":"Neumann, Thilo von, Christoph Boeddeker, Keisuke Kinoshita, Marc Delcroix, and Reinhold Haeb-Umbach. “Speeding Up Permutation Invariant Training for Source Separation.” In <i>Speech Communication; 14th ITG Conference</i>, 2021.","ieee":"T. von Neumann, C. Boeddeker, K. Kinoshita, M. Delcroix, and R. Haeb-Umbach, “Speeding Up Permutation Invariant Training for Source Separation,” presented at the Speech Communication; 14th ITG Conference, Kiel, 2021.","ama":"von Neumann T, Boeddeker C, Kinoshita K, Delcroix M, Haeb-Umbach R. Speeding Up Permutation Invariant Training for Source Separation. In: <i>Speech Communication; 14th ITG Conference</i>. ; 2021."},"year":"2021","has_accepted_license":"1","quality_controlled":"1","file_date_updated":"2022-01-07T10:42:54Z","language":[{"iso":"eng"}],"ddc":["000"],"user_id":"49870","department":[{"_id":"54"}],"project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"29173","file":[{"date_updated":"2022-01-06T13:23:27Z","creator":"tvn","date_created":"2022-01-06T13:23:27Z","file_size":191938,"access_level":"open_access","file_name":"poster.pdf","file_id":"29180","content_type":"application/pdf","relation":"poster"},{"date_updated":"2022-01-07T10:42:54Z","date_created":"2022-01-07T10:42:54Z","creator":"tvn","file_size":236670,"access_level":"open_access","file_id":"29181","file_name":"ITG2021_Speeding_up_Permutation_Invariant_Training.pdf","content_type":"application/pdf","relation":"main_file"}],"status":"public","type":"conference","publication":"Speech Communication; 14th ITG Conference"},{"file_date_updated":"2022-01-13T08:19:50Z","_id":"29308","project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"department":[{"_id":"54"}],"user_id":"34851","status":"public","type":"conference","oa":"1","date_updated":"2023-11-22T08:28:32Z","author":[{"last_name":"Ebbers","full_name":"Ebbers, Janek","id":"34851","first_name":"Janek"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"place":"Barcelona, Spain","page":"226–230","citation":{"apa":"Ebbers, J., &#38; Haeb-Umbach, R. (2021). Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments. <i>Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)</i>, 226–230.","bibtex":"@inproceedings{Ebbers_Haeb-Umbach_2021, place={Barcelona, Spain}, title={Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments}, booktitle={Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)}, author={Ebbers, Janek and Haeb-Umbach, Reinhold}, year={2021}, pages={226–230} }","short":"J. Ebbers, R. Haeb-Umbach, in: Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021), Barcelona, Spain, 2021, pp. 226–230.","mla":"Ebbers, Janek, and Reinhold Haeb-Umbach. “Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments.” <i>Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)</i>, 2021, pp. 226–230.","ama":"Ebbers J, Haeb-Umbach R. Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments. In: <i>Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)</i>. ; 2021:226–230.","ieee":"J. Ebbers and R. Haeb-Umbach, “Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments,” in <i>Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)</i>, 2021, pp. 226–230.","chicago":"Ebbers, Janek, and Reinhold Haeb-Umbach. “Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments.” In <i>Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)</i>, 226–230. Barcelona, Spain, 2021."},"publication_identifier":{"isbn":["978-84-09-36072-7"]},"has_accepted_license":"1","ddc":["000"],"language":[{"iso":"eng"}],"abstract":[{"text":"In this paper we present our system for the Detection and Classification of Acoustic Scenes and Events (DCASE) 2021 Challenge Task 4: Sound Event Detection and Separation in Domestic Environments, where it scored the fourth rank. Our presented solution is an advancement of our system used in the previous edition of the task.We use a forward-backward convolutional recurrent neural network (FBCRNN) for tagging and pseudo labeling followed by tag-conditioned sound event detection (SED) models which are trained using strong pseudo labels provided by the FBCRNN. Our advancement over our earlier model is threefold. First, we introduce a strong label loss in the objective of the FBCRNN to take advantage of the strongly labeled synthetic data during training. Second, we perform multiple iterations of self-training for both the FBCRNN and tag-conditioned SED models. Third, while we used only tag-conditioned CNNs as our SED model in the previous edition we here explore sophisticated tag-conditioned SED model architectures, namely, bidirectional CRNNs and bidirectional convolutional transformer neural networks (CTNNs), and combine them. With metric and class specific tuning of median filter lengths for post-processing, our final SED model, consisting of 6 submodels (2 of each architecture), achieves on the public evaluation set poly-phonic sound event detection scores (PSDS) of 0.455 for scenario 1 and 0.684 for scenario as well as a collar-based F1-score of 0.596 outperforming the baselines and our model from the previous edition by far. Source code is publicly available at https://github.com/fgnt/pb_sed.","lang":"eng"}],"file":[{"date_updated":"2022-01-13T08:19:50Z","date_created":"2022-01-13T08:08:54Z","creator":"ebbers","file_size":239462,"file_name":"template.pdf","file_id":"29309","access_level":"open_access","content_type":"application/pdf","relation":"main_file"}],"publication":"Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)","title":"Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments","date_created":"2022-01-13T08:07:47Z","year":"2021","quality_controlled":"1"},{"type":"conference","publication":"Proceedings of the 29th European Signal Processing Conference (EUSIPCO)","file":[{"relation":"main_file","content_type":"application/pdf","file_size":213938,"access_level":"open_access","file_id":"29307","file_name":"conference_101719.pdf","date_updated":"2022-01-13T08:19:35Z","date_created":"2022-01-13T08:03:26Z","creator":"ebbers"}],"status":"public","abstract":[{"text":"Recently, there has been a rising interest in sound recognition via Acoustic Sensor Networks to support applications such as ambient assisted living or environmental habitat monitoring. With state-of-the-art sound recognition being dominated by deep-learning-based approaches, there is a high demand for labeled training data. Despite the availability of large-scale  data sets such as Google's AudioSet, acquiring training data matching a certain application environment is still often a problem. In this paper we are concerned with human activity monitoring in a domestic environment using an ASN consisting of multiple nodes each providing multichannel signals. We propose a self-training based domain adaptation approach, which only requires unlabeled data from the target environment. Here, a sound recognition system trained on AudioSet, the teacher, generates pseudo labels for data from the target environment on which a student network is trained. The student can furthermore glean information about the spatial arrangement of sensors and sound sources to further improve classification performance. It is shown that  the student significantly improves recognition performance over the pre-trained teacher without relying on labeled data from the environment the system is deployed in.","lang":"eng"}],"user_id":"34851","department":[{"_id":"54"}],"project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"29306","file_date_updated":"2022-01-13T08:19:35Z","language":[{"iso":"eng"}],"ddc":["000"],"has_accepted_license":"1","quality_controlled":"1","citation":{"apa":"Ebbers, J., Keyser, M. C., &#38; Haeb-Umbach, R. (2021). Adapting Sound Recognition to A New Environment Via Self-Training. <i>Proceedings of the 29th European Signal Processing Conference (EUSIPCO)</i>, 1135–1139.","short":"J. Ebbers, M.C. Keyser, R. Haeb-Umbach, in: Proceedings of the 29th European Signal Processing Conference (EUSIPCO), 2021, pp. 1135–1139.","mla":"Ebbers, Janek, et al. “Adapting Sound Recognition to A New Environment Via Self-Training.” <i>Proceedings of the 29th European Signal Processing Conference (EUSIPCO)</i>, 2021, pp. 1135–1139.","bibtex":"@inproceedings{Ebbers_Keyser_Haeb-Umbach_2021, title={Adapting Sound Recognition to A New Environment Via Self-Training}, booktitle={Proceedings of the 29th European Signal Processing Conference (EUSIPCO)}, author={Ebbers, Janek and Keyser, Moritz Curt and Haeb-Umbach, Reinhold}, year={2021}, pages={1135–1139} }","chicago":"Ebbers, Janek, Moritz Curt Keyser, and Reinhold Haeb-Umbach. “Adapting Sound Recognition to A New Environment Via Self-Training.” In <i>Proceedings of the 29th European Signal Processing Conference (EUSIPCO)</i>, 1135–1139, 2021.","ieee":"J. Ebbers, M. C. Keyser, and R. Haeb-Umbach, “Adapting Sound Recognition to A New Environment Via Self-Training,” in <i>Proceedings of the 29th European Signal Processing Conference (EUSIPCO)</i>, 2021, pp. 1135–1139.","ama":"Ebbers J, Keyser MC, Haeb-Umbach R. Adapting Sound Recognition to A New Environment Via Self-Training. In: <i>Proceedings of the 29th European Signal Processing Conference (EUSIPCO)</i>. ; 2021:1135–1139."},"page":"1135–1139","year":"2021","date_created":"2022-01-13T08:01:21Z","author":[{"first_name":"Janek","full_name":"Ebbers, Janek","id":"34851","last_name":"Ebbers"},{"first_name":"Moritz Curt","last_name":"Keyser","full_name":"Keyser, Moritz Curt"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"oa":"1","date_updated":"2023-11-22T08:28:50Z","title":"Adapting Sound Recognition to A New Environment Via Self-Training"},{"type":"journal_article","status":"public","_id":"24456","project":[{"grant_number":"438445824","name":"TRR 318: TRR 318 - Erklärbarkeit konstruieren","_id":"109"}],"department":[{"_id":"603"},{"_id":"749"},{"_id":"424"},{"_id":"67"},{"_id":"574"},{"_id":"184"},{"_id":"757"},{"_id":"54"},{"_id":"178"}],"user_id":"42933","article_type":"original","file_date_updated":"2023-11-20T16:33:51Z","has_accepted_license":"1","publication_identifier":{"issn":["2379-8920","2379-8939"]},"publication_status":"published","page":"717-728","intvolume":"        13","citation":{"apa":"Rohlfing, K. J., Cimiano, P., Scharlau, I., Matzner, T., Buhl, H. M., Buschmeier, H., Esposito, E., Grimminger, A., Hammer, B., Haeb-Umbach, R., Horwath, I., Hüllermeier, E., Kern, F., Kopp, S., Thommes, K., Ngonga Ngomo, A.-C., Schulte, C., Wachsmuth, H., Wagner, P., &#38; Wrede, B. (2021). Explanation as a Social Practice: Toward a Conceptual Framework for the Social Design of AI Systems. <i>IEEE Transactions on Cognitive and Developmental Systems</i>, <i>13</i>(3), 717–728. <a href=\"https://doi.org/10.1109/tcds.2020.3044366\">https://doi.org/10.1109/tcds.2020.3044366</a>","mla":"Rohlfing, Katharina J., et al. “Explanation as a Social Practice: Toward a Conceptual Framework for the Social Design of AI Systems.” <i>IEEE Transactions on Cognitive and Developmental Systems</i>, vol. 13, no. 3, 2021, pp. 717–28, doi:<a href=\"https://doi.org/10.1109/tcds.2020.3044366\">10.1109/tcds.2020.3044366</a>.","bibtex":"@article{Rohlfing_Cimiano_Scharlau_Matzner_Buhl_Buschmeier_Esposito_Grimminger_Hammer_Haeb-Umbach_et al._2021, title={Explanation as a Social Practice: Toward a Conceptual Framework for the Social Design of AI Systems}, volume={13}, DOI={<a href=\"https://doi.org/10.1109/tcds.2020.3044366\">10.1109/tcds.2020.3044366</a>}, number={3}, journal={IEEE Transactions on Cognitive and Developmental Systems}, author={Rohlfing, Katharina J. and Cimiano, Philipp and Scharlau, Ingrid and Matzner, Tobias and Buhl, Heike M. and Buschmeier, Hendrik and Esposito, Elena and Grimminger, Angela and Hammer, Barbara and Haeb-Umbach, Reinhold and et al.}, year={2021}, pages={717–728} }","short":"K.J. Rohlfing, P. Cimiano, I. Scharlau, T. Matzner, H.M. Buhl, H. Buschmeier, E. Esposito, A. Grimminger, B. Hammer, R. Haeb-Umbach, I. Horwath, E. Hüllermeier, F. Kern, S. Kopp, K. Thommes, A.-C. Ngonga Ngomo, C. Schulte, H. Wachsmuth, P. Wagner, B. Wrede, IEEE Transactions on Cognitive and Developmental Systems 13 (2021) 717–728.","ama":"Rohlfing KJ, Cimiano P, Scharlau I, et al. Explanation as a Social Practice: Toward a Conceptual Framework for the Social Design of AI Systems. <i>IEEE Transactions on Cognitive and Developmental Systems</i>. 2021;13(3):717-728. doi:<a href=\"https://doi.org/10.1109/tcds.2020.3044366\">10.1109/tcds.2020.3044366</a>","chicago":"Rohlfing, Katharina J., Philipp Cimiano, Ingrid Scharlau, Tobias Matzner, Heike M. Buhl, Hendrik Buschmeier, Elena Esposito, et al. “Explanation as a Social Practice: Toward a Conceptual Framework for the Social Design of AI Systems.” <i>IEEE Transactions on Cognitive and Developmental Systems</i> 13, no. 3 (2021): 717–28. <a href=\"https://doi.org/10.1109/tcds.2020.3044366\">https://doi.org/10.1109/tcds.2020.3044366</a>.","ieee":"K. J. Rohlfing <i>et al.</i>, “Explanation as a Social Practice: Toward a Conceptual Framework for the Social Design of AI Systems,” <i>IEEE Transactions on Cognitive and Developmental Systems</i>, vol. 13, no. 3, pp. 717–728, 2021, doi: <a href=\"https://doi.org/10.1109/tcds.2020.3044366\">10.1109/tcds.2020.3044366</a>."},"oa":"1","date_updated":"2023-12-05T10:15:02Z","volume":13,"author":[{"first_name":"Katharina J.","last_name":"Rohlfing","id":"50352","full_name":"Rohlfing, Katharina J."},{"first_name":"Philipp","full_name":"Cimiano, Philipp","last_name":"Cimiano"},{"last_name":"Scharlau","orcid":"0000-0003-2364-9489","full_name":"Scharlau, Ingrid","id":"451","first_name":"Ingrid"},{"id":"65695","full_name":"Matzner, Tobias","last_name":"Matzner","first_name":"Tobias"},{"full_name":"Buhl, Heike M.","id":"27152","last_name":"Buhl","first_name":"Heike M."},{"first_name":"Hendrik","last_name":"Buschmeier","full_name":"Buschmeier, Hendrik"},{"last_name":"Esposito","full_name":"Esposito, Elena","first_name":"Elena"},{"id":"57578","full_name":"Grimminger, Angela","last_name":"Grimminger","first_name":"Angela"},{"full_name":"Hammer, Barbara","last_name":"Hammer","first_name":"Barbara"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"},{"last_name":"Horwath","full_name":"Horwath, Ilona","id":"68836","first_name":"Ilona"},{"first_name":"Eyke","full_name":"Hüllermeier, Eyke","id":"48129","last_name":"Hüllermeier"},{"full_name":"Kern, Friederike","last_name":"Kern","first_name":"Friederike"},{"first_name":"Stefan","full_name":"Kopp, Stefan","last_name":"Kopp"},{"full_name":"Thommes, Kirsten","id":"72497","last_name":"Thommes","first_name":"Kirsten"},{"first_name":"Axel-Cyrille","full_name":"Ngonga Ngomo, Axel-Cyrille","id":"65716","last_name":"Ngonga Ngomo"},{"first_name":"Carsten","full_name":"Schulte, Carsten","id":"60311","last_name":"Schulte"},{"first_name":"Henning","last_name":"Wachsmuth","id":"3900","full_name":"Wachsmuth, Henning"},{"last_name":"Wagner","full_name":"Wagner, Petra","first_name":"Petra"},{"first_name":"Britta","full_name":"Wrede, Britta","last_name":"Wrede"}],"doi":"10.1109/tcds.2020.3044366","publication":"IEEE Transactions on Cognitive and Developmental Systems","abstract":[{"lang":"eng","text":"One objective of current research in explainable intelligent systems is to implement social aspects in order to increase the relevance of explanations. In this paper, we argue that a novel conceptual framework is needed to overcome shortcomings of existing AI systems with little attention to processes of interaction and learning. Drawing from research in interaction and development, we first outline the novel conceptual framework that pushes the design of AI systems toward true interactivity with an emphasis on the role of the partner and social relevance. We propose that AI systems will be able to provide a meaningful and relevant explanation only if the process of explaining is extended to active contribution of both partners that brings about dynamics that is modulated by different levels of analysis. Accordingly, our conceptual framework comprises monitoring and scaffolding as key concepts and claims that the process of explaining is not only modulated by the interaction between explainee and explainer but is embedded into a larger social context in which conventionalized and routinized behaviors are established. We discuss our conceptual framework in relation to the established objectives of transparency and autonomy that are raised for the design of explainable AI systems currently."}],"file":[{"relation":"main_file","content_type":"application/pdf","file_size":626217,"file_name":"2020-12-01_explainability_final_version.pdf","file_id":"49081","access_level":"open_access","date_updated":"2023-11-20T16:33:51Z","creator":"haebumb","date_created":"2023-11-20T16:33:51Z"}],"keyword":["Explainability","process ofexplaining andunderstanding","explainable artificial systems"],"ddc":["300"],"language":[{"iso":"eng"}],"quality_controlled":"1","issue":"3","year":"2021","date_created":"2021-09-14T20:52:57Z","title":"Explanation as a Social Practice: Toward a Conceptual Framework for the Social Design of AI Systems"},{"editor":[{"first_name":"Ronald","last_name":"Böck","full_name":"Böck, Ronald"},{"first_name":"Ingo","full_name":"Siegert, Ingo","last_name":"Siegert"},{"full_name":"Wendemuth, Andreas","last_name":"Wendemuth","first_name":"Andreas"}],"status":"public","type":"conference","publication":"Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2020","keyword":["Poster"],"language":[{"iso":"eng"}],"_id":"17763","user_id":"44006","department":[{"_id":"54"}],"year":"2020","citation":{"short":"R. Haeb-Umbach, in: R. Böck, I. Siegert, A. Wendemuth (Eds.), Studientexte Zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2020, TUDpress, Dresden, 2020, pp. 227–234.","mla":"Haeb-Umbach, Reinhold. “Sprachtechnologien Für Digitale Assistenten.” <i>Studientexte Zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2020</i>, edited by Ronald Böck et al., TUDpress, Dresden, 2020, pp. 227–34.","bibtex":"@inproceedings{Haeb-Umbach_2020, title={Sprachtechnologien für Digitale Assistenten}, booktitle={Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2020}, publisher={TUDpress, Dresden}, author={Haeb-Umbach, Reinhold}, editor={Böck, Ronald and Siegert, Ingo and Wendemuth, AndreasEditors}, year={2020}, pages={227–234} }","apa":"Haeb-Umbach, R. (2020). Sprachtechnologien für Digitale Assistenten. In R. Böck, I. Siegert, &#38; A. Wendemuth (Eds.), <i>Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2020</i> (pp. 227–234). TUDpress, Dresden.","ieee":"R. Haeb-Umbach, “Sprachtechnologien für Digitale Assistenten,” in <i>Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2020</i>, 2020, pp. 227–234.","chicago":"Haeb-Umbach, Reinhold. “Sprachtechnologien Für Digitale Assistenten.” In <i>Studientexte Zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2020</i>, edited by Ronald Böck, Ingo Siegert, and Andreas Wendemuth, 227–34. TUDpress, Dresden, 2020.","ama":"Haeb-Umbach R. Sprachtechnologien für Digitale Assistenten. In: Böck R, Siegert I, Wendemuth A, eds. <i>Studientexte Zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2020</i>. TUDpress, Dresden; 2020:227-234."},"page":"227-234","publication_identifier":{"isbn":["978-3-959081-93-1"]},"title":"Sprachtechnologien für Digitale Assistenten","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2020/ESSV_2020_haeb_umbach.pdf"}],"oa":"1","publisher":"TUDpress, Dresden","date_updated":"2022-01-06T06:53:19Z","date_created":"2020-08-10T09:53:12Z","author":[{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}]},{"ddc":["000"],"file_date_updated":"2020-12-11T12:48:48Z","language":[{"iso":"eng"}],"_id":"20700","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"40767","status":"public","file":[{"content_type":"application/pdf","relation":"main_file","date_updated":"2020-12-11T12:48:48Z","date_created":"2020-12-11T12:48:48Z","creator":"cbj","file_size":115421,"file_name":"template.pdf","file_id":"20702","access_level":"open_access"}],"publication":"Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments","type":"conference","title":"Towards a speaker diarization system for the CHiME 2020 dinner party transcription","date_updated":"2022-01-06T06:54:33Z","oa":"1","author":[{"first_name":"Christoph","last_name":"Boeddeker","id":"40767","full_name":"Boeddeker, Christoph"},{"last_name":"Cord-Landwehr","id":"44393","full_name":"Cord-Landwehr, Tobias","first_name":"Tobias"},{"first_name":"Jens","id":"27643","full_name":"Heitkaemper, Jens","last_name":"Heitkaemper"},{"last_name":"Zorila","full_name":"Zorila, Catalin","first_name":"Catalin"},{"last_name":"Hayakawa","full_name":"Hayakawa, Daichi","first_name":"Daichi"},{"last_name":"Li","full_name":"Li, Mohan","first_name":"Mohan"},{"first_name":"Min","last_name":"Liu","full_name":"Liu, Min"},{"first_name":"Rama","full_name":"Doddipatla, Rama","last_name":"Doddipatla"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"}],"date_created":"2020-12-11T12:49:13Z","year":"2020","citation":{"apa":"Boeddeker, C., Cord-Landwehr, T., Heitkaemper, J., Zorila, C., Hayakawa, D., Li, M., … Haeb-Umbach, R. (2020). Towards a speaker diarization system for the CHiME 2020 dinner party transcription. In <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>.","mla":"Boeddeker, Christoph, et al. “Towards a Speaker Diarization System for the CHiME 2020 Dinner Party Transcription.” <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>, 2020.","bibtex":"@inproceedings{Boeddeker_Cord-Landwehr_Heitkaemper_Zorila_Hayakawa_Li_Liu_Doddipatla_Haeb-Umbach_2020, title={Towards a speaker diarization system for the CHiME 2020 dinner party transcription}, booktitle={Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments}, author={Boeddeker, Christoph and Cord-Landwehr, Tobias and Heitkaemper, Jens and Zorila, Catalin and Hayakawa, Daichi and Li, Mohan and Liu, Min and Doddipatla, Rama and Haeb-Umbach, Reinhold}, year={2020} }","short":"C. Boeddeker, T. Cord-Landwehr, J. Heitkaemper, C. Zorila, D. Hayakawa, M. Li, M. Liu, R. Doddipatla, R. Haeb-Umbach, in: Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments, 2020.","ama":"Boeddeker C, Cord-Landwehr T, Heitkaemper J, et al. Towards a speaker diarization system for the CHiME 2020 dinner party transcription. In: <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>. ; 2020.","ieee":"C. Boeddeker <i>et al.</i>, “Towards a speaker diarization system for the CHiME 2020 dinner party transcription,” in <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>, 2020.","chicago":"Boeddeker, Christoph, Tobias Cord-Landwehr, Jens Heitkaemper, Catalin Zorila, Daichi Hayakawa, Mohan Li, Min Liu, Rama Doddipatla, and Reinhold Haeb-Umbach. “Towards a Speaker Diarization System for the CHiME 2020 Dinner Party Transcription.” In <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>, 2020."},"has_accepted_license":"1"},{"title":"Jointly optimal denoising, dereverberation, and source separation","doi":"10.1109/TASLP.2020.3013118","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2020/journal_2020_boeddeker.pdf"}],"date_updated":"2022-12-05T12:34:01Z","oa":"1","date_created":"2020-08-05T06:16:56Z","author":[{"first_name":"Tomohiro","full_name":"Nakatani, Tomohiro","last_name":"Nakatani"},{"last_name":"Boeddeker","full_name":"Boeddeker, Christoph","id":"40767","first_name":"Christoph"},{"full_name":"Kinoshita, Keisuke","last_name":"Kinoshita","first_name":"Keisuke"},{"last_name":"Ikeshita","full_name":"Ikeshita, Rintaro","first_name":"Rintaro"},{"full_name":"Delcroix, Marc","last_name":"Delcroix","first_name":"Marc"},{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"year":"2020","page":"1-1","citation":{"ama":"Nakatani T, Boeddeker C, Kinoshita K, Ikeshita R, Delcroix M, Haeb-Umbach R. Jointly optimal denoising, dereverberation, and source separation. <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>. Published online 2020:1-1. doi:<a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">10.1109/TASLP.2020.3013118</a>","chicago":"Nakatani, Tomohiro, Christoph Boeddeker, Keisuke Kinoshita, Rintaro Ikeshita, Marc Delcroix, and Reinhold Haeb-Umbach. “Jointly Optimal Denoising, Dereverberation, and Source Separation.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, 2020, 1–1. <a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">https://doi.org/10.1109/TASLP.2020.3013118</a>.","ieee":"T. Nakatani, C. Boeddeker, K. Kinoshita, R. Ikeshita, M. Delcroix, and R. Haeb-Umbach, “Jointly optimal denoising, dereverberation, and source separation,” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, pp. 1–1, 2020, doi: <a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">10.1109/TASLP.2020.3013118</a>.","apa":"Nakatani, T., Boeddeker, C., Kinoshita, K., Ikeshita, R., Delcroix, M., &#38; Haeb-Umbach, R. (2020). Jointly optimal denoising, dereverberation, and source separation. <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, 1–1. <a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">https://doi.org/10.1109/TASLP.2020.3013118</a>","bibtex":"@article{Nakatani_Boeddeker_Kinoshita_Ikeshita_Delcroix_Haeb-Umbach_2020, title={Jointly optimal denoising, dereverberation, and source separation}, DOI={<a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">10.1109/TASLP.2020.3013118</a>}, journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, author={Nakatani, Tomohiro and Boeddeker, Christoph and Kinoshita, Keisuke and Ikeshita, Rintaro and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2020}, pages={1–1} }","mla":"Nakatani, Tomohiro, et al. “Jointly Optimal Denoising, Dereverberation, and Source Separation.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, 2020, pp. 1–1, doi:<a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">10.1109/TASLP.2020.3013118</a>.","short":"T. Nakatani, C. Boeddeker, K. Kinoshita, R. Ikeshita, M. Delcroix, R. Haeb-Umbach, IEEE/ACM Transactions on Audio, Speech, and Language Processing (2020) 1–1."},"language":[{"iso":"eng"}],"_id":"17598","department":[{"_id":"54"}],"user_id":"40767","status":"public","publication":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","type":"journal_article"},{"status":"public","type":"conference","file_date_updated":"2020-12-11T12:36:37Z","department":[{"_id":"54"}],"user_id":"40767","_id":"20504","project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"citation":{"apa":"Heitkaemper, J., Jakobeit, D., Boeddeker, C., Drude, L., &#38; Haeb-Umbach, R. (2020). Demystifying TasNet: A Dissecting Approach. <i>ICASSP 2020 Virtual Barcelona Spain</i>.","short":"J. Heitkaemper, D. Jakobeit, C. Boeddeker, L. Drude, R. Haeb-Umbach, in: ICASSP 2020 Virtual Barcelona Spain, 2020.","bibtex":"@inproceedings{Heitkaemper_Jakobeit_Boeddeker_Drude_Haeb-Umbach_2020, title={Demystifying TasNet: A Dissecting Approach}, booktitle={ICASSP 2020 Virtual Barcelona Spain}, author={Heitkaemper, Jens and Jakobeit, Darius and Boeddeker, Christoph and Drude, Lukas and Haeb-Umbach, Reinhold}, year={2020} }","mla":"Heitkaemper, Jens, et al. “Demystifying TasNet: A Dissecting Approach.” <i>ICASSP 2020 Virtual Barcelona Spain</i>, 2020.","chicago":"Heitkaemper, Jens, Darius Jakobeit, Christoph Boeddeker, Lukas Drude, and Reinhold Haeb-Umbach. “Demystifying TasNet: A Dissecting Approach.” In <i>ICASSP 2020 Virtual Barcelona Spain</i>, 2020.","ieee":"J. Heitkaemper, D. Jakobeit, C. Boeddeker, L. Drude, and R. Haeb-Umbach, “Demystifying TasNet: A Dissecting Approach,” 2020.","ama":"Heitkaemper J, Jakobeit D, Boeddeker C, Drude L, Haeb-Umbach R. Demystifying TasNet: A Dissecting Approach. In: <i>ICASSP 2020 Virtual Barcelona Spain</i>. ; 2020."},"has_accepted_license":"1","author":[{"id":"27643","full_name":"Heitkaemper, Jens","last_name":"Heitkaemper","first_name":"Jens"},{"first_name":"Darius","last_name":"Jakobeit","full_name":"Jakobeit, Darius"},{"first_name":"Christoph","full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker"},{"last_name":"Drude","full_name":"Drude, Lukas","first_name":"Lukas"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"date_updated":"2022-01-13T08:47:32Z","file":[{"relation":"main_file","success":1,"content_type":"application/pdf","file_name":"ms.pdf","file_id":"20699","access_level":"closed","file_size":3871374,"creator":"jensheit","date_created":"2020-12-11T12:36:37Z","date_updated":"2020-12-11T12:36:37Z"}],"abstract":[{"lang":"eng","text":"In recent years time domain speech separation has excelled over frequency domain separation in single channel scenarios and noise-free environments. In this paper we dissect the gains of the time-domain audio separation network (TasNet) approach by gradually replacing components of an utterance-level permutation invariant training (u-PIT) based separation system in the frequency domain until the TasNet system is reached, thus blending components of frequency domain approaches with those of time domain approaches. Some of the intermediate variants achieve comparable signal-to-distortion ratio (SDR) gains to TasNet, but retain the advantage of frequency domain processing: compatibility with classic signal processing tools such as frequency-domain beamforming and the human interpretability of the masks. Furthermore, we show that the scale invariant signal-to-distortion ratio (si-SDR) criterion used as loss function in TasNet is related to a logarithmic mean square error criterion and that it is this criterion which contributes most reliable to the performance advantage of TasNet. Finally, we critically assess which gains in a noise-free single channel environment generalize to more realistic reverberant conditions."}],"publication":"ICASSP 2020 Virtual Barcelona Spain","language":[{"iso":"eng"}],"keyword":["voice activity detection","speech activity detection","neural network","statistical speech processing"],"ddc":["000"],"year":"2020","quality_controlled":"1","title":"Demystifying TasNet: A Dissecting Approach","date_created":"2020-11-25T14:56:53Z"},{"abstract":[{"text":"Following the success of the 1st, 2nd, 3rd, 4th and 5th CHiME challenges we\r\norganize the 6th CHiME Speech Separation and Recognition Challenge (CHiME-6).\r\nThe new challenge revisits the previous CHiME-5 challenge and further considers\r\nthe problem of distant multi-microphone conversational speech diarization and\r\nrecognition in everyday home environments. Speech material is the same as the\r\nprevious CHiME-5 recordings except for accurate array synchronization. The\r\nmaterial was elicited using a dinner party scenario with efforts taken to\r\ncapture data that is representative of natural conversational speech. This\r\npaper provides a baseline description of the CHiME-6 challenge for both\r\nsegmented multispeaker speech recognition (Track 1) and unsegmented\r\nmultispeaker speech recognition (Track 2). Of note, Track 2 is the first\r\nchallenge activity in the community to tackle an unsegmented multispeaker\r\nspeech recognition scenario with a complete set of reproducible open source\r\nbaselines providing speech enhancement, speaker diarization, and speech\r\nrecognition modules.","lang":"eng"}],"status":"public","publication":"arXiv:2004.09249","type":"preprint","language":[{"iso":"eng"}],"_id":"28263","department":[{"_id":"54"}],"user_id":"40767","year":"2020","citation":{"chicago":"Watanabe, Shinji, Michael Mandel, Jon Barker, Emmanuel Vincent, Ashish Arora, Xuankai Chang, Sanjeev Khudanpur, et al. “CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings.” <i>ArXiv:2004.09249</i>, 2020.","ieee":"S. Watanabe <i>et al.</i>, “CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings,” <i>arXiv:2004.09249</i>. 2020.","ama":"Watanabe S, Mandel M, Barker J, et al. CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings. <i>arXiv:200409249</i>. Published online 2020.","bibtex":"@article{Watanabe_Mandel_Barker_Vincent_Arora_Chang_Khudanpur_Manohar_Povey_Raj_et al._2020, title={CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings}, journal={arXiv:2004.09249}, author={Watanabe, Shinji and Mandel, Michael and Barker, Jon and Vincent, Emmanuel and Arora, Ashish and Chang, Xuankai and Khudanpur, Sanjeev and Manohar, Vimal and Povey, Daniel and Raj, Desh and et al.}, year={2020} }","mla":"Watanabe, Shinji, et al. “CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings.” <i>ArXiv:2004.09249</i>, 2020.","short":"S. Watanabe, M. Mandel, J. Barker, E. Vincent, A. Arora, X. Chang, S. Khudanpur, V. Manohar, D. Povey, D. Raj, D. Snyder, A.S. Subramanian, J. Trmal, B.B. Yair, C. Boeddeker, Z. Ni, Y. Fujita, S. Horiguchi, N. Kanda, T. Yoshioka, N. Ryant, ArXiv:2004.09249 (2020).","apa":"Watanabe, S., Mandel, M., Barker, J., Vincent, E., Arora, A., Chang, X., Khudanpur, S., Manohar, V., Povey, D., Raj, D., Snyder, D., Subramanian, A. S., Trmal, J., Yair, B. B., Boeddeker, C., Ni, Z., Fujita, Y., Horiguchi, S., Kanda, N., … Ryant, N. (2020). CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings. In <i>arXiv:2004.09249</i>."},"title":"CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings","date_updated":"2022-01-13T08:34:37Z","date_created":"2021-12-03T12:13:01Z","author":[{"full_name":"Watanabe, Shinji","last_name":"Watanabe","first_name":"Shinji"},{"first_name":"Michael","full_name":"Mandel, Michael","last_name":"Mandel"},{"first_name":"Jon","last_name":"Barker","full_name":"Barker, Jon"},{"full_name":"Vincent, Emmanuel","last_name":"Vincent","first_name":"Emmanuel"},{"first_name":"Ashish","full_name":"Arora, Ashish","last_name":"Arora"},{"full_name":"Chang, Xuankai","last_name":"Chang","first_name":"Xuankai"},{"first_name":"Sanjeev","full_name":"Khudanpur, Sanjeev","last_name":"Khudanpur"},{"full_name":"Manohar, Vimal","last_name":"Manohar","first_name":"Vimal"},{"full_name":"Povey, Daniel","last_name":"Povey","first_name":"Daniel"},{"first_name":"Desh","last_name":"Raj","full_name":"Raj, Desh"},{"last_name":"Snyder","full_name":"Snyder, David","first_name":"David"},{"first_name":"Aswin Shanmugam","full_name":"Subramanian, Aswin Shanmugam","last_name":"Subramanian"},{"full_name":"Trmal, Jan","last_name":"Trmal","first_name":"Jan"},{"last_name":"Yair","full_name":"Yair, Bar Ben","first_name":"Bar Ben"},{"first_name":"Christoph","full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker"},{"full_name":"Ni, Zhaoheng","last_name":"Ni","first_name":"Zhaoheng"},{"first_name":"Yusuke","full_name":"Fujita, Yusuke","last_name":"Fujita"},{"full_name":"Horiguchi, Shota","last_name":"Horiguchi","first_name":"Shota"},{"first_name":"Naoyuki","last_name":"Kanda","full_name":"Kanda, Naoyuki"},{"last_name":"Yoshioka","full_name":"Yoshioka, Takuya","first_name":"Takuya"},{"last_name":"Ryant","full_name":"Ryant, Neville","first_name":"Neville"}]},{"language":[{"iso":"eng"}],"file_date_updated":"2020-12-11T12:33:04Z","ddc":["000"],"keyword":["voice activity detection","speech activity detection","neural network","statistical speech processing"],"user_id":"460","department":[{"_id":"54"}],"project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"20505","file":[{"content_type":"application/pdf","relation":"main_file","success":1,"creator":"jensheit","date_created":"2020-12-11T12:33:04Z","date_updated":"2020-12-11T12:33:04Z","file_name":"ms.pdf","access_level":"closed","file_id":"20697","file_size":998706}],"status":"public","abstract":[{"text":"Speech activity detection (SAD), which often rests on the fact that the noise is \"more'' stationary than speech, is particularly challenging in non-stationary environments, because the time variance of the acoustic scene makes it difficult to discriminate  speech from noise. We propose two approaches to SAD, where one is based on statistical signal processing, while the other utilizes neural networks. The former employs sophisticated signal processing to track the noise and speech energies and is meant to support the case for a resource efficient, unsupervised signal processing approach.\r\nThe latter introduces a recurrent network layer that operates on short segments of the input speech to do temporal smoothing in the presence of non-stationary noise. The systems are tested on the Fearless Steps challenge database, which consists of the transmission data from the Apollo-11 space mission.\r\nThe statistical SAD  achieves comparable detection performance to earlier proposed neural network based SADs, while the neural network based approach leads to a decision cost function of 1.07% on the evaluation set of the 2020 Fearless Steps Challenge, which sets a new state of the art.","lang":"eng"}],"type":"conference","publication":"INTERSPEECH 2020 Virtual Shanghai China","title":"Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments","author":[{"last_name":"Heitkaemper","full_name":"Heitkaemper, Jens","id":"27643","first_name":"Jens"},{"first_name":"Joerg","last_name":"Schmalenstroeer","id":"460","full_name":"Schmalenstroeer, Joerg"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_created":"2020-11-25T15:03:19Z","date_updated":"2023-10-26T08:28:49Z","citation":{"apa":"Heitkaemper, J., Schmalenstroeer, J., &#38; Haeb-Umbach, R. (2020). Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments. <i>INTERSPEECH 2020 Virtual Shanghai China</i>.","bibtex":"@inproceedings{Heitkaemper_Schmalenstroeer_Haeb-Umbach_2020, title={Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments}, booktitle={INTERSPEECH 2020 Virtual Shanghai China}, author={Heitkaemper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}, year={2020} }","short":"J. Heitkaemper, J. Schmalenstroeer, R. Haeb-Umbach, in: INTERSPEECH 2020 Virtual Shanghai China, 2020.","mla":"Heitkaemper, Jens, et al. “Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments.” <i>INTERSPEECH 2020 Virtual Shanghai China</i>, 2020.","ama":"Heitkaemper J, Schmalenstroeer J, Haeb-Umbach R. Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments. In: <i>INTERSPEECH 2020 Virtual Shanghai China</i>. ; 2020.","ieee":"J. Heitkaemper, J. Schmalenstroeer, and R. Haeb-Umbach, “Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments,” 2020.","chicago":"Heitkaemper, Jens, Joerg Schmalenstroeer, and Reinhold Haeb-Umbach. “Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments.” In <i>INTERSPEECH 2020 Virtual Shanghai China</i>, 2020."},"year":"2020","has_accepted_license":"1"},{"file_date_updated":"2020-12-16T14:09:48Z","department":[{"_id":"54"}],"user_id":"49870","_id":"20762","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"status":"public","type":"conference","doi":"10.1109/ICASSP40776.2020.9053461","author":[{"orcid":"https://orcid.org/0000-0002-7717-8670","last_name":"von Neumann","id":"49870","full_name":"von Neumann, Thilo","first_name":"Thilo"},{"last_name":"Kinoshita","full_name":"Kinoshita, Keisuke","first_name":"Keisuke"},{"last_name":"Drude","full_name":"Drude, Lukas","first_name":"Lukas"},{"id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker","first_name":"Christoph"},{"first_name":"Marc","full_name":"Delcroix, Marc","last_name":"Delcroix"},{"last_name":"Nakatani","full_name":"Nakatani, Tomohiro","first_name":"Tomohiro"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"}],"date_updated":"2023-11-15T12:17:45Z","oa":"1","page":"7004-7008","citation":{"apa":"von Neumann, T., Kinoshita, K., Drude, L., Boeddeker, C., Delcroix, M., Nakatani, T., &#38; Haeb-Umbach, R. (2020). End-to-End Training of Time Domain Audio Separation and Recognition. <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 7004–7008. <a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">https://doi.org/10.1109/ICASSP40776.2020.9053461</a>","mla":"von Neumann, Thilo, et al. “End-to-End Training of Time Domain Audio Separation and Recognition.” <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2020, pp. 7004–08, doi:<a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">10.1109/ICASSP40776.2020.9053461</a>.","short":"T. von Neumann, K. Kinoshita, L. Drude, C. Boeddeker, M. Delcroix, T. Nakatani, R. Haeb-Umbach, in: ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2020, pp. 7004–7008.","bibtex":"@inproceedings{von Neumann_Kinoshita_Drude_Boeddeker_Delcroix_Nakatani_Haeb-Umbach_2020, title={End-to-End Training of Time Domain Audio Separation and Recognition}, DOI={<a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">10.1109/ICASSP40776.2020.9053461</a>}, booktitle={ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={von Neumann, Thilo and Kinoshita, Keisuke and Drude, Lukas and Boeddeker, Christoph and Delcroix, Marc and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}, year={2020}, pages={7004–7008} }","ieee":"T. von Neumann <i>et al.</i>, “End-to-End Training of Time Domain Audio Separation and Recognition,” in <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2020, pp. 7004–7008, doi: <a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">10.1109/ICASSP40776.2020.9053461</a>.","chicago":"Neumann, Thilo von, Keisuke Kinoshita, Lukas Drude, Christoph Boeddeker, Marc Delcroix, Tomohiro Nakatani, and Reinhold Haeb-Umbach. “End-to-End Training of Time Domain Audio Separation and Recognition.” In <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 7004–8, 2020. <a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">https://doi.org/10.1109/ICASSP40776.2020.9053461</a>.","ama":"von Neumann T, Kinoshita K, Drude L, et al. End-to-End Training of Time Domain Audio Separation and Recognition. In: <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. ; 2020:7004-7008. doi:<a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">10.1109/ICASSP40776.2020.9053461</a>"},"has_accepted_license":"1","language":[{"iso":"eng"}],"ddc":["000"],"file":[{"relation":"main_file","content_type":"application/pdf","file_size":192529,"access_level":"open_access","file_id":"20763","file_name":"ICASSP_2020_vonNeumann_Paper.pdf","date_updated":"2020-12-16T14:09:48Z","creator":"huesera","date_created":"2020-12-16T14:09:48Z"}],"abstract":[{"text":"The rising interest in single-channel multi-speaker speech separation sparked development of End-to-End (E2E) approaches to multispeaker speech recognition. However, up until now, state-of-theart neural network–based time domain source separation has not yet been combined with E2E speech recognition. We here demonstrate how to combine a separation module based on a Convolutional Time domain Audio Separation Network (Conv-TasNet) with an E2E speech recognizer and how to train such a model jointly by distributing it over multiple GPUs or by approximating truncated back-propagation for the convolutional front-end. To put this work into perspective and illustrate the complexity of the design space, we provide a compact overview of single-channel multi-speaker recognition systems. Our experiments show a word error rate of 11.0% on WSJ0-2mix and indicate that our joint time domain model can yield substantial improvements over cascade DNN-HMM and monolithic E2E frequency domain systems proposed so far.","lang":"eng"}],"publication":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","title":"End-to-End Training of Time Domain Audio Separation and Recognition","date_created":"2020-12-16T14:07:54Z","year":"2020","quality_controlled":"1"},{"type":"conference","status":"public","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"20764","user_id":"49870","department":[{"_id":"54"}],"file_date_updated":"2020-12-16T14:14:14Z","has_accepted_license":"1","citation":{"bibtex":"@inproceedings{von Neumann_Boeddeker_Drude_Kinoshita_Delcroix_Nakatani_Haeb-Umbach_2020, title={Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR}, DOI={<a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">10.21437/Interspeech.2020-2519</a>}, booktitle={Proc. Interspeech 2020}, author={von Neumann, Thilo and Boeddeker, Christoph and Drude, Lukas and Kinoshita, Keisuke and Delcroix, Marc and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}, year={2020}, pages={3097–3101} }","short":"T. von Neumann, C. Boeddeker, L. Drude, K. Kinoshita, M. Delcroix, T. Nakatani, R. Haeb-Umbach, in: Proc. Interspeech 2020, 2020, pp. 3097–3101.","mla":"von Neumann, Thilo, et al. “Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR.” <i>Proc. Interspeech 2020</i>, 2020, pp. 3097–101, doi:<a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">10.21437/Interspeech.2020-2519</a>.","apa":"von Neumann, T., Boeddeker, C., Drude, L., Kinoshita, K., Delcroix, M., Nakatani, T., &#38; Haeb-Umbach, R. (2020). Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR. <i>Proc. Interspeech 2020</i>, 3097–3101. <a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">https://doi.org/10.21437/Interspeech.2020-2519</a>","chicago":"Neumann, Thilo von, Christoph Boeddeker, Lukas Drude, Keisuke Kinoshita, Marc Delcroix, Tomohiro Nakatani, and Reinhold Haeb-Umbach. “Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR.” In <i>Proc. Interspeech 2020</i>, 3097–3101, 2020. <a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">https://doi.org/10.21437/Interspeech.2020-2519</a>.","ieee":"T. von Neumann <i>et al.</i>, “Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR,” in <i>Proc. Interspeech 2020</i>, 2020, pp. 3097–3101, doi: <a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">10.21437/Interspeech.2020-2519</a>.","ama":"von Neumann T, Boeddeker C, Drude L, et al. Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR. In: <i>Proc. Interspeech 2020</i>. ; 2020:3097-3101. doi:<a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">10.21437/Interspeech.2020-2519</a>"},"page":"3097-3101","oa":"1","date_updated":"2023-11-15T12:17:57Z","author":[{"first_name":"Thilo","orcid":"https://orcid.org/0000-0002-7717-8670","last_name":"von Neumann","id":"49870","full_name":"von Neumann, Thilo"},{"full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker","first_name":"Christoph"},{"last_name":"Drude","full_name":"Drude, Lukas","first_name":"Lukas"},{"last_name":"Kinoshita","full_name":"Kinoshita, Keisuke","first_name":"Keisuke"},{"last_name":"Delcroix","full_name":"Delcroix, Marc","first_name":"Marc"},{"last_name":"Nakatani","full_name":"Nakatani, Tomohiro","first_name":"Tomohiro"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"doi":"10.21437/Interspeech.2020-2519","publication":"Proc. Interspeech 2020","abstract":[{"text":"Most approaches to multi-talker overlapped speech separation and recognition assume that the number of simultaneously active speakers is given, but in realistic situations, it is typically unknown. To cope with this, we extend an iterative speech extraction system with mechanisms to count the number of sources and combine it with a single-talker speech recognizer to form the first end-to-end multi-talker automatic speech recognition system for an unknown number of active speakers. Our experiments show very promising performance in counting accuracy, source separation and speech recognition on simulated clean mixtures from WSJ0-2mix and WSJ0-3mix. Among others, we set a new state-of-the-art word error rate on the WSJ0-2mix database. Furthermore, our system generalizes well to a larger number of speakers than it ever saw during training, as shown in experiments with the WSJ0-4mix database. ","lang":"eng"}],"file":[{"relation":"main_file","content_type":"application/pdf","file_size":267893,"file_name":"INTERSPEECH_2020_vonNeumann_Paper.pdf","file_id":"20765","access_level":"open_access","date_updated":"2020-12-16T14:14:14Z","date_created":"2020-12-16T14:14:14Z","creator":"huesera"}],"ddc":["000"],"language":[{"iso":"eng"}],"quality_controlled":"1","year":"2020","date_created":"2020-12-16T14:12:45Z","title":"Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR"},{"citation":{"bibtex":"@inproceedings{Gburrek_Schmalenstroeer_Brendel_Kellermann_Haeb-Umbach_2020, title={Deep Neural Network based Distance Estimation for Geometry Calibration in Acoustic Sensor Network}, booktitle={European Signal Processing Conference (EUSIPCO)}, author={Gburrek, Tobias and Schmalenstroeer, Joerg and Brendel, Andreas and Kellermann, Walter and Haeb-Umbach, Reinhold}, year={2020} }","short":"T. Gburrek, J. Schmalenstroeer, A. Brendel, W. Kellermann, R. Haeb-Umbach, in: European Signal Processing Conference (EUSIPCO), 2020.","mla":"Gburrek, Tobias, et al. “Deep Neural Network Based Distance Estimation for Geometry Calibration in Acoustic Sensor Network.” <i>European Signal Processing Conference (EUSIPCO)</i>, 2020.","apa":"Gburrek, T., Schmalenstroeer, J., Brendel, A., Kellermann, W., &#38; Haeb-Umbach, R. (2020). Deep Neural Network based Distance Estimation for Geometry Calibration in Acoustic Sensor Network. <i>European Signal Processing Conference (EUSIPCO)</i>.","ama":"Gburrek T, Schmalenstroeer J, Brendel A, Kellermann W, Haeb-Umbach R. Deep Neural Network based Distance Estimation for Geometry Calibration in Acoustic Sensor Network. In: <i>European Signal Processing Conference (EUSIPCO)</i>. ; 2020.","chicago":"Gburrek, Tobias, Joerg Schmalenstroeer, Andreas Brendel, Walter Kellermann, and Reinhold Haeb-Umbach. “Deep Neural Network Based Distance Estimation for Geometry Calibration in Acoustic Sensor Network.” In <i>European Signal Processing Conference (EUSIPCO)</i>, 2020.","ieee":"T. Gburrek, J. Schmalenstroeer, A. Brendel, W. Kellermann, and R. Haeb-Umbach, “Deep Neural Network based Distance Estimation for Geometry Calibration in Acoustic Sensor Network,” 2020."},"year":"2020","has_accepted_license":"1","quality_controlled":"1","title":"Deep Neural Network based Distance Estimation for Geometry Calibration in Acoustic Sensor Network","date_created":"2020-08-31T07:20:57Z","author":[{"full_name":"Gburrek, Tobias","id":"44006","last_name":"Gburrek","first_name":"Tobias"},{"first_name":"Joerg","id":"460","full_name":"Schmalenstroeer, Joerg","last_name":"Schmalenstroeer"},{"first_name":"Andreas","full_name":"Brendel, Andreas","last_name":"Brendel"},{"last_name":"Kellermann","full_name":"Kellermann, Walter","first_name":"Walter"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_updated":"2023-11-17T06:23:39Z","oa":"1","file":[{"file_size":292159,"file_id":"48987","access_level":"open_access","file_name":"Gburrek2020.pdf","date_updated":"2023-11-17T06:21:40Z","creator":"tgburrek","date_created":"2023-11-17T06:21:40Z","relation":"main_file","content_type":"application/pdf"}],"status":"public","abstract":[{"lang":"eng","text":"We present an approach to deep neural network based (DNN-based) distance estimation in reverberant rooms for supporting geometry calibration tasks in wireless acoustic sensor networks. Signal diffuseness information from acoustic signals is aggregated via the coherent-to-diffuse power ratio to obtain a distance-related feature, which is mapped to a source-to-microphone distance estimate by means of a DNN. This information is then combined with direction-of-arrival estimates from compact microphone arrays to infer the geometry of the sensor network. Unlike many other approaches to geometry calibration, the proposed scheme does only require that the sampling clocks of the sensor nodes are roughly synchronized. In simulations we show that the proposed DNN-based distance estimator generalizes to unseen acoustic environments and that precise estimates of the sensor node positions are obtained. "}],"type":"conference","publication":"European Signal Processing Conference (EUSIPCO)","language":[{"iso":"eng"}],"file_date_updated":"2023-11-17T06:21:40Z","ddc":["004"],"user_id":"44006","department":[{"_id":"54"}],"_id":"18651"},{"date_created":"2020-12-16T14:15:24Z","title":"Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and its Application to Speaker Stream Separation","quality_controlled":"1","year":"2020","ddc":["000"],"language":[{"iso":"eng"}],"publication":"Proc. Interspeech 2020","abstract":[{"text":"Recently, the source separation performance was greatly improved by time-domain audio source separation based on dual-path recurrent neural network (DPRNN). DPRNN is a simple but effective model for a long sequential data. While DPRNN is quite efficient in modeling a sequential data of the length of an utterance, i.e., about 5 to 10 second data, it is harder to apply it to longer sequences such as whole conversations consisting of multiple utterances. It is simply because, in such a case, the number of time steps consumed by its internal module called inter-chunk RNN becomes extremely large. To mitigate this problem, this paper proposes a multi-path RNN (MPRNN), a generalized version of DPRNN, that models the input data in a hierarchical manner. In the MPRNN framework, the input data is represented at several (>_ 3) time-resolutions, each of which is modeled by a specific RNN sub-module. For example, the RNN sub-module that deals with the finest resolution may model temporal relationship only within a phoneme, while the RNN sub-module handling the most coarse resolution may capture only the relationship between utterances such as speaker information. We perform experiments using simulated dialogue-like mixtures and show that MPRNN has greater model capacity, and it outperforms the current state-of-the-art DPRNN framework especially in online processing scenarios.","lang":"eng"}],"file":[{"relation":"main_file","content_type":"application/pdf","file_size":1725219,"access_level":"open_access","file_id":"20767","file_name":"INTERSPEECH_2020_vonNeumann1_Paper.pdf","date_updated":"2020-12-16T14:16:32Z","date_created":"2020-12-16T14:16:32Z","creator":"huesera"}],"date_updated":"2023-11-15T12:14:25Z","oa":"1","author":[{"last_name":"Kinoshita","full_name":"Kinoshita, Keisuke","first_name":"Keisuke"},{"orcid":"https://orcid.org/0000-0002-7717-8670","last_name":"von Neumann","full_name":"von Neumann, Thilo","id":"49870","first_name":"Thilo"},{"first_name":"Marc","full_name":"Delcroix, Marc","last_name":"Delcroix"},{"last_name":"Nakatani","full_name":"Nakatani, Tomohiro","first_name":"Tomohiro"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"doi":"10.21437/Interspeech.2020-2388","has_accepted_license":"1","citation":{"ieee":"K. Kinoshita, T. von Neumann, M. Delcroix, T. Nakatani, and R. Haeb-Umbach, “Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and its Application to Speaker Stream Separation,” in <i>Proc. Interspeech 2020</i>, 2020, pp. 2652–2656, doi: <a href=\"https://doi.org/10.21437/Interspeech.2020-2388\">10.21437/Interspeech.2020-2388</a>.","chicago":"Kinoshita, Keisuke, Thilo von Neumann, Marc Delcroix, Tomohiro Nakatani, and Reinhold Haeb-Umbach. “Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and Its Application to Speaker Stream Separation.” In <i>Proc. Interspeech 2020</i>, 2652–56, 2020. <a href=\"https://doi.org/10.21437/Interspeech.2020-2388\">https://doi.org/10.21437/Interspeech.2020-2388</a>.","ama":"Kinoshita K, von Neumann T, Delcroix M, Nakatani T, Haeb-Umbach R. Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and its Application to Speaker Stream Separation. In: <i>Proc. Interspeech 2020</i>. ; 2020:2652-2656. doi:<a href=\"https://doi.org/10.21437/Interspeech.2020-2388\">10.21437/Interspeech.2020-2388</a>","apa":"Kinoshita, K., von Neumann, T., Delcroix, M., Nakatani, T., &#38; Haeb-Umbach, R. (2020). Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and its Application to Speaker Stream Separation. <i>Proc. Interspeech 2020</i>, 2652–2656. <a href=\"https://doi.org/10.21437/Interspeech.2020-2388\">https://doi.org/10.21437/Interspeech.2020-2388</a>","mla":"Kinoshita, Keisuke, et al. “Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and Its Application to Speaker Stream Separation.” <i>Proc. Interspeech 2020</i>, 2020, pp. 2652–56, doi:<a href=\"https://doi.org/10.21437/Interspeech.2020-2388\">10.21437/Interspeech.2020-2388</a>.","short":"K. Kinoshita, T. von Neumann, M. Delcroix, T. Nakatani, R. Haeb-Umbach, in: Proc. Interspeech 2020, 2020, pp. 2652–2656.","bibtex":"@inproceedings{Kinoshita_von Neumann_Delcroix_Nakatani_Haeb-Umbach_2020, title={Multi-Path RNN for Hierarchical Modeling of Long Sequential Data and its Application to Speaker Stream Separation}, DOI={<a href=\"https://doi.org/10.21437/Interspeech.2020-2388\">10.21437/Interspeech.2020-2388</a>}, booktitle={Proc. Interspeech 2020}, author={Kinoshita, Keisuke and von Neumann, Thilo and Delcroix, Marc and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}, year={2020}, pages={2652–2656} }"},"page":"2652-2656","_id":"20766","user_id":"49870","department":[{"_id":"54"}],"file_date_updated":"2020-12-16T14:16:32Z","type":"conference","status":"public"}]
