[{"status":"public","publication":"Interspeech 2022","type":"conference","language":[{"iso":"eng"}],"_id":"33954","project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"},{"grant_number":"448568305","_id":"508","name":"Automatische Transkription von Gesprächssituationen"}],"department":[{"_id":"54"}],"user_id":"40767","year":"2022","citation":{"short":"C. Boeddeker, T. Cord-Landwehr, T. von Neumann, R. Haeb-Umbach, in: Interspeech 2022, ISCA, 2022.","bibtex":"@inproceedings{Boeddeker_Cord-Landwehr_von Neumann_Haeb-Umbach_2022, title={An Initialization Scheme for Meeting Separation with Spatial Mixture Models}, DOI={<a href=\"https://doi.org/10.21437/interspeech.2022-10929\">10.21437/interspeech.2022-10929</a>}, booktitle={Interspeech 2022}, publisher={ISCA}, author={Boeddeker, Christoph and Cord-Landwehr, Tobias and von Neumann, Thilo and Haeb-Umbach, Reinhold}, year={2022} }","mla":"Boeddeker, Christoph, et al. “An Initialization Scheme for Meeting Separation with Spatial Mixture Models.” <i>Interspeech 2022</i>, ISCA, 2022, doi:<a href=\"https://doi.org/10.21437/interspeech.2022-10929\">10.21437/interspeech.2022-10929</a>.","apa":"Boeddeker, C., Cord-Landwehr, T., von Neumann, T., &#38; Haeb-Umbach, R. (2022). An Initialization Scheme for Meeting Separation with Spatial Mixture Models. <i>Interspeech 2022</i>. <a href=\"https://doi.org/10.21437/interspeech.2022-10929\">https://doi.org/10.21437/interspeech.2022-10929</a>","ama":"Boeddeker C, Cord-Landwehr T, von Neumann T, Haeb-Umbach R. An Initialization Scheme for Meeting Separation with Spatial Mixture Models. In: <i>Interspeech 2022</i>. ISCA; 2022. doi:<a href=\"https://doi.org/10.21437/interspeech.2022-10929\">10.21437/interspeech.2022-10929</a>","chicago":"Boeddeker, Christoph, Tobias Cord-Landwehr, Thilo von Neumann, and Reinhold Haeb-Umbach. “An Initialization Scheme for Meeting Separation with Spatial Mixture Models.” In <i>Interspeech 2022</i>. ISCA, 2022. <a href=\"https://doi.org/10.21437/interspeech.2022-10929\">https://doi.org/10.21437/interspeech.2022-10929</a>.","ieee":"C. Boeddeker, T. Cord-Landwehr, T. von Neumann, and R. Haeb-Umbach, “An Initialization Scheme for Meeting Separation with Spatial Mixture Models,” 2022, doi: <a href=\"https://doi.org/10.21437/interspeech.2022-10929\">10.21437/interspeech.2022-10929</a>."},"publication_status":"published","title":"An Initialization Scheme for Meeting Separation with Spatial Mixture Models","doi":"10.21437/interspeech.2022-10929","main_file_link":[{"open_access":"1","url":"https://www.isca-archive.org/interspeech_2022/boeddeker22_interspeech.pdf"}],"date_updated":"2025-02-12T09:06:56Z","oa":"1","publisher":"ISCA","date_created":"2022-10-28T10:53:56Z","author":[{"first_name":"Christoph","last_name":"Boeddeker","full_name":"Boeddeker, Christoph","id":"40767"},{"last_name":"Cord-Landwehr","full_name":"Cord-Landwehr, Tobias","id":"44393","first_name":"Tobias"},{"last_name":"von Neumann","orcid":"https://orcid.org/0000-0002-7717-8670","id":"49870","full_name":"von Neumann, Thilo","first_name":"Thilo"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}]},{"year":"2022","citation":{"ama":"Kinoshita K, von Neumann T, Delcroix M, Boeddeker C, Haeb-Umbach R. Utterance-by-utterance overlap-aware neural diarization with Graph-PIT. In: <i>Proc. Interspeech 2022</i>. ISCA; 2022:1486-1490. doi:<a href=\"https://doi.org/10.21437/Interspeech.2022-11408\">10.21437/Interspeech.2022-11408</a>","ieee":"K. Kinoshita, T. von Neumann, M. Delcroix, C. Boeddeker, and R. Haeb-Umbach, “Utterance-by-utterance overlap-aware neural diarization with Graph-PIT,” in <i>Proc. Interspeech 2022</i>, 2022, pp. 1486–1490, doi: <a href=\"https://doi.org/10.21437/Interspeech.2022-11408\">10.21437/Interspeech.2022-11408</a>.","chicago":"Kinoshita, Keisuke, Thilo von Neumann, Marc Delcroix, Christoph Boeddeker, and Reinhold Haeb-Umbach. “Utterance-by-Utterance Overlap-Aware Neural Diarization with Graph-PIT.” In <i>Proc. Interspeech 2022</i>, 1486–90. ISCA, 2022. <a href=\"https://doi.org/10.21437/Interspeech.2022-11408\">https://doi.org/10.21437/Interspeech.2022-11408</a>.","apa":"Kinoshita, K., von Neumann, T., Delcroix, M., Boeddeker, C., &#38; Haeb-Umbach, R. (2022). Utterance-by-utterance overlap-aware neural diarization with Graph-PIT. <i>Proc. Interspeech 2022</i>, 1486–1490. <a href=\"https://doi.org/10.21437/Interspeech.2022-11408\">https://doi.org/10.21437/Interspeech.2022-11408</a>","bibtex":"@inproceedings{Kinoshita_von Neumann_Delcroix_Boeddeker_Haeb-Umbach_2022, title={Utterance-by-utterance overlap-aware neural diarization with Graph-PIT}, DOI={<a href=\"https://doi.org/10.21437/Interspeech.2022-11408\">10.21437/Interspeech.2022-11408</a>}, booktitle={Proc. Interspeech 2022}, publisher={ISCA}, author={Kinoshita, Keisuke and von Neumann, Thilo and Delcroix, Marc and Boeddeker, Christoph and Haeb-Umbach, Reinhold}, year={2022}, pages={1486–1490} }","short":"K. Kinoshita, T. von Neumann, M. Delcroix, C. Boeddeker, R. Haeb-Umbach, in: Proc. Interspeech 2022, ISCA, 2022, pp. 1486–1490.","mla":"Kinoshita, Keisuke, et al. “Utterance-by-Utterance Overlap-Aware Neural Diarization with Graph-PIT.” <i>Proc. Interspeech 2022</i>, ISCA, 2022, pp. 1486–90, doi:<a href=\"https://doi.org/10.21437/Interspeech.2022-11408\">10.21437/Interspeech.2022-11408</a>."},"page":"1486-1490","publication_status":"published","quality_controlled":"1","title":"Utterance-by-utterance overlap-aware neural diarization with Graph-PIT","main_file_link":[{"url":"https://www.isca-archive.org/interspeech_2022/kinoshita22_interspeech.pdf"}],"conference":{"name":"Interspeech 2022"},"doi":"10.21437/Interspeech.2022-11408","publisher":"ISCA","date_updated":"2025-02-12T09:09:05Z","author":[{"first_name":"Keisuke","full_name":"Kinoshita, Keisuke","last_name":"Kinoshita"},{"first_name":"Thilo","id":"49870","full_name":"von Neumann, Thilo","orcid":"https://orcid.org/0000-0002-7717-8670","last_name":"von Neumann"},{"last_name":"Delcroix","full_name":"Delcroix, Marc","first_name":"Marc"},{"first_name":"Christoph","full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_created":"2022-10-28T12:07:57Z","abstract":[{"text":"Recent speaker diarization studies showed that integration of end-to-end neural diarization (EEND) and clustering-based diarization is a promising approach for achieving state-of-the-art performance on various tasks. Such an approach first divides an observed signal into fixed-length segments, then performs {\\it segment-level} local diarization based on an EEND module, and merges the segment-level results via clustering to form a final global diarization result. The segmentation is done to limit the number of speakers in each segment since the current EEND cannot handle a large number of speakers. In this paper, we argue that such an approach involving the segmentation has several issues; for example, it inevitably faces a dilemma that larger segment sizes increase both the context available for enhancing the performance and the number of speakers for the local EEND module to handle. To resolve such a problem, this paper proposes a novel framework that performs diarization without segmentation. However, it can still handle challenging data containing many speakers and a significant amount of overlapping speech. The proposed method can take an entire meeting for inference and perform {\\it utterance-by-utterance} diarization that clusters utterance activities in terms of speakers. To this end, we leverage a neural network training scheme called Graph-PIT proposed recently for neural source separation. Experiments with simulated active-meeting-like data and CALLHOME data show the superiority of the proposed approach over the conventional methods.","lang":"eng"}],"status":"public","type":"conference","publication":"Proc. Interspeech 2022","language":[{"iso":"eng"}],"_id":"33958","user_id":"40767","department":[{"_id":"54"}]},{"author":[{"full_name":"Zhang, Wangyou","last_name":"Zhang","first_name":"Wangyou"},{"first_name":"Christoph","full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker"},{"first_name":"Shinji","last_name":"Watanabe","full_name":"Watanabe, Shinji"},{"full_name":"Nakatani, Tomohiro","last_name":"Nakatani","first_name":"Tomohiro"},{"first_name":"Marc","last_name":"Delcroix","full_name":"Delcroix, Marc"},{"full_name":"Kinoshita, Keisuke","last_name":"Kinoshita","first_name":"Keisuke"},{"first_name":"Tsubasa","full_name":"Ochiai, Tsubasa","last_name":"Ochiai"},{"first_name":"Naoyuki","last_name":"Kamo","full_name":"Kamo, Naoyuki"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"},{"first_name":"Yanmin","last_name":"Qian","full_name":"Qian, Yanmin"}],"date_created":"2021-12-03T11:31:42Z","date_updated":"2022-01-13T08:31:27Z","doi":"10.1109/icassp39728.2021.9414464","title":"End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend","publication_status":"published","citation":{"apa":"Zhang, W., Boeddeker, C., Watanabe, S., Nakatani, T., Delcroix, M., Kinoshita, K., Ochiai, T., Kamo, N., Haeb-Umbach, R., &#38; Qian, Y. (2021). End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend. <i>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. <a href=\"https://doi.org/10.1109/icassp39728.2021.9414464\">https://doi.org/10.1109/icassp39728.2021.9414464</a>","mla":"Zhang, Wangyou, et al. “End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend.” <i>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2021, doi:<a href=\"https://doi.org/10.1109/icassp39728.2021.9414464\">10.1109/icassp39728.2021.9414464</a>.","short":"W. Zhang, C. Boeddeker, S. Watanabe, T. Nakatani, M. Delcroix, K. Kinoshita, T. Ochiai, N. Kamo, R. Haeb-Umbach, Y. Qian, in: ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021.","bibtex":"@inproceedings{Zhang_Boeddeker_Watanabe_Nakatani_Delcroix_Kinoshita_Ochiai_Kamo_Haeb-Umbach_Qian_2021, title={End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend}, DOI={<a href=\"https://doi.org/10.1109/icassp39728.2021.9414464\">10.1109/icassp39728.2021.9414464</a>}, booktitle={ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={Zhang, Wangyou and Boeddeker, Christoph and Watanabe, Shinji and Nakatani, Tomohiro and Delcroix, Marc and Kinoshita, Keisuke and Ochiai, Tsubasa and Kamo, Naoyuki and Haeb-Umbach, Reinhold and Qian, Yanmin}, year={2021} }","chicago":"Zhang, Wangyou, Christoph Boeddeker, Shinji Watanabe, Tomohiro Nakatani, Marc Delcroix, Keisuke Kinoshita, Tsubasa Ochiai, Naoyuki Kamo, Reinhold Haeb-Umbach, and Yanmin Qian. “End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend.” In <i>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2021. <a href=\"https://doi.org/10.1109/icassp39728.2021.9414464\">https://doi.org/10.1109/icassp39728.2021.9414464</a>.","ieee":"W. Zhang <i>et al.</i>, “End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend,” 2021, doi: <a href=\"https://doi.org/10.1109/icassp39728.2021.9414464\">10.1109/icassp39728.2021.9414464</a>.","ama":"Zhang W, Boeddeker C, Watanabe S, et al. End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend. In: <i>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. ; 2021. doi:<a href=\"https://doi.org/10.1109/icassp39728.2021.9414464\">10.1109/icassp39728.2021.9414464</a>"},"year":"2021","department":[{"_id":"54"}],"user_id":"40767","_id":"28256","language":[{"iso":"eng"}],"publication":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","type":"conference","status":"public"},{"language":[{"iso":"eng"}],"user_id":"40767","department":[{"_id":"54"}],"_id":"28262","status":"public","type":"conference","publication":"2021 IEEE Spoken Language Technology Workshop (SLT)","doi":"10.1109/slt48900.2021.9383615","title":"ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration","date_created":"2021-12-03T12:07:35Z","author":[{"last_name":"Li","full_name":"Li, Chenda","first_name":"Chenda"},{"first_name":"Jing","last_name":"Shi","full_name":"Shi, Jing"},{"first_name":"Wangyou","last_name":"Zhang","full_name":"Zhang, Wangyou"},{"first_name":"Aswin Shanmugam","last_name":"Subramanian","full_name":"Subramanian, Aswin Shanmugam"},{"first_name":"Xuankai","full_name":"Chang, Xuankai","last_name":"Chang"},{"first_name":"Naoyuki","full_name":"Kamo, Naoyuki","last_name":"Kamo"},{"first_name":"Moto","last_name":"Hira","full_name":"Hira, Moto"},{"last_name":"Hayashi","full_name":"Hayashi, Tomoki","first_name":"Tomoki"},{"id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker","first_name":"Christoph"},{"first_name":"Zhuo","last_name":"Chen","full_name":"Chen, Zhuo"},{"first_name":"Shinji","last_name":"Watanabe","full_name":"Watanabe, Shinji"}],"date_updated":"2022-01-13T08:34:25Z","citation":{"ama":"Li C, Shi J, Zhang W, et al. ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration. In: <i>2021 IEEE Spoken Language Technology Workshop (SLT)</i>. ; 2021. doi:<a href=\"https://doi.org/10.1109/slt48900.2021.9383615\">10.1109/slt48900.2021.9383615</a>","chicago":"Li, Chenda, Jing Shi, Wangyou Zhang, Aswin Shanmugam Subramanian, Xuankai Chang, Naoyuki Kamo, Moto Hira, et al. “ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration.” In <i>2021 IEEE Spoken Language Technology Workshop (SLT)</i>, 2021. <a href=\"https://doi.org/10.1109/slt48900.2021.9383615\">https://doi.org/10.1109/slt48900.2021.9383615</a>.","ieee":"C. Li <i>et al.</i>, “ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration,” 2021, doi: <a href=\"https://doi.org/10.1109/slt48900.2021.9383615\">10.1109/slt48900.2021.9383615</a>.","apa":"Li, C., Shi, J., Zhang, W., Subramanian, A. S., Chang, X., Kamo, N., Hira, M., Hayashi, T., Boeddeker, C., Chen, Z., &#38; Watanabe, S. (2021). ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration. <i>2021 IEEE Spoken Language Technology Workshop (SLT)</i>. <a href=\"https://doi.org/10.1109/slt48900.2021.9383615\">https://doi.org/10.1109/slt48900.2021.9383615</a>","short":"C. Li, J. Shi, W. Zhang, A.S. Subramanian, X. Chang, N. Kamo, M. Hira, T. Hayashi, C. Boeddeker, Z. Chen, S. Watanabe, in: 2021 IEEE Spoken Language Technology Workshop (SLT), 2021.","bibtex":"@inproceedings{Li_Shi_Zhang_Subramanian_Chang_Kamo_Hira_Hayashi_Boeddeker_Chen_et al._2021, title={ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration}, DOI={<a href=\"https://doi.org/10.1109/slt48900.2021.9383615\">10.1109/slt48900.2021.9383615</a>}, booktitle={2021 IEEE Spoken Language Technology Workshop (SLT)}, author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and et al.}, year={2021} }","mla":"Li, Chenda, et al. “ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration.” <i>2021 IEEE Spoken Language Technology Workshop (SLT)</i>, 2021, doi:<a href=\"https://doi.org/10.1109/slt48900.2021.9383615\">10.1109/slt48900.2021.9383615</a>."},"year":"2021","publication_status":"published"},{"type":"conference","publication":"2021 IEEE Spoken Language Technology Workshop (SLT)","status":"public","_id":"28261","user_id":"40767","department":[{"_id":"54"}],"language":[{"iso":"eng"}],"publication_status":"published","year":"2021","citation":{"chicago":"Li, Chenda, Yi Luo, Cong Han, Jinyu Li, Takuya Yoshioka, Tianyan Zhou, Marc Delcroix, et al. “Dual-Path RNN for Long Recording Speech Separation.” In <i>2021 IEEE Spoken Language Technology Workshop (SLT)</i>, 2021. <a href=\"https://doi.org/10.1109/slt48900.2021.9383514\">https://doi.org/10.1109/slt48900.2021.9383514</a>.","ieee":"C. Li <i>et al.</i>, “Dual-Path RNN for Long Recording Speech Separation,” 2021, doi: <a href=\"https://doi.org/10.1109/slt48900.2021.9383514\">10.1109/slt48900.2021.9383514</a>.","ama":"Li C, Luo Y, Han C, et al. Dual-Path RNN for Long Recording Speech Separation. In: <i>2021 IEEE Spoken Language Technology Workshop (SLT)</i>. ; 2021. doi:<a href=\"https://doi.org/10.1109/slt48900.2021.9383514\">10.1109/slt48900.2021.9383514</a>","apa":"Li, C., Luo, Y., Han, C., Li, J., Yoshioka, T., Zhou, T., Delcroix, M., Kinoshita, K., Boeddeker, C., Qian, Y., Watanabe, S., &#38; Chen, Z. (2021). Dual-Path RNN for Long Recording Speech Separation. <i>2021 IEEE Spoken Language Technology Workshop (SLT)</i>. <a href=\"https://doi.org/10.1109/slt48900.2021.9383514\">https://doi.org/10.1109/slt48900.2021.9383514</a>","mla":"Li, Chenda, et al. “Dual-Path RNN for Long Recording Speech Separation.” <i>2021 IEEE Spoken Language Technology Workshop (SLT)</i>, 2021, doi:<a href=\"https://doi.org/10.1109/slt48900.2021.9383514\">10.1109/slt48900.2021.9383514</a>.","short":"C. Li, Y. Luo, C. Han, J. Li, T. Yoshioka, T. Zhou, M. Delcroix, K. Kinoshita, C. Boeddeker, Y. Qian, S. Watanabe, Z. Chen, in: 2021 IEEE Spoken Language Technology Workshop (SLT), 2021.","bibtex":"@inproceedings{Li_Luo_Han_Li_Yoshioka_Zhou_Delcroix_Kinoshita_Boeddeker_Qian_et al._2021, title={Dual-Path RNN for Long Recording Speech Separation}, DOI={<a href=\"https://doi.org/10.1109/slt48900.2021.9383514\">10.1109/slt48900.2021.9383514</a>}, booktitle={2021 IEEE Spoken Language Technology Workshop (SLT)}, author={Li, Chenda and Luo, Yi and Han, Cong and Li, Jinyu and Yoshioka, Takuya and Zhou, Tianyan and Delcroix, Marc and Kinoshita, Keisuke and Boeddeker, Christoph and Qian, Yanmin and et al.}, year={2021} }"},"date_updated":"2022-01-13T08:34:07Z","date_created":"2021-12-03T12:07:03Z","author":[{"first_name":"Chenda","full_name":"Li, Chenda","last_name":"Li"},{"full_name":"Luo, Yi","last_name":"Luo","first_name":"Yi"},{"last_name":"Han","full_name":"Han, Cong","first_name":"Cong"},{"full_name":"Li, Jinyu","last_name":"Li","first_name":"Jinyu"},{"first_name":"Takuya","last_name":"Yoshioka","full_name":"Yoshioka, Takuya"},{"first_name":"Tianyan","full_name":"Zhou, Tianyan","last_name":"Zhou"},{"full_name":"Delcroix, Marc","last_name":"Delcroix","first_name":"Marc"},{"full_name":"Kinoshita, Keisuke","last_name":"Kinoshita","first_name":"Keisuke"},{"id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker","first_name":"Christoph"},{"first_name":"Yanmin","last_name":"Qian","full_name":"Qian, Yanmin"},{"last_name":"Watanabe","full_name":"Watanabe, Shinji","first_name":"Shinji"},{"first_name":"Zhuo","last_name":"Chen","full_name":"Chen, Zhuo"}],"title":"Dual-Path RNN for Long Recording Speech Separation","doi":"10.1109/slt48900.2021.9383514"},{"date_created":"2023-05-15T07:59:33Z","title":"A Comparison and Combination of Unsupervised Blind Source Separation  Techniques","year":"2021","external_id":{"arxiv":["2106.05627"]},"ddc":["000"],"language":[{"iso":"eng"}],"publication":"ITG Conference on Speech Communication","abstract":[{"lang":"eng","text":"Unsupervised blind source separation methods do not require a training phase\r\nand thus cannot suffer from a train-test mismatch, which is a common concern in\r\nneural network based source separation. The unsupervised techniques can be\r\ncategorized in two classes, those building upon the sparsity of speech in the\r\nShort-Time Fourier transform domain and those exploiting non-Gaussianity or\r\nnon-stationarity of the source signals. In this contribution, spatial mixture\r\nmodels which fall in the first category and independent vector analysis (IVA)\r\nas a representative of the second category are compared w.r.t. their separation\r\nperformance and the performance of a downstream speech recognizer on a\r\nreverberant dataset of reasonable size. Furthermore, we introduce a serial\r\nconcatenation of the two, where the result of the mixture model serves as\r\ninitialization of IVA, which achieves significantly better WER performance than\r\neach algorithm individually and even approaches the performance of a much more\r\ncomplex neural network based technique."}],"file":[{"file_size":295972,"file_id":"44856","file_name":"2106.05627.pdf","access_level":"open_access","date_updated":"2023-11-15T15:29:32Z","date_created":"2023-05-16T08:37:31Z","creator":"frra","relation":"main_file","content_type":"application/pdf"}],"oa":"1","date_updated":"2023-11-15T15:29:32Z","author":[{"full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker","first_name":"Christoph"},{"first_name":"Frederik","id":"72602","full_name":"Rautenberg, Frederik","last_name":"Rautenberg"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"conference":{"location":"Kiel","name":"ITG Conference on Speech Communication"},"main_file_link":[{"url":"https://arxiv.org/pdf/2106.05627.pdf","open_access":"1"}],"has_accepted_license":"1","citation":{"ama":"Boeddeker C, Rautenberg F, Haeb-Umbach R. A Comparison and Combination of Unsupervised Blind Source Separation  Techniques. In: <i>ITG Conference on Speech Communication</i>. ; 2021.","chicago":"Boeddeker, Christoph, Frederik Rautenberg, and Reinhold Haeb-Umbach. “A Comparison and Combination of Unsupervised Blind Source Separation  Techniques.” In <i>ITG Conference on Speech Communication</i>, 2021.","ieee":"C. Boeddeker, F. Rautenberg, and R. Haeb-Umbach, “A Comparison and Combination of Unsupervised Blind Source Separation  Techniques,” presented at the ITG Conference on Speech Communication, Kiel, 2021.","apa":"Boeddeker, C., Rautenberg, F., &#38; Haeb-Umbach, R. (2021). A Comparison and Combination of Unsupervised Blind Source Separation  Techniques. <i>ITG Conference on Speech Communication</i>. ITG Conference on Speech Communication, Kiel.","bibtex":"@inproceedings{Boeddeker_Rautenberg_Haeb-Umbach_2021, title={A Comparison and Combination of Unsupervised Blind Source Separation  Techniques}, booktitle={ITG Conference on Speech Communication}, author={Boeddeker, Christoph and Rautenberg, Frederik and Haeb-Umbach, Reinhold}, year={2021} }","short":"C. Boeddeker, F. Rautenberg, R. Haeb-Umbach, in: ITG Conference on Speech Communication, 2021.","mla":"Boeddeker, Christoph, et al. “A Comparison and Combination of Unsupervised Blind Source Separation  Techniques.” <i>ITG Conference on Speech Communication</i>, 2021."},"_id":"44843","department":[{"_id":"54"}],"user_id":"40767","file_date_updated":"2023-11-15T15:29:32Z","type":"conference","status":"public"},{"has_accepted_license":"1","publication_status":"published","year":"2021","citation":{"mla":"Boeddeker, Christoph, et al. “Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation.” <i>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2021, doi:<a href=\"https://doi.org/10.1109/icassp39728.2021.9414661\">10.1109/icassp39728.2021.9414661</a>.","short":"C. Boeddeker, W. Zhang, T. Nakatani, K. Kinoshita, T. Ochiai, M. Delcroix, N. Kamo, Y. Qian, R. Haeb-Umbach, in: ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021.","bibtex":"@inproceedings{Boeddeker_Zhang_Nakatani_Kinoshita_Ochiai_Delcroix_Kamo_Qian_Haeb-Umbach_2021, title={Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation}, DOI={<a href=\"https://doi.org/10.1109/icassp39728.2021.9414661\">10.1109/icassp39728.2021.9414661</a>}, booktitle={ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={Boeddeker, Christoph and Zhang, Wangyou and Nakatani, Tomohiro and Kinoshita, Keisuke and Ochiai, Tsubasa and Delcroix, Marc and Kamo, Naoyuki and Qian, Yanmin and Haeb-Umbach, Reinhold}, year={2021} }","apa":"Boeddeker, C., Zhang, W., Nakatani, T., Kinoshita, K., Ochiai, T., Delcroix, M., Kamo, N., Qian, Y., &#38; Haeb-Umbach, R. (2021). Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation. <i>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. <a href=\"https://doi.org/10.1109/icassp39728.2021.9414661\">https://doi.org/10.1109/icassp39728.2021.9414661</a>","chicago":"Boeddeker, Christoph, Wangyou Zhang, Tomohiro Nakatani, Keisuke Kinoshita, Tsubasa Ochiai, Marc Delcroix, Naoyuki Kamo, Yanmin Qian, and Reinhold Haeb-Umbach. “Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation.” In <i>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2021. <a href=\"https://doi.org/10.1109/icassp39728.2021.9414661\">https://doi.org/10.1109/icassp39728.2021.9414661</a>.","ieee":"C. Boeddeker <i>et al.</i>, “Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation,” 2021, doi: <a href=\"https://doi.org/10.1109/icassp39728.2021.9414661\">10.1109/icassp39728.2021.9414661</a>.","ama":"Boeddeker C, Zhang W, Nakatani T, et al. Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation. In: <i>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. ; 2021. doi:<a href=\"https://doi.org/10.1109/icassp39728.2021.9414661\">10.1109/icassp39728.2021.9414661</a>"},"date_updated":"2023-11-15T15:18:09Z","oa":"1","date_created":"2021-12-03T12:00:16Z","author":[{"last_name":"Boeddeker","full_name":"Boeddeker, Christoph","id":"40767","first_name":"Christoph"},{"full_name":"Zhang, Wangyou","last_name":"Zhang","first_name":"Wangyou"},{"last_name":"Nakatani","full_name":"Nakatani, Tomohiro","first_name":"Tomohiro"},{"first_name":"Keisuke","full_name":"Kinoshita, Keisuke","last_name":"Kinoshita"},{"last_name":"Ochiai","full_name":"Ochiai, Tsubasa","first_name":"Tsubasa"},{"full_name":"Delcroix, Marc","last_name":"Delcroix","first_name":"Marc"},{"last_name":"Kamo","full_name":"Kamo, Naoyuki","first_name":"Naoyuki"},{"first_name":"Yanmin","full_name":"Qian, Yanmin","last_name":"Qian"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"title":"Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation","doi":"10.1109/icassp39728.2021.9414661","publication":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","type":"conference","status":"public","file":[{"creator":"cbj","date_created":"2021-12-03T12:01:20Z","date_updated":"2023-11-15T15:18:08Z","access_level":"open_access","file_id":"28260","file_name":"ICASSP2021_BSSEval.pdf","file_size":228717,"content_type":"application/pdf","relation":"main_file"}],"_id":"28259","project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"40767","ddc":["000"],"language":[{"iso":"eng"}],"file_date_updated":"2023-11-15T15:18:08Z"},{"quality_controlled":"1","year":"2021","date_created":"2021-10-25T08:50:01Z","title":"Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers","publication":"Interspeech 2021","file":[{"creator":"tvn","date_created":"2021-12-06T10:39:13Z","date_updated":"2021-12-06T10:48:30Z","file_id":"28327","access_level":"open_access","file_name":"Interspeech 2021 voiceover-002-compressed.mp4","title":"Video for INTERSPEECH 2021","file_size":9550220,"content_type":"video/mp4","relation":"supplementary_material"},{"file_size":1337297,"title":"Slides from INTERSPEECH 2021","file_id":"28328","access_level":"open_access","file_name":"Graph-PIT-poster-presentation.pptx","date_updated":"2021-12-06T10:47:01Z","date_created":"2021-12-06T10:47:01Z","creator":"tvn","relation":"slides","content_type":"application/vnd.openxmlformats-officedocument.presentationml.presentation"},{"date_updated":"2021-12-06T10:48:21Z","date_created":"2021-12-06T10:48:21Z","creator":"tvn","file_size":226589,"file_id":"28329","file_name":"INTERSPEECH2021_Graph_PIT.pdf","access_level":"open_access","content_type":"application/pdf","relation":"main_file"}],"abstract":[{"text":"Automatic transcription of meetings requires handling of overlapped speech, which calls for continuous speech separation (CSS) systems. The uPIT criterion was proposed for utterance-level separation with neural networks and introduces the constraint that the total number of speakers must not exceed the number of output channels. When processing meeting-like data in a segment-wise manner, i.e., by separating overlapping segments independently and stitching adjacent segments to continuous output streams, this constraint has to be fulfilled for any segment. In this contribution, we show that this constraint can be significantly relaxed. We propose a novel graph-based PIT criterion, which casts the assignment of utterances to output channels in a graph coloring problem. It only requires that the number of concurrently active speakers must not exceed the number of output channels. As a consequence, the system can process an arbitrary number of speakers and arbitrarily long segments and thus can handle more diverse scenarios.\r\nFurther, the stitching algorithm for obtaining a consistent output order in neighboring segments is of less importance and can even be eliminated completely, not the least reducing the computational effort. Experiments on meeting-style WSJ data show improvements in recognition performance over using the uPIT criterion. ","lang":"eng"}],"language":[{"iso":"eng"}],"ddc":["000"],"keyword":["Continuous speech separation","automatic speech recognition","overlapped speech","permutation invariant training"],"related_material":{"link":[{"url":"https://github.com/fgnt/graph_pit","relation":"software"}]},"publication_status":"published","has_accepted_license":"1","citation":{"apa":"von Neumann, T., Kinoshita, K., Boeddeker, C., Delcroix, M., &#38; Haeb-Umbach, R. (2021). Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers. <i>Interspeech 2021</i>. Interspeech. <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">https://doi.org/10.21437/interspeech.2021-1177</a>","mla":"von Neumann, Thilo, et al. “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers.” <i>Interspeech 2021</i>, 2021, doi:<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>.","short":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, R. Haeb-Umbach, in: Interspeech 2021, 2021.","bibtex":"@inproceedings{von Neumann_Kinoshita_Boeddeker_Delcroix_Haeb-Umbach_2021, title={Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers}, DOI={<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>}, booktitle={Interspeech 2021}, author={von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2021} }","chicago":"Neumann, Thilo von, Keisuke Kinoshita, Christoph Boeddeker, Marc Delcroix, and Reinhold Haeb-Umbach. “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers.” In <i>Interspeech 2021</i>, 2021. <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">https://doi.org/10.21437/interspeech.2021-1177</a>.","ieee":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, and R. Haeb-Umbach, “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers,” presented at the Interspeech, 2021, doi: <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>.","ama":"von Neumann T, Kinoshita K, Boeddeker C, Delcroix M, Haeb-Umbach R. Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers. In: <i>Interspeech 2021</i>. ; 2021. doi:<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>"},"author":[{"first_name":"Thilo","id":"49870","full_name":"von Neumann, Thilo","last_name":"von Neumann","orcid":"https://orcid.org/0000-0002-7717-8670"},{"full_name":"Kinoshita, Keisuke","last_name":"Kinoshita","first_name":"Keisuke"},{"full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker","first_name":"Christoph"},{"full_name":"Delcroix, Marc","last_name":"Delcroix","first_name":"Marc"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"oa":"1","date_updated":"2023-11-15T12:14:40Z","doi":"10.21437/interspeech.2021-1177","conference":{"name":"Interspeech"},"type":"conference","status":"public","user_id":"49870","department":[{"_id":"54"}],"project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"26770","file_date_updated":"2021-12-06T10:48:30Z"},{"has_accepted_license":"1","quality_controlled":"1","year":"2021","citation":{"chicago":"Neumann, Thilo von, Christoph Boeddeker, Keisuke Kinoshita, Marc Delcroix, and Reinhold Haeb-Umbach. “Speeding Up Permutation Invariant Training for Source Separation.” In <i>Speech Communication; 14th ITG Conference</i>, 2021.","ieee":"T. von Neumann, C. Boeddeker, K. Kinoshita, M. Delcroix, and R. Haeb-Umbach, “Speeding Up Permutation Invariant Training for Source Separation,” presented at the Speech Communication; 14th ITG Conference, Kiel, 2021.","ama":"von Neumann T, Boeddeker C, Kinoshita K, Delcroix M, Haeb-Umbach R. Speeding Up Permutation Invariant Training for Source Separation. In: <i>Speech Communication; 14th ITG Conference</i>. ; 2021.","apa":"von Neumann, T., Boeddeker, C., Kinoshita, K., Delcroix, M., &#38; Haeb-Umbach, R. (2021). Speeding Up Permutation Invariant Training for Source Separation. <i>Speech Communication; 14th ITG Conference</i>. Speech Communication; 14th ITG Conference, Kiel.","bibtex":"@inproceedings{von Neumann_Boeddeker_Kinoshita_Delcroix_Haeb-Umbach_2021, title={Speeding Up Permutation Invariant Training for Source Separation}, booktitle={Speech Communication; 14th ITG Conference}, author={von Neumann, Thilo and Boeddeker, Christoph and Kinoshita, Keisuke and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2021} }","short":"T. von Neumann, C. Boeddeker, K. Kinoshita, M. Delcroix, R. Haeb-Umbach, in: Speech Communication; 14th ITG Conference, 2021.","mla":"von Neumann, Thilo, et al. “Speeding Up Permutation Invariant Training for Source Separation.” <i>Speech Communication; 14th ITG Conference</i>, 2021."},"date_updated":"2023-11-15T12:16:31Z","oa":"1","author":[{"full_name":"von Neumann, Thilo","id":"49870","orcid":"https://orcid.org/0000-0002-7717-8670","last_name":"von Neumann","first_name":"Thilo"},{"last_name":"Boeddeker","id":"40767","full_name":"Boeddeker, Christoph","first_name":"Christoph"},{"last_name":"Kinoshita","full_name":"Kinoshita, Keisuke","first_name":"Keisuke"},{"first_name":"Marc","full_name":"Delcroix, Marc","last_name":"Delcroix"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_created":"2022-01-07T10:40:56Z","title":"Speeding Up Permutation Invariant Training for Source Separation","conference":{"name":"Speech Communication; 14th ITG Conference","start_date":"2021-09-29","end_date":"2021-10-01","location":"Kiel"},"publication":"Speech Communication; 14th ITG Conference","type":"conference","status":"public","file":[{"relation":"poster","content_type":"application/pdf","file_id":"29180","access_level":"open_access","file_name":"poster.pdf","file_size":191938,"creator":"tvn","date_created":"2022-01-06T13:23:27Z","date_updated":"2022-01-06T13:23:27Z"},{"date_created":"2022-01-07T10:42:54Z","creator":"tvn","date_updated":"2022-01-07T10:42:54Z","file_id":"29181","file_name":"ITG2021_Speeding_up_Permutation_Invariant_Training.pdf","access_level":"open_access","file_size":236670,"content_type":"application/pdf","relation":"main_file"}],"_id":"29173","project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"49870","ddc":["000"],"file_date_updated":"2022-01-07T10:42:54Z","language":[{"iso":"eng"}]},{"publication":"Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments","type":"conference","status":"public","file":[{"access_level":"open_access","file_name":"template.pdf","file_id":"20702","file_size":115421,"date_created":"2020-12-11T12:48:48Z","creator":"cbj","date_updated":"2020-12-11T12:48:48Z","relation":"main_file","content_type":"application/pdf"}],"_id":"20700","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"40767","ddc":["000"],"file_date_updated":"2020-12-11T12:48:48Z","language":[{"iso":"eng"}],"has_accepted_license":"1","year":"2020","citation":{"ama":"Boeddeker C, Cord-Landwehr T, Heitkaemper J, et al. Towards a speaker diarization system for the CHiME 2020 dinner party transcription. In: <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>. ; 2020.","chicago":"Boeddeker, Christoph, Tobias Cord-Landwehr, Jens Heitkaemper, Catalin Zorila, Daichi Hayakawa, Mohan Li, Min Liu, Rama Doddipatla, and Reinhold Haeb-Umbach. “Towards a Speaker Diarization System for the CHiME 2020 Dinner Party Transcription.” In <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>, 2020.","ieee":"C. Boeddeker <i>et al.</i>, “Towards a speaker diarization system for the CHiME 2020 dinner party transcription,” in <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>, 2020.","mla":"Boeddeker, Christoph, et al. “Towards a Speaker Diarization System for the CHiME 2020 Dinner Party Transcription.” <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>, 2020.","bibtex":"@inproceedings{Boeddeker_Cord-Landwehr_Heitkaemper_Zorila_Hayakawa_Li_Liu_Doddipatla_Haeb-Umbach_2020, title={Towards a speaker diarization system for the CHiME 2020 dinner party transcription}, booktitle={Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments}, author={Boeddeker, Christoph and Cord-Landwehr, Tobias and Heitkaemper, Jens and Zorila, Catalin and Hayakawa, Daichi and Li, Mohan and Liu, Min and Doddipatla, Rama and Haeb-Umbach, Reinhold}, year={2020} }","short":"C. Boeddeker, T. Cord-Landwehr, J. Heitkaemper, C. Zorila, D. Hayakawa, M. Li, M. Liu, R. Doddipatla, R. Haeb-Umbach, in: Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments, 2020.","apa":"Boeddeker, C., Cord-Landwehr, T., Heitkaemper, J., Zorila, C., Hayakawa, D., Li, M., … Haeb-Umbach, R. (2020). Towards a speaker diarization system for the CHiME 2020 dinner party transcription. In <i>Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments</i>."},"oa":"1","date_updated":"2022-01-06T06:54:33Z","date_created":"2020-12-11T12:49:13Z","author":[{"last_name":"Boeddeker","id":"40767","full_name":"Boeddeker, Christoph","first_name":"Christoph"},{"first_name":"Tobias","id":"44393","full_name":"Cord-Landwehr, Tobias","last_name":"Cord-Landwehr"},{"last_name":"Heitkaemper","full_name":"Heitkaemper, Jens","id":"27643","first_name":"Jens"},{"first_name":"Catalin","full_name":"Zorila, Catalin","last_name":"Zorila"},{"last_name":"Hayakawa","full_name":"Hayakawa, Daichi","first_name":"Daichi"},{"full_name":"Li, Mohan","last_name":"Li","first_name":"Mohan"},{"first_name":"Min","full_name":"Liu, Min","last_name":"Liu"},{"full_name":"Doddipatla, Rama","last_name":"Doddipatla","first_name":"Rama"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"title":"Towards a speaker diarization system for the CHiME 2020 dinner party transcription"},{"citation":{"mla":"Nakatani, Tomohiro, et al. “Jointly Optimal Denoising, Dereverberation, and Source Separation.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, 2020, pp. 1–1, doi:<a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">10.1109/TASLP.2020.3013118</a>.","bibtex":"@article{Nakatani_Boeddeker_Kinoshita_Ikeshita_Delcroix_Haeb-Umbach_2020, title={Jointly optimal denoising, dereverberation, and source separation}, DOI={<a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">10.1109/TASLP.2020.3013118</a>}, journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, author={Nakatani, Tomohiro and Boeddeker, Christoph and Kinoshita, Keisuke and Ikeshita, Rintaro and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2020}, pages={1–1} }","short":"T. Nakatani, C. Boeddeker, K. Kinoshita, R. Ikeshita, M. Delcroix, R. Haeb-Umbach, IEEE/ACM Transactions on Audio, Speech, and Language Processing (2020) 1–1.","apa":"Nakatani, T., Boeddeker, C., Kinoshita, K., Ikeshita, R., Delcroix, M., &#38; Haeb-Umbach, R. (2020). Jointly optimal denoising, dereverberation, and source separation. <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, 1–1. <a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">https://doi.org/10.1109/TASLP.2020.3013118</a>","chicago":"Nakatani, Tomohiro, Christoph Boeddeker, Keisuke Kinoshita, Rintaro Ikeshita, Marc Delcroix, and Reinhold Haeb-Umbach. “Jointly Optimal Denoising, Dereverberation, and Source Separation.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, 2020, 1–1. <a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">https://doi.org/10.1109/TASLP.2020.3013118</a>.","ieee":"T. Nakatani, C. Boeddeker, K. Kinoshita, R. Ikeshita, M. Delcroix, and R. Haeb-Umbach, “Jointly optimal denoising, dereverberation, and source separation,” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, pp. 1–1, 2020, doi: <a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">10.1109/TASLP.2020.3013118</a>.","ama":"Nakatani T, Boeddeker C, Kinoshita K, Ikeshita R, Delcroix M, Haeb-Umbach R. Jointly optimal denoising, dereverberation, and source separation. <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>. Published online 2020:1-1. doi:<a href=\"https://doi.org/10.1109/TASLP.2020.3013118\">10.1109/TASLP.2020.3013118</a>"},"page":"1-1","year":"2020","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2020/journal_2020_boeddeker.pdf"}],"doi":"10.1109/TASLP.2020.3013118","title":"Jointly optimal denoising, dereverberation, and source separation","author":[{"first_name":"Tomohiro","full_name":"Nakatani, Tomohiro","last_name":"Nakatani"},{"id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker","first_name":"Christoph"},{"first_name":"Keisuke","full_name":"Kinoshita, Keisuke","last_name":"Kinoshita"},{"first_name":"Rintaro","full_name":"Ikeshita, Rintaro","last_name":"Ikeshita"},{"first_name":"Marc","full_name":"Delcroix, Marc","last_name":"Delcroix"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"}],"date_created":"2020-08-05T06:16:56Z","date_updated":"2022-12-05T12:34:01Z","oa":"1","status":"public","type":"journal_article","publication":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","language":[{"iso":"eng"}],"user_id":"40767","department":[{"_id":"54"}],"_id":"17598"},{"file":[{"date_updated":"2020-12-11T12:36:37Z","date_created":"2020-12-11T12:36:37Z","creator":"jensheit","file_size":3871374,"access_level":"closed","file_id":"20699","file_name":"ms.pdf","content_type":"application/pdf","success":1,"relation":"main_file"}],"abstract":[{"lang":"eng","text":"In recent years time domain speech separation has excelled over frequency domain separation in single channel scenarios and noise-free environments. In this paper we dissect the gains of the time-domain audio separation network (TasNet) approach by gradually replacing components of an utterance-level permutation invariant training (u-PIT) based separation system in the frequency domain until the TasNet system is reached, thus blending components of frequency domain approaches with those of time domain approaches. Some of the intermediate variants achieve comparable signal-to-distortion ratio (SDR) gains to TasNet, but retain the advantage of frequency domain processing: compatibility with classic signal processing tools such as frequency-domain beamforming and the human interpretability of the masks. Furthermore, we show that the scale invariant signal-to-distortion ratio (si-SDR) criterion used as loss function in TasNet is related to a logarithmic mean square error criterion and that it is this criterion which contributes most reliable to the performance advantage of TasNet. Finally, we critically assess which gains in a noise-free single channel environment generalize to more realistic reverberant conditions."}],"publication":"ICASSP 2020 Virtual Barcelona Spain","language":[{"iso":"eng"}],"keyword":["voice activity detection","speech activity detection","neural network","statistical speech processing"],"ddc":["000"],"year":"2020","quality_controlled":"1","title":"Demystifying TasNet: A Dissecting Approach","date_created":"2020-11-25T14:56:53Z","status":"public","type":"conference","file_date_updated":"2020-12-11T12:36:37Z","department":[{"_id":"54"}],"user_id":"40767","_id":"20504","project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"citation":{"ama":"Heitkaemper J, Jakobeit D, Boeddeker C, Drude L, Haeb-Umbach R. Demystifying TasNet: A Dissecting Approach. In: <i>ICASSP 2020 Virtual Barcelona Spain</i>. ; 2020.","ieee":"J. Heitkaemper, D. Jakobeit, C. Boeddeker, L. Drude, and R. Haeb-Umbach, “Demystifying TasNet: A Dissecting Approach,” 2020.","chicago":"Heitkaemper, Jens, Darius Jakobeit, Christoph Boeddeker, Lukas Drude, and Reinhold Haeb-Umbach. “Demystifying TasNet: A Dissecting Approach.” In <i>ICASSP 2020 Virtual Barcelona Spain</i>, 2020.","mla":"Heitkaemper, Jens, et al. “Demystifying TasNet: A Dissecting Approach.” <i>ICASSP 2020 Virtual Barcelona Spain</i>, 2020.","bibtex":"@inproceedings{Heitkaemper_Jakobeit_Boeddeker_Drude_Haeb-Umbach_2020, title={Demystifying TasNet: A Dissecting Approach}, booktitle={ICASSP 2020 Virtual Barcelona Spain}, author={Heitkaemper, Jens and Jakobeit, Darius and Boeddeker, Christoph and Drude, Lukas and Haeb-Umbach, Reinhold}, year={2020} }","short":"J. Heitkaemper, D. Jakobeit, C. Boeddeker, L. Drude, R. Haeb-Umbach, in: ICASSP 2020 Virtual Barcelona Spain, 2020.","apa":"Heitkaemper, J., Jakobeit, D., Boeddeker, C., Drude, L., &#38; Haeb-Umbach, R. (2020). Demystifying TasNet: A Dissecting Approach. <i>ICASSP 2020 Virtual Barcelona Spain</i>."},"has_accepted_license":"1","author":[{"first_name":"Jens","full_name":"Heitkaemper, Jens","id":"27643","last_name":"Heitkaemper"},{"last_name":"Jakobeit","full_name":"Jakobeit, Darius","first_name":"Darius"},{"first_name":"Christoph","id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker"},{"first_name":"Lukas","full_name":"Drude, Lukas","last_name":"Drude"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_updated":"2022-01-13T08:47:32Z"},{"title":"CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings","date_updated":"2022-01-13T08:34:37Z","author":[{"first_name":"Shinji","full_name":"Watanabe, Shinji","last_name":"Watanabe"},{"full_name":"Mandel, Michael","last_name":"Mandel","first_name":"Michael"},{"full_name":"Barker, Jon","last_name":"Barker","first_name":"Jon"},{"first_name":"Emmanuel","full_name":"Vincent, Emmanuel","last_name":"Vincent"},{"first_name":"Ashish","full_name":"Arora, Ashish","last_name":"Arora"},{"last_name":"Chang","full_name":"Chang, Xuankai","first_name":"Xuankai"},{"last_name":"Khudanpur","full_name":"Khudanpur, Sanjeev","first_name":"Sanjeev"},{"last_name":"Manohar","full_name":"Manohar, Vimal","first_name":"Vimal"},{"first_name":"Daniel","full_name":"Povey, Daniel","last_name":"Povey"},{"first_name":"Desh","full_name":"Raj, Desh","last_name":"Raj"},{"first_name":"David","full_name":"Snyder, David","last_name":"Snyder"},{"full_name":"Subramanian, Aswin Shanmugam","last_name":"Subramanian","first_name":"Aswin Shanmugam"},{"full_name":"Trmal, Jan","last_name":"Trmal","first_name":"Jan"},{"first_name":"Bar Ben","last_name":"Yair","full_name":"Yair, Bar Ben"},{"first_name":"Christoph","full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker"},{"first_name":"Zhaoheng","last_name":"Ni","full_name":"Ni, Zhaoheng"},{"full_name":"Fujita, Yusuke","last_name":"Fujita","first_name":"Yusuke"},{"full_name":"Horiguchi, Shota","last_name":"Horiguchi","first_name":"Shota"},{"last_name":"Kanda","full_name":"Kanda, Naoyuki","first_name":"Naoyuki"},{"last_name":"Yoshioka","full_name":"Yoshioka, Takuya","first_name":"Takuya"},{"last_name":"Ryant","full_name":"Ryant, Neville","first_name":"Neville"}],"date_created":"2021-12-03T12:13:01Z","year":"2020","citation":{"ama":"Watanabe S, Mandel M, Barker J, et al. CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings. <i>arXiv:200409249</i>. Published online 2020.","ieee":"S. Watanabe <i>et al.</i>, “CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings,” <i>arXiv:2004.09249</i>. 2020.","chicago":"Watanabe, Shinji, Michael Mandel, Jon Barker, Emmanuel Vincent, Ashish Arora, Xuankai Chang, Sanjeev Khudanpur, et al. “CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings.” <i>ArXiv:2004.09249</i>, 2020.","apa":"Watanabe, S., Mandel, M., Barker, J., Vincent, E., Arora, A., Chang, X., Khudanpur, S., Manohar, V., Povey, D., Raj, D., Snyder, D., Subramanian, A. S., Trmal, J., Yair, B. B., Boeddeker, C., Ni, Z., Fujita, Y., Horiguchi, S., Kanda, N., … Ryant, N. (2020). CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings. In <i>arXiv:2004.09249</i>.","bibtex":"@article{Watanabe_Mandel_Barker_Vincent_Arora_Chang_Khudanpur_Manohar_Povey_Raj_et al._2020, title={CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings}, journal={arXiv:2004.09249}, author={Watanabe, Shinji and Mandel, Michael and Barker, Jon and Vincent, Emmanuel and Arora, Ashish and Chang, Xuankai and Khudanpur, Sanjeev and Manohar, Vimal and Povey, Daniel and Raj, Desh and et al.}, year={2020} }","mla":"Watanabe, Shinji, et al. “CHiME-6 Challenge:Tackling Multispeaker Speech Recognition for  Unsegmented Recordings.” <i>ArXiv:2004.09249</i>, 2020.","short":"S. Watanabe, M. Mandel, J. Barker, E. Vincent, A. Arora, X. Chang, S. Khudanpur, V. Manohar, D. Povey, D. Raj, D. Snyder, A.S. Subramanian, J. Trmal, B.B. Yair, C. Boeddeker, Z. Ni, Y. Fujita, S. Horiguchi, N. Kanda, T. Yoshioka, N. Ryant, ArXiv:2004.09249 (2020)."},"language":[{"iso":"eng"}],"_id":"28263","user_id":"40767","department":[{"_id":"54"}],"abstract":[{"lang":"eng","text":"Following the success of the 1st, 2nd, 3rd, 4th and 5th CHiME challenges we\r\norganize the 6th CHiME Speech Separation and Recognition Challenge (CHiME-6).\r\nThe new challenge revisits the previous CHiME-5 challenge and further considers\r\nthe problem of distant multi-microphone conversational speech diarization and\r\nrecognition in everyday home environments. Speech material is the same as the\r\nprevious CHiME-5 recordings except for accurate array synchronization. The\r\nmaterial was elicited using a dinner party scenario with efforts taken to\r\ncapture data that is representative of natural conversational speech. This\r\npaper provides a baseline description of the CHiME-6 challenge for both\r\nsegmented multispeaker speech recognition (Track 1) and unsegmented\r\nmultispeaker speech recognition (Track 2). Of note, Track 2 is the first\r\nchallenge activity in the community to tackle an unsegmented multispeaker\r\nspeech recognition scenario with a complete set of reproducible open source\r\nbaselines providing speech enhancement, speaker diarization, and speech\r\nrecognition modules."}],"status":"public","type":"preprint","publication":"arXiv:2004.09249"},{"page":"7004-7008","citation":{"ama":"von Neumann T, Kinoshita K, Drude L, et al. End-to-End Training of Time Domain Audio Separation and Recognition. In: <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. ; 2020:7004-7008. doi:<a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">10.1109/ICASSP40776.2020.9053461</a>","chicago":"Neumann, Thilo von, Keisuke Kinoshita, Lukas Drude, Christoph Boeddeker, Marc Delcroix, Tomohiro Nakatani, and Reinhold Haeb-Umbach. “End-to-End Training of Time Domain Audio Separation and Recognition.” In <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 7004–8, 2020. <a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">https://doi.org/10.1109/ICASSP40776.2020.9053461</a>.","ieee":"T. von Neumann <i>et al.</i>, “End-to-End Training of Time Domain Audio Separation and Recognition,” in <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2020, pp. 7004–7008, doi: <a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">10.1109/ICASSP40776.2020.9053461</a>.","apa":"von Neumann, T., Kinoshita, K., Drude, L., Boeddeker, C., Delcroix, M., Nakatani, T., &#38; Haeb-Umbach, R. (2020). End-to-End Training of Time Domain Audio Separation and Recognition. <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 7004–7008. <a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">https://doi.org/10.1109/ICASSP40776.2020.9053461</a>","bibtex":"@inproceedings{von Neumann_Kinoshita_Drude_Boeddeker_Delcroix_Nakatani_Haeb-Umbach_2020, title={End-to-End Training of Time Domain Audio Separation and Recognition}, DOI={<a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">10.1109/ICASSP40776.2020.9053461</a>}, booktitle={ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={von Neumann, Thilo and Kinoshita, Keisuke and Drude, Lukas and Boeddeker, Christoph and Delcroix, Marc and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}, year={2020}, pages={7004–7008} }","mla":"von Neumann, Thilo, et al. “End-to-End Training of Time Domain Audio Separation and Recognition.” <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2020, pp. 7004–08, doi:<a href=\"https://doi.org/10.1109/ICASSP40776.2020.9053461\">10.1109/ICASSP40776.2020.9053461</a>.","short":"T. von Neumann, K. Kinoshita, L. Drude, C. Boeddeker, M. Delcroix, T. Nakatani, R. Haeb-Umbach, in: ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2020, pp. 7004–7008."},"has_accepted_license":"1","doi":"10.1109/ICASSP40776.2020.9053461","date_updated":"2023-11-15T12:17:45Z","oa":"1","author":[{"orcid":"https://orcid.org/0000-0002-7717-8670","last_name":"von Neumann","id":"49870","full_name":"von Neumann, Thilo","first_name":"Thilo"},{"first_name":"Keisuke","last_name":"Kinoshita","full_name":"Kinoshita, Keisuke"},{"full_name":"Drude, Lukas","last_name":"Drude","first_name":"Lukas"},{"full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker","first_name":"Christoph"},{"full_name":"Delcroix, Marc","last_name":"Delcroix","first_name":"Marc"},{"first_name":"Tomohiro","last_name":"Nakatani","full_name":"Nakatani, Tomohiro"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"status":"public","type":"conference","file_date_updated":"2020-12-16T14:09:48Z","_id":"20762","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"49870","year":"2020","quality_controlled":"1","title":"End-to-End Training of Time Domain Audio Separation and Recognition","date_created":"2020-12-16T14:07:54Z","abstract":[{"lang":"eng","text":"The rising interest in single-channel multi-speaker speech separation sparked development of End-to-End (E2E) approaches to multispeaker speech recognition. However, up until now, state-of-theart neural network–based time domain source separation has not yet been combined with E2E speech recognition. We here demonstrate how to combine a separation module based on a Convolutional Time domain Audio Separation Network (Conv-TasNet) with an E2E speech recognizer and how to train such a model jointly by distributing it over multiple GPUs or by approximating truncated back-propagation for the convolutional front-end. To put this work into perspective and illustrate the complexity of the design space, we provide a compact overview of single-channel multi-speaker recognition systems. Our experiments show a word error rate of 11.0% on WSJ0-2mix and indicate that our joint time domain model can yield substantial improvements over cascade DNN-HMM and monolithic E2E frequency domain systems proposed so far."}],"file":[{"file_id":"20763","file_name":"ICASSP_2020_vonNeumann_Paper.pdf","access_level":"open_access","file_size":192529,"creator":"huesera","date_created":"2020-12-16T14:09:48Z","date_updated":"2020-12-16T14:09:48Z","relation":"main_file","content_type":"application/pdf"}],"publication":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","ddc":["000"],"language":[{"iso":"eng"}]},{"publication":"Proc. Interspeech 2020","abstract":[{"text":"Most approaches to multi-talker overlapped speech separation and recognition assume that the number of simultaneously active speakers is given, but in realistic situations, it is typically unknown. To cope with this, we extend an iterative speech extraction system with mechanisms to count the number of sources and combine it with a single-talker speech recognizer to form the first end-to-end multi-talker automatic speech recognition system for an unknown number of active speakers. Our experiments show very promising performance in counting accuracy, source separation and speech recognition on simulated clean mixtures from WSJ0-2mix and WSJ0-3mix. Among others, we set a new state-of-the-art word error rate on the WSJ0-2mix database. Furthermore, our system generalizes well to a larger number of speakers than it ever saw during training, as shown in experiments with the WSJ0-4mix database. ","lang":"eng"}],"file":[{"file_id":"20765","file_name":"INTERSPEECH_2020_vonNeumann_Paper.pdf","access_level":"open_access","file_size":267893,"creator":"huesera","date_created":"2020-12-16T14:14:14Z","date_updated":"2020-12-16T14:14:14Z","relation":"main_file","content_type":"application/pdf"}],"ddc":["000"],"language":[{"iso":"eng"}],"quality_controlled":"1","year":"2020","date_created":"2020-12-16T14:12:45Z","title":"Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR","type":"conference","status":"public","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"20764","user_id":"49870","department":[{"_id":"54"}],"file_date_updated":"2020-12-16T14:14:14Z","has_accepted_license":"1","citation":{"apa":"von Neumann, T., Boeddeker, C., Drude, L., Kinoshita, K., Delcroix, M., Nakatani, T., &#38; Haeb-Umbach, R. (2020). Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR. <i>Proc. Interspeech 2020</i>, 3097–3101. <a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">https://doi.org/10.21437/Interspeech.2020-2519</a>","bibtex":"@inproceedings{von Neumann_Boeddeker_Drude_Kinoshita_Delcroix_Nakatani_Haeb-Umbach_2020, title={Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR}, DOI={<a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">10.21437/Interspeech.2020-2519</a>}, booktitle={Proc. Interspeech 2020}, author={von Neumann, Thilo and Boeddeker, Christoph and Drude, Lukas and Kinoshita, Keisuke and Delcroix, Marc and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}, year={2020}, pages={3097–3101} }","short":"T. von Neumann, C. Boeddeker, L. Drude, K. Kinoshita, M. Delcroix, T. Nakatani, R. Haeb-Umbach, in: Proc. Interspeech 2020, 2020, pp. 3097–3101.","mla":"von Neumann, Thilo, et al. “Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR.” <i>Proc. Interspeech 2020</i>, 2020, pp. 3097–101, doi:<a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">10.21437/Interspeech.2020-2519</a>.","chicago":"Neumann, Thilo von, Christoph Boeddeker, Lukas Drude, Keisuke Kinoshita, Marc Delcroix, Tomohiro Nakatani, and Reinhold Haeb-Umbach. “Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR.” In <i>Proc. Interspeech 2020</i>, 3097–3101, 2020. <a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">https://doi.org/10.21437/Interspeech.2020-2519</a>.","ieee":"T. von Neumann <i>et al.</i>, “Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR,” in <i>Proc. Interspeech 2020</i>, 2020, pp. 3097–3101, doi: <a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">10.21437/Interspeech.2020-2519</a>.","ama":"von Neumann T, Boeddeker C, Drude L, et al. Multi-Talker ASR for an Unknown Number of Sources: Joint Training of Source Counting, Separation and ASR. In: <i>Proc. Interspeech 2020</i>. ; 2020:3097-3101. doi:<a href=\"https://doi.org/10.21437/Interspeech.2020-2519\">10.21437/Interspeech.2020-2519</a>"},"page":"3097-3101","oa":"1","date_updated":"2023-11-15T12:17:57Z","author":[{"first_name":"Thilo","id":"49870","full_name":"von Neumann, Thilo","last_name":"von Neumann","orcid":"https://orcid.org/0000-0002-7717-8670"},{"first_name":"Christoph","id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker"},{"first_name":"Lukas","full_name":"Drude, Lukas","last_name":"Drude"},{"full_name":"Kinoshita, Keisuke","last_name":"Kinoshita","first_name":"Keisuke"},{"last_name":"Delcroix","full_name":"Delcroix, Marc","first_name":"Marc"},{"first_name":"Tomohiro","full_name":"Nakatani, Tomohiro","last_name":"Nakatani"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"doi":"10.21437/Interspeech.2020-2519"},{"title":"Jointly Optimal Dereverberation and Beamforming","doi":"10.1109/icassp40776.2020.9054393","date_updated":"2024-11-14T09:17:32Z","oa":"1","date_created":"2020-12-11T12:28:49Z","author":[{"full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker","first_name":"Christoph"},{"first_name":"Tomohiro","last_name":"Nakatani","full_name":"Nakatani, Tomohiro"},{"first_name":"Keisuke","full_name":"Kinoshita, Keisuke","last_name":"Kinoshita"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"year":"2020","citation":{"short":"C. Boeddeker, T. Nakatani, K. Kinoshita, R. Haeb-Umbach, in: ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2020.","bibtex":"@inproceedings{Boeddeker_Nakatani_Kinoshita_Haeb-Umbach_2020, title={Jointly Optimal Dereverberation and Beamforming}, DOI={<a href=\"https://doi.org/10.1109/icassp40776.2020.9054393\">10.1109/icassp40776.2020.9054393</a>}, booktitle={ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={Boeddeker, Christoph and Nakatani, Tomohiro and Kinoshita, Keisuke and Haeb-Umbach, Reinhold}, year={2020} }","mla":"Boeddeker, Christoph, et al. “Jointly Optimal Dereverberation and Beamforming.” <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2020, doi:<a href=\"https://doi.org/10.1109/icassp40776.2020.9054393\">10.1109/icassp40776.2020.9054393</a>.","apa":"Boeddeker, C., Nakatani, T., Kinoshita, K., &#38; Haeb-Umbach, R. (2020). Jointly Optimal Dereverberation and Beamforming. <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. <a href=\"https://doi.org/10.1109/icassp40776.2020.9054393\">https://doi.org/10.1109/icassp40776.2020.9054393</a>","ama":"Boeddeker C, Nakatani T, Kinoshita K, Haeb-Umbach R. Jointly Optimal Dereverberation and Beamforming. In: <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. ; 2020. doi:<a href=\"https://doi.org/10.1109/icassp40776.2020.9054393\">10.1109/icassp40776.2020.9054393</a>","chicago":"Boeddeker, Christoph, Tomohiro Nakatani, Keisuke Kinoshita, and Reinhold Haeb-Umbach. “Jointly Optimal Dereverberation and Beamforming.” In <i>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2020. <a href=\"https://doi.org/10.1109/icassp40776.2020.9054393\">https://doi.org/10.1109/icassp40776.2020.9054393</a>.","ieee":"C. Boeddeker, T. Nakatani, K. Kinoshita, and R. Haeb-Umbach, “Jointly Optimal Dereverberation and Beamforming,” 2020, doi: <a href=\"https://doi.org/10.1109/icassp40776.2020.9054393\">10.1109/icassp40776.2020.9054393</a>."},"has_accepted_license":"1","publication_identifier":{"isbn":["9781509066315"]},"publication_status":"published","ddc":["000"],"file_date_updated":"2020-12-11T12:32:44Z","language":[{"iso":"eng"}],"_id":"20695","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"department":[{"_id":"54"}],"user_id":"40767","status":"public","file":[{"file_size":200127,"file_name":"convBF.pdf","access_level":"open_access","file_id":"20698","date_updated":"2020-12-11T12:32:44Z","creator":"cbj","date_created":"2020-12-11T12:32:44Z","relation":"main_file","content_type":"application/pdf"}],"publication":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","type":"conference"},{"project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"19446","user_id":"40767","department":[{"_id":"54"}],"ddc":["000"],"file_date_updated":"2020-12-11T12:22:31Z","language":[{"iso":"eng"}],"type":"journal_article","publication":"ArXiv e-prints","abstract":[{"lang":"eng","text":"We present a multi-channel database of overlapping speech for training, evaluation, and detailed analysis of source separation and extraction algorithms: SMS-WSJ -- Spatialized Multi-Speaker Wall Street Journal. It consists of artificially mixed speech taken from the WSJ database, but unlike earlier databases we consider all WSJ0+1 utterances and take care of strictly separating the speaker sets present in the training, validation and test sets. When spatializing the data we ensure a high degree of randomness w.r.t. room size, array center and rotation, as well as speaker position. Furthermore, this paper offers a critical assessment of recently proposed measures of source separation performance. Alongside the code to generate the database we provide a source separation baseline and a Kaldi recipe with competitive word error rates to provide common ground for evaluation."}],"file":[{"content_type":"application/pdf","relation":"main_file","creator":"huesera","date_created":"2020-09-16T08:00:56Z","date_updated":"2020-12-11T12:22:31Z","file_name":"ArXiv_2019_Drude.pdf","access_level":"open_access","file_id":"19448","file_size":288594}],"status":"public","oa":"1","date_updated":"2022-01-06T06:54:04Z","date_created":"2020-09-16T07:59:46Z","author":[{"first_name":"Lukas","full_name":"Drude, Lukas","last_name":"Drude"},{"first_name":"Jens","full_name":"Heitkaemper, Jens","id":"27643","last_name":"Heitkaemper"},{"first_name":"Christoph","id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"title":"SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition","has_accepted_license":"1","year":"2019","citation":{"ama":"Drude L, Heitkaemper J, Boeddeker C, Haeb-Umbach R. SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition. <i>ArXiv e-prints</i>. 2019.","ieee":"L. Drude, J. Heitkaemper, C. Boeddeker, and R. Haeb-Umbach, “SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition,” <i>ArXiv e-prints</i>, 2019.","chicago":"Drude, Lukas, Jens Heitkaemper, Christoph Boeddeker, and Reinhold Haeb-Umbach. “SMS-WSJ: Database, Performance Measures, and Baseline Recipe for Multi-Channel Source Separation and Recognition.” <i>ArXiv E-Prints</i>, 2019.","bibtex":"@article{Drude_Heitkaemper_Boeddeker_Haeb-Umbach_2019, title={SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition}, journal={ArXiv e-prints}, author={Drude, Lukas and Heitkaemper, Jens and Boeddeker, Christoph and Haeb-Umbach, Reinhold}, year={2019} }","mla":"Drude, Lukas, et al. “SMS-WSJ: Database, Performance Measures, and Baseline Recipe for Multi-Channel Source Separation and Recognition.” <i>ArXiv E-Prints</i>, 2019.","short":"L. Drude, J. Heitkaemper, C. Boeddeker, R. Haeb-Umbach, ArXiv E-Prints (2019).","apa":"Drude, L., Heitkaemper, J., Boeddeker, C., &#38; Haeb-Umbach, R. (2019). SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition. <i>ArXiv E-Prints</i>."}},{"has_accepted_license":"1","citation":{"apa":"Zorila, C., Boeddeker, C., Doddipatla, R., &#38; Haeb-Umbach, R. (2019). An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription. In <i>ASRU 2019, Sentosa, Singapore</i>.","bibtex":"@inproceedings{Zorila_Boeddeker_Doddipatla_Haeb-Umbach_2019, title={An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription}, booktitle={ASRU 2019, Sentosa, Singapore}, author={Zorila, Catalin and Boeddeker, Christoph and Doddipatla, Rama and Haeb-Umbach, Reinhold}, year={2019} }","mla":"Zorila, Catalin, et al. “An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription.” <i>ASRU 2019, Sentosa, Singapore</i>, 2019.","short":"C. Zorila, C. Boeddeker, R. Doddipatla, R. Haeb-Umbach, in: ASRU 2019, Sentosa, Singapore, 2019.","ama":"Zorila C, Boeddeker C, Doddipatla R, Haeb-Umbach R. An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription. In: <i>ASRU 2019, Sentosa, Singapore</i>. ; 2019.","chicago":"Zorila, Catalin, Christoph Boeddeker, Rama Doddipatla, and Reinhold Haeb-Umbach. “An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription.” In <i>ASRU 2019, Sentosa, Singapore</i>, 2019.","ieee":"C. Zorila, C. Boeddeker, R. Doddipatla, and R. Haeb-Umbach, “An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription,” in <i>ASRU 2019, Sentosa, Singapore</i>, 2019."},"year":"2019","author":[{"full_name":"Zorila, Catalin","last_name":"Zorila","first_name":"Catalin"},{"last_name":"Boeddeker","full_name":"Boeddeker, Christoph","id":"40767","first_name":"Christoph"},{"full_name":"Doddipatla, Rama","last_name":"Doddipatla","first_name":"Rama"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_created":"2020-02-06T07:35:08Z","date_updated":"2022-01-06T06:52:37Z","oa":"1","title":"An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription","publication":"ASRU 2019, Sentosa, Singapore","type":"conference","status":"public","file":[{"date_updated":"2020-02-06T07:42:42Z","date_created":"2020-02-06T07:42:42Z","creator":"huesera","file_size":200256,"file_id":"15817","file_name":"ASRU_2019_Boeddeker_Paper.pdf","access_level":"open_access","content_type":"application/pdf","relation":"main_file"},{"relation":"main_file","content_type":"application/pdf","file_name":"ASRU_2019_Boeddeker_Poster.pdf","access_level":"open_access","file_id":"15818","file_size":123963,"creator":"huesera","date_created":"2020-02-06T07:42:55Z","date_updated":"2020-02-06T07:42:55Z"}],"abstract":[{"lang":"eng","text":"Despite the strong modeling power of neural network acoustic models, speech enhancement has been shown to deliver additional word error rate improvements if multi-channel data is available. However, there has been a longstanding debate whether enhancement should also be carried out on the ASR training data. In an extensive experimental evaluation on the acoustically very challenging CHiME-5 dinner party data we show that: (i) cleaning up the training data can lead to substantial error rate reductions, and (ii) enhancement in training is advisable as long as enhancement in test is at least as strong as in training. This approach stands in contrast and delivers larger gains than the common strategy reported in the literature to augment the training database with additional artificially degraded speech. Together with an acoustic model topology consisting of initial CNN layers followed by factorized TDNN layers we achieve with 41.6% and 43.2% WER on the DEV and EVAL test sets, respectively, a new single-system state-of-the-art result on the CHiME-5 data. This is a 8% relative improvement compared to the best word error rate published so far for a speech recognizer without system combination."}],"department":[{"_id":"54"}],"user_id":"59789","_id":"15816","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"language":[{"iso":"eng"}],"file_date_updated":"2020-02-06T07:42:55Z","ddc":["000"]},{"citation":{"apa":"Kanda, N., Boeddeker, C., Heitkaemper, J., Fujita, Y., Horiguchi, S., &#38; Haeb-Umbach, R. (2019). Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR. In <i>INTERSPEECH 2019, Graz, Austria</i>.","mla":"Kanda, Naoyuki, et al. “Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR.” <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","short":"N. Kanda, C. Boeddeker, J. Heitkaemper, Y. Fujita, S. Horiguchi, R. Haeb-Umbach, in: INTERSPEECH 2019, Graz, Austria, 2019.","bibtex":"@inproceedings{Kanda_Boeddeker_Heitkaemper_Fujita_Horiguchi_Haeb-Umbach_2019, title={Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR}, booktitle={INTERSPEECH 2019, Graz, Austria}, author={Kanda, Naoyuki and Boeddeker, Christoph and Heitkaemper, Jens and Fujita, Yusuke and Horiguchi, Shota and Haeb-Umbach, Reinhold}, year={2019} }","ama":"Kanda N, Boeddeker C, Heitkaemper J, Fujita Y, Horiguchi S, Haeb-Umbach R. Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR. In: <i>INTERSPEECH 2019, Graz, Austria</i>. ; 2019.","chicago":"Kanda, Naoyuki, Christoph Boeddeker, Jens Heitkaemper, Yusuke Fujita, Shota Horiguchi, and Reinhold Haeb-Umbach. “Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR.” In <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","ieee":"N. Kanda, C. Boeddeker, J. Heitkaemper, Y. Fujita, S. Horiguchi, and R. Haeb-Umbach, “Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR,” in <i>INTERSPEECH 2019, Graz, Austria</i>, 2019."},"year":"2019","has_accepted_license":"1","title":"Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR","author":[{"last_name":"Kanda","full_name":"Kanda, Naoyuki","first_name":"Naoyuki"},{"first_name":"Christoph","last_name":"Boeddeker","full_name":"Boeddeker, Christoph","id":"40767"},{"last_name":"Heitkaemper","full_name":"Heitkaemper, Jens","id":"27643","first_name":"Jens"},{"first_name":"Yusuke","full_name":"Fujita, Yusuke","last_name":"Fujita"},{"last_name":"Horiguchi","full_name":"Horiguchi, Shota","first_name":"Shota"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"date_created":"2019-11-06T10:08:49Z","date_updated":"2022-01-06T06:52:07Z","oa":"1","status":"public","file":[{"relation":"main_file","content_type":"application/pdf","file_size":216202,"file_id":"14827","file_name":"INTERSPEECH_2019_Boeddeker_Paper.pdf","access_level":"open_access","date_updated":"2019-11-08T07:45:15Z","date_created":"2019-11-06T10:10:23Z","creator":"huesera"}],"abstract":[{"lang":"eng","text":"In this paper, we present Hitachi and Paderborn University’s joint effort for automatic speech recognition (ASR) in a dinner party scenario. The main challenges of ASR systems for dinner party recordings obtained by multiple microphone arrays are (1) heavy speech overlaps, (2) severe noise and reverberation, (3) very natural onversational content, and possibly (4) insufficient training data. As an example of a dinner party scenario, we have chosen the data presented during the CHiME-5 speech recognition challenge, where the baseline ASR had a 73.3% word error rate (WER), and even the best performing system at the CHiME-5 challenge had a 46.1% WER. We extensively investigated a combination of the guided source separation-based speech enhancement technique and an already proposed strong ASR backend and found that a tight combination of these techniques provided substantial accuracy improvements. Our final system achieved WERs of 39.94% and 41.64% for the development and evaluation data, respectively, both of which are the best published results for the dataset. We also investigated with additional training data on the official small data in the CHiME-5 corpus to assess the intrinsic difficulty of this ASR task."}],"publication":"INTERSPEECH 2019, Graz, Austria","type":"conference","file_date_updated":"2019-11-08T07:45:15Z","language":[{"iso":"eng"}],"ddc":["000"],"department":[{"_id":"54"}],"user_id":"59789","_id":"14826","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}]},{"oa":"1","date_updated":"2022-01-06T06:51:11Z","date_created":"2019-07-12T05:29:53Z","author":[{"last_name":"Drude","id":"11213","full_name":"Drude, Lukas","first_name":"Lukas"},{"last_name":"Boeddeker","full_name":"Boeddeker, Christoph","id":"40767","first_name":"Christoph"},{"first_name":"Jahn","id":"9168","full_name":"Heymann, Jahn","last_name":"Heymann"},{"first_name":"Keisuke","last_name":"Kinoshita","full_name":"Kinoshita, Keisuke"},{"first_name":"Marc","full_name":"Delcroix, Marc","last_name":"Delcroix"},{"last_name":"Nakatani","full_name":"Nakatani, Tomohiro","first_name":"Tomohiro"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"title":"Integration neural network based beamforming and weighted prediction error dereverberation","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/INTERSPEECH_2018_Drude_Paper.pdf","open_access":"1"}],"related_material":{"link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/INTERSPEECH_2018_Drude_Slides.pdf","description":"Slides","relation":"supplementary_material"}]},"year":"2018","citation":{"apa":"Drude, L., Boeddeker, C., Heymann, J., Kinoshita, K., Delcroix, M., Nakatani, T., &#38; Haeb-Umbach, R. (2018). Integration neural network based beamforming and weighted prediction error dereverberation. In <i>INTERSPEECH 2018, Hyderabad, India</i>.","mla":"Drude, Lukas, et al. “Integration Neural Network Based Beamforming and Weighted Prediction Error Dereverberation.” <i>INTERSPEECH 2018, Hyderabad, India</i>, 2018.","bibtex":"@inproceedings{Drude_Boeddeker_Heymann_Kinoshita_Delcroix_Nakatani_Haeb-Umbach_2018, title={Integration neural network based beamforming and weighted prediction error dereverberation}, booktitle={INTERSPEECH 2018, Hyderabad, India}, author={Drude, Lukas and Boeddeker, Christoph and Heymann, Jahn and Kinoshita, Keisuke and Delcroix, Marc and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}, year={2018} }","short":"L. Drude, C. Boeddeker, J. Heymann, K. Kinoshita, M. Delcroix, T. Nakatani, R. Haeb-Umbach, in: INTERSPEECH 2018, Hyderabad, India, 2018.","chicago":"Drude, Lukas, Christoph Boeddeker, Jahn Heymann, Keisuke Kinoshita, Marc Delcroix, Tomohiro Nakatani, and Reinhold Haeb-Umbach. “Integration Neural Network Based Beamforming and Weighted Prediction Error Dereverberation.” In <i>INTERSPEECH 2018, Hyderabad, India</i>, 2018.","ieee":"L. Drude <i>et al.</i>, “Integration neural network based beamforming and weighted prediction error dereverberation,” in <i>INTERSPEECH 2018, Hyderabad, India</i>, 2018.","ama":"Drude L, Boeddeker C, Heymann J, et al. Integration neural network based beamforming and weighted prediction error dereverberation. In: <i>INTERSPEECH 2018, Hyderabad, India</i>. ; 2018."},"_id":"11872","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"department":[{"_id":"54"}],"user_id":"40767","language":[{"iso":"eng"}],"publication":"INTERSPEECH 2018, Hyderabad, India","type":"conference","abstract":[{"text":"The weighted prediction error (WPE) algorithm has proven to be a very successful dereverberation method for the REVERB challenge. Likewise, neural network based mask estimation for beamforming demonstrated very good noise suppression in the CHiME 3 and CHiME 4 challenges. Recently, it has been shown that this estimator can also be trained to perform dereverberation and denoising jointly. However, up to now a comparison of a neural beamformer and WPE is still missing, so is an investigation into a combination of the two. Therefore, we here provide an extensive evaluation of both and consequently propose variants to integrate deep neural network based beamforming with WPE. For these integrated variants we identify a consistent word error rate (WER) reduction on two distinct databases. In particular, our study shows that deep learning based beamforming benefits from a model-based dereverberation technique (i.e. WPE) and vice versa. Our key findings are: (a) Neural beamforming yields the lower WERs in comparison to WPE the more channels and noise are present. (b) Integration of WPE and a neural beamformer consistently outperforms all stand-alone systems.","lang":"eng"}],"status":"public"}]
