[{"publication":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","abstract":[{"text":"Continuous Speech Separation (CSS) has been proposed to address speech overlaps during the analysis of realistic meeting-like conversations by eliminating any overlaps before further processing.\r\nCSS separates a recording of arbitrarily many speakers into a small number of overlap-free output channels, where each output channel may contain speech of multiple speakers.\r\nThis is often done by applying a conventional separation model trained with Utterance-level Permutation Invariant Training (uPIT), which exclusively maps a speaker to an output channel, in sliding window approach called stitching.\r\nRecently, we introduced an alternative training scheme called Graph-PIT that teaches the separation network to directly produce output streams in the required format without stitching.\r\nIt can handle an arbitrary number of speakers as long as never more of them overlap at the same time than the separator has output channels.\r\nIn this contribution, we further investigate the Graph-PIT training scheme.\r\nWe show in extended experiments that models trained with Graph-PIT also work in challenging reverberant conditions.\r\nModels trained in this way are able to perform segment-less CSS, i.e., without stitching, and achieve comparable and often better separation quality than the conventional CSS with uPIT and stitching.\r\nWe simplify the training schedule for Graph-PIT with the recently proposed Source Aggregated Signal-to-Distortion Ratio (SA-SDR) loss.\r\nIt eliminates unfavorable properties of the previously used A-SDR loss and thus enables training with Graph-PIT from scratch.\r\nGraph-PIT training relaxes the constraints w.r.t. the allowed numbers of speakers and speaking patterns which allows using a larger variety of training data.\r\nFurthermore, we introduce novel signal-level evaluation metrics for meeting scenarios, namely the source-aggregated scale- and convolution-invariant Signal-to-Distortion Ratio (SA-SI-SDR and SA-CI-SDR), which are generalizations of the commonly used SDR-based metrics for the CSS case.","lang":"eng"}],"file":[{"relation":"main_file","content_type":"application/pdf","file_size":7185077,"file_id":"35607","access_level":"open_access","file_name":"main.pdf","date_updated":"2023-01-11T08:50:19Z","date_created":"2023-01-09T17:46:05Z","creator":"haebumb"}],"keyword":["Continuous Speech Separation","Source Separation","Graph-PIT","Dynamic Programming","Permutation Invariant Training"],"ddc":["000"],"language":[{"iso":"eng"}],"quality_controlled":"1","year":"2023","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","date_created":"2023-01-09T17:24:17Z","title":"Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria","type":"journal_article","status":"public","_id":"35602","project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"49870","article_type":"original","file_date_updated":"2023-01-11T08:50:19Z","publication_identifier":{"issn":["2329-9290","2329-9304"]},"has_accepted_license":"1","publication_status":"published","page":"576-589","intvolume":"        31","citation":{"chicago":"Neumann, Thilo von, Keisuke Kinoshita, Christoph Boeddeker, Marc Delcroix, and Reinhold Haeb-Umbach. “Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i> 31 (2023): 576–89. <a href=\"https://doi.org/10.1109/taslp.2022.3228629\">https://doi.org/10.1109/taslp.2022.3228629</a>.","ieee":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, and R. Haeb-Umbach, “Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria,” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, vol. 31, pp. 576–589, 2023, doi: <a href=\"https://doi.org/10.1109/taslp.2022.3228629\">10.1109/taslp.2022.3228629</a>.","ama":"von Neumann T, Kinoshita K, Boeddeker C, Delcroix M, Haeb-Umbach R. Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria. <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>. 2023;31:576-589. doi:<a href=\"https://doi.org/10.1109/taslp.2022.3228629\">10.1109/taslp.2022.3228629</a>","bibtex":"@article{von Neumann_Kinoshita_Boeddeker_Delcroix_Haeb-Umbach_2023, title={Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria}, volume={31}, DOI={<a href=\"https://doi.org/10.1109/taslp.2022.3228629\">10.1109/taslp.2022.3228629</a>}, journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2023}, pages={576–589} }","short":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, R. Haeb-Umbach, IEEE/ACM Transactions on Audio, Speech, and Language Processing 31 (2023) 576–589.","mla":"von Neumann, Thilo, et al. “Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, vol. 31, Institute of Electrical and Electronics Engineers (IEEE), 2023, pp. 576–89, doi:<a href=\"https://doi.org/10.1109/taslp.2022.3228629\">10.1109/taslp.2022.3228629</a>.","apa":"von Neumann, T., Kinoshita, K., Boeddeker, C., Delcroix, M., &#38; Haeb-Umbach, R. (2023). Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria. <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, <i>31</i>, 576–589. <a href=\"https://doi.org/10.1109/taslp.2022.3228629\">https://doi.org/10.1109/taslp.2022.3228629</a>"},"oa":"1","date_updated":"2023-11-15T12:16:11Z","volume":31,"author":[{"last_name":"von Neumann","orcid":"https://orcid.org/0000-0002-7717-8670","full_name":"von Neumann, Thilo","id":"49870","first_name":"Thilo"},{"first_name":"Keisuke","last_name":"Kinoshita","full_name":"Kinoshita, Keisuke"},{"last_name":"Boeddeker","id":"40767","full_name":"Boeddeker, Christoph","first_name":"Christoph"},{"first_name":"Marc","full_name":"Delcroix, Marc","last_name":"Delcroix"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"doi":"10.1109/taslp.2022.3228629"},{"title":"Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers","date_created":"2021-10-25T08:50:01Z","year":"2021","quality_controlled":"1","keyword":["Continuous speech separation","automatic speech recognition","overlapped speech","permutation invariant training"],"ddc":["000"],"language":[{"iso":"eng"}],"abstract":[{"lang":"eng","text":"Automatic transcription of meetings requires handling of overlapped speech, which calls for continuous speech separation (CSS) systems. The uPIT criterion was proposed for utterance-level separation with neural networks and introduces the constraint that the total number of speakers must not exceed the number of output channels. When processing meeting-like data in a segment-wise manner, i.e., by separating overlapping segments independently and stitching adjacent segments to continuous output streams, this constraint has to be fulfilled for any segment. In this contribution, we show that this constraint can be significantly relaxed. We propose a novel graph-based PIT criterion, which casts the assignment of utterances to output channels in a graph coloring problem. It only requires that the number of concurrently active speakers must not exceed the number of output channels. As a consequence, the system can process an arbitrary number of speakers and arbitrarily long segments and thus can handle more diverse scenarios.\r\nFurther, the stitching algorithm for obtaining a consistent output order in neighboring segments is of less importance and can even be eliminated completely, not the least reducing the computational effort. Experiments on meeting-style WSJ data show improvements in recognition performance over using the uPIT criterion. "}],"file":[{"file_size":9550220,"title":"Video for INTERSPEECH 2021","file_id":"28327","file_name":"Interspeech 2021 voiceover-002-compressed.mp4","access_level":"open_access","date_updated":"2021-12-06T10:48:30Z","date_created":"2021-12-06T10:39:13Z","creator":"tvn","relation":"supplementary_material","content_type":"video/mp4"},{"content_type":"application/vnd.openxmlformats-officedocument.presentationml.presentation","relation":"slides","date_created":"2021-12-06T10:47:01Z","creator":"tvn","date_updated":"2021-12-06T10:47:01Z","file_id":"28328","file_name":"Graph-PIT-poster-presentation.pptx","access_level":"open_access","file_size":1337297,"title":"Slides from INTERSPEECH 2021"},{"relation":"main_file","content_type":"application/pdf","file_size":226589,"file_name":"INTERSPEECH2021_Graph_PIT.pdf","access_level":"open_access","file_id":"28329","date_updated":"2021-12-06T10:48:21Z","date_created":"2021-12-06T10:48:21Z","creator":"tvn"}],"publication":"Interspeech 2021","doi":"10.21437/interspeech.2021-1177","conference":{"name":"Interspeech"},"date_updated":"2023-11-15T12:14:40Z","oa":"1","author":[{"last_name":"von Neumann","orcid":"https://orcid.org/0000-0002-7717-8670","id":"49870","full_name":"von Neumann, Thilo","first_name":"Thilo"},{"full_name":"Kinoshita, Keisuke","last_name":"Kinoshita","first_name":"Keisuke"},{"id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker","first_name":"Christoph"},{"full_name":"Delcroix, Marc","last_name":"Delcroix","first_name":"Marc"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"citation":{"ama":"von Neumann T, Kinoshita K, Boeddeker C, Delcroix M, Haeb-Umbach R. Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers. In: <i>Interspeech 2021</i>. ; 2021. doi:<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>","chicago":"Neumann, Thilo von, Keisuke Kinoshita, Christoph Boeddeker, Marc Delcroix, and Reinhold Haeb-Umbach. “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers.” In <i>Interspeech 2021</i>, 2021. <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">https://doi.org/10.21437/interspeech.2021-1177</a>.","ieee":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, and R. Haeb-Umbach, “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers,” presented at the Interspeech, 2021, doi: <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>.","apa":"von Neumann, T., Kinoshita, K., Boeddeker, C., Delcroix, M., &#38; Haeb-Umbach, R. (2021). Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers. <i>Interspeech 2021</i>. Interspeech. <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">https://doi.org/10.21437/interspeech.2021-1177</a>","bibtex":"@inproceedings{von Neumann_Kinoshita_Boeddeker_Delcroix_Haeb-Umbach_2021, title={Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers}, DOI={<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>}, booktitle={Interspeech 2021}, author={von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2021} }","short":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, R. Haeb-Umbach, in: Interspeech 2021, 2021.","mla":"von Neumann, Thilo, et al. “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers.” <i>Interspeech 2021</i>, 2021, doi:<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>."},"has_accepted_license":"1","publication_status":"published","related_material":{"link":[{"relation":"software","url":"https://github.com/fgnt/graph_pit"}]},"file_date_updated":"2021-12-06T10:48:30Z","_id":"26770","project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"department":[{"_id":"54"}],"user_id":"49870","status":"public","type":"conference"}]